diff --git a/421m32b1b5/eval.txt b/421m32b1b5/eval.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d3e286eef44d35542941455253b3894a368b9c4 --- /dev/null +++ b/421m32b1b5/eval.txt @@ -0,0 +1 @@ +2.980892E+00 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e17ef3cade4b5991153e08eb4b8ffb8240f7a06 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a15587a91baed8ebf6aaa42b995170dd41ac617e4770b62d47c0dabbfa5bdd +size 78980887 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01c4ae34fa1871af033593efeea2431547cb0dcb --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c179b3d59b424faf0a6e5305e6ab0df2ca53d8ebea2752615d503baf803a1532 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6434d5dc4b4c3ffe46064c36d689c164c7fe4c9a --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80588b8ce54f5b80086c15657b414163dea2ecb03cec85926a04d742fcf477b1 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..289ff2d20b6278a29f39f7c796dee9e0d88a0bb7 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa22a2cf3806493caa672a24a0d4a391dbe431f6ca0ade303b68c1a6fe4663c1 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f64389fd388693dfdf3ae693e4cc176bc4df6e23 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08052902cd27306b1e3773d1dca6a5445ad16bb9a19822285c9e9a841564e96b +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d1243940b565d7e81e177bc6651fa273d7e562b --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:662e298a3a4ab6c06c380df5ea694fdc204dca67c4bc30acd9c9774f0bbcf52f +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e56726cabfc5d44c9200b229fef6e02c1a2907cf --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6ddd7529d7f371c4f3904ac09cad60da428b26e02cbf2ab25e9ac6206a2526 +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4725f2eeb0a8a030c5080e33175ab9b50ada126 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af297b29d7ebae26008fb99afb3abc2c9acc7d3a024a8db01e302e11c7a4abbe +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8135d36ce18ce818fe0a521d74ee311ef410d4ff --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07a519893987bc43464540ae54a0eab5274351b6c5b78e19895e6e275cc26e7 +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b9d4376863244e7b19aad2a0a2a2b3526f0a51f --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68a89e51aa1b8af1705278b7e0351ecd1f19cbf1d3a01f5bd68a3d0fa23ee86 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac5978a8cc76167637a4a17335d59edfb08aac74 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da271b41b0f1415a1eceaea469e3b63fd16935bbc6ed0ef90732849305061c05 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51c883883bcb58f54564d045294060ad5e602e53 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85e33a963ad364d25ba0f66a70da8bb782a205e3ba5a11e80fd04765d05a8c4 +size 78980951 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe67d948cb4c03fad278fc4c8b85460ca04a59a4 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:542b52d2d525fd3dab11ad7e286178db174e8e360d085427b3d7c167a4164b08 +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4068b70c88d369d13aa52da480bdf3126401f15 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae2b75400f539f1cb219ec076700913418ab98aec9440e00545d3544fdb3579 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff7ee0a3950f1b79f543f5e4aac246a631d1cd76 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30c40cdef47ee56056a895f036ca3b3662b1e44d978f81980f9a7b990499345 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba464d10fa48deec0c0072ee606592067f484e31 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d52541b54181af82fcf40bc67835c62fc346c86c1abe6a34c24dcad61d4ee4 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d43904ab95de07cbaef2ac22f71b391f9cb8185 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ab992e7b37d9eebe1e0c22bb24f7b181f4c7c9d5877b409d777e58231aa805 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6abafa2a0a30f553d342075a8656ee64886bd290 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144c274fa282e83182ad47ba93a547e90c69791a531b76ff66b2d47687a51e3 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f3c387253ff92af5fce38a92ead81881cbe1ea3 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54f73a6068d99cf7ab10d0e898867d2d6a92efdd56954f321ed7d8f8bc4a11a9 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9216eeec64651483931596d7f8d424225476278c --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a50d1cc2f8029d4cd65aec2094ca518d3bc5f1a2fac6a398a1882014e8c2ec +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a44e9c7f96ea3cbd7d4812c25b57450251e82677 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2e3e18c46e6c34e0703aeb2943bb1678a8463e8b8cd04c77d1d53a62944dea +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15a7c35791fe572bcc81e09953bbe2aaaba19bf1 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c65eda65e9e645df1982244838fabc42e933f663fb996c600921ee8d06413b +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9961affa8d47e9cc1ebe7b5209bc7f2bf739ced6 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bc31e90b9d2ad1465c5dae81d5e7b55d4278f6cee4a7b0b3c2b398aa79ded29 +size 78980887 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..653e1f7438ea03aab613c713da987a2660f26075 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e999a55093e64570829ae5773b749f68344720bcc73e4117f6e378b59e27e595 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1913dd8695e56ad31d3989d879059f4fbaab77e7 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53504560633d6ad9252956c702bbd1c77c9c7612e3c1d25dcdc42b896a33bc9e +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65c377e4beb26928cf6b3ff4e127827d54519d68 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a954b6b91108c974561e8dfee4f1c0c2adc9af8100bebec36bb39c2ccd6d8cee +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d2ca7b40e372476baf5ee687b0cc41f404151e3 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9790000932017c71eab9a34156b0a5bbdae4dea23291520b95db6393dd852acd +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f783194107af198bbe3dca2737a290aef81f3d06 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfcb0de2363ff2a8155fb6086508bbe4ba46b7069417bcf0ed058365fae212a0 +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5379e1e338fdb962e597909074221393cceec4e --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e6b55754a02218fdf8498a9c3a9f6238113caca82839ac3782862071480282 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..506e119feed0764f42192904884da66195ce4c85 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b797fb05f957d13914b3256e8247d96386fc1f6c40940fe22f26a517a6024697 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc00423d8047dec28060aa6c6a393c737585ea2d --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0decbf2fbaf568ab9d3b7d2711a899c2745190fa8fe3d7dca59c1b94ef86a4f6 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd0e155eed39a86adf39a25df5a8bb12a65932e1 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274edb614e677d8c67ce24abd6dfc6068e44438a625cf6e80ff8519dd7022504 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..197d2466db1a95adda37a937ce0f3ec5d96cb007 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeee5f04398e69cf535e336916b407a4cc1ad6e5624d8dfe829c8dfaeef8228b +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f362dbcdb57d395361e725f14d14e3cc44268f17 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a58841b662fe09021f1864a592c360c573f4d072e6f31999503801dd82d683 +size 78980887 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21a1e119f50a231b6150d909b7b3bc434f13dc5 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d4bf108d55040664a33081d645ca5a8292955b0d9870b18938ad7dd05e55a5 +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c25d97a3c555958e80156fb60007f473b3cf8a18 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f607a39d497791e22be9047c4e34a12dfccf4626ee8f94bdfa66dee3b9ece85e +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57999fd13b0e1a8318bb070107cd6214c9fe4bdd --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c60c9754449f7322be2ac04ef957e17584afdbc3791caae9f677da32c29b0ca6 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b98b3885fe9bcd7941502075bbc7e2d438fad02 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05ebd1c9c1735c13e26bf8615ffacb4b75bb3da4a11153d20f37f300657d924 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eea662e51ad441ac4768d34e69cd63f272a578ae --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b2b4895c673566887a86d99b509c6fc80f8e0c59f9c90e85f0b9227ed75d74 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25b382a4174a712f9ba4882bca6f2fae0aa3a231 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20dc355966b52a09f696b1dfc60230613da480ae7769620a3f1ead06ab5176ec +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a412f65ed5584ccc90405dfb8bdeb2ffd607af6d --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19c8927ca9703b25fb83730d57317fded816398a89087f05f9b7fcfea564b7f +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..944c1a8cd2eb898414d899da2d698d0b434911cd --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f452d218bd2f773b3b8b820b9e25ab25779d997ff49e470c55bc1f5beea6cea1 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55258e27ebab621bdfd3b2fc45f41833502313d6 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d42c742857d4ef9dff9f0bb626c00e578a9ae1693163bcbce5bd0c49794dc4f +size 78980770 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9e552c3acef087a6004bccd2ee832dbcf2e0da7 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2ebca772d779fd31bbe25fddc3c53eade273fff4ee2f22fe7627f771731000 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d778991932d8c32944b3e5ccb2ce4af23377304c --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c3b8e6b7875e3bc80d354e946da1947a630f971d3576fff8c1fea128f715a4c +size 78980951 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8c5c14c5c52542b2036080f8e45aad63068c050 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6ef81928854342101fcf7a8b4d58e1cf27537cf3adf06b01265032decd7fa3 +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..417377a193bf4053255946b49f852675607b152a --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:225cf63a6b8ca6a5e8527165130573c9b87e8afbf6d632f25be134348ec4fd13 +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcb8c873c42d57f47caf7a2b61cb95d776deefa4 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26529ce0db9e1c3af8f232207ae0c2a25852842a4ffb87d07a730539f286dd0 +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8619ff09c2af5996a8830f3b2f43d894ad325772 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4219f2a75c822e3564ce77ea23cfa7bf379797f74a2766585b12b140a6b56f4 +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d69592fb8c429e22970a23ede4d530b5083e4331 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4539ea012ba022e77986b8780edd1c9d18f07781a4d11941a24f9f5c275773f +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fbd694c5ee6cc65499ae8f4c4edc188aa26ad9b --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eff2810e3d6ab629decae9b42d82cfa37273e445db207d4317ab4da72ab0854 +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc826a4ac5a43384cb68fa75481e7ea7d4d8b685 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b802b915baec1c0188f30fb9f3a2e8fd1bb31e849526f1469ab44f334f1a4634 +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7673aff97271df00d4f64db2c29865f53ee181d6 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5713f60d852ff62ac14fb62179d778e3d149264d5e90e6bedecc595c178e691 +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e20fed2487009ccb6d4deea868909e96cbec4e9f --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3781442b27b679777d7d63c5404b55fb5e6aacc47a90abee06dbfd54ca317d +size 78981026 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f640b3fc78d7a996dde2064920f4bcce03fc15 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3fb931e567089e4715a86db812cb6a059df02ac238b183b433ce54896cedcbe +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef9090568655f6d6b46ccc6bf1ccf116e97e26ab --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0c3227f5c4696ec9c026bf88b09fd7d62dc716a7f69a564ab9a856b1f4a990 +size 78980887 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27a991924f43565ae5064948c8fabd33a73676d8 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:509c52cb4281da447b31884bda810d803ff643dc818656aaf392703f476a1f7a +size 78980962 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc7a946192d9e8316b125a43a3ee94dce7014576 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a85be3c9b6815b0ecde8cff0eb0bdb372504c4916211e50f539633e01948983 +size 78981090 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4090dd90571f4ca83027b9f836ac108cf07595f2 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf1f71bf6ad7be9fb40fcb5d1b4a17d099c16d0a8daf5d8283f1b0f8e212f5be +size 78980834 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc99bc818b91c2e950f7c2e01e8fa5a98207289 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e86a54ae89ddf6d464a32ed8140a6ca3d59d194bdaf98cbf3f55c3ea16b579f +size 78980898 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e869e9523f96b389233c5aa97acb962188fb289 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b58f978d8e70c7b6dcf3a21a97f5bcfb12c55b6ddf9485d3bffbdf232e94cc +size 78980823 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b97bf4640384b3318fc1760c11e6acc0c0f38425 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa2a873c1c8b61a4713cc785d44214b0cfd579e5e1f29f4c3e705a308e20f78 +size 78980887 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5ef5d44bf7fdd9bfd18992d12e680ac550fa6da --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2f5a2a31b5a9a39e8563fc1d4c15df0eaecb57be525ad075ca2b28ad8a237b +size 78980951 diff --git a/421m32b1b5/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/421m32b1b5/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c12700c799efc5ccabddea79fa9616dd3f2c7cf7 --- /dev/null +++ b/421m32b1b5/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb641e711bc352f9b4640fb293fb94359dfcdeb9616c720e9dbb28fcf25b0d8 +size 78980887 diff --git a/421m32b1b5/global_step60336/layer_01-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b32d9c90dcc95aa515aebccdc2e4225f39ab8225 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c234cbcfca10529b88902d2480570cfca078233a1228575108176cfc3d9cc5a +size 134022403 diff --git a/421m32b1b5/global_step60336/layer_03-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25dcb012520cef53009dce23f4dd96f43a7e5a73 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38bbf317bb35ab6a600c2d7ab3b2fb72e8b7a0eb49d5a24650ca8226947aaf3 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_04-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3a4615b644456989a5cc3b139ea730fca7dd6e5 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbdade96ce1c8449bc2a2e670dfb5306bdaec741bf1086628dcf158bb6a1479 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_05-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb6a4e3f8408289d33bdf8ce88692d604e91b1b2 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f578cb05078c360a957e0a95373096a3e9f9d8ea148df4c050d837fd04fdb112 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_06-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6312853f2fb9ccdd060de5e085d1ec7eed13185b --- /dev/null +++ b/421m32b1b5/global_step60336/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30c6a763f5c5ed82df81ed3483959586594c6b46c552812356fa472890c829a +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_07-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7d6495a877981431cc94f58704b92f28a130c47 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6170dedaf3acc6359b0ed319342305993fd5af4f73fc8967c7819cf8e01dd421 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_08-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a77c4f1ce99c87f7b60adc00369aeef0fa4556b --- /dev/null +++ b/421m32b1b5/global_step60336/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3696db856f5116445968b5733f6c6cf53426dfeca569cb94760b9b30ff5348 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_09-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..488f7fa3877e924a8b21fbed2a0fecb5dba77ba5 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc068eb080f11e6dcbb06c367d2c92e6b2382e271646112ff7aba10543068c1b +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_10-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1f2b217f4d4915ccc334b8564b9faae98effc1d --- /dev/null +++ b/421m32b1b5/global_step60336/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55cc08f25c0457233b2ff0b0b8e53ac3e31c0019073fdafe2bfb0a09dea72762 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_11-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da31ab3fc6bb46139835e4852f09c4203b107f42 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45d2f0a78870f2a2aeb2c58b9dea24547ed45e284dc8eda959ef972afbd8c61e +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_12-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b64b4da4322d54be10083a85b1cbba3574e17c6a --- /dev/null +++ b/421m32b1b5/global_step60336/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00237c8c508ec236abdae1574843b8b425dda45ce091410114f90cb1c260921d +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_13-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d5bf79d9fe8e1700c2384d988edcf3af95609b --- /dev/null +++ b/421m32b1b5/global_step60336/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ccd8cdcddba09b2680149b72822f7313ff45c422279a02d3e95525c1e9690e +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_14-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb01ae22b37489652bacc79117ca550200b4f6a2 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a569ae6d3df0d696c086561a0a5e9f866288b6386c567dba728fd6753f9d685 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_15-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fddcbdc339f8a916943a42745b737e8fadeda5b --- /dev/null +++ b/421m32b1b5/global_step60336/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb26c4561079791538ce64be1876ac3571f28d1a96ba1627a78f5a78c52265a5 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_16-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c58013d22b04805cc6acdb0022a3b4ef4e71ce91 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:652f5917ba1a8e9e941fa11b74ea3633b56bcbf5bc71c6bf6e8e12a58d6a60fe +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_17-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cdd7a919d59efe0a9e427a4744be8cc04a259bb --- /dev/null +++ b/421m32b1b5/global_step60336/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d557bed673d3b4b7945477d5bd6789849a37d52ab4f0dbad4aae93cbbcccb1d4 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_18-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e034f7e260ff52a9eca02eeffd68605c982921b --- /dev/null +++ b/421m32b1b5/global_step60336/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ffb0b45304c87419fb5b9b1ebb0327884916ab7fbd1633e6f17a4ee1e8a331 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_19-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98dddcdbc5c4c45da979b2c8a8a4f25bfcd2e767 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1dec98a1d8568d2ee2ad0f459e634d6d6c8e83928a8a44b4b3ccfd8e62353e6 +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_20-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d0d2db0f0ce2ea8d437df1f547169768d68421d --- /dev/null +++ b/421m32b1b5/global_step60336/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73f3fb535419e2347180784ea7eb0444bba5628f2e894f7a2c7944e83f3bdceb +size 39359235 diff --git a/421m32b1b5/global_step60336/layer_22-model_00-model_states.pt b/421m32b1b5/global_step60336/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..284efd4d1bccc05cd28339b13807464031824109 --- /dev/null +++ b/421m32b1b5/global_step60336/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7df4d5e015ec20822cf9d7dcee2a8368918ffc4a4d095471e19b627afea28fb +size 6339 diff --git a/421m32b1b5/global_step60336/mp_rank_00_model_states.pt b/421m32b1b5/global_step60336/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42abf56e6ffdbca315e86d8047541eff687f3fab --- /dev/null +++ b/421m32b1b5/global_step60336/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc37de79b41afdde47d2545c6242ed9fda2bb23ba23987f6dcb566f27351426d +size 37747 diff --git a/421m32b1b5/sbatch_421m32b1b5.sh b/421m32b1b5/sbatch_421m32b1b5.sh new file mode 100644 index 0000000000000000000000000000000000000000..bb1d4db647f7724652f1d05d82ad797db6b6ab4c --- /dev/null +++ b/421m32b1b5/sbatch_421m32b1b5.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m32b1b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=15_446_035 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 154_460 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 10000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m32b1b5/sbatch_421m32b1b5val.sh b/421m32b1b5/sbatch_421m32b1b5val.sh new file mode 100644 index 0000000000000000000000000000000000000000..a987449523f2c5d04cc34525c7495186d4d9baf1 --- /dev/null +++ b/421m32b1b5/sbatch_421m32b1b5val.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m32b1b5val +VARIANT_CKPT=421m32b1b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m32b1b5/tensorboard_421m32b1b5/events.out.tfevents.1677456683.nid006251.74179.0 b/421m32b1b5/tensorboard_421m32b1b5/events.out.tfevents.1677456683.nid006251.74179.0 new file mode 100644 index 0000000000000000000000000000000000000000..073105da212a778568d7b8079da2598712a0e718 --- /dev/null +++ b/421m32b1b5/tensorboard_421m32b1b5/events.out.tfevents.1677456683.nid006251.74179.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e68b70a6642f4af4c76e213b26c996cfe7c0da1dc4843782beec3c5873bdbe +size 107878252 diff --git a/421m32b1b5/tensorboard_421m32b1b5val/events.out.tfevents.1677493533.nid006453.98370.0 b/421m32b1b5/tensorboard_421m32b1b5val/events.out.tfevents.1677493533.nid006453.98370.0 new file mode 100644 index 0000000000000000000000000000000000000000..d4b82b6a95906051278f486f1c726b414f76d093 --- /dev/null +++ b/421m32b1b5/tensorboard_421m32b1b5val/events.out.tfevents.1677493533.nid006453.98370.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c52d0c5fa887c7b1cb7fecf817ebe3a81927c8d93865c8a4ce42e5b71afdfd2 +size 980 diff --git a/421m32b400m/3319356.err b/421m32b400m/3319356.err new file mode 100644 index 0000000000000000000000000000000000000000..7cdb0dec20d27b1c020c805b61f35f1852f9ad54 --- /dev/null +++ b/421m32b400m/3319356.err @@ -0,0 +1,1108 @@ +3: 2023-03-16 09:02:55.539665: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539670: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539673: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539675: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539673: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:55.539683: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659807: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659798: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659801: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659808: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:55.659812: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659953: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659958: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659947: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659957: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659970: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659960: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:55.659959: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692605: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692616: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692620: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692609: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692613: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:55.692617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745364: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745382: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: 2023-03-16 09:02:55.745091: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745089: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745082: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745388: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745099: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745081: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745377: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:55.745382: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: 2023-03-16 09:02:55.745100: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745080: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:55.745105: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818394: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818400: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818409: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818401: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818420: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:55.818426: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870481: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870486: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870488: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870492: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870497: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870501: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:55.870505: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:57.377062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377078: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377076: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:57.377687: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377690: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377695: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377698: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377700: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377703: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377704: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:57.377709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.377300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377313: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.377315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:57.378055: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378059: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378061: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378061: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378064: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378064: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378070: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:57.378069: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.451963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451974: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451980: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451980: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451979: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.451977: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:57.452523: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452532: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452533: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452535: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452538: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452541: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452542: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:57.452546: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468318: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468319: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:57.468697: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468700: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468701: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468703: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468705: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468705: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468708: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:57.468712: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.484692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484703: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484698: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484704: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.484703: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:57.485108: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485111: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485116: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485118: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485120: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485118: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485126: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:57.485130: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:57.485908: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485915: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485917: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485917: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485918: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:57.485918: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496197: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496191: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496200: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496200: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:57.496607: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496613: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496617: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496616: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496618: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496618: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496621: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:57.496622: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.505979: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.505988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:57.506402: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506404: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506410: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506411: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506414: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506415: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506416: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:57.506420: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:00.875577: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875579: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875592: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.875593: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877664: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877668: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877673: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877677: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877678: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877682: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877685: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877687: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877837: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877838: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:00.877850: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:00.877851: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.945357: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.945363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.945366: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 09:03:00.945367: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.945369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 09:03:00.945375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 09:03:00.945373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945515: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 09:03:00.945373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945509: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.945512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947039: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947045: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947051: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947055: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947056: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947057: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947060: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947061: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947063: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947064: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:00.947067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:03:00.947298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:00.947082: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947305: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947312: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947312: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947324: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947331: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947331: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:00.947371: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:00.947372: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.953023: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953041: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.953044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955143: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955143: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955150: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955158: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955160: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955162: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955165: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955167: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:00.955189: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:00.955207: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.031079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-16 09:03:01.031112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.031147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031114: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031092: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031155: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031093: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031122: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031162: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031123: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031162: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 09:03:01.031099: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.031122: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:03:01.031164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031336: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.031341: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032722: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032730: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032731: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032733: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032735: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032737: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032739: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032774: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:01.032782: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:01.032787: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033138: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033158: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033160: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033139: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033168: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033163: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033160: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033160: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033162: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033172: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033177: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.033152: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033179: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033181: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033180: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033152: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033160: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033162: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033183: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033163: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:01.033186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 09:03:01.033188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:01.033199: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.033195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:01.033200: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:01.033208: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049961: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049970: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:01.049970: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049970: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049978: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049979: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049980: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049981: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:01.049984: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Loading extension module scaled_masked_softmax_cuda... +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +0: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Loading extension module utils...Loading extension module utils...Loading extension module utils... +3: +3: +3: Loading extension module utils... +3: Loading extension module utils...Loading extension module utils... +3: +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils...Loading extension module utils... +2: +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/421m32b400m/3319356.out b/421m32b400m/3319356.out new file mode 100644 index 0000000000000000000000000000000000000000..b409a3ecb6d8562d156107c418fea2cedf85a739 --- /dev/null +++ b/421m32b400m/3319356.out @@ -0,0 +1,6414 @@ +Model parameters: d_model 1280 ffw_size 5120 kv_size 128 n_heads 10 n_layers 18 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 18 --hidden-size 1280 --num-attention-heads 10 --kv-channels 128 --ffn-hidden-size 5120 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-421m32b400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_421m32b400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_421m32b400m --load checkpoints_421m32b400m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319356.json --zero-stage 0 +START 3319356: Thu 16 Mar 2023 09:02:35 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 41.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 48.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 38.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 38.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +7: Launching on nid005627 (7/8), master nid005620 port 9999, GPUs 8, CUDA: True +6: Launching on nid005626 (6/8), master nid005620 port 9999, GPUs 8, CUDA: True +5: Launching on nid005625 (5/8), master nid005620 port 9999, GPUs 8, CUDA: True +2: Launching on nid005622 (2/8), master nid005620 port 9999, GPUs 8, CUDA: True +0: Launching on nid005620 (0/8), master nid005620 port 9999, GPUs 8, CUDA: True +3: Launching on nid005623 (3/8), master nid005620 port 9999, GPUs 8, CUDA: True +1: Launching on nid005621 (1/8), master nid005620 port 9999, GPUs 8, CUDA: True +4: Launching on nid005624 (4/8), master nid005620 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3319356.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 5120 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1280 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-421m32b400mval +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_421m32b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 18 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_421m32b400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_421m32b400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 09:03:56,717] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.097 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 32.331 seconds +0: time to initialize megatron (seconds): -0.007 +0: [after megatron is initialized] datetime: 2023-03-16 09:04:31 +0: building GPT model ... +0: [2023-03-16 09:04:32,129] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 09:04:32,130] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 09:04:32,130] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.51 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 09:04:34,132] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=25 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: undo +0: 22: MixedFusedLayerNorm +0: 23: EmbeddingPipe +0: 24: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 09:04:34,356] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 09:04:34,357] [INFO] [utils.py:828:see_memory_usage] MA 0.79 GB Max_MA 0.79 GB CA 0.86 GB Max_CA 1 GB +0: [2023-03-16 09:04:34,357] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.54 GB, percent = 6.1% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 09:04:34,359] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 09:04:47,504] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 09:04:47,504] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 09:04:47,504] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 09:04:47,511] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 09:04:47,511] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 09:04:47,627] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 09:04:47,627] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.79 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-16 09:04:47,627] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.21 GB, percent = 6.2% +3: ninja: no work to do. +3: Time to load utils op: 0.22856855392456055 seconds +3: ninja: no work to do. +3: Time to load utils op: 0.14011836051940918 seconds +3: Time to load utils op: 0.0006604194641113281 seconds +3: Time to load utils op: 0.00045990943908691406 seconds +0: Time to load utils op: 0.31243371963500977 seconds +3: Time to load utils op: 0.20283031463623047 seconds +3: Time to load utils op: 0.20235824584960938 seconds +3: Time to load utils op: 0.2026534080505371 secondsTime to load utils op: 0.20366120338439941 seconds +3: +3: Time to load utils op: 0.20327186584472656 secondsTime to load utils op: 0.2027757167816162 seconds +3: +0: Time to load utils op: 0.20421147346496582 seconds +0: Time to load utils op: 0.20471596717834473 seconds +0: Time to load utils op: 0.20485234260559082 secondsTime to load utils op: 0.2050919532775879 seconds +0: +0: Time to load utils op: 0.20510077476501465 seconds +0: Time to load utils op: 0.204193115234375 seconds +0: Time to load utils op: 0.20547986030578613 seconds +1: Time to load utils op: 0.2126777172088623 seconds +1: Time to load utils op: 0.21305561065673828 seconds +1: Time to load utils op: 0.2127547264099121 seconds +1: Time to load utils op: 0.21270751953125 seconds +1: Time to load utils op: 0.21290159225463867 secondsTime to load utils op: 0.21341586112976074 secondsTime to load utils op: 0.21289372444152832 seconds +1: +1: +1: Time to load utils op: 0.2132728099822998 seconds +2: Time to load utils op: 0.21135711669921875 secondsTime to load utils op: 0.21136116981506348 seconds +2: +2: Time to load utils op: 0.21138381958007812 seconds +2: Time to load utils op: 0.21138811111450195 seconds +2: Time to load utils op: 0.21141982078552246 seconds +2: Time to load utils op: 0.21144866943359375 secondsTime to load utils op: 0.21142315864562988 seconds +2: +2: Time to load utils op: 0.21145224571228027 seconds +4: Time to load utils op: 0.21118450164794922 secondsTime to load utils op: 0.21120810508728027 seconds +4: +4: Time to load utils op: 0.21120500564575195 seconds +4: Time to load utils op: 0.2112267017364502 seconds +4: Time to load utils op: 0.2112574577331543 seconds +4: Time to load utils op: 0.2112433910369873 seconds +4: Time to load utils op: 0.21127939224243164 seconds +4: Time to load utils op: 0.21127820014953613 seconds +5: Time to load utils op: 0.21180009841918945 seconds +5: Time to load utils op: 0.21182823181152344 seconds +5: Time to load utils op: 0.2118816375732422 seconds +5: Time to load utils op: 0.21189355850219727 seconds +5: Time to load utils op: 0.21190547943115234 seconds +5: Time to load utils op: 0.21190214157104492 seconds +5: Time to load utils op: 0.21191048622131348 secondsTime to load utils op: 0.21191763877868652 seconds +5: +6: Time to load utils op: 0.21103644371032715 secondsTime to load utils op: 0.21103310585021973 seconds +6: +6: Time to load utils op: 0.21103906631469727 seconds +6: Time to load utils op: 0.2110729217529297 seconds +6: Time to load utils op: 0.21109485626220703 secondsTime to load utils op: 0.21108245849609375 seconds +6: +6: Time to load utils op: 0.2110908031463623 secondsTime to load utils op: 0.21110296249389648 seconds +6: +3: Time to load utils op: 0.0003533363342285156 seconds +3: Time to load utils op: 0.0003333091735839844 seconds +7: Time to load utils op: 0.21101140975952148 seconds +7: Time to load utils op: 0.21102023124694824 seconds +7: Time to load utils op: 0.21105003356933594 seconds +7: Time to load utils op: 0.21109414100646973 secondsTime to load utils op: 0.21108579635620117 seconds +7: +7: Time to load utils op: 0.21109485626220703 seconds +7: Time to load utils op: 0.21106433868408203 seconds +7: Time to load utils op: 0.21110892295837402 seconds +3: Time to load utils op: 0.00034880638122558594 seconds +3: Time to load utils op: 0.0003306865692138672 seconds +3: Time to load utils op: 0.00039887428283691406 seconds +3: Time to load utils op: 0.00036644935607910156 seconds +0: Time to load utils op: 0.0005388259887695312 seconds +0: Time to load utils op: 0.00038361549377441406 seconds +0: Time to load utils op: 0.0004115104675292969 seconds +0: Time to load utils op: 0.00047659873962402344 seconds +0: Time to load utils op: 0.00045990943908691406 seconds +0: Time to load utils op: 0.00042366981506347656 seconds +0: Time to load utils op: 0.00042128562927246094 seconds +1: Time to load utils op: 0.0009293556213378906 seconds +1: Time to load utils op: 0.0013544559478759766 seconds +1: Time to load utils op: 0.0013659000396728516 secondsTime to load utils op: 0.0013358592987060547 secondsTime to load utils op: 0.001276254653930664 seconds +1: +1: +1: Time to load utils op: 0.0013382434844970703 seconds +1: Time to load utils op: 0.0013527870178222656 seconds +1: Time to load utils op: 0.0013043880462646484 seconds +4: Time to load utils op: 0.0010728836059570312 seconds +5: Time to load utils op: 0.001117706298828125 seconds +7: Time to load utils op: 0.0007750988006591797 seconds +7: Time to load utils op: 0.0006995201110839844 seconds +4: Time to load utils op: 0.0012912750244140625 secondsTime to load utils op: 0.0013213157653808594 seconds +4: +4: Time to load utils op: 0.0013256072998046875 seconds +4: Time to load utils op: 0.0013375282287597656 seconds +4: Time to load utils op: 0.0013811588287353516 seconds +4: Time to load utils op: 0.0012793540954589844 seconds +4: Time to load utils op: 0.001346588134765625 seconds +7: Time to load utils op: 0.0008509159088134766 seconds +5: Time to load utils op: 0.0015461444854736328 seconds +5: Time to load utils op: 0.0015490055084228516 secondsTime to load utils op: 0.0015170574188232422 seconds +5: +5: Time to load utils op: 0.0015757083892822266 seconds +7: Time to load utils op: 0.0009374618530273438 secondsTime to load utils op: 0.001104116439819336 seconds +7: +5: Time to load utils op: 0.0015552043914794922 seconds +7: Time to load utils op: 0.0010578632354736328 seconds +5: Time to load utils op: 0.0015556812286376953 seconds +2: Time to load utils op: 0.0006682872772216797 seconds +5: Time to load utils op: 0.0015530586242675781 seconds +7: Time to load utils op: 0.001071929931640625 seconds +2: Time to load utils op: 0.0008022785186767578 seconds +2: Time to load utils op: 0.0005297660827636719 seconds +2: Time to load utils op: 0.0004520416259765625 secondsTime to load utils op: 0.0004515647888183594 seconds +2: +2: Time to load utils op: 0.0004265308380126953 seconds +2: Time to load utils op: 0.0004355907440185547 seconds +2: Time to load utils op: 0.00042700767517089844 seconds +7: Time to load utils op: 0.0003528594970703125 seconds +6: Time to load utils op: 0.0008833408355712891 seconds +6: Time to load utils op: 0.0010159015655517578 seconds +6: Time to load utils op: 0.001065969467163086 seconds +6: Time to load utils op: 0.0011720657348632812 seconds +6: Time to load utils op: 0.0011744499206542969 seconds +6: Time to load utils op: 0.0011055469512939453 seconds +6: Time to load utils op: 0.0012183189392089844 seconds +6: Time to load utils op: 0.0013186931610107422 seconds +0: [2023-03-16 09:04:48,065] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 09:04:48,066] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.78 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-16 09:04:48,066] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.33 GB, percent = 6.2% +0: [2023-03-16 09:04:48,187] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 09:04:48,188] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-16 09:04:48,188] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,291] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 09:04:48,292] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-16 09:04:48,292] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,393] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 09:04:48,393] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,394] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,492] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 09:04:48,493] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,493] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,595] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 09:04:48,595] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,595] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,694] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 09:04:48,694] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,695] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,798] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 09:04:48,799] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,799] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,898] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 09:04:48,899] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 09:04:48,899] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: [2023-03-16 09:04:48,899] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 09:04:48,899] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 09:04:48,899] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 09:04:48,899] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 09:04:48,900] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 09:04:48,900] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 09:04:48,900] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 09:04:48,900] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 09:04:48,900] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 09:04:48,901] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 09:04:48,902] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 09:04:48,902] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0005838871002197266 seconds +0: [2023-03-16 09:04:48,903] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 09:04:48,958] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=25 [0, 25) STAGE_PARAMS=421207040 (421.207M) TOTAL_PARAMS=421207040 (421.207M) UNIQUE_PARAMS=421207040 (421.207M) +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:48,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:48,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:48,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:49,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:49,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:49,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:49,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:49,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:49,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:49,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:49,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:49,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:49,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:49,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:49,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:49,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:49,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:49,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:49,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:49,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:49,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:49,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:49,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:49,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:49,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:49,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:49,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:49,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:49,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:49,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:49,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:49,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:49,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:49,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:49,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:49,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:49,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:49,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:49,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:49,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:49,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:49,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:49,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:49,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:50,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:50,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:50,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:50,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:50,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:50,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:50,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:50,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:50,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:50,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:50,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:50,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:50,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:50,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:50,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:50,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:50,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:50,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:50,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:50,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:50,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:50,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:50,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:50,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:50,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:50,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:50,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:51,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:51,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:51,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:51,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:51,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:51,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:51,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:51,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:51,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:51,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:51,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:51,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:51,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:51,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:51,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:51,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:51,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:51,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:51,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:51,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:51,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:51,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:51,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:51,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:51,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:51,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:51,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:51,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:52,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:52,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:52,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:52,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:52,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:52,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:52,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:52,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:52,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:52,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:52,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:52,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:52,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:52,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:52,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:52,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:52,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:52,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:52,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:52,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:52,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:52,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:52,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:52,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:52,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:52,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +1: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +1: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +5: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +3: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +6: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +4: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +4: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +2: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +7: [2023-03-16 09:04:52,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +0: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +3: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +0: > overriding learning rate value to 0.0002 +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +2: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/layer_22-model_00-model_states.pt. +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:52,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,768] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +1: [2023-03-16 09:04:52,771] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +7: [2023-03-16 09:04:52,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +6: [2023-03-16 09:04:52,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,780] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +7: [2023-03-16 09:04:52,782] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +6: [2023-03-16 09:04:52,783] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +3: [2023-03-16 09:04:52,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,803] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +3: [2023-03-16 09:04:52,806] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +5: [2023-03-16 09:04:52,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,812] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +5: [2023-03-16 09:04:52,815] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +4: [2023-03-16 09:04:52,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,818] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +0: [2023-03-16 09:04:52,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,820] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +4: [2023-03-16 09:04:52,821] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +6: [2023-03-16 09:04:52,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,823] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +0: [2023-03-16 09:04:52,823] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +6: [2023-03-16 09:04:52,826] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +1: [2023-03-16 09:04:52,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,827] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +1: [2023-03-16 09:04:52,830] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +6: [2023-03-16 09:04:52,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,836] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +7: [2023-03-16 09:04:52,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,836] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +6: [2023-03-16 09:04:52,839] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +7: [2023-03-16 09:04:52,839] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +3: [2023-03-16 09:04:52,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,842] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +5: [2023-03-16 09:04:52,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,842] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +5: [2023-03-16 09:04:52,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,843] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +6: [2023-03-16 09:04:52,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,845] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +3: [2023-03-16 09:04:52,845] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +5: [2023-03-16 09:04:52,846] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +5: [2023-03-16 09:04:52,846] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +2: [2023-03-16 09:04:52,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,847] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +6: [2023-03-16 09:04:52,848] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +3: [2023-03-16 09:04:52,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,849] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +2: [2023-03-16 09:04:52,851] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +6: [2023-03-16 09:04:52,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,851] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +3: [2023-03-16 09:04:52,852] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +6: [2023-03-16 09:04:52,855] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +1: [2023-03-16 09:04:52,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,856] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +1: [2023-03-16 09:04:52,859] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +1: [2023-03-16 09:04:52,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +1: [2023-03-16 09:04:52,864] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +5: [2023-03-16 09:04:52,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,866] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +5: [2023-03-16 09:04:52,870] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +7: [2023-03-16 09:04:52,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,875] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +7: [2023-03-16 09:04:52,878] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +2: [2023-03-16 09:04:52,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,879] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +4: [2023-03-16 09:04:52,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,881] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +2: [2023-03-16 09:04:52,883] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +4: [2023-03-16 09:04:52,884] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +7: [2023-03-16 09:04:52,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,886] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +0: [2023-03-16 09:04:52,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,887] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +7: [2023-03-16 09:04:52,889] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +0: [2023-03-16 09:04:52,890] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +1: [2023-03-16 09:04:52,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,892] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +1: [2023-03-16 09:04:52,895] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +3: [2023-03-16 09:04:52,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,898] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +3: [2023-03-16 09:04:52,901] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +6: [2023-03-16 09:04:52,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,902] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +6: [2023-03-16 09:04:52,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,903] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +3: [2023-03-16 09:04:52,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,904] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +2: [2023-03-16 09:04:52,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,905] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +5: [2023-03-16 09:04:52,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,905] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +6: [2023-03-16 09:04:52,906] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +6: [2023-03-16 09:04:52,906] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +3: [2023-03-16 09:04:52,907] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +1: [2023-03-16 09:04:52,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,908] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +5: [2023-03-16 09:04:52,908] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +2: [2023-03-16 09:04:52,908] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +7: [2023-03-16 09:04:52,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,911] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +1: [2023-03-16 09:04:52,911] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +0: [2023-03-16 09:04:52,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,914] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +4: [2023-03-16 09:04:52,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,915] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +7: [2023-03-16 09:04:52,915] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +0: [2023-03-16 09:04:52,917] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +1: [2023-03-16 09:04:52,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,919] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +4: [2023-03-16 09:04:52,919] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +1: [2023-03-16 09:04:52,922] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +0: [2023-03-16 09:04:52,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,922] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +4: [2023-03-16 09:04:52,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,923] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +4: [2023-03-16 09:04:52,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,923] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +2: [2023-03-16 09:04:52,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,925] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +0: [2023-03-16 09:04:52,925] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +4: [2023-03-16 09:04:52,926] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +4: [2023-03-16 09:04:52,926] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +0: [2023-03-16 09:04:52,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,927] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +2: [2023-03-16 09:04:52,928] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +2: [2023-03-16 09:04:52,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,929] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +5: [2023-03-16 09:04:52,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,929] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +0: [2023-03-16 09:04:52,931] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +2: [2023-03-16 09:04:52,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,931] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +5: [2023-03-16 09:04:52,933] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +7: [2023-03-16 09:04:52,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,934] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +2: [2023-03-16 09:04:52,934] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +2: [2023-03-16 09:04:52,934] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +7: [2023-03-16 09:04:52,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,936] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +3: [2023-03-16 09:04:52,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,937] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +7: [2023-03-16 09:04:52,937] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +3: [2023-03-16 09:04:52,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,937] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +5: [2023-03-16 09:04:52,937] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +7: [2023-03-16 09:04:52,940] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +3: [2023-03-16 09:04:52,940] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +5: [2023-03-16 09:04:52,941] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +3: [2023-03-16 09:04:52,941] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +3: [2023-03-16 09:04:52,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:52,942] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +3: [2023-03-16 09:04:52,945] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +6: [2023-03-16 09:04:52,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:52,948] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +1: [2023-03-16 09:04:52,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:52,949] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +5: [2023-03-16 09:04:52,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:52,950] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +6: [2023-03-16 09:04:52,952] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +1: [2023-03-16 09:04:52,952] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +5: [2023-03-16 09:04:52,955] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +7: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:52,961] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,963] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +0: [2023-03-16 09:04:52,963] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +7: [2023-03-16 09:04:52,964] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +4: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,966] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +0: [2023-03-16 09:04:52,966] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +4: [2023-03-16 09:04:52,966] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +0: [2023-03-16 09:04:52,967] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +4: [2023-03-16 09:04:52,969] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +0: [2023-03-16 09:04:52,970] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +0: [2023-03-16 09:04:52,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:52,972] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +0: [2023-03-16 09:04:52,975] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +4: [2023-03-16 09:04:52,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:52,981] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +4: [2023-03-16 09:04:52,985] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +2: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,985] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +2: [2023-03-16 09:04:52,991] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +2: [2023-03-16 09:04:52,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m32b400m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:52,996] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +2: [2023-03-16 09:04:52,999] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +0: successfully loaded checkpoint from checkpoints_421m32b400m at iteration 0 +7: time (ms) | load-checkpoint: 4047.30 +0: estimated model parameters: 0.42120704 +0: estimated model parameters without embeddings: 0.35419648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 09:04:53 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.031701 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.066 seconds +0: total number of samples: 48805 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.037345 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.009 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 09:05:06 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 21237.16 | train/valid/test-data-iterators-setup: 13142.28 +0: [after training is done] datetime: 2023-03-16 09:05:06 +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.366502E+00 | lm loss PPL: 2.897698E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3319356: Thu 16 Mar 2023 09:05:33 AM EET diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7393e81a9713f090af51a4d6e90b4bec53ee09a --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a48bc6a935514a134c118eb3da4729b292dfa5ab0204c8e53e118ca3b787789 +size 78980887 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d15d229c242a435dd76aaebda60d351302ed9454 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fddcb030c80cf4b47e783eeebc9d0cc28b7addd03369a1ed6cf9a1fcb5793f50 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d16586839364a032721554df5704ef223cd8897 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc43c0b2c97f1cb6701ada18bb5eedd6770a781d393a1a6b1b3e05e518638b0f +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..750306850aa48236d071a21a0c396e0f2211a9b9 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1ae9219c27793cc893b16fad2628bbab7481f96b62a9f07db3da419dea9ada +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7844569f67187acec618c9d986cb62a2b3371b03 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f86fbfe42461a91f6c0e6efd3ede8f4530f69e08e841b9e58a3a7977d5bc08 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6721fe75b8ccd4e1c90192d23374ab838ba6966 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207c039016fdc10a9bd710d49be032b0c87a24adf8da35bf07988f9bfb67c1c7 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90ad041bb5f1f7252abbad99ff2aec898ea64d0b --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:588b3846bbd6f002d0d39df2a2fb86d44fa398177b73dc814dc4b7436fc116e4 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d02f6d0f906aa2d8a83f77e109ea96b87ba2cbdb --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519be1e88c8e1a8752add8908a635c3f97ac16b9ea2038bf9893cdc3b37fd2f6 +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b8f69a515036a99a8f239d4525d09a6f249315e --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5119c48b5bef2f4b25c1c5d322ff0ac61691e52b886c93babdc6a85579af5f +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a6eb4b16e55c0bca684650c25bbc81ba9a37d9b --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa0eb221b3ee11c10a706bb1060cc01912304577ef2c9f38ca477e243c6cae4b +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..860ce498233a5063d50d1eb7f1421df2b0d73fb4 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e446fd1d8e52e1d890ae90ac3eba41321798fba897da15034aab3ba65bcbe611 +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c0fe0ef47b119c3b394d293306cc1c3a16f89ca --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f37d6040bb19c659195b286303063755785006d5c3d8b5105f9aa49bce7149f +size 78980951 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df8e20a2f5c4d4f770006a0eec325eeae00137ea --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30306580ada8c938886fc434b56f5037f70ab16f323864917272aead66111edb +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..950247493afdc6841792e122669603f572060bc6 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:387174a8da009e772b75ff8e193662027b272e2b5ca3ddb3ccd03a8566eaa37b +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d49d5091b9819f828220f7947b45472fdee5d98 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d6ce729bdcd0372da33215fd16cbeed119d3627da15fcfc917a17df21cf5ba +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e79f511e3d86c6def3dbd6784402775c98bcec2 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ab5cdb55db60138ec5193346ca437f20609c6e6c3dc87239376f7fd39dd306 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01052fe7527c9b9f2abba4410b1d35ddb693ddb0 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e865f9d16642dce2ad1c8b4112c3afaacd6411e8da3af6c9af3dbee348d8be +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89cf44a95ddf4c3cb1c3904c303a68de506ce22a --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fce06d498a76eea84c5a6e6dd516855602169c5a99786f507530540ec3ac8b3a +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39a27f922b74c7b63441bbb245bb0997b83e38ba --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27384dcba5b2dc7258daf879c0ffef0f67931b84bc081b7080cc17133be86faf +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f050e09b13c43f98ac6dbddc31aad8949900f6d --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a1a5e20983098543f77f48e72118a6def9c52c38c678c139ddfe5b2bc63bf3 +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccb87abade1c0f7671995147eac8ff397e474e45 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e49a95ccb18effd255b506a412def849bb6f6f2b3532f96d146f48b5c8b2284 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33c425b466f18d4b1ad80c1d43e83db13e0e6754 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a83452aa0fa28a1a8a134247273f005071b93459ba6eae665095278a703777 +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bb1305ee16a7f827d8037c0cb4f05d5a6f8f7de --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cb6665876a51dc98e30c78e9914d39b13f25cdd9c1d176b01edda1362dcab1d +size 78980887 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dc6e83247a228906f24742b88f56f4f299c83fd --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93920e198ea32942b70c53157be3594c2b8c131bbf49b8f41a16a5b37492eff1 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63839ce51b6ba7cc54b00d936eafc15f1e26b489 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dfc85bf9119f284ee3d77174d808756ab99fbb79fedbea3f10c972484068687 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f8388e1c8524f5bc0032ea8ca722d576cff3f25 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ab632f83cdaa8404cd33d461ac5651755aa028e5cb1f1f4bf0081911653d333 +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b302089eae14506b6ced82a466c47b52342904d6 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341f188f19676a8722c01c3f82008c6db1c44c49c9a4b70d3002068986198ac7 +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19684abb4030c2ceffdf5c6c12ee35dc43f27e37 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6299f45c714158d9ad011b513770e14ee60df06d8724991059d5c2a7ac5f0858 +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afb2d46385fb54576f3d28246162eeec6e4f199d --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad47b571e6f7802067bd1249592dd341483c4f8197f494d311c3c19ec772133 +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2c1cff440d5e8f294e10015d304fb7ba36f524e --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860facaac1483e96052cd4e29da3a55542f9f685c50cf98f129ed9d4f3875ff4 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c44c205184263ca8a242d00b32289efe8fb3d527 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54c28e92d4ff9241ccb4a2571891f363e932ac2be7bad72d07d34672b2a2178 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99be2266e52c436e16bc848da77fe01777fba4cd --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5061e4049e2e23923dac87ab9dd6aac807c94a09b34e2d3aa5350974bcfe490e +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..628838035222937c8c186b725627ca0fd9f7938d --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05629b76e3c326724aad4068578ef3c045d8b6556edf177d7d580574e15fcd1a +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2aefe66dcc27c5dfff98917eb7ec84565803ec1 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:442dfaa7d7fa8467a38f5a1d7d7886fff255d0b70e68e78f7c4deafa38d71c85 +size 78980887 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c595f8268de86e16b570c471be7a04cd839c0054 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca6c3eecafbddeaa4abe3fd1d57bb4ccedcd464acb93e83efff5d44abe05254 +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89fc957589cc01f89d28f000f7aa87baf94d4384 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b0aa786ea5095729941d06c82f3d335d2c82ba190cb4a841d44f735fcdfedc +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e52507e129800f9d34e41f3f9a597db3617de83 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbdd73463adb1958cc7e9c05e0edd09348bf7e12440c74fe0ef3c373beb27a9c +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aba931997a3fcae77e95fe9f710f958599009fda --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ee0dfacae6ec580adcbae166079c6efb99e13ba41f0418257739ef2defeb27 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5216a14e77c3ee039bc33f85e7d11e83b4db6f35 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:474bdba372151a1b79c5f3a2b04219559d5ea54d1fdeb8955fd290e8f28a6a0f +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4705fe04dd10d567e013b0ff7700c5735ef9172 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e7e9f8ba85f30e924e38ee503fbbb136c9c51784b2fa1ee219adb750c8d9730 +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5f401522abb14d85ff8e3979a718387d1ca9068 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13f88c2b9dff32f98e43db2b971e0d38b8c8203bc1d61d5699050f42062ef62d +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..995ec73db925b68c73bfbfde97b5eaca2e054d6c --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a9eda113d76a14e95b2a108ea67451346d134f760c41f9a6c22947734d6434 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4b681dc3107832cdf012ad6937a8ada58d4fa59 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:672b8a838b539daefabcd912de32f69d3ca9b71575546d2ea55ef1a8c914bfe6 +size 78980770 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2980a8b0f36a336887fcb2f82b826847ea24ab8c --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ce64a5b7a5af6ed22405cee2a32a7b7a9772bb22ef16e8b9e4a0eb36a673c7 +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ff5145d32f596f28c2473d5450c044e4825e9ac --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2283cf2ff93078ed20f4cfefe5fefc60db83283491c7993e6edd3c96cc6dc4c3 +size 78980951 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..382f3b276842ffa795952f6871a008f92d39c430 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7b85713765e956bc5f9f37302126211f5a2fb0cedd0d84c5ee5934a0185a1e +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78b07c86724e9a6af9f8091ca4e7bacd449d5558 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aaa438de3c90d1bbfd2ab64dc15e68c9376fdcada8b40ba98f01cd7e17d1c3e +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..709566624f1c5351ffef7dcc9724bc3004cf3267 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ba42382765185dcd85f147260e381ca0efdf42ff2f7eaf2a01add0cf70f9f0 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee714d7c355ded3efee22481a2aa7cbfadab7c39 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f7c201b3438fbc7557d2b4880ba115c7230ec00e85df217c5a8b6d32cdef68 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aef72ed28bbced7815f91b01e54e6f7b05f8facb --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e43c60b3f2952332c3e37ad6da39312efe134173af1646b84efbb349caf030be +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33849de6424ef6ded8c8ea96339cd8dc381eb200 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c136311d73961eb16c0c65e45271066cf647bd624e08bde110eb03f0eab3082 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dda1a3adb38aeddfc5f83adcc237e769f3a0a49e --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1edc9970aeb61bdf85aa3e61c4f17faef2698affee54b2eb034abc74d8749c86 +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72105c41ded85fc90fd07e71275a5bd5654cd446 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d430c4ffd29209f54c1d24dbdc72ddfe1db6c5f83f47ffbdc5825076581fe1 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..436112f07a49aefdfaabfc6ebfe43a8ea5c63e8b --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc8712e45dc8bf6052ef10ae922e02f7083c1df3b10cd048bc708054eae013c +size 78981026 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f2ff66fb16918a5498f3070ba847e4720266e6d --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fc04cf5855807d5d7f4b34472a9865f99819748684bc700585d25470f071e81 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44e5d76ee921d0468f6eb6d5a7ac98864f515208 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11254fbcdc1285ecf4c1bc1746dd657c68a2abf1f4ac1501580abeecf055ae2 +size 78980887 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..496f97b2626914e1b3619845b7bb5677bf0b7b7b --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52f761b15a378786d493e3819261d85ba8098a473df78129f0d4be4655afd694 +size 78980962 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e933630184194bcbc2602ad659c6227bf8ba4939 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8ab2f03380077a9dad2c58dc5ab2bcd862668d7bb79894dee489dc9c155b937 +size 78981090 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0dcc809b7623b9c7a2f65205fd580fbeaea3c08 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d42599da86bd0637365315249cd6e499db577d26df1e6144db1ae435fff61db +size 78980834 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c025aec1ec6d4891a3bc759e77c7b675c017dcd --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe7a526578b02694c6b6aa0a9e67a051c5bfeea87203c8085d5a12e1abe9fd5 +size 78980898 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83bb085a411c41871c07a8efd1ba313e71d46e0c --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c4052a5adefd53677f70f12051a6af746a80b89038b70af948ff2bfa175a7eb +size 78980823 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98ae141765c39ad31e5a29b29d52ae9851f5fb66 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2747c05b09ba5d1eeed1c10e6af9de5a7a99c87e5f3ecefc0d377c157dba932b +size 78980887 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..695370ee34ef0905c7f6f8ba5fb56730890cce15 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7417aa35eedf099f890bfcce0af8a919a18e4c434e6dea1a641e538e1a4dd13d +size 78980951 diff --git a/421m32b400m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/421m32b400m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54b6a4c11de16ff812ff30edc99f012dd167e738 --- /dev/null +++ b/421m32b400m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47b44cd279a7c0cee097bc2608b1bc71466850d899c577b777081ec208d5c55b +size 78980887 diff --git a/421m32b400m/global_step60336/layer_01-model_00-model_states.pt b/421m32b400m/global_step60336/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9abefd8fdb234a27eebb305cdf7736bc5af64142 --- /dev/null +++ b/421m32b400m/global_step60336/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205ba76bbfe9dc068fff45b5501f0fc854324a42657f4b8c26ee74274801ff83 +size 134022403 diff --git a/421m32b400m/global_step60336/layer_03-model_00-model_states.pt b/421m32b400m/global_step60336/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f50aef7d67643f0e9003b845f761cffb57a7ec49 --- /dev/null +++ b/421m32b400m/global_step60336/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ba69ab774ce7f143d2ae150eb99f3a162e0e028dcd5111e86f4a895b78dbbc +size 39359235 diff --git a/421m32b400m/global_step60336/layer_04-model_00-model_states.pt b/421m32b400m/global_step60336/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07da6211737464dc5ea38f74a95063d5c8256a3c --- /dev/null +++ b/421m32b400m/global_step60336/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0236bbaf8598dc37494dbf99a26fdea0f53aa38eff85def78871abf9d9663144 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_05-model_00-model_states.pt b/421m32b400m/global_step60336/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..441659da86b6114285ccc9494b400ba76d3c4294 --- /dev/null +++ b/421m32b400m/global_step60336/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8135c89057bd0cc5ace3f823be8d14460027e3e852226aa717d0361b5e8a41f0 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_06-model_00-model_states.pt b/421m32b400m/global_step60336/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de5165781d15598ab71bd13e7ecac9e763d742e1 --- /dev/null +++ b/421m32b400m/global_step60336/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a19e23ff31d71e26996ea8ff4ca1124c7878e81f64b4017ccadb583cbe19995 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_07-model_00-model_states.pt b/421m32b400m/global_step60336/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86ea457ef4b147b4bfb0778c22e991c6abda9002 --- /dev/null +++ b/421m32b400m/global_step60336/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07deb89d59a13b1c2cd5bf00d930c52709d3e860ee7c83342a40d7b46fccd91 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_08-model_00-model_states.pt b/421m32b400m/global_step60336/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02daa7bfcc140adaf1751e489895aa556a6ad294 --- /dev/null +++ b/421m32b400m/global_step60336/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:412a43db5a7c168eed77a577076417b43fff350f285f596f08eb4f81d942e91e +size 39359235 diff --git a/421m32b400m/global_step60336/layer_09-model_00-model_states.pt b/421m32b400m/global_step60336/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f92f46b80988bcd29efe6f8499f48259cfab9a7b --- /dev/null +++ b/421m32b400m/global_step60336/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30dd8617d22b66a194d7297e4ff1a287832d8b5f65ff99bfd1a46edef2ea5782 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_10-model_00-model_states.pt b/421m32b400m/global_step60336/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b259a481c4f1b94c0599e3d1eeb317386ee39c5 --- /dev/null +++ b/421m32b400m/global_step60336/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12cb0cdb97a665a17496b52a8580876e513342632f27915231c7b583767ee5a1 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_11-model_00-model_states.pt b/421m32b400m/global_step60336/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d3753c8a3644a2e921658a2e464d0b678cc40f --- /dev/null +++ b/421m32b400m/global_step60336/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8184edf25042faf7eec55aab430972bb91658f27ad11e66235d8642d32915476 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_12-model_00-model_states.pt b/421m32b400m/global_step60336/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f07f0ee27d69d6cb35193e7e7485c2ceacb16b80 --- /dev/null +++ b/421m32b400m/global_step60336/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16008e9b36c1c0730b3db1e66451715e156c86c8c1a6849943edfc5aeb3d8db +size 39359235 diff --git a/421m32b400m/global_step60336/layer_13-model_00-model_states.pt b/421m32b400m/global_step60336/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4df35cf4b0e657a47b77b60bbfe2e9a368144f53 --- /dev/null +++ b/421m32b400m/global_step60336/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57ae8daefe09653889aa5d3d676fcefa42d8427b7a4232ee5e71c900ce62fc6e +size 39359235 diff --git a/421m32b400m/global_step60336/layer_14-model_00-model_states.pt b/421m32b400m/global_step60336/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5366d402745b0c7fb9c782e99deac214fab58d65 --- /dev/null +++ b/421m32b400m/global_step60336/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51ca5da5d591ad7f019b9d006b6b9d177852fbe736c518009115f4d6604fe380 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_15-model_00-model_states.pt b/421m32b400m/global_step60336/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98d2d622179694cabc6a2068cf58cf471a12ff66 --- /dev/null +++ b/421m32b400m/global_step60336/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f904b50acda9c97bc3f9090030fab99b426aee77b5d8dd95ee3895a8186c6288 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_16-model_00-model_states.pt b/421m32b400m/global_step60336/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de7b6b9e45eb7359a45a0a980414435d650904b6 --- /dev/null +++ b/421m32b400m/global_step60336/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e094b0dfea2f435a841bd3ab3e9db8c4994c446e09111817afe1af55c76503a +size 39359235 diff --git a/421m32b400m/global_step60336/layer_17-model_00-model_states.pt b/421m32b400m/global_step60336/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72d3407fdb108e1b81619e0cf149f53fcaefcde7 --- /dev/null +++ b/421m32b400m/global_step60336/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5857e4bed06ebf1dde5e30446d1db8118458b59fa817ff9ff3b624ed09443e88 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_18-model_00-model_states.pt b/421m32b400m/global_step60336/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1beb93cda0ac9dfce170d4ddc26730599736d634 --- /dev/null +++ b/421m32b400m/global_step60336/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07538d4a938a7e7b31e8c17739401fe414ddeaaf7717d157171b2ff72cfdb141 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_19-model_00-model_states.pt b/421m32b400m/global_step60336/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..202d6bdc314a5363f5bc2d39e29455cce9aab7ad --- /dev/null +++ b/421m32b400m/global_step60336/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15e24c5aed6e59e07ce4d3c52f86b37ef27dee3c19577465307817314fa5a88 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_20-model_00-model_states.pt b/421m32b400m/global_step60336/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec3e693f4028edf1fbe91d48fa4364f89d1baff7 --- /dev/null +++ b/421m32b400m/global_step60336/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2826cd24f0e96e66fdcea2a8aa20b8a5c5abdfa69847f19e398a65fdbbc22e8 +size 39359235 diff --git a/421m32b400m/global_step60336/layer_22-model_00-model_states.pt b/421m32b400m/global_step60336/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3edd771a415af52e03a769eacbda85a48fe5137 --- /dev/null +++ b/421m32b400m/global_step60336/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e94721b41d2bbc451f879550219cb9a90aac5cdaa06a5589354d8bf3875809a +size 6339 diff --git a/421m32b400m/global_step60336/mp_rank_00_model_states.pt b/421m32b400m/global_step60336/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbbc0b00655c65a04cc2b4af768dfaca432dcaab --- /dev/null +++ b/421m32b400m/global_step60336/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18bcd8a328e1639115cd2072043508b17e171ab61fbc807e6ab39d6db3e549ba +size 37747 diff --git a/421m32b400m/sbatch_421m32b400m.sh b/421m32b400m/sbatch_421m32b400m.sh new file mode 100644 index 0000000000000000000000000000000000000000..e23856887c97c265a6faa7d4e08c89e74e4da79e --- /dev/null +++ b/421m32b400m/sbatch_421m32b400m.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m32b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=15_446_035 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 154_460 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 10000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m32b400m/sbatch_421m32b400mval.sh b/421m32b400m/sbatch_421m32b400mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..9e550281fdda34d534c7f95c0e3bfdc92b9dd5fa --- /dev/null +++ b/421m32b400m/sbatch_421m32b400mval.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m32b400mval +VARIANT_CKPT=421m32b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678909944.nid005627.108020.0 b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678909944.nid005627.108020.0 new file mode 100644 index 0000000000000000000000000000000000000000..aef23d719bc95dc76e8a605809f2b7de9c0c5571 --- /dev/null +++ b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678909944.nid005627.108020.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1ad55bf852358b76f934ff39a7a2b3ee7de2e6b6426d90f09931f820db9cd3 +size 107878257 diff --git a/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944086.nid006946.70779.0 b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944086.nid006946.70779.0 new file mode 100644 index 0000000000000000000000000000000000000000..74fb46ca6ab9cb0d7ed2117b54a61271455ba022 --- /dev/null +++ b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944086.nid006946.70779.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00ce424045ad3adf30f8837a6e17e80cd6c686aa392c39befedeba641cedc26 +size 21471 diff --git a/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944215.nid006591.10291.0 b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944215.nid006591.10291.0 new file mode 100644 index 0000000000000000000000000000000000000000..df57b502fbc6831eb6a7f175fe6f8f6c1f03b60a --- /dev/null +++ b/421m32b400m/tensorboard_421m32b400m/events.out.tfevents.1678944215.nid006591.10291.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6e415c6e40b7ea8fbeb08f01682ca9cd6b819cee9f457a6f78cc3d531997b5 +size 21471 diff --git a/421m32b400m/tensorboard_421m32b400mval/events.out.tfevents.1678950236.nid005627.70908.0 b/421m32b400m/tensorboard_421m32b400mval/events.out.tfevents.1678950236.nid005627.70908.0 new file mode 100644 index 0000000000000000000000000000000000000000..f988e8874a9cc5976f8c55a139e7dd288d909178 --- /dev/null +++ b/421m32b400m/tensorboard_421m32b400mval/events.out.tfevents.1678950236.nid005627.70908.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa69d3ef9d908da35d990ec86cb87a22da87863a7c10be02fa626adf5b30444 +size 980 diff --git a/421m3b9400m/3318426.err b/421m3b9400m/3318426.err new file mode 100644 index 0000000000000000000000000000000000000000..000e0fbc3c0edd7711d0fc483e9d764587791f1c --- /dev/null +++ b/421m3b9400m/3318426.err @@ -0,0 +1,1121 @@ +0: 2023-03-15 21:58:56.868100: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868106: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868114: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868114: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868110: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:58:56.868103: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936017: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936014: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936011: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:58:56.936011: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936197: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936205: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936212: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936207: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936201: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936217: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:58:56.936218: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936553: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936555: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936561: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936561: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936551: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936561: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:56.936567: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995720: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995737: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995741: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995725: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:58:56.995739: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995789: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995803: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995804: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995803: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:58:56.995795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071850: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071861: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071868: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071853: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071861: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:58:57.071855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095532: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095547: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095551: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095561: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095566: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:58:57.095546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:58:58.502892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502887: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502891: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502888: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502897: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502904: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.502899: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:58:58.503288: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503291: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503295: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503294: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503298: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503296: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503299: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:58:58.503303: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555305: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555312: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555318: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:58:58.555882: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555889: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555891: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555893: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555896: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555899: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555900: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:58:58.555901: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556141: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556145: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:58:58.556549: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556553: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556557: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556561: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556563: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556565: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556568: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:58:58.556570: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.558564: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558567: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558568: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558576: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558581: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.558578: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:58:58.559003: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559005: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559011: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559013: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559014: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559022: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559024: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:58:58.559025: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.590738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590743: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590746: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.590757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:58:58.591185: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591186: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591191: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591194: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591193: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591196: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591197: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:58:58.591199: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666336: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:58:58.666753: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666753: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666758: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666763: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666767: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666768: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666772: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:58:58.666774: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742509: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742506: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742509: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:58:58.742884: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742886: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742890: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742893: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742895: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742896: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742898: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:58:58.742893: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:58:58.794412: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794416: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794419: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794425: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794426: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794430: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:58:58.794433: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:59:01.679066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.679085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680937: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.680950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680960: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.680962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.680957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.680970: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.680971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.680975: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.681000: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.681008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:59:01.681016: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:59:01.681023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.775040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775045: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775047: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775053: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775056: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775054: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775060: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.775060: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775483: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775480: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.775484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777301: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:59:01.777315: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777320: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777320: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777324: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777325: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777327: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:59:01.777327: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:59:01.815743: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.815765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817947: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: 2023-03-15 21:59:01.776942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: 2023-03-15 21:59:01.776942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817960: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:59:01.817964: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:59:01.817965: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-15 21:59:01.817967: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:59:01.817967: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: 2023-03-15 21:59:01.776944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.817991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: 2023-03-15 21:59:01.776949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:59:01.818000: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:59:01.818006: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.776958: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776959: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776959: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.776961: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776965: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:59:01.776975: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:59:01.776992: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.858443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858456: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858464: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.858462: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859348: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859359: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.859365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860383: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860388: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860389: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860389: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860398: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860407: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860408: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860409: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:59:01.860420: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:59:01.860425: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861333: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861346: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861346: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861349: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861348: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861349: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:59:01.861363: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861366: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861368: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861370: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861369: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:59:01.861371: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.964034: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964037: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964043: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964043: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964047: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964052: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964057: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.964055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965887: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965888: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965896: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965895: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965905: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965906: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965910: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965930: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:59:01.965948: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965948: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:59:01.965951: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.991550: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991554: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991546: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991558: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991562: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991557: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991559: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.991559: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993907: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993909: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993908: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993911: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993913: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993914: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.993924: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.993924: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.993927: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.993929: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.993930: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.993931: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.994036: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.994042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:59:01.994049: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:59:01.994055: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +4: +4: +4: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: +6: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +3: +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils...Loading extension module utils... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/421m3b9400m/3318426.out b/421m3b9400m/3318426.out new file mode 100644 index 0000000000000000000000000000000000000000..0c5633044aa8ad05c120b3a54233e80c7d693e9a --- /dev/null +++ b/421m3b9400m/3318426.out @@ -0,0 +1,4076 @@ +Model parameters: d_model 1280 ffw_size 5120 kv_size 128 n_heads 10 n_layers 18 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 18 --hidden-size 1280 --num-attention-heads 10 --kv-channels 128 --ffn-hidden-size 5120 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1_922_149 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-421m3b9400m --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1_922_149 --lr-warmup-samples 19_221 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_421m3b9400m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_421m3b9400m --load checkpoints_421m3b9400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3318426.json --zero-stage 0 +START 3318426: Wed 15 Mar 2023 09:58:39 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 48.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 49.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 41.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 50.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 47.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +2: Launching on nid006719 (2/8), master nid006717 port 9999, GPUs 8, CUDA: True +1: Launching on nid006718 (1/8), master nid006717 port 9999, GPUs 8, CUDA: True +6: Launching on nid006723 (6/8), master nid006717 port 9999, GPUs 8, CUDA: True +5: Launching on nid006722 (5/8), master nid006717 port 9999, GPUs 8, CUDA: True +0: Launching on nid006717 (0/8), master nid006717 port 9999, GPUs 8, CUDA: True +3: Launching on nid006720 (3/8), master nid006717 port 9999, GPUs 8, CUDA: True +4: Launching on nid006721 (4/8), master nid006717 port 9999, GPUs 8, CUDA: True +7: Launching on nid006724 (7/8), master nid006717 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3318426.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 5120 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1280 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-421m3b9400m +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_421m3b9400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1922149 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 19221 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 18 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_421m3b9400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_421m3b9400m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1922149 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-15 21:59:15,555] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.083 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 22.767 seconds +0: time to initialize megatron (seconds): 46.324 +0: [after megatron is initialized] datetime: 2023-03-15 21:59:42 +0: building GPT model ... +0: [2023-03-15 21:59:42,461] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-15 21:59:42,461] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-15 21:59:42,462] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.4 GB, percent = 6.2% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-15 21:59:44,461] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=25 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: undo +0: 22: MixedFusedLayerNorm +0: 23: EmbeddingPipe +0: 24: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-15 21:59:44,672] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-15 21:59:44,672] [INFO] [utils.py:828:see_memory_usage] MA 0.79 GB Max_MA 0.79 GB CA 0.86 GB Max_CA 1 GB +0: [2023-03-15 21:59:44,673] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: setting training iterations to 7508 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-15 21:59:44,674] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-15 21:59:55,643] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-15 21:59:55,644] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-15 21:59:55,644] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-15 21:59:55,650] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-15 21:59:55,650] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-15 21:59:55,778] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-15 21:59:55,778] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.79 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-15 21:59:55,779] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.11 GB, percent = 6.4% +0: ninja: no work to do. +0: Time to load utils op: 0.1608273983001709 seconds +0: Time to load utils op: 0.10195684432983398 seconds +0: Time to load utils op: 0.2040114402770996 seconds +0: Time to load utils op: 0.20441913604736328 seconds +0: Time to load utils op: 0.20382380485534668 secondsTime to load utils op: 0.20487594604492188 seconds +0: +0: Time to load utils op: 0.2035372257232666 seconds +0: Time to load utils op: 0.20509719848632812 seconds +0: Time to load utils op: 0.0006198883056640625 seconds +3: Time to load utils op: 0.21095633506774902 seconds +3: Time to load utils op: 0.21029043197631836 secondsTime to load utils op: 0.21058082580566406 seconds +3: +3: Time to load utils op: 0.21052241325378418 secondsTime to load utils op: 0.21057939529418945 seconds +3: +3: Time to load utils op: 0.20959162712097168 secondsTime to load utils op: 0.20994997024536133 seconds +3: +3: Time to load utils op: 0.21004819869995117 seconds +1: Time to load utils op: 0.21272611618041992 seconds +1: Time to load utils op: 0.21274781227111816 seconds +1: Time to load utils op: 0.2126467227935791 seconds +1: Time to load utils op: 0.21241497993469238 seconds +1: Time to load utils op: 0.2127988338470459 secondsTime to load utils op: 0.2127828598022461 seconds +1: +1: Time to load utils op: 0.2125720977783203 secondsTime to load utils op: 0.21225810050964355 seconds +1: +2: Time to load utils op: 0.21106576919555664 seconds +2: Time to load utils op: 0.21106410026550293 seconds +2: Time to load utils op: 0.21109509468078613 seconds +2: Time to load utils op: 0.21110200881958008 secondsTime to load utils op: 0.2111053466796875 seconds +2: +2: Time to load utils op: 0.21111297607421875 seconds +2: Time to load utils op: 0.21112060546875 seconds +2: Time to load utils op: 0.2111363410949707 seconds +0: Time to load utils op: 0.0003495216369628906 seconds +0: Time to load utils op: 0.0004038810729980469 seconds +4: Time to load utils op: 0.21197271347045898 seconds +4: Time to load utils op: 0.2119917869567871 seconds +4: Time to load utils op: 0.21201300621032715 seconds +4: Time to load utils op: 0.21202468872070312 seconds +4: Time to load utils op: 0.2120521068572998 secondsTime to load utils op: 0.21205687522888184 seconds +4: Time to load utils op: 0.21205973625183105 seconds +4: +4: Time to load utils op: 0.21206378936767578 seconds +5: Time to load utils op: 0.2110896110534668 seconds +5: Time to load utils op: 0.21109533309936523 seconds +5: Time to load utils op: 0.2111217975616455 secondsTime to load utils op: 0.21112513542175293 seconds +5: +5: Time to load utils op: 0.21113252639770508 seconds +5: Time to load utils op: 0.21113085746765137 secondsTime to load utils op: 0.2111377716064453 seconds +5: +5: Time to load utils op: 0.21114516258239746 seconds +0: Time to load utils op: 0.0004127025604248047 seconds +0: Time to load utils op: 0.0003745555877685547 seconds +0: Time to load utils op: 0.00037407875061035156 seconds +0: Time to load utils op: 0.0003807544708251953 seconds +6: Time to load utils op: 0.21246123313903809 seconds +6: Time to load utils op: 0.21246814727783203 seconds +6: Time to load utils op: 0.21248412132263184 seconds +6: Time to load utils op: 0.21248769760131836 seconds +6: Time to load utils op: 0.21248960494995117 seconds +6: Time to load utils op: 0.21250057220458984 seconds +6: Time to load utils op: 0.21249675750732422 secondsTime to load utils op: 0.21249914169311523 seconds +6: +7: Time to load utils op: 0.21160674095153809 secondsTime to load utils op: 0.21161389350891113 seconds +7: +7: Time to load utils op: 0.21161818504333496 seconds +7: Time to load utils op: 0.2116382122039795 seconds +7: Time to load utils op: 0.21164870262145996 seconds +7: Time to load utils op: 0.21164250373840332 seconds +7: Time to load utils op: 0.2116541862487793 seconds +7: Time to load utils op: 0.21166563034057617 seconds +3: Time to load utils op: 0.0008625984191894531 seconds +3: Time to load utils op: 0.0010554790496826172 seconds +3: Time to load utils op: 0.0012013912200927734 seconds +3: Time to load utils op: 0.0013821125030517578 secondsTime to load utils op: 0.0012857913970947266 seconds +3: Time to load utils op: 0.001344919204711914 seconds +3: +3: Time to load utils op: 0.0013699531555175781 seconds +3: Time to load utils op: 0.0013837814331054688 seconds +2: Time to load utils op: 0.0009226799011230469 seconds +2: Time to load utils op: 0.0009694099426269531 seconds +2: Time to load utils op: 0.0013272762298583984 seconds +2: Time to load utils op: 0.0013096332550048828 seconds +2: Time to load utils op: 0.0012273788452148438 secondsTime to load utils op: 0.0011620521545410156 seconds +2: +2: Time to load utils op: 0.0013213157653808594 seconds +2: Time to load utils op: 0.001323699951171875 seconds +1: Time to load utils op: 0.0007865428924560547 seconds +1: Time to load utils op: 0.0008211135864257812 seconds +1: Time to load utils op: 0.0010268688201904297 seconds +1: Time to load utils op: 0.0010259151458740234 seconds +1: Time to load utils op: 0.0010461807250976562 seconds +1: Time to load utils op: 0.0009963512420654297 seconds +1: Time to load utils op: 0.001051187515258789 seconds +1: Time to load utils op: 0.001110076904296875 seconds +4: Time to load utils op: 0.0008680820465087891 seconds +4: Time to load utils op: 0.0012094974517822266 seconds +4: Time to load utils op: 0.0011000633239746094 seconds +4: Time to load utils op: 0.0011134147644042969 seconds +4: Time to load utils op: 0.001123666763305664 seconds +4: Time to load utils op: 0.0010728836059570312 seconds +4: Time to load utils op: 0.0010919570922851562 seconds +4: Time to load utils op: 0.0011775493621826172 seconds +5: Time to load utils op: 0.0009765625 seconds +7: Time to load utils op: 0.0009970664978027344 seconds +7: Time to load utils op: 0.0012590885162353516 seconds +7: Time to load utils op: 0.0013360977172851562 secondsTime to load utils op: 0.0013227462768554688 seconds +7: +7: Time to load utils op: 0.0012202262878417969 seconds +5: Time to load utils op: 0.0012919902801513672 seconds +5: Time to load utils op: 0.0012590885162353516 secondsTime to load utils op: 0.0012385845184326172 seconds +5: +7: Time to load utils op: 0.001230478286743164 seconds +7: Time to load utils op: 0.0013699531555175781 seconds +5: Time to load utils op: 0.0012965202331542969 seconds +5: Time to load utils op: 0.001257181167602539 seconds +5: Time to load utils op: 0.0012106895446777344 seconds +5: Time to load utils op: 0.0012316703796386719 seconds +7: Time to load utils op: 0.0003826618194580078 seconds +6: Time to load utils op: 0.0007281303405761719 seconds +6: Time to load utils op: 0.0007078647613525391 seconds +6: Time to load utils op: 0.0009222030639648438 seconds +6: Time to load utils op: 0.0009074211120605469 seconds +6: Time to load utils op: 0.0009479522705078125 seconds +6: Time to load utils op: 0.001085519790649414 secondsTime to load utils op: 0.0010528564453125 seconds +6: +6: Time to load utils op: 0.0010941028594970703 seconds +0: [2023-03-15 21:59:56,004] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-15 21:59:56,005] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.78 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-15 21:59:56,005] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,118] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-15 21:59:56,118] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-15 21:59:56,119] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,220] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-15 21:59:56,221] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-15 21:59:56,221] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,323] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-15 21:59:56,324] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,324] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,424] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-15 21:59:56,425] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,425] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,528] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-15 21:59:56,528] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,528] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,629] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-15 21:59:56,629] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,629] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,735] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-15 21:59:56,735] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,735] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,836] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-15 21:59:56,836] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 21:59:56,836] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.26 GB, percent = 6.4% +0: [2023-03-15 21:59:56,836] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-15 21:59:56,836] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-15 21:59:56,837] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-15 21:59:56,837] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-15 21:59:56,837] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-15 21:59:56,837] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-15 21:59:56,837] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-15 21:59:56,837] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-15 21:59:56,837] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-15 21:59:56,838] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-15 21:59:56,839] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-15 21:59:56,839] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00041937828063964844 seconds +0: [2023-03-15 21:59:56,840] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-15 21:59:56,902] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=25 [0, 25) STAGE_PARAMS=421207040 (421.207M) TOTAL_PARAMS=421207040 (421.207M) UNIQUE_PARAMS=421207040 (421.207M) +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_421m3b9400m +0: will not load any checkpoints and will start from random +6: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,907] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:59:56,908] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m3b9400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 9.26 +0: estimated model parameters: 0.42120704 +0: estimated model parameters without embeddings: 0.35419648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-15 21:59:57 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1922149 +0: validation: 2048 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.008117 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > WARNING: could not find index map files, building the indices on rank 0 ... +0: > last epoch number of samples (166245) is smaller than 95.0% of number of samples per epoch (195100), setting separate_last_epoch to True +0: > elasped time to build and save doc-idx mapping (seconds): 0.350806 +0: using: +0: number of documents: 835726 +0: number of epochs: 10 +0: sequence length: 2048 +0: total number of samples: 1951005 +0: > elasped time to build and save sample-idx mapping (seconds): 0.047800 +0: > building shuffle index with split [0, 1755904) and [1755904, 1951005) ... +0: > elasped time to build and save shuffle-idx mapping (seconds): 0.051496 +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1922149ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1922149ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1922149ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.013 seconds +0: total number of samples: 1951006 +0: total number of epochs: 10 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.055011 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2048ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2048ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2048ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.051 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-15 22:00:10 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 14750.76 | train/valid/test-data-iterators-setup: 13314.54 +0: [000-000] 0.4212B / 0.3542B +0: [before the start of training step] datetime: 2023-03-15 22:00:10 +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 4873.00537109375 | max allocated: 27817.7919921875 | reserved: 29816.0 | max reserved: 29816.0 +7: iteration 10/ 7508 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.51 | learning rate: 2.664E-05 | global batch size: 256 | lm loss: 1.006145E+01 | grad norm: 2.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.491 | TFLOPs: 16.16 | +7: iteration 20/ 7508 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.60 | learning rate: 5.328E-05 | global batch size: 256 | lm loss: 8.927153E+00 | grad norm: 1.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.612 | TFLOPs: 40.67 | +7: iteration 30/ 7508 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.58 | learning rate: 7.991E-05 | global batch size: 256 | lm loss: 8.233161E+00 | grad norm: 2.025 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.187 | TFLOPs: 41.87 | +7: iteration 40/ 7508 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.58 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 7.605612E+00 | grad norm: 0.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.127 | TFLOPs: 42.15 | +7: iteration 50/ 7508 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.58 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 7.268809E+00 | grad norm: 1.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.050 | TFLOPs: 42.05 | +7: iteration 60/ 7508 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.59 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 7.089064E+00 | grad norm: 0.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.419 | TFLOPs: 41.70 | +7: iteration 70/ 7508 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.58 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 6.917805E+00 | grad norm: 1.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.385 | TFLOPs: 42.08 | +7: iteration 80/ 7508 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.61 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.768745E+00 | grad norm: 0.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.222 | TFLOPs: 40.16 | +7: iteration 90/ 7508 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.653751E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.849 | TFLOPs: 41.27 | +7: iteration 100/ 7508 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.532082E+00 | grad norm: 0.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.024 | TFLOPs: 41.19 | +7: iteration 110/ 7508 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.464809E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.036 | TFLOPs: 40.52 | +7: iteration 120/ 7508 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.413338E+00 | grad norm: 0.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.248 | TFLOPs: 42.07 | +7: iteration 130/ 7508 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.371860E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.846 | TFLOPs: 42.51 | +7: iteration 140/ 7508 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.316917E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.171 | TFLOPs: 42.73 | +7: iteration 150/ 7508 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.277857E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.181 | TFLOPs: 41.20 | +7: iteration 160/ 7508 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.60 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.238290E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.482 | TFLOPs: 40.57 | +7: iteration 170/ 7508 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.206771E+00 | grad norm: 0.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.187 | TFLOPs: 41.59 | +7: iteration 180/ 7508 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.60 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.190221E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.491 | TFLOPs: 40.76 | +7: iteration 190/ 7508 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.142646E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.045 | TFLOPs: 41.86 | +7: iteration 200/ 7508 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.120612E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.361 | TFLOPs: 42.17 | +7: iteration 210/ 7508 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.080025E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.593 | TFLOPs: 41.53 | +7: iteration 220/ 7508 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.60 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 6.047094E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.321 | TFLOPs: 40.84 | +7: iteration 230/ 7508 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.60 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 6.034698E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.744 | TFLOPs: 40.49 | +7: iteration 240/ 7508 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.59 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.989299E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.736 | TFLOPs: 41.35 | +7: iteration 250/ 7508 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.950758E+00 | grad norm: 0.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.750 | TFLOPs: 42.02 | +7: iteration 260/ 7508 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.968780E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.571 | TFLOPs: 41.53 | +7: iteration 270/ 7508 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.60 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.935786E+00 | grad norm: 0.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.542 | TFLOPs: 40.48 | +7: iteration 280/ 7508 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.904284E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.439 | TFLOPs: 41.80 | +7: iteration 290/ 7508 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.59 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.862142E+00 | grad norm: 0.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.438 | TFLOPs: 41.23 | +7: iteration 300/ 7508 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.58 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.839970E+00 | grad norm: 0.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.376 | TFLOPs: 41.99 | +7: iteration 310/ 7508 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.58 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.801073E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.336 | TFLOPs: 41.89 | +7: iteration 320/ 7508 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.58 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.785889E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.638 | TFLOPs: 42.11 | +7: iteration 330/ 7508 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.739093E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.922 | TFLOPs: 42.51 | +7: iteration 340/ 7508 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.59 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 5.710794E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.532 | TFLOPs: 41.52 | +7: iteration 350/ 7508 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.57 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 5.696539E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.135 | TFLOPs: 42.53 | +7: iteration 360/ 7508 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.59 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 5.648029E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.240 | TFLOPs: 41.40 | +7: iteration 370/ 7508 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.58 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 5.636728E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.982 | TFLOPs: 42.14 | +7: iteration 380/ 7508 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.60 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 5.602863E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.446 | TFLOPs: 40.85 | +7: iteration 390/ 7508 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.59 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 5.559724E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.155 | TFLOPs: 41.49 | +7: iteration 400/ 7508 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.59 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 5.553529E+00 | grad norm: 0.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.543 | TFLOPs: 41.62 | +7: iteration 410/ 7508 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.57 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 5.511174E+00 | grad norm: 1.059 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.399 | TFLOPs: 42.46 | +7: iteration 420/ 7508 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.58 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 5.489057E+00 | grad norm: 0.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.449 | TFLOPs: 42.28 | +7: iteration 430/ 7508 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.58 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 5.457536E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.154 | TFLOPs: 42.35 | +7: iteration 440/ 7508 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.58 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 5.412500E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.153 | TFLOPs: 42.15 | +7: iteration 450/ 7508 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.58 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 5.413997E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.450 | TFLOPs: 41.99 | +7: iteration 460/ 7508 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.58 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 5.368180E+00 | grad norm: 0.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.033 | TFLOPs: 41.76 | +7: iteration 470/ 7508 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.57 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 5.350980E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.767 | TFLOPs: 43.07 | +7: iteration 480/ 7508 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.57 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 5.297159E+00 | grad norm: 0.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.061 | TFLOPs: 42.62 | +7: iteration 490/ 7508 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.59 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 5.258167E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.800 | TFLOPs: 41.26 | +7: iteration 500/ 7508 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.57 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 5.250610E+00 | grad norm: 0.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.325 | TFLOPs: 42.46 | +7: iteration 510/ 7508 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.60 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 5.214092E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.742 | TFLOPs: 40.78 | +7: iteration 520/ 7508 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.59 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 5.183879E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.677 | TFLOPs: 41.16 | +7: iteration 530/ 7508 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.59 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 5.140098E+00 | grad norm: 0.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.348 | TFLOPs: 41.32 | +7: iteration 540/ 7508 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.57 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 5.117105E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.102 | TFLOPs: 42.53 | +7: iteration 550/ 7508 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.60 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 5.107265E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.290 | TFLOPs: 40.74 | +7: iteration 560/ 7508 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.57 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 5.082716E+00 | grad norm: 0.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.376 | TFLOPs: 42.84 | +7: iteration 570/ 7508 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.59 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 5.050010E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.539 | TFLOPs: 41.24 | +7: iteration 580/ 7508 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.58 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 5.011089E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.129 | TFLOPs: 42.44 | +7: iteration 590/ 7508 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.59 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 5.003833E+00 | grad norm: 0.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.019 | TFLOPs: 41.19 | +7: iteration 600/ 7508 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.60 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.963654E+00 | grad norm: 0.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.220 | TFLOPs: 40.73 | +7: iteration 610/ 7508 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.60 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.936683E+00 | grad norm: 0.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.549 | TFLOPs: 40.95 | +7: iteration 620/ 7508 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.59 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.882063E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.032 | TFLOPs: 41.67 | +7: iteration 630/ 7508 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.57 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.870457E+00 | grad norm: 0.880 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.684 | TFLOPs: 42.97 | +7: iteration 640/ 7508 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.58 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.846832E+00 | grad norm: 0.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.782 | TFLOPs: 41.74 | +7: iteration 650/ 7508 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.58 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.796571E+00 | grad norm: 0.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.920 | TFLOPs: 42.23 | +7: iteration 660/ 7508 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.57 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.778519E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.962 | TFLOPs: 42.80 | +7: iteration 670/ 7508 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.58 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.737239E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.386 | TFLOPs: 42.37 | +7: iteration 680/ 7508 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.58 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.711980E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.193 | TFLOPs: 41.97 | +7: iteration 690/ 7508 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.59 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.681390E+00 | grad norm: 0.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.108 | TFLOPs: 41.67 | +7: iteration 700/ 7508 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.59 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.656599E+00 | grad norm: 0.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.183 | TFLOPs: 41.59 | +7: iteration 710/ 7508 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.58 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.645308E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.858 | TFLOPs: 42.41 | +7: iteration 720/ 7508 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.59 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.602104E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.957 | TFLOPs: 41.56 | +7: iteration 730/ 7508 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.58 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.570047E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.405 | TFLOPs: 41.89 | +7: iteration 740/ 7508 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.57 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.560801E+00 | grad norm: 0.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.929 | TFLOPs: 43.09 | +7: iteration 750/ 7508 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.58 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.552834E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.314 | TFLOPs: 42.07 | +7: iteration 760/ 7508 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.59 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.513063E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.020 | TFLOPs: 41.19 | +7: iteration 770/ 7508 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.505115E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.721 | TFLOPs: 43.35 | +7: iteration 780/ 7508 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.58 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.502512E+00 | grad norm: 0.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.739 | TFLOPs: 42.40 | +7: iteration 790/ 7508 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.58 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.480969E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.415 | TFLOPs: 41.99 | +7: iteration 800/ 7508 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.57 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.462569E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.092 | TFLOPs: 43.01 | +7: iteration 810/ 7508 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.57 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.439249E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.742 | TFLOPs: 42.88 | +7: iteration 820/ 7508 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.58 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.420035E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.378 | TFLOPs: 41.89 | +7: iteration 830/ 7508 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.57 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.419695E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.025 | TFLOPs: 42.71 | +7: iteration 840/ 7508 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.57 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.398795E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.549 | TFLOPs: 42.67 | +7: iteration 850/ 7508 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.61 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.374532E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 419.666 | TFLOPs: 40.01 | +7: iteration 860/ 7508 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.60 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.385594E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.704 | TFLOPs: 40.40 | +7: iteration 870/ 7508 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.58 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.381159E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.904 | TFLOPs: 42.13 | +7: iteration 880/ 7508 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.57 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.350595E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.156 | TFLOPs: 42.73 | +7: iteration 890/ 7508 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.58 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.327864E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.077 | TFLOPs: 41.86 | +7: iteration 900/ 7508 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.58 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.333812E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.741 | TFLOPs: 41.92 | +7: iteration 910/ 7508 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.58 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.313394E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.510 | TFLOPs: 42.09 | +7: iteration 920/ 7508 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.58 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.301981E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.671 | TFLOPs: 42.01 | +7: iteration 930/ 7508 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.59 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.290026E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.344 | TFLOPs: 41.51 | +7: iteration 940/ 7508 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.58 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.302665E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.256 | TFLOPs: 42.35 | +7: iteration 950/ 7508 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.267967E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.807 | TFLOPs: 41.74 | +7: iteration 960/ 7508 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.56 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.261798E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.953 | TFLOPs: 43.66 | +7: iteration 970/ 7508 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.59 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.258514E+00 | grad norm: 0.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.191 | TFLOPs: 41.40 | +7: iteration 980/ 7508 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.57 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.252273E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.545 | TFLOPs: 42.86 | +7: iteration 990/ 7508 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.58 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.236480E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.061 | TFLOPs: 42.43 | +7: iteration 1000/ 7508 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.58 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.221117E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.544 | TFLOPs: 42.29 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 4.226662E+00 | lm loss PPL: 6.848821E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_421m3b9400m +0: [2023-03-15 22:10:03,390] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-15 22:10:03,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:10:03,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:10:03,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:10:03,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:10:03,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:10:03,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:10:03,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:10:03,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:10:03,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:10:03,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:10:03,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:10:03,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:10:03,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:10:03,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:10:03,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:10:03,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:10:03,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:10:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:10:03,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:10:03,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:10:03,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:10:03,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:10:03,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:10:04,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:10:04,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:10:04,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:10:04,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:10:04,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:10:04,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:10:04,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:10:04,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:10:04,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:10:04,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:10:04,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:10:04,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:10:04,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:10:04,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:10:04,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:10:04,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:10:04,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:10:04,311] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-15 22:10:04,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:10:04,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:10:04,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:10:04,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:10:04,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:10:04,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:10:04,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:10:04,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:10:04,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:10:04,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:10:04,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:10:04,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:10:04,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:10:04,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:10:04,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:10:04,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:10:04,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:10:04,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:10:04,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:10:04,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:10:04,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:10:04,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:10:04,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:10:04,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:10:04,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:10:04,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:10:04,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:10:04,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:10:04,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1192.58 +7: iteration 1010/ 7508 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.72 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.215424E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 355.514 | TFLOPs: 33.89 | +7: iteration 1020/ 7508 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.57 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.206202E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.046 | TFLOPs: 42.91 | +7: iteration 1030/ 7508 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.59 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.199915E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.167 | TFLOPs: 41.58 | +7: iteration 1040/ 7508 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.57 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.197660E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.072 | TFLOPs: 42.91 | +7: iteration 1050/ 7508 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.57 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.193019E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.396 | TFLOPs: 43.04 | +7: iteration 1060/ 7508 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.59 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.171957E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.187 | TFLOPs: 41.49 | +7: iteration 1070/ 7508 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.57 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.176270E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.445 | TFLOPs: 42.66 | +7: iteration 1080/ 7508 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.57 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.182755E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.743 | TFLOPs: 42.59 | +7: iteration 1090/ 7508 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.56 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.167914E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.366 | TFLOPs: 43.22 | +7: iteration 1100/ 7508 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.59 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.170648E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.690 | TFLOPs: 41.35 | +7: iteration 1110/ 7508 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.59 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.158697E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.626 | TFLOPs: 41.53 | +7: iteration 1120/ 7508 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.58 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.129662E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.371 | TFLOPs: 42.18 | +7: iteration 1130/ 7508 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.58 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.117728E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.136 | TFLOPs: 42.25 | +7: iteration 1140/ 7508 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.58 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.128640E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.075 | TFLOPs: 42.15 | +7: iteration 1150/ 7508 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.123708E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.494 | TFLOPs: 43.24 | +7: iteration 1160/ 7508 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.57 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.110278E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.843 | TFLOPs: 42.89 | +7: iteration 1170/ 7508 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.58 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.105482E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.167 | TFLOPs: 42.25 | +7: iteration 1180/ 7508 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.57 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.102699E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.386 | TFLOPs: 42.56 | +7: iteration 1190/ 7508 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.56 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.084481E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.873 | TFLOPs: 43.46 | +7: iteration 1200/ 7508 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.59 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.101614E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.475 | TFLOPs: 41.71 | +7: iteration 1210/ 7508 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.57 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.070854E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.738 | TFLOPs: 42.50 | +7: iteration 1220/ 7508 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.57 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.085820E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.873 | TFLOPs: 42.89 | +7: iteration 1230/ 7508 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.058322E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.486 | TFLOPs: 43.23 | +7: iteration 1240/ 7508 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.57 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 4.052719E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.904 | TFLOPs: 42.61 | +7: iteration 1250/ 7508 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.58 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.069592E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.127 | TFLOPs: 42.15 | +7: iteration 1260/ 7508 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.58 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.057657E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.537 | TFLOPs: 42.29 | +7: iteration 1270/ 7508 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.57 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 4.040623E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.836 | TFLOPs: 42.79 | +7: iteration 1280/ 7508 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 4.040840E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.087 | TFLOPs: 43.39 | +7: iteration 1290/ 7508 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 4.022441E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.996 | TFLOPs: 43.76 | +7: iteration 1300/ 7508 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.57 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 4.018026E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.696 | TFLOPs: 42.78 | +7: iteration 1310/ 7508 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.57 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 4.030741E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.028 | TFLOPs: 43.19 | +7: iteration 1320/ 7508 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.57 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 4.012928E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.509 | TFLOPs: 43.05 | +7: iteration 1330/ 7508 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.58 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 4.000174E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.375 | TFLOPs: 41.89 | +7: iteration 1340/ 7508 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.58 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 4.012724E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.211 | TFLOPs: 42.06 | +7: iteration 1350/ 7508 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.999617E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.064 | TFLOPs: 43.29 | +7: iteration 1360/ 7508 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.57 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 4.001517E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.552 | TFLOPs: 42.48 | +7: iteration 1370/ 7508 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.57 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.999969E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.629 | TFLOPs: 43.06 | +7: iteration 1380/ 7508 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.59 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.985201E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.275 | TFLOPs: 41.50 | +7: iteration 1390/ 7508 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.59 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.974304E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.361 | TFLOPs: 41.13 | +7: iteration 1400/ 7508 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.57 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.961385E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.318 | TFLOPs: 42.55 | +7: iteration 1410/ 7508 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.57 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.964062E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.098 | TFLOPs: 42.72 | +7: iteration 1420/ 7508 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.57 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.987347E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.844 | TFLOPs: 43.17 | +7: iteration 1430/ 7508 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.57 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.971365E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.087 | TFLOPs: 42.72 | +7: iteration 1440/ 7508 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.57 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.969248E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.492 | TFLOPs: 42.57 | +7: iteration 1450/ 7508 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.57 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.955738E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.590 | TFLOPs: 42.96 | +7: iteration 1460/ 7508 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.58 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.954625E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.922 | TFLOPs: 42.23 | +7: iteration 1470/ 7508 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.57 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.946361E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.437 | TFLOPs: 42.66 | +7: iteration 1480/ 7508 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.57 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.951626E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.339 | TFLOPs: 42.46 | +7: iteration 1490/ 7508 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.57 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.941691E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.848 | TFLOPs: 42.89 | +7: iteration 1500/ 7508 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.57 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.945967E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.721 | TFLOPs: 42.49 | +7: iteration 1510/ 7508 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.57 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.928553E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.587 | TFLOPs: 42.48 | +7: iteration 1520/ 7508 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.58 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.939030E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.321 | TFLOPs: 42.27 | +7: iteration 1530/ 7508 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.57 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.925671E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.360 | TFLOPs: 42.65 | +7: iteration 1540/ 7508 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.58 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.932888E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.717 | TFLOPs: 42.40 | +7: iteration 1550/ 7508 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.57 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.915633E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.661 | TFLOPs: 43.06 | +7: iteration 1560/ 7508 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.58 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.917697E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.946 | TFLOPs: 42.42 | +7: iteration 1570/ 7508 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.58 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.911370E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.016 | TFLOPs: 41.86 | +7: iteration 1580/ 7508 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.57 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.901250E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.843 | TFLOPs: 42.51 | +7: iteration 1590/ 7508 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.58 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.883964E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.000 | TFLOPs: 42.14 | +7: iteration 1600/ 7508 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.58 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.882514E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.559 | TFLOPs: 42.10 | +7: iteration 1610/ 7508 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.892468E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.698 | TFLOPs: 42.97 | +7: iteration 1620/ 7508 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.58 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.890338E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.198 | TFLOPs: 42.16 | +7: iteration 1630/ 7508 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.58 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.893804E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.899 | TFLOPs: 42.23 | +7: iteration 1640/ 7508 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.58 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.881180E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.659 | TFLOPs: 42.01 | +7: iteration 1650/ 7508 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.57 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.865858E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.908 | TFLOPs: 42.99 | +7: iteration 1660/ 7508 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.58 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.881733E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.702 | TFLOPs: 42.40 | +7: iteration 1670/ 7508 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.57 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.884261E+00 | grad norm: 0.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.136 | TFLOPs: 42.82 | +7: iteration 1680/ 7508 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.57 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.876353E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.506 | TFLOPs: 42.57 | +7: iteration 1690/ 7508 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.57 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.878365E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.955 | TFLOPs: 42.80 | +7: iteration 1700/ 7508 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.59 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.863544E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.545 | TFLOPs: 41.72 | +7: iteration 1710/ 7508 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.59 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.856571E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.233 | TFLOPs: 41.69 | +7: iteration 1720/ 7508 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.60 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.845264E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.726 | TFLOPs: 40.87 | +7: iteration 1730/ 7508 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.57 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.864864E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.610 | TFLOPs: 42.77 | +7: iteration 1740/ 7508 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.58 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.858142E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.681 | TFLOPs: 42.11 | +7: iteration 1750/ 7508 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.59 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.849133E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.922 | TFLOPs: 41.27 | +7: iteration 1760/ 7508 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.57 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.846405E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.225 | TFLOPs: 43.11 | +7: iteration 1770/ 7508 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.57 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.834338E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.105 | TFLOPs: 43.01 | +7: iteration 1780/ 7508 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.58 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.835206E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.635 | TFLOPs: 42.01 | +7: iteration 1790/ 7508 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.56 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.824564E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.977 | TFLOPs: 43.28 | +7: iteration 1800/ 7508 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.820828E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.982 | TFLOPs: 43.28 | +7: iteration 1810/ 7508 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.58 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.821827E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.037 | TFLOPs: 42.43 | +7: iteration 1820/ 7508 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.57 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.854585E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.170 | TFLOPs: 42.54 | +7: iteration 1830/ 7508 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.56 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.837045E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.481 | TFLOPs: 43.23 | +7: iteration 1840/ 7508 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.57 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.839314E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.672 | TFLOPs: 42.97 | +7: iteration 1850/ 7508 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.58 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.804906E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.868 | TFLOPs: 41.75 | +7: iteration 1860/ 7508 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.57 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.824200E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.814 | TFLOPs: 42.69 | +7: iteration 1870/ 7508 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.823698E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.512 | TFLOPs: 43.33 | +7: iteration 1880/ 7508 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.57 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.816482E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.201 | TFLOPs: 43.11 | +7: iteration 1890/ 7508 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.56 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.805994E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.389 | TFLOPs: 43.23 | +7: iteration 1900/ 7508 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.57 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.821275E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.538 | TFLOPs: 43.05 | +7: iteration 1910/ 7508 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.56 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.778225E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.367 | TFLOPs: 43.51 | +7: iteration 1920/ 7508 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.57 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.797107E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.505 | TFLOPs: 42.57 | +7: iteration 1930/ 7508 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.57 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.787389E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.235 | TFLOPs: 42.54 | +7: iteration 1940/ 7508 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.56 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.800018E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.557 | TFLOPs: 43.72 | +7: iteration 1950/ 7508 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.58 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.798354E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.651 | TFLOPs: 42.39 | +7: iteration 1960/ 7508 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.57 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.788736E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.940 | TFLOPs: 42.71 | +7: iteration 1970/ 7508 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.56 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.788472E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.248 | TFLOPs: 43.50 | +7: iteration 1980/ 7508 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.57 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.786220E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.164 | TFLOPs: 42.82 | +7: iteration 1990/ 7508 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.56 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.778287E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.914 | TFLOPs: 43.28 | +0: [2023-03-15 22:19:37,844] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00017182361507925355, 0.00017182361507925355, 0.00017182361507925355], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 7508 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.56 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.762609E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.530 | TFLOPs: 43.72 | +0: steps: 2000 loss: 3.7898 iter time (s): 0.581 samples/sec: 440.788 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 3.753593E+00 | lm loss PPL: 4.267415E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_421m3b9400m +0: [2023-03-15 22:19:38,053] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-15 22:19:38,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:19:38,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:19:38,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:19:38,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:19:38,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:19:38,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:19:38,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:19:38,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:19:38,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:19:38,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:19:38,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:19:38,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:19:38,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:19:38,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:19:38,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:19:38,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:19:38,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:19:38,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:19:38,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:19:38,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:19:38,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:19:38,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:19:38,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:19:38,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:19:38,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:19:38,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:19:38,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:19:38,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:19:38,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:19:38,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:19:38,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:19:38,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:19:38,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:19:38,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:19:38,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:19:38,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:19:38,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:19:38,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:19:38,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:19:38,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:19:38,908] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-15 22:19:38,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:19:38,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:38,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:39,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:39,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:19:39,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:39,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:39,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:19:39,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:39,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:39,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:19:39,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:39,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:39,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:19:39,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:39,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:39,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:19:39,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:39,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:19:39,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:39,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:39,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:19:39,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:39,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:39,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:19:39,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:39,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:39,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1124.07 +7: iteration 2010/ 7508 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.69 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.773200E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 368.656 | TFLOPs: 35.15 | +7: iteration 2020/ 7508 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.58 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.761010E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.823 | TFLOPs: 42.41 | +7: iteration 2030/ 7508 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.57 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.754540E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.928 | TFLOPs: 43.18 | +7: iteration 2040/ 7508 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.57 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.759930E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.725 | TFLOPs: 42.97 | +7: iteration 2050/ 7508 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.58 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.771344E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.721 | TFLOPs: 42.40 | +7: iteration 2060/ 7508 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.56 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.746180E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.367 | TFLOPs: 43.70 | +7: iteration 2070/ 7508 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.58 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.758833E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.527 | TFLOPs: 41.90 | +7: iteration 2080/ 7508 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.57 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.752855E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.044 | TFLOPs: 42.62 | +7: iteration 2090/ 7508 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.57 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.750624E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.566 | TFLOPs: 43.05 | +7: iteration 2100/ 7508 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.56 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.747837E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.843 | TFLOPs: 43.46 | +7: iteration 2110/ 7508 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.57 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.738166E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.187 | TFLOPs: 42.83 | +7: iteration 2120/ 7508 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.56 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.736009E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.773 | TFLOPs: 43.74 | +7: iteration 2130/ 7508 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.56 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.750104E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.185 | TFLOPs: 43.30 | +7: iteration 2140/ 7508 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.56 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.737434E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.470 | TFLOPs: 43.23 | +7: iteration 2150/ 7508 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.57 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.731372E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.108 | TFLOPs: 43.10 | +7: iteration 2160/ 7508 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.57 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.732676E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.307 | TFLOPs: 42.84 | +7: iteration 2170/ 7508 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.57 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.748211E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.483 | TFLOPs: 43.14 | +7: iteration 2180/ 7508 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.57 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.736808E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.545 | TFLOPs: 43.05 | +7: iteration 2190/ 7508 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.57 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.730737E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.031 | TFLOPs: 43.10 | +7: iteration 2200/ 7508 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.57 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.726586E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.597 | TFLOPs: 42.96 | +7: iteration 2210/ 7508 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.57 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.725328E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.642 | TFLOPs: 43.15 | +7: iteration 2220/ 7508 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.57 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.718322E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.191 | TFLOPs: 42.92 | +7: iteration 2230/ 7508 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.56 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.735183E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.272 | TFLOPs: 43.31 | +7: iteration 2240/ 7508 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.56 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.735881E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.999 | TFLOPs: 43.76 | +7: iteration 2250/ 7508 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.56 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.716405E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.598 | TFLOPs: 43.72 | +7: iteration 2260/ 7508 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.56 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.728008E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.447 | TFLOPs: 43.23 | +7: iteration 2270/ 7508 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.57 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.703832E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.295 | TFLOPs: 43.12 | +7: iteration 2280/ 7508 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.57 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.714517E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.350 | TFLOPs: 43.13 | +7: iteration 2290/ 7508 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.56 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.706293E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.570 | TFLOPs: 43.24 | +7: iteration 2300/ 7508 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.56 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.701373E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.789 | TFLOPs: 43.36 | +7: iteration 2310/ 7508 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.57 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.695753E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.113 | TFLOPs: 42.72 | +7: iteration 2320/ 7508 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.57 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.700679E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.757 | TFLOPs: 42.97 | +7: iteration 2330/ 7508 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.58 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.694535E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.190 | TFLOPs: 41.97 | +7: iteration 2340/ 7508 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.58 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.708411E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.712 | TFLOPs: 42.11 | +7: iteration 2350/ 7508 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.56 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.701344E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.934 | TFLOPs: 43.28 | +7: iteration 2360/ 7508 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.57 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.727740E+00 | grad norm: 1.087 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.878 | TFLOPs: 43.08 | +7: iteration 2370/ 7508 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.57 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 4.005746E+00 | grad norm: 3.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.848 | TFLOPs: 42.70 | +7: iteration 2380/ 7508 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.56 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 4.223266E+00 | grad norm: 2.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.452 | TFLOPs: 43.42 | +7: iteration 2390/ 7508 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.57 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.970057E+00 | grad norm: 1.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.217 | TFLOPs: 43.11 | +7: iteration 2400/ 7508 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.57 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.867175E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.972 | TFLOPs: 43.09 | +7: iteration 2410/ 7508 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.57 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.781541E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.016 | TFLOPs: 43.09 | +7: iteration 2420/ 7508 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.736027E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.769 | TFLOPs: 43.55 | +7: iteration 2430/ 7508 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.56 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.744676E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.372 | TFLOPs: 43.70 | +7: iteration 2440/ 7508 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.56 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.724743E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.327 | TFLOPs: 43.41 | +7: iteration 2450/ 7508 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.710573E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.818 | TFLOPs: 43.74 | +7: iteration 2460/ 7508 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.58 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.680960E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.569 | TFLOPs: 42.38 | +7: iteration 2470/ 7508 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.56 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.689943E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.003 | TFLOPs: 43.76 | +7: iteration 2480/ 7508 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.57 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.689799E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.948 | TFLOPs: 43.18 | +7: iteration 2490/ 7508 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.56 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.687939E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.405 | TFLOPs: 43.70 | +7: iteration 2500/ 7508 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.56 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.693857E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.600 | TFLOPs: 43.72 | +7: iteration 2510/ 7508 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.56 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.669497E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.696 | TFLOPs: 43.45 | +7: iteration 2520/ 7508 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.56 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.670699E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.808 | TFLOPs: 43.46 | +7: iteration 2530/ 7508 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.56 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.668615E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.862 | TFLOPs: 43.75 | +7: iteration 2540/ 7508 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.56 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.685891E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.269 | TFLOPs: 43.31 | +7: iteration 2550/ 7508 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.56 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.663092E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.585 | TFLOPs: 43.44 | +7: iteration 2560/ 7508 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.56 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.669876E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.995 | TFLOPs: 43.38 | +7: iteration 2570/ 7508 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.56 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.668943E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.434 | TFLOPs: 43.23 | +7: iteration 2580/ 7508 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.56 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.667421E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.672 | TFLOPs: 43.73 | +7: iteration 2590/ 7508 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.57 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.663770E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.931 | TFLOPs: 43.18 | +7: iteration 2600/ 7508 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.56 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.665194E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.818 | TFLOPs: 43.65 | +7: iteration 2610/ 7508 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.57 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.671231E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.459 | TFLOPs: 42.85 | +7: iteration 2620/ 7508 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.56 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.651331E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.102 | TFLOPs: 43.77 | +7: iteration 2630/ 7508 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.56 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.657736E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.918 | TFLOPs: 43.75 | +7: iteration 2640/ 7508 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.57 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.648739E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.130 | TFLOPs: 42.82 | +7: iteration 2650/ 7508 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.56 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.661346E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.533 | TFLOPs: 43.43 | +7: iteration 2660/ 7508 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.56 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.659969E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.368 | TFLOPs: 43.70 | +7: iteration 2670/ 7508 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.56 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.645971E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.243 | TFLOPs: 43.78 | +7: iteration 2680/ 7508 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.56 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.645935E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.938 | TFLOPs: 43.75 | +7: iteration 2690/ 7508 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.647491E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.617 | TFLOPs: 43.72 | +7: iteration 2700/ 7508 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.56 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.640040E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.014 | TFLOPs: 43.29 | +7: iteration 2710/ 7508 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.57 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.624057E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.171 | TFLOPs: 42.82 | +7: iteration 2720/ 7508 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.56 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.660926E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.506 | TFLOPs: 43.43 | +7: iteration 2730/ 7508 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.56 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.639190E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.346 | TFLOPs: 43.60 | +7: iteration 2740/ 7508 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.643328E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.615 | TFLOPs: 43.72 | +7: iteration 2750/ 7508 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.56 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.641146E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.733 | TFLOPs: 43.74 | +7: iteration 2760/ 7508 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.57 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.631690E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.005 | TFLOPs: 42.62 | +7: iteration 2770/ 7508 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.57 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.636833E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.940 | TFLOPs: 42.52 | +7: iteration 2780/ 7508 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.57 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.622952E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.677 | TFLOPs: 42.97 | +7: iteration 2790/ 7508 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.56 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.620234E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.052 | TFLOPs: 43.77 | +7: iteration 2800/ 7508 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.56 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.631149E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.021 | TFLOPs: 43.76 | +7: iteration 2810/ 7508 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.56 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.621072E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.908 | TFLOPs: 43.75 | +7: iteration 2820/ 7508 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.56 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.617939E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.120 | TFLOPs: 43.20 | +7: iteration 2830/ 7508 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.56 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.615345E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.928 | TFLOPs: 43.75 | +7: iteration 2840/ 7508 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.56 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.612407E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.689 | TFLOPs: 43.73 | +7: iteration 2850/ 7508 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.56 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.613616E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.544 | TFLOPs: 43.43 | +7: iteration 2860/ 7508 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.57 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.613719E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.688 | TFLOPs: 43.16 | +7: iteration 2870/ 7508 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.56 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.612329E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.774 | TFLOPs: 43.64 | +7: iteration 2880/ 7508 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.56 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.602702E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.254 | TFLOPs: 43.50 | +7: iteration 2890/ 7508 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.56 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.596695E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.687 | TFLOPs: 43.64 | +7: iteration 2900/ 7508 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.56 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.608396E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.466 | TFLOPs: 43.33 | +7: iteration 2910/ 7508 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.56 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.617682E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.517 | TFLOPs: 43.43 | +7: iteration 2920/ 7508 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.56 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.605019E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.854 | TFLOPs: 43.75 | +7: iteration 2930/ 7508 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.57 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.607066E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.103 | TFLOPs: 42.82 | +7: iteration 2940/ 7508 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.57 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.606855E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.588 | TFLOPs: 42.67 | +7: iteration 2950/ 7508 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.56 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.586607E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.504 | TFLOPs: 43.71 | +7: iteration 2960/ 7508 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.56 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.597160E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.143 | TFLOPs: 43.30 | +7: iteration 2970/ 7508 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.56 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.600301E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.784 | TFLOPs: 43.26 | +7: iteration 2980/ 7508 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.56 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.602338E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.823 | TFLOPs: 43.74 | +7: iteration 2990/ 7508 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.56 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.597633E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.532 | TFLOPs: 43.72 | +7: iteration 3000/ 7508 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.56 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.570630E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.524 | TFLOPs: 43.72 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 3.599472E+00 | lm loss PPL: 3.657893E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_421m3b9400m +0: [2023-03-15 22:29:03,387] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-15 22:29:03,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:29:03,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:29:03,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:29:03,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:29:03,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:29:03,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:29:03,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:29:03,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:29:03,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:29:03,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:29:03,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:29:03,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:29:03,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:29:03,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:29:03,769] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:29:03,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:29:03,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:29:03,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:29:03,849] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:29:03,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:29:03,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:29:03,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:29:03,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:29:03,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:29:03,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:29:04,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:29:04,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:29:04,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:29:04,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:29:04,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:29:04,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:29:04,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:29:04,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:29:04,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:29:04,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:29:04,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:29:04,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:29:04,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:29:04,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:29:04,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:29:04,250] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-15 22:29:04,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:29:04,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:29:04,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:29:04,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,370] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:29:04,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:29:04,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:29:04,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:29:04,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:29:04,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:29:04,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:29:04,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:29:04,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:29:04,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:29:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:29:04,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:29:04,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:29:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:29:04,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:29:04,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:29:04,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:29:04,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:29:04,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:29:04,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:29:04,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:29:04,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:29:04,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:29:04,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:29:04,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1126.95 +7: iteration 3010/ 7508 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.69 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.597546E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 369.945 | TFLOPs: 35.27 | +7: iteration 3020/ 7508 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.58 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.594525E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.593 | TFLOPs: 42.10 | +7: iteration 3030/ 7508 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.56 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.581987E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.693 | TFLOPs: 43.64 | +7: iteration 3040/ 7508 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.57 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.600139E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.431 | TFLOPs: 42.94 | +7: iteration 3050/ 7508 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.57 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.586633E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.002 | TFLOPs: 43.09 | +7: iteration 3060/ 7508 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.56 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.596339E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.719 | TFLOPs: 43.73 | +7: iteration 3070/ 7508 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.56 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.599325E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.691 | TFLOPs: 43.73 | +7: iteration 3080/ 7508 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.56 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.580317E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.154 | TFLOPs: 43.30 | +7: iteration 3090/ 7508 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.58 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.585961E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.386 | TFLOPs: 41.89 | +7: iteration 3100/ 7508 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.56 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.577980E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.127 | TFLOPs: 43.39 | +7: iteration 3110/ 7508 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.56 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.581285E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.203 | TFLOPs: 43.68 | +7: iteration 3120/ 7508 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.57 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.584850E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.044 | TFLOPs: 43.19 | +7: iteration 3130/ 7508 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.57 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.583200E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.648 | TFLOPs: 42.87 | +7: iteration 3140/ 7508 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.56 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.573369E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.383 | TFLOPs: 43.42 | +7: iteration 3150/ 7508 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.56 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.569590E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.225 | TFLOPs: 43.59 | +7: iteration 3160/ 7508 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.56 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.576242E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.773 | TFLOPs: 43.64 | +7: iteration 3170/ 7508 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.56 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.573221E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.623 | TFLOPs: 43.72 | +7: iteration 3180/ 7508 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.56 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.566383E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.133 | TFLOPs: 43.20 | +7: iteration 3190/ 7508 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.57 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.570203E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.187 | TFLOPs: 42.83 | +7: iteration 3200/ 7508 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.56 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.570334E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.704 | TFLOPs: 43.73 | +7: iteration 3210/ 7508 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.56 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.554629E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.033 | TFLOPs: 43.38 | +7: iteration 3220/ 7508 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.56 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.554605E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.434 | TFLOPs: 43.42 | +7: iteration 3230/ 7508 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.56 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.568313E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.413 | TFLOPs: 43.42 | +7: iteration 3240/ 7508 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.57 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.565379E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.104 | TFLOPs: 42.72 | +7: iteration 3250/ 7508 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.56 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.556808E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.669 | TFLOPs: 43.73 | +7: iteration 3260/ 7508 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.56 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.553951E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.449 | TFLOPs: 43.42 | +7: iteration 3270/ 7508 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.57 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.565692E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.270 | TFLOPs: 42.74 | +7: iteration 3280/ 7508 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.56 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.561080E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.486 | TFLOPs: 43.43 | +7: iteration 3290/ 7508 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.60 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.555170E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.823 | TFLOPs: 40.98 | +7: iteration 3300/ 7508 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.58 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.549337E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.199 | TFLOPs: 41.97 | +7: iteration 3310/ 7508 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.57 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.553089E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.800 | TFLOPs: 43.17 | +7: iteration 3320/ 7508 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.56 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.558733E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.775 | TFLOPs: 43.26 | +7: iteration 3330/ 7508 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.57 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.557674E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.163 | TFLOPs: 42.92 | +7: iteration 3340/ 7508 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.57 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.546249E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.208 | TFLOPs: 43.11 | +7: iteration 3350/ 7508 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.56 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.547253E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.845 | TFLOPs: 43.36 | +7: iteration 3360/ 7508 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.56 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.543342E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.658 | TFLOPs: 43.25 | +7: iteration 3370/ 7508 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.56 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.528561E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.822 | TFLOPs: 43.27 | +7: iteration 3380/ 7508 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.56 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.545770E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.111 | TFLOPs: 43.29 | +7: iteration 3390/ 7508 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.56 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.538339E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.451 | TFLOPs: 43.52 | +7: iteration 3400/ 7508 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.56 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.522427E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.410 | TFLOPs: 43.42 | +7: iteration 3410/ 7508 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.56 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.547684E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.861 | TFLOPs: 43.75 | +7: iteration 3420/ 7508 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.56 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.537737E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.376 | TFLOPs: 43.22 | +7: iteration 3430/ 7508 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.56 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.532629E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.448 | TFLOPs: 43.23 | +7: iteration 3440/ 7508 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.57 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.535319E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.309 | TFLOPs: 42.65 | +7: iteration 3450/ 7508 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.56 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.537677E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.675 | TFLOPs: 43.63 | +7: iteration 3460/ 7508 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.56 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.543310E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.400 | TFLOPs: 43.23 | +7: iteration 3470/ 7508 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.56 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.533130E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.682 | TFLOPs: 43.73 | +7: iteration 3480/ 7508 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.56 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.536364E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.617 | TFLOPs: 43.63 | +7: iteration 3490/ 7508 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.56 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.531052E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.133 | TFLOPs: 43.39 | +7: iteration 3500/ 7508 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.56 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.523325E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.959 | TFLOPs: 43.47 | +7: iteration 3510/ 7508 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.57 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.547144E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.873 | TFLOPs: 42.80 | +7: iteration 3520/ 7508 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.58 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.526099E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.914 | TFLOPs: 41.75 | +7: iteration 3530/ 7508 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.56 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.528129E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.876 | TFLOPs: 43.46 | +7: iteration 3540/ 7508 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.57 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.540895E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.262 | TFLOPs: 42.93 | +7: iteration 3550/ 7508 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.56 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.532552E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.813 | TFLOPs: 43.74 | +7: iteration 3560/ 7508 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.58 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.521117E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.469 | TFLOPs: 42.18 | +7: iteration 3570/ 7508 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.57 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.531847E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.965 | TFLOPs: 42.52 | +7: iteration 3580/ 7508 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.56 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.523979E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.173 | TFLOPs: 43.40 | +7: iteration 3590/ 7508 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.56 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.534419E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.421 | TFLOPs: 43.61 | +7: iteration 3600/ 7508 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.56 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.527007E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.384 | TFLOPs: 43.23 | +7: iteration 3610/ 7508 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.56 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.512424E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.495 | TFLOPs: 43.71 | +7: iteration 3620/ 7508 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.56 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.523959E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.622 | TFLOPs: 43.72 | +7: iteration 3630/ 7508 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.56 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.519453E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.911 | TFLOPs: 43.75 | +7: iteration 3640/ 7508 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.57 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.516512E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.062 | TFLOPs: 43.19 | +7: iteration 3650/ 7508 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.57 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.504018E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.189 | TFLOPs: 43.11 | +7: iteration 3660/ 7508 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.56 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.516206E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.925 | TFLOPs: 43.28 | +7: iteration 3670/ 7508 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.56 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.504586E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.305 | TFLOPs: 43.60 | +7: iteration 3680/ 7508 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.57 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.502142E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.629 | TFLOPs: 42.68 | +7: iteration 3690/ 7508 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.56 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.516582E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.570 | TFLOPs: 43.72 | +7: iteration 3700/ 7508 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.57 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.508198E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.834 | TFLOPs: 42.98 | +7: iteration 3710/ 7508 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.56 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.527278E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.015 | TFLOPs: 43.76 | +7: iteration 3720/ 7508 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.56 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.514952E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.991 | TFLOPs: 43.28 | +7: iteration 3730/ 7508 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.57 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.499903E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.937 | TFLOPs: 43.18 | +7: iteration 3740/ 7508 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.57 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.490654E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.517 | TFLOPs: 43.14 | +7: iteration 3750/ 7508 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.56 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.497351E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.478 | TFLOPs: 43.23 | +7: iteration 3760/ 7508 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.57 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.508524E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.618 | TFLOPs: 42.96 | +7: iteration 3770/ 7508 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.56 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.487280E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.602 | TFLOPs: 43.72 | +7: iteration 3780/ 7508 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.56 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.509847E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.781 | TFLOPs: 43.74 | +7: iteration 3790/ 7508 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.56 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.512749E+00 | grad norm: 0.883 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.021 | TFLOPs: 43.29 | +7: iteration 3800/ 7508 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.56 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.510681E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.233 | TFLOPs: 43.40 | +7: iteration 3810/ 7508 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.57 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.504979E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.903 | TFLOPs: 43.18 | +7: iteration 3820/ 7508 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.56 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.509956E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.542 | TFLOPs: 43.72 | +7: iteration 3830/ 7508 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.56 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.493198E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.508 | TFLOPs: 43.71 | +7: iteration 3840/ 7508 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.56 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.496402E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.538 | TFLOPs: 43.72 | +7: iteration 3850/ 7508 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.57 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.499805E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.868 | TFLOPs: 42.99 | +7: iteration 3860/ 7508 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.56 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.496325E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.565 | TFLOPs: 43.72 | +7: iteration 3870/ 7508 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.56 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.493738E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.746 | TFLOPs: 43.74 | +7: iteration 3880/ 7508 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.56 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.501098E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.701 | TFLOPs: 43.73 | +7: iteration 3890/ 7508 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.56 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.486794E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.885 | TFLOPs: 43.37 | +7: iteration 3900/ 7508 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.56 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.503338E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.841 | TFLOPs: 43.75 | +7: iteration 3910/ 7508 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.57 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.496079E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.814 | TFLOPs: 42.79 | +7: iteration 3920/ 7508 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.56 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.492062E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.901 | TFLOPs: 43.75 | +7: iteration 3930/ 7508 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.56 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.487858E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.097 | TFLOPs: 43.39 | +7: iteration 3940/ 7508 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.56 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.497834E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 3950/ 7508 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.57 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.498782E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.024 | TFLOPs: 43.00 | +7: iteration 3960/ 7508 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.56 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.476460E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.626 | TFLOPs: 43.25 | +7: iteration 3970/ 7508 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.56 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.490267E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.473 | TFLOPs: 43.71 | +7: iteration 3980/ 7508 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.56 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.504402E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.254 | TFLOPs: 43.40 | +7: iteration 3990/ 7508 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.57 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.485297E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.590 | TFLOPs: 42.86 | +0: [2023-03-15 22:38:28,651] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00010208850566272403, 0.00010208850566272403, 0.00010208850566272403], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 7508 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.56 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.488346E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.849 | TFLOPs: 43.46 | +0: steps: 4000 loss: 3.4421 iter time (s): 0.562 samples/sec: 455.712 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 3.528797E+00 | lm loss PPL: 3.408293E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_421m3b9400m +0: [2023-03-15 22:38:28,859] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-15 22:38:28,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:38:29,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:38:29,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:38:29,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:38:29,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:38:29,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:38:29,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:38:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:38:29,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:38:29,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:38:29,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:38:29,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:38:29,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:38:29,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:38:29,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:38:29,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:38:29,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:38:29,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:38:29,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:38:29,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:38:29,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:38:29,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:38:29,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:38:29,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:38:29,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:38:29,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:38:29,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:38:29,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:38:29,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:38:29,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:38:29,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:38:29,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:38:29,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:38:29,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:38:29,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:38:29,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:38:29,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:38:29,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:38:29,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:38:29,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:38:29,714] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-15 22:38:29,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:38:29,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:29,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:29,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:29,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:38:29,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:29,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:29,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:38:29,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:38:29,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:29,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:29,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,932] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,932] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:29,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:29,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:38:29,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:29,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:29,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:38:29,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:29,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:29,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:38:29,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:29,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:29,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:38:29,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:29,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:29,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:38:29,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:29,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:29,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1115.16 +7: iteration 4010/ 7508 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.69 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.485440E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 372.087 | TFLOPs: 35.47 | +7: iteration 4020/ 7508 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.56 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.479712E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.782 | TFLOPs: 43.74 | +7: iteration 4030/ 7508 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.56 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.465002E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.977 | TFLOPs: 43.38 | +7: iteration 4040/ 7508 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.56 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.475229E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.621 | TFLOPs: 43.72 | +7: iteration 4050/ 7508 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.56 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.475048E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.661 | TFLOPs: 43.63 | +7: iteration 4060/ 7508 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.56 | learning rate: 9.982E-05 | global batch size: 256 | lm loss: 3.480368E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.740 | TFLOPs: 43.74 | +7: iteration 4070/ 7508 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.56 | learning rate: 9.944E-05 | global batch size: 256 | lm loss: 3.474697E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.037 | TFLOPs: 43.76 | +7: iteration 4080/ 7508 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.56 | learning rate: 9.906E-05 | global batch size: 256 | lm loss: 3.477742E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.711 | TFLOPs: 43.54 | +7: iteration 4090/ 7508 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.57 | learning rate: 9.868E-05 | global batch size: 256 | lm loss: 3.483778E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.526 | TFLOPs: 43.05 | +7: iteration 4100/ 7508 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.57 | learning rate: 9.831E-05 | global batch size: 256 | lm loss: 3.474825E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.118 | TFLOPs: 43.10 | +7: iteration 4110/ 7508 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.57 | learning rate: 9.793E-05 | global batch size: 256 | lm loss: 3.481419E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.680 | TFLOPs: 42.87 | +7: iteration 4120/ 7508 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.56 | learning rate: 9.755E-05 | global batch size: 256 | lm loss: 3.477240E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.987 | TFLOPs: 43.38 | +7: iteration 4130/ 7508 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.57 | learning rate: 9.718E-05 | global batch size: 256 | lm loss: 3.479900E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.427 | TFLOPs: 42.66 | +7: iteration 4140/ 7508 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.56 | learning rate: 9.680E-05 | global batch size: 256 | lm loss: 3.464589E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.454 | TFLOPs: 43.42 | +7: iteration 4150/ 7508 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.56 | learning rate: 9.642E-05 | global batch size: 256 | lm loss: 3.469365E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.784 | TFLOPs: 43.74 | +7: iteration 4160/ 7508 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.56 | learning rate: 9.605E-05 | global batch size: 256 | lm loss: 3.463848E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.662 | TFLOPs: 43.73 | +7: iteration 4170/ 7508 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.56 | learning rate: 9.567E-05 | global batch size: 256 | lm loss: 3.480102E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.755 | TFLOPs: 43.74 | +7: iteration 4180/ 7508 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.57 | learning rate: 9.530E-05 | global batch size: 256 | lm loss: 3.480338E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.366 | TFLOPs: 43.13 | +7: iteration 4190/ 7508 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.56 | learning rate: 9.492E-05 | global batch size: 256 | lm loss: 3.462108E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.630 | TFLOPs: 43.73 | +7: iteration 4200/ 7508 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.56 | learning rate: 9.455E-05 | global batch size: 256 | lm loss: 3.457166E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.505 | TFLOPs: 43.71 | +7: iteration 4210/ 7508 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.56 | learning rate: 9.417E-05 | global batch size: 256 | lm loss: 3.463266E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.576 | TFLOPs: 43.72 | +7: iteration 4220/ 7508 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.56 | learning rate: 9.380E-05 | global batch size: 256 | lm loss: 3.465564E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.101 | TFLOPs: 43.20 | +7: iteration 4230/ 7508 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.56 | learning rate: 9.342E-05 | global batch size: 256 | lm loss: 3.465510E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.244 | TFLOPs: 43.40 | +7: iteration 4240/ 7508 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.56 | learning rate: 9.305E-05 | global batch size: 256 | lm loss: 3.452650E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.571 | TFLOPs: 43.72 | +7: iteration 4250/ 7508 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.56 | learning rate: 9.268E-05 | global batch size: 256 | lm loss: 3.468330E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.675 | TFLOPs: 43.73 | +7: iteration 4260/ 7508 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.56 | learning rate: 9.230E-05 | global batch size: 256 | lm loss: 3.458376E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 4270/ 7508 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.57 | learning rate: 9.193E-05 | global batch size: 256 | lm loss: 3.470048E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.305 | TFLOPs: 43.12 | +7: iteration 4280/ 7508 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.56 | learning rate: 9.156E-05 | global batch size: 256 | lm loss: 3.463378E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.605 | TFLOPs: 43.25 | +7: iteration 4290/ 7508 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.57 | learning rate: 9.119E-05 | global batch size: 256 | lm loss: 3.461371E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.013 | TFLOPs: 43.19 | +7: iteration 4300/ 7508 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.56 | learning rate: 9.082E-05 | global batch size: 256 | lm loss: 3.461876E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.620 | TFLOPs: 43.72 | +7: iteration 4310/ 7508 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.56 | learning rate: 9.044E-05 | global batch size: 256 | lm loss: 3.452111E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.831 | TFLOPs: 43.65 | +7: iteration 4320/ 7508 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.56 | learning rate: 9.007E-05 | global batch size: 256 | lm loss: 3.468581E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.672 | TFLOPs: 43.73 | +7: iteration 4330/ 7508 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.56 | learning rate: 8.970E-05 | global batch size: 256 | lm loss: 3.472961E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.242 | TFLOPs: 43.31 | +7: iteration 4340/ 7508 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.56 | learning rate: 8.933E-05 | global batch size: 256 | lm loss: 3.453167E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.559 | TFLOPs: 43.62 | +7: iteration 4350/ 7508 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.56 | learning rate: 8.896E-05 | global batch size: 256 | lm loss: 3.451600E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.621 | TFLOPs: 43.72 | +7: iteration 4360/ 7508 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.56 | learning rate: 8.859E-05 | global batch size: 256 | lm loss: 3.442281E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.539 | TFLOPs: 43.72 | +7: iteration 4370/ 7508 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.56 | learning rate: 8.822E-05 | global batch size: 256 | lm loss: 3.455022E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.554 | TFLOPs: 43.72 | +7: iteration 4380/ 7508 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.56 | learning rate: 8.785E-05 | global batch size: 256 | lm loss: 3.459504E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.828 | TFLOPs: 43.74 | +7: iteration 4390/ 7508 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.56 | learning rate: 8.749E-05 | global batch size: 256 | lm loss: 3.446539E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.483 | TFLOPs: 43.71 | +7: iteration 4400/ 7508 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.56 | learning rate: 8.712E-05 | global batch size: 256 | lm loss: 3.454771E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.459 | TFLOPs: 43.71 | +7: iteration 4410/ 7508 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.56 | learning rate: 8.675E-05 | global batch size: 256 | lm loss: 3.443706E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.277 | TFLOPs: 43.22 | +7: iteration 4420/ 7508 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.56 | learning rate: 8.638E-05 | global batch size: 256 | lm loss: 3.474681E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.753 | TFLOPs: 43.26 | +7: iteration 4430/ 7508 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.56 | learning rate: 8.602E-05 | global batch size: 256 | lm loss: 3.451614E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.574 | TFLOPs: 43.72 | +7: iteration 4440/ 7508 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.56 | learning rate: 8.565E-05 | global batch size: 256 | lm loss: 3.455949E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.337 | TFLOPs: 43.70 | +7: iteration 4450/ 7508 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.56 | learning rate: 8.528E-05 | global batch size: 256 | lm loss: 3.454312E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.522 | TFLOPs: 43.72 | +7: iteration 4460/ 7508 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.56 | learning rate: 8.492E-05 | global batch size: 256 | lm loss: 3.438947E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.478 | TFLOPs: 43.71 | +7: iteration 4470/ 7508 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.56 | learning rate: 8.455E-05 | global batch size: 256 | lm loss: 3.448003E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.421 | TFLOPs: 43.71 | +7: iteration 4480/ 7508 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.56 | learning rate: 8.419E-05 | global batch size: 256 | lm loss: 3.457011E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.463 | TFLOPs: 43.71 | +7: iteration 4490/ 7508 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.56 | learning rate: 8.382E-05 | global batch size: 256 | lm loss: 3.444246E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.174 | TFLOPs: 43.59 | +7: iteration 4500/ 7508 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.56 | learning rate: 8.346E-05 | global batch size: 256 | lm loss: 3.453012E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.061 | TFLOPs: 43.48 | +7: iteration 4510/ 7508 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.56 | learning rate: 8.310E-05 | global batch size: 256 | lm loss: 3.451587E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.963 | TFLOPs: 43.28 | +7: iteration 4520/ 7508 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.56 | learning rate: 8.273E-05 | global batch size: 256 | lm loss: 3.424966E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.506 | TFLOPs: 43.71 | +7: iteration 4530/ 7508 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.56 | learning rate: 8.237E-05 | global batch size: 256 | lm loss: 3.436588E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.392 | TFLOPs: 43.70 | +7: iteration 4540/ 7508 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.56 | learning rate: 8.201E-05 | global batch size: 256 | lm loss: 3.434040E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.348 | TFLOPs: 43.70 | +7: iteration 4550/ 7508 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.56 | learning rate: 8.165E-05 | global batch size: 256 | lm loss: 3.441765E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.388 | TFLOPs: 43.70 | +7: iteration 4560/ 7508 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.56 | learning rate: 8.129E-05 | global batch size: 256 | lm loss: 3.440281E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.320 | TFLOPs: 43.70 | +7: iteration 4570/ 7508 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.56 | learning rate: 8.093E-05 | global batch size: 256 | lm loss: 3.434863E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.390 | TFLOPs: 43.70 | +7: iteration 4580/ 7508 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.56 | learning rate: 8.057E-05 | global batch size: 256 | lm loss: 3.430741E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.449 | TFLOPs: 43.71 | +7: iteration 4590/ 7508 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.56 | learning rate: 8.021E-05 | global batch size: 256 | lm loss: 3.445058E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.405 | TFLOPs: 43.23 | +7: iteration 4600/ 7508 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.56 | learning rate: 7.985E-05 | global batch size: 256 | lm loss: 3.451571E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.480 | TFLOPs: 43.71 | +7: iteration 4610/ 7508 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.57 | learning rate: 7.949E-05 | global batch size: 256 | lm loss: 3.420820E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.052 | TFLOPs: 43.10 | +7: iteration 4620/ 7508 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.56 | learning rate: 7.913E-05 | global batch size: 256 | lm loss: 3.425192E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.651 | TFLOPs: 43.73 | +7: iteration 4630/ 7508 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.56 | learning rate: 7.878E-05 | global batch size: 256 | lm loss: 3.454863E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.451 | TFLOPs: 43.71 | +7: iteration 4640/ 7508 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.57 | learning rate: 7.842E-05 | global batch size: 256 | lm loss: 3.439952E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.585 | TFLOPs: 43.05 | +7: iteration 4650/ 7508 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.56 | learning rate: 7.807E-05 | global batch size: 256 | lm loss: 3.425388E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.599 | TFLOPs: 43.72 | +7: iteration 4660/ 7508 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.56 | learning rate: 7.771E-05 | global batch size: 256 | lm loss: 3.418001E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.416 | TFLOPs: 43.23 | +7: iteration 4670/ 7508 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.56 | learning rate: 7.736E-05 | global batch size: 256 | lm loss: 3.417061E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.243 | TFLOPs: 43.50 | +7: iteration 4680/ 7508 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.56 | learning rate: 7.700E-05 | global batch size: 256 | lm loss: 3.424746E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.960 | TFLOPs: 43.76 | +7: iteration 4690/ 7508 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.56 | learning rate: 7.665E-05 | global batch size: 256 | lm loss: 3.433486E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.590 | TFLOPs: 43.63 | +7: iteration 4700/ 7508 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.56 | learning rate: 7.629E-05 | global batch size: 256 | lm loss: 3.428445E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.940 | TFLOPs: 43.75 | +7: iteration 4710/ 7508 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.56 | learning rate: 7.594E-05 | global batch size: 256 | lm loss: 3.424924E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.756 | TFLOPs: 43.74 | +7: iteration 4720/ 7508 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.56 | learning rate: 7.559E-05 | global batch size: 256 | lm loss: 3.415987E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.814 | TFLOPs: 43.74 | +7: iteration 4730/ 7508 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.56 | learning rate: 7.524E-05 | global batch size: 256 | lm loss: 3.434333E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.628 | TFLOPs: 43.73 | +7: iteration 4740/ 7508 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.56 | learning rate: 7.489E-05 | global batch size: 256 | lm loss: 3.430486E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.573 | TFLOPs: 43.72 | +7: iteration 4750/ 7508 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.56 | learning rate: 7.454E-05 | global batch size: 256 | lm loss: 3.414970E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.712 | TFLOPs: 43.73 | +7: iteration 4760/ 7508 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.56 | learning rate: 7.419E-05 | global batch size: 256 | lm loss: 3.431653E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.757 | TFLOPs: 43.74 | +7: iteration 4770/ 7508 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.56 | learning rate: 7.384E-05 | global batch size: 256 | lm loss: 3.423899E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.640 | TFLOPs: 43.73 | +7: iteration 4780/ 7508 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.56 | learning rate: 7.349E-05 | global batch size: 256 | lm loss: 3.433836E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.849 | TFLOPs: 43.75 | +7: iteration 4790/ 7508 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.56 | learning rate: 7.315E-05 | global batch size: 256 | lm loss: 3.412126E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.462 | TFLOPs: 43.71 | +7: iteration 4800/ 7508 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.56 | learning rate: 7.280E-05 | global batch size: 256 | lm loss: 3.426505E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.507 | TFLOPs: 43.71 | +7: iteration 4810/ 7508 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.56 | learning rate: 7.245E-05 | global batch size: 256 | lm loss: 3.435622E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.839 | TFLOPs: 43.75 | +7: iteration 4820/ 7508 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.56 | learning rate: 7.211E-05 | global batch size: 256 | lm loss: 3.416074E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.849 | TFLOPs: 43.75 | +7: iteration 4830/ 7508 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.56 | learning rate: 7.176E-05 | global batch size: 256 | lm loss: 3.428770E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.793 | TFLOPs: 43.45 | +7: iteration 4840/ 7508 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.56 | learning rate: 7.142E-05 | global batch size: 256 | lm loss: 3.420377E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.664 | TFLOPs: 43.73 | +7: iteration 4850/ 7508 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.56 | learning rate: 7.108E-05 | global batch size: 256 | lm loss: 3.416388E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.897 | TFLOPs: 43.37 | +7: iteration 4860/ 7508 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.56 | learning rate: 7.073E-05 | global batch size: 256 | lm loss: 3.404826E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.568 | TFLOPs: 43.72 | +7: iteration 4870/ 7508 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.56 | learning rate: 7.039E-05 | global batch size: 256 | lm loss: 3.420869E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.219 | TFLOPs: 43.69 | +7: iteration 4880/ 7508 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.56 | learning rate: 7.005E-05 | global batch size: 256 | lm loss: 3.412381E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.646 | TFLOPs: 43.73 | +7: iteration 4890/ 7508 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.56 | learning rate: 6.971E-05 | global batch size: 256 | lm loss: 3.407948E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.596 | TFLOPs: 43.72 | +7: iteration 4900/ 7508 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.56 | learning rate: 6.937E-05 | global batch size: 256 | lm loss: 3.422163E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.708 | TFLOPs: 43.73 | +7: iteration 4910/ 7508 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.56 | learning rate: 6.903E-05 | global batch size: 256 | lm loss: 3.412191E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.547 | TFLOPs: 43.72 | +7: iteration 4920/ 7508 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.56 | learning rate: 6.869E-05 | global batch size: 256 | lm loss: 3.400195E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.877 | TFLOPs: 43.75 | +7: iteration 4930/ 7508 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.57 | learning rate: 6.835E-05 | global batch size: 256 | lm loss: 3.412011E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.112 | TFLOPs: 42.72 | +7: iteration 4940/ 7508 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.56 | learning rate: 6.802E-05 | global batch size: 256 | lm loss: 3.408373E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.795 | TFLOPs: 43.74 | +7: iteration 4950/ 7508 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.56 | learning rate: 6.768E-05 | global batch size: 256 | lm loss: 3.406860E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.995 | TFLOPs: 43.76 | +7: iteration 4960/ 7508 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.56 | learning rate: 6.735E-05 | global batch size: 256 | lm loss: 3.419186E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.850 | TFLOPs: 43.75 | +7: iteration 4970/ 7508 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.56 | learning rate: 6.701E-05 | global batch size: 256 | lm loss: 3.409924E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.483 | TFLOPs: 43.71 | +7: iteration 4980/ 7508 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.56 | learning rate: 6.668E-05 | global batch size: 256 | lm loss: 3.408933E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 4990/ 7508 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.56 | learning rate: 6.634E-05 | global batch size: 256 | lm loss: 3.405156E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.566 | TFLOPs: 43.72 | +7: iteration 5000/ 7508 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.56 | learning rate: 6.601E-05 | global batch size: 256 | lm loss: 3.397984E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.778 | TFLOPs: 43.74 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 3.473173E+00 | lm loss PPL: 3.223888E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_421m3b9400m +0: [2023-03-15 22:47:50,171] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-15 22:47:50,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:47:50,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:47:50,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:47:50,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:47:50,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:47:50,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:47:50,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:47:50,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:47:50,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:47:50,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:47:50,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:47:50,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:47:50,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:47:50,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:47:50,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:47:50,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:47:50,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:47:50,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:47:50,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:47:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:47:50,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:47:50,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:47:50,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:47:50,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:47:50,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:47:50,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:47:50,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:47:50,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:47:50,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:47:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:47:50,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:47:50,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:47:50,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:47:50,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:47:50,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:47:50,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:47:50,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:47:51,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:47:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:47:51,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:47:51,037] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-15 22:47:51,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:47:51,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:47:51,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:47:51,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:47:51,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:47:51,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:47:51,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:47:51,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:47:51,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:47:51,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:47:51,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:47:51,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:47:51,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:47:51,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:47:51,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:47:51,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,260] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,260] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:47:51,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:47:51,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:47:51,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:47:51,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:47:51,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:47:51,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:47:51,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:47:51,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:47:51,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:47:51,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:47:51,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:47:51,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:47:51,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:47:51,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:47:51,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1125.47 +7: iteration 5010/ 7508 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.69 | learning rate: 6.568E-05 | global batch size: 256 | lm loss: 3.403362E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 370.455 | TFLOPs: 35.32 | +7: iteration 5020/ 7508 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.56 | learning rate: 6.535E-05 | global batch size: 256 | lm loss: 3.409329E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.682 | TFLOPs: 43.73 | +7: iteration 5030/ 7508 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.56 | learning rate: 6.502E-05 | global batch size: 256 | lm loss: 3.396631E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.732 | TFLOPs: 43.74 | +7: iteration 5040/ 7508 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.56 | learning rate: 6.469E-05 | global batch size: 256 | lm loss: 3.392386E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.640 | TFLOPs: 43.73 | +7: iteration 5050/ 7508 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.56 | learning rate: 6.436E-05 | global batch size: 256 | lm loss: 3.401043E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.612 | TFLOPs: 43.72 | +7: iteration 5060/ 7508 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.56 | learning rate: 6.404E-05 | global batch size: 256 | lm loss: 3.399649E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.568 | TFLOPs: 43.72 | +7: iteration 5070/ 7508 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.56 | learning rate: 6.371E-05 | global batch size: 256 | lm loss: 3.408157E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.385 | TFLOPs: 43.70 | +7: iteration 5080/ 7508 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.56 | learning rate: 6.338E-05 | global batch size: 256 | lm loss: 3.393936E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.650 | TFLOPs: 43.73 | +7: iteration 5090/ 7508 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.56 | learning rate: 6.306E-05 | global batch size: 256 | lm loss: 3.394501E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.922 | TFLOPs: 43.75 | +7: iteration 5100/ 7508 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.56 | learning rate: 6.273E-05 | global batch size: 256 | lm loss: 3.405134E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.918 | TFLOPs: 43.75 | +7: iteration 5110/ 7508 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.56 | learning rate: 6.241E-05 | global batch size: 256 | lm loss: 3.402128E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.933 | TFLOPs: 43.75 | +7: iteration 5120/ 7508 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.56 | learning rate: 6.209E-05 | global batch size: 256 | lm loss: 3.398429E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.643 | TFLOPs: 43.73 | +7: iteration 5130/ 7508 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.56 | learning rate: 6.177E-05 | global batch size: 256 | lm loss: 3.416089E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.902 | TFLOPs: 43.75 | +7: iteration 5140/ 7508 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.56 | learning rate: 6.145E-05 | global batch size: 256 | lm loss: 3.396450E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.135 | TFLOPs: 43.77 | +7: iteration 5150/ 7508 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.56 | learning rate: 6.113E-05 | global batch size: 256 | lm loss: 3.396815E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.618 | TFLOPs: 43.34 | +7: iteration 5160/ 7508 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.56 | learning rate: 6.081E-05 | global batch size: 256 | lm loss: 3.391193E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.226 | TFLOPs: 43.78 | +7: iteration 5170/ 7508 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.56 | learning rate: 6.049E-05 | global batch size: 256 | lm loss: 3.415035E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.824 | TFLOPs: 43.74 | +7: iteration 5180/ 7508 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.56 | learning rate: 6.017E-05 | global batch size: 256 | lm loss: 3.396195E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.803 | TFLOPs: 43.74 | +7: iteration 5190/ 7508 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.56 | learning rate: 5.986E-05 | global batch size: 256 | lm loss: 3.411328E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.932 | TFLOPs: 43.75 | +7: iteration 5200/ 7508 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.56 | learning rate: 5.954E-05 | global batch size: 256 | lm loss: 3.395579E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.102 | TFLOPs: 43.77 | +7: iteration 5210/ 7508 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.56 | learning rate: 5.923E-05 | global batch size: 256 | lm loss: 3.389607E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.798 | TFLOPs: 43.74 | +7: iteration 5220/ 7508 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.56 | learning rate: 5.891E-05 | global batch size: 256 | lm loss: 3.394913E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.986 | TFLOPs: 43.66 | +7: iteration 5230/ 7508 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.56 | learning rate: 5.860E-05 | global batch size: 256 | lm loss: 3.391452E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.767 | TFLOPs: 43.74 | +7: iteration 5240/ 7508 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.56 | learning rate: 5.829E-05 | global batch size: 256 | lm loss: 3.403254E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.909 | TFLOPs: 43.75 | +7: iteration 5250/ 7508 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.56 | learning rate: 5.798E-05 | global batch size: 256 | lm loss: 3.394223E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.012 | TFLOPs: 43.38 | +7: iteration 5260/ 7508 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.56 | learning rate: 5.767E-05 | global batch size: 256 | lm loss: 3.397235E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.991 | TFLOPs: 43.76 | +7: iteration 5270/ 7508 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.56 | learning rate: 5.736E-05 | global batch size: 256 | lm loss: 3.383051E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.936 | TFLOPs: 43.75 | +7: iteration 5280/ 7508 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.57 | learning rate: 5.705E-05 | global batch size: 256 | lm loss: 3.393817E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.963 | TFLOPs: 43.09 | +7: iteration 5290/ 7508 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.56 | learning rate: 5.674E-05 | global batch size: 256 | lm loss: 3.386700E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.291 | TFLOPs: 43.79 | +7: iteration 5300/ 7508 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.56 | learning rate: 5.644E-05 | global batch size: 256 | lm loss: 3.395275E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.008 | TFLOPs: 43.76 | +7: iteration 5310/ 7508 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.57 | learning rate: 5.613E-05 | global batch size: 256 | lm loss: 3.378440E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.573 | TFLOPs: 42.67 | +7: iteration 5320/ 7508 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.56 | learning rate: 5.583E-05 | global batch size: 256 | lm loss: 3.389675E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.897 | TFLOPs: 43.75 | +7: iteration 5330/ 7508 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.56 | learning rate: 5.552E-05 | global batch size: 256 | lm loss: 3.400396E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.821 | TFLOPs: 43.74 | +7: iteration 5340/ 7508 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.56 | learning rate: 5.522E-05 | global batch size: 256 | lm loss: 3.384896E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.650 | TFLOPs: 43.73 | +7: iteration 5350/ 7508 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.56 | learning rate: 5.492E-05 | global batch size: 256 | lm loss: 3.380336E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.075 | TFLOPs: 43.77 | +7: iteration 5360/ 7508 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.56 | learning rate: 5.462E-05 | global batch size: 256 | lm loss: 3.374234E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.956 | TFLOPs: 43.76 | +7: iteration 5370/ 7508 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.56 | learning rate: 5.432E-05 | global batch size: 256 | lm loss: 3.370121E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.008 | TFLOPs: 43.76 | +7: iteration 5380/ 7508 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.56 | learning rate: 5.402E-05 | global batch size: 256 | lm loss: 3.386075E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.840 | TFLOPs: 43.75 | +7: iteration 5390/ 7508 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.56 | learning rate: 5.373E-05 | global batch size: 256 | lm loss: 3.406191E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.952 | TFLOPs: 43.76 | +7: iteration 5400/ 7508 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.56 | learning rate: 5.343E-05 | global batch size: 256 | lm loss: 3.381652E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.598 | TFLOPs: 43.72 | +7: iteration 5410/ 7508 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.56 | learning rate: 5.313E-05 | global batch size: 256 | lm loss: 3.395408E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.252 | TFLOPs: 43.59 | +7: iteration 5420/ 7508 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.56 | learning rate: 5.284E-05 | global batch size: 256 | lm loss: 3.388205E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.752 | TFLOPs: 43.74 | +7: iteration 5430/ 7508 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.56 | learning rate: 5.255E-05 | global batch size: 256 | lm loss: 3.374800E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.779 | TFLOPs: 43.74 | +7: iteration 5440/ 7508 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.56 | learning rate: 5.225E-05 | global batch size: 256 | lm loss: 3.394939E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.603 | TFLOPs: 43.72 | +7: iteration 5450/ 7508 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.56 | learning rate: 5.196E-05 | global batch size: 256 | lm loss: 3.371590E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.110 | TFLOPs: 43.68 | +7: iteration 5460/ 7508 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.56 | learning rate: 5.167E-05 | global batch size: 256 | lm loss: 3.364726E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.225 | TFLOPs: 43.69 | +7: iteration 5470/ 7508 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.56 | learning rate: 5.138E-05 | global batch size: 256 | lm loss: 3.403647E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.448 | TFLOPs: 43.71 | +7: iteration 5480/ 7508 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.56 | learning rate: 5.109E-05 | global batch size: 256 | lm loss: 3.382705E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.335 | TFLOPs: 43.60 | +7: iteration 5490/ 7508 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.56 | learning rate: 5.081E-05 | global batch size: 256 | lm loss: 3.376212E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.679 | TFLOPs: 43.73 | +7: iteration 5500/ 7508 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.56 | learning rate: 5.052E-05 | global batch size: 256 | lm loss: 3.390626E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.641 | TFLOPs: 43.73 | +7: iteration 5510/ 7508 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.56 | learning rate: 5.024E-05 | global batch size: 256 | lm loss: 3.374250E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.868 | TFLOPs: 43.75 | +7: iteration 5520/ 7508 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.56 | learning rate: 4.995E-05 | global batch size: 256 | lm loss: 3.372469E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.729 | TFLOPs: 43.73 | +7: iteration 5530/ 7508 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.56 | learning rate: 4.967E-05 | global batch size: 256 | lm loss: 3.389101E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.929 | TFLOPs: 43.75 | +7: iteration 5540/ 7508 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.56 | learning rate: 4.939E-05 | global batch size: 256 | lm loss: 3.374599E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.582 | TFLOPs: 43.72 | +7: iteration 5550/ 7508 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.56 | learning rate: 4.911E-05 | global batch size: 256 | lm loss: 3.353680E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.186 | TFLOPs: 43.21 | +7: iteration 5560/ 7508 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.56 | learning rate: 4.883E-05 | global batch size: 256 | lm loss: 3.362779E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.835 | TFLOPs: 43.74 | +7: iteration 5570/ 7508 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.56 | learning rate: 4.855E-05 | global batch size: 256 | lm loss: 3.371349E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.728 | TFLOPs: 43.73 | +7: iteration 5580/ 7508 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.56 | learning rate: 4.827E-05 | global batch size: 256 | lm loss: 3.374587E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.587 | TFLOPs: 43.72 | +7: iteration 5590/ 7508 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.56 | learning rate: 4.800E-05 | global batch size: 256 | lm loss: 3.385492E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.522 | TFLOPs: 43.72 | +7: iteration 5600/ 7508 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.56 | learning rate: 4.772E-05 | global batch size: 256 | lm loss: 3.364052E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.676 | TFLOPs: 43.73 | +7: iteration 5610/ 7508 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.56 | learning rate: 4.745E-05 | global batch size: 256 | lm loss: 3.368394E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.657 | TFLOPs: 43.73 | +7: iteration 5620/ 7508 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.56 | learning rate: 4.717E-05 | global batch size: 256 | lm loss: 3.376643E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.740 | TFLOPs: 43.74 | +7: iteration 5630/ 7508 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.56 | learning rate: 4.690E-05 | global batch size: 256 | lm loss: 3.357499E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.637 | TFLOPs: 43.25 | +7: iteration 5640/ 7508 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.56 | learning rate: 4.663E-05 | global batch size: 256 | lm loss: 3.369983E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.510 | TFLOPs: 43.71 | +7: iteration 5650/ 7508 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.56 | learning rate: 4.636E-05 | global batch size: 256 | lm loss: 3.371241E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.762 | TFLOPs: 43.74 | +7: iteration 5660/ 7508 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.56 | learning rate: 4.609E-05 | global batch size: 256 | lm loss: 3.359615E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.624 | TFLOPs: 43.72 | +7: iteration 5670/ 7508 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.56 | learning rate: 4.583E-05 | global batch size: 256 | lm loss: 3.374706E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.879 | TFLOPs: 43.75 | +7: iteration 5680/ 7508 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.56 | learning rate: 4.556E-05 | global batch size: 256 | lm loss: 3.383095E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.639 | TFLOPs: 43.73 | +7: iteration 5690/ 7508 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.56 | learning rate: 4.530E-05 | global batch size: 256 | lm loss: 3.359373E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.618 | TFLOPs: 43.72 | +7: iteration 5700/ 7508 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.56 | learning rate: 4.503E-05 | global batch size: 256 | lm loss: 3.354126E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.645 | TFLOPs: 43.73 | +7: iteration 5710/ 7508 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.56 | learning rate: 4.477E-05 | global batch size: 256 | lm loss: 3.373780E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.665 | TFLOPs: 43.73 | +7: iteration 5720/ 7508 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.56 | learning rate: 4.451E-05 | global batch size: 256 | lm loss: 3.375966E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.672 | TFLOPs: 43.73 | +7: iteration 5730/ 7508 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.56 | learning rate: 4.425E-05 | global batch size: 256 | lm loss: 3.377298E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.691 | TFLOPs: 43.73 | +7: iteration 5740/ 7508 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.56 | learning rate: 4.399E-05 | global batch size: 256 | lm loss: 3.372506E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.656 | TFLOPs: 43.73 | +7: iteration 5750/ 7508 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.56 | learning rate: 4.373E-05 | global batch size: 256 | lm loss: 3.378728E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.750 | TFLOPs: 43.74 | +7: iteration 5760/ 7508 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.56 | learning rate: 4.347E-05 | global batch size: 256 | lm loss: 3.368060E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.800 | TFLOPs: 43.74 | +7: iteration 5770/ 7508 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.56 | learning rate: 4.322E-05 | global batch size: 256 | lm loss: 3.366974E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.838 | TFLOPs: 43.75 | +7: iteration 5780/ 7508 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.56 | learning rate: 4.296E-05 | global batch size: 256 | lm loss: 3.358761E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.932 | TFLOPs: 43.75 | +7: iteration 5790/ 7508 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.56 | learning rate: 4.271E-05 | global batch size: 256 | lm loss: 3.370967E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.779 | TFLOPs: 43.55 | +7: iteration 5800/ 7508 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.56 | learning rate: 4.246E-05 | global batch size: 256 | lm loss: 3.357978E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.064 | TFLOPs: 43.77 | +7: iteration 5810/ 7508 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.56 | learning rate: 4.221E-05 | global batch size: 256 | lm loss: 3.371177E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.509 | TFLOPs: 43.62 | +7: iteration 5820/ 7508 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.56 | learning rate: 4.196E-05 | global batch size: 256 | lm loss: 3.358704E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.142 | TFLOPs: 43.77 | +7: iteration 5830/ 7508 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.56 | learning rate: 4.171E-05 | global batch size: 256 | lm loss: 3.365821E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.221 | TFLOPs: 43.78 | +7: iteration 5840/ 7508 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.56 | learning rate: 4.146E-05 | global batch size: 256 | lm loss: 3.371870E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.936 | TFLOPs: 43.75 | +7: iteration 5850/ 7508 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.56 | learning rate: 4.122E-05 | global batch size: 256 | lm loss: 3.361333E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.084 | TFLOPs: 43.77 | +7: iteration 5860/ 7508 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.56 | learning rate: 4.097E-05 | global batch size: 256 | lm loss: 3.368621E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.006 | TFLOPs: 43.76 | +7: iteration 5870/ 7508 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.56 | learning rate: 4.073E-05 | global batch size: 256 | lm loss: 3.356162E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.929 | TFLOPs: 43.75 | +7: iteration 5880/ 7508 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.56 | learning rate: 4.049E-05 | global batch size: 256 | lm loss: 3.371424E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.899 | TFLOPs: 43.75 | +7: iteration 5890/ 7508 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.56 | learning rate: 4.025E-05 | global batch size: 256 | lm loss: 3.366093E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.870 | TFLOPs: 43.75 | +7: iteration 5900/ 7508 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.56 | learning rate: 4.001E-05 | global batch size: 256 | lm loss: 3.357327E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.846 | TFLOPs: 43.75 | +7: iteration 5910/ 7508 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.56 | learning rate: 3.977E-05 | global batch size: 256 | lm loss: 3.357347E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.797 | TFLOPs: 43.74 | +7: iteration 5920/ 7508 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.56 | learning rate: 3.953E-05 | global batch size: 256 | lm loss: 3.366264E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.683 | TFLOPs: 43.73 | +7: iteration 5930/ 7508 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.56 | learning rate: 3.929E-05 | global batch size: 256 | lm loss: 3.361262E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.781 | TFLOPs: 43.74 | +7: iteration 5940/ 7508 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.56 | learning rate: 3.906E-05 | global batch size: 256 | lm loss: 3.368137E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.823 | TFLOPs: 43.74 | +7: iteration 5950/ 7508 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.56 | learning rate: 3.883E-05 | global batch size: 256 | lm loss: 3.358440E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.025 | TFLOPs: 43.76 | +7: iteration 5960/ 7508 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.56 | learning rate: 3.859E-05 | global batch size: 256 | lm loss: 3.372092E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.806 | TFLOPs: 43.74 | +7: iteration 5970/ 7508 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.56 | learning rate: 3.836E-05 | global batch size: 256 | lm loss: 3.354662E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.069 | TFLOPs: 43.77 | +7: iteration 5980/ 7508 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.56 | learning rate: 3.813E-05 | global batch size: 256 | lm loss: 3.347910E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.623 | TFLOPs: 43.72 | +7: iteration 5990/ 7508 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.56 | learning rate: 3.790E-05 | global batch size: 256 | lm loss: 3.369402E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.895 | TFLOPs: 43.75 | +0: [2023-03-15 22:57:09,861] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[3.76774148080129e-05, 3.76774148080129e-05, 3.76774148080129e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 7508 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.56 | learning rate: 3.768E-05 | global batch size: 256 | lm loss: 3.367018E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.774 | TFLOPs: 43.55 | +0: steps: 6000 loss: 3.4029 iter time (s): 0.557 samples/sec: 459.579 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 6000 | lm loss value: 3.448859E+00 | lm loss PPL: 3.146449E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 6000 to checkpoints_421m3b9400m +0: [2023-03-15 22:57:10,067] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +0: [2023-03-15 22:57:10,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:57:10,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:57:10,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:57:10,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:57:10,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:57:10,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:57:10,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:57:10,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:57:10,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:57:10,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:57:10,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:57:10,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:57:10,407] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:57:10,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:57:10,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:57:10,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:57:10,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:57:10,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:57:10,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:57:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:57:10,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:57:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:57:10,605] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:57:10,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:57:10,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:57:10,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:57:10,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:57:10,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:57:10,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:57:10,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:57:10,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:57:10,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:57:10,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:57:10,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:57:10,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:57:10,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:57:10,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:57:10,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:57:10,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/layer_22-model_00-model_states.pt... +0: [2023-03-15 22:57:10,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/layer_22-model_00-model_states.pt. +0: [2023-03-15 22:57:10,929] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step6000/mp_rank_00_model_states.pt +0: [2023-03-15 22:57:10,929] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:57:10,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:10,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:11,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:57:11,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:11,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:11,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:57:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:11,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:11,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:11,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:11,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:11,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:57:11,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:57:11,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:11,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:57:11,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:11,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:11,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:57:11,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:11,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:11,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:57:11,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:11,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:11,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: successfully saved checkpoint at iteration 6000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1153.85 +7: iteration 6010/ 7508 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.69 | learning rate: 3.745E-05 | global batch size: 256 | lm loss: 3.344419E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 370.883 | TFLOPs: 35.36 | +7: iteration 6020/ 7508 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.56 | learning rate: 3.723E-05 | global batch size: 256 | lm loss: 3.354915E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.152 | TFLOPs: 43.78 | +7: iteration 6030/ 7508 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.56 | learning rate: 3.700E-05 | global batch size: 256 | lm loss: 3.343265E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.778 | TFLOPs: 43.74 | +7: iteration 6040/ 7508 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.56 | learning rate: 3.678E-05 | global batch size: 256 | lm loss: 3.346341E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.943 | TFLOPs: 43.76 | +7: iteration 6050/ 7508 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.56 | learning rate: 3.656E-05 | global batch size: 256 | lm loss: 3.354976E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.047 | TFLOPs: 43.77 | +7: iteration 6060/ 7508 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.56 | learning rate: 3.634E-05 | global batch size: 256 | lm loss: 3.351102E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.774 | TFLOPs: 43.74 | +7: iteration 6070/ 7508 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.56 | learning rate: 3.612E-05 | global batch size: 256 | lm loss: 3.355171E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.951 | TFLOPs: 43.76 | +7: iteration 6080/ 7508 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.56 | learning rate: 3.591E-05 | global batch size: 256 | lm loss: 3.362010E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.754 | TFLOPs: 43.74 | +7: iteration 6090/ 7508 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.56 | learning rate: 3.569E-05 | global batch size: 256 | lm loss: 3.360678E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.826 | TFLOPs: 43.74 | +7: iteration 6100/ 7508 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.56 | learning rate: 3.548E-05 | global batch size: 256 | lm loss: 3.358246E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.654 | TFLOPs: 43.25 | +7: iteration 6110/ 7508 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.56 | learning rate: 3.527E-05 | global batch size: 256 | lm loss: 3.357929E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.109 | TFLOPs: 43.77 | +7: iteration 6120/ 7508 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.57 | learning rate: 3.505E-05 | global batch size: 256 | lm loss: 3.345982E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.696 | TFLOPs: 42.97 | +7: iteration 6130/ 7508 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.56 | learning rate: 3.484E-05 | global batch size: 256 | lm loss: 3.340796E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.986 | TFLOPs: 43.76 | +7: iteration 6140/ 7508 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.56 | learning rate: 3.464E-05 | global batch size: 256 | lm loss: 3.352510E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.404 | TFLOPs: 43.70 | +7: iteration 6150/ 7508 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.56 | learning rate: 3.443E-05 | global batch size: 256 | lm loss: 3.341417E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.400 | TFLOPs: 43.70 | +7: iteration 6160/ 7508 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.56 | learning rate: 3.422E-05 | global batch size: 256 | lm loss: 3.344698E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.436 | TFLOPs: 43.71 | +7: iteration 6170/ 7508 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.56 | learning rate: 3.402E-05 | global batch size: 256 | lm loss: 3.340636E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.493 | TFLOPs: 43.71 | +7: iteration 6180/ 7508 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.56 | learning rate: 3.382E-05 | global batch size: 256 | lm loss: 3.355936E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.577 | TFLOPs: 43.72 | +7: iteration 6190/ 7508 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.56 | learning rate: 3.361E-05 | global batch size: 256 | lm loss: 3.336853E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.967 | TFLOPs: 43.76 | +7: iteration 6200/ 7508 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.56 | learning rate: 3.341E-05 | global batch size: 256 | lm loss: 3.349119E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.861 | TFLOPs: 43.75 | +7: iteration 6210/ 7508 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.56 | learning rate: 3.321E-05 | global batch size: 256 | lm loss: 3.346620E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.067 | TFLOPs: 43.77 | +7: iteration 6220/ 7508 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.56 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 3.345049E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.872 | TFLOPs: 43.75 | +7: iteration 6230/ 7508 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.56 | learning rate: 3.282E-05 | global batch size: 256 | lm loss: 3.365792E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.058 | TFLOPs: 43.77 | +7: iteration 6240/ 7508 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.56 | learning rate: 3.262E-05 | global batch size: 256 | lm loss: 3.354841E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.815 | TFLOPs: 43.74 | +7: iteration 6250/ 7508 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.56 | learning rate: 3.243E-05 | global batch size: 256 | lm loss: 3.345122E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.795 | TFLOPs: 43.74 | +7: iteration 6260/ 7508 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.56 | learning rate: 3.224E-05 | global batch size: 256 | lm loss: 3.350644E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.640 | TFLOPs: 43.54 | +7: iteration 6270/ 7508 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.56 | learning rate: 3.205E-05 | global batch size: 256 | lm loss: 3.331699E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.879 | TFLOPs: 43.75 | +7: iteration 6280/ 7508 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.56 | learning rate: 3.186E-05 | global batch size: 256 | lm loss: 3.334296E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.844 | TFLOPs: 43.75 | +7: iteration 6290/ 7508 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.56 | learning rate: 3.167E-05 | global batch size: 256 | lm loss: 3.348604E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.153 | TFLOPs: 43.78 | +7: iteration 6300/ 7508 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.56 | learning rate: 3.148E-05 | global batch size: 256 | lm loss: 3.355976E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.788 | TFLOPs: 43.65 | +7: iteration 6310/ 7508 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.56 | learning rate: 3.130E-05 | global batch size: 256 | lm loss: 3.365200E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.871 | TFLOPs: 43.75 | +7: iteration 6320/ 7508 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.56 | learning rate: 3.112E-05 | global batch size: 256 | lm loss: 3.349926E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.876 | TFLOPs: 43.75 | +7: iteration 6330/ 7508 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.56 | learning rate: 3.093E-05 | global batch size: 256 | lm loss: 3.347133E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.946 | TFLOPs: 43.76 | +7: iteration 6340/ 7508 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.56 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 3.355248E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.220 | TFLOPs: 43.40 | +7: iteration 6350/ 7508 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.56 | learning rate: 3.057E-05 | global batch size: 256 | lm loss: 3.348064E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.982 | TFLOPs: 43.76 | +7: iteration 6360/ 7508 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.56 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 3.349841E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.011 | TFLOPs: 43.76 | +7: iteration 6370/ 7508 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.57 | learning rate: 3.022E-05 | global batch size: 256 | lm loss: 3.343257E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.518 | TFLOPs: 43.14 | +7: iteration 6380/ 7508 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.56 | learning rate: 3.004E-05 | global batch size: 256 | lm loss: 3.325481E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.364 | TFLOPs: 43.70 | +7: iteration 6390/ 7508 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.56 | learning rate: 2.987E-05 | global batch size: 256 | lm loss: 3.355022E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.711 | TFLOPs: 43.73 | +7: iteration 6400/ 7508 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.57 | learning rate: 2.970E-05 | global batch size: 256 | lm loss: 3.346223E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.579 | TFLOPs: 43.15 | +7: iteration 6410/ 7508 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.56 | learning rate: 2.952E-05 | global batch size: 256 | lm loss: 3.339826E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.497 | TFLOPs: 43.71 | +7: iteration 6420/ 7508 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.56 | learning rate: 2.936E-05 | global batch size: 256 | lm loss: 3.350451E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.490 | TFLOPs: 43.71 | +7: iteration 6430/ 7508 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.56 | learning rate: 2.919E-05 | global batch size: 256 | lm loss: 3.352022E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.511 | TFLOPs: 43.71 | +7: iteration 6440/ 7508 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.56 | learning rate: 2.902E-05 | global batch size: 256 | lm loss: 3.331337E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.663 | TFLOPs: 43.73 | +7: iteration 6450/ 7508 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.56 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 3.344675E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.595 | TFLOPs: 43.25 | +7: iteration 6460/ 7508 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.56 | learning rate: 2.869E-05 | global batch size: 256 | lm loss: 3.332583E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.831 | TFLOPs: 43.74 | +7: iteration 6470/ 7508 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.56 | learning rate: 2.853E-05 | global batch size: 256 | lm loss: 3.330513E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.448 | TFLOPs: 43.71 | +7: iteration 6480/ 7508 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.56 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 3.335395E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.457 | TFLOPs: 43.52 | +7: iteration 6490/ 7508 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.56 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 3.333065E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.707 | TFLOPs: 43.73 | +7: iteration 6500/ 7508 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.56 | learning rate: 2.805E-05 | global batch size: 256 | lm loss: 3.340382E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.529 | TFLOPs: 43.72 | +7: iteration 6510/ 7508 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.56 | learning rate: 2.789E-05 | global batch size: 256 | lm loss: 3.349121E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.671 | TFLOPs: 43.73 | +7: iteration 6520/ 7508 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.56 | learning rate: 2.774E-05 | global batch size: 256 | lm loss: 3.344036E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.650 | TFLOPs: 43.73 | +7: iteration 6530/ 7508 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.56 | learning rate: 2.759E-05 | global batch size: 256 | lm loss: 3.335472E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.718 | TFLOPs: 43.73 | +7: iteration 6540/ 7508 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.56 | learning rate: 2.743E-05 | global batch size: 256 | lm loss: 3.325026E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.848 | TFLOPs: 43.75 | +7: iteration 6550/ 7508 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.56 | learning rate: 2.728E-05 | global batch size: 256 | lm loss: 3.334243E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.605 | TFLOPs: 43.25 | +7: iteration 6560/ 7508 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.56 | learning rate: 2.713E-05 | global batch size: 256 | lm loss: 3.326752E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.695 | TFLOPs: 43.73 | +7: iteration 6570/ 7508 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.56 | learning rate: 2.699E-05 | global batch size: 256 | lm loss: 3.340135E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.783 | TFLOPs: 43.74 | +7: iteration 6580/ 7508 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.56 | learning rate: 2.684E-05 | global batch size: 256 | lm loss: 3.342480E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.681 | TFLOPs: 43.73 | +7: iteration 6590/ 7508 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.56 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 3.327768E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.931 | TFLOPs: 43.75 | +7: iteration 6600/ 7508 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.56 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 3.330845E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.855 | TFLOPs: 43.75 | +7: iteration 6610/ 7508 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.56 | learning rate: 2.641E-05 | global batch size: 256 | lm loss: 3.333928E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.443 | TFLOPs: 43.61 | +7: iteration 6620/ 7508 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.56 | learning rate: 2.627E-05 | global batch size: 256 | lm loss: 3.336358E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.984 | TFLOPs: 43.76 | +7: iteration 6630/ 7508 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.56 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 3.342520E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.784 | TFLOPs: 43.74 | +7: iteration 6640/ 7508 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.56 | learning rate: 2.599E-05 | global batch size: 256 | lm loss: 3.332937E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.610 | TFLOPs: 43.72 | +7: iteration 6650/ 7508 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.56 | learning rate: 2.586E-05 | global batch size: 256 | lm loss: 3.343767E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.908 | TFLOPs: 43.75 | +7: iteration 6660/ 7508 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.56 | learning rate: 2.572E-05 | global batch size: 256 | lm loss: 3.329218E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.697 | TFLOPs: 43.73 | +7: iteration 6670/ 7508 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.56 | learning rate: 2.559E-05 | global batch size: 256 | lm loss: 3.353920E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.841 | TFLOPs: 43.75 | +7: iteration 6680/ 7508 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.56 | learning rate: 2.546E-05 | global batch size: 256 | lm loss: 3.331969E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.092 | TFLOPs: 43.77 | +7: iteration 6690/ 7508 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.56 | learning rate: 2.533E-05 | global batch size: 256 | lm loss: 3.355553E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.168 | TFLOPs: 43.78 | +7: iteration 6700/ 7508 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.56 | learning rate: 2.520E-05 | global batch size: 256 | lm loss: 3.328891E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.127 | TFLOPs: 43.68 | +7: iteration 6710/ 7508 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.58 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 3.352995E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.536 | TFLOPs: 41.81 | +7: iteration 6720/ 7508 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.57 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 3.333427E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.003 | TFLOPs: 42.52 | +7: iteration 6730/ 7508 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.56 | learning rate: 2.483E-05 | global batch size: 256 | lm loss: 3.330867E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.277 | TFLOPs: 43.50 | +7: iteration 6740/ 7508 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.57 | learning rate: 2.470E-05 | global batch size: 256 | lm loss: 3.334568E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.953 | TFLOPs: 42.52 | +7: iteration 6750/ 7508 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.57 | learning rate: 2.458E-05 | global batch size: 256 | lm loss: 3.326222E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.608 | TFLOPs: 43.15 | +7: iteration 6760/ 7508 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.58 | learning rate: 2.446E-05 | global batch size: 256 | lm loss: 3.327369E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.865 | TFLOPs: 42.32 | +7: iteration 6770/ 7508 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.56 | learning rate: 2.435E-05 | global batch size: 256 | lm loss: 3.341771E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.907 | TFLOPs: 43.47 | +7: iteration 6780/ 7508 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.62 | learning rate: 2.423E-05 | global batch size: 256 | lm loss: 3.339420E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 414.799 | TFLOPs: 39.55 | +7: iteration 6790/ 7508 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.58 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 3.339455E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.371 | TFLOPs: 41.79 | +7: iteration 6800/ 7508 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.61 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 3.322893E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.296 | TFLOPs: 40.26 | +7: iteration 6810/ 7508 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.70 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 3.337493E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 367.597 | TFLOPs: 35.05 | +7: iteration 6820/ 7508 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.60 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 3.343167E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.817 | TFLOPs: 40.50 | +7: iteration 6830/ 7508 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.57 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.332631E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.522 | TFLOPs: 42.48 | +7: iteration 6840/ 7508 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.60 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.320667E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.704 | TFLOPs: 40.97 | +7: iteration 6850/ 7508 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.61 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.349773E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 417.540 | TFLOPs: 39.81 | +7: iteration 6860/ 7508 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.63 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.323643E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 407.045 | TFLOPs: 38.81 | +7: iteration 6870/ 7508 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.56 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.310684E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.271 | TFLOPs: 43.50 | +7: iteration 6880/ 7508 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.56 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.303070E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.220 | TFLOPs: 43.59 | +7: iteration 6890/ 7508 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.56 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 3.314289E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.918 | TFLOPs: 43.75 | +7: iteration 6900/ 7508 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.56 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 3.311068E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.763 | TFLOPs: 43.74 | +7: iteration 6910/ 7508 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.56 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 3.317892E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.119 | TFLOPs: 43.77 | +7: iteration 6920/ 7508 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.56 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 3.308627E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.929 | TFLOPs: 43.75 | +7: iteration 6930/ 7508 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.56 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 3.318034E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.071 | TFLOPs: 43.77 | +7: iteration 6940/ 7508 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.56 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.303109E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.007 | TFLOPs: 43.76 | +7: iteration 6950/ 7508 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.56 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.313752E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.011 | TFLOPs: 43.76 | +7: iteration 6960/ 7508 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.56 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 3.312466E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.739 | TFLOPs: 43.64 | +7: iteration 6970/ 7508 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.56 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 3.311646E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.258 | TFLOPs: 43.40 | +7: iteration 6980/ 7508 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.56 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 3.316843E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.549 | TFLOPs: 43.43 | +7: iteration 6990/ 7508 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.57 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 3.316836E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.831 | TFLOPs: 43.17 | +7: iteration 7000/ 7508 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.56 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.304758E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.014 | TFLOPs: 43.76 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 7000 | lm loss value: 3.454455E+00 | lm loss PPL: 3.164104E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 7000 to checkpoints_421m3b9400m +0: [2023-03-15 23:06:36,123] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! +0: [2023-03-15 23:06:36,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:06:36,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:06:36,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:06:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:06:36,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:06:36,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:06:36,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:06:36,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:06:36,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:06:36,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:06:36,540] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:06:36,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:06:36,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:06:36,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:06:36,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:06:36,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:06:36,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:06:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:06:36,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:06:36,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:06:36,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:06:36,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:06:36,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:06:36,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:06:36,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:06:36,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:06:36,862] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:06:36,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:06:36,902] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:06:36,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:06:36,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:06:36,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:06:36,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:06:37,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:06:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:06:37,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:06:37,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:06:37,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:06:37,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:06:37,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:06:37,107] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step7000/mp_rank_00_model_states.pt +0: [2023-03-15 23:06:37,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:06:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:06:37,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:06:37,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 23:06:37,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:06:37,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:06:37,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 23:06:37,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:06:37,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 23:06:37,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:06:37,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:06:37,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:06:37,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:06:37,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:06:37,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 23:06:37,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:06:37,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:06:37,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 23:06:37,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:06:37,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:06:37,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:06:37,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:06:37,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 23:06:37,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: successfully saved checkpoint at iteration 7000 to checkpoints_421m3b9400m +7: time (ms) | save-checkpoint: 1481.41 +7: iteration 7010/ 7508 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.72 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.301467E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 353.279 | TFLOPs: 33.68 | +7: iteration 7020/ 7508 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.56 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.303320E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.701 | TFLOPs: 43.45 | +7: iteration 7030/ 7508 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.56 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.325533E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.995 | TFLOPs: 43.76 | +7: iteration 7040/ 7508 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.56 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.315805E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.905 | TFLOPs: 43.75 | +7: iteration 7050/ 7508 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.56 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.318654E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.712 | TFLOPs: 43.73 | +7: iteration 7060/ 7508 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.56 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.313291E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.701 | TFLOPs: 43.73 | +7: iteration 7070/ 7508 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.56 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.320016E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.053 | TFLOPs: 43.77 | +7: iteration 7080/ 7508 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.56 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.311921E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.036 | TFLOPs: 43.76 | +7: iteration 7090/ 7508 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.56 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.316892E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.941 | TFLOPs: 43.76 | +7: iteration 7100/ 7508 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.56 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.307483E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.803 | TFLOPs: 43.74 | +7: iteration 7110/ 7508 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.56 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.301266E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.939 | TFLOPs: 43.75 | +7: iteration 7120/ 7508 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.56 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.316782E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.737 | TFLOPs: 43.74 | +7: iteration 7130/ 7508 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.56 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.314205E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.943 | TFLOPs: 43.76 | +7: iteration 7140/ 7508 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.56 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.306857E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.111 | TFLOPs: 43.77 | +7: iteration 7150/ 7508 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.56 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.303555E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.894 | TFLOPs: 43.75 | +7: iteration 7160/ 7508 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.56 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.308255E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.980 | TFLOPs: 43.76 | +7: iteration 7170/ 7508 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.56 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.311080E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.994 | TFLOPs: 43.76 | +7: iteration 7180/ 7508 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.57 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.303700E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.430 | TFLOPs: 43.13 | +7: iteration 7190/ 7508 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.56 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.306047E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.986 | TFLOPs: 43.76 | +7: iteration 7200/ 7508 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.56 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.301714E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.795 | TFLOPs: 43.74 | +7: iteration 7210/ 7508 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.56 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.314372E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.859 | TFLOPs: 43.75 | +7: iteration 7220/ 7508 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.56 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.322968E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.819 | TFLOPs: 43.74 | +7: iteration 7230/ 7508 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.56 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.315638E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.007 | TFLOPs: 43.76 | +7: iteration 7240/ 7508 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.56 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.313958E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.985 | TFLOPs: 43.76 | +7: iteration 7250/ 7508 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.56 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.330902E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.044 | TFLOPs: 43.76 | +7: iteration 7260/ 7508 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.56 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.315548E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.994 | TFLOPs: 43.76 | +7: iteration 7270/ 7508 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.56 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.319111E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.968 | TFLOPs: 43.76 | +7: iteration 7280/ 7508 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.56 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.315588E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.948 | TFLOPs: 43.76 | +7: iteration 7290/ 7508 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.56 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.314400E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.949 | TFLOPs: 43.76 | +7: iteration 7300/ 7508 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.56 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.324419E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.313 | TFLOPs: 43.79 | +7: iteration 7310/ 7508 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.56 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.315535E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.219 | TFLOPs: 43.50 | +7: iteration 7320/ 7508 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.324404E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.879 | TFLOPs: 43.75 | +7: iteration 7330/ 7508 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.56 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.315795E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.610 | TFLOPs: 43.53 | +7: iteration 7340/ 7508 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.56 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.305976E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.988 | TFLOPs: 43.76 | +7: iteration 7350/ 7508 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.56 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.319290E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.914 | TFLOPs: 43.66 | +7: iteration 7360/ 7508 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.57 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.318629E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.930 | TFLOPs: 43.18 | +7: iteration 7370/ 7508 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.56 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.322016E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.634 | TFLOPs: 43.63 | +7: iteration 7380/ 7508 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.56 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.316699E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.752 | TFLOPs: 43.74 | +7: iteration 7390/ 7508 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.317249E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.911 | TFLOPs: 43.75 | +7: iteration 7400/ 7508 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.56 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.309689E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.405 | TFLOPs: 43.70 | +7: iteration 7410/ 7508 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.324926E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.674 | TFLOPs: 43.73 | +7: iteration 7420/ 7508 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.56 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.315327E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.943 | TFLOPs: 43.76 | +7: iteration 7430/ 7508 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.317769E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.987 | TFLOPs: 43.76 | +7: iteration 7440/ 7508 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.316121E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.956 | TFLOPs: 43.76 | +7: iteration 7450/ 7508 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.303223E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.988 | TFLOPs: 43.76 | +7: iteration 7460/ 7508 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.308244E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.903 | TFLOPs: 43.75 | +7: iteration 7470/ 7508 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.310282E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.092 | TFLOPs: 43.77 | +7: iteration 7480/ 7508 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.318050E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.986 | TFLOPs: 43.76 | +7: iteration 7490/ 7508 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.324417E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.850 | TFLOPs: 43.75 | +7: iteration 7500/ 7508 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.299224E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.914 | TFLOPs: 43.75 | +0: [after training is done] datetime: 2023-03-15 23:11:21 +0: saving checkpoint at iteration 7508 to checkpoints_421m3b9400m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.443438E+00 | lm loss PPL: 3.129436E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-15 23:11:21,349] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7508 is begin to save! +0: [2023-03-15 23:11:21,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:11:21,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:11:21,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:11:21,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:11:21,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:11:21,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:11:21,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:11:21,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:11:21,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:11:21,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:11:21,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:11:21,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:11:21,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:11:21,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:11:21,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:11:21,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:11:21,776] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:11:21,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:11:21,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:11:21,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:11:21,855] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:11:21,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:11:21,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:11:21,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:11:21,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:11:21,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:11:21,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:11:22,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:11:22,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:11:22,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:11:22,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:11:22,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:11:22,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:11:22,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:11:22,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:11:22,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:11:22,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:11:22,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:11:22,215] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:11:22,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:11:22,220] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt +0: [2023-03-15 23:11:22,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:11:22,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:22,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:22,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,332] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,332] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +5: [2023-03-15 23:11:22,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:22,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:22,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:22,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:22,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +1: [2023-03-15 23:11:22,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +3: [2023-03-15 23:11:22,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:22,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:22,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +7: [2023-03-15 23:11:22,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:22,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:22,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +4: [2023-03-15 23:11:22,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +6: [2023-03-15 23:11:22,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:22,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:22,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:22,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:22,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:22,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: [2023-03-15 23:11:22,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:22,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +2: [2023-03-15 23:11:22,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:22,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:22,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7508 is ready now! +0: successfully saved checkpoint at iteration 7508 to checkpoints_421m3b9400m +END 3318426: Wed 15 Mar 2023 11:11:38 PM EET diff --git a/421m3b9400m/3318674.err b/421m3b9400m/3318674.err new file mode 100644 index 0000000000000000000000000000000000000000..9c46a08bc9e9399be2f85966466a2f0e349921ba --- /dev/null +++ b/421m3b9400m/3318674.err @@ -0,0 +1,1115 @@ +6: 2023-03-15 23:28:51.904238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904241: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904249: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904253: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904248: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904242: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:51.904252: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990058: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990069: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990073: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990080: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990079: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990076: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:51.990082: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990194: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990202: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990208: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990212: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990199: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990205: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:51.990220: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038762: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038761: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038765: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038760: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038828: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038837: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: 2023-03-15 23:28:52.038758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:52.038769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038837: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038834: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038842: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038844: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:52.038842: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039655: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039663: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039669: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039671: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039665: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:52.039662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058156: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058155: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058148: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058151: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058160: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:52.058159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122250: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122258: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122258: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122256: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:52.122273: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:54.360642: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360644: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.360649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:54.361265: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361265: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361273: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361272: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361276: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361276: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361276: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:54.361289: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.362899: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362891: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362906: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362900: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.362931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:54.363336: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363338: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363341: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363345: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363346: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363348: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363348: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:54.363356: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.365607: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365606: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365607: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365607: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365621: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.365622: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:54.366041: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366045: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366047: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366051: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366052: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366054: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366058: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:54.366060: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.369716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369726: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.369721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:28:54.370171: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370173: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370175: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370177: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370178: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370180: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370183: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:54.370185: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373078: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:54.373517: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373519: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373523: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373526: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373526: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373527: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373530: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:54.373536: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.373747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.373762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:54.374259: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374267: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374266: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374269: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374272: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374273: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374275: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:54.374284: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.374718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374724: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374724: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.374729: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:54.375140: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375146: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375149: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375153: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375154: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375157: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375155: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:54.375158: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378210: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:54.378657: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378659: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378661: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378663: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378664: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378665: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378664: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:54.378667: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:29:00.308769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308803: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308813: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308831: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308924: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.308956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320210: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320235: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320229: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320253: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320269: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320271: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.320273: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321122: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321145: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322152: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322234: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322251: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322259: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.322294: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323115: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323174: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323232: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.323241: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323746: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.323821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.324943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.324968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.324989: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.324999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.325011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.325031: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.325036: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.325203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325362: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325397: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325424: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325437: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325452: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325462: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.325537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330947: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330946: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.330965: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330965: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330965: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330970: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330972: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.330975: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331037: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331432: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331435: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331437: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331405: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331441: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331405: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331441: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.331442: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.331448: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.331442: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.331453: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.331454: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.331460: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.331459: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.331464: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.331466: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331405: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-15 23:29:00.331493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.331507: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.331418: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331418: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331419: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331421: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331423: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331424: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331426: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.331427: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.331814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331860: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331812: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.331816: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331859: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331813: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-15 23:29:00.331939: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331819: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331866: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-15 23:29:00.331941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-15 23:29:00.331943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331832: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.331832: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331826: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331837: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.331840: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.331840: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331868: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.331839: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.331842: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.331846: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.331975: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-15 23:29:00.331945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331948: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331948: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331877: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331883: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.331956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331883: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331882: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331885: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331977: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-15 23:29:00.331957: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.331961: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.331962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331885: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331887: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331888: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.331962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.331966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.331967: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.331986: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331989: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331989: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331991: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331990: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331993: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331995: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.331997: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322677: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322677: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322677: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.322684: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322684: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322687: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322685: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322692: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.322693: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +7: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +5: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Loading extension module utils... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +4: +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils...Loading extension module utils... +0: +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +3: +3: +3: +3: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Loading extension module utils... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +3: +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils...Loading extension module utils... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +4: +4: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils... +6: +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils...Loading extension module utils... +7: +7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils...Loading extension module utils... +7: +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/421m3b9400m/3318674.out b/421m3b9400m/3318674.out new file mode 100644 index 0000000000000000000000000000000000000000..25c840370c889f6ea44584977c3fbe1042a76a5e --- /dev/null +++ b/421m3b9400m/3318674.out @@ -0,0 +1,6425 @@ +Model parameters: d_model 1280 ffw_size 5120 kv_size 128 n_heads 10 n_layers 18 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 18 --hidden-size 1280 --num-attention-heads 10 --kv-channels 128 --ffn-hidden-size 5120 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-421m3b9400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --no-load-optim --reset-progress --override-lr-scheduler --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_421m3b9400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_421m3b9400m --load checkpoints_421m3b9400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3318674.json --zero-stage 0 +START 3318674: Wed 15 Mar 2023 11:28:32 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 48.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 48.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 39.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 45.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 50.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 46.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 38.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 38.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 42.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +5: Launching on nid006722 (5/8), master nid006717 port 9999, GPUs 8, CUDA: True +1: Launching on nid006718 (1/8), master nid006717 port 9999, GPUs 8, CUDA: True +7: Launching on nid006724 (7/8), master nid006717 port 9999, GPUs 8, CUDA: True +2: Launching on nid006719 (2/8), master nid006717 port 9999, GPUs 8, CUDA: True +4: Launching on nid006721 (4/8), master nid006717 port 9999, GPUs 8, CUDA: True +0: Launching on nid006717 (0/8), master nid006717 port 9999, GPUs 8, CUDA: True +6: Launching on nid006723 (6/8), master nid006717 port 9999, GPUs 8, CUDA: True +3: Launching on nid006720 (3/8), master nid006717 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3318674.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 5120 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1280 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-421m3b9400mval +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_421m3b9400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 18 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_421m3b9400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_421m3b9400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-15 23:30:09,099] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.097 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 102 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 27.801 seconds +0: time to initialize megatron (seconds): -0.259 +0: [after megatron is initialized] datetime: 2023-03-15 23:30:39 +0: building GPT model ... +0: [2023-03-15 23:30:39,877] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-15 23:30:39,878] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-15 23:30:39,879] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.35 GB, percent = 6.2% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-15 23:30:41,862] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=25 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: undo +0: 22: MixedFusedLayerNorm +0: 23: EmbeddingPipe +0: 24: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-15 23:30:42,116] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-15 23:30:42,116] [INFO] [utils.py:828:see_memory_usage] MA 0.79 GB Max_MA 0.79 GB CA 0.86 GB Max_CA 1 GB +0: [2023-03-15 23:30:42,117] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.37 GB, percent = 6.2% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-15 23:30:42,118] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-15 23:30:55,432] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-15 23:30:55,433] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-15 23:30:55,433] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-15 23:30:55,442] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-15 23:30:55,442] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-15 23:30:55,561] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-15 23:30:55,561] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.79 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-15 23:30:55,562] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +3: ninja: no work to do. +3: Time to load utils op: 0.18648481369018555 seconds +0: Time to load utils op: 0.10914301872253418 seconds +0: [2023-03-15 23:30:55,781] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-15 23:30:55,782] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.78 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-15 23:30:55,782] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.06 GB, percent = 6.4% +0: ninja: no work to do. +0: Time to load utils op: 0.1426684856414795 seconds +3: Time to load utils op: 0.0005524158477783203 seconds +0: Time to load utils op: 0.0006017684936523438 seconds +4: Time to load utils op: 0.11848950386047363 seconds +4: Time to load utils op: 0.11846780776977539 seconds +4: Time to load utils op: 0.11848974227905273 secondsTime to load utils op: 0.11849164962768555 secondsTime to load utils op: 0.11845803260803223 seconds +4: +4: +4: Time to load utils op: 0.11847805976867676 secondsTime to load utils op: 0.11846423149108887 seconds +4: +4: Time to load utils op: 0.118499755859375 seconds +0: Time to load utils op: 0.20264410972595215 secondsTime to load utils op: 0.20205068588256836 seconds +0: +0: Time to load utils op: 0.20334076881408691 seconds +0: Time to load utils op: 0.20183873176574707 seconds +0: Time to load utils op: 0.20164775848388672 seconds +0: Time to load utils op: 0.20183181762695312 seconds +3: Time to load utils op: 0.2035083770751953 seconds +3: Time to load utils op: 0.20241951942443848 seconds +3: Time to load utils op: 0.20240020751953125 seconds +3: Time to load utils op: 0.20242738723754883 secondsTime to load utils op: 0.20318102836608887 seconds +3: +3: Time to load utils op: 0.20307064056396484 seconds +3: Time to load utils op: 0.20356130599975586 seconds +1: Time to load utils op: 0.21283841133117676 seconds +1: Time to load utils op: 0.21238040924072266 seconds +1: Time to load utils op: 0.21228957176208496 seconds +1: Time to load utils op: 0.21233415603637695 seconds +1: Time to load utils op: 0.21233344078063965 seconds +1: Time to load utils op: 0.2123408317565918 secondsTime to load utils op: 0.21219491958618164 seconds +1: +1: Time to load utils op: 0.21299242973327637 seconds +5: Time to load utils op: 0.2110157012939453 seconds +5: Time to load utils op: 0.2107839584350586 seconds +5: Time to load utils op: 0.2106626033782959 seconds +5: Time to load utils op: 0.21080684661865234 seconds +5: Time to load utils op: 0.2107698917388916 secondsTime to load utils op: 0.21081066131591797 secondsTime to load utils op: 0.21081185340881348 seconds +5: Time to load utils op: 0.2106466293334961 seconds +5: +5: +2: Time to load utils op: 0.2115178108215332 seconds +2: Time to load utils op: 0.2115495204925537 seconds +2: Time to load utils op: 0.21156048774719238 seconds +2: Time to load utils op: 0.21161866188049316 secondsTime to load utils op: 0.21161222457885742 secondsTime to load utils op: 0.2116107940673828 secondsTime to load utils op: 0.21161365509033203 seconds +2: +2: +2: +2: Time to load utils op: 0.21162128448486328 seconds +7: Time to load utils op: 0.21054792404174805 secondsTime to load utils op: 0.20868515968322754 seconds +7: +7: Time to load utils op: 0.21042108535766602 seconds +7: Time to load utils op: 0.2075037956237793 seconds +7: Time to load utils op: 0.20882368087768555 secondsTime to load utils op: 0.20780396461486816 secondsTime to load utils op: 0.20884943008422852 seconds +7: +7: +7: Time to load utils op: 0.20848917961120605 seconds +0: Time to load utils op: 0.0004134178161621094 seconds +0: Time to load utils op: 0.00039267539978027344 seconds +0: Time to load utils op: 0.0003960132598876953 seconds +0: Time to load utils op: 0.00039696693420410156 seconds +0: Time to load utils op: 0.0003917217254638672 seconds +0: Time to load utils op: 0.00041413307189941406 seconds +6: Time to load utils op: 0.21147489547729492 seconds +6: Time to load utils op: 0.2115027904510498 seconds +6: Time to load utils op: 0.21150684356689453 seconds +6: Time to load utils op: 0.21152043342590332 seconds +6: Time to load utils op: 0.21155428886413574 seconds +6: Time to load utils op: 0.21155929565429688 secondsTime to load utils op: 0.2115623950958252 secondsTime to load utils op: 0.2115626335144043 seconds +6: +6: +3: Time to load utils op: 0.0003635883331298828 seconds +3: Time to load utils op: 0.000385284423828125 seconds +3: Time to load utils op: 0.00035762786865234375 seconds +3: Time to load utils op: 0.0003428459167480469 seconds +3: Time to load utils op: 0.00031065940856933594 seconds +3: Time to load utils op: 0.0003757476806640625 seconds +3: Time to load utils op: 0.0003485679626464844 seconds +0: [2023-03-15 23:30:55,963] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-15 23:30:55,964] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-15 23:30:55,964] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +4: Time to load utils op: 0.0007457733154296875 seconds +1: Time to load utils op: 0.0009491443634033203 seconds +4: Time to load utils op: 0.0012960433959960938 secondsTime to load utils op: 0.001293182373046875 seconds +4: +4: Time to load utils op: 0.0013210773468017578 seconds +1: Time to load utils op: 0.0011723041534423828 seconds +4: Time to load utils op: 0.001340627670288086 secondsTime to load utils op: 0.0012574195861816406 seconds +4: +4: Time to load utils op: 0.0012695789337158203 seconds +4: Time to load utils op: 0.0012869834899902344 seconds +1: Time to load utils op: 0.0013275146484375 seconds +1: Time to load utils op: 0.0013577938079833984 seconds +1: Time to load utils op: 0.0013532638549804688 seconds +1: Time to load utils op: 0.0013239383697509766 seconds +1: Time to load utils op: 0.0013179779052734375 seconds +1: Time to load utils op: 0.0014095306396484375 seconds +5: Time to load utils op: 0.0008628368377685547 seconds +6: Time to load utils op: 0.0010914802551269531 secondsTime to load utils op: 0.0011057853698730469 seconds +6: +5: Time to load utils op: 0.0010554790496826172 seconds +5: Time to load utils op: 0.0010478496551513672 seconds +5: Time to load utils op: 0.000926971435546875 seconds +5: Time to load utils op: 0.0011022090911865234 seconds +5: Time to load utils op: 0.0010471343994140625 seconds +6: Time to load utils op: 0.0012297630310058594 seconds +6: Time to load utils op: 0.0012781620025634766 seconds +6: Time to load utils op: 0.0012001991271972656 secondsTime to load utils op: 0.0012514591217041016 seconds +6: +5: Time to load utils op: 0.0010497570037841797 seconds +5: Time to load utils op: 0.0010883808135986328 seconds +6: Time to load utils op: 0.001234292984008789 seconds +6: Time to load utils op: 0.001249074935913086 seconds +2: Time to load utils op: 0.0007288455963134766 seconds +2: Time to load utils op: 0.0006449222564697266 seconds +2: Time to load utils op: 0.0005080699920654297 seconds +2: Time to load utils op: 0.0007419586181640625 secondsTime to load utils op: 0.0006053447723388672 seconds +2: +2: Time to load utils op: 0.0007128715515136719 seconds +2: Time to load utils op: 0.000583648681640625 seconds +2: Time to load utils op: 0.00054168701171875 seconds +7: Time to load utils op: 0.0005326271057128906 secondsTime to load utils op: 0.0005078315734863281 seconds +7: +7: Time to load utils op: 0.0006341934204101562 secondsTime to load utils op: 0.0004513263702392578 seconds +7: +7: Time to load utils op: 0.0003638267517089844 seconds +7: Time to load utils op: 0.0006406307220458984 seconds +7: Time to load utils op: 0.00030732154846191406 seconds +7: Time to load utils op: 0.0003998279571533203 seconds +0: [2023-03-15 23:30:56,076] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-15 23:30:56,077] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-15 23:30:56,077] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,183] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-15 23:30:56,183] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,183] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,287] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-15 23:30:56,287] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,287] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,392] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-15 23:30:56,393] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,393] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,495] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-15 23:30:56,496] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,496] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,606] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-15 23:30:56,606] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,606] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,709] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-15 23:30:56,710] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-15 23:30:56,710] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +0: [2023-03-15 23:30:56,710] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-15 23:30:56,710] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-15 23:30:56,710] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-15 23:30:56,711] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-15 23:30:56,711] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-15 23:30:56,711] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-15 23:30:56,711] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-15 23:30:56,711] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-15 23:30:56,711] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-15 23:30:56,712] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-15 23:30:56,713] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-15 23:30:56,713] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004253387451171875 seconds +0: [2023-03-15 23:30:56,714] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-15 23:30:56,724] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=25 [0, 25) STAGE_PARAMS=421207040 (421.207M) TOTAL_PARAMS=421207040 (421.207M) UNIQUE_PARAMS=421207040 (421.207M) +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:56,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:56,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:56,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:57,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:57,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:57,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:57,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:57,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:57,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:57,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:57,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:57,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:57,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:57,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:57,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:57,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:57,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:57,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:57,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:57,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:57,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:57,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:58,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:58,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:58,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:58,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:58,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:58,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:58,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:58,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:58,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:58,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:58,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:58,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:58,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:58,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:58,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:58,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:58,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:58,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:58,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:58,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:58,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:58,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:58,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:58,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:58,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:58,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:58,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:58,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:58,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:58,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:58,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:58,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:58,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:58,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:58,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:58,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:58,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:58,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:58,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:58,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:58,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:58,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:58,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:58,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:58,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:58,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:58,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:58,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:58,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:58,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:59,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:59,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:59,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:59,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:59,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:59,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:59,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:59,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:59,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:59,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:59,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:59,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:59,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:59,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:59,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:59,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:59,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:59,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:59,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:59,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:59,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:59,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:59,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:59,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:59,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:59,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:59,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:59,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:59,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:59,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:59,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:59,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:59,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:59,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:59,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:59,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:59,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:59,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:59,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:59,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:59,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:59,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:59,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:59,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:59,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:59,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:59,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:59,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:59,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:59,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:31:00,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:31:00,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:31:00,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:31:00,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:31:00,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:31:00,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:31:00,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:31:00,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:31:00,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:31:00,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:31:00,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:31:00,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: > overriding learning rate value to 0.0002 +4: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +3: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:31:00,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:31:00,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +4: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +1: [2023-03-15 23:31:00,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +1: [2023-03-15 23:31:00,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:31:00,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +0: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +3: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +2: [2023-03-15 23:31:00,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +7: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +5: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +7: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +5: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/layer_22-model_00-model_states.pt. +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:31:00,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:31:00,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:31:00,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:31:00,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,443] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +4: [2023-03-15 23:31:00,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,443] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +2: [2023-03-15 23:31:00,446] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +4: [2023-03-15 23:31:00,446] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +1: [2023-03-15 23:31:00,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,448] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +1: [2023-03-15 23:31:00,451] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +0: [2023-03-15 23:31:00,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,466] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +5: [2023-03-15 23:31:00,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,467] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +1: [2023-03-15 23:31:00,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,468] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +0: [2023-03-15 23:31:00,469] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +3: [2023-03-15 23:31:00,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,470] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +5: [2023-03-15 23:31:00,471] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +1: [2023-03-15 23:31:00,471] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +3: [2023-03-15 23:31:00,473] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +7: [2023-03-15 23:31:00,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,474] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +0: [2023-03-15 23:31:00,474] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +0: [2023-03-15 23:31:00,477] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +7: [2023-03-15 23:31:00,478] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +6: [2023-03-15 23:31:00,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,483] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +6: [2023-03-15 23:31:00,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,483] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +5: [2023-03-15 23:31:00,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,485] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +6: [2023-03-15 23:31:00,486] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +6: [2023-03-15 23:31:00,487] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +5: [2023-03-15 23:31:00,488] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +2: [2023-03-15 23:31:00,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,496] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +2: [2023-03-15 23:31:00,499] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +4: [2023-03-15 23:31:00,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,507] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +4: [2023-03-15 23:31:00,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,507] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +7: [2023-03-15 23:31:00,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +4: [2023-03-15 23:31:00,510] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +4: [2023-03-15 23:31:00,511] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +7: [2023-03-15 23:31:00,511] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +0: [2023-03-15 23:31:00,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,520] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +3: [2023-03-15 23:31:00,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,521] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +0: [2023-03-15 23:31:00,523] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +7: [2023-03-15 23:31:00,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,524] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +3: [2023-03-15 23:31:00,524] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +5: [2023-03-15 23:31:00,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,526] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +7: [2023-03-15 23:31:00,527] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +5: [2023-03-15 23:31:00,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,527] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +6: [2023-03-15 23:31:00,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,528] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +5: [2023-03-15 23:31:00,529] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +5: [2023-03-15 23:31:00,530] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +1: [2023-03-15 23:31:00,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,528] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +1: [2023-03-15 23:31:00,528] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +6: [2023-03-15 23:31:00,531] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +1: [2023-03-15 23:31:00,532] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +1: [2023-03-15 23:31:00,532] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +1: [2023-03-15 23:31:00,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,533] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +1: [2023-03-15 23:31:00,535] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +2: [2023-03-15 23:31:00,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,537] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +2: [2023-03-15 23:31:00,537] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +7: [2023-03-15 23:31:00,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,541] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +7: [2023-03-15 23:31:00,541] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +2: [2023-03-15 23:31:00,541] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +2: [2023-03-15 23:31:00,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,541] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +0: [2023-03-15 23:31:00,542] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +6: [2023-03-15 23:31:00,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,544] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +7: [2023-03-15 23:31:00,544] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +2: [2023-03-15 23:31:00,544] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +0: [2023-03-15 23:31:00,545] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +6: [2023-03-15 23:31:00,547] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +3: [2023-03-15 23:31:00,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,548] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +3: [2023-03-15 23:31:00,548] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +3: [2023-03-15 23:31:00,548] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +4: [2023-03-15 23:31:00,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,549] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +3: [2023-03-15 23:31:00,551] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +3: [2023-03-15 23:31:00,551] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +3: [2023-03-15 23:31:00,551] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +4: [2023-03-15 23:31:00,551] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +6: [2023-03-15 23:31:00,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,553] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +3: [2023-03-15 23:31:00,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,553] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +6: [2023-03-15 23:31:00,555] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +3: [2023-03-15 23:31:00,556] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +7: [2023-03-15 23:31:00,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,559] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +0: [2023-03-15 23:31:00,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,560] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +5: [2023-03-15 23:31:00,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,561] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +5: [2023-03-15 23:31:00,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,562] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +7: [2023-03-15 23:31:00,562] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +4: [2023-03-15 23:31:00,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,562] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +0: [2023-03-15 23:31:00,563] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +5: [2023-03-15 23:31:00,564] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +4: [2023-03-15 23:31:00,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,565] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +4: [2023-03-15 23:31:00,565] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +5: [2023-03-15 23:31:00,566] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +4: [2023-03-15 23:31:00,568] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +6: [2023-03-15 23:31:00,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,572] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +5: [2023-03-15 23:31:00,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,573] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +0: [2023-03-15 23:31:00,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,574] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +6: [2023-03-15 23:31:00,575] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +5: [2023-03-15 23:31:00,576] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +0: [2023-03-15 23:31:00,577] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +7: [2023-03-15 23:31:00,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,580] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +2: [2023-03-15 23:31:00,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,583] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +7: [2023-03-15 23:31:00,583] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +1: [2023-03-15 23:31:00,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,585] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +1: [2023-03-15 23:31:00,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,586] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +1: [2023-03-15 23:31:00,586] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +0: [2023-03-15 23:31:00,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,588] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +6: [2023-03-15 23:31:00,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,588] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +6: [2023-03-15 23:31:00,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,588] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +1: [2023-03-15 23:31:00,589] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +1: [2023-03-15 23:31:00,589] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +2: [2023-03-15 23:31:00,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,590] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +0: [2023-03-15 23:31:00,591] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +6: [2023-03-15 23:31:00,592] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +4: [2023-03-15 23:31:00,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:31:00,592] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +4: [2023-03-15 23:31:00,592] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +2: [2023-03-15 23:31:00,594] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +4: [2023-03-15 23:31:00,595] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +4: [2023-03-15 23:31:00,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,598] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +3: [2023-03-15 23:31:00,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:31:00,601] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +3: [2023-03-15 23:31:00,601] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +3: [2023-03-15 23:31:00,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:31:00,604] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +7: [2023-03-15 23:31:00,604] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +3: [2023-03-15 23:31:00,605] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +7: [2023-03-15 23:31:00,607] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +3: [2023-03-15 23:31:00,607] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +7: [2023-03-15 23:31:00,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:31:00,613] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +2: [2023-03-15 23:31:00,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:31:00,615] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +7: [2023-03-15 23:31:00,616] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +5: [2023-03-15 23:31:00,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:31:00,618] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +2: [2023-03-15 23:31:00,619] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +5: [2023-03-15 23:31:00,621] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +0: [2023-03-15 23:31:00,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:31:00,623] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +0: [2023-03-15 23:31:00,627] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +1: [2023-03-15 23:31:00,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:31:00,643] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +1: [2023-03-15 23:31:00,645] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +0: successfully loaded checkpoint from checkpoints_421m3b9400m at iteration 0 +7: time (ms) | load-checkpoint: 3932.05 +0: estimated model parameters: 0.42120704 +0: estimated model parameters without embeddings: 0.35419648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-15 23:31:01 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.007469 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.014 seconds +0: total number of samples: 195101 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.034558 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.009 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-15 23:31:14 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 21329.46 | train/valid/test-data-iterators-setup: 13056.72 +0: [after training is done] datetime: 2023-03-15 23:31:14 +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.404755E+00 | lm loss PPL: 3.010691E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3318674: Wed 15 Mar 2023 11:31:44 PM EET diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91cf74175ab4eb37d39bdca23241d9e33881198d --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2414ea04f5e61281ad9fcd9b65d276d1c00784a942a77abfb0b2558367ba1392 +size 78980887 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f754090afda34840b13587c764d1a19fe627c6c5 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a50e14bf8b324621a2f0e9220e02b5fedbf0db9239c51860ee08e7fb4c8bc0 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfd276f6ed42607681022983c8d7b44017a42f14 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc890f47e611eadfe737b2b5c716daaec683961409a2bc1e30a46cf08114df0 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ac6f66a3c313d4aeba8200971dbc294e189fc62 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cdcffcf0d3f30b79321be204df505e5639493d59779424e088a50d64b603f6 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa46346762dd07471db49a0218fa2326a81bf477 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d495800d0bdb232282156c203501c414640ebe079e07fd8ca2615cf99971b73 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..290049b49abc69ca21b15e297b4165a7df228484 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:993592d679bd094daa57a6ce0be9a0b85d37303670dd31a2269f5a59496b2784 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d028ddf742976744c76502ebc72d73c170879fc2 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6639153cfadd4189f9fc4dd7073539b9123b04a6e6fb5af0d28f0e2737f350 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..780c9cd92fc53a1d8eef4b7326bdfb2d92e146b3 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0840b3bbcbbbe1e189237ed5136f4eaea24206cbd9ecc7a10fcdf277de272f32 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20b5f12a3477f675e184785d5507387320afd20a --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc0d33434125791fbe6856775ecfe46dbeb752a2ab1043b92d0a94d5d8edec9 +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..949ecb5f9f7e44444372bbe5d8ace7bb46b945ce --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db94c740c8cbc11fa84c11a40d27a28544bc2d912b31d97b501f6bec8215b739 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55d4c778af54d545500bd1115d3dc5e37a256b0a --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce81e2a44d33538055a3923ec749b8063357c4836be4b3347a4b6a2ca9f3afc1 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61bef6b73fe0e2e6fe598271f1d565977362897d --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d36f28b4ccb41f67d5a2f6519271ea18b0cbe01315d9f57bb35a93e3d14d23f +size 78980951 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83d33c4b153199717dfe75cd9c4127288620e1de --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043aa0164e60924870d42ae999ee5d1c070c473790231ca9ffec4a0ba8cd7241 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a22eab555472ae2e309240bbe8d9fe8c25f430fd --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79dab4f06ebc56367d82abf070efb3bb92bf23f3aa0624f47fe462d98b6a206 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32dd68df2ff8dd64bde10d74b6085cf86bc8f2dd --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db423354c54ec4e1b32c0528d081f8ac090f700dbdae6031f0402beb82e6d69 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c07aea8adf0f8ac7ae59dde71f213421a0b58212 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d24c1e27e828bd65fcc16b0921f6c15223a89a4703c56e58fb3ca7aed263bf6 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a80af18913d28d3a14fd99a1abc81ca9b3ee1f37 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ff23bdd8a2faf043230a0bda3780c1036a5cb487168f0bb601217c9d876e8fd +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e50d7c624ffb91e582aed77b55d77488f0190e60 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89820bf444195b0ab285cff434ba55f5a280e1008b8d071ccc23dca6e1ad9c58 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..738f297e7934a208f453f5d54d0117c81fd3860d --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:083095852e6f2a6163025f571ba06eb8ce9f17b6442e391aa450bf9f73539b26 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b00deb5eb99e3b3d7547990a93444883cdfa1267 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7fc67e56e22e9b50ce802402306b5486a258dff98fcf387b72abeed89910c3 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c2552df3b8c06e95c46df92f8e6c49ae13f3d95 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:396df5ac682f2625b0283db163ae1decae041fe39699503f9fe53cf6b1eb9b91 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e003676b1bfac782b76404bac2fa62f0c81ef261 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f50782e7eaea2ff4bc0b495973d328688f953a024990413452cffec805a488 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8754ca720b001ef064a72251cee69bd6e0bec17 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5b8c42d54f4fcb0ad5a2e22337bc8719b6c774e27b575922bba54a639779f4 +size 78980887 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3ef8d119410cd9f6bb3f873fdeb7f77d03fa014 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e7068f8bb4c280a294e8aea0405510b9efe0d6c5d594438f203bc484494f97c +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6375901f5fdc7cf87856a31aabc3b236b47e62b --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0177fcba7556802fd0431cb2cf74ec6d275d1ab69166bc6f170dca93051f8f +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..002c5b5e66db8f8934b6ce301a6004449ac8d8a0 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd57fd1166a17c454b00fad076b255be3cce18da142a5b1cddc1b999075fbdb0 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0686b0742572b752bf82268b5f6ba6c8f867f9c7 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f4b027bcdec4322588739ef3dfff0a22e96def54788ec90f1bd03313db0a722 +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8fca0ae6b8335f94426253649d129ea6793f020 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb83fa55514406a0b67cc18db868c8447a9218f327d5b7d58188b58cc12cfd5 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb25dc5bd2c0085949e93d66f652c4abd88adc01 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d97f9b09f6cdb9a4078b5a1a32c2c822248d7a67ff821adb7d190377784fff79 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7197c02b372adbd9726335a293894c387eb3eb5b --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0676b76520126199e2fcc3680b9d493933aa3892a62f137a08c7fe14cb5f9344 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7321c4640de57f395891bf01a02ca4a455d1ab9d --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a187564f9fe3a1c0ce5587fe7291e17fceae4af8453173ad75e0a724989b75 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..978bbf1fd938c599a6444788ff1640c45ef36c5f --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78085f860cbd6a27b2c5b2509e302bdecf0e8a8119c474c236e619edda901b4e +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..defad7b9a1e7a14ec8f3fdd0d3fa5f923eef0fad --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d57010b849486b63d9f50ea7f0e40e88657f26ba6e6e7a9dd9b8c2834c00009 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed05b3b815615810198b926657c6de36dc9c9462 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec0e8ab44b95dc95b1bb52498066aafc6160a0d7518f499fd1924a89b6219a01 +size 78980887 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d52ccd1c05a0eaff5c3fae0b4d618f1a1e2d46f --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ee3145677aab5a8073318582d359667012b3374b2cf59612f63bbc46550698 +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3421351948eb53b8a6b24c4fff7ac2fe70da5755 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f890a4e14f98b3a4e0a4b30d09471cbc3867707f390af95ca4c1b80303e85f66 +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d21df351f543220d0c3b436c42d62ea55e4f7e5 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:257d90f9fbe052813fb0384da7e482698191b72768588b48fe6fa516cf414f58 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8d5088daf4da95fa3b197f74ac2db6ebf3760a1 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ade52fd48c6de0466c970d3d516dbd65ce47878abf857a376d9554814fa82c2c +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14ba7bb81f3d53a5401ebe0913ada5a5f55cef80 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333cd4b5803ec745f3d61893b76daadce467c6bb6aa20604ea3f49d49d63c6c0 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..262e5c5c89699358e980ab110a76a19f800c7da5 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1a0be2e2251935bf23b48a53ae21617e55dc97b25f1f2f05e5bca75cc28ab3 +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e59bc9e2f1f7f8c6b30ccaca2d21e2a868f43dfc --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32691f8ae9680023326eda621d1e0ff61a8ea3fd3b7ff7db490a6fe9a979e12b +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd36f42ea8edc6113fc73419ce043f905afcdcca --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03216ddac2b6e5bfeb45e393cfb0d8d1ed74ef02565e08ac9047692251cbd10f +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df070b72d53c70abbcb4dde4781bc78662533bbd --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:100a82bd424c1aa3b094c7aa696ead6f0ac8f5428266ece83b3dd46d5d74e358 +size 78980770 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ec696f7758b870c6ce7058927517b2513b5fc58 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30ceddbc910700d760d5f91662952fae5a1f9d398d1dbe8091264af0d5383caa +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8612c9f21f5d3e5864c7b75911c8dd4687a74ced --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024a668f60493613a1e4b55f65919c87c350b4d8604246bfbcd9f00ef66c5eb5 +size 78980951 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a894bdbf89e03b33e4133e3f563adae3f3b543f --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c053b2ef58223bf81c5436a1354cb80dc498d54afe32d1f45a1df2fe6af40a8b +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e64b2eb42561b300cf99facabeb568e1ced03be --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66588d3257c7e85fb54eac9caa6d0d6b1172a7549330742b5087fade249f2ff3 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce63087bb9278d5248e3ca43372e8fbc481e3b79 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e3adeb18287d25bfc7fc8ba422a95023145d9017bf86edd7a5b625b44671ef7 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34758ec3782a04213d5c7ba9662174dfd2e99a18 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efff1fe62a680565dc888c43f93b700bc58c11c8524346b5d644cd7c8dce24ce +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6e46b42b0ecb4c46e3a325ddbf54c3f50a40a93 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afa37d42571570496f63fc01f54cf2a07056fb7e91f91b2db03191ae144c62a8 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f27dc139a0cd23ae1197d8ca2f3b5998733627b --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec3187649a8fe19b78110de5f0c26f11f6f31a47d19d798c87a7a9eba2c8491 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78726375cd7802dd9d4340161f6be6c202402682 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bebd737d3bac66af2638c26513692f906caeb2509b61d7c7021889c65b5b18 +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d1e074717e34d26a0e8a1c856e6aa84dbe16a6d --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f93c58c27ef70166f57dffd1b6d6942977954ef80d307cb3990c872a5fea85 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5a0bde8271dddc9d4a6b9f874d566bd705ed7a0 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c431465b4e9094ceccf8c42ca1179a68273acbc0378d8ae948a907cabac03091 +size 78981026 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0358984e4814c972ce7f62cf582735564f10eec7 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03b1b1033b9581c0581942c2865692e21f28447126ac8930ce13fb83017d9ca +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a5879d20174a31ca6b694a5eac59aa93fcebde9 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:347e90e508ab143eecf13ae56d79621fb21bcfb2775d4db5ba49975a50b76b31 +size 78980887 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..880ec481d02d7fd9c6fe8b48f126503c488a7370 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0c0277eb84c509599ce03597b58bd17d0efcc6624b25dd4f03ac6bb9b870d2 +size 78980962 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d63574b3526ad31433e87cdd7a519bfe79c963af --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ff222506eaed6df4e76cb39383a08eedb0bfa0d8cf2781f78db29401a314fe +size 78981090 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d1bfd462f37d85f79d52beff9059a43958bf0c3 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6656f881b0b61ca72ad20d85835a8b4b6c98754de318febffc15f10d52f653c +size 78980834 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26a6486e06f744e0373378bae81b478ad58ae9ed --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f6da510bf474b1d213e18ce3d9e82bdeebf926d52a97df085c4c393aff15d8 +size 78980898 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55abd0806a152a275640e7ec718f345ef01c6905 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef2c6b4639fa0908c2f88f696ca1f252c4ae2002c409814057b6d0592d2cf9d +size 78980823 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7fae68d2aaf9199f7bf936b371d13c1152be630 --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec963c349b8a36c823cada7020eb3a30374fa90c0c0cb07e26590a8c217fa52f +size 78980887 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..838a7a263a66d2e8813ba257da886a9103573c9f --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1002c085a36c7e9605ea179cfc792dbd6d4bc67ea6ae67ef4857e22b87552bb +size 78980951 diff --git a/421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e186cb8ed78f42acac1b3d75f83bf80ee48c5c5b --- /dev/null +++ b/421m3b9400m/global_step7508/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3b86ac6ccc6554409ea63144a48d815eeacedd030c69afb20ea80e2d779f0a +size 78980887 diff --git a/421m3b9400m/global_step7508/layer_01-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a71f0c6dbb275b060516afdcca8cfe94dbafdd4e --- /dev/null +++ b/421m3b9400m/global_step7508/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67097e0b894a29e872a6f595fb309e0b343ddbb626a1912763f7d98678440d07 +size 134022403 diff --git a/421m3b9400m/global_step7508/layer_03-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95364cce323911f2239fc20844b0afba9d3ec83f --- /dev/null +++ b/421m3b9400m/global_step7508/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edbd26091eed40228f360bbd3d41024d627b51aaefa17464e89877c545308a6d +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_04-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..096ac9109755fcc6750d6aad4d733abef1a29d9a --- /dev/null +++ b/421m3b9400m/global_step7508/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0516b8c1548278b1e9fc47e1ca45dd63385a6ab22cc4c2ec6c7b22701f208c43 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_05-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4b18f0444524ca1df1f7714402f69ffd29d3a90 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44fb0de0c59e2bd4b73a0da2bff0776135d33de33294b5ccc9d29a6fc057fce7 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_06-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bb4f9eb22137a16213a7f85dc0080ae2ea19673 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e6c32a66bfc6aa7142ea8eea71cf603e89e782fba20b13343185d47ede0c78a +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_07-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..adb76710773f9f91d559a83a44e7cb59b18d07df --- /dev/null +++ b/421m3b9400m/global_step7508/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dd6a5d7ade20af23a70daaa77dc9f52b45128e93cc908d53c732e17a0f5bab +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_08-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db7746235665a6666078ce05366002faffdd891d --- /dev/null +++ b/421m3b9400m/global_step7508/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeadda448de0372c81162809bac897858cb731fa8a968e1af979948e36028b1b +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_09-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c53f4d033fc8953c4b8b5e68d6a845008414e53 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69155b39b71d0305e0de4187d509ce0722b4f7df29548b8a1b8f732c862610a4 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_10-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2e5856c596a75146b3c208f52bdbc4ebfb6082a --- /dev/null +++ b/421m3b9400m/global_step7508/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c902d6a65c2e53ac9506b824b1777825c48a526fc8d62209faf56df4c5f5a404 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_11-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01f6ebf7198c2a49f9a355a73b7a441ee6f2ab4 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcfb79c583c288b1689efb24c11aa6229a70c6a39b4297c5a4755acfa3d686c7 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_12-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f02cb22c91ed5f60b54eddbce3f0676258a9fa1 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b02d92bcb801fed0959d8a046f3f5c5720a05db1230a63c6b57a25c152799eae +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_13-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..049b12e235342d988a071820a50c4d5689daaa23 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80589272fbc9de2c6fe4714d8c81e73f68635c88163d03ef89ccc7a191b4858d +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_14-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a90c480905d38ed9489e8c9212423f2a03914d2c --- /dev/null +++ b/421m3b9400m/global_step7508/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341e104af4e4c0e0352b6584a788f48c6c24f6508631988d083ad348f0346d10 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_15-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fccf8170fcbe0a2807c94ce0708c4de73d5ca80e --- /dev/null +++ b/421m3b9400m/global_step7508/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4369c178cdffa0a73281ce6480a7ae3f4ef1a82150f32731d13c1165cdd29743 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_16-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82b13a1afbe3f3fb9571d8ce94971006241a7222 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8caaefb0663d6178c394dde713a00d7a83fa64b0b5422fa75627caebb7b58666 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_17-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f1ec37959854a4d5433e1066968600dd57b8779 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0d12b15e36d1b2f3730588e978a3b89b29351c1b1354745fe9ddc677b291cb4 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_18-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..feb513c8c4a0d04ea4fc669b75b763b18290de59 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6b3a3561721b4b7e0785122f8eefe47d3cc1f42497404279ed4fd418a9e3690 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_19-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb668d133fa064da28353f33ff4401ab362d6e79 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:506fc206db21d8688011627a07affb01300256aa6b9e2dc816ae75c0e0c9d0d3 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_20-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d16143c06d340ca0e1eb90a8afba20e86c5d3ced --- /dev/null +++ b/421m3b9400m/global_step7508/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e916289f9248d996340a3c3859c82aa337a95db2b992e8c6913bbaba6b7e20 +size 39359235 diff --git a/421m3b9400m/global_step7508/layer_22-model_00-model_states.pt b/421m3b9400m/global_step7508/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..facc152539b8e2587095a5a8e742c33af550c473 --- /dev/null +++ b/421m3b9400m/global_step7508/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f1fcf257c28821400d4705783f1063c99770cc5476319b63e05b1cbee0a6299 +size 6339 diff --git a/421m3b9400m/global_step7508/mp_rank_00_model_states.pt b/421m3b9400m/global_step7508/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16ac305be3a1a68ff625eab38fd2330661dbeb2d --- /dev/null +++ b/421m3b9400m/global_step7508/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce0cb16c1b1928403a80fc5b08ec22f0c48560535cd41e5b899f008dac31e4c +size 37747 diff --git a/421m3b9400m/sbatch_421m3b9400m.sh b/421m3b9400m/sbatch_421m3b9400m.sh new file mode 100644 index 0000000000000000000000000000000000000000..2c94dfcafdba0e2cbbb72da6358a2e7aedceba6e --- /dev/null +++ b/421m3b9400m/sbatch_421m3b9400m.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m3b9400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 3936562000 +# -> Samples: 1_922_149 +TRAIN_SAMPLES=1_922_149 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 19_221 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m3b9400m/sbatch_421m3b9400mval.sh b/421m3b9400m/sbatch_421m3b9400mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..df4612d5cdbc0ca08fee23f4e84ed68f4d63b11d --- /dev/null +++ b/421m3b9400m/sbatch_421m3b9400mval.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m3b9400mval +VARIANT_CKPT=421m3b9400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_3B9_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 3936562000 +# -> Samples: 1_922_149 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-load-optim \ + --reset-progress \ + --override-lr-scheduler \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m3b9400m/tensorboard_421m3b9400m/events.out.tfevents.1678910356.nid006724.105089.0 b/421m3b9400m/tensorboard_421m3b9400m/events.out.tfevents.1678910356.nid006724.105089.0 new file mode 100644 index 0000000000000000000000000000000000000000..65deb45cc88f2a2ca1ea88f30faa5c6d2947038f --- /dev/null +++ b/421m3b9400m/tensorboard_421m3b9400m/events.out.tfevents.1678910356.nid006724.105089.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6948c689d185f4a684a66f69bb266b8be7a98aa341231d1975727ffa121c02ae +size 13359042 diff --git a/421m3b9400m/tensorboard_421m3b9400mval/events.out.tfevents.1678915809.nid006724.122039.0 b/421m3b9400m/tensorboard_421m3b9400mval/events.out.tfevents.1678915809.nid006724.122039.0 new file mode 100644 index 0000000000000000000000000000000000000000..d8bd78211bbd03ec48106af4239be05a93026023 --- /dev/null +++ b/421m3b9400m/tensorboard_421m3b9400mval/events.out.tfevents.1678915809.nid006724.122039.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef2f0459f0a9faf2af405ba041d4515eaa57dd5d03e33808838bfde64e5e3888 +size 980 diff --git a/421m60b400m/3319845.err b/421m60b400m/3319845.err new file mode 100644 index 0000000000000000000000000000000000000000..2136c7ddc500f2aa5147d17e8ed4a8c02df5fe16 --- /dev/null +++ b/421m60b400m/3319845.err @@ -0,0 +1,1104 @@ +6: 2023-03-16 12:46:31.710743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710749: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710866: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710848: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710860: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710868: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710878: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 12:46:31.710881: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711222: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711237: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711243: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711346: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711357: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711362: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 12:46:31.711703: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711557: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711569: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711552: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: 2023-03-16 12:46:31.711612: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711624: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711629: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-16 12:46:31.711701: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711718: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711699: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-16 12:46:31.711426: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711455: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711434: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711949: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 12:46:31.711957: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711741: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711619: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711627: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-16 12:46:31.711760: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711839: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711842: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 12:46:31.711612: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 12:46:31.711869: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711635: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.711986: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.711998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.711989: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: 2023-03-16 12:46:31.711684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711696: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:31.711694: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.712044: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.712056: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.712061: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.712071: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 12:46:31.712069: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712533: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712539: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712556: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712573: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712597: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712597: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 12:46:31.712661: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 12:46:46.236051: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236354: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:46:46.236083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236379: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236103: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.236406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236409: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236428: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-16 12:46:46.236433: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.236430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236433: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 12:46:46.236112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.236438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.236404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.236459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-16 12:46:46.236396: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.236465: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:46:46.236994: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236807: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 12:46:46.237046: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.236472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-16 12:46:46.236463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-16 12:46:46.237064: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 12:46:46.237075: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236815: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 12:46:46.237092: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237010: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.236538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-16 12:46:46.236484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-16 12:46:46.237100: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237023: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237047: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237053: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:46:46.237104: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 12:46:46.237108: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237058: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.236467: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-16 12:46:46.236489: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-16 12:46:46.237123: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237066: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 12:46:46.237070: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:46:46.236506: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:46:46.237102: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 12:46:46.236830: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.237123: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.237130: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 12:46:46.236477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.237144: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.237163: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:46:46.237167: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 12:46:46.237165: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236494: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 12:46:46.237174: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236847: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.236568: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-16 12:46:46.236501: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:46:46.236853: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 12:46:46.236859: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 12:46:46.236862: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 12:46:46.236871: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236676: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-16 12:46:46.236548: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236525: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 12:46:46.236598: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236624: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236530: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236580: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236709: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236546: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236614: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236640: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236557: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236557: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236627: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236733: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236582: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236650: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.236651: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236739: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.236566: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:46:46.236635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:46:46.237373: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.236745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-16 12:46:46.237296: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237393: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237392: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.236660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:46:46.237324: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237406: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237416: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 12:46:46.237434: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:46:46.237517: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237329: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237461: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237339: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237479: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237491: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237506: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237511: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237513: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237351: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237518: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237537: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237362: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 12:46:46.237527: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237378: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 12:46:46.237381: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237546: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237555: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237571: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237569: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237584: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 12:46:46.237598: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 12:47:16.183674: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.183691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.183701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 12:47:16.183848: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.183736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.183741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183876: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 12:47:16.183751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183897: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 12:47:16.183762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.184032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183905: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 12:47:16.183915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184048: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183920: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183917: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.183936: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 12:47:16.184009: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 12:47:16.184138: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184127: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 12:47:16.184155: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184134: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 12:47:16.184093: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184142: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184266: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184145: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184295: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 12:47:16.184154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 12:47:16.184324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.184216: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.184149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184348: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-16 12:47:16.184168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.184352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.195943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-16 12:47:16.195987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.195944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.195947: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.196029: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-16 12:47:16.195994: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.195950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.195997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.195951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.195949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 12:47:16.196153: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 12:47:16.196088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.195959: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.195998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.195962: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 12:47:16.195963: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 12:47:16.195968: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 12:47:16.196089: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196009: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 12:47:16.195969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 12:47:16.195969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.196013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196092: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196102: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 12:47:16.196005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196033: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.196015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196016: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196104: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 12:47:16.196104: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 12:47:16.196008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 12:47:16.196030: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 12:47:16.196030: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196109: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 12:47:16.196113: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196020: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 12:47:16.196020: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196160: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 12:47:16.196138: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-16 12:47:16.196023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 12:47:16.196022: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 12:47:16.196026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 12:47:16.196027: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 12:47:16.196144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196035: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 12:47:16.196152: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.196048: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196049: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 12:47:16.196146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 12:47:16.196049: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196052: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196053: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 12:47:16.196053: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196056: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 12:47:16.196056: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196163: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 12:47:16.196159: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 12:47:16.196163: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196175: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 12:47:16.196178: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196183: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196187: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196187: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 12:47:16.196188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.212741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212796: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212838: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212849: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.212867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213098: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213134: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213219: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213226: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.213235: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215407: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215411: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215411: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215412: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215418: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215419: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215423: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215426: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215428: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215431: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215432: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 12:47:16.215574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 12:47:16.215587: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215709: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215719: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195778: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195776: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195789: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195780: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195783: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 12:47:16.195795: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195796: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195799: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195799: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195801: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195804: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 12:47:16.195805: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 12:47:16.215724: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215730: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215732: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215733: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215734: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215736: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215737: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 12:47:16.215737: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +1: +1: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: +7: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +0: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +1: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +0: Loading extension module utils... +4: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +6: Loading extension module utils... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils...Loading extension module utils... +3: +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils...Loading extension module utils... +3: +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +4: +4: +4: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: +6: +6: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +6: +6: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/421m60b400m/3319845.out b/421m60b400m/3319845.out new file mode 100644 index 0000000000000000000000000000000000000000..9bccc2c86aa1d93731e5726befbc8679e75fd4bd --- /dev/null +++ b/421m60b400m/3319845.out @@ -0,0 +1,16155 @@ +Model parameters: d_model 1280 ffw_size 5120 kv_size 128 n_heads 10 n_layers 18 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 18 --hidden-size 1280 --num-attention-heads 10 --kv-channels 128 --ffn-hidden-size 5120 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 29_492_188 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-421m60b400m --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 29_492_188 --lr-warmup-samples 294_922 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 10000 --eval-interval 10000 --eval-iters 1 --tensorboard-dir tensorboard_421m60b400m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_421m60b400m --load checkpoints_421m60b400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319845.json --zero-stage 0 +START 3319845: Thu 16 Mar 2023 12:44:52 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 50.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 38.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 37.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 39.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 46.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 48.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 45.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 36.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 49.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 36.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 48.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +4: Launching on nid006549 (4/8), master nid006545 port 9999, GPUs 8, CUDA: True +5: Launching on nid006550 (5/8), master nid006545 port 9999, GPUs 8, CUDA: True +6: Launching on nid006551 (6/8), master nid006545 port 9999, GPUs 8, CUDA: True +3: Launching on nid006548 (3/8), master nid006545 port 9999, GPUs 8, CUDA: True +2: Launching on nid006547 (2/8), master nid006545 port 9999, GPUs 8, CUDA: True +1: Launching on nid006546 (1/8), master nid006545 port 9999, GPUs 8, CUDA: True +7: Launching on nid006552 (7/8), master nid006545 port 9999, GPUs 8, CUDA: True +0: Launching on nid006545 (0/8), master nid006545 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3319845.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 10000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 5120 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1280 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-421m60b400m +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_421m60b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 29492188 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 294922 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 18 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_421m60b400m +0: save_interval ................................... 10000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_421m60b400m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 29492188 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 12:48:35,847] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.112 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 102 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 27.873 seconds +0: time to initialize megatron (seconds): 34.662 +0: [after megatron is initialized] datetime: 2023-03-16 12:49:06 +0: building GPT model ... +0: [2023-03-16 12:49:06,807] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 12:49:06,807] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 12:49:06,808] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.69 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 12:49:08,825] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=25 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: undo +0: 22: MixedFusedLayerNorm +0: 23: EmbeddingPipe +0: 24: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 12:49:09,231] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 12:49:09,231] [INFO] [utils.py:828:see_memory_usage] MA 0.79 GB Max_MA 0.79 GB CA 0.86 GB Max_CA 1 GB +0: [2023-03-16 12:49:09,232] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.71 GB, percent = 6.1% +0: setting training iterations to 115203 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 12:49:09,233] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 12:49:22,587] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 12:49:22,587] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 12:49:22,587] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 12:49:22,594] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 12:49:22,594] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 12:49:22,719] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 12:49:22,719] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.79 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-16 12:49:22,719] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.4 GB, percent = 6.2% +0: Time to load utils op: 0.26334071159362793 seconds +0: Time to load utils op: 0.2616121768951416 seconds +0: Time to load utils op: 0.25490546226501465 seconds +0: Time to load utils op: 0.25203728675842285 seconds +0: Time to load utils op: 0.2518460750579834 secondsTime to load utils op: 0.2511279582977295 secondsTime to load utils op: 0.26016664505004883 seconds +0: +0: +3: Time to load utils op: 0.26169896125793457 seconds +3: Time to load utils op: 0.2565925121307373 seconds +3: Time to load utils op: 0.25629091262817383 seconds +3: Time to load utils op: 0.2564735412597656 seconds +3: Time to load utils op: 0.2560238838195801 secondsTime to load utils op: 0.2617650032043457 seconds +3: +3: Time to load utils op: 0.2617771625518799 seconds +3: Time to load utils op: 0.25587940216064453 seconds +6: Time to load utils op: 0.2627537250518799 seconds +6: Time to load utils op: 0.2632741928100586 seconds +6: Time to load utils op: 0.2631111145019531 seconds +6: Time to load utils op: 0.2632791996002197 secondsTime to load utils op: 0.26327943801879883 seconds +6: +6: Time to load utils op: 0.26279354095458984 seconds +6: Time to load utils op: 0.2632880210876465 seconds +7: Time to load utils op: 0.2571239471435547 seconds +7: Time to load utils op: 0.25713253021240234 seconds +7: Time to load utils op: 0.25716352462768555 seconds +7: Time to load utils op: 0.2571725845336914 seconds +7: Time to load utils op: 0.2571837902069092 secondsTime to load utils op: 0.25717830657958984 seconds +7: Time to load utils op: 0.2571752071380615 seconds +7: +7: Time to load utils op: 0.25719666481018066 seconds +5: Time to load utils op: 0.2616727352142334 secondsTime to load utils op: 0.26166343688964844 seconds +5: +5: Time to load utils op: 0.2617170810699463 seconds +5: Time to load utils op: 0.2617220878601074 seconds +5: Time to load utils op: 0.26175975799560547 seconds +5: Time to load utils op: 0.2617957592010498 seconds +5: Time to load utils op: 0.26179075241088867 seconds +5: Time to load utils op: 0.2617959976196289 seconds +1: Time to load utils op: 0.26670145988464355 seconds +1: Time to load utils op: 0.26674699783325195 seconds +1: Time to load utils op: 0.26676273345947266 seconds +1: Time to load utils op: 0.2667872905731201 secondsTime to load utils op: 0.266782283782959 seconds +1: +1: Time to load utils op: 0.2667810916900635 secondsTime to load utils op: 0.2667875289916992 seconds +1: +1: Time to load utils op: 0.26679205894470215 seconds +4: Time to load utils op: 0.26447200775146484 seconds +4: Time to load utils op: 0.26183485984802246 seconds +4: Time to load utils op: 0.26231837272644043 seconds +4: Time to load utils op: 0.26251769065856934 seconds +4: Time to load utils op: 0.2625579833984375 secondsTime to load utils op: 0.26454997062683105 seconds +4: +4: Time to load utils op: 0.26189613342285156 seconds +4: Time to load utils op: 0.26281261444091797 seconds +2: Time to load utils op: 0.27103638648986816 seconds +2: Time to load utils op: 0.27207064628601074 seconds +2: Time to load utils op: 0.2720377445220947 seconds +2: Time to load utils op: 0.2720320224761963 secondsTime to load utils op: 0.27184557914733887 seconds +2: Time to load utils op: 0.2719871997833252 seconds +2: +2: Time to load utils op: 0.2716825008392334 seconds +2: Time to load utils op: 0.30446362495422363 seconds +6: Time to load utils op: 0.30405592918395996 seconds +0: Time to load utils op: 0.2020866870880127 seconds +0: [2023-03-16 12:49:23,035] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 12:49:23,036] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.78 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-16 12:49:23,036] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.4 GB, percent = 6.2% +0: Time to load utils op: 0.0005750656127929688 seconds +0: Time to load utils op: 0.000453948974609375 seconds +0: Time to load utils op: 0.0004124641418457031 seconds +0: Time to load utils op: 0.0004100799560546875 seconds +0: Time to load utils op: 0.0004513263702392578 seconds +0: Time to load utils op: 0.0004439353942871094 seconds +0: Time to load utils op: 0.0006306171417236328 seconds +3: Time to load utils op: 0.0005228519439697266 seconds +3: Time to load utils op: 0.00041985511779785156 secondsTime to load utils op: 0.0005710124969482422 secondsTime to load utils op: 0.00038123130798339844 seconds +3: +3: Time to load utils op: 0.0003886222839355469 seconds +3: +4: Time to load utils op: 0.0005476474761962891 seconds +4: Time to load utils op: 0.0004444122314453125 secondsTime to load utils op: 0.00043582916259765625 seconds +4: +4: Time to load utils op: 0.0005657672882080078 seconds +4: Time to load utils op: 0.0005211830139160156 seconds +4: Time to load utils op: 0.00055694580078125 seconds +4: Time to load utils op: 0.0005521774291992188 seconds +4: Time to load utils op: 0.00040459632873535156 seconds +3: Time to load utils op: 0.0005342960357666016 seconds +3: Time to load utils op: 0.0005412101745605469 seconds +3: Time to load utils op: 0.0004143714904785156 seconds +6: Time to load utils op: 0.0004665851593017578 seconds +6: Time to load utils op: 0.0005006790161132812 seconds +6: Time to load utils op: 0.0004987716674804688 seconds +6: Time to load utils op: 0.00046181678771972656 secondsTime to load utils op: 0.00046706199645996094 secondsTime to load utils op: 0.00045943260192871094 seconds +6: +6: Time to load utils op: 0.0004558563232421875 seconds +6: +6: Time to load utils op: 0.00044608116149902344 seconds +2: Time to load utils op: 0.0005092620849609375 seconds +2: Time to load utils op: 0.0005168914794921875 seconds +2: Time to load utils op: 0.0005290508270263672 seconds +2: Time to load utils op: 0.0005600452423095703 seconds +2: Time to load utils op: 0.0005624294281005859 seconds +2: Time to load utils op: 0.0006802082061767578 seconds +2: Time to load utils op: 0.000705718994140625 seconds +2: Time to load utils op: 0.0007076263427734375 seconds +5: Time to load utils op: 0.0008096694946289062 seconds +5: Time to load utils op: 0.0008602142333984375 seconds +5: Time to load utils op: 0.0009257793426513672 seconds +5: Time to load utils op: 0.0009522438049316406 seconds +5: Time to load utils op: 0.0010039806365966797 seconds +5: Time to load utils op: 0.0009691715240478516 seconds +5: Time to load utils op: 0.0009381771087646484 seconds +5: Time to load utils op: 0.0010476112365722656 seconds +1: Time to load utils op: 0.0008451938629150391 seconds +1: Time to load utils op: 0.0012323856353759766 seconds +1: Time to load utils op: 0.00136566162109375 seconds +1: Time to load utils op: 0.001331329345703125 seconds +1: Time to load utils op: 0.0013289451599121094 seconds +1: Time to load utils op: 0.0013713836669921875 secondsTime to load utils op: 0.0013191699981689453 seconds +1: +1: Time to load utils op: 0.0014202594757080078 seconds +7: Time to load utils op: 0.0008683204650878906 seconds +7: Time to load utils op: 0.000985860824584961 seconds +7: Time to load utils op: 0.0010755062103271484 seconds +7: Time to load utils op: 0.001249551773071289 seconds +7: Time to load utils op: 0.0011904239654541016 seconds +7: Time to load utils op: 0.0011632442474365234 seconds +7: Time to load utils op: 0.001196146011352539 seconds +7: Time to load utils op: 0.00037026405334472656 seconds +0: [2023-03-16 12:49:23,170] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 12:49:23,171] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-16 12:49:23,171] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,279] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 12:49:23,280] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-16 12:49:23,280] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,386] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 12:49:23,387] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,387] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,492] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 12:49:23,492] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,493] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,599] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 12:49:23,600] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,600] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,703] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 12:49:23,703] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,703] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,813] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 12:49:23,813] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,813] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,918] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 12:49:23,918] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-16 12:49:23,918] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-16 12:49:23,918] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 12:49:23,919] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 12:49:23,919] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 12:49:23,919] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 12:49:23,919] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 12:49:23,919] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 12:49:23,919] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 12:49:23,920] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 12:49:23,921] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 12:49:23,921] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00040793418884277344 seconds +0: [2023-03-16 12:49:23,922] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 12:49:23,973] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=25 [0, 25) STAGE_PARAMS=421207040 (421.207M) TOTAL_PARAMS=421207040 (421.207M) UNIQUE_PARAMS=421207040 (421.207M) +7: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_421m60b400m +0: will not load any checkpoints and will start from random +2: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,978] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,979] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 12:49:23,980] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_421m60b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 7.79 +0: estimated model parameters: 0.42120704 +0: estimated model parameters without embeddings: 0.35419648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 12:49:24 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 29492188 +0: validation: 3072 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.009363 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_29492188ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_29492188ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_29492188ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.072 seconds +0: total number of samples: 29655283 +0: total number of epochs: 152 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.044545 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_3072ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_3072ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_3072ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.070 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 12:49:38 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 17807.34 | train/valid/test-data-iterators-setup: 12859.07 +0: [000-000] 0.4212B / 0.3542B +0: [before the start of training step] datetime: 2023-03-16 12:49:38 +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 4873.00537109375 | max allocated: 27817.7919921875 | reserved: 29816.0 | max reserved: 29816.0 +7: iteration 10/ 115203 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.99 | learning rate: 1.736E-06 | global batch size: 256 | lm loss: 1.091453E+01 | grad norm: 18.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 128.538 | TFLOPs: 12.25 | +7: iteration 20/ 115203 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.59 | learning rate: 3.472E-06 | global batch size: 256 | lm loss: 9.793789E+00 | grad norm: 3.985 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.595 | TFLOPs: 41.15 | +7: iteration 30/ 115203 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.58 | learning rate: 5.208E-06 | global batch size: 256 | lm loss: 9.261969E+00 | grad norm: 1.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.865 | TFLOPs: 41.84 | +7: iteration 40/ 115203 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.58 | learning rate: 6.944E-06 | global batch size: 256 | lm loss: 9.055422E+00 | grad norm: 1.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.520 | TFLOPs: 41.81 | +7: iteration 50/ 115203 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.58 | learning rate: 8.680E-06 | global batch size: 256 | lm loss: 8.825452E+00 | grad norm: 1.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.938 | TFLOPs: 42.04 | +7: iteration 60/ 115203 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.60 | learning rate: 1.042E-05 | global batch size: 256 | lm loss: 8.603447E+00 | grad norm: 1.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.532 | TFLOPs: 40.86 | +7: iteration 70/ 115203 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.59 | learning rate: 1.215E-05 | global batch size: 256 | lm loss: 8.360941E+00 | grad norm: 1.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.860 | TFLOPs: 41.65 | +7: iteration 80/ 115203 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.61 | learning rate: 1.389E-05 | global batch size: 256 | lm loss: 8.135986E+00 | grad norm: 1.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.875 | TFLOPs: 40.22 | +7: iteration 90/ 115203 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.60 | learning rate: 1.562E-05 | global batch size: 256 | lm loss: 7.924311E+00 | grad norm: 1.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.341 | TFLOPs: 40.93 | +7: iteration 100/ 115203 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.59 | learning rate: 1.736E-05 | global batch size: 256 | lm loss: 7.728225E+00 | grad norm: 1.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.231 | TFLOPs: 41.30 | +7: iteration 110/ 115203 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-05 | global batch size: 256 | lm loss: 7.557211E+00 | grad norm: 0.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.890 | TFLOPs: 42.51 | +7: iteration 120/ 115203 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.62 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 7.384554E+00 | grad norm: 1.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 414.583 | TFLOPs: 39.53 | +7: iteration 130/ 115203 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.59 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 7.288203E+00 | grad norm: 2.050 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.085 | TFLOPs: 41.10 | +7: iteration 140/ 115203 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.61 | learning rate: 2.430E-05 | global batch size: 256 | lm loss: 7.184540E+00 | grad norm: 1.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 420.005 | TFLOPs: 40.04 | +7: iteration 150/ 115203 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.59 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 7.094702E+00 | grad norm: 0.992 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.521 | TFLOPs: 41.52 | +7: iteration 160/ 115203 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.59 | learning rate: 2.778E-05 | global batch size: 256 | lm loss: 7.023695E+00 | grad norm: 0.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.468 | TFLOPs: 41.71 | +7: iteration 170/ 115203 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.58 | learning rate: 2.951E-05 | global batch size: 256 | lm loss: 6.943324E+00 | grad norm: 1.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.326 | TFLOPs: 41.79 | +7: iteration 180/ 115203 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.60 | learning rate: 3.125E-05 | global batch size: 256 | lm loss: 6.876694E+00 | grad norm: 0.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.780 | TFLOPs: 40.40 | +7: iteration 190/ 115203 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.57 | learning rate: 3.298E-05 | global batch size: 256 | lm loss: 6.837374E+00 | grad norm: 0.929 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.272 | TFLOPs: 42.45 | +7: iteration 200/ 115203 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.59 | learning rate: 3.472E-05 | global batch size: 256 | lm loss: 6.774904E+00 | grad norm: 1.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.089 | TFLOPs: 41.10 | +7: iteration 210/ 115203 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.57 | learning rate: 3.646E-05 | global batch size: 256 | lm loss: 6.728931E+00 | grad norm: 1.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.971 | TFLOPs: 42.90 | +7: iteration 220/ 115203 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.57 | learning rate: 3.819E-05 | global batch size: 256 | lm loss: 6.661845E+00 | grad norm: 1.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.224 | TFLOPs: 43.02 | +7: iteration 230/ 115203 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.59 | learning rate: 3.993E-05 | global batch size: 256 | lm loss: 6.619039E+00 | grad norm: 0.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.641 | TFLOPs: 41.53 | +7: iteration 240/ 115203 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.58 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 6.589149E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.145 | TFLOPs: 41.87 | +7: iteration 250/ 115203 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.59 | learning rate: 4.340E-05 | global batch size: 256 | lm loss: 6.552911E+00 | grad norm: 1.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.668 | TFLOPs: 41.25 | +7: iteration 260/ 115203 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.60 | learning rate: 4.514E-05 | global batch size: 256 | lm loss: 6.504588E+00 | grad norm: 1.038 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.597 | TFLOPs: 40.67 | +7: iteration 270/ 115203 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.59 | learning rate: 4.687E-05 | global batch size: 256 | lm loss: 6.491062E+00 | grad norm: 1.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.757 | TFLOPs: 41.45 | +7: iteration 280/ 115203 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.59 | learning rate: 4.861E-05 | global batch size: 256 | lm loss: 6.475332E+00 | grad norm: 1.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.887 | TFLOPs: 41.08 | +7: iteration 290/ 115203 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.58 | learning rate: 5.035E-05 | global batch size: 256 | lm loss: 6.443741E+00 | grad norm: 1.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.039 | TFLOPs: 42.14 | +7: iteration 300/ 115203 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.58 | learning rate: 5.208E-05 | global batch size: 256 | lm loss: 6.417327E+00 | grad norm: 0.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.924 | TFLOPs: 42.42 | +7: iteration 310/ 115203 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.58 | learning rate: 5.382E-05 | global batch size: 256 | lm loss: 6.376525E+00 | grad norm: 1.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.442 | TFLOPs: 42.18 | +7: iteration 320/ 115203 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.58 | learning rate: 5.555E-05 | global batch size: 256 | lm loss: 6.362547E+00 | grad norm: 1.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.713 | TFLOPs: 41.83 | +7: iteration 330/ 115203 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.59 | learning rate: 5.729E-05 | global batch size: 256 | lm loss: 6.344047E+00 | grad norm: 0.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.416 | TFLOPs: 41.70 | +7: iteration 340/ 115203 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.58 | learning rate: 5.903E-05 | global batch size: 256 | lm loss: 6.320394E+00 | grad norm: 0.759 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.165 | TFLOPs: 41.77 | +7: iteration 350/ 115203 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.59 | learning rate: 6.076E-05 | global batch size: 256 | lm loss: 6.295150E+00 | grad norm: 1.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.134 | TFLOPs: 41.58 | +7: iteration 360/ 115203 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.57 | learning rate: 6.250E-05 | global batch size: 256 | lm loss: 6.269004E+00 | grad norm: 0.901 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.823 | TFLOPs: 42.70 | +7: iteration 370/ 115203 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.58 | learning rate: 6.423E-05 | global batch size: 256 | lm loss: 6.242486E+00 | grad norm: 0.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.677 | TFLOPs: 42.01 | +7: iteration 380/ 115203 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.58 | learning rate: 6.597E-05 | global batch size: 256 | lm loss: 6.219765E+00 | grad norm: 1.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.928 | TFLOPs: 41.75 | +7: iteration 390/ 115203 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.57 | learning rate: 6.771E-05 | global batch size: 256 | lm loss: 6.198944E+00 | grad norm: 1.032 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.581 | TFLOPs: 42.58 | +7: iteration 400/ 115203 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.59 | learning rate: 6.944E-05 | global batch size: 256 | lm loss: 6.184899E+00 | grad norm: 0.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.262 | TFLOPs: 41.59 | +7: iteration 410/ 115203 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.58 | learning rate: 7.118E-05 | global batch size: 256 | lm loss: 6.178782E+00 | grad norm: 2.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.313 | TFLOPs: 41.88 | +7: iteration 420/ 115203 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.60 | learning rate: 7.291E-05 | global batch size: 256 | lm loss: 6.161135E+00 | grad norm: 1.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.707 | TFLOPs: 40.68 | +7: iteration 430/ 115203 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.59 | learning rate: 7.465E-05 | global batch size: 256 | lm loss: 6.130381E+00 | grad norm: 1.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.199 | TFLOPs: 41.59 | +7: iteration 440/ 115203 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.57 | learning rate: 7.639E-05 | global batch size: 256 | lm loss: 6.107217E+00 | grad norm: 0.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.918 | TFLOPs: 42.89 | +7: iteration 450/ 115203 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.59 | learning rate: 7.812E-05 | global batch size: 256 | lm loss: 6.088022E+00 | grad norm: 0.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.267 | TFLOPs: 41.59 | +7: iteration 460/ 115203 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.59 | learning rate: 7.986E-05 | global batch size: 256 | lm loss: 6.069090E+00 | grad norm: 1.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.169 | TFLOPs: 41.49 | +7: iteration 470/ 115203 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.58 | learning rate: 8.159E-05 | global batch size: 256 | lm loss: 6.038139E+00 | grad norm: 1.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.773 | TFLOPs: 41.74 | +7: iteration 480/ 115203 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.59 | learning rate: 8.333E-05 | global batch size: 256 | lm loss: 6.011141E+00 | grad norm: 1.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.440 | TFLOPs: 41.51 | +7: iteration 490/ 115203 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.61 | learning rate: 8.507E-05 | global batch size: 256 | lm loss: 6.001598E+00 | grad norm: 1.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.102 | TFLOPs: 40.15 | +7: iteration 500/ 115203 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.62 | learning rate: 8.680E-05 | global batch size: 256 | lm loss: 5.974238E+00 | grad norm: 0.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 409.890 | TFLOPs: 39.08 | +7: iteration 510/ 115203 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.58 | learning rate: 8.854E-05 | global batch size: 256 | lm loss: 5.954333E+00 | grad norm: 1.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.833 | TFLOPs: 41.93 | +7: iteration 520/ 115203 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.59 | learning rate: 9.027E-05 | global batch size: 256 | lm loss: 5.932952E+00 | grad norm: 1.101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.230 | TFLOPs: 41.59 | +7: iteration 530/ 115203 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.59 | learning rate: 9.201E-05 | global batch size: 256 | lm loss: 5.896020E+00 | grad norm: 1.015 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.815 | TFLOPs: 41.36 | +7: iteration 540/ 115203 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.59 | learning rate: 9.375E-05 | global batch size: 256 | lm loss: 5.901015E+00 | grad norm: 1.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.895 | TFLOPs: 41.37 | +7: iteration 550/ 115203 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.57 | learning rate: 9.548E-05 | global batch size: 256 | lm loss: 5.869278E+00 | grad norm: 1.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.763 | TFLOPs: 42.98 | +7: iteration 560/ 115203 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.59 | learning rate: 9.722E-05 | global batch size: 256 | lm loss: 5.857905E+00 | grad norm: 0.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.842 | TFLOPs: 41.36 | +7: iteration 570/ 115203 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.58 | learning rate: 9.895E-05 | global batch size: 256 | lm loss: 5.828834E+00 | grad norm: 1.110 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.461 | TFLOPs: 42.18 | +7: iteration 580/ 115203 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.59 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 5.800731E+00 | grad norm: 1.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.449 | TFLOPs: 41.52 | +7: iteration 590/ 115203 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.58 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 5.779882E+00 | grad norm: 0.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.214 | TFLOPs: 42.06 | +7: iteration 600/ 115203 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.58 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 5.746590E+00 | grad norm: 1.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.229 | TFLOPs: 42.16 | +7: iteration 610/ 115203 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.60 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 5.743531E+00 | grad norm: 1.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.381 | TFLOPs: 40.75 | +7: iteration 620/ 115203 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.59 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 5.711839E+00 | grad norm: 1.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.323 | TFLOPs: 41.60 | +7: iteration 630/ 115203 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.57 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 5.683174E+00 | grad norm: 1.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.534 | TFLOPs: 42.67 | +7: iteration 640/ 115203 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.58 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 5.667537E+00 | grad norm: 1.106 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.626 | TFLOPs: 41.91 | +7: iteration 650/ 115203 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.58 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 5.654660E+00 | grad norm: 1.075 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.804 | TFLOPs: 41.93 | +7: iteration 660/ 115203 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.58 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 5.627711E+00 | grad norm: 1.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.795 | TFLOPs: 41.83 | +7: iteration 670/ 115203 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.59 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 5.594418E+00 | grad norm: 1.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.892 | TFLOPs: 41.56 | +7: iteration 680/ 115203 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.59 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 5.563987E+00 | grad norm: 1.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.309 | TFLOPs: 41.03 | +7: iteration 690/ 115203 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.59 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 5.552671E+00 | grad norm: 1.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.112 | TFLOPs: 41.58 | +7: iteration 700/ 115203 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.57 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 5.524026E+00 | grad norm: 1.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.303 | TFLOPs: 42.74 | +7: iteration 710/ 115203 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.58 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 5.499785E+00 | grad norm: 1.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.698 | TFLOPs: 41.92 | +7: iteration 720/ 115203 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.58 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 5.474585E+00 | grad norm: 1.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.757 | TFLOPs: 42.31 | +7: iteration 730/ 115203 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.58 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 5.446213E+00 | grad norm: 0.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.627 | TFLOPs: 42.39 | +7: iteration 740/ 115203 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.58 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 5.425023E+00 | grad norm: 0.887 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.577 | TFLOPs: 41.81 | +7: iteration 750/ 115203 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.59 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 5.394869E+00 | grad norm: 1.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.034 | TFLOPs: 41.48 | +7: iteration 760/ 115203 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.57 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 5.379728E+00 | grad norm: 0.910 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.077 | TFLOPs: 42.62 | +7: iteration 770/ 115203 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.58 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 5.356950E+00 | grad norm: 0.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.688 | TFLOPs: 42.11 | +7: iteration 780/ 115203 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.58 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 5.322932E+00 | grad norm: 0.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.548 | TFLOPs: 42.19 | +7: iteration 790/ 115203 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.58 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 5.295696E+00 | grad norm: 0.963 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.169 | TFLOPs: 42.25 | +7: iteration 800/ 115203 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.58 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 5.283319E+00 | grad norm: 0.838 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.464 | TFLOPs: 42.18 | +7: iteration 810/ 115203 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.58 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 5.265985E+00 | grad norm: 0.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.422 | TFLOPs: 41.99 | +7: iteration 820/ 115203 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.59 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 5.235396E+00 | grad norm: 1.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.504 | TFLOPs: 41.04 | +7: iteration 830/ 115203 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.58 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 5.202178E+00 | grad norm: 1.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.429 | TFLOPs: 42.37 | +7: iteration 840/ 115203 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.58 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 5.181488E+00 | grad norm: 0.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.701 | TFLOPs: 42.02 | +7: iteration 850/ 115203 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.59 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 5.153091E+00 | grad norm: 1.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.648 | TFLOPs: 41.63 | +7: iteration 860/ 115203 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.57 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 5.128491E+00 | grad norm: 1.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.907 | TFLOPs: 42.89 | +7: iteration 870/ 115203 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.59 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 5.104889E+00 | grad norm: 0.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.723 | TFLOPs: 41.35 | +7: iteration 880/ 115203 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.59 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 5.080643E+00 | grad norm: 1.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.789 | TFLOPs: 41.64 | +7: iteration 890/ 115203 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.58 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 5.057726E+00 | grad norm: 1.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.166 | TFLOPs: 42.44 | +7: iteration 900/ 115203 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.58 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 5.029507E+00 | grad norm: 1.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.529 | TFLOPs: 42.19 | +7: iteration 910/ 115203 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.57 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 4.995224E+00 | grad norm: 0.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.554 | TFLOPs: 42.67 | +7: iteration 920/ 115203 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.58 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 4.977769E+00 | grad norm: 0.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.460 | TFLOPs: 42.37 | +7: iteration 930/ 115203 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.58 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 4.944556E+00 | grad norm: 1.012 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.148 | TFLOPs: 41.77 | +7: iteration 940/ 115203 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.59 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 4.925821E+00 | grad norm: 1.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.545 | TFLOPs: 41.24 | +7: iteration 950/ 115203 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.58 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 4.890816E+00 | grad norm: 0.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.275 | TFLOPs: 42.07 | +7: iteration 960/ 115203 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.57 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 4.864664E+00 | grad norm: 0.991 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.362 | TFLOPs: 42.84 | +7: iteration 970/ 115203 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.58 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 4.817918E+00 | grad norm: 0.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.802 | TFLOPs: 41.84 | +7: iteration 980/ 115203 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.57 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 4.805898E+00 | grad norm: 0.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.229 | TFLOPs: 42.92 | +7: iteration 990/ 115203 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.57 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 4.793547E+00 | grad norm: 0.997 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.078 | TFLOPs: 42.53 | +7: iteration 1000/ 115203 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.58 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 4.755049E+00 | grad norm: 1.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.920 | TFLOPs: 42.13 | +7: iteration 1010/ 115203 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 4.721589E+00 | grad norm: 0.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.253 | TFLOPs: 43.21 | +7: iteration 1020/ 115203 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.58 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 4.703882E+00 | grad norm: 1.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.373 | TFLOPs: 42.08 | +7: iteration 1030/ 115203 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.58 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 4.682614E+00 | grad norm: 0.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.751 | TFLOPs: 42.31 | +7: iteration 1040/ 115203 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.59 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 4.654118E+00 | grad norm: 0.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.908 | TFLOPs: 41.37 | +7: iteration 1050/ 115203 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.58 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 4.665781E+00 | grad norm: 0.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.671 | TFLOPs: 42.30 | +7: iteration 1060/ 115203 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.58 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 4.608846E+00 | grad norm: 0.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.876 | TFLOPs: 42.03 | +7: iteration 1070/ 115203 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.57 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 4.597751E+00 | grad norm: 0.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.065 | TFLOPs: 42.81 | +7: iteration 1080/ 115203 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.57 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 4.568410E+00 | grad norm: 0.827 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.089 | TFLOPs: 42.91 | +7: iteration 1090/ 115203 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.57 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.553720E+00 | grad norm: 0.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.664 | TFLOPs: 42.49 | +7: iteration 1100/ 115203 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.544164E+00 | grad norm: 1.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.100 | TFLOPs: 42.53 | +7: iteration 1110/ 115203 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.58 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.553835E+00 | grad norm: 0.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.483 | TFLOPs: 42.28 | +7: iteration 1120/ 115203 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.60 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.514715E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.112 | TFLOPs: 40.43 | +7: iteration 1130/ 115203 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.494342E+00 | grad norm: 0.675 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.720 | TFLOPs: 43.45 | +7: iteration 1140/ 115203 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.58 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.484286E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.933 | TFLOPs: 42.13 | +7: iteration 1150/ 115203 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.466142E+00 | grad norm: 0.803 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.384 | TFLOPs: 43.03 | +7: iteration 1160/ 115203 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.446978E+00 | grad norm: 0.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.034 | TFLOPs: 42.43 | +7: iteration 1170/ 115203 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.440932E+00 | grad norm: 0.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.915 | TFLOPs: 42.51 | +7: iteration 1180/ 115203 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.425223E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.438 | TFLOPs: 41.99 | +7: iteration 1190/ 115203 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.411828E+00 | grad norm: 0.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.528 | TFLOPs: 42.29 | +7: iteration 1200/ 115203 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.401925E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.089 | TFLOPs: 42.24 | +7: iteration 1210/ 115203 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.371182E+00 | grad norm: 0.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.790 | TFLOPs: 42.69 | +7: iteration 1220/ 115203 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.381845E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.843 | TFLOPs: 43.36 | +7: iteration 1230/ 115203 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.357425E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.430 | TFLOPs: 43.04 | +7: iteration 1240/ 115203 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.355668E+00 | grad norm: 0.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.900 | TFLOPs: 42.99 | +7: iteration 1250/ 115203 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.343254E+00 | grad norm: 0.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.518 | TFLOPs: 40.95 | +7: iteration 1260/ 115203 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.317823E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.916 | TFLOPs: 43.09 | +7: iteration 1270/ 115203 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.317448E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.834 | TFLOPs: 42.51 | +7: iteration 1280/ 115203 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.295735E+00 | grad norm: 0.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.603 | TFLOPs: 43.44 | +7: iteration 1290/ 115203 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.301752E+00 | grad norm: 0.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.934 | TFLOPs: 43.37 | +7: iteration 1300/ 115203 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.303105E+00 | grad norm: 0.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.170 | TFLOPs: 42.25 | +7: iteration 1310/ 115203 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.289275E+00 | grad norm: 0.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.438 | TFLOPs: 42.75 | +7: iteration 1320/ 115203 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.274089E+00 | grad norm: 0.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.876 | TFLOPs: 42.89 | +7: iteration 1330/ 115203 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.269057E+00 | grad norm: 0.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.207 | TFLOPs: 42.54 | +7: iteration 1340/ 115203 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.263853E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.061 | TFLOPs: 42.15 | +7: iteration 1350/ 115203 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.230310E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.555 | TFLOPs: 43.15 | +7: iteration 1360/ 115203 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.239049E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.599 | TFLOPs: 42.39 | +7: iteration 1370/ 115203 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.226430E+00 | grad norm: 0.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.757 | TFLOPs: 40.50 | +7: iteration 1380/ 115203 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.218304E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.656 | TFLOPs: 42.97 | +7: iteration 1390/ 115203 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.204264E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.746 | TFLOPs: 42.97 | +7: iteration 1400/ 115203 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.215569E+00 | grad norm: 0.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.871 | TFLOPs: 43.56 | +7: iteration 1410/ 115203 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.211316E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.756 | TFLOPs: 43.36 | +7: iteration 1420/ 115203 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.167598E+00 | grad norm: 0.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.328 | TFLOPs: 43.60 | +7: iteration 1430/ 115203 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.192402E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.430 | TFLOPs: 42.18 | +7: iteration 1440/ 115203 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.164381E+00 | grad norm: 0.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.469 | TFLOPs: 42.95 | +7: iteration 1450/ 115203 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.166402E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.250 | TFLOPs: 42.83 | +7: iteration 1460/ 115203 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.161439E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.842 | TFLOPs: 42.89 | +7: iteration 1470/ 115203 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.158066E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.851 | TFLOPs: 41.65 | +7: iteration 1480/ 115203 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.137557E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.499 | TFLOPs: 42.85 | +7: iteration 1490/ 115203 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.139757E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.034 | TFLOPs: 41.76 | +7: iteration 1500/ 115203 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.121047E+00 | grad norm: 0.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.420 | TFLOPs: 42.37 | +7: iteration 1510/ 115203 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.130090E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.016 | TFLOPs: 43.19 | +7: iteration 1520/ 115203 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.117041E+00 | grad norm: 0.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.367 | TFLOPs: 42.75 | +7: iteration 1530/ 115203 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.120011E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.711 | TFLOPs: 42.40 | +7: iteration 1540/ 115203 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.087398E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.291 | TFLOPs: 41.12 | +7: iteration 1550/ 115203 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.102070E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.822 | TFLOPs: 42.60 | +7: iteration 1560/ 115203 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.093875E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.388 | TFLOPs: 42.46 | +7: iteration 1570/ 115203 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.071863E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.248 | TFLOPs: 42.16 | +7: iteration 1580/ 115203 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.087856E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.043 | TFLOPs: 42.14 | +7: iteration 1590/ 115203 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.067475E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.516 | TFLOPs: 43.91 | +7: iteration 1600/ 115203 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.063922E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.524 | TFLOPs: 41.71 | +7: iteration 1610/ 115203 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.059980E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.027 | TFLOPs: 43.00 | +7: iteration 1620/ 115203 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.050697E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.111 | TFLOPs: 42.72 | +7: iteration 1630/ 115203 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.030390E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.706 | TFLOPs: 42.68 | +7: iteration 1640/ 115203 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.037593E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.644 | TFLOPs: 42.87 | +7: iteration 1650/ 115203 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.029446E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.901 | TFLOPs: 43.08 | +7: iteration 1660/ 115203 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.026376E+00 | grad norm: 0.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.773 | TFLOPs: 41.64 | +7: iteration 1670/ 115203 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.052903E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.213 | TFLOPs: 41.97 | +7: iteration 1680/ 115203 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.036416E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.782 | TFLOPs: 43.36 | +7: iteration 1690/ 115203 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.025433E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.131 | TFLOPs: 42.82 | +7: iteration 1700/ 115203 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.021999E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.664 | TFLOPs: 41.92 | +7: iteration 1710/ 115203 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.012592E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.346 | TFLOPs: 43.32 | +7: iteration 1720/ 115203 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.992098E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.071 | TFLOPs: 41.77 | +7: iteration 1730/ 115203 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.006924E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.386 | TFLOPs: 43.23 | +7: iteration 1740/ 115203 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.992574E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.857 | TFLOPs: 42.41 | +7: iteration 1750/ 115203 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.999265E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.662 | TFLOPs: 42.39 | +7: iteration 1760/ 115203 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.973203E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.689 | TFLOPs: 40.68 | +7: iteration 1770/ 115203 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.979195E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.214 | TFLOPs: 41.78 | +7: iteration 1780/ 115203 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.980511E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.177 | TFLOPs: 42.73 | +7: iteration 1790/ 115203 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.973595E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.984 | TFLOPs: 42.90 | +7: iteration 1800/ 115203 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.967590E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.679 | TFLOPs: 42.20 | +7: iteration 1810/ 115203 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.963490E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.385 | TFLOPs: 41.89 | +7: iteration 1820/ 115203 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.955769E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.281 | TFLOPs: 42.64 | +7: iteration 1830/ 115203 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.961776E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.027 | TFLOPs: 43.57 | +7: iteration 1840/ 115203 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.950670E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.726 | TFLOPs: 42.59 | +7: iteration 1850/ 115203 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.944305E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.961 | TFLOPs: 41.66 | +7: iteration 1860/ 115203 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.936361E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.152 | TFLOPs: 42.54 | +7: iteration 1870/ 115203 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.934866E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.796 | TFLOPs: 43.36 | +7: iteration 1880/ 115203 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.932917E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.979 | TFLOPs: 43.09 | +7: iteration 1890/ 115203 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.948693E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.216 | TFLOPs: 42.83 | +7: iteration 1900/ 115203 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.929689E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.996 | TFLOPs: 41.28 | +7: iteration 1910/ 115203 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.903655E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.331 | TFLOPs: 42.93 | +7: iteration 1920/ 115203 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.916010E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.928 | TFLOPs: 40.42 | +7: iteration 1930/ 115203 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.903462E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.916 | TFLOPs: 42.13 | +7: iteration 1940/ 115203 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.910558E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.957 | TFLOPs: 43.28 | +7: iteration 1950/ 115203 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.919135E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.360 | TFLOPs: 42.94 | +7: iteration 1960/ 115203 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.906839E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.630 | TFLOPs: 43.15 | +7: iteration 1970/ 115203 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.896611E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.422 | TFLOPs: 43.23 | +7: iteration 1980/ 115203 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.876390E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.775 | TFLOPs: 42.88 | +7: iteration 1990/ 115203 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.884431E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.108 | TFLOPs: 43.01 | +0: [2023-03-16 13:09:10,899] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.0001999754506631688, 0.0001999754506631688, 0.0001999754506631688], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 115203 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.873222E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.979 | TFLOPs: 43.28 | +0: steps: 2000 loss: 3.8898 iter time (s): 0.584 samples/sec: 438.303 +7: iteration 2010/ 115203 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.882880E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.008 | TFLOPs: 43.09 | +7: iteration 2020/ 115203 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.872437E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.542 | TFLOPs: 42.29 | +7: iteration 2030/ 115203 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.874244E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.178 | TFLOPs: 41.58 | +7: iteration 2040/ 115203 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.857454E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.408 | TFLOPs: 41.99 | +7: iteration 2050/ 115203 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.863844E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.949 | TFLOPs: 41.94 | +7: iteration 2060/ 115203 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.872206E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.578 | TFLOPs: 42.48 | +7: iteration 2070/ 115203 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.852542E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.069 | TFLOPs: 42.91 | +7: iteration 2080/ 115203 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.864257E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.306 | TFLOPs: 43.22 | +7: iteration 2090/ 115203 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.863011E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.023 | TFLOPs: 43.29 | +7: iteration 2100/ 115203 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.855587E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.925 | TFLOPs: 42.51 | +7: iteration 2110/ 115203 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.857816E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.741 | TFLOPs: 43.07 | +7: iteration 2120/ 115203 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.849236E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.190 | TFLOPs: 42.44 | +7: iteration 2130/ 115203 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.836247E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.543 | TFLOPs: 42.86 | +7: iteration 2140/ 115203 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.836411E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.135 | TFLOPs: 43.30 | +7: iteration 2150/ 115203 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.832098E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.125 | TFLOPs: 41.77 | +7: iteration 2160/ 115203 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.832311E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.234 | TFLOPs: 43.78 | +7: iteration 2170/ 115203 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.837332E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.937 | TFLOPs: 41.85 | +7: iteration 2180/ 115203 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.821481E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.046 | TFLOPs: 42.53 | +7: iteration 2190/ 115203 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.820064E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 2200/ 115203 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.820369E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.014 | TFLOPs: 42.52 | +7: iteration 2210/ 115203 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.60 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.823581E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.841 | TFLOPs: 40.41 | +7: iteration 2220/ 115203 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.809862E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.388 | TFLOPs: 41.41 | +7: iteration 2230/ 115203 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.811462E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.703 | TFLOPs: 42.78 | +7: iteration 2240/ 115203 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.807815E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.614 | TFLOPs: 41.82 | +7: iteration 2250/ 115203 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.792344E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.668 | TFLOPs: 43.54 | +7: iteration 2260/ 115203 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.783706E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.074 | TFLOPs: 42.81 | +7: iteration 2270/ 115203 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.804823E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.026 | TFLOPs: 42.91 | +7: iteration 2280/ 115203 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.794315E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.820 | TFLOPs: 43.27 | +7: iteration 2290/ 115203 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.804355E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.468 | TFLOPs: 42.76 | +7: iteration 2300/ 115203 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.790607E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.385 | TFLOPs: 42.27 | +7: iteration 2310/ 115203 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.803643E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.362 | TFLOPs: 43.70 | +7: iteration 2320/ 115203 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.782593E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.444 | TFLOPs: 42.47 | +7: iteration 2330/ 115203 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.793562E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.666 | TFLOPs: 41.25 | +7: iteration 2340/ 115203 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.789695E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.705 | TFLOPs: 42.30 | +7: iteration 2350/ 115203 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.773928E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.390 | TFLOPs: 42.94 | +7: iteration 2360/ 115203 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 3.769840E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.900 | TFLOPs: 42.03 | +7: iteration 2370/ 115203 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.775311E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.647 | TFLOPs: 43.06 | +7: iteration 2380/ 115203 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.762804E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.214 | TFLOPs: 42.26 | +7: iteration 2390/ 115203 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.764423E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.677 | TFLOPs: 43.16 | +7: iteration 2400/ 115203 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.758123E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.579 | TFLOPs: 43.24 | +7: iteration 2410/ 115203 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.752546E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.961 | TFLOPs: 42.90 | +7: iteration 2420/ 115203 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.761537E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.314 | TFLOPs: 43.41 | +7: iteration 2430/ 115203 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.756293E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.460 | TFLOPs: 41.99 | +7: iteration 2440/ 115203 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.764418E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.792 | TFLOPs: 42.02 | +7: iteration 2450/ 115203 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.761521E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.721 | TFLOPs: 42.88 | +7: iteration 2460/ 115203 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.748004E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.624 | TFLOPs: 43.25 | +7: iteration 2470/ 115203 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.760518E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.679 | TFLOPs: 42.97 | +7: iteration 2480/ 115203 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.740751E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.153 | TFLOPs: 43.39 | +7: iteration 2490/ 115203 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.742368E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.715 | TFLOPs: 43.16 | +7: iteration 2500/ 115203 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.734137E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.027 | TFLOPs: 42.81 | +7: iteration 2510/ 115203 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.739474E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.592 | TFLOPs: 42.10 | +7: iteration 2520/ 115203 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.758297E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.799 | TFLOPs: 42.98 | +7: iteration 2530/ 115203 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.726088E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.791 | TFLOPs: 42.22 | +7: iteration 2540/ 115203 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.720038E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.840 | TFLOPs: 43.84 | +7: iteration 2550/ 115203 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.720016E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.467 | TFLOPs: 43.71 | +7: iteration 2560/ 115203 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.724945E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.607 | TFLOPs: 43.44 | +7: iteration 2570/ 115203 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.758938E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.200 | TFLOPs: 42.35 | +7: iteration 2580/ 115203 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.746498E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.743 | TFLOPs: 43.26 | +7: iteration 2590/ 115203 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.728372E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.500 | TFLOPs: 42.47 | +7: iteration 2600/ 115203 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.731224E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.269 | TFLOPs: 43.50 | +7: iteration 2610/ 115203 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.714174E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.603 | TFLOPs: 43.53 | +7: iteration 2620/ 115203 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.704136E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.519 | TFLOPs: 42.57 | +7: iteration 2630/ 115203 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.723105E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.384 | TFLOPs: 42.37 | +7: iteration 2640/ 115203 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.697469E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.992 | TFLOPs: 42.81 | +7: iteration 2650/ 115203 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.701651E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.478 | TFLOPs: 42.57 | +7: iteration 2660/ 115203 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.768695E+00 | grad norm: 1.046 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.613 | TFLOPs: 42.68 | +7: iteration 2670/ 115203 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.088062E+00 | grad norm: 8.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.101 | TFLOPs: 43.01 | +7: iteration 2680/ 115203 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.277649E+00 | grad norm: 2.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.951 | TFLOPs: 42.33 | +7: iteration 2690/ 115203 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.997896E+00 | grad norm: 1.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.460 | TFLOPs: 41.80 | +7: iteration 2700/ 115203 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.900912E+00 | grad norm: 0.805 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.324 | TFLOPs: 42.46 | +7: iteration 2710/ 115203 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.823028E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.255 | TFLOPs: 41.50 | +7: iteration 2720/ 115203 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.773312E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.872 | TFLOPs: 43.18 | +7: iteration 2730/ 115203 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.741782E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.745 | TFLOPs: 42.97 | +7: iteration 2740/ 115203 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.709592E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.482 | TFLOPs: 43.71 | +7: iteration 2750/ 115203 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.724538E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.417 | TFLOPs: 43.23 | +7: iteration 2760/ 115203 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.727676E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.077 | TFLOPs: 43.29 | +7: iteration 2770/ 115203 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.710256E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.932 | TFLOPs: 41.08 | +7: iteration 2780/ 115203 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.699622E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.555 | TFLOPs: 43.62 | +7: iteration 2790/ 115203 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.693470E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.313 | TFLOPs: 41.41 | +7: iteration 2800/ 115203 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.698156E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.287 | TFLOPs: 43.69 | +7: iteration 2810/ 115203 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.695710E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.931 | TFLOPs: 42.23 | +7: iteration 2820/ 115203 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.706929E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.303 | TFLOPs: 42.55 | +7: iteration 2830/ 115203 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.699462E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.817 | TFLOPs: 42.50 | +7: iteration 2840/ 115203 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.679366E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.090 | TFLOPs: 42.34 | +7: iteration 2850/ 115203 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.675962E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.723 | TFLOPs: 42.11 | +7: iteration 2860/ 115203 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.682520E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.975 | TFLOPs: 43.47 | +7: iteration 2870/ 115203 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.667065E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.038 | TFLOPs: 42.52 | +7: iteration 2880/ 115203 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.665738E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.978 | TFLOPs: 42.90 | +7: iteration 2890/ 115203 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.667136E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.069 | TFLOPs: 42.34 | +7: iteration 2900/ 115203 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.664083E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.318 | TFLOPs: 42.74 | +7: iteration 2910/ 115203 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.651214E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.097 | TFLOPs: 43.67 | +7: iteration 2920/ 115203 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.667326E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.152 | TFLOPs: 42.92 | +7: iteration 2930/ 115203 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.662314E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.862 | TFLOPs: 43.56 | +7: iteration 2940/ 115203 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.655476E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.668 | TFLOPs: 42.58 | +7: iteration 2950/ 115203 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.653095E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.154 | TFLOPs: 43.39 | +7: iteration 2960/ 115203 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.642876E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.346 | TFLOPs: 43.51 | +7: iteration 2970/ 115203 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.658574E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.511 | TFLOPs: 42.00 | +7: iteration 2980/ 115203 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.655695E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.408 | TFLOPs: 41.61 | +7: iteration 2990/ 115203 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.644088E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.977 | TFLOPs: 43.19 | +7: iteration 3000/ 115203 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.623629E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.459 | TFLOPs: 42.66 | +7: iteration 3010/ 115203 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.641942E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.529 | TFLOPs: 42.57 | +7: iteration 3020/ 115203 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.642267E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.807 | TFLOPs: 43.27 | +7: iteration 3030/ 115203 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.636774E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.290 | TFLOPs: 41.88 | +7: iteration 3040/ 115203 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.649610E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.656 | TFLOPs: 41.92 | +7: iteration 3050/ 115203 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.633241E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.404 | TFLOPs: 43.70 | +7: iteration 3060/ 115203 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.645624E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.796 | TFLOPs: 41.74 | +7: iteration 3070/ 115203 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.642659E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.620 | TFLOPs: 42.87 | +7: iteration 3080/ 115203 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.630785E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.613 | TFLOPs: 42.20 | +7: iteration 3090/ 115203 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.621812E+00 | grad norm: 2.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.361 | TFLOPs: 41.51 | +7: iteration 3100/ 115203 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.762073E+00 | grad norm: 2.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.564 | TFLOPs: 41.81 | +7: iteration 3110/ 115203 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.862754E+00 | grad norm: 1.113 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.444 | TFLOPs: 43.23 | +7: iteration 3120/ 115203 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.771660E+00 | grad norm: 0.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.372 | TFLOPs: 42.37 | +7: iteration 3130/ 115203 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.696538E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.248 | TFLOPs: 43.21 | +7: iteration 3140/ 115203 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.689786E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.245 | TFLOPs: 43.31 | +7: iteration 3150/ 115203 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.59 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.658918E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.381 | TFLOPs: 41.70 | +7: iteration 3160/ 115203 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.641885E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.519 | TFLOPs: 43.91 | +7: iteration 3170/ 115203 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.643068E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.040 | TFLOPs: 42.81 | +7: iteration 3180/ 115203 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.651600E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.909 | TFLOPs: 42.89 | +7: iteration 3190/ 115203 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.625617E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.261 | TFLOPs: 42.93 | +7: iteration 3200/ 115203 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.612974E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.724 | TFLOPs: 42.97 | +7: iteration 3210/ 115203 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.618066E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.439 | TFLOPs: 42.94 | +7: iteration 3220/ 115203 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.57 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.633244E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.554 | TFLOPs: 42.57 | +7: iteration 3230/ 115203 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.56 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.618657E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.703 | TFLOPs: 43.35 | +7: iteration 3240/ 115203 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.58 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 3.612206E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.289 | TFLOPs: 42.26 | +7: iteration 3250/ 115203 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.59 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.616041E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.906 | TFLOPs: 41.56 | +7: iteration 3260/ 115203 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.608200E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.131 | TFLOPs: 43.39 | +7: iteration 3270/ 115203 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.611473E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.638 | TFLOPs: 43.63 | +7: iteration 3280/ 115203 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.596400E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.545 | TFLOPs: 42.86 | +7: iteration 3290/ 115203 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.608556E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.655 | TFLOPs: 43.16 | +7: iteration 3300/ 115203 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.598954E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.879 | TFLOPs: 43.27 | +7: iteration 3310/ 115203 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.602754E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.940 | TFLOPs: 42.42 | +7: iteration 3320/ 115203 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.601017E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.906 | TFLOPs: 42.99 | +7: iteration 3330/ 115203 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.590338E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.147 | TFLOPs: 43.20 | +7: iteration 3340/ 115203 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.596056E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.860 | TFLOPs: 42.60 | +7: iteration 3350/ 115203 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.590139E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.382 | TFLOPs: 42.65 | +7: iteration 3360/ 115203 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.573724E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.680 | TFLOPs: 41.73 | +7: iteration 3370/ 115203 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.597368E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.154 | TFLOPs: 42.35 | +7: iteration 3380/ 115203 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.594479E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.864 | TFLOPs: 43.18 | +7: iteration 3390/ 115203 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.589432E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.690 | TFLOPs: 43.54 | +7: iteration 3400/ 115203 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.573724E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.801 | TFLOPs: 43.17 | +7: iteration 3410/ 115203 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.569210E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.416 | TFLOPs: 43.13 | +7: iteration 3420/ 115203 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.575512E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.985 | TFLOPs: 42.62 | +7: iteration 3430/ 115203 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.571957E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.087 | TFLOPs: 42.43 | +7: iteration 3440/ 115203 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.570538E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.764 | TFLOPs: 43.17 | +7: iteration 3450/ 115203 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.565285E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.726 | TFLOPs: 43.83 | +7: iteration 3460/ 115203 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.571659E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.614 | TFLOPs: 43.72 | +7: iteration 3470/ 115203 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.579535E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.989 | TFLOPs: 42.90 | +7: iteration 3480/ 115203 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.573294E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.386 | TFLOPs: 41.99 | +7: iteration 3490/ 115203 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.580718E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.874 | TFLOPs: 43.65 | +7: iteration 3500/ 115203 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.563930E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.445 | TFLOPs: 42.09 | +7: iteration 3510/ 115203 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.55 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.560925E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 3520/ 115203 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.572539E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.469 | TFLOPs: 42.18 | +7: iteration 3530/ 115203 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.564172E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.082 | TFLOPs: 42.53 | +7: iteration 3540/ 115203 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.556110E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.115 | TFLOPs: 42.63 | +7: iteration 3550/ 115203 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.562082E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.712 | TFLOPs: 43.35 | +7: iteration 3560/ 115203 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.568440E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.571 | TFLOPs: 43.05 | +7: iteration 3570/ 115203 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.562823E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.341 | TFLOPs: 43.70 | +7: iteration 3580/ 115203 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.557665E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.496 | TFLOPs: 42.57 | +7: iteration 3590/ 115203 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.541757E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.918 | TFLOPs: 42.42 | +7: iteration 3600/ 115203 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.550386E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.213 | TFLOPs: 42.73 | +7: iteration 3610/ 115203 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.562772E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.230 | TFLOPs: 43.21 | +7: iteration 3620/ 115203 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.538205E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.126 | TFLOPs: 43.20 | +7: iteration 3630/ 115203 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.554341E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.372 | TFLOPs: 43.32 | +7: iteration 3640/ 115203 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.555310E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.459 | TFLOPs: 42.66 | +7: iteration 3650/ 115203 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.540936E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.402 | TFLOPs: 41.89 | +7: iteration 3660/ 115203 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.59 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.555559E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.329 | TFLOPs: 41.31 | +7: iteration 3670/ 115203 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.532450E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.292 | TFLOPs: 43.31 | +7: iteration 3680/ 115203 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.558592E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.303 | TFLOPs: 42.84 | +7: iteration 3690/ 115203 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.535553E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.418 | TFLOPs: 43.42 | +7: iteration 3700/ 115203 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.541524E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.081 | TFLOPs: 43.20 | +7: iteration 3710/ 115203 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.550447E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.950 | TFLOPs: 41.85 | +7: iteration 3720/ 115203 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.544693E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.100 | TFLOPs: 43.29 | +7: iteration 3730/ 115203 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.555907E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.188 | TFLOPs: 43.02 | +7: iteration 3740/ 115203 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.525185E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.016 | TFLOPs: 42.90 | +7: iteration 3750/ 115203 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.522741E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.405 | TFLOPs: 42.27 | +7: iteration 3760/ 115203 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.540412E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.689 | TFLOPs: 42.87 | +7: iteration 3770/ 115203 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.530580E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.977 | TFLOPs: 41.95 | +7: iteration 3780/ 115203 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.536485E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.961 | TFLOPs: 43.09 | +7: iteration 3790/ 115203 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.530962E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.323 | TFLOPs: 42.08 | +7: iteration 3800/ 115203 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.57 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.519750E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.884 | TFLOPs: 42.99 | +7: iteration 3810/ 115203 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.530056E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.921 | TFLOPs: 42.13 | +7: iteration 3820/ 115203 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.527156E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.677 | TFLOPs: 43.44 | +7: iteration 3830/ 115203 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.526907E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.555 | TFLOPs: 43.24 | +7: iteration 3840/ 115203 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.56 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.519584E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.089 | TFLOPs: 43.39 | +7: iteration 3850/ 115203 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.58 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 3.510915E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.810 | TFLOPs: 41.84 | +7: iteration 3860/ 115203 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.517064E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.743 | TFLOPs: 43.55 | +7: iteration 3870/ 115203 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.530902E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.338 | TFLOPs: 43.89 | +7: iteration 3880/ 115203 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.533409E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.212 | TFLOPs: 43.40 | +7: iteration 3890/ 115203 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.530120E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.658 | TFLOPs: 43.63 | +7: iteration 3900/ 115203 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.516466E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.568 | TFLOPs: 42.96 | +7: iteration 3910/ 115203 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.527357E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.732 | TFLOPs: 42.21 | +7: iteration 3920/ 115203 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.510162E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.807 | TFLOPs: 42.41 | +7: iteration 3930/ 115203 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.511049E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.886 | TFLOPs: 42.80 | +7: iteration 3940/ 115203 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.521288E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.818 | TFLOPs: 42.89 | +7: iteration 3950/ 115203 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.517825E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.978 | TFLOPs: 43.57 | +7: iteration 3960/ 115203 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.511323E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.362 | TFLOPs: 41.70 | +7: iteration 3970/ 115203 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.507584E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.962 | TFLOPs: 43.18 | +7: iteration 3980/ 115203 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.503149E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.389 | TFLOPs: 43.51 | +7: iteration 3990/ 115203 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.514571E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.345 | TFLOPs: 43.03 | +0: [2023-03-16 13:28:12,718] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00019972320825211248, 0.00019972320825211248, 0.00019972320825211248], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 115203 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.501412E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.977 | TFLOPs: 41.76 | +0: steps: 4000 loss: 3.4712 iter time (s): 0.569 samples/sec: 450.249 +7: iteration 4010/ 115203 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.502592E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.448 | TFLOPs: 42.95 | +7: iteration 4020/ 115203 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.520422E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.504 | TFLOPs: 43.14 | +7: iteration 4030/ 115203 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.503262E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.705 | TFLOPs: 42.49 | +7: iteration 4040/ 115203 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.518765E+00 | grad norm: 1.052 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.588 | TFLOPs: 42.58 | +7: iteration 4050/ 115203 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.759319E+00 | grad norm: 5.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.866 | TFLOPs: 42.79 | +7: iteration 4060/ 115203 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.737498E+00 | grad norm: 1.122 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.553 | TFLOPs: 43.34 | +7: iteration 4070/ 115203 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.682061E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.735 | TFLOPs: 42.78 | +7: iteration 4080/ 115203 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.598532E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.238 | TFLOPs: 42.64 | +7: iteration 4090/ 115203 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.566933E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.298 | TFLOPs: 42.64 | +7: iteration 4100/ 115203 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.529934E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.206 | TFLOPs: 42.45 | +7: iteration 4110/ 115203 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.60 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.537493E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.181 | TFLOPs: 40.54 | +7: iteration 4120/ 115203 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.526067E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.549 | TFLOPs: 41.24 | +7: iteration 4130/ 115203 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.515664E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.228 | TFLOPs: 43.02 | +7: iteration 4140/ 115203 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.503191E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.503 | TFLOPs: 43.43 | +7: iteration 4150/ 115203 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.497609E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.630 | TFLOPs: 43.44 | +7: iteration 4160/ 115203 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.488226E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.804 | TFLOPs: 41.07 | +7: iteration 4170/ 115203 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.495157E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.716 | TFLOPs: 41.64 | +7: iteration 4180/ 115203 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.502707E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.520 | TFLOPs: 43.43 | +7: iteration 4190/ 115203 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.481891E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.615 | TFLOPs: 41.63 | +7: iteration 4200/ 115203 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.61 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.487413E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.270 | TFLOPs: 40.26 | +7: iteration 4210/ 115203 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.484173E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.678 | TFLOPs: 42.78 | +7: iteration 4220/ 115203 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.491697E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.387 | TFLOPs: 43.61 | +7: iteration 4230/ 115203 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.493550E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.565 | TFLOPs: 42.77 | +7: iteration 4240/ 115203 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.497982E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.243 | TFLOPs: 41.69 | +7: iteration 4250/ 115203 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.473922E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.800 | TFLOPs: 43.46 | +7: iteration 4260/ 115203 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.495518E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.395 | TFLOPs: 43.23 | +7: iteration 4270/ 115203 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.476290E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.249 | TFLOPs: 42.54 | +7: iteration 4280/ 115203 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.477217E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.238 | TFLOPs: 43.78 | +7: iteration 4290/ 115203 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.60 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.482925E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.716 | TFLOPs: 40.78 | +7: iteration 4300/ 115203 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.476044E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.511 | TFLOPs: 43.33 | +7: iteration 4310/ 115203 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.59 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.471058E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.138 | TFLOPs: 41.39 | +7: iteration 4320/ 115203 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.476482E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.147 | TFLOPs: 43.30 | +7: iteration 4330/ 115203 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.56 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.459129E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.258 | TFLOPs: 43.31 | +7: iteration 4340/ 115203 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.57 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.476937E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.321 | TFLOPs: 42.55 | +7: iteration 4350/ 115203 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.58 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 3.460926E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.530 | TFLOPs: 42.10 | +7: iteration 4360/ 115203 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.483918E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.609 | TFLOPs: 42.96 | +7: iteration 4370/ 115203 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.464880E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.989 | TFLOPs: 42.71 | +7: iteration 4380/ 115203 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.58 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.469549E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.014 | TFLOPs: 42.24 | +7: iteration 4390/ 115203 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.471868E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.654 | TFLOPs: 43.16 | +7: iteration 4400/ 115203 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.460762E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.419 | TFLOPs: 42.75 | +7: iteration 4410/ 115203 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.469278E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.427 | TFLOPs: 43.04 | +7: iteration 4420/ 115203 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.464630E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.773 | TFLOPs: 42.98 | +7: iteration 4430/ 115203 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.58 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.477397E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.700 | TFLOPs: 42.30 | +7: iteration 4440/ 115203 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.465185E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.844 | TFLOPs: 42.89 | +7: iteration 4450/ 115203 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.464158E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.301 | TFLOPs: 43.22 | +7: iteration 4460/ 115203 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.473399E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.931 | TFLOPs: 43.85 | +7: iteration 4470/ 115203 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.471893E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.179 | TFLOPs: 43.02 | +7: iteration 4480/ 115203 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.459465E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.761 | TFLOPs: 43.26 | +7: iteration 4490/ 115203 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.473938E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.235 | TFLOPs: 43.88 | +7: iteration 4500/ 115203 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.456741E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.319 | TFLOPs: 42.93 | +7: iteration 4510/ 115203 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.449725E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.999 | TFLOPs: 43.95 | +7: iteration 4520/ 115203 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.457507E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.143 | TFLOPs: 43.30 | +7: iteration 4530/ 115203 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.442463E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.475 | TFLOPs: 43.33 | +7: iteration 4540/ 115203 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.459652E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.289 | TFLOPs: 43.22 | +7: iteration 4550/ 115203 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.440992E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.121 | TFLOPs: 43.10 | +7: iteration 4560/ 115203 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.58 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.441632E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.801 | TFLOPs: 42.03 | +7: iteration 4570/ 115203 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.55 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.449110E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.324 | TFLOPs: 43.98 | +7: iteration 4580/ 115203 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.449752E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.886 | TFLOPs: 42.80 | +7: iteration 4590/ 115203 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.440379E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.669 | TFLOPs: 43.25 | +7: iteration 4600/ 115203 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.456939E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.658 | TFLOPs: 43.35 | +7: iteration 4610/ 115203 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.453460E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.163 | TFLOPs: 43.87 | +7: iteration 4620/ 115203 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.450310E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.263 | TFLOPs: 42.93 | +7: iteration 4630/ 115203 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.447261E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.523 | TFLOPs: 43.24 | +7: iteration 4640/ 115203 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.445176E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.675 | TFLOPs: 42.78 | +7: iteration 4650/ 115203 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.60 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.440106E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.882 | TFLOPs: 40.79 | +7: iteration 4660/ 115203 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.434602E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.384 | TFLOPs: 43.42 | +7: iteration 4670/ 115203 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.454094E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.989 | TFLOPs: 43.19 | +7: iteration 4680/ 115203 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.431455E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.283 | TFLOPs: 42.83 | +7: iteration 4690/ 115203 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.55 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.419458E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 4700/ 115203 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.449553E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.744 | TFLOPs: 42.78 | +7: iteration 4710/ 115203 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.439742E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.409 | TFLOPs: 43.23 | +7: iteration 4720/ 115203 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.438356E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.029 | TFLOPs: 43.76 | +7: iteration 4730/ 115203 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.438144E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.379 | TFLOPs: 43.13 | +7: iteration 4740/ 115203 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.56 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.430640E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.183 | TFLOPs: 43.40 | +7: iteration 4750/ 115203 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.438800E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.380 | TFLOPs: 42.84 | +7: iteration 4760/ 115203 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.424158E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.958 | TFLOPs: 43.18 | +7: iteration 4770/ 115203 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.432800E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.295 | TFLOPs: 42.84 | +7: iteration 4780/ 115203 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.57 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 3.438901E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.301 | TFLOPs: 42.84 | +7: iteration 4790/ 115203 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.424072E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.901 | TFLOPs: 43.47 | +7: iteration 4800/ 115203 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.426955E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.360 | TFLOPs: 43.41 | +7: iteration 4810/ 115203 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.419360E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.096 | TFLOPs: 43.96 | +7: iteration 4820/ 115203 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.414481E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.894 | TFLOPs: 42.61 | +7: iteration 4830/ 115203 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.427771E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.722 | TFLOPs: 43.35 | +7: iteration 4840/ 115203 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.426334E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.029 | TFLOPs: 42.81 | +7: iteration 4850/ 115203 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.429502E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.281 | TFLOPs: 43.60 | +7: iteration 4860/ 115203 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.424220E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.523 | TFLOPs: 43.05 | +7: iteration 4870/ 115203 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.409717E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.239 | TFLOPs: 43.40 | +7: iteration 4880/ 115203 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.419583E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.948 | TFLOPs: 43.28 | +7: iteration 4890/ 115203 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.58 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.429937E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.874 | TFLOPs: 42.41 | +7: iteration 4900/ 115203 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.413614E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.245 | TFLOPs: 42.93 | +7: iteration 4910/ 115203 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.416655E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.057 | TFLOPs: 43.96 | +7: iteration 4920/ 115203 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.439486E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.464 | TFLOPs: 43.33 | +7: iteration 4930/ 115203 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.418875E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.247 | TFLOPs: 43.40 | +7: iteration 4940/ 115203 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.405551E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.971 | TFLOPs: 42.80 | +7: iteration 4950/ 115203 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.428019E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.383 | TFLOPs: 43.70 | +7: iteration 4960/ 115203 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.415166E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.889 | TFLOPs: 43.27 | +7: iteration 4970/ 115203 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.410094E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.914 | TFLOPs: 43.56 | +7: iteration 4980/ 115203 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.55 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.414375E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.630 | TFLOPs: 44.01 | +7: iteration 4990/ 115203 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.417967E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.970 | TFLOPs: 43.09 | +7: iteration 5000/ 115203 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.416031E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.235 | TFLOPs: 42.92 | +7: iteration 5010/ 115203 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.408383E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.552 | TFLOPs: 42.96 | +7: iteration 5020/ 115203 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.413295E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.765 | TFLOPs: 43.36 | +7: iteration 5030/ 115203 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.425443E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.256 | TFLOPs: 42.55 | +7: iteration 5040/ 115203 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.406838E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.033 | TFLOPs: 43.38 | +7: iteration 5050/ 115203 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.402492E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.994 | TFLOPs: 43.86 | +7: iteration 5060/ 115203 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.394091E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.482 | TFLOPs: 43.71 | +7: iteration 5070/ 115203 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.413587E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 5080/ 115203 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.405056E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.473 | TFLOPs: 42.47 | +7: iteration 5090/ 115203 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.55 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.405912E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.289 | TFLOPs: 43.98 | +7: iteration 5100/ 115203 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.412272E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.183 | TFLOPs: 43.97 | +7: iteration 5110/ 115203 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.58 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.403730E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.230 | TFLOPs: 42.26 | +7: iteration 5120/ 115203 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.400402E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 5130/ 115203 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.56 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.403416E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.499 | TFLOPs: 43.52 | +7: iteration 5140/ 115203 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.402286E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.532 | TFLOPs: 43.14 | +7: iteration 5150/ 115203 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.57 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.408972E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.107 | TFLOPs: 43.01 | +7: iteration 5160/ 115203 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.58 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 3.406640E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.112 | TFLOPs: 42.15 | +7: iteration 5170/ 115203 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.410521E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.035 | TFLOPs: 43.48 | +7: iteration 5180/ 115203 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.410188E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.557 | TFLOPs: 43.43 | +7: iteration 5190/ 115203 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.410228E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.723 | TFLOPs: 43.26 | +7: iteration 5200/ 115203 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.390989E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.576 | TFLOPs: 43.53 | +7: iteration 5210/ 115203 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.405567E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.233 | TFLOPs: 43.69 | +7: iteration 5220/ 115203 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.404516E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.171 | TFLOPs: 43.97 | +7: iteration 5230/ 115203 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.390731E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 5240/ 115203 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.399543E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.278 | TFLOPs: 43.98 | +7: iteration 5250/ 115203 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.390702E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.273 | TFLOPs: 43.41 | +7: iteration 5260/ 115203 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.389916E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.161 | TFLOPs: 43.97 | +7: iteration 5270/ 115203 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.393861E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.678 | TFLOPs: 43.54 | +7: iteration 5280/ 115203 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.375098E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.350 | TFLOPs: 43.32 | +7: iteration 5290/ 115203 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.399625E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.230 | TFLOPs: 43.97 | +7: iteration 5300/ 115203 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.57 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.391365E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.846 | TFLOPs: 43.17 | +7: iteration 5310/ 115203 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.408348E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.409 | TFLOPs: 43.99 | +7: iteration 5320/ 115203 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.396178E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.462 | TFLOPs: 44.00 | +7: iteration 5330/ 115203 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.395382E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.186 | TFLOPs: 43.97 | +7: iteration 5340/ 115203 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.382593E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.669 | TFLOPs: 43.54 | +7: iteration 5350/ 115203 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.394614E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.655 | TFLOPs: 43.54 | +7: iteration 5360/ 115203 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.57 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.386108E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.129 | TFLOPs: 42.91 | +7: iteration 5370/ 115203 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.372615E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.680 | TFLOPs: 43.63 | +7: iteration 5380/ 115203 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.380148E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 5390/ 115203 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.387868E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 5400/ 115203 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.378642E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.227 | TFLOPs: 43.97 | +7: iteration 5410/ 115203 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.383297E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.166 | TFLOPs: 43.97 | +7: iteration 5420/ 115203 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.55 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.367382E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.329 | TFLOPs: 43.98 | +7: iteration 5430/ 115203 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.372313E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.504 | TFLOPs: 43.62 | +7: iteration 5440/ 115203 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.380993E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.920 | TFLOPs: 43.66 | +7: iteration 5450/ 115203 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.57 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.378574E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.760 | TFLOPs: 43.07 | +7: iteration 5460/ 115203 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.373624E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.375 | TFLOPs: 43.70 | +7: iteration 5470/ 115203 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.381567E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.466 | TFLOPs: 43.42 | +7: iteration 5480/ 115203 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.376409E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.507 | TFLOPs: 43.43 | +7: iteration 5490/ 115203 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.369931E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.938 | TFLOPs: 43.56 | +7: iteration 5500/ 115203 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.383097E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.084 | TFLOPs: 43.96 | +7: iteration 5510/ 115203 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.56 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 3.371960E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.913 | TFLOPs: 43.28 | +7: iteration 5520/ 115203 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.367506E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.815 | TFLOPs: 43.93 | +7: iteration 5530/ 115203 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.365016E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.599 | TFLOPs: 43.91 | +7: iteration 5540/ 115203 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.367226E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.434 | TFLOPs: 43.52 | +7: iteration 5550/ 115203 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.387309E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.328 | TFLOPs: 43.79 | +7: iteration 5560/ 115203 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.375902E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.015 | TFLOPs: 43.86 | +7: iteration 5570/ 115203 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.365658E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.510 | TFLOPs: 43.81 | +7: iteration 5580/ 115203 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.55 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.350553E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.530 | TFLOPs: 44.00 | +7: iteration 5590/ 115203 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.367937E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.867 | TFLOPs: 43.46 | +7: iteration 5600/ 115203 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.366870E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.933 | TFLOPs: 43.56 | +7: iteration 5610/ 115203 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.377440E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.152 | TFLOPs: 43.49 | +7: iteration 5620/ 115203 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.359290E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.112 | TFLOPs: 43.49 | +7: iteration 5630/ 115203 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.364145E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.365 | TFLOPs: 43.32 | +7: iteration 5640/ 115203 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.350858E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.210 | TFLOPs: 43.97 | +7: iteration 5650/ 115203 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.55 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.367754E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.289 | TFLOPs: 43.98 | +7: iteration 5660/ 115203 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.341351E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.397 | TFLOPs: 43.70 | +7: iteration 5670/ 115203 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.57 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.355296E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.292 | TFLOPs: 43.03 | +7: iteration 5680/ 115203 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.360668E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.245 | TFLOPs: 43.97 | +7: iteration 5690/ 115203 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.360370E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.033 | TFLOPs: 43.67 | +7: iteration 5700/ 115203 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.355780E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.027 | TFLOPs: 43.38 | +7: iteration 5710/ 115203 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.359046E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.723 | TFLOPs: 43.54 | +7: iteration 5720/ 115203 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.358059E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.879 | TFLOPs: 43.65 | +7: iteration 5730/ 115203 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.365650E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.551 | TFLOPs: 43.72 | +7: iteration 5740/ 115203 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.366603E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.002 | TFLOPs: 43.95 | +7: iteration 5750/ 115203 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.361958E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.311 | TFLOPs: 43.41 | +7: iteration 5760/ 115203 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.358826E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.241 | TFLOPs: 43.69 | +7: iteration 5770/ 115203 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.351041E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.990 | TFLOPs: 43.95 | +7: iteration 5780/ 115203 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.349793E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.941 | TFLOPs: 43.37 | +7: iteration 5790/ 115203 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.349302E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.485 | TFLOPs: 43.81 | +7: iteration 5800/ 115203 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.338162E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.739 | TFLOPs: 43.93 | +7: iteration 5810/ 115203 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.342574E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.849 | TFLOPs: 43.94 | +7: iteration 5820/ 115203 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.57 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.345137E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.940 | TFLOPs: 43.18 | +7: iteration 5830/ 115203 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.56 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 3.330406E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.033 | TFLOPs: 43.76 | +7: iteration 5840/ 115203 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.57 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.160773E+00 | grad norm: 9.065 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.668 | TFLOPs: 42.68 | +7: iteration 5850/ 115203 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.285795E+00 | grad norm: 5.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.020 | TFLOPs: 43.48 | +7: iteration 5860/ 115203 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.999061E+00 | grad norm: 1.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.636 | TFLOPs: 43.73 | +7: iteration 5870/ 115203 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.698036E+00 | grad norm: 1.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.030 | TFLOPs: 43.76 | +7: iteration 5880/ 115203 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.558301E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.668 | TFLOPs: 43.92 | +7: iteration 5890/ 115203 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.459351E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.919 | TFLOPs: 43.94 | +7: iteration 5900/ 115203 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.57 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.413180E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.071 | TFLOPs: 43.00 | +7: iteration 5910/ 115203 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.395166E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.028 | TFLOPs: 43.95 | +7: iteration 5920/ 115203 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.57 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.377091E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.604 | TFLOPs: 42.67 | +7: iteration 5930/ 115203 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.354890E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.612 | TFLOPs: 43.72 | +7: iteration 5940/ 115203 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.55 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.378060E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 5950/ 115203 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.376217E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.153 | TFLOPs: 43.97 | +7: iteration 5960/ 115203 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.360511E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.966 | TFLOPs: 43.95 | +7: iteration 5970/ 115203 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.357906E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.432 | TFLOPs: 43.42 | +7: iteration 5980/ 115203 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.366512E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.160 | TFLOPs: 43.97 | +7: iteration 5990/ 115203 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.349006E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.059 | TFLOPs: 43.96 | +0: [2023-03-16 13:47:01,318] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.00019919872690019844, 0.00019919872690019844, 0.00019919872690019844], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 115203 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.368370E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.221 | TFLOPs: 43.97 | +0: steps: 6000 loss: 3.3978 iter time (s): 0.562 samples/sec: 455.497 +7: iteration 6010/ 115203 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.57 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.370811E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.597 | TFLOPs: 42.96 | +7: iteration 6020/ 115203 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.340815E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.208 | TFLOPs: 43.97 | +7: iteration 6030/ 115203 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.349959E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.850 | TFLOPs: 43.94 | +7: iteration 6040/ 115203 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.337577E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 6050/ 115203 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.342576E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.177 | TFLOPs: 43.97 | +7: iteration 6060/ 115203 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.57 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.358375E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.315 | TFLOPs: 42.74 | +7: iteration 6070/ 115203 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.344313E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.314 | TFLOPs: 43.50 | +7: iteration 6080/ 115203 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.331813E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.454 | TFLOPs: 43.90 | +7: iteration 6090/ 115203 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.337033E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.583 | TFLOPs: 43.63 | +7: iteration 6100/ 115203 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.336075E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.002 | TFLOPs: 43.95 | +7: iteration 6110/ 115203 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.344908E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.051 | TFLOPs: 43.96 | +7: iteration 6120/ 115203 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.344032E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.014 | TFLOPs: 43.95 | +7: iteration 6130/ 115203 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.336709E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.776 | TFLOPs: 43.64 | +7: iteration 6140/ 115203 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.56 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 3.332840E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.178 | TFLOPs: 43.68 | +7: iteration 6150/ 115203 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.331007E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.434 | TFLOPs: 43.33 | +7: iteration 6160/ 115203 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.327165E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.164 | TFLOPs: 43.97 | +7: iteration 6170/ 115203 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.329634E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.123 | TFLOPs: 43.96 | +7: iteration 6180/ 115203 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.330973E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.475 | TFLOPs: 43.62 | +7: iteration 6190/ 115203 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.341698E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.099 | TFLOPs: 43.96 | +7: iteration 6200/ 115203 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.55 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.338498E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 6210/ 115203 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.334376E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.092 | TFLOPs: 43.96 | +7: iteration 6220/ 115203 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.331870E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.215 | TFLOPs: 43.97 | +7: iteration 6230/ 115203 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.330471E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.094 | TFLOPs: 43.96 | +7: iteration 6240/ 115203 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.57 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.320736E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.314 | TFLOPs: 43.12 | +7: iteration 6250/ 115203 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.324288E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 6260/ 115203 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.333569E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.009 | TFLOPs: 43.95 | +7: iteration 6270/ 115203 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.330465E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.956 | TFLOPs: 43.47 | +7: iteration 6280/ 115203 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.323577E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.623 | TFLOPs: 43.34 | +7: iteration 6290/ 115203 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.55 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.320481E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.273 | TFLOPs: 43.98 | +7: iteration 6300/ 115203 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.312287E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.831 | TFLOPs: 43.65 | +7: iteration 6310/ 115203 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.335525E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.114 | TFLOPs: 43.68 | +7: iteration 6320/ 115203 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.329490E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.605 | TFLOPs: 43.44 | +7: iteration 6330/ 115203 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.58 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.314900E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.373 | TFLOPs: 41.89 | +7: iteration 6340/ 115203 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.313155E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.402 | TFLOPs: 43.61 | +7: iteration 6350/ 115203 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.334285E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.545 | TFLOPs: 43.53 | +7: iteration 6360/ 115203 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.57 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.312745E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.868 | TFLOPs: 42.89 | +7: iteration 6370/ 115203 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.311859E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.215 | TFLOPs: 43.30 | +7: iteration 6380/ 115203 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.316151E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.056 | TFLOPs: 43.67 | +7: iteration 6390/ 115203 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.321466E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.023 | TFLOPs: 43.95 | +7: iteration 6400/ 115203 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.321841E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.063 | TFLOPs: 43.96 | +7: iteration 6410/ 115203 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.324090E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.216 | TFLOPs: 43.97 | +7: iteration 6420/ 115203 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.320189E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.169 | TFLOPs: 43.97 | +7: iteration 6430/ 115203 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.56 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 3.318829E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.032 | TFLOPs: 43.95 | +7: iteration 6440/ 115203 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.316851E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.985 | TFLOPs: 43.95 | +7: iteration 6450/ 115203 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.323423E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.108 | TFLOPs: 43.96 | +7: iteration 6460/ 115203 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.315997E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.109 | TFLOPs: 43.96 | +7: iteration 6470/ 115203 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.317770E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.842 | TFLOPs: 43.94 | +7: iteration 6480/ 115203 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.319662E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.856 | TFLOPs: 43.84 | +7: iteration 6490/ 115203 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.313260E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.813 | TFLOPs: 43.74 | +7: iteration 6500/ 115203 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.297935E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.972 | TFLOPs: 43.95 | +7: iteration 6510/ 115203 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.316374E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.836 | TFLOPs: 43.94 | +7: iteration 6520/ 115203 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.303621E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.043 | TFLOPs: 43.96 | +7: iteration 6530/ 115203 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.319716E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.890 | TFLOPs: 43.75 | +7: iteration 6540/ 115203 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.320856E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.695 | TFLOPs: 43.92 | +7: iteration 6550/ 115203 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.305096E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.711 | TFLOPs: 43.92 | +7: iteration 6560/ 115203 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.304784E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.857 | TFLOPs: 43.84 | +7: iteration 6570/ 115203 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.342374E+00 | grad norm: 1.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.752 | TFLOPs: 43.74 | +7: iteration 6580/ 115203 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.362100E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.026 | TFLOPs: 43.95 | +7: iteration 6590/ 115203 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.57 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.332311E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.583 | TFLOPs: 42.86 | +7: iteration 6600/ 115203 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.318145E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.387 | TFLOPs: 43.70 | +7: iteration 6610/ 115203 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.298055E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 6620/ 115203 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.57 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.312146E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.803 | TFLOPs: 42.79 | +7: iteration 6630/ 115203 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.307681E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.055 | TFLOPs: 43.96 | +7: iteration 6640/ 115203 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.300314E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.639 | TFLOPs: 43.54 | +7: iteration 6650/ 115203 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.301730E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.577 | TFLOPs: 43.91 | +7: iteration 6660/ 115203 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.304272E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.723 | TFLOPs: 43.83 | +7: iteration 6670/ 115203 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.292750E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.797 | TFLOPs: 43.93 | +7: iteration 6680/ 115203 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.310929E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.775 | TFLOPs: 43.93 | +7: iteration 6690/ 115203 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.306750E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.900 | TFLOPs: 43.94 | +7: iteration 6700/ 115203 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.56 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 3.299497E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.241 | TFLOPs: 43.59 | +7: iteration 6710/ 115203 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.306280E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.589 | TFLOPs: 43.82 | +7: iteration 6720/ 115203 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.304568E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.237 | TFLOPs: 43.88 | +7: iteration 6730/ 115203 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.57 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.297826E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.712 | TFLOPs: 42.49 | +7: iteration 6740/ 115203 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.57 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.301406E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.526 | TFLOPs: 43.14 | +7: iteration 6750/ 115203 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.286617E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.603 | TFLOPs: 43.91 | +7: iteration 6760/ 115203 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.297257E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.903 | TFLOPs: 43.94 | +7: iteration 6770/ 115203 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.61 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.308386E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.245 | TFLOPs: 40.16 | +7: iteration 6780/ 115203 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.292445E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.063 | TFLOPs: 43.58 | +7: iteration 6790/ 115203 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.296299E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.067 | TFLOPs: 43.96 | +7: iteration 6800/ 115203 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.58 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.300217E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.589 | TFLOPs: 42.39 | +7: iteration 6810/ 115203 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.297296E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.525 | TFLOPs: 43.72 | +7: iteration 6820/ 115203 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.62 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.287957E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 414.278 | TFLOPs: 39.50 | +7: iteration 6830/ 115203 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.60 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.291022E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.516 | TFLOPs: 40.85 | +7: iteration 6840/ 115203 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.283736E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.936 | TFLOPs: 43.56 | +7: iteration 6850/ 115203 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.57 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.295049E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.650 | TFLOPs: 42.87 | +7: iteration 6860/ 115203 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.289075E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.960 | TFLOPs: 43.95 | +7: iteration 6870/ 115203 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.278854E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.076 | TFLOPs: 43.96 | +7: iteration 6880/ 115203 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.280574E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.673 | TFLOPs: 43.63 | +7: iteration 6890/ 115203 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.290104E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 6900/ 115203 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.55 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.295279E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 6910/ 115203 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.301484E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.776 | TFLOPs: 43.45 | +7: iteration 6920/ 115203 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.286239E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 6930/ 115203 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.55 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.289520E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.279 | TFLOPs: 43.98 | +7: iteration 6940/ 115203 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.55 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.283019E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +7: iteration 6950/ 115203 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.56 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.295492E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.213 | TFLOPs: 43.97 | +7: iteration 6960/ 115203 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.55 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.288752E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.263 | TFLOPs: 43.98 | +7: iteration 6970/ 115203 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.284595E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 6980/ 115203 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.290430E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.098 | TFLOPs: 43.96 | +7: iteration 6990/ 115203 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.282433E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.202 | TFLOPs: 43.97 | +7: iteration 7000/ 115203 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.295173E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.059 | TFLOPs: 43.96 | +7: iteration 7010/ 115203 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.292067E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.081 | TFLOPs: 43.96 | +7: iteration 7020/ 115203 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.281429E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 7030/ 115203 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.288489E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.931 | TFLOPs: 43.56 | +7: iteration 7040/ 115203 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.274565E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.059 | TFLOPs: 43.96 | +7: iteration 7050/ 115203 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.272860E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.755 | TFLOPs: 43.64 | +7: iteration 7060/ 115203 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.289085E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 7070/ 115203 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.270801E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.038 | TFLOPs: 43.67 | +7: iteration 7080/ 115203 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.259804E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.474 | TFLOPs: 43.42 | +7: iteration 7090/ 115203 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.306569E+00 | grad norm: 3.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.041 | TFLOPs: 43.86 | +7: iteration 7100/ 115203 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.352141E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.588 | TFLOPs: 43.91 | +7: iteration 7110/ 115203 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.55 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.314833E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 7120/ 115203 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.317979E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.138 | TFLOPs: 43.96 | +7: iteration 7130/ 115203 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.292262E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.188 | TFLOPs: 43.97 | +7: iteration 7140/ 115203 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.277620E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 7150/ 115203 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.293025E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.163 | TFLOPs: 43.97 | +7: iteration 7160/ 115203 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.280128E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.185 | TFLOPs: 43.97 | +7: iteration 7170/ 115203 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.55 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.276626E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 7180/ 115203 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.284298E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.085 | TFLOPs: 43.96 | +7: iteration 7190/ 115203 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.56 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.294649E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.081 | TFLOPs: 43.96 | +7: iteration 7200/ 115203 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.55 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.284986E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.312 | TFLOPs: 43.98 | +7: iteration 7210/ 115203 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.270150E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.224 | TFLOPs: 43.97 | +7: iteration 7220/ 115203 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.264207E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.246 | TFLOPs: 43.97 | +7: iteration 7230/ 115203 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.272700E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 7240/ 115203 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.274165E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 7250/ 115203 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.272029E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.177 | TFLOPs: 43.40 | +7: iteration 7260/ 115203 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.55 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.275653E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 7270/ 115203 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.55 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.263551E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.412 | TFLOPs: 43.99 | +7: iteration 7280/ 115203 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.55 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.257357E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.306 | TFLOPs: 43.98 | +7: iteration 7290/ 115203 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.269812E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.216 | TFLOPs: 43.97 | +7: iteration 7300/ 115203 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.265124E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.037 | TFLOPs: 43.95 | +7: iteration 7310/ 115203 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.269460E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.105 | TFLOPs: 43.96 | +7: iteration 7320/ 115203 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.262474E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 7330/ 115203 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.266969E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 7340/ 115203 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.263140E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.192 | TFLOPs: 43.97 | +7: iteration 7350/ 115203 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.263290E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 7360/ 115203 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.55 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.256913E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.403 | TFLOPs: 43.99 | +7: iteration 7370/ 115203 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.263770E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.154 | TFLOPs: 43.68 | +7: iteration 7380/ 115203 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.55 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.278946E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 7390/ 115203 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.259591E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.205 | TFLOPs: 43.97 | +7: iteration 7400/ 115203 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.265528E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 7410/ 115203 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.270121E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.736 | TFLOPs: 43.54 | +7: iteration 7420/ 115203 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.264741E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.502 | TFLOPs: 43.62 | +7: iteration 7430/ 115203 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.256913E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.454 | TFLOPs: 43.52 | +7: iteration 7440/ 115203 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.56 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.267483E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.669 | TFLOPs: 43.92 | +7: iteration 7450/ 115203 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.251948E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.999 | TFLOPs: 43.76 | +7: iteration 7460/ 115203 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.263075E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.962 | TFLOPs: 43.95 | +7: iteration 7470/ 115203 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.259173E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.010 | TFLOPs: 43.95 | +7: iteration 7480/ 115203 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.261774E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 7490/ 115203 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.251444E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.507 | TFLOPs: 44.00 | +7: iteration 7500/ 115203 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.252217E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.210 | TFLOPs: 43.97 | +7: iteration 7510/ 115203 | consumed samples: 1922560 | consumed tokens: 3937402880 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.263665E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.157 | TFLOPs: 43.97 | +7: iteration 7520/ 115203 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.248800E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.259 | TFLOPs: 43.98 | +7: iteration 7530/ 115203 | consumed samples: 1927680 | consumed tokens: 3947888640 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.250932E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 7540/ 115203 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.257563E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.258 | TFLOPs: 43.98 | +7: iteration 7550/ 115203 | consumed samples: 1932800 | consumed tokens: 3958374400 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.257407E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.449 | TFLOPs: 43.99 | +7: iteration 7560/ 115203 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.257071E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 7570/ 115203 | consumed samples: 1937920 | consumed tokens: 3968860160 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.241413E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.314 | TFLOPs: 43.98 | +7: iteration 7580/ 115203 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.251538E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.026 | TFLOPs: 43.95 | +7: iteration 7590/ 115203 | consumed samples: 1943040 | consumed tokens: 3979345920 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.244971E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.906 | TFLOPs: 43.94 | +7: iteration 7600/ 115203 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.239965E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.976 | TFLOPs: 43.95 | +7: iteration 7610/ 115203 | consumed samples: 1948160 | consumed tokens: 3989831680 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.244801E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.379 | TFLOPs: 43.61 | +7: iteration 7620/ 115203 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.247729E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.027 | TFLOPs: 43.95 | +7: iteration 7630/ 115203 | consumed samples: 1953280 | consumed tokens: 4000317440 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.255244E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.988 | TFLOPs: 43.95 | +7: iteration 7640/ 115203 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.250265E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.133 | TFLOPs: 43.96 | +7: iteration 7650/ 115203 | consumed samples: 1958400 | consumed tokens: 4010803200 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.242198E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.179 | TFLOPs: 43.97 | +7: iteration 7660/ 115203 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 0.55 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.235442E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.308 | TFLOPs: 43.98 | +7: iteration 7670/ 115203 | consumed samples: 1963520 | consumed tokens: 4021288960 | elapsed time per iteration (s): 0.56 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.251696E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.015 | TFLOPs: 43.95 | +7: iteration 7680/ 115203 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.264338E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.004 | TFLOPs: 43.95 | +7: iteration 7690/ 115203 | consumed samples: 1968640 | consumed tokens: 4031774720 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.248156E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.103 | TFLOPs: 43.96 | +7: iteration 7700/ 115203 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.246392E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 7710/ 115203 | consumed samples: 1973760 | consumed tokens: 4042260480 | elapsed time per iteration (s): 0.55 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.237197E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.270 | TFLOPs: 43.98 | +7: iteration 7720/ 115203 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.256115E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 7730/ 115203 | consumed samples: 1978880 | consumed tokens: 4052746240 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.235371E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 7740/ 115203 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 0.55 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.238671E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.338 | TFLOPs: 43.98 | +7: iteration 7750/ 115203 | consumed samples: 1984000 | consumed tokens: 4063232000 | elapsed time per iteration (s): 0.55 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.254271E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.322 | TFLOPs: 43.98 | +7: iteration 7760/ 115203 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.241755E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.206 | TFLOPs: 43.97 | +7: iteration 7770/ 115203 | consumed samples: 1989120 | consumed tokens: 4073717760 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.230630E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.969 | TFLOPs: 43.95 | +7: iteration 7780/ 115203 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.240738E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 7790/ 115203 | consumed samples: 1994240 | consumed tokens: 4084203520 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.233254E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.058 | TFLOPs: 43.96 | +7: iteration 7800/ 115203 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.411904E+00 | grad norm: 2.024 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.688 | TFLOPs: 43.83 | +7: iteration 7810/ 115203 | consumed samples: 1999360 | consumed tokens: 4094689280 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.525817E+00 | grad norm: 1.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.009 | TFLOPs: 43.76 | +7: iteration 7820/ 115203 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.508182E+00 | grad norm: 0.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.757 | TFLOPs: 43.83 | +7: iteration 7830/ 115203 | consumed samples: 2004480 | consumed tokens: 4105175040 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.365607E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.901 | TFLOPs: 43.94 | +7: iteration 7840/ 115203 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.315551E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.086 | TFLOPs: 43.96 | +7: iteration 7850/ 115203 | consumed samples: 2009600 | consumed tokens: 4115660800 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.275000E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.094 | TFLOPs: 43.96 | +7: iteration 7860/ 115203 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.263441E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.150 | TFLOPs: 43.97 | +7: iteration 7870/ 115203 | consumed samples: 2014720 | consumed tokens: 4126146560 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.264729E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.146 | TFLOPs: 43.97 | +7: iteration 7880/ 115203 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.262151E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.937 | TFLOPs: 43.95 | +7: iteration 7890/ 115203 | consumed samples: 2019840 | consumed tokens: 4136632320 | elapsed time per iteration (s): 0.56 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.255863E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 7900/ 115203 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.259242E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.922 | TFLOPs: 43.94 | +7: iteration 7910/ 115203 | consumed samples: 2024960 | consumed tokens: 4147118080 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.256896E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.095 | TFLOPs: 43.96 | +7: iteration 7920/ 115203 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.262582E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.200 | TFLOPs: 43.97 | +7: iteration 7930/ 115203 | consumed samples: 2030080 | consumed tokens: 4157603840 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.247282E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 7940/ 115203 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.242095E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.088 | TFLOPs: 43.96 | +7: iteration 7950/ 115203 | consumed samples: 2035200 | consumed tokens: 4168089600 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.244183E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.231 | TFLOPs: 43.97 | +7: iteration 7960/ 115203 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.246982E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 7970/ 115203 | consumed samples: 2040320 | consumed tokens: 4178575360 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.260534E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.119 | TFLOPs: 43.96 | +7: iteration 7980/ 115203 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.233084E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +7: iteration 7990/ 115203 | consumed samples: 2045440 | consumed tokens: 4189061120 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.235313E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.052 | TFLOPs: 43.96 | +0: [2023-03-16 14:05:36,948] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00019840359799331808, 0.00019840359799331808, 0.00019840359799331808], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 8000/ 115203 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.242918E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.015 | TFLOPs: 43.95 | +0: steps: 8000 loss: 3.2678 iter time (s): 0.556 samples/sec: 460.805 +7: iteration 8010/ 115203 | consumed samples: 2050560 | consumed tokens: 4199546880 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.233193E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.528 | TFLOPs: 43.91 | +7: iteration 8020/ 115203 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 0.55 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.233434E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 8030/ 115203 | consumed samples: 2055680 | consumed tokens: 4210032640 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.242971E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.001 | TFLOPs: 43.95 | +7: iteration 8040/ 115203 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.231610E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.130 | TFLOPs: 43.96 | +7: iteration 8050/ 115203 | consumed samples: 2060800 | consumed tokens: 4220518400 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.229246E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.222 | TFLOPs: 43.97 | +7: iteration 8060/ 115203 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.234556E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.100 | TFLOPs: 43.96 | +7: iteration 8070/ 115203 | consumed samples: 2065920 | consumed tokens: 4231004160 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.233204E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.120 | TFLOPs: 43.96 | +7: iteration 8080/ 115203 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.230449E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.090 | TFLOPs: 43.96 | +7: iteration 8090/ 115203 | consumed samples: 2071040 | consumed tokens: 4241489920 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.230439E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.118 | TFLOPs: 43.96 | +7: iteration 8100/ 115203 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.231400E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.228 | TFLOPs: 43.97 | +7: iteration 8110/ 115203 | consumed samples: 2076160 | consumed tokens: 4251975680 | elapsed time per iteration (s): 0.56 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.229501E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 8120/ 115203 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.228293E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.138 | TFLOPs: 43.96 | +7: iteration 8130/ 115203 | consumed samples: 2081280 | consumed tokens: 4262461440 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.239475E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.936 | TFLOPs: 43.95 | +7: iteration 8140/ 115203 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.220520E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.035 | TFLOPs: 43.95 | +7: iteration 8150/ 115203 | consumed samples: 2086400 | consumed tokens: 4272947200 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.223679E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.180 | TFLOPs: 43.97 | +7: iteration 8160/ 115203 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.234840E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.027 | TFLOPs: 43.95 | +7: iteration 8170/ 115203 | consumed samples: 2091520 | consumed tokens: 4283432960 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.227098E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.028 | TFLOPs: 43.95 | +7: iteration 8180/ 115203 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.231918E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.063 | TFLOPs: 43.96 | +7: iteration 8190/ 115203 | consumed samples: 2096640 | consumed tokens: 4293918720 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.224396E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.216 | TFLOPs: 43.97 | +7: iteration 8200/ 115203 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.218922E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.146 | TFLOPs: 43.97 | +7: iteration 8210/ 115203 | consumed samples: 2101760 | consumed tokens: 4304404480 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.217274E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.998 | TFLOPs: 43.95 | +7: iteration 8220/ 115203 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.208431E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.011 | TFLOPs: 43.95 | +7: iteration 8230/ 115203 | consumed samples: 2106880 | consumed tokens: 4314890240 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.201603E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 8240/ 115203 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.227020E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.198 | TFLOPs: 43.40 | +7: iteration 8250/ 115203 | consumed samples: 2112000 | consumed tokens: 4325376000 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.208232E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.986 | TFLOPs: 43.95 | +7: iteration 8260/ 115203 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.215358E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.899 | TFLOPs: 43.94 | +7: iteration 8270/ 115203 | consumed samples: 2117120 | consumed tokens: 4335861760 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.213494E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.923 | TFLOPs: 43.94 | +7: iteration 8280/ 115203 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.234151E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.101 | TFLOPs: 43.96 | +7: iteration 8290/ 115203 | consumed samples: 2122240 | consumed tokens: 4346347520 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.222833E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 8300/ 115203 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.233203E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.986 | TFLOPs: 43.95 | +7: iteration 8310/ 115203 | consumed samples: 2127360 | consumed tokens: 4356833280 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.213607E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.215 | TFLOPs: 43.97 | +7: iteration 8320/ 115203 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 0.56 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.218836E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.953 | TFLOPs: 43.95 | +7: iteration 8330/ 115203 | consumed samples: 2132480 | consumed tokens: 4367319040 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.211164E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.897 | TFLOPs: 43.94 | +7: iteration 8340/ 115203 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.210874E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.093 | TFLOPs: 43.96 | +7: iteration 8350/ 115203 | consumed samples: 2137600 | consumed tokens: 4377804800 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.214544E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.074 | TFLOPs: 43.96 | +7: iteration 8360/ 115203 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.204984E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.102 | TFLOPs: 43.96 | +7: iteration 8370/ 115203 | consumed samples: 2142720 | consumed tokens: 4388290560 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.219945E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.143 | TFLOPs: 43.97 | +7: iteration 8380/ 115203 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.221597E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.986 | TFLOPs: 43.95 | +7: iteration 8390/ 115203 | consumed samples: 2147840 | consumed tokens: 4398776320 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.212733E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 8400/ 115203 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.211270E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.148 | TFLOPs: 43.97 | +7: iteration 8410/ 115203 | consumed samples: 2152960 | consumed tokens: 4409262080 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.215837E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.258 | TFLOPs: 43.98 | +7: iteration 8420/ 115203 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.215769E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.022 | TFLOPs: 43.95 | +7: iteration 8430/ 115203 | consumed samples: 2158080 | consumed tokens: 4419747840 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.212709E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.150 | TFLOPs: 43.97 | +7: iteration 8440/ 115203 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.210133E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.091 | TFLOPs: 43.96 | +7: iteration 8450/ 115203 | consumed samples: 2163200 | consumed tokens: 4430233600 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.209319E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.914 | TFLOPs: 43.94 | +7: iteration 8460/ 115203 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.219484E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 8470/ 115203 | consumed samples: 2168320 | consumed tokens: 4440719360 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.210027E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.252 | TFLOPs: 43.98 | +7: iteration 8480/ 115203 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.205727E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.014 | TFLOPs: 43.95 | +7: iteration 8490/ 115203 | consumed samples: 2173440 | consumed tokens: 4451205120 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.227036E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.214 | TFLOPs: 43.97 | +7: iteration 8500/ 115203 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.216463E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 8510/ 115203 | consumed samples: 2178560 | consumed tokens: 4461690880 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.213790E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.209 | TFLOPs: 43.97 | +7: iteration 8520/ 115203 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 0.56 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.193729E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 8530/ 115203 | consumed samples: 2183680 | consumed tokens: 4472176640 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.209988E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 8540/ 115203 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.238196E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 8550/ 115203 | consumed samples: 2188800 | consumed tokens: 4482662400 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.202109E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 8560/ 115203 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.195540E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.098 | TFLOPs: 43.96 | +7: iteration 8570/ 115203 | consumed samples: 2193920 | consumed tokens: 4493148160 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.208807E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.115 | TFLOPs: 43.96 | +7: iteration 8580/ 115203 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.215048E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.094 | TFLOPs: 43.96 | +7: iteration 8590/ 115203 | consumed samples: 2199040 | consumed tokens: 4503633920 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.200681E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 8600/ 115203 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 0.55 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.211796E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.347 | TFLOPs: 43.98 | +7: iteration 8610/ 115203 | consumed samples: 2204160 | consumed tokens: 4514119680 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.219395E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.703 | TFLOPs: 43.64 | +7: iteration 8620/ 115203 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.203797E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.034 | TFLOPs: 43.95 | +7: iteration 8630/ 115203 | consumed samples: 2209280 | consumed tokens: 4524605440 | elapsed time per iteration (s): 0.55 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.209472E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.344 | TFLOPs: 43.98 | +7: iteration 8640/ 115203 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.215280E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 8650/ 115203 | consumed samples: 2214400 | consumed tokens: 4535091200 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.198265E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.118 | TFLOPs: 43.96 | +7: iteration 8660/ 115203 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.195595E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.107 | TFLOPs: 43.96 | +7: iteration 8670/ 115203 | consumed samples: 2219520 | consumed tokens: 4545576960 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.213392E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.413 | TFLOPs: 43.42 | +7: iteration 8680/ 115203 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.196233E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.253 | TFLOPs: 43.98 | +7: iteration 8690/ 115203 | consumed samples: 2224640 | consumed tokens: 4556062720 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.199594E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.226 | TFLOPs: 43.97 | +7: iteration 8700/ 115203 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 0.56 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.209003E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.085 | TFLOPs: 43.96 | +7: iteration 8710/ 115203 | consumed samples: 2229760 | consumed tokens: 4566548480 | elapsed time per iteration (s): 0.55 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.196158E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.309 | TFLOPs: 43.98 | +7: iteration 8720/ 115203 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 0.55 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.205145E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.345 | TFLOPs: 43.98 | +7: iteration 8730/ 115203 | consumed samples: 2234880 | consumed tokens: 4577034240 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.186079E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.098 | TFLOPs: 43.96 | +7: iteration 8740/ 115203 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.212039E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.072 | TFLOPs: 43.96 | +7: iteration 8750/ 115203 | consumed samples: 2240000 | consumed tokens: 4587520000 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.193689E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 8760/ 115203 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 0.55 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.203511E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.339 | TFLOPs: 43.98 | +7: iteration 8770/ 115203 | consumed samples: 2245120 | consumed tokens: 4598005760 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.192630E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 8780/ 115203 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.186141E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 8790/ 115203 | consumed samples: 2250240 | consumed tokens: 4608491520 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.193472E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.107 | TFLOPs: 43.96 | +7: iteration 8800/ 115203 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 0.55 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.194665E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 8810/ 115203 | consumed samples: 2255360 | consumed tokens: 4618977280 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.211812E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 8820/ 115203 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.191456E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 8830/ 115203 | consumed samples: 2260480 | consumed tokens: 4629463040 | elapsed time per iteration (s): 0.55 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.186633E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 8840/ 115203 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.205066E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 8850/ 115203 | consumed samples: 2265600 | consumed tokens: 4639948800 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.199409E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.184 | TFLOPs: 43.97 | +7: iteration 8860/ 115203 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.197756E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.147 | TFLOPs: 43.97 | +7: iteration 8870/ 115203 | consumed samples: 2270720 | consumed tokens: 4650434560 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.185356E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.240 | TFLOPs: 43.97 | +7: iteration 8880/ 115203 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.189248E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.257 | TFLOPs: 43.98 | +7: iteration 8890/ 115203 | consumed samples: 2275840 | consumed tokens: 4660920320 | elapsed time per iteration (s): 0.55 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.192833E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.329 | TFLOPs: 43.98 | +7: iteration 8900/ 115203 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 0.55 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.200685E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.348 | TFLOPs: 43.98 | +7: iteration 8910/ 115203 | consumed samples: 2280960 | consumed tokens: 4671406080 | elapsed time per iteration (s): 0.56 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.191916E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.057 | TFLOPs: 43.96 | +7: iteration 8920/ 115203 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.203852E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.864 | TFLOPs: 43.37 | +7: iteration 8930/ 115203 | consumed samples: 2286080 | consumed tokens: 4681891840 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.193410E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.236 | TFLOPs: 43.97 | +7: iteration 8940/ 115203 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 0.55 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.188694E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 8950/ 115203 | consumed samples: 2291200 | consumed tokens: 4692377600 | elapsed time per iteration (s): 0.55 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.195213E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 8960/ 115203 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.206274E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.047 | TFLOPs: 43.96 | +7: iteration 8970/ 115203 | consumed samples: 2296320 | consumed tokens: 4702863360 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.194185E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.258 | TFLOPs: 43.98 | +7: iteration 8980/ 115203 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.199822E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.139 | TFLOPs: 43.96 | +7: iteration 8990/ 115203 | consumed samples: 2301440 | consumed tokens: 4713349120 | elapsed time per iteration (s): 0.55 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.179567E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.477 | TFLOPs: 44.00 | +7: iteration 9000/ 115203 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.195598E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 9010/ 115203 | consumed samples: 2306560 | consumed tokens: 4723834880 | elapsed time per iteration (s): 0.55 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.181600E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.330 | TFLOPs: 43.98 | +7: iteration 9020/ 115203 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.184919E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.143 | TFLOPs: 43.96 | +7: iteration 9030/ 115203 | consumed samples: 2311680 | consumed tokens: 4734320640 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.188607E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.203 | TFLOPs: 43.97 | +7: iteration 9040/ 115203 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.184304E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.258 | TFLOPs: 43.98 | +7: iteration 9050/ 115203 | consumed samples: 2316800 | consumed tokens: 4744806400 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.189310E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.756 | TFLOPs: 43.55 | +7: iteration 9060/ 115203 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.196952E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.169 | TFLOPs: 43.97 | +7: iteration 9070/ 115203 | consumed samples: 2321920 | consumed tokens: 4755292160 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.189877E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.150 | TFLOPs: 43.97 | +7: iteration 9080/ 115203 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 0.55 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.184941E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.322 | TFLOPs: 43.98 | +7: iteration 9090/ 115203 | consumed samples: 2327040 | consumed tokens: 4765777920 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.191724E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.202 | TFLOPs: 43.97 | +7: iteration 9100/ 115203 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 0.56 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.172038E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.126 | TFLOPs: 43.96 | +7: iteration 9110/ 115203 | consumed samples: 2332160 | consumed tokens: 4776263680 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.186933E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 9120/ 115203 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.190344E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.089 | TFLOPs: 43.96 | +7: iteration 9130/ 115203 | consumed samples: 2337280 | consumed tokens: 4786749440 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.169555E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.047 | TFLOPs: 43.96 | +7: iteration 9140/ 115203 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.177118E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.092 | TFLOPs: 43.96 | +7: iteration 9150/ 115203 | consumed samples: 2342400 | consumed tokens: 4797235200 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.186541E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.946 | TFLOPs: 43.95 | +7: iteration 9160/ 115203 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.185985E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.085 | TFLOPs: 43.96 | +7: iteration 9170/ 115203 | consumed samples: 2347520 | consumed tokens: 4807720960 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.177245E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.943 | TFLOPs: 43.95 | +7: iteration 9180/ 115203 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.181310E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 9190/ 115203 | consumed samples: 2352640 | consumed tokens: 4818206720 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.173113E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.081 | TFLOPs: 43.96 | +7: iteration 9200/ 115203 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.185334E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.108 | TFLOPs: 43.96 | +7: iteration 9210/ 115203 | consumed samples: 2357760 | consumed tokens: 4828692480 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.187468E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.997 | TFLOPs: 43.95 | +7: iteration 9220/ 115203 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.170386E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +7: iteration 9230/ 115203 | consumed samples: 2362880 | consumed tokens: 4839178240 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.182032E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.016 | TFLOPs: 43.95 | +7: iteration 9240/ 115203 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.176960E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.937 | TFLOPs: 43.95 | +7: iteration 9250/ 115203 | consumed samples: 2368000 | consumed tokens: 4849664000 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.186601E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.015 | TFLOPs: 43.95 | +7: iteration 9260/ 115203 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.181972E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.202 | TFLOPs: 43.97 | +7: iteration 9270/ 115203 | consumed samples: 2373120 | consumed tokens: 4860149760 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.187550E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.142 | TFLOPs: 43.96 | +7: iteration 9280/ 115203 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 0.56 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.174673E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.153 | TFLOPs: 43.97 | +7: iteration 9290/ 115203 | consumed samples: 2378240 | consumed tokens: 4870635520 | elapsed time per iteration (s): 0.55 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.172327E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.393 | TFLOPs: 43.99 | +7: iteration 9300/ 115203 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.180206E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.252 | TFLOPs: 43.98 | +7: iteration 9310/ 115203 | consumed samples: 2383360 | consumed tokens: 4881121280 | elapsed time per iteration (s): 0.55 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.181971E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.330 | TFLOPs: 43.98 | +7: iteration 9320/ 115203 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.162876E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 9330/ 115203 | consumed samples: 2388480 | consumed tokens: 4891607040 | elapsed time per iteration (s): 0.55 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.187237E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.378 | TFLOPs: 43.99 | +7: iteration 9340/ 115203 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.183355E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.146 | TFLOPs: 43.97 | +7: iteration 9350/ 115203 | consumed samples: 2393600 | consumed tokens: 4902092800 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.190380E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.245 | TFLOPs: 43.97 | +7: iteration 9360/ 115203 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.170809E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.584 | TFLOPs: 43.91 | +7: iteration 9370/ 115203 | consumed samples: 2398720 | consumed tokens: 4912578560 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.175799E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.502 | TFLOPs: 43.90 | +7: iteration 9380/ 115203 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.178666E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.372 | TFLOPs: 43.89 | +7: iteration 9390/ 115203 | consumed samples: 2403840 | consumed tokens: 4923064320 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.174066E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 9400/ 115203 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.170523E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.525 | TFLOPs: 43.81 | +7: iteration 9410/ 115203 | consumed samples: 2408960 | consumed tokens: 4933550080 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.177463E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.005 | TFLOPs: 43.76 | +7: iteration 9420/ 115203 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.176330E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.077 | TFLOPs: 43.96 | +7: iteration 9430/ 115203 | consumed samples: 2414080 | consumed tokens: 4944035840 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.180147E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.758 | TFLOPs: 43.93 | +7: iteration 9440/ 115203 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.185647E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.381 | TFLOPs: 43.51 | +7: iteration 9450/ 115203 | consumed samples: 2419200 | consumed tokens: 4954521600 | elapsed time per iteration (s): 0.56 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.180986E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.067 | TFLOPs: 43.96 | +7: iteration 9460/ 115203 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 0.55 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.164154E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 9470/ 115203 | consumed samples: 2424320 | consumed tokens: 4965007360 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.186842E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.038 | TFLOPs: 43.95 | +7: iteration 9480/ 115203 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.174266E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 9490/ 115203 | consumed samples: 2429440 | consumed tokens: 4975493120 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.181087E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.982 | TFLOPs: 43.95 | +7: iteration 9500/ 115203 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.184411E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.089 | TFLOPs: 43.96 | +7: iteration 9510/ 115203 | consumed samples: 2434560 | consumed tokens: 4985978880 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.172402E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.060 | TFLOPs: 43.96 | +7: iteration 9520/ 115203 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.172134E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.096 | TFLOPs: 43.96 | +7: iteration 9530/ 115203 | consumed samples: 2439680 | consumed tokens: 4996464640 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.168247E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.081 | TFLOPs: 43.96 | +7: iteration 9540/ 115203 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.162310E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.953 | TFLOPs: 43.95 | +7: iteration 9550/ 115203 | consumed samples: 2444800 | consumed tokens: 5006950400 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.153572E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.121 | TFLOPs: 43.96 | +7: iteration 9560/ 115203 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.168481E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.196 | TFLOPs: 43.97 | +7: iteration 9570/ 115203 | consumed samples: 2449920 | consumed tokens: 5017436160 | elapsed time per iteration (s): 0.55 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.157572E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.309 | TFLOPs: 43.98 | +7: iteration 9580/ 115203 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 0.55 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.161267E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 9590/ 115203 | consumed samples: 2455040 | consumed tokens: 5027921920 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.174314E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 9600/ 115203 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.164101E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.183 | TFLOPs: 43.97 | +7: iteration 9610/ 115203 | consumed samples: 2460160 | consumed tokens: 5038407680 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.167030E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.170 | TFLOPs: 43.97 | +7: iteration 9620/ 115203 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 0.55 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.169225E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 9630/ 115203 | consumed samples: 2465280 | consumed tokens: 5048893440 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.162639E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.077 | TFLOPs: 43.96 | +7: iteration 9640/ 115203 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 0.56 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.162456E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.034 | TFLOPs: 43.95 | +7: iteration 9650/ 115203 | consumed samples: 2470400 | consumed tokens: 5059379200 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.161667E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.143 | TFLOPs: 43.96 | +7: iteration 9660/ 115203 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.165093E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.086 | TFLOPs: 43.96 | +7: iteration 9670/ 115203 | consumed samples: 2475520 | consumed tokens: 5069864960 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.175463E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.229 | TFLOPs: 43.97 | +7: iteration 9680/ 115203 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.174785E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.151 | TFLOPs: 43.97 | +7: iteration 9690/ 115203 | consumed samples: 2480640 | consumed tokens: 5080350720 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.175154E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.041 | TFLOPs: 43.48 | +7: iteration 9700/ 115203 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.160824E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.699 | TFLOPs: 43.92 | +7: iteration 9710/ 115203 | consumed samples: 2485760 | consumed tokens: 5090836480 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.161857E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.774 | TFLOPs: 43.93 | +7: iteration 9720/ 115203 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.181504E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 9730/ 115203 | consumed samples: 2490880 | consumed tokens: 5101322240 | elapsed time per iteration (s): 0.55 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.160355E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.307 | TFLOPs: 43.98 | +7: iteration 9740/ 115203 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 0.55 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.160877E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.352 | TFLOPs: 43.98 | +7: iteration 9750/ 115203 | consumed samples: 2496000 | consumed tokens: 5111808000 | elapsed time per iteration (s): 0.55 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.155602E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 9760/ 115203 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 0.55 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.162436E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.339 | TFLOPs: 43.98 | +7: iteration 9770/ 115203 | consumed samples: 2501120 | consumed tokens: 5122293760 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.157499E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.223 | TFLOPs: 43.97 | +7: iteration 9780/ 115203 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.147786E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 9790/ 115203 | consumed samples: 2506240 | consumed tokens: 5132779520 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.153161E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.159 | TFLOPs: 43.97 | +7: iteration 9800/ 115203 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 0.55 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.147908E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.298 | TFLOPs: 43.98 | +7: iteration 9810/ 115203 | consumed samples: 2511360 | consumed tokens: 5143265280 | elapsed time per iteration (s): 0.56 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.157642E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.104 | TFLOPs: 43.96 | +7: iteration 9820/ 115203 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.153606E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +7: iteration 9830/ 115203 | consumed samples: 2516480 | consumed tokens: 5153751040 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.170256E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.084 | TFLOPs: 43.96 | +7: iteration 9840/ 115203 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 0.55 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.163341E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.271 | TFLOPs: 43.98 | +7: iteration 9850/ 115203 | consumed samples: 2521600 | consumed tokens: 5164236800 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.168384E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.907 | TFLOPs: 43.94 | +7: iteration 9860/ 115203 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 0.55 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.156232E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.262 | TFLOPs: 43.98 | +7: iteration 9870/ 115203 | consumed samples: 2526720 | consumed tokens: 5174722560 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.153388E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.114 | TFLOPs: 43.96 | +7: iteration 9880/ 115203 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.166129E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.222 | TFLOPs: 43.97 | +7: iteration 9890/ 115203 | consumed samples: 2531840 | consumed tokens: 5185208320 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.161839E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.140 | TFLOPs: 43.96 | +7: iteration 9900/ 115203 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.160129E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.025 | TFLOPs: 43.95 | +7: iteration 9910/ 115203 | consumed samples: 2536960 | consumed tokens: 5195694080 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.155269E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 9920/ 115203 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.152498E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 9930/ 115203 | consumed samples: 2542080 | consumed tokens: 5206179840 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.157576E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 9940/ 115203 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.162694E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.824 | TFLOPs: 43.93 | +7: iteration 9950/ 115203 | consumed samples: 2547200 | consumed tokens: 5216665600 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.152675E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.124 | TFLOPs: 43.96 | +7: iteration 9960/ 115203 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.147554E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.142 | TFLOPs: 43.96 | +7: iteration 9970/ 115203 | consumed samples: 2552320 | consumed tokens: 5227151360 | elapsed time per iteration (s): 0.55 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.152427E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.345 | TFLOPs: 43.98 | +7: iteration 9980/ 115203 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 0.56 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.159901E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 9990/ 115203 | consumed samples: 2557440 | consumed tokens: 5237637120 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.140722E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.252 | TFLOPs: 43.98 | +0: [2023-03-16 14:24:07,768] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00019734023411853413, 0.00019734023411853413, 0.00019734023411853413], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 10000/ 115203 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.153118E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.233 | TFLOPs: 43.97 | +0: steps: 10000 loss: 3.1675 iter time (s): 0.553 samples/sec: 462.665 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 10000 | lm loss value: 3.404969E+00 | lm loss PPL: 3.011337E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 10000 to checkpoints_421m60b400m +0: [2023-03-16 14:24:08,375] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! +0: [2023-03-16 14:24:09,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_01-model_00-model_states.pt... +0: [2023-03-16 14:24:09,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_01-model_00-model_states.pt. +0: [2023-03-16 14:24:09,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_03-model_00-model_states.pt... +0: [2023-03-16 14:24:09,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_03-model_00-model_states.pt. +0: [2023-03-16 14:24:09,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_04-model_00-model_states.pt... +0: [2023-03-16 14:24:09,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_04-model_00-model_states.pt. +0: [2023-03-16 14:24:09,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_05-model_00-model_states.pt... +0: [2023-03-16 14:24:09,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_05-model_00-model_states.pt. +0: [2023-03-16 14:24:09,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_06-model_00-model_states.pt... +0: [2023-03-16 14:24:09,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_06-model_00-model_states.pt. +0: [2023-03-16 14:24:09,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_07-model_00-model_states.pt... +0: [2023-03-16 14:24:09,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_07-model_00-model_states.pt. +0: [2023-03-16 14:24:09,410] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_08-model_00-model_states.pt... +0: [2023-03-16 14:24:09,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_08-model_00-model_states.pt. +0: [2023-03-16 14:24:09,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_09-model_00-model_states.pt... +0: [2023-03-16 14:24:09,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_09-model_00-model_states.pt. +0: [2023-03-16 14:24:09,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_10-model_00-model_states.pt... +0: [2023-03-16 14:24:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_10-model_00-model_states.pt. +0: [2023-03-16 14:24:09,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_11-model_00-model_states.pt... +0: [2023-03-16 14:24:09,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_11-model_00-model_states.pt. +0: [2023-03-16 14:24:09,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_12-model_00-model_states.pt... +0: [2023-03-16 14:24:09,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_12-model_00-model_states.pt. +0: [2023-03-16 14:24:09,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_13-model_00-model_states.pt... +0: [2023-03-16 14:24:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_13-model_00-model_states.pt. +0: [2023-03-16 14:24:09,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_14-model_00-model_states.pt... +0: [2023-03-16 14:24:09,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_14-model_00-model_states.pt. +0: [2023-03-16 14:24:09,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_15-model_00-model_states.pt... +0: [2023-03-16 14:24:09,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_15-model_00-model_states.pt. +0: [2023-03-16 14:24:09,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_16-model_00-model_states.pt... +0: [2023-03-16 14:24:09,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_16-model_00-model_states.pt. +0: [2023-03-16 14:24:09,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_17-model_00-model_states.pt... +0: [2023-03-16 14:24:09,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_17-model_00-model_states.pt. +0: [2023-03-16 14:24:09,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_18-model_00-model_states.pt... +0: [2023-03-16 14:24:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_18-model_00-model_states.pt. +0: [2023-03-16 14:24:09,866] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_19-model_00-model_states.pt... +0: [2023-03-16 14:24:09,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_19-model_00-model_states.pt. +0: [2023-03-16 14:24:09,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_20-model_00-model_states.pt... +0: [2023-03-16 14:24:09,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_20-model_00-model_states.pt. +0: [2023-03-16 14:24:09,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/layer_22-model_00-model_states.pt... +0: [2023-03-16 14:24:09,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/layer_22-model_00-model_states.pt. +0: [2023-03-16 14:24:09,954] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step10000/mp_rank_00_model_states.pt +0: [2023-03-16 14:24:09,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/mp_rank_00_model_states.pt... +0: [2023-03-16 14:24:09,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/mp_rank_00_model_states.pt. +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 14:24:09,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-16 14:24:10,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 14:24:10,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 14:24:10,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-16 14:24:10,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-16 14:24:10,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-16 14:24:10,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 14:24:10,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 14:24:10,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-16 14:24:10,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-16 14:24:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-16 14:24:10,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 14:24:10,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-16 14:24:10,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-16 14:24:10,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 14:24:10,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: successfully saved checkpoint at iteration 10000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1873.40 +7: iteration 10010/ 115203 | consumed samples: 2562560 | consumed tokens: 5248122880 | elapsed time per iteration (s): 0.82 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.163919E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 313.267 | TFLOPs: 29.87 | +7: iteration 10020/ 115203 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 0.55 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.159833E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 10030/ 115203 | consumed samples: 2567680 | consumed tokens: 5258608640 | elapsed time per iteration (s): 0.55 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.159866E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 10040/ 115203 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 0.55 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.161084E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 10050/ 115203 | consumed samples: 2572800 | consumed tokens: 5269094400 | elapsed time per iteration (s): 0.55 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.151085E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.305 | TFLOPs: 43.98 | +7: iteration 10060/ 115203 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.171162E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.132 | TFLOPs: 43.96 | +7: iteration 10070/ 115203 | consumed samples: 2577920 | consumed tokens: 5279580160 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.154787E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.116 | TFLOPs: 43.96 | +7: iteration 10080/ 115203 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.156624E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 10090/ 115203 | consumed samples: 2583040 | consumed tokens: 5290065920 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.155237E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 10100/ 115203 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.145781E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.115 | TFLOPs: 43.96 | +7: iteration 10110/ 115203 | consumed samples: 2588160 | consumed tokens: 5300551680 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.152212E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.197 | TFLOPs: 43.97 | +7: iteration 10120/ 115203 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.156543E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.050 | TFLOPs: 43.96 | +7: iteration 10130/ 115203 | consumed samples: 2593280 | consumed tokens: 5311037440 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.160699E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.182 | TFLOPs: 43.97 | +7: iteration 10140/ 115203 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 0.56 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.163990E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 10150/ 115203 | consumed samples: 2598400 | consumed tokens: 5321523200 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.147558E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 10160/ 115203 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.158083E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.952 | TFLOPs: 43.95 | +7: iteration 10170/ 115203 | consumed samples: 2603520 | consumed tokens: 5332008960 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.146530E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.176 | TFLOPs: 43.97 | +7: iteration 10180/ 115203 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.160649E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.207 | TFLOPs: 43.97 | +7: iteration 10190/ 115203 | consumed samples: 2608640 | consumed tokens: 5342494720 | elapsed time per iteration (s): 0.55 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.162963E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 10200/ 115203 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.158678E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 10210/ 115203 | consumed samples: 2613760 | consumed tokens: 5352980480 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.147666E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.147 | TFLOPs: 43.97 | +7: iteration 10220/ 115203 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.144683E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.110 | TFLOPs: 43.96 | +7: iteration 10230/ 115203 | consumed samples: 2618880 | consumed tokens: 5363466240 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.148949E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.245 | TFLOPs: 43.97 | +7: iteration 10240/ 115203 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.137726E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.038 | TFLOPs: 43.95 | +7: iteration 10250/ 115203 | consumed samples: 2624000 | consumed tokens: 5373952000 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.154069E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.974 | TFLOPs: 43.95 | +7: iteration 10260/ 115203 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.143670E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.953 | TFLOPs: 43.95 | +7: iteration 10270/ 115203 | consumed samples: 2629120 | consumed tokens: 5384437760 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.138412E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 10280/ 115203 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 0.55 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.141366E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.370 | TFLOPs: 43.99 | +7: iteration 10290/ 115203 | consumed samples: 2634240 | consumed tokens: 5394923520 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.154182E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.150 | TFLOPs: 43.97 | +7: iteration 10300/ 115203 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 0.55 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.152318E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.348 | TFLOPs: 43.98 | +7: iteration 10310/ 115203 | consumed samples: 2639360 | consumed tokens: 5405409280 | elapsed time per iteration (s): 0.56 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.141814E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.403 | TFLOPs: 43.70 | +7: iteration 10320/ 115203 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.152450E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.622 | TFLOPs: 43.63 | +7: iteration 10330/ 115203 | consumed samples: 2644480 | consumed tokens: 5415895040 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.142503E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 10340/ 115203 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.138599E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.170 | TFLOPs: 43.97 | +7: iteration 10350/ 115203 | consumed samples: 2649600 | consumed tokens: 5426380800 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.135568E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.085 | TFLOPs: 43.96 | +7: iteration 10360/ 115203 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.140566E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.172 | TFLOPs: 43.97 | +7: iteration 10370/ 115203 | consumed samples: 2654720 | consumed tokens: 5436866560 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.136013E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.192 | TFLOPs: 43.97 | +7: iteration 10380/ 115203 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.124237E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.109 | TFLOPs: 43.96 | +7: iteration 10390/ 115203 | consumed samples: 2659840 | consumed tokens: 5447352320 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.143585E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.065 | TFLOPs: 43.96 | +7: iteration 10400/ 115203 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.141929E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.025 | TFLOPs: 43.95 | +7: iteration 10410/ 115203 | consumed samples: 2664960 | consumed tokens: 5457838080 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.143756E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.162 | TFLOPs: 43.97 | +7: iteration 10420/ 115203 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.148346E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.136 | TFLOPs: 43.96 | +7: iteration 10430/ 115203 | consumed samples: 2670080 | consumed tokens: 5468323840 | elapsed time per iteration (s): 0.55 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.141527E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.263 | TFLOPs: 43.98 | +7: iteration 10440/ 115203 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.133784E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.211 | TFLOPs: 43.97 | +7: iteration 10450/ 115203 | consumed samples: 2675200 | consumed tokens: 5478809600 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.139836E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.182 | TFLOPs: 43.97 | +7: iteration 10460/ 115203 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 0.55 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.128170E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.280 | TFLOPs: 43.98 | +7: iteration 10470/ 115203 | consumed samples: 2680320 | consumed tokens: 5489295360 | elapsed time per iteration (s): 0.56 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.142229E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.093 | TFLOPs: 43.96 | +7: iteration 10480/ 115203 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.129263E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 10490/ 115203 | consumed samples: 2685440 | consumed tokens: 5499781120 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.145005E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 10500/ 115203 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.135139E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 10510/ 115203 | consumed samples: 2690560 | consumed tokens: 5510266880 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.142513E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.200 | TFLOPs: 43.97 | +7: iteration 10520/ 115203 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.137004E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.244 | TFLOPs: 43.97 | +7: iteration 10530/ 115203 | consumed samples: 2695680 | consumed tokens: 5520752640 | elapsed time per iteration (s): 0.55 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.140776E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.314 | TFLOPs: 43.98 | +7: iteration 10540/ 115203 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.146716E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.252 | TFLOPs: 43.98 | +7: iteration 10550/ 115203 | consumed samples: 2700800 | consumed tokens: 5531238400 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.148123E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.184 | TFLOPs: 43.97 | +7: iteration 10560/ 115203 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.129703E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.184 | TFLOPs: 43.97 | +7: iteration 10570/ 115203 | consumed samples: 2705920 | consumed tokens: 5541724160 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.134742E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.205 | TFLOPs: 43.97 | +7: iteration 10580/ 115203 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 0.55 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.130577E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.273 | TFLOPs: 43.98 | +7: iteration 10590/ 115203 | consumed samples: 2711040 | consumed tokens: 5552209920 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.130957E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.715 | TFLOPs: 43.45 | +7: iteration 10600/ 115203 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 0.55 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.133386E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.286 | TFLOPs: 43.98 | +7: iteration 10610/ 115203 | consumed samples: 2716160 | consumed tokens: 5562695680 | elapsed time per iteration (s): 0.56 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.148223E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.142 | TFLOPs: 43.96 | +7: iteration 10620/ 115203 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 0.55 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.136572E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 10630/ 115203 | consumed samples: 2721280 | consumed tokens: 5573181440 | elapsed time per iteration (s): 0.55 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.141493E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 10640/ 115203 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.138481E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.483 | TFLOPs: 44.00 | +7: iteration 10650/ 115203 | consumed samples: 2726400 | consumed tokens: 5583667200 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.137264E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.226 | TFLOPs: 43.97 | +7: iteration 10660/ 115203 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.125872E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.700 | TFLOPs: 43.83 | +7: iteration 10670/ 115203 | consumed samples: 2731520 | consumed tokens: 5594152960 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.122023E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.533 | TFLOPs: 44.00 | +7: iteration 10680/ 115203 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.127183E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.409 | TFLOPs: 43.99 | +7: iteration 10690/ 115203 | consumed samples: 2736640 | consumed tokens: 5604638720 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.137498E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 10700/ 115203 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.156693E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.211 | TFLOPs: 43.97 | +7: iteration 10710/ 115203 | consumed samples: 2741760 | consumed tokens: 5615124480 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.127747E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.295 | TFLOPs: 43.98 | +7: iteration 10720/ 115203 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.128897E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.311 | TFLOPs: 43.98 | +7: iteration 10730/ 115203 | consumed samples: 2746880 | consumed tokens: 5625610240 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.132589E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 10740/ 115203 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.136041E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 10750/ 115203 | consumed samples: 2752000 | consumed tokens: 5636096000 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.141316E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.214 | TFLOPs: 43.97 | +7: iteration 10760/ 115203 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.123610E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.275 | TFLOPs: 43.50 | +7: iteration 10770/ 115203 | consumed samples: 2757120 | consumed tokens: 5646581760 | elapsed time per iteration (s): 0.56 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.139018E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.071 | TFLOPs: 43.96 | +7: iteration 10780/ 115203 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 0.55 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.130798E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 10790/ 115203 | consumed samples: 2762240 | consumed tokens: 5657067520 | elapsed time per iteration (s): 0.55 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.131054E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 10800/ 115203 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 0.55 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.136703E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.303 | TFLOPs: 43.98 | +7: iteration 10810/ 115203 | consumed samples: 2767360 | consumed tokens: 5667553280 | elapsed time per iteration (s): 0.55 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.143713E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 10820/ 115203 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.137364E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.182 | TFLOPs: 43.78 | +7: iteration 10830/ 115203 | consumed samples: 2772480 | consumed tokens: 5678039040 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.127599E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.226 | TFLOPs: 43.97 | +7: iteration 10840/ 115203 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.134948E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.203 | TFLOPs: 43.97 | +7: iteration 10850/ 115203 | consumed samples: 2777600 | consumed tokens: 5688524800 | elapsed time per iteration (s): 0.55 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.118128E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.282 | TFLOPs: 43.98 | +7: iteration 10860/ 115203 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.136762E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 10870/ 115203 | consumed samples: 2782720 | consumed tokens: 5699010560 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.123799E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.983 | TFLOPs: 43.95 | +7: iteration 10880/ 115203 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.138276E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.981 | TFLOPs: 43.95 | +7: iteration 10890/ 115203 | consumed samples: 2787840 | consumed tokens: 5709496320 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.127382E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.063 | TFLOPs: 43.96 | +7: iteration 10900/ 115203 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.119492E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.071 | TFLOPs: 43.96 | +7: iteration 10910/ 115203 | consumed samples: 2792960 | consumed tokens: 5719982080 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.127122E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.138 | TFLOPs: 43.96 | +7: iteration 10920/ 115203 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.122184E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.032 | TFLOPs: 43.95 | +7: iteration 10930/ 115203 | consumed samples: 2798080 | consumed tokens: 5730467840 | elapsed time per iteration (s): 0.56 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.126657E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.116 | TFLOPs: 43.96 | +7: iteration 10940/ 115203 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.118559E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.030 | TFLOPs: 43.95 | +7: iteration 10950/ 115203 | consumed samples: 2803200 | consumed tokens: 5740953600 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.574544E+00 | grad norm: 6.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.977 | TFLOPs: 43.76 | +7: iteration 10960/ 115203 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.725967E+00 | grad norm: 2.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.046 | TFLOPs: 43.77 | +7: iteration 10970/ 115203 | consumed samples: 2808320 | consumed tokens: 5751439360 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.360424E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.868 | TFLOPs: 43.84 | +7: iteration 10980/ 115203 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.229432E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.161 | TFLOPs: 43.97 | +7: iteration 10990/ 115203 | consumed samples: 2813440 | consumed tokens: 5761925120 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.182430E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 11000/ 115203 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.160032E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.035 | TFLOPs: 43.95 | +7: iteration 11010/ 115203 | consumed samples: 2818560 | consumed tokens: 5772410880 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.154587E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.092 | TFLOPs: 43.96 | +7: iteration 11020/ 115203 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.139330E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.105 | TFLOPs: 43.96 | +7: iteration 11030/ 115203 | consumed samples: 2823680 | consumed tokens: 5782896640 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.139478E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.197 | TFLOPs: 43.97 | +7: iteration 11040/ 115203 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 0.55 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.151150E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +7: iteration 11050/ 115203 | consumed samples: 2828800 | consumed tokens: 5793382400 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.144521E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.196 | TFLOPs: 43.97 | +7: iteration 11060/ 115203 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.140054E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.199 | TFLOPs: 43.97 | +7: iteration 11070/ 115203 | consumed samples: 2833920 | consumed tokens: 5803868160 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.126003E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.123 | TFLOPs: 43.96 | +7: iteration 11080/ 115203 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 0.56 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.125154E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 11090/ 115203 | consumed samples: 2839040 | consumed tokens: 5814353920 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.135196E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 11100/ 115203 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 0.55 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.127438E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.329 | TFLOPs: 43.98 | +7: iteration 11110/ 115203 | consumed samples: 2844160 | consumed tokens: 5824839680 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.147717E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.159 | TFLOPs: 43.97 | +7: iteration 11120/ 115203 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.121066E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 11130/ 115203 | consumed samples: 2849280 | consumed tokens: 5835325440 | elapsed time per iteration (s): 0.55 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.124230E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.409 | TFLOPs: 43.99 | +7: iteration 11140/ 115203 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.139596E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 11150/ 115203 | consumed samples: 2854400 | consumed tokens: 5845811200 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.126574E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 11160/ 115203 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.126745E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.229 | TFLOPs: 43.97 | +7: iteration 11170/ 115203 | consumed samples: 2859520 | consumed tokens: 5856296960 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.123422E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.124 | TFLOPs: 43.96 | +7: iteration 11180/ 115203 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 0.55 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.122926E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.286 | TFLOPs: 43.98 | +7: iteration 11190/ 115203 | consumed samples: 2864640 | consumed tokens: 5866782720 | elapsed time per iteration (s): 0.55 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.116486E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 11200/ 115203 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.116487E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.040 | TFLOPs: 43.38 | +7: iteration 11210/ 115203 | consumed samples: 2869760 | consumed tokens: 5877268480 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.107682E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.212 | TFLOPs: 43.97 | +7: iteration 11220/ 115203 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 0.55 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.127026E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.282 | TFLOPs: 43.98 | +7: iteration 11230/ 115203 | consumed samples: 2874880 | consumed tokens: 5887754240 | elapsed time per iteration (s): 0.56 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.126015E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.090 | TFLOPs: 43.96 | +7: iteration 11240/ 115203 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.113482E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 11250/ 115203 | consumed samples: 2880000 | consumed tokens: 5898240000 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.116378E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 11260/ 115203 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.118847E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.465 | TFLOPs: 44.00 | +7: iteration 11270/ 115203 | consumed samples: 2885120 | consumed tokens: 5908725760 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.119997E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.513 | TFLOPs: 44.00 | +7: iteration 11280/ 115203 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.109799E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 11290/ 115203 | consumed samples: 2890240 | consumed tokens: 5919211520 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.122764E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.413 | TFLOPs: 43.99 | +7: iteration 11300/ 115203 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.129938E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.427 | TFLOPs: 43.99 | +7: iteration 11310/ 115203 | consumed samples: 2895360 | consumed tokens: 5929697280 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.118997E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.512 | TFLOPs: 44.00 | +7: iteration 11320/ 115203 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 0.56 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.110341E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 11330/ 115203 | consumed samples: 2900480 | consumed tokens: 5940183040 | elapsed time per iteration (s): 0.55 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.114592E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.432 | TFLOPs: 43.99 | +7: iteration 11340/ 115203 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 0.56 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.108932E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.217 | TFLOPs: 43.97 | +7: iteration 11350/ 115203 | consumed samples: 2905600 | consumed tokens: 5950668800 | elapsed time per iteration (s): 0.56 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.120801E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.108 | TFLOPs: 43.96 | +7: iteration 11360/ 115203 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 0.56 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.116104E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.974 | TFLOPs: 43.95 | +7: iteration 11370/ 115203 | consumed samples: 2910720 | consumed tokens: 5961154560 | elapsed time per iteration (s): 0.56 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.114474E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.136 | TFLOPs: 43.87 | +7: iteration 11380/ 115203 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 0.57 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.107173E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.287 | TFLOPs: 42.93 | +7: iteration 11390/ 115203 | consumed samples: 2915840 | consumed tokens: 5971640320 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.115869E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.579 | TFLOPs: 43.82 | +7: iteration 11400/ 115203 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.114904E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.069 | TFLOPs: 43.86 | +7: iteration 11410/ 115203 | consumed samples: 2920960 | consumed tokens: 5982126080 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.114396E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.084 | TFLOPs: 43.77 | +7: iteration 11420/ 115203 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.115309E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.078 | TFLOPs: 43.86 | +7: iteration 11430/ 115203 | consumed samples: 2926080 | consumed tokens: 5992611840 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.116696E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.521 | TFLOPs: 43.91 | +7: iteration 11440/ 115203 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.114328E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.107 | TFLOPs: 43.96 | +7: iteration 11450/ 115203 | consumed samples: 2931200 | consumed tokens: 6003097600 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.125721E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 11460/ 115203 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.093692E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 11470/ 115203 | consumed samples: 2936320 | consumed tokens: 6013583360 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.123062E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.987 | TFLOPs: 43.95 | +7: iteration 11480/ 115203 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.117347E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.100 | TFLOPs: 43.96 | +7: iteration 11490/ 115203 | consumed samples: 2941440 | consumed tokens: 6024069120 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.112850E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.149 | TFLOPs: 43.97 | +7: iteration 11500/ 115203 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.104904E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.034 | TFLOPs: 43.95 | +7: iteration 11510/ 115203 | consumed samples: 2946560 | consumed tokens: 6034554880 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.114826E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.979 | TFLOPs: 43.95 | +7: iteration 11520/ 115203 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 0.56 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.134538E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.965 | TFLOPs: 43.95 | +7: iteration 11530/ 115203 | consumed samples: 2951680 | consumed tokens: 6045040640 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.110007E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.281 | TFLOPs: 43.98 | +7: iteration 11540/ 115203 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.113827E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.960 | TFLOPs: 43.95 | +7: iteration 11550/ 115203 | consumed samples: 2956800 | consumed tokens: 6055526400 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.115612E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.411 | TFLOPs: 43.99 | +7: iteration 11560/ 115203 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.100696E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 11570/ 115203 | consumed samples: 2961920 | consumed tokens: 6066012160 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.100731E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.431 | TFLOPs: 43.99 | +7: iteration 11580/ 115203 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.088189E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.206 | TFLOPs: 43.97 | +7: iteration 11590/ 115203 | consumed samples: 2967040 | consumed tokens: 6076497920 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.126846E+00 | grad norm: 1.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.429 | TFLOPs: 43.90 | +7: iteration 11600/ 115203 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.201488E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.479 | TFLOPs: 43.90 | +7: iteration 11610/ 115203 | consumed samples: 2972160 | consumed tokens: 6086983680 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.163803E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 11620/ 115203 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 0.56 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.143342E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 11630/ 115203 | consumed samples: 2977280 | consumed tokens: 6097469440 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.116031E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.312 | TFLOPs: 43.98 | +7: iteration 11640/ 115203 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.127502E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +7: iteration 11650/ 115203 | consumed samples: 2982400 | consumed tokens: 6107955200 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.107932E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.385 | TFLOPs: 43.99 | +7: iteration 11660/ 115203 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 0.55 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.114056E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.401 | TFLOPs: 43.99 | +7: iteration 11670/ 115203 | consumed samples: 2987520 | consumed tokens: 6118440960 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.107059E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.164 | TFLOPs: 43.97 | +7: iteration 11680/ 115203 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.106479E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 11690/ 115203 | consumed samples: 2992640 | consumed tokens: 6128926720 | elapsed time per iteration (s): 0.55 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.114627E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.284 | TFLOPs: 43.98 | +7: iteration 11700/ 115203 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 0.55 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.111360E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.352 | TFLOPs: 43.98 | +7: iteration 11710/ 115203 | consumed samples: 2997760 | consumed tokens: 6139412480 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.108882E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.199 | TFLOPs: 43.97 | +7: iteration 11720/ 115203 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 0.55 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.103585E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.274 | TFLOPs: 43.98 | +7: iteration 11730/ 115203 | consumed samples: 3002880 | consumed tokens: 6149898240 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.100189E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.198 | TFLOPs: 43.97 | +7: iteration 11740/ 115203 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 0.55 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.117805E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 11750/ 115203 | consumed samples: 3008000 | consumed tokens: 6160384000 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.100579E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.250 | TFLOPs: 43.98 | +7: iteration 11760/ 115203 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.112648E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.243 | TFLOPs: 43.97 | +7: iteration 11770/ 115203 | consumed samples: 3013120 | consumed tokens: 6170869760 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.100241E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.238 | TFLOPs: 43.97 | +7: iteration 11780/ 115203 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 0.55 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.114689E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.400 | TFLOPs: 43.99 | +7: iteration 11790/ 115203 | consumed samples: 3018240 | consumed tokens: 6181355520 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.113218E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 11800/ 115203 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 0.56 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.091525E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.163 | TFLOPs: 43.97 | +7: iteration 11810/ 115203 | consumed samples: 3023360 | consumed tokens: 6191841280 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.110475E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.179 | TFLOPs: 43.97 | +7: iteration 11820/ 115203 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.099575E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.135 | TFLOPs: 43.96 | +7: iteration 11830/ 115203 | consumed samples: 3028480 | consumed tokens: 6202327040 | elapsed time per iteration (s): 0.55 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.100186E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.311 | TFLOPs: 43.98 | +7: iteration 11840/ 115203 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.118895E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.221 | TFLOPs: 43.97 | +7: iteration 11850/ 115203 | consumed samples: 3033600 | consumed tokens: 6212812800 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.104794E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.121 | TFLOPs: 43.96 | +7: iteration 11860/ 115203 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 0.55 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.100602E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.327 | TFLOPs: 43.98 | +7: iteration 11870/ 115203 | consumed samples: 3038720 | consumed tokens: 6223298560 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.089052E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.023 | TFLOPs: 43.95 | +7: iteration 11880/ 115203 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.093486E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.900 | TFLOPs: 43.94 | +7: iteration 11890/ 115203 | consumed samples: 3043840 | consumed tokens: 6233784320 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.100698E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.996 | TFLOPs: 43.95 | +7: iteration 11900/ 115203 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.085442E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 11910/ 115203 | consumed samples: 3048960 | consumed tokens: 6244270080 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.085926E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.090 | TFLOPs: 43.96 | +7: iteration 11920/ 115203 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.100997E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.208 | TFLOPs: 43.97 | +7: iteration 11930/ 115203 | consumed samples: 3054080 | consumed tokens: 6254755840 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.093568E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 11940/ 115203 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 0.56 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.101095E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.166 | TFLOPs: 43.97 | +7: iteration 11950/ 115203 | consumed samples: 3059200 | consumed tokens: 6265241600 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.086864E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.130 | TFLOPs: 43.96 | +7: iteration 11960/ 115203 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.107281E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.147 | TFLOPs: 43.97 | +7: iteration 11970/ 115203 | consumed samples: 3064320 | consumed tokens: 6275727360 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.089443E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 11980/ 115203 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.083532E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.158 | TFLOPs: 43.97 | +7: iteration 11990/ 115203 | consumed samples: 3069440 | consumed tokens: 6286213120 | elapsed time per iteration (s): 0.55 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.084103E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.301 | TFLOPs: 43.98 | +0: [2023-03-16 14:42:41,158] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[0.0001960118617437879, 0.0001960118617437879, 0.0001960118617437879], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 12000/ 115203 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 0.55 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.095519E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.267 | TFLOPs: 43.98 | +0: steps: 12000 loss: 3.0906 iter time (s): 0.553 samples/sec: 462.674 +7: iteration 12010/ 115203 | consumed samples: 3074560 | consumed tokens: 6296698880 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.096409E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.760 | TFLOPs: 43.93 | +7: iteration 12020/ 115203 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 0.55 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.108832E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.286 | TFLOPs: 43.98 | +7: iteration 12030/ 115203 | consumed samples: 3079680 | consumed tokens: 6307184640 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.100442E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.155 | TFLOPs: 43.97 | +7: iteration 12040/ 115203 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 0.55 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.098149E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 12050/ 115203 | consumed samples: 3084800 | consumed tokens: 6317670400 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.108870E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 12060/ 115203 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.098953E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.182 | TFLOPs: 43.97 | +7: iteration 12070/ 115203 | consumed samples: 3089920 | consumed tokens: 6328156160 | elapsed time per iteration (s): 0.55 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.106582E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.401 | TFLOPs: 43.99 | +7: iteration 12080/ 115203 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 0.56 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.111390E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.132 | TFLOPs: 43.96 | +7: iteration 12090/ 115203 | consumed samples: 3095040 | consumed tokens: 6338641920 | elapsed time per iteration (s): 0.55 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.089981E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 12100/ 115203 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.088264E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.192 | TFLOPs: 43.97 | +7: iteration 12110/ 115203 | consumed samples: 3100160 | consumed tokens: 6349127680 | elapsed time per iteration (s): 0.55 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.103360E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 12120/ 115203 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.089170E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.144 | TFLOPs: 43.97 | +7: iteration 12130/ 115203 | consumed samples: 3105280 | consumed tokens: 6359613440 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.095126E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 12140/ 115203 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.091289E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.965 | TFLOPs: 43.95 | +7: iteration 12150/ 115203 | consumed samples: 3110400 | consumed tokens: 6370099200 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.099851E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.794 | TFLOPs: 43.93 | +7: iteration 12160/ 115203 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.087797E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 12170/ 115203 | consumed samples: 3115520 | consumed tokens: 6380584960 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.102084E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.884 | TFLOPs: 43.94 | +7: iteration 12180/ 115203 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.100930E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.057 | TFLOPs: 43.96 | +7: iteration 12190/ 115203 | consumed samples: 3120640 | consumed tokens: 6391070720 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.088846E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.977 | TFLOPs: 43.95 | +7: iteration 12200/ 115203 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.102211E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.825 | TFLOPs: 43.93 | +7: iteration 12210/ 115203 | consumed samples: 3125760 | consumed tokens: 6401556480 | elapsed time per iteration (s): 0.56 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.080459E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 12220/ 115203 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.099717E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 12230/ 115203 | consumed samples: 3130880 | consumed tokens: 6412042240 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.100203E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 12240/ 115203 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.085616E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 12250/ 115203 | consumed samples: 3136000 | consumed tokens: 6422528000 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.076303E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.179 | TFLOPs: 43.97 | +7: iteration 12260/ 115203 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.082794E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.041 | TFLOPs: 43.96 | +7: iteration 12270/ 115203 | consumed samples: 3141120 | consumed tokens: 6433013760 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.102741E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.152 | TFLOPs: 43.97 | +7: iteration 12280/ 115203 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.085923E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.004 | TFLOPs: 43.95 | +7: iteration 12290/ 115203 | consumed samples: 3146240 | consumed tokens: 6443499520 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.076809E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.995 | TFLOPs: 43.95 | +7: iteration 12300/ 115203 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.092300E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.183 | TFLOPs: 43.97 | +7: iteration 12310/ 115203 | consumed samples: 3151360 | consumed tokens: 6453985280 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.083444E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.061 | TFLOPs: 43.96 | +7: iteration 12320/ 115203 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.085638E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.134 | TFLOPs: 43.96 | +7: iteration 12330/ 115203 | consumed samples: 3156480 | consumed tokens: 6464471040 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.090436E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.997 | TFLOPs: 43.95 | +7: iteration 12340/ 115203 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.081412E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 12350/ 115203 | consumed samples: 3161600 | consumed tokens: 6474956800 | elapsed time per iteration (s): 0.56 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.090543E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.051 | TFLOPs: 43.96 | +7: iteration 12360/ 115203 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.110527E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.998 | TFLOPs: 43.95 | +7: iteration 12370/ 115203 | consumed samples: 3166720 | consumed tokens: 6485442560 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.080412E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.978 | TFLOPs: 43.95 | +7: iteration 12380/ 115203 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.085228E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.016 | TFLOPs: 43.95 | +7: iteration 12390/ 115203 | consumed samples: 3171840 | consumed tokens: 6495928320 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.086622E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.024 | TFLOPs: 43.95 | +7: iteration 12400/ 115203 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.090210E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.979 | TFLOPs: 43.95 | +7: iteration 12410/ 115203 | consumed samples: 3176960 | consumed tokens: 6506414080 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.082801E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.972 | TFLOPs: 43.95 | +7: iteration 12420/ 115203 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.075411E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.044 | TFLOPs: 43.96 | +7: iteration 12430/ 115203 | consumed samples: 3182080 | consumed tokens: 6516899840 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.079777E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.041 | TFLOPs: 43.96 | +7: iteration 12440/ 115203 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.076036E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.069 | TFLOPs: 43.96 | +7: iteration 12450/ 115203 | consumed samples: 3187200 | consumed tokens: 6527385600 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.087844E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.102 | TFLOPs: 43.96 | +7: iteration 12460/ 115203 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.091830E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.925 | TFLOPs: 43.94 | +7: iteration 12470/ 115203 | consumed samples: 3192320 | consumed tokens: 6537871360 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.082638E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 12480/ 115203 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 0.56 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.086276E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.984 | TFLOPs: 43.95 | +7: iteration 12490/ 115203 | consumed samples: 3197440 | consumed tokens: 6548357120 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.089239E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.205 | TFLOPs: 43.59 | +7: iteration 12500/ 115203 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.069712E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.103 | TFLOPs: 43.96 | +7: iteration 12510/ 115203 | consumed samples: 3202560 | consumed tokens: 6558842880 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.082254E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.256 | TFLOPs: 43.40 | +7: iteration 12520/ 115203 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.090149E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.029 | TFLOPs: 43.95 | +7: iteration 12530/ 115203 | consumed samples: 3207680 | consumed tokens: 6569328640 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.092052E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 12540/ 115203 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.081932E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.162 | TFLOPs: 43.97 | +7: iteration 12550/ 115203 | consumed samples: 3212800 | consumed tokens: 6579814400 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.078293E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.061 | TFLOPs: 43.96 | +7: iteration 12560/ 115203 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.090489E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.847 | TFLOPs: 43.94 | +7: iteration 12570/ 115203 | consumed samples: 3217920 | consumed tokens: 6590300160 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.094670E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.590 | TFLOPs: 43.82 | +7: iteration 12580/ 115203 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.092460E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.061 | TFLOPs: 43.96 | +7: iteration 12590/ 115203 | consumed samples: 3223040 | consumed tokens: 6600785920 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.085378E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.024 | TFLOPs: 43.95 | +7: iteration 12600/ 115203 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.100066E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.822 | TFLOPs: 43.93 | +7: iteration 12610/ 115203 | consumed samples: 3228160 | consumed tokens: 6611271680 | elapsed time per iteration (s): 0.56 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.072813E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.025 | TFLOPs: 43.95 | +7: iteration 12620/ 115203 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.084996E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.887 | TFLOPs: 43.94 | +7: iteration 12630/ 115203 | consumed samples: 3233280 | consumed tokens: 6621757440 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.073589E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.041 | TFLOPs: 43.96 | +7: iteration 12640/ 115203 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.082879E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.957 | TFLOPs: 43.95 | +7: iteration 12650/ 115203 | consumed samples: 3238400 | consumed tokens: 6632243200 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.092006E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.109 | TFLOPs: 43.96 | +7: iteration 12660/ 115203 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.081273E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.006 | TFLOPs: 43.95 | +7: iteration 12670/ 115203 | consumed samples: 3243520 | consumed tokens: 6642728960 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.081931E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.116 | TFLOPs: 43.96 | +7: iteration 12680/ 115203 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.084368E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 12690/ 115203 | consumed samples: 3248640 | consumed tokens: 6653214720 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.071861E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.034 | TFLOPs: 43.95 | +7: iteration 12700/ 115203 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.081299E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.058 | TFLOPs: 43.96 | +7: iteration 12710/ 115203 | consumed samples: 3253760 | consumed tokens: 6663700480 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.070953E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 12720/ 115203 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.079362E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.015 | TFLOPs: 43.95 | +7: iteration 12730/ 115203 | consumed samples: 3258880 | consumed tokens: 6674186240 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.073993E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 12740/ 115203 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 0.56 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.052126E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.026 | TFLOPs: 43.95 | +7: iteration 12750/ 115203 | consumed samples: 3264000 | consumed tokens: 6684672000 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.079794E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.060 | TFLOPs: 43.96 | +7: iteration 12760/ 115203 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.089661E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 12770/ 115203 | consumed samples: 3269120 | consumed tokens: 6695157760 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.071210E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.086 | TFLOPs: 43.96 | +7: iteration 12780/ 115203 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.080950E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.088 | TFLOPs: 43.96 | +7: iteration 12790/ 115203 | consumed samples: 3274240 | consumed tokens: 6705643520 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.071402E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.180 | TFLOPs: 43.97 | +7: iteration 12800/ 115203 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.074282E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 12810/ 115203 | consumed samples: 3279360 | consumed tokens: 6716129280 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.062902E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.063 | TFLOPs: 43.96 | +7: iteration 12820/ 115203 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.070324E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 12830/ 115203 | consumed samples: 3284480 | consumed tokens: 6726615040 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.073493E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.817 | TFLOPs: 43.84 | +7: iteration 12840/ 115203 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 0.55 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.071878E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 12850/ 115203 | consumed samples: 3289600 | consumed tokens: 6737100800 | elapsed time per iteration (s): 0.55 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.070620E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.324 | TFLOPs: 43.98 | +7: iteration 12860/ 115203 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 0.55 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.072175E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 12870/ 115203 | consumed samples: 3294720 | consumed tokens: 6747586560 | elapsed time per iteration (s): 0.56 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.067662E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 12880/ 115203 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.070175E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.270 | TFLOPs: 43.98 | +7: iteration 12890/ 115203 | consumed samples: 3299840 | consumed tokens: 6758072320 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.054436E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.386 | TFLOPs: 43.99 | +7: iteration 12900/ 115203 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 0.56 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.062139E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.118 | TFLOPs: 43.96 | +7: iteration 12910/ 115203 | consumed samples: 3304960 | consumed tokens: 6768558080 | elapsed time per iteration (s): 0.56 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.067795E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 12920/ 115203 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.083050E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.308 | TFLOPs: 43.98 | +7: iteration 12930/ 115203 | consumed samples: 3310080 | consumed tokens: 6779043840 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.077786E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.573 | TFLOPs: 44.01 | +7: iteration 12940/ 115203 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.067053E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.525 | TFLOPs: 44.00 | +7: iteration 12950/ 115203 | consumed samples: 3315200 | consumed tokens: 6789529600 | elapsed time per iteration (s): 0.56 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.061183E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.229 | TFLOPs: 43.97 | +7: iteration 12960/ 115203 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.068470E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.425 | TFLOPs: 43.99 | +7: iteration 12970/ 115203 | consumed samples: 3320320 | consumed tokens: 6800015360 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.065749E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.482 | TFLOPs: 44.00 | +7: iteration 12980/ 115203 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.076313E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.580 | TFLOPs: 44.01 | +7: iteration 12990/ 115203 | consumed samples: 3325440 | consumed tokens: 6810501120 | elapsed time per iteration (s): 0.55 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.056218E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.363 | TFLOPs: 43.99 | +7: iteration 13000/ 115203 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.059063E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.364 | TFLOPs: 43.99 | +7: iteration 13010/ 115203 | consumed samples: 3330560 | consumed tokens: 6820986880 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.060422E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.365 | TFLOPs: 43.99 | +7: iteration 13020/ 115203 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.075030E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.355 | TFLOPs: 43.99 | +7: iteration 13030/ 115203 | consumed samples: 3335680 | consumed tokens: 6831472640 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.081873E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.421 | TFLOPs: 43.99 | +7: iteration 13040/ 115203 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.073075E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 13050/ 115203 | consumed samples: 3340800 | consumed tokens: 6841958400 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.080960E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 13060/ 115203 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.071885E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.281 | TFLOPs: 43.98 | +7: iteration 13070/ 115203 | consumed samples: 3345920 | consumed tokens: 6852444160 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.072807E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.516 | TFLOPs: 44.00 | +7: iteration 13080/ 115203 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.057166E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 13090/ 115203 | consumed samples: 3351040 | consumed tokens: 6862929920 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.085560E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.434 | TFLOPs: 43.99 | +7: iteration 13100/ 115203 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.085199E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 13110/ 115203 | consumed samples: 3356160 | consumed tokens: 6873415680 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.067217E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.597 | TFLOPs: 44.01 | +7: iteration 13120/ 115203 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 0.55 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.076118E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.437 | TFLOPs: 43.99 | +7: iteration 13130/ 115203 | consumed samples: 3361280 | consumed tokens: 6883901440 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.072586E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 13140/ 115203 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.062089E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.505 | TFLOPs: 44.00 | +7: iteration 13150/ 115203 | consumed samples: 3366400 | consumed tokens: 6894387200 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.064297E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.330 | TFLOPs: 43.98 | +7: iteration 13160/ 115203 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.073115E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.635 | TFLOPs: 44.01 | +7: iteration 13170/ 115203 | consumed samples: 3371520 | consumed tokens: 6904872960 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.062694E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 13180/ 115203 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.069006E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.366 | TFLOPs: 43.99 | +7: iteration 13190/ 115203 | consumed samples: 3376640 | consumed tokens: 6915358720 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.060728E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.377 | TFLOPs: 43.99 | +7: iteration 13200/ 115203 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.090420E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.270 | TFLOPs: 43.98 | +7: iteration 13210/ 115203 | consumed samples: 3381760 | consumed tokens: 6925844480 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.056538E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 13220/ 115203 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 0.56 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.064074E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.177 | TFLOPs: 43.97 | +7: iteration 13230/ 115203 | consumed samples: 3386880 | consumed tokens: 6936330240 | elapsed time per iteration (s): 0.56 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.071475E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.196 | TFLOPs: 43.97 | +7: iteration 13240/ 115203 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 0.55 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.069296E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.531 | TFLOPs: 44.00 | +7: iteration 13250/ 115203 | consumed samples: 3392000 | consumed tokens: 6946816000 | elapsed time per iteration (s): 0.55 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.084551E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.572 | TFLOPs: 44.01 | +7: iteration 13260/ 115203 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 0.55 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.075469E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 13270/ 115203 | consumed samples: 3397120 | consumed tokens: 6957301760 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.062062E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.217 | TFLOPs: 43.97 | +7: iteration 13280/ 115203 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.198758E+00 | grad norm: 3.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.723 | TFLOPs: 43.83 | +7: iteration 13290/ 115203 | consumed samples: 3402240 | consumed tokens: 6967787520 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.213686E+00 | grad norm: 1.090 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.469 | TFLOPs: 43.81 | +7: iteration 13300/ 115203 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.148539E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.253 | TFLOPs: 43.98 | +7: iteration 13310/ 115203 | consumed samples: 3407360 | consumed tokens: 6978273280 | elapsed time per iteration (s): 0.55 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.103259E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.303 | TFLOPs: 43.98 | +7: iteration 13320/ 115203 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 0.55 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.068548E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.338 | TFLOPs: 43.98 | +7: iteration 13330/ 115203 | consumed samples: 3412480 | consumed tokens: 6988759040 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.079814E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.475 | TFLOPs: 43.90 | +7: iteration 13340/ 115203 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.072856E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.554 | TFLOPs: 43.81 | +7: iteration 13350/ 115203 | consumed samples: 3417600 | consumed tokens: 6999244800 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.055705E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.886 | TFLOPs: 43.85 | +7: iteration 13360/ 115203 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 0.56 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.075541E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.147 | TFLOPs: 43.87 | +7: iteration 13370/ 115203 | consumed samples: 3422720 | consumed tokens: 7009730560 | elapsed time per iteration (s): 0.55 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.075390E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.377 | TFLOPs: 43.99 | +7: iteration 13380/ 115203 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 0.55 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.073495E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 13390/ 115203 | consumed samples: 3427840 | consumed tokens: 7020216320 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.067653E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.300 | TFLOPs: 43.88 | +7: iteration 13400/ 115203 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.064741E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.008 | TFLOPs: 43.95 | +7: iteration 13410/ 115203 | consumed samples: 3432960 | consumed tokens: 7030702080 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.085919E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.769 | TFLOPs: 43.93 | +7: iteration 13420/ 115203 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.071276E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.549 | TFLOPs: 43.72 | +7: iteration 13430/ 115203 | consumed samples: 3438080 | consumed tokens: 7041187840 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.071132E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.028 | TFLOPs: 43.67 | +7: iteration 13440/ 115203 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.049682E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.243 | TFLOPs: 43.78 | +7: iteration 13450/ 115203 | consumed samples: 3443200 | consumed tokens: 7051673600 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.074024E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.054 | TFLOPs: 43.86 | +7: iteration 13460/ 115203 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 0.56 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.067027E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.219 | TFLOPs: 43.88 | +7: iteration 13470/ 115203 | consumed samples: 3448320 | consumed tokens: 7062159360 | elapsed time per iteration (s): 0.55 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.075554E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.563 | TFLOPs: 44.01 | +7: iteration 13480/ 115203 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 0.55 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.078417E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.487 | TFLOPs: 44.00 | +7: iteration 13490/ 115203 | consumed samples: 3453440 | consumed tokens: 7072645120 | elapsed time per iteration (s): 0.55 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.068572E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.325 | TFLOPs: 43.98 | +7: iteration 13500/ 115203 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.050548E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.497 | TFLOPs: 43.90 | +7: iteration 13510/ 115203 | consumed samples: 3458560 | consumed tokens: 7083130880 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.049488E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.016 | TFLOPs: 43.86 | +7: iteration 13520/ 115203 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.069125E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.731 | TFLOPs: 43.74 | +7: iteration 13530/ 115203 | consumed samples: 3463680 | consumed tokens: 7093616640 | elapsed time per iteration (s): 0.55 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.057688E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.582 | TFLOPs: 44.01 | +7: iteration 13540/ 115203 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.076116E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.144 | TFLOPs: 43.97 | +7: iteration 13550/ 115203 | consumed samples: 3468800 | consumed tokens: 7104102400 | elapsed time per iteration (s): 0.57 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.052281E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.873 | TFLOPs: 43.18 | +7: iteration 13560/ 115203 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 0.55 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.051953E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 13570/ 115203 | consumed samples: 3473920 | consumed tokens: 7114588160 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.059603E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.780 | TFLOPs: 43.45 | +7: iteration 13580/ 115203 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.066258E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.091 | TFLOPs: 43.67 | +7: iteration 13590/ 115203 | consumed samples: 3479040 | consumed tokens: 7125073920 | elapsed time per iteration (s): 0.55 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.069560E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.357 | TFLOPs: 43.99 | +7: iteration 13600/ 115203 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 0.55 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.067504E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +7: iteration 13610/ 115203 | consumed samples: 3484160 | consumed tokens: 7135559680 | elapsed time per iteration (s): 0.56 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.047138E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.218 | TFLOPs: 43.40 | +7: iteration 13620/ 115203 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 0.56 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.057524E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.831 | TFLOPs: 43.84 | +7: iteration 13630/ 115203 | consumed samples: 3489280 | consumed tokens: 7146045440 | elapsed time per iteration (s): 0.56 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.044124E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.074 | TFLOPs: 43.96 | +7: iteration 13640/ 115203 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 0.56 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.050192E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.098 | TFLOPs: 43.87 | +7: iteration 13650/ 115203 | consumed samples: 3494400 | consumed tokens: 7156531200 | elapsed time per iteration (s): 0.57 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.053542E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.728 | TFLOPs: 42.88 | +7: iteration 13660/ 115203 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 0.56 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.067205E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.279 | TFLOPs: 43.50 | +7: iteration 13670/ 115203 | consumed samples: 3499520 | consumed tokens: 7167016960 | elapsed time per iteration (s): 0.62 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.072676E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 415.716 | TFLOPs: 39.63 | +7: iteration 13680/ 115203 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 0.60 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.072108E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.295 | TFLOPs: 40.93 | +7: iteration 13690/ 115203 | consumed samples: 3504640 | consumed tokens: 7177502720 | elapsed time per iteration (s): 0.59 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.064707E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.648 | TFLOPs: 41.15 | +7: iteration 13700/ 115203 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 0.57 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.085960E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.233 | TFLOPs: 42.45 | +7: iteration 13710/ 115203 | consumed samples: 3509760 | consumed tokens: 7187988480 | elapsed time per iteration (s): 0.58 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.096696E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.270 | TFLOPs: 42.36 | +7: iteration 13720/ 115203 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 0.58 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.084463E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.841 | TFLOPs: 42.03 | +7: iteration 13730/ 115203 | consumed samples: 3514880 | consumed tokens: 7198474240 | elapsed time per iteration (s): 0.57 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.066505E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.235 | TFLOPs: 43.12 | +7: iteration 13740/ 115203 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 0.58 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.074688E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.763 | TFLOPs: 42.12 | +7: iteration 13750/ 115203 | consumed samples: 3520000 | consumed tokens: 7208960000 | elapsed time per iteration (s): 0.57 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.082059E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.147 | TFLOPs: 42.63 | +7: iteration 13760/ 115203 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 0.59 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.063442E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.391 | TFLOPs: 41.41 | +7: iteration 13770/ 115203 | consumed samples: 3525120 | consumed tokens: 7219445760 | elapsed time per iteration (s): 0.57 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.083422E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.677 | TFLOPs: 42.68 | +7: iteration 13780/ 115203 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 0.57 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.069473E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.498 | TFLOPs: 42.76 | +7: iteration 13790/ 115203 | consumed samples: 3530240 | consumed tokens: 7229931520 | elapsed time per iteration (s): 0.57 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.076487E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.599 | TFLOPs: 42.48 | +7: iteration 13800/ 115203 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 0.56 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.062796E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.279 | TFLOPs: 43.50 | +7: iteration 13810/ 115203 | consumed samples: 3535360 | consumed tokens: 7240417280 | elapsed time per iteration (s): 0.58 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.072076E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.491 | TFLOPs: 41.90 | +7: iteration 13820/ 115203 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 0.59 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.055303E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.105 | TFLOPs: 41.39 | +7: iteration 13830/ 115203 | consumed samples: 3540480 | consumed tokens: 7250903040 | elapsed time per iteration (s): 0.58 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.072171E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.398 | TFLOPs: 42.18 | +7: iteration 13840/ 115203 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 0.60 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.060169E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.471 | TFLOPs: 40.56 | +7: iteration 13850/ 115203 | consumed samples: 3545600 | consumed tokens: 7261388800 | elapsed time per iteration (s): 0.57 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.068557E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.662 | TFLOPs: 42.68 | +7: iteration 13860/ 115203 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 0.59 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.055647E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.030 | TFLOPs: 41.57 | +7: iteration 13870/ 115203 | consumed samples: 3550720 | consumed tokens: 7271874560 | elapsed time per iteration (s): 0.59 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.073420E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.289 | TFLOPs: 41.50 | +7: iteration 13880/ 115203 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 0.57 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.053189E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.168 | TFLOPs: 42.54 | +7: iteration 13890/ 115203 | consumed samples: 3555840 | consumed tokens: 7282360320 | elapsed time per iteration (s): 0.57 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.055089E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.450 | TFLOPs: 43.04 | +7: iteration 13900/ 115203 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 0.58 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.057448E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.533 | TFLOPs: 42.10 | +7: iteration 13910/ 115203 | consumed samples: 3560960 | consumed tokens: 7292846080 | elapsed time per iteration (s): 0.59 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.063485E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.366 | TFLOPs: 41.51 | +7: iteration 13920/ 115203 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 0.58 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.062061E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.527 | TFLOPs: 42.29 | +7: iteration 13930/ 115203 | consumed samples: 3566080 | consumed tokens: 7303331840 | elapsed time per iteration (s): 0.60 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.039085E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.967 | TFLOPs: 40.42 | +7: iteration 13940/ 115203 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 0.58 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.060293E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.523 | TFLOPs: 41.81 | +7: iteration 13950/ 115203 | consumed samples: 3571200 | consumed tokens: 7313817600 | elapsed time per iteration (s): 0.60 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.039143E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.914 | TFLOPs: 40.99 | +7: iteration 13960/ 115203 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 0.58 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.036789E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.375 | TFLOPs: 42.18 | +7: iteration 13970/ 115203 | consumed samples: 3576320 | consumed tokens: 7324303360 | elapsed time per iteration (s): 0.58 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.061647E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.291 | TFLOPs: 42.07 | +7: iteration 13980/ 115203 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.068770E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.043 | TFLOPs: 42.53 | +7: iteration 13990/ 115203 | consumed samples: 3581440 | consumed tokens: 7334789120 | elapsed time per iteration (s): 0.58 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.041738E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.537 | TFLOPs: 42.29 | +0: [2023-03-16 15:01:21,377] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[0.00019442251142812213, 0.00019442251142812213, 0.00019442251142812213], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 14000/ 115203 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 0.59 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.033924E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.251 | TFLOPs: 41.59 | +0: steps: 14000 loss: 3.0099 iter time (s): 0.558 samples/sec: 459.010 +7: iteration 14010/ 115203 | consumed samples: 3586560 | consumed tokens: 7345274880 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.067436E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.772 | TFLOPs: 42.59 | +7: iteration 14020/ 115203 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 0.59 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.050529E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.704 | TFLOPs: 41.16 | +7: iteration 14030/ 115203 | consumed samples: 3591680 | consumed tokens: 7355760640 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.041557E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.030 | TFLOPs: 42.71 | +7: iteration 14040/ 115203 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.051161E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.744 | TFLOPs: 42.78 | +7: iteration 14050/ 115203 | consumed samples: 3596800 | consumed tokens: 7366246400 | elapsed time per iteration (s): 0.59 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.063282E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.215 | TFLOPs: 41.21 | +7: iteration 14060/ 115203 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.059570E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.940 | TFLOPs: 42.90 | +7: iteration 14070/ 115203 | consumed samples: 3601920 | consumed tokens: 7376732160 | elapsed time per iteration (s): 0.59 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.037584E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.855 | TFLOPs: 41.46 | +7: iteration 14080/ 115203 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 0.57 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.053726E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.323 | TFLOPs: 42.93 | +7: iteration 14090/ 115203 | consumed samples: 3607040 | consumed tokens: 7387217920 | elapsed time per iteration (s): 0.58 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.051801E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.645 | TFLOPs: 42.11 | +7: iteration 14100/ 115203 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 0.57 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.056772E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.294 | TFLOPs: 42.45 | +7: iteration 14110/ 115203 | consumed samples: 3612160 | consumed tokens: 7397703680 | elapsed time per iteration (s): 0.60 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.050946E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.588 | TFLOPs: 40.77 | +7: iteration 14120/ 115203 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 0.58 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.044361E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.126 | TFLOPs: 42.06 | +7: iteration 14130/ 115203 | consumed samples: 3617280 | consumed tokens: 7408189440 | elapsed time per iteration (s): 0.59 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.048593E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.106 | TFLOPs: 41.20 | +7: iteration 14140/ 115203 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 0.58 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.052917E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.885 | TFLOPs: 42.32 | +7: iteration 14150/ 115203 | consumed samples: 3622400 | consumed tokens: 7418675200 | elapsed time per iteration (s): 0.57 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.054997E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.095 | TFLOPs: 42.91 | +7: iteration 14160/ 115203 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 0.59 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.052815E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.057 | TFLOPs: 41.19 | +7: iteration 14170/ 115203 | consumed samples: 3627520 | consumed tokens: 7429160960 | elapsed time per iteration (s): 0.57 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.049854E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.437 | TFLOPs: 42.75 | +7: iteration 14180/ 115203 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 0.58 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.058850E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.858 | TFLOPs: 42.41 | +7: iteration 14190/ 115203 | consumed samples: 3632640 | consumed tokens: 7439646720 | elapsed time per iteration (s): 0.59 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.062716E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.909 | TFLOPs: 41.27 | +7: iteration 14200/ 115203 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 0.57 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.049928E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.515 | TFLOPs: 43.05 | +7: iteration 14210/ 115203 | consumed samples: 3637760 | consumed tokens: 7450132480 | elapsed time per iteration (s): 0.57 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.058719E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.534 | TFLOPs: 42.57 | +7: iteration 14220/ 115203 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 0.58 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.041586E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.542 | TFLOPs: 42.00 | +7: iteration 14230/ 115203 | consumed samples: 3642880 | consumed tokens: 7460618240 | elapsed time per iteration (s): 0.58 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.057240E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.498 | TFLOPs: 41.81 | +7: iteration 14240/ 115203 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 0.56 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.034332E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.110 | TFLOPs: 43.58 | +7: iteration 14250/ 115203 | consumed samples: 3648000 | consumed tokens: 7471104000 | elapsed time per iteration (s): 0.59 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.039563E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.447 | TFLOPs: 41.32 | +7: iteration 14260/ 115203 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 0.57 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.028532E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.960 | TFLOPs: 42.99 | +7: iteration 14270/ 115203 | consumed samples: 3653120 | consumed tokens: 7481589760 | elapsed time per iteration (s): 0.57 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.046870E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.429 | TFLOPs: 42.75 | +7: iteration 14280/ 115203 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 0.58 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.034577E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.106 | TFLOPs: 41.96 | +7: iteration 14290/ 115203 | consumed samples: 3658240 | consumed tokens: 7492075520 | elapsed time per iteration (s): 0.58 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.051579E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.165 | TFLOPs: 42.25 | +7: iteration 14300/ 115203 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 0.59 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.042938E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.186 | TFLOPs: 41.68 | +7: iteration 14310/ 115203 | consumed samples: 3663360 | consumed tokens: 7502561280 | elapsed time per iteration (s): 0.59 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.064832E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.520 | TFLOPs: 41.71 | +7: iteration 14320/ 115203 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 0.58 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.049966E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.596 | TFLOPs: 41.82 | +7: iteration 14330/ 115203 | consumed samples: 3668480 | consumed tokens: 7513047040 | elapsed time per iteration (s): 0.59 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.046349E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.642 | TFLOPs: 41.25 | +7: iteration 14340/ 115203 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 0.59 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.070557E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.694 | TFLOPs: 41.63 | +7: iteration 14350/ 115203 | consumed samples: 3673600 | consumed tokens: 7523532800 | elapsed time per iteration (s): 0.59 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.193537E+00 | grad norm: 2.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.728 | TFLOPs: 41.64 | +7: iteration 14360/ 115203 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 0.59 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.146523E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.065 | TFLOPs: 41.29 | +7: iteration 14370/ 115203 | consumed samples: 3678720 | consumed tokens: 7534018560 | elapsed time per iteration (s): 0.59 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.106724E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.800 | TFLOPs: 41.45 | +7: iteration 14380/ 115203 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 0.57 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.078215E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.345 | TFLOPs: 42.65 | +7: iteration 14390/ 115203 | consumed samples: 3683840 | consumed tokens: 7544504320 | elapsed time per iteration (s): 0.58 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.071607E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.945 | TFLOPs: 41.94 | +7: iteration 14400/ 115203 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 0.58 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.059459E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.059 | TFLOPs: 42.43 | +7: iteration 14410/ 115203 | consumed samples: 3688960 | consumed tokens: 7554990080 | elapsed time per iteration (s): 0.56 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.066311E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.369 | TFLOPs: 43.22 | +7: iteration 14420/ 115203 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 0.58 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.057931E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.602 | TFLOPs: 42.20 | +7: iteration 14430/ 115203 | consumed samples: 3694080 | consumed tokens: 7565475840 | elapsed time per iteration (s): 0.59 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.058438E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.252 | TFLOPs: 41.12 | +7: iteration 14440/ 115203 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 0.58 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.053132E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.170 | TFLOPs: 41.97 | +7: iteration 14450/ 115203 | consumed samples: 3699200 | consumed tokens: 7575961600 | elapsed time per iteration (s): 0.58 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.057009E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.518 | TFLOPs: 42.00 | +7: iteration 14460/ 115203 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 0.58 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.058516E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.195 | TFLOPs: 41.97 | +7: iteration 14470/ 115203 | consumed samples: 3704320 | consumed tokens: 7586447360 | elapsed time per iteration (s): 0.57 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.046572E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.606 | TFLOPs: 42.96 | +7: iteration 14480/ 115203 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 0.57 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.056059E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.332 | TFLOPs: 43.12 | +7: iteration 14490/ 115203 | consumed samples: 3709440 | consumed tokens: 7596933120 | elapsed time per iteration (s): 0.60 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.044631E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.409 | TFLOPs: 40.84 | +7: iteration 14500/ 115203 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 0.56 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.049276E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.507 | TFLOPs: 43.33 | +7: iteration 14510/ 115203 | consumed samples: 3714560 | consumed tokens: 7607418880 | elapsed time per iteration (s): 0.59 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.044056E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.063 | TFLOPs: 41.57 | +7: iteration 14520/ 115203 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 0.58 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.039664E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.525 | TFLOPs: 42.29 | +7: iteration 14530/ 115203 | consumed samples: 3719680 | consumed tokens: 7617904640 | elapsed time per iteration (s): 0.57 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.045527E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.077 | TFLOPs: 42.81 | +7: iteration 14540/ 115203 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.054226E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.227 | TFLOPs: 42.54 | +7: iteration 14550/ 115203 | consumed samples: 3724800 | consumed tokens: 7628390400 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.029598E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.432 | TFLOPs: 42.94 | +7: iteration 14560/ 115203 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.050506E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.759 | TFLOPs: 42.31 | +7: iteration 14570/ 115203 | consumed samples: 3729920 | consumed tokens: 7638876160 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.050162E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.736 | TFLOPs: 42.31 | +7: iteration 14580/ 115203 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.030673E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.620 | TFLOPs: 42.77 | +7: iteration 14590/ 115203 | consumed samples: 3735040 | consumed tokens: 7649361920 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.054586E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.065 | TFLOPs: 42.34 | +7: iteration 14600/ 115203 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.035519E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.910 | TFLOPs: 42.51 | +7: iteration 14610/ 115203 | consumed samples: 3740160 | consumed tokens: 7659847680 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.040006E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.082 | TFLOPs: 42.34 | +7: iteration 14620/ 115203 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.035982E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.723 | TFLOPs: 42.88 | +7: iteration 14630/ 115203 | consumed samples: 3745280 | consumed tokens: 7670333440 | elapsed time per iteration (s): 0.58 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.043794E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.735 | TFLOPs: 42.31 | +7: iteration 14640/ 115203 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.056546E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.756 | TFLOPs: 42.88 | +7: iteration 14650/ 115203 | consumed samples: 3750400 | consumed tokens: 7680819200 | elapsed time per iteration (s): 0.57 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.055436E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.389 | TFLOPs: 42.75 | +7: iteration 14660/ 115203 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.038515E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.925 | TFLOPs: 42.80 | +7: iteration 14670/ 115203 | consumed samples: 3755520 | consumed tokens: 7691304960 | elapsed time per iteration (s): 0.58 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.029668E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.219 | TFLOPs: 41.87 | +7: iteration 14680/ 115203 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 0.56 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.051125E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.850 | TFLOPs: 43.56 | +7: iteration 14690/ 115203 | consumed samples: 3760640 | consumed tokens: 7701790720 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.049398E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.889 | TFLOPs: 42.51 | +7: iteration 14700/ 115203 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 0.58 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.038380E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.170 | TFLOPs: 42.44 | +7: iteration 14710/ 115203 | consumed samples: 3765760 | consumed tokens: 7712276480 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.042028E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.149 | TFLOPs: 42.92 | +7: iteration 14720/ 115203 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.039661E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.130 | TFLOPs: 42.82 | +7: iteration 14730/ 115203 | consumed samples: 3770880 | consumed tokens: 7722762240 | elapsed time per iteration (s): 0.58 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.034472E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.850 | TFLOPs: 41.84 | +7: iteration 14740/ 115203 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.041990E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.117 | TFLOPs: 43.01 | +7: iteration 14750/ 115203 | consumed samples: 3776000 | consumed tokens: 7733248000 | elapsed time per iteration (s): 0.57 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.044406E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.112 | TFLOPs: 43.01 | +7: iteration 14760/ 115203 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 0.56 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.038132E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.131 | TFLOPs: 43.49 | +7: iteration 14770/ 115203 | consumed samples: 3781120 | consumed tokens: 7743733760 | elapsed time per iteration (s): 0.58 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.034252E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.418 | TFLOPs: 42.37 | +7: iteration 14780/ 115203 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 0.58 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.047784E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.409 | TFLOPs: 42.37 | +7: iteration 14790/ 115203 | consumed samples: 3786240 | consumed tokens: 7754219520 | elapsed time per iteration (s): 0.58 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.036039E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.623 | TFLOPs: 41.72 | +7: iteration 14800/ 115203 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 0.58 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.022377E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.639 | TFLOPs: 42.39 | +7: iteration 14810/ 115203 | consumed samples: 3791360 | consumed tokens: 7764705280 | elapsed time per iteration (s): 0.56 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.025340E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.521 | TFLOPs: 43.43 | +7: iteration 14820/ 115203 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 0.56 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.022816E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.163 | TFLOPs: 43.20 | +7: iteration 14830/ 115203 | consumed samples: 3796480 | consumed tokens: 7775191040 | elapsed time per iteration (s): 0.57 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.039085E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.084 | TFLOPs: 43.01 | +7: iteration 14840/ 115203 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 0.56 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.041605E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.419 | TFLOPs: 43.32 | +7: iteration 14850/ 115203 | consumed samples: 3801600 | consumed tokens: 7785676800 | elapsed time per iteration (s): 0.57 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.040685E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.724 | TFLOPs: 42.78 | +7: iteration 14860/ 115203 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 0.58 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.038083E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.334 | TFLOPs: 42.27 | +7: iteration 14870/ 115203 | consumed samples: 3806720 | consumed tokens: 7796162560 | elapsed time per iteration (s): 0.57 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.039703E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.717 | TFLOPs: 42.78 | +7: iteration 14880/ 115203 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 0.58 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.042949E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.357 | TFLOPs: 42.08 | +7: iteration 14890/ 115203 | consumed samples: 3811840 | consumed tokens: 7806648320 | elapsed time per iteration (s): 0.56 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.033894E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.812 | TFLOPs: 43.55 | +7: iteration 14900/ 115203 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 0.58 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.021228E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.344 | TFLOPs: 42.17 | +7: iteration 14910/ 115203 | consumed samples: 3816960 | consumed tokens: 7817134080 | elapsed time per iteration (s): 0.58 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.030147E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.873 | TFLOPs: 42.13 | +7: iteration 14920/ 115203 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 0.57 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.027695E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.962 | TFLOPs: 43.09 | +7: iteration 14930/ 115203 | consumed samples: 3822080 | consumed tokens: 7827619840 | elapsed time per iteration (s): 0.57 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.038382E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.209 | TFLOPs: 42.92 | +7: iteration 14940/ 115203 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 0.56 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.012857E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.248 | TFLOPs: 43.40 | +7: iteration 14950/ 115203 | consumed samples: 3827200 | consumed tokens: 7838105600 | elapsed time per iteration (s): 0.58 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.040042E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.819 | TFLOPs: 42.31 | +7: iteration 14960/ 115203 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 0.57 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.028605E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.777 | TFLOPs: 42.69 | +7: iteration 14970/ 115203 | consumed samples: 3832320 | consumed tokens: 7848591360 | elapsed time per iteration (s): 0.58 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.019902E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.503 | TFLOPs: 42.38 | +7: iteration 14980/ 115203 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 0.57 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.033780E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.811 | TFLOPs: 42.88 | +7: iteration 14990/ 115203 | consumed samples: 3837440 | consumed tokens: 7859077120 | elapsed time per iteration (s): 0.58 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.029716E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.595 | TFLOPs: 42.29 | +7: iteration 15000/ 115203 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 0.57 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.035316E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.993 | TFLOPs: 42.81 | +7: iteration 15010/ 115203 | consumed samples: 3842560 | consumed tokens: 7869562880 | elapsed time per iteration (s): 0.56 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.026893E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.246 | TFLOPs: 43.21 | +7: iteration 15020/ 115203 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 0.57 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.035983E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.737 | TFLOPs: 43.07 | +7: iteration 15030/ 115203 | consumed samples: 3847680 | consumed tokens: 7880048640 | elapsed time per iteration (s): 0.56 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.014954E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.121 | TFLOPs: 43.39 | +7: iteration 15040/ 115203 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 0.58 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.041736E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.084 | TFLOPs: 41.96 | +7: iteration 15050/ 115203 | consumed samples: 3852800 | consumed tokens: 7890534400 | elapsed time per iteration (s): 0.56 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.030402E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.039 | TFLOPs: 43.38 | +7: iteration 15060/ 115203 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 0.58 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.034828E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.043 | TFLOPs: 42.33 | +7: iteration 15070/ 115203 | consumed samples: 3857920 | consumed tokens: 7901020160 | elapsed time per iteration (s): 0.58 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.037527E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.092 | TFLOPs: 41.86 | +7: iteration 15080/ 115203 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 0.59 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.021521E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.441 | TFLOPs: 41.51 | +7: iteration 15090/ 115203 | consumed samples: 3863040 | consumed tokens: 7911505920 | elapsed time per iteration (s): 0.57 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.031153E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.806 | TFLOPs: 42.98 | +7: iteration 15100/ 115203 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 0.55 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.033978E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.299 | TFLOPs: 43.98 | +7: iteration 15110/ 115203 | consumed samples: 3868160 | consumed tokens: 7921991680 | elapsed time per iteration (s): 0.56 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.037340E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.793 | TFLOPs: 43.36 | +7: iteration 15120/ 115203 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 0.57 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.021469E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.366 | TFLOPs: 42.56 | +7: iteration 15130/ 115203 | consumed samples: 3873280 | consumed tokens: 7932477440 | elapsed time per iteration (s): 0.59 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.015873E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.942 | TFLOPs: 41.56 | +7: iteration 15140/ 115203 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 0.57 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.022069E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.552 | TFLOPs: 42.76 | +7: iteration 15150/ 115203 | consumed samples: 3878400 | consumed tokens: 7942963200 | elapsed time per iteration (s): 0.57 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.031185E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.404 | TFLOPs: 43.04 | +7: iteration 15160/ 115203 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 0.58 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.029201E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.733 | TFLOPs: 41.83 | +7: iteration 15170/ 115203 | consumed samples: 3883520 | consumed tokens: 7953448960 | elapsed time per iteration (s): 0.57 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.029208E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.250 | TFLOPs: 42.93 | +7: iteration 15180/ 115203 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 0.58 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.033295E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.153 | TFLOPs: 41.77 | +7: iteration 15190/ 115203 | consumed samples: 3888640 | consumed tokens: 7963934720 | elapsed time per iteration (s): 0.58 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.019262E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.045 | TFLOPs: 41.95 | +7: iteration 15200/ 115203 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 0.56 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.029024E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.660 | TFLOPs: 43.54 | +7: iteration 15210/ 115203 | consumed samples: 3893760 | consumed tokens: 7974420480 | elapsed time per iteration (s): 0.58 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.032862E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.869 | TFLOPs: 41.84 | +7: iteration 15220/ 115203 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 0.58 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.049441E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.622 | TFLOPs: 42.10 | +7: iteration 15230/ 115203 | consumed samples: 3898880 | consumed tokens: 7984906240 | elapsed time per iteration (s): 0.58 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.038171E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.654 | TFLOPs: 42.11 | +7: iteration 15240/ 115203 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 0.57 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.011476E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.277 | TFLOPs: 42.74 | +7: iteration 15250/ 115203 | consumed samples: 3904000 | consumed tokens: 7995392000 | elapsed time per iteration (s): 0.56 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.028008E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.482 | TFLOPs: 43.62 | +7: iteration 15260/ 115203 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 0.58 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.023290E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.620 | TFLOPs: 41.91 | +7: iteration 15270/ 115203 | consumed samples: 3909120 | consumed tokens: 8005877760 | elapsed time per iteration (s): 0.57 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.018463E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.321 | TFLOPs: 43.12 | +7: iteration 15280/ 115203 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 0.57 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.032407E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.426 | TFLOPs: 42.85 | +7: iteration 15290/ 115203 | consumed samples: 3914240 | consumed tokens: 8016363520 | elapsed time per iteration (s): 0.56 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.028982E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.895 | TFLOPs: 43.85 | +7: iteration 15300/ 115203 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 0.56 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.034976E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.801 | TFLOPs: 43.26 | +7: iteration 15310/ 115203 | consumed samples: 3919360 | consumed tokens: 8026849280 | elapsed time per iteration (s): 0.58 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.017971E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.290 | TFLOPs: 42.36 | +7: iteration 15320/ 115203 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 0.56 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.021570E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.703 | TFLOPs: 43.45 | +7: iteration 15330/ 115203 | consumed samples: 3924480 | consumed tokens: 8037335040 | elapsed time per iteration (s): 0.58 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.031405E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.055 | TFLOPs: 42.24 | +7: iteration 15340/ 115203 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 0.57 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.033341E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.279 | TFLOPs: 42.93 | +7: iteration 15350/ 115203 | consumed samples: 3929600 | consumed tokens: 8047820800 | elapsed time per iteration (s): 0.56 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.018516E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.296 | TFLOPs: 43.50 | +7: iteration 15360/ 115203 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 0.57 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.033815E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.648 | TFLOPs: 42.77 | +7: iteration 15370/ 115203 | consumed samples: 3934720 | consumed tokens: 8058306560 | elapsed time per iteration (s): 0.56 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.015966E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.917 | TFLOPs: 43.56 | +7: iteration 15380/ 115203 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 0.56 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.023246E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.096 | TFLOPs: 43.29 | +7: iteration 15390/ 115203 | consumed samples: 3939840 | consumed tokens: 8068792320 | elapsed time per iteration (s): 0.57 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.029009E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.124 | TFLOPs: 43.01 | +7: iteration 15400/ 115203 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 0.57 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.017399E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.007 | TFLOPs: 42.62 | +7: iteration 15410/ 115203 | consumed samples: 3944960 | consumed tokens: 8079278080 | elapsed time per iteration (s): 0.56 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.002105E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.783 | TFLOPs: 43.45 | +7: iteration 15420/ 115203 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 0.57 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.013012E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.934 | TFLOPs: 42.90 | +7: iteration 15430/ 115203 | consumed samples: 3950080 | consumed tokens: 8089763840 | elapsed time per iteration (s): 0.56 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.031623E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.247 | TFLOPs: 43.31 | +7: iteration 15440/ 115203 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 0.56 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.037687E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.555 | TFLOPs: 43.24 | +7: iteration 15450/ 115203 | consumed samples: 3955200 | consumed tokens: 8100249600 | elapsed time per iteration (s): 0.58 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.031789E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.032 | TFLOPs: 42.43 | +7: iteration 15460/ 115203 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 0.57 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.010541E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.124 | TFLOPs: 43.11 | +7: iteration 15470/ 115203 | consumed samples: 3960320 | consumed tokens: 8110735360 | elapsed time per iteration (s): 0.59 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.014775E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.049 | TFLOPs: 41.67 | +7: iteration 15480/ 115203 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 0.57 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.029343E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.711 | TFLOPs: 43.16 | +7: iteration 15490/ 115203 | consumed samples: 3965440 | consumed tokens: 8121221120 | elapsed time per iteration (s): 0.58 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.024604E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.916 | TFLOPs: 42.32 | +7: iteration 15500/ 115203 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 0.57 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.030338E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.128 | TFLOPs: 42.53 | +7: iteration 15510/ 115203 | consumed samples: 3970560 | consumed tokens: 8131706880 | elapsed time per iteration (s): 0.57 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.014109E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.817 | TFLOPs: 42.69 | +7: iteration 15520/ 115203 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 0.59 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.009966E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.034 | TFLOPs: 41.29 | +7: iteration 15530/ 115203 | consumed samples: 3975680 | consumed tokens: 8142192640 | elapsed time per iteration (s): 0.57 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 2.996549E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.023 | TFLOPs: 42.81 | +7: iteration 15540/ 115203 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 0.59 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.011334E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.158 | TFLOPs: 41.39 | +7: iteration 15550/ 115203 | consumed samples: 3980800 | consumed tokens: 8152678400 | elapsed time per iteration (s): 0.57 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.020697E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.041 | TFLOPs: 42.53 | +7: iteration 15560/ 115203 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 0.57 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.011989E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.385 | TFLOPs: 42.94 | +7: iteration 15570/ 115203 | consumed samples: 3985920 | consumed tokens: 8163164160 | elapsed time per iteration (s): 0.58 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.026831E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.948 | TFLOPs: 41.94 | +7: iteration 15580/ 115203 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 0.57 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.011447E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.618 | TFLOPs: 43.15 | +7: iteration 15590/ 115203 | consumed samples: 3991040 | consumed tokens: 8173649920 | elapsed time per iteration (s): 0.56 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.010323E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.931 | TFLOPs: 43.28 | +7: iteration 15600/ 115203 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 0.56 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.018186E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.747 | TFLOPs: 43.55 | +7: iteration 15610/ 115203 | consumed samples: 3996160 | consumed tokens: 8184135680 | elapsed time per iteration (s): 0.58 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.015825E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.496 | TFLOPs: 42.38 | +7: iteration 15620/ 115203 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 0.58 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.023396E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.946 | TFLOPs: 42.33 | +7: iteration 15630/ 115203 | consumed samples: 4001280 | consumed tokens: 8194621440 | elapsed time per iteration (s): 0.58 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.022289E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.185 | TFLOPs: 42.16 | +7: iteration 15640/ 115203 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 0.57 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.012866E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.150 | TFLOPs: 43.11 | +7: iteration 15650/ 115203 | consumed samples: 4006400 | consumed tokens: 8205107200 | elapsed time per iteration (s): 0.56 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.014807E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.561 | TFLOPs: 43.24 | +7: iteration 15660/ 115203 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 0.58 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.023469E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.519 | TFLOPs: 42.09 | +7: iteration 15670/ 115203 | consumed samples: 4011520 | consumed tokens: 8215592960 | elapsed time per iteration (s): 0.57 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.018307E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.321 | TFLOPs: 42.46 | +7: iteration 15680/ 115203 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 0.56 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.021832E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.446 | TFLOPs: 43.61 | +7: iteration 15690/ 115203 | consumed samples: 4016640 | consumed tokens: 8226078720 | elapsed time per iteration (s): 0.56 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.018059E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.145 | TFLOPs: 43.30 | +7: iteration 15700/ 115203 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 0.57 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.024812E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.280 | TFLOPs: 42.93 | +7: iteration 15710/ 115203 | consumed samples: 4021760 | consumed tokens: 8236564480 | elapsed time per iteration (s): 0.56 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.028060E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.184 | TFLOPs: 43.49 | +7: iteration 15720/ 115203 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 0.57 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.015079E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.529 | TFLOPs: 42.48 | +7: iteration 15730/ 115203 | consumed samples: 4026880 | consumed tokens: 8247050240 | elapsed time per iteration (s): 0.58 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.019930E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.312 | TFLOPs: 42.17 | +7: iteration 15740/ 115203 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.019468E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.787 | TFLOPs: 43.07 | +7: iteration 15750/ 115203 | consumed samples: 4032000 | consumed tokens: 8257536000 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.019188E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.673 | TFLOPs: 42.97 | +7: iteration 15760/ 115203 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 0.56 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.023113E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.731 | TFLOPs: 43.73 | +7: iteration 15770/ 115203 | consumed samples: 4037120 | consumed tokens: 8268021760 | elapsed time per iteration (s): 0.56 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.015125E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.459 | TFLOPs: 43.42 | +7: iteration 15780/ 115203 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 0.56 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.013642E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.176 | TFLOPs: 43.30 | +7: iteration 15790/ 115203 | consumed samples: 4042240 | consumed tokens: 8278507520 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.013037E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.475 | TFLOPs: 42.95 | +7: iteration 15800/ 115203 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.017220E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.853 | TFLOPs: 42.70 | +7: iteration 15810/ 115203 | consumed samples: 4047360 | consumed tokens: 8288993280 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.018166E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.176 | TFLOPs: 42.73 | +7: iteration 15820/ 115203 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 0.57 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.028028E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.439 | TFLOPs: 42.94 | +7: iteration 15830/ 115203 | consumed samples: 4052480 | consumed tokens: 8299479040 | elapsed time per iteration (s): 0.58 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.007841E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.121 | TFLOPs: 42.15 | +7: iteration 15840/ 115203 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 0.56 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.018483E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.194 | TFLOPs: 43.87 | +7: iteration 15850/ 115203 | consumed samples: 4057600 | consumed tokens: 8309964800 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.005210E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.262 | TFLOPs: 43.12 | +7: iteration 15860/ 115203 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 0.58 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.013638E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.978 | TFLOPs: 41.85 | +7: iteration 15870/ 115203 | consumed samples: 4062720 | consumed tokens: 8320450560 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.012414E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.120 | TFLOPs: 42.91 | +7: iteration 15880/ 115203 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.013762E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.221 | TFLOPs: 42.73 | +7: iteration 15890/ 115203 | consumed samples: 4067840 | consumed tokens: 8330936320 | elapsed time per iteration (s): 0.56 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.010294E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.332 | TFLOPs: 43.32 | +7: iteration 15900/ 115203 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 2.994867E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.532 | TFLOPs: 42.67 | +7: iteration 15910/ 115203 | consumed samples: 4072960 | consumed tokens: 8341422080 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.009014E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.235 | TFLOPs: 42.73 | +7: iteration 15920/ 115203 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 0.57 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.022502E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.951 | TFLOPs: 42.71 | +7: iteration 15930/ 115203 | consumed samples: 4078080 | consumed tokens: 8351907840 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.003615E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.788 | TFLOPs: 43.45 | +7: iteration 15940/ 115203 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 0.58 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.012420E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.319 | TFLOPs: 41.79 | +7: iteration 15950/ 115203 | consumed samples: 4083200 | consumed tokens: 8362393600 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.000872E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.579 | TFLOPs: 43.24 | +7: iteration 15960/ 115203 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 0.58 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.004233E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.131 | TFLOPs: 42.44 | +7: iteration 15970/ 115203 | consumed samples: 4088320 | consumed tokens: 8372879360 | elapsed time per iteration (s): 0.57 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.028147E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.800 | TFLOPs: 42.88 | +7: iteration 15980/ 115203 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.011785E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.695 | TFLOPs: 43.92 | +7: iteration 15990/ 115203 | consumed samples: 4093440 | consumed tokens: 8383365120 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.020417E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.532 | TFLOPs: 43.53 | +0: [2023-03-16 15:20:27,830] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[0.00019257700559212364, 0.00019257700559212364, 0.00019257700559212364], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 16000/ 115203 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.002149E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.155 | TFLOPs: 43.58 | +0: steps: 16000 loss: 2.9873 iter time (s): 0.571 samples/sec: 448.501 +7: iteration 16010/ 115203 | consumed samples: 4098560 | consumed tokens: 8393850880 | elapsed time per iteration (s): 0.56 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.019361E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.423 | TFLOPs: 43.61 | +7: iteration 16020/ 115203 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 0.57 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.005581E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.157 | TFLOPs: 43.01 | +7: iteration 16030/ 115203 | consumed samples: 4103680 | consumed tokens: 8404336640 | elapsed time per iteration (s): 0.57 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.009617E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.685 | TFLOPs: 42.97 | +7: iteration 16040/ 115203 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 0.58 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.022407E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.265 | TFLOPs: 42.36 | +7: iteration 16050/ 115203 | consumed samples: 4108800 | consumed tokens: 8414822400 | elapsed time per iteration (s): 0.56 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.015812E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.338 | TFLOPs: 43.51 | +7: iteration 16060/ 115203 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 0.57 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.008389E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.201 | TFLOPs: 43.11 | +7: iteration 16070/ 115203 | consumed samples: 4113920 | consumed tokens: 8425308160 | elapsed time per iteration (s): 0.57 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.011371E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.003 | TFLOPs: 42.90 | +7: iteration 16080/ 115203 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 0.59 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.003934E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.717 | TFLOPs: 41.45 | +7: iteration 16090/ 115203 | consumed samples: 4119040 | consumed tokens: 8435793920 | elapsed time per iteration (s): 0.56 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.022364E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.113 | TFLOPs: 43.29 | +7: iteration 16100/ 115203 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 0.56 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.006794E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.317 | TFLOPs: 43.41 | +7: iteration 16110/ 115203 | consumed samples: 4124160 | consumed tokens: 8446279680 | elapsed time per iteration (s): 0.57 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.013310E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.278 | TFLOPs: 43.12 | +7: iteration 16120/ 115203 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 0.58 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.017811E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.279 | TFLOPs: 42.17 | +7: iteration 16130/ 115203 | consumed samples: 4129280 | consumed tokens: 8456765440 | elapsed time per iteration (s): 0.56 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.007042E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.989 | TFLOPs: 43.57 | +7: iteration 16140/ 115203 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 0.57 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.012405E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.312 | TFLOPs: 42.55 | +7: iteration 16150/ 115203 | consumed samples: 4134400 | consumed tokens: 8467251200 | elapsed time per iteration (s): 0.57 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.011125E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.549 | TFLOPs: 42.48 | +7: iteration 16160/ 115203 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 0.56 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.012701E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.581 | TFLOPs: 43.53 | +7: iteration 16170/ 115203 | consumed samples: 4139520 | consumed tokens: 8477736960 | elapsed time per iteration (s): 0.56 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.009985E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.906 | TFLOPs: 43.56 | +7: iteration 16180/ 115203 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 0.55 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 2.992249E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.455 | TFLOPs: 43.99 | +7: iteration 16190/ 115203 | consumed samples: 4144640 | consumed tokens: 8488222720 | elapsed time per iteration (s): 0.56 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.010495E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.747 | TFLOPs: 43.55 | +7: iteration 16200/ 115203 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 0.56 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.018429E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.910 | TFLOPs: 43.56 | +7: iteration 16210/ 115203 | consumed samples: 4149760 | consumed tokens: 8498708480 | elapsed time per iteration (s): 0.57 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.004866E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.924 | TFLOPs: 42.90 | +7: iteration 16220/ 115203 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 0.55 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.014217E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.429 | TFLOPs: 43.99 | +7: iteration 16230/ 115203 | consumed samples: 4154880 | consumed tokens: 8509194240 | elapsed time per iteration (s): 0.57 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.012663E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.205 | TFLOPs: 43.02 | +7: iteration 16240/ 115203 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 0.56 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.011252E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.204 | TFLOPs: 43.97 | +7: iteration 16250/ 115203 | consumed samples: 4160000 | consumed tokens: 8519680000 | elapsed time per iteration (s): 0.56 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.028857E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.477 | TFLOPs: 43.90 | +7: iteration 16260/ 115203 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 0.58 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 2.985869E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.146 | TFLOPs: 42.25 | +7: iteration 16270/ 115203 | consumed samples: 4165120 | consumed tokens: 8530165760 | elapsed time per iteration (s): 0.58 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.003774E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.361 | TFLOPs: 42.27 | +7: iteration 16280/ 115203 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 0.57 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.006253E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.575 | TFLOPs: 42.48 | +7: iteration 16290/ 115203 | consumed samples: 4170240 | consumed tokens: 8540651520 | elapsed time per iteration (s): 0.57 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 2.998351E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.875 | TFLOPs: 43.18 | +7: iteration 16300/ 115203 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 0.57 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.008329E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.451 | TFLOPs: 42.66 | +7: iteration 16310/ 115203 | consumed samples: 4175360 | consumed tokens: 8551137280 | elapsed time per iteration (s): 0.57 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.008303E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.619 | TFLOPs: 43.15 | +7: iteration 16320/ 115203 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 0.56 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.019047E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.408 | TFLOPs: 43.61 | +7: iteration 16330/ 115203 | consumed samples: 4180480 | consumed tokens: 8561623040 | elapsed time per iteration (s): 0.57 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.987835E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.851 | TFLOPs: 42.98 | +7: iteration 16340/ 115203 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 0.57 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.009298E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.956 | TFLOPs: 43.09 | +7: iteration 16350/ 115203 | consumed samples: 4185600 | consumed tokens: 8572108800 | elapsed time per iteration (s): 0.59 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.004708E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.580 | TFLOPs: 41.53 | +7: iteration 16360/ 115203 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 0.56 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.008109E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.911 | TFLOPs: 43.37 | +7: iteration 16370/ 115203 | consumed samples: 4190720 | consumed tokens: 8582594560 | elapsed time per iteration (s): 0.58 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.993888E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.070 | TFLOPs: 42.24 | +7: iteration 16380/ 115203 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 0.56 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.011901E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.146 | TFLOPs: 43.58 | +7: iteration 16390/ 115203 | consumed samples: 4195840 | consumed tokens: 8593080320 | elapsed time per iteration (s): 0.56 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.992989E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.573 | TFLOPs: 43.24 | +7: iteration 16400/ 115203 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 0.57 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.994063E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.654 | TFLOPs: 42.58 | +7: iteration 16410/ 115203 | consumed samples: 4200960 | consumed tokens: 8603566080 | elapsed time per iteration (s): 0.58 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.998869E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.779 | TFLOPs: 41.83 | +7: iteration 16420/ 115203 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 0.56 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 2.994120E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.904 | TFLOPs: 43.27 | +7: iteration 16430/ 115203 | consumed samples: 4206080 | consumed tokens: 8614051840 | elapsed time per iteration (s): 0.56 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.993048E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.021 | TFLOPs: 43.95 | +7: iteration 16440/ 115203 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 0.57 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.011519E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.839 | TFLOPs: 42.60 | +7: iteration 16450/ 115203 | consumed samples: 4211200 | consumed tokens: 8624537600 | elapsed time per iteration (s): 0.57 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.997623E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.376 | TFLOPs: 42.84 | +7: iteration 16460/ 115203 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 0.57 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.998689E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.264 | TFLOPs: 43.02 | +7: iteration 16470/ 115203 | consumed samples: 4216320 | consumed tokens: 8635023360 | elapsed time per iteration (s): 0.56 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.007922E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.260 | TFLOPs: 43.50 | +7: iteration 16480/ 115203 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 0.58 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.998786E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.908 | TFLOPs: 42.04 | +7: iteration 16490/ 115203 | consumed samples: 4221440 | consumed tokens: 8645509120 | elapsed time per iteration (s): 0.55 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.009107E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 16500/ 115203 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 0.56 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.005632E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.573 | TFLOPs: 43.34 | +7: iteration 16510/ 115203 | consumed samples: 4226560 | consumed tokens: 8655994880 | elapsed time per iteration (s): 0.57 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.995914E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.480 | TFLOPs: 42.47 | +7: iteration 16520/ 115203 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 0.56 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 2.993088E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.066 | TFLOPs: 43.67 | +7: iteration 16530/ 115203 | consumed samples: 4231680 | consumed tokens: 8666480640 | elapsed time per iteration (s): 0.56 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.003813E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.518 | TFLOPs: 43.24 | +7: iteration 16540/ 115203 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 0.58 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.014105E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.703 | TFLOPs: 42.21 | +7: iteration 16550/ 115203 | consumed samples: 4236800 | consumed tokens: 8676966400 | elapsed time per iteration (s): 0.59 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 2.996032E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.377 | TFLOPs: 41.60 | +7: iteration 16560/ 115203 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 0.56 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.000297E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.989 | TFLOPs: 43.28 | +7: iteration 16570/ 115203 | consumed samples: 4241920 | consumed tokens: 8687452160 | elapsed time per iteration (s): 0.57 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.000782E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.245 | TFLOPs: 42.83 | +7: iteration 16580/ 115203 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 0.57 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.007222E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.777 | TFLOPs: 42.69 | +7: iteration 16590/ 115203 | consumed samples: 4247040 | consumed tokens: 8697937920 | elapsed time per iteration (s): 0.57 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.002525E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.085 | TFLOPs: 42.62 | +7: iteration 16600/ 115203 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 0.56 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 2.992123E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.495 | TFLOPs: 43.71 | +7: iteration 16610/ 115203 | consumed samples: 4252160 | consumed tokens: 8708423680 | elapsed time per iteration (s): 0.57 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.000603E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.258 | TFLOPs: 43.02 | +7: iteration 16620/ 115203 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 0.56 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.000625E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.208 | TFLOPs: 43.49 | +7: iteration 16630/ 115203 | consumed samples: 4257280 | consumed tokens: 8718909440 | elapsed time per iteration (s): 0.58 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 2.999368E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.079 | TFLOPs: 42.15 | +7: iteration 16640/ 115203 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 0.57 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.011725E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.959 | TFLOPs: 42.52 | +7: iteration 16650/ 115203 | consumed samples: 4262400 | consumed tokens: 8729395200 | elapsed time per iteration (s): 0.55 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.006112E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.441 | TFLOPs: 43.99 | +7: iteration 16660/ 115203 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 0.58 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 2.998494E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.348 | TFLOPs: 42.36 | +7: iteration 16670/ 115203 | consumed samples: 4267520 | consumed tokens: 8739880960 | elapsed time per iteration (s): 0.56 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 2.996999E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.645 | TFLOPs: 43.63 | +7: iteration 16680/ 115203 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 0.56 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.003866E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.150 | TFLOPs: 43.58 | +7: iteration 16690/ 115203 | consumed samples: 4272640 | consumed tokens: 8750366720 | elapsed time per iteration (s): 0.56 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.018323E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.387 | TFLOPs: 43.80 | +7: iteration 16700/ 115203 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 0.58 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.009614E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.052 | TFLOPs: 42.05 | +7: iteration 16710/ 115203 | consumed samples: 4277760 | consumed tokens: 8760852480 | elapsed time per iteration (s): 0.56 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.005258E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.657 | TFLOPs: 43.35 | +7: iteration 16720/ 115203 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 0.56 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 2.995952E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.108 | TFLOPs: 43.39 | +7: iteration 16730/ 115203 | consumed samples: 4282880 | consumed tokens: 8771338240 | elapsed time per iteration (s): 0.57 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.003131E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.081 | TFLOPs: 42.91 | +7: iteration 16740/ 115203 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 0.57 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 2.989608E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.905 | TFLOPs: 42.99 | +7: iteration 16750/ 115203 | consumed samples: 4288000 | consumed tokens: 8781824000 | elapsed time per iteration (s): 0.56 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.012711E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.875 | TFLOPs: 43.56 | +7: iteration 16760/ 115203 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 0.55 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.002971E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.304 | TFLOPs: 43.98 | +7: iteration 16770/ 115203 | consumed samples: 4293120 | consumed tokens: 8792309760 | elapsed time per iteration (s): 0.56 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 2.995182E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.483 | TFLOPs: 43.71 | +7: iteration 16780/ 115203 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 0.57 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.005882E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.633 | TFLOPs: 42.87 | +7: iteration 16790/ 115203 | consumed samples: 4298240 | consumed tokens: 8802795520 | elapsed time per iteration (s): 0.57 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 2.987241E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.835 | TFLOPs: 42.79 | +7: iteration 16800/ 115203 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 0.57 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.006054E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.965 | TFLOPs: 42.90 | +7: iteration 16810/ 115203 | consumed samples: 4303360 | consumed tokens: 8813281280 | elapsed time per iteration (s): 0.56 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.005750E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.182 | TFLOPs: 43.68 | +7: iteration 16820/ 115203 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 2.982353E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.502 | TFLOPs: 43.71 | +7: iteration 16830/ 115203 | consumed samples: 4308480 | consumed tokens: 8823767040 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 2.993706E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.030 | TFLOPs: 43.76 | +7: iteration 16840/ 115203 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 0.58 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.000069E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.244 | TFLOPs: 42.35 | +7: iteration 16850/ 115203 | consumed samples: 4313600 | consumed tokens: 8834252800 | elapsed time per iteration (s): 0.57 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.005044E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.988 | TFLOPs: 42.90 | +7: iteration 16860/ 115203 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 0.57 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.010108E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.961 | TFLOPs: 42.61 | +7: iteration 16870/ 115203 | consumed samples: 4318720 | consumed tokens: 8844738560 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.004045E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.891 | TFLOPs: 43.46 | +7: iteration 16880/ 115203 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 0.57 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 2.996203E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.092 | TFLOPs: 43.01 | +7: iteration 16890/ 115203 | consumed samples: 4323840 | consumed tokens: 8855224320 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.013092E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.432 | TFLOPs: 43.52 | +7: iteration 16900/ 115203 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 2.985015E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.595 | TFLOPs: 43.72 | +7: iteration 16910/ 115203 | consumed samples: 4328960 | consumed tokens: 8865710080 | elapsed time per iteration (s): 0.56 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.000435E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.209 | TFLOPs: 43.49 | +7: iteration 16920/ 115203 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 0.57 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.989752E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.125 | TFLOPs: 42.91 | +7: iteration 16930/ 115203 | consumed samples: 4334080 | consumed tokens: 8876195840 | elapsed time per iteration (s): 0.59 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.000317E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.502 | TFLOPs: 41.04 | +7: iteration 16940/ 115203 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 0.57 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.003436E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.735 | TFLOPs: 42.59 | +7: iteration 16950/ 115203 | consumed samples: 4339200 | consumed tokens: 8886681600 | elapsed time per iteration (s): 0.56 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.992087E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.953 | TFLOPs: 43.47 | +7: iteration 16960/ 115203 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 0.56 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.987166E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.030 | TFLOPs: 43.48 | +7: iteration 16970/ 115203 | consumed samples: 4344320 | consumed tokens: 8897167360 | elapsed time per iteration (s): 0.56 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.993198E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.202 | TFLOPs: 43.68 | +7: iteration 16980/ 115203 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 0.56 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.990421E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.321 | TFLOPs: 43.70 | +7: iteration 16990/ 115203 | consumed samples: 4349440 | consumed tokens: 8907653120 | elapsed time per iteration (s): 0.56 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.995093E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.875 | TFLOPs: 43.46 | +7: iteration 17000/ 115203 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 0.55 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 2.982934E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.394 | TFLOPs: 43.99 | +7: iteration 17010/ 115203 | consumed samples: 4354560 | consumed tokens: 8918138880 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.998151E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.963 | TFLOPs: 43.47 | +7: iteration 17020/ 115203 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.986043E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.462 | TFLOPs: 43.61 | +7: iteration 17030/ 115203 | consumed samples: 4359680 | consumed tokens: 8928624640 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.986594E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.958 | TFLOPs: 43.57 | +7: iteration 17040/ 115203 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 0.57 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.984224E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.803 | TFLOPs: 42.69 | +7: iteration 17050/ 115203 | consumed samples: 4364800 | consumed tokens: 8939110400 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.002773E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.860 | TFLOPs: 43.84 | +7: iteration 17060/ 115203 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.995937E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.493 | TFLOPs: 43.24 | +7: iteration 17070/ 115203 | consumed samples: 4369920 | consumed tokens: 8949596160 | elapsed time per iteration (s): 0.57 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.011672E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.097 | TFLOPs: 43.10 | +7: iteration 17080/ 115203 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 0.55 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.985550E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.424 | TFLOPs: 43.99 | +7: iteration 17090/ 115203 | consumed samples: 4375040 | consumed tokens: 8960081920 | elapsed time per iteration (s): 0.56 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.980577E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.323 | TFLOPs: 43.41 | +7: iteration 17100/ 115203 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 0.57 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 2.990242E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.316 | TFLOPs: 43.03 | +7: iteration 17110/ 115203 | consumed samples: 4380160 | consumed tokens: 8970567680 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.994186E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.582 | TFLOPs: 43.91 | +7: iteration 17120/ 115203 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.989552E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.596 | TFLOPs: 43.53 | +7: iteration 17130/ 115203 | consumed samples: 4385280 | consumed tokens: 8981053440 | elapsed time per iteration (s): 0.58 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.979451E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.715 | TFLOPs: 42.40 | +7: iteration 17140/ 115203 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.003440E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.964 | TFLOPs: 43.85 | +7: iteration 17150/ 115203 | consumed samples: 4390400 | consumed tokens: 8991539200 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.007996E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 17160/ 115203 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.997318E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 17170/ 115203 | consumed samples: 4395520 | consumed tokens: 9002024960 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.000345E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.606 | TFLOPs: 43.63 | +7: iteration 17180/ 115203 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 0.56 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.991953E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.575 | TFLOPs: 43.43 | +7: iteration 17190/ 115203 | consumed samples: 4400640 | consumed tokens: 9012510720 | elapsed time per iteration (s): 0.55 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 2.992794E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.320 | TFLOPs: 43.98 | +7: iteration 17200/ 115203 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.982797E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.208 | TFLOPs: 43.49 | +7: iteration 17210/ 115203 | consumed samples: 4405760 | consumed tokens: 9022996480 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.989214E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.170 | TFLOPs: 43.49 | +7: iteration 17220/ 115203 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.986131E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.934 | TFLOPs: 43.56 | +7: iteration 17230/ 115203 | consumed samples: 4410880 | consumed tokens: 9033482240 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.989042E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.141 | TFLOPs: 43.20 | +7: iteration 17240/ 115203 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.995492E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.171 | TFLOPs: 43.97 | +7: iteration 17250/ 115203 | consumed samples: 4416000 | consumed tokens: 9043968000 | elapsed time per iteration (s): 0.57 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.990348E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.263 | TFLOPs: 43.02 | +7: iteration 17260/ 115203 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.992178E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.849 | TFLOPs: 43.84 | +7: iteration 17270/ 115203 | consumed samples: 4421120 | consumed tokens: 9054453760 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.999221E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.908 | TFLOPs: 43.28 | +7: iteration 17280/ 115203 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.002164E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.592 | TFLOPs: 43.72 | +7: iteration 17290/ 115203 | consumed samples: 4426240 | consumed tokens: 9064939520 | elapsed time per iteration (s): 0.56 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 2.989594E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.963 | TFLOPs: 43.57 | +7: iteration 17300/ 115203 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.981078E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.225 | TFLOPs: 43.97 | +7: iteration 17310/ 115203 | consumed samples: 4431360 | consumed tokens: 9075425280 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.977807E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.725 | TFLOPs: 43.26 | +7: iteration 17320/ 115203 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 0.57 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.992358E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.028 | TFLOPs: 42.71 | +7: iteration 17330/ 115203 | consumed samples: 4436480 | consumed tokens: 9085911040 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.996275E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.137 | TFLOPs: 43.49 | +7: iteration 17340/ 115203 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.995063E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.965 | TFLOPs: 43.66 | +7: iteration 17350/ 115203 | consumed samples: 4441600 | consumed tokens: 9096396800 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.970285E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.735 | TFLOPs: 43.64 | +7: iteration 17360/ 115203 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 0.57 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.002620E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.745 | TFLOPs: 42.69 | +7: iteration 17370/ 115203 | consumed samples: 4446720 | consumed tokens: 9106882560 | elapsed time per iteration (s): 0.56 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.990807E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.459 | TFLOPs: 43.90 | +7: iteration 17380/ 115203 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 0.55 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 2.986336E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.457 | TFLOPs: 43.99 | +7: iteration 17390/ 115203 | consumed samples: 4451840 | consumed tokens: 9117368320 | elapsed time per iteration (s): 0.55 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.988001E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.519 | TFLOPs: 44.00 | +7: iteration 17400/ 115203 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.000276E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.870 | TFLOPs: 43.56 | +7: iteration 17410/ 115203 | consumed samples: 4456960 | consumed tokens: 9127854080 | elapsed time per iteration (s): 0.57 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.978937E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.496 | TFLOPs: 43.05 | +7: iteration 17420/ 115203 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 0.58 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.980338E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.410 | TFLOPs: 42.37 | +7: iteration 17430/ 115203 | consumed samples: 4462080 | consumed tokens: 9138339840 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.999007E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.782 | TFLOPs: 43.36 | +7: iteration 17440/ 115203 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.989281E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.524 | TFLOPs: 43.52 | +7: iteration 17450/ 115203 | consumed samples: 4467200 | consumed tokens: 9148825600 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.989128E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.887 | TFLOPs: 43.46 | +7: iteration 17460/ 115203 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.991894E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.107 | TFLOPs: 43.96 | +7: iteration 17470/ 115203 | consumed samples: 4472320 | consumed tokens: 9159311360 | elapsed time per iteration (s): 0.56 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 2.995268E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.658 | TFLOPs: 43.92 | +7: iteration 17480/ 115203 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 0.58 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.978399E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.870 | TFLOPs: 41.94 | +7: iteration 17490/ 115203 | consumed samples: 4477440 | consumed tokens: 9169797120 | elapsed time per iteration (s): 0.55 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.993174E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.267 | TFLOPs: 43.98 | +7: iteration 17500/ 115203 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.982617E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.708 | TFLOPs: 43.07 | +7: iteration 17510/ 115203 | consumed samples: 4482560 | consumed tokens: 9180282880 | elapsed time per iteration (s): 0.56 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.980617E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.350 | TFLOPs: 43.60 | +7: iteration 17520/ 115203 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.976050E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.687 | TFLOPs: 43.06 | +7: iteration 17530/ 115203 | consumed samples: 4487680 | consumed tokens: 9190768640 | elapsed time per iteration (s): 0.56 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.992494E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.119 | TFLOPs: 43.96 | +7: iteration 17540/ 115203 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.988192E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.849 | TFLOPs: 42.98 | +7: iteration 17550/ 115203 | consumed samples: 4492800 | consumed tokens: 9201254400 | elapsed time per iteration (s): 0.57 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.991874E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.622 | TFLOPs: 43.06 | +7: iteration 17560/ 115203 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 0.56 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.981137E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.812 | TFLOPs: 43.55 | +7: iteration 17570/ 115203 | consumed samples: 4497920 | consumed tokens: 9211740160 | elapsed time per iteration (s): 0.56 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 2.991861E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.000 | TFLOPs: 43.76 | +7: iteration 17580/ 115203 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.980478E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.712 | TFLOPs: 43.92 | +7: iteration 17590/ 115203 | consumed samples: 4503040 | consumed tokens: 9222225920 | elapsed time per iteration (s): 0.57 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.988919E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.638 | TFLOPs: 43.06 | +7: iteration 17600/ 115203 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.983262E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.081 | TFLOPs: 43.39 | +7: iteration 17610/ 115203 | consumed samples: 4508160 | consumed tokens: 9232711680 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.987510E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.604 | TFLOPs: 43.63 | +7: iteration 17620/ 115203 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.977522E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.173 | TFLOPs: 43.30 | +7: iteration 17630/ 115203 | consumed samples: 4513280 | consumed tokens: 9243197440 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.996290E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.267 | TFLOPs: 43.31 | +7: iteration 17640/ 115203 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.980509E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.977 | TFLOPs: 43.28 | +7: iteration 17650/ 115203 | consumed samples: 4518400 | consumed tokens: 9253683200 | elapsed time per iteration (s): 0.58 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.986851E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.575 | TFLOPs: 42.39 | +7: iteration 17660/ 115203 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 0.56 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 2.987001E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.438 | TFLOPs: 43.42 | +7: iteration 17670/ 115203 | consumed samples: 4523520 | consumed tokens: 9264168960 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.984495E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.457 | TFLOPs: 43.80 | +7: iteration 17680/ 115203 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.971261E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.388 | TFLOPs: 43.42 | +7: iteration 17690/ 115203 | consumed samples: 4528640 | consumed tokens: 9274654720 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.982619E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.379 | TFLOPs: 43.70 | +7: iteration 17700/ 115203 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.979426E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.699 | TFLOPs: 43.45 | +7: iteration 17710/ 115203 | consumed samples: 4533760 | consumed tokens: 9285140480 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.990737E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.966 | TFLOPs: 43.66 | +7: iteration 17720/ 115203 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.000293E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.297 | TFLOPs: 43.50 | +7: iteration 17730/ 115203 | consumed samples: 4538880 | consumed tokens: 9295626240 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.983525E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.029 | TFLOPs: 43.95 | +7: iteration 17740/ 115203 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.976989E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.895 | TFLOPs: 43.94 | +7: iteration 17750/ 115203 | consumed samples: 4544000 | consumed tokens: 9306112000 | elapsed time per iteration (s): 0.56 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 2.988327E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.899 | TFLOPs: 43.56 | +7: iteration 17760/ 115203 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.984669E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.486 | TFLOPs: 43.43 | +7: iteration 17770/ 115203 | consumed samples: 4549120 | consumed tokens: 9316597760 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.996281E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.727 | TFLOPs: 43.54 | +7: iteration 17780/ 115203 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.990444E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.228 | TFLOPs: 43.97 | +7: iteration 17790/ 115203 | consumed samples: 4554240 | consumed tokens: 9327083520 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.965954E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.207 | TFLOPs: 43.97 | +7: iteration 17800/ 115203 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.987352E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.441 | TFLOPs: 43.71 | +7: iteration 17810/ 115203 | consumed samples: 4559360 | consumed tokens: 9337569280 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.969876E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.410 | TFLOPs: 43.61 | +7: iteration 17820/ 115203 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.977589E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.189 | TFLOPs: 43.59 | +7: iteration 17830/ 115203 | consumed samples: 4564480 | consumed tokens: 9348055040 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.971187E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.202 | TFLOPs: 43.49 | +7: iteration 17840/ 115203 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 0.56 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 2.975413E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.007 | TFLOPs: 43.95 | +7: iteration 17850/ 115203 | consumed samples: 4569600 | consumed tokens: 9358540800 | elapsed time per iteration (s): 0.57 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.982492E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.958 | TFLOPs: 43.09 | +7: iteration 17860/ 115203 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.984246E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.482 | TFLOPs: 43.90 | +7: iteration 17870/ 115203 | consumed samples: 4574720 | consumed tokens: 9369026560 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.979989E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.117 | TFLOPs: 43.96 | +7: iteration 17880/ 115203 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.978044E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.170 | TFLOPs: 43.30 | +7: iteration 17890/ 115203 | consumed samples: 4579840 | consumed tokens: 9379512320 | elapsed time per iteration (s): 0.55 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.987625E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.282 | TFLOPs: 43.98 | +7: iteration 17900/ 115203 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 0.57 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.978092E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.945 | TFLOPs: 42.99 | +7: iteration 17910/ 115203 | consumed samples: 4584960 | consumed tokens: 9389998080 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.969874E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.958 | TFLOPs: 43.95 | +7: iteration 17920/ 115203 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.991071E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.219 | TFLOPs: 43.97 | +7: iteration 17930/ 115203 | consumed samples: 4590080 | consumed tokens: 9400483840 | elapsed time per iteration (s): 0.56 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 2.983474E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.665 | TFLOPs: 43.54 | +7: iteration 17940/ 115203 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.986254E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.794 | TFLOPs: 43.65 | +7: iteration 17950/ 115203 | consumed samples: 4595200 | consumed tokens: 9410969600 | elapsed time per iteration (s): 0.55 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.987329E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 17960/ 115203 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 0.55 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.988352E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.345 | TFLOPs: 43.98 | +7: iteration 17970/ 115203 | consumed samples: 4600320 | consumed tokens: 9421455360 | elapsed time per iteration (s): 0.55 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.981200E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 17980/ 115203 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.990093E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.269 | TFLOPs: 43.79 | +7: iteration 17990/ 115203 | consumed samples: 4605440 | consumed tokens: 9431941120 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.985456E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.272 | TFLOPs: 43.60 | +0: [2023-03-16 15:39:15,404] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[0.00019048094388569267, 0.00019048094388569267, 0.00019048094388569267], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 18000/ 115203 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.977903E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.042 | TFLOPs: 43.38 | +0: steps: 18000 loss: 2.9783 iter time (s): 0.561 samples/sec: 455.991 +7: iteration 18010/ 115203 | consumed samples: 4610560 | consumed tokens: 9442426880 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.984964E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.839 | TFLOPs: 43.55 | +7: iteration 18020/ 115203 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 0.56 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 2.987005E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.404 | TFLOPs: 43.70 | +7: iteration 18030/ 115203 | consumed samples: 4615680 | consumed tokens: 9452912640 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.993447E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.459 | TFLOPs: 43.71 | +7: iteration 18040/ 115203 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.977168E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.983 | TFLOPs: 43.38 | +7: iteration 18050/ 115203 | consumed samples: 4620800 | consumed tokens: 9463398400 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.975739E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.261 | TFLOPs: 43.59 | +7: iteration 18060/ 115203 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.975086E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.824 | TFLOPs: 43.74 | +7: iteration 18070/ 115203 | consumed samples: 4625920 | consumed tokens: 9473884160 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.977446E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.270 | TFLOPs: 43.69 | +7: iteration 18080/ 115203 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 0.57 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.972324E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.261 | TFLOPs: 42.93 | +7: iteration 18090/ 115203 | consumed samples: 4631040 | consumed tokens: 9484369920 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.975495E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.261 | TFLOPs: 43.69 | +7: iteration 18100/ 115203 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.972783E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.909 | TFLOPs: 43.37 | +7: iteration 18110/ 115203 | consumed samples: 4636160 | consumed tokens: 9494855680 | elapsed time per iteration (s): 0.56 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 2.973052E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.858 | TFLOPs: 43.94 | +7: iteration 18120/ 115203 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.997005E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.860 | TFLOPs: 43.84 | +7: iteration 18130/ 115203 | consumed samples: 4641280 | consumed tokens: 9505341440 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.970194E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.081 | TFLOPs: 43.58 | +7: iteration 18140/ 115203 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.971431E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 18150/ 115203 | consumed samples: 4646400 | consumed tokens: 9515827200 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.978181E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.906 | TFLOPs: 43.94 | +7: iteration 18160/ 115203 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 0.58 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.984070E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.804 | TFLOPs: 42.12 | +7: iteration 18170/ 115203 | consumed samples: 4651520 | consumed tokens: 9526312960 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.980759E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.865 | TFLOPs: 43.65 | +7: iteration 18180/ 115203 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.980526E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.993 | TFLOPs: 43.76 | +7: iteration 18190/ 115203 | consumed samples: 4656640 | consumed tokens: 9536798720 | elapsed time per iteration (s): 0.57 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.972210E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.451 | TFLOPs: 43.04 | +7: iteration 18200/ 115203 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 0.56 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 2.977992E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.941 | TFLOPs: 43.95 | +7: iteration 18210/ 115203 | consumed samples: 4661760 | consumed tokens: 9547284480 | elapsed time per iteration (s): 0.57 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.969886E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.986 | TFLOPs: 43.19 | +7: iteration 18220/ 115203 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 0.56 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.967105E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.766 | TFLOPs: 43.93 | +7: iteration 18230/ 115203 | consumed samples: 4666880 | consumed tokens: 9557770240 | elapsed time per iteration (s): 0.58 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.992999E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.311 | TFLOPs: 42.17 | +7: iteration 18240/ 115203 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 0.56 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.970978E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.433 | TFLOPs: 43.42 | +7: iteration 18250/ 115203 | consumed samples: 4672000 | consumed tokens: 9568256000 | elapsed time per iteration (s): 0.55 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.968182E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.298 | TFLOPs: 43.98 | +7: iteration 18260/ 115203 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 0.56 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.984124E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.204 | TFLOPs: 43.97 | +7: iteration 18270/ 115203 | consumed samples: 4677120 | consumed tokens: 9578741760 | elapsed time per iteration (s): 0.58 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.974703E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.034 | TFLOPs: 42.24 | +7: iteration 18280/ 115203 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 0.57 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.974950E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.700 | TFLOPs: 42.78 | +7: iteration 18290/ 115203 | consumed samples: 4682240 | consumed tokens: 9589227520 | elapsed time per iteration (s): 0.56 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 2.994557E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.021 | TFLOPs: 43.57 | +7: iteration 18300/ 115203 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 2.989622E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.326 | TFLOPs: 43.89 | +7: iteration 18310/ 115203 | consumed samples: 4687360 | consumed tokens: 9599713280 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.295060E+00 | grad norm: 2.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.561 | TFLOPs: 43.81 | +7: iteration 18320/ 115203 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.329122E+00 | grad norm: 0.926 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.384 | TFLOPs: 43.80 | +7: iteration 18330/ 115203 | consumed samples: 4692480 | consumed tokens: 9610199040 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.052985E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.102 | TFLOPs: 43.96 | +7: iteration 18340/ 115203 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 0.57 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.006585E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.487 | TFLOPs: 42.85 | +7: iteration 18350/ 115203 | consumed samples: 4697600 | consumed tokens: 9620684800 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.008291E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.632 | TFLOPs: 43.44 | +7: iteration 18360/ 115203 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 2.992565E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 18370/ 115203 | consumed samples: 4702720 | consumed tokens: 9631170560 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 2.997475E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.725 | TFLOPs: 43.45 | +7: iteration 18380/ 115203 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 0.56 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.005047E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.365 | TFLOPs: 43.60 | +7: iteration 18390/ 115203 | consumed samples: 4707840 | consumed tokens: 9641656320 | elapsed time per iteration (s): 0.57 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.987257E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.581 | TFLOPs: 42.96 | +7: iteration 18400/ 115203 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.987097E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.905 | TFLOPs: 43.47 | +7: iteration 18410/ 115203 | consumed samples: 4712960 | consumed tokens: 9652142080 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.978399E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.861 | TFLOPs: 43.56 | +7: iteration 18420/ 115203 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.973452E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.597 | TFLOPs: 43.53 | +7: iteration 18430/ 115203 | consumed samples: 4718080 | consumed tokens: 9662627840 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.977577E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.575 | TFLOPs: 43.62 | +7: iteration 18440/ 115203 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.972103E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 18450/ 115203 | consumed samples: 4723200 | consumed tokens: 9673113600 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.986873E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.374 | TFLOPs: 43.51 | +7: iteration 18460/ 115203 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.988118E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.788 | TFLOPs: 43.93 | +7: iteration 18470/ 115203 | consumed samples: 4728320 | consumed tokens: 9683599360 | elapsed time per iteration (s): 0.56 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 2.981826E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.119 | TFLOPs: 43.96 | +7: iteration 18480/ 115203 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 0.56 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.965181E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.736 | TFLOPs: 43.45 | +7: iteration 18490/ 115203 | consumed samples: 4733440 | consumed tokens: 9694085120 | elapsed time per iteration (s): 0.56 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.964774E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 18500/ 115203 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 0.56 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.978107E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.074 | TFLOPs: 43.58 | +7: iteration 18510/ 115203 | consumed samples: 4738560 | consumed tokens: 9704570880 | elapsed time per iteration (s): 0.56 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.969441E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 18520/ 115203 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 0.57 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.969678E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.099 | TFLOPs: 42.63 | +7: iteration 18530/ 115203 | consumed samples: 4743680 | consumed tokens: 9715056640 | elapsed time per iteration (s): 0.56 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.961667E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.954 | TFLOPs: 43.57 | +7: iteration 18540/ 115203 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 0.55 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.982840E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.308 | TFLOPs: 43.98 | +7: iteration 18550/ 115203 | consumed samples: 4748800 | consumed tokens: 9725542400 | elapsed time per iteration (s): 0.57 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 2.980792E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.052 | TFLOPs: 43.10 | +7: iteration 18560/ 115203 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.977166E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.195 | TFLOPs: 43.59 | +7: iteration 18570/ 115203 | consumed samples: 4753920 | consumed tokens: 9736028160 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.979150E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.964 | TFLOPs: 43.95 | +7: iteration 18580/ 115203 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.977212E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.054 | TFLOPs: 43.96 | +7: iteration 18590/ 115203 | consumed samples: 4759040 | consumed tokens: 9746513920 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.973639E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.220 | TFLOPs: 43.97 | +7: iteration 18600/ 115203 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.978767E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.914 | TFLOPs: 43.94 | +7: iteration 18610/ 115203 | consumed samples: 4764160 | consumed tokens: 9756999680 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.974228E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.961 | TFLOPs: 43.95 | +7: iteration 18620/ 115203 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.975389E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.720 | TFLOPs: 43.64 | +7: iteration 18630/ 115203 | consumed samples: 4769280 | consumed tokens: 9767485440 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.971234E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 18640/ 115203 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 0.56 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 2.978203E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.054 | TFLOPs: 43.96 | +7: iteration 18650/ 115203 | consumed samples: 4774400 | consumed tokens: 9777971200 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.976586E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 18660/ 115203 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.973906E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.301 | TFLOPs: 43.69 | +7: iteration 18670/ 115203 | consumed samples: 4779520 | consumed tokens: 9788456960 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.958833E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.964 | TFLOPs: 43.85 | +7: iteration 18680/ 115203 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.965359E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.054 | TFLOPs: 43.96 | +7: iteration 18690/ 115203 | consumed samples: 4784640 | consumed tokens: 9798942720 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.973709E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.084 | TFLOPs: 43.58 | +7: iteration 18700/ 115203 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.955955E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.955 | TFLOPs: 43.57 | +7: iteration 18710/ 115203 | consumed samples: 4789760 | consumed tokens: 9809428480 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.962546E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.206 | TFLOPs: 43.59 | +7: iteration 18720/ 115203 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.969322E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.980 | TFLOPs: 43.95 | +7: iteration 18730/ 115203 | consumed samples: 4794880 | consumed tokens: 9819914240 | elapsed time per iteration (s): 0.56 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 2.963536E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.540 | TFLOPs: 43.34 | +7: iteration 18740/ 115203 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.966935E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.084 | TFLOPs: 43.67 | +7: iteration 18750/ 115203 | consumed samples: 4800000 | consumed tokens: 9830400000 | elapsed time per iteration (s): 0.57 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.988703E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.736 | TFLOPs: 42.97 | +7: iteration 18760/ 115203 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 0.57 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.973714E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.761 | TFLOPs: 42.78 | +7: iteration 18770/ 115203 | consumed samples: 4805120 | consumed tokens: 9840885760 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.975918E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.430 | TFLOPs: 43.42 | +7: iteration 18780/ 115203 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.976509E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.089 | TFLOPs: 43.96 | +7: iteration 18790/ 115203 | consumed samples: 4810240 | consumed tokens: 9851371520 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.962870E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.938 | TFLOPs: 43.95 | +7: iteration 18800/ 115203 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 0.57 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.971148E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.245 | TFLOPs: 42.83 | +7: iteration 18810/ 115203 | consumed samples: 4815360 | consumed tokens: 9861857280 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.976647E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.076 | TFLOPs: 43.96 | +7: iteration 18820/ 115203 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 0.56 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 2.973032E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.471 | TFLOPs: 43.52 | +7: iteration 18830/ 115203 | consumed samples: 4820480 | consumed tokens: 9872343040 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.972132E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.112 | TFLOPs: 43.96 | +7: iteration 18840/ 115203 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.959943E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.223 | TFLOPs: 43.69 | +7: iteration 18850/ 115203 | consumed samples: 4825600 | consumed tokens: 9882828800 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.970479E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.962 | TFLOPs: 43.66 | +7: iteration 18860/ 115203 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 0.57 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.966390E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.221 | TFLOPs: 43.11 | +7: iteration 18870/ 115203 | consumed samples: 4830720 | consumed tokens: 9893314560 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.970059E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.982 | TFLOPs: 43.95 | +7: iteration 18880/ 115203 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.966175E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.760 | TFLOPs: 43.93 | +7: iteration 18890/ 115203 | consumed samples: 4835840 | consumed tokens: 9903800320 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.980724E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.687 | TFLOPs: 43.92 | +7: iteration 18900/ 115203 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 0.56 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 2.975739E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.966 | TFLOPs: 43.85 | +7: iteration 18910/ 115203 | consumed samples: 4840960 | consumed tokens: 9914286080 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.968950E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.022 | TFLOPs: 43.95 | +7: iteration 18920/ 115203 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.966681E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.971 | TFLOPs: 43.47 | +7: iteration 18930/ 115203 | consumed samples: 4846080 | consumed tokens: 9924771840 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.962940E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.053 | TFLOPs: 43.96 | +7: iteration 18940/ 115203 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.980313E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.084 | TFLOPs: 43.96 | +7: iteration 18950/ 115203 | consumed samples: 4851200 | consumed tokens: 9935257600 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.969471E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.726 | TFLOPs: 43.45 | +7: iteration 18960/ 115203 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.967173E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.012 | TFLOPs: 43.95 | +7: iteration 18970/ 115203 | consumed samples: 4856320 | consumed tokens: 9945743360 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.972485E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.853 | TFLOPs: 43.94 | +7: iteration 18980/ 115203 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.969067E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.095 | TFLOPs: 43.96 | +7: iteration 18990/ 115203 | consumed samples: 4861440 | consumed tokens: 9956229120 | elapsed time per iteration (s): 0.56 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 2.969345E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.630 | TFLOPs: 43.44 | +7: iteration 19000/ 115203 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.960646E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.035 | TFLOPs: 43.95 | +7: iteration 19010/ 115203 | consumed samples: 4866560 | consumed tokens: 9966714880 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.962721E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.089 | TFLOPs: 43.58 | +7: iteration 19020/ 115203 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.953512E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.151 | TFLOPs: 43.97 | +7: iteration 19030/ 115203 | consumed samples: 4871680 | consumed tokens: 9977200640 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.966570E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.927 | TFLOPs: 43.66 | +7: iteration 19040/ 115203 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.959225E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.451 | TFLOPs: 43.71 | +7: iteration 19050/ 115203 | consumed samples: 4876800 | consumed tokens: 9987686400 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.966670E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 19060/ 115203 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 0.55 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.963083E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.275 | TFLOPs: 43.98 | +7: iteration 19070/ 115203 | consumed samples: 4881920 | consumed tokens: 9998172160 | elapsed time per iteration (s): 0.56 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 2.957160E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 19080/ 115203 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 0.55 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.978948E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.346 | TFLOPs: 43.98 | +7: iteration 19090/ 115203 | consumed samples: 4887040 | consumed tokens: 10008657920 | elapsed time per iteration (s): 0.55 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.968820E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.296 | TFLOPs: 43.98 | +7: iteration 19100/ 115203 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 0.55 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.971604E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 19110/ 115203 | consumed samples: 4892160 | consumed tokens: 10019143680 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.967207E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.050 | TFLOPs: 43.96 | +7: iteration 19120/ 115203 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.967072E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.997 | TFLOPs: 43.95 | +7: iteration 19130/ 115203 | consumed samples: 4897280 | consumed tokens: 10029629440 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.962091E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.782 | TFLOPs: 43.93 | +7: iteration 19140/ 115203 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.956221E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.102 | TFLOPs: 43.96 | +7: iteration 19150/ 115203 | consumed samples: 4902400 | consumed tokens: 10040115200 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.947269E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.731 | TFLOPs: 43.45 | +7: iteration 19160/ 115203 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 0.56 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 2.960383E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.937 | TFLOPs: 43.95 | +7: iteration 19170/ 115203 | consumed samples: 4907520 | consumed tokens: 10050600960 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.955017E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.899 | TFLOPs: 43.94 | +7: iteration 19180/ 115203 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.970985E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.827 | TFLOPs: 43.93 | +7: iteration 19190/ 115203 | consumed samples: 4912640 | consumed tokens: 10061086720 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.968391E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.117 | TFLOPs: 43.96 | +7: iteration 19200/ 115203 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.981371E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.560 | TFLOPs: 43.62 | +7: iteration 19210/ 115203 | consumed samples: 4917760 | consumed tokens: 10071572480 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.965921E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.316 | TFLOPs: 43.60 | +7: iteration 19220/ 115203 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.964518E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.258 | TFLOPs: 43.59 | +7: iteration 19230/ 115203 | consumed samples: 4922880 | consumed tokens: 10082058240 | elapsed time per iteration (s): 0.56 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.950209E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.191 | TFLOPs: 43.97 | +7: iteration 19240/ 115203 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 0.55 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 2.955387E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.278 | TFLOPs: 43.98 | +7: iteration 19250/ 115203 | consumed samples: 4928000 | consumed tokens: 10092544000 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.980213E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.136 | TFLOPs: 43.96 | +7: iteration 19260/ 115203 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.959061E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.146 | TFLOPs: 43.68 | +7: iteration 19270/ 115203 | consumed samples: 4933120 | consumed tokens: 10103029760 | elapsed time per iteration (s): 0.57 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.969239E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.458 | TFLOPs: 43.04 | +7: iteration 19280/ 115203 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.946844E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.900 | TFLOPs: 43.66 | +7: iteration 19290/ 115203 | consumed samples: 4938240 | consumed tokens: 10113515520 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.951341E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 19300/ 115203 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.969347E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.006 | TFLOPs: 43.95 | +7: iteration 19310/ 115203 | consumed samples: 4943360 | consumed tokens: 10124001280 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.955511E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.577 | TFLOPs: 43.53 | +7: iteration 19320/ 115203 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.975299E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.451 | TFLOPs: 43.61 | +7: iteration 19330/ 115203 | consumed samples: 4948480 | consumed tokens: 10134487040 | elapsed time per iteration (s): 0.56 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 2.947216E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.926 | TFLOPs: 43.94 | +7: iteration 19340/ 115203 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.964065E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.077 | TFLOPs: 43.96 | +7: iteration 19350/ 115203 | consumed samples: 4953600 | consumed tokens: 10144972800 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.951036E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 19360/ 115203 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 0.57 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.941443E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.864 | TFLOPs: 43.18 | +7: iteration 19370/ 115203 | consumed samples: 4958720 | consumed tokens: 10155458560 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.962931E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.038 | TFLOPs: 43.96 | +7: iteration 19380/ 115203 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.967986E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.498 | TFLOPs: 43.52 | +7: iteration 19390/ 115203 | consumed samples: 4963840 | consumed tokens: 10165944320 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.963853E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.029 | TFLOPs: 43.95 | +7: iteration 19400/ 115203 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.960861E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.376 | TFLOPs: 43.61 | +7: iteration 19410/ 115203 | consumed samples: 4968960 | consumed tokens: 10176430080 | elapsed time per iteration (s): 0.56 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 2.952166E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.095 | TFLOPs: 43.96 | +7: iteration 19420/ 115203 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.952963E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.070 | TFLOPs: 43.96 | +7: iteration 19430/ 115203 | consumed samples: 4974080 | consumed tokens: 10186915840 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.949285E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.821 | TFLOPs: 43.93 | +7: iteration 19440/ 115203 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.953680E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.841 | TFLOPs: 43.94 | +7: iteration 19450/ 115203 | consumed samples: 4979200 | consumed tokens: 10197401600 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.964193E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.424 | TFLOPs: 43.61 | +7: iteration 19460/ 115203 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.951753E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.149 | TFLOPs: 43.97 | +7: iteration 19470/ 115203 | consumed samples: 4984320 | consumed tokens: 10207887360 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.946394E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 19480/ 115203 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.963504E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.081 | TFLOPs: 43.96 | +7: iteration 19490/ 115203 | consumed samples: 4989440 | consumed tokens: 10218373120 | elapsed time per iteration (s): 0.56 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 2.963211E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.005 | TFLOPs: 43.95 | +7: iteration 19500/ 115203 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.962447E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.248 | TFLOPs: 43.98 | +7: iteration 19510/ 115203 | consumed samples: 4994560 | consumed tokens: 10228858880 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.953588E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 19520/ 115203 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.963747E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.044 | TFLOPs: 43.96 | +7: iteration 19530/ 115203 | consumed samples: 4999680 | consumed tokens: 10239344640 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.968189E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.931 | TFLOPs: 43.94 | +7: iteration 19540/ 115203 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.947919E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.030 | TFLOPs: 43.95 | +7: iteration 19550/ 115203 | consumed samples: 5004800 | consumed tokens: 10249830400 | elapsed time per iteration (s): 0.55 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.950439E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 19560/ 115203 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.955361E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 19570/ 115203 | consumed samples: 5009920 | consumed tokens: 10260316160 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.967047E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.177 | TFLOPs: 43.97 | +7: iteration 19580/ 115203 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 0.56 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 2.933581E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.823 | TFLOPs: 43.93 | +7: iteration 19590/ 115203 | consumed samples: 5015040 | consumed tokens: 10270801920 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.955190E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 19600/ 115203 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.950172E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.114 | TFLOPs: 43.96 | +7: iteration 19610/ 115203 | consumed samples: 5020160 | consumed tokens: 10281287680 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.963153E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.215 | TFLOPs: 43.97 | +7: iteration 19620/ 115203 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.964542E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 19630/ 115203 | consumed samples: 5025280 | consumed tokens: 10291773440 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.961283E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.002 | TFLOPs: 43.95 | +7: iteration 19640/ 115203 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.934861E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.137 | TFLOPs: 43.39 | +7: iteration 19650/ 115203 | consumed samples: 5030400 | consumed tokens: 10302259200 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.968277E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.953 | TFLOPs: 43.95 | +7: iteration 19660/ 115203 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 0.56 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 2.942946E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.010 | TFLOPs: 43.95 | +7: iteration 19670/ 115203 | consumed samples: 5035520 | consumed tokens: 10312744960 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.943389E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.457 | TFLOPs: 43.61 | +7: iteration 19680/ 115203 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.953157E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.508 | TFLOPs: 43.33 | +7: iteration 19690/ 115203 | consumed samples: 5040640 | consumed tokens: 10323230720 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.952847E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.894 | TFLOPs: 43.94 | +7: iteration 19700/ 115203 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.949450E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.708 | TFLOPs: 43.92 | +7: iteration 19710/ 115203 | consumed samples: 5045760 | consumed tokens: 10333716480 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.964079E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.850 | TFLOPs: 43.94 | +7: iteration 19720/ 115203 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.960075E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.030 | TFLOPs: 43.95 | +7: iteration 19730/ 115203 | consumed samples: 5050880 | consumed tokens: 10344202240 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.948853E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.308 | TFLOPs: 43.69 | +7: iteration 19740/ 115203 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 0.56 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 2.951925E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.087 | TFLOPs: 43.96 | +7: iteration 19750/ 115203 | consumed samples: 5056000 | consumed tokens: 10354688000 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.941791E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.043 | TFLOPs: 43.96 | +7: iteration 19760/ 115203 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.939077E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.119 | TFLOPs: 43.96 | +7: iteration 19770/ 115203 | consumed samples: 5061120 | consumed tokens: 10365173760 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.956099E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.122 | TFLOPs: 43.96 | +7: iteration 19780/ 115203 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.962960E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.752 | TFLOPs: 43.55 | +7: iteration 19790/ 115203 | consumed samples: 5066240 | consumed tokens: 10375659520 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.954172E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.038 | TFLOPs: 43.67 | +7: iteration 19800/ 115203 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.946465E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.197 | TFLOPs: 43.40 | +7: iteration 19810/ 115203 | consumed samples: 5071360 | consumed tokens: 10386145280 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.959589E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.364 | TFLOPs: 43.51 | +7: iteration 19820/ 115203 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 0.56 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 2.951184E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.015 | TFLOPs: 43.48 | +7: iteration 19830/ 115203 | consumed samples: 5076480 | consumed tokens: 10396631040 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.951337E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.649 | TFLOPs: 43.63 | +7: iteration 19840/ 115203 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.950023E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.913 | TFLOPs: 43.85 | +7: iteration 19850/ 115203 | consumed samples: 5081600 | consumed tokens: 10407116800 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.960905E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 19860/ 115203 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 0.57 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.951596E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.769 | TFLOPs: 43.07 | +7: iteration 19870/ 115203 | consumed samples: 5086720 | consumed tokens: 10417602560 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.957796E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.138 | TFLOPs: 43.96 | +7: iteration 19880/ 115203 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.960034E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.149 | TFLOPs: 43.97 | +7: iteration 19890/ 115203 | consumed samples: 5091840 | consumed tokens: 10428088320 | elapsed time per iteration (s): 0.55 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.947098E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 19900/ 115203 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 0.56 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.947360E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.239 | TFLOPs: 43.97 | +7: iteration 19910/ 115203 | consumed samples: 5096960 | consumed tokens: 10438574080 | elapsed time per iteration (s): 0.55 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 2.963327E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 19920/ 115203 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 0.56 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.950264E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.229 | TFLOPs: 43.97 | +7: iteration 19930/ 115203 | consumed samples: 5102080 | consumed tokens: 10449059840 | elapsed time per iteration (s): 0.56 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.960425E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.209 | TFLOPs: 43.97 | +7: iteration 19940/ 115203 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 0.56 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.954448E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.137 | TFLOPs: 43.96 | +7: iteration 19950/ 115203 | consumed samples: 5107200 | consumed tokens: 10459545600 | elapsed time per iteration (s): 0.55 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.950107E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.448 | TFLOPs: 43.99 | +7: iteration 19960/ 115203 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 0.55 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.959004E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 19970/ 115203 | consumed samples: 5112320 | consumed tokens: 10470031360 | elapsed time per iteration (s): 0.55 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.945635E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 19980/ 115203 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 0.55 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.947365E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.344 | TFLOPs: 43.98 | +7: iteration 19990/ 115203 | consumed samples: 5117440 | consumed tokens: 10480517120 | elapsed time per iteration (s): 0.55 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 2.954418E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +0: [2023-03-16 15:57:51,792] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[0.00018814068619753637, 0.00018814068619753637, 0.00018814068619753637], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 20000/ 115203 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 0.56 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.958269E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.194 | TFLOPs: 43.97 | +0: steps: 20000 loss: 2.9567 iter time (s): 0.556 samples/sec: 460.573 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 20000 | lm loss value: 3.306425E+00 | lm loss PPL: 2.728739E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 20000 to checkpoints_421m60b400m +0: [2023-03-16 15:57:52,003] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! +0: [2023-03-16 15:57:52,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_01-model_00-model_states.pt... +0: [2023-03-16 15:57:52,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_01-model_00-model_states.pt. +0: [2023-03-16 15:57:52,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_03-model_00-model_states.pt... +0: [2023-03-16 15:57:52,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_03-model_00-model_states.pt. +0: [2023-03-16 15:57:52,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_04-model_00-model_states.pt... +0: [2023-03-16 15:57:52,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_04-model_00-model_states.pt. +0: [2023-03-16 15:57:52,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_05-model_00-model_states.pt... +0: [2023-03-16 15:57:52,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_05-model_00-model_states.pt. +0: [2023-03-16 15:57:52,284] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_06-model_00-model_states.pt... +0: [2023-03-16 15:57:52,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_06-model_00-model_states.pt. +0: [2023-03-16 15:57:52,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_07-model_00-model_states.pt... +0: [2023-03-16 15:57:52,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_07-model_00-model_states.pt. +0: [2023-03-16 15:57:52,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_08-model_00-model_states.pt... +0: [2023-03-16 15:57:52,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_08-model_00-model_states.pt. +0: [2023-03-16 15:57:52,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_09-model_00-model_states.pt... +0: [2023-03-16 15:57:52,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_09-model_00-model_states.pt. +0: [2023-03-16 15:57:52,450] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_10-model_00-model_states.pt... +0: [2023-03-16 15:57:52,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_10-model_00-model_states.pt. +0: [2023-03-16 15:57:52,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_11-model_00-model_states.pt... +0: [2023-03-16 15:57:52,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_11-model_00-model_states.pt. +0: [2023-03-16 15:57:52,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_12-model_00-model_states.pt... +0: [2023-03-16 15:57:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_12-model_00-model_states.pt. +0: [2023-03-16 15:57:52,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_13-model_00-model_states.pt... +0: [2023-03-16 15:57:52,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_13-model_00-model_states.pt. +0: [2023-03-16 15:57:52,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_14-model_00-model_states.pt... +0: [2023-03-16 15:57:52,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_14-model_00-model_states.pt. +0: [2023-03-16 15:57:52,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_15-model_00-model_states.pt... +0: [2023-03-16 15:57:52,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_15-model_00-model_states.pt. +0: [2023-03-16 15:57:52,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_16-model_00-model_states.pt... +0: [2023-03-16 15:57:52,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_16-model_00-model_states.pt. +0: [2023-03-16 15:57:52,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_17-model_00-model_states.pt... +0: [2023-03-16 15:57:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_17-model_00-model_states.pt. +0: [2023-03-16 15:57:52,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_18-model_00-model_states.pt... +0: [2023-03-16 15:57:52,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_18-model_00-model_states.pt. +0: [2023-03-16 15:57:52,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_19-model_00-model_states.pt... +0: [2023-03-16 15:57:52,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_19-model_00-model_states.pt. +0: [2023-03-16 15:57:52,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_20-model_00-model_states.pt... +0: [2023-03-16 15:57:52,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_20-model_00-model_states.pt. +0: [2023-03-16 15:57:52,905] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/layer_22-model_00-model_states.pt... +0: [2023-03-16 15:57:52,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/layer_22-model_00-model_states.pt. +0: [2023-03-16 15:57:52,910] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step20000/mp_rank_00_model_states.pt +0: [2023-03-16 15:57:52,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/mp_rank_00_model_states.pt... +0: [2023-03-16 15:57:52,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/mp_rank_00_model_states.pt. +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 15:57:52,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +0: [2023-03-16 15:57:53,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 15:57:53,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-16 15:57:53,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-16 15:57:53,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 15:57:53,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 15:57:53,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-16 15:57:53,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 15:57:53,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 15:57:53,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 15:57:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-16 15:57:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-16 15:57:53,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 15:57:53,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-16 15:57:53,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-16 15:57:53,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 15:57:53,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: successfully saved checkpoint at iteration 20000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1170.46 +7: iteration 20010/ 115203 | consumed samples: 5122560 | consumed tokens: 10491002880 | elapsed time per iteration (s): 0.69 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.949735E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 369.154 | TFLOPs: 35.19 | +7: iteration 20020/ 115203 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 0.55 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.954301E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.522 | TFLOPs: 44.00 | +7: iteration 20030/ 115203 | consumed samples: 5127680 | consumed tokens: 10501488640 | elapsed time per iteration (s): 0.56 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.948278E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.259 | TFLOPs: 43.98 | +7: iteration 20040/ 115203 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 0.56 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.950352E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.205 | TFLOPs: 43.97 | +7: iteration 20050/ 115203 | consumed samples: 5132800 | consumed tokens: 10511974400 | elapsed time per iteration (s): 0.55 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.956869E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.412 | TFLOPs: 43.99 | +7: iteration 20060/ 115203 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 0.56 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.942175E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 20070/ 115203 | consumed samples: 5137920 | consumed tokens: 10522460160 | elapsed time per iteration (s): 0.55 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 2.949540E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.374 | TFLOPs: 43.99 | +7: iteration 20080/ 115203 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 0.56 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.953556E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.077 | TFLOPs: 43.39 | +7: iteration 20090/ 115203 | consumed samples: 5143040 | consumed tokens: 10532945920 | elapsed time per iteration (s): 0.55 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.952024E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.453 | TFLOPs: 43.99 | +7: iteration 20100/ 115203 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 0.55 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.952779E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.503 | TFLOPs: 44.00 | +7: iteration 20110/ 115203 | consumed samples: 5148160 | consumed tokens: 10543431680 | elapsed time per iteration (s): 0.56 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.956085E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 20120/ 115203 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 0.55 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.945380E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.284 | TFLOPs: 43.98 | +7: iteration 20130/ 115203 | consumed samples: 5153280 | consumed tokens: 10553917440 | elapsed time per iteration (s): 0.55 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.932715E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.264 | TFLOPs: 43.98 | +7: iteration 20140/ 115203 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 0.56 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.946507E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.973 | TFLOPs: 43.95 | +7: iteration 20150/ 115203 | consumed samples: 5158400 | consumed tokens: 10564403200 | elapsed time per iteration (s): 0.56 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 2.941989E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.094 | TFLOPs: 43.96 | +7: iteration 20160/ 115203 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 0.55 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.939723E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.304 | TFLOPs: 43.98 | +7: iteration 20170/ 115203 | consumed samples: 5163520 | consumed tokens: 10574888960 | elapsed time per iteration (s): 0.55 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.937359E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 20180/ 115203 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 0.56 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.952659E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.114 | TFLOPs: 43.96 | +7: iteration 20190/ 115203 | consumed samples: 5168640 | consumed tokens: 10585374720 | elapsed time per iteration (s): 0.56 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.961388E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 20200/ 115203 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 0.55 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.937318E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.272 | TFLOPs: 43.98 | +7: iteration 20210/ 115203 | consumed samples: 5173760 | consumed tokens: 10595860480 | elapsed time per iteration (s): 0.55 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.943365E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.422 | TFLOPs: 43.99 | +7: iteration 20220/ 115203 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 0.56 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.944390E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.096 | TFLOPs: 43.96 | +7: iteration 20230/ 115203 | consumed samples: 5178880 | consumed tokens: 10606346240 | elapsed time per iteration (s): 0.55 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 2.951234E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.367 | TFLOPs: 43.99 | +7: iteration 20240/ 115203 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 0.56 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.933043E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 20250/ 115203 | consumed samples: 5184000 | consumed tokens: 10616832000 | elapsed time per iteration (s): 0.56 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.954771E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.103 | TFLOPs: 43.87 | +7: iteration 20260/ 115203 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 0.56 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.944147E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.223 | TFLOPs: 43.69 | +7: iteration 20270/ 115203 | consumed samples: 5189120 | consumed tokens: 10627317760 | elapsed time per iteration (s): 0.55 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.952881E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.343 | TFLOPs: 43.98 | +7: iteration 20280/ 115203 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 0.55 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.957825E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.301 | TFLOPs: 43.98 | +7: iteration 20290/ 115203 | consumed samples: 5194240 | consumed tokens: 10637803520 | elapsed time per iteration (s): 0.55 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.935692E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.641 | TFLOPs: 44.01 | +7: iteration 20300/ 115203 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 0.55 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.945531E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 20310/ 115203 | consumed samples: 5199360 | consumed tokens: 10648289280 | elapsed time per iteration (s): 0.57 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 2.927741E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.167 | TFLOPs: 42.82 | +7: iteration 20320/ 115203 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 0.55 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.945324E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 20330/ 115203 | consumed samples: 5204480 | consumed tokens: 10658775040 | elapsed time per iteration (s): 0.55 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.953496E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.456 | TFLOPs: 43.99 | +7: iteration 20340/ 115203 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 0.56 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.952886E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.390 | TFLOPs: 43.42 | +7: iteration 20350/ 115203 | consumed samples: 5209600 | consumed tokens: 10669260800 | elapsed time per iteration (s): 0.55 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.961355E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.495 | TFLOPs: 44.00 | +7: iteration 20360/ 115203 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 0.56 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.939196E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 20370/ 115203 | consumed samples: 5214720 | consumed tokens: 10679746560 | elapsed time per iteration (s): 0.56 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.935405E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.799 | TFLOPs: 43.93 | +7: iteration 20380/ 115203 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 0.56 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.934697E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.159 | TFLOPs: 43.39 | +7: iteration 20390/ 115203 | consumed samples: 5219840 | consumed tokens: 10690232320 | elapsed time per iteration (s): 0.56 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 2.954504E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.441 | TFLOPs: 43.90 | +7: iteration 20400/ 115203 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.916961E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.488 | TFLOPs: 43.90 | +7: iteration 20410/ 115203 | consumed samples: 5224960 | consumed tokens: 10700718080 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.941566E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.215 | TFLOPs: 43.88 | +7: iteration 20420/ 115203 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.946960E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.517 | TFLOPs: 43.91 | +7: iteration 20430/ 115203 | consumed samples: 5230080 | consumed tokens: 10711203840 | elapsed time per iteration (s): 0.55 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.937798E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.273 | TFLOPs: 43.98 | +7: iteration 20440/ 115203 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.952916E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.247 | TFLOPs: 43.50 | +7: iteration 20450/ 115203 | consumed samples: 5235200 | consumed tokens: 10721689600 | elapsed time per iteration (s): 0.55 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.944300E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.421 | TFLOPs: 43.99 | +7: iteration 20460/ 115203 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.967507E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.690 | TFLOPs: 43.45 | +7: iteration 20470/ 115203 | consumed samples: 5240320 | consumed tokens: 10732175360 | elapsed time per iteration (s): 0.56 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 2.949401E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.964 | TFLOPs: 43.47 | +7: iteration 20480/ 115203 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 0.55 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.934098E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.270 | TFLOPs: 43.98 | +7: iteration 20490/ 115203 | consumed samples: 5245440 | consumed tokens: 10742661120 | elapsed time per iteration (s): 0.56 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.957654E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.088 | TFLOPs: 43.77 | +7: iteration 20500/ 115203 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 0.55 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.942460E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 20510/ 115203 | consumed samples: 5250560 | consumed tokens: 10753146880 | elapsed time per iteration (s): 0.56 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.944902E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.221 | TFLOPs: 43.40 | +7: iteration 20520/ 115203 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 0.55 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.951651E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.477 | TFLOPs: 44.00 | +7: iteration 20530/ 115203 | consumed samples: 5255680 | consumed tokens: 10763632640 | elapsed time per iteration (s): 0.56 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.950009E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.270 | TFLOPs: 43.41 | +7: iteration 20540/ 115203 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 0.55 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.939702E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.453 | TFLOPs: 43.99 | +7: iteration 20550/ 115203 | consumed samples: 5260800 | consumed tokens: 10774118400 | elapsed time per iteration (s): 0.56 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 2.955216E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.787 | TFLOPs: 43.84 | +7: iteration 20560/ 115203 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 0.56 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.945319E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.948 | TFLOPs: 43.76 | +7: iteration 20570/ 115203 | consumed samples: 5265920 | consumed tokens: 10784604160 | elapsed time per iteration (s): 0.57 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.947870E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.530 | TFLOPs: 43.14 | +7: iteration 20580/ 115203 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 0.56 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.939620E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.484 | TFLOPs: 43.52 | +7: iteration 20590/ 115203 | consumed samples: 5271040 | consumed tokens: 10795089920 | elapsed time per iteration (s): 0.57 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.947182E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.456 | TFLOPs: 43.14 | +7: iteration 20600/ 115203 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 0.57 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.954109E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.498 | TFLOPs: 42.76 | +7: iteration 20610/ 115203 | consumed samples: 5276160 | consumed tokens: 10805575680 | elapsed time per iteration (s): 0.57 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.952538E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.666 | TFLOPs: 42.97 | +7: iteration 20620/ 115203 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 0.55 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.944272E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.332 | TFLOPs: 43.98 | +7: iteration 20630/ 115203 | consumed samples: 5281280 | consumed tokens: 10816061440 | elapsed time per iteration (s): 0.56 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 2.944041E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.887 | TFLOPs: 43.75 | +7: iteration 20640/ 115203 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 0.57 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.940092E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.551 | TFLOPs: 42.76 | +7: iteration 20650/ 115203 | consumed samples: 5286400 | consumed tokens: 10826547200 | elapsed time per iteration (s): 0.58 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.939790E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.932 | TFLOPs: 42.42 | +7: iteration 20660/ 115203 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 0.55 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.950694E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.546 | TFLOPs: 44.00 | +7: iteration 20670/ 115203 | consumed samples: 5291520 | consumed tokens: 10837032960 | elapsed time per iteration (s): 0.56 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.935451E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.246 | TFLOPs: 43.78 | +7: iteration 20680/ 115203 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 0.58 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.942161E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.044 | TFLOPs: 42.33 | +7: iteration 20690/ 115203 | consumed samples: 5296640 | consumed tokens: 10847518720 | elapsed time per iteration (s): 0.56 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.948649E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.956 | TFLOPs: 43.28 | +7: iteration 20700/ 115203 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 0.58 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.934460E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.556 | TFLOPs: 42.38 | +7: iteration 20710/ 115203 | consumed samples: 5301760 | consumed tokens: 10858004480 | elapsed time per iteration (s): 0.58 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 2.949249E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.175 | TFLOPs: 41.78 | +7: iteration 20720/ 115203 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.934921E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.291 | TFLOPs: 43.50 | +7: iteration 20730/ 115203 | consumed samples: 5306880 | consumed tokens: 10868490240 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.941155E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.769 | TFLOPs: 43.55 | +7: iteration 20740/ 115203 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.931358E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.353 | TFLOPs: 43.22 | +7: iteration 20750/ 115203 | consumed samples: 5312000 | consumed tokens: 10878976000 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.932212E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.873 | TFLOPs: 43.65 | +7: iteration 20760/ 115203 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.946154E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.144 | TFLOPs: 43.58 | +7: iteration 20770/ 115203 | consumed samples: 5317120 | consumed tokens: 10889461760 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.949149E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.804 | TFLOPs: 43.84 | +7: iteration 20780/ 115203 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.936660E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.642 | TFLOPs: 43.35 | +7: iteration 20790/ 115203 | consumed samples: 5322240 | consumed tokens: 10899947520 | elapsed time per iteration (s): 0.56 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 2.936167E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.910 | TFLOPs: 43.37 | +7: iteration 20800/ 115203 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 0.56 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.942954E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.570 | TFLOPs: 43.62 | +7: iteration 20810/ 115203 | consumed samples: 5327360 | consumed tokens: 10910433280 | elapsed time per iteration (s): 0.57 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.931552E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.061 | TFLOPs: 43.00 | +7: iteration 20820/ 115203 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 0.56 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.925831E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.109 | TFLOPs: 43.49 | +7: iteration 20830/ 115203 | consumed samples: 5332480 | consumed tokens: 10920919040 | elapsed time per iteration (s): 0.55 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.933520E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 20840/ 115203 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 0.56 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.950319E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.177 | TFLOPs: 43.87 | +7: iteration 20850/ 115203 | consumed samples: 5337600 | consumed tokens: 10931404800 | elapsed time per iteration (s): 0.56 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.948138E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.795 | TFLOPs: 43.93 | +7: iteration 20860/ 115203 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 0.55 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 2.942458E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.504 | TFLOPs: 44.00 | +7: iteration 20870/ 115203 | consumed samples: 5342720 | consumed tokens: 10941890560 | elapsed time per iteration (s): 0.56 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.949862E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.341 | TFLOPs: 43.32 | +7: iteration 20880/ 115203 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 0.58 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.946888E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.929 | TFLOPs: 41.94 | +7: iteration 20890/ 115203 | consumed samples: 5347840 | consumed tokens: 10952376320 | elapsed time per iteration (s): 0.57 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.953246E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.794 | TFLOPs: 43.07 | +7: iteration 20900/ 115203 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 0.56 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.933138E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.534 | TFLOPs: 43.72 | +7: iteration 20910/ 115203 | consumed samples: 5352960 | consumed tokens: 10962862080 | elapsed time per iteration (s): 0.56 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.936355E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.119 | TFLOPs: 43.30 | +7: iteration 20920/ 115203 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 0.55 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.938723E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 20930/ 115203 | consumed samples: 5358080 | consumed tokens: 10973347840 | elapsed time per iteration (s): 0.56 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.951964E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.812 | TFLOPs: 43.65 | +7: iteration 20940/ 115203 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 0.56 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 2.936540E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.282 | TFLOPs: 43.60 | +7: iteration 20950/ 115203 | consumed samples: 5363200 | consumed tokens: 10983833600 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.939315E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.366 | TFLOPs: 43.51 | +7: iteration 20960/ 115203 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.926977E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.623 | TFLOPs: 43.44 | +7: iteration 20970/ 115203 | consumed samples: 5368320 | consumed tokens: 10994319360 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.953085E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.217 | TFLOPs: 43.97 | +7: iteration 20980/ 115203 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 0.55 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.929595E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 20990/ 115203 | consumed samples: 5373440 | consumed tokens: 11004805120 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.947701E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.052 | TFLOPs: 43.86 | +7: iteration 21000/ 115203 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.928843E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.027 | TFLOPs: 43.95 | +7: iteration 21010/ 115203 | consumed samples: 5378560 | consumed tokens: 11015290880 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.937226E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.237 | TFLOPs: 43.59 | +7: iteration 21020/ 115203 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 0.56 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 2.943922E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.313 | TFLOPs: 43.70 | +7: iteration 21030/ 115203 | consumed samples: 5383680 | consumed tokens: 11025776640 | elapsed time per iteration (s): 0.55 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.943276E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.452 | TFLOPs: 43.99 | +7: iteration 21040/ 115203 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 0.57 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.944208E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.641 | TFLOPs: 42.87 | +7: iteration 21050/ 115203 | consumed samples: 5388800 | consumed tokens: 11036262400 | elapsed time per iteration (s): 0.55 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.954400E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 21060/ 115203 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 0.57 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.919807E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.579 | TFLOPs: 42.58 | +7: iteration 21070/ 115203 | consumed samples: 5393920 | consumed tokens: 11046748160 | elapsed time per iteration (s): 0.55 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.930057E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.425 | TFLOPs: 43.99 | +7: iteration 21080/ 115203 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 0.56 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.939038E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.772 | TFLOPs: 43.26 | +7: iteration 21090/ 115203 | consumed samples: 5399040 | consumed tokens: 11057233920 | elapsed time per iteration (s): 0.56 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.947651E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.082 | TFLOPs: 43.48 | +7: iteration 21100/ 115203 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 0.55 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 2.915601E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.494 | TFLOPs: 44.00 | +7: iteration 21110/ 115203 | consumed samples: 5404160 | consumed tokens: 11067719680 | elapsed time per iteration (s): 0.56 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.949299E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.784 | TFLOPs: 43.45 | +7: iteration 21120/ 115203 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 0.55 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.929206E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.420 | TFLOPs: 43.99 | +7: iteration 21130/ 115203 | consumed samples: 5409280 | consumed tokens: 11078205440 | elapsed time per iteration (s): 0.57 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.945151E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.704 | TFLOPs: 43.16 | +7: iteration 21140/ 115203 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 0.56 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.936357E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.560 | TFLOPs: 43.72 | +7: iteration 21150/ 115203 | consumed samples: 5414400 | consumed tokens: 11088691200 | elapsed time per iteration (s): 0.56 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.929104E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.230 | TFLOPs: 43.97 | +7: iteration 21160/ 115203 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 0.56 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.943567E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.617 | TFLOPs: 43.44 | +7: iteration 21170/ 115203 | consumed samples: 5419520 | consumed tokens: 11099176960 | elapsed time per iteration (s): 0.56 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 2.933652E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.454 | TFLOPs: 43.42 | +7: iteration 21180/ 115203 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 0.56 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.959283E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.490 | TFLOPs: 43.62 | +7: iteration 21190/ 115203 | consumed samples: 5424640 | consumed tokens: 11109662720 | elapsed time per iteration (s): 0.56 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.945663E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.194 | TFLOPs: 43.68 | +7: iteration 21200/ 115203 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 0.55 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.933225E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.455 | TFLOPs: 43.99 | +7: iteration 21210/ 115203 | consumed samples: 5429760 | consumed tokens: 11120148480 | elapsed time per iteration (s): 0.55 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.932740E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.426 | TFLOPs: 43.99 | +7: iteration 21220/ 115203 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 0.56 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.941222E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.703 | TFLOPs: 43.73 | +7: iteration 21230/ 115203 | consumed samples: 5434880 | consumed tokens: 11130634240 | elapsed time per iteration (s): 0.55 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.931947E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.356 | TFLOPs: 43.99 | +7: iteration 21240/ 115203 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 0.57 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.930013E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.722 | TFLOPs: 42.88 | +7: iteration 21250/ 115203 | consumed samples: 5440000 | consumed tokens: 11141120000 | elapsed time per iteration (s): 0.56 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 2.936039E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.669 | TFLOPs: 43.73 | +7: iteration 21260/ 115203 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 0.55 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.929398E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 21270/ 115203 | consumed samples: 5445120 | consumed tokens: 11151605760 | elapsed time per iteration (s): 0.55 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.948732E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.336 | TFLOPs: 43.98 | +7: iteration 21280/ 115203 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.930988E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.411 | TFLOPs: 43.70 | +7: iteration 21290/ 115203 | consumed samples: 5450240 | consumed tokens: 11162091520 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.934762E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.575 | TFLOPs: 43.82 | +7: iteration 21300/ 115203 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.909889E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.947 | TFLOPs: 43.47 | +7: iteration 21310/ 115203 | consumed samples: 5455360 | consumed tokens: 11172577280 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.929259E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.644 | TFLOPs: 43.63 | +7: iteration 21320/ 115203 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.936548E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.695 | TFLOPs: 43.73 | +7: iteration 21330/ 115203 | consumed samples: 5460480 | consumed tokens: 11183063040 | elapsed time per iteration (s): 0.56 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 2.935511E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.783 | TFLOPs: 43.64 | +7: iteration 21340/ 115203 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 0.56 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.930614E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.771 | TFLOPs: 43.45 | +7: iteration 21350/ 115203 | consumed samples: 5465600 | consumed tokens: 11193548800 | elapsed time per iteration (s): 0.55 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.924862E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.264 | TFLOPs: 43.98 | +7: iteration 21360/ 115203 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 0.56 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.934319E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.968 | TFLOPs: 43.95 | +7: iteration 21370/ 115203 | consumed samples: 5470720 | consumed tokens: 11204034560 | elapsed time per iteration (s): 0.56 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.933862E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.462 | TFLOPs: 43.71 | +7: iteration 21380/ 115203 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 0.58 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.933564E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.434 | TFLOPs: 41.99 | +7: iteration 21390/ 115203 | consumed samples: 5475840 | consumed tokens: 11214520320 | elapsed time per iteration (s): 0.56 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.930453E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.936 | TFLOPs: 43.47 | +7: iteration 21400/ 115203 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 0.56 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 2.933880E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.181 | TFLOPs: 43.40 | +7: iteration 21410/ 115203 | consumed samples: 5480960 | consumed tokens: 11225006080 | elapsed time per iteration (s): 0.57 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.935603E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.212 | TFLOPs: 42.64 | +7: iteration 21420/ 115203 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 0.57 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.936239E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.416 | TFLOPs: 43.13 | +7: iteration 21430/ 115203 | consumed samples: 5486080 | consumed tokens: 11235491840 | elapsed time per iteration (s): 0.56 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.941095E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.453 | TFLOPs: 43.33 | +7: iteration 21440/ 115203 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 0.55 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.940771E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.363 | TFLOPs: 43.99 | +7: iteration 21450/ 115203 | consumed samples: 5491200 | consumed tokens: 11245977600 | elapsed time per iteration (s): 0.57 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.937602E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.260 | TFLOPs: 43.12 | +7: iteration 21460/ 115203 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 0.56 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.927085E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.655 | TFLOPs: 43.73 | +7: iteration 21470/ 115203 | consumed samples: 5496320 | consumed tokens: 11256463360 | elapsed time per iteration (s): 0.55 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.928383E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.391 | TFLOPs: 43.99 | +7: iteration 21480/ 115203 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 0.56 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 2.927554E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.132 | TFLOPs: 43.58 | +7: iteration 21490/ 115203 | consumed samples: 5501440 | consumed tokens: 11266949120 | elapsed time per iteration (s): 0.56 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.931083E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.462 | TFLOPs: 43.71 | +7: iteration 21500/ 115203 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 0.56 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.927590E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.278 | TFLOPs: 43.88 | +7: iteration 21510/ 115203 | consumed samples: 5506560 | consumed tokens: 11277434880 | elapsed time per iteration (s): 0.56 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.930892E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.296 | TFLOPs: 43.41 | +7: iteration 21520/ 115203 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 0.55 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.931255E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 21530/ 115203 | consumed samples: 5511680 | consumed tokens: 11287920640 | elapsed time per iteration (s): 0.56 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.932452E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 21540/ 115203 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 0.55 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.931181E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.278 | TFLOPs: 43.98 | +7: iteration 21550/ 115203 | consumed samples: 5516800 | consumed tokens: 11298406400 | elapsed time per iteration (s): 0.56 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.912609E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.495 | TFLOPs: 43.43 | +7: iteration 21560/ 115203 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 0.55 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 2.933717E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.460 | TFLOPs: 44.00 | +7: iteration 21570/ 115203 | consumed samples: 5521920 | consumed tokens: 11308892160 | elapsed time per iteration (s): 0.56 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.920029E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.681 | TFLOPs: 43.54 | +7: iteration 21580/ 115203 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 0.55 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.937227E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.373 | TFLOPs: 43.99 | +7: iteration 21590/ 115203 | consumed samples: 5527040 | consumed tokens: 11319377920 | elapsed time per iteration (s): 0.58 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.933868E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.016 | TFLOPs: 41.76 | +7: iteration 21600/ 115203 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 0.56 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.919095E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.736 | TFLOPs: 43.74 | +7: iteration 21610/ 115203 | consumed samples: 5532160 | consumed tokens: 11329863680 | elapsed time per iteration (s): 0.57 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.918125E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.618 | TFLOPs: 42.87 | +7: iteration 21620/ 115203 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 0.56 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.939382E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.507 | TFLOPs: 43.43 | +7: iteration 21630/ 115203 | consumed samples: 5537280 | consumed tokens: 11340349440 | elapsed time per iteration (s): 0.56 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 2.932990E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.497 | TFLOPs: 43.71 | +7: iteration 21640/ 115203 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 0.56 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.937618E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.664 | TFLOPs: 43.73 | +7: iteration 21650/ 115203 | consumed samples: 5542400 | consumed tokens: 11350835200 | elapsed time per iteration (s): 0.56 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.930123E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.917 | TFLOPs: 43.47 | +7: iteration 21660/ 115203 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 0.55 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.925261E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.565 | TFLOPs: 44.01 | +7: iteration 21670/ 115203 | consumed samples: 5547520 | consumed tokens: 11361320960 | elapsed time per iteration (s): 0.56 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.931568E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.736 | TFLOPs: 43.74 | +7: iteration 21680/ 115203 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 0.55 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.937477E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.309 | TFLOPs: 43.98 | +7: iteration 21690/ 115203 | consumed samples: 5552640 | consumed tokens: 11371806720 | elapsed time per iteration (s): 0.55 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.939607E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.511 | TFLOPs: 44.00 | +7: iteration 21700/ 115203 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 0.55 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.930490E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.387 | TFLOPs: 43.99 | +7: iteration 21710/ 115203 | consumed samples: 5557760 | consumed tokens: 11382292480 | elapsed time per iteration (s): 0.55 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 2.925193E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 21720/ 115203 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 0.56 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.935405E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.628 | TFLOPs: 43.73 | +7: iteration 21730/ 115203 | consumed samples: 5562880 | consumed tokens: 11392778240 | elapsed time per iteration (s): 0.55 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.937564E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.467 | TFLOPs: 44.00 | +7: iteration 21740/ 115203 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 0.57 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.942195E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.912 | TFLOPs: 42.80 | +7: iteration 21750/ 115203 | consumed samples: 5568000 | consumed tokens: 11403264000 | elapsed time per iteration (s): 0.55 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.939622E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 21760/ 115203 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 0.56 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.937106E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.991 | TFLOPs: 43.38 | +7: iteration 21770/ 115203 | consumed samples: 5573120 | consumed tokens: 11413749760 | elapsed time per iteration (s): 0.56 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.925311E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.391 | TFLOPs: 43.70 | +7: iteration 21780/ 115203 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 0.56 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 2.922697E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.545 | TFLOPs: 43.72 | +7: iteration 21790/ 115203 | consumed samples: 5578240 | consumed tokens: 11424235520 | elapsed time per iteration (s): 0.57 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.921766E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.583 | TFLOPs: 42.77 | +7: iteration 21800/ 115203 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.927077E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.103 | TFLOPs: 43.39 | +7: iteration 21810/ 115203 | consumed samples: 5583360 | consumed tokens: 11434721280 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.927544E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.391 | TFLOPs: 43.51 | +7: iteration 21820/ 115203 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.941831E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.896 | TFLOPs: 43.27 | +7: iteration 21830/ 115203 | consumed samples: 5588480 | consumed tokens: 11445207040 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.912763E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.361 | TFLOPs: 43.60 | +7: iteration 21840/ 115203 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.927940E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.018 | TFLOPs: 43.57 | +7: iteration 21850/ 115203 | consumed samples: 5593600 | consumed tokens: 11455692800 | elapsed time per iteration (s): 0.56 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.934967E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.495 | TFLOPs: 43.24 | +7: iteration 21860/ 115203 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 0.55 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 2.925076E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.573 | TFLOPs: 44.01 | +7: iteration 21870/ 115203 | consumed samples: 5598720 | consumed tokens: 11466178560 | elapsed time per iteration (s): 0.55 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.917498E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.614 | TFLOPs: 44.01 | +7: iteration 21880/ 115203 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 0.55 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.932978E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 21890/ 115203 | consumed samples: 5603840 | consumed tokens: 11476664320 | elapsed time per iteration (s): 0.55 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.943867E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 21900/ 115203 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 0.56 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.932800E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.115 | TFLOPs: 43.77 | +7: iteration 21910/ 115203 | consumed samples: 5608960 | consumed tokens: 11487150080 | elapsed time per iteration (s): 0.56 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.934464E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.534 | TFLOPs: 43.72 | +7: iteration 21920/ 115203 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 0.61 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.928933E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 416.856 | TFLOPs: 39.74 | +7: iteration 21930/ 115203 | consumed samples: 5614080 | consumed tokens: 11497635840 | elapsed time per iteration (s): 0.56 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 2.923269E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.816 | TFLOPs: 43.74 | +7: iteration 21940/ 115203 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 0.56 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.930376E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.042 | TFLOPs: 43.57 | +7: iteration 21950/ 115203 | consumed samples: 5619200 | consumed tokens: 11508121600 | elapsed time per iteration (s): 0.56 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.931169E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.794 | TFLOPs: 43.74 | +7: iteration 21960/ 115203 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 0.56 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.925150E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.652 | TFLOPs: 43.73 | +7: iteration 21970/ 115203 | consumed samples: 5624320 | consumed tokens: 11518607360 | elapsed time per iteration (s): 0.56 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.925191E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.103 | TFLOPs: 43.68 | +7: iteration 21980/ 115203 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 0.57 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.948328E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.041 | TFLOPs: 42.81 | +7: iteration 21990/ 115203 | consumed samples: 5629440 | consumed tokens: 11529093120 | elapsed time per iteration (s): 0.57 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.930918E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.273 | TFLOPs: 42.55 | +0: [2023-03-16 16:16:32,923] [INFO] [logging.py:68:log_dist] [Rank 0] step=22000, skipped=0, lr=[0.00018556333335793902, 0.00018556333335793902, 0.00018556333335793902], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 22000/ 115203 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 0.57 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 2.935423E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.387 | TFLOPs: 43.13 | +0: steps: 22000 loss: 2.9294 iter time (s): 0.558 samples/sec: 458.597 +7: iteration 22010/ 115203 | consumed samples: 5634560 | consumed tokens: 11539578880 | elapsed time per iteration (s): 0.56 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.920821E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.183 | TFLOPs: 43.30 | +7: iteration 22020/ 115203 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 0.57 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.927168E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.337 | TFLOPs: 43.13 | +7: iteration 22030/ 115203 | consumed samples: 5639680 | consumed tokens: 11550064640 | elapsed time per iteration (s): 0.56 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.925021E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.943 | TFLOPs: 43.56 | +7: iteration 22040/ 115203 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 0.57 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.925428E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.881 | TFLOPs: 42.99 | +7: iteration 22050/ 115203 | consumed samples: 5644800 | consumed tokens: 11560550400 | elapsed time per iteration (s): 0.56 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.940694E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.967 | TFLOPs: 43.38 | +7: iteration 22060/ 115203 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 0.57 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.931224E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.563 | TFLOPs: 42.57 | +7: iteration 22070/ 115203 | consumed samples: 5649920 | consumed tokens: 11571036160 | elapsed time per iteration (s): 0.56 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.943567E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.441 | TFLOPs: 43.33 | +7: iteration 22080/ 115203 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 0.56 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 2.926328E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.122 | TFLOPs: 43.39 | +7: iteration 22090/ 115203 | consumed samples: 5655040 | consumed tokens: 11581521920 | elapsed time per iteration (s): 0.56 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.929074E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.215 | TFLOPs: 43.59 | +7: iteration 22100/ 115203 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 0.55 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.931896E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.712 | TFLOPs: 44.02 | +7: iteration 22110/ 115203 | consumed samples: 5660160 | consumed tokens: 11592007680 | elapsed time per iteration (s): 0.56 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.927858E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.376 | TFLOPs: 43.42 | +7: iteration 22120/ 115203 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 0.56 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.943631E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.254 | TFLOPs: 43.40 | +7: iteration 22130/ 115203 | consumed samples: 5665280 | consumed tokens: 11602493440 | elapsed time per iteration (s): 0.55 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.933741E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.652 | TFLOPs: 44.01 | +7: iteration 22140/ 115203 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 0.57 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.923193E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.637 | TFLOPs: 43.15 | +7: iteration 22150/ 115203 | consumed samples: 5670400 | consumed tokens: 11612979200 | elapsed time per iteration (s): 0.56 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 2.922620E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.991 | TFLOPs: 43.57 | +7: iteration 22160/ 115203 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.910115E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.739 | TFLOPs: 43.74 | +7: iteration 22170/ 115203 | consumed samples: 5675520 | consumed tokens: 11623464960 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.929136E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.828 | TFLOPs: 43.74 | +7: iteration 22180/ 115203 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.946293E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.648 | TFLOPs: 43.92 | +7: iteration 22190/ 115203 | consumed samples: 5680640 | consumed tokens: 11633950720 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.929725E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.092 | TFLOPs: 43.67 | +7: iteration 22200/ 115203 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.918684E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.684 | TFLOPs: 43.64 | +7: iteration 22210/ 115203 | consumed samples: 5685760 | consumed tokens: 11644436480 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.924737E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.794 | TFLOPs: 43.65 | +7: iteration 22220/ 115203 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.928729E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.547 | TFLOPs: 43.72 | +7: iteration 22230/ 115203 | consumed samples: 5690880 | consumed tokens: 11654922240 | elapsed time per iteration (s): 0.56 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 2.924021E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.351 | TFLOPs: 43.41 | +7: iteration 22240/ 115203 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 0.57 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.933882E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.336 | TFLOPs: 43.13 | +7: iteration 22250/ 115203 | consumed samples: 5696000 | consumed tokens: 11665408000 | elapsed time per iteration (s): 0.56 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.921906E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.627 | TFLOPs: 43.34 | +7: iteration 22260/ 115203 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 0.57 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.932705E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.701 | TFLOPs: 43.06 | +7: iteration 22270/ 115203 | consumed samples: 5701120 | consumed tokens: 11675893760 | elapsed time per iteration (s): 0.55 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.918804E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.770 | TFLOPs: 44.02 | +7: iteration 22280/ 115203 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 0.55 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.934674E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.515 | TFLOPs: 44.00 | +7: iteration 22290/ 115203 | consumed samples: 5706240 | consumed tokens: 11686379520 | elapsed time per iteration (s): 0.56 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.921165E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.356 | TFLOPs: 43.60 | +7: iteration 22300/ 115203 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 0.58 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 2.924129E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.706 | TFLOPs: 42.02 | +7: iteration 22310/ 115203 | consumed samples: 5711360 | consumed tokens: 11696865280 | elapsed time per iteration (s): 0.57 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.939986E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.764 | TFLOPs: 42.98 | +7: iteration 22320/ 115203 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.923097E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.804 | TFLOPs: 43.74 | +7: iteration 22330/ 115203 | consumed samples: 5716480 | consumed tokens: 11707351040 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.919581E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.124 | TFLOPs: 43.68 | +7: iteration 22340/ 115203 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.924521E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.164 | TFLOPs: 43.20 | +7: iteration 22350/ 115203 | consumed samples: 5721600 | consumed tokens: 11717836800 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.917586E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.104 | TFLOPs: 43.48 | +7: iteration 22360/ 115203 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.914495E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.217 | TFLOPs: 43.59 | +7: iteration 22370/ 115203 | consumed samples: 5726720 | consumed tokens: 11728322560 | elapsed time per iteration (s): 0.56 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 2.916959E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.913 | TFLOPs: 43.56 | +7: iteration 22380/ 115203 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 0.56 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.935707E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.165 | TFLOPs: 43.20 | +7: iteration 22390/ 115203 | consumed samples: 5731840 | consumed tokens: 11738808320 | elapsed time per iteration (s): 0.56 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.925529E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.253 | TFLOPs: 43.59 | +7: iteration 22400/ 115203 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 0.56 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.927968E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.612 | TFLOPs: 43.82 | +7: iteration 22410/ 115203 | consumed samples: 5736960 | consumed tokens: 11749294080 | elapsed time per iteration (s): 0.56 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.924484E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.569 | TFLOPs: 43.53 | +7: iteration 22420/ 115203 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 0.57 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.921154E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.737 | TFLOPs: 42.88 | +7: iteration 22430/ 115203 | consumed samples: 5742080 | consumed tokens: 11759779840 | elapsed time per iteration (s): 0.59 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.906122E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.432 | TFLOPs: 41.13 | +7: iteration 22440/ 115203 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 0.57 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.906947E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.881 | TFLOPs: 42.70 | +7: iteration 22450/ 115203 | consumed samples: 5747200 | consumed tokens: 11770265600 | elapsed time per iteration (s): 0.57 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 2.919645E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.465 | TFLOPs: 42.76 | +7: iteration 22460/ 115203 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 0.60 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.926088E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.118 | TFLOPs: 40.63 | +7: iteration 22470/ 115203 | consumed samples: 5752320 | consumed tokens: 11780751360 | elapsed time per iteration (s): 0.59 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.920603E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.991 | TFLOPs: 41.19 | +7: iteration 22480/ 115203 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 0.56 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.910179E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.460 | TFLOPs: 43.23 | +7: iteration 22490/ 115203 | consumed samples: 5757440 | consumed tokens: 11791237120 | elapsed time per iteration (s): 0.58 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.928367E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.839 | TFLOPs: 41.84 | +7: iteration 22500/ 115203 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 0.58 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.934801E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.731 | TFLOPs: 41.73 | +7: iteration 22510/ 115203 | consumed samples: 5762560 | consumed tokens: 11801722880 | elapsed time per iteration (s): 0.59 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.933708E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.962 | TFLOPs: 41.28 | +7: iteration 22520/ 115203 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 0.58 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 2.909022E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.003 | TFLOPs: 41.95 | +7: iteration 22530/ 115203 | consumed samples: 5767680 | consumed tokens: 11812208640 | elapsed time per iteration (s): 0.59 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.930426E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.637 | TFLOPs: 41.15 | +7: iteration 22540/ 115203 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 0.58 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.925761E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.738 | TFLOPs: 42.21 | +7: iteration 22550/ 115203 | consumed samples: 5772800 | consumed tokens: 11822694400 | elapsed time per iteration (s): 0.58 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.919694E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.244 | TFLOPs: 42.16 | +7: iteration 22560/ 115203 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 0.57 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.922191E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.242 | TFLOPs: 42.45 | +7: iteration 22570/ 115203 | consumed samples: 5777920 | consumed tokens: 11833180160 | elapsed time per iteration (s): 0.58 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.919619E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.627 | TFLOPs: 42.20 | +7: iteration 22580/ 115203 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 0.57 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.922288E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.711 | TFLOPs: 42.78 | +7: iteration 22590/ 115203 | consumed samples: 5783040 | consumed tokens: 11843665920 | elapsed time per iteration (s): 0.57 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 2.917471E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.756 | TFLOPs: 43.07 | +7: iteration 22600/ 115203 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 0.57 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.923513E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.950 | TFLOPs: 42.61 | +7: iteration 22610/ 115203 | consumed samples: 5788160 | consumed tokens: 11854151680 | elapsed time per iteration (s): 0.58 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.916109E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.741 | TFLOPs: 41.73 | +7: iteration 22620/ 115203 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 0.58 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.925503E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.599 | TFLOPs: 42.01 | +7: iteration 22630/ 115203 | consumed samples: 5793280 | consumed tokens: 11864637440 | elapsed time per iteration (s): 0.58 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.918221E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.116 | TFLOPs: 42.34 | +7: iteration 22640/ 115203 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 0.58 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.923510E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.231 | TFLOPs: 42.26 | +7: iteration 22650/ 115203 | consumed samples: 5798400 | consumed tokens: 11875123200 | elapsed time per iteration (s): 0.57 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.916371E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.893 | TFLOPs: 42.89 | +7: iteration 22660/ 115203 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 0.59 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 2.910161E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.514 | TFLOPs: 41.71 | +7: iteration 22670/ 115203 | consumed samples: 5803520 | consumed tokens: 11885608960 | elapsed time per iteration (s): 0.59 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.918105E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.532 | TFLOPs: 41.43 | +7: iteration 22680/ 115203 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 0.58 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.931390E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.011 | TFLOPs: 41.95 | +7: iteration 22690/ 115203 | consumed samples: 5808640 | consumed tokens: 11896094720 | elapsed time per iteration (s): 0.60 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.915443E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.351 | TFLOPs: 40.74 | +7: iteration 22700/ 115203 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 0.58 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.934958E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.572 | TFLOPs: 41.81 | +7: iteration 22710/ 115203 | consumed samples: 5813760 | consumed tokens: 11906580480 | elapsed time per iteration (s): 0.58 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.926098E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.759 | TFLOPs: 42.02 | +7: iteration 22720/ 115203 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 0.57 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.928507E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.243 | TFLOPs: 42.54 | +7: iteration 22730/ 115203 | consumed samples: 5818880 | consumed tokens: 11917066240 | elapsed time per iteration (s): 0.58 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.914351E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.665 | TFLOPs: 42.11 | +7: iteration 22740/ 115203 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 0.57 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 2.902570E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.602 | TFLOPs: 42.48 | +7: iteration 22750/ 115203 | consumed samples: 5824000 | consumed tokens: 11927552000 | elapsed time per iteration (s): 0.58 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.926408E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.735 | TFLOPs: 42.02 | +7: iteration 22760/ 115203 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 0.57 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.910956E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.653 | TFLOPs: 42.68 | +7: iteration 22770/ 115203 | consumed samples: 5829120 | consumed tokens: 11938037760 | elapsed time per iteration (s): 0.58 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.912341E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.680 | TFLOPs: 42.30 | +7: iteration 22780/ 115203 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 0.57 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.918930E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.515 | TFLOPs: 42.57 | +7: iteration 22790/ 115203 | consumed samples: 5834240 | consumed tokens: 11948523520 | elapsed time per iteration (s): 0.57 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.915403E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.532 | TFLOPs: 42.57 | +7: iteration 22800/ 115203 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 0.58 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.906240E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.893 | TFLOPs: 42.32 | +7: iteration 22810/ 115203 | consumed samples: 5839360 | consumed tokens: 11959009280 | elapsed time per iteration (s): 0.57 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 2.918464E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.900 | TFLOPs: 43.08 | +7: iteration 22820/ 115203 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 0.58 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.929701E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.847 | TFLOPs: 42.41 | +7: iteration 22830/ 115203 | consumed samples: 5844480 | consumed tokens: 11969495040 | elapsed time per iteration (s): 0.59 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.920328E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.738 | TFLOPs: 41.35 | +7: iteration 22840/ 115203 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 0.58 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.931848E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.355 | TFLOPs: 41.89 | +7: iteration 22850/ 115203 | consumed samples: 5849600 | consumed tokens: 11979980800 | elapsed time per iteration (s): 0.58 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.910315E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.645 | TFLOPs: 42.20 | +7: iteration 22860/ 115203 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 0.58 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.920452E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.435 | TFLOPs: 42.28 | +7: iteration 22870/ 115203 | consumed samples: 5854720 | consumed tokens: 11990466560 | elapsed time per iteration (s): 0.57 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.922242E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.829 | TFLOPs: 42.89 | +7: iteration 22880/ 115203 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 0.59 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 2.913537E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.407 | TFLOPs: 41.61 | +7: iteration 22890/ 115203 | consumed samples: 5859840 | consumed tokens: 12000952320 | elapsed time per iteration (s): 0.59 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.919658E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.726 | TFLOPs: 41.64 | +7: iteration 22900/ 115203 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 0.58 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.910795E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.975 | TFLOPs: 42.33 | +7: iteration 22910/ 115203 | consumed samples: 5864960 | consumed tokens: 12011438080 | elapsed time per iteration (s): 0.57 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.905257E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.261 | TFLOPs: 42.55 | +7: iteration 22920/ 115203 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 0.58 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.916557E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.015 | TFLOPs: 42.24 | +7: iteration 22930/ 115203 | consumed samples: 5870080 | consumed tokens: 12021923840 | elapsed time per iteration (s): 0.56 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.916777E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.417 | TFLOPs: 43.23 | +7: iteration 22940/ 115203 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 0.56 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.909559E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.701 | TFLOPs: 43.54 | +7: iteration 22950/ 115203 | consumed samples: 5875200 | consumed tokens: 12032409600 | elapsed time per iteration (s): 0.59 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 2.916529E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.036 | TFLOPs: 41.48 | +7: iteration 22960/ 115203 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 0.56 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.918977E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.371 | TFLOPs: 43.22 | +7: iteration 22970/ 115203 | consumed samples: 5880320 | consumed tokens: 12042895360 | elapsed time per iteration (s): 0.59 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.922792E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.414 | TFLOPs: 41.13 | +7: iteration 22980/ 115203 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 0.59 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.923029E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.834 | TFLOPs: 41.55 | +7: iteration 22990/ 115203 | consumed samples: 5885440 | consumed tokens: 12053381120 | elapsed time per iteration (s): 0.58 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.912948E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.882 | TFLOPs: 41.84 | +7: iteration 23000/ 115203 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 0.57 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.921344E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.517 | TFLOPs: 42.86 | +7: iteration 23010/ 115203 | consumed samples: 5890560 | consumed tokens: 12063866880 | elapsed time per iteration (s): 0.59 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.913457E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.143 | TFLOPs: 41.10 | +7: iteration 23020/ 115203 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 0.56 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 2.926163E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.852 | TFLOPs: 43.37 | +7: iteration 23030/ 115203 | consumed samples: 5895680 | consumed tokens: 12074352640 | elapsed time per iteration (s): 0.59 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.915728E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.595 | TFLOPs: 41.72 | +7: iteration 23040/ 115203 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 0.57 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.899438E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.756 | TFLOPs: 42.59 | +7: iteration 23050/ 115203 | consumed samples: 5900800 | consumed tokens: 12084838400 | elapsed time per iteration (s): 0.58 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.900510E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.014 | TFLOPs: 42.24 | +7: iteration 23060/ 115203 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 0.58 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.907426E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.149 | TFLOPs: 42.06 | +7: iteration 23070/ 115203 | consumed samples: 5905920 | consumed tokens: 12095324160 | elapsed time per iteration (s): 0.58 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.908296E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.156 | TFLOPs: 41.87 | +7: iteration 23080/ 115203 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 0.56 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.912775E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.147 | TFLOPs: 43.58 | +7: iteration 23090/ 115203 | consumed samples: 5911040 | consumed tokens: 12105809920 | elapsed time per iteration (s): 0.58 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 2.911103E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.999 | TFLOPs: 42.14 | +7: iteration 23100/ 115203 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 0.58 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.913797E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.118 | TFLOPs: 41.96 | +7: iteration 23110/ 115203 | consumed samples: 5916160 | consumed tokens: 12116295680 | elapsed time per iteration (s): 0.56 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.924407E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.470 | TFLOPs: 43.33 | +7: iteration 23120/ 115203 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 0.58 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.910787E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.871 | TFLOPs: 42.13 | +7: iteration 23130/ 115203 | consumed samples: 5921280 | consumed tokens: 12126781440 | elapsed time per iteration (s): 0.59 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.914629E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.314 | TFLOPs: 41.03 | +7: iteration 23140/ 115203 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 0.57 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.897600E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.243 | TFLOPs: 42.64 | +7: iteration 23150/ 115203 | consumed samples: 5926400 | consumed tokens: 12137267200 | elapsed time per iteration (s): 0.59 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.911791E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.047 | TFLOPs: 41.48 | +7: iteration 23160/ 115203 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 0.58 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 2.916717E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.196 | TFLOPs: 42.16 | +7: iteration 23170/ 115203 | consumed samples: 5931520 | consumed tokens: 12147752960 | elapsed time per iteration (s): 0.60 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.907744E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.887 | TFLOPs: 40.79 | +7: iteration 23180/ 115203 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 0.59 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.903769E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.521 | TFLOPs: 41.71 | +7: iteration 23190/ 115203 | consumed samples: 5936640 | consumed tokens: 12158238720 | elapsed time per iteration (s): 0.58 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.909584E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.349 | TFLOPs: 42.36 | +7: iteration 23200/ 115203 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 0.57 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.916380E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.361 | TFLOPs: 42.65 | +7: iteration 23210/ 115203 | consumed samples: 5941760 | consumed tokens: 12168724480 | elapsed time per iteration (s): 0.59 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.908645E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.505 | TFLOPs: 41.71 | +7: iteration 23220/ 115203 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 0.58 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.916230E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.063 | TFLOPs: 42.05 | +7: iteration 23230/ 115203 | consumed samples: 5946880 | consumed tokens: 12179210240 | elapsed time per iteration (s): 0.57 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 2.914952E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.441 | TFLOPs: 42.56 | +7: iteration 23240/ 115203 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 0.56 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.918053E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.556 | TFLOPs: 43.53 | +7: iteration 23250/ 115203 | consumed samples: 5952000 | consumed tokens: 12189696000 | elapsed time per iteration (s): 0.58 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.918820E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.518 | TFLOPs: 42.09 | +7: iteration 23260/ 115203 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 0.58 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.913133E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.897 | TFLOPs: 42.42 | +7: iteration 23270/ 115203 | consumed samples: 5957120 | consumed tokens: 12200181760 | elapsed time per iteration (s): 0.58 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.916467E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.926 | TFLOPs: 42.42 | +7: iteration 23280/ 115203 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 0.58 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.914853E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.392 | TFLOPs: 42.37 | +7: iteration 23290/ 115203 | consumed samples: 5962240 | consumed tokens: 12210667520 | elapsed time per iteration (s): 0.57 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.914398E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.595 | TFLOPs: 42.77 | +7: iteration 23300/ 115203 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 0.58 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.898492E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.111 | TFLOPs: 42.44 | +7: iteration 23310/ 115203 | consumed samples: 5967360 | consumed tokens: 12221153280 | elapsed time per iteration (s): 0.59 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 2.913915E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.804 | TFLOPs: 41.45 | +7: iteration 23320/ 115203 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 0.58 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.900215E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.801 | TFLOPs: 41.93 | +7: iteration 23330/ 115203 | consumed samples: 5972480 | consumed tokens: 12231639040 | elapsed time per iteration (s): 0.58 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.923480E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.462 | TFLOPs: 41.99 | +7: iteration 23340/ 115203 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 0.56 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.914820E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.784 | TFLOPs: 43.55 | +7: iteration 23350/ 115203 | consumed samples: 5977600 | consumed tokens: 12242124800 | elapsed time per iteration (s): 0.57 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.905727E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.088 | TFLOPs: 42.82 | +7: iteration 23360/ 115203 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 0.57 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.907038E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.238 | TFLOPs: 42.83 | +7: iteration 23370/ 115203 | consumed samples: 5982720 | consumed tokens: 12252610560 | elapsed time per iteration (s): 0.58 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.920708E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.516 | TFLOPs: 42.28 | +7: iteration 23380/ 115203 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 0.56 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 2.915037E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.754 | TFLOPs: 43.64 | +7: iteration 23390/ 115203 | consumed samples: 5987840 | consumed tokens: 12263096320 | elapsed time per iteration (s): 0.56 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.924028E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.287 | TFLOPs: 43.69 | +7: iteration 23400/ 115203 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 0.58 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.902831E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.804 | TFLOPs: 42.03 | +7: iteration 23410/ 115203 | consumed samples: 5992960 | consumed tokens: 12273582080 | elapsed time per iteration (s): 0.55 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.920952E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.488 | TFLOPs: 44.00 | +7: iteration 23420/ 115203 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 0.59 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.916086E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.580 | TFLOPs: 41.43 | +7: iteration 23430/ 115203 | consumed samples: 5998080 | consumed tokens: 12284067840 | elapsed time per iteration (s): 0.58 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.908451E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.516 | TFLOPs: 42.00 | +7: iteration 23440/ 115203 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 0.57 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.918182E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.991 | TFLOPs: 42.81 | +7: iteration 23450/ 115203 | consumed samples: 6003200 | consumed tokens: 12294553600 | elapsed time per iteration (s): 0.58 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 2.918587E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.761 | TFLOPs: 42.40 | +7: iteration 23460/ 115203 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 0.56 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.912082E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.209 | TFLOPs: 43.21 | +7: iteration 23470/ 115203 | consumed samples: 6008320 | consumed tokens: 12305039360 | elapsed time per iteration (s): 0.58 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.903534E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.368 | TFLOPs: 41.98 | +7: iteration 23480/ 115203 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 0.58 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.894172E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.539 | TFLOPs: 42.38 | +7: iteration 23490/ 115203 | consumed samples: 6013440 | consumed tokens: 12315525120 | elapsed time per iteration (s): 0.57 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.901332E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.955 | TFLOPs: 42.80 | +7: iteration 23500/ 115203 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 0.58 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.904463E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.004 | TFLOPs: 42.04 | +7: iteration 23510/ 115203 | consumed samples: 6018560 | consumed tokens: 12326010880 | elapsed time per iteration (s): 0.58 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.902290E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.626 | TFLOPs: 42.20 | +7: iteration 23520/ 115203 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 0.57 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 2.903031E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.942 | TFLOPs: 42.71 | +7: iteration 23530/ 115203 | consumed samples: 6023680 | consumed tokens: 12336496640 | elapsed time per iteration (s): 0.58 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.924039E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.185 | TFLOPs: 42.44 | +7: iteration 23540/ 115203 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 0.57 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.909581E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.737 | TFLOPs: 42.97 | +7: iteration 23550/ 115203 | consumed samples: 6028800 | consumed tokens: 12346982400 | elapsed time per iteration (s): 0.57 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.896925E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.100 | TFLOPs: 42.63 | +7: iteration 23560/ 115203 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 0.56 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.905128E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.744 | TFLOPs: 43.55 | +7: iteration 23570/ 115203 | consumed samples: 6033920 | consumed tokens: 12357468160 | elapsed time per iteration (s): 0.55 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.914206E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 23580/ 115203 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 0.58 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.916865E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.987 | TFLOPs: 42.33 | +7: iteration 23590/ 115203 | consumed samples: 6039040 | consumed tokens: 12367953920 | elapsed time per iteration (s): 0.58 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 2.908340E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.160 | TFLOPs: 41.77 | +7: iteration 23600/ 115203 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 0.57 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.898183E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.096 | TFLOPs: 42.53 | +7: iteration 23610/ 115203 | consumed samples: 6044160 | consumed tokens: 12378439680 | elapsed time per iteration (s): 0.57 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.929136E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.095 | TFLOPs: 43.20 | +7: iteration 23620/ 115203 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 0.55 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.904704E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 23630/ 115203 | consumed samples: 6049280 | consumed tokens: 12388925440 | elapsed time per iteration (s): 0.57 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.903384E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.250 | TFLOPs: 42.64 | +7: iteration 23640/ 115203 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 0.57 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.910020E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.861 | TFLOPs: 42.60 | +7: iteration 23650/ 115203 | consumed samples: 6054400 | consumed tokens: 12399411200 | elapsed time per iteration (s): 0.57 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 2.910309E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.637 | TFLOPs: 42.77 | +7: iteration 23660/ 115203 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 0.58 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.913817E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.164 | TFLOPs: 42.06 | +7: iteration 23670/ 115203 | consumed samples: 6059520 | consumed tokens: 12409896960 | elapsed time per iteration (s): 0.60 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.921574E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.158 | TFLOPs: 40.82 | +7: iteration 23680/ 115203 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 0.57 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.909282E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.142 | TFLOPs: 43.01 | +7: iteration 23690/ 115203 | consumed samples: 6064640 | consumed tokens: 12420382720 | elapsed time per iteration (s): 0.57 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.900540E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.616 | TFLOPs: 42.68 | +7: iteration 23700/ 115203 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 0.57 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.899193E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.007 | TFLOPs: 42.52 | +7: iteration 23710/ 115203 | consumed samples: 6069760 | consumed tokens: 12430868480 | elapsed time per iteration (s): 0.57 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.914218E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.255 | TFLOPs: 42.74 | +7: iteration 23720/ 115203 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 0.57 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 2.914223E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.409 | TFLOPs: 42.75 | +7: iteration 23730/ 115203 | consumed samples: 6074880 | consumed tokens: 12441354240 | elapsed time per iteration (s): 0.56 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.906344E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.749 | TFLOPs: 43.64 | +7: iteration 23740/ 115203 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 0.60 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.896970E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.369 | TFLOPs: 40.55 | +7: iteration 23750/ 115203 | consumed samples: 6080000 | consumed tokens: 12451840000 | elapsed time per iteration (s): 0.57 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.913400E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.939 | TFLOPs: 42.99 | +7: iteration 23760/ 115203 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 0.56 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.921245E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.577 | TFLOPs: 43.62 | +7: iteration 23770/ 115203 | consumed samples: 6085120 | consumed tokens: 12462325760 | elapsed time per iteration (s): 0.57 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.906310E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.356 | TFLOPs: 42.56 | +7: iteration 23780/ 115203 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 0.58 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.905031E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.062 | TFLOPs: 42.43 | +7: iteration 23790/ 115203 | consumed samples: 6090240 | consumed tokens: 12472811520 | elapsed time per iteration (s): 0.58 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 2.917290E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.042 | TFLOPs: 41.95 | +7: iteration 23800/ 115203 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 0.58 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.899177E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.619 | TFLOPs: 42.01 | +7: iteration 23810/ 115203 | consumed samples: 6095360 | consumed tokens: 12483297280 | elapsed time per iteration (s): 0.56 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.911411E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.753 | TFLOPs: 43.55 | +7: iteration 23820/ 115203 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 0.59 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.886833E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.498 | TFLOPs: 41.71 | +7: iteration 23830/ 115203 | consumed samples: 6100480 | consumed tokens: 12493783040 | elapsed time per iteration (s): 0.57 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.901504E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.008 | TFLOPs: 43.19 | +7: iteration 23840/ 115203 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 0.58 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.907220E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.061 | TFLOPs: 41.95 | +7: iteration 23850/ 115203 | consumed samples: 6105600 | consumed tokens: 12504268800 | elapsed time per iteration (s): 0.57 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.909690E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.302 | TFLOPs: 42.93 | +7: iteration 23860/ 115203 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 0.58 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 2.905911E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.746 | TFLOPs: 42.12 | +7: iteration 23870/ 115203 | consumed samples: 6110720 | consumed tokens: 12514754560 | elapsed time per iteration (s): 0.57 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.917764E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.300 | TFLOPs: 42.84 | +7: iteration 23880/ 115203 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 0.56 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.921822E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.463 | TFLOPs: 43.71 | +7: iteration 23890/ 115203 | consumed samples: 6115840 | consumed tokens: 12525240320 | elapsed time per iteration (s): 0.57 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.897245E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.759 | TFLOPs: 43.07 | +7: iteration 23900/ 115203 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 0.57 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.903460E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.802 | TFLOPs: 43.07 | +7: iteration 23910/ 115203 | consumed samples: 6120960 | consumed tokens: 12535726080 | elapsed time per iteration (s): 0.58 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.898057E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.310 | TFLOPs: 42.07 | +7: iteration 23920/ 115203 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 0.57 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.918012E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.840 | TFLOPs: 42.70 | +7: iteration 23930/ 115203 | consumed samples: 6126080 | consumed tokens: 12546211840 | elapsed time per iteration (s): 0.57 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 2.899196E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.473 | TFLOPs: 43.04 | +7: iteration 23940/ 115203 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 0.59 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.897775E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.328 | TFLOPs: 41.31 | +7: iteration 23950/ 115203 | consumed samples: 6131200 | consumed tokens: 12556697600 | elapsed time per iteration (s): 0.57 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.914379E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.649 | TFLOPs: 42.58 | +7: iteration 23960/ 115203 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 0.58 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.918879E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.002 | TFLOPs: 42.43 | +7: iteration 23970/ 115203 | consumed samples: 6136320 | consumed tokens: 12567183360 | elapsed time per iteration (s): 0.57 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.905061E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.404 | TFLOPs: 42.85 | +7: iteration 23980/ 115203 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 0.58 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.916586E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.876 | TFLOPs: 42.41 | +7: iteration 23990/ 115203 | consumed samples: 6141440 | consumed tokens: 12577669120 | elapsed time per iteration (s): 0.58 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.904406E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.299 | TFLOPs: 42.07 | +0: [2023-03-16 16:35:39,250] [INFO] [logging.py:68:log_dist] [Rank 0] step=24000, skipped=0, lr=[0.00018275670559336077, 0.00018275670559336077, 0.00018275670559336077], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 24000/ 115203 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 0.57 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 2.910414E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.199 | TFLOPs: 43.11 | +0: steps: 24000 loss: 2.9020 iter time (s): 0.571 samples/sec: 448.023 +7: iteration 24010/ 115203 | consumed samples: 6146560 | consumed tokens: 12588154880 | elapsed time per iteration (s): 0.57 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.902856E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.368 | TFLOPs: 42.56 | +7: iteration 24020/ 115203 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 0.61 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.902936E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.733 | TFLOPs: 40.21 | +7: iteration 24030/ 115203 | consumed samples: 6151680 | consumed tokens: 12598640640 | elapsed time per iteration (s): 0.58 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.886574E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.181 | TFLOPs: 42.44 | +7: iteration 24040/ 115203 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 0.57 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.895666E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.546 | TFLOPs: 42.57 | +7: iteration 24050/ 115203 | consumed samples: 6156800 | consumed tokens: 12609126400 | elapsed time per iteration (s): 0.56 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.897434E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.196 | TFLOPs: 43.68 | +7: iteration 24060/ 115203 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 0.57 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.911580E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.411 | TFLOPs: 42.66 | +7: iteration 24070/ 115203 | consumed samples: 6161920 | consumed tokens: 12619612160 | elapsed time per iteration (s): 0.57 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 2.904777E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.749 | TFLOPs: 42.50 | +7: iteration 24080/ 115203 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 0.57 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.914962E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.196 | TFLOPs: 43.02 | +7: iteration 24090/ 115203 | consumed samples: 6167040 | consumed tokens: 12630097920 | elapsed time per iteration (s): 0.56 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.909569E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.502 | TFLOPs: 43.43 | +7: iteration 24100/ 115203 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 0.58 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.918295E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.453 | TFLOPs: 42.37 | +7: iteration 24110/ 115203 | consumed samples: 6172160 | consumed tokens: 12640583680 | elapsed time per iteration (s): 0.57 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.915412E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.640 | TFLOPs: 42.49 | +7: iteration 24120/ 115203 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 0.57 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.898826E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.994 | TFLOPs: 43.19 | +7: iteration 24130/ 115203 | consumed samples: 6177280 | consumed tokens: 12651069440 | elapsed time per iteration (s): 0.56 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.903045E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.597 | TFLOPs: 43.53 | +7: iteration 24140/ 115203 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 0.56 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 2.922229E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.160 | TFLOPs: 43.87 | +7: iteration 24150/ 115203 | consumed samples: 6182400 | consumed tokens: 12661555200 | elapsed time per iteration (s): 0.57 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.902910E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.874 | TFLOPs: 42.51 | +7: iteration 24160/ 115203 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 0.60 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.902359E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.533 | TFLOPs: 40.95 | +7: iteration 24170/ 115203 | consumed samples: 6187520 | consumed tokens: 12672040960 | elapsed time per iteration (s): 0.58 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.903457E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.504 | TFLOPs: 42.09 | +7: iteration 24180/ 115203 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 0.59 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.885056E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.455 | TFLOPs: 41.52 | +7: iteration 24190/ 115203 | consumed samples: 6192640 | consumed tokens: 12682526720 | elapsed time per iteration (s): 0.58 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.904516E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.095 | TFLOPs: 42.15 | +7: iteration 24200/ 115203 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 0.59 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 2.896612E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.929 | TFLOPs: 41.66 | +7: iteration 24210/ 115203 | consumed samples: 6197760 | consumed tokens: 12693012480 | elapsed time per iteration (s): 0.57 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.909889E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.520 | TFLOPs: 42.57 | +7: iteration 24220/ 115203 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 0.57 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.892565E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.748 | TFLOPs: 42.88 | +7: iteration 24230/ 115203 | consumed samples: 6202880 | consumed tokens: 12703498240 | elapsed time per iteration (s): 0.57 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.904200E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.264 | TFLOPs: 42.74 | +7: iteration 24240/ 115203 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 0.56 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.899288E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.616 | TFLOPs: 43.34 | +7: iteration 24250/ 115203 | consumed samples: 6208000 | consumed tokens: 12713984000 | elapsed time per iteration (s): 0.59 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.903745E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.182 | TFLOPs: 41.39 | +7: iteration 24260/ 115203 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 0.57 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.908257E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.803 | TFLOPs: 42.60 | +7: iteration 24270/ 115203 | consumed samples: 6213120 | consumed tokens: 12724469760 | elapsed time per iteration (s): 0.58 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 2.906277E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.513 | TFLOPs: 42.00 | +7: iteration 24280/ 115203 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 0.59 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.903351E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.166 | TFLOPs: 41.30 | +7: iteration 24290/ 115203 | consumed samples: 6218240 | consumed tokens: 12734955520 | elapsed time per iteration (s): 0.60 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.901831E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.901 | TFLOPs: 40.89 | +7: iteration 24300/ 115203 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 0.59 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.905746E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.460 | TFLOPs: 41.71 | +7: iteration 24310/ 115203 | consumed samples: 6223360 | consumed tokens: 12745441280 | elapsed time per iteration (s): 0.58 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.918362E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.168 | TFLOPs: 42.44 | +7: iteration 24320/ 115203 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 0.56 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.908823E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.472 | TFLOPs: 43.33 | +7: iteration 24330/ 115203 | consumed samples: 6228480 | consumed tokens: 12755927040 | elapsed time per iteration (s): 0.57 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.905210E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.793 | TFLOPs: 43.07 | +7: iteration 24340/ 115203 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 0.57 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 2.894086E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.688 | TFLOPs: 42.87 | +7: iteration 24350/ 115203 | consumed samples: 6233600 | consumed tokens: 12766412800 | elapsed time per iteration (s): 0.56 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.903603E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.275 | TFLOPs: 43.69 | +7: iteration 24360/ 115203 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 0.56 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.901663E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.476 | TFLOPs: 43.71 | +7: iteration 24370/ 115203 | consumed samples: 6238720 | consumed tokens: 12776898560 | elapsed time per iteration (s): 0.57 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.906035E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.603 | TFLOPs: 42.86 | +7: iteration 24380/ 115203 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 0.56 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.915921E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.992 | TFLOPs: 43.47 | +7: iteration 24390/ 115203 | consumed samples: 6243840 | consumed tokens: 12787384320 | elapsed time per iteration (s): 0.57 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.902829E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.189 | TFLOPs: 42.83 | +7: iteration 24400/ 115203 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 0.57 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.916448E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.035 | TFLOPs: 42.81 | +7: iteration 24410/ 115203 | consumed samples: 6248960 | consumed tokens: 12797870080 | elapsed time per iteration (s): 0.57 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 2.900204E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.074 | TFLOPs: 43.10 | +7: iteration 24420/ 115203 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 0.57 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.894313E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.315 | TFLOPs: 42.93 | +7: iteration 24430/ 115203 | consumed samples: 6254080 | consumed tokens: 12808355840 | elapsed time per iteration (s): 0.56 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.896444E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.010 | TFLOPs: 43.57 | +7: iteration 24440/ 115203 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 0.56 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.894410E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.812 | TFLOPs: 43.65 | +7: iteration 24450/ 115203 | consumed samples: 6259200 | consumed tokens: 12818841600 | elapsed time per iteration (s): 0.56 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.911506E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.848 | TFLOPs: 43.65 | +7: iteration 24460/ 115203 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 0.56 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.897678E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.336 | TFLOPs: 43.41 | +7: iteration 24470/ 115203 | consumed samples: 6264320 | consumed tokens: 12829327360 | elapsed time per iteration (s): 0.57 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 2.903570E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.802 | TFLOPs: 43.07 | +7: iteration 24480/ 115203 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 0.57 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.888921E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.477 | TFLOPs: 42.95 | +7: iteration 24490/ 115203 | consumed samples: 6269440 | consumed tokens: 12839813120 | elapsed time per iteration (s): 0.56 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.893535E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.812 | TFLOPs: 43.55 | +7: iteration 24500/ 115203 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 0.56 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.911723E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.799 | TFLOPs: 43.36 | +7: iteration 24510/ 115203 | consumed samples: 6274560 | consumed tokens: 12850298880 | elapsed time per iteration (s): 0.59 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.878779E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.386 | TFLOPs: 41.70 | +7: iteration 24520/ 115203 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 0.56 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.902513E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.781 | TFLOPs: 43.26 | +7: iteration 24530/ 115203 | consumed samples: 6279680 | consumed tokens: 12860784640 | elapsed time per iteration (s): 0.56 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.906778E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.824 | TFLOPs: 43.55 | +7: iteration 24540/ 115203 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 0.56 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 2.896152E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.211 | TFLOPs: 43.69 | +7: iteration 24550/ 115203 | consumed samples: 6284800 | consumed tokens: 12871270400 | elapsed time per iteration (s): 0.60 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.878603E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.257 | TFLOPs: 40.93 | +7: iteration 24560/ 115203 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 0.56 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.888830E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 24570/ 115203 | consumed samples: 6289920 | consumed tokens: 12881756160 | elapsed time per iteration (s): 0.57 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.896378E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.475 | TFLOPs: 42.66 | +7: iteration 24580/ 115203 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 0.58 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.894645E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.630 | TFLOPs: 41.82 | +7: iteration 24590/ 115203 | consumed samples: 6295040 | consumed tokens: 12892241920 | elapsed time per iteration (s): 0.56 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.894244E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.856 | TFLOPs: 43.27 | +7: iteration 24600/ 115203 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 0.56 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.899966E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.420 | TFLOPs: 43.42 | +7: iteration 24610/ 115203 | consumed samples: 6300160 | consumed tokens: 12902727680 | elapsed time per iteration (s): 0.58 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 2.892962E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.479 | TFLOPs: 42.38 | +7: iteration 24620/ 115203 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 0.59 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.890944E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.782 | TFLOPs: 41.64 | +7: iteration 24630/ 115203 | consumed samples: 6305280 | consumed tokens: 12913213440 | elapsed time per iteration (s): 0.56 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.896575E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.902 | TFLOPs: 43.75 | +7: iteration 24640/ 115203 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 0.57 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.911224E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.737 | TFLOPs: 43.16 | +7: iteration 24650/ 115203 | consumed samples: 6310400 | consumed tokens: 12923699200 | elapsed time per iteration (s): 0.55 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.887983E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 24660/ 115203 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 0.59 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.903967E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.495 | TFLOPs: 41.62 | +7: iteration 24670/ 115203 | consumed samples: 6315520 | consumed tokens: 12934184960 | elapsed time per iteration (s): 0.57 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.892058E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.311 | TFLOPs: 42.55 | +7: iteration 24680/ 115203 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 0.57 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 2.908947E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.962 | TFLOPs: 43.19 | +7: iteration 24690/ 115203 | consumed samples: 6320640 | consumed tokens: 12944670720 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.891463E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.835 | TFLOPs: 42.89 | +7: iteration 24700/ 115203 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.899058E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.073 | TFLOPs: 43.10 | +7: iteration 24710/ 115203 | consumed samples: 6325760 | consumed tokens: 12955156480 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.907875E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.464 | TFLOPs: 42.95 | +7: iteration 24720/ 115203 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 0.56 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.900253E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.353 | TFLOPs: 43.22 | +7: iteration 24730/ 115203 | consumed samples: 6330880 | consumed tokens: 12965642240 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.899715E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.244 | TFLOPs: 43.02 | +7: iteration 24740/ 115203 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 0.57 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 2.911001E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.645 | TFLOPs: 43.06 | +7: iteration 24750/ 115203 | consumed samples: 6336000 | consumed tokens: 12976128000 | elapsed time per iteration (s): 0.57 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.897016E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.927 | TFLOPs: 43.18 | +7: iteration 24760/ 115203 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 0.56 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.900931E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.553 | TFLOPs: 43.62 | +7: iteration 24770/ 115203 | consumed samples: 6341120 | consumed tokens: 12986613760 | elapsed time per iteration (s): 0.56 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.903501E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.501 | TFLOPs: 43.52 | +7: iteration 24780/ 115203 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 0.57 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.905415E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.102 | TFLOPs: 43.01 | +7: iteration 24790/ 115203 | consumed samples: 6346240 | consumed tokens: 12997099520 | elapsed time per iteration (s): 0.56 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.887729E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.496 | TFLOPs: 43.24 | +7: iteration 24800/ 115203 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 0.56 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.901851E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.908 | TFLOPs: 43.28 | +7: iteration 24810/ 115203 | consumed samples: 6351360 | consumed tokens: 13007585280 | elapsed time per iteration (s): 0.56 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 2.911156E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.294 | TFLOPs: 43.22 | +7: iteration 24820/ 115203 | consumed samples: 6353920 | consumed tokens: 13012828160 | elapsed time per iteration (s): 0.56 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.900542E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.430 | TFLOPs: 43.42 | +7: iteration 24830/ 115203 | consumed samples: 6356480 | consumed tokens: 13018071040 | elapsed time per iteration (s): 0.57 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.913229E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.325 | TFLOPs: 42.93 | +7: iteration 24840/ 115203 | consumed samples: 6359040 | consumed tokens: 13023313920 | elapsed time per iteration (s): 0.57 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.888592E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.485 | TFLOPs: 42.76 | +7: iteration 24850/ 115203 | consumed samples: 6361600 | consumed tokens: 13028556800 | elapsed time per iteration (s): 0.57 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.896877E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.111 | TFLOPs: 42.53 | +7: iteration 24860/ 115203 | consumed samples: 6364160 | consumed tokens: 13033799680 | elapsed time per iteration (s): 0.56 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.897536E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 24870/ 115203 | consumed samples: 6366720 | consumed tokens: 13039042560 | elapsed time per iteration (s): 0.56 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.897145E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.162 | TFLOPs: 43.30 | +7: iteration 24880/ 115203 | consumed samples: 6369280 | consumed tokens: 13044285440 | elapsed time per iteration (s): 0.57 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 2.895181E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.367 | TFLOPs: 43.13 | +7: iteration 24890/ 115203 | consumed samples: 6371840 | consumed tokens: 13049528320 | elapsed time per iteration (s): 0.56 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.901330E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.822 | TFLOPs: 43.74 | +7: iteration 24900/ 115203 | consumed samples: 6374400 | consumed tokens: 13054771200 | elapsed time per iteration (s): 0.56 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.889617E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.416 | TFLOPs: 43.23 | +7: iteration 24910/ 115203 | consumed samples: 6376960 | consumed tokens: 13060014080 | elapsed time per iteration (s): 0.56 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.871913E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.681 | TFLOPs: 43.44 | +7: iteration 24920/ 115203 | consumed samples: 6379520 | consumed tokens: 13065256960 | elapsed time per iteration (s): 0.57 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.898293E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.786 | TFLOPs: 42.98 | +7: iteration 24930/ 115203 | consumed samples: 6382080 | consumed tokens: 13070499840 | elapsed time per iteration (s): 0.57 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.908680E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.252 | TFLOPs: 42.93 | +7: iteration 24940/ 115203 | consumed samples: 6384640 | consumed tokens: 13075742720 | elapsed time per iteration (s): 0.57 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 2.914808E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.846 | TFLOPs: 42.79 | +7: iteration 24950/ 115203 | consumed samples: 6387200 | consumed tokens: 13080985600 | elapsed time per iteration (s): 0.56 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.901740E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.979 | TFLOPs: 43.57 | +7: iteration 24960/ 115203 | consumed samples: 6389760 | consumed tokens: 13086228480 | elapsed time per iteration (s): 0.58 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.872062E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.914 | TFLOPs: 42.32 | +7: iteration 24970/ 115203 | consumed samples: 6392320 | consumed tokens: 13091471360 | elapsed time per iteration (s): 0.57 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.903477E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.735 | TFLOPs: 42.69 | +7: iteration 24980/ 115203 | consumed samples: 6394880 | consumed tokens: 13096714240 | elapsed time per iteration (s): 0.56 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.909396E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.236 | TFLOPs: 43.21 | +7: iteration 24990/ 115203 | consumed samples: 6397440 | consumed tokens: 13101957120 | elapsed time per iteration (s): 0.57 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.903746E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.951 | TFLOPs: 42.61 | +7: iteration 25000/ 115203 | consumed samples: 6400000 | consumed tokens: 13107200000 | elapsed time per iteration (s): 0.56 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.894976E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.441 | TFLOPs: 43.71 | +7: iteration 25010/ 115203 | consumed samples: 6402560 | consumed tokens: 13112442880 | elapsed time per iteration (s): 0.56 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 2.890388E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.792 | TFLOPs: 43.55 | +7: iteration 25020/ 115203 | consumed samples: 6405120 | consumed tokens: 13117685760 | elapsed time per iteration (s): 0.57 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.896247E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.418 | TFLOPs: 42.56 | +7: iteration 25030/ 115203 | consumed samples: 6407680 | consumed tokens: 13122928640 | elapsed time per iteration (s): 0.57 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.886830E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.995 | TFLOPs: 43.19 | +7: iteration 25040/ 115203 | consumed samples: 6410240 | consumed tokens: 13128171520 | elapsed time per iteration (s): 0.57 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.895139E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.477 | TFLOPs: 42.95 | +7: iteration 25050/ 115203 | consumed samples: 6412800 | consumed tokens: 13133414400 | elapsed time per iteration (s): 0.58 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.897654E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.320 | TFLOPs: 42.17 | +7: iteration 25060/ 115203 | consumed samples: 6415360 | consumed tokens: 13138657280 | elapsed time per iteration (s): 0.57 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.896278E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.874 | TFLOPs: 43.18 | +7: iteration 25070/ 115203 | consumed samples: 6417920 | consumed tokens: 13143900160 | elapsed time per iteration (s): 0.58 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 2.893513E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.962 | TFLOPs: 42.33 | +7: iteration 25080/ 115203 | consumed samples: 6420480 | consumed tokens: 13149143040 | elapsed time per iteration (s): 0.57 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.880781E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.337 | TFLOPs: 42.74 | +7: iteration 25090/ 115203 | consumed samples: 6423040 | consumed tokens: 13154385920 | elapsed time per iteration (s): 0.56 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.892435E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.194 | TFLOPs: 43.78 | +7: iteration 25100/ 115203 | consumed samples: 6425600 | consumed tokens: 13159628800 | elapsed time per iteration (s): 0.56 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.894340E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.140 | TFLOPs: 43.49 | +7: iteration 25110/ 115203 | consumed samples: 6428160 | consumed tokens: 13164871680 | elapsed time per iteration (s): 0.56 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.894275E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.599 | TFLOPs: 43.44 | +7: iteration 25120/ 115203 | consumed samples: 6430720 | consumed tokens: 13170114560 | elapsed time per iteration (s): 0.57 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.889421E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.081 | TFLOPs: 43.10 | +7: iteration 25130/ 115203 | consumed samples: 6433280 | consumed tokens: 13175357440 | elapsed time per iteration (s): 0.55 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.893559E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.576 | TFLOPs: 44.01 | +7: iteration 25140/ 115203 | consumed samples: 6435840 | consumed tokens: 13180600320 | elapsed time per iteration (s): 0.59 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 2.886292E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.193 | TFLOPs: 41.68 | +7: iteration 25150/ 115203 | consumed samples: 6438400 | consumed tokens: 13185843200 | elapsed time per iteration (s): 0.57 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.908321E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.440 | TFLOPs: 43.04 | +7: iteration 25160/ 115203 | consumed samples: 6440960 | consumed tokens: 13191086080 | elapsed time per iteration (s): 0.57 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.902587E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.470 | TFLOPs: 42.76 | +7: iteration 25170/ 115203 | consumed samples: 6443520 | consumed tokens: 13196328960 | elapsed time per iteration (s): 0.57 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.897926E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.047 | TFLOPs: 43.00 | +7: iteration 25180/ 115203 | consumed samples: 6446080 | consumed tokens: 13201571840 | elapsed time per iteration (s): 0.57 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.902161E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.110 | TFLOPs: 42.72 | +7: iteration 25190/ 115203 | consumed samples: 6448640 | consumed tokens: 13206814720 | elapsed time per iteration (s): 0.56 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.893704E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.651 | TFLOPs: 43.73 | +7: iteration 25200/ 115203 | consumed samples: 6451200 | consumed tokens: 13212057600 | elapsed time per iteration (s): 0.55 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.888888E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.770 | TFLOPs: 44.02 | +7: iteration 25210/ 115203 | consumed samples: 6453760 | consumed tokens: 13217300480 | elapsed time per iteration (s): 0.56 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 2.890099E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.998 | TFLOPs: 43.67 | +7: iteration 25220/ 115203 | consumed samples: 6456320 | consumed tokens: 13222543360 | elapsed time per iteration (s): 0.57 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.896233E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.414 | TFLOPs: 43.13 | +7: iteration 25230/ 115203 | consumed samples: 6458880 | consumed tokens: 13227786240 | elapsed time per iteration (s): 0.58 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.891089E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.925 | TFLOPs: 41.94 | +7: iteration 25240/ 115203 | consumed samples: 6461440 | consumed tokens: 13233029120 | elapsed time per iteration (s): 0.56 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.887103E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.499 | TFLOPs: 43.43 | +7: iteration 25250/ 115203 | consumed samples: 6464000 | consumed tokens: 13238272000 | elapsed time per iteration (s): 0.57 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.895895E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.053 | TFLOPs: 43.19 | +7: iteration 25260/ 115203 | consumed samples: 6466560 | consumed tokens: 13243514880 | elapsed time per iteration (s): 2.37 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.906533E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 107.826 | TFLOPs: 10.28 | +7: iteration 25270/ 115203 | consumed samples: 6469120 | consumed tokens: 13248757760 | elapsed time per iteration (s): 0.94 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 2.894982E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 271.261 | TFLOPs: 25.86 | +7: iteration 25280/ 115203 | consumed samples: 6471680 | consumed tokens: 13254000640 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.892161E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.251 | TFLOPs: 43.31 | +7: iteration 25290/ 115203 | consumed samples: 6474240 | consumed tokens: 13259243520 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.898382E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.693 | TFLOPs: 43.35 | +7: iteration 25300/ 115203 | consumed samples: 6476800 | consumed tokens: 13264486400 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.892754E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.032 | TFLOPs: 43.67 | +7: iteration 25310/ 115203 | consumed samples: 6479360 | consumed tokens: 13269729280 | elapsed time per iteration (s): 0.58 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.897693E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.497 | TFLOPs: 42.28 | +7: iteration 25320/ 115203 | consumed samples: 6481920 | consumed tokens: 13274972160 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.891494E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.691 | TFLOPs: 43.64 | +7: iteration 25330/ 115203 | consumed samples: 6484480 | consumed tokens: 13280215040 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.892593E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.848 | TFLOPs: 43.36 | +7: iteration 25340/ 115203 | consumed samples: 6487040 | consumed tokens: 13285457920 | elapsed time per iteration (s): 0.56 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 2.895799E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.285 | TFLOPs: 43.50 | +7: iteration 25350/ 115203 | consumed samples: 6489600 | consumed tokens: 13290700800 | elapsed time per iteration (s): 0.56 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.900077E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.820 | TFLOPs: 43.65 | +7: iteration 25360/ 115203 | consumed samples: 6492160 | consumed tokens: 13295943680 | elapsed time per iteration (s): 0.56 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.899140E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.950 | TFLOPs: 43.76 | +7: iteration 25370/ 115203 | consumed samples: 6494720 | consumed tokens: 13301186560 | elapsed time per iteration (s): 0.56 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.896194E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.855 | TFLOPs: 43.84 | +7: iteration 25380/ 115203 | consumed samples: 6497280 | consumed tokens: 13306429440 | elapsed time per iteration (s): 0.56 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.875735E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.274 | TFLOPs: 43.21 | +7: iteration 25390/ 115203 | consumed samples: 6499840 | consumed tokens: 13311672320 | elapsed time per iteration (s): 0.58 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.890129E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.267 | TFLOPs: 42.17 | +7: iteration 25400/ 115203 | consumed samples: 6502400 | consumed tokens: 13316915200 | elapsed time per iteration (s): 0.57 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 2.893543E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.818 | TFLOPs: 43.08 | +7: iteration 25410/ 115203 | consumed samples: 6504960 | consumed tokens: 13322158080 | elapsed time per iteration (s): 0.56 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.898944E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.975 | TFLOPs: 43.38 | +7: iteration 25420/ 115203 | consumed samples: 6507520 | consumed tokens: 13327400960 | elapsed time per iteration (s): 0.56 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.891738E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.477 | TFLOPs: 43.33 | +7: iteration 25430/ 115203 | consumed samples: 6510080 | consumed tokens: 13332643840 | elapsed time per iteration (s): 0.57 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.897351E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.627 | TFLOPs: 42.68 | +7: iteration 25440/ 115203 | consumed samples: 6512640 | consumed tokens: 13337886720 | elapsed time per iteration (s): 0.56 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.882243E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.021 | TFLOPs: 43.57 | +7: iteration 25450/ 115203 | consumed samples: 6515200 | consumed tokens: 13343129600 | elapsed time per iteration (s): 0.56 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.900924E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.269 | TFLOPs: 43.88 | +7: iteration 25460/ 115203 | consumed samples: 6517760 | consumed tokens: 13348372480 | elapsed time per iteration (s): 0.57 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.895274E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.487 | TFLOPs: 42.95 | +7: iteration 25470/ 115203 | consumed samples: 6520320 | consumed tokens: 13353615360 | elapsed time per iteration (s): 0.60 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 2.899183E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.559 | TFLOPs: 40.76 | +7: iteration 25480/ 115203 | consumed samples: 6522880 | consumed tokens: 13358858240 | elapsed time per iteration (s): 0.56 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.901649E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.132 | TFLOPs: 43.30 | +7: iteration 25490/ 115203 | consumed samples: 6525440 | consumed tokens: 13364101120 | elapsed time per iteration (s): 0.58 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.895363E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.794 | TFLOPs: 41.74 | +7: iteration 25500/ 115203 | consumed samples: 6528000 | consumed tokens: 13369344000 | elapsed time per iteration (s): 0.56 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.893604E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.475 | TFLOPs: 43.81 | +7: iteration 25510/ 115203 | consumed samples: 6530560 | consumed tokens: 13374586880 | elapsed time per iteration (s): 0.56 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.892579E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.959 | TFLOPs: 43.66 | +7: iteration 25520/ 115203 | consumed samples: 6533120 | consumed tokens: 13379829760 | elapsed time per iteration (s): 0.56 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.896767E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.188 | TFLOPs: 43.49 | +7: iteration 25530/ 115203 | consumed samples: 6535680 | consumed tokens: 13385072640 | elapsed time per iteration (s): 0.56 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 2.886677E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.075 | TFLOPs: 43.48 | +7: iteration 25540/ 115203 | consumed samples: 6538240 | consumed tokens: 13390315520 | elapsed time per iteration (s): 0.58 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.889397E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.519 | TFLOPs: 42.38 | +7: iteration 25550/ 115203 | consumed samples: 6540800 | consumed tokens: 13395558400 | elapsed time per iteration (s): 0.57 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.887424E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.353 | TFLOPs: 43.13 | +7: iteration 25560/ 115203 | consumed samples: 6543360 | consumed tokens: 13400801280 | elapsed time per iteration (s): 0.55 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.913733E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.406 | TFLOPs: 43.99 | +7: iteration 25570/ 115203 | consumed samples: 6545920 | consumed tokens: 13406044160 | elapsed time per iteration (s): 0.55 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.901516E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 25580/ 115203 | consumed samples: 6548480 | consumed tokens: 13411287040 | elapsed time per iteration (s): 0.56 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.887202E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.708 | TFLOPs: 43.54 | +7: iteration 25590/ 115203 | consumed samples: 6551040 | consumed tokens: 13416529920 | elapsed time per iteration (s): 0.55 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.891241E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.587 | TFLOPs: 44.01 | +7: iteration 25600/ 115203 | consumed samples: 6553600 | consumed tokens: 13421772800 | elapsed time per iteration (s): 0.57 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 2.891940E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.060 | TFLOPs: 42.91 | +7: iteration 25610/ 115203 | consumed samples: 6556160 | consumed tokens: 13427015680 | elapsed time per iteration (s): 0.55 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.883840E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.587 | TFLOPs: 44.01 | +7: iteration 25620/ 115203 | consumed samples: 6558720 | consumed tokens: 13432258560 | elapsed time per iteration (s): 0.56 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.891530E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.619 | TFLOPs: 43.63 | +7: iteration 25630/ 115203 | consumed samples: 6561280 | consumed tokens: 13437501440 | elapsed time per iteration (s): 0.57 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.878195E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.968 | TFLOPs: 42.99 | +7: iteration 25640/ 115203 | consumed samples: 6563840 | consumed tokens: 13442744320 | elapsed time per iteration (s): 0.56 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.871717E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.027 | TFLOPs: 43.57 | +7: iteration 25650/ 115203 | consumed samples: 6566400 | consumed tokens: 13447987200 | elapsed time per iteration (s): 0.56 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.889613E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.389 | TFLOPs: 43.70 | +7: iteration 25660/ 115203 | consumed samples: 6568960 | consumed tokens: 13453230080 | elapsed time per iteration (s): 0.57 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 2.880540E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.534 | TFLOPs: 42.76 | +7: iteration 25670/ 115203 | consumed samples: 6571520 | consumed tokens: 13458472960 | elapsed time per iteration (s): 0.58 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.898153E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.532 | TFLOPs: 42.10 | +7: iteration 25680/ 115203 | consumed samples: 6574080 | consumed tokens: 13463715840 | elapsed time per iteration (s): 0.56 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.886275E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.150 | TFLOPs: 43.39 | +7: iteration 25690/ 115203 | consumed samples: 6576640 | consumed tokens: 13468958720 | elapsed time per iteration (s): 0.55 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.893423E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.574 | TFLOPs: 44.01 | +7: iteration 25700/ 115203 | consumed samples: 6579200 | consumed tokens: 13474201600 | elapsed time per iteration (s): 0.55 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.884457E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 25710/ 115203 | consumed samples: 6581760 | consumed tokens: 13479444480 | elapsed time per iteration (s): 0.58 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.882878E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.355 | TFLOPs: 42.36 | +7: iteration 25720/ 115203 | consumed samples: 6584320 | consumed tokens: 13484687360 | elapsed time per iteration (s): 0.57 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.885139E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.578 | TFLOPs: 43.05 | +7: iteration 25730/ 115203 | consumed samples: 6586880 | consumed tokens: 13489930240 | elapsed time per iteration (s): 0.56 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 2.894072E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.275 | TFLOPs: 43.88 | +7: iteration 25740/ 115203 | consumed samples: 6589440 | consumed tokens: 13495173120 | elapsed time per iteration (s): 0.56 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.886353E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.369 | TFLOPs: 43.70 | +7: iteration 25750/ 115203 | consumed samples: 6592000 | consumed tokens: 13500416000 | elapsed time per iteration (s): 0.56 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.887324E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.857 | TFLOPs: 43.46 | +7: iteration 25760/ 115203 | consumed samples: 6594560 | consumed tokens: 13505658880 | elapsed time per iteration (s): 0.57 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.896270E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.964 | TFLOPs: 43.19 | +7: iteration 25770/ 115203 | consumed samples: 6597120 | consumed tokens: 13510901760 | elapsed time per iteration (s): 0.57 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.881917E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.127 | TFLOPs: 42.63 | +7: iteration 25780/ 115203 | consumed samples: 6599680 | consumed tokens: 13516144640 | elapsed time per iteration (s): 0.56 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.884456E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.175 | TFLOPs: 43.49 | +7: iteration 25790/ 115203 | consumed samples: 6602240 | consumed tokens: 13521387520 | elapsed time per iteration (s): 0.57 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 2.877350E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.529 | TFLOPs: 42.48 | +7: iteration 25800/ 115203 | consumed samples: 6604800 | consumed tokens: 13526630400 | elapsed time per iteration (s): 0.55 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.884839E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.543 | TFLOPs: 44.00 | +7: iteration 25810/ 115203 | consumed samples: 6607360 | consumed tokens: 13531873280 | elapsed time per iteration (s): 0.56 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.881454E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.974 | TFLOPs: 43.57 | +7: iteration 25820/ 115203 | consumed samples: 6609920 | consumed tokens: 13537116160 | elapsed time per iteration (s): 0.59 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.875970E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.509 | TFLOPs: 41.52 | +7: iteration 25830/ 115203 | consumed samples: 6612480 | consumed tokens: 13542359040 | elapsed time per iteration (s): 0.56 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.873797E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.403 | TFLOPs: 43.61 | +7: iteration 25840/ 115203 | consumed samples: 6615040 | consumed tokens: 13547601920 | elapsed time per iteration (s): 0.57 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.878644E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.193 | TFLOPs: 43.02 | +7: iteration 25850/ 115203 | consumed samples: 6617600 | consumed tokens: 13552844800 | elapsed time per iteration (s): 0.55 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 2.873474E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.595 | TFLOPs: 44.01 | +7: iteration 25860/ 115203 | consumed samples: 6620160 | consumed tokens: 13558087680 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.882612E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.768 | TFLOPs: 43.55 | +7: iteration 25870/ 115203 | consumed samples: 6622720 | consumed tokens: 13563330560 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.875770E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.225 | TFLOPs: 43.97 | +7: iteration 25880/ 115203 | consumed samples: 6625280 | consumed tokens: 13568573440 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.878781E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.682 | TFLOPs: 43.54 | +7: iteration 25890/ 115203 | consumed samples: 6627840 | consumed tokens: 13573816320 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.876573E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.046 | TFLOPs: 43.67 | +7: iteration 25900/ 115203 | consumed samples: 6630400 | consumed tokens: 13579059200 | elapsed time per iteration (s): 0.57 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.894840E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.252 | TFLOPs: 43.02 | +7: iteration 25910/ 115203 | consumed samples: 6632960 | consumed tokens: 13584302080 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.884223E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.486 | TFLOPs: 43.24 | +7: iteration 25920/ 115203 | consumed samples: 6635520 | consumed tokens: 13589544960 | elapsed time per iteration (s): 0.56 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 2.876483E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.635 | TFLOPs: 43.73 | +7: iteration 25930/ 115203 | consumed samples: 6638080 | consumed tokens: 13594787840 | elapsed time per iteration (s): 0.56 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.871906E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.463 | TFLOPs: 43.33 | +7: iteration 25940/ 115203 | consumed samples: 6640640 | consumed tokens: 13600030720 | elapsed time per iteration (s): 0.55 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.895063E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.391 | TFLOPs: 43.99 | +7: iteration 25950/ 115203 | consumed samples: 6643200 | consumed tokens: 13605273600 | elapsed time per iteration (s): 0.58 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.876239E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.516 | TFLOPs: 42.09 | +7: iteration 25960/ 115203 | consumed samples: 6645760 | consumed tokens: 13610516480 | elapsed time per iteration (s): 0.56 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.879850E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.147 | TFLOPs: 43.49 | +7: iteration 25970/ 115203 | consumed samples: 6648320 | consumed tokens: 13615759360 | elapsed time per iteration (s): 0.56 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.869827E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.558 | TFLOPs: 43.43 | +7: iteration 25980/ 115203 | consumed samples: 6650880 | consumed tokens: 13621002240 | elapsed time per iteration (s): 0.56 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 2.886841E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.771 | TFLOPs: 43.64 | +7: iteration 25990/ 115203 | consumed samples: 6653440 | consumed tokens: 13626245120 | elapsed time per iteration (s): 0.58 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.891728E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.193 | TFLOPs: 42.44 | +0: [2023-03-16 16:54:56,020] [INFO] [logging.py:68:log_dist] [Rank 0] step=26000, skipped=0, lr=[0.00017972931879823854, 0.00017972931879823854, 0.00017972931879823854], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 26000/ 115203 | consumed samples: 6656000 | consumed tokens: 13631488000 | elapsed time per iteration (s): 0.62 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.881018E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 415.436 | TFLOPs: 39.61 | +0: steps: 26000 loss: 2.9011 iter time (s): 0.576 samples/sec: 444.527 +7: iteration 26010/ 115203 | consumed samples: 6658560 | consumed tokens: 13636730880 | elapsed time per iteration (s): 0.58 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.883261E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.699 | TFLOPs: 42.30 | +7: iteration 26020/ 115203 | consumed samples: 6661120 | consumed tokens: 13641973760 | elapsed time per iteration (s): 0.57 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.880910E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.019 | TFLOPs: 43.19 | +7: iteration 26030/ 115203 | consumed samples: 6663680 | consumed tokens: 13647216640 | elapsed time per iteration (s): 0.56 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.885922E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.959 | TFLOPs: 43.47 | +7: iteration 26040/ 115203 | consumed samples: 6666240 | consumed tokens: 13652459520 | elapsed time per iteration (s): 0.56 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.896819E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.795 | TFLOPs: 43.74 | +7: iteration 26050/ 115203 | consumed samples: 6668800 | consumed tokens: 13657702400 | elapsed time per iteration (s): 0.56 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 2.879343E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.565 | TFLOPs: 43.72 | +7: iteration 26060/ 115203 | consumed samples: 6671360 | consumed tokens: 13662945280 | elapsed time per iteration (s): 0.55 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.882344E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.509 | TFLOPs: 44.00 | +7: iteration 26070/ 115203 | consumed samples: 6673920 | consumed tokens: 13668188160 | elapsed time per iteration (s): 0.55 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.884415E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.565 | TFLOPs: 44.01 | +7: iteration 26080/ 115203 | consumed samples: 6676480 | consumed tokens: 13673431040 | elapsed time per iteration (s): 0.57 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.888256E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.640 | TFLOPs: 43.06 | +7: iteration 26090/ 115203 | consumed samples: 6679040 | consumed tokens: 13678673920 | elapsed time per iteration (s): 0.55 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.893881E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.496 | TFLOPs: 44.00 | +7: iteration 26100/ 115203 | consumed samples: 6681600 | consumed tokens: 13683916800 | elapsed time per iteration (s): 0.56 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.893819E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.156 | TFLOPs: 43.68 | +7: iteration 26110/ 115203 | consumed samples: 6684160 | consumed tokens: 13689159680 | elapsed time per iteration (s): 0.55 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 2.875026E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 26120/ 115203 | consumed samples: 6686720 | consumed tokens: 13694402560 | elapsed time per iteration (s): 0.56 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.878228E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.383 | TFLOPs: 43.70 | +7: iteration 26130/ 115203 | consumed samples: 6689280 | consumed tokens: 13699645440 | elapsed time per iteration (s): 0.55 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.879751E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.507 | TFLOPs: 44.00 | +7: iteration 26140/ 115203 | consumed samples: 6691840 | consumed tokens: 13704888320 | elapsed time per iteration (s): 0.56 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.887732E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.712 | TFLOPs: 43.64 | +7: iteration 26150/ 115203 | consumed samples: 6694400 | consumed tokens: 13710131200 | elapsed time per iteration (s): 0.55 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.887245E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.451 | TFLOPs: 43.99 | +7: iteration 26160/ 115203 | consumed samples: 6696960 | consumed tokens: 13715374080 | elapsed time per iteration (s): 0.56 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.880613E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.105 | TFLOPs: 43.48 | +7: iteration 26170/ 115203 | consumed samples: 6699520 | consumed tokens: 13720616960 | elapsed time per iteration (s): 0.57 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 2.875016E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.941 | TFLOPs: 43.18 | +7: iteration 26180/ 115203 | consumed samples: 6702080 | consumed tokens: 13725859840 | elapsed time per iteration (s): 0.56 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.879792E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.520 | TFLOPs: 43.71 | +7: iteration 26190/ 115203 | consumed samples: 6704640 | consumed tokens: 13731102720 | elapsed time per iteration (s): 0.57 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.868178E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.187 | TFLOPs: 43.02 | +7: iteration 26200/ 115203 | consumed samples: 6707200 | consumed tokens: 13736345600 | elapsed time per iteration (s): 0.55 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.884942E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.467 | TFLOPs: 44.00 | +7: iteration 26210/ 115203 | consumed samples: 6709760 | consumed tokens: 13741588480 | elapsed time per iteration (s): 0.56 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.881307E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.944 | TFLOPs: 43.56 | +7: iteration 26220/ 115203 | consumed samples: 6712320 | consumed tokens: 13746831360 | elapsed time per iteration (s): 0.56 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.888518E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.497 | TFLOPs: 43.24 | +7: iteration 26230/ 115203 | consumed samples: 6714880 | consumed tokens: 13752074240 | elapsed time per iteration (s): 0.56 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.890539E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.454 | TFLOPs: 43.52 | +7: iteration 26240/ 115203 | consumed samples: 6717440 | consumed tokens: 13757317120 | elapsed time per iteration (s): 0.56 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 2.886396E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.640 | TFLOPs: 43.35 | +7: iteration 26250/ 115203 | consumed samples: 6720000 | consumed tokens: 13762560000 | elapsed time per iteration (s): 0.55 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.885219E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.577 | TFLOPs: 44.01 | +7: iteration 26260/ 115203 | consumed samples: 6722560 | consumed tokens: 13767802880 | elapsed time per iteration (s): 0.56 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.881759E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.643 | TFLOPs: 43.63 | +7: iteration 26270/ 115203 | consumed samples: 6725120 | consumed tokens: 13773045760 | elapsed time per iteration (s): 0.56 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.887662E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 26280/ 115203 | consumed samples: 6727680 | consumed tokens: 13778288640 | elapsed time per iteration (s): 0.56 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.894107E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.298 | TFLOPs: 43.69 | +7: iteration 26290/ 115203 | consumed samples: 6730240 | consumed tokens: 13783531520 | elapsed time per iteration (s): 0.56 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.870841E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.071 | TFLOPs: 43.96 | +7: iteration 26300/ 115203 | consumed samples: 6732800 | consumed tokens: 13788774400 | elapsed time per iteration (s): 0.56 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 2.879519E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 26310/ 115203 | consumed samples: 6735360 | consumed tokens: 13794017280 | elapsed time per iteration (s): 0.57 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.872325E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.322 | TFLOPs: 43.12 | +7: iteration 26320/ 115203 | consumed samples: 6737920 | consumed tokens: 13799260160 | elapsed time per iteration (s): 0.55 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.877001E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 26330/ 115203 | consumed samples: 6740480 | consumed tokens: 13804503040 | elapsed time per iteration (s): 0.56 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.893369E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.147 | TFLOPs: 43.20 | +7: iteration 26340/ 115203 | consumed samples: 6743040 | consumed tokens: 13809745920 | elapsed time per iteration (s): 0.56 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.888339E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.784 | TFLOPs: 43.45 | +7: iteration 26350/ 115203 | consumed samples: 6745600 | consumed tokens: 13814988800 | elapsed time per iteration (s): 0.56 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.876644E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.713 | TFLOPs: 43.64 | +7: iteration 26360/ 115203 | consumed samples: 6748160 | consumed tokens: 13820231680 | elapsed time per iteration (s): 0.55 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 2.883325E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.546 | TFLOPs: 44.00 | +7: iteration 26370/ 115203 | consumed samples: 6750720 | consumed tokens: 13825474560 | elapsed time per iteration (s): 0.55 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.878792E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 26380/ 115203 | consumed samples: 6753280 | consumed tokens: 13830717440 | elapsed time per iteration (s): 0.56 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.879433E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.008 | TFLOPs: 43.57 | +7: iteration 26390/ 115203 | consumed samples: 6755840 | consumed tokens: 13835960320 | elapsed time per iteration (s): 0.56 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.882898E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.000 | TFLOPs: 43.57 | +7: iteration 26400/ 115203 | consumed samples: 6758400 | consumed tokens: 13841203200 | elapsed time per iteration (s): 0.57 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.870874E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.622 | TFLOPs: 42.87 | +7: iteration 26410/ 115203 | consumed samples: 6760960 | consumed tokens: 13846446080 | elapsed time per iteration (s): 0.55 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.874768E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.421 | TFLOPs: 43.99 | +7: iteration 26420/ 115203 | consumed samples: 6763520 | consumed tokens: 13851688960 | elapsed time per iteration (s): 0.55 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.886989E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 26430/ 115203 | consumed samples: 6766080 | consumed tokens: 13856931840 | elapsed time per iteration (s): 0.57 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 2.884980E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.145 | TFLOPs: 42.92 | +7: iteration 26440/ 115203 | consumed samples: 6768640 | consumed tokens: 13862174720 | elapsed time per iteration (s): 0.57 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.894877E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.527 | TFLOPs: 43.05 | +7: iteration 26450/ 115203 | consumed samples: 6771200 | consumed tokens: 13867417600 | elapsed time per iteration (s): 0.56 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.878152E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.944 | TFLOPs: 43.56 | +7: iteration 26460/ 115203 | consumed samples: 6773760 | consumed tokens: 13872660480 | elapsed time per iteration (s): 0.56 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.892165E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.835 | TFLOPs: 43.27 | +7: iteration 26470/ 115203 | consumed samples: 6776320 | consumed tokens: 13877903360 | elapsed time per iteration (s): 0.56 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.874070E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.808 | TFLOPs: 43.65 | +7: iteration 26480/ 115203 | consumed samples: 6778880 | consumed tokens: 13883146240 | elapsed time per iteration (s): 0.56 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.877093E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.758 | TFLOPs: 43.45 | +7: iteration 26490/ 115203 | consumed samples: 6781440 | consumed tokens: 13888389120 | elapsed time per iteration (s): 0.56 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 2.880664E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.533 | TFLOPs: 43.72 | +7: iteration 26500/ 115203 | consumed samples: 6784000 | consumed tokens: 13893632000 | elapsed time per iteration (s): 0.55 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.882705E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 26510/ 115203 | consumed samples: 6786560 | consumed tokens: 13898874880 | elapsed time per iteration (s): 0.57 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.877120E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.616 | TFLOPs: 43.15 | +7: iteration 26520/ 115203 | consumed samples: 6789120 | consumed tokens: 13904117760 | elapsed time per iteration (s): 0.56 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.892089E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.963 | TFLOPs: 43.57 | +7: iteration 26530/ 115203 | consumed samples: 6791680 | consumed tokens: 13909360640 | elapsed time per iteration (s): 0.56 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.884300E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.996 | TFLOPs: 43.57 | +7: iteration 26540/ 115203 | consumed samples: 6794240 | consumed tokens: 13914603520 | elapsed time per iteration (s): 0.56 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.885396E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.691 | TFLOPs: 43.73 | +7: iteration 26550/ 115203 | consumed samples: 6796800 | consumed tokens: 13919846400 | elapsed time per iteration (s): 0.57 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 2.886477E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.436 | TFLOPs: 42.94 | +7: iteration 26560/ 115203 | consumed samples: 6799360 | consumed tokens: 13925089280 | elapsed time per iteration (s): 0.56 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.874402E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.693 | TFLOPs: 43.54 | +7: iteration 26570/ 115203 | consumed samples: 6801920 | consumed tokens: 13930332160 | elapsed time per iteration (s): 0.56 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.877214E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.975 | TFLOPs: 43.57 | +7: iteration 26580/ 115203 | consumed samples: 6804480 | consumed tokens: 13935575040 | elapsed time per iteration (s): 0.56 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.869877E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.721 | TFLOPs: 43.54 | +7: iteration 26590/ 115203 | consumed samples: 6807040 | consumed tokens: 13940817920 | elapsed time per iteration (s): 0.57 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.900114E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.608 | TFLOPs: 42.96 | +7: iteration 26600/ 115203 | consumed samples: 6809600 | consumed tokens: 13946060800 | elapsed time per iteration (s): 0.56 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.887854E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.026 | TFLOPs: 43.29 | +7: iteration 26610/ 115203 | consumed samples: 6812160 | consumed tokens: 13951303680 | elapsed time per iteration (s): 0.55 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 2.885169E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 26620/ 115203 | consumed samples: 6814720 | consumed tokens: 13956546560 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.886211E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.850 | TFLOPs: 43.84 | +7: iteration 26630/ 115203 | consumed samples: 6817280 | consumed tokens: 13961789440 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.882349E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.427 | TFLOPs: 43.52 | +7: iteration 26640/ 115203 | consumed samples: 6819840 | consumed tokens: 13967032320 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.882249E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.631 | TFLOPs: 43.82 | +7: iteration 26650/ 115203 | consumed samples: 6822400 | consumed tokens: 13972275200 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.874505E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.221 | TFLOPs: 43.31 | +7: iteration 26660/ 115203 | consumed samples: 6824960 | consumed tokens: 13977518080 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.862605E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.209 | TFLOPs: 43.30 | +7: iteration 26670/ 115203 | consumed samples: 6827520 | consumed tokens: 13982760960 | elapsed time per iteration (s): 0.56 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.881697E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.638 | TFLOPs: 43.54 | +7: iteration 26680/ 115203 | consumed samples: 6830080 | consumed tokens: 13988003840 | elapsed time per iteration (s): 0.58 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 2.880474E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.070 | TFLOPs: 41.86 | +7: iteration 26690/ 115203 | consumed samples: 6832640 | consumed tokens: 13993246720 | elapsed time per iteration (s): 0.56 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.876252E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.121 | TFLOPs: 43.20 | +7: iteration 26700/ 115203 | consumed samples: 6835200 | consumed tokens: 13998489600 | elapsed time per iteration (s): 0.57 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.883303E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.566 | TFLOPs: 42.96 | +7: iteration 26710/ 115203 | consumed samples: 6837760 | consumed tokens: 14003732480 | elapsed time per iteration (s): 0.57 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.872996E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.542 | TFLOPs: 42.67 | +7: iteration 26720/ 115203 | consumed samples: 6840320 | consumed tokens: 14008975360 | elapsed time per iteration (s): 0.56 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.881261E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.813 | TFLOPs: 43.84 | +7: iteration 26730/ 115203 | consumed samples: 6842880 | consumed tokens: 14014218240 | elapsed time per iteration (s): 0.56 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.869013E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.919 | TFLOPs: 43.66 | +7: iteration 26740/ 115203 | consumed samples: 6845440 | consumed tokens: 14019461120 | elapsed time per iteration (s): 0.60 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 2.906918E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.392 | TFLOPs: 40.75 | +7: iteration 26750/ 115203 | consumed samples: 6848000 | consumed tokens: 14024704000 | elapsed time per iteration (s): 0.58 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.881956E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.571 | TFLOPs: 41.91 | +7: iteration 26760/ 115203 | consumed samples: 6850560 | consumed tokens: 14029946880 | elapsed time per iteration (s): 0.57 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.874509E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.686 | TFLOPs: 43.16 | +7: iteration 26770/ 115203 | consumed samples: 6853120 | consumed tokens: 14035189760 | elapsed time per iteration (s): 0.55 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.886874E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.321 | TFLOPs: 43.98 | +7: iteration 26780/ 115203 | consumed samples: 6855680 | consumed tokens: 14040432640 | elapsed time per iteration (s): 0.56 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.889101E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.243 | TFLOPs: 43.97 | +7: iteration 26790/ 115203 | consumed samples: 6858240 | consumed tokens: 14045675520 | elapsed time per iteration (s): 0.55 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.868895E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 26800/ 115203 | consumed samples: 6860800 | consumed tokens: 14050918400 | elapsed time per iteration (s): 0.55 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 2.883734E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +7: iteration 26810/ 115203 | consumed samples: 6863360 | consumed tokens: 14056161280 | elapsed time per iteration (s): 0.56 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 2.888721E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.815 | TFLOPs: 43.46 | +7: iteration 26820/ 115203 | consumed samples: 6865920 | consumed tokens: 14061404160 | elapsed time per iteration (s): 0.55 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 2.882527E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.401 | TFLOPs: 43.99 | +7: iteration 26830/ 115203 | consumed samples: 6868480 | consumed tokens: 14066647040 | elapsed time per iteration (s): 0.55 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 2.877220E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 26840/ 115203 | consumed samples: 6871040 | consumed tokens: 14071889920 | elapsed time per iteration (s): 0.56 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 2.926902E+00 | grad norm: 2.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.379 | TFLOPs: 43.89 | +7: iteration 26850/ 115203 | consumed samples: 6873600 | consumed tokens: 14077132800 | elapsed time per iteration (s): 0.56 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.028558E+00 | grad norm: 1.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.225 | TFLOPs: 43.78 | +7: iteration 26860/ 115203 | consumed samples: 6876160 | consumed tokens: 14082375680 | elapsed time per iteration (s): 0.56 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 2.939248E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.221 | TFLOPs: 43.31 | +7: iteration 26870/ 115203 | consumed samples: 6878720 | consumed tokens: 14087618560 | elapsed time per iteration (s): 0.56 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.924854E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.187 | TFLOPs: 43.78 | +7: iteration 26880/ 115203 | consumed samples: 6881280 | consumed tokens: 14092861440 | elapsed time per iteration (s): 0.57 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.910859E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.393 | TFLOPs: 42.94 | +7: iteration 26890/ 115203 | consumed samples: 6883840 | consumed tokens: 14098104320 | elapsed time per iteration (s): 0.55 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.899166E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 26900/ 115203 | consumed samples: 6886400 | consumed tokens: 14103347200 | elapsed time per iteration (s): 0.55 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.886145E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 26910/ 115203 | consumed samples: 6888960 | consumed tokens: 14108590080 | elapsed time per iteration (s): 0.55 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.888214E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.620 | TFLOPs: 44.01 | +7: iteration 26920/ 115203 | consumed samples: 6891520 | consumed tokens: 14113832960 | elapsed time per iteration (s): 0.56 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 2.891088E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.662 | TFLOPs: 43.82 | +7: iteration 26930/ 115203 | consumed samples: 6894080 | consumed tokens: 14119075840 | elapsed time per iteration (s): 0.55 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.893392E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.531 | TFLOPs: 44.00 | +7: iteration 26940/ 115203 | consumed samples: 6896640 | consumed tokens: 14124318720 | elapsed time per iteration (s): 0.55 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.885249E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.481 | TFLOPs: 44.00 | +7: iteration 26950/ 115203 | consumed samples: 6899200 | consumed tokens: 14129561600 | elapsed time per iteration (s): 0.56 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.892804E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.844 | TFLOPs: 43.56 | +7: iteration 26960/ 115203 | consumed samples: 6901760 | consumed tokens: 14134804480 | elapsed time per iteration (s): 0.55 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.898710E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.487 | TFLOPs: 44.00 | +7: iteration 26970/ 115203 | consumed samples: 6904320 | consumed tokens: 14140047360 | elapsed time per iteration (s): 0.55 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.893068E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.704 | TFLOPs: 44.02 | +7: iteration 26980/ 115203 | consumed samples: 6906880 | consumed tokens: 14145290240 | elapsed time per iteration (s): 0.56 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.878779E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.611 | TFLOPs: 43.72 | +7: iteration 26990/ 115203 | consumed samples: 6909440 | consumed tokens: 14150533120 | elapsed time per iteration (s): 0.56 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 2.859228E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.607 | TFLOPs: 43.72 | +7: iteration 27000/ 115203 | consumed samples: 6912000 | consumed tokens: 14155776000 | elapsed time per iteration (s): 0.56 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.900776E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.970 | TFLOPs: 43.38 | +7: iteration 27010/ 115203 | consumed samples: 6914560 | consumed tokens: 14161018880 | elapsed time per iteration (s): 0.55 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.875218E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.483 | TFLOPs: 44.00 | +7: iteration 27020/ 115203 | consumed samples: 6917120 | consumed tokens: 14166261760 | elapsed time per iteration (s): 0.56 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.869245E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.377 | TFLOPs: 43.70 | +7: iteration 27030/ 115203 | consumed samples: 6919680 | consumed tokens: 14171504640 | elapsed time per iteration (s): 0.55 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.879554E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.694 | TFLOPs: 44.02 | +7: iteration 27040/ 115203 | consumed samples: 6922240 | consumed tokens: 14176747520 | elapsed time per iteration (s): 0.55 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.869657E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.393 | TFLOPs: 43.99 | +7: iteration 27050/ 115203 | consumed samples: 6924800 | consumed tokens: 14181990400 | elapsed time per iteration (s): 0.56 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 2.877726E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.920 | TFLOPs: 43.37 | +7: iteration 27060/ 115203 | consumed samples: 6927360 | consumed tokens: 14187233280 | elapsed time per iteration (s): 0.56 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.884268E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.498 | TFLOPs: 43.71 | +7: iteration 27070/ 115203 | consumed samples: 6929920 | consumed tokens: 14192476160 | elapsed time per iteration (s): 0.55 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.878795E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 27080/ 115203 | consumed samples: 6932480 | consumed tokens: 14197719040 | elapsed time per iteration (s): 0.56 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.879908E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.184 | TFLOPs: 43.40 | +7: iteration 27090/ 115203 | consumed samples: 6935040 | consumed tokens: 14202961920 | elapsed time per iteration (s): 0.58 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.878452E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.440 | TFLOPs: 42.37 | +7: iteration 27100/ 115203 | consumed samples: 6937600 | consumed tokens: 14208204800 | elapsed time per iteration (s): 0.56 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.877447E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 27110/ 115203 | consumed samples: 6940160 | consumed tokens: 14213447680 | elapsed time per iteration (s): 0.56 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 2.883619E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.183 | TFLOPs: 43.30 | +7: iteration 27120/ 115203 | consumed samples: 6942720 | consumed tokens: 14218690560 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.868743E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.837 | TFLOPs: 43.84 | +7: iteration 27130/ 115203 | consumed samples: 6945280 | consumed tokens: 14223933440 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.891534E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.532 | TFLOPs: 43.91 | +7: iteration 27140/ 115203 | consumed samples: 6947840 | consumed tokens: 14229176320 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.864869E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.178 | TFLOPs: 43.97 | +7: iteration 27150/ 115203 | consumed samples: 6950400 | consumed tokens: 14234419200 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.876569E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.137 | TFLOPs: 43.96 | +7: iteration 27160/ 115203 | consumed samples: 6952960 | consumed tokens: 14239662080 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.880234E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 27170/ 115203 | consumed samples: 6955520 | consumed tokens: 14244904960 | elapsed time per iteration (s): 0.56 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 2.859359E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.340 | TFLOPs: 43.41 | +7: iteration 27180/ 115203 | consumed samples: 6958080 | consumed tokens: 14250147840 | elapsed time per iteration (s): 0.55 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.877365E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.301 | TFLOPs: 43.98 | +7: iteration 27190/ 115203 | consumed samples: 6960640 | consumed tokens: 14255390720 | elapsed time per iteration (s): 0.55 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.871430E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.319 | TFLOPs: 43.98 | +7: iteration 27200/ 115203 | consumed samples: 6963200 | consumed tokens: 14260633600 | elapsed time per iteration (s): 0.55 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.866747E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.364 | TFLOPs: 43.99 | +7: iteration 27210/ 115203 | consumed samples: 6965760 | consumed tokens: 14265876480 | elapsed time per iteration (s): 0.56 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.875684E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.250 | TFLOPs: 43.98 | +7: iteration 27220/ 115203 | consumed samples: 6968320 | consumed tokens: 14271119360 | elapsed time per iteration (s): 0.55 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.881100E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 27230/ 115203 | consumed samples: 6970880 | consumed tokens: 14276362240 | elapsed time per iteration (s): 0.56 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 2.876504E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.857 | TFLOPs: 43.46 | +7: iteration 27240/ 115203 | consumed samples: 6973440 | consumed tokens: 14281605120 | elapsed time per iteration (s): 0.56 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.887464E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.863 | TFLOPs: 43.94 | +7: iteration 27250/ 115203 | consumed samples: 6976000 | consumed tokens: 14286848000 | elapsed time per iteration (s): 0.57 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.885793E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.508 | TFLOPs: 43.05 | +7: iteration 27260/ 115203 | consumed samples: 6978560 | consumed tokens: 14292090880 | elapsed time per iteration (s): 0.56 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.879735E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.623 | TFLOPs: 43.92 | +7: iteration 27270/ 115203 | consumed samples: 6981120 | consumed tokens: 14297333760 | elapsed time per iteration (s): 0.56 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.891701E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.434 | TFLOPs: 43.52 | +7: iteration 27280/ 115203 | consumed samples: 6983680 | consumed tokens: 14302576640 | elapsed time per iteration (s): 0.56 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.878870E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.876 | TFLOPs: 43.94 | +7: iteration 27290/ 115203 | consumed samples: 6986240 | consumed tokens: 14307819520 | elapsed time per iteration (s): 0.56 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 2.877005E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.015 | TFLOPs: 43.86 | +7: iteration 27300/ 115203 | consumed samples: 6988800 | consumed tokens: 14313062400 | elapsed time per iteration (s): 0.55 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.871940E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.298 | TFLOPs: 43.98 | +7: iteration 27310/ 115203 | consumed samples: 6991360 | consumed tokens: 14318305280 | elapsed time per iteration (s): 0.56 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.863533E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.214 | TFLOPs: 43.69 | +7: iteration 27320/ 115203 | consumed samples: 6993920 | consumed tokens: 14323548160 | elapsed time per iteration (s): 0.56 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.887476E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.355 | TFLOPs: 43.51 | +7: iteration 27330/ 115203 | consumed samples: 6996480 | consumed tokens: 14328791040 | elapsed time per iteration (s): 0.56 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.860505E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.677 | TFLOPs: 43.54 | +7: iteration 27340/ 115203 | consumed samples: 6999040 | consumed tokens: 14334033920 | elapsed time per iteration (s): 0.55 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.874901E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.353 | TFLOPs: 43.99 | +7: iteration 27350/ 115203 | consumed samples: 7001600 | consumed tokens: 14339276800 | elapsed time per iteration (s): 0.56 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 2.855471E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.380 | TFLOPs: 43.42 | +7: iteration 27360/ 115203 | consumed samples: 7004160 | consumed tokens: 14344519680 | elapsed time per iteration (s): 0.55 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.863340E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.451 | TFLOPs: 43.99 | +7: iteration 27370/ 115203 | consumed samples: 7006720 | consumed tokens: 14349762560 | elapsed time per iteration (s): 0.56 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.855343E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.988 | TFLOPs: 43.95 | +7: iteration 27380/ 115203 | consumed samples: 7009280 | consumed tokens: 14355005440 | elapsed time per iteration (s): 0.55 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.871908E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 27390/ 115203 | consumed samples: 7011840 | consumed tokens: 14360248320 | elapsed time per iteration (s): 0.56 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.873742E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.026 | TFLOPs: 43.95 | +7: iteration 27400/ 115203 | consumed samples: 7014400 | consumed tokens: 14365491200 | elapsed time per iteration (s): 0.56 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.877076E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.281 | TFLOPs: 43.41 | +7: iteration 27410/ 115203 | consumed samples: 7016960 | consumed tokens: 14370734080 | elapsed time per iteration (s): 0.56 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.895317E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.741 | TFLOPs: 43.55 | +7: iteration 27420/ 115203 | consumed samples: 7019520 | consumed tokens: 14375976960 | elapsed time per iteration (s): 0.57 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 2.862185E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.975 | TFLOPs: 43.09 | +7: iteration 27430/ 115203 | consumed samples: 7022080 | consumed tokens: 14381219840 | elapsed time per iteration (s): 0.56 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.885774E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.091 | TFLOPs: 43.86 | +7: iteration 27440/ 115203 | consumed samples: 7024640 | consumed tokens: 14386462720 | elapsed time per iteration (s): 0.56 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.874821E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.196 | TFLOPs: 43.30 | +7: iteration 27450/ 115203 | consumed samples: 7027200 | consumed tokens: 14391705600 | elapsed time per iteration (s): 0.56 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.872895E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.993 | TFLOPs: 43.86 | +7: iteration 27460/ 115203 | consumed samples: 7029760 | consumed tokens: 14396948480 | elapsed time per iteration (s): 0.55 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.862962E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.449 | TFLOPs: 43.99 | +7: iteration 27470/ 115203 | consumed samples: 7032320 | consumed tokens: 14402191360 | elapsed time per iteration (s): 0.55 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.875039E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.286 | TFLOPs: 43.98 | +7: iteration 27480/ 115203 | consumed samples: 7034880 | consumed tokens: 14407434240 | elapsed time per iteration (s): 0.55 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 2.871460E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.424 | TFLOPs: 43.99 | +7: iteration 27490/ 115203 | consumed samples: 7037440 | consumed tokens: 14412677120 | elapsed time per iteration (s): 0.56 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.858595E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.921 | TFLOPs: 43.85 | +7: iteration 27500/ 115203 | consumed samples: 7040000 | consumed tokens: 14417920000 | elapsed time per iteration (s): 0.55 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.881010E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.274 | TFLOPs: 43.98 | +7: iteration 27510/ 115203 | consumed samples: 7042560 | consumed tokens: 14423162880 | elapsed time per iteration (s): 0.55 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.857586E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 27520/ 115203 | consumed samples: 7045120 | consumed tokens: 14428405760 | elapsed time per iteration (s): 0.57 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.872148E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.291 | TFLOPs: 43.12 | +7: iteration 27530/ 115203 | consumed samples: 7047680 | consumed tokens: 14433648640 | elapsed time per iteration (s): 0.57 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.869431E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.431 | TFLOPs: 42.47 | +7: iteration 27540/ 115203 | consumed samples: 7050240 | consumed tokens: 14438891520 | elapsed time per iteration (s): 0.57 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 2.866019E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.147 | TFLOPs: 42.82 | +7: iteration 27550/ 115203 | consumed samples: 7052800 | consumed tokens: 14444134400 | elapsed time per iteration (s): 0.60 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.869658E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.926 | TFLOPs: 40.99 | +7: iteration 27560/ 115203 | consumed samples: 7055360 | consumed tokens: 14449377280 | elapsed time per iteration (s): 0.58 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.860924E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.809 | TFLOPs: 41.84 | +7: iteration 27570/ 115203 | consumed samples: 7057920 | consumed tokens: 14454620160 | elapsed time per iteration (s): 0.57 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.872820E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.640 | TFLOPs: 43.15 | +7: iteration 27580/ 115203 | consumed samples: 7060480 | consumed tokens: 14459863040 | elapsed time per iteration (s): 0.57 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.881143E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.401 | TFLOPs: 42.94 | +7: iteration 27590/ 115203 | consumed samples: 7063040 | consumed tokens: 14465105920 | elapsed time per iteration (s): 0.55 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.863474E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.453 | TFLOPs: 43.99 | +7: iteration 27600/ 115203 | consumed samples: 7065600 | consumed tokens: 14470348800 | elapsed time per iteration (s): 0.56 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 2.851137E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.198 | TFLOPs: 43.49 | +7: iteration 27610/ 115203 | consumed samples: 7068160 | consumed tokens: 14475591680 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.864391E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.721 | TFLOPs: 43.35 | +7: iteration 27620/ 115203 | consumed samples: 7070720 | consumed tokens: 14480834560 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.873520E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.618 | TFLOPs: 43.72 | +7: iteration 27630/ 115203 | consumed samples: 7073280 | consumed tokens: 14486077440 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.866069E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.478 | TFLOPs: 43.90 | +7: iteration 27640/ 115203 | consumed samples: 7075840 | consumed tokens: 14491320320 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.873334E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.677 | TFLOPs: 43.44 | +7: iteration 27650/ 115203 | consumed samples: 7078400 | consumed tokens: 14496563200 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.875517E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.352 | TFLOPs: 43.32 | +7: iteration 27660/ 115203 | consumed samples: 7080960 | consumed tokens: 14501806080 | elapsed time per iteration (s): 0.56 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 2.871539E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.930 | TFLOPs: 43.94 | +7: iteration 27670/ 115203 | consumed samples: 7083520 | consumed tokens: 14507048960 | elapsed time per iteration (s): 0.56 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.868801E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.902 | TFLOPs: 43.94 | +7: iteration 27680/ 115203 | consumed samples: 7086080 | consumed tokens: 14512291840 | elapsed time per iteration (s): 0.57 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.878399E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.426 | TFLOPs: 43.04 | +7: iteration 27690/ 115203 | consumed samples: 7088640 | consumed tokens: 14517534720 | elapsed time per iteration (s): 0.56 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.855698E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.041 | TFLOPs: 43.96 | +7: iteration 27700/ 115203 | consumed samples: 7091200 | consumed tokens: 14522777600 | elapsed time per iteration (s): 0.56 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.888537E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.902 | TFLOPs: 43.94 | +7: iteration 27710/ 115203 | consumed samples: 7093760 | consumed tokens: 14528020480 | elapsed time per iteration (s): 0.56 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.877836E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.954 | TFLOPs: 43.95 | +7: iteration 27720/ 115203 | consumed samples: 7096320 | consumed tokens: 14533263360 | elapsed time per iteration (s): 0.56 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 2.858813E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 27730/ 115203 | consumed samples: 7098880 | consumed tokens: 14538506240 | elapsed time per iteration (s): 0.55 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.870761E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 27740/ 115203 | consumed samples: 7101440 | consumed tokens: 14543749120 | elapsed time per iteration (s): 0.56 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.872510E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.980 | TFLOPs: 43.95 | +7: iteration 27750/ 115203 | consumed samples: 7104000 | consumed tokens: 14548992000 | elapsed time per iteration (s): 0.56 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.865300E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.620 | TFLOPs: 43.34 | +7: iteration 27760/ 115203 | consumed samples: 7106560 | consumed tokens: 14554234880 | elapsed time per iteration (s): 0.57 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.872515E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.425 | TFLOPs: 42.94 | +7: iteration 27770/ 115203 | consumed samples: 7109120 | consumed tokens: 14559477760 | elapsed time per iteration (s): 0.56 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.857556E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.089 | TFLOPs: 43.96 | +7: iteration 27780/ 115203 | consumed samples: 7111680 | consumed tokens: 14564720640 | elapsed time per iteration (s): 0.56 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 2.878252E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.130 | TFLOPs: 43.96 | +7: iteration 27790/ 115203 | consumed samples: 7114240 | consumed tokens: 14569963520 | elapsed time per iteration (s): 0.55 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.874411E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.281 | TFLOPs: 43.98 | +7: iteration 27800/ 115203 | consumed samples: 7116800 | consumed tokens: 14575206400 | elapsed time per iteration (s): 0.56 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.870743E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.142 | TFLOPs: 43.58 | +7: iteration 27810/ 115203 | consumed samples: 7119360 | consumed tokens: 14580449280 | elapsed time per iteration (s): 0.56 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.868246E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.149 | TFLOPs: 43.97 | +7: iteration 27820/ 115203 | consumed samples: 7121920 | consumed tokens: 14585692160 | elapsed time per iteration (s): 0.56 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.872656E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.366 | TFLOPs: 43.80 | +7: iteration 27830/ 115203 | consumed samples: 7124480 | consumed tokens: 14590935040 | elapsed time per iteration (s): 0.56 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.873685E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.900 | TFLOPs: 43.37 | +7: iteration 27840/ 115203 | consumed samples: 7127040 | consumed tokens: 14596177920 | elapsed time per iteration (s): 0.56 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 2.860833E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 27850/ 115203 | consumed samples: 7129600 | consumed tokens: 14601420800 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.873790E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.224 | TFLOPs: 43.97 | +7: iteration 27860/ 115203 | consumed samples: 7132160 | consumed tokens: 14606663680 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.871336E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.729 | TFLOPs: 43.45 | +7: iteration 27870/ 115203 | consumed samples: 7134720 | consumed tokens: 14611906560 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.864095E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.173 | TFLOPs: 43.97 | +7: iteration 27880/ 115203 | consumed samples: 7137280 | consumed tokens: 14617149440 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.869391E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 27890/ 115203 | consumed samples: 7139840 | consumed tokens: 14622392320 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.868533E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.245 | TFLOPs: 43.97 | +7: iteration 27900/ 115203 | consumed samples: 7142400 | consumed tokens: 14627635200 | elapsed time per iteration (s): 0.56 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 2.869110E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.206 | TFLOPs: 43.97 | +7: iteration 27910/ 115203 | consumed samples: 7144960 | consumed tokens: 14632878080 | elapsed time per iteration (s): 0.55 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.876722E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 27920/ 115203 | consumed samples: 7147520 | consumed tokens: 14638120960 | elapsed time per iteration (s): 0.56 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.868832E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.912 | TFLOPs: 43.37 | +7: iteration 27930/ 115203 | consumed samples: 7150080 | consumed tokens: 14643363840 | elapsed time per iteration (s): 0.56 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.866656E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 27940/ 115203 | consumed samples: 7152640 | consumed tokens: 14648606720 | elapsed time per iteration (s): 0.56 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.861571E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.231 | TFLOPs: 43.97 | +7: iteration 27950/ 115203 | consumed samples: 7155200 | consumed tokens: 14653849600 | elapsed time per iteration (s): 0.56 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.864582E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.499 | TFLOPs: 43.43 | +7: iteration 27960/ 115203 | consumed samples: 7157760 | consumed tokens: 14659092480 | elapsed time per iteration (s): 0.56 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 2.857615E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.061 | TFLOPs: 43.96 | +7: iteration 27970/ 115203 | consumed samples: 7160320 | consumed tokens: 14664335360 | elapsed time per iteration (s): 0.55 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.862339E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.295 | TFLOPs: 43.98 | +7: iteration 27980/ 115203 | consumed samples: 7162880 | consumed tokens: 14669578240 | elapsed time per iteration (s): 0.56 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.870173E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.077 | TFLOPs: 43.67 | +7: iteration 27990/ 115203 | consumed samples: 7165440 | consumed tokens: 14674821120 | elapsed time per iteration (s): 0.55 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.880244E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +0: [2023-03-16 17:13:35,612] [INFO] [logging.py:68:log_dist] [Rank 0] step=28000, skipped=0, lr=[0.00017649035869598463, 0.00017649035869598463, 0.00017649035869598463], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 28000/ 115203 | consumed samples: 7168000 | consumed tokens: 14680064000 | elapsed time per iteration (s): 0.56 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.851656E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.118 | TFLOPs: 43.96 | +0: steps: 28000 loss: 2.8487 iter time (s): 0.557 samples/sec: 459.422 +7: iteration 28010/ 115203 | consumed samples: 7170560 | consumed tokens: 14685306880 | elapsed time per iteration (s): 0.56 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.864271E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.626 | TFLOPs: 43.92 | +7: iteration 28020/ 115203 | consumed samples: 7173120 | consumed tokens: 14690549760 | elapsed time per iteration (s): 0.56 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 2.874358E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.162 | TFLOPs: 43.97 | +7: iteration 28030/ 115203 | consumed samples: 7175680 | consumed tokens: 14695792640 | elapsed time per iteration (s): 0.56 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.875006E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.045 | TFLOPs: 43.96 | +7: iteration 28040/ 115203 | consumed samples: 7178240 | consumed tokens: 14701035520 | elapsed time per iteration (s): 0.56 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.858641E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.215 | TFLOPs: 43.97 | +7: iteration 28050/ 115203 | consumed samples: 7180800 | consumed tokens: 14706278400 | elapsed time per iteration (s): 0.55 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.863494E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 28060/ 115203 | consumed samples: 7183360 | consumed tokens: 14711521280 | elapsed time per iteration (s): 0.56 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.875996E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.817 | TFLOPs: 43.93 | +7: iteration 28070/ 115203 | consumed samples: 7185920 | consumed tokens: 14716764160 | elapsed time per iteration (s): 0.55 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.860064E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.278 | TFLOPs: 43.98 | +7: iteration 28080/ 115203 | consumed samples: 7188480 | consumed tokens: 14722007040 | elapsed time per iteration (s): 0.56 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 2.871462E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.705 | TFLOPs: 43.64 | +7: iteration 28090/ 115203 | consumed samples: 7191040 | consumed tokens: 14727249920 | elapsed time per iteration (s): 0.56 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.866661E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 28100/ 115203 | consumed samples: 7193600 | consumed tokens: 14732492800 | elapsed time per iteration (s): 0.55 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.856775E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 28110/ 115203 | consumed samples: 7196160 | consumed tokens: 14737735680 | elapsed time per iteration (s): 0.55 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.861755E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 28120/ 115203 | consumed samples: 7198720 | consumed tokens: 14742978560 | elapsed time per iteration (s): 0.55 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.859608E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.264 | TFLOPs: 43.98 | +7: iteration 28130/ 115203 | consumed samples: 7201280 | consumed tokens: 14748221440 | elapsed time per iteration (s): 0.55 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.867373E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 28140/ 115203 | consumed samples: 7203840 | consumed tokens: 14753464320 | elapsed time per iteration (s): 0.56 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 2.870542E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.742 | TFLOPs: 43.45 | +7: iteration 28150/ 115203 | consumed samples: 7206400 | consumed tokens: 14758707200 | elapsed time per iteration (s): 0.56 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.878478E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.007 | TFLOPs: 43.95 | +7: iteration 28160/ 115203 | consumed samples: 7208960 | consumed tokens: 14763950080 | elapsed time per iteration (s): 0.56 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.862386E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.627 | TFLOPs: 43.63 | +7: iteration 28170/ 115203 | consumed samples: 7211520 | consumed tokens: 14769192960 | elapsed time per iteration (s): 0.56 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.867715E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.128 | TFLOPs: 43.96 | +7: iteration 28180/ 115203 | consumed samples: 7214080 | consumed tokens: 14774435840 | elapsed time per iteration (s): 0.55 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.868791E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 28190/ 115203 | consumed samples: 7216640 | consumed tokens: 14779678720 | elapsed time per iteration (s): 0.56 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.870461E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.261 | TFLOPs: 43.59 | +7: iteration 28200/ 115203 | consumed samples: 7219200 | consumed tokens: 14784921600 | elapsed time per iteration (s): 0.56 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 2.868621E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.219 | TFLOPs: 43.97 | +7: iteration 28210/ 115203 | consumed samples: 7221760 | consumed tokens: 14790164480 | elapsed time per iteration (s): 0.55 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.861786E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.441 | TFLOPs: 43.99 | +7: iteration 28220/ 115203 | consumed samples: 7224320 | consumed tokens: 14795407360 | elapsed time per iteration (s): 0.56 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.876006E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.088 | TFLOPs: 43.96 | +7: iteration 28230/ 115203 | consumed samples: 7226880 | consumed tokens: 14800650240 | elapsed time per iteration (s): 0.56 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.862064E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.551 | TFLOPs: 43.53 | +7: iteration 28240/ 115203 | consumed samples: 7229440 | consumed tokens: 14805893120 | elapsed time per iteration (s): 0.56 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.865439E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 28250/ 115203 | consumed samples: 7232000 | consumed tokens: 14811136000 | elapsed time per iteration (s): 0.56 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.863306E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.565 | TFLOPs: 43.53 | +7: iteration 28260/ 115203 | consumed samples: 7234560 | consumed tokens: 14816378880 | elapsed time per iteration (s): 0.56 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 2.862385E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.247 | TFLOPs: 43.97 | +7: iteration 28270/ 115203 | consumed samples: 7237120 | consumed tokens: 14821621760 | elapsed time per iteration (s): 0.56 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.849713E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 28280/ 115203 | consumed samples: 7239680 | consumed tokens: 14826864640 | elapsed time per iteration (s): 0.55 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.857470E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.305 | TFLOPs: 43.98 | +7: iteration 28290/ 115203 | consumed samples: 7242240 | consumed tokens: 14832107520 | elapsed time per iteration (s): 0.56 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.855595E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.400 | TFLOPs: 43.42 | +7: iteration 28300/ 115203 | consumed samples: 7244800 | consumed tokens: 14837350400 | elapsed time per iteration (s): 0.56 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.866593E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.986 | TFLOPs: 43.95 | +7: iteration 28310/ 115203 | consumed samples: 7247360 | consumed tokens: 14842593280 | elapsed time per iteration (s): 0.56 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.877150E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 28320/ 115203 | consumed samples: 7249920 | consumed tokens: 14847836160 | elapsed time per iteration (s): 0.56 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 2.870565E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.072 | TFLOPs: 43.96 | +7: iteration 28330/ 115203 | consumed samples: 7252480 | consumed tokens: 14853079040 | elapsed time per iteration (s): 0.56 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.865621E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 28340/ 115203 | consumed samples: 7255040 | consumed tokens: 14858321920 | elapsed time per iteration (s): 0.56 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.859746E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.981 | TFLOPs: 43.95 | +7: iteration 28350/ 115203 | consumed samples: 7257600 | consumed tokens: 14863564800 | elapsed time per iteration (s): 0.56 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.871529E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.148 | TFLOPs: 43.97 | +7: iteration 28360/ 115203 | consumed samples: 7260160 | consumed tokens: 14868807680 | elapsed time per iteration (s): 0.56 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.858569E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 28370/ 115203 | consumed samples: 7262720 | consumed tokens: 14874050560 | elapsed time per iteration (s): 0.56 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.876463E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.555 | TFLOPs: 43.72 | +7: iteration 28380/ 115203 | consumed samples: 7265280 | consumed tokens: 14879293440 | elapsed time per iteration (s): 0.57 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 2.858608E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.549 | TFLOPs: 42.76 | +7: iteration 28390/ 115203 | consumed samples: 7267840 | consumed tokens: 14884536320 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.858041E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.878 | TFLOPs: 43.94 | +7: iteration 28400/ 115203 | consumed samples: 7270400 | consumed tokens: 14889779200 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.858290E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.585 | TFLOPs: 43.53 | +7: iteration 28410/ 115203 | consumed samples: 7272960 | consumed tokens: 14895022080 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.874875E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.712 | TFLOPs: 43.64 | +7: iteration 28420/ 115203 | consumed samples: 7275520 | consumed tokens: 14900264960 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.858059E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.851 | TFLOPs: 43.94 | +7: iteration 28430/ 115203 | consumed samples: 7278080 | consumed tokens: 14905507840 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.859872E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.570 | TFLOPs: 43.62 | +7: iteration 28440/ 115203 | consumed samples: 7280640 | consumed tokens: 14910750720 | elapsed time per iteration (s): 0.56 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 2.863236E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.664 | TFLOPs: 43.44 | +7: iteration 28450/ 115203 | consumed samples: 7283200 | consumed tokens: 14915993600 | elapsed time per iteration (s): 0.56 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 2.871187E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.890 | TFLOPs: 43.94 | +7: iteration 28460/ 115203 | consumed samples: 7285760 | consumed tokens: 14921236480 | elapsed time per iteration (s): 0.56 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 2.865146E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.948 | TFLOPs: 43.95 | +7: iteration 28470/ 115203 | consumed samples: 7288320 | consumed tokens: 14926479360 | elapsed time per iteration (s): 0.56 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 2.859155E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.786 | TFLOPs: 43.93 | +7: iteration 28480/ 115203 | consumed samples: 7290880 | consumed tokens: 14931722240 | elapsed time per iteration (s): 0.56 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 2.862168E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.925 | TFLOPs: 43.94 | +7: iteration 28490/ 115203 | consumed samples: 7293440 | consumed tokens: 14936965120 | elapsed time per iteration (s): 0.56 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 2.857584E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.323 | TFLOPs: 43.60 | +7: iteration 28500/ 115203 | consumed samples: 7296000 | consumed tokens: 14942208000 | elapsed time per iteration (s): 0.56 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.859907E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.864 | TFLOPs: 43.37 | +7: iteration 28510/ 115203 | consumed samples: 7298560 | consumed tokens: 14947450880 | elapsed time per iteration (s): 0.56 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.862798E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.128 | TFLOPs: 43.96 | +7: iteration 28520/ 115203 | consumed samples: 7301120 | consumed tokens: 14952693760 | elapsed time per iteration (s): 0.56 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.861367E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.976 | TFLOPs: 43.95 | +7: iteration 28530/ 115203 | consumed samples: 7303680 | consumed tokens: 14957936640 | elapsed time per iteration (s): 0.55 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.874781E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 28540/ 115203 | consumed samples: 7306240 | consumed tokens: 14963179520 | elapsed time per iteration (s): 0.56 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.860787E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.899 | TFLOPs: 43.56 | +7: iteration 28550/ 115203 | consumed samples: 7308800 | consumed tokens: 14968422400 | elapsed time per iteration (s): 0.56 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 2.858748E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.972 | TFLOPs: 43.95 | +7: iteration 28560/ 115203 | consumed samples: 7311360 | consumed tokens: 14973665280 | elapsed time per iteration (s): 0.56 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.855991E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.965 | TFLOPs: 43.95 | +7: iteration 28570/ 115203 | consumed samples: 7313920 | consumed tokens: 14978908160 | elapsed time per iteration (s): 0.56 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.870002E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.117 | TFLOPs: 43.96 | +7: iteration 28580/ 115203 | consumed samples: 7316480 | consumed tokens: 14984151040 | elapsed time per iteration (s): 0.56 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.857622E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.108 | TFLOPs: 43.96 | +7: iteration 28590/ 115203 | consumed samples: 7319040 | consumed tokens: 14989393920 | elapsed time per iteration (s): 0.56 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.863333E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.885 | TFLOPs: 43.94 | +7: iteration 28600/ 115203 | consumed samples: 7321600 | consumed tokens: 14994636800 | elapsed time per iteration (s): 0.56 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.852247E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.192 | TFLOPs: 43.97 | +7: iteration 28610/ 115203 | consumed samples: 7324160 | consumed tokens: 14999879680 | elapsed time per iteration (s): 0.57 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 2.869307E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.547 | TFLOPs: 42.95 | +7: iteration 28620/ 115203 | consumed samples: 7326720 | consumed tokens: 15005122560 | elapsed time per iteration (s): 0.56 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.878418E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.100 | TFLOPs: 43.96 | +7: iteration 28630/ 115203 | consumed samples: 7329280 | consumed tokens: 15010365440 | elapsed time per iteration (s): 0.56 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.858243E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.206 | TFLOPs: 43.97 | +7: iteration 28640/ 115203 | consumed samples: 7331840 | consumed tokens: 15015608320 | elapsed time per iteration (s): 0.56 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.856301E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 28650/ 115203 | consumed samples: 7334400 | consumed tokens: 15020851200 | elapsed time per iteration (s): 0.56 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.870974E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.258 | TFLOPs: 43.98 | +7: iteration 28660/ 115203 | consumed samples: 7336960 | consumed tokens: 15026094080 | elapsed time per iteration (s): 0.58 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.861927E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.271 | TFLOPs: 41.98 | +7: iteration 28670/ 115203 | consumed samples: 7339520 | consumed tokens: 15031336960 | elapsed time per iteration (s): 0.55 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 2.855997E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.289 | TFLOPs: 43.98 | +7: iteration 28680/ 115203 | consumed samples: 7342080 | consumed tokens: 15036579840 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.872714E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.200 | TFLOPs: 43.97 | +7: iteration 28690/ 115203 | consumed samples: 7344640 | consumed tokens: 15041822720 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.860301E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 28700/ 115203 | consumed samples: 7347200 | consumed tokens: 15047065600 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.873300E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.180 | TFLOPs: 43.97 | +7: iteration 28710/ 115203 | consumed samples: 7349760 | consumed tokens: 15052308480 | elapsed time per iteration (s): 0.55 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.866354E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.270 | TFLOPs: 43.98 | +7: iteration 28720/ 115203 | consumed samples: 7352320 | consumed tokens: 15057551360 | elapsed time per iteration (s): 0.55 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.865300E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.351 | TFLOPs: 43.98 | +7: iteration 28730/ 115203 | consumed samples: 7354880 | consumed tokens: 15062794240 | elapsed time per iteration (s): 0.56 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 2.871332E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.176 | TFLOPs: 43.97 | +7: iteration 28740/ 115203 | consumed samples: 7357440 | consumed tokens: 15068037120 | elapsed time per iteration (s): 0.55 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.862615E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.281 | TFLOPs: 43.98 | +7: iteration 28750/ 115203 | consumed samples: 7360000 | consumed tokens: 15073280000 | elapsed time per iteration (s): 0.55 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.874233E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 28760/ 115203 | consumed samples: 7362560 | consumed tokens: 15078522880 | elapsed time per iteration (s): 0.55 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.870649E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 28770/ 115203 | consumed samples: 7365120 | consumed tokens: 15083765760 | elapsed time per iteration (s): 0.56 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.858077E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.227 | TFLOPs: 43.97 | +7: iteration 28780/ 115203 | consumed samples: 7367680 | consumed tokens: 15089008640 | elapsed time per iteration (s): 0.56 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.861441E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.188 | TFLOPs: 43.97 | +7: iteration 28790/ 115203 | consumed samples: 7370240 | consumed tokens: 15094251520 | elapsed time per iteration (s): 0.56 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 2.874695E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.047 | TFLOPs: 43.57 | +7: iteration 28800/ 115203 | consumed samples: 7372800 | consumed tokens: 15099494400 | elapsed time per iteration (s): 0.56 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.862291E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 28810/ 115203 | consumed samples: 7375360 | consumed tokens: 15104737280 | elapsed time per iteration (s): 0.55 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.850284E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 28820/ 115203 | consumed samples: 7377920 | consumed tokens: 15109980160 | elapsed time per iteration (s): 0.56 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.856423E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.766 | TFLOPs: 43.36 | +7: iteration 28830/ 115203 | consumed samples: 7380480 | consumed tokens: 15115223040 | elapsed time per iteration (s): 0.55 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.855488E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.365 | TFLOPs: 43.99 | +7: iteration 28840/ 115203 | consumed samples: 7383040 | consumed tokens: 15120465920 | elapsed time per iteration (s): 0.79 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.857869E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 325.650 | TFLOPs: 31.05 | +7: iteration 28850/ 115203 | consumed samples: 7385600 | consumed tokens: 15125708800 | elapsed time per iteration (s): 0.56 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 2.865206E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.181 | TFLOPs: 43.40 | +7: iteration 28860/ 115203 | consumed samples: 7388160 | consumed tokens: 15130951680 | elapsed time per iteration (s): 0.56 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 2.863384E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.881 | TFLOPs: 43.75 | +7: iteration 28870/ 115203 | consumed samples: 7390720 | consumed tokens: 15136194560 | elapsed time per iteration (s): 0.56 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 2.860538E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.363 | TFLOPs: 43.22 | +7: iteration 28880/ 115203 | consumed samples: 7393280 | consumed tokens: 15141437440 | elapsed time per iteration (s): 0.55 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 2.859983E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.602 | TFLOPs: 44.01 | +7: iteration 28890/ 115203 | consumed samples: 7395840 | consumed tokens: 15146680320 | elapsed time per iteration (s): 0.56 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 2.846515E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.961 | TFLOPs: 43.66 | +7: iteration 28900/ 115203 | consumed samples: 7398400 | consumed tokens: 15151923200 | elapsed time per iteration (s): 0.56 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 2.848584E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.551 | TFLOPs: 43.62 | +7: iteration 28910/ 115203 | consumed samples: 7400960 | consumed tokens: 15157166080 | elapsed time per iteration (s): 0.55 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.860961E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.638 | TFLOPs: 44.01 | +7: iteration 28920/ 115203 | consumed samples: 7403520 | consumed tokens: 15162408960 | elapsed time per iteration (s): 0.56 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.853349E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.776 | TFLOPs: 43.64 | +7: iteration 28930/ 115203 | consumed samples: 7406080 | consumed tokens: 15167651840 | elapsed time per iteration (s): 0.56 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.847975E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.118 | TFLOPs: 43.30 | +7: iteration 28940/ 115203 | consumed samples: 7408640 | consumed tokens: 15172894720 | elapsed time per iteration (s): 0.56 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.854018E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.906 | TFLOPs: 43.75 | +7: iteration 28950/ 115203 | consumed samples: 7411200 | consumed tokens: 15178137600 | elapsed time per iteration (s): 0.55 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.872255E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.782 | TFLOPs: 44.03 | +7: iteration 28960/ 115203 | consumed samples: 7413760 | consumed tokens: 15183380480 | elapsed time per iteration (s): 0.55 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 2.863918E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 28970/ 115203 | consumed samples: 7416320 | consumed tokens: 15188623360 | elapsed time per iteration (s): 0.58 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.851760E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.115 | TFLOPs: 42.34 | +7: iteration 28980/ 115203 | consumed samples: 7418880 | consumed tokens: 15193866240 | elapsed time per iteration (s): 0.56 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.863226E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.475 | TFLOPs: 43.90 | +7: iteration 28990/ 115203 | consumed samples: 7421440 | consumed tokens: 15199109120 | elapsed time per iteration (s): 0.55 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.845341E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.394 | TFLOPs: 43.99 | +7: iteration 29000/ 115203 | consumed samples: 7424000 | consumed tokens: 15204352000 | elapsed time per iteration (s): 0.56 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.851557E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.157 | TFLOPs: 43.68 | +7: iteration 29010/ 115203 | consumed samples: 7426560 | consumed tokens: 15209594880 | elapsed time per iteration (s): 0.56 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.861132E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.374 | TFLOPs: 43.51 | +7: iteration 29020/ 115203 | consumed samples: 7429120 | consumed tokens: 15214837760 | elapsed time per iteration (s): 0.57 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 2.869281E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.874 | TFLOPs: 42.51 | +7: iteration 29030/ 115203 | consumed samples: 7431680 | consumed tokens: 15220080640 | elapsed time per iteration (s): 0.57 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.862429E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.194 | TFLOPs: 42.64 | +7: iteration 29040/ 115203 | consumed samples: 7434240 | consumed tokens: 15225323520 | elapsed time per iteration (s): 0.57 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.857650E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.460 | TFLOPs: 42.66 | +7: iteration 29050/ 115203 | consumed samples: 7436800 | consumed tokens: 15230566400 | elapsed time per iteration (s): 0.56 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.864021E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.359 | TFLOPs: 43.89 | +7: iteration 29060/ 115203 | consumed samples: 7439360 | consumed tokens: 15235809280 | elapsed time per iteration (s): 0.56 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.866341E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.079 | TFLOPs: 43.86 | +7: iteration 29070/ 115203 | consumed samples: 7441920 | consumed tokens: 15241052160 | elapsed time per iteration (s): 0.55 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.848799E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 29080/ 115203 | consumed samples: 7444480 | consumed tokens: 15246295040 | elapsed time per iteration (s): 0.57 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 2.855806E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.504 | TFLOPs: 43.05 | +7: iteration 29090/ 115203 | consumed samples: 7447040 | consumed tokens: 15251537920 | elapsed time per iteration (s): 0.57 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.856371E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.317 | TFLOPs: 43.12 | +7: iteration 29100/ 115203 | consumed samples: 7449600 | consumed tokens: 15256780800 | elapsed time per iteration (s): 0.56 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.856612E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.947 | TFLOPs: 43.28 | +7: iteration 29110/ 115203 | consumed samples: 7452160 | consumed tokens: 15262023680 | elapsed time per iteration (s): 0.57 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.855911E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.639 | TFLOPs: 43.15 | +7: iteration 29120/ 115203 | consumed samples: 7454720 | consumed tokens: 15267266560 | elapsed time per iteration (s): 0.57 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.866224E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.824 | TFLOPs: 42.89 | +7: iteration 29130/ 115203 | consumed samples: 7457280 | consumed tokens: 15272509440 | elapsed time per iteration (s): 0.56 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.853057E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.540 | TFLOPs: 43.43 | +7: iteration 29140/ 115203 | consumed samples: 7459840 | consumed tokens: 15277752320 | elapsed time per iteration (s): 0.57 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 2.867137E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.006 | TFLOPs: 42.71 | +7: iteration 29150/ 115203 | consumed samples: 7462400 | consumed tokens: 15282995200 | elapsed time per iteration (s): 0.57 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 2.866937E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.793 | TFLOPs: 43.17 | +7: iteration 29160/ 115203 | consumed samples: 7464960 | consumed tokens: 15288238080 | elapsed time per iteration (s): 0.57 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 2.865043E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.405 | TFLOPs: 42.46 | +7: iteration 29170/ 115203 | consumed samples: 7467520 | consumed tokens: 15293480960 | elapsed time per iteration (s): 0.56 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 2.856033E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.684 | TFLOPs: 43.35 | +7: iteration 29180/ 115203 | consumed samples: 7470080 | consumed tokens: 15298723840 | elapsed time per iteration (s): 0.56 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 2.867513E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.146 | TFLOPs: 43.77 | +7: iteration 29190/ 115203 | consumed samples: 7472640 | consumed tokens: 15303966720 | elapsed time per iteration (s): 0.57 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 2.851259E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.621 | TFLOPs: 43.15 | +7: iteration 29200/ 115203 | consumed samples: 7475200 | consumed tokens: 15309209600 | elapsed time per iteration (s): 0.56 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.875975E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.173 | TFLOPs: 43.30 | +7: iteration 29210/ 115203 | consumed samples: 7477760 | consumed tokens: 15314452480 | elapsed time per iteration (s): 0.58 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.864145E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.051 | TFLOPs: 42.05 | +7: iteration 29220/ 115203 | consumed samples: 7480320 | consumed tokens: 15319695360 | elapsed time per iteration (s): 0.57 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.848317E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.675 | TFLOPs: 42.87 | +7: iteration 29230/ 115203 | consumed samples: 7482880 | consumed tokens: 15324938240 | elapsed time per iteration (s): 0.56 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.841628E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.161 | TFLOPs: 43.20 | +7: iteration 29240/ 115203 | consumed samples: 7485440 | consumed tokens: 15330181120 | elapsed time per iteration (s): 0.57 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.861329E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.397 | TFLOPs: 42.85 | +7: iteration 29250/ 115203 | consumed samples: 7488000 | consumed tokens: 15335424000 | elapsed time per iteration (s): 0.56 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 2.867419E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.342 | TFLOPs: 43.41 | +7: iteration 29260/ 115203 | consumed samples: 7490560 | consumed tokens: 15340666880 | elapsed time per iteration (s): 0.56 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.858034E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.827 | TFLOPs: 43.74 | +7: iteration 29270/ 115203 | consumed samples: 7493120 | consumed tokens: 15345909760 | elapsed time per iteration (s): 0.57 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.859208E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.289 | TFLOPs: 43.03 | +7: iteration 29280/ 115203 | consumed samples: 7495680 | consumed tokens: 15351152640 | elapsed time per iteration (s): 0.57 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.856301E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.920 | TFLOPs: 42.90 | +7: iteration 29290/ 115203 | consumed samples: 7498240 | consumed tokens: 15356395520 | elapsed time per iteration (s): 0.56 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.871110E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.238 | TFLOPs: 43.59 | +7: iteration 29300/ 115203 | consumed samples: 7500800 | consumed tokens: 15361638400 | elapsed time per iteration (s): 0.57 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.856585E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.862 | TFLOPs: 42.98 | +7: iteration 29310/ 115203 | consumed samples: 7503360 | consumed tokens: 15366881280 | elapsed time per iteration (s): 0.56 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 2.849339E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.081 | TFLOPs: 43.29 | +7: iteration 29320/ 115203 | consumed samples: 7505920 | consumed tokens: 15372124160 | elapsed time per iteration (s): 0.58 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.858143E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.583 | TFLOPs: 42.10 | +7: iteration 29330/ 115203 | consumed samples: 7508480 | consumed tokens: 15377367040 | elapsed time per iteration (s): 0.56 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.852846E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.569 | TFLOPs: 43.43 | +7: iteration 29340/ 115203 | consumed samples: 7511040 | consumed tokens: 15382609920 | elapsed time per iteration (s): 0.57 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.851249E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.588 | TFLOPs: 42.67 | +7: iteration 29350/ 115203 | consumed samples: 7513600 | consumed tokens: 15387852800 | elapsed time per iteration (s): 0.57 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.857905E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.930 | TFLOPs: 42.99 | +7: iteration 29360/ 115203 | consumed samples: 7516160 | consumed tokens: 15393095680 | elapsed time per iteration (s): 0.56 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.860921E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.012 | TFLOPs: 43.29 | +7: iteration 29370/ 115203 | consumed samples: 7518720 | consumed tokens: 15398338560 | elapsed time per iteration (s): 0.56 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 2.874142E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.977 | TFLOPs: 43.28 | +7: iteration 29380/ 115203 | consumed samples: 7521280 | consumed tokens: 15403581440 | elapsed time per iteration (s): 0.57 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.850179E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.664 | TFLOPs: 43.16 | +7: iteration 29390/ 115203 | consumed samples: 7523840 | consumed tokens: 15408824320 | elapsed time per iteration (s): 0.56 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.867533E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.636 | TFLOPs: 43.44 | +7: iteration 29400/ 115203 | consumed samples: 7526400 | consumed tokens: 15414067200 | elapsed time per iteration (s): 0.57 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.856406E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.361 | TFLOPs: 42.65 | +7: iteration 29410/ 115203 | consumed samples: 7528960 | consumed tokens: 15419310080 | elapsed time per iteration (s): 0.57 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.850674E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.575 | TFLOPs: 43.05 | +7: iteration 29420/ 115203 | consumed samples: 7531520 | consumed tokens: 15424552960 | elapsed time per iteration (s): 0.57 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.861336E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.750 | TFLOPs: 42.59 | +7: iteration 29430/ 115203 | consumed samples: 7534080 | consumed tokens: 15429795840 | elapsed time per iteration (s): 0.57 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 2.866175E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.253 | TFLOPs: 42.64 | +7: iteration 29440/ 115203 | consumed samples: 7536640 | consumed tokens: 15435038720 | elapsed time per iteration (s): 0.56 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 2.844472E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.565 | TFLOPs: 43.53 | +7: iteration 29450/ 115203 | consumed samples: 7539200 | consumed tokens: 15440281600 | elapsed time per iteration (s): 0.56 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 2.860290E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.602 | TFLOPs: 43.82 | +7: iteration 29460/ 115203 | consumed samples: 7541760 | consumed tokens: 15445524480 | elapsed time per iteration (s): 0.55 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 2.866065E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.629 | TFLOPs: 44.01 | +7: iteration 29470/ 115203 | consumed samples: 7544320 | consumed tokens: 15450767360 | elapsed time per iteration (s): 0.56 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 2.867480E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.031 | TFLOPs: 43.48 | +7: iteration 29480/ 115203 | consumed samples: 7546880 | consumed tokens: 15456010240 | elapsed time per iteration (s): 0.55 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 2.857462E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.624 | TFLOPs: 44.01 | +7: iteration 29490/ 115203 | consumed samples: 7549440 | consumed tokens: 15461253120 | elapsed time per iteration (s): 0.56 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.846335E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.717 | TFLOPs: 43.73 | +7: iteration 29500/ 115203 | consumed samples: 7552000 | consumed tokens: 15466496000 | elapsed time per iteration (s): 0.58 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.856103E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.513 | TFLOPs: 41.90 | +7: iteration 29510/ 115203 | consumed samples: 7554560 | consumed tokens: 15471738880 | elapsed time per iteration (s): 0.57 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.850744E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.508 | TFLOPs: 43.14 | +7: iteration 29520/ 115203 | consumed samples: 7557120 | consumed tokens: 15476981760 | elapsed time per iteration (s): 0.57 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.869640E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.343 | TFLOPs: 42.74 | +7: iteration 29530/ 115203 | consumed samples: 7559680 | consumed tokens: 15482224640 | elapsed time per iteration (s): 0.57 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.862829E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.888 | TFLOPs: 42.89 | +7: iteration 29540/ 115203 | consumed samples: 7562240 | consumed tokens: 15487467520 | elapsed time per iteration (s): 0.56 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 2.858121E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.187 | TFLOPs: 43.30 | +7: iteration 29550/ 115203 | consumed samples: 7564800 | consumed tokens: 15492710400 | elapsed time per iteration (s): 0.57 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.848390E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.561 | TFLOPs: 42.48 | +7: iteration 29560/ 115203 | consumed samples: 7567360 | consumed tokens: 15497953280 | elapsed time per iteration (s): 0.57 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.852126E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.629 | TFLOPs: 43.15 | +7: iteration 29570/ 115203 | consumed samples: 7569920 | consumed tokens: 15503196160 | elapsed time per iteration (s): 0.56 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.857828E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.416 | TFLOPs: 43.51 | +7: iteration 29580/ 115203 | consumed samples: 7572480 | consumed tokens: 15508439040 | elapsed time per iteration (s): 0.57 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.858920E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.166 | TFLOPs: 42.54 | +7: iteration 29590/ 115203 | consumed samples: 7575040 | consumed tokens: 15513681920 | elapsed time per iteration (s): 0.57 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.869273E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.189 | TFLOPs: 43.11 | +7: iteration 29600/ 115203 | consumed samples: 7577600 | consumed tokens: 15518924800 | elapsed time per iteration (s): 0.58 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 2.853956E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.146 | TFLOPs: 41.96 | +7: iteration 29610/ 115203 | consumed samples: 7580160 | consumed tokens: 15524167680 | elapsed time per iteration (s): 0.56 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 2.859417E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.051 | TFLOPs: 43.67 | +7: iteration 29620/ 115203 | consumed samples: 7582720 | consumed tokens: 15529410560 | elapsed time per iteration (s): 0.56 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 2.856845E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.265 | TFLOPs: 43.40 | +7: iteration 29630/ 115203 | consumed samples: 7585280 | consumed tokens: 15534653440 | elapsed time per iteration (s): 0.57 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 2.860296E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.700 | TFLOPs: 43.06 | +7: iteration 29640/ 115203 | consumed samples: 7587840 | consumed tokens: 15539896320 | elapsed time per iteration (s): 0.57 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 2.847312E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.942 | TFLOPs: 42.99 | +7: iteration 29650/ 115203 | consumed samples: 7590400 | consumed tokens: 15545139200 | elapsed time per iteration (s): 0.56 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 2.855400E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.232 | TFLOPs: 43.21 | +7: iteration 29660/ 115203 | consumed samples: 7592960 | consumed tokens: 15550382080 | elapsed time per iteration (s): 0.57 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.834282E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.070 | TFLOPs: 43.10 | +7: iteration 29670/ 115203 | consumed samples: 7595520 | consumed tokens: 15555624960 | elapsed time per iteration (s): 0.58 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.870066E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.758 | TFLOPs: 42.40 | +7: iteration 29680/ 115203 | consumed samples: 7598080 | consumed tokens: 15560867840 | elapsed time per iteration (s): 0.57 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.861408E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.466 | TFLOPs: 42.76 | +7: iteration 29690/ 115203 | consumed samples: 7600640 | consumed tokens: 15566110720 | elapsed time per iteration (s): 0.57 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.847086E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.868 | TFLOPs: 43.18 | +7: iteration 29700/ 115203 | consumed samples: 7603200 | consumed tokens: 15571353600 | elapsed time per iteration (s): 0.57 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.854526E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.360 | TFLOPs: 43.03 | +7: iteration 29710/ 115203 | consumed samples: 7605760 | consumed tokens: 15576596480 | elapsed time per iteration (s): 0.57 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 2.878427E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.254 | TFLOPs: 42.55 | +7: iteration 29720/ 115203 | consumed samples: 7608320 | consumed tokens: 15581839360 | elapsed time per iteration (s): 0.57 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.868275E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.230 | TFLOPs: 42.73 | +7: iteration 29730/ 115203 | consumed samples: 7610880 | consumed tokens: 15587082240 | elapsed time per iteration (s): 0.56 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.852412E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.480 | TFLOPs: 43.71 | +7: iteration 29740/ 115203 | consumed samples: 7613440 | consumed tokens: 15592325120 | elapsed time per iteration (s): 0.59 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.851391E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.342 | TFLOPs: 41.70 | +7: iteration 29750/ 115203 | consumed samples: 7616000 | consumed tokens: 15597568000 | elapsed time per iteration (s): 0.56 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.868595E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.726 | TFLOPs: 43.35 | +7: iteration 29760/ 115203 | consumed samples: 7618560 | consumed tokens: 15602810880 | elapsed time per iteration (s): 0.57 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.853545E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.856 | TFLOPs: 43.17 | +7: iteration 29770/ 115203 | consumed samples: 7621120 | consumed tokens: 15608053760 | elapsed time per iteration (s): 0.57 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 2.854964E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.346 | TFLOPs: 42.94 | +7: iteration 29780/ 115203 | consumed samples: 7623680 | consumed tokens: 15613296640 | elapsed time per iteration (s): 0.56 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 2.859986E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.080 | TFLOPs: 43.39 | +7: iteration 29790/ 115203 | consumed samples: 7626240 | consumed tokens: 15618539520 | elapsed time per iteration (s): 0.57 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 2.850416E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.859 | TFLOPs: 43.08 | +7: iteration 29800/ 115203 | consumed samples: 7628800 | consumed tokens: 15623782400 | elapsed time per iteration (s): 0.58 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 2.838246E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.153 | TFLOPs: 42.44 | +7: iteration 29810/ 115203 | consumed samples: 7631360 | consumed tokens: 15629025280 | elapsed time per iteration (s): 0.56 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 2.860007E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.196 | TFLOPs: 43.49 | +7: iteration 29820/ 115203 | consumed samples: 7633920 | consumed tokens: 15634268160 | elapsed time per iteration (s): 0.56 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 2.851780E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.954 | TFLOPs: 43.76 | +7: iteration 29830/ 115203 | consumed samples: 7636480 | consumed tokens: 15639511040 | elapsed time per iteration (s): 0.56 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.852583E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.313 | TFLOPs: 43.41 | +7: iteration 29840/ 115203 | consumed samples: 7639040 | consumed tokens: 15644753920 | elapsed time per iteration (s): 0.56 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.851153E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.761 | TFLOPs: 43.74 | +7: iteration 29850/ 115203 | consumed samples: 7641600 | consumed tokens: 15649996800 | elapsed time per iteration (s): 0.57 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.854001E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.933 | TFLOPs: 42.99 | +7: iteration 29860/ 115203 | consumed samples: 7644160 | consumed tokens: 15655239680 | elapsed time per iteration (s): 0.57 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.852028E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.422 | TFLOPs: 42.94 | +7: iteration 29870/ 115203 | consumed samples: 7646720 | consumed tokens: 15660482560 | elapsed time per iteration (s): 0.56 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.858830E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.401 | TFLOPs: 43.51 | +7: iteration 29880/ 115203 | consumed samples: 7649280 | consumed tokens: 15665725440 | elapsed time per iteration (s): 0.58 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 2.838226E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.780 | TFLOPs: 41.93 | +7: iteration 29890/ 115203 | consumed samples: 7651840 | consumed tokens: 15670968320 | elapsed time per iteration (s): 0.56 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.871901E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.140 | TFLOPs: 43.20 | +7: iteration 29900/ 115203 | consumed samples: 7654400 | consumed tokens: 15676211200 | elapsed time per iteration (s): 0.56 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.857779E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.919 | TFLOPs: 43.37 | +7: iteration 29910/ 115203 | consumed samples: 7656960 | consumed tokens: 15681454080 | elapsed time per iteration (s): 0.56 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.832672E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.813 | TFLOPs: 43.27 | +7: iteration 29920/ 115203 | consumed samples: 7659520 | consumed tokens: 15686696960 | elapsed time per iteration (s): 0.56 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.840530E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.052 | TFLOPs: 43.48 | +7: iteration 29930/ 115203 | consumed samples: 7662080 | consumed tokens: 15691939840 | elapsed time per iteration (s): 0.57 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.851807E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.383 | TFLOPs: 43.13 | +7: iteration 29940/ 115203 | consumed samples: 7664640 | consumed tokens: 15697182720 | elapsed time per iteration (s): 0.57 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 2.865314E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.610 | TFLOPs: 43.15 | +7: iteration 29950/ 115203 | consumed samples: 7667200 | consumed tokens: 15702425600 | elapsed time per iteration (s): 0.55 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 2.832822E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.498 | TFLOPs: 44.00 | +7: iteration 29960/ 115203 | consumed samples: 7669760 | consumed tokens: 15707668480 | elapsed time per iteration (s): 0.57 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 2.864579E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.738 | TFLOPs: 42.50 | +7: iteration 29970/ 115203 | consumed samples: 7672320 | consumed tokens: 15712911360 | elapsed time per iteration (s): 0.57 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 2.866567E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.826 | TFLOPs: 42.98 | +7: iteration 29980/ 115203 | consumed samples: 7674880 | consumed tokens: 15718154240 | elapsed time per iteration (s): 0.56 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 2.841022E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.162 | TFLOPs: 43.39 | +7: iteration 29990/ 115203 | consumed samples: 7677440 | consumed tokens: 15723397120 | elapsed time per iteration (s): 0.58 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 2.851699E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.483 | TFLOPs: 42.09 | +0: [2023-03-16 17:32:21,674] [INFO] [logging.py:68:log_dist] [Rank 0] step=30000, skipped=0, lr=[0.00017304965296758478, 0.00017304965296758478, 0.00017304965296758478], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 30000/ 115203 | consumed samples: 7680000 | consumed tokens: 15728640000 | elapsed time per iteration (s): 0.57 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.857698E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.559 | TFLOPs: 42.48 | +0: steps: 30000 loss: 2.8557 iter time (s): 0.561 samples/sec: 456.610 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 30000 | lm loss value: 3.324608E+00 | lm loss PPL: 2.778810E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 30000 to checkpoints_421m60b400m +0: [2023-03-16 17:32:21,917] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is begin to save! +0: [2023-03-16 17:32:21,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_01-model_00-model_states.pt... +0: [2023-03-16 17:32:22,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_01-model_00-model_states.pt. +0: [2023-03-16 17:32:22,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_03-model_00-model_states.pt... +0: [2023-03-16 17:32:22,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_03-model_00-model_states.pt. +0: [2023-03-16 17:32:22,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_04-model_00-model_states.pt... +0: [2023-03-16 17:32:22,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_04-model_00-model_states.pt. +0: [2023-03-16 17:32:22,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_05-model_00-model_states.pt... +0: [2023-03-16 17:32:22,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_05-model_00-model_states.pt. +0: [2023-03-16 17:32:22,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_06-model_00-model_states.pt... +0: [2023-03-16 17:32:22,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_06-model_00-model_states.pt. +0: [2023-03-16 17:32:22,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_07-model_00-model_states.pt... +0: [2023-03-16 17:32:22,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_07-model_00-model_states.pt. +0: [2023-03-16 17:32:22,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_08-model_00-model_states.pt... +0: [2023-03-16 17:32:22,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_08-model_00-model_states.pt. +0: [2023-03-16 17:32:22,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_09-model_00-model_states.pt... +0: [2023-03-16 17:32:22,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_09-model_00-model_states.pt. +0: [2023-03-16 17:32:22,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_10-model_00-model_states.pt... +0: [2023-03-16 17:32:22,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_10-model_00-model_states.pt. +0: [2023-03-16 17:32:22,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_11-model_00-model_states.pt... +0: [2023-03-16 17:32:22,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_11-model_00-model_states.pt. +0: [2023-03-16 17:32:22,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_12-model_00-model_states.pt... +0: [2023-03-16 17:32:22,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_12-model_00-model_states.pt. +0: [2023-03-16 17:32:22,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_13-model_00-model_states.pt... +0: [2023-03-16 17:32:22,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_13-model_00-model_states.pt. +0: [2023-03-16 17:32:22,544] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_14-model_00-model_states.pt... +0: [2023-03-16 17:32:22,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_14-model_00-model_states.pt. +0: [2023-03-16 17:32:22,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_15-model_00-model_states.pt... +0: [2023-03-16 17:32:22,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_15-model_00-model_states.pt. +0: [2023-03-16 17:32:22,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_16-model_00-model_states.pt... +0: [2023-03-16 17:32:22,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_16-model_00-model_states.pt. +0: [2023-03-16 17:32:22,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_17-model_00-model_states.pt... +0: [2023-03-16 17:32:22,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_17-model_00-model_states.pt. +0: [2023-03-16 17:32:22,711] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_18-model_00-model_states.pt... +0: [2023-03-16 17:32:22,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_18-model_00-model_states.pt. +0: [2023-03-16 17:32:22,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_19-model_00-model_states.pt... +0: [2023-03-16 17:32:22,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_19-model_00-model_states.pt. +0: [2023-03-16 17:32:22,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_20-model_00-model_states.pt... +0: [2023-03-16 17:32:22,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_20-model_00-model_states.pt. +0: [2023-03-16 17:32:22,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/layer_22-model_00-model_states.pt... +0: [2023-03-16 17:32:22,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/layer_22-model_00-model_states.pt. +0: [2023-03-16 17:32:22,840] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step30000/mp_rank_00_model_states.pt +0: [2023-03-16 17:32:22,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/mp_rank_00_model_states.pt... +0: [2023-03-16 17:32:22,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/mp_rank_00_model_states.pt. +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 17:32:22,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-16 17:32:22,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:22,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 17:32:22,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:22,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-16 17:32:22,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-16 17:32:23,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-16 17:32:23,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-16 17:32:23,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-16 17:32:23,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-16 17:32:23,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-16 17:32:23,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 17:32:23,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-16 17:32:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: successfully saved checkpoint at iteration 30000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1236.14 +7: iteration 30010/ 115203 | consumed samples: 7682560 | consumed tokens: 15733882880 | elapsed time per iteration (s): 0.71 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.849882E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 358.800 | TFLOPs: 34.21 | +7: iteration 30020/ 115203 | consumed samples: 7685120 | consumed tokens: 15739125760 | elapsed time per iteration (s): 0.57 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.826277E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.280 | TFLOPs: 43.12 | +7: iteration 30030/ 115203 | consumed samples: 7687680 | consumed tokens: 15744368640 | elapsed time per iteration (s): 0.56 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.836230E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.190 | TFLOPs: 43.40 | +7: iteration 30040/ 115203 | consumed samples: 7690240 | consumed tokens: 15749611520 | elapsed time per iteration (s): 0.57 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.845378E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.340 | TFLOPs: 42.55 | +7: iteration 30050/ 115203 | consumed samples: 7692800 | consumed tokens: 15754854400 | elapsed time per iteration (s): 0.57 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 2.849346E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.708 | TFLOPs: 43.16 | +7: iteration 30060/ 115203 | consumed samples: 7695360 | consumed tokens: 15760097280 | elapsed time per iteration (s): 0.56 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.859620E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.164 | TFLOPs: 43.20 | +7: iteration 30070/ 115203 | consumed samples: 7697920 | consumed tokens: 15765340160 | elapsed time per iteration (s): 0.57 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.858941E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.362 | TFLOPs: 42.84 | +7: iteration 30080/ 115203 | consumed samples: 7700480 | consumed tokens: 15770583040 | elapsed time per iteration (s): 0.56 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.851258E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.781 | TFLOPs: 43.45 | +7: iteration 30090/ 115203 | consumed samples: 7703040 | consumed tokens: 15775825920 | elapsed time per iteration (s): 0.57 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.839178E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.173 | TFLOPs: 43.01 | +7: iteration 30100/ 115203 | consumed samples: 7705600 | consumed tokens: 15781068800 | elapsed time per iteration (s): 0.57 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.850995E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.617 | TFLOPs: 42.58 | +7: iteration 30110/ 115203 | consumed samples: 7708160 | consumed tokens: 15786311680 | elapsed time per iteration (s): 0.56 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 2.844744E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.720 | TFLOPs: 43.83 | +7: iteration 30120/ 115203 | consumed samples: 7710720 | consumed tokens: 15791554560 | elapsed time per iteration (s): 0.57 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 2.861499E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.171 | TFLOPs: 42.92 | +7: iteration 30130/ 115203 | consumed samples: 7713280 | consumed tokens: 15796797440 | elapsed time per iteration (s): 0.56 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 2.839152E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.930 | TFLOPs: 43.66 | +7: iteration 30140/ 115203 | consumed samples: 7715840 | consumed tokens: 15802040320 | elapsed time per iteration (s): 0.57 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 2.852472E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.736 | TFLOPs: 43.07 | +7: iteration 30150/ 115203 | consumed samples: 7718400 | consumed tokens: 15807283200 | elapsed time per iteration (s): 0.56 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 2.850694E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.324 | TFLOPs: 43.60 | +7: iteration 30160/ 115203 | consumed samples: 7720960 | consumed tokens: 15812526080 | elapsed time per iteration (s): 0.56 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 2.854364E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.802 | TFLOPs: 43.55 | +7: iteration 30170/ 115203 | consumed samples: 7723520 | consumed tokens: 15817768960 | elapsed time per iteration (s): 0.57 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.851081E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.496 | TFLOPs: 43.05 | +7: iteration 30180/ 115203 | consumed samples: 7726080 | consumed tokens: 15823011840 | elapsed time per iteration (s): 0.57 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.840573E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.858 | TFLOPs: 42.98 | +7: iteration 30190/ 115203 | consumed samples: 7728640 | consumed tokens: 15828254720 | elapsed time per iteration (s): 0.56 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.846316E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.504 | TFLOPs: 43.43 | +7: iteration 30200/ 115203 | consumed samples: 7731200 | consumed tokens: 15833497600 | elapsed time per iteration (s): 0.56 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.848488E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.191 | TFLOPs: 43.40 | +7: iteration 30210/ 115203 | consumed samples: 7733760 | consumed tokens: 15838740480 | elapsed time per iteration (s): 0.56 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.865011E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.112 | TFLOPs: 43.39 | +7: iteration 30220/ 115203 | consumed samples: 7736320 | consumed tokens: 15843983360 | elapsed time per iteration (s): 0.55 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 2.843359E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.895 | TFLOPs: 44.04 | +7: iteration 30230/ 115203 | consumed samples: 7738880 | consumed tokens: 15849226240 | elapsed time per iteration (s): 0.57 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.855176E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.073 | TFLOPs: 43.20 | +7: iteration 30240/ 115203 | consumed samples: 7741440 | consumed tokens: 15854469120 | elapsed time per iteration (s): 0.57 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.840705E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.351 | TFLOPs: 43.03 | +7: iteration 30250/ 115203 | consumed samples: 7744000 | consumed tokens: 15859712000 | elapsed time per iteration (s): 0.56 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.845897E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.698 | TFLOPs: 43.26 | +7: iteration 30260/ 115203 | consumed samples: 7746560 | consumed tokens: 15864954880 | elapsed time per iteration (s): 0.56 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.866934E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.057 | TFLOPs: 43.86 | +7: iteration 30270/ 115203 | consumed samples: 7749120 | consumed tokens: 15870197760 | elapsed time per iteration (s): 0.56 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.854939E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.770 | TFLOPs: 43.64 | +7: iteration 30280/ 115203 | consumed samples: 7751680 | consumed tokens: 15875440640 | elapsed time per iteration (s): 0.56 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 2.838168E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.328 | TFLOPs: 43.41 | +7: iteration 30290/ 115203 | consumed samples: 7754240 | consumed tokens: 15880683520 | elapsed time per iteration (s): 0.55 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 2.831954E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.777 | TFLOPs: 44.03 | +7: iteration 30300/ 115203 | consumed samples: 7756800 | consumed tokens: 15885926400 | elapsed time per iteration (s): 0.56 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 2.862240E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.368 | TFLOPs: 43.61 | +7: iteration 30310/ 115203 | consumed samples: 7759360 | consumed tokens: 15891169280 | elapsed time per iteration (s): 0.56 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 2.844006E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.741 | TFLOPs: 43.55 | +7: iteration 30320/ 115203 | consumed samples: 7761920 | consumed tokens: 15896412160 | elapsed time per iteration (s): 0.56 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 2.849613E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.163 | TFLOPs: 43.87 | +7: iteration 30330/ 115203 | consumed samples: 7764480 | consumed tokens: 15901655040 | elapsed time per iteration (s): 0.56 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 2.844156E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.889 | TFLOPs: 43.46 | +7: iteration 30340/ 115203 | consumed samples: 7767040 | consumed tokens: 15906897920 | elapsed time per iteration (s): 0.56 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.851512E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.133 | TFLOPs: 43.30 | +7: iteration 30350/ 115203 | consumed samples: 7769600 | consumed tokens: 15912140800 | elapsed time per iteration (s): 0.56 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.848150E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.821 | TFLOPs: 43.55 | +7: iteration 30360/ 115203 | consumed samples: 7772160 | consumed tokens: 15917383680 | elapsed time per iteration (s): 0.56 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.861478E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.346 | TFLOPs: 43.32 | +7: iteration 30370/ 115203 | consumed samples: 7774720 | consumed tokens: 15922626560 | elapsed time per iteration (s): 0.55 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.845449E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.432 | TFLOPs: 43.99 | +7: iteration 30380/ 115203 | consumed samples: 7777280 | consumed tokens: 15927869440 | elapsed time per iteration (s): 0.55 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.851275E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 30390/ 115203 | consumed samples: 7779840 | consumed tokens: 15933112320 | elapsed time per iteration (s): 0.55 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 2.841978E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.516 | TFLOPs: 44.00 | +7: iteration 30400/ 115203 | consumed samples: 7782400 | consumed tokens: 15938355200 | elapsed time per iteration (s): 0.56 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 2.853103E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.482 | TFLOPs: 43.52 | +7: iteration 30410/ 115203 | consumed samples: 7784960 | consumed tokens: 15943598080 | elapsed time per iteration (s): 0.56 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 2.860052E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.750 | TFLOPs: 43.74 | +7: iteration 30420/ 115203 | consumed samples: 7787520 | consumed tokens: 15948840960 | elapsed time per iteration (s): 0.57 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 2.851648E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.112 | TFLOPs: 42.82 | +7: iteration 30430/ 115203 | consumed samples: 7790080 | consumed tokens: 15954083840 | elapsed time per iteration (s): 0.55 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 2.865049E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.762 | TFLOPs: 44.02 | +7: iteration 30440/ 115203 | consumed samples: 7792640 | consumed tokens: 15959326720 | elapsed time per iteration (s): 0.56 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 2.840110E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.049 | TFLOPs: 43.67 | +7: iteration 30450/ 115203 | consumed samples: 7795200 | consumed tokens: 15964569600 | elapsed time per iteration (s): 0.56 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.833091E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.388 | TFLOPs: 43.42 | +7: iteration 30460/ 115203 | consumed samples: 7797760 | consumed tokens: 15969812480 | elapsed time per iteration (s): 0.57 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.854893E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.292 | TFLOPs: 42.64 | +7: iteration 30470/ 115203 | consumed samples: 7800320 | consumed tokens: 15975055360 | elapsed time per iteration (s): 0.56 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.846195E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.920 | TFLOPs: 43.56 | +7: iteration 30480/ 115203 | consumed samples: 7802880 | consumed tokens: 15980298240 | elapsed time per iteration (s): 0.56 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.844945E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.660 | TFLOPs: 43.73 | +7: iteration 30490/ 115203 | consumed samples: 7805440 | consumed tokens: 15985541120 | elapsed time per iteration (s): 0.57 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.852878E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.125 | TFLOPs: 42.53 | +7: iteration 30500/ 115203 | consumed samples: 7808000 | consumed tokens: 15990784000 | elapsed time per iteration (s): 0.55 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 2.843498E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.734 | TFLOPs: 44.02 | +7: iteration 30510/ 115203 | consumed samples: 7810560 | consumed tokens: 15996026880 | elapsed time per iteration (s): 0.56 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.835102E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.133 | TFLOPs: 43.20 | +7: iteration 30520/ 115203 | consumed samples: 7813120 | consumed tokens: 16001269760 | elapsed time per iteration (s): 0.57 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.837849E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.232 | TFLOPs: 42.54 | +7: iteration 30530/ 115203 | consumed samples: 7815680 | consumed tokens: 16006512640 | elapsed time per iteration (s): 0.56 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.853741E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.713 | TFLOPs: 43.35 | +7: iteration 30540/ 115203 | consumed samples: 7818240 | consumed tokens: 16011755520 | elapsed time per iteration (s): 0.55 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.841826E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.659 | TFLOPs: 44.01 | +7: iteration 30550/ 115203 | consumed samples: 7820800 | consumed tokens: 16016998400 | elapsed time per iteration (s): 0.57 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.854341E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.855 | TFLOPs: 43.17 | +7: iteration 30560/ 115203 | consumed samples: 7823360 | consumed tokens: 16022241280 | elapsed time per iteration (s): 0.55 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 2.839382E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.654 | TFLOPs: 44.01 | +7: iteration 30570/ 115203 | consumed samples: 7825920 | consumed tokens: 16027484160 | elapsed time per iteration (s): 0.59 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 2.859956E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.926 | TFLOPs: 41.66 | +7: iteration 30580/ 115203 | consumed samples: 7828480 | consumed tokens: 16032727040 | elapsed time per iteration (s): 0.55 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 2.840636E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.569 | TFLOPs: 44.01 | +7: iteration 30590/ 115203 | consumed samples: 7831040 | consumed tokens: 16037969920 | elapsed time per iteration (s): 0.55 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 2.865937E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.589 | TFLOPs: 44.01 | +7: iteration 30600/ 115203 | consumed samples: 7833600 | consumed tokens: 16043212800 | elapsed time per iteration (s): 0.56 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 2.846382E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.006 | TFLOPs: 43.48 | +7: iteration 30610/ 115203 | consumed samples: 7836160 | consumed tokens: 16048455680 | elapsed time per iteration (s): 0.55 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 2.848061E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 30620/ 115203 | consumed samples: 7838720 | consumed tokens: 16053698560 | elapsed time per iteration (s): 0.56 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.838582E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.580 | TFLOPs: 43.53 | +7: iteration 30630/ 115203 | consumed samples: 7841280 | consumed tokens: 16058941440 | elapsed time per iteration (s): 0.55 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.843790E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.520 | TFLOPs: 44.00 | +7: iteration 30640/ 115203 | consumed samples: 7843840 | consumed tokens: 16064184320 | elapsed time per iteration (s): 0.57 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.858012E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.453 | TFLOPs: 43.14 | +7: iteration 30650/ 115203 | consumed samples: 7846400 | consumed tokens: 16069427200 | elapsed time per iteration (s): 0.56 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.848234E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.228 | TFLOPs: 43.50 | +7: iteration 30660/ 115203 | consumed samples: 7848960 | consumed tokens: 16074670080 | elapsed time per iteration (s): 0.56 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.828923E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.297 | TFLOPs: 43.22 | +7: iteration 30670/ 115203 | consumed samples: 7851520 | consumed tokens: 16079912960 | elapsed time per iteration (s): 0.57 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 2.846039E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.016 | TFLOPs: 43.09 | +7: iteration 30680/ 115203 | consumed samples: 7854080 | consumed tokens: 16085155840 | elapsed time per iteration (s): 0.56 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 2.840064E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.258 | TFLOPs: 43.40 | +7: iteration 30690/ 115203 | consumed samples: 7856640 | consumed tokens: 16090398720 | elapsed time per iteration (s): 0.56 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 2.834087E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.670 | TFLOPs: 43.44 | +7: iteration 30700/ 115203 | consumed samples: 7859200 | consumed tokens: 16095641600 | elapsed time per iteration (s): 0.56 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 2.845631E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.736 | TFLOPs: 43.45 | +7: iteration 30710/ 115203 | consumed samples: 7861760 | consumed tokens: 16100884480 | elapsed time per iteration (s): 0.57 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 2.851275E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.092 | TFLOPs: 43.01 | +7: iteration 30720/ 115203 | consumed samples: 7864320 | consumed tokens: 16106127360 | elapsed time per iteration (s): 0.56 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 2.855460E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.595 | TFLOPs: 43.34 | +7: iteration 30730/ 115203 | consumed samples: 7866880 | consumed tokens: 16111370240 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.841829E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.978 | TFLOPs: 43.38 | +7: iteration 30740/ 115203 | consumed samples: 7869440 | consumed tokens: 16116613120 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.844823E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.279 | TFLOPs: 43.79 | +7: iteration 30750/ 115203 | consumed samples: 7872000 | consumed tokens: 16121856000 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.845228E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.994 | TFLOPs: 43.66 | +7: iteration 30760/ 115203 | consumed samples: 7874560 | consumed tokens: 16127098880 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.836743E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.672 | TFLOPs: 43.44 | +7: iteration 30770/ 115203 | consumed samples: 7877120 | consumed tokens: 16132341760 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.849851E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.254 | TFLOPs: 43.69 | +7: iteration 30780/ 115203 | consumed samples: 7879680 | consumed tokens: 16137584640 | elapsed time per iteration (s): 0.56 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 2.839511E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.344 | TFLOPs: 43.89 | +7: iteration 30790/ 115203 | consumed samples: 7882240 | consumed tokens: 16142827520 | elapsed time per iteration (s): 0.55 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 2.846424E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.511 | TFLOPs: 44.00 | +7: iteration 30800/ 115203 | consumed samples: 7884800 | consumed tokens: 16148070400 | elapsed time per iteration (s): 0.55 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 2.858144E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.626 | TFLOPs: 44.01 | +7: iteration 30810/ 115203 | consumed samples: 7887360 | consumed tokens: 16153313280 | elapsed time per iteration (s): 0.56 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 2.838143E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.540 | TFLOPs: 43.72 | +7: iteration 30820/ 115203 | consumed samples: 7889920 | consumed tokens: 16158556160 | elapsed time per iteration (s): 0.57 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 2.840131E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.006 | TFLOPs: 43.19 | +7: iteration 30830/ 115203 | consumed samples: 7892480 | consumed tokens: 16163799040 | elapsed time per iteration (s): 0.57 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 2.844123E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.633 | TFLOPs: 42.49 | +7: iteration 30840/ 115203 | consumed samples: 7895040 | consumed tokens: 16169041920 | elapsed time per iteration (s): 0.56 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.841341E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.480 | TFLOPs: 43.52 | +7: iteration 30850/ 115203 | consumed samples: 7897600 | consumed tokens: 16174284800 | elapsed time per iteration (s): 0.56 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.847110E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.316 | TFLOPs: 43.79 | +7: iteration 30860/ 115203 | consumed samples: 7900160 | consumed tokens: 16179527680 | elapsed time per iteration (s): 0.56 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.856253E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.411 | TFLOPs: 43.80 | +7: iteration 30870/ 115203 | consumed samples: 7902720 | consumed tokens: 16184770560 | elapsed time per iteration (s): 0.55 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.845662E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.468 | TFLOPs: 44.00 | +7: iteration 30880/ 115203 | consumed samples: 7905280 | consumed tokens: 16190013440 | elapsed time per iteration (s): 0.56 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.835115E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.918 | TFLOPs: 43.66 | +7: iteration 30890/ 115203 | consumed samples: 7907840 | consumed tokens: 16195256320 | elapsed time per iteration (s): 0.56 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 2.848768E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.272 | TFLOPs: 43.69 | +7: iteration 30900/ 115203 | consumed samples: 7910400 | consumed tokens: 16200499200 | elapsed time per iteration (s): 0.57 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 2.850988E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.036 | TFLOPs: 43.00 | +7: iteration 30910/ 115203 | consumed samples: 7912960 | consumed tokens: 16205742080 | elapsed time per iteration (s): 0.57 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 2.834699E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.067 | TFLOPs: 43.00 | +7: iteration 30920/ 115203 | consumed samples: 7915520 | consumed tokens: 16210984960 | elapsed time per iteration (s): 0.57 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 2.832628E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.037 | TFLOPs: 43.19 | +7: iteration 30930/ 115203 | consumed samples: 7918080 | consumed tokens: 16216227840 | elapsed time per iteration (s): 0.55 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 2.833869E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.841 | TFLOPs: 44.03 | +7: iteration 30940/ 115203 | consumed samples: 7920640 | consumed tokens: 16221470720 | elapsed time per iteration (s): 0.57 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 2.853792E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.090 | TFLOPs: 43.01 | +7: iteration 30950/ 115203 | consumed samples: 7923200 | consumed tokens: 16226713600 | elapsed time per iteration (s): 0.56 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.852803E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.830 | TFLOPs: 43.65 | +7: iteration 30960/ 115203 | consumed samples: 7925760 | consumed tokens: 16231956480 | elapsed time per iteration (s): 0.57 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.847287E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.071 | TFLOPs: 43.00 | +7: iteration 30970/ 115203 | consumed samples: 7928320 | consumed tokens: 16237199360 | elapsed time per iteration (s): 0.57 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.842028E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.961 | TFLOPs: 43.18 | +7: iteration 30980/ 115203 | consumed samples: 7930880 | consumed tokens: 16242442240 | elapsed time per iteration (s): 0.57 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.833781E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.878 | TFLOPs: 43.08 | +7: iteration 30990/ 115203 | consumed samples: 7933440 | consumed tokens: 16247685120 | elapsed time per iteration (s): 0.55 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.839106E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.925 | TFLOPs: 44.04 | +7: iteration 31000/ 115203 | consumed samples: 7936000 | consumed tokens: 16252928000 | elapsed time per iteration (s): 0.56 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 2.838192E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.149 | TFLOPs: 43.58 | +7: iteration 31010/ 115203 | consumed samples: 7938560 | consumed tokens: 16258170880 | elapsed time per iteration (s): 0.55 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 2.862710E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.744 | TFLOPs: 44.02 | +7: iteration 31020/ 115203 | consumed samples: 7941120 | consumed tokens: 16263413760 | elapsed time per iteration (s): 0.55 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 2.856992E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.657 | TFLOPs: 44.01 | +7: iteration 31030/ 115203 | consumed samples: 7943680 | consumed tokens: 16268656640 | elapsed time per iteration (s): 0.56 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 2.841886E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.825 | TFLOPs: 43.55 | +7: iteration 31040/ 115203 | consumed samples: 7946240 | consumed tokens: 16273899520 | elapsed time per iteration (s): 0.56 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 2.835176E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.392 | TFLOPs: 43.61 | +7: iteration 31050/ 115203 | consumed samples: 7948800 | consumed tokens: 16279142400 | elapsed time per iteration (s): 0.56 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 2.825381E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.982 | TFLOPs: 43.66 | +7: iteration 31060/ 115203 | consumed samples: 7951360 | consumed tokens: 16284385280 | elapsed time per iteration (s): 0.57 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.839634E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.817 | TFLOPs: 43.17 | +7: iteration 31070/ 115203 | consumed samples: 7953920 | consumed tokens: 16289628160 | elapsed time per iteration (s): 0.56 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.843458E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.857 | TFLOPs: 43.27 | +7: iteration 31080/ 115203 | consumed samples: 7956480 | consumed tokens: 16294871040 | elapsed time per iteration (s): 0.56 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.832038E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.036 | TFLOPs: 43.76 | +7: iteration 31090/ 115203 | consumed samples: 7959040 | consumed tokens: 16300113920 | elapsed time per iteration (s): 0.55 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.843651E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.903 | TFLOPs: 44.04 | +7: iteration 31100/ 115203 | consumed samples: 7961600 | consumed tokens: 16305356800 | elapsed time per iteration (s): 0.58 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.853461E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.757 | TFLOPs: 42.12 | +7: iteration 31110/ 115203 | consumed samples: 7964160 | consumed tokens: 16310599680 | elapsed time per iteration (s): 0.55 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 2.837736E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.955 | TFLOPs: 44.04 | +7: iteration 31120/ 115203 | consumed samples: 7966720 | consumed tokens: 16315842560 | elapsed time per iteration (s): 0.56 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 2.831749E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.421 | TFLOPs: 43.23 | +7: iteration 31130/ 115203 | consumed samples: 7969280 | consumed tokens: 16321085440 | elapsed time per iteration (s): 0.57 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 2.847382E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.547 | TFLOPs: 42.86 | +7: iteration 31140/ 115203 | consumed samples: 7971840 | consumed tokens: 16326328320 | elapsed time per iteration (s): 0.57 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 2.843077E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.113 | TFLOPs: 43.01 | +7: iteration 31150/ 115203 | consumed samples: 7974400 | consumed tokens: 16331571200 | elapsed time per iteration (s): 0.58 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 2.843837E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.173 | TFLOPs: 41.87 | +7: iteration 31160/ 115203 | consumed samples: 7976960 | consumed tokens: 16336814080 | elapsed time per iteration (s): 0.58 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 2.842149E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.382 | TFLOPs: 42.08 | +7: iteration 31170/ 115203 | consumed samples: 7979520 | consumed tokens: 16342056960 | elapsed time per iteration (s): 0.55 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.844740E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.594 | TFLOPs: 44.01 | +7: iteration 31180/ 115203 | consumed samples: 7982080 | consumed tokens: 16347299840 | elapsed time per iteration (s): 0.58 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.850397E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.993 | TFLOPs: 42.04 | +7: iteration 31190/ 115203 | consumed samples: 7984640 | consumed tokens: 16352542720 | elapsed time per iteration (s): 0.56 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.836789E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.177 | TFLOPs: 43.49 | +7: iteration 31200/ 115203 | consumed samples: 7987200 | consumed tokens: 16357785600 | elapsed time per iteration (s): 0.58 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.837475E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.158 | TFLOPs: 42.44 | +7: iteration 31210/ 115203 | consumed samples: 7989760 | consumed tokens: 16363028480 | elapsed time per iteration (s): 0.56 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.840093E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.604 | TFLOPs: 43.53 | +7: iteration 31220/ 115203 | consumed samples: 7992320 | consumed tokens: 16368271360 | elapsed time per iteration (s): 0.58 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 2.850201E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.556 | TFLOPs: 42.29 | +7: iteration 31230/ 115203 | consumed samples: 7994880 | consumed tokens: 16373514240 | elapsed time per iteration (s): 0.56 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 2.846183E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.392 | TFLOPs: 43.32 | +7: iteration 31240/ 115203 | consumed samples: 7997440 | consumed tokens: 16378757120 | elapsed time per iteration (s): 0.56 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 2.839758E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.842 | TFLOPs: 43.65 | +7: iteration 31250/ 115203 | consumed samples: 8000000 | consumed tokens: 16384000000 | elapsed time per iteration (s): 0.56 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 2.841988E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.202 | TFLOPs: 43.68 | +7: iteration 31260/ 115203 | consumed samples: 8002560 | consumed tokens: 16389242880 | elapsed time per iteration (s): 0.57 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 2.841144E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.917 | TFLOPs: 42.70 | +7: iteration 31270/ 115203 | consumed samples: 8005120 | consumed tokens: 16394485760 | elapsed time per iteration (s): 0.59 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 2.842256E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.921 | TFLOPs: 41.56 | +7: iteration 31280/ 115203 | consumed samples: 8007680 | consumed tokens: 16399728640 | elapsed time per iteration (s): 0.58 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.842082E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.567 | TFLOPs: 42.19 | +7: iteration 31290/ 115203 | consumed samples: 8010240 | consumed tokens: 16404971520 | elapsed time per iteration (s): 0.56 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.842585E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.335 | TFLOPs: 43.41 | +7: iteration 31300/ 115203 | consumed samples: 8012800 | consumed tokens: 16410214400 | elapsed time per iteration (s): 0.57 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.853482E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.638 | TFLOPs: 42.96 | +7: iteration 31310/ 115203 | consumed samples: 8015360 | consumed tokens: 16415457280 | elapsed time per iteration (s): 0.57 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.841823E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.030 | TFLOPs: 43.00 | +7: iteration 31320/ 115203 | consumed samples: 8017920 | consumed tokens: 16420700160 | elapsed time per iteration (s): 0.60 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.847937E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.871 | TFLOPs: 40.89 | +7: iteration 31330/ 115203 | consumed samples: 8020480 | consumed tokens: 16425943040 | elapsed time per iteration (s): 0.58 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 2.839200E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.131 | TFLOPs: 42.15 | +7: iteration 31340/ 115203 | consumed samples: 8023040 | consumed tokens: 16431185920 | elapsed time per iteration (s): 0.56 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 2.855536E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.743 | TFLOPs: 43.55 | +7: iteration 31350/ 115203 | consumed samples: 8025600 | consumed tokens: 16436428800 | elapsed time per iteration (s): 0.56 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 2.838067E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.305 | TFLOPs: 43.50 | +7: iteration 31360/ 115203 | consumed samples: 8028160 | consumed tokens: 16441671680 | elapsed time per iteration (s): 0.59 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 2.843321E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.176 | TFLOPs: 41.49 | +7: iteration 31370/ 115203 | consumed samples: 8030720 | consumed tokens: 16446914560 | elapsed time per iteration (s): 0.60 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 2.845234E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.294 | TFLOPs: 40.55 | +7: iteration 31380/ 115203 | consumed samples: 8033280 | consumed tokens: 16452157440 | elapsed time per iteration (s): 0.57 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 2.837538E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.577 | TFLOPs: 42.96 | +7: iteration 31390/ 115203 | consumed samples: 8035840 | consumed tokens: 16457400320 | elapsed time per iteration (s): 0.57 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.836429E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.235 | TFLOPs: 43.12 | +7: iteration 31400/ 115203 | consumed samples: 8038400 | consumed tokens: 16462643200 | elapsed time per iteration (s): 0.59 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.835482E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.761 | TFLOPs: 41.55 | +7: iteration 31410/ 115203 | consumed samples: 8040960 | consumed tokens: 16467886080 | elapsed time per iteration (s): 0.57 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.836531E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.269 | TFLOPs: 42.64 | +7: iteration 31420/ 115203 | consumed samples: 8043520 | consumed tokens: 16473128960 | elapsed time per iteration (s): 0.56 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.836158E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.743 | TFLOPs: 43.83 | +7: iteration 31430/ 115203 | consumed samples: 8046080 | consumed tokens: 16478371840 | elapsed time per iteration (s): 0.58 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.840584E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.865 | TFLOPs: 42.13 | +7: iteration 31440/ 115203 | consumed samples: 8048640 | consumed tokens: 16483614720 | elapsed time per iteration (s): 0.58 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 2.836922E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.293 | TFLOPs: 41.88 | +7: iteration 31450/ 115203 | consumed samples: 8051200 | consumed tokens: 16488857600 | elapsed time per iteration (s): 0.57 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 2.838760E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.413 | TFLOPs: 42.75 | +7: iteration 31460/ 115203 | consumed samples: 8053760 | consumed tokens: 16494100480 | elapsed time per iteration (s): 0.58 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 2.832449E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.556 | TFLOPs: 42.38 | +7: iteration 31470/ 115203 | consumed samples: 8056320 | consumed tokens: 16499343360 | elapsed time per iteration (s): 0.57 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 2.846666E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.303 | TFLOPs: 42.93 | +7: iteration 31480/ 115203 | consumed samples: 8058880 | consumed tokens: 16504586240 | elapsed time per iteration (s): 0.58 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 2.834823E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.122 | TFLOPs: 42.06 | +7: iteration 31490/ 115203 | consumed samples: 8061440 | consumed tokens: 16509829120 | elapsed time per iteration (s): 0.60 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 2.850343E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.846 | TFLOPs: 40.89 | +7: iteration 31500/ 115203 | consumed samples: 8064000 | consumed tokens: 16515072000 | elapsed time per iteration (s): 0.57 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.835828E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.088 | TFLOPs: 42.91 | +7: iteration 31510/ 115203 | consumed samples: 8066560 | consumed tokens: 16520314880 | elapsed time per iteration (s): 0.57 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.858429E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.057 | TFLOPs: 42.91 | +7: iteration 31520/ 115203 | consumed samples: 8069120 | consumed tokens: 16525557760 | elapsed time per iteration (s): 0.59 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.841462E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.817 | TFLOPs: 41.65 | +7: iteration 31530/ 115203 | consumed samples: 8071680 | consumed tokens: 16530800640 | elapsed time per iteration (s): 0.57 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.841024E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.419 | TFLOPs: 42.56 | +7: iteration 31540/ 115203 | consumed samples: 8074240 | consumed tokens: 16536043520 | elapsed time per iteration (s): 0.57 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.847828E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.122 | TFLOPs: 42.53 | +7: iteration 31550/ 115203 | consumed samples: 8076800 | consumed tokens: 16541286400 | elapsed time per iteration (s): 0.57 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 2.847135E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.719 | TFLOPs: 42.97 | +7: iteration 31560/ 115203 | consumed samples: 8079360 | consumed tokens: 16546529280 | elapsed time per iteration (s): 0.57 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 2.841299E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.039 | TFLOPs: 43.00 | +7: iteration 31570/ 115203 | consumed samples: 8081920 | consumed tokens: 16551772160 | elapsed time per iteration (s): 0.57 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 2.842638E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.544 | TFLOPs: 42.67 | +7: iteration 31580/ 115203 | consumed samples: 8084480 | consumed tokens: 16557015040 | elapsed time per iteration (s): 0.58 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 2.842369E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.981 | TFLOPs: 42.23 | +7: iteration 31590/ 115203 | consumed samples: 8087040 | consumed tokens: 16562257920 | elapsed time per iteration (s): 0.57 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 2.835579E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.410 | TFLOPs: 42.85 | +7: iteration 31600/ 115203 | consumed samples: 8089600 | consumed tokens: 16567500800 | elapsed time per iteration (s): 0.59 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 2.833622E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.383 | TFLOPs: 41.51 | +7: iteration 31610/ 115203 | consumed samples: 8092160 | consumed tokens: 16572743680 | elapsed time per iteration (s): 0.60 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 2.847072E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.111 | TFLOPs: 40.43 | +7: iteration 31620/ 115203 | consumed samples: 8094720 | consumed tokens: 16577986560 | elapsed time per iteration (s): 0.58 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 2.837975E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.568 | TFLOPs: 42.29 | +7: iteration 31630/ 115203 | consumed samples: 8097280 | consumed tokens: 16583229440 | elapsed time per iteration (s): 0.58 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 2.849189E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.237 | TFLOPs: 42.16 | +7: iteration 31640/ 115203 | consumed samples: 8099840 | consumed tokens: 16588472320 | elapsed time per iteration (s): 0.59 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 2.844559E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.047 | TFLOPs: 41.57 | +7: iteration 31650/ 115203 | consumed samples: 8102400 | consumed tokens: 16593715200 | elapsed time per iteration (s): 0.59 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 2.828175E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.983 | TFLOPs: 41.57 | +7: iteration 31660/ 115203 | consumed samples: 8104960 | consumed tokens: 16598958080 | elapsed time per iteration (s): 0.59 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.851911E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.046 | TFLOPs: 41.67 | +7: iteration 31670/ 115203 | consumed samples: 8107520 | consumed tokens: 16604200960 | elapsed time per iteration (s): 0.57 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.842155E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.658 | TFLOPs: 43.06 | +7: iteration 31680/ 115203 | consumed samples: 8110080 | consumed tokens: 16609443840 | elapsed time per iteration (s): 0.57 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.839129E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.358 | TFLOPs: 42.56 | +7: iteration 31690/ 115203 | consumed samples: 8112640 | consumed tokens: 16614686720 | elapsed time per iteration (s): 0.57 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.853129E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.854 | TFLOPs: 42.79 | +7: iteration 31700/ 115203 | consumed samples: 8115200 | consumed tokens: 16619929600 | elapsed time per iteration (s): 0.59 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.828879E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.659 | TFLOPs: 41.34 | +7: iteration 31710/ 115203 | consumed samples: 8117760 | consumed tokens: 16625172480 | elapsed time per iteration (s): 0.58 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 2.828402E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.791 | TFLOPs: 42.31 | +7: iteration 31720/ 115203 | consumed samples: 8120320 | consumed tokens: 16630415360 | elapsed time per iteration (s): 0.58 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 2.842246E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.494 | TFLOPs: 42.28 | +7: iteration 31730/ 115203 | consumed samples: 8122880 | consumed tokens: 16635658240 | elapsed time per iteration (s): 0.57 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 2.847273E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.159 | TFLOPs: 42.82 | +7: iteration 31740/ 115203 | consumed samples: 8125440 | consumed tokens: 16640901120 | elapsed time per iteration (s): 0.59 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 2.833547E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.742 | TFLOPs: 41.07 | +7: iteration 31750/ 115203 | consumed samples: 8128000 | consumed tokens: 16646144000 | elapsed time per iteration (s): 0.57 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 2.848244E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.280 | TFLOPs: 43.02 | +7: iteration 31760/ 115203 | consumed samples: 8130560 | consumed tokens: 16651386880 | elapsed time per iteration (s): 0.57 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 2.830639E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.231 | TFLOPs: 42.45 | +7: iteration 31770/ 115203 | consumed samples: 8133120 | consumed tokens: 16656629760 | elapsed time per iteration (s): 0.58 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.840651E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.658 | TFLOPs: 42.11 | +7: iteration 31780/ 115203 | consumed samples: 8135680 | consumed tokens: 16661872640 | elapsed time per iteration (s): 0.58 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.845297E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.946 | TFLOPs: 41.94 | +7: iteration 31790/ 115203 | consumed samples: 8138240 | consumed tokens: 16667115520 | elapsed time per iteration (s): 0.57 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.850221E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.074 | TFLOPs: 43.10 | +7: iteration 31800/ 115203 | consumed samples: 8140800 | consumed tokens: 16672358400 | elapsed time per iteration (s): 0.56 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.829886E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.987 | TFLOPs: 43.57 | +7: iteration 31810/ 115203 | consumed samples: 8143360 | consumed tokens: 16677601280 | elapsed time per iteration (s): 0.57 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.838762E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.611 | TFLOPs: 43.15 | +7: iteration 31820/ 115203 | consumed samples: 8145920 | consumed tokens: 16682844160 | elapsed time per iteration (s): 0.57 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 2.836840E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.287 | TFLOPs: 42.74 | +7: iteration 31830/ 115203 | consumed samples: 8148480 | consumed tokens: 16688087040 | elapsed time per iteration (s): 0.57 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 2.838966E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.445 | TFLOPs: 42.66 | +7: iteration 31840/ 115203 | consumed samples: 8151040 | consumed tokens: 16693329920 | elapsed time per iteration (s): 0.58 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 2.837888E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.172 | TFLOPs: 41.77 | +7: iteration 31850/ 115203 | consumed samples: 8153600 | consumed tokens: 16698572800 | elapsed time per iteration (s): 0.57 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 2.852025E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.579 | TFLOPs: 42.67 | +7: iteration 31860/ 115203 | consumed samples: 8156160 | consumed tokens: 16703815680 | elapsed time per iteration (s): 0.57 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 2.833781E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.001 | TFLOPs: 42.52 | +7: iteration 31870/ 115203 | consumed samples: 8158720 | consumed tokens: 16709058560 | elapsed time per iteration (s): 0.57 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 2.832451E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.986 | TFLOPs: 42.90 | +7: iteration 31880/ 115203 | consumed samples: 8161280 | consumed tokens: 16714301440 | elapsed time per iteration (s): 0.57 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 2.837094E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.346 | TFLOPs: 42.46 | +7: iteration 31890/ 115203 | consumed samples: 8163840 | consumed tokens: 16719544320 | elapsed time per iteration (s): 0.56 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 2.839097E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.988 | TFLOPs: 43.85 | +7: iteration 31900/ 115203 | consumed samples: 8166400 | consumed tokens: 16724787200 | elapsed time per iteration (s): 0.57 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 2.826756E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.495 | TFLOPs: 42.95 | +7: iteration 31910/ 115203 | consumed samples: 8168960 | consumed tokens: 16730030080 | elapsed time per iteration (s): 0.56 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 2.838705E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.415 | TFLOPs: 43.32 | +7: iteration 31920/ 115203 | consumed samples: 8171520 | consumed tokens: 16735272960 | elapsed time per iteration (s): 0.57 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 2.837954E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.777 | TFLOPs: 43.07 | +7: iteration 31930/ 115203 | consumed samples: 8174080 | consumed tokens: 16740515840 | elapsed time per iteration (s): 0.59 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.842592E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.611 | TFLOPs: 41.34 | +7: iteration 31940/ 115203 | consumed samples: 8176640 | consumed tokens: 16745758720 | elapsed time per iteration (s): 0.58 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.859020E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.714 | TFLOPs: 41.83 | +7: iteration 31950/ 115203 | consumed samples: 8179200 | consumed tokens: 16751001600 | elapsed time per iteration (s): 0.58 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.843457E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.633 | TFLOPs: 42.01 | +7: iteration 31960/ 115203 | consumed samples: 8181760 | consumed tokens: 16756244480 | elapsed time per iteration (s): 0.58 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.846426E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.314 | TFLOPs: 42.27 | +7: iteration 31970/ 115203 | consumed samples: 8184320 | consumed tokens: 16761487360 | elapsed time per iteration (s): 0.60 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.860605E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.438 | TFLOPs: 40.66 | +7: iteration 31980/ 115203 | consumed samples: 8186880 | consumed tokens: 16766730240 | elapsed time per iteration (s): 0.59 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 2.823793E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.457 | TFLOPs: 41.71 | +7: iteration 31990/ 115203 | consumed samples: 8189440 | consumed tokens: 16771973120 | elapsed time per iteration (s): 0.58 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 2.826814E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.157 | TFLOPs: 41.96 | +0: [2023-03-16 17:51:18,755] [INFO] [logging.py:68:log_dist] [Rank 0] step=32000, skipped=0, lr=[0.00016941764143236279, 0.00016941764143236279, 0.00016941764143236279], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 32000/ 115203 | consumed samples: 8192000 | consumed tokens: 16777216000 | elapsed time per iteration (s): 0.60 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 2.828145E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.918 | TFLOPs: 40.70 | +0: steps: 32000 loss: 2.7736 iter time (s): 0.566 samples/sec: 452.143 +7: iteration 32010/ 115203 | consumed samples: 8194560 | consumed tokens: 16782458880 | elapsed time per iteration (s): 0.57 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 2.827299E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.252 | TFLOPs: 42.74 | +7: iteration 32020/ 115203 | consumed samples: 8197120 | consumed tokens: 16787701760 | elapsed time per iteration (s): 0.57 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 2.828398E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.601 | TFLOPs: 42.86 | +7: iteration 32030/ 115203 | consumed samples: 8199680 | consumed tokens: 16792944640 | elapsed time per iteration (s): 0.57 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 2.842259E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.287 | TFLOPs: 42.64 | +7: iteration 32040/ 115203 | consumed samples: 8202240 | consumed tokens: 16798187520 | elapsed time per iteration (s): 0.57 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 2.836852E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.776 | TFLOPs: 42.88 | +7: iteration 32050/ 115203 | consumed samples: 8204800 | consumed tokens: 16803430400 | elapsed time per iteration (s): 0.56 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 2.833939E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.624 | TFLOPs: 43.25 | +7: iteration 32060/ 115203 | consumed samples: 8207360 | consumed tokens: 16808673280 | elapsed time per iteration (s): 0.56 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 2.830757E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.886 | TFLOPs: 43.65 | +7: iteration 32070/ 115203 | consumed samples: 8209920 | consumed tokens: 16813916160 | elapsed time per iteration (s): 0.56 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 2.835371E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.004 | TFLOPs: 43.38 | +7: iteration 32080/ 115203 | consumed samples: 8212480 | consumed tokens: 16819159040 | elapsed time per iteration (s): 0.56 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 2.830026E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.265 | TFLOPs: 43.21 | +7: iteration 32090/ 115203 | consumed samples: 8215040 | consumed tokens: 16824401920 | elapsed time per iteration (s): 0.56 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.830199E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.218 | TFLOPs: 43.40 | +7: iteration 32100/ 115203 | consumed samples: 8217600 | consumed tokens: 16829644800 | elapsed time per iteration (s): 0.56 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.835093E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.874 | TFLOPs: 43.27 | +7: iteration 32110/ 115203 | consumed samples: 8220160 | consumed tokens: 16834887680 | elapsed time per iteration (s): 0.57 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.841780E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.962 | TFLOPs: 42.90 | +7: iteration 32120/ 115203 | consumed samples: 8222720 | consumed tokens: 16840130560 | elapsed time per iteration (s): 0.56 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.854028E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.999 | TFLOPs: 43.95 | +7: iteration 32130/ 115203 | consumed samples: 8225280 | consumed tokens: 16845373440 | elapsed time per iteration (s): 0.58 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.832334E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.002 | TFLOPs: 41.95 | +7: iteration 32140/ 115203 | consumed samples: 8227840 | consumed tokens: 16850616320 | elapsed time per iteration (s): 0.59 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 2.842179E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.595 | TFLOPs: 41.72 | +7: iteration 32150/ 115203 | consumed samples: 8230400 | consumed tokens: 16855859200 | elapsed time per iteration (s): 0.56 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 2.843030E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.290 | TFLOPs: 43.50 | +7: iteration 32160/ 115203 | consumed samples: 8232960 | consumed tokens: 16861102080 | elapsed time per iteration (s): 0.56 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 2.844650E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.450 | TFLOPs: 43.52 | +7: iteration 32170/ 115203 | consumed samples: 8235520 | consumed tokens: 16866344960 | elapsed time per iteration (s): 0.58 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 2.829665E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.552 | TFLOPs: 42.38 | +7: iteration 32180/ 115203 | consumed samples: 8238080 | consumed tokens: 16871587840 | elapsed time per iteration (s): 0.56 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 2.831684E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.805 | TFLOPs: 43.27 | +7: iteration 32190/ 115203 | consumed samples: 8240640 | consumed tokens: 16876830720 | elapsed time per iteration (s): 0.58 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 2.841679E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.208 | TFLOPs: 42.06 | +7: iteration 32200/ 115203 | consumed samples: 8243200 | consumed tokens: 16882073600 | elapsed time per iteration (s): 0.60 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.834795E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.392 | TFLOPs: 40.56 | +7: iteration 32210/ 115203 | consumed samples: 8245760 | consumed tokens: 16887316480 | elapsed time per iteration (s): 0.57 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.842942E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.111 | TFLOPs: 42.63 | +7: iteration 32220/ 115203 | consumed samples: 8248320 | consumed tokens: 16892559360 | elapsed time per iteration (s): 0.58 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.846442E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.989 | TFLOPs: 42.42 | +7: iteration 32230/ 115203 | consumed samples: 8250880 | consumed tokens: 16897802240 | elapsed time per iteration (s): 0.57 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.834382E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.680 | TFLOPs: 42.59 | +7: iteration 32240/ 115203 | consumed samples: 8253440 | consumed tokens: 16903045120 | elapsed time per iteration (s): 0.57 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.820667E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.239 | TFLOPs: 42.93 | +7: iteration 32250/ 115203 | consumed samples: 8256000 | consumed tokens: 16908288000 | elapsed time per iteration (s): 0.56 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 2.837466E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.384 | TFLOPs: 43.51 | +7: iteration 32260/ 115203 | consumed samples: 8258560 | consumed tokens: 16913530880 | elapsed time per iteration (s): 0.57 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 2.827171E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.433 | TFLOPs: 42.75 | +7: iteration 32270/ 115203 | consumed samples: 8261120 | consumed tokens: 16918773760 | elapsed time per iteration (s): 0.56 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 2.829861E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.401 | TFLOPs: 43.42 | +7: iteration 32280/ 115203 | consumed samples: 8263680 | consumed tokens: 16924016640 | elapsed time per iteration (s): 0.57 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 2.842437E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.292 | TFLOPs: 42.55 | +7: iteration 32290/ 115203 | consumed samples: 8266240 | consumed tokens: 16929259520 | elapsed time per iteration (s): 0.57 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 2.834003E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.061 | TFLOPs: 42.53 | +7: iteration 32300/ 115203 | consumed samples: 8268800 | consumed tokens: 16934502400 | elapsed time per iteration (s): 0.56 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 2.814023E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.744 | TFLOPs: 43.74 | +7: iteration 32310/ 115203 | consumed samples: 8271360 | consumed tokens: 16939745280 | elapsed time per iteration (s): 0.57 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 2.832283E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.139 | TFLOPs: 42.92 | +7: iteration 32320/ 115203 | consumed samples: 8273920 | consumed tokens: 16944988160 | elapsed time per iteration (s): 0.58 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 2.839641E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.311 | TFLOPs: 42.26 | +7: iteration 32330/ 115203 | consumed samples: 8276480 | consumed tokens: 16950231040 | elapsed time per iteration (s): 0.57 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 2.821033E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.408 | TFLOPs: 42.94 | +7: iteration 32340/ 115203 | consumed samples: 8279040 | consumed tokens: 16955473920 | elapsed time per iteration (s): 0.57 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 2.836627E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.197 | TFLOPs: 42.54 | +7: iteration 32350/ 115203 | consumed samples: 8281600 | consumed tokens: 16960716800 | elapsed time per iteration (s): 0.57 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 2.849289E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.379 | TFLOPs: 42.46 | +7: iteration 32360/ 115203 | consumed samples: 8284160 | consumed tokens: 16965959680 | elapsed time per iteration (s): 0.56 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.828272E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.752 | TFLOPs: 43.26 | +7: iteration 32370/ 115203 | consumed samples: 8286720 | consumed tokens: 16971202560 | elapsed time per iteration (s): 0.57 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.821239E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.968 | TFLOPs: 42.71 | +7: iteration 32380/ 115203 | consumed samples: 8289280 | consumed tokens: 16976445440 | elapsed time per iteration (s): 0.58 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.838514E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.454 | TFLOPs: 42.28 | +7: iteration 32390/ 115203 | consumed samples: 8291840 | consumed tokens: 16981688320 | elapsed time per iteration (s): 0.56 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.829184E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.176 | TFLOPs: 43.40 | +7: iteration 32400/ 115203 | consumed samples: 8294400 | consumed tokens: 16986931200 | elapsed time per iteration (s): 0.57 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.838090E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.380 | TFLOPs: 42.46 | +7: iteration 32410/ 115203 | consumed samples: 8296960 | consumed tokens: 16992174080 | elapsed time per iteration (s): 0.57 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 2.830646E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.235 | TFLOPs: 42.83 | +7: iteration 32420/ 115203 | consumed samples: 8299520 | consumed tokens: 16997416960 | elapsed time per iteration (s): 0.57 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 2.849965E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.640 | TFLOPs: 43.06 | +7: iteration 32430/ 115203 | consumed samples: 8302080 | consumed tokens: 17002659840 | elapsed time per iteration (s): 0.59 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 2.836039E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.624 | TFLOPs: 41.63 | +7: iteration 32440/ 115203 | consumed samples: 8304640 | consumed tokens: 17007902720 | elapsed time per iteration (s): 0.56 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 2.827943E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.994 | TFLOPs: 43.57 | +7: iteration 32450/ 115203 | consumed samples: 8307200 | consumed tokens: 17013145600 | elapsed time per iteration (s): 0.57 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 2.832537E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.682 | TFLOPs: 42.78 | +7: iteration 32460/ 115203 | consumed samples: 8309760 | consumed tokens: 17018388480 | elapsed time per iteration (s): 0.57 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 2.837115E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.564 | TFLOPs: 43.15 | +7: iteration 32470/ 115203 | consumed samples: 8312320 | consumed tokens: 17023631360 | elapsed time per iteration (s): 0.57 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 2.831024E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.918 | TFLOPs: 42.99 | +7: iteration 32480/ 115203 | consumed samples: 8314880 | consumed tokens: 17028874240 | elapsed time per iteration (s): 0.58 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 2.831601E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.180 | TFLOPs: 42.44 | +7: iteration 32490/ 115203 | consumed samples: 8317440 | consumed tokens: 17034117120 | elapsed time per iteration (s): 0.57 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 2.824291E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.612 | TFLOPs: 42.77 | +7: iteration 32500/ 115203 | consumed samples: 8320000 | consumed tokens: 17039360000 | elapsed time per iteration (s): 0.57 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 2.831798E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.161 | TFLOPs: 42.82 | +7: iteration 32510/ 115203 | consumed samples: 8322560 | consumed tokens: 17044602880 | elapsed time per iteration (s): 0.56 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 2.822943E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.035 | TFLOPs: 43.38 | +7: iteration 32520/ 115203 | consumed samples: 8325120 | consumed tokens: 17049845760 | elapsed time per iteration (s): 0.59 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 2.835953E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.189 | TFLOPs: 41.49 | +7: iteration 32530/ 115203 | consumed samples: 8327680 | consumed tokens: 17055088640 | elapsed time per iteration (s): 0.56 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 2.821062E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.132 | TFLOPs: 43.30 | +7: iteration 32540/ 115203 | consumed samples: 8330240 | consumed tokens: 17060331520 | elapsed time per iteration (s): 0.57 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 2.835628E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.287 | TFLOPs: 42.45 | +7: iteration 32550/ 115203 | consumed samples: 8332800 | consumed tokens: 17065574400 | elapsed time per iteration (s): 0.57 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 2.824041E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.823 | TFLOPs: 42.98 | +7: iteration 32560/ 115203 | consumed samples: 8335360 | consumed tokens: 17070817280 | elapsed time per iteration (s): 0.58 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 2.842189E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.753 | TFLOPs: 41.93 | +7: iteration 32570/ 115203 | consumed samples: 8337920 | consumed tokens: 17076060160 | elapsed time per iteration (s): 0.57 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.840310E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.581 | TFLOPs: 42.96 | +7: iteration 32580/ 115203 | consumed samples: 8340480 | consumed tokens: 17081303040 | elapsed time per iteration (s): 0.57 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.831427E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.807 | TFLOPs: 42.60 | +7: iteration 32590/ 115203 | consumed samples: 8343040 | consumed tokens: 17086545920 | elapsed time per iteration (s): 0.57 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.841604E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.404 | TFLOPs: 42.75 | +7: iteration 32600/ 115203 | consumed samples: 8345600 | consumed tokens: 17091788800 | elapsed time per iteration (s): 0.56 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.826017E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.610 | TFLOPs: 43.25 | +7: iteration 32610/ 115203 | consumed samples: 8348160 | consumed tokens: 17097031680 | elapsed time per iteration (s): 0.58 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.821881E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.748 | TFLOPs: 41.83 | +7: iteration 32620/ 115203 | consumed samples: 8350720 | consumed tokens: 17102274560 | elapsed time per iteration (s): 0.58 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 2.840796E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.283 | TFLOPs: 42.26 | +7: iteration 32630/ 115203 | consumed samples: 8353280 | consumed tokens: 17107517440 | elapsed time per iteration (s): 0.56 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 2.833522E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.310 | TFLOPs: 43.50 | +7: iteration 32640/ 115203 | consumed samples: 8355840 | consumed tokens: 17112760320 | elapsed time per iteration (s): 0.56 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 2.832293E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.183 | TFLOPs: 43.30 | +7: iteration 32650/ 115203 | consumed samples: 8358400 | consumed tokens: 17118003200 | elapsed time per iteration (s): 0.58 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 2.821892E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.104 | TFLOPs: 42.15 | +7: iteration 32660/ 115203 | consumed samples: 8360960 | consumed tokens: 17123246080 | elapsed time per iteration (s): 0.58 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 2.835891E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.324 | TFLOPs: 42.17 | +7: iteration 32670/ 115203 | consumed samples: 8363520 | consumed tokens: 17128488960 | elapsed time per iteration (s): 0.56 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 2.832362E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.905 | TFLOPs: 43.66 | +7: iteration 32680/ 115203 | consumed samples: 8366080 | consumed tokens: 17133731840 | elapsed time per iteration (s): 0.57 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 2.839025E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.094 | TFLOPs: 42.63 | +7: iteration 32690/ 115203 | consumed samples: 8368640 | consumed tokens: 17138974720 | elapsed time per iteration (s): 0.59 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 2.841395E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.075 | TFLOPs: 41.29 | +7: iteration 32700/ 115203 | consumed samples: 8371200 | consumed tokens: 17144217600 | elapsed time per iteration (s): 0.57 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 2.830856E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.903 | TFLOPs: 42.51 | +7: iteration 32710/ 115203 | consumed samples: 8373760 | consumed tokens: 17149460480 | elapsed time per iteration (s): 0.57 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 2.835657E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.102 | TFLOPs: 42.72 | +7: iteration 32720/ 115203 | consumed samples: 8376320 | consumed tokens: 17154703360 | elapsed time per iteration (s): 0.57 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 2.826877E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.033 | TFLOPs: 43.10 | +7: iteration 32730/ 115203 | consumed samples: 8378880 | consumed tokens: 17159946240 | elapsed time per iteration (s): 0.57 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.840298E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.321 | TFLOPs: 42.65 | +7: iteration 32740/ 115203 | consumed samples: 8381440 | consumed tokens: 17165189120 | elapsed time per iteration (s): 0.57 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.845738E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.194 | TFLOPs: 42.83 | +7: iteration 32750/ 115203 | consumed samples: 8384000 | consumed tokens: 17170432000 | elapsed time per iteration (s): 0.57 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.836220E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.674 | TFLOPs: 42.49 | +7: iteration 32760/ 115203 | consumed samples: 8386560 | consumed tokens: 17175674880 | elapsed time per iteration (s): 0.58 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.821675E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.609 | TFLOPs: 41.91 | +7: iteration 32770/ 115203 | consumed samples: 8389120 | consumed tokens: 17180917760 | elapsed time per iteration (s): 0.57 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.841068E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.776 | TFLOPs: 42.79 | +7: iteration 32780/ 115203 | consumed samples: 8391680 | consumed tokens: 17186160640 | elapsed time per iteration (s): 0.56 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 2.830743E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.212 | TFLOPs: 43.69 | +7: iteration 32790/ 115203 | consumed samples: 8394240 | consumed tokens: 17191403520 | elapsed time per iteration (s): 0.56 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 2.813154E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.540 | TFLOPs: 43.62 | +7: iteration 32800/ 115203 | consumed samples: 8396800 | consumed tokens: 17196646400 | elapsed time per iteration (s): 0.57 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 2.843926E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.778 | TFLOPs: 42.79 | +7: iteration 32810/ 115203 | consumed samples: 8399360 | consumed tokens: 17201889280 | elapsed time per iteration (s): 0.58 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 2.830125E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.876 | TFLOPs: 42.32 | +7: iteration 32820/ 115203 | consumed samples: 8401920 | consumed tokens: 17207132160 | elapsed time per iteration (s): 0.57 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 2.818535E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.122 | TFLOPs: 42.72 | +7: iteration 32830/ 115203 | consumed samples: 8404480 | consumed tokens: 17212375040 | elapsed time per iteration (s): 0.57 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 2.846488E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.369 | TFLOPs: 42.65 | +7: iteration 32840/ 115203 | consumed samples: 8407040 | consumed tokens: 17217617920 | elapsed time per iteration (s): 0.57 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 2.838489E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.870 | TFLOPs: 42.60 | +7: iteration 32850/ 115203 | consumed samples: 8409600 | consumed tokens: 17222860800 | elapsed time per iteration (s): 0.57 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 2.845350E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.243 | TFLOPs: 42.45 | +7: iteration 32860/ 115203 | consumed samples: 8412160 | consumed tokens: 17228103680 | elapsed time per iteration (s): 0.56 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 2.839814E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.694 | TFLOPs: 43.64 | +7: iteration 32870/ 115203 | consumed samples: 8414720 | consumed tokens: 17233346560 | elapsed time per iteration (s): 0.58 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 2.831754E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.818 | TFLOPs: 42.03 | +7: iteration 32880/ 115203 | consumed samples: 8417280 | consumed tokens: 17238589440 | elapsed time per iteration (s): 0.56 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 2.844125E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.275 | TFLOPs: 43.21 | +7: iteration 32890/ 115203 | consumed samples: 8419840 | consumed tokens: 17243832320 | elapsed time per iteration (s): 0.59 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 2.822518E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.286 | TFLOPs: 41.69 | +7: iteration 32900/ 115203 | consumed samples: 8422400 | consumed tokens: 17249075200 | elapsed time per iteration (s): 0.58 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 2.829001E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.659 | TFLOPs: 42.20 | +7: iteration 32910/ 115203 | consumed samples: 8424960 | consumed tokens: 17254318080 | elapsed time per iteration (s): 0.58 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 2.832269E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.083 | TFLOPs: 42.43 | +7: iteration 32920/ 115203 | consumed samples: 8427520 | consumed tokens: 17259560960 | elapsed time per iteration (s): 0.56 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 2.816673E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.533 | TFLOPs: 43.53 | +7: iteration 32930/ 115203 | consumed samples: 8430080 | consumed tokens: 17264803840 | elapsed time per iteration (s): 0.60 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 2.831415E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.489 | TFLOPs: 40.85 | +7: iteration 32940/ 115203 | consumed samples: 8432640 | consumed tokens: 17270046720 | elapsed time per iteration (s): 0.57 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.808409E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.964 | TFLOPs: 42.61 | +7: iteration 32950/ 115203 | consumed samples: 8435200 | consumed tokens: 17275289600 | elapsed time per iteration (s): 0.57 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.811824E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.673 | TFLOPs: 42.68 | +7: iteration 32960/ 115203 | consumed samples: 8437760 | consumed tokens: 17280532480 | elapsed time per iteration (s): 0.59 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.830282E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.665 | TFLOPs: 41.63 | +7: iteration 32970/ 115203 | consumed samples: 8440320 | consumed tokens: 17285775360 | elapsed time per iteration (s): 0.56 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.841074E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.334 | TFLOPs: 43.60 | +7: iteration 32980/ 115203 | consumed samples: 8442880 | consumed tokens: 17291018240 | elapsed time per iteration (s): 0.57 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.821484E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.042 | TFLOPs: 43.19 | +7: iteration 32990/ 115203 | consumed samples: 8445440 | consumed tokens: 17296261120 | elapsed time per iteration (s): 0.56 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 2.843042E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.947 | TFLOPs: 43.37 | +7: iteration 33000/ 115203 | consumed samples: 8448000 | consumed tokens: 17301504000 | elapsed time per iteration (s): 0.56 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 2.839167E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.723 | TFLOPs: 43.54 | +7: iteration 33010/ 115203 | consumed samples: 8450560 | consumed tokens: 17306746880 | elapsed time per iteration (s): 0.59 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 2.836428E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.883 | TFLOPs: 41.46 | +7: iteration 33020/ 115203 | consumed samples: 8453120 | consumed tokens: 17311989760 | elapsed time per iteration (s): 0.57 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 2.833873E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.335 | TFLOPs: 42.84 | +7: iteration 33030/ 115203 | consumed samples: 8455680 | consumed tokens: 17317232640 | elapsed time per iteration (s): 0.58 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 2.817497E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.267 | TFLOPs: 42.17 | +7: iteration 33040/ 115203 | consumed samples: 8458240 | consumed tokens: 17322475520 | elapsed time per iteration (s): 0.56 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 2.822235E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.696 | TFLOPs: 43.73 | +7: iteration 33050/ 115203 | consumed samples: 8460800 | consumed tokens: 17327718400 | elapsed time per iteration (s): 0.57 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 2.818081E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.890 | TFLOPs: 42.80 | +7: iteration 33060/ 115203 | consumed samples: 8463360 | consumed tokens: 17332961280 | elapsed time per iteration (s): 0.57 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 2.829754E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.833 | TFLOPs: 42.79 | +7: iteration 33070/ 115203 | consumed samples: 8465920 | consumed tokens: 17338204160 | elapsed time per iteration (s): 0.56 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 2.827644E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.844 | TFLOPs: 43.65 | +7: iteration 33080/ 115203 | consumed samples: 8468480 | consumed tokens: 17343447040 | elapsed time per iteration (s): 0.59 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 2.818886E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.420 | TFLOPs: 41.70 | +7: iteration 33090/ 115203 | consumed samples: 8471040 | consumed tokens: 17348689920 | elapsed time per iteration (s): 0.57 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 2.822394E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.472 | TFLOPs: 43.04 | +7: iteration 33100/ 115203 | consumed samples: 8473600 | consumed tokens: 17353932800 | elapsed time per iteration (s): 0.56 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 2.835662E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.711 | TFLOPs: 43.45 | +7: iteration 33110/ 115203 | consumed samples: 8476160 | consumed tokens: 17359175680 | elapsed time per iteration (s): 0.57 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 2.836813E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.295 | TFLOPs: 43.03 | +7: iteration 33120/ 115203 | consumed samples: 8478720 | consumed tokens: 17364418560 | elapsed time per iteration (s): 0.56 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 2.828981E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.833 | TFLOPs: 43.84 | +7: iteration 33130/ 115203 | consumed samples: 8481280 | consumed tokens: 17369661440 | elapsed time per iteration (s): 0.57 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 2.813236E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.270 | TFLOPs: 43.02 | +7: iteration 33140/ 115203 | consumed samples: 8483840 | consumed tokens: 17374904320 | elapsed time per iteration (s): 0.57 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 2.837921E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.908 | TFLOPs: 42.89 | +7: iteration 33150/ 115203 | consumed samples: 8486400 | consumed tokens: 17380147200 | elapsed time per iteration (s): 0.56 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.827798E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.587 | TFLOPs: 43.24 | +7: iteration 33160/ 115203 | consumed samples: 8488960 | consumed tokens: 17385390080 | elapsed time per iteration (s): 0.58 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.833155E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.749 | TFLOPs: 42.40 | +7: iteration 33170/ 115203 | consumed samples: 8491520 | consumed tokens: 17390632960 | elapsed time per iteration (s): 0.58 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.818546E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.203 | TFLOPs: 42.45 | +7: iteration 33180/ 115203 | consumed samples: 8494080 | consumed tokens: 17395875840 | elapsed time per iteration (s): 0.57 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.826113E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.326 | TFLOPs: 43.12 | +7: iteration 33190/ 115203 | consumed samples: 8496640 | consumed tokens: 17401118720 | elapsed time per iteration (s): 0.59 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.838852E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.377 | TFLOPs: 41.13 | +7: iteration 33200/ 115203 | consumed samples: 8499200 | consumed tokens: 17406361600 | elapsed time per iteration (s): 0.56 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 2.830382E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.658 | TFLOPs: 43.44 | +7: iteration 33210/ 115203 | consumed samples: 8501760 | consumed tokens: 17411604480 | elapsed time per iteration (s): 0.56 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 2.813976E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.932 | TFLOPs: 43.37 | +7: iteration 33220/ 115203 | consumed samples: 8504320 | consumed tokens: 17416847360 | elapsed time per iteration (s): 0.56 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 2.836285E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.230 | TFLOPs: 43.31 | +7: iteration 33230/ 115203 | consumed samples: 8506880 | consumed tokens: 17422090240 | elapsed time per iteration (s): 0.56 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 2.835906E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.818 | TFLOPs: 43.46 | +7: iteration 33240/ 115203 | consumed samples: 8509440 | consumed tokens: 17427333120 | elapsed time per iteration (s): 0.55 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 2.834110E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.400 | TFLOPs: 43.99 | +7: iteration 33250/ 115203 | consumed samples: 8512000 | consumed tokens: 17432576000 | elapsed time per iteration (s): 0.59 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 2.829902E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.998 | TFLOPs: 41.19 | +7: iteration 33260/ 115203 | consumed samples: 8514560 | consumed tokens: 17437818880 | elapsed time per iteration (s): 0.57 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 2.831904E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.620 | TFLOPs: 42.68 | +7: iteration 33270/ 115203 | consumed samples: 8517120 | consumed tokens: 17443061760 | elapsed time per iteration (s): 0.56 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 2.820776E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.731 | TFLOPs: 43.54 | +7: iteration 33280/ 115203 | consumed samples: 8519680 | consumed tokens: 17448304640 | elapsed time per iteration (s): 0.56 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 2.816238E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.845 | TFLOPs: 43.46 | +7: iteration 33290/ 115203 | consumed samples: 8522240 | consumed tokens: 17453547520 | elapsed time per iteration (s): 0.57 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 2.837594E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.425 | TFLOPs: 43.04 | +7: iteration 33300/ 115203 | consumed samples: 8524800 | consumed tokens: 17458790400 | elapsed time per iteration (s): 0.56 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 2.832052E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.764 | TFLOPs: 43.83 | +7: iteration 33310/ 115203 | consumed samples: 8527360 | consumed tokens: 17464033280 | elapsed time per iteration (s): 0.56 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 2.827997E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.160 | TFLOPs: 43.20 | +7: iteration 33320/ 115203 | consumed samples: 8529920 | consumed tokens: 17469276160 | elapsed time per iteration (s): 0.56 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 2.828560E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.743 | TFLOPs: 43.55 | +7: iteration 33330/ 115203 | consumed samples: 8532480 | consumed tokens: 17474519040 | elapsed time per iteration (s): 0.56 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 2.816554E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.858 | TFLOPs: 43.94 | +7: iteration 33340/ 115203 | consumed samples: 8535040 | consumed tokens: 17479761920 | elapsed time per iteration (s): 0.56 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 2.830709E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.912 | TFLOPs: 43.28 | +7: iteration 33350/ 115203 | consumed samples: 8537600 | consumed tokens: 17485004800 | elapsed time per iteration (s): 0.57 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 2.827911E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.842 | TFLOPs: 42.79 | +7: iteration 33360/ 115203 | consumed samples: 8540160 | consumed tokens: 17490247680 | elapsed time per iteration (s): 0.57 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 2.823699E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.545 | TFLOPs: 43.15 | +7: iteration 33370/ 115203 | consumed samples: 8542720 | consumed tokens: 17495490560 | elapsed time per iteration (s): 0.56 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 2.835766E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.126 | TFLOPs: 43.20 | +7: iteration 33380/ 115203 | consumed samples: 8545280 | consumed tokens: 17500733440 | elapsed time per iteration (s): 0.57 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 2.823064E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.303 | TFLOPs: 42.55 | +7: iteration 33390/ 115203 | consumed samples: 8547840 | consumed tokens: 17505976320 | elapsed time per iteration (s): 0.56 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 2.841217E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.003 | TFLOPs: 43.95 | +7: iteration 33400/ 115203 | consumed samples: 8550400 | consumed tokens: 17511219200 | elapsed time per iteration (s): 0.56 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 2.832969E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.159 | TFLOPs: 43.78 | +7: iteration 33410/ 115203 | consumed samples: 8552960 | consumed tokens: 17516462080 | elapsed time per iteration (s): 0.56 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.821226E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.918 | TFLOPs: 43.56 | +7: iteration 33420/ 115203 | consumed samples: 8555520 | consumed tokens: 17521704960 | elapsed time per iteration (s): 0.56 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.835761E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.703 | TFLOPs: 43.64 | +7: iteration 33430/ 115203 | consumed samples: 8558080 | consumed tokens: 17526947840 | elapsed time per iteration (s): 0.56 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.834538E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.613 | TFLOPs: 43.72 | +7: iteration 33440/ 115203 | consumed samples: 8560640 | consumed tokens: 17532190720 | elapsed time per iteration (s): 0.58 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.831283E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.601 | TFLOPs: 42.01 | +7: iteration 33450/ 115203 | consumed samples: 8563200 | consumed tokens: 17537433600 | elapsed time per iteration (s): 0.55 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.829882E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.774 | TFLOPs: 44.03 | +7: iteration 33460/ 115203 | consumed samples: 8565760 | consumed tokens: 17542676480 | elapsed time per iteration (s): 0.56 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 2.821516E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.749 | TFLOPs: 43.55 | +7: iteration 33470/ 115203 | consumed samples: 8568320 | consumed tokens: 17547919360 | elapsed time per iteration (s): 0.56 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 2.839254E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.014 | TFLOPs: 43.29 | +7: iteration 33480/ 115203 | consumed samples: 8570880 | consumed tokens: 17553162240 | elapsed time per iteration (s): 0.56 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 2.839747E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.242 | TFLOPs: 43.88 | +7: iteration 33490/ 115203 | consumed samples: 8573440 | consumed tokens: 17558405120 | elapsed time per iteration (s): 0.56 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 2.825735E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.708 | TFLOPs: 43.35 | +7: iteration 33500/ 115203 | consumed samples: 8576000 | consumed tokens: 17563648000 | elapsed time per iteration (s): 0.56 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 2.827856E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.994 | TFLOPs: 43.57 | +7: iteration 33510/ 115203 | consumed samples: 8578560 | consumed tokens: 17568890880 | elapsed time per iteration (s): 0.56 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 2.836847E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.917 | TFLOPs: 43.56 | +7: iteration 33520/ 115203 | consumed samples: 8581120 | consumed tokens: 17574133760 | elapsed time per iteration (s): 0.56 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 2.835848E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.174 | TFLOPs: 43.40 | +7: iteration 33530/ 115203 | consumed samples: 8583680 | consumed tokens: 17579376640 | elapsed time per iteration (s): 0.57 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 2.832257E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.931 | TFLOPs: 43.09 | +7: iteration 33540/ 115203 | consumed samples: 8586240 | consumed tokens: 17584619520 | elapsed time per iteration (s): 0.56 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 2.836767E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.280 | TFLOPs: 43.69 | +7: iteration 33550/ 115203 | consumed samples: 8588800 | consumed tokens: 17589862400 | elapsed time per iteration (s): 0.56 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 2.826145E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.675 | TFLOPs: 43.35 | +7: iteration 33560/ 115203 | consumed samples: 8591360 | consumed tokens: 17595105280 | elapsed time per iteration (s): 0.56 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 2.811454E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.208 | TFLOPs: 43.97 | +7: iteration 33570/ 115203 | consumed samples: 8593920 | consumed tokens: 17600348160 | elapsed time per iteration (s): 0.60 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 2.840509E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.146 | TFLOPs: 40.34 | +7: iteration 33580/ 115203 | consumed samples: 8596480 | consumed tokens: 17605591040 | elapsed time per iteration (s): 0.56 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 2.815713E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.530 | TFLOPs: 43.72 | +7: iteration 33590/ 115203 | consumed samples: 8599040 | consumed tokens: 17610833920 | elapsed time per iteration (s): 0.58 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 2.830816E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.328 | TFLOPs: 41.89 | +7: iteration 33600/ 115203 | consumed samples: 8601600 | consumed tokens: 17616076800 | elapsed time per iteration (s): 0.57 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 2.821625E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.060 | TFLOPs: 42.81 | +7: iteration 33610/ 115203 | consumed samples: 8604160 | consumed tokens: 17621319680 | elapsed time per iteration (s): 0.56 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 2.824988E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.725 | TFLOPs: 43.73 | +7: iteration 33620/ 115203 | consumed samples: 8606720 | consumed tokens: 17626562560 | elapsed time per iteration (s): 0.56 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 2.818829E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.226 | TFLOPs: 43.31 | +7: iteration 33630/ 115203 | consumed samples: 8609280 | consumed tokens: 17631805440 | elapsed time per iteration (s): 0.56 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 2.825466E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.258 | TFLOPs: 43.21 | +7: iteration 33640/ 115203 | consumed samples: 8611840 | consumed tokens: 17637048320 | elapsed time per iteration (s): 0.57 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 2.805527E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.654 | TFLOPs: 42.58 | +7: iteration 33650/ 115203 | consumed samples: 8614400 | consumed tokens: 17642291200 | elapsed time per iteration (s): 0.55 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 2.829918E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.751 | TFLOPs: 44.02 | +7: iteration 33660/ 115203 | consumed samples: 8616960 | consumed tokens: 17647534080 | elapsed time per iteration (s): 0.56 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 2.825605E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.337 | TFLOPs: 43.60 | +7: iteration 33670/ 115203 | consumed samples: 8619520 | consumed tokens: 17652776960 | elapsed time per iteration (s): 0.57 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 2.827737E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.987 | TFLOPs: 42.90 | +7: iteration 33680/ 115203 | consumed samples: 8622080 | consumed tokens: 17658019840 | elapsed time per iteration (s): 0.56 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 2.824894E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.538 | TFLOPs: 43.72 | +7: iteration 33690/ 115203 | consumed samples: 8624640 | consumed tokens: 17663262720 | elapsed time per iteration (s): 0.56 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 2.835866E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.696 | TFLOPs: 43.73 | +7: iteration 33700/ 115203 | consumed samples: 8627200 | consumed tokens: 17668505600 | elapsed time per iteration (s): 0.56 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 2.825387E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.829 | TFLOPs: 43.27 | +7: iteration 33710/ 115203 | consumed samples: 8629760 | consumed tokens: 17673748480 | elapsed time per iteration (s): 0.56 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 2.814434E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.108 | TFLOPs: 43.58 | +7: iteration 33720/ 115203 | consumed samples: 8632320 | consumed tokens: 17678991360 | elapsed time per iteration (s): 0.56 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.838181E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.051 | TFLOPs: 43.57 | +7: iteration 33730/ 115203 | consumed samples: 8634880 | consumed tokens: 17684234240 | elapsed time per iteration (s): 0.56 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.828546E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.334 | TFLOPs: 43.32 | +7: iteration 33740/ 115203 | consumed samples: 8637440 | consumed tokens: 17689477120 | elapsed time per iteration (s): 0.56 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.822686E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.980 | TFLOPs: 43.57 | +7: iteration 33750/ 115203 | consumed samples: 8640000 | consumed tokens: 17694720000 | elapsed time per iteration (s): 0.57 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.829370E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.669 | TFLOPs: 42.68 | +7: iteration 33760/ 115203 | consumed samples: 8642560 | consumed tokens: 17699962880 | elapsed time per iteration (s): 0.57 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.825497E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.657 | TFLOPs: 42.87 | +7: iteration 33770/ 115203 | consumed samples: 8645120 | consumed tokens: 17705205760 | elapsed time per iteration (s): 0.56 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 2.819587E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.553 | TFLOPs: 43.53 | +7: iteration 33780/ 115203 | consumed samples: 8647680 | consumed tokens: 17710448640 | elapsed time per iteration (s): 0.56 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 2.830645E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.461 | TFLOPs: 43.42 | +7: iteration 33790/ 115203 | consumed samples: 8650240 | consumed tokens: 17715691520 | elapsed time per iteration (s): 0.55 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 2.831000E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.503 | TFLOPs: 44.00 | +7: iteration 33800/ 115203 | consumed samples: 8652800 | consumed tokens: 17720934400 | elapsed time per iteration (s): 0.59 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 2.821267E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.186 | TFLOPs: 41.20 | +7: iteration 33810/ 115203 | consumed samples: 8655360 | consumed tokens: 17726177280 | elapsed time per iteration (s): 0.58 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 2.833685E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.799 | TFLOPs: 42.41 | +7: iteration 33820/ 115203 | consumed samples: 8657920 | consumed tokens: 17731420160 | elapsed time per iteration (s): 0.57 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 2.834286E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.573 | TFLOPs: 43.15 | +7: iteration 33830/ 115203 | consumed samples: 8660480 | consumed tokens: 17736663040 | elapsed time per iteration (s): 0.56 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 2.831213E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.409 | TFLOPs: 43.70 | +7: iteration 33840/ 115203 | consumed samples: 8663040 | consumed tokens: 17741905920 | elapsed time per iteration (s): 0.57 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 2.839438E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.731 | TFLOPs: 42.97 | +7: iteration 33850/ 115203 | consumed samples: 8665600 | consumed tokens: 17747148800 | elapsed time per iteration (s): 0.57 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 2.822897E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.285 | TFLOPs: 42.55 | +7: iteration 33860/ 115203 | consumed samples: 8668160 | consumed tokens: 17752391680 | elapsed time per iteration (s): 0.57 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 2.825563E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.772 | TFLOPs: 43.07 | +7: iteration 33870/ 115203 | consumed samples: 8670720 | consumed tokens: 17757634560 | elapsed time per iteration (s): 0.56 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 2.812811E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.849 | TFLOPs: 43.75 | +7: iteration 33880/ 115203 | consumed samples: 8673280 | consumed tokens: 17762877440 | elapsed time per iteration (s): 0.56 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 2.823897E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.391 | TFLOPs: 43.70 | +7: iteration 33890/ 115203 | consumed samples: 8675840 | consumed tokens: 17768120320 | elapsed time per iteration (s): 0.56 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 2.831844E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.361 | TFLOPs: 43.41 | +7: iteration 33900/ 115203 | consumed samples: 8678400 | consumed tokens: 17773363200 | elapsed time per iteration (s): 0.56 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 2.817773E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.531 | TFLOPs: 43.72 | +7: iteration 33910/ 115203 | consumed samples: 8680960 | consumed tokens: 17778606080 | elapsed time per iteration (s): 0.56 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 2.829004E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +7: iteration 33920/ 115203 | consumed samples: 8683520 | consumed tokens: 17783848960 | elapsed time per iteration (s): 0.56 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 2.820816E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.273 | TFLOPs: 43.69 | +7: iteration 33930/ 115203 | consumed samples: 8686080 | consumed tokens: 17789091840 | elapsed time per iteration (s): 0.56 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 2.823799E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.814 | TFLOPs: 43.46 | +7: iteration 33940/ 115203 | consumed samples: 8688640 | consumed tokens: 17794334720 | elapsed time per iteration (s): 0.56 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 2.824379E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.668 | TFLOPs: 43.25 | +7: iteration 33950/ 115203 | consumed samples: 8691200 | consumed tokens: 17799577600 | elapsed time per iteration (s): 0.55 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 2.828840E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.349 | TFLOPs: 43.98 | +7: iteration 33960/ 115203 | consumed samples: 8693760 | consumed tokens: 17804820480 | elapsed time per iteration (s): 0.56 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 2.811707E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.400 | TFLOPs: 43.61 | +7: iteration 33970/ 115203 | consumed samples: 8696320 | consumed tokens: 17810063360 | elapsed time per iteration (s): 0.56 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 2.829411E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.761 | TFLOPs: 43.55 | +7: iteration 33980/ 115203 | consumed samples: 8698880 | consumed tokens: 17815306240 | elapsed time per iteration (s): 0.56 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 2.822208E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.531 | TFLOPs: 43.72 | +7: iteration 33990/ 115203 | consumed samples: 8701440 | consumed tokens: 17820549120 | elapsed time per iteration (s): 0.56 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 2.835206E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.397 | TFLOPs: 43.70 | +0: [2023-03-16 18:10:14,897] [INFO] [logging.py:68:log_dist] [Rank 0] step=34000, skipped=0, lr=[0.00016560534437138965, 0.00016560534437138965, 0.00016560534437138965], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 34000/ 115203 | consumed samples: 8704000 | consumed tokens: 17825792000 | elapsed time per iteration (s): 0.56 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 2.831250E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.016 | TFLOPs: 43.48 | +0: steps: 34000 loss: 2.8181 iter time (s): 0.566 samples/sec: 452.025 +7: iteration 34010/ 115203 | consumed samples: 8706560 | consumed tokens: 17831034880 | elapsed time per iteration (s): 0.57 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 2.823650E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.455 | TFLOPs: 43.14 | +7: iteration 34020/ 115203 | consumed samples: 8709120 | consumed tokens: 17836277760 | elapsed time per iteration (s): 0.56 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 2.821055E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.644 | TFLOPs: 43.54 | +7: iteration 34030/ 115203 | consumed samples: 8711680 | consumed tokens: 17841520640 | elapsed time per iteration (s): 0.58 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 2.820722E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.597 | TFLOPs: 42.39 | +7: iteration 34040/ 115203 | consumed samples: 8714240 | consumed tokens: 17846763520 | elapsed time per iteration (s): 0.57 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 2.812900E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.776 | TFLOPs: 42.98 | +7: iteration 34050/ 115203 | consumed samples: 8716800 | consumed tokens: 17852006400 | elapsed time per iteration (s): 0.56 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 2.828919E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.267 | TFLOPs: 43.31 | +7: iteration 34060/ 115203 | consumed samples: 8719360 | consumed tokens: 17857249280 | elapsed time per iteration (s): 0.57 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 2.819794E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.089 | TFLOPs: 42.91 | +7: iteration 34070/ 115203 | consumed samples: 8721920 | consumed tokens: 17862492160 | elapsed time per iteration (s): 0.57 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 2.828496E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.870 | TFLOPs: 42.89 | +7: iteration 34080/ 115203 | consumed samples: 8724480 | consumed tokens: 17867735040 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.826059E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.627 | TFLOPs: 43.53 | +7: iteration 34090/ 115203 | consumed samples: 8727040 | consumed tokens: 17872977920 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.823233E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.113 | TFLOPs: 43.49 | +7: iteration 34100/ 115203 | consumed samples: 8729600 | consumed tokens: 17878220800 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.815917E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.879 | TFLOPs: 43.37 | +7: iteration 34110/ 115203 | consumed samples: 8732160 | consumed tokens: 17883463680 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.826956E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.205 | TFLOPs: 43.97 | +7: iteration 34120/ 115203 | consumed samples: 8734720 | consumed tokens: 17888706560 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.833066E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.455 | TFLOPs: 43.71 | +7: iteration 34130/ 115203 | consumed samples: 8737280 | consumed tokens: 17893949440 | elapsed time per iteration (s): 0.56 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 2.827174E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.551 | TFLOPs: 43.62 | +7: iteration 34140/ 115203 | consumed samples: 8739840 | consumed tokens: 17899192320 | elapsed time per iteration (s): 0.56 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 2.818805E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.066 | TFLOPs: 43.29 | +7: iteration 34150/ 115203 | consumed samples: 8742400 | consumed tokens: 17904435200 | elapsed time per iteration (s): 0.57 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 2.821362E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.189 | TFLOPs: 42.73 | +7: iteration 34160/ 115203 | consumed samples: 8744960 | consumed tokens: 17909678080 | elapsed time per iteration (s): 0.56 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 2.823282E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.444 | TFLOPs: 43.71 | +7: iteration 34170/ 115203 | consumed samples: 8747520 | consumed tokens: 17914920960 | elapsed time per iteration (s): 0.58 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 2.833968E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.821 | TFLOPs: 42.41 | +7: iteration 34180/ 115203 | consumed samples: 8750080 | consumed tokens: 17920163840 | elapsed time per iteration (s): 0.56 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 2.830002E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.980 | TFLOPs: 43.95 | +7: iteration 34190/ 115203 | consumed samples: 8752640 | consumed tokens: 17925406720 | elapsed time per iteration (s): 0.56 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 2.831764E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.380 | TFLOPs: 43.42 | +7: iteration 34200/ 115203 | consumed samples: 8755200 | consumed tokens: 17930649600 | elapsed time per iteration (s): 0.55 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 2.820949E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.746 | TFLOPs: 44.02 | +7: iteration 34210/ 115203 | consumed samples: 8757760 | consumed tokens: 17935892480 | elapsed time per iteration (s): 0.56 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 2.822501E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.742 | TFLOPs: 43.45 | +7: iteration 34220/ 115203 | consumed samples: 8760320 | consumed tokens: 17941135360 | elapsed time per iteration (s): 0.57 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 2.829037E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.852 | TFLOPs: 43.08 | +7: iteration 34230/ 115203 | consumed samples: 8762880 | consumed tokens: 17946378240 | elapsed time per iteration (s): 0.56 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 2.818164E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.548 | TFLOPs: 43.43 | +7: iteration 34240/ 115203 | consumed samples: 8765440 | consumed tokens: 17951621120 | elapsed time per iteration (s): 0.56 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 2.824801E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.887 | TFLOPs: 43.27 | +7: iteration 34250/ 115203 | consumed samples: 8768000 | consumed tokens: 17956864000 | elapsed time per iteration (s): 0.56 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 2.826710E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.494 | TFLOPs: 43.90 | +7: iteration 34260/ 115203 | consumed samples: 8770560 | consumed tokens: 17962106880 | elapsed time per iteration (s): 0.56 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 2.821531E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.122 | TFLOPs: 43.30 | +7: iteration 34270/ 115203 | consumed samples: 8773120 | consumed tokens: 17967349760 | elapsed time per iteration (s): 0.56 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 2.821175E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.109 | TFLOPs: 43.68 | +7: iteration 34280/ 115203 | consumed samples: 8775680 | consumed tokens: 17972592640 | elapsed time per iteration (s): 0.56 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 2.827850E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.397 | TFLOPs: 43.51 | +7: iteration 34290/ 115203 | consumed samples: 8778240 | consumed tokens: 17977835520 | elapsed time per iteration (s): 0.56 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 2.834363E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.029 | TFLOPs: 43.76 | +7: iteration 34300/ 115203 | consumed samples: 8780800 | consumed tokens: 17983078400 | elapsed time per iteration (s): 0.56 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 2.826258E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.652 | TFLOPs: 43.73 | +7: iteration 34310/ 115203 | consumed samples: 8783360 | consumed tokens: 17988321280 | elapsed time per iteration (s): 0.56 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 2.818282E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.215 | TFLOPs: 43.59 | +7: iteration 34320/ 115203 | consumed samples: 8785920 | consumed tokens: 17993564160 | elapsed time per iteration (s): 0.56 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 2.811821E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.608 | TFLOPs: 43.44 | +7: iteration 34330/ 115203 | consumed samples: 8788480 | consumed tokens: 17998807040 | elapsed time per iteration (s): 0.56 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 2.810027E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.310 | TFLOPs: 43.31 | +7: iteration 34340/ 115203 | consumed samples: 8791040 | consumed tokens: 18004049920 | elapsed time per iteration (s): 0.56 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 2.815292E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.104 | TFLOPs: 43.39 | +7: iteration 34350/ 115203 | consumed samples: 8793600 | consumed tokens: 18009292800 | elapsed time per iteration (s): 0.56 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 2.826252E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.043 | TFLOPs: 43.86 | +7: iteration 34360/ 115203 | consumed samples: 8796160 | consumed tokens: 18014535680 | elapsed time per iteration (s): 0.56 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 2.820011E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.602 | TFLOPs: 43.44 | +7: iteration 34370/ 115203 | consumed samples: 8798720 | consumed tokens: 18019778560 | elapsed time per iteration (s): 0.56 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 2.839100E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.279 | TFLOPs: 43.69 | +7: iteration 34380/ 115203 | consumed samples: 8801280 | consumed tokens: 18025021440 | elapsed time per iteration (s): 0.55 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 2.827147E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.469 | TFLOPs: 44.00 | +7: iteration 34390/ 115203 | consumed samples: 8803840 | consumed tokens: 18030264320 | elapsed time per iteration (s): 0.56 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 2.814577E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.227 | TFLOPs: 43.69 | +7: iteration 34400/ 115203 | consumed samples: 8806400 | consumed tokens: 18035507200 | elapsed time per iteration (s): 0.56 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 2.824615E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.744 | TFLOPs: 43.26 | +7: iteration 34410/ 115203 | consumed samples: 8808960 | consumed tokens: 18040750080 | elapsed time per iteration (s): 0.55 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 2.828950E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.449 | TFLOPs: 43.99 | +7: iteration 34420/ 115203 | consumed samples: 8811520 | consumed tokens: 18045992960 | elapsed time per iteration (s): 0.56 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 2.831351E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.933 | TFLOPs: 43.47 | +7: iteration 34430/ 115203 | consumed samples: 8814080 | consumed tokens: 18051235840 | elapsed time per iteration (s): 0.60 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 2.820789E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.114 | TFLOPs: 40.72 | +7: iteration 34440/ 115203 | consumed samples: 8816640 | consumed tokens: 18056478720 | elapsed time per iteration (s): 0.58 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 2.828723E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.018 | TFLOPs: 42.05 | +7: iteration 34450/ 115203 | consumed samples: 8819200 | consumed tokens: 18061721600 | elapsed time per iteration (s): 0.58 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 2.829128E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.552 | TFLOPs: 42.00 | +7: iteration 34460/ 115203 | consumed samples: 8821760 | consumed tokens: 18066964480 | elapsed time per iteration (s): 0.56 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 2.815970E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.460 | TFLOPs: 43.71 | +7: iteration 34470/ 115203 | consumed samples: 8824320 | consumed tokens: 18072207360 | elapsed time per iteration (s): 0.56 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 2.818511E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.967 | TFLOPs: 43.85 | +7: iteration 34480/ 115203 | consumed samples: 8826880 | consumed tokens: 18077450240 | elapsed time per iteration (s): 0.57 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 2.818127E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.946 | TFLOPs: 42.99 | +7: iteration 34490/ 115203 | consumed samples: 8829440 | consumed tokens: 18082693120 | elapsed time per iteration (s): 0.56 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 2.821228E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.449 | TFLOPs: 43.61 | +7: iteration 34500/ 115203 | consumed samples: 8832000 | consumed tokens: 18087936000 | elapsed time per iteration (s): 0.57 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 2.824819E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.668 | TFLOPs: 43.16 | +7: iteration 34510/ 115203 | consumed samples: 8834560 | consumed tokens: 18093178880 | elapsed time per iteration (s): 0.56 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 2.819178E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.195 | TFLOPs: 43.59 | +7: iteration 34520/ 115203 | consumed samples: 8837120 | consumed tokens: 18098421760 | elapsed time per iteration (s): 0.56 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 2.823951E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.936 | TFLOPs: 43.56 | +7: iteration 34530/ 115203 | consumed samples: 8839680 | consumed tokens: 18103664640 | elapsed time per iteration (s): 0.56 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 2.817080E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.385 | TFLOPs: 43.51 | +7: iteration 34540/ 115203 | consumed samples: 8842240 | consumed tokens: 18108907520 | elapsed time per iteration (s): 0.56 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 2.841947E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.079 | TFLOPs: 43.86 | +7: iteration 34550/ 115203 | consumed samples: 8844800 | consumed tokens: 18114150400 | elapsed time per iteration (s): 0.55 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 2.822140E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.530 | TFLOPs: 44.00 | +7: iteration 34560/ 115203 | consumed samples: 8847360 | consumed tokens: 18119393280 | elapsed time per iteration (s): 0.56 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 2.834565E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.413 | TFLOPs: 43.61 | +7: iteration 34570/ 115203 | consumed samples: 8849920 | consumed tokens: 18124636160 | elapsed time per iteration (s): 0.56 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 2.812344E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.066 | TFLOPs: 43.67 | +7: iteration 34580/ 115203 | consumed samples: 8852480 | consumed tokens: 18129879040 | elapsed time per iteration (s): 0.55 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 2.809956E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.443 | TFLOPs: 43.99 | +7: iteration 34590/ 115203 | consumed samples: 8855040 | consumed tokens: 18135121920 | elapsed time per iteration (s): 0.55 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 2.824864E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.363 | TFLOPs: 43.99 | +7: iteration 34600/ 115203 | consumed samples: 8857600 | consumed tokens: 18140364800 | elapsed time per iteration (s): 0.56 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 2.808070E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.859 | TFLOPs: 43.56 | +7: iteration 34610/ 115203 | consumed samples: 8860160 | consumed tokens: 18145607680 | elapsed time per iteration (s): 0.55 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 2.821176E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.697 | TFLOPs: 44.02 | +7: iteration 34620/ 115203 | consumed samples: 8862720 | consumed tokens: 18150850560 | elapsed time per iteration (s): 0.55 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 2.826848E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 34630/ 115203 | consumed samples: 8865280 | consumed tokens: 18156093440 | elapsed time per iteration (s): 0.55 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 2.824722E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 34640/ 115203 | consumed samples: 8867840 | consumed tokens: 18161336320 | elapsed time per iteration (s): 0.56 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.815731E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.659 | TFLOPs: 43.54 | +7: iteration 34650/ 115203 | consumed samples: 8870400 | consumed tokens: 18166579200 | elapsed time per iteration (s): 0.56 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.807146E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.675 | TFLOPs: 43.54 | +7: iteration 34660/ 115203 | consumed samples: 8872960 | consumed tokens: 18171822080 | elapsed time per iteration (s): 0.56 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.823723E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.788 | TFLOPs: 43.55 | +7: iteration 34670/ 115203 | consumed samples: 8875520 | consumed tokens: 18177064960 | elapsed time per iteration (s): 0.56 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.826162E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.674 | TFLOPs: 43.54 | +7: iteration 34680/ 115203 | consumed samples: 8878080 | consumed tokens: 18182307840 | elapsed time per iteration (s): 0.55 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.814334E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.549 | TFLOPs: 44.00 | +7: iteration 34690/ 115203 | consumed samples: 8880640 | consumed tokens: 18187550720 | elapsed time per iteration (s): 0.55 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 2.826861E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.661 | TFLOPs: 44.01 | +7: iteration 34700/ 115203 | consumed samples: 8883200 | consumed tokens: 18192793600 | elapsed time per iteration (s): 0.56 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 2.822167E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.563 | TFLOPs: 43.53 | +7: iteration 34710/ 115203 | consumed samples: 8885760 | consumed tokens: 18198036480 | elapsed time per iteration (s): 0.56 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 2.832577E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.051 | TFLOPs: 43.29 | +7: iteration 34720/ 115203 | consumed samples: 8888320 | consumed tokens: 18203279360 | elapsed time per iteration (s): 0.56 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 2.816217E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.238 | TFLOPs: 43.97 | +7: iteration 34730/ 115203 | consumed samples: 8890880 | consumed tokens: 18208522240 | elapsed time per iteration (s): 0.56 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 2.821962E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.527 | TFLOPs: 43.43 | +7: iteration 34740/ 115203 | consumed samples: 8893440 | consumed tokens: 18213765120 | elapsed time per iteration (s): 0.55 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 2.819368E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 34750/ 115203 | consumed samples: 8896000 | consumed tokens: 18219008000 | elapsed time per iteration (s): 0.56 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 2.817571E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 34760/ 115203 | consumed samples: 8898560 | consumed tokens: 18224250880 | elapsed time per iteration (s): 0.55 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 2.820391E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.526 | TFLOPs: 44.00 | +7: iteration 34770/ 115203 | consumed samples: 8901120 | consumed tokens: 18229493760 | elapsed time per iteration (s): 0.56 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 2.813090E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.147 | TFLOPs: 43.87 | +7: iteration 34780/ 115203 | consumed samples: 8903680 | consumed tokens: 18234736640 | elapsed time per iteration (s): 0.55 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 2.833129E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.404 | TFLOPs: 43.99 | +7: iteration 34790/ 115203 | consumed samples: 8906240 | consumed tokens: 18239979520 | elapsed time per iteration (s): 0.57 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 2.828114E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.954 | TFLOPs: 43.18 | +7: iteration 34800/ 115203 | consumed samples: 8908800 | consumed tokens: 18245222400 | elapsed time per iteration (s): 0.56 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 2.805690E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.054 | TFLOPs: 43.58 | +7: iteration 34810/ 115203 | consumed samples: 8911360 | consumed tokens: 18250465280 | elapsed time per iteration (s): 0.56 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 2.813373E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.946 | TFLOPs: 43.56 | +7: iteration 34820/ 115203 | consumed samples: 8913920 | consumed tokens: 18255708160 | elapsed time per iteration (s): 0.56 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 2.840148E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.088 | TFLOPs: 43.29 | +7: iteration 34830/ 115203 | consumed samples: 8916480 | consumed tokens: 18260951040 | elapsed time per iteration (s): 0.56 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 2.812985E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.741 | TFLOPs: 43.26 | +7: iteration 34840/ 115203 | consumed samples: 8919040 | consumed tokens: 18266193920 | elapsed time per iteration (s): 0.57 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 2.808864E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.543 | TFLOPs: 43.15 | +7: iteration 34850/ 115203 | consumed samples: 8921600 | consumed tokens: 18271436800 | elapsed time per iteration (s): 0.56 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 2.817215E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.040 | TFLOPs: 43.48 | +7: iteration 34860/ 115203 | consumed samples: 8924160 | consumed tokens: 18276679680 | elapsed time per iteration (s): 0.55 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 2.833450E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.268 | TFLOPs: 43.98 | +7: iteration 34870/ 115203 | consumed samples: 8926720 | consumed tokens: 18281922560 | elapsed time per iteration (s): 0.55 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 2.805593E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.456 | TFLOPs: 43.99 | +7: iteration 34880/ 115203 | consumed samples: 8929280 | consumed tokens: 18287165440 | elapsed time per iteration (s): 0.56 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 2.817273E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.846 | TFLOPs: 43.36 | +7: iteration 34890/ 115203 | consumed samples: 8931840 | consumed tokens: 18292408320 | elapsed time per iteration (s): 0.56 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 2.806971E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.482 | TFLOPs: 43.62 | +7: iteration 34900/ 115203 | consumed samples: 8934400 | consumed tokens: 18297651200 | elapsed time per iteration (s): 0.55 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 2.820169E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.452 | TFLOPs: 43.99 | +7: iteration 34910/ 115203 | consumed samples: 8936960 | consumed tokens: 18302894080 | elapsed time per iteration (s): 0.57 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 2.814833E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.585 | TFLOPs: 43.05 | +7: iteration 34920/ 115203 | consumed samples: 8939520 | consumed tokens: 18308136960 | elapsed time per iteration (s): 0.55 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 2.822490E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.604 | TFLOPs: 44.01 | +7: iteration 34930/ 115203 | consumed samples: 8942080 | consumed tokens: 18313379840 | elapsed time per iteration (s): 0.57 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 2.823684E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.644 | TFLOPs: 42.96 | +7: iteration 34940/ 115203 | consumed samples: 8944640 | consumed tokens: 18318622720 | elapsed time per iteration (s): 0.56 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 2.813340E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.079 | TFLOPs: 43.86 | +7: iteration 34950/ 115203 | consumed samples: 8947200 | consumed tokens: 18323865600 | elapsed time per iteration (s): 0.55 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 2.816522E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 34960/ 115203 | consumed samples: 8949760 | consumed tokens: 18329108480 | elapsed time per iteration (s): 0.55 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 2.814586E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.551 | TFLOPs: 44.00 | +7: iteration 34970/ 115203 | consumed samples: 8952320 | consumed tokens: 18334351360 | elapsed time per iteration (s): 0.56 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 2.810662E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.905 | TFLOPs: 43.27 | +7: iteration 34980/ 115203 | consumed samples: 8954880 | consumed tokens: 18339594240 | elapsed time per iteration (s): 0.56 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 2.807402E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.093 | TFLOPs: 43.48 | +7: iteration 34990/ 115203 | consumed samples: 8957440 | consumed tokens: 18344837120 | elapsed time per iteration (s): 0.56 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 2.813800E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.499 | TFLOPs: 43.71 | +7: iteration 35000/ 115203 | consumed samples: 8960000 | consumed tokens: 18350080000 | elapsed time per iteration (s): 0.57 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 2.831297E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.182 | TFLOPs: 42.73 | +7: iteration 35010/ 115203 | consumed samples: 8962560 | consumed tokens: 18355322880 | elapsed time per iteration (s): 0.56 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 2.806414E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.971 | TFLOPs: 43.47 | +7: iteration 35020/ 115203 | consumed samples: 8965120 | consumed tokens: 18360565760 | elapsed time per iteration (s): 0.55 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 2.812314E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.562 | TFLOPs: 44.00 | +7: iteration 35030/ 115203 | consumed samples: 8967680 | consumed tokens: 18365808640 | elapsed time per iteration (s): 0.56 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 2.825402E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.894 | TFLOPs: 43.75 | +7: iteration 35040/ 115203 | consumed samples: 8970240 | consumed tokens: 18371051520 | elapsed time per iteration (s): 0.55 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 2.825766E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.629 | TFLOPs: 44.01 | +7: iteration 35050/ 115203 | consumed samples: 8972800 | consumed tokens: 18376294400 | elapsed time per iteration (s): 0.55 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 2.832640E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.563 | TFLOPs: 44.00 | +7: iteration 35060/ 115203 | consumed samples: 8975360 | consumed tokens: 18381537280 | elapsed time per iteration (s): 0.56 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 2.816292E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.200 | TFLOPs: 43.97 | +7: iteration 35070/ 115203 | consumed samples: 8977920 | consumed tokens: 18386780160 | elapsed time per iteration (s): 0.56 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 2.808515E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.112 | TFLOPs: 43.68 | +7: iteration 35080/ 115203 | consumed samples: 8980480 | consumed tokens: 18392023040 | elapsed time per iteration (s): 0.56 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 2.807910E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.731 | TFLOPs: 43.93 | +7: iteration 35090/ 115203 | consumed samples: 8983040 | consumed tokens: 18397265920 | elapsed time per iteration (s): 0.55 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 2.811993E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.555 | TFLOPs: 44.00 | +7: iteration 35100/ 115203 | consumed samples: 8985600 | consumed tokens: 18402508800 | elapsed time per iteration (s): 0.56 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 2.800567E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.192 | TFLOPs: 43.97 | +7: iteration 35110/ 115203 | consumed samples: 8988160 | consumed tokens: 18407751680 | elapsed time per iteration (s): 0.56 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 2.820153E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.250 | TFLOPs: 43.31 | +7: iteration 35120/ 115203 | consumed samples: 8990720 | consumed tokens: 18412994560 | elapsed time per iteration (s): 0.55 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 2.826836E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.798 | TFLOPs: 44.03 | +7: iteration 35130/ 115203 | consumed samples: 8993280 | consumed tokens: 18418237440 | elapsed time per iteration (s): 0.55 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 2.824344E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.815 | TFLOPs: 44.03 | +7: iteration 35140/ 115203 | consumed samples: 8995840 | consumed tokens: 18423480320 | elapsed time per iteration (s): 0.55 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 2.815884E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.968 | TFLOPs: 44.04 | +7: iteration 35150/ 115203 | consumed samples: 8998400 | consumed tokens: 18428723200 | elapsed time per iteration (s): 0.55 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 2.809987E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.847 | TFLOPs: 44.03 | +7: iteration 35160/ 115203 | consumed samples: 9000960 | consumed tokens: 18433966080 | elapsed time per iteration (s): 0.55 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 2.806910E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.498 | TFLOPs: 44.00 | +7: iteration 35170/ 115203 | consumed samples: 9003520 | consumed tokens: 18439208960 | elapsed time per iteration (s): 0.56 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 2.818405E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.018 | TFLOPs: 43.57 | +7: iteration 35180/ 115203 | consumed samples: 9006080 | consumed tokens: 18444451840 | elapsed time per iteration (s): 0.56 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 2.821645E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.869 | TFLOPs: 43.56 | +7: iteration 35190/ 115203 | consumed samples: 9008640 | consumed tokens: 18449694720 | elapsed time per iteration (s): 0.55 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 2.820441E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.648 | TFLOPs: 44.01 | +7: iteration 35200/ 115203 | consumed samples: 9011200 | consumed tokens: 18454937600 | elapsed time per iteration (s): 0.57 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 2.798898E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.631 | TFLOPs: 43.06 | +7: iteration 35210/ 115203 | consumed samples: 9013760 | consumed tokens: 18460180480 | elapsed time per iteration (s): 0.55 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 2.814506E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.568 | TFLOPs: 44.01 | +7: iteration 35220/ 115203 | consumed samples: 9016320 | consumed tokens: 18465423360 | elapsed time per iteration (s): 0.55 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 2.809058E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.367 | TFLOPs: 43.99 | +7: iteration 35230/ 115203 | consumed samples: 9018880 | consumed tokens: 18470666240 | elapsed time per iteration (s): 0.56 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 2.808467E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.561 | TFLOPs: 43.43 | +7: iteration 35240/ 115203 | consumed samples: 9021440 | consumed tokens: 18475909120 | elapsed time per iteration (s): 0.56 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 2.806790E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.513 | TFLOPs: 43.43 | +7: iteration 35250/ 115203 | consumed samples: 9024000 | consumed tokens: 18481152000 | elapsed time per iteration (s): 0.55 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 2.813625E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 35260/ 115203 | consumed samples: 9026560 | consumed tokens: 18486394880 | elapsed time per iteration (s): 0.57 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 2.810445E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.348 | TFLOPs: 42.75 | +7: iteration 35270/ 115203 | consumed samples: 9029120 | consumed tokens: 18491637760 | elapsed time per iteration (s): 0.56 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 2.822152E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.682 | TFLOPs: 43.92 | +7: iteration 35280/ 115203 | consumed samples: 9031680 | consumed tokens: 18496880640 | elapsed time per iteration (s): 0.55 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 2.826496E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.647 | TFLOPs: 44.01 | +7: iteration 35290/ 115203 | consumed samples: 9034240 | consumed tokens: 18502123520 | elapsed time per iteration (s): 0.55 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 2.818909E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.685 | TFLOPs: 44.02 | +7: iteration 35300/ 115203 | consumed samples: 9036800 | consumed tokens: 18507366400 | elapsed time per iteration (s): 0.55 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 2.811649E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.700 | TFLOPs: 44.02 | +7: iteration 35310/ 115203 | consumed samples: 9039360 | consumed tokens: 18512609280 | elapsed time per iteration (s): 0.56 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 2.830058E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.670 | TFLOPs: 43.73 | +7: iteration 35320/ 115203 | consumed samples: 9041920 | consumed tokens: 18517852160 | elapsed time per iteration (s): 0.56 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 2.802269E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.040 | TFLOPs: 43.57 | +7: iteration 35330/ 115203 | consumed samples: 9044480 | consumed tokens: 18523095040 | elapsed time per iteration (s): 0.55 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 2.820212E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.685 | TFLOPs: 44.02 | +7: iteration 35340/ 115203 | consumed samples: 9047040 | consumed tokens: 18528337920 | elapsed time per iteration (s): 0.55 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 2.819574E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.534 | TFLOPs: 44.00 | +7: iteration 35350/ 115203 | consumed samples: 9049600 | consumed tokens: 18533580800 | elapsed time per iteration (s): 0.55 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 2.821404E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.551 | TFLOPs: 44.00 | +7: iteration 35360/ 115203 | consumed samples: 9052160 | consumed tokens: 18538823680 | elapsed time per iteration (s): 0.57 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 2.801913E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.391 | TFLOPs: 43.04 | +7: iteration 35370/ 115203 | consumed samples: 9054720 | consumed tokens: 18544066560 | elapsed time per iteration (s): 0.56 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 2.813155E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.824 | TFLOPs: 43.55 | +7: iteration 35380/ 115203 | consumed samples: 9057280 | consumed tokens: 18549309440 | elapsed time per iteration (s): 0.55 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 2.821161E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 35390/ 115203 | consumed samples: 9059840 | consumed tokens: 18554552320 | elapsed time per iteration (s): 0.56 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 2.818682E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 35400/ 115203 | consumed samples: 9062400 | consumed tokens: 18559795200 | elapsed time per iteration (s): 0.56 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 2.825398E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.211 | TFLOPs: 43.97 | +7: iteration 35410/ 115203 | consumed samples: 9064960 | consumed tokens: 18565038080 | elapsed time per iteration (s): 0.57 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 2.808638E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.475 | TFLOPs: 43.14 | +7: iteration 35420/ 115203 | consumed samples: 9067520 | consumed tokens: 18570280960 | elapsed time per iteration (s): 0.56 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 2.817638E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.713 | TFLOPs: 43.73 | +7: iteration 35430/ 115203 | consumed samples: 9070080 | consumed tokens: 18575523840 | elapsed time per iteration (s): 0.55 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 2.816451E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.410 | TFLOPs: 43.99 | +7: iteration 35440/ 115203 | consumed samples: 9072640 | consumed tokens: 18580766720 | elapsed time per iteration (s): 0.56 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 2.808773E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.020 | TFLOPs: 43.67 | +7: iteration 35450/ 115203 | consumed samples: 9075200 | consumed tokens: 18586009600 | elapsed time per iteration (s): 0.56 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 2.822864E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.897 | TFLOPs: 43.94 | +7: iteration 35460/ 115203 | consumed samples: 9077760 | consumed tokens: 18591252480 | elapsed time per iteration (s): 0.55 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 2.824139E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.613 | TFLOPs: 44.01 | +7: iteration 35470/ 115203 | consumed samples: 9080320 | consumed tokens: 18596495360 | elapsed time per iteration (s): 0.56 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 2.805478E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.628 | TFLOPs: 43.73 | +7: iteration 35480/ 115203 | consumed samples: 9082880 | consumed tokens: 18601738240 | elapsed time per iteration (s): 0.56 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 2.817103E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.707 | TFLOPs: 43.73 | +7: iteration 35490/ 115203 | consumed samples: 9085440 | consumed tokens: 18606981120 | elapsed time per iteration (s): 0.55 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 2.816138E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.723 | TFLOPs: 44.02 | +7: iteration 35500/ 115203 | consumed samples: 9088000 | consumed tokens: 18612224000 | elapsed time per iteration (s): 0.56 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 2.815989E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.220 | TFLOPs: 43.50 | +7: iteration 35510/ 115203 | consumed samples: 9090560 | consumed tokens: 18617466880 | elapsed time per iteration (s): 0.56 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 2.811896E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.647 | TFLOPs: 43.73 | +7: iteration 35520/ 115203 | consumed samples: 9093120 | consumed tokens: 18622709760 | elapsed time per iteration (s): 0.55 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 2.807537E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.582 | TFLOPs: 44.01 | +7: iteration 35530/ 115203 | consumed samples: 9095680 | consumed tokens: 18627952640 | elapsed time per iteration (s): 0.56 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 2.812339E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.025 | TFLOPs: 43.48 | +7: iteration 35540/ 115203 | consumed samples: 9098240 | consumed tokens: 18633195520 | elapsed time per iteration (s): 0.56 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 2.802513E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.934 | TFLOPs: 43.66 | +7: iteration 35550/ 115203 | consumed samples: 9100800 | consumed tokens: 18638438400 | elapsed time per iteration (s): 0.56 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 2.800441E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.066 | TFLOPs: 43.58 | +7: iteration 35560/ 115203 | consumed samples: 9103360 | consumed tokens: 18643681280 | elapsed time per iteration (s): 0.57 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 2.811596E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.800 | TFLOPs: 43.07 | +7: iteration 35570/ 115203 | consumed samples: 9105920 | consumed tokens: 18648924160 | elapsed time per iteration (s): 0.55 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 2.813331E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.614 | TFLOPs: 44.01 | +7: iteration 35580/ 115203 | consumed samples: 9108480 | consumed tokens: 18654167040 | elapsed time per iteration (s): 0.55 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 2.807550E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.642 | TFLOPs: 44.01 | +7: iteration 35590/ 115203 | consumed samples: 9111040 | consumed tokens: 18659409920 | elapsed time per iteration (s): 0.55 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 2.814089E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.595 | TFLOPs: 44.01 | +7: iteration 35600/ 115203 | consumed samples: 9113600 | consumed tokens: 18664652800 | elapsed time per iteration (s): 0.55 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 2.817276E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.614 | TFLOPs: 44.01 | +7: iteration 35610/ 115203 | consumed samples: 9116160 | consumed tokens: 18669895680 | elapsed time per iteration (s): 0.55 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 2.806779E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.518 | TFLOPs: 44.00 | +7: iteration 35620/ 115203 | consumed samples: 9118720 | consumed tokens: 18675138560 | elapsed time per iteration (s): 0.55 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 2.804873E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.476 | TFLOPs: 44.00 | +7: iteration 35630/ 115203 | consumed samples: 9121280 | consumed tokens: 18680381440 | elapsed time per iteration (s): 0.55 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 2.806213E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 35640/ 115203 | consumed samples: 9123840 | consumed tokens: 18685624320 | elapsed time per iteration (s): 0.56 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 2.805291E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.646 | TFLOPs: 43.92 | +7: iteration 35650/ 115203 | consumed samples: 9126400 | consumed tokens: 18690867200 | elapsed time per iteration (s): 0.56 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 2.804688E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.038 | TFLOPs: 43.95 | +7: iteration 35660/ 115203 | consumed samples: 9128960 | consumed tokens: 18696110080 | elapsed time per iteration (s): 0.56 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 2.824018E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.149 | TFLOPs: 43.97 | +7: iteration 35670/ 115203 | consumed samples: 9131520 | consumed tokens: 18701352960 | elapsed time per iteration (s): 0.56 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 2.830421E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.719 | TFLOPs: 43.92 | +7: iteration 35680/ 115203 | consumed samples: 9134080 | consumed tokens: 18706595840 | elapsed time per iteration (s): 0.56 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 2.818362E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 35690/ 115203 | consumed samples: 9136640 | consumed tokens: 18711838720 | elapsed time per iteration (s): 0.56 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 2.804605E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.239 | TFLOPs: 43.50 | +7: iteration 35700/ 115203 | consumed samples: 9139200 | consumed tokens: 18717081600 | elapsed time per iteration (s): 0.56 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 2.814976E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.277 | TFLOPs: 43.79 | +7: iteration 35710/ 115203 | consumed samples: 9141760 | consumed tokens: 18722324480 | elapsed time per iteration (s): 0.55 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 2.798969E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 35720/ 115203 | consumed samples: 9144320 | consumed tokens: 18727567360 | elapsed time per iteration (s): 0.55 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 2.808479E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.525 | TFLOPs: 44.00 | +7: iteration 35730/ 115203 | consumed samples: 9146880 | consumed tokens: 18732810240 | elapsed time per iteration (s): 0.56 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 2.812769E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 35740/ 115203 | consumed samples: 9149440 | consumed tokens: 18738053120 | elapsed time per iteration (s): 0.56 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 2.814726E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.799 | TFLOPs: 43.93 | +7: iteration 35750/ 115203 | consumed samples: 9152000 | consumed tokens: 18743296000 | elapsed time per iteration (s): 0.56 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 2.816145E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.059 | TFLOPs: 43.86 | +7: iteration 35760/ 115203 | consumed samples: 9154560 | consumed tokens: 18748538880 | elapsed time per iteration (s): 0.55 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 2.815100E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.623 | TFLOPs: 44.01 | +7: iteration 35770/ 115203 | consumed samples: 9157120 | consumed tokens: 18753781760 | elapsed time per iteration (s): 0.55 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 2.814786E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +7: iteration 35780/ 115203 | consumed samples: 9159680 | consumed tokens: 18759024640 | elapsed time per iteration (s): 0.56 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 2.811516E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 35790/ 115203 | consumed samples: 9162240 | consumed tokens: 18764267520 | elapsed time per iteration (s): 0.56 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 2.810225E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.004 | TFLOPs: 43.67 | +7: iteration 35800/ 115203 | consumed samples: 9164800 | consumed tokens: 18769510400 | elapsed time per iteration (s): 0.57 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 2.814854E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.701 | TFLOPs: 42.59 | +7: iteration 35810/ 115203 | consumed samples: 9167360 | consumed tokens: 18774753280 | elapsed time per iteration (s): 0.56 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 2.793787E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.189 | TFLOPs: 43.97 | +7: iteration 35820/ 115203 | consumed samples: 9169920 | consumed tokens: 18779996160 | elapsed time per iteration (s): 0.56 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 2.928587E+00 | grad norm: 17.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.244 | TFLOPs: 43.88 | +7: iteration 35830/ 115203 | consumed samples: 9172480 | consumed tokens: 18785239040 | elapsed time per iteration (s): 0.56 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.196582E+00 | grad norm: 0.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.819 | TFLOPs: 43.74 | +7: iteration 35840/ 115203 | consumed samples: 9175040 | consumed tokens: 18790481920 | elapsed time per iteration (s): 0.56 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 2.923285E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.012 | TFLOPs: 43.86 | +7: iteration 35850/ 115203 | consumed samples: 9177600 | consumed tokens: 18795724800 | elapsed time per iteration (s): 0.56 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 2.864280E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.966 | TFLOPs: 43.95 | +7: iteration 35860/ 115203 | consumed samples: 9180160 | consumed tokens: 18800967680 | elapsed time per iteration (s): 0.56 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 2.833157E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.392 | TFLOPs: 43.89 | +7: iteration 35870/ 115203 | consumed samples: 9182720 | consumed tokens: 18806210560 | elapsed time per iteration (s): 0.56 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 2.835543E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.936 | TFLOPs: 43.95 | +7: iteration 35880/ 115203 | consumed samples: 9185280 | consumed tokens: 18811453440 | elapsed time per iteration (s): 0.56 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 2.815215E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.362 | TFLOPs: 43.89 | +7: iteration 35890/ 115203 | consumed samples: 9187840 | consumed tokens: 18816696320 | elapsed time per iteration (s): 0.56 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 2.827448E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.471 | TFLOPs: 43.90 | +7: iteration 35900/ 115203 | consumed samples: 9190400 | consumed tokens: 18821939200 | elapsed time per iteration (s): 0.56 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 2.814097E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.553 | TFLOPs: 43.91 | +7: iteration 35910/ 115203 | consumed samples: 9192960 | consumed tokens: 18827182080 | elapsed time per iteration (s): 0.56 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 2.816500E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.549 | TFLOPs: 43.91 | +7: iteration 35920/ 115203 | consumed samples: 9195520 | consumed tokens: 18832424960 | elapsed time per iteration (s): 0.56 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 2.828564E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.471 | TFLOPs: 43.90 | +7: iteration 35930/ 115203 | consumed samples: 9198080 | consumed tokens: 18837667840 | elapsed time per iteration (s): 0.56 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 2.806539E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.647 | TFLOPs: 43.92 | +7: iteration 35940/ 115203 | consumed samples: 9200640 | consumed tokens: 18842910720 | elapsed time per iteration (s): 0.56 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 2.817556E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.032 | TFLOPs: 43.95 | +7: iteration 35950/ 115203 | consumed samples: 9203200 | consumed tokens: 18848153600 | elapsed time per iteration (s): 0.56 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 2.808807E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.925 | TFLOPs: 43.94 | +7: iteration 35960/ 115203 | consumed samples: 9205760 | consumed tokens: 18853396480 | elapsed time per iteration (s): 0.56 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 2.822392E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.921 | TFLOPs: 43.94 | +7: iteration 35970/ 115203 | consumed samples: 9208320 | consumed tokens: 18858639360 | elapsed time per iteration (s): 0.56 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 2.821488E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.692 | TFLOPs: 43.92 | +7: iteration 35980/ 115203 | consumed samples: 9210880 | consumed tokens: 18863882240 | elapsed time per iteration (s): 0.56 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 2.819956E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.828 | TFLOPs: 43.93 | +7: iteration 35990/ 115203 | consumed samples: 9213440 | consumed tokens: 18869125120 | elapsed time per iteration (s): 0.56 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 2.806055E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.944 | TFLOPs: 43.28 | +0: [2023-03-16 18:28:53,167] [INFO] [logging.py:68:log_dist] [Rank 0] step=36000, skipped=0, lr=[0.00016162432908965068, 0.00016162432908965068, 0.00016162432908965068], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 36000/ 115203 | consumed samples: 9216000 | consumed tokens: 18874368000 | elapsed time per iteration (s): 0.56 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 2.809388E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.981 | TFLOPs: 43.95 | +0: steps: 36000 loss: 2.8113 iter time (s): 0.557 samples/sec: 459.263 +7: iteration 36010/ 115203 | consumed samples: 9218560 | consumed tokens: 18879610880 | elapsed time per iteration (s): 0.56 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 2.802573E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.990 | TFLOPs: 43.76 | +7: iteration 36020/ 115203 | consumed samples: 9221120 | consumed tokens: 18884853760 | elapsed time per iteration (s): 0.56 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 2.813977E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.992 | TFLOPs: 43.95 | +7: iteration 36030/ 115203 | consumed samples: 9223680 | consumed tokens: 18890096640 | elapsed time per iteration (s): 0.56 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 2.821592E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.970 | TFLOPs: 43.85 | +7: iteration 36040/ 115203 | consumed samples: 9226240 | consumed tokens: 18895339520 | elapsed time per iteration (s): 0.56 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 2.815042E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.179 | TFLOPs: 43.87 | +7: iteration 36050/ 115203 | consumed samples: 9228800 | consumed tokens: 18900582400 | elapsed time per iteration (s): 0.55 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 2.816295E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 36060/ 115203 | consumed samples: 9231360 | consumed tokens: 18905825280 | elapsed time per iteration (s): 0.56 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 2.804279E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.901 | TFLOPs: 43.56 | +7: iteration 36070/ 115203 | consumed samples: 9233920 | consumed tokens: 18911068160 | elapsed time per iteration (s): 0.56 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 2.828905E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.228 | TFLOPs: 43.59 | +7: iteration 36080/ 115203 | consumed samples: 9236480 | consumed tokens: 18916311040 | elapsed time per iteration (s): 0.57 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 2.811499E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.089 | TFLOPs: 42.72 | +7: iteration 36090/ 115203 | consumed samples: 9239040 | consumed tokens: 18921553920 | elapsed time per iteration (s): 0.57 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 2.817473E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.163 | TFLOPs: 42.73 | +7: iteration 36100/ 115203 | consumed samples: 9241600 | consumed tokens: 18926796800 | elapsed time per iteration (s): 0.56 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 2.800154E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.855 | TFLOPs: 43.46 | +7: iteration 36110/ 115203 | consumed samples: 9244160 | consumed tokens: 18932039680 | elapsed time per iteration (s): 0.56 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 2.813739E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.703 | TFLOPs: 43.35 | +7: iteration 36120/ 115203 | consumed samples: 9246720 | consumed tokens: 18937282560 | elapsed time per iteration (s): 0.56 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 2.819335E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.010 | TFLOPs: 43.67 | +7: iteration 36130/ 115203 | consumed samples: 9249280 | consumed tokens: 18942525440 | elapsed time per iteration (s): 0.57 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 2.807084E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.691 | TFLOPs: 43.06 | +7: iteration 36140/ 115203 | consumed samples: 9251840 | consumed tokens: 18947768320 | elapsed time per iteration (s): 0.56 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 2.803782E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.085 | TFLOPs: 43.39 | +7: iteration 36150/ 115203 | consumed samples: 9254400 | consumed tokens: 18953011200 | elapsed time per iteration (s): 0.57 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 2.813939E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.473 | TFLOPs: 42.47 | +7: iteration 36160/ 115203 | consumed samples: 9256960 | consumed tokens: 18958254080 | elapsed time per iteration (s): 0.56 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 2.818659E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.115 | TFLOPs: 43.58 | +7: iteration 36170/ 115203 | consumed samples: 9259520 | consumed tokens: 18963496960 | elapsed time per iteration (s): 0.56 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 2.811586E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.148 | TFLOPs: 43.49 | +7: iteration 36180/ 115203 | consumed samples: 9262080 | consumed tokens: 18968739840 | elapsed time per iteration (s): 0.55 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 2.817318E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.620 | TFLOPs: 44.01 | +7: iteration 36190/ 115203 | consumed samples: 9264640 | consumed tokens: 18973982720 | elapsed time per iteration (s): 0.56 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 2.812845E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.945 | TFLOPs: 43.85 | +7: iteration 36200/ 115203 | consumed samples: 9267200 | consumed tokens: 18979225600 | elapsed time per iteration (s): 0.55 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 2.816389E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 36210/ 115203 | consumed samples: 9269760 | consumed tokens: 18984468480 | elapsed time per iteration (s): 0.56 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 2.824160E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.951 | TFLOPs: 43.57 | +7: iteration 36220/ 115203 | consumed samples: 9272320 | consumed tokens: 18989711360 | elapsed time per iteration (s): 0.57 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 2.796087E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.594 | TFLOPs: 43.15 | +7: iteration 36230/ 115203 | consumed samples: 9274880 | consumed tokens: 18994954240 | elapsed time per iteration (s): 0.56 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 2.796302E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.525 | TFLOPs: 43.62 | +7: iteration 36240/ 115203 | consumed samples: 9277440 | consumed tokens: 19000197120 | elapsed time per iteration (s): 0.56 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 2.814190E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.105 | TFLOPs: 43.20 | +7: iteration 36250/ 115203 | consumed samples: 9280000 | consumed tokens: 19005440000 | elapsed time per iteration (s): 0.57 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 2.821287E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.609 | TFLOPs: 43.15 | +7: iteration 36260/ 115203 | consumed samples: 9282560 | consumed tokens: 19010682880 | elapsed time per iteration (s): 0.57 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 2.797025E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.720 | TFLOPs: 42.78 | +7: iteration 36270/ 115203 | consumed samples: 9285120 | consumed tokens: 19015925760 | elapsed time per iteration (s): 0.57 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 2.808641E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.073 | TFLOPs: 42.81 | +7: iteration 36280/ 115203 | consumed samples: 9287680 | consumed tokens: 19021168640 | elapsed time per iteration (s): 0.57 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 2.818174E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.179 | TFLOPs: 42.63 | +7: iteration 36290/ 115203 | consumed samples: 9290240 | consumed tokens: 19026411520 | elapsed time per iteration (s): 0.56 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 2.821398E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.978 | TFLOPs: 43.47 | +7: iteration 36300/ 115203 | consumed samples: 9292800 | consumed tokens: 19031654400 | elapsed time per iteration (s): 0.57 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 2.821951E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.481 | TFLOPs: 43.04 | +7: iteration 36310/ 115203 | consumed samples: 9295360 | consumed tokens: 19036897280 | elapsed time per iteration (s): 0.56 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 2.809375E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.122 | TFLOPs: 43.49 | +7: iteration 36320/ 115203 | consumed samples: 9297920 | consumed tokens: 19042140160 | elapsed time per iteration (s): 0.57 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 2.812156E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.862 | TFLOPs: 42.89 | +7: iteration 36330/ 115203 | consumed samples: 9300480 | consumed tokens: 19047383040 | elapsed time per iteration (s): 0.55 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 2.802867E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.617 | TFLOPs: 44.01 | +7: iteration 36340/ 115203 | consumed samples: 9303040 | consumed tokens: 19052625920 | elapsed time per iteration (s): 0.56 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 2.806599E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.088 | TFLOPs: 43.39 | +7: iteration 36350/ 115203 | consumed samples: 9305600 | consumed tokens: 19057868800 | elapsed time per iteration (s): 0.56 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 2.803459E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.213 | TFLOPs: 43.69 | +7: iteration 36360/ 115203 | consumed samples: 9308160 | consumed tokens: 19063111680 | elapsed time per iteration (s): 0.56 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 2.800875E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.359 | TFLOPs: 43.79 | +7: iteration 36370/ 115203 | consumed samples: 9310720 | consumed tokens: 19068354560 | elapsed time per iteration (s): 0.57 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 2.806048E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.985 | TFLOPs: 43.09 | +7: iteration 36380/ 115203 | consumed samples: 9313280 | consumed tokens: 19073597440 | elapsed time per iteration (s): 0.56 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 2.813919E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.372 | TFLOPs: 43.22 | +7: iteration 36390/ 115203 | consumed samples: 9315840 | consumed tokens: 19078840320 | elapsed time per iteration (s): 0.56 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 2.797668E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.144 | TFLOPs: 43.49 | +7: iteration 36400/ 115203 | consumed samples: 9318400 | consumed tokens: 19084083200 | elapsed time per iteration (s): 0.56 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 2.784794E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.088 | TFLOPs: 43.96 | +7: iteration 36410/ 115203 | consumed samples: 9320960 | consumed tokens: 19089326080 | elapsed time per iteration (s): 0.55 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 2.816086E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.613 | TFLOPs: 44.01 | +7: iteration 36420/ 115203 | consumed samples: 9323520 | consumed tokens: 19094568960 | elapsed time per iteration (s): 0.56 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 2.789593E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.670 | TFLOPs: 43.92 | +7: iteration 36430/ 115203 | consumed samples: 9326080 | consumed tokens: 19099811840 | elapsed time per iteration (s): 0.57 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 2.804904E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.852 | TFLOPs: 42.89 | +7: iteration 36440/ 115203 | consumed samples: 9328640 | consumed tokens: 19105054720 | elapsed time per iteration (s): 0.57 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 2.812689E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.690 | TFLOPs: 43.16 | +7: iteration 36450/ 115203 | consumed samples: 9331200 | consumed tokens: 19110297600 | elapsed time per iteration (s): 0.56 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 2.811891E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.987 | TFLOPs: 43.76 | +7: iteration 36460/ 115203 | consumed samples: 9333760 | consumed tokens: 19115540480 | elapsed time per iteration (s): 0.56 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 2.822677E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.361 | TFLOPs: 43.60 | +7: iteration 36470/ 115203 | consumed samples: 9336320 | consumed tokens: 19120783360 | elapsed time per iteration (s): 0.56 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 2.796326E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.989 | TFLOPs: 43.66 | +7: iteration 36480/ 115203 | consumed samples: 9338880 | consumed tokens: 19126026240 | elapsed time per iteration (s): 0.57 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 2.811162E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.664 | TFLOPs: 43.16 | +7: iteration 36490/ 115203 | consumed samples: 9341440 | consumed tokens: 19131269120 | elapsed time per iteration (s): 0.56 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 2.812832E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.786 | TFLOPs: 43.74 | +7: iteration 36500/ 115203 | consumed samples: 9344000 | consumed tokens: 19136512000 | elapsed time per iteration (s): 0.56 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 2.789033E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.259 | TFLOPs: 43.69 | +7: iteration 36510/ 115203 | consumed samples: 9346560 | consumed tokens: 19141754880 | elapsed time per iteration (s): 0.56 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 2.814890E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.365 | TFLOPs: 43.22 | +7: iteration 36520/ 115203 | consumed samples: 9349120 | consumed tokens: 19146997760 | elapsed time per iteration (s): 0.56 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 2.807551E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.354 | TFLOPs: 43.22 | +7: iteration 36530/ 115203 | consumed samples: 9351680 | consumed tokens: 19152240640 | elapsed time per iteration (s): 0.56 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 2.815912E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.410 | TFLOPs: 43.51 | +7: iteration 36540/ 115203 | consumed samples: 9354240 | consumed tokens: 19157483520 | elapsed time per iteration (s): 0.57 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 2.795399E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.960 | TFLOPs: 42.99 | +7: iteration 36550/ 115203 | consumed samples: 9356800 | consumed tokens: 19162726400 | elapsed time per iteration (s): 0.58 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 2.808259E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.722 | TFLOPs: 42.40 | +7: iteration 36560/ 115203 | consumed samples: 9359360 | consumed tokens: 19167969280 | elapsed time per iteration (s): 0.56 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 2.789249E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.839 | TFLOPs: 43.75 | +7: iteration 36570/ 115203 | consumed samples: 9361920 | consumed tokens: 19173212160 | elapsed time per iteration (s): 0.56 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 2.794602E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.338 | TFLOPs: 43.70 | +7: iteration 36580/ 115203 | consumed samples: 9364480 | consumed tokens: 19178455040 | elapsed time per iteration (s): 0.56 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 2.803176E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.505 | TFLOPs: 43.52 | +7: iteration 36590/ 115203 | consumed samples: 9367040 | consumed tokens: 19183697920 | elapsed time per iteration (s): 0.56 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 2.804803E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.413 | TFLOPs: 43.90 | +7: iteration 36600/ 115203 | consumed samples: 9369600 | consumed tokens: 19188940800 | elapsed time per iteration (s): 0.56 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 2.820324E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.420 | TFLOPs: 43.23 | +7: iteration 36610/ 115203 | consumed samples: 9372160 | consumed tokens: 19194183680 | elapsed time per iteration (s): 0.56 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 2.810956E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.994 | TFLOPs: 43.76 | +7: iteration 36620/ 115203 | consumed samples: 9374720 | consumed tokens: 19199426560 | elapsed time per iteration (s): 0.57 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 2.811656E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.459 | TFLOPs: 42.95 | +7: iteration 36630/ 115203 | consumed samples: 9377280 | consumed tokens: 19204669440 | elapsed time per iteration (s): 0.56 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 2.801709E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.241 | TFLOPs: 43.97 | +7: iteration 36640/ 115203 | consumed samples: 9379840 | consumed tokens: 19209912320 | elapsed time per iteration (s): 0.55 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 2.819117E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.499 | TFLOPs: 44.00 | +7: iteration 36650/ 115203 | consumed samples: 9382400 | consumed tokens: 19215155200 | elapsed time per iteration (s): 0.55 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 2.812974E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.519 | TFLOPs: 44.00 | +7: iteration 36660/ 115203 | consumed samples: 9384960 | consumed tokens: 19220398080 | elapsed time per iteration (s): 0.56 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 2.807122E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.463 | TFLOPs: 43.42 | +7: iteration 36670/ 115203 | consumed samples: 9387520 | consumed tokens: 19225640960 | elapsed time per iteration (s): 0.55 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 2.799274E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.464 | TFLOPs: 44.00 | +7: iteration 36680/ 115203 | consumed samples: 9390080 | consumed tokens: 19230883840 | elapsed time per iteration (s): 0.57 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 2.788961E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.429 | TFLOPs: 42.94 | +7: iteration 36690/ 115203 | consumed samples: 9392640 | consumed tokens: 19236126720 | elapsed time per iteration (s): 0.56 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 2.802322E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.150 | TFLOPs: 43.58 | +7: iteration 36700/ 115203 | consumed samples: 9395200 | consumed tokens: 19241369600 | elapsed time per iteration (s): 0.56 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 2.811623E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.034 | TFLOPs: 43.48 | +7: iteration 36710/ 115203 | consumed samples: 9397760 | consumed tokens: 19246612480 | elapsed time per iteration (s): 0.55 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 2.805829E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.603 | TFLOPs: 44.01 | +7: iteration 36720/ 115203 | consumed samples: 9400320 | consumed tokens: 19251855360 | elapsed time per iteration (s): 0.56 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 2.814056E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.295 | TFLOPs: 43.50 | +7: iteration 36730/ 115203 | consumed samples: 9402880 | consumed tokens: 19257098240 | elapsed time per iteration (s): 0.56 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 2.801759E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.655 | TFLOPs: 43.73 | +7: iteration 36740/ 115203 | consumed samples: 9405440 | consumed tokens: 19262341120 | elapsed time per iteration (s): 0.56 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 2.803178E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.012 | TFLOPs: 43.95 | +7: iteration 36750/ 115203 | consumed samples: 9408000 | consumed tokens: 19267584000 | elapsed time per iteration (s): 0.56 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 2.814267E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.749 | TFLOPs: 43.74 | +7: iteration 36760/ 115203 | consumed samples: 9410560 | consumed tokens: 19272826880 | elapsed time per iteration (s): 0.57 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 2.814524E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.692 | TFLOPs: 43.06 | +7: iteration 36770/ 115203 | consumed samples: 9413120 | consumed tokens: 19278069760 | elapsed time per iteration (s): 0.56 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 2.808160E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.174 | TFLOPs: 43.59 | +7: iteration 36780/ 115203 | consumed samples: 9415680 | consumed tokens: 19283312640 | elapsed time per iteration (s): 0.56 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 2.813805E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.028 | TFLOPs: 43.29 | +7: iteration 36790/ 115203 | consumed samples: 9418240 | consumed tokens: 19288555520 | elapsed time per iteration (s): 0.56 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 2.815701E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.995 | TFLOPs: 43.28 | +7: iteration 36800/ 115203 | consumed samples: 9420800 | consumed tokens: 19293798400 | elapsed time per iteration (s): 0.56 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 2.814834E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.999 | TFLOPs: 43.28 | +7: iteration 36810/ 115203 | consumed samples: 9423360 | consumed tokens: 19299041280 | elapsed time per iteration (s): 0.56 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 2.813521E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.407 | TFLOPs: 43.61 | +7: iteration 36820/ 115203 | consumed samples: 9425920 | consumed tokens: 19304284160 | elapsed time per iteration (s): 0.58 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 2.803563E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.487 | TFLOPs: 42.28 | +7: iteration 36830/ 115203 | consumed samples: 9428480 | consumed tokens: 19309527040 | elapsed time per iteration (s): 0.56 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 2.814355E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.438 | TFLOPs: 43.52 | +7: iteration 36840/ 115203 | consumed samples: 9431040 | consumed tokens: 19314769920 | elapsed time per iteration (s): 0.57 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 2.806319E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.950 | TFLOPs: 43.18 | +7: iteration 36850/ 115203 | consumed samples: 9433600 | consumed tokens: 19320012800 | elapsed time per iteration (s): 0.58 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 2.801126E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.070 | TFLOPs: 42.43 | +7: iteration 36860/ 115203 | consumed samples: 9436160 | consumed tokens: 19325255680 | elapsed time per iteration (s): 0.57 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 2.799959E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.216 | TFLOPs: 42.64 | +7: iteration 36870/ 115203 | consumed samples: 9438720 | consumed tokens: 19330498560 | elapsed time per iteration (s): 0.56 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 2.796922E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.067 | TFLOPs: 43.77 | +7: iteration 36880/ 115203 | consumed samples: 9441280 | consumed tokens: 19335741440 | elapsed time per iteration (s): 0.57 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 2.813848E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.763 | TFLOPs: 42.78 | +7: iteration 36890/ 115203 | consumed samples: 9443840 | consumed tokens: 19340984320 | elapsed time per iteration (s): 0.56 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 2.806079E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.168 | TFLOPs: 43.20 | +7: iteration 36900/ 115203 | consumed samples: 9446400 | consumed tokens: 19346227200 | elapsed time per iteration (s): 0.57 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 2.825271E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.780 | TFLOPs: 42.98 | +7: iteration 36910/ 115203 | consumed samples: 9448960 | consumed tokens: 19351470080 | elapsed time per iteration (s): 0.57 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 2.809015E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.762 | TFLOPs: 42.50 | +7: iteration 36920/ 115203 | consumed samples: 9451520 | consumed tokens: 19356712960 | elapsed time per iteration (s): 0.57 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 2.801040E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.806 | TFLOPs: 42.60 | +7: iteration 36930/ 115203 | consumed samples: 9454080 | consumed tokens: 19361955840 | elapsed time per iteration (s): 0.57 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 2.789923E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.103 | TFLOPs: 43.10 | +7: iteration 36940/ 115203 | consumed samples: 9456640 | consumed tokens: 19367198720 | elapsed time per iteration (s): 0.58 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 2.802346E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.878 | TFLOPs: 42.41 | +7: iteration 36950/ 115203 | consumed samples: 9459200 | consumed tokens: 19372441600 | elapsed time per iteration (s): 0.57 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 2.806593E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.624 | TFLOPs: 43.15 | +7: iteration 36960/ 115203 | consumed samples: 9461760 | consumed tokens: 19377684480 | elapsed time per iteration (s): 0.56 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 2.807688E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.024 | TFLOPs: 43.48 | +7: iteration 36970/ 115203 | consumed samples: 9464320 | consumed tokens: 19382927360 | elapsed time per iteration (s): 0.56 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 2.809920E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.608 | TFLOPs: 43.25 | +7: iteration 36980/ 115203 | consumed samples: 9466880 | consumed tokens: 19388170240 | elapsed time per iteration (s): 0.56 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 2.816641E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.888 | TFLOPs: 43.75 | +7: iteration 36990/ 115203 | consumed samples: 9469440 | consumed tokens: 19393413120 | elapsed time per iteration (s): 0.56 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 2.818256E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.609 | TFLOPs: 43.72 | +7: iteration 37000/ 115203 | consumed samples: 9472000 | consumed tokens: 19398656000 | elapsed time per iteration (s): 0.56 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 2.811839E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.194 | TFLOPs: 43.59 | +7: iteration 37010/ 115203 | consumed samples: 9474560 | consumed tokens: 19403898880 | elapsed time per iteration (s): 0.57 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 2.811322E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.886 | TFLOPs: 42.99 | +7: iteration 37020/ 115203 | consumed samples: 9477120 | consumed tokens: 19409141760 | elapsed time per iteration (s): 0.57 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 2.811074E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.845 | TFLOPs: 42.98 | +7: iteration 37030/ 115203 | consumed samples: 9479680 | consumed tokens: 19414384640 | elapsed time per iteration (s): 0.56 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 2.792469E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.733 | TFLOPs: 43.54 | +7: iteration 37040/ 115203 | consumed samples: 9482240 | consumed tokens: 19419627520 | elapsed time per iteration (s): 0.57 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 2.796229E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.104 | TFLOPs: 42.91 | +7: iteration 37050/ 115203 | consumed samples: 9484800 | consumed tokens: 19424870400 | elapsed time per iteration (s): 0.56 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 2.807875E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.205 | TFLOPs: 43.59 | +7: iteration 37060/ 115203 | consumed samples: 9487360 | consumed tokens: 19430113280 | elapsed time per iteration (s): 0.57 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 2.803954E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.867 | TFLOPs: 42.99 | +7: iteration 37070/ 115203 | consumed samples: 9489920 | consumed tokens: 19435356160 | elapsed time per iteration (s): 0.56 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 2.805432E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.882 | TFLOPs: 43.84 | +7: iteration 37080/ 115203 | consumed samples: 9492480 | consumed tokens: 19440599040 | elapsed time per iteration (s): 0.56 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 2.807868E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.541 | TFLOPs: 43.24 | +7: iteration 37090/ 115203 | consumed samples: 9495040 | consumed tokens: 19445841920 | elapsed time per iteration (s): 0.57 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 2.799936E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.576 | TFLOPs: 42.58 | +7: iteration 37100/ 115203 | consumed samples: 9497600 | consumed tokens: 19451084800 | elapsed time per iteration (s): 0.56 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 2.798123E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.817 | TFLOPs: 43.36 | +7: iteration 37110/ 115203 | consumed samples: 9500160 | consumed tokens: 19456327680 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 2.805653E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.029 | TFLOPs: 43.95 | +7: iteration 37120/ 115203 | consumed samples: 9502720 | consumed tokens: 19461570560 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 2.803405E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.856 | TFLOPs: 43.37 | +7: iteration 37130/ 115203 | consumed samples: 9505280 | consumed tokens: 19466813440 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 2.803957E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.121 | TFLOPs: 43.96 | +7: iteration 37140/ 115203 | consumed samples: 9507840 | consumed tokens: 19472056320 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 2.804269E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.005 | TFLOPs: 43.48 | +7: iteration 37150/ 115203 | consumed samples: 9510400 | consumed tokens: 19477299200 | elapsed time per iteration (s): 0.56 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 2.806413E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.467 | TFLOPs: 43.71 | +7: iteration 37160/ 115203 | consumed samples: 9512960 | consumed tokens: 19482542080 | elapsed time per iteration (s): 0.56 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 2.804260E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.243 | TFLOPs: 43.40 | +7: iteration 37170/ 115203 | consumed samples: 9515520 | consumed tokens: 19487784960 | elapsed time per iteration (s): 0.56 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 2.807045E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.061 | TFLOPs: 43.39 | +7: iteration 37180/ 115203 | consumed samples: 9518080 | consumed tokens: 19493027840 | elapsed time per iteration (s): 0.56 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 2.804704E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.180 | TFLOPs: 43.59 | +7: iteration 37190/ 115203 | consumed samples: 9520640 | consumed tokens: 19498270720 | elapsed time per iteration (s): 0.57 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 2.802439E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.200 | TFLOPs: 43.02 | +7: iteration 37200/ 115203 | consumed samples: 9523200 | consumed tokens: 19503513600 | elapsed time per iteration (s): 0.56 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 2.807037E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.040 | TFLOPs: 43.96 | +7: iteration 37210/ 115203 | consumed samples: 9525760 | consumed tokens: 19508756480 | elapsed time per iteration (s): 0.58 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 2.800006E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.416 | TFLOPs: 42.37 | +7: iteration 37220/ 115203 | consumed samples: 9528320 | consumed tokens: 19513999360 | elapsed time per iteration (s): 0.56 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 2.797144E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.480 | TFLOPs: 43.43 | +7: iteration 37230/ 115203 | consumed samples: 9530880 | consumed tokens: 19519242240 | elapsed time per iteration (s): 0.56 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 2.807676E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.242 | TFLOPs: 43.31 | +7: iteration 37240/ 115203 | consumed samples: 9533440 | consumed tokens: 19524485120 | elapsed time per iteration (s): 0.57 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 2.810430E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.868 | TFLOPs: 42.79 | +7: iteration 37250/ 115203 | consumed samples: 9536000 | consumed tokens: 19529728000 | elapsed time per iteration (s): 0.56 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 2.805609E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.900 | TFLOPs: 43.47 | +7: iteration 37260/ 115203 | consumed samples: 9538560 | consumed tokens: 19534970880 | elapsed time per iteration (s): 0.56 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 2.805054E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.636 | TFLOPs: 43.54 | +7: iteration 37270/ 115203 | consumed samples: 9541120 | consumed tokens: 19540213760 | elapsed time per iteration (s): 0.56 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 2.809717E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.501 | TFLOPs: 43.52 | +7: iteration 37280/ 115203 | consumed samples: 9543680 | consumed tokens: 19545456640 | elapsed time per iteration (s): 0.56 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 2.803244E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.740 | TFLOPs: 43.55 | +7: iteration 37290/ 115203 | consumed samples: 9546240 | consumed tokens: 19550699520 | elapsed time per iteration (s): 0.57 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 2.792930E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.272 | TFLOPs: 43.12 | +7: iteration 37300/ 115203 | consumed samples: 9548800 | consumed tokens: 19555942400 | elapsed time per iteration (s): 0.56 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 2.807077E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.290 | TFLOPs: 43.22 | +7: iteration 37310/ 115203 | consumed samples: 9551360 | consumed tokens: 19561185280 | elapsed time per iteration (s): 0.56 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 2.804189E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.561 | TFLOPs: 43.62 | +7: iteration 37320/ 115203 | consumed samples: 9553920 | consumed tokens: 19566428160 | elapsed time per iteration (s): 0.56 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 2.821004E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.051 | TFLOPs: 43.48 | +7: iteration 37330/ 115203 | consumed samples: 9556480 | consumed tokens: 19571671040 | elapsed time per iteration (s): 0.57 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 2.803432E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.654 | TFLOPs: 43.06 | +7: iteration 37340/ 115203 | consumed samples: 9559040 | consumed tokens: 19576913920 | elapsed time per iteration (s): 0.56 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 2.805164E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.714 | TFLOPs: 43.35 | +7: iteration 37350/ 115203 | consumed samples: 9561600 | consumed tokens: 19582156800 | elapsed time per iteration (s): 0.56 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 2.805219E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.901 | TFLOPs: 43.56 | +7: iteration 37360/ 115203 | consumed samples: 9564160 | consumed tokens: 19587399680 | elapsed time per iteration (s): 0.57 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 2.804938E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.310 | TFLOPs: 43.03 | +7: iteration 37370/ 115203 | consumed samples: 9566720 | consumed tokens: 19592642560 | elapsed time per iteration (s): 0.56 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 2.808966E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.866 | TFLOPs: 43.56 | +7: iteration 37380/ 115203 | consumed samples: 9569280 | consumed tokens: 19597885440 | elapsed time per iteration (s): 0.57 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 2.797929E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.460 | TFLOPs: 43.04 | +7: iteration 37390/ 115203 | consumed samples: 9571840 | consumed tokens: 19603128320 | elapsed time per iteration (s): 0.56 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 2.800626E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.957 | TFLOPs: 43.57 | +7: iteration 37400/ 115203 | consumed samples: 9574400 | consumed tokens: 19608371200 | elapsed time per iteration (s): 0.56 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 2.800066E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.364 | TFLOPs: 43.89 | +7: iteration 37410/ 115203 | consumed samples: 9576960 | consumed tokens: 19613614080 | elapsed time per iteration (s): 0.56 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 2.809352E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.410 | TFLOPs: 43.70 | +7: iteration 37420/ 115203 | consumed samples: 9579520 | consumed tokens: 19618856960 | elapsed time per iteration (s): 0.57 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 2.812272E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.006 | TFLOPs: 42.81 | +7: iteration 37430/ 115203 | consumed samples: 9582080 | consumed tokens: 19624099840 | elapsed time per iteration (s): 0.56 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 2.805124E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.187 | TFLOPs: 43.97 | +7: iteration 37440/ 115203 | consumed samples: 9584640 | consumed tokens: 19629342720 | elapsed time per iteration (s): 0.55 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 2.798955E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.273 | TFLOPs: 43.98 | +7: iteration 37450/ 115203 | consumed samples: 9587200 | consumed tokens: 19634585600 | elapsed time per iteration (s): 0.56 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 2.799611E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.929 | TFLOPs: 43.85 | +7: iteration 37460/ 115203 | consumed samples: 9589760 | consumed tokens: 19639828480 | elapsed time per iteration (s): 0.56 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 2.785207E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.228 | TFLOPs: 43.97 | +7: iteration 37470/ 115203 | consumed samples: 9592320 | consumed tokens: 19645071360 | elapsed time per iteration (s): 0.57 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 2.792119E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.027 | TFLOPs: 43.19 | +7: iteration 37480/ 115203 | consumed samples: 9594880 | consumed tokens: 19650314240 | elapsed time per iteration (s): 0.56 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 2.800055E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.033 | TFLOPs: 43.95 | +7: iteration 37490/ 115203 | consumed samples: 9597440 | consumed tokens: 19655557120 | elapsed time per iteration (s): 0.57 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 2.803675E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.451 | TFLOPs: 43.14 | +7: iteration 37500/ 115203 | consumed samples: 9600000 | consumed tokens: 19660800000 | elapsed time per iteration (s): 0.57 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 2.787239E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.025 | TFLOPs: 43.19 | +7: iteration 37510/ 115203 | consumed samples: 9602560 | consumed tokens: 19666042880 | elapsed time per iteration (s): 0.56 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 2.796897E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.249 | TFLOPs: 43.31 | +7: iteration 37520/ 115203 | consumed samples: 9605120 | consumed tokens: 19671285760 | elapsed time per iteration (s): 0.57 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 2.806575E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.038 | TFLOPs: 43.19 | +7: iteration 37530/ 115203 | consumed samples: 9607680 | consumed tokens: 19676528640 | elapsed time per iteration (s): 0.56 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 2.781253E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 37540/ 115203 | consumed samples: 9610240 | consumed tokens: 19681771520 | elapsed time per iteration (s): 0.59 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 2.792656E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.265 | TFLOPs: 41.59 | +7: iteration 37550/ 115203 | consumed samples: 9612800 | consumed tokens: 19687014400 | elapsed time per iteration (s): 0.56 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 2.812834E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.361 | TFLOPs: 43.51 | +7: iteration 37560/ 115203 | consumed samples: 9615360 | consumed tokens: 19692257280 | elapsed time per iteration (s): 0.56 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 2.809959E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.985 | TFLOPs: 43.38 | +7: iteration 37570/ 115203 | consumed samples: 9617920 | consumed tokens: 19697500160 | elapsed time per iteration (s): 0.57 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 2.794277E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.098 | TFLOPs: 43.01 | +7: iteration 37580/ 115203 | consumed samples: 9620480 | consumed tokens: 19702743040 | elapsed time per iteration (s): 0.56 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 2.802746E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.896 | TFLOPs: 43.27 | +7: iteration 37590/ 115203 | consumed samples: 9623040 | consumed tokens: 19707985920 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 2.797548E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.858 | TFLOPs: 43.94 | +7: iteration 37600/ 115203 | consumed samples: 9625600 | consumed tokens: 19713228800 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 2.798790E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.443 | TFLOPs: 43.61 | +7: iteration 37610/ 115203 | consumed samples: 9628160 | consumed tokens: 19718471680 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 2.787039E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.172 | TFLOPs: 43.97 | +7: iteration 37620/ 115203 | consumed samples: 9630720 | consumed tokens: 19723714560 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 2.796196E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.195 | TFLOPs: 43.78 | +7: iteration 37630/ 115203 | consumed samples: 9633280 | consumed tokens: 19728957440 | elapsed time per iteration (s): 0.56 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 2.796933E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.466 | TFLOPs: 43.42 | +7: iteration 37640/ 115203 | consumed samples: 9635840 | consumed tokens: 19734200320 | elapsed time per iteration (s): 0.56 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 2.803385E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.043 | TFLOPs: 43.57 | +7: iteration 37650/ 115203 | consumed samples: 9638400 | consumed tokens: 19739443200 | elapsed time per iteration (s): 0.56 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 2.793928E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.421 | TFLOPs: 43.71 | +7: iteration 37660/ 115203 | consumed samples: 9640960 | consumed tokens: 19744686080 | elapsed time per iteration (s): 0.56 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 2.791851E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.310 | TFLOPs: 43.79 | +7: iteration 37670/ 115203 | consumed samples: 9643520 | consumed tokens: 19749928960 | elapsed time per iteration (s): 0.56 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 2.799858E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.883 | TFLOPs: 43.27 | +7: iteration 37680/ 115203 | consumed samples: 9646080 | consumed tokens: 19755171840 | elapsed time per iteration (s): 0.56 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 2.807366E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.335 | TFLOPs: 43.32 | +7: iteration 37690/ 115203 | consumed samples: 9648640 | consumed tokens: 19760414720 | elapsed time per iteration (s): 0.56 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 2.791461E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.602 | TFLOPs: 43.53 | +7: iteration 37700/ 115203 | consumed samples: 9651200 | consumed tokens: 19765657600 | elapsed time per iteration (s): 0.57 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 2.797819E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.796 | TFLOPs: 42.98 | +7: iteration 37710/ 115203 | consumed samples: 9653760 | consumed tokens: 19770900480 | elapsed time per iteration (s): 0.56 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 2.793248E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.320 | TFLOPs: 43.60 | +7: iteration 37720/ 115203 | consumed samples: 9656320 | consumed tokens: 19776143360 | elapsed time per iteration (s): 0.56 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 2.806257E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.968 | TFLOPs: 43.47 | +7: iteration 37730/ 115203 | consumed samples: 9658880 | consumed tokens: 19781386240 | elapsed time per iteration (s): 0.56 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 2.796784E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.044 | TFLOPs: 43.96 | +7: iteration 37740/ 115203 | consumed samples: 9661440 | consumed tokens: 19786629120 | elapsed time per iteration (s): 0.57 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 2.820928E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.635 | TFLOPs: 42.87 | +7: iteration 37750/ 115203 | consumed samples: 9664000 | consumed tokens: 19791872000 | elapsed time per iteration (s): 0.56 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 2.789754E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.932 | TFLOPs: 43.94 | +7: iteration 37760/ 115203 | consumed samples: 9666560 | consumed tokens: 19797114880 | elapsed time per iteration (s): 0.56 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 2.804539E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.129 | TFLOPs: 43.49 | +7: iteration 37770/ 115203 | consumed samples: 9669120 | consumed tokens: 19802357760 | elapsed time per iteration (s): 0.57 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 2.801340E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.833 | TFLOPs: 42.60 | +7: iteration 37780/ 115203 | consumed samples: 9671680 | consumed tokens: 19807600640 | elapsed time per iteration (s): 0.58 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 2.785646E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.286 | TFLOPs: 41.79 | +7: iteration 37790/ 115203 | consumed samples: 9674240 | consumed tokens: 19812843520 | elapsed time per iteration (s): 0.57 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 2.805581E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.720 | TFLOPs: 42.88 | +7: iteration 37800/ 115203 | consumed samples: 9676800 | consumed tokens: 19818086400 | elapsed time per iteration (s): 0.57 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 2.802692E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.419 | TFLOPs: 43.13 | +7: iteration 37810/ 115203 | consumed samples: 9679360 | consumed tokens: 19823329280 | elapsed time per iteration (s): 0.57 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 2.791769E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.298 | TFLOPs: 43.12 | +7: iteration 37820/ 115203 | consumed samples: 9681920 | consumed tokens: 19828572160 | elapsed time per iteration (s): 0.57 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 2.801145E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.200 | TFLOPs: 42.54 | +7: iteration 37830/ 115203 | consumed samples: 9684480 | consumed tokens: 19833815040 | elapsed time per iteration (s): 0.57 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 2.804982E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.471 | TFLOPs: 43.14 | +7: iteration 37840/ 115203 | consumed samples: 9687040 | consumed tokens: 19839057920 | elapsed time per iteration (s): 0.56 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 2.784546E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.650 | TFLOPs: 43.25 | +7: iteration 37850/ 115203 | consumed samples: 9689600 | consumed tokens: 19844300800 | elapsed time per iteration (s): 0.57 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 2.804033E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.795 | TFLOPs: 42.98 | +7: iteration 37860/ 115203 | consumed samples: 9692160 | consumed tokens: 19849543680 | elapsed time per iteration (s): 0.57 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 2.795830E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.375 | TFLOPs: 42.94 | +7: iteration 37870/ 115203 | consumed samples: 9694720 | consumed tokens: 19854786560 | elapsed time per iteration (s): 0.56 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 2.799191E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.343 | TFLOPs: 43.70 | +7: iteration 37880/ 115203 | consumed samples: 9697280 | consumed tokens: 19860029440 | elapsed time per iteration (s): 0.56 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 2.798683E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.771 | TFLOPs: 43.36 | +7: iteration 37890/ 115203 | consumed samples: 9699840 | consumed tokens: 19865272320 | elapsed time per iteration (s): 0.56 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 2.807769E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.444 | TFLOPs: 43.61 | +7: iteration 37900/ 115203 | consumed samples: 9702400 | consumed tokens: 19870515200 | elapsed time per iteration (s): 0.58 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 2.811920E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.796 | TFLOPs: 41.74 | +7: iteration 37910/ 115203 | consumed samples: 9704960 | consumed tokens: 19875758080 | elapsed time per iteration (s): 0.57 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 2.802043E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.136 | TFLOPs: 42.82 | +7: iteration 37920/ 115203 | consumed samples: 9707520 | consumed tokens: 19881000960 | elapsed time per iteration (s): 0.56 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 2.802920E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.158 | TFLOPs: 43.68 | +7: iteration 37930/ 115203 | consumed samples: 9710080 | consumed tokens: 19886243840 | elapsed time per iteration (s): 0.56 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 2.795139E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.377 | TFLOPs: 43.22 | +7: iteration 37940/ 115203 | consumed samples: 9712640 | consumed tokens: 19891486720 | elapsed time per iteration (s): 0.56 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 2.786988E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.773 | TFLOPs: 43.26 | +7: iteration 37950/ 115203 | consumed samples: 9715200 | consumed tokens: 19896729600 | elapsed time per iteration (s): 0.57 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 2.796127E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.535 | TFLOPs: 42.48 | +7: iteration 37960/ 115203 | consumed samples: 9717760 | consumed tokens: 19901972480 | elapsed time per iteration (s): 0.57 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 2.795465E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.771 | TFLOPs: 42.69 | +7: iteration 37970/ 115203 | consumed samples: 9720320 | consumed tokens: 19907215360 | elapsed time per iteration (s): 0.57 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 2.809183E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.703 | TFLOPs: 43.16 | +7: iteration 37980/ 115203 | consumed samples: 9722880 | consumed tokens: 19912458240 | elapsed time per iteration (s): 0.56 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 2.793802E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.942 | TFLOPs: 43.37 | +7: iteration 37990/ 115203 | consumed samples: 9725440 | consumed tokens: 19917701120 | elapsed time per iteration (s): 0.57 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 2.798048E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.498 | TFLOPs: 42.76 | +0: [2023-03-16 18:47:39,312] [INFO] [logging.py:68:log_dist] [Rank 0] step=38000, skipped=0, lr=[0.00015748667481842792, 0.00015748667481842792, 0.00015748667481842792], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 38000/ 115203 | consumed samples: 9728000 | consumed tokens: 19922944000 | elapsed time per iteration (s): 0.57 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 2.804614E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.920 | TFLOPs: 43.18 | +0: steps: 38000 loss: 2.8152 iter time (s): 0.561 samples/sec: 456.356 +7: iteration 38010/ 115203 | consumed samples: 9730560 | consumed tokens: 19928186880 | elapsed time per iteration (s): 0.58 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 2.804375E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.869 | TFLOPs: 42.13 | +7: iteration 38020/ 115203 | consumed samples: 9733120 | consumed tokens: 19933429760 | elapsed time per iteration (s): 0.56 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 2.784387E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.141 | TFLOPs: 43.58 | +7: iteration 38030/ 115203 | consumed samples: 9735680 | consumed tokens: 19938672640 | elapsed time per iteration (s): 0.56 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 2.796774E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.769 | TFLOPs: 43.55 | +7: iteration 38040/ 115203 | consumed samples: 9738240 | consumed tokens: 19943915520 | elapsed time per iteration (s): 0.57 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 2.793177E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.906 | TFLOPs: 43.08 | +7: iteration 38050/ 115203 | consumed samples: 9740800 | consumed tokens: 19949158400 | elapsed time per iteration (s): 0.57 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 2.791930E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.220 | TFLOPs: 42.73 | +7: iteration 38060/ 115203 | consumed samples: 9743360 | consumed tokens: 19954401280 | elapsed time per iteration (s): 0.57 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 2.802389E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.564 | TFLOPs: 42.77 | +7: iteration 38070/ 115203 | consumed samples: 9745920 | consumed tokens: 19959644160 | elapsed time per iteration (s): 0.57 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 2.802999E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.970 | TFLOPs: 42.61 | +7: iteration 38080/ 115203 | consumed samples: 9748480 | consumed tokens: 19964887040 | elapsed time per iteration (s): 0.58 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 2.789365E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.868 | TFLOPs: 42.41 | +7: iteration 38090/ 115203 | consumed samples: 9751040 | consumed tokens: 19970129920 | elapsed time per iteration (s): 0.57 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 2.809908E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.772 | TFLOPs: 42.88 | +7: iteration 38100/ 115203 | consumed samples: 9753600 | consumed tokens: 19975372800 | elapsed time per iteration (s): 0.56 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 2.792617E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.183 | TFLOPs: 43.59 | +7: iteration 38110/ 115203 | consumed samples: 9756160 | consumed tokens: 19980615680 | elapsed time per iteration (s): 0.58 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 2.792824E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.657 | TFLOPs: 41.73 | +7: iteration 38120/ 115203 | consumed samples: 9758720 | consumed tokens: 19985858560 | elapsed time per iteration (s): 0.58 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 2.798343E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.853 | TFLOPs: 42.03 | +7: iteration 38130/ 115203 | consumed samples: 9761280 | consumed tokens: 19991101440 | elapsed time per iteration (s): 0.57 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 2.798233E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.655 | TFLOPs: 43.06 | +7: iteration 38140/ 115203 | consumed samples: 9763840 | consumed tokens: 19996344320 | elapsed time per iteration (s): 0.57 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 2.795580E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.164 | TFLOPs: 43.11 | +7: iteration 38150/ 115203 | consumed samples: 9766400 | consumed tokens: 20001587200 | elapsed time per iteration (s): 0.57 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 2.805310E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.903 | TFLOPs: 43.08 | +7: iteration 38160/ 115203 | consumed samples: 9768960 | consumed tokens: 20006830080 | elapsed time per iteration (s): 0.57 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 2.798014E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.097 | TFLOPs: 42.63 | +7: iteration 38170/ 115203 | consumed samples: 9771520 | consumed tokens: 20012072960 | elapsed time per iteration (s): 0.56 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 2.797395E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.984 | TFLOPs: 43.57 | +7: iteration 38180/ 115203 | consumed samples: 9774080 | consumed tokens: 20017315840 | elapsed time per iteration (s): 0.57 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 2.792110E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.065 | TFLOPs: 43.10 | +7: iteration 38190/ 115203 | consumed samples: 9776640 | consumed tokens: 20022558720 | elapsed time per iteration (s): 0.57 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 2.812440E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.821 | TFLOPs: 42.98 | +7: iteration 38200/ 115203 | consumed samples: 9779200 | consumed tokens: 20027801600 | elapsed time per iteration (s): 0.57 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 2.796489E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.202 | TFLOPs: 42.83 | +7: iteration 38210/ 115203 | consumed samples: 9781760 | consumed tokens: 20033044480 | elapsed time per iteration (s): 0.57 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 2.795054E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.412 | TFLOPs: 42.47 | +7: iteration 38220/ 115203 | consumed samples: 9784320 | consumed tokens: 20038287360 | elapsed time per iteration (s): 0.59 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 2.802468E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.148 | TFLOPs: 41.68 | +7: iteration 38230/ 115203 | consumed samples: 9786880 | consumed tokens: 20043530240 | elapsed time per iteration (s): 0.59 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 2.799456E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.779 | TFLOPs: 41.55 | +7: iteration 38240/ 115203 | consumed samples: 9789440 | consumed tokens: 20048773120 | elapsed time per iteration (s): 0.58 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 2.818789E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.268 | TFLOPs: 42.26 | +7: iteration 38250/ 115203 | consumed samples: 9792000 | consumed tokens: 20054016000 | elapsed time per iteration (s): 0.57 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 2.791300E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.605 | TFLOPs: 42.86 | +7: iteration 38260/ 115203 | consumed samples: 9794560 | consumed tokens: 20059258880 | elapsed time per iteration (s): 0.57 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 2.794746E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.768 | TFLOPs: 42.50 | +7: iteration 38270/ 115203 | consumed samples: 9797120 | consumed tokens: 20064501760 | elapsed time per iteration (s): 0.57 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 2.802417E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.250 | TFLOPs: 42.64 | +7: iteration 38280/ 115203 | consumed samples: 9799680 | consumed tokens: 20069744640 | elapsed time per iteration (s): 0.56 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 2.792643E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.342 | TFLOPs: 43.60 | +7: iteration 38290/ 115203 | consumed samples: 9802240 | consumed tokens: 20074987520 | elapsed time per iteration (s): 0.57 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 2.797532E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.830 | TFLOPs: 43.17 | +7: iteration 38300/ 115203 | consumed samples: 9804800 | consumed tokens: 20080230400 | elapsed time per iteration (s): 0.56 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 2.808779E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.725 | TFLOPs: 43.26 | +7: iteration 38310/ 115203 | consumed samples: 9807360 | consumed tokens: 20085473280 | elapsed time per iteration (s): 0.57 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 2.793767E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.448 | TFLOPs: 42.75 | +7: iteration 38320/ 115203 | consumed samples: 9809920 | consumed tokens: 20090716160 | elapsed time per iteration (s): 0.57 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 2.813723E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.480 | TFLOPs: 42.47 | +7: iteration 38330/ 115203 | consumed samples: 9812480 | consumed tokens: 20095959040 | elapsed time per iteration (s): 0.57 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 2.798478E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.398 | TFLOPs: 42.65 | +7: iteration 38340/ 115203 | consumed samples: 9815040 | consumed tokens: 20101201920 | elapsed time per iteration (s): 0.56 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 2.794337E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.336 | TFLOPs: 43.79 | +7: iteration 38350/ 115203 | consumed samples: 9817600 | consumed tokens: 20106444800 | elapsed time per iteration (s): 0.59 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 2.790726E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.325 | TFLOPs: 41.69 | +7: iteration 38360/ 115203 | consumed samples: 9820160 | consumed tokens: 20111687680 | elapsed time per iteration (s): 0.58 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 2.787354E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.405 | TFLOPs: 41.80 | +7: iteration 38370/ 115203 | consumed samples: 9822720 | consumed tokens: 20116930560 | elapsed time per iteration (s): 0.57 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 2.807245E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.755 | TFLOPs: 43.07 | +7: iteration 38380/ 115203 | consumed samples: 9825280 | consumed tokens: 20122173440 | elapsed time per iteration (s): 0.58 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 2.808402E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.531 | TFLOPs: 41.90 | +7: iteration 38390/ 115203 | consumed samples: 9827840 | consumed tokens: 20127416320 | elapsed time per iteration (s): 0.57 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 2.787183E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.350 | TFLOPs: 43.13 | +7: iteration 38400/ 115203 | consumed samples: 9830400 | consumed tokens: 20132659200 | elapsed time per iteration (s): 0.57 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 2.799509E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.252 | TFLOPs: 43.02 | +7: iteration 38410/ 115203 | consumed samples: 9832960 | consumed tokens: 20137902080 | elapsed time per iteration (s): 0.58 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 2.782923E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.524 | TFLOPs: 42.38 | +7: iteration 38420/ 115203 | consumed samples: 9835520 | consumed tokens: 20143144960 | elapsed time per iteration (s): 0.56 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 2.806517E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.729 | TFLOPs: 43.26 | +7: iteration 38430/ 115203 | consumed samples: 9838080 | consumed tokens: 20148387840 | elapsed time per iteration (s): 0.56 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 2.789057E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.793 | TFLOPs: 43.45 | +7: iteration 38440/ 115203 | consumed samples: 9840640 | consumed tokens: 20153630720 | elapsed time per iteration (s): 0.58 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 2.820821E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.854 | TFLOPs: 42.22 | +7: iteration 38450/ 115203 | consumed samples: 9843200 | consumed tokens: 20158873600 | elapsed time per iteration (s): 0.57 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 2.802527E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.544 | TFLOPs: 42.57 | +7: iteration 38460/ 115203 | consumed samples: 9845760 | consumed tokens: 20164116480 | elapsed time per iteration (s): 0.56 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 2.787594E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.915 | TFLOPs: 43.56 | +7: iteration 38470/ 115203 | consumed samples: 9848320 | consumed tokens: 20169359360 | elapsed time per iteration (s): 0.56 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 2.799591E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.860 | TFLOPs: 43.94 | +7: iteration 38480/ 115203 | consumed samples: 9850880 | consumed tokens: 20174602240 | elapsed time per iteration (s): 0.57 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 2.777102E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.763 | TFLOPs: 43.17 | +7: iteration 38490/ 115203 | consumed samples: 9853440 | consumed tokens: 20179845120 | elapsed time per iteration (s): 0.57 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 2.801637E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.032 | TFLOPs: 43.10 | +7: iteration 38500/ 115203 | consumed samples: 9856000 | consumed tokens: 20185088000 | elapsed time per iteration (s): 0.56 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 2.809928E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.615 | TFLOPs: 43.25 | +7: iteration 38510/ 115203 | consumed samples: 9858560 | consumed tokens: 20190330880 | elapsed time per iteration (s): 0.57 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 2.786510E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.382 | TFLOPs: 43.03 | +7: iteration 38520/ 115203 | consumed samples: 9861120 | consumed tokens: 20195573760 | elapsed time per iteration (s): 0.58 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 2.779787E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.654 | TFLOPs: 42.01 | +7: iteration 38530/ 115203 | consumed samples: 9863680 | consumed tokens: 20200816640 | elapsed time per iteration (s): 0.58 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 2.805538E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.924 | TFLOPs: 42.42 | +7: iteration 38540/ 115203 | consumed samples: 9866240 | consumed tokens: 20206059520 | elapsed time per iteration (s): 0.59 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 2.789159E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.561 | TFLOPs: 41.43 | +7: iteration 38550/ 115203 | consumed samples: 9868800 | consumed tokens: 20211302400 | elapsed time per iteration (s): 0.59 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 2.783651E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.486 | TFLOPs: 41.33 | +7: iteration 38560/ 115203 | consumed samples: 9871360 | consumed tokens: 20216545280 | elapsed time per iteration (s): 0.57 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 2.798568E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.456 | TFLOPs: 43.04 | +7: iteration 38570/ 115203 | consumed samples: 9873920 | consumed tokens: 20221788160 | elapsed time per iteration (s): 0.56 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 2.802985E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.313 | TFLOPs: 43.60 | +7: iteration 38580/ 115203 | consumed samples: 9876480 | consumed tokens: 20227031040 | elapsed time per iteration (s): 0.58 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 2.802790E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.620 | TFLOPs: 42.20 | +7: iteration 38590/ 115203 | consumed samples: 9879040 | consumed tokens: 20232273920 | elapsed time per iteration (s): 0.57 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 2.807424E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.740 | TFLOPs: 43.16 | +7: iteration 38600/ 115203 | consumed samples: 9881600 | consumed tokens: 20237516800 | elapsed time per iteration (s): 0.56 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 2.784197E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.644 | TFLOPs: 43.25 | +7: iteration 38610/ 115203 | consumed samples: 9884160 | consumed tokens: 20242759680 | elapsed time per iteration (s): 0.58 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 2.799813E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.276 | TFLOPs: 42.26 | +7: iteration 38620/ 115203 | consumed samples: 9886720 | consumed tokens: 20248002560 | elapsed time per iteration (s): 0.57 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 2.789497E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.581 | TFLOPs: 42.86 | +7: iteration 38630/ 115203 | consumed samples: 9889280 | consumed tokens: 20253245440 | elapsed time per iteration (s): 0.57 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 2.791339E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.609 | TFLOPs: 42.87 | +7: iteration 38640/ 115203 | consumed samples: 9891840 | consumed tokens: 20258488320 | elapsed time per iteration (s): 0.58 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 2.788487E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.123 | TFLOPs: 41.77 | +7: iteration 38650/ 115203 | consumed samples: 9894400 | consumed tokens: 20263731200 | elapsed time per iteration (s): 0.58 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 2.796411E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.704 | TFLOPs: 42.11 | +7: iteration 38660/ 115203 | consumed samples: 9896960 | consumed tokens: 20268974080 | elapsed time per iteration (s): 0.58 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 2.791768E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.829 | TFLOPs: 41.93 | +7: iteration 38670/ 115203 | consumed samples: 9899520 | consumed tokens: 20274216960 | elapsed time per iteration (s): 0.56 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 2.785362E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.138 | TFLOPs: 43.49 | +7: iteration 38680/ 115203 | consumed samples: 9902080 | consumed tokens: 20279459840 | elapsed time per iteration (s): 0.59 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 2.791328E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.696 | TFLOPs: 41.35 | +7: iteration 38690/ 115203 | consumed samples: 9904640 | consumed tokens: 20284702720 | elapsed time per iteration (s): 0.58 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 2.781864E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.306 | TFLOPs: 42.26 | +7: iteration 38700/ 115203 | consumed samples: 9907200 | consumed tokens: 20289945600 | elapsed time per iteration (s): 0.60 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 2.779529E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.105 | TFLOPs: 40.91 | +7: iteration 38710/ 115203 | consumed samples: 9909760 | consumed tokens: 20295188480 | elapsed time per iteration (s): 0.57 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 2.791941E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.923 | TFLOPs: 42.61 | +7: iteration 38720/ 115203 | consumed samples: 9912320 | consumed tokens: 20300431360 | elapsed time per iteration (s): 0.58 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 2.789474E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.047 | TFLOPs: 42.43 | +7: iteration 38730/ 115203 | consumed samples: 9914880 | consumed tokens: 20305674240 | elapsed time per iteration (s): 0.57 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 2.789866E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.578 | TFLOPs: 42.48 | +7: iteration 38740/ 115203 | consumed samples: 9917440 | consumed tokens: 20310917120 | elapsed time per iteration (s): 0.56 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 2.780697E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.293 | TFLOPs: 43.31 | +7: iteration 38750/ 115203 | consumed samples: 9920000 | consumed tokens: 20316160000 | elapsed time per iteration (s): 0.57 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 2.798044E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.581 | TFLOPs: 42.67 | +7: iteration 38760/ 115203 | consumed samples: 9922560 | consumed tokens: 20321402880 | elapsed time per iteration (s): 0.57 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 2.808428E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.020 | TFLOPs: 43.00 | +7: iteration 38770/ 115203 | consumed samples: 9925120 | consumed tokens: 20326645760 | elapsed time per iteration (s): 0.56 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 2.807187E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.727 | TFLOPs: 43.45 | +7: iteration 38780/ 115203 | consumed samples: 9927680 | consumed tokens: 20331888640 | elapsed time per iteration (s): 0.58 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 2.789320E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.075 | TFLOPs: 42.24 | +7: iteration 38790/ 115203 | consumed samples: 9930240 | consumed tokens: 20337131520 | elapsed time per iteration (s): 0.58 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 2.792359E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.675 | TFLOPs: 42.20 | +7: iteration 38800/ 115203 | consumed samples: 9932800 | consumed tokens: 20342374400 | elapsed time per iteration (s): 0.58 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 2.805380E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.911 | TFLOPs: 42.23 | +7: iteration 38810/ 115203 | consumed samples: 9935360 | consumed tokens: 20347617280 | elapsed time per iteration (s): 0.56 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 2.793124E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.209 | TFLOPs: 43.40 | +7: iteration 38820/ 115203 | consumed samples: 9937920 | consumed tokens: 20352860160 | elapsed time per iteration (s): 0.58 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 2.803054E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.956 | TFLOPs: 41.85 | +7: iteration 38830/ 115203 | consumed samples: 9940480 | consumed tokens: 20358103040 | elapsed time per iteration (s): 0.59 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 2.792144E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.861 | TFLOPs: 41.65 | +7: iteration 38840/ 115203 | consumed samples: 9943040 | consumed tokens: 20363345920 | elapsed time per iteration (s): 0.57 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 2.808861E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.238 | TFLOPs: 42.64 | +7: iteration 38850/ 115203 | consumed samples: 9945600 | consumed tokens: 20368588800 | elapsed time per iteration (s): 0.57 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 2.793797E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.671 | TFLOPs: 42.97 | +7: iteration 38860/ 115203 | consumed samples: 9948160 | consumed tokens: 20373831680 | elapsed time per iteration (s): 0.57 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 2.790642E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.969 | TFLOPs: 42.52 | +7: iteration 38870/ 115203 | consumed samples: 9950720 | consumed tokens: 20379074560 | elapsed time per iteration (s): 0.57 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 2.797696E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.701 | TFLOPs: 42.87 | +7: iteration 38880/ 115203 | consumed samples: 9953280 | consumed tokens: 20384317440 | elapsed time per iteration (s): 0.57 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 2.799798E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.937 | TFLOPs: 43.09 | +7: iteration 38890/ 115203 | consumed samples: 9955840 | consumed tokens: 20389560320 | elapsed time per iteration (s): 0.58 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 2.790755E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.327 | TFLOPs: 42.17 | +7: iteration 38900/ 115203 | consumed samples: 9958400 | consumed tokens: 20394803200 | elapsed time per iteration (s): 0.57 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 2.784278E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.050 | TFLOPs: 42.72 | +7: iteration 38910/ 115203 | consumed samples: 9960960 | consumed tokens: 20400046080 | elapsed time per iteration (s): 0.57 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 2.795282E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.134 | TFLOPs: 43.11 | +7: iteration 38920/ 115203 | consumed samples: 9963520 | consumed tokens: 20405288960 | elapsed time per iteration (s): 0.58 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 2.780825E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.901 | TFLOPs: 41.75 | +7: iteration 38930/ 115203 | consumed samples: 9966080 | consumed tokens: 20410531840 | elapsed time per iteration (s): 0.58 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 2.795893E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.719 | TFLOPs: 42.40 | +7: iteration 38940/ 115203 | consumed samples: 9968640 | consumed tokens: 20415774720 | elapsed time per iteration (s): 0.57 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 2.788654E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.342 | TFLOPs: 42.55 | +7: iteration 38950/ 115203 | consumed samples: 9971200 | consumed tokens: 20421017600 | elapsed time per iteration (s): 0.59 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 2.784251E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.625 | TFLOPs: 41.34 | +7: iteration 38960/ 115203 | consumed samples: 9973760 | consumed tokens: 20426260480 | elapsed time per iteration (s): 0.57 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 2.789228E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.986 | TFLOPs: 43.09 | +7: iteration 38970/ 115203 | consumed samples: 9976320 | consumed tokens: 20431503360 | elapsed time per iteration (s): 0.58 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 2.797169E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.012 | TFLOPs: 42.14 | +7: iteration 38980/ 115203 | consumed samples: 9978880 | consumed tokens: 20436746240 | elapsed time per iteration (s): 0.58 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 2.790561E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.444 | TFLOPs: 42.28 | +7: iteration 38990/ 115203 | consumed samples: 9981440 | consumed tokens: 20441989120 | elapsed time per iteration (s): 0.56 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 2.797676E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.553 | TFLOPs: 43.34 | +7: iteration 39000/ 115203 | consumed samples: 9984000 | consumed tokens: 20447232000 | elapsed time per iteration (s): 0.57 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 2.786201E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.723 | TFLOPs: 42.88 | +7: iteration 39010/ 115203 | consumed samples: 9986560 | consumed tokens: 20452474880 | elapsed time per iteration (s): 0.57 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 2.802386E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.684 | TFLOPs: 43.06 | +7: iteration 39020/ 115203 | consumed samples: 9989120 | consumed tokens: 20457717760 | elapsed time per iteration (s): 0.58 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 2.787486E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.001 | TFLOPs: 42.43 | +7: iteration 39030/ 115203 | consumed samples: 9991680 | consumed tokens: 20462960640 | elapsed time per iteration (s): 0.56 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 2.784480E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.475 | TFLOPs: 43.62 | +7: iteration 39040/ 115203 | consumed samples: 9994240 | consumed tokens: 20468203520 | elapsed time per iteration (s): 0.58 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 2.785680E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.930 | TFLOPs: 42.42 | +7: iteration 39050/ 115203 | consumed samples: 9996800 | consumed tokens: 20473446400 | elapsed time per iteration (s): 0.56 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 2.791731E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.857 | TFLOPs: 43.56 | +7: iteration 39060/ 115203 | consumed samples: 9999360 | consumed tokens: 20478689280 | elapsed time per iteration (s): 0.57 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 2.786669E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.306 | TFLOPs: 43.03 | +7: iteration 39070/ 115203 | consumed samples: 10001920 | consumed tokens: 20483932160 | elapsed time per iteration (s): 0.57 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 2.789299E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.093 | TFLOPs: 42.53 | +7: iteration 39080/ 115203 | consumed samples: 10004480 | consumed tokens: 20489175040 | elapsed time per iteration (s): 0.57 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 2.788173E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.431 | TFLOPs: 42.94 | +7: iteration 39090/ 115203 | consumed samples: 10007040 | consumed tokens: 20494417920 | elapsed time per iteration (s): 0.58 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 2.797976E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.955 | TFLOPs: 42.33 | +7: iteration 39100/ 115203 | consumed samples: 10009600 | consumed tokens: 20499660800 | elapsed time per iteration (s): 0.56 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 2.785025E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.282 | TFLOPs: 43.50 | +7: iteration 39110/ 115203 | consumed samples: 10012160 | consumed tokens: 20504903680 | elapsed time per iteration (s): 0.57 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 2.802523E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.288 | TFLOPs: 43.03 | +7: iteration 39120/ 115203 | consumed samples: 10014720 | consumed tokens: 20510146560 | elapsed time per iteration (s): 0.57 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 2.778594E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.704 | TFLOPs: 43.16 | +7: iteration 39130/ 115203 | consumed samples: 10017280 | consumed tokens: 20515389440 | elapsed time per iteration (s): 0.57 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 2.792766E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.217 | TFLOPs: 42.64 | +7: iteration 39140/ 115203 | consumed samples: 10019840 | consumed tokens: 20520632320 | elapsed time per iteration (s): 0.57 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 2.794734E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.982 | TFLOPs: 42.81 | +7: iteration 39150/ 115203 | consumed samples: 10022400 | consumed tokens: 20525875200 | elapsed time per iteration (s): 0.57 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 2.790607E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.344 | TFLOPs: 42.46 | +7: iteration 39160/ 115203 | consumed samples: 10024960 | consumed tokens: 20531118080 | elapsed time per iteration (s): 0.58 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 2.795751E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.087 | TFLOPs: 42.15 | +7: iteration 39170/ 115203 | consumed samples: 10027520 | consumed tokens: 20536360960 | elapsed time per iteration (s): 0.57 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 2.798439E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.559 | TFLOPs: 43.05 | +7: iteration 39180/ 115203 | consumed samples: 10030080 | consumed tokens: 20541603840 | elapsed time per iteration (s): 0.58 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 2.786366E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.240 | TFLOPs: 41.97 | +7: iteration 39190/ 115203 | consumed samples: 10032640 | consumed tokens: 20546846720 | elapsed time per iteration (s): 0.57 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 2.789272E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.074 | TFLOPs: 43.10 | +7: iteration 39200/ 115203 | consumed samples: 10035200 | consumed tokens: 20552089600 | elapsed time per iteration (s): 0.58 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 2.810605E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.023 | TFLOPs: 42.05 | +7: iteration 39210/ 115203 | consumed samples: 10037760 | consumed tokens: 20557332480 | elapsed time per iteration (s): 0.60 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 2.796914E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.091 | TFLOPs: 41.00 | +7: iteration 39220/ 115203 | consumed samples: 10040320 | consumed tokens: 20562575360 | elapsed time per iteration (s): 0.59 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 2.778895E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.911 | TFLOPs: 41.65 | +7: iteration 39230/ 115203 | consumed samples: 10042880 | consumed tokens: 20567818240 | elapsed time per iteration (s): 0.59 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 2.778567E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.446 | TFLOPs: 41.71 | +7: iteration 39240/ 115203 | consumed samples: 10045440 | consumed tokens: 20573061120 | elapsed time per iteration (s): 0.58 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 2.790540E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.214 | TFLOPs: 42.35 | +7: iteration 39250/ 115203 | consumed samples: 10048000 | consumed tokens: 20578304000 | elapsed time per iteration (s): 0.56 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 2.778396E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.550 | TFLOPs: 43.34 | +7: iteration 39260/ 115203 | consumed samples: 10050560 | consumed tokens: 20583546880 | elapsed time per iteration (s): 0.57 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 2.797566E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.589 | TFLOPs: 42.86 | +7: iteration 39270/ 115203 | consumed samples: 10053120 | consumed tokens: 20588789760 | elapsed time per iteration (s): 0.58 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 2.796152E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.121 | TFLOPs: 41.77 | +7: iteration 39280/ 115203 | consumed samples: 10055680 | consumed tokens: 20594032640 | elapsed time per iteration (s): 0.60 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 2.792372E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.522 | TFLOPs: 40.76 | +7: iteration 39290/ 115203 | consumed samples: 10058240 | consumed tokens: 20599275520 | elapsed time per iteration (s): 0.57 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 2.795555E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.529 | TFLOPs: 42.67 | +7: iteration 39300/ 115203 | consumed samples: 10060800 | consumed tokens: 20604518400 | elapsed time per iteration (s): 0.58 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 2.803363E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.096 | TFLOPs: 41.86 | +7: iteration 39310/ 115203 | consumed samples: 10063360 | consumed tokens: 20609761280 | elapsed time per iteration (s): 0.59 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 2.771381E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.866 | TFLOPs: 41.56 | +7: iteration 39320/ 115203 | consumed samples: 10065920 | consumed tokens: 20615004160 | elapsed time per iteration (s): 0.57 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 2.791232E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.053 | TFLOPs: 43.00 | +7: iteration 39330/ 115203 | consumed samples: 10068480 | consumed tokens: 20620247040 | elapsed time per iteration (s): 0.57 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 2.785605E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.465 | TFLOPs: 43.14 | +7: iteration 39340/ 115203 | consumed samples: 10071040 | consumed tokens: 20625489920 | elapsed time per iteration (s): 0.57 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 2.781452E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.017 | TFLOPs: 42.81 | +7: iteration 39350/ 115203 | consumed samples: 10073600 | consumed tokens: 20630732800 | elapsed time per iteration (s): 0.56 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 2.773864E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.660 | TFLOPs: 43.63 | +7: iteration 39360/ 115203 | consumed samples: 10076160 | consumed tokens: 20635975680 | elapsed time per iteration (s): 0.57 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 2.786085E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.551 | TFLOPs: 42.86 | +7: iteration 39370/ 115203 | consumed samples: 10078720 | consumed tokens: 20641218560 | elapsed time per iteration (s): 0.57 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 2.789734E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.007 | TFLOPs: 42.52 | +7: iteration 39380/ 115203 | consumed samples: 10081280 | consumed tokens: 20646461440 | elapsed time per iteration (s): 0.57 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 2.787297E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.486 | TFLOPs: 42.47 | +7: iteration 39390/ 115203 | consumed samples: 10083840 | consumed tokens: 20651704320 | elapsed time per iteration (s): 0.58 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 2.775945E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.208 | TFLOPs: 42.16 | +7: iteration 39400/ 115203 | consumed samples: 10086400 | consumed tokens: 20656947200 | elapsed time per iteration (s): 0.58 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 2.797958E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.617 | TFLOPs: 42.29 | +7: iteration 39410/ 115203 | consumed samples: 10088960 | consumed tokens: 20662190080 | elapsed time per iteration (s): 0.57 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 2.795145E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.319 | TFLOPs: 42.65 | +7: iteration 39420/ 115203 | consumed samples: 10091520 | consumed tokens: 20667432960 | elapsed time per iteration (s): 0.58 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 2.794473E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.450 | TFLOPs: 42.37 | +7: iteration 39430/ 115203 | consumed samples: 10094080 | consumed tokens: 20672675840 | elapsed time per iteration (s): 0.57 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 2.794368E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.818 | TFLOPs: 42.60 | +7: iteration 39440/ 115203 | consumed samples: 10096640 | consumed tokens: 20677918720 | elapsed time per iteration (s): 0.58 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 2.791932E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.579 | TFLOPs: 42.10 | +7: iteration 39450/ 115203 | consumed samples: 10099200 | consumed tokens: 20683161600 | elapsed time per iteration (s): 0.55 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 2.791017E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.524 | TFLOPs: 44.00 | +7: iteration 39460/ 115203 | consumed samples: 10101760 | consumed tokens: 20688404480 | elapsed time per iteration (s): 0.60 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 2.784369E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.661 | TFLOPs: 40.58 | +7: iteration 39470/ 115203 | consumed samples: 10104320 | consumed tokens: 20693647360 | elapsed time per iteration (s): 0.57 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 2.790857E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.123 | TFLOPs: 42.53 | +7: iteration 39480/ 115203 | consumed samples: 10106880 | consumed tokens: 20698890240 | elapsed time per iteration (s): 0.59 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 2.798008E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.610 | TFLOPs: 41.53 | +7: iteration 39490/ 115203 | consumed samples: 10109440 | consumed tokens: 20704133120 | elapsed time per iteration (s): 0.59 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 2.789737E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.819 | TFLOPs: 41.36 | +7: iteration 39500/ 115203 | consumed samples: 10112000 | consumed tokens: 20709376000 | elapsed time per iteration (s): 0.59 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 2.787416E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.269 | TFLOPs: 41.31 | +7: iteration 39510/ 115203 | consumed samples: 10114560 | consumed tokens: 20714618880 | elapsed time per iteration (s): 0.58 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 2.789816E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.841 | TFLOPs: 42.12 | +7: iteration 39520/ 115203 | consumed samples: 10117120 | consumed tokens: 20719861760 | elapsed time per iteration (s): 0.58 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 2.798192E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.507 | TFLOPs: 42.09 | +7: iteration 39530/ 115203 | consumed samples: 10119680 | consumed tokens: 20725104640 | elapsed time per iteration (s): 0.57 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 2.784493E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.582 | TFLOPs: 42.67 | +7: iteration 39540/ 115203 | consumed samples: 10122240 | consumed tokens: 20730347520 | elapsed time per iteration (s): 0.58 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 2.781078E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.217 | TFLOPs: 42.16 | +7: iteration 39550/ 115203 | consumed samples: 10124800 | consumed tokens: 20735590400 | elapsed time per iteration (s): 0.59 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 2.790659E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.488 | TFLOPs: 41.71 | +7: iteration 39560/ 115203 | consumed samples: 10127360 | consumed tokens: 20740833280 | elapsed time per iteration (s): 0.58 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 2.797466E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.478 | TFLOPs: 42.09 | +7: iteration 39570/ 115203 | consumed samples: 10129920 | consumed tokens: 20746076160 | elapsed time per iteration (s): 0.58 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 2.789247E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.255 | TFLOPs: 42.35 | +7: iteration 39580/ 115203 | consumed samples: 10132480 | consumed tokens: 20751319040 | elapsed time per iteration (s): 0.59 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 2.802065E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.277 | TFLOPs: 41.40 | +7: iteration 39590/ 115203 | consumed samples: 10135040 | consumed tokens: 20756561920 | elapsed time per iteration (s): 0.60 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 2.792589E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.621 | TFLOPs: 40.86 | +7: iteration 39600/ 115203 | consumed samples: 10137600 | consumed tokens: 20761804800 | elapsed time per iteration (s): 0.57 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 2.793861E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.848 | TFLOPs: 42.89 | +7: iteration 39610/ 115203 | consumed samples: 10140160 | consumed tokens: 20767047680 | elapsed time per iteration (s): 0.59 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 2.782116E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.444 | TFLOPs: 41.51 | +7: iteration 39620/ 115203 | consumed samples: 10142720 | consumed tokens: 20772290560 | elapsed time per iteration (s): 0.59 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 2.795449E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.077 | TFLOPs: 41.48 | +7: iteration 39630/ 115203 | consumed samples: 10145280 | consumed tokens: 20777533440 | elapsed time per iteration (s): 0.58 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 2.775160E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.955 | TFLOPs: 42.23 | +7: iteration 39640/ 115203 | consumed samples: 10147840 | consumed tokens: 20782776320 | elapsed time per iteration (s): 0.56 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 2.783301E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.317 | TFLOPs: 43.70 | +7: iteration 39650/ 115203 | consumed samples: 10150400 | consumed tokens: 20788019200 | elapsed time per iteration (s): 0.59 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 2.785924E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.735 | TFLOPs: 41.45 | +7: iteration 39660/ 115203 | consumed samples: 10152960 | consumed tokens: 20793262080 | elapsed time per iteration (s): 0.58 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 2.791479E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.134 | TFLOPs: 41.87 | +7: iteration 39670/ 115203 | consumed samples: 10155520 | consumed tokens: 20798504960 | elapsed time per iteration (s): 0.58 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 2.798324E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.962 | TFLOPs: 42.33 | +7: iteration 39680/ 115203 | consumed samples: 10158080 | consumed tokens: 20803747840 | elapsed time per iteration (s): 0.57 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 2.781951E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.236 | TFLOPs: 42.73 | +7: iteration 39690/ 115203 | consumed samples: 10160640 | consumed tokens: 20808990720 | elapsed time per iteration (s): 0.56 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 2.779346E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.854 | TFLOPs: 43.27 | +7: iteration 39700/ 115203 | consumed samples: 10163200 | consumed tokens: 20814233600 | elapsed time per iteration (s): 0.59 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 2.805400E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.011 | TFLOPs: 41.38 | +7: iteration 39710/ 115203 | consumed samples: 10165760 | consumed tokens: 20819476480 | elapsed time per iteration (s): 0.57 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 2.785994E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.385 | TFLOPs: 42.65 | +7: iteration 39720/ 115203 | consumed samples: 10168320 | consumed tokens: 20824719360 | elapsed time per iteration (s): 0.57 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 2.794583E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.984 | TFLOPs: 42.62 | +7: iteration 39730/ 115203 | consumed samples: 10170880 | consumed tokens: 20829962240 | elapsed time per iteration (s): 0.57 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 2.799807E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.346 | TFLOPs: 42.84 | +7: iteration 39740/ 115203 | consumed samples: 10173440 | consumed tokens: 20835205120 | elapsed time per iteration (s): 0.57 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 2.783291E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.470 | TFLOPs: 42.66 | +7: iteration 39750/ 115203 | consumed samples: 10176000 | consumed tokens: 20840448000 | elapsed time per iteration (s): 0.56 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 2.778581E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.067 | TFLOPs: 43.48 | +7: iteration 39760/ 115203 | consumed samples: 10178560 | consumed tokens: 20845690880 | elapsed time per iteration (s): 0.56 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 2.790925E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.297 | TFLOPs: 43.22 | +7: iteration 39770/ 115203 | consumed samples: 10181120 | consumed tokens: 20850933760 | elapsed time per iteration (s): 0.57 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 2.788098E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.232 | TFLOPs: 42.45 | +7: iteration 39780/ 115203 | consumed samples: 10183680 | consumed tokens: 20856176640 | elapsed time per iteration (s): 0.57 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 2.778011E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.105 | TFLOPs: 42.91 | +7: iteration 39790/ 115203 | consumed samples: 10186240 | consumed tokens: 20861419520 | elapsed time per iteration (s): 0.56 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 2.792526E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.678 | TFLOPs: 43.25 | +7: iteration 39800/ 115203 | consumed samples: 10188800 | consumed tokens: 20866662400 | elapsed time per iteration (s): 0.58 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 2.776250E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.918 | TFLOPs: 41.85 | +7: iteration 39810/ 115203 | consumed samples: 10191360 | consumed tokens: 20871905280 | elapsed time per iteration (s): 0.59 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 2.777809E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.478 | TFLOPs: 41.52 | +7: iteration 39820/ 115203 | consumed samples: 10193920 | consumed tokens: 20877148160 | elapsed time per iteration (s): 0.59 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 2.807985E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.105 | TFLOPs: 41.29 | +7: iteration 39830/ 115203 | consumed samples: 10196480 | consumed tokens: 20882391040 | elapsed time per iteration (s): 0.56 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 2.796708E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.166 | TFLOPs: 43.49 | +7: iteration 39840/ 115203 | consumed samples: 10199040 | consumed tokens: 20887633920 | elapsed time per iteration (s): 0.60 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 2.794298E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.900 | TFLOPs: 40.80 | +7: iteration 39850/ 115203 | consumed samples: 10201600 | consumed tokens: 20892876800 | elapsed time per iteration (s): 0.57 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 2.778993E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.003 | TFLOPs: 42.90 | +7: iteration 39860/ 115203 | consumed samples: 10204160 | consumed tokens: 20898119680 | elapsed time per iteration (s): 0.58 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 2.777411E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.906 | TFLOPs: 41.94 | +7: iteration 39870/ 115203 | consumed samples: 10206720 | consumed tokens: 20903362560 | elapsed time per iteration (s): 0.58 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 2.791105E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.014 | TFLOPs: 42.43 | +7: iteration 39880/ 115203 | consumed samples: 10209280 | consumed tokens: 20908605440 | elapsed time per iteration (s): 0.57 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 2.790024E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.785 | TFLOPs: 42.60 | +7: iteration 39890/ 115203 | consumed samples: 10211840 | consumed tokens: 20913848320 | elapsed time per iteration (s): 0.58 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 2.789011E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.779 | TFLOPs: 41.74 | +7: iteration 39900/ 115203 | consumed samples: 10214400 | consumed tokens: 20919091200 | elapsed time per iteration (s): 0.56 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 2.789289E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.693 | TFLOPs: 43.92 | +7: iteration 39910/ 115203 | consumed samples: 10216960 | consumed tokens: 20924334080 | elapsed time per iteration (s): 0.57 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 2.793475E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.424 | TFLOPs: 43.13 | +7: iteration 39920/ 115203 | consumed samples: 10219520 | consumed tokens: 20929576960 | elapsed time per iteration (s): 0.56 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 2.789254E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.169 | TFLOPs: 43.49 | +7: iteration 39930/ 115203 | consumed samples: 10222080 | consumed tokens: 20934819840 | elapsed time per iteration (s): 0.56 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 2.778102E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.758 | TFLOPs: 43.26 | +7: iteration 39940/ 115203 | consumed samples: 10224640 | consumed tokens: 20940062720 | elapsed time per iteration (s): 0.57 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 2.776917E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.445 | TFLOPs: 42.85 | +7: iteration 39950/ 115203 | consumed samples: 10227200 | consumed tokens: 20945305600 | elapsed time per iteration (s): 0.57 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 2.794097E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.797 | TFLOPs: 42.69 | +7: iteration 39960/ 115203 | consumed samples: 10229760 | consumed tokens: 20950548480 | elapsed time per iteration (s): 0.57 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 2.772170E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.054 | TFLOPs: 43.00 | +7: iteration 39970/ 115203 | consumed samples: 10232320 | consumed tokens: 20955791360 | elapsed time per iteration (s): 0.58 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 2.782237E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.630 | TFLOPs: 42.30 | +7: iteration 39980/ 115203 | consumed samples: 10234880 | consumed tokens: 20961034240 | elapsed time per iteration (s): 0.57 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 2.781560E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.609 | TFLOPs: 43.15 | +7: iteration 39990/ 115203 | consumed samples: 10237440 | consumed tokens: 20966277120 | elapsed time per iteration (s): 0.58 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 2.780158E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.901 | TFLOPs: 41.84 | +0: [2023-03-16 19:06:46,953] [INFO] [logging.py:68:log_dist] [Rank 0] step=40000, skipped=0, lr=[0.0001532049360643911, 0.0001532049360643911, 0.0001532049360643911], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 40000/ 115203 | consumed samples: 10240000 | consumed tokens: 20971520000 | elapsed time per iteration (s): 0.58 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 2.789650E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.358 | TFLOPs: 41.98 | +0: steps: 40000 loss: 2.7817 iter time (s): 0.571 samples/sec: 448.014 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 40000 | lm loss value: 3.346216E+00 | lm loss PPL: 2.839509E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 40000 to checkpoints_421m60b400m +0: [2023-03-16 19:06:47,177] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step40000 is begin to save! +0: [2023-03-16 19:06:47,184] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_01-model_00-model_states.pt... +0: [2023-03-16 19:06:47,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_01-model_00-model_states.pt. +0: [2023-03-16 19:06:47,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_03-model_00-model_states.pt... +0: [2023-03-16 19:06:47,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_03-model_00-model_states.pt. +0: [2023-03-16 19:06:47,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_04-model_00-model_states.pt... +0: [2023-03-16 19:06:47,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_04-model_00-model_states.pt. +0: [2023-03-16 19:06:47,433] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_05-model_00-model_states.pt... +0: [2023-03-16 19:06:47,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_05-model_00-model_states.pt. +0: [2023-03-16 19:06:47,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_06-model_00-model_states.pt... +0: [2023-03-16 19:06:47,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_06-model_00-model_states.pt. +0: [2023-03-16 19:06:47,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_07-model_00-model_states.pt... +0: [2023-03-16 19:06:47,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_07-model_00-model_states.pt. +0: [2023-03-16 19:06:47,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_08-model_00-model_states.pt... +0: [2023-03-16 19:06:47,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_08-model_00-model_states.pt. +0: [2023-03-16 19:06:47,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_09-model_00-model_states.pt... +0: [2023-03-16 19:06:47,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_09-model_00-model_states.pt. +0: [2023-03-16 19:06:47,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_10-model_00-model_states.pt... +0: [2023-03-16 19:06:47,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_10-model_00-model_states.pt. +0: [2023-03-16 19:06:47,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_11-model_00-model_states.pt... +0: [2023-03-16 19:06:47,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_11-model_00-model_states.pt. +0: [2023-03-16 19:06:47,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_12-model_00-model_states.pt... +0: [2023-03-16 19:06:47,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_12-model_00-model_states.pt. +0: [2023-03-16 19:06:47,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_13-model_00-model_states.pt... +0: [2023-03-16 19:06:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_13-model_00-model_states.pt. +0: [2023-03-16 19:06:47,806] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_14-model_00-model_states.pt... +0: [2023-03-16 19:06:47,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_14-model_00-model_states.pt. +0: [2023-03-16 19:06:47,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_15-model_00-model_states.pt... +0: [2023-03-16 19:06:47,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_15-model_00-model_states.pt. +0: [2023-03-16 19:06:47,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_16-model_00-model_states.pt... +0: [2023-03-16 19:06:47,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_16-model_00-model_states.pt. +0: [2023-03-16 19:06:47,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_17-model_00-model_states.pt... +0: [2023-03-16 19:06:47,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_17-model_00-model_states.pt. +0: [2023-03-16 19:06:47,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_18-model_00-model_states.pt... +0: [2023-03-16 19:06:48,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_18-model_00-model_states.pt. +0: [2023-03-16 19:06:48,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_19-model_00-model_states.pt... +0: [2023-03-16 19:06:48,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_19-model_00-model_states.pt. +0: [2023-03-16 19:06:48,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_20-model_00-model_states.pt... +0: [2023-03-16 19:06:48,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_20-model_00-model_states.pt. +0: [2023-03-16 19:06:48,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/layer_22-model_00-model_states.pt... +0: [2023-03-16 19:06:48,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/layer_22-model_00-model_states.pt. +0: [2023-03-16 19:06:48,105] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step40000/mp_rank_00_model_states.pt +0: [2023-03-16 19:06:48,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/mp_rank_00_model_states.pt... +0: [2023-03-16 19:06:48,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/mp_rank_00_model_states.pt. +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-16 19:06:48,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-16 19:06:48,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-16 19:06:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 19:06:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 19:06:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 19:06:48,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-16 19:06:48,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-16 19:06:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 19:06:48,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 19:06:48,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-16 19:06:48,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 19:06:48,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-16 19:06:48,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 19:06:48,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-16 19:06:48,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-16 19:06:48,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: successfully saved checkpoint at iteration 40000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1218.77 +7: iteration 40010/ 115203 | consumed samples: 10242560 | consumed tokens: 20976762880 | elapsed time per iteration (s): 0.70 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 2.796806E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 365.698 | TFLOPs: 34.87 | +7: iteration 40020/ 115203 | consumed samples: 10245120 | consumed tokens: 20982005760 | elapsed time per iteration (s): 0.57 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 2.800131E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.820 | TFLOPs: 42.89 | +7: iteration 40030/ 115203 | consumed samples: 10247680 | consumed tokens: 20987248640 | elapsed time per iteration (s): 0.57 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 2.792057E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.040 | TFLOPs: 42.91 | +7: iteration 40040/ 115203 | consumed samples: 10250240 | consumed tokens: 20992491520 | elapsed time per iteration (s): 0.56 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 2.788392E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.969 | TFLOPs: 43.28 | +7: iteration 40050/ 115203 | consumed samples: 10252800 | consumed tokens: 20997734400 | elapsed time per iteration (s): 0.56 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 2.774157E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.112 | TFLOPs: 43.39 | +7: iteration 40060/ 115203 | consumed samples: 10255360 | consumed tokens: 21002977280 | elapsed time per iteration (s): 0.56 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 2.777560E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.852 | TFLOPs: 43.65 | +7: iteration 40070/ 115203 | consumed samples: 10257920 | consumed tokens: 21008220160 | elapsed time per iteration (s): 0.56 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 2.780723E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.461 | TFLOPs: 43.23 | +7: iteration 40080/ 115203 | consumed samples: 10260480 | consumed tokens: 21013463040 | elapsed time per iteration (s): 0.57 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 2.783653E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.994 | TFLOPs: 42.81 | +7: iteration 40090/ 115203 | consumed samples: 10263040 | consumed tokens: 21018705920 | elapsed time per iteration (s): 0.56 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 2.785634E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.594 | TFLOPs: 43.72 | +7: iteration 40100/ 115203 | consumed samples: 10265600 | consumed tokens: 21023948800 | elapsed time per iteration (s): 0.56 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 2.781016E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.434 | TFLOPs: 43.33 | +7: iteration 40110/ 115203 | consumed samples: 10268160 | consumed tokens: 21029191680 | elapsed time per iteration (s): 0.57 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 2.795520E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.136 | TFLOPs: 42.63 | +7: iteration 40120/ 115203 | consumed samples: 10270720 | consumed tokens: 21034434560 | elapsed time per iteration (s): 0.57 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 2.773658E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.756 | TFLOPs: 42.88 | +7: iteration 40130/ 115203 | consumed samples: 10273280 | consumed tokens: 21039677440 | elapsed time per iteration (s): 0.57 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 2.792549E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.670 | TFLOPs: 42.97 | +7: iteration 40140/ 115203 | consumed samples: 10275840 | consumed tokens: 21044920320 | elapsed time per iteration (s): 0.57 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 2.791405E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.405 | TFLOPs: 42.75 | +7: iteration 40150/ 115203 | consumed samples: 10278400 | consumed tokens: 21050163200 | elapsed time per iteration (s): 0.57 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 2.781557E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.616 | TFLOPs: 43.15 | +7: iteration 40160/ 115203 | consumed samples: 10280960 | consumed tokens: 21055406080 | elapsed time per iteration (s): 0.58 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 2.781870E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.871 | TFLOPs: 42.32 | +7: iteration 40170/ 115203 | consumed samples: 10283520 | consumed tokens: 21060648960 | elapsed time per iteration (s): 0.56 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 2.779003E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.466 | TFLOPs: 43.42 | +7: iteration 40180/ 115203 | consumed samples: 10286080 | consumed tokens: 21065891840 | elapsed time per iteration (s): 0.57 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 2.776140E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.906 | TFLOPs: 42.80 | +7: iteration 40190/ 115203 | consumed samples: 10288640 | consumed tokens: 21071134720 | elapsed time per iteration (s): 0.57 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 2.779948E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.594 | TFLOPs: 42.96 | +7: iteration 40200/ 115203 | consumed samples: 10291200 | consumed tokens: 21076377600 | elapsed time per iteration (s): 0.58 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 2.780303E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.566 | TFLOPs: 42.19 | +7: iteration 40210/ 115203 | consumed samples: 10293760 | consumed tokens: 21081620480 | elapsed time per iteration (s): 0.56 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 2.797301E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.619 | TFLOPs: 43.44 | +7: iteration 40220/ 115203 | consumed samples: 10296320 | consumed tokens: 21086863360 | elapsed time per iteration (s): 0.55 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 2.792381E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.417 | TFLOPs: 43.99 | +7: iteration 40230/ 115203 | consumed samples: 10298880 | consumed tokens: 21092106240 | elapsed time per iteration (s): 0.58 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 2.788142E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.641 | TFLOPs: 42.11 | +7: iteration 40240/ 115203 | consumed samples: 10301440 | consumed tokens: 21097349120 | elapsed time per iteration (s): 0.57 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 2.782286E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.157 | TFLOPs: 42.82 | +7: iteration 40250/ 115203 | consumed samples: 10304000 | consumed tokens: 21102592000 | elapsed time per iteration (s): 0.58 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 2.797266E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.449 | TFLOPs: 41.99 | +7: iteration 40260/ 115203 | consumed samples: 10306560 | consumed tokens: 21107834880 | elapsed time per iteration (s): 0.59 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 2.766638E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.419 | TFLOPs: 41.23 | +7: iteration 40270/ 115203 | consumed samples: 10309120 | consumed tokens: 21113077760 | elapsed time per iteration (s): 0.57 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 2.789301E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.877 | TFLOPs: 42.80 | +7: iteration 40280/ 115203 | consumed samples: 10311680 | consumed tokens: 21118320640 | elapsed time per iteration (s): 0.56 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 2.793229E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.285 | TFLOPs: 43.60 | +7: iteration 40290/ 115203 | consumed samples: 10314240 | consumed tokens: 21123563520 | elapsed time per iteration (s): 0.56 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 2.783847E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.999 | TFLOPs: 43.57 | +7: iteration 40300/ 115203 | consumed samples: 10316800 | consumed tokens: 21128806400 | elapsed time per iteration (s): 0.57 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 2.774733E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.405 | TFLOPs: 42.56 | +7: iteration 40310/ 115203 | consumed samples: 10319360 | consumed tokens: 21134049280 | elapsed time per iteration (s): 0.57 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 2.794887E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.245 | TFLOPs: 42.64 | +7: iteration 40320/ 115203 | consumed samples: 10321920 | consumed tokens: 21139292160 | elapsed time per iteration (s): 0.56 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 2.774374E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.277 | TFLOPs: 43.60 | +7: iteration 40330/ 115203 | consumed samples: 10324480 | consumed tokens: 21144535040 | elapsed time per iteration (s): 0.56 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 2.800655E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.091 | TFLOPs: 43.48 | +7: iteration 40340/ 115203 | consumed samples: 10327040 | consumed tokens: 21149777920 | elapsed time per iteration (s): 0.57 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 2.793154E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.020 | TFLOPs: 43.10 | +7: iteration 40350/ 115203 | consumed samples: 10329600 | consumed tokens: 21155020800 | elapsed time per iteration (s): 0.57 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 2.780543E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.175 | TFLOPs: 43.11 | +7: iteration 40360/ 115203 | consumed samples: 10332160 | consumed tokens: 21160263680 | elapsed time per iteration (s): 0.57 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 2.798195E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.683 | TFLOPs: 42.97 | +7: iteration 40370/ 115203 | consumed samples: 10334720 | consumed tokens: 21165506560 | elapsed time per iteration (s): 0.58 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 2.778710E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.796 | TFLOPs: 42.12 | +7: iteration 40380/ 115203 | consumed samples: 10337280 | consumed tokens: 21170749440 | elapsed time per iteration (s): 0.56 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 2.784589E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.466 | TFLOPs: 43.61 | +7: iteration 40390/ 115203 | consumed samples: 10339840 | consumed tokens: 21175992320 | elapsed time per iteration (s): 0.55 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 2.777056E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 40400/ 115203 | consumed samples: 10342400 | consumed tokens: 21181235200 | elapsed time per iteration (s): 0.56 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 2.787491E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.369 | TFLOPs: 43.32 | +7: iteration 40410/ 115203 | consumed samples: 10344960 | consumed tokens: 21186478080 | elapsed time per iteration (s): 0.57 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 2.789509E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.763 | TFLOPs: 42.88 | +7: iteration 40420/ 115203 | consumed samples: 10347520 | consumed tokens: 21191720960 | elapsed time per iteration (s): 0.55 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 2.776640E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.781 | TFLOPs: 44.03 | +7: iteration 40430/ 115203 | consumed samples: 10350080 | consumed tokens: 21196963840 | elapsed time per iteration (s): 0.56 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 2.777116E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.766 | TFLOPs: 43.64 | +7: iteration 40440/ 115203 | consumed samples: 10352640 | consumed tokens: 21202206720 | elapsed time per iteration (s): 0.58 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 2.787581E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.422 | TFLOPs: 42.18 | +7: iteration 40450/ 115203 | consumed samples: 10355200 | consumed tokens: 21207449600 | elapsed time per iteration (s): 0.58 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 2.770240E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.040 | TFLOPs: 42.43 | +7: iteration 40460/ 115203 | consumed samples: 10357760 | consumed tokens: 21212692480 | elapsed time per iteration (s): 0.57 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 2.785069E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.382 | TFLOPs: 42.65 | +7: iteration 40470/ 115203 | consumed samples: 10360320 | consumed tokens: 21217935360 | elapsed time per iteration (s): 0.55 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 2.795917E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.506 | TFLOPs: 44.00 | +7: iteration 40480/ 115203 | consumed samples: 10362880 | consumed tokens: 21223178240 | elapsed time per iteration (s): 0.57 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 2.793424E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.792 | TFLOPs: 42.98 | +7: iteration 40490/ 115203 | consumed samples: 10365440 | consumed tokens: 21228421120 | elapsed time per iteration (s): 0.57 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 2.787935E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.707 | TFLOPs: 42.59 | +7: iteration 40500/ 115203 | consumed samples: 10368000 | consumed tokens: 21233664000 | elapsed time per iteration (s): 0.55 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 2.766809E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.575 | TFLOPs: 44.01 | +7: iteration 40510/ 115203 | consumed samples: 10370560 | consumed tokens: 21238906880 | elapsed time per iteration (s): 0.57 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 2.784205E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.035 | TFLOPs: 43.00 | +7: iteration 40520/ 115203 | consumed samples: 10373120 | consumed tokens: 21244149760 | elapsed time per iteration (s): 0.56 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 2.788609E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.689 | TFLOPs: 43.45 | +7: iteration 40530/ 115203 | consumed samples: 10375680 | consumed tokens: 21249392640 | elapsed time per iteration (s): 0.56 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 2.770364E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.634 | TFLOPs: 43.73 | +7: iteration 40540/ 115203 | consumed samples: 10378240 | consumed tokens: 21254635520 | elapsed time per iteration (s): 0.56 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 2.784189E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.978 | TFLOPs: 43.47 | +7: iteration 40550/ 115203 | consumed samples: 10380800 | consumed tokens: 21259878400 | elapsed time per iteration (s): 0.55 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 2.776780E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.497 | TFLOPs: 44.00 | +7: iteration 40560/ 115203 | consumed samples: 10383360 | consumed tokens: 21265121280 | elapsed time per iteration (s): 0.56 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 2.787922E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.685 | TFLOPs: 43.44 | +7: iteration 40570/ 115203 | consumed samples: 10385920 | consumed tokens: 21270364160 | elapsed time per iteration (s): 0.56 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 2.781228E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.987 | TFLOPs: 43.38 | +7: iteration 40580/ 115203 | consumed samples: 10388480 | consumed tokens: 21275607040 | elapsed time per iteration (s): 0.56 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 2.781793E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.919 | TFLOPs: 43.47 | +7: iteration 40590/ 115203 | consumed samples: 10391040 | consumed tokens: 21280849920 | elapsed time per iteration (s): 0.57 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 2.775661E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.865 | TFLOPs: 42.79 | +7: iteration 40600/ 115203 | consumed samples: 10393600 | consumed tokens: 21286092800 | elapsed time per iteration (s): 0.57 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 2.793769E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.221 | TFLOPs: 43.11 | +7: iteration 40610/ 115203 | consumed samples: 10396160 | consumed tokens: 21291335680 | elapsed time per iteration (s): 0.57 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 2.771354E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.984 | TFLOPs: 43.19 | +7: iteration 40620/ 115203 | consumed samples: 10398720 | consumed tokens: 21296578560 | elapsed time per iteration (s): 0.56 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 2.773716E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.037 | TFLOPs: 43.29 | +7: iteration 40630/ 115203 | consumed samples: 10401280 | consumed tokens: 21301821440 | elapsed time per iteration (s): 0.57 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 2.785266E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.572 | TFLOPs: 43.15 | +7: iteration 40640/ 115203 | consumed samples: 10403840 | consumed tokens: 21307064320 | elapsed time per iteration (s): 0.58 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 2.785091E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.722 | TFLOPs: 42.30 | +7: iteration 40650/ 115203 | consumed samples: 10406400 | consumed tokens: 21312307200 | elapsed time per iteration (s): 0.56 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 2.785725E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.118 | TFLOPs: 43.58 | +7: iteration 40660/ 115203 | consumed samples: 10408960 | consumed tokens: 21317550080 | elapsed time per iteration (s): 0.56 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 2.789027E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.316 | TFLOPs: 43.60 | +7: iteration 40670/ 115203 | consumed samples: 10411520 | consumed tokens: 21322792960 | elapsed time per iteration (s): 0.57 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 2.781752E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.493 | TFLOPs: 42.95 | +7: iteration 40680/ 115203 | consumed samples: 10414080 | consumed tokens: 21328035840 | elapsed time per iteration (s): 0.56 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 2.791639E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.066 | TFLOPs: 43.29 | +7: iteration 40690/ 115203 | consumed samples: 10416640 | consumed tokens: 21333278720 | elapsed time per iteration (s): 0.57 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 2.779940E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.941 | TFLOPs: 42.80 | +7: iteration 40700/ 115203 | consumed samples: 10419200 | consumed tokens: 21338521600 | elapsed time per iteration (s): 0.56 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 2.777761E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.665 | TFLOPs: 43.73 | +7: iteration 40710/ 115203 | consumed samples: 10421760 | consumed tokens: 21343764480 | elapsed time per iteration (s): 0.57 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 2.790011E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.894 | TFLOPs: 42.99 | +7: iteration 40720/ 115203 | consumed samples: 10424320 | consumed tokens: 21349007360 | elapsed time per iteration (s): 0.56 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 2.772671E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.955 | TFLOPs: 43.57 | +7: iteration 40730/ 115203 | consumed samples: 10426880 | consumed tokens: 21354250240 | elapsed time per iteration (s): 0.57 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 2.772051E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.005 | TFLOPs: 43.09 | +7: iteration 40740/ 115203 | consumed samples: 10429440 | consumed tokens: 21359493120 | elapsed time per iteration (s): 0.56 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 2.787169E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.284 | TFLOPs: 43.31 | +7: iteration 40750/ 115203 | consumed samples: 10432000 | consumed tokens: 21364736000 | elapsed time per iteration (s): 0.56 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 2.780425E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.422 | TFLOPs: 43.71 | +7: iteration 40760/ 115203 | consumed samples: 10434560 | consumed tokens: 21369978880 | elapsed time per iteration (s): 0.56 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 2.771974E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.686 | TFLOPs: 43.54 | +7: iteration 40770/ 115203 | consumed samples: 10437120 | consumed tokens: 21375221760 | elapsed time per iteration (s): 0.56 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 2.765811E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.561 | TFLOPs: 43.24 | +7: iteration 40780/ 115203 | consumed samples: 10439680 | consumed tokens: 21380464640 | elapsed time per iteration (s): 0.56 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 2.791412E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.737 | TFLOPs: 43.26 | +7: iteration 40790/ 115203 | consumed samples: 10442240 | consumed tokens: 21385707520 | elapsed time per iteration (s): 0.57 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 2.776723E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.984 | TFLOPs: 42.62 | +7: iteration 40800/ 115203 | consumed samples: 10444800 | consumed tokens: 21390950400 | elapsed time per iteration (s): 0.57 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 2.782945E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.323 | TFLOPs: 42.74 | +7: iteration 40810/ 115203 | consumed samples: 10447360 | consumed tokens: 21396193280 | elapsed time per iteration (s): 0.56 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 2.790625E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.458 | TFLOPs: 43.71 | +7: iteration 40820/ 115203 | consumed samples: 10449920 | consumed tokens: 21401436160 | elapsed time per iteration (s): 0.56 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 2.773959E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.469 | TFLOPs: 43.23 | +7: iteration 40830/ 115203 | consumed samples: 10452480 | consumed tokens: 21406679040 | elapsed time per iteration (s): 0.56 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 2.781802E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.503 | TFLOPs: 43.71 | +7: iteration 40840/ 115203 | consumed samples: 10455040 | consumed tokens: 21411921920 | elapsed time per iteration (s): 0.56 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 2.764668E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.458 | TFLOPs: 43.42 | +7: iteration 40850/ 115203 | consumed samples: 10457600 | consumed tokens: 21417164800 | elapsed time per iteration (s): 0.57 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 2.779792E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.160 | TFLOPs: 43.01 | +7: iteration 40860/ 115203 | consumed samples: 10460160 | consumed tokens: 21422407680 | elapsed time per iteration (s): 0.56 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 2.783407E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.384 | TFLOPs: 43.70 | +7: iteration 40870/ 115203 | consumed samples: 10462720 | consumed tokens: 21427650560 | elapsed time per iteration (s): 0.57 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 2.774480E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.225 | TFLOPs: 43.11 | +7: iteration 40880/ 115203 | consumed samples: 10465280 | consumed tokens: 21432893440 | elapsed time per iteration (s): 0.56 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 2.780024E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.089 | TFLOPs: 43.86 | +7: iteration 40890/ 115203 | consumed samples: 10467840 | consumed tokens: 21438136320 | elapsed time per iteration (s): 0.56 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 2.774326E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.003 | TFLOPs: 43.86 | +7: iteration 40900/ 115203 | consumed samples: 10470400 | consumed tokens: 21443379200 | elapsed time per iteration (s): 0.56 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 2.781932E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.603 | TFLOPs: 43.44 | +7: iteration 40910/ 115203 | consumed samples: 10472960 | consumed tokens: 21448622080 | elapsed time per iteration (s): 0.56 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 2.780143E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.801 | TFLOPs: 43.55 | +7: iteration 40920/ 115203 | consumed samples: 10475520 | consumed tokens: 21453864960 | elapsed time per iteration (s): 0.56 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 2.783722E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.534 | TFLOPs: 43.33 | +7: iteration 40930/ 115203 | consumed samples: 10478080 | consumed tokens: 21459107840 | elapsed time per iteration (s): 0.57 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 2.768107E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.155 | TFLOPs: 42.63 | +7: iteration 40940/ 115203 | consumed samples: 10480640 | consumed tokens: 21464350720 | elapsed time per iteration (s): 0.56 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 2.792841E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.352 | TFLOPs: 43.60 | +7: iteration 40950/ 115203 | consumed samples: 10483200 | consumed tokens: 21469593600 | elapsed time per iteration (s): 0.57 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 2.772756E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.038 | TFLOPs: 43.19 | +7: iteration 40960/ 115203 | consumed samples: 10485760 | consumed tokens: 21474836480 | elapsed time per iteration (s): 0.56 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 2.783853E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.727 | TFLOPs: 43.73 | +7: iteration 40970/ 115203 | consumed samples: 10488320 | consumed tokens: 21480079360 | elapsed time per iteration (s): 0.56 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 2.771629E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.514 | TFLOPs: 43.62 | +7: iteration 40980/ 115203 | consumed samples: 10490880 | consumed tokens: 21485322240 | elapsed time per iteration (s): 0.58 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 2.777712E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.028 | TFLOPs: 42.43 | +7: iteration 40990/ 115203 | consumed samples: 10493440 | consumed tokens: 21490565120 | elapsed time per iteration (s): 0.57 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 2.783930E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.292 | TFLOPs: 43.12 | +7: iteration 41000/ 115203 | consumed samples: 10496000 | consumed tokens: 21495808000 | elapsed time per iteration (s): 0.56 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 2.785442E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.436 | TFLOPs: 43.71 | +7: iteration 41010/ 115203 | consumed samples: 10498560 | consumed tokens: 21501050880 | elapsed time per iteration (s): 0.56 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 2.773631E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.356 | TFLOPs: 43.22 | +7: iteration 41020/ 115203 | consumed samples: 10501120 | consumed tokens: 21506293760 | elapsed time per iteration (s): 0.58 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 2.778776E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.912 | TFLOPs: 42.04 | +7: iteration 41030/ 115203 | consumed samples: 10503680 | consumed tokens: 21511536640 | elapsed time per iteration (s): 0.56 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 2.793800E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.213 | TFLOPs: 43.69 | +7: iteration 41040/ 115203 | consumed samples: 10506240 | consumed tokens: 21516779520 | elapsed time per iteration (s): 0.57 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 2.777016E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.621 | TFLOPs: 43.15 | +7: iteration 41050/ 115203 | consumed samples: 10508800 | consumed tokens: 21522022400 | elapsed time per iteration (s): 0.56 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 2.779930E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.433 | TFLOPs: 43.71 | +7: iteration 41060/ 115203 | consumed samples: 10511360 | consumed tokens: 21527265280 | elapsed time per iteration (s): 0.56 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 2.774750E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.744 | TFLOPs: 43.74 | +7: iteration 41070/ 115203 | consumed samples: 10513920 | consumed tokens: 21532508160 | elapsed time per iteration (s): 0.56 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 2.768023E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.295 | TFLOPs: 43.31 | +7: iteration 41080/ 115203 | consumed samples: 10516480 | consumed tokens: 21537751040 | elapsed time per iteration (s): 0.56 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 2.772200E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.612 | TFLOPs: 43.25 | +7: iteration 41090/ 115203 | consumed samples: 10519040 | consumed tokens: 21542993920 | elapsed time per iteration (s): 0.57 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 2.784681E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.784 | TFLOPs: 43.17 | +7: iteration 41100/ 115203 | consumed samples: 10521600 | consumed tokens: 21548236800 | elapsed time per iteration (s): 0.55 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 2.786468E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.641 | TFLOPs: 44.01 | +7: iteration 41110/ 115203 | consumed samples: 10524160 | consumed tokens: 21553479680 | elapsed time per iteration (s): 0.55 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 2.769992E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.521 | TFLOPs: 44.00 | +7: iteration 41120/ 115203 | consumed samples: 10526720 | consumed tokens: 21558722560 | elapsed time per iteration (s): 0.57 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 2.776741E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.671 | TFLOPs: 42.68 | +7: iteration 41130/ 115203 | consumed samples: 10529280 | consumed tokens: 21563965440 | elapsed time per iteration (s): 0.57 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 2.779808E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.520 | TFLOPs: 43.05 | +7: iteration 41140/ 115203 | consumed samples: 10531840 | consumed tokens: 21569208320 | elapsed time per iteration (s): 0.58 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 2.800836E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.638 | TFLOPs: 42.39 | +7: iteration 41150/ 115203 | consumed samples: 10534400 | consumed tokens: 21574451200 | elapsed time per iteration (s): 0.55 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 2.790497E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.618 | TFLOPs: 44.01 | +7: iteration 41160/ 115203 | consumed samples: 10536960 | consumed tokens: 21579694080 | elapsed time per iteration (s): 0.56 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 2.779474E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.057 | TFLOPs: 43.48 | +7: iteration 41170/ 115203 | consumed samples: 10539520 | consumed tokens: 21584936960 | elapsed time per iteration (s): 0.57 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 2.788221E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.976 | TFLOPs: 43.09 | +7: iteration 41180/ 115203 | consumed samples: 10542080 | consumed tokens: 21590179840 | elapsed time per iteration (s): 0.56 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 2.786797E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.976 | TFLOPs: 43.47 | +7: iteration 41190/ 115203 | consumed samples: 10544640 | consumed tokens: 21595422720 | elapsed time per iteration (s): 0.55 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 2.773515E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.551 | TFLOPs: 44.00 | +7: iteration 41200/ 115203 | consumed samples: 10547200 | consumed tokens: 21600665600 | elapsed time per iteration (s): 0.56 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 2.788914E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.789 | TFLOPs: 43.26 | +7: iteration 41210/ 115203 | consumed samples: 10549760 | consumed tokens: 21605908480 | elapsed time per iteration (s): 0.56 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 2.772530E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.397 | TFLOPs: 43.51 | +7: iteration 41220/ 115203 | consumed samples: 10552320 | consumed tokens: 21611151360 | elapsed time per iteration (s): 0.57 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 2.780496E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.177 | TFLOPs: 43.11 | +7: iteration 41230/ 115203 | consumed samples: 10554880 | consumed tokens: 21616394240 | elapsed time per iteration (s): 0.55 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 2.778746E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 41240/ 115203 | consumed samples: 10557440 | consumed tokens: 21621637120 | elapsed time per iteration (s): 0.57 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 2.771782E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.536 | TFLOPs: 42.95 | +7: iteration 41250/ 115203 | consumed samples: 10560000 | consumed tokens: 21626880000 | elapsed time per iteration (s): 0.56 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 2.795960E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.032 | TFLOPs: 43.57 | +7: iteration 41260/ 115203 | consumed samples: 10562560 | consumed tokens: 21632122880 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 2.781831E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.297 | TFLOPs: 43.41 | +7: iteration 41270/ 115203 | consumed samples: 10565120 | consumed tokens: 21637365760 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 2.773231E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.520 | TFLOPs: 43.71 | +7: iteration 41280/ 115203 | consumed samples: 10567680 | consumed tokens: 21642608640 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 2.767542E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.398 | TFLOPs: 43.80 | +7: iteration 41290/ 115203 | consumed samples: 10570240 | consumed tokens: 21647851520 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 2.783990E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.914 | TFLOPs: 43.66 | +7: iteration 41300/ 115203 | consumed samples: 10572800 | consumed tokens: 21653094400 | elapsed time per iteration (s): 0.56 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 2.784410E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.405 | TFLOPs: 43.42 | +7: iteration 41310/ 115203 | consumed samples: 10575360 | consumed tokens: 21658337280 | elapsed time per iteration (s): 0.56 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 2.778465E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.870 | TFLOPs: 43.65 | +7: iteration 41320/ 115203 | consumed samples: 10577920 | consumed tokens: 21663580160 | elapsed time per iteration (s): 0.57 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 2.776429E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.477 | TFLOPs: 42.85 | +7: iteration 41330/ 115203 | consumed samples: 10580480 | consumed tokens: 21668823040 | elapsed time per iteration (s): 0.56 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 2.784536E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.117 | TFLOPs: 43.39 | +7: iteration 41340/ 115203 | consumed samples: 10583040 | consumed tokens: 21674065920 | elapsed time per iteration (s): 0.58 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 2.794807E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.229 | TFLOPs: 42.07 | +7: iteration 41350/ 115203 | consumed samples: 10585600 | consumed tokens: 21679308800 | elapsed time per iteration (s): 0.57 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 2.781229E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.798 | TFLOPs: 42.79 | +7: iteration 41360/ 115203 | consumed samples: 10588160 | consumed tokens: 21684551680 | elapsed time per iteration (s): 0.56 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 2.780564E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.062 | TFLOPs: 43.58 | +7: iteration 41370/ 115203 | consumed samples: 10590720 | consumed tokens: 21689794560 | elapsed time per iteration (s): 0.56 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 2.776715E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.305 | TFLOPs: 43.50 | +7: iteration 41380/ 115203 | consumed samples: 10593280 | consumed tokens: 21695037440 | elapsed time per iteration (s): 0.57 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 2.781083E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.658 | TFLOPs: 42.97 | +7: iteration 41390/ 115203 | consumed samples: 10595840 | consumed tokens: 21700280320 | elapsed time per iteration (s): 0.56 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 2.772647E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.926 | TFLOPs: 43.47 | +7: iteration 41400/ 115203 | consumed samples: 10598400 | consumed tokens: 21705523200 | elapsed time per iteration (s): 0.56 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 2.768118E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.327 | TFLOPs: 43.51 | +7: iteration 41410/ 115203 | consumed samples: 10600960 | consumed tokens: 21710766080 | elapsed time per iteration (s): 0.56 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 2.766783E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.578 | TFLOPs: 43.43 | +7: iteration 41420/ 115203 | consumed samples: 10603520 | consumed tokens: 21716008960 | elapsed time per iteration (s): 0.55 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 2.784053E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.630 | TFLOPs: 44.01 | +7: iteration 41430/ 115203 | consumed samples: 10606080 | consumed tokens: 21721251840 | elapsed time per iteration (s): 0.56 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 2.771104E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.537 | TFLOPs: 43.72 | +7: iteration 41440/ 115203 | consumed samples: 10608640 | consumed tokens: 21726494720 | elapsed time per iteration (s): 0.57 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 2.792216E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.690 | TFLOPs: 42.97 | +7: iteration 41450/ 115203 | consumed samples: 10611200 | consumed tokens: 21731737600 | elapsed time per iteration (s): 0.56 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 2.768804E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.696 | TFLOPs: 43.54 | +7: iteration 41460/ 115203 | consumed samples: 10613760 | consumed tokens: 21736980480 | elapsed time per iteration (s): 0.56 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 2.769243E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.459 | TFLOPs: 43.80 | +7: iteration 41470/ 115203 | consumed samples: 10616320 | consumed tokens: 21742223360 | elapsed time per iteration (s): 0.56 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 2.772816E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.451 | TFLOPs: 43.90 | +7: iteration 41480/ 115203 | consumed samples: 10618880 | consumed tokens: 21747466240 | elapsed time per iteration (s): 0.56 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 2.768070E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.810 | TFLOPs: 43.55 | +7: iteration 41490/ 115203 | consumed samples: 10621440 | consumed tokens: 21752709120 | elapsed time per iteration (s): 0.56 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 2.783796E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.969 | TFLOPs: 43.66 | +7: iteration 41500/ 115203 | consumed samples: 10624000 | consumed tokens: 21757952000 | elapsed time per iteration (s): 0.56 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 2.779421E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.740 | TFLOPs: 43.55 | +7: iteration 41510/ 115203 | consumed samples: 10626560 | consumed tokens: 21763194880 | elapsed time per iteration (s): 0.56 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 2.773375E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.242 | TFLOPs: 43.69 | +7: iteration 41520/ 115203 | consumed samples: 10629120 | consumed tokens: 21768437760 | elapsed time per iteration (s): 0.56 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 2.756951E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.452 | TFLOPs: 43.61 | +7: iteration 41530/ 115203 | consumed samples: 10631680 | consumed tokens: 21773680640 | elapsed time per iteration (s): 0.56 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 2.774399E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.229 | TFLOPs: 43.21 | +7: iteration 41540/ 115203 | consumed samples: 10634240 | consumed tokens: 21778923520 | elapsed time per iteration (s): 0.57 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 2.771443E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.571 | TFLOPs: 43.15 | +7: iteration 41550/ 115203 | consumed samples: 10636800 | consumed tokens: 21784166400 | elapsed time per iteration (s): 0.57 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 2.791517E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.944 | TFLOPs: 43.09 | +7: iteration 41560/ 115203 | consumed samples: 10639360 | consumed tokens: 21789409280 | elapsed time per iteration (s): 0.56 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 2.786426E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.671 | TFLOPs: 43.44 | +7: iteration 41570/ 115203 | consumed samples: 10641920 | consumed tokens: 21794652160 | elapsed time per iteration (s): 0.56 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 2.775859E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.682 | TFLOPs: 43.25 | +7: iteration 41580/ 115203 | consumed samples: 10644480 | consumed tokens: 21799895040 | elapsed time per iteration (s): 0.56 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 2.787918E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.882 | TFLOPs: 43.46 | +7: iteration 41590/ 115203 | consumed samples: 10647040 | consumed tokens: 21805137920 | elapsed time per iteration (s): 0.56 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 2.778584E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.367 | TFLOPs: 43.70 | +7: iteration 41600/ 115203 | consumed samples: 10649600 | consumed tokens: 21810380800 | elapsed time per iteration (s): 0.56 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 2.771759E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.754 | TFLOPs: 43.36 | +7: iteration 41610/ 115203 | consumed samples: 10652160 | consumed tokens: 21815623680 | elapsed time per iteration (s): 0.56 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 2.780936E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.970 | TFLOPs: 43.38 | +7: iteration 41620/ 115203 | consumed samples: 10654720 | consumed tokens: 21820866560 | elapsed time per iteration (s): 0.56 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 2.785503E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.224 | TFLOPs: 43.97 | +7: iteration 41630/ 115203 | consumed samples: 10657280 | consumed tokens: 21826109440 | elapsed time per iteration (s): 0.56 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 2.779862E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.427 | TFLOPs: 43.52 | +7: iteration 41640/ 115203 | consumed samples: 10659840 | consumed tokens: 21831352320 | elapsed time per iteration (s): 0.61 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 2.768886E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.524 | TFLOPs: 40.19 | +7: iteration 41650/ 115203 | consumed samples: 10662400 | consumed tokens: 21836595200 | elapsed time per iteration (s): 0.56 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 2.765129E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.140 | TFLOPs: 43.39 | +7: iteration 41660/ 115203 | consumed samples: 10664960 | consumed tokens: 21841838080 | elapsed time per iteration (s): 0.56 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 2.772447E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.129 | TFLOPs: 43.87 | +7: iteration 41670/ 115203 | consumed samples: 10667520 | consumed tokens: 21847080960 | elapsed time per iteration (s): 0.56 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 2.777332E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.596 | TFLOPs: 43.63 | +7: iteration 41680/ 115203 | consumed samples: 10670080 | consumed tokens: 21852323840 | elapsed time per iteration (s): 0.55 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 2.773433E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.730 | TFLOPs: 44.02 | +7: iteration 41690/ 115203 | consumed samples: 10672640 | consumed tokens: 21857566720 | elapsed time per iteration (s): 0.55 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 2.781504E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 41700/ 115203 | consumed samples: 10675200 | consumed tokens: 21862809600 | elapsed time per iteration (s): 0.55 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 2.775732E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.564 | TFLOPs: 44.01 | +7: iteration 41710/ 115203 | consumed samples: 10677760 | consumed tokens: 21868052480 | elapsed time per iteration (s): 0.55 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 2.790508E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.496 | TFLOPs: 44.00 | +7: iteration 41720/ 115203 | consumed samples: 10680320 | consumed tokens: 21873295360 | elapsed time per iteration (s): 0.55 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 2.785347E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.441 | TFLOPs: 43.99 | +7: iteration 41730/ 115203 | consumed samples: 10682880 | consumed tokens: 21878538240 | elapsed time per iteration (s): 0.56 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 2.777516E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.361 | TFLOPs: 43.51 | +7: iteration 41740/ 115203 | consumed samples: 10685440 | consumed tokens: 21883781120 | elapsed time per iteration (s): 0.55 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 2.780025E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.513 | TFLOPs: 44.00 | +7: iteration 41750/ 115203 | consumed samples: 10688000 | consumed tokens: 21889024000 | elapsed time per iteration (s): 0.55 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 2.764027E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.597 | TFLOPs: 44.01 | +7: iteration 41760/ 115203 | consumed samples: 10690560 | consumed tokens: 21894266880 | elapsed time per iteration (s): 0.56 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 2.787046E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.556 | TFLOPs: 43.81 | +7: iteration 41770/ 115203 | consumed samples: 10693120 | consumed tokens: 21899509760 | elapsed time per iteration (s): 0.55 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 2.787195E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.535 | TFLOPs: 44.00 | +7: iteration 41780/ 115203 | consumed samples: 10695680 | consumed tokens: 21904752640 | elapsed time per iteration (s): 0.56 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 2.768662E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.068 | TFLOPs: 43.86 | +7: iteration 41790/ 115203 | consumed samples: 10698240 | consumed tokens: 21909995520 | elapsed time per iteration (s): 0.56 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 2.763540E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.976 | TFLOPs: 43.47 | +7: iteration 41800/ 115203 | consumed samples: 10700800 | consumed tokens: 21915238400 | elapsed time per iteration (s): 0.56 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 2.765586E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.354 | TFLOPs: 43.79 | +7: iteration 41810/ 115203 | consumed samples: 10703360 | consumed tokens: 21920481280 | elapsed time per iteration (s): 0.55 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 2.774050E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 41820/ 115203 | consumed samples: 10705920 | consumed tokens: 21925724160 | elapsed time per iteration (s): 0.56 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 2.766687E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.503 | TFLOPs: 43.52 | +7: iteration 41830/ 115203 | consumed samples: 10708480 | consumed tokens: 21930967040 | elapsed time per iteration (s): 0.55 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 2.772099E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 41840/ 115203 | consumed samples: 10711040 | consumed tokens: 21936209920 | elapsed time per iteration (s): 0.56 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 2.780139E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.237 | TFLOPs: 43.97 | +7: iteration 41850/ 115203 | consumed samples: 10713600 | consumed tokens: 21941452800 | elapsed time per iteration (s): 0.56 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 2.775075E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.748 | TFLOPs: 43.55 | +7: iteration 41860/ 115203 | consumed samples: 10716160 | consumed tokens: 21946695680 | elapsed time per iteration (s): 0.55 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 2.781546E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.348 | TFLOPs: 43.98 | +7: iteration 41870/ 115203 | consumed samples: 10718720 | consumed tokens: 21951938560 | elapsed time per iteration (s): 0.55 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 2.772751E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 41880/ 115203 | consumed samples: 10721280 | consumed tokens: 21957181440 | elapsed time per iteration (s): 0.55 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 2.770327E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.409 | TFLOPs: 43.99 | +7: iteration 41890/ 115203 | consumed samples: 10723840 | consumed tokens: 21962424320 | elapsed time per iteration (s): 0.56 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 2.785701E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.240 | TFLOPs: 43.97 | +7: iteration 41900/ 115203 | consumed samples: 10726400 | consumed tokens: 21967667200 | elapsed time per iteration (s): 0.55 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 2.767797E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.394 | TFLOPs: 43.99 | +7: iteration 41910/ 115203 | consumed samples: 10728960 | consumed tokens: 21972910080 | elapsed time per iteration (s): 0.56 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 2.781445E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.711 | TFLOPs: 43.54 | +7: iteration 41920/ 115203 | consumed samples: 10731520 | consumed tokens: 21978152960 | elapsed time per iteration (s): 0.56 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 2.791843E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 41930/ 115203 | consumed samples: 10734080 | consumed tokens: 21983395840 | elapsed time per iteration (s): 0.56 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 2.773889E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.036 | TFLOPs: 43.95 | +7: iteration 41940/ 115203 | consumed samples: 10736640 | consumed tokens: 21988638720 | elapsed time per iteration (s): 0.56 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 2.773044E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 41950/ 115203 | consumed samples: 10739200 | consumed tokens: 21993881600 | elapsed time per iteration (s): 0.56 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 2.770648E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.770 | TFLOPs: 43.45 | +7: iteration 41960/ 115203 | consumed samples: 10741760 | consumed tokens: 21999124480 | elapsed time per iteration (s): 0.56 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 2.778724E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.223 | TFLOPs: 43.97 | +7: iteration 41970/ 115203 | consumed samples: 10744320 | consumed tokens: 22004367360 | elapsed time per iteration (s): 0.56 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 2.781794E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.005 | TFLOPs: 43.95 | +7: iteration 41980/ 115203 | consumed samples: 10746880 | consumed tokens: 22009610240 | elapsed time per iteration (s): 0.55 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 2.766731E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.322 | TFLOPs: 43.98 | +7: iteration 41990/ 115203 | consumed samples: 10749440 | consumed tokens: 22014853120 | elapsed time per iteration (s): 0.56 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 2.775234E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.690 | TFLOPs: 43.54 | +0: [2023-03-16 19:25:34,397] [INFO] [logging.py:68:log_dist] [Rank 0] step=42000, skipped=0, lr=[0.0001487921045166041, 0.0001487921045166041, 0.0001487921045166041], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 42000/ 115203 | consumed samples: 10752000 | consumed tokens: 22020096000 | elapsed time per iteration (s): 0.56 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 2.773589E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.250 | TFLOPs: 43.98 | +0: steps: 42000 loss: 2.8020 iter time (s): 0.561 samples/sec: 456.086 +7: iteration 42010/ 115203 | consumed samples: 10754560 | consumed tokens: 22025338880 | elapsed time per iteration (s): 0.57 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 2.768605E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.991 | TFLOPs: 43.19 | +7: iteration 42020/ 115203 | consumed samples: 10757120 | consumed tokens: 22030581760 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 2.786612E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.690 | TFLOPs: 43.54 | +7: iteration 42030/ 115203 | consumed samples: 10759680 | consumed tokens: 22035824640 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 2.790625E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 42040/ 115203 | consumed samples: 10762240 | consumed tokens: 22041067520 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 2.763999E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.222 | TFLOPs: 43.97 | +7: iteration 42050/ 115203 | consumed samples: 10764800 | consumed tokens: 22046310400 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 2.793287E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 42060/ 115203 | consumed samples: 10767360 | consumed tokens: 22051553280 | elapsed time per iteration (s): 0.56 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 2.786053E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.944 | TFLOPs: 43.95 | +7: iteration 42070/ 115203 | consumed samples: 10769920 | consumed tokens: 22056796160 | elapsed time per iteration (s): 0.56 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 2.768438E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.127 | TFLOPs: 43.96 | +7: iteration 42080/ 115203 | consumed samples: 10772480 | consumed tokens: 22062039040 | elapsed time per iteration (s): 0.56 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 2.782936E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 42090/ 115203 | consumed samples: 10775040 | consumed tokens: 22067281920 | elapsed time per iteration (s): 0.56 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 2.769147E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.294 | TFLOPs: 43.60 | +7: iteration 42100/ 115203 | consumed samples: 10777600 | consumed tokens: 22072524800 | elapsed time per iteration (s): 0.55 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 2.779175E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.404 | TFLOPs: 43.99 | +7: iteration 42110/ 115203 | consumed samples: 10780160 | consumed tokens: 22077767680 | elapsed time per iteration (s): 0.56 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 2.767230E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.223 | TFLOPs: 43.97 | +7: iteration 42120/ 115203 | consumed samples: 10782720 | consumed tokens: 22083010560 | elapsed time per iteration (s): 0.55 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 2.781735E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.268 | TFLOPs: 43.98 | +7: iteration 42130/ 115203 | consumed samples: 10785280 | consumed tokens: 22088253440 | elapsed time per iteration (s): 0.55 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 2.773951E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.351 | TFLOPs: 43.98 | +7: iteration 42140/ 115203 | consumed samples: 10787840 | consumed tokens: 22093496320 | elapsed time per iteration (s): 0.56 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 2.776856E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.628 | TFLOPs: 43.25 | +7: iteration 42150/ 115203 | consumed samples: 10790400 | consumed tokens: 22098739200 | elapsed time per iteration (s): 0.56 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 2.775814E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.559 | TFLOPs: 43.81 | +7: iteration 42160/ 115203 | consumed samples: 10792960 | consumed tokens: 22103982080 | elapsed time per iteration (s): 0.56 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 2.764132E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.129 | TFLOPs: 43.96 | +7: iteration 42170/ 115203 | consumed samples: 10795520 | consumed tokens: 22109224960 | elapsed time per iteration (s): 0.56 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 2.765866E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 42180/ 115203 | consumed samples: 10798080 | consumed tokens: 22114467840 | elapsed time per iteration (s): 0.55 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 2.785796E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 42190/ 115203 | consumed samples: 10800640 | consumed tokens: 22119710720 | elapsed time per iteration (s): 0.56 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 2.770711E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.223 | TFLOPs: 43.97 | +7: iteration 42200/ 115203 | consumed samples: 10803200 | consumed tokens: 22124953600 | elapsed time per iteration (s): 0.56 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 2.757787E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 42210/ 115203 | consumed samples: 10805760 | consumed tokens: 22130196480 | elapsed time per iteration (s): 0.56 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 2.786702E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.097 | TFLOPs: 43.96 | +7: iteration 42220/ 115203 | consumed samples: 10808320 | consumed tokens: 22135439360 | elapsed time per iteration (s): 0.56 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 2.772899E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.279 | TFLOPs: 43.41 | +7: iteration 42230/ 115203 | consumed samples: 10810880 | consumed tokens: 22140682240 | elapsed time per iteration (s): 0.55 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 2.763062E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.403 | TFLOPs: 43.99 | +7: iteration 42240/ 115203 | consumed samples: 10813440 | consumed tokens: 22145925120 | elapsed time per iteration (s): 0.56 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 2.781828E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.953 | TFLOPs: 43.66 | +7: iteration 42250/ 115203 | consumed samples: 10816000 | consumed tokens: 22151168000 | elapsed time per iteration (s): 0.55 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 2.776315E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.275 | TFLOPs: 43.98 | +7: iteration 42260/ 115203 | consumed samples: 10818560 | consumed tokens: 22156410880 | elapsed time per iteration (s): 0.55 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 2.779860E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.324 | TFLOPs: 43.98 | +7: iteration 42270/ 115203 | consumed samples: 10821120 | consumed tokens: 22161653760 | elapsed time per iteration (s): 0.57 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 2.782090E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.234 | TFLOPs: 42.92 | +7: iteration 42280/ 115203 | consumed samples: 10823680 | consumed tokens: 22166896640 | elapsed time per iteration (s): 0.60 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 2.775077E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.754 | TFLOPs: 40.88 | +7: iteration 42290/ 115203 | consumed samples: 10826240 | consumed tokens: 22172139520 | elapsed time per iteration (s): 0.55 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 2.777371E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.403 | TFLOPs: 43.99 | +7: iteration 42300/ 115203 | consumed samples: 10828800 | consumed tokens: 22177382400 | elapsed time per iteration (s): 0.55 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 2.784462E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 42310/ 115203 | consumed samples: 10831360 | consumed tokens: 22182625280 | elapsed time per iteration (s): 0.56 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 2.765839E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.208 | TFLOPs: 43.97 | +7: iteration 42320/ 115203 | consumed samples: 10833920 | consumed tokens: 22187868160 | elapsed time per iteration (s): 0.55 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 2.779976E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.311 | TFLOPs: 43.98 | +7: iteration 42330/ 115203 | consumed samples: 10836480 | consumed tokens: 22193111040 | elapsed time per iteration (s): 0.56 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 2.764168E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.030 | TFLOPs: 43.48 | +7: iteration 42340/ 115203 | consumed samples: 10839040 | consumed tokens: 22198353920 | elapsed time per iteration (s): 0.57 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 2.777175E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.643 | TFLOPs: 42.68 | +7: iteration 42350/ 115203 | consumed samples: 10841600 | consumed tokens: 22203596800 | elapsed time per iteration (s): 0.57 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 2.774891E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.050 | TFLOPs: 43.19 | +7: iteration 42360/ 115203 | consumed samples: 10844160 | consumed tokens: 22208839680 | elapsed time per iteration (s): 0.56 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 2.769179E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.129 | TFLOPs: 43.68 | +7: iteration 42370/ 115203 | consumed samples: 10846720 | consumed tokens: 22214082560 | elapsed time per iteration (s): 0.56 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 2.783216E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.695 | TFLOPs: 43.83 | +7: iteration 42380/ 115203 | consumed samples: 10849280 | consumed tokens: 22219325440 | elapsed time per iteration (s): 0.57 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 2.769164E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.497 | TFLOPs: 43.14 | +7: iteration 42390/ 115203 | consumed samples: 10851840 | consumed tokens: 22224568320 | elapsed time per iteration (s): 0.56 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 2.786576E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.701 | TFLOPs: 43.45 | +7: iteration 42400/ 115203 | consumed samples: 10854400 | consumed tokens: 22229811200 | elapsed time per iteration (s): 0.57 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 2.783082E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.543 | TFLOPs: 42.76 | +7: iteration 42410/ 115203 | consumed samples: 10856960 | consumed tokens: 22235054080 | elapsed time per iteration (s): 0.56 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 2.788252E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.470 | TFLOPs: 43.71 | +7: iteration 42420/ 115203 | consumed samples: 10859520 | consumed tokens: 22240296960 | elapsed time per iteration (s): 0.56 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 2.758869E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.902 | TFLOPs: 43.37 | +7: iteration 42430/ 115203 | consumed samples: 10862080 | consumed tokens: 22245539840 | elapsed time per iteration (s): 0.56 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 2.779489E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.768 | TFLOPs: 43.45 | +7: iteration 42440/ 115203 | consumed samples: 10864640 | consumed tokens: 22250782720 | elapsed time per iteration (s): 0.57 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 2.773046E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.030 | TFLOPs: 42.71 | +7: iteration 42450/ 115203 | consumed samples: 10867200 | consumed tokens: 22256025600 | elapsed time per iteration (s): 0.56 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 2.760621E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.552 | TFLOPs: 43.43 | +7: iteration 42460/ 115203 | consumed samples: 10869760 | consumed tokens: 22261268480 | elapsed time per iteration (s): 0.56 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 2.769350E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.655 | TFLOPs: 43.35 | +7: iteration 42470/ 115203 | consumed samples: 10872320 | consumed tokens: 22266511360 | elapsed time per iteration (s): 0.57 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 2.768521E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.746 | TFLOPs: 42.97 | +7: iteration 42480/ 115203 | consumed samples: 10874880 | consumed tokens: 22271754240 | elapsed time per iteration (s): 0.56 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 2.760341E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.733 | TFLOPs: 43.54 | +7: iteration 42490/ 115203 | consumed samples: 10877440 | consumed tokens: 22276997120 | elapsed time per iteration (s): 0.56 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 2.781964E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.704 | TFLOPs: 43.73 | +7: iteration 42500/ 115203 | consumed samples: 10880000 | consumed tokens: 22282240000 | elapsed time per iteration (s): 0.56 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 2.772959E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.319 | TFLOPs: 43.31 | +7: iteration 42510/ 115203 | consumed samples: 10882560 | consumed tokens: 22287482880 | elapsed time per iteration (s): 0.57 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 2.764648E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.524 | TFLOPs: 42.86 | +7: iteration 42520/ 115203 | consumed samples: 10885120 | consumed tokens: 22292725760 | elapsed time per iteration (s): 0.57 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 2.773857E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.123 | TFLOPs: 42.63 | +7: iteration 42530/ 115203 | consumed samples: 10887680 | consumed tokens: 22297968640 | elapsed time per iteration (s): 0.56 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 2.769824E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.551 | TFLOPs: 43.43 | +7: iteration 42540/ 115203 | consumed samples: 10890240 | consumed tokens: 22303211520 | elapsed time per iteration (s): 0.57 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 2.768464E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.944 | TFLOPs: 43.18 | +7: iteration 42550/ 115203 | consumed samples: 10892800 | consumed tokens: 22308454400 | elapsed time per iteration (s): 0.56 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 2.791682E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.363 | TFLOPs: 43.70 | +7: iteration 42560/ 115203 | consumed samples: 10895360 | consumed tokens: 22313697280 | elapsed time per iteration (s): 0.56 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 2.769110E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.678 | TFLOPs: 43.44 | +7: iteration 42570/ 115203 | consumed samples: 10897920 | consumed tokens: 22318940160 | elapsed time per iteration (s): 0.56 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 2.772449E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.923 | TFLOPs: 43.28 | +7: iteration 42580/ 115203 | consumed samples: 10900480 | consumed tokens: 22324183040 | elapsed time per iteration (s): 0.57 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 2.760326E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.829 | TFLOPs: 43.08 | +7: iteration 42590/ 115203 | consumed samples: 10903040 | consumed tokens: 22329425920 | elapsed time per iteration (s): 0.55 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 2.789394E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 42600/ 115203 | consumed samples: 10905600 | consumed tokens: 22334668800 | elapsed time per iteration (s): 0.56 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 2.773031E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.011 | TFLOPs: 43.67 | +7: iteration 42610/ 115203 | consumed samples: 10908160 | consumed tokens: 22339911680 | elapsed time per iteration (s): 0.57 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 2.779527E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.799 | TFLOPs: 43.17 | +7: iteration 42620/ 115203 | consumed samples: 10910720 | consumed tokens: 22345154560 | elapsed time per iteration (s): 0.56 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 2.766830E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.247 | TFLOPs: 43.50 | +7: iteration 42630/ 115203 | consumed samples: 10913280 | consumed tokens: 22350397440 | elapsed time per iteration (s): 0.56 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 2.770730E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.938 | TFLOPs: 43.47 | +7: iteration 42640/ 115203 | consumed samples: 10915840 | consumed tokens: 22355640320 | elapsed time per iteration (s): 0.55 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 2.754417E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.625 | TFLOPs: 44.01 | +7: iteration 42650/ 115203 | consumed samples: 10918400 | consumed tokens: 22360883200 | elapsed time per iteration (s): 0.56 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 2.777162E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.820 | TFLOPs: 43.55 | +7: iteration 42660/ 115203 | consumed samples: 10920960 | consumed tokens: 22366126080 | elapsed time per iteration (s): 0.56 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 2.771284E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.123 | TFLOPs: 43.20 | +7: iteration 42670/ 115203 | consumed samples: 10923520 | consumed tokens: 22371368960 | elapsed time per iteration (s): 0.55 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 2.761427E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.621 | TFLOPs: 44.01 | +7: iteration 42680/ 115203 | consumed samples: 10926080 | consumed tokens: 22376611840 | elapsed time per iteration (s): 0.56 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 2.764005E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.018 | TFLOPs: 43.57 | +7: iteration 42690/ 115203 | consumed samples: 10928640 | consumed tokens: 22381854720 | elapsed time per iteration (s): 0.57 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 2.783983E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.436 | TFLOPs: 42.94 | +7: iteration 42700/ 115203 | consumed samples: 10931200 | consumed tokens: 22387097600 | elapsed time per iteration (s): 0.58 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 2.758429E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.144 | TFLOPs: 41.87 | +7: iteration 42710/ 115203 | consumed samples: 10933760 | consumed tokens: 22392340480 | elapsed time per iteration (s): 0.57 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 2.762715E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.961 | TFLOPs: 42.90 | +7: iteration 42720/ 115203 | consumed samples: 10936320 | consumed tokens: 22397583360 | elapsed time per iteration (s): 0.56 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 2.768492E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.800 | TFLOPs: 43.46 | +7: iteration 42730/ 115203 | consumed samples: 10938880 | consumed tokens: 22402826240 | elapsed time per iteration (s): 0.56 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 2.769447E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.264 | TFLOPs: 43.69 | +7: iteration 42740/ 115203 | consumed samples: 10941440 | consumed tokens: 22408069120 | elapsed time per iteration (s): 0.55 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 2.770804E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.274 | TFLOPs: 43.98 | +7: iteration 42750/ 115203 | consumed samples: 10944000 | consumed tokens: 22413312000 | elapsed time per iteration (s): 0.56 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 2.769971E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.102 | TFLOPs: 43.68 | +7: iteration 42760/ 115203 | consumed samples: 10946560 | consumed tokens: 22418554880 | elapsed time per iteration (s): 0.57 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 2.770463E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.923 | TFLOPs: 43.09 | +7: iteration 42770/ 115203 | consumed samples: 10949120 | consumed tokens: 22423797760 | elapsed time per iteration (s): 0.55 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 2.766179E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 42780/ 115203 | consumed samples: 10951680 | consumed tokens: 22429040640 | elapsed time per iteration (s): 0.56 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 2.769422E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.002 | TFLOPs: 43.47 | +7: iteration 42790/ 115203 | consumed samples: 10954240 | consumed tokens: 22434283520 | elapsed time per iteration (s): 0.56 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 2.765305E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.593 | TFLOPs: 43.44 | +7: iteration 42800/ 115203 | consumed samples: 10956800 | consumed tokens: 22439526400 | elapsed time per iteration (s): 0.56 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 2.775826E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.155 | TFLOPs: 43.30 | +7: iteration 42810/ 115203 | consumed samples: 10959360 | consumed tokens: 22444769280 | elapsed time per iteration (s): 0.56 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 2.772714E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.125 | TFLOPs: 43.39 | +7: iteration 42820/ 115203 | consumed samples: 10961920 | consumed tokens: 22450012160 | elapsed time per iteration (s): 0.55 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 2.777799E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.575 | TFLOPs: 44.01 | +7: iteration 42830/ 115203 | consumed samples: 10964480 | consumed tokens: 22455255040 | elapsed time per iteration (s): 0.55 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 2.768052E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.575 | TFLOPs: 44.01 | +7: iteration 42840/ 115203 | consumed samples: 10967040 | consumed tokens: 22460497920 | elapsed time per iteration (s): 0.56 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 2.771533E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.968 | TFLOPs: 43.47 | +7: iteration 42850/ 115203 | consumed samples: 10969600 | consumed tokens: 22465740800 | elapsed time per iteration (s): 0.55 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 2.775384E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.354 | TFLOPs: 43.99 | +7: iteration 42860/ 115203 | consumed samples: 10972160 | consumed tokens: 22470983680 | elapsed time per iteration (s): 0.56 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 2.772569E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.447 | TFLOPs: 43.71 | +7: iteration 42870/ 115203 | consumed samples: 10974720 | consumed tokens: 22476226560 | elapsed time per iteration (s): 0.55 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 2.769333E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.410 | TFLOPs: 43.99 | +7: iteration 42880/ 115203 | consumed samples: 10977280 | consumed tokens: 22481469440 | elapsed time per iteration (s): 0.57 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 2.768870E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.767 | TFLOPs: 42.88 | +7: iteration 42890/ 115203 | consumed samples: 10979840 | consumed tokens: 22486712320 | elapsed time per iteration (s): 0.55 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 2.759383E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.432 | TFLOPs: 43.99 | +7: iteration 42900/ 115203 | consumed samples: 10982400 | consumed tokens: 22491955200 | elapsed time per iteration (s): 0.56 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 2.760596E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.972 | TFLOPs: 43.85 | +7: iteration 42910/ 115203 | consumed samples: 10984960 | consumed tokens: 22497198080 | elapsed time per iteration (s): 0.56 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 2.768366E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 42920/ 115203 | consumed samples: 10987520 | consumed tokens: 22502440960 | elapsed time per iteration (s): 0.55 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 2.780624E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.468 | TFLOPs: 44.00 | +7: iteration 42930/ 115203 | consumed samples: 10990080 | consumed tokens: 22507683840 | elapsed time per iteration (s): 0.56 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 2.770026E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.085 | TFLOPs: 43.58 | +7: iteration 42940/ 115203 | consumed samples: 10992640 | consumed tokens: 22512926720 | elapsed time per iteration (s): 0.55 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 2.767893E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.373 | TFLOPs: 43.99 | +7: iteration 42950/ 115203 | consumed samples: 10995200 | consumed tokens: 22518169600 | elapsed time per iteration (s): 0.55 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 2.767024E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.555 | TFLOPs: 44.00 | +7: iteration 42960/ 115203 | consumed samples: 10997760 | consumed tokens: 22523412480 | elapsed time per iteration (s): 0.56 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 2.758757E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.096 | TFLOPs: 43.96 | +7: iteration 42970/ 115203 | consumed samples: 11000320 | consumed tokens: 22528655360 | elapsed time per iteration (s): 0.55 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 2.763867E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 42980/ 115203 | consumed samples: 11002880 | consumed tokens: 22533898240 | elapsed time per iteration (s): 0.56 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 2.775385E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 42990/ 115203 | consumed samples: 11005440 | consumed tokens: 22539141120 | elapsed time per iteration (s): 0.56 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 2.779375E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.840 | TFLOPs: 43.36 | +7: iteration 43000/ 115203 | consumed samples: 11008000 | consumed tokens: 22544384000 | elapsed time per iteration (s): 0.56 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 2.765253E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.851 | TFLOPs: 43.94 | +7: iteration 43010/ 115203 | consumed samples: 11010560 | consumed tokens: 22549626880 | elapsed time per iteration (s): 0.55 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 2.777318E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.292 | TFLOPs: 43.98 | +7: iteration 43020/ 115203 | consumed samples: 11013120 | consumed tokens: 22554869760 | elapsed time per iteration (s): 0.56 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 2.763201E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.080 | TFLOPs: 43.58 | +7: iteration 43030/ 115203 | consumed samples: 11015680 | consumed tokens: 22560112640 | elapsed time per iteration (s): 0.56 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 2.767212E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.604 | TFLOPs: 43.53 | +7: iteration 43040/ 115203 | consumed samples: 11018240 | consumed tokens: 22565355520 | elapsed time per iteration (s): 0.56 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 2.784981E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.732 | TFLOPs: 43.64 | +7: iteration 43050/ 115203 | consumed samples: 11020800 | consumed tokens: 22570598400 | elapsed time per iteration (s): 0.56 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 2.773459E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.259 | TFLOPs: 43.69 | +7: iteration 43060/ 115203 | consumed samples: 11023360 | consumed tokens: 22575841280 | elapsed time per iteration (s): 0.56 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 2.769277E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.115 | TFLOPs: 43.96 | +7: iteration 43070/ 115203 | consumed samples: 11025920 | consumed tokens: 22581084160 | elapsed time per iteration (s): 0.55 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 2.770238E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 43080/ 115203 | consumed samples: 11028480 | consumed tokens: 22586327040 | elapsed time per iteration (s): 0.56 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 2.770534E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.178 | TFLOPs: 43.97 | +7: iteration 43090/ 115203 | consumed samples: 11031040 | consumed tokens: 22591569920 | elapsed time per iteration (s): 0.56 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 2.779496E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 43100/ 115203 | consumed samples: 11033600 | consumed tokens: 22596812800 | elapsed time per iteration (s): 0.56 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 2.773072E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 43110/ 115203 | consumed samples: 11036160 | consumed tokens: 22602055680 | elapsed time per iteration (s): 0.56 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 2.772766E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.339 | TFLOPs: 43.70 | +7: iteration 43120/ 115203 | consumed samples: 11038720 | consumed tokens: 22607298560 | elapsed time per iteration (s): 0.55 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 2.753595E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 43130/ 115203 | consumed samples: 11041280 | consumed tokens: 22612541440 | elapsed time per iteration (s): 0.56 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 2.764249E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.011 | TFLOPs: 43.67 | +7: iteration 43140/ 115203 | consumed samples: 11043840 | consumed tokens: 22617784320 | elapsed time per iteration (s): 0.56 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 2.773226E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.016 | TFLOPs: 43.95 | +7: iteration 43150/ 115203 | consumed samples: 11046400 | consumed tokens: 22623027200 | elapsed time per iteration (s): 0.56 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 2.784652E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.225 | TFLOPs: 43.97 | +7: iteration 43160/ 115203 | consumed samples: 11048960 | consumed tokens: 22628270080 | elapsed time per iteration (s): 0.56 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 2.783228E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.399 | TFLOPs: 43.70 | +7: iteration 43170/ 115203 | consumed samples: 11051520 | consumed tokens: 22633512960 | elapsed time per iteration (s): 0.55 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 2.771903E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.429 | TFLOPs: 43.99 | +7: iteration 43180/ 115203 | consumed samples: 11054080 | consumed tokens: 22638755840 | elapsed time per iteration (s): 0.56 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 2.762066E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.135 | TFLOPs: 43.58 | +7: iteration 43190/ 115203 | consumed samples: 11056640 | consumed tokens: 22643998720 | elapsed time per iteration (s): 0.55 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 2.776358E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.581 | TFLOPs: 44.01 | +7: iteration 43200/ 115203 | consumed samples: 11059200 | consumed tokens: 22649241600 | elapsed time per iteration (s): 0.55 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 2.755819E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 43210/ 115203 | consumed samples: 11061760 | consumed tokens: 22654484480 | elapsed time per iteration (s): 0.55 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 2.747109E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.653 | TFLOPs: 44.01 | +7: iteration 43220/ 115203 | consumed samples: 11064320 | consumed tokens: 22659727360 | elapsed time per iteration (s): 0.56 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 2.771364E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.650 | TFLOPs: 43.73 | +7: iteration 43230/ 115203 | consumed samples: 11066880 | consumed tokens: 22664970240 | elapsed time per iteration (s): 0.55 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 2.770584E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.557 | TFLOPs: 44.00 | +7: iteration 43240/ 115203 | consumed samples: 11069440 | consumed tokens: 22670213120 | elapsed time per iteration (s): 0.56 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 2.768774E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.418 | TFLOPs: 43.42 | +7: iteration 43250/ 115203 | consumed samples: 11072000 | consumed tokens: 22675456000 | elapsed time per iteration (s): 0.56 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 2.764240E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.241 | TFLOPs: 43.97 | +7: iteration 43260/ 115203 | consumed samples: 11074560 | consumed tokens: 22680698880 | elapsed time per iteration (s): 0.56 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 2.765839E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.999 | TFLOPs: 43.95 | +7: iteration 43270/ 115203 | consumed samples: 11077120 | consumed tokens: 22685941760 | elapsed time per iteration (s): 0.56 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 2.775340E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.396 | TFLOPs: 43.61 | +7: iteration 43280/ 115203 | consumed samples: 11079680 | consumed tokens: 22691184640 | elapsed time per iteration (s): 0.55 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 2.753060E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 43290/ 115203 | consumed samples: 11082240 | consumed tokens: 22696427520 | elapsed time per iteration (s): 0.55 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 2.773753E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 43300/ 115203 | consumed samples: 11084800 | consumed tokens: 22701670400 | elapsed time per iteration (s): 0.56 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 2.754103E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.161 | TFLOPs: 43.97 | +7: iteration 43310/ 115203 | consumed samples: 11087360 | consumed tokens: 22706913280 | elapsed time per iteration (s): 0.56 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 2.776462E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.181 | TFLOPs: 43.68 | +7: iteration 43320/ 115203 | consumed samples: 11089920 | consumed tokens: 22712156160 | elapsed time per iteration (s): 0.55 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 2.783849E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 43330/ 115203 | consumed samples: 11092480 | consumed tokens: 22717399040 | elapsed time per iteration (s): 0.56 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 2.779771E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.768 | TFLOPs: 43.45 | +7: iteration 43340/ 115203 | consumed samples: 11095040 | consumed tokens: 22722641920 | elapsed time per iteration (s): 0.55 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 2.769902E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.707 | TFLOPs: 44.02 | +7: iteration 43350/ 115203 | consumed samples: 11097600 | consumed tokens: 22727884800 | elapsed time per iteration (s): 0.56 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 2.768883E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.613 | TFLOPs: 43.72 | +7: iteration 43360/ 115203 | consumed samples: 11100160 | consumed tokens: 22733127680 | elapsed time per iteration (s): 0.55 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 2.772056E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.654 | TFLOPs: 44.01 | +7: iteration 43370/ 115203 | consumed samples: 11102720 | consumed tokens: 22738370560 | elapsed time per iteration (s): 0.56 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 2.769364E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.206 | TFLOPs: 43.59 | +7: iteration 43380/ 115203 | consumed samples: 11105280 | consumed tokens: 22743613440 | elapsed time per iteration (s): 0.56 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 2.769228E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.418 | TFLOPs: 43.71 | +7: iteration 43390/ 115203 | consumed samples: 11107840 | consumed tokens: 22748856320 | elapsed time per iteration (s): 0.55 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 2.771830E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.385 | TFLOPs: 43.99 | +7: iteration 43400/ 115203 | consumed samples: 11110400 | consumed tokens: 22754099200 | elapsed time per iteration (s): 0.55 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 2.767948E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 43410/ 115203 | consumed samples: 11112960 | consumed tokens: 22759342080 | elapsed time per iteration (s): 0.55 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 2.777785E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.688 | TFLOPs: 44.02 | +7: iteration 43420/ 115203 | consumed samples: 11115520 | consumed tokens: 22764584960 | elapsed time per iteration (s): 0.56 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 2.776891E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.744 | TFLOPs: 43.45 | +7: iteration 43430/ 115203 | consumed samples: 11118080 | consumed tokens: 22769827840 | elapsed time per iteration (s): 0.55 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 2.774336E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.564 | TFLOPs: 44.01 | +7: iteration 43440/ 115203 | consumed samples: 11120640 | consumed tokens: 22775070720 | elapsed time per iteration (s): 0.55 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 2.769292E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.798 | TFLOPs: 44.03 | +7: iteration 43450/ 115203 | consumed samples: 11123200 | consumed tokens: 22780313600 | elapsed time per iteration (s): 0.55 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 2.753716E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.740 | TFLOPs: 44.02 | +7: iteration 43460/ 115203 | consumed samples: 11125760 | consumed tokens: 22785556480 | elapsed time per iteration (s): 0.55 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 2.774563E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.722 | TFLOPs: 44.02 | +7: iteration 43470/ 115203 | consumed samples: 11128320 | consumed tokens: 22790799360 | elapsed time per iteration (s): 0.55 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 2.766524E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.810 | TFLOPs: 44.03 | +7: iteration 43480/ 115203 | consumed samples: 11130880 | consumed tokens: 22796042240 | elapsed time per iteration (s): 0.55 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 2.760557E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.678 | TFLOPs: 44.02 | +7: iteration 43490/ 115203 | consumed samples: 11133440 | consumed tokens: 22801285120 | elapsed time per iteration (s): 0.56 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 2.770584E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.400 | TFLOPs: 43.89 | +7: iteration 43500/ 115203 | consumed samples: 11136000 | consumed tokens: 22806528000 | elapsed time per iteration (s): 0.55 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 2.777218E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.681 | TFLOPs: 44.02 | +7: iteration 43510/ 115203 | consumed samples: 11138560 | consumed tokens: 22811770880 | elapsed time per iteration (s): 0.55 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 2.763513E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.451 | TFLOPs: 43.99 | +7: iteration 43520/ 115203 | consumed samples: 11141120 | consumed tokens: 22817013760 | elapsed time per iteration (s): 0.55 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 2.766722E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.674 | TFLOPs: 44.02 | +7: iteration 43530/ 115203 | consumed samples: 11143680 | consumed tokens: 22822256640 | elapsed time per iteration (s): 0.55 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 2.769528E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.612 | TFLOPs: 44.01 | +7: iteration 43540/ 115203 | consumed samples: 11146240 | consumed tokens: 22827499520 | elapsed time per iteration (s): 0.55 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 2.757824E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.753 | TFLOPs: 44.02 | +7: iteration 43550/ 115203 | consumed samples: 11148800 | consumed tokens: 22832742400 | elapsed time per iteration (s): 0.55 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 2.768535E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.399 | TFLOPs: 43.99 | +7: iteration 43560/ 115203 | consumed samples: 11151360 | consumed tokens: 22837985280 | elapsed time per iteration (s): 0.55 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 2.769398E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.694 | TFLOPs: 44.02 | +7: iteration 43570/ 115203 | consumed samples: 11153920 | consumed tokens: 22843228160 | elapsed time per iteration (s): 0.56 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 2.781513E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.460 | TFLOPs: 43.90 | +7: iteration 43580/ 115203 | consumed samples: 11156480 | consumed tokens: 22848471040 | elapsed time per iteration (s): 0.55 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 2.774162E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.792 | TFLOPs: 44.03 | +7: iteration 43590/ 115203 | consumed samples: 11159040 | consumed tokens: 22853713920 | elapsed time per iteration (s): 0.55 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 2.765249E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.719 | TFLOPs: 44.02 | +7: iteration 43600/ 115203 | consumed samples: 11161600 | consumed tokens: 22858956800 | elapsed time per iteration (s): 0.55 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 2.754763E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.733 | TFLOPs: 44.02 | +7: iteration 43610/ 115203 | consumed samples: 11164160 | consumed tokens: 22864199680 | elapsed time per iteration (s): 0.56 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 2.757241E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.481 | TFLOPs: 43.90 | +7: iteration 43620/ 115203 | consumed samples: 11166720 | consumed tokens: 22869442560 | elapsed time per iteration (s): 0.55 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 2.774832E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.822 | TFLOPs: 44.03 | +7: iteration 43630/ 115203 | consumed samples: 11169280 | consumed tokens: 22874685440 | elapsed time per iteration (s): 0.55 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 2.775388E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.731 | TFLOPs: 44.02 | +7: iteration 43640/ 115203 | consumed samples: 11171840 | consumed tokens: 22879928320 | elapsed time per iteration (s): 0.55 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 2.751100E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 43650/ 115203 | consumed samples: 11174400 | consumed tokens: 22885171200 | elapsed time per iteration (s): 0.55 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 2.763339E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 43660/ 115203 | consumed samples: 11176960 | consumed tokens: 22890414080 | elapsed time per iteration (s): 0.56 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 2.760368E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.164 | TFLOPs: 43.39 | +7: iteration 43670/ 115203 | consumed samples: 11179520 | consumed tokens: 22895656960 | elapsed time per iteration (s): 0.56 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 2.770775E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.945 | TFLOPs: 43.76 | +7: iteration 43680/ 115203 | consumed samples: 11182080 | consumed tokens: 22900899840 | elapsed time per iteration (s): 0.56 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 2.756845E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.667 | TFLOPs: 43.73 | +7: iteration 43690/ 115203 | consumed samples: 11184640 | consumed tokens: 22906142720 | elapsed time per iteration (s): 0.55 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 2.769458E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.488 | TFLOPs: 44.00 | +7: iteration 43700/ 115203 | consumed samples: 11187200 | consumed tokens: 22911385600 | elapsed time per iteration (s): 0.56 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 2.768536E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.094 | TFLOPs: 43.77 | +7: iteration 43710/ 115203 | consumed samples: 11189760 | consumed tokens: 22916628480 | elapsed time per iteration (s): 0.56 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 2.773175E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.104 | TFLOPs: 43.20 | +7: iteration 43720/ 115203 | consumed samples: 11192320 | consumed tokens: 22921871360 | elapsed time per iteration (s): 0.55 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 2.773998E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.711 | TFLOPs: 44.02 | +7: iteration 43730/ 115203 | consumed samples: 11194880 | consumed tokens: 22927114240 | elapsed time per iteration (s): 0.57 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 2.766417E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.174 | TFLOPs: 42.63 | +7: iteration 43740/ 115203 | consumed samples: 11197440 | consumed tokens: 22932357120 | elapsed time per iteration (s): 0.57 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 2.781295E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.646 | TFLOPs: 42.87 | +7: iteration 43750/ 115203 | consumed samples: 11200000 | consumed tokens: 22937600000 | elapsed time per iteration (s): 0.55 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 2.763182E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.685 | TFLOPs: 44.02 | +7: iteration 43760/ 115203 | consumed samples: 11202560 | consumed tokens: 22942842880 | elapsed time per iteration (s): 0.56 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 2.752759E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.898 | TFLOPs: 43.66 | +7: iteration 43770/ 115203 | consumed samples: 11205120 | consumed tokens: 22948085760 | elapsed time per iteration (s): 0.56 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 2.754697E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.113 | TFLOPs: 43.68 | +7: iteration 43780/ 115203 | consumed samples: 11207680 | consumed tokens: 22953328640 | elapsed time per iteration (s): 0.56 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 2.760235E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.069 | TFLOPs: 43.39 | +7: iteration 43790/ 115203 | consumed samples: 11210240 | consumed tokens: 22958571520 | elapsed time per iteration (s): 0.55 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 2.767179E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.510 | TFLOPs: 44.00 | +7: iteration 43800/ 115203 | consumed samples: 11212800 | consumed tokens: 22963814400 | elapsed time per iteration (s): 0.57 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 2.758052E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.006 | TFLOPs: 43.09 | +7: iteration 43810/ 115203 | consumed samples: 11215360 | consumed tokens: 22969057280 | elapsed time per iteration (s): 0.57 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 2.770297E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.177 | TFLOPs: 43.01 | +7: iteration 43820/ 115203 | consumed samples: 11217920 | consumed tokens: 22974300160 | elapsed time per iteration (s): 0.56 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 2.774086E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.159 | TFLOPs: 43.97 | +7: iteration 43830/ 115203 | consumed samples: 11220480 | consumed tokens: 22979543040 | elapsed time per iteration (s): 0.55 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 2.771477E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.730 | TFLOPs: 44.02 | +7: iteration 43840/ 115203 | consumed samples: 11223040 | consumed tokens: 22984785920 | elapsed time per iteration (s): 0.57 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 2.769488E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.738 | TFLOPs: 42.78 | +7: iteration 43850/ 115203 | consumed samples: 11225600 | consumed tokens: 22990028800 | elapsed time per iteration (s): 0.57 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 2.776152E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.389 | TFLOPs: 43.04 | +7: iteration 43860/ 115203 | consumed samples: 11228160 | consumed tokens: 22995271680 | elapsed time per iteration (s): 0.56 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 2.772613E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.622 | TFLOPs: 43.34 | +7: iteration 43870/ 115203 | consumed samples: 11230720 | consumed tokens: 23000514560 | elapsed time per iteration (s): 0.58 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 2.767124E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.183 | TFLOPs: 42.06 | +7: iteration 43880/ 115203 | consumed samples: 11233280 | consumed tokens: 23005757440 | elapsed time per iteration (s): 0.58 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 2.771294E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.487 | TFLOPs: 42.38 | +7: iteration 43890/ 115203 | consumed samples: 11235840 | consumed tokens: 23011000320 | elapsed time per iteration (s): 0.57 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 2.754833E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.618 | TFLOPs: 42.68 | +7: iteration 43900/ 115203 | consumed samples: 11238400 | consumed tokens: 23016243200 | elapsed time per iteration (s): 0.57 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 2.750678E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.622 | TFLOPs: 42.49 | +7: iteration 43910/ 115203 | consumed samples: 11240960 | consumed tokens: 23021486080 | elapsed time per iteration (s): 0.56 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 2.759208E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.845 | TFLOPs: 43.75 | +7: iteration 43920/ 115203 | consumed samples: 11243520 | consumed tokens: 23026728960 | elapsed time per iteration (s): 0.56 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 2.797766E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.821 | TFLOPs: 43.74 | +7: iteration 43930/ 115203 | consumed samples: 11246080 | consumed tokens: 23031971840 | elapsed time per iteration (s): 0.56 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 2.760224E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.292 | TFLOPs: 43.41 | +7: iteration 43940/ 115203 | consumed samples: 11248640 | consumed tokens: 23037214720 | elapsed time per iteration (s): 0.56 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 2.758174E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.198 | TFLOPs: 43.49 | +7: iteration 43950/ 115203 | consumed samples: 11251200 | consumed tokens: 23042457600 | elapsed time per iteration (s): 0.57 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 2.768308E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.757 | TFLOPs: 42.78 | +7: iteration 43960/ 115203 | consumed samples: 11253760 | consumed tokens: 23047700480 | elapsed time per iteration (s): 0.56 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 2.753588E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.384 | TFLOPs: 43.89 | +7: iteration 43970/ 115203 | consumed samples: 11256320 | consumed tokens: 23052943360 | elapsed time per iteration (s): 0.56 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 2.788147E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.124 | TFLOPs: 43.49 | +7: iteration 43980/ 115203 | consumed samples: 11258880 | consumed tokens: 23058186240 | elapsed time per iteration (s): 0.56 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 2.761819E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.750 | TFLOPs: 43.45 | +7: iteration 43990/ 115203 | consumed samples: 11261440 | consumed tokens: 23063429120 | elapsed time per iteration (s): 0.56 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 2.764763E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.522 | TFLOPs: 43.72 | +0: [2023-03-16 19:44:13,067] [INFO] [logging.py:68:log_dist] [Rank 0] step=44000, skipped=0, lr=[0.00014426156962702883, 0.00014426156962702883, 0.00014426156962702883], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 44000/ 115203 | consumed samples: 11264000 | consumed tokens: 23068672000 | elapsed time per iteration (s): 0.56 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 2.759861E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.313 | TFLOPs: 43.31 | +0: steps: 44000 loss: 2.7301 iter time (s): 0.558 samples/sec: 459.117 +7: iteration 44010/ 115203 | consumed samples: 11266560 | consumed tokens: 23073914880 | elapsed time per iteration (s): 0.56 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 2.750906E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.891 | TFLOPs: 43.65 | +7: iteration 44020/ 115203 | consumed samples: 11269120 | consumed tokens: 23079157760 | elapsed time per iteration (s): 0.55 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 2.769524E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.599 | TFLOPs: 44.01 | +7: iteration 44030/ 115203 | consumed samples: 11271680 | consumed tokens: 23084400640 | elapsed time per iteration (s): 0.56 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 2.767100E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.166 | TFLOPs: 43.59 | +7: iteration 44040/ 115203 | consumed samples: 11274240 | consumed tokens: 23089643520 | elapsed time per iteration (s): 0.57 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 2.757302E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.791 | TFLOPs: 42.88 | +7: iteration 44050/ 115203 | consumed samples: 11276800 | consumed tokens: 23094886400 | elapsed time per iteration (s): 0.55 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 2.774110E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.545 | TFLOPs: 44.00 | +7: iteration 44060/ 115203 | consumed samples: 11279360 | consumed tokens: 23100129280 | elapsed time per iteration (s): 0.57 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 2.764650E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.317 | TFLOPs: 42.74 | +7: iteration 44070/ 115203 | consumed samples: 11281920 | consumed tokens: 23105372160 | elapsed time per iteration (s): 0.57 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 2.755612E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.380 | TFLOPs: 42.46 | +7: iteration 44080/ 115203 | consumed samples: 11284480 | consumed tokens: 23110615040 | elapsed time per iteration (s): 0.57 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 2.756750E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.992 | TFLOPs: 42.81 | +7: iteration 44090/ 115203 | consumed samples: 11287040 | consumed tokens: 23115857920 | elapsed time per iteration (s): 0.56 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 2.768974E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.407 | TFLOPs: 43.51 | +7: iteration 44100/ 115203 | consumed samples: 11289600 | consumed tokens: 23121100800 | elapsed time per iteration (s): 0.56 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 2.764269E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.706 | TFLOPs: 43.26 | +7: iteration 44110/ 115203 | consumed samples: 11292160 | consumed tokens: 23126343680 | elapsed time per iteration (s): 0.55 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 2.764428E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.825 | TFLOPs: 44.03 | +7: iteration 44120/ 115203 | consumed samples: 11294720 | consumed tokens: 23131586560 | elapsed time per iteration (s): 0.56 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 2.769883E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.103 | TFLOPs: 43.68 | +7: iteration 44130/ 115203 | consumed samples: 11297280 | consumed tokens: 23136829440 | elapsed time per iteration (s): 0.56 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 2.769979E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.282 | TFLOPs: 43.79 | +7: iteration 44140/ 115203 | consumed samples: 11299840 | consumed tokens: 23142072320 | elapsed time per iteration (s): 0.56 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 2.753188E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.435 | TFLOPs: 43.90 | +7: iteration 44150/ 115203 | consumed samples: 11302400 | consumed tokens: 23147315200 | elapsed time per iteration (s): 0.56 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 2.757334E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.314 | TFLOPs: 43.41 | +7: iteration 44160/ 115203 | consumed samples: 11304960 | consumed tokens: 23152558080 | elapsed time per iteration (s): 0.56 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 2.766476E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.412 | TFLOPs: 43.90 | +7: iteration 44170/ 115203 | consumed samples: 11307520 | consumed tokens: 23157800960 | elapsed time per iteration (s): 0.56 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 2.755114E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.282 | TFLOPs: 43.41 | +7: iteration 44180/ 115203 | consumed samples: 11310080 | consumed tokens: 23163043840 | elapsed time per iteration (s): 0.57 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 2.776996E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.776 | TFLOPs: 42.88 | +7: iteration 44190/ 115203 | consumed samples: 11312640 | consumed tokens: 23168286720 | elapsed time per iteration (s): 0.55 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 2.776155E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.803 | TFLOPs: 44.03 | +7: iteration 44200/ 115203 | consumed samples: 11315200 | consumed tokens: 23173529600 | elapsed time per iteration (s): 0.57 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 2.740511E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.556 | TFLOPs: 42.57 | +7: iteration 44210/ 115203 | consumed samples: 11317760 | consumed tokens: 23178772480 | elapsed time per iteration (s): 0.57 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 2.771407E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.002 | TFLOPs: 43.09 | +7: iteration 44220/ 115203 | consumed samples: 11320320 | consumed tokens: 23184015360 | elapsed time per iteration (s): 0.56 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 2.759405E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.152 | TFLOPs: 43.20 | +7: iteration 44230/ 115203 | consumed samples: 11322880 | consumed tokens: 23189258240 | elapsed time per iteration (s): 0.56 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 2.761885E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.538 | TFLOPs: 43.62 | +7: iteration 44240/ 115203 | consumed samples: 11325440 | consumed tokens: 23194501120 | elapsed time per iteration (s): 0.56 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 2.758927E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.008 | TFLOPs: 43.48 | +7: iteration 44250/ 115203 | consumed samples: 11328000 | consumed tokens: 23199744000 | elapsed time per iteration (s): 0.56 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 2.758342E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.545 | TFLOPs: 43.43 | +7: iteration 44260/ 115203 | consumed samples: 11330560 | consumed tokens: 23204986880 | elapsed time per iteration (s): 0.56 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 2.767566E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.404 | TFLOPs: 43.23 | +7: iteration 44270/ 115203 | consumed samples: 11333120 | consumed tokens: 23210229760 | elapsed time per iteration (s): 0.56 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 2.762119E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.208 | TFLOPs: 43.40 | +7: iteration 44280/ 115203 | consumed samples: 11335680 | consumed tokens: 23215472640 | elapsed time per iteration (s): 0.56 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 2.760997E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.176 | TFLOPs: 43.97 | +7: iteration 44290/ 115203 | consumed samples: 11338240 | consumed tokens: 23220715520 | elapsed time per iteration (s): 0.58 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 2.760053E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.312 | TFLOPs: 41.79 | +7: iteration 44300/ 115203 | consumed samples: 11340800 | consumed tokens: 23225958400 | elapsed time per iteration (s): 0.56 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 2.767108E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.931 | TFLOPs: 43.75 | +7: iteration 44310/ 115203 | consumed samples: 11343360 | consumed tokens: 23231201280 | elapsed time per iteration (s): 0.55 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 2.762679E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.648 | TFLOPs: 44.01 | +7: iteration 44320/ 115203 | consumed samples: 11345920 | consumed tokens: 23236444160 | elapsed time per iteration (s): 0.56 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 2.761653E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.274 | TFLOPs: 43.50 | +7: iteration 44330/ 115203 | consumed samples: 11348480 | consumed tokens: 23241687040 | elapsed time per iteration (s): 0.56 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 2.758668E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.547 | TFLOPs: 43.62 | +7: iteration 44340/ 115203 | consumed samples: 11351040 | consumed tokens: 23246929920 | elapsed time per iteration (s): 0.56 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 2.774088E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.078 | TFLOPs: 43.39 | +7: iteration 44350/ 115203 | consumed samples: 11353600 | consumed tokens: 23252172800 | elapsed time per iteration (s): 0.56 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 2.768080E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.451 | TFLOPs: 43.52 | +7: iteration 44360/ 115203 | consumed samples: 11356160 | consumed tokens: 23257415680 | elapsed time per iteration (s): 0.55 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 2.757548E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.541 | TFLOPs: 44.00 | +7: iteration 44370/ 115203 | consumed samples: 11358720 | consumed tokens: 23262658560 | elapsed time per iteration (s): 0.57 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 2.758278E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.422 | TFLOPs: 42.47 | +7: iteration 44380/ 115203 | consumed samples: 11361280 | consumed tokens: 23267901440 | elapsed time per iteration (s): 0.56 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 2.764190E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.406 | TFLOPs: 43.89 | +7: iteration 44390/ 115203 | consumed samples: 11363840 | consumed tokens: 23273144320 | elapsed time per iteration (s): 0.56 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 2.762525E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.086 | TFLOPs: 43.77 | +7: iteration 44400/ 115203 | consumed samples: 11366400 | consumed tokens: 23278387200 | elapsed time per iteration (s): 0.56 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 2.764281E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.727 | TFLOPs: 43.45 | +7: iteration 44410/ 115203 | consumed samples: 11368960 | consumed tokens: 23283630080 | elapsed time per iteration (s): 0.56 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 2.776033E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.547 | TFLOPs: 43.72 | +7: iteration 44420/ 115203 | consumed samples: 11371520 | consumed tokens: 23288872960 | elapsed time per iteration (s): 0.56 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 2.762919E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.656 | TFLOPs: 43.44 | +7: iteration 44430/ 115203 | consumed samples: 11374080 | consumed tokens: 23294115840 | elapsed time per iteration (s): 0.56 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 2.772012E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.910 | TFLOPs: 43.37 | +7: iteration 44440/ 115203 | consumed samples: 11376640 | consumed tokens: 23299358720 | elapsed time per iteration (s): 0.56 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 2.773911E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.681 | TFLOPs: 43.63 | +7: iteration 44450/ 115203 | consumed samples: 11379200 | consumed tokens: 23304601600 | elapsed time per iteration (s): 0.57 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 2.766373E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.421 | TFLOPs: 43.13 | +7: iteration 44460/ 115203 | consumed samples: 11381760 | consumed tokens: 23309844480 | elapsed time per iteration (s): 0.57 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 2.765545E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.288 | TFLOPs: 42.93 | +7: iteration 44470/ 115203 | consumed samples: 11384320 | consumed tokens: 23315087360 | elapsed time per iteration (s): 0.56 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 2.755804E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.819 | TFLOPs: 43.46 | +7: iteration 44480/ 115203 | consumed samples: 11386880 | consumed tokens: 23320330240 | elapsed time per iteration (s): 0.56 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 2.774439E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.451 | TFLOPs: 43.42 | +7: iteration 44490/ 115203 | consumed samples: 11389440 | consumed tokens: 23325573120 | elapsed time per iteration (s): 0.55 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 2.749378E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 44500/ 115203 | consumed samples: 11392000 | consumed tokens: 23330816000 | elapsed time per iteration (s): 0.57 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 2.768646E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.659 | TFLOPs: 42.68 | +7: iteration 44510/ 115203 | consumed samples: 11394560 | consumed tokens: 23336058880 | elapsed time per iteration (s): 0.56 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 2.774096E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.643 | TFLOPs: 43.63 | +7: iteration 44520/ 115203 | consumed samples: 11397120 | consumed tokens: 23341301760 | elapsed time per iteration (s): 0.56 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 2.755443E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.665 | TFLOPs: 43.44 | +7: iteration 44530/ 115203 | consumed samples: 11399680 | consumed tokens: 23346544640 | elapsed time per iteration (s): 0.56 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 2.765608E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.828 | TFLOPs: 43.46 | +7: iteration 44540/ 115203 | consumed samples: 11402240 | consumed tokens: 23351787520 | elapsed time per iteration (s): 0.56 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 2.758297E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.966 | TFLOPs: 43.38 | +7: iteration 44550/ 115203 | consumed samples: 11404800 | consumed tokens: 23357030400 | elapsed time per iteration (s): 0.56 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 2.762725E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.698 | TFLOPs: 43.54 | +7: iteration 44560/ 115203 | consumed samples: 11407360 | consumed tokens: 23362273280 | elapsed time per iteration (s): 0.56 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 2.752387E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.923 | TFLOPs: 43.66 | +7: iteration 44570/ 115203 | consumed samples: 11409920 | consumed tokens: 23367516160 | elapsed time per iteration (s): 0.56 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 2.780359E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.810 | TFLOPs: 43.65 | +7: iteration 44580/ 115203 | consumed samples: 11412480 | consumed tokens: 23372759040 | elapsed time per iteration (s): 0.55 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 2.756349E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 44590/ 115203 | consumed samples: 11415040 | consumed tokens: 23378001920 | elapsed time per iteration (s): 0.56 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 2.748816E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.979 | TFLOPs: 43.57 | +7: iteration 44600/ 115203 | consumed samples: 11417600 | consumed tokens: 23383244800 | elapsed time per iteration (s): 0.56 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 2.758701E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.301 | TFLOPs: 43.69 | +7: iteration 44610/ 115203 | consumed samples: 11420160 | consumed tokens: 23388487680 | elapsed time per iteration (s): 0.56 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 2.773778E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.728 | TFLOPs: 43.73 | +7: iteration 44620/ 115203 | consumed samples: 11422720 | consumed tokens: 23393730560 | elapsed time per iteration (s): 0.55 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 2.759922E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 44630/ 115203 | consumed samples: 11425280 | consumed tokens: 23398973440 | elapsed time per iteration (s): 0.56 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 2.769309E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.078 | TFLOPs: 43.67 | +7: iteration 44640/ 115203 | consumed samples: 11427840 | consumed tokens: 23404216320 | elapsed time per iteration (s): 0.56 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 2.759029E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.502 | TFLOPs: 43.52 | +7: iteration 44650/ 115203 | consumed samples: 11430400 | consumed tokens: 23409459200 | elapsed time per iteration (s): 0.55 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 2.761407E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.608 | TFLOPs: 44.01 | +7: iteration 44660/ 115203 | consumed samples: 11432960 | consumed tokens: 23414702080 | elapsed time per iteration (s): 0.56 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 2.769559E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.728 | TFLOPs: 43.45 | +7: iteration 44670/ 115203 | consumed samples: 11435520 | consumed tokens: 23419944960 | elapsed time per iteration (s): 0.57 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 2.761254E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.022 | TFLOPs: 42.90 | +7: iteration 44680/ 115203 | consumed samples: 11438080 | consumed tokens: 23425187840 | elapsed time per iteration (s): 0.56 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 2.767656E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.580 | TFLOPs: 43.72 | +7: iteration 44690/ 115203 | consumed samples: 11440640 | consumed tokens: 23430430720 | elapsed time per iteration (s): 0.56 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 2.769210E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.193 | TFLOPs: 43.87 | +7: iteration 44700/ 115203 | consumed samples: 11443200 | consumed tokens: 23435673600 | elapsed time per iteration (s): 0.56 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 2.749430E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.602 | TFLOPs: 43.72 | +7: iteration 44710/ 115203 | consumed samples: 11445760 | consumed tokens: 23440916480 | elapsed time per iteration (s): 0.56 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 2.778507E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.648 | TFLOPs: 43.73 | +7: iteration 44720/ 115203 | consumed samples: 11448320 | consumed tokens: 23446159360 | elapsed time per iteration (s): 0.56 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 2.756531E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.429 | TFLOPs: 43.23 | +7: iteration 44730/ 115203 | consumed samples: 11450880 | consumed tokens: 23451402240 | elapsed time per iteration (s): 0.56 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 2.768400E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.797 | TFLOPs: 43.74 | +7: iteration 44740/ 115203 | consumed samples: 11453440 | consumed tokens: 23456645120 | elapsed time per iteration (s): 0.57 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 2.747945E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.946 | TFLOPs: 42.99 | +7: iteration 44750/ 115203 | consumed samples: 11456000 | consumed tokens: 23461888000 | elapsed time per iteration (s): 0.57 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 2.742761E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.867 | TFLOPs: 43.18 | +7: iteration 44760/ 115203 | consumed samples: 11458560 | consumed tokens: 23467130880 | elapsed time per iteration (s): 0.56 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 2.756989E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.247 | TFLOPs: 43.50 | +7: iteration 44770/ 115203 | consumed samples: 11461120 | consumed tokens: 23472373760 | elapsed time per iteration (s): 0.58 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 2.749495E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.655 | TFLOPs: 42.20 | +7: iteration 44780/ 115203 | consumed samples: 11463680 | consumed tokens: 23477616640 | elapsed time per iteration (s): 0.55 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 2.769040E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 44790/ 115203 | consumed samples: 11466240 | consumed tokens: 23482859520 | elapsed time per iteration (s): 0.56 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 2.753805E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.986 | TFLOPs: 43.47 | +7: iteration 44800/ 115203 | consumed samples: 11468800 | consumed tokens: 23488102400 | elapsed time per iteration (s): 0.56 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 2.756591E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.206 | TFLOPs: 43.59 | +7: iteration 44810/ 115203 | consumed samples: 11471360 | consumed tokens: 23493345280 | elapsed time per iteration (s): 0.55 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 2.769663E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.545 | TFLOPs: 44.00 | +7: iteration 44820/ 115203 | consumed samples: 11473920 | consumed tokens: 23498588160 | elapsed time per iteration (s): 0.55 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 2.750831E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 44830/ 115203 | consumed samples: 11476480 | consumed tokens: 23503831040 | elapsed time per iteration (s): 0.56 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 2.769122E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.958 | TFLOPs: 43.76 | +7: iteration 44840/ 115203 | consumed samples: 11479040 | consumed tokens: 23509073920 | elapsed time per iteration (s): 0.56 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 2.762980E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.769 | TFLOPs: 43.26 | +7: iteration 44850/ 115203 | consumed samples: 11481600 | consumed tokens: 23514316800 | elapsed time per iteration (s): 0.55 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 2.759783E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.446 | TFLOPs: 43.99 | +7: iteration 44860/ 115203 | consumed samples: 11484160 | consumed tokens: 23519559680 | elapsed time per iteration (s): 0.56 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 2.751139E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.840 | TFLOPs: 43.36 | +7: iteration 44870/ 115203 | consumed samples: 11486720 | consumed tokens: 23524802560 | elapsed time per iteration (s): 0.56 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 2.762278E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.160 | TFLOPs: 43.59 | +7: iteration 44880/ 115203 | consumed samples: 11489280 | consumed tokens: 23530045440 | elapsed time per iteration (s): 0.56 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 2.768071E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.116 | TFLOPs: 43.39 | +7: iteration 44890/ 115203 | consumed samples: 11491840 | consumed tokens: 23535288320 | elapsed time per iteration (s): 0.57 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 2.758379E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.514 | TFLOPs: 42.76 | +7: iteration 44900/ 115203 | consumed samples: 11494400 | consumed tokens: 23540531200 | elapsed time per iteration (s): 0.56 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 2.776541E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.393 | TFLOPs: 43.42 | +7: iteration 44910/ 115203 | consumed samples: 11496960 | consumed tokens: 23545774080 | elapsed time per iteration (s): 0.57 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 2.754375E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.847 | TFLOPs: 42.60 | +7: iteration 44920/ 115203 | consumed samples: 11499520 | consumed tokens: 23551016960 | elapsed time per iteration (s): 0.56 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 2.785846E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.180 | TFLOPs: 43.40 | +7: iteration 44930/ 115203 | consumed samples: 11502080 | consumed tokens: 23556259840 | elapsed time per iteration (s): 0.55 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 2.758422E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 44940/ 115203 | consumed samples: 11504640 | consumed tokens: 23561502720 | elapsed time per iteration (s): 0.56 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 2.751043E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.455 | TFLOPs: 43.33 | +7: iteration 44950/ 115203 | consumed samples: 11507200 | consumed tokens: 23566745600 | elapsed time per iteration (s): 0.56 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 2.765319E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.647 | TFLOPs: 43.73 | +7: iteration 44960/ 115203 | consumed samples: 11509760 | consumed tokens: 23571988480 | elapsed time per iteration (s): 0.56 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 2.769302E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.163 | TFLOPs: 43.97 | +7: iteration 44970/ 115203 | consumed samples: 11512320 | consumed tokens: 23577231360 | elapsed time per iteration (s): 0.56 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 2.764510E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.148 | TFLOPs: 43.20 | +7: iteration 44980/ 115203 | consumed samples: 11514880 | consumed tokens: 23582474240 | elapsed time per iteration (s): 0.57 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 2.748675E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.852 | TFLOPs: 42.98 | +7: iteration 44990/ 115203 | consumed samples: 11517440 | consumed tokens: 23587717120 | elapsed time per iteration (s): 0.56 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 2.757425E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.833 | TFLOPs: 43.46 | +7: iteration 45000/ 115203 | consumed samples: 11520000 | consumed tokens: 23592960000 | elapsed time per iteration (s): 0.57 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 2.769111E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.055 | TFLOPs: 43.10 | +7: iteration 45010/ 115203 | consumed samples: 11522560 | consumed tokens: 23598202880 | elapsed time per iteration (s): 0.57 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 2.761333E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.689 | TFLOPs: 43.16 | +7: iteration 45020/ 115203 | consumed samples: 11525120 | consumed tokens: 23603445760 | elapsed time per iteration (s): 0.56 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 2.762507E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.683 | TFLOPs: 43.54 | +7: iteration 45030/ 115203 | consumed samples: 11527680 | consumed tokens: 23608688640 | elapsed time per iteration (s): 0.55 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 2.755433E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 45040/ 115203 | consumed samples: 11530240 | consumed tokens: 23613931520 | elapsed time per iteration (s): 0.56 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 2.763447E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.472 | TFLOPs: 43.71 | +7: iteration 45050/ 115203 | consumed samples: 11532800 | consumed tokens: 23619174400 | elapsed time per iteration (s): 0.56 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 2.758110E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.770 | TFLOPs: 43.36 | +7: iteration 45060/ 115203 | consumed samples: 11535360 | consumed tokens: 23624417280 | elapsed time per iteration (s): 0.57 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 2.752745E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.039 | TFLOPs: 42.91 | +7: iteration 45070/ 115203 | consumed samples: 11537920 | consumed tokens: 23629660160 | elapsed time per iteration (s): 0.56 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 2.763023E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.118 | TFLOPs: 43.58 | +7: iteration 45080/ 115203 | consumed samples: 11540480 | consumed tokens: 23634903040 | elapsed time per iteration (s): 0.56 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 2.753073E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.043 | TFLOPs: 43.48 | +7: iteration 45090/ 115203 | consumed samples: 11543040 | consumed tokens: 23640145920 | elapsed time per iteration (s): 0.56 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 2.756214E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.881 | TFLOPs: 43.46 | +7: iteration 45100/ 115203 | consumed samples: 11545600 | consumed tokens: 23645388800 | elapsed time per iteration (s): 0.59 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 2.722478E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.648 | TFLOPs: 41.06 | +7: iteration 45110/ 115203 | consumed samples: 11548160 | consumed tokens: 23650631680 | elapsed time per iteration (s): 0.56 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 2.766943E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.898 | TFLOPs: 43.94 | +7: iteration 45120/ 115203 | consumed samples: 11550720 | consumed tokens: 23655874560 | elapsed time per iteration (s): 0.56 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 2.765150E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.800 | TFLOPs: 43.84 | +7: iteration 45130/ 115203 | consumed samples: 11553280 | consumed tokens: 23661117440 | elapsed time per iteration (s): 0.57 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 2.764351E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.513 | TFLOPs: 42.86 | +7: iteration 45140/ 115203 | consumed samples: 11555840 | consumed tokens: 23666360320 | elapsed time per iteration (s): 0.56 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 2.755499E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.728 | TFLOPs: 43.54 | +7: iteration 45150/ 115203 | consumed samples: 11558400 | consumed tokens: 23671603200 | elapsed time per iteration (s): 0.56 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 2.767768E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.799 | TFLOPs: 43.26 | +7: iteration 45160/ 115203 | consumed samples: 11560960 | consumed tokens: 23676846080 | elapsed time per iteration (s): 0.56 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 2.756129E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.977 | TFLOPs: 43.38 | +7: iteration 45170/ 115203 | consumed samples: 11563520 | consumed tokens: 23682088960 | elapsed time per iteration (s): 0.57 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 2.758243E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.879 | TFLOPs: 42.61 | +7: iteration 45180/ 115203 | consumed samples: 11566080 | consumed tokens: 23687331840 | elapsed time per iteration (s): 0.56 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 2.749526E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.033 | TFLOPs: 43.29 | +7: iteration 45190/ 115203 | consumed samples: 11568640 | consumed tokens: 23692574720 | elapsed time per iteration (s): 0.57 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 2.766177E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.600 | TFLOPs: 42.67 | +7: iteration 45200/ 115203 | consumed samples: 11571200 | consumed tokens: 23697817600 | elapsed time per iteration (s): 0.56 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 2.740991E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.039 | TFLOPs: 43.38 | +7: iteration 45210/ 115203 | consumed samples: 11573760 | consumed tokens: 23703060480 | elapsed time per iteration (s): 0.56 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 2.761496E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.435 | TFLOPs: 43.23 | +7: iteration 45220/ 115203 | consumed samples: 11576320 | consumed tokens: 23708303360 | elapsed time per iteration (s): 0.56 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 2.752957E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.173 | TFLOPs: 43.59 | +7: iteration 45230/ 115203 | consumed samples: 11578880 | consumed tokens: 23713546240 | elapsed time per iteration (s): 0.57 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 2.760604E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.860 | TFLOPs: 43.08 | +7: iteration 45240/ 115203 | consumed samples: 11581440 | consumed tokens: 23718789120 | elapsed time per iteration (s): 0.58 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 2.761259E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.139 | TFLOPs: 42.44 | +7: iteration 45250/ 115203 | consumed samples: 11584000 | consumed tokens: 23724032000 | elapsed time per iteration (s): 0.57 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 2.763154E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.055 | TFLOPs: 43.10 | +7: iteration 45260/ 115203 | consumed samples: 11586560 | consumed tokens: 23729274880 | elapsed time per iteration (s): 0.55 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 2.766801E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 45270/ 115203 | consumed samples: 11589120 | consumed tokens: 23734517760 | elapsed time per iteration (s): 0.58 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 2.749223E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.134 | TFLOPs: 42.25 | +7: iteration 45280/ 115203 | consumed samples: 11591680 | consumed tokens: 23739760640 | elapsed time per iteration (s): 0.57 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 2.731368E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.402 | TFLOPs: 42.75 | +7: iteration 45290/ 115203 | consumed samples: 11594240 | consumed tokens: 23745003520 | elapsed time per iteration (s): 0.57 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 2.747488E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.065 | TFLOPs: 43.10 | +7: iteration 45300/ 115203 | consumed samples: 11596800 | consumed tokens: 23750246400 | elapsed time per iteration (s): 0.57 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 2.771099E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.841 | TFLOPs: 42.79 | +7: iteration 45310/ 115203 | consumed samples: 11599360 | consumed tokens: 23755489280 | elapsed time per iteration (s): 0.59 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 2.769820E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.011 | TFLOPs: 41.57 | +7: iteration 45320/ 115203 | consumed samples: 11601920 | consumed tokens: 23760732160 | elapsed time per iteration (s): 0.58 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 2.752567E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.037 | TFLOPs: 41.95 | +7: iteration 45330/ 115203 | consumed samples: 11604480 | consumed tokens: 23765975040 | elapsed time per iteration (s): 0.56 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 2.770766E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.234 | TFLOPs: 43.59 | +7: iteration 45340/ 115203 | consumed samples: 11607040 | consumed tokens: 23771217920 | elapsed time per iteration (s): 0.57 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 2.758101E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.355 | TFLOPs: 42.84 | +7: iteration 45350/ 115203 | consumed samples: 11609600 | consumed tokens: 23776460800 | elapsed time per iteration (s): 0.57 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 2.764822E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.874 | TFLOPs: 42.89 | +7: iteration 45360/ 115203 | consumed samples: 11612160 | consumed tokens: 23781703680 | elapsed time per iteration (s): 0.57 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 2.752632E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.229 | TFLOPs: 42.54 | +7: iteration 45370/ 115203 | consumed samples: 11614720 | consumed tokens: 23786946560 | elapsed time per iteration (s): 0.59 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 2.754264E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.593 | TFLOPs: 41.43 | +7: iteration 45380/ 115203 | consumed samples: 11617280 | consumed tokens: 23792189440 | elapsed time per iteration (s): 0.57 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 2.748287E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.907 | TFLOPs: 42.61 | +7: iteration 45390/ 115203 | consumed samples: 11619840 | consumed tokens: 23797432320 | elapsed time per iteration (s): 0.57 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 2.744539E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.014 | TFLOPs: 42.62 | +7: iteration 45400/ 115203 | consumed samples: 11622400 | consumed tokens: 23802675200 | elapsed time per iteration (s): 0.58 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 2.770048E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.844 | TFLOPs: 42.03 | +7: iteration 45410/ 115203 | consumed samples: 11624960 | consumed tokens: 23807918080 | elapsed time per iteration (s): 0.57 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 2.775140E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.038 | TFLOPs: 43.19 | +7: iteration 45420/ 115203 | consumed samples: 11627520 | consumed tokens: 23813160960 | elapsed time per iteration (s): 0.57 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 2.751304E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.585 | TFLOPs: 42.67 | +7: iteration 45430/ 115203 | consumed samples: 11630080 | consumed tokens: 23818403840 | elapsed time per iteration (s): 0.56 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 2.764833E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.382 | TFLOPs: 43.51 | +7: iteration 45440/ 115203 | consumed samples: 11632640 | consumed tokens: 23823646720 | elapsed time per iteration (s): 0.57 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 2.758346E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.603 | TFLOPs: 42.58 | +7: iteration 45450/ 115203 | consumed samples: 11635200 | consumed tokens: 23828889600 | elapsed time per iteration (s): 0.57 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 2.759407E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.370 | TFLOPs: 43.03 | +7: iteration 45460/ 115203 | consumed samples: 11637760 | consumed tokens: 23834132480 | elapsed time per iteration (s): 0.57 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 2.754667E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.733 | TFLOPs: 42.88 | +7: iteration 45470/ 115203 | consumed samples: 11640320 | consumed tokens: 23839375360 | elapsed time per iteration (s): 0.58 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 2.756341E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.562 | TFLOPs: 42.29 | +7: iteration 45480/ 115203 | consumed samples: 11642880 | consumed tokens: 23844618240 | elapsed time per iteration (s): 0.57 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 2.768808E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.617 | TFLOPs: 42.96 | +7: iteration 45490/ 115203 | consumed samples: 11645440 | consumed tokens: 23849861120 | elapsed time per iteration (s): 0.58 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 2.764753E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.160 | TFLOPs: 42.35 | +7: iteration 45500/ 115203 | consumed samples: 11648000 | consumed tokens: 23855104000 | elapsed time per iteration (s): 0.57 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 2.761190E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.238 | TFLOPs: 43.12 | +7: iteration 45510/ 115203 | consumed samples: 11650560 | consumed tokens: 23860346880 | elapsed time per iteration (s): 0.57 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 2.754867E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.031 | TFLOPs: 43.10 | +7: iteration 45520/ 115203 | consumed samples: 11653120 | consumed tokens: 23865589760 | elapsed time per iteration (s): 0.56 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 2.746677E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.551 | TFLOPs: 43.62 | +7: iteration 45530/ 115203 | consumed samples: 11655680 | consumed tokens: 23870832640 | elapsed time per iteration (s): 0.59 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 2.757288E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.934 | TFLOPs: 41.66 | +7: iteration 45540/ 115203 | consumed samples: 11658240 | consumed tokens: 23876075520 | elapsed time per iteration (s): 0.57 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 2.755515E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.738 | TFLOPs: 42.78 | +7: iteration 45550/ 115203 | consumed samples: 11660800 | consumed tokens: 23881318400 | elapsed time per iteration (s): 0.56 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 2.766112E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.960 | TFLOPs: 43.38 | +7: iteration 45560/ 115203 | consumed samples: 11663360 | consumed tokens: 23886561280 | elapsed time per iteration (s): 0.56 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 2.745662E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.635 | TFLOPs: 43.25 | +7: iteration 45570/ 115203 | consumed samples: 11665920 | consumed tokens: 23891804160 | elapsed time per iteration (s): 0.57 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 2.771558E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.619 | TFLOPs: 42.87 | +7: iteration 45580/ 115203 | consumed samples: 11668480 | consumed tokens: 23897047040 | elapsed time per iteration (s): 0.57 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 2.764949E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.976 | TFLOPs: 42.71 | +7: iteration 45590/ 115203 | consumed samples: 11671040 | consumed tokens: 23902289920 | elapsed time per iteration (s): 0.56 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 2.765024E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.677 | TFLOPs: 43.44 | +7: iteration 45600/ 115203 | consumed samples: 11673600 | consumed tokens: 23907532800 | elapsed time per iteration (s): 0.57 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 2.749734E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.722 | TFLOPs: 42.88 | +7: iteration 45610/ 115203 | consumed samples: 11676160 | consumed tokens: 23912775680 | elapsed time per iteration (s): 0.57 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 2.747652E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.026 | TFLOPs: 42.71 | +7: iteration 45620/ 115203 | consumed samples: 11678720 | consumed tokens: 23918018560 | elapsed time per iteration (s): 0.57 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 2.762675E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.180 | TFLOPs: 42.73 | +7: iteration 45630/ 115203 | consumed samples: 11681280 | consumed tokens: 23923261440 | elapsed time per iteration (s): 0.57 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 2.761931E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.343 | TFLOPs: 42.94 | +7: iteration 45640/ 115203 | consumed samples: 11683840 | consumed tokens: 23928504320 | elapsed time per iteration (s): 0.55 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 2.781836E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.764 | TFLOPs: 44.02 | +7: iteration 45650/ 115203 | consumed samples: 11686400 | consumed tokens: 23933747200 | elapsed time per iteration (s): 0.56 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 2.768264E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.103 | TFLOPs: 43.87 | +7: iteration 45660/ 115203 | consumed samples: 11688960 | consumed tokens: 23938990080 | elapsed time per iteration (s): 0.56 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 2.757722E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.932 | TFLOPs: 43.47 | +7: iteration 45670/ 115203 | consumed samples: 11691520 | consumed tokens: 23944232960 | elapsed time per iteration (s): 0.56 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 2.749978E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.853 | TFLOPs: 43.46 | +7: iteration 45680/ 115203 | consumed samples: 11694080 | consumed tokens: 23949475840 | elapsed time per iteration (s): 0.56 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 2.744512E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.503 | TFLOPs: 43.33 | +7: iteration 45690/ 115203 | consumed samples: 11696640 | consumed tokens: 23954718720 | elapsed time per iteration (s): 0.56 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 2.782416E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.021 | TFLOPs: 43.76 | +7: iteration 45700/ 115203 | consumed samples: 11699200 | consumed tokens: 23959961600 | elapsed time per iteration (s): 0.57 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 2.763160E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.252 | TFLOPs: 42.93 | +7: iteration 45710/ 115203 | consumed samples: 11701760 | consumed tokens: 23965204480 | elapsed time per iteration (s): 0.57 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 2.761030E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.472 | TFLOPs: 42.57 | +7: iteration 45720/ 115203 | consumed samples: 11704320 | consumed tokens: 23970447360 | elapsed time per iteration (s): 0.57 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 2.761771E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.272 | TFLOPs: 42.93 | +7: iteration 45730/ 115203 | consumed samples: 11706880 | consumed tokens: 23975690240 | elapsed time per iteration (s): 0.56 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 2.757236E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.359 | TFLOPs: 43.32 | +7: iteration 45740/ 115203 | consumed samples: 11709440 | consumed tokens: 23980933120 | elapsed time per iteration (s): 0.57 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 2.758174E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.401 | TFLOPs: 43.04 | +7: iteration 45750/ 115203 | consumed samples: 11712000 | consumed tokens: 23986176000 | elapsed time per iteration (s): 0.58 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 2.758931E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.382 | TFLOPs: 41.99 | +7: iteration 45760/ 115203 | consumed samples: 11714560 | consumed tokens: 23991418880 | elapsed time per iteration (s): 0.56 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 2.752865E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.843 | TFLOPs: 43.55 | +7: iteration 45770/ 115203 | consumed samples: 11717120 | consumed tokens: 23996661760 | elapsed time per iteration (s): 0.56 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 2.754522E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.038 | TFLOPs: 43.76 | +7: iteration 45780/ 115203 | consumed samples: 11719680 | consumed tokens: 24001904640 | elapsed time per iteration (s): 0.56 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 2.760552E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.556 | TFLOPs: 43.53 | +7: iteration 45790/ 115203 | consumed samples: 11722240 | consumed tokens: 24007147520 | elapsed time per iteration (s): 0.56 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 2.748979E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.117 | TFLOPs: 43.39 | +7: iteration 45800/ 115203 | consumed samples: 11724800 | consumed tokens: 24012390400 | elapsed time per iteration (s): 0.55 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 2.747851E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.611 | TFLOPs: 44.01 | +7: iteration 45810/ 115203 | consumed samples: 11727360 | consumed tokens: 24017633280 | elapsed time per iteration (s): 0.56 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 2.754876E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.002 | TFLOPs: 43.28 | +7: iteration 45820/ 115203 | consumed samples: 11729920 | consumed tokens: 24022876160 | elapsed time per iteration (s): 0.56 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 2.765869E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.582 | TFLOPs: 43.53 | +7: iteration 45830/ 115203 | consumed samples: 11732480 | consumed tokens: 24028119040 | elapsed time per iteration (s): 0.57 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 2.753617E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.380 | TFLOPs: 42.56 | +7: iteration 45840/ 115203 | consumed samples: 11735040 | consumed tokens: 24033361920 | elapsed time per iteration (s): 0.58 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 2.754128E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.796 | TFLOPs: 41.93 | +7: iteration 45850/ 115203 | consumed samples: 11737600 | consumed tokens: 24038604800 | elapsed time per iteration (s): 0.56 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 2.758246E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.337 | TFLOPs: 43.89 | +7: iteration 45860/ 115203 | consumed samples: 11740160 | consumed tokens: 24043847680 | elapsed time per iteration (s): 0.57 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 2.755829E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.597 | TFLOPs: 42.67 | +7: iteration 45870/ 115203 | consumed samples: 11742720 | consumed tokens: 24049090560 | elapsed time per iteration (s): 0.56 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 2.752268E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.192 | TFLOPs: 43.59 | +7: iteration 45880/ 115203 | consumed samples: 11745280 | consumed tokens: 24054333440 | elapsed time per iteration (s): 0.56 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 2.760243E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.837 | TFLOPs: 43.75 | +7: iteration 45890/ 115203 | consumed samples: 11747840 | consumed tokens: 24059576320 | elapsed time per iteration (s): 0.56 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 2.758525E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.902 | TFLOPs: 43.75 | +7: iteration 45900/ 115203 | consumed samples: 11750400 | consumed tokens: 24064819200 | elapsed time per iteration (s): 0.55 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 2.761928E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.620 | TFLOPs: 44.01 | +7: iteration 45910/ 115203 | consumed samples: 11752960 | consumed tokens: 24070062080 | elapsed time per iteration (s): 0.56 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 2.755020E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.125 | TFLOPs: 43.58 | +7: iteration 45920/ 115203 | consumed samples: 11755520 | consumed tokens: 24075304960 | elapsed time per iteration (s): 0.57 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 2.762029E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.926 | TFLOPs: 42.99 | +7: iteration 45930/ 115203 | consumed samples: 11758080 | consumed tokens: 24080547840 | elapsed time per iteration (s): 0.57 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 2.753341E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.794 | TFLOPs: 43.17 | +7: iteration 45940/ 115203 | consumed samples: 11760640 | consumed tokens: 24085790720 | elapsed time per iteration (s): 0.56 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 2.757215E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.888 | TFLOPs: 43.75 | +7: iteration 45950/ 115203 | consumed samples: 11763200 | consumed tokens: 24091033600 | elapsed time per iteration (s): 0.56 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 2.756827E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.559 | TFLOPs: 43.62 | +7: iteration 45960/ 115203 | consumed samples: 11765760 | consumed tokens: 24096276480 | elapsed time per iteration (s): 0.57 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 2.749879E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.028 | TFLOPs: 43.00 | +7: iteration 45970/ 115203 | consumed samples: 11768320 | consumed tokens: 24101519360 | elapsed time per iteration (s): 0.56 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 2.751334E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.552 | TFLOPs: 43.53 | +7: iteration 45980/ 115203 | consumed samples: 11770880 | consumed tokens: 24106762240 | elapsed time per iteration (s): 0.56 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 2.735732E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.780 | TFLOPs: 43.45 | +7: iteration 45990/ 115203 | consumed samples: 11773440 | consumed tokens: 24112005120 | elapsed time per iteration (s): 0.55 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 2.739312E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +0: [2023-03-16 20:03:01,002] [INFO] [logging.py:68:log_dist] [Rank 0] step=46000, skipped=0, lr=[0.0001396270779841331, 0.0001396270779841331, 0.0001396270779841331], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 46000/ 115203 | consumed samples: 11776000 | consumed tokens: 24117248000 | elapsed time per iteration (s): 0.56 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 2.763649E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.534 | TFLOPs: 43.53 | +0: steps: 46000 loss: 2.7944 iter time (s): 0.562 samples/sec: 455.322 +7: iteration 46010/ 115203 | consumed samples: 11778560 | consumed tokens: 24122490880 | elapsed time per iteration (s): 0.56 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 2.759281E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.867 | TFLOPs: 43.94 | +7: iteration 46020/ 115203 | consumed samples: 11781120 | consumed tokens: 24127733760 | elapsed time per iteration (s): 0.56 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 2.755022E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.649 | TFLOPs: 43.44 | +7: iteration 46030/ 115203 | consumed samples: 11783680 | consumed tokens: 24132976640 | elapsed time per iteration (s): 0.55 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 2.764948E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 46040/ 115203 | consumed samples: 11786240 | consumed tokens: 24138219520 | elapsed time per iteration (s): 0.56 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 2.748497E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.214 | TFLOPs: 43.97 | +7: iteration 46050/ 115203 | consumed samples: 11788800 | consumed tokens: 24143462400 | elapsed time per iteration (s): 0.57 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 2.769321E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.754 | TFLOPs: 42.59 | +7: iteration 46060/ 115203 | consumed samples: 11791360 | consumed tokens: 24148705280 | elapsed time per iteration (s): 0.57 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 2.750458E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.285 | TFLOPs: 43.12 | +7: iteration 46070/ 115203 | consumed samples: 11793920 | consumed tokens: 24153948160 | elapsed time per iteration (s): 0.56 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 2.747968E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.634 | TFLOPs: 43.54 | +7: iteration 46080/ 115203 | consumed samples: 11796480 | consumed tokens: 24159191040 | elapsed time per iteration (s): 0.58 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 2.759863E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.409 | TFLOPs: 42.27 | +7: iteration 46090/ 115203 | consumed samples: 11799040 | consumed tokens: 24164433920 | elapsed time per iteration (s): 0.56 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 2.758086E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.252 | TFLOPs: 43.69 | +7: iteration 46100/ 115203 | consumed samples: 11801600 | consumed tokens: 24169676800 | elapsed time per iteration (s): 0.55 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 2.752746E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 46110/ 115203 | consumed samples: 11804160 | consumed tokens: 24174919680 | elapsed time per iteration (s): 0.56 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 2.753400E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.854 | TFLOPs: 43.56 | +7: iteration 46120/ 115203 | consumed samples: 11806720 | consumed tokens: 24180162560 | elapsed time per iteration (s): 0.55 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 2.744735E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 46130/ 115203 | consumed samples: 11809280 | consumed tokens: 24185405440 | elapsed time per iteration (s): 0.56 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 2.735722E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.082 | TFLOPs: 43.39 | +7: iteration 46140/ 115203 | consumed samples: 11811840 | consumed tokens: 24190648320 | elapsed time per iteration (s): 0.56 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 2.752981E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.470 | TFLOPs: 43.52 | +7: iteration 46150/ 115203 | consumed samples: 11814400 | consumed tokens: 24195891200 | elapsed time per iteration (s): 0.56 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 2.755345E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.112 | TFLOPs: 43.58 | +7: iteration 46160/ 115203 | consumed samples: 11816960 | consumed tokens: 24201134080 | elapsed time per iteration (s): 0.56 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 2.739511E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.618 | TFLOPs: 43.72 | +7: iteration 46170/ 115203 | consumed samples: 11819520 | consumed tokens: 24206376960 | elapsed time per iteration (s): 0.56 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 2.761014E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.821 | TFLOPs: 43.74 | +7: iteration 46180/ 115203 | consumed samples: 11822080 | consumed tokens: 24211619840 | elapsed time per iteration (s): 0.57 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 2.754591E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.260 | TFLOPs: 42.55 | +7: iteration 46190/ 115203 | consumed samples: 11824640 | consumed tokens: 24216862720 | elapsed time per iteration (s): 0.57 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 2.744996E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.107 | TFLOPs: 43.01 | +7: iteration 46200/ 115203 | consumed samples: 11827200 | consumed tokens: 24222105600 | elapsed time per iteration (s): 0.56 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 2.753217E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.712 | TFLOPs: 43.64 | +7: iteration 46210/ 115203 | consumed samples: 11829760 | consumed tokens: 24227348480 | elapsed time per iteration (s): 0.56 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 2.758583E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.751 | TFLOPs: 43.93 | +7: iteration 46220/ 115203 | consumed samples: 11832320 | consumed tokens: 24232591360 | elapsed time per iteration (s): 0.56 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 2.753966E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.368 | TFLOPs: 43.22 | +7: iteration 46230/ 115203 | consumed samples: 11834880 | consumed tokens: 24237834240 | elapsed time per iteration (s): 0.55 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 2.746120E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 46240/ 115203 | consumed samples: 11837440 | consumed tokens: 24243077120 | elapsed time per iteration (s): 0.55 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 2.772753E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 46250/ 115203 | consumed samples: 11840000 | consumed tokens: 24248320000 | elapsed time per iteration (s): 0.55 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 2.752573E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 46260/ 115203 | consumed samples: 11842560 | consumed tokens: 24253562880 | elapsed time per iteration (s): 0.56 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 2.757457E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.880 | TFLOPs: 43.46 | +7: iteration 46270/ 115203 | consumed samples: 11845120 | consumed tokens: 24258805760 | elapsed time per iteration (s): 0.56 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 2.747678E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.807 | TFLOPs: 43.55 | +7: iteration 46280/ 115203 | consumed samples: 11847680 | consumed tokens: 24264048640 | elapsed time per iteration (s): 0.55 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 2.768045E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.453 | TFLOPs: 43.99 | +7: iteration 46290/ 115203 | consumed samples: 11850240 | consumed tokens: 24269291520 | elapsed time per iteration (s): 0.56 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 2.761828E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.008 | TFLOPs: 43.48 | +7: iteration 46300/ 115203 | consumed samples: 11852800 | consumed tokens: 24274534400 | elapsed time per iteration (s): 0.56 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 2.763341E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.198 | TFLOPs: 43.87 | +7: iteration 46310/ 115203 | consumed samples: 11855360 | consumed tokens: 24279777280 | elapsed time per iteration (s): 0.55 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 2.755170E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.569 | TFLOPs: 44.01 | +7: iteration 46320/ 115203 | consumed samples: 11857920 | consumed tokens: 24285020160 | elapsed time per iteration (s): 0.56 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 2.743586E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.142 | TFLOPs: 43.87 | +7: iteration 46330/ 115203 | consumed samples: 11860480 | consumed tokens: 24290263040 | elapsed time per iteration (s): 0.55 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 2.754726E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.363 | TFLOPs: 43.99 | +7: iteration 46340/ 115203 | consumed samples: 11863040 | consumed tokens: 24295505920 | elapsed time per iteration (s): 0.57 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 2.753479E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.133 | TFLOPs: 42.72 | +7: iteration 46350/ 115203 | consumed samples: 11865600 | consumed tokens: 24300748800 | elapsed time per iteration (s): 0.56 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 2.752326E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.821 | TFLOPs: 43.65 | +7: iteration 46360/ 115203 | consumed samples: 11868160 | consumed tokens: 24305991680 | elapsed time per iteration (s): 0.57 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 2.762685E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.241 | TFLOPs: 42.83 | +7: iteration 46370/ 115203 | consumed samples: 11870720 | consumed tokens: 24311234560 | elapsed time per iteration (s): 0.56 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 2.746408E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.587 | TFLOPs: 43.72 | +7: iteration 46380/ 115203 | consumed samples: 11873280 | consumed tokens: 24316477440 | elapsed time per iteration (s): 0.56 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 2.740120E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.821 | TFLOPs: 43.46 | +7: iteration 46390/ 115203 | consumed samples: 11875840 | consumed tokens: 24321720320 | elapsed time per iteration (s): 0.59 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 2.760907E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.101 | TFLOPs: 41.29 | +7: iteration 46400/ 115203 | consumed samples: 11878400 | consumed tokens: 24326963200 | elapsed time per iteration (s): 0.56 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 2.744006E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.287 | TFLOPs: 43.88 | +7: iteration 46410/ 115203 | consumed samples: 11880960 | consumed tokens: 24332206080 | elapsed time per iteration (s): 0.57 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 2.750958E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.891 | TFLOPs: 43.18 | +7: iteration 46420/ 115203 | consumed samples: 11883520 | consumed tokens: 24337448960 | elapsed time per iteration (s): 0.58 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 2.749505E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.757 | TFLOPs: 42.02 | +7: iteration 46430/ 115203 | consumed samples: 11886080 | consumed tokens: 24342691840 | elapsed time per iteration (s): 0.58 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 2.748847E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.754 | TFLOPs: 42.02 | +7: iteration 46440/ 115203 | consumed samples: 11888640 | consumed tokens: 24347934720 | elapsed time per iteration (s): 0.57 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 2.755595E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.556 | TFLOPs: 42.48 | +7: iteration 46450/ 115203 | consumed samples: 11891200 | consumed tokens: 24353177600 | elapsed time per iteration (s): 0.56 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 2.750631E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.161 | TFLOPs: 43.87 | +7: iteration 46460/ 115203 | consumed samples: 11893760 | consumed tokens: 24358420480 | elapsed time per iteration (s): 0.57 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 2.753161E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.956 | TFLOPs: 43.18 | +7: iteration 46470/ 115203 | consumed samples: 11896320 | consumed tokens: 24363663360 | elapsed time per iteration (s): 0.57 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 2.770484E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.872 | TFLOPs: 42.89 | +7: iteration 46480/ 115203 | consumed samples: 11898880 | consumed tokens: 24368906240 | elapsed time per iteration (s): 0.56 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 2.750205E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.295 | TFLOPs: 43.41 | +7: iteration 46490/ 115203 | consumed samples: 11901440 | consumed tokens: 24374149120 | elapsed time per iteration (s): 0.57 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 2.754262E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.644 | TFLOPs: 43.06 | +7: iteration 46500/ 115203 | consumed samples: 11904000 | consumed tokens: 24379392000 | elapsed time per iteration (s): 0.56 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 2.746690E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.716 | TFLOPs: 43.92 | +7: iteration 46510/ 115203 | consumed samples: 11906560 | consumed tokens: 24384634880 | elapsed time per iteration (s): 0.58 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 2.756223E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.696 | TFLOPs: 42.30 | +7: iteration 46520/ 115203 | consumed samples: 11909120 | consumed tokens: 24389877760 | elapsed time per iteration (s): 0.59 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 2.756785E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.738 | TFLOPs: 41.54 | +7: iteration 46530/ 115203 | consumed samples: 11911680 | consumed tokens: 24395120640 | elapsed time per iteration (s): 0.59 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 2.760907E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.223 | TFLOPs: 41.40 | +7: iteration 46540/ 115203 | consumed samples: 11914240 | consumed tokens: 24400363520 | elapsed time per iteration (s): 0.56 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 2.750365E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.470 | TFLOPs: 43.23 | +7: iteration 46550/ 115203 | consumed samples: 11916800 | consumed tokens: 24405606400 | elapsed time per iteration (s): 0.56 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 2.762576E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.262 | TFLOPs: 43.31 | +7: iteration 46560/ 115203 | consumed samples: 11919360 | consumed tokens: 24410849280 | elapsed time per iteration (s): 0.57 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 2.759002E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.511 | TFLOPs: 42.76 | +7: iteration 46570/ 115203 | consumed samples: 11921920 | consumed tokens: 24416092160 | elapsed time per iteration (s): 0.57 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 2.741794E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.336 | TFLOPs: 42.65 | +7: iteration 46580/ 115203 | consumed samples: 11924480 | consumed tokens: 24421335040 | elapsed time per iteration (s): 0.56 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 2.761624E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.657 | TFLOPs: 43.35 | +7: iteration 46590/ 115203 | consumed samples: 11927040 | consumed tokens: 24426577920 | elapsed time per iteration (s): 0.57 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 2.742300E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.135 | TFLOPs: 42.63 | +7: iteration 46600/ 115203 | consumed samples: 11929600 | consumed tokens: 24431820800 | elapsed time per iteration (s): 0.55 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 2.759367E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.597 | TFLOPs: 44.01 | +7: iteration 46610/ 115203 | consumed samples: 11932160 | consumed tokens: 24437063680 | elapsed time per iteration (s): 0.57 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 2.748203E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.018 | TFLOPs: 43.19 | +7: iteration 46620/ 115203 | consumed samples: 11934720 | consumed tokens: 24442306560 | elapsed time per iteration (s): 0.56 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 2.754922E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.207 | TFLOPs: 43.59 | +7: iteration 46630/ 115203 | consumed samples: 11937280 | consumed tokens: 24447549440 | elapsed time per iteration (s): 0.57 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 2.749685E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.563 | TFLOPs: 42.96 | +7: iteration 46640/ 115203 | consumed samples: 11939840 | consumed tokens: 24452792320 | elapsed time per iteration (s): 0.56 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 2.766646E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.825 | TFLOPs: 43.74 | +7: iteration 46650/ 115203 | consumed samples: 11942400 | consumed tokens: 24458035200 | elapsed time per iteration (s): 0.56 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 2.762033E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.154 | TFLOPs: 43.30 | +7: iteration 46660/ 115203 | consumed samples: 11944960 | consumed tokens: 24463278080 | elapsed time per iteration (s): 0.57 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 2.756246E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.999 | TFLOPs: 42.62 | +7: iteration 46670/ 115203 | consumed samples: 11947520 | consumed tokens: 24468520960 | elapsed time per iteration (s): 0.57 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 2.743291E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.413 | TFLOPs: 42.66 | +7: iteration 46680/ 115203 | consumed samples: 11950080 | consumed tokens: 24473763840 | elapsed time per iteration (s): 0.57 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 2.746695E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.863 | TFLOPs: 42.70 | +7: iteration 46690/ 115203 | consumed samples: 11952640 | consumed tokens: 24479006720 | elapsed time per iteration (s): 0.56 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 2.749952E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.660 | TFLOPs: 43.35 | +7: iteration 46700/ 115203 | consumed samples: 11955200 | consumed tokens: 24484249600 | elapsed time per iteration (s): 0.56 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 2.756645E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.037 | TFLOPs: 43.29 | +7: iteration 46710/ 115203 | consumed samples: 11957760 | consumed tokens: 24489492480 | elapsed time per iteration (s): 0.56 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 2.746013E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.402 | TFLOPs: 43.32 | +7: iteration 46720/ 115203 | consumed samples: 11960320 | consumed tokens: 24494735360 | elapsed time per iteration (s): 0.57 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 2.749312E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.242 | TFLOPs: 42.93 | +7: iteration 46730/ 115203 | consumed samples: 11962880 | consumed tokens: 24499978240 | elapsed time per iteration (s): 0.57 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 2.746294E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.424 | TFLOPs: 42.66 | +7: iteration 46740/ 115203 | consumed samples: 11965440 | consumed tokens: 24505221120 | elapsed time per iteration (s): 0.58 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 2.755990E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.308 | TFLOPs: 42.36 | +7: iteration 46750/ 115203 | consumed samples: 11968000 | consumed tokens: 24510464000 | elapsed time per iteration (s): 0.59 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 2.747505E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.167 | TFLOPs: 41.20 | +7: iteration 46760/ 115203 | consumed samples: 11970560 | consumed tokens: 24515706880 | elapsed time per iteration (s): 0.58 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 2.754205E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.348 | TFLOPs: 42.08 | +7: iteration 46770/ 115203 | consumed samples: 11973120 | consumed tokens: 24520949760 | elapsed time per iteration (s): 0.57 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 2.759696E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.708 | TFLOPs: 42.78 | +7: iteration 46780/ 115203 | consumed samples: 11975680 | consumed tokens: 24526192640 | elapsed time per iteration (s): 0.56 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 2.767253E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.083 | TFLOPs: 43.29 | +7: iteration 46790/ 115203 | consumed samples: 11978240 | consumed tokens: 24531435520 | elapsed time per iteration (s): 0.57 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 2.761348E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.686 | TFLOPs: 43.06 | +7: iteration 46800/ 115203 | consumed samples: 11980800 | consumed tokens: 24536678400 | elapsed time per iteration (s): 0.56 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 2.754258E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.330 | TFLOPs: 43.79 | +7: iteration 46810/ 115203 | consumed samples: 11983360 | consumed tokens: 24541921280 | elapsed time per iteration (s): 0.59 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 2.755186E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.161 | TFLOPs: 41.39 | +7: iteration 46820/ 115203 | consumed samples: 11985920 | consumed tokens: 24547164160 | elapsed time per iteration (s): 0.56 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 2.752695E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.776 | TFLOPs: 43.45 | +7: iteration 46830/ 115203 | consumed samples: 11988480 | consumed tokens: 24552407040 | elapsed time per iteration (s): 0.56 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 2.756816E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.294 | TFLOPs: 43.41 | +7: iteration 46840/ 115203 | consumed samples: 11991040 | consumed tokens: 24557649920 | elapsed time per iteration (s): 0.58 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 2.757101E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.000 | TFLOPs: 42.14 | +7: iteration 46850/ 115203 | consumed samples: 11993600 | consumed tokens: 24562892800 | elapsed time per iteration (s): 0.56 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 2.736090E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.723 | TFLOPs: 43.83 | +7: iteration 46860/ 115203 | consumed samples: 11996160 | consumed tokens: 24568135680 | elapsed time per iteration (s): 0.57 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 2.743962E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.614 | TFLOPs: 43.15 | +7: iteration 46870/ 115203 | consumed samples: 11998720 | consumed tokens: 24573378560 | elapsed time per iteration (s): 0.57 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 2.752719E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.782 | TFLOPs: 42.60 | +7: iteration 46880/ 115203 | consumed samples: 12001280 | consumed tokens: 24578621440 | elapsed time per iteration (s): 0.57 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 2.755389E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.068 | TFLOPs: 43.20 | +7: iteration 46890/ 115203 | consumed samples: 12003840 | consumed tokens: 24583864320 | elapsed time per iteration (s): 0.58 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 2.753684E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.311 | TFLOPs: 42.36 | +7: iteration 46900/ 115203 | consumed samples: 12006400 | consumed tokens: 24589107200 | elapsed time per iteration (s): 0.56 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 2.754616E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.421 | TFLOPs: 43.42 | +7: iteration 46910/ 115203 | consumed samples: 12008960 | consumed tokens: 24594350080 | elapsed time per iteration (s): 0.57 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 2.760615E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.155 | TFLOPs: 43.01 | +7: iteration 46920/ 115203 | consumed samples: 12011520 | consumed tokens: 24599592960 | elapsed time per iteration (s): 0.58 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 2.741231E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.740 | TFLOPs: 42.31 | +7: iteration 46930/ 115203 | consumed samples: 12014080 | consumed tokens: 24604835840 | elapsed time per iteration (s): 0.57 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 2.749911E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.302 | TFLOPs: 42.84 | +7: iteration 46940/ 115203 | consumed samples: 12016640 | consumed tokens: 24610078720 | elapsed time per iteration (s): 0.56 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 2.757556E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.280 | TFLOPs: 43.22 | +7: iteration 46950/ 115203 | consumed samples: 12019200 | consumed tokens: 24615321600 | elapsed time per iteration (s): 0.57 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 2.750960E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.786 | TFLOPs: 42.60 | +7: iteration 46960/ 115203 | consumed samples: 12021760 | consumed tokens: 24620564480 | elapsed time per iteration (s): 0.56 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 2.740328E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.674 | TFLOPs: 43.44 | +7: iteration 46970/ 115203 | consumed samples: 12024320 | consumed tokens: 24625807360 | elapsed time per iteration (s): 0.57 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 2.739003E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.906 | TFLOPs: 43.08 | +7: iteration 46980/ 115203 | consumed samples: 12026880 | consumed tokens: 24631050240 | elapsed time per iteration (s): 0.57 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 2.751863E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.068 | TFLOPs: 42.72 | +7: iteration 46990/ 115203 | consumed samples: 12029440 | consumed tokens: 24636293120 | elapsed time per iteration (s): 0.56 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 2.734621E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.440 | TFLOPs: 43.71 | +7: iteration 47000/ 115203 | consumed samples: 12032000 | consumed tokens: 24641536000 | elapsed time per iteration (s): 0.57 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 2.754812E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.678 | TFLOPs: 42.68 | +7: iteration 47010/ 115203 | consumed samples: 12034560 | consumed tokens: 24646778880 | elapsed time per iteration (s): 0.56 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 2.755561E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.927 | TFLOPs: 43.56 | +7: iteration 47020/ 115203 | consumed samples: 12037120 | consumed tokens: 24652021760 | elapsed time per iteration (s): 0.56 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 2.756920E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.612 | TFLOPs: 43.34 | +7: iteration 47030/ 115203 | consumed samples: 12039680 | consumed tokens: 24657264640 | elapsed time per iteration (s): 0.57 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 2.745822E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.280 | TFLOPs: 42.64 | +7: iteration 47040/ 115203 | consumed samples: 12042240 | consumed tokens: 24662507520 | elapsed time per iteration (s): 0.58 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 2.759660E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.318 | TFLOPs: 42.27 | +7: iteration 47050/ 115203 | consumed samples: 12044800 | consumed tokens: 24667750400 | elapsed time per iteration (s): 0.57 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 2.761070E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.219 | TFLOPs: 43.11 | +7: iteration 47060/ 115203 | consumed samples: 12047360 | consumed tokens: 24672993280 | elapsed time per iteration (s): 0.56 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 2.758299E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.977 | TFLOPs: 43.38 | +7: iteration 47070/ 115203 | consumed samples: 12049920 | consumed tokens: 24678236160 | elapsed time per iteration (s): 0.57 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 2.750860E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.108 | TFLOPs: 42.72 | +7: iteration 47080/ 115203 | consumed samples: 12052480 | consumed tokens: 24683479040 | elapsed time per iteration (s): 0.57 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 2.758044E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.500 | TFLOPs: 43.05 | +7: iteration 47090/ 115203 | consumed samples: 12055040 | consumed tokens: 24688721920 | elapsed time per iteration (s): 0.58 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 2.756352E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.159 | TFLOPs: 42.44 | +7: iteration 47100/ 115203 | consumed samples: 12057600 | consumed tokens: 24693964800 | elapsed time per iteration (s): 0.57 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 2.752807E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.922 | TFLOPs: 43.09 | +7: iteration 47110/ 115203 | consumed samples: 12060160 | consumed tokens: 24699207680 | elapsed time per iteration (s): 0.57 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 2.746621E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.567 | TFLOPs: 42.58 | +7: iteration 47120/ 115203 | consumed samples: 12062720 | consumed tokens: 24704450560 | elapsed time per iteration (s): 0.57 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 2.739679E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.499 | TFLOPs: 43.14 | +7: iteration 47130/ 115203 | consumed samples: 12065280 | consumed tokens: 24709693440 | elapsed time per iteration (s): 0.58 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 2.759678E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.883 | TFLOPs: 42.32 | +7: iteration 47140/ 115203 | consumed samples: 12067840 | consumed tokens: 24714936320 | elapsed time per iteration (s): 0.56 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 2.746284E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.701 | TFLOPs: 43.73 | +7: iteration 47150/ 115203 | consumed samples: 12070400 | consumed tokens: 24720179200 | elapsed time per iteration (s): 0.57 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 2.758196E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.391 | TFLOPs: 42.75 | +7: iteration 47160/ 115203 | consumed samples: 12072960 | consumed tokens: 24725422080 | elapsed time per iteration (s): 0.57 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 2.738215E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.245 | TFLOPs: 43.12 | +7: iteration 47170/ 115203 | consumed samples: 12075520 | consumed tokens: 24730664960 | elapsed time per iteration (s): 0.57 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 2.749980E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.030 | TFLOPs: 42.91 | +7: iteration 47180/ 115203 | consumed samples: 12078080 | consumed tokens: 24735907840 | elapsed time per iteration (s): 0.56 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 2.762917E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.768 | TFLOPs: 43.83 | +7: iteration 47190/ 115203 | consumed samples: 12080640 | consumed tokens: 24741150720 | elapsed time per iteration (s): 0.59 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 2.741850E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.976 | TFLOPs: 41.18 | +7: iteration 47200/ 115203 | consumed samples: 12083200 | consumed tokens: 24746393600 | elapsed time per iteration (s): 0.56 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 2.748689E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.352 | TFLOPs: 43.51 | +7: iteration 47210/ 115203 | consumed samples: 12085760 | consumed tokens: 24751636480 | elapsed time per iteration (s): 0.56 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 2.752821E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.114 | TFLOPs: 43.58 | +7: iteration 47220/ 115203 | consumed samples: 12088320 | consumed tokens: 24756879360 | elapsed time per iteration (s): 0.56 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 2.758387E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.121 | TFLOPs: 43.87 | +7: iteration 47230/ 115203 | consumed samples: 12090880 | consumed tokens: 24762122240 | elapsed time per iteration (s): 0.57 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 2.764561E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.237 | TFLOPs: 42.93 | +7: iteration 47240/ 115203 | consumed samples: 12093440 | consumed tokens: 24767365120 | elapsed time per iteration (s): 0.56 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 2.752717E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.237 | TFLOPs: 43.40 | +7: iteration 47250/ 115203 | consumed samples: 12096000 | consumed tokens: 24772608000 | elapsed time per iteration (s): 0.57 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 2.752555E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.901 | TFLOPs: 43.18 | +7: iteration 47260/ 115203 | consumed samples: 12098560 | consumed tokens: 24777850880 | elapsed time per iteration (s): 0.58 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 2.747211E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.746 | TFLOPs: 42.21 | +7: iteration 47270/ 115203 | consumed samples: 12101120 | consumed tokens: 24783093760 | elapsed time per iteration (s): 0.57 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 2.757450E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.233 | TFLOPs: 42.92 | +7: iteration 47280/ 115203 | consumed samples: 12103680 | consumed tokens: 24788336640 | elapsed time per iteration (s): 0.56 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 2.752286E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.285 | TFLOPs: 43.31 | +7: iteration 47290/ 115203 | consumed samples: 12106240 | consumed tokens: 24793579520 | elapsed time per iteration (s): 0.57 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 2.734612E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.813 | TFLOPs: 43.08 | +7: iteration 47300/ 115203 | consumed samples: 12108800 | consumed tokens: 24798822400 | elapsed time per iteration (s): 0.57 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 2.747658E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.251 | TFLOPs: 43.12 | +7: iteration 47310/ 115203 | consumed samples: 12111360 | consumed tokens: 24804065280 | elapsed time per iteration (s): 0.58 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 2.762822E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.490 | TFLOPs: 42.00 | +7: iteration 47320/ 115203 | consumed samples: 12113920 | consumed tokens: 24809308160 | elapsed time per iteration (s): 0.58 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 2.739589E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.781 | TFLOPs: 42.12 | +7: iteration 47330/ 115203 | consumed samples: 12116480 | consumed tokens: 24814551040 | elapsed time per iteration (s): 0.56 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 2.747378E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.022 | TFLOPs: 43.29 | +7: iteration 47340/ 115203 | consumed samples: 12119040 | consumed tokens: 24819793920 | elapsed time per iteration (s): 0.61 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 2.755874E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.695 | TFLOPs: 40.30 | +7: iteration 47350/ 115203 | consumed samples: 12121600 | consumed tokens: 24825036800 | elapsed time per iteration (s): 0.59 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 2.747725E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.889 | TFLOPs: 41.37 | +7: iteration 47360/ 115203 | consumed samples: 12124160 | consumed tokens: 24830279680 | elapsed time per iteration (s): 0.59 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 2.749462E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.268 | TFLOPs: 41.21 | +7: iteration 47370/ 115203 | consumed samples: 12126720 | consumed tokens: 24835522560 | elapsed time per iteration (s): 0.57 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 2.744753E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.293 | TFLOPs: 42.93 | +7: iteration 47380/ 115203 | consumed samples: 12129280 | consumed tokens: 24840765440 | elapsed time per iteration (s): 0.58 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 2.758174E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.813 | TFLOPs: 42.22 | +7: iteration 47390/ 115203 | consumed samples: 12131840 | consumed tokens: 24846008320 | elapsed time per iteration (s): 0.57 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 2.752290E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.938 | TFLOPs: 42.99 | +7: iteration 47400/ 115203 | consumed samples: 12134400 | consumed tokens: 24851251200 | elapsed time per iteration (s): 0.57 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 2.746125E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.763 | TFLOPs: 42.78 | +7: iteration 47410/ 115203 | consumed samples: 12136960 | consumed tokens: 24856494080 | elapsed time per iteration (s): 0.57 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 2.763411E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.350 | TFLOPs: 42.94 | +7: iteration 47420/ 115203 | consumed samples: 12139520 | consumed tokens: 24861736960 | elapsed time per iteration (s): 0.55 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 2.734977E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 47430/ 115203 | consumed samples: 12142080 | consumed tokens: 24866979840 | elapsed time per iteration (s): 0.57 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 2.746784E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.633 | TFLOPs: 42.49 | +7: iteration 47440/ 115203 | consumed samples: 12144640 | consumed tokens: 24872222720 | elapsed time per iteration (s): 0.57 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 2.745045E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.445 | TFLOPs: 43.04 | +7: iteration 47450/ 115203 | consumed samples: 12147200 | consumed tokens: 24877465600 | elapsed time per iteration (s): 0.57 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 2.746477E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.774 | TFLOPs: 43.07 | +7: iteration 47460/ 115203 | consumed samples: 12149760 | consumed tokens: 24882708480 | elapsed time per iteration (s): 0.56 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 2.756402E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.291 | TFLOPs: 43.41 | +7: iteration 47470/ 115203 | consumed samples: 12152320 | consumed tokens: 24887951360 | elapsed time per iteration (s): 0.56 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 2.759135E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.770 | TFLOPs: 43.36 | +7: iteration 47480/ 115203 | consumed samples: 12154880 | consumed tokens: 24893194240 | elapsed time per iteration (s): 0.57 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 2.754874E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.777 | TFLOPs: 42.98 | +7: iteration 47490/ 115203 | consumed samples: 12157440 | consumed tokens: 24898437120 | elapsed time per iteration (s): 0.56 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 2.742602E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.552 | TFLOPs: 43.72 | +7: iteration 47500/ 115203 | consumed samples: 12160000 | consumed tokens: 24903680000 | elapsed time per iteration (s): 0.57 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 2.741563E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.135 | TFLOPs: 42.63 | +7: iteration 47510/ 115203 | consumed samples: 12162560 | consumed tokens: 24908922880 | elapsed time per iteration (s): 0.56 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 2.761370E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.960 | TFLOPs: 43.95 | +7: iteration 47520/ 115203 | consumed samples: 12165120 | consumed tokens: 24914165760 | elapsed time per iteration (s): 0.57 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 2.744894E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.889 | TFLOPs: 43.08 | +7: iteration 47530/ 115203 | consumed samples: 12167680 | consumed tokens: 24919408640 | elapsed time per iteration (s): 0.57 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 2.755396E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.579 | TFLOPs: 43.15 | +7: iteration 47540/ 115203 | consumed samples: 12170240 | consumed tokens: 24924651520 | elapsed time per iteration (s): 0.57 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 2.728595E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.505 | TFLOPs: 42.95 | +7: iteration 47550/ 115203 | consumed samples: 12172800 | consumed tokens: 24929894400 | elapsed time per iteration (s): 0.56 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 2.736221E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.345 | TFLOPs: 43.51 | +7: iteration 47560/ 115203 | consumed samples: 12175360 | consumed tokens: 24935137280 | elapsed time per iteration (s): 0.57 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 2.739663E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.146 | TFLOPs: 43.01 | +7: iteration 47570/ 115203 | consumed samples: 12177920 | consumed tokens: 24940380160 | elapsed time per iteration (s): 0.57 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 2.744496E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.488 | TFLOPs: 42.57 | +7: iteration 47580/ 115203 | consumed samples: 12180480 | consumed tokens: 24945623040 | elapsed time per iteration (s): 0.57 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 2.747220E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.912 | TFLOPs: 42.80 | +7: iteration 47590/ 115203 | consumed samples: 12183040 | consumed tokens: 24950865920 | elapsed time per iteration (s): 0.57 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 2.747256E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.762 | TFLOPs: 42.78 | +7: iteration 47600/ 115203 | consumed samples: 12185600 | consumed tokens: 24956108800 | elapsed time per iteration (s): 0.57 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 2.738504E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.097 | TFLOPs: 43.01 | +7: iteration 47610/ 115203 | consumed samples: 12188160 | consumed tokens: 24961351680 | elapsed time per iteration (s): 0.57 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 2.747392E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.659 | TFLOPs: 42.87 | +7: iteration 47620/ 115203 | consumed samples: 12190720 | consumed tokens: 24966594560 | elapsed time per iteration (s): 0.56 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 2.736394E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.617 | TFLOPs: 43.25 | +7: iteration 47630/ 115203 | consumed samples: 12193280 | consumed tokens: 24971837440 | elapsed time per iteration (s): 0.57 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 2.752663E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.640 | TFLOPs: 42.58 | +7: iteration 47640/ 115203 | consumed samples: 12195840 | consumed tokens: 24977080320 | elapsed time per iteration (s): 0.56 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 2.745596E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.309 | TFLOPs: 43.79 | +7: iteration 47650/ 115203 | consumed samples: 12198400 | consumed tokens: 24982323200 | elapsed time per iteration (s): 0.56 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 2.753620E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.015 | TFLOPs: 43.38 | +7: iteration 47660/ 115203 | consumed samples: 12200960 | consumed tokens: 24987566080 | elapsed time per iteration (s): 0.56 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 2.761297E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.148 | TFLOPs: 43.30 | +7: iteration 47670/ 115203 | consumed samples: 12203520 | consumed tokens: 24992808960 | elapsed time per iteration (s): 0.56 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 2.757420E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.855 | TFLOPs: 43.46 | +7: iteration 47680/ 115203 | consumed samples: 12206080 | consumed tokens: 24998051840 | elapsed time per iteration (s): 0.57 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 2.759552E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.421 | TFLOPs: 42.66 | +7: iteration 47690/ 115203 | consumed samples: 12208640 | consumed tokens: 25003294720 | elapsed time per iteration (s): 0.58 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 2.741165E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.760 | TFLOPs: 42.02 | +7: iteration 47700/ 115203 | consumed samples: 12211200 | consumed tokens: 25008537600 | elapsed time per iteration (s): 0.57 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 2.756070E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.557 | TFLOPs: 43.05 | +7: iteration 47710/ 115203 | consumed samples: 12213760 | consumed tokens: 25013780480 | elapsed time per iteration (s): 0.57 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 2.744143E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.377 | TFLOPs: 43.03 | +7: iteration 47720/ 115203 | consumed samples: 12216320 | consumed tokens: 25019023360 | elapsed time per iteration (s): 0.57 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 2.762622E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.332 | TFLOPs: 42.84 | +7: iteration 47730/ 115203 | consumed samples: 12218880 | consumed tokens: 25024266240 | elapsed time per iteration (s): 0.56 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 2.773485E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.132 | TFLOPs: 43.39 | +7: iteration 47740/ 115203 | consumed samples: 12221440 | consumed tokens: 25029509120 | elapsed time per iteration (s): 0.58 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 2.740743E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.347 | TFLOPs: 41.89 | +7: iteration 47750/ 115203 | consumed samples: 12224000 | consumed tokens: 25034752000 | elapsed time per iteration (s): 0.58 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 2.752645E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.825 | TFLOPs: 42.31 | +7: iteration 47760/ 115203 | consumed samples: 12226560 | consumed tokens: 25039994880 | elapsed time per iteration (s): 0.57 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 2.748870E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.139 | TFLOPs: 43.01 | +7: iteration 47770/ 115203 | consumed samples: 12229120 | consumed tokens: 25045237760 | elapsed time per iteration (s): 0.57 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 2.751448E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.877 | TFLOPs: 43.08 | +7: iteration 47780/ 115203 | consumed samples: 12231680 | consumed tokens: 25050480640 | elapsed time per iteration (s): 0.57 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 2.729413E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.354 | TFLOPs: 42.75 | +7: iteration 47790/ 115203 | consumed samples: 12234240 | consumed tokens: 25055723520 | elapsed time per iteration (s): 0.57 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 2.740727E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.968 | TFLOPs: 42.52 | +7: iteration 47800/ 115203 | consumed samples: 12236800 | consumed tokens: 25060966400 | elapsed time per iteration (s): 0.57 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 2.736551E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.672 | TFLOPs: 42.59 | +7: iteration 47810/ 115203 | consumed samples: 12239360 | consumed tokens: 25066209280 | elapsed time per iteration (s): 0.58 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 2.743955E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.552 | TFLOPs: 42.00 | +7: iteration 47820/ 115203 | consumed samples: 12241920 | consumed tokens: 25071452160 | elapsed time per iteration (s): 0.57 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 2.753688E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.202 | TFLOPs: 43.02 | +7: iteration 47830/ 115203 | consumed samples: 12244480 | consumed tokens: 25076695040 | elapsed time per iteration (s): 0.56 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 2.739483E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.865 | TFLOPs: 43.37 | +7: iteration 47840/ 115203 | consumed samples: 12247040 | consumed tokens: 25081937920 | elapsed time per iteration (s): 0.59 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 2.749012E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.819 | TFLOPs: 41.46 | +7: iteration 47850/ 115203 | consumed samples: 12249600 | consumed tokens: 25087180800 | elapsed time per iteration (s): 0.56 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 2.749726E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.668 | TFLOPs: 43.44 | +7: iteration 47860/ 115203 | consumed samples: 12252160 | consumed tokens: 25092423680 | elapsed time per iteration (s): 0.58 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 2.744242E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.765 | TFLOPs: 42.02 | +7: iteration 47870/ 115203 | consumed samples: 12254720 | consumed tokens: 25097666560 | elapsed time per iteration (s): 0.60 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 2.741974E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.188 | TFLOPs: 40.44 | +7: iteration 47880/ 115203 | consumed samples: 12257280 | consumed tokens: 25102909440 | elapsed time per iteration (s): 0.57 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 2.740604E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.865 | TFLOPs: 42.60 | +7: iteration 47890/ 115203 | consumed samples: 12259840 | consumed tokens: 25108152320 | elapsed time per iteration (s): 0.56 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 2.750002E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.247 | TFLOPs: 43.31 | +7: iteration 47900/ 115203 | consumed samples: 12262400 | consumed tokens: 25113395200 | elapsed time per iteration (s): 0.56 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 2.748202E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.117 | TFLOPs: 43.20 | +7: iteration 47910/ 115203 | consumed samples: 12264960 | consumed tokens: 25118638080 | elapsed time per iteration (s): 0.57 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 2.736843E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.062 | TFLOPs: 43.10 | +7: iteration 47920/ 115203 | consumed samples: 12267520 | consumed tokens: 25123880960 | elapsed time per iteration (s): 0.60 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 2.741478E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.323 | TFLOPs: 40.36 | +7: iteration 47930/ 115203 | consumed samples: 12270080 | consumed tokens: 25129123840 | elapsed time per iteration (s): 0.59 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 2.742646E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.378 | TFLOPs: 41.60 | +7: iteration 47940/ 115203 | consumed samples: 12272640 | consumed tokens: 25134366720 | elapsed time per iteration (s): 0.57 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 2.725281E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.808 | TFLOPs: 42.79 | +7: iteration 47950/ 115203 | consumed samples: 12275200 | consumed tokens: 25139609600 | elapsed time per iteration (s): 0.58 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 2.739206E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.409 | TFLOPs: 42.18 | +7: iteration 47960/ 115203 | consumed samples: 12277760 | consumed tokens: 25144852480 | elapsed time per iteration (s): 0.58 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 2.749320E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.975 | TFLOPs: 42.23 | +7: iteration 47970/ 115203 | consumed samples: 12280320 | consumed tokens: 25150095360 | elapsed time per iteration (s): 0.59 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 2.748400E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.472 | TFLOPs: 41.52 | +7: iteration 47980/ 115203 | consumed samples: 12282880 | consumed tokens: 25155338240 | elapsed time per iteration (s): 0.57 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 2.735940E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.461 | TFLOPs: 42.57 | +7: iteration 47990/ 115203 | consumed samples: 12285440 | consumed tokens: 25160581120 | elapsed time per iteration (s): 0.58 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 2.748827E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.175 | TFLOPs: 42.44 | +0: [2023-03-16 20:21:57,359] [INFO] [logging.py:68:log_dist] [Rank 0] step=48000, skipped=0, lr=[0.00013490269160287214, 0.00013490269160287214, 0.00013490269160287214], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 48000/ 115203 | consumed samples: 12288000 | consumed tokens: 25165824000 | elapsed time per iteration (s): 0.56 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 2.753722E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.677 | TFLOPs: 43.73 | +0: steps: 48000 loss: 2.7292 iter time (s): 0.566 samples/sec: 452.253 +7: iteration 48010/ 115203 | consumed samples: 12290560 | consumed tokens: 25171066880 | elapsed time per iteration (s): 0.57 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 2.751978E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.857 | TFLOPs: 42.51 | +7: iteration 48020/ 115203 | consumed samples: 12293120 | consumed tokens: 25176309760 | elapsed time per iteration (s): 0.57 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 2.739915E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.884 | TFLOPs: 42.99 | +7: iteration 48030/ 115203 | consumed samples: 12295680 | consumed tokens: 25181552640 | elapsed time per iteration (s): 0.56 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 2.736644E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.363 | TFLOPs: 43.41 | +7: iteration 48040/ 115203 | consumed samples: 12298240 | consumed tokens: 25186795520 | elapsed time per iteration (s): 0.57 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 2.752364E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.699 | TFLOPs: 43.06 | +7: iteration 48050/ 115203 | consumed samples: 12300800 | consumed tokens: 25192038400 | elapsed time per iteration (s): 0.56 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 2.749970E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.177 | TFLOPs: 43.21 | +7: iteration 48060/ 115203 | consumed samples: 12303360 | consumed tokens: 25197281280 | elapsed time per iteration (s): 0.57 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 2.738341E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.391 | TFLOPs: 42.65 | +7: iteration 48070/ 115203 | consumed samples: 12305920 | consumed tokens: 25202524160 | elapsed time per iteration (s): 0.58 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 2.748008E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.532 | TFLOPs: 41.90 | +7: iteration 48080/ 115203 | consumed samples: 12308480 | consumed tokens: 25207767040 | elapsed time per iteration (s): 0.57 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 2.750122E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.347 | TFLOPs: 43.13 | +7: iteration 48090/ 115203 | consumed samples: 12311040 | consumed tokens: 25213009920 | elapsed time per iteration (s): 0.57 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 2.737180E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.134 | TFLOPs: 43.01 | +7: iteration 48100/ 115203 | consumed samples: 12313600 | consumed tokens: 25218252800 | elapsed time per iteration (s): 0.58 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 2.741192E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.849 | TFLOPs: 41.74 | +7: iteration 48110/ 115203 | consumed samples: 12316160 | consumed tokens: 25223495680 | elapsed time per iteration (s): 0.57 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 2.749780E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.603 | TFLOPs: 43.06 | +7: iteration 48120/ 115203 | consumed samples: 12318720 | consumed tokens: 25228738560 | elapsed time per iteration (s): 0.57 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 2.757219E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.146 | TFLOPs: 42.82 | +7: iteration 48130/ 115203 | consumed samples: 12321280 | consumed tokens: 25233981440 | elapsed time per iteration (s): 0.57 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 2.740857E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.885 | TFLOPs: 42.80 | +7: iteration 48140/ 115203 | consumed samples: 12323840 | consumed tokens: 25239224320 | elapsed time per iteration (s): 0.57 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 2.756527E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.712 | TFLOPs: 42.78 | +7: iteration 48150/ 115203 | consumed samples: 12326400 | consumed tokens: 25244467200 | elapsed time per iteration (s): 0.57 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 2.747655E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.878 | TFLOPs: 42.89 | +7: iteration 48160/ 115203 | consumed samples: 12328960 | consumed tokens: 25249710080 | elapsed time per iteration (s): 0.57 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 2.744993E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.266 | TFLOPs: 42.74 | +7: iteration 48170/ 115203 | consumed samples: 12331520 | consumed tokens: 25254952960 | elapsed time per iteration (s): 0.57 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 2.742209E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.622 | TFLOPs: 42.77 | +7: iteration 48180/ 115203 | consumed samples: 12334080 | consumed tokens: 25260195840 | elapsed time per iteration (s): 0.56 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 2.744838E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.864 | TFLOPs: 43.65 | +7: iteration 48190/ 115203 | consumed samples: 12336640 | consumed tokens: 25265438720 | elapsed time per iteration (s): 0.56 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 2.748947E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.386 | TFLOPs: 43.42 | +7: iteration 48200/ 115203 | consumed samples: 12339200 | consumed tokens: 25270681600 | elapsed time per iteration (s): 0.57 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 2.739396E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.042 | TFLOPs: 43.19 | +7: iteration 48210/ 115203 | consumed samples: 12341760 | consumed tokens: 25275924480 | elapsed time per iteration (s): 0.56 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 2.749905E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.296 | TFLOPs: 43.41 | +7: iteration 48220/ 115203 | consumed samples: 12344320 | consumed tokens: 25281167360 | elapsed time per iteration (s): 0.57 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 2.737530E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.426 | TFLOPs: 42.75 | +7: iteration 48230/ 115203 | consumed samples: 12346880 | consumed tokens: 25286410240 | elapsed time per iteration (s): 0.56 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 2.747906E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.150 | TFLOPs: 43.20 | +7: iteration 48240/ 115203 | consumed samples: 12349440 | consumed tokens: 25291653120 | elapsed time per iteration (s): 0.58 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 2.730019E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.141 | TFLOPs: 42.34 | +7: iteration 48250/ 115203 | consumed samples: 12352000 | consumed tokens: 25296896000 | elapsed time per iteration (s): 0.56 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 2.743534E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.533 | TFLOPs: 43.24 | +7: iteration 48260/ 115203 | consumed samples: 12354560 | consumed tokens: 25302138880 | elapsed time per iteration (s): 0.59 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 2.749197E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.415 | TFLOPs: 41.61 | +7: iteration 48270/ 115203 | consumed samples: 12357120 | consumed tokens: 25307381760 | elapsed time per iteration (s): 0.58 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 2.734447E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.980 | TFLOPs: 42.14 | +7: iteration 48280/ 115203 | consumed samples: 12359680 | consumed tokens: 25312624640 | elapsed time per iteration (s): 0.56 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 2.740105E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.889 | TFLOPs: 43.65 | +7: iteration 48290/ 115203 | consumed samples: 12362240 | consumed tokens: 25317867520 | elapsed time per iteration (s): 0.57 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 2.734477E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.896 | TFLOPs: 43.08 | +7: iteration 48300/ 115203 | consumed samples: 12364800 | consumed tokens: 25323110400 | elapsed time per iteration (s): 0.56 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 2.755326E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.585 | TFLOPs: 43.44 | +7: iteration 48310/ 115203 | consumed samples: 12367360 | consumed tokens: 25328353280 | elapsed time per iteration (s): 0.56 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 2.739168E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.403 | TFLOPs: 43.80 | +7: iteration 48320/ 115203 | consumed samples: 12369920 | consumed tokens: 25333596160 | elapsed time per iteration (s): 0.56 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 2.764937E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.352 | TFLOPs: 43.32 | +7: iteration 48330/ 115203 | consumed samples: 12372480 | consumed tokens: 25338839040 | elapsed time per iteration (s): 0.56 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 2.728177E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.560 | TFLOPs: 43.62 | +7: iteration 48340/ 115203 | consumed samples: 12375040 | consumed tokens: 25344081920 | elapsed time per iteration (s): 0.56 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 2.750153E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.889 | TFLOPs: 43.46 | +7: iteration 48350/ 115203 | consumed samples: 12377600 | consumed tokens: 25349324800 | elapsed time per iteration (s): 0.58 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 2.739554E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.251 | TFLOPs: 42.16 | +7: iteration 48360/ 115203 | consumed samples: 12380160 | consumed tokens: 25354567680 | elapsed time per iteration (s): 0.57 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 2.744021E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.913 | TFLOPs: 42.70 | +7: iteration 48370/ 115203 | consumed samples: 12382720 | consumed tokens: 25359810560 | elapsed time per iteration (s): 0.57 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 2.746527E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.978 | TFLOPs: 43.00 | +7: iteration 48380/ 115203 | consumed samples: 12385280 | consumed tokens: 25365053440 | elapsed time per iteration (s): 0.56 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 2.742308E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.368 | TFLOPs: 43.61 | +7: iteration 48390/ 115203 | consumed samples: 12387840 | consumed tokens: 25370296320 | elapsed time per iteration (s): 0.58 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 2.748940E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.424 | TFLOPs: 42.28 | +7: iteration 48400/ 115203 | consumed samples: 12390400 | consumed tokens: 25375539200 | elapsed time per iteration (s): 0.58 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 2.742249E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.915 | TFLOPs: 42.13 | +7: iteration 48410/ 115203 | consumed samples: 12392960 | consumed tokens: 25380782080 | elapsed time per iteration (s): 0.56 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 2.758446E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.024 | TFLOPs: 43.29 | +7: iteration 48420/ 115203 | consumed samples: 12395520 | consumed tokens: 25386024960 | elapsed time per iteration (s): 0.56 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 2.745700E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.550 | TFLOPs: 43.24 | +7: iteration 48430/ 115203 | consumed samples: 12398080 | consumed tokens: 25391267840 | elapsed time per iteration (s): 0.57 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 2.755583E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.307 | TFLOPs: 42.65 | +7: iteration 48440/ 115203 | consumed samples: 12400640 | consumed tokens: 25396510720 | elapsed time per iteration (s): 0.57 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 2.730977E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.091 | TFLOPs: 43.20 | +7: iteration 48450/ 115203 | consumed samples: 12403200 | consumed tokens: 25401753600 | elapsed time per iteration (s): 0.58 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 2.756822E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.178 | TFLOPs: 41.87 | +7: iteration 48460/ 115203 | consumed samples: 12405760 | consumed tokens: 25406996480 | elapsed time per iteration (s): 0.56 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 2.739576E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.374 | TFLOPs: 43.70 | +7: iteration 48470/ 115203 | consumed samples: 12408320 | consumed tokens: 25412239360 | elapsed time per iteration (s): 0.57 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 2.742664E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.737 | TFLOPs: 43.16 | +7: iteration 48480/ 115203 | consumed samples: 12410880 | consumed tokens: 25417482240 | elapsed time per iteration (s): 0.57 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 2.724445E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.491 | TFLOPs: 42.95 | +7: iteration 48490/ 115203 | consumed samples: 12413440 | consumed tokens: 25422725120 | elapsed time per iteration (s): 0.57 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 2.736888E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.505 | TFLOPs: 42.86 | +7: iteration 48500/ 115203 | consumed samples: 12416000 | consumed tokens: 25427968000 | elapsed time per iteration (s): 0.57 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 2.748468E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.027 | TFLOPs: 43.00 | +7: iteration 48510/ 115203 | consumed samples: 12418560 | consumed tokens: 25433210880 | elapsed time per iteration (s): 0.57 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 2.753343E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.295 | TFLOPs: 43.12 | +7: iteration 48520/ 115203 | consumed samples: 12421120 | consumed tokens: 25438453760 | elapsed time per iteration (s): 0.56 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 2.731306E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.551 | TFLOPs: 43.72 | +7: iteration 48530/ 115203 | consumed samples: 12423680 | consumed tokens: 25443696640 | elapsed time per iteration (s): 0.55 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 2.729261E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 48540/ 115203 | consumed samples: 12426240 | consumed tokens: 25448939520 | elapsed time per iteration (s): 0.55 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 2.751688E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.425 | TFLOPs: 43.99 | +7: iteration 48550/ 115203 | consumed samples: 12428800 | consumed tokens: 25454182400 | elapsed time per iteration (s): 0.57 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 2.755950E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.236 | TFLOPs: 42.45 | +7: iteration 48560/ 115203 | consumed samples: 12431360 | consumed tokens: 25459425280 | elapsed time per iteration (s): 0.56 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 2.740990E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.547 | TFLOPs: 43.34 | +7: iteration 48570/ 115203 | consumed samples: 12433920 | consumed tokens: 25464668160 | elapsed time per iteration (s): 0.56 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 2.739175E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.778 | TFLOPs: 43.26 | +7: iteration 48580/ 115203 | consumed samples: 12436480 | consumed tokens: 25469911040 | elapsed time per iteration (s): 0.56 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 2.737335E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.740 | TFLOPs: 43.64 | +7: iteration 48590/ 115203 | consumed samples: 12439040 | consumed tokens: 25475153920 | elapsed time per iteration (s): 0.57 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 2.742529E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.280 | TFLOPs: 43.12 | +7: iteration 48600/ 115203 | consumed samples: 12441600 | consumed tokens: 25480396800 | elapsed time per iteration (s): 0.56 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 2.734596E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.948 | TFLOPs: 43.85 | +7: iteration 48610/ 115203 | consumed samples: 12444160 | consumed tokens: 25485639680 | elapsed time per iteration (s): 0.57 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 2.754350E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.754 | TFLOPs: 43.17 | +7: iteration 48620/ 115203 | consumed samples: 12446720 | consumed tokens: 25490882560 | elapsed time per iteration (s): 0.57 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 2.742718E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.765 | TFLOPs: 42.98 | +7: iteration 48630/ 115203 | consumed samples: 12449280 | consumed tokens: 25496125440 | elapsed time per iteration (s): 0.57 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 2.740838E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.270 | TFLOPs: 42.74 | +7: iteration 48640/ 115203 | consumed samples: 12451840 | consumed tokens: 25501368320 | elapsed time per iteration (s): 0.56 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 2.743448E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.595 | TFLOPs: 43.34 | +7: iteration 48650/ 115203 | consumed samples: 12454400 | consumed tokens: 25506611200 | elapsed time per iteration (s): 0.56 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 2.741797E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.358 | TFLOPs: 43.22 | +7: iteration 48660/ 115203 | consumed samples: 12456960 | consumed tokens: 25511854080 | elapsed time per iteration (s): 0.58 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 2.741389E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.541 | TFLOPs: 42.29 | +7: iteration 48670/ 115203 | consumed samples: 12459520 | consumed tokens: 25517096960 | elapsed time per iteration (s): 0.57 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 2.746288E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.554 | TFLOPs: 42.86 | +7: iteration 48680/ 115203 | consumed samples: 12462080 | consumed tokens: 25522339840 | elapsed time per iteration (s): 0.57 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 2.751601E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.220 | TFLOPs: 42.83 | +7: iteration 48690/ 115203 | consumed samples: 12464640 | consumed tokens: 25527582720 | elapsed time per iteration (s): 0.56 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 2.749532E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.635 | TFLOPs: 43.44 | +7: iteration 48700/ 115203 | consumed samples: 12467200 | consumed tokens: 25532825600 | elapsed time per iteration (s): 0.57 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 2.755172E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.870 | TFLOPs: 43.18 | +7: iteration 48710/ 115203 | consumed samples: 12469760 | consumed tokens: 25538068480 | elapsed time per iteration (s): 0.55 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 2.734381E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.411 | TFLOPs: 43.99 | +7: iteration 48720/ 115203 | consumed samples: 12472320 | consumed tokens: 25543311360 | elapsed time per iteration (s): 0.56 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 2.743084E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.799 | TFLOPs: 43.55 | +7: iteration 48730/ 115203 | consumed samples: 12474880 | consumed tokens: 25548554240 | elapsed time per iteration (s): 0.56 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 2.745257E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.502 | TFLOPs: 43.71 | +7: iteration 48740/ 115203 | consumed samples: 12477440 | consumed tokens: 25553797120 | elapsed time per iteration (s): 0.56 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 2.752939E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.645 | TFLOPs: 43.54 | +7: iteration 48750/ 115203 | consumed samples: 12480000 | consumed tokens: 25559040000 | elapsed time per iteration (s): 0.56 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 2.744760E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.424 | TFLOPs: 43.52 | +7: iteration 48760/ 115203 | consumed samples: 12482560 | consumed tokens: 25564282880 | elapsed time per iteration (s): 0.56 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 2.736617E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.226 | TFLOPs: 43.69 | +7: iteration 48770/ 115203 | consumed samples: 12485120 | consumed tokens: 25569525760 | elapsed time per iteration (s): 0.56 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 2.743487E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.480 | TFLOPs: 43.71 | +7: iteration 48780/ 115203 | consumed samples: 12487680 | consumed tokens: 25574768640 | elapsed time per iteration (s): 0.57 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 2.764094E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.889 | TFLOPs: 43.18 | +7: iteration 48790/ 115203 | consumed samples: 12490240 | consumed tokens: 25580011520 | elapsed time per iteration (s): 0.57 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 2.750496E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.423 | TFLOPs: 42.56 | +7: iteration 48800/ 115203 | consumed samples: 12492800 | consumed tokens: 25585254400 | elapsed time per iteration (s): 0.57 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 2.731843E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.841 | TFLOPs: 43.17 | +7: iteration 48810/ 115203 | consumed samples: 12495360 | consumed tokens: 25590497280 | elapsed time per iteration (s): 0.56 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 2.740696E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.143 | TFLOPs: 43.39 | +7: iteration 48820/ 115203 | consumed samples: 12497920 | consumed tokens: 25595740160 | elapsed time per iteration (s): 0.57 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 2.750983E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.841 | TFLOPs: 43.08 | +7: iteration 48830/ 115203 | consumed samples: 12500480 | consumed tokens: 25600983040 | elapsed time per iteration (s): 0.57 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 2.745333E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.645 | TFLOPs: 42.58 | +7: iteration 48840/ 115203 | consumed samples: 12503040 | consumed tokens: 25606225920 | elapsed time per iteration (s): 0.56 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 2.742129E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.187 | TFLOPs: 43.21 | +7: iteration 48850/ 115203 | consumed samples: 12505600 | consumed tokens: 25611468800 | elapsed time per iteration (s): 0.57 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 2.744730E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.472 | TFLOPs: 43.14 | +7: iteration 48860/ 115203 | consumed samples: 12508160 | consumed tokens: 25616711680 | elapsed time per iteration (s): 0.56 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 2.743707E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.737 | TFLOPs: 43.74 | +7: iteration 48870/ 115203 | consumed samples: 12510720 | consumed tokens: 25621954560 | elapsed time per iteration (s): 0.57 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 2.736541E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.175 | TFLOPs: 43.11 | +7: iteration 48880/ 115203 | consumed samples: 12513280 | consumed tokens: 25627197440 | elapsed time per iteration (s): 0.57 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 2.744314E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.502 | TFLOPs: 42.76 | +7: iteration 48890/ 115203 | consumed samples: 12515840 | consumed tokens: 25632440320 | elapsed time per iteration (s): 0.60 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 2.747844E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.720 | TFLOPs: 40.97 | +7: iteration 48900/ 115203 | consumed samples: 12518400 | consumed tokens: 25637683200 | elapsed time per iteration (s): 0.56 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 2.728684E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.651 | TFLOPs: 43.54 | +7: iteration 48910/ 115203 | consumed samples: 12520960 | consumed tokens: 25642926080 | elapsed time per iteration (s): 0.57 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 2.748475E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.682 | TFLOPs: 42.49 | +7: iteration 48920/ 115203 | consumed samples: 12523520 | consumed tokens: 25648168960 | elapsed time per iteration (s): 0.55 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 2.743757E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.738 | TFLOPs: 44.02 | +7: iteration 48930/ 115203 | consumed samples: 12526080 | consumed tokens: 25653411840 | elapsed time per iteration (s): 0.57 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 2.747463E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.541 | TFLOPs: 43.14 | +7: iteration 48940/ 115203 | consumed samples: 12528640 | consumed tokens: 25658654720 | elapsed time per iteration (s): 0.56 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 2.744305E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.944 | TFLOPs: 43.56 | +7: iteration 48950/ 115203 | consumed samples: 12531200 | consumed tokens: 25663897600 | elapsed time per iteration (s): 0.55 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 2.752296E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.768 | TFLOPs: 44.02 | +7: iteration 48960/ 115203 | consumed samples: 12533760 | consumed tokens: 25669140480 | elapsed time per iteration (s): 0.57 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 2.738793E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.227 | TFLOPs: 43.11 | +7: iteration 48970/ 115203 | consumed samples: 12536320 | consumed tokens: 25674383360 | elapsed time per iteration (s): 0.56 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 2.735431E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.084 | TFLOPs: 43.67 | +7: iteration 48980/ 115203 | consumed samples: 12538880 | consumed tokens: 25679626240 | elapsed time per iteration (s): 0.56 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 2.743330E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.969 | TFLOPs: 43.57 | +7: iteration 48990/ 115203 | consumed samples: 12541440 | consumed tokens: 25684869120 | elapsed time per iteration (s): 0.56 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 2.748371E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.944 | TFLOPs: 43.56 | +7: iteration 49000/ 115203 | consumed samples: 12544000 | consumed tokens: 25690112000 | elapsed time per iteration (s): 0.56 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 2.731092E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.075 | TFLOPs: 43.86 | +7: iteration 49010/ 115203 | consumed samples: 12546560 | consumed tokens: 25695354880 | elapsed time per iteration (s): 0.56 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 2.747369E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.711 | TFLOPs: 43.45 | +7: iteration 49020/ 115203 | consumed samples: 12549120 | consumed tokens: 25700597760 | elapsed time per iteration (s): 0.56 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 2.732510E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.838 | TFLOPs: 43.55 | +7: iteration 49030/ 115203 | consumed samples: 12551680 | consumed tokens: 25705840640 | elapsed time per iteration (s): 0.55 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 2.744806E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.694 | TFLOPs: 44.02 | +7: iteration 49040/ 115203 | consumed samples: 12554240 | consumed tokens: 25711083520 | elapsed time per iteration (s): 0.56 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 2.749141E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.691 | TFLOPs: 43.45 | +7: iteration 49050/ 115203 | consumed samples: 12556800 | consumed tokens: 25716326400 | elapsed time per iteration (s): 0.55 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 2.742049E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.653 | TFLOPs: 44.01 | +7: iteration 49060/ 115203 | consumed samples: 12559360 | consumed tokens: 25721569280 | elapsed time per iteration (s): 0.57 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 2.746767E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.247 | TFLOPs: 43.12 | +7: iteration 49070/ 115203 | consumed samples: 12561920 | consumed tokens: 25726812160 | elapsed time per iteration (s): 0.56 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 2.732767E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.726 | TFLOPs: 43.64 | +7: iteration 49080/ 115203 | consumed samples: 12564480 | consumed tokens: 25732055040 | elapsed time per iteration (s): 0.56 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 2.744556E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.718 | TFLOPs: 43.54 | +7: iteration 49090/ 115203 | consumed samples: 12567040 | consumed tokens: 25737297920 | elapsed time per iteration (s): 0.56 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 2.732128E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.369 | TFLOPs: 43.51 | +7: iteration 49100/ 115203 | consumed samples: 12569600 | consumed tokens: 25742540800 | elapsed time per iteration (s): 0.56 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 2.748152E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.515 | TFLOPs: 43.71 | +7: iteration 49110/ 115203 | consumed samples: 12572160 | consumed tokens: 25747783680 | elapsed time per iteration (s): 0.56 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 2.745110E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.958 | TFLOPs: 43.57 | +7: iteration 49120/ 115203 | consumed samples: 12574720 | consumed tokens: 25753026560 | elapsed time per iteration (s): 0.56 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 2.733674E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.019 | TFLOPs: 43.48 | +7: iteration 49130/ 115203 | consumed samples: 12577280 | consumed tokens: 25758269440 | elapsed time per iteration (s): 0.56 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 2.731725E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.097 | TFLOPs: 43.48 | +7: iteration 49140/ 115203 | consumed samples: 12579840 | consumed tokens: 25763512320 | elapsed time per iteration (s): 0.55 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 2.746283E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 49150/ 115203 | consumed samples: 12582400 | consumed tokens: 25768755200 | elapsed time per iteration (s): 0.56 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 2.738108E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.327 | TFLOPs: 43.60 | +7: iteration 49160/ 115203 | consumed samples: 12584960 | consumed tokens: 25773998080 | elapsed time per iteration (s): 0.55 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 2.736869E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 49170/ 115203 | consumed samples: 12587520 | consumed tokens: 25779240960 | elapsed time per iteration (s): 0.57 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 2.736772E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.220 | TFLOPs: 43.11 | +7: iteration 49180/ 115203 | consumed samples: 12590080 | consumed tokens: 25784483840 | elapsed time per iteration (s): 0.55 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 2.721765E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.631 | TFLOPs: 44.01 | +7: iteration 49190/ 115203 | consumed samples: 12592640 | consumed tokens: 25789726720 | elapsed time per iteration (s): 0.56 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 2.739258E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.142 | TFLOPs: 43.49 | +7: iteration 49200/ 115203 | consumed samples: 12595200 | consumed tokens: 25794969600 | elapsed time per iteration (s): 0.56 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 2.733799E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.478 | TFLOPs: 43.42 | +7: iteration 49210/ 115203 | consumed samples: 12597760 | consumed tokens: 25800212480 | elapsed time per iteration (s): 0.56 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 2.731517E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.338 | TFLOPs: 43.51 | +7: iteration 49220/ 115203 | consumed samples: 12600320 | consumed tokens: 25805455360 | elapsed time per iteration (s): 0.56 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 2.738698E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.046 | TFLOPs: 43.57 | +7: iteration 49230/ 115203 | consumed samples: 12602880 | consumed tokens: 25810698240 | elapsed time per iteration (s): 0.56 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 2.722803E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.981 | TFLOPs: 43.47 | +7: iteration 49240/ 115203 | consumed samples: 12605440 | consumed tokens: 25815941120 | elapsed time per iteration (s): 0.57 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 2.736327E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.217 | TFLOPs: 43.11 | +7: iteration 49250/ 115203 | consumed samples: 12608000 | consumed tokens: 25821184000 | elapsed time per iteration (s): 0.56 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 2.734228E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.599 | TFLOPs: 43.72 | +7: iteration 49260/ 115203 | consumed samples: 12610560 | consumed tokens: 25826426880 | elapsed time per iteration (s): 0.55 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 2.742055E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.418 | TFLOPs: 43.99 | +7: iteration 49270/ 115203 | consumed samples: 12613120 | consumed tokens: 25831669760 | elapsed time per iteration (s): 0.56 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 2.745958E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.301 | TFLOPs: 43.69 | +7: iteration 49280/ 115203 | consumed samples: 12615680 | consumed tokens: 25836912640 | elapsed time per iteration (s): 0.55 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 2.728283E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.339 | TFLOPs: 43.98 | +7: iteration 49290/ 115203 | consumed samples: 12618240 | consumed tokens: 25842155520 | elapsed time per iteration (s): 0.56 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 2.738627E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.515 | TFLOPs: 43.43 | +7: iteration 49300/ 115203 | consumed samples: 12620800 | consumed tokens: 25847398400 | elapsed time per iteration (s): 0.56 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 2.734287E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.536 | TFLOPs: 43.53 | +7: iteration 49310/ 115203 | consumed samples: 12623360 | consumed tokens: 25852641280 | elapsed time per iteration (s): 0.56 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 2.732600E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.741 | TFLOPs: 43.74 | +7: iteration 49320/ 115203 | consumed samples: 12625920 | consumed tokens: 25857884160 | elapsed time per iteration (s): 0.56 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 2.733828E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.945 | TFLOPs: 43.47 | +7: iteration 49330/ 115203 | consumed samples: 12628480 | consumed tokens: 25863127040 | elapsed time per iteration (s): 0.56 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 2.742408E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.942 | TFLOPs: 43.56 | +7: iteration 49340/ 115203 | consumed samples: 12631040 | consumed tokens: 25868369920 | elapsed time per iteration (s): 0.57 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 2.734884E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.676 | TFLOPs: 43.16 | +7: iteration 49350/ 115203 | consumed samples: 12633600 | consumed tokens: 25873612800 | elapsed time per iteration (s): 0.56 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 2.748709E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.999 | TFLOPs: 43.57 | +7: iteration 49360/ 115203 | consumed samples: 12636160 | consumed tokens: 25878855680 | elapsed time per iteration (s): 0.56 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 2.743511E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.243 | TFLOPs: 43.21 | +7: iteration 49370/ 115203 | consumed samples: 12638720 | consumed tokens: 25884098560 | elapsed time per iteration (s): 0.55 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 2.740338E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.461 | TFLOPs: 44.00 | +7: iteration 49380/ 115203 | consumed samples: 12641280 | consumed tokens: 25889341440 | elapsed time per iteration (s): 0.56 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 2.744828E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.790 | TFLOPs: 43.45 | +7: iteration 49390/ 115203 | consumed samples: 12643840 | consumed tokens: 25894584320 | elapsed time per iteration (s): 0.57 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 2.740562E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.343 | TFLOPs: 43.13 | +7: iteration 49400/ 115203 | consumed samples: 12646400 | consumed tokens: 25899827200 | elapsed time per iteration (s): 0.56 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 2.737123E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.195 | TFLOPs: 43.40 | +7: iteration 49410/ 115203 | consumed samples: 12648960 | consumed tokens: 25905070080 | elapsed time per iteration (s): 0.56 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 2.726832E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.846 | TFLOPs: 43.46 | +7: iteration 49420/ 115203 | consumed samples: 12651520 | consumed tokens: 25910312960 | elapsed time per iteration (s): 0.56 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 2.748381E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.376 | TFLOPs: 43.51 | +7: iteration 49430/ 115203 | consumed samples: 12654080 | consumed tokens: 25915555840 | elapsed time per iteration (s): 0.56 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 2.741386E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.787 | TFLOPs: 43.84 | +7: iteration 49440/ 115203 | consumed samples: 12656640 | consumed tokens: 25920798720 | elapsed time per iteration (s): 0.55 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 2.740216E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.478 | TFLOPs: 44.00 | +7: iteration 49450/ 115203 | consumed samples: 12659200 | consumed tokens: 25926041600 | elapsed time per iteration (s): 0.56 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 2.732498E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.526 | TFLOPs: 43.24 | +7: iteration 49460/ 115203 | consumed samples: 12661760 | consumed tokens: 25931284480 | elapsed time per iteration (s): 0.57 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 2.736852E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.644 | TFLOPs: 42.58 | +7: iteration 49470/ 115203 | consumed samples: 12664320 | consumed tokens: 25936527360 | elapsed time per iteration (s): 0.56 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 2.737481E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.652 | TFLOPs: 43.73 | +7: iteration 49480/ 115203 | consumed samples: 12666880 | consumed tokens: 25941770240 | elapsed time per iteration (s): 0.55 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 2.733029E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.610 | TFLOPs: 44.01 | +7: iteration 49490/ 115203 | consumed samples: 12669440 | consumed tokens: 25947013120 | elapsed time per iteration (s): 0.55 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 2.757660E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 49500/ 115203 | consumed samples: 12672000 | consumed tokens: 25952256000 | elapsed time per iteration (s): 0.56 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 2.745352E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.633 | TFLOPs: 43.25 | +7: iteration 49510/ 115203 | consumed samples: 12674560 | consumed tokens: 25957498880 | elapsed time per iteration (s): 0.56 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 2.737925E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.805 | TFLOPs: 43.46 | +7: iteration 49520/ 115203 | consumed samples: 12677120 | consumed tokens: 25962741760 | elapsed time per iteration (s): 0.57 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 2.737426E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.626 | TFLOPs: 43.15 | +7: iteration 49530/ 115203 | consumed samples: 12679680 | consumed tokens: 25967984640 | elapsed time per iteration (s): 0.57 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 2.740851E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.599 | TFLOPs: 42.77 | +7: iteration 49540/ 115203 | consumed samples: 12682240 | consumed tokens: 25973227520 | elapsed time per iteration (s): 0.55 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 2.744841E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.598 | TFLOPs: 44.01 | +7: iteration 49550/ 115203 | consumed samples: 12684800 | consumed tokens: 25978470400 | elapsed time per iteration (s): 0.56 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 2.737027E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.912 | TFLOPs: 43.75 | +7: iteration 49560/ 115203 | consumed samples: 12687360 | consumed tokens: 25983713280 | elapsed time per iteration (s): 0.59 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 2.732477E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.557 | TFLOPs: 41.53 | +7: iteration 49570/ 115203 | consumed samples: 12689920 | consumed tokens: 25988956160 | elapsed time per iteration (s): 0.57 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 2.727794E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.883 | TFLOPs: 43.08 | +7: iteration 49580/ 115203 | consumed samples: 12692480 | consumed tokens: 25994199040 | elapsed time per iteration (s): 0.55 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 2.727545E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +7: iteration 49590/ 115203 | consumed samples: 12695040 | consumed tokens: 25999441920 | elapsed time per iteration (s): 0.57 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 2.748892E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.755 | TFLOPs: 43.07 | +7: iteration 49600/ 115203 | consumed samples: 12697600 | consumed tokens: 26004684800 | elapsed time per iteration (s): 0.56 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 2.739571E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.192 | TFLOPs: 43.49 | +7: iteration 49610/ 115203 | consumed samples: 12700160 | consumed tokens: 26009927680 | elapsed time per iteration (s): 0.55 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 2.744355E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.497 | TFLOPs: 44.00 | +7: iteration 49620/ 115203 | consumed samples: 12702720 | consumed tokens: 26015170560 | elapsed time per iteration (s): 0.56 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 2.739398E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.291 | TFLOPs: 43.31 | +7: iteration 49630/ 115203 | consumed samples: 12705280 | consumed tokens: 26020413440 | elapsed time per iteration (s): 0.56 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 2.728226E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.681 | TFLOPs: 43.73 | +7: iteration 49640/ 115203 | consumed samples: 12707840 | consumed tokens: 26025656320 | elapsed time per iteration (s): 0.56 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 2.747512E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.985 | TFLOPs: 43.28 | +7: iteration 49650/ 115203 | consumed samples: 12710400 | consumed tokens: 26030899200 | elapsed time per iteration (s): 0.56 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 2.742719E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.733 | TFLOPs: 43.45 | +7: iteration 49660/ 115203 | consumed samples: 12712960 | consumed tokens: 26036142080 | elapsed time per iteration (s): 0.59 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 2.739002E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.780 | TFLOPs: 41.55 | +7: iteration 49670/ 115203 | consumed samples: 12715520 | consumed tokens: 26041384960 | elapsed time per iteration (s): 0.56 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 2.726647E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.070 | TFLOPs: 43.58 | +7: iteration 49680/ 115203 | consumed samples: 12718080 | consumed tokens: 26046627840 | elapsed time per iteration (s): 0.56 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 2.744598E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.696 | TFLOPs: 43.54 | +7: iteration 49690/ 115203 | consumed samples: 12720640 | consumed tokens: 26051870720 | elapsed time per iteration (s): 0.56 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 2.735530E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.141 | TFLOPs: 43.96 | +7: iteration 49700/ 115203 | consumed samples: 12723200 | consumed tokens: 26057113600 | elapsed time per iteration (s): 0.56 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 2.729456E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.121 | TFLOPs: 43.39 | +7: iteration 49710/ 115203 | consumed samples: 12725760 | consumed tokens: 26062356480 | elapsed time per iteration (s): 0.55 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 2.735015E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.548 | TFLOPs: 44.00 | +7: iteration 49720/ 115203 | consumed samples: 12728320 | consumed tokens: 26067599360 | elapsed time per iteration (s): 0.56 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 2.723627E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.145 | TFLOPs: 43.49 | +7: iteration 49730/ 115203 | consumed samples: 12730880 | consumed tokens: 26072842240 | elapsed time per iteration (s): 0.57 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 2.737681E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.718 | TFLOPs: 42.69 | +7: iteration 49740/ 115203 | consumed samples: 12733440 | consumed tokens: 26078085120 | elapsed time per iteration (s): 0.56 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 2.723825E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.297 | TFLOPs: 43.60 | +7: iteration 49750/ 115203 | consumed samples: 12736000 | consumed tokens: 26083328000 | elapsed time per iteration (s): 0.56 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 2.736445E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.937 | TFLOPs: 43.56 | +7: iteration 49760/ 115203 | consumed samples: 12738560 | consumed tokens: 26088570880 | elapsed time per iteration (s): 0.55 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 2.730001E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.635 | TFLOPs: 44.01 | +7: iteration 49770/ 115203 | consumed samples: 12741120 | consumed tokens: 26093813760 | elapsed time per iteration (s): 0.57 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 2.744375E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.442 | TFLOPs: 43.14 | +7: iteration 49780/ 115203 | consumed samples: 12743680 | consumed tokens: 26099056640 | elapsed time per iteration (s): 0.56 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 2.737802E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.521 | TFLOPs: 43.72 | +7: iteration 49790/ 115203 | consumed samples: 12746240 | consumed tokens: 26104299520 | elapsed time per iteration (s): 0.56 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 2.750824E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.911 | TFLOPs: 43.47 | +7: iteration 49800/ 115203 | consumed samples: 12748800 | consumed tokens: 26109542400 | elapsed time per iteration (s): 0.56 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 2.739125E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.212 | TFLOPs: 43.49 | +7: iteration 49810/ 115203 | consumed samples: 12751360 | consumed tokens: 26114785280 | elapsed time per iteration (s): 0.55 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 2.738150E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 49820/ 115203 | consumed samples: 12753920 | consumed tokens: 26120028160 | elapsed time per iteration (s): 0.57 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 2.730975E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.880 | TFLOPs: 43.08 | +7: iteration 49830/ 115203 | consumed samples: 12756480 | consumed tokens: 26125271040 | elapsed time per iteration (s): 0.56 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 2.742319E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.842 | TFLOPs: 43.94 | +7: iteration 49840/ 115203 | consumed samples: 12759040 | consumed tokens: 26130513920 | elapsed time per iteration (s): 0.56 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 2.751111E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.161 | TFLOPs: 43.49 | +7: iteration 49850/ 115203 | consumed samples: 12761600 | consumed tokens: 26135756800 | elapsed time per iteration (s): 0.56 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 2.732935E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.062 | TFLOPs: 43.48 | +7: iteration 49860/ 115203 | consumed samples: 12764160 | consumed tokens: 26140999680 | elapsed time per iteration (s): 0.55 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 2.734919E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.739 | TFLOPs: 44.02 | +7: iteration 49870/ 115203 | consumed samples: 12766720 | consumed tokens: 26146242560 | elapsed time per iteration (s): 0.56 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 2.738391E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.134 | TFLOPs: 43.68 | +7: iteration 49880/ 115203 | consumed samples: 12769280 | consumed tokens: 26151485440 | elapsed time per iteration (s): 0.56 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 2.735621E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.871 | TFLOPs: 43.37 | +7: iteration 49890/ 115203 | consumed samples: 12771840 | consumed tokens: 26156728320 | elapsed time per iteration (s): 0.56 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 2.741093E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.850 | TFLOPs: 43.56 | +7: iteration 49900/ 115203 | consumed samples: 12774400 | consumed tokens: 26161971200 | elapsed time per iteration (s): 0.56 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 2.740047E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.965 | TFLOPs: 43.47 | +7: iteration 49910/ 115203 | consumed samples: 12776960 | consumed tokens: 26167214080 | elapsed time per iteration (s): 0.55 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 2.737965E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.474 | TFLOPs: 44.00 | +7: iteration 49920/ 115203 | consumed samples: 12779520 | consumed tokens: 26172456960 | elapsed time per iteration (s): 0.56 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 2.731761E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.469 | TFLOPs: 43.71 | +7: iteration 49930/ 115203 | consumed samples: 12782080 | consumed tokens: 26177699840 | elapsed time per iteration (s): 0.56 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 2.732489E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.298 | TFLOPs: 43.69 | +7: iteration 49940/ 115203 | consumed samples: 12784640 | consumed tokens: 26182942720 | elapsed time per iteration (s): 0.56 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 2.740990E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.729 | TFLOPs: 43.54 | +7: iteration 49950/ 115203 | consumed samples: 12787200 | consumed tokens: 26188185600 | elapsed time per iteration (s): 0.56 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 2.744207E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.042 | TFLOPs: 43.48 | +7: iteration 49960/ 115203 | consumed samples: 12789760 | consumed tokens: 26193428480 | elapsed time per iteration (s): 0.56 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 2.738635E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.450 | TFLOPs: 43.52 | +7: iteration 49970/ 115203 | consumed samples: 12792320 | consumed tokens: 26198671360 | elapsed time per iteration (s): 0.55 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 2.739787E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.481 | TFLOPs: 44.00 | +7: iteration 49980/ 115203 | consumed samples: 12794880 | consumed tokens: 26203914240 | elapsed time per iteration (s): 0.57 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 2.730982E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.915 | TFLOPs: 43.18 | +7: iteration 49990/ 115203 | consumed samples: 12797440 | consumed tokens: 26209157120 | elapsed time per iteration (s): 0.58 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 2.733269E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.106 | TFLOPs: 42.44 | +0: [2023-03-16 20:40:44,712] [INFO] [logging.py:68:log_dist] [Rank 0] step=50000, skipped=0, lr=[0.00013010274525760026, 0.00013010274525760026, 0.00013010274525760026], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 50000/ 115203 | consumed samples: 12800000 | consumed tokens: 26214400000 | elapsed time per iteration (s): 0.56 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 2.747729E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.374 | TFLOPs: 43.70 | +0: steps: 50000 loss: 2.7650 iter time (s): 0.561 samples/sec: 456.141 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 50000 | lm loss value: 3.361743E+00 | lm loss PPL: 2.883941E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 50000 to checkpoints_421m60b400m +0: [2023-03-16 20:40:44,923] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step50000 is begin to save! +0: [2023-03-16 20:40:44,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_01-model_00-model_states.pt... +0: [2023-03-16 20:40:45,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_01-model_00-model_states.pt. +0: [2023-03-16 20:40:45,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_03-model_00-model_states.pt... +0: [2023-03-16 20:40:45,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_03-model_00-model_states.pt. +0: [2023-03-16 20:40:45,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_04-model_00-model_states.pt... +0: [2023-03-16 20:40:45,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_04-model_00-model_states.pt. +0: [2023-03-16 20:40:45,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_05-model_00-model_states.pt... +0: [2023-03-16 20:40:45,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_05-model_00-model_states.pt. +0: [2023-03-16 20:40:45,229] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_06-model_00-model_states.pt... +0: [2023-03-16 20:40:45,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_06-model_00-model_states.pt. +0: [2023-03-16 20:40:45,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_07-model_00-model_states.pt... +0: [2023-03-16 20:40:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_07-model_00-model_states.pt. +0: [2023-03-16 20:40:45,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_08-model_00-model_states.pt... +0: [2023-03-16 20:40:45,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_08-model_00-model_states.pt. +0: [2023-03-16 20:40:45,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_09-model_00-model_states.pt... +0: [2023-03-16 20:40:45,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_09-model_00-model_states.pt. +0: [2023-03-16 20:40:45,401] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_10-model_00-model_states.pt... +0: [2023-03-16 20:40:45,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_10-model_00-model_states.pt. +0: [2023-03-16 20:40:45,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_11-model_00-model_states.pt... +0: [2023-03-16 20:40:45,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_11-model_00-model_states.pt. +0: [2023-03-16 20:40:45,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_12-model_00-model_states.pt... +0: [2023-03-16 20:40:45,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_12-model_00-model_states.pt. +0: [2023-03-16 20:40:45,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_13-model_00-model_states.pt... +0: [2023-03-16 20:40:45,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_13-model_00-model_states.pt. +0: [2023-03-16 20:40:45,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_14-model_00-model_states.pt... +0: [2023-03-16 20:40:45,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_14-model_00-model_states.pt. +0: [2023-03-16 20:40:45,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_15-model_00-model_states.pt... +0: [2023-03-16 20:40:45,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_15-model_00-model_states.pt. +0: [2023-03-16 20:40:45,656] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_16-model_00-model_states.pt... +0: [2023-03-16 20:40:45,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_16-model_00-model_states.pt. +0: [2023-03-16 20:40:45,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_17-model_00-model_states.pt... +0: [2023-03-16 20:40:45,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_17-model_00-model_states.pt. +0: [2023-03-16 20:40:45,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_18-model_00-model_states.pt... +0: [2023-03-16 20:40:45,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_18-model_00-model_states.pt. +0: [2023-03-16 20:40:45,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_19-model_00-model_states.pt... +0: [2023-03-16 20:40:45,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_19-model_00-model_states.pt. +0: [2023-03-16 20:40:45,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_20-model_00-model_states.pt... +0: [2023-03-16 20:40:45,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_20-model_00-model_states.pt. +0: [2023-03-16 20:40:45,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/layer_22-model_00-model_states.pt... +0: [2023-03-16 20:40:45,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/layer_22-model_00-model_states.pt. +0: [2023-03-16 20:40:45,873] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step50000/mp_rank_00_model_states.pt +0: [2023-03-16 20:40:45,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/mp_rank_00_model_states.pt... +0: [2023-03-16 20:40:45,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/mp_rank_00_model_states.pt. +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +3: [2023-03-16 20:40:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-16 20:40:45,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:45,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:45,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-16 20:40:46,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-16 20:40:46,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-16 20:40:46,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 20:40:46,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-16 20:40:46,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-16 20:40:46,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-16 20:40:46,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-16 20:40:46,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-16 20:40:46,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 20:40:46,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: successfully saved checkpoint at iteration 50000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1224.10 +7: iteration 50010/ 115203 | consumed samples: 12802560 | consumed tokens: 26219642880 | elapsed time per iteration (s): 0.70 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 2.739010E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 367.028 | TFLOPs: 34.99 | +7: iteration 50020/ 115203 | consumed samples: 12805120 | consumed tokens: 26224885760 | elapsed time per iteration (s): 0.55 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 2.746364E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.012 | TFLOPs: 44.05 | +7: iteration 50030/ 115203 | consumed samples: 12807680 | consumed tokens: 26230128640 | elapsed time per iteration (s): 0.55 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 2.747606E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.911 | TFLOPs: 44.04 | +7: iteration 50040/ 115203 | consumed samples: 12810240 | consumed tokens: 26235371520 | elapsed time per iteration (s): 0.56 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 2.740857E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.408 | TFLOPs: 43.80 | +7: iteration 50050/ 115203 | consumed samples: 12812800 | consumed tokens: 26240614400 | elapsed time per iteration (s): 0.56 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 2.741981E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.006 | TFLOPs: 43.48 | +7: iteration 50060/ 115203 | consumed samples: 12815360 | consumed tokens: 26245857280 | elapsed time per iteration (s): 0.57 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 2.749009E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.584 | TFLOPs: 42.86 | +7: iteration 50070/ 115203 | consumed samples: 12817920 | consumed tokens: 26251100160 | elapsed time per iteration (s): 0.55 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 2.730447E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.688 | TFLOPs: 44.02 | +7: iteration 50080/ 115203 | consumed samples: 12820480 | consumed tokens: 26256343040 | elapsed time per iteration (s): 0.56 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 2.717085E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.863 | TFLOPs: 43.46 | +7: iteration 50090/ 115203 | consumed samples: 12823040 | consumed tokens: 26261585920 | elapsed time per iteration (s): 0.56 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 2.726773E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.275 | TFLOPs: 43.69 | +7: iteration 50100/ 115203 | consumed samples: 12825600 | consumed tokens: 26266828800 | elapsed time per iteration (s): 0.56 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 2.725959E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.209 | TFLOPs: 43.97 | +7: iteration 50110/ 115203 | consumed samples: 12828160 | consumed tokens: 26272071680 | elapsed time per iteration (s): 0.56 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 2.738144E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.190 | TFLOPs: 43.68 | +7: iteration 50120/ 115203 | consumed samples: 12830720 | consumed tokens: 26277314560 | elapsed time per iteration (s): 0.56 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 2.738573E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.236 | TFLOPs: 43.97 | +7: iteration 50130/ 115203 | consumed samples: 12833280 | consumed tokens: 26282557440 | elapsed time per iteration (s): 0.56 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 2.734041E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.355 | TFLOPs: 43.70 | +7: iteration 50140/ 115203 | consumed samples: 12835840 | consumed tokens: 26287800320 | elapsed time per iteration (s): 0.55 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 2.722011E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.487 | TFLOPs: 44.00 | +7: iteration 50150/ 115203 | consumed samples: 12838400 | consumed tokens: 26293043200 | elapsed time per iteration (s): 0.56 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 2.744269E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.814 | TFLOPs: 43.55 | +7: iteration 50160/ 115203 | consumed samples: 12840960 | consumed tokens: 26298286080 | elapsed time per iteration (s): 0.55 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 2.732325E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.432 | TFLOPs: 43.99 | +7: iteration 50170/ 115203 | consumed samples: 12843520 | consumed tokens: 26303528960 | elapsed time per iteration (s): 0.56 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 2.738172E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.804 | TFLOPs: 43.93 | +7: iteration 50180/ 115203 | consumed samples: 12846080 | consumed tokens: 26308771840 | elapsed time per iteration (s): 0.56 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 2.739815E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.191 | TFLOPs: 43.40 | +7: iteration 50190/ 115203 | consumed samples: 12848640 | consumed tokens: 26314014720 | elapsed time per iteration (s): 0.57 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 2.722117E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.976 | TFLOPs: 43.09 | +7: iteration 50200/ 115203 | consumed samples: 12851200 | consumed tokens: 26319257600 | elapsed time per iteration (s): 0.56 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 2.730954E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.022 | TFLOPs: 43.67 | +7: iteration 50210/ 115203 | consumed samples: 12853760 | consumed tokens: 26324500480 | elapsed time per iteration (s): 0.55 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 2.737460E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.598 | TFLOPs: 44.01 | +7: iteration 50220/ 115203 | consumed samples: 12856320 | consumed tokens: 26329743360 | elapsed time per iteration (s): 0.55 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 2.735482E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.567 | TFLOPs: 44.01 | +7: iteration 50230/ 115203 | consumed samples: 12858880 | consumed tokens: 26334986240 | elapsed time per iteration (s): 0.56 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 2.718867E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.482 | TFLOPs: 43.71 | +7: iteration 50240/ 115203 | consumed samples: 12861440 | consumed tokens: 26340229120 | elapsed time per iteration (s): 0.56 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 2.734036E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.677 | TFLOPs: 43.54 | +7: iteration 50250/ 115203 | consumed samples: 12864000 | consumed tokens: 26345472000 | elapsed time per iteration (s): 0.56 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 2.729926E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.906 | TFLOPs: 43.94 | +7: iteration 50260/ 115203 | consumed samples: 12866560 | consumed tokens: 26350714880 | elapsed time per iteration (s): 0.56 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 2.733606E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.858 | TFLOPs: 43.94 | +7: iteration 50270/ 115203 | consumed samples: 12869120 | consumed tokens: 26355957760 | elapsed time per iteration (s): 0.58 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 2.743604E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.613 | TFLOPs: 42.01 | +7: iteration 50280/ 115203 | consumed samples: 12871680 | consumed tokens: 26361200640 | elapsed time per iteration (s): 0.56 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 2.732127E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.203 | TFLOPs: 43.97 | +7: iteration 50290/ 115203 | consumed samples: 12874240 | consumed tokens: 26366443520 | elapsed time per iteration (s): 0.56 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 2.742897E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.270 | TFLOPs: 43.69 | +7: iteration 50300/ 115203 | consumed samples: 12876800 | consumed tokens: 26371686400 | elapsed time per iteration (s): 0.57 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 2.731514E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.098 | TFLOPs: 42.91 | +7: iteration 50310/ 115203 | consumed samples: 12879360 | consumed tokens: 26376929280 | elapsed time per iteration (s): 0.55 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 2.730028E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 50320/ 115203 | consumed samples: 12881920 | consumed tokens: 26382172160 | elapsed time per iteration (s): 0.55 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 2.739316E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.460 | TFLOPs: 44.00 | +7: iteration 50330/ 115203 | consumed samples: 12884480 | consumed tokens: 26387415040 | elapsed time per iteration (s): 0.56 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 2.731475E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.948 | TFLOPs: 43.57 | +7: iteration 50340/ 115203 | consumed samples: 12887040 | consumed tokens: 26392657920 | elapsed time per iteration (s): 0.56 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 2.751291E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.102 | TFLOPs: 43.68 | +7: iteration 50350/ 115203 | consumed samples: 12889600 | consumed tokens: 26397900800 | elapsed time per iteration (s): 0.56 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 2.719910E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.979 | TFLOPs: 43.95 | +7: iteration 50360/ 115203 | consumed samples: 12892160 | consumed tokens: 26403143680 | elapsed time per iteration (s): 0.56 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 2.737757E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.913 | TFLOPs: 43.56 | +7: iteration 50370/ 115203 | consumed samples: 12894720 | consumed tokens: 26408386560 | elapsed time per iteration (s): 0.56 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 2.716795E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.957 | TFLOPs: 43.95 | +7: iteration 50380/ 115203 | consumed samples: 12897280 | consumed tokens: 26413629440 | elapsed time per iteration (s): 0.56 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 2.731876E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.194 | TFLOPs: 43.97 | +7: iteration 50390/ 115203 | consumed samples: 12899840 | consumed tokens: 26418872320 | elapsed time per iteration (s): 0.56 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 2.728070E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.181 | TFLOPs: 43.97 | +7: iteration 50400/ 115203 | consumed samples: 12902400 | consumed tokens: 26424115200 | elapsed time per iteration (s): 0.56 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 2.740209E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.127 | TFLOPs: 43.96 | +7: iteration 50410/ 115203 | consumed samples: 12904960 | consumed tokens: 26429358080 | elapsed time per iteration (s): 0.56 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 2.732992E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.881 | TFLOPs: 43.94 | +7: iteration 50420/ 115203 | consumed samples: 12907520 | consumed tokens: 26434600960 | elapsed time per iteration (s): 0.56 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 2.725196E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.424 | TFLOPs: 43.61 | +7: iteration 50430/ 115203 | consumed samples: 12910080 | consumed tokens: 26439843840 | elapsed time per iteration (s): 0.55 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 2.737985E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.573 | TFLOPs: 44.01 | +7: iteration 50440/ 115203 | consumed samples: 12912640 | consumed tokens: 26445086720 | elapsed time per iteration (s): 0.55 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 2.744034E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.349 | TFLOPs: 43.98 | +7: iteration 50450/ 115203 | consumed samples: 12915200 | consumed tokens: 26450329600 | elapsed time per iteration (s): 0.55 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 2.744329E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.690 | TFLOPs: 44.02 | +7: iteration 50460/ 115203 | consumed samples: 12917760 | consumed tokens: 26455572480 | elapsed time per iteration (s): 0.55 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 2.741735E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.703 | TFLOPs: 44.02 | +7: iteration 50470/ 115203 | consumed samples: 12920320 | consumed tokens: 26460815360 | elapsed time per iteration (s): 0.55 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 2.740338E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 50480/ 115203 | consumed samples: 12922880 | consumed tokens: 26466058240 | elapsed time per iteration (s): 0.56 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 2.737494E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.168 | TFLOPs: 43.68 | +7: iteration 50490/ 115203 | consumed samples: 12925440 | consumed tokens: 26471301120 | elapsed time per iteration (s): 0.57 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 2.728683E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.553 | TFLOPs: 43.05 | +7: iteration 50500/ 115203 | consumed samples: 12928000 | consumed tokens: 26476544000 | elapsed time per iteration (s): 0.55 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 2.720683E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.580 | TFLOPs: 44.01 | +7: iteration 50510/ 115203 | consumed samples: 12930560 | consumed tokens: 26481786880 | elapsed time per iteration (s): 0.55 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 2.731574E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.502 | TFLOPs: 44.00 | +7: iteration 50520/ 115203 | consumed samples: 12933120 | consumed tokens: 26487029760 | elapsed time per iteration (s): 0.56 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 2.723821E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.105 | TFLOPs: 43.58 | +7: iteration 50530/ 115203 | consumed samples: 12935680 | consumed tokens: 26492272640 | elapsed time per iteration (s): 0.55 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 2.745105E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.496 | TFLOPs: 44.00 | +7: iteration 50540/ 115203 | consumed samples: 12938240 | consumed tokens: 26497515520 | elapsed time per iteration (s): 0.55 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 2.748717E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.633 | TFLOPs: 44.01 | +7: iteration 50550/ 115203 | consumed samples: 12940800 | consumed tokens: 26502758400 | elapsed time per iteration (s): 0.55 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 2.732323E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.557 | TFLOPs: 44.00 | +7: iteration 50560/ 115203 | consumed samples: 12943360 | consumed tokens: 26508001280 | elapsed time per iteration (s): 0.55 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 2.733789E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.586 | TFLOPs: 44.01 | +7: iteration 50570/ 115203 | consumed samples: 12945920 | consumed tokens: 26513244160 | elapsed time per iteration (s): 0.56 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 2.731336E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.366 | TFLOPs: 43.60 | +7: iteration 50580/ 115203 | consumed samples: 12948480 | consumed tokens: 26518487040 | elapsed time per iteration (s): 0.56 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 2.731170E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.821 | TFLOPs: 43.84 | +7: iteration 50590/ 115203 | consumed samples: 12951040 | consumed tokens: 26523729920 | elapsed time per iteration (s): 0.56 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 2.727779E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.658 | TFLOPs: 43.73 | +7: iteration 50600/ 115203 | consumed samples: 12953600 | consumed tokens: 26528972800 | elapsed time per iteration (s): 0.55 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 2.741489E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.596 | TFLOPs: 44.01 | +7: iteration 50610/ 115203 | consumed samples: 12956160 | consumed tokens: 26534215680 | elapsed time per iteration (s): 0.55 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 2.727950E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.541 | TFLOPs: 44.00 | +7: iteration 50620/ 115203 | consumed samples: 12958720 | consumed tokens: 26539458560 | elapsed time per iteration (s): 0.55 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 2.742128E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.679 | TFLOPs: 44.02 | +7: iteration 50630/ 115203 | consumed samples: 12961280 | consumed tokens: 26544701440 | elapsed time per iteration (s): 0.56 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 2.733223E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.765 | TFLOPs: 43.26 | +7: iteration 50640/ 115203 | consumed samples: 12963840 | consumed tokens: 26549944320 | elapsed time per iteration (s): 0.55 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 2.740231E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.682 | TFLOPs: 44.02 | +7: iteration 50650/ 115203 | consumed samples: 12966400 | consumed tokens: 26555187200 | elapsed time per iteration (s): 0.55 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 2.735494E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.714 | TFLOPs: 44.02 | +7: iteration 50660/ 115203 | consumed samples: 12968960 | consumed tokens: 26560430080 | elapsed time per iteration (s): 0.55 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 2.739908E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.516 | TFLOPs: 44.00 | +7: iteration 50670/ 115203 | consumed samples: 12971520 | consumed tokens: 26565672960 | elapsed time per iteration (s): 0.56 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 2.723642E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.621 | TFLOPs: 43.72 | +7: iteration 50680/ 115203 | consumed samples: 12974080 | consumed tokens: 26570915840 | elapsed time per iteration (s): 0.57 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 2.724282E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.062 | TFLOPs: 42.62 | +7: iteration 50690/ 115203 | consumed samples: 12976640 | consumed tokens: 26576158720 | elapsed time per iteration (s): 0.56 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 2.736398E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.038 | TFLOPs: 43.57 | +7: iteration 50700/ 115203 | consumed samples: 12979200 | consumed tokens: 26581401600 | elapsed time per iteration (s): 0.56 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 2.726267E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.376 | TFLOPs: 43.61 | +7: iteration 50710/ 115203 | consumed samples: 12981760 | consumed tokens: 26586644480 | elapsed time per iteration (s): 0.56 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 2.724619E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.508 | TFLOPs: 43.52 | +7: iteration 50720/ 115203 | consumed samples: 12984320 | consumed tokens: 26591887360 | elapsed time per iteration (s): 0.56 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 2.737348E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.466 | TFLOPs: 43.71 | +7: iteration 50730/ 115203 | consumed samples: 12986880 | consumed tokens: 26597130240 | elapsed time per iteration (s): 0.56 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 2.731615E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.701 | TFLOPs: 43.64 | +7: iteration 50740/ 115203 | consumed samples: 12989440 | consumed tokens: 26602373120 | elapsed time per iteration (s): 0.55 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 2.744019E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.612 | TFLOPs: 44.01 | +7: iteration 50750/ 115203 | consumed samples: 12992000 | consumed tokens: 26607616000 | elapsed time per iteration (s): 0.56 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 2.736924E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.030 | TFLOPs: 43.57 | +7: iteration 50760/ 115203 | consumed samples: 12994560 | consumed tokens: 26612858880 | elapsed time per iteration (s): 0.56 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 2.719721E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.719 | TFLOPs: 43.54 | +7: iteration 50770/ 115203 | consumed samples: 12997120 | consumed tokens: 26618101760 | elapsed time per iteration (s): 0.57 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 2.729972E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.625 | TFLOPs: 43.15 | +7: iteration 50780/ 115203 | consumed samples: 12999680 | consumed tokens: 26623344640 | elapsed time per iteration (s): 0.56 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 2.733778E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.590 | TFLOPs: 43.63 | +7: iteration 50790/ 115203 | consumed samples: 13002240 | consumed tokens: 26628587520 | elapsed time per iteration (s): 0.56 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 2.742093E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.924 | TFLOPs: 43.47 | +7: iteration 50800/ 115203 | consumed samples: 13004800 | consumed tokens: 26633830400 | elapsed time per iteration (s): 0.56 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 2.729943E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.292 | TFLOPs: 43.60 | +7: iteration 50810/ 115203 | consumed samples: 13007360 | consumed tokens: 26639073280 | elapsed time per iteration (s): 0.56 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 2.745243E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.693 | TFLOPs: 43.73 | +7: iteration 50820/ 115203 | consumed samples: 13009920 | consumed tokens: 26644316160 | elapsed time per iteration (s): 0.57 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 2.726645E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.424 | TFLOPs: 43.04 | +7: iteration 50830/ 115203 | consumed samples: 13012480 | consumed tokens: 26649559040 | elapsed time per iteration (s): 0.55 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 2.725765E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.844 | TFLOPs: 44.03 | +7: iteration 50840/ 115203 | consumed samples: 13015040 | consumed tokens: 26654801920 | elapsed time per iteration (s): 0.56 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 2.729164E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.194 | TFLOPs: 43.49 | +7: iteration 50850/ 115203 | consumed samples: 13017600 | consumed tokens: 26660044800 | elapsed time per iteration (s): 0.55 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 2.724104E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.714 | TFLOPs: 44.02 | +7: iteration 50860/ 115203 | consumed samples: 13020160 | consumed tokens: 26665287680 | elapsed time per iteration (s): 0.55 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 2.727822E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.601 | TFLOPs: 44.01 | +7: iteration 50870/ 115203 | consumed samples: 13022720 | consumed tokens: 26670530560 | elapsed time per iteration (s): 0.55 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 2.727546E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 50880/ 115203 | consumed samples: 13025280 | consumed tokens: 26675773440 | elapsed time per iteration (s): 0.56 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 2.727327E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.619 | TFLOPs: 43.82 | +7: iteration 50890/ 115203 | consumed samples: 13027840 | consumed tokens: 26681016320 | elapsed time per iteration (s): 0.56 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 2.721625E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.117 | TFLOPs: 43.68 | +7: iteration 50900/ 115203 | consumed samples: 13030400 | consumed tokens: 26686259200 | elapsed time per iteration (s): 0.55 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 2.743851E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.507 | TFLOPs: 44.00 | +7: iteration 50910/ 115203 | consumed samples: 13032960 | consumed tokens: 26691502080 | elapsed time per iteration (s): 0.56 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 2.733855E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.062 | TFLOPs: 43.29 | +7: iteration 50920/ 115203 | consumed samples: 13035520 | consumed tokens: 26696744960 | elapsed time per iteration (s): 0.55 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 2.735175E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.378 | TFLOPs: 43.99 | +7: iteration 50930/ 115203 | consumed samples: 13038080 | consumed tokens: 26701987840 | elapsed time per iteration (s): 0.56 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 2.734285E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.798 | TFLOPs: 43.74 | +7: iteration 50940/ 115203 | consumed samples: 13040640 | consumed tokens: 26707230720 | elapsed time per iteration (s): 0.56 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 2.725916E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.586 | TFLOPs: 43.72 | +7: iteration 50950/ 115203 | consumed samples: 13043200 | consumed tokens: 26712473600 | elapsed time per iteration (s): 0.55 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 2.729326E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 50960/ 115203 | consumed samples: 13045760 | consumed tokens: 26717716480 | elapsed time per iteration (s): 0.55 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 2.732885E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.608 | TFLOPs: 44.01 | +7: iteration 50970/ 115203 | consumed samples: 13048320 | consumed tokens: 26722959360 | elapsed time per iteration (s): 0.56 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 2.730210E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.082 | TFLOPs: 43.39 | +7: iteration 50980/ 115203 | consumed samples: 13050880 | consumed tokens: 26728202240 | elapsed time per iteration (s): 0.55 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 2.737293E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.456 | TFLOPs: 43.99 | +7: iteration 50990/ 115203 | consumed samples: 13053440 | consumed tokens: 26733445120 | elapsed time per iteration (s): 0.55 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 2.720153E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.457 | TFLOPs: 43.99 | +7: iteration 51000/ 115203 | consumed samples: 13056000 | consumed tokens: 26738688000 | elapsed time per iteration (s): 0.56 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 2.722769E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.352 | TFLOPs: 43.70 | +7: iteration 51010/ 115203 | consumed samples: 13058560 | consumed tokens: 26743930880 | elapsed time per iteration (s): 0.56 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 2.735697E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.543 | TFLOPs: 43.72 | +7: iteration 51020/ 115203 | consumed samples: 13061120 | consumed tokens: 26749173760 | elapsed time per iteration (s): 0.55 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 2.728004E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.385 | TFLOPs: 43.99 | +7: iteration 51030/ 115203 | consumed samples: 13063680 | consumed tokens: 26754416640 | elapsed time per iteration (s): 0.55 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 2.736361E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 51040/ 115203 | consumed samples: 13066240 | consumed tokens: 26759659520 | elapsed time per iteration (s): 0.55 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 2.725051E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.434 | TFLOPs: 43.99 | +7: iteration 51050/ 115203 | consumed samples: 13068800 | consumed tokens: 26764902400 | elapsed time per iteration (s): 0.55 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 2.742806E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.400 | TFLOPs: 43.99 | +7: iteration 51060/ 115203 | consumed samples: 13071360 | consumed tokens: 26770145280 | elapsed time per iteration (s): 0.56 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 2.735498E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.432 | TFLOPs: 43.71 | +7: iteration 51070/ 115203 | consumed samples: 13073920 | consumed tokens: 26775388160 | elapsed time per iteration (s): 0.55 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 2.724962E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.636 | TFLOPs: 44.01 | +7: iteration 51080/ 115203 | consumed samples: 13076480 | consumed tokens: 26780631040 | elapsed time per iteration (s): 0.55 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 2.736141E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.533 | TFLOPs: 44.00 | +7: iteration 51090/ 115203 | consumed samples: 13079040 | consumed tokens: 26785873920 | elapsed time per iteration (s): 0.56 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 2.725174E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.676 | TFLOPs: 43.63 | +7: iteration 51100/ 115203 | consumed samples: 13081600 | consumed tokens: 26791116800 | elapsed time per iteration (s): 0.58 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 2.721532E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.674 | TFLOPs: 42.01 | +7: iteration 51110/ 115203 | consumed samples: 13084160 | consumed tokens: 26796359680 | elapsed time per iteration (s): 0.55 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 2.726222E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.575 | TFLOPs: 44.01 | +7: iteration 51120/ 115203 | consumed samples: 13086720 | consumed tokens: 26801602560 | elapsed time per iteration (s): 0.56 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 2.724976E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.035 | TFLOPs: 43.76 | +7: iteration 51130/ 115203 | consumed samples: 13089280 | consumed tokens: 26806845440 | elapsed time per iteration (s): 0.57 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 2.721243E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.195 | TFLOPs: 43.02 | +7: iteration 51140/ 115203 | consumed samples: 13091840 | consumed tokens: 26812088320 | elapsed time per iteration (s): 0.55 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 2.726071E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.697 | TFLOPs: 44.02 | +7: iteration 51150/ 115203 | consumed samples: 13094400 | consumed tokens: 26817331200 | elapsed time per iteration (s): 0.56 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 2.741119E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.158 | TFLOPs: 43.59 | +7: iteration 51160/ 115203 | consumed samples: 13096960 | consumed tokens: 26822574080 | elapsed time per iteration (s): 0.57 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 2.737049E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.338 | TFLOPs: 42.65 | +7: iteration 51170/ 115203 | consumed samples: 13099520 | consumed tokens: 26827816960 | elapsed time per iteration (s): 0.56 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 2.729160E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.442 | TFLOPs: 43.33 | +7: iteration 51180/ 115203 | consumed samples: 13102080 | consumed tokens: 26833059840 | elapsed time per iteration (s): 0.57 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 2.722229E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.872 | TFLOPs: 43.08 | +7: iteration 51190/ 115203 | consumed samples: 13104640 | consumed tokens: 26838302720 | elapsed time per iteration (s): 0.57 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 2.731091E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.128 | TFLOPs: 43.11 | +7: iteration 51200/ 115203 | consumed samples: 13107200 | consumed tokens: 26843545600 | elapsed time per iteration (s): 0.56 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 2.730781E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 51210/ 115203 | consumed samples: 13109760 | consumed tokens: 26848788480 | elapsed time per iteration (s): 0.55 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 2.723940E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.393 | TFLOPs: 43.99 | +7: iteration 51220/ 115203 | consumed samples: 13112320 | consumed tokens: 26854031360 | elapsed time per iteration (s): 0.56 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 2.746208E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.973 | TFLOPs: 43.47 | +7: iteration 51230/ 115203 | consumed samples: 13114880 | consumed tokens: 26859274240 | elapsed time per iteration (s): 0.57 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 2.726215E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.810 | TFLOPs: 42.88 | +7: iteration 51240/ 115203 | consumed samples: 13117440 | consumed tokens: 26864517120 | elapsed time per iteration (s): 0.57 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 2.728388E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.477 | TFLOPs: 43.04 | +7: iteration 51250/ 115203 | consumed samples: 13120000 | consumed tokens: 26869760000 | elapsed time per iteration (s): 0.57 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 2.729505E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.662 | TFLOPs: 43.16 | +7: iteration 51260/ 115203 | consumed samples: 13122560 | consumed tokens: 26875002880 | elapsed time per iteration (s): 0.56 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 2.729084E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.236 | TFLOPs: 43.97 | +7: iteration 51270/ 115203 | consumed samples: 13125120 | consumed tokens: 26880245760 | elapsed time per iteration (s): 0.55 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 2.729407E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.264 | TFLOPs: 43.98 | +7: iteration 51280/ 115203 | consumed samples: 13127680 | consumed tokens: 26885488640 | elapsed time per iteration (s): 0.55 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 2.737415E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 51290/ 115203 | consumed samples: 13130240 | consumed tokens: 26890731520 | elapsed time per iteration (s): 0.56 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 2.730914E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.047 | TFLOPs: 43.77 | +7: iteration 51300/ 115203 | consumed samples: 13132800 | consumed tokens: 26895974400 | elapsed time per iteration (s): 0.57 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 2.722052E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.915 | TFLOPs: 43.09 | +7: iteration 51310/ 115203 | consumed samples: 13135360 | consumed tokens: 26901217280 | elapsed time per iteration (s): 0.58 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 2.727463E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.481 | TFLOPs: 42.19 | +7: iteration 51320/ 115203 | consumed samples: 13137920 | consumed tokens: 26906460160 | elapsed time per iteration (s): 0.56 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 2.724280E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.439 | TFLOPs: 43.71 | +7: iteration 51330/ 115203 | consumed samples: 13140480 | consumed tokens: 26911703040 | elapsed time per iteration (s): 0.57 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 2.733177E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.714 | TFLOPs: 42.49 | +7: iteration 51340/ 115203 | consumed samples: 13143040 | consumed tokens: 26916945920 | elapsed time per iteration (s): 0.55 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 2.732530E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.543 | TFLOPs: 44.00 | +7: iteration 51350/ 115203 | consumed samples: 13145600 | consumed tokens: 26922188800 | elapsed time per iteration (s): 0.56 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 2.723745E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.460 | TFLOPs: 43.42 | +7: iteration 51360/ 115203 | consumed samples: 13148160 | consumed tokens: 26927431680 | elapsed time per iteration (s): 0.56 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 2.728042E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.223 | TFLOPs: 43.59 | +7: iteration 51370/ 115203 | consumed samples: 13150720 | consumed tokens: 26932674560 | elapsed time per iteration (s): 0.57 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 2.737028E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.502 | TFLOPs: 42.95 | +7: iteration 51380/ 115203 | consumed samples: 13153280 | consumed tokens: 26937917440 | elapsed time per iteration (s): 0.56 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 2.722723E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.414 | TFLOPs: 43.42 | +7: iteration 51390/ 115203 | consumed samples: 13155840 | consumed tokens: 26943160320 | elapsed time per iteration (s): 0.57 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 2.731913E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.262 | TFLOPs: 42.93 | +7: iteration 51400/ 115203 | consumed samples: 13158400 | consumed tokens: 26948403200 | elapsed time per iteration (s): 0.56 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 2.718874E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.745 | TFLOPs: 43.64 | +7: iteration 51410/ 115203 | consumed samples: 13160960 | consumed tokens: 26953646080 | elapsed time per iteration (s): 0.56 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 2.727919E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.961 | TFLOPs: 43.38 | +7: iteration 51420/ 115203 | consumed samples: 13163520 | consumed tokens: 26958888960 | elapsed time per iteration (s): 0.56 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 2.743167E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.212 | TFLOPs: 43.69 | +7: iteration 51430/ 115203 | consumed samples: 13166080 | consumed tokens: 26964131840 | elapsed time per iteration (s): 0.57 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 2.717993E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.954 | TFLOPs: 42.61 | +7: iteration 51440/ 115203 | consumed samples: 13168640 | consumed tokens: 26969374720 | elapsed time per iteration (s): 0.56 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 2.735433E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.822 | TFLOPs: 43.27 | +7: iteration 51450/ 115203 | consumed samples: 13171200 | consumed tokens: 26974617600 | elapsed time per iteration (s): 0.56 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 2.735983E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.300 | TFLOPs: 43.79 | +7: iteration 51460/ 115203 | consumed samples: 13173760 | consumed tokens: 26979860480 | elapsed time per iteration (s): 0.56 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 2.721060E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.124 | TFLOPs: 43.77 | +7: iteration 51470/ 115203 | consumed samples: 13176320 | consumed tokens: 26985103360 | elapsed time per iteration (s): 0.55 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 2.732746E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 51480/ 115203 | consumed samples: 13178880 | consumed tokens: 26990346240 | elapsed time per iteration (s): 0.56 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 2.726115E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.711 | TFLOPs: 43.35 | +7: iteration 51490/ 115203 | consumed samples: 13181440 | consumed tokens: 26995589120 | elapsed time per iteration (s): 0.57 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 2.733276E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.961 | TFLOPs: 42.99 | +7: iteration 51500/ 115203 | consumed samples: 13184000 | consumed tokens: 27000832000 | elapsed time per iteration (s): 0.57 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 2.716865E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.696 | TFLOPs: 42.68 | +7: iteration 51510/ 115203 | consumed samples: 13186560 | consumed tokens: 27006074880 | elapsed time per iteration (s): 0.56 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 2.724809E+00 | grad norm: 0.952 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.090 | TFLOPs: 43.39 | +7: iteration 51520/ 115203 | consumed samples: 13189120 | consumed tokens: 27011317760 | elapsed time per iteration (s): 0.57 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 2.729542E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.815 | TFLOPs: 43.17 | +7: iteration 51530/ 115203 | consumed samples: 13191680 | consumed tokens: 27016560640 | elapsed time per iteration (s): 0.56 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 2.729175E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.831 | TFLOPs: 43.65 | +7: iteration 51540/ 115203 | consumed samples: 13194240 | consumed tokens: 27021803520 | elapsed time per iteration (s): 0.56 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 2.738358E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.864 | TFLOPs: 43.94 | +7: iteration 51550/ 115203 | consumed samples: 13196800 | consumed tokens: 27027046400 | elapsed time per iteration (s): 0.56 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 2.747743E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.724 | TFLOPs: 43.83 | +7: iteration 51560/ 115203 | consumed samples: 13199360 | consumed tokens: 27032289280 | elapsed time per iteration (s): 0.55 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 2.742510E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 51570/ 115203 | consumed samples: 13201920 | consumed tokens: 27037532160 | elapsed time per iteration (s): 0.55 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 2.746905E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 51580/ 115203 | consumed samples: 13204480 | consumed tokens: 27042775040 | elapsed time per iteration (s): 0.56 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 2.744241E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.096 | TFLOPs: 43.29 | +7: iteration 51590/ 115203 | consumed samples: 13207040 | consumed tokens: 27048017920 | elapsed time per iteration (s): 0.56 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 2.739407E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.263 | TFLOPs: 43.40 | +7: iteration 51600/ 115203 | consumed samples: 13209600 | consumed tokens: 27053260800 | elapsed time per iteration (s): 0.55 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 2.714851E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 51610/ 115203 | consumed samples: 13212160 | consumed tokens: 27058503680 | elapsed time per iteration (s): 0.55 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 2.739430E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.419 | TFLOPs: 43.99 | +7: iteration 51620/ 115203 | consumed samples: 13214720 | consumed tokens: 27063746560 | elapsed time per iteration (s): 0.55 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 2.724591E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.533 | TFLOPs: 44.00 | +7: iteration 51630/ 115203 | consumed samples: 13217280 | consumed tokens: 27068989440 | elapsed time per iteration (s): 0.55 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 2.719333E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.437 | TFLOPs: 43.99 | +7: iteration 51640/ 115203 | consumed samples: 13219840 | consumed tokens: 27074232320 | elapsed time per iteration (s): 0.56 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 2.726655E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.321 | TFLOPs: 43.60 | +7: iteration 51650/ 115203 | consumed samples: 13222400 | consumed tokens: 27079475200 | elapsed time per iteration (s): 0.56 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 2.717145E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.265 | TFLOPs: 43.31 | +7: iteration 51660/ 115203 | consumed samples: 13224960 | consumed tokens: 27084718080 | elapsed time per iteration (s): 0.56 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 2.729588E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.069 | TFLOPs: 43.67 | +7: iteration 51670/ 115203 | consumed samples: 13227520 | consumed tokens: 27089960960 | elapsed time per iteration (s): 0.55 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 2.726427E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.371 | TFLOPs: 43.99 | +7: iteration 51680/ 115203 | consumed samples: 13230080 | consumed tokens: 27095203840 | elapsed time per iteration (s): 0.55 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 2.717545E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.482 | TFLOPs: 44.00 | +7: iteration 51690/ 115203 | consumed samples: 13232640 | consumed tokens: 27100446720 | elapsed time per iteration (s): 0.55 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 2.724540E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.460 | TFLOPs: 44.00 | +7: iteration 51700/ 115203 | consumed samples: 13235200 | consumed tokens: 27105689600 | elapsed time per iteration (s): 0.55 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 2.723869E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.589 | TFLOPs: 44.01 | +7: iteration 51710/ 115203 | consumed samples: 13237760 | consumed tokens: 27110932480 | elapsed time per iteration (s): 0.55 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 2.722125E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.455 | TFLOPs: 43.99 | +7: iteration 51720/ 115203 | consumed samples: 13240320 | consumed tokens: 27116175360 | elapsed time per iteration (s): 0.55 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 2.731776E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.513 | TFLOPs: 44.00 | +7: iteration 51730/ 115203 | consumed samples: 13242880 | consumed tokens: 27121418240 | elapsed time per iteration (s): 0.57 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 2.737419E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.593 | TFLOPs: 43.05 | +7: iteration 51740/ 115203 | consumed samples: 13245440 | consumed tokens: 27126661120 | elapsed time per iteration (s): 0.56 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 2.720792E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.511 | TFLOPs: 43.52 | +7: iteration 51750/ 115203 | consumed samples: 13248000 | consumed tokens: 27131904000 | elapsed time per iteration (s): 0.55 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 2.731968E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.463 | TFLOPs: 44.00 | +7: iteration 51760/ 115203 | consumed samples: 13250560 | consumed tokens: 27137146880 | elapsed time per iteration (s): 0.55 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 2.741926E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.431 | TFLOPs: 43.99 | +7: iteration 51770/ 115203 | consumed samples: 13253120 | consumed tokens: 27142389760 | elapsed time per iteration (s): 0.56 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 2.738369E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.104 | TFLOPs: 43.20 | +7: iteration 51780/ 115203 | consumed samples: 13255680 | consumed tokens: 27147632640 | elapsed time per iteration (s): 0.55 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 2.739313E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 51790/ 115203 | consumed samples: 13258240 | consumed tokens: 27152875520 | elapsed time per iteration (s): 0.55 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 2.741512E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 51800/ 115203 | consumed samples: 13260800 | consumed tokens: 27158118400 | elapsed time per iteration (s): 0.55 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 2.737075E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.563 | TFLOPs: 44.01 | +7: iteration 51810/ 115203 | consumed samples: 13263360 | consumed tokens: 27163361280 | elapsed time per iteration (s): 0.55 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 2.727992E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.424 | TFLOPs: 43.99 | +7: iteration 51820/ 115203 | consumed samples: 13265920 | consumed tokens: 27168604160 | elapsed time per iteration (s): 0.56 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 2.731736E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.797 | TFLOPs: 43.74 | +7: iteration 51830/ 115203 | consumed samples: 13268480 | consumed tokens: 27173847040 | elapsed time per iteration (s): 0.55 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 2.719351E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.326 | TFLOPs: 43.98 | +7: iteration 51840/ 115203 | consumed samples: 13271040 | consumed tokens: 27179089920 | elapsed time per iteration (s): 0.55 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 2.725104E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 51850/ 115203 | consumed samples: 13273600 | consumed tokens: 27184332800 | elapsed time per iteration (s): 0.55 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 2.735904E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.746 | TFLOPs: 44.02 | +7: iteration 51860/ 115203 | consumed samples: 13276160 | consumed tokens: 27189575680 | elapsed time per iteration (s): 0.57 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 2.727971E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.848 | TFLOPs: 42.98 | +7: iteration 51870/ 115203 | consumed samples: 13278720 | consumed tokens: 27194818560 | elapsed time per iteration (s): 0.55 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 2.736690E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.585 | TFLOPs: 44.01 | +7: iteration 51880/ 115203 | consumed samples: 13281280 | consumed tokens: 27200061440 | elapsed time per iteration (s): 0.56 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 2.724295E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.462 | TFLOPs: 43.90 | +7: iteration 51890/ 115203 | consumed samples: 13283840 | consumed tokens: 27205304320 | elapsed time per iteration (s): 0.56 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 2.734468E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.456 | TFLOPs: 43.90 | +7: iteration 51900/ 115203 | consumed samples: 13286400 | consumed tokens: 27210547200 | elapsed time per iteration (s): 0.56 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 2.728710E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.214 | TFLOPs: 43.97 | +7: iteration 51910/ 115203 | consumed samples: 13288960 | consumed tokens: 27215790080 | elapsed time per iteration (s): 0.56 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 2.729686E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.032 | TFLOPs: 43.48 | +7: iteration 51920/ 115203 | consumed samples: 13291520 | consumed tokens: 27221032960 | elapsed time per iteration (s): 0.55 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 2.723178E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.678 | TFLOPs: 44.02 | +7: iteration 51930/ 115203 | consumed samples: 13294080 | consumed tokens: 27226275840 | elapsed time per iteration (s): 0.56 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 2.724652E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.662 | TFLOPs: 43.73 | +7: iteration 51940/ 115203 | consumed samples: 13296640 | consumed tokens: 27231518720 | elapsed time per iteration (s): 0.55 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 2.723213E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.682 | TFLOPs: 44.02 | +7: iteration 51950/ 115203 | consumed samples: 13299200 | consumed tokens: 27236761600 | elapsed time per iteration (s): 0.55 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 2.729600E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.714 | TFLOPs: 44.02 | +7: iteration 51960/ 115203 | consumed samples: 13301760 | consumed tokens: 27242004480 | elapsed time per iteration (s): 0.56 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 2.730535E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.500 | TFLOPs: 43.33 | +7: iteration 51970/ 115203 | consumed samples: 13304320 | consumed tokens: 27247247360 | elapsed time per iteration (s): 0.56 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 2.726621E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.576 | TFLOPs: 43.43 | +7: iteration 51980/ 115203 | consumed samples: 13306880 | consumed tokens: 27252490240 | elapsed time per iteration (s): 0.56 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 2.724140E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.810 | TFLOPs: 43.93 | +7: iteration 51990/ 115203 | consumed samples: 13309440 | consumed tokens: 27257733120 | elapsed time per iteration (s): 0.56 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 2.731001E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.977 | TFLOPs: 43.57 | +0: [2023-03-16 20:59:23,810] [INFO] [logging.py:68:log_dist] [Rank 0] step=52000, skipped=0, lr=[0.00012524180298737348, 0.00012524180298737348, 0.00012524180298737348], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 52000/ 115203 | consumed samples: 13312000 | consumed tokens: 27262976000 | elapsed time per iteration (s): 0.56 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 2.735894E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.271 | TFLOPs: 43.88 | +0: steps: 52000 loss: 2.7089 iter time (s): 0.557 samples/sec: 459.894 +7: iteration 52010/ 115203 | consumed samples: 13314560 | consumed tokens: 27268218880 | elapsed time per iteration (s): 0.56 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 2.719217E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.329 | TFLOPs: 43.60 | +7: iteration 52020/ 115203 | consumed samples: 13317120 | consumed tokens: 27273461760 | elapsed time per iteration (s): 0.56 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 2.744497E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.946 | TFLOPs: 43.56 | +7: iteration 52030/ 115203 | consumed samples: 13319680 | consumed tokens: 27278704640 | elapsed time per iteration (s): 0.56 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 2.719137E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.824 | TFLOPs: 43.55 | +7: iteration 52040/ 115203 | consumed samples: 13322240 | consumed tokens: 27283947520 | elapsed time per iteration (s): 0.56 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 2.717912E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.973 | TFLOPs: 43.28 | +7: iteration 52050/ 115203 | consumed samples: 13324800 | consumed tokens: 27289190400 | elapsed time per iteration (s): 0.56 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 2.727179E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.913 | TFLOPs: 43.47 | +7: iteration 52060/ 115203 | consumed samples: 13327360 | consumed tokens: 27294433280 | elapsed time per iteration (s): 0.55 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 2.724895E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.540 | TFLOPs: 44.00 | +7: iteration 52070/ 115203 | consumed samples: 13329920 | consumed tokens: 27299676160 | elapsed time per iteration (s): 0.55 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 2.713303E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.549 | TFLOPs: 44.00 | +7: iteration 52080/ 115203 | consumed samples: 13332480 | consumed tokens: 27304919040 | elapsed time per iteration (s): 0.56 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 2.721211E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.207 | TFLOPs: 43.97 | +7: iteration 52090/ 115203 | consumed samples: 13335040 | consumed tokens: 27310161920 | elapsed time per iteration (s): 0.55 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 2.732151E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.343 | TFLOPs: 43.98 | +7: iteration 52100/ 115203 | consumed samples: 13337600 | consumed tokens: 27315404800 | elapsed time per iteration (s): 0.55 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 2.729861E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.514 | TFLOPs: 44.00 | +7: iteration 52110/ 115203 | consumed samples: 13340160 | consumed tokens: 27320647680 | elapsed time per iteration (s): 0.56 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 2.725156E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.725 | TFLOPs: 43.64 | +7: iteration 52120/ 115203 | consumed samples: 13342720 | consumed tokens: 27325890560 | elapsed time per iteration (s): 0.56 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 2.714663E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.424 | TFLOPs: 43.52 | +7: iteration 52130/ 115203 | consumed samples: 13345280 | consumed tokens: 27331133440 | elapsed time per iteration (s): 0.56 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 2.719879E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.238 | TFLOPs: 43.40 | +7: iteration 52140/ 115203 | consumed samples: 13347840 | consumed tokens: 27336376320 | elapsed time per iteration (s): 0.55 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 2.737759E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.381 | TFLOPs: 43.99 | +7: iteration 52150/ 115203 | consumed samples: 13350400 | consumed tokens: 27341619200 | elapsed time per iteration (s): 0.55 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 2.719407E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.604 | TFLOPs: 44.01 | +7: iteration 52160/ 115203 | consumed samples: 13352960 | consumed tokens: 27346862080 | elapsed time per iteration (s): 0.56 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 2.718534E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.138 | TFLOPs: 43.20 | +7: iteration 52170/ 115203 | consumed samples: 13355520 | consumed tokens: 27352104960 | elapsed time per iteration (s): 0.56 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 2.728112E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.020 | TFLOPs: 43.86 | +7: iteration 52180/ 115203 | consumed samples: 13358080 | consumed tokens: 27357347840 | elapsed time per iteration (s): 0.55 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 2.731743E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.490 | TFLOPs: 44.00 | +7: iteration 52190/ 115203 | consumed samples: 13360640 | consumed tokens: 27362590720 | elapsed time per iteration (s): 0.57 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 2.728085E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.330 | TFLOPs: 42.74 | +7: iteration 52200/ 115203 | consumed samples: 13363200 | consumed tokens: 27367833600 | elapsed time per iteration (s): 0.56 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 2.744932E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.486 | TFLOPs: 43.52 | +7: iteration 52210/ 115203 | consumed samples: 13365760 | consumed tokens: 27373076480 | elapsed time per iteration (s): 0.55 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 2.742059E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 52220/ 115203 | consumed samples: 13368320 | consumed tokens: 27378319360 | elapsed time per iteration (s): 0.56 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 2.711262E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.879 | TFLOPs: 43.84 | +7: iteration 52230/ 115203 | consumed samples: 13370880 | consumed tokens: 27383562240 | elapsed time per iteration (s): 0.56 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 2.717105E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 52240/ 115203 | consumed samples: 13373440 | consumed tokens: 27388805120 | elapsed time per iteration (s): 0.56 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 2.740290E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.470 | TFLOPs: 43.71 | +7: iteration 52250/ 115203 | consumed samples: 13376000 | consumed tokens: 27394048000 | elapsed time per iteration (s): 0.56 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 2.726623E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.472 | TFLOPs: 43.42 | +7: iteration 52260/ 115203 | consumed samples: 13378560 | consumed tokens: 27399290880 | elapsed time per iteration (s): 0.55 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 2.710970E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.362 | TFLOPs: 43.99 | +7: iteration 52270/ 115203 | consumed samples: 13381120 | consumed tokens: 27404533760 | elapsed time per iteration (s): 0.55 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 2.722219E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 52280/ 115203 | consumed samples: 13383680 | consumed tokens: 27409776640 | elapsed time per iteration (s): 0.56 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 2.703583E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.016 | TFLOPs: 43.86 | +7: iteration 52290/ 115203 | consumed samples: 13386240 | consumed tokens: 27415019520 | elapsed time per iteration (s): 0.56 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 2.730078E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.963 | TFLOPs: 43.38 | +7: iteration 52300/ 115203 | consumed samples: 13388800 | consumed tokens: 27420262400 | elapsed time per iteration (s): 0.55 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 2.724190E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.321 | TFLOPs: 43.98 | +7: iteration 52310/ 115203 | consumed samples: 13391360 | consumed tokens: 27425505280 | elapsed time per iteration (s): 0.57 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 2.726485E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.958 | TFLOPs: 42.99 | +7: iteration 52320/ 115203 | consumed samples: 13393920 | consumed tokens: 27430748160 | elapsed time per iteration (s): 0.57 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 2.722301E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.291 | TFLOPs: 42.74 | +7: iteration 52330/ 115203 | consumed samples: 13396480 | consumed tokens: 27435991040 | elapsed time per iteration (s): 0.56 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 2.711490E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.377 | TFLOPs: 43.70 | +7: iteration 52340/ 115203 | consumed samples: 13399040 | consumed tokens: 27441233920 | elapsed time per iteration (s): 0.56 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 2.720771E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.517 | TFLOPs: 43.71 | +7: iteration 52350/ 115203 | consumed samples: 13401600 | consumed tokens: 27446476800 | elapsed time per iteration (s): 0.56 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 2.743318E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.034 | TFLOPs: 43.95 | +7: iteration 52360/ 115203 | consumed samples: 13404160 | consumed tokens: 27451719680 | elapsed time per iteration (s): 0.56 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 2.717671E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.282 | TFLOPs: 43.31 | +7: iteration 52370/ 115203 | consumed samples: 13406720 | consumed tokens: 27456962560 | elapsed time per iteration (s): 0.56 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 2.732277E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.930 | TFLOPs: 43.66 | +7: iteration 52380/ 115203 | consumed samples: 13409280 | consumed tokens: 27462205440 | elapsed time per iteration (s): 0.56 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 2.728589E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.384 | TFLOPs: 43.70 | +7: iteration 52390/ 115203 | consumed samples: 13411840 | consumed tokens: 27467448320 | elapsed time per iteration (s): 0.56 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 2.720777E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.684 | TFLOPs: 43.92 | +7: iteration 52400/ 115203 | consumed samples: 13414400 | consumed tokens: 27472691200 | elapsed time per iteration (s): 0.56 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 2.713471E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.035 | TFLOPs: 43.67 | +7: iteration 52410/ 115203 | consumed samples: 13416960 | consumed tokens: 27477934080 | elapsed time per iteration (s): 0.56 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 2.720418E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.949 | TFLOPs: 43.66 | +7: iteration 52420/ 115203 | consumed samples: 13419520 | consumed tokens: 27483176960 | elapsed time per iteration (s): 0.57 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 2.708911E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.654 | TFLOPs: 43.06 | +7: iteration 52430/ 115203 | consumed samples: 13422080 | consumed tokens: 27488419840 | elapsed time per iteration (s): 0.56 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 2.719044E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.405 | TFLOPs: 43.70 | +7: iteration 52440/ 115203 | consumed samples: 13424640 | consumed tokens: 27493662720 | elapsed time per iteration (s): 0.56 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 2.733159E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.474 | TFLOPs: 43.42 | +7: iteration 52450/ 115203 | consumed samples: 13427200 | consumed tokens: 27498905600 | elapsed time per iteration (s): 0.56 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 2.738307E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.941 | TFLOPs: 43.95 | +7: iteration 52460/ 115203 | consumed samples: 13429760 | consumed tokens: 27504148480 | elapsed time per iteration (s): 0.56 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 2.727927E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.217 | TFLOPs: 43.59 | +7: iteration 52470/ 115203 | consumed samples: 13432320 | consumed tokens: 27509391360 | elapsed time per iteration (s): 0.56 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 2.719955E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.036 | TFLOPs: 43.38 | +7: iteration 52480/ 115203 | consumed samples: 13434880 | consumed tokens: 27514634240 | elapsed time per iteration (s): 0.57 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 2.716631E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.483 | TFLOPs: 43.04 | +7: iteration 52490/ 115203 | consumed samples: 13437440 | consumed tokens: 27519877120 | elapsed time per iteration (s): 0.56 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 2.716820E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.979 | TFLOPs: 43.95 | +7: iteration 52500/ 115203 | consumed samples: 13440000 | consumed tokens: 27525120000 | elapsed time per iteration (s): 0.56 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 2.724979E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.205 | TFLOPs: 43.97 | +7: iteration 52510/ 115203 | consumed samples: 13442560 | consumed tokens: 27530362880 | elapsed time per iteration (s): 0.56 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 2.733748E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.629 | TFLOPs: 43.25 | +7: iteration 52520/ 115203 | consumed samples: 13445120 | consumed tokens: 27535605760 | elapsed time per iteration (s): 0.56 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 2.725140E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.517 | TFLOPs: 43.43 | +7: iteration 52530/ 115203 | consumed samples: 13447680 | consumed tokens: 27540848640 | elapsed time per iteration (s): 0.57 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 2.736805E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.386 | TFLOPs: 42.84 | +7: iteration 52540/ 115203 | consumed samples: 13450240 | consumed tokens: 27546091520 | elapsed time per iteration (s): 0.56 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 2.727426E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.453 | TFLOPs: 43.52 | +7: iteration 52550/ 115203 | consumed samples: 13452800 | consumed tokens: 27551334400 | elapsed time per iteration (s): 0.56 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 2.717972E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.444 | TFLOPs: 43.52 | +7: iteration 52560/ 115203 | consumed samples: 13455360 | consumed tokens: 27556577280 | elapsed time per iteration (s): 0.57 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 2.732449E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.862 | TFLOPs: 42.98 | +7: iteration 52570/ 115203 | consumed samples: 13457920 | consumed tokens: 27561820160 | elapsed time per iteration (s): 0.56 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 2.724280E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.393 | TFLOPs: 43.61 | +7: iteration 52580/ 115203 | consumed samples: 13460480 | consumed tokens: 27567063040 | elapsed time per iteration (s): 0.59 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 2.725286E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.790 | TFLOPs: 41.55 | +7: iteration 52590/ 115203 | consumed samples: 13463040 | consumed tokens: 27572305920 | elapsed time per iteration (s): 0.56 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 2.721634E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.634 | TFLOPs: 43.63 | +7: iteration 52600/ 115203 | consumed samples: 13465600 | consumed tokens: 27577548800 | elapsed time per iteration (s): 0.56 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 2.709142E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.806 | TFLOPs: 43.27 | +7: iteration 52610/ 115203 | consumed samples: 13468160 | consumed tokens: 27582791680 | elapsed time per iteration (s): 0.56 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 2.730050E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 52620/ 115203 | consumed samples: 13470720 | consumed tokens: 27588034560 | elapsed time per iteration (s): 0.57 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 2.733033E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.916 | TFLOPs: 43.09 | +7: iteration 52630/ 115203 | consumed samples: 13473280 | consumed tokens: 27593277440 | elapsed time per iteration (s): 0.56 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 2.734299E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.347 | TFLOPs: 43.41 | +7: iteration 52640/ 115203 | consumed samples: 13475840 | consumed tokens: 27598520320 | elapsed time per iteration (s): 0.56 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 2.736202E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.170 | TFLOPs: 43.68 | +7: iteration 52650/ 115203 | consumed samples: 13478400 | consumed tokens: 27603763200 | elapsed time per iteration (s): 0.58 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 2.728150E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.511 | TFLOPs: 42.38 | +7: iteration 52660/ 115203 | consumed samples: 13480960 | consumed tokens: 27609006080 | elapsed time per iteration (s): 0.56 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 2.718803E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.797 | TFLOPs: 43.26 | +7: iteration 52670/ 115203 | consumed samples: 13483520 | consumed tokens: 27614248960 | elapsed time per iteration (s): 0.56 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 2.712520E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.729 | TFLOPs: 43.45 | +7: iteration 52680/ 115203 | consumed samples: 13486080 | consumed tokens: 27619491840 | elapsed time per iteration (s): 0.56 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 2.717749E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.853 | TFLOPs: 43.56 | +7: iteration 52690/ 115203 | consumed samples: 13488640 | consumed tokens: 27624734720 | elapsed time per iteration (s): 0.56 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 2.727954E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.015 | TFLOPs: 43.48 | +7: iteration 52700/ 115203 | consumed samples: 13491200 | consumed tokens: 27629977600 | elapsed time per iteration (s): 0.57 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 2.723684E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.226 | TFLOPs: 42.64 | +7: iteration 52710/ 115203 | consumed samples: 13493760 | consumed tokens: 27635220480 | elapsed time per iteration (s): 0.56 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 2.717275E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.950 | TFLOPs: 43.28 | +7: iteration 52720/ 115203 | consumed samples: 13496320 | consumed tokens: 27640463360 | elapsed time per iteration (s): 0.57 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 2.721056E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.000 | TFLOPs: 42.81 | +7: iteration 52730/ 115203 | consumed samples: 13498880 | consumed tokens: 27645706240 | elapsed time per iteration (s): 0.56 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 2.729592E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.849 | TFLOPs: 43.46 | +7: iteration 52740/ 115203 | consumed samples: 13501440 | consumed tokens: 27650949120 | elapsed time per iteration (s): 0.58 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 2.719174E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.701 | TFLOPs: 42.21 | +7: iteration 52750/ 115203 | consumed samples: 13504000 | consumed tokens: 27656192000 | elapsed time per iteration (s): 0.56 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 2.719247E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.178 | TFLOPs: 43.49 | +7: iteration 52760/ 115203 | consumed samples: 13506560 | consumed tokens: 27661434880 | elapsed time per iteration (s): 0.58 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 2.720716E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.375 | TFLOPs: 42.18 | +7: iteration 52770/ 115203 | consumed samples: 13509120 | consumed tokens: 27666677760 | elapsed time per iteration (s): 0.58 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 2.721820E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.010 | TFLOPs: 42.43 | +7: iteration 52780/ 115203 | consumed samples: 13511680 | consumed tokens: 27671920640 | elapsed time per iteration (s): 0.58 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 2.715035E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.288 | TFLOPs: 42.07 | +7: iteration 52790/ 115203 | consumed samples: 13514240 | consumed tokens: 27677163520 | elapsed time per iteration (s): 0.57 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 2.719951E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.627 | TFLOPs: 43.15 | +7: iteration 52800/ 115203 | consumed samples: 13516800 | consumed tokens: 27682406400 | elapsed time per iteration (s): 0.58 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 2.716825E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.339 | TFLOPs: 42.27 | +7: iteration 52810/ 115203 | consumed samples: 13519360 | consumed tokens: 27687649280 | elapsed time per iteration (s): 0.58 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 2.721674E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.563 | TFLOPs: 42.29 | +7: iteration 52820/ 115203 | consumed samples: 13521920 | consumed tokens: 27692892160 | elapsed time per iteration (s): 0.57 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 2.733765E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.269 | TFLOPs: 43.02 | +7: iteration 52830/ 115203 | consumed samples: 13524480 | consumed tokens: 27698135040 | elapsed time per iteration (s): 0.61 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 2.709273E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 418.095 | TFLOPs: 39.86 | +7: iteration 52840/ 115203 | consumed samples: 13527040 | consumed tokens: 27703377920 | elapsed time per iteration (s): 0.59 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 2.723012E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.234 | TFLOPs: 41.69 | +7: iteration 52850/ 115203 | consumed samples: 13529600 | consumed tokens: 27708620800 | elapsed time per iteration (s): 0.57 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 2.744177E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.653 | TFLOPs: 43.16 | +7: iteration 52860/ 115203 | consumed samples: 13532160 | consumed tokens: 27713863680 | elapsed time per iteration (s): 0.57 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 2.737321E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.212 | TFLOPs: 43.02 | +7: iteration 52870/ 115203 | consumed samples: 13534720 | consumed tokens: 27719106560 | elapsed time per iteration (s): 0.58 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 2.722463E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.991 | TFLOPs: 42.43 | +7: iteration 52880/ 115203 | consumed samples: 13537280 | consumed tokens: 27724349440 | elapsed time per iteration (s): 0.56 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 2.718430E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.683 | TFLOPs: 43.64 | +7: iteration 52890/ 115203 | consumed samples: 13539840 | consumed tokens: 27729592320 | elapsed time per iteration (s): 0.57 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 2.731729E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.273 | TFLOPs: 42.93 | +7: iteration 52900/ 115203 | consumed samples: 13542400 | consumed tokens: 27734835200 | elapsed time per iteration (s): 0.56 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 2.710105E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.032 | TFLOPs: 43.48 | +7: iteration 52910/ 115203 | consumed samples: 13544960 | consumed tokens: 27740078080 | elapsed time per iteration (s): 0.56 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 2.727518E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.373 | TFLOPs: 43.51 | +7: iteration 52920/ 115203 | consumed samples: 13547520 | consumed tokens: 27745320960 | elapsed time per iteration (s): 0.56 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 2.712479E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.832 | TFLOPs: 43.46 | +7: iteration 52930/ 115203 | consumed samples: 13550080 | consumed tokens: 27750563840 | elapsed time per iteration (s): 0.56 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 2.733904E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.501 | TFLOPs: 43.52 | +7: iteration 52940/ 115203 | consumed samples: 13552640 | consumed tokens: 27755806720 | elapsed time per iteration (s): 0.56 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 2.722368E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.401 | TFLOPs: 43.61 | +7: iteration 52950/ 115203 | consumed samples: 13555200 | consumed tokens: 27761049600 | elapsed time per iteration (s): 0.58 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 2.731219E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.280 | TFLOPs: 42.07 | +7: iteration 52960/ 115203 | consumed samples: 13557760 | consumed tokens: 27766292480 | elapsed time per iteration (s): 0.56 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 2.732804E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.238 | TFLOPs: 43.59 | +7: iteration 52970/ 115203 | consumed samples: 13560320 | consumed tokens: 27771535360 | elapsed time per iteration (s): 0.56 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 2.718660E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.122 | TFLOPs: 43.30 | +7: iteration 52980/ 115203 | consumed samples: 13562880 | consumed tokens: 27776778240 | elapsed time per iteration (s): 0.57 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 2.723717E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.090 | TFLOPs: 42.91 | +7: iteration 52990/ 115203 | consumed samples: 13565440 | consumed tokens: 27782021120 | elapsed time per iteration (s): 0.56 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 2.730627E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.656 | TFLOPs: 43.73 | +7: iteration 53000/ 115203 | consumed samples: 13568000 | consumed tokens: 27787264000 | elapsed time per iteration (s): 0.59 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 2.726663E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.578 | TFLOPs: 41.62 | +7: iteration 53010/ 115203 | consumed samples: 13570560 | consumed tokens: 27792506880 | elapsed time per iteration (s): 0.60 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 2.736832E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.736 | TFLOPs: 40.88 | +7: iteration 53020/ 115203 | consumed samples: 13573120 | consumed tokens: 27797749760 | elapsed time per iteration (s): 0.57 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 2.726431E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.543 | TFLOPs: 42.95 | +7: iteration 53030/ 115203 | consumed samples: 13575680 | consumed tokens: 27802992640 | elapsed time per iteration (s): 0.57 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 2.733220E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.935 | TFLOPs: 42.99 | +7: iteration 53040/ 115203 | consumed samples: 13578240 | consumed tokens: 27808235520 | elapsed time per iteration (s): 0.56 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 2.723684E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.143 | TFLOPs: 43.30 | +7: iteration 53050/ 115203 | consumed samples: 13580800 | consumed tokens: 27813478400 | elapsed time per iteration (s): 0.57 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 2.719748E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.008 | TFLOPs: 42.62 | +7: iteration 53060/ 115203 | consumed samples: 13583360 | consumed tokens: 27818721280 | elapsed time per iteration (s): 0.57 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 2.715342E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.187 | TFLOPs: 42.54 | +7: iteration 53070/ 115203 | consumed samples: 13585920 | consumed tokens: 27823964160 | elapsed time per iteration (s): 0.56 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 2.720273E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.735 | TFLOPs: 43.64 | +7: iteration 53080/ 115203 | consumed samples: 13588480 | consumed tokens: 27829207040 | elapsed time per iteration (s): 0.57 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 2.715991E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.150 | TFLOPs: 42.63 | +7: iteration 53090/ 115203 | consumed samples: 13591040 | consumed tokens: 27834449920 | elapsed time per iteration (s): 0.56 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 2.723682E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.177 | TFLOPs: 43.49 | +7: iteration 53100/ 115203 | consumed samples: 13593600 | consumed tokens: 27839692800 | elapsed time per iteration (s): 0.57 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 2.710547E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.779 | TFLOPs: 43.07 | +7: iteration 53110/ 115203 | consumed samples: 13596160 | consumed tokens: 27844935680 | elapsed time per iteration (s): 0.56 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 2.728125E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.006 | TFLOPs: 43.57 | +7: iteration 53120/ 115203 | consumed samples: 13598720 | consumed tokens: 27850178560 | elapsed time per iteration (s): 0.56 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 2.712385E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.880 | TFLOPs: 43.65 | +7: iteration 53130/ 115203 | consumed samples: 13601280 | consumed tokens: 27855421440 | elapsed time per iteration (s): 0.56 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 2.731082E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.339 | TFLOPs: 43.32 | +7: iteration 53140/ 115203 | consumed samples: 13603840 | consumed tokens: 27860664320 | elapsed time per iteration (s): 0.55 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 2.728719E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.445 | TFLOPs: 43.99 | +7: iteration 53150/ 115203 | consumed samples: 13606400 | consumed tokens: 27865907200 | elapsed time per iteration (s): 0.56 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 2.724208E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.086 | TFLOPs: 43.86 | +7: iteration 53160/ 115203 | consumed samples: 13608960 | consumed tokens: 27871150080 | elapsed time per iteration (s): 0.55 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 2.718526E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.398 | TFLOPs: 43.99 | +7: iteration 53170/ 115203 | consumed samples: 13611520 | consumed tokens: 27876392960 | elapsed time per iteration (s): 0.55 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 2.729965E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.483 | TFLOPs: 44.00 | +7: iteration 53180/ 115203 | consumed samples: 13614080 | consumed tokens: 27881635840 | elapsed time per iteration (s): 0.56 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 2.721373E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.353 | TFLOPs: 43.41 | +7: iteration 53190/ 115203 | consumed samples: 13616640 | consumed tokens: 27886878720 | elapsed time per iteration (s): 0.56 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 2.725324E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.495 | TFLOPs: 43.43 | +7: iteration 53200/ 115203 | consumed samples: 13619200 | consumed tokens: 27892121600 | elapsed time per iteration (s): 0.58 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 2.718861E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.201 | TFLOPs: 42.45 | +7: iteration 53210/ 115203 | consumed samples: 13621760 | consumed tokens: 27897364480 | elapsed time per iteration (s): 0.56 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 2.723384E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.846 | TFLOPs: 43.27 | +7: iteration 53220/ 115203 | consumed samples: 13624320 | consumed tokens: 27902607360 | elapsed time per iteration (s): 0.57 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 2.716813E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.130 | TFLOPs: 42.91 | +7: iteration 53230/ 115203 | consumed samples: 13626880 | consumed tokens: 27907850240 | elapsed time per iteration (s): 0.56 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 2.722981E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.476 | TFLOPs: 43.81 | +7: iteration 53240/ 115203 | consumed samples: 13629440 | consumed tokens: 27913093120 | elapsed time per iteration (s): 0.56 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 2.724205E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.512 | TFLOPs: 43.62 | +7: iteration 53250/ 115203 | consumed samples: 13632000 | consumed tokens: 27918336000 | elapsed time per iteration (s): 0.56 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 2.732089E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.854 | TFLOPs: 43.46 | +7: iteration 53260/ 115203 | consumed samples: 13634560 | consumed tokens: 27923578880 | elapsed time per iteration (s): 0.57 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 2.720227E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.271 | TFLOPs: 43.12 | +7: iteration 53270/ 115203 | consumed samples: 13637120 | consumed tokens: 27928821760 | elapsed time per iteration (s): 0.56 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 2.719302E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.895 | TFLOPs: 43.46 | +7: iteration 53280/ 115203 | consumed samples: 13639680 | consumed tokens: 27934064640 | elapsed time per iteration (s): 0.56 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 2.736603E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.256 | TFLOPs: 43.31 | +7: iteration 53290/ 115203 | consumed samples: 13642240 | consumed tokens: 27939307520 | elapsed time per iteration (s): 0.57 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 2.719416E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.999 | TFLOPs: 43.00 | +7: iteration 53300/ 115203 | consumed samples: 13644800 | consumed tokens: 27944550400 | elapsed time per iteration (s): 0.59 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 2.724383E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.953 | TFLOPs: 41.56 | +7: iteration 53310/ 115203 | consumed samples: 13647360 | consumed tokens: 27949793280 | elapsed time per iteration (s): 0.59 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 2.720779E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.713 | TFLOPs: 41.35 | +7: iteration 53320/ 115203 | consumed samples: 13649920 | consumed tokens: 27955036160 | elapsed time per iteration (s): 0.57 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 2.739222E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.283 | TFLOPs: 42.93 | +7: iteration 53330/ 115203 | consumed samples: 13652480 | consumed tokens: 27960279040 | elapsed time per iteration (s): 0.56 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 2.713966E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.306 | TFLOPs: 43.31 | +7: iteration 53340/ 115203 | consumed samples: 13655040 | consumed tokens: 27965521920 | elapsed time per iteration (s): 0.57 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 2.725867E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.213 | TFLOPs: 43.11 | +7: iteration 53350/ 115203 | consumed samples: 13657600 | consumed tokens: 27970764800 | elapsed time per iteration (s): 0.57 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 2.713033E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.232 | TFLOPs: 42.73 | +7: iteration 53360/ 115203 | consumed samples: 13660160 | consumed tokens: 27976007680 | elapsed time per iteration (s): 0.58 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 2.715393E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.751 | TFLOPs: 42.40 | +7: iteration 53370/ 115203 | consumed samples: 13662720 | consumed tokens: 27981250560 | elapsed time per iteration (s): 0.57 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 2.729608E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.473 | TFLOPs: 42.76 | +7: iteration 53380/ 115203 | consumed samples: 13665280 | consumed tokens: 27986493440 | elapsed time per iteration (s): 0.57 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 2.726691E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.861 | TFLOPs: 42.70 | +7: iteration 53390/ 115203 | consumed samples: 13667840 | consumed tokens: 27991736320 | elapsed time per iteration (s): 0.58 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 2.731378E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.502 | TFLOPs: 41.90 | +7: iteration 53400/ 115203 | consumed samples: 13670400 | consumed tokens: 27996979200 | elapsed time per iteration (s): 0.57 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 2.720663E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.443 | TFLOPs: 42.94 | +7: iteration 53410/ 115203 | consumed samples: 13672960 | consumed tokens: 28002222080 | elapsed time per iteration (s): 0.55 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 2.722000E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.536 | TFLOPs: 44.00 | +7: iteration 53420/ 115203 | consumed samples: 13675520 | consumed tokens: 28007464960 | elapsed time per iteration (s): 0.57 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 2.714642E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.540 | TFLOPs: 43.14 | +7: iteration 53430/ 115203 | consumed samples: 13678080 | consumed tokens: 28012707840 | elapsed time per iteration (s): 0.57 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 2.706318E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.830 | TFLOPs: 43.17 | +7: iteration 53440/ 115203 | consumed samples: 13680640 | consumed tokens: 28017950720 | elapsed time per iteration (s): 0.56 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 2.712107E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.942 | TFLOPs: 43.28 | +7: iteration 53450/ 115203 | consumed samples: 13683200 | consumed tokens: 28023193600 | elapsed time per iteration (s): 0.57 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 2.711348E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.880 | TFLOPs: 43.08 | +7: iteration 53460/ 115203 | consumed samples: 13685760 | consumed tokens: 28028436480 | elapsed time per iteration (s): 0.58 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 2.713150E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.732 | TFLOPs: 42.02 | +7: iteration 53470/ 115203 | consumed samples: 13688320 | consumed tokens: 28033679360 | elapsed time per iteration (s): 0.57 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 2.724669E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.794 | TFLOPs: 42.50 | +7: iteration 53480/ 115203 | consumed samples: 13690880 | consumed tokens: 28038922240 | elapsed time per iteration (s): 0.57 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 2.720187E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.063 | TFLOPs: 42.53 | +7: iteration 53490/ 115203 | consumed samples: 13693440 | consumed tokens: 28044165120 | elapsed time per iteration (s): 0.56 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 2.713903E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.699 | TFLOPs: 43.54 | +7: iteration 53500/ 115203 | consumed samples: 13696000 | consumed tokens: 28049408000 | elapsed time per iteration (s): 0.57 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 2.715310E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.450 | TFLOPs: 42.56 | +7: iteration 53510/ 115203 | consumed samples: 13698560 | consumed tokens: 28054650880 | elapsed time per iteration (s): 0.57 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 2.728175E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.651 | TFLOPs: 42.68 | +7: iteration 53520/ 115203 | consumed samples: 13701120 | consumed tokens: 28059893760 | elapsed time per iteration (s): 0.56 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 2.721840E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.991 | TFLOPs: 43.66 | +7: iteration 53530/ 115203 | consumed samples: 13703680 | consumed tokens: 28065136640 | elapsed time per iteration (s): 0.56 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 2.719781E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.882 | TFLOPs: 43.37 | +7: iteration 53540/ 115203 | consumed samples: 13706240 | consumed tokens: 28070379520 | elapsed time per iteration (s): 0.58 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 2.712955E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.796 | TFLOPs: 41.83 | +7: iteration 53550/ 115203 | consumed samples: 13708800 | consumed tokens: 28075622400 | elapsed time per iteration (s): 0.57 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 2.725327E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.379 | TFLOPs: 42.75 | +7: iteration 53560/ 115203 | consumed samples: 13711360 | consumed tokens: 28080865280 | elapsed time per iteration (s): 0.56 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 2.714725E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.096 | TFLOPs: 43.67 | +7: iteration 53570/ 115203 | consumed samples: 13713920 | consumed tokens: 28086108160 | elapsed time per iteration (s): 0.57 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 2.716190E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.547 | TFLOPs: 42.57 | +7: iteration 53580/ 115203 | consumed samples: 13716480 | consumed tokens: 28091351040 | elapsed time per iteration (s): 0.57 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 2.716846E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.698 | TFLOPs: 42.87 | +7: iteration 53590/ 115203 | consumed samples: 13719040 | consumed tokens: 28096593920 | elapsed time per iteration (s): 0.56 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 2.721928E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.276 | TFLOPs: 43.41 | +7: iteration 53600/ 115203 | consumed samples: 13721600 | consumed tokens: 28101836800 | elapsed time per iteration (s): 0.57 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 2.713617E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.977 | TFLOPs: 42.71 | +7: iteration 53610/ 115203 | consumed samples: 13724160 | consumed tokens: 28107079680 | elapsed time per iteration (s): 0.57 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 2.721079E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.802 | TFLOPs: 42.69 | +7: iteration 53620/ 115203 | consumed samples: 13726720 | consumed tokens: 28112322560 | elapsed time per iteration (s): 0.57 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 2.719485E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.310 | TFLOPs: 42.93 | +7: iteration 53630/ 115203 | consumed samples: 13729280 | consumed tokens: 28117565440 | elapsed time per iteration (s): 0.56 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 2.724354E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.780 | TFLOPs: 43.45 | +7: iteration 53640/ 115203 | consumed samples: 13731840 | consumed tokens: 28122808320 | elapsed time per iteration (s): 0.57 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 2.718363E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.562 | TFLOPs: 43.15 | +7: iteration 53650/ 115203 | consumed samples: 13734400 | consumed tokens: 28128051200 | elapsed time per iteration (s): 0.57 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 2.718534E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.222 | TFLOPs: 42.64 | +7: iteration 53660/ 115203 | consumed samples: 13736960 | consumed tokens: 28133294080 | elapsed time per iteration (s): 0.58 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 2.711220E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.248 | TFLOPs: 42.16 | +7: iteration 53670/ 115203 | consumed samples: 13739520 | consumed tokens: 28138536960 | elapsed time per iteration (s): 0.57 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 2.714072E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.411 | TFLOPs: 43.04 | +7: iteration 53680/ 115203 | consumed samples: 13742080 | consumed tokens: 28143779840 | elapsed time per iteration (s): 0.57 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 2.724641E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.498 | TFLOPs: 42.57 | +7: iteration 53690/ 115203 | consumed samples: 13744640 | consumed tokens: 28149022720 | elapsed time per iteration (s): 0.57 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 2.712157E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.609 | TFLOPs: 42.67 | +7: iteration 53700/ 115203 | consumed samples: 13747200 | consumed tokens: 28154265600 | elapsed time per iteration (s): 0.56 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 2.722186E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.359 | TFLOPs: 43.22 | +7: iteration 53710/ 115203 | consumed samples: 13749760 | consumed tokens: 28159508480 | elapsed time per iteration (s): 0.56 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 2.733733E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.445 | TFLOPs: 43.42 | +7: iteration 53720/ 115203 | consumed samples: 13752320 | consumed tokens: 28164751360 | elapsed time per iteration (s): 0.57 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 2.721142E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.915 | TFLOPs: 42.89 | +7: iteration 53730/ 115203 | consumed samples: 13754880 | consumed tokens: 28169994240 | elapsed time per iteration (s): 0.56 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 2.727076E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.720 | TFLOPs: 43.26 | +7: iteration 53740/ 115203 | consumed samples: 13757440 | consumed tokens: 28175237120 | elapsed time per iteration (s): 0.57 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 2.707960E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.221 | TFLOPs: 42.45 | +7: iteration 53750/ 115203 | consumed samples: 13760000 | consumed tokens: 28180480000 | elapsed time per iteration (s): 0.56 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 2.722095E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.338 | TFLOPs: 43.60 | +7: iteration 53760/ 115203 | consumed samples: 13762560 | consumed tokens: 28185722880 | elapsed time per iteration (s): 0.56 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 2.727111E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.725 | TFLOPs: 43.73 | +7: iteration 53770/ 115203 | consumed samples: 13765120 | consumed tokens: 28190965760 | elapsed time per iteration (s): 0.57 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 2.733147E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.309 | TFLOPs: 43.12 | +7: iteration 53780/ 115203 | consumed samples: 13767680 | consumed tokens: 28196208640 | elapsed time per iteration (s): 0.56 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 2.708837E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.148 | TFLOPs: 43.30 | +7: iteration 53790/ 115203 | consumed samples: 13770240 | consumed tokens: 28201451520 | elapsed time per iteration (s): 0.57 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 2.726849E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.317 | TFLOPs: 42.46 | +7: iteration 53800/ 115203 | consumed samples: 13772800 | consumed tokens: 28206694400 | elapsed time per iteration (s): 0.56 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 2.726798E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.093 | TFLOPs: 43.29 | +7: iteration 53810/ 115203 | consumed samples: 13775360 | consumed tokens: 28211937280 | elapsed time per iteration (s): 0.58 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 2.714101E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.337 | TFLOPs: 42.08 | +7: iteration 53820/ 115203 | consumed samples: 13777920 | consumed tokens: 28217180160 | elapsed time per iteration (s): 0.57 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 2.727324E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.350 | TFLOPs: 43.13 | +7: iteration 53830/ 115203 | consumed samples: 13780480 | consumed tokens: 28222423040 | elapsed time per iteration (s): 0.57 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 2.726905E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.920 | TFLOPs: 42.90 | +7: iteration 53840/ 115203 | consumed samples: 13783040 | consumed tokens: 28227665920 | elapsed time per iteration (s): 0.58 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 2.721856E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.697 | TFLOPs: 42.30 | +7: iteration 53850/ 115203 | consumed samples: 13785600 | consumed tokens: 28232908800 | elapsed time per iteration (s): 0.56 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 2.717091E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.986 | TFLOPs: 43.57 | +7: iteration 53860/ 115203 | consumed samples: 13788160 | consumed tokens: 28238151680 | elapsed time per iteration (s): 0.56 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 2.711203E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.909 | TFLOPs: 43.94 | +7: iteration 53870/ 115203 | consumed samples: 13790720 | consumed tokens: 28243394560 | elapsed time per iteration (s): 0.57 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 2.732265E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.787 | TFLOPs: 43.07 | +7: iteration 53880/ 115203 | consumed samples: 13793280 | consumed tokens: 28248637440 | elapsed time per iteration (s): 0.57 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 2.704770E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.641 | TFLOPs: 42.77 | +7: iteration 53890/ 115203 | consumed samples: 13795840 | consumed tokens: 28253880320 | elapsed time per iteration (s): 0.57 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 2.719636E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.860 | TFLOPs: 42.89 | +7: iteration 53900/ 115203 | consumed samples: 13798400 | consumed tokens: 28259123200 | elapsed time per iteration (s): 0.57 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 2.722964E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.908 | TFLOPs: 42.89 | +7: iteration 53910/ 115203 | consumed samples: 13800960 | consumed tokens: 28264366080 | elapsed time per iteration (s): 0.56 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 2.715470E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.352 | TFLOPs: 43.32 | +7: iteration 53920/ 115203 | consumed samples: 13803520 | consumed tokens: 28269608960 | elapsed time per iteration (s): 0.56 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 2.713342E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.200 | TFLOPs: 43.21 | +7: iteration 53930/ 115203 | consumed samples: 13806080 | consumed tokens: 28274851840 | elapsed time per iteration (s): 0.60 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 2.705324E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.420 | TFLOPs: 40.75 | +7: iteration 53940/ 115203 | consumed samples: 13808640 | consumed tokens: 28280094720 | elapsed time per iteration (s): 0.56 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 2.721506E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.562 | TFLOPs: 43.34 | +7: iteration 53950/ 115203 | consumed samples: 13811200 | consumed tokens: 28285337600 | elapsed time per iteration (s): 0.56 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 2.715297E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.023 | TFLOPs: 43.48 | +7: iteration 53960/ 115203 | consumed samples: 13813760 | consumed tokens: 28290580480 | elapsed time per iteration (s): 0.57 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 2.718365E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.965 | TFLOPs: 43.09 | +7: iteration 53970/ 115203 | consumed samples: 13816320 | consumed tokens: 28295823360 | elapsed time per iteration (s): 0.57 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 2.708635E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.574 | TFLOPs: 43.05 | +7: iteration 53980/ 115203 | consumed samples: 13818880 | consumed tokens: 28301066240 | elapsed time per iteration (s): 0.57 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 2.722974E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.913 | TFLOPs: 42.80 | +7: iteration 53990/ 115203 | consumed samples: 13821440 | consumed tokens: 28306309120 | elapsed time per iteration (s): 0.56 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 2.721101E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.990 | TFLOPs: 43.57 | +0: [2023-03-16 21:18:15,256] [INFO] [logging.py:68:log_dist] [Rank 0] step=54000, skipped=0, lr=[0.00012033461390561511, 0.00012033461390561511, 0.00012033461390561511], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 54000/ 115203 | consumed samples: 13824000 | consumed tokens: 28311552000 | elapsed time per iteration (s): 0.57 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 2.731878E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.238 | TFLOPs: 42.93 | +0: steps: 54000 loss: 2.7418 iter time (s): 0.563 samples/sec: 454.380 +7: iteration 54010/ 115203 | consumed samples: 13826560 | consumed tokens: 28316794880 | elapsed time per iteration (s): 0.57 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 2.719518E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.164 | TFLOPs: 43.11 | +7: iteration 54020/ 115203 | consumed samples: 13829120 | consumed tokens: 28322037760 | elapsed time per iteration (s): 0.57 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 2.724894E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.851 | TFLOPs: 42.51 | +7: iteration 54030/ 115203 | consumed samples: 13831680 | consumed tokens: 28327280640 | elapsed time per iteration (s): 0.56 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 2.718922E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.768 | TFLOPs: 43.26 | +7: iteration 54040/ 115203 | consumed samples: 13834240 | consumed tokens: 28332523520 | elapsed time per iteration (s): 0.57 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 2.723388E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.120 | TFLOPs: 42.82 | +7: iteration 54050/ 115203 | consumed samples: 13836800 | consumed tokens: 28337766400 | elapsed time per iteration (s): 0.56 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 2.720787E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.890 | TFLOPs: 43.37 | +7: iteration 54060/ 115203 | consumed samples: 13839360 | consumed tokens: 28343009280 | elapsed time per iteration (s): 0.57 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 2.717692E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.073 | TFLOPs: 42.62 | +7: iteration 54070/ 115203 | consumed samples: 13841920 | consumed tokens: 28348252160 | elapsed time per iteration (s): 0.56 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 2.705722E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.374 | TFLOPs: 43.61 | +7: iteration 54080/ 115203 | consumed samples: 13844480 | consumed tokens: 28353495040 | elapsed time per iteration (s): 0.57 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 2.711112E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.890 | TFLOPs: 43.08 | +7: iteration 54090/ 115203 | consumed samples: 13847040 | consumed tokens: 28358737920 | elapsed time per iteration (s): 0.57 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 2.724882E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.338 | TFLOPs: 42.74 | +7: iteration 54100/ 115203 | consumed samples: 13849600 | consumed tokens: 28363980800 | elapsed time per iteration (s): 0.56 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 2.702595E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.789 | TFLOPs: 43.45 | +7: iteration 54110/ 115203 | consumed samples: 13852160 | consumed tokens: 28369223680 | elapsed time per iteration (s): 0.56 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 2.716873E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.854 | TFLOPs: 43.56 | +7: iteration 54120/ 115203 | consumed samples: 13854720 | consumed tokens: 28374466560 | elapsed time per iteration (s): 0.56 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 2.705639E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.255 | TFLOPs: 43.31 | +7: iteration 54130/ 115203 | consumed samples: 13857280 | consumed tokens: 28379709440 | elapsed time per iteration (s): 0.56 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 2.713000E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.102 | TFLOPs: 43.68 | +7: iteration 54140/ 115203 | consumed samples: 13859840 | consumed tokens: 28384952320 | elapsed time per iteration (s): 0.58 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 2.727183E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.609 | TFLOPs: 42.29 | +7: iteration 54150/ 115203 | consumed samples: 13862400 | consumed tokens: 28390195200 | elapsed time per iteration (s): 0.56 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 2.711798E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.879 | TFLOPs: 43.56 | +7: iteration 54160/ 115203 | consumed samples: 13864960 | consumed tokens: 28395438080 | elapsed time per iteration (s): 0.59 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 2.727040E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.371 | TFLOPs: 41.60 | +7: iteration 54170/ 115203 | consumed samples: 13867520 | consumed tokens: 28400680960 | elapsed time per iteration (s): 0.58 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 2.711172E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.812 | TFLOPs: 41.84 | +7: iteration 54180/ 115203 | consumed samples: 13870080 | consumed tokens: 28405923840 | elapsed time per iteration (s): 0.57 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 2.709521E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.800 | TFLOPs: 42.69 | +7: iteration 54190/ 115203 | consumed samples: 13872640 | consumed tokens: 28411166720 | elapsed time per iteration (s): 0.58 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 2.714454E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.755 | TFLOPs: 41.93 | +7: iteration 54200/ 115203 | consumed samples: 13875200 | consumed tokens: 28416409600 | elapsed time per iteration (s): 0.59 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 2.713128E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.419 | TFLOPs: 41.70 | +7: iteration 54210/ 115203 | consumed samples: 13877760 | consumed tokens: 28421652480 | elapsed time per iteration (s): 0.58 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 2.714748E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.349 | TFLOPs: 42.17 | +7: iteration 54220/ 115203 | consumed samples: 13880320 | consumed tokens: 28426895360 | elapsed time per iteration (s): 0.57 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 2.706124E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.346 | TFLOPs: 42.65 | +7: iteration 54230/ 115203 | consumed samples: 13882880 | consumed tokens: 28432138240 | elapsed time per iteration (s): 0.57 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 2.711502E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.568 | TFLOPs: 42.96 | +7: iteration 54240/ 115203 | consumed samples: 13885440 | consumed tokens: 28437381120 | elapsed time per iteration (s): 0.58 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 2.714348E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.786 | TFLOPs: 42.41 | +7: iteration 54250/ 115203 | consumed samples: 13888000 | consumed tokens: 28442624000 | elapsed time per iteration (s): 0.58 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 2.712688E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.728 | TFLOPs: 42.11 | +7: iteration 54260/ 115203 | consumed samples: 13890560 | consumed tokens: 28447866880 | elapsed time per iteration (s): 0.57 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 2.697645E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.217 | TFLOPs: 43.11 | +7: iteration 54270/ 115203 | consumed samples: 13893120 | consumed tokens: 28453109760 | elapsed time per iteration (s): 0.57 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 2.718149E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.308 | TFLOPs: 42.46 | +7: iteration 54280/ 115203 | consumed samples: 13895680 | consumed tokens: 28458352640 | elapsed time per iteration (s): 0.57 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 2.726750E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.454 | TFLOPs: 42.76 | +7: iteration 54290/ 115203 | consumed samples: 13898240 | consumed tokens: 28463595520 | elapsed time per iteration (s): 0.58 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 2.720910E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.991 | TFLOPs: 41.95 | +7: iteration 54300/ 115203 | consumed samples: 13900800 | consumed tokens: 28468838400 | elapsed time per iteration (s): 0.58 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 2.710123E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.684 | TFLOPs: 42.01 | +7: iteration 54310/ 115203 | consumed samples: 13903360 | consumed tokens: 28474081280 | elapsed time per iteration (s): 0.56 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 2.720631E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.975 | TFLOPs: 43.76 | +7: iteration 54320/ 115203 | consumed samples: 13905920 | consumed tokens: 28479324160 | elapsed time per iteration (s): 0.60 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 2.714684E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.965 | TFLOPs: 40.99 | +7: iteration 54330/ 115203 | consumed samples: 13908480 | consumed tokens: 28484567040 | elapsed time per iteration (s): 0.58 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 2.707595E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.742 | TFLOPs: 42.02 | +7: iteration 54340/ 115203 | consumed samples: 13911040 | consumed tokens: 28489809920 | elapsed time per iteration (s): 0.58 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 2.719756E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.232 | TFLOPs: 42.16 | +7: iteration 54350/ 115203 | consumed samples: 13913600 | consumed tokens: 28495052800 | elapsed time per iteration (s): 0.60 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 2.718181E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.213 | TFLOPs: 40.92 | +7: iteration 54360/ 115203 | consumed samples: 13916160 | consumed tokens: 28500295680 | elapsed time per iteration (s): 0.56 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 2.703407E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.718 | TFLOPs: 43.54 | +7: iteration 54370/ 115203 | consumed samples: 13918720 | consumed tokens: 28505538560 | elapsed time per iteration (s): 0.56 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 2.698631E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.743 | TFLOPs: 43.55 | +7: iteration 54380/ 115203 | consumed samples: 13921280 | consumed tokens: 28510781440 | elapsed time per iteration (s): 0.59 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 2.720794E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.087 | TFLOPs: 41.67 | +7: iteration 54390/ 115203 | consumed samples: 13923840 | consumed tokens: 28516024320 | elapsed time per iteration (s): 0.57 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 2.711419E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.877 | TFLOPs: 42.89 | +7: iteration 54400/ 115203 | consumed samples: 13926400 | consumed tokens: 28521267200 | elapsed time per iteration (s): 0.57 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 2.715032E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.039 | TFLOPs: 43.19 | +7: iteration 54410/ 115203 | consumed samples: 13928960 | consumed tokens: 28526510080 | elapsed time per iteration (s): 0.57 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 2.722070E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.473 | TFLOPs: 42.76 | +7: iteration 54420/ 115203 | consumed samples: 13931520 | consumed tokens: 28531752960 | elapsed time per iteration (s): 0.57 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 2.716440E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.853 | TFLOPs: 42.79 | +7: iteration 54430/ 115203 | consumed samples: 13934080 | consumed tokens: 28536995840 | elapsed time per iteration (s): 0.59 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 2.704382E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.000 | TFLOPs: 41.19 | +7: iteration 54440/ 115203 | consumed samples: 13936640 | consumed tokens: 28542238720 | elapsed time per iteration (s): 0.56 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 2.719949E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.074 | TFLOPs: 43.39 | +7: iteration 54450/ 115203 | consumed samples: 13939200 | consumed tokens: 28547481600 | elapsed time per iteration (s): 0.57 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 2.731606E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.051 | TFLOPs: 43.19 | +7: iteration 54460/ 115203 | consumed samples: 13941760 | consumed tokens: 28552724480 | elapsed time per iteration (s): 0.57 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 2.723498E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.606 | TFLOPs: 42.77 | +7: iteration 54470/ 115203 | consumed samples: 13944320 | consumed tokens: 28557967360 | elapsed time per iteration (s): 0.57 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 2.711562E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.560 | TFLOPs: 42.57 | +7: iteration 54480/ 115203 | consumed samples: 13946880 | consumed tokens: 28563210240 | elapsed time per iteration (s): 0.57 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 2.724029E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.812 | TFLOPs: 43.08 | +7: iteration 54490/ 115203 | consumed samples: 13949440 | consumed tokens: 28568453120 | elapsed time per iteration (s): 0.57 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 2.727181E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.768 | TFLOPs: 43.17 | +7: iteration 54500/ 115203 | consumed samples: 13952000 | consumed tokens: 28573696000 | elapsed time per iteration (s): 0.56 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 2.706896E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.042 | TFLOPs: 43.57 | +7: iteration 54510/ 115203 | consumed samples: 13954560 | consumed tokens: 28578938880 | elapsed time per iteration (s): 0.57 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 2.723498E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.857 | TFLOPs: 42.70 | +7: iteration 54520/ 115203 | consumed samples: 13957120 | consumed tokens: 28584181760 | elapsed time per iteration (s): 0.57 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 2.716148E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.850 | TFLOPs: 43.08 | +7: iteration 54530/ 115203 | consumed samples: 13959680 | consumed tokens: 28589424640 | elapsed time per iteration (s): 0.57 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 2.721046E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.542 | TFLOPs: 42.95 | +7: iteration 54540/ 115203 | consumed samples: 13962240 | consumed tokens: 28594667520 | elapsed time per iteration (s): 0.57 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 2.711752E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.674 | TFLOPs: 43.06 | +7: iteration 54550/ 115203 | consumed samples: 13964800 | consumed tokens: 28599910400 | elapsed time per iteration (s): 0.57 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 2.715909E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.385 | TFLOPs: 42.84 | +7: iteration 54560/ 115203 | consumed samples: 13967360 | consumed tokens: 28605153280 | elapsed time per iteration (s): 0.56 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 2.729827E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.622 | TFLOPs: 43.25 | +7: iteration 54570/ 115203 | consumed samples: 13969920 | consumed tokens: 28610396160 | elapsed time per iteration (s): 0.57 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 2.702130E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.155 | TFLOPs: 42.63 | +7: iteration 54580/ 115203 | consumed samples: 13972480 | consumed tokens: 28615639040 | elapsed time per iteration (s): 0.56 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 2.716329E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.811 | TFLOPs: 43.65 | +7: iteration 54590/ 115203 | consumed samples: 13975040 | consumed tokens: 28620881920 | elapsed time per iteration (s): 0.58 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 2.709810E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.444 | TFLOPs: 42.28 | +7: iteration 54600/ 115203 | consumed samples: 13977600 | consumed tokens: 28626124800 | elapsed time per iteration (s): 0.57 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 2.725097E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.298 | TFLOPs: 43.03 | +7: iteration 54610/ 115203 | consumed samples: 13980160 | consumed tokens: 28631367680 | elapsed time per iteration (s): 0.58 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 2.704899E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.444 | TFLOPs: 42.18 | +7: iteration 54620/ 115203 | consumed samples: 13982720 | consumed tokens: 28636610560 | elapsed time per iteration (s): 0.57 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 2.699854E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.376 | TFLOPs: 42.65 | +7: iteration 54630/ 115203 | consumed samples: 13985280 | consumed tokens: 28641853440 | elapsed time per iteration (s): 0.58 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 2.701132E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.908 | TFLOPs: 42.42 | +7: iteration 54640/ 115203 | consumed samples: 13987840 | consumed tokens: 28647096320 | elapsed time per iteration (s): 0.56 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 2.730617E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.154 | TFLOPs: 43.58 | +7: iteration 54650/ 115203 | consumed samples: 13990400 | consumed tokens: 28652339200 | elapsed time per iteration (s): 0.57 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 2.709106E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.610 | TFLOPs: 43.06 | +7: iteration 54660/ 115203 | consumed samples: 13992960 | consumed tokens: 28657582080 | elapsed time per iteration (s): 0.58 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 2.695213E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.711 | TFLOPs: 42.40 | +7: iteration 54670/ 115203 | consumed samples: 13995520 | consumed tokens: 28662824960 | elapsed time per iteration (s): 0.57 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 2.713685E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.784 | TFLOPs: 42.69 | +7: iteration 54680/ 115203 | consumed samples: 13998080 | consumed tokens: 28668067840 | elapsed time per iteration (s): 0.56 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 2.734094E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.457 | TFLOPs: 43.52 | +7: iteration 54690/ 115203 | consumed samples: 14000640 | consumed tokens: 28673310720 | elapsed time per iteration (s): 0.57 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 2.711374E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.678 | TFLOPs: 43.16 | +7: iteration 54700/ 115203 | consumed samples: 14003200 | consumed tokens: 28678553600 | elapsed time per iteration (s): 0.56 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 2.718365E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.497 | TFLOPs: 43.24 | +7: iteration 54710/ 115203 | consumed samples: 14005760 | consumed tokens: 28683796480 | elapsed time per iteration (s): 0.57 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 2.733150E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.366 | TFLOPs: 43.03 | +7: iteration 54720/ 115203 | consumed samples: 14008320 | consumed tokens: 28689039360 | elapsed time per iteration (s): 0.56 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 2.706296E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.176 | TFLOPs: 43.87 | +7: iteration 54730/ 115203 | consumed samples: 14010880 | consumed tokens: 28694282240 | elapsed time per iteration (s): 0.56 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 2.699996E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.695 | TFLOPs: 43.45 | +7: iteration 54740/ 115203 | consumed samples: 14013440 | consumed tokens: 28699525120 | elapsed time per iteration (s): 0.57 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 2.714210E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.318 | TFLOPs: 42.74 | +7: iteration 54750/ 115203 | consumed samples: 14016000 | consumed tokens: 28704768000 | elapsed time per iteration (s): 0.57 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 2.714482E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.343 | TFLOPs: 43.13 | +7: iteration 54760/ 115203 | consumed samples: 14018560 | consumed tokens: 28710010880 | elapsed time per iteration (s): 0.56 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 2.717645E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.736 | TFLOPs: 43.35 | +7: iteration 54770/ 115203 | consumed samples: 14021120 | consumed tokens: 28715253760 | elapsed time per iteration (s): 0.56 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 2.710266E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.420 | TFLOPs: 43.23 | +7: iteration 54780/ 115203 | consumed samples: 14023680 | consumed tokens: 28720496640 | elapsed time per iteration (s): 0.57 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 2.715224E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.898 | TFLOPs: 43.08 | +7: iteration 54790/ 115203 | consumed samples: 14026240 | consumed tokens: 28725739520 | elapsed time per iteration (s): 0.56 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 2.716270E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.181 | TFLOPs: 43.40 | +7: iteration 54800/ 115203 | consumed samples: 14028800 | consumed tokens: 28730982400 | elapsed time per iteration (s): 0.56 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 2.708467E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.722 | TFLOPs: 43.54 | +7: iteration 54810/ 115203 | consumed samples: 14031360 | consumed tokens: 28736225280 | elapsed time per iteration (s): 0.56 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 2.697497E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.793 | TFLOPs: 43.26 | +7: iteration 54820/ 115203 | consumed samples: 14033920 | consumed tokens: 28741468160 | elapsed time per iteration (s): 0.56 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 2.714927E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.361 | TFLOPs: 43.22 | +7: iteration 54830/ 115203 | consumed samples: 14036480 | consumed tokens: 28746711040 | elapsed time per iteration (s): 0.57 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 2.703386E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.206 | TFLOPs: 42.64 | +7: iteration 54840/ 115203 | consumed samples: 14039040 | consumed tokens: 28751953920 | elapsed time per iteration (s): 0.57 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 2.694841E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.768 | TFLOPs: 43.07 | +7: iteration 54850/ 115203 | consumed samples: 14041600 | consumed tokens: 28757196800 | elapsed time per iteration (s): 0.56 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 2.709727E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.312 | TFLOPs: 43.22 | +7: iteration 54860/ 115203 | consumed samples: 14044160 | consumed tokens: 28762439680 | elapsed time per iteration (s): 0.56 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 2.721126E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.431 | TFLOPs: 43.52 | +7: iteration 54870/ 115203 | consumed samples: 14046720 | consumed tokens: 28767682560 | elapsed time per iteration (s): 0.56 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 2.712177E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.690 | TFLOPs: 43.54 | +7: iteration 54880/ 115203 | consumed samples: 14049280 | consumed tokens: 28772925440 | elapsed time per iteration (s): 0.57 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 2.706896E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.212 | TFLOPs: 42.73 | +7: iteration 54890/ 115203 | consumed samples: 14051840 | consumed tokens: 28778168320 | elapsed time per iteration (s): 0.58 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 2.716103E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.049 | TFLOPs: 42.24 | +7: iteration 54900/ 115203 | consumed samples: 14054400 | consumed tokens: 28783411200 | elapsed time per iteration (s): 0.57 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 2.720829E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.588 | TFLOPs: 43.15 | +7: iteration 54910/ 115203 | consumed samples: 14056960 | consumed tokens: 28788654080 | elapsed time per iteration (s): 0.57 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 2.702085E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.246 | TFLOPs: 43.12 | +7: iteration 54920/ 115203 | consumed samples: 14059520 | consumed tokens: 28793896960 | elapsed time per iteration (s): 0.56 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 2.708925E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.746 | TFLOPs: 43.26 | +7: iteration 54930/ 115203 | consumed samples: 14062080 | consumed tokens: 28799139840 | elapsed time per iteration (s): 0.56 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 2.703339E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.503 | TFLOPs: 43.62 | +7: iteration 54940/ 115203 | consumed samples: 14064640 | consumed tokens: 28804382720 | elapsed time per iteration (s): 0.55 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 2.727061E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.369 | TFLOPs: 43.99 | +7: iteration 54950/ 115203 | consumed samples: 14067200 | consumed tokens: 28809625600 | elapsed time per iteration (s): 0.57 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 2.700910E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.035 | TFLOPs: 43.10 | +7: iteration 54960/ 115203 | consumed samples: 14069760 | consumed tokens: 28814868480 | elapsed time per iteration (s): 0.57 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 2.710456E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.815 | TFLOPs: 42.89 | +7: iteration 54970/ 115203 | consumed samples: 14072320 | consumed tokens: 28820111360 | elapsed time per iteration (s): 0.56 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 2.702622E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.573 | TFLOPs: 43.82 | +7: iteration 54980/ 115203 | consumed samples: 14074880 | consumed tokens: 28825354240 | elapsed time per iteration (s): 0.56 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 2.718831E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.870 | TFLOPs: 43.46 | +7: iteration 54990/ 115203 | consumed samples: 14077440 | consumed tokens: 28830597120 | elapsed time per iteration (s): 0.57 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 2.703089E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.796 | TFLOPs: 42.60 | +7: iteration 55000/ 115203 | consumed samples: 14080000 | consumed tokens: 28835840000 | elapsed time per iteration (s): 0.57 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 2.720302E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.358 | TFLOPs: 43.13 | +7: iteration 55010/ 115203 | consumed samples: 14082560 | consumed tokens: 28841082880 | elapsed time per iteration (s): 0.58 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 2.705381E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.495 | TFLOPs: 42.38 | +7: iteration 55020/ 115203 | consumed samples: 14085120 | consumed tokens: 28846325760 | elapsed time per iteration (s): 0.56 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 2.712347E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.388 | TFLOPs: 43.61 | +7: iteration 55030/ 115203 | consumed samples: 14087680 | consumed tokens: 28851568640 | elapsed time per iteration (s): 0.56 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 2.719011E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.557 | TFLOPs: 43.24 | +7: iteration 55040/ 115203 | consumed samples: 14090240 | consumed tokens: 28856811520 | elapsed time per iteration (s): 0.59 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 2.707661E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.944 | TFLOPs: 41.66 | +7: iteration 55050/ 115203 | consumed samples: 14092800 | consumed tokens: 28862054400 | elapsed time per iteration (s): 0.56 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 2.707634E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.031 | TFLOPs: 43.57 | +7: iteration 55060/ 115203 | consumed samples: 14095360 | consumed tokens: 28867297280 | elapsed time per iteration (s): 0.57 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 2.712178E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.353 | TFLOPs: 42.94 | +7: iteration 55070/ 115203 | consumed samples: 14097920 | consumed tokens: 28872540160 | elapsed time per iteration (s): 0.57 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 2.715093E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.345 | TFLOPs: 42.74 | +7: iteration 55080/ 115203 | consumed samples: 14100480 | consumed tokens: 28877783040 | elapsed time per iteration (s): 0.57 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 2.710750E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.059 | TFLOPs: 42.91 | +7: iteration 55090/ 115203 | consumed samples: 14103040 | consumed tokens: 28883025920 | elapsed time per iteration (s): 0.58 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 2.710706E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.816 | TFLOPs: 42.22 | +7: iteration 55100/ 115203 | consumed samples: 14105600 | consumed tokens: 28888268800 | elapsed time per iteration (s): 0.58 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 2.714893E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.105 | TFLOPs: 42.34 | +7: iteration 55110/ 115203 | consumed samples: 14108160 | consumed tokens: 28893511680 | elapsed time per iteration (s): 0.57 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 2.712928E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.997 | TFLOPs: 43.19 | +7: iteration 55120/ 115203 | consumed samples: 14110720 | consumed tokens: 28898754560 | elapsed time per iteration (s): 0.57 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 2.708597E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.730 | TFLOPs: 42.69 | +7: iteration 55130/ 115203 | consumed samples: 14113280 | consumed tokens: 28903997440 | elapsed time per iteration (s): 0.57 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 2.716592E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.508 | TFLOPs: 42.57 | +7: iteration 55140/ 115203 | consumed samples: 14115840 | consumed tokens: 28909240320 | elapsed time per iteration (s): 0.56 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 2.704327E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.794 | TFLOPs: 43.74 | +7: iteration 55150/ 115203 | consumed samples: 14118400 | consumed tokens: 28914483200 | elapsed time per iteration (s): 0.58 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 2.695837E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.219 | TFLOPs: 42.35 | +7: iteration 55160/ 115203 | consumed samples: 14120960 | consumed tokens: 28919726080 | elapsed time per iteration (s): 0.59 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 2.722103E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.449 | TFLOPs: 41.23 | +7: iteration 55170/ 115203 | consumed samples: 14123520 | consumed tokens: 28924968960 | elapsed time per iteration (s): 0.56 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 2.724672E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.635 | TFLOPs: 43.34 | +7: iteration 55180/ 115203 | consumed samples: 14126080 | consumed tokens: 28930211840 | elapsed time per iteration (s): 0.56 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 2.712820E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.190 | TFLOPs: 43.30 | +7: iteration 55190/ 115203 | consumed samples: 14128640 | consumed tokens: 28935454720 | elapsed time per iteration (s): 0.58 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 2.715245E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.884 | TFLOPs: 41.75 | +7: iteration 55200/ 115203 | consumed samples: 14131200 | consumed tokens: 28940697600 | elapsed time per iteration (s): 0.56 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 2.714678E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.251 | TFLOPs: 43.59 | +7: iteration 55210/ 115203 | consumed samples: 14133760 | consumed tokens: 28945940480 | elapsed time per iteration (s): 0.58 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 2.703999E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.645 | TFLOPs: 42.11 | +7: iteration 55220/ 115203 | consumed samples: 14136320 | consumed tokens: 28951183360 | elapsed time per iteration (s): 0.56 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 2.707075E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.104 | TFLOPs: 43.39 | +7: iteration 55230/ 115203 | consumed samples: 14138880 | consumed tokens: 28956426240 | elapsed time per iteration (s): 0.56 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 2.716901E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.776 | TFLOPs: 43.64 | +7: iteration 55240/ 115203 | consumed samples: 14141440 | consumed tokens: 28961669120 | elapsed time per iteration (s): 0.56 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 2.703814E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.330 | TFLOPs: 43.60 | +7: iteration 55250/ 115203 | consumed samples: 14144000 | consumed tokens: 28966912000 | elapsed time per iteration (s): 0.56 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 2.706514E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.180 | TFLOPs: 43.49 | +7: iteration 55260/ 115203 | consumed samples: 14146560 | consumed tokens: 28972154880 | elapsed time per iteration (s): 0.56 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 2.715365E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.581 | TFLOPs: 43.53 | +7: iteration 55270/ 115203 | consumed samples: 14149120 | consumed tokens: 28977397760 | elapsed time per iteration (s): 0.57 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 2.710343E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.344 | TFLOPs: 43.03 | +7: iteration 55280/ 115203 | consumed samples: 14151680 | consumed tokens: 28982640640 | elapsed time per iteration (s): 0.56 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 2.721693E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.221 | TFLOPs: 43.69 | +7: iteration 55290/ 115203 | consumed samples: 14154240 | consumed tokens: 28987883520 | elapsed time per iteration (s): 0.57 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 2.723181E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.273 | TFLOPs: 42.74 | +7: iteration 55300/ 115203 | consumed samples: 14156800 | consumed tokens: 28993126400 | elapsed time per iteration (s): 0.57 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 2.701018E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.146 | TFLOPs: 43.11 | +7: iteration 55310/ 115203 | consumed samples: 14159360 | consumed tokens: 28998369280 | elapsed time per iteration (s): 0.56 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 2.712899E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.740 | TFLOPs: 43.35 | +7: iteration 55320/ 115203 | consumed samples: 14161920 | consumed tokens: 29003612160 | elapsed time per iteration (s): 0.57 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 2.698128E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.343 | TFLOPs: 42.74 | +7: iteration 55330/ 115203 | consumed samples: 14164480 | consumed tokens: 29008855040 | elapsed time per iteration (s): 0.56 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 2.713506E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.716 | TFLOPs: 43.64 | +7: iteration 55340/ 115203 | consumed samples: 14167040 | consumed tokens: 29014097920 | elapsed time per iteration (s): 0.57 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 2.723162E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.886 | TFLOPs: 42.80 | +7: iteration 55350/ 115203 | consumed samples: 14169600 | consumed tokens: 29019340800 | elapsed time per iteration (s): 0.56 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 2.709840E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.553 | TFLOPs: 43.34 | +7: iteration 55360/ 115203 | consumed samples: 14172160 | consumed tokens: 29024583680 | elapsed time per iteration (s): 0.57 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 2.706890E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.029 | TFLOPs: 42.62 | +7: iteration 55370/ 115203 | consumed samples: 14174720 | consumed tokens: 29029826560 | elapsed time per iteration (s): 0.57 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 2.706400E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.787 | TFLOPs: 43.17 | +7: iteration 55380/ 115203 | consumed samples: 14177280 | consumed tokens: 29035069440 | elapsed time per iteration (s): 0.57 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 2.699309E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.846 | TFLOPs: 43.17 | +7: iteration 55390/ 115203 | consumed samples: 14179840 | consumed tokens: 29040312320 | elapsed time per iteration (s): 0.58 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 2.710725E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.243 | TFLOPs: 41.78 | +7: iteration 55400/ 115203 | consumed samples: 14182400 | consumed tokens: 29045555200 | elapsed time per iteration (s): 0.57 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 2.703356E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.114 | TFLOPs: 43.01 | +7: iteration 55410/ 115203 | consumed samples: 14184960 | consumed tokens: 29050798080 | elapsed time per iteration (s): 0.58 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 2.715026E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.683 | TFLOPs: 42.40 | +7: iteration 55420/ 115203 | consumed samples: 14187520 | consumed tokens: 29056040960 | elapsed time per iteration (s): 0.58 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 2.726360E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.060 | TFLOPs: 42.05 | +7: iteration 55430/ 115203 | consumed samples: 14190080 | consumed tokens: 29061283840 | elapsed time per iteration (s): 0.59 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 2.704609E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.970 | TFLOPs: 41.66 | +7: iteration 55440/ 115203 | consumed samples: 14192640 | consumed tokens: 29066526720 | elapsed time per iteration (s): 0.56 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 2.708608E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.300 | TFLOPs: 43.60 | +7: iteration 55450/ 115203 | consumed samples: 14195200 | consumed tokens: 29071769600 | elapsed time per iteration (s): 0.56 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 2.716936E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.400 | TFLOPs: 43.23 | +7: iteration 55460/ 115203 | consumed samples: 14197760 | consumed tokens: 29077012480 | elapsed time per iteration (s): 0.58 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 2.719040E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.497 | TFLOPs: 42.00 | +7: iteration 55470/ 115203 | consumed samples: 14200320 | consumed tokens: 29082255360 | elapsed time per iteration (s): 0.56 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 2.713679E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.381 | TFLOPs: 43.22 | +7: iteration 55480/ 115203 | consumed samples: 14202880 | consumed tokens: 29087498240 | elapsed time per iteration (s): 0.57 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 2.702615E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.007 | TFLOPs: 43.19 | +7: iteration 55490/ 115203 | consumed samples: 14205440 | consumed tokens: 29092741120 | elapsed time per iteration (s): 0.57 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 2.703125E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.806 | TFLOPs: 42.98 | +7: iteration 55500/ 115203 | consumed samples: 14208000 | consumed tokens: 29097984000 | elapsed time per iteration (s): 0.58 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 2.712454E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.470 | TFLOPs: 42.09 | +7: iteration 55510/ 115203 | consumed samples: 14210560 | consumed tokens: 29103226880 | elapsed time per iteration (s): 0.57 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 2.708857E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.893 | TFLOPs: 42.99 | +7: iteration 55520/ 115203 | consumed samples: 14213120 | consumed tokens: 29108469760 | elapsed time per iteration (s): 0.56 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 2.701892E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.666 | TFLOPs: 43.35 | +7: iteration 55530/ 115203 | consumed samples: 14215680 | consumed tokens: 29113712640 | elapsed time per iteration (s): 0.57 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 2.715056E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.808 | TFLOPs: 42.69 | +7: iteration 55540/ 115203 | consumed samples: 14218240 | consumed tokens: 29118955520 | elapsed time per iteration (s): 0.56 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 2.700386E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.443 | TFLOPs: 43.23 | +7: iteration 55550/ 115203 | consumed samples: 14220800 | consumed tokens: 29124198400 | elapsed time per iteration (s): 0.57 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 2.704502E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.839 | TFLOPs: 42.89 | +7: iteration 55560/ 115203 | consumed samples: 14223360 | consumed tokens: 29129441280 | elapsed time per iteration (s): 0.57 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 2.724185E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.479 | TFLOPs: 42.95 | +7: iteration 55570/ 115203 | consumed samples: 14225920 | consumed tokens: 29134684160 | elapsed time per iteration (s): 0.59 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 2.720615E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.697 | TFLOPs: 41.16 | +7: iteration 55580/ 115203 | consumed samples: 14228480 | consumed tokens: 29139927040 | elapsed time per iteration (s): 0.59 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 2.702959E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.202 | TFLOPs: 41.30 | +7: iteration 55590/ 115203 | consumed samples: 14231040 | consumed tokens: 29145169920 | elapsed time per iteration (s): 0.59 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 2.704451E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.187 | TFLOPs: 41.30 | +7: iteration 55600/ 115203 | consumed samples: 14233600 | consumed tokens: 29150412800 | elapsed time per iteration (s): 0.56 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 2.696946E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.660 | TFLOPs: 43.35 | +7: iteration 55610/ 115203 | consumed samples: 14236160 | consumed tokens: 29155655680 | elapsed time per iteration (s): 0.58 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 2.715215E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.033 | TFLOPs: 42.43 | +7: iteration 55620/ 115203 | consumed samples: 14238720 | consumed tokens: 29160898560 | elapsed time per iteration (s): 0.56 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 2.713981E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.531 | TFLOPs: 43.24 | +7: iteration 55630/ 115203 | consumed samples: 14241280 | consumed tokens: 29166141440 | elapsed time per iteration (s): 0.56 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 2.726134E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.991 | TFLOPs: 43.47 | +7: iteration 55640/ 115203 | consumed samples: 14243840 | consumed tokens: 29171384320 | elapsed time per iteration (s): 0.57 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 2.711952E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.913 | TFLOPs: 42.99 | +7: iteration 55650/ 115203 | consumed samples: 14246400 | consumed tokens: 29176627200 | elapsed time per iteration (s): 0.61 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 2.722533E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 417.710 | TFLOPs: 39.82 | +7: iteration 55660/ 115203 | consumed samples: 14248960 | consumed tokens: 29181870080 | elapsed time per iteration (s): 0.58 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 2.712944E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.458 | TFLOPs: 42.09 | +7: iteration 55670/ 115203 | consumed samples: 14251520 | consumed tokens: 29187112960 | elapsed time per iteration (s): 0.56 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 2.705557E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.830 | TFLOPs: 43.36 | +7: iteration 55680/ 115203 | consumed samples: 14254080 | consumed tokens: 29192355840 | elapsed time per iteration (s): 0.56 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 2.700420E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.290 | TFLOPs: 43.41 | +7: iteration 55690/ 115203 | consumed samples: 14256640 | consumed tokens: 29197598720 | elapsed time per iteration (s): 0.57 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 2.710794E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.812 | TFLOPs: 42.50 | +7: iteration 55700/ 115203 | consumed samples: 14259200 | consumed tokens: 29202841600 | elapsed time per iteration (s): 0.56 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 2.703820E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.364 | TFLOPs: 43.70 | +7: iteration 55710/ 115203 | consumed samples: 14261760 | consumed tokens: 29208084480 | elapsed time per iteration (s): 0.57 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 2.718486E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.532 | TFLOPs: 43.14 | +7: iteration 55720/ 115203 | consumed samples: 14264320 | consumed tokens: 29213327360 | elapsed time per iteration (s): 0.57 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 2.707365E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.868 | TFLOPs: 43.08 | +7: iteration 55730/ 115203 | consumed samples: 14266880 | consumed tokens: 29218570240 | elapsed time per iteration (s): 0.58 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 2.687542E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.927 | TFLOPs: 42.13 | +7: iteration 55740/ 115203 | consumed samples: 14269440 | consumed tokens: 29223813120 | elapsed time per iteration (s): 0.57 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 2.697791E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.609 | TFLOPs: 43.06 | +7: iteration 55750/ 115203 | consumed samples: 14272000 | consumed tokens: 29229056000 | elapsed time per iteration (s): 0.56 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 2.715974E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.490 | TFLOPs: 43.24 | +7: iteration 55760/ 115203 | consumed samples: 14274560 | consumed tokens: 29234298880 | elapsed time per iteration (s): 0.58 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 2.722043E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.814 | TFLOPs: 42.41 | +7: iteration 55770/ 115203 | consumed samples: 14277120 | consumed tokens: 29239541760 | elapsed time per iteration (s): 0.57 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 2.717195E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.078 | TFLOPs: 42.91 | +7: iteration 55780/ 115203 | consumed samples: 14279680 | consumed tokens: 29244784640 | elapsed time per iteration (s): 0.56 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 2.712724E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.132 | TFLOPs: 43.68 | +7: iteration 55790/ 115203 | consumed samples: 14282240 | consumed tokens: 29250027520 | elapsed time per iteration (s): 0.57 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 2.706531E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.392 | TFLOPs: 42.94 | +7: iteration 55800/ 115203 | consumed samples: 14284800 | consumed tokens: 29255270400 | elapsed time per iteration (s): 0.56 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 2.732132E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.938 | TFLOPs: 43.56 | +7: iteration 55810/ 115203 | consumed samples: 14287360 | consumed tokens: 29260513280 | elapsed time per iteration (s): 0.56 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 2.709237E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.055 | TFLOPs: 43.96 | +7: iteration 55820/ 115203 | consumed samples: 14289920 | consumed tokens: 29265756160 | elapsed time per iteration (s): 0.57 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 2.713929E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.200 | TFLOPs: 42.92 | +7: iteration 55830/ 115203 | consumed samples: 14292480 | consumed tokens: 29270999040 | elapsed time per iteration (s): 0.57 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 2.700324E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.333 | TFLOPs: 42.93 | +7: iteration 55840/ 115203 | consumed samples: 14295040 | consumed tokens: 29276241920 | elapsed time per iteration (s): 0.56 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 2.719494E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.573 | TFLOPs: 43.72 | +7: iteration 55850/ 115203 | consumed samples: 14297600 | consumed tokens: 29281484800 | elapsed time per iteration (s): 0.56 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 2.706797E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.946 | TFLOPs: 43.47 | +7: iteration 55860/ 115203 | consumed samples: 14300160 | consumed tokens: 29286727680 | elapsed time per iteration (s): 0.56 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 2.712641E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.925 | TFLOPs: 43.37 | +7: iteration 55870/ 115203 | consumed samples: 14302720 | consumed tokens: 29291970560 | elapsed time per iteration (s): 0.56 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 2.693782E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.764 | TFLOPs: 43.26 | +7: iteration 55880/ 115203 | consumed samples: 14305280 | consumed tokens: 29297213440 | elapsed time per iteration (s): 0.58 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 2.705364E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.976 | TFLOPs: 42.23 | +7: iteration 55890/ 115203 | consumed samples: 14307840 | consumed tokens: 29302456320 | elapsed time per iteration (s): 0.56 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 2.697097E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.801 | TFLOPs: 43.26 | +7: iteration 55900/ 115203 | consumed samples: 14310400 | consumed tokens: 29307699200 | elapsed time per iteration (s): 0.56 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 2.715842E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.947 | TFLOPs: 43.56 | +7: iteration 55910/ 115203 | consumed samples: 14312960 | consumed tokens: 29312942080 | elapsed time per iteration (s): 0.55 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 2.704320E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.622 | TFLOPs: 44.01 | +7: iteration 55920/ 115203 | consumed samples: 14315520 | consumed tokens: 29318184960 | elapsed time per iteration (s): 0.55 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 2.710882E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.562 | TFLOPs: 44.00 | +7: iteration 55930/ 115203 | consumed samples: 14318080 | consumed tokens: 29323427840 | elapsed time per iteration (s): 0.57 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 2.704368E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.945 | TFLOPs: 42.52 | +7: iteration 55940/ 115203 | consumed samples: 14320640 | consumed tokens: 29328670720 | elapsed time per iteration (s): 0.56 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 2.701274E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.231 | TFLOPs: 43.31 | +7: iteration 55950/ 115203 | consumed samples: 14323200 | consumed tokens: 29333913600 | elapsed time per iteration (s): 0.57 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 2.693913E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.023 | TFLOPs: 43.10 | +7: iteration 55960/ 115203 | consumed samples: 14325760 | consumed tokens: 29339156480 | elapsed time per iteration (s): 0.56 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 2.704265E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.170 | TFLOPs: 43.49 | +7: iteration 55970/ 115203 | consumed samples: 14328320 | consumed tokens: 29344399360 | elapsed time per iteration (s): 0.57 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 2.703526E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.395 | TFLOPs: 42.75 | +7: iteration 55980/ 115203 | consumed samples: 14330880 | consumed tokens: 29349642240 | elapsed time per iteration (s): 0.56 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 2.710309E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.097 | TFLOPs: 43.29 | +7: iteration 55990/ 115203 | consumed samples: 14333440 | consumed tokens: 29354885120 | elapsed time per iteration (s): 0.56 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 2.711916E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.216 | TFLOPs: 43.21 | +0: [2023-03-16 21:37:13,362] [INFO] [logging.py:68:log_dist] [Rank 0] step=56000, skipped=0, lr=[0.00011539606744822729, 0.00011539606744822729, 0.00011539606744822729], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 56000/ 115203 | consumed samples: 14336000 | consumed tokens: 29360128000 | elapsed time per iteration (s): 0.58 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 2.714665E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.034 | TFLOPs: 42.43 | +0: steps: 56000 loss: 2.7410 iter time (s): 0.567 samples/sec: 451.837 +7: iteration 56010/ 115203 | consumed samples: 14338560 | consumed tokens: 29365370880 | elapsed time per iteration (s): 0.57 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 2.708751E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.444 | TFLOPs: 43.14 | +7: iteration 56020/ 115203 | consumed samples: 14341120 | consumed tokens: 29370613760 | elapsed time per iteration (s): 0.56 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 2.714913E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.689 | TFLOPs: 43.64 | +7: iteration 56030/ 115203 | consumed samples: 14343680 | consumed tokens: 29375856640 | elapsed time per iteration (s): 0.56 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 2.713313E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.413 | TFLOPs: 43.51 | +7: iteration 56040/ 115203 | consumed samples: 14346240 | consumed tokens: 29381099520 | elapsed time per iteration (s): 0.56 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 2.708723E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 56050/ 115203 | consumed samples: 14348800 | consumed tokens: 29386342400 | elapsed time per iteration (s): 0.56 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 2.712012E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.695 | TFLOPs: 43.64 | +7: iteration 56060/ 115203 | consumed samples: 14351360 | consumed tokens: 29391585280 | elapsed time per iteration (s): 0.57 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 2.708955E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.744 | TFLOPs: 42.78 | +7: iteration 56070/ 115203 | consumed samples: 14353920 | consumed tokens: 29396828160 | elapsed time per iteration (s): 0.57 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 2.711471E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.024 | TFLOPs: 42.90 | +7: iteration 56080/ 115203 | consumed samples: 14356480 | consumed tokens: 29402071040 | elapsed time per iteration (s): 0.57 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 2.720375E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.884 | TFLOPs: 43.08 | +7: iteration 56090/ 115203 | consumed samples: 14359040 | consumed tokens: 29407313920 | elapsed time per iteration (s): 0.59 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 2.695672E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.333 | TFLOPs: 41.22 | +7: iteration 56100/ 115203 | consumed samples: 14361600 | consumed tokens: 29412556800 | elapsed time per iteration (s): 0.60 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 2.705378E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.594 | TFLOPs: 40.39 | +7: iteration 56110/ 115203 | consumed samples: 14364160 | consumed tokens: 29417799680 | elapsed time per iteration (s): 0.57 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 2.690520E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.657 | TFLOPs: 43.16 | +7: iteration 56120/ 115203 | consumed samples: 14366720 | consumed tokens: 29423042560 | elapsed time per iteration (s): 0.56 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 2.712224E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.793 | TFLOPs: 43.26 | +7: iteration 56130/ 115203 | consumed samples: 14369280 | consumed tokens: 29428285440 | elapsed time per iteration (s): 0.57 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 2.701369E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.763 | TFLOPs: 43.17 | +7: iteration 56140/ 115203 | consumed samples: 14371840 | consumed tokens: 29433528320 | elapsed time per iteration (s): 0.57 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 2.708186E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.594 | TFLOPs: 42.58 | +7: iteration 56150/ 115203 | consumed samples: 14374400 | consumed tokens: 29438771200 | elapsed time per iteration (s): 0.56 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 2.712536E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.719 | TFLOPs: 43.45 | +7: iteration 56160/ 115203 | consumed samples: 14376960 | consumed tokens: 29444014080 | elapsed time per iteration (s): 0.57 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 2.710272E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.785 | TFLOPs: 42.98 | +7: iteration 56170/ 115203 | consumed samples: 14379520 | consumed tokens: 29449256960 | elapsed time per iteration (s): 0.57 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 2.706290E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.062 | TFLOPs: 42.81 | +7: iteration 56180/ 115203 | consumed samples: 14382080 | consumed tokens: 29454499840 | elapsed time per iteration (s): 0.57 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 2.713794E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.965 | TFLOPs: 42.90 | +7: iteration 56190/ 115203 | consumed samples: 14384640 | consumed tokens: 29459742720 | elapsed time per iteration (s): 0.56 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 2.708077E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.484 | TFLOPs: 43.62 | +7: iteration 56200/ 115203 | consumed samples: 14387200 | consumed tokens: 29464985600 | elapsed time per iteration (s): 0.59 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 2.704902E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.755 | TFLOPs: 41.54 | +7: iteration 56210/ 115203 | consumed samples: 14389760 | consumed tokens: 29470228480 | elapsed time per iteration (s): 0.56 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 2.708706E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.270 | TFLOPs: 43.21 | +7: iteration 56220/ 115203 | consumed samples: 14392320 | consumed tokens: 29475471360 | elapsed time per iteration (s): 0.57 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 2.715577E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.967 | TFLOPs: 43.09 | +7: iteration 56230/ 115203 | consumed samples: 14394880 | consumed tokens: 29480714240 | elapsed time per iteration (s): 0.56 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 2.703550E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.091 | TFLOPs: 43.48 | +7: iteration 56240/ 115203 | consumed samples: 14397440 | consumed tokens: 29485957120 | elapsed time per iteration (s): 0.57 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 2.707026E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.861 | TFLOPs: 43.18 | +7: iteration 56250/ 115203 | consumed samples: 14400000 | consumed tokens: 29491200000 | elapsed time per iteration (s): 0.56 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 2.698151E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.751 | TFLOPs: 43.55 | +7: iteration 56260/ 115203 | consumed samples: 14402560 | consumed tokens: 29496442880 | elapsed time per iteration (s): 0.56 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 2.716152E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.578 | TFLOPs: 43.34 | +7: iteration 56270/ 115203 | consumed samples: 14405120 | consumed tokens: 29501685760 | elapsed time per iteration (s): 0.56 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 2.714875E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.632 | TFLOPs: 43.34 | +7: iteration 56280/ 115203 | consumed samples: 14407680 | consumed tokens: 29506928640 | elapsed time per iteration (s): 0.56 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 2.701333E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.453 | TFLOPs: 43.52 | +7: iteration 56290/ 115203 | consumed samples: 14410240 | consumed tokens: 29512171520 | elapsed time per iteration (s): 0.56 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 2.727222E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.151 | TFLOPs: 43.49 | +7: iteration 56300/ 115203 | consumed samples: 14412800 | consumed tokens: 29517414400 | elapsed time per iteration (s): 0.57 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 2.696682E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.962 | TFLOPs: 42.99 | +7: iteration 56310/ 115203 | consumed samples: 14415360 | consumed tokens: 29522657280 | elapsed time per iteration (s): 0.56 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 2.716721E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.681 | TFLOPs: 43.63 | +7: iteration 56320/ 115203 | consumed samples: 14417920 | consumed tokens: 29527900160 | elapsed time per iteration (s): 0.56 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 2.701408E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.570 | TFLOPs: 43.43 | +7: iteration 56330/ 115203 | consumed samples: 14420480 | consumed tokens: 29533143040 | elapsed time per iteration (s): 0.56 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 2.722060E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.589 | TFLOPs: 43.44 | +7: iteration 56340/ 115203 | consumed samples: 14423040 | consumed tokens: 29538385920 | elapsed time per iteration (s): 0.56 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 2.711170E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.849 | TFLOPs: 43.56 | +7: iteration 56350/ 115203 | consumed samples: 14425600 | consumed tokens: 29543628800 | elapsed time per iteration (s): 0.55 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 2.708305E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.536 | TFLOPs: 44.00 | +7: iteration 56360/ 115203 | consumed samples: 14428160 | consumed tokens: 29548871680 | elapsed time per iteration (s): 0.56 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 2.709958E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.037 | TFLOPs: 43.38 | +7: iteration 56370/ 115203 | consumed samples: 14430720 | consumed tokens: 29554114560 | elapsed time per iteration (s): 0.56 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 2.706551E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.149 | TFLOPs: 43.39 | +7: iteration 56380/ 115203 | consumed samples: 14433280 | consumed tokens: 29559357440 | elapsed time per iteration (s): 0.55 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 2.716333E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.401 | TFLOPs: 43.99 | +7: iteration 56390/ 115203 | consumed samples: 14435840 | consumed tokens: 29564600320 | elapsed time per iteration (s): 0.57 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 2.718874E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.722 | TFLOPs: 43.07 | +7: iteration 56400/ 115203 | consumed samples: 14438400 | consumed tokens: 29569843200 | elapsed time per iteration (s): 0.57 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 2.717006E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.167 | TFLOPs: 43.11 | +7: iteration 56410/ 115203 | consumed samples: 14440960 | consumed tokens: 29575086080 | elapsed time per iteration (s): 0.56 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 2.708556E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.249 | TFLOPs: 43.40 | +7: iteration 56420/ 115203 | consumed samples: 14443520 | consumed tokens: 29580328960 | elapsed time per iteration (s): 0.57 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 2.708103E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.712 | TFLOPs: 43.07 | +7: iteration 56430/ 115203 | consumed samples: 14446080 | consumed tokens: 29585571840 | elapsed time per iteration (s): 0.57 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 2.707909E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.920 | TFLOPs: 42.80 | +7: iteration 56440/ 115203 | consumed samples: 14448640 | consumed tokens: 29590814720 | elapsed time per iteration (s): 0.57 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 2.715722E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.677 | TFLOPs: 43.06 | +7: iteration 56450/ 115203 | consumed samples: 14451200 | consumed tokens: 29596057600 | elapsed time per iteration (s): 0.56 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 2.711835E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.352 | TFLOPs: 43.51 | +7: iteration 56460/ 115203 | consumed samples: 14453760 | consumed tokens: 29601300480 | elapsed time per iteration (s): 0.56 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 2.717294E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.364 | TFLOPs: 43.41 | +7: iteration 56470/ 115203 | consumed samples: 14456320 | consumed tokens: 29606543360 | elapsed time per iteration (s): 0.56 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 2.711190E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.621 | TFLOPs: 43.25 | +7: iteration 56480/ 115203 | consumed samples: 14458880 | consumed tokens: 29611786240 | elapsed time per iteration (s): 0.57 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 2.718184E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.987 | TFLOPs: 43.19 | +7: iteration 56490/ 115203 | consumed samples: 14461440 | consumed tokens: 29617029120 | elapsed time per iteration (s): 0.56 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 2.712914E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.628 | TFLOPs: 43.73 | +7: iteration 56500/ 115203 | consumed samples: 14464000 | consumed tokens: 29622272000 | elapsed time per iteration (s): 0.56 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 2.704195E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.753 | TFLOPs: 43.36 | +7: iteration 56510/ 115203 | consumed samples: 14466560 | consumed tokens: 29627514880 | elapsed time per iteration (s): 0.56 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 2.713359E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.204 | TFLOPs: 43.21 | +7: iteration 56520/ 115203 | consumed samples: 14469120 | consumed tokens: 29632757760 | elapsed time per iteration (s): 0.56 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 2.712353E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.191 | TFLOPs: 43.87 | +7: iteration 56530/ 115203 | consumed samples: 14471680 | consumed tokens: 29638000640 | elapsed time per iteration (s): 0.55 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 2.707560E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.544 | TFLOPs: 44.00 | +7: iteration 56540/ 115203 | consumed samples: 14474240 | consumed tokens: 29643243520 | elapsed time per iteration (s): 0.55 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 2.701631E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.557 | TFLOPs: 44.00 | +7: iteration 56550/ 115203 | consumed samples: 14476800 | consumed tokens: 29648486400 | elapsed time per iteration (s): 0.56 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 2.698326E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.625 | TFLOPs: 43.25 | +7: iteration 56560/ 115203 | consumed samples: 14479360 | consumed tokens: 29653729280 | elapsed time per iteration (s): 0.56 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 2.704819E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.222 | TFLOPs: 43.21 | +7: iteration 56570/ 115203 | consumed samples: 14481920 | consumed tokens: 29658972160 | elapsed time per iteration (s): 0.57 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 2.715384E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.598 | TFLOPs: 43.06 | +7: iteration 56580/ 115203 | consumed samples: 14484480 | consumed tokens: 29664215040 | elapsed time per iteration (s): 0.56 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 2.715123E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.998 | TFLOPs: 43.28 | +7: iteration 56590/ 115203 | consumed samples: 14487040 | consumed tokens: 29669457920 | elapsed time per iteration (s): 0.56 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 2.714105E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.258 | TFLOPs: 43.31 | +7: iteration 56600/ 115203 | consumed samples: 14489600 | consumed tokens: 29674700800 | elapsed time per iteration (s): 0.57 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 2.700687E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.680 | TFLOPs: 42.97 | +7: iteration 56610/ 115203 | consumed samples: 14492160 | consumed tokens: 29679943680 | elapsed time per iteration (s): 0.59 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 2.714458E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.288 | TFLOPs: 41.69 | +7: iteration 56620/ 115203 | consumed samples: 14494720 | consumed tokens: 29685186560 | elapsed time per iteration (s): 0.56 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 2.706809E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.988 | TFLOPs: 43.28 | +7: iteration 56630/ 115203 | consumed samples: 14497280 | consumed tokens: 29690429440 | elapsed time per iteration (s): 0.57 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 2.695625E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.850 | TFLOPs: 42.89 | +7: iteration 56640/ 115203 | consumed samples: 14499840 | consumed tokens: 29695672320 | elapsed time per iteration (s): 0.57 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 2.716722E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.120 | TFLOPs: 43.10 | +7: iteration 56650/ 115203 | consumed samples: 14502400 | consumed tokens: 29700915200 | elapsed time per iteration (s): 0.56 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 2.697134E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.695 | TFLOPs: 43.64 | +7: iteration 56660/ 115203 | consumed samples: 14504960 | consumed tokens: 29706158080 | elapsed time per iteration (s): 0.57 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 2.704541E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.231 | TFLOPs: 43.12 | +7: iteration 56670/ 115203 | consumed samples: 14507520 | consumed tokens: 29711400960 | elapsed time per iteration (s): 0.56 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 2.702247E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.297 | TFLOPs: 43.50 | +7: iteration 56680/ 115203 | consumed samples: 14510080 | consumed tokens: 29716643840 | elapsed time per iteration (s): 0.56 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 2.707929E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.412 | TFLOPs: 43.42 | +7: iteration 56690/ 115203 | consumed samples: 14512640 | consumed tokens: 29721886720 | elapsed time per iteration (s): 0.56 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 2.703375E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 56700/ 115203 | consumed samples: 14515200 | consumed tokens: 29727129600 | elapsed time per iteration (s): 0.56 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 2.725806E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.763 | TFLOPs: 43.36 | +7: iteration 56710/ 115203 | consumed samples: 14517760 | consumed tokens: 29732372480 | elapsed time per iteration (s): 0.56 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 2.719544E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.216 | TFLOPs: 43.59 | +7: iteration 56720/ 115203 | consumed samples: 14520320 | consumed tokens: 29737615360 | elapsed time per iteration (s): 0.57 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 2.710542E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.422 | TFLOPs: 42.56 | +7: iteration 56730/ 115203 | consumed samples: 14522880 | consumed tokens: 29742858240 | elapsed time per iteration (s): 0.56 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 2.710038E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.558 | TFLOPs: 43.81 | +7: iteration 56740/ 115203 | consumed samples: 14525440 | consumed tokens: 29748101120 | elapsed time per iteration (s): 0.56 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 2.695460E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.947 | TFLOPs: 43.95 | +7: iteration 56750/ 115203 | consumed samples: 14528000 | consumed tokens: 29753344000 | elapsed time per iteration (s): 0.56 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 2.709482E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.029 | TFLOPs: 43.29 | +7: iteration 56760/ 115203 | consumed samples: 14530560 | consumed tokens: 29758586880 | elapsed time per iteration (s): 0.56 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 2.704770E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.872 | TFLOPs: 43.65 | +7: iteration 56770/ 115203 | consumed samples: 14533120 | consumed tokens: 29763829760 | elapsed time per iteration (s): 0.56 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 2.695601E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.647 | TFLOPs: 43.35 | +7: iteration 56780/ 115203 | consumed samples: 14535680 | consumed tokens: 29769072640 | elapsed time per iteration (s): 0.57 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 2.705450E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.216 | TFLOPs: 42.92 | +7: iteration 56790/ 115203 | consumed samples: 14538240 | consumed tokens: 29774315520 | elapsed time per iteration (s): 0.58 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 2.706750E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.429 | TFLOPs: 42.18 | +7: iteration 56800/ 115203 | consumed samples: 14540800 | consumed tokens: 29779558400 | elapsed time per iteration (s): 0.56 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 2.696842E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.292 | TFLOPs: 43.88 | +7: iteration 56810/ 115203 | consumed samples: 14543360 | consumed tokens: 29784801280 | elapsed time per iteration (s): 0.56 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 2.702937E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.335 | TFLOPs: 43.51 | +7: iteration 56820/ 115203 | consumed samples: 14545920 | consumed tokens: 29790044160 | elapsed time per iteration (s): 0.57 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 2.702761E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.048 | TFLOPs: 43.19 | +7: iteration 56830/ 115203 | consumed samples: 14548480 | consumed tokens: 29795287040 | elapsed time per iteration (s): 0.56 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 2.708185E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.997 | TFLOPs: 43.47 | +7: iteration 56840/ 115203 | consumed samples: 14551040 | consumed tokens: 29800529920 | elapsed time per iteration (s): 0.57 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 2.689772E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.796 | TFLOPs: 43.07 | +7: iteration 56850/ 115203 | consumed samples: 14553600 | consumed tokens: 29805772800 | elapsed time per iteration (s): 0.56 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 2.712802E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.704 | TFLOPs: 43.73 | +7: iteration 56860/ 115203 | consumed samples: 14556160 | consumed tokens: 29811015680 | elapsed time per iteration (s): 0.56 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 2.698363E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.431 | TFLOPs: 43.23 | +7: iteration 56870/ 115203 | consumed samples: 14558720 | consumed tokens: 29816258560 | elapsed time per iteration (s): 0.56 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 2.686725E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.148 | TFLOPs: 43.49 | +7: iteration 56880/ 115203 | consumed samples: 14561280 | consumed tokens: 29821501440 | elapsed time per iteration (s): 0.56 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 2.722609E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.989 | TFLOPs: 43.47 | +7: iteration 56890/ 115203 | consumed samples: 14563840 | consumed tokens: 29826744320 | elapsed time per iteration (s): 0.57 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 2.697549E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.437 | TFLOPs: 43.13 | +7: iteration 56900/ 115203 | consumed samples: 14566400 | consumed tokens: 29831987200 | elapsed time per iteration (s): 0.56 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 2.709160E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.384 | TFLOPs: 43.23 | +7: iteration 56910/ 115203 | consumed samples: 14568960 | consumed tokens: 29837230080 | elapsed time per iteration (s): 0.56 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 2.716575E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.272 | TFLOPs: 43.21 | +7: iteration 56920/ 115203 | consumed samples: 14571520 | consumed tokens: 29842472960 | elapsed time per iteration (s): 0.56 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 2.714673E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.031 | TFLOPs: 43.29 | +7: iteration 56930/ 115203 | consumed samples: 14574080 | consumed tokens: 29847715840 | elapsed time per iteration (s): 0.57 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 2.701316E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.551 | TFLOPs: 42.76 | +7: iteration 56940/ 115203 | consumed samples: 14576640 | consumed tokens: 29852958720 | elapsed time per iteration (s): 0.56 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 2.705031E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.767 | TFLOPs: 43.64 | +7: iteration 56950/ 115203 | consumed samples: 14579200 | consumed tokens: 29858201600 | elapsed time per iteration (s): 0.56 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 2.713204E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.225 | TFLOPs: 43.40 | +7: iteration 56960/ 115203 | consumed samples: 14581760 | consumed tokens: 29863444480 | elapsed time per iteration (s): 0.57 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 2.695609E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.127 | TFLOPs: 43.11 | +7: iteration 56970/ 115203 | consumed samples: 14584320 | consumed tokens: 29868687360 | elapsed time per iteration (s): 0.57 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 2.707576E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.981 | TFLOPs: 42.71 | +7: iteration 56980/ 115203 | consumed samples: 14586880 | consumed tokens: 29873930240 | elapsed time per iteration (s): 0.56 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 2.708140E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.002 | TFLOPs: 43.57 | +7: iteration 56990/ 115203 | consumed samples: 14589440 | consumed tokens: 29879173120 | elapsed time per iteration (s): 0.57 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 2.720345E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.857 | TFLOPs: 42.98 | +7: iteration 57000/ 115203 | consumed samples: 14592000 | consumed tokens: 29884416000 | elapsed time per iteration (s): 0.57 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 2.711982E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.413 | TFLOPs: 43.13 | +7: iteration 57010/ 115203 | consumed samples: 14594560 | consumed tokens: 29889658880 | elapsed time per iteration (s): 0.56 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 2.714477E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.744 | TFLOPs: 43.35 | +7: iteration 57020/ 115203 | consumed samples: 14597120 | consumed tokens: 29894901760 | elapsed time per iteration (s): 0.56 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 2.711228E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.566 | TFLOPs: 43.53 | +7: iteration 57030/ 115203 | consumed samples: 14599680 | consumed tokens: 29900144640 | elapsed time per iteration (s): 0.57 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 2.706530E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.073 | TFLOPs: 42.91 | +7: iteration 57040/ 115203 | consumed samples: 14602240 | consumed tokens: 29905387520 | elapsed time per iteration (s): 0.55 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 2.708659E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.379 | TFLOPs: 43.99 | +7: iteration 57050/ 115203 | consumed samples: 14604800 | consumed tokens: 29910630400 | elapsed time per iteration (s): 0.57 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 2.708433E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.104 | TFLOPs: 42.72 | +7: iteration 57060/ 115203 | consumed samples: 14607360 | consumed tokens: 29915873280 | elapsed time per iteration (s): 0.56 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 2.704426E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.138 | TFLOPs: 43.30 | +7: iteration 57070/ 115203 | consumed samples: 14609920 | consumed tokens: 29921116160 | elapsed time per iteration (s): 0.57 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 2.702783E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.152 | TFLOPs: 42.92 | +7: iteration 57080/ 115203 | consumed samples: 14612480 | consumed tokens: 29926359040 | elapsed time per iteration (s): 0.56 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 2.702507E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.984 | TFLOPs: 43.28 | +7: iteration 57090/ 115203 | consumed samples: 14615040 | consumed tokens: 29931601920 | elapsed time per iteration (s): 0.57 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 2.706021E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.964 | TFLOPs: 43.09 | +7: iteration 57100/ 115203 | consumed samples: 14617600 | consumed tokens: 29936844800 | elapsed time per iteration (s): 0.58 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 2.706220E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.341 | TFLOPs: 42.27 | +7: iteration 57110/ 115203 | consumed samples: 14620160 | consumed tokens: 29942087680 | elapsed time per iteration (s): 0.56 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 2.712767E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.921 | TFLOPs: 43.28 | +7: iteration 57120/ 115203 | consumed samples: 14622720 | consumed tokens: 29947330560 | elapsed time per iteration (s): 0.57 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 2.704634E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.953 | TFLOPs: 42.99 | +7: iteration 57130/ 115203 | consumed samples: 14625280 | consumed tokens: 29952573440 | elapsed time per iteration (s): 0.57 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 2.697386E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.343 | TFLOPs: 42.84 | +7: iteration 57140/ 115203 | consumed samples: 14627840 | consumed tokens: 29957816320 | elapsed time per iteration (s): 0.55 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 2.696369E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.683 | TFLOPs: 44.02 | +7: iteration 57150/ 115203 | consumed samples: 14630400 | consumed tokens: 29963059200 | elapsed time per iteration (s): 0.56 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 2.703828E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.026 | TFLOPs: 43.57 | +7: iteration 57160/ 115203 | consumed samples: 14632960 | consumed tokens: 29968302080 | elapsed time per iteration (s): 0.56 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 2.700417E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.713 | TFLOPs: 43.54 | +7: iteration 57170/ 115203 | consumed samples: 14635520 | consumed tokens: 29973544960 | elapsed time per iteration (s): 0.57 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 2.701623E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.471 | TFLOPs: 42.76 | +7: iteration 57180/ 115203 | consumed samples: 14638080 | consumed tokens: 29978787840 | elapsed time per iteration (s): 0.56 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 2.708868E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.768 | TFLOPs: 43.36 | +7: iteration 57190/ 115203 | consumed samples: 14640640 | consumed tokens: 29984030720 | elapsed time per iteration (s): 0.58 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 2.703614E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.006 | TFLOPs: 42.33 | +7: iteration 57200/ 115203 | consumed samples: 14643200 | consumed tokens: 29989273600 | elapsed time per iteration (s): 0.57 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 2.703850E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.991 | TFLOPs: 43.09 | +7: iteration 57210/ 115203 | consumed samples: 14645760 | consumed tokens: 29994516480 | elapsed time per iteration (s): 0.56 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 2.711940E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.340 | TFLOPs: 43.51 | +7: iteration 57220/ 115203 | consumed samples: 14648320 | consumed tokens: 29999759360 | elapsed time per iteration (s): 0.56 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 2.698366E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.933 | TFLOPs: 43.47 | +7: iteration 57230/ 115203 | consumed samples: 14650880 | consumed tokens: 30005002240 | elapsed time per iteration (s): 0.56 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 2.706206E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.238 | TFLOPs: 43.21 | +7: iteration 57240/ 115203 | consumed samples: 14653440 | consumed tokens: 30010245120 | elapsed time per iteration (s): 0.56 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 2.715885E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.814 | TFLOPs: 43.55 | +7: iteration 57250/ 115203 | consumed samples: 14656000 | consumed tokens: 30015488000 | elapsed time per iteration (s): 0.56 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 2.687220E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.874 | TFLOPs: 43.27 | +7: iteration 57260/ 115203 | consumed samples: 14658560 | consumed tokens: 30020730880 | elapsed time per iteration (s): 0.56 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 2.712269E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.501 | TFLOPs: 43.71 | +7: iteration 57270/ 115203 | consumed samples: 14661120 | consumed tokens: 30025973760 | elapsed time per iteration (s): 0.56 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 2.707338E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.399 | TFLOPs: 43.70 | +7: iteration 57280/ 115203 | consumed samples: 14663680 | consumed tokens: 30031216640 | elapsed time per iteration (s): 0.57 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 2.703608E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.479 | TFLOPs: 42.95 | +7: iteration 57290/ 115203 | consumed samples: 14666240 | consumed tokens: 30036459520 | elapsed time per iteration (s): 0.56 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 2.709561E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.721 | TFLOPs: 43.64 | +7: iteration 57300/ 115203 | consumed samples: 14668800 | consumed tokens: 30041702400 | elapsed time per iteration (s): 0.57 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 2.695492E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.361 | TFLOPs: 42.94 | +7: iteration 57310/ 115203 | consumed samples: 14671360 | consumed tokens: 30046945280 | elapsed time per iteration (s): 0.57 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 2.706167E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.947 | TFLOPs: 42.61 | +7: iteration 57320/ 115203 | consumed samples: 14673920 | consumed tokens: 30052188160 | elapsed time per iteration (s): 0.58 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 2.695484E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.105 | TFLOPs: 42.44 | +7: iteration 57330/ 115203 | consumed samples: 14676480 | consumed tokens: 30057431040 | elapsed time per iteration (s): 0.57 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 2.709754E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.383 | TFLOPs: 43.03 | +7: iteration 57340/ 115203 | consumed samples: 14679040 | consumed tokens: 30062673920 | elapsed time per iteration (s): 0.57 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 2.707630E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.630 | TFLOPs: 43.06 | +7: iteration 57350/ 115203 | consumed samples: 14681600 | consumed tokens: 30067916800 | elapsed time per iteration (s): 0.56 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 2.692488E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.549 | TFLOPs: 43.24 | +7: iteration 57360/ 115203 | consumed samples: 14684160 | consumed tokens: 30073159680 | elapsed time per iteration (s): 0.57 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 2.703748E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.787 | TFLOPs: 42.60 | +7: iteration 57370/ 115203 | consumed samples: 14686720 | consumed tokens: 30078402560 | elapsed time per iteration (s): 0.57 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 2.714142E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.572 | TFLOPs: 42.67 | +7: iteration 57380/ 115203 | consumed samples: 14689280 | consumed tokens: 30083645440 | elapsed time per iteration (s): 0.57 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 2.711882E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.285 | TFLOPs: 42.83 | +7: iteration 57390/ 115203 | consumed samples: 14691840 | consumed tokens: 30088888320 | elapsed time per iteration (s): 0.56 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 2.698956E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.665 | TFLOPs: 43.35 | +7: iteration 57400/ 115203 | consumed samples: 14694400 | consumed tokens: 30094131200 | elapsed time per iteration (s): 0.59 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 2.706580E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.035 | TFLOPs: 41.48 | +7: iteration 57410/ 115203 | consumed samples: 14696960 | consumed tokens: 30099374080 | elapsed time per iteration (s): 0.57 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 2.699022E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.842 | TFLOPs: 42.89 | +7: iteration 57420/ 115203 | consumed samples: 14699520 | consumed tokens: 30104616960 | elapsed time per iteration (s): 0.57 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 2.707861E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.244 | TFLOPs: 42.93 | +7: iteration 57430/ 115203 | consumed samples: 14702080 | consumed tokens: 30109859840 | elapsed time per iteration (s): 0.57 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 2.705597E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.902 | TFLOPs: 42.61 | +7: iteration 57440/ 115203 | consumed samples: 14704640 | consumed tokens: 30115102720 | elapsed time per iteration (s): 0.57 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 2.712012E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.084 | TFLOPs: 42.91 | +7: iteration 57450/ 115203 | consumed samples: 14707200 | consumed tokens: 30120345600 | elapsed time per iteration (s): 0.57 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 2.708209E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.097 | TFLOPs: 42.63 | +7: iteration 57460/ 115203 | consumed samples: 14709760 | consumed tokens: 30125588480 | elapsed time per iteration (s): 0.56 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 2.675767E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.999 | TFLOPs: 43.28 | +7: iteration 57470/ 115203 | consumed samples: 14712320 | consumed tokens: 30130831360 | elapsed time per iteration (s): 0.58 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 2.710752E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.994 | TFLOPs: 42.43 | +7: iteration 57480/ 115203 | consumed samples: 14714880 | consumed tokens: 30136074240 | elapsed time per iteration (s): 0.57 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 2.711309E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.231 | TFLOPs: 42.83 | +7: iteration 57490/ 115203 | consumed samples: 14717440 | consumed tokens: 30141317120 | elapsed time per iteration (s): 0.57 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 2.701648E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.773 | TFLOPs: 43.07 | +7: iteration 57500/ 115203 | consumed samples: 14720000 | consumed tokens: 30146560000 | elapsed time per iteration (s): 0.57 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 2.697808E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.235 | TFLOPs: 42.45 | +7: iteration 57510/ 115203 | consumed samples: 14722560 | consumed tokens: 30151802880 | elapsed time per iteration (s): 0.57 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 2.710525E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.816 | TFLOPs: 42.89 | +7: iteration 57520/ 115203 | consumed samples: 14725120 | consumed tokens: 30157045760 | elapsed time per iteration (s): 0.55 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 2.691540E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.588 | TFLOPs: 44.01 | +7: iteration 57530/ 115203 | consumed samples: 14727680 | consumed tokens: 30162288640 | elapsed time per iteration (s): 0.56 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 2.714224E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.139 | TFLOPs: 43.30 | +7: iteration 57540/ 115203 | consumed samples: 14730240 | consumed tokens: 30167531520 | elapsed time per iteration (s): 0.56 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 2.698560E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.548 | TFLOPs: 43.72 | +7: iteration 57550/ 115203 | consumed samples: 14732800 | consumed tokens: 30172774400 | elapsed time per iteration (s): 0.56 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 2.703275E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.258 | TFLOPs: 43.21 | +7: iteration 57560/ 115203 | consumed samples: 14735360 | consumed tokens: 30178017280 | elapsed time per iteration (s): 0.58 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 2.699420E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.907 | TFLOPs: 42.23 | +7: iteration 57570/ 115203 | consumed samples: 14737920 | consumed tokens: 30183260160 | elapsed time per iteration (s): 0.57 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 2.689786E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.000 | TFLOPs: 42.81 | +7: iteration 57580/ 115203 | consumed samples: 14740480 | consumed tokens: 30188503040 | elapsed time per iteration (s): 0.55 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 2.697969E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.471 | TFLOPs: 44.00 | +7: iteration 57590/ 115203 | consumed samples: 14743040 | consumed tokens: 30193745920 | elapsed time per iteration (s): 0.57 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 2.697346E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.259 | TFLOPs: 42.64 | +7: iteration 57600/ 115203 | consumed samples: 14745600 | consumed tokens: 30198988800 | elapsed time per iteration (s): 0.56 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 2.696918E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.490 | TFLOPs: 43.52 | +7: iteration 57610/ 115203 | consumed samples: 14748160 | consumed tokens: 30204231680 | elapsed time per iteration (s): 0.55 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 2.714600E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.455 | TFLOPs: 43.99 | +7: iteration 57620/ 115203 | consumed samples: 14750720 | consumed tokens: 30209474560 | elapsed time per iteration (s): 0.55 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 2.708750E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.544 | TFLOPs: 44.00 | +7: iteration 57630/ 115203 | consumed samples: 14753280 | consumed tokens: 30214717440 | elapsed time per iteration (s): 0.56 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 2.696140E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.058 | TFLOPs: 43.38 | +7: iteration 57640/ 115203 | consumed samples: 14755840 | consumed tokens: 30219960320 | elapsed time per iteration (s): 0.57 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 2.708912E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.209 | TFLOPs: 43.11 | +7: iteration 57650/ 115203 | consumed samples: 14758400 | consumed tokens: 30225203200 | elapsed time per iteration (s): 0.55 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 2.708275E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.478 | TFLOPs: 44.00 | +7: iteration 57660/ 115203 | consumed samples: 14760960 | consumed tokens: 30230446080 | elapsed time per iteration (s): 0.57 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 2.700073E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.526 | TFLOPs: 42.86 | +7: iteration 57670/ 115203 | consumed samples: 14763520 | consumed tokens: 30235688960 | elapsed time per iteration (s): 0.56 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 2.704292E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.242 | TFLOPs: 43.50 | +7: iteration 57680/ 115203 | consumed samples: 14766080 | consumed tokens: 30240931840 | elapsed time per iteration (s): 0.56 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 2.680001E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.833 | TFLOPs: 43.74 | +7: iteration 57690/ 115203 | consumed samples: 14768640 | consumed tokens: 30246174720 | elapsed time per iteration (s): 0.56 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 2.691264E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.358 | TFLOPs: 43.60 | +7: iteration 57700/ 115203 | consumed samples: 14771200 | consumed tokens: 30251417600 | elapsed time per iteration (s): 0.57 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 2.708505E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.747 | TFLOPs: 42.78 | +7: iteration 57710/ 115203 | consumed samples: 14773760 | consumed tokens: 30256660480 | elapsed time per iteration (s): 0.57 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 2.692350E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.231 | TFLOPs: 42.83 | +7: iteration 57720/ 115203 | consumed samples: 14776320 | consumed tokens: 30261903360 | elapsed time per iteration (s): 0.56 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 2.715014E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.818 | TFLOPs: 43.55 | +7: iteration 57730/ 115203 | consumed samples: 14778880 | consumed tokens: 30267146240 | elapsed time per iteration (s): 0.56 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 2.713148E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.712 | TFLOPs: 43.35 | +7: iteration 57740/ 115203 | consumed samples: 14781440 | consumed tokens: 30272389120 | elapsed time per iteration (s): 0.56 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 2.689444E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.594 | TFLOPs: 43.53 | +7: iteration 57750/ 115203 | consumed samples: 14784000 | consumed tokens: 30277632000 | elapsed time per iteration (s): 0.55 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 2.684100E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.396 | TFLOPs: 43.99 | +7: iteration 57760/ 115203 | consumed samples: 14786560 | consumed tokens: 30282874880 | elapsed time per iteration (s): 0.56 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 2.708480E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.016 | TFLOPs: 43.38 | +7: iteration 57770/ 115203 | consumed samples: 14789120 | consumed tokens: 30288117760 | elapsed time per iteration (s): 0.55 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 2.709363E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.296 | TFLOPs: 43.98 | +7: iteration 57780/ 115203 | consumed samples: 14791680 | consumed tokens: 30293360640 | elapsed time per iteration (s): 0.56 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 2.688552E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.512 | TFLOPs: 43.62 | +7: iteration 57790/ 115203 | consumed samples: 14794240 | consumed tokens: 30298603520 | elapsed time per iteration (s): 0.56 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 2.714406E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.746 | TFLOPs: 43.45 | +7: iteration 57800/ 115203 | consumed samples: 14796800 | consumed tokens: 30303846400 | elapsed time per iteration (s): 0.55 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 2.686285E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.527 | TFLOPs: 44.00 | +7: iteration 57810/ 115203 | consumed samples: 14799360 | consumed tokens: 30309089280 | elapsed time per iteration (s): 0.57 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 2.694109E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.292 | TFLOPs: 42.55 | +7: iteration 57820/ 115203 | consumed samples: 14801920 | consumed tokens: 30314332160 | elapsed time per iteration (s): 0.57 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 2.700593E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.728 | TFLOPs: 43.07 | +7: iteration 57830/ 115203 | consumed samples: 14804480 | consumed tokens: 30319575040 | elapsed time per iteration (s): 0.55 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 2.692085E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.556 | TFLOPs: 44.00 | +7: iteration 57840/ 115203 | consumed samples: 14807040 | consumed tokens: 30324817920 | elapsed time per iteration (s): 0.56 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 2.718667E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.754 | TFLOPs: 43.64 | +7: iteration 57850/ 115203 | consumed samples: 14809600 | consumed tokens: 30330060800 | elapsed time per iteration (s): 0.55 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 2.705788E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.455 | TFLOPs: 43.99 | +7: iteration 57860/ 115203 | consumed samples: 14812160 | consumed tokens: 30335303680 | elapsed time per iteration (s): 0.57 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 2.709441E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.978 | TFLOPs: 42.90 | +7: iteration 57870/ 115203 | consumed samples: 14814720 | consumed tokens: 30340546560 | elapsed time per iteration (s): 0.55 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 2.698663E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 57880/ 115203 | consumed samples: 14817280 | consumed tokens: 30345789440 | elapsed time per iteration (s): 0.55 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 2.714125E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 57890/ 115203 | consumed samples: 14819840 | consumed tokens: 30351032320 | elapsed time per iteration (s): 0.56 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 2.705610E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.344 | TFLOPs: 43.60 | +7: iteration 57900/ 115203 | consumed samples: 14822400 | consumed tokens: 30356275200 | elapsed time per iteration (s): 0.56 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 2.706852E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.726 | TFLOPs: 43.54 | +7: iteration 57910/ 115203 | consumed samples: 14824960 | consumed tokens: 30361518080 | elapsed time per iteration (s): 0.56 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 2.683441E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.097 | TFLOPs: 43.20 | +7: iteration 57920/ 115203 | consumed samples: 14827520 | consumed tokens: 30366760960 | elapsed time per iteration (s): 0.56 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 2.690640E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.958 | TFLOPs: 43.57 | +7: iteration 57930/ 115203 | consumed samples: 14830080 | consumed tokens: 30372003840 | elapsed time per iteration (s): 0.55 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 2.701000E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.497 | TFLOPs: 44.00 | +7: iteration 57940/ 115203 | consumed samples: 14832640 | consumed tokens: 30377246720 | elapsed time per iteration (s): 0.57 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 2.697166E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.852 | TFLOPs: 42.98 | +7: iteration 57950/ 115203 | consumed samples: 14835200 | consumed tokens: 30382489600 | elapsed time per iteration (s): 0.56 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 2.712639E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.249 | TFLOPs: 43.98 | +7: iteration 57960/ 115203 | consumed samples: 14837760 | consumed tokens: 30387732480 | elapsed time per iteration (s): 0.56 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 2.709992E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.836 | TFLOPs: 43.46 | +7: iteration 57970/ 115203 | consumed samples: 14840320 | consumed tokens: 30392975360 | elapsed time per iteration (s): 0.56 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 2.706347E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.390 | TFLOPs: 43.51 | +7: iteration 57980/ 115203 | consumed samples: 14842880 | consumed tokens: 30398218240 | elapsed time per iteration (s): 0.55 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 2.704590E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 57990/ 115203 | consumed samples: 14845440 | consumed tokens: 30403461120 | elapsed time per iteration (s): 0.55 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 2.700832E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +0: [2023-03-16 21:56:02,255] [INFO] [logging.py:68:log_dist] [Rank 0] step=58000, skipped=0, lr=[0.00011044114819593482, 0.00011044114819593482, 0.00011044114819593482], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 58000/ 115203 | consumed samples: 14848000 | consumed tokens: 30408704000 | elapsed time per iteration (s): 0.56 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 2.705225E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.775 | TFLOPs: 43.55 | +0: steps: 58000 loss: 2.6837 iter time (s): 0.562 samples/sec: 455.444 +7: iteration 58010/ 115203 | consumed samples: 14850560 | consumed tokens: 30413946880 | elapsed time per iteration (s): 0.56 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 2.688153E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.934 | TFLOPs: 43.66 | +7: iteration 58020/ 115203 | consumed samples: 14853120 | consumed tokens: 30419189760 | elapsed time per iteration (s): 0.56 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 2.709022E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.475 | TFLOPs: 43.52 | +7: iteration 58030/ 115203 | consumed samples: 14855680 | consumed tokens: 30424432640 | elapsed time per iteration (s): 0.56 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 2.697680E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.277 | TFLOPs: 43.31 | +7: iteration 58040/ 115203 | consumed samples: 14858240 | consumed tokens: 30429675520 | elapsed time per iteration (s): 0.55 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 2.696210E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.355 | TFLOPs: 43.99 | +7: iteration 58050/ 115203 | consumed samples: 14860800 | consumed tokens: 30434918400 | elapsed time per iteration (s): 0.56 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 2.702400E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.815 | TFLOPs: 43.46 | +7: iteration 58060/ 115203 | consumed samples: 14863360 | consumed tokens: 30440161280 | elapsed time per iteration (s): 0.55 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 2.701382E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.447 | TFLOPs: 43.99 | +7: iteration 58070/ 115203 | consumed samples: 14865920 | consumed tokens: 30445404160 | elapsed time per iteration (s): 0.56 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 2.711205E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.352 | TFLOPs: 43.70 | +7: iteration 58080/ 115203 | consumed samples: 14868480 | consumed tokens: 30450647040 | elapsed time per iteration (s): 0.56 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 2.707970E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.866 | TFLOPs: 43.37 | +7: iteration 58090/ 115203 | consumed samples: 14871040 | consumed tokens: 30455889920 | elapsed time per iteration (s): 0.55 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 2.705371E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 58100/ 115203 | consumed samples: 14873600 | consumed tokens: 30461132800 | elapsed time per iteration (s): 0.56 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 2.703474E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.047 | TFLOPs: 43.86 | +7: iteration 58110/ 115203 | consumed samples: 14876160 | consumed tokens: 30466375680 | elapsed time per iteration (s): 0.55 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 2.696562E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.296 | TFLOPs: 43.98 | +7: iteration 58120/ 115203 | consumed samples: 14878720 | consumed tokens: 30471618560 | elapsed time per iteration (s): 0.56 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 2.682089E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.952 | TFLOPs: 43.66 | +7: iteration 58130/ 115203 | consumed samples: 14881280 | consumed tokens: 30476861440 | elapsed time per iteration (s): 0.56 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 2.703902E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.732 | TFLOPs: 43.45 | +7: iteration 58140/ 115203 | consumed samples: 14883840 | consumed tokens: 30482104320 | elapsed time per iteration (s): 0.56 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 2.707093E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.225 | TFLOPs: 43.97 | +7: iteration 58150/ 115203 | consumed samples: 14886400 | consumed tokens: 30487347200 | elapsed time per iteration (s): 0.55 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 2.699291E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.545 | TFLOPs: 44.00 | +7: iteration 58160/ 115203 | consumed samples: 14888960 | consumed tokens: 30492590080 | elapsed time per iteration (s): 0.55 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 2.701418E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 58170/ 115203 | consumed samples: 14891520 | consumed tokens: 30497832960 | elapsed time per iteration (s): 0.57 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 2.699354E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.863 | TFLOPs: 43.18 | +7: iteration 58180/ 115203 | consumed samples: 14894080 | consumed tokens: 30503075840 | elapsed time per iteration (s): 0.59 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 2.691835E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.719 | TFLOPs: 41.35 | +7: iteration 58190/ 115203 | consumed samples: 14896640 | consumed tokens: 30508318720 | elapsed time per iteration (s): 0.55 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 2.704885E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.804 | TFLOPs: 44.03 | +7: iteration 58200/ 115203 | consumed samples: 14899200 | consumed tokens: 30513561600 | elapsed time per iteration (s): 0.56 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 2.702815E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.388 | TFLOPs: 43.70 | +7: iteration 58210/ 115203 | consumed samples: 14901760 | consumed tokens: 30518804480 | elapsed time per iteration (s): 0.55 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 2.694332E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.432 | TFLOPs: 43.99 | +7: iteration 58220/ 115203 | consumed samples: 14904320 | consumed tokens: 30524047360 | elapsed time per iteration (s): 0.55 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 2.696771E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 58230/ 115203 | consumed samples: 14906880 | consumed tokens: 30529290240 | elapsed time per iteration (s): 0.55 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 2.698249E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.581 | TFLOPs: 44.01 | +7: iteration 58240/ 115203 | consumed samples: 14909440 | consumed tokens: 30534533120 | elapsed time per iteration (s): 0.55 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 2.710230E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.695 | TFLOPs: 44.02 | +7: iteration 58250/ 115203 | consumed samples: 14912000 | consumed tokens: 30539776000 | elapsed time per iteration (s): 0.55 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 2.692024E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 58260/ 115203 | consumed samples: 14914560 | consumed tokens: 30545018880 | elapsed time per iteration (s): 0.55 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 2.711028E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.646 | TFLOPs: 44.01 | +7: iteration 58270/ 115203 | consumed samples: 14917120 | consumed tokens: 30550261760 | elapsed time per iteration (s): 0.55 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 2.716247E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.434 | TFLOPs: 43.99 | +7: iteration 58280/ 115203 | consumed samples: 14919680 | consumed tokens: 30555504640 | elapsed time per iteration (s): 0.56 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 2.697999E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.132 | TFLOPs: 43.49 | +7: iteration 58290/ 115203 | consumed samples: 14922240 | consumed tokens: 30560747520 | elapsed time per iteration (s): 0.56 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 2.699884E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.875 | TFLOPs: 43.37 | +7: iteration 58300/ 115203 | consumed samples: 14924800 | consumed tokens: 30565990400 | elapsed time per iteration (s): 0.56 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 2.705746E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.209 | TFLOPs: 43.30 | +7: iteration 58310/ 115203 | consumed samples: 14927360 | consumed tokens: 30571233280 | elapsed time per iteration (s): 0.56 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 2.694608E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.203 | TFLOPs: 43.97 | +7: iteration 58320/ 115203 | consumed samples: 14929920 | consumed tokens: 30576476160 | elapsed time per iteration (s): 0.56 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 2.699342E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.189 | TFLOPs: 43.97 | +7: iteration 58330/ 115203 | consumed samples: 14932480 | consumed tokens: 30581719040 | elapsed time per iteration (s): 0.55 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 2.705303E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.502 | TFLOPs: 44.00 | +7: iteration 58340/ 115203 | consumed samples: 14935040 | consumed tokens: 30586961920 | elapsed time per iteration (s): 0.55 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 2.698027E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.265 | TFLOPs: 43.98 | +7: iteration 58350/ 115203 | consumed samples: 14937600 | consumed tokens: 30592204800 | elapsed time per iteration (s): 0.55 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 2.690545E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 58360/ 115203 | consumed samples: 14940160 | consumed tokens: 30597447680 | elapsed time per iteration (s): 0.56 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 2.702903E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.213 | TFLOPs: 43.97 | +7: iteration 58370/ 115203 | consumed samples: 14942720 | consumed tokens: 30602690560 | elapsed time per iteration (s): 0.56 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 2.700658E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.101 | TFLOPs: 43.96 | +7: iteration 58380/ 115203 | consumed samples: 14945280 | consumed tokens: 30607933440 | elapsed time per iteration (s): 0.56 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 2.700843E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 58390/ 115203 | consumed samples: 14947840 | consumed tokens: 30613176320 | elapsed time per iteration (s): 0.56 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 2.710371E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.018 | TFLOPs: 43.95 | +7: iteration 58400/ 115203 | consumed samples: 14950400 | consumed tokens: 30618419200 | elapsed time per iteration (s): 0.56 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 2.708649E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.198 | TFLOPs: 43.97 | +7: iteration 58410/ 115203 | consumed samples: 14952960 | consumed tokens: 30623662080 | elapsed time per iteration (s): 0.56 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 2.700102E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.864 | TFLOPs: 43.84 | +7: iteration 58420/ 115203 | consumed samples: 14955520 | consumed tokens: 30628904960 | elapsed time per iteration (s): 0.56 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 2.692964E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.092 | TFLOPs: 43.96 | +7: iteration 58430/ 115203 | consumed samples: 14958080 | consumed tokens: 30634147840 | elapsed time per iteration (s): 0.56 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 2.704230E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.087 | TFLOPs: 43.96 | +7: iteration 58440/ 115203 | consumed samples: 14960640 | consumed tokens: 30639390720 | elapsed time per iteration (s): 0.56 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 2.701191E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 58450/ 115203 | consumed samples: 14963200 | consumed tokens: 30644633600 | elapsed time per iteration (s): 0.56 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 2.693056E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.131 | TFLOPs: 43.96 | +7: iteration 58460/ 115203 | consumed samples: 14965760 | consumed tokens: 30649876480 | elapsed time per iteration (s): 0.56 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 2.688375E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.203 | TFLOPs: 43.97 | +7: iteration 58470/ 115203 | consumed samples: 14968320 | consumed tokens: 30655119360 | elapsed time per iteration (s): 0.56 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 2.718699E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.125 | TFLOPs: 43.96 | +7: iteration 58480/ 115203 | consumed samples: 14970880 | consumed tokens: 30660362240 | elapsed time per iteration (s): 0.56 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 2.705060E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.255 | TFLOPs: 43.98 | +7: iteration 58490/ 115203 | consumed samples: 14973440 | consumed tokens: 30665605120 | elapsed time per iteration (s): 0.56 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 2.713099E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.079 | TFLOPs: 43.96 | +7: iteration 58500/ 115203 | consumed samples: 14976000 | consumed tokens: 30670848000 | elapsed time per iteration (s): 0.56 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 2.678547E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.095 | TFLOPs: 43.96 | +7: iteration 58510/ 115203 | consumed samples: 14978560 | consumed tokens: 30676090880 | elapsed time per iteration (s): 0.56 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 2.707023E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.091 | TFLOPs: 43.96 | +7: iteration 58520/ 115203 | consumed samples: 14981120 | consumed tokens: 30681333760 | elapsed time per iteration (s): 0.56 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 2.685250E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.919 | TFLOPs: 43.85 | +7: iteration 58530/ 115203 | consumed samples: 14983680 | consumed tokens: 30686576640 | elapsed time per iteration (s): 0.56 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 2.705868E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.113 | TFLOPs: 43.96 | +7: iteration 58540/ 115203 | consumed samples: 14986240 | consumed tokens: 30691819520 | elapsed time per iteration (s): 0.56 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 2.707729E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.876 | TFLOPs: 43.94 | +7: iteration 58550/ 115203 | consumed samples: 14988800 | consumed tokens: 30697062400 | elapsed time per iteration (s): 0.56 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 2.702496E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.928 | TFLOPs: 43.94 | +7: iteration 58560/ 115203 | consumed samples: 14991360 | consumed tokens: 30702305280 | elapsed time per iteration (s): 0.56 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 2.691886E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.007 | TFLOPs: 43.95 | +7: iteration 58570/ 115203 | consumed samples: 14993920 | consumed tokens: 30707548160 | elapsed time per iteration (s): 0.55 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 2.695651E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.292 | TFLOPs: 43.98 | +7: iteration 58580/ 115203 | consumed samples: 14996480 | consumed tokens: 30712791040 | elapsed time per iteration (s): 0.55 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 2.696296E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.279 | TFLOPs: 43.98 | +7: iteration 58590/ 115203 | consumed samples: 14999040 | consumed tokens: 30718033920 | elapsed time per iteration (s): 0.56 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 2.715412E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.171 | TFLOPs: 43.97 | +7: iteration 58600/ 115203 | consumed samples: 15001600 | consumed tokens: 30723276800 | elapsed time per iteration (s): 0.56 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 2.695825E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.183 | TFLOPs: 43.97 | +7: iteration 58610/ 115203 | consumed samples: 15004160 | consumed tokens: 30728519680 | elapsed time per iteration (s): 0.55 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 2.698070E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.269 | TFLOPs: 43.98 | +7: iteration 58620/ 115203 | consumed samples: 15006720 | consumed tokens: 30733762560 | elapsed time per iteration (s): 0.56 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 2.698632E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.142 | TFLOPs: 43.96 | +7: iteration 58630/ 115203 | consumed samples: 15009280 | consumed tokens: 30739005440 | elapsed time per iteration (s): 0.56 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 2.705881E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.162 | TFLOPs: 43.97 | +7: iteration 58640/ 115203 | consumed samples: 15011840 | consumed tokens: 30744248320 | elapsed time per iteration (s): 0.56 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 2.703743E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.165 | TFLOPs: 43.97 | +7: iteration 58650/ 115203 | consumed samples: 15014400 | consumed tokens: 30749491200 | elapsed time per iteration (s): 0.55 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 2.694822E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.282 | TFLOPs: 43.98 | +7: iteration 58660/ 115203 | consumed samples: 15016960 | consumed tokens: 30754734080 | elapsed time per iteration (s): 0.56 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 2.701378E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.049 | TFLOPs: 43.96 | +7: iteration 58670/ 115203 | consumed samples: 15019520 | consumed tokens: 30759976960 | elapsed time per iteration (s): 0.56 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 2.706971E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.996 | TFLOPs: 43.95 | +7: iteration 58680/ 115203 | consumed samples: 15022080 | consumed tokens: 30765219840 | elapsed time per iteration (s): 0.56 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 2.688097E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 58690/ 115203 | consumed samples: 15024640 | consumed tokens: 30770462720 | elapsed time per iteration (s): 0.56 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 2.705499E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.060 | TFLOPs: 43.96 | +7: iteration 58700/ 115203 | consumed samples: 15027200 | consumed tokens: 30775705600 | elapsed time per iteration (s): 0.56 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 2.699299E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.111 | TFLOPs: 43.96 | +7: iteration 58710/ 115203 | consumed samples: 15029760 | consumed tokens: 30780948480 | elapsed time per iteration (s): 0.56 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 2.703244E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.334 | TFLOPs: 43.51 | +7: iteration 58720/ 115203 | consumed samples: 15032320 | consumed tokens: 30786191360 | elapsed time per iteration (s): 0.56 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 2.707960E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.060 | TFLOPs: 43.96 | +7: iteration 58730/ 115203 | consumed samples: 15034880 | consumed tokens: 30791434240 | elapsed time per iteration (s): 0.56 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 2.690956E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.719 | TFLOPs: 43.45 | +7: iteration 58740/ 115203 | consumed samples: 15037440 | consumed tokens: 30796677120 | elapsed time per iteration (s): 0.56 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 2.693535E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.765 | TFLOPs: 43.45 | +7: iteration 58750/ 115203 | consumed samples: 15040000 | consumed tokens: 30801920000 | elapsed time per iteration (s): 0.56 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 2.693975E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.868 | TFLOPs: 43.56 | +7: iteration 58760/ 115203 | consumed samples: 15042560 | consumed tokens: 30807162880 | elapsed time per iteration (s): 0.56 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 2.704682E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.151 | TFLOPs: 43.97 | +7: iteration 58770/ 115203 | consumed samples: 15045120 | consumed tokens: 30812405760 | elapsed time per iteration (s): 0.55 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 2.687854E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 58780/ 115203 | consumed samples: 15047680 | consumed tokens: 30817648640 | elapsed time per iteration (s): 0.55 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 2.683390E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.321 | TFLOPs: 43.98 | +7: iteration 58790/ 115203 | consumed samples: 15050240 | consumed tokens: 30822891520 | elapsed time per iteration (s): 0.56 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 2.704658E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 58800/ 115203 | consumed samples: 15052800 | consumed tokens: 30828134400 | elapsed time per iteration (s): 0.56 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 2.700645E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.539 | TFLOPs: 43.62 | +7: iteration 58810/ 115203 | consumed samples: 15055360 | consumed tokens: 30833377280 | elapsed time per iteration (s): 0.56 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 2.713684E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.254 | TFLOPs: 43.88 | +7: iteration 58820/ 115203 | consumed samples: 15057920 | consumed tokens: 30838620160 | elapsed time per iteration (s): 0.55 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 2.708454E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.357 | TFLOPs: 43.99 | +7: iteration 58830/ 115203 | consumed samples: 15060480 | consumed tokens: 30843863040 | elapsed time per iteration (s): 0.56 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 2.698712E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 58840/ 115203 | consumed samples: 15063040 | consumed tokens: 30849105920 | elapsed time per iteration (s): 0.56 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 2.683948E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 58850/ 115203 | consumed samples: 15065600 | consumed tokens: 30854348800 | elapsed time per iteration (s): 0.56 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 2.683885E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.540 | TFLOPs: 43.62 | +7: iteration 58860/ 115203 | consumed samples: 15068160 | consumed tokens: 30859591680 | elapsed time per iteration (s): 0.56 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 2.705069E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 58870/ 115203 | consumed samples: 15070720 | consumed tokens: 30864834560 | elapsed time per iteration (s): 0.55 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 2.696479E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.419 | TFLOPs: 43.99 | +7: iteration 58880/ 115203 | consumed samples: 15073280 | consumed tokens: 30870077440 | elapsed time per iteration (s): 0.56 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 2.695641E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.263 | TFLOPs: 43.21 | +7: iteration 58890/ 115203 | consumed samples: 15075840 | consumed tokens: 30875320320 | elapsed time per iteration (s): 0.55 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 2.695289E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.296 | TFLOPs: 43.98 | +7: iteration 58900/ 115203 | consumed samples: 15078400 | consumed tokens: 30880563200 | elapsed time per iteration (s): 0.56 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 2.705125E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 58910/ 115203 | consumed samples: 15080960 | consumed tokens: 30885806080 | elapsed time per iteration (s): 0.55 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 2.691187E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.347 | TFLOPs: 43.98 | +7: iteration 58920/ 115203 | consumed samples: 15083520 | consumed tokens: 30891048960 | elapsed time per iteration (s): 0.56 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 2.699094E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.196 | TFLOPs: 43.97 | +7: iteration 58930/ 115203 | consumed samples: 15086080 | consumed tokens: 30896291840 | elapsed time per iteration (s): 0.56 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 2.693417E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.089 | TFLOPs: 43.96 | +7: iteration 58940/ 115203 | consumed samples: 15088640 | consumed tokens: 30901534720 | elapsed time per iteration (s): 0.56 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 2.683731E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.204 | TFLOPs: 43.97 | +7: iteration 58950/ 115203 | consumed samples: 15091200 | consumed tokens: 30906777600 | elapsed time per iteration (s): 0.55 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 2.682815E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 58960/ 115203 | consumed samples: 15093760 | consumed tokens: 30912020480 | elapsed time per iteration (s): 0.56 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 2.703681E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.228 | TFLOPs: 43.97 | +7: iteration 58970/ 115203 | consumed samples: 15096320 | consumed tokens: 30917263360 | elapsed time per iteration (s): 0.56 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 2.712127E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.245 | TFLOPs: 43.97 | +7: iteration 58980/ 115203 | consumed samples: 15098880 | consumed tokens: 30922506240 | elapsed time per iteration (s): 0.55 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 2.694996E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +7: iteration 58990/ 115203 | consumed samples: 15101440 | consumed tokens: 30927749120 | elapsed time per iteration (s): 0.55 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 2.694697E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.503 | TFLOPs: 44.00 | +7: iteration 59000/ 115203 | consumed samples: 15104000 | consumed tokens: 30932992000 | elapsed time per iteration (s): 0.56 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 2.693843E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.230 | TFLOPs: 43.97 | +7: iteration 59010/ 115203 | consumed samples: 15106560 | consumed tokens: 30938234880 | elapsed time per iteration (s): 0.55 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 2.687754E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 59020/ 115203 | consumed samples: 15109120 | consumed tokens: 30943477760 | elapsed time per iteration (s): 0.56 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 2.694257E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.078 | TFLOPs: 43.96 | +7: iteration 59030/ 115203 | consumed samples: 15111680 | consumed tokens: 30948720640 | elapsed time per iteration (s): 0.56 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 2.701138E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.063 | TFLOPs: 43.96 | +7: iteration 59040/ 115203 | consumed samples: 15114240 | consumed tokens: 30953963520 | elapsed time per iteration (s): 0.56 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 2.685006E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.107 | TFLOPs: 43.96 | +7: iteration 59050/ 115203 | consumed samples: 15116800 | consumed tokens: 30959206400 | elapsed time per iteration (s): 0.56 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 2.712136E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 59060/ 115203 | consumed samples: 15119360 | consumed tokens: 30964449280 | elapsed time per iteration (s): 0.55 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 2.690031E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.355 | TFLOPs: 43.99 | +7: iteration 59070/ 115203 | consumed samples: 15121920 | consumed tokens: 30969692160 | elapsed time per iteration (s): 0.55 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 2.687663E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 59080/ 115203 | consumed samples: 15124480 | consumed tokens: 30974935040 | elapsed time per iteration (s): 0.55 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 2.695229E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.487 | TFLOPs: 44.00 | +7: iteration 59090/ 115203 | consumed samples: 15127040 | consumed tokens: 30980177920 | elapsed time per iteration (s): 0.55 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 2.691792E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.354 | TFLOPs: 43.99 | +7: iteration 59100/ 115203 | consumed samples: 15129600 | consumed tokens: 30985420800 | elapsed time per iteration (s): 0.56 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 2.688830E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.033 | TFLOPs: 43.29 | +7: iteration 59110/ 115203 | consumed samples: 15132160 | consumed tokens: 30990663680 | elapsed time per iteration (s): 0.56 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 2.711674E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.567 | TFLOPs: 43.34 | +7: iteration 59120/ 115203 | consumed samples: 15134720 | consumed tokens: 30995906560 | elapsed time per iteration (s): 0.55 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 2.707639E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.315 | TFLOPs: 43.98 | +7: iteration 59130/ 115203 | consumed samples: 15137280 | consumed tokens: 31001149440 | elapsed time per iteration (s): 0.56 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 2.690389E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.980 | TFLOPs: 43.95 | +7: iteration 59140/ 115203 | consumed samples: 15139840 | consumed tokens: 31006392320 | elapsed time per iteration (s): 0.56 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 2.698915E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.997 | TFLOPs: 43.38 | +7: iteration 59150/ 115203 | consumed samples: 15142400 | consumed tokens: 31011635200 | elapsed time per iteration (s): 0.57 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 2.693888E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.769 | TFLOPs: 42.88 | +7: iteration 59160/ 115203 | consumed samples: 15144960 | consumed tokens: 31016878080 | elapsed time per iteration (s): 0.55 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 2.675019E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.345 | TFLOPs: 43.98 | +7: iteration 59170/ 115203 | consumed samples: 15147520 | consumed tokens: 31022120960 | elapsed time per iteration (s): 0.56 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 2.705513E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.145 | TFLOPs: 43.97 | +7: iteration 59180/ 115203 | consumed samples: 15150080 | consumed tokens: 31027363840 | elapsed time per iteration (s): 0.57 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 2.698624E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.329 | TFLOPs: 43.12 | +7: iteration 59190/ 115203 | consumed samples: 15152640 | consumed tokens: 31032606720 | elapsed time per iteration (s): 0.55 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 2.709767E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 59200/ 115203 | consumed samples: 15155200 | consumed tokens: 31037849600 | elapsed time per iteration (s): 0.56 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 2.699011E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.108 | TFLOPs: 43.96 | +7: iteration 59210/ 115203 | consumed samples: 15157760 | consumed tokens: 31043092480 | elapsed time per iteration (s): 0.55 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 2.703048E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.332 | TFLOPs: 43.98 | +7: iteration 59220/ 115203 | consumed samples: 15160320 | consumed tokens: 31048335360 | elapsed time per iteration (s): 0.55 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 2.687033E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.431 | TFLOPs: 43.99 | +7: iteration 59230/ 115203 | consumed samples: 15162880 | consumed tokens: 31053578240 | elapsed time per iteration (s): 0.55 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 2.681395E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 59240/ 115203 | consumed samples: 15165440 | consumed tokens: 31058821120 | elapsed time per iteration (s): 0.55 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 2.684217E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.351 | TFLOPs: 43.98 | +7: iteration 59250/ 115203 | consumed samples: 15168000 | consumed tokens: 31064064000 | elapsed time per iteration (s): 0.56 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 2.692536E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 59260/ 115203 | consumed samples: 15170560 | consumed tokens: 31069306880 | elapsed time per iteration (s): 0.55 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 2.694686E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.406 | TFLOPs: 43.99 | +7: iteration 59270/ 115203 | consumed samples: 15173120 | consumed tokens: 31074549760 | elapsed time per iteration (s): 0.56 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 2.690018E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.266 | TFLOPs: 43.88 | +7: iteration 59280/ 115203 | consumed samples: 15175680 | consumed tokens: 31079792640 | elapsed time per iteration (s): 0.55 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 2.697525E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 59290/ 115203 | consumed samples: 15178240 | consumed tokens: 31085035520 | elapsed time per iteration (s): 0.56 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 2.685817E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.257 | TFLOPs: 43.98 | +7: iteration 59300/ 115203 | consumed samples: 15180800 | consumed tokens: 31090278400 | elapsed time per iteration (s): 0.56 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 2.687048E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +7: iteration 59310/ 115203 | consumed samples: 15183360 | consumed tokens: 31095521280 | elapsed time per iteration (s): 0.56 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 2.683331E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.437 | TFLOPs: 43.23 | +7: iteration 59320/ 115203 | consumed samples: 15185920 | consumed tokens: 31100764160 | elapsed time per iteration (s): 0.56 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 2.687833E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.160 | TFLOPs: 43.49 | +7: iteration 59330/ 115203 | consumed samples: 15188480 | consumed tokens: 31106007040 | elapsed time per iteration (s): 0.56 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 2.692393E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.987 | TFLOPs: 43.95 | +7: iteration 59340/ 115203 | consumed samples: 15191040 | consumed tokens: 31111249920 | elapsed time per iteration (s): 0.56 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 2.688150E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.208 | TFLOPs: 43.97 | +7: iteration 59350/ 115203 | consumed samples: 15193600 | consumed tokens: 31116492800 | elapsed time per iteration (s): 0.56 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 2.688038E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.164 | TFLOPs: 43.97 | +7: iteration 59360/ 115203 | consumed samples: 15196160 | consumed tokens: 31121735680 | elapsed time per iteration (s): 0.56 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 2.695633E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.320 | TFLOPs: 43.51 | +7: iteration 59370/ 115203 | consumed samples: 15198720 | consumed tokens: 31126978560 | elapsed time per iteration (s): 0.56 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 2.685549E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.096 | TFLOPs: 43.96 | +7: iteration 59380/ 115203 | consumed samples: 15201280 | consumed tokens: 31132221440 | elapsed time per iteration (s): 0.56 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 2.694546E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 59390/ 115203 | consumed samples: 15203840 | consumed tokens: 31137464320 | elapsed time per iteration (s): 0.56 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 2.688538E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.027 | TFLOPs: 43.95 | +7: iteration 59400/ 115203 | consumed samples: 15206400 | consumed tokens: 31142707200 | elapsed time per iteration (s): 0.55 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 2.699591E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 59410/ 115203 | consumed samples: 15208960 | consumed tokens: 31147950080 | elapsed time per iteration (s): 0.56 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 2.694321E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.123 | TFLOPs: 43.96 | +7: iteration 59420/ 115203 | consumed samples: 15211520 | consumed tokens: 31153192960 | elapsed time per iteration (s): 0.56 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 2.704034E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.058 | TFLOPs: 43.96 | +7: iteration 59430/ 115203 | consumed samples: 15214080 | consumed tokens: 31158435840 | elapsed time per iteration (s): 0.56 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 2.690367E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.882 | TFLOPs: 43.94 | +7: iteration 59440/ 115203 | consumed samples: 15216640 | consumed tokens: 31163678720 | elapsed time per iteration (s): 0.56 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 2.690200E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.983 | TFLOPs: 43.95 | +7: iteration 59450/ 115203 | consumed samples: 15219200 | consumed tokens: 31168921600 | elapsed time per iteration (s): 0.56 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 2.688486E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.007 | TFLOPs: 43.95 | +7: iteration 59460/ 115203 | consumed samples: 15221760 | consumed tokens: 31174164480 | elapsed time per iteration (s): 0.56 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 2.704012E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.125 | TFLOPs: 43.96 | +7: iteration 59470/ 115203 | consumed samples: 15224320 | consumed tokens: 31179407360 | elapsed time per iteration (s): 0.56 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 2.684128E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.123 | TFLOPs: 43.96 | +7: iteration 59480/ 115203 | consumed samples: 15226880 | consumed tokens: 31184650240 | elapsed time per iteration (s): 0.56 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 2.682532E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.931 | TFLOPs: 43.85 | +7: iteration 59490/ 115203 | consumed samples: 15229440 | consumed tokens: 31189893120 | elapsed time per iteration (s): 0.56 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 2.692761E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.138 | TFLOPs: 43.96 | +7: iteration 59500/ 115203 | consumed samples: 15232000 | consumed tokens: 31195136000 | elapsed time per iteration (s): 0.56 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 2.693796E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.080 | TFLOPs: 43.96 | +7: iteration 59510/ 115203 | consumed samples: 15234560 | consumed tokens: 31200378880 | elapsed time per iteration (s): 0.56 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 2.712691E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.252 | TFLOPs: 43.98 | +7: iteration 59520/ 115203 | consumed samples: 15237120 | consumed tokens: 31205621760 | elapsed time per iteration (s): 0.56 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 2.687629E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.207 | TFLOPs: 43.97 | +7: iteration 59530/ 115203 | consumed samples: 15239680 | consumed tokens: 31210864640 | elapsed time per iteration (s): 0.55 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 2.699332E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.309 | TFLOPs: 43.98 | +7: iteration 59540/ 115203 | consumed samples: 15242240 | consumed tokens: 31216107520 | elapsed time per iteration (s): 0.56 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 2.689273E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.781 | TFLOPs: 43.45 | +7: iteration 59550/ 115203 | consumed samples: 15244800 | consumed tokens: 31221350400 | elapsed time per iteration (s): 0.56 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 2.690442E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.062 | TFLOPs: 43.96 | +7: iteration 59560/ 115203 | consumed samples: 15247360 | consumed tokens: 31226593280 | elapsed time per iteration (s): 0.56 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 2.689220E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.093 | TFLOPs: 43.96 | +7: iteration 59570/ 115203 | consumed samples: 15249920 | consumed tokens: 31231836160 | elapsed time per iteration (s): 0.56 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 2.704188E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.105 | TFLOPs: 43.96 | +7: iteration 59580/ 115203 | consumed samples: 15252480 | consumed tokens: 31237079040 | elapsed time per iteration (s): 0.56 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 2.694516E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.101 | TFLOPs: 43.96 | +7: iteration 59590/ 115203 | consumed samples: 15255040 | consumed tokens: 31242321920 | elapsed time per iteration (s): 0.56 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 2.691407E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 59600/ 115203 | consumed samples: 15257600 | consumed tokens: 31247564800 | elapsed time per iteration (s): 0.58 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 2.693419E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.786 | TFLOPs: 42.31 | +7: iteration 59610/ 115203 | consumed samples: 15260160 | consumed tokens: 31252807680 | elapsed time per iteration (s): 0.56 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 2.691481E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.201 | TFLOPs: 43.97 | +7: iteration 59620/ 115203 | consumed samples: 15262720 | consumed tokens: 31258050560 | elapsed time per iteration (s): 0.56 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 2.693061E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.981 | TFLOPs: 43.95 | +7: iteration 59630/ 115203 | consumed samples: 15265280 | consumed tokens: 31263293440 | elapsed time per iteration (s): 0.56 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 2.680092E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.052 | TFLOPs: 43.96 | +7: iteration 59640/ 115203 | consumed samples: 15267840 | consumed tokens: 31268536320 | elapsed time per iteration (s): 0.56 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 2.689649E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.139 | TFLOPs: 43.96 | +7: iteration 59650/ 115203 | consumed samples: 15270400 | consumed tokens: 31273779200 | elapsed time per iteration (s): 0.56 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 2.695690E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.077 | TFLOPs: 43.96 | +7: iteration 59660/ 115203 | consumed samples: 15272960 | consumed tokens: 31279022080 | elapsed time per iteration (s): 0.56 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 2.694164E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.814 | TFLOPs: 43.93 | +7: iteration 59670/ 115203 | consumed samples: 15275520 | consumed tokens: 31284264960 | elapsed time per iteration (s): 0.56 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 2.697012E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.074 | TFLOPs: 43.96 | +7: iteration 59680/ 115203 | consumed samples: 15278080 | consumed tokens: 31289507840 | elapsed time per iteration (s): 0.57 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 2.713587E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.076 | TFLOPs: 43.20 | +7: iteration 59690/ 115203 | consumed samples: 15280640 | consumed tokens: 31294750720 | elapsed time per iteration (s): 0.56 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 2.701821E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.158 | TFLOPs: 43.97 | +7: iteration 59700/ 115203 | consumed samples: 15283200 | consumed tokens: 31299993600 | elapsed time per iteration (s): 0.56 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 2.697814E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.013 | TFLOPs: 43.95 | +7: iteration 59710/ 115203 | consumed samples: 15285760 | consumed tokens: 31305236480 | elapsed time per iteration (s): 0.55 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 2.688553E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.312 | TFLOPs: 43.98 | +7: iteration 59720/ 115203 | consumed samples: 15288320 | consumed tokens: 31310479360 | elapsed time per iteration (s): 0.56 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 2.692186E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.179 | TFLOPs: 43.97 | +7: iteration 59730/ 115203 | consumed samples: 15290880 | consumed tokens: 31315722240 | elapsed time per iteration (s): 0.55 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 2.704043E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.370 | TFLOPs: 43.99 | +7: iteration 59740/ 115203 | consumed samples: 15293440 | consumed tokens: 31320965120 | elapsed time per iteration (s): 0.56 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 2.683537E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.957 | TFLOPs: 43.95 | +7: iteration 59750/ 115203 | consumed samples: 15296000 | consumed tokens: 31326208000 | elapsed time per iteration (s): 0.56 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 2.698483E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.251 | TFLOPs: 43.98 | +7: iteration 59760/ 115203 | consumed samples: 15298560 | consumed tokens: 31331450880 | elapsed time per iteration (s): 0.56 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 2.692543E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.180 | TFLOPs: 43.97 | +7: iteration 59770/ 115203 | consumed samples: 15301120 | consumed tokens: 31336693760 | elapsed time per iteration (s): 0.56 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 2.701117E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.244 | TFLOPs: 43.97 | +7: iteration 59780/ 115203 | consumed samples: 15303680 | consumed tokens: 31341936640 | elapsed time per iteration (s): 0.55 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 2.703915E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 59790/ 115203 | consumed samples: 15306240 | consumed tokens: 31347179520 | elapsed time per iteration (s): 0.55 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 2.704022E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.450 | TFLOPs: 43.99 | +7: iteration 59800/ 115203 | consumed samples: 15308800 | consumed tokens: 31352422400 | elapsed time per iteration (s): 0.55 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 2.680750E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.503 | TFLOPs: 44.00 | +7: iteration 59810/ 115203 | consumed samples: 15311360 | consumed tokens: 31357665280 | elapsed time per iteration (s): 0.55 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 2.701618E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.509 | TFLOPs: 44.00 | +7: iteration 59820/ 115203 | consumed samples: 15313920 | consumed tokens: 31362908160 | elapsed time per iteration (s): 0.55 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 2.697163E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.400 | TFLOPs: 43.99 | +7: iteration 59830/ 115203 | consumed samples: 15316480 | consumed tokens: 31368151040 | elapsed time per iteration (s): 0.55 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 2.695178E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 59840/ 115203 | consumed samples: 15319040 | consumed tokens: 31373393920 | elapsed time per iteration (s): 0.55 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 2.692605E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 59850/ 115203 | consumed samples: 15321600 | consumed tokens: 31378636800 | elapsed time per iteration (s): 0.55 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 2.689904E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 59860/ 115203 | consumed samples: 15324160 | consumed tokens: 31383879680 | elapsed time per iteration (s): 0.56 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 2.687174E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.196 | TFLOPs: 43.97 | +7: iteration 59870/ 115203 | consumed samples: 15326720 | consumed tokens: 31389122560 | elapsed time per iteration (s): 0.57 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 2.694293E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.962 | TFLOPs: 42.99 | +7: iteration 59880/ 115203 | consumed samples: 15329280 | consumed tokens: 31394365440 | elapsed time per iteration (s): 0.56 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 2.683715E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.235 | TFLOPs: 43.97 | +7: iteration 59890/ 115203 | consumed samples: 15331840 | consumed tokens: 31399608320 | elapsed time per iteration (s): 0.56 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 2.692243E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.099 | TFLOPs: 43.96 | +7: iteration 59900/ 115203 | consumed samples: 15334400 | consumed tokens: 31404851200 | elapsed time per iteration (s): 0.55 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 2.696792E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 59910/ 115203 | consumed samples: 15336960 | consumed tokens: 31410094080 | elapsed time per iteration (s): 0.55 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 2.693823E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.278 | TFLOPs: 43.98 | +7: iteration 59920/ 115203 | consumed samples: 15339520 | consumed tokens: 31415336960 | elapsed time per iteration (s): 0.56 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 2.699628E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.106 | TFLOPs: 43.96 | +7: iteration 59930/ 115203 | consumed samples: 15342080 | consumed tokens: 31420579840 | elapsed time per iteration (s): 0.56 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 2.697700E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.179 | TFLOPs: 43.97 | +7: iteration 59940/ 115203 | consumed samples: 15344640 | consumed tokens: 31425822720 | elapsed time per iteration (s): 0.56 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 2.699623E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 59950/ 115203 | consumed samples: 15347200 | consumed tokens: 31431065600 | elapsed time per iteration (s): 0.55 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 2.689497E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 59960/ 115203 | consumed samples: 15349760 | consumed tokens: 31436308480 | elapsed time per iteration (s): 0.55 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 2.694176E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 59970/ 115203 | consumed samples: 15352320 | consumed tokens: 31441551360 | elapsed time per iteration (s): 0.55 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 2.698023E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 59980/ 115203 | consumed samples: 15354880 | consumed tokens: 31446794240 | elapsed time per iteration (s): 0.55 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 2.688136E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.353 | TFLOPs: 43.98 | +7: iteration 59990/ 115203 | consumed samples: 15357440 | consumed tokens: 31452037120 | elapsed time per iteration (s): 0.55 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 2.702775E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.351 | TFLOPs: 43.98 | +0: [2023-03-16 22:14:35,269] [INFO] [logging.py:68:log_dist] [Rank 0] step=60000, skipped=0, lr=[0.00010548489040793946, 0.00010548489040793946, 0.00010548489040793946], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 60000/ 115203 | consumed samples: 15360000 | consumed tokens: 31457280000 | elapsed time per iteration (s): 0.56 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 2.696482E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.256 | TFLOPs: 43.98 | +0: steps: 60000 loss: 2.6563 iter time (s): 0.554 samples/sec: 462.118 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 60000 | lm loss value: 3.377162E+00 | lm loss PPL: 2.928755E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 60000 to checkpoints_421m60b400m +0: [2023-03-16 22:14:35,538] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step60000 is begin to save! +0: [2023-03-16 22:14:35,544] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_01-model_00-model_states.pt... +0: [2023-03-16 22:14:35,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_01-model_00-model_states.pt. +0: [2023-03-16 22:14:35,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_03-model_00-model_states.pt... +0: [2023-03-16 22:14:35,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_03-model_00-model_states.pt. +0: [2023-03-16 22:14:35,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_04-model_00-model_states.pt... +0: [2023-03-16 22:14:35,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_04-model_00-model_states.pt. +0: [2023-03-16 22:14:35,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_05-model_00-model_states.pt... +0: [2023-03-16 22:14:35,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_05-model_00-model_states.pt. +0: [2023-03-16 22:14:35,859] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_06-model_00-model_states.pt... +0: [2023-03-16 22:14:35,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_06-model_00-model_states.pt. +0: [2023-03-16 22:14:35,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_07-model_00-model_states.pt... +0: [2023-03-16 22:14:35,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_07-model_00-model_states.pt. +0: [2023-03-16 22:14:35,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_08-model_00-model_states.pt... +0: [2023-03-16 22:14:35,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_08-model_00-model_states.pt. +0: [2023-03-16 22:14:35,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_09-model_00-model_states.pt... +0: [2023-03-16 22:14:36,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_09-model_00-model_states.pt. +0: [2023-03-16 22:14:36,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_10-model_00-model_states.pt... +0: [2023-03-16 22:14:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_10-model_00-model_states.pt. +0: [2023-03-16 22:14:36,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_11-model_00-model_states.pt... +0: [2023-03-16 22:14:36,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_11-model_00-model_states.pt. +0: [2023-03-16 22:14:36,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_12-model_00-model_states.pt... +0: [2023-03-16 22:14:36,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_12-model_00-model_states.pt. +0: [2023-03-16 22:14:36,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_13-model_00-model_states.pt... +0: [2023-03-16 22:14:36,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_13-model_00-model_states.pt. +0: [2023-03-16 22:14:36,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_14-model_00-model_states.pt... +0: [2023-03-16 22:14:36,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_14-model_00-model_states.pt. +0: [2023-03-16 22:14:36,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_15-model_00-model_states.pt... +0: [2023-03-16 22:14:36,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_15-model_00-model_states.pt. +0: [2023-03-16 22:14:36,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_16-model_00-model_states.pt... +0: [2023-03-16 22:14:36,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_16-model_00-model_states.pt. +0: [2023-03-16 22:14:36,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_17-model_00-model_states.pt... +0: [2023-03-16 22:14:36,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_17-model_00-model_states.pt. +0: [2023-03-16 22:14:36,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_18-model_00-model_states.pt... +0: [2023-03-16 22:14:36,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_18-model_00-model_states.pt. +0: [2023-03-16 22:14:36,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_19-model_00-model_states.pt... +0: [2023-03-16 22:14:36,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_19-model_00-model_states.pt. +0: [2023-03-16 22:14:36,445] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_20-model_00-model_states.pt... +0: [2023-03-16 22:14:36,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_20-model_00-model_states.pt. +0: [2023-03-16 22:14:36,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/layer_22-model_00-model_states.pt... +0: [2023-03-16 22:14:36,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/layer_22-model_00-model_states.pt. +0: [2023-03-16 22:14:36,491] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step60000/mp_rank_00_model_states.pt +0: [2023-03-16 22:14:36,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/mp_rank_00_model_states.pt... +0: [2023-03-16 22:14:36,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/mp_rank_00_model_states.pt. +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +1: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +5: [2023-03-16 22:14:36,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +0: [2023-03-16 22:14:36,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-16 22:14:36,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-16 22:14:36,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-16 22:14:36,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 22:14:36,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-16 22:14:36,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-16 22:14:36,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-16 22:14:36,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-16 22:14:36,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-16 22:14:36,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 22:14:36,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-16 22:14:36,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: successfully saved checkpoint at iteration 60000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1292.42 +7: iteration 60010/ 115203 | consumed samples: 15362560 | consumed tokens: 31462522880 | elapsed time per iteration (s): 0.70 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 2.684386E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 364.913 | TFLOPs: 34.79 | +7: iteration 60020/ 115203 | consumed samples: 15365120 | consumed tokens: 31467765760 | elapsed time per iteration (s): 0.55 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 2.685463E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.601 | TFLOPs: 44.01 | +7: iteration 60030/ 115203 | consumed samples: 15367680 | consumed tokens: 31473008640 | elapsed time per iteration (s): 0.56 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 2.700828E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.281 | TFLOPs: 43.50 | +7: iteration 60040/ 115203 | consumed samples: 15370240 | consumed tokens: 31478251520 | elapsed time per iteration (s): 0.55 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 2.687364E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.527 | TFLOPs: 44.00 | +7: iteration 60050/ 115203 | consumed samples: 15372800 | consumed tokens: 31483494400 | elapsed time per iteration (s): 0.55 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 2.690602E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 60060/ 115203 | consumed samples: 15375360 | consumed tokens: 31488737280 | elapsed time per iteration (s): 0.55 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 2.686974E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.395 | TFLOPs: 43.99 | +7: iteration 60070/ 115203 | consumed samples: 15377920 | consumed tokens: 31493980160 | elapsed time per iteration (s): 0.55 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 2.693590E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.533 | TFLOPs: 44.00 | +7: iteration 60080/ 115203 | consumed samples: 15380480 | consumed tokens: 31499223040 | elapsed time per iteration (s): 0.55 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 2.705784E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 60090/ 115203 | consumed samples: 15383040 | consumed tokens: 31504465920 | elapsed time per iteration (s): 0.55 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 2.683861E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.473 | TFLOPs: 44.00 | +7: iteration 60100/ 115203 | consumed samples: 15385600 | consumed tokens: 31509708800 | elapsed time per iteration (s): 0.55 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 2.685516E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.434 | TFLOPs: 43.99 | +7: iteration 60110/ 115203 | consumed samples: 15388160 | consumed tokens: 31514951680 | elapsed time per iteration (s): 0.55 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 2.697166E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.568 | TFLOPs: 44.01 | +7: iteration 60120/ 115203 | consumed samples: 15390720 | consumed tokens: 31520194560 | elapsed time per iteration (s): 0.55 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 2.688153E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.578 | TFLOPs: 44.01 | +7: iteration 60130/ 115203 | consumed samples: 15393280 | consumed tokens: 31525437440 | elapsed time per iteration (s): 0.55 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 2.707254E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.695 | TFLOPs: 44.02 | +7: iteration 60140/ 115203 | consumed samples: 15395840 | consumed tokens: 31530680320 | elapsed time per iteration (s): 0.55 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 2.697392E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.627 | TFLOPs: 44.01 | +7: iteration 60150/ 115203 | consumed samples: 15398400 | consumed tokens: 31535923200 | elapsed time per iteration (s): 0.55 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 2.687683E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.560 | TFLOPs: 44.00 | +7: iteration 60160/ 115203 | consumed samples: 15400960 | consumed tokens: 31541166080 | elapsed time per iteration (s): 0.55 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 2.696431E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.411 | TFLOPs: 43.99 | +7: iteration 60170/ 115203 | consumed samples: 15403520 | consumed tokens: 31546408960 | elapsed time per iteration (s): 0.55 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 2.682937E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.371 | TFLOPs: 43.99 | +7: iteration 60180/ 115203 | consumed samples: 15406080 | consumed tokens: 31551651840 | elapsed time per iteration (s): 0.55 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 2.692440E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.560 | TFLOPs: 44.00 | +7: iteration 60190/ 115203 | consumed samples: 15408640 | consumed tokens: 31556894720 | elapsed time per iteration (s): 0.55 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 2.689573E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.708 | TFLOPs: 44.02 | +7: iteration 60200/ 115203 | consumed samples: 15411200 | consumed tokens: 31562137600 | elapsed time per iteration (s): 0.56 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 2.695785E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.908 | TFLOPs: 43.66 | +7: iteration 60210/ 115203 | consumed samples: 15413760 | consumed tokens: 31567380480 | elapsed time per iteration (s): 0.55 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 2.690280E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.628 | TFLOPs: 44.01 | +7: iteration 60220/ 115203 | consumed samples: 15416320 | consumed tokens: 31572623360 | elapsed time per iteration (s): 0.55 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 2.695795E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.579 | TFLOPs: 44.01 | +7: iteration 60230/ 115203 | consumed samples: 15418880 | consumed tokens: 31577866240 | elapsed time per iteration (s): 0.56 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 2.695057E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.998 | TFLOPs: 43.38 | +7: iteration 60240/ 115203 | consumed samples: 15421440 | consumed tokens: 31583109120 | elapsed time per iteration (s): 0.55 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 2.687812E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.510 | TFLOPs: 44.00 | +7: iteration 60250/ 115203 | consumed samples: 15424000 | consumed tokens: 31588352000 | elapsed time per iteration (s): 0.55 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 2.691287E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.619 | TFLOPs: 44.01 | +7: iteration 60260/ 115203 | consumed samples: 15426560 | consumed tokens: 31593594880 | elapsed time per iteration (s): 0.55 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 2.702839E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 60270/ 115203 | consumed samples: 15429120 | consumed tokens: 31598837760 | elapsed time per iteration (s): 0.55 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 2.702326E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.642 | TFLOPs: 44.01 | +7: iteration 60280/ 115203 | consumed samples: 15431680 | consumed tokens: 31604080640 | elapsed time per iteration (s): 0.55 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 2.680906E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.742 | TFLOPs: 44.02 | +7: iteration 60290/ 115203 | consumed samples: 15434240 | consumed tokens: 31609323520 | elapsed time per iteration (s): 0.55 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 2.692874E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.968 | TFLOPs: 44.04 | +7: iteration 60300/ 115203 | consumed samples: 15436800 | consumed tokens: 31614566400 | elapsed time per iteration (s): 0.55 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 2.687787E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.843 | TFLOPs: 44.03 | +7: iteration 60310/ 115203 | consumed samples: 15439360 | consumed tokens: 31619809280 | elapsed time per iteration (s): 0.55 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 2.681320E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.769 | TFLOPs: 44.02 | +7: iteration 60320/ 115203 | consumed samples: 15441920 | consumed tokens: 31625052160 | elapsed time per iteration (s): 0.55 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 2.692723E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.811 | TFLOPs: 44.03 | +7: iteration 60330/ 115203 | consumed samples: 15444480 | consumed tokens: 31630295040 | elapsed time per iteration (s): 0.55 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 2.691763E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.629 | TFLOPs: 44.01 | +7: iteration 60340/ 115203 | consumed samples: 15447040 | consumed tokens: 31635537920 | elapsed time per iteration (s): 0.55 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 2.707886E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.750 | TFLOPs: 44.02 | +7: iteration 60350/ 115203 | consumed samples: 15449600 | consumed tokens: 31640780800 | elapsed time per iteration (s): 0.56 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 2.693361E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.302 | TFLOPs: 43.60 | +7: iteration 60360/ 115203 | consumed samples: 15452160 | consumed tokens: 31646023680 | elapsed time per iteration (s): 0.55 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 2.692483E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.746 | TFLOPs: 44.02 | +7: iteration 60370/ 115203 | consumed samples: 15454720 | consumed tokens: 31651266560 | elapsed time per iteration (s): 0.55 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 2.696385E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.871 | TFLOPs: 44.03 | +7: iteration 60380/ 115203 | consumed samples: 15457280 | consumed tokens: 31656509440 | elapsed time per iteration (s): 0.56 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 2.686478E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.525 | TFLOPs: 43.62 | +7: iteration 60390/ 115203 | consumed samples: 15459840 | consumed tokens: 31661752320 | elapsed time per iteration (s): 0.56 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 2.701355E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.279 | TFLOPs: 43.50 | +7: iteration 60400/ 115203 | consumed samples: 15462400 | consumed tokens: 31666995200 | elapsed time per iteration (s): 0.56 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 2.692695E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.260 | TFLOPs: 43.59 | +7: iteration 60410/ 115203 | consumed samples: 15464960 | consumed tokens: 31672238080 | elapsed time per iteration (s): 0.55 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 2.694470E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.920 | TFLOPs: 44.04 | +7: iteration 60420/ 115203 | consumed samples: 15467520 | consumed tokens: 31677480960 | elapsed time per iteration (s): 0.55 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 2.681662E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.811 | TFLOPs: 44.03 | +7: iteration 60430/ 115203 | consumed samples: 15470080 | consumed tokens: 31682723840 | elapsed time per iteration (s): 0.55 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 2.697696E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.516 | TFLOPs: 44.00 | +7: iteration 60440/ 115203 | consumed samples: 15472640 | consumed tokens: 31687966720 | elapsed time per iteration (s): 0.56 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 2.679485E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.113 | TFLOPs: 43.58 | +7: iteration 60450/ 115203 | consumed samples: 15475200 | consumed tokens: 31693209600 | elapsed time per iteration (s): 0.57 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 2.694126E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.606 | TFLOPs: 43.15 | +7: iteration 60460/ 115203 | consumed samples: 15477760 | consumed tokens: 31698452480 | elapsed time per iteration (s): 0.56 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 2.698720E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.419 | TFLOPs: 43.51 | +7: iteration 60470/ 115203 | consumed samples: 15480320 | consumed tokens: 31703695360 | elapsed time per iteration (s): 0.57 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 2.691105E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.686 | TFLOPs: 43.16 | +7: iteration 60480/ 115203 | consumed samples: 15482880 | consumed tokens: 31708938240 | elapsed time per iteration (s): 0.55 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 2.688313E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.686 | TFLOPs: 44.02 | +7: iteration 60490/ 115203 | consumed samples: 15485440 | consumed tokens: 31714181120 | elapsed time per iteration (s): 0.55 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 2.683735E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.306 | TFLOPs: 43.98 | +7: iteration 60500/ 115203 | consumed samples: 15488000 | consumed tokens: 31719424000 | elapsed time per iteration (s): 0.56 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 2.695594E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.210 | TFLOPs: 43.59 | +7: iteration 60510/ 115203 | consumed samples: 15490560 | consumed tokens: 31724666880 | elapsed time per iteration (s): 0.56 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 2.690339E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 60520/ 115203 | consumed samples: 15493120 | consumed tokens: 31729909760 | elapsed time per iteration (s): 0.57 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 2.683626E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.432 | TFLOPs: 43.13 | +7: iteration 60530/ 115203 | consumed samples: 15495680 | consumed tokens: 31735152640 | elapsed time per iteration (s): 0.56 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 2.692841E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.851 | TFLOPs: 43.65 | +7: iteration 60540/ 115203 | consumed samples: 15498240 | consumed tokens: 31740395520 | elapsed time per iteration (s): 0.55 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 2.694694E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 60550/ 115203 | consumed samples: 15500800 | consumed tokens: 31745638400 | elapsed time per iteration (s): 0.57 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 2.713648E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.557 | TFLOPs: 43.15 | +7: iteration 60560/ 115203 | consumed samples: 15503360 | consumed tokens: 31750881280 | elapsed time per iteration (s): 0.57 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 2.703428E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.272 | TFLOPs: 42.45 | +7: iteration 60570/ 115203 | consumed samples: 15505920 | consumed tokens: 31756124160 | elapsed time per iteration (s): 0.57 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 2.695357E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.697 | TFLOPs: 43.16 | +7: iteration 60580/ 115203 | consumed samples: 15508480 | consumed tokens: 31761367040 | elapsed time per iteration (s): 0.56 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 2.678099E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.489 | TFLOPs: 43.62 | +7: iteration 60590/ 115203 | consumed samples: 15511040 | consumed tokens: 31766609920 | elapsed time per iteration (s): 0.57 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 2.700656E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.186 | TFLOPs: 43.11 | +7: iteration 60600/ 115203 | consumed samples: 15513600 | consumed tokens: 31771852800 | elapsed time per iteration (s): 0.56 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 2.681295E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.357 | TFLOPs: 43.22 | +7: iteration 60610/ 115203 | consumed samples: 15516160 | consumed tokens: 31777095680 | elapsed time per iteration (s): 0.56 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 2.698238E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.242 | TFLOPs: 43.40 | +7: iteration 60620/ 115203 | consumed samples: 15518720 | consumed tokens: 31782338560 | elapsed time per iteration (s): 0.56 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 2.689675E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.900 | TFLOPs: 43.27 | +7: iteration 60630/ 115203 | consumed samples: 15521280 | consumed tokens: 31787581440 | elapsed time per iteration (s): 0.57 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 2.689917E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.399 | TFLOPs: 42.85 | +7: iteration 60640/ 115203 | consumed samples: 15523840 | consumed tokens: 31792824320 | elapsed time per iteration (s): 0.57 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 2.692618E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.475 | TFLOPs: 43.14 | +7: iteration 60650/ 115203 | consumed samples: 15526400 | consumed tokens: 31798067200 | elapsed time per iteration (s): 0.56 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 2.701615E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.842 | TFLOPs: 43.55 | +7: iteration 60660/ 115203 | consumed samples: 15528960 | consumed tokens: 31803310080 | elapsed time per iteration (s): 0.56 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 2.681242E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.614 | TFLOPs: 43.53 | +7: iteration 60670/ 115203 | consumed samples: 15531520 | consumed tokens: 31808552960 | elapsed time per iteration (s): 0.56 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 2.689244E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.818 | TFLOPs: 43.55 | +7: iteration 60680/ 115203 | consumed samples: 15534080 | consumed tokens: 31813795840 | elapsed time per iteration (s): 0.55 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 2.689658E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.679 | TFLOPs: 44.02 | +7: iteration 60690/ 115203 | consumed samples: 15536640 | consumed tokens: 31819038720 | elapsed time per iteration (s): 0.56 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 2.691415E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.154 | TFLOPs: 43.20 | +7: iteration 60700/ 115203 | consumed samples: 15539200 | consumed tokens: 31824281600 | elapsed time per iteration (s): 0.56 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 2.696603E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.045 | TFLOPs: 43.67 | +7: iteration 60710/ 115203 | consumed samples: 15541760 | consumed tokens: 31829524480 | elapsed time per iteration (s): 0.57 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 2.685296E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.625 | TFLOPs: 42.58 | +7: iteration 60720/ 115203 | consumed samples: 15544320 | consumed tokens: 31834767360 | elapsed time per iteration (s): 0.57 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 2.683562E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.689 | TFLOPs: 43.16 | +7: iteration 60730/ 115203 | consumed samples: 15546880 | consumed tokens: 31840010240 | elapsed time per iteration (s): 0.55 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 2.683575E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 60740/ 115203 | consumed samples: 15549440 | consumed tokens: 31845253120 | elapsed time per iteration (s): 0.56 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 2.689201E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.195 | TFLOPs: 43.97 | +7: iteration 60750/ 115203 | consumed samples: 15552000 | consumed tokens: 31850496000 | elapsed time per iteration (s): 0.56 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 2.699109E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.175 | TFLOPs: 43.97 | +7: iteration 60760/ 115203 | consumed samples: 15554560 | consumed tokens: 31855738880 | elapsed time per iteration (s): 0.55 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 2.685235E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 60770/ 115203 | consumed samples: 15557120 | consumed tokens: 31860981760 | elapsed time per iteration (s): 0.56 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 2.682748E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.733 | TFLOPs: 43.54 | +7: iteration 60780/ 115203 | consumed samples: 15559680 | consumed tokens: 31866224640 | elapsed time per iteration (s): 0.56 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 2.695370E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.009 | TFLOPs: 43.57 | +7: iteration 60790/ 115203 | consumed samples: 15562240 | consumed tokens: 31871467520 | elapsed time per iteration (s): 0.56 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 2.689479E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.275 | TFLOPs: 43.21 | +7: iteration 60800/ 115203 | consumed samples: 15564800 | consumed tokens: 31876710400 | elapsed time per iteration (s): 0.56 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 2.704028E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.940 | TFLOPs: 43.56 | +7: iteration 60810/ 115203 | consumed samples: 15567360 | consumed tokens: 31881953280 | elapsed time per iteration (s): 0.56 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 2.688020E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.037 | TFLOPs: 43.57 | +7: iteration 60820/ 115203 | consumed samples: 15569920 | consumed tokens: 31887196160 | elapsed time per iteration (s): 0.57 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 2.684667E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.464 | TFLOPs: 43.14 | +7: iteration 60830/ 115203 | consumed samples: 15572480 | consumed tokens: 31892439040 | elapsed time per iteration (s): 0.55 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 2.678769E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.754 | TFLOPs: 44.02 | +7: iteration 60840/ 115203 | consumed samples: 15575040 | consumed tokens: 31897681920 | elapsed time per iteration (s): 0.56 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 2.681718E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.248 | TFLOPs: 43.97 | +7: iteration 60850/ 115203 | consumed samples: 15577600 | consumed tokens: 31902924800 | elapsed time per iteration (s): 0.55 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 2.674644E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.741 | TFLOPs: 44.02 | +7: iteration 60860/ 115203 | consumed samples: 15580160 | consumed tokens: 31908167680 | elapsed time per iteration (s): 0.55 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 2.680935E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.755 | TFLOPs: 44.02 | +7: iteration 60870/ 115203 | consumed samples: 15582720 | consumed tokens: 31913410560 | elapsed time per iteration (s): 0.55 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 2.687889E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.819 | TFLOPs: 44.03 | +7: iteration 60880/ 115203 | consumed samples: 15585280 | consumed tokens: 31918653440 | elapsed time per iteration (s): 0.55 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 2.685842E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.642 | TFLOPs: 44.01 | +7: iteration 60890/ 115203 | consumed samples: 15587840 | consumed tokens: 31923896320 | elapsed time per iteration (s): 0.55 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 2.694175E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.754 | TFLOPs: 44.02 | +7: iteration 60900/ 115203 | consumed samples: 15590400 | consumed tokens: 31929139200 | elapsed time per iteration (s): 0.55 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 2.687698E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.823 | TFLOPs: 44.03 | +7: iteration 60910/ 115203 | consumed samples: 15592960 | consumed tokens: 31934382080 | elapsed time per iteration (s): 0.55 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 2.698061E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.611 | TFLOPs: 44.01 | +7: iteration 60920/ 115203 | consumed samples: 15595520 | consumed tokens: 31939624960 | elapsed time per iteration (s): 0.55 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 2.684149E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 60930/ 115203 | consumed samples: 15598080 | consumed tokens: 31944867840 | elapsed time per iteration (s): 0.55 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 2.683456E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.510 | TFLOPs: 44.00 | +7: iteration 60940/ 115203 | consumed samples: 15600640 | consumed tokens: 31950110720 | elapsed time per iteration (s): 0.55 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 2.697566E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.506 | TFLOPs: 44.00 | +7: iteration 60950/ 115203 | consumed samples: 15603200 | consumed tokens: 31955353600 | elapsed time per iteration (s): 0.55 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 2.693243E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.744 | TFLOPs: 44.02 | +7: iteration 60960/ 115203 | consumed samples: 15605760 | consumed tokens: 31960596480 | elapsed time per iteration (s): 0.56 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 2.673731E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.287 | TFLOPs: 43.69 | +7: iteration 60970/ 115203 | consumed samples: 15608320 | consumed tokens: 31965839360 | elapsed time per iteration (s): 0.55 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 2.690023E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.622 | TFLOPs: 44.01 | +7: iteration 60980/ 115203 | consumed samples: 15610880 | consumed tokens: 31971082240 | elapsed time per iteration (s): 0.55 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 2.689837E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.775 | TFLOPs: 44.03 | +7: iteration 60990/ 115203 | consumed samples: 15613440 | consumed tokens: 31976325120 | elapsed time per iteration (s): 0.55 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 2.695913E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.613 | TFLOPs: 44.01 | +7: iteration 61000/ 115203 | consumed samples: 15616000 | consumed tokens: 31981568000 | elapsed time per iteration (s): 0.55 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 2.691384E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.679 | TFLOPs: 44.02 | +7: iteration 61010/ 115203 | consumed samples: 15618560 | consumed tokens: 31986810880 | elapsed time per iteration (s): 0.55 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 2.688409E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.677 | TFLOPs: 44.02 | +7: iteration 61020/ 115203 | consumed samples: 15621120 | consumed tokens: 31992053760 | elapsed time per iteration (s): 0.56 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 2.690426E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.934 | TFLOPs: 43.75 | +7: iteration 61030/ 115203 | consumed samples: 15623680 | consumed tokens: 31997296640 | elapsed time per iteration (s): 0.57 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 2.692126E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.449 | TFLOPs: 43.14 | +7: iteration 61040/ 115203 | consumed samples: 15626240 | consumed tokens: 32002539520 | elapsed time per iteration (s): 0.55 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 2.684727E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.645 | TFLOPs: 44.01 | +7: iteration 61050/ 115203 | consumed samples: 15628800 | consumed tokens: 32007782400 | elapsed time per iteration (s): 0.57 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 2.685188E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.076 | TFLOPs: 43.20 | +7: iteration 61060/ 115203 | consumed samples: 15631360 | consumed tokens: 32013025280 | elapsed time per iteration (s): 0.55 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 2.681234E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.719 | TFLOPs: 44.02 | +7: iteration 61070/ 115203 | consumed samples: 15633920 | consumed tokens: 32018268160 | elapsed time per iteration (s): 0.55 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 2.699527E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 61080/ 115203 | consumed samples: 15636480 | consumed tokens: 32023511040 | elapsed time per iteration (s): 0.55 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 2.683269E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 61090/ 115203 | consumed samples: 15639040 | consumed tokens: 32028753920 | elapsed time per iteration (s): 0.56 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 2.698205E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.082 | TFLOPs: 43.58 | +7: iteration 61100/ 115203 | consumed samples: 15641600 | consumed tokens: 32033996800 | elapsed time per iteration (s): 0.56 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 2.706464E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.871 | TFLOPs: 43.56 | +7: iteration 61110/ 115203 | consumed samples: 15644160 | consumed tokens: 32039239680 | elapsed time per iteration (s): 0.58 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 2.688642E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.153 | TFLOPs: 42.06 | +7: iteration 61120/ 115203 | consumed samples: 15646720 | consumed tokens: 32044482560 | elapsed time per iteration (s): 0.55 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 2.673534E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.933 | TFLOPs: 44.04 | +7: iteration 61130/ 115203 | consumed samples: 15649280 | consumed tokens: 32049725440 | elapsed time per iteration (s): 0.57 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 2.689936E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.766 | TFLOPs: 43.17 | +7: iteration 61140/ 115203 | consumed samples: 15651840 | consumed tokens: 32054968320 | elapsed time per iteration (s): 0.57 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 2.675604E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.814 | TFLOPs: 43.17 | +7: iteration 61150/ 115203 | consumed samples: 15654400 | consumed tokens: 32060211200 | elapsed time per iteration (s): 0.58 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 2.696460E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.440 | TFLOPs: 42.37 | +7: iteration 61160/ 115203 | consumed samples: 15656960 | consumed tokens: 32065454080 | elapsed time per iteration (s): 0.58 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 2.686162E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.497 | TFLOPs: 42.19 | +7: iteration 61170/ 115203 | consumed samples: 15659520 | consumed tokens: 32070696960 | elapsed time per iteration (s): 0.57 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 2.675276E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.303 | TFLOPs: 43.03 | +7: iteration 61180/ 115203 | consumed samples: 15662080 | consumed tokens: 32075939840 | elapsed time per iteration (s): 0.56 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 2.699413E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.983 | TFLOPs: 43.28 | +7: iteration 61190/ 115203 | consumed samples: 15664640 | consumed tokens: 32081182720 | elapsed time per iteration (s): 0.58 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 2.676795E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.206 | TFLOPs: 42.45 | +7: iteration 61200/ 115203 | consumed samples: 15667200 | consumed tokens: 32086425600 | elapsed time per iteration (s): 0.58 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 2.691310E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.430 | TFLOPs: 42.18 | +7: iteration 61210/ 115203 | consumed samples: 15669760 | consumed tokens: 32091668480 | elapsed time per iteration (s): 0.57 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 2.682074E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.903 | TFLOPs: 42.70 | +7: iteration 61220/ 115203 | consumed samples: 15672320 | consumed tokens: 32096911360 | elapsed time per iteration (s): 0.56 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 2.673581E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.424 | TFLOPs: 43.71 | +7: iteration 61230/ 115203 | consumed samples: 15674880 | consumed tokens: 32102154240 | elapsed time per iteration (s): 0.57 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 2.684545E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.068 | TFLOPs: 43.00 | +7: iteration 61240/ 115203 | consumed samples: 15677440 | consumed tokens: 32107397120 | elapsed time per iteration (s): 0.58 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 2.685814E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.724 | TFLOPs: 42.11 | +7: iteration 61250/ 115203 | consumed samples: 15680000 | consumed tokens: 32112640000 | elapsed time per iteration (s): 0.56 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 2.693010E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.016 | TFLOPs: 43.57 | +7: iteration 61260/ 115203 | consumed samples: 15682560 | consumed tokens: 32117882880 | elapsed time per iteration (s): 0.57 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 2.682715E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.752 | TFLOPs: 43.07 | +7: iteration 61270/ 115203 | consumed samples: 15685120 | consumed tokens: 32123125760 | elapsed time per iteration (s): 0.56 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 2.688281E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.183 | TFLOPs: 43.87 | +7: iteration 61280/ 115203 | consumed samples: 15687680 | consumed tokens: 32128368640 | elapsed time per iteration (s): 0.57 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 2.680677E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.268 | TFLOPs: 43.12 | +7: iteration 61290/ 115203 | consumed samples: 15690240 | consumed tokens: 32133611520 | elapsed time per iteration (s): 0.58 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 2.681626E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.585 | TFLOPs: 42.00 | +7: iteration 61300/ 115203 | consumed samples: 15692800 | consumed tokens: 32138854400 | elapsed time per iteration (s): 0.58 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 2.680845E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.454 | TFLOPs: 41.90 | +7: iteration 61310/ 115203 | consumed samples: 15695360 | consumed tokens: 32144097280 | elapsed time per iteration (s): 0.56 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 2.688350E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.890 | TFLOPs: 43.27 | +7: iteration 61320/ 115203 | consumed samples: 15697920 | consumed tokens: 32149340160 | elapsed time per iteration (s): 0.57 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 2.685332E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.266 | TFLOPs: 43.02 | +7: iteration 61330/ 115203 | consumed samples: 15700480 | consumed tokens: 32154583040 | elapsed time per iteration (s): 0.57 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 2.669996E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.153 | TFLOPs: 42.63 | +7: iteration 61340/ 115203 | consumed samples: 15703040 | consumed tokens: 32159825920 | elapsed time per iteration (s): 0.57 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 2.674214E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.619 | TFLOPs: 42.87 | +7: iteration 61350/ 115203 | consumed samples: 15705600 | consumed tokens: 32165068800 | elapsed time per iteration (s): 0.56 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 2.682230E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.953 | TFLOPs: 43.66 | +7: iteration 61360/ 115203 | consumed samples: 15708160 | consumed tokens: 32170311680 | elapsed time per iteration (s): 0.57 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 2.675795E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.461 | TFLOPs: 42.76 | +7: iteration 61370/ 115203 | consumed samples: 15710720 | consumed tokens: 32175554560 | elapsed time per iteration (s): 0.57 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 2.700771E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.949 | TFLOPs: 43.18 | +7: iteration 61380/ 115203 | consumed samples: 15713280 | consumed tokens: 32180797440 | elapsed time per iteration (s): 0.57 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 2.693898E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.592 | TFLOPs: 42.86 | +7: iteration 61390/ 115203 | consumed samples: 15715840 | consumed tokens: 32186040320 | elapsed time per iteration (s): 0.56 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 2.688680E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.242 | TFLOPs: 43.69 | +7: iteration 61400/ 115203 | consumed samples: 15718400 | consumed tokens: 32191283200 | elapsed time per iteration (s): 0.58 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 2.680674E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.121 | TFLOPs: 42.44 | +7: iteration 61410/ 115203 | consumed samples: 15720960 | consumed tokens: 32196526080 | elapsed time per iteration (s): 0.57 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 2.687040E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.643 | TFLOPs: 42.87 | +7: iteration 61420/ 115203 | consumed samples: 15723520 | consumed tokens: 32201768960 | elapsed time per iteration (s): 0.57 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 2.692995E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.679 | TFLOPs: 42.59 | +7: iteration 61430/ 115203 | consumed samples: 15726080 | consumed tokens: 32207011840 | elapsed time per iteration (s): 0.56 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 2.689735E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.689 | TFLOPs: 43.92 | +7: iteration 61440/ 115203 | consumed samples: 15728640 | consumed tokens: 32212254720 | elapsed time per iteration (s): 0.56 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 2.689284E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.177 | TFLOPs: 43.68 | +7: iteration 61450/ 115203 | consumed samples: 15731200 | consumed tokens: 32217497600 | elapsed time per iteration (s): 0.56 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 2.683738E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.950 | TFLOPs: 43.28 | +7: iteration 61460/ 115203 | consumed samples: 15733760 | consumed tokens: 32222740480 | elapsed time per iteration (s): 0.57 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 2.682508E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.158 | TFLOPs: 42.63 | +7: iteration 61470/ 115203 | consumed samples: 15736320 | consumed tokens: 32227983360 | elapsed time per iteration (s): 0.58 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 2.684297E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.199 | TFLOPs: 42.35 | +7: iteration 61480/ 115203 | consumed samples: 15738880 | consumed tokens: 32233226240 | elapsed time per iteration (s): 0.57 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 2.678476E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.412 | TFLOPs: 42.85 | +7: iteration 61490/ 115203 | consumed samples: 15741440 | consumed tokens: 32238469120 | elapsed time per iteration (s): 0.58 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 2.694115E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.215 | TFLOPs: 42.35 | +7: iteration 61500/ 115203 | consumed samples: 15744000 | consumed tokens: 32243712000 | elapsed time per iteration (s): 0.57 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 2.684724E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.613 | TFLOPs: 42.77 | +7: iteration 61510/ 115203 | consumed samples: 15746560 | consumed tokens: 32248954880 | elapsed time per iteration (s): 0.57 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 2.678292E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.285 | TFLOPs: 43.12 | +7: iteration 61520/ 115203 | consumed samples: 15749120 | consumed tokens: 32254197760 | elapsed time per iteration (s): 0.58 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 2.693213E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.078 | TFLOPs: 41.86 | +7: iteration 61530/ 115203 | consumed samples: 15751680 | consumed tokens: 32259440640 | elapsed time per iteration (s): 0.60 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 2.689428E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.033 | TFLOPs: 41.00 | +7: iteration 61540/ 115203 | consumed samples: 15754240 | consumed tokens: 32264683520 | elapsed time per iteration (s): 0.57 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 2.679491E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.788 | TFLOPs: 42.88 | +7: iteration 61550/ 115203 | consumed samples: 15756800 | consumed tokens: 32269926400 | elapsed time per iteration (s): 0.59 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 2.686392E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.425 | TFLOPs: 41.61 | +7: iteration 61560/ 115203 | consumed samples: 15759360 | consumed tokens: 32275169280 | elapsed time per iteration (s): 0.60 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 2.675480E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.592 | TFLOPs: 40.58 | +7: iteration 61570/ 115203 | consumed samples: 15761920 | consumed tokens: 32280412160 | elapsed time per iteration (s): 0.58 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 2.692097E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.356 | TFLOPs: 42.08 | +7: iteration 61580/ 115203 | consumed samples: 15764480 | consumed tokens: 32285655040 | elapsed time per iteration (s): 0.57 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 2.672198E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.133 | TFLOPs: 42.53 | +7: iteration 61590/ 115203 | consumed samples: 15767040 | consumed tokens: 32290897920 | elapsed time per iteration (s): 0.58 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 2.690412E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.611 | TFLOPs: 41.91 | +7: iteration 61600/ 115203 | consumed samples: 15769600 | consumed tokens: 32296140800 | elapsed time per iteration (s): 0.57 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 2.699911E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.109 | TFLOPs: 43.10 | +7: iteration 61610/ 115203 | consumed samples: 15772160 | consumed tokens: 32301383680 | elapsed time per iteration (s): 0.57 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 2.673614E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.067 | TFLOPs: 42.81 | +7: iteration 61620/ 115203 | consumed samples: 15774720 | consumed tokens: 32306626560 | elapsed time per iteration (s): 0.57 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 2.683508E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.367 | TFLOPs: 43.03 | +7: iteration 61630/ 115203 | consumed samples: 15777280 | consumed tokens: 32311869440 | elapsed time per iteration (s): 0.59 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 2.678119E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.433 | TFLOPs: 41.42 | +7: iteration 61640/ 115203 | consumed samples: 15779840 | consumed tokens: 32317112320 | elapsed time per iteration (s): 0.58 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 2.662878E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.967 | TFLOPs: 42.14 | +7: iteration 61650/ 115203 | consumed samples: 15782400 | consumed tokens: 32322355200 | elapsed time per iteration (s): 0.57 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 2.687060E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.797 | TFLOPs: 42.79 | +7: iteration 61660/ 115203 | consumed samples: 15784960 | consumed tokens: 32327598080 | elapsed time per iteration (s): 0.58 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 2.692877E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.719 | TFLOPs: 42.40 | +7: iteration 61670/ 115203 | consumed samples: 15787520 | consumed tokens: 32332840960 | elapsed time per iteration (s): 0.57 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 2.677299E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.049 | TFLOPs: 43.10 | +7: iteration 61680/ 115203 | consumed samples: 15790080 | consumed tokens: 32338083840 | elapsed time per iteration (s): 0.59 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 2.687008E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.533 | TFLOPs: 41.05 | +7: iteration 61690/ 115203 | consumed samples: 15792640 | consumed tokens: 32343326720 | elapsed time per iteration (s): 0.59 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 2.677373E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.368 | TFLOPs: 41.32 | +7: iteration 61700/ 115203 | consumed samples: 15795200 | consumed tokens: 32348569600 | elapsed time per iteration (s): 0.57 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 2.688111E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.077 | TFLOPs: 42.53 | +7: iteration 61710/ 115203 | consumed samples: 15797760 | consumed tokens: 32353812480 | elapsed time per iteration (s): 0.59 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 2.700518E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.875 | TFLOPs: 41.46 | +7: iteration 61720/ 115203 | consumed samples: 15800320 | consumed tokens: 32359055360 | elapsed time per iteration (s): 0.57 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 2.674312E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.790 | TFLOPs: 42.69 | +7: iteration 61730/ 115203 | consumed samples: 15802880 | consumed tokens: 32364298240 | elapsed time per iteration (s): 0.56 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 2.687656E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.645 | TFLOPs: 43.54 | +7: iteration 61740/ 115203 | consumed samples: 15805440 | consumed tokens: 32369541120 | elapsed time per iteration (s): 0.57 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 2.689380E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.626 | TFLOPs: 42.96 | +7: iteration 61750/ 115203 | consumed samples: 15808000 | consumed tokens: 32374784000 | elapsed time per iteration (s): 0.57 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 2.692532E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.487 | TFLOPs: 43.14 | +7: iteration 61760/ 115203 | consumed samples: 15810560 | consumed tokens: 32380026880 | elapsed time per iteration (s): 0.58 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 2.691372E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.911 | TFLOPs: 42.32 | +7: iteration 61770/ 115203 | consumed samples: 15813120 | consumed tokens: 32385269760 | elapsed time per iteration (s): 0.57 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 2.693073E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.049 | TFLOPs: 42.53 | +7: iteration 61780/ 115203 | consumed samples: 15815680 | consumed tokens: 32390512640 | elapsed time per iteration (s): 0.58 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 2.701027E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.207 | TFLOPs: 42.45 | +7: iteration 61790/ 115203 | consumed samples: 15818240 | consumed tokens: 32395755520 | elapsed time per iteration (s): 0.59 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 2.687263E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.358 | TFLOPs: 41.60 | +7: iteration 61800/ 115203 | consumed samples: 15820800 | consumed tokens: 32400998400 | elapsed time per iteration (s): 0.58 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 2.682986E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.289 | TFLOPs: 42.07 | +7: iteration 61810/ 115203 | consumed samples: 15823360 | consumed tokens: 32406241280 | elapsed time per iteration (s): 0.58 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 2.690131E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.326 | TFLOPs: 42.36 | +7: iteration 61820/ 115203 | consumed samples: 15825920 | consumed tokens: 32411484160 | elapsed time per iteration (s): 0.57 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 2.676311E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.435 | TFLOPs: 42.47 | +7: iteration 61830/ 115203 | consumed samples: 15828480 | consumed tokens: 32416727040 | elapsed time per iteration (s): 0.56 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 2.691926E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.484 | TFLOPs: 43.62 | +7: iteration 61840/ 115203 | consumed samples: 15831040 | consumed tokens: 32421969920 | elapsed time per iteration (s): 0.57 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 2.689403E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.771 | TFLOPs: 42.79 | +7: iteration 61850/ 115203 | consumed samples: 15833600 | consumed tokens: 32427212800 | elapsed time per iteration (s): 0.57 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 2.689624E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.811 | TFLOPs: 42.50 | +7: iteration 61860/ 115203 | consumed samples: 15836160 | consumed tokens: 32432455680 | elapsed time per iteration (s): 0.57 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 2.684627E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.644 | TFLOPs: 42.77 | +7: iteration 61870/ 115203 | consumed samples: 15838720 | consumed tokens: 32437698560 | elapsed time per iteration (s): 0.57 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 2.694419E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.770 | TFLOPs: 42.98 | +7: iteration 61880/ 115203 | consumed samples: 15841280 | consumed tokens: 32442941440 | elapsed time per iteration (s): 0.57 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 2.676163E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.302 | TFLOPs: 42.55 | +7: iteration 61890/ 115203 | consumed samples: 15843840 | consumed tokens: 32448184320 | elapsed time per iteration (s): 0.57 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 2.684490E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.359 | TFLOPs: 42.75 | +7: iteration 61900/ 115203 | consumed samples: 15846400 | consumed tokens: 32453427200 | elapsed time per iteration (s): 0.57 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 2.682103E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.132 | TFLOPs: 42.72 | +7: iteration 61910/ 115203 | consumed samples: 15848960 | consumed tokens: 32458670080 | elapsed time per iteration (s): 0.58 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 2.681285E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.345 | TFLOPs: 41.98 | +7: iteration 61920/ 115203 | consumed samples: 15851520 | consumed tokens: 32463912960 | elapsed time per iteration (s): 0.58 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 2.680988E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.881 | TFLOPs: 42.32 | +7: iteration 61930/ 115203 | consumed samples: 15854080 | consumed tokens: 32469155840 | elapsed time per iteration (s): 0.56 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 2.697728E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.669 | TFLOPs: 43.44 | +7: iteration 61940/ 115203 | consumed samples: 15856640 | consumed tokens: 32474398720 | elapsed time per iteration (s): 0.57 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 2.681284E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.323 | TFLOPs: 42.93 | +7: iteration 61950/ 115203 | consumed samples: 15859200 | consumed tokens: 32479641600 | elapsed time per iteration (s): 0.59 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 2.681577E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.009 | TFLOPs: 41.66 | +7: iteration 61960/ 115203 | consumed samples: 15861760 | consumed tokens: 32484884480 | elapsed time per iteration (s): 0.59 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 2.687879E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.071 | TFLOPs: 41.67 | +7: iteration 61970/ 115203 | consumed samples: 15864320 | consumed tokens: 32490127360 | elapsed time per iteration (s): 0.57 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 2.685956E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.625 | TFLOPs: 42.68 | +7: iteration 61980/ 115203 | consumed samples: 15866880 | consumed tokens: 32495370240 | elapsed time per iteration (s): 0.57 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 2.680245E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.860 | TFLOPs: 43.08 | +7: iteration 61990/ 115203 | consumed samples: 15869440 | consumed tokens: 32500613120 | elapsed time per iteration (s): 0.58 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 2.686752E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.376 | TFLOPs: 42.37 | +0: [2023-03-16 22:33:25,659] [INFO] [logging.py:68:log_dist] [Rank 0] step=62000, skipped=0, lr=[0.0001005423324048397, 0.0001005423324048397, 0.0001005423324048397], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 62000/ 115203 | consumed samples: 15872000 | consumed tokens: 32505856000 | elapsed time per iteration (s): 0.57 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 2.689288E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.644 | TFLOPs: 42.96 | +0: steps: 62000 loss: 2.6940 iter time (s): 0.563 samples/sec: 454.921 +7: iteration 62010/ 115203 | consumed samples: 15874560 | consumed tokens: 32511098880 | elapsed time per iteration (s): 0.57 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 2.700748E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.013 | TFLOPs: 42.90 | +7: iteration 62020/ 115203 | consumed samples: 15877120 | consumed tokens: 32516341760 | elapsed time per iteration (s): 0.57 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 2.684438E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.814 | TFLOPs: 43.08 | +7: iteration 62030/ 115203 | consumed samples: 15879680 | consumed tokens: 32521584640 | elapsed time per iteration (s): 0.57 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 2.679526E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.310 | TFLOPs: 42.93 | +7: iteration 62040/ 115203 | consumed samples: 15882240 | consumed tokens: 32526827520 | elapsed time per iteration (s): 0.56 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 2.686403E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.927 | TFLOPs: 43.28 | +7: iteration 62050/ 115203 | consumed samples: 15884800 | consumed tokens: 32532070400 | elapsed time per iteration (s): 0.57 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 2.684109E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.515 | TFLOPs: 42.95 | +7: iteration 62060/ 115203 | consumed samples: 15887360 | consumed tokens: 32537313280 | elapsed time per iteration (s): 0.57 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 2.674338E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.225 | TFLOPs: 42.64 | +7: iteration 62070/ 115203 | consumed samples: 15889920 | consumed tokens: 32542556160 | elapsed time per iteration (s): 0.57 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 2.687635E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.975 | TFLOPs: 42.80 | +7: iteration 62080/ 115203 | consumed samples: 15892480 | consumed tokens: 32547799040 | elapsed time per iteration (s): 0.58 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 2.667457E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.248 | TFLOPs: 42.16 | +7: iteration 62090/ 115203 | consumed samples: 15895040 | consumed tokens: 32553041920 | elapsed time per iteration (s): 0.58 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 2.685076E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.748 | TFLOPs: 42.02 | +7: iteration 62100/ 115203 | consumed samples: 15897600 | consumed tokens: 32558284800 | elapsed time per iteration (s): 0.58 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 2.698587E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.179 | TFLOPs: 42.35 | +7: iteration 62110/ 115203 | consumed samples: 15900160 | consumed tokens: 32563527680 | elapsed time per iteration (s): 0.58 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 2.686139E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.670 | TFLOPs: 41.82 | +7: iteration 62120/ 115203 | consumed samples: 15902720 | consumed tokens: 32568770560 | elapsed time per iteration (s): 0.58 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 2.677667E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.171 | TFLOPs: 41.77 | +7: iteration 62130/ 115203 | consumed samples: 15905280 | consumed tokens: 32574013440 | elapsed time per iteration (s): 0.58 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 2.690439E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.607 | TFLOPs: 42.29 | +7: iteration 62140/ 115203 | consumed samples: 15907840 | consumed tokens: 32579256320 | elapsed time per iteration (s): 0.58 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 2.679816E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.477 | TFLOPs: 42.28 | +7: iteration 62150/ 115203 | consumed samples: 15910400 | consumed tokens: 32584499200 | elapsed time per iteration (s): 0.57 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 2.682618E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.064 | TFLOPs: 43.10 | +7: iteration 62160/ 115203 | consumed samples: 15912960 | consumed tokens: 32589742080 | elapsed time per iteration (s): 0.57 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 2.692818E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.605 | TFLOPs: 43.06 | +7: iteration 62170/ 115203 | consumed samples: 15915520 | consumed tokens: 32594984960 | elapsed time per iteration (s): 0.57 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 2.677771E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.498 | TFLOPs: 42.95 | +7: iteration 62180/ 115203 | consumed samples: 15918080 | consumed tokens: 32600227840 | elapsed time per iteration (s): 0.58 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 2.685524E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.799 | TFLOPs: 41.74 | +7: iteration 62190/ 115203 | consumed samples: 15920640 | consumed tokens: 32605470720 | elapsed time per iteration (s): 0.57 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 2.682389E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.884 | TFLOPs: 42.61 | +7: iteration 62200/ 115203 | consumed samples: 15923200 | consumed tokens: 32610713600 | elapsed time per iteration (s): 0.57 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 2.685008E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.077 | TFLOPs: 42.91 | +7: iteration 62210/ 115203 | consumed samples: 15925760 | consumed tokens: 32615956480 | elapsed time per iteration (s): 0.56 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 2.679750E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.954 | TFLOPs: 43.47 | +7: iteration 62220/ 115203 | consumed samples: 15928320 | consumed tokens: 32621199360 | elapsed time per iteration (s): 0.57 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 2.690232E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.433 | TFLOPs: 43.13 | +7: iteration 62230/ 115203 | consumed samples: 15930880 | consumed tokens: 32626442240 | elapsed time per iteration (s): 0.56 | learning rate: 9.998E-05 | global batch size: 256 | lm loss: 2.684765E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.385 | TFLOPs: 43.42 | +7: iteration 62240/ 115203 | consumed samples: 15933440 | consumed tokens: 32631685120 | elapsed time per iteration (s): 0.57 | learning rate: 9.995E-05 | global batch size: 256 | lm loss: 2.689849E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.641 | TFLOPs: 42.68 | +7: iteration 62250/ 115203 | consumed samples: 15936000 | consumed tokens: 32636928000 | elapsed time per iteration (s): 0.57 | learning rate: 9.993E-05 | global batch size: 256 | lm loss: 2.690545E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.865 | TFLOPs: 42.79 | +7: iteration 62260/ 115203 | consumed samples: 15938560 | consumed tokens: 32642170880 | elapsed time per iteration (s): 0.58 | learning rate: 9.990E-05 | global batch size: 256 | lm loss: 2.693233E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.251 | TFLOPs: 42.35 | +7: iteration 62270/ 115203 | consumed samples: 15941120 | consumed tokens: 32647413760 | elapsed time per iteration (s): 0.58 | learning rate: 9.988E-05 | global batch size: 256 | lm loss: 2.672427E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.860 | TFLOPs: 42.22 | +7: iteration 62280/ 115203 | consumed samples: 15943680 | consumed tokens: 32652656640 | elapsed time per iteration (s): 0.58 | learning rate: 9.985E-05 | global batch size: 256 | lm loss: 2.679115E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.182 | TFLOPs: 42.35 | +7: iteration 62290/ 115203 | consumed samples: 15946240 | consumed tokens: 32657899520 | elapsed time per iteration (s): 0.56 | learning rate: 9.983E-05 | global batch size: 256 | lm loss: 2.695436E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.109 | TFLOPs: 43.58 | +7: iteration 62300/ 115203 | consumed samples: 15948800 | consumed tokens: 32663142400 | elapsed time per iteration (s): 0.56 | learning rate: 9.980E-05 | global batch size: 256 | lm loss: 2.683452E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.339 | TFLOPs: 43.32 | +7: iteration 62310/ 115203 | consumed samples: 15951360 | consumed tokens: 32668385280 | elapsed time per iteration (s): 0.57 | learning rate: 9.978E-05 | global batch size: 256 | lm loss: 2.684418E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.470 | TFLOPs: 42.85 | +7: iteration 62320/ 115203 | consumed samples: 15953920 | consumed tokens: 32673628160 | elapsed time per iteration (s): 0.57 | learning rate: 9.975E-05 | global batch size: 256 | lm loss: 2.688448E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.813 | TFLOPs: 42.79 | +7: iteration 62330/ 115203 | consumed samples: 15956480 | consumed tokens: 32678871040 | elapsed time per iteration (s): 0.57 | learning rate: 9.973E-05 | global batch size: 256 | lm loss: 2.686529E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.098 | TFLOPs: 42.53 | +7: iteration 62340/ 115203 | consumed samples: 15959040 | consumed tokens: 32684113920 | elapsed time per iteration (s): 0.56 | learning rate: 9.970E-05 | global batch size: 256 | lm loss: 2.695807E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.755 | TFLOPs: 43.74 | +7: iteration 62350/ 115203 | consumed samples: 15961600 | consumed tokens: 32689356800 | elapsed time per iteration (s): 0.56 | learning rate: 9.968E-05 | global batch size: 256 | lm loss: 2.695892E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.305 | TFLOPs: 43.69 | +7: iteration 62360/ 115203 | consumed samples: 15964160 | consumed tokens: 32694599680 | elapsed time per iteration (s): 0.56 | learning rate: 9.966E-05 | global batch size: 256 | lm loss: 2.693946E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.337 | TFLOPs: 43.51 | +7: iteration 62370/ 115203 | consumed samples: 15966720 | consumed tokens: 32699842560 | elapsed time per iteration (s): 0.56 | learning rate: 9.963E-05 | global batch size: 256 | lm loss: 2.685818E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.483 | TFLOPs: 43.43 | +7: iteration 62380/ 115203 | consumed samples: 15969280 | consumed tokens: 32705085440 | elapsed time per iteration (s): 0.56 | learning rate: 9.961E-05 | global batch size: 256 | lm loss: 2.671383E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 62390/ 115203 | consumed samples: 15971840 | consumed tokens: 32710328320 | elapsed time per iteration (s): 0.58 | learning rate: 9.958E-05 | global batch size: 256 | lm loss: 2.693723E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.880 | TFLOPs: 42.32 | +7: iteration 62400/ 115203 | consumed samples: 15974400 | consumed tokens: 32715571200 | elapsed time per iteration (s): 0.57 | learning rate: 9.956E-05 | global batch size: 256 | lm loss: 2.679464E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.016 | TFLOPs: 42.62 | +7: iteration 62410/ 115203 | consumed samples: 15976960 | consumed tokens: 32720814080 | elapsed time per iteration (s): 0.59 | learning rate: 9.953E-05 | global batch size: 256 | lm loss: 2.685099E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.717 | TFLOPs: 41.35 | +7: iteration 62420/ 115203 | consumed samples: 15979520 | consumed tokens: 32726056960 | elapsed time per iteration (s): 0.57 | learning rate: 9.951E-05 | global batch size: 256 | lm loss: 2.672547E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.578 | TFLOPs: 42.96 | +7: iteration 62430/ 115203 | consumed samples: 15982080 | consumed tokens: 32731299840 | elapsed time per iteration (s): 0.55 | learning rate: 9.948E-05 | global batch size: 256 | lm loss: 2.688155E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.573 | TFLOPs: 44.01 | +7: iteration 62440/ 115203 | consumed samples: 15984640 | consumed tokens: 32736542720 | elapsed time per iteration (s): 0.56 | learning rate: 9.946E-05 | global batch size: 256 | lm loss: 2.690047E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.072 | TFLOPs: 43.48 | +7: iteration 62450/ 115203 | consumed samples: 15987200 | consumed tokens: 32741785600 | elapsed time per iteration (s): 0.57 | learning rate: 9.943E-05 | global batch size: 256 | lm loss: 2.681720E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.029 | TFLOPs: 42.62 | +7: iteration 62460/ 115203 | consumed samples: 15989760 | consumed tokens: 32747028480 | elapsed time per iteration (s): 0.57 | learning rate: 9.941E-05 | global batch size: 256 | lm loss: 2.693659E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.046 | TFLOPs: 42.53 | +7: iteration 62470/ 115203 | consumed samples: 15992320 | consumed tokens: 32752271360 | elapsed time per iteration (s): 0.56 | learning rate: 9.938E-05 | global batch size: 256 | lm loss: 2.689779E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.017 | TFLOPs: 43.38 | +7: iteration 62480/ 115203 | consumed samples: 15994880 | consumed tokens: 32757514240 | elapsed time per iteration (s): 0.56 | learning rate: 9.936E-05 | global batch size: 256 | lm loss: 2.671228E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.888 | TFLOPs: 43.37 | +7: iteration 62490/ 115203 | consumed samples: 15997440 | consumed tokens: 32762757120 | elapsed time per iteration (s): 0.57 | learning rate: 9.934E-05 | global batch size: 256 | lm loss: 2.686530E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.229 | TFLOPs: 42.54 | +7: iteration 62500/ 115203 | consumed samples: 16000000 | consumed tokens: 32768000000 | elapsed time per iteration (s): 0.56 | learning rate: 9.931E-05 | global batch size: 256 | lm loss: 2.688076E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.708 | TFLOPs: 43.26 | +7: iteration 62510/ 115203 | consumed samples: 16002560 | consumed tokens: 32773242880 | elapsed time per iteration (s): 0.56 | learning rate: 9.929E-05 | global batch size: 256 | lm loss: 2.681101E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.597 | TFLOPs: 43.34 | +7: iteration 62520/ 115203 | consumed samples: 16005120 | consumed tokens: 32778485760 | elapsed time per iteration (s): 0.56 | learning rate: 9.926E-05 | global batch size: 256 | lm loss: 2.677840E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.093 | TFLOPs: 43.39 | +7: iteration 62530/ 115203 | consumed samples: 16007680 | consumed tokens: 32783728640 | elapsed time per iteration (s): 0.56 | learning rate: 9.924E-05 | global batch size: 256 | lm loss: 2.681720E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.726 | TFLOPs: 43.54 | +7: iteration 62540/ 115203 | consumed samples: 16010240 | consumed tokens: 32788971520 | elapsed time per iteration (s): 0.56 | learning rate: 9.921E-05 | global batch size: 256 | lm loss: 2.675891E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.936 | TFLOPs: 43.95 | +7: iteration 62550/ 115203 | consumed samples: 16012800 | consumed tokens: 32794214400 | elapsed time per iteration (s): 0.57 | learning rate: 9.919E-05 | global batch size: 256 | lm loss: 2.683427E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.305 | TFLOPs: 42.74 | +7: iteration 62560/ 115203 | consumed samples: 16015360 | consumed tokens: 32799457280 | elapsed time per iteration (s): 0.56 | learning rate: 9.916E-05 | global batch size: 256 | lm loss: 2.674936E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.279 | TFLOPs: 43.31 | +7: iteration 62570/ 115203 | consumed samples: 16017920 | consumed tokens: 32804700160 | elapsed time per iteration (s): 0.56 | learning rate: 9.914E-05 | global batch size: 256 | lm loss: 2.683704E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.355 | TFLOPs: 43.89 | +7: iteration 62580/ 115203 | consumed samples: 16020480 | consumed tokens: 32809943040 | elapsed time per iteration (s): 0.56 | learning rate: 9.911E-05 | global batch size: 256 | lm loss: 2.672293E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.945 | TFLOPs: 43.28 | +7: iteration 62590/ 115203 | consumed samples: 16023040 | consumed tokens: 32815185920 | elapsed time per iteration (s): 0.56 | learning rate: 9.909E-05 | global batch size: 256 | lm loss: 2.689756E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.917 | TFLOPs: 43.28 | +7: iteration 62600/ 115203 | consumed samples: 16025600 | consumed tokens: 32820428800 | elapsed time per iteration (s): 0.58 | learning rate: 9.906E-05 | global batch size: 256 | lm loss: 2.685042E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.193 | TFLOPs: 42.06 | +7: iteration 62610/ 115203 | consumed samples: 16028160 | consumed tokens: 32825671680 | elapsed time per iteration (s): 0.56 | learning rate: 9.904E-05 | global batch size: 256 | lm loss: 2.684889E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.195 | TFLOPs: 43.78 | +7: iteration 62620/ 115203 | consumed samples: 16030720 | consumed tokens: 32830914560 | elapsed time per iteration (s): 0.57 | learning rate: 9.902E-05 | global batch size: 256 | lm loss: 2.694915E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.347 | TFLOPs: 43.03 | +7: iteration 62630/ 115203 | consumed samples: 16033280 | consumed tokens: 32836157440 | elapsed time per iteration (s): 0.58 | learning rate: 9.899E-05 | global batch size: 256 | lm loss: 2.676740E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.155 | TFLOPs: 42.44 | +7: iteration 62640/ 115203 | consumed samples: 16035840 | consumed tokens: 32841400320 | elapsed time per iteration (s): 0.56 | learning rate: 9.897E-05 | global batch size: 256 | lm loss: 2.690858E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.564 | TFLOPs: 43.72 | +7: iteration 62650/ 115203 | consumed samples: 16038400 | consumed tokens: 32846643200 | elapsed time per iteration (s): 0.56 | learning rate: 9.894E-05 | global batch size: 256 | lm loss: 2.676913E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.886 | TFLOPs: 43.46 | +7: iteration 62660/ 115203 | consumed samples: 16040960 | consumed tokens: 32851886080 | elapsed time per iteration (s): 0.58 | learning rate: 9.892E-05 | global batch size: 256 | lm loss: 2.672507E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.090 | TFLOPs: 42.24 | +7: iteration 62670/ 115203 | consumed samples: 16043520 | consumed tokens: 32857128960 | elapsed time per iteration (s): 0.56 | learning rate: 9.889E-05 | global batch size: 256 | lm loss: 2.683077E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.614 | TFLOPs: 43.63 | +7: iteration 62680/ 115203 | consumed samples: 16046080 | consumed tokens: 32862371840 | elapsed time per iteration (s): 0.56 | learning rate: 9.887E-05 | global batch size: 256 | lm loss: 2.691613E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.035 | TFLOPs: 43.57 | +7: iteration 62690/ 115203 | consumed samples: 16048640 | consumed tokens: 32867614720 | elapsed time per iteration (s): 0.57 | learning rate: 9.884E-05 | global batch size: 256 | lm loss: 2.677505E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.014 | TFLOPs: 43.19 | +7: iteration 62700/ 115203 | consumed samples: 16051200 | consumed tokens: 32872857600 | elapsed time per iteration (s): 0.57 | learning rate: 9.882E-05 | global batch size: 256 | lm loss: 2.669416E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.124 | TFLOPs: 42.72 | +7: iteration 62710/ 115203 | consumed samples: 16053760 | consumed tokens: 32878100480 | elapsed time per iteration (s): 0.57 | learning rate: 9.879E-05 | global batch size: 256 | lm loss: 2.682684E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.101 | TFLOPs: 43.01 | +7: iteration 62720/ 115203 | consumed samples: 16056320 | consumed tokens: 32883343360 | elapsed time per iteration (s): 0.56 | learning rate: 9.877E-05 | global batch size: 256 | lm loss: 2.677616E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.545 | TFLOPs: 43.34 | +7: iteration 62730/ 115203 | consumed samples: 16058880 | consumed tokens: 32888586240 | elapsed time per iteration (s): 0.57 | learning rate: 9.874E-05 | global batch size: 256 | lm loss: 2.687879E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.587 | TFLOPs: 42.67 | +7: iteration 62740/ 115203 | consumed samples: 16061440 | consumed tokens: 32893829120 | elapsed time per iteration (s): 0.56 | learning rate: 9.872E-05 | global batch size: 256 | lm loss: 2.678449E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.528 | TFLOPs: 43.72 | +7: iteration 62750/ 115203 | consumed samples: 16064000 | consumed tokens: 32899072000 | elapsed time per iteration (s): 0.57 | learning rate: 9.870E-05 | global batch size: 256 | lm loss: 2.687354E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.563 | TFLOPs: 42.96 | +7: iteration 62760/ 115203 | consumed samples: 16066560 | consumed tokens: 32904314880 | elapsed time per iteration (s): 0.57 | learning rate: 9.867E-05 | global batch size: 256 | lm loss: 2.695592E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.745 | TFLOPs: 42.88 | +7: iteration 62770/ 115203 | consumed samples: 16069120 | consumed tokens: 32909557760 | elapsed time per iteration (s): 0.56 | learning rate: 9.865E-05 | global batch size: 256 | lm loss: 2.666185E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.011 | TFLOPs: 43.86 | +7: iteration 62780/ 115203 | consumed samples: 16071680 | consumed tokens: 32914800640 | elapsed time per iteration (s): 0.56 | learning rate: 9.862E-05 | global batch size: 256 | lm loss: 2.680215E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.412 | TFLOPs: 43.80 | +7: iteration 62790/ 115203 | consumed samples: 16074240 | consumed tokens: 32920043520 | elapsed time per iteration (s): 0.57 | learning rate: 9.860E-05 | global batch size: 256 | lm loss: 2.681662E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.882 | TFLOPs: 43.08 | +7: iteration 62800/ 115203 | consumed samples: 16076800 | consumed tokens: 32925286400 | elapsed time per iteration (s): 0.56 | learning rate: 9.857E-05 | global batch size: 256 | lm loss: 2.682393E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.290 | TFLOPs: 43.69 | +7: iteration 62810/ 115203 | consumed samples: 16079360 | consumed tokens: 32930529280 | elapsed time per iteration (s): 0.56 | learning rate: 9.855E-05 | global batch size: 256 | lm loss: 2.666545E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.738 | TFLOPs: 43.74 | +7: iteration 62820/ 115203 | consumed samples: 16081920 | consumed tokens: 32935772160 | elapsed time per iteration (s): 0.56 | learning rate: 9.852E-05 | global batch size: 256 | lm loss: 2.683722E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.766 | TFLOPs: 43.45 | +7: iteration 62830/ 115203 | consumed samples: 16084480 | consumed tokens: 32941015040 | elapsed time per iteration (s): 0.56 | learning rate: 9.850E-05 | global batch size: 256 | lm loss: 2.681226E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.923 | TFLOPs: 43.47 | +7: iteration 62840/ 115203 | consumed samples: 16087040 | consumed tokens: 32946257920 | elapsed time per iteration (s): 0.56 | learning rate: 9.847E-05 | global batch size: 256 | lm loss: 2.688563E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.899 | TFLOPs: 43.56 | +7: iteration 62850/ 115203 | consumed samples: 16089600 | consumed tokens: 32951500800 | elapsed time per iteration (s): 0.56 | learning rate: 9.845E-05 | global batch size: 256 | lm loss: 2.679997E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.954 | TFLOPs: 43.85 | +7: iteration 62860/ 115203 | consumed samples: 16092160 | consumed tokens: 32956743680 | elapsed time per iteration (s): 0.58 | learning rate: 9.842E-05 | global batch size: 256 | lm loss: 2.692880E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.980 | TFLOPs: 42.42 | +7: iteration 62870/ 115203 | consumed samples: 16094720 | consumed tokens: 32961986560 | elapsed time per iteration (s): 0.56 | learning rate: 9.840E-05 | global batch size: 256 | lm loss: 2.677601E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.054 | TFLOPs: 43.48 | +7: iteration 62880/ 115203 | consumed samples: 16097280 | consumed tokens: 32967229440 | elapsed time per iteration (s): 0.57 | learning rate: 9.838E-05 | global batch size: 256 | lm loss: 2.687261E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.050 | TFLOPs: 42.53 | +7: iteration 62890/ 115203 | consumed samples: 16099840 | consumed tokens: 32972472320 | elapsed time per iteration (s): 0.56 | learning rate: 9.835E-05 | global batch size: 256 | lm loss: 2.680639E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.257 | TFLOPs: 43.40 | +7: iteration 62900/ 115203 | consumed samples: 16102400 | consumed tokens: 32977715200 | elapsed time per iteration (s): 0.56 | learning rate: 9.833E-05 | global batch size: 256 | lm loss: 2.687843E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.366 | TFLOPs: 43.70 | +7: iteration 62910/ 115203 | consumed samples: 16104960 | consumed tokens: 32982958080 | elapsed time per iteration (s): 0.56 | learning rate: 9.830E-05 | global batch size: 256 | lm loss: 2.682769E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.568 | TFLOPs: 43.72 | +7: iteration 62920/ 115203 | consumed samples: 16107520 | consumed tokens: 32988200960 | elapsed time per iteration (s): 0.56 | learning rate: 9.828E-05 | global batch size: 256 | lm loss: 2.685186E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.230 | TFLOPs: 43.97 | +7: iteration 62930/ 115203 | consumed samples: 16110080 | consumed tokens: 32993443840 | elapsed time per iteration (s): 0.55 | learning rate: 9.825E-05 | global batch size: 256 | lm loss: 2.690669E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.338 | TFLOPs: 43.98 | +7: iteration 62940/ 115203 | consumed samples: 16112640 | consumed tokens: 32998686720 | elapsed time per iteration (s): 0.56 | learning rate: 9.823E-05 | global batch size: 256 | lm loss: 2.666253E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.182 | TFLOPs: 43.97 | +7: iteration 62950/ 115203 | consumed samples: 16115200 | consumed tokens: 33003929600 | elapsed time per iteration (s): 0.56 | learning rate: 9.820E-05 | global batch size: 256 | lm loss: 2.671943E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.239 | TFLOPs: 43.69 | +7: iteration 62960/ 115203 | consumed samples: 16117760 | consumed tokens: 33009172480 | elapsed time per iteration (s): 0.56 | learning rate: 9.818E-05 | global batch size: 256 | lm loss: 2.689253E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.736 | TFLOPs: 43.83 | +7: iteration 62970/ 115203 | consumed samples: 16120320 | consumed tokens: 33014415360 | elapsed time per iteration (s): 0.56 | learning rate: 9.815E-05 | global batch size: 256 | lm loss: 2.679426E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.149 | TFLOPs: 43.87 | +7: iteration 62980/ 115203 | consumed samples: 16122880 | consumed tokens: 33019658240 | elapsed time per iteration (s): 0.56 | learning rate: 9.813E-05 | global batch size: 256 | lm loss: 2.669184E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.085 | TFLOPs: 43.96 | +7: iteration 62990/ 115203 | consumed samples: 16125440 | consumed tokens: 33024901120 | elapsed time per iteration (s): 0.56 | learning rate: 9.811E-05 | global batch size: 256 | lm loss: 2.677179E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.323 | TFLOPs: 43.70 | +7: iteration 63000/ 115203 | consumed samples: 16128000 | consumed tokens: 33030144000 | elapsed time per iteration (s): 0.56 | learning rate: 9.808E-05 | global batch size: 256 | lm loss: 2.682766E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.473 | TFLOPs: 43.71 | +7: iteration 63010/ 115203 | consumed samples: 16130560 | consumed tokens: 33035386880 | elapsed time per iteration (s): 0.55 | learning rate: 9.806E-05 | global batch size: 256 | lm loss: 2.680796E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.307 | TFLOPs: 43.98 | +7: iteration 63020/ 115203 | consumed samples: 16133120 | consumed tokens: 33040629760 | elapsed time per iteration (s): 0.56 | learning rate: 9.803E-05 | global batch size: 256 | lm loss: 2.688526E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.985 | TFLOPs: 43.66 | +7: iteration 63030/ 115203 | consumed samples: 16135680 | consumed tokens: 33045872640 | elapsed time per iteration (s): 0.55 | learning rate: 9.801E-05 | global batch size: 256 | lm loss: 2.676029E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 63040/ 115203 | consumed samples: 16138240 | consumed tokens: 33051115520 | elapsed time per iteration (s): 0.57 | learning rate: 9.798E-05 | global batch size: 256 | lm loss: 2.685335E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.101 | TFLOPs: 42.53 | +7: iteration 63050/ 115203 | consumed samples: 16140800 | consumed tokens: 33056358400 | elapsed time per iteration (s): 0.57 | learning rate: 9.796E-05 | global batch size: 256 | lm loss: 2.680293E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.423 | TFLOPs: 43.04 | +7: iteration 63060/ 115203 | consumed samples: 16143360 | consumed tokens: 33061601280 | elapsed time per iteration (s): 0.56 | learning rate: 9.793E-05 | global batch size: 256 | lm loss: 2.663118E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.211 | TFLOPs: 43.21 | +7: iteration 63070/ 115203 | consumed samples: 16145920 | consumed tokens: 33066844160 | elapsed time per iteration (s): 0.56 | learning rate: 9.791E-05 | global batch size: 256 | lm loss: 2.680998E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.070 | TFLOPs: 43.67 | +7: iteration 63080/ 115203 | consumed samples: 16148480 | consumed tokens: 33072087040 | elapsed time per iteration (s): 0.56 | learning rate: 9.788E-05 | global batch size: 256 | lm loss: 2.680285E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.066 | TFLOPs: 43.67 | +7: iteration 63090/ 115203 | consumed samples: 16151040 | consumed tokens: 33077329920 | elapsed time per iteration (s): 0.56 | learning rate: 9.786E-05 | global batch size: 256 | lm loss: 2.674590E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.125 | TFLOPs: 43.68 | +7: iteration 63100/ 115203 | consumed samples: 16153600 | consumed tokens: 33082572800 | elapsed time per iteration (s): 0.56 | learning rate: 9.784E-05 | global batch size: 256 | lm loss: 2.688325E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.638 | TFLOPs: 43.34 | +7: iteration 63110/ 115203 | consumed samples: 16156160 | consumed tokens: 33087815680 | elapsed time per iteration (s): 0.56 | learning rate: 9.781E-05 | global batch size: 256 | lm loss: 2.678752E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.760 | TFLOPs: 43.36 | +7: iteration 63120/ 115203 | consumed samples: 16158720 | consumed tokens: 33093058560 | elapsed time per iteration (s): 0.56 | learning rate: 9.779E-05 | global batch size: 256 | lm loss: 2.681146E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.310 | TFLOPs: 43.50 | +7: iteration 63130/ 115203 | consumed samples: 16161280 | consumed tokens: 33098301440 | elapsed time per iteration (s): 0.56 | learning rate: 9.776E-05 | global batch size: 256 | lm loss: 2.686855E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.588 | TFLOPs: 43.24 | +7: iteration 63140/ 115203 | consumed samples: 16163840 | consumed tokens: 33103544320 | elapsed time per iteration (s): 0.56 | learning rate: 9.774E-05 | global batch size: 256 | lm loss: 2.676758E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.567 | TFLOPs: 43.62 | +7: iteration 63150/ 115203 | consumed samples: 16166400 | consumed tokens: 33108787200 | elapsed time per iteration (s): 0.56 | learning rate: 9.771E-05 | global batch size: 256 | lm loss: 2.673182E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.284 | TFLOPs: 43.69 | +7: iteration 63160/ 115203 | consumed samples: 16168960 | consumed tokens: 33114030080 | elapsed time per iteration (s): 0.56 | learning rate: 9.769E-05 | global batch size: 256 | lm loss: 2.684557E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.900 | TFLOPs: 43.56 | +7: iteration 63170/ 115203 | consumed samples: 16171520 | consumed tokens: 33119272960 | elapsed time per iteration (s): 0.56 | learning rate: 9.766E-05 | global batch size: 256 | lm loss: 2.671409E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.145 | TFLOPs: 43.49 | +7: iteration 63180/ 115203 | consumed samples: 16174080 | consumed tokens: 33124515840 | elapsed time per iteration (s): 0.56 | learning rate: 9.764E-05 | global batch size: 256 | lm loss: 2.688219E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.355 | TFLOPs: 43.89 | +7: iteration 63190/ 115203 | consumed samples: 16176640 | consumed tokens: 33129758720 | elapsed time per iteration (s): 0.55 | learning rate: 9.761E-05 | global batch size: 256 | lm loss: 2.671704E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.429 | TFLOPs: 43.99 | +7: iteration 63200/ 115203 | consumed samples: 16179200 | consumed tokens: 33135001600 | elapsed time per iteration (s): 0.56 | learning rate: 9.759E-05 | global batch size: 256 | lm loss: 2.667284E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.758 | TFLOPs: 43.45 | +7: iteration 63210/ 115203 | consumed samples: 16181760 | consumed tokens: 33140244480 | elapsed time per iteration (s): 0.57 | learning rate: 9.757E-05 | global batch size: 256 | lm loss: 2.690284E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.674 | TFLOPs: 43.16 | +7: iteration 63220/ 115203 | consumed samples: 16184320 | consumed tokens: 33145487360 | elapsed time per iteration (s): 0.56 | learning rate: 9.754E-05 | global batch size: 256 | lm loss: 2.682257E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.480 | TFLOPs: 43.52 | +7: iteration 63230/ 115203 | consumed samples: 16186880 | consumed tokens: 33150730240 | elapsed time per iteration (s): 0.56 | learning rate: 9.752E-05 | global batch size: 256 | lm loss: 2.679686E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.097 | TFLOPs: 43.48 | +7: iteration 63240/ 115203 | consumed samples: 16189440 | consumed tokens: 33155973120 | elapsed time per iteration (s): 0.57 | learning rate: 9.749E-05 | global batch size: 256 | lm loss: 2.682581E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.474 | TFLOPs: 42.57 | +7: iteration 63250/ 115203 | consumed samples: 16192000 | consumed tokens: 33161216000 | elapsed time per iteration (s): 0.56 | learning rate: 9.747E-05 | global batch size: 256 | lm loss: 2.672944E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.818 | TFLOPs: 43.65 | +7: iteration 63260/ 115203 | consumed samples: 16194560 | consumed tokens: 33166458880 | elapsed time per iteration (s): 0.57 | learning rate: 9.744E-05 | global batch size: 256 | lm loss: 2.679602E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.534 | TFLOPs: 42.48 | +7: iteration 63270/ 115203 | consumed samples: 16197120 | consumed tokens: 33171701760 | elapsed time per iteration (s): 0.57 | learning rate: 9.742E-05 | global batch size: 256 | lm loss: 2.674499E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.950 | TFLOPs: 42.90 | +7: iteration 63280/ 115203 | consumed samples: 16199680 | consumed tokens: 33176944640 | elapsed time per iteration (s): 0.56 | learning rate: 9.739E-05 | global batch size: 256 | lm loss: 2.686185E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.842 | TFLOPs: 43.65 | +7: iteration 63290/ 115203 | consumed samples: 16202240 | consumed tokens: 33182187520 | elapsed time per iteration (s): 0.56 | learning rate: 9.737E-05 | global batch size: 256 | lm loss: 2.667083E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.753 | TFLOPs: 43.93 | +7: iteration 63300/ 115203 | consumed samples: 16204800 | consumed tokens: 33187430400 | elapsed time per iteration (s): 0.55 | learning rate: 9.734E-05 | global batch size: 256 | lm loss: 2.674357E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 63310/ 115203 | consumed samples: 16207360 | consumed tokens: 33192673280 | elapsed time per iteration (s): 0.56 | learning rate: 9.732E-05 | global batch size: 256 | lm loss: 2.683796E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.173 | TFLOPs: 43.40 | +7: iteration 63320/ 115203 | consumed samples: 16209920 | consumed tokens: 33197916160 | elapsed time per iteration (s): 0.56 | learning rate: 9.730E-05 | global batch size: 256 | lm loss: 2.676490E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.980 | TFLOPs: 43.66 | +7: iteration 63330/ 115203 | consumed samples: 16212480 | consumed tokens: 33203159040 | elapsed time per iteration (s): 0.56 | learning rate: 9.727E-05 | global batch size: 256 | lm loss: 2.684741E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.553 | TFLOPs: 43.62 | +7: iteration 63340/ 115203 | consumed samples: 16215040 | consumed tokens: 33208401920 | elapsed time per iteration (s): 0.56 | learning rate: 9.725E-05 | global batch size: 256 | lm loss: 2.683727E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.541 | TFLOPs: 43.81 | +7: iteration 63350/ 115203 | consumed samples: 16217600 | consumed tokens: 33213644800 | elapsed time per iteration (s): 0.55 | learning rate: 9.722E-05 | global batch size: 256 | lm loss: 2.674355E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.668 | TFLOPs: 44.02 | +7: iteration 63360/ 115203 | consumed samples: 16220160 | consumed tokens: 33218887680 | elapsed time per iteration (s): 0.56 | learning rate: 9.720E-05 | global batch size: 256 | lm loss: 2.679050E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.330 | TFLOPs: 43.41 | +7: iteration 63370/ 115203 | consumed samples: 16222720 | consumed tokens: 33224130560 | elapsed time per iteration (s): 0.56 | learning rate: 9.717E-05 | global batch size: 256 | lm loss: 2.685559E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.105 | TFLOPs: 43.68 | +7: iteration 63380/ 115203 | consumed samples: 16225280 | consumed tokens: 33229373440 | elapsed time per iteration (s): 0.57 | learning rate: 9.715E-05 | global batch size: 256 | lm loss: 2.690938E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.868 | TFLOPs: 42.89 | +7: iteration 63390/ 115203 | consumed samples: 16227840 | consumed tokens: 33234616320 | elapsed time per iteration (s): 0.56 | learning rate: 9.712E-05 | global batch size: 256 | lm loss: 2.683030E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.623 | TFLOPs: 43.72 | +7: iteration 63400/ 115203 | consumed samples: 16230400 | consumed tokens: 33239859200 | elapsed time per iteration (s): 0.56 | learning rate: 9.710E-05 | global batch size: 256 | lm loss: 2.680192E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.843 | TFLOPs: 43.75 | +7: iteration 63410/ 115203 | consumed samples: 16232960 | consumed tokens: 33245102080 | elapsed time per iteration (s): 0.56 | learning rate: 9.707E-05 | global batch size: 256 | lm loss: 2.685606E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.967 | TFLOPs: 43.57 | +7: iteration 63420/ 115203 | consumed samples: 16235520 | consumed tokens: 33250344960 | elapsed time per iteration (s): 0.56 | learning rate: 9.705E-05 | global batch size: 256 | lm loss: 2.664976E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.940 | TFLOPs: 43.28 | +7: iteration 63430/ 115203 | consumed samples: 16238080 | consumed tokens: 33255587840 | elapsed time per iteration (s): 0.56 | learning rate: 9.703E-05 | global batch size: 256 | lm loss: 2.683289E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.309 | TFLOPs: 43.41 | +7: iteration 63440/ 115203 | consumed samples: 16240640 | consumed tokens: 33260830720 | elapsed time per iteration (s): 0.56 | learning rate: 9.700E-05 | global batch size: 256 | lm loss: 2.680728E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.697 | TFLOPs: 43.45 | +7: iteration 63450/ 115203 | consumed samples: 16243200 | consumed tokens: 33266073600 | elapsed time per iteration (s): 0.57 | learning rate: 9.698E-05 | global batch size: 256 | lm loss: 2.677161E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.472 | TFLOPs: 42.95 | +7: iteration 63460/ 115203 | consumed samples: 16245760 | consumed tokens: 33271316480 | elapsed time per iteration (s): 0.57 | learning rate: 9.695E-05 | global batch size: 256 | lm loss: 2.682483E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.505 | TFLOPs: 42.47 | +7: iteration 63470/ 115203 | consumed samples: 16248320 | consumed tokens: 33276559360 | elapsed time per iteration (s): 0.56 | learning rate: 9.693E-05 | global batch size: 256 | lm loss: 2.672830E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.132 | TFLOPs: 43.58 | +7: iteration 63480/ 115203 | consumed samples: 16250880 | consumed tokens: 33281802240 | elapsed time per iteration (s): 0.56 | learning rate: 9.690E-05 | global batch size: 256 | lm loss: 2.684403E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.593 | TFLOPs: 43.53 | +7: iteration 63490/ 115203 | consumed samples: 16253440 | consumed tokens: 33287045120 | elapsed time per iteration (s): 0.56 | learning rate: 9.688E-05 | global batch size: 256 | lm loss: 2.673223E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.926 | TFLOPs: 43.75 | +7: iteration 63500/ 115203 | consumed samples: 16256000 | consumed tokens: 33292288000 | elapsed time per iteration (s): 0.56 | learning rate: 9.685E-05 | global batch size: 256 | lm loss: 2.673227E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.520 | TFLOPs: 43.62 | +7: iteration 63510/ 115203 | consumed samples: 16258560 | consumed tokens: 33297530880 | elapsed time per iteration (s): 0.57 | learning rate: 9.683E-05 | global batch size: 256 | lm loss: 2.677188E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.839 | TFLOPs: 42.60 | +7: iteration 63520/ 115203 | consumed samples: 16261120 | consumed tokens: 33302773760 | elapsed time per iteration (s): 0.57 | learning rate: 9.680E-05 | global batch size: 256 | lm loss: 2.684913E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.169 | TFLOPs: 42.92 | +7: iteration 63530/ 115203 | consumed samples: 16263680 | consumed tokens: 33308016640 | elapsed time per iteration (s): 0.57 | learning rate: 9.678E-05 | global batch size: 256 | lm loss: 2.685379E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.680 | TFLOPs: 42.78 | +7: iteration 63540/ 115203 | consumed samples: 16266240 | consumed tokens: 33313259520 | elapsed time per iteration (s): 0.56 | learning rate: 9.676E-05 | global batch size: 256 | lm loss: 2.677774E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.191 | TFLOPs: 43.59 | +7: iteration 63550/ 115203 | consumed samples: 16268800 | consumed tokens: 33318502400 | elapsed time per iteration (s): 0.56 | learning rate: 9.673E-05 | global batch size: 256 | lm loss: 2.682550E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.747 | TFLOPs: 43.45 | +7: iteration 63560/ 115203 | consumed samples: 16271360 | consumed tokens: 33323745280 | elapsed time per iteration (s): 0.59 | learning rate: 9.671E-05 | global batch size: 256 | lm loss: 2.674788E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.388 | TFLOPs: 41.22 | +7: iteration 63570/ 115203 | consumed samples: 16273920 | consumed tokens: 33328988160 | elapsed time per iteration (s): 0.58 | learning rate: 9.668E-05 | global batch size: 256 | lm loss: 2.682229E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.434 | TFLOPs: 42.18 | +7: iteration 63580/ 115203 | consumed samples: 16276480 | consumed tokens: 33334231040 | elapsed time per iteration (s): 0.57 | learning rate: 9.666E-05 | global batch size: 256 | lm loss: 2.669880E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.539 | TFLOPs: 43.14 | +7: iteration 63590/ 115203 | consumed samples: 16279040 | consumed tokens: 33339473920 | elapsed time per iteration (s): 0.55 | learning rate: 9.663E-05 | global batch size: 256 | lm loss: 2.680791E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.590 | TFLOPs: 44.01 | +7: iteration 63600/ 115203 | consumed samples: 16281600 | consumed tokens: 33344716800 | elapsed time per iteration (s): 0.56 | learning rate: 9.661E-05 | global batch size: 256 | lm loss: 2.694819E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 63610/ 115203 | consumed samples: 16284160 | consumed tokens: 33349959680 | elapsed time per iteration (s): 0.56 | learning rate: 9.658E-05 | global batch size: 256 | lm loss: 2.678831E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.617 | TFLOPs: 43.63 | +7: iteration 63620/ 115203 | consumed samples: 16286720 | consumed tokens: 33355202560 | elapsed time per iteration (s): 0.57 | learning rate: 9.656E-05 | global batch size: 256 | lm loss: 2.685940E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.271 | TFLOPs: 42.55 | +7: iteration 63630/ 115203 | consumed samples: 16289280 | consumed tokens: 33360445440 | elapsed time per iteration (s): 0.56 | learning rate: 9.653E-05 | global batch size: 256 | lm loss: 2.689271E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.685 | TFLOPs: 43.44 | +7: iteration 63640/ 115203 | consumed samples: 16291840 | consumed tokens: 33365688320 | elapsed time per iteration (s): 0.57 | learning rate: 9.651E-05 | global batch size: 256 | lm loss: 2.664557E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.916 | TFLOPs: 42.61 | +7: iteration 63650/ 115203 | consumed samples: 16294400 | consumed tokens: 33370931200 | elapsed time per iteration (s): 0.57 | learning rate: 9.649E-05 | global batch size: 256 | lm loss: 2.690533E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.666 | TFLOPs: 42.58 | +7: iteration 63660/ 115203 | consumed samples: 16296960 | consumed tokens: 33376174080 | elapsed time per iteration (s): 0.55 | learning rate: 9.646E-05 | global batch size: 256 | lm loss: 2.683300E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.581 | TFLOPs: 44.01 | +7: iteration 63670/ 115203 | consumed samples: 16299520 | consumed tokens: 33381416960 | elapsed time per iteration (s): 0.58 | learning rate: 9.644E-05 | global batch size: 256 | lm loss: 2.677820E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.974 | TFLOPs: 42.23 | +7: iteration 63680/ 115203 | consumed samples: 16302080 | consumed tokens: 33386659840 | elapsed time per iteration (s): 0.58 | learning rate: 9.641E-05 | global batch size: 256 | lm loss: 2.679260E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.415 | TFLOPs: 41.99 | +7: iteration 63690/ 115203 | consumed samples: 16304640 | consumed tokens: 33391902720 | elapsed time per iteration (s): 0.57 | learning rate: 9.639E-05 | global batch size: 256 | lm loss: 2.690580E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.024 | TFLOPs: 42.81 | +7: iteration 63700/ 115203 | consumed samples: 16307200 | consumed tokens: 33397145600 | elapsed time per iteration (s): 0.59 | learning rate: 9.636E-05 | global batch size: 256 | lm loss: 2.681647E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.389 | TFLOPs: 41.70 | +7: iteration 63710/ 115203 | consumed samples: 16309760 | consumed tokens: 33402388480 | elapsed time per iteration (s): 0.56 | learning rate: 9.634E-05 | global batch size: 256 | lm loss: 2.667637E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.097 | TFLOPs: 43.58 | +7: iteration 63720/ 115203 | consumed samples: 16312320 | consumed tokens: 33407631360 | elapsed time per iteration (s): 0.58 | learning rate: 9.631E-05 | global batch size: 256 | lm loss: 2.675318E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.756 | TFLOPs: 42.31 | +7: iteration 63730/ 115203 | consumed samples: 16314880 | consumed tokens: 33412874240 | elapsed time per iteration (s): 0.57 | learning rate: 9.629E-05 | global batch size: 256 | lm loss: 2.668433E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.384 | TFLOPs: 42.65 | +7: iteration 63740/ 115203 | consumed samples: 16317440 | consumed tokens: 33418117120 | elapsed time per iteration (s): 0.56 | learning rate: 9.627E-05 | global batch size: 256 | lm loss: 2.689406E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.380 | TFLOPs: 43.80 | +7: iteration 63750/ 115203 | consumed samples: 16320000 | consumed tokens: 33423360000 | elapsed time per iteration (s): 0.55 | learning rate: 9.624E-05 | global batch size: 256 | lm loss: 2.690497E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.539 | TFLOPs: 44.00 | +7: iteration 63760/ 115203 | consumed samples: 16322560 | consumed tokens: 33428602880 | elapsed time per iteration (s): 0.57 | learning rate: 9.622E-05 | global batch size: 256 | lm loss: 2.681326E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.505 | TFLOPs: 43.05 | +7: iteration 63770/ 115203 | consumed samples: 16325120 | consumed tokens: 33433845760 | elapsed time per iteration (s): 0.56 | learning rate: 9.619E-05 | global batch size: 256 | lm loss: 2.690154E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.948 | TFLOPs: 43.47 | +7: iteration 63780/ 115203 | consumed samples: 16327680 | consumed tokens: 33439088640 | elapsed time per iteration (s): 0.56 | learning rate: 9.617E-05 | global batch size: 256 | lm loss: 2.679020E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.174 | TFLOPs: 43.97 | +7: iteration 63790/ 115203 | consumed samples: 16330240 | consumed tokens: 33444331520 | elapsed time per iteration (s): 0.57 | learning rate: 9.614E-05 | global batch size: 256 | lm loss: 2.671607E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.676 | TFLOPs: 43.06 | +7: iteration 63800/ 115203 | consumed samples: 16332800 | consumed tokens: 33449574400 | elapsed time per iteration (s): 0.56 | learning rate: 9.612E-05 | global batch size: 256 | lm loss: 2.682119E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.448 | TFLOPs: 43.23 | +7: iteration 63810/ 115203 | consumed samples: 16335360 | consumed tokens: 33454817280 | elapsed time per iteration (s): 0.56 | learning rate: 9.609E-05 | global batch size: 256 | lm loss: 2.694987E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.668 | TFLOPs: 43.25 | +7: iteration 63820/ 115203 | consumed samples: 16337920 | consumed tokens: 33460060160 | elapsed time per iteration (s): 0.56 | learning rate: 9.607E-05 | global batch size: 256 | lm loss: 2.650805E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.954 | TFLOPs: 43.37 | +7: iteration 63830/ 115203 | consumed samples: 16340480 | consumed tokens: 33465303040 | elapsed time per iteration (s): 0.56 | learning rate: 9.604E-05 | global batch size: 256 | lm loss: 2.683795E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.585 | TFLOPs: 43.34 | +7: iteration 63840/ 115203 | consumed samples: 16343040 | consumed tokens: 33470545920 | elapsed time per iteration (s): 0.56 | learning rate: 9.602E-05 | global batch size: 256 | lm loss: 2.667270E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.727 | TFLOPs: 43.26 | +7: iteration 63850/ 115203 | consumed samples: 16345600 | consumed tokens: 33475788800 | elapsed time per iteration (s): 0.56 | learning rate: 9.600E-05 | global batch size: 256 | lm loss: 2.688914E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.744 | TFLOPs: 43.93 | +7: iteration 63860/ 115203 | consumed samples: 16348160 | consumed tokens: 33481031680 | elapsed time per iteration (s): 0.57 | learning rate: 9.597E-05 | global batch size: 256 | lm loss: 2.674353E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.361 | TFLOPs: 42.75 | +7: iteration 63870/ 115203 | consumed samples: 16350720 | consumed tokens: 33486274560 | elapsed time per iteration (s): 0.56 | learning rate: 9.595E-05 | global batch size: 256 | lm loss: 2.682387E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.123 | TFLOPs: 43.68 | +7: iteration 63880/ 115203 | consumed samples: 16353280 | consumed tokens: 33491517440 | elapsed time per iteration (s): 0.56 | learning rate: 9.592E-05 | global batch size: 256 | lm loss: 2.681120E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.391 | TFLOPs: 43.89 | +7: iteration 63890/ 115203 | consumed samples: 16355840 | consumed tokens: 33496760320 | elapsed time per iteration (s): 0.57 | learning rate: 9.590E-05 | global batch size: 256 | lm loss: 2.693212E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.334 | TFLOPs: 42.74 | +7: iteration 63900/ 115203 | consumed samples: 16358400 | consumed tokens: 33502003200 | elapsed time per iteration (s): 0.57 | learning rate: 9.587E-05 | global batch size: 256 | lm loss: 2.674877E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.443 | TFLOPs: 43.04 | +7: iteration 63910/ 115203 | consumed samples: 16360960 | consumed tokens: 33507246080 | elapsed time per iteration (s): 0.56 | learning rate: 9.585E-05 | global batch size: 256 | lm loss: 2.690648E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.732 | TFLOPs: 43.74 | +7: iteration 63920/ 115203 | consumed samples: 16363520 | consumed tokens: 33512488960 | elapsed time per iteration (s): 0.57 | learning rate: 9.582E-05 | global batch size: 256 | lm loss: 2.692240E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.707 | TFLOPs: 43.07 | +7: iteration 63930/ 115203 | consumed samples: 16366080 | consumed tokens: 33517731840 | elapsed time per iteration (s): 0.55 | learning rate: 9.580E-05 | global batch size: 256 | lm loss: 2.665031E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.540 | TFLOPs: 44.00 | +7: iteration 63940/ 115203 | consumed samples: 16368640 | consumed tokens: 33522974720 | elapsed time per iteration (s): 0.58 | learning rate: 9.578E-05 | global batch size: 256 | lm loss: 2.672047E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.261 | TFLOPs: 41.97 | +7: iteration 63950/ 115203 | consumed samples: 16371200 | consumed tokens: 33528217600 | elapsed time per iteration (s): 0.57 | learning rate: 9.575E-05 | global batch size: 256 | lm loss: 2.670943E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.815 | TFLOPs: 43.17 | +7: iteration 63960/ 115203 | consumed samples: 16373760 | consumed tokens: 33533460480 | elapsed time per iteration (s): 0.58 | learning rate: 9.573E-05 | global batch size: 256 | lm loss: 2.678568E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.555 | TFLOPs: 42.19 | +7: iteration 63970/ 115203 | consumed samples: 16376320 | consumed tokens: 33538703360 | elapsed time per iteration (s): 0.56 | learning rate: 9.570E-05 | global batch size: 256 | lm loss: 2.670440E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.784 | TFLOPs: 43.55 | +7: iteration 63980/ 115203 | consumed samples: 16378880 | consumed tokens: 33543946240 | elapsed time per iteration (s): 0.56 | learning rate: 9.568E-05 | global batch size: 256 | lm loss: 2.681762E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.871 | TFLOPs: 43.65 | +7: iteration 63990/ 115203 | consumed samples: 16381440 | consumed tokens: 33549189120 | elapsed time per iteration (s): 0.57 | learning rate: 9.565E-05 | global batch size: 256 | lm loss: 2.665327E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.066 | TFLOPs: 43.19 | +0: [2023-03-16 22:52:15,790] [INFO] [logging.py:68:log_dist] [Rank 0] step=64000, skipped=0, lr=[9.56284709392273e-05, 9.56284709392273e-05, 9.56284709392273e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 64000/ 115203 | consumed samples: 16384000 | consumed tokens: 33554432000 | elapsed time per iteration (s): 0.56 | learning rate: 9.563E-05 | global batch size: 256 | lm loss: 2.674652E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.525 | TFLOPs: 43.43 | +0: steps: 64000 loss: 2.6628 iter time (s): 0.563 samples/sec: 454.802 +7: iteration 64010/ 115203 | consumed samples: 16386560 | consumed tokens: 33559674880 | elapsed time per iteration (s): 0.56 | learning rate: 9.560E-05 | global batch size: 256 | lm loss: 2.671576E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.840 | TFLOPs: 43.27 | +7: iteration 64020/ 115203 | consumed samples: 16389120 | consumed tokens: 33564917760 | elapsed time per iteration (s): 0.56 | learning rate: 9.558E-05 | global batch size: 256 | lm loss: 2.678518E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.088 | TFLOPs: 43.77 | +7: iteration 64030/ 115203 | consumed samples: 16391680 | consumed tokens: 33570160640 | elapsed time per iteration (s): 0.56 | learning rate: 9.556E-05 | global batch size: 256 | lm loss: 2.660449E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.916 | TFLOPs: 43.94 | +7: iteration 64040/ 115203 | consumed samples: 16394240 | consumed tokens: 33575403520 | elapsed time per iteration (s): 0.58 | learning rate: 9.553E-05 | global batch size: 256 | lm loss: 2.678586E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.164 | TFLOPs: 42.06 | +7: iteration 64050/ 115203 | consumed samples: 16396800 | consumed tokens: 33580646400 | elapsed time per iteration (s): 0.56 | learning rate: 9.551E-05 | global batch size: 256 | lm loss: 2.686515E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.116 | TFLOPs: 43.30 | +7: iteration 64060/ 115203 | consumed samples: 16399360 | consumed tokens: 33585889280 | elapsed time per iteration (s): 0.56 | learning rate: 9.548E-05 | global batch size: 256 | lm loss: 2.675721E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.050 | TFLOPs: 43.48 | +7: iteration 64070/ 115203 | consumed samples: 16401920 | consumed tokens: 33591132160 | elapsed time per iteration (s): 0.57 | learning rate: 9.546E-05 | global batch size: 256 | lm loss: 2.679817E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.561 | TFLOPs: 42.86 | +7: iteration 64080/ 115203 | consumed samples: 16404480 | consumed tokens: 33596375040 | elapsed time per iteration (s): 0.56 | learning rate: 9.543E-05 | global batch size: 256 | lm loss: 2.682387E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.150 | TFLOPs: 43.39 | +7: iteration 64090/ 115203 | consumed samples: 16407040 | consumed tokens: 33601617920 | elapsed time per iteration (s): 0.56 | learning rate: 9.541E-05 | global batch size: 256 | lm loss: 2.671244E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.843 | TFLOPs: 43.56 | +7: iteration 64100/ 115203 | consumed samples: 16409600 | consumed tokens: 33606860800 | elapsed time per iteration (s): 0.57 | learning rate: 9.538E-05 | global batch size: 256 | lm loss: 2.674820E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.398 | TFLOPs: 43.13 | +7: iteration 64110/ 115203 | consumed samples: 16412160 | consumed tokens: 33612103680 | elapsed time per iteration (s): 0.55 | learning rate: 9.536E-05 | global batch size: 256 | lm loss: 2.670314E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.384 | TFLOPs: 43.99 | +7: iteration 64120/ 115203 | consumed samples: 16414720 | consumed tokens: 33617346560 | elapsed time per iteration (s): 0.57 | learning rate: 9.533E-05 | global batch size: 256 | lm loss: 2.673456E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.949 | TFLOPs: 42.90 | +7: iteration 64130/ 115203 | consumed samples: 16417280 | consumed tokens: 33622589440 | elapsed time per iteration (s): 0.56 | learning rate: 9.531E-05 | global batch size: 256 | lm loss: 2.669548E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.858 | TFLOPs: 43.46 | +7: iteration 64140/ 115203 | consumed samples: 16419840 | consumed tokens: 33627832320 | elapsed time per iteration (s): 0.56 | learning rate: 9.529E-05 | global batch size: 256 | lm loss: 2.674135E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.134 | TFLOPs: 43.68 | +7: iteration 64150/ 115203 | consumed samples: 16422400 | consumed tokens: 33633075200 | elapsed time per iteration (s): 0.56 | learning rate: 9.526E-05 | global batch size: 256 | lm loss: 2.666304E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.791 | TFLOPs: 43.45 | +7: iteration 64160/ 115203 | consumed samples: 16424960 | consumed tokens: 33638318080 | elapsed time per iteration (s): 0.57 | learning rate: 9.524E-05 | global batch size: 256 | lm loss: 2.682323E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.016 | TFLOPs: 42.81 | +7: iteration 64170/ 115203 | consumed samples: 16427520 | consumed tokens: 33643560960 | elapsed time per iteration (s): 0.57 | learning rate: 9.521E-05 | global batch size: 256 | lm loss: 2.683002E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.247 | TFLOPs: 42.93 | +7: iteration 64180/ 115203 | consumed samples: 16430080 | consumed tokens: 33648803840 | elapsed time per iteration (s): 0.56 | learning rate: 9.519E-05 | global batch size: 256 | lm loss: 2.680788E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.881 | TFLOPs: 43.75 | +7: iteration 64190/ 115203 | consumed samples: 16432640 | consumed tokens: 33654046720 | elapsed time per iteration (s): 0.56 | learning rate: 9.516E-05 | global batch size: 256 | lm loss: 2.681224E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.273 | TFLOPs: 43.60 | +7: iteration 64200/ 115203 | consumed samples: 16435200 | consumed tokens: 33659289600 | elapsed time per iteration (s): 0.56 | learning rate: 9.514E-05 | global batch size: 256 | lm loss: 2.676687E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.440 | TFLOPs: 43.90 | +7: iteration 64210/ 115203 | consumed samples: 16437760 | consumed tokens: 33664532480 | elapsed time per iteration (s): 0.56 | learning rate: 9.511E-05 | global batch size: 256 | lm loss: 2.686921E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.159 | TFLOPs: 43.87 | +7: iteration 64220/ 115203 | consumed samples: 16440320 | consumed tokens: 33669775360 | elapsed time per iteration (s): 0.56 | learning rate: 9.509E-05 | global batch size: 256 | lm loss: 2.667660E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.398 | TFLOPs: 43.51 | +7: iteration 64230/ 115203 | consumed samples: 16442880 | consumed tokens: 33675018240 | elapsed time per iteration (s): 0.57 | learning rate: 9.507E-05 | global batch size: 256 | lm loss: 2.679636E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.933 | TFLOPs: 42.51 | +7: iteration 64240/ 115203 | consumed samples: 16445440 | consumed tokens: 33680261120 | elapsed time per iteration (s): 0.57 | learning rate: 9.504E-05 | global batch size: 256 | lm loss: 2.675524E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.493 | TFLOPs: 43.14 | +7: iteration 64250/ 115203 | consumed samples: 16448000 | consumed tokens: 33685504000 | elapsed time per iteration (s): 0.58 | learning rate: 9.502E-05 | global batch size: 256 | lm loss: 2.676799E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.168 | TFLOPs: 42.06 | +7: iteration 64260/ 115203 | consumed samples: 16450560 | consumed tokens: 33690746880 | elapsed time per iteration (s): 0.56 | learning rate: 9.499E-05 | global batch size: 256 | lm loss: 2.663451E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.837 | TFLOPs: 43.75 | +7: iteration 64270/ 115203 | consumed samples: 16453120 | consumed tokens: 33695989760 | elapsed time per iteration (s): 0.56 | learning rate: 9.497E-05 | global batch size: 256 | lm loss: 2.677834E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.778 | TFLOPs: 43.64 | +7: iteration 64280/ 115203 | consumed samples: 16455680 | consumed tokens: 33701232640 | elapsed time per iteration (s): 0.57 | learning rate: 9.494E-05 | global batch size: 256 | lm loss: 2.683253E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.532 | TFLOPs: 42.67 | +7: iteration 64290/ 115203 | consumed samples: 16458240 | consumed tokens: 33706475520 | elapsed time per iteration (s): 0.57 | learning rate: 9.492E-05 | global batch size: 256 | lm loss: 2.669333E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.108 | TFLOPs: 43.10 | +7: iteration 64300/ 115203 | consumed samples: 16460800 | consumed tokens: 33711718400 | elapsed time per iteration (s): 0.57 | learning rate: 9.489E-05 | global batch size: 256 | lm loss: 2.676427E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.040 | TFLOPs: 43.19 | +7: iteration 64310/ 115203 | consumed samples: 16463360 | consumed tokens: 33716961280 | elapsed time per iteration (s): 0.57 | learning rate: 9.487E-05 | global batch size: 256 | lm loss: 2.679937E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.602 | TFLOPs: 42.77 | +7: iteration 64320/ 115203 | consumed samples: 16465920 | consumed tokens: 33722204160 | elapsed time per iteration (s): 0.57 | learning rate: 9.485E-05 | global batch size: 256 | lm loss: 2.678776E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.776 | TFLOPs: 42.98 | +7: iteration 64330/ 115203 | consumed samples: 16468480 | consumed tokens: 33727447040 | elapsed time per iteration (s): 0.57 | learning rate: 9.482E-05 | global batch size: 256 | lm loss: 2.678346E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.339 | TFLOPs: 42.65 | +7: iteration 64340/ 115203 | consumed samples: 16471040 | consumed tokens: 33732689920 | elapsed time per iteration (s): 0.57 | learning rate: 9.480E-05 | global batch size: 256 | lm loss: 2.679734E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.643 | TFLOPs: 42.49 | +7: iteration 64350/ 115203 | consumed samples: 16473600 | consumed tokens: 33737932800 | elapsed time per iteration (s): 0.57 | learning rate: 9.477E-05 | global batch size: 256 | lm loss: 2.672667E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.067 | TFLOPs: 43.00 | +7: iteration 64360/ 115203 | consumed samples: 16476160 | consumed tokens: 33743175680 | elapsed time per iteration (s): 0.57 | learning rate: 9.475E-05 | global batch size: 256 | lm loss: 2.664753E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.333 | TFLOPs: 42.93 | +7: iteration 64370/ 115203 | consumed samples: 16478720 | consumed tokens: 33748418560 | elapsed time per iteration (s): 0.57 | learning rate: 9.472E-05 | global batch size: 256 | lm loss: 2.665006E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.299 | TFLOPs: 42.45 | +7: iteration 64380/ 115203 | consumed samples: 16481280 | consumed tokens: 33753661440 | elapsed time per iteration (s): 0.56 | learning rate: 9.470E-05 | global batch size: 256 | lm loss: 2.678581E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.706 | TFLOPs: 43.35 | +7: iteration 64390/ 115203 | consumed samples: 16483840 | consumed tokens: 33758904320 | elapsed time per iteration (s): 0.58 | learning rate: 9.467E-05 | global batch size: 256 | lm loss: 2.681870E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.336 | TFLOPs: 41.79 | +7: iteration 64400/ 115203 | consumed samples: 16486400 | consumed tokens: 33764147200 | elapsed time per iteration (s): 0.60 | learning rate: 9.465E-05 | global batch size: 256 | lm loss: 2.688734E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.168 | TFLOPs: 40.73 | +7: iteration 64410/ 115203 | consumed samples: 16488960 | consumed tokens: 33769390080 | elapsed time per iteration (s): 0.61 | learning rate: 9.463E-05 | global batch size: 256 | lm loss: 2.679211E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.580 | TFLOPs: 40.29 | +7: iteration 64420/ 115203 | consumed samples: 16491520 | consumed tokens: 33774632960 | elapsed time per iteration (s): 0.57 | learning rate: 9.460E-05 | global batch size: 256 | lm loss: 2.683948E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.677 | TFLOPs: 43.16 | +7: iteration 64430/ 115203 | consumed samples: 16494080 | consumed tokens: 33779875840 | elapsed time per iteration (s): 0.57 | learning rate: 9.458E-05 | global batch size: 256 | lm loss: 2.676223E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.396 | TFLOPs: 43.13 | +7: iteration 64440/ 115203 | consumed samples: 16496640 | consumed tokens: 33785118720 | elapsed time per iteration (s): 0.57 | learning rate: 9.455E-05 | global batch size: 256 | lm loss: 2.668797E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.926 | TFLOPs: 43.18 | +7: iteration 64450/ 115203 | consumed samples: 16499200 | consumed tokens: 33790361600 | elapsed time per iteration (s): 0.59 | learning rate: 9.453E-05 | global batch size: 256 | lm loss: 2.666977E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.203 | TFLOPs: 41.21 | +7: iteration 64460/ 115203 | consumed samples: 16501760 | consumed tokens: 33795604480 | elapsed time per iteration (s): 0.58 | learning rate: 9.450E-05 | global batch size: 256 | lm loss: 2.667658E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.514 | TFLOPs: 42.38 | +7: iteration 64470/ 115203 | consumed samples: 16504320 | consumed tokens: 33800847360 | elapsed time per iteration (s): 0.57 | learning rate: 9.448E-05 | global batch size: 256 | lm loss: 2.669040E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.925 | TFLOPs: 42.90 | +7: iteration 64480/ 115203 | consumed samples: 16506880 | consumed tokens: 33806090240 | elapsed time per iteration (s): 0.58 | learning rate: 9.446E-05 | global batch size: 256 | lm loss: 2.672601E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.375 | TFLOPs: 41.79 | +7: iteration 64490/ 115203 | consumed samples: 16509440 | consumed tokens: 33811333120 | elapsed time per iteration (s): 0.59 | learning rate: 9.443E-05 | global batch size: 256 | lm loss: 2.672809E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.329 | TFLOPs: 41.03 | +7: iteration 64500/ 115203 | consumed samples: 16512000 | consumed tokens: 33816576000 | elapsed time per iteration (s): 0.59 | learning rate: 9.441E-05 | global batch size: 256 | lm loss: 2.667587E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.818 | TFLOPs: 41.07 | +7: iteration 64510/ 115203 | consumed samples: 16514560 | consumed tokens: 33821818880 | elapsed time per iteration (s): 0.61 | learning rate: 9.438E-05 | global batch size: 256 | lm loss: 2.667573E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.547 | TFLOPs: 40.29 | +7: iteration 64520/ 115203 | consumed samples: 16517120 | consumed tokens: 33827061760 | elapsed time per iteration (s): 0.58 | learning rate: 9.436E-05 | global batch size: 256 | lm loss: 2.682470E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.803 | TFLOPs: 41.74 | +7: iteration 64530/ 115203 | consumed samples: 16519680 | consumed tokens: 33832304640 | elapsed time per iteration (s): 0.57 | learning rate: 9.433E-05 | global batch size: 256 | lm loss: 2.683752E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.828 | TFLOPs: 42.70 | +7: iteration 64540/ 115203 | consumed samples: 16522240 | consumed tokens: 33837547520 | elapsed time per iteration (s): 0.57 | learning rate: 9.431E-05 | global batch size: 256 | lm loss: 2.663503E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.656 | TFLOPs: 42.77 | +7: iteration 64550/ 115203 | consumed samples: 16524800 | consumed tokens: 33842790400 | elapsed time per iteration (s): 0.57 | learning rate: 9.428E-05 | global batch size: 256 | lm loss: 2.667526E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.670 | TFLOPs: 42.59 | +7: iteration 64560/ 115203 | consumed samples: 16527360 | consumed tokens: 33848033280 | elapsed time per iteration (s): 0.56 | learning rate: 9.426E-05 | global batch size: 256 | lm loss: 2.667679E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.897 | TFLOPs: 43.56 | +7: iteration 64570/ 115203 | consumed samples: 16529920 | consumed tokens: 33853276160 | elapsed time per iteration (s): 0.57 | learning rate: 9.424E-05 | global batch size: 256 | lm loss: 2.672610E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.114 | TFLOPs: 42.82 | +7: iteration 64580/ 115203 | consumed samples: 16532480 | consumed tokens: 33858519040 | elapsed time per iteration (s): 0.56 | learning rate: 9.421E-05 | global batch size: 256 | lm loss: 2.676742E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.190 | TFLOPs: 43.30 | +7: iteration 64590/ 115203 | consumed samples: 16535040 | consumed tokens: 33863761920 | elapsed time per iteration (s): 0.58 | learning rate: 9.419E-05 | global batch size: 256 | lm loss: 2.690862E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.305 | TFLOPs: 41.88 | +7: iteration 64600/ 115203 | consumed samples: 16537600 | consumed tokens: 33869004800 | elapsed time per iteration (s): 0.58 | learning rate: 9.416E-05 | global batch size: 256 | lm loss: 2.678989E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.098 | TFLOPs: 41.77 | +7: iteration 64610/ 115203 | consumed samples: 16540160 | consumed tokens: 33874247680 | elapsed time per iteration (s): 0.57 | learning rate: 9.414E-05 | global batch size: 256 | lm loss: 2.667131E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.349 | TFLOPs: 42.55 | +7: iteration 64620/ 115203 | consumed samples: 16542720 | consumed tokens: 33879490560 | elapsed time per iteration (s): 0.57 | learning rate: 9.411E-05 | global batch size: 256 | lm loss: 2.673084E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.850 | TFLOPs: 42.79 | +7: iteration 64630/ 115203 | consumed samples: 16545280 | consumed tokens: 33884733440 | elapsed time per iteration (s): 0.57 | learning rate: 9.409E-05 | global batch size: 256 | lm loss: 2.664591E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.638 | TFLOPs: 42.68 | +7: iteration 64640/ 115203 | consumed samples: 16547840 | consumed tokens: 33889976320 | elapsed time per iteration (s): 0.56 | learning rate: 9.406E-05 | global batch size: 256 | lm loss: 2.667137E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.635 | TFLOPs: 43.54 | +7: iteration 64650/ 115203 | consumed samples: 16550400 | consumed tokens: 33895219200 | elapsed time per iteration (s): 0.57 | learning rate: 9.404E-05 | global batch size: 256 | lm loss: 2.684398E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.470 | TFLOPs: 43.04 | +7: iteration 64660/ 115203 | consumed samples: 16552960 | consumed tokens: 33900462080 | elapsed time per iteration (s): 0.60 | learning rate: 9.402E-05 | global batch size: 256 | lm loss: 2.667577E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.376 | TFLOPs: 40.84 | +7: iteration 64670/ 115203 | consumed samples: 16555520 | consumed tokens: 33905704960 | elapsed time per iteration (s): 0.57 | learning rate: 9.399E-05 | global batch size: 256 | lm loss: 2.672193E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.524 | TFLOPs: 43.14 | +7: iteration 64680/ 115203 | consumed samples: 16558080 | consumed tokens: 33910947840 | elapsed time per iteration (s): 0.58 | learning rate: 9.397E-05 | global batch size: 256 | lm loss: 2.664113E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.937 | TFLOPs: 41.75 | +7: iteration 64690/ 115203 | consumed samples: 16560640 | consumed tokens: 33916190720 | elapsed time per iteration (s): 0.57 | learning rate: 9.394E-05 | global batch size: 256 | lm loss: 2.672481E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.219 | TFLOPs: 42.64 | +7: iteration 64700/ 115203 | consumed samples: 16563200 | consumed tokens: 33921433600 | elapsed time per iteration (s): 0.57 | learning rate: 9.392E-05 | global batch size: 256 | lm loss: 2.673117E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.628 | TFLOPs: 43.06 | +7: iteration 64710/ 115203 | consumed samples: 16565760 | consumed tokens: 33926676480 | elapsed time per iteration (s): 0.57 | learning rate: 9.389E-05 | global batch size: 256 | lm loss: 2.685613E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.630 | TFLOPs: 42.77 | +7: iteration 64720/ 115203 | consumed samples: 16568320 | consumed tokens: 33931919360 | elapsed time per iteration (s): 0.60 | learning rate: 9.387E-05 | global batch size: 256 | lm loss: 2.672727E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.242 | TFLOPs: 40.92 | +7: iteration 64730/ 115203 | consumed samples: 16570880 | consumed tokens: 33937162240 | elapsed time per iteration (s): 0.56 | learning rate: 9.384E-05 | global batch size: 256 | lm loss: 2.674269E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.733 | TFLOPs: 43.54 | +7: iteration 64740/ 115203 | consumed samples: 16573440 | consumed tokens: 33942405120 | elapsed time per iteration (s): 0.56 | learning rate: 9.382E-05 | global batch size: 256 | lm loss: 2.666273E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.448 | TFLOPs: 43.33 | +7: iteration 64750/ 115203 | consumed samples: 16576000 | consumed tokens: 33947648000 | elapsed time per iteration (s): 0.57 | learning rate: 9.380E-05 | global batch size: 256 | lm loss: 2.677039E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.380 | TFLOPs: 42.84 | +7: iteration 64760/ 115203 | consumed samples: 16578560 | consumed tokens: 33952890880 | elapsed time per iteration (s): 0.57 | learning rate: 9.377E-05 | global batch size: 256 | lm loss: 2.675122E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.414 | TFLOPs: 43.04 | +7: iteration 64770/ 115203 | consumed samples: 16581120 | consumed tokens: 33958133760 | elapsed time per iteration (s): 0.57 | learning rate: 9.375E-05 | global batch size: 256 | lm loss: 2.673918E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.396 | TFLOPs: 42.94 | +7: iteration 64780/ 115203 | consumed samples: 16583680 | consumed tokens: 33963376640 | elapsed time per iteration (s): 0.56 | learning rate: 9.372E-05 | global batch size: 256 | lm loss: 2.672910E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.357 | TFLOPs: 43.51 | +7: iteration 64790/ 115203 | consumed samples: 16586240 | consumed tokens: 33968619520 | elapsed time per iteration (s): 0.57 | learning rate: 9.370E-05 | global batch size: 256 | lm loss: 2.663947E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.623 | TFLOPs: 42.96 | +7: iteration 64800/ 115203 | consumed samples: 16588800 | consumed tokens: 33973862400 | elapsed time per iteration (s): 0.56 | learning rate: 9.367E-05 | global batch size: 256 | lm loss: 2.691299E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.930 | TFLOPs: 43.75 | +7: iteration 64810/ 115203 | consumed samples: 16591360 | consumed tokens: 33979105280 | elapsed time per iteration (s): 0.57 | learning rate: 9.365E-05 | global batch size: 256 | lm loss: 2.676034E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.879 | TFLOPs: 42.61 | +7: iteration 64820/ 115203 | consumed samples: 16593920 | consumed tokens: 33984348160 | elapsed time per iteration (s): 0.56 | learning rate: 9.363E-05 | global batch size: 256 | lm loss: 2.684359E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.160 | TFLOPs: 43.87 | +7: iteration 64830/ 115203 | consumed samples: 16596480 | consumed tokens: 33989591040 | elapsed time per iteration (s): 0.56 | learning rate: 9.360E-05 | global batch size: 256 | lm loss: 2.682068E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.877 | TFLOPs: 43.37 | +7: iteration 64840/ 115203 | consumed samples: 16599040 | consumed tokens: 33994833920 | elapsed time per iteration (s): 0.57 | learning rate: 9.358E-05 | global batch size: 256 | lm loss: 2.664113E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.642 | TFLOPs: 42.68 | +7: iteration 64850/ 115203 | consumed samples: 16601600 | consumed tokens: 34000076800 | elapsed time per iteration (s): 0.57 | learning rate: 9.355E-05 | global batch size: 256 | lm loss: 2.676356E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.891 | TFLOPs: 42.89 | +7: iteration 64860/ 115203 | consumed samples: 16604160 | consumed tokens: 34005319680 | elapsed time per iteration (s): 0.57 | learning rate: 9.353E-05 | global batch size: 256 | lm loss: 2.659511E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.552 | TFLOPs: 42.96 | +7: iteration 64870/ 115203 | consumed samples: 16606720 | consumed tokens: 34010562560 | elapsed time per iteration (s): 0.57 | learning rate: 9.350E-05 | global batch size: 256 | lm loss: 2.678331E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.634 | TFLOPs: 42.77 | +7: iteration 64880/ 115203 | consumed samples: 16609280 | consumed tokens: 34015805440 | elapsed time per iteration (s): 0.57 | learning rate: 9.348E-05 | global batch size: 256 | lm loss: 2.666875E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.848 | TFLOPs: 43.08 | +7: iteration 64890/ 115203 | consumed samples: 16611840 | consumed tokens: 34021048320 | elapsed time per iteration (s): 0.57 | learning rate: 9.345E-05 | global batch size: 256 | lm loss: 2.679217E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.237 | TFLOPs: 43.02 | +7: iteration 64900/ 115203 | consumed samples: 16614400 | consumed tokens: 34026291200 | elapsed time per iteration (s): 0.57 | learning rate: 9.343E-05 | global batch size: 256 | lm loss: 2.670808E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.725 | TFLOPs: 42.97 | +7: iteration 64910/ 115203 | consumed samples: 16616960 | consumed tokens: 34031534080 | elapsed time per iteration (s): 0.57 | learning rate: 9.341E-05 | global batch size: 256 | lm loss: 2.686525E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.306 | TFLOPs: 43.03 | +7: iteration 64920/ 115203 | consumed samples: 16619520 | consumed tokens: 34036776960 | elapsed time per iteration (s): 0.57 | learning rate: 9.338E-05 | global batch size: 256 | lm loss: 2.675293E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.432 | TFLOPs: 42.75 | +7: iteration 64930/ 115203 | consumed samples: 16622080 | consumed tokens: 34042019840 | elapsed time per iteration (s): 0.57 | learning rate: 9.336E-05 | global batch size: 256 | lm loss: 2.668911E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.688 | TFLOPs: 43.16 | +7: iteration 64940/ 115203 | consumed samples: 16624640 | consumed tokens: 34047262720 | elapsed time per iteration (s): 0.56 | learning rate: 9.333E-05 | global batch size: 256 | lm loss: 2.667830E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.386 | TFLOPs: 43.70 | +7: iteration 64950/ 115203 | consumed samples: 16627200 | consumed tokens: 34052505600 | elapsed time per iteration (s): 0.60 | learning rate: 9.331E-05 | global batch size: 256 | lm loss: 2.687842E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.427 | TFLOPs: 40.85 | +7: iteration 64960/ 115203 | consumed samples: 16629760 | consumed tokens: 34057748480 | elapsed time per iteration (s): 0.57 | learning rate: 9.328E-05 | global batch size: 256 | lm loss: 2.670778E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.892 | TFLOPs: 43.18 | +7: iteration 64970/ 115203 | consumed samples: 16632320 | consumed tokens: 34062991360 | elapsed time per iteration (s): 0.58 | learning rate: 9.326E-05 | global batch size: 256 | lm loss: 2.671593E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.748 | TFLOPs: 42.31 | +7: iteration 64980/ 115203 | consumed samples: 16634880 | consumed tokens: 34068234240 | elapsed time per iteration (s): 0.56 | learning rate: 9.324E-05 | global batch size: 256 | lm loss: 2.669073E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.459 | TFLOPs: 43.23 | +7: iteration 64990/ 115203 | consumed samples: 16637440 | consumed tokens: 34073477120 | elapsed time per iteration (s): 0.58 | learning rate: 9.321E-05 | global batch size: 256 | lm loss: 2.665164E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.016 | TFLOPs: 42.33 | +7: iteration 65000/ 115203 | consumed samples: 16640000 | consumed tokens: 34078720000 | elapsed time per iteration (s): 0.56 | learning rate: 9.319E-05 | global batch size: 256 | lm loss: 2.664556E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.136 | TFLOPs: 43.49 | +7: iteration 65010/ 115203 | consumed samples: 16642560 | consumed tokens: 34083962880 | elapsed time per iteration (s): 0.56 | learning rate: 9.316E-05 | global batch size: 256 | lm loss: 2.661504E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.630 | TFLOPs: 43.63 | +7: iteration 65020/ 115203 | consumed samples: 16645120 | consumed tokens: 34089205760 | elapsed time per iteration (s): 0.57 | learning rate: 9.314E-05 | global batch size: 256 | lm loss: 2.680794E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.246 | TFLOPs: 43.12 | +7: iteration 65030/ 115203 | consumed samples: 16647680 | consumed tokens: 34094448640 | elapsed time per iteration (s): 0.56 | learning rate: 9.311E-05 | global batch size: 256 | lm loss: 2.657629E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.254 | TFLOPs: 43.69 | +7: iteration 65040/ 115203 | consumed samples: 16650240 | consumed tokens: 34099691520 | elapsed time per iteration (s): 0.57 | learning rate: 9.309E-05 | global batch size: 256 | lm loss: 2.668164E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.464 | TFLOPs: 43.04 | +7: iteration 65050/ 115203 | consumed samples: 16652800 | consumed tokens: 34104934400 | elapsed time per iteration (s): 0.57 | learning rate: 9.307E-05 | global batch size: 256 | lm loss: 2.682084E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.116 | TFLOPs: 42.82 | +7: iteration 65060/ 115203 | consumed samples: 16655360 | consumed tokens: 34110177280 | elapsed time per iteration (s): 0.57 | learning rate: 9.304E-05 | global batch size: 256 | lm loss: 2.689606E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.587 | TFLOPs: 43.15 | +7: iteration 65070/ 115203 | consumed samples: 16657920 | consumed tokens: 34115420160 | elapsed time per iteration (s): 0.56 | learning rate: 9.302E-05 | global batch size: 256 | lm loss: 2.677393E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 65080/ 115203 | consumed samples: 16660480 | consumed tokens: 34120663040 | elapsed time per iteration (s): 0.57 | learning rate: 9.299E-05 | global batch size: 256 | lm loss: 2.688147E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.971 | TFLOPs: 43.19 | +7: iteration 65090/ 115203 | consumed samples: 16663040 | consumed tokens: 34125905920 | elapsed time per iteration (s): 0.56 | learning rate: 9.297E-05 | global batch size: 256 | lm loss: 2.671354E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.023 | TFLOPs: 43.48 | +7: iteration 65100/ 115203 | consumed samples: 16665600 | consumed tokens: 34131148800 | elapsed time per iteration (s): 0.56 | learning rate: 9.294E-05 | global batch size: 256 | lm loss: 2.673918E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.136 | TFLOPs: 43.58 | +7: iteration 65110/ 115203 | consumed samples: 16668160 | consumed tokens: 34136391680 | elapsed time per iteration (s): 0.56 | learning rate: 9.292E-05 | global batch size: 256 | lm loss: 2.680654E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.723 | TFLOPs: 43.64 | +7: iteration 65120/ 115203 | consumed samples: 16670720 | consumed tokens: 34141634560 | elapsed time per iteration (s): 0.56 | learning rate: 9.289E-05 | global batch size: 256 | lm loss: 2.664223E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.573 | TFLOPs: 43.24 | +7: iteration 65130/ 115203 | consumed samples: 16673280 | consumed tokens: 34146877440 | elapsed time per iteration (s): 0.56 | learning rate: 9.287E-05 | global batch size: 256 | lm loss: 2.674339E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 65140/ 115203 | consumed samples: 16675840 | consumed tokens: 34152120320 | elapsed time per iteration (s): 0.56 | learning rate: 9.285E-05 | global batch size: 256 | lm loss: 2.670785E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.931 | TFLOPs: 43.37 | +7: iteration 65150/ 115203 | consumed samples: 16678400 | consumed tokens: 34157363200 | elapsed time per iteration (s): 0.58 | learning rate: 9.282E-05 | global batch size: 256 | lm loss: 2.666430E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.596 | TFLOPs: 42.29 | +7: iteration 65160/ 115203 | consumed samples: 16680960 | consumed tokens: 34162606080 | elapsed time per iteration (s): 0.56 | learning rate: 9.280E-05 | global batch size: 256 | lm loss: 2.677212E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.084 | TFLOPs: 43.67 | +7: iteration 65170/ 115203 | consumed samples: 16683520 | consumed tokens: 34167848960 | elapsed time per iteration (s): 0.57 | learning rate: 9.277E-05 | global batch size: 256 | lm loss: 2.694785E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.084 | TFLOPs: 43.10 | +7: iteration 65180/ 115203 | consumed samples: 16686080 | consumed tokens: 34173091840 | elapsed time per iteration (s): 0.57 | learning rate: 9.275E-05 | global batch size: 256 | lm loss: 2.675809E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.141 | TFLOPs: 42.63 | +7: iteration 65190/ 115203 | consumed samples: 16688640 | consumed tokens: 34178334720 | elapsed time per iteration (s): 0.56 | learning rate: 9.272E-05 | global batch size: 256 | lm loss: 2.669609E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.416 | TFLOPs: 43.61 | +7: iteration 65200/ 115203 | consumed samples: 16691200 | consumed tokens: 34183577600 | elapsed time per iteration (s): 0.57 | learning rate: 9.270E-05 | global batch size: 256 | lm loss: 2.668870E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.893 | TFLOPs: 43.08 | +7: iteration 65210/ 115203 | consumed samples: 16693760 | consumed tokens: 34188820480 | elapsed time per iteration (s): 0.57 | learning rate: 9.268E-05 | global batch size: 256 | lm loss: 2.678118E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.448 | TFLOPs: 42.95 | +7: iteration 65220/ 115203 | consumed samples: 16696320 | consumed tokens: 34194063360 | elapsed time per iteration (s): 0.56 | learning rate: 9.265E-05 | global batch size: 256 | lm loss: 2.666960E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.542 | TFLOPs: 43.34 | +7: iteration 65230/ 115203 | consumed samples: 16698880 | consumed tokens: 34199306240 | elapsed time per iteration (s): 0.56 | learning rate: 9.263E-05 | global batch size: 256 | lm loss: 2.671817E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.676 | TFLOPs: 43.63 | +7: iteration 65240/ 115203 | consumed samples: 16701440 | consumed tokens: 34204549120 | elapsed time per iteration (s): 0.56 | learning rate: 9.260E-05 | global batch size: 256 | lm loss: 2.664815E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.520 | TFLOPs: 43.62 | +7: iteration 65250/ 115203 | consumed samples: 16704000 | consumed tokens: 34209792000 | elapsed time per iteration (s): 0.56 | learning rate: 9.258E-05 | global batch size: 256 | lm loss: 2.667900E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.297 | TFLOPs: 43.69 | +7: iteration 65260/ 115203 | consumed samples: 16706560 | consumed tokens: 34215034880 | elapsed time per iteration (s): 0.56 | learning rate: 9.255E-05 | global batch size: 256 | lm loss: 2.645871E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.898 | TFLOPs: 43.66 | +7: iteration 65270/ 115203 | consumed samples: 16709120 | consumed tokens: 34220277760 | elapsed time per iteration (s): 0.57 | learning rate: 9.253E-05 | global batch size: 256 | lm loss: 2.665760E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.692 | TFLOPs: 42.49 | +7: iteration 65280/ 115203 | consumed samples: 16711680 | consumed tokens: 34225520640 | elapsed time per iteration (s): 0.57 | learning rate: 9.251E-05 | global batch size: 256 | lm loss: 2.667582E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.006 | TFLOPs: 43.19 | +7: iteration 65290/ 115203 | consumed samples: 16714240 | consumed tokens: 34230763520 | elapsed time per iteration (s): 0.55 | learning rate: 9.248E-05 | global batch size: 256 | lm loss: 2.677230E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 65300/ 115203 | consumed samples: 16716800 | consumed tokens: 34236006400 | elapsed time per iteration (s): 0.56 | learning rate: 9.246E-05 | global batch size: 256 | lm loss: 2.653765E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.821 | TFLOPs: 43.65 | +7: iteration 65310/ 115203 | consumed samples: 16719360 | consumed tokens: 34241249280 | elapsed time per iteration (s): 0.56 | learning rate: 9.243E-05 | global batch size: 256 | lm loss: 2.690916E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.808 | TFLOPs: 43.46 | +7: iteration 65320/ 115203 | consumed samples: 16721920 | consumed tokens: 34246492160 | elapsed time per iteration (s): 0.57 | learning rate: 9.241E-05 | global batch size: 256 | lm loss: 2.659661E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.481 | TFLOPs: 42.85 | +7: iteration 65330/ 115203 | consumed samples: 16724480 | consumed tokens: 34251735040 | elapsed time per iteration (s): 0.55 | learning rate: 9.238E-05 | global batch size: 256 | lm loss: 2.668402E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.445 | TFLOPs: 43.99 | +7: iteration 65340/ 115203 | consumed samples: 16727040 | consumed tokens: 34256977920 | elapsed time per iteration (s): 0.57 | learning rate: 9.236E-05 | global batch size: 256 | lm loss: 2.676517E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.054 | TFLOPs: 43.10 | +7: iteration 65350/ 115203 | consumed samples: 16729600 | consumed tokens: 34262220800 | elapsed time per iteration (s): 0.56 | learning rate: 9.234E-05 | global batch size: 256 | lm loss: 2.668162E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.147 | TFLOPs: 43.49 | +7: iteration 65360/ 115203 | consumed samples: 16732160 | consumed tokens: 34267463680 | elapsed time per iteration (s): 0.57 | learning rate: 9.231E-05 | global batch size: 256 | lm loss: 2.677463E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.672 | TFLOPs: 43.06 | +7: iteration 65370/ 115203 | consumed samples: 16734720 | consumed tokens: 34272706560 | elapsed time per iteration (s): 0.55 | learning rate: 9.229E-05 | global batch size: 256 | lm loss: 2.676915E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.496 | TFLOPs: 44.00 | +7: iteration 65380/ 115203 | consumed samples: 16737280 | consumed tokens: 34277949440 | elapsed time per iteration (s): 0.56 | learning rate: 9.226E-05 | global batch size: 256 | lm loss: 2.680309E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.494 | TFLOPs: 43.81 | +7: iteration 65390/ 115203 | consumed samples: 16739840 | consumed tokens: 34283192320 | elapsed time per iteration (s): 0.57 | learning rate: 9.224E-05 | global batch size: 256 | lm loss: 2.670663E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.856 | TFLOPs: 42.98 | +7: iteration 65400/ 115203 | consumed samples: 16742400 | consumed tokens: 34288435200 | elapsed time per iteration (s): 0.57 | learning rate: 9.221E-05 | global batch size: 256 | lm loss: 2.661573E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.334 | TFLOPs: 43.03 | +7: iteration 65410/ 115203 | consumed samples: 16744960 | consumed tokens: 34293678080 | elapsed time per iteration (s): 0.57 | learning rate: 9.219E-05 | global batch size: 256 | lm loss: 2.677321E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.547 | TFLOPs: 43.15 | +7: iteration 65420/ 115203 | consumed samples: 16747520 | consumed tokens: 34298920960 | elapsed time per iteration (s): 0.57 | learning rate: 9.217E-05 | global batch size: 256 | lm loss: 2.676261E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.459 | TFLOPs: 42.85 | +7: iteration 65430/ 115203 | consumed samples: 16750080 | consumed tokens: 34304163840 | elapsed time per iteration (s): 0.56 | learning rate: 9.214E-05 | global batch size: 256 | lm loss: 2.674284E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.083 | TFLOPs: 43.48 | +7: iteration 65440/ 115203 | consumed samples: 16752640 | consumed tokens: 34309406720 | elapsed time per iteration (s): 0.55 | learning rate: 9.212E-05 | global batch size: 256 | lm loss: 2.660879E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.458 | TFLOPs: 44.00 | +7: iteration 65450/ 115203 | consumed samples: 16755200 | consumed tokens: 34314649600 | elapsed time per iteration (s): 0.56 | learning rate: 9.209E-05 | global batch size: 256 | lm loss: 2.660505E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.584 | TFLOPs: 43.43 | +7: iteration 65460/ 115203 | consumed samples: 16757760 | consumed tokens: 34319892480 | elapsed time per iteration (s): 0.56 | learning rate: 9.207E-05 | global batch size: 256 | lm loss: 2.667424E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.927 | TFLOPs: 43.37 | +7: iteration 65470/ 115203 | consumed samples: 16760320 | consumed tokens: 34325135360 | elapsed time per iteration (s): 0.57 | learning rate: 9.204E-05 | global batch size: 256 | lm loss: 2.659150E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.019 | TFLOPs: 42.52 | +7: iteration 65480/ 115203 | consumed samples: 16762880 | consumed tokens: 34330378240 | elapsed time per iteration (s): 0.56 | learning rate: 9.202E-05 | global batch size: 256 | lm loss: 2.687686E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.776 | TFLOPs: 43.45 | +7: iteration 65490/ 115203 | consumed samples: 16765440 | consumed tokens: 34335621120 | elapsed time per iteration (s): 0.59 | learning rate: 9.200E-05 | global batch size: 256 | lm loss: 2.668822E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.387 | TFLOPs: 41.32 | +7: iteration 65500/ 115203 | consumed samples: 16768000 | consumed tokens: 34340864000 | elapsed time per iteration (s): 0.58 | learning rate: 9.197E-05 | global batch size: 256 | lm loss: 2.667997E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.224 | TFLOPs: 41.97 | +7: iteration 65510/ 115203 | consumed samples: 16770560 | consumed tokens: 34346106880 | elapsed time per iteration (s): 0.56 | learning rate: 9.195E-05 | global batch size: 256 | lm loss: 2.669116E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.856 | TFLOPs: 43.56 | +7: iteration 65520/ 115203 | consumed samples: 16773120 | consumed tokens: 34351349760 | elapsed time per iteration (s): 0.57 | learning rate: 9.192E-05 | global batch size: 256 | lm loss: 2.672955E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.088 | TFLOPs: 42.91 | +7: iteration 65530/ 115203 | consumed samples: 16775680 | consumed tokens: 34356592640 | elapsed time per iteration (s): 0.57 | learning rate: 9.190E-05 | global batch size: 256 | lm loss: 2.664068E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.508 | TFLOPs: 42.95 | +7: iteration 65540/ 115203 | consumed samples: 16778240 | consumed tokens: 34361835520 | elapsed time per iteration (s): 0.56 | learning rate: 9.187E-05 | global batch size: 256 | lm loss: 2.653771E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.334 | TFLOPs: 43.79 | +7: iteration 65550/ 115203 | consumed samples: 16780800 | consumed tokens: 34367078400 | elapsed time per iteration (s): 0.56 | learning rate: 9.185E-05 | global batch size: 256 | lm loss: 2.671178E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.585 | TFLOPs: 43.63 | +7: iteration 65560/ 115203 | consumed samples: 16783360 | consumed tokens: 34372321280 | elapsed time per iteration (s): 0.56 | learning rate: 9.183E-05 | global batch size: 256 | lm loss: 2.668011E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.641 | TFLOPs: 43.63 | +7: iteration 65570/ 115203 | consumed samples: 16785920 | consumed tokens: 34377564160 | elapsed time per iteration (s): 0.56 | learning rate: 9.180E-05 | global batch size: 256 | lm loss: 2.673626E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.037 | TFLOPs: 43.67 | +7: iteration 65580/ 115203 | consumed samples: 16788480 | consumed tokens: 34382807040 | elapsed time per iteration (s): 0.56 | learning rate: 9.178E-05 | global batch size: 256 | lm loss: 2.662332E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.968 | TFLOPs: 43.47 | +7: iteration 65590/ 115203 | consumed samples: 16791040 | consumed tokens: 34388049920 | elapsed time per iteration (s): 0.56 | learning rate: 9.175E-05 | global batch size: 256 | lm loss: 2.669640E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.541 | TFLOPs: 43.43 | +7: iteration 65600/ 115203 | consumed samples: 16793600 | consumed tokens: 34393292800 | elapsed time per iteration (s): 0.57 | learning rate: 9.173E-05 | global batch size: 256 | lm loss: 2.668553E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.513 | TFLOPs: 43.05 | +7: iteration 65610/ 115203 | consumed samples: 16796160 | consumed tokens: 34398535680 | elapsed time per iteration (s): 0.56 | learning rate: 9.170E-05 | global batch size: 256 | lm loss: 2.683413E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.768 | TFLOPs: 43.64 | +7: iteration 65620/ 115203 | consumed samples: 16798720 | consumed tokens: 34403778560 | elapsed time per iteration (s): 0.56 | learning rate: 9.168E-05 | global batch size: 256 | lm loss: 2.668460E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.012 | TFLOPs: 43.95 | +7: iteration 65630/ 115203 | consumed samples: 16801280 | consumed tokens: 34409021440 | elapsed time per iteration (s): 0.56 | learning rate: 9.166E-05 | global batch size: 256 | lm loss: 2.655238E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.917 | TFLOPs: 43.56 | +7: iteration 65640/ 115203 | consumed samples: 16803840 | consumed tokens: 34414264320 | elapsed time per iteration (s): 0.56 | learning rate: 9.163E-05 | global batch size: 256 | lm loss: 2.673353E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.073 | TFLOPs: 43.48 | +7: iteration 65650/ 115203 | consumed samples: 16806400 | consumed tokens: 34419507200 | elapsed time per iteration (s): 0.56 | learning rate: 9.161E-05 | global batch size: 256 | lm loss: 2.669770E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.075 | TFLOPs: 43.48 | +7: iteration 65660/ 115203 | consumed samples: 16808960 | consumed tokens: 34424750080 | elapsed time per iteration (s): 0.56 | learning rate: 9.158E-05 | global batch size: 256 | lm loss: 2.675026E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.209 | TFLOPs: 43.97 | +7: iteration 65670/ 115203 | consumed samples: 16811520 | consumed tokens: 34429992960 | elapsed time per iteration (s): 0.56 | learning rate: 9.156E-05 | global batch size: 256 | lm loss: 2.669706E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.508 | TFLOPs: 43.71 | +7: iteration 65680/ 115203 | consumed samples: 16814080 | consumed tokens: 34435235840 | elapsed time per iteration (s): 0.58 | learning rate: 9.153E-05 | global batch size: 256 | lm loss: 2.666813E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.835 | TFLOPs: 42.03 | +7: iteration 65690/ 115203 | consumed samples: 16816640 | consumed tokens: 34440478720 | elapsed time per iteration (s): 0.55 | learning rate: 9.151E-05 | global batch size: 256 | lm loss: 2.668495E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.274 | TFLOPs: 43.98 | +7: iteration 65700/ 115203 | consumed samples: 16819200 | consumed tokens: 34445721600 | elapsed time per iteration (s): 0.57 | learning rate: 9.149E-05 | global batch size: 256 | lm loss: 2.659334E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.817 | TFLOPs: 43.17 | +7: iteration 65710/ 115203 | consumed samples: 16821760 | consumed tokens: 34450964480 | elapsed time per iteration (s): 0.55 | learning rate: 9.146E-05 | global batch size: 256 | lm loss: 2.675148E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.319 | TFLOPs: 43.98 | +7: iteration 65720/ 115203 | consumed samples: 16824320 | consumed tokens: 34456207360 | elapsed time per iteration (s): 0.56 | learning rate: 9.144E-05 | global batch size: 256 | lm loss: 2.674921E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.697 | TFLOPs: 43.35 | +7: iteration 65730/ 115203 | consumed samples: 16826880 | consumed tokens: 34461450240 | elapsed time per iteration (s): 0.56 | learning rate: 9.141E-05 | global batch size: 256 | lm loss: 2.666056E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.834 | TFLOPs: 43.46 | +7: iteration 65740/ 115203 | consumed samples: 16829440 | consumed tokens: 34466693120 | elapsed time per iteration (s): 0.56 | learning rate: 9.139E-05 | global batch size: 256 | lm loss: 2.661499E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.496 | TFLOPs: 43.62 | +7: iteration 65750/ 115203 | consumed samples: 16832000 | consumed tokens: 34471936000 | elapsed time per iteration (s): 0.56 | learning rate: 9.136E-05 | global batch size: 256 | lm loss: 2.663060E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.895 | TFLOPs: 43.46 | +7: iteration 65760/ 115203 | consumed samples: 16834560 | consumed tokens: 34477178880 | elapsed time per iteration (s): 0.55 | learning rate: 9.134E-05 | global batch size: 256 | lm loss: 2.655672E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.300 | TFLOPs: 43.98 | +7: iteration 65770/ 115203 | consumed samples: 16837120 | consumed tokens: 34482421760 | elapsed time per iteration (s): 0.56 | learning rate: 9.132E-05 | global batch size: 256 | lm loss: 2.659289E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.591 | TFLOPs: 43.63 | +7: iteration 65780/ 115203 | consumed samples: 16839680 | consumed tokens: 34487664640 | elapsed time per iteration (s): 0.55 | learning rate: 9.129E-05 | global batch size: 256 | lm loss: 2.678800E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.347 | TFLOPs: 43.98 | +7: iteration 65790/ 115203 | consumed samples: 16842240 | consumed tokens: 34492907520 | elapsed time per iteration (s): 0.56 | learning rate: 9.127E-05 | global batch size: 256 | lm loss: 2.662280E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.527 | TFLOPs: 43.43 | +7: iteration 65800/ 115203 | consumed samples: 16844800 | consumed tokens: 34498150400 | elapsed time per iteration (s): 0.56 | learning rate: 9.124E-05 | global batch size: 256 | lm loss: 2.664164E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.667 | TFLOPs: 43.44 | +7: iteration 65810/ 115203 | consumed samples: 16847360 | consumed tokens: 34503393280 | elapsed time per iteration (s): 0.56 | learning rate: 9.122E-05 | global batch size: 256 | lm loss: 2.665614E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.276 | TFLOPs: 43.31 | +7: iteration 65820/ 115203 | consumed samples: 16849920 | consumed tokens: 34508636160 | elapsed time per iteration (s): 0.57 | learning rate: 9.119E-05 | global batch size: 256 | lm loss: 2.663770E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.700 | TFLOPs: 43.16 | +7: iteration 65830/ 115203 | consumed samples: 16852480 | consumed tokens: 34513879040 | elapsed time per iteration (s): 0.55 | learning rate: 9.117E-05 | global batch size: 256 | lm loss: 2.670290E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.326 | TFLOPs: 43.98 | +7: iteration 65840/ 115203 | consumed samples: 16855040 | consumed tokens: 34519121920 | elapsed time per iteration (s): 0.57 | learning rate: 9.115E-05 | global batch size: 256 | lm loss: 2.651542E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.837 | TFLOPs: 43.17 | +7: iteration 65850/ 115203 | consumed samples: 16857600 | consumed tokens: 34524364800 | elapsed time per iteration (s): 0.56 | learning rate: 9.112E-05 | global batch size: 256 | lm loss: 2.671682E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.968 | TFLOPs: 43.38 | +7: iteration 65860/ 115203 | consumed samples: 16860160 | consumed tokens: 34529607680 | elapsed time per iteration (s): 0.56 | learning rate: 9.110E-05 | global batch size: 256 | lm loss: 2.673393E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.853 | TFLOPs: 43.37 | +7: iteration 65870/ 115203 | consumed samples: 16862720 | consumed tokens: 34534850560 | elapsed time per iteration (s): 0.57 | learning rate: 9.107E-05 | global batch size: 256 | lm loss: 2.674085E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.292 | TFLOPs: 43.03 | +7: iteration 65880/ 115203 | consumed samples: 16865280 | consumed tokens: 34540093440 | elapsed time per iteration (s): 0.56 | learning rate: 9.105E-05 | global batch size: 256 | lm loss: 2.666249E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.658 | TFLOPs: 43.35 | +7: iteration 65890/ 115203 | consumed samples: 16867840 | consumed tokens: 34545336320 | elapsed time per iteration (s): 0.56 | learning rate: 9.102E-05 | global batch size: 256 | lm loss: 2.664133E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.477 | TFLOPs: 43.62 | +7: iteration 65900/ 115203 | consumed samples: 16870400 | consumed tokens: 34550579200 | elapsed time per iteration (s): 0.56 | learning rate: 9.100E-05 | global batch size: 256 | lm loss: 2.669265E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.365 | TFLOPs: 43.70 | +7: iteration 65910/ 115203 | consumed samples: 16872960 | consumed tokens: 34555822080 | elapsed time per iteration (s): 0.56 | learning rate: 9.098E-05 | global batch size: 256 | lm loss: 2.678334E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.450 | TFLOPs: 43.61 | +7: iteration 65920/ 115203 | consumed samples: 16875520 | consumed tokens: 34561064960 | elapsed time per iteration (s): 0.56 | learning rate: 9.095E-05 | global batch size: 256 | lm loss: 2.670123E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.451 | TFLOPs: 43.33 | +7: iteration 65930/ 115203 | consumed samples: 16878080 | consumed tokens: 34566307840 | elapsed time per iteration (s): 0.56 | learning rate: 9.093E-05 | global batch size: 256 | lm loss: 2.672328E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.246 | TFLOPs: 43.78 | +7: iteration 65940/ 115203 | consumed samples: 16880640 | consumed tokens: 34571550720 | elapsed time per iteration (s): 0.56 | learning rate: 9.090E-05 | global batch size: 256 | lm loss: 2.669971E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.075 | TFLOPs: 43.86 | +7: iteration 65950/ 115203 | consumed samples: 16883200 | consumed tokens: 34576793600 | elapsed time per iteration (s): 0.57 | learning rate: 9.088E-05 | global batch size: 256 | lm loss: 2.672953E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.445 | TFLOPs: 43.04 | +7: iteration 65960/ 115203 | consumed samples: 16885760 | consumed tokens: 34582036480 | elapsed time per iteration (s): 0.56 | learning rate: 9.086E-05 | global batch size: 256 | lm loss: 2.667699E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.195 | TFLOPs: 43.40 | +7: iteration 65970/ 115203 | consumed samples: 16888320 | consumed tokens: 34587279360 | elapsed time per iteration (s): 0.56 | learning rate: 9.083E-05 | global batch size: 256 | lm loss: 2.674808E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.609 | TFLOPs: 43.44 | +7: iteration 65980/ 115203 | consumed samples: 16890880 | consumed tokens: 34592522240 | elapsed time per iteration (s): 0.56 | learning rate: 9.081E-05 | global batch size: 256 | lm loss: 2.666058E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.623 | TFLOPs: 43.44 | +7: iteration 65990/ 115203 | consumed samples: 16893440 | consumed tokens: 34597765120 | elapsed time per iteration (s): 0.56 | learning rate: 9.078E-05 | global batch size: 256 | lm loss: 2.667439E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.247 | TFLOPs: 43.50 | +0: [2023-03-16 23:11:08,782] [INFO] [logging.py:68:log_dist] [Rank 0] step=66000, skipped=0, lr=[9.075821569240965e-05, 9.075821569240965e-05, 9.075821569240965e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 66000/ 115203 | consumed samples: 16896000 | consumed tokens: 34603008000 | elapsed time per iteration (s): 0.56 | learning rate: 9.076E-05 | global batch size: 256 | lm loss: 2.675397E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.191 | TFLOPs: 43.49 | +0: steps: 66000 loss: 2.6508 iter time (s): 0.564 samples/sec: 453.863 +7: iteration 66010/ 115203 | consumed samples: 16898560 | consumed tokens: 34608250880 | elapsed time per iteration (s): 0.56 | learning rate: 9.073E-05 | global batch size: 256 | lm loss: 2.671992E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.916 | TFLOPs: 43.37 | +7: iteration 66020/ 115203 | consumed samples: 16901120 | consumed tokens: 34613493760 | elapsed time per iteration (s): 0.56 | learning rate: 9.071E-05 | global batch size: 256 | lm loss: 2.674857E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.211 | TFLOPs: 43.21 | +7: iteration 66030/ 115203 | consumed samples: 16903680 | consumed tokens: 34618736640 | elapsed time per iteration (s): 0.56 | learning rate: 9.069E-05 | global batch size: 256 | lm loss: 2.669347E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.310 | TFLOPs: 43.50 | +7: iteration 66040/ 115203 | consumed samples: 16906240 | consumed tokens: 34623979520 | elapsed time per iteration (s): 0.56 | learning rate: 9.066E-05 | global batch size: 256 | lm loss: 2.654958E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.106 | TFLOPs: 43.58 | +7: iteration 66050/ 115203 | consumed samples: 16908800 | consumed tokens: 34629222400 | elapsed time per iteration (s): 0.56 | learning rate: 9.064E-05 | global batch size: 256 | lm loss: 2.668082E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.676 | TFLOPs: 43.63 | +7: iteration 66060/ 115203 | consumed samples: 16911360 | consumed tokens: 34634465280 | elapsed time per iteration (s): 0.56 | learning rate: 9.061E-05 | global batch size: 256 | lm loss: 2.657678E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.190 | TFLOPs: 43.87 | +7: iteration 66070/ 115203 | consumed samples: 16913920 | consumed tokens: 34639708160 | elapsed time per iteration (s): 0.56 | learning rate: 9.059E-05 | global batch size: 256 | lm loss: 2.665068E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.736 | TFLOPs: 43.45 | +7: iteration 66080/ 115203 | consumed samples: 16916480 | consumed tokens: 34644951040 | elapsed time per iteration (s): 0.56 | learning rate: 9.056E-05 | global batch size: 256 | lm loss: 2.673556E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.545 | TFLOPs: 43.34 | +7: iteration 66090/ 115203 | consumed samples: 16919040 | consumed tokens: 34650193920 | elapsed time per iteration (s): 0.56 | learning rate: 9.054E-05 | global batch size: 256 | lm loss: 2.665552E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.609 | TFLOPs: 43.72 | +7: iteration 66100/ 115203 | consumed samples: 16921600 | consumed tokens: 34655436800 | elapsed time per iteration (s): 0.57 | learning rate: 9.052E-05 | global batch size: 256 | lm loss: 2.654595E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.034 | TFLOPs: 42.81 | +7: iteration 66110/ 115203 | consumed samples: 16924160 | consumed tokens: 34660679680 | elapsed time per iteration (s): 0.55 | learning rate: 9.049E-05 | global batch size: 256 | lm loss: 2.685759E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.714 | TFLOPs: 44.02 | +7: iteration 66120/ 115203 | consumed samples: 16926720 | consumed tokens: 34665922560 | elapsed time per iteration (s): 0.55 | learning rate: 9.047E-05 | global batch size: 256 | lm loss: 2.682589E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.488 | TFLOPs: 44.00 | +7: iteration 66130/ 115203 | consumed samples: 16929280 | consumed tokens: 34671165440 | elapsed time per iteration (s): 0.56 | learning rate: 9.044E-05 | global batch size: 256 | lm loss: 2.646296E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.115 | TFLOPs: 43.49 | +7: iteration 66140/ 115203 | consumed samples: 16931840 | consumed tokens: 34676408320 | elapsed time per iteration (s): 0.56 | learning rate: 9.042E-05 | global batch size: 256 | lm loss: 2.664599E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.706 | TFLOPs: 43.64 | +7: iteration 66150/ 115203 | consumed samples: 16934400 | consumed tokens: 34681651200 | elapsed time per iteration (s): 0.55 | learning rate: 9.040E-05 | global batch size: 256 | lm loss: 2.663408E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 66160/ 115203 | consumed samples: 16936960 | consumed tokens: 34686894080 | elapsed time per iteration (s): 0.56 | learning rate: 9.037E-05 | global batch size: 256 | lm loss: 2.669367E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.650 | TFLOPs: 43.63 | +7: iteration 66170/ 115203 | consumed samples: 16939520 | consumed tokens: 34692136960 | elapsed time per iteration (s): 0.56 | learning rate: 9.035E-05 | global batch size: 256 | lm loss: 2.674595E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.348 | TFLOPs: 43.51 | +7: iteration 66180/ 115203 | consumed samples: 16942080 | consumed tokens: 34697379840 | elapsed time per iteration (s): 0.58 | learning rate: 9.032E-05 | global batch size: 256 | lm loss: 2.675134E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.473 | TFLOPs: 42.09 | +7: iteration 66190/ 115203 | consumed samples: 16944640 | consumed tokens: 34702622720 | elapsed time per iteration (s): 0.56 | learning rate: 9.030E-05 | global batch size: 256 | lm loss: 2.663621E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.216 | TFLOPs: 43.50 | +7: iteration 66200/ 115203 | consumed samples: 16947200 | consumed tokens: 34707865600 | elapsed time per iteration (s): 0.55 | learning rate: 9.027E-05 | global batch size: 256 | lm loss: 2.666884E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.307 | TFLOPs: 43.98 | +7: iteration 66210/ 115203 | consumed samples: 16949760 | consumed tokens: 34713108480 | elapsed time per iteration (s): 0.56 | learning rate: 9.025E-05 | global batch size: 256 | lm loss: 2.666611E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.818 | TFLOPs: 43.46 | +7: iteration 66220/ 115203 | consumed samples: 16952320 | consumed tokens: 34718351360 | elapsed time per iteration (s): 0.55 | learning rate: 9.023E-05 | global batch size: 256 | lm loss: 2.677617E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.462 | TFLOPs: 44.00 | +7: iteration 66230/ 115203 | consumed samples: 16954880 | consumed tokens: 34723594240 | elapsed time per iteration (s): 0.65 | learning rate: 9.020E-05 | global batch size: 256 | lm loss: 2.660133E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 392.350 | TFLOPs: 37.41 | +7: iteration 66240/ 115203 | consumed samples: 16957440 | consumed tokens: 34728837120 | elapsed time per iteration (s): 0.57 | learning rate: 9.018E-05 | global batch size: 256 | lm loss: 2.681850E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.554 | TFLOPs: 42.76 | +7: iteration 66250/ 115203 | consumed samples: 16960000 | consumed tokens: 34734080000 | elapsed time per iteration (s): 0.56 | learning rate: 9.015E-05 | global batch size: 256 | lm loss: 2.659556E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.365 | TFLOPs: 43.41 | +7: iteration 66260/ 115203 | consumed samples: 16962560 | consumed tokens: 34739322880 | elapsed time per iteration (s): 0.56 | learning rate: 9.013E-05 | global batch size: 256 | lm loss: 2.683486E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.727 | TFLOPs: 43.35 | +7: iteration 66270/ 115203 | consumed samples: 16965120 | consumed tokens: 34744565760 | elapsed time per iteration (s): 0.56 | learning rate: 9.010E-05 | global batch size: 256 | lm loss: 2.655251E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.805 | TFLOPs: 43.55 | +7: iteration 66280/ 115203 | consumed samples: 16967680 | consumed tokens: 34749808640 | elapsed time per iteration (s): 0.56 | learning rate: 9.008E-05 | global batch size: 256 | lm loss: 2.672620E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.737 | TFLOPs: 43.74 | +7: iteration 66290/ 115203 | consumed samples: 16970240 | consumed tokens: 34755051520 | elapsed time per iteration (s): 0.56 | learning rate: 9.006E-05 | global batch size: 256 | lm loss: 2.667256E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.256 | TFLOPs: 43.40 | +7: iteration 66300/ 115203 | consumed samples: 16972800 | consumed tokens: 34760294400 | elapsed time per iteration (s): 0.56 | learning rate: 9.003E-05 | global batch size: 256 | lm loss: 2.666435E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.076 | TFLOPs: 43.48 | +7: iteration 66310/ 115203 | consumed samples: 16975360 | consumed tokens: 34765537280 | elapsed time per iteration (s): 0.55 | learning rate: 9.001E-05 | global batch size: 256 | lm loss: 2.663262E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.519 | TFLOPs: 44.00 | +7: iteration 66320/ 115203 | consumed samples: 16977920 | consumed tokens: 34770780160 | elapsed time per iteration (s): 0.56 | learning rate: 8.998E-05 | global batch size: 256 | lm loss: 2.665668E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.707 | TFLOPs: 43.73 | +7: iteration 66330/ 115203 | consumed samples: 16980480 | consumed tokens: 34776023040 | elapsed time per iteration (s): 0.56 | learning rate: 8.996E-05 | global batch size: 256 | lm loss: 2.669880E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.708 | TFLOPs: 43.83 | +7: iteration 66340/ 115203 | consumed samples: 16983040 | consumed tokens: 34781265920 | elapsed time per iteration (s): 0.55 | learning rate: 8.994E-05 | global batch size: 256 | lm loss: 2.666637E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 66350/ 115203 | consumed samples: 16985600 | consumed tokens: 34786508800 | elapsed time per iteration (s): 0.56 | learning rate: 8.991E-05 | global batch size: 256 | lm loss: 2.682060E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.188 | TFLOPs: 43.59 | +7: iteration 66360/ 115203 | consumed samples: 16988160 | consumed tokens: 34791751680 | elapsed time per iteration (s): 0.56 | learning rate: 8.989E-05 | global batch size: 256 | lm loss: 2.658185E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.604 | TFLOPs: 43.82 | +7: iteration 66370/ 115203 | consumed samples: 16990720 | consumed tokens: 34796994560 | elapsed time per iteration (s): 0.58 | learning rate: 8.986E-05 | global batch size: 256 | lm loss: 2.664462E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.868 | TFLOPs: 41.94 | +7: iteration 66380/ 115203 | consumed samples: 16993280 | consumed tokens: 34802237440 | elapsed time per iteration (s): 0.56 | learning rate: 8.984E-05 | global batch size: 256 | lm loss: 2.661573E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.655 | TFLOPs: 43.63 | +7: iteration 66390/ 115203 | consumed samples: 16995840 | consumed tokens: 34807480320 | elapsed time per iteration (s): 0.55 | learning rate: 8.981E-05 | global batch size: 256 | lm loss: 2.674071E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 66400/ 115203 | consumed samples: 16998400 | consumed tokens: 34812723200 | elapsed time per iteration (s): 0.55 | learning rate: 8.979E-05 | global batch size: 256 | lm loss: 2.658957E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.527 | TFLOPs: 44.00 | +7: iteration 66410/ 115203 | consumed samples: 17000960 | consumed tokens: 34817966080 | elapsed time per iteration (s): 0.55 | learning rate: 8.977E-05 | global batch size: 256 | lm loss: 2.668301E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.626 | TFLOPs: 44.01 | +7: iteration 66420/ 115203 | consumed samples: 17003520 | consumed tokens: 34823208960 | elapsed time per iteration (s): 0.57 | learning rate: 8.974E-05 | global batch size: 256 | lm loss: 2.675319E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.491 | TFLOPs: 43.14 | +7: iteration 66430/ 115203 | consumed samples: 17006080 | consumed tokens: 34828451840 | elapsed time per iteration (s): 0.55 | learning rate: 8.972E-05 | global batch size: 256 | lm loss: 2.653328E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.428 | TFLOPs: 43.99 | +7: iteration 66440/ 115203 | consumed samples: 17008640 | consumed tokens: 34833694720 | elapsed time per iteration (s): 0.56 | learning rate: 8.969E-05 | global batch size: 256 | lm loss: 2.660370E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.935 | TFLOPs: 43.66 | +7: iteration 66450/ 115203 | consumed samples: 17011200 | consumed tokens: 34838937600 | elapsed time per iteration (s): 0.56 | learning rate: 8.967E-05 | global batch size: 256 | lm loss: 2.659225E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.852 | TFLOPs: 43.65 | +7: iteration 66460/ 115203 | consumed samples: 17013760 | consumed tokens: 34844180480 | elapsed time per iteration (s): 0.57 | learning rate: 8.965E-05 | global batch size: 256 | lm loss: 2.657899E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.467 | TFLOPs: 43.04 | +7: iteration 66470/ 115203 | consumed samples: 17016320 | consumed tokens: 34849423360 | elapsed time per iteration (s): 0.55 | learning rate: 8.962E-05 | global batch size: 256 | lm loss: 2.666499E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.667 | TFLOPs: 44.01 | +7: iteration 66480/ 115203 | consumed samples: 17018880 | consumed tokens: 34854666240 | elapsed time per iteration (s): 0.55 | learning rate: 8.960E-05 | global batch size: 256 | lm loss: 2.651152E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.729 | TFLOPs: 44.02 | +7: iteration 66490/ 115203 | consumed samples: 17021440 | consumed tokens: 34859909120 | elapsed time per iteration (s): 0.56 | learning rate: 8.957E-05 | global batch size: 256 | lm loss: 2.666554E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.323 | TFLOPs: 43.51 | +7: iteration 66500/ 115203 | consumed samples: 17024000 | consumed tokens: 34865152000 | elapsed time per iteration (s): 0.55 | learning rate: 8.955E-05 | global batch size: 256 | lm loss: 2.675354E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.641 | TFLOPs: 44.01 | +7: iteration 66510/ 115203 | consumed samples: 17026560 | consumed tokens: 34870394880 | elapsed time per iteration (s): 0.56 | learning rate: 8.953E-05 | global batch size: 256 | lm loss: 2.664274E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.546 | TFLOPs: 43.91 | +7: iteration 66520/ 115203 | consumed samples: 17029120 | consumed tokens: 34875637760 | elapsed time per iteration (s): 0.56 | learning rate: 8.950E-05 | global batch size: 256 | lm loss: 2.668913E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.223 | TFLOPs: 43.97 | +7: iteration 66530/ 115203 | consumed samples: 17031680 | consumed tokens: 34880880640 | elapsed time per iteration (s): 0.56 | learning rate: 8.948E-05 | global batch size: 256 | lm loss: 2.667083E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.127 | TFLOPs: 43.49 | +7: iteration 66540/ 115203 | consumed samples: 17034240 | consumed tokens: 34886123520 | elapsed time per iteration (s): 0.55 | learning rate: 8.945E-05 | global batch size: 256 | lm loss: 2.677771E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 66550/ 115203 | consumed samples: 17036800 | consumed tokens: 34891366400 | elapsed time per iteration (s): 0.55 | learning rate: 8.943E-05 | global batch size: 256 | lm loss: 2.666407E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.703 | TFLOPs: 44.02 | +7: iteration 66560/ 115203 | consumed samples: 17039360 | consumed tokens: 34896609280 | elapsed time per iteration (s): 0.56 | learning rate: 8.940E-05 | global batch size: 256 | lm loss: 2.666341E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.977 | TFLOPs: 43.47 | +7: iteration 66570/ 115203 | consumed samples: 17041920 | consumed tokens: 34901852160 | elapsed time per iteration (s): 0.55 | learning rate: 8.938E-05 | global batch size: 256 | lm loss: 2.675030E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.526 | TFLOPs: 44.00 | +7: iteration 66580/ 115203 | consumed samples: 17044480 | consumed tokens: 34907095040 | elapsed time per iteration (s): 0.55 | learning rate: 8.936E-05 | global batch size: 256 | lm loss: 2.663581E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.339 | TFLOPs: 43.98 | +7: iteration 66590/ 115203 | consumed samples: 17047040 | consumed tokens: 34912337920 | elapsed time per iteration (s): 0.56 | learning rate: 8.933E-05 | global batch size: 256 | lm loss: 2.670122E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.506 | TFLOPs: 43.52 | +7: iteration 66600/ 115203 | consumed samples: 17049600 | consumed tokens: 34917580800 | elapsed time per iteration (s): 0.57 | learning rate: 8.931E-05 | global batch size: 256 | lm loss: 2.680319E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.892 | TFLOPs: 42.99 | +7: iteration 66610/ 115203 | consumed samples: 17052160 | consumed tokens: 34922823680 | elapsed time per iteration (s): 0.56 | learning rate: 8.928E-05 | global batch size: 256 | lm loss: 2.675649E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.121 | TFLOPs: 43.39 | +7: iteration 66620/ 115203 | consumed samples: 17054720 | consumed tokens: 34928066560 | elapsed time per iteration (s): 0.56 | learning rate: 8.926E-05 | global batch size: 256 | lm loss: 2.665290E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.653 | TFLOPs: 43.73 | +7: iteration 66630/ 115203 | consumed samples: 17057280 | consumed tokens: 34933309440 | elapsed time per iteration (s): 0.55 | learning rate: 8.924E-05 | global batch size: 256 | lm loss: 2.655391E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.399 | TFLOPs: 43.99 | +7: iteration 66640/ 115203 | consumed samples: 17059840 | consumed tokens: 34938552320 | elapsed time per iteration (s): 0.55 | learning rate: 8.921E-05 | global batch size: 256 | lm loss: 2.653591E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 66650/ 115203 | consumed samples: 17062400 | consumed tokens: 34943795200 | elapsed time per iteration (s): 0.55 | learning rate: 8.919E-05 | global batch size: 256 | lm loss: 2.668859E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.803 | TFLOPs: 44.03 | +7: iteration 66660/ 115203 | consumed samples: 17064960 | consumed tokens: 34949038080 | elapsed time per iteration (s): 0.58 | learning rate: 8.916E-05 | global batch size: 256 | lm loss: 2.678736E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.875 | TFLOPs: 42.32 | +7: iteration 66670/ 115203 | consumed samples: 17067520 | consumed tokens: 34954280960 | elapsed time per iteration (s): 0.55 | learning rate: 8.914E-05 | global batch size: 256 | lm loss: 2.677090E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.746 | TFLOPs: 44.02 | +7: iteration 66680/ 115203 | consumed samples: 17070080 | consumed tokens: 34959523840 | elapsed time per iteration (s): 0.56 | learning rate: 8.911E-05 | global batch size: 256 | lm loss: 2.665358E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.909 | TFLOPs: 43.75 | +7: iteration 66690/ 115203 | consumed samples: 17072640 | consumed tokens: 34964766720 | elapsed time per iteration (s): 0.55 | learning rate: 8.909E-05 | global batch size: 256 | lm loss: 2.673230E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.667 | TFLOPs: 44.01 | +7: iteration 66700/ 115203 | consumed samples: 17075200 | consumed tokens: 34970009600 | elapsed time per iteration (s): 0.73 | learning rate: 8.907E-05 | global batch size: 256 | lm loss: 2.684635E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 348.372 | TFLOPs: 33.21 | +7: iteration 66710/ 115203 | consumed samples: 17077760 | consumed tokens: 34975252480 | elapsed time per iteration (s): 0.66 | learning rate: 8.904E-05 | global batch size: 256 | lm loss: 2.671153E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 388.661 | TFLOPs: 37.05 | +7: iteration 66720/ 115203 | consumed samples: 17080320 | consumed tokens: 34980495360 | elapsed time per iteration (s): 0.56 | learning rate: 8.902E-05 | global batch size: 256 | lm loss: 2.671080E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.573 | TFLOPs: 43.53 | +7: iteration 66730/ 115203 | consumed samples: 17082880 | consumed tokens: 34985738240 | elapsed time per iteration (s): 0.99 | learning rate: 8.899E-05 | global batch size: 256 | lm loss: 2.671825E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 258.405 | TFLOPs: 24.64 | +7: iteration 66740/ 115203 | consumed samples: 17085440 | consumed tokens: 34990981120 | elapsed time per iteration (s): 0.55 | learning rate: 8.897E-05 | global batch size: 256 | lm loss: 2.666204E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.122 | TFLOPs: 44.06 | +7: iteration 66750/ 115203 | consumed samples: 17088000 | consumed tokens: 34996224000 | elapsed time per iteration (s): 0.55 | learning rate: 8.895E-05 | global batch size: 256 | lm loss: 2.672515E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.873 | TFLOPs: 44.03 | +7: iteration 66760/ 115203 | consumed samples: 17090560 | consumed tokens: 35001466880 | elapsed time per iteration (s): 0.55 | learning rate: 8.892E-05 | global batch size: 256 | lm loss: 2.666228E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.830 | TFLOPs: 44.03 | +7: iteration 66770/ 115203 | consumed samples: 17093120 | consumed tokens: 35006709760 | elapsed time per iteration (s): 0.55 | learning rate: 8.890E-05 | global batch size: 256 | lm loss: 2.670194E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.720 | TFLOPs: 44.02 | +7: iteration 66780/ 115203 | consumed samples: 17095680 | consumed tokens: 35011952640 | elapsed time per iteration (s): 0.56 | learning rate: 8.887E-05 | global batch size: 256 | lm loss: 2.661710E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.272 | TFLOPs: 43.41 | +7: iteration 66790/ 115203 | consumed samples: 17098240 | consumed tokens: 35017195520 | elapsed time per iteration (s): 0.56 | learning rate: 8.885E-05 | global batch size: 256 | lm loss: 2.654261E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.437 | TFLOPs: 43.42 | +7: iteration 66800/ 115203 | consumed samples: 17100800 | consumed tokens: 35022438400 | elapsed time per iteration (s): 0.56 | learning rate: 8.883E-05 | global batch size: 256 | lm loss: 2.675557E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.316 | TFLOPs: 43.70 | +7: iteration 66810/ 115203 | consumed samples: 17103360 | consumed tokens: 35027681280 | elapsed time per iteration (s): 0.57 | learning rate: 8.880E-05 | global batch size: 256 | lm loss: 2.670432E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.795 | TFLOPs: 42.60 | +7: iteration 66820/ 115203 | consumed samples: 17105920 | consumed tokens: 35032924160 | elapsed time per iteration (s): 0.55 | learning rate: 8.878E-05 | global batch size: 256 | lm loss: 2.669730E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.674 | TFLOPs: 44.02 | +7: iteration 66830/ 115203 | consumed samples: 17108480 | consumed tokens: 35038167040 | elapsed time per iteration (s): 0.56 | learning rate: 8.875E-05 | global batch size: 256 | lm loss: 2.658813E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.941 | TFLOPs: 43.37 | +7: iteration 66840/ 115203 | consumed samples: 17111040 | consumed tokens: 35043409920 | elapsed time per iteration (s): 0.55 | learning rate: 8.873E-05 | global batch size: 256 | lm loss: 2.652132E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.448 | TFLOPs: 43.99 | +7: iteration 66850/ 115203 | consumed samples: 17113600 | consumed tokens: 35048652800 | elapsed time per iteration (s): 0.56 | learning rate: 8.871E-05 | global batch size: 256 | lm loss: 2.680062E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.965 | TFLOPs: 43.57 | +7: iteration 66860/ 115203 | consumed samples: 17116160 | consumed tokens: 35053895680 | elapsed time per iteration (s): 0.56 | learning rate: 8.868E-05 | global batch size: 256 | lm loss: 2.663552E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.032 | TFLOPs: 43.48 | +7: iteration 66870/ 115203 | consumed samples: 17118720 | consumed tokens: 35059138560 | elapsed time per iteration (s): 0.56 | learning rate: 8.866E-05 | global batch size: 256 | lm loss: 2.653815E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.223 | TFLOPs: 43.50 | +7: iteration 66880/ 115203 | consumed samples: 17121280 | consumed tokens: 35064381440 | elapsed time per iteration (s): 0.56 | learning rate: 8.863E-05 | global batch size: 256 | lm loss: 2.666218E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.284 | TFLOPs: 43.31 | +7: iteration 66890/ 115203 | consumed samples: 17123840 | consumed tokens: 35069624320 | elapsed time per iteration (s): 0.55 | learning rate: 8.861E-05 | global batch size: 256 | lm loss: 2.673327E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.864 | TFLOPs: 44.03 | +7: iteration 66900/ 115203 | consumed samples: 17126400 | consumed tokens: 35074867200 | elapsed time per iteration (s): 0.58 | learning rate: 8.858E-05 | global batch size: 256 | lm loss: 2.664681E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.521 | TFLOPs: 42.38 | +7: iteration 66910/ 115203 | consumed samples: 17128960 | consumed tokens: 35080110080 | elapsed time per iteration (s): 0.57 | learning rate: 8.856E-05 | global batch size: 256 | lm loss: 2.678340E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.076 | TFLOPs: 42.62 | +7: iteration 66920/ 115203 | consumed samples: 17131520 | consumed tokens: 35085352960 | elapsed time per iteration (s): 0.56 | learning rate: 8.854E-05 | global batch size: 256 | lm loss: 2.672105E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.460 | TFLOPs: 43.52 | +7: iteration 66930/ 115203 | consumed samples: 17134080 | consumed tokens: 35090595840 | elapsed time per iteration (s): 0.57 | learning rate: 8.851E-05 | global batch size: 256 | lm loss: 2.657520E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.296 | TFLOPs: 42.64 | +7: iteration 66940/ 115203 | consumed samples: 17136640 | consumed tokens: 35095838720 | elapsed time per iteration (s): 0.55 | learning rate: 8.849E-05 | global batch size: 256 | lm loss: 2.677724E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.420 | TFLOPs: 43.99 | +7: iteration 66950/ 115203 | consumed samples: 17139200 | consumed tokens: 35101081600 | elapsed time per iteration (s): 0.56 | learning rate: 8.846E-05 | global batch size: 256 | lm loss: 2.657885E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.529 | TFLOPs: 43.43 | +7: iteration 66960/ 115203 | consumed samples: 17141760 | consumed tokens: 35106324480 | elapsed time per iteration (s): 0.56 | learning rate: 8.844E-05 | global batch size: 256 | lm loss: 2.676117E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.259 | TFLOPs: 43.69 | +7: iteration 66970/ 115203 | consumed samples: 17144320 | consumed tokens: 35111567360 | elapsed time per iteration (s): 0.56 | learning rate: 8.842E-05 | global batch size: 256 | lm loss: 2.662688E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.871 | TFLOPs: 43.46 | +7: iteration 66980/ 115203 | consumed samples: 17146880 | consumed tokens: 35116810240 | elapsed time per iteration (s): 0.57 | learning rate: 8.839E-05 | global batch size: 256 | lm loss: 2.660621E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.206 | TFLOPs: 42.83 | +7: iteration 66990/ 115203 | consumed samples: 17149440 | consumed tokens: 35122053120 | elapsed time per iteration (s): 0.56 | learning rate: 8.837E-05 | global batch size: 256 | lm loss: 2.664146E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.555 | TFLOPs: 43.53 | +7: iteration 67000/ 115203 | consumed samples: 17152000 | consumed tokens: 35127296000 | elapsed time per iteration (s): 0.57 | learning rate: 8.834E-05 | global batch size: 256 | lm loss: 2.661927E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.486 | TFLOPs: 43.04 | +7: iteration 67010/ 115203 | consumed samples: 17154560 | consumed tokens: 35132538880 | elapsed time per iteration (s): 0.55 | learning rate: 8.832E-05 | global batch size: 256 | lm loss: 2.661405E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 67020/ 115203 | consumed samples: 17157120 | consumed tokens: 35137781760 | elapsed time per iteration (s): 0.58 | learning rate: 8.830E-05 | global batch size: 256 | lm loss: 2.666005E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.631 | TFLOPs: 41.91 | +7: iteration 67030/ 115203 | consumed samples: 17159680 | consumed tokens: 35143024640 | elapsed time per iteration (s): 0.57 | learning rate: 8.827E-05 | global batch size: 256 | lm loss: 2.667841E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.205 | TFLOPs: 43.02 | +7: iteration 67040/ 115203 | consumed samples: 17162240 | consumed tokens: 35148267520 | elapsed time per iteration (s): 0.57 | learning rate: 8.825E-05 | global batch size: 256 | lm loss: 2.655024E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.067 | TFLOPs: 43.19 | +7: iteration 67050/ 115203 | consumed samples: 17164800 | consumed tokens: 35153510400 | elapsed time per iteration (s): 0.55 | learning rate: 8.822E-05 | global batch size: 256 | lm loss: 2.661785E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.730 | TFLOPs: 44.02 | +7: iteration 67060/ 115203 | consumed samples: 17167360 | consumed tokens: 35158753280 | elapsed time per iteration (s): 0.56 | learning rate: 8.820E-05 | global batch size: 256 | lm loss: 2.652059E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.524 | TFLOPs: 43.52 | +7: iteration 67070/ 115203 | consumed samples: 17169920 | consumed tokens: 35163996160 | elapsed time per iteration (s): 0.57 | learning rate: 8.818E-05 | global batch size: 256 | lm loss: 2.659978E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.675 | TFLOPs: 42.87 | +7: iteration 67080/ 115203 | consumed samples: 17172480 | consumed tokens: 35169239040 | elapsed time per iteration (s): 0.57 | learning rate: 8.815E-05 | global batch size: 256 | lm loss: 2.671042E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.461 | TFLOPs: 43.14 | +7: iteration 67090/ 115203 | consumed samples: 17175040 | consumed tokens: 35174481920 | elapsed time per iteration (s): 0.55 | learning rate: 8.813E-05 | global batch size: 256 | lm loss: 2.652159E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.314 | TFLOPs: 43.98 | +7: iteration 67100/ 115203 | consumed samples: 17177600 | consumed tokens: 35179724800 | elapsed time per iteration (s): 0.57 | learning rate: 8.810E-05 | global batch size: 256 | lm loss: 2.663955E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.424 | TFLOPs: 43.04 | +7: iteration 67110/ 115203 | consumed samples: 17180160 | consumed tokens: 35184967680 | elapsed time per iteration (s): 0.57 | learning rate: 8.808E-05 | global batch size: 256 | lm loss: 2.666175E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.366 | TFLOPs: 42.46 | +7: iteration 67120/ 115203 | consumed samples: 17182720 | consumed tokens: 35190210560 | elapsed time per iteration (s): 0.57 | learning rate: 8.806E-05 | global batch size: 256 | lm loss: 2.662820E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.506 | TFLOPs: 43.05 | +7: iteration 67130/ 115203 | consumed samples: 17185280 | consumed tokens: 35195453440 | elapsed time per iteration (s): 0.56 | learning rate: 8.803E-05 | global batch size: 256 | lm loss: 2.678041E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.215 | TFLOPs: 43.50 | +7: iteration 67140/ 115203 | consumed samples: 17187840 | consumed tokens: 35200696320 | elapsed time per iteration (s): 0.57 | learning rate: 8.801E-05 | global batch size: 256 | lm loss: 2.669880E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.851 | TFLOPs: 43.08 | +7: iteration 67150/ 115203 | consumed samples: 17190400 | consumed tokens: 35205939200 | elapsed time per iteration (s): 0.55 | learning rate: 8.798E-05 | global batch size: 256 | lm loss: 2.669575E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.606 | TFLOPs: 44.01 | +7: iteration 67160/ 115203 | consumed samples: 17192960 | consumed tokens: 35211182080 | elapsed time per iteration (s): 0.55 | learning rate: 8.796E-05 | global batch size: 256 | lm loss: 2.659213E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.483 | TFLOPs: 44.00 | +7: iteration 67170/ 115203 | consumed samples: 17195520 | consumed tokens: 35216424960 | elapsed time per iteration (s): 0.56 | learning rate: 8.794E-05 | global batch size: 256 | lm loss: 2.658259E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.439 | TFLOPs: 43.42 | +7: iteration 67180/ 115203 | consumed samples: 17198080 | consumed tokens: 35221667840 | elapsed time per iteration (s): 0.55 | learning rate: 8.791E-05 | global batch size: 256 | lm loss: 2.662012E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.304 | TFLOPs: 43.98 | +7: iteration 67190/ 115203 | consumed samples: 17200640 | consumed tokens: 35226910720 | elapsed time per iteration (s): 0.57 | learning rate: 8.789E-05 | global batch size: 256 | lm loss: 2.661249E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.323 | TFLOPs: 42.84 | +7: iteration 67200/ 115203 | consumed samples: 17203200 | consumed tokens: 35232153600 | elapsed time per iteration (s): 0.56 | learning rate: 8.786E-05 | global batch size: 256 | lm loss: 2.675961E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.658 | TFLOPs: 43.63 | +7: iteration 67210/ 115203 | consumed samples: 17205760 | consumed tokens: 35237396480 | elapsed time per iteration (s): 0.56 | learning rate: 8.784E-05 | global batch size: 256 | lm loss: 2.666829E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.448 | TFLOPs: 43.42 | +7: iteration 67220/ 115203 | consumed samples: 17208320 | consumed tokens: 35242639360 | elapsed time per iteration (s): 0.58 | learning rate: 8.782E-05 | global batch size: 256 | lm loss: 2.665706E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.603 | TFLOPs: 42.20 | +7: iteration 67230/ 115203 | consumed samples: 17210880 | consumed tokens: 35247882240 | elapsed time per iteration (s): 0.56 | learning rate: 8.779E-05 | global batch size: 256 | lm loss: 2.663915E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.158 | TFLOPs: 43.49 | +7: iteration 67240/ 115203 | consumed samples: 17213440 | consumed tokens: 35253125120 | elapsed time per iteration (s): 0.55 | learning rate: 8.777E-05 | global batch size: 256 | lm loss: 2.659407E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.367 | TFLOPs: 43.99 | +7: iteration 67250/ 115203 | consumed samples: 17216000 | consumed tokens: 35258368000 | elapsed time per iteration (s): 0.56 | learning rate: 8.774E-05 | global batch size: 256 | lm loss: 2.646671E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.142 | TFLOPs: 43.49 | +7: iteration 67260/ 115203 | consumed samples: 17218560 | consumed tokens: 35263610880 | elapsed time per iteration (s): 0.56 | learning rate: 8.772E-05 | global batch size: 256 | lm loss: 2.656238E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.227 | TFLOPs: 43.97 | +7: iteration 67270/ 115203 | consumed samples: 17221120 | consumed tokens: 35268853760 | elapsed time per iteration (s): 0.55 | learning rate: 8.769E-05 | global batch size: 256 | lm loss: 2.658766E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.325 | TFLOPs: 43.98 | +7: iteration 67280/ 115203 | consumed samples: 17223680 | consumed tokens: 35274096640 | elapsed time per iteration (s): 0.56 | learning rate: 8.767E-05 | global batch size: 256 | lm loss: 2.662956E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.457 | TFLOPs: 43.52 | +7: iteration 67290/ 115203 | consumed samples: 17226240 | consumed tokens: 35279339520 | elapsed time per iteration (s): 0.56 | learning rate: 8.765E-05 | global batch size: 256 | lm loss: 2.669773E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.511 | TFLOPs: 43.81 | +7: iteration 67300/ 115203 | consumed samples: 17228800 | consumed tokens: 35284582400 | elapsed time per iteration (s): 0.56 | learning rate: 8.762E-05 | global batch size: 256 | lm loss: 2.668843E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.708 | TFLOPs: 43.54 | +7: iteration 67310/ 115203 | consumed samples: 17231360 | consumed tokens: 35289825280 | elapsed time per iteration (s): 0.57 | learning rate: 8.760E-05 | global batch size: 256 | lm loss: 2.671668E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.342 | TFLOPs: 42.94 | +7: iteration 67320/ 115203 | consumed samples: 17233920 | consumed tokens: 35295068160 | elapsed time per iteration (s): 0.56 | learning rate: 8.757E-05 | global batch size: 256 | lm loss: 2.658865E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.528 | TFLOPs: 43.52 | +7: iteration 67330/ 115203 | consumed samples: 17236480 | consumed tokens: 35300311040 | elapsed time per iteration (s): 0.57 | learning rate: 8.755E-05 | global batch size: 256 | lm loss: 2.661558E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.956 | TFLOPs: 43.09 | +7: iteration 67340/ 115203 | consumed samples: 17239040 | consumed tokens: 35305553920 | elapsed time per iteration (s): 0.55 | learning rate: 8.753E-05 | global batch size: 256 | lm loss: 2.671372E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.451 | TFLOPs: 43.99 | +7: iteration 67350/ 115203 | consumed samples: 17241600 | consumed tokens: 35310796800 | elapsed time per iteration (s): 0.56 | learning rate: 8.750E-05 | global batch size: 256 | lm loss: 2.655098E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.116 | TFLOPs: 43.49 | +7: iteration 67360/ 115203 | consumed samples: 17244160 | consumed tokens: 35316039680 | elapsed time per iteration (s): 0.56 | learning rate: 8.748E-05 | global batch size: 256 | lm loss: 2.652134E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.260 | TFLOPs: 43.50 | +7: iteration 67370/ 115203 | consumed samples: 17246720 | consumed tokens: 35321282560 | elapsed time per iteration (s): 0.55 | learning rate: 8.745E-05 | global batch size: 256 | lm loss: 2.657155E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.447 | TFLOPs: 43.99 | +7: iteration 67380/ 115203 | consumed samples: 17249280 | consumed tokens: 35326525440 | elapsed time per iteration (s): 0.56 | learning rate: 8.743E-05 | global batch size: 256 | lm loss: 2.661014E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.689 | TFLOPs: 43.54 | +7: iteration 67390/ 115203 | consumed samples: 17251840 | consumed tokens: 35331768320 | elapsed time per iteration (s): 0.55 | learning rate: 8.741E-05 | global batch size: 256 | lm loss: 2.653635E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.377 | TFLOPs: 43.99 | +7: iteration 67400/ 115203 | consumed samples: 17254400 | consumed tokens: 35337011200 | elapsed time per iteration (s): 0.56 | learning rate: 8.738E-05 | global batch size: 256 | lm loss: 2.658491E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 67410/ 115203 | consumed samples: 17256960 | consumed tokens: 35342254080 | elapsed time per iteration (s): 0.56 | learning rate: 8.736E-05 | global batch size: 256 | lm loss: 2.666373E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.168 | TFLOPs: 43.49 | +7: iteration 67420/ 115203 | consumed samples: 17259520 | consumed tokens: 35347496960 | elapsed time per iteration (s): 0.55 | learning rate: 8.733E-05 | global batch size: 256 | lm loss: 2.667904E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 67430/ 115203 | consumed samples: 17262080 | consumed tokens: 35352739840 | elapsed time per iteration (s): 0.56 | learning rate: 8.731E-05 | global batch size: 256 | lm loss: 2.647108E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.315 | TFLOPs: 43.60 | +7: iteration 67440/ 115203 | consumed samples: 17264640 | consumed tokens: 35357982720 | elapsed time per iteration (s): 0.55 | learning rate: 8.729E-05 | global batch size: 256 | lm loss: 2.653335E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.286 | TFLOPs: 43.98 | +7: iteration 67450/ 115203 | consumed samples: 17267200 | consumed tokens: 35363225600 | elapsed time per iteration (s): 0.56 | learning rate: 8.726E-05 | global batch size: 256 | lm loss: 2.647237E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.051 | TFLOPs: 43.96 | +7: iteration 67460/ 115203 | consumed samples: 17269760 | consumed tokens: 35368468480 | elapsed time per iteration (s): 0.56 | learning rate: 8.724E-05 | global batch size: 256 | lm loss: 2.666304E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.083 | TFLOPs: 43.86 | +7: iteration 67470/ 115203 | consumed samples: 17272320 | consumed tokens: 35373711360 | elapsed time per iteration (s): 0.56 | learning rate: 8.721E-05 | global batch size: 256 | lm loss: 2.657063E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.254 | TFLOPs: 43.69 | +7: iteration 67480/ 115203 | consumed samples: 17274880 | consumed tokens: 35378954240 | elapsed time per iteration (s): 0.56 | learning rate: 8.719E-05 | global batch size: 256 | lm loss: 2.673643E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.101 | TFLOPs: 43.96 | +7: iteration 67490/ 115203 | consumed samples: 17277440 | consumed tokens: 35384197120 | elapsed time per iteration (s): 0.56 | learning rate: 8.717E-05 | global batch size: 256 | lm loss: 2.652678E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.842 | TFLOPs: 43.94 | +7: iteration 67500/ 115203 | consumed samples: 17280000 | consumed tokens: 35389440000 | elapsed time per iteration (s): 0.56 | learning rate: 8.714E-05 | global batch size: 256 | lm loss: 2.665964E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.698 | TFLOPs: 43.35 | +7: iteration 67510/ 115203 | consumed samples: 17282560 | consumed tokens: 35394682880 | elapsed time per iteration (s): 0.56 | learning rate: 8.712E-05 | global batch size: 256 | lm loss: 2.654915E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.126 | TFLOPs: 43.68 | +7: iteration 67520/ 115203 | consumed samples: 17285120 | consumed tokens: 35399925760 | elapsed time per iteration (s): 0.56 | learning rate: 8.710E-05 | global batch size: 256 | lm loss: 2.655162E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.583 | TFLOPs: 43.53 | +7: iteration 67530/ 115203 | consumed samples: 17287680 | consumed tokens: 35405168640 | elapsed time per iteration (s): 0.56 | learning rate: 8.707E-05 | global batch size: 256 | lm loss: 2.669747E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 67540/ 115203 | consumed samples: 17290240 | consumed tokens: 35410411520 | elapsed time per iteration (s): 0.56 | learning rate: 8.705E-05 | global batch size: 256 | lm loss: 2.683448E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.222 | TFLOPs: 43.59 | +7: iteration 67550/ 115203 | consumed samples: 17292800 | consumed tokens: 35415654400 | elapsed time per iteration (s): 0.56 | learning rate: 8.702E-05 | global batch size: 256 | lm loss: 2.664042E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.022 | TFLOPs: 43.95 | +7: iteration 67560/ 115203 | consumed samples: 17295360 | consumed tokens: 35420897280 | elapsed time per iteration (s): 0.56 | learning rate: 8.700E-05 | global batch size: 256 | lm loss: 2.663911E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.056 | TFLOPs: 43.96 | +7: iteration 67570/ 115203 | consumed samples: 17297920 | consumed tokens: 35426140160 | elapsed time per iteration (s): 0.57 | learning rate: 8.698E-05 | global batch size: 256 | lm loss: 2.674843E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.740 | TFLOPs: 42.88 | +7: iteration 67580/ 115203 | consumed samples: 17300480 | consumed tokens: 35431383040 | elapsed time per iteration (s): 0.55 | learning rate: 8.695E-05 | global batch size: 256 | lm loss: 2.677213E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.367 | TFLOPs: 43.99 | +7: iteration 67590/ 115203 | consumed samples: 17303040 | consumed tokens: 35436625920 | elapsed time per iteration (s): 0.57 | learning rate: 8.693E-05 | global batch size: 256 | lm loss: 2.662699E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.617 | TFLOPs: 42.77 | +7: iteration 67600/ 115203 | consumed samples: 17305600 | consumed tokens: 35441868800 | elapsed time per iteration (s): 0.56 | learning rate: 8.690E-05 | global batch size: 256 | lm loss: 2.654842E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.058 | TFLOPs: 43.86 | +7: iteration 67610/ 115203 | consumed samples: 17308160 | consumed tokens: 35447111680 | elapsed time per iteration (s): 0.56 | learning rate: 8.688E-05 | global batch size: 256 | lm loss: 2.677675E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.683 | TFLOPs: 43.44 | +7: iteration 67620/ 115203 | consumed samples: 17310720 | consumed tokens: 35452354560 | elapsed time per iteration (s): 0.58 | learning rate: 8.686E-05 | global batch size: 256 | lm loss: 2.664492E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.206 | TFLOPs: 42.35 | +7: iteration 67630/ 115203 | consumed samples: 17313280 | consumed tokens: 35457597440 | elapsed time per iteration (s): 0.56 | learning rate: 8.683E-05 | global batch size: 256 | lm loss: 2.660870E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.954 | TFLOPs: 43.37 | +7: iteration 67640/ 115203 | consumed samples: 17315840 | consumed tokens: 35462840320 | elapsed time per iteration (s): 0.55 | learning rate: 8.681E-05 | global batch size: 256 | lm loss: 2.662326E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.458 | TFLOPs: 43.99 | +7: iteration 67650/ 115203 | consumed samples: 17318400 | consumed tokens: 35468083200 | elapsed time per iteration (s): 0.55 | learning rate: 8.678E-05 | global batch size: 256 | lm loss: 2.674361E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 67660/ 115203 | consumed samples: 17320960 | consumed tokens: 35473326080 | elapsed time per iteration (s): 0.55 | learning rate: 8.676E-05 | global batch size: 256 | lm loss: 2.658047E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.347 | TFLOPs: 43.98 | +7: iteration 67670/ 115203 | consumed samples: 17323520 | consumed tokens: 35478568960 | elapsed time per iteration (s): 0.55 | learning rate: 8.674E-05 | global batch size: 256 | lm loss: 2.661854E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 67680/ 115203 | consumed samples: 17326080 | consumed tokens: 35483811840 | elapsed time per iteration (s): 0.57 | learning rate: 8.671E-05 | global batch size: 256 | lm loss: 2.659164E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.993 | TFLOPs: 43.19 | +7: iteration 67690/ 115203 | consumed samples: 17328640 | consumed tokens: 35489054720 | elapsed time per iteration (s): 0.56 | learning rate: 8.669E-05 | global batch size: 256 | lm loss: 2.672890E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.423 | TFLOPs: 43.80 | +7: iteration 67700/ 115203 | consumed samples: 17331200 | consumed tokens: 35494297600 | elapsed time per iteration (s): 0.56 | learning rate: 8.666E-05 | global batch size: 256 | lm loss: 2.662816E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.277 | TFLOPs: 43.88 | +7: iteration 67710/ 115203 | consumed samples: 17333760 | consumed tokens: 35499540480 | elapsed time per iteration (s): 0.56 | learning rate: 8.664E-05 | global batch size: 256 | lm loss: 2.661942E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.576 | TFLOPs: 43.62 | +7: iteration 67720/ 115203 | consumed samples: 17336320 | consumed tokens: 35504783360 | elapsed time per iteration (s): 0.55 | learning rate: 8.662E-05 | global batch size: 256 | lm loss: 2.663901E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.376 | TFLOPs: 43.99 | +7: iteration 67730/ 115203 | consumed samples: 17338880 | consumed tokens: 35510026240 | elapsed time per iteration (s): 0.56 | learning rate: 8.659E-05 | global batch size: 256 | lm loss: 2.670837E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.989 | TFLOPs: 43.66 | +7: iteration 67740/ 115203 | consumed samples: 17341440 | consumed tokens: 35515269120 | elapsed time per iteration (s): 0.56 | learning rate: 8.657E-05 | global batch size: 256 | lm loss: 2.659477E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.169 | TFLOPs: 43.49 | +7: iteration 67750/ 115203 | consumed samples: 17344000 | consumed tokens: 35520512000 | elapsed time per iteration (s): 0.56 | learning rate: 8.654E-05 | global batch size: 256 | lm loss: 2.664411E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.346 | TFLOPs: 43.60 | +7: iteration 67760/ 115203 | consumed samples: 17346560 | consumed tokens: 35525754880 | elapsed time per iteration (s): 0.56 | learning rate: 8.652E-05 | global batch size: 256 | lm loss: 2.648082E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.159 | TFLOPs: 43.97 | +7: iteration 67770/ 115203 | consumed samples: 17349120 | consumed tokens: 35530997760 | elapsed time per iteration (s): 0.56 | learning rate: 8.650E-05 | global batch size: 256 | lm loss: 2.650066E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.202 | TFLOPs: 43.49 | +7: iteration 67780/ 115203 | consumed samples: 17351680 | consumed tokens: 35536240640 | elapsed time per iteration (s): 0.56 | learning rate: 8.647E-05 | global batch size: 256 | lm loss: 2.652518E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 67790/ 115203 | consumed samples: 17354240 | consumed tokens: 35541483520 | elapsed time per iteration (s): 0.59 | learning rate: 8.645E-05 | global batch size: 256 | lm loss: 2.657833E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.846 | TFLOPs: 41.65 | +7: iteration 67800/ 115203 | consumed samples: 17356800 | consumed tokens: 35546726400 | elapsed time per iteration (s): 0.55 | learning rate: 8.642E-05 | global batch size: 256 | lm loss: 2.656873E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.479 | TFLOPs: 44.00 | +7: iteration 67810/ 115203 | consumed samples: 17359360 | consumed tokens: 35551969280 | elapsed time per iteration (s): 0.56 | learning rate: 8.640E-05 | global batch size: 256 | lm loss: 2.664137E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.296 | TFLOPs: 43.50 | +7: iteration 67820/ 115203 | consumed samples: 17361920 | consumed tokens: 35557212160 | elapsed time per iteration (s): 0.56 | learning rate: 8.638E-05 | global batch size: 256 | lm loss: 2.652870E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.130 | TFLOPs: 43.96 | +7: iteration 67830/ 115203 | consumed samples: 17364480 | consumed tokens: 35562455040 | elapsed time per iteration (s): 0.56 | learning rate: 8.635E-05 | global batch size: 256 | lm loss: 2.673656E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.744 | TFLOPs: 43.45 | +7: iteration 67840/ 115203 | consumed samples: 17367040 | consumed tokens: 35567697920 | elapsed time per iteration (s): 0.56 | learning rate: 8.633E-05 | global batch size: 256 | lm loss: 2.675562E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.172 | TFLOPs: 43.97 | +7: iteration 67850/ 115203 | consumed samples: 17369600 | consumed tokens: 35572940800 | elapsed time per iteration (s): 0.58 | learning rate: 8.630E-05 | global batch size: 256 | lm loss: 2.663253E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.884 | TFLOPs: 42.32 | +7: iteration 67860/ 115203 | consumed samples: 17372160 | consumed tokens: 35578183680 | elapsed time per iteration (s): 0.55 | learning rate: 8.628E-05 | global batch size: 256 | lm loss: 2.665333E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 67870/ 115203 | consumed samples: 17374720 | consumed tokens: 35583426560 | elapsed time per iteration (s): 0.56 | learning rate: 8.626E-05 | global batch size: 256 | lm loss: 2.654563E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.233 | TFLOPs: 43.97 | +7: iteration 67880/ 115203 | consumed samples: 17377280 | consumed tokens: 35588669440 | elapsed time per iteration (s): 0.56 | learning rate: 8.623E-05 | global batch size: 256 | lm loss: 2.649904E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.832 | TFLOPs: 43.84 | +7: iteration 67890/ 115203 | consumed samples: 17379840 | consumed tokens: 35593912320 | elapsed time per iteration (s): 0.57 | learning rate: 8.621E-05 | global batch size: 256 | lm loss: 2.651648E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.582 | TFLOPs: 43.15 | +7: iteration 67900/ 115203 | consumed samples: 17382400 | consumed tokens: 35599155200 | elapsed time per iteration (s): 0.56 | learning rate: 8.619E-05 | global batch size: 256 | lm loss: 2.655283E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.008 | TFLOPs: 43.95 | +7: iteration 67910/ 115203 | consumed samples: 17384960 | consumed tokens: 35604398080 | elapsed time per iteration (s): 0.56 | learning rate: 8.616E-05 | global batch size: 256 | lm loss: 2.668795E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.134 | TFLOPs: 43.96 | +7: iteration 67920/ 115203 | consumed samples: 17387520 | consumed tokens: 35609640960 | elapsed time per iteration (s): 0.57 | learning rate: 8.614E-05 | global batch size: 256 | lm loss: 2.658489E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.034 | TFLOPs: 43.10 | +7: iteration 67930/ 115203 | consumed samples: 17390080 | consumed tokens: 35614883840 | elapsed time per iteration (s): 0.56 | learning rate: 8.611E-05 | global batch size: 256 | lm loss: 2.652793E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.705 | TFLOPs: 43.54 | +7: iteration 67940/ 115203 | consumed samples: 17392640 | consumed tokens: 35620126720 | elapsed time per iteration (s): 0.56 | learning rate: 8.609E-05 | global batch size: 256 | lm loss: 2.666396E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.502 | TFLOPs: 43.90 | +7: iteration 67950/ 115203 | consumed samples: 17395200 | consumed tokens: 35625369600 | elapsed time per iteration (s): 0.56 | learning rate: 8.607E-05 | global batch size: 256 | lm loss: 2.649321E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.167 | TFLOPs: 43.97 | +7: iteration 67960/ 115203 | consumed samples: 17397760 | consumed tokens: 35630612480 | elapsed time per iteration (s): 0.56 | learning rate: 8.604E-05 | global batch size: 256 | lm loss: 2.666892E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.584 | TFLOPs: 43.24 | +7: iteration 67970/ 115203 | consumed samples: 17400320 | consumed tokens: 35635855360 | elapsed time per iteration (s): 0.56 | learning rate: 8.602E-05 | global batch size: 256 | lm loss: 2.654030E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.639 | TFLOPs: 43.34 | +7: iteration 67980/ 115203 | consumed samples: 17402880 | consumed tokens: 35641098240 | elapsed time per iteration (s): 0.55 | learning rate: 8.599E-05 | global batch size: 256 | lm loss: 2.646076E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.334 | TFLOPs: 43.98 | +7: iteration 67990/ 115203 | consumed samples: 17405440 | consumed tokens: 35646341120 | elapsed time per iteration (s): 0.56 | learning rate: 8.597E-05 | global batch size: 256 | lm loss: 2.648891E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.162 | TFLOPs: 43.87 | +0: [2023-03-16 23:29:57,486] [INFO] [logging.py:68:log_dist] [Rank 0] step=68000, skipped=0, lr=[8.594634403532495e-05, 8.594634403532495e-05, 8.594634403532495e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 68000/ 115203 | consumed samples: 17408000 | consumed tokens: 35651584000 | elapsed time per iteration (s): 0.58 | learning rate: 8.595E-05 | global batch size: 256 | lm loss: 2.652788E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.084 | TFLOPs: 42.34 | +0: steps: 68000 loss: 2.6290 iter time (s): 0.562 samples/sec: 455.640 +7: iteration 68010/ 115203 | consumed samples: 17410560 | consumed tokens: 35656826880 | elapsed time per iteration (s): 0.57 | learning rate: 8.592E-05 | global batch size: 256 | lm loss: 2.663309E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.420 | TFLOPs: 42.94 | +7: iteration 68020/ 115203 | consumed samples: 17413120 | consumed tokens: 35662069760 | elapsed time per iteration (s): 0.55 | learning rate: 8.590E-05 | global batch size: 256 | lm loss: 2.654685E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.320 | TFLOPs: 43.98 | +7: iteration 68030/ 115203 | consumed samples: 17415680 | consumed tokens: 35667312640 | elapsed time per iteration (s): 0.55 | learning rate: 8.587E-05 | global batch size: 256 | lm loss: 2.653188E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.474 | TFLOPs: 44.00 | +7: iteration 68040/ 115203 | consumed samples: 17418240 | consumed tokens: 35672555520 | elapsed time per iteration (s): 0.57 | learning rate: 8.585E-05 | global batch size: 256 | lm loss: 2.655777E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.420 | TFLOPs: 42.66 | +7: iteration 68050/ 115203 | consumed samples: 17420800 | consumed tokens: 35677798400 | elapsed time per iteration (s): 0.56 | learning rate: 8.583E-05 | global batch size: 256 | lm loss: 2.668606E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.299 | TFLOPs: 43.41 | +7: iteration 68060/ 115203 | consumed samples: 17423360 | consumed tokens: 35683041280 | elapsed time per iteration (s): 0.56 | learning rate: 8.580E-05 | global batch size: 256 | lm loss: 2.656431E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.104 | TFLOPs: 43.39 | +7: iteration 68070/ 115203 | consumed samples: 17425920 | consumed tokens: 35688284160 | elapsed time per iteration (s): 0.56 | learning rate: 8.578E-05 | global batch size: 256 | lm loss: 2.669948E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.056 | TFLOPs: 43.29 | +7: iteration 68080/ 115203 | consumed samples: 17428480 | consumed tokens: 35693527040 | elapsed time per iteration (s): 0.55 | learning rate: 8.576E-05 | global batch size: 256 | lm loss: 2.650773E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 68090/ 115203 | consumed samples: 17431040 | consumed tokens: 35698769920 | elapsed time per iteration (s): 0.57 | learning rate: 8.573E-05 | global batch size: 256 | lm loss: 2.666271E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.700 | TFLOPs: 42.59 | +7: iteration 68100/ 115203 | consumed samples: 17433600 | consumed tokens: 35704012800 | elapsed time per iteration (s): 0.56 | learning rate: 8.571E-05 | global batch size: 256 | lm loss: 2.656937E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.124 | TFLOPs: 43.39 | +7: iteration 68110/ 115203 | consumed samples: 17436160 | consumed tokens: 35709255680 | elapsed time per iteration (s): 0.56 | learning rate: 8.568E-05 | global batch size: 256 | lm loss: 2.677435E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.023 | TFLOPs: 43.67 | +7: iteration 68120/ 115203 | consumed samples: 17438720 | consumed tokens: 35714498560 | elapsed time per iteration (s): 0.57 | learning rate: 8.566E-05 | global batch size: 256 | lm loss: 2.661239E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.429 | TFLOPs: 43.13 | +7: iteration 68130/ 115203 | consumed samples: 17441280 | consumed tokens: 35719741440 | elapsed time per iteration (s): 0.56 | learning rate: 8.564E-05 | global batch size: 256 | lm loss: 2.672005E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.182 | TFLOPs: 43.97 | +7: iteration 68140/ 115203 | consumed samples: 17443840 | consumed tokens: 35724984320 | elapsed time per iteration (s): 0.56 | learning rate: 8.561E-05 | global batch size: 256 | lm loss: 2.660184E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.064 | TFLOPs: 43.96 | +7: iteration 68150/ 115203 | consumed samples: 17446400 | consumed tokens: 35730227200 | elapsed time per iteration (s): 0.56 | learning rate: 8.559E-05 | global batch size: 256 | lm loss: 2.667563E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.848 | TFLOPs: 43.36 | +7: iteration 68160/ 115203 | consumed samples: 17448960 | consumed tokens: 35735470080 | elapsed time per iteration (s): 0.56 | learning rate: 8.556E-05 | global batch size: 256 | lm loss: 2.667665E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.328 | TFLOPs: 43.70 | +7: iteration 68170/ 115203 | consumed samples: 17451520 | consumed tokens: 35740712960 | elapsed time per iteration (s): 0.56 | learning rate: 8.554E-05 | global batch size: 256 | lm loss: 2.663882E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.496 | TFLOPs: 43.62 | +7: iteration 68180/ 115203 | consumed samples: 17454080 | consumed tokens: 35745955840 | elapsed time per iteration (s): 0.56 | learning rate: 8.552E-05 | global batch size: 256 | lm loss: 2.668623E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 68190/ 115203 | consumed samples: 17456640 | consumed tokens: 35751198720 | elapsed time per iteration (s): 0.55 | learning rate: 8.549E-05 | global batch size: 256 | lm loss: 2.658108E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.490 | TFLOPs: 44.00 | +7: iteration 68200/ 115203 | consumed samples: 17459200 | consumed tokens: 35756441600 | elapsed time per iteration (s): 0.56 | learning rate: 8.547E-05 | global batch size: 256 | lm loss: 2.670825E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.394 | TFLOPs: 43.89 | +7: iteration 68210/ 115203 | consumed samples: 17461760 | consumed tokens: 35761684480 | elapsed time per iteration (s): 0.56 | learning rate: 8.545E-05 | global batch size: 256 | lm loss: 2.647730E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.798 | TFLOPs: 43.36 | +7: iteration 68220/ 115203 | consumed samples: 17464320 | consumed tokens: 35766927360 | elapsed time per iteration (s): 0.56 | learning rate: 8.542E-05 | global batch size: 256 | lm loss: 2.665498E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.417 | TFLOPs: 43.71 | +7: iteration 68230/ 115203 | consumed samples: 17466880 | consumed tokens: 35772170240 | elapsed time per iteration (s): 0.56 | learning rate: 8.540E-05 | global batch size: 256 | lm loss: 2.671718E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.372 | TFLOPs: 43.70 | +7: iteration 68240/ 115203 | consumed samples: 17469440 | consumed tokens: 35777413120 | elapsed time per iteration (s): 0.56 | learning rate: 8.537E-05 | global batch size: 256 | lm loss: 2.666291E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.643 | TFLOPs: 43.54 | +7: iteration 68250/ 115203 | consumed samples: 17472000 | consumed tokens: 35782656000 | elapsed time per iteration (s): 0.56 | learning rate: 8.535E-05 | global batch size: 256 | lm loss: 2.663534E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.623 | TFLOPs: 43.44 | +7: iteration 68260/ 115203 | consumed samples: 17474560 | consumed tokens: 35787898880 | elapsed time per iteration (s): 0.57 | learning rate: 8.533E-05 | global batch size: 256 | lm loss: 2.640475E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.786 | TFLOPs: 42.69 | +7: iteration 68270/ 115203 | consumed samples: 17477120 | consumed tokens: 35793141760 | elapsed time per iteration (s): 0.56 | learning rate: 8.530E-05 | global batch size: 256 | lm loss: 2.657558E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.749 | TFLOPs: 43.45 | +7: iteration 68280/ 115203 | consumed samples: 17479680 | consumed tokens: 35798384640 | elapsed time per iteration (s): 0.56 | learning rate: 8.528E-05 | global batch size: 256 | lm loss: 2.648572E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.185 | TFLOPs: 43.97 | +7: iteration 68290/ 115203 | consumed samples: 17482240 | consumed tokens: 35803627520 | elapsed time per iteration (s): 0.56 | learning rate: 8.525E-05 | global batch size: 256 | lm loss: 2.654489E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.173 | TFLOPs: 43.30 | +7: iteration 68300/ 115203 | consumed samples: 17484800 | consumed tokens: 35808870400 | elapsed time per iteration (s): 0.55 | learning rate: 8.523E-05 | global batch size: 256 | lm loss: 2.657831E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.421 | TFLOPs: 43.99 | +7: iteration 68310/ 115203 | consumed samples: 17487360 | consumed tokens: 35814113280 | elapsed time per iteration (s): 0.56 | learning rate: 8.521E-05 | global batch size: 256 | lm loss: 2.666474E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.112 | TFLOPs: 43.87 | +7: iteration 68320/ 115203 | consumed samples: 17489920 | consumed tokens: 35819356160 | elapsed time per iteration (s): 0.56 | learning rate: 8.518E-05 | global batch size: 256 | lm loss: 2.649382E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.632 | TFLOPs: 43.63 | +7: iteration 68330/ 115203 | consumed samples: 17492480 | consumed tokens: 35824599040 | elapsed time per iteration (s): 0.56 | learning rate: 8.516E-05 | global batch size: 256 | lm loss: 2.670806E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.127 | TFLOPs: 43.96 | +7: iteration 68340/ 115203 | consumed samples: 17495040 | consumed tokens: 35829841920 | elapsed time per iteration (s): 0.56 | learning rate: 8.514E-05 | global batch size: 256 | lm loss: 2.660403E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.568 | TFLOPs: 43.72 | +7: iteration 68350/ 115203 | consumed samples: 17497600 | consumed tokens: 35835084800 | elapsed time per iteration (s): 0.55 | learning rate: 8.511E-05 | global batch size: 256 | lm loss: 2.660694E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.273 | TFLOPs: 43.98 | +7: iteration 68360/ 115203 | consumed samples: 17500160 | consumed tokens: 35840327680 | elapsed time per iteration (s): 0.57 | learning rate: 8.509E-05 | global batch size: 256 | lm loss: 2.670044E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.465 | TFLOPs: 42.85 | +7: iteration 68370/ 115203 | consumed samples: 17502720 | consumed tokens: 35845570560 | elapsed time per iteration (s): 0.56 | learning rate: 8.506E-05 | global batch size: 256 | lm loss: 2.666780E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.157 | TFLOPs: 43.97 | +7: iteration 68380/ 115203 | consumed samples: 17505280 | consumed tokens: 35850813440 | elapsed time per iteration (s): 0.58 | learning rate: 8.504E-05 | global batch size: 256 | lm loss: 2.665195E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.076 | TFLOPs: 41.77 | +7: iteration 68390/ 115203 | consumed samples: 17507840 | consumed tokens: 35856056320 | elapsed time per iteration (s): 0.58 | learning rate: 8.502E-05 | global batch size: 256 | lm loss: 2.666863E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.290 | TFLOPs: 41.88 | +7: iteration 68400/ 115203 | consumed samples: 17510400 | consumed tokens: 35861299200 | elapsed time per iteration (s): 0.56 | learning rate: 8.499E-05 | global batch size: 256 | lm loss: 2.662598E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.325 | TFLOPs: 43.70 | +7: iteration 68410/ 115203 | consumed samples: 17512960 | consumed tokens: 35866542080 | elapsed time per iteration (s): 0.56 | learning rate: 8.497E-05 | global batch size: 256 | lm loss: 2.668469E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.879 | TFLOPs: 43.37 | +7: iteration 68420/ 115203 | consumed samples: 17515520 | consumed tokens: 35871784960 | elapsed time per iteration (s): 0.57 | learning rate: 8.494E-05 | global batch size: 256 | lm loss: 2.676544E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.335 | TFLOPs: 42.46 | +7: iteration 68430/ 115203 | consumed samples: 17518080 | consumed tokens: 35877027840 | elapsed time per iteration (s): 0.55 | learning rate: 8.492E-05 | global batch size: 256 | lm loss: 2.646498E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 68440/ 115203 | consumed samples: 17520640 | consumed tokens: 35882270720 | elapsed time per iteration (s): 0.56 | learning rate: 8.490E-05 | global batch size: 256 | lm loss: 2.671403E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.855 | TFLOPs: 43.94 | +7: iteration 68450/ 115203 | consumed samples: 17523200 | consumed tokens: 35887513600 | elapsed time per iteration (s): 0.56 | learning rate: 8.487E-05 | global batch size: 256 | lm loss: 2.655031E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.835 | TFLOPs: 43.74 | +7: iteration 68460/ 115203 | consumed samples: 17525760 | consumed tokens: 35892756480 | elapsed time per iteration (s): 0.56 | learning rate: 8.485E-05 | global batch size: 256 | lm loss: 2.662741E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.406 | TFLOPs: 43.61 | +7: iteration 68470/ 115203 | consumed samples: 17528320 | consumed tokens: 35897999360 | elapsed time per iteration (s): 0.56 | learning rate: 8.483E-05 | global batch size: 256 | lm loss: 2.667002E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.380 | TFLOPs: 43.22 | +7: iteration 68480/ 115203 | consumed samples: 17530880 | consumed tokens: 35903242240 | elapsed time per iteration (s): 0.57 | learning rate: 8.480E-05 | global batch size: 256 | lm loss: 2.670307E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.049 | TFLOPs: 43.00 | +7: iteration 68490/ 115203 | consumed samples: 17533440 | consumed tokens: 35908485120 | elapsed time per iteration (s): 0.57 | learning rate: 8.478E-05 | global batch size: 256 | lm loss: 2.667099E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.936 | TFLOPs: 43.18 | +7: iteration 68500/ 115203 | consumed samples: 17536000 | consumed tokens: 35913728000 | elapsed time per iteration (s): 0.57 | learning rate: 8.475E-05 | global batch size: 256 | lm loss: 2.666145E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.270 | TFLOPs: 42.83 | +7: iteration 68510/ 115203 | consumed samples: 17538560 | consumed tokens: 35918970880 | elapsed time per iteration (s): 0.57 | learning rate: 8.473E-05 | global batch size: 256 | lm loss: 2.648170E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.890 | TFLOPs: 43.18 | +7: iteration 68520/ 115203 | consumed samples: 17541120 | consumed tokens: 35924213760 | elapsed time per iteration (s): 0.56 | learning rate: 8.471E-05 | global batch size: 256 | lm loss: 2.659178E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.049 | TFLOPs: 43.96 | +7: iteration 68530/ 115203 | consumed samples: 17543680 | consumed tokens: 35929456640 | elapsed time per iteration (s): 0.56 | learning rate: 8.468E-05 | global batch size: 256 | lm loss: 2.656790E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.916 | TFLOPs: 43.28 | +7: iteration 68540/ 115203 | consumed samples: 17546240 | consumed tokens: 35934699520 | elapsed time per iteration (s): 0.57 | learning rate: 8.466E-05 | global batch size: 256 | lm loss: 2.659755E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.423 | TFLOPs: 42.85 | +7: iteration 68550/ 115203 | consumed samples: 17548800 | consumed tokens: 35939942400 | elapsed time per iteration (s): 0.56 | learning rate: 8.464E-05 | global batch size: 256 | lm loss: 2.647894E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.211 | TFLOPs: 43.21 | +7: iteration 68560/ 115203 | consumed samples: 17551360 | consumed tokens: 35945185280 | elapsed time per iteration (s): 0.56 | learning rate: 8.461E-05 | global batch size: 256 | lm loss: 2.663322E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.806 | TFLOPs: 43.84 | +7: iteration 68570/ 115203 | consumed samples: 17553920 | consumed tokens: 35950428160 | elapsed time per iteration (s): 0.57 | learning rate: 8.459E-05 | global batch size: 256 | lm loss: 2.667928E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.424 | TFLOPs: 42.56 | +7: iteration 68580/ 115203 | consumed samples: 17556480 | consumed tokens: 35955671040 | elapsed time per iteration (s): 0.58 | learning rate: 8.456E-05 | global batch size: 256 | lm loss: 2.653920E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.241 | TFLOPs: 42.35 | +7: iteration 68590/ 115203 | consumed samples: 17559040 | consumed tokens: 35960913920 | elapsed time per iteration (s): 0.56 | learning rate: 8.454E-05 | global batch size: 256 | lm loss: 2.664538E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.124 | TFLOPs: 43.96 | +7: iteration 68600/ 115203 | consumed samples: 17561600 | consumed tokens: 35966156800 | elapsed time per iteration (s): 0.56 | learning rate: 8.452E-05 | global batch size: 256 | lm loss: 2.661398E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.596 | TFLOPs: 43.34 | +7: iteration 68610/ 115203 | consumed samples: 17564160 | consumed tokens: 35971399680 | elapsed time per iteration (s): 0.56 | learning rate: 8.449E-05 | global batch size: 256 | lm loss: 2.656621E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.916 | TFLOPs: 43.85 | +7: iteration 68620/ 115203 | consumed samples: 17566720 | consumed tokens: 35976642560 | elapsed time per iteration (s): 0.56 | learning rate: 8.447E-05 | global batch size: 256 | lm loss: 2.663604E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.700 | TFLOPs: 43.35 | +7: iteration 68630/ 115203 | consumed samples: 17569280 | consumed tokens: 35981885440 | elapsed time per iteration (s): 0.56 | learning rate: 8.445E-05 | global batch size: 256 | lm loss: 2.657809E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.325 | TFLOPs: 43.31 | +7: iteration 68640/ 115203 | consumed samples: 17571840 | consumed tokens: 35987128320 | elapsed time per iteration (s): 0.56 | learning rate: 8.442E-05 | global batch size: 256 | lm loss: 2.661999E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.557 | TFLOPs: 43.72 | +7: iteration 68650/ 115203 | consumed samples: 17574400 | consumed tokens: 35992371200 | elapsed time per iteration (s): 0.56 | learning rate: 8.440E-05 | global batch size: 256 | lm loss: 2.654300E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.482 | TFLOPs: 43.81 | +7: iteration 68660/ 115203 | consumed samples: 17576960 | consumed tokens: 35997614080 | elapsed time per iteration (s): 0.56 | learning rate: 8.437E-05 | global batch size: 256 | lm loss: 2.664392E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.982 | TFLOPs: 43.85 | +7: iteration 68670/ 115203 | consumed samples: 17579520 | consumed tokens: 36002856960 | elapsed time per iteration (s): 0.56 | learning rate: 8.435E-05 | global batch size: 256 | lm loss: 2.663477E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.230 | TFLOPs: 43.97 | +7: iteration 68680/ 115203 | consumed samples: 17582080 | consumed tokens: 36008099840 | elapsed time per iteration (s): 0.56 | learning rate: 8.433E-05 | global batch size: 256 | lm loss: 2.662957E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.583 | TFLOPs: 43.43 | +7: iteration 68690/ 115203 | consumed samples: 17584640 | consumed tokens: 36013342720 | elapsed time per iteration (s): 0.56 | learning rate: 8.430E-05 | global batch size: 256 | lm loss: 2.661588E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.938 | TFLOPs: 43.95 | +7: iteration 68700/ 115203 | consumed samples: 17587200 | consumed tokens: 36018585600 | elapsed time per iteration (s): 0.57 | learning rate: 8.428E-05 | global batch size: 256 | lm loss: 2.655927E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.130 | TFLOPs: 43.11 | +7: iteration 68710/ 115203 | consumed samples: 17589760 | consumed tokens: 36023828480 | elapsed time per iteration (s): 0.56 | learning rate: 8.425E-05 | global batch size: 256 | lm loss: 2.651571E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.609 | TFLOPs: 43.91 | +7: iteration 68720/ 115203 | consumed samples: 17592320 | consumed tokens: 36029071360 | elapsed time per iteration (s): 0.56 | learning rate: 8.423E-05 | global batch size: 256 | lm loss: 2.660745E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.146 | TFLOPs: 43.30 | +7: iteration 68730/ 115203 | consumed samples: 17594880 | consumed tokens: 36034314240 | elapsed time per iteration (s): 0.57 | learning rate: 8.421E-05 | global batch size: 256 | lm loss: 2.672186E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.557 | TFLOPs: 42.96 | +7: iteration 68740/ 115203 | consumed samples: 17597440 | consumed tokens: 36039557120 | elapsed time per iteration (s): 0.57 | learning rate: 8.418E-05 | global batch size: 256 | lm loss: 2.660540E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.111 | TFLOPs: 42.82 | +7: iteration 68750/ 115203 | consumed samples: 17600000 | consumed tokens: 36044800000 | elapsed time per iteration (s): 0.56 | learning rate: 8.416E-05 | global batch size: 256 | lm loss: 2.648689E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.995 | TFLOPs: 43.95 | +7: iteration 68760/ 115203 | consumed samples: 17602560 | consumed tokens: 36050042880 | elapsed time per iteration (s): 0.58 | learning rate: 8.414E-05 | global batch size: 256 | lm loss: 2.652976E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.076 | TFLOPs: 42.34 | +7: iteration 68770/ 115203 | consumed samples: 17605120 | consumed tokens: 36055285760 | elapsed time per iteration (s): 0.56 | learning rate: 8.411E-05 | global batch size: 256 | lm loss: 2.641783E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.107 | TFLOPs: 43.68 | +7: iteration 68780/ 115203 | consumed samples: 17607680 | consumed tokens: 36060528640 | elapsed time per iteration (s): 0.56 | learning rate: 8.409E-05 | global batch size: 256 | lm loss: 2.658463E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.886 | TFLOPs: 43.37 | +7: iteration 68790/ 115203 | consumed samples: 17610240 | consumed tokens: 36065771520 | elapsed time per iteration (s): 0.56 | learning rate: 8.406E-05 | global batch size: 256 | lm loss: 2.667126E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.645 | TFLOPs: 43.44 | +7: iteration 68800/ 115203 | consumed samples: 17612800 | consumed tokens: 36071014400 | elapsed time per iteration (s): 0.57 | learning rate: 8.404E-05 | global batch size: 256 | lm loss: 2.648400E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.132 | TFLOPs: 42.63 | +7: iteration 68810/ 115203 | consumed samples: 17615360 | consumed tokens: 36076257280 | elapsed time per iteration (s): 0.56 | learning rate: 8.402E-05 | global batch size: 256 | lm loss: 2.660072E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.158 | TFLOPs: 43.87 | +7: iteration 68820/ 115203 | consumed samples: 17617920 | consumed tokens: 36081500160 | elapsed time per iteration (s): 0.56 | learning rate: 8.399E-05 | global batch size: 256 | lm loss: 2.674694E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.380 | TFLOPs: 43.70 | +7: iteration 68830/ 115203 | consumed samples: 17620480 | consumed tokens: 36086743040 | elapsed time per iteration (s): 0.57 | learning rate: 8.397E-05 | global batch size: 256 | lm loss: 2.659369E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.816 | TFLOPs: 42.89 | +7: iteration 68840/ 115203 | consumed samples: 17623040 | consumed tokens: 36091985920 | elapsed time per iteration (s): 0.56 | learning rate: 8.395E-05 | global batch size: 256 | lm loss: 2.659142E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.832 | TFLOPs: 43.94 | +7: iteration 68850/ 115203 | consumed samples: 17625600 | consumed tokens: 36097228800 | elapsed time per iteration (s): 0.56 | learning rate: 8.392E-05 | global batch size: 256 | lm loss: 2.659763E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.161 | TFLOPs: 43.97 | +7: iteration 68860/ 115203 | consumed samples: 17628160 | consumed tokens: 36102471680 | elapsed time per iteration (s): 0.56 | learning rate: 8.390E-05 | global batch size: 256 | lm loss: 2.661342E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.268 | TFLOPs: 43.60 | +7: iteration 68870/ 115203 | consumed samples: 17630720 | consumed tokens: 36107714560 | elapsed time per iteration (s): 0.56 | learning rate: 8.388E-05 | global batch size: 256 | lm loss: 2.651883E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.259 | TFLOPs: 43.40 | +7: iteration 68880/ 115203 | consumed samples: 17633280 | consumed tokens: 36112957440 | elapsed time per iteration (s): 0.56 | learning rate: 8.385E-05 | global batch size: 256 | lm loss: 2.644400E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.823 | TFLOPs: 43.93 | +7: iteration 68890/ 115203 | consumed samples: 17635840 | consumed tokens: 36118200320 | elapsed time per iteration (s): 0.56 | learning rate: 8.383E-05 | global batch size: 256 | lm loss: 2.656280E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.247 | TFLOPs: 43.88 | +7: iteration 68900/ 115203 | consumed samples: 17638400 | consumed tokens: 36123443200 | elapsed time per iteration (s): 0.56 | learning rate: 8.380E-05 | global batch size: 256 | lm loss: 2.654287E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.028 | TFLOPs: 43.76 | +7: iteration 68910/ 115203 | consumed samples: 17640960 | consumed tokens: 36128686080 | elapsed time per iteration (s): 0.55 | learning rate: 8.378E-05 | global batch size: 256 | lm loss: 2.652462E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 68920/ 115203 | consumed samples: 17643520 | consumed tokens: 36133928960 | elapsed time per iteration (s): 0.55 | learning rate: 8.376E-05 | global batch size: 256 | lm loss: 2.646464E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.314 | TFLOPs: 43.98 | +7: iteration 68930/ 115203 | consumed samples: 17646080 | consumed tokens: 36139171840 | elapsed time per iteration (s): 0.56 | learning rate: 8.373E-05 | global batch size: 256 | lm loss: 2.657262E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.580 | TFLOPs: 43.72 | +7: iteration 68940/ 115203 | consumed samples: 17648640 | consumed tokens: 36144414720 | elapsed time per iteration (s): 0.55 | learning rate: 8.371E-05 | global batch size: 256 | lm loss: 2.675841E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.406 | TFLOPs: 43.99 | +7: iteration 68950/ 115203 | consumed samples: 17651200 | consumed tokens: 36149657600 | elapsed time per iteration (s): 0.56 | learning rate: 8.369E-05 | global batch size: 256 | lm loss: 2.648125E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.763 | TFLOPs: 43.45 | +7: iteration 68960/ 115203 | consumed samples: 17653760 | consumed tokens: 36154900480 | elapsed time per iteration (s): 0.55 | learning rate: 8.366E-05 | global batch size: 256 | lm loss: 2.647026E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.344 | TFLOPs: 43.98 | +7: iteration 68970/ 115203 | consumed samples: 17656320 | consumed tokens: 36160143360 | elapsed time per iteration (s): 0.55 | learning rate: 8.364E-05 | global batch size: 256 | lm loss: 2.653424E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 68980/ 115203 | consumed samples: 17658880 | consumed tokens: 36165386240 | elapsed time per iteration (s): 0.56 | learning rate: 8.361E-05 | global batch size: 256 | lm loss: 2.660653E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 68990/ 115203 | consumed samples: 17661440 | consumed tokens: 36170629120 | elapsed time per iteration (s): 0.56 | learning rate: 8.359E-05 | global batch size: 256 | lm loss: 2.658368E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.975 | TFLOPs: 43.95 | +7: iteration 69000/ 115203 | consumed samples: 17664000 | consumed tokens: 36175872000 | elapsed time per iteration (s): 0.58 | learning rate: 8.357E-05 | global batch size: 256 | lm loss: 2.651175E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.292 | TFLOPs: 42.07 | +7: iteration 69010/ 115203 | consumed samples: 17666560 | consumed tokens: 36181114880 | elapsed time per iteration (s): 0.57 | learning rate: 8.354E-05 | global batch size: 256 | lm loss: 2.649132E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.835 | TFLOPs: 42.51 | +7: iteration 69020/ 115203 | consumed samples: 17669120 | consumed tokens: 36186357760 | elapsed time per iteration (s): 0.58 | learning rate: 8.352E-05 | global batch size: 256 | lm loss: 2.647315E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.765 | TFLOPs: 42.40 | +7: iteration 69030/ 115203 | consumed samples: 17671680 | consumed tokens: 36191600640 | elapsed time per iteration (s): 0.56 | learning rate: 8.350E-05 | global batch size: 256 | lm loss: 2.659314E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.701 | TFLOPs: 43.45 | +7: iteration 69040/ 115203 | consumed samples: 17674240 | consumed tokens: 36196843520 | elapsed time per iteration (s): 0.56 | learning rate: 8.347E-05 | global batch size: 256 | lm loss: 2.668574E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.866 | TFLOPs: 43.94 | +7: iteration 69050/ 115203 | consumed samples: 17676800 | consumed tokens: 36202086400 | elapsed time per iteration (s): 0.56 | learning rate: 8.345E-05 | global batch size: 256 | lm loss: 2.660947E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.861 | TFLOPs: 43.46 | +7: iteration 69060/ 115203 | consumed samples: 17679360 | consumed tokens: 36207329280 | elapsed time per iteration (s): 0.55 | learning rate: 8.342E-05 | global batch size: 256 | lm loss: 2.655894E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 69070/ 115203 | consumed samples: 17681920 | consumed tokens: 36212572160 | elapsed time per iteration (s): 0.56 | learning rate: 8.340E-05 | global batch size: 256 | lm loss: 2.645266E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.807 | TFLOPs: 43.84 | +7: iteration 69080/ 115203 | consumed samples: 17684480 | consumed tokens: 36217815040 | elapsed time per iteration (s): 0.56 | learning rate: 8.338E-05 | global batch size: 256 | lm loss: 2.645162E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.258 | TFLOPs: 43.69 | +7: iteration 69090/ 115203 | consumed samples: 17687040 | consumed tokens: 36223057920 | elapsed time per iteration (s): 0.56 | learning rate: 8.335E-05 | global batch size: 256 | lm loss: 2.654554E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.123 | TFLOPs: 43.20 | +7: iteration 69100/ 115203 | consumed samples: 17689600 | consumed tokens: 36228300800 | elapsed time per iteration (s): 0.57 | learning rate: 8.333E-05 | global batch size: 256 | lm loss: 2.651354E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.174 | TFLOPs: 43.01 | +7: iteration 69110/ 115203 | consumed samples: 17692160 | consumed tokens: 36233543680 | elapsed time per iteration (s): 0.56 | learning rate: 8.331E-05 | global batch size: 256 | lm loss: 2.657081E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.851 | TFLOPs: 43.37 | +7: iteration 69120/ 115203 | consumed samples: 17694720 | consumed tokens: 36238786560 | elapsed time per iteration (s): 0.57 | learning rate: 8.328E-05 | global batch size: 256 | lm loss: 2.663420E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.246 | TFLOPs: 42.93 | +7: iteration 69130/ 115203 | consumed samples: 17697280 | consumed tokens: 36244029440 | elapsed time per iteration (s): 0.55 | learning rate: 8.326E-05 | global batch size: 256 | lm loss: 2.659404E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.758 | TFLOPs: 44.02 | +7: iteration 69140/ 115203 | consumed samples: 17699840 | consumed tokens: 36249272320 | elapsed time per iteration (s): 0.56 | learning rate: 8.324E-05 | global batch size: 256 | lm loss: 2.652351E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.172 | TFLOPs: 43.30 | +7: iteration 69150/ 115203 | consumed samples: 17702400 | consumed tokens: 36254515200 | elapsed time per iteration (s): 0.55 | learning rate: 8.321E-05 | global batch size: 256 | lm loss: 2.660826E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 69160/ 115203 | consumed samples: 17704960 | consumed tokens: 36259758080 | elapsed time per iteration (s): 0.56 | learning rate: 8.319E-05 | global batch size: 256 | lm loss: 2.649607E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.031 | TFLOPs: 43.67 | +7: iteration 69170/ 115203 | consumed samples: 17707520 | consumed tokens: 36265000960 | elapsed time per iteration (s): 0.59 | learning rate: 8.316E-05 | global batch size: 256 | lm loss: 2.661584E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.497 | TFLOPs: 41.52 | +7: iteration 69180/ 115203 | consumed samples: 17710080 | consumed tokens: 36270243840 | elapsed time per iteration (s): 0.56 | learning rate: 8.314E-05 | global batch size: 256 | lm loss: 2.656708E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.442 | TFLOPs: 43.42 | +7: iteration 69190/ 115203 | consumed samples: 17712640 | consumed tokens: 36275486720 | elapsed time per iteration (s): 0.57 | learning rate: 8.312E-05 | global batch size: 256 | lm loss: 2.656972E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.393 | TFLOPs: 43.04 | +7: iteration 69200/ 115203 | consumed samples: 17715200 | consumed tokens: 36280729600 | elapsed time per iteration (s): 0.57 | learning rate: 8.309E-05 | global batch size: 256 | lm loss: 2.655224E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.781 | TFLOPs: 43.07 | +7: iteration 69210/ 115203 | consumed samples: 17717760 | consumed tokens: 36285972480 | elapsed time per iteration (s): 0.56 | learning rate: 8.307E-05 | global batch size: 256 | lm loss: 2.655604E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.198 | TFLOPs: 43.87 | +7: iteration 69220/ 115203 | consumed samples: 17720320 | consumed tokens: 36291215360 | elapsed time per iteration (s): 0.57 | learning rate: 8.305E-05 | global batch size: 256 | lm loss: 2.659452E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.736 | TFLOPs: 43.16 | +7: iteration 69230/ 115203 | consumed samples: 17722880 | consumed tokens: 36296458240 | elapsed time per iteration (s): 0.57 | learning rate: 8.302E-05 | global batch size: 256 | lm loss: 2.651687E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.693 | TFLOPs: 42.78 | +7: iteration 69240/ 115203 | consumed samples: 17725440 | consumed tokens: 36301701120 | elapsed time per iteration (s): 0.56 | learning rate: 8.300E-05 | global batch size: 256 | lm loss: 2.663180E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.062 | TFLOPs: 43.77 | +7: iteration 69250/ 115203 | consumed samples: 17728000 | consumed tokens: 36306944000 | elapsed time per iteration (s): 0.56 | learning rate: 8.298E-05 | global batch size: 256 | lm loss: 2.656120E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.164 | TFLOPs: 43.97 | +7: iteration 69260/ 115203 | consumed samples: 17730560 | consumed tokens: 36312186880 | elapsed time per iteration (s): 0.56 | learning rate: 8.295E-05 | global batch size: 256 | lm loss: 2.653515E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.530 | TFLOPs: 43.53 | +7: iteration 69270/ 115203 | consumed samples: 17733120 | consumed tokens: 36317429760 | elapsed time per iteration (s): 0.57 | learning rate: 8.293E-05 | global batch size: 256 | lm loss: 2.649740E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.628 | TFLOPs: 42.96 | +7: iteration 69280/ 115203 | consumed samples: 17735680 | consumed tokens: 36322672640 | elapsed time per iteration (s): 0.56 | learning rate: 8.290E-05 | global batch size: 256 | lm loss: 2.665435E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.113 | TFLOPs: 43.77 | +7: iteration 69290/ 115203 | consumed samples: 17738240 | consumed tokens: 36327915520 | elapsed time per iteration (s): 0.56 | learning rate: 8.288E-05 | global batch size: 256 | lm loss: 2.664476E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.224 | TFLOPs: 43.97 | +7: iteration 69300/ 115203 | consumed samples: 17740800 | consumed tokens: 36333158400 | elapsed time per iteration (s): 0.56 | learning rate: 8.286E-05 | global batch size: 256 | lm loss: 2.652588E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.540 | TFLOPs: 43.43 | +7: iteration 69310/ 115203 | consumed samples: 17743360 | consumed tokens: 36338401280 | elapsed time per iteration (s): 0.56 | learning rate: 8.283E-05 | global batch size: 256 | lm loss: 2.657749E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.039 | TFLOPs: 43.96 | +7: iteration 69320/ 115203 | consumed samples: 17745920 | consumed tokens: 36343644160 | elapsed time per iteration (s): 0.55 | learning rate: 8.281E-05 | global batch size: 256 | lm loss: 2.638507E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.323 | TFLOPs: 43.98 | +7: iteration 69330/ 115203 | consumed samples: 17748480 | consumed tokens: 36348887040 | elapsed time per iteration (s): 0.56 | learning rate: 8.279E-05 | global batch size: 256 | lm loss: 2.652222E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.217 | TFLOPs: 43.97 | +7: iteration 69340/ 115203 | consumed samples: 17751040 | consumed tokens: 36354129920 | elapsed time per iteration (s): 0.56 | learning rate: 8.276E-05 | global batch size: 256 | lm loss: 2.665123E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.822 | TFLOPs: 43.84 | +7: iteration 69350/ 115203 | consumed samples: 17753600 | consumed tokens: 36359372800 | elapsed time per iteration (s): 0.57 | learning rate: 8.274E-05 | global batch size: 256 | lm loss: 2.667550E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.336 | TFLOPs: 43.03 | +7: iteration 69360/ 115203 | consumed samples: 17756160 | consumed tokens: 36364615680 | elapsed time per iteration (s): 0.56 | learning rate: 8.272E-05 | global batch size: 256 | lm loss: 2.668928E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.921 | TFLOPs: 43.56 | +7: iteration 69370/ 115203 | consumed samples: 17758720 | consumed tokens: 36369858560 | elapsed time per iteration (s): 0.57 | learning rate: 8.269E-05 | global batch size: 256 | lm loss: 2.663741E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.517 | TFLOPs: 42.95 | +7: iteration 69380/ 115203 | consumed samples: 17761280 | consumed tokens: 36375101440 | elapsed time per iteration (s): 0.56 | learning rate: 8.267E-05 | global batch size: 256 | lm loss: 2.627102E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.108 | TFLOPs: 43.77 | +7: iteration 69390/ 115203 | consumed samples: 17763840 | consumed tokens: 36380344320 | elapsed time per iteration (s): 0.56 | learning rate: 8.264E-05 | global batch size: 256 | lm loss: 2.658242E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.734 | TFLOPs: 43.45 | +7: iteration 69400/ 115203 | consumed samples: 17766400 | consumed tokens: 36385587200 | elapsed time per iteration (s): 0.56 | learning rate: 8.262E-05 | global batch size: 256 | lm loss: 2.662857E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.023 | TFLOPs: 43.48 | +7: iteration 69410/ 115203 | consumed samples: 17768960 | consumed tokens: 36390830080 | elapsed time per iteration (s): 0.56 | learning rate: 8.260E-05 | global batch size: 256 | lm loss: 2.649772E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.931 | TFLOPs: 43.85 | +7: iteration 69420/ 115203 | consumed samples: 17771520 | consumed tokens: 36396072960 | elapsed time per iteration (s): 0.56 | learning rate: 8.257E-05 | global batch size: 256 | lm loss: 2.658785E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.399 | TFLOPs: 43.51 | +7: iteration 69430/ 115203 | consumed samples: 17774080 | consumed tokens: 36401315840 | elapsed time per iteration (s): 0.55 | learning rate: 8.255E-05 | global batch size: 256 | lm loss: 2.652492E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.492 | TFLOPs: 44.00 | +7: iteration 69440/ 115203 | consumed samples: 17776640 | consumed tokens: 36406558720 | elapsed time per iteration (s): 0.56 | learning rate: 8.253E-05 | global batch size: 256 | lm loss: 2.655610E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.951 | TFLOPs: 43.95 | +7: iteration 69450/ 115203 | consumed samples: 17779200 | consumed tokens: 36411801600 | elapsed time per iteration (s): 0.55 | learning rate: 8.250E-05 | global batch size: 256 | lm loss: 2.649721E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 69460/ 115203 | consumed samples: 17781760 | consumed tokens: 36417044480 | elapsed time per iteration (s): 0.56 | learning rate: 8.248E-05 | global batch size: 256 | lm loss: 2.662237E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.712 | TFLOPs: 43.92 | +7: iteration 69470/ 115203 | consumed samples: 17784320 | consumed tokens: 36422287360 | elapsed time per iteration (s): 0.56 | learning rate: 8.246E-05 | global batch size: 256 | lm loss: 2.664388E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.631 | TFLOPs: 43.73 | +7: iteration 69480/ 115203 | consumed samples: 17786880 | consumed tokens: 36427530240 | elapsed time per iteration (s): 0.56 | learning rate: 8.243E-05 | global batch size: 256 | lm loss: 2.650414E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.687 | TFLOPs: 43.35 | +7: iteration 69490/ 115203 | consumed samples: 17789440 | consumed tokens: 36432773120 | elapsed time per iteration (s): 0.57 | learning rate: 8.241E-05 | global batch size: 256 | lm loss: 2.654920E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.369 | TFLOPs: 42.46 | +7: iteration 69500/ 115203 | consumed samples: 17792000 | consumed tokens: 36438016000 | elapsed time per iteration (s): 0.56 | learning rate: 8.238E-05 | global batch size: 256 | lm loss: 2.660497E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.887 | TFLOPs: 43.56 | +7: iteration 69510/ 115203 | consumed samples: 17794560 | consumed tokens: 36443258880 | elapsed time per iteration (s): 0.56 | learning rate: 8.236E-05 | global batch size: 256 | lm loss: 2.647002E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.609 | TFLOPs: 43.72 | +7: iteration 69520/ 115203 | consumed samples: 17797120 | consumed tokens: 36448501760 | elapsed time per iteration (s): 0.56 | learning rate: 8.234E-05 | global batch size: 256 | lm loss: 2.651806E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.530 | TFLOPs: 43.72 | +7: iteration 69530/ 115203 | consumed samples: 17799680 | consumed tokens: 36453744640 | elapsed time per iteration (s): 0.55 | learning rate: 8.231E-05 | global batch size: 256 | lm loss: 2.668021E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.312 | TFLOPs: 43.98 | +7: iteration 69540/ 115203 | consumed samples: 17802240 | consumed tokens: 36458987520 | elapsed time per iteration (s): 0.56 | learning rate: 8.229E-05 | global batch size: 256 | lm loss: 2.648270E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.870 | TFLOPs: 43.46 | +7: iteration 69550/ 115203 | consumed samples: 17804800 | consumed tokens: 36464230400 | elapsed time per iteration (s): 0.55 | learning rate: 8.227E-05 | global batch size: 256 | lm loss: 2.654650E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.404 | TFLOPs: 43.99 | +7: iteration 69560/ 115203 | consumed samples: 17807360 | consumed tokens: 36469473280 | elapsed time per iteration (s): 0.55 | learning rate: 8.224E-05 | global batch size: 256 | lm loss: 2.668352E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.303 | TFLOPs: 43.98 | +7: iteration 69570/ 115203 | consumed samples: 17809920 | consumed tokens: 36474716160 | elapsed time per iteration (s): 0.57 | learning rate: 8.222E-05 | global batch size: 256 | lm loss: 2.654422E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.753 | TFLOPs: 43.07 | +7: iteration 69580/ 115203 | consumed samples: 17812480 | consumed tokens: 36479959040 | elapsed time per iteration (s): 0.57 | learning rate: 8.220E-05 | global batch size: 256 | lm loss: 2.648036E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.829 | TFLOPs: 43.08 | +7: iteration 69590/ 115203 | consumed samples: 17815040 | consumed tokens: 36485201920 | elapsed time per iteration (s): 0.56 | learning rate: 8.217E-05 | global batch size: 256 | lm loss: 2.658129E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.135 | TFLOPs: 43.87 | +7: iteration 69600/ 115203 | consumed samples: 17817600 | consumed tokens: 36490444800 | elapsed time per iteration (s): 0.56 | learning rate: 8.215E-05 | global batch size: 256 | lm loss: 2.659400E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.266 | TFLOPs: 43.79 | +7: iteration 69610/ 115203 | consumed samples: 17820160 | consumed tokens: 36495687680 | elapsed time per iteration (s): 0.57 | learning rate: 8.213E-05 | global batch size: 256 | lm loss: 2.653715E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.660 | TFLOPs: 43.06 | +7: iteration 69620/ 115203 | consumed samples: 17822720 | consumed tokens: 36500930560 | elapsed time per iteration (s): 0.56 | learning rate: 8.210E-05 | global batch size: 256 | lm loss: 2.668512E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.754 | TFLOPs: 43.45 | +7: iteration 69630/ 115203 | consumed samples: 17825280 | consumed tokens: 36506173440 | elapsed time per iteration (s): 0.55 | learning rate: 8.208E-05 | global batch size: 256 | lm loss: 2.654309E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.317 | TFLOPs: 43.98 | +7: iteration 69640/ 115203 | consumed samples: 17827840 | consumed tokens: 36511416320 | elapsed time per iteration (s): 0.55 | learning rate: 8.205E-05 | global batch size: 256 | lm loss: 2.653635E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.462 | TFLOPs: 44.00 | +7: iteration 69650/ 115203 | consumed samples: 17830400 | consumed tokens: 36516659200 | elapsed time per iteration (s): 0.57 | learning rate: 8.203E-05 | global batch size: 256 | lm loss: 2.638198E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.809 | TFLOPs: 42.50 | +7: iteration 69660/ 115203 | consumed samples: 17832960 | consumed tokens: 36521902080 | elapsed time per iteration (s): 0.56 | learning rate: 8.201E-05 | global batch size: 256 | lm loss: 2.653418E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.006 | TFLOPs: 43.48 | +7: iteration 69670/ 115203 | consumed samples: 17835520 | consumed tokens: 36527144960 | elapsed time per iteration (s): 0.56 | learning rate: 8.198E-05 | global batch size: 256 | lm loss: 2.648980E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.579 | TFLOPs: 43.72 | +7: iteration 69680/ 115203 | consumed samples: 17838080 | consumed tokens: 36532387840 | elapsed time per iteration (s): 0.56 | learning rate: 8.196E-05 | global batch size: 256 | lm loss: 2.667967E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.769 | TFLOPs: 43.55 | +7: iteration 69690/ 115203 | consumed samples: 17840640 | consumed tokens: 36537630720 | elapsed time per iteration (s): 0.55 | learning rate: 8.194E-05 | global batch size: 256 | lm loss: 2.656721E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 69700/ 115203 | consumed samples: 17843200 | consumed tokens: 36542873600 | elapsed time per iteration (s): 0.57 | learning rate: 8.191E-05 | global batch size: 256 | lm loss: 2.647218E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.099 | TFLOPs: 43.01 | +7: iteration 69710/ 115203 | consumed samples: 17845760 | consumed tokens: 36548116480 | elapsed time per iteration (s): 0.55 | learning rate: 8.189E-05 | global batch size: 256 | lm loss: 2.644847E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 69720/ 115203 | consumed samples: 17848320 | consumed tokens: 36553359360 | elapsed time per iteration (s): 0.56 | learning rate: 8.187E-05 | global batch size: 256 | lm loss: 2.645159E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.397 | TFLOPs: 43.61 | +7: iteration 69730/ 115203 | consumed samples: 17850880 | consumed tokens: 36558602240 | elapsed time per iteration (s): 0.55 | learning rate: 8.184E-05 | global batch size: 256 | lm loss: 2.648288E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.587 | TFLOPs: 44.01 | +7: iteration 69740/ 115203 | consumed samples: 17853440 | consumed tokens: 36563845120 | elapsed time per iteration (s): 0.55 | learning rate: 8.182E-05 | global batch size: 256 | lm loss: 2.654977E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 69750/ 115203 | consumed samples: 17856000 | consumed tokens: 36569088000 | elapsed time per iteration (s): 0.56 | learning rate: 8.180E-05 | global batch size: 256 | lm loss: 2.656616E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.999 | TFLOPs: 43.57 | +7: iteration 69760/ 115203 | consumed samples: 17858560 | consumed tokens: 36574330880 | elapsed time per iteration (s): 0.57 | learning rate: 8.177E-05 | global batch size: 256 | lm loss: 2.646250E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.012 | TFLOPs: 42.90 | +7: iteration 69770/ 115203 | consumed samples: 17861120 | consumed tokens: 36579573760 | elapsed time per iteration (s): 0.57 | learning rate: 8.175E-05 | global batch size: 256 | lm loss: 2.652204E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.921 | TFLOPs: 42.70 | +7: iteration 69780/ 115203 | consumed samples: 17863680 | consumed tokens: 36584816640 | elapsed time per iteration (s): 0.55 | learning rate: 8.172E-05 | global batch size: 256 | lm loss: 2.653463E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 69790/ 115203 | consumed samples: 17866240 | consumed tokens: 36590059520 | elapsed time per iteration (s): 0.60 | learning rate: 8.170E-05 | global batch size: 256 | lm loss: 2.664333E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.475 | TFLOPs: 40.85 | +7: iteration 69800/ 115203 | consumed samples: 17868800 | consumed tokens: 36595302400 | elapsed time per iteration (s): 0.55 | learning rate: 8.168E-05 | global batch size: 256 | lm loss: 2.644317E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.503 | TFLOPs: 44.00 | +7: iteration 69810/ 115203 | consumed samples: 17871360 | consumed tokens: 36600545280 | elapsed time per iteration (s): 0.56 | learning rate: 8.165E-05 | global batch size: 256 | lm loss: 2.664678E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.162 | TFLOPs: 43.49 | +7: iteration 69820/ 115203 | consumed samples: 17873920 | consumed tokens: 36605788160 | elapsed time per iteration (s): 0.56 | learning rate: 8.163E-05 | global batch size: 256 | lm loss: 2.653721E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.106 | TFLOPs: 43.48 | +7: iteration 69830/ 115203 | consumed samples: 17876480 | consumed tokens: 36611031040 | elapsed time per iteration (s): 0.57 | learning rate: 8.161E-05 | global batch size: 256 | lm loss: 2.659057E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.603 | TFLOPs: 43.06 | +7: iteration 69840/ 115203 | consumed samples: 17879040 | consumed tokens: 36616273920 | elapsed time per iteration (s): 0.56 | learning rate: 8.158E-05 | global batch size: 256 | lm loss: 2.651458E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.784 | TFLOPs: 43.74 | +7: iteration 69850/ 115203 | consumed samples: 17881600 | consumed tokens: 36621516800 | elapsed time per iteration (s): 0.56 | learning rate: 8.156E-05 | global batch size: 256 | lm loss: 2.648826E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.799 | TFLOPs: 43.74 | +7: iteration 69860/ 115203 | consumed samples: 17884160 | consumed tokens: 36626759680 | elapsed time per iteration (s): 0.56 | learning rate: 8.154E-05 | global batch size: 256 | lm loss: 2.659777E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.989 | TFLOPs: 43.57 | +7: iteration 69870/ 115203 | consumed samples: 17886720 | consumed tokens: 36632002560 | elapsed time per iteration (s): 0.56 | learning rate: 8.151E-05 | global batch size: 256 | lm loss: 2.667714E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.258 | TFLOPs: 43.88 | +7: iteration 69880/ 115203 | consumed samples: 17889280 | consumed tokens: 36637245440 | elapsed time per iteration (s): 0.55 | learning rate: 8.149E-05 | global batch size: 256 | lm loss: 2.665621E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.570 | TFLOPs: 44.01 | +7: iteration 69890/ 115203 | consumed samples: 17891840 | consumed tokens: 36642488320 | elapsed time per iteration (s): 0.55 | learning rate: 8.147E-05 | global batch size: 256 | lm loss: 2.658140E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.317 | TFLOPs: 43.98 | +7: iteration 69900/ 115203 | consumed samples: 17894400 | consumed tokens: 36647731200 | elapsed time per iteration (s): 0.56 | learning rate: 8.144E-05 | global batch size: 256 | lm loss: 2.653244E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.202 | TFLOPs: 43.68 | +7: iteration 69910/ 115203 | consumed samples: 17896960 | consumed tokens: 36652974080 | elapsed time per iteration (s): 0.55 | learning rate: 8.142E-05 | global batch size: 256 | lm loss: 2.648249E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.292 | TFLOPs: 43.98 | +7: iteration 69920/ 115203 | consumed samples: 17899520 | consumed tokens: 36658216960 | elapsed time per iteration (s): 0.56 | learning rate: 8.140E-05 | global batch size: 256 | lm loss: 2.646024E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.840 | TFLOPs: 43.46 | +7: iteration 69930/ 115203 | consumed samples: 17902080 | consumed tokens: 36663459840 | elapsed time per iteration (s): 0.56 | learning rate: 8.137E-05 | global batch size: 256 | lm loss: 2.653444E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.915 | TFLOPs: 43.47 | +7: iteration 69940/ 115203 | consumed samples: 17904640 | consumed tokens: 36668702720 | elapsed time per iteration (s): 0.56 | learning rate: 8.135E-05 | global batch size: 256 | lm loss: 2.664027E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.388 | TFLOPs: 43.42 | +7: iteration 69950/ 115203 | consumed samples: 17907200 | consumed tokens: 36673945600 | elapsed time per iteration (s): 0.57 | learning rate: 8.132E-05 | global batch size: 256 | lm loss: 2.655880E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.359 | TFLOPs: 43.13 | +7: iteration 69960/ 115203 | consumed samples: 17909760 | consumed tokens: 36679188480 | elapsed time per iteration (s): 0.56 | learning rate: 8.130E-05 | global batch size: 256 | lm loss: 2.656089E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.180 | TFLOPs: 43.78 | +7: iteration 69970/ 115203 | consumed samples: 17912320 | consumed tokens: 36684431360 | elapsed time per iteration (s): 0.56 | learning rate: 8.128E-05 | global batch size: 256 | lm loss: 2.654372E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.021 | TFLOPs: 43.95 | +7: iteration 69980/ 115203 | consumed samples: 17914880 | consumed tokens: 36689674240 | elapsed time per iteration (s): 0.56 | learning rate: 8.125E-05 | global batch size: 256 | lm loss: 2.647734E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.143 | TFLOPs: 43.96 | +7: iteration 69990/ 115203 | consumed samples: 17917440 | consumed tokens: 36694917120 | elapsed time per iteration (s): 0.55 | learning rate: 8.123E-05 | global batch size: 256 | lm loss: 2.650781E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.290 | TFLOPs: 43.98 | +0: [2023-03-16 23:48:39,924] [INFO] [logging.py:68:log_dist] [Rank 0] step=70000, skipped=0, lr=[8.120745619091417e-05, 8.120745619091417e-05, 8.120745619091417e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 70000/ 115203 | consumed samples: 17920000 | consumed tokens: 36700160000 | elapsed time per iteration (s): 0.57 | learning rate: 8.121E-05 | global batch size: 256 | lm loss: 2.648820E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.574 | TFLOPs: 43.05 | +0: steps: 70000 loss: 2.6568 iter time (s): 0.559 samples/sec: 458.209 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 70000 | lm loss value: 3.434762E+00 | lm loss PPL: 3.102401E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 70000 to checkpoints_421m60b400m +0: [2023-03-16 23:48:40,166] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step70000 is begin to save! +0: [2023-03-16 23:48:40,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:48:40,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:48:40,461] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:48:40,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:48:40,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:48:40,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:48:40,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:48:40,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:48:40,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:48:40,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:48:40,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:48:40,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:48:40,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:48:40,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:48:40,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:48:40,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:48:40,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:48:40,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:48:40,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:48:40,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:48:40,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:48:40,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:48:40,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_13-model_00-model_states.pt... +0: [2023-03-16 23:48:40,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_13-model_00-model_states.pt. +0: [2023-03-16 23:48:40,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:48:40,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:48:40,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_15-model_00-model_states.pt... +0: [2023-03-16 23:48:41,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_15-model_00-model_states.pt. +0: [2023-03-16 23:48:41,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_16-model_00-model_states.pt... +0: [2023-03-16 23:48:41,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_16-model_00-model_states.pt. +0: [2023-03-16 23:48:41,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_17-model_00-model_states.pt... +0: [2023-03-16 23:48:41,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_17-model_00-model_states.pt. +0: [2023-03-16 23:48:41,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_18-model_00-model_states.pt... +0: [2023-03-16 23:48:41,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_18-model_00-model_states.pt. +0: [2023-03-16 23:48:41,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_19-model_00-model_states.pt... +0: [2023-03-16 23:48:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_19-model_00-model_states.pt. +0: [2023-03-16 23:48:41,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_20-model_00-model_states.pt... +0: [2023-03-16 23:48:41,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_20-model_00-model_states.pt. +0: [2023-03-16 23:48:41,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/layer_22-model_00-model_states.pt... +0: [2023-03-16 23:48:41,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/layer_22-model_00-model_states.pt. +0: [2023-03-16 23:48:41,212] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step70000/mp_rank_00_model_states.pt +0: [2023-03-16 23:48:41,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:48:41,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:48:41,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:48:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,334] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,334] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,335] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,335] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:48:41,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:48:41,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-16 23:48:41,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-16 23:48:41,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-16 23:48:41,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-16 23:48:41,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-16 23:48:41,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:48:41,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-16 23:48:41,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:48:41,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-16 23:48:41,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: successfully saved checkpoint at iteration 70000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1394.98 +7: iteration 70010/ 115203 | consumed samples: 17922560 | consumed tokens: 36705402880 | elapsed time per iteration (s): 0.73 | learning rate: 8.118E-05 | global batch size: 256 | lm loss: 2.642573E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 352.524 | TFLOPs: 33.61 | +7: iteration 70020/ 115203 | consumed samples: 17925120 | consumed tokens: 36710645760 | elapsed time per iteration (s): 0.56 | learning rate: 8.116E-05 | global batch size: 256 | lm loss: 2.653339E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.314 | TFLOPs: 43.31 | +7: iteration 70030/ 115203 | consumed samples: 17927680 | consumed tokens: 36715888640 | elapsed time per iteration (s): 0.55 | learning rate: 8.114E-05 | global batch size: 256 | lm loss: 2.639805E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.708 | TFLOPs: 44.02 | +7: iteration 70040/ 115203 | consumed samples: 17930240 | consumed tokens: 36721131520 | elapsed time per iteration (s): 0.56 | learning rate: 8.111E-05 | global batch size: 256 | lm loss: 2.658828E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.658 | TFLOPs: 43.44 | +7: iteration 70050/ 115203 | consumed samples: 17932800 | consumed tokens: 36726374400 | elapsed time per iteration (s): 0.56 | learning rate: 8.109E-05 | global batch size: 256 | lm loss: 2.656916E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.681 | TFLOPs: 43.44 | +7: iteration 70060/ 115203 | consumed samples: 17935360 | consumed tokens: 36731617280 | elapsed time per iteration (s): 0.57 | learning rate: 8.107E-05 | global batch size: 256 | lm loss: 2.639637E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.718 | TFLOPs: 42.78 | +7: iteration 70070/ 115203 | consumed samples: 17937920 | consumed tokens: 36736860160 | elapsed time per iteration (s): 0.56 | learning rate: 8.104E-05 | global batch size: 256 | lm loss: 2.655516E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.640 | TFLOPs: 43.92 | +7: iteration 70080/ 115203 | consumed samples: 17940480 | consumed tokens: 36742103040 | elapsed time per iteration (s): 0.55 | learning rate: 8.102E-05 | global batch size: 256 | lm loss: 2.646589E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.571 | TFLOPs: 44.01 | +7: iteration 70090/ 115203 | consumed samples: 17943040 | consumed tokens: 36747345920 | elapsed time per iteration (s): 0.56 | learning rate: 8.100E-05 | global batch size: 256 | lm loss: 2.656299E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.262 | TFLOPs: 43.50 | +7: iteration 70100/ 115203 | consumed samples: 17945600 | consumed tokens: 36752588800 | elapsed time per iteration (s): 0.55 | learning rate: 8.097E-05 | global batch size: 256 | lm loss: 2.665248E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.590 | TFLOPs: 44.01 | +7: iteration 70110/ 115203 | consumed samples: 17948160 | consumed tokens: 36757831680 | elapsed time per iteration (s): 0.55 | learning rate: 8.095E-05 | global batch size: 256 | lm loss: 2.651584E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.457 | TFLOPs: 43.99 | +7: iteration 70120/ 115203 | consumed samples: 17950720 | consumed tokens: 36763074560 | elapsed time per iteration (s): 0.56 | learning rate: 8.093E-05 | global batch size: 256 | lm loss: 2.644176E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.990 | TFLOPs: 43.66 | +7: iteration 70130/ 115203 | consumed samples: 17953280 | consumed tokens: 36768317440 | elapsed time per iteration (s): 0.56 | learning rate: 8.090E-05 | global batch size: 256 | lm loss: 2.649039E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.706 | TFLOPs: 43.45 | +7: iteration 70140/ 115203 | consumed samples: 17955840 | consumed tokens: 36773560320 | elapsed time per iteration (s): 0.56 | learning rate: 8.088E-05 | global batch size: 256 | lm loss: 2.667383E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.579 | TFLOPs: 43.72 | +7: iteration 70150/ 115203 | consumed samples: 17958400 | consumed tokens: 36778803200 | elapsed time per iteration (s): 0.56 | learning rate: 8.086E-05 | global batch size: 256 | lm loss: 2.654876E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.237 | TFLOPs: 43.40 | +7: iteration 70160/ 115203 | consumed samples: 17960960 | consumed tokens: 36784046080 | elapsed time per iteration (s): 0.56 | learning rate: 8.083E-05 | global batch size: 256 | lm loss: 2.663358E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.275 | TFLOPs: 43.60 | +7: iteration 70170/ 115203 | consumed samples: 17963520 | consumed tokens: 36789288960 | elapsed time per iteration (s): 0.56 | learning rate: 8.081E-05 | global batch size: 256 | lm loss: 2.655884E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.379 | TFLOPs: 43.70 | +7: iteration 70180/ 115203 | consumed samples: 17966080 | consumed tokens: 36794531840 | elapsed time per iteration (s): 0.57 | learning rate: 8.079E-05 | global batch size: 256 | lm loss: 2.648314E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.590 | TFLOPs: 42.96 | +7: iteration 70190/ 115203 | consumed samples: 17968640 | consumed tokens: 36799774720 | elapsed time per iteration (s): 0.56 | learning rate: 8.076E-05 | global batch size: 256 | lm loss: 2.649315E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.242 | TFLOPs: 43.50 | +7: iteration 70200/ 115203 | consumed samples: 17971200 | consumed tokens: 36805017600 | elapsed time per iteration (s): 0.55 | learning rate: 8.074E-05 | global batch size: 256 | lm loss: 2.644875E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.267 | TFLOPs: 43.98 | +7: iteration 70210/ 115203 | consumed samples: 17973760 | consumed tokens: 36810260480 | elapsed time per iteration (s): 0.58 | learning rate: 8.071E-05 | global batch size: 256 | lm loss: 2.658883E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.660 | TFLOPs: 42.39 | +7: iteration 70220/ 115203 | consumed samples: 17976320 | consumed tokens: 36815503360 | elapsed time per iteration (s): 0.56 | learning rate: 8.069E-05 | global batch size: 256 | lm loss: 2.639954E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.497 | TFLOPs: 43.24 | +7: iteration 70230/ 115203 | consumed samples: 17978880 | consumed tokens: 36820746240 | elapsed time per iteration (s): 0.56 | learning rate: 8.067E-05 | global batch size: 256 | lm loss: 2.661596E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.200 | TFLOPs: 43.49 | +7: iteration 70240/ 115203 | consumed samples: 17981440 | consumed tokens: 36825989120 | elapsed time per iteration (s): 0.56 | learning rate: 8.064E-05 | global batch size: 256 | lm loss: 2.653240E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.911 | TFLOPs: 43.47 | +7: iteration 70250/ 115203 | consumed samples: 17984000 | consumed tokens: 36831232000 | elapsed time per iteration (s): 0.58 | learning rate: 8.062E-05 | global batch size: 256 | lm loss: 2.654356E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.528 | TFLOPs: 42.38 | +7: iteration 70260/ 115203 | consumed samples: 17986560 | consumed tokens: 36836474880 | elapsed time per iteration (s): 0.56 | learning rate: 8.060E-05 | global batch size: 256 | lm loss: 2.654032E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.234 | TFLOPs: 43.50 | +7: iteration 70270/ 115203 | consumed samples: 17989120 | consumed tokens: 36841717760 | elapsed time per iteration (s): 0.56 | learning rate: 8.057E-05 | global batch size: 256 | lm loss: 2.649448E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.413 | TFLOPs: 43.51 | +7: iteration 70280/ 115203 | consumed samples: 17991680 | consumed tokens: 36846960640 | elapsed time per iteration (s): 0.56 | learning rate: 8.055E-05 | global batch size: 256 | lm loss: 2.658448E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.168 | TFLOPs: 43.49 | +7: iteration 70290/ 115203 | consumed samples: 17994240 | consumed tokens: 36852203520 | elapsed time per iteration (s): 0.57 | learning rate: 8.053E-05 | global batch size: 256 | lm loss: 2.659401E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.306 | TFLOPs: 42.93 | +7: iteration 70300/ 115203 | consumed samples: 17996800 | consumed tokens: 36857446400 | elapsed time per iteration (s): 0.56 | learning rate: 8.050E-05 | global batch size: 256 | lm loss: 2.643926E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.676 | TFLOPs: 43.73 | +7: iteration 70310/ 115203 | consumed samples: 17999360 | consumed tokens: 36862689280 | elapsed time per iteration (s): 0.57 | learning rate: 8.048E-05 | global batch size: 256 | lm loss: 2.659295E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.389 | TFLOPs: 43.04 | +7: iteration 70320/ 115203 | consumed samples: 18001920 | consumed tokens: 36867932160 | elapsed time per iteration (s): 0.57 | learning rate: 8.046E-05 | global batch size: 256 | lm loss: 2.661330E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.250 | TFLOPs: 43.02 | +7: iteration 70330/ 115203 | consumed samples: 18004480 | consumed tokens: 36873175040 | elapsed time per iteration (s): 0.58 | learning rate: 8.043E-05 | global batch size: 256 | lm loss: 2.651231E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.541 | TFLOPs: 42.00 | +7: iteration 70340/ 115203 | consumed samples: 18007040 | consumed tokens: 36878417920 | elapsed time per iteration (s): 0.57 | learning rate: 8.041E-05 | global batch size: 256 | lm loss: 2.652977E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.895 | TFLOPs: 42.80 | +7: iteration 70350/ 115203 | consumed samples: 18009600 | consumed tokens: 36883660800 | elapsed time per iteration (s): 0.58 | learning rate: 8.039E-05 | global batch size: 256 | lm loss: 2.672158E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.036 | TFLOPs: 42.33 | +7: iteration 70360/ 115203 | consumed samples: 18012160 | consumed tokens: 36888903680 | elapsed time per iteration (s): 0.56 | learning rate: 8.036E-05 | global batch size: 256 | lm loss: 2.649504E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.168 | TFLOPs: 43.40 | +7: iteration 70370/ 115203 | consumed samples: 18014720 | consumed tokens: 36894146560 | elapsed time per iteration (s): 0.58 | learning rate: 8.034E-05 | global batch size: 256 | lm loss: 2.646623E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.571 | TFLOPs: 41.91 | +7: iteration 70380/ 115203 | consumed samples: 18017280 | consumed tokens: 36899389440 | elapsed time per iteration (s): 0.59 | learning rate: 8.032E-05 | global batch size: 256 | lm loss: 2.665721E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.090 | TFLOPs: 41.58 | +7: iteration 70390/ 115203 | consumed samples: 18019840 | consumed tokens: 36904632320 | elapsed time per iteration (s): 0.58 | learning rate: 8.029E-05 | global batch size: 256 | lm loss: 2.643709E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.643 | TFLOPs: 42.39 | +7: iteration 70400/ 115203 | consumed samples: 18022400 | consumed tokens: 36909875200 | elapsed time per iteration (s): 0.58 | learning rate: 8.027E-05 | global batch size: 256 | lm loss: 2.663232E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.246 | TFLOPs: 42.35 | +7: iteration 70410/ 115203 | consumed samples: 18024960 | consumed tokens: 36915118080 | elapsed time per iteration (s): 0.57 | learning rate: 8.025E-05 | global batch size: 256 | lm loss: 2.659294E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.483 | TFLOPs: 42.66 | +7: iteration 70420/ 115203 | consumed samples: 18027520 | consumed tokens: 36920360960 | elapsed time per iteration (s): 0.56 | learning rate: 8.022E-05 | global batch size: 256 | lm loss: 2.650596E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.667 | TFLOPs: 43.54 | +7: iteration 70430/ 115203 | consumed samples: 18030080 | consumed tokens: 36925603840 | elapsed time per iteration (s): 0.58 | learning rate: 8.020E-05 | global batch size: 256 | lm loss: 2.636854E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.169 | TFLOPs: 42.35 | +7: iteration 70440/ 115203 | consumed samples: 18032640 | consumed tokens: 36930846720 | elapsed time per iteration (s): 0.57 | learning rate: 8.018E-05 | global batch size: 256 | lm loss: 2.652668E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.748 | TFLOPs: 42.97 | +7: iteration 70450/ 115203 | consumed samples: 18035200 | consumed tokens: 36936089600 | elapsed time per iteration (s): 0.57 | learning rate: 8.015E-05 | global batch size: 256 | lm loss: 2.660290E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.202 | TFLOPs: 42.92 | +7: iteration 70460/ 115203 | consumed samples: 18037760 | consumed tokens: 36941332480 | elapsed time per iteration (s): 0.58 | learning rate: 8.013E-05 | global batch size: 256 | lm loss: 2.632693E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.190 | TFLOPs: 42.25 | +7: iteration 70470/ 115203 | consumed samples: 18040320 | consumed tokens: 36946575360 | elapsed time per iteration (s): 0.56 | learning rate: 8.011E-05 | global batch size: 256 | lm loss: 2.647250E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.705 | TFLOPs: 43.92 | +7: iteration 70480/ 115203 | consumed samples: 18042880 | consumed tokens: 36951818240 | elapsed time per iteration (s): 0.57 | learning rate: 8.008E-05 | global batch size: 256 | lm loss: 2.641901E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.310 | TFLOPs: 42.84 | +7: iteration 70490/ 115203 | consumed samples: 18045440 | consumed tokens: 36957061120 | elapsed time per iteration (s): 0.55 | learning rate: 8.006E-05 | global batch size: 256 | lm loss: 2.653705E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.467 | TFLOPs: 44.00 | +7: iteration 70500/ 115203 | consumed samples: 18048000 | consumed tokens: 36962304000 | elapsed time per iteration (s): 0.58 | learning rate: 8.004E-05 | global batch size: 256 | lm loss: 2.657169E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.995 | TFLOPs: 42.43 | +7: iteration 70510/ 115203 | consumed samples: 18050560 | consumed tokens: 36967546880 | elapsed time per iteration (s): 0.56 | learning rate: 8.001E-05 | global batch size: 256 | lm loss: 2.656448E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.976 | TFLOPs: 43.57 | +7: iteration 70520/ 115203 | consumed samples: 18053120 | consumed tokens: 36972789760 | elapsed time per iteration (s): 0.57 | learning rate: 7.999E-05 | global batch size: 256 | lm loss: 2.665233E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.749 | TFLOPs: 43.16 | +7: iteration 70530/ 115203 | consumed samples: 18055680 | consumed tokens: 36978032640 | elapsed time per iteration (s): 0.56 | learning rate: 7.997E-05 | global batch size: 256 | lm loss: 2.656083E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.213 | TFLOPs: 43.97 | +7: iteration 70540/ 115203 | consumed samples: 18058240 | consumed tokens: 36983275520 | elapsed time per iteration (s): 0.56 | learning rate: 7.994E-05 | global batch size: 256 | lm loss: 2.650775E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.710 | TFLOPs: 43.54 | +7: iteration 70550/ 115203 | consumed samples: 18060800 | consumed tokens: 36988518400 | elapsed time per iteration (s): 0.56 | learning rate: 7.992E-05 | global batch size: 256 | lm loss: 2.646396E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.164 | TFLOPs: 43.59 | +7: iteration 70560/ 115203 | consumed samples: 18063360 | consumed tokens: 36993761280 | elapsed time per iteration (s): 0.57 | learning rate: 7.990E-05 | global batch size: 256 | lm loss: 2.651009E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.568 | TFLOPs: 42.58 | +7: iteration 70570/ 115203 | consumed samples: 18065920 | consumed tokens: 36999004160 | elapsed time per iteration (s): 0.55 | learning rate: 7.987E-05 | global batch size: 256 | lm loss: 2.651678E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.304 | TFLOPs: 43.98 | +7: iteration 70580/ 115203 | consumed samples: 18068480 | consumed tokens: 37004247040 | elapsed time per iteration (s): 0.56 | learning rate: 7.985E-05 | global batch size: 256 | lm loss: 2.649558E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.021 | TFLOPs: 43.48 | +7: iteration 70590/ 115203 | consumed samples: 18071040 | consumed tokens: 37009489920 | elapsed time per iteration (s): 0.57 | learning rate: 7.983E-05 | global batch size: 256 | lm loss: 2.646387E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.622 | TFLOPs: 42.96 | +7: iteration 70600/ 115203 | consumed samples: 18073600 | consumed tokens: 37014732800 | elapsed time per iteration (s): 0.56 | learning rate: 7.980E-05 | global batch size: 256 | lm loss: 2.646349E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.423 | TFLOPs: 43.51 | +7: iteration 70610/ 115203 | consumed samples: 18076160 | consumed tokens: 37019975680 | elapsed time per iteration (s): 0.57 | learning rate: 7.978E-05 | global batch size: 256 | lm loss: 2.641147E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.048 | TFLOPs: 42.53 | +7: iteration 70620/ 115203 | consumed samples: 18078720 | consumed tokens: 37025218560 | elapsed time per iteration (s): 0.57 | learning rate: 7.976E-05 | global batch size: 256 | lm loss: 2.657050E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.248 | TFLOPs: 42.93 | +7: iteration 70630/ 115203 | consumed samples: 18081280 | consumed tokens: 37030461440 | elapsed time per iteration (s): 0.57 | learning rate: 7.973E-05 | global batch size: 256 | lm loss: 2.661284E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.238 | TFLOPs: 42.73 | +7: iteration 70640/ 115203 | consumed samples: 18083840 | consumed tokens: 37035704320 | elapsed time per iteration (s): 0.56 | learning rate: 7.971E-05 | global batch size: 256 | lm loss: 2.648199E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.435 | TFLOPs: 43.23 | +7: iteration 70650/ 115203 | consumed samples: 18086400 | consumed tokens: 37040947200 | elapsed time per iteration (s): 0.57 | learning rate: 7.969E-05 | global batch size: 256 | lm loss: 2.648797E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.505 | TFLOPs: 42.86 | +7: iteration 70660/ 115203 | consumed samples: 18088960 | consumed tokens: 37046190080 | elapsed time per iteration (s): 0.57 | learning rate: 7.966E-05 | global batch size: 256 | lm loss: 2.645828E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.343 | TFLOPs: 43.03 | +7: iteration 70670/ 115203 | consumed samples: 18091520 | consumed tokens: 37051432960 | elapsed time per iteration (s): 0.58 | learning rate: 7.964E-05 | global batch size: 256 | lm loss: 2.650830E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.519 | TFLOPs: 42.38 | +7: iteration 70680/ 115203 | consumed samples: 18094080 | consumed tokens: 37056675840 | elapsed time per iteration (s): 0.57 | learning rate: 7.962E-05 | global batch size: 256 | lm loss: 2.652589E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.330 | TFLOPs: 42.74 | +7: iteration 70690/ 115203 | consumed samples: 18096640 | consumed tokens: 37061918720 | elapsed time per iteration (s): 0.56 | learning rate: 7.959E-05 | global batch size: 256 | lm loss: 2.655567E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.772 | TFLOPs: 43.36 | +7: iteration 70700/ 115203 | consumed samples: 18099200 | consumed tokens: 37067161600 | elapsed time per iteration (s): 0.56 | learning rate: 7.957E-05 | global batch size: 256 | lm loss: 2.643444E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.463 | TFLOPs: 43.71 | +7: iteration 70710/ 115203 | consumed samples: 18101760 | consumed tokens: 37072404480 | elapsed time per iteration (s): 0.56 | learning rate: 7.955E-05 | global batch size: 256 | lm loss: 2.635868E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.661 | TFLOPs: 43.35 | +7: iteration 70720/ 115203 | consumed samples: 18104320 | consumed tokens: 37077647360 | elapsed time per iteration (s): 0.58 | learning rate: 7.952E-05 | global batch size: 256 | lm loss: 2.654599E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.390 | TFLOPs: 42.27 | +7: iteration 70730/ 115203 | consumed samples: 18106880 | consumed tokens: 37082890240 | elapsed time per iteration (s): 0.56 | learning rate: 7.950E-05 | global batch size: 256 | lm loss: 2.639229E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.980 | TFLOPs: 43.38 | +7: iteration 70740/ 115203 | consumed samples: 18109440 | consumed tokens: 37088133120 | elapsed time per iteration (s): 0.56 | learning rate: 7.948E-05 | global batch size: 256 | lm loss: 2.651924E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.888 | TFLOPs: 43.37 | +7: iteration 70750/ 115203 | consumed samples: 18112000 | consumed tokens: 37093376000 | elapsed time per iteration (s): 0.57 | learning rate: 7.945E-05 | global batch size: 256 | lm loss: 2.660078E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.825 | TFLOPs: 42.70 | +7: iteration 70760/ 115203 | consumed samples: 18114560 | consumed tokens: 37098618880 | elapsed time per iteration (s): 0.57 | learning rate: 7.943E-05 | global batch size: 256 | lm loss: 2.647970E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.454 | TFLOPs: 42.66 | +7: iteration 70770/ 115203 | consumed samples: 18117120 | consumed tokens: 37103861760 | elapsed time per iteration (s): 0.56 | learning rate: 7.941E-05 | global batch size: 256 | lm loss: 2.651349E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.653 | TFLOPs: 43.82 | +7: iteration 70780/ 115203 | consumed samples: 18119680 | consumed tokens: 37109104640 | elapsed time per iteration (s): 0.58 | learning rate: 7.938E-05 | global batch size: 256 | lm loss: 2.646201E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.217 | TFLOPs: 42.26 | +7: iteration 70790/ 115203 | consumed samples: 18122240 | consumed tokens: 37114347520 | elapsed time per iteration (s): 0.57 | learning rate: 7.936E-05 | global batch size: 256 | lm loss: 2.638096E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.956 | TFLOPs: 43.18 | +7: iteration 70800/ 115203 | consumed samples: 18124800 | consumed tokens: 37119590400 | elapsed time per iteration (s): 0.57 | learning rate: 7.934E-05 | global batch size: 256 | lm loss: 2.641750E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.170 | TFLOPs: 43.11 | +7: iteration 70810/ 115203 | consumed samples: 18127360 | consumed tokens: 37124833280 | elapsed time per iteration (s): 0.56 | learning rate: 7.931E-05 | global batch size: 256 | lm loss: 2.642319E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.219 | TFLOPs: 43.69 | +7: iteration 70820/ 115203 | consumed samples: 18129920 | consumed tokens: 37130076160 | elapsed time per iteration (s): 0.57 | learning rate: 7.929E-05 | global batch size: 256 | lm loss: 2.645767E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.263 | TFLOPs: 42.45 | +7: iteration 70830/ 115203 | consumed samples: 18132480 | consumed tokens: 37135319040 | elapsed time per iteration (s): 0.56 | learning rate: 7.927E-05 | global batch size: 256 | lm loss: 2.650039E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.220 | TFLOPs: 43.50 | +7: iteration 70840/ 115203 | consumed samples: 18135040 | consumed tokens: 37140561920 | elapsed time per iteration (s): 0.56 | learning rate: 7.924E-05 | global batch size: 256 | lm loss: 2.644988E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.186 | TFLOPs: 43.49 | +7: iteration 70850/ 115203 | consumed samples: 18137600 | consumed tokens: 37145804800 | elapsed time per iteration (s): 0.57 | learning rate: 7.922E-05 | global batch size: 256 | lm loss: 2.649586E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.623 | TFLOPs: 42.58 | +7: iteration 70860/ 115203 | consumed samples: 18140160 | consumed tokens: 37151047680 | elapsed time per iteration (s): 0.56 | learning rate: 7.920E-05 | global batch size: 256 | lm loss: 2.647595E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.313 | TFLOPs: 43.70 | +7: iteration 70870/ 115203 | consumed samples: 18142720 | consumed tokens: 37156290560 | elapsed time per iteration (s): 0.56 | learning rate: 7.917E-05 | global batch size: 256 | lm loss: 2.653087E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.933 | TFLOPs: 43.37 | +7: iteration 70880/ 115203 | consumed samples: 18145280 | consumed tokens: 37161533440 | elapsed time per iteration (s): 0.56 | learning rate: 7.915E-05 | global batch size: 256 | lm loss: 2.647795E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.188 | TFLOPs: 43.87 | +7: iteration 70890/ 115203 | consumed samples: 18147840 | consumed tokens: 37166776320 | elapsed time per iteration (s): 0.58 | learning rate: 7.913E-05 | global batch size: 256 | lm loss: 2.634729E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.059 | TFLOPs: 42.34 | +7: iteration 70900/ 115203 | consumed samples: 18150400 | consumed tokens: 37172019200 | elapsed time per iteration (s): 0.57 | learning rate: 7.910E-05 | global batch size: 256 | lm loss: 2.655667E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.026 | TFLOPs: 43.19 | +7: iteration 70910/ 115203 | consumed samples: 18152960 | consumed tokens: 37177262080 | elapsed time per iteration (s): 0.58 | learning rate: 7.908E-05 | global batch size: 256 | lm loss: 2.651997E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.849 | TFLOPs: 42.22 | +7: iteration 70920/ 115203 | consumed samples: 18155520 | consumed tokens: 37182504960 | elapsed time per iteration (s): 0.56 | learning rate: 7.906E-05 | global batch size: 256 | lm loss: 2.644527E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.226 | TFLOPs: 43.50 | +7: iteration 70930/ 115203 | consumed samples: 18158080 | consumed tokens: 37187747840 | elapsed time per iteration (s): 0.58 | learning rate: 7.903E-05 | global batch size: 256 | lm loss: 2.643553E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.456 | TFLOPs: 42.09 | +7: iteration 70940/ 115203 | consumed samples: 18160640 | consumed tokens: 37192990720 | elapsed time per iteration (s): 0.57 | learning rate: 7.901E-05 | global batch size: 256 | lm loss: 2.635266E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.344 | TFLOPs: 43.13 | +7: iteration 70950/ 115203 | consumed samples: 18163200 | consumed tokens: 37198233600 | elapsed time per iteration (s): 0.58 | learning rate: 7.899E-05 | global batch size: 256 | lm loss: 2.649345E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.733 | TFLOPs: 42.21 | +7: iteration 70960/ 115203 | consumed samples: 18165760 | consumed tokens: 37203476480 | elapsed time per iteration (s): 0.57 | learning rate: 7.896E-05 | global batch size: 256 | lm loss: 2.644239E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.454 | TFLOPs: 42.85 | +7: iteration 70970/ 115203 | consumed samples: 18168320 | consumed tokens: 37208719360 | elapsed time per iteration (s): 0.57 | learning rate: 7.894E-05 | global batch size: 256 | lm loss: 2.652375E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.197 | TFLOPs: 43.02 | +7: iteration 70980/ 115203 | consumed samples: 18170880 | consumed tokens: 37213962240 | elapsed time per iteration (s): 0.57 | learning rate: 7.892E-05 | global batch size: 256 | lm loss: 2.649928E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.914 | TFLOPs: 43.18 | +7: iteration 70990/ 115203 | consumed samples: 18173440 | consumed tokens: 37219205120 | elapsed time per iteration (s): 0.57 | learning rate: 7.889E-05 | global batch size: 256 | lm loss: 2.649266E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.416 | TFLOPs: 42.85 | +7: iteration 71000/ 115203 | consumed samples: 18176000 | consumed tokens: 37224448000 | elapsed time per iteration (s): 0.57 | learning rate: 7.887E-05 | global batch size: 256 | lm loss: 2.650901E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.902 | TFLOPs: 42.70 | +7: iteration 71010/ 115203 | consumed samples: 18178560 | consumed tokens: 37229690880 | elapsed time per iteration (s): 0.56 | learning rate: 7.885E-05 | global batch size: 256 | lm loss: 2.644932E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.980 | TFLOPs: 43.38 | +7: iteration 71020/ 115203 | consumed samples: 18181120 | consumed tokens: 37234933760 | elapsed time per iteration (s): 0.56 | learning rate: 7.882E-05 | global batch size: 256 | lm loss: 2.654033E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.319 | TFLOPs: 43.41 | +7: iteration 71030/ 115203 | consumed samples: 18183680 | consumed tokens: 37240176640 | elapsed time per iteration (s): 0.58 | learning rate: 7.880E-05 | global batch size: 256 | lm loss: 2.646424E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.416 | TFLOPs: 42.08 | +7: iteration 71040/ 115203 | consumed samples: 18186240 | consumed tokens: 37245419520 | elapsed time per iteration (s): 0.57 | learning rate: 7.878E-05 | global batch size: 256 | lm loss: 2.643633E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.665 | TFLOPs: 42.49 | +7: iteration 71050/ 115203 | consumed samples: 18188800 | consumed tokens: 37250662400 | elapsed time per iteration (s): 0.57 | learning rate: 7.875E-05 | global batch size: 256 | lm loss: 2.648260E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.280 | TFLOPs: 42.45 | +7: iteration 71060/ 115203 | consumed samples: 18191360 | consumed tokens: 37255905280 | elapsed time per iteration (s): 0.57 | learning rate: 7.873E-05 | global batch size: 256 | lm loss: 2.640426E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.357 | TFLOPs: 42.84 | +7: iteration 71070/ 115203 | consumed samples: 18193920 | consumed tokens: 37261148160 | elapsed time per iteration (s): 0.57 | learning rate: 7.871E-05 | global batch size: 256 | lm loss: 2.637348E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.293 | TFLOPs: 42.64 | +7: iteration 71080/ 115203 | consumed samples: 18196480 | consumed tokens: 37266391040 | elapsed time per iteration (s): 0.56 | learning rate: 7.868E-05 | global batch size: 256 | lm loss: 2.659772E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.336 | TFLOPs: 43.51 | +7: iteration 71090/ 115203 | consumed samples: 18199040 | consumed tokens: 37271633920 | elapsed time per iteration (s): 0.56 | learning rate: 7.866E-05 | global batch size: 256 | lm loss: 2.668399E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.600 | TFLOPs: 43.25 | +7: iteration 71100/ 115203 | consumed samples: 18201600 | consumed tokens: 37276876800 | elapsed time per iteration (s): 0.56 | learning rate: 7.864E-05 | global batch size: 256 | lm loss: 2.643994E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.518 | TFLOPs: 43.43 | +7: iteration 71110/ 115203 | consumed samples: 18204160 | consumed tokens: 37282119680 | elapsed time per iteration (s): 0.58 | learning rate: 7.861E-05 | global batch size: 256 | lm loss: 2.649189E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.981 | TFLOPs: 42.42 | +7: iteration 71120/ 115203 | consumed samples: 18206720 | consumed tokens: 37287362560 | elapsed time per iteration (s): 0.56 | learning rate: 7.859E-05 | global batch size: 256 | lm loss: 2.643106E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.065 | TFLOPs: 43.67 | +7: iteration 71130/ 115203 | consumed samples: 18209280 | consumed tokens: 37292605440 | elapsed time per iteration (s): 0.57 | learning rate: 7.857E-05 | global batch size: 256 | lm loss: 2.649008E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.393 | TFLOPs: 42.46 | +7: iteration 71140/ 115203 | consumed samples: 18211840 | consumed tokens: 37297848320 | elapsed time per iteration (s): 0.57 | learning rate: 7.854E-05 | global batch size: 256 | lm loss: 2.659294E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.049 | TFLOPs: 42.91 | +7: iteration 71150/ 115203 | consumed samples: 18214400 | consumed tokens: 37303091200 | elapsed time per iteration (s): 0.57 | learning rate: 7.852E-05 | global batch size: 256 | lm loss: 2.647614E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.741 | TFLOPs: 42.59 | +7: iteration 71160/ 115203 | consumed samples: 18216960 | consumed tokens: 37308334080 | elapsed time per iteration (s): 0.58 | learning rate: 7.850E-05 | global batch size: 256 | lm loss: 2.654730E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.958 | TFLOPs: 41.95 | +7: iteration 71170/ 115203 | consumed samples: 18219520 | consumed tokens: 37313576960 | elapsed time per iteration (s): 0.57 | learning rate: 7.847E-05 | global batch size: 256 | lm loss: 2.659403E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.956 | TFLOPs: 42.52 | +7: iteration 71180/ 115203 | consumed samples: 18222080 | consumed tokens: 37318819840 | elapsed time per iteration (s): 0.57 | learning rate: 7.845E-05 | global batch size: 256 | lm loss: 2.639344E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.745 | TFLOPs: 42.78 | +7: iteration 71190/ 115203 | consumed samples: 18224640 | consumed tokens: 37324062720 | elapsed time per iteration (s): 0.57 | learning rate: 7.843E-05 | global batch size: 256 | lm loss: 2.636637E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.873 | TFLOPs: 42.80 | +7: iteration 71200/ 115203 | consumed samples: 18227200 | consumed tokens: 37329305600 | elapsed time per iteration (s): 0.58 | learning rate: 7.841E-05 | global batch size: 256 | lm loss: 2.648388E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.328 | TFLOPs: 42.27 | +7: iteration 71210/ 115203 | consumed samples: 18229760 | consumed tokens: 37334548480 | elapsed time per iteration (s): 0.59 | learning rate: 7.838E-05 | global batch size: 256 | lm loss: 2.649898E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.796 | TFLOPs: 41.17 | +7: iteration 71220/ 115203 | consumed samples: 18232320 | consumed tokens: 37339791360 | elapsed time per iteration (s): 0.56 | learning rate: 7.836E-05 | global batch size: 256 | lm loss: 2.658091E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.154 | TFLOPs: 43.68 | +7: iteration 71230/ 115203 | consumed samples: 18234880 | consumed tokens: 37345034240 | elapsed time per iteration (s): 0.57 | learning rate: 7.834E-05 | global batch size: 256 | lm loss: 2.660861E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.787 | TFLOPs: 42.79 | +7: iteration 71240/ 115203 | consumed samples: 18237440 | consumed tokens: 37350277120 | elapsed time per iteration (s): 0.57 | learning rate: 7.831E-05 | global batch size: 256 | lm loss: 2.644862E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.506 | TFLOPs: 42.57 | +7: iteration 71250/ 115203 | consumed samples: 18240000 | consumed tokens: 37355520000 | elapsed time per iteration (s): 0.59 | learning rate: 7.829E-05 | global batch size: 256 | lm loss: 2.644488E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.058 | TFLOPs: 41.57 | +7: iteration 71260/ 115203 | consumed samples: 18242560 | consumed tokens: 37360762880 | elapsed time per iteration (s): 0.58 | learning rate: 7.827E-05 | global batch size: 256 | lm loss: 2.650434E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.042 | TFLOPs: 42.33 | +7: iteration 71270/ 115203 | consumed samples: 18245120 | consumed tokens: 37366005760 | elapsed time per iteration (s): 0.58 | learning rate: 7.824E-05 | global batch size: 256 | lm loss: 2.647503E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.058 | TFLOPs: 41.86 | +7: iteration 71280/ 115203 | consumed samples: 18247680 | consumed tokens: 37371248640 | elapsed time per iteration (s): 0.56 | learning rate: 7.822E-05 | global batch size: 256 | lm loss: 2.653067E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.940 | TFLOPs: 43.28 | +7: iteration 71290/ 115203 | consumed samples: 18250240 | consumed tokens: 37376491520 | elapsed time per iteration (s): 0.56 | learning rate: 7.820E-05 | global batch size: 256 | lm loss: 2.644928E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.762 | TFLOPs: 43.55 | +7: iteration 71300/ 115203 | consumed samples: 18252800 | consumed tokens: 37381734400 | elapsed time per iteration (s): 0.59 | learning rate: 7.817E-05 | global batch size: 256 | lm loss: 2.655704E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.617 | TFLOPs: 41.53 | +7: iteration 71310/ 115203 | consumed samples: 18255360 | consumed tokens: 37386977280 | elapsed time per iteration (s): 0.57 | learning rate: 7.815E-05 | global batch size: 256 | lm loss: 2.634605E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.537 | TFLOPs: 42.67 | +7: iteration 71320/ 115203 | consumed samples: 18257920 | consumed tokens: 37392220160 | elapsed time per iteration (s): 0.56 | learning rate: 7.813E-05 | global batch size: 256 | lm loss: 2.635706E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.405 | TFLOPs: 43.51 | +7: iteration 71330/ 115203 | consumed samples: 18260480 | consumed tokens: 37397463040 | elapsed time per iteration (s): 0.57 | learning rate: 7.810E-05 | global batch size: 256 | lm loss: 2.657770E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.645 | TFLOPs: 42.96 | +7: iteration 71340/ 115203 | consumed samples: 18263040 | consumed tokens: 37402705920 | elapsed time per iteration (s): 0.57 | learning rate: 7.808E-05 | global batch size: 256 | lm loss: 2.638815E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.706 | TFLOPs: 42.97 | +7: iteration 71350/ 115203 | consumed samples: 18265600 | consumed tokens: 37407948800 | elapsed time per iteration (s): 0.58 | learning rate: 7.806E-05 | global batch size: 256 | lm loss: 2.639951E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.513 | TFLOPs: 42.38 | +7: iteration 71360/ 115203 | consumed samples: 18268160 | consumed tokens: 37413191680 | elapsed time per iteration (s): 0.58 | learning rate: 7.803E-05 | global batch size: 256 | lm loss: 2.648911E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.901 | TFLOPs: 42.23 | +7: iteration 71370/ 115203 | consumed samples: 18270720 | consumed tokens: 37418434560 | elapsed time per iteration (s): 0.58 | learning rate: 7.801E-05 | global batch size: 256 | lm loss: 2.652276E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.354 | TFLOPs: 42.08 | +7: iteration 71380/ 115203 | consumed samples: 18273280 | consumed tokens: 37423677440 | elapsed time per iteration (s): 0.58 | learning rate: 7.799E-05 | global batch size: 256 | lm loss: 2.646532E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.500 | TFLOPs: 42.38 | +7: iteration 71390/ 115203 | consumed samples: 18275840 | consumed tokens: 37428920320 | elapsed time per iteration (s): 0.57 | learning rate: 7.796E-05 | global batch size: 256 | lm loss: 2.646423E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.381 | TFLOPs: 42.46 | +7: iteration 71400/ 115203 | consumed samples: 18278400 | consumed tokens: 37434163200 | elapsed time per iteration (s): 0.58 | learning rate: 7.794E-05 | global batch size: 256 | lm loss: 2.632777E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.578 | TFLOPs: 42.00 | +7: iteration 71410/ 115203 | consumed samples: 18280960 | consumed tokens: 37439406080 | elapsed time per iteration (s): 0.57 | learning rate: 7.792E-05 | global batch size: 256 | lm loss: 2.645260E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.856 | TFLOPs: 42.98 | +7: iteration 71420/ 115203 | consumed samples: 18283520 | consumed tokens: 37444648960 | elapsed time per iteration (s): 0.58 | learning rate: 7.790E-05 | global batch size: 256 | lm loss: 2.652902E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.361 | TFLOPs: 42.08 | +7: iteration 71430/ 115203 | consumed samples: 18286080 | consumed tokens: 37449891840 | elapsed time per iteration (s): 0.57 | learning rate: 7.787E-05 | global batch size: 256 | lm loss: 2.640448E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.678 | TFLOPs: 43.06 | +7: iteration 71440/ 115203 | consumed samples: 18288640 | consumed tokens: 37455134720 | elapsed time per iteration (s): 0.57 | learning rate: 7.785E-05 | global batch size: 256 | lm loss: 2.641184E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.543 | TFLOPs: 42.86 | +7: iteration 71450/ 115203 | consumed samples: 18291200 | consumed tokens: 37460377600 | elapsed time per iteration (s): 0.56 | learning rate: 7.783E-05 | global batch size: 256 | lm loss: 2.635578E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.861 | TFLOPs: 43.37 | +7: iteration 71460/ 115203 | consumed samples: 18293760 | consumed tokens: 37465620480 | elapsed time per iteration (s): 0.58 | learning rate: 7.780E-05 | global batch size: 256 | lm loss: 2.658049E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.080 | TFLOPs: 42.34 | +7: iteration 71470/ 115203 | consumed samples: 18296320 | consumed tokens: 37470863360 | elapsed time per iteration (s): 0.59 | learning rate: 7.778E-05 | global batch size: 256 | lm loss: 2.650955E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.132 | TFLOPs: 41.68 | +7: iteration 71480/ 115203 | consumed samples: 18298880 | consumed tokens: 37476106240 | elapsed time per iteration (s): 0.57 | learning rate: 7.776E-05 | global batch size: 256 | lm loss: 2.650406E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.827 | TFLOPs: 42.98 | +7: iteration 71490/ 115203 | consumed samples: 18301440 | consumed tokens: 37481349120 | elapsed time per iteration (s): 0.57 | learning rate: 7.773E-05 | global batch size: 256 | lm loss: 2.648938E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.872 | TFLOPs: 43.08 | +7: iteration 71500/ 115203 | consumed samples: 18304000 | consumed tokens: 37486592000 | elapsed time per iteration (s): 0.58 | learning rate: 7.771E-05 | global batch size: 256 | lm loss: 2.650364E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.697 | TFLOPs: 41.73 | +7: iteration 71510/ 115203 | consumed samples: 18306560 | consumed tokens: 37491834880 | elapsed time per iteration (s): 0.56 | learning rate: 7.769E-05 | global batch size: 256 | lm loss: 2.631698E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.549 | TFLOPs: 43.24 | +7: iteration 71520/ 115203 | consumed samples: 18309120 | consumed tokens: 37497077760 | elapsed time per iteration (s): 0.60 | learning rate: 7.766E-05 | global batch size: 256 | lm loss: 2.650374E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.391 | TFLOPs: 40.84 | +7: iteration 71530/ 115203 | consumed samples: 18311680 | consumed tokens: 37502320640 | elapsed time per iteration (s): 0.60 | learning rate: 7.764E-05 | global batch size: 256 | lm loss: 2.645358E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.097 | TFLOPs: 40.72 | +7: iteration 71540/ 115203 | consumed samples: 18314240 | consumed tokens: 37507563520 | elapsed time per iteration (s): 0.57 | learning rate: 7.762E-05 | global batch size: 256 | lm loss: 2.646046E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.727 | TFLOPs: 42.69 | +7: iteration 71550/ 115203 | consumed samples: 18316800 | consumed tokens: 37512806400 | elapsed time per iteration (s): 0.56 | learning rate: 7.759E-05 | global batch size: 256 | lm loss: 2.651458E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.437 | TFLOPs: 43.52 | +7: iteration 71560/ 115203 | consumed samples: 18319360 | consumed tokens: 37518049280 | elapsed time per iteration (s): 0.58 | learning rate: 7.757E-05 | global batch size: 256 | lm loss: 2.637913E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.446 | TFLOPs: 42.09 | +7: iteration 71570/ 115203 | consumed samples: 18321920 | consumed tokens: 37523292160 | elapsed time per iteration (s): 0.57 | learning rate: 7.755E-05 | global batch size: 256 | lm loss: 2.641392E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.237 | TFLOPs: 43.02 | +7: iteration 71580/ 115203 | consumed samples: 18324480 | consumed tokens: 37528535040 | elapsed time per iteration (s): 0.56 | learning rate: 7.752E-05 | global batch size: 256 | lm loss: 2.649509E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.818 | TFLOPs: 43.36 | +7: iteration 71590/ 115203 | consumed samples: 18327040 | consumed tokens: 37533777920 | elapsed time per iteration (s): 0.58 | learning rate: 7.750E-05 | global batch size: 256 | lm loss: 2.646009E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.751 | TFLOPs: 42.31 | +7: iteration 71600/ 115203 | consumed samples: 18329600 | consumed tokens: 37539020800 | elapsed time per iteration (s): 0.57 | learning rate: 7.748E-05 | global batch size: 256 | lm loss: 2.663098E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.620 | TFLOPs: 43.15 | +7: iteration 71610/ 115203 | consumed samples: 18332160 | consumed tokens: 37544263680 | elapsed time per iteration (s): 0.56 | learning rate: 7.746E-05 | global batch size: 256 | lm loss: 2.653737E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.815 | TFLOPs: 43.55 | +7: iteration 71620/ 115203 | consumed samples: 18334720 | consumed tokens: 37549506560 | elapsed time per iteration (s): 0.57 | learning rate: 7.743E-05 | global batch size: 256 | lm loss: 2.651890E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.127 | TFLOPs: 42.82 | +7: iteration 71630/ 115203 | consumed samples: 18337280 | consumed tokens: 37554749440 | elapsed time per iteration (s): 0.57 | learning rate: 7.741E-05 | global batch size: 256 | lm loss: 2.651765E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.828 | TFLOPs: 42.98 | +7: iteration 71640/ 115203 | consumed samples: 18339840 | consumed tokens: 37559992320 | elapsed time per iteration (s): 0.56 | learning rate: 7.739E-05 | global batch size: 256 | lm loss: 2.651529E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.155 | TFLOPs: 43.20 | +7: iteration 71650/ 115203 | consumed samples: 18342400 | consumed tokens: 37565235200 | elapsed time per iteration (s): 0.56 | learning rate: 7.736E-05 | global batch size: 256 | lm loss: 2.650885E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.408 | TFLOPs: 43.32 | +7: iteration 71660/ 115203 | consumed samples: 18344960 | consumed tokens: 37570478080 | elapsed time per iteration (s): 0.57 | learning rate: 7.734E-05 | global batch size: 256 | lm loss: 2.650028E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.799 | TFLOPs: 42.79 | +7: iteration 71670/ 115203 | consumed samples: 18347520 | consumed tokens: 37575720960 | elapsed time per iteration (s): 0.57 | learning rate: 7.732E-05 | global batch size: 256 | lm loss: 2.639204E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.032 | TFLOPs: 42.52 | +7: iteration 71680/ 115203 | consumed samples: 18350080 | consumed tokens: 37580963840 | elapsed time per iteration (s): 0.57 | learning rate: 7.729E-05 | global batch size: 256 | lm loss: 2.654334E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.572 | TFLOPs: 42.86 | +7: iteration 71690/ 115203 | consumed samples: 18352640 | consumed tokens: 37586206720 | elapsed time per iteration (s): 0.57 | learning rate: 7.727E-05 | global batch size: 256 | lm loss: 2.657578E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.457 | TFLOPs: 42.47 | +7: iteration 71700/ 115203 | consumed samples: 18355200 | consumed tokens: 37591449600 | elapsed time per iteration (s): 0.57 | learning rate: 7.725E-05 | global batch size: 256 | lm loss: 2.646025E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.523 | TFLOPs: 42.76 | +7: iteration 71710/ 115203 | consumed samples: 18357760 | consumed tokens: 37596692480 | elapsed time per iteration (s): 0.57 | learning rate: 7.722E-05 | global batch size: 256 | lm loss: 2.647199E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.836 | TFLOPs: 43.08 | +7: iteration 71720/ 115203 | consumed samples: 18360320 | consumed tokens: 37601935360 | elapsed time per iteration (s): 0.57 | learning rate: 7.720E-05 | global batch size: 256 | lm loss: 2.652011E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.317 | TFLOPs: 42.55 | +7: iteration 71730/ 115203 | consumed samples: 18362880 | consumed tokens: 37607178240 | elapsed time per iteration (s): 0.57 | learning rate: 7.718E-05 | global batch size: 256 | lm loss: 2.649901E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.785 | TFLOPs: 42.79 | +7: iteration 71740/ 115203 | consumed samples: 18365440 | consumed tokens: 37612421120 | elapsed time per iteration (s): 0.56 | learning rate: 7.716E-05 | global batch size: 256 | lm loss: 2.660165E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.281 | TFLOPs: 43.22 | +7: iteration 71750/ 115203 | consumed samples: 18368000 | consumed tokens: 37617664000 | elapsed time per iteration (s): 0.57 | learning rate: 7.713E-05 | global batch size: 256 | lm loss: 2.645842E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.790 | TFLOPs: 42.98 | +7: iteration 71760/ 115203 | consumed samples: 18370560 | consumed tokens: 37622906880 | elapsed time per iteration (s): 0.56 | learning rate: 7.711E-05 | global batch size: 256 | lm loss: 2.672961E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.869 | TFLOPs: 43.37 | +7: iteration 71770/ 115203 | consumed samples: 18373120 | consumed tokens: 37628149760 | elapsed time per iteration (s): 0.57 | learning rate: 7.709E-05 | global batch size: 256 | lm loss: 2.643507E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.668 | TFLOPs: 42.68 | +7: iteration 71780/ 115203 | consumed samples: 18375680 | consumed tokens: 37633392640 | elapsed time per iteration (s): 0.56 | learning rate: 7.706E-05 | global batch size: 256 | lm loss: 2.651028E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.300 | TFLOPs: 43.22 | +7: iteration 71790/ 115203 | consumed samples: 18378240 | consumed tokens: 37638635520 | elapsed time per iteration (s): 0.57 | learning rate: 7.704E-05 | global batch size: 256 | lm loss: 2.665461E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.292 | TFLOPs: 42.55 | +7: iteration 71800/ 115203 | consumed samples: 18380800 | consumed tokens: 37643878400 | elapsed time per iteration (s): 0.56 | learning rate: 7.702E-05 | global batch size: 256 | lm loss: 2.633533E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.967 | TFLOPs: 43.38 | +7: iteration 71810/ 115203 | consumed samples: 18383360 | consumed tokens: 37649121280 | elapsed time per iteration (s): 0.56 | learning rate: 7.699E-05 | global batch size: 256 | lm loss: 2.651351E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.513 | TFLOPs: 43.71 | +7: iteration 71820/ 115203 | consumed samples: 18385920 | consumed tokens: 37654364160 | elapsed time per iteration (s): 0.56 | learning rate: 7.697E-05 | global batch size: 256 | lm loss: 2.654869E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.298 | TFLOPs: 43.41 | +7: iteration 71830/ 115203 | consumed samples: 18388480 | consumed tokens: 37659607040 | elapsed time per iteration (s): 0.57 | learning rate: 7.695E-05 | global batch size: 256 | lm loss: 2.638108E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.709 | TFLOPs: 42.78 | +7: iteration 71840/ 115203 | consumed samples: 18391040 | consumed tokens: 37664849920 | elapsed time per iteration (s): 0.56 | learning rate: 7.692E-05 | global batch size: 256 | lm loss: 2.635486E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.553 | TFLOPs: 43.72 | +7: iteration 71850/ 115203 | consumed samples: 18393600 | consumed tokens: 37670092800 | elapsed time per iteration (s): 0.56 | learning rate: 7.690E-05 | global batch size: 256 | lm loss: 2.657693E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.536 | TFLOPs: 43.43 | +7: iteration 71860/ 115203 | consumed samples: 18396160 | consumed tokens: 37675335680 | elapsed time per iteration (s): 0.56 | learning rate: 7.688E-05 | global batch size: 256 | lm loss: 2.645231E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.240 | TFLOPs: 43.21 | +7: iteration 71870/ 115203 | consumed samples: 18398720 | consumed tokens: 37680578560 | elapsed time per iteration (s): 0.57 | learning rate: 7.686E-05 | global batch size: 256 | lm loss: 2.653282E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.782 | TFLOPs: 42.60 | +7: iteration 71880/ 115203 | consumed samples: 18401280 | consumed tokens: 37685821440 | elapsed time per iteration (s): 0.58 | learning rate: 7.683E-05 | global batch size: 256 | lm loss: 2.633260E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.160 | TFLOPs: 42.35 | +7: iteration 71890/ 115203 | consumed samples: 18403840 | consumed tokens: 37691064320 | elapsed time per iteration (s): 0.57 | learning rate: 7.681E-05 | global batch size: 256 | lm loss: 2.643778E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.517 | TFLOPs: 42.86 | +7: iteration 71900/ 115203 | consumed samples: 18406400 | consumed tokens: 37696307200 | elapsed time per iteration (s): 0.57 | learning rate: 7.679E-05 | global batch size: 256 | lm loss: 2.654817E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.582 | TFLOPs: 43.05 | +7: iteration 71910/ 115203 | consumed samples: 18408960 | consumed tokens: 37701550080 | elapsed time per iteration (s): 0.56 | learning rate: 7.676E-05 | global batch size: 256 | lm loss: 2.652742E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.117 | TFLOPs: 43.49 | +7: iteration 71920/ 115203 | consumed samples: 18411520 | consumed tokens: 37706792960 | elapsed time per iteration (s): 0.57 | learning rate: 7.674E-05 | global batch size: 256 | lm loss: 2.641608E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.981 | TFLOPs: 42.52 | +7: iteration 71930/ 115203 | consumed samples: 18414080 | consumed tokens: 37712035840 | elapsed time per iteration (s): 0.57 | learning rate: 7.672E-05 | global batch size: 256 | lm loss: 2.646071E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.040 | TFLOPs: 42.91 | +7: iteration 71940/ 115203 | consumed samples: 18416640 | consumed tokens: 37717278720 | elapsed time per iteration (s): 0.57 | learning rate: 7.669E-05 | global batch size: 256 | lm loss: 2.646671E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.915 | TFLOPs: 42.70 | +7: iteration 71950/ 115203 | consumed samples: 18419200 | consumed tokens: 37722521600 | elapsed time per iteration (s): 0.56 | learning rate: 7.667E-05 | global batch size: 256 | lm loss: 2.648676E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.956 | TFLOPs: 43.28 | +7: iteration 71960/ 115203 | consumed samples: 18421760 | consumed tokens: 37727764480 | elapsed time per iteration (s): 0.56 | learning rate: 7.665E-05 | global batch size: 256 | lm loss: 2.642416E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.640 | TFLOPs: 43.25 | +7: iteration 71970/ 115203 | consumed samples: 18424320 | consumed tokens: 37733007360 | elapsed time per iteration (s): 0.56 | learning rate: 7.662E-05 | global batch size: 256 | lm loss: 2.649188E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.266 | TFLOPs: 43.31 | +7: iteration 71980/ 115203 | consumed samples: 18426880 | consumed tokens: 37738250240 | elapsed time per iteration (s): 0.57 | learning rate: 7.660E-05 | global batch size: 256 | lm loss: 2.646214E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.044 | TFLOPs: 42.91 | +7: iteration 71990/ 115203 | consumed samples: 18429440 | consumed tokens: 37743493120 | elapsed time per iteration (s): 0.58 | learning rate: 7.658E-05 | global batch size: 256 | lm loss: 2.647572E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.696 | TFLOPs: 42.21 | +0: [2023-03-17 00:07:38,475] [INFO] [logging.py:68:log_dist] [Rank 0] step=72000, skipped=0, lr=[7.655593093399763e-05, 7.655593093399763e-05, 7.655593093399763e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 72000/ 115203 | consumed samples: 18432000 | consumed tokens: 37748736000 | elapsed time per iteration (s): 0.57 | learning rate: 7.656E-05 | global batch size: 256 | lm loss: 2.644434E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.008 | TFLOPs: 42.62 | +0: steps: 72000 loss: 2.6378 iter time (s): 0.566 samples/sec: 452.118 +7: iteration 72010/ 115203 | consumed samples: 18434560 | consumed tokens: 37753978880 | elapsed time per iteration (s): 0.56 | learning rate: 7.653E-05 | global batch size: 256 | lm loss: 2.643242E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.293 | TFLOPs: 43.41 | +7: iteration 72020/ 115203 | consumed samples: 18437120 | consumed tokens: 37759221760 | elapsed time per iteration (s): 0.57 | learning rate: 7.651E-05 | global batch size: 256 | lm loss: 2.652248E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.900 | TFLOPs: 43.18 | +7: iteration 72030/ 115203 | consumed samples: 18439680 | consumed tokens: 37764464640 | elapsed time per iteration (s): 0.57 | learning rate: 7.649E-05 | global batch size: 256 | lm loss: 2.641263E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.084 | TFLOPs: 43.01 | +7: iteration 72040/ 115203 | consumed samples: 18442240 | consumed tokens: 37769707520 | elapsed time per iteration (s): 0.58 | learning rate: 7.646E-05 | global batch size: 256 | lm loss: 2.642277E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.288 | TFLOPs: 42.26 | +7: iteration 72050/ 115203 | consumed samples: 18444800 | consumed tokens: 37774950400 | elapsed time per iteration (s): 0.57 | learning rate: 7.644E-05 | global batch size: 256 | lm loss: 2.645667E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.087 | TFLOPs: 43.20 | +7: iteration 72060/ 115203 | consumed samples: 18447360 | consumed tokens: 37780193280 | elapsed time per iteration (s): 0.56 | learning rate: 7.642E-05 | global batch size: 256 | lm loss: 2.653564E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.364 | TFLOPs: 43.32 | +7: iteration 72070/ 115203 | consumed samples: 18449920 | consumed tokens: 37785436160 | elapsed time per iteration (s): 0.55 | learning rate: 7.639E-05 | global batch size: 256 | lm loss: 2.643242E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.702 | TFLOPs: 44.02 | +7: iteration 72080/ 115203 | consumed samples: 18452480 | consumed tokens: 37790679040 | elapsed time per iteration (s): 0.57 | learning rate: 7.637E-05 | global batch size: 256 | lm loss: 2.647361E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.420 | TFLOPs: 43.13 | +7: iteration 72090/ 115203 | consumed samples: 18455040 | consumed tokens: 37795921920 | elapsed time per iteration (s): 0.57 | learning rate: 7.635E-05 | global batch size: 256 | lm loss: 2.641900E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.449 | TFLOPs: 43.04 | +7: iteration 72100/ 115203 | consumed samples: 18457600 | consumed tokens: 37801164800 | elapsed time per iteration (s): 0.56 | learning rate: 7.633E-05 | global batch size: 256 | lm loss: 2.648864E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.828 | TFLOPs: 43.36 | +7: iteration 72110/ 115203 | consumed samples: 18460160 | consumed tokens: 37806407680 | elapsed time per iteration (s): 0.57 | learning rate: 7.630E-05 | global batch size: 256 | lm loss: 2.642756E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.360 | TFLOPs: 42.65 | +7: iteration 72120/ 115203 | consumed samples: 18462720 | consumed tokens: 37811650560 | elapsed time per iteration (s): 0.55 | learning rate: 7.628E-05 | global batch size: 256 | lm loss: 2.634726E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.404 | TFLOPs: 43.99 | +7: iteration 72130/ 115203 | consumed samples: 18465280 | consumed tokens: 37816893440 | elapsed time per iteration (s): 0.57 | learning rate: 7.626E-05 | global batch size: 256 | lm loss: 2.657297E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.431 | TFLOPs: 42.75 | +7: iteration 72140/ 115203 | consumed samples: 18467840 | consumed tokens: 37822136320 | elapsed time per iteration (s): 0.56 | learning rate: 7.623E-05 | global batch size: 256 | lm loss: 2.655116E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.746 | TFLOPs: 43.45 | +7: iteration 72150/ 115203 | consumed samples: 18470400 | consumed tokens: 37827379200 | elapsed time per iteration (s): 0.57 | learning rate: 7.621E-05 | global batch size: 256 | lm loss: 2.639418E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.735 | TFLOPs: 43.16 | +7: iteration 72160/ 115203 | consumed samples: 18472960 | consumed tokens: 37832622080 | elapsed time per iteration (s): 0.57 | learning rate: 7.619E-05 | global batch size: 256 | lm loss: 2.665936E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.729 | TFLOPs: 42.97 | +7: iteration 72170/ 115203 | consumed samples: 18475520 | consumed tokens: 37837864960 | elapsed time per iteration (s): 0.56 | learning rate: 7.617E-05 | global batch size: 256 | lm loss: 2.662220E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.810 | TFLOPs: 43.46 | +7: iteration 72180/ 115203 | consumed samples: 18478080 | consumed tokens: 37843107840 | elapsed time per iteration (s): 0.57 | learning rate: 7.614E-05 | global batch size: 256 | lm loss: 2.653554E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.699 | TFLOPs: 43.16 | +7: iteration 72190/ 115203 | consumed samples: 18480640 | consumed tokens: 37848350720 | elapsed time per iteration (s): 0.57 | learning rate: 7.612E-05 | global batch size: 256 | lm loss: 2.652906E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.055 | TFLOPs: 43.10 | +7: iteration 72200/ 115203 | consumed samples: 18483200 | consumed tokens: 37853593600 | elapsed time per iteration (s): 0.56 | learning rate: 7.610E-05 | global batch size: 256 | lm loss: 2.641743E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.638 | TFLOPs: 43.25 | +7: iteration 72210/ 115203 | consumed samples: 18485760 | consumed tokens: 37858836480 | elapsed time per iteration (s): 0.57 | learning rate: 7.607E-05 | global batch size: 256 | lm loss: 2.636948E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.553 | TFLOPs: 42.57 | +7: iteration 72220/ 115203 | consumed samples: 18488320 | consumed tokens: 37864079360 | elapsed time per iteration (s): 0.56 | learning rate: 7.605E-05 | global batch size: 256 | lm loss: 2.646337E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.831 | TFLOPs: 43.36 | +7: iteration 72230/ 115203 | consumed samples: 18490880 | consumed tokens: 37869322240 | elapsed time per iteration (s): 0.56 | learning rate: 7.603E-05 | global batch size: 256 | lm loss: 2.630893E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.727 | TFLOPs: 43.83 | +7: iteration 72240/ 115203 | consumed samples: 18493440 | consumed tokens: 37874565120 | elapsed time per iteration (s): 0.56 | learning rate: 7.600E-05 | global batch size: 256 | lm loss: 2.657442E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.222 | TFLOPs: 43.69 | +7: iteration 72250/ 115203 | consumed samples: 18496000 | consumed tokens: 37879808000 | elapsed time per iteration (s): 0.56 | learning rate: 7.598E-05 | global batch size: 256 | lm loss: 2.637424E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.493 | TFLOPs: 43.43 | +7: iteration 72260/ 115203 | consumed samples: 18498560 | consumed tokens: 37885050880 | elapsed time per iteration (s): 0.56 | learning rate: 7.596E-05 | global batch size: 256 | lm loss: 2.641742E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.224 | TFLOPs: 43.50 | +7: iteration 72270/ 115203 | consumed samples: 18501120 | consumed tokens: 37890293760 | elapsed time per iteration (s): 0.57 | learning rate: 7.594E-05 | global batch size: 256 | lm loss: 2.631911E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.657 | TFLOPs: 42.77 | +7: iteration 72280/ 115203 | consumed samples: 18503680 | consumed tokens: 37895536640 | elapsed time per iteration (s): 0.57 | learning rate: 7.591E-05 | global batch size: 256 | lm loss: 2.641858E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.850 | TFLOPs: 43.08 | +7: iteration 72290/ 115203 | consumed samples: 18506240 | consumed tokens: 37900779520 | elapsed time per iteration (s): 0.56 | learning rate: 7.589E-05 | global batch size: 256 | lm loss: 2.652009E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.556 | TFLOPs: 43.43 | +7: iteration 72300/ 115203 | consumed samples: 18508800 | consumed tokens: 37906022400 | elapsed time per iteration (s): 0.57 | learning rate: 7.587E-05 | global batch size: 256 | lm loss: 2.653318E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.793 | TFLOPs: 43.07 | +7: iteration 72310/ 115203 | consumed samples: 18511360 | consumed tokens: 37911265280 | elapsed time per iteration (s): 0.57 | learning rate: 7.584E-05 | global batch size: 256 | lm loss: 2.650011E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.118 | TFLOPs: 42.53 | +7: iteration 72320/ 115203 | consumed samples: 18513920 | consumed tokens: 37916508160 | elapsed time per iteration (s): 0.56 | learning rate: 7.582E-05 | global batch size: 256 | lm loss: 2.636412E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.839 | TFLOPs: 43.65 | +7: iteration 72330/ 115203 | consumed samples: 18516480 | consumed tokens: 37921751040 | elapsed time per iteration (s): 0.56 | learning rate: 7.580E-05 | global batch size: 256 | lm loss: 2.653620E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.239 | TFLOPs: 43.31 | +7: iteration 72340/ 115203 | consumed samples: 18519040 | consumed tokens: 37926993920 | elapsed time per iteration (s): 0.57 | learning rate: 7.577E-05 | global batch size: 256 | lm loss: 2.649747E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.639 | TFLOPs: 42.68 | +7: iteration 72350/ 115203 | consumed samples: 18521600 | consumed tokens: 37932236800 | elapsed time per iteration (s): 0.56 | learning rate: 7.575E-05 | global batch size: 256 | lm loss: 2.660738E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.313 | TFLOPs: 43.22 | +7: iteration 72360/ 115203 | consumed samples: 18524160 | consumed tokens: 37937479680 | elapsed time per iteration (s): 0.57 | learning rate: 7.573E-05 | global batch size: 256 | lm loss: 2.649053E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.300 | TFLOPs: 42.74 | +7: iteration 72370/ 115203 | consumed samples: 18526720 | consumed tokens: 37942722560 | elapsed time per iteration (s): 0.57 | learning rate: 7.571E-05 | global batch size: 256 | lm loss: 2.629589E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.522 | TFLOPs: 42.76 | +7: iteration 72380/ 115203 | consumed samples: 18529280 | consumed tokens: 37947965440 | elapsed time per iteration (s): 0.57 | learning rate: 7.568E-05 | global batch size: 256 | lm loss: 2.646080E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.958 | TFLOPs: 42.61 | +7: iteration 72390/ 115203 | consumed samples: 18531840 | consumed tokens: 37953208320 | elapsed time per iteration (s): 0.59 | learning rate: 7.566E-05 | global batch size: 256 | lm loss: 2.645090E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.938 | TFLOPs: 41.09 | +7: iteration 72400/ 115203 | consumed samples: 18534400 | consumed tokens: 37958451200 | elapsed time per iteration (s): 0.56 | learning rate: 7.564E-05 | global batch size: 256 | lm loss: 2.638911E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.535 | TFLOPs: 43.91 | +7: iteration 72410/ 115203 | consumed samples: 18536960 | consumed tokens: 37963694080 | elapsed time per iteration (s): 0.57 | learning rate: 7.561E-05 | global batch size: 256 | lm loss: 2.646309E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.191 | TFLOPs: 42.54 | +7: iteration 72420/ 115203 | consumed samples: 18539520 | consumed tokens: 37968936960 | elapsed time per iteration (s): 0.55 | learning rate: 7.559E-05 | global batch size: 256 | lm loss: 2.645274E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 72430/ 115203 | consumed samples: 18542080 | consumed tokens: 37974179840 | elapsed time per iteration (s): 0.56 | learning rate: 7.557E-05 | global batch size: 256 | lm loss: 2.650475E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.355 | TFLOPs: 43.41 | +7: iteration 72440/ 115203 | consumed samples: 18544640 | consumed tokens: 37979422720 | elapsed time per iteration (s): 0.57 | learning rate: 7.555E-05 | global batch size: 256 | lm loss: 2.636892E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.562 | TFLOPs: 42.48 | +7: iteration 72450/ 115203 | consumed samples: 18547200 | consumed tokens: 37984665600 | elapsed time per iteration (s): 0.56 | learning rate: 7.552E-05 | global batch size: 256 | lm loss: 2.645544E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.803 | TFLOPs: 43.27 | +7: iteration 72460/ 115203 | consumed samples: 18549760 | consumed tokens: 37989908480 | elapsed time per iteration (s): 0.57 | learning rate: 7.550E-05 | global batch size: 256 | lm loss: 2.649792E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.490 | TFLOPs: 42.76 | +7: iteration 72470/ 115203 | consumed samples: 18552320 | consumed tokens: 37995151360 | elapsed time per iteration (s): 0.56 | learning rate: 7.548E-05 | global batch size: 256 | lm loss: 2.642270E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.618 | TFLOPs: 43.44 | +7: iteration 72480/ 115203 | consumed samples: 18554880 | consumed tokens: 38000394240 | elapsed time per iteration (s): 0.56 | learning rate: 7.545E-05 | global batch size: 256 | lm loss: 2.647233E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.075 | TFLOPs: 43.67 | +7: iteration 72490/ 115203 | consumed samples: 18557440 | consumed tokens: 38005637120 | elapsed time per iteration (s): 0.57 | learning rate: 7.543E-05 | global batch size: 256 | lm loss: 2.656077E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.194 | TFLOPs: 42.92 | +7: iteration 72500/ 115203 | consumed samples: 18560000 | consumed tokens: 38010880000 | elapsed time per iteration (s): 0.56 | learning rate: 7.541E-05 | global batch size: 256 | lm loss: 2.646523E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.426 | TFLOPs: 43.42 | +7: iteration 72510/ 115203 | consumed samples: 18562560 | consumed tokens: 38016122880 | elapsed time per iteration (s): 0.56 | learning rate: 7.539E-05 | global batch size: 256 | lm loss: 2.635933E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.169 | TFLOPs: 43.49 | +7: iteration 72520/ 115203 | consumed samples: 18565120 | consumed tokens: 38021365760 | elapsed time per iteration (s): 0.56 | learning rate: 7.536E-05 | global batch size: 256 | lm loss: 2.648184E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.973 | TFLOPs: 43.28 | +7: iteration 72530/ 115203 | consumed samples: 18567680 | consumed tokens: 38026608640 | elapsed time per iteration (s): 0.57 | learning rate: 7.534E-05 | global batch size: 256 | lm loss: 2.670074E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.262 | TFLOPs: 43.12 | +7: iteration 72540/ 115203 | consumed samples: 18570240 | consumed tokens: 38031851520 | elapsed time per iteration (s): 0.57 | learning rate: 7.532E-05 | global batch size: 256 | lm loss: 2.640846E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.193 | TFLOPs: 43.02 | +7: iteration 72550/ 115203 | consumed samples: 18572800 | consumed tokens: 38037094400 | elapsed time per iteration (s): 0.56 | learning rate: 7.529E-05 | global batch size: 256 | lm loss: 2.645231E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.369 | TFLOPs: 43.41 | +7: iteration 72560/ 115203 | consumed samples: 18575360 | consumed tokens: 38042337280 | elapsed time per iteration (s): 0.58 | learning rate: 7.527E-05 | global batch size: 256 | lm loss: 2.636908E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.641 | TFLOPs: 42.01 | +7: iteration 72570/ 115203 | consumed samples: 18577920 | consumed tokens: 38047580160 | elapsed time per iteration (s): 0.56 | learning rate: 7.525E-05 | global batch size: 256 | lm loss: 2.648206E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.146 | TFLOPs: 43.58 | +7: iteration 72580/ 115203 | consumed samples: 18580480 | consumed tokens: 38052823040 | elapsed time per iteration (s): 0.57 | learning rate: 7.523E-05 | global batch size: 256 | lm loss: 2.639744E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.415 | TFLOPs: 42.94 | +7: iteration 72590/ 115203 | consumed samples: 18583040 | consumed tokens: 38058065920 | elapsed time per iteration (s): 0.57 | learning rate: 7.520E-05 | global batch size: 256 | lm loss: 2.654952E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.239 | TFLOPs: 42.93 | +7: iteration 72600/ 115203 | consumed samples: 18585600 | consumed tokens: 38063308800 | elapsed time per iteration (s): 0.56 | learning rate: 7.518E-05 | global batch size: 256 | lm loss: 2.650150E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.054 | TFLOPs: 43.29 | +7: iteration 72610/ 115203 | consumed samples: 18588160 | consumed tokens: 38068551680 | elapsed time per iteration (s): 0.57 | learning rate: 7.516E-05 | global batch size: 256 | lm loss: 2.638299E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.497 | TFLOPs: 43.14 | +7: iteration 72620/ 115203 | consumed samples: 18590720 | consumed tokens: 38073794560 | elapsed time per iteration (s): 0.57 | learning rate: 7.513E-05 | global batch size: 256 | lm loss: 2.645916E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.774 | TFLOPs: 42.88 | +7: iteration 72630/ 115203 | consumed samples: 18593280 | consumed tokens: 38079037440 | elapsed time per iteration (s): 0.57 | learning rate: 7.511E-05 | global batch size: 256 | lm loss: 2.629572E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.381 | TFLOPs: 43.13 | +7: iteration 72640/ 115203 | consumed samples: 18595840 | consumed tokens: 38084280320 | elapsed time per iteration (s): 0.56 | learning rate: 7.509E-05 | global batch size: 256 | lm loss: 2.643159E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.417 | TFLOPs: 43.51 | +7: iteration 72650/ 115203 | consumed samples: 18598400 | consumed tokens: 38089523200 | elapsed time per iteration (s): 0.57 | learning rate: 7.507E-05 | global batch size: 256 | lm loss: 2.654500E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.330 | TFLOPs: 43.12 | +7: iteration 72660/ 115203 | consumed samples: 18600960 | consumed tokens: 38094766080 | elapsed time per iteration (s): 0.56 | learning rate: 7.504E-05 | global batch size: 256 | lm loss: 2.630498E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.488 | TFLOPs: 43.33 | +7: iteration 72670/ 115203 | consumed samples: 18603520 | consumed tokens: 38100008960 | elapsed time per iteration (s): 0.57 | learning rate: 7.502E-05 | global batch size: 256 | lm loss: 2.648836E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.982 | TFLOPs: 43.09 | +7: iteration 72680/ 115203 | consumed samples: 18606080 | consumed tokens: 38105251840 | elapsed time per iteration (s): 0.57 | learning rate: 7.500E-05 | global batch size: 256 | lm loss: 2.642902E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.409 | TFLOPs: 42.75 | +7: iteration 72690/ 115203 | consumed samples: 18608640 | consumed tokens: 38110494720 | elapsed time per iteration (s): 0.57 | learning rate: 7.497E-05 | global batch size: 256 | lm loss: 2.630587E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.369 | TFLOPs: 43.13 | +7: iteration 72700/ 115203 | consumed samples: 18611200 | consumed tokens: 38115737600 | elapsed time per iteration (s): 0.57 | learning rate: 7.495E-05 | global batch size: 256 | lm loss: 2.651547E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.247 | TFLOPs: 43.02 | +7: iteration 72710/ 115203 | consumed samples: 18613760 | consumed tokens: 38120980480 | elapsed time per iteration (s): 0.57 | learning rate: 7.493E-05 | global batch size: 256 | lm loss: 2.637525E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.103 | TFLOPs: 43.10 | +7: iteration 72720/ 115203 | consumed samples: 18616320 | consumed tokens: 38126223360 | elapsed time per iteration (s): 0.55 | learning rate: 7.491E-05 | global batch size: 256 | lm loss: 2.635950E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 72730/ 115203 | consumed samples: 18618880 | consumed tokens: 38131466240 | elapsed time per iteration (s): 0.57 | learning rate: 7.488E-05 | global batch size: 256 | lm loss: 2.638787E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.279 | TFLOPs: 43.02 | +7: iteration 72740/ 115203 | consumed samples: 18621440 | consumed tokens: 38136709120 | elapsed time per iteration (s): 0.56 | learning rate: 7.486E-05 | global batch size: 256 | lm loss: 2.645001E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.536 | TFLOPs: 43.43 | +7: iteration 72750/ 115203 | consumed samples: 18624000 | consumed tokens: 38141952000 | elapsed time per iteration (s): 0.57 | learning rate: 7.484E-05 | global batch size: 256 | lm loss: 2.648926E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.916 | TFLOPs: 42.89 | +7: iteration 72760/ 115203 | consumed samples: 18626560 | consumed tokens: 38147194880 | elapsed time per iteration (s): 0.57 | learning rate: 7.481E-05 | global batch size: 256 | lm loss: 2.638316E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.598 | TFLOPs: 42.86 | +7: iteration 72770/ 115203 | consumed samples: 18629120 | consumed tokens: 38152437760 | elapsed time per iteration (s): 0.56 | learning rate: 7.479E-05 | global batch size: 256 | lm loss: 2.645868E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.174 | TFLOPs: 43.40 | +7: iteration 72780/ 115203 | consumed samples: 18631680 | consumed tokens: 38157680640 | elapsed time per iteration (s): 0.56 | learning rate: 7.477E-05 | global batch size: 256 | lm loss: 2.644280E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.420 | TFLOPs: 43.42 | +7: iteration 72790/ 115203 | consumed samples: 18634240 | consumed tokens: 38162923520 | elapsed time per iteration (s): 0.57 | learning rate: 7.475E-05 | global batch size: 256 | lm loss: 2.635897E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.873 | TFLOPs: 43.18 | +7: iteration 72800/ 115203 | consumed samples: 18636800 | consumed tokens: 38168166400 | elapsed time per iteration (s): 0.57 | learning rate: 7.472E-05 | global batch size: 256 | lm loss: 2.637164E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.707 | TFLOPs: 43.07 | +7: iteration 72810/ 115203 | consumed samples: 18639360 | consumed tokens: 38173409280 | elapsed time per iteration (s): 0.56 | learning rate: 7.470E-05 | global batch size: 256 | lm loss: 2.654687E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.445 | TFLOPs: 43.71 | +7: iteration 72820/ 115203 | consumed samples: 18641920 | consumed tokens: 38178652160 | elapsed time per iteration (s): 0.57 | learning rate: 7.468E-05 | global batch size: 256 | lm loss: 2.641144E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.451 | TFLOPs: 42.47 | +7: iteration 72830/ 115203 | consumed samples: 18644480 | consumed tokens: 38183895040 | elapsed time per iteration (s): 0.55 | learning rate: 7.465E-05 | global batch size: 256 | lm loss: 2.636277E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.576 | TFLOPs: 44.01 | +7: iteration 72840/ 115203 | consumed samples: 18647040 | consumed tokens: 38189137920 | elapsed time per iteration (s): 0.57 | learning rate: 7.463E-05 | global batch size: 256 | lm loss: 2.639514E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.502 | TFLOPs: 43.05 | +7: iteration 72850/ 115203 | consumed samples: 18649600 | consumed tokens: 38194380800 | elapsed time per iteration (s): 0.57 | learning rate: 7.461E-05 | global batch size: 256 | lm loss: 2.640683E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.399 | TFLOPs: 42.94 | +7: iteration 72860/ 115203 | consumed samples: 18652160 | consumed tokens: 38199623680 | elapsed time per iteration (s): 0.56 | learning rate: 7.459E-05 | global batch size: 256 | lm loss: 2.655760E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.002 | TFLOPs: 43.47 | +7: iteration 72870/ 115203 | consumed samples: 18654720 | consumed tokens: 38204866560 | elapsed time per iteration (s): 0.57 | learning rate: 7.456E-05 | global batch size: 256 | lm loss: 2.638315E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.239 | TFLOPs: 42.83 | +7: iteration 72880/ 115203 | consumed samples: 18657280 | consumed tokens: 38210109440 | elapsed time per iteration (s): 0.57 | learning rate: 7.454E-05 | global batch size: 256 | lm loss: 2.649931E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.399 | TFLOPs: 43.04 | +7: iteration 72890/ 115203 | consumed samples: 18659840 | consumed tokens: 38215352320 | elapsed time per iteration (s): 0.55 | learning rate: 7.452E-05 | global batch size: 256 | lm loss: 2.634637E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.637 | TFLOPs: 44.01 | +7: iteration 72900/ 115203 | consumed samples: 18662400 | consumed tokens: 38220595200 | elapsed time per iteration (s): 0.56 | learning rate: 7.450E-05 | global batch size: 256 | lm loss: 2.657262E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.858 | TFLOPs: 43.65 | +7: iteration 72910/ 115203 | consumed samples: 18664960 | consumed tokens: 38225838080 | elapsed time per iteration (s): 0.57 | learning rate: 7.447E-05 | global batch size: 256 | lm loss: 2.642367E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.299 | TFLOPs: 42.45 | +7: iteration 72920/ 115203 | consumed samples: 18667520 | consumed tokens: 38231080960 | elapsed time per iteration (s): 0.56 | learning rate: 7.445E-05 | global batch size: 256 | lm loss: 2.636766E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.283 | TFLOPs: 43.60 | +7: iteration 72930/ 115203 | consumed samples: 18670080 | consumed tokens: 38236323840 | elapsed time per iteration (s): 0.57 | learning rate: 7.443E-05 | global batch size: 256 | lm loss: 2.646084E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.355 | TFLOPs: 42.94 | +7: iteration 72940/ 115203 | consumed samples: 18672640 | consumed tokens: 38241566720 | elapsed time per iteration (s): 0.57 | learning rate: 7.440E-05 | global batch size: 256 | lm loss: 2.640642E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.332 | TFLOPs: 43.03 | +7: iteration 72950/ 115203 | consumed samples: 18675200 | consumed tokens: 38246809600 | elapsed time per iteration (s): 0.56 | learning rate: 7.438E-05 | global batch size: 256 | lm loss: 2.653448E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.999 | TFLOPs: 43.57 | +7: iteration 72960/ 115203 | consumed samples: 18677760 | consumed tokens: 38252052480 | elapsed time per iteration (s): 0.56 | learning rate: 7.436E-05 | global batch size: 256 | lm loss: 2.633419E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.806 | TFLOPs: 43.36 | +7: iteration 72970/ 115203 | consumed samples: 18680320 | consumed tokens: 38257295360 | elapsed time per iteration (s): 0.56 | learning rate: 7.434E-05 | global batch size: 256 | lm loss: 2.650450E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.058 | TFLOPs: 43.38 | +7: iteration 72980/ 115203 | consumed samples: 18682880 | consumed tokens: 38262538240 | elapsed time per iteration (s): 0.57 | learning rate: 7.431E-05 | global batch size: 256 | lm loss: 2.640783E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.319 | TFLOPs: 43.03 | +7: iteration 72990/ 115203 | consumed samples: 18685440 | consumed tokens: 38267781120 | elapsed time per iteration (s): 0.57 | learning rate: 7.429E-05 | global batch size: 256 | lm loss: 2.644024E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.333 | TFLOPs: 43.13 | +7: iteration 73000/ 115203 | consumed samples: 18688000 | consumed tokens: 38273024000 | elapsed time per iteration (s): 0.57 | learning rate: 7.427E-05 | global batch size: 256 | lm loss: 2.653614E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.814 | TFLOPs: 43.17 | +7: iteration 73010/ 115203 | consumed samples: 18690560 | consumed tokens: 38278266880 | elapsed time per iteration (s): 0.57 | learning rate: 7.424E-05 | global batch size: 256 | lm loss: 2.646787E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.785 | TFLOPs: 42.60 | +7: iteration 73020/ 115203 | consumed samples: 18693120 | consumed tokens: 38283509760 | elapsed time per iteration (s): 0.56 | learning rate: 7.422E-05 | global batch size: 256 | lm loss: 2.633774E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.170 | TFLOPs: 43.97 | +7: iteration 73030/ 115203 | consumed samples: 18695680 | consumed tokens: 38288752640 | elapsed time per iteration (s): 0.58 | learning rate: 7.420E-05 | global batch size: 256 | lm loss: 2.654428E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.271 | TFLOPs: 41.88 | +7: iteration 73040/ 115203 | consumed samples: 18698240 | consumed tokens: 38293995520 | elapsed time per iteration (s): 0.57 | learning rate: 7.418E-05 | global batch size: 256 | lm loss: 2.634023E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.690 | TFLOPs: 43.06 | +7: iteration 73050/ 115203 | consumed samples: 18700800 | consumed tokens: 38299238400 | elapsed time per iteration (s): 0.57 | learning rate: 7.415E-05 | global batch size: 256 | lm loss: 2.644299E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.257 | TFLOPs: 42.83 | +7: iteration 73060/ 115203 | consumed samples: 18703360 | consumed tokens: 38304481280 | elapsed time per iteration (s): 0.56 | learning rate: 7.413E-05 | global batch size: 256 | lm loss: 2.632058E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.876 | TFLOPs: 43.37 | +7: iteration 73070/ 115203 | consumed samples: 18705920 | consumed tokens: 38309724160 | elapsed time per iteration (s): 0.56 | learning rate: 7.411E-05 | global batch size: 256 | lm loss: 2.642109E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.679 | TFLOPs: 43.83 | +7: iteration 73080/ 115203 | consumed samples: 18708480 | consumed tokens: 38314967040 | elapsed time per iteration (s): 0.56 | learning rate: 7.409E-05 | global batch size: 256 | lm loss: 2.643246E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.086 | TFLOPs: 43.67 | +7: iteration 73090/ 115203 | consumed samples: 18711040 | consumed tokens: 38320209920 | elapsed time per iteration (s): 0.56 | learning rate: 7.406E-05 | global batch size: 256 | lm loss: 2.653826E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.572 | TFLOPs: 43.53 | +7: iteration 73100/ 115203 | consumed samples: 18713600 | consumed tokens: 38325452800 | elapsed time per iteration (s): 0.57 | learning rate: 7.404E-05 | global batch size: 256 | lm loss: 2.651029E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.902 | TFLOPs: 43.08 | +7: iteration 73110/ 115203 | consumed samples: 18716160 | consumed tokens: 38330695680 | elapsed time per iteration (s): 0.56 | learning rate: 7.402E-05 | global batch size: 256 | lm loss: 2.651748E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.939 | TFLOPs: 43.28 | +7: iteration 73120/ 115203 | consumed samples: 18718720 | consumed tokens: 38335938560 | elapsed time per iteration (s): 0.57 | learning rate: 7.399E-05 | global batch size: 256 | lm loss: 2.648233E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.428 | TFLOPs: 43.13 | +7: iteration 73130/ 115203 | consumed samples: 18721280 | consumed tokens: 38341181440 | elapsed time per iteration (s): 0.58 | learning rate: 7.397E-05 | global batch size: 256 | lm loss: 2.645895E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.837 | TFLOPs: 42.03 | +7: iteration 73140/ 115203 | consumed samples: 18723840 | consumed tokens: 38346424320 | elapsed time per iteration (s): 0.56 | learning rate: 7.395E-05 | global batch size: 256 | lm loss: 2.641189E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.025 | TFLOPs: 43.29 | +7: iteration 73150/ 115203 | consumed samples: 18726400 | consumed tokens: 38351667200 | elapsed time per iteration (s): 0.57 | learning rate: 7.393E-05 | global batch size: 256 | lm loss: 2.643906E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.579 | TFLOPs: 42.48 | +7: iteration 73160/ 115203 | consumed samples: 18728960 | consumed tokens: 38356910080 | elapsed time per iteration (s): 0.57 | learning rate: 7.390E-05 | global batch size: 256 | lm loss: 2.644733E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.922 | TFLOPs: 42.51 | +7: iteration 73170/ 115203 | consumed samples: 18731520 | consumed tokens: 38362152960 | elapsed time per iteration (s): 0.57 | learning rate: 7.388E-05 | global batch size: 256 | lm loss: 2.643784E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.599 | TFLOPs: 42.48 | +7: iteration 73180/ 115203 | consumed samples: 18734080 | consumed tokens: 38367395840 | elapsed time per iteration (s): 0.57 | learning rate: 7.386E-05 | global batch size: 256 | lm loss: 2.634976E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.716 | TFLOPs: 42.97 | +7: iteration 73190/ 115203 | consumed samples: 18736640 | consumed tokens: 38372638720 | elapsed time per iteration (s): 0.56 | learning rate: 7.384E-05 | global batch size: 256 | lm loss: 2.639766E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.778 | TFLOPs: 43.36 | +7: iteration 73200/ 115203 | consumed samples: 18739200 | consumed tokens: 38377881600 | elapsed time per iteration (s): 0.56 | learning rate: 7.381E-05 | global batch size: 256 | lm loss: 2.652546E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.308 | TFLOPs: 43.89 | +7: iteration 73210/ 115203 | consumed samples: 18741760 | consumed tokens: 38383124480 | elapsed time per iteration (s): 0.57 | learning rate: 7.379E-05 | global batch size: 256 | lm loss: 2.645939E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.356 | TFLOPs: 42.56 | +7: iteration 73220/ 115203 | consumed samples: 18744320 | consumed tokens: 38388367360 | elapsed time per iteration (s): 0.57 | learning rate: 7.377E-05 | global batch size: 256 | lm loss: 2.653210E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.450 | TFLOPs: 42.95 | +7: iteration 73230/ 115203 | consumed samples: 18746880 | consumed tokens: 38393610240 | elapsed time per iteration (s): 0.57 | learning rate: 7.374E-05 | global batch size: 256 | lm loss: 2.647641E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.268 | TFLOPs: 42.93 | +7: iteration 73240/ 115203 | consumed samples: 18749440 | consumed tokens: 38398853120 | elapsed time per iteration (s): 0.56 | learning rate: 7.372E-05 | global batch size: 256 | lm loss: 2.649716E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.125 | TFLOPs: 43.68 | +7: iteration 73250/ 115203 | consumed samples: 18752000 | consumed tokens: 38404096000 | elapsed time per iteration (s): 0.55 | learning rate: 7.370E-05 | global batch size: 256 | lm loss: 2.646691E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.306 | TFLOPs: 43.98 | +7: iteration 73260/ 115203 | consumed samples: 18754560 | consumed tokens: 38409338880 | elapsed time per iteration (s): 0.57 | learning rate: 7.368E-05 | global batch size: 256 | lm loss: 2.648770E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.297 | TFLOPs: 43.12 | +7: iteration 73270/ 115203 | consumed samples: 18757120 | consumed tokens: 38414581760 | elapsed time per iteration (s): 0.56 | learning rate: 7.365E-05 | global batch size: 256 | lm loss: 2.644522E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.321 | TFLOPs: 43.31 | +7: iteration 73280/ 115203 | consumed samples: 18759680 | consumed tokens: 38419824640 | elapsed time per iteration (s): 0.57 | learning rate: 7.363E-05 | global batch size: 256 | lm loss: 2.647223E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.050 | TFLOPs: 42.81 | +7: iteration 73290/ 115203 | consumed samples: 18762240 | consumed tokens: 38425067520 | elapsed time per iteration (s): 0.58 | learning rate: 7.361E-05 | global batch size: 256 | lm loss: 2.631415E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.901 | TFLOPs: 42.32 | +7: iteration 73300/ 115203 | consumed samples: 18764800 | consumed tokens: 38430310400 | elapsed time per iteration (s): 0.56 | learning rate: 7.359E-05 | global batch size: 256 | lm loss: 2.627520E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.120 | TFLOPs: 43.96 | +7: iteration 73310/ 115203 | consumed samples: 18767360 | consumed tokens: 38435553280 | elapsed time per iteration (s): 0.57 | learning rate: 7.356E-05 | global batch size: 256 | lm loss: 2.649343E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.606 | TFLOPs: 43.15 | +7: iteration 73320/ 115203 | consumed samples: 18769920 | consumed tokens: 38440796160 | elapsed time per iteration (s): 0.56 | learning rate: 7.354E-05 | global batch size: 256 | lm loss: 2.635334E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.176 | TFLOPs: 43.21 | +7: iteration 73330/ 115203 | consumed samples: 18772480 | consumed tokens: 38446039040 | elapsed time per iteration (s): 0.55 | learning rate: 7.352E-05 | global batch size: 256 | lm loss: 2.633194E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.480 | TFLOPs: 44.00 | +7: iteration 73340/ 115203 | consumed samples: 18775040 | consumed tokens: 38451281920 | elapsed time per iteration (s): 0.55 | learning rate: 7.350E-05 | global batch size: 256 | lm loss: 2.641031E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.465 | TFLOPs: 44.00 | +7: iteration 73350/ 115203 | consumed samples: 18777600 | consumed tokens: 38456524800 | elapsed time per iteration (s): 0.56 | learning rate: 7.347E-05 | global batch size: 256 | lm loss: 2.639560E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.908 | TFLOPs: 43.56 | +7: iteration 73360/ 115203 | consumed samples: 18780160 | consumed tokens: 38461767680 | elapsed time per iteration (s): 0.57 | learning rate: 7.345E-05 | global batch size: 256 | lm loss: 2.655696E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.710 | TFLOPs: 43.16 | +7: iteration 73370/ 115203 | consumed samples: 18782720 | consumed tokens: 38467010560 | elapsed time per iteration (s): 0.56 | learning rate: 7.343E-05 | global batch size: 256 | lm loss: 2.644504E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.361 | TFLOPs: 43.60 | +7: iteration 73380/ 115203 | consumed samples: 18785280 | consumed tokens: 38472253440 | elapsed time per iteration (s): 0.55 | learning rate: 7.340E-05 | global batch size: 256 | lm loss: 2.638422E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.427 | TFLOPs: 43.99 | +7: iteration 73390/ 115203 | consumed samples: 18787840 | consumed tokens: 38477496320 | elapsed time per iteration (s): 0.57 | learning rate: 7.338E-05 | global batch size: 256 | lm loss: 2.636142E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.185 | TFLOPs: 42.73 | +7: iteration 73400/ 115203 | consumed samples: 18790400 | consumed tokens: 38482739200 | elapsed time per iteration (s): 0.56 | learning rate: 7.336E-05 | global batch size: 256 | lm loss: 2.642722E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.996 | TFLOPs: 43.66 | +7: iteration 73410/ 115203 | consumed samples: 18792960 | consumed tokens: 38487982080 | elapsed time per iteration (s): 0.55 | learning rate: 7.334E-05 | global batch size: 256 | lm loss: 2.641874E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.405 | TFLOPs: 43.99 | +7: iteration 73420/ 115203 | consumed samples: 18795520 | consumed tokens: 38493224960 | elapsed time per iteration (s): 0.56 | learning rate: 7.331E-05 | global batch size: 256 | lm loss: 2.644385E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.472 | TFLOPs: 43.33 | +7: iteration 73430/ 115203 | consumed samples: 18798080 | consumed tokens: 38498467840 | elapsed time per iteration (s): 0.56 | learning rate: 7.329E-05 | global batch size: 256 | lm loss: 2.640740E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.878 | TFLOPs: 43.56 | +7: iteration 73440/ 115203 | consumed samples: 18800640 | consumed tokens: 38503710720 | elapsed time per iteration (s): 0.56 | learning rate: 7.327E-05 | global batch size: 256 | lm loss: 2.633068E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.082 | TFLOPs: 43.39 | +7: iteration 73450/ 115203 | consumed samples: 18803200 | consumed tokens: 38508953600 | elapsed time per iteration (s): 0.56 | learning rate: 7.325E-05 | global batch size: 256 | lm loss: 2.629865E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.985 | TFLOPs: 43.57 | +7: iteration 73460/ 115203 | consumed samples: 18805760 | consumed tokens: 38514196480 | elapsed time per iteration (s): 0.56 | learning rate: 7.322E-05 | global batch size: 256 | lm loss: 2.649524E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.832 | TFLOPs: 43.36 | +7: iteration 73470/ 115203 | consumed samples: 18808320 | consumed tokens: 38519439360 | elapsed time per iteration (s): 0.56 | learning rate: 7.320E-05 | global batch size: 256 | lm loss: 2.650422E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.380 | TFLOPs: 43.70 | +7: iteration 73480/ 115203 | consumed samples: 18810880 | consumed tokens: 38524682240 | elapsed time per iteration (s): 0.55 | learning rate: 7.318E-05 | global batch size: 256 | lm loss: 2.624135E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.498 | TFLOPs: 44.00 | +7: iteration 73490/ 115203 | consumed samples: 18813440 | consumed tokens: 38529925120 | elapsed time per iteration (s): 0.55 | learning rate: 7.316E-05 | global batch size: 256 | lm loss: 2.634438E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.413 | TFLOPs: 43.99 | +7: iteration 73500/ 115203 | consumed samples: 18816000 | consumed tokens: 38535168000 | elapsed time per iteration (s): 0.56 | learning rate: 7.313E-05 | global batch size: 256 | lm loss: 2.636603E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.036 | TFLOPs: 43.38 | +7: iteration 73510/ 115203 | consumed samples: 18818560 | consumed tokens: 38540410880 | elapsed time per iteration (s): 0.55 | learning rate: 7.311E-05 | global batch size: 256 | lm loss: 2.648339E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.358 | TFLOPs: 43.99 | +7: iteration 73520/ 115203 | consumed samples: 18821120 | consumed tokens: 38545653760 | elapsed time per iteration (s): 0.57 | learning rate: 7.309E-05 | global batch size: 256 | lm loss: 2.640528E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.014 | TFLOPs: 43.00 | +7: iteration 73530/ 115203 | consumed samples: 18823680 | consumed tokens: 38550896640 | elapsed time per iteration (s): 0.56 | learning rate: 7.307E-05 | global batch size: 256 | lm loss: 2.650428E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.257 | TFLOPs: 43.98 | +7: iteration 73540/ 115203 | consumed samples: 18826240 | consumed tokens: 38556139520 | elapsed time per iteration (s): 0.55 | learning rate: 7.304E-05 | global batch size: 256 | lm loss: 2.632565E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.385 | TFLOPs: 43.99 | +7: iteration 73550/ 115203 | consumed samples: 18828800 | consumed tokens: 38561382400 | elapsed time per iteration (s): 0.56 | learning rate: 7.302E-05 | global batch size: 256 | lm loss: 2.652614E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.470 | TFLOPs: 43.71 | +7: iteration 73560/ 115203 | consumed samples: 18831360 | consumed tokens: 38566625280 | elapsed time per iteration (s): 0.55 | learning rate: 7.300E-05 | global batch size: 256 | lm loss: 2.637738E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 73570/ 115203 | consumed samples: 18833920 | consumed tokens: 38571868160 | elapsed time per iteration (s): 0.55 | learning rate: 7.297E-05 | global batch size: 256 | lm loss: 2.635888E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 73580/ 115203 | consumed samples: 18836480 | consumed tokens: 38577111040 | elapsed time per iteration (s): 0.56 | learning rate: 7.295E-05 | global batch size: 256 | lm loss: 2.649208E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.373 | TFLOPs: 43.32 | +7: iteration 73590/ 115203 | consumed samples: 18839040 | consumed tokens: 38582353920 | elapsed time per iteration (s): 0.56 | learning rate: 7.293E-05 | global batch size: 256 | lm loss: 2.650460E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.253 | TFLOPs: 43.98 | +7: iteration 73600/ 115203 | consumed samples: 18841600 | consumed tokens: 38587596800 | elapsed time per iteration (s): 0.57 | learning rate: 7.291E-05 | global batch size: 256 | lm loss: 2.641925E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.799 | TFLOPs: 43.17 | +7: iteration 73610/ 115203 | consumed samples: 18844160 | consumed tokens: 38592839680 | elapsed time per iteration (s): 0.56 | learning rate: 7.288E-05 | global batch size: 256 | lm loss: 2.645925E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.325 | TFLOPs: 43.70 | +7: iteration 73620/ 115203 | consumed samples: 18846720 | consumed tokens: 38598082560 | elapsed time per iteration (s): 0.56 | learning rate: 7.286E-05 | global batch size: 256 | lm loss: 2.641410E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.141 | TFLOPs: 43.39 | +7: iteration 73630/ 115203 | consumed samples: 18849280 | consumed tokens: 38603325440 | elapsed time per iteration (s): 0.56 | learning rate: 7.284E-05 | global batch size: 256 | lm loss: 2.645872E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.053 | TFLOPs: 43.48 | +7: iteration 73640/ 115203 | consumed samples: 18851840 | consumed tokens: 38608568320 | elapsed time per iteration (s): 0.56 | learning rate: 7.282E-05 | global batch size: 256 | lm loss: 2.634026E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.676 | TFLOPs: 43.44 | +7: iteration 73650/ 115203 | consumed samples: 18854400 | consumed tokens: 38613811200 | elapsed time per iteration (s): 0.55 | learning rate: 7.279E-05 | global batch size: 256 | lm loss: 2.638956E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 73660/ 115203 | consumed samples: 18856960 | consumed tokens: 38619054080 | elapsed time per iteration (s): 0.58 | learning rate: 7.277E-05 | global batch size: 256 | lm loss: 2.651429E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.781 | TFLOPs: 42.02 | +7: iteration 73670/ 115203 | consumed samples: 18859520 | consumed tokens: 38624296960 | elapsed time per iteration (s): 0.56 | learning rate: 7.275E-05 | global batch size: 256 | lm loss: 2.632276E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.491 | TFLOPs: 43.24 | +7: iteration 73680/ 115203 | consumed samples: 18862080 | consumed tokens: 38629539840 | elapsed time per iteration (s): 0.56 | learning rate: 7.273E-05 | global batch size: 256 | lm loss: 2.646495E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.067 | TFLOPs: 43.39 | +7: iteration 73690/ 115203 | consumed samples: 18864640 | consumed tokens: 38634782720 | elapsed time per iteration (s): 0.57 | learning rate: 7.270E-05 | global batch size: 256 | lm loss: 2.627473E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.160 | TFLOPs: 43.01 | +7: iteration 73700/ 115203 | consumed samples: 18867200 | consumed tokens: 38640025600 | elapsed time per iteration (s): 0.57 | learning rate: 7.268E-05 | global batch size: 256 | lm loss: 2.627179E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.095 | TFLOPs: 42.91 | +7: iteration 73710/ 115203 | consumed samples: 18869760 | consumed tokens: 38645268480 | elapsed time per iteration (s): 0.56 | learning rate: 7.266E-05 | global batch size: 256 | lm loss: 2.638349E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.903 | TFLOPs: 43.27 | +7: iteration 73720/ 115203 | consumed samples: 18872320 | consumed tokens: 38650511360 | elapsed time per iteration (s): 0.57 | learning rate: 7.264E-05 | global batch size: 256 | lm loss: 2.624846E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.923 | TFLOPs: 42.99 | +7: iteration 73730/ 115203 | consumed samples: 18874880 | consumed tokens: 38655754240 | elapsed time per iteration (s): 0.57 | learning rate: 7.261E-05 | global batch size: 256 | lm loss: 2.647265E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.912 | TFLOPs: 42.89 | +7: iteration 73740/ 115203 | consumed samples: 18877440 | consumed tokens: 38660997120 | elapsed time per iteration (s): 0.58 | learning rate: 7.259E-05 | global batch size: 256 | lm loss: 2.632520E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.972 | TFLOPs: 42.42 | +7: iteration 73750/ 115203 | consumed samples: 18880000 | consumed tokens: 38666240000 | elapsed time per iteration (s): 0.56 | learning rate: 7.257E-05 | global batch size: 256 | lm loss: 2.639698E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.736 | TFLOPs: 43.45 | +7: iteration 73760/ 115203 | consumed samples: 18882560 | consumed tokens: 38671482880 | elapsed time per iteration (s): 0.55 | learning rate: 7.255E-05 | global batch size: 256 | lm loss: 2.635901E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.665 | TFLOPs: 44.01 | +7: iteration 73770/ 115203 | consumed samples: 18885120 | consumed tokens: 38676725760 | elapsed time per iteration (s): 0.57 | learning rate: 7.252E-05 | global batch size: 256 | lm loss: 2.645984E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.892 | TFLOPs: 43.08 | +7: iteration 73780/ 115203 | consumed samples: 18887680 | consumed tokens: 38681968640 | elapsed time per iteration (s): 0.56 | learning rate: 7.250E-05 | global batch size: 256 | lm loss: 2.641339E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.323 | TFLOPs: 43.22 | +7: iteration 73790/ 115203 | consumed samples: 18890240 | consumed tokens: 38687211520 | elapsed time per iteration (s): 0.59 | learning rate: 7.248E-05 | global batch size: 256 | lm loss: 2.637389E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.729 | TFLOPs: 41.64 | +7: iteration 73800/ 115203 | consumed samples: 18892800 | consumed tokens: 38692454400 | elapsed time per iteration (s): 0.56 | learning rate: 7.246E-05 | global batch size: 256 | lm loss: 2.634081E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.854 | TFLOPs: 43.46 | +7: iteration 73810/ 115203 | consumed samples: 18895360 | consumed tokens: 38697697280 | elapsed time per iteration (s): 0.56 | learning rate: 7.243E-05 | global batch size: 256 | lm loss: 2.644045E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.248 | TFLOPs: 43.31 | +7: iteration 73820/ 115203 | consumed samples: 18897920 | consumed tokens: 38702940160 | elapsed time per iteration (s): 0.57 | learning rate: 7.241E-05 | global batch size: 256 | lm loss: 2.645754E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.829 | TFLOPs: 42.60 | +7: iteration 73830/ 115203 | consumed samples: 18900480 | consumed tokens: 38708183040 | elapsed time per iteration (s): 0.57 | learning rate: 7.239E-05 | global batch size: 256 | lm loss: 2.627074E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.438 | TFLOPs: 42.85 | +7: iteration 73840/ 115203 | consumed samples: 18903040 | consumed tokens: 38713425920 | elapsed time per iteration (s): 0.57 | learning rate: 7.237E-05 | global batch size: 256 | lm loss: 2.634854E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.553 | TFLOPs: 42.67 | +7: iteration 73850/ 115203 | consumed samples: 18905600 | consumed tokens: 38718668800 | elapsed time per iteration (s): 0.56 | learning rate: 7.234E-05 | global batch size: 256 | lm loss: 2.641139E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.671 | TFLOPs: 43.63 | +7: iteration 73860/ 115203 | consumed samples: 18908160 | consumed tokens: 38723911680 | elapsed time per iteration (s): 0.58 | learning rate: 7.232E-05 | global batch size: 256 | lm loss: 2.626739E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.700 | TFLOPs: 41.92 | +7: iteration 73870/ 115203 | consumed samples: 18910720 | consumed tokens: 38729154560 | elapsed time per iteration (s): 0.56 | learning rate: 7.230E-05 | global batch size: 256 | lm loss: 2.636166E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.620 | TFLOPs: 43.25 | +7: iteration 73880/ 115203 | consumed samples: 18913280 | consumed tokens: 38734397440 | elapsed time per iteration (s): 0.56 | learning rate: 7.228E-05 | global batch size: 256 | lm loss: 2.642184E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.622 | TFLOPs: 43.34 | +7: iteration 73890/ 115203 | consumed samples: 18915840 | consumed tokens: 38739640320 | elapsed time per iteration (s): 0.58 | learning rate: 7.225E-05 | global batch size: 256 | lm loss: 2.638413E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.584 | TFLOPs: 42.39 | +7: iteration 73900/ 115203 | consumed samples: 18918400 | consumed tokens: 38744883200 | elapsed time per iteration (s): 0.57 | learning rate: 7.223E-05 | global batch size: 256 | lm loss: 2.649694E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.181 | TFLOPs: 43.02 | +7: iteration 73910/ 115203 | consumed samples: 18920960 | consumed tokens: 38750126080 | elapsed time per iteration (s): 0.57 | learning rate: 7.221E-05 | global batch size: 256 | lm loss: 2.643505E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.812 | TFLOPs: 43.17 | +7: iteration 73920/ 115203 | consumed samples: 18923520 | consumed tokens: 38755368960 | elapsed time per iteration (s): 0.56 | learning rate: 7.219E-05 | global batch size: 256 | lm loss: 2.648297E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.479 | TFLOPs: 43.52 | +7: iteration 73930/ 115203 | consumed samples: 18926080 | consumed tokens: 38760611840 | elapsed time per iteration (s): 0.56 | learning rate: 7.216E-05 | global batch size: 256 | lm loss: 2.638722E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.839 | TFLOPs: 43.46 | +7: iteration 73940/ 115203 | consumed samples: 18928640 | consumed tokens: 38765854720 | elapsed time per iteration (s): 0.57 | learning rate: 7.214E-05 | global batch size: 256 | lm loss: 2.633294E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.379 | TFLOPs: 42.75 | +7: iteration 73950/ 115203 | consumed samples: 18931200 | consumed tokens: 38771097600 | elapsed time per iteration (s): 0.57 | learning rate: 7.212E-05 | global batch size: 256 | lm loss: 2.642926E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.876 | TFLOPs: 42.51 | +7: iteration 73960/ 115203 | consumed samples: 18933760 | consumed tokens: 38776340480 | elapsed time per iteration (s): 0.58 | learning rate: 7.210E-05 | global batch size: 256 | lm loss: 2.647182E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.948 | TFLOPs: 41.75 | +7: iteration 73970/ 115203 | consumed samples: 18936320 | consumed tokens: 38781583360 | elapsed time per iteration (s): 0.56 | learning rate: 7.207E-05 | global batch size: 256 | lm loss: 2.646238E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.920 | TFLOPs: 43.75 | +7: iteration 73980/ 115203 | consumed samples: 18938880 | consumed tokens: 38786826240 | elapsed time per iteration (s): 0.57 | learning rate: 7.205E-05 | global batch size: 256 | lm loss: 2.629008E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.436 | TFLOPs: 42.94 | +7: iteration 73990/ 115203 | consumed samples: 18941440 | consumed tokens: 38792069120 | elapsed time per iteration (s): 0.57 | learning rate: 7.203E-05 | global batch size: 256 | lm loss: 2.647665E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.086 | TFLOPs: 42.53 | +0: [2023-03-17 00:26:28,804] [INFO] [logging.py:68:log_dist] [Rank 0] step=74000, skipped=0, lr=[7.20058819630707e-05, 7.20058819630707e-05, 7.20058819630707e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 74000/ 115203 | consumed samples: 18944000 | consumed tokens: 38797312000 | elapsed time per iteration (s): 0.58 | learning rate: 7.201E-05 | global batch size: 256 | lm loss: 2.629381E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.794 | TFLOPs: 42.41 | +0: steps: 74000 loss: 2.6224 iter time (s): 0.563 samples/sec: 454.949 +7: iteration 74010/ 115203 | consumed samples: 18946560 | consumed tokens: 38802554880 | elapsed time per iteration (s): 0.56 | learning rate: 7.198E-05 | global batch size: 256 | lm loss: 2.638954E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.533 | TFLOPs: 43.33 | +7: iteration 74020/ 115203 | consumed samples: 18949120 | consumed tokens: 38807797760 | elapsed time per iteration (s): 0.55 | learning rate: 7.196E-05 | global batch size: 256 | lm loss: 2.620953E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.306 | TFLOPs: 43.98 | +7: iteration 74030/ 115203 | consumed samples: 18951680 | consumed tokens: 38813040640 | elapsed time per iteration (s): 0.55 | learning rate: 7.194E-05 | global batch size: 256 | lm loss: 2.631379E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.490 | TFLOPs: 44.00 | +7: iteration 74040/ 115203 | consumed samples: 18954240 | consumed tokens: 38818283520 | elapsed time per iteration (s): 0.57 | learning rate: 7.192E-05 | global batch size: 256 | lm loss: 2.631977E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.634 | TFLOPs: 42.87 | +7: iteration 74050/ 115203 | consumed samples: 18956800 | consumed tokens: 38823526400 | elapsed time per iteration (s): 0.58 | learning rate: 7.189E-05 | global batch size: 256 | lm loss: 2.642753E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.906 | TFLOPs: 42.42 | +7: iteration 74060/ 115203 | consumed samples: 18959360 | consumed tokens: 38828769280 | elapsed time per iteration (s): 0.55 | learning rate: 7.187E-05 | global batch size: 256 | lm loss: 2.642771E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.408 | TFLOPs: 43.99 | +7: iteration 74070/ 115203 | consumed samples: 18961920 | consumed tokens: 38834012160 | elapsed time per iteration (s): 0.58 | learning rate: 7.185E-05 | global batch size: 256 | lm loss: 2.640408E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.086 | TFLOPs: 42.43 | +7: iteration 74080/ 115203 | consumed samples: 18964480 | consumed tokens: 38839255040 | elapsed time per iteration (s): 0.55 | learning rate: 7.183E-05 | global batch size: 256 | lm loss: 2.640687E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.358 | TFLOPs: 43.99 | +7: iteration 74090/ 115203 | consumed samples: 18967040 | consumed tokens: 38844497920 | elapsed time per iteration (s): 0.56 | learning rate: 7.180E-05 | global batch size: 256 | lm loss: 2.623049E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.045 | TFLOPs: 43.48 | +7: iteration 74100/ 115203 | consumed samples: 18969600 | consumed tokens: 38849740800 | elapsed time per iteration (s): 0.56 | learning rate: 7.178E-05 | global batch size: 256 | lm loss: 2.641577E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.263 | TFLOPs: 43.88 | +7: iteration 74110/ 115203 | consumed samples: 18972160 | consumed tokens: 38854983680 | elapsed time per iteration (s): 0.57 | learning rate: 7.176E-05 | global batch size: 256 | lm loss: 2.635255E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.430 | TFLOPs: 43.04 | +7: iteration 74120/ 115203 | consumed samples: 18974720 | consumed tokens: 38860226560 | elapsed time per iteration (s): 0.56 | learning rate: 7.174E-05 | global batch size: 256 | lm loss: 2.640568E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.373 | TFLOPs: 43.41 | +7: iteration 74130/ 115203 | consumed samples: 18977280 | consumed tokens: 38865469440 | elapsed time per iteration (s): 0.56 | learning rate: 7.171E-05 | global batch size: 256 | lm loss: 2.649217E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.368 | TFLOPs: 43.89 | +7: iteration 74140/ 115203 | consumed samples: 18979840 | consumed tokens: 38870712320 | elapsed time per iteration (s): 0.55 | learning rate: 7.169E-05 | global batch size: 256 | lm loss: 2.635231E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.595 | TFLOPs: 44.01 | +7: iteration 74150/ 115203 | consumed samples: 18982400 | consumed tokens: 38875955200 | elapsed time per iteration (s): 0.56 | learning rate: 7.167E-05 | global batch size: 256 | lm loss: 2.645588E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.019 | TFLOPs: 43.48 | +7: iteration 74160/ 115203 | consumed samples: 18984960 | consumed tokens: 38881198080 | elapsed time per iteration (s): 0.55 | learning rate: 7.165E-05 | global batch size: 256 | lm loss: 2.635324E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.587 | TFLOPs: 44.01 | +7: iteration 74170/ 115203 | consumed samples: 18987520 | consumed tokens: 38886440960 | elapsed time per iteration (s): 0.56 | learning rate: 7.162E-05 | global batch size: 256 | lm loss: 2.641329E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.715 | TFLOPs: 43.45 | +7: iteration 74180/ 115203 | consumed samples: 18990080 | consumed tokens: 38891683840 | elapsed time per iteration (s): 0.56 | learning rate: 7.160E-05 | global batch size: 256 | lm loss: 2.637022E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.491 | TFLOPs: 43.52 | +7: iteration 74190/ 115203 | consumed samples: 18992640 | consumed tokens: 38896926720 | elapsed time per iteration (s): 0.57 | learning rate: 7.158E-05 | global batch size: 256 | lm loss: 2.636445E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.988 | TFLOPs: 42.71 | +7: iteration 74200/ 115203 | consumed samples: 18995200 | consumed tokens: 38902169600 | elapsed time per iteration (s): 0.56 | learning rate: 7.156E-05 | global batch size: 256 | lm loss: 2.647605E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.369 | TFLOPs: 43.61 | +7: iteration 74210/ 115203 | consumed samples: 18997760 | consumed tokens: 38907412480 | elapsed time per iteration (s): 0.56 | learning rate: 7.153E-05 | global batch size: 256 | lm loss: 2.635870E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.430 | TFLOPs: 43.90 | +7: iteration 74220/ 115203 | consumed samples: 19000320 | consumed tokens: 38912655360 | elapsed time per iteration (s): 0.57 | learning rate: 7.151E-05 | global batch size: 256 | lm loss: 2.638339E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.866 | TFLOPs: 42.99 | +7: iteration 74230/ 115203 | consumed samples: 19002880 | consumed tokens: 38917898240 | elapsed time per iteration (s): 0.56 | learning rate: 7.149E-05 | global batch size: 256 | lm loss: 2.638665E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.777 | TFLOPs: 43.83 | +7: iteration 74240/ 115203 | consumed samples: 19005440 | consumed tokens: 38923141120 | elapsed time per iteration (s): 0.55 | learning rate: 7.147E-05 | global batch size: 256 | lm loss: 2.636503E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.719 | TFLOPs: 44.02 | +7: iteration 74250/ 115203 | consumed samples: 19008000 | consumed tokens: 38928384000 | elapsed time per iteration (s): 0.55 | learning rate: 7.144E-05 | global batch size: 256 | lm loss: 2.617914E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.446 | TFLOPs: 43.99 | +7: iteration 74260/ 115203 | consumed samples: 19010560 | consumed tokens: 38933626880 | elapsed time per iteration (s): 0.57 | learning rate: 7.142E-05 | global batch size: 256 | lm loss: 2.643606E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.255 | TFLOPs: 43.02 | +7: iteration 74270/ 115203 | consumed samples: 19013120 | consumed tokens: 38938869760 | elapsed time per iteration (s): 0.56 | learning rate: 7.140E-05 | global batch size: 256 | lm loss: 2.613447E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.809 | TFLOPs: 43.65 | +7: iteration 74280/ 115203 | consumed samples: 19015680 | consumed tokens: 38944112640 | elapsed time per iteration (s): 0.56 | learning rate: 7.138E-05 | global batch size: 256 | lm loss: 2.632234E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.073 | TFLOPs: 43.48 | +7: iteration 74290/ 115203 | consumed samples: 19018240 | consumed tokens: 38949355520 | elapsed time per iteration (s): 0.58 | learning rate: 7.136E-05 | global batch size: 256 | lm loss: 2.626589E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.043 | TFLOPs: 42.43 | +7: iteration 74300/ 115203 | consumed samples: 19020800 | consumed tokens: 38954598400 | elapsed time per iteration (s): 0.55 | learning rate: 7.133E-05 | global batch size: 256 | lm loss: 2.636804E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.348 | TFLOPs: 43.98 | +7: iteration 74310/ 115203 | consumed samples: 19023360 | consumed tokens: 38959841280 | elapsed time per iteration (s): 0.56 | learning rate: 7.131E-05 | global batch size: 256 | lm loss: 2.638662E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.789 | TFLOPs: 43.36 | +7: iteration 74320/ 115203 | consumed samples: 19025920 | consumed tokens: 38965084160 | elapsed time per iteration (s): 0.57 | learning rate: 7.129E-05 | global batch size: 256 | lm loss: 2.633788E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.584 | TFLOPs: 42.96 | +7: iteration 74330/ 115203 | consumed samples: 19028480 | consumed tokens: 38970327040 | elapsed time per iteration (s): 0.56 | learning rate: 7.127E-05 | global batch size: 256 | lm loss: 2.643572E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.541 | TFLOPs: 43.72 | +7: iteration 74340/ 115203 | consumed samples: 19031040 | consumed tokens: 38975569920 | elapsed time per iteration (s): 0.56 | learning rate: 7.124E-05 | global batch size: 256 | lm loss: 2.640900E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.654 | TFLOPs: 43.63 | +7: iteration 74350/ 115203 | consumed samples: 19033600 | consumed tokens: 38980812800 | elapsed time per iteration (s): 0.56 | learning rate: 7.122E-05 | global batch size: 256 | lm loss: 2.635139E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.100 | TFLOPs: 43.87 | +7: iteration 74360/ 115203 | consumed samples: 19036160 | consumed tokens: 38986055680 | elapsed time per iteration (s): 0.57 | learning rate: 7.120E-05 | global batch size: 256 | lm loss: 2.624684E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.314 | TFLOPs: 42.65 | +7: iteration 74370/ 115203 | consumed samples: 19038720 | consumed tokens: 38991298560 | elapsed time per iteration (s): 0.57 | learning rate: 7.118E-05 | global batch size: 256 | lm loss: 2.635315E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.445 | TFLOPs: 43.14 | +7: iteration 74380/ 115203 | consumed samples: 19041280 | consumed tokens: 38996541440 | elapsed time per iteration (s): 0.56 | learning rate: 7.115E-05 | global batch size: 256 | lm loss: 2.630276E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.671 | TFLOPs: 43.44 | +7: iteration 74390/ 115203 | consumed samples: 19043840 | consumed tokens: 39001784320 | elapsed time per iteration (s): 0.55 | learning rate: 7.113E-05 | global batch size: 256 | lm loss: 2.650966E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.578 | TFLOPs: 44.01 | +7: iteration 74400/ 115203 | consumed samples: 19046400 | consumed tokens: 39007027200 | elapsed time per iteration (s): 0.56 | learning rate: 7.111E-05 | global batch size: 256 | lm loss: 2.654175E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.321 | TFLOPs: 43.89 | +7: iteration 74410/ 115203 | consumed samples: 19048960 | consumed tokens: 39012270080 | elapsed time per iteration (s): 0.57 | learning rate: 7.109E-05 | global batch size: 256 | lm loss: 2.641376E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.557 | TFLOPs: 42.96 | +7: iteration 74420/ 115203 | consumed samples: 19051520 | consumed tokens: 39017512960 | elapsed time per iteration (s): 0.57 | learning rate: 7.106E-05 | global batch size: 256 | lm loss: 2.646380E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.797 | TFLOPs: 43.17 | +7: iteration 74430/ 115203 | consumed samples: 19054080 | consumed tokens: 39022755840 | elapsed time per iteration (s): 0.57 | learning rate: 7.104E-05 | global batch size: 256 | lm loss: 2.649949E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.247 | TFLOPs: 43.02 | +7: iteration 74440/ 115203 | consumed samples: 19056640 | consumed tokens: 39027998720 | elapsed time per iteration (s): 0.57 | learning rate: 7.102E-05 | global batch size: 256 | lm loss: 2.645997E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.150 | TFLOPs: 42.73 | +7: iteration 74450/ 115203 | consumed samples: 19059200 | consumed tokens: 39033241600 | elapsed time per iteration (s): 0.55 | learning rate: 7.100E-05 | global batch size: 256 | lm loss: 2.624482E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.699 | TFLOPs: 44.02 | +7: iteration 74460/ 115203 | consumed samples: 19061760 | consumed tokens: 39038484480 | elapsed time per iteration (s): 0.57 | learning rate: 7.098E-05 | global batch size: 256 | lm loss: 2.637532E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.045 | TFLOPs: 43.19 | +7: iteration 74470/ 115203 | consumed samples: 19064320 | consumed tokens: 39043727360 | elapsed time per iteration (s): 0.56 | learning rate: 7.095E-05 | global batch size: 256 | lm loss: 2.637780E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.682 | TFLOPs: 43.25 | +7: iteration 74480/ 115203 | consumed samples: 19066880 | consumed tokens: 39048970240 | elapsed time per iteration (s): 0.57 | learning rate: 7.093E-05 | global batch size: 256 | lm loss: 2.643439E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.605 | TFLOPs: 43.06 | +7: iteration 74490/ 115203 | consumed samples: 19069440 | consumed tokens: 39054213120 | elapsed time per iteration (s): 0.58 | learning rate: 7.091E-05 | global batch size: 256 | lm loss: 2.636783E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.642 | TFLOPs: 42.11 | +7: iteration 74500/ 115203 | consumed samples: 19072000 | consumed tokens: 39059456000 | elapsed time per iteration (s): 0.56 | learning rate: 7.089E-05 | global batch size: 256 | lm loss: 2.641189E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.445 | TFLOPs: 43.52 | +7: iteration 74510/ 115203 | consumed samples: 19074560 | consumed tokens: 39064698880 | elapsed time per iteration (s): 0.57 | learning rate: 7.086E-05 | global batch size: 256 | lm loss: 2.638482E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.864 | TFLOPs: 42.99 | +7: iteration 74520/ 115203 | consumed samples: 19077120 | consumed tokens: 39069941760 | elapsed time per iteration (s): 0.57 | learning rate: 7.084E-05 | global batch size: 256 | lm loss: 2.625273E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.153 | TFLOPs: 42.54 | +7: iteration 74530/ 115203 | consumed samples: 19079680 | consumed tokens: 39075184640 | elapsed time per iteration (s): 0.55 | learning rate: 7.082E-05 | global batch size: 256 | lm loss: 2.628291E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.560 | TFLOPs: 44.00 | +7: iteration 74540/ 115203 | consumed samples: 19082240 | consumed tokens: 39080427520 | elapsed time per iteration (s): 0.57 | learning rate: 7.080E-05 | global batch size: 256 | lm loss: 2.634619E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.233 | TFLOPs: 43.12 | +7: iteration 74550/ 115203 | consumed samples: 19084800 | consumed tokens: 39085670400 | elapsed time per iteration (s): 0.57 | learning rate: 7.077E-05 | global batch size: 256 | lm loss: 2.630783E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.879 | TFLOPs: 42.99 | +7: iteration 74560/ 115203 | consumed samples: 19087360 | consumed tokens: 39090913280 | elapsed time per iteration (s): 0.56 | learning rate: 7.075E-05 | global batch size: 256 | lm loss: 2.622404E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.618 | TFLOPs: 43.44 | +7: iteration 74570/ 115203 | consumed samples: 19089920 | consumed tokens: 39096156160 | elapsed time per iteration (s): 0.58 | learning rate: 7.073E-05 | global batch size: 256 | lm loss: 2.620840E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.549 | TFLOPs: 41.81 | +7: iteration 74580/ 115203 | consumed samples: 19092480 | consumed tokens: 39101399040 | elapsed time per iteration (s): 0.56 | learning rate: 7.071E-05 | global batch size: 256 | lm loss: 2.631733E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.633 | TFLOPs: 43.73 | +7: iteration 74590/ 115203 | consumed samples: 19095040 | consumed tokens: 39106641920 | elapsed time per iteration (s): 0.57 | learning rate: 7.069E-05 | global batch size: 256 | lm loss: 2.629453E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.939 | TFLOPs: 42.71 | +7: iteration 74600/ 115203 | consumed samples: 19097600 | consumed tokens: 39111884800 | elapsed time per iteration (s): 0.57 | learning rate: 7.066E-05 | global batch size: 256 | lm loss: 2.629410E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.007 | TFLOPs: 43.19 | +7: iteration 74610/ 115203 | consumed samples: 19100160 | consumed tokens: 39117127680 | elapsed time per iteration (s): 0.58 | learning rate: 7.064E-05 | global batch size: 256 | lm loss: 2.624439E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.674 | TFLOPs: 42.30 | +7: iteration 74620/ 115203 | consumed samples: 19102720 | consumed tokens: 39122370560 | elapsed time per iteration (s): 0.56 | learning rate: 7.062E-05 | global batch size: 256 | lm loss: 2.620128E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.229 | TFLOPs: 43.40 | +7: iteration 74630/ 115203 | consumed samples: 19105280 | consumed tokens: 39127613440 | elapsed time per iteration (s): 0.56 | learning rate: 7.060E-05 | global batch size: 256 | lm loss: 2.637658E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.542 | TFLOPs: 43.24 | +7: iteration 74640/ 115203 | consumed samples: 19107840 | consumed tokens: 39132856320 | elapsed time per iteration (s): 0.57 | learning rate: 7.057E-05 | global batch size: 256 | lm loss: 2.632266E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.524 | TFLOPs: 43.14 | +7: iteration 74650/ 115203 | consumed samples: 19110400 | consumed tokens: 39138099200 | elapsed time per iteration (s): 0.55 | learning rate: 7.055E-05 | global batch size: 256 | lm loss: 2.647424E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.680 | TFLOPs: 44.02 | +7: iteration 74660/ 115203 | consumed samples: 19112960 | consumed tokens: 39143342080 | elapsed time per iteration (s): 0.56 | learning rate: 7.053E-05 | global batch size: 256 | lm loss: 2.639986E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.515 | TFLOPs: 43.62 | +7: iteration 74670/ 115203 | consumed samples: 19115520 | consumed tokens: 39148584960 | elapsed time per iteration (s): 0.56 | learning rate: 7.051E-05 | global batch size: 256 | lm loss: 2.631586E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.136 | TFLOPs: 43.58 | +7: iteration 74680/ 115203 | consumed samples: 19118080 | consumed tokens: 39153827840 | elapsed time per iteration (s): 0.55 | learning rate: 7.048E-05 | global batch size: 256 | lm loss: 2.622903E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.824 | TFLOPs: 44.03 | +7: iteration 74690/ 115203 | consumed samples: 19120640 | consumed tokens: 39159070720 | elapsed time per iteration (s): 0.56 | learning rate: 7.046E-05 | global batch size: 256 | lm loss: 2.641183E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.642 | TFLOPs: 43.54 | +7: iteration 74700/ 115203 | consumed samples: 19123200 | consumed tokens: 39164313600 | elapsed time per iteration (s): 0.56 | learning rate: 7.044E-05 | global batch size: 256 | lm loss: 2.638044E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.455 | TFLOPs: 43.71 | +7: iteration 74710/ 115203 | consumed samples: 19125760 | consumed tokens: 39169556480 | elapsed time per iteration (s): 0.56 | learning rate: 7.042E-05 | global batch size: 256 | lm loss: 2.640986E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.633 | TFLOPs: 43.44 | +7: iteration 74720/ 115203 | consumed samples: 19128320 | consumed tokens: 39174799360 | elapsed time per iteration (s): 0.57 | learning rate: 7.040E-05 | global batch size: 256 | lm loss: 2.637099E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.782 | TFLOPs: 43.07 | +7: iteration 74730/ 115203 | consumed samples: 19130880 | consumed tokens: 39180042240 | elapsed time per iteration (s): 0.56 | learning rate: 7.037E-05 | global batch size: 256 | lm loss: 2.630781E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.839 | TFLOPs: 43.75 | +7: iteration 74740/ 115203 | consumed samples: 19133440 | consumed tokens: 39185285120 | elapsed time per iteration (s): 0.56 | learning rate: 7.035E-05 | global batch size: 256 | lm loss: 2.640108E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.874 | TFLOPs: 43.46 | +7: iteration 74750/ 115203 | consumed samples: 19136000 | consumed tokens: 39190528000 | elapsed time per iteration (s): 0.55 | learning rate: 7.033E-05 | global batch size: 256 | lm loss: 2.632641E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.556 | TFLOPs: 44.00 | +7: iteration 74760/ 115203 | consumed samples: 19138560 | consumed tokens: 39195770880 | elapsed time per iteration (s): 0.56 | learning rate: 7.031E-05 | global batch size: 256 | lm loss: 2.650802E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.014 | TFLOPs: 43.57 | +7: iteration 74770/ 115203 | consumed samples: 19141120 | consumed tokens: 39201013760 | elapsed time per iteration (s): 0.55 | learning rate: 7.028E-05 | global batch size: 256 | lm loss: 2.641936E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.451 | TFLOPs: 43.99 | +7: iteration 74780/ 115203 | consumed samples: 19143680 | consumed tokens: 39206256640 | elapsed time per iteration (s): 0.56 | learning rate: 7.026E-05 | global batch size: 256 | lm loss: 2.639127E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.472 | TFLOPs: 43.90 | +7: iteration 74790/ 115203 | consumed samples: 19146240 | consumed tokens: 39211499520 | elapsed time per iteration (s): 0.55 | learning rate: 7.024E-05 | global batch size: 256 | lm loss: 2.636699E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.664 | TFLOPs: 44.01 | +7: iteration 74800/ 115203 | consumed samples: 19148800 | consumed tokens: 39216742400 | elapsed time per iteration (s): 0.55 | learning rate: 7.022E-05 | global batch size: 256 | lm loss: 2.635588E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.507 | TFLOPs: 44.00 | +7: iteration 74810/ 115203 | consumed samples: 19151360 | consumed tokens: 39221985280 | elapsed time per iteration (s): 0.56 | learning rate: 7.020E-05 | global batch size: 256 | lm loss: 2.653249E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.431 | TFLOPs: 43.80 | +7: iteration 74820/ 115203 | consumed samples: 19153920 | consumed tokens: 39227228160 | elapsed time per iteration (s): 0.56 | learning rate: 7.017E-05 | global batch size: 256 | lm loss: 2.629954E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.582 | TFLOPs: 43.72 | +7: iteration 74830/ 115203 | consumed samples: 19156480 | consumed tokens: 39232471040 | elapsed time per iteration (s): 0.55 | learning rate: 7.015E-05 | global batch size: 256 | lm loss: 2.629358E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.599 | TFLOPs: 44.01 | +7: iteration 74840/ 115203 | consumed samples: 19159040 | consumed tokens: 39237713920 | elapsed time per iteration (s): 0.56 | learning rate: 7.013E-05 | global batch size: 256 | lm loss: 2.628525E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.905 | TFLOPs: 43.47 | +7: iteration 74850/ 115203 | consumed samples: 19161600 | consumed tokens: 39242956800 | elapsed time per iteration (s): 0.56 | learning rate: 7.011E-05 | global batch size: 256 | lm loss: 2.638671E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.188 | TFLOPs: 43.21 | +7: iteration 74860/ 115203 | consumed samples: 19164160 | consumed tokens: 39248199680 | elapsed time per iteration (s): 0.55 | learning rate: 7.008E-05 | global batch size: 256 | lm loss: 2.633047E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.582 | TFLOPs: 44.01 | +7: iteration 74870/ 115203 | consumed samples: 19166720 | consumed tokens: 39253442560 | elapsed time per iteration (s): 0.55 | learning rate: 7.006E-05 | global batch size: 256 | lm loss: 2.622524E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 74880/ 115203 | consumed samples: 19169280 | consumed tokens: 39258685440 | elapsed time per iteration (s): 0.55 | learning rate: 7.004E-05 | global batch size: 256 | lm loss: 2.633950E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 74890/ 115203 | consumed samples: 19171840 | consumed tokens: 39263928320 | elapsed time per iteration (s): 0.55 | learning rate: 7.002E-05 | global batch size: 256 | lm loss: 2.633424E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.499 | TFLOPs: 44.00 | +7: iteration 74900/ 115203 | consumed samples: 19174400 | consumed tokens: 39269171200 | elapsed time per iteration (s): 0.56 | learning rate: 7.000E-05 | global batch size: 256 | lm loss: 2.641549E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.324 | TFLOPs: 43.31 | +7: iteration 74910/ 115203 | consumed samples: 19176960 | consumed tokens: 39274414080 | elapsed time per iteration (s): 0.56 | learning rate: 6.997E-05 | global batch size: 256 | lm loss: 2.625599E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.942 | TFLOPs: 43.66 | +7: iteration 74920/ 115203 | consumed samples: 19179520 | consumed tokens: 39279656960 | elapsed time per iteration (s): 0.55 | learning rate: 6.995E-05 | global batch size: 256 | lm loss: 2.627543E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.550 | TFLOPs: 44.00 | +7: iteration 74930/ 115203 | consumed samples: 19182080 | consumed tokens: 39284899840 | elapsed time per iteration (s): 0.56 | learning rate: 6.993E-05 | global batch size: 256 | lm loss: 2.635013E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.350 | TFLOPs: 43.51 | +7: iteration 74940/ 115203 | consumed samples: 19184640 | consumed tokens: 39290142720 | elapsed time per iteration (s): 0.56 | learning rate: 6.991E-05 | global batch size: 256 | lm loss: 2.636987E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.081 | TFLOPs: 43.58 | +7: iteration 74950/ 115203 | consumed samples: 19187200 | consumed tokens: 39295385600 | elapsed time per iteration (s): 0.55 | learning rate: 6.988E-05 | global batch size: 256 | lm loss: 2.645581E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 74960/ 115203 | consumed samples: 19189760 | consumed tokens: 39300628480 | elapsed time per iteration (s): 0.56 | learning rate: 6.986E-05 | global batch size: 256 | lm loss: 2.637240E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.897 | TFLOPs: 43.66 | +7: iteration 74970/ 115203 | consumed samples: 19192320 | consumed tokens: 39305871360 | elapsed time per iteration (s): 0.55 | learning rate: 6.984E-05 | global batch size: 256 | lm loss: 2.624160E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.472 | TFLOPs: 44.00 | +7: iteration 74980/ 115203 | consumed samples: 19194880 | consumed tokens: 39311114240 | elapsed time per iteration (s): 0.55 | learning rate: 6.982E-05 | global batch size: 256 | lm loss: 2.632592E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.262 | TFLOPs: 43.98 | +7: iteration 74990/ 115203 | consumed samples: 19197440 | consumed tokens: 39316357120 | elapsed time per iteration (s): 0.57 | learning rate: 6.980E-05 | global batch size: 256 | lm loss: 2.638540E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.376 | TFLOPs: 43.13 | +7: iteration 75000/ 115203 | consumed samples: 19200000 | consumed tokens: 39321600000 | elapsed time per iteration (s): 0.56 | learning rate: 6.977E-05 | global batch size: 256 | lm loss: 2.643048E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.448 | TFLOPs: 43.61 | +7: iteration 75010/ 115203 | consumed samples: 19202560 | consumed tokens: 39326842880 | elapsed time per iteration (s): 0.56 | learning rate: 6.975E-05 | global batch size: 256 | lm loss: 2.628556E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.855 | TFLOPs: 43.65 | +7: iteration 75020/ 115203 | consumed samples: 19205120 | consumed tokens: 39332085760 | elapsed time per iteration (s): 0.56 | learning rate: 6.973E-05 | global batch size: 256 | lm loss: 2.628909E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.745 | TFLOPs: 43.55 | +7: iteration 75030/ 115203 | consumed samples: 19207680 | consumed tokens: 39337328640 | elapsed time per iteration (s): 0.56 | learning rate: 6.971E-05 | global batch size: 256 | lm loss: 2.646075E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.717 | TFLOPs: 43.64 | +7: iteration 75040/ 115203 | consumed samples: 19210240 | consumed tokens: 39342571520 | elapsed time per iteration (s): 0.55 | learning rate: 6.968E-05 | global batch size: 256 | lm loss: 2.643596E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 75050/ 115203 | consumed samples: 19212800 | consumed tokens: 39347814400 | elapsed time per iteration (s): 0.55 | learning rate: 6.966E-05 | global batch size: 256 | lm loss: 2.638353E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.626 | TFLOPs: 44.01 | +7: iteration 75060/ 115203 | consumed samples: 19215360 | consumed tokens: 39353057280 | elapsed time per iteration (s): 0.55 | learning rate: 6.964E-05 | global batch size: 256 | lm loss: 2.624883E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.574 | TFLOPs: 44.01 | +7: iteration 75070/ 115203 | consumed samples: 19217920 | consumed tokens: 39358300160 | elapsed time per iteration (s): 0.56 | learning rate: 6.962E-05 | global batch size: 256 | lm loss: 2.636884E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.013 | TFLOPs: 43.67 | +7: iteration 75080/ 115203 | consumed samples: 19220480 | consumed tokens: 39363543040 | elapsed time per iteration (s): 0.56 | learning rate: 6.960E-05 | global batch size: 256 | lm loss: 2.644745E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.849 | TFLOPs: 43.46 | +7: iteration 75090/ 115203 | consumed samples: 19223040 | consumed tokens: 39368785920 | elapsed time per iteration (s): 0.56 | learning rate: 6.957E-05 | global batch size: 256 | lm loss: 2.634826E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.180 | TFLOPs: 43.97 | +7: iteration 75100/ 115203 | consumed samples: 19225600 | consumed tokens: 39374028800 | elapsed time per iteration (s): 0.56 | learning rate: 6.955E-05 | global batch size: 256 | lm loss: 2.641481E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.117 | TFLOPs: 43.96 | +7: iteration 75110/ 115203 | consumed samples: 19228160 | consumed tokens: 39379271680 | elapsed time per iteration (s): 0.55 | learning rate: 6.953E-05 | global batch size: 256 | lm loss: 2.626569E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.698 | TFLOPs: 44.02 | +7: iteration 75120/ 115203 | consumed samples: 19230720 | consumed tokens: 39384514560 | elapsed time per iteration (s): 0.56 | learning rate: 6.951E-05 | global batch size: 256 | lm loss: 2.620545E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.870 | TFLOPs: 43.56 | +7: iteration 75130/ 115203 | consumed samples: 19233280 | consumed tokens: 39389757440 | elapsed time per iteration (s): 0.55 | learning rate: 6.949E-05 | global batch size: 256 | lm loss: 2.638326E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 75140/ 115203 | consumed samples: 19235840 | consumed tokens: 39395000320 | elapsed time per iteration (s): 0.56 | learning rate: 6.946E-05 | global batch size: 256 | lm loss: 2.634961E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.765 | TFLOPs: 43.55 | +7: iteration 75150/ 115203 | consumed samples: 19238400 | consumed tokens: 39400243200 | elapsed time per iteration (s): 0.55 | learning rate: 6.944E-05 | global batch size: 256 | lm loss: 2.639042E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 75160/ 115203 | consumed samples: 19240960 | consumed tokens: 39405486080 | elapsed time per iteration (s): 0.55 | learning rate: 6.942E-05 | global batch size: 256 | lm loss: 2.633375E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.428 | TFLOPs: 43.99 | +7: iteration 75170/ 115203 | consumed samples: 19243520 | consumed tokens: 39410728960 | elapsed time per iteration (s): 0.55 | learning rate: 6.940E-05 | global batch size: 256 | lm loss: 2.635867E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.710 | TFLOPs: 44.02 | +7: iteration 75180/ 115203 | consumed samples: 19246080 | consumed tokens: 39415971840 | elapsed time per iteration (s): 0.55 | learning rate: 6.937E-05 | global batch size: 256 | lm loss: 2.643727E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.564 | TFLOPs: 44.01 | +7: iteration 75190/ 115203 | consumed samples: 19248640 | consumed tokens: 39421214720 | elapsed time per iteration (s): 0.55 | learning rate: 6.935E-05 | global batch size: 256 | lm loss: 2.631252E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.325 | TFLOPs: 43.98 | +7: iteration 75200/ 115203 | consumed samples: 19251200 | consumed tokens: 39426457600 | elapsed time per iteration (s): 0.55 | learning rate: 6.933E-05 | global batch size: 256 | lm loss: 2.629079E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 75210/ 115203 | consumed samples: 19253760 | consumed tokens: 39431700480 | elapsed time per iteration (s): 0.55 | learning rate: 6.931E-05 | global batch size: 256 | lm loss: 2.641445E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.399 | TFLOPs: 43.99 | +7: iteration 75220/ 115203 | consumed samples: 19256320 | consumed tokens: 39436943360 | elapsed time per iteration (s): 0.55 | learning rate: 6.929E-05 | global batch size: 256 | lm loss: 2.631781E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.515 | TFLOPs: 44.00 | +7: iteration 75230/ 115203 | consumed samples: 19258880 | consumed tokens: 39442186240 | elapsed time per iteration (s): 0.58 | learning rate: 6.926E-05 | global batch size: 256 | lm loss: 2.637180E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.239 | TFLOPs: 42.35 | +7: iteration 75240/ 115203 | consumed samples: 19261440 | consumed tokens: 39447429120 | elapsed time per iteration (s): 0.56 | learning rate: 6.924E-05 | global batch size: 256 | lm loss: 2.625358E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.896 | TFLOPs: 43.66 | +7: iteration 75250/ 115203 | consumed samples: 19264000 | consumed tokens: 39452672000 | elapsed time per iteration (s): 0.55 | learning rate: 6.922E-05 | global batch size: 256 | lm loss: 2.650303E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.292 | TFLOPs: 43.98 | +7: iteration 75260/ 115203 | consumed samples: 19266560 | consumed tokens: 39457914880 | elapsed time per iteration (s): 0.55 | learning rate: 6.920E-05 | global batch size: 256 | lm loss: 2.635198E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.619 | TFLOPs: 44.01 | +7: iteration 75270/ 115203 | consumed samples: 19269120 | consumed tokens: 39463157760 | elapsed time per iteration (s): 0.55 | learning rate: 6.918E-05 | global batch size: 256 | lm loss: 2.638651E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.628 | TFLOPs: 44.01 | +7: iteration 75280/ 115203 | consumed samples: 19271680 | consumed tokens: 39468400640 | elapsed time per iteration (s): 0.55 | learning rate: 6.915E-05 | global batch size: 256 | lm loss: 2.637421E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.520 | TFLOPs: 44.00 | +7: iteration 75290/ 115203 | consumed samples: 19274240 | consumed tokens: 39473643520 | elapsed time per iteration (s): 0.55 | learning rate: 6.913E-05 | global batch size: 256 | lm loss: 2.636816E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.525 | TFLOPs: 44.00 | +7: iteration 75300/ 115203 | consumed samples: 19276800 | consumed tokens: 39478886400 | elapsed time per iteration (s): 0.55 | learning rate: 6.911E-05 | global batch size: 256 | lm loss: 2.631298E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.643 | TFLOPs: 44.01 | +7: iteration 75310/ 115203 | consumed samples: 19279360 | consumed tokens: 39484129280 | elapsed time per iteration (s): 0.55 | learning rate: 6.909E-05 | global batch size: 256 | lm loss: 2.624375E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.584 | TFLOPs: 44.01 | +7: iteration 75320/ 115203 | consumed samples: 19281920 | consumed tokens: 39489372160 | elapsed time per iteration (s): 0.55 | learning rate: 6.907E-05 | global batch size: 256 | lm loss: 2.641127E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.498 | TFLOPs: 44.00 | +7: iteration 75330/ 115203 | consumed samples: 19284480 | consumed tokens: 39494615040 | elapsed time per iteration (s): 0.55 | learning rate: 6.904E-05 | global batch size: 256 | lm loss: 2.631315E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.351 | TFLOPs: 43.98 | +7: iteration 75340/ 115203 | consumed samples: 19287040 | consumed tokens: 39499857920 | elapsed time per iteration (s): 0.56 | learning rate: 6.902E-05 | global batch size: 256 | lm loss: 2.646083E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.284 | TFLOPs: 43.50 | +7: iteration 75350/ 115203 | consumed samples: 19289600 | consumed tokens: 39505100800 | elapsed time per iteration (s): 0.56 | learning rate: 6.900E-05 | global batch size: 256 | lm loss: 2.627051E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.993 | TFLOPs: 43.57 | +7: iteration 75360/ 115203 | consumed samples: 19292160 | consumed tokens: 39510343680 | elapsed time per iteration (s): 0.56 | learning rate: 6.898E-05 | global batch size: 256 | lm loss: 2.626208E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.459 | TFLOPs: 43.71 | +7: iteration 75370/ 115203 | consumed samples: 19294720 | consumed tokens: 39515586560 | elapsed time per iteration (s): 0.55 | learning rate: 6.895E-05 | global batch size: 256 | lm loss: 2.641763E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.418 | TFLOPs: 43.99 | +7: iteration 75380/ 115203 | consumed samples: 19297280 | consumed tokens: 39520829440 | elapsed time per iteration (s): 0.55 | learning rate: 6.893E-05 | global batch size: 256 | lm loss: 2.636523E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 75390/ 115203 | consumed samples: 19299840 | consumed tokens: 39526072320 | elapsed time per iteration (s): 0.56 | learning rate: 6.891E-05 | global batch size: 256 | lm loss: 2.623065E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.257 | TFLOPs: 43.98 | +7: iteration 75400/ 115203 | consumed samples: 19302400 | consumed tokens: 39531315200 | elapsed time per iteration (s): 0.56 | learning rate: 6.889E-05 | global batch size: 256 | lm loss: 2.639139E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.797 | TFLOPs: 43.55 | +7: iteration 75410/ 115203 | consumed samples: 19304960 | consumed tokens: 39536558080 | elapsed time per iteration (s): 0.56 | learning rate: 6.887E-05 | global batch size: 256 | lm loss: 2.627990E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.936 | TFLOPs: 43.47 | +7: iteration 75420/ 115203 | consumed samples: 19307520 | consumed tokens: 39541800960 | elapsed time per iteration (s): 0.56 | learning rate: 6.884E-05 | global batch size: 256 | lm loss: 2.628123E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.940 | TFLOPs: 43.56 | +7: iteration 75430/ 115203 | consumed samples: 19310080 | consumed tokens: 39547043840 | elapsed time per iteration (s): 0.55 | learning rate: 6.882E-05 | global batch size: 256 | lm loss: 2.652635E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 75440/ 115203 | consumed samples: 19312640 | consumed tokens: 39552286720 | elapsed time per iteration (s): 0.55 | learning rate: 6.880E-05 | global batch size: 256 | lm loss: 2.639421E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 75450/ 115203 | consumed samples: 19315200 | consumed tokens: 39557529600 | elapsed time per iteration (s): 0.55 | learning rate: 6.878E-05 | global batch size: 256 | lm loss: 2.632992E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.349 | TFLOPs: 43.98 | +7: iteration 75460/ 115203 | consumed samples: 19317760 | consumed tokens: 39562772480 | elapsed time per iteration (s): 0.57 | learning rate: 6.876E-05 | global batch size: 256 | lm loss: 2.641233E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.024 | TFLOPs: 43.19 | +7: iteration 75470/ 115203 | consumed samples: 19320320 | consumed tokens: 39568015360 | elapsed time per iteration (s): 0.55 | learning rate: 6.873E-05 | global batch size: 256 | lm loss: 2.629836E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 75480/ 115203 | consumed samples: 19322880 | consumed tokens: 39573258240 | elapsed time per iteration (s): 0.55 | learning rate: 6.871E-05 | global batch size: 256 | lm loss: 2.633048E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.434 | TFLOPs: 43.99 | +7: iteration 75490/ 115203 | consumed samples: 19325440 | consumed tokens: 39578501120 | elapsed time per iteration (s): 0.57 | learning rate: 6.869E-05 | global batch size: 256 | lm loss: 2.621056E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.773 | TFLOPs: 43.17 | +7: iteration 75500/ 115203 | consumed samples: 19328000 | consumed tokens: 39583744000 | elapsed time per iteration (s): 0.55 | learning rate: 6.867E-05 | global batch size: 256 | lm loss: 2.655702E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 75510/ 115203 | consumed samples: 19330560 | consumed tokens: 39588986880 | elapsed time per iteration (s): 0.57 | learning rate: 6.865E-05 | global batch size: 256 | lm loss: 2.629921E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.126 | TFLOPs: 42.72 | +7: iteration 75520/ 115203 | consumed samples: 19333120 | consumed tokens: 39594229760 | elapsed time per iteration (s): 0.56 | learning rate: 6.862E-05 | global batch size: 256 | lm loss: 2.632369E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.991 | TFLOPs: 43.95 | +7: iteration 75530/ 115203 | consumed samples: 19335680 | consumed tokens: 39599472640 | elapsed time per iteration (s): 0.55 | learning rate: 6.860E-05 | global batch size: 256 | lm loss: 2.622757E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 75540/ 115203 | consumed samples: 19338240 | consumed tokens: 39604715520 | elapsed time per iteration (s): 0.56 | learning rate: 6.858E-05 | global batch size: 256 | lm loss: 2.637507E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.689 | TFLOPs: 43.25 | +7: iteration 75550/ 115203 | consumed samples: 19340800 | consumed tokens: 39609958400 | elapsed time per iteration (s): 0.55 | learning rate: 6.856E-05 | global batch size: 256 | lm loss: 2.645623E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.425 | TFLOPs: 43.99 | +7: iteration 75560/ 115203 | consumed samples: 19343360 | consumed tokens: 39615201280 | elapsed time per iteration (s): 0.56 | learning rate: 6.854E-05 | global batch size: 256 | lm loss: 2.651411E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.652 | TFLOPs: 43.73 | +7: iteration 75570/ 115203 | consumed samples: 19345920 | consumed tokens: 39620444160 | elapsed time per iteration (s): 0.56 | learning rate: 6.851E-05 | global batch size: 256 | lm loss: 2.628021E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.883 | TFLOPs: 43.37 | +7: iteration 75580/ 115203 | consumed samples: 19348480 | consumed tokens: 39625687040 | elapsed time per iteration (s): 0.56 | learning rate: 6.849E-05 | global batch size: 256 | lm loss: 2.628113E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.556 | TFLOPs: 43.43 | +7: iteration 75590/ 115203 | consumed samples: 19351040 | consumed tokens: 39630929920 | elapsed time per iteration (s): 0.55 | learning rate: 6.847E-05 | global batch size: 256 | lm loss: 2.640067E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.558 | TFLOPs: 44.00 | +7: iteration 75600/ 115203 | consumed samples: 19353600 | consumed tokens: 39636172800 | elapsed time per iteration (s): 0.55 | learning rate: 6.845E-05 | global batch size: 256 | lm loss: 2.641899E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.383 | TFLOPs: 43.99 | +7: iteration 75610/ 115203 | consumed samples: 19356160 | consumed tokens: 39641415680 | elapsed time per iteration (s): 0.56 | learning rate: 6.843E-05 | global batch size: 256 | lm loss: 2.639332E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.956 | TFLOPs: 43.57 | +7: iteration 75620/ 115203 | consumed samples: 19358720 | consumed tokens: 39646658560 | elapsed time per iteration (s): 0.56 | learning rate: 6.840E-05 | global batch size: 256 | lm loss: 2.639662E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.758 | TFLOPs: 43.83 | +7: iteration 75630/ 115203 | consumed samples: 19361280 | consumed tokens: 39651901440 | elapsed time per iteration (s): 0.58 | learning rate: 6.838E-05 | global batch size: 256 | lm loss: 2.635308E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.312 | TFLOPs: 41.98 | +7: iteration 75640/ 115203 | consumed samples: 19363840 | consumed tokens: 39657144320 | elapsed time per iteration (s): 0.55 | learning rate: 6.836E-05 | global batch size: 256 | lm loss: 2.634147E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.262 | TFLOPs: 43.98 | +7: iteration 75650/ 115203 | consumed samples: 19366400 | consumed tokens: 39662387200 | elapsed time per iteration (s): 0.56 | learning rate: 6.834E-05 | global batch size: 256 | lm loss: 2.628375E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.240 | TFLOPs: 43.78 | +7: iteration 75660/ 115203 | consumed samples: 19368960 | consumed tokens: 39667630080 | elapsed time per iteration (s): 0.55 | learning rate: 6.832E-05 | global batch size: 256 | lm loss: 2.634238E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 75670/ 115203 | consumed samples: 19371520 | consumed tokens: 39672872960 | elapsed time per iteration (s): 0.57 | learning rate: 6.829E-05 | global batch size: 256 | lm loss: 2.639086E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.348 | TFLOPs: 43.03 | +7: iteration 75680/ 115203 | consumed samples: 19374080 | consumed tokens: 39678115840 | elapsed time per iteration (s): 0.56 | learning rate: 6.827E-05 | global batch size: 256 | lm loss: 2.635353E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.968 | TFLOPs: 43.85 | +7: iteration 75690/ 115203 | consumed samples: 19376640 | consumed tokens: 39683358720 | elapsed time per iteration (s): 0.55 | learning rate: 6.825E-05 | global batch size: 256 | lm loss: 2.626730E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 75700/ 115203 | consumed samples: 19379200 | consumed tokens: 39688601600 | elapsed time per iteration (s): 0.56 | learning rate: 6.823E-05 | global batch size: 256 | lm loss: 2.623363E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 75710/ 115203 | consumed samples: 19381760 | consumed tokens: 39693844480 | elapsed time per iteration (s): 0.56 | learning rate: 6.821E-05 | global batch size: 256 | lm loss: 2.630048E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.428 | TFLOPs: 43.23 | +7: iteration 75720/ 115203 | consumed samples: 19384320 | consumed tokens: 39699087360 | elapsed time per iteration (s): 0.56 | learning rate: 6.818E-05 | global batch size: 256 | lm loss: 2.631964E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.366 | TFLOPs: 43.80 | +7: iteration 75730/ 115203 | consumed samples: 19386880 | consumed tokens: 39704330240 | elapsed time per iteration (s): 0.56 | learning rate: 6.816E-05 | global batch size: 256 | lm loss: 2.626487E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.873 | TFLOPs: 43.75 | +7: iteration 75740/ 115203 | consumed samples: 19389440 | consumed tokens: 39709573120 | elapsed time per iteration (s): 0.55 | learning rate: 6.814E-05 | global batch size: 256 | lm loss: 2.631219E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 75750/ 115203 | consumed samples: 19392000 | consumed tokens: 39714816000 | elapsed time per iteration (s): 0.57 | learning rate: 6.812E-05 | global batch size: 256 | lm loss: 2.626545E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.150 | TFLOPs: 42.82 | +7: iteration 75760/ 115203 | consumed samples: 19394560 | consumed tokens: 39720058880 | elapsed time per iteration (s): 0.55 | learning rate: 6.810E-05 | global batch size: 256 | lm loss: 2.641406E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 75770/ 115203 | consumed samples: 19397120 | consumed tokens: 39725301760 | elapsed time per iteration (s): 0.55 | learning rate: 6.807E-05 | global batch size: 256 | lm loss: 2.630967E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.507 | TFLOPs: 44.00 | +7: iteration 75780/ 115203 | consumed samples: 19399680 | consumed tokens: 39730544640 | elapsed time per iteration (s): 0.57 | learning rate: 6.805E-05 | global batch size: 256 | lm loss: 2.629536E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.475 | TFLOPs: 42.85 | +7: iteration 75790/ 115203 | consumed samples: 19402240 | consumed tokens: 39735787520 | elapsed time per iteration (s): 0.56 | learning rate: 6.803E-05 | global batch size: 256 | lm loss: 2.625079E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.549 | TFLOPs: 43.91 | +7: iteration 75800/ 115203 | consumed samples: 19404800 | consumed tokens: 39741030400 | elapsed time per iteration (s): 0.55 | learning rate: 6.801E-05 | global batch size: 256 | lm loss: 2.638935E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.515 | TFLOPs: 44.00 | +7: iteration 75810/ 115203 | consumed samples: 19407360 | consumed tokens: 39746273280 | elapsed time per iteration (s): 0.56 | learning rate: 6.799E-05 | global batch size: 256 | lm loss: 2.635179E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.334 | TFLOPs: 43.41 | +7: iteration 75820/ 115203 | consumed samples: 19409920 | consumed tokens: 39751516160 | elapsed time per iteration (s): 0.56 | learning rate: 6.797E-05 | global batch size: 256 | lm loss: 2.623975E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.400 | TFLOPs: 43.80 | +7: iteration 75830/ 115203 | consumed samples: 19412480 | consumed tokens: 39756759040 | elapsed time per iteration (s): 0.55 | learning rate: 6.794E-05 | global batch size: 256 | lm loss: 2.634889E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.272 | TFLOPs: 43.98 | +7: iteration 75840/ 115203 | consumed samples: 19415040 | consumed tokens: 39762001920 | elapsed time per iteration (s): 0.56 | learning rate: 6.792E-05 | global batch size: 256 | lm loss: 2.636334E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.754 | TFLOPs: 43.55 | +7: iteration 75850/ 115203 | consumed samples: 19417600 | consumed tokens: 39767244800 | elapsed time per iteration (s): 0.55 | learning rate: 6.790E-05 | global batch size: 256 | lm loss: 2.637313E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.659 | TFLOPs: 44.01 | +7: iteration 75860/ 115203 | consumed samples: 19420160 | consumed tokens: 39772487680 | elapsed time per iteration (s): 0.56 | learning rate: 6.788E-05 | global batch size: 256 | lm loss: 2.624788E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.932 | TFLOPs: 43.85 | +7: iteration 75870/ 115203 | consumed samples: 19422720 | consumed tokens: 39777730560 | elapsed time per iteration (s): 0.55 | learning rate: 6.786E-05 | global batch size: 256 | lm loss: 2.637008E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.548 | TFLOPs: 44.00 | +7: iteration 75880/ 115203 | consumed samples: 19425280 | consumed tokens: 39782973440 | elapsed time per iteration (s): 0.56 | learning rate: 6.783E-05 | global batch size: 256 | lm loss: 2.633609E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.470 | TFLOPs: 43.33 | +7: iteration 75890/ 115203 | consumed samples: 19427840 | consumed tokens: 39788216320 | elapsed time per iteration (s): 0.55 | learning rate: 6.781E-05 | global batch size: 256 | lm loss: 2.632885E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.534 | TFLOPs: 44.00 | +7: iteration 75900/ 115203 | consumed samples: 19430400 | consumed tokens: 39793459200 | elapsed time per iteration (s): 0.55 | learning rate: 6.779E-05 | global batch size: 256 | lm loss: 2.631625E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.499 | TFLOPs: 44.00 | +7: iteration 75910/ 115203 | consumed samples: 19432960 | consumed tokens: 39798702080 | elapsed time per iteration (s): 0.57 | learning rate: 6.777E-05 | global batch size: 256 | lm loss: 2.623062E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.840 | TFLOPs: 42.70 | +7: iteration 75920/ 115203 | consumed samples: 19435520 | consumed tokens: 39803944960 | elapsed time per iteration (s): 0.57 | learning rate: 6.775E-05 | global batch size: 256 | lm loss: 2.628725E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.174 | TFLOPs: 42.92 | +7: iteration 75930/ 115203 | consumed samples: 19438080 | consumed tokens: 39809187840 | elapsed time per iteration (s): 0.56 | learning rate: 6.772E-05 | global batch size: 256 | lm loss: 2.637935E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.078 | TFLOPs: 43.39 | +7: iteration 75940/ 115203 | consumed samples: 19440640 | consumed tokens: 39814430720 | elapsed time per iteration (s): 0.55 | learning rate: 6.770E-05 | global batch size: 256 | lm loss: 2.622507E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.591 | TFLOPs: 44.01 | +7: iteration 75950/ 115203 | consumed samples: 19443200 | consumed tokens: 39819673600 | elapsed time per iteration (s): 0.55 | learning rate: 6.768E-05 | global batch size: 256 | lm loss: 2.631842E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.713 | TFLOPs: 44.02 | +7: iteration 75960/ 115203 | consumed samples: 19445760 | consumed tokens: 39824916480 | elapsed time per iteration (s): 0.55 | learning rate: 6.766E-05 | global batch size: 256 | lm loss: 2.637568E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.798 | TFLOPs: 44.03 | +7: iteration 75970/ 115203 | consumed samples: 19448320 | consumed tokens: 39830159360 | elapsed time per iteration (s): 0.55 | learning rate: 6.764E-05 | global batch size: 256 | lm loss: 2.634225E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.724 | TFLOPs: 44.02 | +7: iteration 75980/ 115203 | consumed samples: 19450880 | consumed tokens: 39835402240 | elapsed time per iteration (s): 0.56 | learning rate: 6.761E-05 | global batch size: 256 | lm loss: 2.636981E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.412 | TFLOPs: 43.90 | +7: iteration 75990/ 115203 | consumed samples: 19453440 | consumed tokens: 39840645120 | elapsed time per iteration (s): 0.56 | learning rate: 6.759E-05 | global batch size: 256 | lm loss: 2.623999E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.931 | TFLOPs: 43.94 | +0: [2023-03-17 00:45:08,376] [INFO] [logging.py:68:log_dist] [Rank 0] step=76000, skipped=0, lr=[6.757111507639708e-05, 6.757111507639708e-05, 6.757111507639708e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 76000/ 115203 | consumed samples: 19456000 | consumed tokens: 39845888000 | elapsed time per iteration (s): 0.56 | learning rate: 6.757E-05 | global batch size: 256 | lm loss: 2.624310E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.797 | TFLOPs: 43.55 | +0: steps: 76000 loss: 2.6294 iter time (s): 0.557 samples/sec: 459.448 +7: iteration 76010/ 115203 | consumed samples: 19458560 | consumed tokens: 39851130880 | elapsed time per iteration (s): 0.56 | learning rate: 6.755E-05 | global batch size: 256 | lm loss: 2.627990E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.174 | TFLOPs: 43.78 | +7: iteration 76020/ 115203 | consumed samples: 19461120 | consumed tokens: 39856373760 | elapsed time per iteration (s): 0.55 | learning rate: 6.753E-05 | global batch size: 256 | lm loss: 2.632102E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.637 | TFLOPs: 44.01 | +7: iteration 76030/ 115203 | consumed samples: 19463680 | consumed tokens: 39861616640 | elapsed time per iteration (s): 0.55 | learning rate: 6.751E-05 | global batch size: 256 | lm loss: 2.637094E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.656 | TFLOPs: 44.01 | +7: iteration 76040/ 115203 | consumed samples: 19466240 | consumed tokens: 39866859520 | elapsed time per iteration (s): 0.56 | learning rate: 6.748E-05 | global batch size: 256 | lm loss: 2.619967E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.320 | TFLOPs: 43.31 | +7: iteration 76050/ 115203 | consumed samples: 19468800 | consumed tokens: 39872102400 | elapsed time per iteration (s): 0.55 | learning rate: 6.746E-05 | global batch size: 256 | lm loss: 2.636007E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.652 | TFLOPs: 44.01 | +7: iteration 76060/ 115203 | consumed samples: 19471360 | consumed tokens: 39877345280 | elapsed time per iteration (s): 0.55 | learning rate: 6.744E-05 | global batch size: 256 | lm loss: 2.633285E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.599 | TFLOPs: 44.01 | +7: iteration 76070/ 115203 | consumed samples: 19473920 | consumed tokens: 39882588160 | elapsed time per iteration (s): 0.55 | learning rate: 6.742E-05 | global batch size: 256 | lm loss: 2.634146E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.540 | TFLOPs: 44.00 | +7: iteration 76080/ 115203 | consumed samples: 19476480 | consumed tokens: 39887831040 | elapsed time per iteration (s): 0.56 | learning rate: 6.740E-05 | global batch size: 256 | lm loss: 2.617114E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.658 | TFLOPs: 43.54 | +7: iteration 76090/ 115203 | consumed samples: 19479040 | consumed tokens: 39893073920 | elapsed time per iteration (s): 0.56 | learning rate: 6.737E-05 | global batch size: 256 | lm loss: 2.623023E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.993 | TFLOPs: 43.57 | +7: iteration 76100/ 115203 | consumed samples: 19481600 | consumed tokens: 39898316800 | elapsed time per iteration (s): 0.57 | learning rate: 6.735E-05 | global batch size: 256 | lm loss: 2.635736E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.427 | TFLOPs: 43.04 | +7: iteration 76110/ 115203 | consumed samples: 19484160 | consumed tokens: 39903559680 | elapsed time per iteration (s): 0.55 | learning rate: 6.733E-05 | global batch size: 256 | lm loss: 2.629826E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.506 | TFLOPs: 44.00 | +7: iteration 76120/ 115203 | consumed samples: 19486720 | consumed tokens: 39908802560 | elapsed time per iteration (s): 0.55 | learning rate: 6.731E-05 | global batch size: 256 | lm loss: 2.632381E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.499 | TFLOPs: 44.00 | +7: iteration 76130/ 115203 | consumed samples: 19489280 | consumed tokens: 39914045440 | elapsed time per iteration (s): 0.56 | learning rate: 6.729E-05 | global batch size: 256 | lm loss: 2.618824E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.049 | TFLOPs: 43.86 | +7: iteration 76140/ 115203 | consumed samples: 19491840 | consumed tokens: 39919288320 | elapsed time per iteration (s): 0.56 | learning rate: 6.727E-05 | global batch size: 256 | lm loss: 2.619400E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.722 | TFLOPs: 43.64 | +7: iteration 76150/ 115203 | consumed samples: 19494400 | consumed tokens: 39924531200 | elapsed time per iteration (s): 0.56 | learning rate: 6.724E-05 | global batch size: 256 | lm loss: 2.634627E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.135 | TFLOPs: 43.30 | +7: iteration 76160/ 115203 | consumed samples: 19496960 | consumed tokens: 39929774080 | elapsed time per iteration (s): 0.56 | learning rate: 6.722E-05 | global batch size: 256 | lm loss: 2.628678E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.630 | TFLOPs: 43.63 | +7: iteration 76170/ 115203 | consumed samples: 19499520 | consumed tokens: 39935016960 | elapsed time per iteration (s): 0.56 | learning rate: 6.720E-05 | global batch size: 256 | lm loss: 2.625806E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.215 | TFLOPs: 43.69 | +7: iteration 76180/ 115203 | consumed samples: 19502080 | consumed tokens: 39940259840 | elapsed time per iteration (s): 0.56 | learning rate: 6.718E-05 | global batch size: 256 | lm loss: 2.639554E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.354 | TFLOPs: 43.22 | +7: iteration 76190/ 115203 | consumed samples: 19504640 | consumed tokens: 39945502720 | elapsed time per iteration (s): 0.55 | learning rate: 6.716E-05 | global batch size: 256 | lm loss: 2.633415E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.772 | TFLOPs: 44.02 | +7: iteration 76200/ 115203 | consumed samples: 19507200 | consumed tokens: 39950745600 | elapsed time per iteration (s): 0.55 | learning rate: 6.713E-05 | global batch size: 256 | lm loss: 2.631391E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.620 | TFLOPs: 44.01 | +7: iteration 76210/ 115203 | consumed samples: 19509760 | consumed tokens: 39955988480 | elapsed time per iteration (s): 0.55 | learning rate: 6.711E-05 | global batch size: 256 | lm loss: 2.627693E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 76220/ 115203 | consumed samples: 19512320 | consumed tokens: 39961231360 | elapsed time per iteration (s): 0.56 | learning rate: 6.709E-05 | global batch size: 256 | lm loss: 2.632451E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.734 | TFLOPs: 43.83 | +7: iteration 76230/ 115203 | consumed samples: 19514880 | consumed tokens: 39966474240 | elapsed time per iteration (s): 0.55 | learning rate: 6.707E-05 | global batch size: 256 | lm loss: 2.627454E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.525 | TFLOPs: 44.00 | +7: iteration 76240/ 115203 | consumed samples: 19517440 | consumed tokens: 39971717120 | elapsed time per iteration (s): 0.55 | learning rate: 6.705E-05 | global batch size: 256 | lm loss: 2.635826E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 76250/ 115203 | consumed samples: 19520000 | consumed tokens: 39976960000 | elapsed time per iteration (s): 0.55 | learning rate: 6.703E-05 | global batch size: 256 | lm loss: 2.650761E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.376 | TFLOPs: 43.99 | +7: iteration 76260/ 115203 | consumed samples: 19522560 | consumed tokens: 39982202880 | elapsed time per iteration (s): 0.55 | learning rate: 6.700E-05 | global batch size: 256 | lm loss: 2.645751E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.471 | TFLOPs: 44.00 | +7: iteration 76270/ 115203 | consumed samples: 19525120 | consumed tokens: 39987445760 | elapsed time per iteration (s): 0.55 | learning rate: 6.698E-05 | global batch size: 256 | lm loss: 2.630566E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 76280/ 115203 | consumed samples: 19527680 | consumed tokens: 39992688640 | elapsed time per iteration (s): 0.56 | learning rate: 6.696E-05 | global batch size: 256 | lm loss: 2.618605E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.110 | TFLOPs: 43.49 | +7: iteration 76290/ 115203 | consumed samples: 19530240 | consumed tokens: 39997931520 | elapsed time per iteration (s): 0.55 | learning rate: 6.694E-05 | global batch size: 256 | lm loss: 2.635799E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.533 | TFLOPs: 44.00 | +7: iteration 76300/ 115203 | consumed samples: 19532800 | consumed tokens: 40003174400 | elapsed time per iteration (s): 0.55 | learning rate: 6.692E-05 | global batch size: 256 | lm loss: 2.632955E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.410 | TFLOPs: 43.99 | +7: iteration 76310/ 115203 | consumed samples: 19535360 | consumed tokens: 40008417280 | elapsed time per iteration (s): 0.55 | learning rate: 6.689E-05 | global batch size: 256 | lm loss: 2.624495E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 76320/ 115203 | consumed samples: 19537920 | consumed tokens: 40013660160 | elapsed time per iteration (s): 0.55 | learning rate: 6.687E-05 | global batch size: 256 | lm loss: 2.621851E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.481 | TFLOPs: 44.00 | +7: iteration 76330/ 115203 | consumed samples: 19540480 | consumed tokens: 40018903040 | elapsed time per iteration (s): 0.55 | learning rate: 6.685E-05 | global batch size: 256 | lm loss: 2.630408E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.490 | TFLOPs: 44.00 | +7: iteration 76340/ 115203 | consumed samples: 19543040 | consumed tokens: 40024145920 | elapsed time per iteration (s): 0.55 | learning rate: 6.683E-05 | global batch size: 256 | lm loss: 2.624894E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.546 | TFLOPs: 44.00 | +7: iteration 76350/ 115203 | consumed samples: 19545600 | consumed tokens: 40029388800 | elapsed time per iteration (s): 0.55 | learning rate: 6.681E-05 | global batch size: 256 | lm loss: 2.636249E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.777 | TFLOPs: 44.03 | +7: iteration 76360/ 115203 | consumed samples: 19548160 | consumed tokens: 40034631680 | elapsed time per iteration (s): 0.55 | learning rate: 6.679E-05 | global batch size: 256 | lm loss: 2.632814E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.677 | TFLOPs: 44.02 | +7: iteration 76370/ 115203 | consumed samples: 19550720 | consumed tokens: 40039874560 | elapsed time per iteration (s): 0.55 | learning rate: 6.676E-05 | global batch size: 256 | lm loss: 2.624804E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.702 | TFLOPs: 44.02 | +7: iteration 76380/ 115203 | consumed samples: 19553280 | consumed tokens: 40045117440 | elapsed time per iteration (s): 0.56 | learning rate: 6.674E-05 | global batch size: 256 | lm loss: 2.625940E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.337 | TFLOPs: 43.70 | +7: iteration 76390/ 115203 | consumed samples: 19555840 | consumed tokens: 40050360320 | elapsed time per iteration (s): 0.55 | learning rate: 6.672E-05 | global batch size: 256 | lm loss: 2.631123E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.471 | TFLOPs: 44.00 | +7: iteration 76400/ 115203 | consumed samples: 19558400 | consumed tokens: 40055603200 | elapsed time per iteration (s): 0.55 | learning rate: 6.670E-05 | global batch size: 256 | lm loss: 2.627158E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 76410/ 115203 | consumed samples: 19560960 | consumed tokens: 40060846080 | elapsed time per iteration (s): 0.55 | learning rate: 6.668E-05 | global batch size: 256 | lm loss: 2.640459E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.622 | TFLOPs: 44.01 | +7: iteration 76420/ 115203 | consumed samples: 19563520 | consumed tokens: 40066088960 | elapsed time per iteration (s): 0.56 | learning rate: 6.666E-05 | global batch size: 256 | lm loss: 2.641032E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.732 | TFLOPs: 43.74 | +7: iteration 76430/ 115203 | consumed samples: 19566080 | consumed tokens: 40071331840 | elapsed time per iteration (s): 0.55 | learning rate: 6.663E-05 | global batch size: 256 | lm loss: 2.631302E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.377 | TFLOPs: 43.99 | +7: iteration 76440/ 115203 | consumed samples: 19568640 | consumed tokens: 40076574720 | elapsed time per iteration (s): 0.55 | learning rate: 6.661E-05 | global batch size: 256 | lm loss: 2.629886E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.626 | TFLOPs: 44.01 | +7: iteration 76450/ 115203 | consumed samples: 19571200 | consumed tokens: 40081817600 | elapsed time per iteration (s): 0.55 | learning rate: 6.659E-05 | global batch size: 256 | lm loss: 2.631837E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.461 | TFLOPs: 44.00 | +7: iteration 76460/ 115203 | consumed samples: 19573760 | consumed tokens: 40087060480 | elapsed time per iteration (s): 0.55 | learning rate: 6.657E-05 | global batch size: 256 | lm loss: 2.631775E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.384 | TFLOPs: 43.99 | +7: iteration 76470/ 115203 | consumed samples: 19576320 | consumed tokens: 40092303360 | elapsed time per iteration (s): 0.55 | learning rate: 6.655E-05 | global batch size: 256 | lm loss: 2.639542E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.586 | TFLOPs: 44.01 | +7: iteration 76480/ 115203 | consumed samples: 19578880 | consumed tokens: 40097546240 | elapsed time per iteration (s): 0.56 | learning rate: 6.653E-05 | global batch size: 256 | lm loss: 2.634137E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.889 | TFLOPs: 43.56 | +7: iteration 76490/ 115203 | consumed samples: 19581440 | consumed tokens: 40102789120 | elapsed time per iteration (s): 0.55 | learning rate: 6.650E-05 | global batch size: 256 | lm loss: 2.624987E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.536 | TFLOPs: 44.00 | +7: iteration 76500/ 115203 | consumed samples: 19584000 | consumed tokens: 40108032000 | elapsed time per iteration (s): 0.55 | learning rate: 6.648E-05 | global batch size: 256 | lm loss: 2.617584E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.610 | TFLOPs: 44.01 | +7: iteration 76510/ 115203 | consumed samples: 19586560 | consumed tokens: 40113274880 | elapsed time per iteration (s): 0.55 | learning rate: 6.646E-05 | global batch size: 256 | lm loss: 2.632248E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.597 | TFLOPs: 44.01 | +7: iteration 76520/ 115203 | consumed samples: 19589120 | consumed tokens: 40118517760 | elapsed time per iteration (s): 0.55 | learning rate: 6.644E-05 | global batch size: 256 | lm loss: 2.639488E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.668 | TFLOPs: 44.02 | +7: iteration 76530/ 115203 | consumed samples: 19591680 | consumed tokens: 40123760640 | elapsed time per iteration (s): 0.56 | learning rate: 6.642E-05 | global batch size: 256 | lm loss: 2.629258E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.066 | TFLOPs: 43.48 | +7: iteration 76540/ 115203 | consumed samples: 19594240 | consumed tokens: 40129003520 | elapsed time per iteration (s): 0.55 | learning rate: 6.640E-05 | global batch size: 256 | lm loss: 2.627993E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.614 | TFLOPs: 44.01 | +7: iteration 76550/ 115203 | consumed samples: 19596800 | consumed tokens: 40134246400 | elapsed time per iteration (s): 0.55 | learning rate: 6.637E-05 | global batch size: 256 | lm loss: 2.638454E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 76560/ 115203 | consumed samples: 19599360 | consumed tokens: 40139489280 | elapsed time per iteration (s): 0.55 | learning rate: 6.635E-05 | global batch size: 256 | lm loss: 2.636084E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.511 | TFLOPs: 44.00 | +7: iteration 76570/ 115203 | consumed samples: 19601920 | consumed tokens: 40144732160 | elapsed time per iteration (s): 0.55 | learning rate: 6.633E-05 | global batch size: 256 | lm loss: 2.644228E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 76580/ 115203 | consumed samples: 19604480 | consumed tokens: 40149975040 | elapsed time per iteration (s): 0.55 | learning rate: 6.631E-05 | global batch size: 256 | lm loss: 2.624955E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.857 | TFLOPs: 44.03 | +7: iteration 76590/ 115203 | consumed samples: 19607040 | consumed tokens: 40155217920 | elapsed time per iteration (s): 0.56 | learning rate: 6.629E-05 | global batch size: 256 | lm loss: 2.631026E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.596 | TFLOPs: 43.72 | +7: iteration 76600/ 115203 | consumed samples: 19609600 | consumed tokens: 40160460800 | elapsed time per iteration (s): 0.56 | learning rate: 6.627E-05 | global batch size: 256 | lm loss: 2.629040E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.443 | TFLOPs: 43.71 | +7: iteration 76610/ 115203 | consumed samples: 19612160 | consumed tokens: 40165703680 | elapsed time per iteration (s): 0.55 | learning rate: 6.624E-05 | global batch size: 256 | lm loss: 2.629762E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.305 | TFLOPs: 43.98 | +7: iteration 76620/ 115203 | consumed samples: 19614720 | consumed tokens: 40170946560 | elapsed time per iteration (s): 0.55 | learning rate: 6.622E-05 | global batch size: 256 | lm loss: 2.631392E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.478 | TFLOPs: 44.00 | +7: iteration 76630/ 115203 | consumed samples: 19617280 | consumed tokens: 40176189440 | elapsed time per iteration (s): 0.55 | learning rate: 6.620E-05 | global batch size: 256 | lm loss: 2.632028E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.390 | TFLOPs: 43.99 | +7: iteration 76640/ 115203 | consumed samples: 19619840 | consumed tokens: 40181432320 | elapsed time per iteration (s): 0.55 | learning rate: 6.618E-05 | global batch size: 256 | lm loss: 2.621096E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 76650/ 115203 | consumed samples: 19622400 | consumed tokens: 40186675200 | elapsed time per iteration (s): 0.55 | learning rate: 6.616E-05 | global batch size: 256 | lm loss: 2.632829E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.479 | TFLOPs: 44.00 | +7: iteration 76660/ 115203 | consumed samples: 19624960 | consumed tokens: 40191918080 | elapsed time per iteration (s): 0.55 | learning rate: 6.614E-05 | global batch size: 256 | lm loss: 2.629465E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 76670/ 115203 | consumed samples: 19627520 | consumed tokens: 40197160960 | elapsed time per iteration (s): 0.56 | learning rate: 6.611E-05 | global batch size: 256 | lm loss: 2.635623E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.534 | TFLOPs: 43.53 | +7: iteration 76680/ 115203 | consumed samples: 19630080 | consumed tokens: 40202403840 | elapsed time per iteration (s): 0.56 | learning rate: 6.609E-05 | global batch size: 256 | lm loss: 2.642454E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.100 | TFLOPs: 43.58 | +7: iteration 76690/ 115203 | consumed samples: 19632640 | consumed tokens: 40207646720 | elapsed time per iteration (s): 0.55 | learning rate: 6.607E-05 | global batch size: 256 | lm loss: 2.621045E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.313 | TFLOPs: 43.98 | +7: iteration 76700/ 115203 | consumed samples: 19635200 | consumed tokens: 40212889600 | elapsed time per iteration (s): 0.56 | learning rate: 6.605E-05 | global batch size: 256 | lm loss: 2.640941E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.409 | TFLOPs: 43.70 | +7: iteration 76710/ 115203 | consumed samples: 19637760 | consumed tokens: 40218132480 | elapsed time per iteration (s): 0.56 | learning rate: 6.603E-05 | global batch size: 256 | lm loss: 2.620846E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.769 | TFLOPs: 43.55 | +7: iteration 76720/ 115203 | consumed samples: 19640320 | consumed tokens: 40223375360 | elapsed time per iteration (s): 0.55 | learning rate: 6.601E-05 | global batch size: 256 | lm loss: 2.618604E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.328 | TFLOPs: 43.98 | +7: iteration 76730/ 115203 | consumed samples: 19642880 | consumed tokens: 40228618240 | elapsed time per iteration (s): 0.55 | learning rate: 6.598E-05 | global batch size: 256 | lm loss: 2.630226E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 76740/ 115203 | consumed samples: 19645440 | consumed tokens: 40233861120 | elapsed time per iteration (s): 0.55 | learning rate: 6.596E-05 | global batch size: 256 | lm loss: 2.615222E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.378 | TFLOPs: 43.99 | +7: iteration 76750/ 115203 | consumed samples: 19648000 | consumed tokens: 40239104000 | elapsed time per iteration (s): 0.56 | learning rate: 6.594E-05 | global batch size: 256 | lm loss: 2.629605E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.230 | TFLOPs: 43.50 | +7: iteration 76760/ 115203 | consumed samples: 19650560 | consumed tokens: 40244346880 | elapsed time per iteration (s): 0.55 | learning rate: 6.592E-05 | global batch size: 256 | lm loss: 2.640289E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 76770/ 115203 | consumed samples: 19653120 | consumed tokens: 40249589760 | elapsed time per iteration (s): 0.55 | learning rate: 6.590E-05 | global batch size: 256 | lm loss: 2.629027E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 76780/ 115203 | consumed samples: 19655680 | consumed tokens: 40254832640 | elapsed time per iteration (s): 0.55 | learning rate: 6.588E-05 | global batch size: 256 | lm loss: 2.640133E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 76790/ 115203 | consumed samples: 19658240 | consumed tokens: 40260075520 | elapsed time per iteration (s): 0.55 | learning rate: 6.585E-05 | global batch size: 256 | lm loss: 2.633672E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 76800/ 115203 | consumed samples: 19660800 | consumed tokens: 40265318400 | elapsed time per iteration (s): 0.56 | learning rate: 6.583E-05 | global batch size: 256 | lm loss: 2.627555E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.780 | TFLOPs: 43.26 | +7: iteration 76810/ 115203 | consumed samples: 19663360 | consumed tokens: 40270561280 | elapsed time per iteration (s): 0.55 | learning rate: 6.581E-05 | global batch size: 256 | lm loss: 2.639984E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 76820/ 115203 | consumed samples: 19665920 | consumed tokens: 40275804160 | elapsed time per iteration (s): 0.55 | learning rate: 6.579E-05 | global batch size: 256 | lm loss: 2.638131E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.409 | TFLOPs: 43.99 | +7: iteration 76830/ 115203 | consumed samples: 19668480 | consumed tokens: 40281047040 | elapsed time per iteration (s): 0.55 | learning rate: 6.577E-05 | global batch size: 256 | lm loss: 2.619346E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 76840/ 115203 | consumed samples: 19671040 | consumed tokens: 40286289920 | elapsed time per iteration (s): 0.55 | learning rate: 6.575E-05 | global batch size: 256 | lm loss: 2.630856E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.395 | TFLOPs: 43.99 | +7: iteration 76850/ 115203 | consumed samples: 19673600 | consumed tokens: 40291532800 | elapsed time per iteration (s): 0.55 | learning rate: 6.572E-05 | global batch size: 256 | lm loss: 2.636809E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 76860/ 115203 | consumed samples: 19676160 | consumed tokens: 40296775680 | elapsed time per iteration (s): 0.55 | learning rate: 6.570E-05 | global batch size: 256 | lm loss: 2.634516E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 76870/ 115203 | consumed samples: 19678720 | consumed tokens: 40302018560 | elapsed time per iteration (s): 0.55 | learning rate: 6.568E-05 | global batch size: 256 | lm loss: 2.636863E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.414 | TFLOPs: 43.99 | +7: iteration 76880/ 115203 | consumed samples: 19681280 | consumed tokens: 40307261440 | elapsed time per iteration (s): 0.55 | learning rate: 6.566E-05 | global batch size: 256 | lm loss: 2.638251E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 76890/ 115203 | consumed samples: 19683840 | consumed tokens: 40312504320 | elapsed time per iteration (s): 0.56 | learning rate: 6.564E-05 | global batch size: 256 | lm loss: 2.619920E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.131 | TFLOPs: 43.96 | +7: iteration 76900/ 115203 | consumed samples: 19686400 | consumed tokens: 40317747200 | elapsed time per iteration (s): 0.55 | learning rate: 6.562E-05 | global batch size: 256 | lm loss: 2.636106E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 76910/ 115203 | consumed samples: 19688960 | consumed tokens: 40322990080 | elapsed time per iteration (s): 0.55 | learning rate: 6.560E-05 | global batch size: 256 | lm loss: 2.638127E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 76920/ 115203 | consumed samples: 19691520 | consumed tokens: 40328232960 | elapsed time per iteration (s): 0.55 | learning rate: 6.557E-05 | global batch size: 256 | lm loss: 2.632734E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.425 | TFLOPs: 43.99 | +7: iteration 76930/ 115203 | consumed samples: 19694080 | consumed tokens: 40333475840 | elapsed time per iteration (s): 0.56 | learning rate: 6.555E-05 | global batch size: 256 | lm loss: 2.628681E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.783 | TFLOPs: 43.45 | +7: iteration 76940/ 115203 | consumed samples: 19696640 | consumed tokens: 40338718720 | elapsed time per iteration (s): 0.56 | learning rate: 6.553E-05 | global batch size: 256 | lm loss: 2.632921E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.434 | TFLOPs: 43.33 | +7: iteration 76950/ 115203 | consumed samples: 19699200 | consumed tokens: 40343961600 | elapsed time per iteration (s): 0.55 | learning rate: 6.551E-05 | global batch size: 256 | lm loss: 2.617274E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.387 | TFLOPs: 43.99 | +7: iteration 76960/ 115203 | consumed samples: 19701760 | consumed tokens: 40349204480 | elapsed time per iteration (s): 0.55 | learning rate: 6.549E-05 | global batch size: 256 | lm loss: 2.624866E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.670 | TFLOPs: 44.02 | +7: iteration 76970/ 115203 | consumed samples: 19704320 | consumed tokens: 40354447360 | elapsed time per iteration (s): 0.55 | learning rate: 6.547E-05 | global batch size: 256 | lm loss: 2.636949E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.449 | TFLOPs: 43.99 | +7: iteration 76980/ 115203 | consumed samples: 19706880 | consumed tokens: 40359690240 | elapsed time per iteration (s): 0.56 | learning rate: 6.544E-05 | global batch size: 256 | lm loss: 2.626377E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.211 | TFLOPs: 43.97 | +7: iteration 76990/ 115203 | consumed samples: 19709440 | consumed tokens: 40364933120 | elapsed time per iteration (s): 0.55 | learning rate: 6.542E-05 | global batch size: 256 | lm loss: 2.630634E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 77000/ 115203 | consumed samples: 19712000 | consumed tokens: 40370176000 | elapsed time per iteration (s): 0.56 | learning rate: 6.540E-05 | global batch size: 256 | lm loss: 2.640267E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.774 | TFLOPs: 43.26 | +7: iteration 77010/ 115203 | consumed samples: 19714560 | consumed tokens: 40375418880 | elapsed time per iteration (s): 0.55 | learning rate: 6.538E-05 | global batch size: 256 | lm loss: 2.629717E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.407 | TFLOPs: 43.99 | +7: iteration 77020/ 115203 | consumed samples: 19717120 | consumed tokens: 40380661760 | elapsed time per iteration (s): 0.55 | learning rate: 6.536E-05 | global batch size: 256 | lm loss: 2.635808E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.347 | TFLOPs: 43.98 | +7: iteration 77030/ 115203 | consumed samples: 19719680 | consumed tokens: 40385904640 | elapsed time per iteration (s): 0.55 | learning rate: 6.534E-05 | global batch size: 256 | lm loss: 2.629747E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.467 | TFLOPs: 44.00 | +7: iteration 77040/ 115203 | consumed samples: 19722240 | consumed tokens: 40391147520 | elapsed time per iteration (s): 0.55 | learning rate: 6.532E-05 | global batch size: 256 | lm loss: 2.626116E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.311 | TFLOPs: 43.98 | +7: iteration 77050/ 115203 | consumed samples: 19724800 | consumed tokens: 40396390400 | elapsed time per iteration (s): 0.55 | learning rate: 6.529E-05 | global batch size: 256 | lm loss: 2.640415E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 77060/ 115203 | consumed samples: 19727360 | consumed tokens: 40401633280 | elapsed time per iteration (s): 0.55 | learning rate: 6.527E-05 | global batch size: 256 | lm loss: 2.638629E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.466 | TFLOPs: 44.00 | +7: iteration 77070/ 115203 | consumed samples: 19729920 | consumed tokens: 40406876160 | elapsed time per iteration (s): 0.56 | learning rate: 6.525E-05 | global batch size: 256 | lm loss: 2.624919E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 77080/ 115203 | consumed samples: 19732480 | consumed tokens: 40412119040 | elapsed time per iteration (s): 0.55 | learning rate: 6.523E-05 | global batch size: 256 | lm loss: 2.624407E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.349 | TFLOPs: 43.98 | +7: iteration 77090/ 115203 | consumed samples: 19735040 | consumed tokens: 40417361920 | elapsed time per iteration (s): 0.56 | learning rate: 6.521E-05 | global batch size: 256 | lm loss: 2.629080E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.129 | TFLOPs: 43.39 | +7: iteration 77100/ 115203 | consumed samples: 19737600 | consumed tokens: 40422604800 | elapsed time per iteration (s): 0.55 | learning rate: 6.519E-05 | global batch size: 256 | lm loss: 2.650974E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 77110/ 115203 | consumed samples: 19740160 | consumed tokens: 40427847680 | elapsed time per iteration (s): 0.57 | learning rate: 6.516E-05 | global batch size: 256 | lm loss: 2.637643E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.924 | TFLOPs: 42.51 | +7: iteration 77120/ 115203 | consumed samples: 19742720 | consumed tokens: 40433090560 | elapsed time per iteration (s): 0.55 | learning rate: 6.514E-05 | global batch size: 256 | lm loss: 2.621261E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.393 | TFLOPs: 43.99 | +7: iteration 77130/ 115203 | consumed samples: 19745280 | consumed tokens: 40438333440 | elapsed time per iteration (s): 0.55 | learning rate: 6.512E-05 | global batch size: 256 | lm loss: 2.626748E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.373 | TFLOPs: 43.99 | +7: iteration 77140/ 115203 | consumed samples: 19747840 | consumed tokens: 40443576320 | elapsed time per iteration (s): 0.56 | learning rate: 6.510E-05 | global batch size: 256 | lm loss: 2.617462E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.070 | TFLOPs: 43.29 | +7: iteration 77150/ 115203 | consumed samples: 19750400 | consumed tokens: 40448819200 | elapsed time per iteration (s): 0.57 | learning rate: 6.508E-05 | global batch size: 256 | lm loss: 2.636681E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.862 | TFLOPs: 42.89 | +7: iteration 77160/ 115203 | consumed samples: 19752960 | consumed tokens: 40454062080 | elapsed time per iteration (s): 0.56 | learning rate: 6.506E-05 | global batch size: 256 | lm loss: 2.630954E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.574 | TFLOPs: 43.24 | +7: iteration 77170/ 115203 | consumed samples: 19755520 | consumed tokens: 40459304960 | elapsed time per iteration (s): 0.56 | learning rate: 6.504E-05 | global batch size: 256 | lm loss: 2.632648E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.234 | TFLOPs: 43.40 | +7: iteration 77180/ 115203 | consumed samples: 19758080 | consumed tokens: 40464547840 | elapsed time per iteration (s): 0.55 | learning rate: 6.501E-05 | global batch size: 256 | lm loss: 2.627374E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 77190/ 115203 | consumed samples: 19760640 | consumed tokens: 40469790720 | elapsed time per iteration (s): 0.57 | learning rate: 6.499E-05 | global batch size: 256 | lm loss: 2.625592E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.382 | TFLOPs: 42.75 | +7: iteration 77200/ 115203 | consumed samples: 19763200 | consumed tokens: 40475033600 | elapsed time per iteration (s): 0.57 | learning rate: 6.497E-05 | global batch size: 256 | lm loss: 2.626771E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.403 | TFLOPs: 42.56 | +7: iteration 77210/ 115203 | consumed samples: 19765760 | consumed tokens: 40480276480 | elapsed time per iteration (s): 0.58 | learning rate: 6.495E-05 | global batch size: 256 | lm loss: 2.623851E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.851 | TFLOPs: 42.22 | +7: iteration 77220/ 115203 | consumed samples: 19768320 | consumed tokens: 40485519360 | elapsed time per iteration (s): 0.57 | learning rate: 6.493E-05 | global batch size: 256 | lm loss: 2.633521E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.944 | TFLOPs: 42.71 | +7: iteration 77230/ 115203 | consumed samples: 19770880 | consumed tokens: 40490762240 | elapsed time per iteration (s): 0.57 | learning rate: 6.491E-05 | global batch size: 256 | lm loss: 2.620550E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.101 | TFLOPs: 42.63 | +7: iteration 77240/ 115203 | consumed samples: 19773440 | consumed tokens: 40496005120 | elapsed time per iteration (s): 0.56 | learning rate: 6.489E-05 | global batch size: 256 | lm loss: 2.614922E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.048 | TFLOPs: 43.29 | +7: iteration 77250/ 115203 | consumed samples: 19776000 | consumed tokens: 40501248000 | elapsed time per iteration (s): 0.56 | learning rate: 6.486E-05 | global batch size: 256 | lm loss: 2.632735E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.168 | TFLOPs: 43.30 | +7: iteration 77260/ 115203 | consumed samples: 19778560 | consumed tokens: 40506490880 | elapsed time per iteration (s): 0.58 | learning rate: 6.484E-05 | global batch size: 256 | lm loss: 2.622104E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.636 | TFLOPs: 42.39 | +7: iteration 77270/ 115203 | consumed samples: 19781120 | consumed tokens: 40511733760 | elapsed time per iteration (s): 0.56 | learning rate: 6.482E-05 | global batch size: 256 | lm loss: 2.639789E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.121 | TFLOPs: 43.77 | +7: iteration 77280/ 115203 | consumed samples: 19783680 | consumed tokens: 40516976640 | elapsed time per iteration (s): 0.56 | learning rate: 6.480E-05 | global batch size: 256 | lm loss: 2.633098E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.374 | TFLOPs: 43.51 | +7: iteration 77290/ 115203 | consumed samples: 19786240 | consumed tokens: 40522219520 | elapsed time per iteration (s): 0.57 | learning rate: 6.478E-05 | global batch size: 256 | lm loss: 2.617207E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.502 | TFLOPs: 42.47 | +7: iteration 77300/ 115203 | consumed samples: 19788800 | consumed tokens: 40527462400 | elapsed time per iteration (s): 0.58 | learning rate: 6.476E-05 | global batch size: 256 | lm loss: 2.616706E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.193 | TFLOPs: 42.06 | +7: iteration 77310/ 115203 | consumed samples: 19791360 | consumed tokens: 40532705280 | elapsed time per iteration (s): 0.59 | learning rate: 6.474E-05 | global batch size: 256 | lm loss: 2.615684E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.700 | TFLOPs: 41.54 | +7: iteration 77320/ 115203 | consumed samples: 19793920 | consumed tokens: 40537948160 | elapsed time per iteration (s): 0.57 | learning rate: 6.471E-05 | global batch size: 256 | lm loss: 2.632948E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.171 | TFLOPs: 43.11 | +7: iteration 77330/ 115203 | consumed samples: 19796480 | consumed tokens: 40543191040 | elapsed time per iteration (s): 0.57 | learning rate: 6.469E-05 | global batch size: 256 | lm loss: 2.647961E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.777 | TFLOPs: 42.98 | +7: iteration 77340/ 115203 | consumed samples: 19799040 | consumed tokens: 40548433920 | elapsed time per iteration (s): 0.56 | learning rate: 6.467E-05 | global batch size: 256 | lm loss: 2.617742E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.883 | TFLOPs: 43.27 | +7: iteration 77350/ 115203 | consumed samples: 19801600 | consumed tokens: 40553676800 | elapsed time per iteration (s): 0.57 | learning rate: 6.465E-05 | global batch size: 256 | lm loss: 2.626019E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.667 | TFLOPs: 42.49 | +7: iteration 77360/ 115203 | consumed samples: 19804160 | consumed tokens: 40558919680 | elapsed time per iteration (s): 0.58 | learning rate: 6.463E-05 | global batch size: 256 | lm loss: 2.626816E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.643 | TFLOPs: 41.92 | +7: iteration 77370/ 115203 | consumed samples: 19806720 | consumed tokens: 40564162560 | elapsed time per iteration (s): 0.58 | learning rate: 6.461E-05 | global batch size: 256 | lm loss: 2.624760E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.200 | TFLOPs: 41.87 | +7: iteration 77380/ 115203 | consumed samples: 19809280 | consumed tokens: 40569405440 | elapsed time per iteration (s): 0.56 | learning rate: 6.459E-05 | global batch size: 256 | lm loss: 2.633564E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.046 | TFLOPs: 43.38 | +7: iteration 77390/ 115203 | consumed samples: 19811840 | consumed tokens: 40574648320 | elapsed time per iteration (s): 0.58 | learning rate: 6.456E-05 | global batch size: 256 | lm loss: 2.643153E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.693 | TFLOPs: 41.92 | +7: iteration 77400/ 115203 | consumed samples: 19814400 | consumed tokens: 40579891200 | elapsed time per iteration (s): 0.57 | learning rate: 6.454E-05 | global batch size: 256 | lm loss: 2.622027E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.149 | TFLOPs: 42.63 | +7: iteration 77410/ 115203 | consumed samples: 19816960 | consumed tokens: 40585134080 | elapsed time per iteration (s): 0.57 | learning rate: 6.452E-05 | global batch size: 256 | lm loss: 2.628905E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.491 | TFLOPs: 43.04 | +7: iteration 77420/ 115203 | consumed samples: 19819520 | consumed tokens: 40590376960 | elapsed time per iteration (s): 0.58 | learning rate: 6.450E-05 | global batch size: 256 | lm loss: 2.631909E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.938 | TFLOPs: 42.42 | +7: iteration 77430/ 115203 | consumed samples: 19822080 | consumed tokens: 40595619840 | elapsed time per iteration (s): 0.58 | learning rate: 6.448E-05 | global batch size: 256 | lm loss: 2.615870E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.187 | TFLOPs: 42.44 | +7: iteration 77440/ 115203 | consumed samples: 19824640 | consumed tokens: 40600862720 | elapsed time per iteration (s): 0.56 | learning rate: 6.446E-05 | global batch size: 256 | lm loss: 2.618349E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.597 | TFLOPs: 43.72 | +7: iteration 77450/ 115203 | consumed samples: 19827200 | consumed tokens: 40606105600 | elapsed time per iteration (s): 0.57 | learning rate: 6.444E-05 | global batch size: 256 | lm loss: 2.625199E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.928 | TFLOPs: 42.51 | +7: iteration 77460/ 115203 | consumed samples: 19829760 | consumed tokens: 40611348480 | elapsed time per iteration (s): 0.57 | learning rate: 6.441E-05 | global batch size: 256 | lm loss: 2.622909E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.058 | TFLOPs: 43.10 | +7: iteration 77470/ 115203 | consumed samples: 19832320 | consumed tokens: 40616591360 | elapsed time per iteration (s): 0.57 | learning rate: 6.439E-05 | global batch size: 256 | lm loss: 2.622569E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.294 | TFLOPs: 43.03 | +7: iteration 77480/ 115203 | consumed samples: 19834880 | consumed tokens: 40621834240 | elapsed time per iteration (s): 0.57 | learning rate: 6.437E-05 | global batch size: 256 | lm loss: 2.614205E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.035 | TFLOPs: 42.91 | +7: iteration 77490/ 115203 | consumed samples: 19837440 | consumed tokens: 40627077120 | elapsed time per iteration (s): 0.56 | learning rate: 6.435E-05 | global batch size: 256 | lm loss: 2.627100E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.665 | TFLOPs: 43.54 | +7: iteration 77500/ 115203 | consumed samples: 19840000 | consumed tokens: 40632320000 | elapsed time per iteration (s): 0.57 | learning rate: 6.433E-05 | global batch size: 256 | lm loss: 2.623561E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.315 | TFLOPs: 43.12 | +7: iteration 77510/ 115203 | consumed samples: 19842560 | consumed tokens: 40637562880 | elapsed time per iteration (s): 0.56 | learning rate: 6.431E-05 | global batch size: 256 | lm loss: 2.628534E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.540 | TFLOPs: 43.24 | +7: iteration 77520/ 115203 | consumed samples: 19845120 | consumed tokens: 40642805760 | elapsed time per iteration (s): 0.56 | learning rate: 6.429E-05 | global batch size: 256 | lm loss: 2.637063E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.574 | TFLOPs: 43.34 | +7: iteration 77530/ 115203 | consumed samples: 19847680 | consumed tokens: 40648048640 | elapsed time per iteration (s): 0.56 | learning rate: 6.426E-05 | global batch size: 256 | lm loss: 2.627386E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.942 | TFLOPs: 43.66 | +7: iteration 77540/ 115203 | consumed samples: 19850240 | consumed tokens: 40653291520 | elapsed time per iteration (s): 0.57 | learning rate: 6.424E-05 | global batch size: 256 | lm loss: 2.624622E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.029 | TFLOPs: 42.91 | +7: iteration 77550/ 115203 | consumed samples: 19852800 | consumed tokens: 40658534400 | elapsed time per iteration (s): 0.58 | learning rate: 6.422E-05 | global batch size: 256 | lm loss: 2.641963E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.744 | TFLOPs: 41.92 | +7: iteration 77560/ 115203 | consumed samples: 19855360 | consumed tokens: 40663777280 | elapsed time per iteration (s): 0.56 | learning rate: 6.420E-05 | global batch size: 256 | lm loss: 2.638705E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.374 | TFLOPs: 43.51 | +7: iteration 77570/ 115203 | consumed samples: 19857920 | consumed tokens: 40669020160 | elapsed time per iteration (s): 0.56 | learning rate: 6.418E-05 | global batch size: 256 | lm loss: 2.625839E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.839 | TFLOPs: 43.46 | +7: iteration 77580/ 115203 | consumed samples: 19860480 | consumed tokens: 40674263040 | elapsed time per iteration (s): 0.56 | learning rate: 6.416E-05 | global batch size: 256 | lm loss: 2.614967E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.330 | TFLOPs: 43.51 | +7: iteration 77590/ 115203 | consumed samples: 19863040 | consumed tokens: 40679505920 | elapsed time per iteration (s): 0.56 | learning rate: 6.414E-05 | global batch size: 256 | lm loss: 2.647217E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.527 | TFLOPs: 43.62 | +7: iteration 77600/ 115203 | consumed samples: 19865600 | consumed tokens: 40684748800 | elapsed time per iteration (s): 0.56 | learning rate: 6.412E-05 | global batch size: 256 | lm loss: 2.616790E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.461 | TFLOPs: 43.61 | +7: iteration 77610/ 115203 | consumed samples: 19868160 | consumed tokens: 40689991680 | elapsed time per iteration (s): 0.56 | learning rate: 6.409E-05 | global batch size: 256 | lm loss: 2.623382E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.060 | TFLOPs: 43.29 | +7: iteration 77620/ 115203 | consumed samples: 19870720 | consumed tokens: 40695234560 | elapsed time per iteration (s): 0.55 | learning rate: 6.407E-05 | global batch size: 256 | lm loss: 2.629890E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 77630/ 115203 | consumed samples: 19873280 | consumed tokens: 40700477440 | elapsed time per iteration (s): 0.56 | learning rate: 6.405E-05 | global batch size: 256 | lm loss: 2.607723E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.925 | TFLOPs: 43.66 | +7: iteration 77640/ 115203 | consumed samples: 19875840 | consumed tokens: 40705720320 | elapsed time per iteration (s): 0.57 | learning rate: 6.403E-05 | global batch size: 256 | lm loss: 2.630831E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.901 | TFLOPs: 42.51 | +7: iteration 77650/ 115203 | consumed samples: 19878400 | consumed tokens: 40710963200 | elapsed time per iteration (s): 0.56 | learning rate: 6.401E-05 | global batch size: 256 | lm loss: 2.627715E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.571 | TFLOPs: 43.24 | +7: iteration 77660/ 115203 | consumed samples: 19880960 | consumed tokens: 40716206080 | elapsed time per iteration (s): 0.56 | learning rate: 6.399E-05 | global batch size: 256 | lm loss: 2.626425E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.129 | TFLOPs: 43.58 | +7: iteration 77670/ 115203 | consumed samples: 19883520 | consumed tokens: 40721448960 | elapsed time per iteration (s): 0.57 | learning rate: 6.397E-05 | global batch size: 256 | lm loss: 2.618989E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.358 | TFLOPs: 43.03 | +7: iteration 77680/ 115203 | consumed samples: 19886080 | consumed tokens: 40726691840 | elapsed time per iteration (s): 0.56 | learning rate: 6.394E-05 | global batch size: 256 | lm loss: 2.613466E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.894 | TFLOPs: 43.66 | +7: iteration 77690/ 115203 | consumed samples: 19888640 | consumed tokens: 40731934720 | elapsed time per iteration (s): 0.57 | learning rate: 6.392E-05 | global batch size: 256 | lm loss: 2.632631E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.668 | TFLOPs: 42.97 | +7: iteration 77700/ 115203 | consumed samples: 19891200 | consumed tokens: 40737177600 | elapsed time per iteration (s): 0.56 | learning rate: 6.390E-05 | global batch size: 256 | lm loss: 2.626733E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.510 | TFLOPs: 43.62 | +7: iteration 77710/ 115203 | consumed samples: 19893760 | consumed tokens: 40742420480 | elapsed time per iteration (s): 0.56 | learning rate: 6.388E-05 | global batch size: 256 | lm loss: 2.641179E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.334 | TFLOPs: 43.22 | +7: iteration 77720/ 115203 | consumed samples: 19896320 | consumed tokens: 40747663360 | elapsed time per iteration (s): 0.56 | learning rate: 6.386E-05 | global batch size: 256 | lm loss: 2.630762E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.323 | TFLOPs: 43.31 | +7: iteration 77730/ 115203 | consumed samples: 19898880 | consumed tokens: 40752906240 | elapsed time per iteration (s): 0.57 | learning rate: 6.384E-05 | global batch size: 256 | lm loss: 2.636254E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.727 | TFLOPs: 42.88 | +7: iteration 77740/ 115203 | consumed samples: 19901440 | consumed tokens: 40758149120 | elapsed time per iteration (s): 0.56 | learning rate: 6.382E-05 | global batch size: 256 | lm loss: 2.627954E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.876 | TFLOPs: 43.84 | +7: iteration 77750/ 115203 | consumed samples: 19904000 | consumed tokens: 40763392000 | elapsed time per iteration (s): 0.59 | learning rate: 6.380E-05 | global batch size: 256 | lm loss: 2.638389E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.261 | TFLOPs: 41.02 | +7: iteration 77760/ 115203 | consumed samples: 19906560 | consumed tokens: 40768634880 | elapsed time per iteration (s): 0.56 | learning rate: 6.377E-05 | global batch size: 256 | lm loss: 2.614793E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.140 | TFLOPs: 43.20 | +7: iteration 77770/ 115203 | consumed samples: 19909120 | consumed tokens: 40773877760 | elapsed time per iteration (s): 0.57 | learning rate: 6.375E-05 | global batch size: 256 | lm loss: 2.619089E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.927 | TFLOPs: 42.99 | +7: iteration 77780/ 115203 | consumed samples: 19911680 | consumed tokens: 40779120640 | elapsed time per iteration (s): 0.56 | learning rate: 6.373E-05 | global batch size: 256 | lm loss: 2.636887E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.831 | TFLOPs: 43.46 | +7: iteration 77790/ 115203 | consumed samples: 19914240 | consumed tokens: 40784363520 | elapsed time per iteration (s): 0.57 | learning rate: 6.371E-05 | global batch size: 256 | lm loss: 2.631005E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.949 | TFLOPs: 42.52 | +7: iteration 77800/ 115203 | consumed samples: 19916800 | consumed tokens: 40789606400 | elapsed time per iteration (s): 0.56 | learning rate: 6.369E-05 | global batch size: 256 | lm loss: 2.636291E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.957 | TFLOPs: 43.85 | +7: iteration 77810/ 115203 | consumed samples: 19919360 | consumed tokens: 40794849280 | elapsed time per iteration (s): 0.57 | learning rate: 6.367E-05 | global batch size: 256 | lm loss: 2.621038E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.185 | TFLOPs: 42.82 | +7: iteration 77820/ 115203 | consumed samples: 19921920 | consumed tokens: 40800092160 | elapsed time per iteration (s): 0.55 | learning rate: 6.365E-05 | global batch size: 256 | lm loss: 2.637166E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.324 | TFLOPs: 43.98 | +7: iteration 77830/ 115203 | consumed samples: 19924480 | consumed tokens: 40805335040 | elapsed time per iteration (s): 0.58 | learning rate: 6.363E-05 | global batch size: 256 | lm loss: 2.640884E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.192 | TFLOPs: 42.35 | +7: iteration 77840/ 115203 | consumed samples: 19927040 | consumed tokens: 40810577920 | elapsed time per iteration (s): 0.56 | learning rate: 6.360E-05 | global batch size: 256 | lm loss: 2.628601E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.875 | TFLOPs: 43.56 | +7: iteration 77850/ 115203 | consumed samples: 19929600 | consumed tokens: 40815820800 | elapsed time per iteration (s): 0.55 | learning rate: 6.358E-05 | global batch size: 256 | lm loss: 2.635601E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.265 | TFLOPs: 43.98 | +7: iteration 77860/ 115203 | consumed samples: 19932160 | consumed tokens: 40821063680 | elapsed time per iteration (s): 0.56 | learning rate: 6.356E-05 | global batch size: 256 | lm loss: 2.613168E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.447 | TFLOPs: 43.71 | +7: iteration 77870/ 115203 | consumed samples: 19934720 | consumed tokens: 40826306560 | elapsed time per iteration (s): 0.56 | learning rate: 6.354E-05 | global batch size: 256 | lm loss: 2.619890E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.451 | TFLOPs: 43.42 | +7: iteration 77880/ 115203 | consumed samples: 19937280 | consumed tokens: 40831549440 | elapsed time per iteration (s): 0.56 | learning rate: 6.352E-05 | global batch size: 256 | lm loss: 2.624615E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.239 | TFLOPs: 43.50 | +7: iteration 77890/ 115203 | consumed samples: 19939840 | consumed tokens: 40836792320 | elapsed time per iteration (s): 0.56 | learning rate: 6.350E-05 | global batch size: 256 | lm loss: 2.627948E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.168 | TFLOPs: 43.40 | +7: iteration 77900/ 115203 | consumed samples: 19942400 | consumed tokens: 40842035200 | elapsed time per iteration (s): 0.57 | learning rate: 6.348E-05 | global batch size: 256 | lm loss: 2.629001E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.743 | TFLOPs: 42.88 | +7: iteration 77910/ 115203 | consumed samples: 19944960 | consumed tokens: 40847278080 | elapsed time per iteration (s): 0.57 | learning rate: 6.346E-05 | global batch size: 256 | lm loss: 2.628208E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.643 | TFLOPs: 42.96 | +7: iteration 77920/ 115203 | consumed samples: 19947520 | consumed tokens: 40852520960 | elapsed time per iteration (s): 0.55 | learning rate: 6.343E-05 | global batch size: 256 | lm loss: 2.625328E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 77930/ 115203 | consumed samples: 19950080 | consumed tokens: 40857763840 | elapsed time per iteration (s): 0.56 | learning rate: 6.341E-05 | global batch size: 256 | lm loss: 2.619125E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.993 | TFLOPs: 43.66 | +7: iteration 77940/ 115203 | consumed samples: 19952640 | consumed tokens: 40863006720 | elapsed time per iteration (s): 0.56 | learning rate: 6.339E-05 | global batch size: 256 | lm loss: 2.631364E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.722 | TFLOPs: 43.54 | +7: iteration 77950/ 115203 | consumed samples: 19955200 | consumed tokens: 40868249600 | elapsed time per iteration (s): 0.56 | learning rate: 6.337E-05 | global batch size: 256 | lm loss: 2.615613E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.054 | TFLOPs: 43.38 | +7: iteration 77960/ 115203 | consumed samples: 19957760 | consumed tokens: 40873492480 | elapsed time per iteration (s): 0.56 | learning rate: 6.335E-05 | global batch size: 256 | lm loss: 2.624257E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.214 | TFLOPs: 43.30 | +7: iteration 77970/ 115203 | consumed samples: 19960320 | consumed tokens: 40878735360 | elapsed time per iteration (s): 0.56 | learning rate: 6.333E-05 | global batch size: 256 | lm loss: 2.626064E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.386 | TFLOPs: 43.70 | +7: iteration 77980/ 115203 | consumed samples: 19962880 | consumed tokens: 40883978240 | elapsed time per iteration (s): 0.57 | learning rate: 6.331E-05 | global batch size: 256 | lm loss: 2.620321E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.286 | TFLOPs: 43.12 | +7: iteration 77990/ 115203 | consumed samples: 19965440 | consumed tokens: 40889221120 | elapsed time per iteration (s): 0.56 | learning rate: 6.329E-05 | global batch size: 256 | lm loss: 2.626252E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.561 | TFLOPs: 43.72 | +0: [2023-03-17 01:03:49,835] [INFO] [logging.py:68:log_dist] [Rank 0] step=78000, skipped=0, lr=[6.326508628233516e-05, 6.326508628233516e-05, 6.326508628233516e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 78000/ 115203 | consumed samples: 19968000 | consumed tokens: 40894464000 | elapsed time per iteration (s): 0.56 | learning rate: 6.327E-05 | global batch size: 256 | lm loss: 2.607902E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.143 | TFLOPs: 43.39 | +0: steps: 78000 loss: 2.5744 iter time (s): 0.558 samples/sec: 458.642 +7: iteration 78010/ 115203 | consumed samples: 19970560 | consumed tokens: 40899706880 | elapsed time per iteration (s): 0.56 | learning rate: 6.324E-05 | global batch size: 256 | lm loss: 2.625783E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.591 | TFLOPs: 43.82 | +7: iteration 78020/ 115203 | consumed samples: 19973120 | consumed tokens: 40904949760 | elapsed time per iteration (s): 0.57 | learning rate: 6.322E-05 | global batch size: 256 | lm loss: 2.617634E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.569 | TFLOPs: 42.96 | +7: iteration 78030/ 115203 | consumed samples: 19975680 | consumed tokens: 40910192640 | elapsed time per iteration (s): 0.56 | learning rate: 6.320E-05 | global batch size: 256 | lm loss: 2.634199E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.914 | TFLOPs: 43.66 | +7: iteration 78040/ 115203 | consumed samples: 19978240 | consumed tokens: 40915435520 | elapsed time per iteration (s): 0.56 | learning rate: 6.318E-05 | global batch size: 256 | lm loss: 2.621337E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.794 | TFLOPs: 43.45 | +7: iteration 78050/ 115203 | consumed samples: 19980800 | consumed tokens: 40920678400 | elapsed time per iteration (s): 0.56 | learning rate: 6.316E-05 | global batch size: 256 | lm loss: 2.613979E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.200 | TFLOPs: 43.97 | +7: iteration 78060/ 115203 | consumed samples: 19983360 | consumed tokens: 40925921280 | elapsed time per iteration (s): 0.60 | learning rate: 6.314E-05 | global batch size: 256 | lm loss: 2.624841E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.364 | TFLOPs: 40.74 | +7: iteration 78070/ 115203 | consumed samples: 19985920 | consumed tokens: 40931164160 | elapsed time per iteration (s): 0.55 | learning rate: 6.312E-05 | global batch size: 256 | lm loss: 2.630956E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.304 | TFLOPs: 43.98 | +7: iteration 78080/ 115203 | consumed samples: 19988480 | consumed tokens: 40936407040 | elapsed time per iteration (s): 0.56 | learning rate: 6.310E-05 | global batch size: 256 | lm loss: 2.628988E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.876 | TFLOPs: 43.94 | +7: iteration 78090/ 115203 | consumed samples: 19991040 | consumed tokens: 40941649920 | elapsed time per iteration (s): 0.56 | learning rate: 6.307E-05 | global batch size: 256 | lm loss: 2.629998E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.116 | TFLOPs: 43.39 | +7: iteration 78100/ 115203 | consumed samples: 19993600 | consumed tokens: 40946892800 | elapsed time per iteration (s): 0.56 | learning rate: 6.305E-05 | global batch size: 256 | lm loss: 2.636153E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.834 | TFLOPs: 43.27 | +7: iteration 78110/ 115203 | consumed samples: 19996160 | consumed tokens: 40952135680 | elapsed time per iteration (s): 0.56 | learning rate: 6.303E-05 | global batch size: 256 | lm loss: 2.634719E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.097 | TFLOPs: 43.87 | +7: iteration 78120/ 115203 | consumed samples: 19998720 | consumed tokens: 40957378560 | elapsed time per iteration (s): 0.56 | learning rate: 6.301E-05 | global batch size: 256 | lm loss: 2.626894E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.481 | TFLOPs: 43.71 | +7: iteration 78130/ 115203 | consumed samples: 20001280 | consumed tokens: 40962621440 | elapsed time per iteration (s): 0.55 | learning rate: 6.299E-05 | global batch size: 256 | lm loss: 2.611146E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 78140/ 115203 | consumed samples: 20003840 | consumed tokens: 40967864320 | elapsed time per iteration (s): 0.56 | learning rate: 6.297E-05 | global batch size: 256 | lm loss: 2.608330E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.355 | TFLOPs: 43.70 | +7: iteration 78150/ 115203 | consumed samples: 20006400 | consumed tokens: 40973107200 | elapsed time per iteration (s): 0.56 | learning rate: 6.295E-05 | global batch size: 256 | lm loss: 2.633076E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.178 | TFLOPs: 43.40 | +7: iteration 78160/ 115203 | consumed samples: 20008960 | consumed tokens: 40978350080 | elapsed time per iteration (s): 0.55 | learning rate: 6.293E-05 | global batch size: 256 | lm loss: 2.629172E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 78170/ 115203 | consumed samples: 20011520 | consumed tokens: 40983592960 | elapsed time per iteration (s): 0.56 | learning rate: 6.291E-05 | global batch size: 256 | lm loss: 2.633178E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.281 | TFLOPs: 43.60 | +7: iteration 78180/ 115203 | consumed samples: 20014080 | consumed tokens: 40988835840 | elapsed time per iteration (s): 0.56 | learning rate: 6.288E-05 | global batch size: 256 | lm loss: 2.625614E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.108 | TFLOPs: 43.58 | +7: iteration 78190/ 115203 | consumed samples: 20016640 | consumed tokens: 40994078720 | elapsed time per iteration (s): 0.57 | learning rate: 6.286E-05 | global batch size: 256 | lm loss: 2.628995E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.955 | TFLOPs: 42.99 | +7: iteration 78200/ 115203 | consumed samples: 20019200 | consumed tokens: 40999321600 | elapsed time per iteration (s): 0.56 | learning rate: 6.284E-05 | global batch size: 256 | lm loss: 2.621900E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.865 | TFLOPs: 43.56 | +7: iteration 78210/ 115203 | consumed samples: 20021760 | consumed tokens: 41004564480 | elapsed time per iteration (s): 0.57 | learning rate: 6.282E-05 | global batch size: 256 | lm loss: 2.624866E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.970 | TFLOPs: 42.71 | +7: iteration 78220/ 115203 | consumed samples: 20024320 | consumed tokens: 41009807360 | elapsed time per iteration (s): 0.56 | learning rate: 6.280E-05 | global batch size: 256 | lm loss: 2.603447E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.948 | TFLOPs: 43.47 | +7: iteration 78230/ 115203 | consumed samples: 20026880 | consumed tokens: 41015050240 | elapsed time per iteration (s): 0.56 | learning rate: 6.278E-05 | global batch size: 256 | lm loss: 2.626332E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.022 | TFLOPs: 43.48 | +7: iteration 78240/ 115203 | consumed samples: 20029440 | consumed tokens: 41020293120 | elapsed time per iteration (s): 0.56 | learning rate: 6.276E-05 | global batch size: 256 | lm loss: 2.616091E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.302 | TFLOPs: 43.60 | +7: iteration 78250/ 115203 | consumed samples: 20032000 | consumed tokens: 41025536000 | elapsed time per iteration (s): 0.56 | learning rate: 6.274E-05 | global batch size: 256 | lm loss: 2.632513E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.538 | TFLOPs: 43.53 | +7: iteration 78260/ 115203 | consumed samples: 20034560 | consumed tokens: 41030778880 | elapsed time per iteration (s): 0.56 | learning rate: 6.272E-05 | global batch size: 256 | lm loss: 2.623325E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.800 | TFLOPs: 43.55 | +7: iteration 78270/ 115203 | consumed samples: 20037120 | consumed tokens: 41036021760 | elapsed time per iteration (s): 0.56 | learning rate: 6.269E-05 | global batch size: 256 | lm loss: 2.638413E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.203 | TFLOPs: 43.49 | +7: iteration 78280/ 115203 | consumed samples: 20039680 | consumed tokens: 41041264640 | elapsed time per iteration (s): 0.56 | learning rate: 6.267E-05 | global batch size: 256 | lm loss: 2.630133E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.872 | TFLOPs: 43.84 | +7: iteration 78290/ 115203 | consumed samples: 20042240 | consumed tokens: 41046507520 | elapsed time per iteration (s): 0.56 | learning rate: 6.265E-05 | global batch size: 256 | lm loss: 2.625584E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.968 | TFLOPs: 43.57 | +7: iteration 78300/ 115203 | consumed samples: 20044800 | consumed tokens: 41051750400 | elapsed time per iteration (s): 0.56 | learning rate: 6.263E-05 | global batch size: 256 | lm loss: 2.625251E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.638 | TFLOPs: 43.73 | +7: iteration 78310/ 115203 | consumed samples: 20047360 | consumed tokens: 41056993280 | elapsed time per iteration (s): 0.56 | learning rate: 6.261E-05 | global batch size: 256 | lm loss: 2.625577E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.519 | TFLOPs: 43.52 | +7: iteration 78320/ 115203 | consumed samples: 20049920 | consumed tokens: 41062236160 | elapsed time per iteration (s): 0.56 | learning rate: 6.259E-05 | global batch size: 256 | lm loss: 2.616553E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.778 | TFLOPs: 43.55 | +7: iteration 78330/ 115203 | consumed samples: 20052480 | consumed tokens: 41067479040 | elapsed time per iteration (s): 0.56 | learning rate: 6.257E-05 | global batch size: 256 | lm loss: 2.625204E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.160 | TFLOPs: 43.39 | +7: iteration 78340/ 115203 | consumed samples: 20055040 | consumed tokens: 41072721920 | elapsed time per iteration (s): 0.56 | learning rate: 6.255E-05 | global batch size: 256 | lm loss: 2.621335E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.416 | TFLOPs: 43.51 | +7: iteration 78350/ 115203 | consumed samples: 20057600 | consumed tokens: 41077964800 | elapsed time per iteration (s): 0.56 | learning rate: 6.253E-05 | global batch size: 256 | lm loss: 2.623558E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.149 | TFLOPs: 43.39 | +7: iteration 78360/ 115203 | consumed samples: 20060160 | consumed tokens: 41083207680 | elapsed time per iteration (s): 0.57 | learning rate: 6.250E-05 | global batch size: 256 | lm loss: 2.622703E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.045 | TFLOPs: 43.00 | +7: iteration 78370/ 115203 | consumed samples: 20062720 | consumed tokens: 41088450560 | elapsed time per iteration (s): 0.55 | learning rate: 6.248E-05 | global batch size: 256 | lm loss: 2.627113E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.294 | TFLOPs: 43.98 | +7: iteration 78380/ 115203 | consumed samples: 20065280 | consumed tokens: 41093693440 | elapsed time per iteration (s): 0.56 | learning rate: 6.246E-05 | global batch size: 256 | lm loss: 2.623679E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.359 | TFLOPs: 43.70 | +7: iteration 78390/ 115203 | consumed samples: 20067840 | consumed tokens: 41098936320 | elapsed time per iteration (s): 0.57 | learning rate: 6.244E-05 | global batch size: 256 | lm loss: 2.636540E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.831 | TFLOPs: 43.17 | +7: iteration 78400/ 115203 | consumed samples: 20070400 | consumed tokens: 41104179200 | elapsed time per iteration (s): 0.56 | learning rate: 6.242E-05 | global batch size: 256 | lm loss: 2.638542E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.099 | TFLOPs: 43.67 | +7: iteration 78410/ 115203 | consumed samples: 20072960 | consumed tokens: 41109422080 | elapsed time per iteration (s): 0.55 | learning rate: 6.240E-05 | global batch size: 256 | lm loss: 2.633549E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.316 | TFLOPs: 43.98 | +7: iteration 78420/ 115203 | consumed samples: 20075520 | consumed tokens: 41114664960 | elapsed time per iteration (s): 0.57 | learning rate: 6.238E-05 | global batch size: 256 | lm loss: 2.619769E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.116 | TFLOPs: 43.10 | +7: iteration 78430/ 115203 | consumed samples: 20078080 | consumed tokens: 41119907840 | elapsed time per iteration (s): 0.56 | learning rate: 6.236E-05 | global batch size: 256 | lm loss: 2.632610E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.740 | TFLOPs: 43.26 | +7: iteration 78440/ 115203 | consumed samples: 20080640 | consumed tokens: 41125150720 | elapsed time per iteration (s): 0.57 | learning rate: 6.234E-05 | global batch size: 256 | lm loss: 2.627803E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.412 | TFLOPs: 42.47 | +7: iteration 78450/ 115203 | consumed samples: 20083200 | consumed tokens: 41130393600 | elapsed time per iteration (s): 0.57 | learning rate: 6.232E-05 | global batch size: 256 | lm loss: 2.614649E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.922 | TFLOPs: 42.90 | +7: iteration 78460/ 115203 | consumed samples: 20085760 | consumed tokens: 41135636480 | elapsed time per iteration (s): 0.55 | learning rate: 6.229E-05 | global batch size: 256 | lm loss: 2.621866E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.597 | TFLOPs: 44.01 | +7: iteration 78470/ 115203 | consumed samples: 20088320 | consumed tokens: 41140879360 | elapsed time per iteration (s): 0.56 | learning rate: 6.227E-05 | global batch size: 256 | lm loss: 2.608142E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.349 | TFLOPs: 43.60 | +7: iteration 78480/ 115203 | consumed samples: 20090880 | consumed tokens: 41146122240 | elapsed time per iteration (s): 0.58 | learning rate: 6.225E-05 | global batch size: 256 | lm loss: 2.625606E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.336 | TFLOPs: 42.27 | +7: iteration 78490/ 115203 | consumed samples: 20093440 | consumed tokens: 41151365120 | elapsed time per iteration (s): 0.55 | learning rate: 6.223E-05 | global batch size: 256 | lm loss: 2.624318E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.524 | TFLOPs: 44.00 | +7: iteration 78500/ 115203 | consumed samples: 20096000 | consumed tokens: 41156608000 | elapsed time per iteration (s): 0.56 | learning rate: 6.221E-05 | global batch size: 256 | lm loss: 2.610439E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.189 | TFLOPs: 43.97 | +7: iteration 78510/ 115203 | consumed samples: 20098560 | consumed tokens: 41161850880 | elapsed time per iteration (s): 0.55 | learning rate: 6.219E-05 | global batch size: 256 | lm loss: 2.634656E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.372 | TFLOPs: 43.99 | +7: iteration 78520/ 115203 | consumed samples: 20101120 | consumed tokens: 41167093760 | elapsed time per iteration (s): 0.57 | learning rate: 6.217E-05 | global batch size: 256 | lm loss: 2.618207E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.088 | TFLOPs: 42.72 | +7: iteration 78530/ 115203 | consumed samples: 20103680 | consumed tokens: 41172336640 | elapsed time per iteration (s): 0.56 | learning rate: 6.215E-05 | global batch size: 256 | lm loss: 2.628204E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.529 | TFLOPs: 43.43 | +7: iteration 78540/ 115203 | consumed samples: 20106240 | consumed tokens: 41177579520 | elapsed time per iteration (s): 0.56 | learning rate: 6.213E-05 | global batch size: 256 | lm loss: 2.620245E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.562 | TFLOPs: 43.72 | +7: iteration 78550/ 115203 | consumed samples: 20108800 | consumed tokens: 41182822400 | elapsed time per iteration (s): 0.56 | learning rate: 6.211E-05 | global batch size: 256 | lm loss: 2.618476E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.453 | TFLOPs: 43.71 | +7: iteration 78560/ 115203 | consumed samples: 20111360 | consumed tokens: 41188065280 | elapsed time per iteration (s): 0.56 | learning rate: 6.208E-05 | global batch size: 256 | lm loss: 2.617952E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.739 | TFLOPs: 43.26 | +7: iteration 78570/ 115203 | consumed samples: 20113920 | consumed tokens: 41193308160 | elapsed time per iteration (s): 0.57 | learning rate: 6.206E-05 | global batch size: 256 | lm loss: 2.638605E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.383 | TFLOPs: 42.94 | +7: iteration 78580/ 115203 | consumed samples: 20116480 | consumed tokens: 41198551040 | elapsed time per iteration (s): 0.55 | learning rate: 6.204E-05 | global batch size: 256 | lm loss: 2.625366E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.324 | TFLOPs: 43.98 | +7: iteration 78590/ 115203 | consumed samples: 20119040 | consumed tokens: 41203793920 | elapsed time per iteration (s): 0.56 | learning rate: 6.202E-05 | global batch size: 256 | lm loss: 2.635025E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.984 | TFLOPs: 43.47 | +7: iteration 78600/ 115203 | consumed samples: 20121600 | consumed tokens: 41209036800 | elapsed time per iteration (s): 0.56 | learning rate: 6.200E-05 | global batch size: 256 | lm loss: 2.631919E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.822 | TFLOPs: 43.36 | +7: iteration 78610/ 115203 | consumed samples: 20124160 | consumed tokens: 41214279680 | elapsed time per iteration (s): 0.56 | learning rate: 6.198E-05 | global batch size: 256 | lm loss: 2.624600E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.020 | TFLOPs: 43.48 | +7: iteration 78620/ 115203 | consumed samples: 20126720 | consumed tokens: 41219522560 | elapsed time per iteration (s): 0.56 | learning rate: 6.196E-05 | global batch size: 256 | lm loss: 2.629699E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.376 | TFLOPs: 43.89 | +7: iteration 78630/ 115203 | consumed samples: 20129280 | consumed tokens: 41224765440 | elapsed time per iteration (s): 0.56 | learning rate: 6.194E-05 | global batch size: 256 | lm loss: 2.611976E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.304 | TFLOPs: 43.31 | +7: iteration 78640/ 115203 | consumed samples: 20131840 | consumed tokens: 41230008320 | elapsed time per iteration (s): 0.56 | learning rate: 6.192E-05 | global batch size: 256 | lm loss: 2.627142E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.017 | TFLOPs: 43.48 | +7: iteration 78650/ 115203 | consumed samples: 20134400 | consumed tokens: 41235251200 | elapsed time per iteration (s): 0.56 | learning rate: 6.190E-05 | global batch size: 256 | lm loss: 2.623018E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.893 | TFLOPs: 43.46 | +7: iteration 78660/ 115203 | consumed samples: 20136960 | consumed tokens: 41240494080 | elapsed time per iteration (s): 0.56 | learning rate: 6.187E-05 | global batch size: 256 | lm loss: 2.609203E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.849 | TFLOPs: 43.94 | +7: iteration 78670/ 115203 | consumed samples: 20139520 | consumed tokens: 41245736960 | elapsed time per iteration (s): 0.56 | learning rate: 6.185E-05 | global batch size: 256 | lm loss: 2.624638E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.806 | TFLOPs: 43.74 | +7: iteration 78680/ 115203 | consumed samples: 20142080 | consumed tokens: 41250979840 | elapsed time per iteration (s): 0.56 | learning rate: 6.183E-05 | global batch size: 256 | lm loss: 2.617264E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.233 | TFLOPs: 43.97 | +7: iteration 78690/ 115203 | consumed samples: 20144640 | consumed tokens: 41256222720 | elapsed time per iteration (s): 0.56 | learning rate: 6.181E-05 | global batch size: 256 | lm loss: 2.626288E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.053 | TFLOPs: 43.96 | +7: iteration 78700/ 115203 | consumed samples: 20147200 | consumed tokens: 41261465600 | elapsed time per iteration (s): 0.56 | learning rate: 6.179E-05 | global batch size: 256 | lm loss: 2.618486E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.955 | TFLOPs: 43.95 | +7: iteration 78710/ 115203 | consumed samples: 20149760 | consumed tokens: 41266708480 | elapsed time per iteration (s): 0.56 | learning rate: 6.177E-05 | global batch size: 256 | lm loss: 2.625070E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.718 | TFLOPs: 43.83 | +7: iteration 78720/ 115203 | consumed samples: 20152320 | consumed tokens: 41271951360 | elapsed time per iteration (s): 0.55 | learning rate: 6.175E-05 | global batch size: 256 | lm loss: 2.625094E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 78730/ 115203 | consumed samples: 20154880 | consumed tokens: 41277194240 | elapsed time per iteration (s): 0.56 | learning rate: 6.173E-05 | global batch size: 256 | lm loss: 2.621214E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.716 | TFLOPs: 43.26 | +7: iteration 78740/ 115203 | consumed samples: 20157440 | consumed tokens: 41282437120 | elapsed time per iteration (s): 0.57 | learning rate: 6.171E-05 | global batch size: 256 | lm loss: 2.630399E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.631 | TFLOPs: 43.15 | +7: iteration 78750/ 115203 | consumed samples: 20160000 | consumed tokens: 41287680000 | elapsed time per iteration (s): 0.57 | learning rate: 6.169E-05 | global batch size: 256 | lm loss: 2.616035E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.688 | TFLOPs: 42.97 | +7: iteration 78760/ 115203 | consumed samples: 20162560 | consumed tokens: 41292922880 | elapsed time per iteration (s): 0.56 | learning rate: 6.167E-05 | global batch size: 256 | lm loss: 2.606067E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.230 | TFLOPs: 43.21 | +7: iteration 78770/ 115203 | consumed samples: 20165120 | consumed tokens: 41298165760 | elapsed time per iteration (s): 0.56 | learning rate: 6.164E-05 | global batch size: 256 | lm loss: 2.625937E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.268 | TFLOPs: 43.69 | +7: iteration 78780/ 115203 | consumed samples: 20167680 | consumed tokens: 41303408640 | elapsed time per iteration (s): 0.56 | learning rate: 6.162E-05 | global batch size: 256 | lm loss: 2.622117E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.561 | TFLOPs: 43.72 | +7: iteration 78790/ 115203 | consumed samples: 20170240 | consumed tokens: 41308651520 | elapsed time per iteration (s): 0.56 | learning rate: 6.160E-05 | global batch size: 256 | lm loss: 2.621853E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.804 | TFLOPs: 43.65 | +7: iteration 78800/ 115203 | consumed samples: 20172800 | consumed tokens: 41313894400 | elapsed time per iteration (s): 0.56 | learning rate: 6.158E-05 | global batch size: 256 | lm loss: 2.627783E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.923 | TFLOPs: 43.56 | +7: iteration 78810/ 115203 | consumed samples: 20175360 | consumed tokens: 41319137280 | elapsed time per iteration (s): 0.55 | learning rate: 6.156E-05 | global batch size: 256 | lm loss: 2.616838E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.288 | TFLOPs: 43.98 | +7: iteration 78820/ 115203 | consumed samples: 20177920 | consumed tokens: 41324380160 | elapsed time per iteration (s): 0.56 | learning rate: 6.154E-05 | global batch size: 256 | lm loss: 2.619789E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.224 | TFLOPs: 43.69 | +7: iteration 78830/ 115203 | consumed samples: 20180480 | consumed tokens: 41329623040 | elapsed time per iteration (s): 0.55 | learning rate: 6.152E-05 | global batch size: 256 | lm loss: 2.628038E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.435 | TFLOPs: 43.99 | +7: iteration 78840/ 115203 | consumed samples: 20183040 | consumed tokens: 41334865920 | elapsed time per iteration (s): 0.56 | learning rate: 6.150E-05 | global batch size: 256 | lm loss: 2.622861E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.152 | TFLOPs: 43.49 | +7: iteration 78850/ 115203 | consumed samples: 20185600 | consumed tokens: 41340108800 | elapsed time per iteration (s): 0.56 | learning rate: 6.148E-05 | global batch size: 256 | lm loss: 2.627601E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.420 | TFLOPs: 43.71 | +7: iteration 78860/ 115203 | consumed samples: 20188160 | consumed tokens: 41345351680 | elapsed time per iteration (s): 0.57 | learning rate: 6.146E-05 | global batch size: 256 | lm loss: 2.633271E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.062 | TFLOPs: 43.19 | +7: iteration 78870/ 115203 | consumed samples: 20190720 | consumed tokens: 41350594560 | elapsed time per iteration (s): 0.56 | learning rate: 6.144E-05 | global batch size: 256 | lm loss: 2.630771E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.841 | TFLOPs: 43.55 | +7: iteration 78880/ 115203 | consumed samples: 20193280 | consumed tokens: 41355837440 | elapsed time per iteration (s): 0.56 | learning rate: 6.141E-05 | global batch size: 256 | lm loss: 2.619426E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.372 | TFLOPs: 43.70 | +7: iteration 78890/ 115203 | consumed samples: 20195840 | consumed tokens: 41361080320 | elapsed time per iteration (s): 0.57 | learning rate: 6.139E-05 | global batch size: 256 | lm loss: 2.622605E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.082 | TFLOPs: 42.72 | +7: iteration 78900/ 115203 | consumed samples: 20198400 | consumed tokens: 41366323200 | elapsed time per iteration (s): 0.55 | learning rate: 6.137E-05 | global batch size: 256 | lm loss: 2.624159E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.364 | TFLOPs: 43.99 | +7: iteration 78910/ 115203 | consumed samples: 20200960 | consumed tokens: 41371566080 | elapsed time per iteration (s): 0.55 | learning rate: 6.135E-05 | global batch size: 256 | lm loss: 2.616319E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 78920/ 115203 | consumed samples: 20203520 | consumed tokens: 41376808960 | elapsed time per iteration (s): 0.57 | learning rate: 6.133E-05 | global batch size: 256 | lm loss: 2.612826E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.146 | TFLOPs: 43.11 | +7: iteration 78930/ 115203 | consumed samples: 20206080 | consumed tokens: 41382051840 | elapsed time per iteration (s): 0.56 | learning rate: 6.131E-05 | global batch size: 256 | lm loss: 2.610860E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.285 | TFLOPs: 43.69 | +7: iteration 78940/ 115203 | consumed samples: 20208640 | consumed tokens: 41387294720 | elapsed time per iteration (s): 0.56 | learning rate: 6.129E-05 | global batch size: 256 | lm loss: 2.622364E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.925 | TFLOPs: 43.47 | +7: iteration 78950/ 115203 | consumed samples: 20211200 | consumed tokens: 41392537600 | elapsed time per iteration (s): 0.56 | learning rate: 6.127E-05 | global batch size: 256 | lm loss: 2.618232E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.150 | TFLOPs: 43.97 | +7: iteration 78960/ 115203 | consumed samples: 20213760 | consumed tokens: 41397780480 | elapsed time per iteration (s): 0.56 | learning rate: 6.125E-05 | global batch size: 256 | lm loss: 2.622136E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.140 | TFLOPs: 43.39 | +7: iteration 78970/ 115203 | consumed samples: 20216320 | consumed tokens: 41403023360 | elapsed time per iteration (s): 0.56 | learning rate: 6.123E-05 | global batch size: 256 | lm loss: 2.626021E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.658 | TFLOPs: 43.54 | +7: iteration 78980/ 115203 | consumed samples: 20218880 | consumed tokens: 41408266240 | elapsed time per iteration (s): 0.56 | learning rate: 6.121E-05 | global batch size: 256 | lm loss: 2.620410E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.984 | TFLOPs: 43.76 | +7: iteration 78990/ 115203 | consumed samples: 20221440 | consumed tokens: 41413509120 | elapsed time per iteration (s): 0.55 | learning rate: 6.119E-05 | global batch size: 256 | lm loss: 2.627351E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 79000/ 115203 | consumed samples: 20224000 | consumed tokens: 41418752000 | elapsed time per iteration (s): 0.55 | learning rate: 6.116E-05 | global batch size: 256 | lm loss: 2.616892E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.405 | TFLOPs: 43.99 | +7: iteration 79010/ 115203 | consumed samples: 20226560 | consumed tokens: 41423994880 | elapsed time per iteration (s): 0.58 | learning rate: 6.114E-05 | global batch size: 256 | lm loss: 2.614691E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.148 | TFLOPs: 41.77 | +7: iteration 79020/ 115203 | consumed samples: 20229120 | consumed tokens: 41429237760 | elapsed time per iteration (s): 0.57 | learning rate: 6.112E-05 | global batch size: 256 | lm loss: 2.623916E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.027 | TFLOPs: 42.62 | +7: iteration 79030/ 115203 | consumed samples: 20231680 | consumed tokens: 41434480640 | elapsed time per iteration (s): 0.58 | learning rate: 6.110E-05 | global batch size: 256 | lm loss: 2.629655E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.181 | TFLOPs: 42.25 | +7: iteration 79040/ 115203 | consumed samples: 20234240 | consumed tokens: 41439723520 | elapsed time per iteration (s): 0.57 | learning rate: 6.108E-05 | global batch size: 256 | lm loss: 2.621391E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.410 | TFLOPs: 42.75 | +7: iteration 79050/ 115203 | consumed samples: 20236800 | consumed tokens: 41444966400 | elapsed time per iteration (s): 0.56 | learning rate: 6.106E-05 | global batch size: 256 | lm loss: 2.620019E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.888 | TFLOPs: 43.46 | +7: iteration 79060/ 115203 | consumed samples: 20239360 | consumed tokens: 41450209280 | elapsed time per iteration (s): 0.56 | learning rate: 6.104E-05 | global batch size: 256 | lm loss: 2.615843E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.468 | TFLOPs: 43.71 | +7: iteration 79070/ 115203 | consumed samples: 20241920 | consumed tokens: 41455452160 | elapsed time per iteration (s): 0.59 | learning rate: 6.102E-05 | global batch size: 256 | lm loss: 2.630299E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.254 | TFLOPs: 41.50 | +7: iteration 79080/ 115203 | consumed samples: 20244480 | consumed tokens: 41460695040 | elapsed time per iteration (s): 0.55 | learning rate: 6.100E-05 | global batch size: 256 | lm loss: 2.635913E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.342 | TFLOPs: 43.98 | +7: iteration 79090/ 115203 | consumed samples: 20247040 | consumed tokens: 41465937920 | elapsed time per iteration (s): 0.57 | learning rate: 6.098E-05 | global batch size: 256 | lm loss: 2.622808E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.995 | TFLOPs: 43.00 | +7: iteration 79100/ 115203 | consumed samples: 20249600 | consumed tokens: 41471180800 | elapsed time per iteration (s): 0.57 | learning rate: 6.096E-05 | global batch size: 256 | lm loss: 2.613773E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.406 | TFLOPs: 43.13 | +7: iteration 79110/ 115203 | consumed samples: 20252160 | consumed tokens: 41476423680 | elapsed time per iteration (s): 0.57 | learning rate: 6.094E-05 | global batch size: 256 | lm loss: 2.640495E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.112 | TFLOPs: 43.01 | +7: iteration 79120/ 115203 | consumed samples: 20254720 | consumed tokens: 41481666560 | elapsed time per iteration (s): 0.56 | learning rate: 6.091E-05 | global batch size: 256 | lm loss: 2.622253E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.842 | TFLOPs: 43.55 | +7: iteration 79130/ 115203 | consumed samples: 20257280 | consumed tokens: 41486909440 | elapsed time per iteration (s): 0.56 | learning rate: 6.089E-05 | global batch size: 256 | lm loss: 2.621530E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.967 | TFLOPs: 43.57 | +7: iteration 79140/ 115203 | consumed samples: 20259840 | consumed tokens: 41492152320 | elapsed time per iteration (s): 0.56 | learning rate: 6.087E-05 | global batch size: 256 | lm loss: 2.633239E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.973 | TFLOPs: 43.95 | +7: iteration 79150/ 115203 | consumed samples: 20262400 | consumed tokens: 41497395200 | elapsed time per iteration (s): 0.56 | learning rate: 6.085E-05 | global batch size: 256 | lm loss: 2.622791E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.276 | TFLOPs: 43.41 | +7: iteration 79160/ 115203 | consumed samples: 20264960 | consumed tokens: 41502638080 | elapsed time per iteration (s): 0.56 | learning rate: 6.083E-05 | global batch size: 256 | lm loss: 2.614335E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.320 | TFLOPs: 43.60 | +7: iteration 79170/ 115203 | consumed samples: 20267520 | consumed tokens: 41507880960 | elapsed time per iteration (s): 0.56 | learning rate: 6.081E-05 | global batch size: 256 | lm loss: 2.628847E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.188 | TFLOPs: 43.97 | +7: iteration 79180/ 115203 | consumed samples: 20270080 | consumed tokens: 41513123840 | elapsed time per iteration (s): 0.56 | learning rate: 6.079E-05 | global batch size: 256 | lm loss: 2.618405E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.097 | TFLOPs: 43.96 | +7: iteration 79190/ 115203 | consumed samples: 20272640 | consumed tokens: 41518366720 | elapsed time per iteration (s): 0.56 | learning rate: 6.077E-05 | global batch size: 256 | lm loss: 2.618653E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.844 | TFLOPs: 43.75 | +7: iteration 79200/ 115203 | consumed samples: 20275200 | consumed tokens: 41523609600 | elapsed time per iteration (s): 0.56 | learning rate: 6.075E-05 | global batch size: 256 | lm loss: 2.624248E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.172 | TFLOPs: 43.97 | +7: iteration 79210/ 115203 | consumed samples: 20277760 | consumed tokens: 41528852480 | elapsed time per iteration (s): 0.56 | learning rate: 6.073E-05 | global batch size: 256 | lm loss: 2.628422E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.458 | TFLOPs: 43.23 | +7: iteration 79220/ 115203 | consumed samples: 20280320 | consumed tokens: 41534095360 | elapsed time per iteration (s): 0.57 | learning rate: 6.071E-05 | global batch size: 256 | lm loss: 2.618570E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.562 | TFLOPs: 42.48 | +7: iteration 79230/ 115203 | consumed samples: 20282880 | consumed tokens: 41539338240 | elapsed time per iteration (s): 0.56 | learning rate: 6.069E-05 | global batch size: 256 | lm loss: 2.626689E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.553 | TFLOPs: 43.62 | +7: iteration 79240/ 115203 | consumed samples: 20285440 | consumed tokens: 41544581120 | elapsed time per iteration (s): 0.56 | learning rate: 6.067E-05 | global batch size: 256 | lm loss: 2.615287E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.968 | TFLOPs: 43.28 | +7: iteration 79250/ 115203 | consumed samples: 20288000 | consumed tokens: 41549824000 | elapsed time per iteration (s): 0.56 | learning rate: 6.065E-05 | global batch size: 256 | lm loss: 2.616103E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.913 | TFLOPs: 43.47 | +7: iteration 79260/ 115203 | consumed samples: 20290560 | consumed tokens: 41555066880 | elapsed time per iteration (s): 0.56 | learning rate: 6.062E-05 | global batch size: 256 | lm loss: 2.611387E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.019 | TFLOPs: 43.67 | +7: iteration 79270/ 115203 | consumed samples: 20293120 | consumed tokens: 41560309760 | elapsed time per iteration (s): 0.57 | learning rate: 6.060E-05 | global batch size: 256 | lm loss: 2.619460E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.986 | TFLOPs: 42.90 | +7: iteration 79280/ 115203 | consumed samples: 20295680 | consumed tokens: 41565552640 | elapsed time per iteration (s): 0.56 | learning rate: 6.058E-05 | global batch size: 256 | lm loss: 2.616934E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.955 | TFLOPs: 43.57 | +7: iteration 79290/ 115203 | consumed samples: 20298240 | consumed tokens: 41570795520 | elapsed time per iteration (s): 0.56 | learning rate: 6.056E-05 | global batch size: 256 | lm loss: 2.628632E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.064 | TFLOPs: 43.39 | +7: iteration 79300/ 115203 | consumed samples: 20300800 | consumed tokens: 41576038400 | elapsed time per iteration (s): 0.56 | learning rate: 6.054E-05 | global batch size: 256 | lm loss: 2.622894E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.200 | TFLOPs: 43.49 | +7: iteration 79310/ 115203 | consumed samples: 20303360 | consumed tokens: 41581281280 | elapsed time per iteration (s): 0.56 | learning rate: 6.052E-05 | global batch size: 256 | lm loss: 2.623882E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.160 | TFLOPs: 43.97 | +7: iteration 79320/ 115203 | consumed samples: 20305920 | consumed tokens: 41586524160 | elapsed time per iteration (s): 0.56 | learning rate: 6.050E-05 | global batch size: 256 | lm loss: 2.622152E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.383 | TFLOPs: 43.51 | +7: iteration 79330/ 115203 | consumed samples: 20308480 | consumed tokens: 41591767040 | elapsed time per iteration (s): 0.56 | learning rate: 6.048E-05 | global batch size: 256 | lm loss: 2.631154E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.090 | TFLOPs: 43.67 | +7: iteration 79340/ 115203 | consumed samples: 20311040 | consumed tokens: 41597009920 | elapsed time per iteration (s): 0.56 | learning rate: 6.046E-05 | global batch size: 256 | lm loss: 2.620581E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.244 | TFLOPs: 43.97 | +7: iteration 79350/ 115203 | consumed samples: 20313600 | consumed tokens: 41602252800 | elapsed time per iteration (s): 0.56 | learning rate: 6.044E-05 | global batch size: 256 | lm loss: 2.622591E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.506 | TFLOPs: 43.71 | +7: iteration 79360/ 115203 | consumed samples: 20316160 | consumed tokens: 41607495680 | elapsed time per iteration (s): 0.56 | learning rate: 6.042E-05 | global batch size: 256 | lm loss: 2.618278E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.320 | TFLOPs: 43.70 | +7: iteration 79370/ 115203 | consumed samples: 20318720 | consumed tokens: 41612738560 | elapsed time per iteration (s): 0.56 | learning rate: 6.040E-05 | global batch size: 256 | lm loss: 2.630469E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.768 | TFLOPs: 43.83 | +7: iteration 79380/ 115203 | consumed samples: 20321280 | consumed tokens: 41617981440 | elapsed time per iteration (s): 0.56 | learning rate: 6.038E-05 | global batch size: 256 | lm loss: 2.618298E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.158 | TFLOPs: 43.68 | +7: iteration 79390/ 115203 | consumed samples: 20323840 | consumed tokens: 41623224320 | elapsed time per iteration (s): 0.56 | learning rate: 6.036E-05 | global batch size: 256 | lm loss: 2.626053E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.466 | TFLOPs: 43.23 | +7: iteration 79400/ 115203 | consumed samples: 20326400 | consumed tokens: 41628467200 | elapsed time per iteration (s): 0.56 | learning rate: 6.033E-05 | global batch size: 256 | lm loss: 2.634435E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.752 | TFLOPs: 43.55 | +7: iteration 79410/ 115203 | consumed samples: 20328960 | consumed tokens: 41633710080 | elapsed time per iteration (s): 0.57 | learning rate: 6.031E-05 | global batch size: 256 | lm loss: 2.610065E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.779 | TFLOPs: 42.98 | +7: iteration 79420/ 115203 | consumed samples: 20331520 | consumed tokens: 41638952960 | elapsed time per iteration (s): 0.56 | learning rate: 6.029E-05 | global batch size: 256 | lm loss: 2.616650E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.480 | TFLOPs: 43.52 | +7: iteration 79430/ 115203 | consumed samples: 20334080 | consumed tokens: 41644195840 | elapsed time per iteration (s): 0.55 | learning rate: 6.027E-05 | global batch size: 256 | lm loss: 2.609144E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 79440/ 115203 | consumed samples: 20336640 | consumed tokens: 41649438720 | elapsed time per iteration (s): 0.56 | learning rate: 6.025E-05 | global batch size: 256 | lm loss: 2.610578E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.427 | TFLOPs: 43.42 | +7: iteration 79450/ 115203 | consumed samples: 20339200 | consumed tokens: 41654681600 | elapsed time per iteration (s): 0.55 | learning rate: 6.023E-05 | global batch size: 256 | lm loss: 2.625479E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.308 | TFLOPs: 43.98 | +7: iteration 79460/ 115203 | consumed samples: 20341760 | consumed tokens: 41659924480 | elapsed time per iteration (s): 0.56 | learning rate: 6.021E-05 | global batch size: 256 | lm loss: 2.625359E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.925 | TFLOPs: 43.47 | +7: iteration 79470/ 115203 | consumed samples: 20344320 | consumed tokens: 41665167360 | elapsed time per iteration (s): 0.57 | learning rate: 6.019E-05 | global batch size: 256 | lm loss: 2.630126E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.732 | TFLOPs: 42.97 | +7: iteration 79480/ 115203 | consumed samples: 20346880 | consumed tokens: 41670410240 | elapsed time per iteration (s): 0.56 | learning rate: 6.017E-05 | global batch size: 256 | lm loss: 2.629454E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.072 | TFLOPs: 43.48 | +7: iteration 79490/ 115203 | consumed samples: 20349440 | consumed tokens: 41675653120 | elapsed time per iteration (s): 0.55 | learning rate: 6.015E-05 | global batch size: 256 | lm loss: 2.625714E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.344 | TFLOPs: 43.98 | +7: iteration 79500/ 115203 | consumed samples: 20352000 | consumed tokens: 41680896000 | elapsed time per iteration (s): 0.56 | learning rate: 6.013E-05 | global batch size: 256 | lm loss: 2.616137E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.365 | TFLOPs: 43.51 | +7: iteration 79510/ 115203 | consumed samples: 20354560 | consumed tokens: 41686138880 | elapsed time per iteration (s): 0.56 | learning rate: 6.011E-05 | global batch size: 256 | lm loss: 2.623258E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.547 | TFLOPs: 43.53 | +7: iteration 79520/ 115203 | consumed samples: 20357120 | consumed tokens: 41691381760 | elapsed time per iteration (s): 0.56 | learning rate: 6.009E-05 | global batch size: 256 | lm loss: 2.616722E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.788 | TFLOPs: 43.55 | +7: iteration 79530/ 115203 | consumed samples: 20359680 | consumed tokens: 41696624640 | elapsed time per iteration (s): 0.57 | learning rate: 6.007E-05 | global batch size: 256 | lm loss: 2.614801E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.469 | TFLOPs: 42.85 | +7: iteration 79540/ 115203 | consumed samples: 20362240 | consumed tokens: 41701867520 | elapsed time per iteration (s): 0.56 | learning rate: 6.005E-05 | global batch size: 256 | lm loss: 2.631605E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.423 | TFLOPs: 43.42 | +7: iteration 79550/ 115203 | consumed samples: 20364800 | consumed tokens: 41707110400 | elapsed time per iteration (s): 0.56 | learning rate: 6.002E-05 | global batch size: 256 | lm loss: 2.623571E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.970 | TFLOPs: 43.76 | +7: iteration 79560/ 115203 | consumed samples: 20367360 | consumed tokens: 41712353280 | elapsed time per iteration (s): 0.56 | learning rate: 6.000E-05 | global batch size: 256 | lm loss: 2.621831E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.610 | TFLOPs: 43.72 | +7: iteration 79570/ 115203 | consumed samples: 20369920 | consumed tokens: 41717596160 | elapsed time per iteration (s): 0.57 | learning rate: 5.998E-05 | global batch size: 256 | lm loss: 2.619100E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.436 | TFLOPs: 42.56 | +7: iteration 79580/ 115203 | consumed samples: 20372480 | consumed tokens: 41722839040 | elapsed time per iteration (s): 0.56 | learning rate: 5.996E-05 | global batch size: 256 | lm loss: 2.622929E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.446 | TFLOPs: 43.71 | +7: iteration 79590/ 115203 | consumed samples: 20375040 | consumed tokens: 41728081920 | elapsed time per iteration (s): 0.56 | learning rate: 5.994E-05 | global batch size: 256 | lm loss: 2.617840E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 79600/ 115203 | consumed samples: 20377600 | consumed tokens: 41733324800 | elapsed time per iteration (s): 0.56 | learning rate: 5.992E-05 | global batch size: 256 | lm loss: 2.613104E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.205 | TFLOPs: 43.21 | +7: iteration 79610/ 115203 | consumed samples: 20380160 | consumed tokens: 41738567680 | elapsed time per iteration (s): 0.58 | learning rate: 5.990E-05 | global batch size: 256 | lm loss: 2.622838E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.047 | TFLOPs: 42.24 | +7: iteration 79620/ 115203 | consumed samples: 20382720 | consumed tokens: 41743810560 | elapsed time per iteration (s): 0.56 | learning rate: 5.988E-05 | global batch size: 256 | lm loss: 2.620843E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.420 | TFLOPs: 43.71 | +7: iteration 79630/ 115203 | consumed samples: 20385280 | consumed tokens: 41749053440 | elapsed time per iteration (s): 0.56 | learning rate: 5.986E-05 | global batch size: 256 | lm loss: 2.620974E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.138 | TFLOPs: 43.39 | +7: iteration 79640/ 115203 | consumed samples: 20387840 | consumed tokens: 41754296320 | elapsed time per iteration (s): 0.55 | learning rate: 5.984E-05 | global batch size: 256 | lm loss: 2.624420E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.358 | TFLOPs: 43.99 | +7: iteration 79650/ 115203 | consumed samples: 20390400 | consumed tokens: 41759539200 | elapsed time per iteration (s): 0.55 | learning rate: 5.982E-05 | global batch size: 256 | lm loss: 2.622019E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.452 | TFLOPs: 43.99 | +7: iteration 79660/ 115203 | consumed samples: 20392960 | consumed tokens: 41764782080 | elapsed time per iteration (s): 0.56 | learning rate: 5.980E-05 | global batch size: 256 | lm loss: 2.601062E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.103 | TFLOPs: 43.48 | +7: iteration 79670/ 115203 | consumed samples: 20395520 | consumed tokens: 41770024960 | elapsed time per iteration (s): 0.56 | learning rate: 5.978E-05 | global batch size: 256 | lm loss: 2.618923E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.024 | TFLOPs: 43.57 | +7: iteration 79680/ 115203 | consumed samples: 20398080 | consumed tokens: 41775267840 | elapsed time per iteration (s): 0.56 | learning rate: 5.976E-05 | global batch size: 256 | lm loss: 2.632103E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.292 | TFLOPs: 43.31 | +7: iteration 79690/ 115203 | consumed samples: 20400640 | consumed tokens: 41780510720 | elapsed time per iteration (s): 0.56 | learning rate: 5.974E-05 | global batch size: 256 | lm loss: 2.629560E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.854 | TFLOPs: 43.94 | +7: iteration 79700/ 115203 | consumed samples: 20403200 | consumed tokens: 41785753600 | elapsed time per iteration (s): 0.56 | learning rate: 5.972E-05 | global batch size: 256 | lm loss: 2.612712E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.108 | TFLOPs: 43.29 | +7: iteration 79710/ 115203 | consumed samples: 20405760 | consumed tokens: 41790996480 | elapsed time per iteration (s): 0.55 | learning rate: 5.970E-05 | global batch size: 256 | lm loss: 2.621747E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 79720/ 115203 | consumed samples: 20408320 | consumed tokens: 41796239360 | elapsed time per iteration (s): 0.56 | learning rate: 5.967E-05 | global batch size: 256 | lm loss: 2.619366E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.046 | TFLOPs: 43.86 | +7: iteration 79730/ 115203 | consumed samples: 20410880 | consumed tokens: 41801482240 | elapsed time per iteration (s): 0.57 | learning rate: 5.965E-05 | global batch size: 256 | lm loss: 2.616625E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.049 | TFLOPs: 42.62 | +7: iteration 79740/ 115203 | consumed samples: 20413440 | consumed tokens: 41806725120 | elapsed time per iteration (s): 0.56 | learning rate: 5.963E-05 | global batch size: 256 | lm loss: 2.628289E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.697 | TFLOPs: 43.73 | +7: iteration 79750/ 115203 | consumed samples: 20416000 | consumed tokens: 41811968000 | elapsed time per iteration (s): 0.56 | learning rate: 5.961E-05 | global batch size: 256 | lm loss: 2.618012E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.588 | TFLOPs: 43.24 | +7: iteration 79760/ 115203 | consumed samples: 20418560 | consumed tokens: 41817210880 | elapsed time per iteration (s): 0.55 | learning rate: 5.959E-05 | global batch size: 256 | lm loss: 2.623055E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 79770/ 115203 | consumed samples: 20421120 | consumed tokens: 41822453760 | elapsed time per iteration (s): 0.56 | learning rate: 5.957E-05 | global batch size: 256 | lm loss: 2.628042E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.864 | TFLOPs: 43.65 | +7: iteration 79780/ 115203 | consumed samples: 20423680 | consumed tokens: 41827696640 | elapsed time per iteration (s): 0.55 | learning rate: 5.955E-05 | global batch size: 256 | lm loss: 2.623776E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.353 | TFLOPs: 43.98 | +7: iteration 79790/ 115203 | consumed samples: 20426240 | consumed tokens: 41832939520 | elapsed time per iteration (s): 0.56 | learning rate: 5.953E-05 | global batch size: 256 | lm loss: 2.632200E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.098 | TFLOPs: 43.96 | +7: iteration 79800/ 115203 | consumed samples: 20428800 | consumed tokens: 41838182400 | elapsed time per iteration (s): 0.56 | learning rate: 5.951E-05 | global batch size: 256 | lm loss: 2.617293E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.628 | TFLOPs: 43.73 | +7: iteration 79810/ 115203 | consumed samples: 20431360 | consumed tokens: 41843425280 | elapsed time per iteration (s): 0.55 | learning rate: 5.949E-05 | global batch size: 256 | lm loss: 2.622586E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.331 | TFLOPs: 43.98 | +7: iteration 79820/ 115203 | consumed samples: 20433920 | consumed tokens: 41848668160 | elapsed time per iteration (s): 0.56 | learning rate: 5.947E-05 | global batch size: 256 | lm loss: 2.625694E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.216 | TFLOPs: 43.97 | +7: iteration 79830/ 115203 | consumed samples: 20436480 | consumed tokens: 41853911040 | elapsed time per iteration (s): 0.56 | learning rate: 5.945E-05 | global batch size: 256 | lm loss: 2.622718E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.948 | TFLOPs: 43.57 | +7: iteration 79840/ 115203 | consumed samples: 20439040 | consumed tokens: 41859153920 | elapsed time per iteration (s): 0.56 | learning rate: 5.943E-05 | global batch size: 256 | lm loss: 2.625340E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.876 | TFLOPs: 43.94 | +7: iteration 79850/ 115203 | consumed samples: 20441600 | consumed tokens: 41864396800 | elapsed time per iteration (s): 0.56 | learning rate: 5.941E-05 | global batch size: 256 | lm loss: 2.609894E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.228 | TFLOPs: 43.97 | +7: iteration 79860/ 115203 | consumed samples: 20444160 | consumed tokens: 41869639680 | elapsed time per iteration (s): 0.56 | learning rate: 5.939E-05 | global batch size: 256 | lm loss: 2.611242E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.137 | TFLOPs: 43.96 | +7: iteration 79870/ 115203 | consumed samples: 20446720 | consumed tokens: 41874882560 | elapsed time per iteration (s): 0.56 | learning rate: 5.937E-05 | global batch size: 256 | lm loss: 2.611888E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.472 | TFLOPs: 43.42 | +7: iteration 79880/ 115203 | consumed samples: 20449280 | consumed tokens: 41880125440 | elapsed time per iteration (s): 0.56 | learning rate: 5.935E-05 | global batch size: 256 | lm loss: 2.618571E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.259 | TFLOPs: 43.98 | +7: iteration 79890/ 115203 | consumed samples: 20451840 | consumed tokens: 41885368320 | elapsed time per iteration (s): 0.56 | learning rate: 5.933E-05 | global batch size: 256 | lm loss: 2.615260E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.691 | TFLOPs: 43.64 | +7: iteration 79900/ 115203 | consumed samples: 20454400 | consumed tokens: 41890611200 | elapsed time per iteration (s): 0.56 | learning rate: 5.931E-05 | global batch size: 256 | lm loss: 2.625183E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.248 | TFLOPs: 43.88 | +7: iteration 79910/ 115203 | consumed samples: 20456960 | consumed tokens: 41895854080 | elapsed time per iteration (s): 0.57 | learning rate: 5.929E-05 | global batch size: 256 | lm loss: 2.613075E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.878 | TFLOPs: 42.99 | +7: iteration 79920/ 115203 | consumed samples: 20459520 | consumed tokens: 41901096960 | elapsed time per iteration (s): 0.56 | learning rate: 5.926E-05 | global batch size: 256 | lm loss: 2.610756E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.752 | TFLOPs: 43.36 | +7: iteration 79930/ 115203 | consumed samples: 20462080 | consumed tokens: 41906339840 | elapsed time per iteration (s): 0.56 | learning rate: 5.924E-05 | global batch size: 256 | lm loss: 2.618675E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.603 | TFLOPs: 43.63 | +7: iteration 79940/ 115203 | consumed samples: 20464640 | consumed tokens: 41911582720 | elapsed time per iteration (s): 0.56 | learning rate: 5.922E-05 | global batch size: 256 | lm loss: 2.621945E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.779 | TFLOPs: 43.55 | +7: iteration 79950/ 115203 | consumed samples: 20467200 | consumed tokens: 41916825600 | elapsed time per iteration (s): 0.55 | learning rate: 5.920E-05 | global batch size: 256 | lm loss: 2.623210E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.474 | TFLOPs: 44.00 | +7: iteration 79960/ 115203 | consumed samples: 20469760 | consumed tokens: 41922068480 | elapsed time per iteration (s): 0.56 | learning rate: 5.918E-05 | global batch size: 256 | lm loss: 2.623627E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.030 | TFLOPs: 43.29 | +7: iteration 79970/ 115203 | consumed samples: 20472320 | consumed tokens: 41927311360 | elapsed time per iteration (s): 0.56 | learning rate: 5.916E-05 | global batch size: 256 | lm loss: 2.629183E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.642 | TFLOPs: 43.73 | +7: iteration 79980/ 115203 | consumed samples: 20474880 | consumed tokens: 41932554240 | elapsed time per iteration (s): 0.56 | learning rate: 5.914E-05 | global batch size: 256 | lm loss: 2.607578E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.088 | TFLOPs: 43.29 | +7: iteration 79990/ 115203 | consumed samples: 20477440 | consumed tokens: 41937797120 | elapsed time per iteration (s): 0.55 | learning rate: 5.912E-05 | global batch size: 256 | lm loss: 2.620921E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.524 | TFLOPs: 44.00 | +0: [2023-03-17 01:22:31,764] [INFO] [logging.py:68:log_dist] [Rank 0] step=80000, skipped=0, lr=[5.910086097100006e-05, 5.910086097100006e-05, 5.910086097100006e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 80000/ 115203 | consumed samples: 20480000 | consumed tokens: 41943040000 | elapsed time per iteration (s): 0.55 | learning rate: 5.910E-05 | global batch size: 256 | lm loss: 2.626377E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.443 | TFLOPs: 43.99 | +0: steps: 80000 loss: 2.6182 iter time (s): 0.558 samples/sec: 458.374 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 80000 | lm loss value: 3.436205E+00 | lm loss PPL: 3.106883E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 80000 to checkpoints_421m60b400m +0: [2023-03-17 01:22:31,975] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step80000 is begin to save! +0: [2023-03-17 01:22:31,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:22:32,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:22:32,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:22:32,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:22:32,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:22:32,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:22:32,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:22:32,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:22:32,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:22:32,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:22:32,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:22:32,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:22:32,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:22:32,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:22:32,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:22:32,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:22:32,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:22:32,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:22:32,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_11-model_00-model_states.pt... +0: [2023-03-17 01:22:32,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_11-model_00-model_states.pt. +0: [2023-03-17 01:22:32,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:22:32,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:22:32,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_13-model_00-model_states.pt... +0: [2023-03-17 01:22:32,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_13-model_00-model_states.pt. +0: [2023-03-17 01:22:32,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_14-model_00-model_states.pt... +0: [2023-03-17 01:22:32,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_14-model_00-model_states.pt. +0: [2023-03-17 01:22:32,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_15-model_00-model_states.pt... +0: [2023-03-17 01:22:32,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_15-model_00-model_states.pt. +0: [2023-03-17 01:22:32,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_16-model_00-model_states.pt... +0: [2023-03-17 01:22:32,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_16-model_00-model_states.pt. +0: [2023-03-17 01:22:32,720] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_17-model_00-model_states.pt... +0: [2023-03-17 01:22:32,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_17-model_00-model_states.pt. +0: [2023-03-17 01:22:32,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_18-model_00-model_states.pt... +0: [2023-03-17 01:22:32,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_18-model_00-model_states.pt. +0: [2023-03-17 01:22:32,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_19-model_00-model_states.pt... +0: [2023-03-17 01:22:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_19-model_00-model_states.pt. +0: [2023-03-17 01:22:32,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_20-model_00-model_states.pt... +0: [2023-03-17 01:22:32,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_20-model_00-model_states.pt. +0: [2023-03-17 01:22:32,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/layer_22-model_00-model_states.pt... +0: [2023-03-17 01:22:32,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/layer_22-model_00-model_states.pt. +0: [2023-03-17 01:22:32,890] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step80000/mp_rank_00_model_states.pt +0: [2023-03-17 01:22:32,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:22:32,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:32,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 01:22:33,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 01:22:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 01:22:33,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:22:33,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:22:33,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 01:22:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:22:33,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 01:22:33,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:22:33,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 01:22:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 01:22:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: successfully saved checkpoint at iteration 80000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1189.77 +7: iteration 80010/ 115203 | consumed samples: 20482560 | consumed tokens: 41948282880 | elapsed time per iteration (s): 0.69 | learning rate: 5.908E-05 | global batch size: 256 | lm loss: 2.613088E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 368.904 | TFLOPs: 35.17 | +7: iteration 80020/ 115203 | consumed samples: 20485120 | consumed tokens: 41953525760 | elapsed time per iteration (s): 0.56 | learning rate: 5.906E-05 | global batch size: 256 | lm loss: 2.625344E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.540 | TFLOPs: 43.24 | +7: iteration 80030/ 115203 | consumed samples: 20487680 | consumed tokens: 41958768640 | elapsed time per iteration (s): 0.55 | learning rate: 5.904E-05 | global batch size: 256 | lm loss: 2.616236E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.522 | TFLOPs: 44.00 | +7: iteration 80040/ 115203 | consumed samples: 20490240 | consumed tokens: 41964011520 | elapsed time per iteration (s): 0.56 | learning rate: 5.902E-05 | global batch size: 256 | lm loss: 2.608928E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.222 | TFLOPs: 43.59 | +7: iteration 80050/ 115203 | consumed samples: 20492800 | consumed tokens: 41969254400 | elapsed time per iteration (s): 0.55 | learning rate: 5.900E-05 | global batch size: 256 | lm loss: 2.621922E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.598 | TFLOPs: 44.01 | +7: iteration 80060/ 115203 | consumed samples: 20495360 | consumed tokens: 41974497280 | elapsed time per iteration (s): 0.55 | learning rate: 5.898E-05 | global batch size: 256 | lm loss: 2.606410E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.617 | TFLOPs: 44.01 | +7: iteration 80070/ 115203 | consumed samples: 20497920 | consumed tokens: 41979740160 | elapsed time per iteration (s): 0.55 | learning rate: 5.896E-05 | global batch size: 256 | lm loss: 2.625970E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.770 | TFLOPs: 44.02 | +7: iteration 80080/ 115203 | consumed samples: 20500480 | consumed tokens: 41984983040 | elapsed time per iteration (s): 0.55 | learning rate: 5.894E-05 | global batch size: 256 | lm loss: 2.619207E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.489 | TFLOPs: 44.00 | +7: iteration 80090/ 115203 | consumed samples: 20503040 | consumed tokens: 41990225920 | elapsed time per iteration (s): 0.56 | learning rate: 5.892E-05 | global batch size: 256 | lm loss: 2.607689E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.707 | TFLOPs: 43.64 | +7: iteration 80100/ 115203 | consumed samples: 20505600 | consumed tokens: 41995468800 | elapsed time per iteration (s): 0.56 | learning rate: 5.890E-05 | global batch size: 256 | lm loss: 2.607199E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.862 | TFLOPs: 43.84 | +7: iteration 80110/ 115203 | consumed samples: 20508160 | consumed tokens: 42000711680 | elapsed time per iteration (s): 0.57 | learning rate: 5.888E-05 | global batch size: 256 | lm loss: 2.620370E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.071 | TFLOPs: 42.72 | +7: iteration 80120/ 115203 | consumed samples: 20510720 | consumed tokens: 42005954560 | elapsed time per iteration (s): 0.56 | learning rate: 5.886E-05 | global batch size: 256 | lm loss: 2.619728E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.033 | TFLOPs: 43.57 | +7: iteration 80130/ 115203 | consumed samples: 20513280 | consumed tokens: 42011197440 | elapsed time per iteration (s): 0.55 | learning rate: 5.884E-05 | global batch size: 256 | lm loss: 2.618706E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.915 | TFLOPs: 44.04 | +7: iteration 80140/ 115203 | consumed samples: 20515840 | consumed tokens: 42016440320 | elapsed time per iteration (s): 0.55 | learning rate: 5.882E-05 | global batch size: 256 | lm loss: 2.615906E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.505 | TFLOPs: 44.00 | +7: iteration 80150/ 115203 | consumed samples: 20518400 | consumed tokens: 42021683200 | elapsed time per iteration (s): 0.56 | learning rate: 5.879E-05 | global batch size: 256 | lm loss: 2.623807E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.781 | TFLOPs: 43.64 | +7: iteration 80160/ 115203 | consumed samples: 20520960 | consumed tokens: 42026926080 | elapsed time per iteration (s): 0.55 | learning rate: 5.877E-05 | global batch size: 256 | lm loss: 2.605615E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.464 | TFLOPs: 44.00 | +7: iteration 80170/ 115203 | consumed samples: 20523520 | consumed tokens: 42032168960 | elapsed time per iteration (s): 0.58 | learning rate: 5.875E-05 | global batch size: 256 | lm loss: 2.621612E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.972 | TFLOPs: 41.95 | +7: iteration 80180/ 115203 | consumed samples: 20526080 | consumed tokens: 42037411840 | elapsed time per iteration (s): 0.56 | learning rate: 5.873E-05 | global batch size: 256 | lm loss: 2.639562E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.004 | TFLOPs: 43.76 | +7: iteration 80190/ 115203 | consumed samples: 20528640 | consumed tokens: 42042654720 | elapsed time per iteration (s): 0.56 | learning rate: 5.871E-05 | global batch size: 256 | lm loss: 2.629224E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.025 | TFLOPs: 43.76 | +7: iteration 80200/ 115203 | consumed samples: 20531200 | consumed tokens: 42047897600 | elapsed time per iteration (s): 0.55 | learning rate: 5.869E-05 | global batch size: 256 | lm loss: 2.608898E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.802 | TFLOPs: 44.03 | +7: iteration 80210/ 115203 | consumed samples: 20533760 | consumed tokens: 42053140480 | elapsed time per iteration (s): 0.56 | learning rate: 5.867E-05 | global batch size: 256 | lm loss: 2.631765E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.722 | TFLOPs: 43.73 | +7: iteration 80220/ 115203 | consumed samples: 20536320 | consumed tokens: 42058383360 | elapsed time per iteration (s): 0.56 | learning rate: 5.865E-05 | global batch size: 256 | lm loss: 2.622309E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.288 | TFLOPs: 43.79 | +7: iteration 80230/ 115203 | consumed samples: 20538880 | consumed tokens: 42063626240 | elapsed time per iteration (s): 0.55 | learning rate: 5.863E-05 | global batch size: 256 | lm loss: 2.609547E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.586 | TFLOPs: 44.01 | +7: iteration 80240/ 115203 | consumed samples: 20541440 | consumed tokens: 42068869120 | elapsed time per iteration (s): 0.55 | learning rate: 5.861E-05 | global batch size: 256 | lm loss: 2.622973E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 80250/ 115203 | consumed samples: 20544000 | consumed tokens: 42074112000 | elapsed time per iteration (s): 0.55 | learning rate: 5.859E-05 | global batch size: 256 | lm loss: 2.625321E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 80260/ 115203 | consumed samples: 20546560 | consumed tokens: 42079354880 | elapsed time per iteration (s): 0.56 | learning rate: 5.857E-05 | global batch size: 256 | lm loss: 2.628245E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.936 | TFLOPs: 43.66 | +7: iteration 80270/ 115203 | consumed samples: 20549120 | consumed tokens: 42084597760 | elapsed time per iteration (s): 0.55 | learning rate: 5.855E-05 | global batch size: 256 | lm loss: 2.615378E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 80280/ 115203 | consumed samples: 20551680 | consumed tokens: 42089840640 | elapsed time per iteration (s): 0.55 | learning rate: 5.853E-05 | global batch size: 256 | lm loss: 2.610572E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.494 | TFLOPs: 44.00 | +7: iteration 80290/ 115203 | consumed samples: 20554240 | consumed tokens: 42095083520 | elapsed time per iteration (s): 0.55 | learning rate: 5.851E-05 | global batch size: 256 | lm loss: 2.616522E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.445 | TFLOPs: 43.99 | +7: iteration 80300/ 115203 | consumed samples: 20556800 | consumed tokens: 42100326400 | elapsed time per iteration (s): 0.56 | learning rate: 5.849E-05 | global batch size: 256 | lm loss: 2.618205E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.072 | TFLOPs: 43.67 | +7: iteration 80310/ 115203 | consumed samples: 20559360 | consumed tokens: 42105569280 | elapsed time per iteration (s): 0.55 | learning rate: 5.847E-05 | global batch size: 256 | lm loss: 2.617546E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.486 | TFLOPs: 44.00 | +7: iteration 80320/ 115203 | consumed samples: 20561920 | consumed tokens: 42110812160 | elapsed time per iteration (s): 0.56 | learning rate: 5.845E-05 | global batch size: 256 | lm loss: 2.617291E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.036 | TFLOPs: 43.76 | +7: iteration 80330/ 115203 | consumed samples: 20564480 | consumed tokens: 42116055040 | elapsed time per iteration (s): 0.56 | learning rate: 5.843E-05 | global batch size: 256 | lm loss: 2.614305E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.534 | TFLOPs: 43.81 | +7: iteration 80340/ 115203 | consumed samples: 20567040 | consumed tokens: 42121297920 | elapsed time per iteration (s): 0.55 | learning rate: 5.841E-05 | global batch size: 256 | lm loss: 2.606536E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.641 | TFLOPs: 44.01 | +7: iteration 80350/ 115203 | consumed samples: 20569600 | consumed tokens: 42126540800 | elapsed time per iteration (s): 0.56 | learning rate: 5.839E-05 | global batch size: 256 | lm loss: 2.604704E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.179 | TFLOPs: 43.49 | +7: iteration 80360/ 115203 | consumed samples: 20572160 | consumed tokens: 42131783680 | elapsed time per iteration (s): 0.55 | learning rate: 5.837E-05 | global batch size: 256 | lm loss: 2.612444E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.623 | TFLOPs: 44.01 | +7: iteration 80370/ 115203 | consumed samples: 20574720 | consumed tokens: 42137026560 | elapsed time per iteration (s): 0.55 | learning rate: 5.835E-05 | global batch size: 256 | lm loss: 2.620837E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.633 | TFLOPs: 44.01 | +7: iteration 80380/ 115203 | consumed samples: 20577280 | consumed tokens: 42142269440 | elapsed time per iteration (s): 0.55 | learning rate: 5.833E-05 | global batch size: 256 | lm loss: 2.614747E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.558 | TFLOPs: 44.00 | +7: iteration 80390/ 115203 | consumed samples: 20579840 | consumed tokens: 42147512320 | elapsed time per iteration (s): 0.55 | learning rate: 5.831E-05 | global batch size: 256 | lm loss: 2.629909E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.726 | TFLOPs: 44.02 | +7: iteration 80400/ 115203 | consumed samples: 20582400 | consumed tokens: 42152755200 | elapsed time per iteration (s): 0.56 | learning rate: 5.829E-05 | global batch size: 256 | lm loss: 2.616667E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.233 | TFLOPs: 43.88 | +7: iteration 80410/ 115203 | consumed samples: 20584960 | consumed tokens: 42157998080 | elapsed time per iteration (s): 0.55 | learning rate: 5.827E-05 | global batch size: 256 | lm loss: 2.617071E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.816 | TFLOPs: 44.03 | +7: iteration 80420/ 115203 | consumed samples: 20587520 | consumed tokens: 42163240960 | elapsed time per iteration (s): 0.55 | learning rate: 5.825E-05 | global batch size: 256 | lm loss: 2.604303E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.648 | TFLOPs: 44.01 | +7: iteration 80430/ 115203 | consumed samples: 20590080 | consumed tokens: 42168483840 | elapsed time per iteration (s): 0.55 | learning rate: 5.823E-05 | global batch size: 256 | lm loss: 2.616266E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.448 | TFLOPs: 43.99 | +7: iteration 80440/ 115203 | consumed samples: 20592640 | consumed tokens: 42173726720 | elapsed time per iteration (s): 0.55 | learning rate: 5.821E-05 | global batch size: 256 | lm loss: 2.618333E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.724 | TFLOPs: 44.02 | +7: iteration 80450/ 115203 | consumed samples: 20595200 | consumed tokens: 42178969600 | elapsed time per iteration (s): 0.55 | learning rate: 5.818E-05 | global batch size: 256 | lm loss: 2.617334E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.422 | TFLOPs: 43.99 | +7: iteration 80460/ 115203 | consumed samples: 20597760 | consumed tokens: 42184212480 | elapsed time per iteration (s): 0.55 | learning rate: 5.816E-05 | global batch size: 256 | lm loss: 2.617328E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.493 | TFLOPs: 44.00 | +7: iteration 80470/ 115203 | consumed samples: 20600320 | consumed tokens: 42189455360 | elapsed time per iteration (s): 0.55 | learning rate: 5.814E-05 | global batch size: 256 | lm loss: 2.620387E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.699 | TFLOPs: 44.02 | +7: iteration 80480/ 115203 | consumed samples: 20602880 | consumed tokens: 42194698240 | elapsed time per iteration (s): 0.55 | learning rate: 5.812E-05 | global batch size: 256 | lm loss: 2.625464E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.522 | TFLOPs: 44.00 | +7: iteration 80490/ 115203 | consumed samples: 20605440 | consumed tokens: 42199941120 | elapsed time per iteration (s): 0.55 | learning rate: 5.810E-05 | global batch size: 256 | lm loss: 2.614838E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.646 | TFLOPs: 44.01 | +7: iteration 80500/ 115203 | consumed samples: 20608000 | consumed tokens: 42205184000 | elapsed time per iteration (s): 0.56 | learning rate: 5.808E-05 | global batch size: 256 | lm loss: 2.623851E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.615 | TFLOPs: 43.72 | +7: iteration 80510/ 115203 | consumed samples: 20610560 | consumed tokens: 42210426880 | elapsed time per iteration (s): 0.55 | learning rate: 5.806E-05 | global batch size: 256 | lm loss: 2.617519E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.800 | TFLOPs: 44.03 | +7: iteration 80520/ 115203 | consumed samples: 20613120 | consumed tokens: 42215669760 | elapsed time per iteration (s): 0.55 | learning rate: 5.804E-05 | global batch size: 256 | lm loss: 2.607873E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.723 | TFLOPs: 44.02 | +7: iteration 80530/ 115203 | consumed samples: 20615680 | consumed tokens: 42220912640 | elapsed time per iteration (s): 0.56 | learning rate: 5.802E-05 | global batch size: 256 | lm loss: 2.630948E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.574 | TFLOPs: 43.62 | +7: iteration 80540/ 115203 | consumed samples: 20618240 | consumed tokens: 42226155520 | elapsed time per iteration (s): 0.55 | learning rate: 5.800E-05 | global batch size: 256 | lm loss: 2.615150E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.691 | TFLOPs: 44.02 | +7: iteration 80550/ 115203 | consumed samples: 20620800 | consumed tokens: 42231398400 | elapsed time per iteration (s): 0.56 | learning rate: 5.798E-05 | global batch size: 256 | lm loss: 2.602795E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.982 | TFLOPs: 43.57 | +7: iteration 80560/ 115203 | consumed samples: 20623360 | consumed tokens: 42236641280 | elapsed time per iteration (s): 0.55 | learning rate: 5.796E-05 | global batch size: 256 | lm loss: 2.613765E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.751 | TFLOPs: 44.02 | +7: iteration 80570/ 115203 | consumed samples: 20625920 | consumed tokens: 42241884160 | elapsed time per iteration (s): 0.55 | learning rate: 5.794E-05 | global batch size: 256 | lm loss: 2.616882E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 80580/ 115203 | consumed samples: 20628480 | consumed tokens: 42247127040 | elapsed time per iteration (s): 0.55 | learning rate: 5.792E-05 | global batch size: 256 | lm loss: 2.621013E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.540 | TFLOPs: 44.00 | +7: iteration 80590/ 115203 | consumed samples: 20631040 | consumed tokens: 42252369920 | elapsed time per iteration (s): 0.56 | learning rate: 5.790E-05 | global batch size: 256 | lm loss: 2.618382E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.890 | TFLOPs: 43.94 | +7: iteration 80600/ 115203 | consumed samples: 20633600 | consumed tokens: 42257612800 | elapsed time per iteration (s): 0.57 | learning rate: 5.788E-05 | global batch size: 256 | lm loss: 2.625594E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.961 | TFLOPs: 42.90 | +7: iteration 80610/ 115203 | consumed samples: 20636160 | consumed tokens: 42262855680 | elapsed time per iteration (s): 0.55 | learning rate: 5.786E-05 | global batch size: 256 | lm loss: 2.623590E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.583 | TFLOPs: 44.01 | +7: iteration 80620/ 115203 | consumed samples: 20638720 | consumed tokens: 42268098560 | elapsed time per iteration (s): 0.56 | learning rate: 5.784E-05 | global batch size: 256 | lm loss: 2.622863E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.165 | TFLOPs: 43.30 | +7: iteration 80630/ 115203 | consumed samples: 20641280 | consumed tokens: 42273341440 | elapsed time per iteration (s): 0.55 | learning rate: 5.782E-05 | global batch size: 256 | lm loss: 2.611948E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.552 | TFLOPs: 44.00 | +7: iteration 80640/ 115203 | consumed samples: 20643840 | consumed tokens: 42278584320 | elapsed time per iteration (s): 0.57 | learning rate: 5.780E-05 | global batch size: 256 | lm loss: 2.618354E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.618 | TFLOPs: 43.06 | +7: iteration 80650/ 115203 | consumed samples: 20646400 | consumed tokens: 42283827200 | elapsed time per iteration (s): 0.55 | learning rate: 5.778E-05 | global batch size: 256 | lm loss: 2.623524E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.673 | TFLOPs: 44.02 | +7: iteration 80660/ 115203 | consumed samples: 20648960 | consumed tokens: 42289070080 | elapsed time per iteration (s): 0.57 | learning rate: 5.776E-05 | global batch size: 256 | lm loss: 2.614984E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.552 | TFLOPs: 43.05 | +7: iteration 80670/ 115203 | consumed samples: 20651520 | consumed tokens: 42294312960 | elapsed time per iteration (s): 0.57 | learning rate: 5.774E-05 | global batch size: 256 | lm loss: 2.625070E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.795 | TFLOPs: 42.50 | +7: iteration 80680/ 115203 | consumed samples: 20654080 | consumed tokens: 42299555840 | elapsed time per iteration (s): 0.57 | learning rate: 5.772E-05 | global batch size: 256 | lm loss: 2.625579E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.944 | TFLOPs: 42.80 | +7: iteration 80690/ 115203 | consumed samples: 20656640 | consumed tokens: 42304798720 | elapsed time per iteration (s): 0.56 | learning rate: 5.770E-05 | global batch size: 256 | lm loss: 2.626642E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.381 | TFLOPs: 43.22 | +7: iteration 80700/ 115203 | consumed samples: 20659200 | consumed tokens: 42310041600 | elapsed time per iteration (s): 0.58 | learning rate: 5.768E-05 | global batch size: 256 | lm loss: 2.623823E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.713 | TFLOPs: 42.21 | +7: iteration 80710/ 115203 | consumed samples: 20661760 | consumed tokens: 42315284480 | elapsed time per iteration (s): 0.60 | learning rate: 5.766E-05 | global batch size: 256 | lm loss: 2.620237E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.977 | TFLOPs: 40.90 | +7: iteration 80720/ 115203 | consumed samples: 20664320 | consumed tokens: 42320527360 | elapsed time per iteration (s): 0.56 | learning rate: 5.764E-05 | global batch size: 256 | lm loss: 2.612838E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.634 | TFLOPs: 43.34 | +7: iteration 80730/ 115203 | consumed samples: 20666880 | consumed tokens: 42325770240 | elapsed time per iteration (s): 0.58 | learning rate: 5.762E-05 | global batch size: 256 | lm loss: 2.604151E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.561 | TFLOPs: 42.00 | +7: iteration 80740/ 115203 | consumed samples: 20669440 | consumed tokens: 42331013120 | elapsed time per iteration (s): 0.58 | learning rate: 5.760E-05 | global batch size: 256 | lm loss: 2.624377E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.270 | TFLOPs: 41.78 | +7: iteration 80750/ 115203 | consumed samples: 20672000 | consumed tokens: 42336256000 | elapsed time per iteration (s): 0.58 | learning rate: 5.758E-05 | global batch size: 256 | lm loss: 2.609936E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.549 | TFLOPs: 41.81 | +7: iteration 80760/ 115203 | consumed samples: 20674560 | consumed tokens: 42341498880 | elapsed time per iteration (s): 0.58 | learning rate: 5.756E-05 | global batch size: 256 | lm loss: 2.617296E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.091 | TFLOPs: 41.86 | +7: iteration 80770/ 115203 | consumed samples: 20677120 | consumed tokens: 42346741760 | elapsed time per iteration (s): 0.57 | learning rate: 5.754E-05 | global batch size: 256 | lm loss: 2.612392E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.181 | TFLOPs: 42.73 | +7: iteration 80780/ 115203 | consumed samples: 20679680 | consumed tokens: 42351984640 | elapsed time per iteration (s): 0.59 | learning rate: 5.752E-05 | global batch size: 256 | lm loss: 2.621000E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.439 | TFLOPs: 41.32 | +7: iteration 80790/ 115203 | consumed samples: 20682240 | consumed tokens: 42357227520 | elapsed time per iteration (s): 0.59 | learning rate: 5.750E-05 | global batch size: 256 | lm loss: 2.606452E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.188 | TFLOPs: 41.59 | +7: iteration 80800/ 115203 | consumed samples: 20684800 | consumed tokens: 42362470400 | elapsed time per iteration (s): 0.57 | learning rate: 5.748E-05 | global batch size: 256 | lm loss: 2.614992E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.998 | TFLOPs: 42.62 | +7: iteration 80810/ 115203 | consumed samples: 20687360 | consumed tokens: 42367713280 | elapsed time per iteration (s): 0.57 | learning rate: 5.746E-05 | global batch size: 256 | lm loss: 2.616526E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.434 | TFLOPs: 42.47 | +7: iteration 80820/ 115203 | consumed samples: 20689920 | consumed tokens: 42372956160 | elapsed time per iteration (s): 0.58 | learning rate: 5.744E-05 | global batch size: 256 | lm loss: 2.613653E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.795 | TFLOPs: 42.41 | +7: iteration 80830/ 115203 | consumed samples: 20692480 | consumed tokens: 42378199040 | elapsed time per iteration (s): 0.59 | learning rate: 5.742E-05 | global batch size: 256 | lm loss: 2.614343E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.218 | TFLOPs: 41.49 | +7: iteration 80840/ 115203 | consumed samples: 20695040 | consumed tokens: 42383441920 | elapsed time per iteration (s): 0.58 | learning rate: 5.740E-05 | global batch size: 256 | lm loss: 2.630930E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.432 | TFLOPs: 42.09 | +7: iteration 80850/ 115203 | consumed samples: 20697600 | consumed tokens: 42388684800 | elapsed time per iteration (s): 0.59 | learning rate: 5.738E-05 | global batch size: 256 | lm loss: 2.621869E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.555 | TFLOPs: 41.53 | +7: iteration 80860/ 115203 | consumed samples: 20700160 | consumed tokens: 42393927680 | elapsed time per iteration (s): 0.57 | learning rate: 5.736E-05 | global batch size: 256 | lm loss: 2.599166E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.951 | TFLOPs: 42.90 | +7: iteration 80870/ 115203 | consumed samples: 20702720 | consumed tokens: 42399170560 | elapsed time per iteration (s): 0.58 | learning rate: 5.734E-05 | global batch size: 256 | lm loss: 2.623479E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.301 | TFLOPs: 41.98 | +7: iteration 80880/ 115203 | consumed samples: 20705280 | consumed tokens: 42404413440 | elapsed time per iteration (s): 0.58 | learning rate: 5.732E-05 | global batch size: 256 | lm loss: 2.624746E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.011 | TFLOPs: 42.33 | +7: iteration 80890/ 115203 | consumed samples: 20707840 | consumed tokens: 42409656320 | elapsed time per iteration (s): 0.59 | learning rate: 5.730E-05 | global batch size: 256 | lm loss: 2.618404E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.814 | TFLOPs: 41.45 | +7: iteration 80900/ 115203 | consumed samples: 20710400 | consumed tokens: 42414899200 | elapsed time per iteration (s): 0.58 | learning rate: 5.728E-05 | global batch size: 256 | lm loss: 2.621504E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.518 | TFLOPs: 42.09 | +7: iteration 80910/ 115203 | consumed samples: 20712960 | consumed tokens: 42420142080 | elapsed time per iteration (s): 0.57 | learning rate: 5.726E-05 | global batch size: 256 | lm loss: 2.624854E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.847 | TFLOPs: 42.51 | +7: iteration 80920/ 115203 | consumed samples: 20715520 | consumed tokens: 42425384960 | elapsed time per iteration (s): 0.59 | learning rate: 5.724E-05 | global batch size: 256 | lm loss: 2.624078E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.507 | TFLOPs: 41.23 | +7: iteration 80930/ 115203 | consumed samples: 20718080 | consumed tokens: 42430627840 | elapsed time per iteration (s): 0.57 | learning rate: 5.722E-05 | global batch size: 256 | lm loss: 2.632669E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.005 | TFLOPs: 42.81 | +7: iteration 80940/ 115203 | consumed samples: 20720640 | consumed tokens: 42435870720 | elapsed time per iteration (s): 0.58 | learning rate: 5.720E-05 | global batch size: 256 | lm loss: 2.621795E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.228 | TFLOPs: 41.78 | +7: iteration 80950/ 115203 | consumed samples: 20723200 | consumed tokens: 42441113600 | elapsed time per iteration (s): 0.57 | learning rate: 5.718E-05 | global batch size: 256 | lm loss: 2.604366E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.988 | TFLOPs: 43.19 | +7: iteration 80960/ 115203 | consumed samples: 20725760 | consumed tokens: 42446356480 | elapsed time per iteration (s): 0.58 | learning rate: 5.716E-05 | global batch size: 256 | lm loss: 2.620389E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.987 | TFLOPs: 41.76 | +7: iteration 80970/ 115203 | consumed samples: 20728320 | consumed tokens: 42451599360 | elapsed time per iteration (s): 0.58 | learning rate: 5.714E-05 | global batch size: 256 | lm loss: 2.613390E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.466 | TFLOPs: 42.09 | +7: iteration 80980/ 115203 | consumed samples: 20730880 | consumed tokens: 42456842240 | elapsed time per iteration (s): 0.57 | learning rate: 5.712E-05 | global batch size: 256 | lm loss: 2.608835E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.036 | TFLOPs: 42.62 | +7: iteration 80990/ 115203 | consumed samples: 20733440 | consumed tokens: 42462085120 | elapsed time per iteration (s): 0.56 | learning rate: 5.710E-05 | global batch size: 256 | lm loss: 2.604939E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.825 | TFLOPs: 43.46 | +7: iteration 81000/ 115203 | consumed samples: 20736000 | consumed tokens: 42467328000 | elapsed time per iteration (s): 0.58 | learning rate: 5.708E-05 | global batch size: 256 | lm loss: 2.612358E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.413 | TFLOPs: 41.99 | +7: iteration 81010/ 115203 | consumed samples: 20738560 | consumed tokens: 42472570880 | elapsed time per iteration (s): 0.59 | learning rate: 5.706E-05 | global batch size: 256 | lm loss: 2.615456E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.271 | TFLOPs: 41.40 | +7: iteration 81020/ 115203 | consumed samples: 20741120 | consumed tokens: 42477813760 | elapsed time per iteration (s): 0.56 | learning rate: 5.704E-05 | global batch size: 256 | lm loss: 2.626301E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.213 | TFLOPs: 43.21 | +7: iteration 81030/ 115203 | consumed samples: 20743680 | consumed tokens: 42483056640 | elapsed time per iteration (s): 0.58 | learning rate: 5.702E-05 | global batch size: 256 | lm loss: 2.614807E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.716 | TFLOPs: 42.30 | +7: iteration 81040/ 115203 | consumed samples: 20746240 | consumed tokens: 42488299520 | elapsed time per iteration (s): 0.59 | learning rate: 5.700E-05 | global batch size: 256 | lm loss: 2.627131E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.759 | TFLOPs: 41.26 | +7: iteration 81050/ 115203 | consumed samples: 20748800 | consumed tokens: 42493542400 | elapsed time per iteration (s): 0.58 | learning rate: 5.698E-05 | global batch size: 256 | lm loss: 2.616390E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.046 | TFLOPs: 42.14 | +7: iteration 81060/ 115203 | consumed samples: 20751360 | consumed tokens: 42498785280 | elapsed time per iteration (s): 0.56 | learning rate: 5.696E-05 | global batch size: 256 | lm loss: 2.603063E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.297 | TFLOPs: 43.60 | +7: iteration 81070/ 115203 | consumed samples: 20753920 | consumed tokens: 42504028160 | elapsed time per iteration (s): 0.60 | learning rate: 5.694E-05 | global batch size: 256 | lm loss: 2.611457E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.244 | TFLOPs: 40.73 | +7: iteration 81080/ 115203 | consumed samples: 20756480 | consumed tokens: 42509271040 | elapsed time per iteration (s): 0.59 | learning rate: 5.692E-05 | global batch size: 256 | lm loss: 2.608310E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.623 | TFLOPs: 41.63 | +7: iteration 81090/ 115203 | consumed samples: 20759040 | consumed tokens: 42514513920 | elapsed time per iteration (s): 0.57 | learning rate: 5.690E-05 | global batch size: 256 | lm loss: 2.615107E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.066 | TFLOPs: 42.81 | +7: iteration 81100/ 115203 | consumed samples: 20761600 | consumed tokens: 42519756800 | elapsed time per iteration (s): 0.58 | learning rate: 5.688E-05 | global batch size: 256 | lm loss: 2.616669E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.395 | TFLOPs: 42.27 | +7: iteration 81110/ 115203 | consumed samples: 20764160 | consumed tokens: 42524999680 | elapsed time per iteration (s): 0.57 | learning rate: 5.686E-05 | global batch size: 256 | lm loss: 2.608202E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.264 | TFLOPs: 42.83 | +7: iteration 81120/ 115203 | consumed samples: 20766720 | consumed tokens: 42530242560 | elapsed time per iteration (s): 0.57 | learning rate: 5.684E-05 | global batch size: 256 | lm loss: 2.597793E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.586 | TFLOPs: 43.05 | +7: iteration 81130/ 115203 | consumed samples: 20769280 | consumed tokens: 42535485440 | elapsed time per iteration (s): 0.57 | learning rate: 5.682E-05 | global batch size: 256 | lm loss: 2.603336E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.385 | TFLOPs: 42.84 | +7: iteration 81140/ 115203 | consumed samples: 20771840 | consumed tokens: 42540728320 | elapsed time per iteration (s): 0.56 | learning rate: 5.680E-05 | global batch size: 256 | lm loss: 2.615039E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.554 | TFLOPs: 43.34 | +7: iteration 81150/ 115203 | consumed samples: 20774400 | consumed tokens: 42545971200 | elapsed time per iteration (s): 0.57 | learning rate: 5.678E-05 | global batch size: 256 | lm loss: 2.617889E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.432 | TFLOPs: 42.85 | +7: iteration 81160/ 115203 | consumed samples: 20776960 | consumed tokens: 42551214080 | elapsed time per iteration (s): 0.59 | learning rate: 5.676E-05 | global batch size: 256 | lm loss: 2.592637E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.066 | TFLOPs: 41.48 | +7: iteration 81170/ 115203 | consumed samples: 20779520 | consumed tokens: 42556456960 | elapsed time per iteration (s): 0.57 | learning rate: 5.674E-05 | global batch size: 256 | lm loss: 2.630623E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.427 | TFLOPs: 42.56 | +7: iteration 81180/ 115203 | consumed samples: 20782080 | consumed tokens: 42561699840 | elapsed time per iteration (s): 0.59 | learning rate: 5.672E-05 | global batch size: 256 | lm loss: 2.629295E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.107 | TFLOPs: 41.58 | +7: iteration 81190/ 115203 | consumed samples: 20784640 | consumed tokens: 42566942720 | elapsed time per iteration (s): 0.57 | learning rate: 5.670E-05 | global batch size: 256 | lm loss: 2.614123E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.311 | TFLOPs: 42.65 | +7: iteration 81200/ 115203 | consumed samples: 20787200 | consumed tokens: 42572185600 | elapsed time per iteration (s): 0.58 | learning rate: 5.668E-05 | global batch size: 256 | lm loss: 2.621683E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.452 | TFLOPs: 42.28 | +7: iteration 81210/ 115203 | consumed samples: 20789760 | consumed tokens: 42577428480 | elapsed time per iteration (s): 0.57 | learning rate: 5.666E-05 | global batch size: 256 | lm loss: 2.602130E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.525 | TFLOPs: 42.86 | +7: iteration 81220/ 115203 | consumed samples: 20792320 | consumed tokens: 42582671360 | elapsed time per iteration (s): 0.58 | learning rate: 5.664E-05 | global batch size: 256 | lm loss: 2.626370E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.390 | TFLOPs: 42.08 | +7: iteration 81230/ 115203 | consumed samples: 20794880 | consumed tokens: 42587914240 | elapsed time per iteration (s): 0.57 | learning rate: 5.662E-05 | global batch size: 256 | lm loss: 2.620686E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.285 | TFLOPs: 42.93 | +7: iteration 81240/ 115203 | consumed samples: 20797440 | consumed tokens: 42593157120 | elapsed time per iteration (s): 0.58 | learning rate: 5.660E-05 | global batch size: 256 | lm loss: 2.602305E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.380 | TFLOPs: 42.18 | +7: iteration 81250/ 115203 | consumed samples: 20800000 | consumed tokens: 42598400000 | elapsed time per iteration (s): 0.55 | learning rate: 5.658E-05 | global batch size: 256 | lm loss: 2.614492E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.450 | TFLOPs: 43.99 | +7: iteration 81260/ 115203 | consumed samples: 20802560 | consumed tokens: 42603642880 | elapsed time per iteration (s): 0.57 | learning rate: 5.656E-05 | global batch size: 256 | lm loss: 2.609532E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.853 | TFLOPs: 42.51 | +7: iteration 81270/ 115203 | consumed samples: 20805120 | consumed tokens: 42608885760 | elapsed time per iteration (s): 0.57 | learning rate: 5.654E-05 | global batch size: 256 | lm loss: 2.634712E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.089 | TFLOPs: 43.01 | +7: iteration 81280/ 115203 | consumed samples: 20807680 | consumed tokens: 42614128640 | elapsed time per iteration (s): 0.55 | learning rate: 5.652E-05 | global batch size: 256 | lm loss: 2.617842E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 81290/ 115203 | consumed samples: 20810240 | consumed tokens: 42619371520 | elapsed time per iteration (s): 0.58 | learning rate: 5.650E-05 | global batch size: 256 | lm loss: 2.618133E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.304 | TFLOPs: 42.17 | +7: iteration 81300/ 115203 | consumed samples: 20812800 | consumed tokens: 42624614400 | elapsed time per iteration (s): 0.57 | learning rate: 5.648E-05 | global batch size: 256 | lm loss: 2.607702E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.967 | TFLOPs: 43.19 | +7: iteration 81310/ 115203 | consumed samples: 20815360 | consumed tokens: 42629857280 | elapsed time per iteration (s): 0.58 | learning rate: 5.646E-05 | global batch size: 256 | lm loss: 2.605213E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.931 | TFLOPs: 42.42 | +7: iteration 81320/ 115203 | consumed samples: 20817920 | consumed tokens: 42635100160 | elapsed time per iteration (s): 0.57 | learning rate: 5.644E-05 | global batch size: 256 | lm loss: 2.612689E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.480 | TFLOPs: 42.47 | +7: iteration 81330/ 115203 | consumed samples: 20820480 | consumed tokens: 42640343040 | elapsed time per iteration (s): 0.59 | learning rate: 5.642E-05 | global batch size: 256 | lm loss: 2.610241E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.951 | TFLOPs: 41.66 | +7: iteration 81340/ 115203 | consumed samples: 20823040 | consumed tokens: 42645585920 | elapsed time per iteration (s): 0.58 | learning rate: 5.640E-05 | global batch size: 256 | lm loss: 2.619369E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.996 | TFLOPs: 42.43 | +7: iteration 81350/ 115203 | consumed samples: 20825600 | consumed tokens: 42650828800 | elapsed time per iteration (s): 0.59 | learning rate: 5.638E-05 | global batch size: 256 | lm loss: 2.608826E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.299 | TFLOPs: 41.02 | +7: iteration 81360/ 115203 | consumed samples: 20828160 | consumed tokens: 42656071680 | elapsed time per iteration (s): 0.57 | learning rate: 5.636E-05 | global batch size: 256 | lm loss: 2.604979E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.020 | TFLOPs: 43.19 | +7: iteration 81370/ 115203 | consumed samples: 20830720 | consumed tokens: 42661314560 | elapsed time per iteration (s): 0.58 | learning rate: 5.634E-05 | global batch size: 256 | lm loss: 2.605243E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.090 | TFLOPs: 42.05 | +7: iteration 81380/ 115203 | consumed samples: 20833280 | consumed tokens: 42666557440 | elapsed time per iteration (s): 0.56 | learning rate: 5.632E-05 | global batch size: 256 | lm loss: 2.605855E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.360 | TFLOPs: 43.22 | +7: iteration 81390/ 115203 | consumed samples: 20835840 | consumed tokens: 42671800320 | elapsed time per iteration (s): 0.58 | learning rate: 5.630E-05 | global batch size: 256 | lm loss: 2.626394E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.025 | TFLOPs: 41.86 | +7: iteration 81400/ 115203 | consumed samples: 20838400 | consumed tokens: 42677043200 | elapsed time per iteration (s): 0.57 | learning rate: 5.628E-05 | global batch size: 256 | lm loss: 2.613237E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.780 | TFLOPs: 42.79 | +7: iteration 81410/ 115203 | consumed samples: 20840960 | consumed tokens: 42682286080 | elapsed time per iteration (s): 0.57 | learning rate: 5.626E-05 | global batch size: 256 | lm loss: 2.607368E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.476 | TFLOPs: 42.95 | +7: iteration 81420/ 115203 | consumed samples: 20843520 | consumed tokens: 42687528960 | elapsed time per iteration (s): 0.57 | learning rate: 5.624E-05 | global batch size: 256 | lm loss: 2.615102E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.816 | TFLOPs: 42.50 | +7: iteration 81430/ 115203 | consumed samples: 20846080 | consumed tokens: 42692771840 | elapsed time per iteration (s): 0.57 | learning rate: 5.622E-05 | global batch size: 256 | lm loss: 2.628564E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.174 | TFLOPs: 42.63 | +7: iteration 81440/ 115203 | consumed samples: 20848640 | consumed tokens: 42698014720 | elapsed time per iteration (s): 0.55 | learning rate: 5.620E-05 | global batch size: 256 | lm loss: 2.602687E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.715 | TFLOPs: 44.02 | +7: iteration 81450/ 115203 | consumed samples: 20851200 | consumed tokens: 42703257600 | elapsed time per iteration (s): 0.58 | learning rate: 5.618E-05 | global batch size: 256 | lm loss: 2.627538E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.125 | TFLOPs: 41.87 | +7: iteration 81460/ 115203 | consumed samples: 20853760 | consumed tokens: 42708500480 | elapsed time per iteration (s): 0.58 | learning rate: 5.616E-05 | global batch size: 256 | lm loss: 2.614130E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.800 | TFLOPs: 41.74 | +7: iteration 81470/ 115203 | consumed samples: 20856320 | consumed tokens: 42713743360 | elapsed time per iteration (s): 0.56 | learning rate: 5.614E-05 | global batch size: 256 | lm loss: 2.617762E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.209 | TFLOPs: 43.40 | +7: iteration 81480/ 115203 | consumed samples: 20858880 | consumed tokens: 42718986240 | elapsed time per iteration (s): 0.58 | learning rate: 5.612E-05 | global batch size: 256 | lm loss: 2.596774E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.001 | TFLOPs: 41.95 | +7: iteration 81490/ 115203 | consumed samples: 20861440 | consumed tokens: 42724229120 | elapsed time per iteration (s): 0.56 | learning rate: 5.610E-05 | global batch size: 256 | lm loss: 2.617530E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.339 | TFLOPs: 43.32 | +7: iteration 81500/ 115203 | consumed samples: 20864000 | consumed tokens: 42729472000 | elapsed time per iteration (s): 0.59 | learning rate: 5.608E-05 | global batch size: 256 | lm loss: 2.610645E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.551 | TFLOPs: 41.05 | +7: iteration 81510/ 115203 | consumed samples: 20866560 | consumed tokens: 42734714880 | elapsed time per iteration (s): 0.57 | learning rate: 5.606E-05 | global batch size: 256 | lm loss: 2.616790E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.474 | TFLOPs: 42.47 | +7: iteration 81520/ 115203 | consumed samples: 20869120 | consumed tokens: 42739957760 | elapsed time per iteration (s): 0.57 | learning rate: 5.604E-05 | global batch size: 256 | lm loss: 2.607909E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.773 | TFLOPs: 42.50 | +7: iteration 81530/ 115203 | consumed samples: 20871680 | consumed tokens: 42745200640 | elapsed time per iteration (s): 0.56 | learning rate: 5.602E-05 | global batch size: 256 | lm loss: 2.609693E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.437 | TFLOPs: 43.71 | +7: iteration 81540/ 115203 | consumed samples: 20874240 | consumed tokens: 42750443520 | elapsed time per iteration (s): 0.56 | learning rate: 5.600E-05 | global batch size: 256 | lm loss: 2.613531E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.847 | TFLOPs: 43.46 | +7: iteration 81550/ 115203 | consumed samples: 20876800 | consumed tokens: 42755686400 | elapsed time per iteration (s): 0.58 | learning rate: 5.598E-05 | global batch size: 256 | lm loss: 2.601966E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.386 | TFLOPs: 41.89 | +7: iteration 81560/ 115203 | consumed samples: 20879360 | consumed tokens: 42760929280 | elapsed time per iteration (s): 0.57 | learning rate: 5.596E-05 | global batch size: 256 | lm loss: 2.611832E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.641 | TFLOPs: 42.96 | +7: iteration 81570/ 115203 | consumed samples: 20881920 | consumed tokens: 42766172160 | elapsed time per iteration (s): 0.57 | learning rate: 5.594E-05 | global batch size: 256 | lm loss: 2.608937E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.447 | TFLOPs: 42.47 | +7: iteration 81580/ 115203 | consumed samples: 20884480 | consumed tokens: 42771415040 | elapsed time per iteration (s): 0.57 | learning rate: 5.592E-05 | global batch size: 256 | lm loss: 2.603533E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.495 | TFLOPs: 42.57 | +7: iteration 81590/ 115203 | consumed samples: 20887040 | consumed tokens: 42776657920 | elapsed time per iteration (s): 0.57 | learning rate: 5.590E-05 | global batch size: 256 | lm loss: 2.626289E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.209 | TFLOPs: 42.64 | +7: iteration 81600/ 115203 | consumed samples: 20889600 | consumed tokens: 42781900800 | elapsed time per iteration (s): 0.57 | learning rate: 5.588E-05 | global batch size: 256 | lm loss: 2.610895E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.614 | TFLOPs: 42.77 | +7: iteration 81610/ 115203 | consumed samples: 20892160 | consumed tokens: 42787143680 | elapsed time per iteration (s): 0.57 | learning rate: 5.586E-05 | global batch size: 256 | lm loss: 2.614475E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.921 | TFLOPs: 42.61 | +7: iteration 81620/ 115203 | consumed samples: 20894720 | consumed tokens: 42792386560 | elapsed time per iteration (s): 0.57 | learning rate: 5.584E-05 | global batch size: 256 | lm loss: 2.615832E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.885 | TFLOPs: 42.80 | +7: iteration 81630/ 115203 | consumed samples: 20897280 | consumed tokens: 42797629440 | elapsed time per iteration (s): 0.56 | learning rate: 5.582E-05 | global batch size: 256 | lm loss: 2.621384E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.344 | TFLOPs: 43.60 | +7: iteration 81640/ 115203 | consumed samples: 20899840 | consumed tokens: 42802872320 | elapsed time per iteration (s): 0.57 | learning rate: 5.580E-05 | global batch size: 256 | lm loss: 2.623954E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.655 | TFLOPs: 43.06 | +7: iteration 81650/ 115203 | consumed samples: 20902400 | consumed tokens: 42808115200 | elapsed time per iteration (s): 0.57 | learning rate: 5.578E-05 | global batch size: 256 | lm loss: 2.614945E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.405 | TFLOPs: 42.94 | +7: iteration 81660/ 115203 | consumed samples: 20904960 | consumed tokens: 42813358080 | elapsed time per iteration (s): 0.57 | learning rate: 5.576E-05 | global batch size: 256 | lm loss: 2.608116E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.387 | TFLOPs: 42.56 | +7: iteration 81670/ 115203 | consumed samples: 20907520 | consumed tokens: 42818600960 | elapsed time per iteration (s): 0.56 | learning rate: 5.574E-05 | global batch size: 256 | lm loss: 2.616720E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.444 | TFLOPs: 43.52 | +7: iteration 81680/ 115203 | consumed samples: 20910080 | consumed tokens: 42823843840 | elapsed time per iteration (s): 0.57 | learning rate: 5.572E-05 | global batch size: 256 | lm loss: 2.613239E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.687 | TFLOPs: 43.16 | +7: iteration 81690/ 115203 | consumed samples: 20912640 | consumed tokens: 42829086720 | elapsed time per iteration (s): 0.57 | learning rate: 5.570E-05 | global batch size: 256 | lm loss: 2.615316E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.176 | TFLOPs: 43.01 | +7: iteration 81700/ 115203 | consumed samples: 20915200 | consumed tokens: 42834329600 | elapsed time per iteration (s): 0.56 | learning rate: 5.568E-05 | global batch size: 256 | lm loss: 2.609261E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.877 | TFLOPs: 43.75 | +7: iteration 81710/ 115203 | consumed samples: 20917760 | consumed tokens: 42839572480 | elapsed time per iteration (s): 0.56 | learning rate: 5.566E-05 | global batch size: 256 | lm loss: 2.617865E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.207 | TFLOPs: 43.40 | +7: iteration 81720/ 115203 | consumed samples: 20920320 | consumed tokens: 42844815360 | elapsed time per iteration (s): 0.56 | learning rate: 5.564E-05 | global batch size: 256 | lm loss: 2.616353E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.568 | TFLOPs: 43.24 | +7: iteration 81730/ 115203 | consumed samples: 20922880 | consumed tokens: 42850058240 | elapsed time per iteration (s): 0.57 | learning rate: 5.562E-05 | global batch size: 256 | lm loss: 2.605934E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.703 | TFLOPs: 42.87 | +7: iteration 81740/ 115203 | consumed samples: 20925440 | consumed tokens: 42855301120 | elapsed time per iteration (s): 0.56 | learning rate: 5.560E-05 | global batch size: 256 | lm loss: 2.610664E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.708 | TFLOPs: 43.92 | +7: iteration 81750/ 115203 | consumed samples: 20928000 | consumed tokens: 42860544000 | elapsed time per iteration (s): 0.57 | learning rate: 5.558E-05 | global batch size: 256 | lm loss: 2.601055E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.856 | TFLOPs: 42.89 | +7: iteration 81760/ 115203 | consumed samples: 20930560 | consumed tokens: 42865786880 | elapsed time per iteration (s): 0.56 | learning rate: 5.556E-05 | global batch size: 256 | lm loss: 2.607448E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.039 | TFLOPs: 43.67 | +7: iteration 81770/ 115203 | consumed samples: 20933120 | consumed tokens: 42871029760 | elapsed time per iteration (s): 0.56 | learning rate: 5.554E-05 | global batch size: 256 | lm loss: 2.617338E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.126 | TFLOPs: 43.58 | +7: iteration 81780/ 115203 | consumed samples: 20935680 | consumed tokens: 42876272640 | elapsed time per iteration (s): 0.55 | learning rate: 5.552E-05 | global batch size: 256 | lm loss: 2.620531E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 81790/ 115203 | consumed samples: 20938240 | consumed tokens: 42881515520 | elapsed time per iteration (s): 0.57 | learning rate: 5.550E-05 | global batch size: 256 | lm loss: 2.610660E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.563 | TFLOPs: 43.15 | +7: iteration 81800/ 115203 | consumed samples: 20940800 | consumed tokens: 42886758400 | elapsed time per iteration (s): 0.58 | learning rate: 5.548E-05 | global batch size: 256 | lm loss: 2.608562E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.538 | TFLOPs: 42.29 | +7: iteration 81810/ 115203 | consumed samples: 20943360 | consumed tokens: 42892001280 | elapsed time per iteration (s): 0.55 | learning rate: 5.547E-05 | global batch size: 256 | lm loss: 2.612101E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.780 | TFLOPs: 44.03 | +7: iteration 81820/ 115203 | consumed samples: 20945920 | consumed tokens: 42897244160 | elapsed time per iteration (s): 0.56 | learning rate: 5.545E-05 | global batch size: 256 | lm loss: 2.618050E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.177 | TFLOPs: 43.59 | +7: iteration 81830/ 115203 | consumed samples: 20948480 | consumed tokens: 42902487040 | elapsed time per iteration (s): 0.57 | learning rate: 5.543E-05 | global batch size: 256 | lm loss: 2.610176E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.091 | TFLOPs: 42.91 | +7: iteration 81840/ 115203 | consumed samples: 20951040 | consumed tokens: 42907729920 | elapsed time per iteration (s): 0.57 | learning rate: 5.541E-05 | global batch size: 256 | lm loss: 2.616517E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.055 | TFLOPs: 43.00 | +7: iteration 81850/ 115203 | consumed samples: 20953600 | consumed tokens: 42912972800 | elapsed time per iteration (s): 0.57 | learning rate: 5.539E-05 | global batch size: 256 | lm loss: 2.609662E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.901 | TFLOPs: 43.08 | +7: iteration 81860/ 115203 | consumed samples: 20956160 | consumed tokens: 42918215680 | elapsed time per iteration (s): 0.57 | learning rate: 5.537E-05 | global batch size: 256 | lm loss: 2.616782E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.520 | TFLOPs: 43.14 | +7: iteration 81870/ 115203 | consumed samples: 20958720 | consumed tokens: 42923458560 | elapsed time per iteration (s): 0.56 | learning rate: 5.535E-05 | global batch size: 256 | lm loss: 2.606233E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.539 | TFLOPs: 43.43 | +7: iteration 81880/ 115203 | consumed samples: 20961280 | consumed tokens: 42928701440 | elapsed time per iteration (s): 0.57 | learning rate: 5.533E-05 | global batch size: 256 | lm loss: 2.623749E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.360 | TFLOPs: 42.65 | +7: iteration 81890/ 115203 | consumed samples: 20963840 | consumed tokens: 42933944320 | elapsed time per iteration (s): 0.56 | learning rate: 5.531E-05 | global batch size: 256 | lm loss: 2.606176E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.590 | TFLOPs: 43.44 | +7: iteration 81900/ 115203 | consumed samples: 20966400 | consumed tokens: 42939187200 | elapsed time per iteration (s): 0.56 | learning rate: 5.529E-05 | global batch size: 256 | lm loss: 2.613631E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.986 | TFLOPs: 43.57 | +7: iteration 81910/ 115203 | consumed samples: 20968960 | consumed tokens: 42944430080 | elapsed time per iteration (s): 0.56 | learning rate: 5.527E-05 | global batch size: 256 | lm loss: 2.616437E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.258 | TFLOPs: 43.21 | +7: iteration 81920/ 115203 | consumed samples: 20971520 | consumed tokens: 42949672960 | elapsed time per iteration (s): 0.57 | learning rate: 5.525E-05 | global batch size: 256 | lm loss: 2.610859E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.458 | TFLOPs: 42.66 | +7: iteration 81930/ 115203 | consumed samples: 20974080 | consumed tokens: 42954915840 | elapsed time per iteration (s): 0.56 | learning rate: 5.523E-05 | global batch size: 256 | lm loss: 2.626153E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.997 | TFLOPs: 43.67 | +7: iteration 81940/ 115203 | consumed samples: 20976640 | consumed tokens: 42960158720 | elapsed time per iteration (s): 0.56 | learning rate: 5.521E-05 | global batch size: 256 | lm loss: 2.609169E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.033 | TFLOPs: 43.67 | +7: iteration 81950/ 115203 | consumed samples: 20979200 | consumed tokens: 42965401600 | elapsed time per iteration (s): 0.55 | learning rate: 5.519E-05 | global batch size: 256 | lm loss: 2.612193E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.397 | TFLOPs: 43.99 | +7: iteration 81960/ 115203 | consumed samples: 20981760 | consumed tokens: 42970644480 | elapsed time per iteration (s): 0.56 | learning rate: 5.517E-05 | global batch size: 256 | lm loss: 2.613335E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.414 | TFLOPs: 43.32 | +7: iteration 81970/ 115203 | consumed samples: 20984320 | consumed tokens: 42975887360 | elapsed time per iteration (s): 0.56 | learning rate: 5.515E-05 | global batch size: 256 | lm loss: 2.609506E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.150 | TFLOPs: 43.30 | +7: iteration 81980/ 115203 | consumed samples: 20986880 | consumed tokens: 42981130240 | elapsed time per iteration (s): 0.55 | learning rate: 5.513E-05 | global batch size: 256 | lm loss: 2.616353E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.822 | TFLOPs: 44.03 | +7: iteration 81990/ 115203 | consumed samples: 20989440 | consumed tokens: 42986373120 | elapsed time per iteration (s): 0.57 | learning rate: 5.511E-05 | global batch size: 256 | lm loss: 2.619759E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.103 | TFLOPs: 43.10 | +0: [2023-03-17 01:41:27,596] [INFO] [logging.py:68:log_dist] [Rank 0] step=82000, skipped=0, lr=[5.5091074271143155e-05, 5.5091074271143155e-05, 5.5091074271143155e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 82000/ 115203 | consumed samples: 20992000 | consumed tokens: 42991616000 | elapsed time per iteration (s): 0.56 | learning rate: 5.509E-05 | global batch size: 256 | lm loss: 2.612587E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.880 | TFLOPs: 43.75 | +0: steps: 82000 loss: 2.6251 iter time (s): 0.566 samples/sec: 452.666 +7: iteration 82010/ 115203 | consumed samples: 20994560 | consumed tokens: 42996858880 | elapsed time per iteration (s): 0.57 | learning rate: 5.507E-05 | global batch size: 256 | lm loss: 2.605283E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.905 | TFLOPs: 42.80 | +7: iteration 82020/ 115203 | consumed samples: 20997120 | consumed tokens: 43002101760 | elapsed time per iteration (s): 0.56 | learning rate: 5.505E-05 | global batch size: 256 | lm loss: 2.611874E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.697 | TFLOPs: 43.54 | +7: iteration 82030/ 115203 | consumed samples: 20999680 | consumed tokens: 43007344640 | elapsed time per iteration (s): 0.56 | learning rate: 5.503E-05 | global batch size: 256 | lm loss: 2.616928E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.554 | TFLOPs: 43.24 | +7: iteration 82040/ 115203 | consumed samples: 21002240 | consumed tokens: 43012587520 | elapsed time per iteration (s): 0.56 | learning rate: 5.501E-05 | global batch size: 256 | lm loss: 2.616212E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.793 | TFLOPs: 43.45 | +7: iteration 82050/ 115203 | consumed samples: 21004800 | consumed tokens: 43017830400 | elapsed time per iteration (s): 0.56 | learning rate: 5.499E-05 | global batch size: 256 | lm loss: 2.622213E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.540 | TFLOPs: 43.62 | +7: iteration 82060/ 115203 | consumed samples: 21007360 | consumed tokens: 43023073280 | elapsed time per iteration (s): 0.57 | learning rate: 5.497E-05 | global batch size: 256 | lm loss: 2.608189E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.630 | TFLOPs: 42.49 | +7: iteration 82070/ 115203 | consumed samples: 21009920 | consumed tokens: 43028316160 | elapsed time per iteration (s): 0.56 | learning rate: 5.495E-05 | global batch size: 256 | lm loss: 2.609170E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.190 | TFLOPs: 43.49 | +7: iteration 82080/ 115203 | consumed samples: 21012480 | consumed tokens: 43033559040 | elapsed time per iteration (s): 0.56 | learning rate: 5.493E-05 | global batch size: 256 | lm loss: 2.615461E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.185 | TFLOPs: 43.49 | +7: iteration 82090/ 115203 | consumed samples: 21015040 | consumed tokens: 43038801920 | elapsed time per iteration (s): 0.56 | learning rate: 5.491E-05 | global batch size: 256 | lm loss: 2.608778E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.177 | TFLOPs: 43.40 | +7: iteration 82100/ 115203 | consumed samples: 21017600 | consumed tokens: 43044044800 | elapsed time per iteration (s): 0.56 | learning rate: 5.489E-05 | global batch size: 256 | lm loss: 2.618276E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.768 | TFLOPs: 43.64 | +7: iteration 82110/ 115203 | consumed samples: 21020160 | consumed tokens: 43049287680 | elapsed time per iteration (s): 0.56 | learning rate: 5.488E-05 | global batch size: 256 | lm loss: 2.622120E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.893 | TFLOPs: 43.46 | +7: iteration 82120/ 115203 | consumed samples: 21022720 | consumed tokens: 43054530560 | elapsed time per iteration (s): 0.56 | learning rate: 5.486E-05 | global batch size: 256 | lm loss: 2.623675E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.865 | TFLOPs: 43.37 | +7: iteration 82130/ 115203 | consumed samples: 21025280 | consumed tokens: 43059773440 | elapsed time per iteration (s): 0.56 | learning rate: 5.484E-05 | global batch size: 256 | lm loss: 2.620185E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.690 | TFLOPs: 43.45 | +7: iteration 82140/ 115203 | consumed samples: 21027840 | consumed tokens: 43065016320 | elapsed time per iteration (s): 0.56 | learning rate: 5.482E-05 | global batch size: 256 | lm loss: 2.603706E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.678 | TFLOPs: 43.44 | +7: iteration 82150/ 115203 | consumed samples: 21030400 | consumed tokens: 43070259200 | elapsed time per iteration (s): 0.56 | learning rate: 5.480E-05 | global batch size: 256 | lm loss: 2.612776E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.238 | TFLOPs: 43.40 | +7: iteration 82160/ 115203 | consumed samples: 21032960 | consumed tokens: 43075502080 | elapsed time per iteration (s): 0.56 | learning rate: 5.478E-05 | global batch size: 256 | lm loss: 2.613459E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.260 | TFLOPs: 43.88 | +7: iteration 82170/ 115203 | consumed samples: 21035520 | consumed tokens: 43080744960 | elapsed time per iteration (s): 0.56 | learning rate: 5.476E-05 | global batch size: 256 | lm loss: 2.606067E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.285 | TFLOPs: 43.88 | +7: iteration 82180/ 115203 | consumed samples: 21038080 | consumed tokens: 43085987840 | elapsed time per iteration (s): 0.55 | learning rate: 5.474E-05 | global batch size: 256 | lm loss: 2.601952E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 82190/ 115203 | consumed samples: 21040640 | consumed tokens: 43091230720 | elapsed time per iteration (s): 0.56 | learning rate: 5.472E-05 | global batch size: 256 | lm loss: 2.604118E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.299 | TFLOPs: 43.50 | +7: iteration 82200/ 115203 | consumed samples: 21043200 | consumed tokens: 43096473600 | elapsed time per iteration (s): 0.56 | learning rate: 5.470E-05 | global batch size: 256 | lm loss: 2.606745E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.577 | TFLOPs: 43.72 | +7: iteration 82210/ 115203 | consumed samples: 21045760 | consumed tokens: 43101716480 | elapsed time per iteration (s): 0.57 | learning rate: 5.468E-05 | global batch size: 256 | lm loss: 2.605075E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.532 | TFLOPs: 42.95 | +7: iteration 82220/ 115203 | consumed samples: 21048320 | consumed tokens: 43106959360 | elapsed time per iteration (s): 0.56 | learning rate: 5.466E-05 | global batch size: 256 | lm loss: 2.622048E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.537 | TFLOPs: 43.72 | +7: iteration 82230/ 115203 | consumed samples: 21050880 | consumed tokens: 43112202240 | elapsed time per iteration (s): 0.56 | learning rate: 5.464E-05 | global batch size: 256 | lm loss: 2.615520E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.110 | TFLOPs: 43.20 | +7: iteration 82240/ 115203 | consumed samples: 21053440 | consumed tokens: 43117445120 | elapsed time per iteration (s): 0.57 | learning rate: 5.462E-05 | global batch size: 256 | lm loss: 2.599478E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.315 | TFLOPs: 42.74 | +7: iteration 82250/ 115203 | consumed samples: 21056000 | consumed tokens: 43122688000 | elapsed time per iteration (s): 0.57 | learning rate: 5.460E-05 | global batch size: 256 | lm loss: 2.615378E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.889 | TFLOPs: 43.18 | +7: iteration 82260/ 115203 | consumed samples: 21058560 | consumed tokens: 43127930880 | elapsed time per iteration (s): 0.56 | learning rate: 5.458E-05 | global batch size: 256 | lm loss: 2.616339E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.395 | TFLOPs: 43.51 | +7: iteration 82270/ 115203 | consumed samples: 21061120 | consumed tokens: 43133173760 | elapsed time per iteration (s): 0.57 | learning rate: 5.456E-05 | global batch size: 256 | lm loss: 2.619413E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.656 | TFLOPs: 43.06 | +7: iteration 82280/ 115203 | consumed samples: 21063680 | consumed tokens: 43138416640 | elapsed time per iteration (s): 0.56 | learning rate: 5.454E-05 | global batch size: 256 | lm loss: 2.608963E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.756 | TFLOPs: 43.64 | +7: iteration 82290/ 115203 | consumed samples: 21066240 | consumed tokens: 43143659520 | elapsed time per iteration (s): 0.56 | learning rate: 5.452E-05 | global batch size: 256 | lm loss: 2.634382E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.476 | TFLOPs: 43.42 | +7: iteration 82300/ 115203 | consumed samples: 21068800 | consumed tokens: 43148902400 | elapsed time per iteration (s): 0.56 | learning rate: 5.450E-05 | global batch size: 256 | lm loss: 2.629498E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.498 | TFLOPs: 43.62 | +7: iteration 82310/ 115203 | consumed samples: 21071360 | consumed tokens: 43154145280 | elapsed time per iteration (s): 0.57 | learning rate: 5.448E-05 | global batch size: 256 | lm loss: 2.600660E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.496 | TFLOPs: 43.05 | +7: iteration 82320/ 115203 | consumed samples: 21073920 | consumed tokens: 43159388160 | elapsed time per iteration (s): 0.56 | learning rate: 5.446E-05 | global batch size: 256 | lm loss: 2.628844E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.600 | TFLOPs: 43.63 | +7: iteration 82330/ 115203 | consumed samples: 21076480 | consumed tokens: 43164631040 | elapsed time per iteration (s): 0.56 | learning rate: 5.445E-05 | global batch size: 256 | lm loss: 2.622268E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.841 | TFLOPs: 43.46 | +7: iteration 82340/ 115203 | consumed samples: 21079040 | consumed tokens: 43169873920 | elapsed time per iteration (s): 0.55 | learning rate: 5.443E-05 | global batch size: 256 | lm loss: 2.615612E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.350 | TFLOPs: 43.98 | +7: iteration 82350/ 115203 | consumed samples: 21081600 | consumed tokens: 43175116800 | elapsed time per iteration (s): 0.57 | learning rate: 5.441E-05 | global batch size: 256 | lm loss: 2.610920E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.081 | TFLOPs: 42.53 | +7: iteration 82360/ 115203 | consumed samples: 21084160 | consumed tokens: 43180359680 | elapsed time per iteration (s): 0.56 | learning rate: 5.439E-05 | global batch size: 256 | lm loss: 2.618017E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.202 | TFLOPs: 43.68 | +7: iteration 82370/ 115203 | consumed samples: 21086720 | consumed tokens: 43185602560 | elapsed time per iteration (s): 0.57 | learning rate: 5.437E-05 | global batch size: 256 | lm loss: 2.609171E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.344 | TFLOPs: 42.65 | +7: iteration 82380/ 115203 | consumed samples: 21089280 | consumed tokens: 43190845440 | elapsed time per iteration (s): 0.57 | learning rate: 5.435E-05 | global batch size: 256 | lm loss: 2.608222E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.218 | TFLOPs: 43.11 | +7: iteration 82390/ 115203 | consumed samples: 21091840 | consumed tokens: 43196088320 | elapsed time per iteration (s): 0.56 | learning rate: 5.433E-05 | global batch size: 256 | lm loss: 2.614749E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.307 | TFLOPs: 43.41 | +7: iteration 82400/ 115203 | consumed samples: 21094400 | consumed tokens: 43201331200 | elapsed time per iteration (s): 0.57 | learning rate: 5.431E-05 | global batch size: 256 | lm loss: 2.606688E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.137 | TFLOPs: 42.92 | +7: iteration 82410/ 115203 | consumed samples: 21096960 | consumed tokens: 43206574080 | elapsed time per iteration (s): 0.57 | learning rate: 5.429E-05 | global batch size: 256 | lm loss: 2.607259E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.308 | TFLOPs: 43.12 | +7: iteration 82420/ 115203 | consumed samples: 21099520 | consumed tokens: 43211816960 | elapsed time per iteration (s): 0.57 | learning rate: 5.427E-05 | global batch size: 256 | lm loss: 2.616183E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.894 | TFLOPs: 42.80 | +7: iteration 82430/ 115203 | consumed samples: 21102080 | consumed tokens: 43217059840 | elapsed time per iteration (s): 0.56 | learning rate: 5.425E-05 | global batch size: 256 | lm loss: 2.600897E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 82440/ 115203 | consumed samples: 21104640 | consumed tokens: 43222302720 | elapsed time per iteration (s): 0.56 | learning rate: 5.423E-05 | global batch size: 256 | lm loss: 2.600510E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.814 | TFLOPs: 43.65 | +7: iteration 82450/ 115203 | consumed samples: 21107200 | consumed tokens: 43227545600 | elapsed time per iteration (s): 0.59 | learning rate: 5.421E-05 | global batch size: 256 | lm loss: 2.600149E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.341 | TFLOPs: 41.70 | +7: iteration 82460/ 115203 | consumed samples: 21109760 | consumed tokens: 43232788480 | elapsed time per iteration (s): 0.56 | learning rate: 5.419E-05 | global batch size: 256 | lm loss: 2.613699E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.249 | TFLOPs: 43.69 | +7: iteration 82470/ 115203 | consumed samples: 21112320 | consumed tokens: 43238031360 | elapsed time per iteration (s): 0.56 | learning rate: 5.417E-05 | global batch size: 256 | lm loss: 2.600113E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.474 | TFLOPs: 43.90 | +7: iteration 82480/ 115203 | consumed samples: 21114880 | consumed tokens: 43243274240 | elapsed time per iteration (s): 0.57 | learning rate: 5.415E-05 | global batch size: 256 | lm loss: 2.607404E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.680 | TFLOPs: 42.78 | +7: iteration 82490/ 115203 | consumed samples: 21117440 | consumed tokens: 43248517120 | elapsed time per iteration (s): 0.57 | learning rate: 5.413E-05 | global batch size: 256 | lm loss: 2.607331E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.875 | TFLOPs: 43.18 | +7: iteration 82500/ 115203 | consumed samples: 21120000 | consumed tokens: 43253760000 | elapsed time per iteration (s): 0.55 | learning rate: 5.411E-05 | global batch size: 256 | lm loss: 2.589590E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 82510/ 115203 | consumed samples: 21122560 | consumed tokens: 43259002880 | elapsed time per iteration (s): 0.57 | learning rate: 5.409E-05 | global batch size: 256 | lm loss: 2.622997E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.175 | TFLOPs: 42.92 | +7: iteration 82520/ 115203 | consumed samples: 21125120 | consumed tokens: 43264245760 | elapsed time per iteration (s): 0.56 | learning rate: 5.408E-05 | global batch size: 256 | lm loss: 2.615492E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.330 | TFLOPs: 43.70 | +7: iteration 82530/ 115203 | consumed samples: 21127680 | consumed tokens: 43269488640 | elapsed time per iteration (s): 0.55 | learning rate: 5.406E-05 | global batch size: 256 | lm loss: 2.609051E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 82540/ 115203 | consumed samples: 21130240 | consumed tokens: 43274731520 | elapsed time per iteration (s): 0.56 | learning rate: 5.404E-05 | global batch size: 256 | lm loss: 2.623183E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.824 | TFLOPs: 43.55 | +7: iteration 82550/ 115203 | consumed samples: 21132800 | consumed tokens: 43279974400 | elapsed time per iteration (s): 0.56 | learning rate: 5.402E-05 | global batch size: 256 | lm loss: 2.628197E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.180 | TFLOPs: 43.30 | +7: iteration 82560/ 115203 | consumed samples: 21135360 | consumed tokens: 43285217280 | elapsed time per iteration (s): 0.56 | learning rate: 5.400E-05 | global batch size: 256 | lm loss: 2.605348E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.772 | TFLOPs: 43.55 | +7: iteration 82570/ 115203 | consumed samples: 21137920 | consumed tokens: 43290460160 | elapsed time per iteration (s): 0.56 | learning rate: 5.398E-05 | global batch size: 256 | lm loss: 2.605974E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.580 | TFLOPs: 43.34 | +7: iteration 82580/ 115203 | consumed samples: 21140480 | consumed tokens: 43295703040 | elapsed time per iteration (s): 0.56 | learning rate: 5.396E-05 | global batch size: 256 | lm loss: 2.611459E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.713 | TFLOPs: 43.45 | +7: iteration 82590/ 115203 | consumed samples: 21143040 | consumed tokens: 43300945920 | elapsed time per iteration (s): 0.57 | learning rate: 5.394E-05 | global batch size: 256 | lm loss: 2.619883E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.817 | TFLOPs: 42.89 | +7: iteration 82600/ 115203 | consumed samples: 21145600 | consumed tokens: 43306188800 | elapsed time per iteration (s): 0.55 | learning rate: 5.392E-05 | global batch size: 256 | lm loss: 2.610768E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.333 | TFLOPs: 43.98 | +7: iteration 82610/ 115203 | consumed samples: 21148160 | consumed tokens: 43311431680 | elapsed time per iteration (s): 0.56 | learning rate: 5.390E-05 | global batch size: 256 | lm loss: 2.614516E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.987 | TFLOPs: 43.38 | +7: iteration 82620/ 115203 | consumed samples: 21150720 | consumed tokens: 43316674560 | elapsed time per iteration (s): 0.56 | learning rate: 5.388E-05 | global batch size: 256 | lm loss: 2.606516E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.007 | TFLOPs: 43.38 | +7: iteration 82630/ 115203 | consumed samples: 21153280 | consumed tokens: 43321917440 | elapsed time per iteration (s): 0.56 | learning rate: 5.386E-05 | global batch size: 256 | lm loss: 2.619628E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.224 | TFLOPs: 43.21 | +7: iteration 82640/ 115203 | consumed samples: 21155840 | consumed tokens: 43327160320 | elapsed time per iteration (s): 0.57 | learning rate: 5.384E-05 | global batch size: 256 | lm loss: 2.615745E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.603 | TFLOPs: 42.67 | +7: iteration 82650/ 115203 | consumed samples: 21158400 | consumed tokens: 43332403200 | elapsed time per iteration (s): 0.57 | learning rate: 5.382E-05 | global batch size: 256 | lm loss: 2.605351E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.514 | TFLOPs: 42.86 | +7: iteration 82660/ 115203 | consumed samples: 21160960 | consumed tokens: 43337646080 | elapsed time per iteration (s): 0.56 | learning rate: 5.380E-05 | global batch size: 256 | lm loss: 2.624864E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.205 | TFLOPs: 43.40 | +7: iteration 82670/ 115203 | consumed samples: 21163520 | consumed tokens: 43342888960 | elapsed time per iteration (s): 0.56 | learning rate: 5.378E-05 | global batch size: 256 | lm loss: 2.609860E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.509 | TFLOPs: 43.52 | +7: iteration 82680/ 115203 | consumed samples: 21166080 | consumed tokens: 43348131840 | elapsed time per iteration (s): 0.56 | learning rate: 5.377E-05 | global batch size: 256 | lm loss: 2.621554E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.436 | TFLOPs: 43.42 | +7: iteration 82690/ 115203 | consumed samples: 21168640 | consumed tokens: 43353374720 | elapsed time per iteration (s): 0.56 | learning rate: 5.375E-05 | global batch size: 256 | lm loss: 2.614299E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.505 | TFLOPs: 43.52 | +7: iteration 82700/ 115203 | consumed samples: 21171200 | consumed tokens: 43358617600 | elapsed time per iteration (s): 0.56 | learning rate: 5.373E-05 | global batch size: 256 | lm loss: 2.608973E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.185 | TFLOPs: 43.97 | +7: iteration 82710/ 115203 | consumed samples: 21173760 | consumed tokens: 43363860480 | elapsed time per iteration (s): 0.56 | learning rate: 5.371E-05 | global batch size: 256 | lm loss: 2.606254E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 82720/ 115203 | consumed samples: 21176320 | consumed tokens: 43369103360 | elapsed time per iteration (s): 0.56 | learning rate: 5.369E-05 | global batch size: 256 | lm loss: 2.598396E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.881 | TFLOPs: 43.84 | +7: iteration 82730/ 115203 | consumed samples: 21178880 | consumed tokens: 43374346240 | elapsed time per iteration (s): 0.55 | learning rate: 5.367E-05 | global batch size: 256 | lm loss: 2.609132E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 82740/ 115203 | consumed samples: 21181440 | consumed tokens: 43379589120 | elapsed time per iteration (s): 0.56 | learning rate: 5.365E-05 | global batch size: 256 | lm loss: 2.614203E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.231 | TFLOPs: 43.97 | +7: iteration 82750/ 115203 | consumed samples: 21184000 | consumed tokens: 43384832000 | elapsed time per iteration (s): 0.56 | learning rate: 5.363E-05 | global batch size: 256 | lm loss: 2.594834E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.829 | TFLOPs: 43.65 | +7: iteration 82760/ 115203 | consumed samples: 21186560 | consumed tokens: 43390074880 | elapsed time per iteration (s): 0.55 | learning rate: 5.361E-05 | global batch size: 256 | lm loss: 2.605329E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 82770/ 115203 | consumed samples: 21189120 | consumed tokens: 43395317760 | elapsed time per iteration (s): 0.56 | learning rate: 5.359E-05 | global batch size: 256 | lm loss: 2.613801E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.817 | TFLOPs: 43.46 | +7: iteration 82780/ 115203 | consumed samples: 21191680 | consumed tokens: 43400560640 | elapsed time per iteration (s): 0.56 | learning rate: 5.357E-05 | global batch size: 256 | lm loss: 2.615462E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.500 | TFLOPs: 43.81 | +7: iteration 82790/ 115203 | consumed samples: 21194240 | consumed tokens: 43405803520 | elapsed time per iteration (s): 0.57 | learning rate: 5.355E-05 | global batch size: 256 | lm loss: 2.605927E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.448 | TFLOPs: 42.47 | +7: iteration 82800/ 115203 | consumed samples: 21196800 | consumed tokens: 43411046400 | elapsed time per iteration (s): 0.56 | learning rate: 5.353E-05 | global batch size: 256 | lm loss: 2.608832E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.864 | TFLOPs: 43.46 | +7: iteration 82810/ 115203 | consumed samples: 21199360 | consumed tokens: 43416289280 | elapsed time per iteration (s): 0.56 | learning rate: 5.351E-05 | global batch size: 256 | lm loss: 2.615204E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.453 | TFLOPs: 43.90 | +7: iteration 82820/ 115203 | consumed samples: 21201920 | consumed tokens: 43421532160 | elapsed time per iteration (s): 0.56 | learning rate: 5.349E-05 | global batch size: 256 | lm loss: 2.598750E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.104 | TFLOPs: 43.96 | +7: iteration 82830/ 115203 | consumed samples: 21204480 | consumed tokens: 43426775040 | elapsed time per iteration (s): 0.56 | learning rate: 5.348E-05 | global batch size: 256 | lm loss: 2.596920E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.933 | TFLOPs: 43.75 | +7: iteration 82840/ 115203 | consumed samples: 21207040 | consumed tokens: 43432017920 | elapsed time per iteration (s): 0.57 | learning rate: 5.346E-05 | global batch size: 256 | lm loss: 2.613459E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.019 | TFLOPs: 42.81 | +7: iteration 82850/ 115203 | consumed samples: 21209600 | consumed tokens: 43437260800 | elapsed time per iteration (s): 0.57 | learning rate: 5.344E-05 | global batch size: 256 | lm loss: 2.610652E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.218 | TFLOPs: 43.11 | +7: iteration 82860/ 115203 | consumed samples: 21212160 | consumed tokens: 43442503680 | elapsed time per iteration (s): 0.57 | learning rate: 5.342E-05 | global batch size: 256 | lm loss: 2.603029E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.863 | TFLOPs: 42.70 | +7: iteration 82870/ 115203 | consumed samples: 21214720 | consumed tokens: 43447746560 | elapsed time per iteration (s): 0.56 | learning rate: 5.340E-05 | global batch size: 256 | lm loss: 2.615662E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.248 | TFLOPs: 43.98 | +7: iteration 82880/ 115203 | consumed samples: 21217280 | consumed tokens: 43452989440 | elapsed time per iteration (s): 0.56 | learning rate: 5.338E-05 | global batch size: 256 | lm loss: 2.610969E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.133 | TFLOPs: 43.96 | +7: iteration 82890/ 115203 | consumed samples: 21219840 | consumed tokens: 43458232320 | elapsed time per iteration (s): 0.56 | learning rate: 5.336E-05 | global batch size: 256 | lm loss: 2.613976E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.190 | TFLOPs: 43.97 | +7: iteration 82900/ 115203 | consumed samples: 21222400 | consumed tokens: 43463475200 | elapsed time per iteration (s): 0.57 | learning rate: 5.334E-05 | global batch size: 256 | lm loss: 2.614950E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.714 | TFLOPs: 43.16 | +7: iteration 82910/ 115203 | consumed samples: 21224960 | consumed tokens: 43468718080 | elapsed time per iteration (s): 0.56 | learning rate: 5.332E-05 | global batch size: 256 | lm loss: 2.619178E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.816 | TFLOPs: 43.55 | +7: iteration 82920/ 115203 | consumed samples: 21227520 | consumed tokens: 43473960960 | elapsed time per iteration (s): 0.56 | learning rate: 5.330E-05 | global batch size: 256 | lm loss: 2.618644E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.060 | TFLOPs: 43.48 | +7: iteration 82930/ 115203 | consumed samples: 21230080 | consumed tokens: 43479203840 | elapsed time per iteration (s): 0.56 | learning rate: 5.328E-05 | global batch size: 256 | lm loss: 2.617114E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.281 | TFLOPs: 43.31 | +7: iteration 82940/ 115203 | consumed samples: 21232640 | consumed tokens: 43484446720 | elapsed time per iteration (s): 0.56 | learning rate: 5.326E-05 | global batch size: 256 | lm loss: 2.616812E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.691 | TFLOPs: 43.45 | +7: iteration 82950/ 115203 | consumed samples: 21235200 | consumed tokens: 43489689600 | elapsed time per iteration (s): 0.57 | learning rate: 5.324E-05 | global batch size: 256 | lm loss: 2.608829E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.887 | TFLOPs: 42.70 | +7: iteration 82960/ 115203 | consumed samples: 21237760 | consumed tokens: 43494932480 | elapsed time per iteration (s): 0.58 | learning rate: 5.322E-05 | global batch size: 256 | lm loss: 2.615969E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.210 | TFLOPs: 42.16 | +7: iteration 82970/ 115203 | consumed samples: 21240320 | consumed tokens: 43500175360 | elapsed time per iteration (s): 0.55 | learning rate: 5.321E-05 | global batch size: 256 | lm loss: 2.617895E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 82980/ 115203 | consumed samples: 21242880 | consumed tokens: 43505418240 | elapsed time per iteration (s): 0.56 | learning rate: 5.319E-05 | global batch size: 256 | lm loss: 2.610160E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.993 | TFLOPs: 43.86 | +7: iteration 82990/ 115203 | consumed samples: 21245440 | consumed tokens: 43510661120 | elapsed time per iteration (s): 0.57 | learning rate: 5.317E-05 | global batch size: 256 | lm loss: 2.615392E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.612 | TFLOPs: 42.96 | +7: iteration 83000/ 115203 | consumed samples: 21248000 | consumed tokens: 43515904000 | elapsed time per iteration (s): 0.56 | learning rate: 5.315E-05 | global batch size: 256 | lm loss: 2.612107E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.147 | TFLOPs: 43.39 | +7: iteration 83010/ 115203 | consumed samples: 21250560 | consumed tokens: 43521146880 | elapsed time per iteration (s): 0.58 | learning rate: 5.313E-05 | global batch size: 256 | lm loss: 2.604112E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.674 | TFLOPs: 42.39 | +7: iteration 83020/ 115203 | consumed samples: 21253120 | consumed tokens: 43526389760 | elapsed time per iteration (s): 0.55 | learning rate: 5.311E-05 | global batch size: 256 | lm loss: 2.618163E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.337 | TFLOPs: 43.98 | +7: iteration 83030/ 115203 | consumed samples: 21255680 | consumed tokens: 43531632640 | elapsed time per iteration (s): 0.56 | learning rate: 5.309E-05 | global batch size: 256 | lm loss: 2.620398E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.621 | TFLOPs: 43.25 | +7: iteration 83040/ 115203 | consumed samples: 21258240 | consumed tokens: 43536875520 | elapsed time per iteration (s): 0.58 | learning rate: 5.307E-05 | global batch size: 256 | lm loss: 2.615329E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.242 | TFLOPs: 42.16 | +7: iteration 83050/ 115203 | consumed samples: 21260800 | consumed tokens: 43542118400 | elapsed time per iteration (s): 0.57 | learning rate: 5.305E-05 | global batch size: 256 | lm loss: 2.618656E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.693 | TFLOPs: 42.68 | +7: iteration 83060/ 115203 | consumed samples: 21263360 | consumed tokens: 43547361280 | elapsed time per iteration (s): 0.56 | learning rate: 5.303E-05 | global batch size: 256 | lm loss: 2.610337E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.059 | TFLOPs: 43.67 | +7: iteration 83070/ 115203 | consumed samples: 21265920 | consumed tokens: 43552604160 | elapsed time per iteration (s): 0.57 | learning rate: 5.301E-05 | global batch size: 256 | lm loss: 2.617097E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.141 | TFLOPs: 43.01 | +7: iteration 83080/ 115203 | consumed samples: 21268480 | consumed tokens: 43557847040 | elapsed time per iteration (s): 0.57 | learning rate: 5.299E-05 | global batch size: 256 | lm loss: 2.608696E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.641 | TFLOPs: 43.15 | +7: iteration 83090/ 115203 | consumed samples: 21271040 | consumed tokens: 43563089920 | elapsed time per iteration (s): 0.56 | learning rate: 5.298E-05 | global batch size: 256 | lm loss: 2.608487E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.811 | TFLOPs: 43.84 | +7: iteration 83100/ 115203 | consumed samples: 21273600 | consumed tokens: 43568332800 | elapsed time per iteration (s): 0.56 | learning rate: 5.296E-05 | global batch size: 256 | lm loss: 2.598813E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.401 | TFLOPs: 43.61 | +7: iteration 83110/ 115203 | consumed samples: 21276160 | consumed tokens: 43573575680 | elapsed time per iteration (s): 0.56 | learning rate: 5.294E-05 | global batch size: 256 | lm loss: 2.604366E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.810 | TFLOPs: 43.27 | +7: iteration 83120/ 115203 | consumed samples: 21278720 | consumed tokens: 43578818560 | elapsed time per iteration (s): 0.58 | learning rate: 5.292E-05 | global batch size: 256 | lm loss: 2.607889E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.932 | TFLOPs: 42.13 | +7: iteration 83130/ 115203 | consumed samples: 21281280 | consumed tokens: 43584061440 | elapsed time per iteration (s): 0.57 | learning rate: 5.290E-05 | global batch size: 256 | lm loss: 2.604373E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.874 | TFLOPs: 43.08 | +7: iteration 83140/ 115203 | consumed samples: 21283840 | consumed tokens: 43589304320 | elapsed time per iteration (s): 0.57 | learning rate: 5.288E-05 | global batch size: 256 | lm loss: 2.618382E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.497 | TFLOPs: 42.85 | +7: iteration 83150/ 115203 | consumed samples: 21286400 | consumed tokens: 43594547200 | elapsed time per iteration (s): 0.56 | learning rate: 5.286E-05 | global batch size: 256 | lm loss: 2.624533E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.849 | TFLOPs: 43.27 | +7: iteration 83160/ 115203 | consumed samples: 21288960 | consumed tokens: 43599790080 | elapsed time per iteration (s): 0.56 | learning rate: 5.284E-05 | global batch size: 256 | lm loss: 2.605887E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.561 | TFLOPs: 43.43 | +7: iteration 83170/ 115203 | consumed samples: 21291520 | consumed tokens: 43605032960 | elapsed time per iteration (s): 0.60 | learning rate: 5.282E-05 | global batch size: 256 | lm loss: 2.616365E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.913 | TFLOPs: 40.80 | +7: iteration 83180/ 115203 | consumed samples: 21294080 | consumed tokens: 43610275840 | elapsed time per iteration (s): 0.57 | learning rate: 5.280E-05 | global batch size: 256 | lm loss: 2.605804E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.732 | TFLOPs: 42.88 | +7: iteration 83190/ 115203 | consumed samples: 21296640 | consumed tokens: 43615518720 | elapsed time per iteration (s): 0.56 | learning rate: 5.278E-05 | global batch size: 256 | lm loss: 2.617857E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.671 | TFLOPs: 43.44 | +7: iteration 83200/ 115203 | consumed samples: 21299200 | consumed tokens: 43620761600 | elapsed time per iteration (s): 0.58 | learning rate: 5.276E-05 | global batch size: 256 | lm loss: 2.605464E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.762 | TFLOPs: 42.12 | +7: iteration 83210/ 115203 | consumed samples: 21301760 | consumed tokens: 43626004480 | elapsed time per iteration (s): 0.57 | learning rate: 5.275E-05 | global batch size: 256 | lm loss: 2.602907E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.465 | TFLOPs: 42.66 | +7: iteration 83220/ 115203 | consumed samples: 21304320 | consumed tokens: 43631247360 | elapsed time per iteration (s): 0.56 | learning rate: 5.273E-05 | global batch size: 256 | lm loss: 2.618458E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.144 | TFLOPs: 43.87 | +7: iteration 83230/ 115203 | consumed samples: 21306880 | consumed tokens: 43636490240 | elapsed time per iteration (s): 0.57 | learning rate: 5.271E-05 | global batch size: 256 | lm loss: 2.608787E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.880 | TFLOPs: 42.89 | +7: iteration 83240/ 115203 | consumed samples: 21309440 | consumed tokens: 43641733120 | elapsed time per iteration (s): 0.56 | learning rate: 5.269E-05 | global batch size: 256 | lm loss: 2.605465E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.432 | TFLOPs: 43.61 | +7: iteration 83250/ 115203 | consumed samples: 21312000 | consumed tokens: 43646976000 | elapsed time per iteration (s): 0.56 | learning rate: 5.267E-05 | global batch size: 256 | lm loss: 2.608833E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.672 | TFLOPs: 43.92 | +7: iteration 83260/ 115203 | consumed samples: 21314560 | consumed tokens: 43652218880 | elapsed time per iteration (s): 0.56 | learning rate: 5.265E-05 | global batch size: 256 | lm loss: 2.605730E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.141 | TFLOPs: 43.68 | +7: iteration 83270/ 115203 | consumed samples: 21317120 | consumed tokens: 43657461760 | elapsed time per iteration (s): 0.57 | learning rate: 5.263E-05 | global batch size: 256 | lm loss: 2.609441E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.817 | TFLOPs: 42.98 | +7: iteration 83280/ 115203 | consumed samples: 21319680 | consumed tokens: 43662704640 | elapsed time per iteration (s): 0.56 | learning rate: 5.261E-05 | global batch size: 256 | lm loss: 2.613425E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.845 | TFLOPs: 43.46 | +7: iteration 83290/ 115203 | consumed samples: 21322240 | consumed tokens: 43667947520 | elapsed time per iteration (s): 0.56 | learning rate: 5.259E-05 | global batch size: 256 | lm loss: 2.614450E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.165 | TFLOPs: 43.20 | +7: iteration 83300/ 115203 | consumed samples: 21324800 | consumed tokens: 43673190400 | elapsed time per iteration (s): 0.56 | learning rate: 5.257E-05 | global batch size: 256 | lm loss: 2.619029E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.428 | TFLOPs: 43.42 | +7: iteration 83310/ 115203 | consumed samples: 21327360 | consumed tokens: 43678433280 | elapsed time per iteration (s): 0.57 | learning rate: 5.255E-05 | global batch size: 256 | lm loss: 2.600370E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.512 | TFLOPs: 43.14 | +7: iteration 83320/ 115203 | consumed samples: 21329920 | consumed tokens: 43683676160 | elapsed time per iteration (s): 0.56 | learning rate: 5.254E-05 | global batch size: 256 | lm loss: 2.617314E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.897 | TFLOPs: 43.85 | +7: iteration 83330/ 115203 | consumed samples: 21332480 | consumed tokens: 43688919040 | elapsed time per iteration (s): 0.56 | learning rate: 5.252E-05 | global batch size: 256 | lm loss: 2.610975E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.363 | TFLOPs: 43.22 | +7: iteration 83340/ 115203 | consumed samples: 21335040 | consumed tokens: 43694161920 | elapsed time per iteration (s): 0.56 | learning rate: 5.250E-05 | global batch size: 256 | lm loss: 2.621065E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.990 | TFLOPs: 43.57 | +7: iteration 83350/ 115203 | consumed samples: 21337600 | consumed tokens: 43699404800 | elapsed time per iteration (s): 0.57 | learning rate: 5.248E-05 | global batch size: 256 | lm loss: 2.599787E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.702 | TFLOPs: 42.49 | +7: iteration 83360/ 115203 | consumed samples: 21340160 | consumed tokens: 43704647680 | elapsed time per iteration (s): 0.56 | learning rate: 5.246E-05 | global batch size: 256 | lm loss: 2.613872E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.957 | TFLOPs: 43.57 | +7: iteration 83370/ 115203 | consumed samples: 21342720 | consumed tokens: 43709890560 | elapsed time per iteration (s): 0.56 | learning rate: 5.244E-05 | global batch size: 256 | lm loss: 2.601612E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.190 | TFLOPs: 43.68 | +7: iteration 83380/ 115203 | consumed samples: 21345280 | consumed tokens: 43715133440 | elapsed time per iteration (s): 0.58 | learning rate: 5.242E-05 | global batch size: 256 | lm loss: 2.617435E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.073 | TFLOPs: 42.05 | +7: iteration 83390/ 115203 | consumed samples: 21347840 | consumed tokens: 43720376320 | elapsed time per iteration (s): 0.57 | learning rate: 5.240E-05 | global batch size: 256 | lm loss: 2.607989E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.122 | TFLOPs: 42.82 | +7: iteration 83400/ 115203 | consumed samples: 21350400 | consumed tokens: 43725619200 | elapsed time per iteration (s): 0.57 | learning rate: 5.238E-05 | global batch size: 256 | lm loss: 2.607473E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.639 | TFLOPs: 43.06 | +7: iteration 83410/ 115203 | consumed samples: 21352960 | consumed tokens: 43730862080 | elapsed time per iteration (s): 0.56 | learning rate: 5.236E-05 | global batch size: 256 | lm loss: 2.612584E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.104 | TFLOPs: 43.96 | +7: iteration 83420/ 115203 | consumed samples: 21355520 | consumed tokens: 43736104960 | elapsed time per iteration (s): 0.57 | learning rate: 5.234E-05 | global batch size: 256 | lm loss: 2.596641E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.358 | TFLOPs: 43.03 | +7: iteration 83430/ 115203 | consumed samples: 21358080 | consumed tokens: 43741347840 | elapsed time per iteration (s): 0.57 | learning rate: 5.233E-05 | global batch size: 256 | lm loss: 2.615414E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.361 | TFLOPs: 42.75 | +7: iteration 83440/ 115203 | consumed samples: 21360640 | consumed tokens: 43746590720 | elapsed time per iteration (s): 0.56 | learning rate: 5.231E-05 | global batch size: 256 | lm loss: 2.608022E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.121 | TFLOPs: 43.30 | +7: iteration 83450/ 115203 | consumed samples: 21363200 | consumed tokens: 43751833600 | elapsed time per iteration (s): 0.56 | learning rate: 5.229E-05 | global batch size: 256 | lm loss: 2.605236E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 83460/ 115203 | consumed samples: 21365760 | consumed tokens: 43757076480 | elapsed time per iteration (s): 0.56 | learning rate: 5.227E-05 | global batch size: 256 | lm loss: 2.607497E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.928 | TFLOPs: 43.28 | +7: iteration 83470/ 115203 | consumed samples: 21368320 | consumed tokens: 43762319360 | elapsed time per iteration (s): 0.57 | learning rate: 5.225E-05 | global batch size: 256 | lm loss: 2.603840E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.284 | TFLOPs: 42.83 | +7: iteration 83480/ 115203 | consumed samples: 21370880 | consumed tokens: 43767562240 | elapsed time per iteration (s): 0.56 | learning rate: 5.223E-05 | global batch size: 256 | lm loss: 2.608340E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.454 | TFLOPs: 43.42 | +7: iteration 83490/ 115203 | consumed samples: 21373440 | consumed tokens: 43772805120 | elapsed time per iteration (s): 0.56 | learning rate: 5.221E-05 | global batch size: 256 | lm loss: 2.593836E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.793 | TFLOPs: 43.74 | +7: iteration 83500/ 115203 | consumed samples: 21376000 | consumed tokens: 43778048000 | elapsed time per iteration (s): 0.56 | learning rate: 5.219E-05 | global batch size: 256 | lm loss: 2.615669E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.828 | TFLOPs: 43.46 | +7: iteration 83510/ 115203 | consumed samples: 21378560 | consumed tokens: 43783290880 | elapsed time per iteration (s): 0.58 | learning rate: 5.217E-05 | global batch size: 256 | lm loss: 2.616257E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.359 | TFLOPs: 42.08 | +7: iteration 83520/ 115203 | consumed samples: 21381120 | consumed tokens: 43788533760 | elapsed time per iteration (s): 0.56 | learning rate: 5.215E-05 | global batch size: 256 | lm loss: 2.592096E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.235 | TFLOPs: 43.59 | +7: iteration 83530/ 115203 | consumed samples: 21383680 | consumed tokens: 43793776640 | elapsed time per iteration (s): 0.57 | learning rate: 5.214E-05 | global batch size: 256 | lm loss: 2.606412E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.063 | TFLOPs: 42.53 | +7: iteration 83540/ 115203 | consumed samples: 21386240 | consumed tokens: 43799019520 | elapsed time per iteration (s): 0.57 | learning rate: 5.212E-05 | global batch size: 256 | lm loss: 2.603704E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.391 | TFLOPs: 42.84 | +7: iteration 83550/ 115203 | consumed samples: 21388800 | consumed tokens: 43804262400 | elapsed time per iteration (s): 0.56 | learning rate: 5.210E-05 | global batch size: 256 | lm loss: 2.611617E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.166 | TFLOPs: 43.59 | +7: iteration 83560/ 115203 | consumed samples: 21391360 | consumed tokens: 43809505280 | elapsed time per iteration (s): 0.56 | learning rate: 5.208E-05 | global batch size: 256 | lm loss: 2.612379E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.508 | TFLOPs: 43.62 | +7: iteration 83570/ 115203 | consumed samples: 21393920 | consumed tokens: 43814748160 | elapsed time per iteration (s): 0.56 | learning rate: 5.206E-05 | global batch size: 256 | lm loss: 2.592145E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.659 | TFLOPs: 43.35 | +7: iteration 83580/ 115203 | consumed samples: 21396480 | consumed tokens: 43819991040 | elapsed time per iteration (s): 0.57 | learning rate: 5.204E-05 | global batch size: 256 | lm loss: 2.603353E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.568 | TFLOPs: 43.05 | +7: iteration 83590/ 115203 | consumed samples: 21399040 | consumed tokens: 43825233920 | elapsed time per iteration (s): 0.56 | learning rate: 5.202E-05 | global batch size: 256 | lm loss: 2.614610E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.386 | TFLOPs: 43.70 | +7: iteration 83600/ 115203 | consumed samples: 21401600 | consumed tokens: 43830476800 | elapsed time per iteration (s): 0.57 | learning rate: 5.200E-05 | global batch size: 256 | lm loss: 2.609170E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.745 | TFLOPs: 42.88 | +7: iteration 83610/ 115203 | consumed samples: 21404160 | consumed tokens: 43835719680 | elapsed time per iteration (s): 0.56 | learning rate: 5.198E-05 | global batch size: 256 | lm loss: 2.611830E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.993 | TFLOPs: 43.47 | +7: iteration 83620/ 115203 | consumed samples: 21406720 | consumed tokens: 43840962560 | elapsed time per iteration (s): 0.56 | learning rate: 5.196E-05 | global batch size: 256 | lm loss: 2.607723E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.878 | TFLOPs: 43.37 | +7: iteration 83630/ 115203 | consumed samples: 21409280 | consumed tokens: 43846205440 | elapsed time per iteration (s): 0.56 | learning rate: 5.195E-05 | global batch size: 256 | lm loss: 2.609186E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.339 | TFLOPs: 43.60 | +7: iteration 83640/ 115203 | consumed samples: 21411840 | consumed tokens: 43851448320 | elapsed time per iteration (s): 0.56 | learning rate: 5.193E-05 | global batch size: 256 | lm loss: 2.606914E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.078 | TFLOPs: 43.48 | +7: iteration 83650/ 115203 | consumed samples: 21414400 | consumed tokens: 43856691200 | elapsed time per iteration (s): 0.56 | learning rate: 5.191E-05 | global batch size: 256 | lm loss: 2.610284E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.430 | TFLOPs: 43.52 | +7: iteration 83660/ 115203 | consumed samples: 21416960 | consumed tokens: 43861934080 | elapsed time per iteration (s): 0.56 | learning rate: 5.189E-05 | global batch size: 256 | lm loss: 2.596968E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.227 | TFLOPs: 43.97 | +7: iteration 83670/ 115203 | consumed samples: 21419520 | consumed tokens: 43867176960 | elapsed time per iteration (s): 0.57 | learning rate: 5.187E-05 | global batch size: 256 | lm loss: 2.609558E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.767 | TFLOPs: 42.59 | +7: iteration 83680/ 115203 | consumed samples: 21422080 | consumed tokens: 43872419840 | elapsed time per iteration (s): 0.56 | learning rate: 5.185E-05 | global batch size: 256 | lm loss: 2.607077E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.502 | TFLOPs: 43.43 | +7: iteration 83690/ 115203 | consumed samples: 21424640 | consumed tokens: 43877662720 | elapsed time per iteration (s): 0.56 | learning rate: 5.183E-05 | global batch size: 256 | lm loss: 2.599672E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.418 | TFLOPs: 43.71 | +7: iteration 83700/ 115203 | consumed samples: 21427200 | consumed tokens: 43882905600 | elapsed time per iteration (s): 0.56 | learning rate: 5.181E-05 | global batch size: 256 | lm loss: 2.605164E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.771 | TFLOPs: 43.26 | +7: iteration 83710/ 115203 | consumed samples: 21429760 | consumed tokens: 43888148480 | elapsed time per iteration (s): 0.56 | learning rate: 5.179E-05 | global batch size: 256 | lm loss: 2.604486E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.418 | TFLOPs: 43.71 | +7: iteration 83720/ 115203 | consumed samples: 21432320 | consumed tokens: 43893391360 | elapsed time per iteration (s): 0.57 | learning rate: 5.178E-05 | global batch size: 256 | lm loss: 2.610832E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.506 | TFLOPs: 42.66 | +7: iteration 83730/ 115203 | consumed samples: 21434880 | consumed tokens: 43898634240 | elapsed time per iteration (s): 0.55 | learning rate: 5.176E-05 | global batch size: 256 | lm loss: 2.610289E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.452 | TFLOPs: 43.99 | +7: iteration 83740/ 115203 | consumed samples: 21437440 | consumed tokens: 43903877120 | elapsed time per iteration (s): 0.57 | learning rate: 5.174E-05 | global batch size: 256 | lm loss: 2.604133E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.513 | TFLOPs: 42.86 | +7: iteration 83750/ 115203 | consumed samples: 21440000 | consumed tokens: 43909120000 | elapsed time per iteration (s): 0.56 | learning rate: 5.172E-05 | global batch size: 256 | lm loss: 2.611587E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.989 | TFLOPs: 43.95 | +7: iteration 83760/ 115203 | consumed samples: 21442560 | consumed tokens: 43914362880 | elapsed time per iteration (s): 0.55 | learning rate: 5.170E-05 | global batch size: 256 | lm loss: 2.610237E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.369 | TFLOPs: 43.99 | +7: iteration 83770/ 115203 | consumed samples: 21445120 | consumed tokens: 43919605760 | elapsed time per iteration (s): 0.55 | learning rate: 5.168E-05 | global batch size: 256 | lm loss: 2.619579E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.335 | TFLOPs: 43.98 | +7: iteration 83780/ 115203 | consumed samples: 21447680 | consumed tokens: 43924848640 | elapsed time per iteration (s): 0.55 | learning rate: 5.166E-05 | global batch size: 256 | lm loss: 2.606841E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.440 | TFLOPs: 43.99 | +7: iteration 83790/ 115203 | consumed samples: 21450240 | consumed tokens: 43930091520 | elapsed time per iteration (s): 0.55 | learning rate: 5.164E-05 | global batch size: 256 | lm loss: 2.623348E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.414 | TFLOPs: 43.99 | +7: iteration 83800/ 115203 | consumed samples: 21452800 | consumed tokens: 43935334400 | elapsed time per iteration (s): 0.56 | learning rate: 5.162E-05 | global batch size: 256 | lm loss: 2.608933E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 83810/ 115203 | consumed samples: 21455360 | consumed tokens: 43940577280 | elapsed time per iteration (s): 0.56 | learning rate: 5.161E-05 | global batch size: 256 | lm loss: 2.599415E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.234 | TFLOPs: 43.97 | +7: iteration 83820/ 115203 | consumed samples: 21457920 | consumed tokens: 43945820160 | elapsed time per iteration (s): 0.56 | learning rate: 5.159E-05 | global batch size: 256 | lm loss: 2.604428E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.360 | TFLOPs: 43.60 | +7: iteration 83830/ 115203 | consumed samples: 21460480 | consumed tokens: 43951063040 | elapsed time per iteration (s): 0.55 | learning rate: 5.157E-05 | global batch size: 256 | lm loss: 2.601139E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 83840/ 115203 | consumed samples: 21463040 | consumed tokens: 43956305920 | elapsed time per iteration (s): 0.56 | learning rate: 5.155E-05 | global batch size: 256 | lm loss: 2.610006E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.278 | TFLOPs: 43.79 | +7: iteration 83850/ 115203 | consumed samples: 21465600 | consumed tokens: 43961548800 | elapsed time per iteration (s): 0.57 | learning rate: 5.153E-05 | global batch size: 256 | lm loss: 2.601687E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.207 | TFLOPs: 43.11 | +7: iteration 83860/ 115203 | consumed samples: 21468160 | consumed tokens: 43966791680 | elapsed time per iteration (s): 0.55 | learning rate: 5.151E-05 | global batch size: 256 | lm loss: 2.615820E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.491 | TFLOPs: 44.00 | +7: iteration 83870/ 115203 | consumed samples: 21470720 | consumed tokens: 43972034560 | elapsed time per iteration (s): 0.55 | learning rate: 5.149E-05 | global batch size: 256 | lm loss: 2.603776E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.580 | TFLOPs: 44.01 | +7: iteration 83880/ 115203 | consumed samples: 21473280 | consumed tokens: 43977277440 | elapsed time per iteration (s): 0.57 | learning rate: 5.147E-05 | global batch size: 256 | lm loss: 2.605318E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.848 | TFLOPs: 42.89 | +7: iteration 83890/ 115203 | consumed samples: 21475840 | consumed tokens: 43982520320 | elapsed time per iteration (s): 0.55 | learning rate: 5.145E-05 | global batch size: 256 | lm loss: 2.604560E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.348 | TFLOPs: 43.98 | +7: iteration 83900/ 115203 | consumed samples: 21478400 | consumed tokens: 43987763200 | elapsed time per iteration (s): 0.55 | learning rate: 5.144E-05 | global batch size: 256 | lm loss: 2.595801E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 83910/ 115203 | consumed samples: 21480960 | consumed tokens: 43993006080 | elapsed time per iteration (s): 0.55 | learning rate: 5.142E-05 | global batch size: 256 | lm loss: 2.599497E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.461 | TFLOPs: 44.00 | +7: iteration 83920/ 115203 | consumed samples: 21483520 | consumed tokens: 43998248960 | elapsed time per iteration (s): 0.55 | learning rate: 5.140E-05 | global batch size: 256 | lm loss: 2.615669E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.285 | TFLOPs: 43.98 | +7: iteration 83930/ 115203 | consumed samples: 21486080 | consumed tokens: 44003491840 | elapsed time per iteration (s): 0.56 | learning rate: 5.138E-05 | global batch size: 256 | lm loss: 2.609631E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.750 | TFLOPs: 43.26 | +7: iteration 83940/ 115203 | consumed samples: 21488640 | consumed tokens: 44008734720 | elapsed time per iteration (s): 0.56 | learning rate: 5.136E-05 | global batch size: 256 | lm loss: 2.594003E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.428 | TFLOPs: 43.71 | +7: iteration 83950/ 115203 | consumed samples: 21491200 | consumed tokens: 44013977600 | elapsed time per iteration (s): 0.56 | learning rate: 5.134E-05 | global batch size: 256 | lm loss: 2.602850E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.109 | TFLOPs: 43.96 | +7: iteration 83960/ 115203 | consumed samples: 21493760 | consumed tokens: 44019220480 | elapsed time per iteration (s): 0.55 | learning rate: 5.132E-05 | global batch size: 256 | lm loss: 2.599773E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.452 | TFLOPs: 43.99 | +7: iteration 83970/ 115203 | consumed samples: 21496320 | consumed tokens: 44024463360 | elapsed time per iteration (s): 0.55 | learning rate: 5.130E-05 | global batch size: 256 | lm loss: 2.586216E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.298 | TFLOPs: 43.98 | +7: iteration 83980/ 115203 | consumed samples: 21498880 | consumed tokens: 44029706240 | elapsed time per iteration (s): 0.56 | learning rate: 5.129E-05 | global batch size: 256 | lm loss: 2.609517E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.112 | TFLOPs: 43.58 | +7: iteration 83990/ 115203 | consumed samples: 21501440 | consumed tokens: 44034949120 | elapsed time per iteration (s): 0.55 | learning rate: 5.127E-05 | global batch size: 256 | lm loss: 2.610861E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.459 | TFLOPs: 44.00 | +0: [2023-03-17 02:00:12,940] [INFO] [logging.py:68:log_dist] [Rank 0] step=84000, skipped=0, lr=[5.124789271253415e-05, 5.124789271253415e-05, 5.124789271253415e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 84000/ 115203 | consumed samples: 21504000 | consumed tokens: 44040192000 | elapsed time per iteration (s): 0.55 | learning rate: 5.125E-05 | global batch size: 256 | lm loss: 2.602885E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.282 | TFLOPs: 43.98 | +0: steps: 84000 loss: 2.5909 iter time (s): 0.560 samples/sec: 456.900 +7: iteration 84010/ 115203 | consumed samples: 21506560 | consumed tokens: 44045434880 | elapsed time per iteration (s): 0.56 | learning rate: 5.123E-05 | global batch size: 256 | lm loss: 2.615419E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.578 | TFLOPs: 43.91 | +7: iteration 84020/ 115203 | consumed samples: 21509120 | consumed tokens: 44050677760 | elapsed time per iteration (s): 0.55 | learning rate: 5.121E-05 | global batch size: 256 | lm loss: 2.601047E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.314 | TFLOPs: 43.98 | +7: iteration 84030/ 115203 | consumed samples: 21511680 | consumed tokens: 44055920640 | elapsed time per iteration (s): 0.56 | learning rate: 5.119E-05 | global batch size: 256 | lm loss: 2.600853E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.582 | TFLOPs: 43.24 | +7: iteration 84040/ 115203 | consumed samples: 21514240 | consumed tokens: 44061163520 | elapsed time per iteration (s): 0.55 | learning rate: 5.117E-05 | global batch size: 256 | lm loss: 2.624570E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 84050/ 115203 | consumed samples: 21516800 | consumed tokens: 44066406400 | elapsed time per iteration (s): 0.56 | learning rate: 5.115E-05 | global batch size: 256 | lm loss: 2.591079E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.266 | TFLOPs: 43.79 | +7: iteration 84060/ 115203 | consumed samples: 21519360 | consumed tokens: 44071649280 | elapsed time per iteration (s): 0.56 | learning rate: 5.114E-05 | global batch size: 256 | lm loss: 2.610764E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.460 | TFLOPs: 43.71 | +7: iteration 84070/ 115203 | consumed samples: 21521920 | consumed tokens: 44076892160 | elapsed time per iteration (s): 0.57 | learning rate: 5.112E-05 | global batch size: 256 | lm loss: 2.596231E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.596 | TFLOPs: 42.77 | +7: iteration 84080/ 115203 | consumed samples: 21524480 | consumed tokens: 44082135040 | elapsed time per iteration (s): 0.58 | learning rate: 5.110E-05 | global batch size: 256 | lm loss: 2.604337E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.438 | TFLOPs: 42.09 | +7: iteration 84090/ 115203 | consumed samples: 21527040 | consumed tokens: 44087377920 | elapsed time per iteration (s): 0.56 | learning rate: 5.108E-05 | global batch size: 256 | lm loss: 2.595209E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.795 | TFLOPs: 43.74 | +7: iteration 84100/ 115203 | consumed samples: 21529600 | consumed tokens: 44092620800 | elapsed time per iteration (s): 0.56 | learning rate: 5.106E-05 | global batch size: 256 | lm loss: 2.602288E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.943 | TFLOPs: 43.37 | +7: iteration 84110/ 115203 | consumed samples: 21532160 | consumed tokens: 44097863680 | elapsed time per iteration (s): 0.56 | learning rate: 5.104E-05 | global batch size: 256 | lm loss: 2.622497E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.932 | TFLOPs: 43.75 | +7: iteration 84120/ 115203 | consumed samples: 21534720 | consumed tokens: 44103106560 | elapsed time per iteration (s): 0.56 | learning rate: 5.102E-05 | global batch size: 256 | lm loss: 2.596447E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.886 | TFLOPs: 43.56 | +7: iteration 84130/ 115203 | consumed samples: 21537280 | consumed tokens: 44108349440 | elapsed time per iteration (s): 0.56 | learning rate: 5.100E-05 | global batch size: 256 | lm loss: 2.600529E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.163 | TFLOPs: 43.49 | +7: iteration 84140/ 115203 | consumed samples: 21539840 | consumed tokens: 44113592320 | elapsed time per iteration (s): 0.56 | learning rate: 5.099E-05 | global batch size: 256 | lm loss: 2.610434E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 84150/ 115203 | consumed samples: 21542400 | consumed tokens: 44118835200 | elapsed time per iteration (s): 0.56 | learning rate: 5.097E-05 | global batch size: 256 | lm loss: 2.590066E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.416 | TFLOPs: 43.61 | +7: iteration 84160/ 115203 | consumed samples: 21544960 | consumed tokens: 44124078080 | elapsed time per iteration (s): 0.56 | learning rate: 5.095E-05 | global batch size: 256 | lm loss: 2.609386E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.154 | TFLOPs: 43.97 | +7: iteration 84170/ 115203 | consumed samples: 21547520 | consumed tokens: 44129320960 | elapsed time per iteration (s): 0.56 | learning rate: 5.093E-05 | global batch size: 256 | lm loss: 2.604121E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.862 | TFLOPs: 43.37 | +7: iteration 84180/ 115203 | consumed samples: 21550080 | consumed tokens: 44134563840 | elapsed time per iteration (s): 0.55 | learning rate: 5.091E-05 | global batch size: 256 | lm loss: 2.602041E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.265 | TFLOPs: 43.98 | +7: iteration 84190/ 115203 | consumed samples: 21552640 | consumed tokens: 44139806720 | elapsed time per iteration (s): 0.56 | learning rate: 5.089E-05 | global batch size: 256 | lm loss: 2.600603E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.222 | TFLOPs: 43.97 | +7: iteration 84200/ 115203 | consumed samples: 21555200 | consumed tokens: 44145049600 | elapsed time per iteration (s): 0.57 | learning rate: 5.087E-05 | global batch size: 256 | lm loss: 2.602130E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.015 | TFLOPs: 43.09 | +7: iteration 84210/ 115203 | consumed samples: 21557760 | consumed tokens: 44150292480 | elapsed time per iteration (s): 0.57 | learning rate: 5.085E-05 | global batch size: 256 | lm loss: 2.617357E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.106 | TFLOPs: 42.72 | +7: iteration 84220/ 115203 | consumed samples: 21560320 | consumed tokens: 44155535360 | elapsed time per iteration (s): 0.56 | learning rate: 5.084E-05 | global batch size: 256 | lm loss: 2.612966E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.973 | TFLOPs: 43.47 | +7: iteration 84230/ 115203 | consumed samples: 21562880 | consumed tokens: 44160778240 | elapsed time per iteration (s): 0.56 | learning rate: 5.082E-05 | global batch size: 256 | lm loss: 2.610916E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.239 | TFLOPs: 43.59 | +7: iteration 84240/ 115203 | consumed samples: 21565440 | consumed tokens: 44166021120 | elapsed time per iteration (s): 0.56 | learning rate: 5.080E-05 | global batch size: 256 | lm loss: 2.600391E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.088 | TFLOPs: 43.96 | +7: iteration 84250/ 115203 | consumed samples: 21568000 | consumed tokens: 44171264000 | elapsed time per iteration (s): 0.56 | learning rate: 5.078E-05 | global batch size: 256 | lm loss: 2.610892E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.568 | TFLOPs: 43.43 | +7: iteration 84260/ 115203 | consumed samples: 21570560 | consumed tokens: 44176506880 | elapsed time per iteration (s): 0.57 | learning rate: 5.076E-05 | global batch size: 256 | lm loss: 2.618191E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.287 | TFLOPs: 43.03 | +7: iteration 84270/ 115203 | consumed samples: 21573120 | consumed tokens: 44181749760 | elapsed time per iteration (s): 0.57 | learning rate: 5.074E-05 | global batch size: 256 | lm loss: 2.624231E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.540 | TFLOPs: 42.95 | +7: iteration 84280/ 115203 | consumed samples: 21575680 | consumed tokens: 44186992640 | elapsed time per iteration (s): 0.58 | learning rate: 5.072E-05 | global batch size: 256 | lm loss: 2.598600E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.367 | TFLOPs: 42.27 | +7: iteration 84290/ 115203 | consumed samples: 21578240 | consumed tokens: 44192235520 | elapsed time per iteration (s): 0.58 | learning rate: 5.071E-05 | global batch size: 256 | lm loss: 2.600651E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.331 | TFLOPs: 42.08 | +7: iteration 84300/ 115203 | consumed samples: 21580800 | consumed tokens: 44197478400 | elapsed time per iteration (s): 0.56 | learning rate: 5.069E-05 | global batch size: 256 | lm loss: 2.606917E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.410 | TFLOPs: 43.70 | +7: iteration 84310/ 115203 | consumed samples: 21583360 | consumed tokens: 44202721280 | elapsed time per iteration (s): 0.57 | learning rate: 5.067E-05 | global batch size: 256 | lm loss: 2.611416E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.837 | TFLOPs: 42.70 | +7: iteration 84320/ 115203 | consumed samples: 21585920 | consumed tokens: 44207964160 | elapsed time per iteration (s): 0.56 | learning rate: 5.065E-05 | global batch size: 256 | lm loss: 2.608636E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.362 | TFLOPs: 43.22 | +7: iteration 84330/ 115203 | consumed samples: 21588480 | consumed tokens: 44213207040 | elapsed time per iteration (s): 0.57 | learning rate: 5.063E-05 | global batch size: 256 | lm loss: 2.603985E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.902 | TFLOPs: 42.89 | +7: iteration 84340/ 115203 | consumed samples: 21591040 | consumed tokens: 44218449920 | elapsed time per iteration (s): 0.57 | learning rate: 5.061E-05 | global batch size: 256 | lm loss: 2.591587E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.490 | TFLOPs: 42.66 | +7: iteration 84350/ 115203 | consumed samples: 21593600 | consumed tokens: 44223692800 | elapsed time per iteration (s): 0.57 | learning rate: 5.059E-05 | global batch size: 256 | lm loss: 2.613710E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.366 | TFLOPs: 42.56 | +7: iteration 84360/ 115203 | consumed samples: 21596160 | consumed tokens: 44228935680 | elapsed time per iteration (s): 0.58 | learning rate: 5.057E-05 | global batch size: 256 | lm loss: 2.605589E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.901 | TFLOPs: 42.23 | +7: iteration 84370/ 115203 | consumed samples: 21598720 | consumed tokens: 44234178560 | elapsed time per iteration (s): 0.58 | learning rate: 5.056E-05 | global batch size: 256 | lm loss: 2.624119E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.839 | TFLOPs: 41.74 | +7: iteration 84380/ 115203 | consumed samples: 21601280 | consumed tokens: 44239421440 | elapsed time per iteration (s): 0.57 | learning rate: 5.054E-05 | global batch size: 256 | lm loss: 2.596889E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.601 | TFLOPs: 42.48 | +7: iteration 84390/ 115203 | consumed samples: 21603840 | consumed tokens: 44244664320 | elapsed time per iteration (s): 0.58 | learning rate: 5.052E-05 | global batch size: 256 | lm loss: 2.604228E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.007 | TFLOPs: 41.85 | +7: iteration 84400/ 115203 | consumed samples: 21606400 | consumed tokens: 44249907200 | elapsed time per iteration (s): 0.56 | learning rate: 5.050E-05 | global batch size: 256 | lm loss: 2.607139E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.133 | TFLOPs: 43.39 | +7: iteration 84410/ 115203 | consumed samples: 21608960 | consumed tokens: 44255150080 | elapsed time per iteration (s): 0.57 | learning rate: 5.048E-05 | global batch size: 256 | lm loss: 2.618490E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.509 | TFLOPs: 42.67 | +7: iteration 84420/ 115203 | consumed samples: 21611520 | consumed tokens: 44260392960 | elapsed time per iteration (s): 0.58 | learning rate: 5.046E-05 | global batch size: 256 | lm loss: 2.597892E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.585 | TFLOPs: 42.29 | +7: iteration 84430/ 115203 | consumed samples: 21614080 | consumed tokens: 44265635840 | elapsed time per iteration (s): 0.58 | learning rate: 5.044E-05 | global batch size: 256 | lm loss: 2.601011E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.393 | TFLOPs: 42.18 | +7: iteration 84440/ 115203 | consumed samples: 21616640 | consumed tokens: 44270878720 | elapsed time per iteration (s): 0.57 | learning rate: 5.043E-05 | global batch size: 256 | lm loss: 2.590684E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.972 | TFLOPs: 43.00 | +7: iteration 84450/ 115203 | consumed samples: 21619200 | consumed tokens: 44276121600 | elapsed time per iteration (s): 0.57 | learning rate: 5.041E-05 | global batch size: 256 | lm loss: 2.601702E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.498 | TFLOPs: 42.85 | +7: iteration 84460/ 115203 | consumed samples: 21621760 | consumed tokens: 44281364480 | elapsed time per iteration (s): 0.57 | learning rate: 5.039E-05 | global batch size: 256 | lm loss: 2.594142E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.471 | TFLOPs: 42.66 | +7: iteration 84470/ 115203 | consumed samples: 21624320 | consumed tokens: 44286607360 | elapsed time per iteration (s): 0.57 | learning rate: 5.037E-05 | global batch size: 256 | lm loss: 2.618862E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.065 | TFLOPs: 43.00 | +7: iteration 84480/ 115203 | consumed samples: 21626880 | consumed tokens: 44291850240 | elapsed time per iteration (s): 0.58 | learning rate: 5.035E-05 | global batch size: 256 | lm loss: 2.610107E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.728 | TFLOPs: 42.11 | +7: iteration 84490/ 115203 | consumed samples: 21629440 | consumed tokens: 44297093120 | elapsed time per iteration (s): 0.58 | learning rate: 5.033E-05 | global batch size: 256 | lm loss: 2.598674E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.817 | TFLOPs: 41.74 | +7: iteration 84500/ 115203 | consumed samples: 21632000 | consumed tokens: 44302336000 | elapsed time per iteration (s): 0.60 | learning rate: 5.031E-05 | global batch size: 256 | lm loss: 2.598837E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.306 | TFLOPs: 40.93 | +7: iteration 84510/ 115203 | consumed samples: 21634560 | consumed tokens: 44307578880 | elapsed time per iteration (s): 0.58 | learning rate: 5.030E-05 | global batch size: 256 | lm loss: 2.626643E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.977 | TFLOPs: 42.04 | +7: iteration 84520/ 115203 | consumed samples: 21637120 | consumed tokens: 44312821760 | elapsed time per iteration (s): 0.59 | learning rate: 5.028E-05 | global batch size: 256 | lm loss: 2.615792E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.460 | TFLOPs: 41.52 | +7: iteration 84530/ 115203 | consumed samples: 21639680 | consumed tokens: 44318064640 | elapsed time per iteration (s): 0.58 | learning rate: 5.026E-05 | global batch size: 256 | lm loss: 2.600938E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.584 | TFLOPs: 42.10 | +7: iteration 84540/ 115203 | consumed samples: 21642240 | consumed tokens: 44323307520 | elapsed time per iteration (s): 0.58 | learning rate: 5.024E-05 | global batch size: 256 | lm loss: 2.596334E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.157 | TFLOPs: 42.25 | +7: iteration 84550/ 115203 | consumed samples: 21644800 | consumed tokens: 44328550400 | elapsed time per iteration (s): 0.58 | learning rate: 5.022E-05 | global batch size: 256 | lm loss: 2.614580E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.197 | TFLOPs: 42.06 | +7: iteration 84560/ 115203 | consumed samples: 21647360 | consumed tokens: 44333793280 | elapsed time per iteration (s): 0.59 | learning rate: 5.020E-05 | global batch size: 256 | lm loss: 2.607984E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.106 | TFLOPs: 41.39 | +7: iteration 84570/ 115203 | consumed samples: 21649920 | consumed tokens: 44339036160 | elapsed time per iteration (s): 0.57 | learning rate: 5.018E-05 | global batch size: 256 | lm loss: 2.626007E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.105 | TFLOPs: 42.72 | +7: iteration 84580/ 115203 | consumed samples: 21652480 | consumed tokens: 44344279040 | elapsed time per iteration (s): 0.56 | learning rate: 5.017E-05 | global batch size: 256 | lm loss: 2.607598E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.925 | TFLOPs: 43.94 | +7: iteration 84590/ 115203 | consumed samples: 21655040 | consumed tokens: 44349521920 | elapsed time per iteration (s): 0.56 | learning rate: 5.015E-05 | global batch size: 256 | lm loss: 2.608275E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.960 | TFLOPs: 43.95 | +7: iteration 84600/ 115203 | consumed samples: 21657600 | consumed tokens: 44354764800 | elapsed time per iteration (s): 0.55 | learning rate: 5.013E-05 | global batch size: 256 | lm loss: 2.595031E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.359 | TFLOPs: 43.99 | +7: iteration 84610/ 115203 | consumed samples: 21660160 | consumed tokens: 44360007680 | elapsed time per iteration (s): 0.57 | learning rate: 5.011E-05 | global batch size: 256 | lm loss: 2.606865E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.442 | TFLOPs: 43.14 | +7: iteration 84620/ 115203 | consumed samples: 21662720 | consumed tokens: 44365250560 | elapsed time per iteration (s): 0.58 | learning rate: 5.009E-05 | global batch size: 256 | lm loss: 2.592618E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.774 | TFLOPs: 42.21 | +7: iteration 84630/ 115203 | consumed samples: 21665280 | consumed tokens: 44370493440 | elapsed time per iteration (s): 0.56 | learning rate: 5.007E-05 | global batch size: 256 | lm loss: 2.605446E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.884 | TFLOPs: 43.46 | +7: iteration 84640/ 115203 | consumed samples: 21667840 | consumed tokens: 44375736320 | elapsed time per iteration (s): 0.56 | learning rate: 5.006E-05 | global batch size: 256 | lm loss: 2.595520E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.886 | TFLOPs: 43.27 | +7: iteration 84650/ 115203 | consumed samples: 21670400 | consumed tokens: 44380979200 | elapsed time per iteration (s): 0.57 | learning rate: 5.004E-05 | global batch size: 256 | lm loss: 2.604328E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.699 | TFLOPs: 42.87 | +7: iteration 84660/ 115203 | consumed samples: 21672960 | consumed tokens: 44386222080 | elapsed time per iteration (s): 0.56 | learning rate: 5.002E-05 | global batch size: 256 | lm loss: 2.614108E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.350 | TFLOPs: 43.70 | +7: iteration 84670/ 115203 | consumed samples: 21675520 | consumed tokens: 44391464960 | elapsed time per iteration (s): 0.56 | learning rate: 5.000E-05 | global batch size: 256 | lm loss: 2.606166E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.846 | TFLOPs: 43.75 | +7: iteration 84680/ 115203 | consumed samples: 21678080 | consumed tokens: 44396707840 | elapsed time per iteration (s): 0.57 | learning rate: 4.998E-05 | global batch size: 256 | lm loss: 2.608060E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.763 | TFLOPs: 43.17 | +7: iteration 84690/ 115203 | consumed samples: 21680640 | consumed tokens: 44401950720 | elapsed time per iteration (s): 0.57 | learning rate: 4.996E-05 | global batch size: 256 | lm loss: 2.600589E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.038 | TFLOPs: 42.81 | +7: iteration 84700/ 115203 | consumed samples: 21683200 | consumed tokens: 44407193600 | elapsed time per iteration (s): 0.59 | learning rate: 4.994E-05 | global batch size: 256 | lm loss: 2.617562E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.788 | TFLOPs: 41.45 | +7: iteration 84710/ 115203 | consumed samples: 21685760 | consumed tokens: 44412436480 | elapsed time per iteration (s): 0.56 | learning rate: 4.993E-05 | global batch size: 256 | lm loss: 2.590676E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.977 | TFLOPs: 43.38 | +7: iteration 84720/ 115203 | consumed samples: 21688320 | consumed tokens: 44417679360 | elapsed time per iteration (s): 0.55 | learning rate: 4.991E-05 | global batch size: 256 | lm loss: 2.598433E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 84730/ 115203 | consumed samples: 21690880 | consumed tokens: 44422922240 | elapsed time per iteration (s): 0.56 | learning rate: 4.989E-05 | global batch size: 256 | lm loss: 2.604440E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.100 | TFLOPs: 43.48 | +7: iteration 84740/ 115203 | consumed samples: 21693440 | consumed tokens: 44428165120 | elapsed time per iteration (s): 0.57 | learning rate: 4.987E-05 | global batch size: 256 | lm loss: 2.610550E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.235 | TFLOPs: 42.92 | +7: iteration 84750/ 115203 | consumed samples: 21696000 | consumed tokens: 44433408000 | elapsed time per iteration (s): 0.57 | learning rate: 4.985E-05 | global batch size: 256 | lm loss: 2.601518E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.823 | TFLOPs: 43.08 | +7: iteration 84760/ 115203 | consumed samples: 21698560 | consumed tokens: 44438650880 | elapsed time per iteration (s): 0.56 | learning rate: 4.983E-05 | global batch size: 256 | lm loss: 2.611191E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.453 | TFLOPs: 43.61 | +7: iteration 84770/ 115203 | consumed samples: 21701120 | consumed tokens: 44443893760 | elapsed time per iteration (s): 0.58 | learning rate: 4.982E-05 | global batch size: 256 | lm loss: 2.601420E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.951 | TFLOPs: 42.33 | +7: iteration 84780/ 115203 | consumed samples: 21703680 | consumed tokens: 44449136640 | elapsed time per iteration (s): 0.57 | learning rate: 4.980E-05 | global batch size: 256 | lm loss: 2.611518E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.964 | TFLOPs: 42.71 | +7: iteration 84790/ 115203 | consumed samples: 21706240 | consumed tokens: 44454379520 | elapsed time per iteration (s): 0.57 | learning rate: 4.978E-05 | global batch size: 256 | lm loss: 2.607061E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.701 | TFLOPs: 43.16 | +7: iteration 84800/ 115203 | consumed samples: 21708800 | consumed tokens: 44459622400 | elapsed time per iteration (s): 0.57 | learning rate: 4.976E-05 | global batch size: 256 | lm loss: 2.614352E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.425 | TFLOPs: 42.66 | +7: iteration 84810/ 115203 | consumed samples: 21711360 | consumed tokens: 44464865280 | elapsed time per iteration (s): 0.56 | learning rate: 4.974E-05 | global batch size: 256 | lm loss: 2.604260E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.736 | TFLOPs: 43.83 | +7: iteration 84820/ 115203 | consumed samples: 21713920 | consumed tokens: 44470108160 | elapsed time per iteration (s): 0.56 | learning rate: 4.972E-05 | global batch size: 256 | lm loss: 2.616063E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.288 | TFLOPs: 43.22 | +7: iteration 84830/ 115203 | consumed samples: 21716480 | consumed tokens: 44475351040 | elapsed time per iteration (s): 0.57 | learning rate: 4.970E-05 | global batch size: 256 | lm loss: 2.594398E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.656 | TFLOPs: 43.16 | +7: iteration 84840/ 115203 | consumed samples: 21719040 | consumed tokens: 44480593920 | elapsed time per iteration (s): 0.56 | learning rate: 4.969E-05 | global batch size: 256 | lm loss: 2.598813E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.310 | TFLOPs: 43.31 | +7: iteration 84850/ 115203 | consumed samples: 21721600 | consumed tokens: 44485836800 | elapsed time per iteration (s): 0.56 | learning rate: 4.967E-05 | global batch size: 256 | lm loss: 2.611732E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.630 | TFLOPs: 43.73 | +7: iteration 84860/ 115203 | consumed samples: 21724160 | consumed tokens: 44491079680 | elapsed time per iteration (s): 0.58 | learning rate: 4.965E-05 | global batch size: 256 | lm loss: 2.590671E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.084 | TFLOPs: 42.15 | +7: iteration 84870/ 115203 | consumed samples: 21726720 | consumed tokens: 44496322560 | elapsed time per iteration (s): 0.57 | learning rate: 4.963E-05 | global batch size: 256 | lm loss: 2.607619E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.637 | TFLOPs: 43.06 | +7: iteration 84880/ 115203 | consumed samples: 21729280 | consumed tokens: 44501565440 | elapsed time per iteration (s): 0.56 | learning rate: 4.961E-05 | global batch size: 256 | lm loss: 2.591864E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.135 | TFLOPs: 43.49 | +7: iteration 84890/ 115203 | consumed samples: 21731840 | consumed tokens: 44506808320 | elapsed time per iteration (s): 0.56 | learning rate: 4.959E-05 | global batch size: 256 | lm loss: 2.609101E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.495 | TFLOPs: 43.62 | +7: iteration 84900/ 115203 | consumed samples: 21734400 | consumed tokens: 44512051200 | elapsed time per iteration (s): 0.56 | learning rate: 4.958E-05 | global batch size: 256 | lm loss: 2.595040E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.175 | TFLOPs: 43.49 | +7: iteration 84910/ 115203 | consumed samples: 21736960 | consumed tokens: 44517294080 | elapsed time per iteration (s): 0.57 | learning rate: 4.956E-05 | global batch size: 256 | lm loss: 2.590591E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.804 | TFLOPs: 43.17 | +7: iteration 84920/ 115203 | consumed samples: 21739520 | consumed tokens: 44522536960 | elapsed time per iteration (s): 0.57 | learning rate: 4.954E-05 | global batch size: 256 | lm loss: 2.609358E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.624 | TFLOPs: 42.58 | +7: iteration 84930/ 115203 | consumed samples: 21742080 | consumed tokens: 44527779840 | elapsed time per iteration (s): 0.58 | learning rate: 4.952E-05 | global batch size: 256 | lm loss: 2.612611E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.307 | TFLOPs: 42.07 | +7: iteration 84940/ 115203 | consumed samples: 21744640 | consumed tokens: 44533022720 | elapsed time per iteration (s): 0.56 | learning rate: 4.950E-05 | global batch size: 256 | lm loss: 2.602832E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.218 | TFLOPs: 43.97 | +7: iteration 84950/ 115203 | consumed samples: 21747200 | consumed tokens: 44538265600 | elapsed time per iteration (s): 0.56 | learning rate: 4.948E-05 | global batch size: 256 | lm loss: 2.594544E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.896 | TFLOPs: 43.46 | +7: iteration 84960/ 115203 | consumed samples: 21749760 | consumed tokens: 44543508480 | elapsed time per iteration (s): 0.55 | learning rate: 4.947E-05 | global batch size: 256 | lm loss: 2.601416E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.439 | TFLOPs: 43.99 | +7: iteration 84970/ 115203 | consumed samples: 21752320 | consumed tokens: 44548751360 | elapsed time per iteration (s): 0.57 | learning rate: 4.945E-05 | global batch size: 256 | lm loss: 2.599783E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.745 | TFLOPs: 42.97 | +7: iteration 84980/ 115203 | consumed samples: 21754880 | consumed tokens: 44553994240 | elapsed time per iteration (s): 0.55 | learning rate: 4.943E-05 | global batch size: 256 | lm loss: 2.600151E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.356 | TFLOPs: 43.99 | +7: iteration 84990/ 115203 | consumed samples: 21757440 | consumed tokens: 44559237120 | elapsed time per iteration (s): 0.56 | learning rate: 4.941E-05 | global batch size: 256 | lm loss: 2.613879E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.854 | TFLOPs: 43.56 | +7: iteration 85000/ 115203 | consumed samples: 21760000 | consumed tokens: 44564480000 | elapsed time per iteration (s): 0.57 | learning rate: 4.939E-05 | global batch size: 256 | lm loss: 2.605063E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.048 | TFLOPs: 42.72 | +7: iteration 85010/ 115203 | consumed samples: 21762560 | consumed tokens: 44569722880 | elapsed time per iteration (s): 0.55 | learning rate: 4.937E-05 | global batch size: 256 | lm loss: 2.613768E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.387 | TFLOPs: 43.99 | +7: iteration 85020/ 115203 | consumed samples: 21765120 | consumed tokens: 44574965760 | elapsed time per iteration (s): 0.56 | learning rate: 4.936E-05 | global batch size: 256 | lm loss: 2.593641E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.157 | TFLOPs: 43.97 | +7: iteration 85030/ 115203 | consumed samples: 21767680 | consumed tokens: 44580208640 | elapsed time per iteration (s): 0.55 | learning rate: 4.934E-05 | global batch size: 256 | lm loss: 2.601624E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.352 | TFLOPs: 43.98 | +7: iteration 85040/ 115203 | consumed samples: 21770240 | consumed tokens: 44585451520 | elapsed time per iteration (s): 0.56 | learning rate: 4.932E-05 | global batch size: 256 | lm loss: 2.606835E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 85050/ 115203 | consumed samples: 21772800 | consumed tokens: 44590694400 | elapsed time per iteration (s): 0.56 | learning rate: 4.930E-05 | global batch size: 256 | lm loss: 2.602600E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.904 | TFLOPs: 43.66 | +7: iteration 85060/ 115203 | consumed samples: 21775360 | consumed tokens: 44595937280 | elapsed time per iteration (s): 0.57 | learning rate: 4.928E-05 | global batch size: 256 | lm loss: 2.598225E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.188 | TFLOPs: 42.92 | +7: iteration 85070/ 115203 | consumed samples: 21777920 | consumed tokens: 44601180160 | elapsed time per iteration (s): 0.56 | learning rate: 4.926E-05 | global batch size: 256 | lm loss: 2.595624E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.127 | TFLOPs: 43.30 | +7: iteration 85080/ 115203 | consumed samples: 21780480 | consumed tokens: 44606423040 | elapsed time per iteration (s): 0.56 | learning rate: 4.925E-05 | global batch size: 256 | lm loss: 2.608943E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.133 | TFLOPs: 43.96 | +7: iteration 85090/ 115203 | consumed samples: 21783040 | consumed tokens: 44611665920 | elapsed time per iteration (s): 0.56 | learning rate: 4.923E-05 | global batch size: 256 | lm loss: 2.595806E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.591 | TFLOPs: 43.44 | +7: iteration 85100/ 115203 | consumed samples: 21785600 | consumed tokens: 44616908800 | elapsed time per iteration (s): 0.56 | learning rate: 4.921E-05 | global batch size: 256 | lm loss: 2.600835E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.309 | TFLOPs: 43.89 | +7: iteration 85110/ 115203 | consumed samples: 21788160 | consumed tokens: 44622151680 | elapsed time per iteration (s): 0.56 | learning rate: 4.919E-05 | global batch size: 256 | lm loss: 2.615084E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.322 | TFLOPs: 43.51 | +7: iteration 85120/ 115203 | consumed samples: 21790720 | consumed tokens: 44627394560 | elapsed time per iteration (s): 0.57 | learning rate: 4.917E-05 | global batch size: 256 | lm loss: 2.617899E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.083 | TFLOPs: 43.20 | +7: iteration 85130/ 115203 | consumed samples: 21793280 | consumed tokens: 44632637440 | elapsed time per iteration (s): 0.56 | learning rate: 4.915E-05 | global batch size: 256 | lm loss: 2.607921E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.048 | TFLOPs: 43.38 | +7: iteration 85140/ 115203 | consumed samples: 21795840 | consumed tokens: 44637880320 | elapsed time per iteration (s): 0.55 | learning rate: 4.914E-05 | global batch size: 256 | lm loss: 2.599168E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.417 | TFLOPs: 43.99 | +7: iteration 85150/ 115203 | consumed samples: 21798400 | consumed tokens: 44643123200 | elapsed time per iteration (s): 0.57 | learning rate: 4.912E-05 | global batch size: 256 | lm loss: 2.590973E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.465 | TFLOPs: 42.95 | +7: iteration 85160/ 115203 | consumed samples: 21800960 | consumed tokens: 44648366080 | elapsed time per iteration (s): 0.56 | learning rate: 4.910E-05 | global batch size: 256 | lm loss: 2.606050E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.817 | TFLOPs: 43.36 | +7: iteration 85170/ 115203 | consumed samples: 21803520 | consumed tokens: 44653608960 | elapsed time per iteration (s): 0.56 | learning rate: 4.908E-05 | global batch size: 256 | lm loss: 2.604845E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.201 | TFLOPs: 43.68 | +7: iteration 85180/ 115203 | consumed samples: 21806080 | consumed tokens: 44658851840 | elapsed time per iteration (s): 0.58 | learning rate: 4.906E-05 | global batch size: 256 | lm loss: 2.611272E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.403 | TFLOPs: 42.08 | +7: iteration 85190/ 115203 | consumed samples: 21808640 | consumed tokens: 44664094720 | elapsed time per iteration (s): 0.56 | learning rate: 4.905E-05 | global batch size: 256 | lm loss: 2.586275E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.825 | TFLOPs: 43.55 | +7: iteration 85200/ 115203 | consumed samples: 21811200 | consumed tokens: 44669337600 | elapsed time per iteration (s): 0.57 | learning rate: 4.903E-05 | global batch size: 256 | lm loss: 2.604030E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.987 | TFLOPs: 43.09 | +7: iteration 85210/ 115203 | consumed samples: 21813760 | consumed tokens: 44674580480 | elapsed time per iteration (s): 0.57 | learning rate: 4.901E-05 | global batch size: 256 | lm loss: 2.596731E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.872 | TFLOPs: 42.89 | +7: iteration 85220/ 115203 | consumed samples: 21816320 | consumed tokens: 44679823360 | elapsed time per iteration (s): 0.57 | learning rate: 4.899E-05 | global batch size: 256 | lm loss: 2.605461E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.507 | TFLOPs: 43.05 | +7: iteration 85230/ 115203 | consumed samples: 21818880 | consumed tokens: 44685066240 | elapsed time per iteration (s): 0.56 | learning rate: 4.897E-05 | global batch size: 256 | lm loss: 2.608463E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.246 | TFLOPs: 43.21 | +7: iteration 85240/ 115203 | consumed samples: 21821440 | consumed tokens: 44690309120 | elapsed time per iteration (s): 0.57 | learning rate: 4.895E-05 | global batch size: 256 | lm loss: 2.603776E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.885 | TFLOPs: 42.99 | +7: iteration 85250/ 115203 | consumed samples: 21824000 | consumed tokens: 44695552000 | elapsed time per iteration (s): 0.56 | learning rate: 4.894E-05 | global batch size: 256 | lm loss: 2.600117E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.489 | TFLOPs: 43.71 | +7: iteration 85260/ 115203 | consumed samples: 21826560 | consumed tokens: 44700794880 | elapsed time per iteration (s): 0.57 | learning rate: 4.892E-05 | global batch size: 256 | lm loss: 2.612826E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.741 | TFLOPs: 42.97 | +7: iteration 85270/ 115203 | consumed samples: 21829120 | consumed tokens: 44706037760 | elapsed time per iteration (s): 0.57 | learning rate: 4.890E-05 | global batch size: 256 | lm loss: 2.596217E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.420 | TFLOPs: 43.04 | +7: iteration 85280/ 115203 | consumed samples: 21831680 | consumed tokens: 44711280640 | elapsed time per iteration (s): 0.56 | learning rate: 4.888E-05 | global batch size: 256 | lm loss: 2.589520E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.308 | TFLOPs: 43.31 | +7: iteration 85290/ 115203 | consumed samples: 21834240 | consumed tokens: 44716523520 | elapsed time per iteration (s): 0.56 | learning rate: 4.886E-05 | global batch size: 256 | lm loss: 2.609343E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.402 | TFLOPs: 43.23 | +7: iteration 85300/ 115203 | consumed samples: 21836800 | consumed tokens: 44721766400 | elapsed time per iteration (s): 0.56 | learning rate: 4.884E-05 | global batch size: 256 | lm loss: 2.585837E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.055 | TFLOPs: 43.67 | +7: iteration 85310/ 115203 | consumed samples: 21839360 | consumed tokens: 44727009280 | elapsed time per iteration (s): 0.56 | learning rate: 4.883E-05 | global batch size: 256 | lm loss: 2.600537E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.699 | TFLOPs: 43.73 | +7: iteration 85320/ 115203 | consumed samples: 21841920 | consumed tokens: 44732252160 | elapsed time per iteration (s): 0.55 | learning rate: 4.881E-05 | global batch size: 256 | lm loss: 2.607611E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.454 | TFLOPs: 43.99 | +7: iteration 85330/ 115203 | consumed samples: 21844480 | consumed tokens: 44737495040 | elapsed time per iteration (s): 0.55 | learning rate: 4.879E-05 | global batch size: 256 | lm loss: 2.596705E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.469 | TFLOPs: 44.00 | +7: iteration 85340/ 115203 | consumed samples: 21847040 | consumed tokens: 44742737920 | elapsed time per iteration (s): 0.58 | learning rate: 4.877E-05 | global batch size: 256 | lm loss: 2.592589E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.345 | TFLOPs: 42.17 | +7: iteration 85350/ 115203 | consumed samples: 21849600 | consumed tokens: 44747980800 | elapsed time per iteration (s): 0.56 | learning rate: 4.875E-05 | global batch size: 256 | lm loss: 2.592396E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.593 | TFLOPs: 43.44 | +7: iteration 85360/ 115203 | consumed samples: 21852160 | consumed tokens: 44753223680 | elapsed time per iteration (s): 0.55 | learning rate: 4.874E-05 | global batch size: 256 | lm loss: 2.602536E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 85370/ 115203 | consumed samples: 21854720 | consumed tokens: 44758466560 | elapsed time per iteration (s): 0.56 | learning rate: 4.872E-05 | global batch size: 256 | lm loss: 2.606764E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.261 | TFLOPs: 43.21 | +7: iteration 85380/ 115203 | consumed samples: 21857280 | consumed tokens: 44763709440 | elapsed time per iteration (s): 0.57 | learning rate: 4.870E-05 | global batch size: 256 | lm loss: 2.600840E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.757 | TFLOPs: 43.07 | +7: iteration 85390/ 115203 | consumed samples: 21859840 | consumed tokens: 44768952320 | elapsed time per iteration (s): 0.56 | learning rate: 4.868E-05 | global batch size: 256 | lm loss: 2.594345E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.734 | TFLOPs: 43.45 | +7: iteration 85400/ 115203 | consumed samples: 21862400 | consumed tokens: 44774195200 | elapsed time per iteration (s): 0.56 | learning rate: 4.866E-05 | global batch size: 256 | lm loss: 2.611049E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.192 | TFLOPs: 43.68 | +7: iteration 85410/ 115203 | consumed samples: 21864960 | consumed tokens: 44779438080 | elapsed time per iteration (s): 0.56 | learning rate: 4.864E-05 | global batch size: 256 | lm loss: 2.603616E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.587 | TFLOPs: 43.53 | +7: iteration 85420/ 115203 | consumed samples: 21867520 | consumed tokens: 44784680960 | elapsed time per iteration (s): 0.56 | learning rate: 4.863E-05 | global batch size: 256 | lm loss: 2.613952E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.813 | TFLOPs: 43.65 | +7: iteration 85430/ 115203 | consumed samples: 21870080 | consumed tokens: 44789923840 | elapsed time per iteration (s): 0.56 | learning rate: 4.861E-05 | global batch size: 256 | lm loss: 2.611631E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.855 | TFLOPs: 43.65 | +7: iteration 85440/ 115203 | consumed samples: 21872640 | consumed tokens: 44795166720 | elapsed time per iteration (s): 0.56 | learning rate: 4.859E-05 | global batch size: 256 | lm loss: 2.607241E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.161 | TFLOPs: 43.68 | +7: iteration 85450/ 115203 | consumed samples: 21875200 | consumed tokens: 44800409600 | elapsed time per iteration (s): 0.55 | learning rate: 4.857E-05 | global batch size: 256 | lm loss: 2.608311E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.468 | TFLOPs: 44.00 | +7: iteration 85460/ 115203 | consumed samples: 21877760 | consumed tokens: 44805652480 | elapsed time per iteration (s): 0.56 | learning rate: 4.855E-05 | global batch size: 256 | lm loss: 2.604956E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.560 | TFLOPs: 43.53 | +7: iteration 85470/ 115203 | consumed samples: 21880320 | consumed tokens: 44810895360 | elapsed time per iteration (s): 0.56 | learning rate: 4.854E-05 | global batch size: 256 | lm loss: 2.603815E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.296 | TFLOPs: 43.69 | +7: iteration 85480/ 115203 | consumed samples: 21882880 | consumed tokens: 44816138240 | elapsed time per iteration (s): 0.56 | learning rate: 4.852E-05 | global batch size: 256 | lm loss: 2.589068E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.595 | TFLOPs: 43.72 | +7: iteration 85490/ 115203 | consumed samples: 21885440 | consumed tokens: 44821381120 | elapsed time per iteration (s): 0.56 | learning rate: 4.850E-05 | global batch size: 256 | lm loss: 2.619166E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.622 | TFLOPs: 43.44 | +7: iteration 85500/ 115203 | consumed samples: 21888000 | consumed tokens: 44826624000 | elapsed time per iteration (s): 0.56 | learning rate: 4.848E-05 | global batch size: 256 | lm loss: 2.613002E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.173 | TFLOPs: 43.40 | +7: iteration 85510/ 115203 | consumed samples: 21890560 | consumed tokens: 44831866880 | elapsed time per iteration (s): 0.57 | learning rate: 4.846E-05 | global batch size: 256 | lm loss: 2.599018E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.453 | TFLOPs: 42.95 | +7: iteration 85520/ 115203 | consumed samples: 21893120 | consumed tokens: 44837109760 | elapsed time per iteration (s): 0.57 | learning rate: 4.845E-05 | global batch size: 256 | lm loss: 2.606299E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.916 | TFLOPs: 43.18 | +7: iteration 85530/ 115203 | consumed samples: 21895680 | consumed tokens: 44842352640 | elapsed time per iteration (s): 0.56 | learning rate: 4.843E-05 | global batch size: 256 | lm loss: 2.602090E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.731 | TFLOPs: 43.45 | +7: iteration 85540/ 115203 | consumed samples: 21898240 | consumed tokens: 44847595520 | elapsed time per iteration (s): 0.55 | learning rate: 4.841E-05 | global batch size: 256 | lm loss: 2.602247E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.471 | TFLOPs: 44.00 | +7: iteration 85550/ 115203 | consumed samples: 21900800 | consumed tokens: 44852838400 | elapsed time per iteration (s): 0.57 | learning rate: 4.839E-05 | global batch size: 256 | lm loss: 2.594236E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.632 | TFLOPs: 43.06 | +7: iteration 85560/ 115203 | consumed samples: 21903360 | consumed tokens: 44858081280 | elapsed time per iteration (s): 0.56 | learning rate: 4.837E-05 | global batch size: 256 | lm loss: 2.609009E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.959 | TFLOPs: 43.85 | +7: iteration 85570/ 115203 | consumed samples: 21905920 | consumed tokens: 44863324160 | elapsed time per iteration (s): 0.57 | learning rate: 4.836E-05 | global batch size: 256 | lm loss: 2.600796E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.412 | TFLOPs: 42.94 | +7: iteration 85580/ 115203 | consumed samples: 21908480 | consumed tokens: 44868567040 | elapsed time per iteration (s): 0.57 | learning rate: 4.834E-05 | global batch size: 256 | lm loss: 2.612839E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.782 | TFLOPs: 43.17 | +7: iteration 85590/ 115203 | consumed samples: 21911040 | consumed tokens: 44873809920 | elapsed time per iteration (s): 0.58 | learning rate: 4.832E-05 | global batch size: 256 | lm loss: 2.603896E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.928 | TFLOPs: 42.04 | +7: iteration 85600/ 115203 | consumed samples: 21913600 | consumed tokens: 44879052800 | elapsed time per iteration (s): 0.56 | learning rate: 4.830E-05 | global batch size: 256 | lm loss: 2.606510E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.531 | TFLOPs: 43.72 | +7: iteration 85610/ 115203 | consumed samples: 21916160 | consumed tokens: 44884295680 | elapsed time per iteration (s): 0.57 | learning rate: 4.828E-05 | global batch size: 256 | lm loss: 2.601093E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.805 | TFLOPs: 43.07 | +7: iteration 85620/ 115203 | consumed samples: 21918720 | consumed tokens: 44889538560 | elapsed time per iteration (s): 0.56 | learning rate: 4.827E-05 | global batch size: 256 | lm loss: 2.598792E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.339 | TFLOPs: 43.32 | +7: iteration 85630/ 115203 | consumed samples: 21921280 | consumed tokens: 44894781440 | elapsed time per iteration (s): 0.57 | learning rate: 4.825E-05 | global batch size: 256 | lm loss: 2.605549E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.967 | TFLOPs: 43.19 | +7: iteration 85640/ 115203 | consumed samples: 21923840 | consumed tokens: 44900024320 | elapsed time per iteration (s): 0.56 | learning rate: 4.823E-05 | global batch size: 256 | lm loss: 2.621105E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.235 | TFLOPs: 43.40 | +7: iteration 85650/ 115203 | consumed samples: 21926400 | consumed tokens: 44905267200 | elapsed time per iteration (s): 0.56 | learning rate: 4.821E-05 | global batch size: 256 | lm loss: 2.596146E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.223 | TFLOPs: 43.21 | +7: iteration 85660/ 115203 | consumed samples: 21928960 | consumed tokens: 44910510080 | elapsed time per iteration (s): 0.56 | learning rate: 4.819E-05 | global batch size: 256 | lm loss: 2.608234E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.038 | TFLOPs: 43.48 | +7: iteration 85670/ 115203 | consumed samples: 21931520 | consumed tokens: 44915752960 | elapsed time per iteration (s): 0.56 | learning rate: 4.817E-05 | global batch size: 256 | lm loss: 2.597603E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.183 | TFLOPs: 43.49 | +7: iteration 85680/ 115203 | consumed samples: 21934080 | consumed tokens: 44920995840 | elapsed time per iteration (s): 0.56 | learning rate: 4.816E-05 | global batch size: 256 | lm loss: 2.602834E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.037 | TFLOPs: 43.67 | +7: iteration 85690/ 115203 | consumed samples: 21936640 | consumed tokens: 44926238720 | elapsed time per iteration (s): 0.56 | learning rate: 4.814E-05 | global batch size: 256 | lm loss: 2.599992E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.714 | TFLOPs: 43.83 | +7: iteration 85700/ 115203 | consumed samples: 21939200 | consumed tokens: 44931481600 | elapsed time per iteration (s): 0.57 | learning rate: 4.812E-05 | global batch size: 256 | lm loss: 2.593252E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.398 | TFLOPs: 43.04 | +7: iteration 85710/ 115203 | consumed samples: 21941760 | consumed tokens: 44936724480 | elapsed time per iteration (s): 0.56 | learning rate: 4.810E-05 | global batch size: 256 | lm loss: 2.611041E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.461 | TFLOPs: 43.52 | +7: iteration 85720/ 115203 | consumed samples: 21944320 | consumed tokens: 44941967360 | elapsed time per iteration (s): 0.57 | learning rate: 4.808E-05 | global batch size: 256 | lm loss: 2.609042E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.445 | TFLOPs: 43.04 | +7: iteration 85730/ 115203 | consumed samples: 21946880 | consumed tokens: 44947210240 | elapsed time per iteration (s): 0.56 | learning rate: 4.807E-05 | global batch size: 256 | lm loss: 2.605359E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.578 | TFLOPs: 43.34 | +7: iteration 85740/ 115203 | consumed samples: 21949440 | consumed tokens: 44952453120 | elapsed time per iteration (s): 0.57 | learning rate: 4.805E-05 | global batch size: 256 | lm loss: 2.603531E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.305 | TFLOPs: 42.55 | +7: iteration 85750/ 115203 | consumed samples: 21952000 | consumed tokens: 44957696000 | elapsed time per iteration (s): 0.56 | learning rate: 4.803E-05 | global batch size: 256 | lm loss: 2.601193E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.022 | TFLOPs: 43.48 | +7: iteration 85760/ 115203 | consumed samples: 21954560 | consumed tokens: 44962938880 | elapsed time per iteration (s): 0.56 | learning rate: 4.801E-05 | global batch size: 256 | lm loss: 2.607795E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.420 | TFLOPs: 43.61 | +7: iteration 85770/ 115203 | consumed samples: 21957120 | consumed tokens: 44968181760 | elapsed time per iteration (s): 0.56 | learning rate: 4.800E-05 | global batch size: 256 | lm loss: 2.605829E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.500 | TFLOPs: 43.62 | +7: iteration 85780/ 115203 | consumed samples: 21959680 | consumed tokens: 44973424640 | elapsed time per iteration (s): 0.57 | learning rate: 4.798E-05 | global batch size: 256 | lm loss: 2.604483E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.574 | TFLOPs: 43.15 | +7: iteration 85790/ 115203 | consumed samples: 21962240 | consumed tokens: 44978667520 | elapsed time per iteration (s): 0.57 | learning rate: 4.796E-05 | global batch size: 256 | lm loss: 2.610590E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.691 | TFLOPs: 43.16 | +7: iteration 85800/ 115203 | consumed samples: 21964800 | consumed tokens: 44983910400 | elapsed time per iteration (s): 0.57 | learning rate: 4.794E-05 | global batch size: 256 | lm loss: 2.601291E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.211 | TFLOPs: 43.11 | +7: iteration 85810/ 115203 | consumed samples: 21967360 | consumed tokens: 44989153280 | elapsed time per iteration (s): 0.57 | learning rate: 4.792E-05 | global batch size: 256 | lm loss: 2.611675E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.435 | TFLOPs: 43.04 | +7: iteration 85820/ 115203 | consumed samples: 21969920 | consumed tokens: 44994396160 | elapsed time per iteration (s): 0.57 | learning rate: 4.791E-05 | global batch size: 256 | lm loss: 2.609431E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.927 | TFLOPs: 43.18 | +7: iteration 85830/ 115203 | consumed samples: 21972480 | consumed tokens: 44999639040 | elapsed time per iteration (s): 0.56 | learning rate: 4.789E-05 | global batch size: 256 | lm loss: 2.593834E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.804 | TFLOPs: 43.74 | +7: iteration 85840/ 115203 | consumed samples: 21975040 | consumed tokens: 45004881920 | elapsed time per iteration (s): 0.55 | learning rate: 4.787E-05 | global batch size: 256 | lm loss: 2.592129E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.535 | TFLOPs: 44.00 | +7: iteration 85850/ 115203 | consumed samples: 21977600 | consumed tokens: 45010124800 | elapsed time per iteration (s): 0.58 | learning rate: 4.785E-05 | global batch size: 256 | lm loss: 2.593238E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.901 | TFLOPs: 41.84 | +7: iteration 85860/ 115203 | consumed samples: 21980160 | consumed tokens: 45015367680 | elapsed time per iteration (s): 0.57 | learning rate: 4.783E-05 | global batch size: 256 | lm loss: 2.606010E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.530 | TFLOPs: 42.57 | +7: iteration 85870/ 115203 | consumed samples: 21982720 | consumed tokens: 45020610560 | elapsed time per iteration (s): 0.56 | learning rate: 4.782E-05 | global batch size: 256 | lm loss: 2.586681E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.858 | TFLOPs: 43.56 | +7: iteration 85880/ 115203 | consumed samples: 21985280 | consumed tokens: 45025853440 | elapsed time per iteration (s): 0.56 | learning rate: 4.780E-05 | global batch size: 256 | lm loss: 2.595668E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.054 | TFLOPs: 43.77 | +7: iteration 85890/ 115203 | consumed samples: 21987840 | consumed tokens: 45031096320 | elapsed time per iteration (s): 0.56 | learning rate: 4.778E-05 | global batch size: 256 | lm loss: 2.605623E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.615 | TFLOPs: 43.44 | +7: iteration 85900/ 115203 | consumed samples: 21990400 | consumed tokens: 45036339200 | elapsed time per iteration (s): 0.57 | learning rate: 4.776E-05 | global batch size: 256 | lm loss: 2.617405E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.715 | TFLOPs: 42.88 | +7: iteration 85910/ 115203 | consumed samples: 21992960 | consumed tokens: 45041582080 | elapsed time per iteration (s): 0.56 | learning rate: 4.774E-05 | global batch size: 256 | lm loss: 2.610919E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.261 | TFLOPs: 43.98 | +7: iteration 85920/ 115203 | consumed samples: 21995520 | consumed tokens: 45046824960 | elapsed time per iteration (s): 0.57 | learning rate: 4.773E-05 | global batch size: 256 | lm loss: 2.603415E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.428 | TFLOPs: 42.66 | +7: iteration 85930/ 115203 | consumed samples: 21998080 | consumed tokens: 45052067840 | elapsed time per iteration (s): 0.56 | learning rate: 4.771E-05 | global batch size: 256 | lm loss: 2.602579E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.010 | TFLOPs: 43.57 | +7: iteration 85940/ 115203 | consumed samples: 22000640 | consumed tokens: 45057310720 | elapsed time per iteration (s): 0.56 | learning rate: 4.769E-05 | global batch size: 256 | lm loss: 2.598460E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.411 | TFLOPs: 43.42 | +7: iteration 85950/ 115203 | consumed samples: 22003200 | consumed tokens: 45062553600 | elapsed time per iteration (s): 0.57 | learning rate: 4.767E-05 | global batch size: 256 | lm loss: 2.599502E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.792 | TFLOPs: 42.88 | +7: iteration 85960/ 115203 | consumed samples: 22005760 | consumed tokens: 45067796480 | elapsed time per iteration (s): 0.57 | learning rate: 4.765E-05 | global batch size: 256 | lm loss: 2.585786E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.746 | TFLOPs: 43.16 | +7: iteration 85970/ 115203 | consumed samples: 22008320 | consumed tokens: 45073039360 | elapsed time per iteration (s): 0.56 | learning rate: 4.764E-05 | global batch size: 256 | lm loss: 2.598977E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.031 | TFLOPs: 43.67 | +7: iteration 85980/ 115203 | consumed samples: 22010880 | consumed tokens: 45078282240 | elapsed time per iteration (s): 0.56 | learning rate: 4.762E-05 | global batch size: 256 | lm loss: 2.613262E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.137 | TFLOPs: 43.49 | +7: iteration 85990/ 115203 | consumed samples: 22013440 | consumed tokens: 45083525120 | elapsed time per iteration (s): 0.57 | learning rate: 4.760E-05 | global batch size: 256 | lm loss: 2.604869E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.671 | TFLOPs: 43.16 | +0: [2023-03-17 02:19:02,901] [INFO] [logging.py:68:log_dist] [Rank 0] step=86000, skipped=0, lr=[4.7582977310170454e-05, 4.7582977310170454e-05, 4.7582977310170454e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 86000/ 115203 | consumed samples: 22016000 | consumed tokens: 45088768000 | elapsed time per iteration (s): 0.56 | learning rate: 4.758E-05 | global batch size: 256 | lm loss: 2.586035E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.783 | TFLOPs: 43.45 | +0: steps: 86000 loss: 2.5700 iter time (s): 0.563 samples/sec: 454.982 +7: iteration 86010/ 115203 | consumed samples: 22018560 | consumed tokens: 45094010880 | elapsed time per iteration (s): 0.56 | learning rate: 4.757E-05 | global batch size: 256 | lm loss: 2.591373E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.166 | TFLOPs: 43.87 | +7: iteration 86020/ 115203 | consumed samples: 22021120 | consumed tokens: 45099253760 | elapsed time per iteration (s): 0.56 | learning rate: 4.755E-05 | global batch size: 256 | lm loss: 2.599495E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.140 | TFLOPs: 43.49 | +7: iteration 86030/ 115203 | consumed samples: 22023680 | consumed tokens: 45104496640 | elapsed time per iteration (s): 0.55 | learning rate: 4.753E-05 | global batch size: 256 | lm loss: 2.596385E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.319 | TFLOPs: 43.98 | +7: iteration 86040/ 115203 | consumed samples: 22026240 | consumed tokens: 45109739520 | elapsed time per iteration (s): 0.57 | learning rate: 4.751E-05 | global batch size: 256 | lm loss: 2.604055E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.602 | TFLOPs: 42.96 | +7: iteration 86050/ 115203 | consumed samples: 22028800 | consumed tokens: 45114982400 | elapsed time per iteration (s): 0.57 | learning rate: 4.749E-05 | global batch size: 256 | lm loss: 2.596087E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.853 | TFLOPs: 42.89 | +7: iteration 86060/ 115203 | consumed samples: 22031360 | consumed tokens: 45120225280 | elapsed time per iteration (s): 0.55 | learning rate: 4.748E-05 | global batch size: 256 | lm loss: 2.588239E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.416 | TFLOPs: 43.99 | +7: iteration 86070/ 115203 | consumed samples: 22033920 | consumed tokens: 45125468160 | elapsed time per iteration (s): 0.56 | learning rate: 4.746E-05 | global batch size: 256 | lm loss: 2.591937E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.907 | TFLOPs: 43.85 | +7: iteration 86080/ 115203 | consumed samples: 22036480 | consumed tokens: 45130711040 | elapsed time per iteration (s): 0.55 | learning rate: 4.744E-05 | global batch size: 256 | lm loss: 2.601223E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.292 | TFLOPs: 43.98 | +7: iteration 86090/ 115203 | consumed samples: 22039040 | consumed tokens: 45135953920 | elapsed time per iteration (s): 0.56 | learning rate: 4.742E-05 | global batch size: 256 | lm loss: 2.598674E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.743 | TFLOPs: 43.45 | +7: iteration 86100/ 115203 | consumed samples: 22041600 | consumed tokens: 45141196800 | elapsed time per iteration (s): 0.56 | learning rate: 4.740E-05 | global batch size: 256 | lm loss: 2.598067E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.486 | TFLOPs: 43.71 | +7: iteration 86110/ 115203 | consumed samples: 22044160 | consumed tokens: 45146439680 | elapsed time per iteration (s): 0.55 | learning rate: 4.739E-05 | global batch size: 256 | lm loss: 2.602792E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 86120/ 115203 | consumed samples: 22046720 | consumed tokens: 45151682560 | elapsed time per iteration (s): 0.55 | learning rate: 4.737E-05 | global batch size: 256 | lm loss: 2.611993E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 86130/ 115203 | consumed samples: 22049280 | consumed tokens: 45156925440 | elapsed time per iteration (s): 0.56 | learning rate: 4.735E-05 | global batch size: 256 | lm loss: 2.609738E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.232 | TFLOPs: 43.97 | +7: iteration 86140/ 115203 | consumed samples: 22051840 | consumed tokens: 45162168320 | elapsed time per iteration (s): 0.55 | learning rate: 4.733E-05 | global batch size: 256 | lm loss: 2.610933E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.291 | TFLOPs: 43.98 | +7: iteration 86150/ 115203 | consumed samples: 22054400 | consumed tokens: 45167411200 | elapsed time per iteration (s): 0.56 | learning rate: 4.732E-05 | global batch size: 256 | lm loss: 2.609118E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.956 | TFLOPs: 43.28 | +7: iteration 86160/ 115203 | consumed samples: 22056960 | consumed tokens: 45172654080 | elapsed time per iteration (s): 0.56 | learning rate: 4.730E-05 | global batch size: 256 | lm loss: 2.604860E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.136 | TFLOPs: 43.77 | +7: iteration 86170/ 115203 | consumed samples: 22059520 | consumed tokens: 45177896960 | elapsed time per iteration (s): 0.56 | learning rate: 4.728E-05 | global batch size: 256 | lm loss: 2.599447E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.180 | TFLOPs: 43.68 | +7: iteration 86180/ 115203 | consumed samples: 22062080 | consumed tokens: 45183139840 | elapsed time per iteration (s): 0.55 | learning rate: 4.726E-05 | global batch size: 256 | lm loss: 2.594949E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 86190/ 115203 | consumed samples: 22064640 | consumed tokens: 45188382720 | elapsed time per iteration (s): 0.56 | learning rate: 4.724E-05 | global batch size: 256 | lm loss: 2.618489E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.389 | TFLOPs: 43.70 | +7: iteration 86200/ 115203 | consumed samples: 22067200 | consumed tokens: 45193625600 | elapsed time per iteration (s): 0.56 | learning rate: 4.723E-05 | global batch size: 256 | lm loss: 2.594877E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.721 | TFLOPs: 43.35 | +7: iteration 86210/ 115203 | consumed samples: 22069760 | consumed tokens: 45198868480 | elapsed time per iteration (s): 0.56 | learning rate: 4.721E-05 | global batch size: 256 | lm loss: 2.598500E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.876 | TFLOPs: 43.84 | +7: iteration 86220/ 115203 | consumed samples: 22072320 | consumed tokens: 45204111360 | elapsed time per iteration (s): 0.55 | learning rate: 4.719E-05 | global batch size: 256 | lm loss: 2.589101E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.322 | TFLOPs: 43.98 | +7: iteration 86230/ 115203 | consumed samples: 22074880 | consumed tokens: 45209354240 | elapsed time per iteration (s): 0.55 | learning rate: 4.717E-05 | global batch size: 256 | lm loss: 2.603640E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.274 | TFLOPs: 43.98 | +7: iteration 86240/ 115203 | consumed samples: 22077440 | consumed tokens: 45214597120 | elapsed time per iteration (s): 0.55 | learning rate: 4.716E-05 | global batch size: 256 | lm loss: 2.612580E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.574 | TFLOPs: 44.01 | +7: iteration 86250/ 115203 | consumed samples: 22080000 | consumed tokens: 45219840000 | elapsed time per iteration (s): 0.55 | learning rate: 4.714E-05 | global batch size: 256 | lm loss: 2.602731E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 86260/ 115203 | consumed samples: 22082560 | consumed tokens: 45225082880 | elapsed time per iteration (s): 0.55 | learning rate: 4.712E-05 | global batch size: 256 | lm loss: 2.605908E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 86270/ 115203 | consumed samples: 22085120 | consumed tokens: 45230325760 | elapsed time per iteration (s): 0.55 | learning rate: 4.710E-05 | global batch size: 256 | lm loss: 2.598373E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.311 | TFLOPs: 43.98 | +7: iteration 86280/ 115203 | consumed samples: 22087680 | consumed tokens: 45235568640 | elapsed time per iteration (s): 0.56 | learning rate: 4.708E-05 | global batch size: 256 | lm loss: 2.592941E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.804 | TFLOPs: 43.55 | +7: iteration 86290/ 115203 | consumed samples: 22090240 | consumed tokens: 45240811520 | elapsed time per iteration (s): 0.56 | learning rate: 4.707E-05 | global batch size: 256 | lm loss: 2.596346E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.037 | TFLOPs: 43.86 | +7: iteration 86300/ 115203 | consumed samples: 22092800 | consumed tokens: 45246054400 | elapsed time per iteration (s): 0.55 | learning rate: 4.705E-05 | global batch size: 256 | lm loss: 2.583969E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.417 | TFLOPs: 43.99 | +7: iteration 86310/ 115203 | consumed samples: 22095360 | consumed tokens: 45251297280 | elapsed time per iteration (s): 0.55 | learning rate: 4.703E-05 | global batch size: 256 | lm loss: 2.590166E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.549 | TFLOPs: 44.00 | +7: iteration 86320/ 115203 | consumed samples: 22097920 | consumed tokens: 45256540160 | elapsed time per iteration (s): 0.55 | learning rate: 4.701E-05 | global batch size: 256 | lm loss: 2.588338E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.391 | TFLOPs: 43.99 | +7: iteration 86330/ 115203 | consumed samples: 22100480 | consumed tokens: 45261783040 | elapsed time per iteration (s): 0.56 | learning rate: 4.700E-05 | global batch size: 256 | lm loss: 2.590905E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.357 | TFLOPs: 43.89 | +7: iteration 86340/ 115203 | consumed samples: 22103040 | consumed tokens: 45267025920 | elapsed time per iteration (s): 0.55 | learning rate: 4.698E-05 | global batch size: 256 | lm loss: 2.589971E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.623 | TFLOPs: 44.01 | +7: iteration 86350/ 115203 | consumed samples: 22105600 | consumed tokens: 45272268800 | elapsed time per iteration (s): 0.56 | learning rate: 4.696E-05 | global batch size: 256 | lm loss: 2.597686E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.676 | TFLOPs: 43.73 | +7: iteration 86360/ 115203 | consumed samples: 22108160 | consumed tokens: 45277511680 | elapsed time per iteration (s): 0.55 | learning rate: 4.694E-05 | global batch size: 256 | lm loss: 2.606201E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.590 | TFLOPs: 44.01 | +7: iteration 86370/ 115203 | consumed samples: 22110720 | consumed tokens: 45282754560 | elapsed time per iteration (s): 0.57 | learning rate: 4.693E-05 | global batch size: 256 | lm loss: 2.606327E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.806 | TFLOPs: 42.88 | +7: iteration 86380/ 115203 | consumed samples: 22113280 | consumed tokens: 45287997440 | elapsed time per iteration (s): 0.56 | learning rate: 4.691E-05 | global batch size: 256 | lm loss: 2.597426E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.258 | TFLOPs: 43.88 | +7: iteration 86390/ 115203 | consumed samples: 22115840 | consumed tokens: 45293240320 | elapsed time per iteration (s): 0.55 | learning rate: 4.689E-05 | global batch size: 256 | lm loss: 2.595080E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.287 | TFLOPs: 43.98 | +7: iteration 86400/ 115203 | consumed samples: 22118400 | consumed tokens: 45298483200 | elapsed time per iteration (s): 0.56 | learning rate: 4.687E-05 | global batch size: 256 | lm loss: 2.595414E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.820 | TFLOPs: 43.55 | +7: iteration 86410/ 115203 | consumed samples: 22120960 | consumed tokens: 45303726080 | elapsed time per iteration (s): 0.56 | learning rate: 4.685E-05 | global batch size: 256 | lm loss: 2.588546E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.242 | TFLOPs: 43.97 | +7: iteration 86420/ 115203 | consumed samples: 22123520 | consumed tokens: 45308968960 | elapsed time per iteration (s): 0.55 | learning rate: 4.684E-05 | global batch size: 256 | lm loss: 2.588624E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.569 | TFLOPs: 44.01 | +7: iteration 86430/ 115203 | consumed samples: 22126080 | consumed tokens: 45314211840 | elapsed time per iteration (s): 0.55 | learning rate: 4.682E-05 | global batch size: 256 | lm loss: 2.606751E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.302 | TFLOPs: 43.98 | +7: iteration 86440/ 115203 | consumed samples: 22128640 | consumed tokens: 45319454720 | elapsed time per iteration (s): 0.56 | learning rate: 4.680E-05 | global batch size: 256 | lm loss: 2.594148E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.068 | TFLOPs: 43.86 | +7: iteration 86450/ 115203 | consumed samples: 22131200 | consumed tokens: 45324697600 | elapsed time per iteration (s): 0.55 | learning rate: 4.678E-05 | global batch size: 256 | lm loss: 2.598154E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.410 | TFLOPs: 43.99 | +7: iteration 86460/ 115203 | consumed samples: 22133760 | consumed tokens: 45329940480 | elapsed time per iteration (s): 0.55 | learning rate: 4.677E-05 | global batch size: 256 | lm loss: 2.585607E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.321 | TFLOPs: 43.98 | +7: iteration 86470/ 115203 | consumed samples: 22136320 | consumed tokens: 45335183360 | elapsed time per iteration (s): 0.56 | learning rate: 4.675E-05 | global batch size: 256 | lm loss: 2.606875E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.615 | TFLOPs: 43.91 | +7: iteration 86480/ 115203 | consumed samples: 22138880 | consumed tokens: 45340426240 | elapsed time per iteration (s): 0.55 | learning rate: 4.673E-05 | global batch size: 256 | lm loss: 2.589353E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 86490/ 115203 | consumed samples: 22141440 | consumed tokens: 45345669120 | elapsed time per iteration (s): 0.55 | learning rate: 4.671E-05 | global batch size: 256 | lm loss: 2.600563E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.360 | TFLOPs: 43.99 | +7: iteration 86500/ 115203 | consumed samples: 22144000 | consumed tokens: 45350912000 | elapsed time per iteration (s): 0.55 | learning rate: 4.670E-05 | global batch size: 256 | lm loss: 2.598783E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.581 | TFLOPs: 44.01 | +7: iteration 86510/ 115203 | consumed samples: 22146560 | consumed tokens: 45356154880 | elapsed time per iteration (s): 0.56 | learning rate: 4.668E-05 | global batch size: 256 | lm loss: 2.598104E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.055 | TFLOPs: 43.48 | +7: iteration 86520/ 115203 | consumed samples: 22149120 | consumed tokens: 45361397760 | elapsed time per iteration (s): 0.55 | learning rate: 4.666E-05 | global batch size: 256 | lm loss: 2.591143E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.396 | TFLOPs: 43.99 | +7: iteration 86530/ 115203 | consumed samples: 22151680 | consumed tokens: 45366640640 | elapsed time per iteration (s): 0.56 | learning rate: 4.664E-05 | global batch size: 256 | lm loss: 2.610552E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.260 | TFLOPs: 43.98 | +7: iteration 86540/ 115203 | consumed samples: 22154240 | consumed tokens: 45371883520 | elapsed time per iteration (s): 0.55 | learning rate: 4.663E-05 | global batch size: 256 | lm loss: 2.610277E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.448 | TFLOPs: 43.99 | +7: iteration 86550/ 115203 | consumed samples: 22156800 | consumed tokens: 45377126400 | elapsed time per iteration (s): 0.55 | learning rate: 4.661E-05 | global batch size: 256 | lm loss: 2.592747E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 86560/ 115203 | consumed samples: 22159360 | consumed tokens: 45382369280 | elapsed time per iteration (s): 0.55 | learning rate: 4.659E-05 | global batch size: 256 | lm loss: 2.585225E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.357 | TFLOPs: 43.99 | +7: iteration 86570/ 115203 | consumed samples: 22161920 | consumed tokens: 45387612160 | elapsed time per iteration (s): 0.55 | learning rate: 4.657E-05 | global batch size: 256 | lm loss: 2.595370E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.499 | TFLOPs: 44.00 | +7: iteration 86580/ 115203 | consumed samples: 22164480 | consumed tokens: 45392855040 | elapsed time per iteration (s): 0.56 | learning rate: 4.656E-05 | global batch size: 256 | lm loss: 2.593007E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.431 | TFLOPs: 43.80 | +7: iteration 86590/ 115203 | consumed samples: 22167040 | consumed tokens: 45398097920 | elapsed time per iteration (s): 0.55 | learning rate: 4.654E-05 | global batch size: 256 | lm loss: 2.586711E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.646 | TFLOPs: 44.01 | +7: iteration 86600/ 115203 | consumed samples: 22169600 | consumed tokens: 45403340800 | elapsed time per iteration (s): 0.55 | learning rate: 4.652E-05 | global batch size: 256 | lm loss: 2.605865E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.505 | TFLOPs: 44.00 | +7: iteration 86610/ 115203 | consumed samples: 22172160 | consumed tokens: 45408583680 | elapsed time per iteration (s): 0.55 | learning rate: 4.650E-05 | global batch size: 256 | lm loss: 2.602679E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.530 | TFLOPs: 44.00 | +7: iteration 86620/ 115203 | consumed samples: 22174720 | consumed tokens: 45413826560 | elapsed time per iteration (s): 0.55 | learning rate: 4.648E-05 | global batch size: 256 | lm loss: 2.588854E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.413 | TFLOPs: 43.99 | +7: iteration 86630/ 115203 | consumed samples: 22177280 | consumed tokens: 45419069440 | elapsed time per iteration (s): 0.55 | learning rate: 4.647E-05 | global batch size: 256 | lm loss: 2.601130E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.598 | TFLOPs: 44.01 | +7: iteration 86640/ 115203 | consumed samples: 22179840 | consumed tokens: 45424312320 | elapsed time per iteration (s): 0.55 | learning rate: 4.645E-05 | global batch size: 256 | lm loss: 2.588307E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.626 | TFLOPs: 44.01 | +7: iteration 86650/ 115203 | consumed samples: 22182400 | consumed tokens: 45429555200 | elapsed time per iteration (s): 0.55 | learning rate: 4.643E-05 | global batch size: 256 | lm loss: 2.597847E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.403 | TFLOPs: 43.99 | +7: iteration 86660/ 115203 | consumed samples: 22184960 | consumed tokens: 45434798080 | elapsed time per iteration (s): 0.55 | learning rate: 4.641E-05 | global batch size: 256 | lm loss: 2.586373E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.364 | TFLOPs: 43.99 | +7: iteration 86670/ 115203 | consumed samples: 22187520 | consumed tokens: 45440040960 | elapsed time per iteration (s): 0.55 | learning rate: 4.640E-05 | global batch size: 256 | lm loss: 2.605244E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.586 | TFLOPs: 44.01 | +7: iteration 86680/ 115203 | consumed samples: 22190080 | consumed tokens: 45445283840 | elapsed time per iteration (s): 0.55 | learning rate: 4.638E-05 | global batch size: 256 | lm loss: 2.610860E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.410 | TFLOPs: 43.99 | +7: iteration 86690/ 115203 | consumed samples: 22192640 | consumed tokens: 45450526720 | elapsed time per iteration (s): 0.56 | learning rate: 4.636E-05 | global batch size: 256 | lm loss: 2.606020E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.972 | TFLOPs: 43.47 | +7: iteration 86700/ 115203 | consumed samples: 22195200 | consumed tokens: 45455769600 | elapsed time per iteration (s): 0.55 | learning rate: 4.634E-05 | global batch size: 256 | lm loss: 2.598121E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.283 | TFLOPs: 43.98 | +7: iteration 86710/ 115203 | consumed samples: 22197760 | consumed tokens: 45461012480 | elapsed time per iteration (s): 0.55 | learning rate: 4.633E-05 | global batch size: 256 | lm loss: 2.597020E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 86720/ 115203 | consumed samples: 22200320 | consumed tokens: 45466255360 | elapsed time per iteration (s): 0.55 | learning rate: 4.631E-05 | global batch size: 256 | lm loss: 2.618664E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.269 | TFLOPs: 43.98 | +7: iteration 86730/ 115203 | consumed samples: 22202880 | consumed tokens: 45471498240 | elapsed time per iteration (s): 0.55 | learning rate: 4.629E-05 | global batch size: 256 | lm loss: 2.592861E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.367 | TFLOPs: 43.99 | +7: iteration 86740/ 115203 | consumed samples: 22205440 | consumed tokens: 45476741120 | elapsed time per iteration (s): 0.55 | learning rate: 4.627E-05 | global batch size: 256 | lm loss: 2.598479E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.508 | TFLOPs: 44.00 | +7: iteration 86750/ 115203 | consumed samples: 22208000 | consumed tokens: 45481984000 | elapsed time per iteration (s): 0.55 | learning rate: 4.626E-05 | global batch size: 256 | lm loss: 2.603228E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 86760/ 115203 | consumed samples: 22210560 | consumed tokens: 45487226880 | elapsed time per iteration (s): 0.55 | learning rate: 4.624E-05 | global batch size: 256 | lm loss: 2.597025E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.508 | TFLOPs: 44.00 | +7: iteration 86770/ 115203 | consumed samples: 22213120 | consumed tokens: 45492469760 | elapsed time per iteration (s): 0.55 | learning rate: 4.622E-05 | global batch size: 256 | lm loss: 2.589113E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.431 | TFLOPs: 43.99 | +7: iteration 86780/ 115203 | consumed samples: 22215680 | consumed tokens: 45497712640 | elapsed time per iteration (s): 0.56 | learning rate: 4.620E-05 | global batch size: 256 | lm loss: 2.602977E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.897 | TFLOPs: 43.75 | +7: iteration 86790/ 115203 | consumed samples: 22218240 | consumed tokens: 45502955520 | elapsed time per iteration (s): 0.55 | learning rate: 4.619E-05 | global batch size: 256 | lm loss: 2.614233E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.521 | TFLOPs: 44.00 | +7: iteration 86800/ 115203 | consumed samples: 22220800 | consumed tokens: 45508198400 | elapsed time per iteration (s): 0.55 | learning rate: 4.617E-05 | global batch size: 256 | lm loss: 2.595389E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.477 | TFLOPs: 44.00 | +7: iteration 86810/ 115203 | consumed samples: 22223360 | consumed tokens: 45513441280 | elapsed time per iteration (s): 0.55 | learning rate: 4.615E-05 | global batch size: 256 | lm loss: 2.598919E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.392 | TFLOPs: 43.99 | +7: iteration 86820/ 115203 | consumed samples: 22225920 | consumed tokens: 45518684160 | elapsed time per iteration (s): 0.55 | learning rate: 4.613E-05 | global batch size: 256 | lm loss: 2.591182E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.496 | TFLOPs: 44.00 | +7: iteration 86830/ 115203 | consumed samples: 22228480 | consumed tokens: 45523927040 | elapsed time per iteration (s): 0.56 | learning rate: 4.612E-05 | global batch size: 256 | lm loss: 2.590359E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.243 | TFLOPs: 43.88 | +7: iteration 86840/ 115203 | consumed samples: 22231040 | consumed tokens: 45529169920 | elapsed time per iteration (s): 0.55 | learning rate: 4.610E-05 | global batch size: 256 | lm loss: 2.598921E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.317 | TFLOPs: 43.98 | +7: iteration 86850/ 115203 | consumed samples: 22233600 | consumed tokens: 45534412800 | elapsed time per iteration (s): 0.56 | learning rate: 4.608E-05 | global batch size: 256 | lm loss: 2.607223E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.768 | TFLOPs: 43.93 | +7: iteration 86860/ 115203 | consumed samples: 22236160 | consumed tokens: 45539655680 | elapsed time per iteration (s): 0.55 | learning rate: 4.606E-05 | global batch size: 256 | lm loss: 2.601926E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.401 | TFLOPs: 43.99 | +7: iteration 86870/ 115203 | consumed samples: 22238720 | consumed tokens: 45544898560 | elapsed time per iteration (s): 0.55 | learning rate: 4.605E-05 | global batch size: 256 | lm loss: 2.605497E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.415 | TFLOPs: 43.99 | +7: iteration 86880/ 115203 | consumed samples: 22241280 | consumed tokens: 45550141440 | elapsed time per iteration (s): 0.55 | learning rate: 4.603E-05 | global batch size: 256 | lm loss: 2.605024E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.341 | TFLOPs: 43.98 | +7: iteration 86890/ 115203 | consumed samples: 22243840 | consumed tokens: 45555384320 | elapsed time per iteration (s): 0.55 | learning rate: 4.601E-05 | global batch size: 256 | lm loss: 2.592954E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.325 | TFLOPs: 43.98 | +7: iteration 86900/ 115203 | consumed samples: 22246400 | consumed tokens: 45560627200 | elapsed time per iteration (s): 0.56 | learning rate: 4.599E-05 | global batch size: 256 | lm loss: 2.593052E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.788 | TFLOPs: 43.55 | +7: iteration 86910/ 115203 | consumed samples: 22248960 | consumed tokens: 45565870080 | elapsed time per iteration (s): 0.55 | learning rate: 4.598E-05 | global batch size: 256 | lm loss: 2.593778E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.458 | TFLOPs: 43.99 | +7: iteration 86920/ 115203 | consumed samples: 22251520 | consumed tokens: 45571112960 | elapsed time per iteration (s): 0.57 | learning rate: 4.596E-05 | global batch size: 256 | lm loss: 2.605102E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.211 | TFLOPs: 42.73 | +7: iteration 86930/ 115203 | consumed samples: 22254080 | consumed tokens: 45576355840 | elapsed time per iteration (s): 0.56 | learning rate: 4.594E-05 | global batch size: 256 | lm loss: 2.585957E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.822 | TFLOPs: 43.27 | +7: iteration 86940/ 115203 | consumed samples: 22256640 | consumed tokens: 45581598720 | elapsed time per iteration (s): 0.57 | learning rate: 4.593E-05 | global batch size: 256 | lm loss: 2.587556E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.175 | TFLOPs: 42.73 | +7: iteration 86950/ 115203 | consumed samples: 22259200 | consumed tokens: 45586841600 | elapsed time per iteration (s): 0.56 | learning rate: 4.591E-05 | global batch size: 256 | lm loss: 2.606084E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.172 | TFLOPs: 43.97 | +7: iteration 86960/ 115203 | consumed samples: 22261760 | consumed tokens: 45592084480 | elapsed time per iteration (s): 0.55 | learning rate: 4.589E-05 | global batch size: 256 | lm loss: 2.590934E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.382 | TFLOPs: 43.99 | +7: iteration 86970/ 115203 | consumed samples: 22264320 | consumed tokens: 45597327360 | elapsed time per iteration (s): 0.55 | learning rate: 4.587E-05 | global batch size: 256 | lm loss: 2.601510E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.319 | TFLOPs: 43.98 | +7: iteration 86980/ 115203 | consumed samples: 22266880 | consumed tokens: 45602570240 | elapsed time per iteration (s): 0.56 | learning rate: 4.586E-05 | global batch size: 256 | lm loss: 2.592917E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.191 | TFLOPs: 43.97 | +7: iteration 86990/ 115203 | consumed samples: 22269440 | consumed tokens: 45607813120 | elapsed time per iteration (s): 0.56 | learning rate: 4.584E-05 | global batch size: 256 | lm loss: 2.600905E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.046 | TFLOPs: 43.67 | +7: iteration 87000/ 115203 | consumed samples: 22272000 | consumed tokens: 45613056000 | elapsed time per iteration (s): 0.56 | learning rate: 4.582E-05 | global batch size: 256 | lm loss: 2.608314E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.599 | TFLOPs: 43.63 | +7: iteration 87010/ 115203 | consumed samples: 22274560 | consumed tokens: 45618298880 | elapsed time per iteration (s): 0.56 | learning rate: 4.580E-05 | global batch size: 256 | lm loss: 2.602100E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.368 | TFLOPs: 43.51 | +7: iteration 87020/ 115203 | consumed samples: 22277120 | consumed tokens: 45623541760 | elapsed time per iteration (s): 0.57 | learning rate: 4.579E-05 | global batch size: 256 | lm loss: 2.595862E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.996 | TFLOPs: 42.81 | +7: iteration 87030/ 115203 | consumed samples: 22279680 | consumed tokens: 45628784640 | elapsed time per iteration (s): 0.56 | learning rate: 4.577E-05 | global batch size: 256 | lm loss: 2.589859E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.841 | TFLOPs: 43.84 | +7: iteration 87040/ 115203 | consumed samples: 22282240 | consumed tokens: 45634027520 | elapsed time per iteration (s): 0.56 | learning rate: 4.575E-05 | global batch size: 256 | lm loss: 2.601173E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.320 | TFLOPs: 43.22 | +7: iteration 87050/ 115203 | consumed samples: 22284800 | consumed tokens: 45639270400 | elapsed time per iteration (s): 0.58 | learning rate: 4.573E-05 | global batch size: 256 | lm loss: 2.596424E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.359 | TFLOPs: 41.98 | +7: iteration 87060/ 115203 | consumed samples: 22287360 | consumed tokens: 45644513280 | elapsed time per iteration (s): 0.56 | learning rate: 4.572E-05 | global batch size: 256 | lm loss: 2.599860E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.985 | TFLOPs: 43.38 | +7: iteration 87070/ 115203 | consumed samples: 22289920 | consumed tokens: 45649756160 | elapsed time per iteration (s): 0.57 | learning rate: 4.570E-05 | global batch size: 256 | lm loss: 2.596058E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.626 | TFLOPs: 43.15 | +7: iteration 87080/ 115203 | consumed samples: 22292480 | consumed tokens: 45654999040 | elapsed time per iteration (s): 0.56 | learning rate: 4.568E-05 | global batch size: 256 | lm loss: 2.599818E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.450 | TFLOPs: 43.33 | +7: iteration 87090/ 115203 | consumed samples: 22295040 | consumed tokens: 45660241920 | elapsed time per iteration (s): 0.57 | learning rate: 4.566E-05 | global batch size: 256 | lm loss: 2.599842E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.864 | TFLOPs: 42.89 | +7: iteration 87100/ 115203 | consumed samples: 22297600 | consumed tokens: 45665484800 | elapsed time per iteration (s): 0.56 | learning rate: 4.565E-05 | global batch size: 256 | lm loss: 2.607208E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.353 | TFLOPs: 43.60 | +7: iteration 87110/ 115203 | consumed samples: 22300160 | consumed tokens: 45670727680 | elapsed time per iteration (s): 0.56 | learning rate: 4.563E-05 | global batch size: 256 | lm loss: 2.606924E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.476 | TFLOPs: 43.71 | +7: iteration 87120/ 115203 | consumed samples: 22302720 | consumed tokens: 45675970560 | elapsed time per iteration (s): 0.56 | learning rate: 4.561E-05 | global batch size: 256 | lm loss: 2.591614E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.974 | TFLOPs: 43.57 | +7: iteration 87130/ 115203 | consumed samples: 22305280 | consumed tokens: 45681213440 | elapsed time per iteration (s): 0.56 | learning rate: 4.560E-05 | global batch size: 256 | lm loss: 2.609774E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.476 | TFLOPs: 43.81 | +7: iteration 87140/ 115203 | consumed samples: 22307840 | consumed tokens: 45686456320 | elapsed time per iteration (s): 0.57 | learning rate: 4.558E-05 | global batch size: 256 | lm loss: 2.603962E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.707 | TFLOPs: 43.07 | +7: iteration 87150/ 115203 | consumed samples: 22310400 | consumed tokens: 45691699200 | elapsed time per iteration (s): 0.56 | learning rate: 4.556E-05 | global batch size: 256 | lm loss: 2.590083E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.354 | TFLOPs: 43.70 | +7: iteration 87160/ 115203 | consumed samples: 22312960 | consumed tokens: 45696942080 | elapsed time per iteration (s): 0.57 | learning rate: 4.554E-05 | global batch size: 256 | lm loss: 2.591407E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.940 | TFLOPs: 42.90 | +7: iteration 87170/ 115203 | consumed samples: 22315520 | consumed tokens: 45702184960 | elapsed time per iteration (s): 0.57 | learning rate: 4.553E-05 | global batch size: 256 | lm loss: 2.585303E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.027 | TFLOPs: 43.19 | +7: iteration 87180/ 115203 | consumed samples: 22318080 | consumed tokens: 45707427840 | elapsed time per iteration (s): 0.56 | learning rate: 4.551E-05 | global batch size: 256 | lm loss: 2.596782E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.250 | TFLOPs: 43.40 | +7: iteration 87190/ 115203 | consumed samples: 22320640 | consumed tokens: 45712670720 | elapsed time per iteration (s): 0.56 | learning rate: 4.549E-05 | global batch size: 256 | lm loss: 2.597286E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.757 | TFLOPs: 43.55 | +7: iteration 87200/ 115203 | consumed samples: 22323200 | consumed tokens: 45717913600 | elapsed time per iteration (s): 0.56 | learning rate: 4.547E-05 | global batch size: 256 | lm loss: 2.594750E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.214 | TFLOPs: 43.50 | +7: iteration 87210/ 115203 | consumed samples: 22325760 | consumed tokens: 45723156480 | elapsed time per iteration (s): 0.57 | learning rate: 4.546E-05 | global batch size: 256 | lm loss: 2.601976E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.627 | TFLOPs: 43.06 | +7: iteration 87220/ 115203 | consumed samples: 22328320 | consumed tokens: 45728399360 | elapsed time per iteration (s): 0.56 | learning rate: 4.544E-05 | global batch size: 256 | lm loss: 2.598408E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.029 | TFLOPs: 43.48 | +7: iteration 87230/ 115203 | consumed samples: 22330880 | consumed tokens: 45733642240 | elapsed time per iteration (s): 0.56 | learning rate: 4.542E-05 | global batch size: 256 | lm loss: 2.603620E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.193 | TFLOPs: 43.97 | +7: iteration 87240/ 115203 | consumed samples: 22333440 | consumed tokens: 45738885120 | elapsed time per iteration (s): 0.56 | learning rate: 4.541E-05 | global batch size: 256 | lm loss: 2.578022E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.607 | TFLOPs: 43.63 | +7: iteration 87250/ 115203 | consumed samples: 22336000 | consumed tokens: 45744128000 | elapsed time per iteration (s): 0.57 | learning rate: 4.539E-05 | global batch size: 256 | lm loss: 2.614249E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.044 | TFLOPs: 43.00 | +7: iteration 87260/ 115203 | consumed samples: 22338560 | consumed tokens: 45749370880 | elapsed time per iteration (s): 0.58 | learning rate: 4.537E-05 | global batch size: 256 | lm loss: 2.615952E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.677 | TFLOPs: 42.40 | +7: iteration 87270/ 115203 | consumed samples: 22341120 | consumed tokens: 45754613760 | elapsed time per iteration (s): 0.57 | learning rate: 4.535E-05 | global batch size: 256 | lm loss: 2.603094E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.414 | TFLOPs: 42.94 | +7: iteration 87280/ 115203 | consumed samples: 22343680 | consumed tokens: 45759856640 | elapsed time per iteration (s): 0.57 | learning rate: 4.534E-05 | global batch size: 256 | lm loss: 2.585562E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.630 | TFLOPs: 43.15 | +7: iteration 87290/ 115203 | consumed samples: 22346240 | consumed tokens: 45765099520 | elapsed time per iteration (s): 0.56 | learning rate: 4.532E-05 | global batch size: 256 | lm loss: 2.602736E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.679 | TFLOPs: 43.63 | +7: iteration 87300/ 115203 | consumed samples: 22348800 | consumed tokens: 45770342400 | elapsed time per iteration (s): 0.57 | learning rate: 4.530E-05 | global batch size: 256 | lm loss: 2.593034E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.764 | TFLOPs: 43.07 | +7: iteration 87310/ 115203 | consumed samples: 22351360 | consumed tokens: 45775585280 | elapsed time per iteration (s): 0.56 | learning rate: 4.528E-05 | global batch size: 256 | lm loss: 2.598767E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.303 | TFLOPs: 43.31 | +7: iteration 87320/ 115203 | consumed samples: 22353920 | consumed tokens: 45780828160 | elapsed time per iteration (s): 0.56 | learning rate: 4.527E-05 | global batch size: 256 | lm loss: 2.597213E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.807 | TFLOPs: 43.74 | +7: iteration 87330/ 115203 | consumed samples: 22356480 | consumed tokens: 45786071040 | elapsed time per iteration (s): 0.56 | learning rate: 4.525E-05 | global batch size: 256 | lm loss: 2.599725E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.921 | TFLOPs: 43.47 | +7: iteration 87340/ 115203 | consumed samples: 22359040 | consumed tokens: 45791313920 | elapsed time per iteration (s): 0.56 | learning rate: 4.523E-05 | global batch size: 256 | lm loss: 2.599800E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.140 | TFLOPs: 43.68 | +7: iteration 87350/ 115203 | consumed samples: 22361600 | consumed tokens: 45796556800 | elapsed time per iteration (s): 0.56 | learning rate: 4.522E-05 | global batch size: 256 | lm loss: 2.589174E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.997 | TFLOPs: 43.95 | +7: iteration 87360/ 115203 | consumed samples: 22364160 | consumed tokens: 45801799680 | elapsed time per iteration (s): 0.57 | learning rate: 4.520E-05 | global batch size: 256 | lm loss: 2.586157E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.205 | TFLOPs: 42.92 | +7: iteration 87370/ 115203 | consumed samples: 22366720 | consumed tokens: 45807042560 | elapsed time per iteration (s): 0.56 | learning rate: 4.518E-05 | global batch size: 256 | lm loss: 2.608742E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.696 | TFLOPs: 43.45 | +7: iteration 87380/ 115203 | consumed samples: 22369280 | consumed tokens: 45812285440 | elapsed time per iteration (s): 0.57 | learning rate: 4.516E-05 | global batch size: 256 | lm loss: 2.596589E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.119 | TFLOPs: 43.10 | +7: iteration 87390/ 115203 | consumed samples: 22371840 | consumed tokens: 45817528320 | elapsed time per iteration (s): 0.56 | learning rate: 4.515E-05 | global batch size: 256 | lm loss: 2.586543E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.014 | TFLOPs: 43.48 | +7: iteration 87400/ 115203 | consumed samples: 22374400 | consumed tokens: 45822771200 | elapsed time per iteration (s): 0.56 | learning rate: 4.513E-05 | global batch size: 256 | lm loss: 2.597799E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.939 | TFLOPs: 43.66 | +7: iteration 87410/ 115203 | consumed samples: 22376960 | consumed tokens: 45828014080 | elapsed time per iteration (s): 0.56 | learning rate: 4.511E-05 | global batch size: 256 | lm loss: 2.590078E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.475 | TFLOPs: 43.52 | +7: iteration 87420/ 115203 | consumed samples: 22379520 | consumed tokens: 45833256960 | elapsed time per iteration (s): 0.57 | learning rate: 4.510E-05 | global batch size: 256 | lm loss: 2.596676E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.605 | TFLOPs: 43.06 | +7: iteration 87430/ 115203 | consumed samples: 22382080 | consumed tokens: 45838499840 | elapsed time per iteration (s): 0.57 | learning rate: 4.508E-05 | global batch size: 256 | lm loss: 2.603680E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.905 | TFLOPs: 42.99 | +7: iteration 87440/ 115203 | consumed samples: 22384640 | consumed tokens: 45843742720 | elapsed time per iteration (s): 0.56 | learning rate: 4.506E-05 | global batch size: 256 | lm loss: 2.600932E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.261 | TFLOPs: 43.59 | +7: iteration 87450/ 115203 | consumed samples: 22387200 | consumed tokens: 45848985600 | elapsed time per iteration (s): 0.57 | learning rate: 4.504E-05 | global batch size: 256 | lm loss: 2.590448E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.640 | TFLOPs: 42.49 | +7: iteration 87460/ 115203 | consumed samples: 22389760 | consumed tokens: 45854228480 | elapsed time per iteration (s): 0.56 | learning rate: 4.503E-05 | global batch size: 256 | lm loss: 2.605523E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.876 | TFLOPs: 43.37 | +7: iteration 87470/ 115203 | consumed samples: 22392320 | consumed tokens: 45859471360 | elapsed time per iteration (s): 0.56 | learning rate: 4.501E-05 | global batch size: 256 | lm loss: 2.571997E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.124 | TFLOPs: 43.49 | +7: iteration 87480/ 115203 | consumed samples: 22394880 | consumed tokens: 45864714240 | elapsed time per iteration (s): 0.56 | learning rate: 4.499E-05 | global batch size: 256 | lm loss: 2.606405E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 87490/ 115203 | consumed samples: 22397440 | consumed tokens: 45869957120 | elapsed time per iteration (s): 0.56 | learning rate: 4.498E-05 | global batch size: 256 | lm loss: 2.598556E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.692 | TFLOPs: 43.45 | +7: iteration 87500/ 115203 | consumed samples: 22400000 | consumed tokens: 45875200000 | elapsed time per iteration (s): 0.56 | learning rate: 4.496E-05 | global batch size: 256 | lm loss: 2.578066E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.703 | TFLOPs: 43.35 | +7: iteration 87510/ 115203 | consumed samples: 22402560 | consumed tokens: 45880442880 | elapsed time per iteration (s): 0.56 | learning rate: 4.494E-05 | global batch size: 256 | lm loss: 2.605609E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.671 | TFLOPs: 43.25 | +7: iteration 87520/ 115203 | consumed samples: 22405120 | consumed tokens: 45885685760 | elapsed time per iteration (s): 0.57 | learning rate: 4.492E-05 | global batch size: 256 | lm loss: 2.603604E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.483 | TFLOPs: 43.14 | +7: iteration 87530/ 115203 | consumed samples: 22407680 | consumed tokens: 45890928640 | elapsed time per iteration (s): 0.55 | learning rate: 4.491E-05 | global batch size: 256 | lm loss: 2.612975E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.295 | TFLOPs: 43.98 | +7: iteration 87540/ 115203 | consumed samples: 22410240 | consumed tokens: 45896171520 | elapsed time per iteration (s): 0.56 | learning rate: 4.489E-05 | global batch size: 256 | lm loss: 2.593014E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.096 | TFLOPs: 43.58 | +7: iteration 87550/ 115203 | consumed samples: 22412800 | consumed tokens: 45901414400 | elapsed time per iteration (s): 0.56 | learning rate: 4.487E-05 | global batch size: 256 | lm loss: 2.598759E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.190 | TFLOPs: 43.78 | +7: iteration 87560/ 115203 | consumed samples: 22415360 | consumed tokens: 45906657280 | elapsed time per iteration (s): 0.56 | learning rate: 4.486E-05 | global batch size: 256 | lm loss: 2.595486E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.636 | TFLOPs: 43.44 | +7: iteration 87570/ 115203 | consumed samples: 22417920 | consumed tokens: 45911900160 | elapsed time per iteration (s): 0.57 | learning rate: 4.484E-05 | global batch size: 256 | lm loss: 2.586867E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.359 | TFLOPs: 42.94 | +7: iteration 87580/ 115203 | consumed samples: 22420480 | consumed tokens: 45917143040 | elapsed time per iteration (s): 0.57 | learning rate: 4.482E-05 | global batch size: 256 | lm loss: 2.598821E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.051 | TFLOPs: 42.91 | +7: iteration 87590/ 115203 | consumed samples: 22423040 | consumed tokens: 45922385920 | elapsed time per iteration (s): 0.57 | learning rate: 4.480E-05 | global batch size: 256 | lm loss: 2.586881E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.643 | TFLOPs: 43.15 | +7: iteration 87600/ 115203 | consumed samples: 22425600 | consumed tokens: 45927628800 | elapsed time per iteration (s): 0.56 | learning rate: 4.479E-05 | global batch size: 256 | lm loss: 2.585294E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.965 | TFLOPs: 43.66 | +7: iteration 87610/ 115203 | consumed samples: 22428160 | consumed tokens: 45932871680 | elapsed time per iteration (s): 0.56 | learning rate: 4.477E-05 | global batch size: 256 | lm loss: 2.606933E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.975 | TFLOPs: 43.28 | +7: iteration 87620/ 115203 | consumed samples: 22430720 | consumed tokens: 45938114560 | elapsed time per iteration (s): 0.56 | learning rate: 4.475E-05 | global batch size: 256 | lm loss: 2.601060E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.373 | TFLOPs: 43.51 | +7: iteration 87630/ 115203 | consumed samples: 22433280 | consumed tokens: 45943357440 | elapsed time per iteration (s): 0.56 | learning rate: 4.474E-05 | global batch size: 256 | lm loss: 2.589024E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.933 | TFLOPs: 43.47 | +7: iteration 87640/ 115203 | consumed samples: 22435840 | consumed tokens: 45948600320 | elapsed time per iteration (s): 0.57 | learning rate: 4.472E-05 | global batch size: 256 | lm loss: 2.601333E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.458 | TFLOPs: 42.95 | +7: iteration 87650/ 115203 | consumed samples: 22438400 | consumed tokens: 45953843200 | elapsed time per iteration (s): 0.56 | learning rate: 4.470E-05 | global batch size: 256 | lm loss: 2.591261E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.422 | TFLOPs: 43.32 | +7: iteration 87660/ 115203 | consumed samples: 22440960 | consumed tokens: 45959086080 | elapsed time per iteration (s): 0.57 | learning rate: 4.468E-05 | global batch size: 256 | lm loss: 2.602711E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.888 | TFLOPs: 42.99 | +7: iteration 87670/ 115203 | consumed samples: 22443520 | consumed tokens: 45964328960 | elapsed time per iteration (s): 0.58 | learning rate: 4.467E-05 | global batch size: 256 | lm loss: 2.597857E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.212 | TFLOPs: 41.97 | +7: iteration 87680/ 115203 | consumed samples: 22446080 | consumed tokens: 45969571840 | elapsed time per iteration (s): 0.57 | learning rate: 4.465E-05 | global batch size: 256 | lm loss: 2.598650E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.026 | TFLOPs: 42.52 | +7: iteration 87690/ 115203 | consumed samples: 22448640 | consumed tokens: 45974814720 | elapsed time per iteration (s): 0.60 | learning rate: 4.463E-05 | global batch size: 256 | lm loss: 2.603297E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.235 | TFLOPs: 40.92 | +7: iteration 87700/ 115203 | consumed samples: 22451200 | consumed tokens: 45980057600 | elapsed time per iteration (s): 0.58 | learning rate: 4.462E-05 | global batch size: 256 | lm loss: 2.600089E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.489 | TFLOPs: 42.19 | +7: iteration 87710/ 115203 | consumed samples: 22453760 | consumed tokens: 45985300480 | elapsed time per iteration (s): 0.57 | learning rate: 4.460E-05 | global batch size: 256 | lm loss: 2.594411E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.696 | TFLOPs: 42.68 | +7: iteration 87720/ 115203 | consumed samples: 22456320 | consumed tokens: 45990543360 | elapsed time per iteration (s): 0.56 | learning rate: 4.458E-05 | global batch size: 256 | lm loss: 2.619836E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.898 | TFLOPs: 43.46 | +7: iteration 87730/ 115203 | consumed samples: 22458880 | consumed tokens: 45995786240 | elapsed time per iteration (s): 0.61 | learning rate: 4.457E-05 | global batch size: 256 | lm loss: 2.591452E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.398 | TFLOPs: 40.27 | +7: iteration 87740/ 115203 | consumed samples: 22461440 | consumed tokens: 46001029120 | elapsed time per iteration (s): 0.57 | learning rate: 4.455E-05 | global batch size: 256 | lm loss: 2.601268E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.872 | TFLOPs: 42.70 | +7: iteration 87750/ 115203 | consumed samples: 22464000 | consumed tokens: 46006272000 | elapsed time per iteration (s): 0.58 | learning rate: 4.453E-05 | global batch size: 256 | lm loss: 2.597361E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.928 | TFLOPs: 41.75 | +7: iteration 87760/ 115203 | consumed samples: 22466560 | consumed tokens: 46011514880 | elapsed time per iteration (s): 0.58 | learning rate: 4.451E-05 | global batch size: 256 | lm loss: 2.598083E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.757 | TFLOPs: 41.83 | +7: iteration 87770/ 115203 | consumed samples: 22469120 | consumed tokens: 46016757760 | elapsed time per iteration (s): 0.58 | learning rate: 4.450E-05 | global batch size: 256 | lm loss: 2.603358E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.432 | TFLOPs: 41.90 | +7: iteration 87780/ 115203 | consumed samples: 22471680 | consumed tokens: 46022000640 | elapsed time per iteration (s): 0.58 | learning rate: 4.448E-05 | global batch size: 256 | lm loss: 2.607129E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.749 | TFLOPs: 41.73 | +7: iteration 87790/ 115203 | consumed samples: 22474240 | consumed tokens: 46027243520 | elapsed time per iteration (s): 0.59 | learning rate: 4.446E-05 | global batch size: 256 | lm loss: 2.587681E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.262 | TFLOPs: 41.59 | +7: iteration 87800/ 115203 | consumed samples: 22476800 | consumed tokens: 46032486400 | elapsed time per iteration (s): 0.57 | learning rate: 4.445E-05 | global batch size: 256 | lm loss: 2.566968E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.877 | TFLOPs: 42.60 | +7: iteration 87810/ 115203 | consumed samples: 22479360 | consumed tokens: 46037729280 | elapsed time per iteration (s): 0.57 | learning rate: 4.443E-05 | global batch size: 256 | lm loss: 2.586187E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.908 | TFLOPs: 42.61 | +7: iteration 87820/ 115203 | consumed samples: 22481920 | consumed tokens: 46042972160 | elapsed time per iteration (s): 0.60 | learning rate: 4.441E-05 | global batch size: 256 | lm loss: 2.600333E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.355 | TFLOPs: 40.36 | +7: iteration 87830/ 115203 | consumed samples: 22484480 | consumed tokens: 46048215040 | elapsed time per iteration (s): 0.57 | learning rate: 4.440E-05 | global batch size: 256 | lm loss: 2.608659E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.996 | TFLOPs: 42.52 | +7: iteration 87840/ 115203 | consumed samples: 22487040 | consumed tokens: 46053457920 | elapsed time per iteration (s): 0.58 | learning rate: 4.438E-05 | global batch size: 256 | lm loss: 2.606178E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.342 | TFLOPs: 41.98 | +7: iteration 87850/ 115203 | consumed samples: 22489600 | consumed tokens: 46058700800 | elapsed time per iteration (s): 0.57 | learning rate: 4.436E-05 | global batch size: 256 | lm loss: 2.601176E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.527 | TFLOPs: 42.95 | +7: iteration 87860/ 115203 | consumed samples: 22492160 | consumed tokens: 46063943680 | elapsed time per iteration (s): 0.57 | learning rate: 4.434E-05 | global batch size: 256 | lm loss: 2.604287E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.532 | TFLOPs: 42.57 | +7: iteration 87870/ 115203 | consumed samples: 22494720 | consumed tokens: 46069186560 | elapsed time per iteration (s): 0.56 | learning rate: 4.433E-05 | global batch size: 256 | lm loss: 2.583675E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.042 | TFLOPs: 43.38 | +7: iteration 87880/ 115203 | consumed samples: 22497280 | consumed tokens: 46074429440 | elapsed time per iteration (s): 0.58 | learning rate: 4.431E-05 | global batch size: 256 | lm loss: 2.604695E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.136 | TFLOPs: 41.96 | +7: iteration 87890/ 115203 | consumed samples: 22499840 | consumed tokens: 46079672320 | elapsed time per iteration (s): 0.60 | learning rate: 4.429E-05 | global batch size: 256 | lm loss: 2.586533E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.927 | TFLOPs: 40.51 | +7: iteration 87900/ 115203 | consumed samples: 22502400 | consumed tokens: 46084915200 | elapsed time per iteration (s): 0.58 | learning rate: 4.428E-05 | global batch size: 256 | lm loss: 2.606472E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.109 | TFLOPs: 42.25 | +7: iteration 87910/ 115203 | consumed samples: 22504960 | consumed tokens: 46090158080 | elapsed time per iteration (s): 0.59 | learning rate: 4.426E-05 | global batch size: 256 | lm loss: 2.598974E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.079 | TFLOPs: 41.58 | +7: iteration 87920/ 115203 | consumed samples: 22507520 | consumed tokens: 46095400960 | elapsed time per iteration (s): 0.56 | learning rate: 4.424E-05 | global batch size: 256 | lm loss: 2.591225E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.092 | TFLOPs: 43.39 | +7: iteration 87930/ 115203 | consumed samples: 22510080 | consumed tokens: 46100643840 | elapsed time per iteration (s): 0.58 | learning rate: 4.423E-05 | global batch size: 256 | lm loss: 2.581824E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.027 | TFLOPs: 42.14 | +7: iteration 87940/ 115203 | consumed samples: 22512640 | consumed tokens: 46105886720 | elapsed time per iteration (s): 0.58 | learning rate: 4.421E-05 | global batch size: 256 | lm loss: 2.587973E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.026 | TFLOPs: 42.33 | +7: iteration 87950/ 115203 | consumed samples: 22515200 | consumed tokens: 46111129600 | elapsed time per iteration (s): 0.57 | learning rate: 4.419E-05 | global batch size: 256 | lm loss: 2.591006E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.773 | TFLOPs: 42.59 | +7: iteration 87960/ 115203 | consumed samples: 22517760 | consumed tokens: 46116372480 | elapsed time per iteration (s): 0.59 | learning rate: 4.418E-05 | global batch size: 256 | lm loss: 2.590571E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.083 | TFLOPs: 41.10 | +7: iteration 87970/ 115203 | consumed samples: 22520320 | consumed tokens: 46121615360 | elapsed time per iteration (s): 0.56 | learning rate: 4.416E-05 | global batch size: 256 | lm loss: 2.603377E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.441 | TFLOPs: 43.71 | +7: iteration 87980/ 115203 | consumed samples: 22522880 | consumed tokens: 46126858240 | elapsed time per iteration (s): 0.58 | learning rate: 4.414E-05 | global batch size: 256 | lm loss: 2.578965E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.009 | TFLOPs: 42.43 | +7: iteration 87990/ 115203 | consumed samples: 22525440 | consumed tokens: 46132101120 | elapsed time per iteration (s): 0.59 | learning rate: 4.412E-05 | global batch size: 256 | lm loss: 2.594711E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.082 | TFLOPs: 41.19 | +0: [2023-03-17 02:37:48,386] [INFO] [logging.py:68:log_dist] [Rank 0] step=88000, skipped=0, lr=[4.410744818232367e-05, 4.410744818232367e-05, 4.410744818232367e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 88000/ 115203 | consumed samples: 22528000 | consumed tokens: 46137344000 | elapsed time per iteration (s): 0.56 | learning rate: 4.411E-05 | global batch size: 256 | lm loss: 2.589531E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.843 | TFLOPs: 43.65 | +0: steps: 88000 loss: 2.6028 iter time (s): 0.560 samples/sec: 456.913 +7: iteration 88010/ 115203 | consumed samples: 22530560 | consumed tokens: 46142586880 | elapsed time per iteration (s): 0.62 | learning rate: 4.409E-05 | global batch size: 256 | lm loss: 2.601008E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 409.941 | TFLOPs: 39.08 | +7: iteration 88020/ 115203 | consumed samples: 22533120 | consumed tokens: 46147829760 | elapsed time per iteration (s): 0.64 | learning rate: 4.407E-05 | global batch size: 256 | lm loss: 2.599919E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 397.478 | TFLOPs: 37.90 | +7: iteration 88030/ 115203 | consumed samples: 22535680 | consumed tokens: 46153072640 | elapsed time per iteration (s): 0.59 | learning rate: 4.406E-05 | global batch size: 256 | lm loss: 2.602048E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.379 | TFLOPs: 41.60 | +7: iteration 88040/ 115203 | consumed samples: 22538240 | consumed tokens: 46158315520 | elapsed time per iteration (s): 0.57 | learning rate: 4.404E-05 | global batch size: 256 | lm loss: 2.589918E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.515 | TFLOPs: 42.57 | +7: iteration 88050/ 115203 | consumed samples: 22540800 | consumed tokens: 46163558400 | elapsed time per iteration (s): 0.60 | learning rate: 4.402E-05 | global batch size: 256 | lm loss: 2.600596E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.656 | TFLOPs: 40.39 | +7: iteration 88060/ 115203 | consumed samples: 22543360 | consumed tokens: 46168801280 | elapsed time per iteration (s): 0.59 | learning rate: 4.401E-05 | global batch size: 256 | lm loss: 2.602711E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.907 | TFLOPs: 41.18 | +7: iteration 88070/ 115203 | consumed samples: 22545920 | consumed tokens: 46174044160 | elapsed time per iteration (s): 0.60 | learning rate: 4.399E-05 | global batch size: 256 | lm loss: 2.612681E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.859 | TFLOPs: 40.70 | +7: iteration 88080/ 115203 | consumed samples: 22548480 | consumed tokens: 46179287040 | elapsed time per iteration (s): 0.59 | learning rate: 4.397E-05 | global batch size: 256 | lm loss: 2.593666E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.955 | TFLOPs: 41.47 | +7: iteration 88090/ 115203 | consumed samples: 22551040 | consumed tokens: 46184529920 | elapsed time per iteration (s): 0.59 | learning rate: 4.396E-05 | global batch size: 256 | lm loss: 2.593530E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.884 | TFLOPs: 41.65 | +7: iteration 88100/ 115203 | consumed samples: 22553600 | consumed tokens: 46189772800 | elapsed time per iteration (s): 0.58 | learning rate: 4.394E-05 | global batch size: 256 | lm loss: 2.602230E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.072 | TFLOPs: 42.34 | +7: iteration 88110/ 115203 | consumed samples: 22556160 | consumed tokens: 46195015680 | elapsed time per iteration (s): 0.60 | learning rate: 4.392E-05 | global batch size: 256 | lm loss: 2.601589E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.616 | TFLOPs: 40.77 | +7: iteration 88120/ 115203 | consumed samples: 22558720 | consumed tokens: 46200258560 | elapsed time per iteration (s): 0.60 | learning rate: 4.391E-05 | global batch size: 256 | lm loss: 2.588314E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.206 | TFLOPs: 40.92 | +7: iteration 88130/ 115203 | consumed samples: 22561280 | consumed tokens: 46205501440 | elapsed time per iteration (s): 0.58 | learning rate: 4.389E-05 | global batch size: 256 | lm loss: 2.601526E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.268 | TFLOPs: 41.78 | +7: iteration 88140/ 115203 | consumed samples: 22563840 | consumed tokens: 46210744320 | elapsed time per iteration (s): 0.60 | learning rate: 4.387E-05 | global batch size: 256 | lm loss: 2.593549E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.899 | TFLOPs: 40.89 | +7: iteration 88150/ 115203 | consumed samples: 22566400 | consumed tokens: 46215987200 | elapsed time per iteration (s): 0.58 | learning rate: 4.385E-05 | global batch size: 256 | lm loss: 2.595663E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.371 | TFLOPs: 41.79 | +7: iteration 88160/ 115203 | consumed samples: 22568960 | consumed tokens: 46221230080 | elapsed time per iteration (s): 0.59 | learning rate: 4.384E-05 | global batch size: 256 | lm loss: 2.608486E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.001 | TFLOPs: 41.38 | +7: iteration 88170/ 115203 | consumed samples: 22571520 | consumed tokens: 46226472960 | elapsed time per iteration (s): 0.61 | learning rate: 4.382E-05 | global batch size: 256 | lm loss: 2.610446E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.600 | TFLOPs: 40.20 | +7: iteration 88180/ 115203 | consumed samples: 22574080 | consumed tokens: 46231715840 | elapsed time per iteration (s): 0.58 | learning rate: 4.380E-05 | global batch size: 256 | lm loss: 2.587828E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.577 | TFLOPs: 42.10 | +7: iteration 88190/ 115203 | consumed samples: 22576640 | consumed tokens: 46236958720 | elapsed time per iteration (s): 0.58 | learning rate: 4.379E-05 | global batch size: 256 | lm loss: 2.595202E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.347 | TFLOPs: 42.27 | +7: iteration 88200/ 115203 | consumed samples: 22579200 | consumed tokens: 46242201600 | elapsed time per iteration (s): 0.56 | learning rate: 4.377E-05 | global batch size: 256 | lm loss: 2.593087E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.979 | TFLOPs: 43.28 | +7: iteration 88210/ 115203 | consumed samples: 22581760 | consumed tokens: 46247444480 | elapsed time per iteration (s): 0.60 | learning rate: 4.375E-05 | global batch size: 256 | lm loss: 2.592224E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.160 | TFLOPs: 40.92 | +7: iteration 88220/ 115203 | consumed samples: 22584320 | consumed tokens: 46252687360 | elapsed time per iteration (s): 0.63 | learning rate: 4.374E-05 | global batch size: 256 | lm loss: 2.590185E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 406.895 | TFLOPs: 38.79 | +7: iteration 88230/ 115203 | consumed samples: 22586880 | consumed tokens: 46257930240 | elapsed time per iteration (s): 0.57 | learning rate: 4.372E-05 | global batch size: 256 | lm loss: 2.587376E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.685 | TFLOPs: 42.68 | +7: iteration 88240/ 115203 | consumed samples: 22589440 | consumed tokens: 46263173120 | elapsed time per iteration (s): 0.58 | learning rate: 4.370E-05 | global batch size: 256 | lm loss: 2.585751E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.538 | TFLOPs: 42.19 | +7: iteration 88250/ 115203 | consumed samples: 22592000 | consumed tokens: 46268416000 | elapsed time per iteration (s): 0.58 | learning rate: 4.369E-05 | global batch size: 256 | lm loss: 2.585640E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.354 | TFLOPs: 41.98 | +7: iteration 88260/ 115203 | consumed samples: 22594560 | consumed tokens: 46273658880 | elapsed time per iteration (s): 0.58 | learning rate: 4.367E-05 | global batch size: 256 | lm loss: 2.598276E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.521 | TFLOPs: 41.90 | +7: iteration 88270/ 115203 | consumed samples: 22597120 | consumed tokens: 46278901760 | elapsed time per iteration (s): 0.60 | learning rate: 4.365E-05 | global batch size: 256 | lm loss: 2.576882E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.558 | TFLOPs: 40.76 | +7: iteration 88280/ 115203 | consumed samples: 22599680 | consumed tokens: 46284144640 | elapsed time per iteration (s): 0.57 | learning rate: 4.364E-05 | global batch size: 256 | lm loss: 2.600910E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.523 | TFLOPs: 42.95 | +7: iteration 88290/ 115203 | consumed samples: 22602240 | consumed tokens: 46289387520 | elapsed time per iteration (s): 0.58 | learning rate: 4.362E-05 | global batch size: 256 | lm loss: 2.612197E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.085 | TFLOPs: 42.15 | +7: iteration 88300/ 115203 | consumed samples: 22604800 | consumed tokens: 46294630400 | elapsed time per iteration (s): 0.58 | learning rate: 4.360E-05 | global batch size: 256 | lm loss: 2.593340E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.357 | TFLOPs: 41.79 | +7: iteration 88310/ 115203 | consumed samples: 22607360 | consumed tokens: 46299873280 | elapsed time per iteration (s): 0.58 | learning rate: 4.359E-05 | global batch size: 256 | lm loss: 2.589197E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.034 | TFLOPs: 41.95 | +7: iteration 88320/ 115203 | consumed samples: 22609920 | consumed tokens: 46305116160 | elapsed time per iteration (s): 0.59 | learning rate: 4.357E-05 | global batch size: 256 | lm loss: 2.606293E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.615 | TFLOPs: 41.34 | +7: iteration 88330/ 115203 | consumed samples: 22612480 | consumed tokens: 46310359040 | elapsed time per iteration (s): 0.58 | learning rate: 4.355E-05 | global batch size: 256 | lm loss: 2.602961E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.267 | TFLOPs: 42.17 | +7: iteration 88340/ 115203 | consumed samples: 22615040 | consumed tokens: 46315601920 | elapsed time per iteration (s): 0.58 | learning rate: 4.354E-05 | global batch size: 256 | lm loss: 2.600460E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.248 | TFLOPs: 42.07 | +7: iteration 88350/ 115203 | consumed samples: 22617600 | consumed tokens: 46320844800 | elapsed time per iteration (s): 0.58 | learning rate: 4.352E-05 | global batch size: 256 | lm loss: 2.609833E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.134 | TFLOPs: 42.34 | +7: iteration 88360/ 115203 | consumed samples: 22620160 | consumed tokens: 46326087680 | elapsed time per iteration (s): 0.57 | learning rate: 4.350E-05 | global batch size: 256 | lm loss: 2.604572E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.480 | TFLOPs: 42.95 | +7: iteration 88370/ 115203 | consumed samples: 22622720 | consumed tokens: 46331330560 | elapsed time per iteration (s): 0.59 | learning rate: 4.349E-05 | global batch size: 256 | lm loss: 2.589667E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.440 | TFLOPs: 41.42 | +7: iteration 88380/ 115203 | consumed samples: 22625280 | consumed tokens: 46336573440 | elapsed time per iteration (s): 0.57 | learning rate: 4.347E-05 | global batch size: 256 | lm loss: 2.611003E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.850 | TFLOPs: 43.17 | +7: iteration 88390/ 115203 | consumed samples: 22627840 | consumed tokens: 46341816320 | elapsed time per iteration (s): 0.58 | learning rate: 4.345E-05 | global batch size: 256 | lm loss: 2.587490E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.952 | TFLOPs: 42.42 | +7: iteration 88400/ 115203 | consumed samples: 22630400 | consumed tokens: 46347059200 | elapsed time per iteration (s): 0.57 | learning rate: 4.344E-05 | global batch size: 256 | lm loss: 2.597593E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.192 | TFLOPs: 42.83 | +7: iteration 88410/ 115203 | consumed samples: 22632960 | consumed tokens: 46352302080 | elapsed time per iteration (s): 0.57 | learning rate: 4.342E-05 | global batch size: 256 | lm loss: 2.592585E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.207 | TFLOPs: 42.64 | +7: iteration 88420/ 115203 | consumed samples: 22635520 | consumed tokens: 46357544960 | elapsed time per iteration (s): 0.59 | learning rate: 4.340E-05 | global batch size: 256 | lm loss: 2.602666E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.408 | TFLOPs: 41.32 | +7: iteration 88430/ 115203 | consumed samples: 22638080 | consumed tokens: 46362787840 | elapsed time per iteration (s): 0.57 | learning rate: 4.339E-05 | global batch size: 256 | lm loss: 2.607890E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.237 | TFLOPs: 42.54 | +7: iteration 88440/ 115203 | consumed samples: 22640640 | consumed tokens: 46368030720 | elapsed time per iteration (s): 0.57 | learning rate: 4.337E-05 | global batch size: 256 | lm loss: 2.583395E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.435 | TFLOPs: 42.75 | +7: iteration 88450/ 115203 | consumed samples: 22643200 | consumed tokens: 46373273600 | elapsed time per iteration (s): 0.58 | learning rate: 4.335E-05 | global batch size: 256 | lm loss: 2.587564E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.066 | TFLOPs: 42.24 | +7: iteration 88460/ 115203 | consumed samples: 22645760 | consumed tokens: 46378516480 | elapsed time per iteration (s): 0.58 | learning rate: 4.334E-05 | global batch size: 256 | lm loss: 2.596165E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.927 | TFLOPs: 41.75 | +7: iteration 88470/ 115203 | consumed samples: 22648320 | consumed tokens: 46383759360 | elapsed time per iteration (s): 0.57 | learning rate: 4.332E-05 | global batch size: 256 | lm loss: 2.596360E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.910 | TFLOPs: 42.70 | +7: iteration 88480/ 115203 | consumed samples: 22650880 | consumed tokens: 46389002240 | elapsed time per iteration (s): 0.57 | learning rate: 4.330E-05 | global batch size: 256 | lm loss: 2.599829E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.676 | TFLOPs: 42.97 | +7: iteration 88490/ 115203 | consumed samples: 22653440 | consumed tokens: 46394245120 | elapsed time per iteration (s): 0.59 | learning rate: 4.329E-05 | global batch size: 256 | lm loss: 2.583480E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.373 | TFLOPs: 41.22 | +7: iteration 88500/ 115203 | consumed samples: 22656000 | consumed tokens: 46399488000 | elapsed time per iteration (s): 0.59 | learning rate: 4.327E-05 | global batch size: 256 | lm loss: 2.585922E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.368 | TFLOPs: 41.70 | +7: iteration 88510/ 115203 | consumed samples: 22658560 | consumed tokens: 46404730880 | elapsed time per iteration (s): 0.58 | learning rate: 4.325E-05 | global batch size: 256 | lm loss: 2.601552E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.873 | TFLOPs: 41.94 | +7: iteration 88520/ 115203 | consumed samples: 22661120 | consumed tokens: 46409973760 | elapsed time per iteration (s): 0.58 | learning rate: 4.324E-05 | global batch size: 256 | lm loss: 2.612908E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.680 | TFLOPs: 42.20 | +7: iteration 88530/ 115203 | consumed samples: 22663680 | consumed tokens: 46415216640 | elapsed time per iteration (s): 0.57 | learning rate: 4.322E-05 | global batch size: 256 | lm loss: 2.587191E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.881 | TFLOPs: 42.99 | +7: iteration 88540/ 115203 | consumed samples: 22666240 | consumed tokens: 46420459520 | elapsed time per iteration (s): 0.58 | learning rate: 4.320E-05 | global batch size: 256 | lm loss: 2.601249E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.218 | TFLOPs: 42.26 | +7: iteration 88550/ 115203 | consumed samples: 22668800 | consumed tokens: 46425702400 | elapsed time per iteration (s): 0.57 | learning rate: 4.319E-05 | global batch size: 256 | lm loss: 2.593362E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.537 | TFLOPs: 43.05 | +7: iteration 88560/ 115203 | consumed samples: 22671360 | consumed tokens: 46430945280 | elapsed time per iteration (s): 0.57 | learning rate: 4.317E-05 | global batch size: 256 | lm loss: 2.601724E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.349 | TFLOPs: 42.84 | +7: iteration 88570/ 115203 | consumed samples: 22673920 | consumed tokens: 46436188160 | elapsed time per iteration (s): 0.57 | learning rate: 4.315E-05 | global batch size: 256 | lm loss: 2.588990E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.472 | TFLOPs: 42.95 | +7: iteration 88580/ 115203 | consumed samples: 22676480 | consumed tokens: 46441431040 | elapsed time per iteration (s): 0.56 | learning rate: 4.314E-05 | global batch size: 256 | lm loss: 2.604652E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.031 | TFLOPs: 43.57 | +7: iteration 88590/ 115203 | consumed samples: 22679040 | consumed tokens: 46446673920 | elapsed time per iteration (s): 0.58 | learning rate: 4.312E-05 | global batch size: 256 | lm loss: 2.598503E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.131 | TFLOPs: 42.34 | +7: iteration 88600/ 115203 | consumed samples: 22681600 | consumed tokens: 46451916800 | elapsed time per iteration (s): 0.56 | learning rate: 4.310E-05 | global batch size: 256 | lm loss: 2.583049E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.375 | TFLOPs: 43.42 | +7: iteration 88610/ 115203 | consumed samples: 22684160 | consumed tokens: 46457159680 | elapsed time per iteration (s): 0.56 | learning rate: 4.309E-05 | global batch size: 256 | lm loss: 2.587299E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.194 | TFLOPs: 43.21 | +7: iteration 88620/ 115203 | consumed samples: 22686720 | consumed tokens: 46462402560 | elapsed time per iteration (s): 0.56 | learning rate: 4.307E-05 | global batch size: 256 | lm loss: 2.603317E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.256 | TFLOPs: 43.21 | +7: iteration 88630/ 115203 | consumed samples: 22689280 | consumed tokens: 46467645440 | elapsed time per iteration (s): 0.57 | learning rate: 4.305E-05 | global batch size: 256 | lm loss: 2.598462E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.827 | TFLOPs: 42.89 | +7: iteration 88640/ 115203 | consumed samples: 22691840 | consumed tokens: 46472888320 | elapsed time per iteration (s): 0.57 | learning rate: 4.304E-05 | global batch size: 256 | lm loss: 2.595780E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.296 | TFLOPs: 42.84 | +7: iteration 88650/ 115203 | consumed samples: 22694400 | consumed tokens: 46478131200 | elapsed time per iteration (s): 0.58 | learning rate: 4.302E-05 | global batch size: 256 | lm loss: 2.599974E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.198 | TFLOPs: 42.44 | +7: iteration 88660/ 115203 | consumed samples: 22696960 | consumed tokens: 46483374080 | elapsed time per iteration (s): 0.58 | learning rate: 4.300E-05 | global batch size: 256 | lm loss: 2.598582E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.316 | TFLOPs: 42.27 | +7: iteration 88670/ 115203 | consumed samples: 22699520 | consumed tokens: 46488616960 | elapsed time per iteration (s): 0.56 | learning rate: 4.299E-05 | global batch size: 256 | lm loss: 2.592741E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.956 | TFLOPs: 43.47 | +7: iteration 88680/ 115203 | consumed samples: 22702080 | consumed tokens: 46493859840 | elapsed time per iteration (s): 0.56 | learning rate: 4.297E-05 | global batch size: 256 | lm loss: 2.603467E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.710 | TFLOPs: 43.35 | +7: iteration 88690/ 115203 | consumed samples: 22704640 | consumed tokens: 46499102720 | elapsed time per iteration (s): 0.56 | learning rate: 4.295E-05 | global batch size: 256 | lm loss: 2.597305E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.754 | TFLOPs: 43.55 | +7: iteration 88700/ 115203 | consumed samples: 22707200 | consumed tokens: 46504345600 | elapsed time per iteration (s): 0.56 | learning rate: 4.294E-05 | global batch size: 256 | lm loss: 2.598264E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.303 | TFLOPs: 43.22 | +7: iteration 88710/ 115203 | consumed samples: 22709760 | consumed tokens: 46509588480 | elapsed time per iteration (s): 0.58 | learning rate: 4.292E-05 | global batch size: 256 | lm loss: 2.593100E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.886 | TFLOPs: 42.03 | +7: iteration 88720/ 115203 | consumed samples: 22712320 | consumed tokens: 46514831360 | elapsed time per iteration (s): 0.58 | learning rate: 4.290E-05 | global batch size: 256 | lm loss: 2.589600E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.104 | TFLOPs: 42.34 | +7: iteration 88730/ 115203 | consumed samples: 22714880 | consumed tokens: 46520074240 | elapsed time per iteration (s): 0.58 | learning rate: 4.289E-05 | global batch size: 256 | lm loss: 2.586189E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.041 | TFLOPs: 42.33 | +7: iteration 88740/ 115203 | consumed samples: 22717440 | consumed tokens: 46525317120 | elapsed time per iteration (s): 0.55 | learning rate: 4.287E-05 | global batch size: 256 | lm loss: 2.588125E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.630 | TFLOPs: 44.01 | +7: iteration 88750/ 115203 | consumed samples: 22720000 | consumed tokens: 46530560000 | elapsed time per iteration (s): 0.57 | learning rate: 4.286E-05 | global batch size: 256 | lm loss: 2.604229E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.172 | TFLOPs: 42.73 | +7: iteration 88760/ 115203 | consumed samples: 22722560 | consumed tokens: 46535802880 | elapsed time per iteration (s): 0.57 | learning rate: 4.284E-05 | global batch size: 256 | lm loss: 2.591199E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.836 | TFLOPs: 42.98 | +7: iteration 88770/ 115203 | consumed samples: 22725120 | consumed tokens: 46541045760 | elapsed time per iteration (s): 0.58 | learning rate: 4.282E-05 | global batch size: 256 | lm loss: 2.595852E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.673 | TFLOPs: 42.30 | +7: iteration 88780/ 115203 | consumed samples: 22727680 | consumed tokens: 46546288640 | elapsed time per iteration (s): 0.58 | learning rate: 4.281E-05 | global batch size: 256 | lm loss: 2.595039E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.794 | TFLOPs: 42.31 | +7: iteration 88790/ 115203 | consumed samples: 22730240 | consumed tokens: 46551531520 | elapsed time per iteration (s): 0.58 | learning rate: 4.279E-05 | global batch size: 256 | lm loss: 2.591873E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.650 | TFLOPs: 41.73 | +7: iteration 88800/ 115203 | consumed samples: 22732800 | consumed tokens: 46556774400 | elapsed time per iteration (s): 0.58 | learning rate: 4.277E-05 | global batch size: 256 | lm loss: 2.582717E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.389 | TFLOPs: 42.37 | +7: iteration 88810/ 115203 | consumed samples: 22735360 | consumed tokens: 46562017280 | elapsed time per iteration (s): 0.57 | learning rate: 4.276E-05 | global batch size: 256 | lm loss: 2.597863E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.263 | TFLOPs: 42.64 | +7: iteration 88820/ 115203 | consumed samples: 22737920 | consumed tokens: 46567260160 | elapsed time per iteration (s): 0.57 | learning rate: 4.274E-05 | global batch size: 256 | lm loss: 2.590916E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.603 | TFLOPs: 42.67 | +7: iteration 88830/ 115203 | consumed samples: 22740480 | consumed tokens: 46572503040 | elapsed time per iteration (s): 0.58 | learning rate: 4.272E-05 | global batch size: 256 | lm loss: 2.578502E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.533 | TFLOPs: 41.90 | +7: iteration 88840/ 115203 | consumed samples: 22743040 | consumed tokens: 46577745920 | elapsed time per iteration (s): 0.57 | learning rate: 4.271E-05 | global batch size: 256 | lm loss: 2.609168E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.865 | TFLOPs: 42.99 | +7: iteration 88850/ 115203 | consumed samples: 22745600 | consumed tokens: 46582988800 | elapsed time per iteration (s): 0.58 | learning rate: 4.269E-05 | global batch size: 256 | lm loss: 2.601023E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.428 | TFLOPs: 42.18 | +7: iteration 88860/ 115203 | consumed samples: 22748160 | consumed tokens: 46588231680 | elapsed time per iteration (s): 0.57 | learning rate: 4.267E-05 | global batch size: 256 | lm loss: 2.593295E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.290 | TFLOPs: 42.64 | +7: iteration 88870/ 115203 | consumed samples: 22750720 | consumed tokens: 46593474560 | elapsed time per iteration (s): 0.56 | learning rate: 4.266E-05 | global batch size: 256 | lm loss: 2.578241E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.613 | TFLOPs: 43.82 | +7: iteration 88880/ 115203 | consumed samples: 22753280 | consumed tokens: 46598717440 | elapsed time per iteration (s): 0.57 | learning rate: 4.264E-05 | global batch size: 256 | lm loss: 2.590436E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.600 | TFLOPs: 43.15 | +7: iteration 88890/ 115203 | consumed samples: 22755840 | consumed tokens: 46603960320 | elapsed time per iteration (s): 0.56 | learning rate: 4.262E-05 | global batch size: 256 | lm loss: 2.593751E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.609 | TFLOPs: 43.72 | +7: iteration 88900/ 115203 | consumed samples: 22758400 | consumed tokens: 46609203200 | elapsed time per iteration (s): 0.56 | learning rate: 4.261E-05 | global batch size: 256 | lm loss: 2.585330E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.244 | TFLOPs: 43.40 | +7: iteration 88910/ 115203 | consumed samples: 22760960 | consumed tokens: 46614446080 | elapsed time per iteration (s): 0.57 | learning rate: 4.259E-05 | global batch size: 256 | lm loss: 2.615275E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.711 | TFLOPs: 42.78 | +7: iteration 88920/ 115203 | consumed samples: 22763520 | consumed tokens: 46619688960 | elapsed time per iteration (s): 0.57 | learning rate: 4.258E-05 | global batch size: 256 | lm loss: 2.596324E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.670 | TFLOPs: 42.78 | +7: iteration 88930/ 115203 | consumed samples: 22766080 | consumed tokens: 46624931840 | elapsed time per iteration (s): 0.57 | learning rate: 4.256E-05 | global batch size: 256 | lm loss: 2.596855E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.474 | TFLOPs: 42.76 | +7: iteration 88940/ 115203 | consumed samples: 22768640 | consumed tokens: 46630174720 | elapsed time per iteration (s): 0.56 | learning rate: 4.254E-05 | global batch size: 256 | lm loss: 2.594809E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.650 | TFLOPs: 43.35 | +7: iteration 88950/ 115203 | consumed samples: 22771200 | consumed tokens: 46635417600 | elapsed time per iteration (s): 0.58 | learning rate: 4.253E-05 | global batch size: 256 | lm loss: 2.585863E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.683 | TFLOPs: 42.30 | +7: iteration 88960/ 115203 | consumed samples: 22773760 | consumed tokens: 46640660480 | elapsed time per iteration (s): 0.58 | learning rate: 4.251E-05 | global batch size: 256 | lm loss: 2.588279E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.628 | TFLOPs: 42.30 | +7: iteration 88970/ 115203 | consumed samples: 22776320 | consumed tokens: 46645903360 | elapsed time per iteration (s): 0.57 | learning rate: 4.249E-05 | global batch size: 256 | lm loss: 2.586979E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.830 | TFLOPs: 42.79 | +7: iteration 88980/ 115203 | consumed samples: 22778880 | consumed tokens: 46651146240 | elapsed time per iteration (s): 0.57 | learning rate: 4.248E-05 | global batch size: 256 | lm loss: 2.599248E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.349 | TFLOPs: 42.94 | +7: iteration 88990/ 115203 | consumed samples: 22781440 | consumed tokens: 46656389120 | elapsed time per iteration (s): 0.58 | learning rate: 4.246E-05 | global batch size: 256 | lm loss: 2.588954E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.397 | TFLOPs: 42.08 | +7: iteration 89000/ 115203 | consumed samples: 22784000 | consumed tokens: 46661632000 | elapsed time per iteration (s): 0.59 | learning rate: 4.244E-05 | global batch size: 256 | lm loss: 2.588137E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.059 | TFLOPs: 41.67 | +7: iteration 89010/ 115203 | consumed samples: 22786560 | consumed tokens: 46666874880 | elapsed time per iteration (s): 0.56 | learning rate: 4.243E-05 | global batch size: 256 | lm loss: 2.596121E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.031 | TFLOPs: 43.29 | +7: iteration 89020/ 115203 | consumed samples: 22789120 | consumed tokens: 46672117760 | elapsed time per iteration (s): 0.59 | learning rate: 4.241E-05 | global batch size: 256 | lm loss: 2.587978E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.105 | TFLOPs: 41.67 | +7: iteration 89030/ 115203 | consumed samples: 22791680 | consumed tokens: 46677360640 | elapsed time per iteration (s): 0.56 | learning rate: 4.239E-05 | global batch size: 256 | lm loss: 2.600595E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.353 | TFLOPs: 43.32 | +7: iteration 89040/ 115203 | consumed samples: 22794240 | consumed tokens: 46682603520 | elapsed time per iteration (s): 0.58 | learning rate: 4.238E-05 | global batch size: 256 | lm loss: 2.603097E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.080 | TFLOPs: 42.34 | +7: iteration 89050/ 115203 | consumed samples: 22796800 | consumed tokens: 46687846400 | elapsed time per iteration (s): 0.59 | learning rate: 4.236E-05 | global batch size: 256 | lm loss: 2.589581E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.578 | TFLOPs: 41.72 | +7: iteration 89060/ 115203 | consumed samples: 22799360 | consumed tokens: 46693089280 | elapsed time per iteration (s): 0.57 | learning rate: 4.235E-05 | global batch size: 256 | lm loss: 2.584292E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.050 | TFLOPs: 43.00 | +7: iteration 89070/ 115203 | consumed samples: 22801920 | consumed tokens: 46698332160 | elapsed time per iteration (s): 0.57 | learning rate: 4.233E-05 | global batch size: 256 | lm loss: 2.595078E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.017 | TFLOPs: 42.52 | +7: iteration 89080/ 115203 | consumed samples: 22804480 | consumed tokens: 46703575040 | elapsed time per iteration (s): 0.58 | learning rate: 4.231E-05 | global batch size: 256 | lm loss: 2.593449E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.479 | TFLOPs: 42.19 | +7: iteration 89090/ 115203 | consumed samples: 22807040 | consumed tokens: 46708817920 | elapsed time per iteration (s): 0.58 | learning rate: 4.230E-05 | global batch size: 256 | lm loss: 2.598819E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.647 | TFLOPs: 42.20 | +7: iteration 89100/ 115203 | consumed samples: 22809600 | consumed tokens: 46714060800 | elapsed time per iteration (s): 0.57 | learning rate: 4.228E-05 | global batch size: 256 | lm loss: 2.609661E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.597 | TFLOPs: 43.05 | +7: iteration 89110/ 115203 | consumed samples: 22812160 | consumed tokens: 46719303680 | elapsed time per iteration (s): 0.58 | learning rate: 4.226E-05 | global batch size: 256 | lm loss: 2.597799E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.943 | TFLOPs: 42.33 | +7: iteration 89120/ 115203 | consumed samples: 22814720 | consumed tokens: 46724546560 | elapsed time per iteration (s): 0.57 | learning rate: 4.225E-05 | global batch size: 256 | lm loss: 2.591418E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.450 | TFLOPs: 42.47 | +7: iteration 89130/ 115203 | consumed samples: 22817280 | consumed tokens: 46729789440 | elapsed time per iteration (s): 0.58 | learning rate: 4.223E-05 | global batch size: 256 | lm loss: 2.588609E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.922 | TFLOPs: 42.42 | +7: iteration 89140/ 115203 | consumed samples: 22819840 | consumed tokens: 46735032320 | elapsed time per iteration (s): 0.57 | learning rate: 4.222E-05 | global batch size: 256 | lm loss: 2.589500E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.449 | TFLOPs: 42.66 | +7: iteration 89150/ 115203 | consumed samples: 22822400 | consumed tokens: 46740275200 | elapsed time per iteration (s): 0.58 | learning rate: 4.220E-05 | global batch size: 256 | lm loss: 2.588062E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.252 | TFLOPs: 42.26 | +7: iteration 89160/ 115203 | consumed samples: 22824960 | consumed tokens: 46745518080 | elapsed time per iteration (s): 0.58 | learning rate: 4.218E-05 | global batch size: 256 | lm loss: 2.591652E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.190 | TFLOPs: 42.44 | +7: iteration 89170/ 115203 | consumed samples: 22827520 | consumed tokens: 46750760960 | elapsed time per iteration (s): 0.57 | learning rate: 4.217E-05 | global batch size: 256 | lm loss: 2.595571E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.800 | TFLOPs: 43.17 | +7: iteration 89180/ 115203 | consumed samples: 22830080 | consumed tokens: 46756003840 | elapsed time per iteration (s): 0.58 | learning rate: 4.215E-05 | global batch size: 256 | lm loss: 2.592658E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.121 | TFLOPs: 41.77 | +7: iteration 89190/ 115203 | consumed samples: 22832640 | consumed tokens: 46761246720 | elapsed time per iteration (s): 0.58 | learning rate: 4.213E-05 | global batch size: 256 | lm loss: 2.585781E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.700 | TFLOPs: 42.40 | +7: iteration 89200/ 115203 | consumed samples: 22835200 | consumed tokens: 46766489600 | elapsed time per iteration (s): 0.56 | learning rate: 4.212E-05 | global batch size: 256 | lm loss: 2.593965E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.302 | TFLOPs: 43.22 | +7: iteration 89210/ 115203 | consumed samples: 22837760 | consumed tokens: 46771732480 | elapsed time per iteration (s): 0.58 | learning rate: 4.210E-05 | global batch size: 256 | lm loss: 2.593854E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.035 | TFLOPs: 42.24 | +7: iteration 89220/ 115203 | consumed samples: 22840320 | consumed tokens: 46776975360 | elapsed time per iteration (s): 0.56 | learning rate: 4.208E-05 | global batch size: 256 | lm loss: 2.603186E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.059 | TFLOPs: 43.48 | +7: iteration 89230/ 115203 | consumed samples: 22842880 | consumed tokens: 46782218240 | elapsed time per iteration (s): 0.58 | learning rate: 4.207E-05 | global batch size: 256 | lm loss: 2.609843E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.725 | TFLOPs: 42.30 | +7: iteration 89240/ 115203 | consumed samples: 22845440 | consumed tokens: 46787461120 | elapsed time per iteration (s): 0.57 | learning rate: 4.205E-05 | global batch size: 256 | lm loss: 2.604887E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.789 | TFLOPs: 43.17 | +7: iteration 89250/ 115203 | consumed samples: 22848000 | consumed tokens: 46792704000 | elapsed time per iteration (s): 0.57 | learning rate: 4.204E-05 | global batch size: 256 | lm loss: 2.584044E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.134 | TFLOPs: 43.01 | +7: iteration 89260/ 115203 | consumed samples: 22850560 | consumed tokens: 46797946880 | elapsed time per iteration (s): 0.57 | learning rate: 4.202E-05 | global batch size: 256 | lm loss: 2.591728E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.690 | TFLOPs: 42.78 | +7: iteration 89270/ 115203 | consumed samples: 22853120 | consumed tokens: 46803189760 | elapsed time per iteration (s): 0.61 | learning rate: 4.200E-05 | global batch size: 256 | lm loss: 2.587662E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 419.360 | TFLOPs: 39.98 | +7: iteration 89280/ 115203 | consumed samples: 22855680 | consumed tokens: 46808432640 | elapsed time per iteration (s): 0.57 | learning rate: 4.199E-05 | global batch size: 256 | lm loss: 2.591387E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.405 | TFLOPs: 43.04 | +7: iteration 89290/ 115203 | consumed samples: 22858240 | consumed tokens: 46813675520 | elapsed time per iteration (s): 0.57 | learning rate: 4.197E-05 | global batch size: 256 | lm loss: 2.599287E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.385 | TFLOPs: 42.84 | +7: iteration 89300/ 115203 | consumed samples: 22860800 | consumed tokens: 46818918400 | elapsed time per iteration (s): 0.57 | learning rate: 4.195E-05 | global batch size: 256 | lm loss: 2.586047E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.688 | TFLOPs: 43.06 | +7: iteration 89310/ 115203 | consumed samples: 22863360 | consumed tokens: 46824161280 | elapsed time per iteration (s): 0.57 | learning rate: 4.194E-05 | global batch size: 256 | lm loss: 2.589593E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.079 | TFLOPs: 42.62 | +7: iteration 89320/ 115203 | consumed samples: 22865920 | consumed tokens: 46829404160 | elapsed time per iteration (s): 0.57 | learning rate: 4.192E-05 | global batch size: 256 | lm loss: 2.592337E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.392 | TFLOPs: 42.75 | +7: iteration 89330/ 115203 | consumed samples: 22868480 | consumed tokens: 46834647040 | elapsed time per iteration (s): 0.58 | learning rate: 4.191E-05 | global batch size: 256 | lm loss: 2.599194E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.384 | TFLOPs: 42.18 | +7: iteration 89340/ 115203 | consumed samples: 22871040 | consumed tokens: 46839889920 | elapsed time per iteration (s): 0.55 | learning rate: 4.189E-05 | global batch size: 256 | lm loss: 2.595959E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.437 | TFLOPs: 43.99 | +7: iteration 89350/ 115203 | consumed samples: 22873600 | consumed tokens: 46845132800 | elapsed time per iteration (s): 0.57 | learning rate: 4.187E-05 | global batch size: 256 | lm loss: 2.603653E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.624 | TFLOPs: 42.58 | +7: iteration 89360/ 115203 | consumed samples: 22876160 | consumed tokens: 46850375680 | elapsed time per iteration (s): 0.57 | learning rate: 4.186E-05 | global batch size: 256 | lm loss: 2.606405E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.992 | TFLOPs: 42.71 | +7: iteration 89370/ 115203 | consumed samples: 22878720 | consumed tokens: 46855618560 | elapsed time per iteration (s): 0.57 | learning rate: 4.184E-05 | global batch size: 256 | lm loss: 2.585733E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.728 | TFLOPs: 43.16 | +7: iteration 89380/ 115203 | consumed samples: 22881280 | consumed tokens: 46860861440 | elapsed time per iteration (s): 0.57 | learning rate: 4.183E-05 | global batch size: 256 | lm loss: 2.585226E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.361 | TFLOPs: 43.13 | +7: iteration 89390/ 115203 | consumed samples: 22883840 | consumed tokens: 46866104320 | elapsed time per iteration (s): 0.57 | learning rate: 4.181E-05 | global batch size: 256 | lm loss: 2.581461E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.277 | TFLOPs: 42.83 | +7: iteration 89400/ 115203 | consumed samples: 22886400 | consumed tokens: 46871347200 | elapsed time per iteration (s): 0.57 | learning rate: 4.179E-05 | global batch size: 256 | lm loss: 2.608122E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.494 | TFLOPs: 42.66 | +7: iteration 89410/ 115203 | consumed samples: 22888960 | consumed tokens: 46876590080 | elapsed time per iteration (s): 0.57 | learning rate: 4.178E-05 | global batch size: 256 | lm loss: 2.580130E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.940 | TFLOPs: 42.99 | +7: iteration 89420/ 115203 | consumed samples: 22891520 | consumed tokens: 46881832960 | elapsed time per iteration (s): 0.56 | learning rate: 4.176E-05 | global batch size: 256 | lm loss: 2.589354E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.184 | TFLOPs: 43.40 | +7: iteration 89430/ 115203 | consumed samples: 22894080 | consumed tokens: 46887075840 | elapsed time per iteration (s): 0.56 | learning rate: 4.174E-05 | global batch size: 256 | lm loss: 2.603855E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.133 | TFLOPs: 43.58 | +7: iteration 89440/ 115203 | consumed samples: 22896640 | consumed tokens: 46892318720 | elapsed time per iteration (s): 0.56 | learning rate: 4.173E-05 | global batch size: 256 | lm loss: 2.594605E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.183 | TFLOPs: 43.68 | +7: iteration 89450/ 115203 | consumed samples: 22899200 | consumed tokens: 46897561600 | elapsed time per iteration (s): 0.57 | learning rate: 4.171E-05 | global batch size: 256 | lm loss: 2.590436E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.871 | TFLOPs: 43.18 | +7: iteration 89460/ 115203 | consumed samples: 22901760 | consumed tokens: 46902804480 | elapsed time per iteration (s): 0.58 | learning rate: 4.170E-05 | global batch size: 256 | lm loss: 2.594679E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.846 | TFLOPs: 41.84 | +7: iteration 89470/ 115203 | consumed samples: 22904320 | consumed tokens: 46908047360 | elapsed time per iteration (s): 0.55 | learning rate: 4.168E-05 | global batch size: 256 | lm loss: 2.578658E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.269 | TFLOPs: 43.98 | +7: iteration 89480/ 115203 | consumed samples: 22906880 | consumed tokens: 46913290240 | elapsed time per iteration (s): 0.57 | learning rate: 4.166E-05 | global batch size: 256 | lm loss: 2.591968E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.520 | TFLOPs: 42.48 | +7: iteration 89490/ 115203 | consumed samples: 22909440 | consumed tokens: 46918533120 | elapsed time per iteration (s): 0.59 | learning rate: 4.165E-05 | global batch size: 256 | lm loss: 2.602454E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.466 | TFLOPs: 41.71 | +7: iteration 89500/ 115203 | consumed samples: 22912000 | consumed tokens: 46923776000 | elapsed time per iteration (s): 0.57 | learning rate: 4.163E-05 | global batch size: 256 | lm loss: 2.579847E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.018 | TFLOPs: 42.90 | +7: iteration 89510/ 115203 | consumed samples: 22914560 | consumed tokens: 46929018880 | elapsed time per iteration (s): 0.58 | learning rate: 4.162E-05 | global batch size: 256 | lm loss: 2.582907E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.469 | TFLOPs: 42.09 | +7: iteration 89520/ 115203 | consumed samples: 22917120 | consumed tokens: 46934261760 | elapsed time per iteration (s): 0.56 | learning rate: 4.160E-05 | global batch size: 256 | lm loss: 2.601575E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.202 | TFLOPs: 43.49 | +7: iteration 89530/ 115203 | consumed samples: 22919680 | consumed tokens: 46939504640 | elapsed time per iteration (s): 0.57 | learning rate: 4.158E-05 | global batch size: 256 | lm loss: 2.600317E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.597 | TFLOPs: 42.96 | +7: iteration 89540/ 115203 | consumed samples: 22922240 | consumed tokens: 46944747520 | elapsed time per iteration (s): 0.56 | learning rate: 4.157E-05 | global batch size: 256 | lm loss: 2.601304E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.107 | TFLOPs: 43.48 | +7: iteration 89550/ 115203 | consumed samples: 22924800 | consumed tokens: 46949990400 | elapsed time per iteration (s): 0.57 | learning rate: 4.155E-05 | global batch size: 256 | lm loss: 2.580694E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.862 | TFLOPs: 43.08 | +7: iteration 89560/ 115203 | consumed samples: 22927360 | consumed tokens: 46955233280 | elapsed time per iteration (s): 0.56 | learning rate: 4.153E-05 | global batch size: 256 | lm loss: 2.582323E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.913 | TFLOPs: 43.56 | +7: iteration 89570/ 115203 | consumed samples: 22929920 | consumed tokens: 46960476160 | elapsed time per iteration (s): 0.56 | learning rate: 4.152E-05 | global batch size: 256 | lm loss: 2.602204E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.170 | TFLOPs: 43.49 | +7: iteration 89580/ 115203 | consumed samples: 22932480 | consumed tokens: 46965719040 | elapsed time per iteration (s): 0.57 | learning rate: 4.150E-05 | global batch size: 256 | lm loss: 2.602149E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.311 | TFLOPs: 42.55 | +7: iteration 89590/ 115203 | consumed samples: 22935040 | consumed tokens: 46970961920 | elapsed time per iteration (s): 0.56 | learning rate: 4.149E-05 | global batch size: 256 | lm loss: 2.602996E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.253 | TFLOPs: 43.31 | +7: iteration 89600/ 115203 | consumed samples: 22937600 | consumed tokens: 46976204800 | elapsed time per iteration (s): 0.57 | learning rate: 4.147E-05 | global batch size: 256 | lm loss: 2.605736E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.998 | TFLOPs: 42.71 | +7: iteration 89610/ 115203 | consumed samples: 22940160 | consumed tokens: 46981447680 | elapsed time per iteration (s): 0.56 | learning rate: 4.145E-05 | global batch size: 256 | lm loss: 2.607933E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.166 | TFLOPs: 43.20 | +7: iteration 89620/ 115203 | consumed samples: 22942720 | consumed tokens: 46986690560 | elapsed time per iteration (s): 0.57 | learning rate: 4.144E-05 | global batch size: 256 | lm loss: 2.585919E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.137 | TFLOPs: 42.63 | +7: iteration 89630/ 115203 | consumed samples: 22945280 | consumed tokens: 46991933440 | elapsed time per iteration (s): 0.57 | learning rate: 4.142E-05 | global batch size: 256 | lm loss: 2.591246E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.165 | TFLOPs: 42.82 | +7: iteration 89640/ 115203 | consumed samples: 22947840 | consumed tokens: 46997176320 | elapsed time per iteration (s): 0.58 | learning rate: 4.141E-05 | global batch size: 256 | lm loss: 2.578695E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.788 | TFLOPs: 42.31 | +7: iteration 89650/ 115203 | consumed samples: 22950400 | consumed tokens: 47002419200 | elapsed time per iteration (s): 0.56 | learning rate: 4.139E-05 | global batch size: 256 | lm loss: 2.590884E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.922 | TFLOPs: 43.37 | +7: iteration 89660/ 115203 | consumed samples: 22952960 | consumed tokens: 47007662080 | elapsed time per iteration (s): 0.57 | learning rate: 4.137E-05 | global batch size: 256 | lm loss: 2.590351E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.028 | TFLOPs: 43.10 | +7: iteration 89670/ 115203 | consumed samples: 22955520 | consumed tokens: 47012904960 | elapsed time per iteration (s): 0.56 | learning rate: 4.136E-05 | global batch size: 256 | lm loss: 2.585921E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.720 | TFLOPs: 43.35 | +7: iteration 89680/ 115203 | consumed samples: 22958080 | consumed tokens: 47018147840 | elapsed time per iteration (s): 0.57 | learning rate: 4.134E-05 | global batch size: 256 | lm loss: 2.596919E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.502 | TFLOPs: 42.86 | +7: iteration 89690/ 115203 | consumed samples: 22960640 | consumed tokens: 47023390720 | elapsed time per iteration (s): 0.56 | learning rate: 4.133E-05 | global batch size: 256 | lm loss: 2.583551E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.502 | TFLOPs: 43.52 | +7: iteration 89700/ 115203 | consumed samples: 22963200 | consumed tokens: 47028633600 | elapsed time per iteration (s): 0.56 | learning rate: 4.131E-05 | global batch size: 256 | lm loss: 2.588399E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.013 | TFLOPs: 43.38 | +7: iteration 89710/ 115203 | consumed samples: 22965760 | consumed tokens: 47033876480 | elapsed time per iteration (s): 0.57 | learning rate: 4.129E-05 | global batch size: 256 | lm loss: 2.588818E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.366 | TFLOPs: 42.84 | +7: iteration 89720/ 115203 | consumed samples: 22968320 | consumed tokens: 47039119360 | elapsed time per iteration (s): 0.57 | learning rate: 4.128E-05 | global batch size: 256 | lm loss: 2.597919E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.597 | TFLOPs: 42.58 | +7: iteration 89730/ 115203 | consumed samples: 22970880 | consumed tokens: 47044362240 | elapsed time per iteration (s): 0.57 | learning rate: 4.126E-05 | global batch size: 256 | lm loss: 2.583599E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.647 | TFLOPs: 42.77 | +7: iteration 89740/ 115203 | consumed samples: 22973440 | consumed tokens: 47049605120 | elapsed time per iteration (s): 0.57 | learning rate: 4.125E-05 | global batch size: 256 | lm loss: 2.595496E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.812 | TFLOPs: 42.79 | +7: iteration 89750/ 115203 | consumed samples: 22976000 | consumed tokens: 47054848000 | elapsed time per iteration (s): 0.57 | learning rate: 4.123E-05 | global batch size: 256 | lm loss: 2.586016E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.693 | TFLOPs: 42.68 | +7: iteration 89760/ 115203 | consumed samples: 22978560 | consumed tokens: 47060090880 | elapsed time per iteration (s): 0.56 | learning rate: 4.121E-05 | global batch size: 256 | lm loss: 2.588291E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.165 | TFLOPs: 43.59 | +7: iteration 89770/ 115203 | consumed samples: 22981120 | consumed tokens: 47065333760 | elapsed time per iteration (s): 0.56 | learning rate: 4.120E-05 | global batch size: 256 | lm loss: 2.595925E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.211 | TFLOPs: 43.40 | +7: iteration 89780/ 115203 | consumed samples: 22983680 | consumed tokens: 47070576640 | elapsed time per iteration (s): 0.56 | learning rate: 4.118E-05 | global batch size: 256 | lm loss: 2.583504E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.501 | TFLOPs: 43.24 | +7: iteration 89790/ 115203 | consumed samples: 22986240 | consumed tokens: 47075819520 | elapsed time per iteration (s): 0.57 | learning rate: 4.117E-05 | global batch size: 256 | lm loss: 2.602051E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.273 | TFLOPs: 42.64 | +7: iteration 89800/ 115203 | consumed samples: 22988800 | consumed tokens: 47081062400 | elapsed time per iteration (s): 0.56 | learning rate: 4.115E-05 | global batch size: 256 | lm loss: 2.582645E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.001 | TFLOPs: 43.47 | +7: iteration 89810/ 115203 | consumed samples: 22991360 | consumed tokens: 47086305280 | elapsed time per iteration (s): 0.57 | learning rate: 4.113E-05 | global batch size: 256 | lm loss: 2.586973E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.699 | TFLOPs: 42.97 | +7: iteration 89820/ 115203 | consumed samples: 22993920 | consumed tokens: 47091548160 | elapsed time per iteration (s): 0.56 | learning rate: 4.112E-05 | global batch size: 256 | lm loss: 2.593590E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.125 | TFLOPs: 43.20 | +7: iteration 89830/ 115203 | consumed samples: 22996480 | consumed tokens: 47096791040 | elapsed time per iteration (s): 0.57 | learning rate: 4.110E-05 | global batch size: 256 | lm loss: 2.592987E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.940 | TFLOPs: 42.80 | +7: iteration 89840/ 115203 | consumed samples: 22999040 | consumed tokens: 47102033920 | elapsed time per iteration (s): 0.57 | learning rate: 4.109E-05 | global batch size: 256 | lm loss: 2.601659E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.097 | TFLOPs: 43.10 | +7: iteration 89850/ 115203 | consumed samples: 23001600 | consumed tokens: 47107276800 | elapsed time per iteration (s): 0.57 | learning rate: 4.107E-05 | global batch size: 256 | lm loss: 2.580431E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.542 | TFLOPs: 42.86 | +7: iteration 89860/ 115203 | consumed samples: 23004160 | consumed tokens: 47112519680 | elapsed time per iteration (s): 0.58 | learning rate: 4.105E-05 | global batch size: 256 | lm loss: 2.582581E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.033 | TFLOPs: 42.33 | +7: iteration 89870/ 115203 | consumed samples: 23006720 | consumed tokens: 47117762560 | elapsed time per iteration (s): 0.56 | learning rate: 4.104E-05 | global batch size: 256 | lm loss: 2.589026E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.970 | TFLOPs: 43.47 | +7: iteration 89880/ 115203 | consumed samples: 23009280 | consumed tokens: 47123005440 | elapsed time per iteration (s): 0.56 | learning rate: 4.102E-05 | global batch size: 256 | lm loss: 2.593744E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.304 | TFLOPs: 43.50 | +7: iteration 89890/ 115203 | consumed samples: 23011840 | consumed tokens: 47128248320 | elapsed time per iteration (s): 0.56 | learning rate: 4.101E-05 | global batch size: 256 | lm loss: 2.586242E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.019 | TFLOPs: 43.57 | +7: iteration 89900/ 115203 | consumed samples: 23014400 | consumed tokens: 47133491200 | elapsed time per iteration (s): 0.57 | learning rate: 4.099E-05 | global batch size: 256 | lm loss: 2.583043E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.970 | TFLOPs: 42.52 | +7: iteration 89910/ 115203 | consumed samples: 23016960 | consumed tokens: 47138734080 | elapsed time per iteration (s): 0.56 | learning rate: 4.097E-05 | global batch size: 256 | lm loss: 2.587870E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.673 | TFLOPs: 43.54 | +7: iteration 89920/ 115203 | consumed samples: 23019520 | consumed tokens: 47143976960 | elapsed time per iteration (s): 0.55 | learning rate: 4.096E-05 | global batch size: 256 | lm loss: 2.588472E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.561 | TFLOPs: 44.00 | +7: iteration 89930/ 115203 | consumed samples: 23022080 | consumed tokens: 47149219840 | elapsed time per iteration (s): 0.56 | learning rate: 4.094E-05 | global batch size: 256 | lm loss: 2.585396E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.125 | TFLOPs: 43.77 | +7: iteration 89940/ 115203 | consumed samples: 23024640 | consumed tokens: 47154462720 | elapsed time per iteration (s): 0.57 | learning rate: 4.093E-05 | global batch size: 256 | lm loss: 2.598306E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.463 | TFLOPs: 42.95 | +7: iteration 89950/ 115203 | consumed samples: 23027200 | consumed tokens: 47159705600 | elapsed time per iteration (s): 0.56 | learning rate: 4.091E-05 | global batch size: 256 | lm loss: 2.584000E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.234 | TFLOPs: 43.31 | +7: iteration 89960/ 115203 | consumed samples: 23029760 | consumed tokens: 47164948480 | elapsed time per iteration (s): 0.57 | learning rate: 4.090E-05 | global batch size: 256 | lm loss: 2.592487E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.746 | TFLOPs: 42.97 | +7: iteration 89970/ 115203 | consumed samples: 23032320 | consumed tokens: 47170191360 | elapsed time per iteration (s): 0.57 | learning rate: 4.088E-05 | global batch size: 256 | lm loss: 2.586594E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.681 | TFLOPs: 42.59 | +7: iteration 89980/ 115203 | consumed samples: 23034880 | consumed tokens: 47175434240 | elapsed time per iteration (s): 0.56 | learning rate: 4.086E-05 | global batch size: 256 | lm loss: 2.593429E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.974 | TFLOPs: 43.57 | +7: iteration 89990/ 115203 | consumed samples: 23037440 | consumed tokens: 47180677120 | elapsed time per iteration (s): 0.57 | learning rate: 4.085E-05 | global batch size: 256 | lm loss: 2.592503E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.889 | TFLOPs: 42.80 | +0: [2023-03-17 02:56:55,833] [INFO] [logging.py:68:log_dist] [Rank 0] step=90000, skipped=0, lr=[4.083185080977982e-05, 4.083185080977982e-05, 4.083185080977982e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 90000/ 115203 | consumed samples: 23040000 | consumed tokens: 47185920000 | elapsed time per iteration (s): 0.56 | learning rate: 4.083E-05 | global batch size: 256 | lm loss: 2.596161E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.254 | TFLOPs: 43.50 | +0: steps: 90000 loss: 2.5804 iter time (s): 0.571 samples/sec: 448.059 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 90000 | lm loss value: 3.378689E+00 | lm loss PPL: 2.933230E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 90000 to checkpoints_421m60b400m +0: [2023-03-17 02:56:56,042] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step90000 is begin to save! +0: [2023-03-17 02:56:56,048] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:56:56,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:56:56,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:56:56,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:56:56,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:56:56,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:56:56,302] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:56:56,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:56:56,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:56:56,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:56:56,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:56:56,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:56:56,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:56:56,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:56:56,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:56:56,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:56:56,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:56:56,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:56:56,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_11-model_00-model_states.pt... +0: [2023-03-17 02:56:56,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_11-model_00-model_states.pt. +0: [2023-03-17 02:56:56,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:56:56,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:56:56,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_13-model_00-model_states.pt... +0: [2023-03-17 02:56:56,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_13-model_00-model_states.pt. +0: [2023-03-17 02:56:56,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_14-model_00-model_states.pt... +0: [2023-03-17 02:56:56,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_14-model_00-model_states.pt. +0: [2023-03-17 02:56:56,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_15-model_00-model_states.pt... +0: [2023-03-17 02:56:56,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_15-model_00-model_states.pt. +0: [2023-03-17 02:56:56,767] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_16-model_00-model_states.pt... +0: [2023-03-17 02:56:56,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_16-model_00-model_states.pt. +0: [2023-03-17 02:56:56,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_17-model_00-model_states.pt... +0: [2023-03-17 02:56:56,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_17-model_00-model_states.pt. +0: [2023-03-17 02:56:56,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_18-model_00-model_states.pt... +0: [2023-03-17 02:56:56,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_18-model_00-model_states.pt. +0: [2023-03-17 02:56:56,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_19-model_00-model_states.pt... +0: [2023-03-17 02:56:56,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_19-model_00-model_states.pt. +0: [2023-03-17 02:56:56,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_20-model_00-model_states.pt... +0: [2023-03-17 02:56:56,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_20-model_00-model_states.pt. +0: [2023-03-17 02:56:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/layer_22-model_00-model_states.pt... +0: [2023-03-17 02:56:56,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/layer_22-model_00-model_states.pt. +0: [2023-03-17 02:56:56,980] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step90000/mp_rank_00_model_states.pt +0: [2023-03-17 02:56:56,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:56:56,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:57,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:57,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:57,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 02:56:57,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 02:56:57,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 02:56:57,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 02:56:57,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 02:56:57,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 02:56:57,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:57,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 02:56:57,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: successfully saved checkpoint at iteration 90000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1204.03 +7: iteration 90010/ 115203 | consumed samples: 23042560 | consumed tokens: 47191162880 | elapsed time per iteration (s): 0.71 | learning rate: 4.082E-05 | global batch size: 256 | lm loss: 2.585188E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 362.768 | TFLOPs: 34.59 | +7: iteration 90020/ 115203 | consumed samples: 23045120 | consumed tokens: 47196405760 | elapsed time per iteration (s): 0.55 | learning rate: 4.080E-05 | global batch size: 256 | lm loss: 2.573314E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.909 | TFLOPs: 44.04 | +7: iteration 90030/ 115203 | consumed samples: 23047680 | consumed tokens: 47201648640 | elapsed time per iteration (s): 0.56 | learning rate: 4.078E-05 | global batch size: 256 | lm loss: 2.585337E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.257 | TFLOPs: 43.40 | +7: iteration 90040/ 115203 | consumed samples: 23050240 | consumed tokens: 47206891520 | elapsed time per iteration (s): 0.57 | learning rate: 4.077E-05 | global batch size: 256 | lm loss: 2.588396E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.172 | TFLOPs: 42.92 | +7: iteration 90050/ 115203 | consumed samples: 23052800 | consumed tokens: 47212134400 | elapsed time per iteration (s): 0.58 | learning rate: 4.075E-05 | global batch size: 256 | lm loss: 2.579610E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.648 | TFLOPs: 42.39 | +7: iteration 90060/ 115203 | consumed samples: 23055360 | consumed tokens: 47217377280 | elapsed time per iteration (s): 0.56 | learning rate: 4.074E-05 | global batch size: 256 | lm loss: 2.588860E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.657 | TFLOPs: 43.63 | +7: iteration 90070/ 115203 | consumed samples: 23057920 | consumed tokens: 47222620160 | elapsed time per iteration (s): 0.57 | learning rate: 4.072E-05 | global batch size: 256 | lm loss: 2.601550E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.640 | TFLOPs: 43.15 | +7: iteration 90080/ 115203 | consumed samples: 23060480 | consumed tokens: 47227863040 | elapsed time per iteration (s): 0.57 | learning rate: 4.071E-05 | global batch size: 256 | lm loss: 2.602051E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.588 | TFLOPs: 42.86 | +7: iteration 90090/ 115203 | consumed samples: 23063040 | consumed tokens: 47233105920 | elapsed time per iteration (s): 0.56 | learning rate: 4.069E-05 | global batch size: 256 | lm loss: 2.590991E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.985 | TFLOPs: 43.47 | +7: iteration 90100/ 115203 | consumed samples: 23065600 | consumed tokens: 47238348800 | elapsed time per iteration (s): 0.55 | learning rate: 4.067E-05 | global batch size: 256 | lm loss: 2.588108E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.695 | TFLOPs: 44.02 | +7: iteration 90110/ 115203 | consumed samples: 23068160 | consumed tokens: 47243591680 | elapsed time per iteration (s): 0.56 | learning rate: 4.066E-05 | global batch size: 256 | lm loss: 2.590954E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.010 | TFLOPs: 43.67 | +7: iteration 90120/ 115203 | consumed samples: 23070720 | consumed tokens: 47248834560 | elapsed time per iteration (s): 0.57 | learning rate: 4.064E-05 | global batch size: 256 | lm loss: 2.585984E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.755 | TFLOPs: 42.88 | +7: iteration 90130/ 115203 | consumed samples: 23073280 | consumed tokens: 47254077440 | elapsed time per iteration (s): 0.56 | learning rate: 4.063E-05 | global batch size: 256 | lm loss: 2.580387E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.354 | TFLOPs: 43.32 | +7: iteration 90140/ 115203 | consumed samples: 23075840 | consumed tokens: 47259320320 | elapsed time per iteration (s): 0.56 | learning rate: 4.061E-05 | global batch size: 256 | lm loss: 2.599718E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.126 | TFLOPs: 43.58 | +7: iteration 90150/ 115203 | consumed samples: 23078400 | consumed tokens: 47264563200 | elapsed time per iteration (s): 0.55 | learning rate: 4.059E-05 | global batch size: 256 | lm loss: 2.598335E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.749 | TFLOPs: 44.02 | +7: iteration 90160/ 115203 | consumed samples: 23080960 | consumed tokens: 47269806080 | elapsed time per iteration (s): 0.57 | learning rate: 4.058E-05 | global batch size: 256 | lm loss: 2.590860E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.042 | TFLOPs: 42.91 | +7: iteration 90170/ 115203 | consumed samples: 23083520 | consumed tokens: 47275048960 | elapsed time per iteration (s): 0.56 | learning rate: 4.056E-05 | global batch size: 256 | lm loss: 2.593470E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.828 | TFLOPs: 43.36 | +7: iteration 90180/ 115203 | consumed samples: 23086080 | consumed tokens: 47280291840 | elapsed time per iteration (s): 0.56 | learning rate: 4.055E-05 | global batch size: 256 | lm loss: 2.600145E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.792 | TFLOPs: 43.55 | +7: iteration 90190/ 115203 | consumed samples: 23088640 | consumed tokens: 47285534720 | elapsed time per iteration (s): 0.57 | learning rate: 4.053E-05 | global batch size: 256 | lm loss: 2.586976E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.437 | TFLOPs: 42.94 | +7: iteration 90200/ 115203 | consumed samples: 23091200 | consumed tokens: 47290777600 | elapsed time per iteration (s): 0.56 | learning rate: 4.052E-05 | global batch size: 256 | lm loss: 2.593727E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.681 | TFLOPs: 43.73 | +7: iteration 90210/ 115203 | consumed samples: 23093760 | consumed tokens: 47296020480 | elapsed time per iteration (s): 0.57 | learning rate: 4.050E-05 | global batch size: 256 | lm loss: 2.596288E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.676 | TFLOPs: 42.68 | +7: iteration 90220/ 115203 | consumed samples: 23096320 | consumed tokens: 47301263360 | elapsed time per iteration (s): 0.57 | learning rate: 4.048E-05 | global batch size: 256 | lm loss: 2.582802E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.043 | TFLOPs: 43.10 | +7: iteration 90230/ 115203 | consumed samples: 23098880 | consumed tokens: 47306506240 | elapsed time per iteration (s): 0.57 | learning rate: 4.047E-05 | global batch size: 256 | lm loss: 2.595071E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.664 | TFLOPs: 42.78 | +7: iteration 90240/ 115203 | consumed samples: 23101440 | consumed tokens: 47311749120 | elapsed time per iteration (s): 0.56 | learning rate: 4.045E-05 | global batch size: 256 | lm loss: 2.579119E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.067 | TFLOPs: 43.39 | +7: iteration 90250/ 115203 | consumed samples: 23104000 | consumed tokens: 47316992000 | elapsed time per iteration (s): 0.57 | learning rate: 4.044E-05 | global batch size: 256 | lm loss: 2.587187E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.352 | TFLOPs: 42.84 | +7: iteration 90260/ 115203 | consumed samples: 23106560 | consumed tokens: 47322234880 | elapsed time per iteration (s): 0.56 | learning rate: 4.042E-05 | global batch size: 256 | lm loss: 2.580033E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.582 | TFLOPs: 43.43 | +7: iteration 90270/ 115203 | consumed samples: 23109120 | consumed tokens: 47327477760 | elapsed time per iteration (s): 0.56 | learning rate: 4.041E-05 | global batch size: 256 | lm loss: 2.580576E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.712 | TFLOPs: 43.45 | +7: iteration 90280/ 115203 | consumed samples: 23111680 | consumed tokens: 47332720640 | elapsed time per iteration (s): 0.56 | learning rate: 4.039E-05 | global batch size: 256 | lm loss: 2.584622E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.157 | TFLOPs: 43.58 | +7: iteration 90290/ 115203 | consumed samples: 23114240 | consumed tokens: 47337963520 | elapsed time per iteration (s): 0.57 | learning rate: 4.037E-05 | global batch size: 256 | lm loss: 2.589133E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.739 | TFLOPs: 43.16 | +7: iteration 90300/ 115203 | consumed samples: 23116800 | consumed tokens: 47343206400 | elapsed time per iteration (s): 0.56 | learning rate: 4.036E-05 | global batch size: 256 | lm loss: 2.592522E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.164 | TFLOPs: 43.39 | +7: iteration 90310/ 115203 | consumed samples: 23119360 | consumed tokens: 47348449280 | elapsed time per iteration (s): 0.58 | learning rate: 4.034E-05 | global batch size: 256 | lm loss: 2.587914E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.475 | TFLOPs: 42.28 | +7: iteration 90320/ 115203 | consumed samples: 23121920 | consumed tokens: 47353692160 | elapsed time per iteration (s): 0.57 | learning rate: 4.033E-05 | global batch size: 256 | lm loss: 2.588572E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.005 | TFLOPs: 42.71 | +7: iteration 90330/ 115203 | consumed samples: 23124480 | consumed tokens: 47358935040 | elapsed time per iteration (s): 0.57 | learning rate: 4.031E-05 | global batch size: 256 | lm loss: 2.592947E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.166 | TFLOPs: 43.11 | +7: iteration 90340/ 115203 | consumed samples: 23127040 | consumed tokens: 47364177920 | elapsed time per iteration (s): 0.57 | learning rate: 4.030E-05 | global batch size: 256 | lm loss: 2.586558E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.518 | TFLOPs: 42.86 | +7: iteration 90350/ 115203 | consumed samples: 23129600 | consumed tokens: 47369420800 | elapsed time per iteration (s): 0.56 | learning rate: 4.028E-05 | global batch size: 256 | lm loss: 2.590614E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.038 | TFLOPs: 43.29 | +7: iteration 90360/ 115203 | consumed samples: 23132160 | consumed tokens: 47374663680 | elapsed time per iteration (s): 0.57 | learning rate: 4.026E-05 | global batch size: 256 | lm loss: 2.593433E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.441 | TFLOPs: 43.04 | +7: iteration 90370/ 115203 | consumed samples: 23134720 | consumed tokens: 47379906560 | elapsed time per iteration (s): 0.57 | learning rate: 4.025E-05 | global batch size: 256 | lm loss: 2.591732E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.824 | TFLOPs: 42.79 | +7: iteration 90380/ 115203 | consumed samples: 23137280 | consumed tokens: 47385149440 | elapsed time per iteration (s): 0.57 | learning rate: 4.023E-05 | global batch size: 256 | lm loss: 2.593597E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.837 | TFLOPs: 43.08 | +7: iteration 90390/ 115203 | consumed samples: 23139840 | consumed tokens: 47390392320 | elapsed time per iteration (s): 0.57 | learning rate: 4.022E-05 | global batch size: 256 | lm loss: 2.592459E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.388 | TFLOPs: 42.56 | +7: iteration 90400/ 115203 | consumed samples: 23142400 | consumed tokens: 47395635200 | elapsed time per iteration (s): 0.57 | learning rate: 4.020E-05 | global batch size: 256 | lm loss: 2.582235E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.742 | TFLOPs: 42.97 | +7: iteration 90410/ 115203 | consumed samples: 23144960 | consumed tokens: 47400878080 | elapsed time per iteration (s): 0.56 | learning rate: 4.019E-05 | global batch size: 256 | lm loss: 2.591291E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.931 | TFLOPs: 43.47 | +7: iteration 90420/ 115203 | consumed samples: 23147520 | consumed tokens: 47406120960 | elapsed time per iteration (s): 0.57 | learning rate: 4.017E-05 | global batch size: 256 | lm loss: 2.576041E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.833 | TFLOPs: 42.60 | +7: iteration 90430/ 115203 | consumed samples: 23150080 | consumed tokens: 47411363840 | elapsed time per iteration (s): 0.57 | learning rate: 4.015E-05 | global batch size: 256 | lm loss: 2.586982E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.978 | TFLOPs: 42.52 | +7: iteration 90440/ 115203 | consumed samples: 23152640 | consumed tokens: 47416606720 | elapsed time per iteration (s): 0.57 | learning rate: 4.014E-05 | global batch size: 256 | lm loss: 2.589493E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.216 | TFLOPs: 43.02 | +7: iteration 90450/ 115203 | consumed samples: 23155200 | consumed tokens: 47421849600 | elapsed time per iteration (s): 0.56 | learning rate: 4.012E-05 | global batch size: 256 | lm loss: 2.598043E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.917 | TFLOPs: 43.56 | +7: iteration 90460/ 115203 | consumed samples: 23157760 | consumed tokens: 47427092480 | elapsed time per iteration (s): 0.56 | learning rate: 4.011E-05 | global batch size: 256 | lm loss: 2.589940E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.695 | TFLOPs: 43.83 | +7: iteration 90470/ 115203 | consumed samples: 23160320 | consumed tokens: 47432335360 | elapsed time per iteration (s): 0.57 | learning rate: 4.009E-05 | global batch size: 256 | lm loss: 2.586471E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.452 | TFLOPs: 43.14 | +7: iteration 90480/ 115203 | consumed samples: 23162880 | consumed tokens: 47437578240 | elapsed time per iteration (s): 0.58 | learning rate: 4.008E-05 | global batch size: 256 | lm loss: 2.586287E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.050 | TFLOPs: 42.24 | +7: iteration 90490/ 115203 | consumed samples: 23165440 | consumed tokens: 47442821120 | elapsed time per iteration (s): 0.56 | learning rate: 4.006E-05 | global batch size: 256 | lm loss: 2.572436E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.998 | TFLOPs: 43.47 | +7: iteration 90500/ 115203 | consumed samples: 23168000 | consumed tokens: 47448064000 | elapsed time per iteration (s): 0.57 | learning rate: 4.005E-05 | global batch size: 256 | lm loss: 2.598937E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.133 | TFLOPs: 43.01 | +7: iteration 90510/ 115203 | consumed samples: 23170560 | consumed tokens: 47453306880 | elapsed time per iteration (s): 0.55 | learning rate: 4.003E-05 | global batch size: 256 | lm loss: 2.578647E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.436 | TFLOPs: 43.99 | +7: iteration 90520/ 115203 | consumed samples: 23173120 | consumed tokens: 47458549760 | elapsed time per iteration (s): 0.56 | learning rate: 4.001E-05 | global batch size: 256 | lm loss: 2.605036E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.553 | TFLOPs: 43.53 | +7: iteration 90530/ 115203 | consumed samples: 23175680 | consumed tokens: 47463792640 | elapsed time per iteration (s): 0.59 | learning rate: 4.000E-05 | global batch size: 256 | lm loss: 2.593395E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.725 | TFLOPs: 41.35 | +7: iteration 90540/ 115203 | consumed samples: 23178240 | consumed tokens: 47469035520 | elapsed time per iteration (s): 0.57 | learning rate: 3.998E-05 | global batch size: 256 | lm loss: 2.586774E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.921 | TFLOPs: 42.51 | +7: iteration 90550/ 115203 | consumed samples: 23180800 | consumed tokens: 47474278400 | elapsed time per iteration (s): 0.57 | learning rate: 3.997E-05 | global batch size: 256 | lm loss: 2.582112E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.012 | TFLOPs: 43.00 | +7: iteration 90560/ 115203 | consumed samples: 23183360 | consumed tokens: 47479521280 | elapsed time per iteration (s): 0.57 | learning rate: 3.995E-05 | global batch size: 256 | lm loss: 2.604339E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.851 | TFLOPs: 42.98 | +7: iteration 90570/ 115203 | consumed samples: 23185920 | consumed tokens: 47484764160 | elapsed time per iteration (s): 0.56 | learning rate: 3.994E-05 | global batch size: 256 | lm loss: 2.583331E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.584 | TFLOPs: 43.72 | +7: iteration 90580/ 115203 | consumed samples: 23188480 | consumed tokens: 47490007040 | elapsed time per iteration (s): 0.57 | learning rate: 3.992E-05 | global batch size: 256 | lm loss: 2.589558E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.065 | TFLOPs: 43.10 | +7: iteration 90590/ 115203 | consumed samples: 23191040 | consumed tokens: 47495249920 | elapsed time per iteration (s): 0.58 | learning rate: 3.991E-05 | global batch size: 256 | lm loss: 2.580381E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.128 | TFLOPs: 42.06 | +7: iteration 90600/ 115203 | consumed samples: 23193600 | consumed tokens: 47500492800 | elapsed time per iteration (s): 0.56 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 2.590197E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.310 | TFLOPs: 43.60 | +7: iteration 90610/ 115203 | consumed samples: 23196160 | consumed tokens: 47505735680 | elapsed time per iteration (s): 0.56 | learning rate: 3.987E-05 | global batch size: 256 | lm loss: 2.587563E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.648 | TFLOPs: 43.54 | +7: iteration 90620/ 115203 | consumed samples: 23198720 | consumed tokens: 47510978560 | elapsed time per iteration (s): 0.56 | learning rate: 3.986E-05 | global batch size: 256 | lm loss: 2.593660E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.705 | TFLOPs: 43.54 | +7: iteration 90630/ 115203 | consumed samples: 23201280 | consumed tokens: 47516221440 | elapsed time per iteration (s): 0.56 | learning rate: 3.984E-05 | global batch size: 256 | lm loss: 2.585670E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.531 | TFLOPs: 43.72 | +7: iteration 90640/ 115203 | consumed samples: 23203840 | consumed tokens: 47521464320 | elapsed time per iteration (s): 0.56 | learning rate: 3.983E-05 | global batch size: 256 | lm loss: 2.581334E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.402 | TFLOPs: 43.89 | +7: iteration 90650/ 115203 | consumed samples: 23206400 | consumed tokens: 47526707200 | elapsed time per iteration (s): 0.58 | learning rate: 3.981E-05 | global batch size: 256 | lm loss: 2.587463E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.196 | TFLOPs: 42.16 | +7: iteration 90660/ 115203 | consumed samples: 23208960 | consumed tokens: 47531950080 | elapsed time per iteration (s): 0.57 | learning rate: 3.980E-05 | global batch size: 256 | lm loss: 2.598796E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.534 | TFLOPs: 42.57 | +7: iteration 90670/ 115203 | consumed samples: 23211520 | consumed tokens: 47537192960 | elapsed time per iteration (s): 0.57 | learning rate: 3.978E-05 | global batch size: 256 | lm loss: 2.599114E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.849 | TFLOPs: 42.51 | +7: iteration 90680/ 115203 | consumed samples: 23214080 | consumed tokens: 47542435840 | elapsed time per iteration (s): 0.57 | learning rate: 3.977E-05 | global batch size: 256 | lm loss: 2.583319E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.838 | TFLOPs: 43.17 | +7: iteration 90690/ 115203 | consumed samples: 23216640 | consumed tokens: 47547678720 | elapsed time per iteration (s): 0.56 | learning rate: 3.975E-05 | global batch size: 256 | lm loss: 2.577161E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.074 | TFLOPs: 43.67 | +7: iteration 90700/ 115203 | consumed samples: 23219200 | consumed tokens: 47552921600 | elapsed time per iteration (s): 0.55 | learning rate: 3.973E-05 | global batch size: 256 | lm loss: 2.590106E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.732 | TFLOPs: 44.02 | +7: iteration 90710/ 115203 | consumed samples: 23221760 | consumed tokens: 47558164480 | elapsed time per iteration (s): 0.56 | learning rate: 3.972E-05 | global batch size: 256 | lm loss: 2.592122E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.114 | TFLOPs: 43.49 | +7: iteration 90720/ 115203 | consumed samples: 23224320 | consumed tokens: 47563407360 | elapsed time per iteration (s): 0.58 | learning rate: 3.970E-05 | global batch size: 256 | lm loss: 2.597412E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.111 | TFLOPs: 42.15 | +7: iteration 90730/ 115203 | consumed samples: 23226880 | consumed tokens: 47568650240 | elapsed time per iteration (s): 0.57 | learning rate: 3.969E-05 | global batch size: 256 | lm loss: 2.586460E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.778 | TFLOPs: 43.17 | +7: iteration 90740/ 115203 | consumed samples: 23229440 | consumed tokens: 47573893120 | elapsed time per iteration (s): 0.57 | learning rate: 3.967E-05 | global batch size: 256 | lm loss: 2.585625E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.632 | TFLOPs: 42.96 | +7: iteration 90750/ 115203 | consumed samples: 23232000 | consumed tokens: 47579136000 | elapsed time per iteration (s): 0.58 | learning rate: 3.966E-05 | global batch size: 256 | lm loss: 2.583671E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.481 | TFLOPs: 42.19 | +7: iteration 90760/ 115203 | consumed samples: 23234560 | consumed tokens: 47584378880 | elapsed time per iteration (s): 0.55 | learning rate: 3.964E-05 | global batch size: 256 | lm loss: 2.581922E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.658 | TFLOPs: 44.01 | +7: iteration 90770/ 115203 | consumed samples: 23237120 | consumed tokens: 47589621760 | elapsed time per iteration (s): 0.57 | learning rate: 3.963E-05 | global batch size: 256 | lm loss: 2.591687E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.537 | TFLOPs: 42.57 | +7: iteration 90780/ 115203 | consumed samples: 23239680 | consumed tokens: 47594864640 | elapsed time per iteration (s): 0.55 | learning rate: 3.961E-05 | global batch size: 256 | lm loss: 2.585613E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.590 | TFLOPs: 44.01 | +7: iteration 90790/ 115203 | consumed samples: 23242240 | consumed tokens: 47600107520 | elapsed time per iteration (s): 0.56 | learning rate: 3.960E-05 | global batch size: 256 | lm loss: 2.585292E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.087 | TFLOPs: 43.67 | +7: iteration 90800/ 115203 | consumed samples: 23244800 | consumed tokens: 47605350400 | elapsed time per iteration (s): 0.56 | learning rate: 3.958E-05 | global batch size: 256 | lm loss: 2.575508E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.627 | TFLOPs: 43.73 | +7: iteration 90810/ 115203 | consumed samples: 23247360 | consumed tokens: 47610593280 | elapsed time per iteration (s): 0.56 | learning rate: 3.956E-05 | global batch size: 256 | lm loss: 2.598991E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.055 | TFLOPs: 43.29 | +7: iteration 90820/ 115203 | consumed samples: 23249920 | consumed tokens: 47615836160 | elapsed time per iteration (s): 0.58 | learning rate: 3.955E-05 | global batch size: 256 | lm loss: 2.594191E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.678 | TFLOPs: 42.20 | +7: iteration 90830/ 115203 | consumed samples: 23252480 | consumed tokens: 47621079040 | elapsed time per iteration (s): 0.57 | learning rate: 3.953E-05 | global batch size: 256 | lm loss: 2.592204E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.367 | TFLOPs: 42.84 | +7: iteration 90840/ 115203 | consumed samples: 23255040 | consumed tokens: 47626321920 | elapsed time per iteration (s): 0.56 | learning rate: 3.952E-05 | global batch size: 256 | lm loss: 2.580202E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.137 | TFLOPs: 43.20 | +7: iteration 90850/ 115203 | consumed samples: 23257600 | consumed tokens: 47631564800 | elapsed time per iteration (s): 0.56 | learning rate: 3.950E-05 | global batch size: 256 | lm loss: 2.597037E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.610 | TFLOPs: 43.72 | +7: iteration 90860/ 115203 | consumed samples: 23260160 | consumed tokens: 47636807680 | elapsed time per iteration (s): 0.56 | learning rate: 3.949E-05 | global batch size: 256 | lm loss: 2.587811E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.568 | TFLOPs: 43.81 | +7: iteration 90870/ 115203 | consumed samples: 23262720 | consumed tokens: 47642050560 | elapsed time per iteration (s): 0.57 | learning rate: 3.947E-05 | global batch size: 256 | lm loss: 2.594909E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.188 | TFLOPs: 43.02 | +7: iteration 90880/ 115203 | consumed samples: 23265280 | consumed tokens: 47647293440 | elapsed time per iteration (s): 0.57 | learning rate: 3.946E-05 | global batch size: 256 | lm loss: 2.587466E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.237 | TFLOPs: 42.83 | +7: iteration 90890/ 115203 | consumed samples: 23267840 | consumed tokens: 47652536320 | elapsed time per iteration (s): 0.57 | learning rate: 3.944E-05 | global batch size: 256 | lm loss: 2.593111E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.488 | TFLOPs: 42.85 | +7: iteration 90900/ 115203 | consumed samples: 23270400 | consumed tokens: 47657779200 | elapsed time per iteration (s): 0.55 | learning rate: 3.943E-05 | global batch size: 256 | lm loss: 2.593469E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.576 | TFLOPs: 44.01 | +7: iteration 90910/ 115203 | consumed samples: 23272960 | consumed tokens: 47663022080 | elapsed time per iteration (s): 0.58 | learning rate: 3.941E-05 | global batch size: 256 | lm loss: 2.583316E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.626 | TFLOPs: 41.91 | +7: iteration 90920/ 115203 | consumed samples: 23275520 | consumed tokens: 47668264960 | elapsed time per iteration (s): 0.57 | learning rate: 3.939E-05 | global batch size: 256 | lm loss: 2.592938E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.549 | TFLOPs: 42.57 | +7: iteration 90930/ 115203 | consumed samples: 23278080 | consumed tokens: 47673507840 | elapsed time per iteration (s): 0.57 | learning rate: 3.938E-05 | global batch size: 256 | lm loss: 2.595445E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.616 | TFLOPs: 43.15 | +7: iteration 90940/ 115203 | consumed samples: 23280640 | consumed tokens: 47678750720 | elapsed time per iteration (s): 0.55 | learning rate: 3.936E-05 | global batch size: 256 | lm loss: 2.579924E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.588 | TFLOPs: 44.01 | +7: iteration 90950/ 115203 | consumed samples: 23283200 | consumed tokens: 47683993600 | elapsed time per iteration (s): 0.56 | learning rate: 3.935E-05 | global batch size: 256 | lm loss: 2.579633E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.718 | TFLOPs: 43.64 | +7: iteration 90960/ 115203 | consumed samples: 23285760 | consumed tokens: 47689236480 | elapsed time per iteration (s): 0.56 | learning rate: 3.933E-05 | global batch size: 256 | lm loss: 2.580485E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.740 | TFLOPs: 43.64 | +7: iteration 90970/ 115203 | consumed samples: 23288320 | consumed tokens: 47694479360 | elapsed time per iteration (s): 0.55 | learning rate: 3.932E-05 | global batch size: 256 | lm loss: 2.609270E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.674 | TFLOPs: 44.02 | +7: iteration 90980/ 115203 | consumed samples: 23290880 | consumed tokens: 47699722240 | elapsed time per iteration (s): 0.56 | learning rate: 3.930E-05 | global batch size: 256 | lm loss: 2.591657E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.727 | TFLOPs: 43.64 | +7: iteration 90990/ 115203 | consumed samples: 23293440 | consumed tokens: 47704965120 | elapsed time per iteration (s): 0.56 | learning rate: 3.929E-05 | global batch size: 256 | lm loss: 2.582162E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.446 | TFLOPs: 43.90 | +7: iteration 91000/ 115203 | consumed samples: 23296000 | consumed tokens: 47710208000 | elapsed time per iteration (s): 0.55 | learning rate: 3.927E-05 | global batch size: 256 | lm loss: 2.576745E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.714 | TFLOPs: 44.02 | +7: iteration 91010/ 115203 | consumed samples: 23298560 | consumed tokens: 47715450880 | elapsed time per iteration (s): 0.56 | learning rate: 3.926E-05 | global batch size: 256 | lm loss: 2.587339E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.054 | TFLOPs: 43.58 | +7: iteration 91020/ 115203 | consumed samples: 23301120 | consumed tokens: 47720693760 | elapsed time per iteration (s): 0.55 | learning rate: 3.924E-05 | global batch size: 256 | lm loss: 2.588815E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.542 | TFLOPs: 44.00 | +7: iteration 91030/ 115203 | consumed samples: 23303680 | consumed tokens: 47725936640 | elapsed time per iteration (s): 0.55 | learning rate: 3.923E-05 | global batch size: 256 | lm loss: 2.576551E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.572 | TFLOPs: 44.01 | +7: iteration 91040/ 115203 | consumed samples: 23306240 | consumed tokens: 47731179520 | elapsed time per iteration (s): 0.55 | learning rate: 3.921E-05 | global batch size: 256 | lm loss: 2.588770E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.469 | TFLOPs: 44.00 | +7: iteration 91050/ 115203 | consumed samples: 23308800 | consumed tokens: 47736422400 | elapsed time per iteration (s): 0.55 | learning rate: 3.920E-05 | global batch size: 256 | lm loss: 2.587676E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.330 | TFLOPs: 43.98 | +7: iteration 91060/ 115203 | consumed samples: 23311360 | consumed tokens: 47741665280 | elapsed time per iteration (s): 0.55 | learning rate: 3.918E-05 | global batch size: 256 | lm loss: 2.598637E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.456 | TFLOPs: 43.99 | +7: iteration 91070/ 115203 | consumed samples: 23313920 | consumed tokens: 47746908160 | elapsed time per iteration (s): 0.56 | learning rate: 3.916E-05 | global batch size: 256 | lm loss: 2.584006E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.804 | TFLOPs: 43.55 | +7: iteration 91080/ 115203 | consumed samples: 23316480 | consumed tokens: 47752151040 | elapsed time per iteration (s): 0.56 | learning rate: 3.915E-05 | global batch size: 256 | lm loss: 2.592293E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.595 | TFLOPs: 43.53 | +7: iteration 91090/ 115203 | consumed samples: 23319040 | consumed tokens: 47757393920 | elapsed time per iteration (s): 0.56 | learning rate: 3.913E-05 | global batch size: 256 | lm loss: 2.591680E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.252 | TFLOPs: 43.21 | +7: iteration 91100/ 115203 | consumed samples: 23321600 | consumed tokens: 47762636800 | elapsed time per iteration (s): 0.55 | learning rate: 3.912E-05 | global batch size: 256 | lm loss: 2.599437E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.386 | TFLOPs: 43.99 | +7: iteration 91110/ 115203 | consumed samples: 23324160 | consumed tokens: 47767879680 | elapsed time per iteration (s): 0.56 | learning rate: 3.910E-05 | global batch size: 256 | lm loss: 2.584079E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.387 | TFLOPs: 43.61 | +7: iteration 91120/ 115203 | consumed samples: 23326720 | consumed tokens: 47773122560 | elapsed time per iteration (s): 0.55 | learning rate: 3.909E-05 | global batch size: 256 | lm loss: 2.592451E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.488 | TFLOPs: 44.00 | +7: iteration 91130/ 115203 | consumed samples: 23329280 | consumed tokens: 47778365440 | elapsed time per iteration (s): 0.56 | learning rate: 3.907E-05 | global batch size: 256 | lm loss: 2.606560E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.068 | TFLOPs: 43.29 | +7: iteration 91140/ 115203 | consumed samples: 23331840 | consumed tokens: 47783608320 | elapsed time per iteration (s): 0.56 | learning rate: 3.906E-05 | global batch size: 256 | lm loss: 2.594568E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.757 | TFLOPs: 43.93 | +7: iteration 91150/ 115203 | consumed samples: 23334400 | consumed tokens: 47788851200 | elapsed time per iteration (s): 0.55 | learning rate: 3.904E-05 | global batch size: 256 | lm loss: 2.592161E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.368 | TFLOPs: 43.99 | +7: iteration 91160/ 115203 | consumed samples: 23336960 | consumed tokens: 47794094080 | elapsed time per iteration (s): 0.56 | learning rate: 3.903E-05 | global batch size: 256 | lm loss: 2.586795E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.170 | TFLOPs: 43.87 | +7: iteration 91170/ 115203 | consumed samples: 23339520 | consumed tokens: 47799336960 | elapsed time per iteration (s): 0.58 | learning rate: 3.901E-05 | global batch size: 256 | lm loss: 2.584468E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.486 | TFLOPs: 42.00 | +7: iteration 91180/ 115203 | consumed samples: 23342080 | consumed tokens: 47804579840 | elapsed time per iteration (s): 0.57 | learning rate: 3.900E-05 | global batch size: 256 | lm loss: 2.579807E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.363 | TFLOPs: 43.13 | +7: iteration 91190/ 115203 | consumed samples: 23344640 | consumed tokens: 47809822720 | elapsed time per iteration (s): 0.57 | learning rate: 3.898E-05 | global batch size: 256 | lm loss: 2.592194E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.998 | TFLOPs: 42.71 | +7: iteration 91200/ 115203 | consumed samples: 23347200 | consumed tokens: 47815065600 | elapsed time per iteration (s): 0.57 | learning rate: 3.897E-05 | global batch size: 256 | lm loss: 2.588949E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.346 | TFLOPs: 42.55 | +7: iteration 91210/ 115203 | consumed samples: 23349760 | consumed tokens: 47820308480 | elapsed time per iteration (s): 0.56 | learning rate: 3.895E-05 | global batch size: 256 | lm loss: 2.580109E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.912 | TFLOPs: 43.47 | +7: iteration 91220/ 115203 | consumed samples: 23352320 | consumed tokens: 47825551360 | elapsed time per iteration (s): 0.56 | learning rate: 3.894E-05 | global batch size: 256 | lm loss: 2.588097E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.240 | TFLOPs: 43.69 | +7: iteration 91230/ 115203 | consumed samples: 23354880 | consumed tokens: 47830794240 | elapsed time per iteration (s): 0.56 | learning rate: 3.892E-05 | global batch size: 256 | lm loss: 2.581171E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.527 | TFLOPs: 43.33 | +7: iteration 91240/ 115203 | consumed samples: 23357440 | consumed tokens: 47836037120 | elapsed time per iteration (s): 0.57 | learning rate: 3.891E-05 | global batch size: 256 | lm loss: 2.587092E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.934 | TFLOPs: 42.80 | +7: iteration 91250/ 115203 | consumed samples: 23360000 | consumed tokens: 47841280000 | elapsed time per iteration (s): 0.58 | learning rate: 3.889E-05 | global batch size: 256 | lm loss: 2.575941E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.748 | TFLOPs: 42.31 | +7: iteration 91260/ 115203 | consumed samples: 23362560 | consumed tokens: 47846522880 | elapsed time per iteration (s): 0.56 | learning rate: 3.888E-05 | global batch size: 256 | lm loss: 2.585989E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.739 | TFLOPs: 43.26 | +7: iteration 91270/ 115203 | consumed samples: 23365120 | consumed tokens: 47851765760 | elapsed time per iteration (s): 0.57 | learning rate: 3.886E-05 | global batch size: 256 | lm loss: 2.599336E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.530 | TFLOPs: 42.76 | +7: iteration 91280/ 115203 | consumed samples: 23367680 | consumed tokens: 47857008640 | elapsed time per iteration (s): 0.56 | learning rate: 3.885E-05 | global batch size: 256 | lm loss: 2.601712E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.124 | TFLOPs: 43.49 | +7: iteration 91290/ 115203 | consumed samples: 23370240 | consumed tokens: 47862251520 | elapsed time per iteration (s): 0.57 | learning rate: 3.883E-05 | global batch size: 256 | lm loss: 2.589801E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.244 | TFLOPs: 42.45 | +7: iteration 91300/ 115203 | consumed samples: 23372800 | consumed tokens: 47867494400 | elapsed time per iteration (s): 0.58 | learning rate: 3.881E-05 | global batch size: 256 | lm loss: 2.594176E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.515 | TFLOPs: 42.38 | +7: iteration 91310/ 115203 | consumed samples: 23375360 | consumed tokens: 47872737280 | elapsed time per iteration (s): 0.55 | learning rate: 3.880E-05 | global batch size: 256 | lm loss: 2.588754E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.615 | TFLOPs: 44.01 | +7: iteration 91320/ 115203 | consumed samples: 23377920 | consumed tokens: 47877980160 | elapsed time per iteration (s): 0.57 | learning rate: 3.878E-05 | global batch size: 256 | lm loss: 2.580065E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.630 | TFLOPs: 42.77 | +7: iteration 91330/ 115203 | consumed samples: 23380480 | consumed tokens: 47883223040 | elapsed time per iteration (s): 0.56 | learning rate: 3.877E-05 | global batch size: 256 | lm loss: 2.580114E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.532 | TFLOPs: 43.24 | +7: iteration 91340/ 115203 | consumed samples: 23383040 | consumed tokens: 47888465920 | elapsed time per iteration (s): 0.57 | learning rate: 3.875E-05 | global batch size: 256 | lm loss: 2.587101E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.407 | TFLOPs: 42.94 | +7: iteration 91350/ 115203 | consumed samples: 23385600 | consumed tokens: 47893708800 | elapsed time per iteration (s): 0.57 | learning rate: 3.874E-05 | global batch size: 256 | lm loss: 2.589326E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.573 | TFLOPs: 42.67 | +7: iteration 91360/ 115203 | consumed samples: 23388160 | consumed tokens: 47898951680 | elapsed time per iteration (s): 0.56 | learning rate: 3.872E-05 | global batch size: 256 | lm loss: 2.583872E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.913 | TFLOPs: 43.85 | +7: iteration 91370/ 115203 | consumed samples: 23390720 | consumed tokens: 47904194560 | elapsed time per iteration (s): 0.57 | learning rate: 3.871E-05 | global batch size: 256 | lm loss: 2.587088E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.567 | TFLOPs: 42.77 | +7: iteration 91380/ 115203 | consumed samples: 23393280 | consumed tokens: 47909437440 | elapsed time per iteration (s): 0.57 | learning rate: 3.869E-05 | global batch size: 256 | lm loss: 2.583073E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.732 | TFLOPs: 43.16 | +7: iteration 91390/ 115203 | consumed samples: 23395840 | consumed tokens: 47914680320 | elapsed time per iteration (s): 0.57 | learning rate: 3.868E-05 | global batch size: 256 | lm loss: 2.589415E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.782 | TFLOPs: 42.79 | +7: iteration 91400/ 115203 | consumed samples: 23398400 | consumed tokens: 47919923200 | elapsed time per iteration (s): 0.56 | learning rate: 3.866E-05 | global batch size: 256 | lm loss: 2.585858E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.376 | TFLOPs: 43.42 | +7: iteration 91410/ 115203 | consumed samples: 23400960 | consumed tokens: 47925166080 | elapsed time per iteration (s): 0.56 | learning rate: 3.865E-05 | global batch size: 256 | lm loss: 2.579897E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.922 | TFLOPs: 43.66 | +7: iteration 91420/ 115203 | consumed samples: 23403520 | consumed tokens: 47930408960 | elapsed time per iteration (s): 0.57 | learning rate: 3.863E-05 | global batch size: 256 | lm loss: 2.579118E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.584 | TFLOPs: 43.15 | +7: iteration 91430/ 115203 | consumed samples: 23406080 | consumed tokens: 47935651840 | elapsed time per iteration (s): 0.57 | learning rate: 3.862E-05 | global batch size: 256 | lm loss: 2.594630E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.903 | TFLOPs: 43.18 | +7: iteration 91440/ 115203 | consumed samples: 23408640 | consumed tokens: 47940894720 | elapsed time per iteration (s): 0.57 | learning rate: 3.860E-05 | global batch size: 256 | lm loss: 2.593647E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.881 | TFLOPs: 42.70 | +7: iteration 91450/ 115203 | consumed samples: 23411200 | consumed tokens: 47946137600 | elapsed time per iteration (s): 0.56 | learning rate: 3.859E-05 | global batch size: 256 | lm loss: 2.577720E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.228 | TFLOPs: 43.31 | +7: iteration 91460/ 115203 | consumed samples: 23413760 | consumed tokens: 47951380480 | elapsed time per iteration (s): 0.56 | learning rate: 3.857E-05 | global batch size: 256 | lm loss: 2.580171E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.201 | TFLOPs: 43.49 | +7: iteration 91470/ 115203 | consumed samples: 23416320 | consumed tokens: 47956623360 | elapsed time per iteration (s): 0.57 | learning rate: 3.856E-05 | global batch size: 256 | lm loss: 2.572650E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.357 | TFLOPs: 42.94 | +7: iteration 91480/ 115203 | consumed samples: 23418880 | consumed tokens: 47961866240 | elapsed time per iteration (s): 0.57 | learning rate: 3.854E-05 | global batch size: 256 | lm loss: 2.590635E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.483 | TFLOPs: 42.76 | +7: iteration 91490/ 115203 | consumed samples: 23421440 | consumed tokens: 47967109120 | elapsed time per iteration (s): 0.57 | learning rate: 3.853E-05 | global batch size: 256 | lm loss: 2.585158E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.577 | TFLOPs: 43.15 | +7: iteration 91500/ 115203 | consumed samples: 23424000 | consumed tokens: 47972352000 | elapsed time per iteration (s): 0.57 | learning rate: 3.851E-05 | global batch size: 256 | lm loss: 2.583209E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.202 | TFLOPs: 42.73 | +7: iteration 91510/ 115203 | consumed samples: 23426560 | consumed tokens: 47977594880 | elapsed time per iteration (s): 0.56 | learning rate: 3.850E-05 | global batch size: 256 | lm loss: 2.581396E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.750 | TFLOPs: 43.26 | +7: iteration 91520/ 115203 | consumed samples: 23429120 | consumed tokens: 47982837760 | elapsed time per iteration (s): 0.56 | learning rate: 3.848E-05 | global batch size: 256 | lm loss: 2.586633E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.317 | TFLOPs: 43.22 | +7: iteration 91530/ 115203 | consumed samples: 23431680 | consumed tokens: 47988080640 | elapsed time per iteration (s): 0.56 | learning rate: 3.847E-05 | global batch size: 256 | lm loss: 2.587105E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.070 | TFLOPs: 43.39 | +7: iteration 91540/ 115203 | consumed samples: 23434240 | consumed tokens: 47993323520 | elapsed time per iteration (s): 0.56 | learning rate: 3.845E-05 | global batch size: 256 | lm loss: 2.590565E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.930 | TFLOPs: 43.37 | +7: iteration 91550/ 115203 | consumed samples: 23436800 | consumed tokens: 47998566400 | elapsed time per iteration (s): 0.57 | learning rate: 3.844E-05 | global batch size: 256 | lm loss: 2.574804E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.936 | TFLOPs: 42.52 | +7: iteration 91560/ 115203 | consumed samples: 23439360 | consumed tokens: 48003809280 | elapsed time per iteration (s): 0.58 | learning rate: 3.842E-05 | global batch size: 256 | lm loss: 2.588858E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.911 | TFLOPs: 42.42 | +7: iteration 91570/ 115203 | consumed samples: 23441920 | consumed tokens: 48009052160 | elapsed time per iteration (s): 0.57 | learning rate: 3.841E-05 | global batch size: 256 | lm loss: 2.592682E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.869 | TFLOPs: 43.08 | +7: iteration 91580/ 115203 | consumed samples: 23444480 | consumed tokens: 48014295040 | elapsed time per iteration (s): 0.59 | learning rate: 3.839E-05 | global batch size: 256 | lm loss: 2.583806E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.937 | TFLOPs: 41.66 | +7: iteration 91590/ 115203 | consumed samples: 23447040 | consumed tokens: 48019537920 | elapsed time per iteration (s): 0.56 | learning rate: 3.838E-05 | global batch size: 256 | lm loss: 2.579104E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.567 | TFLOPs: 43.72 | +7: iteration 91600/ 115203 | consumed samples: 23449600 | consumed tokens: 48024780800 | elapsed time per iteration (s): 0.57 | learning rate: 3.836E-05 | global batch size: 256 | lm loss: 2.580289E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.683 | TFLOPs: 42.97 | +7: iteration 91610/ 115203 | consumed samples: 23452160 | consumed tokens: 48030023680 | elapsed time per iteration (s): 0.56 | learning rate: 3.835E-05 | global batch size: 256 | lm loss: 2.582251E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.405 | TFLOPs: 43.42 | +7: iteration 91620/ 115203 | consumed samples: 23454720 | consumed tokens: 48035266560 | elapsed time per iteration (s): 0.57 | learning rate: 3.833E-05 | global batch size: 256 | lm loss: 2.576454E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.668 | TFLOPs: 42.97 | +7: iteration 91630/ 115203 | consumed samples: 23457280 | consumed tokens: 48040509440 | elapsed time per iteration (s): 0.58 | learning rate: 3.832E-05 | global batch size: 256 | lm loss: 2.586490E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.252 | TFLOPs: 42.35 | +7: iteration 91640/ 115203 | consumed samples: 23459840 | consumed tokens: 48045752320 | elapsed time per iteration (s): 0.58 | learning rate: 3.830E-05 | global batch size: 256 | lm loss: 2.577479E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.323 | TFLOPs: 42.27 | +7: iteration 91650/ 115203 | consumed samples: 23462400 | consumed tokens: 48050995200 | elapsed time per iteration (s): 0.57 | learning rate: 3.829E-05 | global batch size: 256 | lm loss: 2.585456E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.866 | TFLOPs: 42.79 | +7: iteration 91660/ 115203 | consumed samples: 23464960 | consumed tokens: 48056238080 | elapsed time per iteration (s): 0.56 | learning rate: 3.827E-05 | global batch size: 256 | lm loss: 2.586989E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.733 | TFLOPs: 43.26 | +7: iteration 91670/ 115203 | consumed samples: 23467520 | consumed tokens: 48061480960 | elapsed time per iteration (s): 0.57 | learning rate: 3.826E-05 | global batch size: 256 | lm loss: 2.578631E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.689 | TFLOPs: 42.68 | +7: iteration 91680/ 115203 | consumed samples: 23470080 | consumed tokens: 48066723840 | elapsed time per iteration (s): 0.58 | learning rate: 3.824E-05 | global batch size: 256 | lm loss: 2.581753E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.307 | TFLOPs: 42.36 | +7: iteration 91690/ 115203 | consumed samples: 23472640 | consumed tokens: 48071966720 | elapsed time per iteration (s): 0.58 | learning rate: 3.823E-05 | global batch size: 256 | lm loss: 2.585692E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.248 | TFLOPs: 42.26 | +7: iteration 91700/ 115203 | consumed samples: 23475200 | consumed tokens: 48077209600 | elapsed time per iteration (s): 0.56 | learning rate: 3.821E-05 | global batch size: 256 | lm loss: 2.586959E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.375 | TFLOPs: 43.51 | +7: iteration 91710/ 115203 | consumed samples: 23477760 | consumed tokens: 48082452480 | elapsed time per iteration (s): 0.57 | learning rate: 3.820E-05 | global batch size: 256 | lm loss: 2.580177E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.572 | TFLOPs: 42.58 | +7: iteration 91720/ 115203 | consumed samples: 23480320 | consumed tokens: 48087695360 | elapsed time per iteration (s): 0.57 | learning rate: 3.818E-05 | global batch size: 256 | lm loss: 2.587933E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.456 | TFLOPs: 42.95 | +7: iteration 91730/ 115203 | consumed samples: 23482880 | consumed tokens: 48092938240 | elapsed time per iteration (s): 0.57 | learning rate: 3.817E-05 | global batch size: 256 | lm loss: 2.592254E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.894 | TFLOPs: 42.70 | +7: iteration 91740/ 115203 | consumed samples: 23485440 | consumed tokens: 48098181120 | elapsed time per iteration (s): 0.57 | learning rate: 3.815E-05 | global batch size: 256 | lm loss: 2.578856E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.543 | TFLOPs: 43.15 | +7: iteration 91750/ 115203 | consumed samples: 23488000 | consumed tokens: 48103424000 | elapsed time per iteration (s): 0.58 | learning rate: 3.814E-05 | global batch size: 256 | lm loss: 2.589309E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.879 | TFLOPs: 42.41 | +7: iteration 91760/ 115203 | consumed samples: 23490560 | consumed tokens: 48108666880 | elapsed time per iteration (s): 0.57 | learning rate: 3.812E-05 | global batch size: 256 | lm loss: 2.591478E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.825 | TFLOPs: 42.79 | +7: iteration 91770/ 115203 | consumed samples: 23493120 | consumed tokens: 48113909760 | elapsed time per iteration (s): 0.57 | learning rate: 3.811E-05 | global batch size: 256 | lm loss: 2.583069E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.497 | TFLOPs: 42.95 | +7: iteration 91780/ 115203 | consumed samples: 23495680 | consumed tokens: 48119152640 | elapsed time per iteration (s): 0.57 | learning rate: 3.809E-05 | global batch size: 256 | lm loss: 2.567762E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.564 | TFLOPs: 42.86 | +7: iteration 91790/ 115203 | consumed samples: 23498240 | consumed tokens: 48124395520 | elapsed time per iteration (s): 0.58 | learning rate: 3.808E-05 | global batch size: 256 | lm loss: 2.588097E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.380 | TFLOPs: 42.27 | +7: iteration 91800/ 115203 | consumed samples: 23500800 | consumed tokens: 48129638400 | elapsed time per iteration (s): 0.57 | learning rate: 3.806E-05 | global batch size: 256 | lm loss: 2.594284E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.251 | TFLOPs: 42.45 | +7: iteration 91810/ 115203 | consumed samples: 23503360 | consumed tokens: 48134881280 | elapsed time per iteration (s): 0.59 | learning rate: 3.805E-05 | global batch size: 256 | lm loss: 2.577593E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.384 | TFLOPs: 41.41 | +7: iteration 91820/ 115203 | consumed samples: 23505920 | consumed tokens: 48140124160 | elapsed time per iteration (s): 0.60 | learning rate: 3.803E-05 | global batch size: 256 | lm loss: 2.594778E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.060 | TFLOPs: 40.91 | +7: iteration 91830/ 115203 | consumed samples: 23508480 | consumed tokens: 48145367040 | elapsed time per iteration (s): 0.57 | learning rate: 3.802E-05 | global batch size: 256 | lm loss: 2.586658E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.485 | TFLOPs: 42.47 | +7: iteration 91840/ 115203 | consumed samples: 23511040 | consumed tokens: 48150609920 | elapsed time per iteration (s): 0.59 | learning rate: 3.800E-05 | global batch size: 256 | lm loss: 2.588947E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.502 | TFLOPs: 41.33 | +7: iteration 91850/ 115203 | consumed samples: 23513600 | consumed tokens: 48155852800 | elapsed time per iteration (s): 0.58 | learning rate: 3.799E-05 | global batch size: 256 | lm loss: 2.590507E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.105 | TFLOPs: 42.15 | +7: iteration 91860/ 115203 | consumed samples: 23516160 | consumed tokens: 48161095680 | elapsed time per iteration (s): 0.59 | learning rate: 3.797E-05 | global batch size: 256 | lm loss: 2.595930E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.502 | TFLOPs: 41.33 | +7: iteration 91870/ 115203 | consumed samples: 23518720 | consumed tokens: 48166338560 | elapsed time per iteration (s): 0.57 | learning rate: 3.796E-05 | global batch size: 256 | lm loss: 2.604986E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.298 | TFLOPs: 42.93 | +7: iteration 91880/ 115203 | consumed samples: 23521280 | consumed tokens: 48171581440 | elapsed time per iteration (s): 0.57 | learning rate: 3.794E-05 | global batch size: 256 | lm loss: 2.585588E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.460 | TFLOPs: 42.57 | +7: iteration 91890/ 115203 | consumed samples: 23523840 | consumed tokens: 48176824320 | elapsed time per iteration (s): 0.57 | learning rate: 3.793E-05 | global batch size: 256 | lm loss: 2.587615E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.403 | TFLOPs: 42.85 | +7: iteration 91900/ 115203 | consumed samples: 23526400 | consumed tokens: 48182067200 | elapsed time per iteration (s): 0.58 | learning rate: 3.791E-05 | global batch size: 256 | lm loss: 2.580936E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.420 | TFLOPs: 42.08 | +7: iteration 91910/ 115203 | consumed samples: 23528960 | consumed tokens: 48187310080 | elapsed time per iteration (s): 0.58 | learning rate: 3.790E-05 | global batch size: 256 | lm loss: 2.597369E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.848 | TFLOPs: 42.03 | +7: iteration 91920/ 115203 | consumed samples: 23531520 | consumed tokens: 48192552960 | elapsed time per iteration (s): 0.58 | learning rate: 3.788E-05 | global batch size: 256 | lm loss: 2.577529E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.409 | TFLOPs: 42.18 | +7: iteration 91930/ 115203 | consumed samples: 23534080 | consumed tokens: 48197795840 | elapsed time per iteration (s): 0.60 | learning rate: 3.787E-05 | global batch size: 256 | lm loss: 2.591193E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.751 | TFLOPs: 40.97 | +7: iteration 91940/ 115203 | consumed samples: 23536640 | consumed tokens: 48203038720 | elapsed time per iteration (s): 0.59 | learning rate: 3.785E-05 | global batch size: 256 | lm loss: 2.578904E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.938 | TFLOPs: 41.09 | +7: iteration 91950/ 115203 | consumed samples: 23539200 | consumed tokens: 48208281600 | elapsed time per iteration (s): 0.59 | learning rate: 3.784E-05 | global batch size: 256 | lm loss: 2.607963E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.834 | TFLOPs: 41.08 | +7: iteration 91960/ 115203 | consumed samples: 23541760 | consumed tokens: 48213524480 | elapsed time per iteration (s): 0.57 | learning rate: 3.783E-05 | global batch size: 256 | lm loss: 2.587742E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.719 | TFLOPs: 42.88 | +7: iteration 91970/ 115203 | consumed samples: 23544320 | consumed tokens: 48218767360 | elapsed time per iteration (s): 0.58 | learning rate: 3.781E-05 | global batch size: 256 | lm loss: 2.570788E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.500 | TFLOPs: 42.28 | +7: iteration 91980/ 115203 | consumed samples: 23546880 | consumed tokens: 48224010240 | elapsed time per iteration (s): 0.60 | learning rate: 3.780E-05 | global batch size: 256 | lm loss: 2.579405E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.156 | TFLOPs: 41.01 | +7: iteration 91990/ 115203 | consumed samples: 23549440 | consumed tokens: 48229253120 | elapsed time per iteration (s): 0.59 | learning rate: 3.778E-05 | global batch size: 256 | lm loss: 2.583060E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.286 | TFLOPs: 41.21 | +0: [2023-03-17 03:15:52,423] [INFO] [logging.py:68:log_dist] [Rank 0] step=92000, skipped=0, lr=[3.776612403864962e-05, 3.776612403864962e-05, 3.776612403864962e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 92000/ 115203 | consumed samples: 23552000 | consumed tokens: 48234496000 | elapsed time per iteration (s): 0.60 | learning rate: 3.777E-05 | global batch size: 256 | lm loss: 2.583251E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.307 | TFLOPs: 40.45 | +0: steps: 92000 loss: 2.5822 iter time (s): 0.566 samples/sec: 452.578 +7: iteration 92010/ 115203 | consumed samples: 23554560 | consumed tokens: 48239738880 | elapsed time per iteration (s): 0.58 | learning rate: 3.775E-05 | global batch size: 256 | lm loss: 2.590544E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.173 | TFLOPs: 42.06 | +7: iteration 92020/ 115203 | consumed samples: 23557120 | consumed tokens: 48244981760 | elapsed time per iteration (s): 0.58 | learning rate: 3.774E-05 | global batch size: 256 | lm loss: 2.593632E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.644 | TFLOPs: 41.82 | +7: iteration 92030/ 115203 | consumed samples: 23559680 | consumed tokens: 48250224640 | elapsed time per iteration (s): 0.58 | learning rate: 3.772E-05 | global batch size: 256 | lm loss: 2.590141E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.748 | TFLOPs: 42.40 | +7: iteration 92040/ 115203 | consumed samples: 23562240 | consumed tokens: 48255467520 | elapsed time per iteration (s): 0.59 | learning rate: 3.771E-05 | global batch size: 256 | lm loss: 2.581378E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.296 | TFLOPs: 41.69 | +7: iteration 92050/ 115203 | consumed samples: 23564800 | consumed tokens: 48260710400 | elapsed time per iteration (s): 0.57 | learning rate: 3.769E-05 | global batch size: 256 | lm loss: 2.590493E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.709 | TFLOPs: 43.07 | +7: iteration 92060/ 115203 | consumed samples: 23567360 | consumed tokens: 48265953280 | elapsed time per iteration (s): 0.56 | learning rate: 3.768E-05 | global batch size: 256 | lm loss: 2.578333E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.296 | TFLOPs: 43.69 | +7: iteration 92070/ 115203 | consumed samples: 23569920 | consumed tokens: 48271196160 | elapsed time per iteration (s): 0.58 | learning rate: 3.766E-05 | global batch size: 256 | lm loss: 2.607238E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.728 | TFLOPs: 42.02 | +7: iteration 92080/ 115203 | consumed samples: 23572480 | consumed tokens: 48276439040 | elapsed time per iteration (s): 0.58 | learning rate: 3.765E-05 | global batch size: 256 | lm loss: 2.589701E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.660 | TFLOPs: 42.01 | +7: iteration 92090/ 115203 | consumed samples: 23575040 | consumed tokens: 48281681920 | elapsed time per iteration (s): 0.62 | learning rate: 3.763E-05 | global batch size: 256 | lm loss: 2.596311E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 411.384 | TFLOPs: 39.22 | +7: iteration 92100/ 115203 | consumed samples: 23577600 | consumed tokens: 48286924800 | elapsed time per iteration (s): 0.57 | learning rate: 3.762E-05 | global batch size: 256 | lm loss: 2.584192E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.469 | TFLOPs: 42.85 | +7: iteration 92110/ 115203 | consumed samples: 23580160 | consumed tokens: 48292167680 | elapsed time per iteration (s): 0.57 | learning rate: 3.760E-05 | global batch size: 256 | lm loss: 2.587821E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.968 | TFLOPs: 42.90 | +7: iteration 92120/ 115203 | consumed samples: 23582720 | consumed tokens: 48297410560 | elapsed time per iteration (s): 0.57 | learning rate: 3.759E-05 | global batch size: 256 | lm loss: 2.580986E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.897 | TFLOPs: 42.61 | +7: iteration 92130/ 115203 | consumed samples: 23585280 | consumed tokens: 48302653440 | elapsed time per iteration (s): 0.58 | learning rate: 3.757E-05 | global batch size: 256 | lm loss: 2.582364E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.768 | TFLOPs: 42.02 | +7: iteration 92140/ 115203 | consumed samples: 23587840 | consumed tokens: 48307896320 | elapsed time per iteration (s): 0.60 | learning rate: 3.756E-05 | global batch size: 256 | lm loss: 2.586638E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.556 | TFLOPs: 40.76 | +7: iteration 92150/ 115203 | consumed samples: 23590400 | consumed tokens: 48313139200 | elapsed time per iteration (s): 0.57 | learning rate: 3.754E-05 | global batch size: 256 | lm loss: 2.585730E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.525 | TFLOPs: 42.95 | +7: iteration 92160/ 115203 | consumed samples: 23592960 | consumed tokens: 48318382080 | elapsed time per iteration (s): 0.57 | learning rate: 3.753E-05 | global batch size: 256 | lm loss: 2.597506E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.648 | TFLOPs: 42.77 | +7: iteration 92170/ 115203 | consumed samples: 23595520 | consumed tokens: 48323624960 | elapsed time per iteration (s): 0.56 | learning rate: 3.752E-05 | global batch size: 256 | lm loss: 2.594370E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.618 | TFLOPs: 43.44 | +7: iteration 92180/ 115203 | consumed samples: 23598080 | consumed tokens: 48328867840 | elapsed time per iteration (s): 0.57 | learning rate: 3.750E-05 | global batch size: 256 | lm loss: 2.583161E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.409 | TFLOPs: 42.94 | +7: iteration 92190/ 115203 | consumed samples: 23600640 | consumed tokens: 48334110720 | elapsed time per iteration (s): 0.56 | learning rate: 3.749E-05 | global batch size: 256 | lm loss: 2.564863E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.180 | TFLOPs: 43.30 | +7: iteration 92200/ 115203 | consumed samples: 23603200 | consumed tokens: 48339353600 | elapsed time per iteration (s): 0.56 | learning rate: 3.747E-05 | global batch size: 256 | lm loss: 2.591675E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.820 | TFLOPs: 43.74 | +7: iteration 92210/ 115203 | consumed samples: 23605760 | consumed tokens: 48344596480 | elapsed time per iteration (s): 0.57 | learning rate: 3.746E-05 | global batch size: 256 | lm loss: 2.589882E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.098 | TFLOPs: 42.82 | +7: iteration 92220/ 115203 | consumed samples: 23608320 | consumed tokens: 48349839360 | elapsed time per iteration (s): 0.56 | learning rate: 3.744E-05 | global batch size: 256 | lm loss: 2.587740E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.115 | TFLOPs: 43.68 | +7: iteration 92230/ 115203 | consumed samples: 23610880 | consumed tokens: 48355082240 | elapsed time per iteration (s): 0.56 | learning rate: 3.743E-05 | global batch size: 256 | lm loss: 2.590195E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.376 | TFLOPs: 43.22 | +7: iteration 92240/ 115203 | consumed samples: 23613440 | consumed tokens: 48360325120 | elapsed time per iteration (s): 0.57 | learning rate: 3.741E-05 | global batch size: 256 | lm loss: 2.568506E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.588 | TFLOPs: 42.77 | +7: iteration 92250/ 115203 | consumed samples: 23616000 | consumed tokens: 48365568000 | elapsed time per iteration (s): 0.57 | learning rate: 3.740E-05 | global batch size: 256 | lm loss: 2.569972E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.578 | TFLOPs: 42.86 | +7: iteration 92260/ 115203 | consumed samples: 23618560 | consumed tokens: 48370810880 | elapsed time per iteration (s): 0.56 | learning rate: 3.738E-05 | global batch size: 256 | lm loss: 2.583162E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.484 | TFLOPs: 43.23 | +7: iteration 92270/ 115203 | consumed samples: 23621120 | consumed tokens: 48376053760 | elapsed time per iteration (s): 0.57 | learning rate: 3.737E-05 | global batch size: 256 | lm loss: 2.578959E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.024 | TFLOPs: 42.90 | +7: iteration 92280/ 115203 | consumed samples: 23623680 | consumed tokens: 48381296640 | elapsed time per iteration (s): 0.59 | learning rate: 3.735E-05 | global batch size: 256 | lm loss: 2.581724E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.529 | TFLOPs: 41.71 | +7: iteration 92290/ 115203 | consumed samples: 23626240 | consumed tokens: 48386539520 | elapsed time per iteration (s): 0.55 | learning rate: 3.734E-05 | global batch size: 256 | lm loss: 2.591764E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.880 | TFLOPs: 44.04 | +7: iteration 92300/ 115203 | consumed samples: 23628800 | consumed tokens: 48391782400 | elapsed time per iteration (s): 0.56 | learning rate: 3.732E-05 | global batch size: 256 | lm loss: 2.588261E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.551 | TFLOPs: 43.24 | +7: iteration 92310/ 115203 | consumed samples: 23631360 | consumed tokens: 48397025280 | elapsed time per iteration (s): 0.56 | learning rate: 3.731E-05 | global batch size: 256 | lm loss: 2.592408E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.827 | TFLOPs: 43.65 | +7: iteration 92320/ 115203 | consumed samples: 23633920 | consumed tokens: 48402268160 | elapsed time per iteration (s): 0.57 | learning rate: 3.730E-05 | global batch size: 256 | lm loss: 2.589407E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.497 | TFLOPs: 42.85 | +7: iteration 92330/ 115203 | consumed samples: 23636480 | consumed tokens: 48407511040 | elapsed time per iteration (s): 0.57 | learning rate: 3.728E-05 | global batch size: 256 | lm loss: 2.592339E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.539 | TFLOPs: 42.95 | +7: iteration 92340/ 115203 | consumed samples: 23639040 | consumed tokens: 48412753920 | elapsed time per iteration (s): 0.59 | learning rate: 3.727E-05 | global batch size: 256 | lm loss: 2.596982E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.234 | TFLOPs: 41.69 | +7: iteration 92350/ 115203 | consumed samples: 23641600 | consumed tokens: 48417996800 | elapsed time per iteration (s): 0.56 | learning rate: 3.725E-05 | global batch size: 256 | lm loss: 2.596006E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.555 | TFLOPs: 43.24 | +7: iteration 92360/ 115203 | consumed samples: 23644160 | consumed tokens: 48423239680 | elapsed time per iteration (s): 0.56 | learning rate: 3.724E-05 | global batch size: 256 | lm loss: 2.578443E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.914 | TFLOPs: 43.56 | +7: iteration 92370/ 115203 | consumed samples: 23646720 | consumed tokens: 48428482560 | elapsed time per iteration (s): 0.56 | learning rate: 3.722E-05 | global batch size: 256 | lm loss: 2.572031E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.544 | TFLOPs: 43.34 | +7: iteration 92380/ 115203 | consumed samples: 23649280 | consumed tokens: 48433725440 | elapsed time per iteration (s): 0.55 | learning rate: 3.721E-05 | global batch size: 256 | lm loss: 2.586354E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.882 | TFLOPs: 44.04 | +7: iteration 92390/ 115203 | consumed samples: 23651840 | consumed tokens: 48438968320 | elapsed time per iteration (s): 0.55 | learning rate: 3.719E-05 | global batch size: 256 | lm loss: 2.587521E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.828 | TFLOPs: 44.03 | +7: iteration 92400/ 115203 | consumed samples: 23654400 | consumed tokens: 48444211200 | elapsed time per iteration (s): 0.55 | learning rate: 3.718E-05 | global batch size: 256 | lm loss: 2.599729E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.838 | TFLOPs: 44.03 | +7: iteration 92410/ 115203 | consumed samples: 23656960 | consumed tokens: 48449454080 | elapsed time per iteration (s): 0.56 | learning rate: 3.716E-05 | global batch size: 256 | lm loss: 2.573412E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.425 | TFLOPs: 43.71 | +7: iteration 92420/ 115203 | consumed samples: 23659520 | consumed tokens: 48454696960 | elapsed time per iteration (s): 0.55 | learning rate: 3.715E-05 | global batch size: 256 | lm loss: 2.579505E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.623 | TFLOPs: 44.01 | +7: iteration 92430/ 115203 | consumed samples: 23662080 | consumed tokens: 48459939840 | elapsed time per iteration (s): 0.56 | learning rate: 3.714E-05 | global batch size: 256 | lm loss: 2.589470E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.024 | TFLOPs: 43.48 | +7: iteration 92440/ 115203 | consumed samples: 23664640 | consumed tokens: 48465182720 | elapsed time per iteration (s): 0.56 | learning rate: 3.712E-05 | global batch size: 256 | lm loss: 2.567635E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.405 | TFLOPs: 43.89 | +7: iteration 92450/ 115203 | consumed samples: 23667200 | consumed tokens: 48470425600 | elapsed time per iteration (s): 0.55 | learning rate: 3.711E-05 | global batch size: 256 | lm loss: 2.585498E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.643 | TFLOPs: 44.01 | +7: iteration 92460/ 115203 | consumed samples: 23669760 | consumed tokens: 48475668480 | elapsed time per iteration (s): 0.56 | learning rate: 3.709E-05 | global batch size: 256 | lm loss: 2.586921E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.146 | TFLOPs: 43.39 | +7: iteration 92470/ 115203 | consumed samples: 23672320 | consumed tokens: 48480911360 | elapsed time per iteration (s): 0.56 | learning rate: 3.708E-05 | global batch size: 256 | lm loss: 2.577489E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.227 | TFLOPs: 43.69 | +7: iteration 92480/ 115203 | consumed samples: 23674880 | consumed tokens: 48486154240 | elapsed time per iteration (s): 0.56 | learning rate: 3.706E-05 | global batch size: 256 | lm loss: 2.584048E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.848 | TFLOPs: 43.65 | +7: iteration 92490/ 115203 | consumed samples: 23677440 | consumed tokens: 48491397120 | elapsed time per iteration (s): 0.57 | learning rate: 3.705E-05 | global batch size: 256 | lm loss: 2.592501E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.209 | TFLOPs: 42.73 | +7: iteration 92500/ 115203 | consumed samples: 23680000 | consumed tokens: 48496640000 | elapsed time per iteration (s): 0.56 | learning rate: 3.703E-05 | global batch size: 256 | lm loss: 2.581983E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.012 | TFLOPs: 43.48 | +7: iteration 92510/ 115203 | consumed samples: 23682560 | consumed tokens: 48501882880 | elapsed time per iteration (s): 0.56 | learning rate: 3.702E-05 | global batch size: 256 | lm loss: 2.586764E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.020 | TFLOPs: 43.48 | +7: iteration 92520/ 115203 | consumed samples: 23685120 | consumed tokens: 48507125760 | elapsed time per iteration (s): 0.56 | learning rate: 3.700E-05 | global batch size: 256 | lm loss: 2.572665E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.766 | TFLOPs: 43.64 | +7: iteration 92530/ 115203 | consumed samples: 23687680 | consumed tokens: 48512368640 | elapsed time per iteration (s): 0.56 | learning rate: 3.699E-05 | global batch size: 256 | lm loss: 2.590264E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.104 | TFLOPs: 43.29 | +7: iteration 92540/ 115203 | consumed samples: 23690240 | consumed tokens: 48517611520 | elapsed time per iteration (s): 0.57 | learning rate: 3.698E-05 | global batch size: 256 | lm loss: 2.598523E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.943 | TFLOPs: 42.80 | +7: iteration 92550/ 115203 | consumed samples: 23692800 | consumed tokens: 48522854400 | elapsed time per iteration (s): 0.56 | learning rate: 3.696E-05 | global batch size: 256 | lm loss: 2.592031E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.343 | TFLOPs: 43.51 | +7: iteration 92560/ 115203 | consumed samples: 23695360 | consumed tokens: 48528097280 | elapsed time per iteration (s): 0.57 | learning rate: 3.695E-05 | global batch size: 256 | lm loss: 2.598227E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.733 | TFLOPs: 43.07 | +7: iteration 92570/ 115203 | consumed samples: 23697920 | consumed tokens: 48533340160 | elapsed time per iteration (s): 0.56 | learning rate: 3.693E-05 | global batch size: 256 | lm loss: 2.577511E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.822 | TFLOPs: 43.36 | +7: iteration 92580/ 115203 | consumed samples: 23700480 | consumed tokens: 48538583040 | elapsed time per iteration (s): 0.56 | learning rate: 3.692E-05 | global batch size: 256 | lm loss: 2.589590E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.817 | TFLOPs: 43.74 | +7: iteration 92590/ 115203 | consumed samples: 23703040 | consumed tokens: 48543825920 | elapsed time per iteration (s): 0.56 | learning rate: 3.690E-05 | global batch size: 256 | lm loss: 2.584124E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.206 | TFLOPs: 43.78 | +7: iteration 92600/ 115203 | consumed samples: 23705600 | consumed tokens: 48549068800 | elapsed time per iteration (s): 0.57 | learning rate: 3.689E-05 | global batch size: 256 | lm loss: 2.587715E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.568 | TFLOPs: 42.77 | +7: iteration 92610/ 115203 | consumed samples: 23708160 | consumed tokens: 48554311680 | elapsed time per iteration (s): 0.56 | learning rate: 3.687E-05 | global batch size: 256 | lm loss: 2.577048E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.852 | TFLOPs: 43.75 | +7: iteration 92620/ 115203 | consumed samples: 23710720 | consumed tokens: 48559554560 | elapsed time per iteration (s): 0.56 | learning rate: 3.686E-05 | global batch size: 256 | lm loss: 2.581136E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.744 | TFLOPs: 43.74 | +7: iteration 92630/ 115203 | consumed samples: 23713280 | consumed tokens: 48564797440 | elapsed time per iteration (s): 0.57 | learning rate: 3.685E-05 | global batch size: 256 | lm loss: 2.587950E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.265 | TFLOPs: 42.64 | +7: iteration 92640/ 115203 | consumed samples: 23715840 | consumed tokens: 48570040320 | elapsed time per iteration (s): 0.56 | learning rate: 3.683E-05 | global batch size: 256 | lm loss: 2.577408E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.964 | TFLOPs: 43.28 | +7: iteration 92650/ 115203 | consumed samples: 23718400 | consumed tokens: 48575283200 | elapsed time per iteration (s): 0.56 | learning rate: 3.682E-05 | global batch size: 256 | lm loss: 2.579016E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.614 | TFLOPs: 43.63 | +7: iteration 92660/ 115203 | consumed samples: 23720960 | consumed tokens: 48580526080 | elapsed time per iteration (s): 0.57 | learning rate: 3.680E-05 | global batch size: 256 | lm loss: 2.592252E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.813 | TFLOPs: 43.17 | +7: iteration 92670/ 115203 | consumed samples: 23723520 | consumed tokens: 48585768960 | elapsed time per iteration (s): 0.56 | learning rate: 3.679E-05 | global batch size: 256 | lm loss: 2.595460E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.721 | TFLOPs: 43.54 | +7: iteration 92680/ 115203 | consumed samples: 23726080 | consumed tokens: 48591011840 | elapsed time per iteration (s): 0.57 | learning rate: 3.677E-05 | global batch size: 256 | lm loss: 2.585306E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.784 | TFLOPs: 43.17 | +7: iteration 92690/ 115203 | consumed samples: 23728640 | consumed tokens: 48596254720 | elapsed time per iteration (s): 0.58 | learning rate: 3.676E-05 | global batch size: 256 | lm loss: 2.604256E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.691 | TFLOPs: 41.82 | +7: iteration 92700/ 115203 | consumed samples: 23731200 | consumed tokens: 48601497600 | elapsed time per iteration (s): 0.58 | learning rate: 3.674E-05 | global batch size: 256 | lm loss: 2.581215E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.694 | TFLOPs: 41.92 | +7: iteration 92710/ 115203 | consumed samples: 23733760 | consumed tokens: 48606740480 | elapsed time per iteration (s): 0.57 | learning rate: 3.673E-05 | global batch size: 256 | lm loss: 2.586166E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.823 | TFLOPs: 42.98 | +7: iteration 92720/ 115203 | consumed samples: 23736320 | consumed tokens: 48611983360 | elapsed time per iteration (s): 0.55 | learning rate: 3.672E-05 | global batch size: 256 | lm loss: 2.576324E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.638 | TFLOPs: 44.01 | +7: iteration 92730/ 115203 | consumed samples: 23738880 | consumed tokens: 48617226240 | elapsed time per iteration (s): 0.56 | learning rate: 3.670E-05 | global batch size: 256 | lm loss: 2.588471E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.287 | TFLOPs: 43.31 | +7: iteration 92740/ 115203 | consumed samples: 23741440 | consumed tokens: 48622469120 | elapsed time per iteration (s): 0.56 | learning rate: 3.669E-05 | global batch size: 256 | lm loss: 2.592621E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.381 | TFLOPs: 43.32 | +7: iteration 92750/ 115203 | consumed samples: 23744000 | consumed tokens: 48627712000 | elapsed time per iteration (s): 0.57 | learning rate: 3.667E-05 | global batch size: 256 | lm loss: 2.575550E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.088 | TFLOPs: 43.01 | +7: iteration 92760/ 115203 | consumed samples: 23746560 | consumed tokens: 48632954880 | elapsed time per iteration (s): 0.56 | learning rate: 3.666E-05 | global batch size: 256 | lm loss: 2.573315E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.835 | TFLOPs: 43.46 | +7: iteration 92770/ 115203 | consumed samples: 23749120 | consumed tokens: 48638197760 | elapsed time per iteration (s): 0.56 | learning rate: 3.664E-05 | global batch size: 256 | lm loss: 2.587891E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.585 | TFLOPs: 43.53 | +7: iteration 92780/ 115203 | consumed samples: 23751680 | consumed tokens: 48643440640 | elapsed time per iteration (s): 0.55 | learning rate: 3.663E-05 | global batch size: 256 | lm loss: 2.569928E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.710 | TFLOPs: 44.02 | +7: iteration 92790/ 115203 | consumed samples: 23754240 | consumed tokens: 48648683520 | elapsed time per iteration (s): 0.56 | learning rate: 3.662E-05 | global batch size: 256 | lm loss: 2.596303E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.235 | TFLOPs: 43.50 | +7: iteration 92800/ 115203 | consumed samples: 23756800 | consumed tokens: 48653926400 | elapsed time per iteration (s): 0.56 | learning rate: 3.660E-05 | global batch size: 256 | lm loss: 2.611270E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.170 | TFLOPs: 43.30 | +7: iteration 92810/ 115203 | consumed samples: 23759360 | consumed tokens: 48659169280 | elapsed time per iteration (s): 0.56 | learning rate: 3.659E-05 | global batch size: 256 | lm loss: 2.585963E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.320 | TFLOPs: 43.31 | +7: iteration 92820/ 115203 | consumed samples: 23761920 | consumed tokens: 48664412160 | elapsed time per iteration (s): 0.56 | learning rate: 3.657E-05 | global batch size: 256 | lm loss: 2.587414E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.890 | TFLOPs: 43.27 | +7: iteration 92830/ 115203 | consumed samples: 23764480 | consumed tokens: 48669655040 | elapsed time per iteration (s): 0.57 | learning rate: 3.656E-05 | global batch size: 256 | lm loss: 2.594404E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.484 | TFLOPs: 43.14 | +7: iteration 92840/ 115203 | consumed samples: 23767040 | consumed tokens: 48674897920 | elapsed time per iteration (s): 0.57 | learning rate: 3.654E-05 | global batch size: 256 | lm loss: 2.567379E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.412 | TFLOPs: 42.94 | +7: iteration 92850/ 115203 | consumed samples: 23769600 | consumed tokens: 48680140800 | elapsed time per iteration (s): 0.57 | learning rate: 3.653E-05 | global batch size: 256 | lm loss: 2.569658E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.239 | TFLOPs: 42.54 | +7: iteration 92860/ 115203 | consumed samples: 23772160 | consumed tokens: 48685383680 | elapsed time per iteration (s): 0.57 | learning rate: 3.651E-05 | global batch size: 256 | lm loss: 2.588550E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.962 | TFLOPs: 43.19 | +7: iteration 92870/ 115203 | consumed samples: 23774720 | consumed tokens: 48690626560 | elapsed time per iteration (s): 0.57 | learning rate: 3.650E-05 | global batch size: 256 | lm loss: 2.584716E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.409 | TFLOPs: 42.56 | +7: iteration 92880/ 115203 | consumed samples: 23777280 | consumed tokens: 48695869440 | elapsed time per iteration (s): 0.57 | learning rate: 3.649E-05 | global batch size: 256 | lm loss: 2.578939E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.667 | TFLOPs: 43.16 | +7: iteration 92890/ 115203 | consumed samples: 23779840 | consumed tokens: 48701112320 | elapsed time per iteration (s): 0.55 | learning rate: 3.647E-05 | global batch size: 256 | lm loss: 2.579290E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 92900/ 115203 | consumed samples: 23782400 | consumed tokens: 48706355200 | elapsed time per iteration (s): 0.56 | learning rate: 3.646E-05 | global batch size: 256 | lm loss: 2.587194E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.184 | TFLOPs: 43.49 | +7: iteration 92910/ 115203 | consumed samples: 23784960 | consumed tokens: 48711598080 | elapsed time per iteration (s): 0.56 | learning rate: 3.644E-05 | global batch size: 256 | lm loss: 2.580828E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.770 | TFLOPs: 43.55 | +7: iteration 92920/ 115203 | consumed samples: 23787520 | consumed tokens: 48716840960 | elapsed time per iteration (s): 0.57 | learning rate: 3.643E-05 | global batch size: 256 | lm loss: 2.582968E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.054 | TFLOPs: 43.19 | +7: iteration 92930/ 115203 | consumed samples: 23790080 | consumed tokens: 48722083840 | elapsed time per iteration (s): 0.57 | learning rate: 3.641E-05 | global batch size: 256 | lm loss: 2.594290E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.806 | TFLOPs: 42.60 | +7: iteration 92940/ 115203 | consumed samples: 23792640 | consumed tokens: 48727326720 | elapsed time per iteration (s): 0.56 | learning rate: 3.640E-05 | global batch size: 256 | lm loss: 2.587370E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.360 | TFLOPs: 43.41 | +7: iteration 92950/ 115203 | consumed samples: 23795200 | consumed tokens: 48732569600 | elapsed time per iteration (s): 0.57 | learning rate: 3.639E-05 | global batch size: 256 | lm loss: 2.586553E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.404 | TFLOPs: 42.94 | +7: iteration 92960/ 115203 | consumed samples: 23797760 | consumed tokens: 48737812480 | elapsed time per iteration (s): 0.56 | learning rate: 3.637E-05 | global batch size: 256 | lm loss: 2.581870E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.232 | TFLOPs: 43.40 | +7: iteration 92970/ 115203 | consumed samples: 23800320 | consumed tokens: 48743055360 | elapsed time per iteration (s): 0.56 | learning rate: 3.636E-05 | global batch size: 256 | lm loss: 2.585988E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.341 | TFLOPs: 43.79 | +7: iteration 92980/ 115203 | consumed samples: 23802880 | consumed tokens: 48748298240 | elapsed time per iteration (s): 0.56 | learning rate: 3.634E-05 | global batch size: 256 | lm loss: 2.582248E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.927 | TFLOPs: 43.56 | +7: iteration 92990/ 115203 | consumed samples: 23805440 | consumed tokens: 48753541120 | elapsed time per iteration (s): 0.57 | learning rate: 3.633E-05 | global batch size: 256 | lm loss: 2.590196E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.260 | TFLOPs: 43.02 | +7: iteration 93000/ 115203 | consumed samples: 23808000 | consumed tokens: 48758784000 | elapsed time per iteration (s): 0.57 | learning rate: 3.631E-05 | global batch size: 256 | lm loss: 2.593009E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.958 | TFLOPs: 42.80 | +7: iteration 93010/ 115203 | consumed samples: 23810560 | consumed tokens: 48764026880 | elapsed time per iteration (s): 0.56 | learning rate: 3.630E-05 | global batch size: 256 | lm loss: 2.567819E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.097 | TFLOPs: 43.29 | +7: iteration 93020/ 115203 | consumed samples: 23813120 | consumed tokens: 48769269760 | elapsed time per iteration (s): 0.56 | learning rate: 3.629E-05 | global batch size: 256 | lm loss: 2.586466E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.154 | TFLOPs: 43.30 | +7: iteration 93030/ 115203 | consumed samples: 23815680 | consumed tokens: 48774512640 | elapsed time per iteration (s): 0.57 | learning rate: 3.627E-05 | global batch size: 256 | lm loss: 2.589352E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.007 | TFLOPs: 42.62 | +7: iteration 93040/ 115203 | consumed samples: 23818240 | consumed tokens: 48779755520 | elapsed time per iteration (s): 0.57 | learning rate: 3.626E-05 | global batch size: 256 | lm loss: 2.581391E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.250 | TFLOPs: 42.64 | +7: iteration 93050/ 115203 | consumed samples: 23820800 | consumed tokens: 48784998400 | elapsed time per iteration (s): 0.55 | learning rate: 3.624E-05 | global batch size: 256 | lm loss: 2.588297E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.574 | TFLOPs: 44.01 | +7: iteration 93060/ 115203 | consumed samples: 23823360 | consumed tokens: 48790241280 | elapsed time per iteration (s): 0.58 | learning rate: 3.623E-05 | global batch size: 256 | lm loss: 2.579956E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.236 | TFLOPs: 42.16 | +7: iteration 93070/ 115203 | consumed samples: 23825920 | consumed tokens: 48795484160 | elapsed time per iteration (s): 0.56 | learning rate: 3.622E-05 | global batch size: 256 | lm loss: 2.571890E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.752 | TFLOPs: 43.74 | +7: iteration 93080/ 115203 | consumed samples: 23828480 | consumed tokens: 48800727040 | elapsed time per iteration (s): 0.57 | learning rate: 3.620E-05 | global batch size: 256 | lm loss: 2.583673E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.519 | TFLOPs: 42.76 | +7: iteration 93090/ 115203 | consumed samples: 23831040 | consumed tokens: 48805969920 | elapsed time per iteration (s): 0.56 | learning rate: 3.619E-05 | global batch size: 256 | lm loss: 2.597589E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.247 | TFLOPs: 43.40 | +7: iteration 93100/ 115203 | consumed samples: 23833600 | consumed tokens: 48811212800 | elapsed time per iteration (s): 0.57 | learning rate: 3.617E-05 | global batch size: 256 | lm loss: 2.590014E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.800 | TFLOPs: 42.98 | +7: iteration 93110/ 115203 | consumed samples: 23836160 | consumed tokens: 48816455680 | elapsed time per iteration (s): 0.56 | learning rate: 3.616E-05 | global batch size: 256 | lm loss: 2.580747E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.807 | TFLOPs: 43.27 | +7: iteration 93120/ 115203 | consumed samples: 23838720 | consumed tokens: 48821698560 | elapsed time per iteration (s): 0.56 | learning rate: 3.614E-05 | global batch size: 256 | lm loss: 2.565799E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.859 | TFLOPs: 43.56 | +7: iteration 93130/ 115203 | consumed samples: 23841280 | consumed tokens: 48826941440 | elapsed time per iteration (s): 0.56 | learning rate: 3.613E-05 | global batch size: 256 | lm loss: 2.589838E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.253 | TFLOPs: 43.59 | +7: iteration 93140/ 115203 | consumed samples: 23843840 | consumed tokens: 48832184320 | elapsed time per iteration (s): 0.56 | learning rate: 3.612E-05 | global batch size: 256 | lm loss: 2.577977E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.658 | TFLOPs: 43.73 | +7: iteration 93150/ 115203 | consumed samples: 23846400 | consumed tokens: 48837427200 | elapsed time per iteration (s): 0.56 | learning rate: 3.610E-05 | global batch size: 256 | lm loss: 2.586057E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.323 | TFLOPs: 43.51 | +7: iteration 93160/ 115203 | consumed samples: 23848960 | consumed tokens: 48842670080 | elapsed time per iteration (s): 0.56 | learning rate: 3.609E-05 | global batch size: 256 | lm loss: 2.575673E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.399 | TFLOPs: 43.42 | +7: iteration 93170/ 115203 | consumed samples: 23851520 | consumed tokens: 48847912960 | elapsed time per iteration (s): 0.55 | learning rate: 3.607E-05 | global batch size: 256 | lm loss: 2.570078E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.526 | TFLOPs: 44.00 | +7: iteration 93180/ 115203 | consumed samples: 23854080 | consumed tokens: 48853155840 | elapsed time per iteration (s): 0.56 | learning rate: 3.606E-05 | global batch size: 256 | lm loss: 2.599815E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.549 | TFLOPs: 43.72 | +7: iteration 93190/ 115203 | consumed samples: 23856640 | consumed tokens: 48858398720 | elapsed time per iteration (s): 0.56 | learning rate: 3.605E-05 | global batch size: 256 | lm loss: 2.594459E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.403 | TFLOPs: 43.32 | +7: iteration 93200/ 115203 | consumed samples: 23859200 | consumed tokens: 48863641600 | elapsed time per iteration (s): 0.56 | learning rate: 3.603E-05 | global batch size: 256 | lm loss: 2.580346E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.125 | TFLOPs: 43.49 | +7: iteration 93210/ 115203 | consumed samples: 23861760 | consumed tokens: 48868884480 | elapsed time per iteration (s): 0.56 | learning rate: 3.602E-05 | global batch size: 256 | lm loss: 2.575082E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.696 | TFLOPs: 43.54 | +7: iteration 93220/ 115203 | consumed samples: 23864320 | consumed tokens: 48874127360 | elapsed time per iteration (s): 0.56 | learning rate: 3.600E-05 | global batch size: 256 | lm loss: 2.582032E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.824 | TFLOPs: 43.74 | +7: iteration 93230/ 115203 | consumed samples: 23866880 | consumed tokens: 48879370240 | elapsed time per iteration (s): 0.56 | learning rate: 3.599E-05 | global batch size: 256 | lm loss: 2.578659E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.002 | TFLOPs: 43.57 | +7: iteration 93240/ 115203 | consumed samples: 23869440 | consumed tokens: 48884613120 | elapsed time per iteration (s): 0.55 | learning rate: 3.597E-05 | global batch size: 256 | lm loss: 2.584049E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.716 | TFLOPs: 44.02 | +7: iteration 93250/ 115203 | consumed samples: 23872000 | consumed tokens: 48889856000 | elapsed time per iteration (s): 0.55 | learning rate: 3.596E-05 | global batch size: 256 | lm loss: 2.590145E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.547 | TFLOPs: 44.00 | +7: iteration 93260/ 115203 | consumed samples: 23874560 | consumed tokens: 48895098880 | elapsed time per iteration (s): 0.57 | learning rate: 3.595E-05 | global batch size: 256 | lm loss: 2.593234E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.900 | TFLOPs: 42.80 | +7: iteration 93270/ 115203 | consumed samples: 23877120 | consumed tokens: 48900341760 | elapsed time per iteration (s): 0.56 | learning rate: 3.593E-05 | global batch size: 256 | lm loss: 2.584454E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.376 | TFLOPs: 43.51 | +7: iteration 93280/ 115203 | consumed samples: 23879680 | consumed tokens: 48905584640 | elapsed time per iteration (s): 0.56 | learning rate: 3.592E-05 | global batch size: 256 | lm loss: 2.581209E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.816 | TFLOPs: 43.55 | +7: iteration 93290/ 115203 | consumed samples: 23882240 | consumed tokens: 48910827520 | elapsed time per iteration (s): 0.56 | learning rate: 3.590E-05 | global batch size: 256 | lm loss: 2.586138E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.047 | TFLOPs: 43.57 | +7: iteration 93300/ 115203 | consumed samples: 23884800 | consumed tokens: 48916070400 | elapsed time per iteration (s): 0.56 | learning rate: 3.589E-05 | global batch size: 256 | lm loss: 2.581265E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.347 | TFLOPs: 43.70 | +7: iteration 93310/ 115203 | consumed samples: 23887360 | consumed tokens: 48921313280 | elapsed time per iteration (s): 0.57 | learning rate: 3.588E-05 | global batch size: 256 | lm loss: 2.582242E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.816 | TFLOPs: 43.08 | +7: iteration 93320/ 115203 | consumed samples: 23889920 | consumed tokens: 48926556160 | elapsed time per iteration (s): 0.57 | learning rate: 3.586E-05 | global batch size: 256 | lm loss: 2.594353E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.855 | TFLOPs: 42.79 | +7: iteration 93330/ 115203 | consumed samples: 23892480 | consumed tokens: 48931799040 | elapsed time per iteration (s): 0.57 | learning rate: 3.585E-05 | global batch size: 256 | lm loss: 2.590554E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.682 | TFLOPs: 43.06 | +7: iteration 93340/ 115203 | consumed samples: 23895040 | consumed tokens: 48937041920 | elapsed time per iteration (s): 0.55 | learning rate: 3.583E-05 | global batch size: 256 | lm loss: 2.584528E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.684 | TFLOPs: 44.02 | +7: iteration 93350/ 115203 | consumed samples: 23897600 | consumed tokens: 48942284800 | elapsed time per iteration (s): 0.56 | learning rate: 3.582E-05 | global batch size: 256 | lm loss: 2.584907E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.272 | TFLOPs: 43.50 | +7: iteration 93360/ 115203 | consumed samples: 23900160 | consumed tokens: 48947527680 | elapsed time per iteration (s): 0.56 | learning rate: 3.581E-05 | global batch size: 256 | lm loss: 2.576732E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.100 | TFLOPs: 43.96 | +7: iteration 93370/ 115203 | consumed samples: 23902720 | consumed tokens: 48952770560 | elapsed time per iteration (s): 0.56 | learning rate: 3.579E-05 | global batch size: 256 | lm loss: 2.584157E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.558 | TFLOPs: 43.43 | +7: iteration 93380/ 115203 | consumed samples: 23905280 | consumed tokens: 48958013440 | elapsed time per iteration (s): 0.58 | learning rate: 3.578E-05 | global batch size: 256 | lm loss: 2.576776E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.972 | TFLOPs: 42.42 | +7: iteration 93390/ 115203 | consumed samples: 23907840 | consumed tokens: 48963256320 | elapsed time per iteration (s): 0.56 | learning rate: 3.576E-05 | global batch size: 256 | lm loss: 2.585830E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.946 | TFLOPs: 43.76 | +7: iteration 93400/ 115203 | consumed samples: 23910400 | consumed tokens: 48968499200 | elapsed time per iteration (s): 0.57 | learning rate: 3.575E-05 | global batch size: 256 | lm loss: 2.592065E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.878 | TFLOPs: 43.08 | +7: iteration 93410/ 115203 | consumed samples: 23912960 | consumed tokens: 48973742080 | elapsed time per iteration (s): 0.56 | learning rate: 3.574E-05 | global batch size: 256 | lm loss: 2.584236E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.170 | TFLOPs: 43.59 | +7: iteration 93420/ 115203 | consumed samples: 23915520 | consumed tokens: 48978984960 | elapsed time per iteration (s): 0.56 | learning rate: 3.572E-05 | global batch size: 256 | lm loss: 2.581822E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.958 | TFLOPs: 43.76 | +7: iteration 93430/ 115203 | consumed samples: 23918080 | consumed tokens: 48984227840 | elapsed time per iteration (s): 0.56 | learning rate: 3.571E-05 | global batch size: 256 | lm loss: 2.591839E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.876 | TFLOPs: 43.46 | +7: iteration 93440/ 115203 | consumed samples: 23920640 | consumed tokens: 48989470720 | elapsed time per iteration (s): 0.57 | learning rate: 3.569E-05 | global batch size: 256 | lm loss: 2.582744E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.622 | TFLOPs: 43.06 | +7: iteration 93450/ 115203 | consumed samples: 23923200 | consumed tokens: 48994713600 | elapsed time per iteration (s): 0.56 | learning rate: 3.568E-05 | global batch size: 256 | lm loss: 2.589830E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.516 | TFLOPs: 43.52 | +7: iteration 93460/ 115203 | consumed samples: 23925760 | consumed tokens: 48999956480 | elapsed time per iteration (s): 0.57 | learning rate: 3.567E-05 | global batch size: 256 | lm loss: 2.583961E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.934 | TFLOPs: 42.99 | +7: iteration 93470/ 115203 | consumed samples: 23928320 | consumed tokens: 49005199360 | elapsed time per iteration (s): 0.56 | learning rate: 3.565E-05 | global batch size: 256 | lm loss: 2.579098E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.146 | TFLOPs: 43.87 | +7: iteration 93480/ 115203 | consumed samples: 23930880 | consumed tokens: 49010442240 | elapsed time per iteration (s): 0.56 | learning rate: 3.564E-05 | global batch size: 256 | lm loss: 2.595137E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.150 | TFLOPs: 43.30 | +7: iteration 93490/ 115203 | consumed samples: 23933440 | consumed tokens: 49015685120 | elapsed time per iteration (s): 0.56 | learning rate: 3.562E-05 | global batch size: 256 | lm loss: 2.598990E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.360 | TFLOPs: 43.60 | +7: iteration 93500/ 115203 | consumed samples: 23936000 | consumed tokens: 49020928000 | elapsed time per iteration (s): 0.56 | learning rate: 3.561E-05 | global batch size: 256 | lm loss: 2.594374E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.349 | TFLOPs: 43.60 | +7: iteration 93510/ 115203 | consumed samples: 23938560 | consumed tokens: 49026170880 | elapsed time per iteration (s): 0.57 | learning rate: 3.560E-05 | global batch size: 256 | lm loss: 2.570667E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.251 | TFLOPs: 42.64 | +7: iteration 93520/ 115203 | consumed samples: 23941120 | consumed tokens: 49031413760 | elapsed time per iteration (s): 0.58 | learning rate: 3.558E-05 | global batch size: 256 | lm loss: 2.578667E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.204 | TFLOPs: 41.97 | +7: iteration 93530/ 115203 | consumed samples: 23943680 | consumed tokens: 49036656640 | elapsed time per iteration (s): 0.56 | learning rate: 3.557E-05 | global batch size: 256 | lm loss: 2.591378E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.171 | TFLOPs: 43.68 | +7: iteration 93540/ 115203 | consumed samples: 23946240 | consumed tokens: 49041899520 | elapsed time per iteration (s): 0.57 | learning rate: 3.555E-05 | global batch size: 256 | lm loss: 2.573668E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.780 | TFLOPs: 42.88 | +7: iteration 93550/ 115203 | consumed samples: 23948800 | consumed tokens: 49047142400 | elapsed time per iteration (s): 0.56 | learning rate: 3.554E-05 | global batch size: 256 | lm loss: 2.584960E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.525 | TFLOPs: 43.91 | +7: iteration 93560/ 115203 | consumed samples: 23951360 | consumed tokens: 49052385280 | elapsed time per iteration (s): 0.56 | learning rate: 3.553E-05 | global batch size: 256 | lm loss: 2.593659E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.307 | TFLOPs: 43.22 | +7: iteration 93570/ 115203 | consumed samples: 23953920 | consumed tokens: 49057628160 | elapsed time per iteration (s): 0.56 | learning rate: 3.551E-05 | global batch size: 256 | lm loss: 2.573971E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.005 | TFLOPs: 43.38 | +7: iteration 93580/ 115203 | consumed samples: 23956480 | consumed tokens: 49062871040 | elapsed time per iteration (s): 0.56 | learning rate: 3.550E-05 | global batch size: 256 | lm loss: 2.580776E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.585 | TFLOPs: 43.53 | +7: iteration 93590/ 115203 | consumed samples: 23959040 | consumed tokens: 49068113920 | elapsed time per iteration (s): 0.56 | learning rate: 3.548E-05 | global batch size: 256 | lm loss: 2.572315E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.069 | TFLOPs: 43.48 | +7: iteration 93600/ 115203 | consumed samples: 23961600 | consumed tokens: 49073356800 | elapsed time per iteration (s): 0.56 | learning rate: 3.547E-05 | global batch size: 256 | lm loss: 2.589189E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.686 | TFLOPs: 43.54 | +7: iteration 93610/ 115203 | consumed samples: 23964160 | consumed tokens: 49078599680 | elapsed time per iteration (s): 0.55 | learning rate: 3.546E-05 | global batch size: 256 | lm loss: 2.583045E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.703 | TFLOPs: 44.02 | +7: iteration 93620/ 115203 | consumed samples: 23966720 | consumed tokens: 49083842560 | elapsed time per iteration (s): 0.55 | learning rate: 3.544E-05 | global batch size: 256 | lm loss: 2.588834E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.858 | TFLOPs: 44.03 | +7: iteration 93630/ 115203 | consumed samples: 23969280 | consumed tokens: 49089085440 | elapsed time per iteration (s): 0.56 | learning rate: 3.543E-05 | global batch size: 256 | lm loss: 2.586299E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.916 | TFLOPs: 43.56 | +7: iteration 93640/ 115203 | consumed samples: 23971840 | consumed tokens: 49094328320 | elapsed time per iteration (s): 0.55 | learning rate: 3.542E-05 | global batch size: 256 | lm loss: 2.583780E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.702 | TFLOPs: 44.02 | +7: iteration 93650/ 115203 | consumed samples: 23974400 | consumed tokens: 49099571200 | elapsed time per iteration (s): 0.55 | learning rate: 3.540E-05 | global batch size: 256 | lm loss: 2.580882E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.772 | TFLOPs: 44.02 | +7: iteration 93660/ 115203 | consumed samples: 23976960 | consumed tokens: 49104814080 | elapsed time per iteration (s): 0.55 | learning rate: 3.539E-05 | global batch size: 256 | lm loss: 2.577086E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.520 | TFLOPs: 44.00 | +7: iteration 93670/ 115203 | consumed samples: 23979520 | consumed tokens: 49110056960 | elapsed time per iteration (s): 0.56 | learning rate: 3.537E-05 | global batch size: 256 | lm loss: 2.590646E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.908 | TFLOPs: 43.47 | +7: iteration 93680/ 115203 | consumed samples: 23982080 | consumed tokens: 49115299840 | elapsed time per iteration (s): 0.55 | learning rate: 3.536E-05 | global batch size: 256 | lm loss: 2.579062E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.277 | TFLOPs: 43.98 | +7: iteration 93690/ 115203 | consumed samples: 23984640 | consumed tokens: 49120542720 | elapsed time per iteration (s): 0.56 | learning rate: 3.535E-05 | global batch size: 256 | lm loss: 2.583372E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.341 | TFLOPs: 43.89 | +7: iteration 93700/ 115203 | consumed samples: 23987200 | consumed tokens: 49125785600 | elapsed time per iteration (s): 0.56 | learning rate: 3.533E-05 | global batch size: 256 | lm loss: 2.584132E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.738 | TFLOPs: 43.64 | +7: iteration 93710/ 115203 | consumed samples: 23989760 | consumed tokens: 49131028480 | elapsed time per iteration (s): 0.56 | learning rate: 3.532E-05 | global batch size: 256 | lm loss: 2.576959E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.402 | TFLOPs: 43.89 | +7: iteration 93720/ 115203 | consumed samples: 23992320 | consumed tokens: 49136271360 | elapsed time per iteration (s): 0.56 | learning rate: 3.530E-05 | global batch size: 256 | lm loss: 2.584780E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.311 | TFLOPs: 43.89 | +7: iteration 93730/ 115203 | consumed samples: 23994880 | consumed tokens: 49141514240 | elapsed time per iteration (s): 0.56 | learning rate: 3.529E-05 | global batch size: 256 | lm loss: 2.582419E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.565 | TFLOPs: 43.62 | +7: iteration 93740/ 115203 | consumed samples: 23997440 | consumed tokens: 49146757120 | elapsed time per iteration (s): 0.55 | learning rate: 3.528E-05 | global batch size: 256 | lm loss: 2.589555E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.527 | TFLOPs: 44.00 | +7: iteration 93750/ 115203 | consumed samples: 24000000 | consumed tokens: 49152000000 | elapsed time per iteration (s): 0.55 | learning rate: 3.526E-05 | global batch size: 256 | lm loss: 2.583051E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 93760/ 115203 | consumed samples: 24002560 | consumed tokens: 49157242880 | elapsed time per iteration (s): 0.55 | learning rate: 3.525E-05 | global batch size: 256 | lm loss: 2.584796E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.589 | TFLOPs: 44.01 | +7: iteration 93770/ 115203 | consumed samples: 24005120 | consumed tokens: 49162485760 | elapsed time per iteration (s): 0.56 | learning rate: 3.524E-05 | global batch size: 256 | lm loss: 2.588631E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.352 | TFLOPs: 43.51 | +7: iteration 93780/ 115203 | consumed samples: 24007680 | consumed tokens: 49167728640 | elapsed time per iteration (s): 0.58 | learning rate: 3.522E-05 | global batch size: 256 | lm loss: 2.583427E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.521 | TFLOPs: 42.38 | +7: iteration 93790/ 115203 | consumed samples: 24010240 | consumed tokens: 49172971520 | elapsed time per iteration (s): 0.58 | learning rate: 3.521E-05 | global batch size: 256 | lm loss: 2.585406E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.123 | TFLOPs: 42.25 | +7: iteration 93800/ 115203 | consumed samples: 24012800 | consumed tokens: 49178214400 | elapsed time per iteration (s): 0.57 | learning rate: 3.519E-05 | global batch size: 256 | lm loss: 2.574431E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.908 | TFLOPs: 42.99 | +7: iteration 93810/ 115203 | consumed samples: 24015360 | consumed tokens: 49183457280 | elapsed time per iteration (s): 0.57 | learning rate: 3.518E-05 | global batch size: 256 | lm loss: 2.581186E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.696 | TFLOPs: 43.16 | +7: iteration 93820/ 115203 | consumed samples: 24017920 | consumed tokens: 49188700160 | elapsed time per iteration (s): 0.56 | learning rate: 3.517E-05 | global batch size: 256 | lm loss: 2.574172E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.503 | TFLOPs: 43.90 | +7: iteration 93830/ 115203 | consumed samples: 24020480 | consumed tokens: 49193943040 | elapsed time per iteration (s): 0.55 | learning rate: 3.515E-05 | global batch size: 256 | lm loss: 2.584954E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.649 | TFLOPs: 44.01 | +7: iteration 93840/ 115203 | consumed samples: 24023040 | consumed tokens: 49199185920 | elapsed time per iteration (s): 0.56 | learning rate: 3.514E-05 | global batch size: 256 | lm loss: 2.590819E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.782 | TFLOPs: 43.74 | +7: iteration 93850/ 115203 | consumed samples: 24025600 | consumed tokens: 49204428800 | elapsed time per iteration (s): 0.56 | learning rate: 3.513E-05 | global batch size: 256 | lm loss: 2.589960E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.036 | TFLOPs: 43.38 | +7: iteration 93860/ 115203 | consumed samples: 24028160 | consumed tokens: 49209671680 | elapsed time per iteration (s): 0.56 | learning rate: 3.511E-05 | global batch size: 256 | lm loss: 2.590498E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.506 | TFLOPs: 43.62 | +7: iteration 93870/ 115203 | consumed samples: 24030720 | consumed tokens: 49214914560 | elapsed time per iteration (s): 0.56 | learning rate: 3.510E-05 | global batch size: 256 | lm loss: 2.570295E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.740 | TFLOPs: 43.74 | +7: iteration 93880/ 115203 | consumed samples: 24033280 | consumed tokens: 49220157440 | elapsed time per iteration (s): 0.56 | learning rate: 3.508E-05 | global batch size: 256 | lm loss: 2.579739E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.002 | TFLOPs: 43.67 | +7: iteration 93890/ 115203 | consumed samples: 24035840 | consumed tokens: 49225400320 | elapsed time per iteration (s): 0.56 | learning rate: 3.507E-05 | global batch size: 256 | lm loss: 2.575360E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.317 | TFLOPs: 43.41 | +7: iteration 93900/ 115203 | consumed samples: 24038400 | consumed tokens: 49230643200 | elapsed time per iteration (s): 0.57 | learning rate: 3.506E-05 | global batch size: 256 | lm loss: 2.581074E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.508 | TFLOPs: 43.05 | +7: iteration 93910/ 115203 | consumed samples: 24040960 | consumed tokens: 49235886080 | elapsed time per iteration (s): 0.56 | learning rate: 3.504E-05 | global batch size: 256 | lm loss: 2.599586E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.867 | TFLOPs: 43.27 | +7: iteration 93920/ 115203 | consumed samples: 24043520 | consumed tokens: 49241128960 | elapsed time per iteration (s): 0.56 | learning rate: 3.503E-05 | global batch size: 256 | lm loss: 2.568184E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.437 | TFLOPs: 43.90 | +7: iteration 93930/ 115203 | consumed samples: 24046080 | consumed tokens: 49246371840 | elapsed time per iteration (s): 0.56 | learning rate: 3.502E-05 | global batch size: 256 | lm loss: 2.592565E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.125 | TFLOPs: 43.58 | +7: iteration 93940/ 115203 | consumed samples: 24048640 | consumed tokens: 49251614720 | elapsed time per iteration (s): 0.56 | learning rate: 3.500E-05 | global batch size: 256 | lm loss: 2.575874E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.041 | TFLOPs: 43.57 | +7: iteration 93950/ 115203 | consumed samples: 24051200 | consumed tokens: 49256857600 | elapsed time per iteration (s): 0.56 | learning rate: 3.499E-05 | global batch size: 256 | lm loss: 2.595640E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.283 | TFLOPs: 43.41 | +7: iteration 93960/ 115203 | consumed samples: 24053760 | consumed tokens: 49262100480 | elapsed time per iteration (s): 0.58 | learning rate: 3.497E-05 | global batch size: 256 | lm loss: 2.584184E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.734 | TFLOPs: 42.40 | +7: iteration 93970/ 115203 | consumed samples: 24056320 | consumed tokens: 49267343360 | elapsed time per iteration (s): 0.57 | learning rate: 3.496E-05 | global batch size: 256 | lm loss: 2.587601E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.620 | TFLOPs: 42.49 | +7: iteration 93980/ 115203 | consumed samples: 24058880 | consumed tokens: 49272586240 | elapsed time per iteration (s): 0.55 | learning rate: 3.495E-05 | global batch size: 256 | lm loss: 2.582963E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.667 | TFLOPs: 44.01 | +7: iteration 93990/ 115203 | consumed samples: 24061440 | consumed tokens: 49277829120 | elapsed time per iteration (s): 0.56 | learning rate: 3.493E-05 | global batch size: 256 | lm loss: 2.580923E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.032 | TFLOPs: 43.57 | +0: [2023-03-17 03:34:40,515] [INFO] [logging.py:68:log_dist] [Rank 0] step=94000, skipped=0, lr=[3.4919569923835e-05, 3.4919569923835e-05, 3.4919569923835e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 94000/ 115203 | consumed samples: 24064000 | consumed tokens: 49283072000 | elapsed time per iteration (s): 0.55 | learning rate: 3.492E-05 | global batch size: 256 | lm loss: 2.572598E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.624 | TFLOPs: 44.01 | +0: steps: 94000 loss: 2.5934 iter time (s): 0.562 samples/sec: 455.827 +7: iteration 94010/ 115203 | consumed samples: 24066560 | consumed tokens: 49288314880 | elapsed time per iteration (s): 0.55 | learning rate: 3.491E-05 | global batch size: 256 | lm loss: 2.572510E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.321 | TFLOPs: 43.98 | +7: iteration 94020/ 115203 | consumed samples: 24069120 | consumed tokens: 49293557760 | elapsed time per iteration (s): 0.57 | learning rate: 3.489E-05 | global batch size: 256 | lm loss: 2.587224E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.042 | TFLOPs: 42.91 | +7: iteration 94030/ 115203 | consumed samples: 24071680 | consumed tokens: 49298800640 | elapsed time per iteration (s): 0.55 | learning rate: 3.488E-05 | global batch size: 256 | lm loss: 2.581125E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.480 | TFLOPs: 44.00 | +7: iteration 94040/ 115203 | consumed samples: 24074240 | consumed tokens: 49304043520 | elapsed time per iteration (s): 0.56 | learning rate: 3.486E-05 | global batch size: 256 | lm loss: 2.583864E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.828 | TFLOPs: 43.46 | +7: iteration 94050/ 115203 | consumed samples: 24076800 | consumed tokens: 49309286400 | elapsed time per iteration (s): 0.56 | learning rate: 3.485E-05 | global batch size: 256 | lm loss: 2.577759E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.729 | TFLOPs: 43.64 | +7: iteration 94060/ 115203 | consumed samples: 24079360 | consumed tokens: 49314529280 | elapsed time per iteration (s): 0.56 | learning rate: 3.484E-05 | global batch size: 256 | lm loss: 2.579567E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.606 | TFLOPs: 43.82 | +7: iteration 94070/ 115203 | consumed samples: 24081920 | consumed tokens: 49319772160 | elapsed time per iteration (s): 0.56 | learning rate: 3.482E-05 | global batch size: 256 | lm loss: 2.581825E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.902 | TFLOPs: 43.47 | +7: iteration 94080/ 115203 | consumed samples: 24084480 | consumed tokens: 49325015040 | elapsed time per iteration (s): 0.57 | learning rate: 3.481E-05 | global batch size: 256 | lm loss: 2.582159E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.085 | TFLOPs: 42.82 | +7: iteration 94090/ 115203 | consumed samples: 24087040 | consumed tokens: 49330257920 | elapsed time per iteration (s): 0.55 | learning rate: 3.480E-05 | global batch size: 256 | lm loss: 2.571453E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.540 | TFLOPs: 44.00 | +7: iteration 94100/ 115203 | consumed samples: 24089600 | consumed tokens: 49335500800 | elapsed time per iteration (s): 0.57 | learning rate: 3.478E-05 | global batch size: 256 | lm loss: 2.597181E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.537 | TFLOPs: 43.05 | +7: iteration 94110/ 115203 | consumed samples: 24092160 | consumed tokens: 49340743680 | elapsed time per iteration (s): 0.55 | learning rate: 3.477E-05 | global batch size: 256 | lm loss: 2.571153E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.480 | TFLOPs: 44.00 | +7: iteration 94120/ 115203 | consumed samples: 24094720 | consumed tokens: 49345986560 | elapsed time per iteration (s): 0.56 | learning rate: 3.476E-05 | global batch size: 256 | lm loss: 2.571775E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.955 | TFLOPs: 43.57 | +7: iteration 94130/ 115203 | consumed samples: 24097280 | consumed tokens: 49351229440 | elapsed time per iteration (s): 0.56 | learning rate: 3.474E-05 | global batch size: 256 | lm loss: 2.584748E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.276 | TFLOPs: 43.50 | +7: iteration 94140/ 115203 | consumed samples: 24099840 | consumed tokens: 49356472320 | elapsed time per iteration (s): 0.55 | learning rate: 3.473E-05 | global batch size: 256 | lm loss: 2.590852E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.587 | TFLOPs: 44.01 | +7: iteration 94150/ 115203 | consumed samples: 24102400 | consumed tokens: 49361715200 | elapsed time per iteration (s): 0.55 | learning rate: 3.472E-05 | global batch size: 256 | lm loss: 2.586418E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.569 | TFLOPs: 44.01 | +7: iteration 94160/ 115203 | consumed samples: 24104960 | consumed tokens: 49366958080 | elapsed time per iteration (s): 0.55 | learning rate: 3.470E-05 | global batch size: 256 | lm loss: 2.577740E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.555 | TFLOPs: 44.00 | +7: iteration 94170/ 115203 | consumed samples: 24107520 | consumed tokens: 49372200960 | elapsed time per iteration (s): 0.56 | learning rate: 3.469E-05 | global batch size: 256 | lm loss: 2.587626E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.718 | TFLOPs: 43.35 | +7: iteration 94180/ 115203 | consumed samples: 24110080 | consumed tokens: 49377443840 | elapsed time per iteration (s): 0.56 | learning rate: 3.467E-05 | global batch size: 256 | lm loss: 2.569663E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.216 | TFLOPs: 43.21 | +7: iteration 94190/ 115203 | consumed samples: 24112640 | consumed tokens: 49382686720 | elapsed time per iteration (s): 0.55 | learning rate: 3.466E-05 | global batch size: 256 | lm loss: 2.580583E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.297 | TFLOPs: 43.98 | +7: iteration 94200/ 115203 | consumed samples: 24115200 | consumed tokens: 49387929600 | elapsed time per iteration (s): 0.55 | learning rate: 3.465E-05 | global batch size: 256 | lm loss: 2.573892E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.371 | TFLOPs: 43.99 | +7: iteration 94210/ 115203 | consumed samples: 24117760 | consumed tokens: 49393172480 | elapsed time per iteration (s): 0.55 | learning rate: 3.463E-05 | global batch size: 256 | lm loss: 2.580100E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.428 | TFLOPs: 43.99 | +7: iteration 94220/ 115203 | consumed samples: 24120320 | consumed tokens: 49398415360 | elapsed time per iteration (s): 0.55 | learning rate: 3.462E-05 | global batch size: 256 | lm loss: 2.584391E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.386 | TFLOPs: 43.99 | +7: iteration 94230/ 115203 | consumed samples: 24122880 | consumed tokens: 49403658240 | elapsed time per iteration (s): 0.56 | learning rate: 3.461E-05 | global batch size: 256 | lm loss: 2.571980E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.355 | TFLOPs: 43.79 | +7: iteration 94240/ 115203 | consumed samples: 24125440 | consumed tokens: 49408901120 | elapsed time per iteration (s): 0.55 | learning rate: 3.459E-05 | global batch size: 256 | lm loss: 2.577898E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.375 | TFLOPs: 43.99 | +7: iteration 94250/ 115203 | consumed samples: 24128000 | consumed tokens: 49414144000 | elapsed time per iteration (s): 0.56 | learning rate: 3.458E-05 | global batch size: 256 | lm loss: 2.581685E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.097 | TFLOPs: 43.87 | +7: iteration 94260/ 115203 | consumed samples: 24130560 | consumed tokens: 49419386880 | elapsed time per iteration (s): 0.55 | learning rate: 3.457E-05 | global batch size: 256 | lm loss: 2.570550E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.715 | TFLOPs: 44.02 | +7: iteration 94270/ 115203 | consumed samples: 24133120 | consumed tokens: 49424629760 | elapsed time per iteration (s): 0.56 | learning rate: 3.455E-05 | global batch size: 256 | lm loss: 2.568464E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.644 | TFLOPs: 43.44 | +7: iteration 94280/ 115203 | consumed samples: 24135680 | consumed tokens: 49429872640 | elapsed time per iteration (s): 0.56 | learning rate: 3.454E-05 | global batch size: 256 | lm loss: 2.568323E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.050 | TFLOPs: 43.57 | +7: iteration 94290/ 115203 | consumed samples: 24138240 | consumed tokens: 49435115520 | elapsed time per iteration (s): 0.55 | learning rate: 3.453E-05 | global batch size: 256 | lm loss: 2.573897E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.516 | TFLOPs: 44.00 | +7: iteration 94300/ 115203 | consumed samples: 24140800 | consumed tokens: 49440358400 | elapsed time per iteration (s): 0.55 | learning rate: 3.451E-05 | global batch size: 256 | lm loss: 2.583828E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.580 | TFLOPs: 44.01 | +7: iteration 94310/ 115203 | consumed samples: 24143360 | consumed tokens: 49445601280 | elapsed time per iteration (s): 0.55 | learning rate: 3.450E-05 | global batch size: 256 | lm loss: 2.578548E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.534 | TFLOPs: 44.00 | +7: iteration 94320/ 115203 | consumed samples: 24145920 | consumed tokens: 49450844160 | elapsed time per iteration (s): 0.55 | learning rate: 3.449E-05 | global batch size: 256 | lm loss: 2.579810E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.424 | TFLOPs: 43.99 | +7: iteration 94330/ 115203 | consumed samples: 24148480 | consumed tokens: 49456087040 | elapsed time per iteration (s): 0.55 | learning rate: 3.447E-05 | global batch size: 256 | lm loss: 2.582455E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.575 | TFLOPs: 44.01 | +7: iteration 94340/ 115203 | consumed samples: 24151040 | consumed tokens: 49461329920 | elapsed time per iteration (s): 0.55 | learning rate: 3.446E-05 | global batch size: 256 | lm loss: 2.582178E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.566 | TFLOPs: 44.01 | +7: iteration 94350/ 115203 | consumed samples: 24153600 | consumed tokens: 49466572800 | elapsed time per iteration (s): 0.56 | learning rate: 3.444E-05 | global batch size: 256 | lm loss: 2.589247E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.223 | TFLOPs: 43.88 | +7: iteration 94360/ 115203 | consumed samples: 24156160 | consumed tokens: 49471815680 | elapsed time per iteration (s): 0.56 | learning rate: 3.443E-05 | global batch size: 256 | lm loss: 2.587964E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.740 | TFLOPs: 43.93 | +7: iteration 94370/ 115203 | consumed samples: 24158720 | consumed tokens: 49477058560 | elapsed time per iteration (s): 0.55 | learning rate: 3.442E-05 | global batch size: 256 | lm loss: 2.565923E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.660 | TFLOPs: 44.01 | +7: iteration 94380/ 115203 | consumed samples: 24161280 | consumed tokens: 49482301440 | elapsed time per iteration (s): 0.55 | learning rate: 3.440E-05 | global batch size: 256 | lm loss: 2.568976E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.706 | TFLOPs: 44.02 | +7: iteration 94390/ 115203 | consumed samples: 24163840 | consumed tokens: 49487544320 | elapsed time per iteration (s): 0.56 | learning rate: 3.439E-05 | global batch size: 256 | lm loss: 2.566667E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.448 | TFLOPs: 43.52 | +7: iteration 94400/ 115203 | consumed samples: 24166400 | consumed tokens: 49492787200 | elapsed time per iteration (s): 0.55 | learning rate: 3.438E-05 | global batch size: 256 | lm loss: 2.572976E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.448 | TFLOPs: 43.99 | +7: iteration 94410/ 115203 | consumed samples: 24168960 | consumed tokens: 49498030080 | elapsed time per iteration (s): 0.55 | learning rate: 3.436E-05 | global batch size: 256 | lm loss: 2.589821E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.460 | TFLOPs: 44.00 | +7: iteration 94420/ 115203 | consumed samples: 24171520 | consumed tokens: 49503272960 | elapsed time per iteration (s): 0.55 | learning rate: 3.435E-05 | global batch size: 256 | lm loss: 2.581359E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.608 | TFLOPs: 44.01 | +7: iteration 94430/ 115203 | consumed samples: 24174080 | consumed tokens: 49508515840 | elapsed time per iteration (s): 0.55 | learning rate: 3.434E-05 | global batch size: 256 | lm loss: 2.589503E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.667 | TFLOPs: 44.01 | +7: iteration 94440/ 115203 | consumed samples: 24176640 | consumed tokens: 49513758720 | elapsed time per iteration (s): 0.55 | learning rate: 3.432E-05 | global batch size: 256 | lm loss: 2.568167E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.698 | TFLOPs: 44.02 | +7: iteration 94450/ 115203 | consumed samples: 24179200 | consumed tokens: 49519001600 | elapsed time per iteration (s): 0.55 | learning rate: 3.431E-05 | global batch size: 256 | lm loss: 2.575603E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.445 | TFLOPs: 43.99 | +7: iteration 94460/ 115203 | consumed samples: 24181760 | consumed tokens: 49524244480 | elapsed time per iteration (s): 0.55 | learning rate: 3.430E-05 | global batch size: 256 | lm loss: 2.581473E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.599 | TFLOPs: 44.01 | +7: iteration 94470/ 115203 | consumed samples: 24184320 | consumed tokens: 49529487360 | elapsed time per iteration (s): 0.55 | learning rate: 3.428E-05 | global batch size: 256 | lm loss: 2.594472E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.523 | TFLOPs: 44.00 | +7: iteration 94480/ 115203 | consumed samples: 24186880 | consumed tokens: 49534730240 | elapsed time per iteration (s): 0.56 | learning rate: 3.427E-05 | global batch size: 256 | lm loss: 2.575310E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.106 | TFLOPs: 43.96 | +7: iteration 94490/ 115203 | consumed samples: 24189440 | consumed tokens: 49539973120 | elapsed time per iteration (s): 0.55 | learning rate: 3.426E-05 | global batch size: 256 | lm loss: 2.572775E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.746 | TFLOPs: 44.02 | +7: iteration 94500/ 115203 | consumed samples: 24192000 | consumed tokens: 49545216000 | elapsed time per iteration (s): 0.56 | learning rate: 3.424E-05 | global batch size: 256 | lm loss: 2.575269E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.632 | TFLOPs: 43.73 | +7: iteration 94510/ 115203 | consumed samples: 24194560 | consumed tokens: 49550458880 | elapsed time per iteration (s): 0.56 | learning rate: 3.423E-05 | global batch size: 256 | lm loss: 2.577096E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.191 | TFLOPs: 43.78 | +7: iteration 94520/ 115203 | consumed samples: 24197120 | consumed tokens: 49555701760 | elapsed time per iteration (s): 0.55 | learning rate: 3.422E-05 | global batch size: 256 | lm loss: 2.579229E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.330 | TFLOPs: 43.98 | +7: iteration 94530/ 115203 | consumed samples: 24199680 | consumed tokens: 49560944640 | elapsed time per iteration (s): 0.55 | learning rate: 3.420E-05 | global batch size: 256 | lm loss: 2.575763E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.512 | TFLOPs: 44.00 | +7: iteration 94540/ 115203 | consumed samples: 24202240 | consumed tokens: 49566187520 | elapsed time per iteration (s): 0.56 | learning rate: 3.419E-05 | global batch size: 256 | lm loss: 2.575173E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.218 | TFLOPs: 43.50 | +7: iteration 94550/ 115203 | consumed samples: 24204800 | consumed tokens: 49571430400 | elapsed time per iteration (s): 0.56 | learning rate: 3.418E-05 | global batch size: 256 | lm loss: 2.574396E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.877 | TFLOPs: 43.84 | +7: iteration 94560/ 115203 | consumed samples: 24207360 | consumed tokens: 49576673280 | elapsed time per iteration (s): 0.56 | learning rate: 3.416E-05 | global batch size: 256 | lm loss: 2.585627E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.606 | TFLOPs: 43.25 | +7: iteration 94570/ 115203 | consumed samples: 24209920 | consumed tokens: 49581916160 | elapsed time per iteration (s): 0.56 | learning rate: 3.415E-05 | global batch size: 256 | lm loss: 2.580624E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.933 | TFLOPs: 43.37 | +7: iteration 94580/ 115203 | consumed samples: 24212480 | consumed tokens: 49587159040 | elapsed time per iteration (s): 0.56 | learning rate: 3.414E-05 | global batch size: 256 | lm loss: 2.578181E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.153 | TFLOPs: 43.87 | +7: iteration 94590/ 115203 | consumed samples: 24215040 | consumed tokens: 49592401920 | elapsed time per iteration (s): 0.55 | learning rate: 3.412E-05 | global batch size: 256 | lm loss: 2.576529E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.322 | TFLOPs: 43.98 | +7: iteration 94600/ 115203 | consumed samples: 24217600 | consumed tokens: 49597644800 | elapsed time per iteration (s): 0.55 | learning rate: 3.411E-05 | global batch size: 256 | lm loss: 2.577966E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 94610/ 115203 | consumed samples: 24220160 | consumed tokens: 49602887680 | elapsed time per iteration (s): 0.55 | learning rate: 3.410E-05 | global batch size: 256 | lm loss: 2.577565E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.476 | TFLOPs: 44.00 | +7: iteration 94620/ 115203 | consumed samples: 24222720 | consumed tokens: 49608130560 | elapsed time per iteration (s): 0.55 | learning rate: 3.408E-05 | global batch size: 256 | lm loss: 2.578392E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.517 | TFLOPs: 44.00 | +7: iteration 94630/ 115203 | consumed samples: 24225280 | consumed tokens: 49613373440 | elapsed time per iteration (s): 0.55 | learning rate: 3.407E-05 | global batch size: 256 | lm loss: 2.597332E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.646 | TFLOPs: 44.01 | +7: iteration 94640/ 115203 | consumed samples: 24227840 | consumed tokens: 49618616320 | elapsed time per iteration (s): 0.55 | learning rate: 3.406E-05 | global batch size: 256 | lm loss: 2.589095E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.732 | TFLOPs: 44.02 | +7: iteration 94650/ 115203 | consumed samples: 24230400 | consumed tokens: 49623859200 | elapsed time per iteration (s): 0.55 | learning rate: 3.404E-05 | global batch size: 256 | lm loss: 2.597859E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.668 | TFLOPs: 44.02 | +7: iteration 94660/ 115203 | consumed samples: 24232960 | consumed tokens: 49629102080 | elapsed time per iteration (s): 0.55 | learning rate: 3.403E-05 | global batch size: 256 | lm loss: 2.573212E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.795 | TFLOPs: 44.03 | +7: iteration 94670/ 115203 | consumed samples: 24235520 | consumed tokens: 49634344960 | elapsed time per iteration (s): 0.55 | learning rate: 3.402E-05 | global batch size: 256 | lm loss: 2.579566E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.554 | TFLOPs: 44.00 | +7: iteration 94680/ 115203 | consumed samples: 24238080 | consumed tokens: 49639587840 | elapsed time per iteration (s): 0.55 | learning rate: 3.400E-05 | global batch size: 256 | lm loss: 2.577944E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.731 | TFLOPs: 44.02 | +7: iteration 94690/ 115203 | consumed samples: 24240640 | consumed tokens: 49644830720 | elapsed time per iteration (s): 0.55 | learning rate: 3.399E-05 | global batch size: 256 | lm loss: 2.576506E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.634 | TFLOPs: 44.01 | +7: iteration 94700/ 115203 | consumed samples: 24243200 | consumed tokens: 49650073600 | elapsed time per iteration (s): 0.56 | learning rate: 3.398E-05 | global batch size: 256 | lm loss: 2.591131E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.457 | TFLOPs: 43.23 | +7: iteration 94710/ 115203 | consumed samples: 24245760 | consumed tokens: 49655316480 | elapsed time per iteration (s): 0.55 | learning rate: 3.396E-05 | global batch size: 256 | lm loss: 2.578368E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.715 | TFLOPs: 44.02 | +7: iteration 94720/ 115203 | consumed samples: 24248320 | consumed tokens: 49660559360 | elapsed time per iteration (s): 0.56 | learning rate: 3.395E-05 | global batch size: 256 | lm loss: 2.581183E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.782 | TFLOPs: 43.36 | +7: iteration 94730/ 115203 | consumed samples: 24250880 | consumed tokens: 49665802240 | elapsed time per iteration (s): 0.55 | learning rate: 3.394E-05 | global batch size: 256 | lm loss: 2.576136E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.582 | TFLOPs: 44.01 | +7: iteration 94740/ 115203 | consumed samples: 24253440 | consumed tokens: 49671045120 | elapsed time per iteration (s): 0.56 | learning rate: 3.392E-05 | global batch size: 256 | lm loss: 2.571873E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.315 | TFLOPs: 43.89 | +7: iteration 94750/ 115203 | consumed samples: 24256000 | consumed tokens: 49676288000 | elapsed time per iteration (s): 0.55 | learning rate: 3.391E-05 | global batch size: 256 | lm loss: 2.586988E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.706 | TFLOPs: 44.02 | +7: iteration 94760/ 115203 | consumed samples: 24258560 | consumed tokens: 49681530880 | elapsed time per iteration (s): 0.56 | learning rate: 3.390E-05 | global batch size: 256 | lm loss: 2.591877E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.497 | TFLOPs: 43.52 | +7: iteration 94770/ 115203 | consumed samples: 24261120 | consumed tokens: 49686773760 | elapsed time per iteration (s): 0.56 | learning rate: 3.388E-05 | global batch size: 256 | lm loss: 2.588082E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.804 | TFLOPs: 43.74 | +7: iteration 94780/ 115203 | consumed samples: 24263680 | consumed tokens: 49692016640 | elapsed time per iteration (s): 0.55 | learning rate: 3.387E-05 | global batch size: 256 | lm loss: 2.575192E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.817 | TFLOPs: 44.03 | +7: iteration 94790/ 115203 | consumed samples: 24266240 | consumed tokens: 49697259520 | elapsed time per iteration (s): 0.55 | learning rate: 3.386E-05 | global batch size: 256 | lm loss: 2.585251E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.727 | TFLOPs: 44.02 | +7: iteration 94800/ 115203 | consumed samples: 24268800 | consumed tokens: 49702502400 | elapsed time per iteration (s): 0.55 | learning rate: 3.384E-05 | global batch size: 256 | lm loss: 2.578966E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.695 | TFLOPs: 44.02 | +7: iteration 94810/ 115203 | consumed samples: 24271360 | consumed tokens: 49707745280 | elapsed time per iteration (s): 0.56 | learning rate: 3.383E-05 | global batch size: 256 | lm loss: 2.581947E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.878 | TFLOPs: 43.75 | +7: iteration 94820/ 115203 | consumed samples: 24273920 | consumed tokens: 49712988160 | elapsed time per iteration (s): 0.56 | learning rate: 3.382E-05 | global batch size: 256 | lm loss: 2.584280E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.998 | TFLOPs: 43.76 | +7: iteration 94830/ 115203 | consumed samples: 24276480 | consumed tokens: 49718231040 | elapsed time per iteration (s): 0.55 | learning rate: 3.380E-05 | global batch size: 256 | lm loss: 2.577318E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.686 | TFLOPs: 44.02 | +7: iteration 94840/ 115203 | consumed samples: 24279040 | consumed tokens: 49723473920 | elapsed time per iteration (s): 0.55 | learning rate: 3.379E-05 | global batch size: 256 | lm loss: 2.580194E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.704 | TFLOPs: 44.02 | +7: iteration 94850/ 115203 | consumed samples: 24281600 | consumed tokens: 49728716800 | elapsed time per iteration (s): 0.55 | learning rate: 3.378E-05 | global batch size: 256 | lm loss: 2.577051E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.653 | TFLOPs: 44.01 | +7: iteration 94860/ 115203 | consumed samples: 24284160 | consumed tokens: 49733959680 | elapsed time per iteration (s): 0.55 | learning rate: 3.377E-05 | global batch size: 256 | lm loss: 2.575256E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.610 | TFLOPs: 44.01 | +7: iteration 94870/ 115203 | consumed samples: 24286720 | consumed tokens: 49739202560 | elapsed time per iteration (s): 0.56 | learning rate: 3.375E-05 | global batch size: 256 | lm loss: 2.584430E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.425 | TFLOPs: 43.52 | +7: iteration 94880/ 115203 | consumed samples: 24289280 | consumed tokens: 49744445440 | elapsed time per iteration (s): 0.55 | learning rate: 3.374E-05 | global batch size: 256 | lm loss: 2.579215E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.708 | TFLOPs: 44.02 | +7: iteration 94890/ 115203 | consumed samples: 24291840 | consumed tokens: 49749688320 | elapsed time per iteration (s): 0.56 | learning rate: 3.373E-05 | global batch size: 256 | lm loss: 2.584227E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.379 | TFLOPs: 43.32 | +7: iteration 94900/ 115203 | consumed samples: 24294400 | consumed tokens: 49754931200 | elapsed time per iteration (s): 0.56 | learning rate: 3.371E-05 | global batch size: 256 | lm loss: 2.589233E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.643 | TFLOPs: 43.82 | +7: iteration 94910/ 115203 | consumed samples: 24296960 | consumed tokens: 49760174080 | elapsed time per iteration (s): 0.57 | learning rate: 3.370E-05 | global batch size: 256 | lm loss: 2.582813E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.347 | TFLOPs: 42.46 | +7: iteration 94920/ 115203 | consumed samples: 24299520 | consumed tokens: 49765416960 | elapsed time per iteration (s): 0.56 | learning rate: 3.369E-05 | global batch size: 256 | lm loss: 2.586466E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.796 | TFLOPs: 43.74 | +7: iteration 94930/ 115203 | consumed samples: 24302080 | consumed tokens: 49770659840 | elapsed time per iteration (s): 0.56 | learning rate: 3.367E-05 | global batch size: 256 | lm loss: 2.574532E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.865 | TFLOPs: 43.37 | +7: iteration 94940/ 115203 | consumed samples: 24304640 | consumed tokens: 49775902720 | elapsed time per iteration (s): 0.57 | learning rate: 3.366E-05 | global batch size: 256 | lm loss: 2.585181E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.235 | TFLOPs: 43.12 | +7: iteration 94950/ 115203 | consumed samples: 24307200 | consumed tokens: 49781145600 | elapsed time per iteration (s): 0.56 | learning rate: 3.365E-05 | global batch size: 256 | lm loss: 2.582254E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.165 | TFLOPs: 43.87 | +7: iteration 94960/ 115203 | consumed samples: 24309760 | consumed tokens: 49786388480 | elapsed time per iteration (s): 0.57 | learning rate: 3.363E-05 | global batch size: 256 | lm loss: 2.581484E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.505 | TFLOPs: 43.05 | +7: iteration 94970/ 115203 | consumed samples: 24312320 | consumed tokens: 49791631360 | elapsed time per iteration (s): 0.58 | learning rate: 3.362E-05 | global batch size: 256 | lm loss: 2.571992E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.130 | TFLOPs: 42.44 | +7: iteration 94980/ 115203 | consumed samples: 24314880 | consumed tokens: 49796874240 | elapsed time per iteration (s): 0.57 | learning rate: 3.361E-05 | global batch size: 256 | lm loss: 2.578824E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.538 | TFLOPs: 42.48 | +7: iteration 94990/ 115203 | consumed samples: 24317440 | consumed tokens: 49802117120 | elapsed time per iteration (s): 0.58 | learning rate: 3.359E-05 | global batch size: 256 | lm loss: 2.571539E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.316 | TFLOPs: 42.17 | +7: iteration 95000/ 115203 | consumed samples: 24320000 | consumed tokens: 49807360000 | elapsed time per iteration (s): 0.57 | learning rate: 3.358E-05 | global batch size: 256 | lm loss: 2.578907E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.860 | TFLOPs: 42.79 | +7: iteration 95010/ 115203 | consumed samples: 24322560 | consumed tokens: 49812602880 | elapsed time per iteration (s): 0.58 | learning rate: 3.357E-05 | global batch size: 256 | lm loss: 2.584061E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.072 | TFLOPs: 42.15 | +7: iteration 95020/ 115203 | consumed samples: 24325120 | consumed tokens: 49817845760 | elapsed time per iteration (s): 0.58 | learning rate: 3.356E-05 | global batch size: 256 | lm loss: 2.582431E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.572 | TFLOPs: 41.91 | +7: iteration 95030/ 115203 | consumed samples: 24327680 | consumed tokens: 49823088640 | elapsed time per iteration (s): 0.58 | learning rate: 3.354E-05 | global batch size: 256 | lm loss: 2.572788E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.991 | TFLOPs: 41.85 | +7: iteration 95040/ 115203 | consumed samples: 24330240 | consumed tokens: 49828331520 | elapsed time per iteration (s): 0.57 | learning rate: 3.353E-05 | global batch size: 256 | lm loss: 2.573619E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.580 | TFLOPs: 42.58 | +7: iteration 95050/ 115203 | consumed samples: 24332800 | consumed tokens: 49833574400 | elapsed time per iteration (s): 0.60 | learning rate: 3.352E-05 | global batch size: 256 | lm loss: 2.591289E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.952 | TFLOPs: 40.90 | +7: iteration 95060/ 115203 | consumed samples: 24335360 | consumed tokens: 49838817280 | elapsed time per iteration (s): 0.59 | learning rate: 3.350E-05 | global batch size: 256 | lm loss: 2.576999E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.928 | TFLOPs: 41.37 | +7: iteration 95070/ 115203 | consumed samples: 24337920 | consumed tokens: 49844060160 | elapsed time per iteration (s): 0.57 | learning rate: 3.349E-05 | global batch size: 256 | lm loss: 2.569115E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.488 | TFLOPs: 42.66 | +7: iteration 95080/ 115203 | consumed samples: 24340480 | consumed tokens: 49849303040 | elapsed time per iteration (s): 0.57 | learning rate: 3.348E-05 | global batch size: 256 | lm loss: 2.587291E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.009 | TFLOPs: 42.52 | +7: iteration 95090/ 115203 | consumed samples: 24343040 | consumed tokens: 49854545920 | elapsed time per iteration (s): 0.60 | learning rate: 3.346E-05 | global batch size: 256 | lm loss: 2.591809E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.973 | TFLOPs: 40.80 | +7: iteration 95100/ 115203 | consumed samples: 24345600 | consumed tokens: 49859788800 | elapsed time per iteration (s): 0.57 | learning rate: 3.345E-05 | global batch size: 256 | lm loss: 2.580013E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.276 | TFLOPs: 42.45 | +7: iteration 95110/ 115203 | consumed samples: 24348160 | consumed tokens: 49865031680 | elapsed time per iteration (s): 0.57 | learning rate: 3.344E-05 | global batch size: 256 | lm loss: 2.591190E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.627 | TFLOPs: 42.58 | +7: iteration 95120/ 115203 | consumed samples: 24350720 | consumed tokens: 49870274560 | elapsed time per iteration (s): 0.58 | learning rate: 3.342E-05 | global batch size: 256 | lm loss: 2.581310E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.203 | TFLOPs: 41.87 | +7: iteration 95130/ 115203 | consumed samples: 24353280 | consumed tokens: 49875517440 | elapsed time per iteration (s): 0.57 | learning rate: 3.341E-05 | global batch size: 256 | lm loss: 2.574587E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.699 | TFLOPs: 42.59 | +7: iteration 95140/ 115203 | consumed samples: 24355840 | consumed tokens: 49880760320 | elapsed time per iteration (s): 0.58 | learning rate: 3.340E-05 | global batch size: 256 | lm loss: 2.581352E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.740 | TFLOPs: 42.21 | +7: iteration 95150/ 115203 | consumed samples: 24358400 | consumed tokens: 49886003200 | elapsed time per iteration (s): 0.58 | learning rate: 3.339E-05 | global batch size: 256 | lm loss: 2.582814E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.663 | TFLOPs: 42.01 | +7: iteration 95160/ 115203 | consumed samples: 24360960 | consumed tokens: 49891246080 | elapsed time per iteration (s): 0.57 | learning rate: 3.337E-05 | global batch size: 256 | lm loss: 2.591920E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.785 | TFLOPs: 43.17 | +7: iteration 95170/ 115203 | consumed samples: 24363520 | consumed tokens: 49896488960 | elapsed time per iteration (s): 0.59 | learning rate: 3.336E-05 | global batch size: 256 | lm loss: 2.576279E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.500 | TFLOPs: 41.23 | +7: iteration 95180/ 115203 | consumed samples: 24366080 | consumed tokens: 49901731840 | elapsed time per iteration (s): 0.59 | learning rate: 3.335E-05 | global batch size: 256 | lm loss: 2.589181E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.815 | TFLOPs: 41.45 | +7: iteration 95190/ 115203 | consumed samples: 24368640 | consumed tokens: 49906974720 | elapsed time per iteration (s): 0.58 | learning rate: 3.333E-05 | global batch size: 256 | lm loss: 2.579325E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.917 | TFLOPs: 42.42 | +7: iteration 95200/ 115203 | consumed samples: 24371200 | consumed tokens: 49912217600 | elapsed time per iteration (s): 0.57 | learning rate: 3.332E-05 | global batch size: 256 | lm loss: 2.577137E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.519 | TFLOPs: 42.48 | +7: iteration 95210/ 115203 | consumed samples: 24373760 | consumed tokens: 49917460480 | elapsed time per iteration (s): 0.57 | learning rate: 3.331E-05 | global batch size: 256 | lm loss: 2.576819E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.297 | TFLOPs: 42.55 | +7: iteration 95220/ 115203 | consumed samples: 24376320 | consumed tokens: 49922703360 | elapsed time per iteration (s): 0.59 | learning rate: 3.329E-05 | global batch size: 256 | lm loss: 2.572584E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.318 | TFLOPs: 41.03 | +7: iteration 95230/ 115203 | consumed samples: 24378880 | consumed tokens: 49927946240 | elapsed time per iteration (s): 0.60 | learning rate: 3.328E-05 | global batch size: 256 | lm loss: 2.580857E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.933 | TFLOPs: 40.99 | +7: iteration 95240/ 115203 | consumed samples: 24381440 | consumed tokens: 49933189120 | elapsed time per iteration (s): 0.58 | learning rate: 3.327E-05 | global batch size: 256 | lm loss: 2.575285E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.123 | TFLOPs: 42.06 | +7: iteration 95250/ 115203 | consumed samples: 24384000 | consumed tokens: 49938432000 | elapsed time per iteration (s): 0.58 | learning rate: 3.326E-05 | global batch size: 256 | lm loss: 2.584701E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.058 | TFLOPs: 42.24 | +7: iteration 95260/ 115203 | consumed samples: 24386560 | consumed tokens: 49943674880 | elapsed time per iteration (s): 0.57 | learning rate: 3.324E-05 | global batch size: 256 | lm loss: 2.577023E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.504 | TFLOPs: 43.05 | +7: iteration 95270/ 115203 | consumed samples: 24389120 | consumed tokens: 49948917760 | elapsed time per iteration (s): 0.56 | learning rate: 3.323E-05 | global batch size: 256 | lm loss: 2.590689E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.745 | TFLOPs: 43.64 | +7: iteration 95280/ 115203 | consumed samples: 24391680 | consumed tokens: 49954160640 | elapsed time per iteration (s): 0.57 | learning rate: 3.322E-05 | global batch size: 256 | lm loss: 2.589659E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.718 | TFLOPs: 42.69 | +7: iteration 95290/ 115203 | consumed samples: 24394240 | consumed tokens: 49959403520 | elapsed time per iteration (s): 0.57 | learning rate: 3.320E-05 | global batch size: 256 | lm loss: 2.578077E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.295 | TFLOPs: 42.45 | +7: iteration 95300/ 115203 | consumed samples: 24396800 | consumed tokens: 49964646400 | elapsed time per iteration (s): 0.58 | learning rate: 3.319E-05 | global batch size: 256 | lm loss: 2.571584E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.421 | TFLOPs: 41.80 | +7: iteration 95310/ 115203 | consumed samples: 24399360 | consumed tokens: 49969889280 | elapsed time per iteration (s): 0.58 | learning rate: 3.318E-05 | global batch size: 256 | lm loss: 2.592101E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.118 | TFLOPs: 42.44 | +7: iteration 95320/ 115203 | consumed samples: 24401920 | consumed tokens: 49975132160 | elapsed time per iteration (s): 0.58 | learning rate: 3.317E-05 | global batch size: 256 | lm loss: 2.572661E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.320 | TFLOPs: 42.36 | +7: iteration 95330/ 115203 | consumed samples: 24404480 | consumed tokens: 49980375040 | elapsed time per iteration (s): 0.57 | learning rate: 3.315E-05 | global batch size: 256 | lm loss: 2.580330E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.704 | TFLOPs: 42.78 | +7: iteration 95340/ 115203 | consumed samples: 24407040 | consumed tokens: 49985617920 | elapsed time per iteration (s): 0.58 | learning rate: 3.314E-05 | global batch size: 256 | lm loss: 2.570436E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.237 | TFLOPs: 42.35 | +7: iteration 95350/ 115203 | consumed samples: 24409600 | consumed tokens: 49990860800 | elapsed time per iteration (s): 0.57 | learning rate: 3.313E-05 | global batch size: 256 | lm loss: 2.580112E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.894 | TFLOPs: 42.70 | +7: iteration 95360/ 115203 | consumed samples: 24412160 | consumed tokens: 49996103680 | elapsed time per iteration (s): 0.56 | learning rate: 3.311E-05 | global batch size: 256 | lm loss: 2.568575E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.101 | TFLOPs: 43.29 | +7: iteration 95370/ 115203 | consumed samples: 24414720 | consumed tokens: 50001346560 | elapsed time per iteration (s): 0.57 | learning rate: 3.310E-05 | global batch size: 256 | lm loss: 2.572606E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.905 | TFLOPs: 42.70 | +7: iteration 95380/ 115203 | consumed samples: 24417280 | consumed tokens: 50006589440 | elapsed time per iteration (s): 0.57 | learning rate: 3.309E-05 | global batch size: 256 | lm loss: 2.574702E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.475 | TFLOPs: 42.95 | +7: iteration 95390/ 115203 | consumed samples: 24419840 | consumed tokens: 50011832320 | elapsed time per iteration (s): 0.57 | learning rate: 3.307E-05 | global batch size: 256 | lm loss: 2.579917E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.082 | TFLOPs: 43.20 | +7: iteration 95400/ 115203 | consumed samples: 24422400 | consumed tokens: 50017075200 | elapsed time per iteration (s): 0.57 | learning rate: 3.306E-05 | global batch size: 256 | lm loss: 2.573749E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.434 | TFLOPs: 43.04 | +7: iteration 95410/ 115203 | consumed samples: 24424960 | consumed tokens: 50022318080 | elapsed time per iteration (s): 0.58 | learning rate: 3.305E-05 | global batch size: 256 | lm loss: 2.586112E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.547 | TFLOPs: 42.00 | +7: iteration 95420/ 115203 | consumed samples: 24427520 | consumed tokens: 50027560960 | elapsed time per iteration (s): 0.57 | learning rate: 3.304E-05 | global batch size: 256 | lm loss: 2.587253E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.212 | TFLOPs: 42.92 | +7: iteration 95430/ 115203 | consumed samples: 24430080 | consumed tokens: 50032803840 | elapsed time per iteration (s): 0.58 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 2.574944E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.422 | TFLOPs: 42.18 | +7: iteration 95440/ 115203 | consumed samples: 24432640 | consumed tokens: 50038046720 | elapsed time per iteration (s): 0.58 | learning rate: 3.301E-05 | global batch size: 256 | lm loss: 2.598922E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.234 | TFLOPs: 42.26 | +7: iteration 95450/ 115203 | consumed samples: 24435200 | consumed tokens: 50043289600 | elapsed time per iteration (s): 0.56 | learning rate: 3.300E-05 | global batch size: 256 | lm loss: 2.588785E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.774 | TFLOPs: 43.93 | +7: iteration 95460/ 115203 | consumed samples: 24437760 | consumed tokens: 50048532480 | elapsed time per iteration (s): 0.57 | learning rate: 3.298E-05 | global batch size: 256 | lm loss: 2.581674E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.249 | TFLOPs: 42.74 | +7: iteration 95470/ 115203 | consumed samples: 24440320 | consumed tokens: 50053775360 | elapsed time per iteration (s): 0.58 | learning rate: 3.297E-05 | global batch size: 256 | lm loss: 2.562810E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.186 | TFLOPs: 42.25 | +7: iteration 95480/ 115203 | consumed samples: 24442880 | consumed tokens: 50059018240 | elapsed time per iteration (s): 0.57 | learning rate: 3.296E-05 | global batch size: 256 | lm loss: 2.566759E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.007 | TFLOPs: 42.62 | +7: iteration 95490/ 115203 | consumed samples: 24445440 | consumed tokens: 50064261120 | elapsed time per iteration (s): 0.56 | learning rate: 3.295E-05 | global batch size: 256 | lm loss: 2.576970E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.517 | TFLOPs: 43.52 | +7: iteration 95500/ 115203 | consumed samples: 24448000 | consumed tokens: 50069504000 | elapsed time per iteration (s): 0.57 | learning rate: 3.293E-05 | global batch size: 256 | lm loss: 2.589471E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.684 | TFLOPs: 42.59 | +7: iteration 95510/ 115203 | consumed samples: 24450560 | consumed tokens: 50074746880 | elapsed time per iteration (s): 0.58 | learning rate: 3.292E-05 | global batch size: 256 | lm loss: 2.589808E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.389 | TFLOPs: 41.89 | +7: iteration 95520/ 115203 | consumed samples: 24453120 | consumed tokens: 50079989760 | elapsed time per iteration (s): 0.56 | learning rate: 3.291E-05 | global batch size: 256 | lm loss: 2.573162E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.028 | TFLOPs: 43.57 | +7: iteration 95530/ 115203 | consumed samples: 24455680 | consumed tokens: 50085232640 | elapsed time per iteration (s): 0.56 | learning rate: 3.290E-05 | global batch size: 256 | lm loss: 2.584761E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.764 | TFLOPs: 43.36 | +7: iteration 95540/ 115203 | consumed samples: 24458240 | consumed tokens: 50090475520 | elapsed time per iteration (s): 0.57 | learning rate: 3.288E-05 | global batch size: 256 | lm loss: 2.567352E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.610 | TFLOPs: 42.77 | +7: iteration 95550/ 115203 | consumed samples: 24460800 | consumed tokens: 50095718400 | elapsed time per iteration (s): 0.57 | learning rate: 3.287E-05 | global batch size: 256 | lm loss: 2.569721E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.870 | TFLOPs: 42.60 | +7: iteration 95560/ 115203 | consumed samples: 24463360 | consumed tokens: 50100961280 | elapsed time per iteration (s): 0.58 | learning rate: 3.286E-05 | global batch size: 256 | lm loss: 2.577391E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.662 | TFLOPs: 42.20 | +7: iteration 95570/ 115203 | consumed samples: 24465920 | consumed tokens: 50106204160 | elapsed time per iteration (s): 0.58 | learning rate: 3.284E-05 | global batch size: 256 | lm loss: 2.562003E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.590 | TFLOPs: 42.39 | +7: iteration 95580/ 115203 | consumed samples: 24468480 | consumed tokens: 50111447040 | elapsed time per iteration (s): 0.57 | learning rate: 3.283E-05 | global batch size: 256 | lm loss: 2.571168E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.481 | TFLOPs: 42.66 | +7: iteration 95590/ 115203 | consumed samples: 24471040 | consumed tokens: 50116689920 | elapsed time per iteration (s): 0.58 | learning rate: 3.282E-05 | global batch size: 256 | lm loss: 2.592258E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.087 | TFLOPs: 42.24 | +7: iteration 95600/ 115203 | consumed samples: 24473600 | consumed tokens: 50121932800 | elapsed time per iteration (s): 0.57 | learning rate: 3.281E-05 | global batch size: 256 | lm loss: 2.574694E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.185 | TFLOPs: 42.73 | +7: iteration 95610/ 115203 | consumed samples: 24476160 | consumed tokens: 50127175680 | elapsed time per iteration (s): 0.59 | learning rate: 3.279E-05 | global batch size: 256 | lm loss: 2.588134E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.914 | TFLOPs: 41.65 | +7: iteration 95620/ 115203 | consumed samples: 24478720 | consumed tokens: 50132418560 | elapsed time per iteration (s): 0.59 | learning rate: 3.278E-05 | global batch size: 256 | lm loss: 2.572192E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.762 | TFLOPs: 41.26 | +7: iteration 95630/ 115203 | consumed samples: 24481280 | consumed tokens: 50137661440 | elapsed time per iteration (s): 0.58 | learning rate: 3.277E-05 | global batch size: 256 | lm loss: 2.582094E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.095 | TFLOPs: 41.86 | +7: iteration 95640/ 115203 | consumed samples: 24483840 | consumed tokens: 50142904320 | elapsed time per iteration (s): 0.59 | learning rate: 3.276E-05 | global batch size: 256 | lm loss: 2.575413E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.171 | TFLOPs: 41.30 | +7: iteration 95650/ 115203 | consumed samples: 24486400 | consumed tokens: 50148147200 | elapsed time per iteration (s): 0.61 | learning rate: 3.274E-05 | global batch size: 256 | lm loss: 2.582091E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 422.282 | TFLOPs: 40.26 | +7: iteration 95660/ 115203 | consumed samples: 24488960 | consumed tokens: 50153390080 | elapsed time per iteration (s): 0.59 | learning rate: 3.273E-05 | global batch size: 256 | lm loss: 2.584496E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.873 | TFLOPs: 41.37 | +7: iteration 95670/ 115203 | consumed samples: 24491520 | consumed tokens: 50158632960 | elapsed time per iteration (s): 0.58 | learning rate: 3.272E-05 | global batch size: 256 | lm loss: 2.589191E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.541 | TFLOPs: 42.38 | +7: iteration 95680/ 115203 | consumed samples: 24494080 | consumed tokens: 50163875840 | elapsed time per iteration (s): 0.57 | learning rate: 3.270E-05 | global batch size: 256 | lm loss: 2.582918E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.739 | TFLOPs: 42.88 | +7: iteration 95690/ 115203 | consumed samples: 24496640 | consumed tokens: 50169118720 | elapsed time per iteration (s): 0.56 | learning rate: 3.269E-05 | global batch size: 256 | lm loss: 2.570691E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.534 | TFLOPs: 43.43 | +7: iteration 95700/ 115203 | consumed samples: 24499200 | consumed tokens: 50174361600 | elapsed time per iteration (s): 0.58 | learning rate: 3.268E-05 | global batch size: 256 | lm loss: 2.590442E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.749 | TFLOPs: 41.93 | +7: iteration 95710/ 115203 | consumed samples: 24501760 | consumed tokens: 50179604480 | elapsed time per iteration (s): 0.57 | learning rate: 3.267E-05 | global batch size: 256 | lm loss: 2.573289E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.952 | TFLOPs: 42.99 | +7: iteration 95720/ 115203 | consumed samples: 24504320 | consumed tokens: 50184847360 | elapsed time per iteration (s): 0.59 | learning rate: 3.265E-05 | global batch size: 256 | lm loss: 2.559775E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.918 | TFLOPs: 41.46 | +7: iteration 95730/ 115203 | consumed samples: 24506880 | consumed tokens: 50190090240 | elapsed time per iteration (s): 0.60 | learning rate: 3.264E-05 | global batch size: 256 | lm loss: 2.578472E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 423.862 | TFLOPs: 40.41 | +7: iteration 95740/ 115203 | consumed samples: 24509440 | consumed tokens: 50195333120 | elapsed time per iteration (s): 0.58 | learning rate: 3.263E-05 | global batch size: 256 | lm loss: 2.572955E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.203 | TFLOPs: 42.06 | +7: iteration 95750/ 115203 | consumed samples: 24512000 | consumed tokens: 50200576000 | elapsed time per iteration (s): 0.59 | learning rate: 3.262E-05 | global batch size: 256 | lm loss: 2.581945E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.078 | TFLOPs: 41.29 | +7: iteration 95760/ 115203 | consumed samples: 24514560 | consumed tokens: 50205818880 | elapsed time per iteration (s): 0.59 | learning rate: 3.260E-05 | global batch size: 256 | lm loss: 2.572694E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.252 | TFLOPs: 41.31 | +7: iteration 95770/ 115203 | consumed samples: 24517120 | consumed tokens: 50211061760 | elapsed time per iteration (s): 0.57 | learning rate: 3.259E-05 | global batch size: 256 | lm loss: 2.584915E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.469 | TFLOPs: 42.47 | +7: iteration 95780/ 115203 | consumed samples: 24519680 | consumed tokens: 50216304640 | elapsed time per iteration (s): 0.60 | learning rate: 3.258E-05 | global batch size: 256 | lm loss: 2.569771E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.600 | TFLOPs: 40.58 | +7: iteration 95790/ 115203 | consumed samples: 24522240 | consumed tokens: 50221547520 | elapsed time per iteration (s): 0.58 | learning rate: 3.256E-05 | global batch size: 256 | lm loss: 2.574840E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.325 | TFLOPs: 42.27 | +7: iteration 95800/ 115203 | consumed samples: 24524800 | consumed tokens: 50226790400 | elapsed time per iteration (s): 0.59 | learning rate: 3.255E-05 | global batch size: 256 | lm loss: 2.573205E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.198 | TFLOPs: 41.21 | +7: iteration 95810/ 115203 | consumed samples: 24527360 | consumed tokens: 50232033280 | elapsed time per iteration (s): 0.59 | learning rate: 3.254E-05 | global batch size: 256 | lm loss: 2.571963E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.559 | TFLOPs: 41.43 | +7: iteration 95820/ 115203 | consumed samples: 24529920 | consumed tokens: 50237276160 | elapsed time per iteration (s): 0.59 | learning rate: 3.253E-05 | global batch size: 256 | lm loss: 2.577836E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.275 | TFLOPs: 41.21 | +7: iteration 95830/ 115203 | consumed samples: 24532480 | consumed tokens: 50242519040 | elapsed time per iteration (s): 0.57 | learning rate: 3.251E-05 | global batch size: 256 | lm loss: 2.584238E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.204 | TFLOPs: 42.83 | +7: iteration 95840/ 115203 | consumed samples: 24535040 | consumed tokens: 50247761920 | elapsed time per iteration (s): 0.57 | learning rate: 3.250E-05 | global batch size: 256 | lm loss: 2.570689E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.587 | TFLOPs: 42.77 | +7: iteration 95850/ 115203 | consumed samples: 24537600 | consumed tokens: 50253004800 | elapsed time per iteration (s): 0.58 | learning rate: 3.249E-05 | global batch size: 256 | lm loss: 2.589287E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.324 | TFLOPs: 41.79 | +7: iteration 95860/ 115203 | consumed samples: 24540160 | consumed tokens: 50258247680 | elapsed time per iteration (s): 0.56 | learning rate: 3.248E-05 | global batch size: 256 | lm loss: 2.567774E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.955 | TFLOPs: 43.47 | +7: iteration 95870/ 115203 | consumed samples: 24542720 | consumed tokens: 50263490560 | elapsed time per iteration (s): 0.57 | learning rate: 3.246E-05 | global batch size: 256 | lm loss: 2.575052E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.632 | TFLOPs: 42.58 | +7: iteration 95880/ 115203 | consumed samples: 24545280 | consumed tokens: 50268733440 | elapsed time per iteration (s): 0.57 | learning rate: 3.245E-05 | global batch size: 256 | lm loss: 2.579340E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.457 | TFLOPs: 42.56 | +7: iteration 95890/ 115203 | consumed samples: 24547840 | consumed tokens: 50273976320 | elapsed time per iteration (s): 0.57 | learning rate: 3.244E-05 | global batch size: 256 | lm loss: 2.562608E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.898 | TFLOPs: 42.70 | +7: iteration 95900/ 115203 | consumed samples: 24550400 | consumed tokens: 50279219200 | elapsed time per iteration (s): 0.59 | learning rate: 3.243E-05 | global batch size: 256 | lm loss: 2.570712E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.246 | TFLOPs: 41.31 | +7: iteration 95910/ 115203 | consumed samples: 24552960 | consumed tokens: 50284462080 | elapsed time per iteration (s): 0.59 | learning rate: 3.241E-05 | global batch size: 256 | lm loss: 2.571816E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.503 | TFLOPs: 41.71 | +7: iteration 95920/ 115203 | consumed samples: 24555520 | consumed tokens: 50289704960 | elapsed time per iteration (s): 0.58 | learning rate: 3.240E-05 | global batch size: 256 | lm loss: 2.582254E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.911 | TFLOPs: 42.42 | +7: iteration 95930/ 115203 | consumed samples: 24558080 | consumed tokens: 50294947840 | elapsed time per iteration (s): 0.57 | learning rate: 3.239E-05 | global batch size: 256 | lm loss: 2.573588E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.736 | TFLOPs: 42.88 | +7: iteration 95940/ 115203 | consumed samples: 24560640 | consumed tokens: 50300190720 | elapsed time per iteration (s): 0.57 | learning rate: 3.238E-05 | global batch size: 256 | lm loss: 2.582516E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.701 | TFLOPs: 42.59 | +7: iteration 95950/ 115203 | consumed samples: 24563200 | consumed tokens: 50305433600 | elapsed time per iteration (s): 0.60 | learning rate: 3.236E-05 | global batch size: 256 | lm loss: 2.554891E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.099 | TFLOPs: 40.91 | +7: iteration 95960/ 115203 | consumed samples: 24565760 | consumed tokens: 50310676480 | elapsed time per iteration (s): 0.57 | learning rate: 3.235E-05 | global batch size: 256 | lm loss: 2.571093E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.075 | TFLOPs: 42.62 | +7: iteration 95970/ 115203 | consumed samples: 24568320 | consumed tokens: 50315919360 | elapsed time per iteration (s): 0.57 | learning rate: 3.234E-05 | global batch size: 256 | lm loss: 2.564298E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.363 | TFLOPs: 43.03 | +7: iteration 95980/ 115203 | consumed samples: 24570880 | consumed tokens: 50321162240 | elapsed time per iteration (s): 0.59 | learning rate: 3.233E-05 | global batch size: 256 | lm loss: 2.584812E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.197 | TFLOPs: 41.30 | +7: iteration 95990/ 115203 | consumed samples: 24573440 | consumed tokens: 50326405120 | elapsed time per iteration (s): 0.57 | learning rate: 3.231E-05 | global batch size: 256 | lm loss: 2.584389E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.332 | TFLOPs: 42.93 | +0: [2023-03-17 03:53:36,558] [INFO] [logging.py:68:log_dist] [Rank 0] step=96000, skipped=0, lr=[3.230082550465275e-05, 3.230082550465275e-05, 3.230082550465275e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 96000/ 115203 | consumed samples: 24576000 | consumed tokens: 50331648000 | elapsed time per iteration (s): 0.59 | learning rate: 3.230E-05 | global batch size: 256 | lm loss: 2.572897E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.837 | TFLOPs: 41.55 | +0: steps: 96000 loss: 2.5993 iter time (s): 0.566 samples/sec: 452.586 +7: iteration 96010/ 115203 | consumed samples: 24578560 | consumed tokens: 50336890880 | elapsed time per iteration (s): 0.58 | learning rate: 3.229E-05 | global batch size: 256 | lm loss: 2.580522E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.717 | TFLOPs: 42.11 | +7: iteration 96020/ 115203 | consumed samples: 24581120 | consumed tokens: 50342133760 | elapsed time per iteration (s): 0.57 | learning rate: 3.228E-05 | global batch size: 256 | lm loss: 2.569264E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.146 | TFLOPs: 42.63 | +7: iteration 96030/ 115203 | consumed samples: 24583680 | consumed tokens: 50347376640 | elapsed time per iteration (s): 0.59 | learning rate: 3.226E-05 | global batch size: 256 | lm loss: 2.584468E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.524 | TFLOPs: 41.52 | +7: iteration 96040/ 115203 | consumed samples: 24586240 | consumed tokens: 50352619520 | elapsed time per iteration (s): 0.55 | learning rate: 3.225E-05 | global batch size: 256 | lm loss: 2.580637E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.725 | TFLOPs: 44.02 | +7: iteration 96050/ 115203 | consumed samples: 24588800 | consumed tokens: 50357862400 | elapsed time per iteration (s): 0.60 | learning rate: 3.224E-05 | global batch size: 256 | lm loss: 2.584535E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.249 | TFLOPs: 40.83 | +7: iteration 96060/ 115203 | consumed samples: 24591360 | consumed tokens: 50363105280 | elapsed time per iteration (s): 0.59 | learning rate: 3.223E-05 | global batch size: 256 | lm loss: 2.572219E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.420 | TFLOPs: 41.61 | +7: iteration 96070/ 115203 | consumed samples: 24593920 | consumed tokens: 50368348160 | elapsed time per iteration (s): 0.58 | learning rate: 3.221E-05 | global batch size: 256 | lm loss: 2.575922E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.691 | TFLOPs: 42.02 | +7: iteration 96080/ 115203 | consumed samples: 24596480 | consumed tokens: 50373591040 | elapsed time per iteration (s): 0.58 | learning rate: 3.220E-05 | global batch size: 256 | lm loss: 2.593114E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.102 | TFLOPs: 42.15 | +7: iteration 96090/ 115203 | consumed samples: 24599040 | consumed tokens: 50378833920 | elapsed time per iteration (s): 0.57 | learning rate: 3.219E-05 | global batch size: 256 | lm loss: 2.566963E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.823 | TFLOPs: 42.50 | +7: iteration 96100/ 115203 | consumed samples: 24601600 | consumed tokens: 50384076800 | elapsed time per iteration (s): 0.57 | learning rate: 3.218E-05 | global batch size: 256 | lm loss: 2.579213E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.536 | TFLOPs: 43.05 | +7: iteration 96110/ 115203 | consumed samples: 24604160 | consumed tokens: 50389319680 | elapsed time per iteration (s): 0.56 | learning rate: 3.216E-05 | global batch size: 256 | lm loss: 2.581775E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.539 | TFLOPs: 43.72 | +7: iteration 96120/ 115203 | consumed samples: 24606720 | consumed tokens: 50394562560 | elapsed time per iteration (s): 0.56 | learning rate: 3.215E-05 | global batch size: 256 | lm loss: 2.573210E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.314 | TFLOPs: 43.22 | +7: iteration 96130/ 115203 | consumed samples: 24609280 | consumed tokens: 50399805440 | elapsed time per iteration (s): 0.56 | learning rate: 3.214E-05 | global batch size: 256 | lm loss: 2.570287E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.913 | TFLOPs: 43.28 | +7: iteration 96140/ 115203 | consumed samples: 24611840 | consumed tokens: 50405048320 | elapsed time per iteration (s): 0.57 | learning rate: 3.213E-05 | global batch size: 256 | lm loss: 2.579205E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.845 | TFLOPs: 43.08 | +7: iteration 96150/ 115203 | consumed samples: 24614400 | consumed tokens: 50410291200 | elapsed time per iteration (s): 0.57 | learning rate: 3.211E-05 | global batch size: 256 | lm loss: 2.580085E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.845 | TFLOPs: 42.89 | +7: iteration 96160/ 115203 | consumed samples: 24616960 | consumed tokens: 50415534080 | elapsed time per iteration (s): 0.55 | learning rate: 3.210E-05 | global batch size: 256 | lm loss: 2.593371E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.756 | TFLOPs: 44.02 | +7: iteration 96170/ 115203 | consumed samples: 24619520 | consumed tokens: 50420776960 | elapsed time per iteration (s): 0.56 | learning rate: 3.209E-05 | global batch size: 256 | lm loss: 2.575423E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.813 | TFLOPs: 43.36 | +7: iteration 96180/ 115203 | consumed samples: 24622080 | consumed tokens: 50426019840 | elapsed time per iteration (s): 0.59 | learning rate: 3.208E-05 | global batch size: 256 | lm loss: 2.561378E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.284 | TFLOPs: 41.69 | +7: iteration 96190/ 115203 | consumed samples: 24624640 | consumed tokens: 50431262720 | elapsed time per iteration (s): 0.57 | learning rate: 3.206E-05 | global batch size: 256 | lm loss: 2.580641E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.079 | TFLOPs: 42.53 | +7: iteration 96200/ 115203 | consumed samples: 24627200 | consumed tokens: 50436505600 | elapsed time per iteration (s): 0.57 | learning rate: 3.205E-05 | global batch size: 256 | lm loss: 2.595550E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.201 | TFLOPs: 42.64 | +7: iteration 96210/ 115203 | consumed samples: 24629760 | consumed tokens: 50441748480 | elapsed time per iteration (s): 0.56 | learning rate: 3.204E-05 | global batch size: 256 | lm loss: 2.589811E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.405 | TFLOPs: 43.61 | +7: iteration 96220/ 115203 | consumed samples: 24632320 | consumed tokens: 50446991360 | elapsed time per iteration (s): 0.57 | learning rate: 3.203E-05 | global batch size: 256 | lm loss: 2.570947E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.345 | TFLOPs: 43.03 | +7: iteration 96230/ 115203 | consumed samples: 24634880 | consumed tokens: 50452234240 | elapsed time per iteration (s): 0.58 | learning rate: 3.201E-05 | global batch size: 256 | lm loss: 2.568917E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.131 | TFLOPs: 42.06 | +7: iteration 96240/ 115203 | consumed samples: 24637440 | consumed tokens: 50457477120 | elapsed time per iteration (s): 0.57 | learning rate: 3.200E-05 | global batch size: 256 | lm loss: 2.574399E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.310 | TFLOPs: 43.12 | +7: iteration 96250/ 115203 | consumed samples: 24640000 | consumed tokens: 50462720000 | elapsed time per iteration (s): 0.56 | learning rate: 3.199E-05 | global batch size: 256 | lm loss: 2.570370E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.078 | TFLOPs: 43.48 | +7: iteration 96260/ 115203 | consumed samples: 24642560 | consumed tokens: 50467962880 | elapsed time per iteration (s): 0.57 | learning rate: 3.198E-05 | global batch size: 256 | lm loss: 2.562677E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.685 | TFLOPs: 42.49 | +7: iteration 96270/ 115203 | consumed samples: 24645120 | consumed tokens: 50473205760 | elapsed time per iteration (s): 0.57 | learning rate: 3.197E-05 | global batch size: 256 | lm loss: 2.576523E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.360 | TFLOPs: 42.65 | +7: iteration 96280/ 115203 | consumed samples: 24647680 | consumed tokens: 50478448640 | elapsed time per iteration (s): 0.57 | learning rate: 3.195E-05 | global batch size: 256 | lm loss: 2.578094E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.622 | TFLOPs: 42.77 | +7: iteration 96290/ 115203 | consumed samples: 24650240 | consumed tokens: 50483691520 | elapsed time per iteration (s): 0.57 | learning rate: 3.194E-05 | global batch size: 256 | lm loss: 2.589337E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.791 | TFLOPs: 42.69 | +7: iteration 96300/ 115203 | consumed samples: 24652800 | consumed tokens: 50488934400 | elapsed time per iteration (s): 0.57 | learning rate: 3.193E-05 | global batch size: 256 | lm loss: 2.578867E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.932 | TFLOPs: 43.09 | +7: iteration 96310/ 115203 | consumed samples: 24655360 | consumed tokens: 50494177280 | elapsed time per iteration (s): 0.56 | learning rate: 3.192E-05 | global batch size: 256 | lm loss: 2.567114E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.554 | TFLOPs: 43.81 | +7: iteration 96320/ 115203 | consumed samples: 24657920 | consumed tokens: 50499420160 | elapsed time per iteration (s): 0.58 | learning rate: 3.190E-05 | global batch size: 256 | lm loss: 2.586714E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.827 | TFLOPs: 42.22 | +7: iteration 96330/ 115203 | consumed samples: 24660480 | consumed tokens: 50504663040 | elapsed time per iteration (s): 0.58 | learning rate: 3.189E-05 | global batch size: 256 | lm loss: 2.570126E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.393 | TFLOPs: 42.37 | +7: iteration 96340/ 115203 | consumed samples: 24663040 | consumed tokens: 50509905920 | elapsed time per iteration (s): 0.56 | learning rate: 3.188E-05 | global batch size: 256 | lm loss: 2.593747E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.669 | TFLOPs: 43.73 | +7: iteration 96350/ 115203 | consumed samples: 24665600 | consumed tokens: 50515148800 | elapsed time per iteration (s): 0.57 | learning rate: 3.187E-05 | global batch size: 256 | lm loss: 2.582276E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.571 | TFLOPs: 42.96 | +7: iteration 96360/ 115203 | consumed samples: 24668160 | consumed tokens: 50520391680 | elapsed time per iteration (s): 0.55 | learning rate: 3.185E-05 | global batch size: 256 | lm loss: 2.567677E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.423 | TFLOPs: 43.99 | +7: iteration 96370/ 115203 | consumed samples: 24670720 | consumed tokens: 50525634560 | elapsed time per iteration (s): 0.57 | learning rate: 3.184E-05 | global batch size: 256 | lm loss: 2.586630E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.371 | TFLOPs: 42.75 | +7: iteration 96380/ 115203 | consumed samples: 24673280 | consumed tokens: 50530877440 | elapsed time per iteration (s): 0.56 | learning rate: 3.183E-05 | global batch size: 256 | lm loss: 2.571631E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.807 | TFLOPs: 43.55 | +7: iteration 96390/ 115203 | consumed samples: 24675840 | consumed tokens: 50536120320 | elapsed time per iteration (s): 0.56 | learning rate: 3.182E-05 | global batch size: 256 | lm loss: 2.571074E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.990 | TFLOPs: 43.47 | +7: iteration 96400/ 115203 | consumed samples: 24678400 | consumed tokens: 50541363200 | elapsed time per iteration (s): 0.57 | learning rate: 3.181E-05 | global batch size: 256 | lm loss: 2.582838E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.778 | TFLOPs: 43.07 | +7: iteration 96410/ 115203 | consumed samples: 24680960 | consumed tokens: 50546606080 | elapsed time per iteration (s): 0.56 | learning rate: 3.179E-05 | global batch size: 256 | lm loss: 2.582330E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.382 | TFLOPs: 43.42 | +7: iteration 96420/ 115203 | consumed samples: 24683520 | consumed tokens: 50551848960 | elapsed time per iteration (s): 0.58 | learning rate: 3.178E-05 | global batch size: 256 | lm loss: 2.571990E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.840 | TFLOPs: 42.03 | +7: iteration 96430/ 115203 | consumed samples: 24686080 | consumed tokens: 50557091840 | elapsed time per iteration (s): 0.56 | learning rate: 3.177E-05 | global batch size: 256 | lm loss: 2.575176E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.201 | TFLOPs: 43.21 | +7: iteration 96440/ 115203 | consumed samples: 24688640 | consumed tokens: 50562334720 | elapsed time per iteration (s): 0.56 | learning rate: 3.176E-05 | global batch size: 256 | lm loss: 2.567668E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.711 | TFLOPs: 43.64 | +7: iteration 96450/ 115203 | consumed samples: 24691200 | consumed tokens: 50567577600 | elapsed time per iteration (s): 0.57 | learning rate: 3.174E-05 | global batch size: 256 | lm loss: 2.586265E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.098 | TFLOPs: 42.82 | +7: iteration 96460/ 115203 | consumed samples: 24693760 | consumed tokens: 50572820480 | elapsed time per iteration (s): 0.56 | learning rate: 3.173E-05 | global batch size: 256 | lm loss: 2.578048E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.029 | TFLOPs: 43.67 | +7: iteration 96470/ 115203 | consumed samples: 24696320 | consumed tokens: 50578063360 | elapsed time per iteration (s): 0.56 | learning rate: 3.172E-05 | global batch size: 256 | lm loss: 2.577444E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.601 | TFLOPs: 43.44 | +7: iteration 96480/ 115203 | consumed samples: 24698880 | consumed tokens: 50583306240 | elapsed time per iteration (s): 0.57 | learning rate: 3.171E-05 | global batch size: 256 | lm loss: 2.570568E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.997 | TFLOPs: 43.00 | +7: iteration 96490/ 115203 | consumed samples: 24701440 | consumed tokens: 50588549120 | elapsed time per iteration (s): 0.57 | learning rate: 3.169E-05 | global batch size: 256 | lm loss: 2.579583E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.621 | TFLOPs: 42.87 | +7: iteration 96500/ 115203 | consumed samples: 24704000 | consumed tokens: 50593792000 | elapsed time per iteration (s): 0.57 | learning rate: 3.168E-05 | global batch size: 256 | lm loss: 2.566205E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.922 | TFLOPs: 43.18 | +7: iteration 96510/ 115203 | consumed samples: 24706560 | consumed tokens: 50599034880 | elapsed time per iteration (s): 0.57 | learning rate: 3.167E-05 | global batch size: 256 | lm loss: 2.568581E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.066 | TFLOPs: 42.91 | +7: iteration 96520/ 115203 | consumed samples: 24709120 | consumed tokens: 50604277760 | elapsed time per iteration (s): 0.57 | learning rate: 3.166E-05 | global batch size: 256 | lm loss: 2.571047E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.752 | TFLOPs: 42.78 | +7: iteration 96530/ 115203 | consumed samples: 24711680 | consumed tokens: 50609520640 | elapsed time per iteration (s): 0.56 | learning rate: 3.165E-05 | global batch size: 256 | lm loss: 2.562695E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.392 | TFLOPs: 43.51 | +7: iteration 96540/ 115203 | consumed samples: 24714240 | consumed tokens: 50614763520 | elapsed time per iteration (s): 0.56 | learning rate: 3.163E-05 | global batch size: 256 | lm loss: 2.568087E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.928 | TFLOPs: 43.66 | +7: iteration 96550/ 115203 | consumed samples: 24716800 | consumed tokens: 50620006400 | elapsed time per iteration (s): 0.56 | learning rate: 3.162E-05 | global batch size: 256 | lm loss: 2.580010E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.175 | TFLOPs: 43.30 | +7: iteration 96560/ 115203 | consumed samples: 24719360 | consumed tokens: 50625249280 | elapsed time per iteration (s): 0.56 | learning rate: 3.161E-05 | global batch size: 256 | lm loss: 2.578501E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.753 | TFLOPs: 43.36 | +7: iteration 96570/ 115203 | consumed samples: 24721920 | consumed tokens: 50630492160 | elapsed time per iteration (s): 0.56 | learning rate: 3.160E-05 | global batch size: 256 | lm loss: 2.580104E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.372 | TFLOPs: 43.22 | +7: iteration 96580/ 115203 | consumed samples: 24724480 | consumed tokens: 50635735040 | elapsed time per iteration (s): 0.56 | learning rate: 3.159E-05 | global batch size: 256 | lm loss: 2.568763E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.920 | TFLOPs: 43.37 | +7: iteration 96590/ 115203 | consumed samples: 24727040 | consumed tokens: 50640977920 | elapsed time per iteration (s): 0.56 | learning rate: 3.157E-05 | global batch size: 256 | lm loss: 2.579188E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.712 | TFLOPs: 43.54 | +7: iteration 96600/ 115203 | consumed samples: 24729600 | consumed tokens: 50646220800 | elapsed time per iteration (s): 0.60 | learning rate: 3.156E-05 | global batch size: 256 | lm loss: 2.568446E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.090 | TFLOPs: 41.00 | +7: iteration 96610/ 115203 | consumed samples: 24732160 | consumed tokens: 50651463680 | elapsed time per iteration (s): 0.58 | learning rate: 3.155E-05 | global batch size: 256 | lm loss: 2.572837E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.145 | TFLOPs: 42.15 | +7: iteration 96620/ 115203 | consumed samples: 24734720 | consumed tokens: 50656706560 | elapsed time per iteration (s): 0.59 | learning rate: 3.154E-05 | global batch size: 256 | lm loss: 2.568185E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.490 | TFLOPs: 41.42 | +7: iteration 96630/ 115203 | consumed samples: 24737280 | consumed tokens: 50661949440 | elapsed time per iteration (s): 0.58 | learning rate: 3.152E-05 | global batch size: 256 | lm loss: 2.585103E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.741 | TFLOPs: 42.40 | +7: iteration 96640/ 115203 | consumed samples: 24739840 | consumed tokens: 50667192320 | elapsed time per iteration (s): 0.56 | learning rate: 3.151E-05 | global batch size: 256 | lm loss: 2.572028E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.158 | TFLOPs: 43.30 | +7: iteration 96650/ 115203 | consumed samples: 24742400 | consumed tokens: 50672435200 | elapsed time per iteration (s): 0.57 | learning rate: 3.150E-05 | global batch size: 256 | lm loss: 2.590766E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.399 | TFLOPs: 42.56 | +7: iteration 96660/ 115203 | consumed samples: 24744960 | consumed tokens: 50677678080 | elapsed time per iteration (s): 0.55 | learning rate: 3.149E-05 | global batch size: 256 | lm loss: 2.584075E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.623 | TFLOPs: 44.01 | +7: iteration 96670/ 115203 | consumed samples: 24747520 | consumed tokens: 50682920960 | elapsed time per iteration (s): 0.58 | learning rate: 3.148E-05 | global batch size: 256 | lm loss: 2.573674E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.069 | TFLOPs: 41.77 | +7: iteration 96680/ 115203 | consumed samples: 24750080 | consumed tokens: 50688163840 | elapsed time per iteration (s): 0.58 | learning rate: 3.146E-05 | global batch size: 256 | lm loss: 2.573230E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.469 | TFLOPs: 41.80 | +7: iteration 96690/ 115203 | consumed samples: 24752640 | consumed tokens: 50693406720 | elapsed time per iteration (s): 0.56 | learning rate: 3.145E-05 | global batch size: 256 | lm loss: 2.572864E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.601 | TFLOPs: 43.53 | +7: iteration 96700/ 115203 | consumed samples: 24755200 | consumed tokens: 50698649600 | elapsed time per iteration (s): 0.57 | learning rate: 3.144E-05 | global batch size: 256 | lm loss: 2.572691E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.912 | TFLOPs: 42.89 | +7: iteration 96710/ 115203 | consumed samples: 24757760 | consumed tokens: 50703892480 | elapsed time per iteration (s): 0.57 | learning rate: 3.143E-05 | global batch size: 256 | lm loss: 2.579115E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.357 | TFLOPs: 43.13 | +7: iteration 96720/ 115203 | consumed samples: 24760320 | consumed tokens: 50709135360 | elapsed time per iteration (s): 0.57 | learning rate: 3.142E-05 | global batch size: 256 | lm loss: 2.572685E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.347 | TFLOPs: 43.03 | +7: iteration 96730/ 115203 | consumed samples: 24762880 | consumed tokens: 50714378240 | elapsed time per iteration (s): 0.57 | learning rate: 3.140E-05 | global batch size: 256 | lm loss: 2.573357E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.844 | TFLOPs: 42.89 | +7: iteration 96740/ 115203 | consumed samples: 24765440 | consumed tokens: 50719621120 | elapsed time per iteration (s): 0.56 | learning rate: 3.139E-05 | global batch size: 256 | lm loss: 2.568653E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.246 | TFLOPs: 43.31 | +7: iteration 96750/ 115203 | consumed samples: 24768000 | consumed tokens: 50724864000 | elapsed time per iteration (s): 0.58 | learning rate: 3.138E-05 | global batch size: 256 | lm loss: 2.587164E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.197 | TFLOPs: 42.16 | +7: iteration 96760/ 115203 | consumed samples: 24770560 | consumed tokens: 50730106880 | elapsed time per iteration (s): 0.58 | learning rate: 3.137E-05 | global batch size: 256 | lm loss: 2.577994E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.953 | TFLOPs: 42.42 | +7: iteration 96770/ 115203 | consumed samples: 24773120 | consumed tokens: 50735349760 | elapsed time per iteration (s): 0.56 | learning rate: 3.136E-05 | global batch size: 256 | lm loss: 2.570791E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.083 | TFLOPs: 43.96 | +7: iteration 96780/ 115203 | consumed samples: 24775680 | consumed tokens: 50740592640 | elapsed time per iteration (s): 0.56 | learning rate: 3.134E-05 | global batch size: 256 | lm loss: 2.576723E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.514 | TFLOPs: 43.52 | +7: iteration 96790/ 115203 | consumed samples: 24778240 | consumed tokens: 50745835520 | elapsed time per iteration (s): 0.56 | learning rate: 3.133E-05 | global batch size: 256 | lm loss: 2.576348E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.084 | TFLOPs: 43.29 | +7: iteration 96800/ 115203 | consumed samples: 24780800 | consumed tokens: 50751078400 | elapsed time per iteration (s): 0.58 | learning rate: 3.132E-05 | global batch size: 256 | lm loss: 2.584341E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.153 | TFLOPs: 42.35 | +7: iteration 96810/ 115203 | consumed samples: 24783360 | consumed tokens: 50756321280 | elapsed time per iteration (s): 0.57 | learning rate: 3.131E-05 | global batch size: 256 | lm loss: 2.580099E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.914 | TFLOPs: 42.61 | +7: iteration 96820/ 115203 | consumed samples: 24785920 | consumed tokens: 50761564160 | elapsed time per iteration (s): 0.57 | learning rate: 3.129E-05 | global batch size: 256 | lm loss: 2.562474E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.863 | TFLOPs: 42.60 | +7: iteration 96830/ 115203 | consumed samples: 24788480 | consumed tokens: 50766807040 | elapsed time per iteration (s): 0.56 | learning rate: 3.128E-05 | global batch size: 256 | lm loss: 2.580944E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.057 | TFLOPs: 43.48 | +7: iteration 96840/ 115203 | consumed samples: 24791040 | consumed tokens: 50772049920 | elapsed time per iteration (s): 0.57 | learning rate: 3.127E-05 | global batch size: 256 | lm loss: 2.565599E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.681 | TFLOPs: 43.16 | +7: iteration 96850/ 115203 | consumed samples: 24793600 | consumed tokens: 50777292800 | elapsed time per iteration (s): 0.57 | learning rate: 3.126E-05 | global batch size: 256 | lm loss: 2.587149E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.777 | TFLOPs: 42.79 | +7: iteration 96860/ 115203 | consumed samples: 24796160 | consumed tokens: 50782535680 | elapsed time per iteration (s): 0.56 | learning rate: 3.125E-05 | global batch size: 256 | lm loss: 2.587998E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.937 | TFLOPs: 43.47 | +7: iteration 96870/ 115203 | consumed samples: 24798720 | consumed tokens: 50787778560 | elapsed time per iteration (s): 0.56 | learning rate: 3.123E-05 | global batch size: 256 | lm loss: 2.567806E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.919 | TFLOPs: 43.28 | +7: iteration 96880/ 115203 | consumed samples: 24801280 | consumed tokens: 50793021440 | elapsed time per iteration (s): 0.57 | learning rate: 3.122E-05 | global batch size: 256 | lm loss: 2.582721E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.295 | TFLOPs: 42.64 | +7: iteration 96890/ 115203 | consumed samples: 24803840 | consumed tokens: 50798264320 | elapsed time per iteration (s): 0.56 | learning rate: 3.121E-05 | global batch size: 256 | lm loss: 2.578879E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.684 | TFLOPs: 43.35 | +7: iteration 96900/ 115203 | consumed samples: 24806400 | consumed tokens: 50803507200 | elapsed time per iteration (s): 0.56 | learning rate: 3.120E-05 | global batch size: 256 | lm loss: 2.581043E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.036 | TFLOPs: 43.76 | +7: iteration 96910/ 115203 | consumed samples: 24808960 | consumed tokens: 50808750080 | elapsed time per iteration (s): 0.57 | learning rate: 3.119E-05 | global batch size: 256 | lm loss: 2.580600E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.283 | TFLOPs: 43.12 | +7: iteration 96920/ 115203 | consumed samples: 24811520 | consumed tokens: 50813992960 | elapsed time per iteration (s): 0.57 | learning rate: 3.117E-05 | global batch size: 256 | lm loss: 2.551479E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.148 | TFLOPs: 43.01 | +7: iteration 96930/ 115203 | consumed samples: 24814080 | consumed tokens: 50819235840 | elapsed time per iteration (s): 0.56 | learning rate: 3.116E-05 | global batch size: 256 | lm loss: 2.576342E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.635 | TFLOPs: 43.25 | +7: iteration 96940/ 115203 | consumed samples: 24816640 | consumed tokens: 50824478720 | elapsed time per iteration (s): 0.55 | learning rate: 3.115E-05 | global batch size: 256 | lm loss: 2.569210E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.469 | TFLOPs: 44.00 | +7: iteration 96950/ 115203 | consumed samples: 24819200 | consumed tokens: 50829721600 | elapsed time per iteration (s): 0.56 | learning rate: 3.114E-05 | global batch size: 256 | lm loss: 2.582067E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.975 | TFLOPs: 43.47 | +7: iteration 96960/ 115203 | consumed samples: 24821760 | consumed tokens: 50834964480 | elapsed time per iteration (s): 0.58 | learning rate: 3.113E-05 | global batch size: 256 | lm loss: 2.585241E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.850 | TFLOPs: 42.41 | +7: iteration 96970/ 115203 | consumed samples: 24824320 | consumed tokens: 50840207360 | elapsed time per iteration (s): 0.60 | learning rate: 3.112E-05 | global batch size: 256 | lm loss: 2.588865E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.787 | TFLOPs: 40.98 | +7: iteration 96980/ 115203 | consumed samples: 24826880 | consumed tokens: 50845450240 | elapsed time per iteration (s): 0.58 | learning rate: 3.110E-05 | global batch size: 256 | lm loss: 2.588735E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.443 | TFLOPs: 42.28 | +7: iteration 96990/ 115203 | consumed samples: 24829440 | consumed tokens: 50850693120 | elapsed time per iteration (s): 0.57 | learning rate: 3.109E-05 | global batch size: 256 | lm loss: 2.578637E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.688 | TFLOPs: 42.68 | +7: iteration 97000/ 115203 | consumed samples: 24832000 | consumed tokens: 50855936000 | elapsed time per iteration (s): 0.57 | learning rate: 3.108E-05 | global batch size: 256 | lm loss: 2.575414E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.009 | TFLOPs: 43.00 | +7: iteration 97010/ 115203 | consumed samples: 24834560 | consumed tokens: 50861178880 | elapsed time per iteration (s): 0.58 | learning rate: 3.107E-05 | global batch size: 256 | lm loss: 2.568817E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.256 | TFLOPs: 42.35 | +7: iteration 97020/ 115203 | consumed samples: 24837120 | consumed tokens: 50866421760 | elapsed time per iteration (s): 0.56 | learning rate: 3.106E-05 | global batch size: 256 | lm loss: 2.581827E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.289 | TFLOPs: 43.22 | +7: iteration 97030/ 115203 | consumed samples: 24839680 | consumed tokens: 50871664640 | elapsed time per iteration (s): 0.57 | learning rate: 3.104E-05 | global batch size: 256 | lm loss: 2.577560E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.778 | TFLOPs: 42.88 | +7: iteration 97040/ 115203 | consumed samples: 24842240 | consumed tokens: 50876907520 | elapsed time per iteration (s): 0.57 | learning rate: 3.103E-05 | global batch size: 256 | lm loss: 2.576719E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.242 | TFLOPs: 42.64 | +7: iteration 97050/ 115203 | consumed samples: 24844800 | consumed tokens: 50882150400 | elapsed time per iteration (s): 0.57 | learning rate: 3.102E-05 | global batch size: 256 | lm loss: 2.573204E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.228 | TFLOPs: 42.64 | +7: iteration 97060/ 115203 | consumed samples: 24847360 | consumed tokens: 50887393280 | elapsed time per iteration (s): 0.56 | learning rate: 3.101E-05 | global batch size: 256 | lm loss: 2.579358E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.385 | TFLOPs: 43.70 | +7: iteration 97070/ 115203 | consumed samples: 24849920 | consumed tokens: 50892636160 | elapsed time per iteration (s): 0.59 | learning rate: 3.100E-05 | global batch size: 256 | lm loss: 2.575741E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.043 | TFLOPs: 41.67 | +7: iteration 97080/ 115203 | consumed samples: 24852480 | consumed tokens: 50897879040 | elapsed time per iteration (s): 0.58 | learning rate: 3.098E-05 | global batch size: 256 | lm loss: 2.575332E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.626 | TFLOPs: 42.10 | +7: iteration 97090/ 115203 | consumed samples: 24855040 | consumed tokens: 50903121920 | elapsed time per iteration (s): 0.58 | learning rate: 3.097E-05 | global batch size: 256 | lm loss: 2.577804E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.649 | TFLOPs: 41.73 | +7: iteration 97100/ 115203 | consumed samples: 24857600 | consumed tokens: 50908364800 | elapsed time per iteration (s): 0.56 | learning rate: 3.096E-05 | global batch size: 256 | lm loss: 2.566068E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.245 | TFLOPs: 43.31 | +7: iteration 97110/ 115203 | consumed samples: 24860160 | consumed tokens: 50913607680 | elapsed time per iteration (s): 0.58 | learning rate: 3.095E-05 | global batch size: 256 | lm loss: 2.563206E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.923 | TFLOPs: 42.23 | +7: iteration 97120/ 115203 | consumed samples: 24862720 | consumed tokens: 50918850560 | elapsed time per iteration (s): 0.57 | learning rate: 3.094E-05 | global batch size: 256 | lm loss: 2.579285E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.370 | TFLOPs: 42.75 | +7: iteration 97130/ 115203 | consumed samples: 24865280 | consumed tokens: 50924093440 | elapsed time per iteration (s): 0.57 | learning rate: 3.092E-05 | global batch size: 256 | lm loss: 2.578438E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.823 | TFLOPs: 42.70 | +7: iteration 97140/ 115203 | consumed samples: 24867840 | consumed tokens: 50929336320 | elapsed time per iteration (s): 0.56 | learning rate: 3.091E-05 | global batch size: 256 | lm loss: 2.582374E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.229 | TFLOPs: 43.69 | +7: iteration 97150/ 115203 | consumed samples: 24870400 | consumed tokens: 50934579200 | elapsed time per iteration (s): 0.56 | learning rate: 3.090E-05 | global batch size: 256 | lm loss: 2.579791E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.611 | TFLOPs: 43.34 | +7: iteration 97160/ 115203 | consumed samples: 24872960 | consumed tokens: 50939822080 | elapsed time per iteration (s): 0.57 | learning rate: 3.089E-05 | global batch size: 256 | lm loss: 2.575870E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.680 | TFLOPs: 42.97 | +7: iteration 97170/ 115203 | consumed samples: 24875520 | consumed tokens: 50945064960 | elapsed time per iteration (s): 0.58 | learning rate: 3.088E-05 | global batch size: 256 | lm loss: 2.560878E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.989 | TFLOPs: 42.33 | +7: iteration 97180/ 115203 | consumed samples: 24878080 | consumed tokens: 50950307840 | elapsed time per iteration (s): 0.57 | learning rate: 3.087E-05 | global batch size: 256 | lm loss: 2.573319E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.236 | TFLOPs: 42.73 | +7: iteration 97190/ 115203 | consumed samples: 24880640 | consumed tokens: 50955550720 | elapsed time per iteration (s): 0.58 | learning rate: 3.085E-05 | global batch size: 256 | lm loss: 2.580436E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.044 | TFLOPs: 42.24 | +7: iteration 97200/ 115203 | consumed samples: 24883200 | consumed tokens: 50960793600 | elapsed time per iteration (s): 0.56 | learning rate: 3.084E-05 | global batch size: 256 | lm loss: 2.578468E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.320 | TFLOPs: 43.51 | +7: iteration 97210/ 115203 | consumed samples: 24885760 | consumed tokens: 50966036480 | elapsed time per iteration (s): 0.58 | learning rate: 3.083E-05 | global batch size: 256 | lm loss: 2.576159E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.471 | TFLOPs: 41.99 | +7: iteration 97220/ 115203 | consumed samples: 24888320 | consumed tokens: 50971279360 | elapsed time per iteration (s): 0.56 | learning rate: 3.082E-05 | global batch size: 256 | lm loss: 2.587544E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.022 | TFLOPs: 43.38 | +7: iteration 97230/ 115203 | consumed samples: 24890880 | consumed tokens: 50976522240 | elapsed time per iteration (s): 0.57 | learning rate: 3.081E-05 | global batch size: 256 | lm loss: 2.579255E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.778 | TFLOPs: 42.88 | +7: iteration 97240/ 115203 | consumed samples: 24893440 | consumed tokens: 50981765120 | elapsed time per iteration (s): 0.57 | learning rate: 3.080E-05 | global batch size: 256 | lm loss: 2.570093E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.195 | TFLOPs: 42.64 | +7: iteration 97250/ 115203 | consumed samples: 24896000 | consumed tokens: 50987008000 | elapsed time per iteration (s): 0.57 | learning rate: 3.078E-05 | global batch size: 256 | lm loss: 2.580322E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.838 | TFLOPs: 42.60 | +7: iteration 97260/ 115203 | consumed samples: 24898560 | consumed tokens: 50992250880 | elapsed time per iteration (s): 0.58 | learning rate: 3.077E-05 | global batch size: 256 | lm loss: 2.566976E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.196 | TFLOPs: 42.44 | +7: iteration 97270/ 115203 | consumed samples: 24901120 | consumed tokens: 50997493760 | elapsed time per iteration (s): 0.57 | learning rate: 3.076E-05 | global batch size: 256 | lm loss: 2.587213E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.372 | TFLOPs: 43.13 | +7: iteration 97280/ 115203 | consumed samples: 24903680 | consumed tokens: 51002736640 | elapsed time per iteration (s): 0.57 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 2.576229E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.392 | TFLOPs: 43.04 | +7: iteration 97290/ 115203 | consumed samples: 24906240 | consumed tokens: 51007979520 | elapsed time per iteration (s): 0.58 | learning rate: 3.074E-05 | global batch size: 256 | lm loss: 2.575105E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.052 | TFLOPs: 41.86 | +7: iteration 97300/ 115203 | consumed samples: 24908800 | consumed tokens: 51013222400 | elapsed time per iteration (s): 0.57 | learning rate: 3.072E-05 | global batch size: 256 | lm loss: 2.585553E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.726 | TFLOPs: 42.50 | +7: iteration 97310/ 115203 | consumed samples: 24911360 | consumed tokens: 51018465280 | elapsed time per iteration (s): 0.57 | learning rate: 3.071E-05 | global batch size: 256 | lm loss: 2.564126E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.053 | TFLOPs: 42.91 | +7: iteration 97320/ 115203 | consumed samples: 24913920 | consumed tokens: 51023708160 | elapsed time per iteration (s): 0.58 | learning rate: 3.070E-05 | global batch size: 256 | lm loss: 2.570649E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.886 | TFLOPs: 42.32 | +7: iteration 97330/ 115203 | consumed samples: 24916480 | consumed tokens: 51028951040 | elapsed time per iteration (s): 0.57 | learning rate: 3.069E-05 | global batch size: 256 | lm loss: 2.569577E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.227 | TFLOPs: 42.45 | +7: iteration 97340/ 115203 | consumed samples: 24919040 | consumed tokens: 51034193920 | elapsed time per iteration (s): 0.57 | learning rate: 3.068E-05 | global batch size: 256 | lm loss: 2.583850E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.963 | TFLOPs: 42.99 | +7: iteration 97350/ 115203 | consumed samples: 24921600 | consumed tokens: 51039436800 | elapsed time per iteration (s): 0.57 | learning rate: 3.067E-05 | global batch size: 256 | lm loss: 2.580364E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.187 | TFLOPs: 43.02 | +7: iteration 97360/ 115203 | consumed samples: 24924160 | consumed tokens: 51044679680 | elapsed time per iteration (s): 0.56 | learning rate: 3.065E-05 | global batch size: 256 | lm loss: 2.573570E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.899 | TFLOPs: 43.56 | +7: iteration 97370/ 115203 | consumed samples: 24926720 | consumed tokens: 51049922560 | elapsed time per iteration (s): 0.58 | learning rate: 3.064E-05 | global batch size: 256 | lm loss: 2.574179E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.712 | TFLOPs: 42.02 | +7: iteration 97380/ 115203 | consumed samples: 24929280 | consumed tokens: 51055165440 | elapsed time per iteration (s): 0.56 | learning rate: 3.063E-05 | global batch size: 256 | lm loss: 2.566631E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.472 | TFLOPs: 43.23 | +7: iteration 97390/ 115203 | consumed samples: 24931840 | consumed tokens: 51060408320 | elapsed time per iteration (s): 0.57 | learning rate: 3.062E-05 | global batch size: 256 | lm loss: 2.562933E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.478 | TFLOPs: 42.66 | +7: iteration 97400/ 115203 | consumed samples: 24934400 | consumed tokens: 51065651200 | elapsed time per iteration (s): 0.57 | learning rate: 3.061E-05 | global batch size: 256 | lm loss: 2.571280E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.559 | TFLOPs: 42.57 | +7: iteration 97410/ 115203 | consumed samples: 24936960 | consumed tokens: 51070894080 | elapsed time per iteration (s): 0.57 | learning rate: 3.060E-05 | global batch size: 256 | lm loss: 2.567397E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.001 | TFLOPs: 42.71 | +7: iteration 97420/ 115203 | consumed samples: 24939520 | consumed tokens: 51076136960 | elapsed time per iteration (s): 0.57 | learning rate: 3.058E-05 | global batch size: 256 | lm loss: 2.580443E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.300 | TFLOPs: 42.84 | +7: iteration 97430/ 115203 | consumed samples: 24942080 | consumed tokens: 51081379840 | elapsed time per iteration (s): 0.58 | learning rate: 3.057E-05 | global batch size: 256 | lm loss: 2.570378E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.034 | TFLOPs: 42.05 | +7: iteration 97440/ 115203 | consumed samples: 24944640 | consumed tokens: 51086622720 | elapsed time per iteration (s): 0.56 | learning rate: 3.056E-05 | global batch size: 256 | lm loss: 2.565873E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.136 | TFLOPs: 43.49 | +7: iteration 97450/ 115203 | consumed samples: 24947200 | consumed tokens: 51091865600 | elapsed time per iteration (s): 0.58 | learning rate: 3.055E-05 | global batch size: 256 | lm loss: 2.564416E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.270 | TFLOPs: 42.07 | +7: iteration 97460/ 115203 | consumed samples: 24949760 | consumed tokens: 51097108480 | elapsed time per iteration (s): 0.57 | learning rate: 3.054E-05 | global batch size: 256 | lm loss: 2.565121E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.595 | TFLOPs: 42.96 | +7: iteration 97470/ 115203 | consumed samples: 24952320 | consumed tokens: 51102351360 | elapsed time per iteration (s): 0.58 | learning rate: 3.053E-05 | global batch size: 256 | lm loss: 2.580426E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.065 | TFLOPs: 41.96 | +7: iteration 97480/ 115203 | consumed samples: 24954880 | consumed tokens: 51107594240 | elapsed time per iteration (s): 0.57 | learning rate: 3.051E-05 | global batch size: 256 | lm loss: 2.563376E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.064 | TFLOPs: 42.53 | +7: iteration 97490/ 115203 | consumed samples: 24957440 | consumed tokens: 51112837120 | elapsed time per iteration (s): 0.57 | learning rate: 3.050E-05 | global batch size: 256 | lm loss: 2.563234E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.917 | TFLOPs: 43.09 | +7: iteration 97500/ 115203 | consumed samples: 24960000 | consumed tokens: 51118080000 | elapsed time per iteration (s): 0.58 | learning rate: 3.049E-05 | global batch size: 256 | lm loss: 2.565204E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.058 | TFLOPs: 41.86 | +7: iteration 97510/ 115203 | consumed samples: 24962560 | consumed tokens: 51123322880 | elapsed time per iteration (s): 0.58 | learning rate: 3.048E-05 | global batch size: 256 | lm loss: 2.562102E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.908 | TFLOPs: 42.32 | +7: iteration 97520/ 115203 | consumed samples: 24965120 | consumed tokens: 51128565760 | elapsed time per iteration (s): 0.56 | learning rate: 3.047E-05 | global batch size: 256 | lm loss: 2.575401E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.421 | TFLOPs: 43.51 | +7: iteration 97530/ 115203 | consumed samples: 24967680 | consumed tokens: 51133808640 | elapsed time per iteration (s): 0.57 | learning rate: 3.046E-05 | global batch size: 256 | lm loss: 2.578591E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.310 | TFLOPs: 42.93 | +7: iteration 97540/ 115203 | consumed samples: 24970240 | consumed tokens: 51139051520 | elapsed time per iteration (s): 0.57 | learning rate: 3.044E-05 | global batch size: 256 | lm loss: 2.572651E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.341 | TFLOPs: 43.03 | +7: iteration 97550/ 115203 | consumed samples: 24972800 | consumed tokens: 51144294400 | elapsed time per iteration (s): 0.57 | learning rate: 3.043E-05 | global batch size: 256 | lm loss: 2.578451E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.835 | TFLOPs: 42.51 | +7: iteration 97560/ 115203 | consumed samples: 24975360 | consumed tokens: 51149537280 | elapsed time per iteration (s): 0.56 | learning rate: 3.042E-05 | global batch size: 256 | lm loss: 2.592669E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.921 | TFLOPs: 43.37 | +7: iteration 97570/ 115203 | consumed samples: 24977920 | consumed tokens: 51154780160 | elapsed time per iteration (s): 0.59 | learning rate: 3.041E-05 | global batch size: 256 | lm loss: 2.573160E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.908 | TFLOPs: 41.56 | +7: iteration 97580/ 115203 | consumed samples: 24980480 | consumed tokens: 51160023040 | elapsed time per iteration (s): 0.57 | learning rate: 3.040E-05 | global batch size: 256 | lm loss: 2.583187E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.427 | TFLOPs: 42.56 | +7: iteration 97590/ 115203 | consumed samples: 24983040 | consumed tokens: 51165265920 | elapsed time per iteration (s): 0.56 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 2.570596E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.125 | TFLOPs: 43.39 | +7: iteration 97600/ 115203 | consumed samples: 24985600 | consumed tokens: 51170508800 | elapsed time per iteration (s): 0.58 | learning rate: 3.038E-05 | global batch size: 256 | lm loss: 2.571676E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.728 | TFLOPs: 41.73 | +7: iteration 97610/ 115203 | consumed samples: 24988160 | consumed tokens: 51175751680 | elapsed time per iteration (s): 0.58 | learning rate: 3.036E-05 | global batch size: 256 | lm loss: 2.573007E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.481 | TFLOPs: 42.00 | +7: iteration 97620/ 115203 | consumed samples: 24990720 | consumed tokens: 51180994560 | elapsed time per iteration (s): 0.59 | learning rate: 3.035E-05 | global batch size: 256 | lm loss: 2.577088E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.131 | TFLOPs: 41.48 | +7: iteration 97630/ 115203 | consumed samples: 24993280 | consumed tokens: 51186237440 | elapsed time per iteration (s): 0.58 | learning rate: 3.034E-05 | global batch size: 256 | lm loss: 2.590378E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.297 | TFLOPs: 42.36 | +7: iteration 97640/ 115203 | consumed samples: 24995840 | consumed tokens: 51191480320 | elapsed time per iteration (s): 0.56 | learning rate: 3.033E-05 | global batch size: 256 | lm loss: 2.578367E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.185 | TFLOPs: 43.21 | +7: iteration 97650/ 115203 | consumed samples: 24998400 | consumed tokens: 51196723200 | elapsed time per iteration (s): 0.58 | learning rate: 3.032E-05 | global batch size: 256 | lm loss: 2.576188E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.208 | TFLOPs: 42.45 | +7: iteration 97660/ 115203 | consumed samples: 25000960 | consumed tokens: 51201966080 | elapsed time per iteration (s): 0.56 | learning rate: 3.031E-05 | global batch size: 256 | lm loss: 2.563459E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.929 | TFLOPs: 43.56 | +7: iteration 97670/ 115203 | consumed samples: 25003520 | consumed tokens: 51207208960 | elapsed time per iteration (s): 0.57 | learning rate: 3.029E-05 | global batch size: 256 | lm loss: 2.572615E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.429 | TFLOPs: 43.04 | +7: iteration 97680/ 115203 | consumed samples: 25006080 | consumed tokens: 51212451840 | elapsed time per iteration (s): 0.56 | learning rate: 3.028E-05 | global batch size: 256 | lm loss: 2.576076E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.254 | TFLOPs: 43.31 | +7: iteration 97690/ 115203 | consumed samples: 25008640 | consumed tokens: 51217694720 | elapsed time per iteration (s): 0.56 | learning rate: 3.027E-05 | global batch size: 256 | lm loss: 2.583938E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.939 | TFLOPs: 43.37 | +7: iteration 97700/ 115203 | consumed samples: 25011200 | consumed tokens: 51222937600 | elapsed time per iteration (s): 0.55 | learning rate: 3.026E-05 | global batch size: 256 | lm loss: 2.577246E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 97710/ 115203 | consumed samples: 25013760 | consumed tokens: 51228180480 | elapsed time per iteration (s): 0.58 | learning rate: 3.025E-05 | global batch size: 256 | lm loss: 2.575456E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.807 | TFLOPs: 41.93 | +7: iteration 97720/ 115203 | consumed samples: 25016320 | consumed tokens: 51233423360 | elapsed time per iteration (s): 0.56 | learning rate: 3.024E-05 | global batch size: 256 | lm loss: 2.568274E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.074 | TFLOPs: 43.29 | +7: iteration 97730/ 115203 | consumed samples: 25018880 | consumed tokens: 51238666240 | elapsed time per iteration (s): 0.56 | learning rate: 3.023E-05 | global batch size: 256 | lm loss: 2.573277E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.170 | TFLOPs: 43.97 | +7: iteration 97740/ 115203 | consumed samples: 25021440 | consumed tokens: 51243909120 | elapsed time per iteration (s): 0.57 | learning rate: 3.021E-05 | global batch size: 256 | lm loss: 2.576364E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.249 | TFLOPs: 43.02 | +7: iteration 97750/ 115203 | consumed samples: 25024000 | consumed tokens: 51249152000 | elapsed time per iteration (s): 0.56 | learning rate: 3.020E-05 | global batch size: 256 | lm loss: 2.571328E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.371 | TFLOPs: 43.41 | +7: iteration 97760/ 115203 | consumed samples: 25026560 | consumed tokens: 51254394880 | elapsed time per iteration (s): 0.56 | learning rate: 3.019E-05 | global batch size: 256 | lm loss: 2.586844E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.209 | TFLOPs: 43.30 | +7: iteration 97770/ 115203 | consumed samples: 25029120 | consumed tokens: 51259637760 | elapsed time per iteration (s): 0.56 | learning rate: 3.018E-05 | global batch size: 256 | lm loss: 2.576318E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.104 | TFLOPs: 43.39 | +7: iteration 97780/ 115203 | consumed samples: 25031680 | consumed tokens: 51264880640 | elapsed time per iteration (s): 0.57 | learning rate: 3.017E-05 | global batch size: 256 | lm loss: 2.563806E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.590 | TFLOPs: 43.05 | +7: iteration 97790/ 115203 | consumed samples: 25034240 | consumed tokens: 51270123520 | elapsed time per iteration (s): 0.56 | learning rate: 3.016E-05 | global batch size: 256 | lm loss: 2.584423E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.549 | TFLOPs: 43.62 | +7: iteration 97800/ 115203 | consumed samples: 25036800 | consumed tokens: 51275366400 | elapsed time per iteration (s): 0.58 | learning rate: 3.015E-05 | global batch size: 256 | lm loss: 2.573156E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.992 | TFLOPs: 42.23 | +7: iteration 97810/ 115203 | consumed samples: 25039360 | consumed tokens: 51280609280 | elapsed time per iteration (s): 0.57 | learning rate: 3.013E-05 | global batch size: 256 | lm loss: 2.580218E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.038 | TFLOPs: 43.19 | +7: iteration 97820/ 115203 | consumed samples: 25041920 | consumed tokens: 51285852160 | elapsed time per iteration (s): 0.58 | learning rate: 3.012E-05 | global batch size: 256 | lm loss: 2.581837E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.933 | TFLOPs: 42.42 | +7: iteration 97830/ 115203 | consumed samples: 25044480 | consumed tokens: 51291095040 | elapsed time per iteration (s): 0.58 | learning rate: 3.011E-05 | global batch size: 256 | lm loss: 2.583306E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.436 | TFLOPs: 42.28 | +7: iteration 97840/ 115203 | consumed samples: 25047040 | consumed tokens: 51296337920 | elapsed time per iteration (s): 0.57 | learning rate: 3.010E-05 | global batch size: 256 | lm loss: 2.583916E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.429 | TFLOPs: 42.85 | +7: iteration 97850/ 115203 | consumed samples: 25049600 | consumed tokens: 51301580800 | elapsed time per iteration (s): 0.57 | learning rate: 3.009E-05 | global batch size: 256 | lm loss: 2.584466E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.726 | TFLOPs: 43.07 | +7: iteration 97860/ 115203 | consumed samples: 25052160 | consumed tokens: 51306823680 | elapsed time per iteration (s): 0.57 | learning rate: 3.008E-05 | global batch size: 256 | lm loss: 2.571453E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.857 | TFLOPs: 43.08 | +7: iteration 97870/ 115203 | consumed samples: 25054720 | consumed tokens: 51312066560 | elapsed time per iteration (s): 0.57 | learning rate: 3.007E-05 | global batch size: 256 | lm loss: 2.579878E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.823 | TFLOPs: 43.17 | +7: iteration 97880/ 115203 | consumed samples: 25057280 | consumed tokens: 51317309440 | elapsed time per iteration (s): 0.58 | learning rate: 3.005E-05 | global batch size: 256 | lm loss: 2.578561E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.883 | TFLOPs: 42.13 | +7: iteration 97890/ 115203 | consumed samples: 25059840 | consumed tokens: 51322552320 | elapsed time per iteration (s): 0.57 | learning rate: 3.004E-05 | global batch size: 256 | lm loss: 2.575533E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.391 | TFLOPs: 43.13 | +7: iteration 97900/ 115203 | consumed samples: 25062400 | consumed tokens: 51327795200 | elapsed time per iteration (s): 0.57 | learning rate: 3.003E-05 | global batch size: 256 | lm loss: 2.577774E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.987 | TFLOPs: 43.19 | +7: iteration 97910/ 115203 | consumed samples: 25064960 | consumed tokens: 51333038080 | elapsed time per iteration (s): 0.57 | learning rate: 3.002E-05 | global batch size: 256 | lm loss: 2.567703E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.719 | TFLOPs: 42.88 | +7: iteration 97920/ 115203 | consumed samples: 25067520 | consumed tokens: 51338280960 | elapsed time per iteration (s): 0.56 | learning rate: 3.001E-05 | global batch size: 256 | lm loss: 2.566234E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.313 | TFLOPs: 43.31 | +7: iteration 97930/ 115203 | consumed samples: 25070080 | consumed tokens: 51343523840 | elapsed time per iteration (s): 0.58 | learning rate: 3.000E-05 | global batch size: 256 | lm loss: 2.564915E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.372 | TFLOPs: 42.37 | +7: iteration 97940/ 115203 | consumed samples: 25072640 | consumed tokens: 51348766720 | elapsed time per iteration (s): 0.55 | learning rate: 2.999E-05 | global batch size: 256 | lm loss: 2.577348E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.504 | TFLOPs: 44.00 | +7: iteration 97950/ 115203 | consumed samples: 25075200 | consumed tokens: 51354009600 | elapsed time per iteration (s): 0.57 | learning rate: 2.997E-05 | global batch size: 256 | lm loss: 2.578656E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.668 | TFLOPs: 43.06 | +7: iteration 97960/ 115203 | consumed samples: 25077760 | consumed tokens: 51359252480 | elapsed time per iteration (s): 0.56 | learning rate: 2.996E-05 | global batch size: 256 | lm loss: 2.577812E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.337 | TFLOPs: 43.32 | +7: iteration 97970/ 115203 | consumed samples: 25080320 | consumed tokens: 51364495360 | elapsed time per iteration (s): 0.56 | learning rate: 2.995E-05 | global batch size: 256 | lm loss: 2.566249E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.751 | TFLOPs: 43.26 | +7: iteration 97980/ 115203 | consumed samples: 25082880 | consumed tokens: 51369738240 | elapsed time per iteration (s): 0.58 | learning rate: 2.994E-05 | global batch size: 256 | lm loss: 2.565837E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.028 | TFLOPs: 42.05 | +7: iteration 97990/ 115203 | consumed samples: 25085440 | consumed tokens: 51374981120 | elapsed time per iteration (s): 0.58 | learning rate: 2.993E-05 | global batch size: 256 | lm loss: 2.566555E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.572 | TFLOPs: 42.29 | +0: [2023-03-17 04:12:35,882] [INFO] [logging.py:68:log_dist] [Rank 0] step=98000, skipped=0, lr=[2.9917836598254863e-05, 2.9917836598254863e-05, 2.9917836598254863e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 98000/ 115203 | consumed samples: 25088000 | consumed tokens: 51380224000 | elapsed time per iteration (s): 0.55 | learning rate: 2.992E-05 | global batch size: 256 | lm loss: 2.574827E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.497 | TFLOPs: 44.00 | +0: steps: 98000 loss: 2.6225 iter time (s): 0.567 samples/sec: 451.292 +7: iteration 98010/ 115203 | consumed samples: 25090560 | consumed tokens: 51385466880 | elapsed time per iteration (s): 0.58 | learning rate: 2.991E-05 | global batch size: 256 | lm loss: 2.568494E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.887 | TFLOPs: 42.32 | +7: iteration 98020/ 115203 | consumed samples: 25093120 | consumed tokens: 51390709760 | elapsed time per iteration (s): 0.56 | learning rate: 2.990E-05 | global batch size: 256 | lm loss: 2.581658E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.722 | TFLOPs: 43.54 | +7: iteration 98030/ 115203 | consumed samples: 25095680 | consumed tokens: 51395952640 | elapsed time per iteration (s): 0.56 | learning rate: 2.988E-05 | global batch size: 256 | lm loss: 2.564589E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.151 | TFLOPs: 43.68 | +7: iteration 98040/ 115203 | consumed samples: 25098240 | consumed tokens: 51401195520 | elapsed time per iteration (s): 0.57 | learning rate: 2.987E-05 | global batch size: 256 | lm loss: 2.589954E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.065 | TFLOPs: 43.00 | +7: iteration 98050/ 115203 | consumed samples: 25100800 | consumed tokens: 51406438400 | elapsed time per iteration (s): 0.56 | learning rate: 2.986E-05 | global batch size: 256 | lm loss: 2.582204E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.782 | TFLOPs: 43.45 | +7: iteration 98060/ 115203 | consumed samples: 25103360 | consumed tokens: 51411681280 | elapsed time per iteration (s): 0.56 | learning rate: 2.985E-05 | global batch size: 256 | lm loss: 2.573981E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.635 | TFLOPs: 43.82 | +7: iteration 98070/ 115203 | consumed samples: 25105920 | consumed tokens: 51416924160 | elapsed time per iteration (s): 0.56 | learning rate: 2.984E-05 | global batch size: 256 | lm loss: 2.565692E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.489 | TFLOPs: 43.24 | +7: iteration 98080/ 115203 | consumed samples: 25108480 | consumed tokens: 51422167040 | elapsed time per iteration (s): 0.56 | learning rate: 2.983E-05 | global batch size: 256 | lm loss: 2.564750E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.087 | TFLOPs: 43.48 | +7: iteration 98090/ 115203 | consumed samples: 25111040 | consumed tokens: 51427409920 | elapsed time per iteration (s): 0.56 | learning rate: 2.982E-05 | global batch size: 256 | lm loss: 2.556944E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.414 | TFLOPs: 43.23 | +7: iteration 98100/ 115203 | consumed samples: 25113600 | consumed tokens: 51432652800 | elapsed time per iteration (s): 0.57 | learning rate: 2.981E-05 | global batch size: 256 | lm loss: 2.571921E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.042 | TFLOPs: 42.72 | +7: iteration 98110/ 115203 | consumed samples: 25116160 | consumed tokens: 51437895680 | elapsed time per iteration (s): 0.56 | learning rate: 2.979E-05 | global batch size: 256 | lm loss: 2.572696E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.684 | TFLOPs: 43.83 | +7: iteration 98120/ 115203 | consumed samples: 25118720 | consumed tokens: 51443138560 | elapsed time per iteration (s): 0.56 | learning rate: 2.978E-05 | global batch size: 256 | lm loss: 2.583978E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.075 | TFLOPs: 43.29 | +7: iteration 98130/ 115203 | consumed samples: 25121280 | consumed tokens: 51448381440 | elapsed time per iteration (s): 0.57 | learning rate: 2.977E-05 | global batch size: 256 | lm loss: 2.585445E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.167 | TFLOPs: 43.01 | +7: iteration 98140/ 115203 | consumed samples: 25123840 | consumed tokens: 51453624320 | elapsed time per iteration (s): 0.59 | learning rate: 2.976E-05 | global batch size: 256 | lm loss: 2.577343E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.606 | TFLOPs: 41.72 | +7: iteration 98150/ 115203 | consumed samples: 25126400 | consumed tokens: 51458867200 | elapsed time per iteration (s): 0.57 | learning rate: 2.975E-05 | global batch size: 256 | lm loss: 2.587384E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.775 | TFLOPs: 42.79 | +7: iteration 98160/ 115203 | consumed samples: 25128960 | consumed tokens: 51464110080 | elapsed time per iteration (s): 0.56 | learning rate: 2.974E-05 | global batch size: 256 | lm loss: 2.580439E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.255 | TFLOPs: 43.69 | +7: iteration 98170/ 115203 | consumed samples: 25131520 | consumed tokens: 51469352960 | elapsed time per iteration (s): 0.56 | learning rate: 2.973E-05 | global batch size: 256 | lm loss: 2.569030E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.078 | TFLOPs: 43.48 | +7: iteration 98180/ 115203 | consumed samples: 25134080 | consumed tokens: 51474595840 | elapsed time per iteration (s): 0.56 | learning rate: 2.972E-05 | global batch size: 256 | lm loss: 2.565622E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.845 | TFLOPs: 43.46 | +7: iteration 98190/ 115203 | consumed samples: 25136640 | consumed tokens: 51479838720 | elapsed time per iteration (s): 0.56 | learning rate: 2.970E-05 | global batch size: 256 | lm loss: 2.577811E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.153 | TFLOPs: 43.97 | +7: iteration 98200/ 115203 | consumed samples: 25139200 | consumed tokens: 51485081600 | elapsed time per iteration (s): 0.56 | learning rate: 2.969E-05 | global batch size: 256 | lm loss: 2.574682E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.907 | TFLOPs: 43.37 | +7: iteration 98210/ 115203 | consumed samples: 25141760 | consumed tokens: 51490324480 | elapsed time per iteration (s): 0.56 | learning rate: 2.968E-05 | global batch size: 256 | lm loss: 2.572181E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.176 | TFLOPs: 43.97 | +7: iteration 98220/ 115203 | consumed samples: 25144320 | consumed tokens: 51495567360 | elapsed time per iteration (s): 0.57 | learning rate: 2.967E-05 | global batch size: 256 | lm loss: 2.574739E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.101 | TFLOPs: 42.63 | +7: iteration 98230/ 115203 | consumed samples: 25146880 | consumed tokens: 51500810240 | elapsed time per iteration (s): 0.56 | learning rate: 2.966E-05 | global batch size: 256 | lm loss: 2.579212E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.104 | TFLOPs: 43.48 | +7: iteration 98240/ 115203 | consumed samples: 25149440 | consumed tokens: 51506053120 | elapsed time per iteration (s): 0.57 | learning rate: 2.965E-05 | global batch size: 256 | lm loss: 2.590131E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.317 | TFLOPs: 42.84 | +7: iteration 98250/ 115203 | consumed samples: 25152000 | consumed tokens: 51511296000 | elapsed time per iteration (s): 0.57 | learning rate: 2.964E-05 | global batch size: 256 | lm loss: 2.570995E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.494 | TFLOPs: 42.95 | +7: iteration 98260/ 115203 | consumed samples: 25154560 | consumed tokens: 51516538880 | elapsed time per iteration (s): 0.56 | learning rate: 2.963E-05 | global batch size: 256 | lm loss: 2.584175E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.537 | TFLOPs: 43.24 | +7: iteration 98270/ 115203 | consumed samples: 25157120 | consumed tokens: 51521781760 | elapsed time per iteration (s): 0.55 | learning rate: 2.961E-05 | global batch size: 256 | lm loss: 2.567843E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.262 | TFLOPs: 43.98 | +7: iteration 98280/ 115203 | consumed samples: 25159680 | consumed tokens: 51527024640 | elapsed time per iteration (s): 0.56 | learning rate: 2.960E-05 | global batch size: 256 | lm loss: 2.588418E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.546 | TFLOPs: 43.53 | +7: iteration 98290/ 115203 | consumed samples: 25162240 | consumed tokens: 51532267520 | elapsed time per iteration (s): 0.56 | learning rate: 2.959E-05 | global batch size: 256 | lm loss: 2.569864E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.324 | TFLOPs: 43.41 | +7: iteration 98300/ 115203 | consumed samples: 25164800 | consumed tokens: 51537510400 | elapsed time per iteration (s): 0.57 | learning rate: 2.958E-05 | global batch size: 256 | lm loss: 2.576474E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.412 | TFLOPs: 42.47 | +7: iteration 98310/ 115203 | consumed samples: 25167360 | consumed tokens: 51542753280 | elapsed time per iteration (s): 0.57 | learning rate: 2.957E-05 | global batch size: 256 | lm loss: 2.575098E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.569 | TFLOPs: 43.05 | +7: iteration 98320/ 115203 | consumed samples: 25169920 | consumed tokens: 51547996160 | elapsed time per iteration (s): 0.57 | learning rate: 2.956E-05 | global batch size: 256 | lm loss: 2.569593E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.385 | TFLOPs: 42.75 | +7: iteration 98330/ 115203 | consumed samples: 25172480 | consumed tokens: 51553239040 | elapsed time per iteration (s): 0.57 | learning rate: 2.955E-05 | global batch size: 256 | lm loss: 2.573935E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.188 | TFLOPs: 43.11 | +7: iteration 98340/ 115203 | consumed samples: 25175040 | consumed tokens: 51558481920 | elapsed time per iteration (s): 0.57 | learning rate: 2.954E-05 | global batch size: 256 | lm loss: 2.567985E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.998 | TFLOPs: 42.71 | +7: iteration 98350/ 115203 | consumed samples: 25177600 | consumed tokens: 51563724800 | elapsed time per iteration (s): 0.55 | learning rate: 2.953E-05 | global batch size: 256 | lm loss: 2.571869E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.550 | TFLOPs: 44.00 | +7: iteration 98360/ 115203 | consumed samples: 25180160 | consumed tokens: 51568967680 | elapsed time per iteration (s): 0.56 | learning rate: 2.951E-05 | global batch size: 256 | lm loss: 2.576296E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.132 | TFLOPs: 43.49 | +7: iteration 98370/ 115203 | consumed samples: 25182720 | consumed tokens: 51574210560 | elapsed time per iteration (s): 0.56 | learning rate: 2.950E-05 | global batch size: 256 | lm loss: 2.588072E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.702 | TFLOPs: 43.45 | +7: iteration 98380/ 115203 | consumed samples: 25185280 | consumed tokens: 51579453440 | elapsed time per iteration (s): 0.56 | learning rate: 2.949E-05 | global batch size: 256 | lm loss: 2.564502E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.075 | TFLOPs: 43.86 | +7: iteration 98390/ 115203 | consumed samples: 25187840 | consumed tokens: 51584696320 | elapsed time per iteration (s): 0.56 | learning rate: 2.948E-05 | global batch size: 256 | lm loss: 2.571561E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.024 | TFLOPs: 43.48 | +7: iteration 98400/ 115203 | consumed samples: 25190400 | consumed tokens: 51589939200 | elapsed time per iteration (s): 0.55 | learning rate: 2.947E-05 | global batch size: 256 | lm loss: 2.554253E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 98410/ 115203 | consumed samples: 25192960 | consumed tokens: 51595182080 | elapsed time per iteration (s): 0.56 | learning rate: 2.946E-05 | global batch size: 256 | lm loss: 2.583667E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.945 | TFLOPs: 43.47 | +7: iteration 98420/ 115203 | consumed samples: 25195520 | consumed tokens: 51600424960 | elapsed time per iteration (s): 0.56 | learning rate: 2.945E-05 | global batch size: 256 | lm loss: 2.565283E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.286 | TFLOPs: 43.50 | +7: iteration 98430/ 115203 | consumed samples: 25198080 | consumed tokens: 51605667840 | elapsed time per iteration (s): 0.56 | learning rate: 2.944E-05 | global batch size: 256 | lm loss: 2.578587E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.048 | TFLOPs: 43.57 | +7: iteration 98440/ 115203 | consumed samples: 25200640 | consumed tokens: 51610910720 | elapsed time per iteration (s): 0.57 | learning rate: 2.943E-05 | global batch size: 256 | lm loss: 2.575578E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.444 | TFLOPs: 42.94 | +7: iteration 98450/ 115203 | consumed samples: 25203200 | consumed tokens: 51616153600 | elapsed time per iteration (s): 0.56 | learning rate: 2.941E-05 | global batch size: 256 | lm loss: 2.585018E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.010 | TFLOPs: 43.57 | +7: iteration 98460/ 115203 | consumed samples: 25205760 | consumed tokens: 51621396480 | elapsed time per iteration (s): 0.56 | learning rate: 2.940E-05 | global batch size: 256 | lm loss: 2.575741E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.750 | TFLOPs: 43.45 | +7: iteration 98470/ 115203 | consumed samples: 25208320 | consumed tokens: 51626639360 | elapsed time per iteration (s): 0.56 | learning rate: 2.939E-05 | global batch size: 256 | lm loss: 2.573199E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.972 | TFLOPs: 43.85 | +7: iteration 98480/ 115203 | consumed samples: 25210880 | consumed tokens: 51631882240 | elapsed time per iteration (s): 0.56 | learning rate: 2.938E-05 | global batch size: 256 | lm loss: 2.560213E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.052 | TFLOPs: 43.57 | +7: iteration 98490/ 115203 | consumed samples: 25213440 | consumed tokens: 51637125120 | elapsed time per iteration (s): 0.56 | learning rate: 2.937E-05 | global batch size: 256 | lm loss: 2.578896E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.402 | TFLOPs: 43.70 | +7: iteration 98500/ 115203 | consumed samples: 25216000 | consumed tokens: 51642368000 | elapsed time per iteration (s): 0.57 | learning rate: 2.936E-05 | global batch size: 256 | lm loss: 2.573583E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.713 | TFLOPs: 42.88 | +7: iteration 98510/ 115203 | consumed samples: 25218560 | consumed tokens: 51647610880 | elapsed time per iteration (s): 0.57 | learning rate: 2.935E-05 | global batch size: 256 | lm loss: 2.587286E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.227 | TFLOPs: 43.02 | +7: iteration 98520/ 115203 | consumed samples: 25221120 | consumed tokens: 51652853760 | elapsed time per iteration (s): 0.56 | learning rate: 2.934E-05 | global batch size: 256 | lm loss: 2.577259E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.839 | TFLOPs: 43.46 | +7: iteration 98530/ 115203 | consumed samples: 25223680 | consumed tokens: 51658096640 | elapsed time per iteration (s): 0.56 | learning rate: 2.933E-05 | global batch size: 256 | lm loss: 2.573272E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.172 | TFLOPs: 43.49 | +7: iteration 98540/ 115203 | consumed samples: 25226240 | consumed tokens: 51663339520 | elapsed time per iteration (s): 0.56 | learning rate: 2.932E-05 | global batch size: 256 | lm loss: 2.560012E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.187 | TFLOPs: 43.68 | +7: iteration 98550/ 115203 | consumed samples: 25228800 | consumed tokens: 51668582400 | elapsed time per iteration (s): 0.56 | learning rate: 2.930E-05 | global batch size: 256 | lm loss: 2.573376E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.993 | TFLOPs: 43.47 | +7: iteration 98560/ 115203 | consumed samples: 25231360 | consumed tokens: 51673825280 | elapsed time per iteration (s): 0.56 | learning rate: 2.929E-05 | global batch size: 256 | lm loss: 2.573743E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.211 | TFLOPs: 43.97 | +7: iteration 98570/ 115203 | consumed samples: 25233920 | consumed tokens: 51679068160 | elapsed time per iteration (s): 0.56 | learning rate: 2.928E-05 | global batch size: 256 | lm loss: 2.588638E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.389 | TFLOPs: 43.42 | +7: iteration 98580/ 115203 | consumed samples: 25236480 | consumed tokens: 51684311040 | elapsed time per iteration (s): 0.56 | learning rate: 2.927E-05 | global batch size: 256 | lm loss: 2.564297E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.215 | TFLOPs: 43.97 | +7: iteration 98590/ 115203 | consumed samples: 25239040 | consumed tokens: 51689553920 | elapsed time per iteration (s): 0.57 | learning rate: 2.926E-05 | global batch size: 256 | lm loss: 2.563860E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.276 | TFLOPs: 42.83 | +7: iteration 98600/ 115203 | consumed samples: 25241600 | consumed tokens: 51694796800 | elapsed time per iteration (s): 0.57 | learning rate: 2.925E-05 | global batch size: 256 | lm loss: 2.574985E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.967 | TFLOPs: 43.19 | +7: iteration 98610/ 115203 | consumed samples: 25244160 | consumed tokens: 51700039680 | elapsed time per iteration (s): 0.56 | learning rate: 2.924E-05 | global batch size: 256 | lm loss: 2.573442E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.494 | TFLOPs: 43.43 | +7: iteration 98620/ 115203 | consumed samples: 25246720 | consumed tokens: 51705282560 | elapsed time per iteration (s): 0.57 | learning rate: 2.923E-05 | global batch size: 256 | lm loss: 2.574981E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.975 | TFLOPs: 42.52 | +7: iteration 98630/ 115203 | consumed samples: 25249280 | consumed tokens: 51710525440 | elapsed time per iteration (s): 0.57 | learning rate: 2.922E-05 | global batch size: 256 | lm loss: 2.565771E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.543 | TFLOPs: 43.05 | +7: iteration 98640/ 115203 | consumed samples: 25251840 | consumed tokens: 51715768320 | elapsed time per iteration (s): 0.56 | learning rate: 2.921E-05 | global batch size: 256 | lm loss: 2.563527E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.709 | TFLOPs: 43.73 | +7: iteration 98650/ 115203 | consumed samples: 25254400 | consumed tokens: 51721011200 | elapsed time per iteration (s): 0.57 | learning rate: 2.920E-05 | global batch size: 256 | lm loss: 2.577895E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.907 | TFLOPs: 42.99 | +7: iteration 98660/ 115203 | consumed samples: 25256960 | consumed tokens: 51726254080 | elapsed time per iteration (s): 0.57 | learning rate: 2.918E-05 | global batch size: 256 | lm loss: 2.566832E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.042 | TFLOPs: 43.19 | +7: iteration 98670/ 115203 | consumed samples: 25259520 | consumed tokens: 51731496960 | elapsed time per iteration (s): 0.56 | learning rate: 2.917E-05 | global batch size: 256 | lm loss: 2.575572E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.809 | TFLOPs: 43.55 | +7: iteration 98680/ 115203 | consumed samples: 25262080 | consumed tokens: 51736739840 | elapsed time per iteration (s): 0.57 | learning rate: 2.916E-05 | global batch size: 256 | lm loss: 2.572006E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.233 | TFLOPs: 42.83 | +7: iteration 98690/ 115203 | consumed samples: 25264640 | consumed tokens: 51741982720 | elapsed time per iteration (s): 0.56 | learning rate: 2.915E-05 | global batch size: 256 | lm loss: 2.572561E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.188 | TFLOPs: 43.40 | +7: iteration 98700/ 115203 | consumed samples: 25267200 | consumed tokens: 51747225600 | elapsed time per iteration (s): 0.59 | learning rate: 2.914E-05 | global batch size: 256 | lm loss: 2.556765E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.902 | TFLOPs: 41.18 | +7: iteration 98710/ 115203 | consumed samples: 25269760 | consumed tokens: 51752468480 | elapsed time per iteration (s): 0.59 | learning rate: 2.913E-05 | global batch size: 256 | lm loss: 2.581137E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.402 | TFLOPs: 41.42 | +7: iteration 98720/ 115203 | consumed samples: 25272320 | consumed tokens: 51757711360 | elapsed time per iteration (s): 0.58 | learning rate: 2.912E-05 | global batch size: 256 | lm loss: 2.566799E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.665 | TFLOPs: 41.82 | +7: iteration 98730/ 115203 | consumed samples: 25274880 | consumed tokens: 51762954240 | elapsed time per iteration (s): 0.58 | learning rate: 2.911E-05 | global batch size: 256 | lm loss: 2.584139E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.281 | TFLOPs: 41.79 | +7: iteration 98740/ 115203 | consumed samples: 25277440 | consumed tokens: 51768197120 | elapsed time per iteration (s): 0.57 | learning rate: 2.910E-05 | global batch size: 256 | lm loss: 2.576145E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.697 | TFLOPs: 42.87 | +7: iteration 98750/ 115203 | consumed samples: 25280000 | consumed tokens: 51773440000 | elapsed time per iteration (s): 0.57 | learning rate: 2.909E-05 | global batch size: 256 | lm loss: 2.569709E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.177 | TFLOPs: 43.11 | +7: iteration 98760/ 115203 | consumed samples: 25282560 | consumed tokens: 51778682880 | elapsed time per iteration (s): 0.57 | learning rate: 2.908E-05 | global batch size: 256 | lm loss: 2.569128E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.828 | TFLOPs: 43.08 | +7: iteration 98770/ 115203 | consumed samples: 25285120 | consumed tokens: 51783925760 | elapsed time per iteration (s): 0.56 | learning rate: 2.906E-05 | global batch size: 256 | lm loss: 2.565742E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.023 | TFLOPs: 43.38 | +7: iteration 98780/ 115203 | consumed samples: 25287680 | consumed tokens: 51789168640 | elapsed time per iteration (s): 0.58 | learning rate: 2.905E-05 | global batch size: 256 | lm loss: 2.576312E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.319 | TFLOPs: 41.98 | +7: iteration 98790/ 115203 | consumed samples: 25290240 | consumed tokens: 51794411520 | elapsed time per iteration (s): 0.57 | learning rate: 2.904E-05 | global batch size: 256 | lm loss: 2.561563E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.036 | TFLOPs: 43.10 | +7: iteration 98800/ 115203 | consumed samples: 25292800 | consumed tokens: 51799654400 | elapsed time per iteration (s): 0.56 | learning rate: 2.903E-05 | global batch size: 256 | lm loss: 2.576657E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.376 | TFLOPs: 43.42 | +7: iteration 98810/ 115203 | consumed samples: 25295360 | consumed tokens: 51804897280 | elapsed time per iteration (s): 0.57 | learning rate: 2.902E-05 | global batch size: 256 | lm loss: 2.569211E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.346 | TFLOPs: 42.84 | +7: iteration 98820/ 115203 | consumed samples: 25297920 | consumed tokens: 51810140160 | elapsed time per iteration (s): 0.57 | learning rate: 2.901E-05 | global batch size: 256 | lm loss: 2.562708E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.445 | TFLOPs: 43.14 | +7: iteration 98830/ 115203 | consumed samples: 25300480 | consumed tokens: 51815383040 | elapsed time per iteration (s): 0.57 | learning rate: 2.900E-05 | global batch size: 256 | lm loss: 2.570834E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.293 | TFLOPs: 42.55 | +7: iteration 98840/ 115203 | consumed samples: 25303040 | consumed tokens: 51820625920 | elapsed time per iteration (s): 0.57 | learning rate: 2.899E-05 | global batch size: 256 | lm loss: 2.579615E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.183 | TFLOPs: 43.11 | +7: iteration 98850/ 115203 | consumed samples: 25305600 | consumed tokens: 51825868800 | elapsed time per iteration (s): 0.56 | learning rate: 2.898E-05 | global batch size: 256 | lm loss: 2.572482E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.944 | TFLOPs: 43.95 | +7: iteration 98860/ 115203 | consumed samples: 25308160 | consumed tokens: 51831111680 | elapsed time per iteration (s): 0.55 | learning rate: 2.897E-05 | global batch size: 256 | lm loss: 2.582827E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.354 | TFLOPs: 43.99 | +7: iteration 98870/ 115203 | consumed samples: 25310720 | consumed tokens: 51836354560 | elapsed time per iteration (s): 0.57 | learning rate: 2.896E-05 | global batch size: 256 | lm loss: 2.574553E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.437 | TFLOPs: 43.14 | +7: iteration 98880/ 115203 | consumed samples: 25313280 | consumed tokens: 51841597440 | elapsed time per iteration (s): 0.58 | learning rate: 2.895E-05 | global batch size: 256 | lm loss: 2.570629E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.692 | TFLOPs: 42.02 | +7: iteration 98890/ 115203 | consumed samples: 25315840 | consumed tokens: 51846840320 | elapsed time per iteration (s): 0.57 | learning rate: 2.894E-05 | global batch size: 256 | lm loss: 2.568813E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.307 | TFLOPs: 42.46 | +7: iteration 98900/ 115203 | consumed samples: 25318400 | consumed tokens: 51852083200 | elapsed time per iteration (s): 0.57 | learning rate: 2.892E-05 | global batch size: 256 | lm loss: 2.553006E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.826 | TFLOPs: 43.17 | +7: iteration 98910/ 115203 | consumed samples: 25320960 | consumed tokens: 51857326080 | elapsed time per iteration (s): 0.57 | learning rate: 2.891E-05 | global batch size: 256 | lm loss: 2.567873E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.998 | TFLOPs: 42.52 | +7: iteration 98920/ 115203 | consumed samples: 25323520 | consumed tokens: 51862568960 | elapsed time per iteration (s): 0.57 | learning rate: 2.890E-05 | global batch size: 256 | lm loss: 2.564605E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.817 | TFLOPs: 42.69 | +7: iteration 98930/ 115203 | consumed samples: 25326080 | consumed tokens: 51867811840 | elapsed time per iteration (s): 0.57 | learning rate: 2.889E-05 | global batch size: 256 | lm loss: 2.580833E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.456 | TFLOPs: 43.04 | +7: iteration 98940/ 115203 | consumed samples: 25328640 | consumed tokens: 51873054720 | elapsed time per iteration (s): 0.56 | learning rate: 2.888E-05 | global batch size: 256 | lm loss: 2.571118E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.125 | TFLOPs: 43.39 | +7: iteration 98950/ 115203 | consumed samples: 25331200 | consumed tokens: 51878297600 | elapsed time per iteration (s): 0.57 | learning rate: 2.887E-05 | global batch size: 256 | lm loss: 2.571500E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.375 | TFLOPs: 42.84 | +7: iteration 98960/ 115203 | consumed samples: 25333760 | consumed tokens: 51883540480 | elapsed time per iteration (s): 0.57 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 2.573040E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.425 | TFLOPs: 42.56 | +7: iteration 98970/ 115203 | consumed samples: 25336320 | consumed tokens: 51888783360 | elapsed time per iteration (s): 0.57 | learning rate: 2.885E-05 | global batch size: 256 | lm loss: 2.565384E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.644 | TFLOPs: 42.68 | +7: iteration 98980/ 115203 | consumed samples: 25338880 | consumed tokens: 51894026240 | elapsed time per iteration (s): 0.57 | learning rate: 2.884E-05 | global batch size: 256 | lm loss: 2.584159E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.629 | TFLOPs: 42.96 | +7: iteration 98990/ 115203 | consumed samples: 25341440 | consumed tokens: 51899269120 | elapsed time per iteration (s): 0.57 | learning rate: 2.883E-05 | global batch size: 256 | lm loss: 2.560223E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.971 | TFLOPs: 42.80 | +7: iteration 99000/ 115203 | consumed samples: 25344000 | consumed tokens: 51904512000 | elapsed time per iteration (s): 0.56 | learning rate: 2.882E-05 | global batch size: 256 | lm loss: 2.563700E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.340 | TFLOPs: 43.22 | +7: iteration 99010/ 115203 | consumed samples: 25346560 | consumed tokens: 51909754880 | elapsed time per iteration (s): 0.57 | learning rate: 2.881E-05 | global batch size: 256 | lm loss: 2.572623E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.460 | TFLOPs: 42.57 | +7: iteration 99020/ 115203 | consumed samples: 25349120 | consumed tokens: 51914997760 | elapsed time per iteration (s): 0.58 | learning rate: 2.880E-05 | global batch size: 256 | lm loss: 2.589543E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.973 | TFLOPs: 42.14 | +7: iteration 99030/ 115203 | consumed samples: 25351680 | consumed tokens: 51920240640 | elapsed time per iteration (s): 0.56 | learning rate: 2.878E-05 | global batch size: 256 | lm loss: 2.559565E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.436 | TFLOPs: 43.33 | +7: iteration 99040/ 115203 | consumed samples: 25354240 | consumed tokens: 51925483520 | elapsed time per iteration (s): 0.61 | learning rate: 2.877E-05 | global batch size: 256 | lm loss: 2.571946E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 416.420 | TFLOPs: 39.70 | +7: iteration 99050/ 115203 | consumed samples: 25356800 | consumed tokens: 51930726400 | elapsed time per iteration (s): 0.57 | learning rate: 2.876E-05 | global batch size: 256 | lm loss: 2.567582E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.984 | TFLOPs: 43.00 | +7: iteration 99060/ 115203 | consumed samples: 25359360 | consumed tokens: 51935969280 | elapsed time per iteration (s): 0.56 | learning rate: 2.875E-05 | global batch size: 256 | lm loss: 2.562840E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.607 | TFLOPs: 43.25 | +7: iteration 99070/ 115203 | consumed samples: 25361920 | consumed tokens: 51941212160 | elapsed time per iteration (s): 0.56 | learning rate: 2.874E-05 | global batch size: 256 | lm loss: 2.575918E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.664 | TFLOPs: 43.54 | +7: iteration 99080/ 115203 | consumed samples: 25364480 | consumed tokens: 51946455040 | elapsed time per iteration (s): 0.57 | learning rate: 2.873E-05 | global batch size: 256 | lm loss: 2.567936E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.043 | TFLOPs: 43.00 | +7: iteration 99090/ 115203 | consumed samples: 25367040 | consumed tokens: 51951697920 | elapsed time per iteration (s): 0.58 | learning rate: 2.872E-05 | global batch size: 256 | lm loss: 2.576184E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.985 | TFLOPs: 42.42 | +7: iteration 99100/ 115203 | consumed samples: 25369600 | consumed tokens: 51956940800 | elapsed time per iteration (s): 0.57 | learning rate: 2.871E-05 | global batch size: 256 | lm loss: 2.566781E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.988 | TFLOPs: 42.81 | +7: iteration 99110/ 115203 | consumed samples: 25372160 | consumed tokens: 51962183680 | elapsed time per iteration (s): 0.57 | learning rate: 2.870E-05 | global batch size: 256 | lm loss: 2.568011E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.563 | TFLOPs: 42.96 | +7: iteration 99120/ 115203 | consumed samples: 25374720 | consumed tokens: 51967426560 | elapsed time per iteration (s): 0.57 | learning rate: 2.869E-05 | global batch size: 256 | lm loss: 2.565638E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.976 | TFLOPs: 42.80 | +7: iteration 99130/ 115203 | consumed samples: 25377280 | consumed tokens: 51972669440 | elapsed time per iteration (s): 0.57 | learning rate: 2.868E-05 | global batch size: 256 | lm loss: 2.575803E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.721 | TFLOPs: 42.49 | +7: iteration 99140/ 115203 | consumed samples: 25379840 | consumed tokens: 51977912320 | elapsed time per iteration (s): 0.57 | learning rate: 2.867E-05 | global batch size: 256 | lm loss: 2.575998E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.639 | TFLOPs: 42.96 | +7: iteration 99150/ 115203 | consumed samples: 25382400 | consumed tokens: 51983155200 | elapsed time per iteration (s): 0.57 | learning rate: 2.866E-05 | global batch size: 256 | lm loss: 2.566453E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.580 | TFLOPs: 42.77 | +7: iteration 99160/ 115203 | consumed samples: 25384960 | consumed tokens: 51988398080 | elapsed time per iteration (s): 0.58 | learning rate: 2.865E-05 | global batch size: 256 | lm loss: 2.564193E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.941 | TFLOPs: 42.42 | +7: iteration 99170/ 115203 | consumed samples: 25387520 | consumed tokens: 51993640960 | elapsed time per iteration (s): 0.56 | learning rate: 2.864E-05 | global batch size: 256 | lm loss: 2.568604E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.587 | TFLOPs: 43.44 | +7: iteration 99180/ 115203 | consumed samples: 25390080 | consumed tokens: 51998883840 | elapsed time per iteration (s): 0.57 | learning rate: 2.863E-05 | global batch size: 256 | lm loss: 2.573218E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.599 | TFLOPs: 42.96 | +7: iteration 99190/ 115203 | consumed samples: 25392640 | consumed tokens: 52004126720 | elapsed time per iteration (s): 0.56 | learning rate: 2.861E-05 | global batch size: 256 | lm loss: 2.559943E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.439 | TFLOPs: 43.42 | +7: iteration 99200/ 115203 | consumed samples: 25395200 | consumed tokens: 52009369600 | elapsed time per iteration (s): 0.59 | learning rate: 2.860E-05 | global batch size: 256 | lm loss: 2.571449E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.198 | TFLOPs: 41.30 | +7: iteration 99210/ 115203 | consumed samples: 25397760 | consumed tokens: 52014612480 | elapsed time per iteration (s): 0.58 | learning rate: 2.859E-05 | global batch size: 256 | lm loss: 2.576593E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.382 | TFLOPs: 41.99 | +7: iteration 99220/ 115203 | consumed samples: 25400320 | consumed tokens: 52019855360 | elapsed time per iteration (s): 0.57 | learning rate: 2.858E-05 | global batch size: 256 | lm loss: 2.562348E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.526 | TFLOPs: 43.05 | +7: iteration 99230/ 115203 | consumed samples: 25402880 | consumed tokens: 52025098240 | elapsed time per iteration (s): 0.57 | learning rate: 2.857E-05 | global batch size: 256 | lm loss: 2.583777E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.727 | TFLOPs: 43.07 | +7: iteration 99240/ 115203 | consumed samples: 25405440 | consumed tokens: 52030341120 | elapsed time per iteration (s): 0.58 | learning rate: 2.856E-05 | global batch size: 256 | lm loss: 2.570750E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.902 | TFLOPs: 42.42 | +7: iteration 99250/ 115203 | consumed samples: 25408000 | consumed tokens: 52035584000 | elapsed time per iteration (s): 0.57 | learning rate: 2.855E-05 | global batch size: 256 | lm loss: 2.580532E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.947 | TFLOPs: 42.90 | +7: iteration 99260/ 115203 | consumed samples: 25410560 | consumed tokens: 52040826880 | elapsed time per iteration (s): 0.56 | learning rate: 2.854E-05 | global batch size: 256 | lm loss: 2.563091E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.745 | TFLOPs: 43.55 | +7: iteration 99270/ 115203 | consumed samples: 25413120 | consumed tokens: 52046069760 | elapsed time per iteration (s): 0.56 | learning rate: 2.853E-05 | global batch size: 256 | lm loss: 2.564835E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.069 | TFLOPs: 43.48 | +7: iteration 99280/ 115203 | consumed samples: 25415680 | consumed tokens: 52051312640 | elapsed time per iteration (s): 0.57 | learning rate: 2.852E-05 | global batch size: 256 | lm loss: 2.566097E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.734 | TFLOPs: 42.97 | +7: iteration 99290/ 115203 | consumed samples: 25418240 | consumed tokens: 52056555520 | elapsed time per iteration (s): 0.57 | learning rate: 2.851E-05 | global batch size: 256 | lm loss: 2.571638E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.563 | TFLOPs: 42.77 | +7: iteration 99300/ 115203 | consumed samples: 25420800 | consumed tokens: 52061798400 | elapsed time per iteration (s): 0.56 | learning rate: 2.850E-05 | global batch size: 256 | lm loss: 2.585742E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.850 | TFLOPs: 43.46 | +7: iteration 99310/ 115203 | consumed samples: 25423360 | consumed tokens: 52067041280 | elapsed time per iteration (s): 0.57 | learning rate: 2.849E-05 | global batch size: 256 | lm loss: 2.572232E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.183 | TFLOPs: 42.92 | +7: iteration 99320/ 115203 | consumed samples: 25425920 | consumed tokens: 52072284160 | elapsed time per iteration (s): 0.57 | learning rate: 2.848E-05 | global batch size: 256 | lm loss: 2.575148E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.886 | TFLOPs: 42.61 | +7: iteration 99330/ 115203 | consumed samples: 25428480 | consumed tokens: 52077527040 | elapsed time per iteration (s): 0.57 | learning rate: 2.847E-05 | global batch size: 256 | lm loss: 2.578066E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.662 | TFLOPs: 42.97 | +7: iteration 99340/ 115203 | consumed samples: 25431040 | consumed tokens: 52082769920 | elapsed time per iteration (s): 0.57 | learning rate: 2.846E-05 | global batch size: 256 | lm loss: 2.567291E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.816 | TFLOPs: 43.08 | +7: iteration 99350/ 115203 | consumed samples: 25433600 | consumed tokens: 52088012800 | elapsed time per iteration (s): 0.57 | learning rate: 2.845E-05 | global batch size: 256 | lm loss: 2.574355E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.509 | TFLOPs: 42.57 | +7: iteration 99360/ 115203 | consumed samples: 25436160 | consumed tokens: 52093255680 | elapsed time per iteration (s): 0.56 | learning rate: 2.844E-05 | global batch size: 256 | lm loss: 2.581985E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.389 | TFLOPs: 43.32 | +7: iteration 99370/ 115203 | consumed samples: 25438720 | consumed tokens: 52098498560 | elapsed time per iteration (s): 0.57 | learning rate: 2.843E-05 | global batch size: 256 | lm loss: 2.575688E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.571 | TFLOPs: 42.96 | +7: iteration 99380/ 115203 | consumed samples: 25441280 | consumed tokens: 52103741440 | elapsed time per iteration (s): 0.56 | learning rate: 2.841E-05 | global batch size: 256 | lm loss: 2.576067E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.373 | TFLOPs: 43.41 | +7: iteration 99390/ 115203 | consumed samples: 25443840 | consumed tokens: 52108984320 | elapsed time per iteration (s): 0.57 | learning rate: 2.840E-05 | global batch size: 256 | lm loss: 2.583749E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.220 | TFLOPs: 42.92 | +7: iteration 99400/ 115203 | consumed samples: 25446400 | consumed tokens: 52114227200 | elapsed time per iteration (s): 0.59 | learning rate: 2.839E-05 | global batch size: 256 | lm loss: 2.559199E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.936 | TFLOPs: 41.56 | +7: iteration 99410/ 115203 | consumed samples: 25448960 | consumed tokens: 52119470080 | elapsed time per iteration (s): 0.58 | learning rate: 2.838E-05 | global batch size: 256 | lm loss: 2.573579E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.460 | TFLOPs: 41.99 | +7: iteration 99420/ 115203 | consumed samples: 25451520 | consumed tokens: 52124712960 | elapsed time per iteration (s): 0.56 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 2.566049E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.822 | TFLOPs: 43.46 | +7: iteration 99430/ 115203 | consumed samples: 25454080 | consumed tokens: 52129955840 | elapsed time per iteration (s): 0.56 | learning rate: 2.836E-05 | global batch size: 256 | lm loss: 2.574266E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.970 | TFLOPs: 43.28 | +7: iteration 99440/ 115203 | consumed samples: 25456640 | consumed tokens: 52135198720 | elapsed time per iteration (s): 0.56 | learning rate: 2.835E-05 | global batch size: 256 | lm loss: 2.555713E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.301 | TFLOPs: 43.50 | +7: iteration 99450/ 115203 | consumed samples: 25459200 | consumed tokens: 52140441600 | elapsed time per iteration (s): 0.58 | learning rate: 2.834E-05 | global batch size: 256 | lm loss: 2.575582E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.492 | TFLOPs: 42.09 | +7: iteration 99460/ 115203 | consumed samples: 25461760 | consumed tokens: 52145684480 | elapsed time per iteration (s): 0.56 | learning rate: 2.833E-05 | global batch size: 256 | lm loss: 2.558536E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.597 | TFLOPs: 43.53 | +7: iteration 99470/ 115203 | consumed samples: 25464320 | consumed tokens: 52150927360 | elapsed time per iteration (s): 0.57 | learning rate: 2.832E-05 | global batch size: 256 | lm loss: 2.577741E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.284 | TFLOPs: 42.64 | +7: iteration 99480/ 115203 | consumed samples: 25466880 | consumed tokens: 52156170240 | elapsed time per iteration (s): 0.58 | learning rate: 2.831E-05 | global batch size: 256 | lm loss: 2.573116E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.729 | TFLOPs: 42.02 | +7: iteration 99490/ 115203 | consumed samples: 25469440 | consumed tokens: 52161413120 | elapsed time per iteration (s): 0.57 | learning rate: 2.830E-05 | global batch size: 256 | lm loss: 2.574364E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.054 | TFLOPs: 42.91 | +7: iteration 99500/ 115203 | consumed samples: 25472000 | consumed tokens: 52166656000 | elapsed time per iteration (s): 0.56 | learning rate: 2.829E-05 | global batch size: 256 | lm loss: 2.566010E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.525 | TFLOPs: 43.52 | +7: iteration 99510/ 115203 | consumed samples: 25474560 | consumed tokens: 52171898880 | elapsed time per iteration (s): 0.56 | learning rate: 2.828E-05 | global batch size: 256 | lm loss: 2.574937E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.636 | TFLOPs: 43.63 | +7: iteration 99520/ 115203 | consumed samples: 25477120 | consumed tokens: 52177141760 | elapsed time per iteration (s): 0.56 | learning rate: 2.827E-05 | global batch size: 256 | lm loss: 2.578159E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.133 | TFLOPs: 43.87 | +7: iteration 99530/ 115203 | consumed samples: 25479680 | consumed tokens: 52182384640 | elapsed time per iteration (s): 0.59 | learning rate: 2.826E-05 | global batch size: 256 | lm loss: 2.569173E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.317 | TFLOPs: 41.50 | +7: iteration 99540/ 115203 | consumed samples: 25482240 | consumed tokens: 52187627520 | elapsed time per iteration (s): 0.58 | learning rate: 2.825E-05 | global batch size: 256 | lm loss: 2.575385E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.826 | TFLOPs: 41.93 | +7: iteration 99550/ 115203 | consumed samples: 25484800 | consumed tokens: 52192870400 | elapsed time per iteration (s): 0.58 | learning rate: 2.824E-05 | global batch size: 256 | lm loss: 2.559478E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.336 | TFLOPs: 41.89 | +7: iteration 99560/ 115203 | consumed samples: 25487360 | consumed tokens: 52198113280 | elapsed time per iteration (s): 0.59 | learning rate: 2.823E-05 | global batch size: 256 | lm loss: 2.565407E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.334 | TFLOPs: 41.12 | +7: iteration 99570/ 115203 | consumed samples: 25489920 | consumed tokens: 52203356160 | elapsed time per iteration (s): 0.58 | learning rate: 2.822E-05 | global batch size: 256 | lm loss: 2.579092E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.237 | TFLOPs: 41.97 | +7: iteration 99580/ 115203 | consumed samples: 25492480 | consumed tokens: 52208599040 | elapsed time per iteration (s): 0.60 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 2.569869E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.993 | TFLOPs: 40.90 | +7: iteration 99590/ 115203 | consumed samples: 25495040 | consumed tokens: 52213841920 | elapsed time per iteration (s): 0.57 | learning rate: 2.820E-05 | global batch size: 256 | lm loss: 2.566796E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.150 | TFLOPs: 42.73 | +7: iteration 99600/ 115203 | consumed samples: 25497600 | consumed tokens: 52219084800 | elapsed time per iteration (s): 0.57 | learning rate: 2.819E-05 | global batch size: 256 | lm loss: 2.563897E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.934 | TFLOPs: 42.99 | +7: iteration 99610/ 115203 | consumed samples: 25500160 | consumed tokens: 52224327680 | elapsed time per iteration (s): 0.57 | learning rate: 2.818E-05 | global batch size: 256 | lm loss: 2.571809E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.852 | TFLOPs: 43.17 | +7: iteration 99620/ 115203 | consumed samples: 25502720 | consumed tokens: 52229570560 | elapsed time per iteration (s): 0.57 | learning rate: 2.817E-05 | global batch size: 256 | lm loss: 2.563022E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.209 | TFLOPs: 42.73 | +7: iteration 99630/ 115203 | consumed samples: 25505280 | consumed tokens: 52234813440 | elapsed time per iteration (s): 0.57 | learning rate: 2.816E-05 | global batch size: 256 | lm loss: 2.571988E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.033 | TFLOPs: 42.72 | +7: iteration 99640/ 115203 | consumed samples: 25507840 | consumed tokens: 52240056320 | elapsed time per iteration (s): 0.57 | learning rate: 2.814E-05 | global batch size: 256 | lm loss: 2.571865E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.635 | TFLOPs: 42.77 | +7: iteration 99650/ 115203 | consumed samples: 25510400 | consumed tokens: 52245299200 | elapsed time per iteration (s): 0.57 | learning rate: 2.813E-05 | global batch size: 256 | lm loss: 2.565739E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.245 | TFLOPs: 42.93 | +7: iteration 99660/ 115203 | consumed samples: 25512960 | consumed tokens: 52250542080 | elapsed time per iteration (s): 0.59 | learning rate: 2.812E-05 | global batch size: 256 | lm loss: 2.565184E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.146 | TFLOPs: 41.58 | +7: iteration 99670/ 115203 | consumed samples: 25515520 | consumed tokens: 52255784960 | elapsed time per iteration (s): 0.58 | learning rate: 2.811E-05 | global batch size: 256 | lm loss: 2.574498E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.567 | TFLOPs: 41.91 | +7: iteration 99680/ 115203 | consumed samples: 25518080 | consumed tokens: 52261027840 | elapsed time per iteration (s): 0.58 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 2.566035E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.791 | TFLOPs: 42.02 | +7: iteration 99690/ 115203 | consumed samples: 25520640 | consumed tokens: 52266270720 | elapsed time per iteration (s): 0.59 | learning rate: 2.809E-05 | global batch size: 256 | lm loss: 2.572603E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.756 | TFLOPs: 41.07 | +7: iteration 99700/ 115203 | consumed samples: 25523200 | consumed tokens: 52271513600 | elapsed time per iteration (s): 0.56 | learning rate: 2.808E-05 | global batch size: 256 | lm loss: 2.564696E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.381 | TFLOPs: 43.70 | +7: iteration 99710/ 115203 | consumed samples: 25525760 | consumed tokens: 52276756480 | elapsed time per iteration (s): 0.59 | learning rate: 2.807E-05 | global batch size: 256 | lm loss: 2.574866E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.832 | TFLOPs: 41.27 | +7: iteration 99720/ 115203 | consumed samples: 25528320 | consumed tokens: 52281999360 | elapsed time per iteration (s): 0.59 | learning rate: 2.806E-05 | global batch size: 256 | lm loss: 2.567020E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.916 | TFLOPs: 41.56 | +7: iteration 99730/ 115203 | consumed samples: 25530880 | consumed tokens: 52287242240 | elapsed time per iteration (s): 0.57 | learning rate: 2.805E-05 | global batch size: 256 | lm loss: 2.581875E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.090 | TFLOPs: 42.72 | +7: iteration 99740/ 115203 | consumed samples: 25533440 | consumed tokens: 52292485120 | elapsed time per iteration (s): 0.61 | learning rate: 2.804E-05 | global batch size: 256 | lm loss: 2.564457E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.055 | TFLOPs: 40.14 | +7: iteration 99750/ 115203 | consumed samples: 25536000 | consumed tokens: 52297728000 | elapsed time per iteration (s): 0.60 | learning rate: 2.803E-05 | global batch size: 256 | lm loss: 2.574812E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.615 | TFLOPs: 40.58 | +7: iteration 99760/ 115203 | consumed samples: 25538560 | consumed tokens: 52302970880 | elapsed time per iteration (s): 0.57 | learning rate: 2.802E-05 | global batch size: 256 | lm loss: 2.572085E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.861 | TFLOPs: 42.89 | +7: iteration 99770/ 115203 | consumed samples: 25541120 | consumed tokens: 52308213760 | elapsed time per iteration (s): 0.59 | learning rate: 2.801E-05 | global batch size: 256 | lm loss: 2.570036E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.685 | TFLOPs: 41.54 | +7: iteration 99780/ 115203 | consumed samples: 25543680 | consumed tokens: 52313456640 | elapsed time per iteration (s): 0.56 | learning rate: 2.800E-05 | global batch size: 256 | lm loss: 2.566486E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.024 | TFLOPs: 43.86 | +7: iteration 99790/ 115203 | consumed samples: 25546240 | consumed tokens: 52318699520 | elapsed time per iteration (s): 0.57 | learning rate: 2.799E-05 | global batch size: 256 | lm loss: 2.560297E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.100 | TFLOPs: 42.53 | +7: iteration 99800/ 115203 | consumed samples: 25548800 | consumed tokens: 52323942400 | elapsed time per iteration (s): 0.57 | learning rate: 2.798E-05 | global batch size: 256 | lm loss: 2.560955E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.209 | TFLOPs: 42.64 | +7: iteration 99810/ 115203 | consumed samples: 25551360 | consumed tokens: 52329185280 | elapsed time per iteration (s): 0.57 | learning rate: 2.797E-05 | global batch size: 256 | lm loss: 2.567651E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.044 | TFLOPs: 42.62 | +7: iteration 99820/ 115203 | consumed samples: 25553920 | consumed tokens: 52334428160 | elapsed time per iteration (s): 0.56 | learning rate: 2.796E-05 | global batch size: 256 | lm loss: 2.584028E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.521 | TFLOPs: 43.24 | +7: iteration 99830/ 115203 | consumed samples: 25556480 | consumed tokens: 52339671040 | elapsed time per iteration (s): 0.56 | learning rate: 2.795E-05 | global batch size: 256 | lm loss: 2.567442E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.113 | TFLOPs: 43.58 | +7: iteration 99840/ 115203 | consumed samples: 25559040 | consumed tokens: 52344913920 | elapsed time per iteration (s): 0.57 | learning rate: 2.794E-05 | global batch size: 256 | lm loss: 2.566289E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.278 | TFLOPs: 43.02 | +7: iteration 99850/ 115203 | consumed samples: 25561600 | consumed tokens: 52350156800 | elapsed time per iteration (s): 0.57 | learning rate: 2.793E-05 | global batch size: 256 | lm loss: 2.578172E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.729 | TFLOPs: 42.88 | +7: iteration 99860/ 115203 | consumed samples: 25564160 | consumed tokens: 52355399680 | elapsed time per iteration (s): 0.57 | learning rate: 2.792E-05 | global batch size: 256 | lm loss: 2.564444E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.442 | TFLOPs: 42.85 | +7: iteration 99870/ 115203 | consumed samples: 25566720 | consumed tokens: 52360642560 | elapsed time per iteration (s): 0.57 | learning rate: 2.791E-05 | global batch size: 256 | lm loss: 2.565980E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.450 | TFLOPs: 42.56 | +7: iteration 99880/ 115203 | consumed samples: 25569280 | consumed tokens: 52365885440 | elapsed time per iteration (s): 0.56 | learning rate: 2.790E-05 | global batch size: 256 | lm loss: 2.575043E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.423 | TFLOPs: 43.52 | +7: iteration 99890/ 115203 | consumed samples: 25571840 | consumed tokens: 52371128320 | elapsed time per iteration (s): 0.58 | learning rate: 2.789E-05 | global batch size: 256 | lm loss: 2.579307E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.779 | TFLOPs: 41.74 | +7: iteration 99900/ 115203 | consumed samples: 25574400 | consumed tokens: 52376371200 | elapsed time per iteration (s): 0.56 | learning rate: 2.788E-05 | global batch size: 256 | lm loss: 2.583781E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.126 | TFLOPs: 43.30 | +7: iteration 99910/ 115203 | consumed samples: 25576960 | consumed tokens: 52381614080 | elapsed time per iteration (s): 0.57 | learning rate: 2.787E-05 | global batch size: 256 | lm loss: 2.584577E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.671 | TFLOPs: 42.68 | +7: iteration 99920/ 115203 | consumed samples: 25579520 | consumed tokens: 52386856960 | elapsed time per iteration (s): 0.57 | learning rate: 2.786E-05 | global batch size: 256 | lm loss: 2.572752E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.717 | TFLOPs: 43.07 | +7: iteration 99930/ 115203 | consumed samples: 25582080 | consumed tokens: 52392099840 | elapsed time per iteration (s): 0.58 | learning rate: 2.785E-05 | global batch size: 256 | lm loss: 2.571269E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.246 | TFLOPs: 41.88 | +7: iteration 99940/ 115203 | consumed samples: 25584640 | consumed tokens: 52397342720 | elapsed time per iteration (s): 0.58 | learning rate: 2.784E-05 | global batch size: 256 | lm loss: 2.559853E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.343 | TFLOPs: 42.36 | +7: iteration 99950/ 115203 | consumed samples: 25587200 | consumed tokens: 52402585600 | elapsed time per iteration (s): 0.57 | learning rate: 2.783E-05 | global batch size: 256 | lm loss: 2.571384E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.748 | TFLOPs: 43.07 | +7: iteration 99960/ 115203 | consumed samples: 25589760 | consumed tokens: 52407828480 | elapsed time per iteration (s): 0.56 | learning rate: 2.782E-05 | global batch size: 256 | lm loss: 2.568592E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.176 | TFLOPs: 43.21 | +7: iteration 99970/ 115203 | consumed samples: 25592320 | consumed tokens: 52413071360 | elapsed time per iteration (s): 0.57 | learning rate: 2.781E-05 | global batch size: 256 | lm loss: 2.561370E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.309 | TFLOPs: 43.03 | +7: iteration 99980/ 115203 | consumed samples: 25594880 | consumed tokens: 52418314240 | elapsed time per iteration (s): 0.57 | learning rate: 2.780E-05 | global batch size: 256 | lm loss: 2.573972E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.146 | TFLOPs: 42.82 | +7: iteration 99990/ 115203 | consumed samples: 25597440 | consumed tokens: 52423557120 | elapsed time per iteration (s): 0.57 | learning rate: 2.779E-05 | global batch size: 256 | lm loss: 2.566480E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.066 | TFLOPs: 43.00 | +0: [2023-03-17 04:31:34,125] [INFO] [logging.py:68:log_dist] [Rank 0] step=100000, skipped=0, lr=[2.777783369036059e-05, 2.777783369036059e-05, 2.777783369036059e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 100000/ 115203 | consumed samples: 25600000 | consumed tokens: 52428800000 | elapsed time per iteration (s): 0.57 | learning rate: 2.778E-05 | global batch size: 256 | lm loss: 2.561579E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.128 | TFLOPs: 42.82 | +0: steps: 100000 loss: 2.5789 iter time (s): 0.567 samples/sec: 451.742 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 100000 | lm loss value: 3.351080E+00 | lm loss PPL: 2.853353E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 100000 to checkpoints_421m60b400m +0: [2023-03-17 04:31:34,344] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step100000 is begin to save! +0: [2023-03-17 04:31:34,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:31:34,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:31:34,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:31:34,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:31:34,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:31:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:31:34,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:31:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:31:34,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:31:34,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:31:34,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:31:34,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:31:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:31:34,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:31:34,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:31:34,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:31:34,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:31:34,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:31:34,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_11-model_00-model_states.pt... +0: [2023-03-17 04:31:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_11-model_00-model_states.pt. +0: [2023-03-17 04:31:34,902] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:31:34,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:31:34,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_13-model_00-model_states.pt... +0: [2023-03-17 04:31:34,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_13-model_00-model_states.pt. +0: [2023-03-17 04:31:34,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_14-model_00-model_states.pt... +0: [2023-03-17 04:31:35,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_14-model_00-model_states.pt. +0: [2023-03-17 04:31:35,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_15-model_00-model_states.pt... +0: [2023-03-17 04:31:35,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_15-model_00-model_states.pt. +0: [2023-03-17 04:31:35,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_16-model_00-model_states.pt... +0: [2023-03-17 04:31:35,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_16-model_00-model_states.pt. +0: [2023-03-17 04:31:35,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_17-model_00-model_states.pt... +0: [2023-03-17 04:31:35,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_17-model_00-model_states.pt. +0: [2023-03-17 04:31:35,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_18-model_00-model_states.pt... +0: [2023-03-17 04:31:35,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_18-model_00-model_states.pt. +0: [2023-03-17 04:31:35,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_19-model_00-model_states.pt... +0: [2023-03-17 04:31:35,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_19-model_00-model_states.pt. +0: [2023-03-17 04:31:35,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_20-model_00-model_states.pt... +0: [2023-03-17 04:31:35,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_20-model_00-model_states.pt. +0: [2023-03-17 04:31:35,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/layer_22-model_00-model_states.pt... +0: [2023-03-17 04:31:35,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/layer_22-model_00-model_states.pt. +0: [2023-03-17 04:31:35,285] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step100000/mp_rank_00_model_states.pt +0: [2023-03-17 04:31:35,285] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:31:35,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:31:35,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:31:35,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:31:35,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:31:35,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:31:35,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:31:35,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:31:35,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:31:35,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:31:35,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:31:35,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:31:35,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: successfully saved checkpoint at iteration 100000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1216.90 +7: iteration 100010/ 115203 | consumed samples: 25602560 | consumed tokens: 52434042880 | elapsed time per iteration (s): 0.70 | learning rate: 2.777E-05 | global batch size: 256 | lm loss: 2.565393E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 365.820 | TFLOPs: 34.88 | +7: iteration 100020/ 115203 | consumed samples: 25605120 | consumed tokens: 52439285760 | elapsed time per iteration (s): 0.56 | learning rate: 2.776E-05 | global batch size: 256 | lm loss: 2.563626E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.284 | TFLOPs: 43.31 | +7: iteration 100030/ 115203 | consumed samples: 25607680 | consumed tokens: 52444528640 | elapsed time per iteration (s): 0.58 | learning rate: 2.775E-05 | global batch size: 256 | lm loss: 2.569811E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.654 | TFLOPs: 42.30 | +7: iteration 100040/ 115203 | consumed samples: 25610240 | consumed tokens: 52449771520 | elapsed time per iteration (s): 0.57 | learning rate: 2.774E-05 | global batch size: 256 | lm loss: 2.563550E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.539 | TFLOPs: 42.67 | +7: iteration 100050/ 115203 | consumed samples: 25612800 | consumed tokens: 52455014400 | elapsed time per iteration (s): 0.56 | learning rate: 2.773E-05 | global batch size: 256 | lm loss: 2.559019E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.312 | TFLOPs: 43.22 | +7: iteration 100060/ 115203 | consumed samples: 25615360 | consumed tokens: 52460257280 | elapsed time per iteration (s): 0.56 | learning rate: 2.772E-05 | global batch size: 256 | lm loss: 2.564039E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.688 | TFLOPs: 43.92 | +7: iteration 100070/ 115203 | consumed samples: 25617920 | consumed tokens: 52465500160 | elapsed time per iteration (s): 0.55 | learning rate: 2.771E-05 | global batch size: 256 | lm loss: 2.575294E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.850 | TFLOPs: 44.03 | +7: iteration 100080/ 115203 | consumed samples: 25620480 | consumed tokens: 52470743040 | elapsed time per iteration (s): 0.56 | learning rate: 2.770E-05 | global batch size: 256 | lm loss: 2.571021E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.699 | TFLOPs: 43.73 | +7: iteration 100090/ 115203 | consumed samples: 25623040 | consumed tokens: 52475985920 | elapsed time per iteration (s): 0.57 | learning rate: 2.769E-05 | global batch size: 256 | lm loss: 2.566258E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.128 | TFLOPs: 43.01 | +7: iteration 100100/ 115203 | consumed samples: 25625600 | consumed tokens: 52481228800 | elapsed time per iteration (s): 0.55 | learning rate: 2.768E-05 | global batch size: 256 | lm loss: 2.575897E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.908 | TFLOPs: 44.04 | +7: iteration 100110/ 115203 | consumed samples: 25628160 | consumed tokens: 52486471680 | elapsed time per iteration (s): 0.56 | learning rate: 2.767E-05 | global batch size: 256 | lm loss: 2.563930E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.633 | TFLOPs: 43.25 | +7: iteration 100120/ 115203 | consumed samples: 25630720 | consumed tokens: 52491714560 | elapsed time per iteration (s): 0.57 | learning rate: 2.766E-05 | global batch size: 256 | lm loss: 2.588729E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.718 | TFLOPs: 43.07 | +7: iteration 100130/ 115203 | consumed samples: 25633280 | consumed tokens: 52496957440 | elapsed time per iteration (s): 0.56 | learning rate: 2.765E-05 | global batch size: 256 | lm loss: 2.572564E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.639 | TFLOPs: 43.25 | +7: iteration 100140/ 115203 | consumed samples: 25635840 | consumed tokens: 52502200320 | elapsed time per iteration (s): 0.57 | learning rate: 2.764E-05 | global batch size: 256 | lm loss: 2.553600E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.075 | TFLOPs: 43.01 | +7: iteration 100150/ 115203 | consumed samples: 25638400 | consumed tokens: 52507443200 | elapsed time per iteration (s): 0.56 | learning rate: 2.763E-05 | global batch size: 256 | lm loss: 2.580070E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.200 | TFLOPs: 43.49 | +7: iteration 100160/ 115203 | consumed samples: 25640960 | consumed tokens: 52512686080 | elapsed time per iteration (s): 0.56 | learning rate: 2.762E-05 | global batch size: 256 | lm loss: 2.561844E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.086 | TFLOPs: 43.86 | +7: iteration 100170/ 115203 | consumed samples: 25643520 | consumed tokens: 52517928960 | elapsed time per iteration (s): 0.57 | learning rate: 2.761E-05 | global batch size: 256 | lm loss: 2.573103E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.297 | TFLOPs: 42.45 | +7: iteration 100180/ 115203 | consumed samples: 25646080 | consumed tokens: 52523171840 | elapsed time per iteration (s): 0.56 | learning rate: 2.760E-05 | global batch size: 256 | lm loss: 2.575842E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.313 | TFLOPs: 43.41 | +7: iteration 100190/ 115203 | consumed samples: 25648640 | consumed tokens: 52528414720 | elapsed time per iteration (s): 0.56 | learning rate: 2.759E-05 | global batch size: 256 | lm loss: 2.572308E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.377 | TFLOPs: 43.22 | +7: iteration 100200/ 115203 | consumed samples: 25651200 | consumed tokens: 52533657600 | elapsed time per iteration (s): 0.56 | learning rate: 2.758E-05 | global batch size: 256 | lm loss: 2.568492E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.962 | TFLOPs: 43.57 | +7: iteration 100210/ 115203 | consumed samples: 25653760 | consumed tokens: 52538900480 | elapsed time per iteration (s): 0.57 | learning rate: 2.757E-05 | global batch size: 256 | lm loss: 2.584290E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.511 | TFLOPs: 43.05 | +7: iteration 100220/ 115203 | consumed samples: 25656320 | consumed tokens: 52544143360 | elapsed time per iteration (s): 0.57 | learning rate: 2.756E-05 | global batch size: 256 | lm loss: 2.565203E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.775 | TFLOPs: 43.07 | +7: iteration 100230/ 115203 | consumed samples: 25658880 | consumed tokens: 52549386240 | elapsed time per iteration (s): 0.56 | learning rate: 2.755E-05 | global batch size: 256 | lm loss: 2.555057E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.426 | TFLOPs: 43.90 | +7: iteration 100240/ 115203 | consumed samples: 25661440 | consumed tokens: 52554629120 | elapsed time per iteration (s): 0.56 | learning rate: 2.754E-05 | global batch size: 256 | lm loss: 2.567993E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.986 | TFLOPs: 43.57 | +7: iteration 100250/ 115203 | consumed samples: 25664000 | consumed tokens: 52559872000 | elapsed time per iteration (s): 0.56 | learning rate: 2.753E-05 | global batch size: 256 | lm loss: 2.565652E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.980 | TFLOPs: 43.76 | +7: iteration 100260/ 115203 | consumed samples: 25666560 | consumed tokens: 52565114880 | elapsed time per iteration (s): 0.56 | learning rate: 2.752E-05 | global batch size: 256 | lm loss: 2.570997E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.303 | TFLOPs: 43.60 | +7: iteration 100270/ 115203 | consumed samples: 25669120 | consumed tokens: 52570357760 | elapsed time per iteration (s): 0.56 | learning rate: 2.751E-05 | global batch size: 256 | lm loss: 2.576344E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.927 | TFLOPs: 43.66 | +7: iteration 100280/ 115203 | consumed samples: 25671680 | consumed tokens: 52575600640 | elapsed time per iteration (s): 0.55 | learning rate: 2.750E-05 | global batch size: 256 | lm loss: 2.559927E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.594 | TFLOPs: 44.01 | +7: iteration 100290/ 115203 | consumed samples: 25674240 | consumed tokens: 52580843520 | elapsed time per iteration (s): 0.55 | learning rate: 2.749E-05 | global batch size: 256 | lm loss: 2.570145E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.684 | TFLOPs: 44.02 | +7: iteration 100300/ 115203 | consumed samples: 25676800 | consumed tokens: 52586086400 | elapsed time per iteration (s): 0.56 | learning rate: 2.748E-05 | global batch size: 256 | lm loss: 2.570082E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.344 | TFLOPs: 43.32 | +7: iteration 100310/ 115203 | consumed samples: 25679360 | consumed tokens: 52591329280 | elapsed time per iteration (s): 0.56 | learning rate: 2.747E-05 | global batch size: 256 | lm loss: 2.567828E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.171 | TFLOPs: 43.87 | +7: iteration 100320/ 115203 | consumed samples: 25681920 | consumed tokens: 52596572160 | elapsed time per iteration (s): 0.57 | learning rate: 2.746E-05 | global batch size: 256 | lm loss: 2.566685E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.019 | TFLOPs: 42.71 | +7: iteration 100330/ 115203 | consumed samples: 25684480 | consumed tokens: 52601815040 | elapsed time per iteration (s): 0.56 | learning rate: 2.745E-05 | global batch size: 256 | lm loss: 2.570727E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.657 | TFLOPs: 43.73 | +7: iteration 100340/ 115203 | consumed samples: 25687040 | consumed tokens: 52607057920 | elapsed time per iteration (s): 0.56 | learning rate: 2.744E-05 | global batch size: 256 | lm loss: 2.566393E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.870 | TFLOPs: 43.75 | +7: iteration 100350/ 115203 | consumed samples: 25689600 | consumed tokens: 52612300800 | elapsed time per iteration (s): 0.56 | learning rate: 2.743E-05 | global batch size: 256 | lm loss: 2.562215E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.270 | TFLOPs: 43.50 | +7: iteration 100360/ 115203 | consumed samples: 25692160 | consumed tokens: 52617543680 | elapsed time per iteration (s): 0.57 | learning rate: 2.742E-05 | global batch size: 256 | lm loss: 2.570708E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.043 | TFLOPs: 42.81 | +7: iteration 100370/ 115203 | consumed samples: 25694720 | consumed tokens: 52622786560 | elapsed time per iteration (s): 0.56 | learning rate: 2.741E-05 | global batch size: 256 | lm loss: 2.559861E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.480 | TFLOPs: 43.43 | +7: iteration 100380/ 115203 | consumed samples: 25697280 | consumed tokens: 52628029440 | elapsed time per iteration (s): 0.57 | learning rate: 2.740E-05 | global batch size: 256 | lm loss: 2.570713E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.165 | TFLOPs: 42.92 | +7: iteration 100390/ 115203 | consumed samples: 25699840 | consumed tokens: 52633272320 | elapsed time per iteration (s): 0.57 | learning rate: 2.739E-05 | global batch size: 256 | lm loss: 2.555672E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.804 | TFLOPs: 43.07 | +7: iteration 100400/ 115203 | consumed samples: 25702400 | consumed tokens: 52638515200 | elapsed time per iteration (s): 0.56 | learning rate: 2.738E-05 | global batch size: 256 | lm loss: 2.557396E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.540 | TFLOPs: 43.62 | +7: iteration 100410/ 115203 | consumed samples: 25704960 | consumed tokens: 52643758080 | elapsed time per iteration (s): 0.57 | learning rate: 2.737E-05 | global batch size: 256 | lm loss: 2.572484E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.886 | TFLOPs: 42.61 | +7: iteration 100420/ 115203 | consumed samples: 25707520 | consumed tokens: 52649000960 | elapsed time per iteration (s): 0.56 | learning rate: 2.736E-05 | global batch size: 256 | lm loss: 2.570143E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.765 | TFLOPs: 43.55 | +7: iteration 100430/ 115203 | consumed samples: 25710080 | consumed tokens: 52654243840 | elapsed time per iteration (s): 0.55 | learning rate: 2.735E-05 | global batch size: 256 | lm loss: 2.576641E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.747 | TFLOPs: 44.02 | +7: iteration 100440/ 115203 | consumed samples: 25712640 | consumed tokens: 52659486720 | elapsed time per iteration (s): 0.56 | learning rate: 2.734E-05 | global batch size: 256 | lm loss: 2.569365E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.066 | TFLOPs: 43.58 | +7: iteration 100450/ 115203 | consumed samples: 25715200 | consumed tokens: 52664729600 | elapsed time per iteration (s): 0.58 | learning rate: 2.733E-05 | global batch size: 256 | lm loss: 2.574127E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.135 | TFLOPs: 42.34 | +7: iteration 100460/ 115203 | consumed samples: 25717760 | consumed tokens: 52669972480 | elapsed time per iteration (s): 0.58 | learning rate: 2.732E-05 | global batch size: 256 | lm loss: 2.582698E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.360 | TFLOPs: 42.08 | +7: iteration 100470/ 115203 | consumed samples: 25720320 | consumed tokens: 52675215360 | elapsed time per iteration (s): 0.56 | learning rate: 2.731E-05 | global batch size: 256 | lm loss: 2.566476E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.746 | TFLOPs: 43.93 | +7: iteration 100480/ 115203 | consumed samples: 25722880 | consumed tokens: 52680458240 | elapsed time per iteration (s): 0.56 | learning rate: 2.730E-05 | global batch size: 256 | lm loss: 2.577582E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.760 | TFLOPs: 43.83 | +7: iteration 100490/ 115203 | consumed samples: 25725440 | consumed tokens: 52685701120 | elapsed time per iteration (s): 0.56 | learning rate: 2.729E-05 | global batch size: 256 | lm loss: 2.561628E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.448 | TFLOPs: 43.52 | +7: iteration 100500/ 115203 | consumed samples: 25728000 | consumed tokens: 52690944000 | elapsed time per iteration (s): 0.56 | learning rate: 2.728E-05 | global batch size: 256 | lm loss: 2.558414E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.318 | TFLOPs: 43.60 | +7: iteration 100510/ 115203 | consumed samples: 25730560 | consumed tokens: 52696186880 | elapsed time per iteration (s): 0.56 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 2.579462E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.234 | TFLOPs: 43.31 | +7: iteration 100520/ 115203 | consumed samples: 25733120 | consumed tokens: 52701429760 | elapsed time per iteration (s): 0.57 | learning rate: 2.726E-05 | global batch size: 256 | lm loss: 2.563830E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.896 | TFLOPs: 42.70 | +7: iteration 100530/ 115203 | consumed samples: 25735680 | consumed tokens: 52706672640 | elapsed time per iteration (s): 0.56 | learning rate: 2.725E-05 | global batch size: 256 | lm loss: 2.560908E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.750 | TFLOPs: 43.93 | +7: iteration 100540/ 115203 | consumed samples: 25738240 | consumed tokens: 52711915520 | elapsed time per iteration (s): 0.57 | learning rate: 2.724E-05 | global batch size: 256 | lm loss: 2.554626E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.099 | TFLOPs: 42.91 | +7: iteration 100550/ 115203 | consumed samples: 25740800 | consumed tokens: 52717158400 | elapsed time per iteration (s): 0.56 | learning rate: 2.723E-05 | global batch size: 256 | lm loss: 2.554618E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.669 | TFLOPs: 43.92 | +7: iteration 100560/ 115203 | consumed samples: 25743360 | consumed tokens: 52722401280 | elapsed time per iteration (s): 0.57 | learning rate: 2.722E-05 | global batch size: 256 | lm loss: 2.568520E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.138 | TFLOPs: 42.92 | +7: iteration 100570/ 115203 | consumed samples: 25745920 | consumed tokens: 52727644160 | elapsed time per iteration (s): 0.56 | learning rate: 2.721E-05 | global batch size: 256 | lm loss: 2.566078E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.429 | TFLOPs: 43.90 | +7: iteration 100580/ 115203 | consumed samples: 25748480 | consumed tokens: 52732887040 | elapsed time per iteration (s): 0.56 | learning rate: 2.720E-05 | global batch size: 256 | lm loss: 2.557996E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.916 | TFLOPs: 43.56 | +7: iteration 100590/ 115203 | consumed samples: 25751040 | consumed tokens: 52738129920 | elapsed time per iteration (s): 0.56 | learning rate: 2.719E-05 | global batch size: 256 | lm loss: 2.552660E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.965 | TFLOPs: 43.76 | +7: iteration 100600/ 115203 | consumed samples: 25753600 | consumed tokens: 52743372800 | elapsed time per iteration (s): 0.55 | learning rate: 2.718E-05 | global batch size: 256 | lm loss: 2.562439E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.437 | TFLOPs: 43.99 | +7: iteration 100610/ 115203 | consumed samples: 25756160 | consumed tokens: 52748615680 | elapsed time per iteration (s): 0.56 | learning rate: 2.717E-05 | global batch size: 256 | lm loss: 2.582455E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.334 | TFLOPs: 43.60 | +7: iteration 100620/ 115203 | consumed samples: 25758720 | consumed tokens: 52753858560 | elapsed time per iteration (s): 0.56 | learning rate: 2.716E-05 | global batch size: 256 | lm loss: 2.581384E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.621 | TFLOPs: 43.53 | +7: iteration 100630/ 115203 | consumed samples: 25761280 | consumed tokens: 52759101440 | elapsed time per iteration (s): 0.56 | learning rate: 2.716E-05 | global batch size: 256 | lm loss: 2.568284E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.994 | TFLOPs: 43.86 | +7: iteration 100640/ 115203 | consumed samples: 25763840 | consumed tokens: 52764344320 | elapsed time per iteration (s): 0.56 | learning rate: 2.715E-05 | global batch size: 256 | lm loss: 2.564013E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.324 | TFLOPs: 43.60 | +7: iteration 100650/ 115203 | consumed samples: 25766400 | consumed tokens: 52769587200 | elapsed time per iteration (s): 0.56 | learning rate: 2.714E-05 | global batch size: 256 | lm loss: 2.567115E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.560 | TFLOPs: 43.72 | +7: iteration 100660/ 115203 | consumed samples: 25768960 | consumed tokens: 52774830080 | elapsed time per iteration (s): 0.56 | learning rate: 2.713E-05 | global batch size: 256 | lm loss: 2.570263E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.497 | TFLOPs: 43.90 | +7: iteration 100670/ 115203 | consumed samples: 25771520 | consumed tokens: 52780072960 | elapsed time per iteration (s): 0.55 | learning rate: 2.712E-05 | global batch size: 256 | lm loss: 2.567108E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.686 | TFLOPs: 44.02 | +7: iteration 100680/ 115203 | consumed samples: 25774080 | consumed tokens: 52785315840 | elapsed time per iteration (s): 0.57 | learning rate: 2.711E-05 | global batch size: 256 | lm loss: 2.572541E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.128 | TFLOPs: 42.82 | +7: iteration 100690/ 115203 | consumed samples: 25776640 | consumed tokens: 52790558720 | elapsed time per iteration (s): 0.57 | learning rate: 2.710E-05 | global batch size: 256 | lm loss: 2.570003E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.550 | TFLOPs: 43.15 | +7: iteration 100700/ 115203 | consumed samples: 25779200 | consumed tokens: 52795801600 | elapsed time per iteration (s): 0.56 | learning rate: 2.709E-05 | global batch size: 256 | lm loss: 2.562231E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.306 | TFLOPs: 43.50 | +7: iteration 100710/ 115203 | consumed samples: 25781760 | consumed tokens: 52801044480 | elapsed time per iteration (s): 0.57 | learning rate: 2.708E-05 | global batch size: 256 | lm loss: 2.576940E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.578 | TFLOPs: 42.96 | +7: iteration 100720/ 115203 | consumed samples: 25784320 | consumed tokens: 52806287360 | elapsed time per iteration (s): 0.57 | learning rate: 2.707E-05 | global batch size: 256 | lm loss: 2.569688E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.647 | TFLOPs: 43.06 | +7: iteration 100730/ 115203 | consumed samples: 25786880 | consumed tokens: 52811530240 | elapsed time per iteration (s): 0.57 | learning rate: 2.706E-05 | global batch size: 256 | lm loss: 2.564148E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.296 | TFLOPs: 42.45 | +7: iteration 100740/ 115203 | consumed samples: 25789440 | consumed tokens: 52816773120 | elapsed time per iteration (s): 0.56 | learning rate: 2.705E-05 | global batch size: 256 | lm loss: 2.559745E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.834 | TFLOPs: 43.84 | +7: iteration 100750/ 115203 | consumed samples: 25792000 | consumed tokens: 52822016000 | elapsed time per iteration (s): 0.57 | learning rate: 2.704E-05 | global batch size: 256 | lm loss: 2.566188E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.845 | TFLOPs: 43.08 | +7: iteration 100760/ 115203 | consumed samples: 25794560 | consumed tokens: 52827258880 | elapsed time per iteration (s): 0.56 | learning rate: 2.703E-05 | global batch size: 256 | lm loss: 2.559587E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.403 | TFLOPs: 43.61 | +7: iteration 100770/ 115203 | consumed samples: 25797120 | consumed tokens: 52832501760 | elapsed time per iteration (s): 0.56 | learning rate: 2.702E-05 | global batch size: 256 | lm loss: 2.567506E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.496 | TFLOPs: 43.62 | +7: iteration 100780/ 115203 | consumed samples: 25799680 | consumed tokens: 52837744640 | elapsed time per iteration (s): 0.56 | learning rate: 2.701E-05 | global batch size: 256 | lm loss: 2.559838E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.236 | TFLOPs: 43.69 | +7: iteration 100790/ 115203 | consumed samples: 25802240 | consumed tokens: 52842987520 | elapsed time per iteration (s): 0.56 | learning rate: 2.700E-05 | global batch size: 256 | lm loss: 2.567992E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.203 | TFLOPs: 43.59 | +7: iteration 100800/ 115203 | consumed samples: 25804800 | consumed tokens: 52848230400 | elapsed time per iteration (s): 0.57 | learning rate: 2.699E-05 | global batch size: 256 | lm loss: 2.590479E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.179 | TFLOPs: 42.92 | +7: iteration 100810/ 115203 | consumed samples: 25807360 | consumed tokens: 52853473280 | elapsed time per iteration (s): 0.56 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 2.571462E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.362 | TFLOPs: 43.51 | +7: iteration 100820/ 115203 | consumed samples: 25809920 | consumed tokens: 52858716160 | elapsed time per iteration (s): 0.56 | learning rate: 2.697E-05 | global batch size: 256 | lm loss: 2.557550E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.679 | TFLOPs: 43.25 | +7: iteration 100830/ 115203 | consumed samples: 25812480 | consumed tokens: 52863959040 | elapsed time per iteration (s): 0.56 | learning rate: 2.696E-05 | global batch size: 256 | lm loss: 2.571280E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.514 | TFLOPs: 43.90 | +7: iteration 100840/ 115203 | consumed samples: 25815040 | consumed tokens: 52869201920 | elapsed time per iteration (s): 0.56 | learning rate: 2.695E-05 | global batch size: 256 | lm loss: 2.566296E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.056 | TFLOPs: 43.67 | +7: iteration 100850/ 115203 | consumed samples: 25817600 | consumed tokens: 52874444800 | elapsed time per iteration (s): 0.57 | learning rate: 2.694E-05 | global batch size: 256 | lm loss: 2.561891E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.022 | TFLOPs: 43.00 | +7: iteration 100860/ 115203 | consumed samples: 25820160 | consumed tokens: 52879687680 | elapsed time per iteration (s): 0.56 | learning rate: 2.693E-05 | global batch size: 256 | lm loss: 2.566628E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.947 | TFLOPs: 43.47 | +7: iteration 100870/ 115203 | consumed samples: 25822720 | consumed tokens: 52884930560 | elapsed time per iteration (s): 0.56 | learning rate: 2.692E-05 | global batch size: 256 | lm loss: 2.561840E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.829 | TFLOPs: 43.27 | +7: iteration 100880/ 115203 | consumed samples: 25825280 | consumed tokens: 52890173440 | elapsed time per iteration (s): 0.57 | learning rate: 2.691E-05 | global batch size: 256 | lm loss: 2.562915E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.454 | TFLOPs: 42.95 | +7: iteration 100890/ 115203 | consumed samples: 25827840 | consumed tokens: 52895416320 | elapsed time per iteration (s): 0.57 | learning rate: 2.691E-05 | global batch size: 256 | lm loss: 2.574418E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.293 | TFLOPs: 43.12 | +7: iteration 100900/ 115203 | consumed samples: 25830400 | consumed tokens: 52900659200 | elapsed time per iteration (s): 0.56 | learning rate: 2.690E-05 | global batch size: 256 | lm loss: 2.573812E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.675 | TFLOPs: 43.92 | +7: iteration 100910/ 115203 | consumed samples: 25832960 | consumed tokens: 52905902080 | elapsed time per iteration (s): 0.57 | learning rate: 2.689E-05 | global batch size: 256 | lm loss: 2.570172E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.738 | TFLOPs: 43.16 | +7: iteration 100920/ 115203 | consumed samples: 25835520 | consumed tokens: 52911144960 | elapsed time per iteration (s): 0.56 | learning rate: 2.688E-05 | global batch size: 256 | lm loss: 2.567222E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.961 | TFLOPs: 43.66 | +7: iteration 100930/ 115203 | consumed samples: 25838080 | consumed tokens: 52916387840 | elapsed time per iteration (s): 0.56 | learning rate: 2.687E-05 | global batch size: 256 | lm loss: 2.571109E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.217 | TFLOPs: 43.59 | +7: iteration 100940/ 115203 | consumed samples: 25840640 | consumed tokens: 52921630720 | elapsed time per iteration (s): 0.56 | learning rate: 2.686E-05 | global batch size: 256 | lm loss: 2.550703E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.401 | TFLOPs: 43.51 | +7: iteration 100950/ 115203 | consumed samples: 25843200 | consumed tokens: 52926873600 | elapsed time per iteration (s): 0.55 | learning rate: 2.685E-05 | global batch size: 256 | lm loss: 2.568086E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.707 | TFLOPs: 44.02 | +7: iteration 100960/ 115203 | consumed samples: 25845760 | consumed tokens: 52932116480 | elapsed time per iteration (s): 0.56 | learning rate: 2.684E-05 | global batch size: 256 | lm loss: 2.585496E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.432 | TFLOPs: 43.52 | +7: iteration 100970/ 115203 | consumed samples: 25848320 | consumed tokens: 52937359360 | elapsed time per iteration (s): 0.58 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 2.562080E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.134 | TFLOPs: 42.44 | +7: iteration 100980/ 115203 | consumed samples: 25850880 | consumed tokens: 52942602240 | elapsed time per iteration (s): 0.56 | learning rate: 2.682E-05 | global batch size: 256 | lm loss: 2.570810E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.860 | TFLOPs: 43.65 | +7: iteration 100990/ 115203 | consumed samples: 25853440 | consumed tokens: 52947845120 | elapsed time per iteration (s): 0.55 | learning rate: 2.681E-05 | global batch size: 256 | lm loss: 2.563274E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.905 | TFLOPs: 44.04 | +7: iteration 101000/ 115203 | consumed samples: 25856000 | consumed tokens: 52953088000 | elapsed time per iteration (s): 0.57 | learning rate: 2.680E-05 | global batch size: 256 | lm loss: 2.562324E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.144 | TFLOPs: 43.01 | +7: iteration 101010/ 115203 | consumed samples: 25858560 | consumed tokens: 52958330880 | elapsed time per iteration (s): 0.57 | learning rate: 2.679E-05 | global batch size: 256 | lm loss: 2.566871E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.670 | TFLOPs: 42.59 | +7: iteration 101020/ 115203 | consumed samples: 25861120 | consumed tokens: 52963573760 | elapsed time per iteration (s): 0.56 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 2.567811E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.304 | TFLOPs: 43.60 | +7: iteration 101030/ 115203 | consumed samples: 25863680 | consumed tokens: 52968816640 | elapsed time per iteration (s): 0.56 | learning rate: 2.677E-05 | global batch size: 256 | lm loss: 2.562838E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.400 | TFLOPs: 43.51 | +7: iteration 101040/ 115203 | consumed samples: 25866240 | consumed tokens: 52974059520 | elapsed time per iteration (s): 0.56 | learning rate: 2.676E-05 | global batch size: 256 | lm loss: 2.556723E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.953 | TFLOPs: 43.28 | +7: iteration 101050/ 115203 | consumed samples: 25868800 | consumed tokens: 52979302400 | elapsed time per iteration (s): 0.55 | learning rate: 2.675E-05 | global batch size: 256 | lm loss: 2.587854E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.942 | TFLOPs: 44.04 | +7: iteration 101060/ 115203 | consumed samples: 25871360 | consumed tokens: 52984545280 | elapsed time per iteration (s): 0.56 | learning rate: 2.674E-05 | global batch size: 256 | lm loss: 2.559474E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.822 | TFLOPs: 43.55 | +7: iteration 101070/ 115203 | consumed samples: 25873920 | consumed tokens: 52989788160 | elapsed time per iteration (s): 0.55 | learning rate: 2.673E-05 | global batch size: 256 | lm loss: 2.572991E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.848 | TFLOPs: 44.03 | +7: iteration 101080/ 115203 | consumed samples: 25876480 | consumed tokens: 52995031040 | elapsed time per iteration (s): 0.57 | learning rate: 2.673E-05 | global batch size: 256 | lm loss: 2.565853E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.599 | TFLOPs: 43.06 | +7: iteration 101090/ 115203 | consumed samples: 25879040 | consumed tokens: 53000273920 | elapsed time per iteration (s): 0.55 | learning rate: 2.672E-05 | global batch size: 256 | lm loss: 2.566598E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.724 | TFLOPs: 44.02 | +7: iteration 101100/ 115203 | consumed samples: 25881600 | consumed tokens: 53005516800 | elapsed time per iteration (s): 0.55 | learning rate: 2.671E-05 | global batch size: 256 | lm loss: 2.564060E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.722 | TFLOPs: 44.02 | +7: iteration 101110/ 115203 | consumed samples: 25884160 | consumed tokens: 53010759680 | elapsed time per iteration (s): 0.55 | learning rate: 2.670E-05 | global batch size: 256 | lm loss: 2.569664E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 101120/ 115203 | consumed samples: 25886720 | consumed tokens: 53016002560 | elapsed time per iteration (s): 0.55 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 2.574965E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.724 | TFLOPs: 44.02 | +7: iteration 101130/ 115203 | consumed samples: 25889280 | consumed tokens: 53021245440 | elapsed time per iteration (s): 0.56 | learning rate: 2.668E-05 | global batch size: 256 | lm loss: 2.575128E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.077 | TFLOPs: 43.96 | +7: iteration 101140/ 115203 | consumed samples: 25891840 | consumed tokens: 53026488320 | elapsed time per iteration (s): 0.58 | learning rate: 2.667E-05 | global batch size: 256 | lm loss: 2.554454E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.150 | TFLOPs: 42.44 | +7: iteration 101150/ 115203 | consumed samples: 25894400 | consumed tokens: 53031731200 | elapsed time per iteration (s): 0.56 | learning rate: 2.666E-05 | global batch size: 256 | lm loss: 2.562180E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.779 | TFLOPs: 43.55 | +7: iteration 101160/ 115203 | consumed samples: 25896960 | consumed tokens: 53036974080 | elapsed time per iteration (s): 0.56 | learning rate: 2.665E-05 | global batch size: 256 | lm loss: 2.569115E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.702 | TFLOPs: 43.83 | +7: iteration 101170/ 115203 | consumed samples: 25899520 | consumed tokens: 53042216960 | elapsed time per iteration (s): 0.55 | learning rate: 2.664E-05 | global batch size: 256 | lm loss: 2.576173E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.444 | TFLOPs: 43.99 | +7: iteration 101180/ 115203 | consumed samples: 25902080 | consumed tokens: 53047459840 | elapsed time per iteration (s): 0.56 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 2.577481E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.583 | TFLOPs: 43.53 | +7: iteration 101190/ 115203 | consumed samples: 25904640 | consumed tokens: 53052702720 | elapsed time per iteration (s): 0.56 | learning rate: 2.662E-05 | global batch size: 256 | lm loss: 2.557019E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.423 | TFLOPs: 43.90 | +7: iteration 101200/ 115203 | consumed samples: 25907200 | consumed tokens: 53057945600 | elapsed time per iteration (s): 0.56 | learning rate: 2.661E-05 | global batch size: 256 | lm loss: 2.583449E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.454 | TFLOPs: 43.71 | +7: iteration 101210/ 115203 | consumed samples: 25909760 | consumed tokens: 53063188480 | elapsed time per iteration (s): 0.56 | learning rate: 2.660E-05 | global batch size: 256 | lm loss: 2.564968E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.706 | TFLOPs: 43.92 | +7: iteration 101220/ 115203 | consumed samples: 25912320 | consumed tokens: 53068431360 | elapsed time per iteration (s): 0.57 | learning rate: 2.659E-05 | global batch size: 256 | lm loss: 2.552768E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.554 | TFLOPs: 43.05 | +7: iteration 101230/ 115203 | consumed samples: 25914880 | consumed tokens: 53073674240 | elapsed time per iteration (s): 0.56 | learning rate: 2.659E-05 | global batch size: 256 | lm loss: 2.567708E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.185 | TFLOPs: 43.97 | +7: iteration 101240/ 115203 | consumed samples: 25917440 | consumed tokens: 53078917120 | elapsed time per iteration (s): 0.56 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 2.568735E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.357 | TFLOPs: 43.89 | +7: iteration 101250/ 115203 | consumed samples: 25920000 | consumed tokens: 53084160000 | elapsed time per iteration (s): 0.57 | learning rate: 2.657E-05 | global batch size: 256 | lm loss: 2.570890E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.859 | TFLOPs: 43.18 | +7: iteration 101260/ 115203 | consumed samples: 25922560 | consumed tokens: 53089402880 | elapsed time per iteration (s): 0.55 | learning rate: 2.656E-05 | global batch size: 256 | lm loss: 2.570611E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.563 | TFLOPs: 44.00 | +7: iteration 101270/ 115203 | consumed samples: 25925120 | consumed tokens: 53094645760 | elapsed time per iteration (s): 0.56 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 2.566983E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.131 | TFLOPs: 43.68 | +7: iteration 101280/ 115203 | consumed samples: 25927680 | consumed tokens: 53099888640 | elapsed time per iteration (s): 0.55 | learning rate: 2.654E-05 | global batch size: 256 | lm loss: 2.564082E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 101290/ 115203 | consumed samples: 25930240 | consumed tokens: 53105131520 | elapsed time per iteration (s): 0.56 | learning rate: 2.653E-05 | global batch size: 256 | lm loss: 2.563650E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.166 | TFLOPs: 43.30 | +7: iteration 101300/ 115203 | consumed samples: 25932800 | consumed tokens: 53110374400 | elapsed time per iteration (s): 0.55 | learning rate: 2.652E-05 | global batch size: 256 | lm loss: 2.577088E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.788 | TFLOPs: 44.03 | +7: iteration 101310/ 115203 | consumed samples: 25935360 | consumed tokens: 53115617280 | elapsed time per iteration (s): 0.56 | learning rate: 2.651E-05 | global batch size: 256 | lm loss: 2.573785E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.984 | TFLOPs: 43.57 | +7: iteration 101320/ 115203 | consumed samples: 25937920 | consumed tokens: 53120860160 | elapsed time per iteration (s): 0.55 | learning rate: 2.650E-05 | global batch size: 256 | lm loss: 2.578586E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.683 | TFLOPs: 44.02 | +7: iteration 101330/ 115203 | consumed samples: 25940480 | consumed tokens: 53126103040 | elapsed time per iteration (s): 0.55 | learning rate: 2.649E-05 | global batch size: 256 | lm loss: 2.560687E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.624 | TFLOPs: 44.01 | +7: iteration 101340/ 115203 | consumed samples: 25943040 | consumed tokens: 53131345920 | elapsed time per iteration (s): 0.56 | learning rate: 2.648E-05 | global batch size: 256 | lm loss: 2.566467E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.887 | TFLOPs: 43.85 | +7: iteration 101350/ 115203 | consumed samples: 25945600 | consumed tokens: 53136588800 | elapsed time per iteration (s): 0.56 | learning rate: 2.647E-05 | global batch size: 256 | lm loss: 2.573755E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.778 | TFLOPs: 43.55 | +7: iteration 101360/ 115203 | consumed samples: 25948160 | consumed tokens: 53141831680 | elapsed time per iteration (s): 0.55 | learning rate: 2.646E-05 | global batch size: 256 | lm loss: 2.572311E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.798 | TFLOPs: 44.03 | +7: iteration 101370/ 115203 | consumed samples: 25950720 | consumed tokens: 53147074560 | elapsed time per iteration (s): 0.56 | learning rate: 2.646E-05 | global batch size: 256 | lm loss: 2.561442E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.081 | TFLOPs: 43.58 | +7: iteration 101380/ 115203 | consumed samples: 25953280 | consumed tokens: 53152317440 | elapsed time per iteration (s): 0.56 | learning rate: 2.645E-05 | global batch size: 256 | lm loss: 2.573874E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.859 | TFLOPs: 43.46 | +7: iteration 101390/ 115203 | consumed samples: 25955840 | consumed tokens: 53157560320 | elapsed time per iteration (s): 0.56 | learning rate: 2.644E-05 | global batch size: 256 | lm loss: 2.583913E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.616 | TFLOPs: 43.82 | +7: iteration 101400/ 115203 | consumed samples: 25958400 | consumed tokens: 53162803200 | elapsed time per iteration (s): 0.56 | learning rate: 2.643E-05 | global batch size: 256 | lm loss: 2.553135E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.143 | TFLOPs: 43.96 | +7: iteration 101410/ 115203 | consumed samples: 25960960 | consumed tokens: 53168046080 | elapsed time per iteration (s): 0.57 | learning rate: 2.642E-05 | global batch size: 256 | lm loss: 2.573191E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.372 | TFLOPs: 43.13 | +7: iteration 101420/ 115203 | consumed samples: 25963520 | consumed tokens: 53173288960 | elapsed time per iteration (s): 0.56 | learning rate: 2.641E-05 | global batch size: 256 | lm loss: 2.560400E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.970 | TFLOPs: 43.57 | +7: iteration 101430/ 115203 | consumed samples: 25966080 | consumed tokens: 53178531840 | elapsed time per iteration (s): 0.56 | learning rate: 2.640E-05 | global batch size: 256 | lm loss: 2.579360E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.198 | TFLOPs: 43.21 | +7: iteration 101440/ 115203 | consumed samples: 25968640 | consumed tokens: 53183774720 | elapsed time per iteration (s): 0.56 | learning rate: 2.639E-05 | global batch size: 256 | lm loss: 2.572269E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.248 | TFLOPs: 43.59 | +7: iteration 101450/ 115203 | consumed samples: 25971200 | consumed tokens: 53189017600 | elapsed time per iteration (s): 0.56 | learning rate: 2.638E-05 | global batch size: 256 | lm loss: 2.569784E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.993 | TFLOPs: 43.76 | +7: iteration 101460/ 115203 | consumed samples: 25973760 | consumed tokens: 53194260480 | elapsed time per iteration (s): 0.55 | learning rate: 2.637E-05 | global batch size: 256 | lm loss: 2.550044E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.779 | TFLOPs: 44.03 | +7: iteration 101470/ 115203 | consumed samples: 25976320 | consumed tokens: 53199503360 | elapsed time per iteration (s): 0.56 | learning rate: 2.636E-05 | global batch size: 256 | lm loss: 2.557957E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.830 | TFLOPs: 43.74 | +7: iteration 101480/ 115203 | consumed samples: 25978880 | consumed tokens: 53204746240 | elapsed time per iteration (s): 0.56 | learning rate: 2.635E-05 | global batch size: 256 | lm loss: 2.563306E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.502 | TFLOPs: 43.43 | +7: iteration 101490/ 115203 | consumed samples: 25981440 | consumed tokens: 53209989120 | elapsed time per iteration (s): 0.55 | learning rate: 2.635E-05 | global batch size: 256 | lm loss: 2.562882E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.469 | TFLOPs: 44.00 | +7: iteration 101500/ 115203 | consumed samples: 25984000 | consumed tokens: 53215232000 | elapsed time per iteration (s): 0.57 | learning rate: 2.634E-05 | global batch size: 256 | lm loss: 2.569680E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.438 | TFLOPs: 43.14 | +7: iteration 101510/ 115203 | consumed samples: 25986560 | consumed tokens: 53220474880 | elapsed time per iteration (s): 0.55 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 2.566504E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.595 | TFLOPs: 44.01 | +7: iteration 101520/ 115203 | consumed samples: 25989120 | consumed tokens: 53225717760 | elapsed time per iteration (s): 0.56 | learning rate: 2.632E-05 | global batch size: 256 | lm loss: 2.567494E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.834 | TFLOPs: 43.84 | +7: iteration 101530/ 115203 | consumed samples: 25991680 | consumed tokens: 53230960640 | elapsed time per iteration (s): 0.57 | learning rate: 2.631E-05 | global batch size: 256 | lm loss: 2.579089E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.293 | TFLOPs: 43.12 | +7: iteration 101540/ 115203 | consumed samples: 25994240 | consumed tokens: 53236203520 | elapsed time per iteration (s): 0.55 | learning rate: 2.630E-05 | global batch size: 256 | lm loss: 2.563106E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.666 | TFLOPs: 44.01 | +7: iteration 101550/ 115203 | consumed samples: 25996800 | consumed tokens: 53241446400 | elapsed time per iteration (s): 0.55 | learning rate: 2.629E-05 | global batch size: 256 | lm loss: 2.565279E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.558 | TFLOPs: 44.00 | +7: iteration 101560/ 115203 | consumed samples: 25999360 | consumed tokens: 53246689280 | elapsed time per iteration (s): 0.56 | learning rate: 2.628E-05 | global batch size: 256 | lm loss: 2.561575E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.722 | TFLOPs: 43.73 | +7: iteration 101570/ 115203 | consumed samples: 26001920 | consumed tokens: 53251932160 | elapsed time per iteration (s): 0.56 | learning rate: 2.627E-05 | global batch size: 256 | lm loss: 2.557729E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.015 | TFLOPs: 43.57 | +7: iteration 101580/ 115203 | consumed samples: 26004480 | consumed tokens: 53257175040 | elapsed time per iteration (s): 0.56 | learning rate: 2.626E-05 | global batch size: 256 | lm loss: 2.564915E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.693 | TFLOPs: 43.64 | +7: iteration 101590/ 115203 | consumed samples: 26007040 | consumed tokens: 53262417920 | elapsed time per iteration (s): 0.55 | learning rate: 2.625E-05 | global batch size: 256 | lm loss: 2.554006E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.764 | TFLOPs: 44.02 | +7: iteration 101600/ 115203 | consumed samples: 26009600 | consumed tokens: 53267660800 | elapsed time per iteration (s): 0.56 | learning rate: 2.625E-05 | global batch size: 256 | lm loss: 2.576788E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.268 | TFLOPs: 43.88 | +7: iteration 101610/ 115203 | consumed samples: 26012160 | consumed tokens: 53272903680 | elapsed time per iteration (s): 0.56 | learning rate: 2.624E-05 | global batch size: 256 | lm loss: 2.564800E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.348 | TFLOPs: 43.60 | +7: iteration 101620/ 115203 | consumed samples: 26014720 | consumed tokens: 53278146560 | elapsed time per iteration (s): 0.56 | learning rate: 2.623E-05 | global batch size: 256 | lm loss: 2.579893E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.424 | TFLOPs: 43.71 | +7: iteration 101630/ 115203 | consumed samples: 26017280 | consumed tokens: 53283389440 | elapsed time per iteration (s): 0.59 | learning rate: 2.622E-05 | global batch size: 256 | lm loss: 2.564299E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.999 | TFLOPs: 41.38 | +7: iteration 101640/ 115203 | consumed samples: 26019840 | consumed tokens: 53288632320 | elapsed time per iteration (s): 0.56 | learning rate: 2.621E-05 | global batch size: 256 | lm loss: 2.580055E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.242 | TFLOPs: 43.69 | +7: iteration 101650/ 115203 | consumed samples: 26022400 | consumed tokens: 53293875200 | elapsed time per iteration (s): 0.56 | learning rate: 2.620E-05 | global batch size: 256 | lm loss: 2.550214E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.359 | TFLOPs: 43.70 | +7: iteration 101660/ 115203 | consumed samples: 26024960 | consumed tokens: 53299118080 | elapsed time per iteration (s): 0.56 | learning rate: 2.619E-05 | global batch size: 256 | lm loss: 2.560481E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.501 | TFLOPs: 43.43 | +7: iteration 101670/ 115203 | consumed samples: 26027520 | consumed tokens: 53304360960 | elapsed time per iteration (s): 0.55 | learning rate: 2.618E-05 | global batch size: 256 | lm loss: 2.574289E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.601 | TFLOPs: 44.01 | +7: iteration 101680/ 115203 | consumed samples: 26030080 | consumed tokens: 53309603840 | elapsed time per iteration (s): 0.55 | learning rate: 2.617E-05 | global batch size: 256 | lm loss: 2.557563E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.695 | TFLOPs: 44.02 | +7: iteration 101690/ 115203 | consumed samples: 26032640 | consumed tokens: 53314846720 | elapsed time per iteration (s): 0.55 | learning rate: 2.616E-05 | global batch size: 256 | lm loss: 2.557990E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.734 | TFLOPs: 44.02 | +7: iteration 101700/ 115203 | consumed samples: 26035200 | consumed tokens: 53320089600 | elapsed time per iteration (s): 0.56 | learning rate: 2.615E-05 | global batch size: 256 | lm loss: 2.573442E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.309 | TFLOPs: 43.41 | +7: iteration 101710/ 115203 | consumed samples: 26037760 | consumed tokens: 53325332480 | elapsed time per iteration (s): 0.55 | learning rate: 2.615E-05 | global batch size: 256 | lm loss: 2.566213E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.774 | TFLOPs: 44.03 | +7: iteration 101720/ 115203 | consumed samples: 26040320 | consumed tokens: 53330575360 | elapsed time per iteration (s): 0.55 | learning rate: 2.614E-05 | global batch size: 256 | lm loss: 2.565949E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.615 | TFLOPs: 44.01 | +7: iteration 101730/ 115203 | consumed samples: 26042880 | consumed tokens: 53335818240 | elapsed time per iteration (s): 0.57 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 2.563869E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.028 | TFLOPs: 43.10 | +7: iteration 101740/ 115203 | consumed samples: 26045440 | consumed tokens: 53341061120 | elapsed time per iteration (s): 0.55 | learning rate: 2.612E-05 | global batch size: 256 | lm loss: 2.570120E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.811 | TFLOPs: 44.03 | +7: iteration 101750/ 115203 | consumed samples: 26048000 | consumed tokens: 53346304000 | elapsed time per iteration (s): 0.56 | learning rate: 2.611E-05 | global batch size: 256 | lm loss: 2.574874E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.851 | TFLOPs: 43.37 | +7: iteration 101760/ 115203 | consumed samples: 26050560 | consumed tokens: 53351546880 | elapsed time per iteration (s): 0.57 | learning rate: 2.610E-05 | global batch size: 256 | lm loss: 2.562996E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.425 | TFLOPs: 42.94 | +7: iteration 101770/ 115203 | consumed samples: 26053120 | consumed tokens: 53356789760 | elapsed time per iteration (s): 0.56 | learning rate: 2.609E-05 | global batch size: 256 | lm loss: 2.570508E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.135 | TFLOPs: 43.30 | +7: iteration 101780/ 115203 | consumed samples: 26055680 | consumed tokens: 53362032640 | elapsed time per iteration (s): 0.57 | learning rate: 2.608E-05 | global batch size: 256 | lm loss: 2.561897E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.331 | TFLOPs: 43.03 | +7: iteration 101790/ 115203 | consumed samples: 26058240 | consumed tokens: 53367275520 | elapsed time per iteration (s): 0.56 | learning rate: 2.607E-05 | global batch size: 256 | lm loss: 2.561812E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.669 | TFLOPs: 43.54 | +7: iteration 101800/ 115203 | consumed samples: 26060800 | consumed tokens: 53372518400 | elapsed time per iteration (s): 0.58 | learning rate: 2.606E-05 | global batch size: 256 | lm loss: 2.567718E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.607 | TFLOPs: 42.20 | +7: iteration 101810/ 115203 | consumed samples: 26063360 | consumed tokens: 53377761280 | elapsed time per iteration (s): 0.57 | learning rate: 2.606E-05 | global batch size: 256 | lm loss: 2.566478E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.076 | TFLOPs: 43.01 | +7: iteration 101820/ 115203 | consumed samples: 26065920 | consumed tokens: 53383004160 | elapsed time per iteration (s): 0.56 | learning rate: 2.605E-05 | global batch size: 256 | lm loss: 2.557888E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.096 | TFLOPs: 43.39 | +7: iteration 101830/ 115203 | consumed samples: 26068480 | consumed tokens: 53388247040 | elapsed time per iteration (s): 0.57 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 2.551758E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.469 | TFLOPs: 42.47 | +7: iteration 101840/ 115203 | consumed samples: 26071040 | consumed tokens: 53393489920 | elapsed time per iteration (s): 0.56 | learning rate: 2.603E-05 | global batch size: 256 | lm loss: 2.569722E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.130 | TFLOPs: 43.49 | +7: iteration 101850/ 115203 | consumed samples: 26073600 | consumed tokens: 53398732800 | elapsed time per iteration (s): 0.55 | learning rate: 2.602E-05 | global batch size: 256 | lm loss: 2.571035E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.453 | TFLOPs: 43.99 | +7: iteration 101860/ 115203 | consumed samples: 26076160 | consumed tokens: 53403975680 | elapsed time per iteration (s): 0.55 | learning rate: 2.601E-05 | global batch size: 256 | lm loss: 2.566514E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.592 | TFLOPs: 44.01 | +7: iteration 101870/ 115203 | consumed samples: 26078720 | consumed tokens: 53409218560 | elapsed time per iteration (s): 0.57 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 2.578696E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.822 | TFLOPs: 43.08 | +7: iteration 101880/ 115203 | consumed samples: 26081280 | consumed tokens: 53414461440 | elapsed time per iteration (s): 0.56 | learning rate: 2.599E-05 | global batch size: 256 | lm loss: 2.565600E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.095 | TFLOPs: 43.58 | +7: iteration 101890/ 115203 | consumed samples: 26083840 | consumed tokens: 53419704320 | elapsed time per iteration (s): 0.56 | learning rate: 2.598E-05 | global batch size: 256 | lm loss: 2.564081E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.747 | TFLOPs: 43.45 | +7: iteration 101900/ 115203 | consumed samples: 26086400 | consumed tokens: 53424947200 | elapsed time per iteration (s): 0.56 | learning rate: 2.598E-05 | global batch size: 256 | lm loss: 2.562173E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.901 | TFLOPs: 43.27 | +7: iteration 101910/ 115203 | consumed samples: 26088960 | consumed tokens: 53430190080 | elapsed time per iteration (s): 0.56 | learning rate: 2.597E-05 | global batch size: 256 | lm loss: 2.562999E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.212 | TFLOPs: 43.49 | +7: iteration 101920/ 115203 | consumed samples: 26091520 | consumed tokens: 53435432960 | elapsed time per iteration (s): 0.56 | learning rate: 2.596E-05 | global batch size: 256 | lm loss: 2.565364E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.120 | TFLOPs: 43.30 | +7: iteration 101930/ 115203 | consumed samples: 26094080 | consumed tokens: 53440675840 | elapsed time per iteration (s): 0.57 | learning rate: 2.595E-05 | global batch size: 256 | lm loss: 2.565367E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.214 | TFLOPs: 42.83 | +7: iteration 101940/ 115203 | consumed samples: 26096640 | consumed tokens: 53445918720 | elapsed time per iteration (s): 0.56 | learning rate: 2.594E-05 | global batch size: 256 | lm loss: 2.557318E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.061 | TFLOPs: 43.58 | +7: iteration 101950/ 115203 | consumed samples: 26099200 | consumed tokens: 53451161600 | elapsed time per iteration (s): 0.56 | learning rate: 2.593E-05 | global batch size: 256 | lm loss: 2.560252E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.121 | TFLOPs: 43.58 | +7: iteration 101960/ 115203 | consumed samples: 26101760 | consumed tokens: 53456404480 | elapsed time per iteration (s): 0.56 | learning rate: 2.592E-05 | global batch size: 256 | lm loss: 2.557597E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.370 | TFLOPs: 43.22 | +7: iteration 101970/ 115203 | consumed samples: 26104320 | consumed tokens: 53461647360 | elapsed time per iteration (s): 0.57 | learning rate: 2.591E-05 | global batch size: 256 | lm loss: 2.556580E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.747 | TFLOPs: 42.97 | +7: iteration 101980/ 115203 | consumed samples: 26106880 | consumed tokens: 53466890240 | elapsed time per iteration (s): 0.56 | learning rate: 2.590E-05 | global batch size: 256 | lm loss: 2.558352E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.175 | TFLOPs: 43.40 | +7: iteration 101990/ 115203 | consumed samples: 26109440 | consumed tokens: 53472133120 | elapsed time per iteration (s): 0.57 | learning rate: 2.590E-05 | global batch size: 256 | lm loss: 2.554197E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.377 | TFLOPs: 43.13 | +0: [2023-03-17 04:50:18,205] [INFO] [logging.py:68:log_dist] [Rank 0] step=102000, skipped=0, lr=[2.5887309996453706e-05, 2.5887309996453706e-05, 2.5887309996453706e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 102000/ 115203 | consumed samples: 26112000 | consumed tokens: 53477376000 | elapsed time per iteration (s): 0.56 | learning rate: 2.589E-05 | global batch size: 256 | lm loss: 2.561206E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.548 | TFLOPs: 43.43 | +0: steps: 102000 loss: 2.5478 iter time (s): 0.560 samples/sec: 457.485 +7: iteration 102010/ 115203 | consumed samples: 26114560 | consumed tokens: 53482618880 | elapsed time per iteration (s): 0.57 | learning rate: 2.588E-05 | global batch size: 256 | lm loss: 2.573495E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.907 | TFLOPs: 42.61 | +7: iteration 102020/ 115203 | consumed samples: 26117120 | consumed tokens: 53487861760 | elapsed time per iteration (s): 0.56 | learning rate: 2.587E-05 | global batch size: 256 | lm loss: 2.564004E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.124 | TFLOPs: 43.39 | +7: iteration 102030/ 115203 | consumed samples: 26119680 | consumed tokens: 53493104640 | elapsed time per iteration (s): 0.59 | learning rate: 2.586E-05 | global batch size: 256 | lm loss: 2.573728E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.052 | TFLOPs: 41.38 | +7: iteration 102040/ 115203 | consumed samples: 26122240 | consumed tokens: 53498347520 | elapsed time per iteration (s): 0.56 | learning rate: 2.585E-05 | global batch size: 256 | lm loss: 2.559859E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.173 | TFLOPs: 43.21 | +7: iteration 102050/ 115203 | consumed samples: 26124800 | consumed tokens: 53503590400 | elapsed time per iteration (s): 0.57 | learning rate: 2.584E-05 | global batch size: 256 | lm loss: 2.555668E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.326 | TFLOPs: 42.84 | +7: iteration 102060/ 115203 | consumed samples: 26127360 | consumed tokens: 53508833280 | elapsed time per iteration (s): 0.56 | learning rate: 2.583E-05 | global batch size: 256 | lm loss: 2.555546E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.891 | TFLOPs: 43.56 | +7: iteration 102070/ 115203 | consumed samples: 26129920 | consumed tokens: 53514076160 | elapsed time per iteration (s): 0.56 | learning rate: 2.583E-05 | global batch size: 256 | lm loss: 2.560487E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.740 | TFLOPs: 43.26 | +7: iteration 102080/ 115203 | consumed samples: 26132480 | consumed tokens: 53519319040 | elapsed time per iteration (s): 0.55 | learning rate: 2.582E-05 | global batch size: 256 | lm loss: 2.573977E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.627 | TFLOPs: 44.01 | +7: iteration 102090/ 115203 | consumed samples: 26135040 | consumed tokens: 53524561920 | elapsed time per iteration (s): 0.56 | learning rate: 2.581E-05 | global batch size: 256 | lm loss: 2.563338E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.184 | TFLOPs: 43.59 | +7: iteration 102100/ 115203 | consumed samples: 26137600 | consumed tokens: 53529804800 | elapsed time per iteration (s): 0.58 | learning rate: 2.580E-05 | global batch size: 256 | lm loss: 2.572381E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.991 | TFLOPs: 42.14 | +7: iteration 102110/ 115203 | consumed samples: 26140160 | consumed tokens: 53535047680 | elapsed time per iteration (s): 0.58 | learning rate: 2.579E-05 | global batch size: 256 | lm loss: 2.566156E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.342 | TFLOPs: 42.36 | +7: iteration 102120/ 115203 | consumed samples: 26142720 | consumed tokens: 53540290560 | elapsed time per iteration (s): 0.56 | learning rate: 2.578E-05 | global batch size: 256 | lm loss: 2.569936E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.487 | TFLOPs: 43.71 | +7: iteration 102130/ 115203 | consumed samples: 26145280 | consumed tokens: 53545533440 | elapsed time per iteration (s): 0.56 | learning rate: 2.577E-05 | global batch size: 256 | lm loss: 2.567373E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.023 | TFLOPs: 43.57 | +7: iteration 102140/ 115203 | consumed samples: 26147840 | consumed tokens: 53550776320 | elapsed time per iteration (s): 0.57 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 2.559922E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.499 | TFLOPs: 43.05 | +7: iteration 102150/ 115203 | consumed samples: 26150400 | consumed tokens: 53556019200 | elapsed time per iteration (s): 0.56 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 2.569772E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.990 | TFLOPs: 43.57 | +7: iteration 102160/ 115203 | consumed samples: 26152960 | consumed tokens: 53561262080 | elapsed time per iteration (s): 0.56 | learning rate: 2.575E-05 | global batch size: 256 | lm loss: 2.582153E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.237 | TFLOPs: 43.40 | +7: iteration 102170/ 115203 | consumed samples: 26155520 | consumed tokens: 53566504960 | elapsed time per iteration (s): 0.57 | learning rate: 2.574E-05 | global batch size: 256 | lm loss: 2.570763E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.797 | TFLOPs: 43.07 | +7: iteration 102180/ 115203 | consumed samples: 26158080 | consumed tokens: 53571747840 | elapsed time per iteration (s): 0.56 | learning rate: 2.573E-05 | global batch size: 256 | lm loss: 2.570385E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.978 | TFLOPs: 43.95 | +7: iteration 102190/ 115203 | consumed samples: 26160640 | consumed tokens: 53576990720 | elapsed time per iteration (s): 0.57 | learning rate: 2.572E-05 | global batch size: 256 | lm loss: 2.565377E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.317 | TFLOPs: 42.74 | +7: iteration 102200/ 115203 | consumed samples: 26163200 | consumed tokens: 53582233600 | elapsed time per iteration (s): 0.56 | learning rate: 2.571E-05 | global batch size: 256 | lm loss: 2.573730E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.190 | TFLOPs: 43.78 | +7: iteration 102210/ 115203 | consumed samples: 26165760 | consumed tokens: 53587476480 | elapsed time per iteration (s): 0.58 | learning rate: 2.570E-05 | global batch size: 256 | lm loss: 2.571587E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.339 | TFLOPs: 42.36 | +7: iteration 102220/ 115203 | consumed samples: 26168320 | consumed tokens: 53592719360 | elapsed time per iteration (s): 0.56 | learning rate: 2.569E-05 | global batch size: 256 | lm loss: 2.564829E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.160 | TFLOPs: 43.97 | +7: iteration 102230/ 115203 | consumed samples: 26170880 | consumed tokens: 53597962240 | elapsed time per iteration (s): 0.55 | learning rate: 2.569E-05 | global batch size: 256 | lm loss: 2.565079E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.390 | TFLOPs: 43.99 | +7: iteration 102240/ 115203 | consumed samples: 26173440 | consumed tokens: 53603205120 | elapsed time per iteration (s): 0.56 | learning rate: 2.568E-05 | global batch size: 256 | lm loss: 2.563767E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.840 | TFLOPs: 43.55 | +7: iteration 102250/ 115203 | consumed samples: 26176000 | consumed tokens: 53608448000 | elapsed time per iteration (s): 0.56 | learning rate: 2.567E-05 | global batch size: 256 | lm loss: 2.558157E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.060 | TFLOPs: 43.58 | +7: iteration 102260/ 115203 | consumed samples: 26178560 | consumed tokens: 53613690880 | elapsed time per iteration (s): 0.57 | learning rate: 2.566E-05 | global batch size: 256 | lm loss: 2.570815E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.313 | TFLOPs: 43.03 | +7: iteration 102270/ 115203 | consumed samples: 26181120 | consumed tokens: 53618933760 | elapsed time per iteration (s): 0.56 | learning rate: 2.565E-05 | global batch size: 256 | lm loss: 2.562781E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.248 | TFLOPs: 43.31 | +7: iteration 102280/ 115203 | consumed samples: 26183680 | consumed tokens: 53624176640 | elapsed time per iteration (s): 0.57 | learning rate: 2.564E-05 | global batch size: 256 | lm loss: 2.562085E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.428 | TFLOPs: 43.04 | +7: iteration 102290/ 115203 | consumed samples: 26186240 | consumed tokens: 53629419520 | elapsed time per iteration (s): 0.56 | learning rate: 2.563E-05 | global batch size: 256 | lm loss: 2.558017E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.911 | TFLOPs: 43.37 | +7: iteration 102300/ 115203 | consumed samples: 26188800 | consumed tokens: 53634662400 | elapsed time per iteration (s): 0.56 | learning rate: 2.563E-05 | global batch size: 256 | lm loss: 2.571031E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.192 | TFLOPs: 43.49 | +7: iteration 102310/ 115203 | consumed samples: 26191360 | consumed tokens: 53639905280 | elapsed time per iteration (s): 0.56 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 2.554103E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.437 | TFLOPs: 43.42 | +7: iteration 102320/ 115203 | consumed samples: 26193920 | consumed tokens: 53645148160 | elapsed time per iteration (s): 0.57 | learning rate: 2.561E-05 | global batch size: 256 | lm loss: 2.562775E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.112 | TFLOPs: 42.82 | +7: iteration 102330/ 115203 | consumed samples: 26196480 | consumed tokens: 53650391040 | elapsed time per iteration (s): 0.56 | learning rate: 2.560E-05 | global batch size: 256 | lm loss: 2.559678E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.222 | TFLOPs: 43.97 | +7: iteration 102340/ 115203 | consumed samples: 26199040 | consumed tokens: 53655633920 | elapsed time per iteration (s): 0.57 | learning rate: 2.559E-05 | global batch size: 256 | lm loss: 2.555751E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.126 | TFLOPs: 43.01 | +7: iteration 102350/ 115203 | consumed samples: 26201600 | consumed tokens: 53660876800 | elapsed time per iteration (s): 0.57 | learning rate: 2.558E-05 | global batch size: 256 | lm loss: 2.563077E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.970 | TFLOPs: 43.19 | +7: iteration 102360/ 115203 | consumed samples: 26204160 | consumed tokens: 53666119680 | elapsed time per iteration (s): 0.57 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 2.572213E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.334 | TFLOPs: 42.93 | +7: iteration 102370/ 115203 | consumed samples: 26206720 | consumed tokens: 53671362560 | elapsed time per iteration (s): 0.56 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 2.567474E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.816 | TFLOPs: 43.46 | +7: iteration 102380/ 115203 | consumed samples: 26209280 | consumed tokens: 53676605440 | elapsed time per iteration (s): 0.57 | learning rate: 2.556E-05 | global batch size: 256 | lm loss: 2.564249E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.581 | TFLOPs: 43.05 | +7: iteration 102390/ 115203 | consumed samples: 26211840 | consumed tokens: 53681848320 | elapsed time per iteration (s): 0.58 | learning rate: 2.555E-05 | global batch size: 256 | lm loss: 2.567672E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.363 | TFLOPs: 42.37 | +7: iteration 102400/ 115203 | consumed samples: 26214400 | consumed tokens: 53687091200 | elapsed time per iteration (s): 0.57 | learning rate: 2.554E-05 | global batch size: 256 | lm loss: 2.563627E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.742 | TFLOPs: 43.07 | +7: iteration 102410/ 115203 | consumed samples: 26216960 | consumed tokens: 53692334080 | elapsed time per iteration (s): 0.58 | learning rate: 2.553E-05 | global batch size: 256 | lm loss: 2.553682E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.067 | TFLOPs: 41.76 | +7: iteration 102420/ 115203 | consumed samples: 26219520 | consumed tokens: 53697576960 | elapsed time per iteration (s): 0.57 | learning rate: 2.552E-05 | global batch size: 256 | lm loss: 2.575060E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.164 | TFLOPs: 42.63 | +7: iteration 102430/ 115203 | consumed samples: 26222080 | consumed tokens: 53702819840 | elapsed time per iteration (s): 0.56 | learning rate: 2.551E-05 | global batch size: 256 | lm loss: 2.579410E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.694 | TFLOPs: 43.45 | +7: iteration 102440/ 115203 | consumed samples: 26224640 | consumed tokens: 53708062720 | elapsed time per iteration (s): 0.58 | learning rate: 2.551E-05 | global batch size: 256 | lm loss: 2.567665E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.208 | TFLOPs: 42.45 | +7: iteration 102450/ 115203 | consumed samples: 26227200 | consumed tokens: 53713305600 | elapsed time per iteration (s): 0.58 | learning rate: 2.550E-05 | global batch size: 256 | lm loss: 2.575789E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.732 | TFLOPs: 42.02 | +7: iteration 102460/ 115203 | consumed samples: 26229760 | consumed tokens: 53718548480 | elapsed time per iteration (s): 0.57 | learning rate: 2.549E-05 | global batch size: 256 | lm loss: 2.568715E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.913 | TFLOPs: 42.99 | +7: iteration 102470/ 115203 | consumed samples: 26232320 | consumed tokens: 53723791360 | elapsed time per iteration (s): 0.57 | learning rate: 2.548E-05 | global batch size: 256 | lm loss: 2.574517E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.888 | TFLOPs: 42.61 | +7: iteration 102480/ 115203 | consumed samples: 26234880 | consumed tokens: 53729034240 | elapsed time per iteration (s): 0.56 | learning rate: 2.547E-05 | global batch size: 256 | lm loss: 2.549482E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.123 | TFLOPs: 43.49 | +7: iteration 102490/ 115203 | consumed samples: 26237440 | consumed tokens: 53734277120 | elapsed time per iteration (s): 0.56 | learning rate: 2.546E-05 | global batch size: 256 | lm loss: 2.559781E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.521 | TFLOPs: 43.24 | +7: iteration 102500/ 115203 | consumed samples: 26240000 | consumed tokens: 53739520000 | elapsed time per iteration (s): 0.56 | learning rate: 2.545E-05 | global batch size: 256 | lm loss: 2.570396E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.429 | TFLOPs: 43.42 | +7: iteration 102510/ 115203 | consumed samples: 26242560 | consumed tokens: 53744762880 | elapsed time per iteration (s): 0.56 | learning rate: 2.545E-05 | global batch size: 256 | lm loss: 2.574721E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.418 | TFLOPs: 43.71 | +7: iteration 102520/ 115203 | consumed samples: 26245120 | consumed tokens: 53750005760 | elapsed time per iteration (s): 0.57 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 2.568064E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.315 | TFLOPs: 42.93 | +7: iteration 102530/ 115203 | consumed samples: 26247680 | consumed tokens: 53755248640 | elapsed time per iteration (s): 0.57 | learning rate: 2.543E-05 | global batch size: 256 | lm loss: 2.569425E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.516 | TFLOPs: 43.14 | +7: iteration 102540/ 115203 | consumed samples: 26250240 | consumed tokens: 53760491520 | elapsed time per iteration (s): 0.58 | learning rate: 2.542E-05 | global batch size: 256 | lm loss: 2.575960E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.187 | TFLOPs: 42.35 | +7: iteration 102550/ 115203 | consumed samples: 26252800 | consumed tokens: 53765734400 | elapsed time per iteration (s): 0.57 | learning rate: 2.541E-05 | global batch size: 256 | lm loss: 2.560614E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.157 | TFLOPs: 42.92 | +7: iteration 102560/ 115203 | consumed samples: 26255360 | consumed tokens: 53770977280 | elapsed time per iteration (s): 0.57 | learning rate: 2.540E-05 | global batch size: 256 | lm loss: 2.558722E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.222 | TFLOPs: 43.02 | +7: iteration 102570/ 115203 | consumed samples: 26257920 | consumed tokens: 53776220160 | elapsed time per iteration (s): 0.56 | learning rate: 2.540E-05 | global batch size: 256 | lm loss: 2.560163E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.471 | TFLOPs: 43.81 | +7: iteration 102580/ 115203 | consumed samples: 26260480 | consumed tokens: 53781463040 | elapsed time per iteration (s): 0.56 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 2.550507E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.921 | TFLOPs: 43.56 | +7: iteration 102590/ 115203 | consumed samples: 26263040 | consumed tokens: 53786705920 | elapsed time per iteration (s): 0.57 | learning rate: 2.538E-05 | global batch size: 256 | lm loss: 2.569968E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.347 | TFLOPs: 42.75 | +7: iteration 102600/ 115203 | consumed samples: 26265600 | consumed tokens: 53791948800 | elapsed time per iteration (s): 0.57 | learning rate: 2.537E-05 | global batch size: 256 | lm loss: 2.562288E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.078 | TFLOPs: 43.01 | +7: iteration 102610/ 115203 | consumed samples: 26268160 | consumed tokens: 53797191680 | elapsed time per iteration (s): 0.57 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 2.570682E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.860 | TFLOPs: 43.18 | +7: iteration 102620/ 115203 | consumed samples: 26270720 | consumed tokens: 53802434560 | elapsed time per iteration (s): 0.56 | learning rate: 2.535E-05 | global batch size: 256 | lm loss: 2.571407E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.753 | TFLOPs: 43.55 | +7: iteration 102630/ 115203 | consumed samples: 26273280 | consumed tokens: 53807677440 | elapsed time per iteration (s): 0.57 | learning rate: 2.534E-05 | global batch size: 256 | lm loss: 2.570402E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.739 | TFLOPs: 43.07 | +7: iteration 102640/ 115203 | consumed samples: 26275840 | consumed tokens: 53812920320 | elapsed time per iteration (s): 0.57 | learning rate: 2.534E-05 | global batch size: 256 | lm loss: 2.566075E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.283 | TFLOPs: 43.12 | +7: iteration 102650/ 115203 | consumed samples: 26278400 | consumed tokens: 53818163200 | elapsed time per iteration (s): 0.56 | learning rate: 2.533E-05 | global batch size: 256 | lm loss: 2.557314E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.096 | TFLOPs: 43.58 | +7: iteration 102660/ 115203 | consumed samples: 26280960 | consumed tokens: 53823406080 | elapsed time per iteration (s): 0.57 | learning rate: 2.532E-05 | global batch size: 256 | lm loss: 2.564276E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.420 | TFLOPs: 42.94 | +7: iteration 102670/ 115203 | consumed samples: 26283520 | consumed tokens: 53828648960 | elapsed time per iteration (s): 0.59 | learning rate: 2.531E-05 | global batch size: 256 | lm loss: 2.559259E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.109 | TFLOPs: 41.20 | +7: iteration 102680/ 115203 | consumed samples: 26286080 | consumed tokens: 53833891840 | elapsed time per iteration (s): 0.58 | learning rate: 2.530E-05 | global batch size: 256 | lm loss: 2.555627E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.615 | TFLOPs: 42.29 | +7: iteration 102690/ 115203 | consumed samples: 26288640 | consumed tokens: 53839134720 | elapsed time per iteration (s): 0.58 | learning rate: 2.529E-05 | global batch size: 256 | lm loss: 2.568355E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.916 | TFLOPs: 41.75 | +7: iteration 102700/ 115203 | consumed samples: 26291200 | consumed tokens: 53844377600 | elapsed time per iteration (s): 0.60 | learning rate: 2.529E-05 | global batch size: 256 | lm loss: 2.558274E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.138 | TFLOPs: 40.63 | +7: iteration 102710/ 115203 | consumed samples: 26293760 | consumed tokens: 53849620480 | elapsed time per iteration (s): 0.58 | learning rate: 2.528E-05 | global batch size: 256 | lm loss: 2.574860E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.669 | TFLOPs: 41.82 | +7: iteration 102720/ 115203 | consumed samples: 26296320 | consumed tokens: 53854863360 | elapsed time per iteration (s): 0.58 | learning rate: 2.527E-05 | global batch size: 256 | lm loss: 2.559711E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.490 | TFLOPs: 42.28 | +7: iteration 102730/ 115203 | consumed samples: 26298880 | consumed tokens: 53860106240 | elapsed time per iteration (s): 0.58 | learning rate: 2.526E-05 | global batch size: 256 | lm loss: 2.559867E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.972 | TFLOPs: 42.04 | +7: iteration 102740/ 115203 | consumed samples: 26301440 | consumed tokens: 53865349120 | elapsed time per iteration (s): 0.56 | learning rate: 2.525E-05 | global batch size: 256 | lm loss: 2.564127E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.678 | TFLOPs: 43.25 | +7: iteration 102750/ 115203 | consumed samples: 26304000 | consumed tokens: 53870592000 | elapsed time per iteration (s): 0.58 | learning rate: 2.524E-05 | global batch size: 256 | lm loss: 2.576669E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.095 | TFLOPs: 41.96 | +7: iteration 102760/ 115203 | consumed samples: 26306560 | consumed tokens: 53875834880 | elapsed time per iteration (s): 0.57 | learning rate: 2.524E-05 | global batch size: 256 | lm loss: 2.556992E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.263 | TFLOPs: 42.74 | +7: iteration 102770/ 115203 | consumed samples: 26309120 | consumed tokens: 53881077760 | elapsed time per iteration (s): 0.58 | learning rate: 2.523E-05 | global batch size: 256 | lm loss: 2.543881E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.514 | TFLOPs: 42.09 | +7: iteration 102780/ 115203 | consumed samples: 26311680 | consumed tokens: 53886320640 | elapsed time per iteration (s): 0.57 | learning rate: 2.522E-05 | global batch size: 256 | lm loss: 2.561590E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.173 | TFLOPs: 43.01 | +7: iteration 102790/ 115203 | consumed samples: 26314240 | consumed tokens: 53891563520 | elapsed time per iteration (s): 0.58 | learning rate: 2.521E-05 | global batch size: 256 | lm loss: 2.574895E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.986 | TFLOPs: 42.04 | +7: iteration 102800/ 115203 | consumed samples: 26316800 | consumed tokens: 53896806400 | elapsed time per iteration (s): 0.57 | learning rate: 2.520E-05 | global batch size: 256 | lm loss: 2.567262E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.776 | TFLOPs: 42.79 | +7: iteration 102810/ 115203 | consumed samples: 26319360 | consumed tokens: 53902049280 | elapsed time per iteration (s): 0.57 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 2.568974E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.556 | TFLOPs: 42.96 | +7: iteration 102820/ 115203 | consumed samples: 26321920 | consumed tokens: 53907292160 | elapsed time per iteration (s): 0.57 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 2.562261E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.071 | TFLOPs: 42.72 | +7: iteration 102830/ 115203 | consumed samples: 26324480 | consumed tokens: 53912535040 | elapsed time per iteration (s): 0.58 | learning rate: 2.518E-05 | global batch size: 256 | lm loss: 2.568150E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.496 | TFLOPs: 42.00 | +7: iteration 102840/ 115203 | consumed samples: 26327040 | consumed tokens: 53917777920 | elapsed time per iteration (s): 0.57 | learning rate: 2.517E-05 | global batch size: 256 | lm loss: 2.569129E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.201 | TFLOPs: 43.02 | +7: iteration 102850/ 115203 | consumed samples: 26329600 | consumed tokens: 53923020800 | elapsed time per iteration (s): 0.58 | learning rate: 2.516E-05 | global batch size: 256 | lm loss: 2.569996E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.407 | TFLOPs: 42.37 | +7: iteration 102860/ 115203 | consumed samples: 26332160 | consumed tokens: 53928263680 | elapsed time per iteration (s): 0.57 | learning rate: 2.515E-05 | global batch size: 256 | lm loss: 2.570209E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.311 | TFLOPs: 43.03 | +7: iteration 102870/ 115203 | consumed samples: 26334720 | consumed tokens: 53933506560 | elapsed time per iteration (s): 0.57 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 2.567594E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.040 | TFLOPs: 43.10 | +7: iteration 102880/ 115203 | consumed samples: 26337280 | consumed tokens: 53938749440 | elapsed time per iteration (s): 0.57 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 2.570270E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.342 | TFLOPs: 42.84 | +7: iteration 102890/ 115203 | consumed samples: 26339840 | consumed tokens: 53943992320 | elapsed time per iteration (s): 0.58 | learning rate: 2.513E-05 | global batch size: 256 | lm loss: 2.552030E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.986 | TFLOPs: 42.42 | +7: iteration 102900/ 115203 | consumed samples: 26342400 | consumed tokens: 53949235200 | elapsed time per iteration (s): 0.57 | learning rate: 2.512E-05 | global batch size: 256 | lm loss: 2.570951E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.973 | TFLOPs: 42.80 | +7: iteration 102910/ 115203 | consumed samples: 26344960 | consumed tokens: 53954478080 | elapsed time per iteration (s): 0.58 | learning rate: 2.511E-05 | global batch size: 256 | lm loss: 2.559867E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.032 | TFLOPs: 42.05 | +7: iteration 102920/ 115203 | consumed samples: 26347520 | consumed tokens: 53959720960 | elapsed time per iteration (s): 0.58 | learning rate: 2.510E-05 | global batch size: 256 | lm loss: 2.553561E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.020 | TFLOPs: 42.43 | +7: iteration 102930/ 115203 | consumed samples: 26350080 | consumed tokens: 53964963840 | elapsed time per iteration (s): 0.58 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 2.558286E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.883 | TFLOPs: 42.03 | +7: iteration 102940/ 115203 | consumed samples: 26352640 | consumed tokens: 53970206720 | elapsed time per iteration (s): 0.57 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 2.568321E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.206 | TFLOPs: 43.02 | +7: iteration 102950/ 115203 | consumed samples: 26355200 | consumed tokens: 53975449600 | elapsed time per iteration (s): 0.57 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 2.562556E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.241 | TFLOPs: 42.64 | +7: iteration 102960/ 115203 | consumed samples: 26357760 | consumed tokens: 53980692480 | elapsed time per iteration (s): 0.58 | learning rate: 2.507E-05 | global batch size: 256 | lm loss: 2.557550E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.751 | TFLOPs: 42.31 | +7: iteration 102970/ 115203 | consumed samples: 26360320 | consumed tokens: 53985935360 | elapsed time per iteration (s): 0.58 | learning rate: 2.506E-05 | global batch size: 256 | lm loss: 2.574901E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.688 | TFLOPs: 41.73 | +7: iteration 102980/ 115203 | consumed samples: 26362880 | consumed tokens: 53991178240 | elapsed time per iteration (s): 0.58 | learning rate: 2.505E-05 | global batch size: 256 | lm loss: 2.552032E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.700 | TFLOPs: 42.30 | +7: iteration 102990/ 115203 | consumed samples: 26365440 | consumed tokens: 53996421120 | elapsed time per iteration (s): 0.58 | learning rate: 2.505E-05 | global batch size: 256 | lm loss: 2.566544E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.991 | TFLOPs: 42.14 | +7: iteration 103000/ 115203 | consumed samples: 26368000 | consumed tokens: 54001664000 | elapsed time per iteration (s): 0.57 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 2.577870E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.518 | TFLOPs: 42.57 | +7: iteration 103010/ 115203 | consumed samples: 26370560 | consumed tokens: 54006906880 | elapsed time per iteration (s): 0.57 | learning rate: 2.503E-05 | global batch size: 256 | lm loss: 2.564802E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.461 | TFLOPs: 42.47 | +7: iteration 103020/ 115203 | consumed samples: 26373120 | consumed tokens: 54012149760 | elapsed time per iteration (s): 0.59 | learning rate: 2.502E-05 | global batch size: 256 | lm loss: 2.570145E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.961 | TFLOPs: 41.37 | +7: iteration 103030/ 115203 | consumed samples: 26375680 | consumed tokens: 54017392640 | elapsed time per iteration (s): 0.57 | learning rate: 2.501E-05 | global batch size: 256 | lm loss: 2.565680E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.503 | TFLOPs: 43.05 | +7: iteration 103040/ 115203 | consumed samples: 26378240 | consumed tokens: 54022635520 | elapsed time per iteration (s): 0.56 | learning rate: 2.500E-05 | global batch size: 256 | lm loss: 2.560564E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.563 | TFLOPs: 43.34 | +7: iteration 103050/ 115203 | consumed samples: 26380800 | consumed tokens: 54027878400 | elapsed time per iteration (s): 0.58 | learning rate: 2.500E-05 | global batch size: 256 | lm loss: 2.553381E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.293 | TFLOPs: 41.98 | +7: iteration 103060/ 115203 | consumed samples: 26383360 | consumed tokens: 54033121280 | elapsed time per iteration (s): 0.56 | learning rate: 2.499E-05 | global batch size: 256 | lm loss: 2.565140E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.910 | TFLOPs: 43.28 | +7: iteration 103070/ 115203 | consumed samples: 26385920 | consumed tokens: 54038364160 | elapsed time per iteration (s): 0.58 | learning rate: 2.498E-05 | global batch size: 256 | lm loss: 2.563566E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.651 | TFLOPs: 42.39 | +7: iteration 103080/ 115203 | consumed samples: 26388480 | consumed tokens: 54043607040 | elapsed time per iteration (s): 0.57 | learning rate: 2.497E-05 | global batch size: 256 | lm loss: 2.567954E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.795 | TFLOPs: 42.69 | +7: iteration 103090/ 115203 | consumed samples: 26391040 | consumed tokens: 54048849920 | elapsed time per iteration (s): 0.56 | learning rate: 2.496E-05 | global batch size: 256 | lm loss: 2.569597E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.042 | TFLOPs: 43.29 | +7: iteration 103100/ 115203 | consumed samples: 26393600 | consumed tokens: 54054092800 | elapsed time per iteration (s): 0.57 | learning rate: 2.496E-05 | global batch size: 256 | lm loss: 2.562935E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.792 | TFLOPs: 42.79 | +7: iteration 103110/ 115203 | consumed samples: 26396160 | consumed tokens: 54059335680 | elapsed time per iteration (s): 0.58 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 2.557724E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.985 | TFLOPs: 42.14 | +7: iteration 103120/ 115203 | consumed samples: 26398720 | consumed tokens: 54064578560 | elapsed time per iteration (s): 0.59 | learning rate: 2.494E-05 | global batch size: 256 | lm loss: 2.574518E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.507 | TFLOPs: 41.33 | +7: iteration 103130/ 115203 | consumed samples: 26401280 | consumed tokens: 54069821440 | elapsed time per iteration (s): 0.57 | learning rate: 2.493E-05 | global batch size: 256 | lm loss: 2.573992E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.111 | TFLOPs: 42.53 | +7: iteration 103140/ 115203 | consumed samples: 26403840 | consumed tokens: 54075064320 | elapsed time per iteration (s): 0.57 | learning rate: 2.492E-05 | global batch size: 256 | lm loss: 2.559551E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.836 | TFLOPs: 43.08 | +7: iteration 103150/ 115203 | consumed samples: 26406400 | consumed tokens: 54080307200 | elapsed time per iteration (s): 0.58 | learning rate: 2.492E-05 | global batch size: 256 | lm loss: 2.569946E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.846 | TFLOPs: 42.32 | +7: iteration 103160/ 115203 | consumed samples: 26408960 | consumed tokens: 54085550080 | elapsed time per iteration (s): 0.58 | learning rate: 2.491E-05 | global batch size: 256 | lm loss: 2.567588E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.926 | TFLOPs: 42.42 | +7: iteration 103170/ 115203 | consumed samples: 26411520 | consumed tokens: 54090792960 | elapsed time per iteration (s): 0.57 | learning rate: 2.490E-05 | global batch size: 256 | lm loss: 2.580124E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.398 | TFLOPs: 42.65 | +7: iteration 103180/ 115203 | consumed samples: 26414080 | consumed tokens: 54096035840 | elapsed time per iteration (s): 0.59 | learning rate: 2.489E-05 | global batch size: 256 | lm loss: 2.565001E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.440 | TFLOPs: 41.42 | +7: iteration 103190/ 115203 | consumed samples: 26416640 | consumed tokens: 54101278720 | elapsed time per iteration (s): 0.58 | learning rate: 2.488E-05 | global batch size: 256 | lm loss: 2.559243E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.603 | TFLOPs: 42.10 | +7: iteration 103200/ 115203 | consumed samples: 26419200 | consumed tokens: 54106521600 | elapsed time per iteration (s): 0.58 | learning rate: 2.488E-05 | global batch size: 256 | lm loss: 2.566914E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.872 | TFLOPs: 41.75 | +7: iteration 103210/ 115203 | consumed samples: 26421760 | consumed tokens: 54111764480 | elapsed time per iteration (s): 0.58 | learning rate: 2.487E-05 | global batch size: 256 | lm loss: 2.568834E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.514 | TFLOPs: 42.19 | +7: iteration 103220/ 115203 | consumed samples: 26424320 | consumed tokens: 54117007360 | elapsed time per iteration (s): 0.57 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 2.574776E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.819 | TFLOPs: 42.69 | +7: iteration 103230/ 115203 | consumed samples: 26426880 | consumed tokens: 54122250240 | elapsed time per iteration (s): 0.56 | learning rate: 2.485E-05 | global batch size: 256 | lm loss: 2.563233E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.083 | TFLOPs: 43.29 | +7: iteration 103240/ 115203 | consumed samples: 26429440 | consumed tokens: 54127493120 | elapsed time per iteration (s): 0.58 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 2.559529E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.086 | TFLOPs: 42.34 | +7: iteration 103250/ 115203 | consumed samples: 26432000 | consumed tokens: 54132736000 | elapsed time per iteration (s): 0.56 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 2.562290E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.447 | TFLOPs: 43.33 | +7: iteration 103260/ 115203 | consumed samples: 26434560 | consumed tokens: 54137978880 | elapsed time per iteration (s): 0.58 | learning rate: 2.483E-05 | global batch size: 256 | lm loss: 2.568637E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.444 | TFLOPs: 42.37 | +7: iteration 103270/ 115203 | consumed samples: 26437120 | consumed tokens: 54143221760 | elapsed time per iteration (s): 0.58 | learning rate: 2.482E-05 | global batch size: 256 | lm loss: 2.567906E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.417 | TFLOPs: 42.08 | +7: iteration 103280/ 115203 | consumed samples: 26439680 | consumed tokens: 54148464640 | elapsed time per iteration (s): 0.56 | learning rate: 2.481E-05 | global batch size: 256 | lm loss: 2.570040E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.525 | TFLOPs: 43.62 | +7: iteration 103290/ 115203 | consumed samples: 26442240 | consumed tokens: 54153707520 | elapsed time per iteration (s): 0.60 | learning rate: 2.480E-05 | global batch size: 256 | lm loss: 2.571720E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.808 | TFLOPs: 40.98 | +7: iteration 103300/ 115203 | consumed samples: 26444800 | consumed tokens: 54158950400 | elapsed time per iteration (s): 0.57 | learning rate: 2.480E-05 | global batch size: 256 | lm loss: 2.562375E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.513 | TFLOPs: 43.14 | +7: iteration 103310/ 115203 | consumed samples: 26447360 | consumed tokens: 54164193280 | elapsed time per iteration (s): 0.59 | learning rate: 2.479E-05 | global batch size: 256 | lm loss: 2.565851E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.421 | TFLOPs: 41.61 | +7: iteration 103320/ 115203 | consumed samples: 26449920 | consumed tokens: 54169436160 | elapsed time per iteration (s): 0.57 | learning rate: 2.478E-05 | global batch size: 256 | lm loss: 2.567697E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.004 | TFLOPs: 42.81 | +7: iteration 103330/ 115203 | consumed samples: 26452480 | consumed tokens: 54174679040 | elapsed time per iteration (s): 0.57 | learning rate: 2.477E-05 | global batch size: 256 | lm loss: 2.555955E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.530 | TFLOPs: 43.14 | +7: iteration 103340/ 115203 | consumed samples: 26455040 | consumed tokens: 54179921920 | elapsed time per iteration (s): 0.58 | learning rate: 2.476E-05 | global batch size: 256 | lm loss: 2.559299E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.528 | TFLOPs: 42.29 | +7: iteration 103350/ 115203 | consumed samples: 26457600 | consumed tokens: 54185164800 | elapsed time per iteration (s): 0.56 | learning rate: 2.476E-05 | global batch size: 256 | lm loss: 2.573055E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.073 | TFLOPs: 43.39 | +7: iteration 103360/ 115203 | consumed samples: 26460160 | consumed tokens: 54190407680 | elapsed time per iteration (s): 0.57 | learning rate: 2.475E-05 | global batch size: 256 | lm loss: 2.552624E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.910 | TFLOPs: 42.80 | +7: iteration 103370/ 115203 | consumed samples: 26462720 | consumed tokens: 54195650560 | elapsed time per iteration (s): 0.57 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 2.559588E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.210 | TFLOPs: 43.02 | +7: iteration 103380/ 115203 | consumed samples: 26465280 | consumed tokens: 54200893440 | elapsed time per iteration (s): 0.58 | learning rate: 2.473E-05 | global batch size: 256 | lm loss: 2.565375E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.787 | TFLOPs: 42.12 | +7: iteration 103390/ 115203 | consumed samples: 26467840 | consumed tokens: 54206136320 | elapsed time per iteration (s): 0.57 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 2.552064E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.839 | TFLOPs: 43.08 | +7: iteration 103400/ 115203 | consumed samples: 26470400 | consumed tokens: 54211379200 | elapsed time per iteration (s): 0.58 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 2.566668E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.118 | TFLOPs: 42.15 | +7: iteration 103410/ 115203 | consumed samples: 26472960 | consumed tokens: 54216622080 | elapsed time per iteration (s): 0.57 | learning rate: 2.471E-05 | global batch size: 256 | lm loss: 2.563486E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.655 | TFLOPs: 42.58 | +7: iteration 103420/ 115203 | consumed samples: 26475520 | consumed tokens: 54221864960 | elapsed time per iteration (s): 0.56 | learning rate: 2.470E-05 | global batch size: 256 | lm loss: 2.574256E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.257 | TFLOPs: 43.31 | +7: iteration 103430/ 115203 | consumed samples: 26478080 | consumed tokens: 54227107840 | elapsed time per iteration (s): 0.57 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 2.573637E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.166 | TFLOPs: 43.11 | +7: iteration 103440/ 115203 | consumed samples: 26480640 | consumed tokens: 54232350720 | elapsed time per iteration (s): 0.58 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 2.565095E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.800 | TFLOPs: 41.83 | +7: iteration 103450/ 115203 | consumed samples: 26483200 | consumed tokens: 54237593600 | elapsed time per iteration (s): 0.59 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 2.560754E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.701 | TFLOPs: 41.25 | +7: iteration 103460/ 115203 | consumed samples: 26485760 | consumed tokens: 54242836480 | elapsed time per iteration (s): 0.56 | learning rate: 2.467E-05 | global batch size: 256 | lm loss: 2.555136E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.251 | TFLOPs: 43.59 | +7: iteration 103470/ 115203 | consumed samples: 26488320 | consumed tokens: 54248079360 | elapsed time per iteration (s): 0.59 | learning rate: 2.466E-05 | global batch size: 256 | lm loss: 2.569402E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.182 | TFLOPs: 41.49 | +7: iteration 103480/ 115203 | consumed samples: 26490880 | consumed tokens: 54253322240 | elapsed time per iteration (s): 0.56 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 2.571751E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.219 | TFLOPs: 43.21 | +7: iteration 103490/ 115203 | consumed samples: 26493440 | consumed tokens: 54258565120 | elapsed time per iteration (s): 0.58 | learning rate: 2.464E-05 | global batch size: 256 | lm loss: 2.561998E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.790 | TFLOPs: 41.74 | +7: iteration 103500/ 115203 | consumed samples: 26496000 | consumed tokens: 54263808000 | elapsed time per iteration (s): 0.58 | learning rate: 2.464E-05 | global batch size: 256 | lm loss: 2.578487E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.959 | TFLOPs: 41.85 | +7: iteration 103510/ 115203 | consumed samples: 26498560 | consumed tokens: 54269050880 | elapsed time per iteration (s): 0.57 | learning rate: 2.463E-05 | global batch size: 256 | lm loss: 2.564496E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.090 | TFLOPs: 42.72 | +7: iteration 103520/ 115203 | consumed samples: 26501120 | consumed tokens: 54274293760 | elapsed time per iteration (s): 0.57 | learning rate: 2.462E-05 | global batch size: 256 | lm loss: 2.549011E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.586 | TFLOPs: 42.48 | +7: iteration 103530/ 115203 | consumed samples: 26503680 | consumed tokens: 54279536640 | elapsed time per iteration (s): 0.58 | learning rate: 2.461E-05 | global batch size: 256 | lm loss: 2.571192E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.725 | TFLOPs: 42.11 | +7: iteration 103540/ 115203 | consumed samples: 26506240 | consumed tokens: 54284779520 | elapsed time per iteration (s): 0.57 | learning rate: 2.461E-05 | global batch size: 256 | lm loss: 2.562594E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.088 | TFLOPs: 42.72 | +7: iteration 103550/ 115203 | consumed samples: 26508800 | consumed tokens: 54290022400 | elapsed time per iteration (s): 0.57 | learning rate: 2.460E-05 | global batch size: 256 | lm loss: 2.564935E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.830 | TFLOPs: 42.79 | +7: iteration 103560/ 115203 | consumed samples: 26511360 | consumed tokens: 54295265280 | elapsed time per iteration (s): 0.58 | learning rate: 2.459E-05 | global batch size: 256 | lm loss: 2.559029E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.635 | TFLOPs: 42.39 | +7: iteration 103570/ 115203 | consumed samples: 26513920 | consumed tokens: 54300508160 | elapsed time per iteration (s): 0.56 | learning rate: 2.458E-05 | global batch size: 256 | lm loss: 2.567809E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.803 | TFLOPs: 43.36 | +7: iteration 103580/ 115203 | consumed samples: 26516480 | consumed tokens: 54305751040 | elapsed time per iteration (s): 0.59 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 2.565056E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.007 | TFLOPs: 41.66 | +7: iteration 103590/ 115203 | consumed samples: 26519040 | consumed tokens: 54310993920 | elapsed time per iteration (s): 0.58 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 2.580481E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.775 | TFLOPs: 42.40 | +7: iteration 103600/ 115203 | consumed samples: 26521600 | consumed tokens: 54316236800 | elapsed time per iteration (s): 0.57 | learning rate: 2.456E-05 | global batch size: 256 | lm loss: 2.558869E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.595 | TFLOPs: 42.67 | +7: iteration 103610/ 115203 | consumed samples: 26524160 | consumed tokens: 54321479680 | elapsed time per iteration (s): 0.62 | learning rate: 2.455E-05 | global batch size: 256 | lm loss: 2.557390E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 414.172 | TFLOPs: 39.49 | +7: iteration 103620/ 115203 | consumed samples: 26526720 | consumed tokens: 54326722560 | elapsed time per iteration (s): 0.62 | learning rate: 2.454E-05 | global batch size: 256 | lm loss: 2.554050E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 413.600 | TFLOPs: 39.43 | +7: iteration 103630/ 115203 | consumed samples: 26529280 | consumed tokens: 54331965440 | elapsed time per iteration (s): 0.59 | learning rate: 2.454E-05 | global batch size: 256 | lm loss: 2.576242E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.130 | TFLOPs: 41.29 | +7: iteration 103640/ 115203 | consumed samples: 26531840 | consumed tokens: 54337208320 | elapsed time per iteration (s): 0.57 | learning rate: 2.453E-05 | global batch size: 256 | lm loss: 2.559658E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.075 | TFLOPs: 42.81 | +7: iteration 103650/ 115203 | consumed samples: 26534400 | consumed tokens: 54342451200 | elapsed time per iteration (s): 0.58 | learning rate: 2.452E-05 | global batch size: 256 | lm loss: 2.567216E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.378 | TFLOPs: 41.99 | +7: iteration 103660/ 115203 | consumed samples: 26536960 | consumed tokens: 54347694080 | elapsed time per iteration (s): 0.59 | learning rate: 2.451E-05 | global batch size: 256 | lm loss: 2.560360E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.045 | TFLOPs: 41.29 | +7: iteration 103670/ 115203 | consumed samples: 26539520 | consumed tokens: 54352936960 | elapsed time per iteration (s): 0.57 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 2.566484E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.033 | TFLOPs: 43.10 | +7: iteration 103680/ 115203 | consumed samples: 26542080 | consumed tokens: 54358179840 | elapsed time per iteration (s): 0.57 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 2.556512E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.415 | TFLOPs: 42.56 | +7: iteration 103690/ 115203 | consumed samples: 26544640 | consumed tokens: 54363422720 | elapsed time per iteration (s): 0.57 | learning rate: 2.449E-05 | global batch size: 256 | lm loss: 2.556191E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.299 | TFLOPs: 42.74 | +7: iteration 103700/ 115203 | consumed samples: 26547200 | consumed tokens: 54368665600 | elapsed time per iteration (s): 0.58 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 2.575742E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.478 | TFLOPs: 41.99 | +7: iteration 103710/ 115203 | consumed samples: 26549760 | consumed tokens: 54373908480 | elapsed time per iteration (s): 0.56 | learning rate: 2.447E-05 | global batch size: 256 | lm loss: 2.567050E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.610 | TFLOPs: 43.72 | +7: iteration 103720/ 115203 | consumed samples: 26552320 | consumed tokens: 54379151360 | elapsed time per iteration (s): 0.58 | learning rate: 2.447E-05 | global batch size: 256 | lm loss: 2.556075E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.834 | TFLOPs: 41.74 | +7: iteration 103730/ 115203 | consumed samples: 26554880 | consumed tokens: 54384394240 | elapsed time per iteration (s): 0.59 | learning rate: 2.446E-05 | global batch size: 256 | lm loss: 2.565314E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.646 | TFLOPs: 41.63 | +7: iteration 103740/ 115203 | consumed samples: 26557440 | consumed tokens: 54389637120 | elapsed time per iteration (s): 0.60 | learning rate: 2.445E-05 | global batch size: 256 | lm loss: 2.561502E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.631 | TFLOPs: 40.58 | +7: iteration 103750/ 115203 | consumed samples: 26560000 | consumed tokens: 54394880000 | elapsed time per iteration (s): 0.60 | learning rate: 2.444E-05 | global batch size: 256 | lm loss: 2.560531E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.044 | TFLOPs: 40.81 | +7: iteration 103760/ 115203 | consumed samples: 26562560 | consumed tokens: 54400122880 | elapsed time per iteration (s): 0.57 | learning rate: 2.443E-05 | global batch size: 256 | lm loss: 2.556667E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.565 | TFLOPs: 42.96 | +7: iteration 103770/ 115203 | consumed samples: 26565120 | consumed tokens: 54405365760 | elapsed time per iteration (s): 0.57 | learning rate: 2.443E-05 | global batch size: 256 | lm loss: 2.563228E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.382 | TFLOPs: 42.56 | +7: iteration 103780/ 115203 | consumed samples: 26567680 | consumed tokens: 54410608640 | elapsed time per iteration (s): 2.63 | learning rate: 2.442E-05 | global batch size: 256 | lm loss: 2.561643E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 97.490 | TFLOPs: 9.29 | +7: iteration 103790/ 115203 | consumed samples: 26570240 | consumed tokens: 54415851520 | elapsed time per iteration (s): 0.93 | learning rate: 2.441E-05 | global batch size: 256 | lm loss: 2.567902E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 275.707 | TFLOPs: 26.29 | +7: iteration 103800/ 115203 | consumed samples: 26572800 | consumed tokens: 54421094400 | elapsed time per iteration (s): 0.59 | learning rate: 2.440E-05 | global batch size: 256 | lm loss: 2.558963E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.601 | TFLOPs: 41.43 | +7: iteration 103810/ 115203 | consumed samples: 26575360 | consumed tokens: 54426337280 | elapsed time per iteration (s): 0.58 | learning rate: 2.440E-05 | global batch size: 256 | lm loss: 2.551195E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.773 | TFLOPs: 42.31 | +7: iteration 103820/ 115203 | consumed samples: 26577920 | consumed tokens: 54431580160 | elapsed time per iteration (s): 0.57 | learning rate: 2.439E-05 | global batch size: 256 | lm loss: 2.577615E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.037 | TFLOPs: 43.00 | +7: iteration 103830/ 115203 | consumed samples: 26580480 | consumed tokens: 54436823040 | elapsed time per iteration (s): 0.57 | learning rate: 2.438E-05 | global batch size: 256 | lm loss: 2.551536E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.523 | TFLOPs: 42.57 | +7: iteration 103840/ 115203 | consumed samples: 26583040 | consumed tokens: 54442065920 | elapsed time per iteration (s): 0.56 | learning rate: 2.437E-05 | global batch size: 256 | lm loss: 2.567044E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.329 | TFLOPs: 43.60 | +7: iteration 103850/ 115203 | consumed samples: 26585600 | consumed tokens: 54447308800 | elapsed time per iteration (s): 0.57 | learning rate: 2.437E-05 | global batch size: 256 | lm loss: 2.580001E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.258 | TFLOPs: 42.45 | +7: iteration 103860/ 115203 | consumed samples: 26588160 | consumed tokens: 54452551680 | elapsed time per iteration (s): 0.57 | learning rate: 2.436E-05 | global batch size: 256 | lm loss: 2.561908E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.569 | TFLOPs: 42.96 | +7: iteration 103870/ 115203 | consumed samples: 26590720 | consumed tokens: 54457794560 | elapsed time per iteration (s): 0.56 | learning rate: 2.435E-05 | global batch size: 256 | lm loss: 2.555453E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.306 | TFLOPs: 43.50 | +7: iteration 103880/ 115203 | consumed samples: 26593280 | consumed tokens: 54463037440 | elapsed time per iteration (s): 0.56 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 2.562938E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.361 | TFLOPs: 43.51 | +7: iteration 103890/ 115203 | consumed samples: 26595840 | consumed tokens: 54468280320 | elapsed time per iteration (s): 0.58 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 2.553547E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.845 | TFLOPs: 42.41 | +7: iteration 103900/ 115203 | consumed samples: 26598400 | consumed tokens: 54473523200 | elapsed time per iteration (s): 0.58 | learning rate: 2.433E-05 | global batch size: 256 | lm loss: 2.565288E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.699 | TFLOPs: 42.30 | +7: iteration 103910/ 115203 | consumed samples: 26600960 | consumed tokens: 54478766080 | elapsed time per iteration (s): 0.58 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 2.549315E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.593 | TFLOPs: 41.82 | +7: iteration 103920/ 115203 | consumed samples: 26603520 | consumed tokens: 54484008960 | elapsed time per iteration (s): 0.59 | learning rate: 2.431E-05 | global batch size: 256 | lm loss: 2.570844E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.582 | TFLOPs: 41.15 | +7: iteration 103930/ 115203 | consumed samples: 26606080 | consumed tokens: 54489251840 | elapsed time per iteration (s): 0.56 | learning rate: 2.430E-05 | global batch size: 256 | lm loss: 2.559158E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.198 | TFLOPs: 43.49 | +7: iteration 103940/ 115203 | consumed samples: 26608640 | consumed tokens: 54494494720 | elapsed time per iteration (s): 0.58 | learning rate: 2.430E-05 | global batch size: 256 | lm loss: 2.568605E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.689 | TFLOPs: 42.30 | +7: iteration 103950/ 115203 | consumed samples: 26611200 | consumed tokens: 54499737600 | elapsed time per iteration (s): 0.58 | learning rate: 2.429E-05 | global batch size: 256 | lm loss: 2.557103E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.293 | TFLOPs: 42.36 | +7: iteration 103960/ 115203 | consumed samples: 26613760 | consumed tokens: 54504980480 | elapsed time per iteration (s): 0.59 | learning rate: 2.428E-05 | global batch size: 256 | lm loss: 2.570502E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.002 | TFLOPs: 41.09 | +7: iteration 103970/ 115203 | consumed samples: 26616320 | consumed tokens: 54510223360 | elapsed time per iteration (s): 0.58 | learning rate: 2.427E-05 | global batch size: 256 | lm loss: 2.563335E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.748 | TFLOPs: 42.40 | +7: iteration 103980/ 115203 | consumed samples: 26618880 | consumed tokens: 54515466240 | elapsed time per iteration (s): 0.58 | learning rate: 2.427E-05 | global batch size: 256 | lm loss: 2.554587E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.781 | TFLOPs: 42.02 | +7: iteration 103990/ 115203 | consumed samples: 26621440 | consumed tokens: 54520709120 | elapsed time per iteration (s): 0.58 | learning rate: 2.426E-05 | global batch size: 256 | lm loss: 2.567023E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.943 | TFLOPs: 42.33 | +0: [2023-03-17 05:09:47,967] [INFO] [logging.py:68:log_dist] [Rank 0] step=104000, skipped=0, lr=[2.4252001760011466e-05, 2.4252001760011466e-05, 2.4252001760011466e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 104000/ 115203 | consumed samples: 26624000 | consumed tokens: 54525952000 | elapsed time per iteration (s): 0.58 | learning rate: 2.425E-05 | global batch size: 256 | lm loss: 2.564169E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.956 | TFLOPs: 41.85 | +0: steps: 104000 loss: 2.5812 iter time (s): 0.582 samples/sec: 439.531 +7: iteration 104010/ 115203 | consumed samples: 26626560 | consumed tokens: 54531194880 | elapsed time per iteration (s): 0.57 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 2.571606E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.377 | TFLOPs: 43.13 | +7: iteration 104020/ 115203 | consumed samples: 26629120 | consumed tokens: 54536437760 | elapsed time per iteration (s): 0.57 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 2.560618E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.128 | TFLOPs: 42.91 | +7: iteration 104030/ 115203 | consumed samples: 26631680 | consumed tokens: 54541680640 | elapsed time per iteration (s): 0.57 | learning rate: 2.423E-05 | global batch size: 256 | lm loss: 2.555911E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.941 | TFLOPs: 42.99 | +7: iteration 104040/ 115203 | consumed samples: 26634240 | consumed tokens: 54546923520 | elapsed time per iteration (s): 0.57 | learning rate: 2.422E-05 | global batch size: 256 | lm loss: 2.569181E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.218 | TFLOPs: 42.64 | +7: iteration 104050/ 115203 | consumed samples: 26636800 | consumed tokens: 54552166400 | elapsed time per iteration (s): 0.56 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 2.567038E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.389 | TFLOPs: 43.23 | +7: iteration 104060/ 115203 | consumed samples: 26639360 | consumed tokens: 54557409280 | elapsed time per iteration (s): 0.55 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 2.556848E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.570 | TFLOPs: 44.01 | +7: iteration 104070/ 115203 | consumed samples: 26641920 | consumed tokens: 54562652160 | elapsed time per iteration (s): 0.58 | learning rate: 2.420E-05 | global batch size: 256 | lm loss: 2.565499E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.825 | TFLOPs: 42.12 | +7: iteration 104080/ 115203 | consumed samples: 26644480 | consumed tokens: 54567895040 | elapsed time per iteration (s): 0.58 | learning rate: 2.419E-05 | global batch size: 256 | lm loss: 2.569502E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.930 | TFLOPs: 42.04 | +7: iteration 104090/ 115203 | consumed samples: 26647040 | consumed tokens: 54573137920 | elapsed time per iteration (s): 0.58 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 2.562850E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.313 | TFLOPs: 42.27 | +7: iteration 104100/ 115203 | consumed samples: 26649600 | consumed tokens: 54578380800 | elapsed time per iteration (s): 0.57 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 2.567820E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.827 | TFLOPs: 42.89 | +7: iteration 104110/ 115203 | consumed samples: 26652160 | consumed tokens: 54583623680 | elapsed time per iteration (s): 0.57 | learning rate: 2.417E-05 | global batch size: 256 | lm loss: 2.553854E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.290 | TFLOPs: 42.93 | +7: iteration 104120/ 115203 | consumed samples: 26654720 | consumed tokens: 54588866560 | elapsed time per iteration (s): 0.56 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 2.567360E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.880 | TFLOPs: 43.46 | +7: iteration 104130/ 115203 | consumed samples: 26657280 | consumed tokens: 54594109440 | elapsed time per iteration (s): 0.59 | learning rate: 2.415E-05 | global batch size: 256 | lm loss: 2.563496E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.918 | TFLOPs: 41.56 | +7: iteration 104140/ 115203 | consumed samples: 26659840 | consumed tokens: 54599352320 | elapsed time per iteration (s): 0.55 | learning rate: 2.415E-05 | global batch size: 256 | lm loss: 2.558204E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.848 | TFLOPs: 44.03 | +7: iteration 104150/ 115203 | consumed samples: 26662400 | consumed tokens: 54604595200 | elapsed time per iteration (s): 0.57 | learning rate: 2.414E-05 | global batch size: 256 | lm loss: 2.559778E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.676 | TFLOPs: 43.06 | +7: iteration 104160/ 115203 | consumed samples: 26664960 | consumed tokens: 54609838080 | elapsed time per iteration (s): 0.58 | learning rate: 2.413E-05 | global batch size: 256 | lm loss: 2.553766E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.151 | TFLOPs: 42.06 | +7: iteration 104170/ 115203 | consumed samples: 26667520 | consumed tokens: 54615080960 | elapsed time per iteration (s): 0.56 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 2.578824E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.353 | TFLOPs: 43.79 | +7: iteration 104180/ 115203 | consumed samples: 26670080 | consumed tokens: 54620323840 | elapsed time per iteration (s): 0.56 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 2.560253E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.009 | TFLOPs: 43.48 | +7: iteration 104190/ 115203 | consumed samples: 26672640 | consumed tokens: 54625566720 | elapsed time per iteration (s): 0.57 | learning rate: 2.411E-05 | global batch size: 256 | lm loss: 2.573351E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.323 | TFLOPs: 43.03 | +7: iteration 104200/ 115203 | consumed samples: 26675200 | consumed tokens: 54630809600 | elapsed time per iteration (s): 0.59 | learning rate: 2.410E-05 | global batch size: 256 | lm loss: 2.562846E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.760 | TFLOPs: 41.45 | +7: iteration 104210/ 115203 | consumed samples: 26677760 | consumed tokens: 54636052480 | elapsed time per iteration (s): 0.56 | learning rate: 2.410E-05 | global batch size: 256 | lm loss: 2.541754E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.348 | TFLOPs: 43.79 | +7: iteration 104220/ 115203 | consumed samples: 26680320 | consumed tokens: 54641295360 | elapsed time per iteration (s): 0.58 | learning rate: 2.409E-05 | global batch size: 256 | lm loss: 2.557121E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.914 | TFLOPs: 42.42 | +7: iteration 104230/ 115203 | consumed samples: 26682880 | consumed tokens: 54646538240 | elapsed time per iteration (s): 0.56 | learning rate: 2.408E-05 | global batch size: 256 | lm loss: 2.557050E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.866 | TFLOPs: 43.27 | +7: iteration 104240/ 115203 | consumed samples: 26685440 | consumed tokens: 54651781120 | elapsed time per iteration (s): 0.58 | learning rate: 2.407E-05 | global batch size: 256 | lm loss: 2.561505E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.605 | TFLOPs: 42.29 | +7: iteration 104250/ 115203 | consumed samples: 26688000 | consumed tokens: 54657024000 | elapsed time per iteration (s): 0.57 | learning rate: 2.407E-05 | global batch size: 256 | lm loss: 2.545653E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.920 | TFLOPs: 43.09 | +7: iteration 104260/ 115203 | consumed samples: 26690560 | consumed tokens: 54662266880 | elapsed time per iteration (s): 0.56 | learning rate: 2.406E-05 | global batch size: 256 | lm loss: 2.551160E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.746 | TFLOPs: 43.64 | +7: iteration 104270/ 115203 | consumed samples: 26693120 | consumed tokens: 54667509760 | elapsed time per iteration (s): 0.56 | learning rate: 2.405E-05 | global batch size: 256 | lm loss: 2.576556E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.175 | TFLOPs: 43.49 | +7: iteration 104280/ 115203 | consumed samples: 26695680 | consumed tokens: 54672752640 | elapsed time per iteration (s): 0.57 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 2.554020E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.904 | TFLOPs: 42.61 | +7: iteration 104290/ 115203 | consumed samples: 26698240 | consumed tokens: 54677995520 | elapsed time per iteration (s): 0.56 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 2.559745E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.967 | TFLOPs: 43.47 | +7: iteration 104300/ 115203 | consumed samples: 26700800 | consumed tokens: 54683238400 | elapsed time per iteration (s): 0.57 | learning rate: 2.403E-05 | global batch size: 256 | lm loss: 2.569324E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.359 | TFLOPs: 42.75 | +7: iteration 104310/ 115203 | consumed samples: 26703360 | consumed tokens: 54688481280 | elapsed time per iteration (s): 0.57 | learning rate: 2.402E-05 | global batch size: 256 | lm loss: 2.556120E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.313 | TFLOPs: 42.74 | +7: iteration 104320/ 115203 | consumed samples: 26705920 | consumed tokens: 54693724160 | elapsed time per iteration (s): 0.55 | learning rate: 2.401E-05 | global batch size: 256 | lm loss: 2.569022E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.643 | TFLOPs: 44.01 | +7: iteration 104330/ 115203 | consumed samples: 26708480 | consumed tokens: 54698967040 | elapsed time per iteration (s): 0.57 | learning rate: 2.401E-05 | global batch size: 256 | lm loss: 2.558278E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.595 | TFLOPs: 43.15 | +7: iteration 104340/ 115203 | consumed samples: 26711040 | consumed tokens: 54704209920 | elapsed time per iteration (s): 0.58 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 2.576272E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.332 | TFLOPs: 42.27 | +7: iteration 104350/ 115203 | consumed samples: 26713600 | consumed tokens: 54709452800 | elapsed time per iteration (s): 0.56 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 2.548282E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.577 | TFLOPs: 43.91 | +7: iteration 104360/ 115203 | consumed samples: 26716160 | consumed tokens: 54714695680 | elapsed time per iteration (s): 0.56 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 2.578144E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.149 | TFLOPs: 43.58 | +7: iteration 104370/ 115203 | consumed samples: 26718720 | consumed tokens: 54719938560 | elapsed time per iteration (s): 0.57 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 2.564106E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.615 | TFLOPs: 42.58 | +7: iteration 104380/ 115203 | consumed samples: 26721280 | consumed tokens: 54725181440 | elapsed time per iteration (s): 0.56 | learning rate: 2.397E-05 | global batch size: 256 | lm loss: 2.559725E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.655 | TFLOPs: 43.25 | +7: iteration 104390/ 115203 | consumed samples: 26723840 | consumed tokens: 54730424320 | elapsed time per iteration (s): 0.56 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 2.559258E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.587 | TFLOPs: 43.34 | +7: iteration 104400/ 115203 | consumed samples: 26726400 | consumed tokens: 54735667200 | elapsed time per iteration (s): 0.57 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 2.563473E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.845 | TFLOPs: 43.08 | +7: iteration 104410/ 115203 | consumed samples: 26728960 | consumed tokens: 54740910080 | elapsed time per iteration (s): 0.56 | learning rate: 2.395E-05 | global batch size: 256 | lm loss: 2.557886E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.507 | TFLOPs: 43.33 | +7: iteration 104420/ 115203 | consumed samples: 26731520 | consumed tokens: 54746152960 | elapsed time per iteration (s): 0.58 | learning rate: 2.394E-05 | global batch size: 256 | lm loss: 2.569825E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.779 | TFLOPs: 42.31 | +7: iteration 104430/ 115203 | consumed samples: 26734080 | consumed tokens: 54751395840 | elapsed time per iteration (s): 0.56 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 2.556405E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.250 | TFLOPs: 43.98 | +7: iteration 104440/ 115203 | consumed samples: 26736640 | consumed tokens: 54756638720 | elapsed time per iteration (s): 0.56 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 2.574265E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.209 | TFLOPs: 43.30 | +7: iteration 104450/ 115203 | consumed samples: 26739200 | consumed tokens: 54761881600 | elapsed time per iteration (s): 0.58 | learning rate: 2.392E-05 | global batch size: 256 | lm loss: 2.564379E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.648 | TFLOPs: 42.30 | +7: iteration 104460/ 115203 | consumed samples: 26741760 | consumed tokens: 54767124480 | elapsed time per iteration (s): 0.55 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 2.553433E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 104470/ 115203 | consumed samples: 26744320 | consumed tokens: 54772367360 | elapsed time per iteration (s): 0.58 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 2.548035E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.606 | TFLOPs: 42.10 | +7: iteration 104480/ 115203 | consumed samples: 26746880 | consumed tokens: 54777610240 | elapsed time per iteration (s): 0.57 | learning rate: 2.390E-05 | global batch size: 256 | lm loss: 2.552604E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.569 | TFLOPs: 42.67 | +7: iteration 104490/ 115203 | consumed samples: 26749440 | consumed tokens: 54782853120 | elapsed time per iteration (s): 0.56 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 2.554132E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.701 | TFLOPs: 43.73 | +7: iteration 104500/ 115203 | consumed samples: 26752000 | consumed tokens: 54788096000 | elapsed time per iteration (s): 0.57 | learning rate: 2.388E-05 | global batch size: 256 | lm loss: 2.564587E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.742 | TFLOPs: 42.97 | +7: iteration 104510/ 115203 | consumed samples: 26754560 | consumed tokens: 54793338880 | elapsed time per iteration (s): 0.56 | learning rate: 2.388E-05 | global batch size: 256 | lm loss: 2.562642E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.430 | TFLOPs: 43.23 | +7: iteration 104520/ 115203 | consumed samples: 26757120 | consumed tokens: 54798581760 | elapsed time per iteration (s): 0.56 | learning rate: 2.387E-05 | global batch size: 256 | lm loss: 2.567326E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.684 | TFLOPs: 43.54 | +7: iteration 104530/ 115203 | consumed samples: 26759680 | consumed tokens: 54803824640 | elapsed time per iteration (s): 0.56 | learning rate: 2.386E-05 | global batch size: 256 | lm loss: 2.559741E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.467 | TFLOPs: 43.90 | +7: iteration 104540/ 115203 | consumed samples: 26762240 | consumed tokens: 54809067520 | elapsed time per iteration (s): 0.57 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 2.548465E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.307 | TFLOPs: 42.74 | +7: iteration 104550/ 115203 | consumed samples: 26764800 | consumed tokens: 54814310400 | elapsed time per iteration (s): 0.58 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 2.565853E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.911 | TFLOPs: 42.32 | +7: iteration 104560/ 115203 | consumed samples: 26767360 | consumed tokens: 54819553280 | elapsed time per iteration (s): 0.58 | learning rate: 2.384E-05 | global batch size: 256 | lm loss: 2.537945E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.288 | TFLOPs: 42.17 | +7: iteration 104570/ 115203 | consumed samples: 26769920 | consumed tokens: 54824796160 | elapsed time per iteration (s): 0.57 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 2.555121E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.408 | TFLOPs: 43.13 | +7: iteration 104580/ 115203 | consumed samples: 26772480 | consumed tokens: 54830039040 | elapsed time per iteration (s): 0.58 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 2.573931E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.990 | TFLOPs: 42.23 | +7: iteration 104590/ 115203 | consumed samples: 26775040 | consumed tokens: 54835281920 | elapsed time per iteration (s): 0.58 | learning rate: 2.382E-05 | global batch size: 256 | lm loss: 2.575020E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.619 | TFLOPs: 42.39 | +7: iteration 104600/ 115203 | consumed samples: 26777600 | consumed tokens: 54840524800 | elapsed time per iteration (s): 0.56 | learning rate: 2.381E-05 | global batch size: 256 | lm loss: 2.563029E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.383 | TFLOPs: 43.89 | +7: iteration 104610/ 115203 | consumed samples: 26780160 | consumed tokens: 54845767680 | elapsed time per iteration (s): 0.57 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 2.579609E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.959 | TFLOPs: 42.71 | +7: iteration 104620/ 115203 | consumed samples: 26782720 | consumed tokens: 54851010560 | elapsed time per iteration (s): 0.57 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 2.565103E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.031 | TFLOPs: 42.62 | +7: iteration 104630/ 115203 | consumed samples: 26785280 | consumed tokens: 54856253440 | elapsed time per iteration (s): 0.58 | learning rate: 2.379E-05 | global batch size: 256 | lm loss: 2.568284E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.230 | TFLOPs: 42.16 | +7: iteration 104640/ 115203 | consumed samples: 26787840 | consumed tokens: 54861496320 | elapsed time per iteration (s): 0.56 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 2.569312E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.708 | TFLOPs: 43.73 | +7: iteration 104650/ 115203 | consumed samples: 26790400 | consumed tokens: 54866739200 | elapsed time per iteration (s): 0.56 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 2.557768E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.181 | TFLOPs: 43.68 | +7: iteration 104660/ 115203 | consumed samples: 26792960 | consumed tokens: 54871982080 | elapsed time per iteration (s): 0.55 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 2.568440E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.414 | TFLOPs: 43.99 | +7: iteration 104670/ 115203 | consumed samples: 26795520 | consumed tokens: 54877224960 | elapsed time per iteration (s): 0.56 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 2.557563E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.281 | TFLOPs: 43.31 | +7: iteration 104680/ 115203 | consumed samples: 26798080 | consumed tokens: 54882467840 | elapsed time per iteration (s): 0.57 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 2.570308E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.929 | TFLOPs: 43.18 | +7: iteration 104690/ 115203 | consumed samples: 26800640 | consumed tokens: 54887710720 | elapsed time per iteration (s): 0.56 | learning rate: 2.375E-05 | global batch size: 256 | lm loss: 2.568020E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.577 | TFLOPs: 43.72 | +7: iteration 104700/ 115203 | consumed samples: 26803200 | consumed tokens: 54892953600 | elapsed time per iteration (s): 0.57 | learning rate: 2.374E-05 | global batch size: 256 | lm loss: 2.562823E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.946 | TFLOPs: 43.09 | +7: iteration 104710/ 115203 | consumed samples: 26805760 | consumed tokens: 54898196480 | elapsed time per iteration (s): 0.57 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 2.573762E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.127 | TFLOPs: 42.63 | +7: iteration 104720/ 115203 | consumed samples: 26808320 | consumed tokens: 54903439360 | elapsed time per iteration (s): 0.56 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 2.559475E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.326 | TFLOPs: 43.70 | +7: iteration 104730/ 115203 | consumed samples: 26810880 | consumed tokens: 54908682240 | elapsed time per iteration (s): 0.57 | learning rate: 2.372E-05 | global batch size: 256 | lm loss: 2.574250E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.128 | TFLOPs: 43.11 | +7: iteration 104740/ 115203 | consumed samples: 26813440 | consumed tokens: 54913925120 | elapsed time per iteration (s): 0.57 | learning rate: 2.371E-05 | global batch size: 256 | lm loss: 2.575067E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.786 | TFLOPs: 43.07 | +7: iteration 104750/ 115203 | consumed samples: 26816000 | consumed tokens: 54919168000 | elapsed time per iteration (s): 0.57 | learning rate: 2.371E-05 | global batch size: 256 | lm loss: 2.562227E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.180 | TFLOPs: 43.11 | +7: iteration 104760/ 115203 | consumed samples: 26818560 | consumed tokens: 54924410880 | elapsed time per iteration (s): 0.55 | learning rate: 2.370E-05 | global batch size: 256 | lm loss: 2.554103E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.269 | TFLOPs: 43.98 | +7: iteration 104770/ 115203 | consumed samples: 26821120 | consumed tokens: 54929653760 | elapsed time per iteration (s): 0.57 | learning rate: 2.369E-05 | global batch size: 256 | lm loss: 2.551113E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.957 | TFLOPs: 42.80 | +7: iteration 104780/ 115203 | consumed samples: 26823680 | consumed tokens: 54934896640 | elapsed time per iteration (s): 0.56 | learning rate: 2.368E-05 | global batch size: 256 | lm loss: 2.557581E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.760 | TFLOPs: 43.45 | +7: iteration 104790/ 115203 | consumed samples: 26826240 | consumed tokens: 54940139520 | elapsed time per iteration (s): 0.58 | learning rate: 2.368E-05 | global batch size: 256 | lm loss: 2.569291E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.920 | TFLOPs: 42.32 | +7: iteration 104800/ 115203 | consumed samples: 26828800 | consumed tokens: 54945382400 | elapsed time per iteration (s): 0.57 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 2.545187E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.496 | TFLOPs: 43.14 | +7: iteration 104810/ 115203 | consumed samples: 26831360 | consumed tokens: 54950625280 | elapsed time per iteration (s): 0.56 | learning rate: 2.366E-05 | global batch size: 256 | lm loss: 2.569624E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.137 | TFLOPs: 43.39 | +7: iteration 104820/ 115203 | consumed samples: 26833920 | consumed tokens: 54955868160 | elapsed time per iteration (s): 0.57 | learning rate: 2.366E-05 | global batch size: 256 | lm loss: 2.557071E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.595 | TFLOPs: 42.96 | +7: iteration 104830/ 115203 | consumed samples: 26836480 | consumed tokens: 54961111040 | elapsed time per iteration (s): 0.56 | learning rate: 2.365E-05 | global batch size: 256 | lm loss: 2.574722E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.082 | TFLOPs: 43.39 | +7: iteration 104840/ 115203 | consumed samples: 26839040 | consumed tokens: 54966353920 | elapsed time per iteration (s): 0.56 | learning rate: 2.364E-05 | global batch size: 256 | lm loss: 2.560610E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.423 | TFLOPs: 43.61 | +7: iteration 104850/ 115203 | consumed samples: 26841600 | consumed tokens: 54971596800 | elapsed time per iteration (s): 0.57 | learning rate: 2.364E-05 | global batch size: 256 | lm loss: 2.549801E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.259 | TFLOPs: 43.02 | +7: iteration 104860/ 115203 | consumed samples: 26844160 | consumed tokens: 54976839680 | elapsed time per iteration (s): 0.57 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 2.569225E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.059 | TFLOPs: 43.19 | +7: iteration 104870/ 115203 | consumed samples: 26846720 | consumed tokens: 54982082560 | elapsed time per iteration (s): 0.57 | learning rate: 2.362E-05 | global batch size: 256 | lm loss: 2.551690E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.130 | TFLOPs: 43.11 | +7: iteration 104880/ 115203 | consumed samples: 26849280 | consumed tokens: 54987325440 | elapsed time per iteration (s): 0.56 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 2.564730E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.557 | TFLOPs: 43.72 | +7: iteration 104890/ 115203 | consumed samples: 26851840 | consumed tokens: 54992568320 | elapsed time per iteration (s): 0.56 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 2.567211E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.028 | TFLOPs: 43.86 | +7: iteration 104900/ 115203 | consumed samples: 26854400 | consumed tokens: 54997811200 | elapsed time per iteration (s): 0.57 | learning rate: 2.360E-05 | global batch size: 256 | lm loss: 2.547962E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.321 | TFLOPs: 43.03 | +7: iteration 104910/ 115203 | consumed samples: 26856960 | consumed tokens: 55003054080 | elapsed time per iteration (s): 0.56 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 2.567646E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.960 | TFLOPs: 43.38 | +7: iteration 104920/ 115203 | consumed samples: 26859520 | consumed tokens: 55008296960 | elapsed time per iteration (s): 0.56 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 2.564267E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.582 | TFLOPs: 43.72 | +7: iteration 104930/ 115203 | consumed samples: 26862080 | consumed tokens: 55013539840 | elapsed time per iteration (s): 0.57 | learning rate: 2.358E-05 | global batch size: 256 | lm loss: 2.569680E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.486 | TFLOPs: 42.76 | +7: iteration 104940/ 115203 | consumed samples: 26864640 | consumed tokens: 55018782720 | elapsed time per iteration (s): 0.55 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 2.565834E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.577 | TFLOPs: 44.01 | +7: iteration 104950/ 115203 | consumed samples: 26867200 | consumed tokens: 55024025600 | elapsed time per iteration (s): 0.56 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 2.565629E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.216 | TFLOPs: 43.50 | +7: iteration 104960/ 115203 | consumed samples: 26869760 | consumed tokens: 55029268480 | elapsed time per iteration (s): 0.56 | learning rate: 2.356E-05 | global batch size: 256 | lm loss: 2.566039E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.969 | TFLOPs: 43.47 | +7: iteration 104970/ 115203 | consumed samples: 26872320 | consumed tokens: 55034511360 | elapsed time per iteration (s): 0.56 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 2.553185E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.154 | TFLOPs: 43.30 | +7: iteration 104980/ 115203 | consumed samples: 26874880 | consumed tokens: 55039754240 | elapsed time per iteration (s): 0.56 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 2.566570E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.095 | TFLOPs: 43.58 | +7: iteration 104990/ 115203 | consumed samples: 26877440 | consumed tokens: 55044997120 | elapsed time per iteration (s): 0.56 | learning rate: 2.354E-05 | global batch size: 256 | lm loss: 2.559563E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.147 | TFLOPs: 43.49 | +7: iteration 105000/ 115203 | consumed samples: 26880000 | consumed tokens: 55050240000 | elapsed time per iteration (s): 0.56 | learning rate: 2.353E-05 | global batch size: 256 | lm loss: 2.559704E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.398 | TFLOPs: 43.32 | +7: iteration 105010/ 115203 | consumed samples: 26882560 | consumed tokens: 55055482880 | elapsed time per iteration (s): 0.56 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 2.561859E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.895 | TFLOPs: 43.46 | +7: iteration 105020/ 115203 | consumed samples: 26885120 | consumed tokens: 55060725760 | elapsed time per iteration (s): 0.56 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 2.576733E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.843 | TFLOPs: 43.65 | +7: iteration 105030/ 115203 | consumed samples: 26887680 | consumed tokens: 55065968640 | elapsed time per iteration (s): 0.56 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 2.561242E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.341 | TFLOPs: 43.60 | +7: iteration 105040/ 115203 | consumed samples: 26890240 | consumed tokens: 55071211520 | elapsed time per iteration (s): 0.57 | learning rate: 2.350E-05 | global batch size: 256 | lm loss: 2.551137E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.455 | TFLOPs: 42.56 | +7: iteration 105050/ 115203 | consumed samples: 26892800 | consumed tokens: 55076454400 | elapsed time per iteration (s): 0.56 | learning rate: 2.350E-05 | global batch size: 256 | lm loss: 2.564166E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.097 | TFLOPs: 43.77 | +7: iteration 105060/ 115203 | consumed samples: 26895360 | consumed tokens: 55081697280 | elapsed time per iteration (s): 0.56 | learning rate: 2.349E-05 | global batch size: 256 | lm loss: 2.562397E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.663 | TFLOPs: 43.54 | +7: iteration 105070/ 115203 | consumed samples: 26897920 | consumed tokens: 55086940160 | elapsed time per iteration (s): 0.56 | learning rate: 2.348E-05 | global batch size: 256 | lm loss: 2.560937E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.500 | TFLOPs: 43.52 | +7: iteration 105080/ 115203 | consumed samples: 26900480 | consumed tokens: 55092183040 | elapsed time per iteration (s): 0.57 | learning rate: 2.348E-05 | global batch size: 256 | lm loss: 2.542011E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.988 | TFLOPs: 43.09 | +7: iteration 105090/ 115203 | consumed samples: 26903040 | consumed tokens: 55097425920 | elapsed time per iteration (s): 0.57 | learning rate: 2.347E-05 | global batch size: 256 | lm loss: 2.578688E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.141 | TFLOPs: 43.11 | +7: iteration 105100/ 115203 | consumed samples: 26905600 | consumed tokens: 55102668800 | elapsed time per iteration (s): 0.57 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 2.572856E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.949 | TFLOPs: 43.18 | +7: iteration 105110/ 115203 | consumed samples: 26908160 | consumed tokens: 55107911680 | elapsed time per iteration (s): 0.56 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 2.563534E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.716 | TFLOPs: 43.73 | +7: iteration 105120/ 115203 | consumed samples: 26910720 | consumed tokens: 55113154560 | elapsed time per iteration (s): 0.56 | learning rate: 2.345E-05 | global batch size: 256 | lm loss: 2.559155E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.497 | TFLOPs: 43.71 | +7: iteration 105130/ 115203 | consumed samples: 26913280 | consumed tokens: 55118397440 | elapsed time per iteration (s): 0.56 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 2.576013E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.037 | TFLOPs: 43.48 | +7: iteration 105140/ 115203 | consumed samples: 26915840 | consumed tokens: 55123640320 | elapsed time per iteration (s): 0.56 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 2.562942E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.697 | TFLOPs: 43.45 | +7: iteration 105150/ 115203 | consumed samples: 26918400 | consumed tokens: 55128883200 | elapsed time per iteration (s): 0.58 | learning rate: 2.343E-05 | global batch size: 256 | lm loss: 2.555084E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.680 | TFLOPs: 42.20 | +7: iteration 105160/ 115203 | consumed samples: 26920960 | consumed tokens: 55134126080 | elapsed time per iteration (s): 0.57 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 2.565881E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.095 | TFLOPs: 43.01 | +7: iteration 105170/ 115203 | consumed samples: 26923520 | consumed tokens: 55139368960 | elapsed time per iteration (s): 0.56 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 2.565588E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.013 | TFLOPs: 43.57 | +7: iteration 105180/ 115203 | consumed samples: 26926080 | consumed tokens: 55144611840 | elapsed time per iteration (s): 0.56 | learning rate: 2.341E-05 | global batch size: 256 | lm loss: 2.557800E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.637 | TFLOPs: 43.25 | +7: iteration 105190/ 115203 | consumed samples: 26928640 | consumed tokens: 55149854720 | elapsed time per iteration (s): 0.55 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 2.562864E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.354 | TFLOPs: 43.99 | +7: iteration 105200/ 115203 | consumed samples: 26931200 | consumed tokens: 55155097600 | elapsed time per iteration (s): 0.57 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 2.567694E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.573 | TFLOPs: 42.96 | +7: iteration 105210/ 115203 | consumed samples: 26933760 | consumed tokens: 55160340480 | elapsed time per iteration (s): 0.56 | learning rate: 2.339E-05 | global batch size: 256 | lm loss: 2.569470E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.380 | TFLOPs: 43.32 | +7: iteration 105220/ 115203 | consumed samples: 26936320 | consumed tokens: 55165583360 | elapsed time per iteration (s): 0.56 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 2.559747E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.866 | TFLOPs: 43.56 | +7: iteration 105230/ 115203 | consumed samples: 26938880 | consumed tokens: 55170826240 | elapsed time per iteration (s): 0.56 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 2.566831E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.997 | TFLOPs: 43.28 | +7: iteration 105240/ 115203 | consumed samples: 26941440 | consumed tokens: 55176069120 | elapsed time per iteration (s): 0.55 | learning rate: 2.337E-05 | global batch size: 256 | lm loss: 2.550940E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.445 | TFLOPs: 43.99 | +7: iteration 105250/ 115203 | consumed samples: 26944000 | consumed tokens: 55181312000 | elapsed time per iteration (s): 0.57 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 2.568487E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.262 | TFLOPs: 43.02 | +7: iteration 105260/ 115203 | consumed samples: 26946560 | consumed tokens: 55186554880 | elapsed time per iteration (s): 0.55 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 2.571081E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.509 | TFLOPs: 44.00 | +7: iteration 105270/ 115203 | consumed samples: 26949120 | consumed tokens: 55191797760 | elapsed time per iteration (s): 0.57 | learning rate: 2.335E-05 | global batch size: 256 | lm loss: 2.567175E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.338 | TFLOPs: 42.55 | +7: iteration 105280/ 115203 | consumed samples: 26951680 | consumed tokens: 55197040640 | elapsed time per iteration (s): 0.56 | learning rate: 2.334E-05 | global batch size: 256 | lm loss: 2.557883E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.078 | TFLOPs: 43.58 | +7: iteration 105290/ 115203 | consumed samples: 26954240 | consumed tokens: 55202283520 | elapsed time per iteration (s): 0.57 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 2.557686E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.278 | TFLOPs: 43.12 | +7: iteration 105300/ 115203 | consumed samples: 26956800 | consumed tokens: 55207526400 | elapsed time per iteration (s): 0.57 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 2.560337E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.053 | TFLOPs: 42.91 | +7: iteration 105310/ 115203 | consumed samples: 26959360 | consumed tokens: 55212769280 | elapsed time per iteration (s): 0.56 | learning rate: 2.332E-05 | global batch size: 256 | lm loss: 2.549324E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.567 | TFLOPs: 43.62 | +7: iteration 105320/ 115203 | consumed samples: 26961920 | consumed tokens: 55218012160 | elapsed time per iteration (s): 0.56 | learning rate: 2.331E-05 | global batch size: 256 | lm loss: 2.568717E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.499 | TFLOPs: 43.62 | +7: iteration 105330/ 115203 | consumed samples: 26964480 | consumed tokens: 55223255040 | elapsed time per iteration (s): 0.56 | learning rate: 2.331E-05 | global batch size: 256 | lm loss: 2.543484E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.996 | TFLOPs: 43.47 | +7: iteration 105340/ 115203 | consumed samples: 26967040 | consumed tokens: 55228497920 | elapsed time per iteration (s): 0.55 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 2.555728E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.584 | TFLOPs: 44.01 | +7: iteration 105350/ 115203 | consumed samples: 26969600 | consumed tokens: 55233740800 | elapsed time per iteration (s): 0.55 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 2.549508E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.708 | TFLOPs: 44.02 | +7: iteration 105360/ 115203 | consumed samples: 26972160 | consumed tokens: 55238983680 | elapsed time per iteration (s): 0.56 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 2.557692E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.770 | TFLOPs: 43.26 | +7: iteration 105370/ 115203 | consumed samples: 26974720 | consumed tokens: 55244226560 | elapsed time per iteration (s): 0.56 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 2.554711E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.257 | TFLOPs: 43.79 | +7: iteration 105380/ 115203 | consumed samples: 26977280 | consumed tokens: 55249469440 | elapsed time per iteration (s): 0.56 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 2.580482E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.295 | TFLOPs: 43.22 | +7: iteration 105390/ 115203 | consumed samples: 26979840 | consumed tokens: 55254712320 | elapsed time per iteration (s): 0.56 | learning rate: 2.327E-05 | global batch size: 256 | lm loss: 2.572548E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.054 | TFLOPs: 43.38 | +7: iteration 105400/ 115203 | consumed samples: 26982400 | consumed tokens: 55259955200 | elapsed time per iteration (s): 0.56 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 2.554668E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.977 | TFLOPs: 43.28 | +7: iteration 105410/ 115203 | consumed samples: 26984960 | consumed tokens: 55265198080 | elapsed time per iteration (s): 0.57 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 2.558429E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.344 | TFLOPs: 42.46 | +7: iteration 105420/ 115203 | consumed samples: 26987520 | consumed tokens: 55270440960 | elapsed time per iteration (s): 0.55 | learning rate: 2.325E-05 | global batch size: 256 | lm loss: 2.567551E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.582 | TFLOPs: 44.01 | +7: iteration 105430/ 115203 | consumed samples: 26990080 | consumed tokens: 55275683840 | elapsed time per iteration (s): 0.56 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 2.563568E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.958 | TFLOPs: 43.47 | +7: iteration 105440/ 115203 | consumed samples: 26992640 | consumed tokens: 55280926720 | elapsed time per iteration (s): 0.55 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 2.558692E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.672 | TFLOPs: 44.02 | +7: iteration 105450/ 115203 | consumed samples: 26995200 | consumed tokens: 55286169600 | elapsed time per iteration (s): 0.56 | learning rate: 2.323E-05 | global batch size: 256 | lm loss: 2.572831E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.145 | TFLOPs: 43.87 | +7: iteration 105460/ 115203 | consumed samples: 26997760 | consumed tokens: 55291412480 | elapsed time per iteration (s): 0.56 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 2.558759E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.789 | TFLOPs: 43.36 | +7: iteration 105470/ 115203 | consumed samples: 27000320 | consumed tokens: 55296655360 | elapsed time per iteration (s): 0.56 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 2.561074E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.193 | TFLOPs: 43.49 | +7: iteration 105480/ 115203 | consumed samples: 27002880 | consumed tokens: 55301898240 | elapsed time per iteration (s): 0.56 | learning rate: 2.321E-05 | global batch size: 256 | lm loss: 2.556262E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.053 | TFLOPs: 43.67 | +7: iteration 105490/ 115203 | consumed samples: 27005440 | consumed tokens: 55307141120 | elapsed time per iteration (s): 0.56 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 2.553601E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.351 | TFLOPs: 43.32 | +7: iteration 105500/ 115203 | consumed samples: 27008000 | consumed tokens: 55312384000 | elapsed time per iteration (s): 0.56 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 2.563085E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.832 | TFLOPs: 43.36 | +7: iteration 105510/ 115203 | consumed samples: 27010560 | consumed tokens: 55317626880 | elapsed time per iteration (s): 0.55 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 2.566378E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.470 | TFLOPs: 44.00 | +7: iteration 105520/ 115203 | consumed samples: 27013120 | consumed tokens: 55322869760 | elapsed time per iteration (s): 0.56 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 2.552682E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.116 | TFLOPs: 43.20 | +7: iteration 105530/ 115203 | consumed samples: 27015680 | consumed tokens: 55328112640 | elapsed time per iteration (s): 0.56 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 2.546711E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.732 | TFLOPs: 43.35 | +7: iteration 105540/ 115203 | consumed samples: 27018240 | consumed tokens: 55333355520 | elapsed time per iteration (s): 0.55 | learning rate: 2.317E-05 | global batch size: 256 | lm loss: 2.580449E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.579 | TFLOPs: 44.01 | +7: iteration 105550/ 115203 | consumed samples: 27020800 | consumed tokens: 55338598400 | elapsed time per iteration (s): 0.55 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 2.559279E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.361 | TFLOPs: 43.99 | +7: iteration 105560/ 115203 | consumed samples: 27023360 | consumed tokens: 55343841280 | elapsed time per iteration (s): 0.55 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 2.556365E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.504 | TFLOPs: 44.00 | +7: iteration 105570/ 115203 | consumed samples: 27025920 | consumed tokens: 55349084160 | elapsed time per iteration (s): 0.55 | learning rate: 2.315E-05 | global batch size: 256 | lm loss: 2.563380E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.510 | TFLOPs: 44.00 | +7: iteration 105580/ 115203 | consumed samples: 27028480 | consumed tokens: 55354327040 | elapsed time per iteration (s): 0.56 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 2.566040E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.729 | TFLOPs: 43.83 | +7: iteration 105590/ 115203 | consumed samples: 27031040 | consumed tokens: 55359569920 | elapsed time per iteration (s): 0.56 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 2.549520E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.077 | TFLOPs: 43.86 | +7: iteration 105600/ 115203 | consumed samples: 27033600 | consumed tokens: 55364812800 | elapsed time per iteration (s): 0.57 | learning rate: 2.313E-05 | global batch size: 256 | lm loss: 2.545294E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.049 | TFLOPs: 43.00 | +7: iteration 105610/ 115203 | consumed samples: 27036160 | consumed tokens: 55370055680 | elapsed time per iteration (s): 0.55 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 2.558798E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.485 | TFLOPs: 44.00 | +7: iteration 105620/ 115203 | consumed samples: 27038720 | consumed tokens: 55375298560 | elapsed time per iteration (s): 0.56 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 2.550573E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.616 | TFLOPs: 43.72 | +7: iteration 105630/ 115203 | consumed samples: 27041280 | consumed tokens: 55380541440 | elapsed time per iteration (s): 0.56 | learning rate: 2.311E-05 | global batch size: 256 | lm loss: 2.561132E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.044 | TFLOPs: 43.48 | +7: iteration 105640/ 115203 | consumed samples: 27043840 | consumed tokens: 55385784320 | elapsed time per iteration (s): 0.56 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 2.560464E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.266 | TFLOPs: 43.60 | +7: iteration 105650/ 115203 | consumed samples: 27046400 | consumed tokens: 55391027200 | elapsed time per iteration (s): 0.56 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 2.557350E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.541 | TFLOPs: 43.72 | +7: iteration 105660/ 115203 | consumed samples: 27048960 | consumed tokens: 55396270080 | elapsed time per iteration (s): 0.56 | learning rate: 2.309E-05 | global batch size: 256 | lm loss: 2.574754E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.777 | TFLOPs: 43.93 | +7: iteration 105670/ 115203 | consumed samples: 27051520 | consumed tokens: 55401512960 | elapsed time per iteration (s): 0.56 | learning rate: 2.309E-05 | global batch size: 256 | lm loss: 2.565185E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.900 | TFLOPs: 43.47 | +7: iteration 105680/ 115203 | consumed samples: 27054080 | consumed tokens: 55406755840 | elapsed time per iteration (s): 0.56 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 2.572037E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.727 | TFLOPs: 43.35 | +7: iteration 105690/ 115203 | consumed samples: 27056640 | consumed tokens: 55411998720 | elapsed time per iteration (s): 0.55 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 2.543860E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.388 | TFLOPs: 43.99 | +7: iteration 105700/ 115203 | consumed samples: 27059200 | consumed tokens: 55417241600 | elapsed time per iteration (s): 0.56 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 2.563292E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.863 | TFLOPs: 43.75 | +7: iteration 105710/ 115203 | consumed samples: 27061760 | consumed tokens: 55422484480 | elapsed time per iteration (s): 0.56 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 2.567344E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.617 | TFLOPs: 43.72 | +7: iteration 105720/ 115203 | consumed samples: 27064320 | consumed tokens: 55427727360 | elapsed time per iteration (s): 0.56 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 2.560208E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.899 | TFLOPs: 43.56 | +7: iteration 105730/ 115203 | consumed samples: 27066880 | consumed tokens: 55432970240 | elapsed time per iteration (s): 0.55 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 2.555009E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.737 | TFLOPs: 44.02 | +7: iteration 105740/ 115203 | consumed samples: 27069440 | consumed tokens: 55438213120 | elapsed time per iteration (s): 0.55 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 2.562887E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.629 | TFLOPs: 44.01 | +7: iteration 105750/ 115203 | consumed samples: 27072000 | consumed tokens: 55443456000 | elapsed time per iteration (s): 0.55 | learning rate: 2.303E-05 | global batch size: 256 | lm loss: 2.568106E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.694 | TFLOPs: 44.02 | +7: iteration 105760/ 115203 | consumed samples: 27074560 | consumed tokens: 55448698880 | elapsed time per iteration (s): 0.56 | learning rate: 2.303E-05 | global batch size: 256 | lm loss: 2.555294E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.556 | TFLOPs: 43.34 | +7: iteration 105770/ 115203 | consumed samples: 27077120 | consumed tokens: 55453941760 | elapsed time per iteration (s): 0.55 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 2.558418E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.638 | TFLOPs: 44.01 | +7: iteration 105780/ 115203 | consumed samples: 27079680 | consumed tokens: 55459184640 | elapsed time per iteration (s): 0.56 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 2.565652E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.888 | TFLOPs: 43.46 | +7: iteration 105790/ 115203 | consumed samples: 27082240 | consumed tokens: 55464427520 | elapsed time per iteration (s): 0.56 | learning rate: 2.301E-05 | global batch size: 256 | lm loss: 2.552960E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.978 | TFLOPs: 43.47 | +7: iteration 105800/ 115203 | consumed samples: 27084800 | consumed tokens: 55469670400 | elapsed time per iteration (s): 0.56 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 2.564750E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.759 | TFLOPs: 43.83 | +7: iteration 105810/ 115203 | consumed samples: 27087360 | consumed tokens: 55474913280 | elapsed time per iteration (s): 0.55 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 2.558685E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.372 | TFLOPs: 43.99 | +7: iteration 105820/ 115203 | consumed samples: 27089920 | consumed tokens: 55480156160 | elapsed time per iteration (s): 0.56 | learning rate: 2.299E-05 | global batch size: 256 | lm loss: 2.553325E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.994 | TFLOPs: 43.47 | +7: iteration 105830/ 115203 | consumed samples: 27092480 | consumed tokens: 55485399040 | elapsed time per iteration (s): 0.55 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 2.554291E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.484 | TFLOPs: 44.00 | +7: iteration 105840/ 115203 | consumed samples: 27095040 | consumed tokens: 55490641920 | elapsed time per iteration (s): 0.56 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 2.551630E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.854 | TFLOPs: 43.94 | +7: iteration 105850/ 115203 | consumed samples: 27097600 | consumed tokens: 55495884800 | elapsed time per iteration (s): 0.57 | learning rate: 2.297E-05 | global batch size: 256 | lm loss: 2.571072E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.688 | TFLOPs: 42.97 | +7: iteration 105860/ 115203 | consumed samples: 27100160 | consumed tokens: 55501127680 | elapsed time per iteration (s): 0.56 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 2.567522E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.749 | TFLOPs: 43.55 | +7: iteration 105870/ 115203 | consumed samples: 27102720 | consumed tokens: 55506370560 | elapsed time per iteration (s): 0.55 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 2.551084E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.472 | TFLOPs: 44.00 | +7: iteration 105880/ 115203 | consumed samples: 27105280 | consumed tokens: 55511613440 | elapsed time per iteration (s): 0.56 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 2.562103E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.725 | TFLOPs: 43.54 | +7: iteration 105890/ 115203 | consumed samples: 27107840 | consumed tokens: 55516856320 | elapsed time per iteration (s): 0.55 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 2.559735E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.658 | TFLOPs: 44.01 | +7: iteration 105900/ 115203 | consumed samples: 27110400 | consumed tokens: 55522099200 | elapsed time per iteration (s): 0.56 | learning rate: 2.294E-05 | global batch size: 256 | lm loss: 2.558151E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.275 | TFLOPs: 43.50 | +7: iteration 105910/ 115203 | consumed samples: 27112960 | consumed tokens: 55527342080 | elapsed time per iteration (s): 0.56 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 2.554280E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.584 | TFLOPs: 43.72 | +7: iteration 105920/ 115203 | consumed samples: 27115520 | consumed tokens: 55532584960 | elapsed time per iteration (s): 0.55 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 2.548484E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.433 | TFLOPs: 43.99 | +7: iteration 105930/ 115203 | consumed samples: 27118080 | consumed tokens: 55537827840 | elapsed time per iteration (s): 0.55 | learning rate: 2.292E-05 | global batch size: 256 | lm loss: 2.555672E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.437 | TFLOPs: 43.99 | +7: iteration 105940/ 115203 | consumed samples: 27120640 | consumed tokens: 55543070720 | elapsed time per iteration (s): 0.56 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 2.565503E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.173 | TFLOPs: 43.68 | +7: iteration 105950/ 115203 | consumed samples: 27123200 | consumed tokens: 55548313600 | elapsed time per iteration (s): 0.55 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 2.546303E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.327 | TFLOPs: 43.98 | +7: iteration 105960/ 115203 | consumed samples: 27125760 | consumed tokens: 55553556480 | elapsed time per iteration (s): 0.55 | learning rate: 2.290E-05 | global batch size: 256 | lm loss: 2.563265E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.389 | TFLOPs: 43.99 | +7: iteration 105970/ 115203 | consumed samples: 27128320 | consumed tokens: 55558799360 | elapsed time per iteration (s): 0.55 | learning rate: 2.290E-05 | global batch size: 256 | lm loss: 2.566286E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.429 | TFLOPs: 43.99 | +7: iteration 105980/ 115203 | consumed samples: 27130880 | consumed tokens: 55564042240 | elapsed time per iteration (s): 0.55 | learning rate: 2.289E-05 | global batch size: 256 | lm loss: 2.560332E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.555 | TFLOPs: 44.00 | +7: iteration 105990/ 115203 | consumed samples: 27133440 | consumed tokens: 55569285120 | elapsed time per iteration (s): 0.55 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 2.556531E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.661 | TFLOPs: 44.01 | +0: [2023-03-17 05:28:34,197] [INFO] [logging.py:68:log_dist] [Rank 0] step=106000, skipped=0, lr=[2.2876870847544666e-05, 2.2876870847544666e-05, 2.2876870847544666e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 106000/ 115203 | consumed samples: 27136000 | consumed tokens: 55574528000 | elapsed time per iteration (s): 0.55 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 2.558033E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.477 | TFLOPs: 44.00 | +0: steps: 106000 loss: 2.5976 iter time (s): 0.561 samples/sec: 456.654 +7: iteration 106010/ 115203 | consumed samples: 27138560 | consumed tokens: 55579770880 | elapsed time per iteration (s): 0.56 | learning rate: 2.287E-05 | global batch size: 256 | lm loss: 2.551894E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.927 | TFLOPs: 43.94 | +7: iteration 106020/ 115203 | consumed samples: 27141120 | consumed tokens: 55585013760 | elapsed time per iteration (s): 0.55 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 2.568352E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.340 | TFLOPs: 43.98 | +7: iteration 106030/ 115203 | consumed samples: 27143680 | consumed tokens: 55590256640 | elapsed time per iteration (s): 0.55 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 2.564929E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.475 | TFLOPs: 44.00 | +7: iteration 106040/ 115203 | consumed samples: 27146240 | consumed tokens: 55595499520 | elapsed time per iteration (s): 0.55 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 2.556045E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.473 | TFLOPs: 44.00 | +7: iteration 106050/ 115203 | consumed samples: 27148800 | consumed tokens: 55600742400 | elapsed time per iteration (s): 0.56 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 2.550022E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.339 | TFLOPs: 43.70 | +7: iteration 106060/ 115203 | consumed samples: 27151360 | consumed tokens: 55605985280 | elapsed time per iteration (s): 0.55 | learning rate: 2.284E-05 | global batch size: 256 | lm loss: 2.572558E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.380 | TFLOPs: 43.99 | +7: iteration 106070/ 115203 | consumed samples: 27153920 | consumed tokens: 55611228160 | elapsed time per iteration (s): 0.57 | learning rate: 2.283E-05 | global batch size: 256 | lm loss: 2.552580E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.983 | TFLOPs: 42.90 | +7: iteration 106080/ 115203 | consumed samples: 27156480 | consumed tokens: 55616471040 | elapsed time per iteration (s): 0.55 | learning rate: 2.283E-05 | global batch size: 256 | lm loss: 2.562937E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.419 | TFLOPs: 43.99 | +7: iteration 106090/ 115203 | consumed samples: 27159040 | consumed tokens: 55621713920 | elapsed time per iteration (s): 0.56 | learning rate: 2.282E-05 | global batch size: 256 | lm loss: 2.552927E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.436 | TFLOPs: 43.71 | +7: iteration 106100/ 115203 | consumed samples: 27161600 | consumed tokens: 55626956800 | elapsed time per iteration (s): 0.56 | learning rate: 2.282E-05 | global batch size: 256 | lm loss: 2.557333E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.865 | TFLOPs: 43.94 | +7: iteration 106110/ 115203 | consumed samples: 27164160 | consumed tokens: 55632199680 | elapsed time per iteration (s): 0.56 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 2.572767E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.277 | TFLOPs: 43.41 | +7: iteration 106120/ 115203 | consumed samples: 27166720 | consumed tokens: 55637442560 | elapsed time per iteration (s): 0.56 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 2.557111E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.148 | TFLOPs: 43.97 | +7: iteration 106130/ 115203 | consumed samples: 27169280 | consumed tokens: 55642685440 | elapsed time per iteration (s): 0.55 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 2.569351E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.338 | TFLOPs: 43.98 | +7: iteration 106140/ 115203 | consumed samples: 27171840 | consumed tokens: 55647928320 | elapsed time per iteration (s): 0.56 | learning rate: 2.279E-05 | global batch size: 256 | lm loss: 2.564069E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.066 | TFLOPs: 43.96 | +7: iteration 106150/ 115203 | consumed samples: 27174400 | consumed tokens: 55653171200 | elapsed time per iteration (s): 0.56 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 2.579827E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.161 | TFLOPs: 43.97 | +7: iteration 106160/ 115203 | consumed samples: 27176960 | consumed tokens: 55658414080 | elapsed time per iteration (s): 0.56 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 2.560192E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.246 | TFLOPs: 43.97 | +7: iteration 106170/ 115203 | consumed samples: 27179520 | consumed tokens: 55663656960 | elapsed time per iteration (s): 0.57 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 2.578366E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.082 | TFLOPs: 43.10 | +7: iteration 106180/ 115203 | consumed samples: 27182080 | consumed tokens: 55668899840 | elapsed time per iteration (s): 0.56 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 2.567311E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.003 | TFLOPs: 43.57 | +7: iteration 106190/ 115203 | consumed samples: 27184640 | consumed tokens: 55674142720 | elapsed time per iteration (s): 0.57 | learning rate: 2.276E-05 | global batch size: 256 | lm loss: 2.554891E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.082 | TFLOPs: 43.20 | +7: iteration 106200/ 115203 | consumed samples: 27187200 | consumed tokens: 55679385600 | elapsed time per iteration (s): 0.55 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 2.559616E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 106210/ 115203 | consumed samples: 27189760 | consumed tokens: 55684628480 | elapsed time per iteration (s): 0.55 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 2.559089E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.576 | TFLOPs: 44.01 | +7: iteration 106220/ 115203 | consumed samples: 27192320 | consumed tokens: 55689871360 | elapsed time per iteration (s): 0.56 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 2.558172E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.307 | TFLOPs: 43.69 | +7: iteration 106230/ 115203 | consumed samples: 27194880 | consumed tokens: 55695114240 | elapsed time per iteration (s): 0.55 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 2.565840E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.698 | TFLOPs: 44.02 | +7: iteration 106240/ 115203 | consumed samples: 27197440 | consumed tokens: 55700357120 | elapsed time per iteration (s): 0.56 | learning rate: 2.273E-05 | global batch size: 256 | lm loss: 2.561977E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.650 | TFLOPs: 43.73 | +7: iteration 106250/ 115203 | consumed samples: 27200000 | consumed tokens: 55705600000 | elapsed time per iteration (s): 0.55 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 2.567142E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.646 | TFLOPs: 44.01 | +7: iteration 106260/ 115203 | consumed samples: 27202560 | consumed tokens: 55710842880 | elapsed time per iteration (s): 0.56 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 2.553504E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.787 | TFLOPs: 43.45 | +7: iteration 106270/ 115203 | consumed samples: 27205120 | consumed tokens: 55716085760 | elapsed time per iteration (s): 0.55 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 2.556764E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.760 | TFLOPs: 44.02 | +7: iteration 106280/ 115203 | consumed samples: 27207680 | consumed tokens: 55721328640 | elapsed time per iteration (s): 0.55 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 2.558656E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.608 | TFLOPs: 44.01 | +7: iteration 106290/ 115203 | consumed samples: 27210240 | consumed tokens: 55726571520 | elapsed time per iteration (s): 0.55 | learning rate: 2.270E-05 | global batch size: 256 | lm loss: 2.559240E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.610 | TFLOPs: 44.01 | +7: iteration 106300/ 115203 | consumed samples: 27212800 | consumed tokens: 55731814400 | elapsed time per iteration (s): 0.55 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 2.554437E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.293 | TFLOPs: 43.98 | +7: iteration 106310/ 115203 | consumed samples: 27215360 | consumed tokens: 55737057280 | elapsed time per iteration (s): 0.56 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 2.551547E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.422 | TFLOPs: 43.61 | +7: iteration 106320/ 115203 | consumed samples: 27217920 | consumed tokens: 55742300160 | elapsed time per iteration (s): 0.56 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 2.548656E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.674 | TFLOPs: 43.63 | +7: iteration 106330/ 115203 | consumed samples: 27220480 | consumed tokens: 55747543040 | elapsed time per iteration (s): 0.55 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 2.551117E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.813 | TFLOPs: 44.03 | +7: iteration 106340/ 115203 | consumed samples: 27223040 | consumed tokens: 55752785920 | elapsed time per iteration (s): 0.57 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 2.556769E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.177 | TFLOPs: 42.82 | +7: iteration 106350/ 115203 | consumed samples: 27225600 | consumed tokens: 55758028800 | elapsed time per iteration (s): 0.57 | learning rate: 2.266E-05 | global batch size: 256 | lm loss: 2.565062E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.217 | TFLOPs: 43.11 | +7: iteration 106360/ 115203 | consumed samples: 27228160 | consumed tokens: 55763271680 | elapsed time per iteration (s): 0.56 | learning rate: 2.266E-05 | global batch size: 256 | lm loss: 2.554202E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.944 | TFLOPs: 43.47 | +7: iteration 106370/ 115203 | consumed samples: 27230720 | consumed tokens: 55768514560 | elapsed time per iteration (s): 0.55 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 2.558426E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.671 | TFLOPs: 44.02 | +7: iteration 106380/ 115203 | consumed samples: 27233280 | consumed tokens: 55773757440 | elapsed time per iteration (s): 0.56 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 2.570125E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.690 | TFLOPs: 43.45 | +7: iteration 106390/ 115203 | consumed samples: 27235840 | consumed tokens: 55779000320 | elapsed time per iteration (s): 0.55 | learning rate: 2.264E-05 | global batch size: 256 | lm loss: 2.557632E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.656 | TFLOPs: 44.01 | +7: iteration 106400/ 115203 | consumed samples: 27238400 | consumed tokens: 55784243200 | elapsed time per iteration (s): 0.56 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 2.578725E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.209 | TFLOPs: 43.40 | +7: iteration 106410/ 115203 | consumed samples: 27240960 | consumed tokens: 55789486080 | elapsed time per iteration (s): 0.56 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 2.553596E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.332 | TFLOPs: 43.22 | +7: iteration 106420/ 115203 | consumed samples: 27243520 | consumed tokens: 55794728960 | elapsed time per iteration (s): 0.57 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 2.559309E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.283 | TFLOPs: 42.64 | +7: iteration 106430/ 115203 | consumed samples: 27246080 | consumed tokens: 55799971840 | elapsed time per iteration (s): 0.56 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 2.562498E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.890 | TFLOPs: 43.46 | +7: iteration 106440/ 115203 | consumed samples: 27248640 | consumed tokens: 55805214720 | elapsed time per iteration (s): 0.57 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 2.571535E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.338 | TFLOPs: 42.74 | +7: iteration 106450/ 115203 | consumed samples: 27251200 | consumed tokens: 55810457600 | elapsed time per iteration (s): 0.59 | learning rate: 2.260E-05 | global batch size: 256 | lm loss: 2.578389E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.534 | TFLOPs: 41.71 | +7: iteration 106460/ 115203 | consumed samples: 27253760 | consumed tokens: 55815700480 | elapsed time per iteration (s): 0.57 | learning rate: 2.260E-05 | global batch size: 256 | lm loss: 2.574858E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.027 | TFLOPs: 42.62 | +7: iteration 106470/ 115203 | consumed samples: 27256320 | consumed tokens: 55820943360 | elapsed time per iteration (s): 0.57 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 2.561829E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.458 | TFLOPs: 43.04 | +7: iteration 106480/ 115203 | consumed samples: 27258880 | consumed tokens: 55826186240 | elapsed time per iteration (s): 0.56 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 2.560672E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.390 | TFLOPs: 43.61 | +7: iteration 106490/ 115203 | consumed samples: 27261440 | consumed tokens: 55831429120 | elapsed time per iteration (s): 0.57 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 2.563769E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.579 | TFLOPs: 43.15 | +7: iteration 106500/ 115203 | consumed samples: 27264000 | consumed tokens: 55836672000 | elapsed time per iteration (s): 0.56 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 2.565798E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.612 | TFLOPs: 43.25 | +7: iteration 106510/ 115203 | consumed samples: 27266560 | consumed tokens: 55841914880 | elapsed time per iteration (s): 0.57 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 2.550205E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.448 | TFLOPs: 42.47 | +7: iteration 106520/ 115203 | consumed samples: 27269120 | consumed tokens: 55847157760 | elapsed time per iteration (s): 0.58 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 2.556232E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.041 | TFLOPs: 42.14 | +7: iteration 106530/ 115203 | consumed samples: 27271680 | consumed tokens: 55852400640 | elapsed time per iteration (s): 0.57 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 2.563544E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.308 | TFLOPs: 42.65 | +7: iteration 106540/ 115203 | consumed samples: 27274240 | consumed tokens: 55857643520 | elapsed time per iteration (s): 0.57 | learning rate: 2.255E-05 | global batch size: 256 | lm loss: 2.549311E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.874 | TFLOPs: 43.18 | +7: iteration 106550/ 115203 | consumed samples: 27276800 | consumed tokens: 55862886400 | elapsed time per iteration (s): 0.56 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 2.565368E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.467 | TFLOPs: 43.52 | +7: iteration 106560/ 115203 | consumed samples: 27279360 | consumed tokens: 55868129280 | elapsed time per iteration (s): 0.58 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 2.557972E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.597 | TFLOPs: 42.10 | +7: iteration 106570/ 115203 | consumed samples: 27281920 | consumed tokens: 55873372160 | elapsed time per iteration (s): 0.56 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 2.552299E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.325 | TFLOPs: 43.41 | +7: iteration 106580/ 115203 | consumed samples: 27284480 | consumed tokens: 55878615040 | elapsed time per iteration (s): 0.58 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 2.550200E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.664 | TFLOPs: 41.82 | +7: iteration 106590/ 115203 | consumed samples: 27287040 | consumed tokens: 55883857920 | elapsed time per iteration (s): 0.57 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 2.569904E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.432 | TFLOPs: 43.13 | +7: iteration 106600/ 115203 | consumed samples: 27289600 | consumed tokens: 55889100800 | elapsed time per iteration (s): 0.58 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 2.549369E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.271 | TFLOPs: 41.88 | +7: iteration 106610/ 115203 | consumed samples: 27292160 | consumed tokens: 55894343680 | elapsed time per iteration (s): 0.58 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 2.557368E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.082 | TFLOPs: 41.77 | +7: iteration 106620/ 115203 | consumed samples: 27294720 | consumed tokens: 55899586560 | elapsed time per iteration (s): 0.56 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 2.551112E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.052 | TFLOPs: 43.48 | +7: iteration 106630/ 115203 | consumed samples: 27297280 | consumed tokens: 55904829440 | elapsed time per iteration (s): 0.58 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 2.562460E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.601 | TFLOPs: 42.39 | +7: iteration 106640/ 115203 | consumed samples: 27299840 | consumed tokens: 55910072320 | elapsed time per iteration (s): 0.57 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 2.563251E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.596 | TFLOPs: 42.67 | +7: iteration 106650/ 115203 | consumed samples: 27302400 | consumed tokens: 55915315200 | elapsed time per iteration (s): 0.56 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 2.547896E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.181 | TFLOPs: 43.49 | +7: iteration 106660/ 115203 | consumed samples: 27304960 | consumed tokens: 55920558080 | elapsed time per iteration (s): 0.56 | learning rate: 2.248E-05 | global batch size: 256 | lm loss: 2.575958E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.159 | TFLOPs: 43.68 | +7: iteration 106670/ 115203 | consumed samples: 27307520 | consumed tokens: 55925800960 | elapsed time per iteration (s): 0.57 | learning rate: 2.248E-05 | global batch size: 256 | lm loss: 2.554247E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.303 | TFLOPs: 43.12 | +7: iteration 106680/ 115203 | consumed samples: 27310080 | consumed tokens: 55931043840 | elapsed time per iteration (s): 0.58 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 2.563519E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.032 | TFLOPs: 42.33 | +7: iteration 106690/ 115203 | consumed samples: 27312640 | consumed tokens: 55936286720 | elapsed time per iteration (s): 0.57 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 2.555122E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.917 | TFLOPs: 42.80 | +7: iteration 106700/ 115203 | consumed samples: 27315200 | consumed tokens: 55941529600 | elapsed time per iteration (s): 0.57 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 2.567728E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.602 | TFLOPs: 43.06 | +7: iteration 106710/ 115203 | consumed samples: 27317760 | consumed tokens: 55946772480 | elapsed time per iteration (s): 0.58 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 2.543201E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.477 | TFLOPs: 41.99 | +7: iteration 106720/ 115203 | consumed samples: 27320320 | consumed tokens: 55952015360 | elapsed time per iteration (s): 0.57 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 2.564039E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.723 | TFLOPs: 42.97 | +7: iteration 106730/ 115203 | consumed samples: 27322880 | consumed tokens: 55957258240 | elapsed time per iteration (s): 0.57 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 2.555226E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.130 | TFLOPs: 42.63 | +7: iteration 106740/ 115203 | consumed samples: 27325440 | consumed tokens: 55962501120 | elapsed time per iteration (s): 0.57 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 2.557676E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.420 | TFLOPs: 42.75 | +7: iteration 106750/ 115203 | consumed samples: 27328000 | consumed tokens: 55967744000 | elapsed time per iteration (s): 0.57 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 2.557138E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.646 | TFLOPs: 42.49 | +7: iteration 106760/ 115203 | consumed samples: 27330560 | consumed tokens: 55972986880 | elapsed time per iteration (s): 0.57 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 2.564293E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.744 | TFLOPs: 43.07 | +7: iteration 106770/ 115203 | consumed samples: 27333120 | consumed tokens: 55978229760 | elapsed time per iteration (s): 0.57 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 2.566808E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.399 | TFLOPs: 42.85 | +7: iteration 106780/ 115203 | consumed samples: 27335680 | consumed tokens: 55983472640 | elapsed time per iteration (s): 0.58 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 2.571366E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.138 | TFLOPs: 42.25 | +7: iteration 106790/ 115203 | consumed samples: 27338240 | consumed tokens: 55988715520 | elapsed time per iteration (s): 0.58 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 2.567296E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.948 | TFLOPs: 41.85 | +7: iteration 106800/ 115203 | consumed samples: 27340800 | consumed tokens: 55993958400 | elapsed time per iteration (s): 0.56 | learning rate: 2.240E-05 | global batch size: 256 | lm loss: 2.553966E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.910 | TFLOPs: 43.47 | +7: iteration 106810/ 115203 | consumed samples: 27343360 | consumed tokens: 55999201280 | elapsed time per iteration (s): 0.56 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 2.562665E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.396 | TFLOPs: 43.23 | +7: iteration 106820/ 115203 | consumed samples: 27345920 | consumed tokens: 56004444160 | elapsed time per iteration (s): 0.58 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 2.551225E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.149 | TFLOPs: 42.06 | +7: iteration 106830/ 115203 | consumed samples: 27348480 | consumed tokens: 56009687040 | elapsed time per iteration (s): 0.57 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 2.554303E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.774 | TFLOPs: 42.88 | +7: iteration 106840/ 115203 | consumed samples: 27351040 | consumed tokens: 56014929920 | elapsed time per iteration (s): 0.59 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 2.577627E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.310 | TFLOPs: 41.12 | +7: iteration 106850/ 115203 | consumed samples: 27353600 | consumed tokens: 56020172800 | elapsed time per iteration (s): 0.58 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 2.554735E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.033 | TFLOPs: 41.86 | +7: iteration 106860/ 115203 | consumed samples: 27356160 | consumed tokens: 56025415680 | elapsed time per iteration (s): 0.58 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 2.558250E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.240 | TFLOPs: 42.16 | +7: iteration 106870/ 115203 | consumed samples: 27358720 | consumed tokens: 56030658560 | elapsed time per iteration (s): 0.58 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 2.565916E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.233 | TFLOPs: 42.16 | +7: iteration 106880/ 115203 | consumed samples: 27361280 | consumed tokens: 56035901440 | elapsed time per iteration (s): 0.57 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 2.565747E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.784 | TFLOPs: 43.17 | +7: iteration 106890/ 115203 | consumed samples: 27363840 | consumed tokens: 56041144320 | elapsed time per iteration (s): 0.57 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 2.558149E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.523 | TFLOPs: 42.57 | +7: iteration 106900/ 115203 | consumed samples: 27366400 | consumed tokens: 56046387200 | elapsed time per iteration (s): 0.57 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 2.559978E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.646 | TFLOPs: 42.68 | +7: iteration 106910/ 115203 | consumed samples: 27368960 | consumed tokens: 56051630080 | elapsed time per iteration (s): 0.58 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 2.563460E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.783 | TFLOPs: 42.41 | +7: iteration 106920/ 115203 | consumed samples: 27371520 | consumed tokens: 56056872960 | elapsed time per iteration (s): 0.56 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 2.552799E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.288 | TFLOPs: 43.22 | +7: iteration 106930/ 115203 | consumed samples: 27374080 | consumed tokens: 56062115840 | elapsed time per iteration (s): 0.58 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 2.552442E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.806 | TFLOPs: 42.12 | +7: iteration 106940/ 115203 | consumed samples: 27376640 | consumed tokens: 56067358720 | elapsed time per iteration (s): 0.57 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 2.549233E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.107 | TFLOPs: 42.63 | +7: iteration 106950/ 115203 | consumed samples: 27379200 | consumed tokens: 56072601600 | elapsed time per iteration (s): 0.57 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 2.567797E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.526 | TFLOPs: 43.14 | +7: iteration 106960/ 115203 | consumed samples: 27381760 | consumed tokens: 56077844480 | elapsed time per iteration (s): 0.58 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 2.555402E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.431 | TFLOPs: 41.99 | +7: iteration 106970/ 115203 | consumed samples: 27384320 | consumed tokens: 56083087360 | elapsed time per iteration (s): 0.58 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 2.566130E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.885 | TFLOPs: 42.03 | +7: iteration 106980/ 115203 | consumed samples: 27386880 | consumed tokens: 56088330240 | elapsed time per iteration (s): 0.57 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 2.558501E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.010 | TFLOPs: 42.71 | +7: iteration 106990/ 115203 | consumed samples: 27389440 | consumed tokens: 56093573120 | elapsed time per iteration (s): 0.58 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 2.556802E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.170 | TFLOPs: 42.35 | +7: iteration 107000/ 115203 | consumed samples: 27392000 | consumed tokens: 56098816000 | elapsed time per iteration (s): 0.58 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 2.570654E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.582 | TFLOPs: 42.39 | +7: iteration 107010/ 115203 | consumed samples: 27394560 | consumed tokens: 56104058880 | elapsed time per iteration (s): 0.57 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 2.574074E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.017 | TFLOPs: 42.52 | +7: iteration 107020/ 115203 | consumed samples: 27397120 | consumed tokens: 56109301760 | elapsed time per iteration (s): 0.58 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 2.569939E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.966 | TFLOPs: 42.33 | +7: iteration 107030/ 115203 | consumed samples: 27399680 | consumed tokens: 56114544640 | elapsed time per iteration (s): 0.58 | learning rate: 2.227E-05 | global batch size: 256 | lm loss: 2.572583E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.602 | TFLOPs: 42.10 | +7: iteration 107040/ 115203 | consumed samples: 27402240 | consumed tokens: 56119787520 | elapsed time per iteration (s): 0.57 | learning rate: 2.227E-05 | global batch size: 256 | lm loss: 2.563980E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.865 | TFLOPs: 42.89 | +7: iteration 107050/ 115203 | consumed samples: 27404800 | consumed tokens: 56125030400 | elapsed time per iteration (s): 0.59 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 2.541600E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.683 | TFLOPs: 41.54 | +7: iteration 107060/ 115203 | consumed samples: 27407360 | consumed tokens: 56130273280 | elapsed time per iteration (s): 0.58 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 2.550951E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.809 | TFLOPs: 42.41 | +7: iteration 107070/ 115203 | consumed samples: 27409920 | consumed tokens: 56135516160 | elapsed time per iteration (s): 0.58 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 2.571502E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.929 | TFLOPs: 41.85 | +7: iteration 107080/ 115203 | consumed samples: 27412480 | consumed tokens: 56140759040 | elapsed time per iteration (s): 0.57 | learning rate: 2.224E-05 | global batch size: 256 | lm loss: 2.564634E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.811 | TFLOPs: 42.88 | +7: iteration 107090/ 115203 | consumed samples: 27415040 | consumed tokens: 56146001920 | elapsed time per iteration (s): 0.58 | learning rate: 2.224E-05 | global batch size: 256 | lm loss: 2.566340E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.799 | TFLOPs: 42.22 | +7: iteration 107100/ 115203 | consumed samples: 27417600 | consumed tokens: 56151244800 | elapsed time per iteration (s): 0.56 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 2.552667E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.332 | TFLOPs: 43.32 | +7: iteration 107110/ 115203 | consumed samples: 27420160 | consumed tokens: 56156487680 | elapsed time per iteration (s): 0.58 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 2.566510E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.738 | TFLOPs: 41.83 | +7: iteration 107120/ 115203 | consumed samples: 27422720 | consumed tokens: 56161730560 | elapsed time per iteration (s): 0.59 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 2.557670E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.464 | TFLOPs: 41.61 | +7: iteration 107130/ 115203 | consumed samples: 27425280 | consumed tokens: 56166973440 | elapsed time per iteration (s): 0.59 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 2.561965E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.630 | TFLOPs: 41.44 | +7: iteration 107140/ 115203 | consumed samples: 27427840 | consumed tokens: 56172216320 | elapsed time per iteration (s): 0.57 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 2.561794E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.642 | TFLOPs: 43.06 | +7: iteration 107150/ 115203 | consumed samples: 27430400 | consumed tokens: 56177459200 | elapsed time per iteration (s): 0.58 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 2.556514E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.875 | TFLOPs: 42.41 | +7: iteration 107160/ 115203 | consumed samples: 27432960 | consumed tokens: 56182702080 | elapsed time per iteration (s): 0.59 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 2.565846E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.763 | TFLOPs: 41.55 | +7: iteration 107170/ 115203 | consumed samples: 27435520 | consumed tokens: 56187944960 | elapsed time per iteration (s): 0.59 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 2.553741E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.360 | TFLOPs: 41.41 | +7: iteration 107180/ 115203 | consumed samples: 27438080 | consumed tokens: 56193187840 | elapsed time per iteration (s): 0.56 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 2.562171E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.940 | TFLOPs: 43.75 | +7: iteration 107190/ 115203 | consumed samples: 27440640 | consumed tokens: 56198430720 | elapsed time per iteration (s): 0.56 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 2.548695E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.818 | TFLOPs: 43.46 | +7: iteration 107200/ 115203 | consumed samples: 27443200 | consumed tokens: 56203673600 | elapsed time per iteration (s): 0.57 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 2.559916E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.699 | TFLOPs: 43.16 | +7: iteration 107210/ 115203 | consumed samples: 27445760 | consumed tokens: 56208916480 | elapsed time per iteration (s): 0.59 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 2.552398E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.193 | TFLOPs: 41.68 | +7: iteration 107220/ 115203 | consumed samples: 27448320 | consumed tokens: 56214159360 | elapsed time per iteration (s): 0.57 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 2.553825E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.555 | TFLOPs: 43.15 | +7: iteration 107230/ 115203 | consumed samples: 27450880 | consumed tokens: 56219402240 | elapsed time per iteration (s): 0.56 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 2.564082E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.258 | TFLOPs: 43.88 | +7: iteration 107240/ 115203 | consumed samples: 27453440 | consumed tokens: 56224645120 | elapsed time per iteration (s): 0.56 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 2.553246E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.714 | TFLOPs: 43.64 | +7: iteration 107250/ 115203 | consumed samples: 27456000 | consumed tokens: 56229888000 | elapsed time per iteration (s): 0.59 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 2.554092E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.458 | TFLOPs: 41.33 | +7: iteration 107260/ 115203 | consumed samples: 27458560 | consumed tokens: 56235130880 | elapsed time per iteration (s): 0.59 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 2.565166E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.967 | TFLOPs: 41.66 | +7: iteration 107270/ 115203 | consumed samples: 27461120 | consumed tokens: 56240373760 | elapsed time per iteration (s): 0.57 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 2.565000E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.299 | TFLOPs: 42.74 | +7: iteration 107280/ 115203 | consumed samples: 27463680 | consumed tokens: 56245616640 | elapsed time per iteration (s): 0.57 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 2.566770E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.779 | TFLOPs: 42.60 | +7: iteration 107290/ 115203 | consumed samples: 27466240 | consumed tokens: 56250859520 | elapsed time per iteration (s): 0.59 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 2.535860E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.886 | TFLOPs: 41.56 | +7: iteration 107300/ 115203 | consumed samples: 27468800 | consumed tokens: 56256102400 | elapsed time per iteration (s): 0.57 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 2.553479E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.866 | TFLOPs: 42.70 | +7: iteration 107310/ 115203 | consumed samples: 27471360 | consumed tokens: 56261345280 | elapsed time per iteration (s): 0.57 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 2.549306E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.388 | TFLOPs: 42.75 | +7: iteration 107320/ 115203 | consumed samples: 27473920 | consumed tokens: 56266588160 | elapsed time per iteration (s): 0.59 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 2.561314E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.756 | TFLOPs: 41.07 | +7: iteration 107330/ 115203 | consumed samples: 27476480 | consumed tokens: 56271831040 | elapsed time per iteration (s): 0.58 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 2.559688E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.937 | TFLOPs: 42.42 | +7: iteration 107340/ 115203 | consumed samples: 27479040 | consumed tokens: 56277073920 | elapsed time per iteration (s): 0.57 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 2.544072E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.226 | TFLOPs: 42.45 | +7: iteration 107350/ 115203 | consumed samples: 27481600 | consumed tokens: 56282316800 | elapsed time per iteration (s): 0.56 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 2.556394E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.456 | TFLOPs: 43.80 | +7: iteration 107360/ 115203 | consumed samples: 27484160 | consumed tokens: 56287559680 | elapsed time per iteration (s): 0.56 | learning rate: 2.209E-05 | global batch size: 256 | lm loss: 2.567402E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.334 | TFLOPs: 43.41 | +7: iteration 107370/ 115203 | consumed samples: 27486720 | consumed tokens: 56292802560 | elapsed time per iteration (s): 0.56 | learning rate: 2.209E-05 | global batch size: 256 | lm loss: 2.565833E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.718 | TFLOPs: 43.26 | +7: iteration 107380/ 115203 | consumed samples: 27489280 | consumed tokens: 56298045440 | elapsed time per iteration (s): 0.57 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 2.553212E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.477 | TFLOPs: 42.47 | +7: iteration 107390/ 115203 | consumed samples: 27491840 | consumed tokens: 56303288320 | elapsed time per iteration (s): 0.61 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 2.565676E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 420.058 | TFLOPs: 40.05 | +7: iteration 107400/ 115203 | consumed samples: 27494400 | consumed tokens: 56308531200 | elapsed time per iteration (s): 0.57 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 2.550020E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.876 | TFLOPs: 42.60 | +7: iteration 107410/ 115203 | consumed samples: 27496960 | consumed tokens: 56313774080 | elapsed time per iteration (s): 0.56 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 2.563737E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.820 | TFLOPs: 43.65 | +7: iteration 107420/ 115203 | consumed samples: 27499520 | consumed tokens: 56319016960 | elapsed time per iteration (s): 0.58 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 2.569596E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.101 | TFLOPs: 42.44 | +7: iteration 107430/ 115203 | consumed samples: 27502080 | consumed tokens: 56324259840 | elapsed time per iteration (s): 0.57 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 2.560984E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.870 | TFLOPs: 42.70 | +7: iteration 107440/ 115203 | consumed samples: 27504640 | consumed tokens: 56329502720 | elapsed time per iteration (s): 0.57 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 2.561286E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.348 | TFLOPs: 43.03 | +7: iteration 107450/ 115203 | consumed samples: 27507200 | consumed tokens: 56334745600 | elapsed time per iteration (s): 0.56 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 2.558819E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.364 | TFLOPs: 43.80 | +7: iteration 107460/ 115203 | consumed samples: 27509760 | consumed tokens: 56339988480 | elapsed time per iteration (s): 0.57 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 2.556114E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.753 | TFLOPs: 42.78 | +7: iteration 107470/ 115203 | consumed samples: 27512320 | consumed tokens: 56345231360 | elapsed time per iteration (s): 0.58 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 2.542946E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.677 | TFLOPs: 41.73 | +7: iteration 107480/ 115203 | consumed samples: 27514880 | consumed tokens: 56350474240 | elapsed time per iteration (s): 0.57 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 2.568359E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.611 | TFLOPs: 43.06 | +7: iteration 107490/ 115203 | consumed samples: 27517440 | consumed tokens: 56355717120 | elapsed time per iteration (s): 0.57 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 2.553887E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.699 | TFLOPs: 42.87 | +7: iteration 107500/ 115203 | consumed samples: 27520000 | consumed tokens: 56360960000 | elapsed time per iteration (s): 0.57 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 2.556666E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.526 | TFLOPs: 42.48 | +7: iteration 107510/ 115203 | consumed samples: 27522560 | consumed tokens: 56366202880 | elapsed time per iteration (s): 0.56 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 2.576282E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.196 | TFLOPs: 43.49 | +7: iteration 107520/ 115203 | consumed samples: 27525120 | consumed tokens: 56371445760 | elapsed time per iteration (s): 0.56 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 2.568468E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.574 | TFLOPs: 43.43 | +7: iteration 107530/ 115203 | consumed samples: 27527680 | consumed tokens: 56376688640 | elapsed time per iteration (s): 0.58 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 2.557990E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.166 | TFLOPs: 41.77 | +7: iteration 107540/ 115203 | consumed samples: 27530240 | consumed tokens: 56381931520 | elapsed time per iteration (s): 0.57 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 2.568576E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.777 | TFLOPs: 43.17 | +7: iteration 107550/ 115203 | consumed samples: 27532800 | consumed tokens: 56387174400 | elapsed time per iteration (s): 0.60 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 2.550071E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.673 | TFLOPs: 40.87 | +7: iteration 107560/ 115203 | consumed samples: 27535360 | consumed tokens: 56392417280 | elapsed time per iteration (s): 0.57 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 2.548439E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.989 | TFLOPs: 43.00 | +7: iteration 107570/ 115203 | consumed samples: 27537920 | consumed tokens: 56397660160 | elapsed time per iteration (s): 0.57 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 2.547469E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.238 | TFLOPs: 42.93 | +7: iteration 107580/ 115203 | consumed samples: 27540480 | consumed tokens: 56402903040 | elapsed time per iteration (s): 0.59 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 2.561884E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.746 | TFLOPs: 41.07 | +7: iteration 107590/ 115203 | consumed samples: 27543040 | consumed tokens: 56408145920 | elapsed time per iteration (s): 0.57 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 2.564227E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.668 | TFLOPs: 43.16 | +7: iteration 107600/ 115203 | consumed samples: 27545600 | consumed tokens: 56413388800 | elapsed time per iteration (s): 0.59 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 2.549533E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.071 | TFLOPs: 41.38 | +7: iteration 107610/ 115203 | consumed samples: 27548160 | consumed tokens: 56418631680 | elapsed time per iteration (s): 0.59 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 2.551679E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.260 | TFLOPs: 41.12 | +7: iteration 107620/ 115203 | consumed samples: 27550720 | consumed tokens: 56423874560 | elapsed time per iteration (s): 0.59 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 2.560810E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.129 | TFLOPs: 41.10 | +7: iteration 107630/ 115203 | consumed samples: 27553280 | consumed tokens: 56429117440 | elapsed time per iteration (s): 0.58 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 2.566291E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.227 | TFLOPs: 42.35 | +7: iteration 107640/ 115203 | consumed samples: 27555840 | consumed tokens: 56434360320 | elapsed time per iteration (s): 0.57 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 2.558894E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.125 | TFLOPs: 42.63 | +7: iteration 107650/ 115203 | consumed samples: 27558400 | consumed tokens: 56439603200 | elapsed time per iteration (s): 0.61 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 2.548835E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 419.918 | TFLOPs: 40.03 | +7: iteration 107660/ 115203 | consumed samples: 27560960 | consumed tokens: 56444846080 | elapsed time per iteration (s): 0.59 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 2.559571E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.474 | TFLOPs: 41.42 | +7: iteration 107670/ 115203 | consumed samples: 27563520 | consumed tokens: 56450088960 | elapsed time per iteration (s): 0.57 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 2.538184E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.677 | TFLOPs: 42.97 | +7: iteration 107680/ 115203 | consumed samples: 27566080 | consumed tokens: 56455331840 | elapsed time per iteration (s): 0.56 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 2.558211E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.160 | TFLOPs: 43.20 | +7: iteration 107690/ 115203 | consumed samples: 27568640 | consumed tokens: 56460574720 | elapsed time per iteration (s): 0.58 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 2.564253E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.399 | TFLOPs: 41.80 | +7: iteration 107700/ 115203 | consumed samples: 27571200 | consumed tokens: 56465817600 | elapsed time per iteration (s): 0.57 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 2.560221E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.016 | TFLOPs: 42.71 | +7: iteration 107710/ 115203 | consumed samples: 27573760 | consumed tokens: 56471060480 | elapsed time per iteration (s): 0.57 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 2.547744E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.737 | TFLOPs: 42.59 | +7: iteration 107720/ 115203 | consumed samples: 27576320 | consumed tokens: 56476303360 | elapsed time per iteration (s): 0.60 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 2.558175E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.714 | TFLOPs: 40.59 | +7: iteration 107730/ 115203 | consumed samples: 27578880 | consumed tokens: 56481546240 | elapsed time per iteration (s): 0.63 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 2.556228E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 405.608 | TFLOPs: 38.67 | +7: iteration 107740/ 115203 | consumed samples: 27581440 | consumed tokens: 56486789120 | elapsed time per iteration (s): 0.58 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 2.548234E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.962 | TFLOPs: 42.23 | +7: iteration 107750/ 115203 | consumed samples: 27584000 | consumed tokens: 56492032000 | elapsed time per iteration (s): 0.59 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 2.556555E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.045 | TFLOPs: 41.67 | +7: iteration 107760/ 115203 | consumed samples: 27586560 | consumed tokens: 56497274880 | elapsed time per iteration (s): 0.62 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 2.562960E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 416.218 | TFLOPs: 39.68 | +7: iteration 107770/ 115203 | consumed samples: 27589120 | consumed tokens: 56502517760 | elapsed time per iteration (s): 0.57 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 2.554865E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.712 | TFLOPs: 42.59 | +7: iteration 107780/ 115203 | consumed samples: 27591680 | consumed tokens: 56507760640 | elapsed time per iteration (s): 0.62 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 2.547979E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 414.918 | TFLOPs: 39.56 | +7: iteration 107790/ 115203 | consumed samples: 27594240 | consumed tokens: 56513003520 | elapsed time per iteration (s): 0.57 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 2.564606E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.199 | TFLOPs: 42.64 | +7: iteration 107800/ 115203 | consumed samples: 27596800 | consumed tokens: 56518246400 | elapsed time per iteration (s): 0.59 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 2.546831E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.352 | TFLOPs: 41.70 | +7: iteration 107810/ 115203 | consumed samples: 27599360 | consumed tokens: 56523489280 | elapsed time per iteration (s): 0.60 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 2.567707E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 424.522 | TFLOPs: 40.47 | +7: iteration 107820/ 115203 | consumed samples: 27601920 | consumed tokens: 56528732160 | elapsed time per iteration (s): 0.59 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 2.554149E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.714 | TFLOPs: 41.25 | +7: iteration 107830/ 115203 | consumed samples: 27604480 | consumed tokens: 56533975040 | elapsed time per iteration (s): 0.57 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 2.552063E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.856 | TFLOPs: 42.79 | +7: iteration 107840/ 115203 | consumed samples: 27607040 | consumed tokens: 56539217920 | elapsed time per iteration (s): 0.57 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 2.550994E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.419 | TFLOPs: 42.56 | +7: iteration 107850/ 115203 | consumed samples: 27609600 | consumed tokens: 56544460800 | elapsed time per iteration (s): 0.56 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 2.554889E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.287 | TFLOPs: 43.22 | +7: iteration 107860/ 115203 | consumed samples: 27612160 | consumed tokens: 56549703680 | elapsed time per iteration (s): 0.57 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 2.552361E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.560 | TFLOPs: 42.57 | +7: iteration 107870/ 115203 | consumed samples: 27614720 | consumed tokens: 56554946560 | elapsed time per iteration (s): 0.57 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 2.573748E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.364 | TFLOPs: 42.75 | +7: iteration 107880/ 115203 | consumed samples: 27617280 | consumed tokens: 56560189440 | elapsed time per iteration (s): 0.57 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 2.547277E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.408 | TFLOPs: 42.46 | +7: iteration 107890/ 115203 | consumed samples: 27619840 | consumed tokens: 56565432320 | elapsed time per iteration (s): 0.59 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 2.553222E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.284 | TFLOPs: 41.59 | +7: iteration 107900/ 115203 | consumed samples: 27622400 | consumed tokens: 56570675200 | elapsed time per iteration (s): 0.57 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 2.541648E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.435 | TFLOPs: 42.85 | +7: iteration 107910/ 115203 | consumed samples: 27624960 | consumed tokens: 56575918080 | elapsed time per iteration (s): 0.56 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 2.562827E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.185 | TFLOPs: 43.49 | +7: iteration 107920/ 115203 | consumed samples: 27627520 | consumed tokens: 56581160960 | elapsed time per iteration (s): 0.57 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 2.554987E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.165 | TFLOPs: 43.11 | +7: iteration 107930/ 115203 | consumed samples: 27630080 | consumed tokens: 56586403840 | elapsed time per iteration (s): 0.57 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 2.553735E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.866 | TFLOPs: 42.79 | +7: iteration 107940/ 115203 | consumed samples: 27632640 | consumed tokens: 56591646720 | elapsed time per iteration (s): 0.58 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 2.561494E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.385 | TFLOPs: 42.37 | +7: iteration 107950/ 115203 | consumed samples: 27635200 | consumed tokens: 56596889600 | elapsed time per iteration (s): 0.58 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 2.558910E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.381 | TFLOPs: 42.08 | +7: iteration 107960/ 115203 | consumed samples: 27637760 | consumed tokens: 56602132480 | elapsed time per iteration (s): 0.59 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 2.555199E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.071 | TFLOPs: 41.38 | +7: iteration 107970/ 115203 | consumed samples: 27640320 | consumed tokens: 56607375360 | elapsed time per iteration (s): 0.59 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 2.560400E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.087 | TFLOPs: 41.39 | +7: iteration 107980/ 115203 | consumed samples: 27642880 | consumed tokens: 56612618240 | elapsed time per iteration (s): 0.58 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 2.550675E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.429 | TFLOPs: 42.18 | +7: iteration 107990/ 115203 | consumed samples: 27645440 | consumed tokens: 56617861120 | elapsed time per iteration (s): 0.57 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 2.559274E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.279 | TFLOPs: 43.02 | +0: [2023-03-17 05:47:38,453] [INFO] [logging.py:68:log_dist] [Rank 0] step=108000, skipped=0, lr=[2.176608969325893e-05, 2.176608969325893e-05, 2.176608969325893e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 108000/ 115203 | consumed samples: 27648000 | consumed tokens: 56623104000 | elapsed time per iteration (s): 0.59 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 2.562329E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.524 | TFLOPs: 41.62 | +0: steps: 108000 loss: 2.5547 iter time (s): 0.570 samples/sec: 449.359 +7: iteration 108010/ 115203 | consumed samples: 27650560 | consumed tokens: 56628346880 | elapsed time per iteration (s): 0.57 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 2.562945E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.965 | TFLOPs: 42.90 | +7: iteration 108020/ 115203 | consumed samples: 27653120 | consumed tokens: 56633589760 | elapsed time per iteration (s): 0.56 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 2.553438E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.247 | TFLOPs: 43.31 | +7: iteration 108030/ 115203 | consumed samples: 27655680 | consumed tokens: 56638832640 | elapsed time per iteration (s): 0.56 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 2.565001E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.306 | TFLOPs: 43.60 | +7: iteration 108040/ 115203 | consumed samples: 27658240 | consumed tokens: 56644075520 | elapsed time per iteration (s): 0.57 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 2.558706E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.047 | TFLOPs: 43.10 | +7: iteration 108050/ 115203 | consumed samples: 27660800 | consumed tokens: 56649318400 | elapsed time per iteration (s): 0.56 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 2.554439E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.861 | TFLOPs: 43.46 | +7: iteration 108060/ 115203 | consumed samples: 27663360 | consumed tokens: 56654561280 | elapsed time per iteration (s): 0.56 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 2.558957E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.208 | TFLOPs: 43.49 | +7: iteration 108070/ 115203 | consumed samples: 27665920 | consumed tokens: 56659804160 | elapsed time per iteration (s): 0.57 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 2.571659E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.588 | TFLOPs: 43.05 | +7: iteration 108080/ 115203 | consumed samples: 27668480 | consumed tokens: 56665047040 | elapsed time per iteration (s): 0.59 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 2.554572E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.678 | TFLOPs: 41.35 | +7: iteration 108090/ 115203 | consumed samples: 27671040 | consumed tokens: 56670289920 | elapsed time per iteration (s): 0.58 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 2.557594E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.328 | TFLOPs: 41.79 | +7: iteration 108100/ 115203 | consumed samples: 27673600 | consumed tokens: 56675532800 | elapsed time per iteration (s): 0.57 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 2.551738E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.025 | TFLOPs: 42.81 | +7: iteration 108110/ 115203 | consumed samples: 27676160 | consumed tokens: 56680775680 | elapsed time per iteration (s): 0.56 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 2.566915E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.903 | TFLOPs: 43.27 | +7: iteration 108120/ 115203 | consumed samples: 27678720 | consumed tokens: 56686018560 | elapsed time per iteration (s): 0.58 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 2.561713E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.993 | TFLOPs: 42.43 | +7: iteration 108130/ 115203 | consumed samples: 27681280 | consumed tokens: 56691261440 | elapsed time per iteration (s): 0.57 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 2.565262E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.567 | TFLOPs: 42.86 | +7: iteration 108140/ 115203 | consumed samples: 27683840 | consumed tokens: 56696504320 | elapsed time per iteration (s): 0.59 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 2.553256E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.027 | TFLOPs: 41.57 | +7: iteration 108150/ 115203 | consumed samples: 27686400 | consumed tokens: 56701747200 | elapsed time per iteration (s): 0.58 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 2.560408E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.419 | TFLOPs: 41.80 | +7: iteration 108160/ 115203 | consumed samples: 27688960 | consumed tokens: 56706990080 | elapsed time per iteration (s): 0.58 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 2.554680E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.817 | TFLOPs: 41.84 | +7: iteration 108170/ 115203 | consumed samples: 27691520 | consumed tokens: 56712232960 | elapsed time per iteration (s): 0.57 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 2.559077E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.802 | TFLOPs: 42.88 | +7: iteration 108180/ 115203 | consumed samples: 27694080 | consumed tokens: 56717475840 | elapsed time per iteration (s): 0.57 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 2.567875E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.531 | TFLOPs: 43.14 | +7: iteration 108190/ 115203 | consumed samples: 27696640 | consumed tokens: 56722718720 | elapsed time per iteration (s): 0.57 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 2.550269E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.908 | TFLOPs: 42.89 | +7: iteration 108200/ 115203 | consumed samples: 27699200 | consumed tokens: 56727961600 | elapsed time per iteration (s): 0.57 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 2.547499E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.323 | TFLOPs: 42.65 | +7: iteration 108210/ 115203 | consumed samples: 27701760 | consumed tokens: 56733204480 | elapsed time per iteration (s): 0.58 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 2.559507E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.744 | TFLOPs: 41.83 | +7: iteration 108220/ 115203 | consumed samples: 27704320 | consumed tokens: 56738447360 | elapsed time per iteration (s): 0.57 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 2.555474E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.195 | TFLOPs: 43.02 | +7: iteration 108230/ 115203 | consumed samples: 27706880 | consumed tokens: 56743690240 | elapsed time per iteration (s): 0.57 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 2.566749E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.995 | TFLOPs: 42.71 | +7: iteration 108240/ 115203 | consumed samples: 27709440 | consumed tokens: 56748933120 | elapsed time per iteration (s): 0.57 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 2.558789E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.731 | TFLOPs: 42.78 | +7: iteration 108250/ 115203 | consumed samples: 27712000 | consumed tokens: 56754176000 | elapsed time per iteration (s): 0.57 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 2.564443E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.048 | TFLOPs: 43.10 | +7: iteration 108260/ 115203 | consumed samples: 27714560 | consumed tokens: 56759418880 | elapsed time per iteration (s): 0.57 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 2.569826E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.435 | TFLOPs: 42.85 | +7: iteration 108270/ 115203 | consumed samples: 27717120 | consumed tokens: 56764661760 | elapsed time per iteration (s): 0.57 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 2.564289E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.161 | TFLOPs: 42.82 | +7: iteration 108280/ 115203 | consumed samples: 27719680 | consumed tokens: 56769904640 | elapsed time per iteration (s): 0.58 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 2.562124E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.692 | TFLOPs: 42.40 | +7: iteration 108290/ 115203 | consumed samples: 27722240 | consumed tokens: 56775147520 | elapsed time per iteration (s): 0.58 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 2.560166E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.929 | TFLOPs: 42.13 | +7: iteration 108300/ 115203 | consumed samples: 27724800 | consumed tokens: 56780390400 | elapsed time per iteration (s): 0.57 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 2.545988E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.183 | TFLOPs: 43.02 | +7: iteration 108310/ 115203 | consumed samples: 27727360 | consumed tokens: 56785633280 | elapsed time per iteration (s): 0.56 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 2.557598E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.495 | TFLOPs: 43.43 | +7: iteration 108320/ 115203 | consumed samples: 27729920 | consumed tokens: 56790876160 | elapsed time per iteration (s): 0.57 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 2.549901E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.040 | TFLOPs: 42.62 | +7: iteration 108330/ 115203 | consumed samples: 27732480 | consumed tokens: 56796119040 | elapsed time per iteration (s): 0.58 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 2.567886E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.750 | TFLOPs: 42.40 | +7: iteration 108340/ 115203 | consumed samples: 27735040 | consumed tokens: 56801361920 | elapsed time per iteration (s): 0.57 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 2.565917E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.934 | TFLOPs: 42.90 | +7: iteration 108350/ 115203 | consumed samples: 27737600 | consumed tokens: 56806604800 | elapsed time per iteration (s): 0.57 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 2.559746E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.076 | TFLOPs: 42.72 | +7: iteration 108360/ 115203 | consumed samples: 27740160 | consumed tokens: 56811847680 | elapsed time per iteration (s): 0.56 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 2.562640E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.152 | TFLOPs: 43.58 | +7: iteration 108370/ 115203 | consumed samples: 27742720 | consumed tokens: 56817090560 | elapsed time per iteration (s): 0.57 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 2.558631E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.600 | TFLOPs: 42.86 | +7: iteration 108380/ 115203 | consumed samples: 27745280 | consumed tokens: 56822333440 | elapsed time per iteration (s): 0.57 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 2.548408E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.721 | TFLOPs: 43.16 | +7: iteration 108390/ 115203 | consumed samples: 27747840 | consumed tokens: 56827576320 | elapsed time per iteration (s): 0.57 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 2.557827E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.274 | TFLOPs: 42.55 | +7: iteration 108400/ 115203 | consumed samples: 27750400 | consumed tokens: 56832819200 | elapsed time per iteration (s): 0.57 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 2.560864E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.975 | TFLOPs: 42.80 | +7: iteration 108410/ 115203 | consumed samples: 27752960 | consumed tokens: 56838062080 | elapsed time per iteration (s): 0.57 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 2.553960E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.771 | TFLOPs: 43.17 | +7: iteration 108420/ 115203 | consumed samples: 27755520 | consumed tokens: 56843304960 | elapsed time per iteration (s): 0.57 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 2.550280E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.153 | TFLOPs: 42.73 | +7: iteration 108430/ 115203 | consumed samples: 27758080 | consumed tokens: 56848547840 | elapsed time per iteration (s): 0.57 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 2.546180E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.285 | TFLOPs: 42.64 | +7: iteration 108440/ 115203 | consumed samples: 27760640 | consumed tokens: 56853790720 | elapsed time per iteration (s): 0.57 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 2.548527E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.167 | TFLOPs: 43.01 | +7: iteration 108450/ 115203 | consumed samples: 27763200 | consumed tokens: 56859033600 | elapsed time per iteration (s): 0.57 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 2.548628E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.349 | TFLOPs: 42.65 | +7: iteration 108460/ 115203 | consumed samples: 27765760 | consumed tokens: 56864276480 | elapsed time per iteration (s): 0.56 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 2.562926E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.206 | TFLOPs: 43.88 | +7: iteration 108470/ 115203 | consumed samples: 27768320 | consumed tokens: 56869519360 | elapsed time per iteration (s): 0.57 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 2.552001E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.440 | TFLOPs: 42.75 | +7: iteration 108480/ 115203 | consumed samples: 27770880 | consumed tokens: 56874762240 | elapsed time per iteration (s): 0.57 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 2.576378E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.875 | TFLOPs: 42.99 | +7: iteration 108490/ 115203 | consumed samples: 27773440 | consumed tokens: 56880005120 | elapsed time per iteration (s): 0.57 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 2.563773E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.590 | TFLOPs: 43.15 | +7: iteration 108500/ 115203 | consumed samples: 27776000 | consumed tokens: 56885248000 | elapsed time per iteration (s): 0.56 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 2.555142E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.233 | TFLOPs: 43.50 | +7: iteration 108510/ 115203 | consumed samples: 27778560 | consumed tokens: 56890490880 | elapsed time per iteration (s): 0.57 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 2.559225E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.189 | TFLOPs: 42.92 | +7: iteration 108520/ 115203 | consumed samples: 27781120 | consumed tokens: 56895733760 | elapsed time per iteration (s): 0.57 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 2.562230E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.786 | TFLOPs: 42.50 | +7: iteration 108530/ 115203 | consumed samples: 27783680 | consumed tokens: 56900976640 | elapsed time per iteration (s): 0.56 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 2.554968E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.293 | TFLOPs: 43.69 | +7: iteration 108540/ 115203 | consumed samples: 27786240 | consumed tokens: 56906219520 | elapsed time per iteration (s): 0.57 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 2.561758E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.707 | TFLOPs: 42.87 | +7: iteration 108550/ 115203 | consumed samples: 27788800 | consumed tokens: 56911462400 | elapsed time per iteration (s): 0.56 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 2.561692E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.042 | TFLOPs: 43.57 | +7: iteration 108560/ 115203 | consumed samples: 27791360 | consumed tokens: 56916705280 | elapsed time per iteration (s): 0.57 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 2.562937E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.209 | TFLOPs: 42.73 | +7: iteration 108570/ 115203 | consumed samples: 27793920 | consumed tokens: 56921948160 | elapsed time per iteration (s): 0.57 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 2.551968E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.073 | TFLOPs: 43.10 | +7: iteration 108580/ 115203 | consumed samples: 27796480 | consumed tokens: 56927191040 | elapsed time per iteration (s): 0.59 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 2.549297E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.660 | TFLOPs: 41.15 | +7: iteration 108590/ 115203 | consumed samples: 27799040 | consumed tokens: 56932433920 | elapsed time per iteration (s): 0.56 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 2.534355E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.170 | TFLOPs: 43.20 | +7: iteration 108600/ 115203 | consumed samples: 27801600 | consumed tokens: 56937676800 | elapsed time per iteration (s): 0.56 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 2.562315E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.010 | TFLOPs: 43.28 | +7: iteration 108610/ 115203 | consumed samples: 27804160 | consumed tokens: 56942919680 | elapsed time per iteration (s): 0.57 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 2.555180E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.227 | TFLOPs: 42.92 | +7: iteration 108620/ 115203 | consumed samples: 27806720 | consumed tokens: 56948162560 | elapsed time per iteration (s): 0.58 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 2.559903E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.126 | TFLOPs: 41.87 | +7: iteration 108630/ 115203 | consumed samples: 27809280 | consumed tokens: 56953405440 | elapsed time per iteration (s): 0.58 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 2.562381E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.327 | TFLOPs: 42.17 | +7: iteration 108640/ 115203 | consumed samples: 27811840 | consumed tokens: 56958648320 | elapsed time per iteration (s): 0.56 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 2.566627E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.939 | TFLOPs: 43.47 | +7: iteration 108650/ 115203 | consumed samples: 27814400 | consumed tokens: 56963891200 | elapsed time per iteration (s): 0.57 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 2.560851E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.984 | TFLOPs: 42.90 | +7: iteration 108660/ 115203 | consumed samples: 27816960 | consumed tokens: 56969134080 | elapsed time per iteration (s): 0.57 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 2.557955E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.442 | TFLOPs: 43.04 | +7: iteration 108670/ 115203 | consumed samples: 27819520 | consumed tokens: 56974376960 | elapsed time per iteration (s): 0.56 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 2.557777E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.781 | TFLOPs: 43.74 | +7: iteration 108680/ 115203 | consumed samples: 27822080 | consumed tokens: 56979619840 | elapsed time per iteration (s): 0.55 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 2.555973E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.639 | TFLOPs: 44.01 | +7: iteration 108690/ 115203 | consumed samples: 27824640 | consumed tokens: 56984862720 | elapsed time per iteration (s): 0.57 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 2.560208E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.085 | TFLOPs: 42.91 | +7: iteration 108700/ 115203 | consumed samples: 27827200 | consumed tokens: 56990105600 | elapsed time per iteration (s): 0.56 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 2.569930E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.912 | TFLOPs: 43.47 | +7: iteration 108710/ 115203 | consumed samples: 27829760 | consumed tokens: 56995348480 | elapsed time per iteration (s): 0.57 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 2.546971E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.004 | TFLOPs: 43.00 | +7: iteration 108720/ 115203 | consumed samples: 27832320 | consumed tokens: 57000591360 | elapsed time per iteration (s): 0.57 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 2.539694E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.152 | TFLOPs: 43.11 | +7: iteration 108730/ 115203 | consumed samples: 27834880 | consumed tokens: 57005834240 | elapsed time per iteration (s): 0.56 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 2.556045E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.107 | TFLOPs: 43.20 | +7: iteration 108740/ 115203 | consumed samples: 27837440 | consumed tokens: 57011077120 | elapsed time per iteration (s): 0.57 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 2.557509E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.842 | TFLOPs: 43.08 | +7: iteration 108750/ 115203 | consumed samples: 27840000 | consumed tokens: 57016320000 | elapsed time per iteration (s): 0.56 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 2.568769E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.791 | TFLOPs: 43.93 | +7: iteration 108760/ 115203 | consumed samples: 27842560 | consumed tokens: 57021562880 | elapsed time per iteration (s): 0.56 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 2.548157E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.806 | TFLOPs: 43.74 | +7: iteration 108770/ 115203 | consumed samples: 27845120 | consumed tokens: 57026805760 | elapsed time per iteration (s): 0.56 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 2.554694E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.431 | TFLOPs: 43.33 | +7: iteration 108780/ 115203 | consumed samples: 27847680 | consumed tokens: 57032048640 | elapsed time per iteration (s): 0.57 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 2.568735E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.093 | TFLOPs: 42.91 | +7: iteration 108790/ 115203 | consumed samples: 27850240 | consumed tokens: 57037291520 | elapsed time per iteration (s): 0.56 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 2.552155E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.944 | TFLOPs: 43.56 | +7: iteration 108800/ 115203 | consumed samples: 27852800 | consumed tokens: 57042534400 | elapsed time per iteration (s): 0.57 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 2.554277E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.256 | TFLOPs: 43.02 | +7: iteration 108810/ 115203 | consumed samples: 27855360 | consumed tokens: 57047777280 | elapsed time per iteration (s): 0.56 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 2.552992E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.721 | TFLOPs: 43.35 | +7: iteration 108820/ 115203 | consumed samples: 27857920 | consumed tokens: 57053020160 | elapsed time per iteration (s): 0.56 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 2.554484E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.368 | TFLOPs: 43.51 | +7: iteration 108830/ 115203 | consumed samples: 27860480 | consumed tokens: 57058263040 | elapsed time per iteration (s): 0.55 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 2.563932E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.842 | TFLOPs: 44.03 | +7: iteration 108840/ 115203 | consumed samples: 27863040 | consumed tokens: 57063505920 | elapsed time per iteration (s): 0.57 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 2.544786E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.381 | TFLOPs: 43.03 | +7: iteration 108850/ 115203 | consumed samples: 27865600 | consumed tokens: 57068748800 | elapsed time per iteration (s): 0.56 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 2.552709E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.581 | TFLOPs: 43.43 | +7: iteration 108860/ 115203 | consumed samples: 27868160 | consumed tokens: 57073991680 | elapsed time per iteration (s): 0.56 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 2.559594E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.847 | TFLOPs: 43.75 | +7: iteration 108870/ 115203 | consumed samples: 27870720 | consumed tokens: 57079234560 | elapsed time per iteration (s): 0.56 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 2.557466E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.266 | TFLOPs: 43.50 | +7: iteration 108880/ 115203 | consumed samples: 27873280 | consumed tokens: 57084477440 | elapsed time per iteration (s): 0.55 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 2.566091E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.759 | TFLOPs: 44.02 | +7: iteration 108890/ 115203 | consumed samples: 27875840 | consumed tokens: 57089720320 | elapsed time per iteration (s): 0.56 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 2.566905E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.975 | TFLOPs: 43.76 | +7: iteration 108900/ 115203 | consumed samples: 27878400 | consumed tokens: 57094963200 | elapsed time per iteration (s): 0.57 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 2.549035E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.551 | TFLOPs: 43.05 | +7: iteration 108910/ 115203 | consumed samples: 27880960 | consumed tokens: 57100206080 | elapsed time per iteration (s): 0.57 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 2.566102E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.109 | TFLOPs: 42.53 | +7: iteration 108920/ 115203 | consumed samples: 27883520 | consumed tokens: 57105448960 | elapsed time per iteration (s): 0.55 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 2.576463E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.727 | TFLOPs: 44.02 | +7: iteration 108930/ 115203 | consumed samples: 27886080 | consumed tokens: 57110691840 | elapsed time per iteration (s): 0.57 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 2.568307E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.967 | TFLOPs: 42.71 | +7: iteration 108940/ 115203 | consumed samples: 27888640 | consumed tokens: 57115934720 | elapsed time per iteration (s): 0.56 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 2.558250E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.252 | TFLOPs: 43.50 | +7: iteration 108950/ 115203 | consumed samples: 27891200 | consumed tokens: 57121177600 | elapsed time per iteration (s): 0.56 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 2.556579E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 108960/ 115203 | consumed samples: 27893760 | consumed tokens: 57126420480 | elapsed time per iteration (s): 0.56 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 2.562673E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.070 | TFLOPs: 43.48 | +7: iteration 108970/ 115203 | consumed samples: 27896320 | consumed tokens: 57131663360 | elapsed time per iteration (s): 0.58 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 2.564477E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.836 | TFLOPs: 41.93 | +7: iteration 108980/ 115203 | consumed samples: 27898880 | consumed tokens: 57136906240 | elapsed time per iteration (s): 0.56 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 2.543381E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.480 | TFLOPs: 43.52 | +7: iteration 108990/ 115203 | consumed samples: 27901440 | consumed tokens: 57142149120 | elapsed time per iteration (s): 0.58 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 2.546064E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.142 | TFLOPs: 42.06 | +7: iteration 109000/ 115203 | consumed samples: 27904000 | consumed tokens: 57147392000 | elapsed time per iteration (s): 0.58 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 2.558891E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.134 | TFLOPs: 42.06 | +7: iteration 109010/ 115203 | consumed samples: 27906560 | consumed tokens: 57152634880 | elapsed time per iteration (s): 0.56 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 2.559011E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.755 | TFLOPs: 43.74 | +7: iteration 109020/ 115203 | consumed samples: 27909120 | consumed tokens: 57157877760 | elapsed time per iteration (s): 0.56 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 2.553111E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.629 | TFLOPs: 43.44 | +7: iteration 109030/ 115203 | consumed samples: 27911680 | consumed tokens: 57163120640 | elapsed time per iteration (s): 0.57 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 2.564079E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.840 | TFLOPs: 42.98 | +7: iteration 109040/ 115203 | consumed samples: 27914240 | consumed tokens: 57168363520 | elapsed time per iteration (s): 0.55 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 2.558504E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.370 | TFLOPs: 43.99 | +7: iteration 109050/ 115203 | consumed samples: 27916800 | consumed tokens: 57173606400 | elapsed time per iteration (s): 0.56 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 2.560279E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.995 | TFLOPs: 43.66 | +7: iteration 109060/ 115203 | consumed samples: 27919360 | consumed tokens: 57178849280 | elapsed time per iteration (s): 0.56 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 2.553030E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.572 | TFLOPs: 43.43 | +7: iteration 109070/ 115203 | consumed samples: 27921920 | consumed tokens: 57184092160 | elapsed time per iteration (s): 0.57 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 2.560426E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.018 | TFLOPs: 43.00 | +7: iteration 109080/ 115203 | consumed samples: 27924480 | consumed tokens: 57189335040 | elapsed time per iteration (s): 0.56 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 2.556640E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.916 | TFLOPs: 43.56 | +7: iteration 109090/ 115203 | consumed samples: 27927040 | consumed tokens: 57194577920 | elapsed time per iteration (s): 0.56 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 2.541726E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.492 | TFLOPs: 43.52 | +7: iteration 109100/ 115203 | consumed samples: 27929600 | consumed tokens: 57199820800 | elapsed time per iteration (s): 0.56 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 2.561087E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.201 | TFLOPs: 43.40 | +7: iteration 109110/ 115203 | consumed samples: 27932160 | consumed tokens: 57205063680 | elapsed time per iteration (s): 0.56 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 2.546905E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.827 | TFLOPs: 43.55 | +7: iteration 109120/ 115203 | consumed samples: 27934720 | consumed tokens: 57210306560 | elapsed time per iteration (s): 0.56 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 2.556967E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.130 | TFLOPs: 43.58 | +7: iteration 109130/ 115203 | consumed samples: 27937280 | consumed tokens: 57215549440 | elapsed time per iteration (s): 0.56 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 2.554154E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.380 | TFLOPs: 43.22 | +7: iteration 109140/ 115203 | consumed samples: 27939840 | consumed tokens: 57220792320 | elapsed time per iteration (s): 0.60 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 2.565787E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.639 | TFLOPs: 40.77 | +7: iteration 109150/ 115203 | consumed samples: 27942400 | consumed tokens: 57226035200 | elapsed time per iteration (s): 0.56 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 2.551683E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.675 | TFLOPs: 43.63 | +7: iteration 109160/ 115203 | consumed samples: 27944960 | consumed tokens: 57231278080 | elapsed time per iteration (s): 0.56 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 2.557005E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.344 | TFLOPs: 43.60 | +7: iteration 109170/ 115203 | consumed samples: 27947520 | consumed tokens: 57236520960 | elapsed time per iteration (s): 0.56 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 2.571798E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.321 | TFLOPs: 43.41 | +7: iteration 109180/ 115203 | consumed samples: 27950080 | consumed tokens: 57241763840 | elapsed time per iteration (s): 0.57 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 2.543326E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.027 | TFLOPs: 43.00 | +7: iteration 109190/ 115203 | consumed samples: 27952640 | consumed tokens: 57247006720 | elapsed time per iteration (s): 0.56 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 2.548683E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.880 | TFLOPs: 43.46 | +7: iteration 109200/ 115203 | consumed samples: 27955200 | consumed tokens: 57252249600 | elapsed time per iteration (s): 0.56 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 2.549764E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.141 | TFLOPs: 43.49 | +7: iteration 109210/ 115203 | consumed samples: 27957760 | consumed tokens: 57257492480 | elapsed time per iteration (s): 0.56 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 2.552269E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.457 | TFLOPs: 43.23 | +7: iteration 109220/ 115203 | consumed samples: 27960320 | consumed tokens: 57262735360 | elapsed time per iteration (s): 0.56 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 2.563961E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.016 | TFLOPs: 43.48 | +7: iteration 109230/ 115203 | consumed samples: 27962880 | consumed tokens: 57267978240 | elapsed time per iteration (s): 0.56 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 2.542841E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.987 | TFLOPs: 43.95 | +7: iteration 109240/ 115203 | consumed samples: 27965440 | consumed tokens: 57273221120 | elapsed time per iteration (s): 0.55 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 2.557526E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.281 | TFLOPs: 43.98 | +7: iteration 109250/ 115203 | consumed samples: 27968000 | consumed tokens: 57278464000 | elapsed time per iteration (s): 0.55 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 2.557197E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.450 | TFLOPs: 43.99 | +7: iteration 109260/ 115203 | consumed samples: 27970560 | consumed tokens: 57283706880 | elapsed time per iteration (s): 0.56 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 2.555466E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.879 | TFLOPs: 43.46 | +7: iteration 109270/ 115203 | consumed samples: 27973120 | consumed tokens: 57288949760 | elapsed time per iteration (s): 0.56 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 2.558468E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.973 | TFLOPs: 43.66 | +7: iteration 109280/ 115203 | consumed samples: 27975680 | consumed tokens: 57294192640 | elapsed time per iteration (s): 0.56 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 2.563616E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.329 | TFLOPs: 43.89 | +7: iteration 109290/ 115203 | consumed samples: 27978240 | consumed tokens: 57299435520 | elapsed time per iteration (s): 0.56 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 2.547183E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.088 | TFLOPs: 43.29 | +7: iteration 109300/ 115203 | consumed samples: 27980800 | consumed tokens: 57304678400 | elapsed time per iteration (s): 0.56 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 2.548276E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.490 | TFLOPs: 43.52 | +7: iteration 109310/ 115203 | consumed samples: 27983360 | consumed tokens: 57309921280 | elapsed time per iteration (s): 0.56 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 2.567261E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.115 | TFLOPs: 43.49 | +7: iteration 109320/ 115203 | consumed samples: 27985920 | consumed tokens: 57315164160 | elapsed time per iteration (s): 0.56 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 2.551543E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.959 | TFLOPs: 43.95 | +7: iteration 109330/ 115203 | consumed samples: 27988480 | consumed tokens: 57320407040 | elapsed time per iteration (s): 0.56 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 2.558254E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.199 | TFLOPs: 43.97 | +7: iteration 109340/ 115203 | consumed samples: 27991040 | consumed tokens: 57325649920 | elapsed time per iteration (s): 0.56 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 2.561067E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.209 | TFLOPs: 43.30 | +7: iteration 109350/ 115203 | consumed samples: 27993600 | consumed tokens: 57330892800 | elapsed time per iteration (s): 0.56 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 2.545747E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.243 | TFLOPs: 43.97 | +7: iteration 109360/ 115203 | consumed samples: 27996160 | consumed tokens: 57336135680 | elapsed time per iteration (s): 0.56 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 2.559938E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.352 | TFLOPs: 43.70 | +7: iteration 109370/ 115203 | consumed samples: 27998720 | consumed tokens: 57341378560 | elapsed time per iteration (s): 0.56 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 2.545916E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.103 | TFLOPs: 43.68 | +7: iteration 109380/ 115203 | consumed samples: 28001280 | consumed tokens: 57346621440 | elapsed time per iteration (s): 0.56 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 2.561942E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.068 | TFLOPs: 43.96 | +7: iteration 109390/ 115203 | consumed samples: 28003840 | consumed tokens: 57351864320 | elapsed time per iteration (s): 0.56 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 2.551923E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.619 | TFLOPs: 43.53 | +7: iteration 109400/ 115203 | consumed samples: 28006400 | consumed tokens: 57357107200 | elapsed time per iteration (s): 0.56 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 2.566065E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.153 | TFLOPs: 43.97 | +7: iteration 109410/ 115203 | consumed samples: 28008960 | consumed tokens: 57362350080 | elapsed time per iteration (s): 0.56 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 2.560503E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.291 | TFLOPs: 43.60 | +7: iteration 109420/ 115203 | consumed samples: 28011520 | consumed tokens: 57367592960 | elapsed time per iteration (s): 0.56 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 2.566228E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.030 | TFLOPs: 43.86 | +7: iteration 109430/ 115203 | consumed samples: 28014080 | consumed tokens: 57372835840 | elapsed time per iteration (s): 0.56 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 2.561553E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.239 | TFLOPs: 43.97 | +7: iteration 109440/ 115203 | consumed samples: 28016640 | consumed tokens: 57378078720 | elapsed time per iteration (s): 0.56 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 2.549183E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.691 | TFLOPs: 43.54 | +7: iteration 109450/ 115203 | consumed samples: 28019200 | consumed tokens: 57383321600 | elapsed time per iteration (s): 0.56 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 2.555140E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.058 | TFLOPs: 43.96 | +7: iteration 109460/ 115203 | consumed samples: 28021760 | consumed tokens: 57388564480 | elapsed time per iteration (s): 0.56 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 2.560857E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.044 | TFLOPs: 43.96 | +7: iteration 109470/ 115203 | consumed samples: 28024320 | consumed tokens: 57393807360 | elapsed time per iteration (s): 0.56 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 2.548928E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.941 | TFLOPs: 43.95 | +7: iteration 109480/ 115203 | consumed samples: 28026880 | consumed tokens: 57399050240 | elapsed time per iteration (s): 0.56 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 2.570630E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.004 | TFLOPs: 43.95 | +7: iteration 109490/ 115203 | consumed samples: 28029440 | consumed tokens: 57404293120 | elapsed time per iteration (s): 0.56 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 2.555174E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.137 | TFLOPs: 43.39 | +7: iteration 109500/ 115203 | consumed samples: 28032000 | consumed tokens: 57409536000 | elapsed time per iteration (s): 0.56 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 2.563544E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.376 | TFLOPs: 43.89 | +7: iteration 109510/ 115203 | consumed samples: 28034560 | consumed tokens: 57414778880 | elapsed time per iteration (s): 0.56 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 2.565625E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.878 | TFLOPs: 43.94 | +7: iteration 109520/ 115203 | consumed samples: 28037120 | consumed tokens: 57420021760 | elapsed time per iteration (s): 0.56 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 2.569903E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.885 | TFLOPs: 43.94 | +7: iteration 109530/ 115203 | consumed samples: 28039680 | consumed tokens: 57425264640 | elapsed time per iteration (s): 0.56 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 2.565433E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.996 | TFLOPs: 43.95 | +7: iteration 109540/ 115203 | consumed samples: 28042240 | consumed tokens: 57430507520 | elapsed time per iteration (s): 0.56 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 2.554116E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.783 | TFLOPs: 43.93 | +7: iteration 109550/ 115203 | consumed samples: 28044800 | consumed tokens: 57435750400 | elapsed time per iteration (s): 0.56 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 2.555103E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.082 | TFLOPs: 43.96 | +7: iteration 109560/ 115203 | consumed samples: 28047360 | consumed tokens: 57440993280 | elapsed time per iteration (s): 0.57 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 2.559629E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.031 | TFLOPs: 43.10 | +7: iteration 109570/ 115203 | consumed samples: 28049920 | consumed tokens: 57446236160 | elapsed time per iteration (s): 0.56 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 2.550286E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.058 | TFLOPs: 43.96 | +7: iteration 109580/ 115203 | consumed samples: 28052480 | consumed tokens: 57451479040 | elapsed time per iteration (s): 0.56 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 2.554091E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.791 | TFLOPs: 43.36 | +7: iteration 109590/ 115203 | consumed samples: 28055040 | consumed tokens: 57456721920 | elapsed time per iteration (s): 0.56 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 2.554842E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.299 | TFLOPs: 43.41 | +7: iteration 109600/ 115203 | consumed samples: 28057600 | consumed tokens: 57461964800 | elapsed time per iteration (s): 0.55 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 2.546526E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.406 | TFLOPs: 43.99 | +7: iteration 109610/ 115203 | consumed samples: 28060160 | consumed tokens: 57467207680 | elapsed time per iteration (s): 0.56 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 2.559898E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.219 | TFLOPs: 43.97 | +7: iteration 109620/ 115203 | consumed samples: 28062720 | consumed tokens: 57472450560 | elapsed time per iteration (s): 0.56 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 2.554592E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.963 | TFLOPs: 43.95 | +7: iteration 109630/ 115203 | consumed samples: 28065280 | consumed tokens: 57477693440 | elapsed time per iteration (s): 0.57 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 2.555282E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.690 | TFLOPs: 42.68 | +7: iteration 109640/ 115203 | consumed samples: 28067840 | consumed tokens: 57482936320 | elapsed time per iteration (s): 0.56 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 2.555265E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.993 | TFLOPs: 43.57 | +7: iteration 109650/ 115203 | consumed samples: 28070400 | consumed tokens: 57488179200 | elapsed time per iteration (s): 0.55 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 2.567236E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.933 | TFLOPs: 44.04 | +7: iteration 109660/ 115203 | consumed samples: 28072960 | consumed tokens: 57493422080 | elapsed time per iteration (s): 0.55 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 2.547912E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.904 | TFLOPs: 44.04 | +7: iteration 109670/ 115203 | consumed samples: 28075520 | consumed tokens: 57498664960 | elapsed time per iteration (s): 0.56 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 2.537930E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.600 | TFLOPs: 43.34 | +7: iteration 109680/ 115203 | consumed samples: 28078080 | consumed tokens: 57503907840 | elapsed time per iteration (s): 0.55 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 2.557112E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.811 | TFLOPs: 44.03 | +7: iteration 109690/ 115203 | consumed samples: 28080640 | consumed tokens: 57509150720 | elapsed time per iteration (s): 0.55 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 2.554347E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.954 | TFLOPs: 44.04 | +7: iteration 109700/ 115203 | consumed samples: 28083200 | consumed tokens: 57514393600 | elapsed time per iteration (s): 0.56 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 2.553815E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.681 | TFLOPs: 43.54 | +7: iteration 109710/ 115203 | consumed samples: 28085760 | consumed tokens: 57519636480 | elapsed time per iteration (s): 0.55 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 2.565804E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.560 | TFLOPs: 44.00 | +7: iteration 109720/ 115203 | consumed samples: 28088320 | consumed tokens: 57524879360 | elapsed time per iteration (s): 0.55 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 2.555331E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.573 | TFLOPs: 44.01 | +7: iteration 109730/ 115203 | consumed samples: 28090880 | consumed tokens: 57530122240 | elapsed time per iteration (s): 0.55 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 2.565496E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.529 | TFLOPs: 44.00 | +7: iteration 109740/ 115203 | consumed samples: 28093440 | consumed tokens: 57535365120 | elapsed time per iteration (s): 0.56 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 2.562946E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.602 | TFLOPs: 43.44 | +7: iteration 109750/ 115203 | consumed samples: 28096000 | consumed tokens: 57540608000 | elapsed time per iteration (s): 0.55 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 2.551204E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.980 | TFLOPs: 44.04 | +7: iteration 109760/ 115203 | consumed samples: 28098560 | consumed tokens: 57545850880 | elapsed time per iteration (s): 0.55 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 2.554428E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.744 | TFLOPs: 44.02 | +7: iteration 109770/ 115203 | consumed samples: 28101120 | consumed tokens: 57551093760 | elapsed time per iteration (s): 0.55 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 2.546403E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.859 | TFLOPs: 44.03 | +7: iteration 109780/ 115203 | consumed samples: 28103680 | consumed tokens: 57556336640 | elapsed time per iteration (s): 0.55 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 2.548475E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.890 | TFLOPs: 44.04 | +7: iteration 109790/ 115203 | consumed samples: 28106240 | consumed tokens: 57561579520 | elapsed time per iteration (s): 0.55 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 2.563960E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.099 | TFLOPs: 44.06 | +7: iteration 109800/ 115203 | consumed samples: 28108800 | consumed tokens: 57566822400 | elapsed time per iteration (s): 0.55 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 2.560452E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.991 | TFLOPs: 44.05 | +7: iteration 109810/ 115203 | consumed samples: 28111360 | consumed tokens: 57572065280 | elapsed time per iteration (s): 0.56 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 2.559507E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.173 | TFLOPs: 43.59 | +7: iteration 109820/ 115203 | consumed samples: 28113920 | consumed tokens: 57577308160 | elapsed time per iteration (s): 0.56 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 2.553083E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.383 | TFLOPs: 43.70 | +7: iteration 109830/ 115203 | consumed samples: 28116480 | consumed tokens: 57582551040 | elapsed time per iteration (s): 0.56 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 2.556728E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.237 | TFLOPs: 43.59 | +7: iteration 109840/ 115203 | consumed samples: 28119040 | consumed tokens: 57587793920 | elapsed time per iteration (s): 0.55 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 2.566045E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.609 | TFLOPs: 44.01 | +7: iteration 109850/ 115203 | consumed samples: 28121600 | consumed tokens: 57593036800 | elapsed time per iteration (s): 0.56 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 2.554980E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.177 | TFLOPs: 43.59 | +7: iteration 109860/ 115203 | consumed samples: 28124160 | consumed tokens: 57598279680 | elapsed time per iteration (s): 0.55 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 2.558548E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.645 | TFLOPs: 44.01 | +7: iteration 109870/ 115203 | consumed samples: 28126720 | consumed tokens: 57603522560 | elapsed time per iteration (s): 0.55 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 2.556838E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.543 | TFLOPs: 44.00 | +7: iteration 109880/ 115203 | consumed samples: 28129280 | consumed tokens: 57608765440 | elapsed time per iteration (s): 0.55 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 2.551560E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.781 | TFLOPs: 44.03 | +7: iteration 109890/ 115203 | consumed samples: 28131840 | consumed tokens: 57614008320 | elapsed time per iteration (s): 0.55 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 2.551218E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.660 | TFLOPs: 44.01 | +7: iteration 109900/ 115203 | consumed samples: 28134400 | consumed tokens: 57619251200 | elapsed time per iteration (s): 0.55 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 2.569065E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.266 | TFLOPs: 43.98 | +7: iteration 109910/ 115203 | consumed samples: 28136960 | consumed tokens: 57624494080 | elapsed time per iteration (s): 0.55 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 2.557605E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.402 | TFLOPs: 43.99 | +7: iteration 109920/ 115203 | consumed samples: 28139520 | consumed tokens: 57629736960 | elapsed time per iteration (s): 0.55 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 2.549828E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.946 | TFLOPs: 44.04 | +7: iteration 109930/ 115203 | consumed samples: 28142080 | consumed tokens: 57634979840 | elapsed time per iteration (s): 0.56 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 2.554682E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.766 | TFLOPs: 43.83 | +7: iteration 109940/ 115203 | consumed samples: 28144640 | consumed tokens: 57640222720 | elapsed time per iteration (s): 0.55 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 2.561188E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.545 | TFLOPs: 44.00 | +7: iteration 109950/ 115203 | consumed samples: 28147200 | consumed tokens: 57645465600 | elapsed time per iteration (s): 0.55 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 2.562077E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.632 | TFLOPs: 44.01 | +7: iteration 109960/ 115203 | consumed samples: 28149760 | consumed tokens: 57650708480 | elapsed time per iteration (s): 0.55 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 2.568981E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.675 | TFLOPs: 44.02 | +7: iteration 109970/ 115203 | consumed samples: 28152320 | consumed tokens: 57655951360 | elapsed time per iteration (s): 0.55 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 2.546596E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.748 | TFLOPs: 44.02 | +7: iteration 109980/ 115203 | consumed samples: 28154880 | consumed tokens: 57661194240 | elapsed time per iteration (s): 0.56 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 2.551346E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.962 | TFLOPs: 43.28 | +7: iteration 109990/ 115203 | consumed samples: 28157440 | consumed tokens: 57666437120 | elapsed time per iteration (s): 0.56 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 2.555536E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.915 | TFLOPs: 43.56 | +0: [2023-03-17 06:06:24,976] [INFO] [logging.py:68:log_dist] [Rank 0] step=110000, skipped=0, lr=[2.092302863901853e-05, 2.092302863901853e-05, 2.092302863901853e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 110000/ 115203 | consumed samples: 28160000 | consumed tokens: 57671680000 | elapsed time per iteration (s): 0.55 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 2.550496E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.665 | TFLOPs: 44.01 | +0: steps: 110000 loss: 2.5488 iter time (s): 0.561 samples/sec: 456.507 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 110000 | lm loss value: 3.494872E+00 | lm loss PPL: 3.294607E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 110000 to checkpoints_421m60b400m +0: [2023-03-17 06:06:25,185] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step110000 is begin to save! +0: [2023-03-17 06:06:25,189] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:06:25,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:06:25,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:06:25,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:06:25,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:06:25,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:06:25,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:06:25,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:06:25,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:06:25,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:06:25,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:06:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:06:25,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:06:25,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:06:25,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:06:25,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:06:25,656] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:06:25,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:06:25,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_11-model_00-model_states.pt... +0: [2023-03-17 06:06:25,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_11-model_00-model_states.pt. +0: [2023-03-17 06:06:25,746] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:06:25,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:06:25,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_13-model_00-model_states.pt... +0: [2023-03-17 06:06:25,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_13-model_00-model_states.pt. +0: [2023-03-17 06:06:25,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_14-model_00-model_states.pt... +0: [2023-03-17 06:06:25,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_14-model_00-model_states.pt. +0: [2023-03-17 06:06:25,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_15-model_00-model_states.pt... +0: [2023-03-17 06:06:25,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_15-model_00-model_states.pt. +0: [2023-03-17 06:06:25,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_16-model_00-model_states.pt... +0: [2023-03-17 06:06:25,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_16-model_00-model_states.pt. +0: [2023-03-17 06:06:25,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_17-model_00-model_states.pt... +0: [2023-03-17 06:06:25,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_17-model_00-model_states.pt. +0: [2023-03-17 06:06:25,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_18-model_00-model_states.pt... +0: [2023-03-17 06:06:26,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_18-model_00-model_states.pt. +0: [2023-03-17 06:06:26,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_19-model_00-model_states.pt... +0: [2023-03-17 06:06:26,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_19-model_00-model_states.pt. +0: [2023-03-17 06:06:26,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_20-model_00-model_states.pt... +0: [2023-03-17 06:06:26,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_20-model_00-model_states.pt. +0: [2023-03-17 06:06:26,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/layer_22-model_00-model_states.pt... +0: [2023-03-17 06:06:26,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/layer_22-model_00-model_states.pt. +0: [2023-03-17 06:06:26,125] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step110000/mp_rank_00_model_states.pt +0: [2023-03-17 06:06:26,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:06:26,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:06:26,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,257] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,257] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 06:06:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 06:06:26,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:06:26,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:06:26,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 06:06:26,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 06:06:26,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 06:06:26,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:06:26,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 06:06:26,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 06:06:26,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 06:06:26,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:06:26,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: successfully saved checkpoint at iteration 110000 to checkpoints_421m60b400m +7: time (ms) | save-checkpoint: 1215.55 +7: iteration 110010/ 115203 | consumed samples: 28162560 | consumed tokens: 57676922880 | elapsed time per iteration (s): 0.69 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 2.552300E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 368.853 | TFLOPs: 35.17 | +7: iteration 110020/ 115203 | consumed samples: 28165120 | consumed tokens: 57682165760 | elapsed time per iteration (s): 0.56 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 2.564453E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.978 | TFLOPs: 43.57 | +7: iteration 110030/ 115203 | consumed samples: 28167680 | consumed tokens: 57687408640 | elapsed time per iteration (s): 0.57 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 2.552096E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.239 | TFLOPs: 42.83 | +7: iteration 110040/ 115203 | consumed samples: 28170240 | consumed tokens: 57692651520 | elapsed time per iteration (s): 0.58 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 2.567255E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.037 | TFLOPs: 42.33 | +7: iteration 110050/ 115203 | consumed samples: 28172800 | consumed tokens: 57697894400 | elapsed time per iteration (s): 0.56 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 2.553449E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.226 | TFLOPs: 43.88 | +7: iteration 110060/ 115203 | consumed samples: 28175360 | consumed tokens: 57703137280 | elapsed time per iteration (s): 0.55 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 2.562064E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.924 | TFLOPs: 44.04 | +7: iteration 110070/ 115203 | consumed samples: 28177920 | consumed tokens: 57708380160 | elapsed time per iteration (s): 0.55 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 2.570468E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.867 | TFLOPs: 44.03 | +7: iteration 110080/ 115203 | consumed samples: 28180480 | consumed tokens: 57713623040 | elapsed time per iteration (s): 0.55 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 2.553269E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.805 | TFLOPs: 44.03 | +7: iteration 110090/ 115203 | consumed samples: 28183040 | consumed tokens: 57718865920 | elapsed time per iteration (s): 0.55 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 2.554644E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.736 | TFLOPs: 44.02 | +7: iteration 110100/ 115203 | consumed samples: 28185600 | consumed tokens: 57724108800 | elapsed time per iteration (s): 0.56 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 2.545307E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.254 | TFLOPs: 43.98 | +7: iteration 110110/ 115203 | consumed samples: 28188160 | consumed tokens: 57729351680 | elapsed time per iteration (s): 0.55 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 2.551255E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.759 | TFLOPs: 44.02 | +7: iteration 110120/ 115203 | consumed samples: 28190720 | consumed tokens: 57734594560 | elapsed time per iteration (s): 0.55 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 2.547344E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.458 | TFLOPs: 44.00 | +7: iteration 110130/ 115203 | consumed samples: 28193280 | consumed tokens: 57739837440 | elapsed time per iteration (s): 0.55 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 2.555059E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.871 | TFLOPs: 44.03 | +7: iteration 110140/ 115203 | consumed samples: 28195840 | consumed tokens: 57745080320 | elapsed time per iteration (s): 0.55 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 2.560192E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.881 | TFLOPs: 44.04 | +7: iteration 110150/ 115203 | consumed samples: 28198400 | consumed tokens: 57750323200 | elapsed time per iteration (s): 0.56 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 2.548665E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.976 | TFLOPs: 43.95 | +7: iteration 110160/ 115203 | consumed samples: 28200960 | consumed tokens: 57755566080 | elapsed time per iteration (s): 0.55 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 2.558771E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.421 | TFLOPs: 43.99 | +7: iteration 110170/ 115203 | consumed samples: 28203520 | consumed tokens: 57760808960 | elapsed time per iteration (s): 0.56 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 2.562413E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.682 | TFLOPs: 43.54 | +7: iteration 110180/ 115203 | consumed samples: 28206080 | consumed tokens: 57766051840 | elapsed time per iteration (s): 0.55 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 2.567202E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.014 | TFLOPs: 44.05 | +7: iteration 110190/ 115203 | consumed samples: 28208640 | consumed tokens: 57771294720 | elapsed time per iteration (s): 0.56 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 2.571898E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.777 | TFLOPs: 43.45 | +7: iteration 110200/ 115203 | consumed samples: 28211200 | consumed tokens: 57776537600 | elapsed time per iteration (s): 0.55 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 2.551339E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.985 | TFLOPs: 44.05 | +7: iteration 110210/ 115203 | consumed samples: 28213760 | consumed tokens: 57781780480 | elapsed time per iteration (s): 0.56 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 2.555168E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.507 | TFLOPs: 43.71 | +7: iteration 110220/ 115203 | consumed samples: 28216320 | consumed tokens: 57787023360 | elapsed time per iteration (s): 0.55 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 2.563296E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.528 | TFLOPs: 44.00 | +7: iteration 110230/ 115203 | consumed samples: 28218880 | consumed tokens: 57792266240 | elapsed time per iteration (s): 0.55 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 2.564015E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.114 | TFLOPs: 44.06 | +7: iteration 110240/ 115203 | consumed samples: 28221440 | consumed tokens: 57797509120 | elapsed time per iteration (s): 0.56 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 2.568206E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.050 | TFLOPs: 43.96 | +7: iteration 110250/ 115203 | consumed samples: 28224000 | consumed tokens: 57802752000 | elapsed time per iteration (s): 0.56 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 2.552742E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.845 | TFLOPs: 43.84 | +7: iteration 110260/ 115203 | consumed samples: 28226560 | consumed tokens: 57807994880 | elapsed time per iteration (s): 0.56 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 2.550842E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.801 | TFLOPs: 43.74 | +7: iteration 110270/ 115203 | consumed samples: 28229120 | consumed tokens: 57813237760 | elapsed time per iteration (s): 0.55 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 2.559892E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.957 | TFLOPs: 44.04 | +7: iteration 110280/ 115203 | consumed samples: 28231680 | consumed tokens: 57818480640 | elapsed time per iteration (s): 0.55 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 2.552534E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.184 | TFLOPs: 44.06 | +7: iteration 110290/ 115203 | consumed samples: 28234240 | consumed tokens: 57823723520 | elapsed time per iteration (s): 0.56 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 2.550808E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.887 | TFLOPs: 43.37 | +7: iteration 110300/ 115203 | consumed samples: 28236800 | consumed tokens: 57828966400 | elapsed time per iteration (s): 0.57 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 2.561463E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.297 | TFLOPs: 42.55 | +7: iteration 110310/ 115203 | consumed samples: 28239360 | consumed tokens: 57834209280 | elapsed time per iteration (s): 0.57 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 2.557738E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.284 | TFLOPs: 42.55 | +7: iteration 110320/ 115203 | consumed samples: 28241920 | consumed tokens: 57839452160 | elapsed time per iteration (s): 0.58 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 2.545619E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.093 | TFLOPs: 42.24 | +7: iteration 110330/ 115203 | consumed samples: 28244480 | consumed tokens: 57844695040 | elapsed time per iteration (s): 0.57 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 2.554844E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.140 | TFLOPs: 42.73 | +7: iteration 110340/ 115203 | consumed samples: 28247040 | consumed tokens: 57849937920 | elapsed time per iteration (s): 0.57 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 2.563764E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.605 | TFLOPs: 42.96 | +7: iteration 110350/ 115203 | consumed samples: 28249600 | consumed tokens: 57855180800 | elapsed time per iteration (s): 0.57 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 2.548984E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.816 | TFLOPs: 42.69 | +7: iteration 110360/ 115203 | consumed samples: 28252160 | consumed tokens: 57860423680 | elapsed time per iteration (s): 0.56 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 2.558722E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.051 | TFLOPs: 43.67 | +7: iteration 110370/ 115203 | consumed samples: 28254720 | consumed tokens: 57865666560 | elapsed time per iteration (s): 0.57 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 2.560414E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.421 | TFLOPs: 42.75 | +7: iteration 110380/ 115203 | consumed samples: 28257280 | consumed tokens: 57870909440 | elapsed time per iteration (s): 0.57 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 2.558239E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.102 | TFLOPs: 42.91 | +7: iteration 110390/ 115203 | consumed samples: 28259840 | consumed tokens: 57876152320 | elapsed time per iteration (s): 0.57 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 2.540755E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.359 | TFLOPs: 43.03 | +7: iteration 110400/ 115203 | consumed samples: 28262400 | consumed tokens: 57881395200 | elapsed time per iteration (s): 0.57 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 2.559203E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.929 | TFLOPs: 42.90 | +7: iteration 110410/ 115203 | consumed samples: 28264960 | consumed tokens: 57886638080 | elapsed time per iteration (s): 0.56 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 2.557929E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.539 | TFLOPs: 43.43 | +7: iteration 110420/ 115203 | consumed samples: 28267520 | consumed tokens: 57891880960 | elapsed time per iteration (s): 0.56 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 2.559171E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.224 | TFLOPs: 43.50 | +7: iteration 110430/ 115203 | consumed samples: 28270080 | consumed tokens: 57897123840 | elapsed time per iteration (s): 0.57 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 2.540742E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.388 | TFLOPs: 42.94 | +7: iteration 110440/ 115203 | consumed samples: 28272640 | consumed tokens: 57902366720 | elapsed time per iteration (s): 0.57 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 2.554525E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.969 | TFLOPs: 43.09 | +7: iteration 110450/ 115203 | consumed samples: 28275200 | consumed tokens: 57907609600 | elapsed time per iteration (s): 0.56 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 2.554948E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.136 | TFLOPs: 43.49 | +7: iteration 110460/ 115203 | consumed samples: 28277760 | consumed tokens: 57912852480 | elapsed time per iteration (s): 0.56 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 2.563436E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.744 | TFLOPs: 43.55 | +7: iteration 110470/ 115203 | consumed samples: 28280320 | consumed tokens: 57918095360 | elapsed time per iteration (s): 0.57 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 2.546541E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.838 | TFLOPs: 42.60 | +7: iteration 110480/ 115203 | consumed samples: 28282880 | consumed tokens: 57923338240 | elapsed time per iteration (s): 0.56 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 2.550452E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.556 | TFLOPs: 43.24 | +7: iteration 110490/ 115203 | consumed samples: 28285440 | consumed tokens: 57928581120 | elapsed time per iteration (s): 0.57 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 2.552860E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.239 | TFLOPs: 42.93 | +7: iteration 110500/ 115203 | consumed samples: 28288000 | consumed tokens: 57933824000 | elapsed time per iteration (s): 0.56 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 2.544501E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.463 | TFLOPs: 43.33 | +7: iteration 110510/ 115203 | consumed samples: 28290560 | consumed tokens: 57939066880 | elapsed time per iteration (s): 0.58 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 2.567784E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.595 | TFLOPs: 42.20 | +7: iteration 110520/ 115203 | consumed samples: 28293120 | consumed tokens: 57944309760 | elapsed time per iteration (s): 0.58 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 2.561385E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.796 | TFLOPs: 42.12 | +7: iteration 110530/ 115203 | consumed samples: 28295680 | consumed tokens: 57949552640 | elapsed time per iteration (s): 0.57 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 2.565141E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.933 | TFLOPs: 42.61 | +7: iteration 110540/ 115203 | consumed samples: 28298240 | consumed tokens: 57954795520 | elapsed time per iteration (s): 0.57 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 2.541671E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.176 | TFLOPs: 42.63 | +7: iteration 110550/ 115203 | consumed samples: 28300800 | consumed tokens: 57960038400 | elapsed time per iteration (s): 0.58 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 2.550347E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.740 | TFLOPs: 42.02 | +7: iteration 110560/ 115203 | consumed samples: 28303360 | consumed tokens: 57965281280 | elapsed time per iteration (s): 0.57 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 2.561889E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.158 | TFLOPs: 42.63 | +7: iteration 110570/ 115203 | consumed samples: 28305920 | consumed tokens: 57970524160 | elapsed time per iteration (s): 0.56 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 2.548018E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.238 | TFLOPs: 43.21 | +7: iteration 110580/ 115203 | consumed samples: 28308480 | consumed tokens: 57975767040 | elapsed time per iteration (s): 0.56 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 2.562879E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.440 | TFLOPs: 43.52 | +7: iteration 110590/ 115203 | consumed samples: 28311040 | consumed tokens: 57981009920 | elapsed time per iteration (s): 0.57 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 2.550263E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.061 | TFLOPs: 43.19 | +7: iteration 110600/ 115203 | consumed samples: 28313600 | consumed tokens: 57986252800 | elapsed time per iteration (s): 0.57 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 2.557430E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.029 | TFLOPs: 42.81 | +7: iteration 110610/ 115203 | consumed samples: 28316160 | consumed tokens: 57991495680 | elapsed time per iteration (s): 0.56 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 2.535100E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.900 | TFLOPs: 43.27 | +7: iteration 110620/ 115203 | consumed samples: 28318720 | consumed tokens: 57996738560 | elapsed time per iteration (s): 0.58 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 2.550463E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.087 | TFLOPs: 42.15 | +7: iteration 110630/ 115203 | consumed samples: 28321280 | consumed tokens: 58001981440 | elapsed time per iteration (s): 0.57 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 2.552607E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.874 | TFLOPs: 42.99 | +7: iteration 110640/ 115203 | consumed samples: 28323840 | consumed tokens: 58007224320 | elapsed time per iteration (s): 0.56 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 2.559145E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.209 | TFLOPs: 43.69 | +7: iteration 110650/ 115203 | consumed samples: 28326400 | consumed tokens: 58012467200 | elapsed time per iteration (s): 0.57 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 2.552016E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.729 | TFLOPs: 42.78 | +7: iteration 110660/ 115203 | consumed samples: 28328960 | consumed tokens: 58017710080 | elapsed time per iteration (s): 0.56 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 2.557209E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.248 | TFLOPs: 43.21 | +7: iteration 110670/ 115203 | consumed samples: 28331520 | consumed tokens: 58022952960 | elapsed time per iteration (s): 0.58 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 2.552029E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.605 | TFLOPs: 42.29 | +7: iteration 110680/ 115203 | consumed samples: 28334080 | consumed tokens: 58028195840 | elapsed time per iteration (s): 0.57 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 2.557657E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.387 | TFLOPs: 43.03 | +7: iteration 110690/ 115203 | consumed samples: 28336640 | consumed tokens: 58033438720 | elapsed time per iteration (s): 0.59 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 2.568797E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.050 | TFLOPs: 41.48 | +7: iteration 110700/ 115203 | consumed samples: 28339200 | consumed tokens: 58038681600 | elapsed time per iteration (s): 0.58 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 2.559984E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.538 | TFLOPs: 42.19 | +7: iteration 110710/ 115203 | consumed samples: 28341760 | consumed tokens: 58043924480 | elapsed time per iteration (s): 0.57 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 2.551921E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.601 | TFLOPs: 42.96 | +7: iteration 110720/ 115203 | consumed samples: 28344320 | consumed tokens: 58049167360 | elapsed time per iteration (s): 0.57 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 2.554883E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.688 | TFLOPs: 42.97 | +7: iteration 110730/ 115203 | consumed samples: 28346880 | consumed tokens: 58054410240 | elapsed time per iteration (s): 0.58 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 2.550505E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.962 | TFLOPs: 42.23 | +7: iteration 110740/ 115203 | consumed samples: 28349440 | consumed tokens: 58059653120 | elapsed time per iteration (s): 0.57 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 2.553970E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.094 | TFLOPs: 42.91 | +7: iteration 110750/ 115203 | consumed samples: 28352000 | consumed tokens: 58064896000 | elapsed time per iteration (s): 0.57 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 2.550442E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.960 | TFLOPs: 42.71 | +7: iteration 110760/ 115203 | consumed samples: 28354560 | consumed tokens: 58070138880 | elapsed time per iteration (s): 0.57 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 2.551452E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.311 | TFLOPs: 43.03 | +7: iteration 110770/ 115203 | consumed samples: 28357120 | consumed tokens: 58075381760 | elapsed time per iteration (s): 0.57 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 2.557939E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.683 | TFLOPs: 42.59 | +7: iteration 110780/ 115203 | consumed samples: 28359680 | consumed tokens: 58080624640 | elapsed time per iteration (s): 0.56 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 2.555032E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.515 | TFLOPs: 43.52 | +7: iteration 110790/ 115203 | consumed samples: 28362240 | consumed tokens: 58085867520 | elapsed time per iteration (s): 0.58 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 2.557109E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.900 | TFLOPs: 42.32 | +7: iteration 110800/ 115203 | consumed samples: 28364800 | consumed tokens: 58091110400 | elapsed time per iteration (s): 0.56 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 2.566884E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.776 | TFLOPs: 43.55 | +7: iteration 110810/ 115203 | consumed samples: 28367360 | consumed tokens: 58096353280 | elapsed time per iteration (s): 0.58 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 2.553378E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.561 | TFLOPs: 41.91 | +7: iteration 110820/ 115203 | consumed samples: 28369920 | consumed tokens: 58101596160 | elapsed time per iteration (s): 0.56 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 2.555531E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.296 | TFLOPs: 43.31 | +7: iteration 110830/ 115203 | consumed samples: 28372480 | consumed tokens: 58106839040 | elapsed time per iteration (s): 0.59 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 2.551608E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.861 | TFLOPs: 41.65 | +7: iteration 110840/ 115203 | consumed samples: 28375040 | consumed tokens: 58112081920 | elapsed time per iteration (s): 0.59 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 2.545024E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 435.754 | TFLOPs: 41.54 | +7: iteration 110850/ 115203 | consumed samples: 28377600 | consumed tokens: 58117324800 | elapsed time per iteration (s): 0.58 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 2.566379E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.758 | TFLOPs: 41.83 | +7: iteration 110860/ 115203 | consumed samples: 28380160 | consumed tokens: 58122567680 | elapsed time per iteration (s): 0.58 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 2.554079E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.466 | TFLOPs: 41.90 | +7: iteration 110870/ 115203 | consumed samples: 28382720 | consumed tokens: 58127810560 | elapsed time per iteration (s): 0.58 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 2.550461E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.158 | TFLOPs: 42.25 | +7: iteration 110880/ 115203 | consumed samples: 28385280 | consumed tokens: 58133053440 | elapsed time per iteration (s): 0.58 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 2.554377E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.761 | TFLOPs: 42.40 | +7: iteration 110890/ 115203 | consumed samples: 28387840 | consumed tokens: 58138296320 | elapsed time per iteration (s): 0.59 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 2.557624E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.427 | TFLOPs: 41.61 | +7: iteration 110900/ 115203 | consumed samples: 28390400 | consumed tokens: 58143539200 | elapsed time per iteration (s): 0.57 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 2.555725E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.401 | TFLOPs: 42.46 | +7: iteration 110910/ 115203 | consumed samples: 28392960 | consumed tokens: 58148782080 | elapsed time per iteration (s): 0.58 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 2.563853E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.049 | TFLOPs: 42.24 | +7: iteration 110920/ 115203 | consumed samples: 28395520 | consumed tokens: 58154024960 | elapsed time per iteration (s): 0.58 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 2.563193E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.497 | TFLOPs: 42.38 | +7: iteration 110930/ 115203 | consumed samples: 28398080 | consumed tokens: 58159267840 | elapsed time per iteration (s): 0.57 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 2.548091E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.855 | TFLOPs: 42.60 | +7: iteration 110940/ 115203 | consumed samples: 28400640 | consumed tokens: 58164510720 | elapsed time per iteration (s): 0.57 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 2.556710E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.806 | TFLOPs: 42.88 | +7: iteration 110950/ 115203 | consumed samples: 28403200 | consumed tokens: 58169753600 | elapsed time per iteration (s): 0.58 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 2.561171E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.745 | TFLOPs: 42.31 | +7: iteration 110960/ 115203 | consumed samples: 28405760 | consumed tokens: 58174996480 | elapsed time per iteration (s): 0.57 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 2.567530E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.772 | TFLOPs: 42.88 | +7: iteration 110970/ 115203 | consumed samples: 28408320 | consumed tokens: 58180239360 | elapsed time per iteration (s): 0.59 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 2.562727E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.468 | TFLOPs: 41.42 | +7: iteration 110980/ 115203 | consumed samples: 28410880 | consumed tokens: 58185482240 | elapsed time per iteration (s): 0.57 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 2.556081E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.515 | TFLOPs: 42.86 | +7: iteration 110990/ 115203 | consumed samples: 28413440 | consumed tokens: 58190725120 | elapsed time per iteration (s): 0.57 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 2.546823E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.427 | TFLOPs: 43.04 | +7: iteration 111000/ 115203 | consumed samples: 28416000 | consumed tokens: 58195968000 | elapsed time per iteration (s): 0.57 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 2.553784E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.524 | TFLOPs: 42.67 | +7: iteration 111010/ 115203 | consumed samples: 28418560 | consumed tokens: 58201210880 | elapsed time per iteration (s): 0.58 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 2.552487E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.181 | TFLOPs: 42.44 | +7: iteration 111020/ 115203 | consumed samples: 28421120 | consumed tokens: 58206453760 | elapsed time per iteration (s): 0.58 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 2.565198E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.910 | TFLOPs: 41.75 | +7: iteration 111030/ 115203 | consumed samples: 28423680 | consumed tokens: 58211696640 | elapsed time per iteration (s): 0.57 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 2.553619E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.148 | TFLOPs: 42.63 | +7: iteration 111040/ 115203 | consumed samples: 28426240 | consumed tokens: 58216939520 | elapsed time per iteration (s): 0.57 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 2.540168E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.775 | TFLOPs: 42.50 | +7: iteration 111050/ 115203 | consumed samples: 28428800 | consumed tokens: 58222182400 | elapsed time per iteration (s): 0.58 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 2.548533E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.752 | TFLOPs: 41.93 | +7: iteration 111060/ 115203 | consumed samples: 28431360 | consumed tokens: 58227425280 | elapsed time per iteration (s): 0.58 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 2.551399E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.977 | TFLOPs: 42.42 | +7: iteration 111070/ 115203 | consumed samples: 28433920 | consumed tokens: 58232668160 | elapsed time per iteration (s): 0.58 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 2.558531E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.437 | TFLOPs: 42.18 | +7: iteration 111080/ 115203 | consumed samples: 28436480 | consumed tokens: 58237911040 | elapsed time per iteration (s): 0.57 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 2.551254E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.948 | TFLOPs: 42.52 | +7: iteration 111090/ 115203 | consumed samples: 28439040 | consumed tokens: 58243153920 | elapsed time per iteration (s): 0.58 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 2.545289E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.300 | TFLOPs: 41.79 | +7: iteration 111100/ 115203 | consumed samples: 28441600 | consumed tokens: 58248396800 | elapsed time per iteration (s): 0.57 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 2.545008E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.662 | TFLOPs: 42.58 | +7: iteration 111110/ 115203 | consumed samples: 28444160 | consumed tokens: 58253639680 | elapsed time per iteration (s): 0.57 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 2.559582E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.420 | TFLOPs: 42.56 | +7: iteration 111120/ 115203 | consumed samples: 28446720 | consumed tokens: 58258882560 | elapsed time per iteration (s): 0.57 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 2.561734E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.078 | TFLOPs: 42.72 | +7: iteration 111130/ 115203 | consumed samples: 28449280 | consumed tokens: 58264125440 | elapsed time per iteration (s): 0.57 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 2.570894E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.940 | TFLOPs: 42.52 | +7: iteration 111140/ 115203 | consumed samples: 28451840 | consumed tokens: 58269368320 | elapsed time per iteration (s): 0.57 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 2.570561E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.041 | TFLOPs: 43.10 | +7: iteration 111150/ 115203 | consumed samples: 28454400 | consumed tokens: 58274611200 | elapsed time per iteration (s): 0.59 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 2.551596E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.065 | TFLOPs: 41.10 | +7: iteration 111160/ 115203 | consumed samples: 28456960 | consumed tokens: 58279854080 | elapsed time per iteration (s): 0.57 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 2.551401E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.028 | TFLOPs: 43.00 | +7: iteration 111170/ 115203 | consumed samples: 28459520 | consumed tokens: 58285096960 | elapsed time per iteration (s): 0.56 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 2.549657E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.402 | TFLOPs: 43.23 | +7: iteration 111180/ 115203 | consumed samples: 28462080 | consumed tokens: 58290339840 | elapsed time per iteration (s): 0.56 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 2.561337E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.102 | TFLOPs: 43.48 | +7: iteration 111190/ 115203 | consumed samples: 28464640 | consumed tokens: 58295582720 | elapsed time per iteration (s): 0.57 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 2.554669E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.566 | TFLOPs: 42.86 | +7: iteration 111200/ 115203 | consumed samples: 28467200 | consumed tokens: 58300825600 | elapsed time per iteration (s): 0.57 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 2.539663E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.727 | TFLOPs: 42.97 | +7: iteration 111210/ 115203 | consumed samples: 28469760 | consumed tokens: 58306068480 | elapsed time per iteration (s): 0.58 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 2.548393E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.972 | TFLOPs: 42.23 | +7: iteration 111220/ 115203 | consumed samples: 28472320 | consumed tokens: 58311311360 | elapsed time per iteration (s): 0.57 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 2.559655E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.078 | TFLOPs: 43.10 | +7: iteration 111230/ 115203 | consumed samples: 28474880 | consumed tokens: 58316554240 | elapsed time per iteration (s): 0.57 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 2.561174E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.263 | TFLOPs: 42.93 | +7: iteration 111240/ 115203 | consumed samples: 28477440 | consumed tokens: 58321797120 | elapsed time per iteration (s): 0.55 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 2.542004E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.607 | TFLOPs: 44.01 | +7: iteration 111250/ 115203 | consumed samples: 28480000 | consumed tokens: 58327040000 | elapsed time per iteration (s): 0.56 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 2.548272E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.575 | TFLOPs: 43.53 | +7: iteration 111260/ 115203 | consumed samples: 28482560 | consumed tokens: 58332282880 | elapsed time per iteration (s): 0.57 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 2.567081E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.414 | TFLOPs: 42.75 | +7: iteration 111270/ 115203 | consumed samples: 28485120 | consumed tokens: 58337525760 | elapsed time per iteration (s): 0.56 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 2.544110E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.829 | TFLOPs: 43.55 | +7: iteration 111280/ 115203 | consumed samples: 28487680 | consumed tokens: 58342768640 | elapsed time per iteration (s): 0.55 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 2.545443E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.513 | TFLOPs: 44.00 | +7: iteration 111290/ 115203 | consumed samples: 28490240 | consumed tokens: 58348011520 | elapsed time per iteration (s): 0.58 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 2.572324E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.771 | TFLOPs: 42.40 | +7: iteration 111300/ 115203 | consumed samples: 28492800 | consumed tokens: 58353254400 | elapsed time per iteration (s): 0.56 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 2.539407E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.172 | TFLOPs: 43.59 | +7: iteration 111310/ 115203 | consumed samples: 28495360 | consumed tokens: 58358497280 | elapsed time per iteration (s): 0.56 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 2.545688E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.755 | TFLOPs: 43.36 | +7: iteration 111320/ 115203 | consumed samples: 28497920 | consumed tokens: 58363740160 | elapsed time per iteration (s): 0.57 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 2.561697E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.269 | TFLOPs: 42.93 | +7: iteration 111330/ 115203 | consumed samples: 28500480 | consumed tokens: 58368983040 | elapsed time per iteration (s): 0.57 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 2.551958E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.603 | TFLOPs: 42.96 | +7: iteration 111340/ 115203 | consumed samples: 28503040 | consumed tokens: 58374225920 | elapsed time per iteration (s): 0.56 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 2.554809E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.848 | TFLOPs: 43.46 | +7: iteration 111350/ 115203 | consumed samples: 28505600 | consumed tokens: 58379468800 | elapsed time per iteration (s): 0.57 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 2.552960E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.407 | TFLOPs: 43.04 | +7: iteration 111360/ 115203 | consumed samples: 28508160 | consumed tokens: 58384711680 | elapsed time per iteration (s): 0.55 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 2.561079E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.038 | TFLOPs: 44.05 | +7: iteration 111370/ 115203 | consumed samples: 28510720 | consumed tokens: 58389954560 | elapsed time per iteration (s): 0.55 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 2.542412E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.861 | TFLOPs: 44.03 | +7: iteration 111380/ 115203 | consumed samples: 28513280 | consumed tokens: 58395197440 | elapsed time per iteration (s): 0.56 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 2.550462E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.078 | TFLOPs: 43.48 | +7: iteration 111390/ 115203 | consumed samples: 28515840 | consumed tokens: 58400440320 | elapsed time per iteration (s): 0.57 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 2.544549E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.463 | TFLOPs: 43.04 | +7: iteration 111400/ 115203 | consumed samples: 28518400 | consumed tokens: 58405683200 | elapsed time per iteration (s): 0.56 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 2.550971E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.125 | TFLOPs: 43.39 | +7: iteration 111410/ 115203 | consumed samples: 28520960 | consumed tokens: 58410926080 | elapsed time per iteration (s): 0.56 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 2.542152E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.976 | TFLOPs: 43.85 | +7: iteration 111420/ 115203 | consumed samples: 28523520 | consumed tokens: 58416168960 | elapsed time per iteration (s): 0.57 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 2.548486E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.622 | TFLOPs: 42.68 | +7: iteration 111430/ 115203 | consumed samples: 28526080 | consumed tokens: 58421411840 | elapsed time per iteration (s): 0.57 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 2.554522E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.327 | TFLOPs: 42.84 | +7: iteration 111440/ 115203 | consumed samples: 28528640 | consumed tokens: 58426654720 | elapsed time per iteration (s): 0.56 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 2.561156E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.337 | TFLOPs: 43.41 | +7: iteration 111450/ 115203 | consumed samples: 28531200 | consumed tokens: 58431897600 | elapsed time per iteration (s): 0.56 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 2.550297E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.994 | TFLOPs: 43.47 | +7: iteration 111460/ 115203 | consumed samples: 28533760 | consumed tokens: 58437140480 | elapsed time per iteration (s): 0.57 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 2.559631E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.427 | TFLOPs: 43.13 | +7: iteration 111470/ 115203 | consumed samples: 28536320 | consumed tokens: 58442383360 | elapsed time per iteration (s): 0.56 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 2.556985E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.770 | TFLOPs: 43.45 | +7: iteration 111480/ 115203 | consumed samples: 28538880 | consumed tokens: 58447626240 | elapsed time per iteration (s): 0.58 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 2.550596E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.820 | TFLOPs: 41.74 | +7: iteration 111490/ 115203 | consumed samples: 28541440 | consumed tokens: 58452869120 | elapsed time per iteration (s): 0.56 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 2.564527E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.407 | TFLOPs: 43.70 | +7: iteration 111500/ 115203 | consumed samples: 28544000 | consumed tokens: 58458112000 | elapsed time per iteration (s): 0.57 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 2.549779E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.374 | TFLOPs: 42.75 | +7: iteration 111510/ 115203 | consumed samples: 28546560 | consumed tokens: 58463354880 | elapsed time per iteration (s): 0.56 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 2.546150E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.487 | TFLOPs: 43.24 | +7: iteration 111520/ 115203 | consumed samples: 28549120 | consumed tokens: 58468597760 | elapsed time per iteration (s): 0.57 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 2.558172E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.870 | TFLOPs: 43.08 | +7: iteration 111530/ 115203 | consumed samples: 28551680 | consumed tokens: 58473840640 | elapsed time per iteration (s): 0.57 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 2.557672E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.646 | TFLOPs: 43.15 | +7: iteration 111540/ 115203 | consumed samples: 28554240 | consumed tokens: 58479083520 | elapsed time per iteration (s): 0.56 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 2.555592E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.177 | TFLOPs: 43.49 | +7: iteration 111550/ 115203 | consumed samples: 28556800 | consumed tokens: 58484326400 | elapsed time per iteration (s): 0.59 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 2.552750E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.420 | TFLOPs: 41.32 | +7: iteration 111560/ 115203 | consumed samples: 28559360 | consumed tokens: 58489569280 | elapsed time per iteration (s): 0.58 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 2.564257E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.605 | TFLOPs: 42.01 | +7: iteration 111570/ 115203 | consumed samples: 28561920 | consumed tokens: 58494812160 | elapsed time per iteration (s): 0.57 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 2.549599E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.508 | TFLOPs: 42.76 | +7: iteration 111580/ 115203 | consumed samples: 28564480 | consumed tokens: 58500055040 | elapsed time per iteration (s): 0.57 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 2.562887E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.547 | TFLOPs: 42.57 | +7: iteration 111590/ 115203 | consumed samples: 28567040 | consumed tokens: 58505297920 | elapsed time per iteration (s): 0.60 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 2.538909E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.868 | TFLOPs: 40.70 | +7: iteration 111600/ 115203 | consumed samples: 28569600 | consumed tokens: 58510540800 | elapsed time per iteration (s): 0.57 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 2.563734E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.768 | TFLOPs: 42.79 | +7: iteration 111610/ 115203 | consumed samples: 28572160 | consumed tokens: 58515783680 | elapsed time per iteration (s): 0.57 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 2.566934E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.861 | TFLOPs: 42.98 | +7: iteration 111620/ 115203 | consumed samples: 28574720 | consumed tokens: 58521026560 | elapsed time per iteration (s): 0.59 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 2.554299E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.872 | TFLOPs: 41.65 | +7: iteration 111630/ 115203 | consumed samples: 28577280 | consumed tokens: 58526269440 | elapsed time per iteration (s): 0.59 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 2.543892E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.496 | TFLOPs: 41.23 | +7: iteration 111640/ 115203 | consumed samples: 28579840 | consumed tokens: 58531512320 | elapsed time per iteration (s): 0.56 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 2.557978E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.309 | TFLOPs: 43.41 | +7: iteration 111650/ 115203 | consumed samples: 28582400 | consumed tokens: 58536755200 | elapsed time per iteration (s): 0.59 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 2.558046E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 434.521 | TFLOPs: 41.43 | +7: iteration 111660/ 115203 | consumed samples: 28584960 | consumed tokens: 58541998080 | elapsed time per iteration (s): 0.58 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 2.557263E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.651 | TFLOPs: 42.01 | +7: iteration 111670/ 115203 | consumed samples: 28587520 | consumed tokens: 58547240960 | elapsed time per iteration (s): 0.59 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 2.555478E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.233 | TFLOPs: 41.69 | +7: iteration 111680/ 115203 | consumed samples: 28590080 | consumed tokens: 58552483840 | elapsed time per iteration (s): 0.57 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 2.563116E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.587 | TFLOPs: 42.96 | +7: iteration 111690/ 115203 | consumed samples: 28592640 | consumed tokens: 58557726720 | elapsed time per iteration (s): 0.58 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 2.565060E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.894 | TFLOPs: 41.84 | +7: iteration 111700/ 115203 | consumed samples: 28595200 | consumed tokens: 58562969600 | elapsed time per iteration (s): 0.57 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 2.553841E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.098 | TFLOPs: 42.72 | +7: iteration 111710/ 115203 | consumed samples: 28597760 | consumed tokens: 58568212480 | elapsed time per iteration (s): 0.61 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 2.560496E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 421.600 | TFLOPs: 40.19 | +7: iteration 111720/ 115203 | consumed samples: 28600320 | consumed tokens: 58573455360 | elapsed time per iteration (s): 0.59 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 2.566868E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.580 | TFLOPs: 41.34 | +7: iteration 111730/ 115203 | consumed samples: 28602880 | consumed tokens: 58578698240 | elapsed time per iteration (s): 0.57 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 2.550301E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.170 | TFLOPs: 43.11 | +7: iteration 111740/ 115203 | consumed samples: 28605440 | consumed tokens: 58583941120 | elapsed time per iteration (s): 0.60 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 2.549174E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 429.785 | TFLOPs: 40.98 | +7: iteration 111750/ 115203 | consumed samples: 28608000 | consumed tokens: 58589184000 | elapsed time per iteration (s): 0.60 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 2.555761E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 427.410 | TFLOPs: 40.75 | +7: iteration 111760/ 115203 | consumed samples: 28610560 | consumed tokens: 58594426880 | elapsed time per iteration (s): 0.58 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 2.573552E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.390 | TFLOPs: 41.89 | +7: iteration 111770/ 115203 | consumed samples: 28613120 | consumed tokens: 58599669760 | elapsed time per iteration (s): 0.56 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 2.554989E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.192 | TFLOPs: 43.21 | +7: iteration 111780/ 115203 | consumed samples: 28615680 | consumed tokens: 58604912640 | elapsed time per iteration (s): 0.57 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 2.559037E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.449 | TFLOPs: 43.14 | +7: iteration 111790/ 115203 | consumed samples: 28618240 | consumed tokens: 58610155520 | elapsed time per iteration (s): 0.57 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 2.552690E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.474 | TFLOPs: 42.66 | +7: iteration 111800/ 115203 | consumed samples: 28620800 | consumed tokens: 58615398400 | elapsed time per iteration (s): 0.59 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 2.560190E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.831 | TFLOPs: 41.17 | +7: iteration 111810/ 115203 | consumed samples: 28623360 | consumed tokens: 58620641280 | elapsed time per iteration (s): 0.60 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 2.558010E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.347 | TFLOPs: 40.65 | +7: iteration 111820/ 115203 | consumed samples: 28625920 | consumed tokens: 58625884160 | elapsed time per iteration (s): 0.57 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 2.563778E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.284 | TFLOPs: 42.45 | +7: iteration 111830/ 115203 | consumed samples: 28628480 | consumed tokens: 58631127040 | elapsed time per iteration (s): 0.58 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 2.559569E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.295 | TFLOPs: 41.98 | +7: iteration 111840/ 115203 | consumed samples: 28631040 | consumed tokens: 58636369920 | elapsed time per iteration (s): 0.57 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 2.563271E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.128 | TFLOPs: 42.72 | +7: iteration 111850/ 115203 | consumed samples: 28633600 | consumed tokens: 58641612800 | elapsed time per iteration (s): 0.56 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 2.558451E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.616 | TFLOPs: 43.63 | +7: iteration 111860/ 115203 | consumed samples: 28636160 | consumed tokens: 58646855680 | elapsed time per iteration (s): 0.56 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 2.560746E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.891 | TFLOPs: 43.37 | +7: iteration 111870/ 115203 | consumed samples: 28638720 | consumed tokens: 58652098560 | elapsed time per iteration (s): 0.59 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 2.558013E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.328 | TFLOPs: 41.22 | +7: iteration 111880/ 115203 | consumed samples: 28641280 | consumed tokens: 58657341440 | elapsed time per iteration (s): 0.60 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 2.550968E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 425.847 | TFLOPs: 40.60 | +7: iteration 111890/ 115203 | consumed samples: 28643840 | consumed tokens: 58662584320 | elapsed time per iteration (s): 0.56 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 2.555645E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.978 | TFLOPs: 43.28 | +7: iteration 111900/ 115203 | consumed samples: 28646400 | consumed tokens: 58667827200 | elapsed time per iteration (s): 0.59 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 2.555087E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 433.056 | TFLOPs: 41.29 | +7: iteration 111910/ 115203 | consumed samples: 28648960 | consumed tokens: 58673070080 | elapsed time per iteration (s): 0.59 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 2.551433E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 431.270 | TFLOPs: 41.12 | +7: iteration 111920/ 115203 | consumed samples: 28651520 | consumed tokens: 58678312960 | elapsed time per iteration (s): 0.58 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 2.553469E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.290 | TFLOPs: 42.26 | +7: iteration 111930/ 115203 | consumed samples: 28654080 | consumed tokens: 58683555840 | elapsed time per iteration (s): 0.57 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 2.565718E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.277 | TFLOPs: 42.83 | +7: iteration 111940/ 115203 | consumed samples: 28656640 | consumed tokens: 58688798720 | elapsed time per iteration (s): 0.56 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 2.566329E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.472 | TFLOPs: 43.61 | +7: iteration 111950/ 115203 | consumed samples: 28659200 | consumed tokens: 58694041600 | elapsed time per iteration (s): 0.57 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 2.558584E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.992 | TFLOPs: 43.19 | +7: iteration 111960/ 115203 | consumed samples: 28661760 | consumed tokens: 58699284480 | elapsed time per iteration (s): 0.56 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 2.548487E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.701 | TFLOPs: 43.64 | +7: iteration 111970/ 115203 | consumed samples: 28664320 | consumed tokens: 58704527360 | elapsed time per iteration (s): 0.56 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 2.553782E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.766 | TFLOPs: 43.55 | +7: iteration 111980/ 115203 | consumed samples: 28666880 | consumed tokens: 58709770240 | elapsed time per iteration (s): 0.58 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 2.553032E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.417 | TFLOPs: 42.37 | +7: iteration 111990/ 115203 | consumed samples: 28669440 | consumed tokens: 58715013120 | elapsed time per iteration (s): 0.56 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 2.551441E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.976 | TFLOPs: 43.76 | +0: [2023-03-17 06:25:26,512] [INFO] [logging.py:68:log_dist] [Rank 0] step=112000, skipped=0, lr=[2.0350245708025642e-05, 2.0350245708025642e-05, 2.0350245708025642e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 112000/ 115203 | consumed samples: 28672000 | consumed tokens: 58720256000 | elapsed time per iteration (s): 0.56 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 2.553196E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.831 | TFLOPs: 43.27 | +0: steps: 112000 loss: 2.5267 iter time (s): 0.568 samples/sec: 450.407 +7: iteration 112010/ 115203 | consumed samples: 28674560 | consumed tokens: 58725498880 | elapsed time per iteration (s): 0.57 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 2.546374E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.414 | TFLOPs: 42.85 | +7: iteration 112020/ 115203 | consumed samples: 28677120 | consumed tokens: 58730741760 | elapsed time per iteration (s): 0.58 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 2.541611E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.062 | TFLOPs: 42.34 | +7: iteration 112030/ 115203 | consumed samples: 28679680 | consumed tokens: 58735984640 | elapsed time per iteration (s): 0.57 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 2.561909E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.710 | TFLOPs: 43.07 | +7: iteration 112040/ 115203 | consumed samples: 28682240 | consumed tokens: 58741227520 | elapsed time per iteration (s): 0.57 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 2.550584E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.903 | TFLOPs: 42.99 | +7: iteration 112050/ 115203 | consumed samples: 28684800 | consumed tokens: 58746470400 | elapsed time per iteration (s): 0.57 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 2.564094E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.332 | TFLOPs: 42.93 | +7: iteration 112060/ 115203 | consumed samples: 28687360 | consumed tokens: 58751713280 | elapsed time per iteration (s): 0.57 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 2.556172E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.288 | TFLOPs: 43.12 | +7: iteration 112070/ 115203 | consumed samples: 28689920 | consumed tokens: 58756956160 | elapsed time per iteration (s): 0.57 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 2.558206E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.046 | TFLOPs: 43.00 | +7: iteration 112080/ 115203 | consumed samples: 28692480 | consumed tokens: 58762199040 | elapsed time per iteration (s): 0.57 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 2.556417E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.431 | TFLOPs: 42.47 | +7: iteration 112090/ 115203 | consumed samples: 28695040 | consumed tokens: 58767441920 | elapsed time per iteration (s): 0.59 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 2.554928E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 432.328 | TFLOPs: 41.22 | +7: iteration 112100/ 115203 | consumed samples: 28697600 | consumed tokens: 58772684800 | elapsed time per iteration (s): 0.57 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 2.563692E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.711 | TFLOPs: 42.78 | +7: iteration 112110/ 115203 | consumed samples: 28700160 | consumed tokens: 58777927680 | elapsed time per iteration (s): 0.56 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 2.542912E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.526 | TFLOPs: 43.62 | +7: iteration 112120/ 115203 | consumed samples: 28702720 | consumed tokens: 58783170560 | elapsed time per iteration (s): 0.57 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 2.552400E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.159 | TFLOPs: 42.92 | +7: iteration 112130/ 115203 | consumed samples: 28705280 | consumed tokens: 58788413440 | elapsed time per iteration (s): 0.57 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 2.545460E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.717 | TFLOPs: 42.49 | +7: iteration 112140/ 115203 | consumed samples: 28707840 | consumed tokens: 58793656320 | elapsed time per iteration (s): 0.57 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 2.553302E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.321 | TFLOPs: 42.74 | +7: iteration 112150/ 115203 | consumed samples: 28710400 | consumed tokens: 58798899200 | elapsed time per iteration (s): 0.58 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 2.555464E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.829 | TFLOPs: 42.22 | +7: iteration 112160/ 115203 | consumed samples: 28712960 | consumed tokens: 58804142080 | elapsed time per iteration (s): 0.56 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 2.551505E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.188 | TFLOPs: 43.21 | +7: iteration 112170/ 115203 | consumed samples: 28715520 | consumed tokens: 58809384960 | elapsed time per iteration (s): 0.58 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 2.562793E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.344 | TFLOPs: 42.08 | +7: iteration 112180/ 115203 | consumed samples: 28718080 | consumed tokens: 58814627840 | elapsed time per iteration (s): 0.57 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 2.544092E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.066 | TFLOPs: 43.19 | +7: iteration 112190/ 115203 | consumed samples: 28720640 | consumed tokens: 58819870720 | elapsed time per iteration (s): 0.58 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 2.562875E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.824 | TFLOPs: 42.31 | +7: iteration 112200/ 115203 | consumed samples: 28723200 | consumed tokens: 58825113600 | elapsed time per iteration (s): 0.58 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 2.554228E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.444 | TFLOPs: 41.80 | +7: iteration 112210/ 115203 | consumed samples: 28725760 | consumed tokens: 58830356480 | elapsed time per iteration (s): 0.58 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 2.544172E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.340 | TFLOPs: 42.17 | +7: iteration 112220/ 115203 | consumed samples: 28728320 | consumed tokens: 58835599360 | elapsed time per iteration (s): 0.56 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 2.555536E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.200 | TFLOPs: 43.49 | +7: iteration 112230/ 115203 | consumed samples: 28730880 | consumed tokens: 58840842240 | elapsed time per iteration (s): 0.56 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 2.560831E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.389 | TFLOPs: 43.61 | +7: iteration 112240/ 115203 | consumed samples: 28733440 | consumed tokens: 58846085120 | elapsed time per iteration (s): 0.57 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 2.545418E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.905 | TFLOPs: 43.18 | +7: iteration 112250/ 115203 | consumed samples: 28736000 | consumed tokens: 58851328000 | elapsed time per iteration (s): 0.56 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 2.543351E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.722 | TFLOPs: 43.92 | +7: iteration 112260/ 115203 | consumed samples: 28738560 | consumed tokens: 58856570880 | elapsed time per iteration (s): 0.56 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 2.545375E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.968 | TFLOPs: 43.38 | +7: iteration 112270/ 115203 | consumed samples: 28741120 | consumed tokens: 58861813760 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 2.547739E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.236 | TFLOPs: 43.40 | +7: iteration 112280/ 115203 | consumed samples: 28743680 | consumed tokens: 58867056640 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 2.562094E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.094 | TFLOPs: 43.77 | +7: iteration 112290/ 115203 | consumed samples: 28746240 | consumed tokens: 58872299520 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 2.545615E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.018 | TFLOPs: 43.48 | +7: iteration 112300/ 115203 | consumed samples: 28748800 | consumed tokens: 58877542400 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 2.559389E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.756 | TFLOPs: 43.45 | +7: iteration 112310/ 115203 | consumed samples: 28751360 | consumed tokens: 58882785280 | elapsed time per iteration (s): 0.56 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 2.552267E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.064 | TFLOPs: 43.48 | +7: iteration 112320/ 115203 | consumed samples: 28753920 | consumed tokens: 58888028160 | elapsed time per iteration (s): 0.56 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 2.542822E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.213 | TFLOPs: 43.78 | +7: iteration 112330/ 115203 | consumed samples: 28756480 | consumed tokens: 58893271040 | elapsed time per iteration (s): 0.56 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 2.553577E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.227 | TFLOPs: 43.69 | +7: iteration 112340/ 115203 | consumed samples: 28759040 | consumed tokens: 58898513920 | elapsed time per iteration (s): 0.56 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 2.566697E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.685 | TFLOPs: 43.54 | +7: iteration 112350/ 115203 | consumed samples: 28761600 | consumed tokens: 58903756800 | elapsed time per iteration (s): 0.57 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 2.559219E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.415 | TFLOPs: 43.13 | +7: iteration 112360/ 115203 | consumed samples: 28764160 | consumed tokens: 58908999680 | elapsed time per iteration (s): 0.57 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 2.561995E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.682 | TFLOPs: 43.06 | +7: iteration 112370/ 115203 | consumed samples: 28766720 | consumed tokens: 58914242560 | elapsed time per iteration (s): 0.57 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 2.567540E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.640 | TFLOPs: 43.06 | +7: iteration 112380/ 115203 | consumed samples: 28769280 | consumed tokens: 58919485440 | elapsed time per iteration (s): 0.56 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 2.538757E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.342 | TFLOPs: 43.22 | +7: iteration 112390/ 115203 | consumed samples: 28771840 | consumed tokens: 58924728320 | elapsed time per iteration (s): 0.57 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 2.559300E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.278 | TFLOPs: 43.12 | +7: iteration 112400/ 115203 | consumed samples: 28774400 | consumed tokens: 58929971200 | elapsed time per iteration (s): 0.57 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 2.556312E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.986 | TFLOPs: 42.62 | +7: iteration 112410/ 115203 | consumed samples: 28776960 | consumed tokens: 58935214080 | elapsed time per iteration (s): 0.57 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 2.540772E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.230 | TFLOPs: 42.54 | +7: iteration 112420/ 115203 | consumed samples: 28779520 | consumed tokens: 58940456960 | elapsed time per iteration (s): 0.57 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.546158E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.294 | TFLOPs: 42.55 | +7: iteration 112430/ 115203 | consumed samples: 28782080 | consumed tokens: 58945699840 | elapsed time per iteration (s): 0.56 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.549856E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.133 | TFLOPs: 43.68 | +7: iteration 112440/ 115203 | consumed samples: 28784640 | consumed tokens: 58950942720 | elapsed time per iteration (s): 0.56 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.543969E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.968 | TFLOPs: 43.28 | +7: iteration 112450/ 115203 | consumed samples: 28787200 | consumed tokens: 58956185600 | elapsed time per iteration (s): 0.56 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.546756E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.345 | TFLOPs: 43.89 | +7: iteration 112460/ 115203 | consumed samples: 28789760 | consumed tokens: 58961428480 | elapsed time per iteration (s): 0.55 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.556459E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.856 | TFLOPs: 44.03 | +7: iteration 112470/ 115203 | consumed samples: 28792320 | consumed tokens: 58966671360 | elapsed time per iteration (s): 0.56 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 2.553413E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.340 | TFLOPs: 43.22 | +7: iteration 112480/ 115203 | consumed samples: 28794880 | consumed tokens: 58971914240 | elapsed time per iteration (s): 0.57 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 2.559309E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.539 | TFLOPs: 43.14 | +7: iteration 112490/ 115203 | consumed samples: 28797440 | consumed tokens: 58977157120 | elapsed time per iteration (s): 0.57 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 2.558357E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.903 | TFLOPs: 42.89 | +7: iteration 112500/ 115203 | consumed samples: 28800000 | consumed tokens: 58982400000 | elapsed time per iteration (s): 0.57 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 2.551139E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.061 | TFLOPs: 43.19 | +7: iteration 112510/ 115203 | consumed samples: 28802560 | consumed tokens: 58987642880 | elapsed time per iteration (s): 0.56 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 2.548555E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.672 | TFLOPs: 43.25 | +7: iteration 112520/ 115203 | consumed samples: 28805120 | consumed tokens: 58992885760 | elapsed time per iteration (s): 0.57 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 2.568531E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.603 | TFLOPs: 42.67 | +7: iteration 112530/ 115203 | consumed samples: 28807680 | consumed tokens: 58998128640 | elapsed time per iteration (s): 0.56 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 2.550172E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.694 | TFLOPs: 43.54 | +7: iteration 112540/ 115203 | consumed samples: 28810240 | consumed tokens: 59003371520 | elapsed time per iteration (s): 0.58 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 2.541099E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.954 | TFLOPs: 42.33 | +7: iteration 112550/ 115203 | consumed samples: 28812800 | consumed tokens: 59008614400 | elapsed time per iteration (s): 0.56 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 2.556964E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.339 | TFLOPs: 43.79 | +7: iteration 112560/ 115203 | consumed samples: 28815360 | consumed tokens: 59013857280 | elapsed time per iteration (s): 0.57 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 2.564420E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.946 | TFLOPs: 43.09 | +7: iteration 112570/ 115203 | consumed samples: 28817920 | consumed tokens: 59019100160 | elapsed time per iteration (s): 0.57 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 2.553736E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.210 | TFLOPs: 42.92 | +7: iteration 112580/ 115203 | consumed samples: 28820480 | consumed tokens: 59024343040 | elapsed time per iteration (s): 0.56 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.548726E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.571 | TFLOPs: 43.53 | +7: iteration 112590/ 115203 | consumed samples: 28823040 | consumed tokens: 59029585920 | elapsed time per iteration (s): 0.57 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.541308E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.582 | TFLOPs: 42.58 | +7: iteration 112600/ 115203 | consumed samples: 28825600 | consumed tokens: 59034828800 | elapsed time per iteration (s): 0.58 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.545998E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.041 | TFLOPs: 42.43 | +7: iteration 112610/ 115203 | consumed samples: 28828160 | consumed tokens: 59040071680 | elapsed time per iteration (s): 0.57 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.553908E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.412 | TFLOPs: 42.75 | +7: iteration 112620/ 115203 | consumed samples: 28830720 | consumed tokens: 59045314560 | elapsed time per iteration (s): 0.57 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.548825E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.620 | TFLOPs: 42.77 | +7: iteration 112630/ 115203 | consumed samples: 28833280 | consumed tokens: 59050557440 | elapsed time per iteration (s): 0.57 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 2.552711E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.924 | TFLOPs: 43.09 | +7: iteration 112640/ 115203 | consumed samples: 28835840 | consumed tokens: 59055800320 | elapsed time per iteration (s): 0.57 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.551100E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.648 | TFLOPs: 43.06 | +7: iteration 112650/ 115203 | consumed samples: 28838400 | consumed tokens: 59061043200 | elapsed time per iteration (s): 0.55 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.559148E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.486 | TFLOPs: 44.00 | +7: iteration 112660/ 115203 | consumed samples: 28840960 | consumed tokens: 59066286080 | elapsed time per iteration (s): 0.56 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.544751E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.530 | TFLOPs: 43.33 | +7: iteration 112670/ 115203 | consumed samples: 28843520 | consumed tokens: 59071528960 | elapsed time per iteration (s): 0.56 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.554775E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.498 | TFLOPs: 43.62 | +7: iteration 112680/ 115203 | consumed samples: 28846080 | consumed tokens: 59076771840 | elapsed time per iteration (s): 0.57 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.571341E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.330 | TFLOPs: 42.74 | +7: iteration 112690/ 115203 | consumed samples: 28848640 | consumed tokens: 59082014720 | elapsed time per iteration (s): 0.56 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 2.560155E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.137 | TFLOPs: 43.68 | +7: iteration 112700/ 115203 | consumed samples: 28851200 | consumed tokens: 59087257600 | elapsed time per iteration (s): 0.57 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.562955E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.276 | TFLOPs: 42.83 | +7: iteration 112710/ 115203 | consumed samples: 28853760 | consumed tokens: 59092500480 | elapsed time per iteration (s): 0.57 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.552156E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.046 | TFLOPs: 43.10 | +7: iteration 112720/ 115203 | consumed samples: 28856320 | consumed tokens: 59097743360 | elapsed time per iteration (s): 0.56 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.548332E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.785 | TFLOPs: 43.64 | +7: iteration 112730/ 115203 | consumed samples: 28858880 | consumed tokens: 59102986240 | elapsed time per iteration (s): 0.57 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.560287E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.422 | TFLOPs: 42.75 | +7: iteration 112740/ 115203 | consumed samples: 28861440 | consumed tokens: 59108229120 | elapsed time per iteration (s): 0.57 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.551461E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.749 | TFLOPs: 43.16 | +7: iteration 112750/ 115203 | consumed samples: 28864000 | consumed tokens: 59113472000 | elapsed time per iteration (s): 0.57 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 2.559009E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.726 | TFLOPs: 42.59 | +7: iteration 112760/ 115203 | consumed samples: 28866560 | consumed tokens: 59118714880 | elapsed time per iteration (s): 0.56 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.556081E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.756 | TFLOPs: 43.64 | +7: iteration 112770/ 115203 | consumed samples: 28869120 | consumed tokens: 59123957760 | elapsed time per iteration (s): 0.56 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.551239E+00 | grad norm: 0.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.297 | TFLOPs: 43.88 | +7: iteration 112780/ 115203 | consumed samples: 28871680 | consumed tokens: 59129200640 | elapsed time per iteration (s): 0.57 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.527294E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.813 | TFLOPs: 42.60 | +7: iteration 112790/ 115203 | consumed samples: 28874240 | consumed tokens: 59134443520 | elapsed time per iteration (s): 0.57 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.544585E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.997 | TFLOPs: 42.90 | +7: iteration 112800/ 115203 | consumed samples: 28876800 | consumed tokens: 59139686400 | elapsed time per iteration (s): 0.55 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.552710E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.687 | TFLOPs: 44.02 | +7: iteration 112810/ 115203 | consumed samples: 28879360 | consumed tokens: 59144929280 | elapsed time per iteration (s): 0.56 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 2.563169E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.039 | TFLOPs: 43.38 | +7: iteration 112820/ 115203 | consumed samples: 28881920 | consumed tokens: 59150172160 | elapsed time per iteration (s): 0.56 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.556924E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.102 | TFLOPs: 43.58 | +7: iteration 112830/ 115203 | consumed samples: 28884480 | consumed tokens: 59155415040 | elapsed time per iteration (s): 0.56 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.562594E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.177 | TFLOPs: 43.40 | +7: iteration 112840/ 115203 | consumed samples: 28887040 | consumed tokens: 59160657920 | elapsed time per iteration (s): 0.56 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.553678E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.303 | TFLOPs: 43.50 | +7: iteration 112850/ 115203 | consumed samples: 28889600 | consumed tokens: 59165900800 | elapsed time per iteration (s): 0.57 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.551378E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.586 | TFLOPs: 42.96 | +7: iteration 112860/ 115203 | consumed samples: 28892160 | consumed tokens: 59171143680 | elapsed time per iteration (s): 0.56 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.554302E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.529 | TFLOPs: 43.43 | +7: iteration 112870/ 115203 | consumed samples: 28894720 | consumed tokens: 59176386560 | elapsed time per iteration (s): 0.57 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 2.550824E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.972 | TFLOPs: 43.19 | +7: iteration 112880/ 115203 | consumed samples: 28897280 | consumed tokens: 59181629440 | elapsed time per iteration (s): 0.56 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.571377E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.975 | TFLOPs: 43.28 | +7: iteration 112890/ 115203 | consumed samples: 28899840 | consumed tokens: 59186872320 | elapsed time per iteration (s): 0.56 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.550511E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.535 | TFLOPs: 43.24 | +7: iteration 112900/ 115203 | consumed samples: 28902400 | consumed tokens: 59192115200 | elapsed time per iteration (s): 0.56 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.554568E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.945 | TFLOPs: 43.37 | +7: iteration 112910/ 115203 | consumed samples: 28904960 | consumed tokens: 59197358080 | elapsed time per iteration (s): 0.57 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.563482E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.216 | TFLOPs: 42.92 | +7: iteration 112920/ 115203 | consumed samples: 28907520 | consumed tokens: 59202600960 | elapsed time per iteration (s): 0.56 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.566480E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.720 | TFLOPs: 43.35 | +7: iteration 112930/ 115203 | consumed samples: 28910080 | consumed tokens: 59207843840 | elapsed time per iteration (s): 0.57 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 2.552522E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.240 | TFLOPs: 42.83 | +7: iteration 112940/ 115203 | consumed samples: 28912640 | consumed tokens: 59213086720 | elapsed time per iteration (s): 0.57 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.568248E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.193 | TFLOPs: 42.73 | +7: iteration 112950/ 115203 | consumed samples: 28915200 | consumed tokens: 59218329600 | elapsed time per iteration (s): 0.56 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.560767E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.293 | TFLOPs: 43.41 | +7: iteration 112960/ 115203 | consumed samples: 28917760 | consumed tokens: 59223572480 | elapsed time per iteration (s): 0.56 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.547451E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.332 | TFLOPs: 43.51 | +7: iteration 112970/ 115203 | consumed samples: 28920320 | consumed tokens: 59228815360 | elapsed time per iteration (s): 0.57 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.548219E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.431 | TFLOPs: 42.56 | +7: iteration 112980/ 115203 | consumed samples: 28922880 | consumed tokens: 59234058240 | elapsed time per iteration (s): 0.55 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.545513E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.633 | TFLOPs: 44.01 | +7: iteration 112990/ 115203 | consumed samples: 28925440 | consumed tokens: 59239301120 | elapsed time per iteration (s): 0.57 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.545173E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.159 | TFLOPs: 43.11 | +7: iteration 113000/ 115203 | consumed samples: 28928000 | consumed tokens: 59244544000 | elapsed time per iteration (s): 0.56 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 2.547004E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.616 | TFLOPs: 43.25 | +7: iteration 113010/ 115203 | consumed samples: 28930560 | consumed tokens: 59249786880 | elapsed time per iteration (s): 0.55 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.555821E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.676 | TFLOPs: 44.02 | +7: iteration 113020/ 115203 | consumed samples: 28933120 | consumed tokens: 59255029760 | elapsed time per iteration (s): 0.56 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.548972E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.797 | TFLOPs: 43.93 | +7: iteration 113030/ 115203 | consumed samples: 28935680 | consumed tokens: 59260272640 | elapsed time per iteration (s): 0.56 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.548673E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.842 | TFLOPs: 43.75 | +7: iteration 113040/ 115203 | consumed samples: 28938240 | consumed tokens: 59265515520 | elapsed time per iteration (s): 0.57 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.547308E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.500 | TFLOPs: 42.66 | +7: iteration 113050/ 115203 | consumed samples: 28940800 | consumed tokens: 59270758400 | elapsed time per iteration (s): 0.56 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.565140E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.360 | TFLOPs: 43.70 | +7: iteration 113060/ 115203 | consumed samples: 28943360 | consumed tokens: 59276001280 | elapsed time per iteration (s): 0.57 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.555857E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.142 | TFLOPs: 43.11 | +7: iteration 113070/ 115203 | consumed samples: 28945920 | consumed tokens: 59281244160 | elapsed time per iteration (s): 0.57 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 2.551009E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.718 | TFLOPs: 43.07 | +7: iteration 113080/ 115203 | consumed samples: 28948480 | consumed tokens: 59286487040 | elapsed time per iteration (s): 0.58 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.552618E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.067 | TFLOPs: 42.15 | +7: iteration 113090/ 115203 | consumed samples: 28951040 | consumed tokens: 59291729920 | elapsed time per iteration (s): 0.55 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.544592E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.750 | TFLOPs: 44.02 | +7: iteration 113100/ 115203 | consumed samples: 28953600 | consumed tokens: 59296972800 | elapsed time per iteration (s): 0.56 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.546863E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.713 | TFLOPs: 43.73 | +7: iteration 113110/ 115203 | consumed samples: 28956160 | consumed tokens: 59302215680 | elapsed time per iteration (s): 0.56 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.568476E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.142 | TFLOPs: 43.58 | +7: iteration 113120/ 115203 | consumed samples: 28958720 | consumed tokens: 59307458560 | elapsed time per iteration (s): 0.55 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.557217E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.599 | TFLOPs: 44.01 | +7: iteration 113130/ 115203 | consumed samples: 28961280 | consumed tokens: 59312701440 | elapsed time per iteration (s): 0.57 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.554626E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.356 | TFLOPs: 42.94 | +7: iteration 113140/ 115203 | consumed samples: 28963840 | consumed tokens: 59317944320 | elapsed time per iteration (s): 0.57 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 2.560010E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.724 | TFLOPs: 43.16 | +7: iteration 113150/ 115203 | consumed samples: 28966400 | consumed tokens: 59323187200 | elapsed time per iteration (s): 0.55 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.568379E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.737 | TFLOPs: 44.02 | +7: iteration 113160/ 115203 | consumed samples: 28968960 | consumed tokens: 59328430080 | elapsed time per iteration (s): 0.57 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.546165E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.335 | TFLOPs: 42.93 | +7: iteration 113170/ 115203 | consumed samples: 28971520 | consumed tokens: 59333672960 | elapsed time per iteration (s): 0.56 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.560067E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.259 | TFLOPs: 43.31 | +7: iteration 113180/ 115203 | consumed samples: 28974080 | consumed tokens: 59338915840 | elapsed time per iteration (s): 0.56 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.552821E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.924 | TFLOPs: 43.37 | +7: iteration 113190/ 115203 | consumed samples: 28976640 | consumed tokens: 59344158720 | elapsed time per iteration (s): 0.56 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.552841E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.091 | TFLOPs: 43.67 | +7: iteration 113200/ 115203 | consumed samples: 28979200 | consumed tokens: 59349401600 | elapsed time per iteration (s): 0.56 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.560662E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.870 | TFLOPs: 43.46 | +7: iteration 113210/ 115203 | consumed samples: 28981760 | consumed tokens: 59354644480 | elapsed time per iteration (s): 0.55 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 2.550831E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.874 | TFLOPs: 44.03 | +7: iteration 113220/ 115203 | consumed samples: 28984320 | consumed tokens: 59359887360 | elapsed time per iteration (s): 0.56 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.550818E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.181 | TFLOPs: 43.59 | +7: iteration 113230/ 115203 | consumed samples: 28986880 | consumed tokens: 59365130240 | elapsed time per iteration (s): 0.59 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.555684E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.162 | TFLOPs: 41.58 | +7: iteration 113240/ 115203 | consumed samples: 28989440 | consumed tokens: 59370373120 | elapsed time per iteration (s): 0.56 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.552755E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.217 | TFLOPs: 43.59 | +7: iteration 113250/ 115203 | consumed samples: 28992000 | consumed tokens: 59375616000 | elapsed time per iteration (s): 0.55 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.541994E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.679 | TFLOPs: 44.02 | +7: iteration 113260/ 115203 | consumed samples: 28994560 | consumed tokens: 59380858880 | elapsed time per iteration (s): 0.60 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.555887E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 426.758 | TFLOPs: 40.69 | +7: iteration 113270/ 115203 | consumed samples: 28997120 | consumed tokens: 59386101760 | elapsed time per iteration (s): 0.60 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.559175E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 430.065 | TFLOPs: 41.00 | +7: iteration 113280/ 115203 | consumed samples: 28999680 | consumed tokens: 59391344640 | elapsed time per iteration (s): 0.57 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.556448E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.193 | TFLOPs: 42.92 | +7: iteration 113290/ 115203 | consumed samples: 29002240 | consumed tokens: 59396587520 | elapsed time per iteration (s): 0.57 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 2.562855E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.296 | TFLOPs: 43.12 | +7: iteration 113300/ 115203 | consumed samples: 29004800 | consumed tokens: 59401830400 | elapsed time per iteration (s): 0.55 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.563040E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.717 | TFLOPs: 44.02 | +7: iteration 113310/ 115203 | consumed samples: 29007360 | consumed tokens: 59407073280 | elapsed time per iteration (s): 0.56 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.550700E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.733 | TFLOPs: 43.93 | +7: iteration 113320/ 115203 | consumed samples: 29009920 | consumed tokens: 59412316160 | elapsed time per iteration (s): 0.57 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.548755E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.821 | TFLOPs: 42.60 | +7: iteration 113330/ 115203 | consumed samples: 29012480 | consumed tokens: 59417559040 | elapsed time per iteration (s): 0.57 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.546715E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.591 | TFLOPs: 42.86 | +7: iteration 113340/ 115203 | consumed samples: 29015040 | consumed tokens: 59422801920 | elapsed time per iteration (s): 0.56 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.554634E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.113 | TFLOPs: 43.29 | +7: iteration 113350/ 115203 | consumed samples: 29017600 | consumed tokens: 59428044800 | elapsed time per iteration (s): 0.56 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.553927E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.784 | TFLOPs: 43.64 | +7: iteration 113360/ 115203 | consumed samples: 29020160 | consumed tokens: 59433287680 | elapsed time per iteration (s): 0.57 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 2.554089E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.586 | TFLOPs: 42.48 | +7: iteration 113370/ 115203 | consumed samples: 29022720 | consumed tokens: 59438530560 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.554031E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.217 | TFLOPs: 43.59 | +7: iteration 113380/ 115203 | consumed samples: 29025280 | consumed tokens: 59443773440 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.563299E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.679 | TFLOPs: 43.44 | +7: iteration 113390/ 115203 | consumed samples: 29027840 | consumed tokens: 59449016320 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.554057E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.373 | TFLOPs: 43.51 | +7: iteration 113400/ 115203 | consumed samples: 29030400 | consumed tokens: 59454259200 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.549369E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.218 | TFLOPs: 43.59 | +7: iteration 113410/ 115203 | consumed samples: 29032960 | consumed tokens: 59459502080 | elapsed time per iteration (s): 0.57 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.535526E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.392 | TFLOPs: 42.65 | +7: iteration 113420/ 115203 | consumed samples: 29035520 | consumed tokens: 59464744960 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.565651E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.281 | TFLOPs: 43.31 | +7: iteration 113430/ 115203 | consumed samples: 29038080 | consumed tokens: 59469987840 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.557111E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.289 | TFLOPs: 43.60 | +7: iteration 113440/ 115203 | consumed samples: 29040640 | consumed tokens: 59475230720 | elapsed time per iteration (s): 0.56 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.542618E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.818 | TFLOPs: 43.46 | +7: iteration 113450/ 115203 | consumed samples: 29043200 | consumed tokens: 59480473600 | elapsed time per iteration (s): 0.55 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 2.550496E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.762 | TFLOPs: 44.02 | +7: iteration 113460/ 115203 | consumed samples: 29045760 | consumed tokens: 59485716480 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.551470E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.145 | TFLOPs: 43.20 | +7: iteration 113470/ 115203 | consumed samples: 29048320 | consumed tokens: 59490959360 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.550268E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.646 | TFLOPs: 43.73 | +7: iteration 113480/ 115203 | consumed samples: 29050880 | consumed tokens: 59496202240 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.542451E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.057 | TFLOPs: 43.77 | +7: iteration 113490/ 115203 | consumed samples: 29053440 | consumed tokens: 59501445120 | elapsed time per iteration (s): 0.57 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.556058E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.461 | TFLOPs: 43.14 | +7: iteration 113500/ 115203 | consumed samples: 29056000 | consumed tokens: 59506688000 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.562838E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.162 | TFLOPs: 43.39 | +7: iteration 113510/ 115203 | consumed samples: 29058560 | consumed tokens: 59511930880 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.548396E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.136 | TFLOPs: 43.49 | +7: iteration 113520/ 115203 | consumed samples: 29061120 | consumed tokens: 59517173760 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.555816E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.165 | TFLOPs: 43.78 | +7: iteration 113530/ 115203 | consumed samples: 29063680 | consumed tokens: 59522416640 | elapsed time per iteration (s): 0.56 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 2.549490E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.236 | TFLOPs: 43.50 | +7: iteration 113540/ 115203 | consumed samples: 29066240 | consumed tokens: 59527659520 | elapsed time per iteration (s): 0.55 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.548071E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.725 | TFLOPs: 44.02 | +7: iteration 113550/ 115203 | consumed samples: 29068800 | consumed tokens: 59532902400 | elapsed time per iteration (s): 0.56 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.563922E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.752 | TFLOPs: 43.74 | +7: iteration 113560/ 115203 | consumed samples: 29071360 | consumed tokens: 59538145280 | elapsed time per iteration (s): 0.58 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.553926E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.690 | TFLOPs: 42.40 | +7: iteration 113570/ 115203 | consumed samples: 29073920 | consumed tokens: 59543388160 | elapsed time per iteration (s): 0.60 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.559318E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 428.664 | TFLOPs: 40.87 | +7: iteration 113580/ 115203 | consumed samples: 29076480 | consumed tokens: 59548631040 | elapsed time per iteration (s): 0.58 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.557110E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 439.834 | TFLOPs: 41.93 | +7: iteration 113590/ 115203 | consumed samples: 29079040 | consumed tokens: 59553873920 | elapsed time per iteration (s): 0.55 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.565465E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.430 | TFLOPs: 43.99 | +7: iteration 113600/ 115203 | consumed samples: 29081600 | consumed tokens: 59559116800 | elapsed time per iteration (s): 0.57 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.546512E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.378 | TFLOPs: 43.03 | +7: iteration 113610/ 115203 | consumed samples: 29084160 | consumed tokens: 59564359680 | elapsed time per iteration (s): 0.56 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.553000E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.758 | TFLOPs: 43.74 | +7: iteration 113620/ 115203 | consumed samples: 29086720 | consumed tokens: 59569602560 | elapsed time per iteration (s): 0.56 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 2.566445E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.678 | TFLOPs: 43.83 | +7: iteration 113630/ 115203 | consumed samples: 29089280 | consumed tokens: 59574845440 | elapsed time per iteration (s): 0.57 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.565376E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.804 | TFLOPs: 43.17 | +7: iteration 113640/ 115203 | consumed samples: 29091840 | consumed tokens: 59580088320 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.559017E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.333 | TFLOPs: 43.89 | +7: iteration 113650/ 115203 | consumed samples: 29094400 | consumed tokens: 59585331200 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.546969E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.379 | TFLOPs: 43.61 | +7: iteration 113660/ 115203 | consumed samples: 29096960 | consumed tokens: 59590574080 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.543531E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.226 | TFLOPs: 43.21 | +7: iteration 113670/ 115203 | consumed samples: 29099520 | consumed tokens: 59595816960 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.561854E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.549 | TFLOPs: 43.81 | +7: iteration 113680/ 115203 | consumed samples: 29102080 | consumed tokens: 59601059840 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.536422E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.673 | TFLOPs: 43.63 | +7: iteration 113690/ 115203 | consumed samples: 29104640 | consumed tokens: 59606302720 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.559902E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.327 | TFLOPs: 43.60 | +7: iteration 113700/ 115203 | consumed samples: 29107200 | consumed tokens: 59611545600 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.552325E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.207 | TFLOPs: 43.97 | +7: iteration 113710/ 115203 | consumed samples: 29109760 | consumed tokens: 59616788480 | elapsed time per iteration (s): 0.55 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.544325E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.892 | TFLOPs: 44.04 | +7: iteration 113720/ 115203 | consumed samples: 29112320 | consumed tokens: 59622031360 | elapsed time per iteration (s): 0.56 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 2.551775E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.596 | TFLOPs: 43.63 | +7: iteration 113730/ 115203 | consumed samples: 29114880 | consumed tokens: 59627274240 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.546001E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.432 | TFLOPs: 43.80 | +7: iteration 113740/ 115203 | consumed samples: 29117440 | consumed tokens: 59632517120 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.542679E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.911 | TFLOPs: 43.75 | +7: iteration 113750/ 115203 | consumed samples: 29120000 | consumed tokens: 59637760000 | elapsed time per iteration (s): 0.57 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.547762E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.761 | TFLOPs: 43.07 | +7: iteration 113760/ 115203 | consumed samples: 29122560 | consumed tokens: 59643002880 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.539494E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.137 | TFLOPs: 43.39 | +7: iteration 113770/ 115203 | consumed samples: 29125120 | consumed tokens: 59648245760 | elapsed time per iteration (s): 0.58 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.548712E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.278 | TFLOPs: 42.26 | +7: iteration 113780/ 115203 | consumed samples: 29127680 | consumed tokens: 59653488640 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.555844E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.460 | TFLOPs: 43.52 | +7: iteration 113790/ 115203 | consumed samples: 29130240 | consumed tokens: 59658731520 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.546179E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.363 | TFLOPs: 43.41 | +7: iteration 113800/ 115203 | consumed samples: 29132800 | consumed tokens: 59663974400 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.561581E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.129 | TFLOPs: 43.30 | +7: iteration 113810/ 115203 | consumed samples: 29135360 | consumed tokens: 59669217280 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.550678E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.611 | TFLOPs: 43.53 | +7: iteration 113820/ 115203 | consumed samples: 29137920 | consumed tokens: 59674460160 | elapsed time per iteration (s): 0.56 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 2.548397E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.156 | TFLOPs: 43.97 | +7: iteration 113830/ 115203 | consumed samples: 29140480 | consumed tokens: 59679703040 | elapsed time per iteration (s): 0.55 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.556275E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.358 | TFLOPs: 43.99 | +7: iteration 113840/ 115203 | consumed samples: 29143040 | consumed tokens: 59684945920 | elapsed time per iteration (s): 0.57 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.556621E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.049 | TFLOPs: 43.10 | +7: iteration 113850/ 115203 | consumed samples: 29145600 | consumed tokens: 59690188800 | elapsed time per iteration (s): 0.55 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.545809E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.782 | TFLOPs: 44.03 | +7: iteration 113860/ 115203 | consumed samples: 29148160 | consumed tokens: 59695431680 | elapsed time per iteration (s): 0.57 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.549235E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.828 | TFLOPs: 42.98 | +7: iteration 113870/ 115203 | consumed samples: 29150720 | consumed tokens: 59700674560 | elapsed time per iteration (s): 0.56 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.557600E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.182 | TFLOPs: 43.78 | +7: iteration 113880/ 115203 | consumed samples: 29153280 | consumed tokens: 59705917440 | elapsed time per iteration (s): 0.57 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.557288E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.910 | TFLOPs: 42.99 | +7: iteration 113890/ 115203 | consumed samples: 29155840 | consumed tokens: 59711160320 | elapsed time per iteration (s): 0.56 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.554084E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.333 | TFLOPs: 43.70 | +7: iteration 113900/ 115203 | consumed samples: 29158400 | consumed tokens: 59716403200 | elapsed time per iteration (s): 0.55 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.547478E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.620 | TFLOPs: 44.01 | +7: iteration 113910/ 115203 | consumed samples: 29160960 | consumed tokens: 59721646080 | elapsed time per iteration (s): 0.57 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.557049E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.919 | TFLOPs: 42.61 | +7: iteration 113920/ 115203 | consumed samples: 29163520 | consumed tokens: 59726888960 | elapsed time per iteration (s): 0.56 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.547582E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.135 | TFLOPs: 43.20 | +7: iteration 113930/ 115203 | consumed samples: 29166080 | consumed tokens: 59732131840 | elapsed time per iteration (s): 0.57 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 2.556314E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.704 | TFLOPs: 42.68 | +7: iteration 113940/ 115203 | consumed samples: 29168640 | consumed tokens: 59737374720 | elapsed time per iteration (s): 0.55 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.540008E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.667 | TFLOPs: 44.01 | +7: iteration 113950/ 115203 | consumed samples: 29171200 | consumed tokens: 59742617600 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.542637E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.621 | TFLOPs: 43.72 | +7: iteration 113960/ 115203 | consumed samples: 29173760 | consumed tokens: 59747860480 | elapsed time per iteration (s): 0.57 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.553271E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.387 | TFLOPs: 42.75 | +7: iteration 113970/ 115203 | consumed samples: 29176320 | consumed tokens: 59753103360 | elapsed time per iteration (s): 0.55 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.551841E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.665 | TFLOPs: 44.01 | +7: iteration 113980/ 115203 | consumed samples: 29178880 | consumed tokens: 59758346240 | elapsed time per iteration (s): 0.57 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.549180E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.786 | TFLOPs: 43.17 | +7: iteration 113990/ 115203 | consumed samples: 29181440 | consumed tokens: 59763589120 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.553923E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.962 | TFLOPs: 43.38 | +0: [2023-03-17 06:44:15,811] [INFO] [logging.py:68:log_dist] [Rank 0] step=114000, skipped=0, lr=[2.004947884324412e-05, 2.004947884324412e-05, 2.004947884324412e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 114000/ 115203 | consumed samples: 29184000 | consumed tokens: 59768832000 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.548825E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.552 | TFLOPs: 43.72 | +0: steps: 114000 loss: 2.5762 iter time (s): 0.563 samples/sec: 454.737 +7: iteration 114010/ 115203 | consumed samples: 29186560 | consumed tokens: 59774074880 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.554097E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.065 | TFLOPs: 43.96 | +7: iteration 114020/ 115203 | consumed samples: 29189120 | consumed tokens: 59779317760 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.544043E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.014 | TFLOPs: 43.76 | +7: iteration 114030/ 115203 | consumed samples: 29191680 | consumed tokens: 59784560640 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.541580E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.759 | TFLOPs: 43.74 | +7: iteration 114040/ 115203 | consumed samples: 29194240 | consumed tokens: 59789803520 | elapsed time per iteration (s): 0.56 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.556060E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.556 | TFLOPs: 43.24 | +7: iteration 114050/ 115203 | consumed samples: 29196800 | consumed tokens: 59795046400 | elapsed time per iteration (s): 0.57 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 2.553446E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.463 | TFLOPs: 42.85 | +7: iteration 114060/ 115203 | consumed samples: 29199360 | consumed tokens: 59800289280 | elapsed time per iteration (s): 0.57 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.555733E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.735 | TFLOPs: 42.97 | +7: iteration 114070/ 115203 | consumed samples: 29201920 | consumed tokens: 59805532160 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.548047E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.716 | TFLOPs: 43.26 | +7: iteration 114080/ 115203 | consumed samples: 29204480 | consumed tokens: 59810775040 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.560128E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.401 | TFLOPs: 43.23 | +7: iteration 114090/ 115203 | consumed samples: 29207040 | consumed tokens: 59816017920 | elapsed time per iteration (s): 0.55 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.546214E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.750 | TFLOPs: 44.02 | +7: iteration 114100/ 115203 | consumed samples: 29209600 | consumed tokens: 59821260800 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.562235E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.630 | TFLOPs: 43.92 | +7: iteration 114110/ 115203 | consumed samples: 29212160 | consumed tokens: 59826503680 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.548737E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.066 | TFLOPs: 43.29 | +7: iteration 114120/ 115203 | consumed samples: 29214720 | consumed tokens: 59831746560 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.537178E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.012 | TFLOPs: 43.67 | +7: iteration 114130/ 115203 | consumed samples: 29217280 | consumed tokens: 59836989440 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.549287E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.808 | TFLOPs: 43.46 | +7: iteration 114140/ 115203 | consumed samples: 29219840 | consumed tokens: 59842232320 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.556662E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.705 | TFLOPs: 43.45 | +7: iteration 114150/ 115203 | consumed samples: 29222400 | consumed tokens: 59847475200 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.557148E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.215 | TFLOPs: 43.21 | +7: iteration 114160/ 115203 | consumed samples: 29224960 | consumed tokens: 59852718080 | elapsed time per iteration (s): 0.55 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.555180E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.961 | TFLOPs: 44.04 | +7: iteration 114170/ 115203 | consumed samples: 29227520 | consumed tokens: 59857960960 | elapsed time per iteration (s): 0.56 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.533778E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.430 | TFLOPs: 43.52 | +7: iteration 114180/ 115203 | consumed samples: 29230080 | consumed tokens: 59863203840 | elapsed time per iteration (s): 0.55 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.554636E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.782 | TFLOPs: 44.03 | +7: iteration 114190/ 115203 | consumed samples: 29232640 | consumed tokens: 59868446720 | elapsed time per iteration (s): 0.55 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 2.553028E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.883 | TFLOPs: 44.04 | +7: iteration 114200/ 115203 | consumed samples: 29235200 | consumed tokens: 59873689600 | elapsed time per iteration (s): 0.55 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.554021E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.662 | TFLOPs: 44.01 | +7: iteration 114210/ 115203 | consumed samples: 29237760 | consumed tokens: 59878932480 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.541932E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.355 | TFLOPs: 43.70 | +7: iteration 114220/ 115203 | consumed samples: 29240320 | consumed tokens: 59884175360 | elapsed time per iteration (s): 0.55 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.554836E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.850 | TFLOPs: 44.03 | +7: iteration 114230/ 115203 | consumed samples: 29242880 | consumed tokens: 59889418240 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.546934E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.651 | TFLOPs: 43.63 | +7: iteration 114240/ 115203 | consumed samples: 29245440 | consumed tokens: 59894661120 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.553278E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.139 | TFLOPs: 43.49 | +7: iteration 114250/ 115203 | consumed samples: 29248000 | consumed tokens: 59899904000 | elapsed time per iteration (s): 0.55 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.555198E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.803 | TFLOPs: 44.03 | +7: iteration 114260/ 115203 | consumed samples: 29250560 | consumed tokens: 59905146880 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.549539E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.563 | TFLOPs: 43.53 | +7: iteration 114270/ 115203 | consumed samples: 29253120 | consumed tokens: 59910389760 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.554697E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.403 | TFLOPs: 43.23 | +7: iteration 114280/ 115203 | consumed samples: 29255680 | consumed tokens: 59915632640 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.550643E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.518 | TFLOPs: 43.33 | +7: iteration 114290/ 115203 | consumed samples: 29258240 | consumed tokens: 59920875520 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.563881E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.177 | TFLOPs: 43.59 | +7: iteration 114300/ 115203 | consumed samples: 29260800 | consumed tokens: 59926118400 | elapsed time per iteration (s): 0.55 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.565939E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.918 | TFLOPs: 44.04 | +7: iteration 114310/ 115203 | consumed samples: 29263360 | consumed tokens: 59931361280 | elapsed time per iteration (s): 0.55 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.538176E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.669 | TFLOPs: 44.02 | +7: iteration 114320/ 115203 | consumed samples: 29265920 | consumed tokens: 59936604160 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.563457E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.809 | TFLOPs: 43.46 | +7: iteration 114330/ 115203 | consumed samples: 29268480 | consumed tokens: 59941847040 | elapsed time per iteration (s): 0.57 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.540594E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.563 | TFLOPs: 42.96 | +7: iteration 114340/ 115203 | consumed samples: 29271040 | consumed tokens: 59947089920 | elapsed time per iteration (s): 0.56 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 2.555625E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.515 | TFLOPs: 43.52 | +7: iteration 114350/ 115203 | consumed samples: 29273600 | consumed tokens: 59952332800 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.550255E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.320 | TFLOPs: 43.51 | +7: iteration 114360/ 115203 | consumed samples: 29276160 | consumed tokens: 59957575680 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.552985E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.923 | TFLOPs: 43.94 | +7: iteration 114370/ 115203 | consumed samples: 29278720 | consumed tokens: 59962818560 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.548653E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.729 | TFLOPs: 43.83 | +7: iteration 114380/ 115203 | consumed samples: 29281280 | consumed tokens: 59968061440 | elapsed time per iteration (s): 0.55 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.545466E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.644 | TFLOPs: 44.01 | +7: iteration 114390/ 115203 | consumed samples: 29283840 | consumed tokens: 59973304320 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.553154E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.767 | TFLOPs: 43.17 | +7: iteration 114400/ 115203 | consumed samples: 29286400 | consumed tokens: 59978547200 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.553951E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.447 | TFLOPs: 43.90 | +7: iteration 114410/ 115203 | consumed samples: 29288960 | consumed tokens: 59983790080 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.550246E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 459.314 | TFLOPs: 43.79 | +7: iteration 114420/ 115203 | consumed samples: 29291520 | consumed tokens: 59989032960 | elapsed time per iteration (s): 0.55 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.561433E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.702 | TFLOPs: 44.02 | +7: iteration 114430/ 115203 | consumed samples: 29294080 | consumed tokens: 59994275840 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.559231E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.254 | TFLOPs: 43.59 | +7: iteration 114440/ 115203 | consumed samples: 29296640 | consumed tokens: 59999518720 | elapsed time per iteration (s): 0.55 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.552824E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.720 | TFLOPs: 44.02 | +7: iteration 114450/ 115203 | consumed samples: 29299200 | consumed tokens: 60004761600 | elapsed time per iteration (s): 0.55 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.563155E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.882 | TFLOPs: 44.04 | +7: iteration 114460/ 115203 | consumed samples: 29301760 | consumed tokens: 60010004480 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.557010E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 460.293 | TFLOPs: 43.88 | +7: iteration 114470/ 115203 | consumed samples: 29304320 | consumed tokens: 60015247360 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.562983E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.477 | TFLOPs: 43.23 | +7: iteration 114480/ 115203 | consumed samples: 29306880 | consumed tokens: 60020490240 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.567971E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.448 | TFLOPs: 42.85 | +7: iteration 114490/ 115203 | consumed samples: 29309440 | consumed tokens: 60025733120 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.562723E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.233 | TFLOPs: 42.64 | +7: iteration 114500/ 115203 | consumed samples: 29312000 | consumed tokens: 60030976000 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.554700E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.145 | TFLOPs: 43.39 | +7: iteration 114510/ 115203 | consumed samples: 29314560 | consumed tokens: 60036218880 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.558387E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.441 | TFLOPs: 42.75 | +7: iteration 114520/ 115203 | consumed samples: 29317120 | consumed tokens: 60041461760 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.559235E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.897 | TFLOPs: 42.80 | +7: iteration 114530/ 115203 | consumed samples: 29319680 | consumed tokens: 60046704640 | elapsed time per iteration (s): 0.57 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.550138E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.173 | TFLOPs: 42.73 | +7: iteration 114540/ 115203 | consumed samples: 29322240 | consumed tokens: 60051947520 | elapsed time per iteration (s): 0.56 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 2.546805E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.432 | TFLOPs: 43.23 | +7: iteration 114550/ 115203 | consumed samples: 29324800 | consumed tokens: 60057190400 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.554991E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.021 | TFLOPs: 42.24 | +7: iteration 114560/ 115203 | consumed samples: 29327360 | consumed tokens: 60062433280 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.554237E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.989 | TFLOPs: 42.52 | +7: iteration 114570/ 115203 | consumed samples: 29329920 | consumed tokens: 60067676160 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.557374E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.644 | TFLOPs: 43.25 | +7: iteration 114580/ 115203 | consumed samples: 29332480 | consumed tokens: 60072919040 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.550649E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.898 | TFLOPs: 42.23 | +7: iteration 114590/ 115203 | consumed samples: 29335040 | consumed tokens: 60078161920 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.558951E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.460 | TFLOPs: 42.66 | +7: iteration 114600/ 115203 | consumed samples: 29337600 | consumed tokens: 60083404800 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.561993E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 440.934 | TFLOPs: 42.04 | +7: iteration 114610/ 115203 | consumed samples: 29340160 | consumed tokens: 60088647680 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.560844E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.728 | TFLOPs: 42.78 | +7: iteration 114620/ 115203 | consumed samples: 29342720 | consumed tokens: 60093890560 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.559271E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.976 | TFLOPs: 42.42 | +7: iteration 114630/ 115203 | consumed samples: 29345280 | consumed tokens: 60099133440 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.570038E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.854 | TFLOPs: 43.17 | +7: iteration 114640/ 115203 | consumed samples: 29347840 | consumed tokens: 60104376320 | elapsed time per iteration (s): 0.55 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.561313E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 461.718 | TFLOPs: 44.02 | +7: iteration 114650/ 115203 | consumed samples: 29350400 | consumed tokens: 60109619200 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.542023E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.511 | TFLOPs: 43.14 | +7: iteration 114660/ 115203 | consumed samples: 29352960 | consumed tokens: 60114862080 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.563047E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 456.385 | TFLOPs: 43.51 | +7: iteration 114670/ 115203 | consumed samples: 29355520 | consumed tokens: 60120104960 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.544178E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.171 | TFLOPs: 43.40 | +7: iteration 114680/ 115203 | consumed samples: 29358080 | consumed tokens: 60125347840 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.542261E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.753 | TFLOPs: 42.40 | +7: iteration 114690/ 115203 | consumed samples: 29360640 | consumed tokens: 60130590720 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.548452E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.672 | TFLOPs: 42.11 | +7: iteration 114700/ 115203 | consumed samples: 29363200 | consumed tokens: 60135833600 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.553495E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.707 | TFLOPs: 42.87 | +7: iteration 114710/ 115203 | consumed samples: 29365760 | consumed tokens: 60141076480 | elapsed time per iteration (s): 0.56 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.552400E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.585 | TFLOPs: 43.63 | +7: iteration 114720/ 115203 | consumed samples: 29368320 | consumed tokens: 60146319360 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.564005E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.015 | TFLOPs: 42.81 | +7: iteration 114730/ 115203 | consumed samples: 29370880 | consumed tokens: 60151562240 | elapsed time per iteration (s): 0.59 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.558964E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.409 | TFLOPs: 41.61 | +7: iteration 114740/ 115203 | consumed samples: 29373440 | consumed tokens: 60156805120 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.546188E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.065 | TFLOPs: 42.34 | +7: iteration 114750/ 115203 | consumed samples: 29376000 | consumed tokens: 60162048000 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.569197E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.936 | TFLOPs: 42.52 | +7: iteration 114760/ 115203 | consumed samples: 29378560 | consumed tokens: 60167290880 | elapsed time per iteration (s): 0.55 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.553733E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 462.002 | TFLOPs: 44.05 | +7: iteration 114770/ 115203 | consumed samples: 29381120 | consumed tokens: 60172533760 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.551123E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.536 | TFLOPs: 42.10 | +7: iteration 114780/ 115203 | consumed samples: 29383680 | consumed tokens: 60177776640 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.559092E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.116 | TFLOPs: 42.06 | +7: iteration 114790/ 115203 | consumed samples: 29386240 | consumed tokens: 60183019520 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.546079E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.709 | TFLOPs: 42.68 | +7: iteration 114800/ 115203 | consumed samples: 29388800 | consumed tokens: 60188262400 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.554541E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.503 | TFLOPs: 42.47 | +7: iteration 114810/ 115203 | consumed samples: 29391360 | consumed tokens: 60193505280 | elapsed time per iteration (s): 0.58 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.556104E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.005 | TFLOPs: 42.05 | +7: iteration 114820/ 115203 | consumed samples: 29393920 | consumed tokens: 60198748160 | elapsed time per iteration (s): 0.57 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 2.562668E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.137 | TFLOPs: 42.63 | +7: iteration 114830/ 115203 | consumed samples: 29396480 | consumed tokens: 60203991040 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.540417E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 454.725 | TFLOPs: 43.35 | +7: iteration 114840/ 115203 | consumed samples: 29399040 | consumed tokens: 60209233920 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.555317E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 438.793 | TFLOPs: 41.83 | +7: iteration 114850/ 115203 | consumed samples: 29401600 | consumed tokens: 60214476800 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.548873E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 441.186 | TFLOPs: 42.06 | +7: iteration 114860/ 115203 | consumed samples: 29404160 | consumed tokens: 60219719680 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.558504E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 445.487 | TFLOPs: 42.47 | +7: iteration 114870/ 115203 | consumed samples: 29406720 | consumed tokens: 60224962560 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.551696E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 442.890 | TFLOPs: 42.22 | +7: iteration 114880/ 115203 | consumed samples: 29409280 | consumed tokens: 60230205440 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.548379E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.503 | TFLOPs: 43.43 | +7: iteration 114890/ 115203 | consumed samples: 29411840 | consumed tokens: 60235448320 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.570652E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 436.727 | TFLOPs: 41.64 | +7: iteration 114900/ 115203 | consumed samples: 29414400 | consumed tokens: 60240691200 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.553029E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.954 | TFLOPs: 41.75 | +7: iteration 114910/ 115203 | consumed samples: 29416960 | consumed tokens: 60245934080 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.561393E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.214 | TFLOPs: 42.64 | +7: iteration 114920/ 115203 | consumed samples: 29419520 | consumed tokens: 60251176960 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.560885E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.770 | TFLOPs: 43.07 | +7: iteration 114930/ 115203 | consumed samples: 29422080 | consumed tokens: 60256419840 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.542783E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.863 | TFLOPs: 42.32 | +7: iteration 114940/ 115203 | consumed samples: 29424640 | consumed tokens: 60261662720 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.540781E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.347 | TFLOPs: 42.84 | +7: iteration 114950/ 115203 | consumed samples: 29427200 | consumed tokens: 60266905600 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.546106E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.136 | TFLOPs: 42.63 | +7: iteration 114960/ 115203 | consumed samples: 29429760 | consumed tokens: 60272148480 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.566972E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.986 | TFLOPs: 42.42 | +7: iteration 114970/ 115203 | consumed samples: 29432320 | consumed tokens: 60277391360 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.559885E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.877 | TFLOPs: 43.27 | +7: iteration 114980/ 115203 | consumed samples: 29434880 | consumed tokens: 60282634240 | elapsed time per iteration (s): 0.59 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.552869E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 437.370 | TFLOPs: 41.70 | +7: iteration 114990/ 115203 | consumed samples: 29437440 | consumed tokens: 60287877120 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.554711E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.107 | TFLOPs: 42.72 | +7: iteration 115000/ 115203 | consumed samples: 29440000 | consumed tokens: 60293120000 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.565019E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.673 | TFLOPs: 42.39 | +7: iteration 115010/ 115203 | consumed samples: 29442560 | consumed tokens: 60298362880 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.551696E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 453.157 | TFLOPs: 43.20 | +7: iteration 115020/ 115203 | consumed samples: 29445120 | consumed tokens: 60303605760 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.548791E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.299 | TFLOPs: 42.84 | +7: iteration 115030/ 115203 | consumed samples: 29447680 | consumed tokens: 60308848640 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.549122E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 450.198 | TFLOPs: 42.92 | +7: iteration 115040/ 115203 | consumed samples: 29450240 | consumed tokens: 60314091520 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.544981E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 449.130 | TFLOPs: 42.82 | +7: iteration 115050/ 115203 | consumed samples: 29452800 | consumed tokens: 60319334400 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.556823E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 455.990 | TFLOPs: 43.47 | +7: iteration 115060/ 115203 | consumed samples: 29455360 | consumed tokens: 60324577280 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.561958E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 443.023 | TFLOPs: 42.24 | +7: iteration 115070/ 115203 | consumed samples: 29457920 | consumed tokens: 60329820160 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.557435E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.402 | TFLOPs: 42.37 | +7: iteration 115080/ 115203 | consumed samples: 29460480 | consumed tokens: 60335063040 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.541061E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.740 | TFLOPs: 42.78 | +7: iteration 115090/ 115203 | consumed samples: 29463040 | consumed tokens: 60340305920 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.554907E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.990 | TFLOPs: 43.66 | +7: iteration 115100/ 115203 | consumed samples: 29465600 | consumed tokens: 60345548800 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.544849E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.438 | TFLOPs: 42.75 | +7: iteration 115110/ 115203 | consumed samples: 29468160 | consumed tokens: 60350791680 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.554793E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 452.463 | TFLOPs: 43.14 | +7: iteration 115120/ 115203 | consumed samples: 29470720 | consumed tokens: 60356034560 | elapsed time per iteration (s): 0.58 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.530732E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 444.675 | TFLOPs: 42.39 | +7: iteration 115130/ 115203 | consumed samples: 29473280 | consumed tokens: 60361277440 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.562160E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 457.786 | TFLOPs: 43.64 | +7: iteration 115140/ 115203 | consumed samples: 29475840 | consumed tokens: 60366520320 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.551776E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.846 | TFLOPs: 42.79 | +7: iteration 115150/ 115203 | consumed samples: 29478400 | consumed tokens: 60371763200 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.539957E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 448.140 | TFLOPs: 42.73 | +7: iteration 115160/ 115203 | consumed samples: 29480960 | consumed tokens: 60377006080 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.545163E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 447.045 | TFLOPs: 42.62 | +7: iteration 115170/ 115203 | consumed samples: 29483520 | consumed tokens: 60382248960 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.545745E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.323 | TFLOPs: 43.03 | +7: iteration 115180/ 115203 | consumed samples: 29486080 | consumed tokens: 60387491840 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.562654E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 451.538 | TFLOPs: 43.05 | +7: iteration 115190/ 115203 | consumed samples: 29488640 | consumed tokens: 60392734720 | elapsed time per iteration (s): 0.56 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.545991E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 458.302 | TFLOPs: 43.69 | +7: iteration 115200/ 115203 | consumed samples: 29491200 | consumed tokens: 60397977600 | elapsed time per iteration (s): 0.57 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 2.552037E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 446.600 | TFLOPs: 42.58 | +0: [after training is done] datetime: 2023-03-17 06:55:37 +0: saving checkpoint at iteration 115203 to checkpoints_421m60b400m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.359319E+00 | lm loss PPL: 2.876961E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-17 06:55:37,614] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step115203 is begin to save! +0: [2023-03-17 06:55:37,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:55:37,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:55:37,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:55:37,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:55:37,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:55:37,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:55:37,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:55:37,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:55:37,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:55:37,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:55:37,937] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:55:37,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:55:37,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:55:38,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:55:38,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:55:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:55:38,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:55:38,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:55:38,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 06:55:38,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 06:55:38,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:55:38,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:55:38,189] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 06:55:38,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 06:55:38,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 06:55:38,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 06:55:38,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 06:55:38,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 06:55:38,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 06:55:38,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 06:55:38,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 06:55:38,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 06:55:38,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 06:55:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 06:55:38,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 06:55:38,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 06:55:38,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 06:55:38,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 06:55:38,521] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 06:55:38,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 06:55:38,525] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt +0: [2023-03-17 06:55:38,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 06:55:38,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:55:38,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:55:38,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:55:38,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +2: [2023-03-17 06:55:38,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:55:38,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:55:38,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +7: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +5: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:55:38,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +4: [2023-03-17 06:55:38,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +3: [2023-03-17 06:55:38,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:55:38,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:55:38,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: [2023-03-17 06:55:38,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:55:38,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:55:38,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +6: [2023-03-17 06:55:38,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115203 is ready now! +0: successfully saved checkpoint at iteration 115203 to checkpoints_421m60b400m +END 3319845: Fri 17 Mar 2023 06:56:02 AM EET diff --git a/421m60b400m/3328560.err b/421m60b400m/3328560.err new file mode 100644 index 0000000000000000000000000000000000000000..38f6f5a9104f0b15e62114ae738aa3b3a1069de0 --- /dev/null +++ b/421m60b400m/3328560.err @@ -0,0 +1,1121 @@ +3: 2023-03-17 09:39:59.226379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226388: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226383: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-17 09:39:59.226059: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226069: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226073: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226399: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226063: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 09:39:59.226076: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-17 09:39:59.226387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226417: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 09:39:59.226425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227175: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227179: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227184: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227189: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227190: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227196: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 09:39:59.227198: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227472: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227483: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227470: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227482: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227493: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227493: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227499: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 09:39:59.227505: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228076: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228101: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228097: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228110: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228098: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228135: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:39:59.228134: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228300: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228304: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228316: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228384: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228381: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228403: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: 2023-03-17 09:39:59.228329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 09:39:59.228336: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:39:59.228414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228505: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228508: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: 2023-03-17 09:39:59.228421: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228520: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228505: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228511: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:39:59.228510: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 09:40:01.844545: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844554: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844768: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844773: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844610: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844596: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844793: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844797: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844598: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844802: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844808: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:01.844819: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 09:40:01.844834: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912221: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912219: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912240: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912262: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912237: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912254: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912425: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912429: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912432: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:01.912434: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912438: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912443: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:40:01.912452: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.912512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912521: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912524: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912545: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.912536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:01.913105: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913133: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913137: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913157: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913162: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913169: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913171: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 09:40:01.913176: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919450: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919453: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919489: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919494: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919502: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:01.919870: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919874: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919879: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919904: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919910: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919914: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919919: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:01.919928: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921515: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921521: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921525: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921525: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:01.921899: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921903: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921912: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921911: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921917: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 09:40:01.921919: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922490: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922495: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:01.922842: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922869: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922876: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922895: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922898: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922919: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 09:40:01.922928: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.922744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922789: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922775: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.922798: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:01.923130: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923158: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923156: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923162: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923164: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923161: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923166: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:40:01.923174: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930022: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930022: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:01.930429: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930432: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930437: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930438: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930441: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930445: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930446: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 09:40:01.930449: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 09:40:11.123550: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.123683: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 09:40:11.123556: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.123565: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.123565: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.123561: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-17 09:40:11.123750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.123732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.123561: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.123781: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 09:40:11.123563: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.123842: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123767: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 09:40:11.123569: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123785: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123839: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 09:40:11.123695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123848: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123793: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123850: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123772: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123795: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.123774: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 09:40:11.123802: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123852: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 09:40:11.123753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.123804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:40:11.123858: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.123860: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.124353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124907: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124906: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124908: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.124912: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.125454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.125458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 09:40:11.125608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.125458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 09:40:11.125463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 09:40:11.125466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 09:40:11.125470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 09:40:11.125480: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125485: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125487: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125491: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125519: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125615: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 09:40:11.125528: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125622: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125622: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125618: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 09:40:11.125543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 09:40:11.125549: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125625: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 09:40:11.125633: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125632: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125639: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125642: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 09:40:11.125643: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.125992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.125992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.125932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-17 09:40:11.125957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.125996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.125995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.125934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.125999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.126008: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:40:11.126009: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125935: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.126008: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.126004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 09:40:11.125939: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.126008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 09:40:11.125941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125969: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.126017: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:40:11.126018: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.125971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125977: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125977: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:40:11.126019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 09:40:11.125942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-17 09:40:11.125978: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125980: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:40:11.126026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:40:11.126026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.125947: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 09:40:11.125996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 09:40:11.126034: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125947: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125953: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125956: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 09:40:11.126009: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125958: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125959: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.125975: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-17 09:40:11.126298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 09:40:11.125994: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 09:40:11.125995: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126313: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126319: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126320: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126336: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126348: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 09:40:11.126345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 09:40:11.126360: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126575: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126575: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126581: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126589: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126583: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126589: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126599: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126601: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126604: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126605: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126608: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126640: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 09:40:11.126656: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 09:40:11.126658: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125674: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125676: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125678: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125685: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125687: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125689: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125690: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125691: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125692: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125704: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:40:11.125721: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:40:11.125741: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +1: Building extension module utils... +1: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +3: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Loading extension module utils... +5: Loading extension module utils... +0: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: +0: Loading extension module utils... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +5: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...Loading extension module utils... +1: +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils...Loading extension module utils... +3: +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +7: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/421m60b400m/3328560.out b/421m60b400m/3328560.out new file mode 100644 index 0000000000000000000000000000000000000000..b5490995386a296294f6f1387173b15d0f5750a4 --- /dev/null +++ b/421m60b400m/3328560.out @@ -0,0 +1,6435 @@ +Model parameters: d_model 1280 ffw_size 5120 kv_size 128 n_heads 10 n_layers 18 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 18 --hidden-size 1280 --num-attention-heads 10 --kv-channels 128 --ffn-hidden-size 5120 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-421m60b400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_421m60b400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_421m60b400m --load checkpoints_421m60b400m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3328560.json --zero-stage 0 +START 3328560: Fri 17 Mar 2023 09:39:24 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 40.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 47.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 39.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 37.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 39.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 47.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 46.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 39.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 44.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 35.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +5: Launching on nid006130 (5/8), master nid006125 port 9999, GPUs 8, CUDA: True +0: Launching on nid006125 (0/8), master nid006125 port 9999, GPUs 8, CUDA: True +7: Launching on nid006132 (7/8), master nid006125 port 9999, GPUs 8, CUDA: True +1: Launching on nid006126 (1/8), master nid006125 port 9999, GPUs 8, CUDA: True +2: Launching on nid006127 (2/8), master nid006125 port 9999, GPUs 8, CUDA: True +6: Launching on nid006131 (6/8), master nid006125 port 9999, GPUs 8, CUDA: True +4: Launching on nid006129 (4/8), master nid006125 port 9999, GPUs 8, CUDA: True +3: Launching on nid006128 (3/8), master nid006125 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3328560.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 5120 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1280 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-421m60b400mval +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_421m60b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 18 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_421m60b400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_421m60b400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +7: > setting tensorboard ... +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 09:40:44,010] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.102 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.cuda.o scaled_upper_triang_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 26.358 seconds +0: time to initialize megatron (seconds): 25.154 +0: [after megatron is initialized] datetime: 2023-03-17 09:41:13 +0: building GPT model ... +0: [2023-03-17 09:41:13,291] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 09:41:13,292] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 09:41:13,292] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.61 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-17 09:41:15,302] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=25 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: undo +0: 22: MixedFusedLayerNorm +0: 23: EmbeddingPipe +0: 24: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 09:41:15,607] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 09:41:15,608] [INFO] [utils.py:828:see_memory_usage] MA 0.79 GB Max_MA 0.79 GB CA 0.86 GB Max_CA 1 GB +0: [2023-03-17 09:41:15,608] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.63 GB, percent = 6.1% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 09:41:15,610] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 09:41:26,509] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 09:41:26,510] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 09:41:26,510] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 09:41:26,516] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 09:41:26,516] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 09:41:26,635] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 09:41:26,636] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.79 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-17 09:41:26,636] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.31 GB, percent = 6.2% +1: ninja: no work to do. +1: Time to load utils op: 0.16515302658081055 seconds +5: Time to load utils op: 0.20964360237121582 seconds +6: Time to load utils op: 0.20918917655944824 seconds +0: Time to load utils op: 0.11022830009460449 seconds +7: Time to load utils op: 0.20864605903625488 seconds +0: Time to load utils op: 0.1027071475982666 seconds +0: Time to load utils op: 0.10273957252502441 seconds +0: Time to load utils op: 0.10273313522338867 seconds +0: Time to load utils op: 0.10268163681030273 seconds +0: Time to load utils op: 0.1032712459564209 seconds +1: Time to load utils op: 0.10222721099853516 seconds +0: Time to load utils op: 0.10327696800231934 seconds +1: Time to load utils op: 0.10165548324584961 secondsTime to load utils op: 0.10177874565124512 seconds +1: +1: Time to load utils op: 0.10265207290649414 seconds +0: Time to load utils op: 0.1033470630645752 seconds +1: Time to load utils op: 0.10260343551635742 seconds +1: Time to load utils op: 0.10203790664672852 seconds +1: Time to load utils op: 0.10278439521789551 seconds +5: Time to load utils op: 0.10220766067504883 seconds +5: Time to load utils op: 0.1024935245513916 secondsTime to load utils op: 0.10248613357543945 seconds +5: +5: Time to load utils op: 0.10254406929016113 seconds +6: Time to load utils op: 0.10227560997009277 seconds +5: Time to load utils op: 0.10257172584533691 seconds +5: Time to load utils op: 0.10321760177612305 secondsTime to load utils op: 0.10263609886169434 seconds +5: +6: Time to load utils op: 0.10242748260498047 seconds +6: Time to load utils op: 0.10196805000305176 seconds +6: Time to load utils op: 0.10219168663024902 seconds +6: Time to load utils op: 0.1020667552947998 seconds +6: Time to load utils op: 0.10215425491333008 seconds +6: Time to load utils op: 0.10246562957763672 seconds +7: Time to load utils op: 0.10252928733825684 seconds +7: Time to load utils op: 0.10260796546936035 seconds +7: Time to load utils op: 0.10271143913269043 seconds +7: Time to load utils op: 0.10244274139404297 seconds +7: Time to load utils op: 0.10273623466491699 seconds +7: Time to load utils op: 0.10262656211853027 seconds +7: Time to load utils op: 0.10274481773376465 seconds +4: Time to load utils op: 0.11295104026794434 secondsTime to load utils op: 0.11295223236083984 seconds +4: +4: Time to load utils op: 0.11259078979492188 seconds +4: Time to load utils op: 0.11290264129638672 seconds +4: Time to load utils op: 0.11251020431518555 secondsTime to load utils op: 0.11221075057983398 seconds +4: +4: Time to load utils op: 0.11209869384765625 seconds +4: Time to load utils op: 0.11223316192626953 seconds +2: Time to load utils op: 0.1108236312866211 seconds +2: Time to load utils op: 0.11083102226257324 seconds +2: Time to load utils op: 0.11085915565490723 seconds +2: Time to load utils op: 0.11087560653686523 seconds +2: Time to load utils op: 0.11089277267456055 secondsTime to load utils op: 0.11089181900024414 seconds +2: +2: Time to load utils op: 0.11089897155761719 seconds +2: Time to load utils op: 0.1109011173248291 seconds +3: Time to load utils op: 0.11006617546081543 seconds +3: Time to load utils op: 0.11007356643676758 secondsTime to load utils op: 0.11007547378540039 seconds +3: +3: Time to load utils op: 0.11008620262145996 seconds +3: Time to load utils op: 0.11008048057556152 secondsTime to load utils op: 0.11008381843566895 seconds +3: +3: Time to load utils op: 0.11008810997009277 seconds +3: Time to load utils op: 0.11009049415588379 seconds +0: [2023-03-17 09:41:26,857] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 09:41:26,858] [INFO] [utils.py:828:see_memory_usage] MA 0.78 GB Max_MA 0.78 GB CA 0.88 GB Max_CA 1 GB +0: [2023-03-17 09:41:26,858] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.31 GB, percent = 6.2% +4: Time to load utils op: 0.0009427070617675781 seconds +4: Time to load utils op: 0.0009906291961669922 seconds +4: Time to load utils op: 0.001249551773071289 seconds +4: Time to load utils op: 0.001178741455078125 seconds +4: Time to load utils op: 0.0012519359588623047 seconds +4: Time to load utils op: 0.0013031959533691406 seconds +4: Time to load utils op: 0.0012896060943603516 seconds +4: Time to load utils op: 0.0013279914855957031 seconds +0: Time to load utils op: 0.0006537437438964844 secondsTime to load utils op: 0.0006463527679443359 seconds +0: +0: Time to load utils op: 0.0006515979766845703 secondsTime to load utils op: 0.0005764961242675781 seconds +0: +0: Time to load utils op: 0.0006632804870605469 seconds +0: Time to load utils op: 0.0005929470062255859 seconds +0: Time to load utils op: 0.0006248950958251953 seconds +5: Time to load utils op: 0.0005104541778564453 seconds +5: Time to load utils op: 0.0005133152008056641 seconds +6: Time to load utils op: 0.0003998279571533203 seconds +5: Time to load utils op: 0.0006418228149414062 seconds +6: Time to load utils op: 0.000499725341796875 seconds +5: Time to load utils op: 0.0006265640258789062 seconds +6: Time to load utils op: 0.0003936290740966797 seconds +6: Time to load utils op: 0.0005483627319335938 seconds +5: Time to load utils op: 0.0007028579711914062 seconds +6: Time to load utils op: 0.0004949569702148438 seconds +5: Time to load utils op: 0.0007224082946777344 seconds +6: Time to load utils op: 0.0005364418029785156 seconds +6: Time to load utils op: 0.000560760498046875 seconds +5: Time to load utils op: 0.0006954669952392578 seconds +6: Time to load utils op: 0.0005629062652587891 seconds +5: Time to load utils op: 0.0007617473602294922 seconds +1: Time to load utils op: 0.0004832744598388672 secondsTime to load utils op: 0.0004899501800537109 seconds +1: +1: Time to load utils op: 0.0004723072052001953 seconds +1: Time to load utils op: 0.0004534721374511719 seconds +1: Time to load utils op: 0.0004482269287109375 seconds +1: Time to load utils op: 0.0004477500915527344 seconds +1: Time to load utils op: 0.00042176246643066406 seconds +1: Time to load utils op: 0.0005011558532714844 seconds +7: Time to load utils op: 0.0008769035339355469 seconds +7: Time to load utils op: 0.0008547306060791016 seconds +7: Time to load utils op: 0.0011606216430664062 seconds +7: Time to load utils op: 0.0012111663818359375 seconds +7: Time to load utils op: 0.0011823177337646484 seconds +7: Time to load utils op: 0.0010819435119628906 seconds +7: Time to load utils op: 0.0012030601501464844 seconds +3: Time to load utils op: 0.0010123252868652344 seconds +3: Time to load utils op: 0.0009961128234863281 seconds +3: Time to load utils op: 0.0011529922485351562 seconds +3: Time to load utils op: 0.001298666000366211 secondsTime to load utils op: 0.0013325214385986328 seconds +3: +7: Time to load utils op: 0.0003814697265625 seconds +3: Time to load utils op: 0.0013179779052734375 seconds +3: Time to load utils op: 0.0013070106506347656 seconds +3: Time to load utils op: 0.0013246536254882812 seconds +2: Time to load utils op: 0.0010044574737548828 seconds +2: Time to load utils op: 0.0010330677032470703 seconds +2: Time to load utils op: 0.0010976791381835938 seconds +2: Time to load utils op: 0.0012927055358886719 seconds +2: Time to load utils op: 0.001314401626586914 seconds +2: Time to load utils op: 0.0013549327850341797 seconds +2: Time to load utils op: 0.0013308525085449219 seconds +2: Time to load utils op: 0.0013570785522460938 seconds +0: [2023-03-17 09:41:27,017] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 09:41:27,018] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-17 09:41:27,018] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,130] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 09:41:27,130] [INFO] [utils.py:828:see_memory_usage] MA 1.68 GB Max_MA 1.68 GB CA 2.21 GB Max_CA 2 GB +0: [2023-03-17 09:41:27,130] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,239] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 09:41:27,239] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,240] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,345] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 09:41:27,345] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,346] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,453] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 09:41:27,454] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,454] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,559] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 09:41:27,559] [INFO] [utils.py:828:see_memory_usage] MA 2.38 GB Max_MA 2.38 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,560] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,670] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 09:41:27,670] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,671] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,776] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 09:41:27,777] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.23 GB Max_CA 3 GB +0: [2023-03-17 09:41:27,777] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.46 GB, percent = 6.3% +0: [2023-03-17 09:41:27,777] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 09:41:27,777] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 09:41:27,777] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 09:41:27,778] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 09:41:27,778] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 09:41:27,778] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 09:41:27,778] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 09:41:27,778] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 09:41:27,778] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 09:41:27,779] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 09:41:27,780] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 09:41:27,780] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00042319297790527344 seconds +0: [2023-03-17 09:41:27,781] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-17 09:41:27,791] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=25 [0, 25) STAGE_PARAMS=421207040 (421.207M) TOTAL_PARAMS=421207040 (421.207M) UNIQUE_PARAMS=421207040 (421.207M) +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +2: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +1: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:27,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +0: [2023-03-17 09:41:27,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +4: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt... +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/mp_rank_00_model_states.pt. +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:27,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +4: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +3: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +7: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +2: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +5: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:41:28,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt... +6: [2023-03-17 09:41:28,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +6: [2023-03-17 09:41:28,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +2: [2023-03-17 09:41:28,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +7: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:41:28,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +5: [2023-03-17 09:41:28,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +4: [2023-03-17 09:41:28,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:41:28,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_01-model_00-model_states.pt. +3: [2023-03-17 09:41:28,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +3: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +7: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +2: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +6: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +5: [2023-03-17 09:41:28,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:41:28,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +6: [2023-03-17 09:41:28,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +4: [2023-03-17 09:41:28,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +3: [2023-03-17 09:41:28,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +5: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:41:28,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +7: [2023-03-17 09:41:28,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:41:28,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_03-model_00-model_states.pt. +2: [2023-03-17 09:41:28,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +6: [2023-03-17 09:41:28,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +4: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +3: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +5: [2023-03-17 09:41:28,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +2: [2023-03-17 09:41:28,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:41:28,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt... +7: [2023-03-17 09:41:28,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +7: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +2: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +3: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +5: [2023-03-17 09:41:28,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +6: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +4: [2023-03-17 09:41:28,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:41:28,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +6: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +5: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +7: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +3: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +4: [2023-03-17 09:41:28,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +2: [2023-03-17 09:41:28,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +2: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +6: [2023-03-17 09:41:28,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +3: [2023-03-17 09:41:28,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +5: [2023-03-17 09:41:28,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +4: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +7: [2023-03-17 09:41:28,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:41:28,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:41:28,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +6: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +2: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +4: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +3: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +7: [2023-03-17 09:41:28,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:41:28,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:41:28,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt... +5: [2023-03-17 09:41:28,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +6: [2023-03-17 09:41:28,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +4: [2023-03-17 09:41:28,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +7: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:41:28,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +2: [2023-03-17 09:41:28,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:28,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:28,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:28,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:28,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +3: [2023-03-17 09:41:28,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:28,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +5: [2023-03-17 09:41:28,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:28,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:41:28,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:28,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +4: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:29,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +6: [2023-03-17 09:41:29,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +5: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +3: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +7: [2023-03-17 09:41:29,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:41:29,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt... +2: [2023-03-17 09:41:29,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +4: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +2: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +6: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +7: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +3: [2023-03-17 09:41:29,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +5: [2023-03-17 09:41:29,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:41:29,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +6: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +3: [2023-03-17 09:41:29,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:41:29,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:41:29,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +4: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +5: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +2: [2023-03-17 09:41:29,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt... +7: [2023-03-17 09:41:29,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +4: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:41:29,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +6: [2023-03-17 09:41:29,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +3: [2023-03-17 09:41:29,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +7: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +2: [2023-03-17 09:41:29,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +5: [2023-03-17 09:41:29,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:41:29,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:41:29,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +4: [2023-03-17 09:41:29,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +2: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +5: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +7: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +6: [2023-03-17 09:41:29,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:41:29,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +3: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +4: [2023-03-17 09:41:29,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +6: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +5: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +7: [2023-03-17 09:41:29,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +2: [2023-03-17 09:41:29,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:41:29,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +4: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +5: [2023-03-17 09:41:29,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +7: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +6: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +3: [2023-03-17 09:41:29,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:41:29,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt... +2: [2023-03-17 09:41:29,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +2: [2023-03-17 09:41:29,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +6: [2023-03-17 09:41:29,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +3: [2023-03-17 09:41:29,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +4: [2023-03-17 09:41:29,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +7: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +5: [2023-03-17 09:41:29,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:41:29,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +1: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +4: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +6: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +3: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +0: [2023-03-17 09:41:29,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +2: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +7: [2023-03-17 09:41:29,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt... +5: [2023-03-17 09:41:29,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +4: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +3: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +2: [2023-03-17 09:41:29,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +5: [2023-03-17 09:41:29,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +1: [2023-03-17 09:41:29,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +6: [2023-03-17 09:41:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +0: [2023-03-17 09:41:29,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_11-model_00-model_states.pt. +7: [2023-03-17 09:41:29,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +6: [2023-03-17 09:41:29,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +7: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +4: [2023-03-17 09:41:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +5: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +3: [2023-03-17 09:41:29,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt... +2: [2023-03-17 09:41:29,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +2: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +6: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,822] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +5: [2023-03-17 09:41:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +3: [2023-03-17 09:41:29,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +4: [2023-03-17 09:41:29,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +7: [2023-03-17 09:41:29,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:41:29,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:29,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +1: [2023-03-17 09:41:29,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:29,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +1: [2023-03-17 09:41:29,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +5: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +6: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +4: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +3: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +0: [2023-03-17 09:41:29,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:29,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:29,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +7: [2023-03-17 09:41:29,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:29,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt... +2: [2023-03-17 09:41:29,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +0: [2023-03-17 09:41:29,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +5: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +6: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:29,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +4: [2023-03-17 09:41:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +3: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +2: [2023-03-17 09:41:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:29,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:29,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:29,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_13-model_00-model_states.pt. +7: [2023-03-17 09:41:29,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +6: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +2: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +5: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +0: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +7: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +4: [2023-03-17 09:41:30,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +3: [2023-03-17 09:41:30,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt... +1: [2023-03-17 09:41:30,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +4: [2023-03-17 09:41:30,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +2: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +6: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +5: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +3: [2023-03-17 09:41:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +1: [2023-03-17 09:41:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +7: [2023-03-17 09:41:30,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_14-model_00-model_states.pt. +0: [2023-03-17 09:41:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +6: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +1: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +0: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +7: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +4: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +3: [2023-03-17 09:41:30,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +2: [2023-03-17 09:41:30,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt... +5: [2023-03-17 09:41:30,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +5: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +3: [2023-03-17 09:41:30,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +7: [2023-03-17 09:41:30,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +6: [2023-03-17 09:41:30,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +1: [2023-03-17 09:41:30,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +2: [2023-03-17 09:41:30,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +4: [2023-03-17 09:41:30,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_15-model_00-model_states.pt. +0: [2023-03-17 09:41:30,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +4: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +6: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +7: [2023-03-17 09:41:30,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +1: [2023-03-17 09:41:30,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +3: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +5: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +0: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt... +2: [2023-03-17 09:41:30,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +2: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +6: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +3: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +4: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +5: [2023-03-17 09:41:30,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +1: [2023-03-17 09:41:30,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +7: [2023-03-17 09:41:30,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_16-model_00-model_states.pt. +0: [2023-03-17 09:41:30,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +5: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +4: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +6: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +1: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +7: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +3: [2023-03-17 09:41:30,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt... +0: [2023-03-17 09:41:30,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +2: [2023-03-17 09:41:30,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +6: [2023-03-17 09:41:30,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +5: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +3: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +4: [2023-03-17 09:41:30,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +1: [2023-03-17 09:41:30,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +7: [2023-03-17 09:41:30,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_17-model_00-model_states.pt. +0: [2023-03-17 09:41:30,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +4: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +1: [2023-03-17 09:41:30,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +6: [2023-03-17 09:41:30,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +0: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +2: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +3: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +7: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt... +5: [2023-03-17 09:41:30,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +1: [2023-03-17 09:41:30,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +7: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +4: [2023-03-17 09:41:30,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +6: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +3: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +5: [2023-03-17 09:41:30,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +2: [2023-03-17 09:41:30,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_18-model_00-model_states.pt. +0: [2023-03-17 09:41:30,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +4: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +5: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +3: [2023-03-17 09:41:30,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +6: [2023-03-17 09:41:30,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +7: [2023-03-17 09:41:30,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +0: [2023-03-17 09:41:30,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt... +1: [2023-03-17 09:41:30,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +5: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +4: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +2: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +3: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +6: [2023-03-17 09:41:30,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +0: [2023-03-17 09:41:30,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +7: [2023-03-17 09:41:30,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_19-model_00-model_states.pt. +1: [2023-03-17 09:41:30,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +5: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +4: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +7: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +1: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +3: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +0: [2023-03-17 09:41:30,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +6: [2023-03-17 09:41:30,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt... +2: [2023-03-17 09:41:30,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +5: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +6: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +2: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +4: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +1: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: > overriding learning rate value to 0.0002 +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +1: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +2: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +5: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +7: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +3: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +3: [2023-03-17 09:41:30,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +6: [2023-03-17 09:41:30,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +6: [2023-03-17 09:41:30,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_20-model_00-model_states.pt. +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:41:30,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +0: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +0: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/layer_22-model_00-model_states.pt. +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 09:41:30,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 09:41:30,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 09:41:30,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 09:41:30,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:41:31,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,213] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +6: [2023-03-17 09:41:31,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,214] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +0: [2023-03-17 09:41:31,216] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +6: [2023-03-17 09:41:31,217] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +2: [2023-03-17 09:41:31,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,219] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +2: [2023-03-17 09:41:31,223] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +3: [2023-03-17 09:41:31,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,233] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +5: [2023-03-17 09:41:31,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,235] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +3: [2023-03-17 09:41:31,236] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +5: [2023-03-17 09:41:31,239] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +4: [2023-03-17 09:41:31,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,242] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +4: [2023-03-17 09:41:31,245] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +7: [2023-03-17 09:41:31,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,249] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +7: [2023-03-17 09:41:31,251] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +1: [2023-03-17 09:41:31,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,256] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +1: [2023-03-17 09:41:31,259] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +6: [2023-03-17 09:41:31,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,260] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +6: [2023-03-17 09:41:31,263] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +1: [2023-03-17 09:41:31,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,263] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +2: [2023-03-17 09:41:31,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,265] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +1: [2023-03-17 09:41:31,266] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +0: [2023-03-17 09:41:31,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,267] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +4: [2023-03-17 09:41:31,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,268] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +2: [2023-03-17 09:41:31,269] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +0: [2023-03-17 09:41:31,270] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +4: [2023-03-17 09:41:31,271] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +1: [2023-03-17 09:41:31,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,271] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +5: [2023-03-17 09:41:31,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,273] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +1: [2023-03-17 09:41:31,275] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +5: [2023-03-17 09:41:31,276] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +0: [2023-03-17 09:41:31,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,277] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +0: [2023-03-17 09:41:31,280] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +3: [2023-03-17 09:41:31,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,285] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +3: [2023-03-17 09:41:31,288] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +7: [2023-03-17 09:41:31,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,294] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +7: [2023-03-17 09:41:31,297] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +6: [2023-03-17 09:41:31,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,302] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +1: [2023-03-17 09:41:31,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,305] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +6: [2023-03-17 09:41:31,305] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +5: [2023-03-17 09:41:31,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,307] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +6: [2023-03-17 09:41:31,307] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +1: [2023-03-17 09:41:31,308] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +6: [2023-03-17 09:41:31,310] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +7: [2023-03-17 09:41:31,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,310] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +5: [2023-03-17 09:41:31,311] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +7: [2023-03-17 09:41:31,313] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +3: [2023-03-17 09:41:31,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,319] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +7: [2023-03-17 09:41:31,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,321] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +2: [2023-03-17 09:41:31,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,322] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +3: [2023-03-17 09:41:31,322] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +0: [2023-03-17 09:41:31,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,324] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +7: [2023-03-17 09:41:31,324] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +5: [2023-03-17 09:41:31,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +5: [2023-03-17 09:41:31,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +2: [2023-03-17 09:41:31,325] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +0: [2023-03-17 09:41:31,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,326] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +2: [2023-03-17 09:41:31,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,326] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +6: [2023-03-17 09:41:31,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,327] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +0: [2023-03-17 09:41:31,327] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +5: [2023-03-17 09:41:31,328] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +5: [2023-03-17 09:41:31,328] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +2: [2023-03-17 09:41:31,329] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +0: [2023-03-17 09:41:31,330] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +6: [2023-03-17 09:41:31,330] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +3: [2023-03-17 09:41:31,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,334] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +3: [2023-03-17 09:41:31,334] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +3: [2023-03-17 09:41:31,337] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +5: [2023-03-17 09:41:31,337] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +5: [2023-03-17 09:41:31,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,339] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +4: [2023-03-17 09:41:31,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,339] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +2: [2023-03-17 09:41:31,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,341] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +5: [2023-03-17 09:41:31,342] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +4: [2023-03-17 09:41:31,342] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +1: [2023-03-17 09:41:31,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,343] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +2: [2023-03-17 09:41:31,344] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +1: [2023-03-17 09:41:31,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,346] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +3: [2023-03-17 09:41:31,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,347] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +1: [2023-03-17 09:41:31,347] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +2: [2023-03-17 09:41:31,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,348] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +0: [2023-03-17 09:41:31,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,350] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +0: [2023-03-17 09:41:31,350] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +3: [2023-03-17 09:41:31,350] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +2: [2023-03-17 09:41:31,351] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +0: [2023-03-17 09:41:31,353] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +1: [2023-03-17 09:41:31,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,354] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +1: [2023-03-17 09:41:31,354] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +0: [2023-03-17 09:41:31,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,354] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +3: [2023-03-17 09:41:31,357] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +0: [2023-03-17 09:41:31,358] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +4: [2023-03-17 09:41:31,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,359] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +7: [2023-03-17 09:41:31,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,362] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +4: [2023-03-17 09:41:31,362] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +1: [2023-03-17 09:41:31,358] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +4: [2023-03-17 09:41:31,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,365] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +7: [2023-03-17 09:41:31,366] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +4: [2023-03-17 09:41:31,368] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +6: [2023-03-17 09:41:31,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,368] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +7: [2023-03-17 09:41:31,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,369] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +2: [2023-03-17 09:41:31,369] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +6: [2023-03-17 09:41:31,372] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +7: [2023-03-17 09:41:31,373] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +2: [2023-03-17 09:41:31,373] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +2: [2023-03-17 09:41:31,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 09:41:31,374] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +2: [2023-03-17 09:41:31,377] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +3: [2023-03-17 09:41:31,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,378] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +0: [2023-03-17 09:41:31,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:41:31,379] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +6: [2023-03-17 09:41:31,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,379] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +3: [2023-03-17 09:41:31,383] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +6: [2023-03-17 09:41:31,383] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +0: [2023-03-17 09:41:31,383] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +3: [2023-03-17 09:41:31,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 09:41:31,390] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +5: [2023-03-17 09:41:31,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 09:41:31,390] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +4: [2023-03-17 09:41:31,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,392] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +3: [2023-03-17 09:41:31,393] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +5: [2023-03-17 09:41:31,394] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +4: [2023-03-17 09:41:31,396] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +6: [2023-03-17 09:41:31,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 09:41:31,401] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +6: [2023-03-17 09:41:31,405] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +1: [2023-03-17 09:41:31,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,415] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +7: [2023-03-17 09:41:31,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 09:41:31,417] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +7: [2023-03-17 09:41:31,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:41:31,418] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +7: [2023-03-17 09:41:31,418] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +7: [2023-03-17 09:41:31,420] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +7: [2023-03-17 09:41:31,421] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +4: [2023-03-17 09:41:31,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,618] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +4: [2023-03-17 09:41:31,620] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +4: [2023-03-17 09:41:31,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 09:41:31,807] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +4: [2023-03-17 09:41:31,810] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +0: successfully loaded checkpoint from checkpoints_421m60b400m at iteration 0 +7: time (ms) | load-checkpoint: 4036.46 +0: estimated model parameters: 0.42120704 +0: estimated model parameters without embeddings: 0.35419648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 09:41:32 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.210584 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.064 seconds +0: total number of samples: 48805 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.042907 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.073 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 09:41:45 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 19161.20 | train/valid/test-data-iterators-setup: 12747.67 +0: [after training is done] datetime: 2023-03-17 09:41:45 +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.431953E+00 | lm loss PPL: 3.093702E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3328560: Fri 17 Mar 2023 09:42:16 AM EET diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f51206820e0fa17179a747bca3eb9cf3c028b6 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0c2a9beaf11dd292eb29389f113ff584b871df184c57250aad70a91483fff0 +size 78980887 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf5254cd80e4983c183aac21025c23e9bc87d685 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be33c366c25691820ff98464a1cc1e646bc44a3b8de67066e4893e8304e3cb01 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..349cd3c01ade49f6ccbc496e90bae684dbd4d25b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f3b627f2bfda39c3036c5bbd38500c63bc7ea5aa671333cf30c4ec6f21ad0 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d73a135ddd2ada6ae77849af6a8faf966ff1326d --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c159f7e26b6d5c187842dac84a5b947f9e846f80a9d4cb5fed6f9200c93361a +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5ef19a3c76a51ef6a737e3487c645234c0cbbf4 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4f4447f78f4277d6baf5defab54ecc0a5089012b5703ace6f68cb9ddc34c078 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92a9c3f6c0623881e07eb0adbeeeba846f117188 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe15ba427e358a585912ec1f09d048b49e006f585e83d65244484248c46ecbc +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cb46c987b1653f54c6e949269b8ff09a748fcf1 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ecb7a83003b06b37338a0591ce6286c1ad467e0bee2af1f273e6f17846390d +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748681845355540a82dc8bac7f4274e0f1426622 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0f5ad8396955b97078779fdc52eefd5cb237d1010c79223e95ed7b55e43b65f +size 78980834 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1390c56321ac843ec14f84c307ef713ac8ced5b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e53d136bc609110bd90d45d6de794176fa4dd77bed19e7e32e577ca8a7cc5235 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c891e7b3968cf9344d8bb5f23455f5b4e340db5 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9c4b1568c9929b4258e108d13d16cac98c7bcaa4a4cb1c69d87c1777a8da6ae +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39db93a5edf97d0c9129bf2ba39a6ca23753aeb3 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82df2453a46ba8ad268a2b44cfbe8073978ad9a30066d2c2adb3271c60a03a63 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db4a4937c298ccf055d76608596f2ccc7f4bfdf5 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8a8b2cdb9f295432088e4a4d44fe9becfcd2e6a8fdf14aeb2560482e97f55c1 +size 78980951 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cef322e3b051d5698955fcf688ab0821d937da7 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dff6ec9483f99ee9841b0d15cbdf13f7253486bae4e8607c844b43b5a4caf2b4 +size 78980834 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8d038af05b97b9eb79abe9ac7e09d79628b6120 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1fa97e1e700964e61994100793fc55d0217ae7c313b131d1b7fb63357f6af6 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2528f190ad2f2f4c2bfcec16b0d75453a528550 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2677c048edac85173e10415fbb11472c6667601886a92fae1eae1f60b05bdfb +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed54e8eec6e33a7f3d4f637181d2487aa6f45e9c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678c261f736761094be7c69cc98920e7debb1a9e6cfcbbf4ad0abea83c0919fb +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99cb525bf8274006cecbd8ac4da871118064867e --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5975543b94247cf130d485805e925bda151f6e9082c1e8457464fc2a3ce99c9 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80b99571b16ca26f00dea4aae5b2ba9415f9020b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450a7a5b2ed5584980917a10b3d02530d9814734ea3b1c333774153a5b726bab +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..259399b10fd5538afaee576abdfb451370b7ca78 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcdf2e414104f04ad2f144990f3016b1cd1e5e0b67351a701974a8e57dc5046c +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25cc491e89b61404ca4921963cec22b73309f14b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b0683a31f5bc4c9836e481597d299e2cb2d9b8f8f43179d09aa98e8b8be052 +size 78980834 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b31fde647320510c0374482ec01614475b5f9207 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64f3f2c803462f42c861247f065267c0311d7c7b3e882fc51d4a9f471968ef73 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33771440bbcfda578c899b4a69e07eee29bc445a --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cfb86e6cd7b27d9d25fd9c46476b26917e096b1a68aae74634d114c333290b3 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6739a30bd196e5f9b280046b7d79034c75540820 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0024f45a7a85cbfad1855cf14462fd72202a2475d0e75121b1e27b696f668a2 +size 78980887 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21dd45b8764e3b71416bbcd588424fbdd673876 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5d95109f88ad51eb4c85f32ff1046396ba9cff9baf90e6d7d591528679b612 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81559f15f69ffe9cdb44a46666664c0d51f6bd04 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bea7a6ffe7c9db0cbeee30e74be3d3cf87d0e95af83bc3c9860752a8242485d +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d56ab5aad963ecdf66076dcaf64a0b0b7bdc32c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e15646024c013c3c7bcdaf2d13307b844a31e4b8ac26ecd92750e7e4feee1a78 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3f9e0a6403828c663df657d3bdb2410d990474e --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5c861aa1317397cf12eec9e25a3e6dec304ef26512a0ef40c106ca0fcb89a0 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..227f6f32510ef8d2573adc30d83698882d8bfedb --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c92574908c517c8ac16b6d4b927146a287bf9dad35ef9f52c91ca3143b9d93b +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83a8cc5da0740bc11494d7a3644ec6c3726f1dbf --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59d6590a3058be5cd3366e2413cc5d666f951b3d68116b89e7551363135f7c47 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3eaeaf346322e1078cb7419fa194449935514b5d --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a487a11363749cbc58efe46dee88d4efd7bcf4c89adfa7f169e01dbdc9d60d0 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c699c721acf252c186ba1c5f5a869474612f998 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a2d8d5e6536d351d9d8435e491d97314707b644055176625e451a5889cb61a +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ef55b22680beb86d1d989143cd8ebe181e2285 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b38d1aa74dd915f6c253fec3ca67a40985632bfdc9c8d2f6d46669d840d794 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bedfdb40747b25be827d25b2ceb62f589aedff2b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8574d1042fb3bcbc850436d3c621169c63ac8fbb9c6b240da4af3fe56d477716 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ccefa5873642f30dcbe266f21ba65b1fa9ca6f2 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:755d971590da498e7f85c0e0d885512b6578d1c695925a8716ba1946ea3e64fa +size 78980887 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bdff57bbb54f2eb2a1ca0312a495f99ef247a83 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f8a6f32e065febfb8d3f6a652401a3330cd2f460b576ca63998f34886ba6342 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea39b99a615766c61999b25ad50e0007b9f5ae2 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcce2cebf60a881be61f9b63925e8b7fd8f8d89743b7294dac54936cb6662861 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8653db26286748bbc383f01c0e09a9915746a262 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d832ec98579afb6509f7767b8cb15fdb4e647467c2cad64e4e3f1112301f32bc +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c5e48c24f286d1203cd3771a5fdd64f8cb4f366 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9736e1799720c9c3ecbd3a4ec6ae7ed465d160f937e8dd1d5df919030147e597 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ba7f302cb25d65133a7c0657756768b345bf936 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7be4cc2a4572e6ea1591ff64bbda1b8f6ce0d21b5cb42782fa9d4630097ddd9 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..383c78e73cd32aeb39815a1a1f97f32e2c8f7e2c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00064ef7977a66eee0740e6b3277fdd53aea95e3d9ce917da82aedbac66e9f84 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c67cc9472d4c83400f5e861ce5249f7e1841ed1 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e909bcceea42f3ced46fd7c8822ebabf573af09cab7ea728de83018e92ec9fa9 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ef5a46d06a24031ea906857d5270f58da518b3d --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f08c5550a6749ad9c75d117b8e18f7340a432ffd2be8d3d82e5d2485d8411f +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97d67a6e40dee2c1c291215eb8a0ca385fa62c78 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:316277b603d56aacf7cc78b2019118a1f6b54acb81d64ba101242e52d7652f8c +size 78980834 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71a78118a585c405400a1ae60b68222fa856d528 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c776eaf15cbf900a82c9ee99c79822a16d6ebf40150c712d92a5664775147d1 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a07f6ae1ca1f983cfb8d3c37602a027195d26b67 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f4227d67c30fdf0046829ae2f50b73d93e76a84d95ece86f3b408957fd31f8 +size 78980951 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45807755d00c2da764760dbc80fa93fc20591174 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ceecc4b7555c6fc8223f5c62a1da4e8aa4b6b66f8aa619a5692a3baecfe7f7 +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd6b56f30e0e5311c2759a6ae7fd6d1a6a483abe --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0bbe3ebd84ec4f571ff2f981ffec0e9ec156765401424886953da089ffed3bf +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f3646095f93e17c77609e9f3cd4c243ab85589f --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d77f2ece8c26dd8be04f1c422e06b1fb293a365b44ef580e311826069dc11d +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e354caf11faec87b6ae4d790e020a2dc717738c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5323c82bd6247e48a20566a2cdf5b5b98b493e33e731995086277b526c17650b +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4298452260efd5363c50957601ca191dcb609b0c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea0d0f2e640762352a45d9c51bbf69e4094828a2a27f8228bc831785012fb89 +size 78981026 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71eb5c95513b8de5a3a8da506eee3e3a52ffd6f8 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f790d59495d558dce94900bee7ce6c4383000595019e3f55a603eea95e8bb33c +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1985221e5e2bcdafea9edbd703eb738119c82b76 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e7e0552739b2f9dfb74658dd0f7300eadb16d08d6302162fa4eddde2be0155a +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21599daea83f0fc9f8f40dd7d23dfb48f99c4668 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:902debb3d589921801f26628bc4056ee6d630b10eec1564b3dc5bd64776a2334 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ff8885d2bca71a51ec4a029f1e5a1589f0219b2 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6200cb640d738c1050ccf6555465ad50802a666c760e4a3d442c679a5757e86 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0253b16839f8e0c8687c8bfba5e285736725f387 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2755c927640174e0885d565952ae97e5eda2148f4cea01de418e3f3e87d7cf6 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49583bf2b8da2c3422ffd12242862e2d6e600996 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de98ba24949fc5cc09c536cfcb52f26f8ce630e9b29029e0d009a7148d9aeee +size 78980951 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a6bb5b30ffe82703ca28b42199959de0110cf3b --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9e1ce4c101f160c5631655f4b5e0c4c9cd0411f8d02fd0a89bb2c714d0ac9b +size 78980962 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecda78548d3f17c136abd93591540f483727e43e --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bc4c30ea1daa2f4ab8227c0b554a5f2f341f1394dd867a6176766603d47bfc3 +size 78981090 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..616cca4c5f8a6dcf8f607aa134b7c5e4ca296bd6 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cc57e047b9752197aa1a966616db6132b13027ff167afdab59c3e01e4e73f5 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80ea9fcea5ca05101b0b0dbecbd7cf5a29bbf7e3 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbcb4e14e8a6096b978f38c97d268d91c4b6b9eac0e5a786f6450796c48929b8 +size 78980898 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bb2bc285b43f6c08e91056126f77b164cbfd70c --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca66fddb3b7b7695a0f18733d962eea62e0e571369dff0e054d4dca316e3b6f9 +size 78980887 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1ca319d3b722286afb9b552f5dd9c8f519527d7 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:700eba32c055a8b70d32f632d130f09b1ca35e9dea2f950dadeffb3d6f69cd79 +size 78980887 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42ebb4dd51c235767906dd37dfed2b35e82c8cbd --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783efce4200c227449c5a5091822796d55b724483fe8177053ff7009557d9207 +size 78980951 diff --git a/421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cf1da6ece6d4890b40b333a97a9d5777ef0c900 --- /dev/null +++ b/421m60b400m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282f328bd80aa27c406e5a78b9c5196e32fc91e1949ed4da608e5b71f3b40d2b +size 78980887 diff --git a/421m60b400m/global_step115203/layer_01-model_00-model_states.pt b/421m60b400m/global_step115203/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba51f417fa55fdf56c8cb8c51b3d3b0e991ed74f --- /dev/null +++ b/421m60b400m/global_step115203/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3cafc8d9d0eb73c0508b316e4b9565aa5a97dcd0c63062328d26517c6e72e10 +size 134022403 diff --git a/421m60b400m/global_step115203/layer_03-model_00-model_states.pt b/421m60b400m/global_step115203/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77412863d23db92b60632d25ba1b763e6a256bd6 --- /dev/null +++ b/421m60b400m/global_step115203/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b6b33fafe6570d96c9702b62b495069afec0c78aacf4c0417f855bfa6780a3 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_04-model_00-model_states.pt b/421m60b400m/global_step115203/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..daf7b274b7a0177bdc9a99c33afefcf0a601f3e9 --- /dev/null +++ b/421m60b400m/global_step115203/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5296b593dc5726f79fe10a09d2839f4a8cd841c7cae3285ca6aa2f820aeec576 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_05-model_00-model_states.pt b/421m60b400m/global_step115203/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..977830472f2ce8e1d910fe31587939c4f5507239 --- /dev/null +++ b/421m60b400m/global_step115203/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d97b98e39610088dde3682362a9018f8f254c7b723b3d78ecf6ad84c7b5323 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_06-model_00-model_states.pt b/421m60b400m/global_step115203/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..462abb51748a33bf18b1a78f4cb141e357aa4d37 --- /dev/null +++ b/421m60b400m/global_step115203/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2071a8dd77b054065244a98ac224832588fa495c1455977c82d5a943ddc19354 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_07-model_00-model_states.pt b/421m60b400m/global_step115203/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aef9c5d48ed365d6cdbe09a79d56d6df6b933ba9 --- /dev/null +++ b/421m60b400m/global_step115203/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e124f1f54e8faaa5eb1d495c013653eb49a391e4a1d9ac113383c70294881d +size 39359235 diff --git a/421m60b400m/global_step115203/layer_08-model_00-model_states.pt b/421m60b400m/global_step115203/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e829c468635e3fbacfcacbdc44308a196c79189 --- /dev/null +++ b/421m60b400m/global_step115203/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c86f8c342de94e592795e07668af90c18afafb77a0e7d08e423249531abc19 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_09-model_00-model_states.pt b/421m60b400m/global_step115203/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb1eb4620eafc3fad678a31c99856db94f6476e0 --- /dev/null +++ b/421m60b400m/global_step115203/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65bdb081ed3c690ae1837018e5bfd28ecd3942ede50eb9280a3cab6e1cb002c0 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_10-model_00-model_states.pt b/421m60b400m/global_step115203/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..943ea2f6d6abead063bf29da25067049281ebb87 --- /dev/null +++ b/421m60b400m/global_step115203/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3238b272a903b70e94cb197058294f7474c1ed5761eab8efe7776a3e1599d450 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_11-model_00-model_states.pt b/421m60b400m/global_step115203/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fbad5d17f27b97603c51fe37626075de1f3ece7 --- /dev/null +++ b/421m60b400m/global_step115203/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4561fdef58dc64d909a804ca1cf7b2d8a6815dfb1b338371b39af32223eb3c +size 39359235 diff --git a/421m60b400m/global_step115203/layer_12-model_00-model_states.pt b/421m60b400m/global_step115203/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d587b42d47099cd00ba54566968360348abe2469 --- /dev/null +++ b/421m60b400m/global_step115203/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77200d2c0defb48a8a95b328cf1d181ecb54f6fd0687772c26d1ee6b17c7d063 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_13-model_00-model_states.pt b/421m60b400m/global_step115203/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb9a54a19c703d705614ccbcc8a6e539769bacc5 --- /dev/null +++ b/421m60b400m/global_step115203/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e83f51a6a42ac33f1e1db0fc3dffb305e437e71d60767c04f0f74dbb62a8130 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_14-model_00-model_states.pt b/421m60b400m/global_step115203/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c6d91751c2828dd8282bd0f4571c44669a14930 --- /dev/null +++ b/421m60b400m/global_step115203/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b0aa8333c0b85694eaa16c0422f53e48a0e6005b6c191ef951c2dacb2b03f46 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_15-model_00-model_states.pt b/421m60b400m/global_step115203/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b97caba2a6de5b3f0ac20f5be4f7c097f090944c --- /dev/null +++ b/421m60b400m/global_step115203/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb940f0d39ef12a557064991f3b3653ed4fcb51206ca6e0931dc98d1cc25e33 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_16-model_00-model_states.pt b/421m60b400m/global_step115203/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98b885e7fa5631d32485c82eac8329698bad81bf --- /dev/null +++ b/421m60b400m/global_step115203/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d77fd4e72b37917f20f2d2eacae0e519448a0eb4469776249c732fb825ae091 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_17-model_00-model_states.pt b/421m60b400m/global_step115203/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b29f200b0df0454164e93bfeceb97d8b7353139 --- /dev/null +++ b/421m60b400m/global_step115203/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0719d410828b9b6f4ca70a320f06822543064eac98b55cb2463edebc86786ed +size 39359235 diff --git a/421m60b400m/global_step115203/layer_18-model_00-model_states.pt b/421m60b400m/global_step115203/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80ad0b4f88ee7f57fceab98a200f955df21ce1c7 --- /dev/null +++ b/421m60b400m/global_step115203/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341e44ad7ba96bc2989dd79985ef0d7744358d61f91b38c9e97177dfd1f7daeb +size 39359235 diff --git a/421m60b400m/global_step115203/layer_19-model_00-model_states.pt b/421m60b400m/global_step115203/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf701754ba4a7e4a0135bbfd5a0744c2484d6db6 --- /dev/null +++ b/421m60b400m/global_step115203/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1bcd09ca289d95abd6698f3ef4552281922ee9ffe0931370bb9316dcdd1db23 +size 39359235 diff --git a/421m60b400m/global_step115203/layer_20-model_00-model_states.pt b/421m60b400m/global_step115203/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d8ffe342ab928cea5f77f2fec7bfe3c2da60fdd --- /dev/null +++ b/421m60b400m/global_step115203/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dd60a2445c1f19406860cb408878d98e8b625c4dbb62a500b9e7bff579ab14e +size 39359235 diff --git a/421m60b400m/global_step115203/layer_22-model_00-model_states.pt b/421m60b400m/global_step115203/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0baaee88bce486ab8e8de2de22ed4ddf6dc40b6 --- /dev/null +++ b/421m60b400m/global_step115203/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a2fbcba947ac99976d6a07cb6ae42d83b54f59bb87ec427af302765ff28583c +size 6339 diff --git a/421m60b400m/global_step115203/mp_rank_00_model_states.pt b/421m60b400m/global_step115203/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d77e681afb8a83e229f3089414f9ef49b7636c2c --- /dev/null +++ b/421m60b400m/global_step115203/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56363a8a20bfab2c746dad7514a70793d350988614c56be2e4d7ba114e523cb4 +size 37747 diff --git a/421m60b400m/sbatch_421m60b400m.sh b/421m60b400m/sbatch_421m60b400m.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2fdaf6d1fb494d136001906731f2cab46b71dd7 --- /dev/null +++ b/421m60b400m/sbatch_421m60b400m.sh @@ -0,0 +1,166 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m60b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +# TRAIN_SAMPLES=15_446_035 +# Tokens: 60400000000 +# -> Samples: 29492188 +TRAIN_SAMPLES=29_492_188 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 294_922 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 10000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m60b400m/sbatch_421m60b400mval.sh b/421m60b400m/sbatch_421m60b400mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..54913f10503640b1352f3bf0754326d8f20108df --- /dev/null +++ b/421m60b400m/sbatch_421m60b400mval.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=421m60b400mval +VARIANT_CKPT=421m60b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_425M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid005153.126453.0 b/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid005153.126453.0 new file mode 100644 index 0000000000000000000000000000000000000000..bf03f652685dc64ea11ca9e921a4d0832d1437c8 --- /dev/null +++ b/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid005153.126453.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59b100822098a73cf1ca1803286f6a9f9c04f45da08939ba127542716ab825a +size 140745446 diff --git a/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid006552.116945.0 b/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid006552.116945.0 new file mode 100644 index 0000000000000000000000000000000000000000..bafd5a8d56388d389e35d60f604553a3710e7605 --- /dev/null +++ b/421m60b400m/tensorboard_421m60b400m/events.out.tfevents.1678963715.nid006552.116945.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0878ef61d59d8d737db9b2ed86b55c9836001e4526f79220d34ebb705eb55db9 +size 206403525 diff --git a/421m60b400m/tensorboard_421m60b400mval/events.out.tfevents.1679038843.nid006132.24460.0 b/421m60b400m/tensorboard_421m60b400mval/events.out.tfevents.1679038843.nid006132.24460.0 new file mode 100644 index 0000000000000000000000000000000000000000..155f84a8679931be36ee4a699268be38a5e1c9a0 --- /dev/null +++ b/421m60b400m/tensorboard_421m60b400mval/events.out.tfevents.1679038843.nid006132.24460.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:447cb93336e639a5aadbbbc566cfc914f4da2babf5b492093d449eb28ef1c643 +size 980 diff --git a/44m32b100m/3327118.err b/44m32b100m/3327118.err new file mode 100644 index 0000000000000000000000000000000000000000..ee75e3d501646429c996ce9a8fae38e68e4a44db --- /dev/null +++ b/44m32b100m/3327118.err @@ -0,0 +1,1124 @@ +4: 2023-03-17 00:17:16.290130: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290131: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290128: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: 2023-03-17 00:17:16.290145: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290158: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290150: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290156: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290150: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290144: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290143: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:17:16.290146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: 2023-03-17 00:17:16.290154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290163: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:17:16.290160: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: 2023-03-17 00:17:16.290306: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290313: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-17 00:17:16.290387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290395: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: 2023-03-17 00:17:16.290406: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290413: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290420: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-17 00:17:16.290417: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290422: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290321: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290400: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290416: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290321: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:17:16.290309: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-17 00:17:16.290408: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290410: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290428: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:17:16.290436: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290417: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:17:16.290419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:17:16.290405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292078: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292088: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292081: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292073: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:16.292081: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292248: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292241: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292249: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292256: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292257: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292260: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:17:16.292264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:17:18.172752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172764: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.172770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:18.173223: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173229: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173235: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173232: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173234: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173239: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173239: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:18.173246: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.205966: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205984: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205980: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.205988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:18.206201: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206207: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206208: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206209: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206209: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206214: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206216: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:17:18.206218: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.215781: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215778: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215787: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215790: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215787: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215796: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.215798: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:18.217293: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217299: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217311: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217303: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217307: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217311: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217313: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:17:18.217314: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.243749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243746: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.243766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:18.244194: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244197: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244198: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244206: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244208: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:17:18.244214: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255674: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:18.255872: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255877: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255881: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255882: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255884: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255886: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255887: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:17:18.255892: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257122: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257134: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257130: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257131: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:18.257702: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257705: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257710: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257712: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257714: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257715: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257717: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:17:18.257722: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.269985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.270002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.269996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:18.270208: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270208: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270212: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270215: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270216: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270220: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270222: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:17:18.270223: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289519: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289730: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289737: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289532: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:18.289743: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289747: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289750: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289756: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:17:18.289757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:17:23.850982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850980: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850993: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.850999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.851007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853195: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853196: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853197: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853217: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853219: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853221: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853223: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853224: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:17:23.853274: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:17:23.853290: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.856592: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856597: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856604: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.856629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857376: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857382: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857385: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857389: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857391: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857391: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857393: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.857397: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857702: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 00:17:23.857854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.857706: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 00:17:23.857857: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.857862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.857862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.857865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.858005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 00:17:23.857864: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.857870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.857873: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:17:23.858002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858053: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:17:23.858013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.858011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.858011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858057: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:17:23.858014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:17:23.858015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:17:23.858018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.858067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858259: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858272: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858267: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858276: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858278: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858274: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 00:17:23.858464: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858278: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.858279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858469: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858475: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858474: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858479: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858487: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858489: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:17:23.858525: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:17:23.858526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859318: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859318: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859322: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859335: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859335: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859335: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859339: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859338: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859340: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859380: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859384: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:17:23.859397: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:17:23.859399: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859771: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859772: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859786: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859780: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859783: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859788: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859789: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859794: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859796: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859800: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859799: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:17:23.859808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:17:23.859822: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860048: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860053: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860057: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860052: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860053: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860058: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860061: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860067: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860068: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860070: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:17:23.860072: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860305: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:17:23.860066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:17:23.860082: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860309: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860309: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860313: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860319: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860319: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860327: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860328: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860330: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860331: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860334: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:17:23.860336: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:17:23.860345: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879965: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.879982: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879982: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879984: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879983: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879983: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879986: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.879985: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:17:23.880014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:17:23.880029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859886: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859888: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859896: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859902: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859906: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859907: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859909: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859910: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859909: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859928: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:17:23.859942: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:17:23.859943: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +3: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +0: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +5: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: +6: Loading extension module utils... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils...Loading extension module utils... +7: +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/44m32b100m/3327118.out b/44m32b100m/3327118.out new file mode 100644 index 0000000000000000000000000000000000000000..afd5f6e5ed2f5dca7732c3cd47b3e5f485de7257 --- /dev/null +++ b/44m32b100m/3327118.out @@ -0,0 +1,24396 @@ +Model parameters: d_model 512 ffw_size 2048 kv_size 64 n_heads 8 n_layers 8 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 8 --hidden-size 512 --num-attention-heads 8 --kv-channels 64 --ffn-hidden-size 2048 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 15_446_035 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-44m32b100m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 15_446_035 --lr-warmup-samples 154_460 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_44m32b100m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_44m32b100m --load checkpoints_44m32b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3327118.json --zero-stage 0 +START 3327118: Fri 17 Mar 2023 12:16:52 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 39.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 47.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 43.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 38.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 38.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 49.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +6: Launching on nid005298 (6/8), master nid005292 port 9999, GPUs 8, CUDA: True +1: Launching on nid005293 (1/8), master nid005292 port 9999, GPUs 8, CUDA: True +5: Launching on nid005297 (5/8), master nid005292 port 9999, GPUs 8, CUDA: True +7: Launching on nid005299 (7/8), master nid005292 port 9999, GPUs 8, CUDA: True +4: Launching on nid005296 (4/8), master nid005292 port 9999, GPUs 8, CUDA: True +0: Launching on nid005292 (0/8), master nid005292 port 9999, GPUs 8, CUDA: True +3: Launching on nid005295 (3/8), master nid005292 port 9999, GPUs 8, CUDA: True +2: Launching on nid005294 (2/8), master nid005292 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3327118.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2048 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 512 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-44m32b100m +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_44m32b100m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 15446035 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 154460 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 8 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 8 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_44m32b100m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_44m32b100m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 15446035 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +7: > setting tensorboard ... +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 00:17:48,447] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.105 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 26.822 seconds +0: time to initialize megatron (seconds): 42.256 +0: [after megatron is initialized] datetime: 2023-03-17 00:18:18 +0: building GPT model ... +0: [2023-03-17 00:18:18,392] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 00:18:18,393] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 00:18:18,393] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.73 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-17 00:18:20,388] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=15 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: undo +0: 12: MixedFusedLayerNorm +0: 13: EmbeddingPipe +0: 14: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 00:18:20,641] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 00:18:20,642] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:18:20,642] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.75 GB, percent = 6.1% +0: setting training iterations to 60336 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 00:18:20,643] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 00:18:33,266] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 00:18:33,266] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 00:18:33,266] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 00:18:33,268] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 00:18:33,268] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 00:18:33,394] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 00:18:33,395] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:18:33,395] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.41 GB, percent = 6.2% +0: ninja: no work to do. +0: Time to load utils op: 0.16084051132202148 seconds +0: Time to load utils op: 0.0005719661712646484 seconds +3: ninja: no work to do. +3: Time to load utils op: 0.1715831756591797 seconds +0: Time to load utils op: 0.20340704917907715 seconds +0: Time to load utils op: 0.2029438018798828 seconds +0: Time to load utils op: 0.2026829719543457 seconds +0: Time to load utils op: 0.2030336856842041 seconds +0: Time to load utils op: 0.202911376953125 seconds +0: Time to load utils op: 0.20281362533569336 seconds +0: Time to load utils op: 0.20273756980895996 seconds +3: Time to load utils op: 0.205657958984375 seconds +1: Time to load utils op: 0.2125396728515625 seconds +1: Time to load utils op: 0.21239185333251953 seconds +1: Time to load utils op: 0.21232271194458008 seconds +1: Time to load utils op: 0.212388277053833 secondsTime to load utils op: 0.21252012252807617 seconds +1: +1: Time to load utils op: 0.21255207061767578 secondsTime to load utils op: 0.21237635612487793 seconds +1: +1: Time to load utils op: 0.21274232864379883 seconds +2: Time to load utils op: 0.2125413417816162 secondsTime to load utils op: 0.21282362937927246 secondsTime to load utils op: 0.21300220489501953 secondsTime to load utils op: 0.21231555938720703 seconds +2: +2: +2: +2: Time to load utils op: 0.21300387382507324 seconds +2: Time to load utils op: 0.21242427825927734 secondsTime to load utils op: 0.2123098373413086 seconds +2: +2: Time to load utils op: 0.21280837059020996 seconds +3: Time to load utils op: 0.206099271774292 seconds +3: Time to load utils op: 0.2061598300933838 seconds +3: Time to load utils op: 0.2063748836517334 seconds +3: Time to load utils op: 0.20643043518066406 seconds +3: Time to load utils op: 0.20647954940795898 seconds +3: Time to load utils op: 0.20671319961547852 seconds +0: Time to load utils op: 0.0005929470062255859 secondsTime to load utils op: 0.0005102157592773438 seconds +0: +0: Time to load utils op: 0.0004429817199707031 seconds +0: Time to load utils op: 0.0004048347473144531 seconds +0: Time to load utils op: 0.0003952980041503906 seconds +0: Time to load utils op: 0.0003902912139892578 seconds +5: Time to load utils op: 0.21164560317993164 secondsTime to load utils op: 0.21165084838867188 secondsTime to load utils op: 0.21165132522583008 secondsTime to load utils op: 0.21168065071105957 seconds +5: +5: +5: +5: Time to load utils op: 0.2116408348083496 seconds +5: Time to load utils op: 0.21166205406188965 secondsTime to load utils op: 0.21169114112854004 seconds +5: Time to load utils op: 0.21166610717773438 seconds +5: +4: Time to load utils op: 0.2124631404876709 seconds +4: Time to load utils op: 0.21249127388000488 seconds +4: Time to load utils op: 0.2124783992767334 seconds +4: Time to load utils op: 0.21248555183410645 seconds +4: Time to load utils op: 0.21250486373901367 seconds +4: Time to load utils op: 0.2125091552734375 seconds +4: Time to load utils op: 0.21251749992370605 secondsTime to load utils op: 0.21248459815979004 seconds +4: +6: Time to load utils op: 0.2115936279296875 seconds +6: Time to load utils op: 0.211622953414917 seconds +6: Time to load utils op: 0.21169328689575195 seconds +6: Time to load utils op: 0.21169137954711914 seconds +6: Time to load utils op: 0.21172165870666504 seconds +6: Time to load utils op: 0.21172428131103516 seconds +6: Time to load utils op: 0.2117016315460205 secondsTime to load utils op: 0.21170759201049805 seconds +6: +7: Time to load utils op: 0.21185541152954102 seconds +7: Time to load utils op: 0.2118692398071289 seconds +7: Time to load utils op: 0.21187186241149902 seconds +7: Time to load utils op: 0.21190619468688965 seconds +7: Time to load utils op: 0.211883544921875 seconds +7: Time to load utils op: 0.21191978454589844 seconds +7: Time to load utils op: 0.21190357208251953 secondsTime to load utils op: 0.2119274139404297 seconds +7: +3: Time to load utils op: 0.0009200572967529297 seconds +3: Time to load utils op: 0.001119375228881836 seconds +3: Time to load utils op: 0.001177072525024414 secondsTime to load utils op: 0.0010991096496582031 seconds +3: +3: Time to load utils op: 0.0011637210845947266 seconds +3: Time to load utils op: 0.0011320114135742188 seconds +3: Time to load utils op: 0.0011563301086425781 seconds +3: Time to load utils op: 0.0011849403381347656 seconds +5: Time to load utils op: 0.0008139610290527344 seconds +1: Time to load utils op: 0.0007674694061279297 seconds +5: Time to load utils op: 0.0010001659393310547 seconds +5: Time to load utils op: 0.0009553432464599609 seconds +1: Time to load utils op: 0.0009655952453613281 secondsTime to load utils op: 0.000919342041015625 seconds +1: +5: Time to load utils op: 0.0011279582977294922 seconds +5: Time to load utils op: 0.0011339187622070312 seconds +5: Time to load utils op: 0.0011663436889648438 seconds +1: Time to load utils op: 0.001085519790649414 seconds +5: Time to load utils op: 0.0011322498321533203 seconds +5: Time to load utils op: 0.001210927963256836 seconds +1: Time to load utils op: 0.0012373924255371094 seconds +1: Time to load utils op: 0.0012881755828857422 seconds +1: Time to load utils op: 0.0012395381927490234 seconds +1: Time to load utils op: 0.0013434886932373047 seconds +4: Time to load utils op: 0.0007832050323486328 seconds +4: Time to load utils op: 0.000934600830078125 seconds +4: Time to load utils op: 0.0010237693786621094 seconds +4: Time to load utils op: 0.001056671142578125 seconds +4: Time to load utils op: 0.0010373592376708984 seconds +4: Time to load utils op: 0.001027822494506836 seconds +4: Time to load utils op: 0.0010330677032470703 seconds +4: Time to load utils op: 0.0010864734649658203 seconds +6: Time to load utils op: 0.0009377002716064453 seconds +6: Time to load utils op: 0.0008192062377929688 seconds +7: Time to load utils op: 0.0008296966552734375 secondsTime to load utils op: 0.0008623600006103516 seconds +7: +6: Time to load utils op: 0.0011758804321289062 seconds +6: Time to load utils op: 0.0011730194091796875 seconds +6: Time to load utils op: 0.0011768341064453125 seconds +7: Time to load utils op: 0.0010192394256591797 secondsTime to load utils op: 0.0011379718780517578 seconds +7: +6: Time to load utils op: 0.0011742115020751953 secondsTime to load utils op: 0.0011806488037109375 seconds +6: +6: Time to load utils op: 0.0011796951293945312 seconds +7: Time to load utils op: 0.0011775493621826172 seconds +7: Time to load utils op: 0.0011272430419921875 secondsTime to load utils op: 0.0010640621185302734 seconds +7: +7: Time to load utils op: 0.0011799335479736328 seconds +2: Time to load utils op: 0.0008060932159423828 seconds +2: Time to load utils op: 0.0009541511535644531 seconds +2: Time to load utils op: 0.0012352466583251953 seconds +2: Time to load utils op: 0.0013089179992675781 secondsTime to load utils op: 0.0012409687042236328 seconds +2: +2: Time to load utils op: 0.0012979507446289062 secondsTime to load utils op: 0.0012123584747314453 seconds +2: +2: Time to load utils op: 0.0013980865478515625 seconds +0: [2023-03-17 00:18:33,725] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 00:18:33,726] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:18:33,726] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:33,840] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 00:18:33,840] [INFO] [utils.py:828:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 00:18:33,841] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:33,942] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 00:18:33,943] [INFO] [utils.py:828:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 00:18:33,943] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:34,045] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 00:18:34,045] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,045] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:34,147] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 00:18:34,147] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,147] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:34,251] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 00:18:34,251] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,252] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:34,353] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 00:18:34,353] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,353] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.55 GB, percent = 6.3% +0: [2023-03-17 00:18:34,462] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 00:18:34,462] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,462] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.56 GB, percent = 6.3% +0: [2023-03-17 00:18:34,562] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 00:18:34,562] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:18:34,563] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.56 GB, percent = 6.3% +0: [2023-03-17 00:18:34,563] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 00:18:34,563] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 00:18:34,563] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 00:18:34,563] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 00:18:34,563] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 00:18:34,563] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 00:18:34,564] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 00:18:34,565] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 00:18:34,565] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004239082336425781 seconds +0: [2023-03-17 00:18:34,566] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 6.96 +0: [2023-03-17 00:18:34,619] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=52024320 (52.024M) TOTAL_PARAMS=52024320 (52.024M) UNIQUE_PARAMS=52024320 (52.024M) +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_44m32b100m +0: will not load any checkpoints and will start from random +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,625] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:18:34,626] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m32b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: estimated model parameters: 0.05202432 +0: estimated model parameters without embeddings: 0.025220096 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 00:18:34 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 15446035 +0: validation: 15616 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.009732 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_15446035ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_15446035ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_15446035ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.094 seconds +0: total number of samples: 15471148 +0: total number of epochs: 317 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.031378 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_15616ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_15616ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_15616ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.063 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 00:18:48 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 16567.94 | train/valid/test-data-iterators-setup: 12815.30 +0: [000-000] 0.0520B / 0.0252B +0: [before the start of training step] datetime: 2023-03-17 00:18:48 +0: [2023-03-17 00:18:48,741] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-17 00:18:48,742] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-17 00:18:48,742] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-17 00:18:48,742] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-17 00:18:48,742] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 1988.51220703125 | max allocated: 4435.271484375 | reserved: 5476.0 | max reserved: 5476.0 +7: iteration 10/ 60336 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.16 | learning rate: 3.315E-06 | global batch size: 256 | lm loss: 1.092363E+01 | grad norm: 6.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 221.430 | TFLOPs: 3.47 | +7: iteration 20/ 60336 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.17 | learning rate: 6.630E-06 | global batch size: 256 | lm loss: 1.058038E+01 | grad norm: 3.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.409 | TFLOPs: 24.20 | +7: iteration 30/ 60336 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.18 | learning rate: 9.944E-06 | global batch size: 256 | lm loss: 1.019652E+01 | grad norm: 2.010 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.294 | TFLOPs: 22.27 | +7: iteration 40/ 60336 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.18 | learning rate: 1.326E-05 | global batch size: 256 | lm loss: 9.990317E+00 | grad norm: 1.849 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.550 | TFLOPs: 22.84 | +7: iteration 50/ 60336 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.17 | learning rate: 1.657E-05 | global batch size: 256 | lm loss: 9.831102E+00 | grad norm: 1.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.042 | TFLOPs: 23.34 | +7: iteration 60/ 60336 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-05 | global batch size: 256 | lm loss: 9.655125E+00 | grad norm: 1.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.398 | TFLOPs: 21.27 | +7: iteration 70/ 60336 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.17 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 9.465331E+00 | grad norm: 1.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1502.428 | TFLOPs: 23.56 | +7: iteration 80/ 60336 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.17 | learning rate: 2.652E-05 | global batch size: 256 | lm loss: 9.255769E+00 | grad norm: 1.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.902 | TFLOPs: 22.99 | +7: iteration 90/ 60336 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.17 | learning rate: 2.983E-05 | global batch size: 256 | lm loss: 9.050249E+00 | grad norm: 1.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.256 | TFLOPs: 23.01 | +7: iteration 100/ 60336 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.17 | learning rate: 3.315E-05 | global batch size: 256 | lm loss: 8.844396E+00 | grad norm: 1.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.471 | TFLOPs: 24.28 | +7: iteration 110/ 60336 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.16 | learning rate: 3.646E-05 | global batch size: 256 | lm loss: 8.641323E+00 | grad norm: 1.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.016 | TFLOPs: 24.76 | +7: iteration 120/ 60336 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.17 | learning rate: 3.978E-05 | global batch size: 256 | lm loss: 8.431324E+00 | grad norm: 1.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.588 | TFLOPs: 23.38 | +7: iteration 130/ 60336 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.17 | learning rate: 4.309E-05 | global batch size: 256 | lm loss: 8.218380E+00 | grad norm: 1.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.691 | TFLOPs: 23.66 | +7: iteration 140/ 60336 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.18 | learning rate: 4.641E-05 | global batch size: 256 | lm loss: 8.020294E+00 | grad norm: 1.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1430.758 | TFLOPs: 22.44 | +7: iteration 150/ 60336 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.17 | learning rate: 4.972E-05 | global batch size: 256 | lm loss: 7.818810E+00 | grad norm: 1.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.050 | TFLOPs: 23.76 | +7: iteration 160/ 60336 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.19 | learning rate: 5.304E-05 | global batch size: 256 | lm loss: 7.654518E+00 | grad norm: 1.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.465 | TFLOPs: 21.59 | +7: iteration 170/ 60336 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.17 | learning rate: 5.635E-05 | global batch size: 256 | lm loss: 7.487547E+00 | grad norm: 0.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.777 | TFLOPs: 23.47 | +7: iteration 180/ 60336 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.17 | learning rate: 5.967E-05 | global batch size: 256 | lm loss: 7.359009E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.500 | TFLOPs: 23.22 | +7: iteration 190/ 60336 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.18 | learning rate: 6.298E-05 | global batch size: 256 | lm loss: 7.231019E+00 | grad norm: 0.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.648 | TFLOPs: 21.87 | +7: iteration 200/ 60336 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.16 | learning rate: 6.630E-05 | global batch size: 256 | lm loss: 7.145274E+00 | grad norm: 0.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.166 | TFLOPs: 25.00 | +7: iteration 210/ 60336 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.17 | learning rate: 6.961E-05 | global batch size: 256 | lm loss: 7.058424E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1463.733 | TFLOPs: 22.95 | +7: iteration 220/ 60336 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.17 | learning rate: 7.293E-05 | global batch size: 256 | lm loss: 6.990150E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1499.278 | TFLOPs: 23.51 | +7: iteration 230/ 60336 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.18 | learning rate: 7.624E-05 | global batch size: 256 | lm loss: 6.919986E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.755 | TFLOPs: 22.77 | +7: iteration 240/ 60336 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.16 | learning rate: 7.955E-05 | global batch size: 256 | lm loss: 6.861127E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.045 | TFLOPs: 24.58 | +7: iteration 250/ 60336 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.18 | learning rate: 8.287E-05 | global batch size: 256 | lm loss: 6.797762E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.885 | TFLOPs: 22.64 | +7: iteration 260/ 60336 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.17 | learning rate: 8.618E-05 | global batch size: 256 | lm loss: 6.754904E+00 | grad norm: 0.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.831 | TFLOPs: 22.99 | +7: iteration 270/ 60336 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.18 | learning rate: 8.950E-05 | global batch size: 256 | lm loss: 6.697756E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1444.114 | TFLOPs: 22.65 | +7: iteration 280/ 60336 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.19 | learning rate: 9.281E-05 | global batch size: 256 | lm loss: 6.654470E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1358.968 | TFLOPs: 21.31 | +7: iteration 290/ 60336 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.18 | learning rate: 9.613E-05 | global batch size: 256 | lm loss: 6.638582E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1439.682 | TFLOPs: 22.58 | +7: iteration 300/ 60336 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.18 | learning rate: 9.944E-05 | global batch size: 256 | lm loss: 6.586420E+00 | grad norm: 0.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.456 | TFLOPs: 22.51 | +7: iteration 310/ 60336 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.18 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 6.555633E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.539 | TFLOPs: 22.83 | +7: iteration 320/ 60336 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.17 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 6.535782E+00 | grad norm: 0.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.739 | TFLOPs: 23.35 | +7: iteration 330/ 60336 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.16 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 6.508652E+00 | grad norm: 0.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.942 | TFLOPs: 24.51 | +7: iteration 340/ 60336 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.18 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 6.468296E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1462.439 | TFLOPs: 22.93 | +7: iteration 350/ 60336 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.18 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 6.438604E+00 | grad norm: 1.003 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1424.325 | TFLOPs: 22.34 | +7: iteration 360/ 60336 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.17 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 6.410486E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.484 | TFLOPs: 24.11 | +7: iteration 370/ 60336 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.17 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 6.391332E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.839 | TFLOPs: 23.49 | +7: iteration 380/ 60336 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.18 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 6.383109E+00 | grad norm: 0.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.561 | TFLOPs: 22.53 | +7: iteration 390/ 60336 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.17 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 6.354917E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.193 | TFLOPs: 23.79 | +7: iteration 400/ 60336 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.17 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 6.318168E+00 | grad norm: 0.888 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.691 | TFLOPs: 23.80 | +7: iteration 410/ 60336 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.17 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 6.307621E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.912 | TFLOPs: 23.46 | +7: iteration 420/ 60336 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.17 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 6.283325E+00 | grad norm: 0.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.103 | TFLOPs: 23.12 | +7: iteration 430/ 60336 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.18 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 6.280037E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.527 | TFLOPs: 22.61 | +7: iteration 440/ 60336 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.17 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 6.260084E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.776 | TFLOPs: 23.58 | +7: iteration 450/ 60336 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.17 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 6.223037E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.885 | TFLOPs: 23.88 | +7: iteration 460/ 60336 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.17 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 6.215856E+00 | grad norm: 0.966 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.269 | TFLOPs: 23.70 | +7: iteration 470/ 60336 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.17 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 6.200412E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.597 | TFLOPs: 23.78 | +7: iteration 480/ 60336 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.17 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 6.175638E+00 | grad norm: 0.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.585 | TFLOPs: 23.30 | +7: iteration 490/ 60336 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.18 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 6.159822E+00 | grad norm: 0.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.729 | TFLOPs: 22.77 | +7: iteration 500/ 60336 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.17 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 6.144305E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.957 | TFLOPs: 23.59 | +7: iteration 510/ 60336 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.17 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 6.107702E+00 | grad norm: 0.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.219 | TFLOPs: 23.86 | +7: iteration 520/ 60336 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.17 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 6.104984E+00 | grad norm: 1.018 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.274 | TFLOPs: 24.06 | +7: iteration 530/ 60336 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.17 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 6.077240E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1481.102 | TFLOPs: 23.23 | +7: iteration 540/ 60336 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.17 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 6.060674E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.701 | TFLOPs: 23.36 | +7: iteration 550/ 60336 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.17 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 6.024584E+00 | grad norm: 1.057 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1471.221 | TFLOPs: 23.07 | +7: iteration 560/ 60336 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.16 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 6.030315E+00 | grad norm: 0.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.335 | TFLOPs: 25.25 | +7: iteration 570/ 60336 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.17 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 5.984346E+00 | grad norm: 0.798 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.438 | TFLOPs: 23.59 | +7: iteration 580/ 60336 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.17 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 5.969047E+00 | grad norm: 0.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.641 | TFLOPs: 23.13 | +7: iteration 590/ 60336 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.16 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 5.940759E+00 | grad norm: 0.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.105 | TFLOPs: 24.37 | +7: iteration 600/ 60336 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.17 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 5.939095E+00 | grad norm: 0.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.389 | TFLOPs: 23.14 | +7: iteration 610/ 60336 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.902383E+00 | grad norm: 1.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.230 | TFLOPs: 22.46 | +7: iteration 620/ 60336 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.908618E+00 | grad norm: 0.844 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.536 | TFLOPs: 23.81 | +7: iteration 630/ 60336 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.884499E+00 | grad norm: 0.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.368 | TFLOPs: 24.31 | +7: iteration 640/ 60336 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.857058E+00 | grad norm: 0.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.078 | TFLOPs: 24.47 | +7: iteration 650/ 60336 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.837798E+00 | grad norm: 0.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.204 | TFLOPs: 23.48 | +7: iteration 660/ 60336 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.822650E+00 | grad norm: 0.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.863 | TFLOPs: 23.21 | +7: iteration 670/ 60336 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.803890E+00 | grad norm: 0.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.569 | TFLOPs: 22.64 | +7: iteration 680/ 60336 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.781663E+00 | grad norm: 0.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.821 | TFLOPs: 23.65 | +7: iteration 690/ 60336 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.773384E+00 | grad norm: 0.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.831 | TFLOPs: 26.09 | +7: iteration 700/ 60336 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.748941E+00 | grad norm: 1.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.921 | TFLOPs: 23.85 | +7: iteration 710/ 60336 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.742622E+00 | grad norm: 0.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.534 | TFLOPs: 23.63 | +7: iteration 720/ 60336 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.728243E+00 | grad norm: 0.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.690 | TFLOPs: 24.52 | +7: iteration 730/ 60336 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.698138E+00 | grad norm: 0.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.928 | TFLOPs: 24.71 | +7: iteration 740/ 60336 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.690353E+00 | grad norm: 0.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.025 | TFLOPs: 25.05 | +7: iteration 750/ 60336 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.670726E+00 | grad norm: 0.782 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.853 | TFLOPs: 23.68 | +7: iteration 760/ 60336 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.662368E+00 | grad norm: 1.041 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.266 | TFLOPs: 24.72 | +7: iteration 770/ 60336 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.639648E+00 | grad norm: 0.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.278 | TFLOPs: 24.81 | +7: iteration 780/ 60336 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.636644E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.668 | TFLOPs: 24.54 | +7: iteration 790/ 60336 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.609310E+00 | grad norm: 0.986 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.723 | TFLOPs: 24.82 | +7: iteration 800/ 60336 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.602655E+00 | grad norm: 1.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.122 | TFLOPs: 25.20 | +7: iteration 810/ 60336 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.590311E+00 | grad norm: 0.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.544 | TFLOPs: 23.99 | +7: iteration 820/ 60336 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.583551E+00 | grad norm: 1.076 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.570 | TFLOPs: 22.72 | +7: iteration 830/ 60336 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.546741E+00 | grad norm: 0.885 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.460 | TFLOPs: 24.90 | +7: iteration 840/ 60336 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.534141E+00 | grad norm: 0.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.662 | TFLOPs: 25.15 | +7: iteration 850/ 60336 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.522705E+00 | grad norm: 0.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.045 | TFLOPs: 25.06 | +7: iteration 860/ 60336 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.511002E+00 | grad norm: 1.034 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.660 | TFLOPs: 25.12 | +7: iteration 870/ 60336 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.495018E+00 | grad norm: 0.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.251 | TFLOPs: 22.84 | +7: iteration 880/ 60336 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.486448E+00 | grad norm: 0.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.224 | TFLOPs: 24.94 | +7: iteration 890/ 60336 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.476634E+00 | grad norm: 0.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.600 | TFLOPs: 24.02 | +7: iteration 900/ 60336 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.449088E+00 | grad norm: 1.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.302 | TFLOPs: 25.07 | +7: iteration 910/ 60336 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.440461E+00 | grad norm: 0.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.600 | TFLOPs: 25.96 | +7: iteration 920/ 60336 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.427492E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.741 | TFLOPs: 24.81 | +7: iteration 930/ 60336 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.413548E+00 | grad norm: 0.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.178 | TFLOPs: 24.84 | +7: iteration 940/ 60336 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.410460E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.343 | TFLOPs: 25.14 | +7: iteration 950/ 60336 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.390516E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1502.004 | TFLOPs: 23.56 | +7: iteration 960/ 60336 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.368149E+00 | grad norm: 1.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.387 | TFLOPs: 25.10 | +7: iteration 970/ 60336 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.362458E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.611 | TFLOPs: 24.35 | +7: iteration 980/ 60336 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.348759E+00 | grad norm: 1.009 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.191 | TFLOPs: 24.47 | +7: iteration 990/ 60336 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.345673E+00 | grad norm: 1.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.360 | TFLOPs: 24.78 | +7: iteration 1000/ 60336 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.340693E+00 | grad norm: 0.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.056 | TFLOPs: 25.44 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 5.282221E+00 | lm loss PPL: 1.968066E+02 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_44m32b100m +0: [2023-03-17 00:21:47,839] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-17 00:21:48,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:21:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:21:48,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:21:48,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:21:48,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:21:48,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:21:48,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:21:48,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:21:48,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:21:48,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:21:48,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:21:48,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:21:48,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:21:48,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:21:48,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:21:48,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:21:48,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:21:48,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:21:48,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:21:48,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:21:48,229] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-17 00:21:48,229] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:21:48,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:21:48,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:48,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:21:48,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:48,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:21:48,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 473.64 +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:21:48,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:48,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: iteration 1010/ 60336 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.320494E+00 | grad norm: 0.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1225.456 | TFLOPs: 19.22 | +7: iteration 1020/ 60336 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.315746E+00 | grad norm: 0.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.572 | TFLOPs: 24.40 | +7: iteration 1030/ 60336 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.294713E+00 | grad norm: 0.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.141 | TFLOPs: 23.65 | +7: iteration 1040/ 60336 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.300864E+00 | grad norm: 1.085 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.576 | TFLOPs: 23.99 | +7: iteration 1050/ 60336 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.282514E+00 | grad norm: 0.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.022 | TFLOPs: 25.16 | +7: iteration 1060/ 60336 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.260968E+00 | grad norm: 0.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.131 | TFLOPs: 24.39 | +7: iteration 1070/ 60336 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.262869E+00 | grad norm: 0.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.451 | TFLOPs: 24.79 | +7: iteration 1080/ 60336 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.242194E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.510 | TFLOPs: 24.57 | +7: iteration 1090/ 60336 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.231900E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.612 | TFLOPs: 25.10 | +7: iteration 1100/ 60336 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.235229E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.224 | TFLOPs: 25.55 | +7: iteration 1110/ 60336 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.224784E+00 | grad norm: 0.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.516 | TFLOPs: 25.13 | +7: iteration 1120/ 60336 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.222213E+00 | grad norm: 0.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.097 | TFLOPs: 24.37 | +7: iteration 1130/ 60336 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.201607E+00 | grad norm: 0.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.482 | TFLOPs: 25.43 | +7: iteration 1140/ 60336 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.203745E+00 | grad norm: 0.984 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.906 | TFLOPs: 24.38 | +7: iteration 1150/ 60336 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.188167E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.732 | TFLOPs: 25.50 | +7: iteration 1160/ 60336 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.176035E+00 | grad norm: 0.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.302 | TFLOPs: 24.49 | +7: iteration 1170/ 60336 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.157790E+00 | grad norm: 0.629 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.817 | TFLOPs: 25.45 | +7: iteration 1180/ 60336 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.150709E+00 | grad norm: 0.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.028 | TFLOPs: 24.95 | +7: iteration 1190/ 60336 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.149960E+00 | grad norm: 0.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.271 | TFLOPs: 24.86 | +7: iteration 1200/ 60336 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.146875E+00 | grad norm: 0.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.313 | TFLOPs: 25.44 | +7: iteration 1210/ 60336 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.120729E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.280 | TFLOPs: 25.41 | +7: iteration 1220/ 60336 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.125879E+00 | grad norm: 0.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.057 | TFLOPs: 24.95 | +7: iteration 1230/ 60336 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.112920E+00 | grad norm: 0.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.186 | TFLOPs: 24.44 | +7: iteration 1240/ 60336 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.103919E+00 | grad norm: 0.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.735 | TFLOPs: 25.35 | +7: iteration 1250/ 60336 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.096542E+00 | grad norm: 0.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.977 | TFLOPs: 24.87 | +7: iteration 1260/ 60336 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.089962E+00 | grad norm: 0.846 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.640 | TFLOPs: 24.26 | +7: iteration 1270/ 60336 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.086674E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.340 | TFLOPs: 24.92 | +7: iteration 1280/ 60336 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.064565E+00 | grad norm: 0.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.871 | TFLOPs: 25.58 | +7: iteration 1290/ 60336 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.067684E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.964 | TFLOPs: 23.51 | +7: iteration 1300/ 60336 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.058142E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.804 | TFLOPs: 24.37 | +7: iteration 1310/ 60336 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.042147E+00 | grad norm: 0.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.334 | TFLOPs: 24.53 | +7: iteration 1320/ 60336 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.024862E+00 | grad norm: 0.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.912 | TFLOPs: 24.98 | +7: iteration 1330/ 60336 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.030866E+00 | grad norm: 0.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.941 | TFLOPs: 24.59 | +7: iteration 1340/ 60336 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.013855E+00 | grad norm: 0.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.085 | TFLOPs: 24.86 | +7: iteration 1350/ 60336 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.008356E+00 | grad norm: 0.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.416 | TFLOPs: 24.44 | +7: iteration 1360/ 60336 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.989963E+00 | grad norm: 0.742 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.364 | TFLOPs: 23.45 | +7: iteration 1370/ 60336 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.976232E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.279 | TFLOPs: 24.70 | +7: iteration 1380/ 60336 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.982715E+00 | grad norm: 0.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.557 | TFLOPs: 24.29 | +7: iteration 1390/ 60336 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.971574E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.472 | TFLOPs: 23.33 | +7: iteration 1400/ 60336 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.961792E+00 | grad norm: 0.975 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.532 | TFLOPs: 24.61 | +7: iteration 1410/ 60336 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.956792E+00 | grad norm: 0.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.485 | TFLOPs: 24.39 | +7: iteration 1420/ 60336 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.951904E+00 | grad norm: 0.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.084 | TFLOPs: 24.53 | +7: iteration 1430/ 60336 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.948349E+00 | grad norm: 0.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.468 | TFLOPs: 24.97 | +7: iteration 1440/ 60336 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.932924E+00 | grad norm: 0.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.575 | TFLOPs: 24.85 | +7: iteration 1450/ 60336 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.927074E+00 | grad norm: 0.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.650 | TFLOPs: 24.18 | +7: iteration 1460/ 60336 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.924555E+00 | grad norm: 0.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.582 | TFLOPs: 24.41 | +7: iteration 1470/ 60336 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.912043E+00 | grad norm: 1.021 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1492.290 | TFLOPs: 23.40 | +7: iteration 1480/ 60336 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.915187E+00 | grad norm: 0.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.130 | TFLOPs: 26.35 | +7: iteration 1490/ 60336 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.897390E+00 | grad norm: 0.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.215 | TFLOPs: 24.48 | +7: iteration 1500/ 60336 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.897885E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.686 | TFLOPs: 25.60 | +7: iteration 1510/ 60336 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.884644E+00 | grad norm: 0.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.760 | TFLOPs: 25.12 | +7: iteration 1520/ 60336 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.878843E+00 | grad norm: 0.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.078 | TFLOPs: 24.40 | +7: iteration 1530/ 60336 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.863621E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.388 | TFLOPs: 25.54 | +7: iteration 1540/ 60336 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.855055E+00 | grad norm: 0.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.404 | TFLOPs: 24.77 | +7: iteration 1550/ 60336 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.846677E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.727 | TFLOPs: 25.61 | +7: iteration 1560/ 60336 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.846234E+00 | grad norm: 0.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.505 | TFLOPs: 24.43 | +7: iteration 1570/ 60336 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.846407E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.738 | TFLOPs: 25.09 | +7: iteration 1580/ 60336 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.836094E+00 | grad norm: 0.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.472 | TFLOPs: 24.53 | +7: iteration 1590/ 60336 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.820689E+00 | grad norm: 0.832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.032 | TFLOPs: 25.25 | +7: iteration 1600/ 60336 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.807373E+00 | grad norm: 0.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.601 | TFLOPs: 25.37 | +7: iteration 1610/ 60336 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.803474E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.840 | TFLOPs: 24.43 | +7: iteration 1620/ 60336 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.816579E+00 | grad norm: 0.989 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.983 | TFLOPs: 25.55 | +7: iteration 1630/ 60336 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.803643E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.329 | TFLOPs: 25.58 | +7: iteration 1640/ 60336 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.807090E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.134 | TFLOPs: 25.64 | +7: iteration 1650/ 60336 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.789514E+00 | grad norm: 0.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.365 | TFLOPs: 24.78 | +7: iteration 1660/ 60336 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.792236E+00 | grad norm: 0.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.056 | TFLOPs: 25.20 | +7: iteration 1670/ 60336 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.779301E+00 | grad norm: 0.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.542 | TFLOPs: 23.80 | +7: iteration 1680/ 60336 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.772688E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.536 | TFLOPs: 24.46 | +7: iteration 1690/ 60336 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.772812E+00 | grad norm: 0.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.149 | TFLOPs: 24.34 | +7: iteration 1700/ 60336 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.755549E+00 | grad norm: 0.916 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.866 | TFLOPs: 24.82 | +7: iteration 1710/ 60336 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.766174E+00 | grad norm: 0.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.135 | TFLOPs: 24.84 | +7: iteration 1720/ 60336 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.756105E+00 | grad norm: 0.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.904 | TFLOPs: 24.70 | +7: iteration 1730/ 60336 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.771824E+00 | grad norm: 0.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.254 | TFLOPs: 25.22 | +7: iteration 1740/ 60336 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.739139E+00 | grad norm: 0.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.287 | TFLOPs: 25.55 | +7: iteration 1750/ 60336 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.749656E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.747 | TFLOPs: 24.29 | +7: iteration 1760/ 60336 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.741566E+00 | grad norm: 0.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.335 | TFLOPs: 24.88 | +7: iteration 1770/ 60336 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.737865E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.200 | TFLOPs: 26.13 | +7: iteration 1780/ 60336 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.731608E+00 | grad norm: 0.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.814 | TFLOPs: 25.34 | +7: iteration 1790/ 60336 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.725381E+00 | grad norm: 0.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.006 | TFLOPs: 25.36 | +7: iteration 1800/ 60336 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.724430E+00 | grad norm: 0.792 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.263 | TFLOPs: 25.47 | +7: iteration 1810/ 60336 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.718814E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.734 | TFLOPs: 25.39 | +7: iteration 1820/ 60336 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.708547E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.490 | TFLOPs: 25.59 | +7: iteration 1830/ 60336 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.695181E+00 | grad norm: 0.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.053 | TFLOPs: 24.65 | +7: iteration 1840/ 60336 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.711556E+00 | grad norm: 0.756 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.269 | TFLOPs: 24.99 | +7: iteration 1850/ 60336 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.708448E+00 | grad norm: 0.736 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.130 | TFLOPs: 24.34 | +7: iteration 1860/ 60336 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.689297E+00 | grad norm: 0.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.724 | TFLOPs: 25.68 | +7: iteration 1870/ 60336 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.697121E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.674 | TFLOPs: 24.05 | +7: iteration 1880/ 60336 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.687649E+00 | grad norm: 0.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.050 | TFLOPs: 25.64 | +7: iteration 1890/ 60336 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.692084E+00 | grad norm: 0.908 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.093 | TFLOPs: 24.92 | +7: iteration 1900/ 60336 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.694794E+00 | grad norm: 0.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.760 | TFLOPs: 23.82 | +7: iteration 1910/ 60336 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.679422E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.383 | TFLOPs: 24.89 | +7: iteration 1920/ 60336 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.667158E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.539 | TFLOPs: 24.10 | +7: iteration 1930/ 60336 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.679374E+00 | grad norm: 0.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.095 | TFLOPs: 25.34 | +7: iteration 1940/ 60336 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.672025E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.481 | TFLOPs: 25.33 | +7: iteration 1950/ 60336 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.664164E+00 | grad norm: 0.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.581 | TFLOPs: 25.59 | +7: iteration 1960/ 60336 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.670422E+00 | grad norm: 0.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.371 | TFLOPs: 25.63 | +7: iteration 1970/ 60336 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.653976E+00 | grad norm: 0.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.320 | TFLOPs: 24.50 | +7: iteration 1980/ 60336 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.657817E+00 | grad norm: 0.749 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.058 | TFLOPs: 24.86 | +7: iteration 1990/ 60336 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.645946E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.643 | TFLOPs: 23.94 | +0: [2023-03-17 00:24:29,956] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00019975730445280135, 0.00019975730445280135, 0.00019975730445280135], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 60336 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.640616E+00 | grad norm: 0.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.842 | TFLOPs: 25.40 | +0: steps: 2000 loss: 4.6566 iter time (s): 0.169 samples/sec: 1513.174 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 4.530587E+00 | lm loss PPL: 9.281304E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_44m32b100m +0: [2023-03-17 00:24:30,029] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-17 00:24:30,033] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:24:30,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:24:30,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:24:30,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:24:30,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:24:30,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:24:30,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:24:30,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:24:30,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:24:30,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:24:30,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:24:30,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:24:30,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:24:30,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:24:30,143] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:24:30,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:24:30,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:24:30,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:24:30,159] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:24:30,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:24:30,160] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-17 00:24:30,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:24:30,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:30,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:24:30,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:30,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:24:30,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:30,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:30,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:24:30,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:30,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:30,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.09 +7: iteration 2010/ 60336 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.18 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.639316E+00 | grad norm: 0.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.626 | TFLOPs: 21.76 | +7: iteration 2020/ 60336 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.638845E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.187 | TFLOPs: 24.95 | +7: iteration 2030/ 60336 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.638541E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.703 | TFLOPs: 25.21 | +7: iteration 2040/ 60336 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.628967E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.776 | TFLOPs: 25.43 | +7: iteration 2050/ 60336 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.634740E+00 | grad norm: 0.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.806 | TFLOPs: 25.28 | +7: iteration 2060/ 60336 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.643096E+00 | grad norm: 0.657 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.332 | TFLOPs: 24.92 | +7: iteration 2070/ 60336 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.612700E+00 | grad norm: 0.697 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.610 | TFLOPs: 24.74 | +7: iteration 2080/ 60336 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.620873E+00 | grad norm: 0.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.750 | TFLOPs: 24.04 | +7: iteration 2090/ 60336 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.614666E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.730 | TFLOPs: 25.76 | +7: iteration 2100/ 60336 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.623833E+00 | grad norm: 0.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.868 | TFLOPs: 25.29 | +7: iteration 2110/ 60336 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.622031E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.446 | TFLOPs: 24.58 | +7: iteration 2120/ 60336 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.619064E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.287 | TFLOPs: 25.27 | +7: iteration 2130/ 60336 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.611952E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.035 | TFLOPs: 24.23 | +7: iteration 2140/ 60336 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.604722E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.138 | TFLOPs: 24.62 | +7: iteration 2150/ 60336 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.606447E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.895 | TFLOPs: 25.50 | +7: iteration 2160/ 60336 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.593769E+00 | grad norm: 0.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.173 | TFLOPs: 24.86 | +7: iteration 2170/ 60336 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.615827E+00 | grad norm: 0.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.997 | TFLOPs: 24.92 | +7: iteration 2180/ 60336 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.611981E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.311 | TFLOPs: 25.71 | +7: iteration 2190/ 60336 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.589300E+00 | grad norm: 0.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.683 | TFLOPs: 25.79 | +7: iteration 2200/ 60336 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.584887E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.378 | TFLOPs: 25.14 | +7: iteration 2210/ 60336 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.592096E+00 | grad norm: 0.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.576 | TFLOPs: 25.40 | +7: iteration 2220/ 60336 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.581543E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.305 | TFLOPs: 23.98 | +7: iteration 2230/ 60336 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.585999E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.886 | TFLOPs: 25.59 | +7: iteration 2240/ 60336 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.571989E+00 | grad norm: 0.716 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.805 | TFLOPs: 26.00 | +7: iteration 2250/ 60336 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.585036E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.857 | TFLOPs: 25.28 | +7: iteration 2260/ 60336 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.577665E+00 | grad norm: 0.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.394 | TFLOPs: 24.16 | +7: iteration 2270/ 60336 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.573306E+00 | grad norm: 0.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.186 | TFLOPs: 24.30 | +7: iteration 2280/ 60336 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.566343E+00 | grad norm: 0.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.774 | TFLOPs: 24.27 | +7: iteration 2290/ 60336 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.559651E+00 | grad norm: 0.706 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.821 | TFLOPs: 25.09 | +7: iteration 2300/ 60336 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.569725E+00 | grad norm: 0.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.660 | TFLOPs: 25.01 | +7: iteration 2310/ 60336 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.567666E+00 | grad norm: 0.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.879 | TFLOPs: 25.31 | +7: iteration 2320/ 60336 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.563441E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.866 | TFLOPs: 24.04 | +7: iteration 2330/ 60336 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.565936E+00 | grad norm: 0.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.925 | TFLOPs: 24.64 | +7: iteration 2340/ 60336 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.552607E+00 | grad norm: 0.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.622 | TFLOPs: 25.23 | +7: iteration 2350/ 60336 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.544384E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.126 | TFLOPs: 23.98 | +7: iteration 2360/ 60336 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.555045E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.314 | TFLOPs: 26.16 | +7: iteration 2370/ 60336 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.541009E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.918 | TFLOPs: 24.29 | +7: iteration 2380/ 60336 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.537109E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.149 | TFLOPs: 24.91 | +7: iteration 2390/ 60336 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.546866E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.008 | TFLOPs: 25.48 | +7: iteration 2400/ 60336 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.533358E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.185 | TFLOPs: 25.20 | +7: iteration 2410/ 60336 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.544061E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.752 | TFLOPs: 25.07 | +7: iteration 2420/ 60336 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.541518E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.307 | TFLOPs: 24.86 | +7: iteration 2430/ 60336 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.526198E+00 | grad norm: 0.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.069 | TFLOPs: 25.52 | +7: iteration 2440/ 60336 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.541319E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.520 | TFLOPs: 25.52 | +7: iteration 2450/ 60336 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.535126E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.981 | TFLOPs: 24.10 | +7: iteration 2460/ 60336 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.537519E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.111 | TFLOPs: 24.56 | +7: iteration 2470/ 60336 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.545973E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.486 | TFLOPs: 25.26 | +7: iteration 2480/ 60336 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.528653E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.061 | TFLOPs: 25.17 | +7: iteration 2490/ 60336 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.520988E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.386 | TFLOPs: 25.33 | +7: iteration 2500/ 60336 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.525913E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.440 | TFLOPs: 25.54 | +7: iteration 2510/ 60336 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.514162E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.389 | TFLOPs: 25.69 | +7: iteration 2520/ 60336 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.525856E+00 | grad norm: 0.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.274 | TFLOPs: 25.93 | +7: iteration 2530/ 60336 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.514874E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.959 | TFLOPs: 25.55 | +7: iteration 2540/ 60336 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.511007E+00 | grad norm: 0.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.085 | TFLOPs: 25.23 | +7: iteration 2550/ 60336 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.509557E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.343 | TFLOPs: 24.82 | +7: iteration 2560/ 60336 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.508999E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.398 | TFLOPs: 25.43 | +7: iteration 2570/ 60336 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.502133E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.972 | TFLOPs: 25.94 | +7: iteration 2580/ 60336 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.517481E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.557 | TFLOPs: 25.85 | +7: iteration 2590/ 60336 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.504093E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.410 | TFLOPs: 25.85 | +7: iteration 2600/ 60336 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.503344E+00 | grad norm: 0.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.087 | TFLOPs: 25.94 | +7: iteration 2610/ 60336 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.506102E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.101 | TFLOPs: 25.81 | +7: iteration 2620/ 60336 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.511717E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.785 | TFLOPs: 24.85 | +7: iteration 2630/ 60336 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.501537E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.378 | TFLOPs: 26.18 | +7: iteration 2640/ 60336 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.491343E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.832 | TFLOPs: 25.98 | +7: iteration 2650/ 60336 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.497439E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.986 | TFLOPs: 24.90 | +7: iteration 2660/ 60336 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.494139E+00 | grad norm: 0.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.013 | TFLOPs: 24.81 | +7: iteration 2670/ 60336 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.480377E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.765 | TFLOPs: 26.08 | +7: iteration 2680/ 60336 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.489215E+00 | grad norm: 0.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.482 | TFLOPs: 26.01 | +7: iteration 2690/ 60336 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.494846E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.847 | TFLOPs: 25.89 | +7: iteration 2700/ 60336 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.488646E+00 | grad norm: 0.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.911 | TFLOPs: 24.59 | +7: iteration 2710/ 60336 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.481715E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.094 | TFLOPs: 25.67 | +7: iteration 2720/ 60336 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.480698E+00 | grad norm: 0.634 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.022 | TFLOPs: 25.99 | +7: iteration 2730/ 60336 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.470555E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.480 | TFLOPs: 24.93 | +7: iteration 2740/ 60336 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.480658E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.881 | TFLOPs: 25.11 | +7: iteration 2750/ 60336 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.485537E+00 | grad norm: 0.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.045 | TFLOPs: 25.69 | +7: iteration 2760/ 60336 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.471854E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.174 | TFLOPs: 25.96 | +7: iteration 2770/ 60336 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.475193E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.886 | TFLOPs: 24.70 | +7: iteration 2780/ 60336 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.479463E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.225 | TFLOPs: 25.00 | +7: iteration 2790/ 60336 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.473000E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.597 | TFLOPs: 25.79 | +7: iteration 2800/ 60336 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.467466E+00 | grad norm: 0.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.820 | TFLOPs: 24.95 | +7: iteration 2810/ 60336 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.469394E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.411 | TFLOPs: 24.93 | +7: iteration 2820/ 60336 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.453559E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.299 | TFLOPs: 25.22 | +7: iteration 2830/ 60336 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.462434E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.820 | TFLOPs: 25.42 | +7: iteration 2840/ 60336 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.455969E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.020 | TFLOPs: 25.28 | +7: iteration 2850/ 60336 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.444592E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.326 | TFLOPs: 25.46 | +7: iteration 2860/ 60336 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.461924E+00 | grad norm: 0.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.143 | TFLOPs: 25.83 | +7: iteration 2870/ 60336 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.17 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.456712E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.993 | TFLOPs: 24.12 | +7: iteration 2880/ 60336 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.458183E+00 | grad norm: 0.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.588 | TFLOPs: 25.34 | +7: iteration 2890/ 60336 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.447278E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.771 | TFLOPs: 26.05 | +7: iteration 2900/ 60336 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.449445E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.971 | TFLOPs: 25.37 | +7: iteration 2910/ 60336 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.459654E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.329 | TFLOPs: 26.09 | +7: iteration 2920/ 60336 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.451857E+00 | grad norm: 0.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.100 | TFLOPs: 24.51 | +7: iteration 2930/ 60336 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.445520E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.527 | TFLOPs: 25.48 | +7: iteration 2940/ 60336 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.439311E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.383 | TFLOPs: 25.60 | +7: iteration 2950/ 60336 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.449551E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.729 | TFLOPs: 25.54 | +7: iteration 2960/ 60336 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.448954E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.362 | TFLOPs: 25.58 | +7: iteration 2970/ 60336 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.453371E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.433 | TFLOPs: 25.51 | +7: iteration 2980/ 60336 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.437085E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.385 | TFLOPs: 25.74 | +7: iteration 2990/ 60336 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.445346E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.981 | TFLOPs: 24.79 | +7: iteration 3000/ 60336 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.434949E+00 | grad norm: 0.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.640 | TFLOPs: 25.73 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 4.343617E+00 | lm loss PPL: 7.698553E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_44m32b100m +0: [2023-03-17 00:27:09,443] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-17 00:27:09,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:27:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:27:09,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:27:09,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:27:09,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:27:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:27:09,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:27:09,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:27:09,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:27:09,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:27:09,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:27:09,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:27:09,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:27:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:27:09,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:27:09,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:27:09,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:27:09,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:27:09,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:27:09,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:27:09,622] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-17 00:27:09,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:27:09,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:09,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:09,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:09,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:27:09,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:09,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:09,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:09,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:27:09,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:09,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:09,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 226.62 +7: iteration 3010/ 60336 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.19 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.435661E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1380.898 | TFLOPs: 21.66 | +7: iteration 3020/ 60336 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.430899E+00 | grad norm: 0.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.953 | TFLOPs: 25.84 | +7: iteration 3030/ 60336 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.427718E+00 | grad norm: 0.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.962 | TFLOPs: 25.09 | +7: iteration 3040/ 60336 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.430980E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.880 | TFLOPs: 25.47 | +7: iteration 3050/ 60336 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.425904E+00 | grad norm: 0.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.212 | TFLOPs: 24.47 | +7: iteration 3060/ 60336 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.429056E+00 | grad norm: 0.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.528 | TFLOPs: 25.88 | +7: iteration 3070/ 60336 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.414587E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.882 | TFLOPs: 25.59 | +7: iteration 3080/ 60336 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.419125E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.125 | TFLOPs: 25.82 | +7: iteration 3090/ 60336 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.416897E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.755 | TFLOPs: 25.75 | +7: iteration 3100/ 60336 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.394973E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.211 | TFLOPs: 24.36 | +7: iteration 3110/ 60336 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.425511E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.375 | TFLOPs: 25.47 | +7: iteration 3120/ 60336 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.424724E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.571 | TFLOPs: 26.18 | +7: iteration 3130/ 60336 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.430213E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.322 | TFLOPs: 24.97 | +7: iteration 3140/ 60336 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.17 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.415503E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.750 | TFLOPs: 23.85 | +7: iteration 3150/ 60336 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.402432E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.576 | TFLOPs: 25.68 | +7: iteration 3160/ 60336 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.414014E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.235 | TFLOPs: 25.77 | +7: iteration 3170/ 60336 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.419204E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.959 | TFLOPs: 25.70 | +7: iteration 3180/ 60336 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.414429E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.392 | TFLOPs: 26.07 | +7: iteration 3190/ 60336 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.410340E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.895 | TFLOPs: 25.06 | +7: iteration 3200/ 60336 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.411679E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.407 | TFLOPs: 25.66 | +7: iteration 3210/ 60336 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.399387E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.180 | TFLOPs: 25.75 | +7: iteration 3220/ 60336 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.423393E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.969 | TFLOPs: 25.03 | +7: iteration 3230/ 60336 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.405001E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.908 | TFLOPs: 26.19 | +7: iteration 3240/ 60336 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.400780E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.732 | TFLOPs: 26.17 | +7: iteration 3250/ 60336 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.405370E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.365 | TFLOPs: 25.19 | +7: iteration 3260/ 60336 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.403674E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.459 | TFLOPs: 26.17 | +7: iteration 3270/ 60336 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.399054E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.704 | TFLOPs: 26.15 | +7: iteration 3280/ 60336 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.397489E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.499 | TFLOPs: 26.17 | +7: iteration 3290/ 60336 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.417539E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.345 | TFLOPs: 26.16 | +7: iteration 3300/ 60336 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.385429E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.398 | TFLOPs: 24.60 | +7: iteration 3310/ 60336 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.399465E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.898 | TFLOPs: 26.20 | +7: iteration 3320/ 60336 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.386414E+00 | grad norm: 0.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.655 | TFLOPs: 26.15 | +7: iteration 3330/ 60336 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.383803E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.074 | TFLOPs: 26.13 | +7: iteration 3340/ 60336 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.402326E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.887 | TFLOPs: 25.59 | +7: iteration 3350/ 60336 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.391623E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.293 | TFLOPs: 26.10 | +7: iteration 3360/ 60336 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.401550E+00 | grad norm: 0.640 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.065 | TFLOPs: 26.18 | +7: iteration 3370/ 60336 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.380375E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.309 | TFLOPs: 25.32 | +7: iteration 3380/ 60336 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.387540E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.031 | TFLOPs: 25.75 | +7: iteration 3390/ 60336 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.385179E+00 | grad norm: 0.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.546 | TFLOPs: 25.90 | +7: iteration 3400/ 60336 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.390817E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.913 | TFLOPs: 25.55 | +7: iteration 3410/ 60336 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.391730E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.532 | TFLOPs: 25.74 | +7: iteration 3420/ 60336 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.385989E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.802 | TFLOPs: 25.87 | +7: iteration 3430/ 60336 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.396422E+00 | grad norm: 0.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.084 | TFLOPs: 26.14 | +7: iteration 3440/ 60336 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.375439E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.758 | TFLOPs: 26.12 | +7: iteration 3450/ 60336 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.391341E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.386 | TFLOPs: 26.01 | +7: iteration 3460/ 60336 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.392249E+00 | grad norm: 0.736 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.905 | TFLOPs: 25.23 | +7: iteration 3470/ 60336 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.384647E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.944 | TFLOPs: 26.00 | +7: iteration 3480/ 60336 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.379423E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.652 | TFLOPs: 25.71 | +7: iteration 3490/ 60336 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.382659E+00 | grad norm: 0.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.438 | TFLOPs: 25.79 | +7: iteration 3500/ 60336 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.381738E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.396 | TFLOPs: 26.07 | +7: iteration 3510/ 60336 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.379922E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.802 | TFLOPs: 25.61 | +7: iteration 3520/ 60336 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.374626E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.667 | TFLOPs: 26.09 | +7: iteration 3530/ 60336 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.364676E+00 | grad norm: 0.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.636 | TFLOPs: 25.35 | +7: iteration 3540/ 60336 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.372668E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.930 | TFLOPs: 26.09 | +7: iteration 3550/ 60336 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.376987E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.845 | TFLOPs: 26.11 | +7: iteration 3560/ 60336 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.377152E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.171 | TFLOPs: 25.72 | +7: iteration 3570/ 60336 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.374492E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.653 | TFLOPs: 25.96 | +7: iteration 3580/ 60336 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.368450E+00 | grad norm: 0.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.384 | TFLOPs: 26.10 | +7: iteration 3590/ 60336 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.380565E+00 | grad norm: 0.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.838 | TFLOPs: 25.61 | +7: iteration 3600/ 60336 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.373305E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.989 | TFLOPs: 26.06 | +7: iteration 3610/ 60336 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.361205E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.146 | TFLOPs: 25.08 | +7: iteration 3620/ 60336 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.369770E+00 | grad norm: 0.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.028 | TFLOPs: 25.58 | +7: iteration 3630/ 60336 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.356181E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.211 | TFLOPs: 25.55 | +7: iteration 3640/ 60336 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.372529E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.169 | TFLOPs: 26.00 | +7: iteration 3650/ 60336 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.359076E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.319 | TFLOPs: 25.99 | +7: iteration 3660/ 60336 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.366529E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.733 | TFLOPs: 26.03 | +7: iteration 3670/ 60336 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.355308E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.415 | TFLOPs: 26.06 | +7: iteration 3680/ 60336 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.347705E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.228 | TFLOPs: 25.69 | +7: iteration 3690/ 60336 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.347778E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.346 | TFLOPs: 26.09 | +7: iteration 3700/ 60336 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.348518E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.481 | TFLOPs: 25.70 | +7: iteration 3710/ 60336 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.356656E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.447 | TFLOPs: 25.63 | +7: iteration 3720/ 60336 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.344817E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.287 | TFLOPs: 26.04 | +7: iteration 3730/ 60336 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.344565E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.420 | TFLOPs: 26.09 | +7: iteration 3740/ 60336 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.338115E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.805 | TFLOPs: 25.29 | +7: iteration 3750/ 60336 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.340708E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.307 | TFLOPs: 26.04 | +7: iteration 3760/ 60336 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.346973E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.921 | TFLOPs: 26.08 | +7: iteration 3770/ 60336 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.345473E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.295 | TFLOPs: 26.07 | +7: iteration 3780/ 60336 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.338308E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.658 | TFLOPs: 26.11 | +7: iteration 3790/ 60336 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.341283E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.555 | TFLOPs: 26.14 | +7: iteration 3800/ 60336 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.342057E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.614 | TFLOPs: 26.12 | +7: iteration 3810/ 60336 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.345775E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.728 | TFLOPs: 25.62 | +7: iteration 3820/ 60336 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.334784E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.955 | TFLOPs: 25.53 | +7: iteration 3830/ 60336 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.341938E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.235 | TFLOPs: 25.08 | +7: iteration 3840/ 60336 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.328239E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.503 | TFLOPs: 26.10 | +7: iteration 3850/ 60336 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.331102E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.232 | TFLOPs: 25.83 | +7: iteration 3860/ 60336 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.327881E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.668 | TFLOPs: 25.45 | +7: iteration 3870/ 60336 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.329747E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.880 | TFLOPs: 25.34 | +7: iteration 3880/ 60336 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.336054E+00 | grad norm: 0.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.587 | TFLOPs: 25.54 | +7: iteration 3890/ 60336 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.342420E+00 | grad norm: 0.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.375 | TFLOPs: 25.35 | +7: iteration 3900/ 60336 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.329335E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.216 | TFLOPs: 25.97 | +7: iteration 3910/ 60336 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.320689E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.404 | TFLOPs: 25.69 | +7: iteration 3920/ 60336 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.314096E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.436 | TFLOPs: 26.10 | +7: iteration 3930/ 60336 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.322905E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.533 | TFLOPs: 25.68 | +7: iteration 3940/ 60336 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.322947E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.735 | TFLOPs: 26.04 | +7: iteration 3950/ 60336 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.314792E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.845 | TFLOPs: 25.61 | +7: iteration 3960/ 60336 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.304680E+00 | grad norm: 0.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.790 | TFLOPs: 26.09 | +7: iteration 3970/ 60336 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.323537E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.246 | TFLOPs: 25.50 | +7: iteration 3980/ 60336 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.323247E+00 | grad norm: 0.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.828 | TFLOPs: 26.11 | +7: iteration 3990/ 60336 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.326511E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.555 | TFLOPs: 25.78 | +0: [2023-03-17 00:29:45,755] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00019856770770599518, 0.00019856770770599518, 0.00019856770770599518], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 60336 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.328479E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.341 | TFLOPs: 25.10 | +0: steps: 4000 loss: 4.3412 iter time (s): 0.156 samples/sec: 1637.359 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 4.270508E+00 | lm loss PPL: 7.155800E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_44m32b100m +0: [2023-03-17 00:29:45,829] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-17 00:29:45,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:29:45,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:29:45,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:29:45,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:29:45,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:29:45,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:29:45,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:29:45,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:29:45,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:29:45,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:29:45,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:29:45,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:29:45,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:29:45,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:29:45,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:29:45,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:29:45,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:29:45,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:29:45,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:29:45,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:29:45,960] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-17 00:29:45,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:29:45,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:29:45,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:29:45,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:45,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:45,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:45,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:45,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:45,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:45,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:45,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:29:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:29:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:29:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 174.43 +7: iteration 4010/ 60336 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.18 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.317775E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1439.100 | TFLOPs: 22.57 | +7: iteration 4020/ 60336 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.324942E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.870 | TFLOPs: 26.13 | +7: iteration 4030/ 60336 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.330509E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.809 | TFLOPs: 26.08 | +7: iteration 4040/ 60336 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.315853E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.844 | TFLOPs: 25.67 | +7: iteration 4050/ 60336 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.314121E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.048 | TFLOPs: 25.81 | +7: iteration 4060/ 60336 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.315010E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.606 | TFLOPs: 25.74 | +7: iteration 4070/ 60336 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.316175E+00 | grad norm: 0.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.113 | TFLOPs: 26.16 | +7: iteration 4080/ 60336 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.299527E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.593 | TFLOPs: 24.96 | +7: iteration 4090/ 60336 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.307733E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.048 | TFLOPs: 25.72 | +7: iteration 4100/ 60336 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.306199E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.876 | TFLOPs: 26.17 | +7: iteration 4110/ 60336 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.318217E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.698 | TFLOPs: 25.57 | +7: iteration 4120/ 60336 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.305020E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.281 | TFLOPs: 25.55 | +7: iteration 4130/ 60336 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.301086E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.797 | TFLOPs: 25.70 | +7: iteration 4140/ 60336 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.304211E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.370 | TFLOPs: 26.05 | +7: iteration 4150/ 60336 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.303938E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.761 | TFLOPs: 26.19 | +7: iteration 4160/ 60336 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.297028E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.534 | TFLOPs: 25.56 | +7: iteration 4170/ 60336 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.303693E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.959 | TFLOPs: 26.11 | +7: iteration 4180/ 60336 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.292665E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.716 | TFLOPs: 25.57 | +7: iteration 4190/ 60336 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.294596E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.291 | TFLOPs: 25.74 | +7: iteration 4200/ 60336 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.292962E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.490 | TFLOPs: 26.06 | +7: iteration 4210/ 60336 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.297235E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.765 | TFLOPs: 26.09 | +7: iteration 4220/ 60336 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.289479E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.704 | TFLOPs: 25.82 | +7: iteration 4230/ 60336 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.296162E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.304 | TFLOPs: 26.19 | +7: iteration 4240/ 60336 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.270671E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.829 | TFLOPs: 26.19 | +7: iteration 4250/ 60336 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.274801E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.703 | TFLOPs: 26.11 | +7: iteration 4260/ 60336 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.289340E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.071 | TFLOPs: 25.85 | +7: iteration 4270/ 60336 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.293549E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.784 | TFLOPs: 26.11 | +7: iteration 4280/ 60336 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.284411E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.944 | TFLOPs: 26.14 | +7: iteration 4290/ 60336 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.284698E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.807 | TFLOPs: 25.50 | +7: iteration 4300/ 60336 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.286279E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.826 | TFLOPs: 26.09 | +7: iteration 4310/ 60336 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.268477E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.913 | TFLOPs: 25.51 | +7: iteration 4320/ 60336 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.299878E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.961 | TFLOPs: 25.50 | +7: iteration 4330/ 60336 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.276600E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.667 | TFLOPs: 25.56 | +7: iteration 4340/ 60336 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.269448E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.193 | TFLOPs: 26.16 | +7: iteration 4350/ 60336 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.281149E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.279 | TFLOPs: 26.12 | +7: iteration 4360/ 60336 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.293765E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.321 | TFLOPs: 26.15 | +7: iteration 4370/ 60336 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.283727E+00 | grad norm: 0.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.878 | TFLOPs: 25.78 | +7: iteration 4380/ 60336 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.279853E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.571 | TFLOPs: 25.79 | +7: iteration 4390/ 60336 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.273626E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.071 | TFLOPs: 26.11 | +7: iteration 4400/ 60336 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.274582E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.130 | TFLOPs: 25.19 | +7: iteration 4410/ 60336 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.270465E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.686 | TFLOPs: 26.18 | +7: iteration 4420/ 60336 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.281295E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.301 | TFLOPs: 26.19 | +7: iteration 4430/ 60336 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.271460E+00 | grad norm: 0.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.096 | TFLOPs: 26.19 | +7: iteration 4440/ 60336 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.282798E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.580 | TFLOPs: 26.18 | +7: iteration 4450/ 60336 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.267345E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.472 | TFLOPs: 26.20 | +7: iteration 4460/ 60336 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.269538E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.815 | TFLOPs: 26.17 | +7: iteration 4470/ 60336 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.276664E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.441 | TFLOPs: 26.06 | +7: iteration 4480/ 60336 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.265901E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.008 | TFLOPs: 25.91 | +7: iteration 4490/ 60336 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.274940E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.865 | TFLOPs: 25.72 | +7: iteration 4500/ 60336 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.261923E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.811 | TFLOPs: 25.64 | +7: iteration 4510/ 60336 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.251046E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.344 | TFLOPs: 26.10 | +7: iteration 4520/ 60336 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.258861E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.212 | TFLOPs: 25.82 | +7: iteration 4530/ 60336 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.264915E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.990 | TFLOPs: 26.10 | +7: iteration 4540/ 60336 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.265654E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.725 | TFLOPs: 26.11 | +7: iteration 4550/ 60336 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.271156E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.641 | TFLOPs: 25.40 | +7: iteration 4560/ 60336 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.266221E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.047 | TFLOPs: 25.92 | +7: iteration 4570/ 60336 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.261130E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.844 | TFLOPs: 25.69 | +7: iteration 4580/ 60336 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.272674E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.411 | TFLOPs: 25.63 | +7: iteration 4590/ 60336 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.264363E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.954 | TFLOPs: 25.69 | +7: iteration 4600/ 60336 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.257405E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.576 | TFLOPs: 26.12 | +7: iteration 4610/ 60336 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.254734E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.622 | TFLOPs: 26.09 | +7: iteration 4620/ 60336 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.243115E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.201 | TFLOPs: 26.08 | +7: iteration 4630/ 60336 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.250158E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.981 | TFLOPs: 26.08 | +7: iteration 4640/ 60336 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.247541E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.199 | TFLOPs: 25.71 | +7: iteration 4650/ 60336 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.244806E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.322 | TFLOPs: 25.57 | +7: iteration 4660/ 60336 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.249601E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.776 | TFLOPs: 25.89 | +7: iteration 4670/ 60336 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.247398E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.158 | TFLOPs: 25.89 | +7: iteration 4680/ 60336 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.253641E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.222 | TFLOPs: 25.30 | +7: iteration 4690/ 60336 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.243403E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.956 | TFLOPs: 26.06 | +7: iteration 4700/ 60336 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.240380E+00 | grad norm: 0.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.168 | TFLOPs: 25.74 | +7: iteration 4710/ 60336 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.255282E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.568 | TFLOPs: 25.15 | +7: iteration 4720/ 60336 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.248202E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.696 | TFLOPs: 26.01 | +7: iteration 4730/ 60336 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.237148E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.121 | TFLOPs: 25.56 | +7: iteration 4740/ 60336 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.245370E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.997 | TFLOPs: 26.00 | +7: iteration 4750/ 60336 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.239023E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.371 | TFLOPs: 25.46 | +7: iteration 4760/ 60336 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.245028E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.113 | TFLOPs: 26.07 | +7: iteration 4770/ 60336 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.228739E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.974 | TFLOPs: 26.08 | +7: iteration 4780/ 60336 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.247974E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.808 | TFLOPs: 26.05 | +7: iteration 4790/ 60336 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.16 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.236790E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.679 | TFLOPs: 25.49 | +7: iteration 4800/ 60336 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.234649E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.220 | TFLOPs: 26.08 | +7: iteration 4810/ 60336 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.239598E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.633 | TFLOPs: 26.06 | +7: iteration 4820/ 60336 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.248695E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.187 | TFLOPs: 26.07 | +7: iteration 4830/ 60336 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.250542E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.634 | TFLOPs: 26.07 | +7: iteration 4840/ 60336 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.229249E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.581 | TFLOPs: 26.07 | +7: iteration 4850/ 60336 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.240725E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.841 | TFLOPs: 26.06 | +7: iteration 4860/ 60336 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.232190E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.645 | TFLOPs: 26.06 | +7: iteration 4870/ 60336 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.232750E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.156 | TFLOPs: 26.07 | +7: iteration 4880/ 60336 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.216722E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.626 | TFLOPs: 26.07 | +7: iteration 4890/ 60336 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.228254E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.692 | TFLOPs: 26.06 | +7: iteration 4900/ 60336 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.234535E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.958 | TFLOPs: 26.05 | +7: iteration 4910/ 60336 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.235153E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.747 | TFLOPs: 26.06 | +7: iteration 4920/ 60336 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.236733E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.532 | TFLOPs: 25.77 | +7: iteration 4930/ 60336 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.228864E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.353 | TFLOPs: 26.04 | +7: iteration 4940/ 60336 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.220918E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.085 | TFLOPs: 26.05 | +7: iteration 4950/ 60336 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.218800E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.252 | TFLOPs: 26.05 | +7: iteration 4960/ 60336 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.224811E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.858 | TFLOPs: 26.06 | +7: iteration 4970/ 60336 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.242225E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.465 | TFLOPs: 26.07 | +7: iteration 4980/ 60336 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.232607E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.051 | TFLOPs: 26.07 | +7: iteration 4990/ 60336 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.224580E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.571 | TFLOPs: 26.07 | +7: iteration 5000/ 60336 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.244791E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.692 | TFLOPs: 26.06 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 4.188738E+00 | lm loss PPL: 6.593955E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_44m32b100m +0: [2023-03-17 00:32:21,022] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-17 00:32:21,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:32:21,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:32:21,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:32:21,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:32:21,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:32:21,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:32:21,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:32:21,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:32:21,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:32:21,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:32:21,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:32:21,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:32:21,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:32:21,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:32:21,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:32:21,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:32:21,143] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:32:21,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:32:21,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:32:21,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:32:21,153] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-17 00:32:21,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:32:21,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:32:21,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:32:21,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:32:21,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:32:21,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:32:21,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 00:32:21,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:32:21,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:32:21,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:32:21,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:32:21,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:32:21,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:32:21,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:32:21,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.08 +7: iteration 5010/ 60336 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.18 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.214131E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.300 | TFLOPs: 22.73 | +7: iteration 5020/ 60336 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.226271E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.751 | TFLOPs: 26.08 | +7: iteration 5030/ 60336 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.210706E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.484 | TFLOPs: 26.09 | +7: iteration 5040/ 60336 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.218248E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.047 | TFLOPs: 26.07 | +7: iteration 5050/ 60336 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.215768E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.012 | TFLOPs: 26.06 | +7: iteration 5060/ 60336 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.226259E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.442 | TFLOPs: 26.07 | +7: iteration 5070/ 60336 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.214640E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.180 | TFLOPs: 26.05 | +7: iteration 5080/ 60336 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.203469E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.381 | TFLOPs: 26.05 | +7: iteration 5090/ 60336 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.211040E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.737 | TFLOPs: 26.04 | +7: iteration 5100/ 60336 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.207222E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.789 | TFLOPs: 26.05 | +7: iteration 5110/ 60336 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.202853E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.685 | TFLOPs: 26.01 | +7: iteration 5120/ 60336 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.207526E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.845 | TFLOPs: 26.00 | +7: iteration 5130/ 60336 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.211185E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.288 | TFLOPs: 25.97 | +7: iteration 5140/ 60336 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.217028E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.821 | TFLOPs: 25.36 | +7: iteration 5150/ 60336 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.203939E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.045 | TFLOPs: 26.08 | +7: iteration 5160/ 60336 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.223629E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.286 | TFLOPs: 26.07 | +7: iteration 5170/ 60336 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.206675E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.431 | TFLOPs: 26.07 | +7: iteration 5180/ 60336 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.210858E+00 | grad norm: 0.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.589 | TFLOPs: 26.06 | +7: iteration 5190/ 60336 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.220412E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.834 | TFLOPs: 26.05 | +7: iteration 5200/ 60336 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.200167E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.246 | TFLOPs: 25.88 | +7: iteration 5210/ 60336 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.202187E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.054 | TFLOPs: 26.03 | +7: iteration 5220/ 60336 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.202000E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.279 | TFLOPs: 26.04 | +7: iteration 5230/ 60336 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.198693E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.327 | TFLOPs: 26.04 | +7: iteration 5240/ 60336 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.197800E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.545 | TFLOPs: 25.82 | +7: iteration 5250/ 60336 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.196000E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.354 | TFLOPs: 26.04 | +7: iteration 5260/ 60336 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.208058E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.305 | TFLOPs: 25.66 | +7: iteration 5270/ 60336 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.211058E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.576 | TFLOPs: 26.10 | +7: iteration 5280/ 60336 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.191299E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.915 | TFLOPs: 26.11 | +7: iteration 5290/ 60336 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.202174E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.056 | TFLOPs: 26.08 | +7: iteration 5300/ 60336 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.194480E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.002 | TFLOPs: 26.08 | +7: iteration 5310/ 60336 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.201130E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.960 | TFLOPs: 26.11 | +7: iteration 5320/ 60336 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.197042E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.531 | TFLOPs: 26.12 | +7: iteration 5330/ 60336 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.217883E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.357 | TFLOPs: 25.63 | +7: iteration 5340/ 60336 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.203518E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.490 | TFLOPs: 26.13 | +7: iteration 5350/ 60336 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.199512E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.371 | TFLOPs: 25.80 | +7: iteration 5360/ 60336 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.194571E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.177 | TFLOPs: 26.13 | +7: iteration 5370/ 60336 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.200645E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.872 | TFLOPs: 26.16 | +7: iteration 5380/ 60336 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.187042E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.359 | TFLOPs: 26.15 | +7: iteration 5390/ 60336 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.186546E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.870 | TFLOPs: 25.73 | +7: iteration 5400/ 60336 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.186885E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.113 | TFLOPs: 26.19 | +7: iteration 5410/ 60336 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.190147E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.463 | TFLOPs: 26.15 | +7: iteration 5420/ 60336 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.192856E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.882 | TFLOPs: 26.16 | +7: iteration 5430/ 60336 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.180273E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.455 | TFLOPs: 25.70 | +7: iteration 5440/ 60336 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.182615E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.126 | TFLOPs: 26.21 | +7: iteration 5450/ 60336 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.186820E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.950 | TFLOPs: 25.81 | +7: iteration 5460/ 60336 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.203510E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.860 | TFLOPs: 26.11 | +7: iteration 5470/ 60336 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.198820E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.559 | TFLOPs: 26.04 | +7: iteration 5480/ 60336 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.190423E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.946 | TFLOPs: 26.00 | +7: iteration 5490/ 60336 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.16 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.187444E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.880 | TFLOPs: 25.45 | +7: iteration 5500/ 60336 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.16 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.173851E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.670 | TFLOPs: 25.82 | +7: iteration 5510/ 60336 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.188687E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.064 | TFLOPs: 26.00 | +7: iteration 5520/ 60336 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.179935E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.973 | TFLOPs: 25.99 | +7: iteration 5530/ 60336 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.178034E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.177 | TFLOPs: 26.00 | +7: iteration 5540/ 60336 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.182801E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.314 | TFLOPs: 26.02 | +7: iteration 5550/ 60336 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.184305E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.843 | TFLOPs: 26.01 | +7: iteration 5560/ 60336 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.180549E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.169 | TFLOPs: 26.07 | +7: iteration 5570/ 60336 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.182412E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.996 | TFLOPs: 26.17 | +7: iteration 5580/ 60336 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.189127E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.871 | TFLOPs: 26.14 | +7: iteration 5590/ 60336 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.184363E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.507 | TFLOPs: 26.03 | +7: iteration 5600/ 60336 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.161289E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.138 | TFLOPs: 26.02 | +7: iteration 5610/ 60336 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.178273E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.323 | TFLOPs: 26.07 | +7: iteration 5620/ 60336 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.171448E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.930 | TFLOPs: 26.06 | +7: iteration 5630/ 60336 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.175423E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.899 | TFLOPs: 26.05 | +7: iteration 5640/ 60336 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.197274E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.074 | TFLOPs: 26.03 | +7: iteration 5650/ 60336 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.178859E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.775 | TFLOPs: 25.87 | +7: iteration 5660/ 60336 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.164561E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.556 | TFLOPs: 26.04 | +7: iteration 5670/ 60336 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.180787E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.341 | TFLOPs: 26.01 | +7: iteration 5680/ 60336 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.176126E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.198 | TFLOPs: 26.05 | +7: iteration 5690/ 60336 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.173549E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.436 | TFLOPs: 25.63 | +7: iteration 5700/ 60336 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.179793E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.406 | TFLOPs: 25.74 | +7: iteration 5710/ 60336 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.186217E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.221 | TFLOPs: 26.02 | +7: iteration 5720/ 60336 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.172530E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.668 | TFLOPs: 26.03 | +7: iteration 5730/ 60336 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.17 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.159805E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.466 | TFLOPs: 23.31 | +7: iteration 5740/ 60336 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.172861E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.281 | TFLOPs: 26.01 | +7: iteration 5750/ 60336 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.168770E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.054 | TFLOPs: 26.03 | +7: iteration 5760/ 60336 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.16 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.173979E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.494 | TFLOPs: 25.62 | +7: iteration 5770/ 60336 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.172121E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.238 | TFLOPs: 26.02 | +7: iteration 5780/ 60336 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.170588E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.703 | TFLOPs: 26.03 | +7: iteration 5790/ 60336 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.162706E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.312 | TFLOPs: 25.94 | +7: iteration 5800/ 60336 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.16 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.169564E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.130 | TFLOPs: 25.83 | +7: iteration 5810/ 60336 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.172756E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.640 | TFLOPs: 25.81 | +7: iteration 5820/ 60336 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.159376E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.018 | TFLOPs: 26.02 | +7: iteration 5830/ 60336 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.161296E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.219 | TFLOPs: 25.99 | +7: iteration 5840/ 60336 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.160805E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.168 | TFLOPs: 25.58 | +7: iteration 5850/ 60336 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.158575E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.018 | TFLOPs: 26.13 | +7: iteration 5860/ 60336 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.171949E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.136 | TFLOPs: 26.08 | +7: iteration 5870/ 60336 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.171282E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.665 | TFLOPs: 25.76 | +7: iteration 5880/ 60336 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.175293E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.174 | TFLOPs: 26.08 | +7: iteration 5890/ 60336 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.16 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.158264E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.134 | TFLOPs: 25.64 | +7: iteration 5900/ 60336 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.16 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.168246E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.295 | TFLOPs: 25.72 | +7: iteration 5910/ 60336 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.148756E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.674 | TFLOPs: 26.07 | +7: iteration 5920/ 60336 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.16 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.155428E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.239 | TFLOPs: 25.68 | +7: iteration 5930/ 60336 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.157527E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.825 | TFLOPs: 26.08 | +7: iteration 5940/ 60336 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.156781E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.484 | TFLOPs: 26.07 | +7: iteration 5950/ 60336 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.155953E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.346 | TFLOPs: 26.10 | +7: iteration 5960/ 60336 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.141068E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.293 | TFLOPs: 26.10 | +7: iteration 5970/ 60336 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.16 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.166414E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.891 | TFLOPs: 25.72 | +7: iteration 5980/ 60336 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.155021E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.893 | TFLOPs: 26.09 | +7: iteration 5990/ 60336 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.16 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.155037E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.723 | TFLOPs: 25.65 | +0: [2023-03-17 00:34:55,924] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.00019639904904691105, 0.00019639904904691105, 0.00019639904904691105], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 60336 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.144667E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.508 | TFLOPs: 26.09 | +0: steps: 6000 loss: 4.1128 iter time (s): 0.154 samples/sec: 1665.522 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 6000 | lm loss value: 4.161024E+00 | lm loss PPL: 6.413714E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 6000 to checkpoints_44m32b100m +0: [2023-03-17 00:34:55,996] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +0: [2023-03-17 00:34:55,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:34:56,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:34:56,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:34:56,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:34:56,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:34:56,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:34:56,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:34:56,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:34:56,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:34:56,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:34:56,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:34:56,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:34:56,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:34:56,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:34:56,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:34:56,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:34:56,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:34:56,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:34:56,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:34:56,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:34:56,128] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step6000/mp_rank_00_model_states.pt +0: [2023-03-17 00:34:56,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:34:56,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:34:56,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:34:56,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:34:56,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:34:56,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:34:56,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:34:56,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:34:56,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:34:56,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:34:56,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:34:56,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:34:56,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:34:56,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:34:56,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: successfully saved checkpoint at iteration 6000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.93 +7: iteration 6010/ 60336 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.18 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.155061E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.676 | TFLOPs: 22.50 | +7: iteration 6020/ 60336 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.157642E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.458 | TFLOPs: 26.02 | +7: iteration 6030/ 60336 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.147976E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.246 | TFLOPs: 26.07 | +7: iteration 6040/ 60336 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.161695E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.709 | TFLOPs: 25.56 | +7: iteration 6050/ 60336 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.148499E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.977 | TFLOPs: 25.75 | +7: iteration 6060/ 60336 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.157809E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.658 | TFLOPs: 26.20 | +7: iteration 6070/ 60336 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.152917E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.955 | TFLOPs: 25.45 | +7: iteration 6080/ 60336 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.149696E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.655 | TFLOPs: 26.26 | +7: iteration 6090/ 60336 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.147591E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.637 | TFLOPs: 26.25 | +7: iteration 6100/ 60336 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.143869E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.025 | TFLOPs: 26.19 | +7: iteration 6110/ 60336 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.148291E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.268 | TFLOPs: 26.19 | +7: iteration 6120/ 60336 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.163821E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.141 | TFLOPs: 26.25 | +7: iteration 6130/ 60336 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.139574E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.630 | TFLOPs: 26.25 | +7: iteration 6140/ 60336 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.155926E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.432 | TFLOPs: 26.21 | +7: iteration 6150/ 60336 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.155860E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.059 | TFLOPs: 26.07 | +7: iteration 6160/ 60336 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.151098E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.340 | TFLOPs: 26.21 | +7: iteration 6170/ 60336 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.151197E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.017 | TFLOPs: 26.22 | +7: iteration 6180/ 60336 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.152144E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.051 | TFLOPs: 26.22 | +7: iteration 6190/ 60336 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.142245E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.993 | TFLOPs: 26.22 | +7: iteration 6200/ 60336 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.128163E+00 | grad norm: 0.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.461 | TFLOPs: 26.12 | +7: iteration 6210/ 60336 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.146519E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.690 | TFLOPs: 26.14 | +7: iteration 6220/ 60336 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.156208E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.218 | TFLOPs: 26.21 | +7: iteration 6230/ 60336 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.144728E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.458 | TFLOPs: 26.23 | +7: iteration 6240/ 60336 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.131795E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.135 | TFLOPs: 26.29 | +7: iteration 6250/ 60336 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.141877E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.128 | TFLOPs: 26.27 | +7: iteration 6260/ 60336 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.140718E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.148 | TFLOPs: 26.27 | +7: iteration 6270/ 60336 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.143687E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.576 | TFLOPs: 26.28 | +7: iteration 6280/ 60336 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.144065E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.569 | TFLOPs: 26.06 | +7: iteration 6290/ 60336 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.128217E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.038 | TFLOPs: 26.25 | +7: iteration 6300/ 60336 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.158936E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.340 | TFLOPs: 26.27 | +7: iteration 6310/ 60336 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.142937E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.063 | TFLOPs: 26.24 | +7: iteration 6320/ 60336 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.143877E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.270 | TFLOPs: 26.19 | +7: iteration 6330/ 60336 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.137899E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.207 | TFLOPs: 26.19 | +7: iteration 6340/ 60336 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.160152E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.346 | TFLOPs: 26.20 | +7: iteration 6350/ 60336 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.143782E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.492 | TFLOPs: 26.20 | +7: iteration 6360/ 60336 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.128796E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.220 | TFLOPs: 26.27 | +7: iteration 6370/ 60336 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.136079E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.199 | TFLOPs: 26.26 | +7: iteration 6380/ 60336 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.135203E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.922 | TFLOPs: 26.28 | +7: iteration 6390/ 60336 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.127800E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.607 | TFLOPs: 26.26 | +7: iteration 6400/ 60336 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.127715E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.206 | TFLOPs: 26.30 | +7: iteration 6410/ 60336 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.133316E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.185 | TFLOPs: 26.29 | +7: iteration 6420/ 60336 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.127698E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.329 | TFLOPs: 26.27 | +7: iteration 6430/ 60336 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.146037E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.149 | TFLOPs: 26.29 | +7: iteration 6440/ 60336 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.132429E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.657 | TFLOPs: 26.22 | +7: iteration 6450/ 60336 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.16 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.145768E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.742 | TFLOPs: 25.65 | +7: iteration 6460/ 60336 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.138159E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.712 | TFLOPs: 26.22 | +7: iteration 6470/ 60336 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.122406E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.050 | TFLOPs: 26.22 | +7: iteration 6480/ 60336 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.120613E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.804 | TFLOPs: 26.22 | +7: iteration 6490/ 60336 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.109455E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.106 | TFLOPs: 26.22 | +7: iteration 6500/ 60336 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.137918E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.769 | TFLOPs: 26.20 | +7: iteration 6510/ 60336 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.118761E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.374 | TFLOPs: 26.20 | +7: iteration 6520/ 60336 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.127106E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.689 | TFLOPs: 26.14 | +7: iteration 6530/ 60336 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.135304E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.875 | TFLOPs: 26.13 | +7: iteration 6540/ 60336 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.129657E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.110 | TFLOPs: 26.14 | +7: iteration 6550/ 60336 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.138560E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.224 | TFLOPs: 26.13 | +7: iteration 6560/ 60336 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.124767E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.839 | TFLOPs: 26.16 | +7: iteration 6570/ 60336 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.117599E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.207 | TFLOPs: 26.15 | +7: iteration 6580/ 60336 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.132661E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.544 | TFLOPs: 26.17 | +7: iteration 6590/ 60336 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.132210E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.667 | TFLOPs: 26.15 | +7: iteration 6600/ 60336 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.16 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.131612E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.774 | TFLOPs: 25.50 | +7: iteration 6610/ 60336 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.16 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.117331E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.955 | TFLOPs: 25.80 | +7: iteration 6620/ 60336 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.112462E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.386 | TFLOPs: 26.18 | +7: iteration 6630/ 60336 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.16 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.124626E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.914 | TFLOPs: 25.73 | +7: iteration 6640/ 60336 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.16 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.130487E+00 | grad norm: 0.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.846 | TFLOPs: 25.75 | +7: iteration 6650/ 60336 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.16 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.125339E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.821 | TFLOPs: 25.73 | +7: iteration 6660/ 60336 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.122401E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.148 | TFLOPs: 26.16 | +7: iteration 6670/ 60336 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.115642E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.917 | TFLOPs: 26.16 | +7: iteration 6680/ 60336 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.123692E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.788 | TFLOPs: 26.17 | +7: iteration 6690/ 60336 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.134952E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.029 | TFLOPs: 26.19 | +7: iteration 6700/ 60336 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.105028E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.752 | TFLOPs: 26.19 | +7: iteration 6710/ 60336 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.123444E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.274 | TFLOPs: 26.18 | +7: iteration 6720/ 60336 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.108258E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.465 | TFLOPs: 26.18 | +7: iteration 6730/ 60336 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.108035E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.894 | TFLOPs: 26.17 | +7: iteration 6740/ 60336 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.121152E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.608 | TFLOPs: 26.18 | +7: iteration 6750/ 60336 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.124006E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.216 | TFLOPs: 26.18 | +7: iteration 6760/ 60336 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.119352E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.841 | TFLOPs: 26.17 | +7: iteration 6770/ 60336 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.125026E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.641 | TFLOPs: 26.17 | +7: iteration 6780/ 60336 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.120000E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.382 | TFLOPs: 26.15 | +7: iteration 6790/ 60336 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.122282E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.339 | TFLOPs: 26.18 | +7: iteration 6800/ 60336 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.100450E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.180 | TFLOPs: 26.18 | +7: iteration 6810/ 60336 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.109386E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.863 | TFLOPs: 26.19 | +7: iteration 6820/ 60336 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.114502E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.223 | TFLOPs: 26.16 | +7: iteration 6830/ 60336 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.118104E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.123 | TFLOPs: 26.16 | +7: iteration 6840/ 60336 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.116965E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.815 | TFLOPs: 26.16 | +7: iteration 6850/ 60336 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.103444E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.417 | TFLOPs: 26.15 | +7: iteration 6860/ 60336 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.099132E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.721 | TFLOPs: 26.14 | +7: iteration 6870/ 60336 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.118663E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.326 | TFLOPs: 26.15 | +7: iteration 6880/ 60336 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.100961E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.575 | TFLOPs: 26.14 | +7: iteration 6890/ 60336 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.106357E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.375 | TFLOPs: 26.16 | +7: iteration 6900/ 60336 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.102640E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.799 | TFLOPs: 26.16 | +7: iteration 6910/ 60336 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.108408E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.104 | TFLOPs: 26.16 | +7: iteration 6920/ 60336 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.117358E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.543 | TFLOPs: 26.12 | +7: iteration 6930/ 60336 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.101444E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.516 | TFLOPs: 26.12 | +7: iteration 6940/ 60336 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.104242E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.110 | TFLOPs: 26.14 | +7: iteration 6950/ 60336 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.108508E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.395 | TFLOPs: 26.15 | +7: iteration 6960/ 60336 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.093165E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.963 | TFLOPs: 26.16 | +7: iteration 6970/ 60336 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.110247E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.702 | TFLOPs: 26.15 | +7: iteration 6980/ 60336 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.104168E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.654 | TFLOPs: 26.17 | +7: iteration 6990/ 60336 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.121477E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.367 | TFLOPs: 26.16 | +7: iteration 7000/ 60336 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.095475E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.139 | TFLOPs: 26.14 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 7000 | lm loss value: 4.137661E+00 | lm loss PPL: 6.265613E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 7000 to checkpoints_44m32b100m +0: [2023-03-17 00:37:29,820] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! +0: [2023-03-17 00:37:29,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:37:29,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:37:29,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:37:29,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:37:29,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:37:29,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:37:29,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:37:29,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:37:29,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:37:29,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:37:29,917] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:37:29,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:37:29,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:37:29,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:37:29,933] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:37:29,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:37:29,941] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:37:29,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:37:29,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:37:29,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:37:29,951] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step7000/mp_rank_00_model_states.pt +0: [2023-03-17 00:37:29,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:37:29,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:37:29,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:37:29,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:37:29,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:37:29,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:37:29,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:37:29,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:37:29,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:37:29,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:37:29,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:37:29,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: successfully saved checkpoint at iteration 7000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.34 +7: iteration 7010/ 60336 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.18 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.092918E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.726 | TFLOPs: 22.85 | +7: iteration 7020/ 60336 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.106041E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.856 | TFLOPs: 26.17 | +7: iteration 7030/ 60336 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.108792E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.956 | TFLOPs: 26.16 | +7: iteration 7040/ 60336 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.094150E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.693 | TFLOPs: 26.15 | +7: iteration 7050/ 60336 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.098031E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.559 | TFLOPs: 26.15 | +7: iteration 7060/ 60336 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.120783E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.354 | TFLOPs: 26.12 | +7: iteration 7070/ 60336 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.109520E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.153 | TFLOPs: 26.13 | +7: iteration 7080/ 60336 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.100634E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.196 | TFLOPs: 26.13 | +7: iteration 7090/ 60336 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.106404E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.781 | TFLOPs: 26.12 | +7: iteration 7100/ 60336 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.091231E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.965 | TFLOPs: 26.14 | +7: iteration 7110/ 60336 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.104539E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.289 | TFLOPs: 26.13 | +7: iteration 7120/ 60336 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.086409E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.277 | TFLOPs: 26.15 | +7: iteration 7130/ 60336 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.100084E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.321 | TFLOPs: 26.13 | +7: iteration 7140/ 60336 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.090448E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.961 | TFLOPs: 26.14 | +7: iteration 7150/ 60336 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.101440E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.373 | TFLOPs: 26.15 | +7: iteration 7160/ 60336 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.088407E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.030 | TFLOPs: 26.13 | +7: iteration 7170/ 60336 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.093174E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.034 | TFLOPs: 26.13 | +7: iteration 7180/ 60336 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.100655E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.802 | TFLOPs: 26.14 | +7: iteration 7190/ 60336 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.097943E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.260 | TFLOPs: 26.13 | +7: iteration 7200/ 60336 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.106371E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.644 | TFLOPs: 26.14 | +7: iteration 7210/ 60336 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.107003E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.057 | TFLOPs: 26.10 | +7: iteration 7220/ 60336 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.088557E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.188 | TFLOPs: 26.13 | +7: iteration 7230/ 60336 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.105051E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.917 | TFLOPs: 26.09 | +7: iteration 7240/ 60336 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.084577E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.748 | TFLOPs: 26.09 | +7: iteration 7250/ 60336 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.107456E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.241 | TFLOPs: 26.08 | +7: iteration 7260/ 60336 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.093178E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.971 | TFLOPs: 26.11 | +7: iteration 7270/ 60336 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.095891E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.603 | TFLOPs: 26.12 | +7: iteration 7280/ 60336 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.088148E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.081 | TFLOPs: 26.13 | +7: iteration 7290/ 60336 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.102378E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.958 | TFLOPs: 26.13 | +7: iteration 7300/ 60336 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.086405E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.045 | TFLOPs: 26.13 | +7: iteration 7310/ 60336 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.085852E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.401 | TFLOPs: 26.13 | +7: iteration 7320/ 60336 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.076104E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.289 | TFLOPs: 26.12 | +7: iteration 7330/ 60336 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.089746E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.610 | TFLOPs: 26.15 | +7: iteration 7340/ 60336 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.093144E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.707 | TFLOPs: 26.15 | +7: iteration 7350/ 60336 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.094922E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.956 | TFLOPs: 26.13 | +7: iteration 7360/ 60336 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.077319E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.661 | TFLOPs: 26.12 | +7: iteration 7370/ 60336 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.098269E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.270 | TFLOPs: 26.13 | +7: iteration 7380/ 60336 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.096198E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.875 | TFLOPs: 26.17 | +7: iteration 7390/ 60336 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.088314E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 7400/ 60336 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.091683E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.072 | TFLOPs: 26.14 | +7: iteration 7410/ 60336 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.089838E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.546 | TFLOPs: 26.17 | +7: iteration 7420/ 60336 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.083162E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.893 | TFLOPs: 26.16 | +7: iteration 7430/ 60336 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.073890E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.428 | TFLOPs: 26.20 | +7: iteration 7440/ 60336 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.086618E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.343 | TFLOPs: 26.20 | +7: iteration 7450/ 60336 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.079787E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.907 | TFLOPs: 26.20 | +7: iteration 7460/ 60336 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.083290E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.551 | TFLOPs: 26.20 | +7: iteration 7470/ 60336 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.087725E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.886 | TFLOPs: 26.20 | +7: iteration 7480/ 60336 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.074812E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.321 | TFLOPs: 26.19 | +7: iteration 7490/ 60336 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.084879E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.049 | TFLOPs: 26.14 | +7: iteration 7500/ 60336 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.070582E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.032 | TFLOPs: 26.14 | +7: iteration 7510/ 60336 | consumed samples: 1922560 | consumed tokens: 3937402880 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.094953E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.544 | TFLOPs: 26.15 | +7: iteration 7520/ 60336 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.085087E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.948 | TFLOPs: 26.16 | +7: iteration 7530/ 60336 | consumed samples: 1927680 | consumed tokens: 3947888640 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.088396E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.853 | TFLOPs: 26.19 | +7: iteration 7540/ 60336 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.091910E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.495 | TFLOPs: 26.13 | +7: iteration 7550/ 60336 | consumed samples: 1932800 | consumed tokens: 3958374400 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.086188E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.067 | TFLOPs: 26.14 | +7: iteration 7560/ 60336 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.080483E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.217 | TFLOPs: 26.11 | +7: iteration 7570/ 60336 | consumed samples: 1937920 | consumed tokens: 3968860160 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.071066E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.409 | TFLOPs: 26.12 | +7: iteration 7580/ 60336 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.094735E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.726 | TFLOPs: 26.12 | +7: iteration 7590/ 60336 | consumed samples: 1943040 | consumed tokens: 3979345920 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.071684E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.345 | TFLOPs: 26.12 | +7: iteration 7600/ 60336 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.077147E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.461 | TFLOPs: 26.12 | +7: iteration 7610/ 60336 | consumed samples: 1948160 | consumed tokens: 3989831680 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.087403E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.795 | TFLOPs: 26.09 | +7: iteration 7620/ 60336 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.093071E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.798 | TFLOPs: 26.11 | +7: iteration 7630/ 60336 | consumed samples: 1953280 | consumed tokens: 4000317440 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.060788E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.896 | TFLOPs: 26.14 | +7: iteration 7640/ 60336 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.060322E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.851 | TFLOPs: 26.11 | +7: iteration 7650/ 60336 | consumed samples: 1958400 | consumed tokens: 4010803200 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.073964E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.978 | TFLOPs: 26.13 | +7: iteration 7660/ 60336 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.062523E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.376 | TFLOPs: 26.12 | +7: iteration 7670/ 60336 | consumed samples: 1963520 | consumed tokens: 4021288960 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.082029E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.588 | TFLOPs: 26.12 | +7: iteration 7680/ 60336 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.084907E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.632 | TFLOPs: 26.12 | +7: iteration 7690/ 60336 | consumed samples: 1968640 | consumed tokens: 4031774720 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.060476E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.462 | TFLOPs: 26.10 | +7: iteration 7700/ 60336 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.068677E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.585 | TFLOPs: 26.14 | +7: iteration 7710/ 60336 | consumed samples: 1973760 | consumed tokens: 4042260480 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.091817E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.789 | TFLOPs: 26.12 | +7: iteration 7720/ 60336 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.078226E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.628 | TFLOPs: 26.12 | +7: iteration 7730/ 60336 | consumed samples: 1978880 | consumed tokens: 4052746240 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.066753E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.230 | TFLOPs: 26.15 | +7: iteration 7740/ 60336 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.075129E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.876 | TFLOPs: 26.00 | +7: iteration 7750/ 60336 | consumed samples: 1984000 | consumed tokens: 4063232000 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.069243E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.646 | TFLOPs: 26.17 | +7: iteration 7760/ 60336 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.089729E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.895 | TFLOPs: 26.17 | +7: iteration 7770/ 60336 | consumed samples: 1989120 | consumed tokens: 4073717760 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.059867E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.981 | TFLOPs: 26.17 | +7: iteration 7780/ 60336 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.076297E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.723 | TFLOPs: 26.19 | +7: iteration 7790/ 60336 | consumed samples: 1994240 | consumed tokens: 4084203520 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.077705E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.404 | TFLOPs: 26.18 | +7: iteration 7800/ 60336 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.064554E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.793 | TFLOPs: 26.17 | +7: iteration 7810/ 60336 | consumed samples: 1999360 | consumed tokens: 4094689280 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.072147E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.296 | TFLOPs: 25.91 | +7: iteration 7820/ 60336 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.076614E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.238 | TFLOPs: 26.19 | +7: iteration 7830/ 60336 | consumed samples: 2004480 | consumed tokens: 4105175040 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.062930E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.846 | TFLOPs: 26.27 | +7: iteration 7840/ 60336 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.074817E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.485 | TFLOPs: 26.28 | +7: iteration 7850/ 60336 | consumed samples: 2009600 | consumed tokens: 4115660800 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.073404E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.176 | TFLOPs: 26.24 | +7: iteration 7860/ 60336 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.075374E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.106 | TFLOPs: 26.22 | +7: iteration 7870/ 60336 | consumed samples: 2014720 | consumed tokens: 4126146560 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.077819E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.645 | TFLOPs: 26.23 | +7: iteration 7880/ 60336 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.073581E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.230 | TFLOPs: 26.22 | +7: iteration 7890/ 60336 | consumed samples: 2019840 | consumed tokens: 4136632320 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.056065E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.966 | TFLOPs: 26.13 | +7: iteration 7900/ 60336 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.070247E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.545 | TFLOPs: 26.10 | +7: iteration 7910/ 60336 | consumed samples: 2024960 | consumed tokens: 4147118080 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.057077E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.006 | TFLOPs: 26.03 | +7: iteration 7920/ 60336 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.058361E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.041 | TFLOPs: 26.06 | +7: iteration 7930/ 60336 | consumed samples: 2030080 | consumed tokens: 4157603840 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.059420E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.253 | TFLOPs: 26.04 | +7: iteration 7940/ 60336 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 0.16 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.068709E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.557 | TFLOPs: 25.09 | +7: iteration 7950/ 60336 | consumed samples: 2035200 | consumed tokens: 4168089600 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.064156E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.511 | TFLOPs: 26.10 | +7: iteration 7960/ 60336 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 0.16 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.077602E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.130 | TFLOPs: 25.80 | +7: iteration 7970/ 60336 | consumed samples: 2040320 | consumed tokens: 4178575360 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.081394E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.998 | TFLOPs: 26.22 | +7: iteration 7980/ 60336 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.068659E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.711 | TFLOPs: 26.20 | +7: iteration 7990/ 60336 | consumed samples: 2045440 | consumed tokens: 4189061120 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.052211E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.190 | TFLOPs: 26.10 | +0: [2023-03-17 00:40:03,661] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00019327530167242664, 0.00019327530167242664, 0.00019327530167242664], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 8000/ 60336 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.062782E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.840 | TFLOPs: 26.08 | +0: steps: 8000 loss: 4.0620 iter time (s): 0.153 samples/sec: 1676.896 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 8000 | lm loss value: 4.114335E+00 | lm loss PPL: 6.121147E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 8000 to checkpoints_44m32b100m +0: [2023-03-17 00:40:03,735] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is begin to save! +0: [2023-03-17 00:40:03,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:40:03,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:40:03,800] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:40:03,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:40:03,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:40:03,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:40:03,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:40:03,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:40:03,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:40:03,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:40:03,834] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:40:03,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:40:03,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:40:03,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:40:03,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:40:03,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:40:03,859] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:40:03,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:40:03,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:40:03,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:40:03,868] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step8000/mp_rank_00_model_states.pt +0: [2023-03-17 00:40:03,868] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:40:03,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:40:03,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:40:03,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:40:03,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:40:03,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 00:40:03,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:40:03,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:40:03,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:40:03,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:40:03,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:40:03,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: successfully saved checkpoint at iteration 8000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.91 +7: iteration 8010/ 60336 | consumed samples: 2050560 | consumed tokens: 4199546880 | elapsed time per iteration (s): 0.18 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.065672E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.118 | TFLOPs: 22.51 | +7: iteration 8020/ 60336 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.057115E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.788 | TFLOPs: 26.09 | +7: iteration 8030/ 60336 | consumed samples: 2055680 | consumed tokens: 4210032640 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.059241E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.325 | TFLOPs: 26.10 | +7: iteration 8040/ 60336 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.058508E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.374 | TFLOPs: 26.13 | +7: iteration 8050/ 60336 | consumed samples: 2060800 | consumed tokens: 4220518400 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.055819E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.937 | TFLOPs: 26.14 | +7: iteration 8060/ 60336 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.059635E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.896 | TFLOPs: 26.13 | +7: iteration 8070/ 60336 | consumed samples: 2065920 | consumed tokens: 4231004160 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.066032E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.135 | TFLOPs: 26.14 | +7: iteration 8080/ 60336 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.072114E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.280 | TFLOPs: 26.15 | +7: iteration 8090/ 60336 | consumed samples: 2071040 | consumed tokens: 4241489920 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.062053E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.597 | TFLOPs: 26.14 | +7: iteration 8100/ 60336 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.067021E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.768 | TFLOPs: 26.15 | +7: iteration 8110/ 60336 | consumed samples: 2076160 | consumed tokens: 4251975680 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.060607E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.999 | TFLOPs: 25.94 | +7: iteration 8120/ 60336 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.060049E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.656 | TFLOPs: 26.14 | +7: iteration 8130/ 60336 | consumed samples: 2081280 | consumed tokens: 4262461440 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.055822E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.111 | TFLOPs: 26.14 | +7: iteration 8140/ 60336 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.055225E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.759 | TFLOPs: 26.15 | +7: iteration 8150/ 60336 | consumed samples: 2086400 | consumed tokens: 4272947200 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.060695E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.908 | TFLOPs: 26.14 | +7: iteration 8160/ 60336 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.045809E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.176 | TFLOPs: 26.15 | +7: iteration 8170/ 60336 | consumed samples: 2091520 | consumed tokens: 4283432960 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.056165E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.005 | TFLOPs: 26.14 | +7: iteration 8180/ 60336 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.045317E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.223 | TFLOPs: 26.16 | +7: iteration 8190/ 60336 | consumed samples: 2096640 | consumed tokens: 4293918720 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.055306E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.833 | TFLOPs: 26.16 | +7: iteration 8200/ 60336 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 0.16 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.070153E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.078 | TFLOPs: 25.77 | +7: iteration 8210/ 60336 | consumed samples: 2101760 | consumed tokens: 4304404480 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.048605E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.847 | TFLOPs: 26.11 | +7: iteration 8220/ 60336 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.050166E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.405 | TFLOPs: 26.12 | +7: iteration 8230/ 60336 | consumed samples: 2106880 | consumed tokens: 4314890240 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.055008E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.862 | TFLOPs: 26.14 | +7: iteration 8240/ 60336 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.051598E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.304 | TFLOPs: 26.15 | +7: iteration 8250/ 60336 | consumed samples: 2112000 | consumed tokens: 4325376000 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.056804E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.553 | TFLOPs: 26.14 | +7: iteration 8260/ 60336 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.054956E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.394 | TFLOPs: 26.15 | +7: iteration 8270/ 60336 | consumed samples: 2117120 | consumed tokens: 4335861760 | elapsed time per iteration (s): 0.16 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.051766E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.474 | TFLOPs: 25.74 | +7: iteration 8280/ 60336 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.055004E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.149 | TFLOPs: 26.13 | +7: iteration 8290/ 60336 | consumed samples: 2122240 | consumed tokens: 4346347520 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.062487E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.857 | TFLOPs: 26.12 | +7: iteration 8300/ 60336 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.044208E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.764 | TFLOPs: 26.14 | +7: iteration 8310/ 60336 | consumed samples: 2127360 | consumed tokens: 4356833280 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.048118E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.013 | TFLOPs: 26.14 | +7: iteration 8320/ 60336 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.043470E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.638 | TFLOPs: 26.15 | +7: iteration 8330/ 60336 | consumed samples: 2132480 | consumed tokens: 4367319040 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.059720E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.424 | TFLOPs: 26.13 | +7: iteration 8340/ 60336 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.061410E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.202 | TFLOPs: 26.13 | +7: iteration 8350/ 60336 | consumed samples: 2137600 | consumed tokens: 4377804800 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.041346E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.049 | TFLOPs: 26.10 | +7: iteration 8360/ 60336 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.043067E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.799 | TFLOPs: 26.11 | +7: iteration 8370/ 60336 | consumed samples: 2142720 | consumed tokens: 4388290560 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.049891E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.771 | TFLOPs: 26.03 | +7: iteration 8380/ 60336 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 0.16 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.064380E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.427 | TFLOPs: 25.71 | +7: iteration 8390/ 60336 | consumed samples: 2147840 | consumed tokens: 4398776320 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.045950E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.462 | TFLOPs: 26.12 | +7: iteration 8400/ 60336 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.053133E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.110 | TFLOPs: 26.14 | +7: iteration 8410/ 60336 | consumed samples: 2152960 | consumed tokens: 4409262080 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.053879E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.347 | TFLOPs: 26.13 | +7: iteration 8420/ 60336 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.043612E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.627 | TFLOPs: 26.14 | +7: iteration 8430/ 60336 | consumed samples: 2158080 | consumed tokens: 4419747840 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.047887E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.715 | TFLOPs: 26.14 | +7: iteration 8440/ 60336 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.057180E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.350 | TFLOPs: 26.15 | +7: iteration 8450/ 60336 | consumed samples: 2163200 | consumed tokens: 4430233600 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.043721E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.412 | TFLOPs: 26.15 | +7: iteration 8460/ 60336 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.049867E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.420 | TFLOPs: 26.15 | +7: iteration 8470/ 60336 | consumed samples: 2168320 | consumed tokens: 4440719360 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.041070E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.926 | TFLOPs: 26.14 | +7: iteration 8480/ 60336 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.054926E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.828 | TFLOPs: 25.70 | +7: iteration 8490/ 60336 | consumed samples: 2173440 | consumed tokens: 4451205120 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.050592E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.323 | TFLOPs: 26.15 | +7: iteration 8500/ 60336 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.046122E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.641 | TFLOPs: 26.14 | +7: iteration 8510/ 60336 | consumed samples: 2178560 | consumed tokens: 4461690880 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.038398E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.837 | TFLOPs: 26.12 | +7: iteration 8520/ 60336 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.028720E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.083 | TFLOPs: 26.16 | +7: iteration 8530/ 60336 | consumed samples: 2183680 | consumed tokens: 4472176640 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.038918E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.686 | TFLOPs: 26.14 | +7: iteration 8540/ 60336 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 0.16 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.039933E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.841 | TFLOPs: 25.75 | +7: iteration 8550/ 60336 | consumed samples: 2188800 | consumed tokens: 4482662400 | elapsed time per iteration (s): 0.16 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.047236E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.331 | TFLOPs: 25.72 | +7: iteration 8560/ 60336 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.041463E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.045 | TFLOPs: 26.11 | +7: iteration 8570/ 60336 | consumed samples: 2193920 | consumed tokens: 4493148160 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.041380E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.923 | TFLOPs: 26.14 | +7: iteration 8580/ 60336 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.034915E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.496 | TFLOPs: 26.12 | +7: iteration 8590/ 60336 | consumed samples: 2199040 | consumed tokens: 4503633920 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.042769E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.617 | TFLOPs: 26.11 | +7: iteration 8600/ 60336 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.038428E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.795 | TFLOPs: 26.11 | +7: iteration 8610/ 60336 | consumed samples: 2204160 | consumed tokens: 4514119680 | elapsed time per iteration (s): 0.15 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.041153E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.075 | TFLOPs: 26.11 | +7: iteration 8620/ 60336 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 0.15 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.031509E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.939 | TFLOPs: 26.11 | +7: iteration 8630/ 60336 | consumed samples: 2209280 | consumed tokens: 4524605440 | elapsed time per iteration (s): 0.15 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.034067E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.666 | TFLOPs: 26.11 | +7: iteration 8640/ 60336 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 0.15 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.042004E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.421 | TFLOPs: 26.13 | +7: iteration 8650/ 60336 | consumed samples: 2214400 | consumed tokens: 4535091200 | elapsed time per iteration (s): 0.15 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.030312E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.563 | TFLOPs: 26.12 | +7: iteration 8660/ 60336 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 0.15 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.038346E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.600 | TFLOPs: 26.11 | +7: iteration 8670/ 60336 | consumed samples: 2219520 | consumed tokens: 4545576960 | elapsed time per iteration (s): 0.15 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.026727E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.157 | TFLOPs: 26.13 | +7: iteration 8680/ 60336 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 0.15 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.028638E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.470 | TFLOPs: 26.15 | +7: iteration 8690/ 60336 | consumed samples: 2224640 | consumed tokens: 4556062720 | elapsed time per iteration (s): 0.15 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.036910E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.631 | TFLOPs: 26.14 | +7: iteration 8700/ 60336 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 0.15 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.030322E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.628 | TFLOPs: 26.12 | +7: iteration 8710/ 60336 | consumed samples: 2229760 | consumed tokens: 4566548480 | elapsed time per iteration (s): 0.15 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.039443E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.628 | TFLOPs: 26.17 | +7: iteration 8720/ 60336 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 0.15 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.036679E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.695 | TFLOPs: 26.15 | +7: iteration 8730/ 60336 | consumed samples: 2234880 | consumed tokens: 4577034240 | elapsed time per iteration (s): 0.15 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.042152E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.125 | TFLOPs: 26.11 | +7: iteration 8740/ 60336 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 0.15 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.016701E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.324 | TFLOPs: 26.13 | +7: iteration 8750/ 60336 | consumed samples: 2240000 | consumed tokens: 4587520000 | elapsed time per iteration (s): 0.15 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.032373E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.505 | TFLOPs: 26.10 | +7: iteration 8760/ 60336 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 0.15 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.028323E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.412 | TFLOPs: 26.15 | +7: iteration 8770/ 60336 | consumed samples: 2245120 | consumed tokens: 4598005760 | elapsed time per iteration (s): 0.15 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.027380E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.627 | TFLOPs: 26.11 | +7: iteration 8780/ 60336 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 0.15 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.034343E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.156 | TFLOPs: 26.08 | +7: iteration 8790/ 60336 | consumed samples: 2250240 | consumed tokens: 4608491520 | elapsed time per iteration (s): 0.15 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.030794E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.889 | TFLOPs: 26.11 | +7: iteration 8800/ 60336 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 0.15 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.031401E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.628 | TFLOPs: 26.22 | +7: iteration 8810/ 60336 | consumed samples: 2255360 | consumed tokens: 4618977280 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.058355E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.302 | TFLOPs: 26.24 | +7: iteration 8820/ 60336 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.039536E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.685 | TFLOPs: 26.25 | +7: iteration 8830/ 60336 | consumed samples: 2260480 | consumed tokens: 4629463040 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.038436E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.268 | TFLOPs: 26.24 | +7: iteration 8840/ 60336 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.030060E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.421 | TFLOPs: 25.95 | +7: iteration 8850/ 60336 | consumed samples: 2265600 | consumed tokens: 4639948800 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.035646E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.530 | TFLOPs: 26.26 | +7: iteration 8860/ 60336 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.038154E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.752 | TFLOPs: 26.25 | +7: iteration 8870/ 60336 | consumed samples: 2270720 | consumed tokens: 4650434560 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.028040E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.466 | TFLOPs: 26.26 | +7: iteration 8880/ 60336 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.032692E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.017 | TFLOPs: 26.27 | +7: iteration 8890/ 60336 | consumed samples: 2275840 | consumed tokens: 4660920320 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.019312E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.412 | TFLOPs: 26.27 | +7: iteration 8900/ 60336 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.030379E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.973 | TFLOPs: 26.27 | +7: iteration 8910/ 60336 | consumed samples: 2280960 | consumed tokens: 4671406080 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.033988E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.351 | TFLOPs: 26.26 | +7: iteration 8920/ 60336 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.040414E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.810 | TFLOPs: 26.25 | +7: iteration 8930/ 60336 | consumed samples: 2286080 | consumed tokens: 4681891840 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.024308E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.445 | TFLOPs: 26.28 | +7: iteration 8940/ 60336 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.040077E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.025 | TFLOPs: 26.27 | +7: iteration 8950/ 60336 | consumed samples: 2291200 | consumed tokens: 4692377600 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.015174E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.171 | TFLOPs: 26.22 | +7: iteration 8960/ 60336 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.027070E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.348 | TFLOPs: 26.26 | +7: iteration 8970/ 60336 | consumed samples: 2296320 | consumed tokens: 4702863360 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.021787E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.199 | TFLOPs: 26.26 | +7: iteration 8980/ 60336 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.037164E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.077 | TFLOPs: 25.81 | +7: iteration 8990/ 60336 | consumed samples: 2301440 | consumed tokens: 4713349120 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.034418E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.763 | TFLOPs: 26.25 | +7: iteration 9000/ 60336 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.018171E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.614 | TFLOPs: 26.23 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 9000 | lm loss value: 4.049519E+00 | lm loss PPL: 5.736983E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 9000 to checkpoints_44m32b100m +0: [2023-03-17 00:42:37,656] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step9000 is begin to save! +0: [2023-03-17 00:42:37,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:42:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:42:37,720] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:42:37,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:42:37,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:42:37,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:42:37,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:42:37,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:42:37,746] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:42:37,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:42:37,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:42:37,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:42:37,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:42:37,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:42:37,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:42:37,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:42:37,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:42:37,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:42:37,786] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:42:37,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:42:37,787] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step9000/mp_rank_00_model_states.pt +0: [2023-03-17 00:42:37,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:42:37,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:42:37,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:42:37,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:42:37,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:42:37,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:42:37,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: successfully saved checkpoint at iteration 9000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.02 +7: iteration 9010/ 60336 | consumed samples: 2306560 | consumed tokens: 4723834880 | elapsed time per iteration (s): 0.18 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.035843E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.546 | TFLOPs: 22.83 | +7: iteration 9020/ 60336 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.029262E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.337 | TFLOPs: 26.12 | +7: iteration 9030/ 60336 | consumed samples: 2311680 | consumed tokens: 4734320640 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.031199E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.392 | TFLOPs: 26.07 | +7: iteration 9040/ 60336 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.013194E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.792 | TFLOPs: 26.08 | +7: iteration 9050/ 60336 | consumed samples: 2316800 | consumed tokens: 4744806400 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.010210E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.325 | TFLOPs: 25.43 | +7: iteration 9060/ 60336 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.020938E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.209 | TFLOPs: 26.02 | +7: iteration 9070/ 60336 | consumed samples: 2321920 | consumed tokens: 4755292160 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.029425E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.750 | TFLOPs: 26.00 | +7: iteration 9080/ 60336 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.019913E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.548 | TFLOPs: 25.96 | +7: iteration 9090/ 60336 | consumed samples: 2327040 | consumed tokens: 4765777920 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.020810E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.996 | TFLOPs: 26.03 | +7: iteration 9100/ 60336 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.016645E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.018 | TFLOPs: 26.10 | +7: iteration 9110/ 60336 | consumed samples: 2332160 | consumed tokens: 4776263680 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.029904E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.918 | TFLOPs: 26.08 | +7: iteration 9120/ 60336 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.032317E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.259 | TFLOPs: 26.12 | +7: iteration 9130/ 60336 | consumed samples: 2337280 | consumed tokens: 4786749440 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.035192E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.930 | TFLOPs: 25.64 | +7: iteration 9140/ 60336 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.023618E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.143 | TFLOPs: 26.19 | +7: iteration 9150/ 60336 | consumed samples: 2342400 | consumed tokens: 4797235200 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.023105E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.761 | TFLOPs: 26.17 | +7: iteration 9160/ 60336 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.030592E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.123 | TFLOPs: 26.19 | +7: iteration 9170/ 60336 | consumed samples: 2347520 | consumed tokens: 4807720960 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.022410E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.622 | TFLOPs: 26.18 | +7: iteration 9180/ 60336 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.027496E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.264 | TFLOPs: 26.19 | +7: iteration 9190/ 60336 | consumed samples: 2352640 | consumed tokens: 4818206720 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.036790E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.329 | TFLOPs: 26.18 | +7: iteration 9200/ 60336 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.028547E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.440 | TFLOPs: 26.18 | +7: iteration 9210/ 60336 | consumed samples: 2357760 | consumed tokens: 4828692480 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.029503E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.402 | TFLOPs: 26.18 | +7: iteration 9220/ 60336 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.032915E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.321 | TFLOPs: 26.29 | +7: iteration 9230/ 60336 | consumed samples: 2362880 | consumed tokens: 4839178240 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.016410E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.791 | TFLOPs: 26.31 | +7: iteration 9240/ 60336 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.024993E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.049 | TFLOPs: 26.30 | +7: iteration 9250/ 60336 | consumed samples: 2368000 | consumed tokens: 4849664000 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.027673E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.294 | TFLOPs: 26.29 | +7: iteration 9260/ 60336 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.022958E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.134 | TFLOPs: 26.29 | +7: iteration 9270/ 60336 | consumed samples: 2373120 | consumed tokens: 4860149760 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.015474E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.959 | TFLOPs: 26.28 | +7: iteration 9280/ 60336 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.011690E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.276 | TFLOPs: 26.29 | +7: iteration 9290/ 60336 | consumed samples: 2378240 | consumed tokens: 4870635520 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.017811E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.739 | TFLOPs: 26.30 | +7: iteration 9300/ 60336 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.021452E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.174 | TFLOPs: 26.30 | +7: iteration 9310/ 60336 | consumed samples: 2383360 | consumed tokens: 4881121280 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.004893E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.084 | TFLOPs: 26.32 | +7: iteration 9320/ 60336 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.013379E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.189 | TFLOPs: 26.30 | +7: iteration 9330/ 60336 | consumed samples: 2388480 | consumed tokens: 4891607040 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.029046E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.231 | TFLOPs: 26.05 | +7: iteration 9340/ 60336 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.007106E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.523 | TFLOPs: 26.29 | +7: iteration 9350/ 60336 | consumed samples: 2393600 | consumed tokens: 4902092800 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.017211E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.327 | TFLOPs: 26.30 | +7: iteration 9360/ 60336 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.025343E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.759 | TFLOPs: 26.30 | +7: iteration 9370/ 60336 | consumed samples: 2398720 | consumed tokens: 4912578560 | elapsed time per iteration (s): 0.16 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.017266E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.252 | TFLOPs: 25.57 | +7: iteration 9380/ 60336 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.999462E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.055 | TFLOPs: 26.27 | +7: iteration 9390/ 60336 | consumed samples: 2403840 | consumed tokens: 4923064320 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.020370E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.215 | TFLOPs: 26.26 | +7: iteration 9400/ 60336 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.021880E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.779 | TFLOPs: 26.26 | +7: iteration 9410/ 60336 | consumed samples: 2408960 | consumed tokens: 4933550080 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.009589E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.572 | TFLOPs: 26.25 | +7: iteration 9420/ 60336 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.024070E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.643 | TFLOPs: 26.25 | +7: iteration 9430/ 60336 | consumed samples: 2414080 | consumed tokens: 4944035840 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.033258E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.519 | TFLOPs: 26.26 | +7: iteration 9440/ 60336 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.016189E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.563 | TFLOPs: 26.20 | +7: iteration 9450/ 60336 | consumed samples: 2419200 | consumed tokens: 4954521600 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.018568E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.963 | TFLOPs: 26.13 | +7: iteration 9460/ 60336 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.016491E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.592 | TFLOPs: 26.26 | +7: iteration 9470/ 60336 | consumed samples: 2424320 | consumed tokens: 4965007360 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.016684E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.016 | TFLOPs: 26.25 | +7: iteration 9480/ 60336 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.021173E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.969 | TFLOPs: 26.19 | +7: iteration 9490/ 60336 | consumed samples: 2429440 | consumed tokens: 4975493120 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.022023E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.365 | TFLOPs: 26.21 | +7: iteration 9500/ 60336 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.007153E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.386 | TFLOPs: 26.20 | +7: iteration 9510/ 60336 | consumed samples: 2434560 | consumed tokens: 4985978880 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.016093E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.879 | TFLOPs: 26.25 | +7: iteration 9520/ 60336 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.004554E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.486 | TFLOPs: 26.26 | +7: iteration 9530/ 60336 | consumed samples: 2439680 | consumed tokens: 4996464640 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.024722E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.758 | TFLOPs: 26.26 | +7: iteration 9540/ 60336 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.004263E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.172 | TFLOPs: 26.26 | +7: iteration 9550/ 60336 | consumed samples: 2444800 | consumed tokens: 5006950400 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.020198E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.436 | TFLOPs: 26.26 | +7: iteration 9560/ 60336 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.021217E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.174 | TFLOPs: 26.27 | +7: iteration 9570/ 60336 | consumed samples: 2449920 | consumed tokens: 5017436160 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.004977E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.663 | TFLOPs: 26.28 | +7: iteration 9580/ 60336 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.020716E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.917 | TFLOPs: 26.28 | +7: iteration 9590/ 60336 | consumed samples: 2455040 | consumed tokens: 5027921920 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.011610E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.496 | TFLOPs: 26.26 | +7: iteration 9600/ 60336 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.009881E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.765 | TFLOPs: 26.28 | +7: iteration 9610/ 60336 | consumed samples: 2460160 | consumed tokens: 5038407680 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.014991E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.364 | TFLOPs: 26.27 | +7: iteration 9620/ 60336 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.011115E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.141 | TFLOPs: 26.25 | +7: iteration 9630/ 60336 | consumed samples: 2465280 | consumed tokens: 5048893440 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.017440E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.384 | TFLOPs: 26.27 | +7: iteration 9640/ 60336 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.017455E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.918 | TFLOPs: 26.25 | +7: iteration 9650/ 60336 | consumed samples: 2470400 | consumed tokens: 5059379200 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.994675E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.189 | TFLOPs: 26.26 | +7: iteration 9660/ 60336 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.017279E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.320 | TFLOPs: 26.21 | +7: iteration 9670/ 60336 | consumed samples: 2475520 | consumed tokens: 5069864960 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.021706E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.362 | TFLOPs: 26.07 | +7: iteration 9680/ 60336 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.002505E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.881 | TFLOPs: 26.06 | +7: iteration 9690/ 60336 | consumed samples: 2480640 | consumed tokens: 5080350720 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.017878E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.109 | TFLOPs: 26.07 | +7: iteration 9700/ 60336 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.002052E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.714 | TFLOPs: 26.06 | +7: iteration 9710/ 60336 | consumed samples: 2485760 | consumed tokens: 5090836480 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.997250E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.529 | TFLOPs: 26.04 | +7: iteration 9720/ 60336 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.002477E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.547 | TFLOPs: 26.01 | +7: iteration 9730/ 60336 | consumed samples: 2490880 | consumed tokens: 5101322240 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.999585E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.488 | TFLOPs: 26.02 | +7: iteration 9740/ 60336 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.016445E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.267 | TFLOPs: 26.02 | +7: iteration 9750/ 60336 | consumed samples: 2496000 | consumed tokens: 5111808000 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.004562E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.098 | TFLOPs: 26.02 | +7: iteration 9760/ 60336 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.000190E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.316 | TFLOPs: 26.02 | +7: iteration 9770/ 60336 | consumed samples: 2501120 | consumed tokens: 5122293760 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.988904E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.130 | TFLOPs: 26.00 | +7: iteration 9780/ 60336 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.006738E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.512 | TFLOPs: 25.99 | +7: iteration 9790/ 60336 | consumed samples: 2506240 | consumed tokens: 5132779520 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.997157E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.609 | TFLOPs: 25.98 | +7: iteration 9800/ 60336 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.005957E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.017 | TFLOPs: 26.02 | +7: iteration 9810/ 60336 | consumed samples: 2511360 | consumed tokens: 5143265280 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.007236E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.936 | TFLOPs: 26.02 | +7: iteration 9820/ 60336 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.991801E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.501 | TFLOPs: 26.03 | +7: iteration 9830/ 60336 | consumed samples: 2516480 | consumed tokens: 5153751040 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.000719E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.848 | TFLOPs: 26.03 | +7: iteration 9840/ 60336 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.987783E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.614 | TFLOPs: 26.01 | +7: iteration 9850/ 60336 | consumed samples: 2521600 | consumed tokens: 5164236800 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.000443E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.592 | TFLOPs: 26.01 | +7: iteration 9860/ 60336 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.011491E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.122 | TFLOPs: 26.02 | +7: iteration 9870/ 60336 | consumed samples: 2526720 | consumed tokens: 5174722560 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.996878E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.265 | TFLOPs: 26.04 | +7: iteration 9880/ 60336 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.009781E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.083 | TFLOPs: 26.02 | +7: iteration 9890/ 60336 | consumed samples: 2531840 | consumed tokens: 5185208320 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.008492E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.446 | TFLOPs: 26.02 | +7: iteration 9900/ 60336 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.000560E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.752 | TFLOPs: 26.01 | +7: iteration 9910/ 60336 | consumed samples: 2536960 | consumed tokens: 5195694080 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.002385E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.238 | TFLOPs: 26.01 | +7: iteration 9920/ 60336 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.011305E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.032 | TFLOPs: 26.02 | +7: iteration 9930/ 60336 | consumed samples: 2542080 | consumed tokens: 5206179840 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.997038E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.946 | TFLOPs: 26.02 | +7: iteration 9940/ 60336 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.999856E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.368 | TFLOPs: 26.05 | +7: iteration 9950/ 60336 | consumed samples: 2547200 | consumed tokens: 5216665600 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.997608E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.380 | TFLOPs: 26.05 | +7: iteration 9960/ 60336 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.982465E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.223 | TFLOPs: 26.08 | +7: iteration 9970/ 60336 | consumed samples: 2552320 | consumed tokens: 5227151360 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.995748E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.582 | TFLOPs: 26.07 | +7: iteration 9980/ 60336 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.986703E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.636 | TFLOPs: 26.06 | +7: iteration 9990/ 60336 | consumed samples: 2557440 | consumed tokens: 5237637120 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 4.002283E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.454 | TFLOPs: 26.07 | +0: [2023-03-17 00:45:11,458] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00018923099670180827, 0.00018923099670180827, 0.00018923099670180827], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 10000/ 60336 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.007543E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.685 | TFLOPs: 26.09 | +0: steps: 10000 loss: 4.0004 iter time (s): 0.152 samples/sec: 1680.627 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 10000 | lm loss value: 3.992220E+00 | lm loss PPL: 5.417503E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 10000 to checkpoints_44m32b100m +0: [2023-03-17 00:45:11,531] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! +0: [2023-03-17 00:45:11,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:45:11,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:45:11,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:45:11,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:45:11,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:45:11,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:45:11,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:45:11,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:45:11,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:45:11,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:45:11,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:45:11,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:45:11,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:45:11,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:45:11,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:45:11,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:45:11,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:45:11,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:45:11,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:45:11,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:45:11,662] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step10000/mp_rank_00_model_states.pt +0: [2023-03-17 00:45:11,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:45:11,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:45:11,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:45:11,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: successfully saved checkpoint at iteration 10000 to checkpoints_44m32b100m +7: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:45:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:45:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: time (ms) | save-checkpoint: 176.07 +7: iteration 10010/ 60336 | consumed samples: 2562560 | consumed tokens: 5248122880 | elapsed time per iteration (s): 0.18 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.999655E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.536 | TFLOPs: 22.62 | +7: iteration 10020/ 60336 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.003081E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.674 | TFLOPs: 25.95 | +7: iteration 10030/ 60336 | consumed samples: 2567680 | consumed tokens: 5258608640 | elapsed time per iteration (s): 0.16 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.002486E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.280 | TFLOPs: 25.68 | +7: iteration 10040/ 60336 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.003762E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.994 | TFLOPs: 26.13 | +7: iteration 10050/ 60336 | consumed samples: 2572800 | consumed tokens: 5269094400 | elapsed time per iteration (s): 0.16 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.994267E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.207 | TFLOPs: 25.35 | +7: iteration 10060/ 60336 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 0.16 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.007288E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.690 | TFLOPs: 25.01 | +7: iteration 10070/ 60336 | consumed samples: 2577920 | consumed tokens: 5279580160 | elapsed time per iteration (s): 0.16 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.991800E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.639 | TFLOPs: 25.54 | +7: iteration 10080/ 60336 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 0.16 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.998802E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.027 | TFLOPs: 25.55 | +7: iteration 10090/ 60336 | consumed samples: 2583040 | consumed tokens: 5290065920 | elapsed time per iteration (s): 0.16 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.998653E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.097 | TFLOPs: 25.09 | +7: iteration 10100/ 60336 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.999503E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.550 | TFLOPs: 26.04 | +7: iteration 10110/ 60336 | consumed samples: 2588160 | consumed tokens: 5300551680 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 4.003266E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.160 | TFLOPs: 26.07 | +7: iteration 10120/ 60336 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.991184E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.218 | TFLOPs: 26.05 | +7: iteration 10130/ 60336 | consumed samples: 2593280 | consumed tokens: 5311037440 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.990275E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.663 | TFLOPs: 26.01 | +7: iteration 10140/ 60336 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.994860E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.521 | TFLOPs: 26.03 | +7: iteration 10150/ 60336 | consumed samples: 2598400 | consumed tokens: 5321523200 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.989594E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.650 | TFLOPs: 26.09 | +7: iteration 10160/ 60336 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.998567E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.411 | TFLOPs: 26.07 | +7: iteration 10170/ 60336 | consumed samples: 2603520 | consumed tokens: 5332008960 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.999430E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.124 | TFLOPs: 26.02 | +7: iteration 10180/ 60336 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 4.005512E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.229 | TFLOPs: 26.02 | +7: iteration 10190/ 60336 | consumed samples: 2608640 | consumed tokens: 5342494720 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.998204E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.493 | TFLOPs: 26.04 | +7: iteration 10200/ 60336 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 0.16 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.996168E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.843 | TFLOPs: 25.89 | +7: iteration 10210/ 60336 | consumed samples: 2613760 | consumed tokens: 5352980480 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 4.009230E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.895 | TFLOPs: 26.25 | +7: iteration 10220/ 60336 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.997869E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.583 | TFLOPs: 26.21 | +7: iteration 10230/ 60336 | consumed samples: 2618880 | consumed tokens: 5363466240 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.999407E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.649 | TFLOPs: 26.28 | +7: iteration 10240/ 60336 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.000320E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.756 | TFLOPs: 26.23 | +7: iteration 10250/ 60336 | consumed samples: 2624000 | consumed tokens: 5373952000 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.975473E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.001 | TFLOPs: 26.25 | +7: iteration 10260/ 60336 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.990143E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.498 | TFLOPs: 26.24 | +7: iteration 10270/ 60336 | consumed samples: 2629120 | consumed tokens: 5384437760 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.996571E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.687 | TFLOPs: 26.17 | +7: iteration 10280/ 60336 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 4.001088E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.345 | TFLOPs: 26.13 | +7: iteration 10290/ 60336 | consumed samples: 2634240 | consumed tokens: 5394923520 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.987834E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.295 | TFLOPs: 26.16 | +7: iteration 10300/ 60336 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.984895E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.681 | TFLOPs: 26.14 | +7: iteration 10310/ 60336 | consumed samples: 2639360 | consumed tokens: 5405409280 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.986816E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.014 | TFLOPs: 26.16 | +7: iteration 10320/ 60336 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 4.001050E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.567 | TFLOPs: 26.17 | +7: iteration 10330/ 60336 | consumed samples: 2644480 | consumed tokens: 5415895040 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.992846E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.454 | TFLOPs: 26.17 | +7: iteration 10340/ 60336 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.981243E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.427 | TFLOPs: 26.13 | +7: iteration 10350/ 60336 | consumed samples: 2649600 | consumed tokens: 5426380800 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.998415E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 10360/ 60336 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.992722E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.491 | TFLOPs: 26.12 | +7: iteration 10370/ 60336 | consumed samples: 2654720 | consumed tokens: 5436866560 | elapsed time per iteration (s): 0.16 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.998523E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.097 | TFLOPs: 25.89 | +7: iteration 10380/ 60336 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.993741E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.931 | TFLOPs: 26.08 | +7: iteration 10390/ 60336 | consumed samples: 2659840 | consumed tokens: 5447352320 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.991011E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.992 | TFLOPs: 26.11 | +7: iteration 10400/ 60336 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.993081E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.348 | TFLOPs: 26.10 | +7: iteration 10410/ 60336 | consumed samples: 2664960 | consumed tokens: 5457838080 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.985337E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.333 | TFLOPs: 26.09 | +7: iteration 10420/ 60336 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.985854E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.096 | TFLOPs: 26.07 | +7: iteration 10430/ 60336 | consumed samples: 2670080 | consumed tokens: 5468323840 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.988290E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.755 | TFLOPs: 26.09 | +7: iteration 10440/ 60336 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.976392E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.812 | TFLOPs: 26.08 | +7: iteration 10450/ 60336 | consumed samples: 2675200 | consumed tokens: 5478809600 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.983562E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.143 | TFLOPs: 26.10 | +7: iteration 10460/ 60336 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.968789E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.171 | TFLOPs: 26.16 | +7: iteration 10470/ 60336 | consumed samples: 2680320 | consumed tokens: 5489295360 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.986968E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.553 | TFLOPs: 26.21 | +7: iteration 10480/ 60336 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.981610E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 10490/ 60336 | consumed samples: 2685440 | consumed tokens: 5499781120 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.977113E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.055 | TFLOPs: 26.13 | +7: iteration 10500/ 60336 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.971707E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.164 | TFLOPs: 26.13 | +7: iteration 10510/ 60336 | consumed samples: 2690560 | consumed tokens: 5510266880 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 4.001871E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.782 | TFLOPs: 26.17 | +7: iteration 10520/ 60336 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.975584E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.466 | TFLOPs: 26.23 | +7: iteration 10530/ 60336 | consumed samples: 2695680 | consumed tokens: 5520752640 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 4.002035E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.206 | TFLOPs: 26.22 | +7: iteration 10540/ 60336 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.986436E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.255 | TFLOPs: 26.24 | +7: iteration 10550/ 60336 | consumed samples: 2700800 | consumed tokens: 5531238400 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.985707E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.505 | TFLOPs: 26.24 | +7: iteration 10560/ 60336 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.985403E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.805 | TFLOPs: 26.22 | +7: iteration 10570/ 60336 | consumed samples: 2705920 | consumed tokens: 5541724160 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.976769E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.776 | TFLOPs: 26.23 | +7: iteration 10580/ 60336 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.995355E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.301 | TFLOPs: 26.26 | +7: iteration 10590/ 60336 | consumed samples: 2711040 | consumed tokens: 5552209920 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.975230E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.740 | TFLOPs: 26.22 | +7: iteration 10600/ 60336 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.976278E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.799 | TFLOPs: 26.23 | +7: iteration 10610/ 60336 | consumed samples: 2716160 | consumed tokens: 5562695680 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.987059E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.382 | TFLOPs: 26.24 | +7: iteration 10620/ 60336 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.970668E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.569 | TFLOPs: 26.25 | +7: iteration 10630/ 60336 | consumed samples: 2721280 | consumed tokens: 5573181440 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.993721E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.808 | TFLOPs: 26.22 | +7: iteration 10640/ 60336 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.980148E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.087 | TFLOPs: 26.22 | +7: iteration 10650/ 60336 | consumed samples: 2726400 | consumed tokens: 5583667200 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.973391E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.957 | TFLOPs: 26.17 | +7: iteration 10660/ 60336 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.986908E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.418 | TFLOPs: 26.24 | +7: iteration 10670/ 60336 | consumed samples: 2731520 | consumed tokens: 5594152960 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.991269E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.031 | TFLOPs: 26.24 | +7: iteration 10680/ 60336 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.999306E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.346 | TFLOPs: 26.26 | +7: iteration 10690/ 60336 | consumed samples: 2736640 | consumed tokens: 5604638720 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.975940E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.840 | TFLOPs: 26.16 | +7: iteration 10700/ 60336 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.987516E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.952 | TFLOPs: 26.16 | +7: iteration 10710/ 60336 | consumed samples: 2741760 | consumed tokens: 5615124480 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.981494E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.888 | TFLOPs: 26.16 | +7: iteration 10720/ 60336 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.977885E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.922 | TFLOPs: 26.14 | +7: iteration 10730/ 60336 | consumed samples: 2746880 | consumed tokens: 5625610240 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.992670E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.857 | TFLOPs: 26.20 | +7: iteration 10740/ 60336 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.995398E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.776 | TFLOPs: 26.09 | +7: iteration 10750/ 60336 | consumed samples: 2752000 | consumed tokens: 5636096000 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.989591E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.905 | TFLOPs: 26.16 | +7: iteration 10760/ 60336 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.980634E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.366 | TFLOPs: 26.15 | +7: iteration 10770/ 60336 | consumed samples: 2757120 | consumed tokens: 5646581760 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.981803E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.367 | TFLOPs: 26.18 | +7: iteration 10780/ 60336 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.961480E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.112 | TFLOPs: 26.13 | +7: iteration 10790/ 60336 | consumed samples: 2762240 | consumed tokens: 5657067520 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.982230E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.796 | TFLOPs: 26.09 | +7: iteration 10800/ 60336 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.984618E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.941 | TFLOPs: 26.14 | +7: iteration 10810/ 60336 | consumed samples: 2767360 | consumed tokens: 5667553280 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.972137E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.048 | TFLOPs: 26.16 | +7: iteration 10820/ 60336 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.976803E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.194 | TFLOPs: 26.18 | +7: iteration 10830/ 60336 | consumed samples: 2772480 | consumed tokens: 5678039040 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.975266E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.586 | TFLOPs: 26.21 | +7: iteration 10840/ 60336 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.970871E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.464 | TFLOPs: 26.20 | +7: iteration 10850/ 60336 | consumed samples: 2777600 | consumed tokens: 5688524800 | elapsed time per iteration (s): 0.16 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.971500E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.709 | TFLOPs: 25.78 | +7: iteration 10860/ 60336 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.981361E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.375 | TFLOPs: 26.20 | +7: iteration 10870/ 60336 | consumed samples: 2782720 | consumed tokens: 5699010560 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.966208E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.816 | TFLOPs: 26.20 | +7: iteration 10880/ 60336 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.972365E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.628 | TFLOPs: 26.25 | +7: iteration 10890/ 60336 | consumed samples: 2787840 | consumed tokens: 5709496320 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.977831E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.314 | TFLOPs: 26.26 | +7: iteration 10900/ 60336 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.987542E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.024 | TFLOPs: 26.25 | +7: iteration 10910/ 60336 | consumed samples: 2792960 | consumed tokens: 5719982080 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.987695E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.828 | TFLOPs: 26.28 | +7: iteration 10920/ 60336 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.959452E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.867 | TFLOPs: 26.25 | +7: iteration 10930/ 60336 | consumed samples: 2798080 | consumed tokens: 5730467840 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.961978E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.765 | TFLOPs: 26.20 | +7: iteration 10940/ 60336 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.986253E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.864 | TFLOPs: 26.14 | +7: iteration 10950/ 60336 | consumed samples: 2803200 | consumed tokens: 5740953600 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.968181E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.022 | TFLOPs: 26.14 | +7: iteration 10960/ 60336 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.980062E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.505 | TFLOPs: 26.13 | +7: iteration 10970/ 60336 | consumed samples: 2808320 | consumed tokens: 5751439360 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.981822E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.872 | TFLOPs: 26.16 | +7: iteration 10980/ 60336 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.955628E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.266 | TFLOPs: 26.15 | +7: iteration 10990/ 60336 | consumed samples: 2813440 | consumed tokens: 5761925120 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.974415E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.054 | TFLOPs: 26.14 | +7: iteration 11000/ 60336 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.966737E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.048 | TFLOPs: 26.19 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 11000 | lm loss value: 4.067957E+00 | lm loss PPL: 5.843748E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 11000 to checkpoints_44m32b100m +0: [2023-03-17 00:47:45,561] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step11000 is begin to save! +0: [2023-03-17 00:47:45,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:47:45,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:47:45,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:47:45,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:47:45,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:47:45,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:47:45,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:47:45,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:47:45,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:47:45,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:47:45,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:47:45,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:47:45,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:47:45,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:47:45,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:47:45,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:47:45,685] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:47:45,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:47:45,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:47:45,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:47:45,694] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step11000/mp_rank_00_model_states.pt +0: [2023-03-17 00:47:45,694] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:47:45,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:47:45,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:47:45,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:47:45,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:47:45,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:47:45,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:47:45,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 00:47:45,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:47:45,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: successfully saved checkpoint at iteration 11000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.82 +7: iteration 11010/ 60336 | consumed samples: 2818560 | consumed tokens: 5772410880 | elapsed time per iteration (s): 0.18 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.975110E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.793 | TFLOPs: 22.83 | +7: iteration 11020/ 60336 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.971420E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.216 | TFLOPs: 26.10 | +7: iteration 11030/ 60336 | consumed samples: 2823680 | consumed tokens: 5782896640 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.972116E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.958 | TFLOPs: 26.20 | +7: iteration 11040/ 60336 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.985923E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.419 | TFLOPs: 26.26 | +7: iteration 11050/ 60336 | consumed samples: 2828800 | consumed tokens: 5793382400 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.971556E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.651 | TFLOPs: 26.31 | +7: iteration 11060/ 60336 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.975526E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.040 | TFLOPs: 26.28 | +7: iteration 11070/ 60336 | consumed samples: 2833920 | consumed tokens: 5803868160 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.970065E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.493 | TFLOPs: 26.28 | +7: iteration 11080/ 60336 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.973188E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.337 | TFLOPs: 26.27 | +7: iteration 11090/ 60336 | consumed samples: 2839040 | consumed tokens: 5814353920 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.974911E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.688 | TFLOPs: 26.23 | +7: iteration 11100/ 60336 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.976278E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.120 | TFLOPs: 26.24 | +7: iteration 11110/ 60336 | consumed samples: 2844160 | consumed tokens: 5824839680 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.960612E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.586 | TFLOPs: 26.15 | +7: iteration 11120/ 60336 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.967230E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.685 | TFLOPs: 26.15 | +7: iteration 11130/ 60336 | consumed samples: 2849280 | consumed tokens: 5835325440 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.979968E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.857 | TFLOPs: 26.17 | +7: iteration 11140/ 60336 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.977725E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.282 | TFLOPs: 26.19 | +7: iteration 11150/ 60336 | consumed samples: 2854400 | consumed tokens: 5845811200 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.981333E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.180 | TFLOPs: 26.19 | +7: iteration 11160/ 60336 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.974073E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.644 | TFLOPs: 26.25 | +7: iteration 11170/ 60336 | consumed samples: 2859520 | consumed tokens: 5856296960 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.982285E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.378 | TFLOPs: 26.27 | +7: iteration 11180/ 60336 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.981413E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.927 | TFLOPs: 26.27 | +7: iteration 11190/ 60336 | consumed samples: 2864640 | consumed tokens: 5866782720 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.964082E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.561 | TFLOPs: 26.28 | +7: iteration 11200/ 60336 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.982809E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.047 | TFLOPs: 26.27 | +7: iteration 11210/ 60336 | consumed samples: 2869760 | consumed tokens: 5877268480 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.979376E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.767 | TFLOPs: 26.17 | +7: iteration 11220/ 60336 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.964378E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.480 | TFLOPs: 26.20 | +7: iteration 11230/ 60336 | consumed samples: 2874880 | consumed tokens: 5887754240 | elapsed time per iteration (s): 0.16 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.973166E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.564 | TFLOPs: 25.73 | +7: iteration 11240/ 60336 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.977147E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.964 | TFLOPs: 26.19 | +7: iteration 11250/ 60336 | consumed samples: 2880000 | consumed tokens: 5898240000 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.973522E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.742 | TFLOPs: 26.06 | +7: iteration 11260/ 60336 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.967414E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.171 | TFLOPs: 26.07 | +7: iteration 11270/ 60336 | consumed samples: 2885120 | consumed tokens: 5908725760 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.968118E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.093 | TFLOPs: 26.10 | +7: iteration 11280/ 60336 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.969167E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.318 | TFLOPs: 26.10 | +7: iteration 11290/ 60336 | consumed samples: 2890240 | consumed tokens: 5919211520 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.973701E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.920 | TFLOPs: 26.13 | +7: iteration 11300/ 60336 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.982824E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.477 | TFLOPs: 26.13 | +7: iteration 11310/ 60336 | consumed samples: 2895360 | consumed tokens: 5929697280 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.979350E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.091 | TFLOPs: 26.07 | +7: iteration 11320/ 60336 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.965562E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.359 | TFLOPs: 26.09 | +7: iteration 11330/ 60336 | consumed samples: 2900480 | consumed tokens: 5940183040 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.979587E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.785 | TFLOPs: 26.12 | +7: iteration 11340/ 60336 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.970349E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.497 | TFLOPs: 26.13 | +7: iteration 11350/ 60336 | consumed samples: 2905600 | consumed tokens: 5950668800 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.975503E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.894 | TFLOPs: 26.11 | +7: iteration 11360/ 60336 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.969607E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.752 | TFLOPs: 26.01 | +7: iteration 11370/ 60336 | consumed samples: 2910720 | consumed tokens: 5961154560 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.969765E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.936 | TFLOPs: 26.13 | +7: iteration 11380/ 60336 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 0.16 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.979356E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.019 | TFLOPs: 25.39 | +7: iteration 11390/ 60336 | consumed samples: 2915840 | consumed tokens: 5971640320 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.972387E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.718 | TFLOPs: 26.09 | +7: iteration 11400/ 60336 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.961230E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.035 | TFLOPs: 26.13 | +7: iteration 11410/ 60336 | consumed samples: 2920960 | consumed tokens: 5982126080 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.968512E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.628 | TFLOPs: 26.14 | +7: iteration 11420/ 60336 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.966600E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.894 | TFLOPs: 26.08 | +7: iteration 11430/ 60336 | consumed samples: 2926080 | consumed tokens: 5992611840 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.961960E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.975 | TFLOPs: 26.10 | +7: iteration 11440/ 60336 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.955653E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.318 | TFLOPs: 26.07 | +7: iteration 11450/ 60336 | consumed samples: 2931200 | consumed tokens: 6003097600 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.961425E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.263 | TFLOPs: 26.08 | +7: iteration 11460/ 60336 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.954243E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.320 | TFLOPs: 26.09 | +7: iteration 11470/ 60336 | consumed samples: 2936320 | consumed tokens: 6013583360 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.974696E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.646 | TFLOPs: 26.06 | +7: iteration 11480/ 60336 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.964260E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.515 | TFLOPs: 26.04 | +7: iteration 11490/ 60336 | consumed samples: 2941440 | consumed tokens: 6024069120 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.953619E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.324 | TFLOPs: 26.05 | +7: iteration 11500/ 60336 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.961912E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.666 | TFLOPs: 26.07 | +7: iteration 11510/ 60336 | consumed samples: 2946560 | consumed tokens: 6034554880 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.971406E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.515 | TFLOPs: 26.09 | +7: iteration 11520/ 60336 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.959194E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.415 | TFLOPs: 26.15 | +7: iteration 11530/ 60336 | consumed samples: 2951680 | consumed tokens: 6045040640 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.968529E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.431 | TFLOPs: 26.13 | +7: iteration 11540/ 60336 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.953608E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.110 | TFLOPs: 26.13 | +7: iteration 11550/ 60336 | consumed samples: 2956800 | consumed tokens: 6055526400 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.970156E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.093 | TFLOPs: 26.13 | +7: iteration 11560/ 60336 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.966292E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.959 | TFLOPs: 26.13 | +7: iteration 11570/ 60336 | consumed samples: 2961920 | consumed tokens: 6066012160 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.976247E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.936 | TFLOPs: 26.13 | +7: iteration 11580/ 60336 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.962826E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.113 | TFLOPs: 26.14 | +7: iteration 11590/ 60336 | consumed samples: 2967040 | consumed tokens: 6076497920 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.971247E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.397 | TFLOPs: 26.13 | +7: iteration 11600/ 60336 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.960672E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.127 | TFLOPs: 26.14 | +7: iteration 11610/ 60336 | consumed samples: 2972160 | consumed tokens: 6086983680 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.967150E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.734 | TFLOPs: 26.15 | +7: iteration 11620/ 60336 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.977248E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.738 | TFLOPs: 26.12 | +7: iteration 11630/ 60336 | consumed samples: 2977280 | consumed tokens: 6097469440 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.959536E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.803 | TFLOPs: 26.12 | +7: iteration 11640/ 60336 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.956712E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.086 | TFLOPs: 26.11 | +7: iteration 11650/ 60336 | consumed samples: 2982400 | consumed tokens: 6107955200 | elapsed time per iteration (s): 0.16 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.969019E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.266 | TFLOPs: 25.65 | +7: iteration 11660/ 60336 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 0.16 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.980346E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.474 | TFLOPs: 25.85 | +7: iteration 11670/ 60336 | consumed samples: 2987520 | consumed tokens: 6118440960 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.968602E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.474 | TFLOPs: 26.09 | +7: iteration 11680/ 60336 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.950870E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.949 | TFLOPs: 26.06 | +7: iteration 11690/ 60336 | consumed samples: 2992640 | consumed tokens: 6128926720 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.968711E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.808 | TFLOPs: 26.08 | +7: iteration 11700/ 60336 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.966536E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.844 | TFLOPs: 26.08 | +7: iteration 11710/ 60336 | consumed samples: 2997760 | consumed tokens: 6139412480 | elapsed time per iteration (s): 0.16 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.960128E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.396 | TFLOPs: 25.85 | +7: iteration 11720/ 60336 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.967381E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.376 | TFLOPs: 26.10 | +7: iteration 11730/ 60336 | consumed samples: 3002880 | consumed tokens: 6149898240 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.972257E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.029 | TFLOPs: 26.11 | +7: iteration 11740/ 60336 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 0.16 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.957215E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.256 | TFLOPs: 24.70 | +7: iteration 11750/ 60336 | consumed samples: 3008000 | consumed tokens: 6160384000 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.965200E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.783 | TFLOPs: 26.19 | +7: iteration 11760/ 60336 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.967187E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.601 | TFLOPs: 26.20 | +7: iteration 11770/ 60336 | consumed samples: 3013120 | consumed tokens: 6170869760 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.968031E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.787 | TFLOPs: 26.20 | +7: iteration 11780/ 60336 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.971242E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.112 | TFLOPs: 26.19 | +7: iteration 11790/ 60336 | consumed samples: 3018240 | consumed tokens: 6181355520 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.972766E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.141 | TFLOPs: 26.21 | +7: iteration 11800/ 60336 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.962837E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.572 | TFLOPs: 26.20 | +7: iteration 11810/ 60336 | consumed samples: 3023360 | consumed tokens: 6191841280 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.960594E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.668 | TFLOPs: 26.20 | +7: iteration 11820/ 60336 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.956517E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.194 | TFLOPs: 26.19 | +7: iteration 11830/ 60336 | consumed samples: 3028480 | consumed tokens: 6202327040 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.967505E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.645 | TFLOPs: 26.18 | +7: iteration 11840/ 60336 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.953685E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.983 | TFLOPs: 26.13 | +7: iteration 11850/ 60336 | consumed samples: 3033600 | consumed tokens: 6212812800 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.953984E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.710 | TFLOPs: 26.17 | +7: iteration 11860/ 60336 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.961127E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.526 | TFLOPs: 26.17 | +7: iteration 11870/ 60336 | consumed samples: 3038720 | consumed tokens: 6223298560 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.955572E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.650 | TFLOPs: 26.17 | +7: iteration 11880/ 60336 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.965789E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.762 | TFLOPs: 26.19 | +7: iteration 11890/ 60336 | consumed samples: 3043840 | consumed tokens: 6233784320 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.962109E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.401 | TFLOPs: 26.18 | +7: iteration 11900/ 60336 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.957578E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.922 | TFLOPs: 26.19 | +7: iteration 11910/ 60336 | consumed samples: 3048960 | consumed tokens: 6244270080 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.956236E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.682 | TFLOPs: 26.17 | +7: iteration 11920/ 60336 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.959594E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.182 | TFLOPs: 26.21 | +7: iteration 11930/ 60336 | consumed samples: 3054080 | consumed tokens: 6254755840 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.953119E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.002 | TFLOPs: 26.19 | +7: iteration 11940/ 60336 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.963140E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.256 | TFLOPs: 26.15 | +7: iteration 11950/ 60336 | consumed samples: 3059200 | consumed tokens: 6265241600 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.971898E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.989 | TFLOPs: 26.17 | +7: iteration 11960/ 60336 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.955325E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.387 | TFLOPs: 26.18 | +7: iteration 11970/ 60336 | consumed samples: 3064320 | consumed tokens: 6275727360 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.953688E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.237 | TFLOPs: 26.15 | +7: iteration 11980/ 60336 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.956930E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.913 | TFLOPs: 26.11 | +7: iteration 11990/ 60336 | consumed samples: 3069440 | consumed tokens: 6286213120 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.958428E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.056 | TFLOPs: 26.13 | +0: [2023-03-17 00:50:19,481] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[0.00018431084145627857, 0.00018431084145627857, 0.00018431084145627857], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 12000/ 60336 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.948229E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.510 | TFLOPs: 26.14 | +0: steps: 12000 loss: 3.9599 iter time (s): 0.153 samples/sec: 1678.562 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 12000 | lm loss value: 3.968434E+00 | lm loss PPL: 5.290161E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 12000 to checkpoints_44m32b100m +0: [2023-03-17 00:50:19,554] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is begin to save! +0: [2023-03-17 00:50:19,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:50:19,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:50:19,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:50:19,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:50:19,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:50:19,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:50:19,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:50:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:50:19,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:50:19,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:50:19,653] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:50:19,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:50:19,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:50:19,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:50:19,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:50:19,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:50:19,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:50:19,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:50:19,685] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:50:19,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:50:19,686] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step12000/mp_rank_00_model_states.pt +0: [2023-03-17 00:50:19,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:50:19,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:50:19,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:50:19,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:50:19,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:50:19,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:50:19,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:50:19,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:50:19,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:50:19,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: successfully saved checkpoint at iteration 12000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.08 +7: iteration 12010/ 60336 | consumed samples: 3074560 | consumed tokens: 6296698880 | elapsed time per iteration (s): 0.18 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.956091E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.175 | TFLOPs: 22.60 | +7: iteration 12020/ 60336 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.968303E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.844 | TFLOPs: 26.20 | +7: iteration 12030/ 60336 | consumed samples: 3079680 | consumed tokens: 6307184640 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.956691E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.521 | TFLOPs: 26.20 | +7: iteration 12040/ 60336 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.966205E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.945 | TFLOPs: 26.19 | +7: iteration 12050/ 60336 | consumed samples: 3084800 | consumed tokens: 6317670400 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.937980E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.612 | TFLOPs: 26.18 | +7: iteration 12060/ 60336 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.953591E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.640 | TFLOPs: 26.20 | +7: iteration 12070/ 60336 | consumed samples: 3089920 | consumed tokens: 6328156160 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.959192E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.935 | TFLOPs: 26.19 | +7: iteration 12080/ 60336 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.958432E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.466 | TFLOPs: 26.20 | +7: iteration 12090/ 60336 | consumed samples: 3095040 | consumed tokens: 6338641920 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.952937E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.004 | TFLOPs: 26.21 | +7: iteration 12100/ 60336 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.951833E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.511 | TFLOPs: 26.18 | +7: iteration 12110/ 60336 | consumed samples: 3100160 | consumed tokens: 6349127680 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.965689E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.924 | TFLOPs: 26.20 | +7: iteration 12120/ 60336 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.939700E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.669 | TFLOPs: 26.23 | +7: iteration 12130/ 60336 | consumed samples: 3105280 | consumed tokens: 6359613440 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.951332E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.310 | TFLOPs: 26.05 | +7: iteration 12140/ 60336 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.947026E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.056 | TFLOPs: 26.21 | +7: iteration 12150/ 60336 | consumed samples: 3110400 | consumed tokens: 6370099200 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.970241E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.155 | TFLOPs: 26.19 | +7: iteration 12160/ 60336 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.959436E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.072 | TFLOPs: 26.22 | +7: iteration 12170/ 60336 | consumed samples: 3115520 | consumed tokens: 6380584960 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.953174E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.532 | TFLOPs: 26.21 | +7: iteration 12180/ 60336 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.963558E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 12190/ 60336 | consumed samples: 3120640 | consumed tokens: 6391070720 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.958572E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.808 | TFLOPs: 26.20 | +7: iteration 12200/ 60336 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.954812E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.879 | TFLOPs: 26.20 | +7: iteration 12210/ 60336 | consumed samples: 3125760 | consumed tokens: 6401556480 | elapsed time per iteration (s): 0.16 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.959031E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.337 | TFLOPs: 25.88 | +7: iteration 12220/ 60336 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.948595E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.312 | TFLOPs: 26.19 | +7: iteration 12230/ 60336 | consumed samples: 3130880 | consumed tokens: 6412042240 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.950265E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.118 | TFLOPs: 26.21 | +7: iteration 12240/ 60336 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.939416E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.589 | TFLOPs: 26.23 | +7: iteration 12250/ 60336 | consumed samples: 3136000 | consumed tokens: 6422528000 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.940865E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.833 | TFLOPs: 26.19 | +7: iteration 12260/ 60336 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.954946E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.942 | TFLOPs: 26.24 | +7: iteration 12270/ 60336 | consumed samples: 3141120 | consumed tokens: 6433013760 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.931947E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.414 | TFLOPs: 26.23 | +7: iteration 12280/ 60336 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.963234E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.968 | TFLOPs: 26.20 | +7: iteration 12290/ 60336 | consumed samples: 3146240 | consumed tokens: 6443499520 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.947144E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.030 | TFLOPs: 26.22 | +7: iteration 12300/ 60336 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.958698E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.221 | TFLOPs: 26.22 | +7: iteration 12310/ 60336 | consumed samples: 3151360 | consumed tokens: 6453985280 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.959275E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.082 | TFLOPs: 26.10 | +7: iteration 12320/ 60336 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.960882E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.617 | TFLOPs: 26.12 | +7: iteration 12330/ 60336 | consumed samples: 3156480 | consumed tokens: 6464471040 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.950507E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.638 | TFLOPs: 26.11 | +7: iteration 12340/ 60336 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.946207E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.972 | TFLOPs: 26.11 | +7: iteration 12350/ 60336 | consumed samples: 3161600 | consumed tokens: 6474956800 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.939159E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.647 | TFLOPs: 26.14 | +7: iteration 12360/ 60336 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.946183E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.689 | TFLOPs: 26.12 | +7: iteration 12370/ 60336 | consumed samples: 3166720 | consumed tokens: 6485442560 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.949247E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.536 | TFLOPs: 26.10 | +7: iteration 12380/ 60336 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.956924E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.894 | TFLOPs: 26.11 | +7: iteration 12390/ 60336 | consumed samples: 3171840 | consumed tokens: 6495928320 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.948514E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.259 | TFLOPs: 26.10 | +7: iteration 12400/ 60336 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.945326E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.803 | TFLOPs: 26.12 | +7: iteration 12410/ 60336 | consumed samples: 3176960 | consumed tokens: 6506414080 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.943986E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.804 | TFLOPs: 26.06 | +7: iteration 12420/ 60336 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.943086E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.963 | TFLOPs: 26.10 | +7: iteration 12430/ 60336 | consumed samples: 3182080 | consumed tokens: 6516899840 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.949564E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.934 | TFLOPs: 26.11 | +7: iteration 12440/ 60336 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.951172E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.105 | TFLOPs: 26.05 | +7: iteration 12450/ 60336 | consumed samples: 3187200 | consumed tokens: 6527385600 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.933139E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.677 | TFLOPs: 26.04 | +7: iteration 12460/ 60336 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.970579E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.350 | TFLOPs: 26.05 | +7: iteration 12470/ 60336 | consumed samples: 3192320 | consumed tokens: 6537871360 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.932966E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.079 | TFLOPs: 26.07 | +7: iteration 12480/ 60336 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.952798E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.390 | TFLOPs: 26.05 | +7: iteration 12490/ 60336 | consumed samples: 3197440 | consumed tokens: 6548357120 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.953854E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.380 | TFLOPs: 26.02 | +7: iteration 12500/ 60336 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.958050E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.174 | TFLOPs: 26.05 | +7: iteration 12510/ 60336 | consumed samples: 3202560 | consumed tokens: 6558842880 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.950487E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.052 | TFLOPs: 26.03 | +7: iteration 12520/ 60336 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.956292E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.096 | TFLOPs: 26.05 | +7: iteration 12530/ 60336 | consumed samples: 3207680 | consumed tokens: 6569328640 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.953250E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.084 | TFLOPs: 26.07 | +7: iteration 12540/ 60336 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.949733E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.050 | TFLOPs: 26.07 | +7: iteration 12550/ 60336 | consumed samples: 3212800 | consumed tokens: 6579814400 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.938094E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.042 | TFLOPs: 26.05 | +7: iteration 12560/ 60336 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.963333E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.579 | TFLOPs: 26.09 | +7: iteration 12570/ 60336 | consumed samples: 3217920 | consumed tokens: 6590300160 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.961258E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.515 | TFLOPs: 26.07 | +7: iteration 12580/ 60336 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.950344E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.575 | TFLOPs: 26.09 | +7: iteration 12590/ 60336 | consumed samples: 3223040 | consumed tokens: 6600785920 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.943512E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.347 | TFLOPs: 26.05 | +7: iteration 12600/ 60336 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.958811E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.270 | TFLOPs: 26.07 | +7: iteration 12610/ 60336 | consumed samples: 3228160 | consumed tokens: 6611271680 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.948618E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.007 | TFLOPs: 26.10 | +7: iteration 12620/ 60336 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.942373E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.874 | TFLOPs: 26.09 | +7: iteration 12630/ 60336 | consumed samples: 3233280 | consumed tokens: 6621757440 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.938703E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.324 | TFLOPs: 26.09 | +7: iteration 12640/ 60336 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.951975E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.537 | TFLOPs: 26.07 | +7: iteration 12650/ 60336 | consumed samples: 3238400 | consumed tokens: 6632243200 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.947368E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.292 | TFLOPs: 26.07 | +7: iteration 12660/ 60336 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 0.16 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.942049E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.210 | TFLOPs: 25.71 | +7: iteration 12670/ 60336 | consumed samples: 3243520 | consumed tokens: 6642728960 | elapsed time per iteration (s): 0.16 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.951014E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.251 | TFLOPs: 25.85 | +7: iteration 12680/ 60336 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.943494E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.714 | TFLOPs: 26.08 | +7: iteration 12690/ 60336 | consumed samples: 3248640 | consumed tokens: 6653214720 | elapsed time per iteration (s): 0.16 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.949198E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.103 | TFLOPs: 25.50 | +7: iteration 12700/ 60336 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.947668E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.252 | TFLOPs: 26.08 | +7: iteration 12710/ 60336 | consumed samples: 3253760 | consumed tokens: 6663700480 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.942169E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.452 | TFLOPs: 26.07 | +7: iteration 12720/ 60336 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.941121E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.359 | TFLOPs: 26.13 | +7: iteration 12730/ 60336 | consumed samples: 3258880 | consumed tokens: 6674186240 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.945460E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.415 | TFLOPs: 26.16 | +7: iteration 12740/ 60336 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.941982E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.975 | TFLOPs: 26.14 | +7: iteration 12750/ 60336 | consumed samples: 3264000 | consumed tokens: 6684672000 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.936406E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.881 | TFLOPs: 26.16 | +7: iteration 12760/ 60336 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.957350E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.000 | TFLOPs: 26.16 | +7: iteration 12770/ 60336 | consumed samples: 3269120 | consumed tokens: 6695157760 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.956870E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.385 | TFLOPs: 26.16 | +7: iteration 12780/ 60336 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.941227E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.106 | TFLOPs: 26.14 | +7: iteration 12790/ 60336 | consumed samples: 3274240 | consumed tokens: 6705643520 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.949846E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.720 | TFLOPs: 26.15 | +7: iteration 12800/ 60336 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.962693E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.684 | TFLOPs: 26.18 | +7: iteration 12810/ 60336 | consumed samples: 3279360 | consumed tokens: 6716129280 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.945078E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.073 | TFLOPs: 26.18 | +7: iteration 12820/ 60336 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.932321E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.127 | TFLOPs: 26.16 | +7: iteration 12830/ 60336 | consumed samples: 3284480 | consumed tokens: 6726615040 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.946198E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.438 | TFLOPs: 26.17 | +7: iteration 12840/ 60336 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.950975E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.060 | TFLOPs: 26.14 | +7: iteration 12850/ 60336 | consumed samples: 3289600 | consumed tokens: 6737100800 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.944701E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.730 | TFLOPs: 26.15 | +7: iteration 12860/ 60336 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.942585E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.922 | TFLOPs: 26.17 | +7: iteration 12870/ 60336 | consumed samples: 3294720 | consumed tokens: 6747586560 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.949769E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.383 | TFLOPs: 26.16 | +7: iteration 12880/ 60336 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.960164E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.905 | TFLOPs: 26.16 | +7: iteration 12890/ 60336 | consumed samples: 3299840 | consumed tokens: 6758072320 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.933657E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.057 | TFLOPs: 26.16 | +7: iteration 12900/ 60336 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.935802E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.435 | TFLOPs: 26.18 | +7: iteration 12910/ 60336 | consumed samples: 3304960 | consumed tokens: 6768558080 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.936396E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.123 | TFLOPs: 26.18 | +7: iteration 12920/ 60336 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.948598E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.990 | TFLOPs: 26.14 | +7: iteration 12930/ 60336 | consumed samples: 3310080 | consumed tokens: 6779043840 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.936357E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.142 | TFLOPs: 26.18 | +7: iteration 12940/ 60336 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.945076E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.431 | TFLOPs: 26.15 | +7: iteration 12950/ 60336 | consumed samples: 3315200 | consumed tokens: 6789529600 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.939759E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.343 | TFLOPs: 26.16 | +7: iteration 12960/ 60336 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.931355E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.300 | TFLOPs: 26.15 | +7: iteration 12970/ 60336 | consumed samples: 3320320 | consumed tokens: 6800015360 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.950402E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.413 | TFLOPs: 26.15 | +7: iteration 12980/ 60336 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.935435E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.373 | TFLOPs: 26.15 | +7: iteration 12990/ 60336 | consumed samples: 3325440 | consumed tokens: 6810501120 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.948300E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.046 | TFLOPs: 26.17 | +7: iteration 13000/ 60336 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.931924E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.363 | TFLOPs: 26.15 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 13000 | lm loss value: 4.048470E+00 | lm loss PPL: 5.730968E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 13000 to checkpoints_44m32b100m +0: [2023-03-17 00:52:53,464] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step13000 is begin to save! +0: [2023-03-17 00:52:53,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:52:53,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:52:53,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:52:53,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:52:53,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:52:53,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:52:53,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:52:53,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:52:53,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:52:53,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:52:53,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:52:53,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:52:53,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:52:53,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:52:53,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:52:53,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:52:53,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:52:53,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:52:53,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:52:53,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:52:53,596] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step13000/mp_rank_00_model_states.pt +0: [2023-03-17 00:52:53,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:52:53,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:53,615] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:53,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:52:53,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:52:53,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:53,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: successfully saved checkpoint at iteration 13000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.85 +7: iteration 13010/ 60336 | consumed samples: 3330560 | consumed tokens: 6820986880 | elapsed time per iteration (s): 0.18 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.919554E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.280 | TFLOPs: 22.76 | +7: iteration 13020/ 60336 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.925409E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.443 | TFLOPs: 26.15 | +7: iteration 13030/ 60336 | consumed samples: 3335680 | consumed tokens: 6831472640 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.929282E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.083 | TFLOPs: 26.16 | +7: iteration 13040/ 60336 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.943760E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.683 | TFLOPs: 26.11 | +7: iteration 13050/ 60336 | consumed samples: 3340800 | consumed tokens: 6841958400 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.954393E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.360 | TFLOPs: 26.10 | +7: iteration 13060/ 60336 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.944293E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.784 | TFLOPs: 26.12 | +7: iteration 13070/ 60336 | consumed samples: 3345920 | consumed tokens: 6852444160 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.932325E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.924 | TFLOPs: 26.13 | +7: iteration 13080/ 60336 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.940564E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.083 | TFLOPs: 26.10 | +7: iteration 13090/ 60336 | consumed samples: 3351040 | consumed tokens: 6862929920 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.939791E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.799 | TFLOPs: 26.12 | +7: iteration 13100/ 60336 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.924920E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.188 | TFLOPs: 26.10 | +7: iteration 13110/ 60336 | consumed samples: 3356160 | consumed tokens: 6873415680 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.934957E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.288 | TFLOPs: 26.02 | +7: iteration 13120/ 60336 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.937663E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.215 | TFLOPs: 26.05 | +7: iteration 13130/ 60336 | consumed samples: 3361280 | consumed tokens: 6883901440 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.945575E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.399 | TFLOPs: 26.05 | +7: iteration 13140/ 60336 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.931352E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.171 | TFLOPs: 26.05 | +7: iteration 13150/ 60336 | consumed samples: 3366400 | consumed tokens: 6894387200 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.927188E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.464 | TFLOPs: 26.06 | +7: iteration 13160/ 60336 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.922137E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.454 | TFLOPs: 26.09 | +7: iteration 13170/ 60336 | consumed samples: 3371520 | consumed tokens: 6904872960 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.940374E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.514 | TFLOPs: 26.10 | +7: iteration 13180/ 60336 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.937363E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.631 | TFLOPs: 26.11 | +7: iteration 13190/ 60336 | consumed samples: 3376640 | consumed tokens: 6915358720 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.930991E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.714 | TFLOPs: 26.04 | +7: iteration 13200/ 60336 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 0.16 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.932681E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.182 | TFLOPs: 25.39 | +7: iteration 13210/ 60336 | consumed samples: 3381760 | consumed tokens: 6925844480 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.926104E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.191 | TFLOPs: 25.96 | +7: iteration 13220/ 60336 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.945140E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.937 | TFLOPs: 26.11 | +7: iteration 13230/ 60336 | consumed samples: 3386880 | consumed tokens: 6936330240 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.924960E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.429 | TFLOPs: 26.13 | +7: iteration 13240/ 60336 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.931307E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.729 | TFLOPs: 26.12 | +7: iteration 13250/ 60336 | consumed samples: 3392000 | consumed tokens: 6946816000 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.942319E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.442 | TFLOPs: 26.09 | +7: iteration 13260/ 60336 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.943544E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.833 | TFLOPs: 26.09 | +7: iteration 13270/ 60336 | consumed samples: 3397120 | consumed tokens: 6957301760 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.943572E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.106 | TFLOPs: 26.08 | +7: iteration 13280/ 60336 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.948693E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.948 | TFLOPs: 26.09 | +7: iteration 13290/ 60336 | consumed samples: 3402240 | consumed tokens: 6967787520 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.939677E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.457 | TFLOPs: 26.09 | +7: iteration 13300/ 60336 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.925812E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.858 | TFLOPs: 26.11 | +7: iteration 13310/ 60336 | consumed samples: 3407360 | consumed tokens: 6978273280 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.944475E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.734 | TFLOPs: 26.11 | +7: iteration 13320/ 60336 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.927121E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.171 | TFLOPs: 26.11 | +7: iteration 13330/ 60336 | consumed samples: 3412480 | consumed tokens: 6988759040 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.931867E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.582 | TFLOPs: 26.10 | +7: iteration 13340/ 60336 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.949022E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.600 | TFLOPs: 26.09 | +7: iteration 13350/ 60336 | consumed samples: 3417600 | consumed tokens: 6999244800 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.921841E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.160 | TFLOPs: 26.10 | +7: iteration 13360/ 60336 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.934679E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.261 | TFLOPs: 26.10 | +7: iteration 13370/ 60336 | consumed samples: 3422720 | consumed tokens: 7009730560 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.939034E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.355 | TFLOPs: 26.09 | +7: iteration 13380/ 60336 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.942272E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.986 | TFLOPs: 26.10 | +7: iteration 13390/ 60336 | consumed samples: 3427840 | consumed tokens: 7020216320 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.929008E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.640 | TFLOPs: 26.11 | +7: iteration 13400/ 60336 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.939750E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.723 | TFLOPs: 26.11 | +7: iteration 13410/ 60336 | consumed samples: 3432960 | consumed tokens: 7030702080 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.922523E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.542 | TFLOPs: 26.10 | +7: iteration 13420/ 60336 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.929128E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.196 | TFLOPs: 26.05 | +7: iteration 13430/ 60336 | consumed samples: 3438080 | consumed tokens: 7041187840 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.936212E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.040 | TFLOPs: 26.03 | +7: iteration 13440/ 60336 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.943073E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.915 | TFLOPs: 26.00 | +7: iteration 13450/ 60336 | consumed samples: 3443200 | consumed tokens: 7051673600 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.922967E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.923 | TFLOPs: 26.00 | +7: iteration 13460/ 60336 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.933675E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.302 | TFLOPs: 26.02 | +7: iteration 13470/ 60336 | consumed samples: 3448320 | consumed tokens: 7062159360 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.935978E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.050 | TFLOPs: 26.02 | +7: iteration 13480/ 60336 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.924980E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.793 | TFLOPs: 26.05 | +7: iteration 13490/ 60336 | consumed samples: 3453440 | consumed tokens: 7072645120 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.938165E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.330 | TFLOPs: 26.02 | +7: iteration 13500/ 60336 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.934360E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.200 | TFLOPs: 26.11 | +7: iteration 13510/ 60336 | consumed samples: 3458560 | consumed tokens: 7083130880 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.936964E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 13520/ 60336 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.948639E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.519 | TFLOPs: 26.09 | +7: iteration 13530/ 60336 | consumed samples: 3463680 | consumed tokens: 7093616640 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.926437E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.460 | TFLOPs: 26.04 | +7: iteration 13540/ 60336 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.924389E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.722 | TFLOPs: 26.04 | +7: iteration 13550/ 60336 | consumed samples: 3468800 | consumed tokens: 7104102400 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.931866E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.839 | TFLOPs: 26.01 | +7: iteration 13560/ 60336 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.920911E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.496 | TFLOPs: 26.06 | +7: iteration 13570/ 60336 | consumed samples: 3473920 | consumed tokens: 7114588160 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.925583E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.790 | TFLOPs: 25.98 | +7: iteration 13580/ 60336 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.909286E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.744 | TFLOPs: 26.04 | +7: iteration 13590/ 60336 | consumed samples: 3479040 | consumed tokens: 7125073920 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.927541E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.142 | TFLOPs: 26.04 | +7: iteration 13600/ 60336 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.928864E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.157 | TFLOPs: 26.00 | +7: iteration 13610/ 60336 | consumed samples: 3484160 | consumed tokens: 7135559680 | elapsed time per iteration (s): 0.16 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.929126E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.759 | TFLOPs: 25.86 | +7: iteration 13620/ 60336 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.934320E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.492 | TFLOPs: 26.02 | +7: iteration 13630/ 60336 | consumed samples: 3489280 | consumed tokens: 7146045440 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.931075E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.174 | TFLOPs: 26.04 | +7: iteration 13640/ 60336 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.919840E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.676 | TFLOPs: 25.95 | +7: iteration 13650/ 60336 | consumed samples: 3494400 | consumed tokens: 7156531200 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.939733E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.442 | TFLOPs: 26.02 | +7: iteration 13660/ 60336 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.921972E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.495 | TFLOPs: 26.04 | +7: iteration 13670/ 60336 | consumed samples: 3499520 | consumed tokens: 7167016960 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.921455E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.155 | TFLOPs: 26.04 | +7: iteration 13680/ 60336 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.926011E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.132 | TFLOPs: 26.04 | +7: iteration 13690/ 60336 | consumed samples: 3504640 | consumed tokens: 7177502720 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.933513E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.000 | TFLOPs: 26.02 | +7: iteration 13700/ 60336 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.922564E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.366 | TFLOPs: 26.04 | +7: iteration 13710/ 60336 | consumed samples: 3509760 | consumed tokens: 7187988480 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.940789E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.515 | TFLOPs: 26.04 | +7: iteration 13720/ 60336 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.929982E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.072 | TFLOPs: 26.03 | +7: iteration 13730/ 60336 | consumed samples: 3514880 | consumed tokens: 7198474240 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.931316E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.504 | TFLOPs: 26.04 | +7: iteration 13740/ 60336 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.946132E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.351 | TFLOPs: 26.04 | +7: iteration 13750/ 60336 | consumed samples: 3520000 | consumed tokens: 7208960000 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.937807E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.358 | TFLOPs: 26.04 | +7: iteration 13760/ 60336 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.934798E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.077 | TFLOPs: 26.05 | +7: iteration 13770/ 60336 | consumed samples: 3525120 | consumed tokens: 7219445760 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.936132E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.644 | TFLOPs: 26.06 | +7: iteration 13780/ 60336 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.924080E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.088 | TFLOPs: 26.11 | +7: iteration 13790/ 60336 | consumed samples: 3530240 | consumed tokens: 7229931520 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.932066E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.704 | TFLOPs: 26.11 | +7: iteration 13800/ 60336 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.940482E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.408 | TFLOPs: 26.10 | +7: iteration 13810/ 60336 | consumed samples: 3535360 | consumed tokens: 7240417280 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.925845E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.698 | TFLOPs: 26.09 | +7: iteration 13820/ 60336 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.928391E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.007 | TFLOPs: 26.10 | +7: iteration 13830/ 60336 | consumed samples: 3540480 | consumed tokens: 7250903040 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.929121E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.074 | TFLOPs: 26.10 | +7: iteration 13840/ 60336 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.919478E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.967 | TFLOPs: 26.06 | +7: iteration 13850/ 60336 | consumed samples: 3545600 | consumed tokens: 7261388800 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.926923E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.804 | TFLOPs: 26.09 | +7: iteration 13860/ 60336 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.926606E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.368 | TFLOPs: 26.10 | +7: iteration 13870/ 60336 | consumed samples: 3550720 | consumed tokens: 7271874560 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.918476E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.348 | TFLOPs: 26.09 | +7: iteration 13880/ 60336 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.939111E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.504 | TFLOPs: 26.10 | +7: iteration 13890/ 60336 | consumed samples: 3555840 | consumed tokens: 7282360320 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.917303E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.989 | TFLOPs: 26.06 | +7: iteration 13900/ 60336 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.928048E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.738 | TFLOPs: 26.09 | +7: iteration 13910/ 60336 | consumed samples: 3560960 | consumed tokens: 7292846080 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.939198E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.061 | TFLOPs: 26.07 | +7: iteration 13920/ 60336 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.931229E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.753 | TFLOPs: 26.09 | +7: iteration 13930/ 60336 | consumed samples: 3566080 | consumed tokens: 7303331840 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.919541E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.821 | TFLOPs: 26.09 | +7: iteration 13940/ 60336 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.918724E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.220 | TFLOPs: 26.08 | +7: iteration 13950/ 60336 | consumed samples: 3571200 | consumed tokens: 7313817600 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.931775E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.935 | TFLOPs: 26.09 | +7: iteration 13960/ 60336 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 0.16 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.926484E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.994 | TFLOPs: 25.75 | +7: iteration 13970/ 60336 | consumed samples: 3576320 | consumed tokens: 7324303360 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.930564E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.511 | TFLOPs: 26.14 | +7: iteration 13980/ 60336 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 0.16 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.926075E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.789 | TFLOPs: 25.81 | +7: iteration 13990/ 60336 | consumed samples: 3581440 | consumed tokens: 7334789120 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.917050E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.946 | TFLOPs: 26.13 | +0: [2023-03-17 00:55:27,707] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[0.0001785692252468874, 0.0001785692252468874, 0.0001785692252468874], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 14000/ 60336 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.915736E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.574 | TFLOPs: 26.14 | +0: steps: 14000 loss: 3.9017 iter time (s): 0.152 samples/sec: 1680.229 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 14000 | lm loss value: 3.992917E+00 | lm loss PPL: 5.421278E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 14000 to checkpoints_44m32b100m +0: [2023-03-17 00:55:27,779] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is begin to save! +0: [2023-03-17 00:55:27,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:55:27,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:55:27,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:55:27,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:55:27,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:55:27,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:55:27,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:55:27,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:55:27,868] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:55:27,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:55:27,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:55:27,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:55:27,884] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:55:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:55:27,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:55:27,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:55:27,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:55:27,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:55:27,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:55:27,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:55:27,909] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step14000/mp_rank_00_model_states.pt +0: [2023-03-17 00:55:27,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:55:27,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:55:27,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:55:27,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,942] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:55:27,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,948] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:55:27,950] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:55:27,950] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: successfully saved checkpoint at iteration 14000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 174.39 +7: iteration 14010/ 60336 | consumed samples: 3586560 | consumed tokens: 7345274880 | elapsed time per iteration (s): 0.18 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.929934E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.975 | TFLOPs: 22.60 | +7: iteration 14020/ 60336 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.935948E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.313 | TFLOPs: 26.16 | +7: iteration 14030/ 60336 | consumed samples: 3591680 | consumed tokens: 7355760640 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.929018E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.076 | TFLOPs: 26.13 | +7: iteration 14040/ 60336 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.918207E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.372 | TFLOPs: 26.18 | +7: iteration 14050/ 60336 | consumed samples: 3596800 | consumed tokens: 7366246400 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.912362E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.364 | TFLOPs: 26.16 | +7: iteration 14060/ 60336 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.930200E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.767 | TFLOPs: 26.15 | +7: iteration 14070/ 60336 | consumed samples: 3601920 | consumed tokens: 7376732160 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.923785E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.084 | TFLOPs: 26.18 | +7: iteration 14080/ 60336 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.919293E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.692 | TFLOPs: 26.25 | +7: iteration 14090/ 60336 | consumed samples: 3607040 | consumed tokens: 7387217920 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.928629E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.156 | TFLOPs: 26.24 | +7: iteration 14100/ 60336 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.918547E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.478 | TFLOPs: 26.23 | +7: iteration 14110/ 60336 | consumed samples: 3612160 | consumed tokens: 7397703680 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.920160E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.097 | TFLOPs: 26.14 | +7: iteration 14120/ 60336 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.920757E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.741 | TFLOPs: 26.04 | +7: iteration 14130/ 60336 | consumed samples: 3617280 | consumed tokens: 7408189440 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.928897E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.448 | TFLOPs: 26.12 | +7: iteration 14140/ 60336 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.917348E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.345 | TFLOPs: 26.12 | +7: iteration 14150/ 60336 | consumed samples: 3622400 | consumed tokens: 7418675200 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.925957E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.274 | TFLOPs: 26.13 | +7: iteration 14160/ 60336 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.921215E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.151 | TFLOPs: 26.13 | +7: iteration 14170/ 60336 | consumed samples: 3627520 | consumed tokens: 7429160960 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.930437E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.182 | TFLOPs: 26.13 | +7: iteration 14180/ 60336 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.921986E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.526 | TFLOPs: 26.14 | +7: iteration 14190/ 60336 | consumed samples: 3632640 | consumed tokens: 7439646720 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.936795E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.952 | TFLOPs: 26.13 | +7: iteration 14200/ 60336 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.928783E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.028 | TFLOPs: 26.06 | +7: iteration 14210/ 60336 | consumed samples: 3637760 | consumed tokens: 7450132480 | elapsed time per iteration (s): 0.16 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.916976E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.576 | TFLOPs: 25.48 | +7: iteration 14220/ 60336 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 0.16 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.939119E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.275 | TFLOPs: 25.86 | +7: iteration 14230/ 60336 | consumed samples: 3642880 | consumed tokens: 7460618240 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.939113E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.727 | TFLOPs: 26.08 | +7: iteration 14240/ 60336 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.936193E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.597 | TFLOPs: 26.07 | +7: iteration 14250/ 60336 | consumed samples: 3648000 | consumed tokens: 7471104000 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.920346E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.922 | TFLOPs: 26.09 | +7: iteration 14260/ 60336 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.918870E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.289 | TFLOPs: 26.12 | +7: iteration 14270/ 60336 | consumed samples: 3653120 | consumed tokens: 7481589760 | elapsed time per iteration (s): 0.16 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.925422E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.974 | TFLOPs: 25.59 | +7: iteration 14280/ 60336 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.912157E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.521 | TFLOPs: 26.14 | +7: iteration 14290/ 60336 | consumed samples: 3658240 | consumed tokens: 7492075520 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.923282E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.077 | TFLOPs: 26.14 | +7: iteration 14300/ 60336 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.931123E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.609 | TFLOPs: 26.15 | +7: iteration 14310/ 60336 | consumed samples: 3663360 | consumed tokens: 7502561280 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.929504E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.884 | TFLOPs: 26.16 | +7: iteration 14320/ 60336 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.919689E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.835 | TFLOPs: 26.17 | +7: iteration 14330/ 60336 | consumed samples: 3668480 | consumed tokens: 7513047040 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.918269E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.787 | TFLOPs: 26.08 | +7: iteration 14340/ 60336 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.922494E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.003 | TFLOPs: 26.02 | +7: iteration 14350/ 60336 | consumed samples: 3673600 | consumed tokens: 7523532800 | elapsed time per iteration (s): 0.16 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.903264E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.158 | TFLOPs: 25.78 | +7: iteration 14360/ 60336 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.918416E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.738 | TFLOPs: 26.03 | +7: iteration 14370/ 60336 | consumed samples: 3678720 | consumed tokens: 7534018560 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.921310E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.317 | TFLOPs: 26.01 | +7: iteration 14380/ 60336 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.922717E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.562 | TFLOPs: 26.01 | +7: iteration 14390/ 60336 | consumed samples: 3683840 | consumed tokens: 7544504320 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.927346E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.203 | TFLOPs: 26.02 | +7: iteration 14400/ 60336 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.928468E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.355 | TFLOPs: 26.02 | +7: iteration 14410/ 60336 | consumed samples: 3688960 | consumed tokens: 7554990080 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.918108E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.624 | TFLOPs: 26.04 | +7: iteration 14420/ 60336 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.931729E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.384 | TFLOPs: 26.04 | +7: iteration 14430/ 60336 | consumed samples: 3694080 | consumed tokens: 7565475840 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.910175E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.209 | TFLOPs: 26.02 | +7: iteration 14440/ 60336 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.910695E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.464 | TFLOPs: 26.04 | +7: iteration 14450/ 60336 | consumed samples: 3699200 | consumed tokens: 7575961600 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.923536E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.215 | TFLOPs: 26.04 | +7: iteration 14460/ 60336 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.907418E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.231 | TFLOPs: 26.05 | +7: iteration 14470/ 60336 | consumed samples: 3704320 | consumed tokens: 7586447360 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.914967E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.412 | TFLOPs: 26.06 | +7: iteration 14480/ 60336 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.930894E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.057 | TFLOPs: 26.07 | +7: iteration 14490/ 60336 | consumed samples: 3709440 | consumed tokens: 7596933120 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.931285E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.462 | TFLOPs: 26.06 | +7: iteration 14500/ 60336 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.931824E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.389 | TFLOPs: 26.05 | +7: iteration 14510/ 60336 | consumed samples: 3714560 | consumed tokens: 7607418880 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.915169E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.181 | TFLOPs: 26.07 | +7: iteration 14520/ 60336 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.913094E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.362 | TFLOPs: 26.04 | +7: iteration 14530/ 60336 | consumed samples: 3719680 | consumed tokens: 7617904640 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.921513E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.605 | TFLOPs: 26.04 | +7: iteration 14540/ 60336 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.908429E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.008 | TFLOPs: 26.06 | +7: iteration 14550/ 60336 | consumed samples: 3724800 | consumed tokens: 7628390400 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.911705E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.604 | TFLOPs: 26.03 | +7: iteration 14560/ 60336 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.930476E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.380 | TFLOPs: 26.02 | +7: iteration 14570/ 60336 | consumed samples: 3729920 | consumed tokens: 7638876160 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.921371E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.337 | TFLOPs: 26.04 | +7: iteration 14580/ 60336 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.905094E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.756 | TFLOPs: 26.01 | +7: iteration 14590/ 60336 | consumed samples: 3735040 | consumed tokens: 7649361920 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.918518E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.507 | TFLOPs: 26.04 | +7: iteration 14600/ 60336 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.926310E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.484 | TFLOPs: 26.04 | +7: iteration 14610/ 60336 | consumed samples: 3740160 | consumed tokens: 7659847680 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.915025E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.344 | TFLOPs: 26.04 | +7: iteration 14620/ 60336 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.900244E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.823 | TFLOPs: 26.01 | +7: iteration 14630/ 60336 | consumed samples: 3745280 | consumed tokens: 7670333440 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.910940E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.850 | TFLOPs: 26.01 | +7: iteration 14640/ 60336 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.910705E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.032 | TFLOPs: 26.02 | +7: iteration 14650/ 60336 | consumed samples: 3750400 | consumed tokens: 7680819200 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.908208E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.564 | TFLOPs: 26.01 | +7: iteration 14660/ 60336 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.923277E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.178 | TFLOPs: 26.02 | +7: iteration 14670/ 60336 | consumed samples: 3755520 | consumed tokens: 7691304960 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.916557E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.012 | TFLOPs: 26.03 | +7: iteration 14680/ 60336 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.918303E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.257 | TFLOPs: 26.02 | +7: iteration 14690/ 60336 | consumed samples: 3760640 | consumed tokens: 7701790720 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.905745E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.569 | TFLOPs: 26.01 | +7: iteration 14700/ 60336 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.914693E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.161 | TFLOPs: 26.02 | +7: iteration 14710/ 60336 | consumed samples: 3765760 | consumed tokens: 7712276480 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.902202E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.495 | TFLOPs: 26.03 | +7: iteration 14720/ 60336 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.920182E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.536 | TFLOPs: 26.04 | +7: iteration 14730/ 60336 | consumed samples: 3770880 | consumed tokens: 7722762240 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.913969E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.904 | TFLOPs: 26.03 | +7: iteration 14740/ 60336 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.894228E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.522 | TFLOPs: 26.04 | +7: iteration 14750/ 60336 | consumed samples: 3776000 | consumed tokens: 7733248000 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.916587E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.909 | TFLOPs: 26.05 | +7: iteration 14760/ 60336 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.924161E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.303 | TFLOPs: 26.04 | +7: iteration 14770/ 60336 | consumed samples: 3781120 | consumed tokens: 7743733760 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.913781E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.163 | TFLOPs: 26.04 | +7: iteration 14780/ 60336 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.911771E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.583 | TFLOPs: 26.03 | +7: iteration 14790/ 60336 | consumed samples: 3786240 | consumed tokens: 7754219520 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.896817E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.092 | TFLOPs: 26.03 | +7: iteration 14800/ 60336 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.928141E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.944 | TFLOPs: 26.06 | +7: iteration 14810/ 60336 | consumed samples: 3791360 | consumed tokens: 7764705280 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.921474E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.325 | TFLOPs: 26.04 | +7: iteration 14820/ 60336 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.919103E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.760 | TFLOPs: 26.01 | +7: iteration 14830/ 60336 | consumed samples: 3796480 | consumed tokens: 7775191040 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.911420E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.487 | TFLOPs: 26.01 | +7: iteration 14840/ 60336 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.931931E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.966 | TFLOPs: 26.00 | +7: iteration 14850/ 60336 | consumed samples: 3801600 | consumed tokens: 7785676800 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.923003E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.210 | TFLOPs: 26.02 | +7: iteration 14860/ 60336 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.917007E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.548 | TFLOPs: 25.99 | +7: iteration 14870/ 60336 | consumed samples: 3806720 | consumed tokens: 7796162560 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.914402E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.247 | TFLOPs: 26.01 | +7: iteration 14880/ 60336 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.913685E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.157 | TFLOPs: 26.00 | +7: iteration 14890/ 60336 | consumed samples: 3811840 | consumed tokens: 7806648320 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.914995E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.431 | TFLOPs: 26.01 | +7: iteration 14900/ 60336 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.911071E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.818 | TFLOPs: 26.01 | +7: iteration 14910/ 60336 | consumed samples: 3816960 | consumed tokens: 7817134080 | elapsed time per iteration (s): 0.16 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.914036E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.030 | TFLOPs: 25.88 | +7: iteration 14920/ 60336 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 0.16 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.921133E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.926 | TFLOPs: 25.31 | +7: iteration 14930/ 60336 | consumed samples: 3822080 | consumed tokens: 7827619840 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.917236E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.795 | TFLOPs: 26.09 | +7: iteration 14940/ 60336 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.924366E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.804 | TFLOPs: 26.09 | +7: iteration 14950/ 60336 | consumed samples: 3827200 | consumed tokens: 7838105600 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.907120E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.531 | TFLOPs: 26.09 | +7: iteration 14960/ 60336 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.899872E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.593 | TFLOPs: 26.06 | +7: iteration 14970/ 60336 | consumed samples: 3832320 | consumed tokens: 7848591360 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.915907E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.993 | TFLOPs: 26.06 | +7: iteration 14980/ 60336 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.909718E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.628 | TFLOPs: 26.07 | +7: iteration 14990/ 60336 | consumed samples: 3837440 | consumed tokens: 7859077120 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.912005E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.554 | TFLOPs: 26.07 | +7: iteration 15000/ 60336 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.920313E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.070 | TFLOPs: 26.08 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 15000 | lm loss value: 3.985574E+00 | lm loss PPL: 5.381615E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 15000 to checkpoints_44m32b100m +0: [2023-03-17 00:58:02,160] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step15000 is begin to save! +0: [2023-03-17 00:58:02,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:58:02,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:58:02,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:58:02,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:58:02,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:58:02,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:58:02,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:58:02,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:58:02,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:58:02,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:58:02,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:58:02,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:58:02,266] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:58:02,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:58:02,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:58:02,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:58:02,282] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:58:02,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:58:02,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:58:02,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:58:02,292] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step15000/mp_rank_00_model_states.pt +0: [2023-03-17 00:58:02,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:58:02,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:58:02,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:58:02,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 00:58:02,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:58:02,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:58:02,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:58:02,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:58:02,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: successfully saved checkpoint at iteration 15000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 173.96 +7: iteration 15010/ 60336 | consumed samples: 3842560 | consumed tokens: 7869562880 | elapsed time per iteration (s): 0.18 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.921184E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1460.467 | TFLOPs: 22.90 | +7: iteration 15020/ 60336 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.912107E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.973 | TFLOPs: 26.24 | +7: iteration 15030/ 60336 | consumed samples: 3847680 | consumed tokens: 7880048640 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.906977E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.024 | TFLOPs: 26.24 | +7: iteration 15040/ 60336 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.908957E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.753 | TFLOPs: 26.23 | +7: iteration 15050/ 60336 | consumed samples: 3852800 | consumed tokens: 7890534400 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.911729E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.257 | TFLOPs: 26.24 | +7: iteration 15060/ 60336 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.906071E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.067 | TFLOPs: 26.24 | +7: iteration 15070/ 60336 | consumed samples: 3857920 | consumed tokens: 7901020160 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.897327E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.435 | TFLOPs: 26.23 | +7: iteration 15080/ 60336 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.915573E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.729 | TFLOPs: 26.25 | +7: iteration 15090/ 60336 | consumed samples: 3863040 | consumed tokens: 7911505920 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.919968E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.970 | TFLOPs: 26.22 | +7: iteration 15100/ 60336 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.918350E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.785 | TFLOPs: 26.25 | +7: iteration 15110/ 60336 | consumed samples: 3868160 | consumed tokens: 7921991680 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.921355E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.775 | TFLOPs: 26.23 | +7: iteration 15120/ 60336 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.911456E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.400 | TFLOPs: 26.24 | +7: iteration 15130/ 60336 | consumed samples: 3873280 | consumed tokens: 7932477440 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.915998E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.053 | TFLOPs: 26.25 | +7: iteration 15140/ 60336 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.905561E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.385 | TFLOPs: 26.26 | +7: iteration 15150/ 60336 | consumed samples: 3878400 | consumed tokens: 7942963200 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.918646E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.166 | TFLOPs: 26.24 | +7: iteration 15160/ 60336 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.898839E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.811 | TFLOPs: 26.27 | +7: iteration 15170/ 60336 | consumed samples: 3883520 | consumed tokens: 7953448960 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.900216E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.033 | TFLOPs: 26.25 | +7: iteration 15180/ 60336 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.910286E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.556 | TFLOPs: 26.26 | +7: iteration 15190/ 60336 | consumed samples: 3888640 | consumed tokens: 7963934720 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.900280E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.293 | TFLOPs: 26.24 | +7: iteration 15200/ 60336 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.898298E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.635 | TFLOPs: 26.18 | +7: iteration 15210/ 60336 | consumed samples: 3893760 | consumed tokens: 7974420480 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.908652E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.099 | TFLOPs: 26.11 | +7: iteration 15220/ 60336 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.905083E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.970 | TFLOPs: 26.24 | +7: iteration 15230/ 60336 | consumed samples: 3898880 | consumed tokens: 7984906240 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.902440E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.853 | TFLOPs: 26.23 | +7: iteration 15240/ 60336 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.899254E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.078 | TFLOPs: 26.22 | +7: iteration 15250/ 60336 | consumed samples: 3904000 | consumed tokens: 7995392000 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.902689E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.557 | TFLOPs: 26.21 | +7: iteration 15260/ 60336 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.905181E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.454 | TFLOPs: 26.23 | +7: iteration 15270/ 60336 | consumed samples: 3909120 | consumed tokens: 8005877760 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.914774E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.683 | TFLOPs: 26.23 | +7: iteration 15280/ 60336 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.912510E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.508 | TFLOPs: 26.21 | +7: iteration 15290/ 60336 | consumed samples: 3914240 | consumed tokens: 8016363520 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.921939E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.637 | TFLOPs: 26.23 | +7: iteration 15300/ 60336 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.888480E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.939 | TFLOPs: 26.24 | +7: iteration 15310/ 60336 | consumed samples: 3919360 | consumed tokens: 8026849280 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.900694E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.307 | TFLOPs: 26.23 | +7: iteration 15320/ 60336 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.913287E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.561 | TFLOPs: 26.25 | +7: iteration 15330/ 60336 | consumed samples: 3924480 | consumed tokens: 8037335040 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.907081E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.500 | TFLOPs: 26.21 | +7: iteration 15340/ 60336 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.914902E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.012 | TFLOPs: 26.24 | +7: iteration 15350/ 60336 | consumed samples: 3929600 | consumed tokens: 8047820800 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.894218E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.667 | TFLOPs: 26.23 | +7: iteration 15360/ 60336 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.901017E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.820 | TFLOPs: 26.23 | +7: iteration 15370/ 60336 | consumed samples: 3934720 | consumed tokens: 8058306560 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.903332E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.152 | TFLOPs: 26.22 | +7: iteration 15380/ 60336 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.901089E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.465 | TFLOPs: 26.20 | +7: iteration 15390/ 60336 | consumed samples: 3939840 | consumed tokens: 8068792320 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.900557E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.600 | TFLOPs: 26.03 | +7: iteration 15400/ 60336 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.897876E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.694 | TFLOPs: 26.03 | +7: iteration 15410/ 60336 | consumed samples: 3944960 | consumed tokens: 8079278080 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.913520E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.179 | TFLOPs: 26.04 | +7: iteration 15420/ 60336 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.909568E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.759 | TFLOPs: 26.03 | +7: iteration 15430/ 60336 | consumed samples: 3950080 | consumed tokens: 8089763840 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.904465E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.900 | TFLOPs: 26.05 | +7: iteration 15440/ 60336 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.916510E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.925 | TFLOPs: 26.24 | +7: iteration 15450/ 60336 | consumed samples: 3955200 | consumed tokens: 8100249600 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.924767E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.641 | TFLOPs: 26.23 | +7: iteration 15460/ 60336 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.888882E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.398 | TFLOPs: 26.23 | +7: iteration 15470/ 60336 | consumed samples: 3960320 | consumed tokens: 8110735360 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.895721E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.479 | TFLOPs: 26.23 | +7: iteration 15480/ 60336 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.902985E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.110 | TFLOPs: 26.24 | +7: iteration 15490/ 60336 | consumed samples: 3965440 | consumed tokens: 8121221120 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.903244E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.307 | TFLOPs: 26.24 | +7: iteration 15500/ 60336 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.904580E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.611 | TFLOPs: 26.23 | +7: iteration 15510/ 60336 | consumed samples: 3970560 | consumed tokens: 8131706880 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.902161E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 15520/ 60336 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.912658E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.722 | TFLOPs: 26.23 | +7: iteration 15530/ 60336 | consumed samples: 3975680 | consumed tokens: 8142192640 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.894168E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.719 | TFLOPs: 26.23 | +7: iteration 15540/ 60336 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.896122E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.506 | TFLOPs: 26.18 | +7: iteration 15550/ 60336 | consumed samples: 3980800 | consumed tokens: 8152678400 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.911145E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.171 | TFLOPs: 26.16 | +7: iteration 15560/ 60336 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.905925E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.261 | TFLOPs: 26.15 | +7: iteration 15570/ 60336 | consumed samples: 3985920 | consumed tokens: 8163164160 | elapsed time per iteration (s): 0.16 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.901711E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.661 | TFLOPs: 25.37 | +7: iteration 15580/ 60336 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.911880E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.792 | TFLOPs: 26.22 | +7: iteration 15590/ 60336 | consumed samples: 3991040 | consumed tokens: 8173649920 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.918086E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.186 | TFLOPs: 26.21 | +7: iteration 15600/ 60336 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.907574E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.391 | TFLOPs: 25.91 | +7: iteration 15610/ 60336 | consumed samples: 3996160 | consumed tokens: 8184135680 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.912452E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.324 | TFLOPs: 26.24 | +7: iteration 15620/ 60336 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.905673E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.551 | TFLOPs: 26.25 | +7: iteration 15630/ 60336 | consumed samples: 4001280 | consumed tokens: 8194621440 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.909722E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.028 | TFLOPs: 26.24 | +7: iteration 15640/ 60336 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.902366E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.423 | TFLOPs: 26.23 | +7: iteration 15650/ 60336 | consumed samples: 4006400 | consumed tokens: 8205107200 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.907388E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.447 | TFLOPs: 26.24 | +7: iteration 15660/ 60336 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.897651E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.973 | TFLOPs: 26.25 | +7: iteration 15670/ 60336 | consumed samples: 4011520 | consumed tokens: 8215592960 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.897762E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.195 | TFLOPs: 26.24 | +7: iteration 15680/ 60336 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.898706E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.347 | TFLOPs: 26.24 | +7: iteration 15690/ 60336 | consumed samples: 4016640 | consumed tokens: 8226078720 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.895554E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.781 | TFLOPs: 26.22 | +7: iteration 15700/ 60336 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.913212E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.056 | TFLOPs: 26.24 | +7: iteration 15710/ 60336 | consumed samples: 4021760 | consumed tokens: 8236564480 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.902885E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.374 | TFLOPs: 26.21 | +7: iteration 15720/ 60336 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.906510E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.233 | TFLOPs: 26.21 | +7: iteration 15730/ 60336 | consumed samples: 4026880 | consumed tokens: 8247050240 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.906332E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.531 | TFLOPs: 26.21 | +7: iteration 15740/ 60336 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.910074E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.336 | TFLOPs: 26.23 | +7: iteration 15750/ 60336 | consumed samples: 4032000 | consumed tokens: 8257536000 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.903875E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.506 | TFLOPs: 26.21 | +7: iteration 15760/ 60336 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.897475E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.715 | TFLOPs: 26.03 | +7: iteration 15770/ 60336 | consumed samples: 4037120 | consumed tokens: 8268021760 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.916016E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.348 | TFLOPs: 25.99 | +7: iteration 15780/ 60336 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.894543E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.640 | TFLOPs: 26.01 | +7: iteration 15790/ 60336 | consumed samples: 4042240 | consumed tokens: 8278507520 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.894872E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.220 | TFLOPs: 26.01 | +7: iteration 15800/ 60336 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.915671E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.156 | TFLOPs: 26.02 | +7: iteration 15810/ 60336 | consumed samples: 4047360 | consumed tokens: 8288993280 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.901409E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.938 | TFLOPs: 26.02 | +7: iteration 15820/ 60336 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.905958E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.034 | TFLOPs: 26.02 | +7: iteration 15830/ 60336 | consumed samples: 4052480 | consumed tokens: 8299479040 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.896441E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.154 | TFLOPs: 26.02 | +7: iteration 15840/ 60336 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.896451E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.538 | TFLOPs: 26.04 | +7: iteration 15850/ 60336 | consumed samples: 4057600 | consumed tokens: 8309964800 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.888242E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.018 | TFLOPs: 26.03 | +7: iteration 15860/ 60336 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.921156E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.618 | TFLOPs: 26.01 | +7: iteration 15870/ 60336 | consumed samples: 4062720 | consumed tokens: 8320450560 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.910609E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.148 | TFLOPs: 25.99 | +7: iteration 15880/ 60336 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.903937E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.613 | TFLOPs: 26.00 | +7: iteration 15890/ 60336 | consumed samples: 4067840 | consumed tokens: 8330936320 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.898768E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.457 | TFLOPs: 25.98 | +7: iteration 15900/ 60336 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.903947E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.001 | TFLOPs: 25.99 | +7: iteration 15910/ 60336 | consumed samples: 4072960 | consumed tokens: 8341422080 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.899224E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.475 | TFLOPs: 25.96 | +7: iteration 15920/ 60336 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.908591E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.327 | TFLOPs: 25.99 | +7: iteration 15930/ 60336 | consumed samples: 4078080 | consumed tokens: 8351907840 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.910875E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.556 | TFLOPs: 26.03 | +7: iteration 15940/ 60336 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.903822E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.653 | TFLOPs: 26.03 | +7: iteration 15950/ 60336 | consumed samples: 4083200 | consumed tokens: 8362393600 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.897961E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.083 | TFLOPs: 26.03 | +7: iteration 15960/ 60336 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.898988E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.547 | TFLOPs: 26.03 | +7: iteration 15970/ 60336 | consumed samples: 4088320 | consumed tokens: 8372879360 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.894675E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.842 | TFLOPs: 26.00 | +7: iteration 15980/ 60336 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.885150E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.254 | TFLOPs: 26.02 | +7: iteration 15990/ 60336 | consumed samples: 4093440 | consumed tokens: 8383365120 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.892184E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.917 | TFLOPs: 26.02 | +0: [2023-03-17 01:00:35,853] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[0.00017206961813389868, 0.00017206961813389868, 0.00017206961813389868], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 16000/ 60336 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.882318E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.040 | TFLOPs: 26.02 | +0: steps: 16000 loss: 3.8903 iter time (s): 0.152 samples/sec: 1680.612 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 16000 | lm loss value: 4.017365E+00 | lm loss PPL: 5.555450E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 16000 to checkpoints_44m32b100m +0: [2023-03-17 01:00:35,925] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is begin to save! +0: [2023-03-17 01:00:35,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:00:35,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:00:35,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:00:35,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:00:35,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:00:36,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:00:36,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:00:36,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:00:36,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:00:36,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:00:36,023] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:00:36,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:00:36,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:00:36,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:00:36,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:00:36,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:00:36,048] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:00:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:00:36,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:00:36,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:00:36,057] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step16000/mp_rank_00_model_states.pt +0: [2023-03-17 01:00:36,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:00:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:00:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:00:36,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 01:00:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 01:00:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 01:00:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:00:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: successfully saved checkpoint at iteration 16000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.03 +7: iteration 16010/ 60336 | consumed samples: 4098560 | consumed tokens: 8393850880 | elapsed time per iteration (s): 0.18 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.890002E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.204 | TFLOPs: 22.51 | +7: iteration 16020/ 60336 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.909559E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.645 | TFLOPs: 26.03 | +7: iteration 16030/ 60336 | consumed samples: 4103680 | consumed tokens: 8404336640 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.897369E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.994 | TFLOPs: 26.03 | +7: iteration 16040/ 60336 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.897657E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.044 | TFLOPs: 26.02 | +7: iteration 16050/ 60336 | consumed samples: 4108800 | consumed tokens: 8414822400 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.907285E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.151 | TFLOPs: 26.04 | +7: iteration 16060/ 60336 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.895308E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.332 | TFLOPs: 26.02 | +7: iteration 16070/ 60336 | consumed samples: 4113920 | consumed tokens: 8425308160 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.902647E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.201 | TFLOPs: 26.02 | +7: iteration 16080/ 60336 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.900766E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.950 | TFLOPs: 26.02 | +7: iteration 16090/ 60336 | consumed samples: 4119040 | consumed tokens: 8435793920 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.906496E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.790 | TFLOPs: 25.98 | +7: iteration 16100/ 60336 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.901878E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.629 | TFLOPs: 26.00 | +7: iteration 16110/ 60336 | consumed samples: 4124160 | consumed tokens: 8446279680 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.893922E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.987 | TFLOPs: 26.06 | +7: iteration 16120/ 60336 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.895963E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.856 | TFLOPs: 26.20 | +7: iteration 16130/ 60336 | consumed samples: 4129280 | consumed tokens: 8456765440 | elapsed time per iteration (s): 0.15 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.891283E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.539 | TFLOPs: 26.21 | +7: iteration 16140/ 60336 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 0.15 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.895906E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.723 | TFLOPs: 26.22 | +7: iteration 16150/ 60336 | consumed samples: 4134400 | consumed tokens: 8467251200 | elapsed time per iteration (s): 0.15 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.894622E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.695 | TFLOPs: 26.22 | +7: iteration 16160/ 60336 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 0.15 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.890799E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.620 | TFLOPs: 26.20 | +7: iteration 16170/ 60336 | consumed samples: 4139520 | consumed tokens: 8477736960 | elapsed time per iteration (s): 0.15 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.896706E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.507 | TFLOPs: 26.23 | +7: iteration 16180/ 60336 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.896683E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.818 | TFLOPs: 26.19 | +7: iteration 16190/ 60336 | consumed samples: 4144640 | consumed tokens: 8488222720 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.888495E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.273 | TFLOPs: 26.12 | +7: iteration 16200/ 60336 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 0.16 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.899733E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.037 | TFLOPs: 25.86 | +7: iteration 16210/ 60336 | consumed samples: 4149760 | consumed tokens: 8498708480 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.895446E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.911 | TFLOPs: 25.91 | +7: iteration 16220/ 60336 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.888353E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.743 | TFLOPs: 26.09 | +7: iteration 16230/ 60336 | consumed samples: 4154880 | consumed tokens: 8509194240 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.889070E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.170 | TFLOPs: 26.11 | +7: iteration 16240/ 60336 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.893355E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.686 | TFLOPs: 26.09 | +7: iteration 16250/ 60336 | consumed samples: 4160000 | consumed tokens: 8519680000 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.905902E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.738 | TFLOPs: 26.09 | +7: iteration 16260/ 60336 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.894268E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.272 | TFLOPs: 26.12 | +7: iteration 16270/ 60336 | consumed samples: 4165120 | consumed tokens: 8530165760 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.897474E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.805 | TFLOPs: 26.11 | +7: iteration 16280/ 60336 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.891223E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.871 | TFLOPs: 26.09 | +7: iteration 16290/ 60336 | consumed samples: 4170240 | consumed tokens: 8540651520 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.881349E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.836 | TFLOPs: 26.09 | +7: iteration 16300/ 60336 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.890340E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.373 | TFLOPs: 26.10 | +7: iteration 16310/ 60336 | consumed samples: 4175360 | consumed tokens: 8551137280 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.908363E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.694 | TFLOPs: 26.09 | +7: iteration 16320/ 60336 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.905788E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.714 | TFLOPs: 26.09 | +7: iteration 16330/ 60336 | consumed samples: 4180480 | consumed tokens: 8561623040 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.888078E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.952 | TFLOPs: 26.13 | +7: iteration 16340/ 60336 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.895095E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.513 | TFLOPs: 26.12 | +7: iteration 16350/ 60336 | consumed samples: 4185600 | consumed tokens: 8572108800 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.898820E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.905 | TFLOPs: 26.14 | +7: iteration 16360/ 60336 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.910043E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.795 | TFLOPs: 26.14 | +7: iteration 16370/ 60336 | consumed samples: 4190720 | consumed tokens: 8582594560 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.900085E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.276 | TFLOPs: 26.04 | +7: iteration 16380/ 60336 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.915227E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.603 | TFLOPs: 26.03 | +7: iteration 16390/ 60336 | consumed samples: 4195840 | consumed tokens: 8593080320 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.901758E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.661 | TFLOPs: 26.01 | +7: iteration 16400/ 60336 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.886601E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.366 | TFLOPs: 26.02 | +7: iteration 16410/ 60336 | consumed samples: 4200960 | consumed tokens: 8603566080 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.891679E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.119 | TFLOPs: 26.00 | +7: iteration 16420/ 60336 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.886137E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.097 | TFLOPs: 26.03 | +7: iteration 16430/ 60336 | consumed samples: 4206080 | consumed tokens: 8614051840 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.891214E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.139 | TFLOPs: 26.04 | +7: iteration 16440/ 60336 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.907004E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.177 | TFLOPs: 25.99 | +7: iteration 16450/ 60336 | consumed samples: 4211200 | consumed tokens: 8624537600 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.876854E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.698 | TFLOPs: 25.98 | +7: iteration 16460/ 60336 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.891879E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.049 | TFLOPs: 26.00 | +7: iteration 16470/ 60336 | consumed samples: 4216320 | consumed tokens: 8635023360 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.896888E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.794 | TFLOPs: 26.00 | +7: iteration 16480/ 60336 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.912024E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.872 | TFLOPs: 25.98 | +7: iteration 16490/ 60336 | consumed samples: 4221440 | consumed tokens: 8645509120 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.896635E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.991 | TFLOPs: 26.02 | +7: iteration 16500/ 60336 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.881161E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.275 | TFLOPs: 26.01 | +7: iteration 16510/ 60336 | consumed samples: 4226560 | consumed tokens: 8655994880 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.891711E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.278 | TFLOPs: 25.97 | +7: iteration 16520/ 60336 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.895779E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.474 | TFLOPs: 25.96 | +7: iteration 16530/ 60336 | consumed samples: 4231680 | consumed tokens: 8666480640 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.887852E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.181 | TFLOPs: 25.99 | +7: iteration 16540/ 60336 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.897128E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.388 | TFLOPs: 25.96 | +7: iteration 16550/ 60336 | consumed samples: 4236800 | consumed tokens: 8676966400 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.888182E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.843 | TFLOPs: 25.95 | +7: iteration 16560/ 60336 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.896318E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.885 | TFLOPs: 25.95 | +7: iteration 16570/ 60336 | consumed samples: 4241920 | consumed tokens: 8687452160 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.899264E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.837 | TFLOPs: 25.97 | +7: iteration 16580/ 60336 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.897627E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.914 | TFLOPs: 25.97 | +7: iteration 16590/ 60336 | consumed samples: 4247040 | consumed tokens: 8697937920 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.889701E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.404 | TFLOPs: 25.98 | +7: iteration 16600/ 60336 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.906377E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.558 | TFLOPs: 25.96 | +7: iteration 16610/ 60336 | consumed samples: 4252160 | consumed tokens: 8708423680 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.900913E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.542 | TFLOPs: 25.98 | +7: iteration 16620/ 60336 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.879032E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.819 | TFLOPs: 25.94 | +7: iteration 16630/ 60336 | consumed samples: 4257280 | consumed tokens: 8718909440 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.895116E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.755 | TFLOPs: 25.98 | +7: iteration 16640/ 60336 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.895139E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.153 | TFLOPs: 26.00 | +7: iteration 16650/ 60336 | consumed samples: 4262400 | consumed tokens: 8729395200 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.891148E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.610 | TFLOPs: 26.06 | +7: iteration 16660/ 60336 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.899560E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.087 | TFLOPs: 26.07 | +7: iteration 16670/ 60336 | consumed samples: 4267520 | consumed tokens: 8739880960 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.886329E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.817 | TFLOPs: 26.05 | +7: iteration 16680/ 60336 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.896508E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.751 | TFLOPs: 26.06 | +7: iteration 16690/ 60336 | consumed samples: 4272640 | consumed tokens: 8750366720 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.871363E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.477 | TFLOPs: 25.93 | +7: iteration 16700/ 60336 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.889048E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.704 | TFLOPs: 26.06 | +7: iteration 16710/ 60336 | consumed samples: 4277760 | consumed tokens: 8760852480 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.889285E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.678 | TFLOPs: 26.04 | +7: iteration 16720/ 60336 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.899420E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.077 | TFLOPs: 26.03 | +7: iteration 16730/ 60336 | consumed samples: 4282880 | consumed tokens: 8771338240 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.892477E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.891 | TFLOPs: 26.08 | +7: iteration 16740/ 60336 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.900679E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.190 | TFLOPs: 26.07 | +7: iteration 16750/ 60336 | consumed samples: 4288000 | consumed tokens: 8781824000 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.902181E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.550 | TFLOPs: 26.06 | +7: iteration 16760/ 60336 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.891345E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.857 | TFLOPs: 26.05 | +7: iteration 16770/ 60336 | consumed samples: 4293120 | consumed tokens: 8792309760 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.883783E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.224 | TFLOPs: 26.04 | +7: iteration 16780/ 60336 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.883768E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.594 | TFLOPs: 25.98 | +7: iteration 16790/ 60336 | consumed samples: 4298240 | consumed tokens: 8802795520 | elapsed time per iteration (s): 0.16 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.875457E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.786 | TFLOPs: 25.87 | +7: iteration 16800/ 60336 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.886767E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.528 | TFLOPs: 26.03 | +7: iteration 16810/ 60336 | consumed samples: 4303360 | consumed tokens: 8813281280 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.883336E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.119 | TFLOPs: 26.03 | +7: iteration 16820/ 60336 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.900232E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.521 | TFLOPs: 26.03 | +7: iteration 16830/ 60336 | consumed samples: 4308480 | consumed tokens: 8823767040 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.891213E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.858 | TFLOPs: 26.00 | +7: iteration 16840/ 60336 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.890176E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.837 | TFLOPs: 26.00 | +7: iteration 16850/ 60336 | consumed samples: 4313600 | consumed tokens: 8834252800 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.903425E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.593 | TFLOPs: 26.04 | +7: iteration 16860/ 60336 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.889505E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.671 | TFLOPs: 26.07 | +7: iteration 16870/ 60336 | consumed samples: 4318720 | consumed tokens: 8844738560 | elapsed time per iteration (s): 0.16 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.886643E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.404 | TFLOPs: 25.63 | +7: iteration 16880/ 60336 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.896450E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.630 | TFLOPs: 26.06 | +7: iteration 16890/ 60336 | consumed samples: 4323840 | consumed tokens: 8855224320 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.889906E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.828 | TFLOPs: 26.05 | +7: iteration 16900/ 60336 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.878205E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.802 | TFLOPs: 26.06 | +7: iteration 16910/ 60336 | consumed samples: 4328960 | consumed tokens: 8865710080 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.896761E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.692 | TFLOPs: 26.04 | +7: iteration 16920/ 60336 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.888844E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.977 | TFLOPs: 26.05 | +7: iteration 16930/ 60336 | consumed samples: 4334080 | consumed tokens: 8876195840 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.886629E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.338 | TFLOPs: 26.02 | +7: iteration 16940/ 60336 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.895057E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.561 | TFLOPs: 25.92 | +7: iteration 16950/ 60336 | consumed samples: 4339200 | consumed tokens: 8886681600 | elapsed time per iteration (s): 0.16 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.878320E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.755 | TFLOPs: 25.89 | +7: iteration 16960/ 60336 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 0.16 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.905734E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.671 | TFLOPs: 25.87 | +7: iteration 16970/ 60336 | consumed samples: 4344320 | consumed tokens: 8897167360 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.881667E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.346 | TFLOPs: 25.94 | +7: iteration 16980/ 60336 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.904289E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.638 | TFLOPs: 26.06 | +7: iteration 16990/ 60336 | consumed samples: 4349440 | consumed tokens: 8907653120 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.898809E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.332 | TFLOPs: 26.07 | +7: iteration 17000/ 60336 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.879581E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.138 | TFLOPs: 26.07 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 17000 | lm loss value: 3.956853E+00 | lm loss PPL: 5.229249E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 17000 to checkpoints_44m32b100m +0: [2023-03-17 01:03:10,369] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step17000 is begin to save! +0: [2023-03-17 01:03:10,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:03:10,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:03:10,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:03:10,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:03:10,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:03:10,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:03:10,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:03:10,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:03:10,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:03:10,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:03:10,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:03:10,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:03:10,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:03:10,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:03:10,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:03:10,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:03:10,492] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:03:10,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:03:10,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:03:10,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:03:10,501] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step17000/mp_rank_00_model_states.pt +0: [2023-03-17 01:03:10,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:03:10,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:03:10,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:03:10,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:03:10,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 01:03:10,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 01:03:10,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 01:03:10,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:03:10,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:03:10,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: successfully saved checkpoint at iteration 17000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.59 +7: iteration 17010/ 60336 | consumed samples: 4354560 | consumed tokens: 8918138880 | elapsed time per iteration (s): 0.18 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.895423E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.255 | TFLOPs: 22.73 | +7: iteration 17020/ 60336 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.892976E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.193 | TFLOPs: 26.07 | +7: iteration 17030/ 60336 | consumed samples: 4359680 | consumed tokens: 8928624640 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.881849E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.589 | TFLOPs: 26.03 | +7: iteration 17040/ 60336 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.897602E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.573 | TFLOPs: 26.03 | +7: iteration 17050/ 60336 | consumed samples: 4364800 | consumed tokens: 8939110400 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.889523E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.617 | TFLOPs: 26.03 | +7: iteration 17060/ 60336 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.895999E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.106 | TFLOPs: 26.02 | +7: iteration 17070/ 60336 | consumed samples: 4369920 | consumed tokens: 8949596160 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.887645E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.660 | TFLOPs: 26.06 | +7: iteration 17080/ 60336 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.879892E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.981 | TFLOPs: 26.05 | +7: iteration 17090/ 60336 | consumed samples: 4375040 | consumed tokens: 8960081920 | elapsed time per iteration (s): 0.15 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.897847E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.444 | TFLOPs: 26.06 | +7: iteration 17100/ 60336 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 0.15 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.868217E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.185 | TFLOPs: 26.04 | +7: iteration 17110/ 60336 | consumed samples: 4380160 | consumed tokens: 8970567680 | elapsed time per iteration (s): 0.15 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.897083E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.035 | TFLOPs: 26.05 | +7: iteration 17120/ 60336 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 0.15 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.896283E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.466 | TFLOPs: 26.04 | +7: iteration 17130/ 60336 | consumed samples: 4385280 | consumed tokens: 8981053440 | elapsed time per iteration (s): 0.15 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.877328E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.232 | TFLOPs: 26.04 | +7: iteration 17140/ 60336 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 0.15 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.883112E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.001 | TFLOPs: 26.06 | +7: iteration 17150/ 60336 | consumed samples: 4390400 | consumed tokens: 8991539200 | elapsed time per iteration (s): 0.15 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.896591E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.554 | TFLOPs: 26.09 | +7: iteration 17160/ 60336 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 0.15 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.896727E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.524 | TFLOPs: 26.06 | +7: iteration 17170/ 60336 | consumed samples: 4395520 | consumed tokens: 9002024960 | elapsed time per iteration (s): 0.15 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.897636E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.715 | TFLOPs: 26.09 | +7: iteration 17180/ 60336 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 0.15 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.881470E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.017 | TFLOPs: 26.14 | +7: iteration 17190/ 60336 | consumed samples: 4400640 | consumed tokens: 9012510720 | elapsed time per iteration (s): 0.15 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.897674E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.532 | TFLOPs: 26.18 | +7: iteration 17200/ 60336 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 0.15 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.880595E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.066 | TFLOPs: 25.91 | +7: iteration 17210/ 60336 | consumed samples: 4405760 | consumed tokens: 9022996480 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.881577E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.299 | TFLOPs: 25.69 | +7: iteration 17220/ 60336 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 0.15 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.888439E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.145 | TFLOPs: 26.11 | +7: iteration 17230/ 60336 | consumed samples: 4410880 | consumed tokens: 9033482240 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.890651E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.185 | TFLOPs: 25.63 | +7: iteration 17240/ 60336 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 0.15 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.878846E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.632 | TFLOPs: 26.07 | +7: iteration 17250/ 60336 | consumed samples: 4416000 | consumed tokens: 9043968000 | elapsed time per iteration (s): 0.15 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.898446E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.489 | TFLOPs: 26.17 | +7: iteration 17260/ 60336 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 0.15 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.889633E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.662 | TFLOPs: 26.18 | +7: iteration 17270/ 60336 | consumed samples: 4421120 | consumed tokens: 9054453760 | elapsed time per iteration (s): 0.15 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.876364E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.210 | TFLOPs: 26.18 | +7: iteration 17280/ 60336 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 0.15 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.893960E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.433 | TFLOPs: 26.18 | +7: iteration 17290/ 60336 | consumed samples: 4426240 | consumed tokens: 9064939520 | elapsed time per iteration (s): 0.15 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.886398E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.789 | TFLOPs: 26.17 | +7: iteration 17300/ 60336 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 0.15 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.886953E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.505 | TFLOPs: 26.18 | +7: iteration 17310/ 60336 | consumed samples: 4431360 | consumed tokens: 9075425280 | elapsed time per iteration (s): 0.15 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.879662E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.864 | TFLOPs: 26.19 | +7: iteration 17320/ 60336 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 0.15 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.876226E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.930 | TFLOPs: 26.14 | +7: iteration 17330/ 60336 | consumed samples: 4436480 | consumed tokens: 9085911040 | elapsed time per iteration (s): 0.15 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.887646E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.957 | TFLOPs: 26.11 | +7: iteration 17340/ 60336 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 0.15 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.897342E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.549 | TFLOPs: 26.20 | +7: iteration 17350/ 60336 | consumed samples: 4441600 | consumed tokens: 9096396800 | elapsed time per iteration (s): 0.15 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.898671E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.299 | TFLOPs: 26.04 | +7: iteration 17360/ 60336 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 0.15 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.888817E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.557 | TFLOPs: 26.26 | +7: iteration 17370/ 60336 | consumed samples: 4446720 | consumed tokens: 9106882560 | elapsed time per iteration (s): 0.15 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.889604E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.347 | TFLOPs: 26.27 | +7: iteration 17380/ 60336 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 0.15 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.889260E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.471 | TFLOPs: 26.26 | +7: iteration 17390/ 60336 | consumed samples: 4451840 | consumed tokens: 9117368320 | elapsed time per iteration (s): 0.15 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.891379E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.459 | TFLOPs: 26.26 | +7: iteration 17400/ 60336 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 0.15 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.894632E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.578 | TFLOPs: 26.26 | +7: iteration 17410/ 60336 | consumed samples: 4456960 | consumed tokens: 9127854080 | elapsed time per iteration (s): 0.15 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.885057E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.783 | TFLOPs: 26.22 | +7: iteration 17420/ 60336 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 0.15 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.883395E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.241 | TFLOPs: 26.26 | +7: iteration 17430/ 60336 | consumed samples: 4462080 | consumed tokens: 9138339840 | elapsed time per iteration (s): 0.15 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.884349E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.512 | TFLOPs: 26.21 | +7: iteration 17440/ 60336 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 0.15 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.873621E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.170 | TFLOPs: 26.27 | +7: iteration 17450/ 60336 | consumed samples: 4467200 | consumed tokens: 9148825600 | elapsed time per iteration (s): 0.15 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.893922E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.865 | TFLOPs: 26.27 | +7: iteration 17460/ 60336 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 0.15 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.883011E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.186 | TFLOPs: 26.26 | +7: iteration 17470/ 60336 | consumed samples: 4472320 | consumed tokens: 9159311360 | elapsed time per iteration (s): 0.15 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.890191E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.900 | TFLOPs: 26.25 | +7: iteration 17480/ 60336 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 0.15 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.884093E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.218 | TFLOPs: 26.27 | +7: iteration 17490/ 60336 | consumed samples: 4477440 | consumed tokens: 9169797120 | elapsed time per iteration (s): 0.15 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.883848E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.887 | TFLOPs: 26.28 | +7: iteration 17500/ 60336 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 0.15 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.866328E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.393 | TFLOPs: 26.27 | +7: iteration 17510/ 60336 | consumed samples: 4482560 | consumed tokens: 9180282880 | elapsed time per iteration (s): 0.15 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.878407E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.887 | TFLOPs: 26.28 | +7: iteration 17520/ 60336 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 0.15 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.900019E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.196 | TFLOPs: 25.96 | +7: iteration 17530/ 60336 | consumed samples: 4487680 | consumed tokens: 9190768640 | elapsed time per iteration (s): 0.15 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.886058E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.001 | TFLOPs: 26.27 | +7: iteration 17540/ 60336 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 0.15 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.880852E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.183 | TFLOPs: 26.27 | +7: iteration 17550/ 60336 | consumed samples: 4492800 | consumed tokens: 9201254400 | elapsed time per iteration (s): 0.15 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.887096E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.338 | TFLOPs: 26.26 | +7: iteration 17560/ 60336 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 0.15 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.891494E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.581 | TFLOPs: 26.25 | +7: iteration 17570/ 60336 | consumed samples: 4497920 | consumed tokens: 9211740160 | elapsed time per iteration (s): 0.15 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.880702E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.278 | TFLOPs: 26.27 | +7: iteration 17580/ 60336 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 0.15 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.891450E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.732 | TFLOPs: 26.25 | +7: iteration 17590/ 60336 | consumed samples: 4503040 | consumed tokens: 9222225920 | elapsed time per iteration (s): 0.15 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.886207E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.881 | TFLOPs: 26.25 | +7: iteration 17600/ 60336 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 0.15 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.903141E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.244 | TFLOPs: 26.18 | +7: iteration 17610/ 60336 | consumed samples: 4508160 | consumed tokens: 9232711680 | elapsed time per iteration (s): 0.15 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.884357E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.476 | TFLOPs: 26.21 | +7: iteration 17620/ 60336 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 0.15 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.893342E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.164 | TFLOPs: 26.15 | +7: iteration 17630/ 60336 | consumed samples: 4513280 | consumed tokens: 9243197440 | elapsed time per iteration (s): 0.15 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.875630E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.063 | TFLOPs: 26.14 | +7: iteration 17640/ 60336 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 0.15 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.896129E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.472 | TFLOPs: 26.13 | +7: iteration 17650/ 60336 | consumed samples: 4518400 | consumed tokens: 9253683200 | elapsed time per iteration (s): 0.15 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.885571E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.076 | TFLOPs: 26.10 | +7: iteration 17660/ 60336 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 0.15 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.896160E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.905 | TFLOPs: 26.14 | +7: iteration 17670/ 60336 | consumed samples: 4523520 | consumed tokens: 9264168960 | elapsed time per iteration (s): 0.15 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.888223E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.111 | TFLOPs: 26.10 | +7: iteration 17680/ 60336 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 0.15 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.885513E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.046 | TFLOPs: 26.16 | +7: iteration 17690/ 60336 | consumed samples: 4528640 | consumed tokens: 9274654720 | elapsed time per iteration (s): 0.15 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.882814E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.820 | TFLOPs: 26.16 | +7: iteration 17700/ 60336 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 0.15 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.886808E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.546 | TFLOPs: 26.12 | +7: iteration 17710/ 60336 | consumed samples: 4533760 | consumed tokens: 9285140480 | elapsed time per iteration (s): 0.15 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.881174E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.131 | TFLOPs: 26.16 | +7: iteration 17720/ 60336 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 0.15 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.876056E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.274 | TFLOPs: 26.13 | +7: iteration 17730/ 60336 | consumed samples: 4538880 | consumed tokens: 9295626240 | elapsed time per iteration (s): 0.15 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.897291E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.378 | TFLOPs: 26.02 | +7: iteration 17740/ 60336 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 0.15 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.879852E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.668 | TFLOPs: 26.06 | +7: iteration 17750/ 60336 | consumed samples: 4544000 | consumed tokens: 9306112000 | elapsed time per iteration (s): 0.15 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.883805E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.563 | TFLOPs: 26.10 | +7: iteration 17760/ 60336 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 0.15 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.890124E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.156 | TFLOPs: 26.07 | +7: iteration 17770/ 60336 | consumed samples: 4549120 | consumed tokens: 9316597760 | elapsed time per iteration (s): 0.15 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.874603E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.866 | TFLOPs: 25.98 | +7: iteration 17780/ 60336 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 0.15 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.884522E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.031 | TFLOPs: 25.99 | +7: iteration 17790/ 60336 | consumed samples: 4554240 | consumed tokens: 9327083520 | elapsed time per iteration (s): 0.15 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.882434E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.173 | TFLOPs: 26.04 | +7: iteration 17800/ 60336 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 0.15 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.877751E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.562 | TFLOPs: 26.06 | +7: iteration 17810/ 60336 | consumed samples: 4559360 | consumed tokens: 9337569280 | elapsed time per iteration (s): 0.15 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.876554E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.511 | TFLOPs: 26.07 | +7: iteration 17820/ 60336 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 0.15 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.871613E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.120 | TFLOPs: 26.10 | +7: iteration 17830/ 60336 | consumed samples: 4564480 | consumed tokens: 9348055040 | elapsed time per iteration (s): 0.15 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.889273E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.484 | TFLOPs: 26.09 | +7: iteration 17840/ 60336 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 0.15 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.878638E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.920 | TFLOPs: 26.13 | +7: iteration 17850/ 60336 | consumed samples: 4569600 | consumed tokens: 9358540800 | elapsed time per iteration (s): 0.15 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.887576E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.521 | TFLOPs: 26.12 | +7: iteration 17860/ 60336 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 0.15 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.874266E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.659 | TFLOPs: 26.12 | +7: iteration 17870/ 60336 | consumed samples: 4574720 | consumed tokens: 9369026560 | elapsed time per iteration (s): 0.15 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.887712E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.543 | TFLOPs: 26.10 | +7: iteration 17880/ 60336 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 0.15 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.880148E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.682 | TFLOPs: 26.11 | +7: iteration 17890/ 60336 | consumed samples: 4579840 | consumed tokens: 9379512320 | elapsed time per iteration (s): 0.15 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.874823E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.834 | TFLOPs: 26.08 | +7: iteration 17900/ 60336 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 0.15 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.881529E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.087 | TFLOPs: 26.10 | +7: iteration 17910/ 60336 | consumed samples: 4584960 | consumed tokens: 9389998080 | elapsed time per iteration (s): 0.15 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.881913E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.139 | TFLOPs: 26.05 | +7: iteration 17920/ 60336 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 0.15 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.891669E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.460 | TFLOPs: 26.07 | +7: iteration 17930/ 60336 | consumed samples: 4590080 | consumed tokens: 9400483840 | elapsed time per iteration (s): 0.15 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.895625E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.171 | TFLOPs: 26.07 | +7: iteration 17940/ 60336 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 0.15 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.890577E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.056 | TFLOPs: 26.10 | +7: iteration 17950/ 60336 | consumed samples: 4595200 | consumed tokens: 9410969600 | elapsed time per iteration (s): 0.15 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.880274E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.728 | TFLOPs: 26.04 | +7: iteration 17960/ 60336 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 0.15 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.883897E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.288 | TFLOPs: 26.07 | +7: iteration 17970/ 60336 | consumed samples: 4600320 | consumed tokens: 9421455360 | elapsed time per iteration (s): 0.15 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.877395E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.195 | TFLOPs: 26.05 | +7: iteration 17980/ 60336 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 0.15 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.870385E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.153 | TFLOPs: 26.07 | +7: iteration 17990/ 60336 | consumed samples: 4605440 | consumed tokens: 9431941120 | elapsed time per iteration (s): 0.15 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.875028E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.327 | TFLOPs: 26.05 | +0: [2023-03-17 01:05:44,243] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[0.00016488386930404097, 0.00016488386930404097, 0.00016488386930404097], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 18000/ 60336 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 0.15 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.881796E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.581 | TFLOPs: 26.06 | +0: steps: 18000 loss: 3.8837 iter time (s): 0.152 samples/sec: 1680.586 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 18000 | lm loss value: 3.963578E+00 | lm loss PPL: 5.264534E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 18000 to checkpoints_44m32b100m +0: [2023-03-17 01:05:44,316] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is begin to save! +0: [2023-03-17 01:05:44,319] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:05:44,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:05:44,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:05:44,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:05:44,389] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:05:44,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:05:44,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:05:44,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:05:44,406] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:05:44,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:05:44,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:05:44,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:05:44,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:05:44,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:05:44,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:05:44,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:05:44,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:05:44,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:05:44,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:05:44,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:05:44,448] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step18000/mp_rank_00_model_states.pt +0: [2023-03-17 01:05:44,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:05:44,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:05:44,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:05:44,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:05:44,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:05:44,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:05:44,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:05:44,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: successfully saved checkpoint at iteration 18000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.05 +7: iteration 18010/ 60336 | consumed samples: 4610560 | consumed tokens: 9442426880 | elapsed time per iteration (s): 0.18 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.888663E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.087 | TFLOPs: 22.52 | +7: iteration 18020/ 60336 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 0.15 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.871700E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.918 | TFLOPs: 26.06 | +7: iteration 18030/ 60336 | consumed samples: 4615680 | consumed tokens: 9452912640 | elapsed time per iteration (s): 0.15 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.888700E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.143 | TFLOPs: 26.07 | +7: iteration 18040/ 60336 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 0.15 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.880653E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.164 | TFLOPs: 26.08 | +7: iteration 18050/ 60336 | consumed samples: 4620800 | consumed tokens: 9463398400 | elapsed time per iteration (s): 0.15 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.887413E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.659 | TFLOPs: 26.06 | +7: iteration 18060/ 60336 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 0.15 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.871131E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.872 | TFLOPs: 26.08 | +7: iteration 18070/ 60336 | consumed samples: 4625920 | consumed tokens: 9473884160 | elapsed time per iteration (s): 0.15 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.873682E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.316 | TFLOPs: 26.07 | +7: iteration 18080/ 60336 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 0.15 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.876878E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.282 | TFLOPs: 26.05 | +7: iteration 18090/ 60336 | consumed samples: 4631040 | consumed tokens: 9484369920 | elapsed time per iteration (s): 0.15 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.874338E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.727 | TFLOPs: 26.04 | +7: iteration 18100/ 60336 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 0.15 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.873417E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.153 | TFLOPs: 26.05 | +7: iteration 18110/ 60336 | consumed samples: 4636160 | consumed tokens: 9494855680 | elapsed time per iteration (s): 0.15 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.883019E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.217 | TFLOPs: 26.04 | +7: iteration 18120/ 60336 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 0.15 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.885786E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.921 | TFLOPs: 26.05 | +7: iteration 18130/ 60336 | consumed samples: 4641280 | consumed tokens: 9505341440 | elapsed time per iteration (s): 0.15 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.872127E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.807 | TFLOPs: 26.05 | +7: iteration 18140/ 60336 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 0.15 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.864535E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.920 | TFLOPs: 26.03 | +7: iteration 18150/ 60336 | consumed samples: 4646400 | consumed tokens: 9515827200 | elapsed time per iteration (s): 0.15 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.879247E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.654 | TFLOPs: 26.04 | +7: iteration 18160/ 60336 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 0.15 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.879173E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.744 | TFLOPs: 26.03 | +7: iteration 18170/ 60336 | consumed samples: 4651520 | consumed tokens: 9526312960 | elapsed time per iteration (s): 0.15 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.873948E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.088 | TFLOPs: 26.03 | +7: iteration 18180/ 60336 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 0.15 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.883042E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.411 | TFLOPs: 26.02 | +7: iteration 18190/ 60336 | consumed samples: 4656640 | consumed tokens: 9536798720 | elapsed time per iteration (s): 0.15 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.869625E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.840 | TFLOPs: 26.05 | +7: iteration 18200/ 60336 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 0.15 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.872218E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.020 | TFLOPs: 26.02 | +7: iteration 18210/ 60336 | consumed samples: 4661760 | consumed tokens: 9547284480 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.885499E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.650 | TFLOPs: 25.79 | +7: iteration 18220/ 60336 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 0.15 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.884324E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.143 | TFLOPs: 26.04 | +7: iteration 18230/ 60336 | consumed samples: 4666880 | consumed tokens: 9557770240 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.880232E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.490 | TFLOPs: 26.01 | +7: iteration 18240/ 60336 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.882188E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.031 | TFLOPs: 26.02 | +7: iteration 18250/ 60336 | consumed samples: 4672000 | consumed tokens: 9568256000 | elapsed time per iteration (s): 0.15 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.881963E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.978 | TFLOPs: 26.03 | +7: iteration 18260/ 60336 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 0.15 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.885500E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.370 | TFLOPs: 26.01 | +7: iteration 18270/ 60336 | consumed samples: 4677120 | consumed tokens: 9578741760 | elapsed time per iteration (s): 0.15 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.883028E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.816 | TFLOPs: 26.03 | +7: iteration 18280/ 60336 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.872500E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.266 | TFLOPs: 26.01 | +7: iteration 18290/ 60336 | consumed samples: 4682240 | consumed tokens: 9589227520 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.878730E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.171 | TFLOPs: 26.05 | +7: iteration 18300/ 60336 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.883691E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.430 | TFLOPs: 26.02 | +7: iteration 18310/ 60336 | consumed samples: 4687360 | consumed tokens: 9599713280 | elapsed time per iteration (s): 0.15 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.851777E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.983 | TFLOPs: 26.02 | +7: iteration 18320/ 60336 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 0.15 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.872514E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.237 | TFLOPs: 26.05 | +7: iteration 18330/ 60336 | consumed samples: 4692480 | consumed tokens: 9610199040 | elapsed time per iteration (s): 0.15 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.874797E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.562 | TFLOPs: 26.03 | +7: iteration 18340/ 60336 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 0.15 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.872266E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.833 | TFLOPs: 26.00 | +7: iteration 18350/ 60336 | consumed samples: 4697600 | consumed tokens: 9620684800 | elapsed time per iteration (s): 0.15 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.881167E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.586 | TFLOPs: 26.01 | +7: iteration 18360/ 60336 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 0.15 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.868431E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.844 | TFLOPs: 26.01 | +7: iteration 18370/ 60336 | consumed samples: 4702720 | consumed tokens: 9631170560 | elapsed time per iteration (s): 0.15 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.864523E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.115 | TFLOPs: 26.02 | +7: iteration 18380/ 60336 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.878931E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.547 | TFLOPs: 26.04 | +7: iteration 18390/ 60336 | consumed samples: 4707840 | consumed tokens: 9641656320 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.870588E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.951 | TFLOPs: 26.03 | +7: iteration 18400/ 60336 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.874252E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.994 | TFLOPs: 26.05 | +7: iteration 18410/ 60336 | consumed samples: 4712960 | consumed tokens: 9652142080 | elapsed time per iteration (s): 0.15 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.879947E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.493 | TFLOPs: 26.04 | +7: iteration 18420/ 60336 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 0.15 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.890012E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.580 | TFLOPs: 26.04 | +7: iteration 18430/ 60336 | consumed samples: 4718080 | consumed tokens: 9662627840 | elapsed time per iteration (s): 0.15 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.876192E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.600 | TFLOPs: 26.04 | +7: iteration 18440/ 60336 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 0.15 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.871517E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.588 | TFLOPs: 26.04 | +7: iteration 18450/ 60336 | consumed samples: 4723200 | consumed tokens: 9673113600 | elapsed time per iteration (s): 0.15 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.880529E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.716 | TFLOPs: 26.04 | +7: iteration 18460/ 60336 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 0.15 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.867525E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.007 | TFLOPs: 26.06 | +7: iteration 18470/ 60336 | consumed samples: 4728320 | consumed tokens: 9683599360 | elapsed time per iteration (s): 0.15 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.864768E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.700 | TFLOPs: 26.09 | +7: iteration 18480/ 60336 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 0.15 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.873882E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.823 | TFLOPs: 26.08 | +7: iteration 18490/ 60336 | consumed samples: 4733440 | consumed tokens: 9694085120 | elapsed time per iteration (s): 0.15 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.864893E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.504 | TFLOPs: 26.07 | +7: iteration 18500/ 60336 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 0.15 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.873914E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.773 | TFLOPs: 26.08 | +7: iteration 18510/ 60336 | consumed samples: 4738560 | consumed tokens: 9704570880 | elapsed time per iteration (s): 0.15 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.874134E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.892 | TFLOPs: 26.05 | +7: iteration 18520/ 60336 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 0.15 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.884298E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.637 | TFLOPs: 26.04 | +7: iteration 18530/ 60336 | consumed samples: 4743680 | consumed tokens: 9715056640 | elapsed time per iteration (s): 0.15 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.883600E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.740 | TFLOPs: 26.04 | +7: iteration 18540/ 60336 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 0.15 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.876145E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.476 | TFLOPs: 26.06 | +7: iteration 18550/ 60336 | consumed samples: 4748800 | consumed tokens: 9725542400 | elapsed time per iteration (s): 0.15 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.878588E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.670 | TFLOPs: 26.04 | +7: iteration 18560/ 60336 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 0.15 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.881997E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.744 | TFLOPs: 26.04 | +7: iteration 18570/ 60336 | consumed samples: 4753920 | consumed tokens: 9736028160 | elapsed time per iteration (s): 0.15 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.875592E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.747 | TFLOPs: 26.03 | +7: iteration 18580/ 60336 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 0.15 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.861000E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.758 | TFLOPs: 26.03 | +7: iteration 18590/ 60336 | consumed samples: 4759040 | consumed tokens: 9746513920 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.873179E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.763 | TFLOPs: 26.01 | +7: iteration 18600/ 60336 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.858809E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.115 | TFLOPs: 26.02 | +7: iteration 18610/ 60336 | consumed samples: 4764160 | consumed tokens: 9756999680 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.884138E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.811 | TFLOPs: 26.03 | +7: iteration 18620/ 60336 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 0.15 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.878637E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.108 | TFLOPs: 26.05 | +7: iteration 18630/ 60336 | consumed samples: 4769280 | consumed tokens: 9767485440 | elapsed time per iteration (s): 0.15 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.885084E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.420 | TFLOPs: 26.04 | +7: iteration 18640/ 60336 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 0.15 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.871433E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.937 | TFLOPs: 26.05 | +7: iteration 18650/ 60336 | consumed samples: 4774400 | consumed tokens: 9777971200 | elapsed time per iteration (s): 0.15 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.871348E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.201 | TFLOPs: 26.05 | +7: iteration 18660/ 60336 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 0.15 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.863918E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.054 | TFLOPs: 26.03 | +7: iteration 18670/ 60336 | consumed samples: 4779520 | consumed tokens: 9788456960 | elapsed time per iteration (s): 0.16 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.862948E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.095 | TFLOPs: 25.44 | +7: iteration 18680/ 60336 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.874998E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.140 | TFLOPs: 26.05 | +7: iteration 18690/ 60336 | consumed samples: 4784640 | consumed tokens: 9798942720 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.859229E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.441 | TFLOPs: 26.06 | +7: iteration 18700/ 60336 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.882302E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.354 | TFLOPs: 26.05 | +7: iteration 18710/ 60336 | consumed samples: 4789760 | consumed tokens: 9809428480 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.872837E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.443 | TFLOPs: 26.06 | +7: iteration 18720/ 60336 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.884579E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.963 | TFLOPs: 26.05 | +7: iteration 18730/ 60336 | consumed samples: 4794880 | consumed tokens: 9819914240 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.886740E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.451 | TFLOPs: 26.04 | +7: iteration 18740/ 60336 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.870622E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.124 | TFLOPs: 26.05 | +7: iteration 18750/ 60336 | consumed samples: 4800000 | consumed tokens: 9830400000 | elapsed time per iteration (s): 0.15 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.879412E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.686 | TFLOPs: 26.06 | +7: iteration 18760/ 60336 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 0.15 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.874552E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.165 | TFLOPs: 26.00 | +7: iteration 18770/ 60336 | consumed samples: 4805120 | consumed tokens: 9840885760 | elapsed time per iteration (s): 0.15 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.882061E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.437 | TFLOPs: 26.02 | +7: iteration 18780/ 60336 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.867183E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.986 | TFLOPs: 26.03 | +7: iteration 18790/ 60336 | consumed samples: 4810240 | consumed tokens: 9851371520 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.872395E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.651 | TFLOPs: 26.04 | +7: iteration 18800/ 60336 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 0.15 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.859224E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.576 | TFLOPs: 26.06 | +7: iteration 18810/ 60336 | consumed samples: 4815360 | consumed tokens: 9861857280 | elapsed time per iteration (s): 0.15 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.872048E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.944 | TFLOPs: 26.05 | +7: iteration 18820/ 60336 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.875927E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.725 | TFLOPs: 25.82 | +7: iteration 18830/ 60336 | consumed samples: 4820480 | consumed tokens: 9872343040 | elapsed time per iteration (s): 0.15 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.872099E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.476 | TFLOPs: 26.06 | +7: iteration 18840/ 60336 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 0.15 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.871503E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.959 | TFLOPs: 26.06 | +7: iteration 18850/ 60336 | consumed samples: 4825600 | consumed tokens: 9882828800 | elapsed time per iteration (s): 0.15 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.882304E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.441 | TFLOPs: 26.04 | +7: iteration 18860/ 60336 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 0.15 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.873392E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.215 | TFLOPs: 26.05 | +7: iteration 18870/ 60336 | consumed samples: 4830720 | consumed tokens: 9893314560 | elapsed time per iteration (s): 0.15 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.872466E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.070 | TFLOPs: 26.05 | +7: iteration 18880/ 60336 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.865208E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.917 | TFLOPs: 26.03 | +7: iteration 18890/ 60336 | consumed samples: 4835840 | consumed tokens: 9903800320 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.886524E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.971 | TFLOPs: 26.03 | +7: iteration 18900/ 60336 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.871543E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.458 | TFLOPs: 26.04 | +7: iteration 18910/ 60336 | consumed samples: 4840960 | consumed tokens: 9914286080 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.870806E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.464 | TFLOPs: 26.04 | +7: iteration 18920/ 60336 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.868037E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.031 | TFLOPs: 26.05 | +7: iteration 18930/ 60336 | consumed samples: 4846080 | consumed tokens: 9924771840 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.870406E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.494 | TFLOPs: 26.06 | +7: iteration 18940/ 60336 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.871157E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.706 | TFLOPs: 26.04 | +7: iteration 18950/ 60336 | consumed samples: 4851200 | consumed tokens: 9935257600 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.872127E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.788 | TFLOPs: 26.06 | +7: iteration 18960/ 60336 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.867138E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.028 | TFLOPs: 26.03 | +7: iteration 18970/ 60336 | consumed samples: 4856320 | consumed tokens: 9945743360 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.875141E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.232 | TFLOPs: 26.15 | +7: iteration 18980/ 60336 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.871612E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.605 | TFLOPs: 26.15 | +7: iteration 18990/ 60336 | consumed samples: 4861440 | consumed tokens: 9956229120 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.861025E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.395 | TFLOPs: 26.15 | +7: iteration 19000/ 60336 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.871571E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.434 | TFLOPs: 26.18 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 19000 | lm loss value: 3.922414E+00 | lm loss PPL: 5.052224E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 19000 to checkpoints_44m32b100m +0: [2023-03-17 01:08:18,739] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step19000 is begin to save! +0: [2023-03-17 01:08:18,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:08:18,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:08:18,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:08:18,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:08:18,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:08:18,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:08:18,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:08:18,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:08:18,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:08:18,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:08:18,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:08:18,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:08:18,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:08:18,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:08:18,853] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:08:18,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:08:18,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:08:18,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:08:18,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:08:18,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:08:18,870] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step19000/mp_rank_00_model_states.pt +0: [2023-03-17 01:08:18,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:08:18,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:08:18,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:08:18,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:08:18,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:08:18,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:08:18,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:08:18,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:08:18,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: successfully saved checkpoint at iteration 19000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 174.76 +7: iteration 19010/ 60336 | consumed samples: 4866560 | consumed tokens: 9966714880 | elapsed time per iteration (s): 0.18 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.874736E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.126 | TFLOPs: 22.87 | +7: iteration 19020/ 60336 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 0.15 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.869524E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.268 | TFLOPs: 26.16 | +7: iteration 19030/ 60336 | consumed samples: 4871680 | consumed tokens: 9977200640 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.873106E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.003 | TFLOPs: 26.16 | +7: iteration 19040/ 60336 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.869081E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.490 | TFLOPs: 26.17 | +7: iteration 19050/ 60336 | consumed samples: 4876800 | consumed tokens: 9987686400 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.876043E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.186 | TFLOPs: 26.18 | +7: iteration 19060/ 60336 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.857704E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.255 | TFLOPs: 26.15 | +7: iteration 19070/ 60336 | consumed samples: 4881920 | consumed tokens: 9998172160 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.870902E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.668 | TFLOPs: 26.17 | +7: iteration 19080/ 60336 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 0.15 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.877665E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.106 | TFLOPs: 26.16 | +7: iteration 19090/ 60336 | consumed samples: 4887040 | consumed tokens: 10008657920 | elapsed time per iteration (s): 0.15 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.866018E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.969 | TFLOPs: 26.16 | +7: iteration 19100/ 60336 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 0.15 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.875959E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.888 | TFLOPs: 26.17 | +7: iteration 19110/ 60336 | consumed samples: 4892160 | consumed tokens: 10019143680 | elapsed time per iteration (s): 0.15 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.860730E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.840 | TFLOPs: 26.16 | +7: iteration 19120/ 60336 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 0.15 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.863707E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.947 | TFLOPs: 26.16 | +7: iteration 19130/ 60336 | consumed samples: 4897280 | consumed tokens: 10029629440 | elapsed time per iteration (s): 0.15 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.881139E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.662 | TFLOPs: 26.15 | +7: iteration 19140/ 60336 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.874103E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.578 | TFLOPs: 26.06 | +7: iteration 19150/ 60336 | consumed samples: 4902400 | consumed tokens: 10040115200 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.872971E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.703 | TFLOPs: 26.09 | +7: iteration 19160/ 60336 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.870230E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.397 | TFLOPs: 26.07 | +7: iteration 19170/ 60336 | consumed samples: 4907520 | consumed tokens: 10050600960 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.862636E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.172 | TFLOPs: 26.07 | +7: iteration 19180/ 60336 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.862020E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.773 | TFLOPs: 26.09 | +7: iteration 19190/ 60336 | consumed samples: 4912640 | consumed tokens: 10061086720 | elapsed time per iteration (s): 0.15 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.872311E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.150 | TFLOPs: 26.10 | +7: iteration 19200/ 60336 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 0.15 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.879866E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.107 | TFLOPs: 26.10 | +7: iteration 19210/ 60336 | consumed samples: 4917760 | consumed tokens: 10071572480 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.862986E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.568 | TFLOPs: 25.89 | +7: iteration 19220/ 60336 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 0.15 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.848603E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.701 | TFLOPs: 26.06 | +7: iteration 19230/ 60336 | consumed samples: 4922880 | consumed tokens: 10082058240 | elapsed time per iteration (s): 0.15 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.870995E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.915 | TFLOPs: 26.06 | +7: iteration 19240/ 60336 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.854484E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.997 | TFLOPs: 26.10 | +7: iteration 19250/ 60336 | consumed samples: 4928000 | consumed tokens: 10092544000 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.873936E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.883 | TFLOPs: 26.09 | +7: iteration 19260/ 60336 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.866130E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.526 | TFLOPs: 26.09 | +7: iteration 19270/ 60336 | consumed samples: 4933120 | consumed tokens: 10103029760 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.873475E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.781 | TFLOPs: 26.06 | +7: iteration 19280/ 60336 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.870828E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.310 | TFLOPs: 26.08 | +7: iteration 19290/ 60336 | consumed samples: 4938240 | consumed tokens: 10113515520 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.872033E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.657 | TFLOPs: 26.04 | +7: iteration 19300/ 60336 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.852852E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.335 | TFLOPs: 26.07 | +7: iteration 19310/ 60336 | consumed samples: 4943360 | consumed tokens: 10124001280 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.843191E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.812 | TFLOPs: 26.05 | +7: iteration 19320/ 60336 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.884453E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.636 | TFLOPs: 26.04 | +7: iteration 19330/ 60336 | consumed samples: 4948480 | consumed tokens: 10134487040 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.871584E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.641 | TFLOPs: 26.04 | +7: iteration 19340/ 60336 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.863013E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.357 | TFLOPs: 26.04 | +7: iteration 19350/ 60336 | consumed samples: 4953600 | consumed tokens: 10144972800 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.864781E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.293 | TFLOPs: 26.15 | +7: iteration 19360/ 60336 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.881952E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.835 | TFLOPs: 26.16 | +7: iteration 19370/ 60336 | consumed samples: 4958720 | consumed tokens: 10155458560 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.858052E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.003 | TFLOPs: 26.16 | +7: iteration 19380/ 60336 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.871837E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.670 | TFLOPs: 26.15 | +7: iteration 19390/ 60336 | consumed samples: 4963840 | consumed tokens: 10165944320 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.874557E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.614 | TFLOPs: 26.18 | +7: iteration 19400/ 60336 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.871564E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.971 | TFLOPs: 26.08 | +7: iteration 19410/ 60336 | consumed samples: 4968960 | consumed tokens: 10176430080 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.858892E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.979 | TFLOPs: 26.11 | +7: iteration 19420/ 60336 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.876226E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.618 | TFLOPs: 26.11 | +7: iteration 19430/ 60336 | consumed samples: 4974080 | consumed tokens: 10186915840 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.886523E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.873 | TFLOPs: 26.09 | +7: iteration 19440/ 60336 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.870647E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.404 | TFLOPs: 26.12 | +7: iteration 19450/ 60336 | consumed samples: 4979200 | consumed tokens: 10197401600 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.868566E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.027 | TFLOPs: 26.08 | +7: iteration 19460/ 60336 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.847165E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.587 | TFLOPs: 26.10 | +7: iteration 19470/ 60336 | consumed samples: 4984320 | consumed tokens: 10207887360 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.866529E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.864 | TFLOPs: 26.11 | +7: iteration 19480/ 60336 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.878779E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.240 | TFLOPs: 26.12 | +7: iteration 19490/ 60336 | consumed samples: 4989440 | consumed tokens: 10218373120 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.869310E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.428 | TFLOPs: 26.09 | +7: iteration 19500/ 60336 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.869453E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.099 | TFLOPs: 26.11 | +7: iteration 19510/ 60336 | consumed samples: 4994560 | consumed tokens: 10228858880 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.865845E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.073 | TFLOPs: 26.08 | +7: iteration 19520/ 60336 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.868669E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.432 | TFLOPs: 26.10 | +7: iteration 19530/ 60336 | consumed samples: 4999680 | consumed tokens: 10239344640 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.862912E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.078 | TFLOPs: 26.08 | +7: iteration 19540/ 60336 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.872803E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.800 | TFLOPs: 26.11 | +7: iteration 19550/ 60336 | consumed samples: 5004800 | consumed tokens: 10249830400 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.860011E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.453 | TFLOPs: 26.10 | +7: iteration 19560/ 60336 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.874587E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.515 | TFLOPs: 26.10 | +7: iteration 19570/ 60336 | consumed samples: 5009920 | consumed tokens: 10260316160 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.868862E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.302 | TFLOPs: 26.10 | +7: iteration 19580/ 60336 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.864597E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.081 | TFLOPs: 26.11 | +7: iteration 19590/ 60336 | consumed samples: 5015040 | consumed tokens: 10270801920 | elapsed time per iteration (s): 0.16 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.883078E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.654 | TFLOPs: 25.40 | +7: iteration 19600/ 60336 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.859384E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.979 | TFLOPs: 26.11 | +7: iteration 19610/ 60336 | consumed samples: 5020160 | consumed tokens: 10281287680 | elapsed time per iteration (s): 0.16 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.870481E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.648 | TFLOPs: 25.70 | +7: iteration 19620/ 60336 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.860129E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.202 | TFLOPs: 26.11 | +7: iteration 19630/ 60336 | consumed samples: 5025280 | consumed tokens: 10291773440 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.869576E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.364 | TFLOPs: 26.18 | +7: iteration 19640/ 60336 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.862933E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.877 | TFLOPs: 26.19 | +7: iteration 19650/ 60336 | consumed samples: 5030400 | consumed tokens: 10302259200 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.861731E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.882 | TFLOPs: 26.16 | +7: iteration 19660/ 60336 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.864552E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.083 | TFLOPs: 26.22 | +7: iteration 19670/ 60336 | consumed samples: 5035520 | consumed tokens: 10312744960 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.863290E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.821 | TFLOPs: 26.25 | +7: iteration 19680/ 60336 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.864974E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.813 | TFLOPs: 26.23 | +7: iteration 19690/ 60336 | consumed samples: 5040640 | consumed tokens: 10323230720 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.871601E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.555 | TFLOPs: 26.26 | +7: iteration 19700/ 60336 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.860431E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.582 | TFLOPs: 26.25 | +7: iteration 19710/ 60336 | consumed samples: 5045760 | consumed tokens: 10333716480 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.862465E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.486 | TFLOPs: 26.26 | +7: iteration 19720/ 60336 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.872510E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.459 | TFLOPs: 26.24 | +7: iteration 19730/ 60336 | consumed samples: 5050880 | consumed tokens: 10344202240 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.864454E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.014 | TFLOPs: 26.25 | +7: iteration 19740/ 60336 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.857108E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.262 | TFLOPs: 26.23 | +7: iteration 19750/ 60336 | consumed samples: 5056000 | consumed tokens: 10354688000 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.851527E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.155 | TFLOPs: 26.24 | +7: iteration 19760/ 60336 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 0.16 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.857033E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.061 | TFLOPs: 25.69 | +7: iteration 19770/ 60336 | consumed samples: 5061120 | consumed tokens: 10365173760 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.854632E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.751 | TFLOPs: 26.25 | +7: iteration 19780/ 60336 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 0.16 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.865704E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.712 | TFLOPs: 25.87 | +7: iteration 19790/ 60336 | consumed samples: 5066240 | consumed tokens: 10375659520 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.861845E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.012 | TFLOPs: 26.25 | +7: iteration 19800/ 60336 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.873767E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.503 | TFLOPs: 26.23 | +7: iteration 19810/ 60336 | consumed samples: 5071360 | consumed tokens: 10386145280 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.882658E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.283 | TFLOPs: 26.21 | +7: iteration 19820/ 60336 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.861921E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.729 | TFLOPs: 26.23 | +7: iteration 19830/ 60336 | consumed samples: 5076480 | consumed tokens: 10396631040 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.852663E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.133 | TFLOPs: 26.22 | +7: iteration 19840/ 60336 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.850272E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.811 | TFLOPs: 26.23 | +7: iteration 19850/ 60336 | consumed samples: 5081600 | consumed tokens: 10407116800 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.882851E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.215 | TFLOPs: 26.24 | +7: iteration 19860/ 60336 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.866029E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.630 | TFLOPs: 26.18 | +7: iteration 19870/ 60336 | consumed samples: 5086720 | consumed tokens: 10417602560 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.861848E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.718 | TFLOPs: 26.17 | +7: iteration 19880/ 60336 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.863574E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.356 | TFLOPs: 26.23 | +7: iteration 19890/ 60336 | consumed samples: 5091840 | consumed tokens: 10428088320 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.859129E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.087 | TFLOPs: 26.24 | +7: iteration 19900/ 60336 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.869574E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.152 | TFLOPs: 26.25 | +7: iteration 19910/ 60336 | consumed samples: 5096960 | consumed tokens: 10438574080 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.854763E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.389 | TFLOPs: 26.26 | +7: iteration 19920/ 60336 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 0.15 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.854495E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.681 | TFLOPs: 26.25 | +7: iteration 19930/ 60336 | consumed samples: 5102080 | consumed tokens: 10449059840 | elapsed time per iteration (s): 0.15 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.877747E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.297 | TFLOPs: 26.26 | +7: iteration 19940/ 60336 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 0.15 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.865417E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.363 | TFLOPs: 26.26 | +7: iteration 19950/ 60336 | consumed samples: 5107200 | consumed tokens: 10459545600 | elapsed time per iteration (s): 0.15 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.859986E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.499 | TFLOPs: 26.26 | +7: iteration 19960/ 60336 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 0.15 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.853050E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.746 | TFLOPs: 26.26 | +7: iteration 19970/ 60336 | consumed samples: 5112320 | consumed tokens: 10470031360 | elapsed time per iteration (s): 0.15 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.852307E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.826 | TFLOPs: 26.27 | +7: iteration 19980/ 60336 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 0.15 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.846894E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.126 | TFLOPs: 26.25 | +7: iteration 19990/ 60336 | consumed samples: 5117440 | consumed tokens: 10480517120 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.861455E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.716 | TFLOPs: 26.26 | +0: [2023-03-17 01:10:52,530] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[0.0001570914128216334, 0.0001570914128216334, 0.0001570914128216334], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 20000/ 60336 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.865010E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.532 | TFLOPs: 26.26 | +0: steps: 20000 loss: 3.8694 iter time (s): 0.153 samples/sec: 1678.469 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 20000 | lm loss value: 3.976112E+00 | lm loss PPL: 5.330936E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 20000 to checkpoints_44m32b100m +0: [2023-03-17 01:10:52,603] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! +0: [2023-03-17 01:10:52,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:10:52,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:10:52,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:10:52,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:10:52,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:10:52,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:10:52,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:10:52,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:10:52,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:10:52,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:10:52,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:10:52,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:10:52,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:10:52,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:10:52,722] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:10:52,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:10:52,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:10:52,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:10:52,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:10:52,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:10:52,739] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step20000/mp_rank_00_model_states.pt +0: [2023-03-17 01:10:52,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:10:52,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:52,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:52,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:10:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:10:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: successfully saved checkpoint at iteration 20000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 182.84 +7: iteration 20010/ 60336 | consumed samples: 5122560 | consumed tokens: 10491002880 | elapsed time per iteration (s): 0.18 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.859498E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.643 | TFLOPs: 22.53 | +7: iteration 20020/ 60336 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.853551E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.016 | TFLOPs: 26.25 | +7: iteration 20030/ 60336 | consumed samples: 5127680 | consumed tokens: 10501488640 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.869245E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.318 | TFLOPs: 26.26 | +7: iteration 20040/ 60336 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 0.15 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.864783E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.715 | TFLOPs: 26.26 | +7: iteration 20050/ 60336 | consumed samples: 5132800 | consumed tokens: 10511974400 | elapsed time per iteration (s): 0.15 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.872782E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.729 | TFLOPs: 26.26 | +7: iteration 20060/ 60336 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.865827E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.130 | TFLOPs: 26.25 | +7: iteration 20070/ 60336 | consumed samples: 5137920 | consumed tokens: 10522460160 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.862459E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.560 | TFLOPs: 26.28 | +7: iteration 20080/ 60336 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.867863E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.886 | TFLOPs: 26.27 | +7: iteration 20090/ 60336 | consumed samples: 5143040 | consumed tokens: 10532945920 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.868918E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.232 | TFLOPs: 26.24 | +7: iteration 20100/ 60336 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.860986E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.573 | TFLOPs: 26.26 | +7: iteration 20110/ 60336 | consumed samples: 5148160 | consumed tokens: 10543431680 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.864622E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.513 | TFLOPs: 26.06 | +7: iteration 20120/ 60336 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.868968E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.658 | TFLOPs: 26.26 | +7: iteration 20130/ 60336 | consumed samples: 5153280 | consumed tokens: 10553917440 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.858023E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.857 | TFLOPs: 26.25 | +7: iteration 20140/ 60336 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 0.16 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.868347E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.791 | TFLOPs: 25.86 | +7: iteration 20150/ 60336 | consumed samples: 5158400 | consumed tokens: 10564403200 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.861669E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.880 | TFLOPs: 26.25 | +7: iteration 20160/ 60336 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.855126E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.479 | TFLOPs: 26.24 | +7: iteration 20170/ 60336 | consumed samples: 5163520 | consumed tokens: 10574888960 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.864388E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.466 | TFLOPs: 26.18 | +7: iteration 20180/ 60336 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.868805E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.765 | TFLOPs: 26.17 | +7: iteration 20190/ 60336 | consumed samples: 5168640 | consumed tokens: 10585374720 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.865182E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.215 | TFLOPs: 26.22 | +7: iteration 20200/ 60336 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.861177E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.508 | TFLOPs: 26.23 | +7: iteration 20210/ 60336 | consumed samples: 5173760 | consumed tokens: 10595860480 | elapsed time per iteration (s): 0.16 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.867416E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.165 | TFLOPs: 25.71 | +7: iteration 20220/ 60336 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.866518E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.438 | TFLOPs: 26.12 | +7: iteration 20230/ 60336 | consumed samples: 5178880 | consumed tokens: 10606346240 | elapsed time per iteration (s): 0.16 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.850713E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.844 | TFLOPs: 25.59 | +7: iteration 20240/ 60336 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.865429E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.236 | TFLOPs: 26.15 | +7: iteration 20250/ 60336 | consumed samples: 5184000 | consumed tokens: 10616832000 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.864597E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.363 | TFLOPs: 26.12 | +7: iteration 20260/ 60336 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.863107E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.334 | TFLOPs: 26.13 | +7: iteration 20270/ 60336 | consumed samples: 5189120 | consumed tokens: 10627317760 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.861469E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.252 | TFLOPs: 26.15 | +7: iteration 20280/ 60336 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.868695E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.340 | TFLOPs: 26.13 | +7: iteration 20290/ 60336 | consumed samples: 5194240 | consumed tokens: 10637803520 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.874465E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.605 | TFLOPs: 26.12 | +7: iteration 20300/ 60336 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.886367E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.931 | TFLOPs: 26.14 | +7: iteration 20310/ 60336 | consumed samples: 5199360 | consumed tokens: 10648289280 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.861863E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.376 | TFLOPs: 26.13 | +7: iteration 20320/ 60336 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.870291E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.901 | TFLOPs: 26.16 | +7: iteration 20330/ 60336 | consumed samples: 5204480 | consumed tokens: 10658775040 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.850562E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.406 | TFLOPs: 26.13 | +7: iteration 20340/ 60336 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.859383E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.931 | TFLOPs: 26.16 | +7: iteration 20350/ 60336 | consumed samples: 5209600 | consumed tokens: 10669260800 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.852317E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.474 | TFLOPs: 26.15 | +7: iteration 20360/ 60336 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.859930E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.235 | TFLOPs: 26.16 | +7: iteration 20370/ 60336 | consumed samples: 5214720 | consumed tokens: 10679746560 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.860382E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.072 | TFLOPs: 26.16 | +7: iteration 20380/ 60336 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.862784E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.209 | TFLOPs: 26.15 | +7: iteration 20390/ 60336 | consumed samples: 5219840 | consumed tokens: 10690232320 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.853279E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.859 | TFLOPs: 26.12 | +7: iteration 20400/ 60336 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.855300E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.305 | TFLOPs: 26.15 | +7: iteration 20410/ 60336 | consumed samples: 5224960 | consumed tokens: 10700718080 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.859015E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.103 | TFLOPs: 26.14 | +7: iteration 20420/ 60336 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.858736E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.052 | TFLOPs: 26.14 | +7: iteration 20430/ 60336 | consumed samples: 5230080 | consumed tokens: 10711203840 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.863634E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.291 | TFLOPs: 26.12 | +7: iteration 20440/ 60336 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.851681E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.915 | TFLOPs: 26.16 | +7: iteration 20450/ 60336 | consumed samples: 5235200 | consumed tokens: 10721689600 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.859288E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.567 | TFLOPs: 26.15 | +7: iteration 20460/ 60336 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.882279E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.117 | TFLOPs: 26.16 | +7: iteration 20470/ 60336 | consumed samples: 5240320 | consumed tokens: 10732175360 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.844241E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.680 | TFLOPs: 26.14 | +7: iteration 20480/ 60336 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.865938E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.794 | TFLOPs: 26.14 | +7: iteration 20490/ 60336 | consumed samples: 5245440 | consumed tokens: 10742661120 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.859030E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.253 | TFLOPs: 26.15 | +7: iteration 20500/ 60336 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.858493E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.029 | TFLOPs: 26.16 | +7: iteration 20510/ 60336 | consumed samples: 5250560 | consumed tokens: 10753146880 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.863892E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.353 | TFLOPs: 26.15 | +7: iteration 20520/ 60336 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.858989E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.774 | TFLOPs: 26.14 | +7: iteration 20530/ 60336 | consumed samples: 5255680 | consumed tokens: 10763632640 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.867548E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.617 | TFLOPs: 26.17 | +7: iteration 20540/ 60336 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.863312E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.477 | TFLOPs: 26.17 | +7: iteration 20550/ 60336 | consumed samples: 5260800 | consumed tokens: 10774118400 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.859879E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.508 | TFLOPs: 26.15 | +7: iteration 20560/ 60336 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.857540E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.524 | TFLOPs: 26.17 | +7: iteration 20570/ 60336 | consumed samples: 5265920 | consumed tokens: 10784604160 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.859627E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.952 | TFLOPs: 26.16 | +7: iteration 20580/ 60336 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.850529E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.079 | TFLOPs: 26.14 | +7: iteration 20590/ 60336 | consumed samples: 5271040 | consumed tokens: 10795089920 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.848984E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.020 | TFLOPs: 26.13 | +7: iteration 20600/ 60336 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.862637E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.732 | TFLOPs: 26.12 | +7: iteration 20610/ 60336 | consumed samples: 5276160 | consumed tokens: 10805575680 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.854308E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.199 | TFLOPs: 26.15 | +7: iteration 20620/ 60336 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.841549E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.211 | TFLOPs: 26.13 | +7: iteration 20630/ 60336 | consumed samples: 5281280 | consumed tokens: 10816061440 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.839680E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.291 | TFLOPs: 26.13 | +7: iteration 20640/ 60336 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.862844E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.635 | TFLOPs: 26.09 | +7: iteration 20650/ 60336 | consumed samples: 5286400 | consumed tokens: 10826547200 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.851476E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.605 | TFLOPs: 26.12 | +7: iteration 20660/ 60336 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.844787E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.481 | TFLOPs: 26.13 | +7: iteration 20670/ 60336 | consumed samples: 5291520 | consumed tokens: 10837032960 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.866226E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.848 | TFLOPs: 26.11 | +7: iteration 20680/ 60336 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.862384E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.281 | TFLOPs: 26.15 | +7: iteration 20690/ 60336 | consumed samples: 5296640 | consumed tokens: 10847518720 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.867530E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.006 | TFLOPs: 26.13 | +7: iteration 20700/ 60336 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.866885E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.308 | TFLOPs: 26.05 | +7: iteration 20710/ 60336 | consumed samples: 5301760 | consumed tokens: 10858004480 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.863559E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.643 | TFLOPs: 26.03 | +7: iteration 20720/ 60336 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.852103E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.220 | TFLOPs: 26.05 | +7: iteration 20730/ 60336 | consumed samples: 5306880 | consumed tokens: 10868490240 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.854615E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.351 | TFLOPs: 26.12 | +7: iteration 20740/ 60336 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.863183E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.585 | TFLOPs: 26.15 | +7: iteration 20750/ 60336 | consumed samples: 5312000 | consumed tokens: 10878976000 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.859975E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.746 | TFLOPs: 26.11 | +7: iteration 20760/ 60336 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.870280E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.250 | TFLOPs: 26.13 | +7: iteration 20770/ 60336 | consumed samples: 5317120 | consumed tokens: 10889461760 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.863681E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.523 | TFLOPs: 26.12 | +7: iteration 20780/ 60336 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.855613E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.745 | TFLOPs: 26.14 | +7: iteration 20790/ 60336 | consumed samples: 5322240 | consumed tokens: 10899947520 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.859555E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.096 | TFLOPs: 26.16 | +7: iteration 20800/ 60336 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.861409E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.177 | TFLOPs: 26.15 | +7: iteration 20810/ 60336 | consumed samples: 5327360 | consumed tokens: 10910433280 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.856566E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.637 | TFLOPs: 26.09 | +7: iteration 20820/ 60336 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.843534E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.472 | TFLOPs: 26.15 | +7: iteration 20830/ 60336 | consumed samples: 5332480 | consumed tokens: 10920919040 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.866765E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.927 | TFLOPs: 26.09 | +7: iteration 20840/ 60336 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.867199E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.661 | TFLOPs: 26.12 | +7: iteration 20850/ 60336 | consumed samples: 5337600 | consumed tokens: 10931404800 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.854810E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.735 | TFLOPs: 26.11 | +7: iteration 20860/ 60336 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.859852E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.674 | TFLOPs: 26.12 | +7: iteration 20870/ 60336 | consumed samples: 5342720 | consumed tokens: 10941890560 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.848237E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.892 | TFLOPs: 26.13 | +7: iteration 20880/ 60336 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.850318E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.628 | TFLOPs: 26.15 | +7: iteration 20890/ 60336 | consumed samples: 5347840 | consumed tokens: 10952376320 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.871656E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.061 | TFLOPs: 26.14 | +7: iteration 20900/ 60336 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.846581E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.022 | TFLOPs: 26.14 | +7: iteration 20910/ 60336 | consumed samples: 5352960 | consumed tokens: 10962862080 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.863124E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.231 | TFLOPs: 26.15 | +7: iteration 20920/ 60336 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.836518E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.006 | TFLOPs: 26.14 | +7: iteration 20930/ 60336 | consumed samples: 5358080 | consumed tokens: 10973347840 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.868095E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.167 | TFLOPs: 26.13 | +7: iteration 20940/ 60336 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.858781E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.489 | TFLOPs: 26.09 | +7: iteration 20950/ 60336 | consumed samples: 5363200 | consumed tokens: 10983833600 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.840533E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.535 | TFLOPs: 26.06 | +7: iteration 20960/ 60336 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.863645E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.004 | TFLOPs: 26.03 | +7: iteration 20970/ 60336 | consumed samples: 5368320 | consumed tokens: 10994319360 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.858053E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.329 | TFLOPs: 26.04 | +7: iteration 20980/ 60336 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.856536E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.879 | TFLOPs: 26.03 | +7: iteration 20990/ 60336 | consumed samples: 5373440 | consumed tokens: 11004805120 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.852669E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.061 | TFLOPs: 26.03 | +7: iteration 21000/ 60336 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.864942E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.384 | TFLOPs: 26.01 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 21000 | lm loss value: 3.955186E+00 | lm loss PPL: 5.220542E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 21000 to checkpoints_44m32b100m +0: [2023-03-17 01:13:26,461] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21000 is begin to save! +0: [2023-03-17 01:13:26,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:13:26,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:13:26,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:13:26,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:13:26,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:13:26,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:13:26,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:13:26,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:13:26,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:13:26,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:13:26,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:13:26,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:13:26,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:13:26,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:13:26,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:13:26,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:13:26,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:13:26,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:13:26,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:13:26,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:13:26,594] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step21000/mp_rank_00_model_states.pt +0: [2023-03-17 01:13:26,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:13:26,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:13:26,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:13:26,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:13:26,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:13:26,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: successfully saved checkpoint at iteration 21000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 179.15 +7: iteration 21010/ 60336 | consumed samples: 5378560 | consumed tokens: 11015290880 | elapsed time per iteration (s): 0.18 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.866875E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.144 | TFLOPs: 22.77 | +7: iteration 21020/ 60336 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.861877E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.844 | TFLOPs: 26.16 | +7: iteration 21030/ 60336 | consumed samples: 5383680 | consumed tokens: 11025776640 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.858288E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.993 | TFLOPs: 26.14 | +7: iteration 21040/ 60336 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.853469E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.801 | TFLOPs: 26.16 | +7: iteration 21050/ 60336 | consumed samples: 5388800 | consumed tokens: 11036262400 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.856203E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.517 | TFLOPs: 26.14 | +7: iteration 21060/ 60336 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.863488E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.172 | TFLOPs: 26.16 | +7: iteration 21070/ 60336 | consumed samples: 5393920 | consumed tokens: 11046748160 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.872194E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.482 | TFLOPs: 26.15 | +7: iteration 21080/ 60336 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.858076E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.332 | TFLOPs: 26.10 | +7: iteration 21090/ 60336 | consumed samples: 5399040 | consumed tokens: 11057233920 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.859503E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.094 | TFLOPs: 26.11 | +7: iteration 21100/ 60336 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.849988E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.374 | TFLOPs: 26.13 | +7: iteration 21110/ 60336 | consumed samples: 5404160 | consumed tokens: 11067719680 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.857343E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.237 | TFLOPs: 26.07 | +7: iteration 21120/ 60336 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.839418E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.282 | TFLOPs: 26.04 | +7: iteration 21130/ 60336 | consumed samples: 5409280 | consumed tokens: 11078205440 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.860561E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.043 | TFLOPs: 26.03 | +7: iteration 21140/ 60336 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.857761E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.067 | TFLOPs: 26.07 | +7: iteration 21150/ 60336 | consumed samples: 5414400 | consumed tokens: 11088691200 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.862794E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.825 | TFLOPs: 26.01 | +7: iteration 21160/ 60336 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.847412E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.265 | TFLOPs: 26.02 | +7: iteration 21170/ 60336 | consumed samples: 5419520 | consumed tokens: 11099176960 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.865104E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.321 | TFLOPs: 26.02 | +7: iteration 21180/ 60336 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.857135E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.480 | TFLOPs: 26.04 | +7: iteration 21190/ 60336 | consumed samples: 5424640 | consumed tokens: 11109662720 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.852653E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.731 | TFLOPs: 26.01 | +7: iteration 21200/ 60336 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.861996E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.914 | TFLOPs: 26.02 | +7: iteration 21210/ 60336 | consumed samples: 5429760 | consumed tokens: 11120148480 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.853262E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.695 | TFLOPs: 26.01 | +7: iteration 21220/ 60336 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.844083E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.829 | TFLOPs: 26.03 | +7: iteration 21230/ 60336 | consumed samples: 5434880 | consumed tokens: 11130634240 | elapsed time per iteration (s): 0.16 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.851401E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.115 | TFLOPs: 25.89 | +7: iteration 21240/ 60336 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.851228E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.436 | TFLOPs: 26.01 | +7: iteration 21250/ 60336 | consumed samples: 5440000 | consumed tokens: 11141120000 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.858870E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.501 | TFLOPs: 26.03 | +7: iteration 21260/ 60336 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.859298E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.706 | TFLOPs: 26.12 | +7: iteration 21270/ 60336 | consumed samples: 5445120 | consumed tokens: 11151605760 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.851628E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.383 | TFLOPs: 26.12 | +7: iteration 21280/ 60336 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.854937E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.723 | TFLOPs: 26.12 | +7: iteration 21290/ 60336 | consumed samples: 5450240 | consumed tokens: 11162091520 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.841786E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.344 | TFLOPs: 26.12 | +7: iteration 21300/ 60336 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.843225E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.756 | TFLOPs: 26.12 | +7: iteration 21310/ 60336 | consumed samples: 5455360 | consumed tokens: 11172577280 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.849538E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.985 | TFLOPs: 26.10 | +7: iteration 21320/ 60336 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.858714E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.478 | TFLOPs: 26.10 | +7: iteration 21330/ 60336 | consumed samples: 5460480 | consumed tokens: 11183063040 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.860062E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.010 | TFLOPs: 26.10 | +7: iteration 21340/ 60336 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.856959E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.694 | TFLOPs: 26.08 | +7: iteration 21350/ 60336 | consumed samples: 5465600 | consumed tokens: 11193548800 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.851522E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.602 | TFLOPs: 26.07 | +7: iteration 21360/ 60336 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.866768E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.005 | TFLOPs: 26.11 | +7: iteration 21370/ 60336 | consumed samples: 5470720 | consumed tokens: 11204034560 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.859694E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.649 | TFLOPs: 26.09 | +7: iteration 21380/ 60336 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.846198E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.564 | TFLOPs: 26.10 | +7: iteration 21390/ 60336 | consumed samples: 5475840 | consumed tokens: 11214520320 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.864975E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.574 | TFLOPs: 26.10 | +7: iteration 21400/ 60336 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.838236E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.921 | TFLOPs: 26.09 | +7: iteration 21410/ 60336 | consumed samples: 5480960 | consumed tokens: 11225006080 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.848774E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.182 | TFLOPs: 26.10 | +7: iteration 21420/ 60336 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.850767E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.590 | TFLOPs: 26.10 | +7: iteration 21430/ 60336 | consumed samples: 5486080 | consumed tokens: 11235491840 | elapsed time per iteration (s): 0.16 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.854741E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.378 | TFLOPs: 25.80 | +7: iteration 21440/ 60336 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.850177E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.947 | TFLOPs: 26.09 | +7: iteration 21450/ 60336 | consumed samples: 5491200 | consumed tokens: 11245977600 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.851069E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.086 | TFLOPs: 26.11 | +7: iteration 21460/ 60336 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.845631E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.387 | TFLOPs: 26.09 | +7: iteration 21470/ 60336 | consumed samples: 5496320 | consumed tokens: 11256463360 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.845741E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.873 | TFLOPs: 26.09 | +7: iteration 21480/ 60336 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.854010E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.735 | TFLOPs: 26.09 | +7: iteration 21490/ 60336 | consumed samples: 5501440 | consumed tokens: 11266949120 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.861976E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.451 | TFLOPs: 26.09 | +7: iteration 21500/ 60336 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.846902E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.941 | TFLOPs: 26.09 | +7: iteration 21510/ 60336 | consumed samples: 5506560 | consumed tokens: 11277434880 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.867198E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.374 | TFLOPs: 26.10 | +7: iteration 21520/ 60336 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.843983E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.136 | TFLOPs: 26.11 | +7: iteration 21530/ 60336 | consumed samples: 5511680 | consumed tokens: 11287920640 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.852432E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.319 | TFLOPs: 26.10 | +7: iteration 21540/ 60336 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.847411E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.195 | TFLOPs: 26.10 | +7: iteration 21550/ 60336 | consumed samples: 5516800 | consumed tokens: 11298406400 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.862324E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.166 | TFLOPs: 26.08 | +7: iteration 21560/ 60336 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.840403E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.188 | TFLOPs: 26.10 | +7: iteration 21570/ 60336 | consumed samples: 5521920 | consumed tokens: 11308892160 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.864297E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.875 | TFLOPs: 26.08 | +7: iteration 21580/ 60336 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 0.16 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.851955E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.423 | TFLOPs: 25.87 | +7: iteration 21590/ 60336 | consumed samples: 5527040 | consumed tokens: 11319377920 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.847843E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.344 | TFLOPs: 26.07 | +7: iteration 21600/ 60336 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.850468E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.498 | TFLOPs: 26.09 | +7: iteration 21610/ 60336 | consumed samples: 5532160 | consumed tokens: 11329863680 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.869140E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.476 | TFLOPs: 26.07 | +7: iteration 21620/ 60336 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.854427E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.365 | TFLOPs: 26.05 | +7: iteration 21630/ 60336 | consumed samples: 5537280 | consumed tokens: 11340349440 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.858708E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.880 | TFLOPs: 26.09 | +7: iteration 21640/ 60336 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.852402E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.767 | TFLOPs: 26.08 | +7: iteration 21650/ 60336 | consumed samples: 5542400 | consumed tokens: 11350835200 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.844721E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.058 | TFLOPs: 26.08 | +7: iteration 21660/ 60336 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.851443E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.034 | TFLOPs: 26.08 | +7: iteration 21670/ 60336 | consumed samples: 5547520 | consumed tokens: 11361320960 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.857041E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.781 | TFLOPs: 26.09 | +7: iteration 21680/ 60336 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.836733E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.598 | TFLOPs: 26.09 | +7: iteration 21690/ 60336 | consumed samples: 5552640 | consumed tokens: 11371806720 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.842323E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.677 | TFLOPs: 26.09 | +7: iteration 21700/ 60336 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.862703E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.313 | TFLOPs: 26.15 | +7: iteration 21710/ 60336 | consumed samples: 5557760 | consumed tokens: 11382292480 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.850325E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.186 | TFLOPs: 26.15 | +7: iteration 21720/ 60336 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.863270E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.971 | TFLOPs: 26.14 | +7: iteration 21730/ 60336 | consumed samples: 5562880 | consumed tokens: 11392778240 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.846368E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.694 | TFLOPs: 26.15 | +7: iteration 21740/ 60336 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.843546E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.479 | TFLOPs: 26.15 | +7: iteration 21750/ 60336 | consumed samples: 5568000 | consumed tokens: 11403264000 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.841385E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.491 | TFLOPs: 26.13 | +7: iteration 21760/ 60336 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.859842E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.132 | TFLOPs: 26.16 | +7: iteration 21770/ 60336 | consumed samples: 5573120 | consumed tokens: 11413749760 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.863783E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.939 | TFLOPs: 26.16 | +7: iteration 21780/ 60336 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.847294E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.904 | TFLOPs: 26.16 | +7: iteration 21790/ 60336 | consumed samples: 5578240 | consumed tokens: 11424235520 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.848395E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.515 | TFLOPs: 26.17 | +7: iteration 21800/ 60336 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.858450E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.130 | TFLOPs: 26.16 | +7: iteration 21810/ 60336 | consumed samples: 5583360 | consumed tokens: 11434721280 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.843496E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.372 | TFLOPs: 26.15 | +7: iteration 21820/ 60336 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.863049E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.027 | TFLOPs: 26.16 | +7: iteration 21830/ 60336 | consumed samples: 5588480 | consumed tokens: 11445207040 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.842808E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.063 | TFLOPs: 26.14 | +7: iteration 21840/ 60336 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.843947E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.397 | TFLOPs: 26.15 | +7: iteration 21850/ 60336 | consumed samples: 5593600 | consumed tokens: 11455692800 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.858438E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.556 | TFLOPs: 26.14 | +7: iteration 21860/ 60336 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.850632E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.846 | TFLOPs: 26.14 | +7: iteration 21870/ 60336 | consumed samples: 5598720 | consumed tokens: 11466178560 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.846543E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.160 | TFLOPs: 26.15 | +7: iteration 21880/ 60336 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.835368E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.601 | TFLOPs: 26.17 | +7: iteration 21890/ 60336 | consumed samples: 5603840 | consumed tokens: 11476664320 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.862227E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.379 | TFLOPs: 26.15 | +7: iteration 21900/ 60336 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.847436E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.738 | TFLOPs: 26.14 | +7: iteration 21910/ 60336 | consumed samples: 5608960 | consumed tokens: 11487150080 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.844074E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.644 | TFLOPs: 26.14 | +7: iteration 21920/ 60336 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.857560E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.781 | TFLOPs: 26.15 | +7: iteration 21930/ 60336 | consumed samples: 5614080 | consumed tokens: 11497635840 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.855515E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.705 | TFLOPs: 26.14 | +7: iteration 21940/ 60336 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.838047E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.244 | TFLOPs: 26.13 | +7: iteration 21950/ 60336 | consumed samples: 5619200 | consumed tokens: 11508121600 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.848813E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.745 | TFLOPs: 26.14 | +7: iteration 21960/ 60336 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.838309E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.616 | TFLOPs: 26.15 | +7: iteration 21970/ 60336 | consumed samples: 5624320 | consumed tokens: 11518607360 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.848536E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.264 | TFLOPs: 26.15 | +7: iteration 21980/ 60336 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.849758E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.941 | TFLOPs: 26.14 | +7: iteration 21990/ 60336 | consumed samples: 5629440 | consumed tokens: 11529093120 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.842475E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.190 | TFLOPs: 26.15 | +0: [2023-03-17 01:16:00,464] [INFO] [logging.py:68:log_dist] [Rank 0] step=22000, skipped=0, lr=[0.00014877838953352202, 0.00014877838953352202, 0.00014877838953352202], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 22000/ 60336 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.837500E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.458 | TFLOPs: 26.12 | +0: steps: 22000 loss: 3.8468 iter time (s): 0.152 samples/sec: 1681.398 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 22000 | lm loss value: 3.955485E+00 | lm loss PPL: 5.222101E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 22000 to checkpoints_44m32b100m +0: [2023-03-17 01:16:00,537] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is begin to save! +0: [2023-03-17 01:16:00,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:16:00,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:16:00,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:16:00,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:16:00,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:16:00,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:16:00,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:16:00,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:16:00,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:16:00,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:16:00,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:16:00,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:16:00,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:16:00,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:16:00,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:16:00,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:16:00,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:16:00,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:16:00,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:16:00,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:16:00,669] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step22000/mp_rank_00_model_states.pt +0: [2023-03-17 01:16:00,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:16:00,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:16:00,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:16:00,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:16:00,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:16:00,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:16:00,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:16:00,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:16:00,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:16:00,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:16:00,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: successfully saved checkpoint at iteration 22000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.39 +7: iteration 22010/ 60336 | consumed samples: 5634560 | consumed tokens: 11539578880 | elapsed time per iteration (s): 0.18 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.857942E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.826 | TFLOPs: 22.55 | +7: iteration 22020/ 60336 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.853862E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.675 | TFLOPs: 26.14 | +7: iteration 22030/ 60336 | consumed samples: 5639680 | consumed tokens: 11550064640 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.851048E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.825 | TFLOPs: 26.12 | +7: iteration 22040/ 60336 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.851127E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.158 | TFLOPs: 26.15 | +7: iteration 22050/ 60336 | consumed samples: 5644800 | consumed tokens: 11560550400 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.856686E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.084 | TFLOPs: 26.13 | +7: iteration 22060/ 60336 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.849556E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.744 | TFLOPs: 26.12 | +7: iteration 22070/ 60336 | consumed samples: 5649920 | consumed tokens: 11571036160 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.851558E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.363 | TFLOPs: 26.12 | +7: iteration 22080/ 60336 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.841290E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.620 | TFLOPs: 26.11 | +7: iteration 22090/ 60336 | consumed samples: 5655040 | consumed tokens: 11581521920 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.859598E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.306 | TFLOPs: 26.07 | +7: iteration 22100/ 60336 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 0.16 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.846333E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.003 | TFLOPs: 25.77 | +7: iteration 22110/ 60336 | consumed samples: 5660160 | consumed tokens: 11592007680 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.841935E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.545 | TFLOPs: 26.09 | +7: iteration 22120/ 60336 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.850296E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.635 | TFLOPs: 26.07 | +7: iteration 22130/ 60336 | consumed samples: 5665280 | consumed tokens: 11602493440 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.859809E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.558 | TFLOPs: 26.07 | +7: iteration 22140/ 60336 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.841828E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.740 | TFLOPs: 26.08 | +7: iteration 22150/ 60336 | consumed samples: 5670400 | consumed tokens: 11612979200 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.847422E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.255 | TFLOPs: 26.07 | +7: iteration 22160/ 60336 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.858586E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.558 | TFLOPs: 26.06 | +7: iteration 22170/ 60336 | consumed samples: 5675520 | consumed tokens: 11623464960 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.837477E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.596 | TFLOPs: 26.07 | +7: iteration 22180/ 60336 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.846827E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.356 | TFLOPs: 26.10 | +7: iteration 22190/ 60336 | consumed samples: 5680640 | consumed tokens: 11633950720 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.851566E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.054 | TFLOPs: 26.13 | +7: iteration 22200/ 60336 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.832188E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.348 | TFLOPs: 26.12 | +7: iteration 22210/ 60336 | consumed samples: 5685760 | consumed tokens: 11644436480 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.849236E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.880 | TFLOPs: 26.13 | +7: iteration 22220/ 60336 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.848669E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.177 | TFLOPs: 26.13 | +7: iteration 22230/ 60336 | consumed samples: 5690880 | consumed tokens: 11654922240 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.858517E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.798 | TFLOPs: 26.14 | +7: iteration 22240/ 60336 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.860992E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.635 | TFLOPs: 26.12 | +7: iteration 22250/ 60336 | consumed samples: 5696000 | consumed tokens: 11665408000 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.860278E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: iteration 22260/ 60336 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.845217E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.052 | TFLOPs: 26.13 | +7: iteration 22270/ 60336 | consumed samples: 5701120 | consumed tokens: 11675893760 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.852277E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.323 | TFLOPs: 26.13 | +7: iteration 22280/ 60336 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.855396E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.093 | TFLOPs: 26.13 | +7: iteration 22290/ 60336 | consumed samples: 5706240 | consumed tokens: 11686379520 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.837265E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.541 | TFLOPs: 26.12 | +7: iteration 22300/ 60336 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.843525E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.803 | TFLOPs: 26.12 | +7: iteration 22310/ 60336 | consumed samples: 5711360 | consumed tokens: 11696865280 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.849828E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.027 | TFLOPs: 26.14 | +7: iteration 22320/ 60336 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.836636E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.104 | TFLOPs: 26.14 | +7: iteration 22330/ 60336 | consumed samples: 5716480 | consumed tokens: 11707351040 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.834464E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.480 | TFLOPs: 26.09 | +7: iteration 22340/ 60336 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.844257E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.992 | TFLOPs: 26.11 | +7: iteration 22350/ 60336 | consumed samples: 5721600 | consumed tokens: 11717836800 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.851502E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.026 | TFLOPs: 26.11 | +7: iteration 22360/ 60336 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.836212E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.310 | TFLOPs: 26.10 | +7: iteration 22370/ 60336 | consumed samples: 5726720 | consumed tokens: 11728322560 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.833297E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.521 | TFLOPs: 26.10 | +7: iteration 22380/ 60336 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 0.15 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.842847E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.686 | TFLOPs: 26.09 | +7: iteration 22390/ 60336 | consumed samples: 5731840 | consumed tokens: 11738808320 | elapsed time per iteration (s): 0.15 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.846571E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.333 | TFLOPs: 26.10 | +7: iteration 22400/ 60336 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 0.15 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.838866E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.675 | TFLOPs: 26.07 | +7: iteration 22410/ 60336 | consumed samples: 5736960 | consumed tokens: 11749294080 | elapsed time per iteration (s): 0.15 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.852157E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.526 | TFLOPs: 26.06 | +7: iteration 22420/ 60336 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 0.15 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.850193E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.875 | TFLOPs: 26.08 | +7: iteration 22430/ 60336 | consumed samples: 5742080 | consumed tokens: 11759779840 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.845405E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.287 | TFLOPs: 26.08 | +7: iteration 22440/ 60336 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.844486E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.449 | TFLOPs: 26.07 | +7: iteration 22450/ 60336 | consumed samples: 5747200 | consumed tokens: 11770265600 | elapsed time per iteration (s): 0.15 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.838162E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.688 | TFLOPs: 26.08 | +7: iteration 22460/ 60336 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 0.15 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.834512E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.632 | TFLOPs: 26.06 | +7: iteration 22470/ 60336 | consumed samples: 5752320 | consumed tokens: 11780751360 | elapsed time per iteration (s): 0.15 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.834931E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.397 | TFLOPs: 26.07 | +7: iteration 22480/ 60336 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 0.15 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.839912E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.568 | TFLOPs: 26.10 | +7: iteration 22490/ 60336 | consumed samples: 5757440 | consumed tokens: 11791237120 | elapsed time per iteration (s): 0.15 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.844337E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.820 | TFLOPs: 26.08 | +7: iteration 22500/ 60336 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 0.15 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.835941E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.974 | TFLOPs: 26.08 | +7: iteration 22510/ 60336 | consumed samples: 5762560 | consumed tokens: 11801722880 | elapsed time per iteration (s): 0.15 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.848071E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.055 | TFLOPs: 26.11 | +7: iteration 22520/ 60336 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 0.15 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.849159E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.442 | TFLOPs: 26.10 | +7: iteration 22530/ 60336 | consumed samples: 5767680 | consumed tokens: 11812208640 | elapsed time per iteration (s): 0.15 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.846754E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.431 | TFLOPs: 26.10 | +7: iteration 22540/ 60336 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 0.15 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.827533E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.573 | TFLOPs: 26.10 | +7: iteration 22550/ 60336 | consumed samples: 5772800 | consumed tokens: 11822694400 | elapsed time per iteration (s): 0.15 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.843565E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.831 | TFLOPs: 26.09 | +7: iteration 22560/ 60336 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 0.15 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.834985E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.671 | TFLOPs: 26.11 | +7: iteration 22570/ 60336 | consumed samples: 5777920 | consumed tokens: 11833180160 | elapsed time per iteration (s): 0.15 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.854087E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.124 | TFLOPs: 26.13 | +7: iteration 22580/ 60336 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 0.15 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.850687E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.311 | TFLOPs: 26.10 | +7: iteration 22590/ 60336 | consumed samples: 5783040 | consumed tokens: 11843665920 | elapsed time per iteration (s): 0.15 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.850237E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.950 | TFLOPs: 26.11 | +7: iteration 22600/ 60336 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 0.15 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.838826E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.500 | TFLOPs: 26.10 | +7: iteration 22610/ 60336 | consumed samples: 5788160 | consumed tokens: 11854151680 | elapsed time per iteration (s): 0.15 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.852357E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.095 | TFLOPs: 26.11 | +7: iteration 22620/ 60336 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 0.15 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.841614E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.586 | TFLOPs: 26.10 | +7: iteration 22630/ 60336 | consumed samples: 5793280 | consumed tokens: 11864637440 | elapsed time per iteration (s): 0.15 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.842648E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.201 | TFLOPs: 26.08 | +7: iteration 22640/ 60336 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 0.15 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.836780E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 22650/ 60336 | consumed samples: 5798400 | consumed tokens: 11875123200 | elapsed time per iteration (s): 0.15 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.838669E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.007 | TFLOPs: 26.13 | +7: iteration 22660/ 60336 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 0.15 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.843191E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.474 | TFLOPs: 26.12 | +7: iteration 22670/ 60336 | consumed samples: 5803520 | consumed tokens: 11885608960 | elapsed time per iteration (s): 0.15 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.842657E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.438 | TFLOPs: 26.12 | +7: iteration 22680/ 60336 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 0.15 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.833071E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.803 | TFLOPs: 26.12 | +7: iteration 22690/ 60336 | consumed samples: 5808640 | consumed tokens: 11896094720 | elapsed time per iteration (s): 0.15 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.847871E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.161 | TFLOPs: 26.11 | +7: iteration 22700/ 60336 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 0.15 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.867170E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.466 | TFLOPs: 26.12 | +7: iteration 22710/ 60336 | consumed samples: 5813760 | consumed tokens: 11906580480 | elapsed time per iteration (s): 0.15 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.856884E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.832 | TFLOPs: 26.11 | +7: iteration 22720/ 60336 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 0.15 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.835173E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.730 | TFLOPs: 26.12 | +7: iteration 22730/ 60336 | consumed samples: 5818880 | consumed tokens: 11917066240 | elapsed time per iteration (s): 0.15 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.842466E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.154 | TFLOPs: 26.15 | +7: iteration 22740/ 60336 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 0.15 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.848924E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.058 | TFLOPs: 26.14 | +7: iteration 22750/ 60336 | consumed samples: 5824000 | consumed tokens: 11927552000 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.840491E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.287 | TFLOPs: 26.15 | +7: iteration 22760/ 60336 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.855642E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.195 | TFLOPs: 26.11 | +7: iteration 22770/ 60336 | consumed samples: 5829120 | consumed tokens: 11938037760 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.849017E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.747 | TFLOPs: 26.14 | +7: iteration 22780/ 60336 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 0.15 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.839684E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.142 | TFLOPs: 26.14 | +7: iteration 22790/ 60336 | consumed samples: 5834240 | consumed tokens: 11948523520 | elapsed time per iteration (s): 0.15 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.841921E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.004 | TFLOPs: 26.14 | +7: iteration 22800/ 60336 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.829164E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.585 | TFLOPs: 26.14 | +7: iteration 22810/ 60336 | consumed samples: 5839360 | consumed tokens: 11959009280 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.856848E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.859 | TFLOPs: 26.14 | +7: iteration 22820/ 60336 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.847749E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.288 | TFLOPs: 26.15 | +7: iteration 22830/ 60336 | consumed samples: 5844480 | consumed tokens: 11969495040 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.860400E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.490 | TFLOPs: 26.12 | +7: iteration 22840/ 60336 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.841965E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.071 | TFLOPs: 26.13 | +7: iteration 22850/ 60336 | consumed samples: 5849600 | consumed tokens: 11979980800 | elapsed time per iteration (s): 0.15 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.849641E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.036 | TFLOPs: 26.14 | +7: iteration 22860/ 60336 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 0.15 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.851389E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.097 | TFLOPs: 26.05 | +7: iteration 22870/ 60336 | consumed samples: 5854720 | consumed tokens: 11990466560 | elapsed time per iteration (s): 0.15 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.842850E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.480 | TFLOPs: 26.07 | +7: iteration 22880/ 60336 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 0.15 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.831006E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.452 | TFLOPs: 26.04 | +7: iteration 22890/ 60336 | consumed samples: 5859840 | consumed tokens: 12000952320 | elapsed time per iteration (s): 0.15 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.844965E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.239 | TFLOPs: 26.02 | +7: iteration 22900/ 60336 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 0.15 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.852571E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.020 | TFLOPs: 26.05 | +7: iteration 22910/ 60336 | consumed samples: 5864960 | consumed tokens: 12011438080 | elapsed time per iteration (s): 0.15 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.846730E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.440 | TFLOPs: 26.02 | +7: iteration 22920/ 60336 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 0.15 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.843858E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.265 | TFLOPs: 26.05 | +7: iteration 22930/ 60336 | consumed samples: 5870080 | consumed tokens: 12021923840 | elapsed time per iteration (s): 0.15 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.853689E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.242 | TFLOPs: 26.05 | +7: iteration 22940/ 60336 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 0.15 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.839183E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.443 | TFLOPs: 26.06 | +7: iteration 22950/ 60336 | consumed samples: 5875200 | consumed tokens: 12032409600 | elapsed time per iteration (s): 0.15 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.842299E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.476 | TFLOPs: 26.04 | +7: iteration 22960/ 60336 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 0.15 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.831565E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.840 | TFLOPs: 26.06 | +7: iteration 22970/ 60336 | consumed samples: 5880320 | consumed tokens: 12042895360 | elapsed time per iteration (s): 0.15 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.857349E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.269 | TFLOPs: 26.15 | +7: iteration 22980/ 60336 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 0.15 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.832137E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.680 | TFLOPs: 26.14 | +7: iteration 22990/ 60336 | consumed samples: 5885440 | consumed tokens: 12053381120 | elapsed time per iteration (s): 0.15 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.839799E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.694 | TFLOPs: 26.11 | +7: iteration 23000/ 60336 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 0.15 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.827488E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.242 | TFLOPs: 26.13 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 23000 | lm loss value: 3.944025E+00 | lm loss PPL: 5.162597E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 23000 to checkpoints_44m32b100m +0: [2023-03-17 01:18:34,579] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step23000 is begin to save! +0: [2023-03-17 01:18:34,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:18:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:18:34,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:18:34,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:18:34,653] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:18:34,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:18:34,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:18:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:18:34,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:18:34,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:18:34,678] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:18:34,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:18:34,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:18:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:18:34,694] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:18:34,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:18:34,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:18:34,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:18:34,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:18:34,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:18:34,711] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step23000/mp_rank_00_model_states.pt +0: [2023-03-17 01:18:34,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:18:34,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:18:34,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:18:34,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:18:34,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:18:34,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:18:34,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: successfully saved checkpoint at iteration 23000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 179.41 +7: iteration 23010/ 60336 | consumed samples: 5890560 | consumed tokens: 12063866880 | elapsed time per iteration (s): 0.18 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.854025E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.328 | TFLOPs: 22.78 | +7: iteration 23020/ 60336 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.827371E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.328 | TFLOPs: 26.13 | +7: iteration 23030/ 60336 | consumed samples: 5895680 | consumed tokens: 12074352640 | elapsed time per iteration (s): 0.15 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.832617E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.099 | TFLOPs: 26.13 | +7: iteration 23040/ 60336 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 0.15 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.842566E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.689 | TFLOPs: 26.01 | +7: iteration 23050/ 60336 | consumed samples: 5900800 | consumed tokens: 12084838400 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.840888E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.968 | TFLOPs: 25.89 | +7: iteration 23060/ 60336 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.833994E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.450 | TFLOPs: 25.88 | +7: iteration 23070/ 60336 | consumed samples: 5905920 | consumed tokens: 12095324160 | elapsed time per iteration (s): 0.15 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.836460E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.677 | TFLOPs: 25.93 | +7: iteration 23080/ 60336 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.842265E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.448 | TFLOPs: 25.54 | +7: iteration 23090/ 60336 | consumed samples: 5911040 | consumed tokens: 12105809920 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.843731E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.089 | TFLOPs: 25.61 | +7: iteration 23100/ 60336 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.839867E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.638 | TFLOPs: 25.71 | +7: iteration 23110/ 60336 | consumed samples: 5916160 | consumed tokens: 12116295680 | elapsed time per iteration (s): 0.15 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.852567E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.153 | TFLOPs: 26.04 | +7: iteration 23120/ 60336 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 0.15 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.847600E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.533 | TFLOPs: 26.03 | +7: iteration 23130/ 60336 | consumed samples: 5921280 | consumed tokens: 12126781440 | elapsed time per iteration (s): 0.15 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.824743E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.935 | TFLOPs: 26.06 | +7: iteration 23140/ 60336 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 0.15 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.824733E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.693 | TFLOPs: 26.14 | +7: iteration 23150/ 60336 | consumed samples: 5926400 | consumed tokens: 12137267200 | elapsed time per iteration (s): 0.15 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.843854E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.886 | TFLOPs: 26.14 | +7: iteration 23160/ 60336 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 0.15 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.843980E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.604 | TFLOPs: 26.14 | +7: iteration 23170/ 60336 | consumed samples: 5931520 | consumed tokens: 12147752960 | elapsed time per iteration (s): 0.15 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.847470E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.139 | TFLOPs: 26.11 | +7: iteration 23180/ 60336 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 0.15 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.857555E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.414 | TFLOPs: 26.15 | +7: iteration 23190/ 60336 | consumed samples: 5936640 | consumed tokens: 12158238720 | elapsed time per iteration (s): 0.15 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.840580E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.520 | TFLOPs: 26.15 | +7: iteration 23200/ 60336 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 0.15 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.842429E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.153 | TFLOPs: 26.15 | +7: iteration 23210/ 60336 | consumed samples: 5941760 | consumed tokens: 12168724480 | elapsed time per iteration (s): 0.15 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.846203E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.790 | TFLOPs: 26.14 | +7: iteration 23220/ 60336 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 0.15 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.837511E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.630 | TFLOPs: 26.15 | +7: iteration 23230/ 60336 | consumed samples: 5946880 | consumed tokens: 12179210240 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.828946E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.579 | TFLOPs: 25.84 | +7: iteration 23240/ 60336 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 0.15 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.853616E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.669 | TFLOPs: 26.17 | +7: iteration 23250/ 60336 | consumed samples: 5952000 | consumed tokens: 12189696000 | elapsed time per iteration (s): 0.15 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.829079E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.325 | TFLOPs: 26.16 | +7: iteration 23260/ 60336 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 0.15 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.839840E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.751 | TFLOPs: 26.14 | +7: iteration 23270/ 60336 | consumed samples: 5957120 | consumed tokens: 12200181760 | elapsed time per iteration (s): 0.15 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.831185E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.820 | TFLOPs: 26.11 | +7: iteration 23280/ 60336 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.830606E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.524 | TFLOPs: 26.14 | +7: iteration 23290/ 60336 | consumed samples: 5962240 | consumed tokens: 12210667520 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.846355E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.450 | TFLOPs: 26.12 | +7: iteration 23300/ 60336 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.844621E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.453 | TFLOPs: 26.09 | +7: iteration 23310/ 60336 | consumed samples: 5967360 | consumed tokens: 12221153280 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.836211E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.469 | TFLOPs: 26.12 | +7: iteration 23320/ 60336 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.826969E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.799 | TFLOPs: 26.14 | +7: iteration 23330/ 60336 | consumed samples: 5972480 | consumed tokens: 12231639040 | elapsed time per iteration (s): 0.15 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.844346E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.514 | TFLOPs: 26.15 | +7: iteration 23340/ 60336 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 0.15 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.845903E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.888 | TFLOPs: 26.13 | +7: iteration 23350/ 60336 | consumed samples: 5977600 | consumed tokens: 12242124800 | elapsed time per iteration (s): 0.15 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.844271E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.915 | TFLOPs: 26.11 | +7: iteration 23360/ 60336 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 0.15 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.851563E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.553 | TFLOPs: 26.12 | +7: iteration 23370/ 60336 | consumed samples: 5982720 | consumed tokens: 12252610560 | elapsed time per iteration (s): 0.15 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.834707E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.873 | TFLOPs: 26.14 | +7: iteration 23380/ 60336 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 0.15 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.825985E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.255 | TFLOPs: 26.13 | +7: iteration 23390/ 60336 | consumed samples: 5987840 | consumed tokens: 12263096320 | elapsed time per iteration (s): 0.15 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.847961E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.944 | TFLOPs: 26.16 | +7: iteration 23400/ 60336 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 0.15 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.845582E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.386 | TFLOPs: 26.15 | +7: iteration 23410/ 60336 | consumed samples: 5992960 | consumed tokens: 12273582080 | elapsed time per iteration (s): 0.15 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.845269E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.784 | TFLOPs: 26.14 | +7: iteration 23420/ 60336 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.841570E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.739 | TFLOPs: 26.14 | +7: iteration 23430/ 60336 | consumed samples: 5998080 | consumed tokens: 12284067840 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.844469E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.271 | TFLOPs: 26.12 | +7: iteration 23440/ 60336 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.838634E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.427 | TFLOPs: 26.13 | +7: iteration 23450/ 60336 | consumed samples: 6003200 | consumed tokens: 12294553600 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.852936E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.021 | TFLOPs: 26.14 | +7: iteration 23460/ 60336 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 0.15 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.836349E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.084 | TFLOPs: 26.16 | +7: iteration 23470/ 60336 | consumed samples: 6008320 | consumed tokens: 12305039360 | elapsed time per iteration (s): 0.15 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.829155E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.123 | TFLOPs: 26.14 | +7: iteration 23480/ 60336 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.847938E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.534 | TFLOPs: 26.15 | +7: iteration 23490/ 60336 | consumed samples: 6013440 | consumed tokens: 12315525120 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.847874E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.487 | TFLOPs: 26.13 | +7: iteration 23500/ 60336 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.849700E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.233 | TFLOPs: 26.15 | +7: iteration 23510/ 60336 | consumed samples: 6018560 | consumed tokens: 12326010880 | elapsed time per iteration (s): 0.15 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.840439E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.503 | TFLOPs: 26.13 | +7: iteration 23520/ 60336 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 0.15 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.858413E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.689 | TFLOPs: 26.15 | +7: iteration 23530/ 60336 | consumed samples: 6023680 | consumed tokens: 12336496640 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.845405E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.297 | TFLOPs: 26.15 | +7: iteration 23540/ 60336 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.840688E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.215 | TFLOPs: 26.15 | +7: iteration 23550/ 60336 | consumed samples: 6028800 | consumed tokens: 12346982400 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.824327E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.897 | TFLOPs: 26.16 | +7: iteration 23560/ 60336 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.846813E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.339 | TFLOPs: 26.15 | +7: iteration 23570/ 60336 | consumed samples: 6033920 | consumed tokens: 12357468160 | elapsed time per iteration (s): 0.15 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.843956E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.479 | TFLOPs: 26.12 | +7: iteration 23580/ 60336 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 0.15 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.852074E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 23590/ 60336 | consumed samples: 6039040 | consumed tokens: 12367953920 | elapsed time per iteration (s): 0.15 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.834664E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.503 | TFLOPs: 26.17 | +7: iteration 23600/ 60336 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 0.15 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.837764E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.635 | TFLOPs: 26.18 | +7: iteration 23610/ 60336 | consumed samples: 6044160 | consumed tokens: 12378439680 | elapsed time per iteration (s): 0.15 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.845963E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.609 | TFLOPs: 26.17 | +7: iteration 23620/ 60336 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 0.15 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.839259E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.781 | TFLOPs: 26.17 | +7: iteration 23630/ 60336 | consumed samples: 6049280 | consumed tokens: 12388925440 | elapsed time per iteration (s): 0.15 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.831030E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.285 | TFLOPs: 26.16 | +7: iteration 23640/ 60336 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.829736E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.320 | TFLOPs: 26.16 | +7: iteration 23650/ 60336 | consumed samples: 6054400 | consumed tokens: 12399411200 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.844843E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.670 | TFLOPs: 26.15 | +7: iteration 23660/ 60336 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.840590E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.563 | TFLOPs: 26.17 | +7: iteration 23670/ 60336 | consumed samples: 6059520 | consumed tokens: 12409896960 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.832779E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.857 | TFLOPs: 26.19 | +7: iteration 23680/ 60336 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.835659E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.894 | TFLOPs: 26.16 | +7: iteration 23690/ 60336 | consumed samples: 6064640 | consumed tokens: 12420382720 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.837022E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.886 | TFLOPs: 26.16 | +7: iteration 23700/ 60336 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.835282E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.904 | TFLOPs: 26.13 | +7: iteration 23710/ 60336 | consumed samples: 6069760 | consumed tokens: 12430868480 | elapsed time per iteration (s): 0.15 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.837835E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.003 | TFLOPs: 26.14 | +7: iteration 23720/ 60336 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 0.15 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.847905E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.531 | TFLOPs: 26.14 | +7: iteration 23730/ 60336 | consumed samples: 6074880 | consumed tokens: 12441354240 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.835902E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.454 | TFLOPs: 26.13 | +7: iteration 23740/ 60336 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.832690E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.483 | TFLOPs: 26.13 | +7: iteration 23750/ 60336 | consumed samples: 6080000 | consumed tokens: 12451840000 | elapsed time per iteration (s): 0.15 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.833220E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.998 | TFLOPs: 26.13 | +7: iteration 23760/ 60336 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 0.16 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.842546E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.477 | TFLOPs: 25.74 | +7: iteration 23770/ 60336 | consumed samples: 6085120 | consumed tokens: 12462325760 | elapsed time per iteration (s): 0.15 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.828964E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.362 | TFLOPs: 26.20 | +7: iteration 23780/ 60336 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 0.15 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.844513E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.866 | TFLOPs: 26.16 | +7: iteration 23790/ 60336 | consumed samples: 6090240 | consumed tokens: 12472811520 | elapsed time per iteration (s): 0.15 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.839226E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.096 | TFLOPs: 26.18 | +7: iteration 23800/ 60336 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.845503E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.508 | TFLOPs: 26.18 | +7: iteration 23810/ 60336 | consumed samples: 6095360 | consumed tokens: 12483297280 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.846723E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.598 | TFLOPs: 26.20 | +7: iteration 23820/ 60336 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.844689E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.918 | TFLOPs: 26.17 | +7: iteration 23830/ 60336 | consumed samples: 6100480 | consumed tokens: 12493783040 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.830190E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.386 | TFLOPs: 26.20 | +7: iteration 23840/ 60336 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.839414E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.077 | TFLOPs: 26.19 | +7: iteration 23850/ 60336 | consumed samples: 6105600 | consumed tokens: 12504268800 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.824773E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.604 | TFLOPs: 26.18 | +7: iteration 23860/ 60336 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.839672E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.240 | TFLOPs: 26.18 | +7: iteration 23870/ 60336 | consumed samples: 6110720 | consumed tokens: 12514754560 | elapsed time per iteration (s): 0.15 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.857130E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.203 | TFLOPs: 26.16 | +7: iteration 23880/ 60336 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 0.15 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.845135E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.990 | TFLOPs: 26.11 | +7: iteration 23890/ 60336 | consumed samples: 6115840 | consumed tokens: 12525240320 | elapsed time per iteration (s): 0.15 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.844083E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.873 | TFLOPs: 26.11 | +7: iteration 23900/ 60336 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 0.15 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.837206E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.194 | TFLOPs: 26.13 | +7: iteration 23910/ 60336 | consumed samples: 6120960 | consumed tokens: 12535726080 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.826435E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.045 | TFLOPs: 26.16 | +7: iteration 23920/ 60336 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.837452E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.310 | TFLOPs: 26.15 | +7: iteration 23930/ 60336 | consumed samples: 6126080 | consumed tokens: 12546211840 | elapsed time per iteration (s): 0.15 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.832119E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.664 | TFLOPs: 26.14 | +7: iteration 23940/ 60336 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 0.15 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.844709E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.553 | TFLOPs: 26.10 | +7: iteration 23950/ 60336 | consumed samples: 6131200 | consumed tokens: 12556697600 | elapsed time per iteration (s): 0.15 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.824373E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.365 | TFLOPs: 26.07 | +7: iteration 23960/ 60336 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.831509E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.597 | TFLOPs: 26.07 | +7: iteration 23970/ 60336 | consumed samples: 6136320 | consumed tokens: 12567183360 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.847798E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.701 | TFLOPs: 25.98 | +7: iteration 23980/ 60336 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 0.15 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.839942E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.432 | TFLOPs: 25.96 | +7: iteration 23990/ 60336 | consumed samples: 6141440 | consumed tokens: 12577669120 | elapsed time per iteration (s): 0.15 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.844746E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.176 | TFLOPs: 25.94 | +0: [2023-03-17 01:21:08,561] [INFO] [logging.py:68:log_dist] [Rank 0] step=24000, skipped=0, lr=[0.00014003669483463215, 0.00014003669483463215, 0.00014003669483463215], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 24000/ 60336 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 0.15 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.830950E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.018 | TFLOPs: 25.97 | +0: steps: 24000 loss: 3.8397 iter time (s): 0.152 samples/sec: 1681.472 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 24000 | lm loss value: 3.961286E+00 | lm loss PPL: 5.252483E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 24000 to checkpoints_44m32b100m +0: [2023-03-17 01:21:08,633] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is begin to save! +0: [2023-03-17 01:21:08,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:21:08,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:21:08,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:21:08,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:21:08,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:21:08,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:21:08,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:21:08,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:21:08,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:21:08,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:21:08,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:21:08,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:21:08,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:21:08,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:21:08,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:21:08,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:21:08,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:21:08,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:21:08,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:21:08,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:21:08,766] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step24000/mp_rank_00_model_states.pt +0: [2023-03-17 01:21:08,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:21:08,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:21:08,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:21:08,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:21:08,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:21:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:21:08,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:21:08,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:21:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:21:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:21:08,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:21:08,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: successfully saved checkpoint at iteration 24000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.06 +7: iteration 24010/ 60336 | consumed samples: 6146560 | consumed tokens: 12588154880 | elapsed time per iteration (s): 0.18 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.837959E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.799 | TFLOPs: 21.72 | +7: iteration 24020/ 60336 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 0.15 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.826074E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.138 | TFLOPs: 26.13 | +7: iteration 24030/ 60336 | consumed samples: 6151680 | consumed tokens: 12598640640 | elapsed time per iteration (s): 0.15 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.842189E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.922 | TFLOPs: 26.13 | +7: iteration 24040/ 60336 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 0.15 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.829543E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.503 | TFLOPs: 25.93 | +7: iteration 24050/ 60336 | consumed samples: 6156800 | consumed tokens: 12609126400 | elapsed time per iteration (s): 0.15 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.833486E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.279 | TFLOPs: 26.16 | +7: iteration 24060/ 60336 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 0.15 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.835739E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.108 | TFLOPs: 26.25 | +7: iteration 24070/ 60336 | consumed samples: 6161920 | consumed tokens: 12619612160 | elapsed time per iteration (s): 0.15 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.838928E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.740 | TFLOPs: 26.15 | +7: iteration 24080/ 60336 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 0.15 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.835691E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.746 | TFLOPs: 26.17 | +7: iteration 24090/ 60336 | consumed samples: 6167040 | consumed tokens: 12630097920 | elapsed time per iteration (s): 0.15 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.844454E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.454 | TFLOPs: 26.20 | +7: iteration 24100/ 60336 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 0.15 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.837349E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.178 | TFLOPs: 26.16 | +7: iteration 24110/ 60336 | consumed samples: 6172160 | consumed tokens: 12640583680 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.832642E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.428 | TFLOPs: 26.17 | +7: iteration 24120/ 60336 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.828484E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.979 | TFLOPs: 26.16 | +7: iteration 24130/ 60336 | consumed samples: 6177280 | consumed tokens: 12651069440 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.826941E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.357 | TFLOPs: 26.16 | +7: iteration 24140/ 60336 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 0.15 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.846096E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.660 | TFLOPs: 26.17 | +7: iteration 24150/ 60336 | consumed samples: 6182400 | consumed tokens: 12661555200 | elapsed time per iteration (s): 0.15 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.824520E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.928 | TFLOPs: 26.14 | +7: iteration 24160/ 60336 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 0.15 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.835312E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.619 | TFLOPs: 26.17 | +7: iteration 24170/ 60336 | consumed samples: 6187520 | consumed tokens: 12672040960 | elapsed time per iteration (s): 0.15 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.840588E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.948 | TFLOPs: 26.13 | +7: iteration 24180/ 60336 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 0.15 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.809338E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.457 | TFLOPs: 26.13 | +7: iteration 24190/ 60336 | consumed samples: 6192640 | consumed tokens: 12682526720 | elapsed time per iteration (s): 0.15 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.832509E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.299 | TFLOPs: 26.10 | +7: iteration 24200/ 60336 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 0.15 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.820879E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.637 | TFLOPs: 26.12 | +7: iteration 24210/ 60336 | consumed samples: 6197760 | consumed tokens: 12693012480 | elapsed time per iteration (s): 0.15 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.829116E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.311 | TFLOPs: 26.10 | +7: iteration 24220/ 60336 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 0.15 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.831610E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.737 | TFLOPs: 26.11 | +7: iteration 24230/ 60336 | consumed samples: 6202880 | consumed tokens: 12703498240 | elapsed time per iteration (s): 0.15 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.835902E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.692 | TFLOPs: 26.09 | +7: iteration 24240/ 60336 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 0.15 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.831296E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.568 | TFLOPs: 26.03 | +7: iteration 24250/ 60336 | consumed samples: 6208000 | consumed tokens: 12713984000 | elapsed time per iteration (s): 0.15 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.830849E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.178 | TFLOPs: 26.04 | +7: iteration 24260/ 60336 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 0.15 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.840109E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.118 | TFLOPs: 26.03 | +7: iteration 24270/ 60336 | consumed samples: 6213120 | consumed tokens: 12724469760 | elapsed time per iteration (s): 0.15 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.842870E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.896 | TFLOPs: 26.05 | +7: iteration 24280/ 60336 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 0.15 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.845811E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.647 | TFLOPs: 26.04 | +7: iteration 24290/ 60336 | consumed samples: 6218240 | consumed tokens: 12734955520 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.834512E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.783 | TFLOPs: 26.06 | +7: iteration 24300/ 60336 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.828160E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.954 | TFLOPs: 26.08 | +7: iteration 24310/ 60336 | consumed samples: 6223360 | consumed tokens: 12745441280 | elapsed time per iteration (s): 0.17 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.831613E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.096 | TFLOPs: 23.21 | +7: iteration 24320/ 60336 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 0.15 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.838671E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.180 | TFLOPs: 25.91 | +7: iteration 24330/ 60336 | consumed samples: 6228480 | consumed tokens: 12755927040 | elapsed time per iteration (s): 0.16 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.825063E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.239 | TFLOPs: 24.69 | +7: iteration 24340/ 60336 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 0.15 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.831813E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.177 | TFLOPs: 25.96 | +7: iteration 24350/ 60336 | consumed samples: 6233600 | consumed tokens: 12766412800 | elapsed time per iteration (s): 0.15 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.825460E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.091 | TFLOPs: 26.24 | +7: iteration 24360/ 60336 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 0.19 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.839336E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.711 | TFLOPs: 21.37 | +7: iteration 24370/ 60336 | consumed samples: 6238720 | consumed tokens: 12776898560 | elapsed time per iteration (s): 0.16 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.843432E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.710 | TFLOPs: 24.82 | +7: iteration 24380/ 60336 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.842832E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.795 | TFLOPs: 26.09 | +7: iteration 24390/ 60336 | consumed samples: 6243840 | consumed tokens: 12787384320 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.825687E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.086 | TFLOPs: 26.24 | +7: iteration 24400/ 60336 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 0.15 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.819909E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.318 | TFLOPs: 26.23 | +7: iteration 24410/ 60336 | consumed samples: 6248960 | consumed tokens: 12797870080 | elapsed time per iteration (s): 0.15 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.827230E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.143 | TFLOPs: 26.22 | +7: iteration 24420/ 60336 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 0.15 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.826879E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.370 | TFLOPs: 26.10 | +7: iteration 24430/ 60336 | consumed samples: 6254080 | consumed tokens: 12808355840 | elapsed time per iteration (s): 0.15 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.837088E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.488 | TFLOPs: 25.93 | +7: iteration 24440/ 60336 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 0.15 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.835970E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.914 | TFLOPs: 26.16 | +7: iteration 24450/ 60336 | consumed samples: 6259200 | consumed tokens: 12818841600 | elapsed time per iteration (s): 0.15 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.843118E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.664 | TFLOPs: 26.23 | +7: iteration 24460/ 60336 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 0.15 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.833535E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.130 | TFLOPs: 26.14 | +7: iteration 24470/ 60336 | consumed samples: 6264320 | consumed tokens: 12829327360 | elapsed time per iteration (s): 0.15 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.845010E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.858 | TFLOPs: 26.11 | +7: iteration 24480/ 60336 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 0.16 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.824702E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.028 | TFLOPs: 25.14 | +7: iteration 24490/ 60336 | consumed samples: 6269440 | consumed tokens: 12839813120 | elapsed time per iteration (s): 0.16 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.833707E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.093 | TFLOPs: 25.47 | +7: iteration 24500/ 60336 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 0.15 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.840644E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.962 | TFLOPs: 26.06 | +7: iteration 24510/ 60336 | consumed samples: 6274560 | consumed tokens: 12850298880 | elapsed time per iteration (s): 0.15 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.830051E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.416 | TFLOPs: 26.21 | +7: iteration 24520/ 60336 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 0.15 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.829657E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.979 | TFLOPs: 26.25 | +7: iteration 24530/ 60336 | consumed samples: 6279680 | consumed tokens: 12860784640 | elapsed time per iteration (s): 0.15 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.837714E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.209 | TFLOPs: 26.15 | +7: iteration 24540/ 60336 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 0.15 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.849585E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.124 | TFLOPs: 26.22 | +7: iteration 24550/ 60336 | consumed samples: 6284800 | consumed tokens: 12871270400 | elapsed time per iteration (s): 0.15 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.845496E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.309 | TFLOPs: 26.10 | +7: iteration 24560/ 60336 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 0.15 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.827439E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.336 | TFLOPs: 26.16 | +7: iteration 24570/ 60336 | consumed samples: 6289920 | consumed tokens: 12881756160 | elapsed time per iteration (s): 0.15 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.838229E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.117 | TFLOPs: 26.02 | +7: iteration 24580/ 60336 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 0.15 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.840977E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.057 | TFLOPs: 26.17 | +7: iteration 24590/ 60336 | consumed samples: 6295040 | consumed tokens: 12892241920 | elapsed time per iteration (s): 0.15 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.823507E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.171 | TFLOPs: 26.08 | +7: iteration 24600/ 60336 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 0.15 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.822776E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.652 | TFLOPs: 26.00 | +7: iteration 24610/ 60336 | consumed samples: 6300160 | consumed tokens: 12902727680 | elapsed time per iteration (s): 0.18 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.841222E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.604 | TFLOPs: 22.76 | +7: iteration 24620/ 60336 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.839902E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.083 | TFLOPs: 25.34 | +7: iteration 24630/ 60336 | consumed samples: 6305280 | consumed tokens: 12913213440 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.847346E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.012 | TFLOPs: 25.99 | +7: iteration 24640/ 60336 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.836720E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.207 | TFLOPs: 25.94 | +7: iteration 24650/ 60336 | consumed samples: 6310400 | consumed tokens: 12923699200 | elapsed time per iteration (s): 0.16 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.831485E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.177 | TFLOPs: 25.06 | +7: iteration 24660/ 60336 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.825292E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.497 | TFLOPs: 26.21 | +7: iteration 24670/ 60336 | consumed samples: 6315520 | consumed tokens: 12934184960 | elapsed time per iteration (s): 0.16 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.830410E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.759 | TFLOPs: 25.87 | +7: iteration 24680/ 60336 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 0.16 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.830465E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.566 | TFLOPs: 25.59 | +7: iteration 24690/ 60336 | consumed samples: 6320640 | consumed tokens: 12944670720 | elapsed time per iteration (s): 0.15 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.831905E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.700 | TFLOPs: 26.20 | +7: iteration 24700/ 60336 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 0.16 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.850809E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.290 | TFLOPs: 25.47 | +7: iteration 24710/ 60336 | consumed samples: 6325760 | consumed tokens: 12955156480 | elapsed time per iteration (s): 0.18 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.830539E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.896 | TFLOPs: 22.60 | +7: iteration 24720/ 60336 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.829577E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.358 | TFLOPs: 25.74 | +7: iteration 24730/ 60336 | consumed samples: 6330880 | consumed tokens: 12965642240 | elapsed time per iteration (s): 0.15 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.824560E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.882 | TFLOPs: 26.28 | +7: iteration 24740/ 60336 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 0.19 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.834290E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.406 | TFLOPs: 21.41 | +7: iteration 24750/ 60336 | consumed samples: 6336000 | consumed tokens: 12976128000 | elapsed time per iteration (s): 0.15 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.831136E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.150 | TFLOPs: 26.15 | +7: iteration 24760/ 60336 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.830839E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.241 | TFLOPs: 26.01 | +7: iteration 24770/ 60336 | consumed samples: 6341120 | consumed tokens: 12986613760 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.826966E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.688 | TFLOPs: 26.29 | +7: iteration 24780/ 60336 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 0.15 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.825743E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.793 | TFLOPs: 26.31 | +7: iteration 24790/ 60336 | consumed samples: 6346240 | consumed tokens: 12997099520 | elapsed time per iteration (s): 0.15 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.841077E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.000 | TFLOPs: 26.30 | +7: iteration 24800/ 60336 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.827625E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.945 | TFLOPs: 26.30 | +7: iteration 24810/ 60336 | consumed samples: 6351360 | consumed tokens: 13007585280 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.835121E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.690 | TFLOPs: 26.29 | +7: iteration 24820/ 60336 | consumed samples: 6353920 | consumed tokens: 13012828160 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.837421E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.722 | TFLOPs: 26.28 | +7: iteration 24830/ 60336 | consumed samples: 6356480 | consumed tokens: 13018071040 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.829415E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.130 | TFLOPs: 25.75 | +7: iteration 24840/ 60336 | consumed samples: 6359040 | consumed tokens: 13023313920 | elapsed time per iteration (s): 0.15 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.838506E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.783 | TFLOPs: 26.30 | +7: iteration 24850/ 60336 | consumed samples: 6361600 | consumed tokens: 13028556800 | elapsed time per iteration (s): 0.19 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.838073E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1318.838 | TFLOPs: 20.68 | +7: iteration 24860/ 60336 | consumed samples: 6364160 | consumed tokens: 13033799680 | elapsed time per iteration (s): 0.17 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.836612E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.652 | TFLOPs: 23.74 | +7: iteration 24870/ 60336 | consumed samples: 6366720 | consumed tokens: 13039042560 | elapsed time per iteration (s): 0.24 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.825884E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1078.692 | TFLOPs: 16.92 | +7: iteration 24880/ 60336 | consumed samples: 6369280 | consumed tokens: 13044285440 | elapsed time per iteration (s): 0.16 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.826421E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.361 | TFLOPs: 24.39 | +7: iteration 24890/ 60336 | consumed samples: 6371840 | consumed tokens: 13049528320 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.829675E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.508 | TFLOPs: 24.75 | +7: iteration 24900/ 60336 | consumed samples: 6374400 | consumed tokens: 13054771200 | elapsed time per iteration (s): 0.15 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.828954E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.300 | TFLOPs: 26.27 | +7: iteration 24910/ 60336 | consumed samples: 6376960 | consumed tokens: 13060014080 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.826226E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.457 | TFLOPs: 26.28 | +7: iteration 24920/ 60336 | consumed samples: 6379520 | consumed tokens: 13065256960 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.838459E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.011 | TFLOPs: 26.28 | +7: iteration 24930/ 60336 | consumed samples: 6382080 | consumed tokens: 13070499840 | elapsed time per iteration (s): 0.18 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.832622E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1461.841 | TFLOPs: 22.93 | +7: iteration 24940/ 60336 | consumed samples: 6384640 | consumed tokens: 13075742720 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.827468E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.896 | TFLOPs: 25.29 | +7: iteration 24950/ 60336 | consumed samples: 6387200 | consumed tokens: 13080985600 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.833788E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.383 | TFLOPs: 25.47 | +7: iteration 24960/ 60336 | consumed samples: 6389760 | consumed tokens: 13086228480 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.826666E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.576 | TFLOPs: 25.60 | +7: iteration 24970/ 60336 | consumed samples: 6392320 | consumed tokens: 13091471360 | elapsed time per iteration (s): 0.17 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.823455E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.895 | TFLOPs: 23.26 | +7: iteration 24980/ 60336 | consumed samples: 6394880 | consumed tokens: 13096714240 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.809222E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.790 | TFLOPs: 25.20 | +7: iteration 24990/ 60336 | consumed samples: 6397440 | consumed tokens: 13101957120 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.825567E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.623 | TFLOPs: 25.49 | +7: iteration 25000/ 60336 | consumed samples: 6400000 | consumed tokens: 13107200000 | elapsed time per iteration (s): 0.20 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.831354E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1281.636 | TFLOPs: 20.10 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 25000 | lm loss value: 3.908776E+00 | lm loss PPL: 4.983789E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 25000 to checkpoints_44m32b100m +0: [2023-03-17 01:23:47,055] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step25000 is begin to save! +0: [2023-03-17 01:23:47,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:23:47,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:23:47,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:23:47,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:23:47,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:23:47,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:23:47,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:23:47,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:23:47,176] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:23:47,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:23:47,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:23:47,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:23:47,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:23:47,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:23:47,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:23:47,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:23:47,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:23:47,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:23:47,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:23:47,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:23:47,219] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step25000/mp_rank_00_model_states.pt +0: [2023-03-17 01:23:47,219] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:23:47,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:47,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:47,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:23:47,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:47,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:47,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:47,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:47,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:23:47,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: successfully saved checkpoint at iteration 25000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 285.65 +7: iteration 25010/ 60336 | consumed samples: 6402560 | consumed tokens: 13112442880 | elapsed time per iteration (s): 0.20 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.834935E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1311.817 | TFLOPs: 20.57 | +7: iteration 25020/ 60336 | consumed samples: 6405120 | consumed tokens: 13117685760 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.825945E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.149 | TFLOPs: 25.47 | +7: iteration 25030/ 60336 | consumed samples: 6407680 | consumed tokens: 13122928640 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.823129E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.184 | TFLOPs: 25.69 | +7: iteration 25040/ 60336 | consumed samples: 6410240 | consumed tokens: 13128171520 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.832851E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.747 | TFLOPs: 25.62 | +7: iteration 25050/ 60336 | consumed samples: 6412800 | consumed tokens: 13133414400 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.831411E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.206 | TFLOPs: 25.46 | +7: iteration 25060/ 60336 | consumed samples: 6415360 | consumed tokens: 13138657280 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.845730E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.520 | TFLOPs: 24.77 | +7: iteration 25070/ 60336 | consumed samples: 6417920 | consumed tokens: 13143900160 | elapsed time per iteration (s): 0.16 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.820822E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.249 | TFLOPs: 25.57 | +7: iteration 25080/ 60336 | consumed samples: 6420480 | consumed tokens: 13149143040 | elapsed time per iteration (s): 0.17 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.837530E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.895 | TFLOPs: 24.01 | +7: iteration 25090/ 60336 | consumed samples: 6423040 | consumed tokens: 13154385920 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.829766E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.982 | TFLOPs: 24.46 | +7: iteration 25100/ 60336 | consumed samples: 6425600 | consumed tokens: 13159628800 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.843533E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.909 | TFLOPs: 25.15 | +7: iteration 25110/ 60336 | consumed samples: 6428160 | consumed tokens: 13164871680 | elapsed time per iteration (s): 0.16 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.831998E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.068 | TFLOPs: 24.43 | +7: iteration 25120/ 60336 | consumed samples: 6430720 | consumed tokens: 13170114560 | elapsed time per iteration (s): 0.19 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.839696E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.853 | TFLOPs: 21.48 | +7: iteration 25130/ 60336 | consumed samples: 6433280 | consumed tokens: 13175357440 | elapsed time per iteration (s): 0.18 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.843705E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.891 | TFLOPs: 22.85 | +7: iteration 25140/ 60336 | consumed samples: 6435840 | consumed tokens: 13180600320 | elapsed time per iteration (s): 0.16 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.824438E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.513 | TFLOPs: 25.02 | +7: iteration 25150/ 60336 | consumed samples: 6438400 | consumed tokens: 13185843200 | elapsed time per iteration (s): 0.16 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.825034E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.060 | TFLOPs: 25.19 | +7: iteration 25160/ 60336 | consumed samples: 6440960 | consumed tokens: 13191086080 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.811370E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.062 | TFLOPs: 25.28 | +7: iteration 25170/ 60336 | consumed samples: 6443520 | consumed tokens: 13196328960 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.827874E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.708 | TFLOPs: 25.56 | +7: iteration 25180/ 60336 | consumed samples: 6446080 | consumed tokens: 13201571840 | elapsed time per iteration (s): 0.16 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.831174E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.179 | TFLOPs: 25.39 | +7: iteration 25190/ 60336 | consumed samples: 6448640 | consumed tokens: 13206814720 | elapsed time per iteration (s): 0.17 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.841622E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.415 | TFLOPs: 23.34 | +7: iteration 25200/ 60336 | consumed samples: 6451200 | consumed tokens: 13212057600 | elapsed time per iteration (s): 0.16 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.810004E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.622 | TFLOPs: 25.12 | +7: iteration 25210/ 60336 | consumed samples: 6453760 | consumed tokens: 13217300480 | elapsed time per iteration (s): 0.16 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.826756E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.445 | TFLOPs: 25.41 | +7: iteration 25220/ 60336 | consumed samples: 6456320 | consumed tokens: 13222543360 | elapsed time per iteration (s): 0.16 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.813572E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.325 | TFLOPs: 24.75 | +7: iteration 25230/ 60336 | consumed samples: 6458880 | consumed tokens: 13227786240 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.824380E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.276 | TFLOPs: 26.12 | +7: iteration 25240/ 60336 | consumed samples: 6461440 | consumed tokens: 13233029120 | elapsed time per iteration (s): 0.16 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.830219E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.540 | TFLOPs: 25.10 | +7: iteration 25250/ 60336 | consumed samples: 6464000 | consumed tokens: 13238272000 | elapsed time per iteration (s): 0.19 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.843214E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.976 | TFLOPs: 21.37 | +7: iteration 25260/ 60336 | consumed samples: 6466560 | consumed tokens: 13243514880 | elapsed time per iteration (s): 0.16 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.831980E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.731 | TFLOPs: 24.63 | +7: iteration 25270/ 60336 | consumed samples: 6469120 | consumed tokens: 13248757760 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.820721E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.438 | TFLOPs: 25.62 | +7: iteration 25280/ 60336 | consumed samples: 6471680 | consumed tokens: 13254000640 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.834072E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.013 | TFLOPs: 24.72 | +7: iteration 25290/ 60336 | consumed samples: 6474240 | consumed tokens: 13259243520 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.833937E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.341 | TFLOPs: 24.49 | +7: iteration 25300/ 60336 | consumed samples: 6476800 | consumed tokens: 13264486400 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.819680E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.239 | TFLOPs: 25.71 | +7: iteration 25310/ 60336 | consumed samples: 6479360 | consumed tokens: 13269729280 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.831366E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.656 | TFLOPs: 25.42 | +7: iteration 25320/ 60336 | consumed samples: 6481920 | consumed tokens: 13274972160 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.826210E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.663 | TFLOPs: 25.82 | +7: iteration 25330/ 60336 | consumed samples: 6484480 | consumed tokens: 13280215040 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.830787E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.492 | TFLOPs: 24.79 | +7: iteration 25340/ 60336 | consumed samples: 6487040 | consumed tokens: 13285457920 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.817551E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.762 | TFLOPs: 25.28 | +7: iteration 25350/ 60336 | consumed samples: 6489600 | consumed tokens: 13290700800 | elapsed time per iteration (s): 0.15 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.820868E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.930 | TFLOPs: 26.09 | +7: iteration 25360/ 60336 | consumed samples: 6492160 | consumed tokens: 13295943680 | elapsed time per iteration (s): 0.16 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.829246E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.674 | TFLOPs: 25.49 | +7: iteration 25370/ 60336 | consumed samples: 6494720 | consumed tokens: 13301186560 | elapsed time per iteration (s): 0.15 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.835797E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.560 | TFLOPs: 25.93 | +7: iteration 25380/ 60336 | consumed samples: 6497280 | consumed tokens: 13306429440 | elapsed time per iteration (s): 0.17 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.832076E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.334 | TFLOPs: 23.25 | +7: iteration 25390/ 60336 | consumed samples: 6499840 | consumed tokens: 13311672320 | elapsed time per iteration (s): 0.16 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.835444E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.695 | TFLOPs: 25.56 | +7: iteration 25400/ 60336 | consumed samples: 6502400 | consumed tokens: 13316915200 | elapsed time per iteration (s): 0.15 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.829743E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.530 | TFLOPs: 26.14 | +7: iteration 25410/ 60336 | consumed samples: 6504960 | consumed tokens: 13322158080 | elapsed time per iteration (s): 0.16 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.843585E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.408 | TFLOPs: 25.52 | +7: iteration 25420/ 60336 | consumed samples: 6507520 | consumed tokens: 13327400960 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.826730E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.629 | TFLOPs: 25.21 | +7: iteration 25430/ 60336 | consumed samples: 6510080 | consumed tokens: 13332643840 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.824432E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.902 | TFLOPs: 25.84 | +7: iteration 25440/ 60336 | consumed samples: 6512640 | consumed tokens: 13337886720 | elapsed time per iteration (s): 0.19 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.839072E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1359.926 | TFLOPs: 21.33 | +7: iteration 25450/ 60336 | consumed samples: 6515200 | consumed tokens: 13343129600 | elapsed time per iteration (s): 0.16 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.826786E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.831 | TFLOPs: 25.83 | +7: iteration 25460/ 60336 | consumed samples: 6517760 | consumed tokens: 13348372480 | elapsed time per iteration (s): 0.16 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.842563E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.149 | TFLOPs: 25.55 | +7: iteration 25470/ 60336 | consumed samples: 6520320 | consumed tokens: 13353615360 | elapsed time per iteration (s): 0.15 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.825864E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.219 | TFLOPs: 26.19 | +7: iteration 25480/ 60336 | consumed samples: 6522880 | consumed tokens: 13358858240 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.832150E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.527 | TFLOPs: 25.24 | +7: iteration 25490/ 60336 | consumed samples: 6525440 | consumed tokens: 13364101120 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.818357E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.643 | TFLOPs: 24.93 | +7: iteration 25500/ 60336 | consumed samples: 6528000 | consumed tokens: 13369344000 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.826324E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.211 | TFLOPs: 25.36 | +7: iteration 25510/ 60336 | consumed samples: 6530560 | consumed tokens: 13374586880 | elapsed time per iteration (s): 0.19 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.819454E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.239 | TFLOPs: 21.33 | +7: iteration 25520/ 60336 | consumed samples: 6533120 | consumed tokens: 13379829760 | elapsed time per iteration (s): 0.15 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.835195E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.886 | TFLOPs: 26.00 | +7: iteration 25530/ 60336 | consumed samples: 6535680 | consumed tokens: 13385072640 | elapsed time per iteration (s): 0.16 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.816275E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.530 | TFLOPs: 25.59 | +7: iteration 25540/ 60336 | consumed samples: 6538240 | consumed tokens: 13390315520 | elapsed time per iteration (s): 0.16 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.821973E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.160 | TFLOPs: 25.49 | +7: iteration 25550/ 60336 | consumed samples: 6540800 | consumed tokens: 13395558400 | elapsed time per iteration (s): 0.16 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.809463E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.348 | TFLOPs: 25.38 | +7: iteration 25560/ 60336 | consumed samples: 6543360 | consumed tokens: 13400801280 | elapsed time per iteration (s): 0.15 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.829173E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.806 | TFLOPs: 26.25 | +7: iteration 25570/ 60336 | consumed samples: 6545920 | consumed tokens: 13406044160 | elapsed time per iteration (s): 0.15 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.833873E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.392 | TFLOPs: 25.96 | +7: iteration 25580/ 60336 | consumed samples: 6548480 | consumed tokens: 13411287040 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.829031E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.489 | TFLOPs: 25.54 | +7: iteration 25590/ 60336 | consumed samples: 6551040 | consumed tokens: 13416529920 | elapsed time per iteration (s): 0.15 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.837973E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.125 | TFLOPs: 25.93 | +7: iteration 25600/ 60336 | consumed samples: 6553600 | consumed tokens: 13421772800 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.826709E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.903 | TFLOPs: 25.42 | +7: iteration 25610/ 60336 | consumed samples: 6556160 | consumed tokens: 13427015680 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.822834E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.168 | TFLOPs: 25.89 | +7: iteration 25620/ 60336 | consumed samples: 6558720 | consumed tokens: 13432258560 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.831524E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.115 | TFLOPs: 25.66 | +7: iteration 25630/ 60336 | consumed samples: 6561280 | consumed tokens: 13437501440 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.816509E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.258 | TFLOPs: 25.00 | +7: iteration 25640/ 60336 | consumed samples: 6563840 | consumed tokens: 13442744320 | elapsed time per iteration (s): 0.17 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.824228E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1481.453 | TFLOPs: 23.23 | +7: iteration 25650/ 60336 | consumed samples: 6566400 | consumed tokens: 13447987200 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.852137E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.217 | TFLOPs: 24.75 | +7: iteration 25660/ 60336 | consumed samples: 6568960 | consumed tokens: 13453230080 | elapsed time per iteration (s): 0.16 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.828838E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.150 | TFLOPs: 25.80 | +7: iteration 25670/ 60336 | consumed samples: 6571520 | consumed tokens: 13458472960 | elapsed time per iteration (s): 0.15 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.818848E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.337 | TFLOPs: 25.94 | +7: iteration 25680/ 60336 | consumed samples: 6574080 | consumed tokens: 13463715840 | elapsed time per iteration (s): 0.16 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.830732E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.563 | TFLOPs: 25.15 | +7: iteration 25690/ 60336 | consumed samples: 6576640 | consumed tokens: 13468958720 | elapsed time per iteration (s): 0.15 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.828074E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.364 | TFLOPs: 25.96 | +7: iteration 25700/ 60336 | consumed samples: 6579200 | consumed tokens: 13474201600 | elapsed time per iteration (s): 0.17 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.824140E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1493.858 | TFLOPs: 23.43 | +7: iteration 25710/ 60336 | consumed samples: 6581760 | consumed tokens: 13479444480 | elapsed time per iteration (s): 0.16 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.825110E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.968 | TFLOPs: 25.69 | +7: iteration 25720/ 60336 | consumed samples: 6584320 | consumed tokens: 13484687360 | elapsed time per iteration (s): 0.16 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.817259E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.893 | TFLOPs: 25.84 | +7: iteration 25730/ 60336 | consumed samples: 6586880 | consumed tokens: 13489930240 | elapsed time per iteration (s): 0.16 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.839349E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.883 | TFLOPs: 25.03 | +7: iteration 25740/ 60336 | consumed samples: 6589440 | consumed tokens: 13495173120 | elapsed time per iteration (s): 0.16 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.830914E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.857 | TFLOPs: 25.69 | +7: iteration 25750/ 60336 | consumed samples: 6592000 | consumed tokens: 13500416000 | elapsed time per iteration (s): 0.16 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.830048E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.019 | TFLOPs: 25.75 | +7: iteration 25760/ 60336 | consumed samples: 6594560 | consumed tokens: 13505658880 | elapsed time per iteration (s): 0.17 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.828824E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.721 | TFLOPs: 23.25 | +7: iteration 25770/ 60336 | consumed samples: 6597120 | consumed tokens: 13510901760 | elapsed time per iteration (s): 0.16 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.825372E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.383 | TFLOPs: 25.79 | +7: iteration 25780/ 60336 | consumed samples: 6599680 | consumed tokens: 13516144640 | elapsed time per iteration (s): 0.16 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.824236E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.514 | TFLOPs: 25.62 | +7: iteration 25790/ 60336 | consumed samples: 6602240 | consumed tokens: 13521387520 | elapsed time per iteration (s): 0.16 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.827922E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.593 | TFLOPs: 25.74 | +7: iteration 25800/ 60336 | consumed samples: 6604800 | consumed tokens: 13526630400 | elapsed time per iteration (s): 0.16 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.824092E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.341 | TFLOPs: 25.52 | +7: iteration 25810/ 60336 | consumed samples: 6607360 | consumed tokens: 13531873280 | elapsed time per iteration (s): 0.15 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.835134E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.975 | TFLOPs: 26.17 | +7: iteration 25820/ 60336 | consumed samples: 6609920 | consumed tokens: 13537116160 | elapsed time per iteration (s): 0.16 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.824480E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.005 | TFLOPs: 25.81 | +7: iteration 25830/ 60336 | consumed samples: 6612480 | consumed tokens: 13542359040 | elapsed time per iteration (s): 0.16 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.842392E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.611 | TFLOPs: 25.76 | +7: iteration 25840/ 60336 | consumed samples: 6615040 | consumed tokens: 13547601920 | elapsed time per iteration (s): 0.16 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.826515E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.268 | TFLOPs: 25.88 | +7: iteration 25850/ 60336 | consumed samples: 6617600 | consumed tokens: 13552844800 | elapsed time per iteration (s): 0.15 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.829991E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.191 | TFLOPs: 26.16 | +7: iteration 25860/ 60336 | consumed samples: 6620160 | consumed tokens: 13558087680 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.815041E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.887 | TFLOPs: 25.89 | +7: iteration 25870/ 60336 | consumed samples: 6622720 | consumed tokens: 13563330560 | elapsed time per iteration (s): 0.15 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.830531E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.866 | TFLOPs: 25.97 | +7: iteration 25880/ 60336 | consumed samples: 6625280 | consumed tokens: 13568573440 | elapsed time per iteration (s): 0.15 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.825447E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.564 | TFLOPs: 25.98 | +7: iteration 25890/ 60336 | consumed samples: 6627840 | consumed tokens: 13573816320 | elapsed time per iteration (s): 0.15 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.842445E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.965 | TFLOPs: 26.27 | +7: iteration 25900/ 60336 | consumed samples: 6630400 | consumed tokens: 13579059200 | elapsed time per iteration (s): 0.15 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.815714E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.241 | TFLOPs: 25.93 | +7: iteration 25910/ 60336 | consumed samples: 6632960 | consumed tokens: 13584302080 | elapsed time per iteration (s): 0.15 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.817789E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.419 | TFLOPs: 26.27 | +7: iteration 25920/ 60336 | consumed samples: 6635520 | consumed tokens: 13589544960 | elapsed time per iteration (s): 0.16 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.821961E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.463 | TFLOPs: 24.90 | +7: iteration 25930/ 60336 | consumed samples: 6638080 | consumed tokens: 13594787840 | elapsed time per iteration (s): 0.15 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.822639E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.769 | TFLOPs: 26.22 | +7: iteration 25940/ 60336 | consumed samples: 6640640 | consumed tokens: 13600030720 | elapsed time per iteration (s): 0.15 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.843089E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.097 | TFLOPs: 26.29 | +7: iteration 25950/ 60336 | consumed samples: 6643200 | consumed tokens: 13605273600 | elapsed time per iteration (s): 0.16 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.821789E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.932 | TFLOPs: 25.75 | +7: iteration 25960/ 60336 | consumed samples: 6645760 | consumed tokens: 13610516480 | elapsed time per iteration (s): 0.15 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.822301E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.773 | TFLOPs: 26.26 | +7: iteration 25970/ 60336 | consumed samples: 6648320 | consumed tokens: 13615759360 | elapsed time per iteration (s): 0.15 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.809924E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.791 | TFLOPs: 26.26 | +7: iteration 25980/ 60336 | consumed samples: 6650880 | consumed tokens: 13621002240 | elapsed time per iteration (s): 0.15 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.814024E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.628 | TFLOPs: 26.25 | +7: iteration 25990/ 60336 | consumed samples: 6653440 | consumed tokens: 13626245120 | elapsed time per iteration (s): 0.15 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.840594E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.168 | TFLOPs: 26.18 | +0: [2023-03-17 01:26:26,630] [INFO] [logging.py:68:log_dist] [Rank 0] step=26000, skipped=0, lr=[0.00013096296282050622, 0.00013096296282050622, 0.00013096296282050622], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 26000/ 60336 | consumed samples: 6656000 | consumed tokens: 13631488000 | elapsed time per iteration (s): 0.15 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.821953E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.605 | TFLOPs: 26.23 | +0: steps: 26000 loss: 3.8202 iter time (s): 0.157 samples/sec: 1630.649 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 26000 | lm loss value: 3.920000E+00 | lm loss PPL: 5.040042E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 26000 to checkpoints_44m32b100m +0: [2023-03-17 01:26:26,703] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step26000 is begin to save! +0: [2023-03-17 01:26:26,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:26:26,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:26:26,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:26:26,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:26:26,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:26:26,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:26:26,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:26:26,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:26:26,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:26:26,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:26:26,804] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:26:26,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:26:26,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:26:26,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:26:26,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:26:26,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:26:26,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:26:26,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:26:26,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:26:26,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:26:26,838] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step26000/mp_rank_00_model_states.pt +0: [2023-03-17 01:26:26,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:26:26,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:26:26,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:26:26,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 01:26:26,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:26:26,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:26:26,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:26:26,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:26:26,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:26:26,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:26:26,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: successfully saved checkpoint at iteration 26000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 184.31 +7: iteration 26010/ 60336 | consumed samples: 6658560 | consumed tokens: 13636730880 | elapsed time per iteration (s): 0.18 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.829822E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.189 | TFLOPs: 22.52 | +7: iteration 26020/ 60336 | consumed samples: 6661120 | consumed tokens: 13641973760 | elapsed time per iteration (s): 0.15 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.839364E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.635 | TFLOPs: 26.20 | +7: iteration 26030/ 60336 | consumed samples: 6663680 | consumed tokens: 13647216640 | elapsed time per iteration (s): 0.15 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.812731E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.959 | TFLOPs: 26.13 | +7: iteration 26040/ 60336 | consumed samples: 6666240 | consumed tokens: 13652459520 | elapsed time per iteration (s): 0.15 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.845831E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.445 | TFLOPs: 26.13 | +7: iteration 26050/ 60336 | consumed samples: 6668800 | consumed tokens: 13657702400 | elapsed time per iteration (s): 0.15 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.815005E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.153 | TFLOPs: 26.18 | +7: iteration 26060/ 60336 | consumed samples: 6671360 | consumed tokens: 13662945280 | elapsed time per iteration (s): 0.16 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.813213E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.957 | TFLOPs: 25.53 | +7: iteration 26070/ 60336 | consumed samples: 6673920 | consumed tokens: 13668188160 | elapsed time per iteration (s): 0.15 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.839206E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.825 | TFLOPs: 26.11 | +7: iteration 26080/ 60336 | consumed samples: 6676480 | consumed tokens: 13673431040 | elapsed time per iteration (s): 0.15 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.818013E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.669 | TFLOPs: 26.12 | +7: iteration 26090/ 60336 | consumed samples: 6679040 | consumed tokens: 13678673920 | elapsed time per iteration (s): 0.15 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.821412E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.555 | TFLOPs: 26.12 | +7: iteration 26100/ 60336 | consumed samples: 6681600 | consumed tokens: 13683916800 | elapsed time per iteration (s): 0.15 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.844155E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.085 | TFLOPs: 26.08 | +7: iteration 26110/ 60336 | consumed samples: 6684160 | consumed tokens: 13689159680 | elapsed time per iteration (s): 0.15 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.811855E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.540 | TFLOPs: 26.06 | +7: iteration 26120/ 60336 | consumed samples: 6686720 | consumed tokens: 13694402560 | elapsed time per iteration (s): 0.15 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.827739E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.486 | TFLOPs: 26.09 | +7: iteration 26130/ 60336 | consumed samples: 6689280 | consumed tokens: 13699645440 | elapsed time per iteration (s): 0.16 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.814590E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.268 | TFLOPs: 24.45 | +7: iteration 26140/ 60336 | consumed samples: 6691840 | consumed tokens: 13704888320 | elapsed time per iteration (s): 0.15 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.820880E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.422 | TFLOPs: 26.12 | +7: iteration 26150/ 60336 | consumed samples: 6694400 | consumed tokens: 13710131200 | elapsed time per iteration (s): 0.16 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.810882E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.090 | TFLOPs: 25.23 | +7: iteration 26160/ 60336 | consumed samples: 6696960 | consumed tokens: 13715374080 | elapsed time per iteration (s): 0.15 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.821906E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.452 | TFLOPs: 26.09 | +7: iteration 26170/ 60336 | consumed samples: 6699520 | consumed tokens: 13720616960 | elapsed time per iteration (s): 0.16 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.823744E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.348 | TFLOPs: 25.41 | +7: iteration 26180/ 60336 | consumed samples: 6702080 | consumed tokens: 13725859840 | elapsed time per iteration (s): 0.15 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.828505E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.661 | TFLOPs: 26.11 | +7: iteration 26190/ 60336 | consumed samples: 6704640 | consumed tokens: 13731102720 | elapsed time per iteration (s): 0.16 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.832869E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.095 | TFLOPs: 25.89 | +7: iteration 26200/ 60336 | consumed samples: 6707200 | consumed tokens: 13736345600 | elapsed time per iteration (s): 0.16 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.821189E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.017 | TFLOPs: 24.79 | +7: iteration 26210/ 60336 | consumed samples: 6709760 | consumed tokens: 13741588480 | elapsed time per iteration (s): 0.15 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.813696E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.222 | TFLOPs: 26.10 | +7: iteration 26220/ 60336 | consumed samples: 6712320 | consumed tokens: 13746831360 | elapsed time per iteration (s): 0.15 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.822466E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.826 | TFLOPs: 26.11 | +7: iteration 26230/ 60336 | consumed samples: 6714880 | consumed tokens: 13752074240 | elapsed time per iteration (s): 0.15 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.833646E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.229 | TFLOPs: 26.08 | +7: iteration 26240/ 60336 | consumed samples: 6717440 | consumed tokens: 13757317120 | elapsed time per iteration (s): 0.15 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.820262E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.334 | TFLOPs: 26.09 | +7: iteration 26250/ 60336 | consumed samples: 6720000 | consumed tokens: 13762560000 | elapsed time per iteration (s): 0.15 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.828561E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.616 | TFLOPs: 26.11 | +7: iteration 26260/ 60336 | consumed samples: 6722560 | consumed tokens: 13767802880 | elapsed time per iteration (s): 0.16 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.834028E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.648 | TFLOPs: 25.64 | +7: iteration 26270/ 60336 | consumed samples: 6725120 | consumed tokens: 13773045760 | elapsed time per iteration (s): 0.15 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.833983E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.887 | TFLOPs: 26.09 | +7: iteration 26280/ 60336 | consumed samples: 6727680 | consumed tokens: 13778288640 | elapsed time per iteration (s): 0.15 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.838334E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.869 | TFLOPs: 26.08 | +7: iteration 26290/ 60336 | consumed samples: 6730240 | consumed tokens: 13783531520 | elapsed time per iteration (s): 0.15 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.817802E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.799 | TFLOPs: 26.08 | +7: iteration 26300/ 60336 | consumed samples: 6732800 | consumed tokens: 13788774400 | elapsed time per iteration (s): 0.15 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.832566E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.326 | TFLOPs: 26.10 | +7: iteration 26310/ 60336 | consumed samples: 6735360 | consumed tokens: 13794017280 | elapsed time per iteration (s): 0.15 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.818050E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.213 | TFLOPs: 26.10 | +7: iteration 26320/ 60336 | consumed samples: 6737920 | consumed tokens: 13799260160 | elapsed time per iteration (s): 0.15 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.814957E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.958 | TFLOPs: 26.08 | +7: iteration 26330/ 60336 | consumed samples: 6740480 | consumed tokens: 13804503040 | elapsed time per iteration (s): 0.15 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.832598E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.272 | TFLOPs: 26.08 | +7: iteration 26340/ 60336 | consumed samples: 6743040 | consumed tokens: 13809745920 | elapsed time per iteration (s): 0.15 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.815579E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.492 | TFLOPs: 26.04 | +7: iteration 26350/ 60336 | consumed samples: 6745600 | consumed tokens: 13814988800 | elapsed time per iteration (s): 0.15 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.822680E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.537 | TFLOPs: 26.04 | +7: iteration 26360/ 60336 | consumed samples: 6748160 | consumed tokens: 13820231680 | elapsed time per iteration (s): 0.16 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.827304E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.034 | TFLOPs: 25.72 | +7: iteration 26370/ 60336 | consumed samples: 6750720 | consumed tokens: 13825474560 | elapsed time per iteration (s): 0.15 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.833050E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.216 | TFLOPs: 26.05 | +7: iteration 26380/ 60336 | consumed samples: 6753280 | consumed tokens: 13830717440 | elapsed time per iteration (s): 0.15 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.817522E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.633 | TFLOPs: 26.14 | +7: iteration 26390/ 60336 | consumed samples: 6755840 | consumed tokens: 13835960320 | elapsed time per iteration (s): 0.15 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.834133E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.404 | TFLOPs: 26.21 | +7: iteration 26400/ 60336 | consumed samples: 6758400 | consumed tokens: 13841203200 | elapsed time per iteration (s): 0.15 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.822018E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.901 | TFLOPs: 26.13 | +7: iteration 26410/ 60336 | consumed samples: 6760960 | consumed tokens: 13846446080 | elapsed time per iteration (s): 0.15 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.828476E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.724 | TFLOPs: 26.23 | +7: iteration 26420/ 60336 | consumed samples: 6763520 | consumed tokens: 13851688960 | elapsed time per iteration (s): 0.15 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.807564E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.990 | TFLOPs: 26.21 | +7: iteration 26430/ 60336 | consumed samples: 6766080 | consumed tokens: 13856931840 | elapsed time per iteration (s): 0.15 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.814282E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.607 | TFLOPs: 26.15 | +7: iteration 26440/ 60336 | consumed samples: 6768640 | consumed tokens: 13862174720 | elapsed time per iteration (s): 0.15 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.825701E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.774 | TFLOPs: 26.20 | +7: iteration 26450/ 60336 | consumed samples: 6771200 | consumed tokens: 13867417600 | elapsed time per iteration (s): 0.15 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.809647E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.035 | TFLOPs: 26.19 | +7: iteration 26460/ 60336 | consumed samples: 6773760 | consumed tokens: 13872660480 | elapsed time per iteration (s): 0.15 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.817534E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.808 | TFLOPs: 26.12 | +7: iteration 26470/ 60336 | consumed samples: 6776320 | consumed tokens: 13877903360 | elapsed time per iteration (s): 0.15 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.829570E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.263 | TFLOPs: 26.04 | +7: iteration 26480/ 60336 | consumed samples: 6778880 | consumed tokens: 13883146240 | elapsed time per iteration (s): 0.15 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.826557E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.931 | TFLOPs: 26.03 | +7: iteration 26490/ 60336 | consumed samples: 6781440 | consumed tokens: 13888389120 | elapsed time per iteration (s): 0.15 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.823479E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.222 | TFLOPs: 26.05 | +7: iteration 26500/ 60336 | consumed samples: 6784000 | consumed tokens: 13893632000 | elapsed time per iteration (s): 0.15 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.833311E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.790 | TFLOPs: 26.03 | +7: iteration 26510/ 60336 | consumed samples: 6786560 | consumed tokens: 13898874880 | elapsed time per iteration (s): 0.15 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.814959E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.410 | TFLOPs: 26.06 | +7: iteration 26520/ 60336 | consumed samples: 6789120 | consumed tokens: 13904117760 | elapsed time per iteration (s): 0.15 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.808702E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.923 | TFLOPs: 26.08 | +7: iteration 26530/ 60336 | consumed samples: 6791680 | consumed tokens: 13909360640 | elapsed time per iteration (s): 0.15 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.834925E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.879 | TFLOPs: 26.11 | +7: iteration 26540/ 60336 | consumed samples: 6794240 | consumed tokens: 13914603520 | elapsed time per iteration (s): 0.15 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.810007E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.495 | TFLOPs: 26.09 | +7: iteration 26550/ 60336 | consumed samples: 6796800 | consumed tokens: 13919846400 | elapsed time per iteration (s): 0.15 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.820374E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.375 | TFLOPs: 26.09 | +7: iteration 26560/ 60336 | consumed samples: 6799360 | consumed tokens: 13925089280 | elapsed time per iteration (s): 0.15 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.819336E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 26570/ 60336 | consumed samples: 6801920 | consumed tokens: 13930332160 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.835796E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.970 | TFLOPs: 24.75 | +7: iteration 26580/ 60336 | consumed samples: 6804480 | consumed tokens: 13935575040 | elapsed time per iteration (s): 0.15 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.832626E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.675 | TFLOPs: 26.11 | +7: iteration 26590/ 60336 | consumed samples: 6807040 | consumed tokens: 13940817920 | elapsed time per iteration (s): 0.15 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.818052E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.188 | TFLOPs: 26.05 | +7: iteration 26600/ 60336 | consumed samples: 6809600 | consumed tokens: 13946060800 | elapsed time per iteration (s): 0.15 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.814576E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.464 | TFLOPs: 26.06 | +7: iteration 26610/ 60336 | consumed samples: 6812160 | consumed tokens: 13951303680 | elapsed time per iteration (s): 0.15 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.830956E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.484 | TFLOPs: 26.07 | +7: iteration 26620/ 60336 | consumed samples: 6814720 | consumed tokens: 13956546560 | elapsed time per iteration (s): 0.15 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.828230E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.534 | TFLOPs: 25.99 | +7: iteration 26630/ 60336 | consumed samples: 6817280 | consumed tokens: 13961789440 | elapsed time per iteration (s): 0.15 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.812415E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.493 | TFLOPs: 26.06 | +7: iteration 26640/ 60336 | consumed samples: 6819840 | consumed tokens: 13967032320 | elapsed time per iteration (s): 0.15 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.826699E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.370 | TFLOPs: 26.05 | +7: iteration 26650/ 60336 | consumed samples: 6822400 | consumed tokens: 13972275200 | elapsed time per iteration (s): 0.15 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.824610E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.474 | TFLOPs: 26.07 | +7: iteration 26660/ 60336 | consumed samples: 6824960 | consumed tokens: 13977518080 | elapsed time per iteration (s): 0.15 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.836407E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.331 | TFLOPs: 26.05 | +7: iteration 26670/ 60336 | consumed samples: 6827520 | consumed tokens: 13982760960 | elapsed time per iteration (s): 0.15 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.820003E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.134 | TFLOPs: 26.05 | +7: iteration 26680/ 60336 | consumed samples: 6830080 | consumed tokens: 13988003840 | elapsed time per iteration (s): 0.15 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.825940E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.243 | TFLOPs: 26.05 | +7: iteration 26690/ 60336 | consumed samples: 6832640 | consumed tokens: 13993246720 | elapsed time per iteration (s): 0.15 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.835440E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.597 | TFLOPs: 26.01 | +7: iteration 26700/ 60336 | consumed samples: 6835200 | consumed tokens: 13998489600 | elapsed time per iteration (s): 0.15 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.804973E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.787 | TFLOPs: 26.05 | +7: iteration 26710/ 60336 | consumed samples: 6837760 | consumed tokens: 14003732480 | elapsed time per iteration (s): 0.15 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.822738E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.068 | TFLOPs: 26.07 | +7: iteration 26720/ 60336 | consumed samples: 6840320 | consumed tokens: 14008975360 | elapsed time per iteration (s): 0.15 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.834271E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.140 | TFLOPs: 26.07 | +7: iteration 26730/ 60336 | consumed samples: 6842880 | consumed tokens: 14014218240 | elapsed time per iteration (s): 0.15 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.811718E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.920 | TFLOPs: 26.06 | +7: iteration 26740/ 60336 | consumed samples: 6845440 | consumed tokens: 14019461120 | elapsed time per iteration (s): 0.15 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.827132E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.799 | TFLOPs: 26.06 | +7: iteration 26750/ 60336 | consumed samples: 6848000 | consumed tokens: 14024704000 | elapsed time per iteration (s): 0.15 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.816652E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.440 | TFLOPs: 26.10 | +7: iteration 26760/ 60336 | consumed samples: 6850560 | consumed tokens: 14029946880 | elapsed time per iteration (s): 0.15 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.815917E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.888 | TFLOPs: 26.09 | +7: iteration 26770/ 60336 | consumed samples: 6853120 | consumed tokens: 14035189760 | elapsed time per iteration (s): 0.15 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.825259E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.335 | TFLOPs: 26.09 | +7: iteration 26780/ 60336 | consumed samples: 6855680 | consumed tokens: 14040432640 | elapsed time per iteration (s): 0.15 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.826329E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.594 | TFLOPs: 26.09 | +7: iteration 26790/ 60336 | consumed samples: 6858240 | consumed tokens: 14045675520 | elapsed time per iteration (s): 0.15 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.836263E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.442 | TFLOPs: 26.24 | +7: iteration 26800/ 60336 | consumed samples: 6860800 | consumed tokens: 14050918400 | elapsed time per iteration (s): 0.15 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.818007E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.786 | TFLOPs: 26.25 | +7: iteration 26810/ 60336 | consumed samples: 6863360 | consumed tokens: 14056161280 | elapsed time per iteration (s): 0.15 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.818220E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.527 | TFLOPs: 26.25 | +7: iteration 26820/ 60336 | consumed samples: 6865920 | consumed tokens: 14061404160 | elapsed time per iteration (s): 0.15 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.813113E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.246 | TFLOPs: 26.21 | +7: iteration 26830/ 60336 | consumed samples: 6868480 | consumed tokens: 14066647040 | elapsed time per iteration (s): 0.15 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.817504E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.790 | TFLOPs: 26.20 | +7: iteration 26840/ 60336 | consumed samples: 6871040 | consumed tokens: 14071889920 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.824549E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.087 | TFLOPs: 25.56 | +7: iteration 26850/ 60336 | consumed samples: 6873600 | consumed tokens: 14077132800 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.813780E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.502 | TFLOPs: 25.65 | +7: iteration 26860/ 60336 | consumed samples: 6876160 | consumed tokens: 14082375680 | elapsed time per iteration (s): 0.15 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.827367E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.492 | TFLOPs: 25.93 | +7: iteration 26870/ 60336 | consumed samples: 6878720 | consumed tokens: 14087618560 | elapsed time per iteration (s): 0.15 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.826722E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.208 | TFLOPs: 26.18 | +7: iteration 26880/ 60336 | consumed samples: 6881280 | consumed tokens: 14092861440 | elapsed time per iteration (s): 0.15 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.829295E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.624 | TFLOPs: 26.04 | +7: iteration 26890/ 60336 | consumed samples: 6883840 | consumed tokens: 14098104320 | elapsed time per iteration (s): 0.15 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.821487E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.594 | TFLOPs: 26.04 | +7: iteration 26900/ 60336 | consumed samples: 6886400 | consumed tokens: 14103347200 | elapsed time per iteration (s): 0.15 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.821366E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.301 | TFLOPs: 26.04 | +7: iteration 26910/ 60336 | consumed samples: 6888960 | consumed tokens: 14108590080 | elapsed time per iteration (s): 0.15 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.815548E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.548 | TFLOPs: 26.04 | +7: iteration 26920/ 60336 | consumed samples: 6891520 | consumed tokens: 14113832960 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.823314E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.233 | TFLOPs: 25.83 | +7: iteration 26930/ 60336 | consumed samples: 6894080 | consumed tokens: 14119075840 | elapsed time per iteration (s): 0.15 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.823612E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.050 | TFLOPs: 26.03 | +7: iteration 26940/ 60336 | consumed samples: 6896640 | consumed tokens: 14124318720 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.826023E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.414 | TFLOPs: 24.47 | +7: iteration 26950/ 60336 | consumed samples: 6899200 | consumed tokens: 14129561600 | elapsed time per iteration (s): 0.15 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.818042E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.933 | TFLOPs: 26.03 | +7: iteration 26960/ 60336 | consumed samples: 6901760 | consumed tokens: 14134804480 | elapsed time per iteration (s): 0.15 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.814798E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.155 | TFLOPs: 26.02 | +7: iteration 26970/ 60336 | consumed samples: 6904320 | consumed tokens: 14140047360 | elapsed time per iteration (s): 0.15 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.819001E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.124 | TFLOPs: 26.05 | +7: iteration 26980/ 60336 | consumed samples: 6906880 | consumed tokens: 14145290240 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.806352E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.199 | TFLOPs: 25.42 | +7: iteration 26990/ 60336 | consumed samples: 6909440 | consumed tokens: 14150533120 | elapsed time per iteration (s): 0.15 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.821751E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.536 | TFLOPs: 26.03 | +7: iteration 27000/ 60336 | consumed samples: 6912000 | consumed tokens: 14155776000 | elapsed time per iteration (s): 0.15 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.817830E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.161 | TFLOPs: 26.04 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 27000 | lm loss value: 3.945173E+00 | lm loss PPL: 5.168529E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 27000 to checkpoints_44m32b100m +0: [2023-03-17 01:29:01,464] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step27000 is begin to save! +0: [2023-03-17 01:29:01,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:29:01,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:29:01,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:29:01,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:29:01,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:29:01,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:29:01,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:29:01,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:29:01,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:29:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:29:01,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:29:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:29:01,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:29:01,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:29:01,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:29:01,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:29:01,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:29:01,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:29:01,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:29:01,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:29:01,600] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step27000/mp_rank_00_model_states.pt +0: [2023-03-17 01:29:01,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:29:01,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:29:01,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:29:01,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:29:01,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:29:01,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:29:01,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:29:01,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:29:01,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:29:01,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:29:01,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: successfully saved checkpoint at iteration 27000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 180.34 +7: iteration 27010/ 60336 | consumed samples: 6914560 | consumed tokens: 14161018880 | elapsed time per iteration (s): 0.18 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.827842E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.779 | TFLOPs: 22.77 | +7: iteration 27020/ 60336 | consumed samples: 6917120 | consumed tokens: 14166261760 | elapsed time per iteration (s): 0.15 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.804942E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.050 | TFLOPs: 26.10 | +7: iteration 27030/ 60336 | consumed samples: 6919680 | consumed tokens: 14171504640 | elapsed time per iteration (s): 0.15 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.816485E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.567 | TFLOPs: 26.12 | +7: iteration 27040/ 60336 | consumed samples: 6922240 | consumed tokens: 14176747520 | elapsed time per iteration (s): 0.15 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.805655E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.428 | TFLOPs: 26.13 | +7: iteration 27050/ 60336 | consumed samples: 6924800 | consumed tokens: 14181990400 | elapsed time per iteration (s): 0.15 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.804550E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.987 | TFLOPs: 26.10 | +7: iteration 27060/ 60336 | consumed samples: 6927360 | consumed tokens: 14187233280 | elapsed time per iteration (s): 0.15 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.818254E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.913 | TFLOPs: 26.06 | +7: iteration 27070/ 60336 | consumed samples: 6929920 | consumed tokens: 14192476160 | elapsed time per iteration (s): 0.15 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.821793E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.536 | TFLOPs: 26.03 | +7: iteration 27080/ 60336 | consumed samples: 6932480 | consumed tokens: 14197719040 | elapsed time per iteration (s): 0.15 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.826329E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.210 | TFLOPs: 26.04 | +7: iteration 27090/ 60336 | consumed samples: 6935040 | consumed tokens: 14202961920 | elapsed time per iteration (s): 0.15 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.810292E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.311 | TFLOPs: 26.07 | +7: iteration 27100/ 60336 | consumed samples: 6937600 | consumed tokens: 14208204800 | elapsed time per iteration (s): 0.15 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.813273E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.341 | TFLOPs: 26.01 | +7: iteration 27110/ 60336 | consumed samples: 6940160 | consumed tokens: 14213447680 | elapsed time per iteration (s): 0.15 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.817501E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.536 | TFLOPs: 26.01 | +7: iteration 27120/ 60336 | consumed samples: 6942720 | consumed tokens: 14218690560 | elapsed time per iteration (s): 0.15 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.830899E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.181 | TFLOPs: 26.08 | +7: iteration 27130/ 60336 | consumed samples: 6945280 | consumed tokens: 14223933440 | elapsed time per iteration (s): 0.15 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.831405E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.372 | TFLOPs: 26.07 | +7: iteration 27140/ 60336 | consumed samples: 6947840 | consumed tokens: 14229176320 | elapsed time per iteration (s): 0.15 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.822620E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.880 | TFLOPs: 26.08 | +7: iteration 27150/ 60336 | consumed samples: 6950400 | consumed tokens: 14234419200 | elapsed time per iteration (s): 0.15 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.819925E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.472 | TFLOPs: 26.10 | +7: iteration 27160/ 60336 | consumed samples: 6952960 | consumed tokens: 14239662080 | elapsed time per iteration (s): 0.15 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.825105E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.658 | TFLOPs: 26.09 | +7: iteration 27170/ 60336 | consumed samples: 6955520 | consumed tokens: 14244904960 | elapsed time per iteration (s): 0.15 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.819493E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.410 | TFLOPs: 26.07 | +7: iteration 27180/ 60336 | consumed samples: 6958080 | consumed tokens: 14250147840 | elapsed time per iteration (s): 0.15 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.803426E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.819 | TFLOPs: 26.08 | +7: iteration 27190/ 60336 | consumed samples: 6960640 | consumed tokens: 14255390720 | elapsed time per iteration (s): 0.15 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.822376E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.431 | TFLOPs: 26.10 | +7: iteration 27200/ 60336 | consumed samples: 6963200 | consumed tokens: 14260633600 | elapsed time per iteration (s): 0.15 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.816684E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.929 | TFLOPs: 26.08 | +7: iteration 27210/ 60336 | consumed samples: 6965760 | consumed tokens: 14265876480 | elapsed time per iteration (s): 0.15 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.820786E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.937 | TFLOPs: 26.08 | +7: iteration 27220/ 60336 | consumed samples: 6968320 | consumed tokens: 14271119360 | elapsed time per iteration (s): 0.15 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.835006E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.921 | TFLOPs: 26.08 | +7: iteration 27230/ 60336 | consumed samples: 6970880 | consumed tokens: 14276362240 | elapsed time per iteration (s): 0.15 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.818313E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.875 | TFLOPs: 25.91 | +7: iteration 27240/ 60336 | consumed samples: 6973440 | consumed tokens: 14281605120 | elapsed time per iteration (s): 0.16 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.808931E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.119 | TFLOPs: 25.49 | +7: iteration 27250/ 60336 | consumed samples: 6976000 | consumed tokens: 14286848000 | elapsed time per iteration (s): 0.15 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.815659E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.203 | TFLOPs: 26.05 | +7: iteration 27260/ 60336 | consumed samples: 6978560 | consumed tokens: 14292090880 | elapsed time per iteration (s): 0.15 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.805887E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.643 | TFLOPs: 26.07 | +7: iteration 27270/ 60336 | consumed samples: 6981120 | consumed tokens: 14297333760 | elapsed time per iteration (s): 0.15 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.827465E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.618 | TFLOPs: 26.06 | +7: iteration 27280/ 60336 | consumed samples: 6983680 | consumed tokens: 14302576640 | elapsed time per iteration (s): 0.15 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.807929E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.194 | TFLOPs: 26.08 | +7: iteration 27290/ 60336 | consumed samples: 6986240 | consumed tokens: 14307819520 | elapsed time per iteration (s): 0.15 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.810920E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.321 | TFLOPs: 26.07 | +7: iteration 27300/ 60336 | consumed samples: 6988800 | consumed tokens: 14313062400 | elapsed time per iteration (s): 0.15 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.811966E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.976 | TFLOPs: 26.06 | +7: iteration 27310/ 60336 | consumed samples: 6991360 | consumed tokens: 14318305280 | elapsed time per iteration (s): 0.15 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.821609E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.331 | TFLOPs: 26.12 | +7: iteration 27320/ 60336 | consumed samples: 6993920 | consumed tokens: 14323548160 | elapsed time per iteration (s): 0.15 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.817979E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.085 | TFLOPs: 26.13 | +7: iteration 27330/ 60336 | consumed samples: 6996480 | consumed tokens: 14328791040 | elapsed time per iteration (s): 0.15 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.832600E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.127 | TFLOPs: 26.11 | +7: iteration 27340/ 60336 | consumed samples: 6999040 | consumed tokens: 14334033920 | elapsed time per iteration (s): 0.15 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.822562E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.483 | TFLOPs: 26.12 | +7: iteration 27350/ 60336 | consumed samples: 7001600 | consumed tokens: 14339276800 | elapsed time per iteration (s): 0.15 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.821035E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.328 | TFLOPs: 26.12 | +7: iteration 27360/ 60336 | consumed samples: 7004160 | consumed tokens: 14344519680 | elapsed time per iteration (s): 0.15 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.822684E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.988 | TFLOPs: 26.10 | +7: iteration 27370/ 60336 | consumed samples: 7006720 | consumed tokens: 14349762560 | elapsed time per iteration (s): 0.15 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.821495E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.306 | TFLOPs: 26.10 | +7: iteration 27380/ 60336 | consumed samples: 7009280 | consumed tokens: 14355005440 | elapsed time per iteration (s): 0.15 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.821636E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.229 | TFLOPs: 26.10 | +7: iteration 27390/ 60336 | consumed samples: 7011840 | consumed tokens: 14360248320 | elapsed time per iteration (s): 0.15 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.818184E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.107 | TFLOPs: 26.10 | +7: iteration 27400/ 60336 | consumed samples: 7014400 | consumed tokens: 14365491200 | elapsed time per iteration (s): 0.15 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.819763E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.271 | TFLOPs: 26.12 | +7: iteration 27410/ 60336 | consumed samples: 7016960 | consumed tokens: 14370734080 | elapsed time per iteration (s): 0.16 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.812711E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.692 | TFLOPs: 25.86 | +7: iteration 27420/ 60336 | consumed samples: 7019520 | consumed tokens: 14375976960 | elapsed time per iteration (s): 0.15 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.822684E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.618 | TFLOPs: 26.12 | +7: iteration 27430/ 60336 | consumed samples: 7022080 | consumed tokens: 14381219840 | elapsed time per iteration (s): 0.15 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.830606E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.138 | TFLOPs: 26.13 | +7: iteration 27440/ 60336 | consumed samples: 7024640 | consumed tokens: 14386462720 | elapsed time per iteration (s): 0.15 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.820242E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.913 | TFLOPs: 26.13 | +7: iteration 27450/ 60336 | consumed samples: 7027200 | consumed tokens: 14391705600 | elapsed time per iteration (s): 0.15 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.823321E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.582 | TFLOPs: 26.12 | +7: iteration 27460/ 60336 | consumed samples: 7029760 | consumed tokens: 14396948480 | elapsed time per iteration (s): 0.15 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.819355E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.921 | TFLOPs: 26.14 | +7: iteration 27470/ 60336 | consumed samples: 7032320 | consumed tokens: 14402191360 | elapsed time per iteration (s): 0.15 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.832920E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.012 | TFLOPs: 26.10 | +7: iteration 27480/ 60336 | consumed samples: 7034880 | consumed tokens: 14407434240 | elapsed time per iteration (s): 0.15 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.825190E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.722 | TFLOPs: 26.11 | +7: iteration 27490/ 60336 | consumed samples: 7037440 | consumed tokens: 14412677120 | elapsed time per iteration (s): 0.15 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.821568E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.183 | TFLOPs: 26.07 | +7: iteration 27500/ 60336 | consumed samples: 7040000 | consumed tokens: 14417920000 | elapsed time per iteration (s): 0.15 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.824193E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.116 | TFLOPs: 26.11 | +7: iteration 27510/ 60336 | consumed samples: 7042560 | consumed tokens: 14423162880 | elapsed time per iteration (s): 0.15 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.813917E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.202 | TFLOPs: 26.11 | +7: iteration 27520/ 60336 | consumed samples: 7045120 | consumed tokens: 14428405760 | elapsed time per iteration (s): 0.15 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.814674E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.226 | TFLOPs: 26.08 | +7: iteration 27530/ 60336 | consumed samples: 7047680 | consumed tokens: 14433648640 | elapsed time per iteration (s): 0.15 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.822665E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.248 | TFLOPs: 26.08 | +7: iteration 27540/ 60336 | consumed samples: 7050240 | consumed tokens: 14438891520 | elapsed time per iteration (s): 0.15 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.819326E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.507 | TFLOPs: 26.10 | +7: iteration 27550/ 60336 | consumed samples: 7052800 | consumed tokens: 14444134400 | elapsed time per iteration (s): 0.15 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.802760E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.246 | TFLOPs: 26.04 | +7: iteration 27560/ 60336 | consumed samples: 7055360 | consumed tokens: 14449377280 | elapsed time per iteration (s): 0.15 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.812978E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.489 | TFLOPs: 25.99 | +7: iteration 27570/ 60336 | consumed samples: 7057920 | consumed tokens: 14454620160 | elapsed time per iteration (s): 0.15 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.816847E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.954 | TFLOPs: 26.02 | +7: iteration 27580/ 60336 | consumed samples: 7060480 | consumed tokens: 14459863040 | elapsed time per iteration (s): 0.15 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.832327E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.409 | TFLOPs: 26.02 | +7: iteration 27590/ 60336 | consumed samples: 7063040 | consumed tokens: 14465105920 | elapsed time per iteration (s): 0.15 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.823709E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.048 | TFLOPs: 26.05 | +7: iteration 27600/ 60336 | consumed samples: 7065600 | consumed tokens: 14470348800 | elapsed time per iteration (s): 0.15 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.808865E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.869 | TFLOPs: 26.05 | +7: iteration 27610/ 60336 | consumed samples: 7068160 | consumed tokens: 14475591680 | elapsed time per iteration (s): 0.15 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.817922E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.737 | TFLOPs: 26.03 | +7: iteration 27620/ 60336 | consumed samples: 7070720 | consumed tokens: 14480834560 | elapsed time per iteration (s): 0.15 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.821639E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.111 | TFLOPs: 26.05 | +7: iteration 27630/ 60336 | consumed samples: 7073280 | consumed tokens: 14486077440 | elapsed time per iteration (s): 0.15 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.822371E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.274 | TFLOPs: 26.07 | +7: iteration 27640/ 60336 | consumed samples: 7075840 | consumed tokens: 14491320320 | elapsed time per iteration (s): 0.15 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.805704E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.900 | TFLOPs: 26.05 | +7: iteration 27650/ 60336 | consumed samples: 7078400 | consumed tokens: 14496563200 | elapsed time per iteration (s): 0.15 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.824783E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.731 | TFLOPs: 26.06 | +7: iteration 27660/ 60336 | consumed samples: 7080960 | consumed tokens: 14501806080 | elapsed time per iteration (s): 0.15 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.830611E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.950 | TFLOPs: 26.06 | +7: iteration 27670/ 60336 | consumed samples: 7083520 | consumed tokens: 14507048960 | elapsed time per iteration (s): 0.15 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.820357E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.786 | TFLOPs: 26.06 | +7: iteration 27680/ 60336 | consumed samples: 7086080 | consumed tokens: 14512291840 | elapsed time per iteration (s): 0.15 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.816234E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.479 | TFLOPs: 26.06 | +7: iteration 27690/ 60336 | consumed samples: 7088640 | consumed tokens: 14517534720 | elapsed time per iteration (s): 0.15 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.825651E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.307 | TFLOPs: 26.07 | +7: iteration 27700/ 60336 | consumed samples: 7091200 | consumed tokens: 14522777600 | elapsed time per iteration (s): 0.15 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.818443E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.187 | TFLOPs: 26.05 | +7: iteration 27710/ 60336 | consumed samples: 7093760 | consumed tokens: 14528020480 | elapsed time per iteration (s): 0.15 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.810075E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.607 | TFLOPs: 26.07 | +7: iteration 27720/ 60336 | consumed samples: 7096320 | consumed tokens: 14533263360 | elapsed time per iteration (s): 0.15 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.812057E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.163 | TFLOPs: 26.05 | +7: iteration 27730/ 60336 | consumed samples: 7098880 | consumed tokens: 14538506240 | elapsed time per iteration (s): 0.15 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.820653E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.596 | TFLOPs: 26.07 | +7: iteration 27740/ 60336 | consumed samples: 7101440 | consumed tokens: 14543749120 | elapsed time per iteration (s): 0.15 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.816153E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.672 | TFLOPs: 26.07 | +7: iteration 27750/ 60336 | consumed samples: 7104000 | consumed tokens: 14548992000 | elapsed time per iteration (s): 0.15 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.806031E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.722 | TFLOPs: 26.08 | +7: iteration 27760/ 60336 | consumed samples: 7106560 | consumed tokens: 14554234880 | elapsed time per iteration (s): 0.15 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.807028E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.110 | TFLOPs: 26.08 | +7: iteration 27770/ 60336 | consumed samples: 7109120 | consumed tokens: 14559477760 | elapsed time per iteration (s): 0.15 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.824284E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.993 | TFLOPs: 26.03 | +7: iteration 27780/ 60336 | consumed samples: 7111680 | consumed tokens: 14564720640 | elapsed time per iteration (s): 0.15 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.810394E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.015 | TFLOPs: 26.03 | +7: iteration 27790/ 60336 | consumed samples: 7114240 | consumed tokens: 14569963520 | elapsed time per iteration (s): 0.15 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.810332E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.399 | TFLOPs: 26.05 | +7: iteration 27800/ 60336 | consumed samples: 7116800 | consumed tokens: 14575206400 | elapsed time per iteration (s): 0.15 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.816951E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.741 | TFLOPs: 26.06 | +7: iteration 27810/ 60336 | consumed samples: 7119360 | consumed tokens: 14580449280 | elapsed time per iteration (s): 0.15 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.827242E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.843 | TFLOPs: 26.05 | +7: iteration 27820/ 60336 | consumed samples: 7121920 | consumed tokens: 14585692160 | elapsed time per iteration (s): 0.15 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.816889E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.811 | TFLOPs: 26.08 | +7: iteration 27830/ 60336 | consumed samples: 7124480 | consumed tokens: 14590935040 | elapsed time per iteration (s): 0.15 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.811035E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.763 | TFLOPs: 26.08 | +7: iteration 27840/ 60336 | consumed samples: 7127040 | consumed tokens: 14596177920 | elapsed time per iteration (s): 0.15 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.811125E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.837 | TFLOPs: 26.08 | +7: iteration 27850/ 60336 | consumed samples: 7129600 | consumed tokens: 14601420800 | elapsed time per iteration (s): 0.15 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.818488E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.919 | TFLOPs: 25.92 | +7: iteration 27860/ 60336 | consumed samples: 7132160 | consumed tokens: 14606663680 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.801889E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.382 | TFLOPs: 25.88 | +7: iteration 27870/ 60336 | consumed samples: 7134720 | consumed tokens: 14611906560 | elapsed time per iteration (s): 0.15 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.810557E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.442 | TFLOPs: 26.07 | +7: iteration 27880/ 60336 | consumed samples: 7137280 | consumed tokens: 14617149440 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.805648E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.554 | TFLOPs: 26.06 | +7: iteration 27890/ 60336 | consumed samples: 7139840 | consumed tokens: 14622392320 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.830183E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.180 | TFLOPs: 26.07 | +7: iteration 27900/ 60336 | consumed samples: 7142400 | consumed tokens: 14627635200 | elapsed time per iteration (s): 0.15 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.815062E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.193 | TFLOPs: 26.05 | +7: iteration 27910/ 60336 | consumed samples: 7144960 | consumed tokens: 14632878080 | elapsed time per iteration (s): 0.15 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.813084E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.282 | TFLOPs: 26.05 | +7: iteration 27920/ 60336 | consumed samples: 7147520 | consumed tokens: 14638120960 | elapsed time per iteration (s): 0.15 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.827950E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.745 | TFLOPs: 26.03 | +7: iteration 27930/ 60336 | consumed samples: 7150080 | consumed tokens: 14643363840 | elapsed time per iteration (s): 0.15 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.803574E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.941 | TFLOPs: 26.08 | +7: iteration 27940/ 60336 | consumed samples: 7152640 | consumed tokens: 14648606720 | elapsed time per iteration (s): 0.15 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.830630E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.961 | TFLOPs: 26.11 | +7: iteration 27950/ 60336 | consumed samples: 7155200 | consumed tokens: 14653849600 | elapsed time per iteration (s): 0.15 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.807486E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.799 | TFLOPs: 26.16 | +7: iteration 27960/ 60336 | consumed samples: 7157760 | consumed tokens: 14659092480 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.822687E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.059 | TFLOPs: 26.14 | +7: iteration 27970/ 60336 | consumed samples: 7160320 | consumed tokens: 14664335360 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.822591E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.559 | TFLOPs: 26.14 | +7: iteration 27980/ 60336 | consumed samples: 7162880 | consumed tokens: 14669578240 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.816391E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.030 | TFLOPs: 26.02 | +7: iteration 27990/ 60336 | consumed samples: 7165440 | consumed tokens: 14674821120 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.805977E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.061 | TFLOPs: 26.02 | +0: [2023-03-17 01:31:35,678] [INFO] [logging.py:68:log_dist] [Rank 0] step=28000, skipped=0, lr=[0.00012165749805640123, 0.00012165749805640123, 0.00012165749805640123], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 28000/ 60336 | consumed samples: 7168000 | consumed tokens: 14680064000 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.822777E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.131 | TFLOPs: 26.07 | +0: steps: 28000 loss: 3.7904 iter time (s): 0.153 samples/sec: 1675.788 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 28000 | lm loss value: 3.929517E+00 | lm loss PPL: 5.088241E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 28000 to checkpoints_44m32b100m +0: [2023-03-17 01:31:35,749] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step28000 is begin to save! +0: [2023-03-17 01:31:35,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:31:35,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:31:35,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:31:35,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:31:35,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:31:35,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:31:35,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:31:35,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:31:35,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:31:35,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:31:35,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:31:35,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:31:35,855] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:31:35,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:31:35,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:31:35,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:31:35,871] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:31:35,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:31:35,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:31:35,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:31:35,880] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step28000/mp_rank_00_model_states.pt +0: [2023-03-17 01:31:35,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:31:35,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:31:35,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:31:35,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:31:35,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:31:35,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:31:35,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:31:35,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:31:35,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: successfully saved checkpoint at iteration 28000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.59 +7: iteration 28010/ 60336 | consumed samples: 7170560 | consumed tokens: 14685306880 | elapsed time per iteration (s): 0.18 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.810749E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.376 | TFLOPs: 22.56 | +7: iteration 28020/ 60336 | consumed samples: 7173120 | consumed tokens: 14690549760 | elapsed time per iteration (s): 0.15 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.822051E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.158 | TFLOPs: 26.15 | +7: iteration 28030/ 60336 | consumed samples: 7175680 | consumed tokens: 14695792640 | elapsed time per iteration (s): 0.15 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.812099E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.856 | TFLOPs: 26.11 | +7: iteration 28040/ 60336 | consumed samples: 7178240 | consumed tokens: 14701035520 | elapsed time per iteration (s): 0.15 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.827439E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.631 | TFLOPs: 26.11 | +7: iteration 28050/ 60336 | consumed samples: 7180800 | consumed tokens: 14706278400 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.814103E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.798 | TFLOPs: 24.87 | +7: iteration 28060/ 60336 | consumed samples: 7183360 | consumed tokens: 14711521280 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.814700E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.405 | TFLOPs: 25.73 | +7: iteration 28070/ 60336 | consumed samples: 7185920 | consumed tokens: 14716764160 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.828339E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.326 | TFLOPs: 25.77 | +7: iteration 28080/ 60336 | consumed samples: 7188480 | consumed tokens: 14722007040 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.810928E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.192 | TFLOPs: 25.17 | +7: iteration 28090/ 60336 | consumed samples: 7191040 | consumed tokens: 14727249920 | elapsed time per iteration (s): 0.15 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.814911E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.970 | TFLOPs: 26.14 | +7: iteration 28100/ 60336 | consumed samples: 7193600 | consumed tokens: 14732492800 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.828519E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.780 | TFLOPs: 25.73 | +7: iteration 28110/ 60336 | consumed samples: 7196160 | consumed tokens: 14737735680 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.811190E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.581 | TFLOPs: 25.52 | +7: iteration 28120/ 60336 | consumed samples: 7198720 | consumed tokens: 14742978560 | elapsed time per iteration (s): 0.15 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.816294E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.259 | TFLOPs: 26.07 | +7: iteration 28130/ 60336 | consumed samples: 7201280 | consumed tokens: 14748221440 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.821335E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.730 | TFLOPs: 25.70 | +7: iteration 28140/ 60336 | consumed samples: 7203840 | consumed tokens: 14753464320 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.812442E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.052 | TFLOPs: 25.28 | +7: iteration 28150/ 60336 | consumed samples: 7206400 | consumed tokens: 14758707200 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.830612E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.118 | TFLOPs: 25.72 | +7: iteration 28160/ 60336 | consumed samples: 7208960 | consumed tokens: 14763950080 | elapsed time per iteration (s): 0.16 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.816811E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.435 | TFLOPs: 25.55 | +7: iteration 28170/ 60336 | consumed samples: 7211520 | consumed tokens: 14769192960 | elapsed time per iteration (s): 0.15 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.808897E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.457 | TFLOPs: 26.12 | +7: iteration 28180/ 60336 | consumed samples: 7214080 | consumed tokens: 14774435840 | elapsed time per iteration (s): 0.15 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.813467E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.944 | TFLOPs: 26.08 | +7: iteration 28190/ 60336 | consumed samples: 7216640 | consumed tokens: 14779678720 | elapsed time per iteration (s): 0.15 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.804858E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.132 | TFLOPs: 26.11 | +7: iteration 28200/ 60336 | consumed samples: 7219200 | consumed tokens: 14784921600 | elapsed time per iteration (s): 0.16 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.825980E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.371 | TFLOPs: 25.79 | +7: iteration 28210/ 60336 | consumed samples: 7221760 | consumed tokens: 14790164480 | elapsed time per iteration (s): 0.15 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.812089E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.412 | TFLOPs: 26.10 | +7: iteration 28220/ 60336 | consumed samples: 7224320 | consumed tokens: 14795407360 | elapsed time per iteration (s): 0.16 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.826461E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.423 | TFLOPs: 25.21 | +7: iteration 28230/ 60336 | consumed samples: 7226880 | consumed tokens: 14800650240 | elapsed time per iteration (s): 0.16 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.811877E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.756 | TFLOPs: 25.43 | +7: iteration 28240/ 60336 | consumed samples: 7229440 | consumed tokens: 14805893120 | elapsed time per iteration (s): 0.16 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.796611E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.276 | TFLOPs: 25.55 | +7: iteration 28250/ 60336 | consumed samples: 7232000 | consumed tokens: 14811136000 | elapsed time per iteration (s): 0.15 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.830413E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.727 | TFLOPs: 26.12 | +7: iteration 28260/ 60336 | consumed samples: 7234560 | consumed tokens: 14816378880 | elapsed time per iteration (s): 0.15 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.805621E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.378 | TFLOPs: 26.09 | +7: iteration 28270/ 60336 | consumed samples: 7237120 | consumed tokens: 14821621760 | elapsed time per iteration (s): 0.15 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.809781E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.750 | TFLOPs: 26.12 | +7: iteration 28280/ 60336 | consumed samples: 7239680 | consumed tokens: 14826864640 | elapsed time per iteration (s): 0.15 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.809006E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.392 | TFLOPs: 26.10 | +7: iteration 28290/ 60336 | consumed samples: 7242240 | consumed tokens: 14832107520 | elapsed time per iteration (s): 0.15 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.820081E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.943 | TFLOPs: 26.11 | +7: iteration 28300/ 60336 | consumed samples: 7244800 | consumed tokens: 14837350400 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.820413E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.479 | TFLOPs: 26.12 | +7: iteration 28310/ 60336 | consumed samples: 7247360 | consumed tokens: 14842593280 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.808707E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.087 | TFLOPs: 26.10 | +7: iteration 28320/ 60336 | consumed samples: 7249920 | consumed tokens: 14847836160 | elapsed time per iteration (s): 0.16 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.807079E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.730 | TFLOPs: 25.57 | +7: iteration 28330/ 60336 | consumed samples: 7252480 | consumed tokens: 14853079040 | elapsed time per iteration (s): 0.15 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.823249E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.298 | TFLOPs: 26.08 | +7: iteration 28340/ 60336 | consumed samples: 7255040 | consumed tokens: 14858321920 | elapsed time per iteration (s): 0.16 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.812912E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.257 | TFLOPs: 25.66 | +7: iteration 28350/ 60336 | consumed samples: 7257600 | consumed tokens: 14863564800 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.800505E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.021 | TFLOPs: 26.10 | +7: iteration 28360/ 60336 | consumed samples: 7260160 | consumed tokens: 14868807680 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.813381E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.753 | TFLOPs: 26.11 | +7: iteration 28370/ 60336 | consumed samples: 7262720 | consumed tokens: 14874050560 | elapsed time per iteration (s): 0.16 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.813134E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.856 | TFLOPs: 25.73 | +7: iteration 28380/ 60336 | consumed samples: 7265280 | consumed tokens: 14879293440 | elapsed time per iteration (s): 0.15 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.823972E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.466 | TFLOPs: 26.12 | +7: iteration 28390/ 60336 | consumed samples: 7267840 | consumed tokens: 14884536320 | elapsed time per iteration (s): 0.15 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.813124E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.319 | TFLOPs: 26.10 | +7: iteration 28400/ 60336 | consumed samples: 7270400 | consumed tokens: 14889779200 | elapsed time per iteration (s): 0.15 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.808949E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.913 | TFLOPs: 26.11 | +7: iteration 28410/ 60336 | consumed samples: 7272960 | consumed tokens: 14895022080 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.804303E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.719 | TFLOPs: 26.12 | +7: iteration 28420/ 60336 | consumed samples: 7275520 | consumed tokens: 14900264960 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.799541E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.000 | TFLOPs: 26.11 | +7: iteration 28430/ 60336 | consumed samples: 7278080 | consumed tokens: 14905507840 | elapsed time per iteration (s): 0.15 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.815378E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.582 | TFLOPs: 26.12 | +7: iteration 28440/ 60336 | consumed samples: 7280640 | consumed tokens: 14910750720 | elapsed time per iteration (s): 0.15 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.816249E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.255 | TFLOPs: 26.13 | +7: iteration 28450/ 60336 | consumed samples: 7283200 | consumed tokens: 14915993600 | elapsed time per iteration (s): 0.15 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.805853E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.613 | TFLOPs: 26.14 | +7: iteration 28460/ 60336 | consumed samples: 7285760 | consumed tokens: 14921236480 | elapsed time per iteration (s): 0.15 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.816037E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.045 | TFLOPs: 26.11 | +7: iteration 28470/ 60336 | consumed samples: 7288320 | consumed tokens: 14926479360 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.809029E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.074 | TFLOPs: 26.10 | +7: iteration 28480/ 60336 | consumed samples: 7290880 | consumed tokens: 14931722240 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.801495E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.832 | TFLOPs: 26.08 | +7: iteration 28490/ 60336 | consumed samples: 7293440 | consumed tokens: 14936965120 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.820951E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.653 | TFLOPs: 26.14 | +7: iteration 28500/ 60336 | consumed samples: 7296000 | consumed tokens: 14942208000 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.812502E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.632 | TFLOPs: 26.14 | +7: iteration 28510/ 60336 | consumed samples: 7298560 | consumed tokens: 14947450880 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.810828E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.193 | TFLOPs: 26.13 | +7: iteration 28520/ 60336 | consumed samples: 7301120 | consumed tokens: 14952693760 | elapsed time per iteration (s): 0.15 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.821992E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.763 | TFLOPs: 26.12 | +7: iteration 28530/ 60336 | consumed samples: 7303680 | consumed tokens: 14957936640 | elapsed time per iteration (s): 0.15 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.805280E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.831 | TFLOPs: 26.06 | +7: iteration 28540/ 60336 | consumed samples: 7306240 | consumed tokens: 14963179520 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.825952E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.504 | TFLOPs: 26.13 | +7: iteration 28550/ 60336 | consumed samples: 7308800 | consumed tokens: 14968422400 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.819949E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.070 | TFLOPs: 26.11 | +7: iteration 28560/ 60336 | consumed samples: 7311360 | consumed tokens: 14973665280 | elapsed time per iteration (s): 0.15 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.805759E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.997 | TFLOPs: 26.08 | +7: iteration 28570/ 60336 | consumed samples: 7313920 | consumed tokens: 14978908160 | elapsed time per iteration (s): 0.15 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.808532E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.340 | TFLOPs: 26.10 | +7: iteration 28580/ 60336 | consumed samples: 7316480 | consumed tokens: 14984151040 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.806091E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.697 | TFLOPs: 26.08 | +7: iteration 28590/ 60336 | consumed samples: 7319040 | consumed tokens: 14989393920 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.840187E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.312 | TFLOPs: 26.07 | +7: iteration 28600/ 60336 | consumed samples: 7321600 | consumed tokens: 14994636800 | elapsed time per iteration (s): 0.15 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.814936E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.541 | TFLOPs: 26.04 | +7: iteration 28610/ 60336 | consumed samples: 7324160 | consumed tokens: 14999879680 | elapsed time per iteration (s): 0.15 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.821731E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.405 | TFLOPs: 26.04 | +7: iteration 28620/ 60336 | consumed samples: 7326720 | consumed tokens: 15005122560 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.820837E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.839 | TFLOPs: 26.05 | +7: iteration 28630/ 60336 | consumed samples: 7329280 | consumed tokens: 15010365440 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.815828E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.614 | TFLOPs: 26.04 | +7: iteration 28640/ 60336 | consumed samples: 7331840 | consumed tokens: 15015608320 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.820041E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.300 | TFLOPs: 26.04 | +7: iteration 28650/ 60336 | consumed samples: 7334400 | consumed tokens: 15020851200 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.817008E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.289 | TFLOPs: 26.04 | +7: iteration 28660/ 60336 | consumed samples: 7336960 | consumed tokens: 15026094080 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.815925E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.233 | TFLOPs: 26.04 | +7: iteration 28670/ 60336 | consumed samples: 7339520 | consumed tokens: 15031336960 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.819205E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.893 | TFLOPs: 26.03 | +7: iteration 28680/ 60336 | consumed samples: 7342080 | consumed tokens: 15036579840 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.822991E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.380 | TFLOPs: 26.02 | +7: iteration 28690/ 60336 | consumed samples: 7344640 | consumed tokens: 15041822720 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.803841E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.233 | TFLOPs: 26.05 | +7: iteration 28700/ 60336 | consumed samples: 7347200 | consumed tokens: 15047065600 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.803712E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.822 | TFLOPs: 26.06 | +7: iteration 28710/ 60336 | consumed samples: 7349760 | consumed tokens: 15052308480 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.807977E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.748 | TFLOPs: 26.04 | +7: iteration 28720/ 60336 | consumed samples: 7352320 | consumed tokens: 15057551360 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.804815E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.689 | TFLOPs: 26.04 | +7: iteration 28730/ 60336 | consumed samples: 7354880 | consumed tokens: 15062794240 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.811732E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.721 | TFLOPs: 26.04 | +7: iteration 28740/ 60336 | consumed samples: 7357440 | consumed tokens: 15068037120 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.811575E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.539 | TFLOPs: 26.01 | +7: iteration 28750/ 60336 | consumed samples: 7360000 | consumed tokens: 15073280000 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.815612E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.651 | TFLOPs: 26.03 | +7: iteration 28760/ 60336 | consumed samples: 7362560 | consumed tokens: 15078522880 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.808674E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.283 | TFLOPs: 26.04 | +7: iteration 28770/ 60336 | consumed samples: 7365120 | consumed tokens: 15083765760 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.806436E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.529 | TFLOPs: 26.03 | +7: iteration 28780/ 60336 | consumed samples: 7367680 | consumed tokens: 15089008640 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.815537E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.205 | TFLOPs: 26.05 | +7: iteration 28790/ 60336 | consumed samples: 7370240 | consumed tokens: 15094251520 | elapsed time per iteration (s): 0.15 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.825030E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.227 | TFLOPs: 26.10 | +7: iteration 28800/ 60336 | consumed samples: 7372800 | consumed tokens: 15099494400 | elapsed time per iteration (s): 0.15 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.811103E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.598 | TFLOPs: 26.09 | +7: iteration 28810/ 60336 | consumed samples: 7375360 | consumed tokens: 15104737280 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.814002E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.545 | TFLOPs: 26.09 | +7: iteration 28820/ 60336 | consumed samples: 7377920 | consumed tokens: 15109980160 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.833427E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.081 | TFLOPs: 26.10 | +7: iteration 28830/ 60336 | consumed samples: 7380480 | consumed tokens: 15115223040 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.818978E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.587 | TFLOPs: 26.10 | +7: iteration 28840/ 60336 | consumed samples: 7383040 | consumed tokens: 15120465920 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.804735E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.743 | TFLOPs: 26.11 | +7: iteration 28850/ 60336 | consumed samples: 7385600 | consumed tokens: 15125708800 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.813325E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.860 | TFLOPs: 26.09 | +7: iteration 28860/ 60336 | consumed samples: 7388160 | consumed tokens: 15130951680 | elapsed time per iteration (s): 0.15 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.801411E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.621 | TFLOPs: 26.11 | +7: iteration 28870/ 60336 | consumed samples: 7390720 | consumed tokens: 15136194560 | elapsed time per iteration (s): 0.15 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.805598E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.048 | TFLOPs: 26.10 | +7: iteration 28880/ 60336 | consumed samples: 7393280 | consumed tokens: 15141437440 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.823618E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.867 | TFLOPs: 26.09 | +7: iteration 28890/ 60336 | consumed samples: 7395840 | consumed tokens: 15146680320 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.801028E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.800 | TFLOPs: 26.09 | +7: iteration 28900/ 60336 | consumed samples: 7398400 | consumed tokens: 15151923200 | elapsed time per iteration (s): 0.15 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.802232E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.146 | TFLOPs: 26.10 | +7: iteration 28910/ 60336 | consumed samples: 7400960 | consumed tokens: 15157166080 | elapsed time per iteration (s): 0.15 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.824017E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.699 | TFLOPs: 26.11 | +7: iteration 28920/ 60336 | consumed samples: 7403520 | consumed tokens: 15162408960 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.809619E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.131 | TFLOPs: 26.11 | +7: iteration 28930/ 60336 | consumed samples: 7406080 | consumed tokens: 15167651840 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.815898E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.112 | TFLOPs: 26.10 | +7: iteration 28940/ 60336 | consumed samples: 7408640 | consumed tokens: 15172894720 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.819974E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.271 | TFLOPs: 26.12 | +7: iteration 28950/ 60336 | consumed samples: 7411200 | consumed tokens: 15178137600 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.811650E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.231 | TFLOPs: 26.11 | +7: iteration 28960/ 60336 | consumed samples: 7413760 | consumed tokens: 15183380480 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.817294E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.901 | TFLOPs: 26.08 | +7: iteration 28970/ 60336 | consumed samples: 7416320 | consumed tokens: 15188623360 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.808588E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.749 | TFLOPs: 26.09 | +7: iteration 28980/ 60336 | consumed samples: 7418880 | consumed tokens: 15193866240 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.813221E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.383 | TFLOPs: 26.10 | +7: iteration 28990/ 60336 | consumed samples: 7421440 | consumed tokens: 15199109120 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.814655E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.195 | TFLOPs: 26.05 | +7: iteration 29000/ 60336 | consumed samples: 7424000 | consumed tokens: 15204352000 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.823771E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.254 | TFLOPs: 26.08 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 29000 | lm loss value: 3.869955E+00 | lm loss PPL: 4.794023E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 29000 to checkpoints_44m32b100m +0: [2023-03-17 01:34:10,437] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step29000 is begin to save! +0: [2023-03-17 01:34:10,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:34:10,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:34:10,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:34:10,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:34:10,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:34:10,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:34:10,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:34:10,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:34:10,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:34:10,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:34:10,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:34:10,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:34:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:34:10,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:34:10,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:34:10,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:34:10,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:34:10,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:34:10,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:34:10,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:34:10,568] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step29000/mp_rank_00_model_states.pt +0: [2023-03-17 01:34:10,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:34:10,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:34:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:34:10,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:34:10,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:34:10,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:34:10,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:34:10,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: successfully saved checkpoint at iteration 29000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.34 +7: iteration 29010/ 60336 | consumed samples: 7426560 | consumed tokens: 15209594880 | elapsed time per iteration (s): 0.18 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.819183E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.215 | TFLOPs: 22.76 | +7: iteration 29020/ 60336 | consumed samples: 7429120 | consumed tokens: 15214837760 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.814948E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.371 | TFLOPs: 26.12 | +7: iteration 29030/ 60336 | consumed samples: 7431680 | consumed tokens: 15220080640 | elapsed time per iteration (s): 0.15 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.813686E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.754 | TFLOPs: 26.11 | +7: iteration 29040/ 60336 | consumed samples: 7434240 | consumed tokens: 15225323520 | elapsed time per iteration (s): 0.15 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.814185E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.265 | TFLOPs: 26.12 | +7: iteration 29050/ 60336 | consumed samples: 7436800 | consumed tokens: 15230566400 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.811892E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.657 | TFLOPs: 26.09 | +7: iteration 29060/ 60336 | consumed samples: 7439360 | consumed tokens: 15235809280 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.809877E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.644 | TFLOPs: 26.09 | +7: iteration 29070/ 60336 | consumed samples: 7441920 | consumed tokens: 15241052160 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.817406E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.252 | TFLOPs: 26.07 | +7: iteration 29080/ 60336 | consumed samples: 7444480 | consumed tokens: 15246295040 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.814132E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.112 | TFLOPs: 26.05 | +7: iteration 29090/ 60336 | consumed samples: 7447040 | consumed tokens: 15251537920 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.818555E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.613 | TFLOPs: 26.09 | +7: iteration 29100/ 60336 | consumed samples: 7449600 | consumed tokens: 15256780800 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.816189E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.097 | TFLOPs: 26.08 | +7: iteration 29110/ 60336 | consumed samples: 7452160 | consumed tokens: 15262023680 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.807171E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.164 | TFLOPs: 26.08 | +7: iteration 29120/ 60336 | consumed samples: 7454720 | consumed tokens: 15267266560 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.806613E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.406 | TFLOPs: 26.09 | +7: iteration 29130/ 60336 | consumed samples: 7457280 | consumed tokens: 15272509440 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.804221E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.710 | TFLOPs: 26.11 | +7: iteration 29140/ 60336 | consumed samples: 7459840 | consumed tokens: 15277752320 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.805405E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.291 | TFLOPs: 26.08 | +7: iteration 29150/ 60336 | consumed samples: 7462400 | consumed tokens: 15282995200 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.812211E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.455 | TFLOPs: 26.09 | +7: iteration 29160/ 60336 | consumed samples: 7464960 | consumed tokens: 15288238080 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.825499E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.155 | TFLOPs: 26.07 | +7: iteration 29170/ 60336 | consumed samples: 7467520 | consumed tokens: 15293480960 | elapsed time per iteration (s): 0.15 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.821502E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.722 | TFLOPs: 26.11 | +7: iteration 29180/ 60336 | consumed samples: 7470080 | consumed tokens: 15298723840 | elapsed time per iteration (s): 0.16 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.809048E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.612 | TFLOPs: 25.70 | +7: iteration 29190/ 60336 | consumed samples: 7472640 | consumed tokens: 15303966720 | elapsed time per iteration (s): 0.15 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.817092E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.212 | TFLOPs: 26.10 | +7: iteration 29200/ 60336 | consumed samples: 7475200 | consumed tokens: 15309209600 | elapsed time per iteration (s): 0.16 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.815833E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.168 | TFLOPs: 25.14 | +7: iteration 29210/ 60336 | consumed samples: 7477760 | consumed tokens: 15314452480 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.820566E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.977 | TFLOPs: 26.10 | +7: iteration 29220/ 60336 | consumed samples: 7480320 | consumed tokens: 15319695360 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.816162E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.062 | TFLOPs: 26.10 | +7: iteration 29230/ 60336 | consumed samples: 7482880 | consumed tokens: 15324938240 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.810873E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.036 | TFLOPs: 26.10 | +7: iteration 29240/ 60336 | consumed samples: 7485440 | consumed tokens: 15330181120 | elapsed time per iteration (s): 0.15 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.813528E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.378 | TFLOPs: 26.07 | +7: iteration 29250/ 60336 | consumed samples: 7488000 | consumed tokens: 15335424000 | elapsed time per iteration (s): 0.15 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.813026E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.892 | TFLOPs: 26.06 | +7: iteration 29260/ 60336 | consumed samples: 7490560 | consumed tokens: 15340666880 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.813589E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.073 | TFLOPs: 26.08 | +7: iteration 29270/ 60336 | consumed samples: 7493120 | consumed tokens: 15345909760 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.816430E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.225 | TFLOPs: 26.10 | +7: iteration 29280/ 60336 | consumed samples: 7495680 | consumed tokens: 15351152640 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.792357E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.667 | TFLOPs: 26.11 | +7: iteration 29290/ 60336 | consumed samples: 7498240 | consumed tokens: 15356395520 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.810532E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.395 | TFLOPs: 26.12 | +7: iteration 29300/ 60336 | consumed samples: 7500800 | consumed tokens: 15361638400 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.812497E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.494 | TFLOPs: 26.09 | +7: iteration 29310/ 60336 | consumed samples: 7503360 | consumed tokens: 15366881280 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.803488E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.052 | TFLOPs: 26.10 | +7: iteration 29320/ 60336 | consumed samples: 7505920 | consumed tokens: 15372124160 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.807505E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.990 | TFLOPs: 26.11 | +7: iteration 29330/ 60336 | consumed samples: 7508480 | consumed tokens: 15377367040 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.814671E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.968 | TFLOPs: 26.11 | +7: iteration 29340/ 60336 | consumed samples: 7511040 | consumed tokens: 15382609920 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.808300E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.821 | TFLOPs: 26.09 | +7: iteration 29350/ 60336 | consumed samples: 7513600 | consumed tokens: 15387852800 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.815448E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.184 | TFLOPs: 26.11 | +7: iteration 29360/ 60336 | consumed samples: 7516160 | consumed tokens: 15393095680 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.813086E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.198 | TFLOPs: 26.10 | +7: iteration 29370/ 60336 | consumed samples: 7518720 | consumed tokens: 15398338560 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.811759E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.429 | TFLOPs: 26.10 | +7: iteration 29380/ 60336 | consumed samples: 7521280 | consumed tokens: 15403581440 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.814536E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.453 | TFLOPs: 26.10 | +7: iteration 29390/ 60336 | consumed samples: 7523840 | consumed tokens: 15408824320 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.818843E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.878 | TFLOPs: 26.09 | +7: iteration 29400/ 60336 | consumed samples: 7526400 | consumed tokens: 15414067200 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.816917E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.774 | TFLOPs: 26.09 | +7: iteration 29410/ 60336 | consumed samples: 7528960 | consumed tokens: 15419310080 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.813949E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.519 | TFLOPs: 26.10 | +7: iteration 29420/ 60336 | consumed samples: 7531520 | consumed tokens: 15424552960 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.809485E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.563 | TFLOPs: 26.09 | +7: iteration 29430/ 60336 | consumed samples: 7534080 | consumed tokens: 15429795840 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.804473E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.563 | TFLOPs: 26.10 | +7: iteration 29440/ 60336 | consumed samples: 7536640 | consumed tokens: 15435038720 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.809222E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.219 | TFLOPs: 26.10 | +7: iteration 29450/ 60336 | consumed samples: 7539200 | consumed tokens: 15440281600 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.805141E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.306 | TFLOPs: 26.10 | +7: iteration 29460/ 60336 | consumed samples: 7541760 | consumed tokens: 15445524480 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.804905E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.507 | TFLOPs: 26.15 | +7: iteration 29470/ 60336 | consumed samples: 7544320 | consumed tokens: 15450767360 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.804835E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.668 | TFLOPs: 26.15 | +7: iteration 29480/ 60336 | consumed samples: 7546880 | consumed tokens: 15456010240 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.810362E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.799 | TFLOPs: 26.16 | +7: iteration 29490/ 60336 | consumed samples: 7549440 | consumed tokens: 15461253120 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.811990E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.475 | TFLOPs: 26.15 | +7: iteration 29500/ 60336 | consumed samples: 7552000 | consumed tokens: 15466496000 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.810937E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.402 | TFLOPs: 26.15 | +7: iteration 29510/ 60336 | consumed samples: 7554560 | consumed tokens: 15471738880 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.807957E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.032 | TFLOPs: 26.16 | +7: iteration 29520/ 60336 | consumed samples: 7557120 | consumed tokens: 15476981760 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.807720E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.025 | TFLOPs: 26.17 | +7: iteration 29530/ 60336 | consumed samples: 7559680 | consumed tokens: 15482224640 | elapsed time per iteration (s): 0.16 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.807985E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.696 | TFLOPs: 25.59 | +7: iteration 29540/ 60336 | consumed samples: 7562240 | consumed tokens: 15487467520 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.818283E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.916 | TFLOPs: 26.16 | +7: iteration 29550/ 60336 | consumed samples: 7564800 | consumed tokens: 15492710400 | elapsed time per iteration (s): 0.16 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.809032E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.330 | TFLOPs: 25.03 | +7: iteration 29560/ 60336 | consumed samples: 7567360 | consumed tokens: 15497953280 | elapsed time per iteration (s): 0.16 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.813293E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.935 | TFLOPs: 25.80 | +7: iteration 29570/ 60336 | consumed samples: 7569920 | consumed tokens: 15503196160 | elapsed time per iteration (s): 0.15 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.809017E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.680 | TFLOPs: 26.17 | +7: iteration 29580/ 60336 | consumed samples: 7572480 | consumed tokens: 15508439040 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.815355E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.743 | TFLOPs: 26.15 | +7: iteration 29590/ 60336 | consumed samples: 7575040 | consumed tokens: 15513681920 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.806388E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.834 | TFLOPs: 26.17 | +7: iteration 29600/ 60336 | consumed samples: 7577600 | consumed tokens: 15518924800 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.819302E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.218 | TFLOPs: 26.15 | +7: iteration 29610/ 60336 | consumed samples: 7580160 | consumed tokens: 15524167680 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.800653E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.712 | TFLOPs: 26.15 | +7: iteration 29620/ 60336 | consumed samples: 7582720 | consumed tokens: 15529410560 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.811184E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.494 | TFLOPs: 26.17 | +7: iteration 29630/ 60336 | consumed samples: 7585280 | consumed tokens: 15534653440 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.803853E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.162 | TFLOPs: 26.15 | +7: iteration 29640/ 60336 | consumed samples: 7587840 | consumed tokens: 15539896320 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.806027E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.249 | TFLOPs: 26.16 | +7: iteration 29650/ 60336 | consumed samples: 7590400 | consumed tokens: 15545139200 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.809624E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.155 | TFLOPs: 26.16 | +7: iteration 29660/ 60336 | consumed samples: 7592960 | consumed tokens: 15550382080 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.799134E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.580 | TFLOPs: 26.18 | +7: iteration 29670/ 60336 | consumed samples: 7595520 | consumed tokens: 15555624960 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.804221E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.742 | TFLOPs: 26.17 | +7: iteration 29680/ 60336 | consumed samples: 7598080 | consumed tokens: 15560867840 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.803047E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.907 | TFLOPs: 26.16 | +7: iteration 29690/ 60336 | consumed samples: 7600640 | consumed tokens: 15566110720 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.819913E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.761 | TFLOPs: 26.15 | +7: iteration 29700/ 60336 | consumed samples: 7603200 | consumed tokens: 15571353600 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.804219E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.864 | TFLOPs: 26.17 | +7: iteration 29710/ 60336 | consumed samples: 7605760 | consumed tokens: 15576596480 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.791544E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.980 | TFLOPs: 26.14 | +7: iteration 29720/ 60336 | consumed samples: 7608320 | consumed tokens: 15581839360 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.801723E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.793 | TFLOPs: 26.01 | +7: iteration 29730/ 60336 | consumed samples: 7610880 | consumed tokens: 15587082240 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.798738E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.884 | TFLOPs: 26.03 | +7: iteration 29740/ 60336 | consumed samples: 7613440 | consumed tokens: 15592325120 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.814688E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.761 | TFLOPs: 26.03 | +7: iteration 29750/ 60336 | consumed samples: 7616000 | consumed tokens: 15597568000 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.813726E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.410 | TFLOPs: 26.04 | +7: iteration 29760/ 60336 | consumed samples: 7618560 | consumed tokens: 15602810880 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.799360E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.015 | TFLOPs: 26.06 | +7: iteration 29770/ 60336 | consumed samples: 7621120 | consumed tokens: 15608053760 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.800124E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.583 | TFLOPs: 26.15 | +7: iteration 29780/ 60336 | consumed samples: 7623680 | consumed tokens: 15613296640 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.818831E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.648 | TFLOPs: 26.15 | +7: iteration 29790/ 60336 | consumed samples: 7626240 | consumed tokens: 15618539520 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.800319E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.084 | TFLOPs: 26.16 | +7: iteration 29800/ 60336 | consumed samples: 7628800 | consumed tokens: 15623782400 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.803257E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.518 | TFLOPs: 26.15 | +7: iteration 29810/ 60336 | consumed samples: 7631360 | consumed tokens: 15629025280 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.794409E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.344 | TFLOPs: 26.12 | +7: iteration 29820/ 60336 | consumed samples: 7633920 | consumed tokens: 15634268160 | elapsed time per iteration (s): 0.16 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.804744E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.364 | TFLOPs: 25.80 | +7: iteration 29830/ 60336 | consumed samples: 7636480 | consumed tokens: 15639511040 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.813504E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.058 | TFLOPs: 26.24 | +7: iteration 29840/ 60336 | consumed samples: 7639040 | consumed tokens: 15644753920 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.820107E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.477 | TFLOPs: 26.21 | +7: iteration 29850/ 60336 | consumed samples: 7641600 | consumed tokens: 15649996800 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.802516E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.529 | TFLOPs: 26.25 | +7: iteration 29860/ 60336 | consumed samples: 7644160 | consumed tokens: 15655239680 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.812096E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.813 | TFLOPs: 26.05 | +7: iteration 29870/ 60336 | consumed samples: 7646720 | consumed tokens: 15660482560 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.796409E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.633 | TFLOPs: 26.23 | +7: iteration 29880/ 60336 | consumed samples: 7649280 | consumed tokens: 15665725440 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.817498E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.468 | TFLOPs: 26.20 | +7: iteration 29890/ 60336 | consumed samples: 7651840 | consumed tokens: 15670968320 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.806745E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.401 | TFLOPs: 26.23 | +7: iteration 29900/ 60336 | consumed samples: 7654400 | consumed tokens: 15676211200 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.809703E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.556 | TFLOPs: 26.23 | +7: iteration 29910/ 60336 | consumed samples: 7656960 | consumed tokens: 15681454080 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.809477E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.027 | TFLOPs: 26.24 | +7: iteration 29920/ 60336 | consumed samples: 7659520 | consumed tokens: 15686696960 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.809149E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.319 | TFLOPs: 26.24 | +7: iteration 29930/ 60336 | consumed samples: 7662080 | consumed tokens: 15691939840 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.811943E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.589 | TFLOPs: 26.21 | +7: iteration 29940/ 60336 | consumed samples: 7664640 | consumed tokens: 15697182720 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.816766E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.918 | TFLOPs: 26.22 | +7: iteration 29950/ 60336 | consumed samples: 7667200 | consumed tokens: 15702425600 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.811491E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.744 | TFLOPs: 26.20 | +7: iteration 29960/ 60336 | consumed samples: 7669760 | consumed tokens: 15707668480 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.806143E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.682 | TFLOPs: 26.23 | +7: iteration 29970/ 60336 | consumed samples: 7672320 | consumed tokens: 15712911360 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.796973E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.903 | TFLOPs: 26.22 | +7: iteration 29980/ 60336 | consumed samples: 7674880 | consumed tokens: 15718154240 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.812354E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.849 | TFLOPs: 26.23 | +7: iteration 29990/ 60336 | consumed samples: 7677440 | consumed tokens: 15723397120 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.796429E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.264 | TFLOPs: 26.23 | +0: [2023-03-17 01:36:44,467] [INFO] [logging.py:68:log_dist] [Rank 0] step=30000, skipped=0, lr=[0.0001122231667715846, 0.0001122231667715846, 0.0001122231667715846], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 30000/ 60336 | consumed samples: 7680000 | consumed tokens: 15728640000 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.813627E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.296 | TFLOPs: 26.21 | +0: steps: 30000 loss: 3.8258 iter time (s): 0.153 samples/sec: 1676.555 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 30000 | lm loss value: 3.927694E+00 | lm loss PPL: 5.078971E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 30000 to checkpoints_44m32b100m +0: [2023-03-17 01:36:44,545] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is begin to save! +0: [2023-03-17 01:36:44,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:36:44,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:36:44,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:36:44,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:36:44,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:36:44,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:36:44,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:36:44,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:36:44,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:36:44,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:36:44,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:36:44,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:36:44,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:36:44,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:36:44,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:36:44,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:36:44,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:36:44,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:36:44,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:36:44,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:36:44,679] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step30000/mp_rank_00_model_states.pt +0: [2023-03-17 01:36:44,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:36:44,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:36:44,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:36:44,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 01:36:44,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:36:44,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:36:44,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:36:44,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:36:44,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: successfully saved checkpoint at iteration 30000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 179.31 +7: iteration 30010/ 60336 | consumed samples: 7682560 | consumed tokens: 15733882880 | elapsed time per iteration (s): 0.18 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.814548E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1429.369 | TFLOPs: 22.42 | +7: iteration 30020/ 60336 | consumed samples: 7685120 | consumed tokens: 15739125760 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.809023E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.858 | TFLOPs: 26.09 | +7: iteration 30030/ 60336 | consumed samples: 7687680 | consumed tokens: 15744368640 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.805859E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.660 | TFLOPs: 26.12 | +7: iteration 30040/ 60336 | consumed samples: 7690240 | consumed tokens: 15749611520 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.805930E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.449 | TFLOPs: 26.10 | +7: iteration 30050/ 60336 | consumed samples: 7692800 | consumed tokens: 15754854400 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.820661E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.749 | TFLOPs: 26.11 | +7: iteration 30060/ 60336 | consumed samples: 7695360 | consumed tokens: 15760097280 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.811897E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.312 | TFLOPs: 26.10 | +7: iteration 30070/ 60336 | consumed samples: 7697920 | consumed tokens: 15765340160 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.804667E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.178 | TFLOPs: 26.11 | +7: iteration 30080/ 60336 | consumed samples: 7700480 | consumed tokens: 15770583040 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.819680E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.381 | TFLOPs: 26.10 | +7: iteration 30090/ 60336 | consumed samples: 7703040 | consumed tokens: 15775825920 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.803376E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.413 | TFLOPs: 26.10 | +7: iteration 30100/ 60336 | consumed samples: 7705600 | consumed tokens: 15781068800 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.792748E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.453 | TFLOPs: 26.10 | +7: iteration 30110/ 60336 | consumed samples: 7708160 | consumed tokens: 15786311680 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.805756E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.857 | TFLOPs: 26.09 | +7: iteration 30120/ 60336 | consumed samples: 7710720 | consumed tokens: 15791554560 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.799672E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.491 | TFLOPs: 26.12 | +7: iteration 30130/ 60336 | consumed samples: 7713280 | consumed tokens: 15796797440 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.818403E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.877 | TFLOPs: 26.11 | +7: iteration 30140/ 60336 | consumed samples: 7715840 | consumed tokens: 15802040320 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.807487E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.124 | TFLOPs: 26.11 | +7: iteration 30150/ 60336 | consumed samples: 7718400 | consumed tokens: 15807283200 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.797474E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.690 | TFLOPs: 26.11 | +7: iteration 30160/ 60336 | consumed samples: 7720960 | consumed tokens: 15812526080 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.803312E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.417 | TFLOPs: 26.13 | +7: iteration 30170/ 60336 | consumed samples: 7723520 | consumed tokens: 15817768960 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.802366E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.745 | TFLOPs: 26.14 | +7: iteration 30180/ 60336 | consumed samples: 7726080 | consumed tokens: 15823011840 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.807576E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.420 | TFLOPs: 26.12 | +7: iteration 30190/ 60336 | consumed samples: 7728640 | consumed tokens: 15828254720 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.810215E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.267 | TFLOPs: 26.08 | +7: iteration 30200/ 60336 | consumed samples: 7731200 | consumed tokens: 15833497600 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.799995E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.601 | TFLOPs: 26.04 | +7: iteration 30210/ 60336 | consumed samples: 7733760 | consumed tokens: 15838740480 | elapsed time per iteration (s): 0.16 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.806627E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.993 | TFLOPs: 25.80 | +7: iteration 30220/ 60336 | consumed samples: 7736320 | consumed tokens: 15843983360 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.818736E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.685 | TFLOPs: 26.04 | +7: iteration 30230/ 60336 | consumed samples: 7738880 | consumed tokens: 15849226240 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.798231E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.319 | TFLOPs: 26.04 | +7: iteration 30240/ 60336 | consumed samples: 7741440 | consumed tokens: 15854469120 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.802295E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.256 | TFLOPs: 26.04 | +7: iteration 30250/ 60336 | consumed samples: 7744000 | consumed tokens: 15859712000 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.805655E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.371 | TFLOPs: 26.04 | +7: iteration 30260/ 60336 | consumed samples: 7746560 | consumed tokens: 15864954880 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.816650E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.725 | TFLOPs: 26.04 | +7: iteration 30270/ 60336 | consumed samples: 7749120 | consumed tokens: 15870197760 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.810471E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.475 | TFLOPs: 26.04 | +7: iteration 30280/ 60336 | consumed samples: 7751680 | consumed tokens: 15875440640 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.812452E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.631 | TFLOPs: 26.04 | +7: iteration 30290/ 60336 | consumed samples: 7754240 | consumed tokens: 15880683520 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.795294E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.202 | TFLOPs: 26.05 | +7: iteration 30300/ 60336 | consumed samples: 7756800 | consumed tokens: 15885926400 | elapsed time per iteration (s): 0.15 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.809953E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.889 | TFLOPs: 26.03 | +7: iteration 30310/ 60336 | consumed samples: 7759360 | consumed tokens: 15891169280 | elapsed time per iteration (s): 0.15 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.809429E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.324 | TFLOPs: 26.01 | +7: iteration 30320/ 60336 | consumed samples: 7761920 | consumed tokens: 15896412160 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.806922E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.924 | TFLOPs: 26.02 | +7: iteration 30330/ 60336 | consumed samples: 7764480 | consumed tokens: 15901655040 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.803603E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.464 | TFLOPs: 26.02 | +7: iteration 30340/ 60336 | consumed samples: 7767040 | consumed tokens: 15906897920 | elapsed time per iteration (s): 0.15 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.799883E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.269 | TFLOPs: 26.01 | +7: iteration 30350/ 60336 | consumed samples: 7769600 | consumed tokens: 15912140800 | elapsed time per iteration (s): 0.15 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.814288E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.086 | TFLOPs: 26.05 | +7: iteration 30360/ 60336 | consumed samples: 7772160 | consumed tokens: 15917383680 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.809516E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.409 | TFLOPs: 26.04 | +7: iteration 30370/ 60336 | consumed samples: 7774720 | consumed tokens: 15922626560 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.809468E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.979 | TFLOPs: 26.02 | +7: iteration 30380/ 60336 | consumed samples: 7777280 | consumed tokens: 15927869440 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.814521E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.774 | TFLOPs: 26.06 | +7: iteration 30390/ 60336 | consumed samples: 7779840 | consumed tokens: 15933112320 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.804687E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.150 | TFLOPs: 26.15 | +7: iteration 30400/ 60336 | consumed samples: 7782400 | consumed tokens: 15938355200 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.804761E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.432 | TFLOPs: 26.15 | +7: iteration 30410/ 60336 | consumed samples: 7784960 | consumed tokens: 15943598080 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.801455E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.404 | TFLOPs: 26.13 | +7: iteration 30420/ 60336 | consumed samples: 7787520 | consumed tokens: 15948840960 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.813160E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.867 | TFLOPs: 26.14 | +7: iteration 30430/ 60336 | consumed samples: 7790080 | consumed tokens: 15954083840 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.825645E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.202 | TFLOPs: 26.15 | +7: iteration 30440/ 60336 | consumed samples: 7792640 | consumed tokens: 15959326720 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.805833E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.486 | TFLOPs: 26.13 | +7: iteration 30450/ 60336 | consumed samples: 7795200 | consumed tokens: 15964569600 | elapsed time per iteration (s): 0.16 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.812210E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.887 | TFLOPs: 25.80 | +7: iteration 30460/ 60336 | consumed samples: 7797760 | consumed tokens: 15969812480 | elapsed time per iteration (s): 0.16 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.805152E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.566 | TFLOPs: 25.78 | +7: iteration 30470/ 60336 | consumed samples: 7800320 | consumed tokens: 15975055360 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.804337E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.151 | TFLOPs: 26.13 | +7: iteration 30480/ 60336 | consumed samples: 7802880 | consumed tokens: 15980298240 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.785620E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.249 | TFLOPs: 26.08 | +7: iteration 30490/ 60336 | consumed samples: 7805440 | consumed tokens: 15985541120 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.824157E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.458 | TFLOPs: 26.07 | +7: iteration 30500/ 60336 | consumed samples: 7808000 | consumed tokens: 15990784000 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.800139E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.613 | TFLOPs: 25.98 | +7: iteration 30510/ 60336 | consumed samples: 7810560 | consumed tokens: 15996026880 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.814372E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.668 | TFLOPs: 26.03 | +7: iteration 30520/ 60336 | consumed samples: 7813120 | consumed tokens: 16001269760 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.817037E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.957 | TFLOPs: 26.03 | +7: iteration 30530/ 60336 | consumed samples: 7815680 | consumed tokens: 16006512640 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.814710E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.814 | TFLOPs: 26.03 | +7: iteration 30540/ 60336 | consumed samples: 7818240 | consumed tokens: 16011755520 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.810181E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.535 | TFLOPs: 26.04 | +7: iteration 30550/ 60336 | consumed samples: 7820800 | consumed tokens: 16016998400 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.796855E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.443 | TFLOPs: 26.02 | +7: iteration 30560/ 60336 | consumed samples: 7823360 | consumed tokens: 16022241280 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.803294E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.871 | TFLOPs: 26.03 | +7: iteration 30570/ 60336 | consumed samples: 7825920 | consumed tokens: 16027484160 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.799271E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.624 | TFLOPs: 26.00 | +7: iteration 30580/ 60336 | consumed samples: 7828480 | consumed tokens: 16032727040 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.801845E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.283 | TFLOPs: 26.02 | +7: iteration 30590/ 60336 | consumed samples: 7831040 | consumed tokens: 16037969920 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.807098E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.957 | TFLOPs: 26.03 | +7: iteration 30600/ 60336 | consumed samples: 7833600 | consumed tokens: 16043212800 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.814674E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.273 | TFLOPs: 25.93 | +7: iteration 30610/ 60336 | consumed samples: 7836160 | consumed tokens: 16048455680 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.812495E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.846 | TFLOPs: 26.05 | +7: iteration 30620/ 60336 | consumed samples: 7838720 | consumed tokens: 16053698560 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.800064E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.174 | TFLOPs: 26.05 | +7: iteration 30630/ 60336 | consumed samples: 7841280 | consumed tokens: 16058941440 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.803037E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.358 | TFLOPs: 26.05 | +7: iteration 30640/ 60336 | consumed samples: 7843840 | consumed tokens: 16064184320 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.793279E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.025 | TFLOPs: 26.05 | +7: iteration 30650/ 60336 | consumed samples: 7846400 | consumed tokens: 16069427200 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.803182E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.494 | TFLOPs: 26.04 | +7: iteration 30660/ 60336 | consumed samples: 7848960 | consumed tokens: 16074670080 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.796182E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.954 | TFLOPs: 26.03 | +7: iteration 30670/ 60336 | consumed samples: 7851520 | consumed tokens: 16079912960 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.822789E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.088 | TFLOPs: 26.05 | +7: iteration 30680/ 60336 | consumed samples: 7854080 | consumed tokens: 16085155840 | elapsed time per iteration (s): 0.15 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.806876E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.633 | TFLOPs: 26.04 | +7: iteration 30690/ 60336 | consumed samples: 7856640 | consumed tokens: 16090398720 | elapsed time per iteration (s): 0.15 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.810790E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.101 | TFLOPs: 26.05 | +7: iteration 30700/ 60336 | consumed samples: 7859200 | consumed tokens: 16095641600 | elapsed time per iteration (s): 0.15 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.784321E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.703 | TFLOPs: 26.04 | +7: iteration 30710/ 60336 | consumed samples: 7861760 | consumed tokens: 16100884480 | elapsed time per iteration (s): 0.15 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.805468E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.229 | TFLOPs: 26.05 | +7: iteration 30720/ 60336 | consumed samples: 7864320 | consumed tokens: 16106127360 | elapsed time per iteration (s): 0.15 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.793734E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.712 | TFLOPs: 26.04 | +7: iteration 30730/ 60336 | consumed samples: 7866880 | consumed tokens: 16111370240 | elapsed time per iteration (s): 0.15 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.803064E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.377 | TFLOPs: 26.02 | +7: iteration 30740/ 60336 | consumed samples: 7869440 | consumed tokens: 16116613120 | elapsed time per iteration (s): 0.15 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.798766E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.975 | TFLOPs: 26.03 | +7: iteration 30750/ 60336 | consumed samples: 7872000 | consumed tokens: 16121856000 | elapsed time per iteration (s): 0.15 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.795265E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.421 | TFLOPs: 26.06 | +7: iteration 30760/ 60336 | consumed samples: 7874560 | consumed tokens: 16127098880 | elapsed time per iteration (s): 0.15 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.815707E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.482 | TFLOPs: 26.07 | +7: iteration 30770/ 60336 | consumed samples: 7877120 | consumed tokens: 16132341760 | elapsed time per iteration (s): 0.15 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.812228E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.437 | TFLOPs: 26.07 | +7: iteration 30780/ 60336 | consumed samples: 7879680 | consumed tokens: 16137584640 | elapsed time per iteration (s): 0.15 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.800013E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.589 | TFLOPs: 26.06 | +7: iteration 30790/ 60336 | consumed samples: 7882240 | consumed tokens: 16142827520 | elapsed time per iteration (s): 0.15 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.789729E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.087 | TFLOPs: 26.05 | +7: iteration 30800/ 60336 | consumed samples: 7884800 | consumed tokens: 16148070400 | elapsed time per iteration (s): 0.15 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.794849E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.440 | TFLOPs: 26.04 | +7: iteration 30810/ 60336 | consumed samples: 7887360 | consumed tokens: 16153313280 | elapsed time per iteration (s): 0.15 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.810862E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.501 | TFLOPs: 26.04 | +7: iteration 30820/ 60336 | consumed samples: 7889920 | consumed tokens: 16158556160 | elapsed time per iteration (s): 0.15 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.812193E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.706 | TFLOPs: 26.03 | +7: iteration 30830/ 60336 | consumed samples: 7892480 | consumed tokens: 16163799040 | elapsed time per iteration (s): 0.15 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.823006E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.478 | TFLOPs: 26.07 | +7: iteration 30840/ 60336 | consumed samples: 7895040 | consumed tokens: 16169041920 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.803312E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.364 | TFLOPs: 26.04 | +7: iteration 30850/ 60336 | consumed samples: 7897600 | consumed tokens: 16174284800 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.798664E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.296 | TFLOPs: 26.05 | +7: iteration 30860/ 60336 | consumed samples: 7900160 | consumed tokens: 16179527680 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.800533E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.276 | TFLOPs: 26.05 | +7: iteration 30870/ 60336 | consumed samples: 7902720 | consumed tokens: 16184770560 | elapsed time per iteration (s): 0.15 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.805806E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.451 | TFLOPs: 26.06 | +7: iteration 30880/ 60336 | consumed samples: 7905280 | consumed tokens: 16190013440 | elapsed time per iteration (s): 0.15 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.819666E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.489 | TFLOPs: 26.06 | +7: iteration 30890/ 60336 | consumed samples: 7907840 | consumed tokens: 16195256320 | elapsed time per iteration (s): 0.15 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.799969E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.815 | TFLOPs: 26.06 | +7: iteration 30900/ 60336 | consumed samples: 7910400 | consumed tokens: 16200499200 | elapsed time per iteration (s): 0.15 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.814138E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.931 | TFLOPs: 26.05 | +7: iteration 30910/ 60336 | consumed samples: 7912960 | consumed tokens: 16205742080 | elapsed time per iteration (s): 0.15 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.812761E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.630 | TFLOPs: 26.04 | +7: iteration 30920/ 60336 | consumed samples: 7915520 | consumed tokens: 16210984960 | elapsed time per iteration (s): 0.15 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.805537E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.344 | TFLOPs: 26.07 | +7: iteration 30930/ 60336 | consumed samples: 7918080 | consumed tokens: 16216227840 | elapsed time per iteration (s): 0.15 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.804331E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.244 | TFLOPs: 26.07 | +7: iteration 30940/ 60336 | consumed samples: 7920640 | consumed tokens: 16221470720 | elapsed time per iteration (s): 0.15 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.800189E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.381 | TFLOPs: 26.05 | +7: iteration 30950/ 60336 | consumed samples: 7923200 | consumed tokens: 16226713600 | elapsed time per iteration (s): 0.15 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.795408E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.516 | TFLOPs: 26.04 | +7: iteration 30960/ 60336 | consumed samples: 7925760 | consumed tokens: 16231956480 | elapsed time per iteration (s): 0.15 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.797987E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.797 | TFLOPs: 26.06 | +7: iteration 30970/ 60336 | consumed samples: 7928320 | consumed tokens: 16237199360 | elapsed time per iteration (s): 0.15 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.788979E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.371 | TFLOPs: 26.05 | +7: iteration 30980/ 60336 | consumed samples: 7930880 | consumed tokens: 16242442240 | elapsed time per iteration (s): 0.15 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.803283E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.428 | TFLOPs: 26.06 | +7: iteration 30990/ 60336 | consumed samples: 7933440 | consumed tokens: 16247685120 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.802625E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.801 | TFLOPs: 26.05 | +7: iteration 31000/ 60336 | consumed samples: 7936000 | consumed tokens: 16252928000 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.801519E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.108 | TFLOPs: 26.05 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 31000 | lm loss value: 3.916873E+00 | lm loss PPL: 5.024310E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 31000 to checkpoints_44m32b100m +0: [2023-03-17 01:39:18,869] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step31000 is begin to save! +0: [2023-03-17 01:39:18,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:39:18,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:39:18,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:39:18,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:39:18,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:39:18,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:39:18,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:39:18,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:39:18,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:39:18,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:39:18,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:39:18,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:39:18,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:39:18,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:39:18,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:39:18,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:39:18,991] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:39:18,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:39:18,999] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:39:18,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:39:19,000] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step31000/mp_rank_00_model_states.pt +0: [2023-03-17 01:39:19,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:39:19,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:39:19,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:39:19,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:39:19,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 01:39:19,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 01:39:19,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:39:19,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:39:19,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:39:19,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:39:19,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: successfully saved checkpoint at iteration 31000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.31 +7: iteration 31010/ 60336 | consumed samples: 7938560 | consumed tokens: 16258170880 | elapsed time per iteration (s): 0.18 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.811294E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.644 | TFLOPs: 22.75 | +7: iteration 31020/ 60336 | consumed samples: 7941120 | consumed tokens: 16263413760 | elapsed time per iteration (s): 0.15 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.797979E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.442 | TFLOPs: 26.06 | +7: iteration 31030/ 60336 | consumed samples: 7943680 | consumed tokens: 16268656640 | elapsed time per iteration (s): 0.15 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.807471E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.302 | TFLOPs: 26.04 | +7: iteration 31040/ 60336 | consumed samples: 7946240 | consumed tokens: 16273899520 | elapsed time per iteration (s): 0.15 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.788283E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.766 | TFLOPs: 26.06 | +7: iteration 31050/ 60336 | consumed samples: 7948800 | consumed tokens: 16279142400 | elapsed time per iteration (s): 0.15 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.796449E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.597 | TFLOPs: 26.04 | +7: iteration 31060/ 60336 | consumed samples: 7951360 | consumed tokens: 16284385280 | elapsed time per iteration (s): 0.15 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.783846E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.367 | TFLOPs: 26.04 | +7: iteration 31070/ 60336 | consumed samples: 7953920 | consumed tokens: 16289628160 | elapsed time per iteration (s): 0.15 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.816719E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.854 | TFLOPs: 26.03 | +7: iteration 31080/ 60336 | consumed samples: 7956480 | consumed tokens: 16294871040 | elapsed time per iteration (s): 0.15 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.816257E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.337 | TFLOPs: 26.05 | +7: iteration 31090/ 60336 | consumed samples: 7959040 | consumed tokens: 16300113920 | elapsed time per iteration (s): 0.15 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.805513E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.997 | TFLOPs: 26.05 | +7: iteration 31100/ 60336 | consumed samples: 7961600 | consumed tokens: 16305356800 | elapsed time per iteration (s): 0.15 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.804901E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.615 | TFLOPs: 26.04 | +7: iteration 31110/ 60336 | consumed samples: 7964160 | consumed tokens: 16310599680 | elapsed time per iteration (s): 0.15 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.806572E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.161 | TFLOPs: 26.05 | +7: iteration 31120/ 60336 | consumed samples: 7966720 | consumed tokens: 16315842560 | elapsed time per iteration (s): 0.15 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.792028E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.351 | TFLOPs: 26.05 | +7: iteration 31130/ 60336 | consumed samples: 7969280 | consumed tokens: 16321085440 | elapsed time per iteration (s): 0.15 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.799903E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.477 | TFLOPs: 26.04 | +7: iteration 31140/ 60336 | consumed samples: 7971840 | consumed tokens: 16326328320 | elapsed time per iteration (s): 0.15 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.803570E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.491 | TFLOPs: 26.09 | +7: iteration 31150/ 60336 | consumed samples: 7974400 | consumed tokens: 16331571200 | elapsed time per iteration (s): 0.15 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.799943E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.446 | TFLOPs: 26.12 | +7: iteration 31160/ 60336 | consumed samples: 7976960 | consumed tokens: 16336814080 | elapsed time per iteration (s): 0.15 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.797038E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.180 | TFLOPs: 26.07 | +7: iteration 31170/ 60336 | consumed samples: 7979520 | consumed tokens: 16342056960 | elapsed time per iteration (s): 0.15 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.791293E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.622 | TFLOPs: 26.07 | +7: iteration 31180/ 60336 | consumed samples: 7982080 | consumed tokens: 16347299840 | elapsed time per iteration (s): 0.15 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.805101E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.417 | TFLOPs: 26.09 | +7: iteration 31190/ 60336 | consumed samples: 7984640 | consumed tokens: 16352542720 | elapsed time per iteration (s): 0.15 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.807142E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.153 | TFLOPs: 26.08 | +7: iteration 31200/ 60336 | consumed samples: 7987200 | consumed tokens: 16357785600 | elapsed time per iteration (s): 0.15 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.821363E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.113 | TFLOPs: 26.08 | +7: iteration 31210/ 60336 | consumed samples: 7989760 | consumed tokens: 16363028480 | elapsed time per iteration (s): 0.15 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.801434E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.055 | TFLOPs: 26.08 | +7: iteration 31220/ 60336 | consumed samples: 7992320 | consumed tokens: 16368271360 | elapsed time per iteration (s): 0.15 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.789228E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.287 | TFLOPs: 26.16 | +7: iteration 31230/ 60336 | consumed samples: 7994880 | consumed tokens: 16373514240 | elapsed time per iteration (s): 0.15 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.795927E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.397 | TFLOPs: 26.16 | +7: iteration 31240/ 60336 | consumed samples: 7997440 | consumed tokens: 16378757120 | elapsed time per iteration (s): 0.15 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.790017E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.196 | TFLOPs: 26.19 | +7: iteration 31250/ 60336 | consumed samples: 8000000 | consumed tokens: 16384000000 | elapsed time per iteration (s): 0.15 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.804889E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.124 | TFLOPs: 26.18 | +7: iteration 31260/ 60336 | consumed samples: 8002560 | consumed tokens: 16389242880 | elapsed time per iteration (s): 0.15 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.816674E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.199 | TFLOPs: 26.16 | +7: iteration 31270/ 60336 | consumed samples: 8005120 | consumed tokens: 16394485760 | elapsed time per iteration (s): 0.15 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.800668E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.021 | TFLOPs: 26.17 | +7: iteration 31280/ 60336 | consumed samples: 8007680 | consumed tokens: 16399728640 | elapsed time per iteration (s): 0.15 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.791147E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.260 | TFLOPs: 26.16 | +7: iteration 31290/ 60336 | consumed samples: 8010240 | consumed tokens: 16404971520 | elapsed time per iteration (s): 0.15 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.797760E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.538 | TFLOPs: 26.18 | +7: iteration 31300/ 60336 | consumed samples: 8012800 | consumed tokens: 16410214400 | elapsed time per iteration (s): 0.15 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.804182E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.852 | TFLOPs: 26.17 | +7: iteration 31310/ 60336 | consumed samples: 8015360 | consumed tokens: 16415457280 | elapsed time per iteration (s): 0.15 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.798195E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.672 | TFLOPs: 26.15 | +7: iteration 31320/ 60336 | consumed samples: 8017920 | consumed tokens: 16420700160 | elapsed time per iteration (s): 0.15 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.808923E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.751 | TFLOPs: 26.15 | +7: iteration 31330/ 60336 | consumed samples: 8020480 | consumed tokens: 16425943040 | elapsed time per iteration (s): 0.15 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.787519E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.742 | TFLOPs: 26.15 | +7: iteration 31340/ 60336 | consumed samples: 8023040 | consumed tokens: 16431185920 | elapsed time per iteration (s): 0.15 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.804971E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.244 | TFLOPs: 26.13 | +7: iteration 31350/ 60336 | consumed samples: 8025600 | consumed tokens: 16436428800 | elapsed time per iteration (s): 0.15 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.799755E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.457 | TFLOPs: 26.17 | +7: iteration 31360/ 60336 | consumed samples: 8028160 | consumed tokens: 16441671680 | elapsed time per iteration (s): 0.15 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.819770E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.660 | TFLOPs: 26.17 | +7: iteration 31370/ 60336 | consumed samples: 8030720 | consumed tokens: 16446914560 | elapsed time per iteration (s): 0.15 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.804581E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.654 | TFLOPs: 26.15 | +7: iteration 31380/ 60336 | consumed samples: 8033280 | consumed tokens: 16452157440 | elapsed time per iteration (s): 0.15 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.792476E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.299 | TFLOPs: 26.16 | +7: iteration 31390/ 60336 | consumed samples: 8035840 | consumed tokens: 16457400320 | elapsed time per iteration (s): 0.15 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.795545E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.022 | TFLOPs: 26.19 | +7: iteration 31400/ 60336 | consumed samples: 8038400 | consumed tokens: 16462643200 | elapsed time per iteration (s): 0.15 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.799471E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.246 | TFLOPs: 26.16 | +7: iteration 31410/ 60336 | consumed samples: 8040960 | consumed tokens: 16467886080 | elapsed time per iteration (s): 0.15 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.799366E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.333 | TFLOPs: 26.20 | +7: iteration 31420/ 60336 | consumed samples: 8043520 | consumed tokens: 16473128960 | elapsed time per iteration (s): 0.15 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.802749E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.194 | TFLOPs: 26.10 | +7: iteration 31430/ 60336 | consumed samples: 8046080 | consumed tokens: 16478371840 | elapsed time per iteration (s): 0.15 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.797068E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.854 | TFLOPs: 26.08 | +7: iteration 31440/ 60336 | consumed samples: 8048640 | consumed tokens: 16483614720 | elapsed time per iteration (s): 0.15 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.797405E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.071 | TFLOPs: 26.08 | +7: iteration 31450/ 60336 | consumed samples: 8051200 | consumed tokens: 16488857600 | elapsed time per iteration (s): 0.15 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.799714E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.821 | TFLOPs: 26.06 | +7: iteration 31460/ 60336 | consumed samples: 8053760 | consumed tokens: 16494100480 | elapsed time per iteration (s): 0.16 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.801127E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.273 | TFLOPs: 25.74 | +7: iteration 31470/ 60336 | consumed samples: 8056320 | consumed tokens: 16499343360 | elapsed time per iteration (s): 0.15 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.795012E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.042 | TFLOPs: 26.10 | +7: iteration 31480/ 60336 | consumed samples: 8058880 | consumed tokens: 16504586240 | elapsed time per iteration (s): 0.15 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.796326E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.312 | TFLOPs: 26.10 | +7: iteration 31490/ 60336 | consumed samples: 8061440 | consumed tokens: 16509829120 | elapsed time per iteration (s): 0.15 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.808074E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.021 | TFLOPs: 26.11 | +7: iteration 31500/ 60336 | consumed samples: 8064000 | consumed tokens: 16515072000 | elapsed time per iteration (s): 0.15 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.804397E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.927 | TFLOPs: 25.97 | +7: iteration 31510/ 60336 | consumed samples: 8066560 | consumed tokens: 16520314880 | elapsed time per iteration (s): 0.15 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.798904E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.296 | TFLOPs: 26.08 | +7: iteration 31520/ 60336 | consumed samples: 8069120 | consumed tokens: 16525557760 | elapsed time per iteration (s): 0.15 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.800616E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.657 | TFLOPs: 26.09 | +7: iteration 31530/ 60336 | consumed samples: 8071680 | consumed tokens: 16530800640 | elapsed time per iteration (s): 0.15 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.795698E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.984 | TFLOPs: 26.10 | +7: iteration 31540/ 60336 | consumed samples: 8074240 | consumed tokens: 16536043520 | elapsed time per iteration (s): 0.15 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.800007E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.204 | TFLOPs: 26.08 | +7: iteration 31550/ 60336 | consumed samples: 8076800 | consumed tokens: 16541286400 | elapsed time per iteration (s): 0.15 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.796798E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.887 | TFLOPs: 26.06 | +7: iteration 31560/ 60336 | consumed samples: 8079360 | consumed tokens: 16546529280 | elapsed time per iteration (s): 0.15 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.809038E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.648 | TFLOPs: 26.06 | +7: iteration 31570/ 60336 | consumed samples: 8081920 | consumed tokens: 16551772160 | elapsed time per iteration (s): 0.15 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.803854E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.662 | TFLOPs: 26.06 | +7: iteration 31580/ 60336 | consumed samples: 8084480 | consumed tokens: 16557015040 | elapsed time per iteration (s): 0.15 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.784605E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.638 | TFLOPs: 26.07 | +7: iteration 31590/ 60336 | consumed samples: 8087040 | consumed tokens: 16562257920 | elapsed time per iteration (s): 0.15 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.801705E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.350 | TFLOPs: 26.05 | +7: iteration 31600/ 60336 | consumed samples: 8089600 | consumed tokens: 16567500800 | elapsed time per iteration (s): 0.15 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.806707E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.842 | TFLOPs: 26.08 | +7: iteration 31610/ 60336 | consumed samples: 8092160 | consumed tokens: 16572743680 | elapsed time per iteration (s): 0.15 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.805684E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.124 | TFLOPs: 26.05 | +7: iteration 31620/ 60336 | consumed samples: 8094720 | consumed tokens: 16577986560 | elapsed time per iteration (s): 0.15 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.802610E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.774 | TFLOPs: 26.06 | +7: iteration 31630/ 60336 | consumed samples: 8097280 | consumed tokens: 16583229440 | elapsed time per iteration (s): 0.15 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.808074E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.451 | TFLOPs: 26.07 | +7: iteration 31640/ 60336 | consumed samples: 8099840 | consumed tokens: 16588472320 | elapsed time per iteration (s): 0.15 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.810934E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.095 | TFLOPs: 26.08 | +7: iteration 31650/ 60336 | consumed samples: 8102400 | consumed tokens: 16593715200 | elapsed time per iteration (s): 0.15 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.802614E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.440 | TFLOPs: 26.09 | +7: iteration 31660/ 60336 | consumed samples: 8104960 | consumed tokens: 16598958080 | elapsed time per iteration (s): 0.15 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.802357E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.222 | TFLOPs: 26.11 | +7: iteration 31670/ 60336 | consumed samples: 8107520 | consumed tokens: 16604200960 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.806653E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.371 | TFLOPs: 26.09 | +7: iteration 31680/ 60336 | consumed samples: 8110080 | consumed tokens: 16609443840 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.801185E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.283 | TFLOPs: 26.10 | +7: iteration 31690/ 60336 | consumed samples: 8112640 | consumed tokens: 16614686720 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.801423E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.269 | TFLOPs: 26.10 | +7: iteration 31700/ 60336 | consumed samples: 8115200 | consumed tokens: 16619929600 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.806721E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.497 | TFLOPs: 26.10 | +7: iteration 31710/ 60336 | consumed samples: 8117760 | consumed tokens: 16625172480 | elapsed time per iteration (s): 0.15 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.798733E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.436 | TFLOPs: 26.10 | +7: iteration 31720/ 60336 | consumed samples: 8120320 | consumed tokens: 16630415360 | elapsed time per iteration (s): 0.15 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.800557E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.903 | TFLOPs: 26.09 | +7: iteration 31730/ 60336 | consumed samples: 8122880 | consumed tokens: 16635658240 | elapsed time per iteration (s): 0.15 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.797108E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.625 | TFLOPs: 26.09 | +7: iteration 31740/ 60336 | consumed samples: 8125440 | consumed tokens: 16640901120 | elapsed time per iteration (s): 0.15 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.786655E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.360 | TFLOPs: 26.07 | +7: iteration 31750/ 60336 | consumed samples: 8128000 | consumed tokens: 16646144000 | elapsed time per iteration (s): 0.16 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.798525E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.105 | TFLOPs: 25.83 | +7: iteration 31760/ 60336 | consumed samples: 8130560 | consumed tokens: 16651386880 | elapsed time per iteration (s): 0.16 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.806126E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.221 | TFLOPs: 25.77 | +7: iteration 31770/ 60336 | consumed samples: 8133120 | consumed tokens: 16656629760 | elapsed time per iteration (s): 0.15 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.807457E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.223 | TFLOPs: 26.24 | +7: iteration 31780/ 60336 | consumed samples: 8135680 | consumed tokens: 16661872640 | elapsed time per iteration (s): 0.15 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.796105E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.916 | TFLOPs: 26.22 | +7: iteration 31790/ 60336 | consumed samples: 8138240 | consumed tokens: 16667115520 | elapsed time per iteration (s): 0.15 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.797307E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.457 | TFLOPs: 26.18 | +7: iteration 31800/ 60336 | consumed samples: 8140800 | consumed tokens: 16672358400 | elapsed time per iteration (s): 0.15 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.793727E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.077 | TFLOPs: 26.14 | +7: iteration 31810/ 60336 | consumed samples: 8143360 | consumed tokens: 16677601280 | elapsed time per iteration (s): 0.15 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.804760E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.095 | TFLOPs: 26.13 | +7: iteration 31820/ 60336 | consumed samples: 8145920 | consumed tokens: 16682844160 | elapsed time per iteration (s): 0.15 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.810119E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.000 | TFLOPs: 26.10 | +7: iteration 31830/ 60336 | consumed samples: 8148480 | consumed tokens: 16688087040 | elapsed time per iteration (s): 0.15 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.809696E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.785 | TFLOPs: 26.14 | +7: iteration 31840/ 60336 | consumed samples: 8151040 | consumed tokens: 16693329920 | elapsed time per iteration (s): 0.15 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.794715E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.795 | TFLOPs: 26.12 | +7: iteration 31850/ 60336 | consumed samples: 8153600 | consumed tokens: 16698572800 | elapsed time per iteration (s): 0.15 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.793381E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.334 | TFLOPs: 26.12 | +7: iteration 31860/ 60336 | consumed samples: 8156160 | consumed tokens: 16703815680 | elapsed time per iteration (s): 0.15 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.799627E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.356 | TFLOPs: 26.12 | +7: iteration 31870/ 60336 | consumed samples: 8158720 | consumed tokens: 16709058560 | elapsed time per iteration (s): 0.15 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.789396E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.362 | TFLOPs: 26.13 | +7: iteration 31880/ 60336 | consumed samples: 8161280 | consumed tokens: 16714301440 | elapsed time per iteration (s): 0.15 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.796183E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.992 | TFLOPs: 26.13 | +7: iteration 31890/ 60336 | consumed samples: 8163840 | consumed tokens: 16719544320 | elapsed time per iteration (s): 0.15 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.808593E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.394 | TFLOPs: 26.12 | +7: iteration 31900/ 60336 | consumed samples: 8166400 | consumed tokens: 16724787200 | elapsed time per iteration (s): 0.15 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.803680E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.267 | TFLOPs: 26.13 | +7: iteration 31910/ 60336 | consumed samples: 8168960 | consumed tokens: 16730030080 | elapsed time per iteration (s): 0.15 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.805153E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.418 | TFLOPs: 26.13 | +7: iteration 31920/ 60336 | consumed samples: 8171520 | consumed tokens: 16735272960 | elapsed time per iteration (s): 0.15 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.805386E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.576 | TFLOPs: 26.10 | +7: iteration 31930/ 60336 | consumed samples: 8174080 | consumed tokens: 16740515840 | elapsed time per iteration (s): 0.15 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.792303E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.498 | TFLOPs: 26.12 | +7: iteration 31940/ 60336 | consumed samples: 8176640 | consumed tokens: 16745758720 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.802265E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.769 | TFLOPs: 26.12 | +7: iteration 31950/ 60336 | consumed samples: 8179200 | consumed tokens: 16751001600 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.791756E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.290 | TFLOPs: 26.13 | +7: iteration 31960/ 60336 | consumed samples: 8181760 | consumed tokens: 16756244480 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.809837E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.448 | TFLOPs: 26.13 | +7: iteration 31970/ 60336 | consumed samples: 8184320 | consumed tokens: 16761487360 | elapsed time per iteration (s): 0.15 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.798832E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.043 | TFLOPs: 26.13 | +7: iteration 31980/ 60336 | consumed samples: 8186880 | consumed tokens: 16766730240 | elapsed time per iteration (s): 0.15 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.783967E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.077 | TFLOPs: 26.11 | +7: iteration 31990/ 60336 | consumed samples: 8189440 | consumed tokens: 16771973120 | elapsed time per iteration (s): 0.15 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.803733E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.840 | TFLOPs: 26.09 | +0: [2023-03-17 01:41:52,882] [INFO] [logging.py:68:log_dist] [Rank 0] step=32000, skipped=0, lr=[0.00010276425973599967, 0.00010276425973599967, 0.00010276425973599967], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 32000/ 60336 | consumed samples: 8192000 | consumed tokens: 16777216000 | elapsed time per iteration (s): 0.15 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.805761E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.621 | TFLOPs: 26.09 | +0: steps: 32000 loss: 3.8078 iter time (s): 0.152 samples/sec: 1679.283 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 32000 | lm loss value: 3.904689E+00 | lm loss PPL: 4.963466E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 32000 to checkpoints_44m32b100m +0: [2023-03-17 01:41:52,962] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step32000 is begin to save! +0: [2023-03-17 01:41:52,965] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:41:53,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:41:53,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:41:53,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:41:53,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:41:53,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:41:53,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:41:53,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:41:53,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:41:53,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:41:53,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:41:53,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:41:53,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:41:53,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:41:53,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:41:53,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:41:53,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:41:53,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:41:53,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:41:53,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:41:53,103] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step32000/mp_rank_00_model_states.pt +0: [2023-03-17 01:41:53,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:41:53,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:53,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:53,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:53,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:53,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:53,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:41:53,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:53,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:53,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: successfully saved checkpoint at iteration 32000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 188.02 +7: iteration 32010/ 60336 | consumed samples: 8194560 | consumed tokens: 16782458880 | elapsed time per iteration (s): 0.18 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.794756E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1422.600 | TFLOPs: 22.31 | +7: iteration 32020/ 60336 | consumed samples: 8197120 | consumed tokens: 16787701760 | elapsed time per iteration (s): 0.15 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.797801E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.182 | TFLOPs: 26.11 | +7: iteration 32030/ 60336 | consumed samples: 8199680 | consumed tokens: 16792944640 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.793695E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.331 | TFLOPs: 26.12 | +7: iteration 32040/ 60336 | consumed samples: 8202240 | consumed tokens: 16798187520 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.805745E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.006 | TFLOPs: 26.10 | +7: iteration 32050/ 60336 | consumed samples: 8204800 | consumed tokens: 16803430400 | elapsed time per iteration (s): 0.15 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.803392E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.322 | TFLOPs: 26.09 | +7: iteration 32060/ 60336 | consumed samples: 8207360 | consumed tokens: 16808673280 | elapsed time per iteration (s): 0.15 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.811380E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.453 | TFLOPs: 26.09 | +7: iteration 32070/ 60336 | consumed samples: 8209920 | consumed tokens: 16813916160 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.800611E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.707 | TFLOPs: 26.11 | +7: iteration 32080/ 60336 | consumed samples: 8212480 | consumed tokens: 16819159040 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.811098E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.220 | TFLOPs: 26.10 | +7: iteration 32090/ 60336 | consumed samples: 8215040 | consumed tokens: 16824401920 | elapsed time per iteration (s): 0.15 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.799739E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.452 | TFLOPs: 26.09 | +7: iteration 32100/ 60336 | consumed samples: 8217600 | consumed tokens: 16829644800 | elapsed time per iteration (s): 0.15 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.813805E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.369 | TFLOPs: 26.10 | +7: iteration 32110/ 60336 | consumed samples: 8220160 | consumed tokens: 16834887680 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.797652E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.054 | TFLOPs: 26.08 | +7: iteration 32120/ 60336 | consumed samples: 8222720 | consumed tokens: 16840130560 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.807731E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.930 | TFLOPs: 26.11 | +7: iteration 32130/ 60336 | consumed samples: 8225280 | consumed tokens: 16845373440 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.789372E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.417 | TFLOPs: 26.10 | +7: iteration 32140/ 60336 | consumed samples: 8227840 | consumed tokens: 16850616320 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.803030E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.632 | TFLOPs: 26.09 | +7: iteration 32150/ 60336 | consumed samples: 8230400 | consumed tokens: 16855859200 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.799372E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.198 | TFLOPs: 26.10 | +7: iteration 32160/ 60336 | consumed samples: 8232960 | consumed tokens: 16861102080 | elapsed time per iteration (s): 0.15 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.806363E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.484 | TFLOPs: 26.04 | +7: iteration 32170/ 60336 | consumed samples: 8235520 | consumed tokens: 16866344960 | elapsed time per iteration (s): 0.15 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.799139E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.775 | TFLOPs: 26.06 | +7: iteration 32180/ 60336 | consumed samples: 8238080 | consumed tokens: 16871587840 | elapsed time per iteration (s): 0.15 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.796915E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.168 | TFLOPs: 26.05 | +7: iteration 32190/ 60336 | consumed samples: 8240640 | consumed tokens: 16876830720 | elapsed time per iteration (s): 0.15 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.798489E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.799 | TFLOPs: 26.05 | +7: iteration 32200/ 60336 | consumed samples: 8243200 | consumed tokens: 16882073600 | elapsed time per iteration (s): 0.16 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.806772E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.294 | TFLOPs: 25.83 | +7: iteration 32210/ 60336 | consumed samples: 8245760 | consumed tokens: 16887316480 | elapsed time per iteration (s): 0.15 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.803781E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.759 | TFLOPs: 25.90 | +7: iteration 32220/ 60336 | consumed samples: 8248320 | consumed tokens: 16892559360 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.800470E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.448 | TFLOPs: 26.01 | +7: iteration 32230/ 60336 | consumed samples: 8250880 | consumed tokens: 16897802240 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.810496E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.137 | TFLOPs: 26.00 | +7: iteration 32240/ 60336 | consumed samples: 8253440 | consumed tokens: 16903045120 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.794098E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.547 | TFLOPs: 26.01 | +7: iteration 32250/ 60336 | consumed samples: 8256000 | consumed tokens: 16908288000 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.794427E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.802 | TFLOPs: 26.03 | +7: iteration 32260/ 60336 | consumed samples: 8258560 | consumed tokens: 16913530880 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.785400E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.924 | TFLOPs: 26.03 | +7: iteration 32270/ 60336 | consumed samples: 8261120 | consumed tokens: 16918773760 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.784712E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.419 | TFLOPs: 26.06 | +7: iteration 32280/ 60336 | consumed samples: 8263680 | consumed tokens: 16924016640 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.796972E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.929 | TFLOPs: 26.05 | +7: iteration 32290/ 60336 | consumed samples: 8266240 | consumed tokens: 16929259520 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.798742E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.768 | TFLOPs: 26.03 | +7: iteration 32300/ 60336 | consumed samples: 8268800 | consumed tokens: 16934502400 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.791321E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.489 | TFLOPs: 26.04 | +7: iteration 32310/ 60336 | consumed samples: 8271360 | consumed tokens: 16939745280 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.805890E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.756 | TFLOPs: 26.09 | +7: iteration 32320/ 60336 | consumed samples: 8273920 | consumed tokens: 16944988160 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.800174E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.245 | TFLOPs: 26.10 | +7: iteration 32330/ 60336 | consumed samples: 8276480 | consumed tokens: 16950231040 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.798156E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.894 | TFLOPs: 26.13 | +7: iteration 32340/ 60336 | consumed samples: 8279040 | consumed tokens: 16955473920 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.814881E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.021 | TFLOPs: 26.11 | +7: iteration 32350/ 60336 | consumed samples: 8281600 | consumed tokens: 16960716800 | elapsed time per iteration (s): 0.15 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.801139E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.995 | TFLOPs: 26.16 | +7: iteration 32360/ 60336 | consumed samples: 8284160 | consumed tokens: 16965959680 | elapsed time per iteration (s): 0.15 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.799559E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.967 | TFLOPs: 26.11 | +7: iteration 32370/ 60336 | consumed samples: 8286720 | consumed tokens: 16971202560 | elapsed time per iteration (s): 0.15 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.797486E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.959 | TFLOPs: 26.13 | +7: iteration 32380/ 60336 | consumed samples: 8289280 | consumed tokens: 16976445440 | elapsed time per iteration (s): 0.15 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.798781E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.113 | TFLOPs: 26.14 | +7: iteration 32390/ 60336 | consumed samples: 8291840 | consumed tokens: 16981688320 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.797793E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.565 | TFLOPs: 26.12 | +7: iteration 32400/ 60336 | consumed samples: 8294400 | consumed tokens: 16986931200 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.799459E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.631 | TFLOPs: 26.14 | +7: iteration 32410/ 60336 | consumed samples: 8296960 | consumed tokens: 16992174080 | elapsed time per iteration (s): 0.16 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.815104E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.503 | TFLOPs: 25.13 | +7: iteration 32420/ 60336 | consumed samples: 8299520 | consumed tokens: 16997416960 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.797575E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.263 | TFLOPs: 26.16 | +7: iteration 32430/ 60336 | consumed samples: 8302080 | consumed tokens: 17002659840 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.785413E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.089 | TFLOPs: 26.11 | +7: iteration 32440/ 60336 | consumed samples: 8304640 | consumed tokens: 17007902720 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.807104E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.131 | TFLOPs: 26.10 | +7: iteration 32450/ 60336 | consumed samples: 8307200 | consumed tokens: 17013145600 | elapsed time per iteration (s): 0.15 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.806335E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.456 | TFLOPs: 26.13 | +7: iteration 32460/ 60336 | consumed samples: 8309760 | consumed tokens: 17018388480 | elapsed time per iteration (s): 0.15 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.787857E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.872 | TFLOPs: 26.13 | +7: iteration 32470/ 60336 | consumed samples: 8312320 | consumed tokens: 17023631360 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.791073E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.974 | TFLOPs: 26.13 | +7: iteration 32480/ 60336 | consumed samples: 8314880 | consumed tokens: 17028874240 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.799937E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.730 | TFLOPs: 26.14 | +7: iteration 32490/ 60336 | consumed samples: 8317440 | consumed tokens: 17034117120 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.803302E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.000 | TFLOPs: 26.14 | +7: iteration 32500/ 60336 | consumed samples: 8320000 | consumed tokens: 17039360000 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.807333E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.775 | TFLOPs: 26.15 | +7: iteration 32510/ 60336 | consumed samples: 8322560 | consumed tokens: 17044602880 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.794004E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.045 | TFLOPs: 26.14 | +7: iteration 32520/ 60336 | consumed samples: 8325120 | consumed tokens: 17049845760 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.797624E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.951 | TFLOPs: 26.13 | +7: iteration 32530/ 60336 | consumed samples: 8327680 | consumed tokens: 17055088640 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.812996E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.453 | TFLOPs: 26.09 | +7: iteration 32540/ 60336 | consumed samples: 8330240 | consumed tokens: 17060331520 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.795221E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.737 | TFLOPs: 26.11 | +7: iteration 32550/ 60336 | consumed samples: 8332800 | consumed tokens: 17065574400 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.803389E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.989 | TFLOPs: 26.10 | +7: iteration 32560/ 60336 | consumed samples: 8335360 | consumed tokens: 17070817280 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.780905E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.043 | TFLOPs: 26.10 | +7: iteration 32570/ 60336 | consumed samples: 8337920 | consumed tokens: 17076060160 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.807102E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.824 | TFLOPs: 26.09 | +7: iteration 32580/ 60336 | consumed samples: 8340480 | consumed tokens: 17081303040 | elapsed time per iteration (s): 0.15 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.796378E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.210 | TFLOPs: 26.10 | +7: iteration 32590/ 60336 | consumed samples: 8343040 | consumed tokens: 17086545920 | elapsed time per iteration (s): 0.15 | learning rate: 9.998E-05 | global batch size: 256 | lm loss: 3.807767E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.918 | TFLOPs: 26.09 | +7: iteration 32600/ 60336 | consumed samples: 8345600 | consumed tokens: 17091788800 | elapsed time per iteration (s): 0.15 | learning rate: 9.994E-05 | global batch size: 256 | lm loss: 3.798874E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.141 | TFLOPs: 26.08 | +7: iteration 32610/ 60336 | consumed samples: 8348160 | consumed tokens: 17097031680 | elapsed time per iteration (s): 0.15 | learning rate: 9.989E-05 | global batch size: 256 | lm loss: 3.792181E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.644 | TFLOPs: 26.09 | +7: iteration 32620/ 60336 | consumed samples: 8350720 | consumed tokens: 17102274560 | elapsed time per iteration (s): 0.16 | learning rate: 9.984E-05 | global batch size: 256 | lm loss: 3.804708E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.487 | TFLOPs: 25.52 | +7: iteration 32630/ 60336 | consumed samples: 8353280 | consumed tokens: 17107517440 | elapsed time per iteration (s): 0.15 | learning rate: 9.980E-05 | global batch size: 256 | lm loss: 3.803936E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.431 | TFLOPs: 26.10 | +7: iteration 32640/ 60336 | consumed samples: 8355840 | consumed tokens: 17112760320 | elapsed time per iteration (s): 0.15 | learning rate: 9.975E-05 | global batch size: 256 | lm loss: 3.793760E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.589 | TFLOPs: 26.10 | +7: iteration 32650/ 60336 | consumed samples: 8358400 | consumed tokens: 17118003200 | elapsed time per iteration (s): 0.15 | learning rate: 9.970E-05 | global batch size: 256 | lm loss: 3.789341E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.474 | TFLOPs: 26.07 | +7: iteration 32660/ 60336 | consumed samples: 8360960 | consumed tokens: 17123246080 | elapsed time per iteration (s): 0.15 | learning rate: 9.966E-05 | global batch size: 256 | lm loss: 3.805856E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.435 | TFLOPs: 26.09 | +7: iteration 32670/ 60336 | consumed samples: 8363520 | consumed tokens: 17128488960 | elapsed time per iteration (s): 0.15 | learning rate: 9.961E-05 | global batch size: 256 | lm loss: 3.793247E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.534 | TFLOPs: 26.10 | +7: iteration 32680/ 60336 | consumed samples: 8366080 | consumed tokens: 17133731840 | elapsed time per iteration (s): 0.15 | learning rate: 9.956E-05 | global batch size: 256 | lm loss: 3.796100E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.841 | TFLOPs: 26.08 | +7: iteration 32690/ 60336 | consumed samples: 8368640 | consumed tokens: 17138974720 | elapsed time per iteration (s): 0.15 | learning rate: 9.951E-05 | global batch size: 256 | lm loss: 3.780848E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.178 | TFLOPs: 26.10 | +7: iteration 32700/ 60336 | consumed samples: 8371200 | consumed tokens: 17144217600 | elapsed time per iteration (s): 0.15 | learning rate: 9.947E-05 | global batch size: 256 | lm loss: 3.789048E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.641 | TFLOPs: 26.11 | +7: iteration 32710/ 60336 | consumed samples: 8373760 | consumed tokens: 17149460480 | elapsed time per iteration (s): 0.15 | learning rate: 9.942E-05 | global batch size: 256 | lm loss: 3.798318E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.307 | TFLOPs: 26.10 | +7: iteration 32720/ 60336 | consumed samples: 8376320 | consumed tokens: 17154703360 | elapsed time per iteration (s): 0.15 | learning rate: 9.937E-05 | global batch size: 256 | lm loss: 3.801471E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.597 | TFLOPs: 26.12 | +7: iteration 32730/ 60336 | consumed samples: 8378880 | consumed tokens: 17159946240 | elapsed time per iteration (s): 0.15 | learning rate: 9.933E-05 | global batch size: 256 | lm loss: 3.790312E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.918 | TFLOPs: 26.08 | +7: iteration 32740/ 60336 | consumed samples: 8381440 | consumed tokens: 17165189120 | elapsed time per iteration (s): 0.15 | learning rate: 9.928E-05 | global batch size: 256 | lm loss: 3.790726E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.307 | TFLOPs: 26.08 | +7: iteration 32750/ 60336 | consumed samples: 8384000 | consumed tokens: 17170432000 | elapsed time per iteration (s): 0.15 | learning rate: 9.923E-05 | global batch size: 256 | lm loss: 3.797543E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.423 | TFLOPs: 26.10 | +7: iteration 32760/ 60336 | consumed samples: 8386560 | consumed tokens: 17175674880 | elapsed time per iteration (s): 0.15 | learning rate: 9.919E-05 | global batch size: 256 | lm loss: 3.782092E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.363 | TFLOPs: 26.07 | +7: iteration 32770/ 60336 | consumed samples: 8389120 | consumed tokens: 17180917760 | elapsed time per iteration (s): 0.15 | learning rate: 9.914E-05 | global batch size: 256 | lm loss: 3.803106E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.015 | TFLOPs: 26.11 | +7: iteration 32780/ 60336 | consumed samples: 8391680 | consumed tokens: 17186160640 | elapsed time per iteration (s): 0.15 | learning rate: 9.909E-05 | global batch size: 256 | lm loss: 3.799430E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.227 | TFLOPs: 26.13 | +7: iteration 32790/ 60336 | consumed samples: 8394240 | consumed tokens: 17191403520 | elapsed time per iteration (s): 0.15 | learning rate: 9.904E-05 | global batch size: 256 | lm loss: 3.801757E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.910 | TFLOPs: 26.11 | +7: iteration 32800/ 60336 | consumed samples: 8396800 | consumed tokens: 17196646400 | elapsed time per iteration (s): 0.15 | learning rate: 9.900E-05 | global batch size: 256 | lm loss: 3.801181E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.191 | TFLOPs: 26.11 | +7: iteration 32810/ 60336 | consumed samples: 8399360 | consumed tokens: 17201889280 | elapsed time per iteration (s): 0.15 | learning rate: 9.895E-05 | global batch size: 256 | lm loss: 3.785743E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.533 | TFLOPs: 26.12 | +7: iteration 32820/ 60336 | consumed samples: 8401920 | consumed tokens: 17207132160 | elapsed time per iteration (s): 0.15 | learning rate: 9.890E-05 | global batch size: 256 | lm loss: 3.799262E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.657 | TFLOPs: 26.09 | +7: iteration 32830/ 60336 | consumed samples: 8404480 | consumed tokens: 17212375040 | elapsed time per iteration (s): 0.15 | learning rate: 9.886E-05 | global batch size: 256 | lm loss: 3.776377E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.898 | TFLOPs: 26.11 | +7: iteration 32840/ 60336 | consumed samples: 8407040 | consumed tokens: 17217617920 | elapsed time per iteration (s): 0.15 | learning rate: 9.881E-05 | global batch size: 256 | lm loss: 3.797832E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.867 | TFLOPs: 26.09 | +7: iteration 32850/ 60336 | consumed samples: 8409600 | consumed tokens: 17222860800 | elapsed time per iteration (s): 0.15 | learning rate: 9.876E-05 | global batch size: 256 | lm loss: 3.803908E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.353 | TFLOPs: 26.13 | +7: iteration 32860/ 60336 | consumed samples: 8412160 | consumed tokens: 17228103680 | elapsed time per iteration (s): 0.15 | learning rate: 9.872E-05 | global batch size: 256 | lm loss: 3.789124E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.391 | TFLOPs: 26.10 | +7: iteration 32870/ 60336 | consumed samples: 8414720 | consumed tokens: 17233346560 | elapsed time per iteration (s): 0.15 | learning rate: 9.867E-05 | global batch size: 256 | lm loss: 3.792886E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.532 | TFLOPs: 26.03 | +7: iteration 32880/ 60336 | consumed samples: 8417280 | consumed tokens: 17238589440 | elapsed time per iteration (s): 0.15 | learning rate: 9.862E-05 | global batch size: 256 | lm loss: 3.796862E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.726 | TFLOPs: 26.03 | +7: iteration 32890/ 60336 | consumed samples: 8419840 | consumed tokens: 17243832320 | elapsed time per iteration (s): 0.16 | learning rate: 9.857E-05 | global batch size: 256 | lm loss: 3.789291E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.876 | TFLOPs: 25.70 | +7: iteration 32900/ 60336 | consumed samples: 8422400 | consumed tokens: 17249075200 | elapsed time per iteration (s): 0.15 | learning rate: 9.853E-05 | global batch size: 256 | lm loss: 3.795001E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.583 | TFLOPs: 26.06 | +7: iteration 32910/ 60336 | consumed samples: 8424960 | consumed tokens: 17254318080 | elapsed time per iteration (s): 0.16 | learning rate: 9.848E-05 | global batch size: 256 | lm loss: 3.783116E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.918 | TFLOPs: 25.47 | +7: iteration 32920/ 60336 | consumed samples: 8427520 | consumed tokens: 17259560960 | elapsed time per iteration (s): 0.15 | learning rate: 9.843E-05 | global batch size: 256 | lm loss: 3.801706E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.024 | TFLOPs: 26.03 | +7: iteration 32930/ 60336 | consumed samples: 8430080 | consumed tokens: 17264803840 | elapsed time per iteration (s): 0.15 | learning rate: 9.839E-05 | global batch size: 256 | lm loss: 3.809457E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.571 | TFLOPs: 26.04 | +7: iteration 32940/ 60336 | consumed samples: 8432640 | consumed tokens: 17270046720 | elapsed time per iteration (s): 0.15 | learning rate: 9.834E-05 | global batch size: 256 | lm loss: 3.808220E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.783 | TFLOPs: 26.05 | +7: iteration 32950/ 60336 | consumed samples: 8435200 | consumed tokens: 17275289600 | elapsed time per iteration (s): 0.15 | learning rate: 9.829E-05 | global batch size: 256 | lm loss: 3.796205E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.556 | TFLOPs: 26.06 | +7: iteration 32960/ 60336 | consumed samples: 8437760 | consumed tokens: 17280532480 | elapsed time per iteration (s): 0.15 | learning rate: 9.825E-05 | global batch size: 256 | lm loss: 3.789119E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.872 | TFLOPs: 26.02 | +7: iteration 32970/ 60336 | consumed samples: 8440320 | consumed tokens: 17285775360 | elapsed time per iteration (s): 0.15 | learning rate: 9.820E-05 | global batch size: 256 | lm loss: 3.803095E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.616 | TFLOPs: 26.03 | +7: iteration 32980/ 60336 | consumed samples: 8442880 | consumed tokens: 17291018240 | elapsed time per iteration (s): 0.15 | learning rate: 9.815E-05 | global batch size: 256 | lm loss: 3.794775E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.169 | TFLOPs: 26.02 | +7: iteration 32990/ 60336 | consumed samples: 8445440 | consumed tokens: 17296261120 | elapsed time per iteration (s): 0.15 | learning rate: 9.811E-05 | global batch size: 256 | lm loss: 3.798745E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.490 | TFLOPs: 26.02 | +7: iteration 33000/ 60336 | consumed samples: 8448000 | consumed tokens: 17301504000 | elapsed time per iteration (s): 0.15 | learning rate: 9.806E-05 | global batch size: 256 | lm loss: 3.799598E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.047 | TFLOPs: 26.02 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 33000 | lm loss value: 3.910256E+00 | lm loss PPL: 4.991174E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 33000 to checkpoints_44m32b100m +0: [2023-03-17 01:44:27,269] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step33000 is begin to save! +0: [2023-03-17 01:44:27,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:44:27,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:44:27,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:44:27,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:44:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:44:27,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:44:27,354] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:44:27,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:44:27,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:44:27,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:44:27,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:44:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:44:27,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:44:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:44:27,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:44:27,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:44:27,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:44:27,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:44:27,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:44:27,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:44:27,404] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step33000/mp_rank_00_model_states.pt +0: [2023-03-17 01:44:27,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:44:27,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:44:27,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:44:27,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 01:44:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:44:27,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:44:27,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:44:27,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: successfully saved checkpoint at iteration 33000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 182.42 +7: iteration 33010/ 60336 | consumed samples: 8450560 | consumed tokens: 17306746880 | elapsed time per iteration (s): 0.18 | learning rate: 9.801E-05 | global batch size: 256 | lm loss: 3.792005E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1444.443 | TFLOPs: 22.65 | +7: iteration 33020/ 60336 | consumed samples: 8453120 | consumed tokens: 17311989760 | elapsed time per iteration (s): 0.15 | learning rate: 9.796E-05 | global batch size: 256 | lm loss: 3.795638E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.241 | TFLOPs: 26.04 | +7: iteration 33030/ 60336 | consumed samples: 8455680 | consumed tokens: 17317232640 | elapsed time per iteration (s): 0.15 | learning rate: 9.792E-05 | global batch size: 256 | lm loss: 3.803293E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.686 | TFLOPs: 26.03 | +7: iteration 33040/ 60336 | consumed samples: 8458240 | consumed tokens: 17322475520 | elapsed time per iteration (s): 0.15 | learning rate: 9.787E-05 | global batch size: 256 | lm loss: 3.804332E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.540 | TFLOPs: 26.01 | +7: iteration 33050/ 60336 | consumed samples: 8460800 | consumed tokens: 17327718400 | elapsed time per iteration (s): 0.15 | learning rate: 9.782E-05 | global batch size: 256 | lm loss: 3.800270E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.387 | TFLOPs: 26.04 | +7: iteration 33060/ 60336 | consumed samples: 8463360 | consumed tokens: 17332961280 | elapsed time per iteration (s): 0.16 | learning rate: 9.778E-05 | global batch size: 256 | lm loss: 3.794054E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.501 | TFLOPs: 25.40 | +7: iteration 33070/ 60336 | consumed samples: 8465920 | consumed tokens: 17338204160 | elapsed time per iteration (s): 0.15 | learning rate: 9.773E-05 | global batch size: 256 | lm loss: 3.803309E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.651 | TFLOPs: 26.01 | +7: iteration 33080/ 60336 | consumed samples: 8468480 | consumed tokens: 17343447040 | elapsed time per iteration (s): 0.15 | learning rate: 9.768E-05 | global batch size: 256 | lm loss: 3.792123E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.754 | TFLOPs: 26.04 | +7: iteration 33090/ 60336 | consumed samples: 8471040 | consumed tokens: 17348689920 | elapsed time per iteration (s): 0.15 | learning rate: 9.764E-05 | global batch size: 256 | lm loss: 3.796091E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.456 | TFLOPs: 26.04 | +7: iteration 33100/ 60336 | consumed samples: 8473600 | consumed tokens: 17353932800 | elapsed time per iteration (s): 0.15 | learning rate: 9.759E-05 | global batch size: 256 | lm loss: 3.811589E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.576 | TFLOPs: 26.07 | +7: iteration 33110/ 60336 | consumed samples: 8476160 | consumed tokens: 17359175680 | elapsed time per iteration (s): 0.15 | learning rate: 9.754E-05 | global batch size: 256 | lm loss: 3.801690E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.080 | TFLOPs: 26.05 | +7: iteration 33120/ 60336 | consumed samples: 8478720 | consumed tokens: 17364418560 | elapsed time per iteration (s): 0.15 | learning rate: 9.750E-05 | global batch size: 256 | lm loss: 3.799477E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.140 | TFLOPs: 26.05 | +7: iteration 33130/ 60336 | consumed samples: 8481280 | consumed tokens: 17369661440 | elapsed time per iteration (s): 0.15 | learning rate: 9.745E-05 | global batch size: 256 | lm loss: 3.789845E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.804 | TFLOPs: 26.06 | +7: iteration 33140/ 60336 | consumed samples: 8483840 | consumed tokens: 17374904320 | elapsed time per iteration (s): 0.15 | learning rate: 9.740E-05 | global batch size: 256 | lm loss: 3.795231E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.175 | TFLOPs: 26.05 | +7: iteration 33150/ 60336 | consumed samples: 8486400 | consumed tokens: 17380147200 | elapsed time per iteration (s): 0.15 | learning rate: 9.735E-05 | global batch size: 256 | lm loss: 3.799095E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.264 | TFLOPs: 26.05 | +7: iteration 33160/ 60336 | consumed samples: 8488960 | consumed tokens: 17385390080 | elapsed time per iteration (s): 0.15 | learning rate: 9.731E-05 | global batch size: 256 | lm loss: 3.811964E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.812 | TFLOPs: 26.05 | +7: iteration 33170/ 60336 | consumed samples: 8491520 | consumed tokens: 17390632960 | elapsed time per iteration (s): 0.16 | learning rate: 9.726E-05 | global batch size: 256 | lm loss: 3.787471E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.466 | TFLOPs: 25.71 | +7: iteration 33180/ 60336 | consumed samples: 8494080 | consumed tokens: 17395875840 | elapsed time per iteration (s): 0.15 | learning rate: 9.721E-05 | global batch size: 256 | lm loss: 3.780369E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.390 | TFLOPs: 26.07 | +7: iteration 33190/ 60336 | consumed samples: 8496640 | consumed tokens: 17401118720 | elapsed time per iteration (s): 0.15 | learning rate: 9.717E-05 | global batch size: 256 | lm loss: 3.793959E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.675 | TFLOPs: 26.06 | +7: iteration 33200/ 60336 | consumed samples: 8499200 | consumed tokens: 17406361600 | elapsed time per iteration (s): 0.16 | learning rate: 9.712E-05 | global batch size: 256 | lm loss: 3.801842E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.787 | TFLOPs: 25.45 | +7: iteration 33210/ 60336 | consumed samples: 8501760 | consumed tokens: 17411604480 | elapsed time per iteration (s): 0.15 | learning rate: 9.707E-05 | global batch size: 256 | lm loss: 3.804791E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.713 | TFLOPs: 26.12 | +7: iteration 33220/ 60336 | consumed samples: 8504320 | consumed tokens: 17416847360 | elapsed time per iteration (s): 0.15 | learning rate: 9.703E-05 | global batch size: 256 | lm loss: 3.792744E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.909 | TFLOPs: 26.11 | +7: iteration 33230/ 60336 | consumed samples: 8506880 | consumed tokens: 17422090240 | elapsed time per iteration (s): 0.15 | learning rate: 9.698E-05 | global batch size: 256 | lm loss: 3.800419E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.276 | TFLOPs: 25.96 | +7: iteration 33240/ 60336 | consumed samples: 8509440 | consumed tokens: 17427333120 | elapsed time per iteration (s): 0.15 | learning rate: 9.693E-05 | global batch size: 256 | lm loss: 3.808028E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.102 | TFLOPs: 25.99 | +7: iteration 33250/ 60336 | consumed samples: 8512000 | consumed tokens: 17432576000 | elapsed time per iteration (s): 0.15 | learning rate: 9.689E-05 | global batch size: 256 | lm loss: 3.794002E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.161 | TFLOPs: 26.08 | +7: iteration 33260/ 60336 | consumed samples: 8514560 | consumed tokens: 17437818880 | elapsed time per iteration (s): 0.15 | learning rate: 9.684E-05 | global batch size: 256 | lm loss: 3.793027E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.059 | TFLOPs: 26.08 | +7: iteration 33270/ 60336 | consumed samples: 8517120 | consumed tokens: 17443061760 | elapsed time per iteration (s): 0.16 | learning rate: 9.679E-05 | global batch size: 256 | lm loss: 3.795246E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.566 | TFLOPs: 25.68 | +7: iteration 33280/ 60336 | consumed samples: 8519680 | consumed tokens: 17448304640 | elapsed time per iteration (s): 0.15 | learning rate: 9.675E-05 | global batch size: 256 | lm loss: 3.790996E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.268 | TFLOPs: 26.12 | +7: iteration 33290/ 60336 | consumed samples: 8522240 | consumed tokens: 17453547520 | elapsed time per iteration (s): 0.15 | learning rate: 9.670E-05 | global batch size: 256 | lm loss: 3.803152E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.372 | TFLOPs: 26.12 | +7: iteration 33300/ 60336 | consumed samples: 8524800 | consumed tokens: 17458790400 | elapsed time per iteration (s): 0.15 | learning rate: 9.665E-05 | global batch size: 256 | lm loss: 3.788395E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.383 | TFLOPs: 26.10 | +7: iteration 33310/ 60336 | consumed samples: 8527360 | consumed tokens: 17464033280 | elapsed time per iteration (s): 0.15 | learning rate: 9.661E-05 | global batch size: 256 | lm loss: 3.798519E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.178 | TFLOPs: 26.10 | +7: iteration 33320/ 60336 | consumed samples: 8529920 | consumed tokens: 17469276160 | elapsed time per iteration (s): 0.15 | learning rate: 9.656E-05 | global batch size: 256 | lm loss: 3.798178E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.909 | TFLOPs: 26.11 | +7: iteration 33330/ 60336 | consumed samples: 8532480 | consumed tokens: 17474519040 | elapsed time per iteration (s): 0.15 | learning rate: 9.651E-05 | global batch size: 256 | lm loss: 3.790417E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.600 | TFLOPs: 26.12 | +7: iteration 33340/ 60336 | consumed samples: 8535040 | consumed tokens: 17479761920 | elapsed time per iteration (s): 0.15 | learning rate: 9.647E-05 | global batch size: 256 | lm loss: 3.791994E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.336 | TFLOPs: 26.12 | +7: iteration 33350/ 60336 | consumed samples: 8537600 | consumed tokens: 17485004800 | elapsed time per iteration (s): 0.16 | learning rate: 9.642E-05 | global batch size: 256 | lm loss: 3.791001E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.885 | TFLOPs: 25.67 | +7: iteration 33360/ 60336 | consumed samples: 8540160 | consumed tokens: 17490247680 | elapsed time per iteration (s): 0.15 | learning rate: 9.637E-05 | global batch size: 256 | lm loss: 3.801254E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.269 | TFLOPs: 26.10 | +7: iteration 33370/ 60336 | consumed samples: 8542720 | consumed tokens: 17495490560 | elapsed time per iteration (s): 0.15 | learning rate: 9.632E-05 | global batch size: 256 | lm loss: 3.796184E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.969 | TFLOPs: 26.10 | +7: iteration 33380/ 60336 | consumed samples: 8545280 | consumed tokens: 17500733440 | elapsed time per iteration (s): 0.15 | learning rate: 9.628E-05 | global batch size: 256 | lm loss: 3.791063E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.726 | TFLOPs: 26.12 | +7: iteration 33390/ 60336 | consumed samples: 8547840 | consumed tokens: 17505976320 | elapsed time per iteration (s): 0.15 | learning rate: 9.623E-05 | global batch size: 256 | lm loss: 3.793638E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.197 | TFLOPs: 26.11 | +7: iteration 33400/ 60336 | consumed samples: 8550400 | consumed tokens: 17511219200 | elapsed time per iteration (s): 0.15 | learning rate: 9.618E-05 | global batch size: 256 | lm loss: 3.794770E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.503 | TFLOPs: 26.04 | +7: iteration 33410/ 60336 | consumed samples: 8552960 | consumed tokens: 17516462080 | elapsed time per iteration (s): 0.15 | learning rate: 9.614E-05 | global batch size: 256 | lm loss: 3.800904E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.792 | TFLOPs: 26.06 | +7: iteration 33420/ 60336 | consumed samples: 8555520 | consumed tokens: 17521704960 | elapsed time per iteration (s): 0.15 | learning rate: 9.609E-05 | global batch size: 256 | lm loss: 3.788930E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.308 | TFLOPs: 26.04 | +7: iteration 33430/ 60336 | consumed samples: 8558080 | consumed tokens: 17526947840 | elapsed time per iteration (s): 0.15 | learning rate: 9.604E-05 | global batch size: 256 | lm loss: 3.792508E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.452 | TFLOPs: 26.02 | +7: iteration 33440/ 60336 | consumed samples: 8560640 | consumed tokens: 17532190720 | elapsed time per iteration (s): 0.15 | learning rate: 9.600E-05 | global batch size: 256 | lm loss: 3.807485E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.344 | TFLOPs: 26.05 | +7: iteration 33450/ 60336 | consumed samples: 8563200 | consumed tokens: 17537433600 | elapsed time per iteration (s): 0.15 | learning rate: 9.595E-05 | global batch size: 256 | lm loss: 3.788369E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.085 | TFLOPs: 26.07 | +7: iteration 33460/ 60336 | consumed samples: 8565760 | consumed tokens: 17542676480 | elapsed time per iteration (s): 0.15 | learning rate: 9.590E-05 | global batch size: 256 | lm loss: 3.803593E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.434 | TFLOPs: 26.07 | +7: iteration 33470/ 60336 | consumed samples: 8568320 | consumed tokens: 17547919360 | elapsed time per iteration (s): 0.15 | learning rate: 9.586E-05 | global batch size: 256 | lm loss: 3.787662E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.176 | TFLOPs: 26.05 | +7: iteration 33480/ 60336 | consumed samples: 8570880 | consumed tokens: 17553162240 | elapsed time per iteration (s): 0.15 | learning rate: 9.581E-05 | global batch size: 256 | lm loss: 3.797213E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.037 | TFLOPs: 26.06 | +7: iteration 33490/ 60336 | consumed samples: 8573440 | consumed tokens: 17558405120 | elapsed time per iteration (s): 0.15 | learning rate: 9.576E-05 | global batch size: 256 | lm loss: 3.783453E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.781 | TFLOPs: 26.06 | +7: iteration 33500/ 60336 | consumed samples: 8576000 | consumed tokens: 17563648000 | elapsed time per iteration (s): 0.15 | learning rate: 9.572E-05 | global batch size: 256 | lm loss: 3.790072E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.097 | TFLOPs: 26.07 | +7: iteration 33510/ 60336 | consumed samples: 8578560 | consumed tokens: 17568890880 | elapsed time per iteration (s): 0.15 | learning rate: 9.567E-05 | global batch size: 256 | lm loss: 3.784920E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.667 | TFLOPs: 26.06 | +7: iteration 33520/ 60336 | consumed samples: 8581120 | consumed tokens: 17574133760 | elapsed time per iteration (s): 0.15 | learning rate: 9.562E-05 | global batch size: 256 | lm loss: 3.795169E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.128 | TFLOPs: 26.07 | +7: iteration 33530/ 60336 | consumed samples: 8583680 | consumed tokens: 17579376640 | elapsed time per iteration (s): 0.15 | learning rate: 9.558E-05 | global batch size: 256 | lm loss: 3.801218E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.390 | TFLOPs: 26.05 | +7: iteration 33540/ 60336 | consumed samples: 8586240 | consumed tokens: 17584619520 | elapsed time per iteration (s): 0.15 | learning rate: 9.553E-05 | global batch size: 256 | lm loss: 3.800452E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.771 | TFLOPs: 26.06 | +7: iteration 33550/ 60336 | consumed samples: 8588800 | consumed tokens: 17589862400 | elapsed time per iteration (s): 0.15 | learning rate: 9.548E-05 | global batch size: 256 | lm loss: 3.791997E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.803 | TFLOPs: 26.06 | +7: iteration 33560/ 60336 | consumed samples: 8591360 | consumed tokens: 17595105280 | elapsed time per iteration (s): 0.15 | learning rate: 9.544E-05 | global batch size: 256 | lm loss: 3.783387E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.586 | TFLOPs: 26.07 | +7: iteration 33570/ 60336 | consumed samples: 8593920 | consumed tokens: 17600348160 | elapsed time per iteration (s): 0.15 | learning rate: 9.539E-05 | global batch size: 256 | lm loss: 3.789985E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.124 | TFLOPs: 26.05 | +7: iteration 33580/ 60336 | consumed samples: 8596480 | consumed tokens: 17605591040 | elapsed time per iteration (s): 0.15 | learning rate: 9.534E-05 | global batch size: 256 | lm loss: 3.796028E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.967 | TFLOPs: 26.08 | +7: iteration 33590/ 60336 | consumed samples: 8599040 | consumed tokens: 17610833920 | elapsed time per iteration (s): 0.15 | learning rate: 9.530E-05 | global batch size: 256 | lm loss: 3.786102E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.676 | TFLOPs: 26.06 | +7: iteration 33600/ 60336 | consumed samples: 8601600 | consumed tokens: 17616076800 | elapsed time per iteration (s): 0.15 | learning rate: 9.525E-05 | global batch size: 256 | lm loss: 3.783908E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.288 | TFLOPs: 26.05 | +7: iteration 33610/ 60336 | consumed samples: 8604160 | consumed tokens: 17621319680 | elapsed time per iteration (s): 0.15 | learning rate: 9.520E-05 | global batch size: 256 | lm loss: 3.793228E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.245 | TFLOPs: 26.05 | +7: iteration 33620/ 60336 | consumed samples: 8606720 | consumed tokens: 17626562560 | elapsed time per iteration (s): 0.15 | learning rate: 9.516E-05 | global batch size: 256 | lm loss: 3.799134E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.388 | TFLOPs: 26.05 | +7: iteration 33630/ 60336 | consumed samples: 8609280 | consumed tokens: 17631805440 | elapsed time per iteration (s): 0.15 | learning rate: 9.511E-05 | global batch size: 256 | lm loss: 3.784492E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.998 | TFLOPs: 26.06 | +7: iteration 33640/ 60336 | consumed samples: 8611840 | consumed tokens: 17637048320 | elapsed time per iteration (s): 0.15 | learning rate: 9.506E-05 | global batch size: 256 | lm loss: 3.788691E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.498 | TFLOPs: 26.04 | +7: iteration 33650/ 60336 | consumed samples: 8614400 | consumed tokens: 17642291200 | elapsed time per iteration (s): 0.15 | learning rate: 9.502E-05 | global batch size: 256 | lm loss: 3.805111E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.152 | TFLOPs: 26.05 | +7: iteration 33660/ 60336 | consumed samples: 8616960 | consumed tokens: 17647534080 | elapsed time per iteration (s): 0.15 | learning rate: 9.497E-05 | global batch size: 256 | lm loss: 3.802267E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.428 | TFLOPs: 26.07 | +7: iteration 33670/ 60336 | consumed samples: 8619520 | consumed tokens: 17652776960 | elapsed time per iteration (s): 0.15 | learning rate: 9.492E-05 | global batch size: 256 | lm loss: 3.782553E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.392 | TFLOPs: 26.05 | +7: iteration 33680/ 60336 | consumed samples: 8622080 | consumed tokens: 17658019840 | elapsed time per iteration (s): 0.15 | learning rate: 9.488E-05 | global batch size: 256 | lm loss: 3.795633E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.557 | TFLOPs: 26.07 | +7: iteration 33690/ 60336 | consumed samples: 8624640 | consumed tokens: 17663262720 | elapsed time per iteration (s): 0.15 | learning rate: 9.483E-05 | global batch size: 256 | lm loss: 3.804173E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.909 | TFLOPs: 26.06 | +7: iteration 33700/ 60336 | consumed samples: 8627200 | consumed tokens: 17668505600 | elapsed time per iteration (s): 0.15 | learning rate: 9.478E-05 | global batch size: 256 | lm loss: 3.800241E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.186 | TFLOPs: 26.07 | +7: iteration 33710/ 60336 | consumed samples: 8629760 | consumed tokens: 17673748480 | elapsed time per iteration (s): 0.16 | learning rate: 9.474E-05 | global batch size: 256 | lm loss: 3.792330E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.034 | TFLOPs: 25.55 | +7: iteration 33720/ 60336 | consumed samples: 8632320 | consumed tokens: 17678991360 | elapsed time per iteration (s): 0.15 | learning rate: 9.469E-05 | global batch size: 256 | lm loss: 3.795482E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.973 | TFLOPs: 26.08 | +7: iteration 33730/ 60336 | consumed samples: 8634880 | consumed tokens: 17684234240 | elapsed time per iteration (s): 0.15 | learning rate: 9.464E-05 | global batch size: 256 | lm loss: 3.792899E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.769 | TFLOPs: 26.08 | +7: iteration 33740/ 60336 | consumed samples: 8637440 | consumed tokens: 17689477120 | elapsed time per iteration (s): 0.15 | learning rate: 9.460E-05 | global batch size: 256 | lm loss: 3.797859E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.898 | TFLOPs: 26.08 | +7: iteration 33750/ 60336 | consumed samples: 8640000 | consumed tokens: 17694720000 | elapsed time per iteration (s): 0.15 | learning rate: 9.455E-05 | global batch size: 256 | lm loss: 3.794967E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.851 | TFLOPs: 26.08 | +7: iteration 33760/ 60336 | consumed samples: 8642560 | consumed tokens: 17699962880 | elapsed time per iteration (s): 0.16 | learning rate: 9.450E-05 | global batch size: 256 | lm loss: 3.792241E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.221 | TFLOPs: 25.86 | +7: iteration 33770/ 60336 | consumed samples: 8645120 | consumed tokens: 17705205760 | elapsed time per iteration (s): 0.15 | learning rate: 9.446E-05 | global batch size: 256 | lm loss: 3.799298E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.844 | TFLOPs: 26.06 | +7: iteration 33780/ 60336 | consumed samples: 8647680 | consumed tokens: 17710448640 | elapsed time per iteration (s): 0.15 | learning rate: 9.441E-05 | global batch size: 256 | lm loss: 3.795028E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.456 | TFLOPs: 26.06 | +7: iteration 33790/ 60336 | consumed samples: 8650240 | consumed tokens: 17715691520 | elapsed time per iteration (s): 0.15 | learning rate: 9.436E-05 | global batch size: 256 | lm loss: 3.801133E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.318 | TFLOPs: 26.05 | +7: iteration 33800/ 60336 | consumed samples: 8652800 | consumed tokens: 17720934400 | elapsed time per iteration (s): 0.15 | learning rate: 9.432E-05 | global batch size: 256 | lm loss: 3.782990E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.167 | TFLOPs: 26.08 | +7: iteration 33810/ 60336 | consumed samples: 8655360 | consumed tokens: 17726177280 | elapsed time per iteration (s): 0.15 | learning rate: 9.427E-05 | global batch size: 256 | lm loss: 3.778099E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.843 | TFLOPs: 26.09 | +7: iteration 33820/ 60336 | consumed samples: 8657920 | consumed tokens: 17731420160 | elapsed time per iteration (s): 0.16 | learning rate: 9.422E-05 | global batch size: 256 | lm loss: 3.793902E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.512 | TFLOPs: 25.46 | +7: iteration 33830/ 60336 | consumed samples: 8660480 | consumed tokens: 17736663040 | elapsed time per iteration (s): 0.15 | learning rate: 9.418E-05 | global batch size: 256 | lm loss: 3.779309E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.745 | TFLOPs: 26.08 | +7: iteration 33840/ 60336 | consumed samples: 8663040 | consumed tokens: 17741905920 | elapsed time per iteration (s): 0.15 | learning rate: 9.413E-05 | global batch size: 256 | lm loss: 3.788765E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.513 | TFLOPs: 26.09 | +7: iteration 33850/ 60336 | consumed samples: 8665600 | consumed tokens: 17747148800 | elapsed time per iteration (s): 0.15 | learning rate: 9.408E-05 | global batch size: 256 | lm loss: 3.788358E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.307 | TFLOPs: 26.10 | +7: iteration 33860/ 60336 | consumed samples: 8668160 | consumed tokens: 17752391680 | elapsed time per iteration (s): 0.15 | learning rate: 9.404E-05 | global batch size: 256 | lm loss: 3.782988E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.173 | TFLOPs: 26.07 | +7: iteration 33870/ 60336 | consumed samples: 8670720 | consumed tokens: 17757634560 | elapsed time per iteration (s): 0.15 | learning rate: 9.399E-05 | global batch size: 256 | lm loss: 3.784161E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.793 | TFLOPs: 26.09 | +7: iteration 33880/ 60336 | consumed samples: 8673280 | consumed tokens: 17762877440 | elapsed time per iteration (s): 0.15 | learning rate: 9.394E-05 | global batch size: 256 | lm loss: 3.818329E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.444 | TFLOPs: 25.96 | +7: iteration 33890/ 60336 | consumed samples: 8675840 | consumed tokens: 17768120320 | elapsed time per iteration (s): 0.16 | learning rate: 9.390E-05 | global batch size: 256 | lm loss: 3.802158E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.406 | TFLOPs: 25.90 | +7: iteration 33900/ 60336 | consumed samples: 8678400 | consumed tokens: 17773363200 | elapsed time per iteration (s): 0.15 | learning rate: 9.385E-05 | global batch size: 256 | lm loss: 3.800444E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.779 | TFLOPs: 26.01 | +7: iteration 33910/ 60336 | consumed samples: 8680960 | consumed tokens: 17778606080 | elapsed time per iteration (s): 0.15 | learning rate: 9.380E-05 | global batch size: 256 | lm loss: 3.800264E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.375 | TFLOPs: 26.04 | +7: iteration 33920/ 60336 | consumed samples: 8683520 | consumed tokens: 17783848960 | elapsed time per iteration (s): 0.15 | learning rate: 9.376E-05 | global batch size: 256 | lm loss: 3.790684E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.306 | TFLOPs: 26.01 | +7: iteration 33930/ 60336 | consumed samples: 8686080 | consumed tokens: 17789091840 | elapsed time per iteration (s): 0.15 | learning rate: 9.371E-05 | global batch size: 256 | lm loss: 3.793188E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.834 | TFLOPs: 26.00 | +7: iteration 33940/ 60336 | consumed samples: 8688640 | consumed tokens: 17794334720 | elapsed time per iteration (s): 0.15 | learning rate: 9.366E-05 | global batch size: 256 | lm loss: 3.785798E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.440 | TFLOPs: 25.99 | +7: iteration 33950/ 60336 | consumed samples: 8691200 | consumed tokens: 17799577600 | elapsed time per iteration (s): 0.15 | learning rate: 9.362E-05 | global batch size: 256 | lm loss: 3.802885E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.057 | TFLOPs: 25.99 | +7: iteration 33960/ 60336 | consumed samples: 8693760 | consumed tokens: 17804820480 | elapsed time per iteration (s): 0.15 | learning rate: 9.357E-05 | global batch size: 256 | lm loss: 3.799922E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.843 | TFLOPs: 25.98 | +7: iteration 33970/ 60336 | consumed samples: 8696320 | consumed tokens: 17810063360 | elapsed time per iteration (s): 0.15 | learning rate: 9.352E-05 | global batch size: 256 | lm loss: 3.778291E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.360 | TFLOPs: 26.01 | +7: iteration 33980/ 60336 | consumed samples: 8698880 | consumed tokens: 17815306240 | elapsed time per iteration (s): 0.15 | learning rate: 9.348E-05 | global batch size: 256 | lm loss: 3.790988E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.800 | TFLOPs: 26.03 | +7: iteration 33990/ 60336 | consumed samples: 8701440 | consumed tokens: 17820549120 | elapsed time per iteration (s): 0.16 | learning rate: 9.343E-05 | global batch size: 256 | lm loss: 3.789101E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.554 | TFLOPs: 25.63 | +0: [2023-03-17 01:47:01,773] [INFO] [logging.py:68:log_dist] [Rank 0] step=34000, skipped=0, lr=[9.33853393895039e-05, 9.33853393895039e-05, 9.33853393895039e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 34000/ 60336 | consumed samples: 8704000 | consumed tokens: 17825792000 | elapsed time per iteration (s): 0.15 | learning rate: 9.339E-05 | global batch size: 256 | lm loss: 3.792840E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.755 | TFLOPs: 26.00 | +0: steps: 34000 loss: 3.7828 iter time (s): 0.153 samples/sec: 1676.734 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 34000 | lm loss value: 3.944648E+00 | lm loss PPL: 5.165814E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 34000 to checkpoints_44m32b100m +0: [2023-03-17 01:47:01,844] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step34000 is begin to save! +0: [2023-03-17 01:47:01,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:47:01,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:47:01,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:47:01,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:47:01,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:47:01,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:47:01,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:47:01,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:47:01,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:47:01,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:47:01,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:47:01,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:47:01,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:47:01,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:47:01,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:47:01,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:47:01,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:47:01,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:47:01,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:47:01,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:47:01,976] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step34000/mp_rank_00_model_states.pt +0: [2023-03-17 01:47:01,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:47:01,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:47:01,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:47:02,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 01:47:02,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:47:02,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:47:02,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:47:02,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:47:02,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 01:47:02,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:47:02,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:47:02,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:47:02,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 01:47:02,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 01:47:02,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:47:02,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: successfully saved checkpoint at iteration 34000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.34 +7: iteration 34010/ 60336 | consumed samples: 8706560 | consumed tokens: 17831034880 | elapsed time per iteration (s): 0.18 | learning rate: 9.334E-05 | global batch size: 256 | lm loss: 3.800368E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.940 | TFLOPs: 22.47 | +7: iteration 34020/ 60336 | consumed samples: 8709120 | consumed tokens: 17836277760 | elapsed time per iteration (s): 0.15 | learning rate: 9.329E-05 | global batch size: 256 | lm loss: 3.795329E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.155 | TFLOPs: 26.02 | +7: iteration 34030/ 60336 | consumed samples: 8711680 | consumed tokens: 17841520640 | elapsed time per iteration (s): 0.15 | learning rate: 9.325E-05 | global batch size: 256 | lm loss: 3.792027E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.127 | TFLOPs: 26.02 | +7: iteration 34040/ 60336 | consumed samples: 8714240 | consumed tokens: 17846763520 | elapsed time per iteration (s): 0.15 | learning rate: 9.320E-05 | global batch size: 256 | lm loss: 3.775537E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.874 | TFLOPs: 26.02 | +7: iteration 34050/ 60336 | consumed samples: 8716800 | consumed tokens: 17852006400 | elapsed time per iteration (s): 0.15 | learning rate: 9.315E-05 | global batch size: 256 | lm loss: 3.790171E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.143 | TFLOPs: 26.00 | +7: iteration 34060/ 60336 | consumed samples: 8719360 | consumed tokens: 17857249280 | elapsed time per iteration (s): 0.15 | learning rate: 9.311E-05 | global batch size: 256 | lm loss: 3.791072E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.618 | TFLOPs: 26.01 | +7: iteration 34070/ 60336 | consumed samples: 8721920 | consumed tokens: 17862492160 | elapsed time per iteration (s): 0.15 | learning rate: 9.306E-05 | global batch size: 256 | lm loss: 3.789700E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.762 | TFLOPs: 26.03 | +7: iteration 34080/ 60336 | consumed samples: 8724480 | consumed tokens: 17867735040 | elapsed time per iteration (s): 0.15 | learning rate: 9.301E-05 | global batch size: 256 | lm loss: 3.804824E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.736 | TFLOPs: 26.03 | +7: iteration 34090/ 60336 | consumed samples: 8727040 | consumed tokens: 17872977920 | elapsed time per iteration (s): 0.15 | learning rate: 9.297E-05 | global batch size: 256 | lm loss: 3.791920E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.348 | TFLOPs: 26.02 | +7: iteration 34100/ 60336 | consumed samples: 8729600 | consumed tokens: 17878220800 | elapsed time per iteration (s): 0.15 | learning rate: 9.292E-05 | global batch size: 256 | lm loss: 3.793094E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.824 | TFLOPs: 26.03 | +7: iteration 34110/ 60336 | consumed samples: 8732160 | consumed tokens: 17883463680 | elapsed time per iteration (s): 0.15 | learning rate: 9.287E-05 | global batch size: 256 | lm loss: 3.789628E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.412 | TFLOPs: 26.01 | +7: iteration 34120/ 60336 | consumed samples: 8734720 | consumed tokens: 17888706560 | elapsed time per iteration (s): 0.15 | learning rate: 9.283E-05 | global batch size: 256 | lm loss: 3.797585E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.988 | TFLOPs: 26.02 | +7: iteration 34130/ 60336 | consumed samples: 8737280 | consumed tokens: 17893949440 | elapsed time per iteration (s): 0.15 | learning rate: 9.278E-05 | global batch size: 256 | lm loss: 3.802317E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.429 | TFLOPs: 26.02 | +7: iteration 34140/ 60336 | consumed samples: 8739840 | consumed tokens: 17899192320 | elapsed time per iteration (s): 0.15 | learning rate: 9.273E-05 | global batch size: 256 | lm loss: 3.791276E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.167 | TFLOPs: 26.02 | +7: iteration 34150/ 60336 | consumed samples: 8742400 | consumed tokens: 17904435200 | elapsed time per iteration (s): 0.15 | learning rate: 9.269E-05 | global batch size: 256 | lm loss: 3.788171E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.314 | TFLOPs: 26.02 | +7: iteration 34160/ 60336 | consumed samples: 8744960 | consumed tokens: 17909678080 | elapsed time per iteration (s): 0.15 | learning rate: 9.264E-05 | global batch size: 256 | lm loss: 3.789276E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.179 | TFLOPs: 26.04 | +7: iteration 34170/ 60336 | consumed samples: 8747520 | consumed tokens: 17914920960 | elapsed time per iteration (s): 0.15 | learning rate: 9.260E-05 | global batch size: 256 | lm loss: 3.800647E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.180 | TFLOPs: 26.02 | +7: iteration 34180/ 60336 | consumed samples: 8750080 | consumed tokens: 17920163840 | elapsed time per iteration (s): 0.15 | learning rate: 9.255E-05 | global batch size: 256 | lm loss: 3.796454E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.734 | TFLOPs: 26.03 | +7: iteration 34190/ 60336 | consumed samples: 8752640 | consumed tokens: 17925406720 | elapsed time per iteration (s): 0.15 | learning rate: 9.250E-05 | global batch size: 256 | lm loss: 3.786806E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.848 | TFLOPs: 26.01 | +7: iteration 34200/ 60336 | consumed samples: 8755200 | consumed tokens: 17930649600 | elapsed time per iteration (s): 0.15 | learning rate: 9.246E-05 | global batch size: 256 | lm loss: 3.792585E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.712 | TFLOPs: 26.04 | +7: iteration 34210/ 60336 | consumed samples: 8757760 | consumed tokens: 17935892480 | elapsed time per iteration (s): 0.15 | learning rate: 9.241E-05 | global batch size: 256 | lm loss: 3.796206E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.691 | TFLOPs: 26.01 | +7: iteration 34220/ 60336 | consumed samples: 8760320 | consumed tokens: 17941135360 | elapsed time per iteration (s): 0.15 | learning rate: 9.236E-05 | global batch size: 256 | lm loss: 3.796828E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.884 | TFLOPs: 26.00 | +7: iteration 34230/ 60336 | consumed samples: 8762880 | consumed tokens: 17946378240 | elapsed time per iteration (s): 0.16 | learning rate: 9.232E-05 | global batch size: 256 | lm loss: 3.796719E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.736 | TFLOPs: 25.65 | +7: iteration 34240/ 60336 | consumed samples: 8765440 | consumed tokens: 17951621120 | elapsed time per iteration (s): 0.15 | learning rate: 9.227E-05 | global batch size: 256 | lm loss: 3.793668E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.230 | TFLOPs: 26.04 | +7: iteration 34250/ 60336 | consumed samples: 8768000 | consumed tokens: 17956864000 | elapsed time per iteration (s): 0.15 | learning rate: 9.222E-05 | global batch size: 256 | lm loss: 3.802628E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.769 | TFLOPs: 26.03 | +7: iteration 34260/ 60336 | consumed samples: 8770560 | consumed tokens: 17962106880 | elapsed time per iteration (s): 0.15 | learning rate: 9.218E-05 | global batch size: 256 | lm loss: 3.798066E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.627 | TFLOPs: 26.04 | +7: iteration 34270/ 60336 | consumed samples: 8773120 | consumed tokens: 17967349760 | elapsed time per iteration (s): 0.15 | learning rate: 9.213E-05 | global batch size: 256 | lm loss: 3.791534E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.456 | TFLOPs: 26.04 | +7: iteration 34280/ 60336 | consumed samples: 8775680 | consumed tokens: 17972592640 | elapsed time per iteration (s): 0.15 | learning rate: 9.208E-05 | global batch size: 256 | lm loss: 3.778614E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.786 | TFLOPs: 26.00 | +7: iteration 34290/ 60336 | consumed samples: 8778240 | consumed tokens: 17977835520 | elapsed time per iteration (s): 0.15 | learning rate: 9.204E-05 | global batch size: 256 | lm loss: 3.791800E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.978 | TFLOPs: 26.03 | +7: iteration 34300/ 60336 | consumed samples: 8780800 | consumed tokens: 17983078400 | elapsed time per iteration (s): 0.15 | learning rate: 9.199E-05 | global batch size: 256 | lm loss: 3.792999E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.924 | TFLOPs: 26.05 | +7: iteration 34310/ 60336 | consumed samples: 8783360 | consumed tokens: 17988321280 | elapsed time per iteration (s): 0.15 | learning rate: 9.195E-05 | global batch size: 256 | lm loss: 3.791274E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.613 | TFLOPs: 26.04 | +7: iteration 34320/ 60336 | consumed samples: 8785920 | consumed tokens: 17993564160 | elapsed time per iteration (s): 0.15 | learning rate: 9.190E-05 | global batch size: 256 | lm loss: 3.804754E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.365 | TFLOPs: 26.04 | +7: iteration 34330/ 60336 | consumed samples: 8788480 | consumed tokens: 17998807040 | elapsed time per iteration (s): 0.15 | learning rate: 9.185E-05 | global batch size: 256 | lm loss: 3.790160E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.562 | TFLOPs: 26.04 | +7: iteration 34340/ 60336 | consumed samples: 8791040 | consumed tokens: 18004049920 | elapsed time per iteration (s): 0.16 | learning rate: 9.181E-05 | global batch size: 256 | lm loss: 3.782615E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.781 | TFLOPs: 25.40 | +7: iteration 34350/ 60336 | consumed samples: 8793600 | consumed tokens: 18009292800 | elapsed time per iteration (s): 0.15 | learning rate: 9.176E-05 | global batch size: 256 | lm loss: 3.790813E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.366 | TFLOPs: 26.01 | +7: iteration 34360/ 60336 | consumed samples: 8796160 | consumed tokens: 18014535680 | elapsed time per iteration (s): 0.16 | learning rate: 9.171E-05 | global batch size: 256 | lm loss: 3.776778E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.994 | TFLOPs: 25.59 | +7: iteration 34370/ 60336 | consumed samples: 8798720 | consumed tokens: 18019778560 | elapsed time per iteration (s): 0.15 | learning rate: 9.167E-05 | global batch size: 256 | lm loss: 3.785613E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.511 | TFLOPs: 25.98 | +7: iteration 34380/ 60336 | consumed samples: 8801280 | consumed tokens: 18025021440 | elapsed time per iteration (s): 0.15 | learning rate: 9.162E-05 | global batch size: 256 | lm loss: 3.802019E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.747 | TFLOPs: 26.00 | +7: iteration 34390/ 60336 | consumed samples: 8803840 | consumed tokens: 18030264320 | elapsed time per iteration (s): 0.15 | learning rate: 9.157E-05 | global batch size: 256 | lm loss: 3.803994E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.725 | TFLOPs: 26.03 | +7: iteration 34400/ 60336 | consumed samples: 8806400 | consumed tokens: 18035507200 | elapsed time per iteration (s): 0.15 | learning rate: 9.153E-05 | global batch size: 256 | lm loss: 3.776179E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.435 | TFLOPs: 26.01 | +7: iteration 34410/ 60336 | consumed samples: 8808960 | consumed tokens: 18040750080 | elapsed time per iteration (s): 0.15 | learning rate: 9.148E-05 | global batch size: 256 | lm loss: 3.787280E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.054 | TFLOPs: 26.00 | +7: iteration 34420/ 60336 | consumed samples: 8811520 | consumed tokens: 18045992960 | elapsed time per iteration (s): 0.15 | learning rate: 9.144E-05 | global batch size: 256 | lm loss: 3.797733E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.136 | TFLOPs: 26.00 | +7: iteration 34430/ 60336 | consumed samples: 8814080 | consumed tokens: 18051235840 | elapsed time per iteration (s): 0.15 | learning rate: 9.139E-05 | global batch size: 256 | lm loss: 3.779713E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.638 | TFLOPs: 25.98 | +7: iteration 34440/ 60336 | consumed samples: 8816640 | consumed tokens: 18056478720 | elapsed time per iteration (s): 0.15 | learning rate: 9.134E-05 | global batch size: 256 | lm loss: 3.778260E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.633 | TFLOPs: 26.01 | +7: iteration 34450/ 60336 | consumed samples: 8819200 | consumed tokens: 18061721600 | elapsed time per iteration (s): 0.15 | learning rate: 9.130E-05 | global batch size: 256 | lm loss: 3.785343E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.793 | TFLOPs: 26.00 | +7: iteration 34460/ 60336 | consumed samples: 8821760 | consumed tokens: 18066964480 | elapsed time per iteration (s): 0.15 | learning rate: 9.125E-05 | global batch size: 256 | lm loss: 3.781896E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.100 | TFLOPs: 26.02 | +7: iteration 34470/ 60336 | consumed samples: 8824320 | consumed tokens: 18072207360 | elapsed time per iteration (s): 0.15 | learning rate: 9.120E-05 | global batch size: 256 | lm loss: 3.797573E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.441 | TFLOPs: 26.02 | +7: iteration 34480/ 60336 | consumed samples: 8826880 | consumed tokens: 18077450240 | elapsed time per iteration (s): 0.15 | learning rate: 9.116E-05 | global batch size: 256 | lm loss: 3.786341E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.573 | TFLOPs: 25.99 | +7: iteration 34490/ 60336 | consumed samples: 8829440 | consumed tokens: 18082693120 | elapsed time per iteration (s): 0.15 | learning rate: 9.111E-05 | global batch size: 256 | lm loss: 3.776679E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.766 | TFLOPs: 26.00 | +7: iteration 34500/ 60336 | consumed samples: 8832000 | consumed tokens: 18087936000 | elapsed time per iteration (s): 0.15 | learning rate: 9.107E-05 | global batch size: 256 | lm loss: 3.789489E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.314 | TFLOPs: 26.01 | +7: iteration 34510/ 60336 | consumed samples: 8834560 | consumed tokens: 18093178880 | elapsed time per iteration (s): 0.15 | learning rate: 9.102E-05 | global batch size: 256 | lm loss: 3.797953E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.123 | TFLOPs: 25.97 | +7: iteration 34520/ 60336 | consumed samples: 8837120 | consumed tokens: 18098421760 | elapsed time per iteration (s): 0.15 | learning rate: 9.097E-05 | global batch size: 256 | lm loss: 3.798065E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.845 | TFLOPs: 26.00 | +7: iteration 34530/ 60336 | consumed samples: 8839680 | consumed tokens: 18103664640 | elapsed time per iteration (s): 0.15 | learning rate: 9.093E-05 | global batch size: 256 | lm loss: 3.803733E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.754 | TFLOPs: 26.00 | +7: iteration 34540/ 60336 | consumed samples: 8842240 | consumed tokens: 18108907520 | elapsed time per iteration (s): 0.15 | learning rate: 9.088E-05 | global batch size: 256 | lm loss: 3.797113E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.838 | TFLOPs: 26.00 | +7: iteration 34550/ 60336 | consumed samples: 8844800 | consumed tokens: 18114150400 | elapsed time per iteration (s): 0.15 | learning rate: 9.083E-05 | global batch size: 256 | lm loss: 3.793971E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.265 | TFLOPs: 25.97 | +7: iteration 34560/ 60336 | consumed samples: 8847360 | consumed tokens: 18119393280 | elapsed time per iteration (s): 0.15 | learning rate: 9.079E-05 | global batch size: 256 | lm loss: 3.800614E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.240 | TFLOPs: 26.01 | +7: iteration 34570/ 60336 | consumed samples: 8849920 | consumed tokens: 18124636160 | elapsed time per iteration (s): 0.15 | learning rate: 9.074E-05 | global batch size: 256 | lm loss: 3.794285E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.418 | TFLOPs: 25.99 | +7: iteration 34580/ 60336 | consumed samples: 8852480 | consumed tokens: 18129879040 | elapsed time per iteration (s): 0.15 | learning rate: 9.070E-05 | global batch size: 256 | lm loss: 3.791903E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.196 | TFLOPs: 25.99 | +7: iteration 34590/ 60336 | consumed samples: 8855040 | consumed tokens: 18135121920 | elapsed time per iteration (s): 0.15 | learning rate: 9.065E-05 | global batch size: 256 | lm loss: 3.799029E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.095 | TFLOPs: 25.99 | +7: iteration 34600/ 60336 | consumed samples: 8857600 | consumed tokens: 18140364800 | elapsed time per iteration (s): 0.15 | learning rate: 9.060E-05 | global batch size: 256 | lm loss: 3.797406E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.623 | TFLOPs: 25.98 | +7: iteration 34610/ 60336 | consumed samples: 8860160 | consumed tokens: 18145607680 | elapsed time per iteration (s): 0.15 | learning rate: 9.056E-05 | global batch size: 256 | lm loss: 3.800487E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.306 | TFLOPs: 25.99 | +7: iteration 34620/ 60336 | consumed samples: 8862720 | consumed tokens: 18150850560 | elapsed time per iteration (s): 0.15 | learning rate: 9.051E-05 | global batch size: 256 | lm loss: 3.797520E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.343 | TFLOPs: 25.98 | +7: iteration 34630/ 60336 | consumed samples: 8865280 | consumed tokens: 18156093440 | elapsed time per iteration (s): 0.15 | learning rate: 9.046E-05 | global batch size: 256 | lm loss: 3.784697E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.390 | TFLOPs: 25.98 | +7: iteration 34640/ 60336 | consumed samples: 8867840 | consumed tokens: 18161336320 | elapsed time per iteration (s): 0.15 | learning rate: 9.042E-05 | global batch size: 256 | lm loss: 3.781089E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.025 | TFLOPs: 25.99 | +7: iteration 34650/ 60336 | consumed samples: 8870400 | consumed tokens: 18166579200 | elapsed time per iteration (s): 0.15 | learning rate: 9.037E-05 | global batch size: 256 | lm loss: 3.785865E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.957 | TFLOPs: 25.99 | +7: iteration 34660/ 60336 | consumed samples: 8872960 | consumed tokens: 18171822080 | elapsed time per iteration (s): 0.15 | learning rate: 9.033E-05 | global batch size: 256 | lm loss: 3.783801E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.460 | TFLOPs: 25.99 | +7: iteration 34670/ 60336 | consumed samples: 8875520 | consumed tokens: 18177064960 | elapsed time per iteration (s): 0.15 | learning rate: 9.028E-05 | global batch size: 256 | lm loss: 3.784538E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.424 | TFLOPs: 25.98 | +7: iteration 34680/ 60336 | consumed samples: 8878080 | consumed tokens: 18182307840 | elapsed time per iteration (s): 0.15 | learning rate: 9.023E-05 | global batch size: 256 | lm loss: 3.792467E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.174 | TFLOPs: 25.97 | +7: iteration 34690/ 60336 | consumed samples: 8880640 | consumed tokens: 18187550720 | elapsed time per iteration (s): 0.15 | learning rate: 9.019E-05 | global batch size: 256 | lm loss: 3.778498E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.889 | TFLOPs: 26.00 | +7: iteration 34700/ 60336 | consumed samples: 8883200 | consumed tokens: 18192793600 | elapsed time per iteration (s): 0.17 | learning rate: 9.014E-05 | global batch size: 256 | lm loss: 3.783406E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.182 | TFLOPs: 24.30 | +7: iteration 34710/ 60336 | consumed samples: 8885760 | consumed tokens: 18198036480 | elapsed time per iteration (s): 0.17 | learning rate: 9.009E-05 | global batch size: 256 | lm loss: 3.796583E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.525 | TFLOPs: 23.45 | +7: iteration 34720/ 60336 | consumed samples: 8888320 | consumed tokens: 18203279360 | elapsed time per iteration (s): 0.16 | learning rate: 9.005E-05 | global batch size: 256 | lm loss: 3.791327E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.181 | TFLOPs: 25.85 | +7: iteration 34730/ 60336 | consumed samples: 8890880 | consumed tokens: 18208522240 | elapsed time per iteration (s): 0.15 | learning rate: 9.000E-05 | global batch size: 256 | lm loss: 3.799677E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.261 | TFLOPs: 25.97 | +7: iteration 34740/ 60336 | consumed samples: 8893440 | consumed tokens: 18213765120 | elapsed time per iteration (s): 0.15 | learning rate: 8.996E-05 | global batch size: 256 | lm loss: 3.792730E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.184 | TFLOPs: 25.99 | +7: iteration 34750/ 60336 | consumed samples: 8896000 | consumed tokens: 18219008000 | elapsed time per iteration (s): 0.15 | learning rate: 8.991E-05 | global batch size: 256 | lm loss: 3.789935E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.401 | TFLOPs: 25.99 | +7: iteration 34760/ 60336 | consumed samples: 8898560 | consumed tokens: 18224250880 | elapsed time per iteration (s): 0.15 | learning rate: 8.986E-05 | global batch size: 256 | lm loss: 3.785844E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.752 | TFLOPs: 25.98 | +7: iteration 34770/ 60336 | consumed samples: 8901120 | consumed tokens: 18229493760 | elapsed time per iteration (s): 0.15 | learning rate: 8.982E-05 | global batch size: 256 | lm loss: 3.779695E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.691 | TFLOPs: 25.95 | +7: iteration 34780/ 60336 | consumed samples: 8903680 | consumed tokens: 18234736640 | elapsed time per iteration (s): 0.15 | learning rate: 8.977E-05 | global batch size: 256 | lm loss: 3.794693E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.118 | TFLOPs: 25.97 | +7: iteration 34790/ 60336 | consumed samples: 8906240 | consumed tokens: 18239979520 | elapsed time per iteration (s): 0.15 | learning rate: 8.973E-05 | global batch size: 256 | lm loss: 3.774495E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.768 | TFLOPs: 25.98 | +7: iteration 34800/ 60336 | consumed samples: 8908800 | consumed tokens: 18245222400 | elapsed time per iteration (s): 0.15 | learning rate: 8.968E-05 | global batch size: 256 | lm loss: 3.793047E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.089 | TFLOPs: 25.99 | +7: iteration 34810/ 60336 | consumed samples: 8911360 | consumed tokens: 18250465280 | elapsed time per iteration (s): 0.15 | learning rate: 8.963E-05 | global batch size: 256 | lm loss: 3.791487E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.486 | TFLOPs: 25.98 | +7: iteration 34820/ 60336 | consumed samples: 8913920 | consumed tokens: 18255708160 | elapsed time per iteration (s): 0.15 | learning rate: 8.959E-05 | global batch size: 256 | lm loss: 3.783809E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.381 | TFLOPs: 25.98 | +7: iteration 34830/ 60336 | consumed samples: 8916480 | consumed tokens: 18260951040 | elapsed time per iteration (s): 0.15 | learning rate: 8.954E-05 | global batch size: 256 | lm loss: 3.812721E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.299 | TFLOPs: 25.99 | +7: iteration 34840/ 60336 | consumed samples: 8919040 | consumed tokens: 18266193920 | elapsed time per iteration (s): 0.15 | learning rate: 8.950E-05 | global batch size: 256 | lm loss: 3.795770E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.091 | TFLOPs: 25.97 | +7: iteration 34850/ 60336 | consumed samples: 8921600 | consumed tokens: 18271436800 | elapsed time per iteration (s): 0.15 | learning rate: 8.945E-05 | global batch size: 256 | lm loss: 3.776114E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.731 | TFLOPs: 26.00 | +7: iteration 34860/ 60336 | consumed samples: 8924160 | consumed tokens: 18276679680 | elapsed time per iteration (s): 0.15 | learning rate: 8.940E-05 | global batch size: 256 | lm loss: 3.788851E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.768 | TFLOPs: 25.97 | +7: iteration 34870/ 60336 | consumed samples: 8926720 | consumed tokens: 18281922560 | elapsed time per iteration (s): 0.15 | learning rate: 8.936E-05 | global batch size: 256 | lm loss: 3.802166E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.502 | TFLOPs: 25.99 | +7: iteration 34880/ 60336 | consumed samples: 8929280 | consumed tokens: 18287165440 | elapsed time per iteration (s): 0.15 | learning rate: 8.931E-05 | global batch size: 256 | lm loss: 3.786786E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.852 | TFLOPs: 25.98 | +7: iteration 34890/ 60336 | consumed samples: 8931840 | consumed tokens: 18292408320 | elapsed time per iteration (s): 0.15 | learning rate: 8.926E-05 | global batch size: 256 | lm loss: 3.783494E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.518 | TFLOPs: 25.98 | +7: iteration 34900/ 60336 | consumed samples: 8934400 | consumed tokens: 18297651200 | elapsed time per iteration (s): 0.15 | learning rate: 8.922E-05 | global batch size: 256 | lm loss: 3.807524E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.451 | TFLOPs: 25.98 | +7: iteration 34910/ 60336 | consumed samples: 8936960 | consumed tokens: 18302894080 | elapsed time per iteration (s): 0.15 | learning rate: 8.917E-05 | global batch size: 256 | lm loss: 3.796614E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.925 | TFLOPs: 26.00 | +7: iteration 34920/ 60336 | consumed samples: 8939520 | consumed tokens: 18308136960 | elapsed time per iteration (s): 0.15 | learning rate: 8.913E-05 | global batch size: 256 | lm loss: 3.791559E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.229 | TFLOPs: 26.02 | +7: iteration 34930/ 60336 | consumed samples: 8942080 | consumed tokens: 18313379840 | elapsed time per iteration (s): 0.15 | learning rate: 8.908E-05 | global batch size: 256 | lm loss: 3.793701E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.597 | TFLOPs: 26.01 | +7: iteration 34940/ 60336 | consumed samples: 8944640 | consumed tokens: 18318622720 | elapsed time per iteration (s): 0.15 | learning rate: 8.903E-05 | global batch size: 256 | lm loss: 3.803165E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.193 | TFLOPs: 26.02 | +7: iteration 34950/ 60336 | consumed samples: 8947200 | consumed tokens: 18323865600 | elapsed time per iteration (s): 0.15 | learning rate: 8.899E-05 | global batch size: 256 | lm loss: 3.786563E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.787 | TFLOPs: 26.00 | +7: iteration 34960/ 60336 | consumed samples: 8949760 | consumed tokens: 18329108480 | elapsed time per iteration (s): 0.15 | learning rate: 8.894E-05 | global batch size: 256 | lm loss: 3.786615E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.390 | TFLOPs: 26.01 | +7: iteration 34970/ 60336 | consumed samples: 8952320 | consumed tokens: 18334351360 | elapsed time per iteration (s): 0.15 | learning rate: 8.890E-05 | global batch size: 256 | lm loss: 3.778950E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.191 | TFLOPs: 25.99 | +7: iteration 34980/ 60336 | consumed samples: 8954880 | consumed tokens: 18339594240 | elapsed time per iteration (s): 0.15 | learning rate: 8.885E-05 | global batch size: 256 | lm loss: 3.783327E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.552 | TFLOPs: 25.99 | +7: iteration 34990/ 60336 | consumed samples: 8957440 | consumed tokens: 18344837120 | elapsed time per iteration (s): 0.15 | learning rate: 8.880E-05 | global batch size: 256 | lm loss: 3.798417E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.365 | TFLOPs: 26.01 | +7: iteration 35000/ 60336 | consumed samples: 8960000 | consumed tokens: 18350080000 | elapsed time per iteration (s): 0.15 | learning rate: 8.876E-05 | global batch size: 256 | lm loss: 3.790028E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.322 | TFLOPs: 25.98 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 35000 | lm loss value: 3.892988E+00 | lm loss PPL: 4.905727E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 35000 to checkpoints_44m32b100m +0: [2023-03-17 01:49:36,835] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step35000 is begin to save! +0: [2023-03-17 01:49:36,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:49:36,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:49:36,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:49:36,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:49:36,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:49:36,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:49:36,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:49:36,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:49:36,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:49:36,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:49:36,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:49:36,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:49:36,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:49:36,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:49:36,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:49:36,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:49:36,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:49:36,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:49:36,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:49:36,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:49:36,967] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step35000/mp_rank_00_model_states.pt +0: [2023-03-17 01:49:36,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:49:36,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:49:36,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:49:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:49:36,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:36,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:36,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:36,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:36,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:36,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:36,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:36,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:36,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:37,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:37,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:49:37,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:49:37,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 01:49:37,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:49:37,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:49:37,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: successfully saved checkpoint at iteration 35000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.98 +7: iteration 35010/ 60336 | consumed samples: 8962560 | consumed tokens: 18355322880 | elapsed time per iteration (s): 0.18 | learning rate: 8.871E-05 | global batch size: 256 | lm loss: 3.791835E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.183 | TFLOPs: 22.71 | +7: iteration 35020/ 60336 | consumed samples: 8965120 | consumed tokens: 18360565760 | elapsed time per iteration (s): 0.15 | learning rate: 8.867E-05 | global batch size: 256 | lm loss: 3.786348E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.048 | TFLOPs: 26.03 | +7: iteration 35030/ 60336 | consumed samples: 8967680 | consumed tokens: 18365808640 | elapsed time per iteration (s): 0.15 | learning rate: 8.862E-05 | global batch size: 256 | lm loss: 3.793839E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.660 | TFLOPs: 26.01 | +7: iteration 35040/ 60336 | consumed samples: 8970240 | consumed tokens: 18371051520 | elapsed time per iteration (s): 0.15 | learning rate: 8.857E-05 | global batch size: 256 | lm loss: 3.796436E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.463 | TFLOPs: 25.99 | +7: iteration 35050/ 60336 | consumed samples: 8972800 | consumed tokens: 18376294400 | elapsed time per iteration (s): 0.15 | learning rate: 8.853E-05 | global batch size: 256 | lm loss: 3.803598E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.717 | TFLOPs: 25.95 | +7: iteration 35060/ 60336 | consumed samples: 8975360 | consumed tokens: 18381537280 | elapsed time per iteration (s): 0.16 | learning rate: 8.848E-05 | global batch size: 256 | lm loss: 3.799746E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.501 | TFLOPs: 25.26 | +7: iteration 35070/ 60336 | consumed samples: 8977920 | consumed tokens: 18386780160 | elapsed time per iteration (s): 0.15 | learning rate: 8.844E-05 | global batch size: 256 | lm loss: 3.799729E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.264 | TFLOPs: 26.01 | +7: iteration 35080/ 60336 | consumed samples: 8980480 | consumed tokens: 18392023040 | elapsed time per iteration (s): 0.15 | learning rate: 8.839E-05 | global batch size: 256 | lm loss: 3.798986E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.588 | TFLOPs: 26.01 | +7: iteration 35090/ 60336 | consumed samples: 8983040 | consumed tokens: 18397265920 | elapsed time per iteration (s): 0.15 | learning rate: 8.834E-05 | global batch size: 256 | lm loss: 3.802184E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.600 | TFLOPs: 26.04 | +7: iteration 35100/ 60336 | consumed samples: 8985600 | consumed tokens: 18402508800 | elapsed time per iteration (s): 0.15 | learning rate: 8.830E-05 | global batch size: 256 | lm loss: 3.797829E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.815 | TFLOPs: 26.05 | +7: iteration 35110/ 60336 | consumed samples: 8988160 | consumed tokens: 18407751680 | elapsed time per iteration (s): 0.15 | learning rate: 8.825E-05 | global batch size: 256 | lm loss: 3.786006E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.558 | TFLOPs: 26.07 | +7: iteration 35120/ 60336 | consumed samples: 8990720 | consumed tokens: 18412994560 | elapsed time per iteration (s): 0.15 | learning rate: 8.821E-05 | global batch size: 256 | lm loss: 3.789509E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.564 | TFLOPs: 26.09 | +7: iteration 35130/ 60336 | consumed samples: 8993280 | consumed tokens: 18418237440 | elapsed time per iteration (s): 0.15 | learning rate: 8.816E-05 | global batch size: 256 | lm loss: 3.787140E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.039 | TFLOPs: 26.03 | +7: iteration 35140/ 60336 | consumed samples: 8995840 | consumed tokens: 18423480320 | elapsed time per iteration (s): 0.15 | learning rate: 8.811E-05 | global batch size: 256 | lm loss: 3.787754E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.929 | TFLOPs: 26.02 | +7: iteration 35150/ 60336 | consumed samples: 8998400 | consumed tokens: 18428723200 | elapsed time per iteration (s): 0.15 | learning rate: 8.807E-05 | global batch size: 256 | lm loss: 3.777605E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.390 | TFLOPs: 26.04 | +7: iteration 35160/ 60336 | consumed samples: 9000960 | consumed tokens: 18433966080 | elapsed time per iteration (s): 0.15 | learning rate: 8.802E-05 | global batch size: 256 | lm loss: 3.776044E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.005 | TFLOPs: 26.03 | +7: iteration 35170/ 60336 | consumed samples: 9003520 | consumed tokens: 18439208960 | elapsed time per iteration (s): 0.15 | learning rate: 8.798E-05 | global batch size: 256 | lm loss: 3.788531E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.382 | TFLOPs: 26.12 | +7: iteration 35180/ 60336 | consumed samples: 9006080 | consumed tokens: 18444451840 | elapsed time per iteration (s): 0.15 | learning rate: 8.793E-05 | global batch size: 256 | lm loss: 3.786129E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.832 | TFLOPs: 26.22 | +7: iteration 35190/ 60336 | consumed samples: 9008640 | consumed tokens: 18449694720 | elapsed time per iteration (s): 0.15 | learning rate: 8.789E-05 | global batch size: 256 | lm loss: 3.781132E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.112 | TFLOPs: 26.19 | +7: iteration 35200/ 60336 | consumed samples: 9011200 | consumed tokens: 18454937600 | elapsed time per iteration (s): 0.15 | learning rate: 8.784E-05 | global batch size: 256 | lm loss: 3.783236E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.268 | TFLOPs: 26.10 | +7: iteration 35210/ 60336 | consumed samples: 9013760 | consumed tokens: 18460180480 | elapsed time per iteration (s): 0.15 | learning rate: 8.779E-05 | global batch size: 256 | lm loss: 3.768666E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.745 | TFLOPs: 26.22 | +7: iteration 35220/ 60336 | consumed samples: 9016320 | consumed tokens: 18465423360 | elapsed time per iteration (s): 0.15 | learning rate: 8.775E-05 | global batch size: 256 | lm loss: 3.800100E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.026 | TFLOPs: 26.22 | +7: iteration 35230/ 60336 | consumed samples: 9018880 | consumed tokens: 18470666240 | elapsed time per iteration (s): 0.15 | learning rate: 8.770E-05 | global batch size: 256 | lm loss: 3.789487E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.642 | TFLOPs: 26.23 | +7: iteration 35240/ 60336 | consumed samples: 9021440 | consumed tokens: 18475909120 | elapsed time per iteration (s): 0.15 | learning rate: 8.766E-05 | global batch size: 256 | lm loss: 3.790983E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.802 | TFLOPs: 26.20 | +7: iteration 35250/ 60336 | consumed samples: 9024000 | consumed tokens: 18481152000 | elapsed time per iteration (s): 0.15 | learning rate: 8.761E-05 | global batch size: 256 | lm loss: 3.783049E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.716 | TFLOPs: 26.22 | +7: iteration 35260/ 60336 | consumed samples: 9026560 | consumed tokens: 18486394880 | elapsed time per iteration (s): 0.15 | learning rate: 8.756E-05 | global batch size: 256 | lm loss: 3.790211E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.707 | TFLOPs: 26.22 | +7: iteration 35270/ 60336 | consumed samples: 9029120 | consumed tokens: 18491637760 | elapsed time per iteration (s): 0.15 | learning rate: 8.752E-05 | global batch size: 256 | lm loss: 3.796347E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.218 | TFLOPs: 26.21 | +7: iteration 35280/ 60336 | consumed samples: 9031680 | consumed tokens: 18496880640 | elapsed time per iteration (s): 0.16 | learning rate: 8.747E-05 | global batch size: 256 | lm loss: 3.785962E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.862 | TFLOPs: 25.81 | +7: iteration 35290/ 60336 | consumed samples: 9034240 | consumed tokens: 18502123520 | elapsed time per iteration (s): 0.15 | learning rate: 8.743E-05 | global batch size: 256 | lm loss: 3.800360E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.157 | TFLOPs: 26.22 | +7: iteration 35300/ 60336 | consumed samples: 9036800 | consumed tokens: 18507366400 | elapsed time per iteration (s): 0.15 | learning rate: 8.738E-05 | global batch size: 256 | lm loss: 3.783572E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.666 | TFLOPs: 26.17 | +7: iteration 35310/ 60336 | consumed samples: 9039360 | consumed tokens: 18512609280 | elapsed time per iteration (s): 0.15 | learning rate: 8.734E-05 | global batch size: 256 | lm loss: 3.772989E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.245 | TFLOPs: 26.18 | +7: iteration 35320/ 60336 | consumed samples: 9041920 | consumed tokens: 18517852160 | elapsed time per iteration (s): 0.15 | learning rate: 8.729E-05 | global batch size: 256 | lm loss: 3.800364E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.758 | TFLOPs: 26.22 | +7: iteration 35330/ 60336 | consumed samples: 9044480 | consumed tokens: 18523095040 | elapsed time per iteration (s): 0.15 | learning rate: 8.724E-05 | global batch size: 256 | lm loss: 3.793711E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.302 | TFLOPs: 26.23 | +7: iteration 35340/ 60336 | consumed samples: 9047040 | consumed tokens: 18528337920 | elapsed time per iteration (s): 0.15 | learning rate: 8.720E-05 | global batch size: 256 | lm loss: 3.782899E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.257 | TFLOPs: 26.24 | +7: iteration 35350/ 60336 | consumed samples: 9049600 | consumed tokens: 18533580800 | elapsed time per iteration (s): 0.15 | learning rate: 8.715E-05 | global batch size: 256 | lm loss: 3.794709E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.176 | TFLOPs: 26.24 | +7: iteration 35360/ 60336 | consumed samples: 9052160 | consumed tokens: 18538823680 | elapsed time per iteration (s): 0.15 | learning rate: 8.711E-05 | global batch size: 256 | lm loss: 3.784880E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.642 | TFLOPs: 26.25 | +7: iteration 35370/ 60336 | consumed samples: 9054720 | consumed tokens: 18544066560 | elapsed time per iteration (s): 0.15 | learning rate: 8.706E-05 | global batch size: 256 | lm loss: 3.796799E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.331 | TFLOPs: 26.24 | +7: iteration 35380/ 60336 | consumed samples: 9057280 | consumed tokens: 18549309440 | elapsed time per iteration (s): 0.15 | learning rate: 8.701E-05 | global batch size: 256 | lm loss: 3.796387E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.051 | TFLOPs: 26.25 | +7: iteration 35390/ 60336 | consumed samples: 9059840 | consumed tokens: 18554552320 | elapsed time per iteration (s): 0.15 | learning rate: 8.697E-05 | global batch size: 256 | lm loss: 3.789175E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.573 | TFLOPs: 26.21 | +7: iteration 35400/ 60336 | consumed samples: 9062400 | consumed tokens: 18559795200 | elapsed time per iteration (s): 0.15 | learning rate: 8.692E-05 | global batch size: 256 | lm loss: 3.791563E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.551 | TFLOPs: 26.21 | +7: iteration 35410/ 60336 | consumed samples: 9064960 | consumed tokens: 18565038080 | elapsed time per iteration (s): 0.15 | learning rate: 8.688E-05 | global batch size: 256 | lm loss: 3.781897E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.296 | TFLOPs: 26.23 | +7: iteration 35420/ 60336 | consumed samples: 9067520 | consumed tokens: 18570280960 | elapsed time per iteration (s): 0.15 | learning rate: 8.683E-05 | global batch size: 256 | lm loss: 3.793889E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.622 | TFLOPs: 26.07 | +7: iteration 35430/ 60336 | consumed samples: 9070080 | consumed tokens: 18575523840 | elapsed time per iteration (s): 0.15 | learning rate: 8.679E-05 | global batch size: 256 | lm loss: 3.781213E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.596 | TFLOPs: 26.09 | +7: iteration 35440/ 60336 | consumed samples: 9072640 | consumed tokens: 18580766720 | elapsed time per iteration (s): 0.15 | learning rate: 8.674E-05 | global batch size: 256 | lm loss: 3.779502E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.330 | TFLOPs: 26.09 | +7: iteration 35450/ 60336 | consumed samples: 9075200 | consumed tokens: 18586009600 | elapsed time per iteration (s): 0.15 | learning rate: 8.669E-05 | global batch size: 256 | lm loss: 3.791603E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.756 | TFLOPs: 26.08 | +7: iteration 35460/ 60336 | consumed samples: 9077760 | consumed tokens: 18591252480 | elapsed time per iteration (s): 0.15 | learning rate: 8.665E-05 | global batch size: 256 | lm loss: 3.785818E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.471 | TFLOPs: 26.07 | +7: iteration 35470/ 60336 | consumed samples: 9080320 | consumed tokens: 18596495360 | elapsed time per iteration (s): 0.15 | learning rate: 8.660E-05 | global batch size: 256 | lm loss: 3.788993E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.977 | TFLOPs: 26.10 | +7: iteration 35480/ 60336 | consumed samples: 9082880 | consumed tokens: 18601738240 | elapsed time per iteration (s): 0.15 | learning rate: 8.656E-05 | global batch size: 256 | lm loss: 3.793374E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.458 | TFLOPs: 26.07 | +7: iteration 35490/ 60336 | consumed samples: 9085440 | consumed tokens: 18606981120 | elapsed time per iteration (s): 0.15 | learning rate: 8.651E-05 | global batch size: 256 | lm loss: 3.793105E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 35500/ 60336 | consumed samples: 9088000 | consumed tokens: 18612224000 | elapsed time per iteration (s): 0.15 | learning rate: 8.647E-05 | global batch size: 256 | lm loss: 3.793491E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.301 | TFLOPs: 26.08 | +7: iteration 35510/ 60336 | consumed samples: 9090560 | consumed tokens: 18617466880 | elapsed time per iteration (s): 0.15 | learning rate: 8.642E-05 | global batch size: 256 | lm loss: 3.789206E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.478 | TFLOPs: 26.09 | +7: iteration 35520/ 60336 | consumed samples: 9093120 | consumed tokens: 18622709760 | elapsed time per iteration (s): 0.16 | learning rate: 8.637E-05 | global batch size: 256 | lm loss: 3.795926E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.550 | TFLOPs: 25.74 | +7: iteration 35530/ 60336 | consumed samples: 9095680 | consumed tokens: 18627952640 | elapsed time per iteration (s): 0.15 | learning rate: 8.633E-05 | global batch size: 256 | lm loss: 3.791816E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.299 | TFLOPs: 26.07 | +7: iteration 35540/ 60336 | consumed samples: 9098240 | consumed tokens: 18633195520 | elapsed time per iteration (s): 0.15 | learning rate: 8.628E-05 | global batch size: 256 | lm loss: 3.797589E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.530 | TFLOPs: 26.10 | +7: iteration 35550/ 60336 | consumed samples: 9100800 | consumed tokens: 18638438400 | elapsed time per iteration (s): 0.15 | learning rate: 8.624E-05 | global batch size: 256 | lm loss: 3.795050E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.740 | TFLOPs: 26.11 | +7: iteration 35560/ 60336 | consumed samples: 9103360 | consumed tokens: 18643681280 | elapsed time per iteration (s): 0.15 | learning rate: 8.619E-05 | global batch size: 256 | lm loss: 3.793742E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.916 | TFLOPs: 26.08 | +7: iteration 35570/ 60336 | consumed samples: 9105920 | consumed tokens: 18648924160 | elapsed time per iteration (s): 0.15 | learning rate: 8.615E-05 | global batch size: 256 | lm loss: 3.791626E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.005 | TFLOPs: 26.10 | +7: iteration 35580/ 60336 | consumed samples: 9108480 | consumed tokens: 18654167040 | elapsed time per iteration (s): 0.15 | learning rate: 8.610E-05 | global batch size: 256 | lm loss: 3.780042E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.462 | TFLOPs: 26.07 | +7: iteration 35590/ 60336 | consumed samples: 9111040 | consumed tokens: 18659409920 | elapsed time per iteration (s): 0.15 | learning rate: 8.606E-05 | global batch size: 256 | lm loss: 3.787104E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.111 | TFLOPs: 26.08 | +7: iteration 35600/ 60336 | consumed samples: 9113600 | consumed tokens: 18664652800 | elapsed time per iteration (s): 0.15 | learning rate: 8.601E-05 | global batch size: 256 | lm loss: 3.791847E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.770 | TFLOPs: 26.09 | +7: iteration 35610/ 60336 | consumed samples: 9116160 | consumed tokens: 18669895680 | elapsed time per iteration (s): 0.15 | learning rate: 8.596E-05 | global batch size: 256 | lm loss: 3.781910E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.225 | TFLOPs: 26.08 | +7: iteration 35620/ 60336 | consumed samples: 9118720 | consumed tokens: 18675138560 | elapsed time per iteration (s): 0.15 | learning rate: 8.592E-05 | global batch size: 256 | lm loss: 3.797910E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.852 | TFLOPs: 26.08 | +7: iteration 35630/ 60336 | consumed samples: 9121280 | consumed tokens: 18680381440 | elapsed time per iteration (s): 0.15 | learning rate: 8.587E-05 | global batch size: 256 | lm loss: 3.793959E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.088 | TFLOPs: 26.08 | +7: iteration 35640/ 60336 | consumed samples: 9123840 | consumed tokens: 18685624320 | elapsed time per iteration (s): 0.15 | learning rate: 8.583E-05 | global batch size: 256 | lm loss: 3.784480E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.954 | TFLOPs: 26.06 | +7: iteration 35650/ 60336 | consumed samples: 9126400 | consumed tokens: 18690867200 | elapsed time per iteration (s): 0.15 | learning rate: 8.578E-05 | global batch size: 256 | lm loss: 3.800331E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.620 | TFLOPs: 26.06 | +7: iteration 35660/ 60336 | consumed samples: 9128960 | consumed tokens: 18696110080 | elapsed time per iteration (s): 0.15 | learning rate: 8.574E-05 | global batch size: 256 | lm loss: 3.793217E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.690 | TFLOPs: 26.06 | +7: iteration 35670/ 60336 | consumed samples: 9131520 | consumed tokens: 18701352960 | elapsed time per iteration (s): 0.15 | learning rate: 8.569E-05 | global batch size: 256 | lm loss: 3.801717E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.165 | TFLOPs: 26.08 | +7: iteration 35680/ 60336 | consumed samples: 9134080 | consumed tokens: 18706595840 | elapsed time per iteration (s): 0.15 | learning rate: 8.564E-05 | global batch size: 256 | lm loss: 3.801527E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.602 | TFLOPs: 26.07 | +7: iteration 35690/ 60336 | consumed samples: 9136640 | consumed tokens: 18711838720 | elapsed time per iteration (s): 0.15 | learning rate: 8.560E-05 | global batch size: 256 | lm loss: 3.787215E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.048 | TFLOPs: 26.07 | +7: iteration 35700/ 60336 | consumed samples: 9139200 | consumed tokens: 18717081600 | elapsed time per iteration (s): 0.16 | learning rate: 8.555E-05 | global batch size: 256 | lm loss: 3.785909E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.687 | TFLOPs: 25.67 | +7: iteration 35710/ 60336 | consumed samples: 9141760 | consumed tokens: 18722324480 | elapsed time per iteration (s): 0.15 | learning rate: 8.551E-05 | global batch size: 256 | lm loss: 3.795397E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.516 | TFLOPs: 26.07 | +7: iteration 35720/ 60336 | consumed samples: 9144320 | consumed tokens: 18727567360 | elapsed time per iteration (s): 0.15 | learning rate: 8.546E-05 | global batch size: 256 | lm loss: 3.790790E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.912 | TFLOPs: 26.08 | +7: iteration 35730/ 60336 | consumed samples: 9146880 | consumed tokens: 18732810240 | elapsed time per iteration (s): 0.15 | learning rate: 8.542E-05 | global batch size: 256 | lm loss: 3.781046E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.749 | TFLOPs: 26.03 | +7: iteration 35740/ 60336 | consumed samples: 9149440 | consumed tokens: 18738053120 | elapsed time per iteration (s): 0.15 | learning rate: 8.537E-05 | global batch size: 256 | lm loss: 3.802213E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.663 | TFLOPs: 26.01 | +7: iteration 35750/ 60336 | consumed samples: 9152000 | consumed tokens: 18743296000 | elapsed time per iteration (s): 0.15 | learning rate: 8.533E-05 | global batch size: 256 | lm loss: 3.794633E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.005 | TFLOPs: 26.02 | +7: iteration 35760/ 60336 | consumed samples: 9154560 | consumed tokens: 18748538880 | elapsed time per iteration (s): 0.15 | learning rate: 8.528E-05 | global batch size: 256 | lm loss: 3.784037E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.358 | TFLOPs: 26.04 | +7: iteration 35770/ 60336 | consumed samples: 9157120 | consumed tokens: 18753781760 | elapsed time per iteration (s): 0.15 | learning rate: 8.523E-05 | global batch size: 256 | lm loss: 3.804002E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.083 | TFLOPs: 26.02 | +7: iteration 35780/ 60336 | consumed samples: 9159680 | consumed tokens: 18759024640 | elapsed time per iteration (s): 0.15 | learning rate: 8.519E-05 | global batch size: 256 | lm loss: 3.779364E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.462 | TFLOPs: 26.02 | +7: iteration 35790/ 60336 | consumed samples: 9162240 | consumed tokens: 18764267520 | elapsed time per iteration (s): 0.15 | learning rate: 8.514E-05 | global batch size: 256 | lm loss: 3.784126E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.943 | TFLOPs: 26.02 | +7: iteration 35800/ 60336 | consumed samples: 9164800 | consumed tokens: 18769510400 | elapsed time per iteration (s): 0.15 | learning rate: 8.510E-05 | global batch size: 256 | lm loss: 3.782369E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.863 | TFLOPs: 26.03 | +7: iteration 35810/ 60336 | consumed samples: 9167360 | consumed tokens: 18774753280 | elapsed time per iteration (s): 0.15 | learning rate: 8.505E-05 | global batch size: 256 | lm loss: 3.785178E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.873 | TFLOPs: 26.00 | +7: iteration 35820/ 60336 | consumed samples: 9169920 | consumed tokens: 18779996160 | elapsed time per iteration (s): 0.15 | learning rate: 8.501E-05 | global batch size: 256 | lm loss: 3.778460E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.551 | TFLOPs: 25.99 | +7: iteration 35830/ 60336 | consumed samples: 9172480 | consumed tokens: 18785239040 | elapsed time per iteration (s): 0.15 | learning rate: 8.496E-05 | global batch size: 256 | lm loss: 3.795331E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.413 | TFLOPs: 26.01 | +7: iteration 35840/ 60336 | consumed samples: 9175040 | consumed tokens: 18790481920 | elapsed time per iteration (s): 0.15 | learning rate: 8.492E-05 | global batch size: 256 | lm loss: 3.787667E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.399 | TFLOPs: 26.02 | +7: iteration 35850/ 60336 | consumed samples: 9177600 | consumed tokens: 18795724800 | elapsed time per iteration (s): 0.15 | learning rate: 8.487E-05 | global batch size: 256 | lm loss: 3.799683E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.044 | TFLOPs: 26.02 | +7: iteration 35860/ 60336 | consumed samples: 9180160 | consumed tokens: 18800967680 | elapsed time per iteration (s): 0.15 | learning rate: 8.483E-05 | global batch size: 256 | lm loss: 3.785039E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.255 | TFLOPs: 26.02 | +7: iteration 35870/ 60336 | consumed samples: 9182720 | consumed tokens: 18806210560 | elapsed time per iteration (s): 0.15 | learning rate: 8.478E-05 | global batch size: 256 | lm loss: 3.795667E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.427 | TFLOPs: 26.04 | +7: iteration 35880/ 60336 | consumed samples: 9185280 | consumed tokens: 18811453440 | elapsed time per iteration (s): 0.15 | learning rate: 8.473E-05 | global batch size: 256 | lm loss: 3.787241E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.351 | TFLOPs: 26.01 | +7: iteration 35890/ 60336 | consumed samples: 9187840 | consumed tokens: 18816696320 | elapsed time per iteration (s): 0.15 | learning rate: 8.469E-05 | global batch size: 256 | lm loss: 3.783206E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.615 | TFLOPs: 26.03 | +7: iteration 35900/ 60336 | consumed samples: 9190400 | consumed tokens: 18821939200 | elapsed time per iteration (s): 0.15 | learning rate: 8.464E-05 | global batch size: 256 | lm loss: 3.787285E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.063 | TFLOPs: 26.03 | +7: iteration 35910/ 60336 | consumed samples: 9192960 | consumed tokens: 18827182080 | elapsed time per iteration (s): 0.15 | learning rate: 8.460E-05 | global batch size: 256 | lm loss: 3.791108E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.272 | TFLOPs: 26.04 | +7: iteration 35920/ 60336 | consumed samples: 9195520 | consumed tokens: 18832424960 | elapsed time per iteration (s): 0.15 | learning rate: 8.455E-05 | global batch size: 256 | lm loss: 3.796127E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.027 | TFLOPs: 26.05 | +7: iteration 35930/ 60336 | consumed samples: 9198080 | consumed tokens: 18837667840 | elapsed time per iteration (s): 0.15 | learning rate: 8.451E-05 | global batch size: 256 | lm loss: 3.778385E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.233 | TFLOPs: 26.05 | +7: iteration 35940/ 60336 | consumed samples: 9200640 | consumed tokens: 18842910720 | elapsed time per iteration (s): 0.15 | learning rate: 8.446E-05 | global batch size: 256 | lm loss: 3.779742E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.001 | TFLOPs: 26.05 | +7: iteration 35950/ 60336 | consumed samples: 9203200 | consumed tokens: 18848153600 | elapsed time per iteration (s): 0.15 | learning rate: 8.442E-05 | global batch size: 256 | lm loss: 3.784487E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.808 | TFLOPs: 26.03 | +7: iteration 35960/ 60336 | consumed samples: 9205760 | consumed tokens: 18853396480 | elapsed time per iteration (s): 0.15 | learning rate: 8.437E-05 | global batch size: 256 | lm loss: 3.795582E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.223 | TFLOPs: 26.05 | +7: iteration 35970/ 60336 | consumed samples: 9208320 | consumed tokens: 18858639360 | elapsed time per iteration (s): 0.15 | learning rate: 8.433E-05 | global batch size: 256 | lm loss: 3.792703E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.097 | TFLOPs: 26.02 | +7: iteration 35980/ 60336 | consumed samples: 9210880 | consumed tokens: 18863882240 | elapsed time per iteration (s): 0.15 | learning rate: 8.428E-05 | global batch size: 256 | lm loss: 3.783070E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.271 | TFLOPs: 26.04 | +7: iteration 35990/ 60336 | consumed samples: 9213440 | consumed tokens: 18869125120 | elapsed time per iteration (s): 0.15 | learning rate: 8.424E-05 | global batch size: 256 | lm loss: 3.778430E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.149 | TFLOPs: 26.04 | +0: [2023-03-17 01:52:11,010] [INFO] [logging.py:68:log_dist] [Rank 0] step=36000, skipped=0, lr=[8.419008396796206e-05, 8.419008396796206e-05, 8.419008396796206e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 36000/ 60336 | consumed samples: 9216000 | consumed tokens: 18874368000 | elapsed time per iteration (s): 0.15 | learning rate: 8.419E-05 | global batch size: 256 | lm loss: 3.780090E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.507 | TFLOPs: 26.03 | +0: steps: 36000 loss: 3.7575 iter time (s): 0.153 samples/sec: 1674.752 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 36000 | lm loss value: 3.965026E+00 | lm loss PPL: 5.272165E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 36000 to checkpoints_44m32b100m +0: [2023-03-17 01:52:11,082] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step36000 is begin to save! +0: [2023-03-17 01:52:11,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:52:11,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:52:11,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:52:11,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:52:11,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:52:11,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:52:11,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:52:11,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:52:11,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:52:11,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:52:11,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:52:11,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:52:11,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:52:11,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:52:11,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:52:11,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:52:11,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:52:11,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:52:11,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:52:11,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:52:11,212] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step36000/mp_rank_00_model_states.pt +0: [2023-03-17 01:52:11,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:52:11,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:52:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:52:11,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,246] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:52:11,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,250] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:52:11,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:52:11,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:52:11,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:52:11,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:52:11,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:52:11,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:52:11,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:52:11,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:52:11,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:52:11,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: successfully saved checkpoint at iteration 36000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.44 +7: iteration 36010/ 60336 | consumed samples: 9218560 | consumed tokens: 18879610880 | elapsed time per iteration (s): 0.18 | learning rate: 8.414E-05 | global batch size: 256 | lm loss: 3.780558E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.559 | TFLOPs: 22.48 | +7: iteration 36020/ 60336 | consumed samples: 9221120 | consumed tokens: 18884853760 | elapsed time per iteration (s): 0.15 | learning rate: 8.410E-05 | global batch size: 256 | lm loss: 3.773896E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.607 | TFLOPs: 26.04 | +7: iteration 36030/ 60336 | consumed samples: 9223680 | consumed tokens: 18890096640 | elapsed time per iteration (s): 0.15 | learning rate: 8.405E-05 | global batch size: 256 | lm loss: 3.799112E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.014 | TFLOPs: 26.06 | +7: iteration 36040/ 60336 | consumed samples: 9226240 | consumed tokens: 18895339520 | elapsed time per iteration (s): 0.15 | learning rate: 8.401E-05 | global batch size: 256 | lm loss: 3.787761E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.387 | TFLOPs: 26.05 | +7: iteration 36050/ 60336 | consumed samples: 9228800 | consumed tokens: 18900582400 | elapsed time per iteration (s): 0.15 | learning rate: 8.396E-05 | global batch size: 256 | lm loss: 3.805229E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.524 | TFLOPs: 26.04 | +7: iteration 36060/ 60336 | consumed samples: 9231360 | consumed tokens: 18905825280 | elapsed time per iteration (s): 0.15 | learning rate: 8.392E-05 | global batch size: 256 | lm loss: 3.784774E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.066 | TFLOPs: 26.05 | +7: iteration 36070/ 60336 | consumed samples: 9233920 | consumed tokens: 18911068160 | elapsed time per iteration (s): 0.15 | learning rate: 8.387E-05 | global batch size: 256 | lm loss: 3.800513E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.644 | TFLOPs: 26.07 | +7: iteration 36080/ 60336 | consumed samples: 9236480 | consumed tokens: 18916311040 | elapsed time per iteration (s): 0.15 | learning rate: 8.383E-05 | global batch size: 256 | lm loss: 3.792840E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.826 | TFLOPs: 26.11 | +7: iteration 36090/ 60336 | consumed samples: 9239040 | consumed tokens: 18921553920 | elapsed time per iteration (s): 0.15 | learning rate: 8.378E-05 | global batch size: 256 | lm loss: 3.776682E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.261 | TFLOPs: 26.08 | +7: iteration 36100/ 60336 | consumed samples: 9241600 | consumed tokens: 18926796800 | elapsed time per iteration (s): 0.15 | learning rate: 8.374E-05 | global batch size: 256 | lm loss: 3.774118E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.409 | TFLOPs: 26.09 | +7: iteration 36110/ 60336 | consumed samples: 9244160 | consumed tokens: 18932039680 | elapsed time per iteration (s): 0.15 | learning rate: 8.369E-05 | global batch size: 256 | lm loss: 3.788553E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.242 | TFLOPs: 26.05 | +7: iteration 36120/ 60336 | consumed samples: 9246720 | consumed tokens: 18937282560 | elapsed time per iteration (s): 0.15 | learning rate: 8.365E-05 | global batch size: 256 | lm loss: 3.782908E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.767 | TFLOPs: 26.06 | +7: iteration 36130/ 60336 | consumed samples: 9249280 | consumed tokens: 18942525440 | elapsed time per iteration (s): 0.15 | learning rate: 8.360E-05 | global batch size: 256 | lm loss: 3.781199E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.146 | TFLOPs: 26.05 | +7: iteration 36140/ 60336 | consumed samples: 9251840 | consumed tokens: 18947768320 | elapsed time per iteration (s): 0.15 | learning rate: 8.356E-05 | global batch size: 256 | lm loss: 3.805708E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.219 | TFLOPs: 26.05 | +7: iteration 36150/ 60336 | consumed samples: 9254400 | consumed tokens: 18953011200 | elapsed time per iteration (s): 0.16 | learning rate: 8.351E-05 | global batch size: 256 | lm loss: 3.783097E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.076 | TFLOPs: 25.64 | +7: iteration 36160/ 60336 | consumed samples: 9256960 | consumed tokens: 18958254080 | elapsed time per iteration (s): 0.15 | learning rate: 8.347E-05 | global batch size: 256 | lm loss: 3.784182E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.334 | TFLOPs: 26.05 | +7: iteration 36170/ 60336 | consumed samples: 9259520 | consumed tokens: 18963496960 | elapsed time per iteration (s): 0.15 | learning rate: 8.342E-05 | global batch size: 256 | lm loss: 3.785210E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.861 | TFLOPs: 26.06 | +7: iteration 36180/ 60336 | consumed samples: 9262080 | consumed tokens: 18968739840 | elapsed time per iteration (s): 0.15 | learning rate: 8.338E-05 | global batch size: 256 | lm loss: 3.801786E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.022 | TFLOPs: 26.06 | +7: iteration 36190/ 60336 | consumed samples: 9264640 | consumed tokens: 18973982720 | elapsed time per iteration (s): 0.15 | learning rate: 8.333E-05 | global batch size: 256 | lm loss: 3.789888E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.974 | TFLOPs: 26.05 | +7: iteration 36200/ 60336 | consumed samples: 9267200 | consumed tokens: 18979225600 | elapsed time per iteration (s): 0.15 | learning rate: 8.328E-05 | global batch size: 256 | lm loss: 3.787268E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.981 | TFLOPs: 26.03 | +7: iteration 36210/ 60336 | consumed samples: 9269760 | consumed tokens: 18984468480 | elapsed time per iteration (s): 0.15 | learning rate: 8.324E-05 | global batch size: 256 | lm loss: 3.790696E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.322 | TFLOPs: 26.02 | +7: iteration 36220/ 60336 | consumed samples: 9272320 | consumed tokens: 18989711360 | elapsed time per iteration (s): 0.15 | learning rate: 8.319E-05 | global batch size: 256 | lm loss: 3.778607E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.196 | TFLOPs: 26.04 | +7: iteration 36230/ 60336 | consumed samples: 9274880 | consumed tokens: 18994954240 | elapsed time per iteration (s): 0.15 | learning rate: 8.315E-05 | global batch size: 256 | lm loss: 3.788527E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.467 | TFLOPs: 26.02 | +7: iteration 36240/ 60336 | consumed samples: 9277440 | consumed tokens: 19000197120 | elapsed time per iteration (s): 0.15 | learning rate: 8.310E-05 | global batch size: 256 | lm loss: 3.803903E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.280 | TFLOPs: 26.02 | +7: iteration 36250/ 60336 | consumed samples: 9280000 | consumed tokens: 19005440000 | elapsed time per iteration (s): 0.15 | learning rate: 8.306E-05 | global batch size: 256 | lm loss: 3.785363E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.987 | TFLOPs: 26.00 | +7: iteration 36260/ 60336 | consumed samples: 9282560 | consumed tokens: 19010682880 | elapsed time per iteration (s): 0.15 | learning rate: 8.301E-05 | global batch size: 256 | lm loss: 3.803103E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.805 | TFLOPs: 25.98 | +7: iteration 36270/ 60336 | consumed samples: 9285120 | consumed tokens: 19015925760 | elapsed time per iteration (s): 0.15 | learning rate: 8.297E-05 | global batch size: 256 | lm loss: 3.786675E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.598 | TFLOPs: 26.01 | +7: iteration 36280/ 60336 | consumed samples: 9287680 | consumed tokens: 19021168640 | elapsed time per iteration (s): 0.15 | learning rate: 8.292E-05 | global batch size: 256 | lm loss: 3.792817E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.025 | TFLOPs: 25.99 | +7: iteration 36290/ 60336 | consumed samples: 9290240 | consumed tokens: 19026411520 | elapsed time per iteration (s): 0.15 | learning rate: 8.288E-05 | global batch size: 256 | lm loss: 3.788037E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.500 | TFLOPs: 26.04 | +7: iteration 36300/ 60336 | consumed samples: 9292800 | consumed tokens: 19031654400 | elapsed time per iteration (s): 0.16 | learning rate: 8.283E-05 | global batch size: 256 | lm loss: 3.785852E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.939 | TFLOPs: 25.61 | +7: iteration 36310/ 60336 | consumed samples: 9295360 | consumed tokens: 19036897280 | elapsed time per iteration (s): 0.15 | learning rate: 8.279E-05 | global batch size: 256 | lm loss: 3.776771E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.145 | TFLOPs: 26.13 | +7: iteration 36320/ 60336 | consumed samples: 9297920 | consumed tokens: 19042140160 | elapsed time per iteration (s): 0.15 | learning rate: 8.274E-05 | global batch size: 256 | lm loss: 3.792416E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.264 | TFLOPs: 26.15 | +7: iteration 36330/ 60336 | consumed samples: 9300480 | consumed tokens: 19047383040 | elapsed time per iteration (s): 0.15 | learning rate: 8.270E-05 | global batch size: 256 | lm loss: 3.772769E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.080 | TFLOPs: 26.14 | +7: iteration 36340/ 60336 | consumed samples: 9303040 | consumed tokens: 19052625920 | elapsed time per iteration (s): 0.15 | learning rate: 8.265E-05 | global batch size: 256 | lm loss: 3.775313E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 36350/ 60336 | consumed samples: 9305600 | consumed tokens: 19057868800 | elapsed time per iteration (s): 0.15 | learning rate: 8.261E-05 | global batch size: 256 | lm loss: 3.775661E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.933 | TFLOPs: 26.14 | +7: iteration 36360/ 60336 | consumed samples: 9308160 | consumed tokens: 19063111680 | elapsed time per iteration (s): 0.15 | learning rate: 8.256E-05 | global batch size: 256 | lm loss: 3.781777E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.635 | TFLOPs: 26.15 | +7: iteration 36370/ 60336 | consumed samples: 9310720 | consumed tokens: 19068354560 | elapsed time per iteration (s): 0.15 | learning rate: 8.252E-05 | global batch size: 256 | lm loss: 3.780346E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.767 | TFLOPs: 26.17 | +7: iteration 36380/ 60336 | consumed samples: 9313280 | consumed tokens: 19073597440 | elapsed time per iteration (s): 0.15 | learning rate: 8.247E-05 | global batch size: 256 | lm loss: 3.773918E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.501 | TFLOPs: 26.17 | +7: iteration 36390/ 60336 | consumed samples: 9315840 | consumed tokens: 19078840320 | elapsed time per iteration (s): 0.15 | learning rate: 8.243E-05 | global batch size: 256 | lm loss: 3.798584E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.155 | TFLOPs: 26.10 | +7: iteration 36400/ 60336 | consumed samples: 9318400 | consumed tokens: 19084083200 | elapsed time per iteration (s): 0.15 | learning rate: 8.238E-05 | global batch size: 256 | lm loss: 3.785305E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.253 | TFLOPs: 26.13 | +7: iteration 36410/ 60336 | consumed samples: 9320960 | consumed tokens: 19089326080 | elapsed time per iteration (s): 0.15 | learning rate: 8.234E-05 | global batch size: 256 | lm loss: 3.796741E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.714 | TFLOPs: 26.14 | +7: iteration 36420/ 60336 | consumed samples: 9323520 | consumed tokens: 19094568960 | elapsed time per iteration (s): 0.15 | learning rate: 8.229E-05 | global batch size: 256 | lm loss: 3.785103E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.104 | TFLOPs: 26.16 | +7: iteration 36430/ 60336 | consumed samples: 9326080 | consumed tokens: 19099811840 | elapsed time per iteration (s): 0.15 | learning rate: 8.225E-05 | global batch size: 256 | lm loss: 3.783549E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.210 | TFLOPs: 26.11 | +7: iteration 36440/ 60336 | consumed samples: 9328640 | consumed tokens: 19105054720 | elapsed time per iteration (s): 0.15 | learning rate: 8.220E-05 | global batch size: 256 | lm loss: 3.793054E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.425 | TFLOPs: 26.17 | +7: iteration 36450/ 60336 | consumed samples: 9331200 | consumed tokens: 19110297600 | elapsed time per iteration (s): 0.15 | learning rate: 8.216E-05 | global batch size: 256 | lm loss: 3.794854E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.956 | TFLOPs: 26.14 | +7: iteration 36460/ 60336 | consumed samples: 9333760 | consumed tokens: 19115540480 | elapsed time per iteration (s): 0.16 | learning rate: 8.211E-05 | global batch size: 256 | lm loss: 3.785875E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.744 | TFLOPs: 25.48 | +7: iteration 36470/ 60336 | consumed samples: 9336320 | consumed tokens: 19120783360 | elapsed time per iteration (s): 0.15 | learning rate: 8.207E-05 | global batch size: 256 | lm loss: 3.793604E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.805 | TFLOPs: 26.14 | +7: iteration 36480/ 60336 | consumed samples: 9338880 | consumed tokens: 19126026240 | elapsed time per iteration (s): 0.15 | learning rate: 8.202E-05 | global batch size: 256 | lm loss: 3.789322E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.800 | TFLOPs: 26.17 | +7: iteration 36490/ 60336 | consumed samples: 9341440 | consumed tokens: 19131269120 | elapsed time per iteration (s): 0.15 | learning rate: 8.198E-05 | global batch size: 256 | lm loss: 3.793564E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.020 | TFLOPs: 26.16 | +7: iteration 36500/ 60336 | consumed samples: 9344000 | consumed tokens: 19136512000 | elapsed time per iteration (s): 0.15 | learning rate: 8.193E-05 | global batch size: 256 | lm loss: 3.780218E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.000 | TFLOPs: 26.16 | +7: iteration 36510/ 60336 | consumed samples: 9346560 | consumed tokens: 19141754880 | elapsed time per iteration (s): 0.15 | learning rate: 8.189E-05 | global batch size: 256 | lm loss: 3.791334E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.104 | TFLOPs: 26.16 | +7: iteration 36520/ 60336 | consumed samples: 9349120 | consumed tokens: 19146997760 | elapsed time per iteration (s): 0.15 | learning rate: 8.184E-05 | global batch size: 256 | lm loss: 3.781305E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.352 | TFLOPs: 26.15 | +7: iteration 36530/ 60336 | consumed samples: 9351680 | consumed tokens: 19152240640 | elapsed time per iteration (s): 0.15 | learning rate: 8.180E-05 | global batch size: 256 | lm loss: 3.788666E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.744 | TFLOPs: 26.15 | +7: iteration 36540/ 60336 | consumed samples: 9354240 | consumed tokens: 19157483520 | elapsed time per iteration (s): 0.15 | learning rate: 8.175E-05 | global batch size: 256 | lm loss: 3.785124E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.493 | TFLOPs: 26.15 | +7: iteration 36550/ 60336 | consumed samples: 9356800 | consumed tokens: 19162726400 | elapsed time per iteration (s): 0.15 | learning rate: 8.171E-05 | global batch size: 256 | lm loss: 3.781700E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.920 | TFLOPs: 26.14 | +7: iteration 36560/ 60336 | consumed samples: 9359360 | consumed tokens: 19167969280 | elapsed time per iteration (s): 0.15 | learning rate: 8.166E-05 | global batch size: 256 | lm loss: 3.792558E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.716 | TFLOPs: 26.15 | +7: iteration 36570/ 60336 | consumed samples: 9361920 | consumed tokens: 19173212160 | elapsed time per iteration (s): 0.15 | learning rate: 8.162E-05 | global batch size: 256 | lm loss: 3.784093E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.779 | TFLOPs: 25.92 | +7: iteration 36580/ 60336 | consumed samples: 9364480 | consumed tokens: 19178455040 | elapsed time per iteration (s): 0.15 | learning rate: 8.157E-05 | global batch size: 256 | lm loss: 3.785434E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.028 | TFLOPs: 26.14 | +7: iteration 36590/ 60336 | consumed samples: 9367040 | consumed tokens: 19183697920 | elapsed time per iteration (s): 0.15 | learning rate: 8.153E-05 | global batch size: 256 | lm loss: 3.789684E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.667 | TFLOPs: 26.15 | +7: iteration 36600/ 60336 | consumed samples: 9369600 | consumed tokens: 19188940800 | elapsed time per iteration (s): 0.15 | learning rate: 8.148E-05 | global batch size: 256 | lm loss: 3.774476E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.274 | TFLOPs: 26.15 | +7: iteration 36610/ 60336 | consumed samples: 9372160 | consumed tokens: 19194183680 | elapsed time per iteration (s): 0.15 | learning rate: 8.144E-05 | global batch size: 256 | lm loss: 3.784736E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.028 | TFLOPs: 26.16 | +7: iteration 36620/ 60336 | consumed samples: 9374720 | consumed tokens: 19199426560 | elapsed time per iteration (s): 0.15 | learning rate: 8.139E-05 | global batch size: 256 | lm loss: 3.787880E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.225 | TFLOPs: 26.07 | +7: iteration 36630/ 60336 | consumed samples: 9377280 | consumed tokens: 19204669440 | elapsed time per iteration (s): 0.15 | learning rate: 8.135E-05 | global batch size: 256 | lm loss: 3.804350E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.318 | TFLOPs: 26.08 | +7: iteration 36640/ 60336 | consumed samples: 9379840 | consumed tokens: 19209912320 | elapsed time per iteration (s): 0.15 | learning rate: 8.130E-05 | global batch size: 256 | lm loss: 3.786658E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.228 | TFLOPs: 26.04 | +7: iteration 36650/ 60336 | consumed samples: 9382400 | consumed tokens: 19215155200 | elapsed time per iteration (s): 0.15 | learning rate: 8.126E-05 | global batch size: 256 | lm loss: 3.771574E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.447 | TFLOPs: 26.09 | +7: iteration 36660/ 60336 | consumed samples: 9384960 | consumed tokens: 19220398080 | elapsed time per iteration (s): 0.15 | learning rate: 8.121E-05 | global batch size: 256 | lm loss: 3.776777E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.361 | TFLOPs: 26.12 | +7: iteration 36670/ 60336 | consumed samples: 9387520 | consumed tokens: 19225640960 | elapsed time per iteration (s): 0.15 | learning rate: 8.117E-05 | global batch size: 256 | lm loss: 3.794224E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.633 | TFLOPs: 26.12 | +7: iteration 36680/ 60336 | consumed samples: 9390080 | consumed tokens: 19230883840 | elapsed time per iteration (s): 0.15 | learning rate: 8.112E-05 | global batch size: 256 | lm loss: 3.784765E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.279 | TFLOPs: 26.13 | +7: iteration 36690/ 60336 | consumed samples: 9392640 | consumed tokens: 19236126720 | elapsed time per iteration (s): 0.15 | learning rate: 8.108E-05 | global batch size: 256 | lm loss: 3.774565E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.253 | TFLOPs: 26.13 | +7: iteration 36700/ 60336 | consumed samples: 9395200 | consumed tokens: 19241369600 | elapsed time per iteration (s): 0.15 | learning rate: 8.103E-05 | global batch size: 256 | lm loss: 3.786713E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.301 | TFLOPs: 26.12 | +7: iteration 36710/ 60336 | consumed samples: 9397760 | consumed tokens: 19246612480 | elapsed time per iteration (s): 0.15 | learning rate: 8.099E-05 | global batch size: 256 | lm loss: 3.772318E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.280 | TFLOPs: 26.12 | +7: iteration 36720/ 60336 | consumed samples: 9400320 | consumed tokens: 19251855360 | elapsed time per iteration (s): 0.15 | learning rate: 8.094E-05 | global batch size: 256 | lm loss: 3.792457E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.771 | TFLOPs: 26.12 | +7: iteration 36730/ 60336 | consumed samples: 9402880 | consumed tokens: 19257098240 | elapsed time per iteration (s): 0.15 | learning rate: 8.090E-05 | global batch size: 256 | lm loss: 3.799023E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.700 | TFLOPs: 26.14 | +7: iteration 36740/ 60336 | consumed samples: 9405440 | consumed tokens: 19262341120 | elapsed time per iteration (s): 0.15 | learning rate: 8.085E-05 | global batch size: 256 | lm loss: 3.790379E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.593 | TFLOPs: 26.15 | +7: iteration 36750/ 60336 | consumed samples: 9408000 | consumed tokens: 19267584000 | elapsed time per iteration (s): 0.15 | learning rate: 8.081E-05 | global batch size: 256 | lm loss: 3.770689E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.268 | TFLOPs: 26.13 | +7: iteration 36760/ 60336 | consumed samples: 9410560 | consumed tokens: 19272826880 | elapsed time per iteration (s): 0.15 | learning rate: 8.077E-05 | global batch size: 256 | lm loss: 3.766097E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.635 | TFLOPs: 26.12 | +7: iteration 36770/ 60336 | consumed samples: 9413120 | consumed tokens: 19278069760 | elapsed time per iteration (s): 0.15 | learning rate: 8.072E-05 | global batch size: 256 | lm loss: 3.778727E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 36780/ 60336 | consumed samples: 9415680 | consumed tokens: 19283312640 | elapsed time per iteration (s): 0.15 | learning rate: 8.068E-05 | global batch size: 256 | lm loss: 3.793432E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.932 | TFLOPs: 26.13 | +7: iteration 36790/ 60336 | consumed samples: 9418240 | consumed tokens: 19288555520 | elapsed time per iteration (s): 0.15 | learning rate: 8.063E-05 | global batch size: 256 | lm loss: 3.772842E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.010 | TFLOPs: 26.13 | +7: iteration 36800/ 60336 | consumed samples: 9420800 | consumed tokens: 19293798400 | elapsed time per iteration (s): 0.15 | learning rate: 8.059E-05 | global batch size: 256 | lm loss: 3.790029E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.630 | TFLOPs: 26.12 | +7: iteration 36810/ 60336 | consumed samples: 9423360 | consumed tokens: 19299041280 | elapsed time per iteration (s): 0.15 | learning rate: 8.054E-05 | global batch size: 256 | lm loss: 3.780793E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.183 | TFLOPs: 26.13 | +7: iteration 36820/ 60336 | consumed samples: 9425920 | consumed tokens: 19304284160 | elapsed time per iteration (s): 0.15 | learning rate: 8.050E-05 | global batch size: 256 | lm loss: 3.789452E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.359 | TFLOPs: 26.13 | +7: iteration 36830/ 60336 | consumed samples: 9428480 | consumed tokens: 19309527040 | elapsed time per iteration (s): 0.15 | learning rate: 8.045E-05 | global batch size: 256 | lm loss: 3.796378E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.174 | TFLOPs: 26.05 | +7: iteration 36840/ 60336 | consumed samples: 9431040 | consumed tokens: 19314769920 | elapsed time per iteration (s): 0.15 | learning rate: 8.041E-05 | global batch size: 256 | lm loss: 3.799338E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.454 | TFLOPs: 26.06 | +7: iteration 36850/ 60336 | consumed samples: 9433600 | consumed tokens: 19320012800 | elapsed time per iteration (s): 0.15 | learning rate: 8.036E-05 | global batch size: 256 | lm loss: 3.776960E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.078 | TFLOPs: 26.07 | +7: iteration 36860/ 60336 | consumed samples: 9436160 | consumed tokens: 19325255680 | elapsed time per iteration (s): 0.15 | learning rate: 8.032E-05 | global batch size: 256 | lm loss: 3.791571E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.566 | TFLOPs: 26.06 | +7: iteration 36870/ 60336 | consumed samples: 9438720 | consumed tokens: 19330498560 | elapsed time per iteration (s): 0.15 | learning rate: 8.027E-05 | global batch size: 256 | lm loss: 3.788570E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.528 | TFLOPs: 26.06 | +7: iteration 36880/ 60336 | consumed samples: 9441280 | consumed tokens: 19335741440 | elapsed time per iteration (s): 0.15 | learning rate: 8.023E-05 | global batch size: 256 | lm loss: 3.785562E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.960 | TFLOPs: 26.08 | +7: iteration 36890/ 60336 | consumed samples: 9443840 | consumed tokens: 19340984320 | elapsed time per iteration (s): 0.15 | learning rate: 8.018E-05 | global batch size: 256 | lm loss: 3.780329E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.238 | TFLOPs: 26.08 | +7: iteration 36900/ 60336 | consumed samples: 9446400 | consumed tokens: 19346227200 | elapsed time per iteration (s): 0.15 | learning rate: 8.014E-05 | global batch size: 256 | lm loss: 3.788158E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.218 | TFLOPs: 26.10 | +7: iteration 36910/ 60336 | consumed samples: 9448960 | consumed tokens: 19351470080 | elapsed time per iteration (s): 0.15 | learning rate: 8.009E-05 | global batch size: 256 | lm loss: 3.793449E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.910 | TFLOPs: 26.06 | +7: iteration 36920/ 60336 | consumed samples: 9451520 | consumed tokens: 19356712960 | elapsed time per iteration (s): 0.15 | learning rate: 8.005E-05 | global batch size: 256 | lm loss: 3.786520E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.865 | TFLOPs: 26.09 | +7: iteration 36930/ 60336 | consumed samples: 9454080 | consumed tokens: 19361955840 | elapsed time per iteration (s): 0.15 | learning rate: 8.001E-05 | global batch size: 256 | lm loss: 3.772893E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.946 | TFLOPs: 26.05 | +7: iteration 36940/ 60336 | consumed samples: 9456640 | consumed tokens: 19367198720 | elapsed time per iteration (s): 0.15 | learning rate: 7.996E-05 | global batch size: 256 | lm loss: 3.775796E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.147 | TFLOPs: 26.02 | +7: iteration 36950/ 60336 | consumed samples: 9459200 | consumed tokens: 19372441600 | elapsed time per iteration (s): 0.15 | learning rate: 7.992E-05 | global batch size: 256 | lm loss: 3.782822E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.155 | TFLOPs: 26.08 | +7: iteration 36960/ 60336 | consumed samples: 9461760 | consumed tokens: 19377684480 | elapsed time per iteration (s): 0.15 | learning rate: 7.987E-05 | global batch size: 256 | lm loss: 3.788030E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.414 | TFLOPs: 26.09 | +7: iteration 36970/ 60336 | consumed samples: 9464320 | consumed tokens: 19382927360 | elapsed time per iteration (s): 0.15 | learning rate: 7.983E-05 | global batch size: 256 | lm loss: 3.775611E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.511 | TFLOPs: 26.12 | +7: iteration 36980/ 60336 | consumed samples: 9466880 | consumed tokens: 19388170240 | elapsed time per iteration (s): 0.15 | learning rate: 7.978E-05 | global batch size: 256 | lm loss: 3.773132E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.804 | TFLOPs: 26.11 | +7: iteration 36990/ 60336 | consumed samples: 9469440 | consumed tokens: 19393413120 | elapsed time per iteration (s): 0.15 | learning rate: 7.974E-05 | global batch size: 256 | lm loss: 3.772047E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.039 | TFLOPs: 26.10 | +7: iteration 37000/ 60336 | consumed samples: 9472000 | consumed tokens: 19398656000 | elapsed time per iteration (s): 0.15 | learning rate: 7.969E-05 | global batch size: 256 | lm loss: 3.785258E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.685 | TFLOPs: 26.11 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 37000 | lm loss value: 3.930985E+00 | lm loss PPL: 5.095717E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 37000 to checkpoints_44m32b100m +0: [2023-03-17 01:54:45,252] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step37000 is begin to save! +0: [2023-03-17 01:54:45,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:54:45,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:54:45,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:54:45,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:54:45,325] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:54:45,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:54:45,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:54:45,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:54:45,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:54:45,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:54:45,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:54:45,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:54:45,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:54:45,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:54:45,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:54:45,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:54:45,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:54:45,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:54:45,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:54:45,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:54:45,383] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step37000/mp_rank_00_model_states.pt +0: [2023-03-17 01:54:45,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:54:45,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:54:45,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:54:45,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:54:45,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:54:45,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:54:45,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:54:45,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:54:45,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:54:45,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: successfully saved checkpoint at iteration 37000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.16 +7: iteration 37010/ 60336 | consumed samples: 9474560 | consumed tokens: 19403898880 | elapsed time per iteration (s): 0.18 | learning rate: 7.965E-05 | global batch size: 256 | lm loss: 3.800378E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.501 | TFLOPs: 22.78 | +7: iteration 37020/ 60336 | consumed samples: 9477120 | consumed tokens: 19409141760 | elapsed time per iteration (s): 0.15 | learning rate: 7.960E-05 | global batch size: 256 | lm loss: 3.794166E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.278 | TFLOPs: 26.12 | +7: iteration 37030/ 60336 | consumed samples: 9479680 | consumed tokens: 19414384640 | elapsed time per iteration (s): 0.15 | learning rate: 7.956E-05 | global batch size: 256 | lm loss: 3.780807E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.997 | TFLOPs: 26.11 | +7: iteration 37040/ 60336 | consumed samples: 9482240 | consumed tokens: 19419627520 | elapsed time per iteration (s): 0.15 | learning rate: 7.951E-05 | global batch size: 256 | lm loss: 3.774917E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.444 | TFLOPs: 26.09 | +7: iteration 37050/ 60336 | consumed samples: 9484800 | consumed tokens: 19424870400 | elapsed time per iteration (s): 0.15 | learning rate: 7.947E-05 | global batch size: 256 | lm loss: 3.787771E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.967 | TFLOPs: 26.11 | +7: iteration 37060/ 60336 | consumed samples: 9487360 | consumed tokens: 19430113280 | elapsed time per iteration (s): 0.15 | learning rate: 7.943E-05 | global batch size: 256 | lm loss: 3.792556E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.910 | TFLOPs: 26.08 | +7: iteration 37070/ 60336 | consumed samples: 9489920 | consumed tokens: 19435356160 | elapsed time per iteration (s): 0.15 | learning rate: 7.938E-05 | global batch size: 256 | lm loss: 3.785430E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.733 | TFLOPs: 26.12 | +7: iteration 37080/ 60336 | consumed samples: 9492480 | consumed tokens: 19440599040 | elapsed time per iteration (s): 0.15 | learning rate: 7.934E-05 | global batch size: 256 | lm loss: 3.787134E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.109 | TFLOPs: 26.13 | +7: iteration 37090/ 60336 | consumed samples: 9495040 | consumed tokens: 19445841920 | elapsed time per iteration (s): 0.15 | learning rate: 7.929E-05 | global batch size: 256 | lm loss: 3.779224E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.702 | TFLOPs: 26.14 | +7: iteration 37100/ 60336 | consumed samples: 9497600 | consumed tokens: 19451084800 | elapsed time per iteration (s): 0.15 | learning rate: 7.925E-05 | global batch size: 256 | lm loss: 3.772767E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.384 | TFLOPs: 26.12 | +7: iteration 37110/ 60336 | consumed samples: 9500160 | consumed tokens: 19456327680 | elapsed time per iteration (s): 0.15 | learning rate: 7.920E-05 | global batch size: 256 | lm loss: 3.793299E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.607 | TFLOPs: 26.09 | +7: iteration 37120/ 60336 | consumed samples: 9502720 | consumed tokens: 19461570560 | elapsed time per iteration (s): 0.15 | learning rate: 7.916E-05 | global batch size: 256 | lm loss: 3.783489E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.243 | TFLOPs: 26.15 | +7: iteration 37130/ 60336 | consumed samples: 9505280 | consumed tokens: 19466813440 | elapsed time per iteration (s): 0.15 | learning rate: 7.911E-05 | global batch size: 256 | lm loss: 3.790345E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.877 | TFLOPs: 26.11 | +7: iteration 37140/ 60336 | consumed samples: 9507840 | consumed tokens: 19472056320 | elapsed time per iteration (s): 0.15 | learning rate: 7.907E-05 | global batch size: 256 | lm loss: 3.777987E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.194 | TFLOPs: 26.11 | +7: iteration 37150/ 60336 | consumed samples: 9510400 | consumed tokens: 19477299200 | elapsed time per iteration (s): 0.15 | learning rate: 7.903E-05 | global batch size: 256 | lm loss: 3.779286E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.005 | TFLOPs: 26.14 | +7: iteration 37160/ 60336 | consumed samples: 9512960 | consumed tokens: 19482542080 | elapsed time per iteration (s): 0.15 | learning rate: 7.898E-05 | global batch size: 256 | lm loss: 3.773472E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.703 | TFLOPs: 26.12 | +7: iteration 37170/ 60336 | consumed samples: 9515520 | consumed tokens: 19487784960 | elapsed time per iteration (s): 0.15 | learning rate: 7.894E-05 | global batch size: 256 | lm loss: 3.781953E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.748 | TFLOPs: 26.11 | +7: iteration 37180/ 60336 | consumed samples: 9518080 | consumed tokens: 19493027840 | elapsed time per iteration (s): 0.15 | learning rate: 7.889E-05 | global batch size: 256 | lm loss: 3.784728E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.793 | TFLOPs: 26.12 | +7: iteration 37190/ 60336 | consumed samples: 9520640 | consumed tokens: 19498270720 | elapsed time per iteration (s): 0.15 | learning rate: 7.885E-05 | global batch size: 256 | lm loss: 3.781977E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.916 | TFLOPs: 26.11 | +7: iteration 37200/ 60336 | consumed samples: 9523200 | consumed tokens: 19503513600 | elapsed time per iteration (s): 0.16 | learning rate: 7.880E-05 | global batch size: 256 | lm loss: 3.794599E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.967 | TFLOPs: 25.88 | +7: iteration 37210/ 60336 | consumed samples: 9525760 | consumed tokens: 19508756480 | elapsed time per iteration (s): 0.16 | learning rate: 7.876E-05 | global batch size: 256 | lm loss: 3.784908E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.895 | TFLOPs: 25.83 | +7: iteration 37220/ 60336 | consumed samples: 9528320 | consumed tokens: 19513999360 | elapsed time per iteration (s): 0.15 | learning rate: 7.871E-05 | global batch size: 256 | lm loss: 3.778014E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.693 | TFLOPs: 26.11 | +7: iteration 37230/ 60336 | consumed samples: 9530880 | consumed tokens: 19519242240 | elapsed time per iteration (s): 0.15 | learning rate: 7.867E-05 | global batch size: 256 | lm loss: 3.794586E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.382 | TFLOPs: 26.12 | +7: iteration 37240/ 60336 | consumed samples: 9533440 | consumed tokens: 19524485120 | elapsed time per iteration (s): 0.15 | learning rate: 7.863E-05 | global batch size: 256 | lm loss: 3.780606E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.690 | TFLOPs: 26.12 | +7: iteration 37250/ 60336 | consumed samples: 9536000 | consumed tokens: 19529728000 | elapsed time per iteration (s): 0.15 | learning rate: 7.858E-05 | global batch size: 256 | lm loss: 3.789581E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.553 | TFLOPs: 26.14 | +7: iteration 37260/ 60336 | consumed samples: 9538560 | consumed tokens: 19534970880 | elapsed time per iteration (s): 0.15 | learning rate: 7.854E-05 | global batch size: 256 | lm loss: 3.780024E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.166 | TFLOPs: 26.13 | +7: iteration 37270/ 60336 | consumed samples: 9541120 | consumed tokens: 19540213760 | elapsed time per iteration (s): 0.15 | learning rate: 7.849E-05 | global batch size: 256 | lm loss: 3.788345E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.262 | TFLOPs: 26.13 | +7: iteration 37280/ 60336 | consumed samples: 9543680 | consumed tokens: 19545456640 | elapsed time per iteration (s): 0.15 | learning rate: 7.845E-05 | global batch size: 256 | lm loss: 3.791711E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.142 | TFLOPs: 26.13 | +7: iteration 37290/ 60336 | consumed samples: 9546240 | consumed tokens: 19550699520 | elapsed time per iteration (s): 0.15 | learning rate: 7.840E-05 | global batch size: 256 | lm loss: 3.795147E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.071 | TFLOPs: 26.13 | +7: iteration 37300/ 60336 | consumed samples: 9548800 | consumed tokens: 19555942400 | elapsed time per iteration (s): 0.15 | learning rate: 7.836E-05 | global batch size: 256 | lm loss: 3.786037E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.651 | TFLOPs: 26.06 | +7: iteration 37310/ 60336 | consumed samples: 9551360 | consumed tokens: 19561185280 | elapsed time per iteration (s): 0.15 | learning rate: 7.832E-05 | global batch size: 256 | lm loss: 3.772569E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.569 | TFLOPs: 26.07 | +7: iteration 37320/ 60336 | consumed samples: 9553920 | consumed tokens: 19566428160 | elapsed time per iteration (s): 0.15 | learning rate: 7.827E-05 | global batch size: 256 | lm loss: 3.782705E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.882 | TFLOPs: 26.08 | +7: iteration 37330/ 60336 | consumed samples: 9556480 | consumed tokens: 19571671040 | elapsed time per iteration (s): 0.15 | learning rate: 7.823E-05 | global batch size: 256 | lm loss: 3.779171E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.189 | TFLOPs: 26.13 | +7: iteration 37340/ 60336 | consumed samples: 9559040 | consumed tokens: 19576913920 | elapsed time per iteration (s): 0.15 | learning rate: 7.818E-05 | global batch size: 256 | lm loss: 3.789968E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.158 | TFLOPs: 26.13 | +7: iteration 37350/ 60336 | consumed samples: 9561600 | consumed tokens: 19582156800 | elapsed time per iteration (s): 0.15 | learning rate: 7.814E-05 | global batch size: 256 | lm loss: 3.786442E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.576 | TFLOPs: 26.14 | +7: iteration 37360/ 60336 | consumed samples: 9564160 | consumed tokens: 19587399680 | elapsed time per iteration (s): 0.15 | learning rate: 7.809E-05 | global batch size: 256 | lm loss: 3.789678E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 37370/ 60336 | consumed samples: 9566720 | consumed tokens: 19592642560 | elapsed time per iteration (s): 0.15 | learning rate: 7.805E-05 | global batch size: 256 | lm loss: 3.791870E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.870 | TFLOPs: 26.17 | +7: iteration 37380/ 60336 | consumed samples: 9569280 | consumed tokens: 19597885440 | elapsed time per iteration (s): 0.15 | learning rate: 7.801E-05 | global batch size: 256 | lm loss: 3.782686E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.450 | TFLOPs: 26.09 | +7: iteration 37390/ 60336 | consumed samples: 9571840 | consumed tokens: 19603128320 | elapsed time per iteration (s): 0.15 | learning rate: 7.796E-05 | global batch size: 256 | lm loss: 3.779066E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.752 | TFLOPs: 26.08 | +7: iteration 37400/ 60336 | consumed samples: 9574400 | consumed tokens: 19608371200 | elapsed time per iteration (s): 0.15 | learning rate: 7.792E-05 | global batch size: 256 | lm loss: 3.789152E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.917 | TFLOPs: 26.09 | +7: iteration 37410/ 60336 | consumed samples: 9576960 | consumed tokens: 19613614080 | elapsed time per iteration (s): 0.15 | learning rate: 7.787E-05 | global batch size: 256 | lm loss: 3.778825E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.400 | TFLOPs: 26.09 | +7: iteration 37420/ 60336 | consumed samples: 9579520 | consumed tokens: 19618856960 | elapsed time per iteration (s): 0.15 | learning rate: 7.783E-05 | global batch size: 256 | lm loss: 3.776066E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.492 | TFLOPs: 26.07 | +7: iteration 37430/ 60336 | consumed samples: 9582080 | consumed tokens: 19624099840 | elapsed time per iteration (s): 0.15 | learning rate: 7.778E-05 | global batch size: 256 | lm loss: 3.788181E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.123 | TFLOPs: 26.08 | +7: iteration 37440/ 60336 | consumed samples: 9584640 | consumed tokens: 19629342720 | elapsed time per iteration (s): 0.15 | learning rate: 7.774E-05 | global batch size: 256 | lm loss: 3.780911E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.130 | TFLOPs: 26.07 | +7: iteration 37450/ 60336 | consumed samples: 9587200 | consumed tokens: 19634585600 | elapsed time per iteration (s): 0.15 | learning rate: 7.770E-05 | global batch size: 256 | lm loss: 3.782745E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.639 | TFLOPs: 26.04 | +7: iteration 37460/ 60336 | consumed samples: 9589760 | consumed tokens: 19639828480 | elapsed time per iteration (s): 0.15 | learning rate: 7.765E-05 | global batch size: 256 | lm loss: 3.785714E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.812 | TFLOPs: 26.05 | +7: iteration 37470/ 60336 | consumed samples: 9592320 | consumed tokens: 19645071360 | elapsed time per iteration (s): 0.15 | learning rate: 7.761E-05 | global batch size: 256 | lm loss: 3.772207E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.357 | TFLOPs: 26.04 | +7: iteration 37480/ 60336 | consumed samples: 9594880 | consumed tokens: 19650314240 | elapsed time per iteration (s): 0.15 | learning rate: 7.756E-05 | global batch size: 256 | lm loss: 3.788853E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.931 | TFLOPs: 26.05 | +7: iteration 37490/ 60336 | consumed samples: 9597440 | consumed tokens: 19655557120 | elapsed time per iteration (s): 0.15 | learning rate: 7.752E-05 | global batch size: 256 | lm loss: 3.783612E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.880 | TFLOPs: 26.03 | +7: iteration 37500/ 60336 | consumed samples: 9600000 | consumed tokens: 19660800000 | elapsed time per iteration (s): 0.15 | learning rate: 7.748E-05 | global batch size: 256 | lm loss: 3.783008E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.068 | TFLOPs: 26.02 | +7: iteration 37510/ 60336 | consumed samples: 9602560 | consumed tokens: 19666042880 | elapsed time per iteration (s): 0.15 | learning rate: 7.743E-05 | global batch size: 256 | lm loss: 3.795692E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.989 | TFLOPs: 26.02 | +7: iteration 37520/ 60336 | consumed samples: 9605120 | consumed tokens: 19671285760 | elapsed time per iteration (s): 0.15 | learning rate: 7.739E-05 | global batch size: 256 | lm loss: 3.798857E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.022 | TFLOPs: 26.00 | +7: iteration 37530/ 60336 | consumed samples: 9607680 | consumed tokens: 19676528640 | elapsed time per iteration (s): 0.15 | learning rate: 7.734E-05 | global batch size: 256 | lm loss: 3.780602E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.586 | TFLOPs: 26.03 | +7: iteration 37540/ 60336 | consumed samples: 9610240 | consumed tokens: 19681771520 | elapsed time per iteration (s): 0.15 | learning rate: 7.730E-05 | global batch size: 256 | lm loss: 3.777936E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.619 | TFLOPs: 26.01 | +7: iteration 37550/ 60336 | consumed samples: 9612800 | consumed tokens: 19687014400 | elapsed time per iteration (s): 0.15 | learning rate: 7.725E-05 | global batch size: 256 | lm loss: 3.788634E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.162 | TFLOPs: 26.07 | +7: iteration 37560/ 60336 | consumed samples: 9615360 | consumed tokens: 19692257280 | elapsed time per iteration (s): 0.15 | learning rate: 7.721E-05 | global batch size: 256 | lm loss: 3.781376E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.234 | TFLOPs: 26.07 | +7: iteration 37570/ 60336 | consumed samples: 9617920 | consumed tokens: 19697500160 | elapsed time per iteration (s): 0.15 | learning rate: 7.717E-05 | global batch size: 256 | lm loss: 3.784356E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.639 | TFLOPs: 26.06 | +7: iteration 37580/ 60336 | consumed samples: 9620480 | consumed tokens: 19702743040 | elapsed time per iteration (s): 0.15 | learning rate: 7.712E-05 | global batch size: 256 | lm loss: 3.788686E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.334 | TFLOPs: 26.09 | +7: iteration 37590/ 60336 | consumed samples: 9623040 | consumed tokens: 19707985920 | elapsed time per iteration (s): 0.15 | learning rate: 7.708E-05 | global batch size: 256 | lm loss: 3.786590E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.103 | TFLOPs: 25.97 | +7: iteration 37600/ 60336 | consumed samples: 9625600 | consumed tokens: 19713228800 | elapsed time per iteration (s): 0.15 | learning rate: 7.703E-05 | global batch size: 256 | lm loss: 3.795146E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.553 | TFLOPs: 26.07 | +7: iteration 37610/ 60336 | consumed samples: 9628160 | consumed tokens: 19718471680 | elapsed time per iteration (s): 0.15 | learning rate: 7.699E-05 | global batch size: 256 | lm loss: 3.789363E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.053 | TFLOPs: 26.07 | +7: iteration 37620/ 60336 | consumed samples: 9630720 | consumed tokens: 19723714560 | elapsed time per iteration (s): 0.15 | learning rate: 7.695E-05 | global batch size: 256 | lm loss: 3.777151E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.156 | TFLOPs: 26.05 | +7: iteration 37630/ 60336 | consumed samples: 9633280 | consumed tokens: 19728957440 | elapsed time per iteration (s): 0.15 | learning rate: 7.690E-05 | global batch size: 256 | lm loss: 3.778148E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.172 | TFLOPs: 26.07 | +7: iteration 37640/ 60336 | consumed samples: 9635840 | consumed tokens: 19734200320 | elapsed time per iteration (s): 0.15 | learning rate: 7.686E-05 | global batch size: 256 | lm loss: 3.790366E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.220 | TFLOPs: 26.07 | +7: iteration 37650/ 60336 | consumed samples: 9638400 | consumed tokens: 19739443200 | elapsed time per iteration (s): 0.15 | learning rate: 7.681E-05 | global batch size: 256 | lm loss: 3.794000E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.817 | TFLOPs: 26.06 | +7: iteration 37660/ 60336 | consumed samples: 9640960 | consumed tokens: 19744686080 | elapsed time per iteration (s): 0.15 | learning rate: 7.677E-05 | global batch size: 256 | lm loss: 3.777646E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.300 | TFLOPs: 26.07 | +7: iteration 37670/ 60336 | consumed samples: 9643520 | consumed tokens: 19749928960 | elapsed time per iteration (s): 0.15 | learning rate: 7.673E-05 | global batch size: 256 | lm loss: 3.780243E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.183 | TFLOPs: 26.04 | +7: iteration 37680/ 60336 | consumed samples: 9646080 | consumed tokens: 19755171840 | elapsed time per iteration (s): 0.15 | learning rate: 7.668E-05 | global batch size: 256 | lm loss: 3.766506E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.432 | TFLOPs: 26.06 | +7: iteration 37690/ 60336 | consumed samples: 9648640 | consumed tokens: 19760414720 | elapsed time per iteration (s): 0.15 | learning rate: 7.664E-05 | global batch size: 256 | lm loss: 3.777689E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.979 | TFLOPs: 26.05 | +7: iteration 37700/ 60336 | consumed samples: 9651200 | consumed tokens: 19765657600 | elapsed time per iteration (s): 0.15 | learning rate: 7.659E-05 | global batch size: 256 | lm loss: 3.791009E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.693 | TFLOPs: 26.08 | +7: iteration 37710/ 60336 | consumed samples: 9653760 | consumed tokens: 19770900480 | elapsed time per iteration (s): 0.15 | learning rate: 7.655E-05 | global batch size: 256 | lm loss: 3.788086E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.318 | TFLOPs: 26.07 | +7: iteration 37720/ 60336 | consumed samples: 9656320 | consumed tokens: 19776143360 | elapsed time per iteration (s): 0.15 | learning rate: 7.651E-05 | global batch size: 256 | lm loss: 3.788391E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.102 | TFLOPs: 26.05 | +7: iteration 37730/ 60336 | consumed samples: 9658880 | consumed tokens: 19781386240 | elapsed time per iteration (s): 0.15 | learning rate: 7.646E-05 | global batch size: 256 | lm loss: 3.780079E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.281 | TFLOPs: 26.07 | +7: iteration 37740/ 60336 | consumed samples: 9661440 | consumed tokens: 19786629120 | elapsed time per iteration (s): 0.15 | learning rate: 7.642E-05 | global batch size: 256 | lm loss: 3.786359E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.637 | TFLOPs: 26.06 | +7: iteration 37750/ 60336 | consumed samples: 9664000 | consumed tokens: 19791872000 | elapsed time per iteration (s): 0.15 | learning rate: 7.637E-05 | global batch size: 256 | lm loss: 3.776277E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.380 | TFLOPs: 26.04 | +7: iteration 37760/ 60336 | consumed samples: 9666560 | consumed tokens: 19797114880 | elapsed time per iteration (s): 0.15 | learning rate: 7.633E-05 | global batch size: 256 | lm loss: 3.781862E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.270 | TFLOPs: 26.07 | +7: iteration 37770/ 60336 | consumed samples: 9669120 | consumed tokens: 19802357760 | elapsed time per iteration (s): 0.15 | learning rate: 7.629E-05 | global batch size: 256 | lm loss: 3.797761E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.860 | TFLOPs: 26.06 | +7: iteration 37780/ 60336 | consumed samples: 9671680 | consumed tokens: 19807600640 | elapsed time per iteration (s): 0.15 | learning rate: 7.624E-05 | global batch size: 256 | lm loss: 3.778115E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.523 | TFLOPs: 26.06 | +7: iteration 37790/ 60336 | consumed samples: 9674240 | consumed tokens: 19812843520 | elapsed time per iteration (s): 0.15 | learning rate: 7.620E-05 | global batch size: 256 | lm loss: 3.790197E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.550 | TFLOPs: 26.06 | +7: iteration 37800/ 60336 | consumed samples: 9676800 | consumed tokens: 19818086400 | elapsed time per iteration (s): 0.15 | learning rate: 7.616E-05 | global batch size: 256 | lm loss: 3.779950E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.932 | TFLOPs: 26.05 | +7: iteration 37810/ 60336 | consumed samples: 9679360 | consumed tokens: 19823329280 | elapsed time per iteration (s): 0.15 | learning rate: 7.611E-05 | global batch size: 256 | lm loss: 3.783033E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.989 | TFLOPs: 26.05 | +7: iteration 37820/ 60336 | consumed samples: 9681920 | consumed tokens: 19828572160 | elapsed time per iteration (s): 0.15 | learning rate: 7.607E-05 | global batch size: 256 | lm loss: 3.783876E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.118 | TFLOPs: 26.07 | +7: iteration 37830/ 60336 | consumed samples: 9684480 | consumed tokens: 19833815040 | elapsed time per iteration (s): 0.15 | learning rate: 7.602E-05 | global batch size: 256 | lm loss: 3.778509E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.758 | TFLOPs: 26.04 | +7: iteration 37840/ 60336 | consumed samples: 9687040 | consumed tokens: 19839057920 | elapsed time per iteration (s): 0.15 | learning rate: 7.598E-05 | global batch size: 256 | lm loss: 3.775848E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.629 | TFLOPs: 26.07 | +7: iteration 37850/ 60336 | consumed samples: 9689600 | consumed tokens: 19844300800 | elapsed time per iteration (s): 0.15 | learning rate: 7.594E-05 | global batch size: 256 | lm loss: 3.775996E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.197 | TFLOPs: 26.05 | +7: iteration 37860/ 60336 | consumed samples: 9692160 | consumed tokens: 19849543680 | elapsed time per iteration (s): 0.15 | learning rate: 7.589E-05 | global batch size: 256 | lm loss: 3.800475E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.407 | TFLOPs: 26.04 | +7: iteration 37870/ 60336 | consumed samples: 9694720 | consumed tokens: 19854786560 | elapsed time per iteration (s): 0.15 | learning rate: 7.585E-05 | global batch size: 256 | lm loss: 3.788484E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.018 | TFLOPs: 26.08 | +7: iteration 37880/ 60336 | consumed samples: 9697280 | consumed tokens: 19860029440 | elapsed time per iteration (s): 0.15 | learning rate: 7.580E-05 | global batch size: 256 | lm loss: 3.785777E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.841 | TFLOPs: 26.03 | +7: iteration 37890/ 60336 | consumed samples: 9699840 | consumed tokens: 19865272320 | elapsed time per iteration (s): 0.15 | learning rate: 7.576E-05 | global batch size: 256 | lm loss: 3.765281E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.393 | TFLOPs: 26.04 | +7: iteration 37900/ 60336 | consumed samples: 9702400 | consumed tokens: 19870515200 | elapsed time per iteration (s): 0.15 | learning rate: 7.572E-05 | global batch size: 256 | lm loss: 3.778577E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.518 | TFLOPs: 26.06 | +7: iteration 37910/ 60336 | consumed samples: 9704960 | consumed tokens: 19875758080 | elapsed time per iteration (s): 0.15 | learning rate: 7.567E-05 | global batch size: 256 | lm loss: 3.771922E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.237 | TFLOPs: 26.02 | +7: iteration 37920/ 60336 | consumed samples: 9707520 | consumed tokens: 19881000960 | elapsed time per iteration (s): 0.15 | learning rate: 7.563E-05 | global batch size: 256 | lm loss: 3.782661E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.910 | TFLOPs: 26.06 | +7: iteration 37930/ 60336 | consumed samples: 9710080 | consumed tokens: 19886243840 | elapsed time per iteration (s): 0.15 | learning rate: 7.559E-05 | global batch size: 256 | lm loss: 3.777060E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.381 | TFLOPs: 26.05 | +7: iteration 37940/ 60336 | consumed samples: 9712640 | consumed tokens: 19891486720 | elapsed time per iteration (s): 0.15 | learning rate: 7.554E-05 | global batch size: 256 | lm loss: 3.802867E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.463 | TFLOPs: 26.04 | +7: iteration 37950/ 60336 | consumed samples: 9715200 | consumed tokens: 19896729600 | elapsed time per iteration (s): 0.15 | learning rate: 7.550E-05 | global batch size: 256 | lm loss: 3.782961E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.926 | TFLOPs: 26.05 | +7: iteration 37960/ 60336 | consumed samples: 9717760 | consumed tokens: 19901972480 | elapsed time per iteration (s): 0.15 | learning rate: 7.545E-05 | global batch size: 256 | lm loss: 3.784827E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.907 | TFLOPs: 26.06 | +7: iteration 37970/ 60336 | consumed samples: 9720320 | consumed tokens: 19907215360 | elapsed time per iteration (s): 0.15 | learning rate: 7.541E-05 | global batch size: 256 | lm loss: 3.782592E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.861 | TFLOPs: 26.08 | +7: iteration 37980/ 60336 | consumed samples: 9722880 | consumed tokens: 19912458240 | elapsed time per iteration (s): 0.15 | learning rate: 7.537E-05 | global batch size: 256 | lm loss: 3.791614E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.057 | TFLOPs: 26.08 | +7: iteration 37990/ 60336 | consumed samples: 9725440 | consumed tokens: 19917701120 | elapsed time per iteration (s): 0.15 | learning rate: 7.532E-05 | global batch size: 256 | lm loss: 3.774984E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.172 | TFLOPs: 26.05 | +0: [2023-03-17 01:57:19,408] [INFO] [logging.py:68:log_dist] [Rank 0] step=38000, skipped=0, lr=[7.528014140367449e-05, 7.528014140367449e-05, 7.528014140367449e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 38000/ 60336 | consumed samples: 9728000 | consumed tokens: 19922944000 | elapsed time per iteration (s): 0.15 | learning rate: 7.528E-05 | global batch size: 256 | lm loss: 3.776178E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.185 | TFLOPs: 26.05 | +0: steps: 38000 loss: 3.7721 iter time (s): 0.152 samples/sec: 1679.649 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 38000 | lm loss value: 3.893603E+00 | lm loss PPL: 4.908741E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 38000 to checkpoints_44m32b100m +0: [2023-03-17 01:57:19,480] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step38000 is begin to save! +0: [2023-03-17 01:57:19,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:57:19,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:57:19,544] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:57:19,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:57:19,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:57:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:57:19,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:57:19,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:57:19,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:57:19,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:57:19,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:57:19,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:57:19,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:57:19,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:57:19,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:57:19,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:57:19,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:57:19,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:57:19,609] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:57:19,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:57:19,610] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step38000/mp_rank_00_model_states.pt +0: [2023-03-17 01:57:19,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:57:19,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:57:19,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:57:19,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:57:19,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:57:19,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:57:19,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:57:19,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:57:19,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:57:19,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:57:19,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:57:19,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:57:19,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:57:19,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:57:19,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:57:19,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: successfully saved checkpoint at iteration 38000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.95 +7: iteration 38010/ 60336 | consumed samples: 9730560 | consumed tokens: 19928186880 | elapsed time per iteration (s): 0.18 | learning rate: 7.524E-05 | global batch size: 256 | lm loss: 3.777603E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.271 | TFLOPs: 22.52 | +7: iteration 38020/ 60336 | consumed samples: 9733120 | consumed tokens: 19933429760 | elapsed time per iteration (s): 0.15 | learning rate: 7.519E-05 | global batch size: 256 | lm loss: 3.783178E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.846 | TFLOPs: 26.06 | +7: iteration 38030/ 60336 | consumed samples: 9735680 | consumed tokens: 19938672640 | elapsed time per iteration (s): 0.15 | learning rate: 7.515E-05 | global batch size: 256 | lm loss: 3.770351E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.057 | TFLOPs: 26.11 | +7: iteration 38040/ 60336 | consumed samples: 9738240 | consumed tokens: 19943915520 | elapsed time per iteration (s): 0.15 | learning rate: 7.511E-05 | global batch size: 256 | lm loss: 3.794086E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.418 | TFLOPs: 26.12 | +7: iteration 38050/ 60336 | consumed samples: 9740800 | consumed tokens: 19949158400 | elapsed time per iteration (s): 0.15 | learning rate: 7.506E-05 | global batch size: 256 | lm loss: 3.789529E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.297 | TFLOPs: 26.13 | +7: iteration 38060/ 60336 | consumed samples: 9743360 | consumed tokens: 19954401280 | elapsed time per iteration (s): 0.15 | learning rate: 7.502E-05 | global batch size: 256 | lm loss: 3.775563E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.799 | TFLOPs: 26.12 | +7: iteration 38070/ 60336 | consumed samples: 9745920 | consumed tokens: 19959644160 | elapsed time per iteration (s): 0.15 | learning rate: 7.497E-05 | global batch size: 256 | lm loss: 3.776508E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.356 | TFLOPs: 26.15 | +7: iteration 38080/ 60336 | consumed samples: 9748480 | consumed tokens: 19964887040 | elapsed time per iteration (s): 0.15 | learning rate: 7.493E-05 | global batch size: 256 | lm loss: 3.778007E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.148 | TFLOPs: 26.15 | +7: iteration 38090/ 60336 | consumed samples: 9751040 | consumed tokens: 19970129920 | elapsed time per iteration (s): 0.15 | learning rate: 7.489E-05 | global batch size: 256 | lm loss: 3.770723E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.115 | TFLOPs: 26.11 | +7: iteration 38100/ 60336 | consumed samples: 9753600 | consumed tokens: 19975372800 | elapsed time per iteration (s): 0.15 | learning rate: 7.484E-05 | global batch size: 256 | lm loss: 3.800562E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.187 | TFLOPs: 26.16 | +7: iteration 38110/ 60336 | consumed samples: 9756160 | consumed tokens: 19980615680 | elapsed time per iteration (s): 0.15 | learning rate: 7.480E-05 | global batch size: 256 | lm loss: 3.778469E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.862 | TFLOPs: 26.16 | +7: iteration 38120/ 60336 | consumed samples: 9758720 | consumed tokens: 19985858560 | elapsed time per iteration (s): 0.15 | learning rate: 7.476E-05 | global batch size: 256 | lm loss: 3.785068E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.527 | TFLOPs: 26.15 | +7: iteration 38130/ 60336 | consumed samples: 9761280 | consumed tokens: 19991101440 | elapsed time per iteration (s): 0.15 | learning rate: 7.471E-05 | global batch size: 256 | lm loss: 3.777538E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.914 | TFLOPs: 26.14 | +7: iteration 38140/ 60336 | consumed samples: 9763840 | consumed tokens: 19996344320 | elapsed time per iteration (s): 0.15 | learning rate: 7.467E-05 | global batch size: 256 | lm loss: 3.790333E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.952 | TFLOPs: 26.14 | +7: iteration 38150/ 60336 | consumed samples: 9766400 | consumed tokens: 20001587200 | elapsed time per iteration (s): 0.15 | learning rate: 7.463E-05 | global batch size: 256 | lm loss: 3.781517E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.182 | TFLOPs: 26.13 | +7: iteration 38160/ 60336 | consumed samples: 9768960 | consumed tokens: 20006830080 | elapsed time per iteration (s): 0.15 | learning rate: 7.458E-05 | global batch size: 256 | lm loss: 3.773562E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.874 | TFLOPs: 26.13 | +7: iteration 38170/ 60336 | consumed samples: 9771520 | consumed tokens: 20012072960 | elapsed time per iteration (s): 0.15 | learning rate: 7.454E-05 | global batch size: 256 | lm loss: 3.782853E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.100 | TFLOPs: 26.13 | +7: iteration 38180/ 60336 | consumed samples: 9774080 | consumed tokens: 20017315840 | elapsed time per iteration (s): 0.15 | learning rate: 7.450E-05 | global batch size: 256 | lm loss: 3.770726E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.215 | TFLOPs: 26.13 | +7: iteration 38190/ 60336 | consumed samples: 9776640 | consumed tokens: 20022558720 | elapsed time per iteration (s): 0.15 | learning rate: 7.445E-05 | global batch size: 256 | lm loss: 3.779332E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.832 | TFLOPs: 26.14 | +7: iteration 38200/ 60336 | consumed samples: 9779200 | consumed tokens: 20027801600 | elapsed time per iteration (s): 0.15 | learning rate: 7.441E-05 | global batch size: 256 | lm loss: 3.795095E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.779 | TFLOPs: 26.12 | +7: iteration 38210/ 60336 | consumed samples: 9781760 | consumed tokens: 20033044480 | elapsed time per iteration (s): 0.15 | learning rate: 7.437E-05 | global batch size: 256 | lm loss: 3.782022E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.830 | TFLOPs: 25.95 | +7: iteration 38220/ 60336 | consumed samples: 9784320 | consumed tokens: 20038287360 | elapsed time per iteration (s): 0.15 | learning rate: 7.432E-05 | global batch size: 256 | lm loss: 3.784455E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.029 | TFLOPs: 26.14 | +7: iteration 38230/ 60336 | consumed samples: 9786880 | consumed tokens: 20043530240 | elapsed time per iteration (s): 0.15 | learning rate: 7.428E-05 | global batch size: 256 | lm loss: 3.776090E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.377 | TFLOPs: 26.15 | +7: iteration 38240/ 60336 | consumed samples: 9789440 | consumed tokens: 20048773120 | elapsed time per iteration (s): 0.16 | learning rate: 7.423E-05 | global batch size: 256 | lm loss: 3.782243E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.085 | TFLOPs: 25.52 | +7: iteration 38250/ 60336 | consumed samples: 9792000 | consumed tokens: 20054016000 | elapsed time per iteration (s): 0.15 | learning rate: 7.419E-05 | global batch size: 256 | lm loss: 3.782141E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.642 | TFLOPs: 26.15 | +7: iteration 38260/ 60336 | consumed samples: 9794560 | consumed tokens: 20059258880 | elapsed time per iteration (s): 0.15 | learning rate: 7.415E-05 | global batch size: 256 | lm loss: 3.783490E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.219 | TFLOPs: 26.13 | +7: iteration 38270/ 60336 | consumed samples: 9797120 | consumed tokens: 20064501760 | elapsed time per iteration (s): 0.15 | learning rate: 7.410E-05 | global batch size: 256 | lm loss: 3.788006E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.894 | TFLOPs: 26.14 | +7: iteration 38280/ 60336 | consumed samples: 9799680 | consumed tokens: 20069744640 | elapsed time per iteration (s): 0.15 | learning rate: 7.406E-05 | global batch size: 256 | lm loss: 3.785955E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.710 | TFLOPs: 26.12 | +7: iteration 38290/ 60336 | consumed samples: 9802240 | consumed tokens: 20074987520 | elapsed time per iteration (s): 0.15 | learning rate: 7.402E-05 | global batch size: 256 | lm loss: 3.780589E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.776 | TFLOPs: 26.15 | +7: iteration 38300/ 60336 | consumed samples: 9804800 | consumed tokens: 20080230400 | elapsed time per iteration (s): 0.15 | learning rate: 7.397E-05 | global batch size: 256 | lm loss: 3.786166E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.460 | TFLOPs: 26.13 | +7: iteration 38310/ 60336 | consumed samples: 9807360 | consumed tokens: 20085473280 | elapsed time per iteration (s): 0.15 | learning rate: 7.393E-05 | global batch size: 256 | lm loss: 3.774227E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.042 | TFLOPs: 26.13 | +7: iteration 38320/ 60336 | consumed samples: 9809920 | consumed tokens: 20090716160 | elapsed time per iteration (s): 0.15 | learning rate: 7.389E-05 | global batch size: 256 | lm loss: 3.784135E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.703 | TFLOPs: 26.14 | +7: iteration 38330/ 60336 | consumed samples: 9812480 | consumed tokens: 20095959040 | elapsed time per iteration (s): 0.15 | learning rate: 7.384E-05 | global batch size: 256 | lm loss: 3.758656E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.651 | TFLOPs: 26.12 | +7: iteration 38340/ 60336 | consumed samples: 9815040 | consumed tokens: 20101201920 | elapsed time per iteration (s): 0.15 | learning rate: 7.380E-05 | global batch size: 256 | lm loss: 3.788791E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.373 | TFLOPs: 26.15 | +7: iteration 38350/ 60336 | consumed samples: 9817600 | consumed tokens: 20106444800 | elapsed time per iteration (s): 0.15 | learning rate: 7.376E-05 | global batch size: 256 | lm loss: 3.775603E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.711 | TFLOPs: 26.15 | +7: iteration 38360/ 60336 | consumed samples: 9820160 | consumed tokens: 20111687680 | elapsed time per iteration (s): 0.15 | learning rate: 7.371E-05 | global batch size: 256 | lm loss: 3.778349E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.889 | TFLOPs: 25.95 | +7: iteration 38370/ 60336 | consumed samples: 9822720 | consumed tokens: 20116930560 | elapsed time per iteration (s): 0.15 | learning rate: 7.367E-05 | global batch size: 256 | lm loss: 3.763787E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.846 | TFLOPs: 26.11 | +7: iteration 38380/ 60336 | consumed samples: 9825280 | consumed tokens: 20122173440 | elapsed time per iteration (s): 0.15 | learning rate: 7.363E-05 | global batch size: 256 | lm loss: 3.791965E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.429 | TFLOPs: 26.12 | +7: iteration 38390/ 60336 | consumed samples: 9827840 | consumed tokens: 20127416320 | elapsed time per iteration (s): 0.15 | learning rate: 7.358E-05 | global batch size: 256 | lm loss: 3.772810E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.960 | TFLOPs: 26.16 | +7: iteration 38400/ 60336 | consumed samples: 9830400 | consumed tokens: 20132659200 | elapsed time per iteration (s): 0.15 | learning rate: 7.354E-05 | global batch size: 256 | lm loss: 3.772282E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.242 | TFLOPs: 26.16 | +7: iteration 38410/ 60336 | consumed samples: 9832960 | consumed tokens: 20137902080 | elapsed time per iteration (s): 0.15 | learning rate: 7.350E-05 | global batch size: 256 | lm loss: 3.787403E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.691 | TFLOPs: 26.15 | +7: iteration 38420/ 60336 | consumed samples: 9835520 | consumed tokens: 20143144960 | elapsed time per iteration (s): 0.15 | learning rate: 7.345E-05 | global batch size: 256 | lm loss: 3.771302E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.575 | TFLOPs: 26.15 | +7: iteration 38430/ 60336 | consumed samples: 9838080 | consumed tokens: 20148387840 | elapsed time per iteration (s): 0.15 | learning rate: 7.341E-05 | global batch size: 256 | lm loss: 3.782752E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.454 | TFLOPs: 26.15 | +7: iteration 38440/ 60336 | consumed samples: 9840640 | consumed tokens: 20153630720 | elapsed time per iteration (s): 0.15 | learning rate: 7.337E-05 | global batch size: 256 | lm loss: 3.786909E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.252 | TFLOPs: 26.15 | +7: iteration 38450/ 60336 | consumed samples: 9843200 | consumed tokens: 20158873600 | elapsed time per iteration (s): 0.15 | learning rate: 7.332E-05 | global batch size: 256 | lm loss: 3.778033E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.437 | TFLOPs: 26.15 | +7: iteration 38460/ 60336 | consumed samples: 9845760 | consumed tokens: 20164116480 | elapsed time per iteration (s): 0.15 | learning rate: 7.328E-05 | global batch size: 256 | lm loss: 3.784777E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.199 | TFLOPs: 26.13 | +7: iteration 38470/ 60336 | consumed samples: 9848320 | consumed tokens: 20169359360 | elapsed time per iteration (s): 0.15 | learning rate: 7.324E-05 | global batch size: 256 | lm loss: 3.790578E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.871 | TFLOPs: 26.16 | +7: iteration 38480/ 60336 | consumed samples: 9850880 | consumed tokens: 20174602240 | elapsed time per iteration (s): 0.15 | learning rate: 7.320E-05 | global batch size: 256 | lm loss: 3.765348E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.281 | TFLOPs: 26.13 | +7: iteration 38490/ 60336 | consumed samples: 9853440 | consumed tokens: 20179845120 | elapsed time per iteration (s): 0.15 | learning rate: 7.315E-05 | global batch size: 256 | lm loss: 3.787520E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.649 | TFLOPs: 26.12 | +7: iteration 38500/ 60336 | consumed samples: 9856000 | consumed tokens: 20185088000 | elapsed time per iteration (s): 0.15 | learning rate: 7.311E-05 | global batch size: 256 | lm loss: 3.783995E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.634 | TFLOPs: 26.01 | +7: iteration 38510/ 60336 | consumed samples: 9858560 | consumed tokens: 20190330880 | elapsed time per iteration (s): 0.15 | learning rate: 7.307E-05 | global batch size: 256 | lm loss: 3.782675E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.502 | TFLOPs: 26.15 | +7: iteration 38520/ 60336 | consumed samples: 9861120 | consumed tokens: 20195573760 | elapsed time per iteration (s): 0.15 | learning rate: 7.302E-05 | global batch size: 256 | lm loss: 3.774959E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.773 | TFLOPs: 26.12 | +7: iteration 38530/ 60336 | consumed samples: 9863680 | consumed tokens: 20200816640 | elapsed time per iteration (s): 0.15 | learning rate: 7.298E-05 | global batch size: 256 | lm loss: 3.781051E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.446 | TFLOPs: 26.12 | +7: iteration 38540/ 60336 | consumed samples: 9866240 | consumed tokens: 20206059520 | elapsed time per iteration (s): 0.15 | learning rate: 7.294E-05 | global batch size: 256 | lm loss: 3.782988E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.285 | TFLOPs: 26.10 | +7: iteration 38550/ 60336 | consumed samples: 9868800 | consumed tokens: 20211302400 | elapsed time per iteration (s): 0.15 | learning rate: 7.289E-05 | global batch size: 256 | lm loss: 3.779989E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.737 | TFLOPs: 26.12 | +7: iteration 38560/ 60336 | consumed samples: 9871360 | consumed tokens: 20216545280 | elapsed time per iteration (s): 0.15 | learning rate: 7.285E-05 | global batch size: 256 | lm loss: 3.783001E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.359 | TFLOPs: 26.13 | +7: iteration 38570/ 60336 | consumed samples: 9873920 | consumed tokens: 20221788160 | elapsed time per iteration (s): 0.15 | learning rate: 7.281E-05 | global batch size: 256 | lm loss: 3.793287E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.609 | TFLOPs: 26.12 | +7: iteration 38580/ 60336 | consumed samples: 9876480 | consumed tokens: 20227031040 | elapsed time per iteration (s): 0.15 | learning rate: 7.276E-05 | global batch size: 256 | lm loss: 3.772095E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.111 | TFLOPs: 26.08 | +7: iteration 38590/ 60336 | consumed samples: 9879040 | consumed tokens: 20232273920 | elapsed time per iteration (s): 0.15 | learning rate: 7.272E-05 | global batch size: 256 | lm loss: 3.783907E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.103 | TFLOPs: 26.08 | +7: iteration 38600/ 60336 | consumed samples: 9881600 | consumed tokens: 20237516800 | elapsed time per iteration (s): 0.17 | learning rate: 7.268E-05 | global batch size: 256 | lm loss: 3.764325E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.771 | TFLOPs: 24.05 | +7: iteration 38610/ 60336 | consumed samples: 9884160 | consumed tokens: 20242759680 | elapsed time per iteration (s): 0.15 | learning rate: 7.263E-05 | global batch size: 256 | lm loss: 3.788649E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.506 | TFLOPs: 26.07 | +7: iteration 38620/ 60336 | consumed samples: 9886720 | consumed tokens: 20248002560 | elapsed time per iteration (s): 0.15 | learning rate: 7.259E-05 | global batch size: 256 | lm loss: 3.789679E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.749 | TFLOPs: 26.08 | +7: iteration 38630/ 60336 | consumed samples: 9889280 | consumed tokens: 20253245440 | elapsed time per iteration (s): 0.15 | learning rate: 7.255E-05 | global batch size: 256 | lm loss: 3.784480E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.902 | TFLOPs: 26.06 | +7: iteration 38640/ 60336 | consumed samples: 9891840 | consumed tokens: 20258488320 | elapsed time per iteration (s): 0.15 | learning rate: 7.251E-05 | global batch size: 256 | lm loss: 3.785749E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.850 | TFLOPs: 26.09 | +7: iteration 38650/ 60336 | consumed samples: 9894400 | consumed tokens: 20263731200 | elapsed time per iteration (s): 0.15 | learning rate: 7.246E-05 | global batch size: 256 | lm loss: 3.783561E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.556 | TFLOPs: 26.10 | +7: iteration 38660/ 60336 | consumed samples: 9896960 | consumed tokens: 20268974080 | elapsed time per iteration (s): 0.15 | learning rate: 7.242E-05 | global batch size: 256 | lm loss: 3.785413E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.078 | TFLOPs: 26.08 | +7: iteration 38670/ 60336 | consumed samples: 9899520 | consumed tokens: 20274216960 | elapsed time per iteration (s): 0.15 | learning rate: 7.238E-05 | global batch size: 256 | lm loss: 3.781598E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.253 | TFLOPs: 26.10 | +7: iteration 38680/ 60336 | consumed samples: 9902080 | consumed tokens: 20279459840 | elapsed time per iteration (s): 0.15 | learning rate: 7.233E-05 | global batch size: 256 | lm loss: 3.786814E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.286 | TFLOPs: 26.08 | +7: iteration 38690/ 60336 | consumed samples: 9904640 | consumed tokens: 20284702720 | elapsed time per iteration (s): 0.15 | learning rate: 7.229E-05 | global batch size: 256 | lm loss: 3.770408E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.505 | TFLOPs: 26.09 | +7: iteration 38700/ 60336 | consumed samples: 9907200 | consumed tokens: 20289945600 | elapsed time per iteration (s): 0.15 | learning rate: 7.225E-05 | global batch size: 256 | lm loss: 3.779651E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.563 | TFLOPs: 26.12 | +7: iteration 38710/ 60336 | consumed samples: 9909760 | consumed tokens: 20295188480 | elapsed time per iteration (s): 0.15 | learning rate: 7.220E-05 | global batch size: 256 | lm loss: 3.781173E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.943 | TFLOPs: 26.13 | +7: iteration 38720/ 60336 | consumed samples: 9912320 | consumed tokens: 20300431360 | elapsed time per iteration (s): 0.15 | learning rate: 7.216E-05 | global batch size: 256 | lm loss: 3.777605E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.599 | TFLOPs: 26.12 | +7: iteration 38730/ 60336 | consumed samples: 9914880 | consumed tokens: 20305674240 | elapsed time per iteration (s): 0.15 | learning rate: 7.212E-05 | global batch size: 256 | lm loss: 3.793074E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.452 | TFLOPs: 26.12 | +7: iteration 38740/ 60336 | consumed samples: 9917440 | consumed tokens: 20310917120 | elapsed time per iteration (s): 0.15 | learning rate: 7.208E-05 | global batch size: 256 | lm loss: 3.772396E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.151 | TFLOPs: 26.11 | +7: iteration 38750/ 60336 | consumed samples: 9920000 | consumed tokens: 20316160000 | elapsed time per iteration (s): 0.15 | learning rate: 7.203E-05 | global batch size: 256 | lm loss: 3.782296E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.968 | TFLOPs: 26.13 | +7: iteration 38760/ 60336 | consumed samples: 9922560 | consumed tokens: 20321402880 | elapsed time per iteration (s): 0.15 | learning rate: 7.199E-05 | global batch size: 256 | lm loss: 3.788299E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.613 | TFLOPs: 26.14 | +7: iteration 38770/ 60336 | consumed samples: 9925120 | consumed tokens: 20326645760 | elapsed time per iteration (s): 0.15 | learning rate: 7.195E-05 | global batch size: 256 | lm loss: 3.780449E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.577 | TFLOPs: 26.12 | +7: iteration 38780/ 60336 | consumed samples: 9927680 | consumed tokens: 20331888640 | elapsed time per iteration (s): 0.15 | learning rate: 7.190E-05 | global batch size: 256 | lm loss: 3.786621E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.145 | TFLOPs: 26.13 | +7: iteration 38790/ 60336 | consumed samples: 9930240 | consumed tokens: 20337131520 | elapsed time per iteration (s): 0.15 | learning rate: 7.186E-05 | global batch size: 256 | lm loss: 3.784333E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.015 | TFLOPs: 26.13 | +7: iteration 38800/ 60336 | consumed samples: 9932800 | consumed tokens: 20342374400 | elapsed time per iteration (s): 0.15 | learning rate: 7.182E-05 | global batch size: 256 | lm loss: 3.770398E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.264 | TFLOPs: 26.13 | +7: iteration 38810/ 60336 | consumed samples: 9935360 | consumed tokens: 20347617280 | elapsed time per iteration (s): 0.15 | learning rate: 7.178E-05 | global batch size: 256 | lm loss: 3.778939E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.927 | TFLOPs: 26.13 | +7: iteration 38820/ 60336 | consumed samples: 9937920 | consumed tokens: 20352860160 | elapsed time per iteration (s): 0.16 | learning rate: 7.173E-05 | global batch size: 256 | lm loss: 3.784426E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.457 | TFLOPs: 25.71 | +7: iteration 38830/ 60336 | consumed samples: 9940480 | consumed tokens: 20358103040 | elapsed time per iteration (s): 0.15 | learning rate: 7.169E-05 | global batch size: 256 | lm loss: 3.788651E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.125 | TFLOPs: 26.13 | +7: iteration 38840/ 60336 | consumed samples: 9943040 | consumed tokens: 20363345920 | elapsed time per iteration (s): 0.15 | learning rate: 7.165E-05 | global batch size: 256 | lm loss: 3.790908E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.075 | TFLOPs: 26.11 | +7: iteration 38850/ 60336 | consumed samples: 9945600 | consumed tokens: 20368588800 | elapsed time per iteration (s): 0.15 | learning rate: 7.160E-05 | global batch size: 256 | lm loss: 3.775821E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.899 | TFLOPs: 26.14 | +7: iteration 38860/ 60336 | consumed samples: 9948160 | consumed tokens: 20373831680 | elapsed time per iteration (s): 0.15 | learning rate: 7.156E-05 | global batch size: 256 | lm loss: 3.777042E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.650 | TFLOPs: 26.11 | +7: iteration 38870/ 60336 | consumed samples: 9950720 | consumed tokens: 20379074560 | elapsed time per iteration (s): 0.15 | learning rate: 7.152E-05 | global batch size: 256 | lm loss: 3.772872E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: iteration 38880/ 60336 | consumed samples: 9953280 | consumed tokens: 20384317440 | elapsed time per iteration (s): 0.15 | learning rate: 7.148E-05 | global batch size: 256 | lm loss: 3.779804E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.361 | TFLOPs: 26.13 | +7: iteration 38890/ 60336 | consumed samples: 9955840 | consumed tokens: 20389560320 | elapsed time per iteration (s): 0.15 | learning rate: 7.143E-05 | global batch size: 256 | lm loss: 3.775797E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.383 | TFLOPs: 26.10 | +7: iteration 38900/ 60336 | consumed samples: 9958400 | consumed tokens: 20394803200 | elapsed time per iteration (s): 0.15 | learning rate: 7.139E-05 | global batch size: 256 | lm loss: 3.774321E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.357 | TFLOPs: 26.05 | +7: iteration 38910/ 60336 | consumed samples: 9960960 | consumed tokens: 20400046080 | elapsed time per iteration (s): 0.15 | learning rate: 7.135E-05 | global batch size: 256 | lm loss: 3.781268E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.345 | TFLOPs: 26.01 | +7: iteration 38920/ 60336 | consumed samples: 9963520 | consumed tokens: 20405288960 | elapsed time per iteration (s): 0.15 | learning rate: 7.130E-05 | global batch size: 256 | lm loss: 3.782110E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.125 | TFLOPs: 26.02 | +7: iteration 38930/ 60336 | consumed samples: 9966080 | consumed tokens: 20410531840 | elapsed time per iteration (s): 0.15 | learning rate: 7.126E-05 | global batch size: 256 | lm loss: 3.772133E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.064 | TFLOPs: 26.02 | +7: iteration 38940/ 60336 | consumed samples: 9968640 | consumed tokens: 20415774720 | elapsed time per iteration (s): 0.15 | learning rate: 7.122E-05 | global batch size: 256 | lm loss: 3.772438E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.273 | TFLOPs: 26.01 | +7: iteration 38950/ 60336 | consumed samples: 9971200 | consumed tokens: 20421017600 | elapsed time per iteration (s): 0.15 | learning rate: 7.118E-05 | global batch size: 256 | lm loss: 3.788622E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.441 | TFLOPs: 26.06 | +7: iteration 38960/ 60336 | consumed samples: 9973760 | consumed tokens: 20426260480 | elapsed time per iteration (s): 0.15 | learning rate: 7.113E-05 | global batch size: 256 | lm loss: 3.785833E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.951 | TFLOPs: 26.02 | +7: iteration 38970/ 60336 | consumed samples: 9976320 | consumed tokens: 20431503360 | elapsed time per iteration (s): 0.15 | learning rate: 7.109E-05 | global batch size: 256 | lm loss: 3.779877E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.285 | TFLOPs: 26.04 | +7: iteration 38980/ 60336 | consumed samples: 9978880 | consumed tokens: 20436746240 | elapsed time per iteration (s): 0.15 | learning rate: 7.105E-05 | global batch size: 256 | lm loss: 3.787703E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.505 | TFLOPs: 26.06 | +7: iteration 38990/ 60336 | consumed samples: 9981440 | consumed tokens: 20441989120 | elapsed time per iteration (s): 0.15 | learning rate: 7.101E-05 | global batch size: 256 | lm loss: 3.775235E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.734 | TFLOPs: 26.08 | +7: iteration 39000/ 60336 | consumed samples: 9984000 | consumed tokens: 20447232000 | elapsed time per iteration (s): 0.15 | learning rate: 7.096E-05 | global batch size: 256 | lm loss: 3.774482E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.428 | TFLOPs: 26.10 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 39000 | lm loss value: 3.922919E+00 | lm loss PPL: 5.054779E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 39000 to checkpoints_44m32b100m +0: [2023-03-17 01:59:53,658] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step39000 is begin to save! +0: [2023-03-17 01:59:53,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:59:53,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:59:53,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:59:53,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:59:53,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:59:53,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:59:53,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:59:53,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:59:53,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:59:53,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:59:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:59:53,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:59:53,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:59:53,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:59:53,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:59:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:59:53,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:59:53,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:59:53,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:59:53,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:59:53,789] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step39000/mp_rank_00_model_states.pt +0: [2023-03-17 01:59:53,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:59:53,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:53,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:53,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:53,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:53,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:53,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:53,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:53,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:53,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:59:53,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:53,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:53,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: successfully saved checkpoint at iteration 39000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.68 +7: iteration 39010/ 60336 | consumed samples: 9986560 | consumed tokens: 20452474880 | elapsed time per iteration (s): 0.18 | learning rate: 7.092E-05 | global batch size: 256 | lm loss: 3.767617E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.020 | TFLOPs: 22.76 | +7: iteration 39020/ 60336 | consumed samples: 9989120 | consumed tokens: 20457717760 | elapsed time per iteration (s): 0.15 | learning rate: 7.088E-05 | global batch size: 256 | lm loss: 3.772644E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.526 | TFLOPs: 26.06 | +7: iteration 39030/ 60336 | consumed samples: 9991680 | consumed tokens: 20462960640 | elapsed time per iteration (s): 0.15 | learning rate: 7.084E-05 | global batch size: 256 | lm loss: 3.782530E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.923 | TFLOPs: 26.06 | +7: iteration 39040/ 60336 | consumed samples: 9994240 | consumed tokens: 20468203520 | elapsed time per iteration (s): 0.15 | learning rate: 7.079E-05 | global batch size: 256 | lm loss: 3.780207E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.465 | TFLOPs: 26.07 | +7: iteration 39050/ 60336 | consumed samples: 9996800 | consumed tokens: 20473446400 | elapsed time per iteration (s): 0.15 | learning rate: 7.075E-05 | global batch size: 256 | lm loss: 3.775180E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.177 | TFLOPs: 26.07 | +7: iteration 39060/ 60336 | consumed samples: 9999360 | consumed tokens: 20478689280 | elapsed time per iteration (s): 0.15 | learning rate: 7.071E-05 | global batch size: 256 | lm loss: 3.773119E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.469 | TFLOPs: 26.06 | +7: iteration 39070/ 60336 | consumed samples: 10001920 | consumed tokens: 20483932160 | elapsed time per iteration (s): 0.15 | learning rate: 7.066E-05 | global batch size: 256 | lm loss: 3.778374E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.221 | TFLOPs: 26.08 | +7: iteration 39080/ 60336 | consumed samples: 10004480 | consumed tokens: 20489175040 | elapsed time per iteration (s): 0.15 | learning rate: 7.062E-05 | global batch size: 256 | lm loss: 3.771087E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.905 | TFLOPs: 26.05 | +7: iteration 39090/ 60336 | consumed samples: 10007040 | consumed tokens: 20494417920 | elapsed time per iteration (s): 0.15 | learning rate: 7.058E-05 | global batch size: 256 | lm loss: 3.785057E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.122 | TFLOPs: 26.05 | +7: iteration 39100/ 60336 | consumed samples: 10009600 | consumed tokens: 20499660800 | elapsed time per iteration (s): 0.15 | learning rate: 7.054E-05 | global batch size: 256 | lm loss: 3.795115E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.868 | TFLOPs: 26.06 | +7: iteration 39110/ 60336 | consumed samples: 10012160 | consumed tokens: 20504903680 | elapsed time per iteration (s): 0.15 | learning rate: 7.049E-05 | global batch size: 256 | lm loss: 3.791423E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.731 | TFLOPs: 26.09 | +7: iteration 39120/ 60336 | consumed samples: 10014720 | consumed tokens: 20510146560 | elapsed time per iteration (s): 0.15 | learning rate: 7.045E-05 | global batch size: 256 | lm loss: 3.776513E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.369 | TFLOPs: 26.09 | +7: iteration 39130/ 60336 | consumed samples: 10017280 | consumed tokens: 20515389440 | elapsed time per iteration (s): 0.15 | learning rate: 7.041E-05 | global batch size: 256 | lm loss: 3.773481E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.251 | TFLOPs: 26.07 | +7: iteration 39140/ 60336 | consumed samples: 10019840 | consumed tokens: 20520632320 | elapsed time per iteration (s): 0.15 | learning rate: 7.037E-05 | global batch size: 256 | lm loss: 3.781724E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.948 | TFLOPs: 26.02 | +7: iteration 39150/ 60336 | consumed samples: 10022400 | consumed tokens: 20525875200 | elapsed time per iteration (s): 0.15 | learning rate: 7.032E-05 | global batch size: 256 | lm loss: 3.767389E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.242 | TFLOPs: 26.02 | +7: iteration 39160/ 60336 | consumed samples: 10024960 | consumed tokens: 20531118080 | elapsed time per iteration (s): 0.15 | learning rate: 7.028E-05 | global batch size: 256 | lm loss: 3.771006E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.985 | TFLOPs: 26.03 | +7: iteration 39170/ 60336 | consumed samples: 10027520 | consumed tokens: 20536360960 | elapsed time per iteration (s): 0.15 | learning rate: 7.024E-05 | global batch size: 256 | lm loss: 3.774052E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.756 | TFLOPs: 26.03 | +7: iteration 39180/ 60336 | consumed samples: 10030080 | consumed tokens: 20541603840 | elapsed time per iteration (s): 0.15 | learning rate: 7.020E-05 | global batch size: 256 | lm loss: 3.777195E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.413 | TFLOPs: 26.02 | +7: iteration 39190/ 60336 | consumed samples: 10032640 | consumed tokens: 20546846720 | elapsed time per iteration (s): 0.15 | learning rate: 7.015E-05 | global batch size: 256 | lm loss: 3.782953E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.195 | TFLOPs: 26.04 | +7: iteration 39200/ 60336 | consumed samples: 10035200 | consumed tokens: 20552089600 | elapsed time per iteration (s): 0.16 | learning rate: 7.011E-05 | global batch size: 256 | lm loss: 3.791198E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.347 | TFLOPs: 25.79 | +7: iteration 39210/ 60336 | consumed samples: 10037760 | consumed tokens: 20557332480 | elapsed time per iteration (s): 0.15 | learning rate: 7.007E-05 | global batch size: 256 | lm loss: 3.781096E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.852 | TFLOPs: 25.92 | +7: iteration 39220/ 60336 | consumed samples: 10040320 | consumed tokens: 20562575360 | elapsed time per iteration (s): 0.15 | learning rate: 7.003E-05 | global batch size: 256 | lm loss: 3.775089E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.290 | TFLOPs: 26.12 | +7: iteration 39230/ 60336 | consumed samples: 10042880 | consumed tokens: 20567818240 | elapsed time per iteration (s): 0.15 | learning rate: 6.999E-05 | global batch size: 256 | lm loss: 3.797881E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.413 | TFLOPs: 26.09 | +7: iteration 39240/ 60336 | consumed samples: 10045440 | consumed tokens: 20573061120 | elapsed time per iteration (s): 0.15 | learning rate: 6.994E-05 | global batch size: 256 | lm loss: 3.768082E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.272 | TFLOPs: 26.04 | +7: iteration 39250/ 60336 | consumed samples: 10048000 | consumed tokens: 20578304000 | elapsed time per iteration (s): 0.15 | learning rate: 6.990E-05 | global batch size: 256 | lm loss: 3.781111E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.469 | TFLOPs: 26.04 | +7: iteration 39260/ 60336 | consumed samples: 10050560 | consumed tokens: 20583546880 | elapsed time per iteration (s): 0.16 | learning rate: 6.986E-05 | global batch size: 256 | lm loss: 3.764997E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.879 | TFLOPs: 25.73 | +7: iteration 39270/ 60336 | consumed samples: 10053120 | consumed tokens: 20588789760 | elapsed time per iteration (s): 0.15 | learning rate: 6.982E-05 | global batch size: 256 | lm loss: 3.786287E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.050 | TFLOPs: 26.07 | +7: iteration 39280/ 60336 | consumed samples: 10055680 | consumed tokens: 20594032640 | elapsed time per iteration (s): 0.15 | learning rate: 6.977E-05 | global batch size: 256 | lm loss: 3.780914E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.411 | TFLOPs: 26.09 | +7: iteration 39290/ 60336 | consumed samples: 10058240 | consumed tokens: 20599275520 | elapsed time per iteration (s): 0.15 | learning rate: 6.973E-05 | global batch size: 256 | lm loss: 3.772327E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.506 | TFLOPs: 26.09 | +7: iteration 39300/ 60336 | consumed samples: 10060800 | consumed tokens: 20604518400 | elapsed time per iteration (s): 0.15 | learning rate: 6.969E-05 | global batch size: 256 | lm loss: 3.777413E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.198 | TFLOPs: 26.05 | +7: iteration 39310/ 60336 | consumed samples: 10063360 | consumed tokens: 20609761280 | elapsed time per iteration (s): 0.15 | learning rate: 6.965E-05 | global batch size: 256 | lm loss: 3.792420E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.327 | TFLOPs: 26.04 | +7: iteration 39320/ 60336 | consumed samples: 10065920 | consumed tokens: 20615004160 | elapsed time per iteration (s): 0.15 | learning rate: 6.960E-05 | global batch size: 256 | lm loss: 3.791032E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.784 | TFLOPs: 26.08 | +7: iteration 39330/ 60336 | consumed samples: 10068480 | consumed tokens: 20620247040 | elapsed time per iteration (s): 0.15 | learning rate: 6.956E-05 | global batch size: 256 | lm loss: 3.772977E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.063 | TFLOPs: 26.05 | +7: iteration 39340/ 60336 | consumed samples: 10071040 | consumed tokens: 20625489920 | elapsed time per iteration (s): 0.15 | learning rate: 6.952E-05 | global batch size: 256 | lm loss: 3.772330E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.374 | TFLOPs: 26.05 | +7: iteration 39350/ 60336 | consumed samples: 10073600 | consumed tokens: 20630732800 | elapsed time per iteration (s): 0.15 | learning rate: 6.948E-05 | global batch size: 256 | lm loss: 3.778376E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.914 | TFLOPs: 26.05 | +7: iteration 39360/ 60336 | consumed samples: 10076160 | consumed tokens: 20635975680 | elapsed time per iteration (s): 0.15 | learning rate: 6.943E-05 | global batch size: 256 | lm loss: 3.778925E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.502 | TFLOPs: 26.04 | +7: iteration 39370/ 60336 | consumed samples: 10078720 | consumed tokens: 20641218560 | elapsed time per iteration (s): 0.15 | learning rate: 6.939E-05 | global batch size: 256 | lm loss: 3.788337E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.325 | TFLOPs: 26.04 | +7: iteration 39380/ 60336 | consumed samples: 10081280 | consumed tokens: 20646461440 | elapsed time per iteration (s): 0.15 | learning rate: 6.935E-05 | global batch size: 256 | lm loss: 3.795235E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.235 | TFLOPs: 26.05 | +7: iteration 39390/ 60336 | consumed samples: 10083840 | consumed tokens: 20651704320 | elapsed time per iteration (s): 0.15 | learning rate: 6.931E-05 | global batch size: 256 | lm loss: 3.786589E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.045 | TFLOPs: 26.07 | +7: iteration 39400/ 60336 | consumed samples: 10086400 | consumed tokens: 20656947200 | elapsed time per iteration (s): 0.15 | learning rate: 6.927E-05 | global batch size: 256 | lm loss: 3.784397E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.395 | TFLOPs: 26.05 | +7: iteration 39410/ 60336 | consumed samples: 10088960 | consumed tokens: 20662190080 | elapsed time per iteration (s): 0.15 | learning rate: 6.922E-05 | global batch size: 256 | lm loss: 3.775642E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.572 | TFLOPs: 26.07 | +7: iteration 39420/ 60336 | consumed samples: 10091520 | consumed tokens: 20667432960 | elapsed time per iteration (s): 0.15 | learning rate: 6.918E-05 | global batch size: 256 | lm loss: 3.770734E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.449 | TFLOPs: 26.04 | +7: iteration 39430/ 60336 | consumed samples: 10094080 | consumed tokens: 20672675840 | elapsed time per iteration (s): 0.15 | learning rate: 6.914E-05 | global batch size: 256 | lm loss: 3.794604E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.922 | TFLOPs: 26.06 | +7: iteration 39440/ 60336 | consumed samples: 10096640 | consumed tokens: 20677918720 | elapsed time per iteration (s): 0.15 | learning rate: 6.910E-05 | global batch size: 256 | lm loss: 3.787462E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.290 | TFLOPs: 26.05 | +7: iteration 39450/ 60336 | consumed samples: 10099200 | consumed tokens: 20683161600 | elapsed time per iteration (s): 0.15 | learning rate: 6.905E-05 | global batch size: 256 | lm loss: 3.775604E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.702 | TFLOPs: 26.04 | +7: iteration 39460/ 60336 | consumed samples: 10101760 | consumed tokens: 20688404480 | elapsed time per iteration (s): 0.15 | learning rate: 6.901E-05 | global batch size: 256 | lm loss: 3.774654E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.617 | TFLOPs: 26.06 | +7: iteration 39470/ 60336 | consumed samples: 10104320 | consumed tokens: 20693647360 | elapsed time per iteration (s): 0.15 | learning rate: 6.897E-05 | global batch size: 256 | lm loss: 3.772475E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.592 | TFLOPs: 26.04 | +7: iteration 39480/ 60336 | consumed samples: 10106880 | consumed tokens: 20698890240 | elapsed time per iteration (s): 0.15 | learning rate: 6.893E-05 | global batch size: 256 | lm loss: 3.777834E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.098 | TFLOPs: 26.05 | +7: iteration 39490/ 60336 | consumed samples: 10109440 | consumed tokens: 20704133120 | elapsed time per iteration (s): 0.15 | learning rate: 6.889E-05 | global batch size: 256 | lm loss: 3.787201E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.664 | TFLOPs: 26.06 | +7: iteration 39500/ 60336 | consumed samples: 10112000 | consumed tokens: 20709376000 | elapsed time per iteration (s): 0.15 | learning rate: 6.884E-05 | global batch size: 256 | lm loss: 3.781275E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.173 | TFLOPs: 26.05 | +7: iteration 39510/ 60336 | consumed samples: 10114560 | consumed tokens: 20714618880 | elapsed time per iteration (s): 0.15 | learning rate: 6.880E-05 | global batch size: 256 | lm loss: 3.785999E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.975 | TFLOPs: 26.06 | +7: iteration 39520/ 60336 | consumed samples: 10117120 | consumed tokens: 20719861760 | elapsed time per iteration (s): 0.15 | learning rate: 6.876E-05 | global batch size: 256 | lm loss: 3.779465E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.114 | TFLOPs: 26.05 | +7: iteration 39530/ 60336 | consumed samples: 10119680 | consumed tokens: 20725104640 | elapsed time per iteration (s): 0.16 | learning rate: 6.872E-05 | global batch size: 256 | lm loss: 3.769633E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.599 | TFLOPs: 25.78 | +7: iteration 39540/ 60336 | consumed samples: 10122240 | consumed tokens: 20730347520 | elapsed time per iteration (s): 0.15 | learning rate: 6.868E-05 | global batch size: 256 | lm loss: 3.788030E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.759 | TFLOPs: 26.08 | +7: iteration 39550/ 60336 | consumed samples: 10124800 | consumed tokens: 20735590400 | elapsed time per iteration (s): 0.15 | learning rate: 6.863E-05 | global batch size: 256 | lm loss: 3.780905E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.101 | TFLOPs: 26.10 | +7: iteration 39560/ 60336 | consumed samples: 10127360 | consumed tokens: 20740833280 | elapsed time per iteration (s): 0.15 | learning rate: 6.859E-05 | global batch size: 256 | lm loss: 3.782009E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.056 | TFLOPs: 26.08 | +7: iteration 39570/ 60336 | consumed samples: 10129920 | consumed tokens: 20746076160 | elapsed time per iteration (s): 0.15 | learning rate: 6.855E-05 | global batch size: 256 | lm loss: 3.783881E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.463 | TFLOPs: 26.09 | +7: iteration 39580/ 60336 | consumed samples: 10132480 | consumed tokens: 20751319040 | elapsed time per iteration (s): 0.15 | learning rate: 6.851E-05 | global batch size: 256 | lm loss: 3.775418E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.081 | TFLOPs: 26.07 | +7: iteration 39590/ 60336 | consumed samples: 10135040 | consumed tokens: 20756561920 | elapsed time per iteration (s): 0.15 | learning rate: 6.847E-05 | global batch size: 256 | lm loss: 3.769075E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.397 | TFLOPs: 26.12 | +7: iteration 39600/ 60336 | consumed samples: 10137600 | consumed tokens: 20761804800 | elapsed time per iteration (s): 0.15 | learning rate: 6.842E-05 | global batch size: 256 | lm loss: 3.771629E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.835 | TFLOPs: 26.11 | +7: iteration 39610/ 60336 | consumed samples: 10140160 | consumed tokens: 20767047680 | elapsed time per iteration (s): 0.15 | learning rate: 6.838E-05 | global batch size: 256 | lm loss: 3.774016E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.445 | TFLOPs: 26.09 | +7: iteration 39620/ 60336 | consumed samples: 10142720 | consumed tokens: 20772290560 | elapsed time per iteration (s): 0.15 | learning rate: 6.834E-05 | global batch size: 256 | lm loss: 3.794103E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.024 | TFLOPs: 26.10 | +7: iteration 39630/ 60336 | consumed samples: 10145280 | consumed tokens: 20777533440 | elapsed time per iteration (s): 0.15 | learning rate: 6.830E-05 | global batch size: 256 | lm loss: 3.778064E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.056 | TFLOPs: 26.08 | +7: iteration 39640/ 60336 | consumed samples: 10147840 | consumed tokens: 20782776320 | elapsed time per iteration (s): 0.15 | learning rate: 6.826E-05 | global batch size: 256 | lm loss: 3.768052E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.605 | TFLOPs: 26.11 | +7: iteration 39650/ 60336 | consumed samples: 10150400 | consumed tokens: 20788019200 | elapsed time per iteration (s): 0.15 | learning rate: 6.821E-05 | global batch size: 256 | lm loss: 3.790275E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.306 | TFLOPs: 26.12 | +7: iteration 39660/ 60336 | consumed samples: 10152960 | consumed tokens: 20793262080 | elapsed time per iteration (s): 0.15 | learning rate: 6.817E-05 | global batch size: 256 | lm loss: 3.767372E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.912 | TFLOPs: 26.11 | +7: iteration 39670/ 60336 | consumed samples: 10155520 | consumed tokens: 20798504960 | elapsed time per iteration (s): 0.15 | learning rate: 6.813E-05 | global batch size: 256 | lm loss: 3.769716E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.119 | TFLOPs: 26.13 | +7: iteration 39680/ 60336 | consumed samples: 10158080 | consumed tokens: 20803747840 | elapsed time per iteration (s): 0.15 | learning rate: 6.809E-05 | global batch size: 256 | lm loss: 3.784533E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.634 | TFLOPs: 26.14 | +7: iteration 39690/ 60336 | consumed samples: 10160640 | consumed tokens: 20808990720 | elapsed time per iteration (s): 0.15 | learning rate: 6.805E-05 | global batch size: 256 | lm loss: 3.772130E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.171 | TFLOPs: 26.15 | +7: iteration 39700/ 60336 | consumed samples: 10163200 | consumed tokens: 20814233600 | elapsed time per iteration (s): 0.17 | learning rate: 6.800E-05 | global batch size: 256 | lm loss: 3.785687E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.260 | TFLOPs: 24.17 | +7: iteration 39710/ 60336 | consumed samples: 10165760 | consumed tokens: 20819476480 | elapsed time per iteration (s): 0.15 | learning rate: 6.796E-05 | global batch size: 256 | lm loss: 3.790471E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.008 | TFLOPs: 26.14 | +7: iteration 39720/ 60336 | consumed samples: 10168320 | consumed tokens: 20824719360 | elapsed time per iteration (s): 0.15 | learning rate: 6.792E-05 | global batch size: 256 | lm loss: 3.795623E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.422 | TFLOPs: 26.13 | +7: iteration 39730/ 60336 | consumed samples: 10170880 | consumed tokens: 20829962240 | elapsed time per iteration (s): 0.15 | learning rate: 6.788E-05 | global batch size: 256 | lm loss: 3.775025E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.960 | TFLOPs: 26.16 | +7: iteration 39740/ 60336 | consumed samples: 10173440 | consumed tokens: 20835205120 | elapsed time per iteration (s): 0.15 | learning rate: 6.784E-05 | global batch size: 256 | lm loss: 3.784943E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.235 | TFLOPs: 26.15 | +7: iteration 39750/ 60336 | consumed samples: 10176000 | consumed tokens: 20840448000 | elapsed time per iteration (s): 0.15 | learning rate: 6.780E-05 | global batch size: 256 | lm loss: 3.774143E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.691 | TFLOPs: 26.17 | +7: iteration 39760/ 60336 | consumed samples: 10178560 | consumed tokens: 20845690880 | elapsed time per iteration (s): 0.15 | learning rate: 6.775E-05 | global batch size: 256 | lm loss: 3.784664E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.363 | TFLOPs: 26.18 | +7: iteration 39770/ 60336 | consumed samples: 10181120 | consumed tokens: 20850933760 | elapsed time per iteration (s): 0.15 | learning rate: 6.771E-05 | global batch size: 256 | lm loss: 3.781791E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.277 | TFLOPs: 26.15 | +7: iteration 39780/ 60336 | consumed samples: 10183680 | consumed tokens: 20856176640 | elapsed time per iteration (s): 0.15 | learning rate: 6.767E-05 | global batch size: 256 | lm loss: 3.785145E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.120 | TFLOPs: 26.13 | +7: iteration 39790/ 60336 | consumed samples: 10186240 | consumed tokens: 20861419520 | elapsed time per iteration (s): 0.15 | learning rate: 6.763E-05 | global batch size: 256 | lm loss: 3.776745E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.480 | TFLOPs: 26.15 | +7: iteration 39800/ 60336 | consumed samples: 10188800 | consumed tokens: 20866662400 | elapsed time per iteration (s): 0.15 | learning rate: 6.759E-05 | global batch size: 256 | lm loss: 3.771266E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.822 | TFLOPs: 26.16 | +7: iteration 39810/ 60336 | consumed samples: 10191360 | consumed tokens: 20871905280 | elapsed time per iteration (s): 0.15 | learning rate: 6.754E-05 | global batch size: 256 | lm loss: 3.771182E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.721 | TFLOPs: 26.14 | +7: iteration 39820/ 60336 | consumed samples: 10193920 | consumed tokens: 20877148160 | elapsed time per iteration (s): 0.15 | learning rate: 6.750E-05 | global batch size: 256 | lm loss: 3.797522E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.595 | TFLOPs: 26.15 | +7: iteration 39830/ 60336 | consumed samples: 10196480 | consumed tokens: 20882391040 | elapsed time per iteration (s): 0.15 | learning rate: 6.746E-05 | global batch size: 256 | lm loss: 3.772328E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.081 | TFLOPs: 26.16 | +7: iteration 39840/ 60336 | consumed samples: 10199040 | consumed tokens: 20887633920 | elapsed time per iteration (s): 0.15 | learning rate: 6.742E-05 | global batch size: 256 | lm loss: 3.778763E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.565 | TFLOPs: 26.14 | +7: iteration 39850/ 60336 | consumed samples: 10201600 | consumed tokens: 20892876800 | elapsed time per iteration (s): 0.15 | learning rate: 6.738E-05 | global batch size: 256 | lm loss: 3.775690E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.499 | TFLOPs: 26.13 | +7: iteration 39860/ 60336 | consumed samples: 10204160 | consumed tokens: 20898119680 | elapsed time per iteration (s): 0.15 | learning rate: 6.734E-05 | global batch size: 256 | lm loss: 3.773740E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.842 | TFLOPs: 26.12 | +7: iteration 39870/ 60336 | consumed samples: 10206720 | consumed tokens: 20903362560 | elapsed time per iteration (s): 0.15 | learning rate: 6.729E-05 | global batch size: 256 | lm loss: 3.781267E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.192 | TFLOPs: 26.15 | +7: iteration 39880/ 60336 | consumed samples: 10209280 | consumed tokens: 20908605440 | elapsed time per iteration (s): 0.15 | learning rate: 6.725E-05 | global batch size: 256 | lm loss: 3.781051E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.327 | TFLOPs: 26.13 | +7: iteration 39890/ 60336 | consumed samples: 10211840 | consumed tokens: 20913848320 | elapsed time per iteration (s): 0.15 | learning rate: 6.721E-05 | global batch size: 256 | lm loss: 3.787692E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.662 | TFLOPs: 26.12 | +7: iteration 39900/ 60336 | consumed samples: 10214400 | consumed tokens: 20919091200 | elapsed time per iteration (s): 0.15 | learning rate: 6.717E-05 | global batch size: 256 | lm loss: 3.783936E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.484 | TFLOPs: 25.95 | +7: iteration 39910/ 60336 | consumed samples: 10216960 | consumed tokens: 20924334080 | elapsed time per iteration (s): 0.15 | learning rate: 6.713E-05 | global batch size: 256 | lm loss: 3.765882E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.633 | TFLOPs: 26.12 | +7: iteration 39920/ 60336 | consumed samples: 10219520 | consumed tokens: 20929576960 | elapsed time per iteration (s): 0.15 | learning rate: 6.709E-05 | global batch size: 256 | lm loss: 3.769373E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.888 | TFLOPs: 26.13 | +7: iteration 39930/ 60336 | consumed samples: 10222080 | consumed tokens: 20934819840 | elapsed time per iteration (s): 0.15 | learning rate: 6.704E-05 | global batch size: 256 | lm loss: 3.781987E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.426 | TFLOPs: 26.13 | +7: iteration 39940/ 60336 | consumed samples: 10224640 | consumed tokens: 20940062720 | elapsed time per iteration (s): 0.15 | learning rate: 6.700E-05 | global batch size: 256 | lm loss: 3.773383E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.719 | TFLOPs: 26.15 | +7: iteration 39950/ 60336 | consumed samples: 10227200 | consumed tokens: 20945305600 | elapsed time per iteration (s): 0.15 | learning rate: 6.696E-05 | global batch size: 256 | lm loss: 3.784778E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.155 | TFLOPs: 26.10 | +7: iteration 39960/ 60336 | consumed samples: 10229760 | consumed tokens: 20950548480 | elapsed time per iteration (s): 0.15 | learning rate: 6.692E-05 | global batch size: 256 | lm loss: 3.761090E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.040 | TFLOPs: 26.10 | +7: iteration 39970/ 60336 | consumed samples: 10232320 | consumed tokens: 20955791360 | elapsed time per iteration (s): 0.15 | learning rate: 6.688E-05 | global batch size: 256 | lm loss: 3.765642E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.235 | TFLOPs: 26.12 | +7: iteration 39980/ 60336 | consumed samples: 10234880 | consumed tokens: 20961034240 | elapsed time per iteration (s): 0.15 | learning rate: 6.684E-05 | global batch size: 256 | lm loss: 3.774616E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.145 | TFLOPs: 26.13 | +7: iteration 39990/ 60336 | consumed samples: 10237440 | consumed tokens: 20966277120 | elapsed time per iteration (s): 0.15 | learning rate: 6.680E-05 | global batch size: 256 | lm loss: 3.774986E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.356 | TFLOPs: 26.13 | +0: [2023-03-17 02:02:27,914] [INFO] [logging.py:68:log_dist] [Rank 0] step=40000, skipped=0, lr=[6.675400566957137e-05, 6.675400566957137e-05, 6.675400566957137e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 40000/ 60336 | consumed samples: 10240000 | consumed tokens: 20971520000 | elapsed time per iteration (s): 0.15 | learning rate: 6.675E-05 | global batch size: 256 | lm loss: 3.778008E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.451 | TFLOPs: 26.12 | +0: steps: 40000 loss: 3.7573 iter time (s): 0.153 samples/sec: 1671.996 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 40000 | lm loss value: 3.886712E+00 | lm loss PPL: 4.875031E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 40000 to checkpoints_44m32b100m +0: [2023-03-17 02:02:27,986] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step40000 is begin to save! +0: [2023-03-17 02:02:27,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:02:28,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:02:28,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:02:28,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:02:28,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:02:28,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:02:28,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:02:28,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:02:28,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:02:28,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:02:28,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:02:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:02:28,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:02:28,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:02:28,100] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:02:28,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:02:28,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:02:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:02:28,116] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:02:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:02:28,117] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step40000/mp_rank_00_model_states.pt +0: [2023-03-17 02:02:28,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:02:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:02:28,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:02:28,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:02:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 02:02:28,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:02:28,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:02:28,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: successfully saved checkpoint at iteration 40000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.18 +7: iteration 40010/ 60336 | consumed samples: 10242560 | consumed tokens: 20976762880 | elapsed time per iteration (s): 0.18 | learning rate: 6.671E-05 | global batch size: 256 | lm loss: 3.763222E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.556 | TFLOPs: 22.59 | +7: iteration 40020/ 60336 | consumed samples: 10245120 | consumed tokens: 20982005760 | elapsed time per iteration (s): 0.15 | learning rate: 6.667E-05 | global batch size: 256 | lm loss: 3.781160E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.408 | TFLOPs: 26.15 | +7: iteration 40030/ 60336 | consumed samples: 10247680 | consumed tokens: 20987248640 | elapsed time per iteration (s): 0.15 | learning rate: 6.663E-05 | global batch size: 256 | lm loss: 3.788059E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.645 | TFLOPs: 26.11 | +7: iteration 40040/ 60336 | consumed samples: 10250240 | consumed tokens: 20992491520 | elapsed time per iteration (s): 0.15 | learning rate: 6.659E-05 | global batch size: 256 | lm loss: 3.780874E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.892 | TFLOPs: 26.14 | +7: iteration 40050/ 60336 | consumed samples: 10252800 | consumed tokens: 20997734400 | elapsed time per iteration (s): 0.15 | learning rate: 6.655E-05 | global batch size: 256 | lm loss: 3.802153E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.936 | TFLOPs: 26.14 | +7: iteration 40060/ 60336 | consumed samples: 10255360 | consumed tokens: 21002977280 | elapsed time per iteration (s): 0.15 | learning rate: 6.651E-05 | global batch size: 256 | lm loss: 3.772474E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.374 | TFLOPs: 26.15 | +7: iteration 40070/ 60336 | consumed samples: 10257920 | consumed tokens: 21008220160 | elapsed time per iteration (s): 0.15 | learning rate: 6.646E-05 | global batch size: 256 | lm loss: 3.783202E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.888 | TFLOPs: 26.13 | +7: iteration 40080/ 60336 | consumed samples: 10260480 | consumed tokens: 21013463040 | elapsed time per iteration (s): 0.15 | learning rate: 6.642E-05 | global batch size: 256 | lm loss: 3.789135E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.148 | TFLOPs: 26.11 | +7: iteration 40090/ 60336 | consumed samples: 10263040 | consumed tokens: 21018705920 | elapsed time per iteration (s): 0.15 | learning rate: 6.638E-05 | global batch size: 256 | lm loss: 3.787490E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.739 | TFLOPs: 26.14 | +7: iteration 40100/ 60336 | consumed samples: 10265600 | consumed tokens: 21023948800 | elapsed time per iteration (s): 0.15 | learning rate: 6.634E-05 | global batch size: 256 | lm loss: 3.768890E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.880 | TFLOPs: 26.11 | +7: iteration 40110/ 60336 | consumed samples: 10268160 | consumed tokens: 21029191680 | elapsed time per iteration (s): 0.15 | learning rate: 6.630E-05 | global batch size: 256 | lm loss: 3.772035E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.830 | TFLOPs: 26.11 | +7: iteration 40120/ 60336 | consumed samples: 10270720 | consumed tokens: 21034434560 | elapsed time per iteration (s): 0.15 | learning rate: 6.626E-05 | global batch size: 256 | lm loss: 3.770974E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.898 | TFLOPs: 26.13 | +7: iteration 40130/ 60336 | consumed samples: 10273280 | consumed tokens: 21039677440 | elapsed time per iteration (s): 0.15 | learning rate: 6.622E-05 | global batch size: 256 | lm loss: 3.781376E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.395 | TFLOPs: 26.12 | +7: iteration 40140/ 60336 | consumed samples: 10275840 | consumed tokens: 21044920320 | elapsed time per iteration (s): 0.15 | learning rate: 6.617E-05 | global batch size: 256 | lm loss: 3.779249E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 40150/ 60336 | consumed samples: 10278400 | consumed tokens: 21050163200 | elapsed time per iteration (s): 0.15 | learning rate: 6.613E-05 | global batch size: 256 | lm loss: 3.788845E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.639 | TFLOPs: 26.12 | +7: iteration 40160/ 60336 | consumed samples: 10280960 | consumed tokens: 21055406080 | elapsed time per iteration (s): 0.15 | learning rate: 6.609E-05 | global batch size: 256 | lm loss: 3.778576E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.989 | TFLOPs: 26.13 | +7: iteration 40170/ 60336 | consumed samples: 10283520 | consumed tokens: 21060648960 | elapsed time per iteration (s): 0.15 | learning rate: 6.605E-05 | global batch size: 256 | lm loss: 3.763422E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.824 | TFLOPs: 26.12 | +7: iteration 40180/ 60336 | consumed samples: 10286080 | consumed tokens: 21065891840 | elapsed time per iteration (s): 0.15 | learning rate: 6.601E-05 | global batch size: 256 | lm loss: 3.769373E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.684 | TFLOPs: 26.12 | +7: iteration 40190/ 60336 | consumed samples: 10288640 | consumed tokens: 21071134720 | elapsed time per iteration (s): 0.15 | learning rate: 6.597E-05 | global batch size: 256 | lm loss: 3.754974E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.285 | TFLOPs: 26.12 | +7: iteration 40200/ 60336 | consumed samples: 10291200 | consumed tokens: 21076377600 | elapsed time per iteration (s): 0.15 | learning rate: 6.593E-05 | global batch size: 256 | lm loss: 3.783876E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.798 | TFLOPs: 26.11 | +7: iteration 40210/ 60336 | consumed samples: 10293760 | consumed tokens: 21081620480 | elapsed time per iteration (s): 0.15 | learning rate: 6.588E-05 | global batch size: 256 | lm loss: 3.775737E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.513 | TFLOPs: 26.06 | +7: iteration 40220/ 60336 | consumed samples: 10296320 | consumed tokens: 21086863360 | elapsed time per iteration (s): 0.15 | learning rate: 6.584E-05 | global batch size: 256 | lm loss: 3.776517E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.319 | TFLOPs: 26.12 | +7: iteration 40230/ 60336 | consumed samples: 10298880 | consumed tokens: 21092106240 | elapsed time per iteration (s): 0.15 | learning rate: 6.580E-05 | global batch size: 256 | lm loss: 3.790222E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.883 | TFLOPs: 26.11 | +7: iteration 40240/ 60336 | consumed samples: 10301440 | consumed tokens: 21097349120 | elapsed time per iteration (s): 0.15 | learning rate: 6.576E-05 | global batch size: 256 | lm loss: 3.784507E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.194 | TFLOPs: 26.13 | +7: iteration 40250/ 60336 | consumed samples: 10304000 | consumed tokens: 21102592000 | elapsed time per iteration (s): 0.15 | learning rate: 6.572E-05 | global batch size: 256 | lm loss: 3.793867E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.793 | TFLOPs: 26.12 | +7: iteration 40260/ 60336 | consumed samples: 10306560 | consumed tokens: 21107834880 | elapsed time per iteration (s): 0.15 | learning rate: 6.568E-05 | global batch size: 256 | lm loss: 3.789037E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.200 | TFLOPs: 26.11 | +7: iteration 40270/ 60336 | consumed samples: 10309120 | consumed tokens: 21113077760 | elapsed time per iteration (s): 0.15 | learning rate: 6.564E-05 | global batch size: 256 | lm loss: 3.773830E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.960 | TFLOPs: 26.13 | +7: iteration 40280/ 60336 | consumed samples: 10311680 | consumed tokens: 21118320640 | elapsed time per iteration (s): 0.15 | learning rate: 6.560E-05 | global batch size: 256 | lm loss: 3.778674E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.791 | TFLOPs: 26.11 | +7: iteration 40290/ 60336 | consumed samples: 10314240 | consumed tokens: 21123563520 | elapsed time per iteration (s): 0.15 | learning rate: 6.556E-05 | global batch size: 256 | lm loss: 3.781086E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.746 | TFLOPs: 26.14 | +7: iteration 40300/ 60336 | consumed samples: 10316800 | consumed tokens: 21128806400 | elapsed time per iteration (s): 0.15 | learning rate: 6.551E-05 | global batch size: 256 | lm loss: 3.766251E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.203 | TFLOPs: 26.11 | +7: iteration 40310/ 60336 | consumed samples: 10319360 | consumed tokens: 21134049280 | elapsed time per iteration (s): 0.15 | learning rate: 6.547E-05 | global batch size: 256 | lm loss: 3.770945E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.416 | TFLOPs: 26.13 | +7: iteration 40320/ 60336 | consumed samples: 10321920 | consumed tokens: 21139292160 | elapsed time per iteration (s): 0.15 | learning rate: 6.543E-05 | global batch size: 256 | lm loss: 3.778685E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.883 | TFLOPs: 26.11 | +7: iteration 40330/ 60336 | consumed samples: 10324480 | consumed tokens: 21144535040 | elapsed time per iteration (s): 0.15 | learning rate: 6.539E-05 | global batch size: 256 | lm loss: 3.792441E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.628 | TFLOPs: 26.12 | +7: iteration 40340/ 60336 | consumed samples: 10327040 | consumed tokens: 21149777920 | elapsed time per iteration (s): 0.15 | learning rate: 6.535E-05 | global batch size: 256 | lm loss: 3.776307E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.494 | TFLOPs: 26.12 | +7: iteration 40350/ 60336 | consumed samples: 10329600 | consumed tokens: 21155020800 | elapsed time per iteration (s): 0.15 | learning rate: 6.531E-05 | global batch size: 256 | lm loss: 3.779040E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.850 | TFLOPs: 26.11 | +7: iteration 40360/ 60336 | consumed samples: 10332160 | consumed tokens: 21160263680 | elapsed time per iteration (s): 0.15 | learning rate: 6.527E-05 | global batch size: 256 | lm loss: 3.775541E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.099 | TFLOPs: 26.13 | +7: iteration 40370/ 60336 | consumed samples: 10334720 | consumed tokens: 21165506560 | elapsed time per iteration (s): 0.15 | learning rate: 6.523E-05 | global batch size: 256 | lm loss: 3.786530E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.716 | TFLOPs: 26.12 | +7: iteration 40380/ 60336 | consumed samples: 10337280 | consumed tokens: 21170749440 | elapsed time per iteration (s): 0.15 | learning rate: 6.519E-05 | global batch size: 256 | lm loss: 3.778298E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.928 | TFLOPs: 26.14 | +7: iteration 40390/ 60336 | consumed samples: 10339840 | consumed tokens: 21175992320 | elapsed time per iteration (s): 0.15 | learning rate: 6.514E-05 | global batch size: 256 | lm loss: 3.782380E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.471 | TFLOPs: 26.13 | +7: iteration 40400/ 60336 | consumed samples: 10342400 | consumed tokens: 21181235200 | elapsed time per iteration (s): 0.15 | learning rate: 6.510E-05 | global batch size: 256 | lm loss: 3.783309E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.154 | TFLOPs: 26.10 | +7: iteration 40410/ 60336 | consumed samples: 10344960 | consumed tokens: 21186478080 | elapsed time per iteration (s): 0.15 | learning rate: 6.506E-05 | global batch size: 256 | lm loss: 3.771195E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.601 | TFLOPs: 26.11 | +7: iteration 40420/ 60336 | consumed samples: 10347520 | consumed tokens: 21191720960 | elapsed time per iteration (s): 0.15 | learning rate: 6.502E-05 | global batch size: 256 | lm loss: 3.766378E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.029 | TFLOPs: 26.13 | +7: iteration 40430/ 60336 | consumed samples: 10350080 | consumed tokens: 21196963840 | elapsed time per iteration (s): 0.15 | learning rate: 6.498E-05 | global batch size: 256 | lm loss: 3.773135E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.286 | TFLOPs: 26.12 | +7: iteration 40440/ 60336 | consumed samples: 10352640 | consumed tokens: 21202206720 | elapsed time per iteration (s): 0.15 | learning rate: 6.494E-05 | global batch size: 256 | lm loss: 3.782851E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.132 | TFLOPs: 26.11 | +7: iteration 40450/ 60336 | consumed samples: 10355200 | consumed tokens: 21207449600 | elapsed time per iteration (s): 0.16 | learning rate: 6.490E-05 | global batch size: 256 | lm loss: 3.772353E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.860 | TFLOPs: 25.43 | +7: iteration 40460/ 60336 | consumed samples: 10357760 | consumed tokens: 21212692480 | elapsed time per iteration (s): 0.15 | learning rate: 6.486E-05 | global batch size: 256 | lm loss: 3.764730E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.182 | TFLOPs: 26.10 | +7: iteration 40470/ 60336 | consumed samples: 10360320 | consumed tokens: 21217935360 | elapsed time per iteration (s): 0.15 | learning rate: 6.482E-05 | global batch size: 256 | lm loss: 3.766053E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.323 | TFLOPs: 26.13 | +7: iteration 40480/ 60336 | consumed samples: 10362880 | consumed tokens: 21223178240 | elapsed time per iteration (s): 0.15 | learning rate: 6.478E-05 | global batch size: 256 | lm loss: 3.777463E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.731 | TFLOPs: 26.11 | +7: iteration 40490/ 60336 | consumed samples: 10365440 | consumed tokens: 21228421120 | elapsed time per iteration (s): 0.15 | learning rate: 6.473E-05 | global batch size: 256 | lm loss: 3.777696E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.224 | TFLOPs: 26.05 | +7: iteration 40500/ 60336 | consumed samples: 10368000 | consumed tokens: 21233664000 | elapsed time per iteration (s): 0.15 | learning rate: 6.469E-05 | global batch size: 256 | lm loss: 3.792033E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.669 | TFLOPs: 26.06 | +7: iteration 40510/ 60336 | consumed samples: 10370560 | consumed tokens: 21238906880 | elapsed time per iteration (s): 0.15 | learning rate: 6.465E-05 | global batch size: 256 | lm loss: 3.785789E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.357 | TFLOPs: 26.05 | +7: iteration 40520/ 60336 | consumed samples: 10373120 | consumed tokens: 21244149760 | elapsed time per iteration (s): 0.15 | learning rate: 6.461E-05 | global batch size: 256 | lm loss: 3.783082E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.574 | TFLOPs: 26.04 | +7: iteration 40530/ 60336 | consumed samples: 10375680 | consumed tokens: 21249392640 | elapsed time per iteration (s): 0.15 | learning rate: 6.457E-05 | global batch size: 256 | lm loss: 3.774337E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.883 | TFLOPs: 26.05 | +7: iteration 40540/ 60336 | consumed samples: 10378240 | consumed tokens: 21254635520 | elapsed time per iteration (s): 0.15 | learning rate: 6.453E-05 | global batch size: 256 | lm loss: 3.766283E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.635 | TFLOPs: 25.98 | +7: iteration 40550/ 60336 | consumed samples: 10380800 | consumed tokens: 21259878400 | elapsed time per iteration (s): 0.15 | learning rate: 6.449E-05 | global batch size: 256 | lm loss: 3.778804E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.787 | TFLOPs: 26.03 | +7: iteration 40560/ 60336 | consumed samples: 10383360 | consumed tokens: 21265121280 | elapsed time per iteration (s): 0.15 | learning rate: 6.445E-05 | global batch size: 256 | lm loss: 3.762089E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.495 | TFLOPs: 26.06 | +7: iteration 40570/ 60336 | consumed samples: 10385920 | consumed tokens: 21270364160 | elapsed time per iteration (s): 0.15 | learning rate: 6.441E-05 | global batch size: 256 | lm loss: 3.772308E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.210 | TFLOPs: 26.05 | +7: iteration 40580/ 60336 | consumed samples: 10388480 | consumed tokens: 21275607040 | elapsed time per iteration (s): 0.16 | learning rate: 6.437E-05 | global batch size: 256 | lm loss: 3.767589E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.888 | TFLOPs: 25.69 | +7: iteration 40590/ 60336 | consumed samples: 10391040 | consumed tokens: 21280849920 | elapsed time per iteration (s): 0.15 | learning rate: 6.433E-05 | global batch size: 256 | lm loss: 3.781804E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.141 | TFLOPs: 25.93 | +7: iteration 40600/ 60336 | consumed samples: 10393600 | consumed tokens: 21286092800 | elapsed time per iteration (s): 0.15 | learning rate: 6.429E-05 | global batch size: 256 | lm loss: 3.767249E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.786 | TFLOPs: 26.01 | +7: iteration 40610/ 60336 | consumed samples: 10396160 | consumed tokens: 21291335680 | elapsed time per iteration (s): 0.15 | learning rate: 6.424E-05 | global batch size: 256 | lm loss: 3.769058E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.110 | TFLOPs: 26.07 | +7: iteration 40620/ 60336 | consumed samples: 10398720 | consumed tokens: 21296578560 | elapsed time per iteration (s): 0.16 | learning rate: 6.420E-05 | global batch size: 256 | lm loss: 3.773276E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.668 | TFLOPs: 25.84 | +7: iteration 40630/ 60336 | consumed samples: 10401280 | consumed tokens: 21301821440 | elapsed time per iteration (s): 0.15 | learning rate: 6.416E-05 | global batch size: 256 | lm loss: 3.776984E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.702 | TFLOPs: 26.08 | +7: iteration 40640/ 60336 | consumed samples: 10403840 | consumed tokens: 21307064320 | elapsed time per iteration (s): 0.15 | learning rate: 6.412E-05 | global batch size: 256 | lm loss: 3.776237E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.119 | TFLOPs: 26.07 | +7: iteration 40650/ 60336 | consumed samples: 10406400 | consumed tokens: 21312307200 | elapsed time per iteration (s): 0.15 | learning rate: 6.408E-05 | global batch size: 256 | lm loss: 3.762160E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.580 | TFLOPs: 26.07 | +7: iteration 40660/ 60336 | consumed samples: 10408960 | consumed tokens: 21317550080 | elapsed time per iteration (s): 0.15 | learning rate: 6.404E-05 | global batch size: 256 | lm loss: 3.779366E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.959 | TFLOPs: 26.08 | +7: iteration 40670/ 60336 | consumed samples: 10411520 | consumed tokens: 21322792960 | elapsed time per iteration (s): 0.15 | learning rate: 6.400E-05 | global batch size: 256 | lm loss: 3.778169E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.727 | TFLOPs: 26.08 | +7: iteration 40680/ 60336 | consumed samples: 10414080 | consumed tokens: 21328035840 | elapsed time per iteration (s): 0.15 | learning rate: 6.396E-05 | global batch size: 256 | lm loss: 3.767394E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.394 | TFLOPs: 26.07 | +7: iteration 40690/ 60336 | consumed samples: 10416640 | consumed tokens: 21333278720 | elapsed time per iteration (s): 0.15 | learning rate: 6.392E-05 | global batch size: 256 | lm loss: 3.774542E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.473 | TFLOPs: 26.06 | +7: iteration 40700/ 60336 | consumed samples: 10419200 | consumed tokens: 21338521600 | elapsed time per iteration (s): 0.15 | learning rate: 6.388E-05 | global batch size: 256 | lm loss: 3.776360E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.404 | TFLOPs: 26.04 | +7: iteration 40710/ 60336 | consumed samples: 10421760 | consumed tokens: 21343764480 | elapsed time per iteration (s): 0.15 | learning rate: 6.384E-05 | global batch size: 256 | lm loss: 3.779904E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.835 | TFLOPs: 26.06 | +7: iteration 40720/ 60336 | consumed samples: 10424320 | consumed tokens: 21349007360 | elapsed time per iteration (s): 0.15 | learning rate: 6.380E-05 | global batch size: 256 | lm loss: 3.777866E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.178 | TFLOPs: 26.10 | +7: iteration 40730/ 60336 | consumed samples: 10426880 | consumed tokens: 21354250240 | elapsed time per iteration (s): 0.15 | learning rate: 6.376E-05 | global batch size: 256 | lm loss: 3.780034E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.550 | TFLOPs: 26.07 | +7: iteration 40740/ 60336 | consumed samples: 10429440 | consumed tokens: 21359493120 | elapsed time per iteration (s): 0.15 | learning rate: 6.372E-05 | global batch size: 256 | lm loss: 3.768196E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.568 | TFLOPs: 26.06 | +7: iteration 40750/ 60336 | consumed samples: 10432000 | consumed tokens: 21364736000 | elapsed time per iteration (s): 0.15 | learning rate: 6.368E-05 | global batch size: 256 | lm loss: 3.785054E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.950 | TFLOPs: 26.06 | +7: iteration 40760/ 60336 | consumed samples: 10434560 | consumed tokens: 21369978880 | elapsed time per iteration (s): 0.15 | learning rate: 6.363E-05 | global batch size: 256 | lm loss: 3.775680E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.086 | TFLOPs: 26.08 | +7: iteration 40770/ 60336 | consumed samples: 10437120 | consumed tokens: 21375221760 | elapsed time per iteration (s): 0.15 | learning rate: 6.359E-05 | global batch size: 256 | lm loss: 3.770089E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.922 | TFLOPs: 26.05 | +7: iteration 40780/ 60336 | consumed samples: 10439680 | consumed tokens: 21380464640 | elapsed time per iteration (s): 0.15 | learning rate: 6.355E-05 | global batch size: 256 | lm loss: 3.766690E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.046 | TFLOPs: 26.03 | +7: iteration 40790/ 60336 | consumed samples: 10442240 | consumed tokens: 21385707520 | elapsed time per iteration (s): 0.16 | learning rate: 6.351E-05 | global batch size: 256 | lm loss: 3.764161E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.587 | TFLOPs: 25.67 | +7: iteration 40800/ 60336 | consumed samples: 10444800 | consumed tokens: 21390950400 | elapsed time per iteration (s): 0.15 | learning rate: 6.347E-05 | global batch size: 256 | lm loss: 3.786964E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.048 | TFLOPs: 25.99 | +7: iteration 40810/ 60336 | consumed samples: 10447360 | consumed tokens: 21396193280 | elapsed time per iteration (s): 0.15 | learning rate: 6.343E-05 | global batch size: 256 | lm loss: 3.761684E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.081 | TFLOPs: 26.02 | +7: iteration 40820/ 60336 | consumed samples: 10449920 | consumed tokens: 21401436160 | elapsed time per iteration (s): 0.15 | learning rate: 6.339E-05 | global batch size: 256 | lm loss: 3.772036E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.208 | TFLOPs: 25.97 | +7: iteration 40830/ 60336 | consumed samples: 10452480 | consumed tokens: 21406679040 | elapsed time per iteration (s): 0.15 | learning rate: 6.335E-05 | global batch size: 256 | lm loss: 3.776941E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.511 | TFLOPs: 25.99 | +7: iteration 40840/ 60336 | consumed samples: 10455040 | consumed tokens: 21411921920 | elapsed time per iteration (s): 0.16 | learning rate: 6.331E-05 | global batch size: 256 | lm loss: 3.771328E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.667 | TFLOPs: 25.43 | +7: iteration 40850/ 60336 | consumed samples: 10457600 | consumed tokens: 21417164800 | elapsed time per iteration (s): 0.15 | learning rate: 6.327E-05 | global batch size: 256 | lm loss: 3.779872E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.937 | TFLOPs: 25.98 | +7: iteration 40860/ 60336 | consumed samples: 10460160 | consumed tokens: 21422407680 | elapsed time per iteration (s): 0.16 | learning rate: 6.323E-05 | global batch size: 256 | lm loss: 3.791006E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.305 | TFLOPs: 25.77 | +7: iteration 40870/ 60336 | consumed samples: 10462720 | consumed tokens: 21427650560 | elapsed time per iteration (s): 0.15 | learning rate: 6.319E-05 | global batch size: 256 | lm loss: 3.779771E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.089 | TFLOPs: 26.02 | +7: iteration 40880/ 60336 | consumed samples: 10465280 | consumed tokens: 21432893440 | elapsed time per iteration (s): 0.15 | learning rate: 6.315E-05 | global batch size: 256 | lm loss: 3.777973E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.685 | TFLOPs: 26.01 | +7: iteration 40890/ 60336 | consumed samples: 10467840 | consumed tokens: 21438136320 | elapsed time per iteration (s): 0.15 | learning rate: 6.311E-05 | global batch size: 256 | lm loss: 3.765255E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.630 | TFLOPs: 26.00 | +7: iteration 40900/ 60336 | consumed samples: 10470400 | consumed tokens: 21443379200 | elapsed time per iteration (s): 0.15 | learning rate: 6.307E-05 | global batch size: 256 | lm loss: 3.777526E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.372 | TFLOPs: 26.01 | +7: iteration 40910/ 60336 | consumed samples: 10472960 | consumed tokens: 21448622080 | elapsed time per iteration (s): 0.15 | learning rate: 6.303E-05 | global batch size: 256 | lm loss: 3.773862E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.722 | TFLOPs: 26.00 | +7: iteration 40920/ 60336 | consumed samples: 10475520 | consumed tokens: 21453864960 | elapsed time per iteration (s): 0.15 | learning rate: 6.299E-05 | global batch size: 256 | lm loss: 3.766244E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.049 | TFLOPs: 26.00 | +7: iteration 40930/ 60336 | consumed samples: 10478080 | consumed tokens: 21459107840 | elapsed time per iteration (s): 0.15 | learning rate: 6.295E-05 | global batch size: 256 | lm loss: 3.772909E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.959 | TFLOPs: 26.02 | +7: iteration 40940/ 60336 | consumed samples: 10480640 | consumed tokens: 21464350720 | elapsed time per iteration (s): 0.15 | learning rate: 6.291E-05 | global batch size: 256 | lm loss: 3.769294E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.214 | TFLOPs: 26.02 | +7: iteration 40950/ 60336 | consumed samples: 10483200 | consumed tokens: 21469593600 | elapsed time per iteration (s): 0.15 | learning rate: 6.287E-05 | global batch size: 256 | lm loss: 3.779798E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.508 | TFLOPs: 26.06 | +7: iteration 40960/ 60336 | consumed samples: 10485760 | consumed tokens: 21474836480 | elapsed time per iteration (s): 0.15 | learning rate: 6.283E-05 | global batch size: 256 | lm loss: 3.763743E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.614 | TFLOPs: 26.03 | +7: iteration 40970/ 60336 | consumed samples: 10488320 | consumed tokens: 21480079360 | elapsed time per iteration (s): 0.15 | learning rate: 6.279E-05 | global batch size: 256 | lm loss: 3.765366E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.624 | TFLOPs: 26.04 | +7: iteration 40980/ 60336 | consumed samples: 10490880 | consumed tokens: 21485322240 | elapsed time per iteration (s): 0.16 | learning rate: 6.275E-05 | global batch size: 256 | lm loss: 3.787313E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.070 | TFLOPs: 25.88 | +7: iteration 40990/ 60336 | consumed samples: 10493440 | consumed tokens: 21490565120 | elapsed time per iteration (s): 0.15 | learning rate: 6.270E-05 | global batch size: 256 | lm loss: 3.771709E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.125 | TFLOPs: 26.02 | +7: iteration 41000/ 60336 | consumed samples: 10496000 | consumed tokens: 21495808000 | elapsed time per iteration (s): 0.15 | learning rate: 6.266E-05 | global batch size: 256 | lm loss: 3.779904E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.033 | TFLOPs: 26.00 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 41000 | lm loss value: 3.869606E+00 | lm loss PPL: 4.792348E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 41000 to checkpoints_44m32b100m +0: [2023-03-17 02:05:02,333] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step41000 is begin to save! +0: [2023-03-17 02:05:02,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:05:02,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:05:02,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:05:02,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:05:02,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:05:02,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:05:02,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:05:02,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:05:02,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:05:02,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:05:02,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:05:02,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:05:02,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:05:02,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:05:02,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:05:02,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:05:02,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:05:02,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:05:02,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:05:02,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:05:02,463] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step41000/mp_rank_00_model_states.pt +0: [2023-03-17 02:05:02,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:05:02,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:05:02,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:05:02,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:05:02,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:05:02,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:05:02,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:05:02,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:05:02,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:05:02,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:05:02,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: successfully saved checkpoint at iteration 41000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 176.71 +7: iteration 41010/ 60336 | consumed samples: 10498560 | consumed tokens: 21501050880 | elapsed time per iteration (s): 0.18 | learning rate: 6.262E-05 | global batch size: 256 | lm loss: 3.786682E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.599 | TFLOPs: 22.73 | +7: iteration 41020/ 60336 | consumed samples: 10501120 | consumed tokens: 21506293760 | elapsed time per iteration (s): 0.15 | learning rate: 6.258E-05 | global batch size: 256 | lm loss: 3.772890E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.099 | TFLOPs: 26.10 | +7: iteration 41030/ 60336 | consumed samples: 10503680 | consumed tokens: 21511536640 | elapsed time per iteration (s): 0.15 | learning rate: 6.254E-05 | global batch size: 256 | lm loss: 3.786730E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.281 | TFLOPs: 26.10 | +7: iteration 41040/ 60336 | consumed samples: 10506240 | consumed tokens: 21516779520 | elapsed time per iteration (s): 0.15 | learning rate: 6.250E-05 | global batch size: 256 | lm loss: 3.771537E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.869 | TFLOPs: 26.08 | +7: iteration 41050/ 60336 | consumed samples: 10508800 | consumed tokens: 21522022400 | elapsed time per iteration (s): 0.15 | learning rate: 6.246E-05 | global batch size: 256 | lm loss: 3.780350E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.323 | TFLOPs: 26.05 | +7: iteration 41060/ 60336 | consumed samples: 10511360 | consumed tokens: 21527265280 | elapsed time per iteration (s): 0.15 | learning rate: 6.242E-05 | global batch size: 256 | lm loss: 3.772629E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.440 | TFLOPs: 26.04 | +7: iteration 41070/ 60336 | consumed samples: 10513920 | consumed tokens: 21532508160 | elapsed time per iteration (s): 0.15 | learning rate: 6.238E-05 | global batch size: 256 | lm loss: 3.787342E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.602 | TFLOPs: 26.07 | +7: iteration 41080/ 60336 | consumed samples: 10516480 | consumed tokens: 21537751040 | elapsed time per iteration (s): 0.15 | learning rate: 6.234E-05 | global batch size: 256 | lm loss: 3.777552E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.936 | TFLOPs: 26.05 | +7: iteration 41090/ 60336 | consumed samples: 10519040 | consumed tokens: 21542993920 | elapsed time per iteration (s): 0.15 | learning rate: 6.230E-05 | global batch size: 256 | lm loss: 3.780817E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.405 | TFLOPs: 26.02 | +7: iteration 41100/ 60336 | consumed samples: 10521600 | consumed tokens: 21548236800 | elapsed time per iteration (s): 0.15 | learning rate: 6.226E-05 | global batch size: 256 | lm loss: 3.771283E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.849 | TFLOPs: 26.06 | +7: iteration 41110/ 60336 | consumed samples: 10524160 | consumed tokens: 21553479680 | elapsed time per iteration (s): 0.15 | learning rate: 6.222E-05 | global batch size: 256 | lm loss: 3.781630E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.413 | TFLOPs: 26.04 | +7: iteration 41120/ 60336 | consumed samples: 10526720 | consumed tokens: 21558722560 | elapsed time per iteration (s): 0.15 | learning rate: 6.218E-05 | global batch size: 256 | lm loss: 3.771622E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.574 | TFLOPs: 26.04 | +7: iteration 41130/ 60336 | consumed samples: 10529280 | consumed tokens: 21563965440 | elapsed time per iteration (s): 0.16 | learning rate: 6.214E-05 | global batch size: 256 | lm loss: 3.762499E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.996 | TFLOPs: 24.89 | +7: iteration 41140/ 60336 | consumed samples: 10531840 | consumed tokens: 21569208320 | elapsed time per iteration (s): 0.15 | learning rate: 6.210E-05 | global batch size: 256 | lm loss: 3.769133E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.324 | TFLOPs: 26.07 | +7: iteration 41150/ 60336 | consumed samples: 10534400 | consumed tokens: 21574451200 | elapsed time per iteration (s): 0.15 | learning rate: 6.206E-05 | global batch size: 256 | lm loss: 3.784490E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.942 | TFLOPs: 26.03 | +7: iteration 41160/ 60336 | consumed samples: 10536960 | consumed tokens: 21579694080 | elapsed time per iteration (s): 0.15 | learning rate: 6.202E-05 | global batch size: 256 | lm loss: 3.761871E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.158 | TFLOPs: 26.05 | +7: iteration 41170/ 60336 | consumed samples: 10539520 | consumed tokens: 21584936960 | elapsed time per iteration (s): 0.15 | learning rate: 6.198E-05 | global batch size: 256 | lm loss: 3.769358E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.356 | TFLOPs: 26.04 | +7: iteration 41180/ 60336 | consumed samples: 10542080 | consumed tokens: 21590179840 | elapsed time per iteration (s): 0.16 | learning rate: 6.194E-05 | global batch size: 256 | lm loss: 3.770674E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.066 | TFLOPs: 25.72 | +7: iteration 41190/ 60336 | consumed samples: 10544640 | consumed tokens: 21595422720 | elapsed time per iteration (s): 0.16 | learning rate: 6.190E-05 | global batch size: 256 | lm loss: 3.771834E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.326 | TFLOPs: 25.32 | +7: iteration 41200/ 60336 | consumed samples: 10547200 | consumed tokens: 21600665600 | elapsed time per iteration (s): 0.16 | learning rate: 6.186E-05 | global batch size: 256 | lm loss: 3.780670E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.020 | TFLOPs: 25.86 | +7: iteration 41210/ 60336 | consumed samples: 10549760 | consumed tokens: 21605908480 | elapsed time per iteration (s): 0.15 | learning rate: 6.182E-05 | global batch size: 256 | lm loss: 3.790573E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.246 | TFLOPs: 26.05 | +7: iteration 41220/ 60336 | consumed samples: 10552320 | consumed tokens: 21611151360 | elapsed time per iteration (s): 0.15 | learning rate: 6.178E-05 | global batch size: 256 | lm loss: 3.778393E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.602 | TFLOPs: 26.04 | +7: iteration 41230/ 60336 | consumed samples: 10554880 | consumed tokens: 21616394240 | elapsed time per iteration (s): 0.15 | learning rate: 6.174E-05 | global batch size: 256 | lm loss: 3.779706E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.841 | TFLOPs: 26.06 | +7: iteration 41240/ 60336 | consumed samples: 10557440 | consumed tokens: 21621637120 | elapsed time per iteration (s): 0.15 | learning rate: 6.170E-05 | global batch size: 256 | lm loss: 3.762635E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.117 | TFLOPs: 26.03 | +7: iteration 41250/ 60336 | consumed samples: 10560000 | consumed tokens: 21626880000 | elapsed time per iteration (s): 0.15 | learning rate: 6.166E-05 | global batch size: 256 | lm loss: 3.776205E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.124 | TFLOPs: 26.00 | +7: iteration 41260/ 60336 | consumed samples: 10562560 | consumed tokens: 21632122880 | elapsed time per iteration (s): 0.15 | learning rate: 6.162E-05 | global batch size: 256 | lm loss: 3.778013E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.483 | TFLOPs: 25.95 | +7: iteration 41270/ 60336 | consumed samples: 10565120 | consumed tokens: 21637365760 | elapsed time per iteration (s): 0.15 | learning rate: 6.158E-05 | global batch size: 256 | lm loss: 3.768633E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.784 | TFLOPs: 25.94 | +7: iteration 41280/ 60336 | consumed samples: 10567680 | consumed tokens: 21642608640 | elapsed time per iteration (s): 0.15 | learning rate: 6.154E-05 | global batch size: 256 | lm loss: 3.779570E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.455 | TFLOPs: 25.95 | +7: iteration 41290/ 60336 | consumed samples: 10570240 | consumed tokens: 21647851520 | elapsed time per iteration (s): 0.15 | learning rate: 6.150E-05 | global batch size: 256 | lm loss: 3.770629E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.979 | TFLOPs: 25.97 | +7: iteration 41300/ 60336 | consumed samples: 10572800 | consumed tokens: 21653094400 | elapsed time per iteration (s): 0.15 | learning rate: 6.146E-05 | global batch size: 256 | lm loss: 3.766508E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.546 | TFLOPs: 25.96 | +7: iteration 41310/ 60336 | consumed samples: 10575360 | consumed tokens: 21658337280 | elapsed time per iteration (s): 0.16 | learning rate: 6.142E-05 | global batch size: 256 | lm loss: 3.763071E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.861 | TFLOPs: 24.73 | +7: iteration 41320/ 60336 | consumed samples: 10577920 | consumed tokens: 21663580160 | elapsed time per iteration (s): 0.15 | learning rate: 6.138E-05 | global batch size: 256 | lm loss: 3.766231E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.660 | TFLOPs: 25.98 | +7: iteration 41330/ 60336 | consumed samples: 10580480 | consumed tokens: 21668823040 | elapsed time per iteration (s): 0.15 | learning rate: 6.134E-05 | global batch size: 256 | lm loss: 3.778239E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.555 | TFLOPs: 25.96 | +7: iteration 41340/ 60336 | consumed samples: 10583040 | consumed tokens: 21674065920 | elapsed time per iteration (s): 0.15 | learning rate: 6.130E-05 | global batch size: 256 | lm loss: 3.773004E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.092 | TFLOPs: 25.99 | +7: iteration 41350/ 60336 | consumed samples: 10585600 | consumed tokens: 21679308800 | elapsed time per iteration (s): 0.15 | learning rate: 6.126E-05 | global batch size: 256 | lm loss: 3.768859E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.179 | TFLOPs: 25.97 | +7: iteration 41360/ 60336 | consumed samples: 10588160 | consumed tokens: 21684551680 | elapsed time per iteration (s): 0.15 | learning rate: 6.122E-05 | global batch size: 256 | lm loss: 3.779290E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.449 | TFLOPs: 25.95 | +7: iteration 41370/ 60336 | consumed samples: 10590720 | consumed tokens: 21689794560 | elapsed time per iteration (s): 0.15 | learning rate: 6.118E-05 | global batch size: 256 | lm loss: 3.784662E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.303 | TFLOPs: 25.99 | +7: iteration 41380/ 60336 | consumed samples: 10593280 | consumed tokens: 21695037440 | elapsed time per iteration (s): 0.15 | learning rate: 6.114E-05 | global batch size: 256 | lm loss: 3.765535E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.785 | TFLOPs: 26.01 | +7: iteration 41390/ 60336 | consumed samples: 10595840 | consumed tokens: 21700280320 | elapsed time per iteration (s): 0.15 | learning rate: 6.110E-05 | global batch size: 256 | lm loss: 3.760920E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.476 | TFLOPs: 26.04 | +7: iteration 41400/ 60336 | consumed samples: 10598400 | consumed tokens: 21705523200 | elapsed time per iteration (s): 0.15 | learning rate: 6.106E-05 | global batch size: 256 | lm loss: 3.771925E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.156 | TFLOPs: 26.11 | +7: iteration 41410/ 60336 | consumed samples: 10600960 | consumed tokens: 21710766080 | elapsed time per iteration (s): 0.15 | learning rate: 6.103E-05 | global batch size: 256 | lm loss: 3.779147E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.962 | TFLOPs: 26.11 | +7: iteration 41420/ 60336 | consumed samples: 10603520 | consumed tokens: 21716008960 | elapsed time per iteration (s): 0.15 | learning rate: 6.099E-05 | global batch size: 256 | lm loss: 3.768940E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.552 | TFLOPs: 26.07 | +7: iteration 41430/ 60336 | consumed samples: 10606080 | consumed tokens: 21721251840 | elapsed time per iteration (s): 0.15 | learning rate: 6.095E-05 | global batch size: 256 | lm loss: 3.770645E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.951 | TFLOPs: 26.06 | +7: iteration 41440/ 60336 | consumed samples: 10608640 | consumed tokens: 21726494720 | elapsed time per iteration (s): 0.15 | learning rate: 6.091E-05 | global batch size: 256 | lm loss: 3.765303E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.942 | TFLOPs: 26.09 | +7: iteration 41450/ 60336 | consumed samples: 10611200 | consumed tokens: 21731737600 | elapsed time per iteration (s): 0.16 | learning rate: 6.087E-05 | global batch size: 256 | lm loss: 3.767989E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.092 | TFLOPs: 24.45 | +7: iteration 41460/ 60336 | consumed samples: 10613760 | consumed tokens: 21736980480 | elapsed time per iteration (s): 0.15 | learning rate: 6.083E-05 | global batch size: 256 | lm loss: 3.770611E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.683 | TFLOPs: 25.90 | +7: iteration 41470/ 60336 | consumed samples: 10616320 | consumed tokens: 21742223360 | elapsed time per iteration (s): 0.15 | learning rate: 6.079E-05 | global batch size: 256 | lm loss: 3.781963E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.953 | TFLOPs: 26.13 | +7: iteration 41480/ 60336 | consumed samples: 10618880 | consumed tokens: 21747466240 | elapsed time per iteration (s): 0.15 | learning rate: 6.075E-05 | global batch size: 256 | lm loss: 3.771644E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.940 | TFLOPs: 26.11 | +7: iteration 41490/ 60336 | consumed samples: 10621440 | consumed tokens: 21752709120 | elapsed time per iteration (s): 0.15 | learning rate: 6.071E-05 | global batch size: 256 | lm loss: 3.771518E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.767 | TFLOPs: 26.12 | +7: iteration 41500/ 60336 | consumed samples: 10624000 | consumed tokens: 21757952000 | elapsed time per iteration (s): 0.15 | learning rate: 6.067E-05 | global batch size: 256 | lm loss: 3.791241E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.669 | TFLOPs: 26.12 | +7: iteration 41510/ 60336 | consumed samples: 10626560 | consumed tokens: 21763194880 | elapsed time per iteration (s): 0.15 | learning rate: 6.063E-05 | global batch size: 256 | lm loss: 3.774255E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.638 | TFLOPs: 26.11 | +7: iteration 41520/ 60336 | consumed samples: 10629120 | consumed tokens: 21768437760 | elapsed time per iteration (s): 0.15 | learning rate: 6.059E-05 | global batch size: 256 | lm loss: 3.777473E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.882 | TFLOPs: 26.13 | +7: iteration 41530/ 60336 | consumed samples: 10631680 | consumed tokens: 21773680640 | elapsed time per iteration (s): 0.15 | learning rate: 6.055E-05 | global batch size: 256 | lm loss: 3.767583E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.483 | TFLOPs: 26.13 | +7: iteration 41540/ 60336 | consumed samples: 10634240 | consumed tokens: 21778923520 | elapsed time per iteration (s): 0.15 | learning rate: 6.051E-05 | global batch size: 256 | lm loss: 3.771147E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.488 | TFLOPs: 26.12 | +7: iteration 41550/ 60336 | consumed samples: 10636800 | consumed tokens: 21784166400 | elapsed time per iteration (s): 0.15 | learning rate: 6.047E-05 | global batch size: 256 | lm loss: 3.788840E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.929 | TFLOPs: 26.11 | +7: iteration 41560/ 60336 | consumed samples: 10639360 | consumed tokens: 21789409280 | elapsed time per iteration (s): 0.15 | learning rate: 6.043E-05 | global batch size: 256 | lm loss: 3.790396E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.563 | TFLOPs: 26.09 | +7: iteration 41570/ 60336 | consumed samples: 10641920 | consumed tokens: 21794652160 | elapsed time per iteration (s): 0.15 | learning rate: 6.039E-05 | global batch size: 256 | lm loss: 3.761921E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.096 | TFLOPs: 26.10 | +7: iteration 41580/ 60336 | consumed samples: 10644480 | consumed tokens: 21799895040 | elapsed time per iteration (s): 0.15 | learning rate: 6.035E-05 | global batch size: 256 | lm loss: 3.771693E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.859 | TFLOPs: 26.12 | +7: iteration 41590/ 60336 | consumed samples: 10647040 | consumed tokens: 21805137920 | elapsed time per iteration (s): 0.15 | learning rate: 6.031E-05 | global batch size: 256 | lm loss: 3.786193E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.156 | TFLOPs: 26.10 | +7: iteration 41600/ 60336 | consumed samples: 10649600 | consumed tokens: 21810380800 | elapsed time per iteration (s): 0.15 | learning rate: 6.027E-05 | global batch size: 256 | lm loss: 3.771463E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.064 | TFLOPs: 26.07 | +7: iteration 41610/ 60336 | consumed samples: 10652160 | consumed tokens: 21815623680 | elapsed time per iteration (s): 0.15 | learning rate: 6.023E-05 | global batch size: 256 | lm loss: 3.779043E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.452 | TFLOPs: 26.07 | +7: iteration 41620/ 60336 | consumed samples: 10654720 | consumed tokens: 21820866560 | elapsed time per iteration (s): 0.15 | learning rate: 6.019E-05 | global batch size: 256 | lm loss: 3.768453E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.872 | TFLOPs: 26.09 | +7: iteration 41630/ 60336 | consumed samples: 10657280 | consumed tokens: 21826109440 | elapsed time per iteration (s): 0.15 | learning rate: 6.015E-05 | global batch size: 256 | lm loss: 3.777179E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.532 | TFLOPs: 26.14 | +7: iteration 41640/ 60336 | consumed samples: 10659840 | consumed tokens: 21831352320 | elapsed time per iteration (s): 0.15 | learning rate: 6.012E-05 | global batch size: 256 | lm loss: 3.774757E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.309 | TFLOPs: 26.10 | +7: iteration 41650/ 60336 | consumed samples: 10662400 | consumed tokens: 21836595200 | elapsed time per iteration (s): 0.15 | learning rate: 6.008E-05 | global batch size: 256 | lm loss: 3.769282E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.061 | TFLOPs: 26.11 | +7: iteration 41660/ 60336 | consumed samples: 10664960 | consumed tokens: 21841838080 | elapsed time per iteration (s): 0.15 | learning rate: 6.004E-05 | global batch size: 256 | lm loss: 3.779892E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.266 | TFLOPs: 26.12 | +7: iteration 41670/ 60336 | consumed samples: 10667520 | consumed tokens: 21847080960 | elapsed time per iteration (s): 0.15 | learning rate: 6.000E-05 | global batch size: 256 | lm loss: 3.765352E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.909 | TFLOPs: 26.13 | +7: iteration 41680/ 60336 | consumed samples: 10670080 | consumed tokens: 21852323840 | elapsed time per iteration (s): 0.15 | learning rate: 5.996E-05 | global batch size: 256 | lm loss: 3.782681E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.072 | TFLOPs: 26.13 | +7: iteration 41690/ 60336 | consumed samples: 10672640 | consumed tokens: 21857566720 | elapsed time per iteration (s): 0.15 | learning rate: 5.992E-05 | global batch size: 256 | lm loss: 3.788314E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.194 | TFLOPs: 26.13 | +7: iteration 41700/ 60336 | consumed samples: 10675200 | consumed tokens: 21862809600 | elapsed time per iteration (s): 0.15 | learning rate: 5.988E-05 | global batch size: 256 | lm loss: 3.758915E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.483 | TFLOPs: 26.12 | +7: iteration 41710/ 60336 | consumed samples: 10677760 | consumed tokens: 21868052480 | elapsed time per iteration (s): 0.15 | learning rate: 5.984E-05 | global batch size: 256 | lm loss: 3.776735E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.613 | TFLOPs: 26.14 | +7: iteration 41720/ 60336 | consumed samples: 10680320 | consumed tokens: 21873295360 | elapsed time per iteration (s): 0.15 | learning rate: 5.980E-05 | global batch size: 256 | lm loss: 3.774207E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.995 | TFLOPs: 26.17 | +7: iteration 41730/ 60336 | consumed samples: 10682880 | consumed tokens: 21878538240 | elapsed time per iteration (s): 0.15 | learning rate: 5.976E-05 | global batch size: 256 | lm loss: 3.767384E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.555 | TFLOPs: 26.20 | +7: iteration 41740/ 60336 | consumed samples: 10685440 | consumed tokens: 21883781120 | elapsed time per iteration (s): 0.15 | learning rate: 5.972E-05 | global batch size: 256 | lm loss: 3.779618E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.778 | TFLOPs: 26.20 | +7: iteration 41750/ 60336 | consumed samples: 10688000 | consumed tokens: 21889024000 | elapsed time per iteration (s): 0.15 | learning rate: 5.968E-05 | global batch size: 256 | lm loss: 3.782896E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.699 | TFLOPs: 26.19 | +7: iteration 41760/ 60336 | consumed samples: 10690560 | consumed tokens: 21894266880 | elapsed time per iteration (s): 0.15 | learning rate: 5.964E-05 | global batch size: 256 | lm loss: 3.779424E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.804 | TFLOPs: 26.19 | +7: iteration 41770/ 60336 | consumed samples: 10693120 | consumed tokens: 21899509760 | elapsed time per iteration (s): 0.15 | learning rate: 5.960E-05 | global batch size: 256 | lm loss: 3.773909E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.083 | TFLOPs: 26.18 | +7: iteration 41780/ 60336 | consumed samples: 10695680 | consumed tokens: 21904752640 | elapsed time per iteration (s): 0.15 | learning rate: 5.957E-05 | global batch size: 256 | lm loss: 3.776757E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.897 | TFLOPs: 26.19 | +7: iteration 41790/ 60336 | consumed samples: 10698240 | consumed tokens: 21909995520 | elapsed time per iteration (s): 0.15 | learning rate: 5.953E-05 | global batch size: 256 | lm loss: 3.783971E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.826 | TFLOPs: 26.17 | +7: iteration 41800/ 60336 | consumed samples: 10700800 | consumed tokens: 21915238400 | elapsed time per iteration (s): 0.15 | learning rate: 5.949E-05 | global batch size: 256 | lm loss: 3.780643E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.149 | TFLOPs: 26.15 | +7: iteration 41810/ 60336 | consumed samples: 10703360 | consumed tokens: 21920481280 | elapsed time per iteration (s): 0.15 | learning rate: 5.945E-05 | global batch size: 256 | lm loss: 3.759889E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.986 | TFLOPs: 26.11 | +7: iteration 41820/ 60336 | consumed samples: 10705920 | consumed tokens: 21925724160 | elapsed time per iteration (s): 0.15 | learning rate: 5.941E-05 | global batch size: 256 | lm loss: 3.772057E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.253 | TFLOPs: 26.10 | +7: iteration 41830/ 60336 | consumed samples: 10708480 | consumed tokens: 21930967040 | elapsed time per iteration (s): 0.16 | learning rate: 5.937E-05 | global batch size: 256 | lm loss: 3.779651E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.340 | TFLOPs: 25.80 | +7: iteration 41840/ 60336 | consumed samples: 10711040 | consumed tokens: 21936209920 | elapsed time per iteration (s): 0.15 | learning rate: 5.933E-05 | global batch size: 256 | lm loss: 3.777262E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.717 | TFLOPs: 26.12 | +7: iteration 41850/ 60336 | consumed samples: 10713600 | consumed tokens: 21941452800 | elapsed time per iteration (s): 0.15 | learning rate: 5.929E-05 | global batch size: 256 | lm loss: 3.774053E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.466 | TFLOPs: 26.12 | +7: iteration 41860/ 60336 | consumed samples: 10716160 | consumed tokens: 21946695680 | elapsed time per iteration (s): 0.16 | learning rate: 5.925E-05 | global batch size: 256 | lm loss: 3.770682E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.437 | TFLOPs: 25.38 | +7: iteration 41870/ 60336 | consumed samples: 10718720 | consumed tokens: 21951938560 | elapsed time per iteration (s): 0.15 | learning rate: 5.921E-05 | global batch size: 256 | lm loss: 3.771224E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.333 | TFLOPs: 26.12 | +7: iteration 41880/ 60336 | consumed samples: 10721280 | consumed tokens: 21957181440 | elapsed time per iteration (s): 0.15 | learning rate: 5.917E-05 | global batch size: 256 | lm loss: 3.772327E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.404 | TFLOPs: 26.12 | +7: iteration 41890/ 60336 | consumed samples: 10723840 | consumed tokens: 21962424320 | elapsed time per iteration (s): 0.15 | learning rate: 5.913E-05 | global batch size: 256 | lm loss: 3.760454E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.553 | TFLOPs: 26.12 | +7: iteration 41900/ 60336 | consumed samples: 10726400 | consumed tokens: 21967667200 | elapsed time per iteration (s): 0.15 | learning rate: 5.910E-05 | global batch size: 256 | lm loss: 3.763462E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.893 | TFLOPs: 26.13 | +7: iteration 41910/ 60336 | consumed samples: 10728960 | consumed tokens: 21972910080 | elapsed time per iteration (s): 0.15 | learning rate: 5.906E-05 | global batch size: 256 | lm loss: 3.767023E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.566 | TFLOPs: 26.17 | +7: iteration 41920/ 60336 | consumed samples: 10731520 | consumed tokens: 21978152960 | elapsed time per iteration (s): 0.15 | learning rate: 5.902E-05 | global batch size: 256 | lm loss: 3.767352E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.455 | TFLOPs: 26.18 | +7: iteration 41930/ 60336 | consumed samples: 10734080 | consumed tokens: 21983395840 | elapsed time per iteration (s): 0.16 | learning rate: 5.898E-05 | global batch size: 256 | lm loss: 3.780589E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.395 | TFLOPs: 25.46 | +7: iteration 41940/ 60336 | consumed samples: 10736640 | consumed tokens: 21988638720 | elapsed time per iteration (s): 0.15 | learning rate: 5.894E-05 | global batch size: 256 | lm loss: 3.763307E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.195 | TFLOPs: 26.15 | +7: iteration 41950/ 60336 | consumed samples: 10739200 | consumed tokens: 21993881600 | elapsed time per iteration (s): 0.15 | learning rate: 5.890E-05 | global batch size: 256 | lm loss: 3.777752E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.167 | TFLOPs: 26.11 | +7: iteration 41960/ 60336 | consumed samples: 10741760 | consumed tokens: 21999124480 | elapsed time per iteration (s): 0.15 | learning rate: 5.886E-05 | global batch size: 256 | lm loss: 3.768487E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.451 | TFLOPs: 26.12 | +7: iteration 41970/ 60336 | consumed samples: 10744320 | consumed tokens: 22004367360 | elapsed time per iteration (s): 0.15 | learning rate: 5.882E-05 | global batch size: 256 | lm loss: 3.768385E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.362 | TFLOPs: 26.13 | +7: iteration 41980/ 60336 | consumed samples: 10746880 | consumed tokens: 22009610240 | elapsed time per iteration (s): 0.15 | learning rate: 5.878E-05 | global batch size: 256 | lm loss: 3.774583E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.460 | TFLOPs: 26.10 | +7: iteration 41990/ 60336 | consumed samples: 10749440 | consumed tokens: 22014853120 | elapsed time per iteration (s): 0.15 | learning rate: 5.874E-05 | global batch size: 256 | lm loss: 3.771644E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.541 | TFLOPs: 26.09 | +0: [2023-03-17 02:07:36,860] [INFO] [logging.py:68:log_dist] [Rank 0] step=42000, skipped=0, lr=[5.870592798850111e-05, 5.870592798850111e-05, 5.870592798850111e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 42000/ 60336 | consumed samples: 10752000 | consumed tokens: 22020096000 | elapsed time per iteration (s): 0.15 | learning rate: 5.871E-05 | global batch size: 256 | lm loss: 3.765916E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.969 | TFLOPs: 26.13 | +0: steps: 42000 loss: 3.7449 iter time (s): 0.153 samples/sec: 1669.801 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 42000 | lm loss value: 3.942299E+00 | lm loss PPL: 5.153696E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 42000 to checkpoints_44m32b100m +0: [2023-03-17 02:07:36,935] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step42000 is begin to save! +0: [2023-03-17 02:07:36,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:07:37,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:07:37,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:07:37,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:07:37,018] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:07:37,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:07:37,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:07:37,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:07:37,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:07:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:07:37,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:07:37,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:07:37,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:07:37,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:07:37,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:07:37,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:07:37,067] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:07:37,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:07:37,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:07:37,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:07:37,077] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step42000/mp_rank_00_model_states.pt +0: [2023-03-17 02:07:37,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:07:37,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:07:37,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:07:37,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 02:07:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:07:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:07:37,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:07:37,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:07:37,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: successfully saved checkpoint at iteration 42000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 188.00 +7: iteration 42010/ 60336 | consumed samples: 10754560 | consumed tokens: 22025338880 | elapsed time per iteration (s): 0.18 | learning rate: 5.867E-05 | global batch size: 256 | lm loss: 3.771075E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1429.012 | TFLOPs: 22.41 | +7: iteration 42020/ 60336 | consumed samples: 10757120 | consumed tokens: 22030581760 | elapsed time per iteration (s): 0.15 | learning rate: 5.863E-05 | global batch size: 256 | lm loss: 3.780731E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.561 | TFLOPs: 26.21 | +7: iteration 42030/ 60336 | consumed samples: 10759680 | consumed tokens: 22035824640 | elapsed time per iteration (s): 0.15 | learning rate: 5.859E-05 | global batch size: 256 | lm loss: 3.775457E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.508 | TFLOPs: 26.21 | +7: iteration 42040/ 60336 | consumed samples: 10762240 | consumed tokens: 22041067520 | elapsed time per iteration (s): 0.15 | learning rate: 5.855E-05 | global batch size: 256 | lm loss: 3.781465E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.448 | TFLOPs: 26.21 | +7: iteration 42050/ 60336 | consumed samples: 10764800 | consumed tokens: 22046310400 | elapsed time per iteration (s): 0.15 | learning rate: 5.851E-05 | global batch size: 256 | lm loss: 3.758686E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.221 | TFLOPs: 26.24 | +7: iteration 42060/ 60336 | consumed samples: 10767360 | consumed tokens: 22051553280 | elapsed time per iteration (s): 0.15 | learning rate: 5.847E-05 | global batch size: 256 | lm loss: 3.773667E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.318 | TFLOPs: 26.15 | +7: iteration 42070/ 60336 | consumed samples: 10769920 | consumed tokens: 22056796160 | elapsed time per iteration (s): 0.15 | learning rate: 5.843E-05 | global batch size: 256 | lm loss: 3.783321E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.236 | TFLOPs: 26.22 | +7: iteration 42080/ 60336 | consumed samples: 10772480 | consumed tokens: 22062039040 | elapsed time per iteration (s): 0.15 | learning rate: 5.840E-05 | global batch size: 256 | lm loss: 3.777831E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.575 | TFLOPs: 26.21 | +7: iteration 42090/ 60336 | consumed samples: 10775040 | consumed tokens: 22067281920 | elapsed time per iteration (s): 0.15 | learning rate: 5.836E-05 | global batch size: 256 | lm loss: 3.764462E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.323 | TFLOPs: 26.19 | +7: iteration 42100/ 60336 | consumed samples: 10777600 | consumed tokens: 22072524800 | elapsed time per iteration (s): 0.15 | learning rate: 5.832E-05 | global batch size: 256 | lm loss: 3.769417E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.324 | TFLOPs: 26.21 | +7: iteration 42110/ 60336 | consumed samples: 10780160 | consumed tokens: 22077767680 | elapsed time per iteration (s): 0.15 | learning rate: 5.828E-05 | global batch size: 256 | lm loss: 3.782115E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.950 | TFLOPs: 26.20 | +7: iteration 42120/ 60336 | consumed samples: 10782720 | consumed tokens: 22083010560 | elapsed time per iteration (s): 0.15 | learning rate: 5.824E-05 | global batch size: 256 | lm loss: 3.784088E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.388 | TFLOPs: 26.21 | +7: iteration 42130/ 60336 | consumed samples: 10785280 | consumed tokens: 22088253440 | elapsed time per iteration (s): 0.15 | learning rate: 5.820E-05 | global batch size: 256 | lm loss: 3.769488E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.568 | TFLOPs: 26.21 | +7: iteration 42140/ 60336 | consumed samples: 10787840 | consumed tokens: 22093496320 | elapsed time per iteration (s): 0.15 | learning rate: 5.816E-05 | global batch size: 256 | lm loss: 3.771774E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.528 | TFLOPs: 26.21 | +7: iteration 42150/ 60336 | consumed samples: 10790400 | consumed tokens: 22098739200 | elapsed time per iteration (s): 0.15 | learning rate: 5.812E-05 | global batch size: 256 | lm loss: 3.767095E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.957 | TFLOPs: 26.20 | +7: iteration 42160/ 60336 | consumed samples: 10792960 | consumed tokens: 22103982080 | elapsed time per iteration (s): 0.15 | learning rate: 5.809E-05 | global batch size: 256 | lm loss: 3.777423E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.907 | TFLOPs: 26.20 | +7: iteration 42170/ 60336 | consumed samples: 10795520 | consumed tokens: 22109224960 | elapsed time per iteration (s): 0.15 | learning rate: 5.805E-05 | global batch size: 256 | lm loss: 3.781741E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.525 | TFLOPs: 26.17 | +7: iteration 42180/ 60336 | consumed samples: 10798080 | consumed tokens: 22114467840 | elapsed time per iteration (s): 0.15 | learning rate: 5.801E-05 | global batch size: 256 | lm loss: 3.758120E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.578 | TFLOPs: 26.20 | +7: iteration 42190/ 60336 | consumed samples: 10800640 | consumed tokens: 22119710720 | elapsed time per iteration (s): 0.16 | learning rate: 5.797E-05 | global batch size: 256 | lm loss: 3.769077E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.300 | TFLOPs: 25.88 | +7: iteration 42200/ 60336 | consumed samples: 10803200 | consumed tokens: 22124953600 | elapsed time per iteration (s): 0.15 | learning rate: 5.793E-05 | global batch size: 256 | lm loss: 3.775081E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.335 | TFLOPs: 26.13 | +7: iteration 42210/ 60336 | consumed samples: 10805760 | consumed tokens: 22130196480 | elapsed time per iteration (s): 0.15 | learning rate: 5.789E-05 | global batch size: 256 | lm loss: 3.768934E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.508 | TFLOPs: 26.04 | +7: iteration 42220/ 60336 | consumed samples: 10808320 | consumed tokens: 22135439360 | elapsed time per iteration (s): 0.15 | learning rate: 5.785E-05 | global batch size: 256 | lm loss: 3.772037E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.299 | TFLOPs: 26.13 | +7: iteration 42230/ 60336 | consumed samples: 10810880 | consumed tokens: 22140682240 | elapsed time per iteration (s): 0.15 | learning rate: 5.782E-05 | global batch size: 256 | lm loss: 3.762942E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.303 | TFLOPs: 26.19 | +7: iteration 42240/ 60336 | consumed samples: 10813440 | consumed tokens: 22145925120 | elapsed time per iteration (s): 0.15 | learning rate: 5.778E-05 | global batch size: 256 | lm loss: 3.783475E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.494 | TFLOPs: 26.13 | +7: iteration 42250/ 60336 | consumed samples: 10816000 | consumed tokens: 22151168000 | elapsed time per iteration (s): 0.15 | learning rate: 5.774E-05 | global batch size: 256 | lm loss: 3.778498E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.174 | TFLOPs: 26.16 | +7: iteration 42260/ 60336 | consumed samples: 10818560 | consumed tokens: 22156410880 | elapsed time per iteration (s): 0.15 | learning rate: 5.770E-05 | global batch size: 256 | lm loss: 3.764837E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.145 | TFLOPs: 26.19 | +7: iteration 42270/ 60336 | consumed samples: 10821120 | consumed tokens: 22161653760 | elapsed time per iteration (s): 0.15 | learning rate: 5.766E-05 | global batch size: 256 | lm loss: 3.794197E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.820 | TFLOPs: 26.20 | +7: iteration 42280/ 60336 | consumed samples: 10823680 | consumed tokens: 22166896640 | elapsed time per iteration (s): 0.15 | learning rate: 5.762E-05 | global batch size: 256 | lm loss: 3.765908E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.605 | TFLOPs: 26.14 | +7: iteration 42290/ 60336 | consumed samples: 10826240 | consumed tokens: 22172139520 | elapsed time per iteration (s): 0.15 | learning rate: 5.758E-05 | global batch size: 256 | lm loss: 3.773121E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.295 | TFLOPs: 26.12 | +7: iteration 42300/ 60336 | consumed samples: 10828800 | consumed tokens: 22177382400 | elapsed time per iteration (s): 0.15 | learning rate: 5.755E-05 | global batch size: 256 | lm loss: 3.775502E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.068 | TFLOPs: 26.18 | +7: iteration 42310/ 60336 | consumed samples: 10831360 | consumed tokens: 22182625280 | elapsed time per iteration (s): 0.15 | learning rate: 5.751E-05 | global batch size: 256 | lm loss: 3.764346E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.493 | TFLOPs: 26.20 | +7: iteration 42320/ 60336 | consumed samples: 10833920 | consumed tokens: 22187868160 | elapsed time per iteration (s): 0.15 | learning rate: 5.747E-05 | global batch size: 256 | lm loss: 3.774376E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.379 | TFLOPs: 26.20 | +7: iteration 42330/ 60336 | consumed samples: 10836480 | consumed tokens: 22193111040 | elapsed time per iteration (s): 0.15 | learning rate: 5.743E-05 | global batch size: 256 | lm loss: 3.772647E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.965 | TFLOPs: 26.19 | +7: iteration 42340/ 60336 | consumed samples: 10839040 | consumed tokens: 22198353920 | elapsed time per iteration (s): 0.15 | learning rate: 5.739E-05 | global batch size: 256 | lm loss: 3.783041E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.483 | TFLOPs: 26.21 | +7: iteration 42350/ 60336 | consumed samples: 10841600 | consumed tokens: 22203596800 | elapsed time per iteration (s): 0.15 | learning rate: 5.735E-05 | global batch size: 256 | lm loss: 3.767901E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.215 | TFLOPs: 26.19 | +7: iteration 42360/ 60336 | consumed samples: 10844160 | consumed tokens: 22208839680 | elapsed time per iteration (s): 0.15 | learning rate: 5.732E-05 | global batch size: 256 | lm loss: 3.775579E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.759 | TFLOPs: 26.20 | +7: iteration 42370/ 60336 | consumed samples: 10846720 | consumed tokens: 22214082560 | elapsed time per iteration (s): 0.15 | learning rate: 5.728E-05 | global batch size: 256 | lm loss: 3.773431E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.742 | TFLOPs: 26.22 | +7: iteration 42380/ 60336 | consumed samples: 10849280 | consumed tokens: 22219325440 | elapsed time per iteration (s): 0.15 | learning rate: 5.724E-05 | global batch size: 256 | lm loss: 3.776846E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.735 | TFLOPs: 26.19 | +7: iteration 42390/ 60336 | consumed samples: 10851840 | consumed tokens: 22224568320 | elapsed time per iteration (s): 0.15 | learning rate: 5.720E-05 | global batch size: 256 | lm loss: 3.771065E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.238 | TFLOPs: 26.18 | +7: iteration 42400/ 60336 | consumed samples: 10854400 | consumed tokens: 22229811200 | elapsed time per iteration (s): 0.15 | learning rate: 5.716E-05 | global batch size: 256 | lm loss: 3.778337E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.127 | TFLOPs: 26.19 | +7: iteration 42410/ 60336 | consumed samples: 10856960 | consumed tokens: 22235054080 | elapsed time per iteration (s): 0.15 | learning rate: 5.712E-05 | global batch size: 256 | lm loss: 3.777332E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.088 | TFLOPs: 26.21 | +7: iteration 42420/ 60336 | consumed samples: 10859520 | consumed tokens: 22240296960 | elapsed time per iteration (s): 0.15 | learning rate: 5.709E-05 | global batch size: 256 | lm loss: 3.776002E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.092 | TFLOPs: 26.16 | +7: iteration 42430/ 60336 | consumed samples: 10862080 | consumed tokens: 22245539840 | elapsed time per iteration (s): 0.15 | learning rate: 5.705E-05 | global batch size: 256 | lm loss: 3.769667E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.169 | TFLOPs: 26.19 | +7: iteration 42440/ 60336 | consumed samples: 10864640 | consumed tokens: 22250782720 | elapsed time per iteration (s): 0.15 | learning rate: 5.701E-05 | global batch size: 256 | lm loss: 3.787622E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.863 | TFLOPs: 26.19 | +7: iteration 42450/ 60336 | consumed samples: 10867200 | consumed tokens: 22256025600 | elapsed time per iteration (s): 0.15 | learning rate: 5.697E-05 | global batch size: 256 | lm loss: 3.786238E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.498 | TFLOPs: 26.20 | +7: iteration 42460/ 60336 | consumed samples: 10869760 | consumed tokens: 22261268480 | elapsed time per iteration (s): 0.17 | learning rate: 5.693E-05 | global batch size: 256 | lm loss: 3.769331E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.872 | TFLOPs: 24.23 | +7: iteration 42470/ 60336 | consumed samples: 10872320 | consumed tokens: 22266511360 | elapsed time per iteration (s): 0.15 | learning rate: 5.689E-05 | global batch size: 256 | lm loss: 3.782184E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.254 | TFLOPs: 26.15 | +7: iteration 42480/ 60336 | consumed samples: 10874880 | consumed tokens: 22271754240 | elapsed time per iteration (s): 0.15 | learning rate: 5.686E-05 | global batch size: 256 | lm loss: 3.781736E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.973 | TFLOPs: 26.11 | +7: iteration 42490/ 60336 | consumed samples: 10877440 | consumed tokens: 22276997120 | elapsed time per iteration (s): 0.15 | learning rate: 5.682E-05 | global batch size: 256 | lm loss: 3.768383E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.664 | TFLOPs: 26.12 | +7: iteration 42500/ 60336 | consumed samples: 10880000 | consumed tokens: 22282240000 | elapsed time per iteration (s): 0.15 | learning rate: 5.678E-05 | global batch size: 256 | lm loss: 3.758430E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.249 | TFLOPs: 26.15 | +7: iteration 42510/ 60336 | consumed samples: 10882560 | consumed tokens: 22287482880 | elapsed time per iteration (s): 0.15 | learning rate: 5.674E-05 | global batch size: 256 | lm loss: 3.762543E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.263 | TFLOPs: 26.15 | +7: iteration 42520/ 60336 | consumed samples: 10885120 | consumed tokens: 22292725760 | elapsed time per iteration (s): 0.15 | learning rate: 5.670E-05 | global batch size: 256 | lm loss: 3.777385E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.335 | TFLOPs: 26.13 | +7: iteration 42530/ 60336 | consumed samples: 10887680 | consumed tokens: 22297968640 | elapsed time per iteration (s): 0.15 | learning rate: 5.666E-05 | global batch size: 256 | lm loss: 3.786236E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.289 | TFLOPs: 26.13 | +7: iteration 42540/ 60336 | consumed samples: 10890240 | consumed tokens: 22303211520 | elapsed time per iteration (s): 0.15 | learning rate: 5.663E-05 | global batch size: 256 | lm loss: 3.773528E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.238 | TFLOPs: 26.13 | +7: iteration 42550/ 60336 | consumed samples: 10892800 | consumed tokens: 22308454400 | elapsed time per iteration (s): 0.15 | learning rate: 5.659E-05 | global batch size: 256 | lm loss: 3.780185E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.104 | TFLOPs: 26.11 | +7: iteration 42560/ 60336 | consumed samples: 10895360 | consumed tokens: 22313697280 | elapsed time per iteration (s): 0.15 | learning rate: 5.655E-05 | global batch size: 256 | lm loss: 3.773640E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.182 | TFLOPs: 26.15 | +7: iteration 42570/ 60336 | consumed samples: 10897920 | consumed tokens: 22318940160 | elapsed time per iteration (s): 0.15 | learning rate: 5.651E-05 | global batch size: 256 | lm loss: 3.769783E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.248 | TFLOPs: 26.16 | +7: iteration 42580/ 60336 | consumed samples: 10900480 | consumed tokens: 22324183040 | elapsed time per iteration (s): 0.15 | learning rate: 5.647E-05 | global batch size: 256 | lm loss: 3.757837E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.225 | TFLOPs: 26.15 | +7: iteration 42590/ 60336 | consumed samples: 10903040 | consumed tokens: 22329425920 | elapsed time per iteration (s): 0.15 | learning rate: 5.644E-05 | global batch size: 256 | lm loss: 3.770043E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.177 | TFLOPs: 26.15 | +7: iteration 42600/ 60336 | consumed samples: 10905600 | consumed tokens: 22334668800 | elapsed time per iteration (s): 0.15 | learning rate: 5.640E-05 | global batch size: 256 | lm loss: 3.770367E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.918 | TFLOPs: 26.14 | +7: iteration 42610/ 60336 | consumed samples: 10908160 | consumed tokens: 22339911680 | elapsed time per iteration (s): 0.15 | learning rate: 5.636E-05 | global batch size: 256 | lm loss: 3.772013E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.325 | TFLOPs: 26.15 | +7: iteration 42620/ 60336 | consumed samples: 10910720 | consumed tokens: 22345154560 | elapsed time per iteration (s): 0.15 | learning rate: 5.632E-05 | global batch size: 256 | lm loss: 3.781875E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.721 | TFLOPs: 26.14 | +7: iteration 42630/ 60336 | consumed samples: 10913280 | consumed tokens: 22350397440 | elapsed time per iteration (s): 0.15 | learning rate: 5.628E-05 | global batch size: 256 | lm loss: 3.786144E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.224 | TFLOPs: 26.16 | +7: iteration 42640/ 60336 | consumed samples: 10915840 | consumed tokens: 22355640320 | elapsed time per iteration (s): 0.15 | learning rate: 5.625E-05 | global batch size: 256 | lm loss: 3.782550E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.149 | TFLOPs: 26.16 | +7: iteration 42650/ 60336 | consumed samples: 10918400 | consumed tokens: 22360883200 | elapsed time per iteration (s): 0.15 | learning rate: 5.621E-05 | global batch size: 256 | lm loss: 3.763868E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.505 | TFLOPs: 26.13 | +7: iteration 42660/ 60336 | consumed samples: 10920960 | consumed tokens: 22366126080 | elapsed time per iteration (s): 0.15 | learning rate: 5.617E-05 | global batch size: 256 | lm loss: 3.772568E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.766 | TFLOPs: 26.19 | +7: iteration 42670/ 60336 | consumed samples: 10923520 | consumed tokens: 22371368960 | elapsed time per iteration (s): 0.15 | learning rate: 5.613E-05 | global batch size: 256 | lm loss: 3.761122E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.880 | TFLOPs: 26.17 | +7: iteration 42680/ 60336 | consumed samples: 10926080 | consumed tokens: 22376611840 | elapsed time per iteration (s): 0.15 | learning rate: 5.609E-05 | global batch size: 256 | lm loss: 3.771066E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.858 | TFLOPs: 26.17 | +7: iteration 42690/ 60336 | consumed samples: 10928640 | consumed tokens: 22381854720 | elapsed time per iteration (s): 0.15 | learning rate: 5.606E-05 | global batch size: 256 | lm loss: 3.768510E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.295 | TFLOPs: 26.16 | +7: iteration 42700/ 60336 | consumed samples: 10931200 | consumed tokens: 22387097600 | elapsed time per iteration (s): 0.15 | learning rate: 5.602E-05 | global batch size: 256 | lm loss: 3.756355E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.745 | TFLOPs: 26.12 | +7: iteration 42710/ 60336 | consumed samples: 10933760 | consumed tokens: 22392340480 | elapsed time per iteration (s): 0.15 | learning rate: 5.598E-05 | global batch size: 256 | lm loss: 3.768256E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.534 | TFLOPs: 26.12 | +7: iteration 42720/ 60336 | consumed samples: 10936320 | consumed tokens: 22397583360 | elapsed time per iteration (s): 0.15 | learning rate: 5.594E-05 | global batch size: 256 | lm loss: 3.778714E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.370 | TFLOPs: 26.13 | +7: iteration 42730/ 60336 | consumed samples: 10938880 | consumed tokens: 22402826240 | elapsed time per iteration (s): 0.15 | learning rate: 5.591E-05 | global batch size: 256 | lm loss: 3.768372E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.268 | TFLOPs: 26.08 | +7: iteration 42740/ 60336 | consumed samples: 10941440 | consumed tokens: 22408069120 | elapsed time per iteration (s): 0.15 | learning rate: 5.587E-05 | global batch size: 256 | lm loss: 3.764555E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.507 | TFLOPs: 26.12 | +7: iteration 42750/ 60336 | consumed samples: 10944000 | consumed tokens: 22413312000 | elapsed time per iteration (s): 0.15 | learning rate: 5.583E-05 | global batch size: 256 | lm loss: 3.780922E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.445 | TFLOPs: 26.10 | +7: iteration 42760/ 60336 | consumed samples: 10946560 | consumed tokens: 22418554880 | elapsed time per iteration (s): 0.15 | learning rate: 5.579E-05 | global batch size: 256 | lm loss: 3.775237E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.835 | TFLOPs: 26.12 | +7: iteration 42770/ 60336 | consumed samples: 10949120 | consumed tokens: 22423797760 | elapsed time per iteration (s): 0.15 | learning rate: 5.575E-05 | global batch size: 256 | lm loss: 3.772319E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.742 | TFLOPs: 26.11 | +7: iteration 42780/ 60336 | consumed samples: 10951680 | consumed tokens: 22429040640 | elapsed time per iteration (s): 0.15 | learning rate: 5.572E-05 | global batch size: 256 | lm loss: 3.768671E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.025 | TFLOPs: 26.13 | +7: iteration 42790/ 60336 | consumed samples: 10954240 | consumed tokens: 22434283520 | elapsed time per iteration (s): 0.15 | learning rate: 5.568E-05 | global batch size: 256 | lm loss: 3.773178E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.829 | TFLOPs: 26.09 | +7: iteration 42800/ 60336 | consumed samples: 10956800 | consumed tokens: 22439526400 | elapsed time per iteration (s): 0.15 | learning rate: 5.564E-05 | global batch size: 256 | lm loss: 3.782653E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.249 | TFLOPs: 26.12 | +7: iteration 42810/ 60336 | consumed samples: 10959360 | consumed tokens: 22444769280 | elapsed time per iteration (s): 0.15 | learning rate: 5.560E-05 | global batch size: 256 | lm loss: 3.781197E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.563 | TFLOPs: 26.06 | +7: iteration 42820/ 60336 | consumed samples: 10961920 | consumed tokens: 22450012160 | elapsed time per iteration (s): 0.15 | learning rate: 5.557E-05 | global batch size: 256 | lm loss: 3.768692E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.141 | TFLOPs: 26.08 | +7: iteration 42830/ 60336 | consumed samples: 10964480 | consumed tokens: 22455255040 | elapsed time per iteration (s): 0.15 | learning rate: 5.553E-05 | global batch size: 256 | lm loss: 3.767165E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.184 | TFLOPs: 26.08 | +7: iteration 42840/ 60336 | consumed samples: 10967040 | consumed tokens: 22460497920 | elapsed time per iteration (s): 0.15 | learning rate: 5.549E-05 | global batch size: 256 | lm loss: 3.772957E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.529 | TFLOPs: 26.09 | +7: iteration 42850/ 60336 | consumed samples: 10969600 | consumed tokens: 22465740800 | elapsed time per iteration (s): 0.15 | learning rate: 5.545E-05 | global batch size: 256 | lm loss: 3.776363E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.640 | TFLOPs: 26.07 | +7: iteration 42860/ 60336 | consumed samples: 10972160 | consumed tokens: 22470983680 | elapsed time per iteration (s): 0.15 | learning rate: 5.541E-05 | global batch size: 256 | lm loss: 3.776937E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.518 | TFLOPs: 26.12 | +7: iteration 42870/ 60336 | consumed samples: 10974720 | consumed tokens: 22476226560 | elapsed time per iteration (s): 0.15 | learning rate: 5.538E-05 | global batch size: 256 | lm loss: 3.763353E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.788 | TFLOPs: 26.08 | +7: iteration 42880/ 60336 | consumed samples: 10977280 | consumed tokens: 22481469440 | elapsed time per iteration (s): 0.15 | learning rate: 5.534E-05 | global batch size: 256 | lm loss: 3.785409E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.552 | TFLOPs: 26.09 | +7: iteration 42890/ 60336 | consumed samples: 10979840 | consumed tokens: 22486712320 | elapsed time per iteration (s): 0.15 | learning rate: 5.530E-05 | global batch size: 256 | lm loss: 3.773617E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.300 | TFLOPs: 26.12 | +7: iteration 42900/ 60336 | consumed samples: 10982400 | consumed tokens: 22491955200 | elapsed time per iteration (s): 0.15 | learning rate: 5.526E-05 | global batch size: 256 | lm loss: 3.767416E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.656 | TFLOPs: 26.09 | +7: iteration 42910/ 60336 | consumed samples: 10984960 | consumed tokens: 22497198080 | elapsed time per iteration (s): 0.15 | learning rate: 5.523E-05 | global batch size: 256 | lm loss: 3.777318E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.896 | TFLOPs: 26.09 | +7: iteration 42920/ 60336 | consumed samples: 10987520 | consumed tokens: 22502440960 | elapsed time per iteration (s): 0.15 | learning rate: 5.519E-05 | global batch size: 256 | lm loss: 3.756049E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.616 | TFLOPs: 26.09 | +7: iteration 42930/ 60336 | consumed samples: 10990080 | consumed tokens: 22507683840 | elapsed time per iteration (s): 0.15 | learning rate: 5.515E-05 | global batch size: 256 | lm loss: 3.769896E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.834 | TFLOPs: 26.08 | +7: iteration 42940/ 60336 | consumed samples: 10992640 | consumed tokens: 22512926720 | elapsed time per iteration (s): 0.15 | learning rate: 5.511E-05 | global batch size: 256 | lm loss: 3.764833E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.272 | TFLOPs: 26.10 | +7: iteration 42950/ 60336 | consumed samples: 10995200 | consumed tokens: 22518169600 | elapsed time per iteration (s): 0.15 | learning rate: 5.508E-05 | global batch size: 256 | lm loss: 3.774351E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.255 | TFLOPs: 26.07 | +7: iteration 42960/ 60336 | consumed samples: 10997760 | consumed tokens: 22523412480 | elapsed time per iteration (s): 0.15 | learning rate: 5.504E-05 | global batch size: 256 | lm loss: 3.767342E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.548 | TFLOPs: 26.07 | +7: iteration 42970/ 60336 | consumed samples: 11000320 | consumed tokens: 22528655360 | elapsed time per iteration (s): 0.15 | learning rate: 5.500E-05 | global batch size: 256 | lm loss: 3.766849E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.197 | TFLOPs: 26.07 | +7: iteration 42980/ 60336 | consumed samples: 11002880 | consumed tokens: 22533898240 | elapsed time per iteration (s): 0.15 | learning rate: 5.496E-05 | global batch size: 256 | lm loss: 3.771286E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.667 | TFLOPs: 26.09 | +7: iteration 42990/ 60336 | consumed samples: 11005440 | consumed tokens: 22539141120 | elapsed time per iteration (s): 0.15 | learning rate: 5.493E-05 | global batch size: 256 | lm loss: 3.770984E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.107 | TFLOPs: 26.08 | +7: iteration 43000/ 60336 | consumed samples: 11008000 | consumed tokens: 22544384000 | elapsed time per iteration (s): 0.15 | learning rate: 5.489E-05 | global batch size: 256 | lm loss: 3.759673E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.606 | TFLOPs: 26.07 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 43000 | lm loss value: 3.862833E+00 | lm loss PPL: 4.760001E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 43000 to checkpoints_44m32b100m +0: [2023-03-17 02:10:10,851] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step43000 is begin to save! +0: [2023-03-17 02:10:10,855] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:10:10,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:10:10,915] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:10:10,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:10:10,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:10:10,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:10:10,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:10:10,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:10:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:10:10,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:10:10,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:10:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:10:10,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:10:10,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:10:10,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:10:10,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:10:10,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:10:10,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:10:10,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:10:10,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:10:10,984] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step43000/mp_rank_00_model_states.pt +0: [2023-03-17 02:10:10,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:10:10,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:10:11,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:10:11,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:10:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:10:11,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:10:11,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:10:11,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:10:11,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:10:11,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:10:11,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: successfully saved checkpoint at iteration 43000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.06 +7: iteration 43010/ 60336 | consumed samples: 11010560 | consumed tokens: 22549626880 | elapsed time per iteration (s): 0.18 | learning rate: 5.485E-05 | global batch size: 256 | lm loss: 3.779686E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.862 | TFLOPs: 22.77 | +7: iteration 43020/ 60336 | consumed samples: 11013120 | consumed tokens: 22554869760 | elapsed time per iteration (s): 0.15 | learning rate: 5.481E-05 | global batch size: 256 | lm loss: 3.777326E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.039 | TFLOPs: 26.10 | +7: iteration 43030/ 60336 | consumed samples: 11015680 | consumed tokens: 22560112640 | elapsed time per iteration (s): 0.15 | learning rate: 5.478E-05 | global batch size: 256 | lm loss: 3.774936E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.802 | TFLOPs: 26.09 | +7: iteration 43040/ 60336 | consumed samples: 11018240 | consumed tokens: 22565355520 | elapsed time per iteration (s): 0.15 | learning rate: 5.474E-05 | global batch size: 256 | lm loss: 3.769342E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.289 | TFLOPs: 26.05 | +7: iteration 43050/ 60336 | consumed samples: 11020800 | consumed tokens: 22570598400 | elapsed time per iteration (s): 0.15 | learning rate: 5.470E-05 | global batch size: 256 | lm loss: 3.759221E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.496 | TFLOPs: 26.09 | +7: iteration 43060/ 60336 | consumed samples: 11023360 | consumed tokens: 22575841280 | elapsed time per iteration (s): 0.15 | learning rate: 5.466E-05 | global batch size: 256 | lm loss: 3.764032E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.068 | TFLOPs: 26.08 | +7: iteration 43070/ 60336 | consumed samples: 11025920 | consumed tokens: 22581084160 | elapsed time per iteration (s): 0.15 | learning rate: 5.463E-05 | global batch size: 256 | lm loss: 3.775895E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.553 | TFLOPs: 26.09 | +7: iteration 43080/ 60336 | consumed samples: 11028480 | consumed tokens: 22586327040 | elapsed time per iteration (s): 0.15 | learning rate: 5.459E-05 | global batch size: 256 | lm loss: 3.766209E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.778 | TFLOPs: 26.08 | +7: iteration 43090/ 60336 | consumed samples: 11031040 | consumed tokens: 22591569920 | elapsed time per iteration (s): 0.15 | learning rate: 5.455E-05 | global batch size: 256 | lm loss: 3.776060E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.722 | TFLOPs: 26.08 | +7: iteration 43100/ 60336 | consumed samples: 11033600 | consumed tokens: 22596812800 | elapsed time per iteration (s): 0.15 | learning rate: 5.452E-05 | global batch size: 256 | lm loss: 3.773458E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.566 | TFLOPs: 26.10 | +7: iteration 43110/ 60336 | consumed samples: 11036160 | consumed tokens: 22602055680 | elapsed time per iteration (s): 0.15 | learning rate: 5.448E-05 | global batch size: 256 | lm loss: 3.773589E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.939 | TFLOPs: 26.06 | +7: iteration 43120/ 60336 | consumed samples: 11038720 | consumed tokens: 22607298560 | elapsed time per iteration (s): 0.15 | learning rate: 5.444E-05 | global batch size: 256 | lm loss: 3.786176E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.965 | TFLOPs: 26.11 | +7: iteration 43130/ 60336 | consumed samples: 11041280 | consumed tokens: 22612541440 | elapsed time per iteration (s): 0.15 | learning rate: 5.440E-05 | global batch size: 256 | lm loss: 3.763510E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.471 | TFLOPs: 26.06 | +7: iteration 43140/ 60336 | consumed samples: 11043840 | consumed tokens: 22617784320 | elapsed time per iteration (s): 0.15 | learning rate: 5.437E-05 | global batch size: 256 | lm loss: 3.776198E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.719 | TFLOPs: 26.09 | +7: iteration 43150/ 60336 | consumed samples: 11046400 | consumed tokens: 22623027200 | elapsed time per iteration (s): 0.15 | learning rate: 5.433E-05 | global batch size: 256 | lm loss: 3.770936E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.880 | TFLOPs: 26.09 | +7: iteration 43160/ 60336 | consumed samples: 11048960 | consumed tokens: 22628270080 | elapsed time per iteration (s): 0.15 | learning rate: 5.429E-05 | global batch size: 256 | lm loss: 3.770799E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.292 | TFLOPs: 26.08 | +7: iteration 43170/ 60336 | consumed samples: 11051520 | consumed tokens: 22633512960 | elapsed time per iteration (s): 0.15 | learning rate: 5.426E-05 | global batch size: 256 | lm loss: 3.775320E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.600 | TFLOPs: 26.09 | +7: iteration 43180/ 60336 | consumed samples: 11054080 | consumed tokens: 22638755840 | elapsed time per iteration (s): 0.15 | learning rate: 5.422E-05 | global batch size: 256 | lm loss: 3.791015E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.297 | TFLOPs: 26.08 | +7: iteration 43190/ 60336 | consumed samples: 11056640 | consumed tokens: 22643998720 | elapsed time per iteration (s): 0.15 | learning rate: 5.418E-05 | global batch size: 256 | lm loss: 3.775293E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.650 | TFLOPs: 26.11 | +7: iteration 43200/ 60336 | consumed samples: 11059200 | consumed tokens: 22649241600 | elapsed time per iteration (s): 0.15 | learning rate: 5.414E-05 | global batch size: 256 | lm loss: 3.778808E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.901 | TFLOPs: 26.08 | +7: iteration 43210/ 60336 | consumed samples: 11061760 | consumed tokens: 22654484480 | elapsed time per iteration (s): 0.15 | learning rate: 5.411E-05 | global batch size: 256 | lm loss: 3.775381E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.132 | TFLOPs: 26.10 | +7: iteration 43220/ 60336 | consumed samples: 11064320 | consumed tokens: 22659727360 | elapsed time per iteration (s): 0.15 | learning rate: 5.407E-05 | global batch size: 256 | lm loss: 3.763194E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.523 | TFLOPs: 26.09 | +7: iteration 43230/ 60336 | consumed samples: 11066880 | consumed tokens: 22664970240 | elapsed time per iteration (s): 0.15 | learning rate: 5.403E-05 | global batch size: 256 | lm loss: 3.770057E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.617 | TFLOPs: 26.09 | +7: iteration 43240/ 60336 | consumed samples: 11069440 | consumed tokens: 22670213120 | elapsed time per iteration (s): 0.15 | learning rate: 5.400E-05 | global batch size: 256 | lm loss: 3.784194E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.006 | TFLOPs: 26.10 | +7: iteration 43250/ 60336 | consumed samples: 11072000 | consumed tokens: 22675456000 | elapsed time per iteration (s): 0.15 | learning rate: 5.396E-05 | global batch size: 256 | lm loss: 3.766577E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.926 | TFLOPs: 26.09 | +7: iteration 43260/ 60336 | consumed samples: 11074560 | consumed tokens: 22680698880 | elapsed time per iteration (s): 0.15 | learning rate: 5.392E-05 | global batch size: 256 | lm loss: 3.768021E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.201 | TFLOPs: 26.07 | +7: iteration 43270/ 60336 | consumed samples: 11077120 | consumed tokens: 22685941760 | elapsed time per iteration (s): 0.15 | learning rate: 5.388E-05 | global batch size: 256 | lm loss: 3.772715E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.710 | TFLOPs: 26.08 | +7: iteration 43280/ 60336 | consumed samples: 11079680 | consumed tokens: 22691184640 | elapsed time per iteration (s): 0.15 | learning rate: 5.385E-05 | global batch size: 256 | lm loss: 3.786555E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.721 | TFLOPs: 26.03 | +7: iteration 43290/ 60336 | consumed samples: 11082240 | consumed tokens: 22696427520 | elapsed time per iteration (s): 0.15 | learning rate: 5.381E-05 | global batch size: 256 | lm loss: 3.762169E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.688 | TFLOPs: 26.08 | +7: iteration 43300/ 60336 | consumed samples: 11084800 | consumed tokens: 22701670400 | elapsed time per iteration (s): 0.15 | learning rate: 5.377E-05 | global batch size: 256 | lm loss: 3.785812E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.235 | TFLOPs: 26.07 | +7: iteration 43310/ 60336 | consumed samples: 11087360 | consumed tokens: 22706913280 | elapsed time per iteration (s): 0.15 | learning rate: 5.374E-05 | global batch size: 256 | lm loss: 3.781552E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.433 | TFLOPs: 26.09 | +7: iteration 43320/ 60336 | consumed samples: 11089920 | consumed tokens: 22712156160 | elapsed time per iteration (s): 0.15 | learning rate: 5.370E-05 | global batch size: 256 | lm loss: 3.764219E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.738 | TFLOPs: 26.08 | +7: iteration 43330/ 60336 | consumed samples: 11092480 | consumed tokens: 22717399040 | elapsed time per iteration (s): 0.15 | learning rate: 5.366E-05 | global batch size: 256 | lm loss: 3.771590E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.714 | TFLOPs: 26.08 | +7: iteration 43340/ 60336 | consumed samples: 11095040 | consumed tokens: 22722641920 | elapsed time per iteration (s): 0.15 | learning rate: 5.363E-05 | global batch size: 256 | lm loss: 3.771708E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.949 | TFLOPs: 26.06 | +7: iteration 43350/ 60336 | consumed samples: 11097600 | consumed tokens: 22727884800 | elapsed time per iteration (s): 0.15 | learning rate: 5.359E-05 | global batch size: 256 | lm loss: 3.774102E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.893 | TFLOPs: 26.06 | +7: iteration 43360/ 60336 | consumed samples: 11100160 | consumed tokens: 22733127680 | elapsed time per iteration (s): 0.15 | learning rate: 5.355E-05 | global batch size: 256 | lm loss: 3.770988E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.788 | TFLOPs: 26.06 | +7: iteration 43370/ 60336 | consumed samples: 11102720 | consumed tokens: 22738370560 | elapsed time per iteration (s): 0.15 | learning rate: 5.352E-05 | global batch size: 256 | lm loss: 3.762202E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.126 | TFLOPs: 26.08 | +7: iteration 43380/ 60336 | consumed samples: 11105280 | consumed tokens: 22743613440 | elapsed time per iteration (s): 0.15 | learning rate: 5.348E-05 | global batch size: 256 | lm loss: 3.777845E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.658 | TFLOPs: 26.11 | +7: iteration 43390/ 60336 | consumed samples: 11107840 | consumed tokens: 22748856320 | elapsed time per iteration (s): 0.15 | learning rate: 5.344E-05 | global batch size: 256 | lm loss: 3.765590E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.656 | TFLOPs: 26.11 | +7: iteration 43400/ 60336 | consumed samples: 11110400 | consumed tokens: 22754099200 | elapsed time per iteration (s): 0.15 | learning rate: 5.340E-05 | global batch size: 256 | lm loss: 3.768655E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.213 | TFLOPs: 26.11 | +7: iteration 43410/ 60336 | consumed samples: 11112960 | consumed tokens: 22759342080 | elapsed time per iteration (s): 0.15 | learning rate: 5.337E-05 | global batch size: 256 | lm loss: 3.779871E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.421 | TFLOPs: 26.10 | +7: iteration 43420/ 60336 | consumed samples: 11115520 | consumed tokens: 22764584960 | elapsed time per iteration (s): 0.15 | learning rate: 5.333E-05 | global batch size: 256 | lm loss: 3.779481E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.633 | TFLOPs: 26.06 | +7: iteration 43430/ 60336 | consumed samples: 11118080 | consumed tokens: 22769827840 | elapsed time per iteration (s): 0.15 | learning rate: 5.329E-05 | global batch size: 256 | lm loss: 3.763964E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.557 | TFLOPs: 26.09 | +7: iteration 43440/ 60336 | consumed samples: 11120640 | consumed tokens: 22775070720 | elapsed time per iteration (s): 0.15 | learning rate: 5.326E-05 | global batch size: 256 | lm loss: 3.769639E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.146 | TFLOPs: 26.10 | +7: iteration 43450/ 60336 | consumed samples: 11123200 | consumed tokens: 22780313600 | elapsed time per iteration (s): 0.15 | learning rate: 5.322E-05 | global batch size: 256 | lm loss: 3.784118E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.073 | TFLOPs: 26.11 | +7: iteration 43460/ 60336 | consumed samples: 11125760 | consumed tokens: 22785556480 | elapsed time per iteration (s): 0.15 | learning rate: 5.318E-05 | global batch size: 256 | lm loss: 3.774354E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.457 | TFLOPs: 26.10 | +7: iteration 43470/ 60336 | consumed samples: 11128320 | consumed tokens: 22790799360 | elapsed time per iteration (s): 0.15 | learning rate: 5.315E-05 | global batch size: 256 | lm loss: 3.763541E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.984 | TFLOPs: 26.10 | +7: iteration 43480/ 60336 | consumed samples: 11130880 | consumed tokens: 22796042240 | elapsed time per iteration (s): 0.16 | learning rate: 5.311E-05 | global batch size: 256 | lm loss: 3.769543E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.657 | TFLOPs: 25.53 | +7: iteration 43490/ 60336 | consumed samples: 11133440 | consumed tokens: 22801285120 | elapsed time per iteration (s): 0.15 | learning rate: 5.307E-05 | global batch size: 256 | lm loss: 3.776208E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.635 | TFLOPs: 26.11 | +7: iteration 43500/ 60336 | consumed samples: 11136000 | consumed tokens: 22806528000 | elapsed time per iteration (s): 0.15 | learning rate: 5.304E-05 | global batch size: 256 | lm loss: 3.759312E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.332 | TFLOPs: 26.09 | +7: iteration 43510/ 60336 | consumed samples: 11138560 | consumed tokens: 22811770880 | elapsed time per iteration (s): 0.15 | learning rate: 5.300E-05 | global batch size: 256 | lm loss: 3.749612E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.229 | TFLOPs: 26.10 | +7: iteration 43520/ 60336 | consumed samples: 11141120 | consumed tokens: 22817013760 | elapsed time per iteration (s): 0.15 | learning rate: 5.296E-05 | global batch size: 256 | lm loss: 3.775089E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.584 | TFLOPs: 26.12 | +7: iteration 43530/ 60336 | consumed samples: 11143680 | consumed tokens: 22822256640 | elapsed time per iteration (s): 0.15 | learning rate: 5.293E-05 | global batch size: 256 | lm loss: 3.776778E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.811 | TFLOPs: 26.09 | +7: iteration 43540/ 60336 | consumed samples: 11146240 | consumed tokens: 22827499520 | elapsed time per iteration (s): 0.15 | learning rate: 5.289E-05 | global batch size: 256 | lm loss: 3.769756E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.994 | TFLOPs: 26.10 | +7: iteration 43550/ 60336 | consumed samples: 11148800 | consumed tokens: 22832742400 | elapsed time per iteration (s): 0.15 | learning rate: 5.285E-05 | global batch size: 256 | lm loss: 3.787027E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.701 | TFLOPs: 26.09 | +7: iteration 43560/ 60336 | consumed samples: 11151360 | consumed tokens: 22837985280 | elapsed time per iteration (s): 0.15 | learning rate: 5.282E-05 | global batch size: 256 | lm loss: 3.762456E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.227 | TFLOPs: 26.08 | +7: iteration 43570/ 60336 | consumed samples: 11153920 | consumed tokens: 22843228160 | elapsed time per iteration (s): 0.15 | learning rate: 5.278E-05 | global batch size: 256 | lm loss: 3.761011E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.643 | TFLOPs: 26.11 | +7: iteration 43580/ 60336 | consumed samples: 11156480 | consumed tokens: 22848471040 | elapsed time per iteration (s): 0.15 | learning rate: 5.274E-05 | global batch size: 256 | lm loss: 3.764571E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.648 | TFLOPs: 26.09 | +7: iteration 43590/ 60336 | consumed samples: 11159040 | consumed tokens: 22853713920 | elapsed time per iteration (s): 0.15 | learning rate: 5.271E-05 | global batch size: 256 | lm loss: 3.771344E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.113 | TFLOPs: 26.10 | +7: iteration 43600/ 60336 | consumed samples: 11161600 | consumed tokens: 22858956800 | elapsed time per iteration (s): 0.15 | learning rate: 5.267E-05 | global batch size: 256 | lm loss: 3.774626E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.528 | TFLOPs: 26.10 | +7: iteration 43610/ 60336 | consumed samples: 11164160 | consumed tokens: 22864199680 | elapsed time per iteration (s): 0.15 | learning rate: 5.264E-05 | global batch size: 256 | lm loss: 3.769077E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.654 | TFLOPs: 26.11 | +7: iteration 43620/ 60336 | consumed samples: 11166720 | consumed tokens: 22869442560 | elapsed time per iteration (s): 0.15 | learning rate: 5.260E-05 | global batch size: 256 | lm loss: 3.778520E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.554 | TFLOPs: 26.09 | +7: iteration 43630/ 60336 | consumed samples: 11169280 | consumed tokens: 22874685440 | elapsed time per iteration (s): 0.15 | learning rate: 5.256E-05 | global batch size: 256 | lm loss: 3.771282E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.766 | TFLOPs: 26.11 | +7: iteration 43640/ 60336 | consumed samples: 11171840 | consumed tokens: 22879928320 | elapsed time per iteration (s): 0.15 | learning rate: 5.253E-05 | global batch size: 256 | lm loss: 3.774940E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.300 | TFLOPs: 26.07 | +7: iteration 43650/ 60336 | consumed samples: 11174400 | consumed tokens: 22885171200 | elapsed time per iteration (s): 0.15 | learning rate: 5.249E-05 | global batch size: 256 | lm loss: 3.777227E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.702 | TFLOPs: 26.09 | +7: iteration 43660/ 60336 | consumed samples: 11176960 | consumed tokens: 22890414080 | elapsed time per iteration (s): 0.15 | learning rate: 5.245E-05 | global batch size: 256 | lm loss: 3.777121E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.343 | TFLOPs: 26.09 | +7: iteration 43670/ 60336 | consumed samples: 11179520 | consumed tokens: 22895656960 | elapsed time per iteration (s): 0.15 | learning rate: 5.242E-05 | global batch size: 256 | lm loss: 3.774826E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.019 | TFLOPs: 26.10 | +7: iteration 43680/ 60336 | consumed samples: 11182080 | consumed tokens: 22900899840 | elapsed time per iteration (s): 0.15 | learning rate: 5.238E-05 | global batch size: 256 | lm loss: 3.772474E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.001 | TFLOPs: 26.08 | +7: iteration 43690/ 60336 | consumed samples: 11184640 | consumed tokens: 22906142720 | elapsed time per iteration (s): 0.15 | learning rate: 5.234E-05 | global batch size: 256 | lm loss: 3.780745E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.242 | TFLOPs: 26.08 | +7: iteration 43700/ 60336 | consumed samples: 11187200 | consumed tokens: 22911385600 | elapsed time per iteration (s): 0.15 | learning rate: 5.231E-05 | global batch size: 256 | lm loss: 3.763505E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.373 | TFLOPs: 26.09 | +7: iteration 43710/ 60336 | consumed samples: 11189760 | consumed tokens: 22916628480 | elapsed time per iteration (s): 0.15 | learning rate: 5.227E-05 | global batch size: 256 | lm loss: 3.769680E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.900 | TFLOPs: 26.08 | +7: iteration 43720/ 60336 | consumed samples: 11192320 | consumed tokens: 22921871360 | elapsed time per iteration (s): 0.15 | learning rate: 5.223E-05 | global batch size: 256 | lm loss: 3.777477E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.696 | TFLOPs: 26.11 | +7: iteration 43730/ 60336 | consumed samples: 11194880 | consumed tokens: 22927114240 | elapsed time per iteration (s): 0.15 | learning rate: 5.220E-05 | global batch size: 256 | lm loss: 3.766081E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.294 | TFLOPs: 26.07 | +7: iteration 43740/ 60336 | consumed samples: 11197440 | consumed tokens: 22932357120 | elapsed time per iteration (s): 0.15 | learning rate: 5.216E-05 | global batch size: 256 | lm loss: 3.764398E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.255 | TFLOPs: 26.07 | +7: iteration 43750/ 60336 | consumed samples: 11200000 | consumed tokens: 22937600000 | elapsed time per iteration (s): 0.15 | learning rate: 5.213E-05 | global batch size: 256 | lm loss: 3.766898E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.950 | TFLOPs: 26.08 | +7: iteration 43760/ 60336 | consumed samples: 11202560 | consumed tokens: 22942842880 | elapsed time per iteration (s): 0.15 | learning rate: 5.209E-05 | global batch size: 256 | lm loss: 3.769088E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.136 | TFLOPs: 26.07 | +7: iteration 43770/ 60336 | consumed samples: 11205120 | consumed tokens: 22948085760 | elapsed time per iteration (s): 0.15 | learning rate: 5.205E-05 | global batch size: 256 | lm loss: 3.752266E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.747 | TFLOPs: 26.11 | +7: iteration 43780/ 60336 | consumed samples: 11207680 | consumed tokens: 22953328640 | elapsed time per iteration (s): 0.15 | learning rate: 5.202E-05 | global batch size: 256 | lm loss: 3.762109E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.882 | TFLOPs: 26.08 | +7: iteration 43790/ 60336 | consumed samples: 11210240 | consumed tokens: 22958571520 | elapsed time per iteration (s): 0.15 | learning rate: 5.198E-05 | global batch size: 256 | lm loss: 3.778387E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.700 | TFLOPs: 26.09 | +7: iteration 43800/ 60336 | consumed samples: 11212800 | consumed tokens: 22963814400 | elapsed time per iteration (s): 0.15 | learning rate: 5.195E-05 | global batch size: 256 | lm loss: 3.777463E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.148 | TFLOPs: 26.10 | +7: iteration 43810/ 60336 | consumed samples: 11215360 | consumed tokens: 22969057280 | elapsed time per iteration (s): 0.15 | learning rate: 5.191E-05 | global batch size: 256 | lm loss: 3.771223E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.649 | TFLOPs: 26.09 | +7: iteration 43820/ 60336 | consumed samples: 11217920 | consumed tokens: 22974300160 | elapsed time per iteration (s): 0.15 | learning rate: 5.187E-05 | global batch size: 256 | lm loss: 3.769933E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.541 | TFLOPs: 26.09 | +7: iteration 43830/ 60336 | consumed samples: 11220480 | consumed tokens: 22979543040 | elapsed time per iteration (s): 0.15 | learning rate: 5.184E-05 | global batch size: 256 | lm loss: 3.757799E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.204 | TFLOPs: 26.10 | +7: iteration 43840/ 60336 | consumed samples: 11223040 | consumed tokens: 22984785920 | elapsed time per iteration (s): 0.15 | learning rate: 5.180E-05 | global batch size: 256 | lm loss: 3.777895E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.463 | TFLOPs: 26.07 | +7: iteration 43850/ 60336 | consumed samples: 11225600 | consumed tokens: 22990028800 | elapsed time per iteration (s): 0.15 | learning rate: 5.176E-05 | global batch size: 256 | lm loss: 3.757266E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.836 | TFLOPs: 26.09 | +7: iteration 43860/ 60336 | consumed samples: 11228160 | consumed tokens: 22995271680 | elapsed time per iteration (s): 0.15 | learning rate: 5.173E-05 | global batch size: 256 | lm loss: 3.760846E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.037 | TFLOPs: 26.11 | +7: iteration 43870/ 60336 | consumed samples: 11230720 | consumed tokens: 23000514560 | elapsed time per iteration (s): 0.15 | learning rate: 5.169E-05 | global batch size: 256 | lm loss: 3.774259E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.771 | TFLOPs: 26.11 | +7: iteration 43880/ 60336 | consumed samples: 11233280 | consumed tokens: 23005757440 | elapsed time per iteration (s): 0.15 | learning rate: 5.166E-05 | global batch size: 256 | lm loss: 3.768476E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.411 | TFLOPs: 26.10 | +7: iteration 43890/ 60336 | consumed samples: 11235840 | consumed tokens: 23011000320 | elapsed time per iteration (s): 0.15 | learning rate: 5.162E-05 | global batch size: 256 | lm loss: 3.771434E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.637 | TFLOPs: 26.09 | +7: iteration 43900/ 60336 | consumed samples: 11238400 | consumed tokens: 23016243200 | elapsed time per iteration (s): 0.15 | learning rate: 5.158E-05 | global batch size: 256 | lm loss: 3.770486E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.239 | TFLOPs: 26.07 | +7: iteration 43910/ 60336 | consumed samples: 11240960 | consumed tokens: 23021486080 | elapsed time per iteration (s): 0.15 | learning rate: 5.155E-05 | global batch size: 256 | lm loss: 3.773399E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.489 | TFLOPs: 26.10 | +7: iteration 43920/ 60336 | consumed samples: 11243520 | consumed tokens: 23026728960 | elapsed time per iteration (s): 0.15 | learning rate: 5.151E-05 | global batch size: 256 | lm loss: 3.773788E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.613 | TFLOPs: 26.09 | +7: iteration 43930/ 60336 | consumed samples: 11246080 | consumed tokens: 23031971840 | elapsed time per iteration (s): 0.15 | learning rate: 5.148E-05 | global batch size: 256 | lm loss: 3.769141E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.424 | TFLOPs: 26.09 | +7: iteration 43940/ 60336 | consumed samples: 11248640 | consumed tokens: 23037214720 | elapsed time per iteration (s): 0.15 | learning rate: 5.144E-05 | global batch size: 256 | lm loss: 3.766674E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.508 | TFLOPs: 26.10 | +7: iteration 43950/ 60336 | consumed samples: 11251200 | consumed tokens: 23042457600 | elapsed time per iteration (s): 0.15 | learning rate: 5.140E-05 | global batch size: 256 | lm loss: 3.773065E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.508 | TFLOPs: 26.07 | +7: iteration 43960/ 60336 | consumed samples: 11253760 | consumed tokens: 23047700480 | elapsed time per iteration (s): 0.15 | learning rate: 5.137E-05 | global batch size: 256 | lm loss: 3.772071E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.464 | TFLOPs: 26.12 | +7: iteration 43970/ 60336 | consumed samples: 11256320 | consumed tokens: 23052943360 | elapsed time per iteration (s): 0.15 | learning rate: 5.133E-05 | global batch size: 256 | lm loss: 3.765023E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.266 | TFLOPs: 26.12 | +7: iteration 43980/ 60336 | consumed samples: 11258880 | consumed tokens: 23058186240 | elapsed time per iteration (s): 0.15 | learning rate: 5.130E-05 | global batch size: 256 | lm loss: 3.777413E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.169 | TFLOPs: 26.10 | +7: iteration 43990/ 60336 | consumed samples: 11261440 | consumed tokens: 23063429120 | elapsed time per iteration (s): 0.15 | learning rate: 5.126E-05 | global batch size: 256 | lm loss: 3.761376E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.089 | TFLOPs: 26.11 | +0: [2023-03-17 02:12:44,942] [INFO] [logging.py:68:log_dist] [Rank 0] step=44000, skipped=0, lr=[5.122487494352964e-05, 5.122487494352964e-05, 5.122487494352964e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 44000/ 60336 | consumed samples: 11264000 | consumed tokens: 23068672000 | elapsed time per iteration (s): 0.15 | learning rate: 5.122E-05 | global batch size: 256 | lm loss: 3.762814E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.435 | TFLOPs: 26.12 | +0: steps: 44000 loss: 3.7995 iter time (s): 0.153 samples/sec: 1674.687 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 44000 | lm loss value: 3.926302E+00 | lm loss PPL: 5.071908E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 44000 to checkpoints_44m32b100m +0: [2023-03-17 02:12:45,018] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step44000 is begin to save! +0: [2023-03-17 02:12:45,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:12:45,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:12:45,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:12:45,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:12:45,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:12:45,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:12:45,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:12:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:12:45,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:12:45,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:12:45,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:12:45,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:12:45,127] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:12:45,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:12:45,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:12:45,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:12:45,143] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:12:45,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:12:45,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:12:45,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:12:45,152] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step44000/mp_rank_00_model_states.pt +0: [2023-03-17 02:12:45,152] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:12:45,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:12:45,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:12:45,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 02:12:45,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:12:45,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:12:45,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:12:45,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: successfully saved checkpoint at iteration 44000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 179.47 +7: iteration 44010/ 60336 | consumed samples: 11266560 | consumed tokens: 23073914880 | elapsed time per iteration (s): 0.18 | learning rate: 5.119E-05 | global batch size: 256 | lm loss: 3.778727E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.067 | TFLOPs: 22.49 | +7: iteration 44020/ 60336 | consumed samples: 11269120 | consumed tokens: 23079157760 | elapsed time per iteration (s): 0.15 | learning rate: 5.115E-05 | global batch size: 256 | lm loss: 3.764857E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.643 | TFLOPs: 26.12 | +7: iteration 44030/ 60336 | consumed samples: 11271680 | consumed tokens: 23084400640 | elapsed time per iteration (s): 0.15 | learning rate: 5.112E-05 | global batch size: 256 | lm loss: 3.758921E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.365 | TFLOPs: 26.12 | +7: iteration 44040/ 60336 | consumed samples: 11274240 | consumed tokens: 23089643520 | elapsed time per iteration (s): 0.15 | learning rate: 5.108E-05 | global batch size: 256 | lm loss: 3.768184E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.712 | TFLOPs: 26.11 | +7: iteration 44050/ 60336 | consumed samples: 11276800 | consumed tokens: 23094886400 | elapsed time per iteration (s): 0.15 | learning rate: 5.105E-05 | global batch size: 256 | lm loss: 3.760167E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.152 | TFLOPs: 26.10 | +7: iteration 44060/ 60336 | consumed samples: 11279360 | consumed tokens: 23100129280 | elapsed time per iteration (s): 0.15 | learning rate: 5.101E-05 | global batch size: 256 | lm loss: 3.762842E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.074 | TFLOPs: 26.00 | +7: iteration 44070/ 60336 | consumed samples: 11281920 | consumed tokens: 23105372160 | elapsed time per iteration (s): 0.15 | learning rate: 5.097E-05 | global batch size: 256 | lm loss: 3.770273E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.369 | TFLOPs: 26.09 | +7: iteration 44080/ 60336 | consumed samples: 11284480 | consumed tokens: 23110615040 | elapsed time per iteration (s): 0.15 | learning rate: 5.094E-05 | global batch size: 256 | lm loss: 3.785753E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.926 | TFLOPs: 26.09 | +7: iteration 44090/ 60336 | consumed samples: 11287040 | consumed tokens: 23115857920 | elapsed time per iteration (s): 0.15 | learning rate: 5.090E-05 | global batch size: 256 | lm loss: 3.760824E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.514 | TFLOPs: 26.10 | +7: iteration 44100/ 60336 | consumed samples: 11289600 | consumed tokens: 23121100800 | elapsed time per iteration (s): 0.15 | learning rate: 5.087E-05 | global batch size: 256 | lm loss: 3.771767E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.323 | TFLOPs: 26.09 | +7: iteration 44110/ 60336 | consumed samples: 11292160 | consumed tokens: 23126343680 | elapsed time per iteration (s): 0.15 | learning rate: 5.083E-05 | global batch size: 256 | lm loss: 3.760289E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.698 | TFLOPs: 26.09 | +7: iteration 44120/ 60336 | consumed samples: 11294720 | consumed tokens: 23131586560 | elapsed time per iteration (s): 0.15 | learning rate: 5.080E-05 | global batch size: 256 | lm loss: 3.766623E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.588 | TFLOPs: 26.09 | +7: iteration 44130/ 60336 | consumed samples: 11297280 | consumed tokens: 23136829440 | elapsed time per iteration (s): 0.15 | learning rate: 5.076E-05 | global batch size: 256 | lm loss: 3.777340E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.682 | TFLOPs: 26.08 | +7: iteration 44140/ 60336 | consumed samples: 11299840 | consumed tokens: 23142072320 | elapsed time per iteration (s): 0.15 | learning rate: 5.072E-05 | global batch size: 256 | lm loss: 3.760337E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.939 | TFLOPs: 26.09 | +7: iteration 44150/ 60336 | consumed samples: 11302400 | consumed tokens: 23147315200 | elapsed time per iteration (s): 0.15 | learning rate: 5.069E-05 | global batch size: 256 | lm loss: 3.779961E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.878 | TFLOPs: 26.09 | +7: iteration 44160/ 60336 | consumed samples: 11304960 | consumed tokens: 23152558080 | elapsed time per iteration (s): 0.15 | learning rate: 5.065E-05 | global batch size: 256 | lm loss: 3.761520E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.852 | TFLOPs: 26.11 | +7: iteration 44170/ 60336 | consumed samples: 11307520 | consumed tokens: 23157800960 | elapsed time per iteration (s): 0.15 | learning rate: 5.062E-05 | global batch size: 256 | lm loss: 3.780171E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.739 | TFLOPs: 26.09 | +7: iteration 44180/ 60336 | consumed samples: 11310080 | consumed tokens: 23163043840 | elapsed time per iteration (s): 0.15 | learning rate: 5.058E-05 | global batch size: 256 | lm loss: 3.764567E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.777 | TFLOPs: 26.09 | +7: iteration 44190/ 60336 | consumed samples: 11312640 | consumed tokens: 23168286720 | elapsed time per iteration (s): 0.15 | learning rate: 5.055E-05 | global batch size: 256 | lm loss: 3.782022E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.426 | TFLOPs: 26.09 | +7: iteration 44200/ 60336 | consumed samples: 11315200 | consumed tokens: 23173529600 | elapsed time per iteration (s): 0.15 | learning rate: 5.051E-05 | global batch size: 256 | lm loss: 3.762819E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.854 | TFLOPs: 26.11 | +7: iteration 44210/ 60336 | consumed samples: 11317760 | consumed tokens: 23178772480 | elapsed time per iteration (s): 0.15 | learning rate: 5.048E-05 | global batch size: 256 | lm loss: 3.760438E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.031 | TFLOPs: 26.06 | +7: iteration 44220/ 60336 | consumed samples: 11320320 | consumed tokens: 23184015360 | elapsed time per iteration (s): 0.15 | learning rate: 5.044E-05 | global batch size: 256 | lm loss: 3.776341E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.255 | TFLOPs: 26.10 | +7: iteration 44230/ 60336 | consumed samples: 11322880 | consumed tokens: 23189258240 | elapsed time per iteration (s): 0.15 | learning rate: 5.040E-05 | global batch size: 256 | lm loss: 3.774815E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.528 | TFLOPs: 26.09 | +7: iteration 44240/ 60336 | consumed samples: 11325440 | consumed tokens: 23194501120 | elapsed time per iteration (s): 0.15 | learning rate: 5.037E-05 | global batch size: 256 | lm loss: 3.764914E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.945 | TFLOPs: 26.11 | +7: iteration 44250/ 60336 | consumed samples: 11328000 | consumed tokens: 23199744000 | elapsed time per iteration (s): 0.15 | learning rate: 5.033E-05 | global batch size: 256 | lm loss: 3.770031E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.178 | TFLOPs: 26.11 | +7: iteration 44260/ 60336 | consumed samples: 11330560 | consumed tokens: 23204986880 | elapsed time per iteration (s): 0.15 | learning rate: 5.030E-05 | global batch size: 256 | lm loss: 3.774532E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.986 | TFLOPs: 26.11 | +7: iteration 44270/ 60336 | consumed samples: 11333120 | consumed tokens: 23210229760 | elapsed time per iteration (s): 0.16 | learning rate: 5.026E-05 | global batch size: 256 | lm loss: 3.768368E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.180 | TFLOPs: 25.78 | +7: iteration 44280/ 60336 | consumed samples: 11335680 | consumed tokens: 23215472640 | elapsed time per iteration (s): 0.15 | learning rate: 5.023E-05 | global batch size: 256 | lm loss: 3.760706E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.806 | TFLOPs: 26.09 | +7: iteration 44290/ 60336 | consumed samples: 11338240 | consumed tokens: 23220715520 | elapsed time per iteration (s): 0.15 | learning rate: 5.019E-05 | global batch size: 256 | lm loss: 3.765519E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.154 | TFLOPs: 26.10 | +7: iteration 44300/ 60336 | consumed samples: 11340800 | consumed tokens: 23225958400 | elapsed time per iteration (s): 0.15 | learning rate: 5.016E-05 | global batch size: 256 | lm loss: 3.782872E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.761 | TFLOPs: 26.08 | +7: iteration 44310/ 60336 | consumed samples: 11343360 | consumed tokens: 23231201280 | elapsed time per iteration (s): 0.15 | learning rate: 5.012E-05 | global batch size: 256 | lm loss: 3.769625E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.419 | TFLOPs: 26.10 | +7: iteration 44320/ 60336 | consumed samples: 11345920 | consumed tokens: 23236444160 | elapsed time per iteration (s): 0.15 | learning rate: 5.009E-05 | global batch size: 256 | lm loss: 3.776057E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.677 | TFLOPs: 26.11 | +7: iteration 44330/ 60336 | consumed samples: 11348480 | consumed tokens: 23241687040 | elapsed time per iteration (s): 0.15 | learning rate: 5.005E-05 | global batch size: 256 | lm loss: 3.775229E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.347 | TFLOPs: 26.09 | +7: iteration 44340/ 60336 | consumed samples: 11351040 | consumed tokens: 23246929920 | elapsed time per iteration (s): 0.15 | learning rate: 5.002E-05 | global batch size: 256 | lm loss: 3.774380E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.907 | TFLOPs: 26.09 | +7: iteration 44350/ 60336 | consumed samples: 11353600 | consumed tokens: 23252172800 | elapsed time per iteration (s): 0.15 | learning rate: 4.998E-05 | global batch size: 256 | lm loss: 3.785196E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.938 | TFLOPs: 26.11 | +7: iteration 44360/ 60336 | consumed samples: 11356160 | consumed tokens: 23257415680 | elapsed time per iteration (s): 0.15 | learning rate: 4.994E-05 | global batch size: 256 | lm loss: 3.766153E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.295 | TFLOPs: 26.08 | +7: iteration 44370/ 60336 | consumed samples: 11358720 | consumed tokens: 23262658560 | elapsed time per iteration (s): 0.15 | learning rate: 4.991E-05 | global batch size: 256 | lm loss: 3.759261E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.426 | TFLOPs: 26.10 | +7: iteration 44380/ 60336 | consumed samples: 11361280 | consumed tokens: 23267901440 | elapsed time per iteration (s): 0.15 | learning rate: 4.987E-05 | global batch size: 256 | lm loss: 3.770245E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.765 | TFLOPs: 26.06 | +7: iteration 44390/ 60336 | consumed samples: 11363840 | consumed tokens: 23273144320 | elapsed time per iteration (s): 0.15 | learning rate: 4.984E-05 | global batch size: 256 | lm loss: 3.758424E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.741 | TFLOPs: 26.09 | +7: iteration 44400/ 60336 | consumed samples: 11366400 | consumed tokens: 23278387200 | elapsed time per iteration (s): 0.15 | learning rate: 4.980E-05 | global batch size: 256 | lm loss: 3.770642E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.023 | TFLOPs: 26.11 | +7: iteration 44410/ 60336 | consumed samples: 11368960 | consumed tokens: 23283630080 | elapsed time per iteration (s): 0.15 | learning rate: 4.977E-05 | global batch size: 256 | lm loss: 3.754902E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.370 | TFLOPs: 26.10 | +7: iteration 44420/ 60336 | consumed samples: 11371520 | consumed tokens: 23288872960 | elapsed time per iteration (s): 0.15 | learning rate: 4.973E-05 | global batch size: 256 | lm loss: 3.766322E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.102 | TFLOPs: 26.10 | +7: iteration 44430/ 60336 | consumed samples: 11374080 | consumed tokens: 23294115840 | elapsed time per iteration (s): 0.15 | learning rate: 4.970E-05 | global batch size: 256 | lm loss: 3.782965E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.873 | TFLOPs: 26.09 | +7: iteration 44440/ 60336 | consumed samples: 11376640 | consumed tokens: 23299358720 | elapsed time per iteration (s): 0.15 | learning rate: 4.966E-05 | global batch size: 256 | lm loss: 3.771589E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.705 | TFLOPs: 26.11 | +7: iteration 44450/ 60336 | consumed samples: 11379200 | consumed tokens: 23304601600 | elapsed time per iteration (s): 0.16 | learning rate: 4.963E-05 | global batch size: 256 | lm loss: 3.769384E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.021 | TFLOPs: 25.69 | +7: iteration 44460/ 60336 | consumed samples: 11381760 | consumed tokens: 23309844480 | elapsed time per iteration (s): 0.15 | learning rate: 4.959E-05 | global batch size: 256 | lm loss: 3.761891E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.489 | TFLOPs: 26.06 | +7: iteration 44470/ 60336 | consumed samples: 11384320 | consumed tokens: 23315087360 | elapsed time per iteration (s): 0.15 | learning rate: 4.956E-05 | global batch size: 256 | lm loss: 3.765704E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.907 | TFLOPs: 26.08 | +7: iteration 44480/ 60336 | consumed samples: 11386880 | consumed tokens: 23320330240 | elapsed time per iteration (s): 0.15 | learning rate: 4.952E-05 | global batch size: 256 | lm loss: 3.772235E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.626 | TFLOPs: 26.11 | +7: iteration 44490/ 60336 | consumed samples: 11389440 | consumed tokens: 23325573120 | elapsed time per iteration (s): 0.15 | learning rate: 4.949E-05 | global batch size: 256 | lm loss: 3.775277E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.167 | TFLOPs: 26.10 | +7: iteration 44500/ 60336 | consumed samples: 11392000 | consumed tokens: 23330816000 | elapsed time per iteration (s): 0.15 | learning rate: 4.945E-05 | global batch size: 256 | lm loss: 3.771057E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.352 | TFLOPs: 26.10 | +7: iteration 44510/ 60336 | consumed samples: 11394560 | consumed tokens: 23336058880 | elapsed time per iteration (s): 0.15 | learning rate: 4.942E-05 | global batch size: 256 | lm loss: 3.769975E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.550 | TFLOPs: 26.09 | +7: iteration 44520/ 60336 | consumed samples: 11397120 | consumed tokens: 23341301760 | elapsed time per iteration (s): 0.15 | learning rate: 4.938E-05 | global batch size: 256 | lm loss: 3.772876E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.005 | TFLOPs: 26.08 | +7: iteration 44530/ 60336 | consumed samples: 11399680 | consumed tokens: 23346544640 | elapsed time per iteration (s): 0.15 | learning rate: 4.935E-05 | global batch size: 256 | lm loss: 3.759261E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.972 | TFLOPs: 26.08 | +7: iteration 44540/ 60336 | consumed samples: 11402240 | consumed tokens: 23351787520 | elapsed time per iteration (s): 0.15 | learning rate: 4.931E-05 | global batch size: 256 | lm loss: 3.764422E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.992 | TFLOPs: 26.10 | +7: iteration 44550/ 60336 | consumed samples: 11404800 | consumed tokens: 23357030400 | elapsed time per iteration (s): 0.15 | learning rate: 4.928E-05 | global batch size: 256 | lm loss: 3.770578E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.965 | TFLOPs: 26.10 | +7: iteration 44560/ 60336 | consumed samples: 11407360 | consumed tokens: 23362273280 | elapsed time per iteration (s): 0.15 | learning rate: 4.924E-05 | global batch size: 256 | lm loss: 3.762823E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.722 | TFLOPs: 26.09 | +7: iteration 44570/ 60336 | consumed samples: 11409920 | consumed tokens: 23367516160 | elapsed time per iteration (s): 0.15 | learning rate: 4.921E-05 | global batch size: 256 | lm loss: 3.773342E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.737 | TFLOPs: 26.08 | +7: iteration 44580/ 60336 | consumed samples: 11412480 | consumed tokens: 23372759040 | elapsed time per iteration (s): 0.15 | learning rate: 4.917E-05 | global batch size: 256 | lm loss: 3.771510E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.681 | TFLOPs: 26.11 | +7: iteration 44590/ 60336 | consumed samples: 11415040 | consumed tokens: 23378001920 | elapsed time per iteration (s): 0.15 | learning rate: 4.914E-05 | global batch size: 256 | lm loss: 3.770894E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.161 | TFLOPs: 26.08 | +7: iteration 44600/ 60336 | consumed samples: 11417600 | consumed tokens: 23383244800 | elapsed time per iteration (s): 0.15 | learning rate: 4.910E-05 | global batch size: 256 | lm loss: 3.768073E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.100 | TFLOPs: 26.10 | +7: iteration 44610/ 60336 | consumed samples: 11420160 | consumed tokens: 23388487680 | elapsed time per iteration (s): 0.15 | learning rate: 4.907E-05 | global batch size: 256 | lm loss: 3.777229E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.062 | TFLOPs: 26.08 | +7: iteration 44620/ 60336 | consumed samples: 11422720 | consumed tokens: 23393730560 | elapsed time per iteration (s): 0.15 | learning rate: 4.903E-05 | global batch size: 256 | lm loss: 3.763226E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.396 | TFLOPs: 26.07 | +7: iteration 44630/ 60336 | consumed samples: 11425280 | consumed tokens: 23398973440 | elapsed time per iteration (s): 0.15 | learning rate: 4.900E-05 | global batch size: 256 | lm loss: 3.776130E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.560 | TFLOPs: 26.06 | +7: iteration 44640/ 60336 | consumed samples: 11427840 | consumed tokens: 23404216320 | elapsed time per iteration (s): 0.15 | learning rate: 4.896E-05 | global batch size: 256 | lm loss: 3.767210E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.407 | TFLOPs: 26.07 | +7: iteration 44650/ 60336 | consumed samples: 11430400 | consumed tokens: 23409459200 | elapsed time per iteration (s): 0.15 | learning rate: 4.893E-05 | global batch size: 256 | lm loss: 3.768925E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.241 | TFLOPs: 26.10 | +7: iteration 44660/ 60336 | consumed samples: 11432960 | consumed tokens: 23414702080 | elapsed time per iteration (s): 0.15 | learning rate: 4.889E-05 | global batch size: 256 | lm loss: 3.765526E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.378 | TFLOPs: 26.09 | +7: iteration 44670/ 60336 | consumed samples: 11435520 | consumed tokens: 23419944960 | elapsed time per iteration (s): 0.15 | learning rate: 4.886E-05 | global batch size: 256 | lm loss: 3.779392E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.443 | TFLOPs: 26.07 | +7: iteration 44680/ 60336 | consumed samples: 11438080 | consumed tokens: 23425187840 | elapsed time per iteration (s): 0.15 | learning rate: 4.883E-05 | global batch size: 256 | lm loss: 3.763078E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.085 | TFLOPs: 26.07 | +7: iteration 44690/ 60336 | consumed samples: 11440640 | consumed tokens: 23430430720 | elapsed time per iteration (s): 0.15 | learning rate: 4.879E-05 | global batch size: 256 | lm loss: 3.766663E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.693 | TFLOPs: 26.08 | +7: iteration 44700/ 60336 | consumed samples: 11443200 | consumed tokens: 23435673600 | elapsed time per iteration (s): 0.15 | learning rate: 4.876E-05 | global batch size: 256 | lm loss: 3.776274E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.177 | TFLOPs: 26.08 | +7: iteration 44710/ 60336 | consumed samples: 11445760 | consumed tokens: 23440916480 | elapsed time per iteration (s): 0.15 | learning rate: 4.872E-05 | global batch size: 256 | lm loss: 3.765442E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.278 | TFLOPs: 26.10 | +7: iteration 44720/ 60336 | consumed samples: 11448320 | consumed tokens: 23446159360 | elapsed time per iteration (s): 0.15 | learning rate: 4.869E-05 | global batch size: 256 | lm loss: 3.771482E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.776 | TFLOPs: 26.09 | +7: iteration 44730/ 60336 | consumed samples: 11450880 | consumed tokens: 23451402240 | elapsed time per iteration (s): 0.15 | learning rate: 4.865E-05 | global batch size: 256 | lm loss: 3.775641E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.453 | TFLOPs: 26.09 | +7: iteration 44740/ 60336 | consumed samples: 11453440 | consumed tokens: 23456645120 | elapsed time per iteration (s): 0.15 | learning rate: 4.862E-05 | global batch size: 256 | lm loss: 3.762783E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.396 | TFLOPs: 26.07 | +7: iteration 44750/ 60336 | consumed samples: 11456000 | consumed tokens: 23461888000 | elapsed time per iteration (s): 0.15 | learning rate: 4.858E-05 | global batch size: 256 | lm loss: 3.764484E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.525 | TFLOPs: 26.09 | +7: iteration 44760/ 60336 | consumed samples: 11458560 | consumed tokens: 23467130880 | elapsed time per iteration (s): 0.15 | learning rate: 4.855E-05 | global batch size: 256 | lm loss: 3.763111E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.890 | TFLOPs: 26.09 | +7: iteration 44770/ 60336 | consumed samples: 11461120 | consumed tokens: 23472373760 | elapsed time per iteration (s): 0.15 | learning rate: 4.851E-05 | global batch size: 256 | lm loss: 3.768636E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.155 | TFLOPs: 26.07 | +7: iteration 44780/ 60336 | consumed samples: 11463680 | consumed tokens: 23477616640 | elapsed time per iteration (s): 0.15 | learning rate: 4.848E-05 | global batch size: 256 | lm loss: 3.766875E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.768 | TFLOPs: 25.90 | +7: iteration 44790/ 60336 | consumed samples: 11466240 | consumed tokens: 23482859520 | elapsed time per iteration (s): 0.15 | learning rate: 4.844E-05 | global batch size: 256 | lm loss: 3.774831E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.190 | TFLOPs: 26.10 | +7: iteration 44800/ 60336 | consumed samples: 11468800 | consumed tokens: 23488102400 | elapsed time per iteration (s): 0.15 | learning rate: 4.841E-05 | global batch size: 256 | lm loss: 3.754858E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.428 | TFLOPs: 26.10 | +7: iteration 44810/ 60336 | consumed samples: 11471360 | consumed tokens: 23493345280 | elapsed time per iteration (s): 0.15 | learning rate: 4.838E-05 | global batch size: 256 | lm loss: 3.761988E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.701 | TFLOPs: 26.11 | +7: iteration 44820/ 60336 | consumed samples: 11473920 | consumed tokens: 23498588160 | elapsed time per iteration (s): 0.15 | learning rate: 4.834E-05 | global batch size: 256 | lm loss: 3.769965E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.787 | TFLOPs: 26.12 | +7: iteration 44830/ 60336 | consumed samples: 11476480 | consumed tokens: 23503831040 | elapsed time per iteration (s): 0.15 | learning rate: 4.831E-05 | global batch size: 256 | lm loss: 3.773014E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.580 | TFLOPs: 26.14 | +7: iteration 44840/ 60336 | consumed samples: 11479040 | consumed tokens: 23509073920 | elapsed time per iteration (s): 0.15 | learning rate: 4.827E-05 | global batch size: 256 | lm loss: 3.754389E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.422 | TFLOPs: 26.12 | +7: iteration 44850/ 60336 | consumed samples: 11481600 | consumed tokens: 23514316800 | elapsed time per iteration (s): 0.15 | learning rate: 4.824E-05 | global batch size: 256 | lm loss: 3.776195E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.898 | TFLOPs: 26.09 | +7: iteration 44860/ 60336 | consumed samples: 11484160 | consumed tokens: 23519559680 | elapsed time per iteration (s): 0.15 | learning rate: 4.820E-05 | global batch size: 256 | lm loss: 3.765524E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.477 | TFLOPs: 26.10 | +7: iteration 44870/ 60336 | consumed samples: 11486720 | consumed tokens: 23524802560 | elapsed time per iteration (s): 0.15 | learning rate: 4.817E-05 | global batch size: 256 | lm loss: 3.767693E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.525 | TFLOPs: 26.14 | +7: iteration 44880/ 60336 | consumed samples: 11489280 | consumed tokens: 23530045440 | elapsed time per iteration (s): 0.15 | learning rate: 4.813E-05 | global batch size: 256 | lm loss: 3.771280E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.658 | TFLOPs: 26.14 | +7: iteration 44890/ 60336 | consumed samples: 11491840 | consumed tokens: 23535288320 | elapsed time per iteration (s): 0.15 | learning rate: 4.810E-05 | global batch size: 256 | lm loss: 3.773985E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.915 | TFLOPs: 26.13 | +7: iteration 44900/ 60336 | consumed samples: 11494400 | consumed tokens: 23540531200 | elapsed time per iteration (s): 0.15 | learning rate: 4.807E-05 | global batch size: 256 | lm loss: 3.762510E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.336 | TFLOPs: 26.12 | +7: iteration 44910/ 60336 | consumed samples: 11496960 | consumed tokens: 23545774080 | elapsed time per iteration (s): 0.15 | learning rate: 4.803E-05 | global batch size: 256 | lm loss: 3.763882E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.632 | TFLOPs: 26.12 | +7: iteration 44920/ 60336 | consumed samples: 11499520 | consumed tokens: 23551016960 | elapsed time per iteration (s): 0.15 | learning rate: 4.800E-05 | global batch size: 256 | lm loss: 3.780707E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.916 | TFLOPs: 26.14 | +7: iteration 44930/ 60336 | consumed samples: 11502080 | consumed tokens: 23556259840 | elapsed time per iteration (s): 0.15 | learning rate: 4.796E-05 | global batch size: 256 | lm loss: 3.777715E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.301 | TFLOPs: 26.12 | +7: iteration 44940/ 60336 | consumed samples: 11504640 | consumed tokens: 23561502720 | elapsed time per iteration (s): 0.15 | learning rate: 4.793E-05 | global batch size: 256 | lm loss: 3.771215E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.363 | TFLOPs: 26.12 | +7: iteration 44950/ 60336 | consumed samples: 11507200 | consumed tokens: 23566745600 | elapsed time per iteration (s): 0.15 | learning rate: 4.789E-05 | global batch size: 256 | lm loss: 3.767896E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.436 | TFLOPs: 26.13 | +7: iteration 44960/ 60336 | consumed samples: 11509760 | consumed tokens: 23571988480 | elapsed time per iteration (s): 0.15 | learning rate: 4.786E-05 | global batch size: 256 | lm loss: 3.750788E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.042 | TFLOPs: 26.13 | +7: iteration 44970/ 60336 | consumed samples: 11512320 | consumed tokens: 23577231360 | elapsed time per iteration (s): 0.15 | learning rate: 4.783E-05 | global batch size: 256 | lm loss: 3.759738E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.527 | TFLOPs: 26.10 | +7: iteration 44980/ 60336 | consumed samples: 11514880 | consumed tokens: 23582474240 | elapsed time per iteration (s): 0.15 | learning rate: 4.779E-05 | global batch size: 256 | lm loss: 3.777128E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.640 | TFLOPs: 26.12 | +7: iteration 44990/ 60336 | consumed samples: 11517440 | consumed tokens: 23587717120 | elapsed time per iteration (s): 0.15 | learning rate: 4.776E-05 | global batch size: 256 | lm loss: 3.764336E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.891 | TFLOPs: 26.13 | +7: iteration 45000/ 60336 | consumed samples: 11520000 | consumed tokens: 23592960000 | elapsed time per iteration (s): 0.15 | learning rate: 4.772E-05 | global batch size: 256 | lm loss: 3.787315E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.277 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 45000 | lm loss value: 3.880912E+00 | lm loss PPL: 4.846840E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 45000 to checkpoints_44m32b100m +0: [2023-03-17 02:15:19,141] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step45000 is begin to save! +0: [2023-03-17 02:15:19,144] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:15:19,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:15:19,208] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:15:19,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:15:19,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:15:19,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:15:19,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:15:19,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:15:19,235] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:15:19,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:15:19,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:15:19,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:15:19,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:15:19,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:15:19,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:15:19,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:15:19,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:15:19,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:15:19,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:15:19,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:15:19,277] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step45000/mp_rank_00_model_states.pt +0: [2023-03-17 02:15:19,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:15:19,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:15:19,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:15:19,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:15:19,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:15:19,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:15:19,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:15:19,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:15:19,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:15:19,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:15:19,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: successfully saved checkpoint at iteration 45000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 181.99 +7: iteration 45010/ 60336 | consumed samples: 11522560 | consumed tokens: 23598202880 | elapsed time per iteration (s): 0.18 | learning rate: 4.769E-05 | global batch size: 256 | lm loss: 3.770803E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1447.233 | TFLOPs: 22.70 | +7: iteration 45020/ 60336 | consumed samples: 11525120 | consumed tokens: 23603445760 | elapsed time per iteration (s): 0.15 | learning rate: 4.765E-05 | global batch size: 256 | lm loss: 3.760654E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.439 | TFLOPs: 26.13 | +7: iteration 45030/ 60336 | consumed samples: 11527680 | consumed tokens: 23608688640 | elapsed time per iteration (s): 0.15 | learning rate: 4.762E-05 | global batch size: 256 | lm loss: 3.777704E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.226 | TFLOPs: 26.15 | +7: iteration 45040/ 60336 | consumed samples: 11530240 | consumed tokens: 23613931520 | elapsed time per iteration (s): 0.15 | learning rate: 4.759E-05 | global batch size: 256 | lm loss: 3.766877E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.933 | TFLOPs: 26.14 | +7: iteration 45050/ 60336 | consumed samples: 11532800 | consumed tokens: 23619174400 | elapsed time per iteration (s): 0.15 | learning rate: 4.755E-05 | global batch size: 256 | lm loss: 3.748355E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.449 | TFLOPs: 26.10 | +7: iteration 45060/ 60336 | consumed samples: 11535360 | consumed tokens: 23624417280 | elapsed time per iteration (s): 0.15 | learning rate: 4.752E-05 | global batch size: 256 | lm loss: 3.772241E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.922 | TFLOPs: 26.08 | +7: iteration 45070/ 60336 | consumed samples: 11537920 | consumed tokens: 23629660160 | elapsed time per iteration (s): 0.16 | learning rate: 4.748E-05 | global batch size: 256 | lm loss: 3.768417E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.741 | TFLOPs: 25.45 | +7: iteration 45080/ 60336 | consumed samples: 11540480 | consumed tokens: 23634903040 | elapsed time per iteration (s): 0.16 | learning rate: 4.745E-05 | global batch size: 256 | lm loss: 3.761691E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.469 | TFLOPs: 25.46 | +7: iteration 45090/ 60336 | consumed samples: 11543040 | consumed tokens: 23640145920 | elapsed time per iteration (s): 0.15 | learning rate: 4.742E-05 | global batch size: 256 | lm loss: 3.782050E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.187 | TFLOPs: 26.13 | +7: iteration 45100/ 60336 | consumed samples: 11545600 | consumed tokens: 23645388800 | elapsed time per iteration (s): 0.15 | learning rate: 4.738E-05 | global batch size: 256 | lm loss: 3.756813E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.653 | TFLOPs: 26.11 | +7: iteration 45110/ 60336 | consumed samples: 11548160 | consumed tokens: 23650631680 | elapsed time per iteration (s): 0.15 | learning rate: 4.735E-05 | global batch size: 256 | lm loss: 3.767296E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.697 | TFLOPs: 26.12 | +7: iteration 45120/ 60336 | consumed samples: 11550720 | consumed tokens: 23655874560 | elapsed time per iteration (s): 0.15 | learning rate: 4.731E-05 | global batch size: 256 | lm loss: 3.772630E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.061 | TFLOPs: 26.13 | +7: iteration 45130/ 60336 | consumed samples: 11553280 | consumed tokens: 23661117440 | elapsed time per iteration (s): 0.15 | learning rate: 4.728E-05 | global batch size: 256 | lm loss: 3.766501E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.641 | TFLOPs: 26.12 | +7: iteration 45140/ 60336 | consumed samples: 11555840 | consumed tokens: 23666360320 | elapsed time per iteration (s): 0.15 | learning rate: 4.725E-05 | global batch size: 256 | lm loss: 3.761007E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.871 | TFLOPs: 26.11 | +7: iteration 45150/ 60336 | consumed samples: 11558400 | consumed tokens: 23671603200 | elapsed time per iteration (s): 0.15 | learning rate: 4.721E-05 | global batch size: 256 | lm loss: 3.754226E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.573 | TFLOPs: 26.10 | +7: iteration 45160/ 60336 | consumed samples: 11560960 | consumed tokens: 23676846080 | elapsed time per iteration (s): 0.15 | learning rate: 4.718E-05 | global batch size: 256 | lm loss: 3.776658E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.718 | TFLOPs: 26.09 | +7: iteration 45170/ 60336 | consumed samples: 11563520 | consumed tokens: 23682088960 | elapsed time per iteration (s): 0.15 | learning rate: 4.714E-05 | global batch size: 256 | lm loss: 3.776447E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.567 | TFLOPs: 26.09 | +7: iteration 45180/ 60336 | consumed samples: 11566080 | consumed tokens: 23687331840 | elapsed time per iteration (s): 0.15 | learning rate: 4.711E-05 | global batch size: 256 | lm loss: 3.782667E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.449 | TFLOPs: 26.12 | +7: iteration 45190/ 60336 | consumed samples: 11568640 | consumed tokens: 23692574720 | elapsed time per iteration (s): 0.15 | learning rate: 4.708E-05 | global batch size: 256 | lm loss: 3.761327E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.843 | TFLOPs: 26.11 | +7: iteration 45200/ 60336 | consumed samples: 11571200 | consumed tokens: 23697817600 | elapsed time per iteration (s): 0.15 | learning rate: 4.704E-05 | global batch size: 256 | lm loss: 3.795985E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.050 | TFLOPs: 26.10 | +7: iteration 45210/ 60336 | consumed samples: 11573760 | consumed tokens: 23703060480 | elapsed time per iteration (s): 0.16 | learning rate: 4.701E-05 | global batch size: 256 | lm loss: 3.761008E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.797 | TFLOPs: 25.84 | +7: iteration 45220/ 60336 | consumed samples: 11576320 | consumed tokens: 23708303360 | elapsed time per iteration (s): 0.15 | learning rate: 4.698E-05 | global batch size: 256 | lm loss: 3.777756E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.474 | TFLOPs: 26.07 | +7: iteration 45230/ 60336 | consumed samples: 11578880 | consumed tokens: 23713546240 | elapsed time per iteration (s): 0.15 | learning rate: 4.694E-05 | global batch size: 256 | lm loss: 3.776233E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.774 | TFLOPs: 26.01 | +7: iteration 45240/ 60336 | consumed samples: 11581440 | consumed tokens: 23718789120 | elapsed time per iteration (s): 0.16 | learning rate: 4.691E-05 | global batch size: 256 | lm loss: 3.760016E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.241 | TFLOPs: 25.82 | +7: iteration 45250/ 60336 | consumed samples: 11584000 | consumed tokens: 23724032000 | elapsed time per iteration (s): 0.15 | learning rate: 4.687E-05 | global batch size: 256 | lm loss: 3.755965E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.450 | TFLOPs: 25.99 | +7: iteration 45260/ 60336 | consumed samples: 11586560 | consumed tokens: 23729274880 | elapsed time per iteration (s): 0.15 | learning rate: 4.684E-05 | global batch size: 256 | lm loss: 3.767226E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.268 | TFLOPs: 25.99 | +7: iteration 45270/ 60336 | consumed samples: 11589120 | consumed tokens: 23734517760 | elapsed time per iteration (s): 0.15 | learning rate: 4.681E-05 | global batch size: 256 | lm loss: 3.760606E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.282 | TFLOPs: 26.08 | +7: iteration 45280/ 60336 | consumed samples: 11591680 | consumed tokens: 23739760640 | elapsed time per iteration (s): 0.15 | learning rate: 4.677E-05 | global batch size: 256 | lm loss: 3.771089E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.701 | TFLOPs: 26.11 | +7: iteration 45290/ 60336 | consumed samples: 11594240 | consumed tokens: 23745003520 | elapsed time per iteration (s): 0.15 | learning rate: 4.674E-05 | global batch size: 256 | lm loss: 3.771990E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.085 | TFLOPs: 26.07 | +7: iteration 45300/ 60336 | consumed samples: 11596800 | consumed tokens: 23750246400 | elapsed time per iteration (s): 0.15 | learning rate: 4.671E-05 | global batch size: 256 | lm loss: 3.774305E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.306 | TFLOPs: 26.07 | +7: iteration 45310/ 60336 | consumed samples: 11599360 | consumed tokens: 23755489280 | elapsed time per iteration (s): 0.15 | learning rate: 4.667E-05 | global batch size: 256 | lm loss: 3.750049E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.081 | TFLOPs: 26.11 | +7: iteration 45320/ 60336 | consumed samples: 11601920 | consumed tokens: 23760732160 | elapsed time per iteration (s): 0.15 | learning rate: 4.664E-05 | global batch size: 256 | lm loss: 3.775508E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.121 | TFLOPs: 26.11 | +7: iteration 45330/ 60336 | consumed samples: 11604480 | consumed tokens: 23765975040 | elapsed time per iteration (s): 0.15 | learning rate: 4.660E-05 | global batch size: 256 | lm loss: 3.768597E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.329 | TFLOPs: 26.10 | +7: iteration 45340/ 60336 | consumed samples: 11607040 | consumed tokens: 23771217920 | elapsed time per iteration (s): 0.15 | learning rate: 4.657E-05 | global batch size: 256 | lm loss: 3.766804E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.733 | TFLOPs: 25.98 | +7: iteration 45350/ 60336 | consumed samples: 11609600 | consumed tokens: 23776460800 | elapsed time per iteration (s): 0.15 | learning rate: 4.654E-05 | global batch size: 256 | lm loss: 3.761124E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.294 | TFLOPs: 25.99 | +7: iteration 45360/ 60336 | consumed samples: 11612160 | consumed tokens: 23781703680 | elapsed time per iteration (s): 0.15 | learning rate: 4.650E-05 | global batch size: 256 | lm loss: 3.772308E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.245 | TFLOPs: 25.97 | +7: iteration 45370/ 60336 | consumed samples: 11614720 | consumed tokens: 23786946560 | elapsed time per iteration (s): 0.15 | learning rate: 4.647E-05 | global batch size: 256 | lm loss: 3.769322E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.404 | TFLOPs: 25.99 | +7: iteration 45380/ 60336 | consumed samples: 11617280 | consumed tokens: 23792189440 | elapsed time per iteration (s): 0.15 | learning rate: 4.644E-05 | global batch size: 256 | lm loss: 3.757363E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.797 | TFLOPs: 25.98 | +7: iteration 45390/ 60336 | consumed samples: 11619840 | consumed tokens: 23797432320 | elapsed time per iteration (s): 0.15 | learning rate: 4.640E-05 | global batch size: 256 | lm loss: 3.776574E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.617 | TFLOPs: 25.98 | +7: iteration 45400/ 60336 | consumed samples: 11622400 | consumed tokens: 23802675200 | elapsed time per iteration (s): 0.15 | learning rate: 4.637E-05 | global batch size: 256 | lm loss: 3.775486E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.607 | TFLOPs: 25.98 | +7: iteration 45410/ 60336 | consumed samples: 11624960 | consumed tokens: 23807918080 | elapsed time per iteration (s): 0.15 | learning rate: 4.634E-05 | global batch size: 256 | lm loss: 3.765601E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.330 | TFLOPs: 25.96 | +7: iteration 45420/ 60336 | consumed samples: 11627520 | consumed tokens: 23813160960 | elapsed time per iteration (s): 0.15 | learning rate: 4.630E-05 | global batch size: 256 | lm loss: 3.758318E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.174 | TFLOPs: 25.97 | +7: iteration 45430/ 60336 | consumed samples: 11630080 | consumed tokens: 23818403840 | elapsed time per iteration (s): 0.16 | learning rate: 4.627E-05 | global batch size: 256 | lm loss: 3.765917E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.635 | TFLOPs: 25.73 | +7: iteration 45440/ 60336 | consumed samples: 11632640 | consumed tokens: 23823646720 | elapsed time per iteration (s): 0.17 | learning rate: 4.624E-05 | global batch size: 256 | lm loss: 3.781325E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.395 | TFLOPs: 23.81 | +7: iteration 45450/ 60336 | consumed samples: 11635200 | consumed tokens: 23828889600 | elapsed time per iteration (s): 0.16 | learning rate: 4.620E-05 | global batch size: 256 | lm loss: 3.778167E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.430 | TFLOPs: 25.54 | +7: iteration 45460/ 60336 | consumed samples: 11637760 | consumed tokens: 23834132480 | elapsed time per iteration (s): 0.15 | learning rate: 4.617E-05 | global batch size: 256 | lm loss: 3.769843E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.907 | TFLOPs: 25.98 | +7: iteration 45470/ 60336 | consumed samples: 11640320 | consumed tokens: 23839375360 | elapsed time per iteration (s): 0.15 | learning rate: 4.614E-05 | global batch size: 256 | lm loss: 3.760763E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.149 | TFLOPs: 26.00 | +7: iteration 45480/ 60336 | consumed samples: 11642880 | consumed tokens: 23844618240 | elapsed time per iteration (s): 0.15 | learning rate: 4.610E-05 | global batch size: 256 | lm loss: 3.762667E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.915 | TFLOPs: 25.98 | +7: iteration 45490/ 60336 | consumed samples: 11645440 | consumed tokens: 23849861120 | elapsed time per iteration (s): 0.15 | learning rate: 4.607E-05 | global batch size: 256 | lm loss: 3.770546E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.859 | TFLOPs: 25.98 | +7: iteration 45500/ 60336 | consumed samples: 11648000 | consumed tokens: 23855104000 | elapsed time per iteration (s): 0.15 | learning rate: 4.604E-05 | global batch size: 256 | lm loss: 3.749646E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.326 | TFLOPs: 25.99 | +7: iteration 45510/ 60336 | consumed samples: 11650560 | consumed tokens: 23860346880 | elapsed time per iteration (s): 0.15 | learning rate: 4.600E-05 | global batch size: 256 | lm loss: 3.767068E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.923 | TFLOPs: 25.98 | +7: iteration 45520/ 60336 | consumed samples: 11653120 | consumed tokens: 23865589760 | elapsed time per iteration (s): 0.15 | learning rate: 4.597E-05 | global batch size: 256 | lm loss: 3.768915E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.201 | TFLOPs: 26.00 | +7: iteration 45530/ 60336 | consumed samples: 11655680 | consumed tokens: 23870832640 | elapsed time per iteration (s): 0.15 | learning rate: 4.594E-05 | global batch size: 256 | lm loss: 3.766272E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.416 | TFLOPs: 26.01 | +7: iteration 45540/ 60336 | consumed samples: 11658240 | consumed tokens: 23876075520 | elapsed time per iteration (s): 0.15 | learning rate: 4.590E-05 | global batch size: 256 | lm loss: 3.771991E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.594 | TFLOPs: 25.98 | +7: iteration 45550/ 60336 | consumed samples: 11660800 | consumed tokens: 23881318400 | elapsed time per iteration (s): 0.15 | learning rate: 4.587E-05 | global batch size: 256 | lm loss: 3.771774E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.228 | TFLOPs: 25.99 | +7: iteration 45560/ 60336 | consumed samples: 11663360 | consumed tokens: 23886561280 | elapsed time per iteration (s): 0.15 | learning rate: 4.584E-05 | global batch size: 256 | lm loss: 3.761828E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.742 | TFLOPs: 25.98 | +7: iteration 45570/ 60336 | consumed samples: 11665920 | consumed tokens: 23891804160 | elapsed time per iteration (s): 0.15 | learning rate: 4.580E-05 | global batch size: 256 | lm loss: 3.764950E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.456 | TFLOPs: 26.01 | +7: iteration 45580/ 60336 | consumed samples: 11668480 | consumed tokens: 23897047040 | elapsed time per iteration (s): 0.15 | learning rate: 4.577E-05 | global batch size: 256 | lm loss: 3.777513E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.066 | TFLOPs: 25.99 | +7: iteration 45590/ 60336 | consumed samples: 11671040 | consumed tokens: 23902289920 | elapsed time per iteration (s): 0.15 | learning rate: 4.574E-05 | global batch size: 256 | lm loss: 3.751585E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.446 | TFLOPs: 25.98 | +7: iteration 45600/ 60336 | consumed samples: 11673600 | consumed tokens: 23907532800 | elapsed time per iteration (s): 0.15 | learning rate: 4.570E-05 | global batch size: 256 | lm loss: 3.746837E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.611 | TFLOPs: 26.00 | +7: iteration 45610/ 60336 | consumed samples: 11676160 | consumed tokens: 23912775680 | elapsed time per iteration (s): 0.15 | learning rate: 4.567E-05 | global batch size: 256 | lm loss: 3.770857E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.911 | TFLOPs: 25.97 | +7: iteration 45620/ 60336 | consumed samples: 11678720 | consumed tokens: 23918018560 | elapsed time per iteration (s): 0.15 | learning rate: 4.564E-05 | global batch size: 256 | lm loss: 3.767367E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.942 | TFLOPs: 25.98 | +7: iteration 45630/ 60336 | consumed samples: 11681280 | consumed tokens: 23923261440 | elapsed time per iteration (s): 0.15 | learning rate: 4.560E-05 | global batch size: 256 | lm loss: 3.761781E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.898 | TFLOPs: 25.98 | +7: iteration 45640/ 60336 | consumed samples: 11683840 | consumed tokens: 23928504320 | elapsed time per iteration (s): 0.15 | learning rate: 4.557E-05 | global batch size: 256 | lm loss: 3.767159E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.043 | TFLOPs: 26.00 | +7: iteration 45650/ 60336 | consumed samples: 11686400 | consumed tokens: 23933747200 | elapsed time per iteration (s): 0.15 | learning rate: 4.554E-05 | global batch size: 256 | lm loss: 3.754923E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.895 | TFLOPs: 25.97 | +7: iteration 45660/ 60336 | consumed samples: 11688960 | consumed tokens: 23938990080 | elapsed time per iteration (s): 0.15 | learning rate: 4.551E-05 | global batch size: 256 | lm loss: 3.762898E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.007 | TFLOPs: 25.99 | +7: iteration 45670/ 60336 | consumed samples: 11691520 | consumed tokens: 23944232960 | elapsed time per iteration (s): 0.15 | learning rate: 4.547E-05 | global batch size: 256 | lm loss: 3.761032E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.966 | TFLOPs: 26.02 | +7: iteration 45680/ 60336 | consumed samples: 11694080 | consumed tokens: 23949475840 | elapsed time per iteration (s): 0.15 | learning rate: 4.544E-05 | global batch size: 256 | lm loss: 3.772840E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.553 | TFLOPs: 26.01 | +7: iteration 45690/ 60336 | consumed samples: 11696640 | consumed tokens: 23954718720 | elapsed time per iteration (s): 0.15 | learning rate: 4.541E-05 | global batch size: 256 | lm loss: 3.769349E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.435 | TFLOPs: 25.99 | +7: iteration 45700/ 60336 | consumed samples: 11699200 | consumed tokens: 23959961600 | elapsed time per iteration (s): 0.15 | learning rate: 4.537E-05 | global batch size: 256 | lm loss: 3.768344E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.903 | TFLOPs: 25.98 | +7: iteration 45710/ 60336 | consumed samples: 11701760 | consumed tokens: 23965204480 | elapsed time per iteration (s): 0.15 | learning rate: 4.534E-05 | global batch size: 256 | lm loss: 3.766454E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.990 | TFLOPs: 26.00 | +7: iteration 45720/ 60336 | consumed samples: 11704320 | consumed tokens: 23970447360 | elapsed time per iteration (s): 0.15 | learning rate: 4.531E-05 | global batch size: 256 | lm loss: 3.754396E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.539 | TFLOPs: 25.99 | +7: iteration 45730/ 60336 | consumed samples: 11706880 | consumed tokens: 23975690240 | elapsed time per iteration (s): 0.15 | learning rate: 4.528E-05 | global batch size: 256 | lm loss: 3.758124E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.657 | TFLOPs: 26.00 | +7: iteration 45740/ 60336 | consumed samples: 11709440 | consumed tokens: 23980933120 | elapsed time per iteration (s): 0.15 | learning rate: 4.524E-05 | global batch size: 256 | lm loss: 3.760114E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.617 | TFLOPs: 25.96 | +7: iteration 45750/ 60336 | consumed samples: 11712000 | consumed tokens: 23986176000 | elapsed time per iteration (s): 0.15 | learning rate: 4.521E-05 | global batch size: 256 | lm loss: 3.760653E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.260 | TFLOPs: 25.93 | +7: iteration 45760/ 60336 | consumed samples: 11714560 | consumed tokens: 23991418880 | elapsed time per iteration (s): 0.15 | learning rate: 4.518E-05 | global batch size: 256 | lm loss: 3.766671E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.894 | TFLOPs: 25.95 | +7: iteration 45770/ 60336 | consumed samples: 11717120 | consumed tokens: 23996661760 | elapsed time per iteration (s): 0.15 | learning rate: 4.514E-05 | global batch size: 256 | lm loss: 3.763574E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.406 | TFLOPs: 25.96 | +7: iteration 45780/ 60336 | consumed samples: 11719680 | consumed tokens: 24001904640 | elapsed time per iteration (s): 0.15 | learning rate: 4.511E-05 | global batch size: 256 | lm loss: 3.766234E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.706 | TFLOPs: 25.95 | +7: iteration 45790/ 60336 | consumed samples: 11722240 | consumed tokens: 24007147520 | elapsed time per iteration (s): 0.15 | learning rate: 4.508E-05 | global batch size: 256 | lm loss: 3.774336E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.788 | TFLOPs: 25.95 | +7: iteration 45800/ 60336 | consumed samples: 11724800 | consumed tokens: 24012390400 | elapsed time per iteration (s): 0.15 | learning rate: 4.505E-05 | global batch size: 256 | lm loss: 3.763144E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.635 | TFLOPs: 25.98 | +7: iteration 45810/ 60336 | consumed samples: 11727360 | consumed tokens: 24017633280 | elapsed time per iteration (s): 0.15 | learning rate: 4.501E-05 | global batch size: 256 | lm loss: 3.771764E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.618 | TFLOPs: 25.93 | +7: iteration 45820/ 60336 | consumed samples: 11729920 | consumed tokens: 24022876160 | elapsed time per iteration (s): 0.15 | learning rate: 4.498E-05 | global batch size: 256 | lm loss: 3.760994E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.721 | TFLOPs: 25.97 | +7: iteration 45830/ 60336 | consumed samples: 11732480 | consumed tokens: 24028119040 | elapsed time per iteration (s): 0.15 | learning rate: 4.495E-05 | global batch size: 256 | lm loss: 3.761119E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.380 | TFLOPs: 25.96 | +7: iteration 45840/ 60336 | consumed samples: 11735040 | consumed tokens: 24033361920 | elapsed time per iteration (s): 0.15 | learning rate: 4.491E-05 | global batch size: 256 | lm loss: 3.764265E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.882 | TFLOPs: 25.95 | +7: iteration 45850/ 60336 | consumed samples: 11737600 | consumed tokens: 24038604800 | elapsed time per iteration (s): 0.15 | learning rate: 4.488E-05 | global batch size: 256 | lm loss: 3.767908E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.681 | TFLOPs: 25.97 | +7: iteration 45860/ 60336 | consumed samples: 11740160 | consumed tokens: 24043847680 | elapsed time per iteration (s): 0.15 | learning rate: 4.485E-05 | global batch size: 256 | lm loss: 3.760923E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.224 | TFLOPs: 26.01 | +7: iteration 45870/ 60336 | consumed samples: 11742720 | consumed tokens: 24049090560 | elapsed time per iteration (s): 0.15 | learning rate: 4.482E-05 | global batch size: 256 | lm loss: 3.748431E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.242 | TFLOPs: 26.02 | +7: iteration 45880/ 60336 | consumed samples: 11745280 | consumed tokens: 24054333440 | elapsed time per iteration (s): 0.15 | learning rate: 4.478E-05 | global batch size: 256 | lm loss: 3.781424E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.070 | TFLOPs: 26.00 | +7: iteration 45890/ 60336 | consumed samples: 11747840 | consumed tokens: 24059576320 | elapsed time per iteration (s): 0.15 | learning rate: 4.475E-05 | global batch size: 256 | lm loss: 3.763944E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.851 | TFLOPs: 26.01 | +7: iteration 45900/ 60336 | consumed samples: 11750400 | consumed tokens: 24064819200 | elapsed time per iteration (s): 0.15 | learning rate: 4.472E-05 | global batch size: 256 | lm loss: 3.775858E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.508 | TFLOPs: 26.01 | +7: iteration 45910/ 60336 | consumed samples: 11752960 | consumed tokens: 24070062080 | elapsed time per iteration (s): 0.15 | learning rate: 4.469E-05 | global batch size: 256 | lm loss: 3.752559E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.232 | TFLOPs: 26.01 | +7: iteration 45920/ 60336 | consumed samples: 11755520 | consumed tokens: 24075304960 | elapsed time per iteration (s): 0.15 | learning rate: 4.465E-05 | global batch size: 256 | lm loss: 3.763379E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.698 | TFLOPs: 26.01 | +7: iteration 45930/ 60336 | consumed samples: 11758080 | consumed tokens: 24080547840 | elapsed time per iteration (s): 0.15 | learning rate: 4.462E-05 | global batch size: 256 | lm loss: 3.764533E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.509 | TFLOPs: 26.01 | +7: iteration 45940/ 60336 | consumed samples: 11760640 | consumed tokens: 24085790720 | elapsed time per iteration (s): 0.15 | learning rate: 4.459E-05 | global batch size: 256 | lm loss: 3.756147E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.241 | TFLOPs: 26.02 | +7: iteration 45950/ 60336 | consumed samples: 11763200 | consumed tokens: 24091033600 | elapsed time per iteration (s): 0.15 | learning rate: 4.456E-05 | global batch size: 256 | lm loss: 3.768974E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.395 | TFLOPs: 25.98 | +7: iteration 45960/ 60336 | consumed samples: 11765760 | consumed tokens: 24096276480 | elapsed time per iteration (s): 0.15 | learning rate: 4.452E-05 | global batch size: 256 | lm loss: 3.762567E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.734 | TFLOPs: 26.01 | +7: iteration 45970/ 60336 | consumed samples: 11768320 | consumed tokens: 24101519360 | elapsed time per iteration (s): 0.15 | learning rate: 4.449E-05 | global batch size: 256 | lm loss: 3.763395E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.799 | TFLOPs: 25.98 | +7: iteration 45980/ 60336 | consumed samples: 11770880 | consumed tokens: 24106762240 | elapsed time per iteration (s): 0.15 | learning rate: 4.446E-05 | global batch size: 256 | lm loss: 3.766202E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.577 | TFLOPs: 25.99 | +7: iteration 45990/ 60336 | consumed samples: 11773440 | consumed tokens: 24112005120 | elapsed time per iteration (s): 0.15 | learning rate: 4.443E-05 | global batch size: 256 | lm loss: 3.755883E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.147 | TFLOPs: 26.00 | +0: [2023-03-17 02:17:53,894] [INFO] [logging.py:68:log_dist] [Rank 0] step=46000, skipped=0, lr=[4.4393545006704866e-05, 4.4393545006704866e-05, 4.4393545006704866e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 46000/ 60336 | consumed samples: 11776000 | consumed tokens: 24117248000 | elapsed time per iteration (s): 0.15 | learning rate: 4.439E-05 | global batch size: 256 | lm loss: 3.756175E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.579 | TFLOPs: 26.01 | +0: steps: 46000 loss: 3.7681 iter time (s): 0.153 samples/sec: 1669.916 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 46000 | lm loss value: 3.886619E+00 | lm loss PPL: 4.874581E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 46000 to checkpoints_44m32b100m +0: [2023-03-17 02:17:53,968] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step46000 is begin to save! +0: [2023-03-17 02:17:53,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:17:54,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:17:54,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:17:54,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:17:54,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:17:54,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:17:54,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:17:54,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:17:54,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:17:54,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:17:54,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:17:54,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:17:54,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:17:54,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:17:54,081] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:17:54,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:17:54,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:17:54,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:17:54,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:17:54,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:17:54,098] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step46000/mp_rank_00_model_states.pt +0: [2023-03-17 02:17:54,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:17:54,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:17:54,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:17:54,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:17:54,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:17:54,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:17:54,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:17:54,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:17:54,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:17:54,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:17:54,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:17:54,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:17:54,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:17:54,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: successfully saved checkpoint at iteration 46000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 189.94 +7: iteration 46010/ 60336 | consumed samples: 11778560 | consumed tokens: 24122490880 | elapsed time per iteration (s): 0.18 | learning rate: 4.436E-05 | global batch size: 256 | lm loss: 3.768025E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.962 | TFLOPs: 22.14 | +7: iteration 46020/ 60336 | consumed samples: 11781120 | consumed tokens: 24127733760 | elapsed time per iteration (s): 0.15 | learning rate: 4.433E-05 | global batch size: 256 | lm loss: 3.760085E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.557 | TFLOPs: 26.17 | +7: iteration 46030/ 60336 | consumed samples: 11783680 | consumed tokens: 24132976640 | elapsed time per iteration (s): 0.15 | learning rate: 4.430E-05 | global batch size: 256 | lm loss: 3.773855E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.807 | TFLOPs: 26.12 | +7: iteration 46040/ 60336 | consumed samples: 11786240 | consumed tokens: 24138219520 | elapsed time per iteration (s): 0.15 | learning rate: 4.426E-05 | global batch size: 256 | lm loss: 3.780204E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.452 | TFLOPs: 26.13 | +7: iteration 46050/ 60336 | consumed samples: 11788800 | consumed tokens: 24143462400 | elapsed time per iteration (s): 0.15 | learning rate: 4.423E-05 | global batch size: 256 | lm loss: 3.778649E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.160 | TFLOPs: 26.11 | +7: iteration 46060/ 60336 | consumed samples: 11791360 | consumed tokens: 24148705280 | elapsed time per iteration (s): 0.15 | learning rate: 4.420E-05 | global batch size: 256 | lm loss: 3.771566E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.302 | TFLOPs: 26.12 | +7: iteration 46070/ 60336 | consumed samples: 11793920 | consumed tokens: 24153948160 | elapsed time per iteration (s): 0.15 | learning rate: 4.417E-05 | global batch size: 256 | lm loss: 3.777047E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.525 | TFLOPs: 26.15 | +7: iteration 46080/ 60336 | consumed samples: 11796480 | consumed tokens: 24159191040 | elapsed time per iteration (s): 0.15 | learning rate: 4.413E-05 | global batch size: 256 | lm loss: 3.778942E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.852 | TFLOPs: 26.12 | +7: iteration 46090/ 60336 | consumed samples: 11799040 | consumed tokens: 24164433920 | elapsed time per iteration (s): 0.15 | learning rate: 4.410E-05 | global batch size: 256 | lm loss: 3.753319E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.916 | TFLOPs: 26.13 | +7: iteration 46100/ 60336 | consumed samples: 11801600 | consumed tokens: 24169676800 | elapsed time per iteration (s): 0.15 | learning rate: 4.407E-05 | global batch size: 256 | lm loss: 3.767373E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.833 | TFLOPs: 26.12 | +7: iteration 46110/ 60336 | consumed samples: 11804160 | consumed tokens: 24174919680 | elapsed time per iteration (s): 0.15 | learning rate: 4.404E-05 | global batch size: 256 | lm loss: 3.764367E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.235 | TFLOPs: 26.12 | +7: iteration 46120/ 60336 | consumed samples: 11806720 | consumed tokens: 24180162560 | elapsed time per iteration (s): 0.15 | learning rate: 4.401E-05 | global batch size: 256 | lm loss: 3.766058E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.352 | TFLOPs: 26.12 | +7: iteration 46130/ 60336 | consumed samples: 11809280 | consumed tokens: 24185405440 | elapsed time per iteration (s): 0.15 | learning rate: 4.397E-05 | global batch size: 256 | lm loss: 3.755427E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.093 | TFLOPs: 26.11 | +7: iteration 46140/ 60336 | consumed samples: 11811840 | consumed tokens: 24190648320 | elapsed time per iteration (s): 0.15 | learning rate: 4.394E-05 | global batch size: 256 | lm loss: 3.766867E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.206 | TFLOPs: 26.13 | +7: iteration 46150/ 60336 | consumed samples: 11814400 | consumed tokens: 24195891200 | elapsed time per iteration (s): 0.15 | learning rate: 4.391E-05 | global batch size: 256 | lm loss: 3.752776E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.117 | TFLOPs: 26.11 | +7: iteration 46160/ 60336 | consumed samples: 11816960 | consumed tokens: 24201134080 | elapsed time per iteration (s): 0.15 | learning rate: 4.388E-05 | global batch size: 256 | lm loss: 3.769721E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.701 | TFLOPs: 26.09 | +7: iteration 46170/ 60336 | consumed samples: 11819520 | consumed tokens: 24206376960 | elapsed time per iteration (s): 0.15 | learning rate: 4.385E-05 | global batch size: 256 | lm loss: 3.768951E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.099 | TFLOPs: 26.11 | +7: iteration 46180/ 60336 | consumed samples: 11822080 | consumed tokens: 24211619840 | elapsed time per iteration (s): 0.15 | learning rate: 4.381E-05 | global batch size: 256 | lm loss: 3.759350E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.974 | TFLOPs: 26.11 | +7: iteration 46190/ 60336 | consumed samples: 11824640 | consumed tokens: 24216862720 | elapsed time per iteration (s): 0.15 | learning rate: 4.378E-05 | global batch size: 256 | lm loss: 3.770148E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.941 | TFLOPs: 26.08 | +7: iteration 46200/ 60336 | consumed samples: 11827200 | consumed tokens: 24222105600 | elapsed time per iteration (s): 0.15 | learning rate: 4.375E-05 | global batch size: 256 | lm loss: 3.776085E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.144 | TFLOPs: 26.07 | +7: iteration 46210/ 60336 | consumed samples: 11829760 | consumed tokens: 24227348480 | elapsed time per iteration (s): 0.16 | learning rate: 4.372E-05 | global batch size: 256 | lm loss: 3.766769E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.334 | TFLOPs: 25.60 | +7: iteration 46220/ 60336 | consumed samples: 11832320 | consumed tokens: 24232591360 | elapsed time per iteration (s): 0.15 | learning rate: 4.369E-05 | global batch size: 256 | lm loss: 3.760599E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.069 | TFLOPs: 26.08 | +7: iteration 46230/ 60336 | consumed samples: 11834880 | consumed tokens: 24237834240 | elapsed time per iteration (s): 0.15 | learning rate: 4.365E-05 | global batch size: 256 | lm loss: 3.772347E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.470 | TFLOPs: 26.09 | +7: iteration 46240/ 60336 | consumed samples: 11837440 | consumed tokens: 24243077120 | elapsed time per iteration (s): 0.15 | learning rate: 4.362E-05 | global batch size: 256 | lm loss: 3.767427E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.280 | TFLOPs: 26.10 | +7: iteration 46250/ 60336 | consumed samples: 11840000 | consumed tokens: 24248320000 | elapsed time per iteration (s): 0.15 | learning rate: 4.359E-05 | global batch size: 256 | lm loss: 3.763352E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.528 | TFLOPs: 26.09 | +7: iteration 46260/ 60336 | consumed samples: 11842560 | consumed tokens: 24253562880 | elapsed time per iteration (s): 0.15 | learning rate: 4.356E-05 | global batch size: 256 | lm loss: 3.756321E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.615 | TFLOPs: 26.09 | +7: iteration 46270/ 60336 | consumed samples: 11845120 | consumed tokens: 24258805760 | elapsed time per iteration (s): 0.15 | learning rate: 4.353E-05 | global batch size: 256 | lm loss: 3.758374E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.349 | TFLOPs: 26.10 | +7: iteration 46280/ 60336 | consumed samples: 11847680 | consumed tokens: 24264048640 | elapsed time per iteration (s): 0.15 | learning rate: 4.349E-05 | global batch size: 256 | lm loss: 3.754797E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.876 | TFLOPs: 26.06 | +7: iteration 46290/ 60336 | consumed samples: 11850240 | consumed tokens: 24269291520 | elapsed time per iteration (s): 0.15 | learning rate: 4.346E-05 | global batch size: 256 | lm loss: 3.768932E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.417 | TFLOPs: 26.12 | +7: iteration 46300/ 60336 | consumed samples: 11852800 | consumed tokens: 24274534400 | elapsed time per iteration (s): 0.15 | learning rate: 4.343E-05 | global batch size: 256 | lm loss: 3.760613E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.221 | TFLOPs: 26.10 | +7: iteration 46310/ 60336 | consumed samples: 11855360 | consumed tokens: 24279777280 | elapsed time per iteration (s): 0.15 | learning rate: 4.340E-05 | global batch size: 256 | lm loss: 3.776181E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.752 | TFLOPs: 26.09 | +7: iteration 46320/ 60336 | consumed samples: 11857920 | consumed tokens: 24285020160 | elapsed time per iteration (s): 0.15 | learning rate: 4.337E-05 | global batch size: 256 | lm loss: 3.769253E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.040 | TFLOPs: 26.17 | +7: iteration 46330/ 60336 | consumed samples: 11860480 | consumed tokens: 24290263040 | elapsed time per iteration (s): 0.15 | learning rate: 4.333E-05 | global batch size: 256 | lm loss: 3.773193E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.785 | TFLOPs: 26.16 | +7: iteration 46340/ 60336 | consumed samples: 11863040 | consumed tokens: 24295505920 | elapsed time per iteration (s): 0.15 | learning rate: 4.330E-05 | global batch size: 256 | lm loss: 3.757545E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.223 | TFLOPs: 26.13 | +7: iteration 46350/ 60336 | consumed samples: 11865600 | consumed tokens: 24300748800 | elapsed time per iteration (s): 0.15 | learning rate: 4.327E-05 | global batch size: 256 | lm loss: 3.768045E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.800 | TFLOPs: 26.12 | +7: iteration 46360/ 60336 | consumed samples: 11868160 | consumed tokens: 24305991680 | elapsed time per iteration (s): 0.15 | learning rate: 4.324E-05 | global batch size: 256 | lm loss: 3.770603E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.331 | TFLOPs: 26.10 | +7: iteration 46370/ 60336 | consumed samples: 11870720 | consumed tokens: 24311234560 | elapsed time per iteration (s): 0.15 | learning rate: 4.321E-05 | global batch size: 256 | lm loss: 3.756353E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.406 | TFLOPs: 26.10 | +7: iteration 46380/ 60336 | consumed samples: 11873280 | consumed tokens: 24316477440 | elapsed time per iteration (s): 0.15 | learning rate: 4.318E-05 | global batch size: 256 | lm loss: 3.754549E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.286 | TFLOPs: 26.10 | +7: iteration 46390/ 60336 | consumed samples: 11875840 | consumed tokens: 24321720320 | elapsed time per iteration (s): 0.15 | learning rate: 4.314E-05 | global batch size: 256 | lm loss: 3.767305E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.079 | TFLOPs: 26.16 | +7: iteration 46400/ 60336 | consumed samples: 11878400 | consumed tokens: 24326963200 | elapsed time per iteration (s): 0.15 | learning rate: 4.311E-05 | global batch size: 256 | lm loss: 3.760321E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.778 | TFLOPs: 26.12 | +7: iteration 46410/ 60336 | consumed samples: 11880960 | consumed tokens: 24332206080 | elapsed time per iteration (s): 0.15 | learning rate: 4.308E-05 | global batch size: 256 | lm loss: 3.774213E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.108 | TFLOPs: 26.14 | +7: iteration 46420/ 60336 | consumed samples: 11883520 | consumed tokens: 24337448960 | elapsed time per iteration (s): 0.15 | learning rate: 4.305E-05 | global batch size: 256 | lm loss: 3.759115E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.057 | TFLOPs: 26.14 | +7: iteration 46430/ 60336 | consumed samples: 11886080 | consumed tokens: 24342691840 | elapsed time per iteration (s): 0.15 | learning rate: 4.302E-05 | global batch size: 256 | lm loss: 3.772284E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.189 | TFLOPs: 26.11 | +7: iteration 46440/ 60336 | consumed samples: 11888640 | consumed tokens: 24347934720 | elapsed time per iteration (s): 0.15 | learning rate: 4.299E-05 | global batch size: 256 | lm loss: 3.777452E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.036 | TFLOPs: 26.14 | +7: iteration 46450/ 60336 | consumed samples: 11891200 | consumed tokens: 24353177600 | elapsed time per iteration (s): 0.15 | learning rate: 4.295E-05 | global batch size: 256 | lm loss: 3.766787E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.908 | TFLOPs: 26.11 | +7: iteration 46460/ 60336 | consumed samples: 11893760 | consumed tokens: 24358420480 | elapsed time per iteration (s): 0.15 | learning rate: 4.292E-05 | global batch size: 256 | lm loss: 3.757873E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.051 | TFLOPs: 26.00 | +7: iteration 46470/ 60336 | consumed samples: 11896320 | consumed tokens: 24363663360 | elapsed time per iteration (s): 0.15 | learning rate: 4.289E-05 | global batch size: 256 | lm loss: 3.771111E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.041 | TFLOPs: 26.14 | +7: iteration 46480/ 60336 | consumed samples: 11898880 | consumed tokens: 24368906240 | elapsed time per iteration (s): 0.15 | learning rate: 4.286E-05 | global batch size: 256 | lm loss: 3.769717E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.993 | TFLOPs: 26.14 | +7: iteration 46490/ 60336 | consumed samples: 11901440 | consumed tokens: 24374149120 | elapsed time per iteration (s): 0.15 | learning rate: 4.283E-05 | global batch size: 256 | lm loss: 3.766888E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.120 | TFLOPs: 26.16 | +7: iteration 46500/ 60336 | consumed samples: 11904000 | consumed tokens: 24379392000 | elapsed time per iteration (s): 0.15 | learning rate: 4.280E-05 | global batch size: 256 | lm loss: 3.758354E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.026 | TFLOPs: 26.16 | +7: iteration 46510/ 60336 | consumed samples: 11906560 | consumed tokens: 24384634880 | elapsed time per iteration (s): 0.15 | learning rate: 4.276E-05 | global batch size: 256 | lm loss: 3.767020E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.938 | TFLOPs: 26.17 | +7: iteration 46520/ 60336 | consumed samples: 11909120 | consumed tokens: 24389877760 | elapsed time per iteration (s): 0.15 | learning rate: 4.273E-05 | global batch size: 256 | lm loss: 3.768377E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.207 | TFLOPs: 26.16 | +7: iteration 46530/ 60336 | consumed samples: 11911680 | consumed tokens: 24395120640 | elapsed time per iteration (s): 0.15 | learning rate: 4.270E-05 | global batch size: 256 | lm loss: 3.752299E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.192 | TFLOPs: 26.15 | +7: iteration 46540/ 60336 | consumed samples: 11914240 | consumed tokens: 24400363520 | elapsed time per iteration (s): 0.15 | learning rate: 4.267E-05 | global batch size: 256 | lm loss: 3.751339E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.063 | TFLOPs: 26.16 | +7: iteration 46550/ 60336 | consumed samples: 11916800 | consumed tokens: 24405606400 | elapsed time per iteration (s): 0.15 | learning rate: 4.264E-05 | global batch size: 256 | lm loss: 3.749762E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.760 | TFLOPs: 26.19 | +7: iteration 46560/ 60336 | consumed samples: 11919360 | consumed tokens: 24410849280 | elapsed time per iteration (s): 0.15 | learning rate: 4.261E-05 | global batch size: 256 | lm loss: 3.756487E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.089 | TFLOPs: 26.18 | +7: iteration 46570/ 60336 | consumed samples: 11921920 | consumed tokens: 24416092160 | elapsed time per iteration (s): 0.15 | learning rate: 4.258E-05 | global batch size: 256 | lm loss: 3.759837E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.590 | TFLOPs: 26.17 | +7: iteration 46580/ 60336 | consumed samples: 11924480 | consumed tokens: 24421335040 | elapsed time per iteration (s): 0.15 | learning rate: 4.254E-05 | global batch size: 256 | lm loss: 3.759831E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.232 | TFLOPs: 26.18 | +7: iteration 46590/ 60336 | consumed samples: 11927040 | consumed tokens: 24426577920 | elapsed time per iteration (s): 0.15 | learning rate: 4.251E-05 | global batch size: 256 | lm loss: 3.765171E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.817 | TFLOPs: 26.17 | +7: iteration 46600/ 60336 | consumed samples: 11929600 | consumed tokens: 24431820800 | elapsed time per iteration (s): 0.15 | learning rate: 4.248E-05 | global batch size: 256 | lm loss: 3.764051E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.278 | TFLOPs: 26.18 | +7: iteration 46610/ 60336 | consumed samples: 11932160 | consumed tokens: 24437063680 | elapsed time per iteration (s): 0.15 | learning rate: 4.245E-05 | global batch size: 256 | lm loss: 3.763443E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.318 | TFLOPs: 26.18 | +7: iteration 46620/ 60336 | consumed samples: 11934720 | consumed tokens: 24442306560 | elapsed time per iteration (s): 0.15 | learning rate: 4.242E-05 | global batch size: 256 | lm loss: 3.769480E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.592 | TFLOPs: 26.15 | +7: iteration 46630/ 60336 | consumed samples: 11937280 | consumed tokens: 24447549440 | elapsed time per iteration (s): 0.15 | learning rate: 4.239E-05 | global batch size: 256 | lm loss: 3.758366E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.608 | TFLOPs: 26.15 | +7: iteration 46640/ 60336 | consumed samples: 11939840 | consumed tokens: 24452792320 | elapsed time per iteration (s): 0.15 | learning rate: 4.236E-05 | global batch size: 256 | lm loss: 3.757878E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.055 | TFLOPs: 26.17 | +7: iteration 46650/ 60336 | consumed samples: 11942400 | consumed tokens: 24458035200 | elapsed time per iteration (s): 0.15 | learning rate: 4.233E-05 | global batch size: 256 | lm loss: 3.781123E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.332 | TFLOPs: 26.12 | +7: iteration 46660/ 60336 | consumed samples: 11944960 | consumed tokens: 24463278080 | elapsed time per iteration (s): 0.15 | learning rate: 4.229E-05 | global batch size: 256 | lm loss: 3.780660E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.285 | TFLOPs: 26.12 | +7: iteration 46670/ 60336 | consumed samples: 11947520 | consumed tokens: 24468520960 | elapsed time per iteration (s): 0.15 | learning rate: 4.226E-05 | global batch size: 256 | lm loss: 3.765067E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.853 | TFLOPs: 26.12 | +7: iteration 46680/ 60336 | consumed samples: 11950080 | consumed tokens: 24473763840 | elapsed time per iteration (s): 0.15 | learning rate: 4.223E-05 | global batch size: 256 | lm loss: 3.759457E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.643 | TFLOPs: 26.12 | +7: iteration 46690/ 60336 | consumed samples: 11952640 | consumed tokens: 24479006720 | elapsed time per iteration (s): 0.15 | learning rate: 4.220E-05 | global batch size: 256 | lm loss: 3.779791E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.953 | TFLOPs: 26.14 | +7: iteration 46700/ 60336 | consumed samples: 11955200 | consumed tokens: 24484249600 | elapsed time per iteration (s): 0.15 | learning rate: 4.217E-05 | global batch size: 256 | lm loss: 3.768949E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.388 | TFLOPs: 26.12 | +7: iteration 46710/ 60336 | consumed samples: 11957760 | consumed tokens: 24489492480 | elapsed time per iteration (s): 0.15 | learning rate: 4.214E-05 | global batch size: 256 | lm loss: 3.757324E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.077 | TFLOPs: 26.13 | +7: iteration 46720/ 60336 | consumed samples: 11960320 | consumed tokens: 24494735360 | elapsed time per iteration (s): 0.15 | learning rate: 4.211E-05 | global batch size: 256 | lm loss: 3.780908E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.719 | TFLOPs: 26.15 | +7: iteration 46730/ 60336 | consumed samples: 11962880 | consumed tokens: 24499978240 | elapsed time per iteration (s): 0.15 | learning rate: 4.208E-05 | global batch size: 256 | lm loss: 3.765231E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.851 | TFLOPs: 26.16 | +7: iteration 46740/ 60336 | consumed samples: 11965440 | consumed tokens: 24505221120 | elapsed time per iteration (s): 0.15 | learning rate: 4.205E-05 | global batch size: 256 | lm loss: 3.774238E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.760 | TFLOPs: 26.14 | +7: iteration 46750/ 60336 | consumed samples: 11968000 | consumed tokens: 24510464000 | elapsed time per iteration (s): 0.15 | learning rate: 4.201E-05 | global batch size: 256 | lm loss: 3.762904E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.772 | TFLOPs: 26.17 | +7: iteration 46760/ 60336 | consumed samples: 11970560 | consumed tokens: 24515706880 | elapsed time per iteration (s): 0.15 | learning rate: 4.198E-05 | global batch size: 256 | lm loss: 3.768776E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.720 | TFLOPs: 26.17 | +7: iteration 46770/ 60336 | consumed samples: 11973120 | consumed tokens: 24520949760 | elapsed time per iteration (s): 0.15 | learning rate: 4.195E-05 | global batch size: 256 | lm loss: 3.752037E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.256 | TFLOPs: 26.12 | +7: iteration 46780/ 60336 | consumed samples: 11975680 | consumed tokens: 24526192640 | elapsed time per iteration (s): 0.15 | learning rate: 4.192E-05 | global batch size: 256 | lm loss: 3.771250E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.706 | TFLOPs: 26.12 | +7: iteration 46790/ 60336 | consumed samples: 11978240 | consumed tokens: 24531435520 | elapsed time per iteration (s): 0.15 | learning rate: 4.189E-05 | global batch size: 256 | lm loss: 3.762387E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.265 | TFLOPs: 26.12 | +7: iteration 46800/ 60336 | consumed samples: 11980800 | consumed tokens: 24536678400 | elapsed time per iteration (s): 0.15 | learning rate: 4.186E-05 | global batch size: 256 | lm loss: 3.765786E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.982 | TFLOPs: 26.13 | +7: iteration 46810/ 60336 | consumed samples: 11983360 | consumed tokens: 24541921280 | elapsed time per iteration (s): 0.15 | learning rate: 4.183E-05 | global batch size: 256 | lm loss: 3.764658E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.611 | TFLOPs: 26.12 | +7: iteration 46820/ 60336 | consumed samples: 11985920 | consumed tokens: 24547164160 | elapsed time per iteration (s): 0.15 | learning rate: 4.180E-05 | global batch size: 256 | lm loss: 3.764708E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.804 | TFLOPs: 26.12 | +7: iteration 46830/ 60336 | consumed samples: 11988480 | consumed tokens: 24552407040 | elapsed time per iteration (s): 0.15 | learning rate: 4.177E-05 | global batch size: 256 | lm loss: 3.749471E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.316 | TFLOPs: 26.12 | +7: iteration 46840/ 60336 | consumed samples: 11991040 | consumed tokens: 24557649920 | elapsed time per iteration (s): 0.15 | learning rate: 4.174E-05 | global batch size: 256 | lm loss: 3.772955E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.624 | TFLOPs: 26.12 | +7: iteration 46850/ 60336 | consumed samples: 11993600 | consumed tokens: 24562892800 | elapsed time per iteration (s): 0.15 | learning rate: 4.171E-05 | global batch size: 256 | lm loss: 3.770882E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.023 | TFLOPs: 26.13 | +7: iteration 46860/ 60336 | consumed samples: 11996160 | consumed tokens: 24568135680 | elapsed time per iteration (s): 0.15 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 3.763327E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.032 | TFLOPs: 26.11 | +7: iteration 46870/ 60336 | consumed samples: 11998720 | consumed tokens: 24573378560 | elapsed time per iteration (s): 0.15 | learning rate: 4.164E-05 | global batch size: 256 | lm loss: 3.762667E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.327 | TFLOPs: 26.13 | +7: iteration 46880/ 60336 | consumed samples: 12001280 | consumed tokens: 24578621440 | elapsed time per iteration (s): 0.15 | learning rate: 4.161E-05 | global batch size: 256 | lm loss: 3.762627E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.119 | TFLOPs: 26.10 | +7: iteration 46890/ 60336 | consumed samples: 12003840 | consumed tokens: 24583864320 | elapsed time per iteration (s): 0.15 | learning rate: 4.158E-05 | global batch size: 256 | lm loss: 3.761747E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.792 | TFLOPs: 26.09 | +7: iteration 46900/ 60336 | consumed samples: 12006400 | consumed tokens: 24589107200 | elapsed time per iteration (s): 0.15 | learning rate: 4.155E-05 | global batch size: 256 | lm loss: 3.763800E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.547 | TFLOPs: 26.12 | +7: iteration 46910/ 60336 | consumed samples: 12008960 | consumed tokens: 24594350080 | elapsed time per iteration (s): 0.15 | learning rate: 4.152E-05 | global batch size: 256 | lm loss: 3.751690E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.015 | TFLOPs: 26.11 | +7: iteration 46920/ 60336 | consumed samples: 12011520 | consumed tokens: 24599592960 | elapsed time per iteration (s): 0.15 | learning rate: 4.149E-05 | global batch size: 256 | lm loss: 3.765377E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.842 | TFLOPs: 26.12 | +7: iteration 46930/ 60336 | consumed samples: 12014080 | consumed tokens: 24604835840 | elapsed time per iteration (s): 0.15 | learning rate: 4.146E-05 | global batch size: 256 | lm loss: 3.764069E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.463 | TFLOPs: 26.12 | +7: iteration 46940/ 60336 | consumed samples: 12016640 | consumed tokens: 24610078720 | elapsed time per iteration (s): 0.15 | learning rate: 4.143E-05 | global batch size: 256 | lm loss: 3.774657E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 46950/ 60336 | consumed samples: 12019200 | consumed tokens: 24615321600 | elapsed time per iteration (s): 0.15 | learning rate: 4.140E-05 | global batch size: 256 | lm loss: 3.763400E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.819 | TFLOPs: 26.11 | +7: iteration 46960/ 60336 | consumed samples: 12021760 | consumed tokens: 24620564480 | elapsed time per iteration (s): 0.15 | learning rate: 4.137E-05 | global batch size: 256 | lm loss: 3.768646E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.676 | TFLOPs: 26.07 | +7: iteration 46970/ 60336 | consumed samples: 12024320 | consumed tokens: 24625807360 | elapsed time per iteration (s): 0.15 | learning rate: 4.134E-05 | global batch size: 256 | lm loss: 3.773137E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.867 | TFLOPs: 26.14 | +7: iteration 46980/ 60336 | consumed samples: 12026880 | consumed tokens: 24631050240 | elapsed time per iteration (s): 0.15 | learning rate: 4.131E-05 | global batch size: 256 | lm loss: 3.750274E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.450 | TFLOPs: 26.15 | +7: iteration 46990/ 60336 | consumed samples: 12029440 | consumed tokens: 24636293120 | elapsed time per iteration (s): 0.15 | learning rate: 4.128E-05 | global batch size: 256 | lm loss: 3.782743E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.214 | TFLOPs: 26.13 | +7: iteration 47000/ 60336 | consumed samples: 12032000 | consumed tokens: 24641536000 | elapsed time per iteration (s): 0.15 | learning rate: 4.125E-05 | global batch size: 256 | lm loss: 3.762737E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.168 | TFLOPs: 26.15 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 47000 | lm loss value: 3.884904E+00 | lm loss PPL: 4.866228E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 47000 to checkpoints_44m32b100m +0: [2023-03-17 02:20:27,922] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step47000 is begin to save! +0: [2023-03-17 02:20:27,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:20:27,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:20:27,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:20:27,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:20:27,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:20:28,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:20:28,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:20:28,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:20:28,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:20:28,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:20:28,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:20:28,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:20:28,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:20:28,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:20:28,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:20:28,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:20:28,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:20:28,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:20:28,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:20:28,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:20:28,055] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step47000/mp_rank_00_model_states.pt +0: [2023-03-17 02:20:28,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:20:28,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:20:28,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:20:28,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:20:28,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:20:28,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:20:28,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:20:28,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:20:28,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: successfully saved checkpoint at iteration 47000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 183.70 +7: iteration 47010/ 60336 | consumed samples: 12034560 | consumed tokens: 24646778880 | elapsed time per iteration (s): 0.18 | learning rate: 4.121E-05 | global batch size: 256 | lm loss: 3.764929E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1445.363 | TFLOPs: 22.67 | +7: iteration 47020/ 60336 | consumed samples: 12037120 | consumed tokens: 24652021760 | elapsed time per iteration (s): 0.15 | learning rate: 4.118E-05 | global batch size: 256 | lm loss: 3.762085E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.031 | TFLOPs: 26.08 | +7: iteration 47030/ 60336 | consumed samples: 12039680 | consumed tokens: 24657264640 | elapsed time per iteration (s): 0.15 | learning rate: 4.115E-05 | global batch size: 256 | lm loss: 3.765956E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.601 | TFLOPs: 26.12 | +7: iteration 47040/ 60336 | consumed samples: 12042240 | consumed tokens: 24662507520 | elapsed time per iteration (s): 0.15 | learning rate: 4.112E-05 | global batch size: 256 | lm loss: 3.762144E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.416 | TFLOPs: 26.12 | +7: iteration 47050/ 60336 | consumed samples: 12044800 | consumed tokens: 24667750400 | elapsed time per iteration (s): 0.15 | learning rate: 4.109E-05 | global batch size: 256 | lm loss: 3.760212E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.287 | TFLOPs: 26.08 | +7: iteration 47060/ 60336 | consumed samples: 12047360 | consumed tokens: 24672993280 | elapsed time per iteration (s): 0.15 | learning rate: 4.106E-05 | global batch size: 256 | lm loss: 3.762349E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.611 | TFLOPs: 26.06 | +7: iteration 47070/ 60336 | consumed samples: 12049920 | consumed tokens: 24678236160 | elapsed time per iteration (s): 0.15 | learning rate: 4.103E-05 | global batch size: 256 | lm loss: 3.759592E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.974 | TFLOPs: 26.08 | +7: iteration 47080/ 60336 | consumed samples: 12052480 | consumed tokens: 24683479040 | elapsed time per iteration (s): 0.15 | learning rate: 4.100E-05 | global batch size: 256 | lm loss: 3.769891E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.408 | TFLOPs: 26.04 | +7: iteration 47090/ 60336 | consumed samples: 12055040 | consumed tokens: 24688721920 | elapsed time per iteration (s): 0.15 | learning rate: 4.097E-05 | global batch size: 256 | lm loss: 3.763754E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.042 | TFLOPs: 26.05 | +7: iteration 47100/ 60336 | consumed samples: 12057600 | consumed tokens: 24693964800 | elapsed time per iteration (s): 0.15 | learning rate: 4.094E-05 | global batch size: 256 | lm loss: 3.776528E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.472 | TFLOPs: 26.07 | +7: iteration 47110/ 60336 | consumed samples: 12060160 | consumed tokens: 24699207680 | elapsed time per iteration (s): 0.15 | learning rate: 4.091E-05 | global batch size: 256 | lm loss: 3.768980E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.773 | TFLOPs: 26.08 | +7: iteration 47120/ 60336 | consumed samples: 12062720 | consumed tokens: 24704450560 | elapsed time per iteration (s): 0.15 | learning rate: 4.088E-05 | global batch size: 256 | lm loss: 3.768839E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.821 | TFLOPs: 26.08 | +7: iteration 47130/ 60336 | consumed samples: 12065280 | consumed tokens: 24709693440 | elapsed time per iteration (s): 0.15 | learning rate: 4.085E-05 | global batch size: 256 | lm loss: 3.760012E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.125 | TFLOPs: 26.08 | +7: iteration 47140/ 60336 | consumed samples: 12067840 | consumed tokens: 24714936320 | elapsed time per iteration (s): 0.15 | learning rate: 4.082E-05 | global batch size: 256 | lm loss: 3.748537E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.626 | TFLOPs: 26.09 | +7: iteration 47150/ 60336 | consumed samples: 12070400 | consumed tokens: 24720179200 | elapsed time per iteration (s): 0.15 | learning rate: 4.079E-05 | global batch size: 256 | lm loss: 3.756598E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.973 | TFLOPs: 26.10 | +7: iteration 47160/ 60336 | consumed samples: 12072960 | consumed tokens: 24725422080 | elapsed time per iteration (s): 0.15 | learning rate: 4.076E-05 | global batch size: 256 | lm loss: 3.765367E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.449 | TFLOPs: 26.07 | +7: iteration 47170/ 60336 | consumed samples: 12075520 | consumed tokens: 24730664960 | elapsed time per iteration (s): 0.15 | learning rate: 4.073E-05 | global batch size: 256 | lm loss: 3.756576E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.559 | TFLOPs: 26.04 | +7: iteration 47180/ 60336 | consumed samples: 12078080 | consumed tokens: 24735907840 | elapsed time per iteration (s): 0.15 | learning rate: 4.070E-05 | global batch size: 256 | lm loss: 3.766719E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.803 | TFLOPs: 26.06 | +7: iteration 47190/ 60336 | consumed samples: 12080640 | consumed tokens: 24741150720 | elapsed time per iteration (s): 0.15 | learning rate: 4.067E-05 | global batch size: 256 | lm loss: 3.764233E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.320 | TFLOPs: 26.07 | +7: iteration 47200/ 60336 | consumed samples: 12083200 | consumed tokens: 24746393600 | elapsed time per iteration (s): 0.15 | learning rate: 4.064E-05 | global batch size: 256 | lm loss: 3.773117E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.059 | TFLOPs: 26.08 | +7: iteration 47210/ 60336 | consumed samples: 12085760 | consumed tokens: 24751636480 | elapsed time per iteration (s): 0.16 | learning rate: 4.061E-05 | global batch size: 256 | lm loss: 3.775480E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.177 | TFLOPs: 25.71 | +7: iteration 47220/ 60336 | consumed samples: 12088320 | consumed tokens: 24756879360 | elapsed time per iteration (s): 0.15 | learning rate: 4.058E-05 | global batch size: 256 | lm loss: 3.772766E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.629 | TFLOPs: 26.03 | +7: iteration 47230/ 60336 | consumed samples: 12090880 | consumed tokens: 24762122240 | elapsed time per iteration (s): 0.15 | learning rate: 4.055E-05 | global batch size: 256 | lm loss: 3.760424E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.912 | TFLOPs: 26.03 | +7: iteration 47240/ 60336 | consumed samples: 12093440 | consumed tokens: 24767365120 | elapsed time per iteration (s): 0.15 | learning rate: 4.052E-05 | global batch size: 256 | lm loss: 3.770915E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.219 | TFLOPs: 26.10 | +7: iteration 47250/ 60336 | consumed samples: 12096000 | consumed tokens: 24772608000 | elapsed time per iteration (s): 0.15 | learning rate: 4.049E-05 | global batch size: 256 | lm loss: 3.753680E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.044 | TFLOPs: 26.07 | +7: iteration 47260/ 60336 | consumed samples: 12098560 | consumed tokens: 24777850880 | elapsed time per iteration (s): 0.15 | learning rate: 4.046E-05 | global batch size: 256 | lm loss: 3.756103E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.387 | TFLOPs: 26.07 | +7: iteration 47270/ 60336 | consumed samples: 12101120 | consumed tokens: 24783093760 | elapsed time per iteration (s): 0.15 | learning rate: 4.043E-05 | global batch size: 256 | lm loss: 3.768992E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.838 | TFLOPs: 26.08 | +7: iteration 47280/ 60336 | consumed samples: 12103680 | consumed tokens: 24788336640 | elapsed time per iteration (s): 0.15 | learning rate: 4.040E-05 | global batch size: 256 | lm loss: 3.771130E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.359 | TFLOPs: 26.02 | +7: iteration 47290/ 60336 | consumed samples: 12106240 | consumed tokens: 24793579520 | elapsed time per iteration (s): 0.15 | learning rate: 4.037E-05 | global batch size: 256 | lm loss: 3.771546E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.110 | TFLOPs: 26.05 | +7: iteration 47300/ 60336 | consumed samples: 12108800 | consumed tokens: 24798822400 | elapsed time per iteration (s): 0.16 | learning rate: 4.034E-05 | global batch size: 256 | lm loss: 3.770979E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.193 | TFLOPs: 25.28 | +7: iteration 47310/ 60336 | consumed samples: 12111360 | consumed tokens: 24804065280 | elapsed time per iteration (s): 0.15 | learning rate: 4.031E-05 | global batch size: 256 | lm loss: 3.761374E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.611 | TFLOPs: 25.96 | +7: iteration 47320/ 60336 | consumed samples: 12113920 | consumed tokens: 24809308160 | elapsed time per iteration (s): 0.15 | learning rate: 4.028E-05 | global batch size: 256 | lm loss: 3.759489E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.700 | TFLOPs: 26.08 | +7: iteration 47330/ 60336 | consumed samples: 12116480 | consumed tokens: 24814551040 | elapsed time per iteration (s): 0.15 | learning rate: 4.025E-05 | global batch size: 256 | lm loss: 3.761296E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.793 | TFLOPs: 26.06 | +7: iteration 47340/ 60336 | consumed samples: 12119040 | consumed tokens: 24819793920 | elapsed time per iteration (s): 0.15 | learning rate: 4.022E-05 | global batch size: 256 | lm loss: 3.765361E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.318 | TFLOPs: 26.04 | +7: iteration 47350/ 60336 | consumed samples: 12121600 | consumed tokens: 24825036800 | elapsed time per iteration (s): 0.15 | learning rate: 4.019E-05 | global batch size: 256 | lm loss: 3.763997E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.261 | TFLOPs: 26.07 | +7: iteration 47360/ 60336 | consumed samples: 12124160 | consumed tokens: 24830279680 | elapsed time per iteration (s): 0.15 | learning rate: 4.016E-05 | global batch size: 256 | lm loss: 3.760451E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.974 | TFLOPs: 25.91 | +7: iteration 47370/ 60336 | consumed samples: 12126720 | consumed tokens: 24835522560 | elapsed time per iteration (s): 0.15 | learning rate: 4.013E-05 | global batch size: 256 | lm loss: 3.757895E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.898 | TFLOPs: 26.08 | +7: iteration 47380/ 60336 | consumed samples: 12129280 | consumed tokens: 24840765440 | elapsed time per iteration (s): 0.15 | learning rate: 4.010E-05 | global batch size: 256 | lm loss: 3.775678E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.063 | TFLOPs: 26.07 | +7: iteration 47390/ 60336 | consumed samples: 12131840 | consumed tokens: 24846008320 | elapsed time per iteration (s): 0.15 | learning rate: 4.007E-05 | global batch size: 256 | lm loss: 3.778003E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.801 | TFLOPs: 26.08 | +7: iteration 47400/ 60336 | consumed samples: 12134400 | consumed tokens: 24851251200 | elapsed time per iteration (s): 0.16 | learning rate: 4.004E-05 | global batch size: 256 | lm loss: 3.762422E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.412 | TFLOPs: 25.69 | +7: iteration 47410/ 60336 | consumed samples: 12136960 | consumed tokens: 24856494080 | elapsed time per iteration (s): 0.15 | learning rate: 4.001E-05 | global batch size: 256 | lm loss: 3.765004E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.616 | TFLOPs: 26.09 | +7: iteration 47420/ 60336 | consumed samples: 12139520 | consumed tokens: 24861736960 | elapsed time per iteration (s): 0.15 | learning rate: 3.998E-05 | global batch size: 256 | lm loss: 3.767807E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.169 | TFLOPs: 26.10 | +7: iteration 47430/ 60336 | consumed samples: 12142080 | consumed tokens: 24866979840 | elapsed time per iteration (s): 0.15 | learning rate: 3.995E-05 | global batch size: 256 | lm loss: 3.758821E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 47440/ 60336 | consumed samples: 12144640 | consumed tokens: 24872222720 | elapsed time per iteration (s): 0.15 | learning rate: 3.992E-05 | global batch size: 256 | lm loss: 3.755403E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.012 | TFLOPs: 26.11 | +7: iteration 47450/ 60336 | consumed samples: 12147200 | consumed tokens: 24877465600 | elapsed time per iteration (s): 0.15 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 3.756087E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.852 | TFLOPs: 26.14 | +7: iteration 47460/ 60336 | consumed samples: 12149760 | consumed tokens: 24882708480 | elapsed time per iteration (s): 0.15 | learning rate: 3.986E-05 | global batch size: 256 | lm loss: 3.772249E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.066 | TFLOPs: 26.13 | +7: iteration 47470/ 60336 | consumed samples: 12152320 | consumed tokens: 24887951360 | elapsed time per iteration (s): 0.15 | learning rate: 3.983E-05 | global batch size: 256 | lm loss: 3.772563E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.610 | TFLOPs: 26.11 | +7: iteration 47480/ 60336 | consumed samples: 12154880 | consumed tokens: 24893194240 | elapsed time per iteration (s): 0.15 | learning rate: 3.980E-05 | global batch size: 256 | lm loss: 3.763424E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.229 | TFLOPs: 26.10 | +7: iteration 47490/ 60336 | consumed samples: 12157440 | consumed tokens: 24898437120 | elapsed time per iteration (s): 0.15 | learning rate: 3.977E-05 | global batch size: 256 | lm loss: 3.755080E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.899 | TFLOPs: 26.11 | +7: iteration 47500/ 60336 | consumed samples: 12160000 | consumed tokens: 24903680000 | elapsed time per iteration (s): 0.15 | learning rate: 3.974E-05 | global batch size: 256 | lm loss: 3.756377E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.754 | TFLOPs: 26.14 | +7: iteration 47510/ 60336 | consumed samples: 12162560 | consumed tokens: 24908922880 | elapsed time per iteration (s): 0.15 | learning rate: 3.971E-05 | global batch size: 256 | lm loss: 3.750053E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.796 | TFLOPs: 26.03 | +7: iteration 47520/ 60336 | consumed samples: 12165120 | consumed tokens: 24914165760 | elapsed time per iteration (s): 0.15 | learning rate: 3.968E-05 | global batch size: 256 | lm loss: 3.762279E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.460 | TFLOPs: 26.04 | +7: iteration 47530/ 60336 | consumed samples: 12167680 | consumed tokens: 24919408640 | elapsed time per iteration (s): 0.15 | learning rate: 3.965E-05 | global batch size: 256 | lm loss: 3.760590E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.196 | TFLOPs: 26.08 | +7: iteration 47540/ 60336 | consumed samples: 12170240 | consumed tokens: 24924651520 | elapsed time per iteration (s): 0.16 | learning rate: 3.962E-05 | global batch size: 256 | lm loss: 3.760424E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.109 | TFLOPs: 25.36 | +7: iteration 47550/ 60336 | consumed samples: 12172800 | consumed tokens: 24929894400 | elapsed time per iteration (s): 0.16 | learning rate: 3.959E-05 | global batch size: 256 | lm loss: 3.756772E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.055 | TFLOPs: 25.89 | +7: iteration 47560/ 60336 | consumed samples: 12175360 | consumed tokens: 24935137280 | elapsed time per iteration (s): 0.16 | learning rate: 3.957E-05 | global batch size: 256 | lm loss: 3.788530E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.484 | TFLOPs: 25.70 | +7: iteration 47570/ 60336 | consumed samples: 12177920 | consumed tokens: 24940380160 | elapsed time per iteration (s): 0.15 | learning rate: 3.954E-05 | global batch size: 256 | lm loss: 3.747496E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.489 | TFLOPs: 26.10 | +7: iteration 47580/ 60336 | consumed samples: 12180480 | consumed tokens: 24945623040 | elapsed time per iteration (s): 0.15 | learning rate: 3.951E-05 | global batch size: 256 | lm loss: 3.765623E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.113 | TFLOPs: 26.11 | +7: iteration 47590/ 60336 | consumed samples: 12183040 | consumed tokens: 24950865920 | elapsed time per iteration (s): 0.15 | learning rate: 3.948E-05 | global batch size: 256 | lm loss: 3.772507E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.752 | TFLOPs: 26.11 | +7: iteration 47600/ 60336 | consumed samples: 12185600 | consumed tokens: 24956108800 | elapsed time per iteration (s): 0.15 | learning rate: 3.945E-05 | global batch size: 256 | lm loss: 3.758327E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.305 | TFLOPs: 26.12 | +7: iteration 47610/ 60336 | consumed samples: 12188160 | consumed tokens: 24961351680 | elapsed time per iteration (s): 0.15 | learning rate: 3.942E-05 | global batch size: 256 | lm loss: 3.763881E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.096 | TFLOPs: 26.08 | +7: iteration 47620/ 60336 | consumed samples: 12190720 | consumed tokens: 24966594560 | elapsed time per iteration (s): 0.15 | learning rate: 3.939E-05 | global batch size: 256 | lm loss: 3.780014E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.596 | TFLOPs: 26.11 | +7: iteration 47630/ 60336 | consumed samples: 12193280 | consumed tokens: 24971837440 | elapsed time per iteration (s): 0.15 | learning rate: 3.936E-05 | global batch size: 256 | lm loss: 3.758313E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.629 | TFLOPs: 26.12 | +7: iteration 47640/ 60336 | consumed samples: 12195840 | consumed tokens: 24977080320 | elapsed time per iteration (s): 0.15 | learning rate: 3.933E-05 | global batch size: 256 | lm loss: 3.757564E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.988 | TFLOPs: 26.16 | +7: iteration 47650/ 60336 | consumed samples: 12198400 | consumed tokens: 24982323200 | elapsed time per iteration (s): 0.15 | learning rate: 3.930E-05 | global batch size: 256 | lm loss: 3.767950E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.966 | TFLOPs: 26.14 | +7: iteration 47660/ 60336 | consumed samples: 12200960 | consumed tokens: 24987566080 | elapsed time per iteration (s): 0.15 | learning rate: 3.927E-05 | global batch size: 256 | lm loss: 3.771860E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.122 | TFLOPs: 26.08 | +7: iteration 47670/ 60336 | consumed samples: 12203520 | consumed tokens: 24992808960 | elapsed time per iteration (s): 0.15 | learning rate: 3.924E-05 | global batch size: 256 | lm loss: 3.766515E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.422 | TFLOPs: 26.12 | +7: iteration 47680/ 60336 | consumed samples: 12206080 | consumed tokens: 24998051840 | elapsed time per iteration (s): 0.15 | learning rate: 3.921E-05 | global batch size: 256 | lm loss: 3.764988E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.774 | TFLOPs: 26.12 | +7: iteration 47690/ 60336 | consumed samples: 12208640 | consumed tokens: 25003294720 | elapsed time per iteration (s): 0.15 | learning rate: 3.918E-05 | global batch size: 256 | lm loss: 3.759071E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.529 | TFLOPs: 26.09 | +7: iteration 47700/ 60336 | consumed samples: 12211200 | consumed tokens: 25008537600 | elapsed time per iteration (s): 0.15 | learning rate: 3.915E-05 | global batch size: 256 | lm loss: 3.754855E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.413 | TFLOPs: 26.04 | +7: iteration 47710/ 60336 | consumed samples: 12213760 | consumed tokens: 25013780480 | elapsed time per iteration (s): 0.15 | learning rate: 3.913E-05 | global batch size: 256 | lm loss: 3.769788E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.447 | TFLOPs: 26.04 | +7: iteration 47720/ 60336 | consumed samples: 12216320 | consumed tokens: 25019023360 | elapsed time per iteration (s): 0.15 | learning rate: 3.910E-05 | global batch size: 256 | lm loss: 3.759321E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.316 | TFLOPs: 26.04 | +7: iteration 47730/ 60336 | consumed samples: 12218880 | consumed tokens: 25024266240 | elapsed time per iteration (s): 0.15 | learning rate: 3.907E-05 | global batch size: 256 | lm loss: 3.770751E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.700 | TFLOPs: 26.01 | +7: iteration 47740/ 60336 | consumed samples: 12221440 | consumed tokens: 25029509120 | elapsed time per iteration (s): 0.15 | learning rate: 3.904E-05 | global batch size: 256 | lm loss: 3.760955E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.814 | TFLOPs: 26.06 | +7: iteration 47750/ 60336 | consumed samples: 12224000 | consumed tokens: 25034752000 | elapsed time per iteration (s): 0.15 | learning rate: 3.901E-05 | global batch size: 256 | lm loss: 3.765413E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.205 | TFLOPs: 26.07 | +7: iteration 47760/ 60336 | consumed samples: 12226560 | consumed tokens: 25039994880 | elapsed time per iteration (s): 0.15 | learning rate: 3.898E-05 | global batch size: 256 | lm loss: 3.745868E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.615 | TFLOPs: 26.09 | +7: iteration 47770/ 60336 | consumed samples: 12229120 | consumed tokens: 25045237760 | elapsed time per iteration (s): 0.15 | learning rate: 3.895E-05 | global batch size: 256 | lm loss: 3.763574E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.462 | TFLOPs: 26.09 | +7: iteration 47780/ 60336 | consumed samples: 12231680 | consumed tokens: 25050480640 | elapsed time per iteration (s): 0.15 | learning rate: 3.892E-05 | global batch size: 256 | lm loss: 3.760733E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.683 | TFLOPs: 26.12 | +7: iteration 47790/ 60336 | consumed samples: 12234240 | consumed tokens: 25055723520 | elapsed time per iteration (s): 0.15 | learning rate: 3.889E-05 | global batch size: 256 | lm loss: 3.759873E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.139 | TFLOPs: 26.16 | +7: iteration 47800/ 60336 | consumed samples: 12236800 | consumed tokens: 25060966400 | elapsed time per iteration (s): 0.15 | learning rate: 3.886E-05 | global batch size: 256 | lm loss: 3.748948E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.585 | TFLOPs: 26.12 | +7: iteration 47810/ 60336 | consumed samples: 12239360 | consumed tokens: 25066209280 | elapsed time per iteration (s): 0.15 | learning rate: 3.883E-05 | global batch size: 256 | lm loss: 3.773160E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.387 | TFLOPs: 26.13 | +7: iteration 47820/ 60336 | consumed samples: 12241920 | consumed tokens: 25071452160 | elapsed time per iteration (s): 0.15 | learning rate: 3.881E-05 | global batch size: 256 | lm loss: 3.764322E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.415 | TFLOPs: 26.13 | +7: iteration 47830/ 60336 | consumed samples: 12244480 | consumed tokens: 25076695040 | elapsed time per iteration (s): 0.15 | learning rate: 3.878E-05 | global batch size: 256 | lm loss: 3.753339E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.821 | TFLOPs: 26.12 | +7: iteration 47840/ 60336 | consumed samples: 12247040 | consumed tokens: 25081937920 | elapsed time per iteration (s): 0.15 | learning rate: 3.875E-05 | global batch size: 256 | lm loss: 3.758028E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.214 | TFLOPs: 26.10 | +7: iteration 47850/ 60336 | consumed samples: 12249600 | consumed tokens: 25087180800 | elapsed time per iteration (s): 0.15 | learning rate: 3.872E-05 | global batch size: 256 | lm loss: 3.771054E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.381 | TFLOPs: 26.10 | +7: iteration 47860/ 60336 | consumed samples: 12252160 | consumed tokens: 25092423680 | elapsed time per iteration (s): 0.15 | learning rate: 3.869E-05 | global batch size: 256 | lm loss: 3.760361E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.348 | TFLOPs: 26.10 | +7: iteration 47870/ 60336 | consumed samples: 12254720 | consumed tokens: 25097666560 | elapsed time per iteration (s): 0.15 | learning rate: 3.866E-05 | global batch size: 256 | lm loss: 3.769740E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.992 | TFLOPs: 26.11 | +7: iteration 47880/ 60336 | consumed samples: 12257280 | consumed tokens: 25102909440 | elapsed time per iteration (s): 0.15 | learning rate: 3.863E-05 | global batch size: 256 | lm loss: 3.769674E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.398 | TFLOPs: 26.12 | +7: iteration 47890/ 60336 | consumed samples: 12259840 | consumed tokens: 25108152320 | elapsed time per iteration (s): 0.15 | learning rate: 3.860E-05 | global batch size: 256 | lm loss: 3.770050E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.638 | TFLOPs: 26.12 | +7: iteration 47900/ 60336 | consumed samples: 12262400 | consumed tokens: 25113395200 | elapsed time per iteration (s): 0.15 | learning rate: 3.857E-05 | global batch size: 256 | lm loss: 3.737959E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.499 | TFLOPs: 26.12 | +7: iteration 47910/ 60336 | consumed samples: 12264960 | consumed tokens: 25118638080 | elapsed time per iteration (s): 0.15 | learning rate: 3.855E-05 | global batch size: 256 | lm loss: 3.760875E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.254 | TFLOPs: 26.04 | +7: iteration 47920/ 60336 | consumed samples: 12267520 | consumed tokens: 25123880960 | elapsed time per iteration (s): 0.15 | learning rate: 3.852E-05 | global batch size: 256 | lm loss: 3.764709E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.647 | TFLOPs: 25.95 | +7: iteration 47930/ 60336 | consumed samples: 12270080 | consumed tokens: 25129123840 | elapsed time per iteration (s): 0.15 | learning rate: 3.849E-05 | global batch size: 256 | lm loss: 3.762002E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.427 | TFLOPs: 26.02 | +7: iteration 47940/ 60336 | consumed samples: 12272640 | consumed tokens: 25134366720 | elapsed time per iteration (s): 0.15 | learning rate: 3.846E-05 | global batch size: 256 | lm loss: 3.759453E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.680 | TFLOPs: 26.01 | +7: iteration 47950/ 60336 | consumed samples: 12275200 | consumed tokens: 25139609600 | elapsed time per iteration (s): 0.15 | learning rate: 3.843E-05 | global batch size: 256 | lm loss: 3.745589E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.889 | TFLOPs: 25.97 | +7: iteration 47960/ 60336 | consumed samples: 12277760 | consumed tokens: 25144852480 | elapsed time per iteration (s): 0.15 | learning rate: 3.840E-05 | global batch size: 256 | lm loss: 3.762483E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.828 | TFLOPs: 26.06 | +7: iteration 47970/ 60336 | consumed samples: 12280320 | consumed tokens: 25150095360 | elapsed time per iteration (s): 0.16 | learning rate: 3.837E-05 | global batch size: 256 | lm loss: 3.756160E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.882 | TFLOPs: 25.65 | +7: iteration 47980/ 60336 | consumed samples: 12282880 | consumed tokens: 25155338240 | elapsed time per iteration (s): 0.15 | learning rate: 3.834E-05 | global batch size: 256 | lm loss: 3.769652E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.084 | TFLOPs: 26.10 | +7: iteration 47990/ 60336 | consumed samples: 12285440 | consumed tokens: 25160581120 | elapsed time per iteration (s): 0.15 | learning rate: 3.832E-05 | global batch size: 256 | lm loss: 3.772559E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.363 | TFLOPs: 26.07 | +0: [2023-03-17 02:23:02,241] [INFO] [logging.py:68:log_dist] [Rank 0] step=48000, skipped=0, lr=[3.8287454357959406e-05, 3.8287454357959406e-05, 3.8287454357959406e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 48000/ 60336 | consumed samples: 12288000 | consumed tokens: 25165824000 | elapsed time per iteration (s): 0.15 | learning rate: 3.829E-05 | global batch size: 256 | lm loss: 3.767183E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.832 | TFLOPs: 26.08 | +0: steps: 48000 loss: 3.7637 iter time (s): 0.153 samples/sec: 1673.265 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 48000 | lm loss value: 3.952780E+00 | lm loss PPL: 5.207994E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 48000 to checkpoints_44m32b100m +0: [2023-03-17 02:23:02,315] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step48000 is begin to save! +0: [2023-03-17 02:23:02,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:23:02,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:23:02,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:23:02,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:23:02,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:23:02,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:23:02,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:23:02,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:23:02,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:23:02,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:23:02,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:23:02,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:23:02,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:23:02,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:23:02,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:23:02,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:23:02,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:23:02,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:23:02,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:23:02,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:23:02,446] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step48000/mp_rank_00_model_states.pt +0: [2023-03-17 02:23:02,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:23:02,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:23:02,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:23:02,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,480] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:23:02,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:23:02,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:23:02,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:23:02,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:23:02,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:23:02,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: successfully saved checkpoint at iteration 48000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 175.75 +7: iteration 48010/ 60336 | consumed samples: 12290560 | consumed tokens: 25171066880 | elapsed time per iteration (s): 0.18 | learning rate: 3.826E-05 | global batch size: 256 | lm loss: 3.769317E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.887 | TFLOPs: 22.49 | +7: iteration 48020/ 60336 | consumed samples: 12293120 | consumed tokens: 25176309760 | elapsed time per iteration (s): 0.16 | learning rate: 3.823E-05 | global batch size: 256 | lm loss: 3.772487E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.592 | TFLOPs: 25.65 | +7: iteration 48030/ 60336 | consumed samples: 12295680 | consumed tokens: 25181552640 | elapsed time per iteration (s): 0.15 | learning rate: 3.820E-05 | global batch size: 256 | lm loss: 3.769254E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.313 | TFLOPs: 26.08 | +7: iteration 48040/ 60336 | consumed samples: 12298240 | consumed tokens: 25186795520 | elapsed time per iteration (s): 0.15 | learning rate: 3.817E-05 | global batch size: 256 | lm loss: 3.757125E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.692 | TFLOPs: 26.09 | +7: iteration 48050/ 60336 | consumed samples: 12300800 | consumed tokens: 25192038400 | elapsed time per iteration (s): 0.15 | learning rate: 3.814E-05 | global batch size: 256 | lm loss: 3.766797E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.989 | TFLOPs: 26.08 | +7: iteration 48060/ 60336 | consumed samples: 12303360 | consumed tokens: 25197281280 | elapsed time per iteration (s): 0.16 | learning rate: 3.812E-05 | global batch size: 256 | lm loss: 3.776748E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.575 | TFLOPs: 25.67 | +7: iteration 48070/ 60336 | consumed samples: 12305920 | consumed tokens: 25202524160 | elapsed time per iteration (s): 0.15 | learning rate: 3.809E-05 | global batch size: 256 | lm loss: 3.761782E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.593 | TFLOPs: 26.09 | +7: iteration 48080/ 60336 | consumed samples: 12308480 | consumed tokens: 25207767040 | elapsed time per iteration (s): 0.15 | learning rate: 3.806E-05 | global batch size: 256 | lm loss: 3.776355E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.314 | TFLOPs: 26.07 | +7: iteration 48090/ 60336 | consumed samples: 12311040 | consumed tokens: 25213009920 | elapsed time per iteration (s): 0.15 | learning rate: 3.803E-05 | global batch size: 256 | lm loss: 3.764941E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.716 | TFLOPs: 26.09 | +7: iteration 48100/ 60336 | consumed samples: 12313600 | consumed tokens: 25218252800 | elapsed time per iteration (s): 0.15 | learning rate: 3.800E-05 | global batch size: 256 | lm loss: 3.742828E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.294 | TFLOPs: 26.07 | +7: iteration 48110/ 60336 | consumed samples: 12316160 | consumed tokens: 25223495680 | elapsed time per iteration (s): 0.15 | learning rate: 3.797E-05 | global batch size: 256 | lm loss: 3.773793E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.397 | TFLOPs: 26.07 | +7: iteration 48120/ 60336 | consumed samples: 12318720 | consumed tokens: 25228738560 | elapsed time per iteration (s): 0.15 | learning rate: 3.795E-05 | global batch size: 256 | lm loss: 3.753531E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.870 | TFLOPs: 26.08 | +7: iteration 48130/ 60336 | consumed samples: 12321280 | consumed tokens: 25233981440 | elapsed time per iteration (s): 0.15 | learning rate: 3.792E-05 | global batch size: 256 | lm loss: 3.765715E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.645 | TFLOPs: 26.09 | +7: iteration 48140/ 60336 | consumed samples: 12323840 | consumed tokens: 25239224320 | elapsed time per iteration (s): 0.15 | learning rate: 3.789E-05 | global batch size: 256 | lm loss: 3.777634E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.929 | TFLOPs: 26.08 | +7: iteration 48150/ 60336 | consumed samples: 12326400 | consumed tokens: 25244467200 | elapsed time per iteration (s): 0.15 | learning rate: 3.786E-05 | global batch size: 256 | lm loss: 3.765467E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.328 | TFLOPs: 26.02 | +7: iteration 48160/ 60336 | consumed samples: 12328960 | consumed tokens: 25249710080 | elapsed time per iteration (s): 0.15 | learning rate: 3.783E-05 | global batch size: 256 | lm loss: 3.772380E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.977 | TFLOPs: 26.03 | +7: iteration 48170/ 60336 | consumed samples: 12331520 | consumed tokens: 25254952960 | elapsed time per iteration (s): 0.15 | learning rate: 3.780E-05 | global batch size: 256 | lm loss: 3.766715E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.208 | TFLOPs: 26.00 | +7: iteration 48180/ 60336 | consumed samples: 12334080 | consumed tokens: 25260195840 | elapsed time per iteration (s): 0.15 | learning rate: 3.778E-05 | global batch size: 256 | lm loss: 3.767416E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.205 | TFLOPs: 25.97 | +7: iteration 48190/ 60336 | consumed samples: 12336640 | consumed tokens: 25265438720 | elapsed time per iteration (s): 0.15 | learning rate: 3.775E-05 | global batch size: 256 | lm loss: 3.760730E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.803 | TFLOPs: 26.01 | +7: iteration 48200/ 60336 | consumed samples: 12339200 | consumed tokens: 25270681600 | elapsed time per iteration (s): 0.15 | learning rate: 3.772E-05 | global batch size: 256 | lm loss: 3.772227E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.199 | TFLOPs: 26.02 | +7: iteration 48210/ 60336 | consumed samples: 12341760 | consumed tokens: 25275924480 | elapsed time per iteration (s): 0.15 | learning rate: 3.769E-05 | global batch size: 256 | lm loss: 3.756690E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.780 | TFLOPs: 26.01 | +7: iteration 48220/ 60336 | consumed samples: 12344320 | consumed tokens: 25281167360 | elapsed time per iteration (s): 0.15 | learning rate: 3.766E-05 | global batch size: 256 | lm loss: 3.757999E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.776 | TFLOPs: 26.01 | +7: iteration 48230/ 60336 | consumed samples: 12346880 | consumed tokens: 25286410240 | elapsed time per iteration (s): 0.15 | learning rate: 3.763E-05 | global batch size: 256 | lm loss: 3.747122E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.156 | TFLOPs: 26.04 | +7: iteration 48240/ 60336 | consumed samples: 12349440 | consumed tokens: 25291653120 | elapsed time per iteration (s): 0.15 | learning rate: 3.761E-05 | global batch size: 256 | lm loss: 3.770636E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.260 | TFLOPs: 26.04 | +7: iteration 48250/ 60336 | consumed samples: 12352000 | consumed tokens: 25296896000 | elapsed time per iteration (s): 0.15 | learning rate: 3.758E-05 | global batch size: 256 | lm loss: 3.769534E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.431 | TFLOPs: 26.04 | +7: iteration 48260/ 60336 | consumed samples: 12354560 | consumed tokens: 25302138880 | elapsed time per iteration (s): 0.15 | learning rate: 3.755E-05 | global batch size: 256 | lm loss: 3.751484E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.808 | TFLOPs: 26.03 | +7: iteration 48270/ 60336 | consumed samples: 12357120 | consumed tokens: 25307381760 | elapsed time per iteration (s): 0.15 | learning rate: 3.752E-05 | global batch size: 256 | lm loss: 3.751103E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.616 | TFLOPs: 26.04 | +7: iteration 48280/ 60336 | consumed samples: 12359680 | consumed tokens: 25312624640 | elapsed time per iteration (s): 0.16 | learning rate: 3.749E-05 | global batch size: 256 | lm loss: 3.762510E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.251 | TFLOPs: 25.32 | +7: iteration 48290/ 60336 | consumed samples: 12362240 | consumed tokens: 25317867520 | elapsed time per iteration (s): 0.15 | learning rate: 3.747E-05 | global batch size: 256 | lm loss: 3.779672E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.102 | TFLOPs: 26.10 | +7: iteration 48300/ 60336 | consumed samples: 12364800 | consumed tokens: 25323110400 | elapsed time per iteration (s): 0.15 | learning rate: 3.744E-05 | global batch size: 256 | lm loss: 3.778097E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.130 | TFLOPs: 26.07 | +7: iteration 48310/ 60336 | consumed samples: 12367360 | consumed tokens: 25328353280 | elapsed time per iteration (s): 0.15 | learning rate: 3.741E-05 | global batch size: 256 | lm loss: 3.786831E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.226 | TFLOPs: 26.08 | +7: iteration 48320/ 60336 | consumed samples: 12369920 | consumed tokens: 25333596160 | elapsed time per iteration (s): 0.15 | learning rate: 3.738E-05 | global batch size: 256 | lm loss: 3.753593E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.526 | TFLOPs: 26.06 | +7: iteration 48330/ 60336 | consumed samples: 12372480 | consumed tokens: 25338839040 | elapsed time per iteration (s): 0.16 | learning rate: 3.735E-05 | global batch size: 256 | lm loss: 3.768417E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.168 | TFLOPs: 25.63 | +7: iteration 48340/ 60336 | consumed samples: 12375040 | consumed tokens: 25344081920 | elapsed time per iteration (s): 0.16 | learning rate: 3.733E-05 | global batch size: 256 | lm loss: 3.748325E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.528 | TFLOPs: 25.32 | +7: iteration 48350/ 60336 | consumed samples: 12377600 | consumed tokens: 25349324800 | elapsed time per iteration (s): 0.15 | learning rate: 3.730E-05 | global batch size: 256 | lm loss: 3.764594E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.581 | TFLOPs: 26.09 | +7: iteration 48360/ 60336 | consumed samples: 12380160 | consumed tokens: 25354567680 | elapsed time per iteration (s): 0.15 | learning rate: 3.727E-05 | global batch size: 256 | lm loss: 3.747485E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.740 | TFLOPs: 26.04 | +7: iteration 48370/ 60336 | consumed samples: 12382720 | consumed tokens: 25359810560 | elapsed time per iteration (s): 0.15 | learning rate: 3.724E-05 | global batch size: 256 | lm loss: 3.763572E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.176 | TFLOPs: 26.02 | +7: iteration 48380/ 60336 | consumed samples: 12385280 | consumed tokens: 25365053440 | elapsed time per iteration (s): 0.15 | learning rate: 3.721E-05 | global batch size: 256 | lm loss: 3.772825E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.507 | TFLOPs: 26.01 | +7: iteration 48390/ 60336 | consumed samples: 12387840 | consumed tokens: 25370296320 | elapsed time per iteration (s): 0.15 | learning rate: 3.719E-05 | global batch size: 256 | lm loss: 3.760018E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.493 | TFLOPs: 26.04 | +7: iteration 48400/ 60336 | consumed samples: 12390400 | consumed tokens: 25375539200 | elapsed time per iteration (s): 0.15 | learning rate: 3.716E-05 | global batch size: 256 | lm loss: 3.753754E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.952 | TFLOPs: 25.99 | +7: iteration 48410/ 60336 | consumed samples: 12392960 | consumed tokens: 25380782080 | elapsed time per iteration (s): 0.15 | learning rate: 3.713E-05 | global batch size: 256 | lm loss: 3.764157E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.701 | TFLOPs: 26.03 | +7: iteration 48420/ 60336 | consumed samples: 12395520 | consumed tokens: 25386024960 | elapsed time per iteration (s): 0.15 | learning rate: 3.710E-05 | global batch size: 256 | lm loss: 3.778999E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.150 | TFLOPs: 26.05 | +7: iteration 48430/ 60336 | consumed samples: 12398080 | consumed tokens: 25391267840 | elapsed time per iteration (s): 0.15 | learning rate: 3.708E-05 | global batch size: 256 | lm loss: 3.763655E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.295 | TFLOPs: 26.02 | +7: iteration 48440/ 60336 | consumed samples: 12400640 | consumed tokens: 25396510720 | elapsed time per iteration (s): 0.15 | learning rate: 3.705E-05 | global batch size: 256 | lm loss: 3.779567E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.418 | TFLOPs: 26.01 | +7: iteration 48450/ 60336 | consumed samples: 12403200 | consumed tokens: 25401753600 | elapsed time per iteration (s): 0.15 | learning rate: 3.702E-05 | global batch size: 256 | lm loss: 3.768792E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.747 | TFLOPs: 26.15 | +7: iteration 48460/ 60336 | consumed samples: 12405760 | consumed tokens: 25406996480 | elapsed time per iteration (s): 0.15 | learning rate: 3.699E-05 | global batch size: 256 | lm loss: 3.767110E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.713 | TFLOPs: 25.97 | +7: iteration 48470/ 60336 | consumed samples: 12408320 | consumed tokens: 25412239360 | elapsed time per iteration (s): 0.15 | learning rate: 3.697E-05 | global batch size: 256 | lm loss: 3.762427E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.998 | TFLOPs: 26.16 | +7: iteration 48480/ 60336 | consumed samples: 12410880 | consumed tokens: 25417482240 | elapsed time per iteration (s): 0.16 | learning rate: 3.694E-05 | global batch size: 256 | lm loss: 3.760289E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.082 | TFLOPs: 25.49 | +7: iteration 48490/ 60336 | consumed samples: 12413440 | consumed tokens: 25422725120 | elapsed time per iteration (s): 0.15 | learning rate: 3.691E-05 | global batch size: 256 | lm loss: 3.760658E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.227 | TFLOPs: 26.07 | +7: iteration 48500/ 60336 | consumed samples: 12416000 | consumed tokens: 25427968000 | elapsed time per iteration (s): 0.16 | learning rate: 3.688E-05 | global batch size: 256 | lm loss: 3.758094E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.424 | TFLOPs: 25.38 | +7: iteration 48510/ 60336 | consumed samples: 12418560 | consumed tokens: 25433210880 | elapsed time per iteration (s): 0.15 | learning rate: 3.685E-05 | global batch size: 256 | lm loss: 3.756695E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.892 | TFLOPs: 26.03 | +7: iteration 48520/ 60336 | consumed samples: 12421120 | consumed tokens: 25438453760 | elapsed time per iteration (s): 0.15 | learning rate: 3.683E-05 | global batch size: 256 | lm loss: 3.768804E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.671 | TFLOPs: 26.06 | +7: iteration 48530/ 60336 | consumed samples: 12423680 | consumed tokens: 25443696640 | elapsed time per iteration (s): 0.15 | learning rate: 3.680E-05 | global batch size: 256 | lm loss: 3.764529E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.669 | TFLOPs: 26.03 | +7: iteration 48540/ 60336 | consumed samples: 12426240 | consumed tokens: 25448939520 | elapsed time per iteration (s): 0.15 | learning rate: 3.677E-05 | global batch size: 256 | lm loss: 3.752667E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.158 | TFLOPs: 26.05 | +7: iteration 48550/ 60336 | consumed samples: 12428800 | consumed tokens: 25454182400 | elapsed time per iteration (s): 0.15 | learning rate: 3.674E-05 | global batch size: 256 | lm loss: 3.764130E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.667 | TFLOPs: 26.06 | +7: iteration 48560/ 60336 | consumed samples: 12431360 | consumed tokens: 25459425280 | elapsed time per iteration (s): 0.15 | learning rate: 3.672E-05 | global batch size: 256 | lm loss: 3.754315E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.188 | TFLOPs: 26.05 | +7: iteration 48570/ 60336 | consumed samples: 12433920 | consumed tokens: 25464668160 | elapsed time per iteration (s): 0.16 | learning rate: 3.669E-05 | global batch size: 256 | lm loss: 3.774505E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.598 | TFLOPs: 25.27 | +7: iteration 48580/ 60336 | consumed samples: 12436480 | consumed tokens: 25469911040 | elapsed time per iteration (s): 0.15 | learning rate: 3.666E-05 | global batch size: 256 | lm loss: 3.757634E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.651 | TFLOPs: 26.06 | +7: iteration 48590/ 60336 | consumed samples: 12439040 | consumed tokens: 25475153920 | elapsed time per iteration (s): 0.15 | learning rate: 3.663E-05 | global batch size: 256 | lm loss: 3.757888E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.950 | TFLOPs: 26.05 | +7: iteration 48600/ 60336 | consumed samples: 12441600 | consumed tokens: 25480396800 | elapsed time per iteration (s): 0.15 | learning rate: 3.661E-05 | global batch size: 256 | lm loss: 3.776138E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.287 | TFLOPs: 26.04 | +7: iteration 48610/ 60336 | consumed samples: 12444160 | consumed tokens: 25485639680 | elapsed time per iteration (s): 0.15 | learning rate: 3.658E-05 | global batch size: 256 | lm loss: 3.760386E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.823 | TFLOPs: 26.06 | +7: iteration 48620/ 60336 | consumed samples: 12446720 | consumed tokens: 25490882560 | elapsed time per iteration (s): 0.15 | learning rate: 3.655E-05 | global batch size: 256 | lm loss: 3.766789E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.682 | TFLOPs: 25.98 | +7: iteration 48630/ 60336 | consumed samples: 12449280 | consumed tokens: 25496125440 | elapsed time per iteration (s): 0.15 | learning rate: 3.653E-05 | global batch size: 256 | lm loss: 3.741145E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.149 | TFLOPs: 26.00 | +7: iteration 48640/ 60336 | consumed samples: 12451840 | consumed tokens: 25501368320 | elapsed time per iteration (s): 0.15 | learning rate: 3.650E-05 | global batch size: 256 | lm loss: 3.769315E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.695 | TFLOPs: 26.01 | +7: iteration 48650/ 60336 | consumed samples: 12454400 | consumed tokens: 25506611200 | elapsed time per iteration (s): 0.15 | learning rate: 3.647E-05 | global batch size: 256 | lm loss: 3.764811E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.933 | TFLOPs: 26.02 | +7: iteration 48660/ 60336 | consumed samples: 12456960 | consumed tokens: 25511854080 | elapsed time per iteration (s): 0.15 | learning rate: 3.644E-05 | global batch size: 256 | lm loss: 3.761728E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.569 | TFLOPs: 26.03 | +7: iteration 48670/ 60336 | consumed samples: 12459520 | consumed tokens: 25517096960 | elapsed time per iteration (s): 0.15 | learning rate: 3.642E-05 | global batch size: 256 | lm loss: 3.768674E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.741 | TFLOPs: 26.03 | +7: iteration 48680/ 60336 | consumed samples: 12462080 | consumed tokens: 25522339840 | elapsed time per iteration (s): 0.15 | learning rate: 3.639E-05 | global batch size: 256 | lm loss: 3.761593E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.887 | TFLOPs: 26.00 | +7: iteration 48690/ 60336 | consumed samples: 12464640 | consumed tokens: 25527582720 | elapsed time per iteration (s): 0.15 | learning rate: 3.636E-05 | global batch size: 256 | lm loss: 3.752682E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.041 | TFLOPs: 26.02 | +7: iteration 48700/ 60336 | consumed samples: 12467200 | consumed tokens: 25532825600 | elapsed time per iteration (s): 0.15 | learning rate: 3.633E-05 | global batch size: 256 | lm loss: 3.760423E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.627 | TFLOPs: 26.15 | +7: iteration 48710/ 60336 | consumed samples: 12469760 | consumed tokens: 25538068480 | elapsed time per iteration (s): 0.15 | learning rate: 3.631E-05 | global batch size: 256 | lm loss: 3.762009E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.136 | TFLOPs: 26.14 | +7: iteration 48720/ 60336 | consumed samples: 12472320 | consumed tokens: 25543311360 | elapsed time per iteration (s): 0.15 | learning rate: 3.628E-05 | global batch size: 256 | lm loss: 3.760489E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.513 | TFLOPs: 26.15 | +7: iteration 48730/ 60336 | consumed samples: 12474880 | consumed tokens: 25548554240 | elapsed time per iteration (s): 0.15 | learning rate: 3.625E-05 | global batch size: 256 | lm loss: 3.765287E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.743 | TFLOPs: 26.19 | +7: iteration 48740/ 60336 | consumed samples: 12477440 | consumed tokens: 25553797120 | elapsed time per iteration (s): 0.15 | learning rate: 3.623E-05 | global batch size: 256 | lm loss: 3.758175E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.473 | TFLOPs: 26.15 | +7: iteration 48750/ 60336 | consumed samples: 12480000 | consumed tokens: 25559040000 | elapsed time per iteration (s): 0.15 | learning rate: 3.620E-05 | global batch size: 256 | lm loss: 3.765586E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.858 | TFLOPs: 26.14 | +7: iteration 48760/ 60336 | consumed samples: 12482560 | consumed tokens: 25564282880 | elapsed time per iteration (s): 0.15 | learning rate: 3.617E-05 | global batch size: 256 | lm loss: 3.762675E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.736 | TFLOPs: 26.15 | +7: iteration 48770/ 60336 | consumed samples: 12485120 | consumed tokens: 25569525760 | elapsed time per iteration (s): 0.15 | learning rate: 3.614E-05 | global batch size: 256 | lm loss: 3.756229E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.019 | TFLOPs: 26.16 | +7: iteration 48780/ 60336 | consumed samples: 12487680 | consumed tokens: 25574768640 | elapsed time per iteration (s): 0.15 | learning rate: 3.612E-05 | global batch size: 256 | lm loss: 3.773121E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.701 | TFLOPs: 26.14 | +7: iteration 48790/ 60336 | consumed samples: 12490240 | consumed tokens: 25580011520 | elapsed time per iteration (s): 0.15 | learning rate: 3.609E-05 | global batch size: 256 | lm loss: 3.778099E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.202 | TFLOPs: 26.10 | +7: iteration 48800/ 60336 | consumed samples: 12492800 | consumed tokens: 25585254400 | elapsed time per iteration (s): 0.15 | learning rate: 3.606E-05 | global batch size: 256 | lm loss: 3.763601E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.689 | TFLOPs: 26.08 | +7: iteration 48810/ 60336 | consumed samples: 12495360 | consumed tokens: 25590497280 | elapsed time per iteration (s): 0.15 | learning rate: 3.604E-05 | global batch size: 256 | lm loss: 3.766862E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.179 | TFLOPs: 26.10 | +7: iteration 48820/ 60336 | consumed samples: 12497920 | consumed tokens: 25595740160 | elapsed time per iteration (s): 0.15 | learning rate: 3.601E-05 | global batch size: 256 | lm loss: 3.764540E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.837 | TFLOPs: 26.09 | +7: iteration 48830/ 60336 | consumed samples: 12500480 | consumed tokens: 25600983040 | elapsed time per iteration (s): 0.15 | learning rate: 3.598E-05 | global batch size: 256 | lm loss: 3.760765E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.870 | TFLOPs: 26.08 | +7: iteration 48840/ 60336 | consumed samples: 12503040 | consumed tokens: 25606225920 | elapsed time per iteration (s): 0.15 | learning rate: 3.596E-05 | global batch size: 256 | lm loss: 3.764691E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.282 | TFLOPs: 26.08 | +7: iteration 48850/ 60336 | consumed samples: 12505600 | consumed tokens: 25611468800 | elapsed time per iteration (s): 0.15 | learning rate: 3.593E-05 | global batch size: 256 | lm loss: 3.782489E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.425 | TFLOPs: 26.07 | +7: iteration 48860/ 60336 | consumed samples: 12508160 | consumed tokens: 25616711680 | elapsed time per iteration (s): 0.15 | learning rate: 3.590E-05 | global batch size: 256 | lm loss: 3.745625E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.743 | TFLOPs: 26.09 | +7: iteration 48870/ 60336 | consumed samples: 12510720 | consumed tokens: 25621954560 | elapsed time per iteration (s): 0.15 | learning rate: 3.588E-05 | global batch size: 256 | lm loss: 3.764262E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.949 | TFLOPs: 26.08 | +7: iteration 48880/ 60336 | consumed samples: 12513280 | consumed tokens: 25627197440 | elapsed time per iteration (s): 0.15 | learning rate: 3.585E-05 | global batch size: 256 | lm loss: 3.763079E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.077 | TFLOPs: 26.11 | +7: iteration 48890/ 60336 | consumed samples: 12515840 | consumed tokens: 25632440320 | elapsed time per iteration (s): 0.15 | learning rate: 3.582E-05 | global batch size: 256 | lm loss: 3.754412E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.718 | TFLOPs: 26.09 | +7: iteration 48900/ 60336 | consumed samples: 12518400 | consumed tokens: 25637683200 | elapsed time per iteration (s): 0.15 | learning rate: 3.579E-05 | global batch size: 256 | lm loss: 3.759970E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.429 | TFLOPs: 26.09 | +7: iteration 48910/ 60336 | consumed samples: 12520960 | consumed tokens: 25642926080 | elapsed time per iteration (s): 0.15 | learning rate: 3.577E-05 | global batch size: 256 | lm loss: 3.760134E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.186 | TFLOPs: 26.08 | +7: iteration 48920/ 60336 | consumed samples: 12523520 | consumed tokens: 25648168960 | elapsed time per iteration (s): 0.15 | learning rate: 3.574E-05 | global batch size: 256 | lm loss: 3.762199E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.915 | TFLOPs: 26.08 | +7: iteration 48930/ 60336 | consumed samples: 12526080 | consumed tokens: 25653411840 | elapsed time per iteration (s): 0.15 | learning rate: 3.571E-05 | global batch size: 256 | lm loss: 3.754125E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.180 | TFLOPs: 26.08 | +7: iteration 48940/ 60336 | consumed samples: 12528640 | consumed tokens: 25658654720 | elapsed time per iteration (s): 0.15 | learning rate: 3.569E-05 | global batch size: 256 | lm loss: 3.774203E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.893 | TFLOPs: 26.09 | +7: iteration 48950/ 60336 | consumed samples: 12531200 | consumed tokens: 25663897600 | elapsed time per iteration (s): 0.15 | learning rate: 3.566E-05 | global batch size: 256 | lm loss: 3.759361E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.172 | TFLOPs: 26.08 | +7: iteration 48960/ 60336 | consumed samples: 12533760 | consumed tokens: 25669140480 | elapsed time per iteration (s): 0.15 | learning rate: 3.563E-05 | global batch size: 256 | lm loss: 3.769513E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.014 | TFLOPs: 26.10 | +7: iteration 48970/ 60336 | consumed samples: 12536320 | consumed tokens: 25674383360 | elapsed time per iteration (s): 0.16 | learning rate: 3.561E-05 | global batch size: 256 | lm loss: 3.758464E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.036 | TFLOPs: 25.88 | +7: iteration 48980/ 60336 | consumed samples: 12538880 | consumed tokens: 25679626240 | elapsed time per iteration (s): 0.15 | learning rate: 3.558E-05 | global batch size: 256 | lm loss: 3.763721E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.882 | TFLOPs: 26.09 | +7: iteration 48990/ 60336 | consumed samples: 12541440 | consumed tokens: 25684869120 | elapsed time per iteration (s): 0.15 | learning rate: 3.555E-05 | global batch size: 256 | lm loss: 3.768243E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.978 | TFLOPs: 26.10 | +7: iteration 49000/ 60336 | consumed samples: 12544000 | consumed tokens: 25690112000 | elapsed time per iteration (s): 0.15 | learning rate: 3.553E-05 | global batch size: 256 | lm loss: 3.755444E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.105 | TFLOPs: 26.10 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 49000 | lm loss value: 3.870086E+00 | lm loss PPL: 4.794651E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 49000 to checkpoints_44m32b100m +0: [2023-03-17 02:25:36,867] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step49000 is begin to save! +0: [2023-03-17 02:25:36,871] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:25:36,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:25:36,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:25:36,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:25:36,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:25:36,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:25:36,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:25:36,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:25:36,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:25:36,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:25:36,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:25:36,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:25:36,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:25:36,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:25:36,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:25:36,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:25:36,994] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:25:37,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:25:37,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:25:37,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:25:37,003] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step49000/mp_rank_00_model_states.pt +0: [2023-03-17 02:25:37,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:25:37,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:25:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:25:37,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:25:37,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 02:25:37,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:25:37,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:25:37,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:25:37,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: successfully saved checkpoint at iteration 49000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 181.41 +7: iteration 49010/ 60336 | consumed samples: 12546560 | consumed tokens: 25695354880 | elapsed time per iteration (s): 0.18 | learning rate: 3.550E-05 | global batch size: 256 | lm loss: 3.779755E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1445.205 | TFLOPs: 22.66 | +7: iteration 49020/ 60336 | consumed samples: 12549120 | consumed tokens: 25700597760 | elapsed time per iteration (s): 0.15 | learning rate: 3.547E-05 | global batch size: 256 | lm loss: 3.762094E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.082 | TFLOPs: 26.08 | +7: iteration 49030/ 60336 | consumed samples: 12551680 | consumed tokens: 25705840640 | elapsed time per iteration (s): 0.15 | learning rate: 3.545E-05 | global batch size: 256 | lm loss: 3.750806E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.855 | TFLOPs: 26.08 | +7: iteration 49040/ 60336 | consumed samples: 12554240 | consumed tokens: 25711083520 | elapsed time per iteration (s): 0.15 | learning rate: 3.542E-05 | global batch size: 256 | lm loss: 3.760179E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.086 | TFLOPs: 26.08 | +7: iteration 49050/ 60336 | consumed samples: 12556800 | consumed tokens: 25716326400 | elapsed time per iteration (s): 0.15 | learning rate: 3.540E-05 | global batch size: 256 | lm loss: 3.764220E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.587 | TFLOPs: 26.09 | +7: iteration 49060/ 60336 | consumed samples: 12559360 | consumed tokens: 25721569280 | elapsed time per iteration (s): 0.15 | learning rate: 3.537E-05 | global batch size: 256 | lm loss: 3.746898E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.826 | TFLOPs: 26.11 | +7: iteration 49070/ 60336 | consumed samples: 12561920 | consumed tokens: 25726812160 | elapsed time per iteration (s): 0.15 | learning rate: 3.534E-05 | global batch size: 256 | lm loss: 3.752412E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.302 | TFLOPs: 26.13 | +7: iteration 49080/ 60336 | consumed samples: 12564480 | consumed tokens: 25732055040 | elapsed time per iteration (s): 0.15 | learning rate: 3.532E-05 | global batch size: 256 | lm loss: 3.772031E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.773 | TFLOPs: 26.12 | +7: iteration 49090/ 60336 | consumed samples: 12567040 | consumed tokens: 25737297920 | elapsed time per iteration (s): 0.15 | learning rate: 3.529E-05 | global batch size: 256 | lm loss: 3.771243E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.269 | TFLOPs: 26.10 | +7: iteration 49100/ 60336 | consumed samples: 12569600 | consumed tokens: 25742540800 | elapsed time per iteration (s): 0.15 | learning rate: 3.526E-05 | global batch size: 256 | lm loss: 3.771912E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.180 | TFLOPs: 26.10 | +7: iteration 49110/ 60336 | consumed samples: 12572160 | consumed tokens: 25747783680 | elapsed time per iteration (s): 0.15 | learning rate: 3.524E-05 | global batch size: 256 | lm loss: 3.750417E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.447 | TFLOPs: 26.10 | +7: iteration 49120/ 60336 | consumed samples: 12574720 | consumed tokens: 25753026560 | elapsed time per iteration (s): 0.15 | learning rate: 3.521E-05 | global batch size: 256 | lm loss: 3.740876E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.899 | TFLOPs: 26.11 | +7: iteration 49130/ 60336 | consumed samples: 12577280 | consumed tokens: 25758269440 | elapsed time per iteration (s): 0.15 | learning rate: 3.518E-05 | global batch size: 256 | lm loss: 3.755502E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.815 | TFLOPs: 26.11 | +7: iteration 49140/ 60336 | consumed samples: 12579840 | consumed tokens: 25763512320 | elapsed time per iteration (s): 0.15 | learning rate: 3.516E-05 | global batch size: 256 | lm loss: 3.764289E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.350 | TFLOPs: 26.09 | +7: iteration 49150/ 60336 | consumed samples: 12582400 | consumed tokens: 25768755200 | elapsed time per iteration (s): 0.15 | learning rate: 3.513E-05 | global batch size: 256 | lm loss: 3.768602E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.801 | TFLOPs: 26.09 | +7: iteration 49160/ 60336 | consumed samples: 12584960 | consumed tokens: 25773998080 | elapsed time per iteration (s): 0.15 | learning rate: 3.511E-05 | global batch size: 256 | lm loss: 3.770967E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.361 | TFLOPs: 26.12 | +7: iteration 49170/ 60336 | consumed samples: 12587520 | consumed tokens: 25779240960 | elapsed time per iteration (s): 0.15 | learning rate: 3.508E-05 | global batch size: 256 | lm loss: 3.741661E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.842 | TFLOPs: 26.11 | +7: iteration 49180/ 60336 | consumed samples: 12590080 | consumed tokens: 25784483840 | elapsed time per iteration (s): 0.15 | learning rate: 3.505E-05 | global batch size: 256 | lm loss: 3.764043E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.286 | TFLOPs: 26.12 | +7: iteration 49190/ 60336 | consumed samples: 12592640 | consumed tokens: 25789726720 | elapsed time per iteration (s): 0.15 | learning rate: 3.503E-05 | global batch size: 256 | lm loss: 3.760823E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.322 | TFLOPs: 26.12 | +7: iteration 49200/ 60336 | consumed samples: 12595200 | consumed tokens: 25794969600 | elapsed time per iteration (s): 0.15 | learning rate: 3.500E-05 | global batch size: 256 | lm loss: 3.761559E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.960 | TFLOPs: 26.13 | +7: iteration 49210/ 60336 | consumed samples: 12597760 | consumed tokens: 25800212480 | elapsed time per iteration (s): 0.15 | learning rate: 3.497E-05 | global batch size: 256 | lm loss: 3.761866E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.251 | TFLOPs: 26.10 | +7: iteration 49220/ 60336 | consumed samples: 12600320 | consumed tokens: 25805455360 | elapsed time per iteration (s): 0.15 | learning rate: 3.495E-05 | global batch size: 256 | lm loss: 3.754790E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.586 | TFLOPs: 26.10 | +7: iteration 49230/ 60336 | consumed samples: 12602880 | consumed tokens: 25810698240 | elapsed time per iteration (s): 0.15 | learning rate: 3.492E-05 | global batch size: 256 | lm loss: 3.762772E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.552 | TFLOPs: 26.14 | +7: iteration 49240/ 60336 | consumed samples: 12605440 | consumed tokens: 25815941120 | elapsed time per iteration (s): 0.15 | learning rate: 3.490E-05 | global batch size: 256 | lm loss: 3.763454E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.649 | TFLOPs: 26.14 | +7: iteration 49250/ 60336 | consumed samples: 12608000 | consumed tokens: 25821184000 | elapsed time per iteration (s): 0.15 | learning rate: 3.487E-05 | global batch size: 256 | lm loss: 3.747886E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.006 | TFLOPs: 25.91 | +7: iteration 49260/ 60336 | consumed samples: 12610560 | consumed tokens: 25826426880 | elapsed time per iteration (s): 0.15 | learning rate: 3.484E-05 | global batch size: 256 | lm loss: 3.766311E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.978 | TFLOPs: 26.14 | +7: iteration 49270/ 60336 | consumed samples: 12613120 | consumed tokens: 25831669760 | elapsed time per iteration (s): 0.15 | learning rate: 3.482E-05 | global batch size: 256 | lm loss: 3.757685E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.692 | TFLOPs: 26.12 | +7: iteration 49280/ 60336 | consumed samples: 12615680 | consumed tokens: 25836912640 | elapsed time per iteration (s): 0.15 | learning rate: 3.479E-05 | global batch size: 256 | lm loss: 3.761958E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.560 | TFLOPs: 26.12 | +7: iteration 49290/ 60336 | consumed samples: 12618240 | consumed tokens: 25842155520 | elapsed time per iteration (s): 0.15 | learning rate: 3.477E-05 | global batch size: 256 | lm loss: 3.762371E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.896 | TFLOPs: 26.13 | +7: iteration 49300/ 60336 | consumed samples: 12620800 | consumed tokens: 25847398400 | elapsed time per iteration (s): 0.15 | learning rate: 3.474E-05 | global batch size: 256 | lm loss: 3.759721E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.547 | TFLOPs: 26.12 | +7: iteration 49310/ 60336 | consumed samples: 12623360 | consumed tokens: 25852641280 | elapsed time per iteration (s): 0.15 | learning rate: 3.471E-05 | global batch size: 256 | lm loss: 3.762769E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.341 | TFLOPs: 26.10 | +7: iteration 49320/ 60336 | consumed samples: 12625920 | consumed tokens: 25857884160 | elapsed time per iteration (s): 0.15 | learning rate: 3.469E-05 | global batch size: 256 | lm loss: 3.751431E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.057 | TFLOPs: 26.13 | +7: iteration 49330/ 60336 | consumed samples: 12628480 | consumed tokens: 25863127040 | elapsed time per iteration (s): 0.15 | learning rate: 3.466E-05 | global batch size: 256 | lm loss: 3.748057E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.350 | TFLOPs: 26.13 | +7: iteration 49340/ 60336 | consumed samples: 12631040 | consumed tokens: 25868369920 | elapsed time per iteration (s): 0.15 | learning rate: 3.464E-05 | global batch size: 256 | lm loss: 3.755603E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.851 | TFLOPs: 26.12 | +7: iteration 49350/ 60336 | consumed samples: 12633600 | consumed tokens: 25873612800 | elapsed time per iteration (s): 0.15 | learning rate: 3.461E-05 | global batch size: 256 | lm loss: 3.754279E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.460 | TFLOPs: 26.15 | +7: iteration 49360/ 60336 | consumed samples: 12636160 | consumed tokens: 25878855680 | elapsed time per iteration (s): 0.15 | learning rate: 3.458E-05 | global batch size: 256 | lm loss: 3.768092E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.082 | TFLOPs: 26.13 | +7: iteration 49370/ 60336 | consumed samples: 12638720 | consumed tokens: 25884098560 | elapsed time per iteration (s): 0.15 | learning rate: 3.456E-05 | global batch size: 256 | lm loss: 3.762553E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.238 | TFLOPs: 26.15 | +7: iteration 49380/ 60336 | consumed samples: 12641280 | consumed tokens: 25889341440 | elapsed time per iteration (s): 0.15 | learning rate: 3.453E-05 | global batch size: 256 | lm loss: 3.755591E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.449 | TFLOPs: 26.15 | +7: iteration 49390/ 60336 | consumed samples: 12643840 | consumed tokens: 25894584320 | elapsed time per iteration (s): 0.15 | learning rate: 3.451E-05 | global batch size: 256 | lm loss: 3.764000E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.418 | TFLOPs: 26.15 | +7: iteration 49400/ 60336 | consumed samples: 12646400 | consumed tokens: 25899827200 | elapsed time per iteration (s): 0.15 | learning rate: 3.448E-05 | global batch size: 256 | lm loss: 3.773479E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.332 | TFLOPs: 26.15 | +7: iteration 49410/ 60336 | consumed samples: 12648960 | consumed tokens: 25905070080 | elapsed time per iteration (s): 0.15 | learning rate: 3.446E-05 | global batch size: 256 | lm loss: 3.762476E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.888 | TFLOPs: 26.14 | +7: iteration 49420/ 60336 | consumed samples: 12651520 | consumed tokens: 25910312960 | elapsed time per iteration (s): 0.15 | learning rate: 3.443E-05 | global batch size: 256 | lm loss: 3.779110E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.393 | TFLOPs: 26.15 | +7: iteration 49430/ 60336 | consumed samples: 12654080 | consumed tokens: 25915555840 | elapsed time per iteration (s): 0.15 | learning rate: 3.440E-05 | global batch size: 256 | lm loss: 3.749779E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.761 | TFLOPs: 26.09 | +7: iteration 49440/ 60336 | consumed samples: 12656640 | consumed tokens: 25920798720 | elapsed time per iteration (s): 0.15 | learning rate: 3.438E-05 | global batch size: 256 | lm loss: 3.749846E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.399 | TFLOPs: 26.15 | +7: iteration 49450/ 60336 | consumed samples: 12659200 | consumed tokens: 25926041600 | elapsed time per iteration (s): 0.15 | learning rate: 3.435E-05 | global batch size: 256 | lm loss: 3.762535E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.069 | TFLOPs: 26.03 | +7: iteration 49460/ 60336 | consumed samples: 12661760 | consumed tokens: 25931284480 | elapsed time per iteration (s): 0.15 | learning rate: 3.433E-05 | global batch size: 256 | lm loss: 3.760639E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.409 | TFLOPs: 26.13 | +7: iteration 49470/ 60336 | consumed samples: 12664320 | consumed tokens: 25936527360 | elapsed time per iteration (s): 0.15 | learning rate: 3.430E-05 | global batch size: 256 | lm loss: 3.763009E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.160 | TFLOPs: 26.13 | +7: iteration 49480/ 60336 | consumed samples: 12666880 | consumed tokens: 25941770240 | elapsed time per iteration (s): 0.15 | learning rate: 3.428E-05 | global batch size: 256 | lm loss: 3.755741E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.377 | TFLOPs: 26.13 | +7: iteration 49490/ 60336 | consumed samples: 12669440 | consumed tokens: 25947013120 | elapsed time per iteration (s): 0.15 | learning rate: 3.425E-05 | global batch size: 256 | lm loss: 3.756446E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 49500/ 60336 | consumed samples: 12672000 | consumed tokens: 25952256000 | elapsed time per iteration (s): 0.15 | learning rate: 3.422E-05 | global batch size: 256 | lm loss: 3.742943E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.921 | TFLOPs: 26.11 | +7: iteration 49510/ 60336 | consumed samples: 12674560 | consumed tokens: 25957498880 | elapsed time per iteration (s): 0.15 | learning rate: 3.420E-05 | global batch size: 256 | lm loss: 3.765331E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.269 | TFLOPs: 26.19 | +7: iteration 49520/ 60336 | consumed samples: 12677120 | consumed tokens: 25962741760 | elapsed time per iteration (s): 0.15 | learning rate: 3.417E-05 | global batch size: 256 | lm loss: 3.749773E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.186 | TFLOPs: 26.19 | +7: iteration 49530/ 60336 | consumed samples: 12679680 | consumed tokens: 25967984640 | elapsed time per iteration (s): 0.15 | learning rate: 3.415E-05 | global batch size: 256 | lm loss: 3.766252E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.465 | TFLOPs: 26.18 | +7: iteration 49540/ 60336 | consumed samples: 12682240 | consumed tokens: 25973227520 | elapsed time per iteration (s): 0.16 | learning rate: 3.412E-05 | global batch size: 256 | lm loss: 3.770352E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.635 | TFLOPs: 25.87 | +7: iteration 49550/ 60336 | consumed samples: 12684800 | consumed tokens: 25978470400 | elapsed time per iteration (s): 0.15 | learning rate: 3.410E-05 | global batch size: 256 | lm loss: 3.757243E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.218 | TFLOPs: 26.19 | +7: iteration 49560/ 60336 | consumed samples: 12687360 | consumed tokens: 25983713280 | elapsed time per iteration (s): 0.15 | learning rate: 3.407E-05 | global batch size: 256 | lm loss: 3.757191E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.638 | TFLOPs: 26.14 | +7: iteration 49570/ 60336 | consumed samples: 12689920 | consumed tokens: 25988956160 | elapsed time per iteration (s): 0.15 | learning rate: 3.405E-05 | global batch size: 256 | lm loss: 3.751101E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.567 | TFLOPs: 26.20 | +7: iteration 49580/ 60336 | consumed samples: 12692480 | consumed tokens: 25994199040 | elapsed time per iteration (s): 0.16 | learning rate: 3.402E-05 | global batch size: 256 | lm loss: 3.745619E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.981 | TFLOPs: 25.59 | +7: iteration 49590/ 60336 | consumed samples: 12695040 | consumed tokens: 25999441920 | elapsed time per iteration (s): 0.15 | learning rate: 3.400E-05 | global batch size: 256 | lm loss: 3.754827E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.066 | TFLOPs: 26.18 | +7: iteration 49600/ 60336 | consumed samples: 12697600 | consumed tokens: 26004684800 | elapsed time per iteration (s): 0.16 | learning rate: 3.397E-05 | global batch size: 256 | lm loss: 3.753654E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.146 | TFLOPs: 25.85 | +7: iteration 49610/ 60336 | consumed samples: 12700160 | consumed tokens: 26009927680 | elapsed time per iteration (s): 0.15 | learning rate: 3.395E-05 | global batch size: 256 | lm loss: 3.754746E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.324 | TFLOPs: 26.16 | +7: iteration 49620/ 60336 | consumed samples: 12702720 | consumed tokens: 26015170560 | elapsed time per iteration (s): 0.15 | learning rate: 3.392E-05 | global batch size: 256 | lm loss: 3.762570E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.977 | TFLOPs: 26.17 | +7: iteration 49630/ 60336 | consumed samples: 12705280 | consumed tokens: 26020413440 | elapsed time per iteration (s): 0.15 | learning rate: 3.389E-05 | global batch size: 256 | lm loss: 3.756118E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.157 | TFLOPs: 26.18 | +7: iteration 49640/ 60336 | consumed samples: 12707840 | consumed tokens: 26025656320 | elapsed time per iteration (s): 0.15 | learning rate: 3.387E-05 | global batch size: 256 | lm loss: 3.754851E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.630 | TFLOPs: 26.18 | +7: iteration 49650/ 60336 | consumed samples: 12710400 | consumed tokens: 26030899200 | elapsed time per iteration (s): 0.15 | learning rate: 3.384E-05 | global batch size: 256 | lm loss: 3.756503E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.930 | TFLOPs: 25.95 | +7: iteration 49660/ 60336 | consumed samples: 12712960 | consumed tokens: 26036142080 | elapsed time per iteration (s): 0.15 | learning rate: 3.382E-05 | global batch size: 256 | lm loss: 3.759698E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.366 | TFLOPs: 26.05 | +7: iteration 49670/ 60336 | consumed samples: 12715520 | consumed tokens: 26041384960 | elapsed time per iteration (s): 0.15 | learning rate: 3.379E-05 | global batch size: 256 | lm loss: 3.762802E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.800 | TFLOPs: 25.97 | +7: iteration 49680/ 60336 | consumed samples: 12718080 | consumed tokens: 26046627840 | elapsed time per iteration (s): 0.16 | learning rate: 3.377E-05 | global batch size: 256 | lm loss: 3.757314E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.590 | TFLOPs: 25.71 | +7: iteration 49690/ 60336 | consumed samples: 12720640 | consumed tokens: 26051870720 | elapsed time per iteration (s): 0.15 | learning rate: 3.374E-05 | global batch size: 256 | lm loss: 3.747348E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.901 | TFLOPs: 26.19 | +7: iteration 49700/ 60336 | consumed samples: 12723200 | consumed tokens: 26057113600 | elapsed time per iteration (s): 0.15 | learning rate: 3.372E-05 | global batch size: 256 | lm loss: 3.762426E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.067 | TFLOPs: 26.14 | +7: iteration 49710/ 60336 | consumed samples: 12725760 | consumed tokens: 26062356480 | elapsed time per iteration (s): 0.15 | learning rate: 3.369E-05 | global batch size: 256 | lm loss: 3.741993E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.302 | TFLOPs: 25.99 | +7: iteration 49720/ 60336 | consumed samples: 12728320 | consumed tokens: 26067599360 | elapsed time per iteration (s): 0.16 | learning rate: 3.367E-05 | global batch size: 256 | lm loss: 3.754473E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.417 | TFLOPs: 25.87 | +7: iteration 49730/ 60336 | consumed samples: 12730880 | consumed tokens: 26072842240 | elapsed time per iteration (s): 0.17 | learning rate: 3.364E-05 | global batch size: 256 | lm loss: 3.744794E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.209 | TFLOPs: 23.15 | +7: iteration 49740/ 60336 | consumed samples: 12733440 | consumed tokens: 26078085120 | elapsed time per iteration (s): 0.16 | learning rate: 3.362E-05 | global batch size: 256 | lm loss: 3.758650E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.431 | TFLOPs: 25.41 | +7: iteration 49750/ 60336 | consumed samples: 12736000 | consumed tokens: 26083328000 | elapsed time per iteration (s): 0.15 | learning rate: 3.359E-05 | global batch size: 256 | lm loss: 3.758533E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.936 | TFLOPs: 25.97 | +7: iteration 49760/ 60336 | consumed samples: 12738560 | consumed tokens: 26088570880 | elapsed time per iteration (s): 0.15 | learning rate: 3.357E-05 | global batch size: 256 | lm loss: 3.755331E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.088 | TFLOPs: 25.97 | +7: iteration 49770/ 60336 | consumed samples: 12741120 | consumed tokens: 26093813760 | elapsed time per iteration (s): 0.15 | learning rate: 3.354E-05 | global batch size: 256 | lm loss: 3.743220E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.101 | TFLOPs: 26.10 | +7: iteration 49780/ 60336 | consumed samples: 12743680 | consumed tokens: 26099056640 | elapsed time per iteration (s): 0.17 | learning rate: 3.352E-05 | global batch size: 256 | lm loss: 3.759066E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1493.220 | TFLOPs: 23.42 | +7: iteration 49790/ 60336 | consumed samples: 12746240 | consumed tokens: 26104299520 | elapsed time per iteration (s): 0.17 | learning rate: 3.349E-05 | global batch size: 256 | lm loss: 3.759663E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.433 | TFLOPs: 23.67 | +7: iteration 49800/ 60336 | consumed samples: 12748800 | consumed tokens: 26109542400 | elapsed time per iteration (s): 0.15 | learning rate: 3.347E-05 | global batch size: 256 | lm loss: 3.757294E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.404 | TFLOPs: 26.21 | +7: iteration 49810/ 60336 | consumed samples: 12751360 | consumed tokens: 26114785280 | elapsed time per iteration (s): 0.16 | learning rate: 3.344E-05 | global batch size: 256 | lm loss: 3.758436E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.772 | TFLOPs: 25.07 | +7: iteration 49820/ 60336 | consumed samples: 12753920 | consumed tokens: 26120028160 | elapsed time per iteration (s): 0.16 | learning rate: 3.342E-05 | global batch size: 256 | lm loss: 3.765490E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.757 | TFLOPs: 25.04 | +7: iteration 49830/ 60336 | consumed samples: 12756480 | consumed tokens: 26125271040 | elapsed time per iteration (s): 0.16 | learning rate: 3.339E-05 | global batch size: 256 | lm loss: 3.767366E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.189 | TFLOPs: 25.50 | +7: iteration 49840/ 60336 | consumed samples: 12759040 | consumed tokens: 26130513920 | elapsed time per iteration (s): 0.16 | learning rate: 3.337E-05 | global batch size: 256 | lm loss: 3.765928E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.065 | TFLOPs: 25.66 | +7: iteration 49850/ 60336 | consumed samples: 12761600 | consumed tokens: 26135756800 | elapsed time per iteration (s): 0.17 | learning rate: 3.334E-05 | global batch size: 256 | lm loss: 3.758366E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.714 | TFLOPs: 23.82 | +7: iteration 49860/ 60336 | consumed samples: 12764160 | consumed tokens: 26140999680 | elapsed time per iteration (s): 0.18 | learning rate: 3.332E-05 | global batch size: 256 | lm loss: 3.757225E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.081 | TFLOPs: 22.62 | +7: iteration 49870/ 60336 | consumed samples: 12766720 | consumed tokens: 26146242560 | elapsed time per iteration (s): 0.15 | learning rate: 3.329E-05 | global batch size: 256 | lm loss: 3.768265E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.864 | TFLOPs: 26.12 | +7: iteration 49880/ 60336 | consumed samples: 12769280 | consumed tokens: 26151485440 | elapsed time per iteration (s): 0.16 | learning rate: 3.327E-05 | global batch size: 256 | lm loss: 3.748039E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.599 | TFLOPs: 25.63 | +7: iteration 49890/ 60336 | consumed samples: 12771840 | consumed tokens: 26156728320 | elapsed time per iteration (s): 0.15 | learning rate: 3.324E-05 | global batch size: 256 | lm loss: 3.747490E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.157 | TFLOPs: 25.93 | +7: iteration 49900/ 60336 | consumed samples: 12774400 | consumed tokens: 26161971200 | elapsed time per iteration (s): 0.16 | learning rate: 3.322E-05 | global batch size: 256 | lm loss: 3.766055E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.637 | TFLOPs: 25.37 | +7: iteration 49910/ 60336 | consumed samples: 12776960 | consumed tokens: 26167214080 | elapsed time per iteration (s): 0.17 | learning rate: 3.320E-05 | global batch size: 256 | lm loss: 3.762977E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.347 | TFLOPs: 23.69 | +7: iteration 49920/ 60336 | consumed samples: 12779520 | consumed tokens: 26172456960 | elapsed time per iteration (s): 0.17 | learning rate: 3.317E-05 | global batch size: 256 | lm loss: 3.765379E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1491.890 | TFLOPs: 23.40 | +7: iteration 49930/ 60336 | consumed samples: 12782080 | consumed tokens: 26177699840 | elapsed time per iteration (s): 0.16 | learning rate: 3.315E-05 | global batch size: 256 | lm loss: 3.761114E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.523 | TFLOPs: 25.10 | +7: iteration 49940/ 60336 | consumed samples: 12784640 | consumed tokens: 26182942720 | elapsed time per iteration (s): 0.17 | learning rate: 3.312E-05 | global batch size: 256 | lm loss: 3.768935E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1491.573 | TFLOPs: 23.39 | +7: iteration 49950/ 60336 | consumed samples: 12787200 | consumed tokens: 26188185600 | elapsed time per iteration (s): 0.15 | learning rate: 3.310E-05 | global batch size: 256 | lm loss: 3.754096E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.259 | TFLOPs: 26.12 | +7: iteration 49960/ 60336 | consumed samples: 12789760 | consumed tokens: 26193428480 | elapsed time per iteration (s): 0.15 | learning rate: 3.307E-05 | global batch size: 256 | lm loss: 3.765406E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.041 | TFLOPs: 26.19 | +7: iteration 49970/ 60336 | consumed samples: 12792320 | consumed tokens: 26198671360 | elapsed time per iteration (s): 0.16 | learning rate: 3.305E-05 | global batch size: 256 | lm loss: 3.760220E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.222 | TFLOPs: 24.86 | +7: iteration 49980/ 60336 | consumed samples: 12794880 | consumed tokens: 26203914240 | elapsed time per iteration (s): 0.16 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 3.757935E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.179 | TFLOPs: 25.66 | +7: iteration 49990/ 60336 | consumed samples: 12797440 | consumed tokens: 26209157120 | elapsed time per iteration (s): 0.16 | learning rate: 3.300E-05 | global batch size: 256 | lm loss: 3.758485E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.641 | TFLOPs: 25.67 | +0: [2023-03-17 02:28:12,883] [INFO] [logging.py:68:log_dist] [Rank 0] step=50000, skipped=0, lr=[3.297410209986627e-05, 3.297410209986627e-05, 3.297410209986627e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 50000/ 60336 | consumed samples: 12800000 | consumed tokens: 26214400000 | elapsed time per iteration (s): 0.16 | learning rate: 3.297E-05 | global batch size: 256 | lm loss: 3.774447E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.867 | TFLOPs: 25.18 | +0: steps: 50000 loss: 3.7407 iter time (s): 0.154 samples/sec: 1660.732 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 50000 | lm loss value: 3.879568E+00 | lm loss PPL: 4.840328E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 50000 to checkpoints_44m32b100m +0: [2023-03-17 02:28:12,961] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step50000 is begin to save! +0: [2023-03-17 02:28:12,965] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:28:13,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:28:13,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:28:13,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:28:13,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:28:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:28:13,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:28:13,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:28:13,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:28:13,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:28:13,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:28:13,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:28:13,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:28:13,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:28:13,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:28:13,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:28:13,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:28:13,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:28:13,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:28:13,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:28:13,104] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step50000/mp_rank_00_model_states.pt +0: [2023-03-17 02:28:13,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:28:13,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:28:13,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:28:13,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 02:28:13,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:28:13,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:28:13,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:28:13,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:28:13,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:28:13,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:28:13,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:28:13,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:28:13,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:28:13,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:28:13,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:28:13,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: successfully saved checkpoint at iteration 50000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 197.15 +7: iteration 50010/ 60336 | consumed samples: 12802560 | consumed tokens: 26219642880 | elapsed time per iteration (s): 0.18 | learning rate: 3.295E-05 | global batch size: 256 | lm loss: 3.759191E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1423.131 | TFLOPs: 22.32 | +7: iteration 50020/ 60336 | consumed samples: 12805120 | consumed tokens: 26224885760 | elapsed time per iteration (s): 0.16 | learning rate: 3.293E-05 | global batch size: 256 | lm loss: 3.769363E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.209 | TFLOPs: 25.53 | +7: iteration 50030/ 60336 | consumed samples: 12807680 | consumed tokens: 26230128640 | elapsed time per iteration (s): 0.15 | learning rate: 3.290E-05 | global batch size: 256 | lm loss: 3.762001E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.656 | TFLOPs: 26.17 | +7: iteration 50040/ 60336 | consumed samples: 12810240 | consumed tokens: 26235371520 | elapsed time per iteration (s): 0.16 | learning rate: 3.288E-05 | global batch size: 256 | lm loss: 3.756979E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.476 | TFLOPs: 25.79 | +7: iteration 50050/ 60336 | consumed samples: 12812800 | consumed tokens: 26240614400 | elapsed time per iteration (s): 0.16 | learning rate: 3.285E-05 | global batch size: 256 | lm loss: 3.768890E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.581 | TFLOPs: 25.89 | +7: iteration 50060/ 60336 | consumed samples: 12815360 | consumed tokens: 26245857280 | elapsed time per iteration (s): 0.16 | learning rate: 3.283E-05 | global batch size: 256 | lm loss: 3.761957E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.222 | TFLOPs: 25.60 | +7: iteration 50070/ 60336 | consumed samples: 12817920 | consumed tokens: 26251100160 | elapsed time per iteration (s): 0.17 | learning rate: 3.280E-05 | global batch size: 256 | lm loss: 3.755387E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.263 | TFLOPs: 24.00 | +7: iteration 50080/ 60336 | consumed samples: 12820480 | consumed tokens: 26256343040 | elapsed time per iteration (s): 0.16 | learning rate: 3.278E-05 | global batch size: 256 | lm loss: 3.759563E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.627 | TFLOPs: 25.26 | +7: iteration 50090/ 60336 | consumed samples: 12823040 | consumed tokens: 26261585920 | elapsed time per iteration (s): 0.16 | learning rate: 3.275E-05 | global batch size: 256 | lm loss: 3.752925E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.405 | TFLOPs: 24.75 | +7: iteration 50100/ 60336 | consumed samples: 12825600 | consumed tokens: 26266828800 | elapsed time per iteration (s): 0.16 | learning rate: 3.273E-05 | global batch size: 256 | lm loss: 3.759404E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.995 | TFLOPs: 25.61 | +7: iteration 50110/ 60336 | consumed samples: 12828160 | consumed tokens: 26272071680 | elapsed time per iteration (s): 0.16 | learning rate: 3.271E-05 | global batch size: 256 | lm loss: 3.762420E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.109 | TFLOPs: 25.82 | +7: iteration 50120/ 60336 | consumed samples: 12830720 | consumed tokens: 26277314560 | elapsed time per iteration (s): 0.16 | learning rate: 3.268E-05 | global batch size: 256 | lm loss: 3.769541E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.940 | TFLOPs: 25.28 | +7: iteration 50130/ 60336 | consumed samples: 12833280 | consumed tokens: 26282557440 | elapsed time per iteration (s): 0.17 | learning rate: 3.266E-05 | global batch size: 256 | lm loss: 3.762426E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.304 | TFLOPs: 23.83 | +7: iteration 50140/ 60336 | consumed samples: 12835840 | consumed tokens: 26287800320 | elapsed time per iteration (s): 0.16 | learning rate: 3.263E-05 | global batch size: 256 | lm loss: 3.760887E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.259 | TFLOPs: 24.34 | +7: iteration 50150/ 60336 | consumed samples: 12838400 | consumed tokens: 26293043200 | elapsed time per iteration (s): 0.16 | learning rate: 3.261E-05 | global batch size: 256 | lm loss: 3.760302E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.529 | TFLOPs: 25.49 | +7: iteration 50160/ 60336 | consumed samples: 12840960 | consumed tokens: 26298286080 | elapsed time per iteration (s): 0.16 | learning rate: 3.259E-05 | global batch size: 256 | lm loss: 3.760205E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.801 | TFLOPs: 24.95 | +7: iteration 50170/ 60336 | consumed samples: 12843520 | consumed tokens: 26303528960 | elapsed time per iteration (s): 0.16 | learning rate: 3.256E-05 | global batch size: 256 | lm loss: 3.764865E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.339 | TFLOPs: 25.24 | +7: iteration 50180/ 60336 | consumed samples: 12846080 | consumed tokens: 26308771840 | elapsed time per iteration (s): 0.15 | learning rate: 3.254E-05 | global batch size: 256 | lm loss: 3.770487E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.773 | TFLOPs: 26.06 | +7: iteration 50190/ 60336 | consumed samples: 12848640 | consumed tokens: 26314014720 | elapsed time per iteration (s): 0.16 | learning rate: 3.251E-05 | global batch size: 256 | lm loss: 3.767695E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.497 | TFLOPs: 25.44 | +7: iteration 50200/ 60336 | consumed samples: 12851200 | consumed tokens: 26319257600 | elapsed time per iteration (s): 0.17 | learning rate: 3.249E-05 | global batch size: 256 | lm loss: 3.755289E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.853 | TFLOPs: 24.29 | +7: iteration 50210/ 60336 | consumed samples: 12853760 | consumed tokens: 26324500480 | elapsed time per iteration (s): 0.16 | learning rate: 3.246E-05 | global batch size: 256 | lm loss: 3.761322E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.610 | TFLOPs: 25.68 | +7: iteration 50220/ 60336 | consumed samples: 12856320 | consumed tokens: 26329743360 | elapsed time per iteration (s): 0.17 | learning rate: 3.244E-05 | global batch size: 256 | lm loss: 3.753791E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.633 | TFLOPs: 24.19 | +7: iteration 50230/ 60336 | consumed samples: 12858880 | consumed tokens: 26334986240 | elapsed time per iteration (s): 0.18 | learning rate: 3.242E-05 | global batch size: 256 | lm loss: 3.744418E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.448 | TFLOPs: 22.40 | +7: iteration 50240/ 60336 | consumed samples: 12861440 | consumed tokens: 26340229120 | elapsed time per iteration (s): 0.16 | learning rate: 3.239E-05 | global batch size: 256 | lm loss: 3.745923E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.876 | TFLOPs: 25.45 | +7: iteration 50250/ 60336 | consumed samples: 12864000 | consumed tokens: 26345472000 | elapsed time per iteration (s): 0.16 | learning rate: 3.237E-05 | global batch size: 256 | lm loss: 3.769120E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.588 | TFLOPs: 24.80 | +7: iteration 50260/ 60336 | consumed samples: 12866560 | consumed tokens: 26350714880 | elapsed time per iteration (s): 0.17 | learning rate: 3.234E-05 | global batch size: 256 | lm loss: 3.752246E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.185 | TFLOPs: 23.71 | +7: iteration 50270/ 60336 | consumed samples: 12869120 | consumed tokens: 26355957760 | elapsed time per iteration (s): 0.16 | learning rate: 3.232E-05 | global batch size: 256 | lm loss: 3.751434E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.996 | TFLOPs: 24.37 | +7: iteration 50280/ 60336 | consumed samples: 12871680 | consumed tokens: 26361200640 | elapsed time per iteration (s): 0.16 | learning rate: 3.230E-05 | global batch size: 256 | lm loss: 3.760138E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.984 | TFLOPs: 24.42 | +7: iteration 50290/ 60336 | consumed samples: 12874240 | consumed tokens: 26366443520 | elapsed time per iteration (s): 0.16 | learning rate: 3.227E-05 | global batch size: 256 | lm loss: 3.760894E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.342 | TFLOPs: 25.11 | +7: iteration 50300/ 60336 | consumed samples: 12876800 | consumed tokens: 26371686400 | elapsed time per iteration (s): 0.17 | learning rate: 3.225E-05 | global batch size: 256 | lm loss: 3.759863E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.805 | TFLOPs: 23.47 | +7: iteration 50310/ 60336 | consumed samples: 12879360 | consumed tokens: 26376929280 | elapsed time per iteration (s): 0.17 | learning rate: 3.223E-05 | global batch size: 256 | lm loss: 3.757428E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.301 | TFLOPs: 24.22 | +7: iteration 50320/ 60336 | consumed samples: 12881920 | consumed tokens: 26382172160 | elapsed time per iteration (s): 0.17 | learning rate: 3.220E-05 | global batch size: 256 | lm loss: 3.755573E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.629 | TFLOPs: 23.44 | +7: iteration 50330/ 60336 | consumed samples: 12884480 | consumed tokens: 26387415040 | elapsed time per iteration (s): 0.16 | learning rate: 3.218E-05 | global batch size: 256 | lm loss: 3.770383E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.231 | TFLOPs: 25.19 | +7: iteration 50340/ 60336 | consumed samples: 12887040 | consumed tokens: 26392657920 | elapsed time per iteration (s): 0.16 | learning rate: 3.215E-05 | global batch size: 256 | lm loss: 3.760609E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.689 | TFLOPs: 24.70 | +7: iteration 50350/ 60336 | consumed samples: 12889600 | consumed tokens: 26397900800 | elapsed time per iteration (s): 0.17 | learning rate: 3.213E-05 | global batch size: 256 | lm loss: 3.762595E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.616 | TFLOPs: 24.07 | +7: iteration 50360/ 60336 | consumed samples: 12892160 | consumed tokens: 26403143680 | elapsed time per iteration (s): 0.16 | learning rate: 3.211E-05 | global batch size: 256 | lm loss: 3.779461E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.738 | TFLOPs: 25.06 | +7: iteration 50370/ 60336 | consumed samples: 12894720 | consumed tokens: 26408386560 | elapsed time per iteration (s): 0.16 | learning rate: 3.208E-05 | global batch size: 256 | lm loss: 3.760709E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.362 | TFLOPs: 24.56 | +7: iteration 50380/ 60336 | consumed samples: 12897280 | consumed tokens: 26413629440 | elapsed time per iteration (s): 0.16 | learning rate: 3.206E-05 | global batch size: 256 | lm loss: 3.771294E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.693 | TFLOPs: 24.43 | +7: iteration 50390/ 60336 | consumed samples: 12899840 | consumed tokens: 26418872320 | elapsed time per iteration (s): 0.17 | learning rate: 3.204E-05 | global batch size: 256 | lm loss: 3.741178E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.215 | TFLOPs: 23.65 | +7: iteration 50400/ 60336 | consumed samples: 12902400 | consumed tokens: 26424115200 | elapsed time per iteration (s): 0.16 | learning rate: 3.201E-05 | global batch size: 256 | lm loss: 3.770129E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.642 | TFLOPs: 25.01 | +7: iteration 50410/ 60336 | consumed samples: 12904960 | consumed tokens: 26429358080 | elapsed time per iteration (s): 0.17 | learning rate: 3.199E-05 | global batch size: 256 | lm loss: 3.762351E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.484 | TFLOPs: 23.95 | +7: iteration 50420/ 60336 | consumed samples: 12907520 | consumed tokens: 26434600960 | elapsed time per iteration (s): 0.17 | learning rate: 3.196E-05 | global batch size: 256 | lm loss: 3.758374E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.380 | TFLOPs: 24.30 | +7: iteration 50430/ 60336 | consumed samples: 12910080 | consumed tokens: 26439843840 | elapsed time per iteration (s): 0.16 | learning rate: 3.194E-05 | global batch size: 256 | lm loss: 3.753624E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.325 | TFLOPs: 24.91 | +7: iteration 50440/ 60336 | consumed samples: 12912640 | consumed tokens: 26445086720 | elapsed time per iteration (s): 0.16 | learning rate: 3.192E-05 | global batch size: 256 | lm loss: 3.764301E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.069 | TFLOPs: 24.51 | +7: iteration 50450/ 60336 | consumed samples: 12915200 | consumed tokens: 26450329600 | elapsed time per iteration (s): 0.16 | learning rate: 3.189E-05 | global batch size: 256 | lm loss: 3.751429E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.832 | TFLOPs: 25.06 | +7: iteration 50460/ 60336 | consumed samples: 12917760 | consumed tokens: 26455572480 | elapsed time per iteration (s): 0.16 | learning rate: 3.187E-05 | global batch size: 256 | lm loss: 3.760009E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.485 | TFLOPs: 25.32 | +7: iteration 50470/ 60336 | consumed samples: 12920320 | consumed tokens: 26460815360 | elapsed time per iteration (s): 0.16 | learning rate: 3.185E-05 | global batch size: 256 | lm loss: 3.755697E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.945 | TFLOPs: 25.28 | +7: iteration 50480/ 60336 | consumed samples: 12922880 | consumed tokens: 26466058240 | elapsed time per iteration (s): 0.16 | learning rate: 3.182E-05 | global batch size: 256 | lm loss: 3.755090E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.187 | TFLOPs: 24.39 | +7: iteration 50490/ 60336 | consumed samples: 12925440 | consumed tokens: 26471301120 | elapsed time per iteration (s): 0.16 | learning rate: 3.180E-05 | global batch size: 256 | lm loss: 3.760896E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.841 | TFLOPs: 25.09 | +7: iteration 50500/ 60336 | consumed samples: 12928000 | consumed tokens: 26476544000 | elapsed time per iteration (s): 0.16 | learning rate: 3.178E-05 | global batch size: 256 | lm loss: 3.760955E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.559 | TFLOPs: 24.47 | +7: iteration 50510/ 60336 | consumed samples: 12930560 | consumed tokens: 26481786880 | elapsed time per iteration (s): 0.16 | learning rate: 3.175E-05 | global batch size: 256 | lm loss: 3.770460E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.818 | TFLOPs: 24.70 | +7: iteration 50520/ 60336 | consumed samples: 12933120 | consumed tokens: 26487029760 | elapsed time per iteration (s): 0.16 | learning rate: 3.173E-05 | global batch size: 256 | lm loss: 3.752274E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.751 | TFLOPs: 25.17 | +7: iteration 50530/ 60336 | consumed samples: 12935680 | consumed tokens: 26492272640 | elapsed time per iteration (s): 0.16 | learning rate: 3.171E-05 | global batch size: 256 | lm loss: 3.744086E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.102 | TFLOPs: 25.33 | +7: iteration 50540/ 60336 | consumed samples: 12938240 | consumed tokens: 26497515520 | elapsed time per iteration (s): 0.17 | learning rate: 3.168E-05 | global batch size: 256 | lm loss: 3.761536E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.033 | TFLOPs: 23.71 | +7: iteration 50550/ 60336 | consumed samples: 12940800 | consumed tokens: 26502758400 | elapsed time per iteration (s): 0.16 | learning rate: 3.166E-05 | global batch size: 256 | lm loss: 3.758159E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.177 | TFLOPs: 24.40 | +7: iteration 50560/ 60336 | consumed samples: 12943360 | consumed tokens: 26508001280 | elapsed time per iteration (s): 0.17 | learning rate: 3.164E-05 | global batch size: 256 | lm loss: 3.773510E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.793 | TFLOPs: 23.87 | +7: iteration 50570/ 60336 | consumed samples: 12945920 | consumed tokens: 26513244160 | elapsed time per iteration (s): 0.17 | learning rate: 3.161E-05 | global batch size: 256 | lm loss: 3.751052E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.758 | TFLOPs: 23.94 | +7: iteration 50580/ 60336 | consumed samples: 12948480 | consumed tokens: 26518487040 | elapsed time per iteration (s): 0.16 | learning rate: 3.159E-05 | global batch size: 256 | lm loss: 3.770152E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.318 | TFLOPs: 24.49 | +7: iteration 50590/ 60336 | consumed samples: 12951040 | consumed tokens: 26523729920 | elapsed time per iteration (s): 0.16 | learning rate: 3.157E-05 | global batch size: 256 | lm loss: 3.758451E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.101 | TFLOPs: 25.38 | +7: iteration 50600/ 60336 | consumed samples: 12953600 | consumed tokens: 26528972800 | elapsed time per iteration (s): 0.16 | learning rate: 3.154E-05 | global batch size: 256 | lm loss: 3.756077E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.988 | TFLOPs: 24.64 | +7: iteration 50610/ 60336 | consumed samples: 12956160 | consumed tokens: 26534215680 | elapsed time per iteration (s): 0.16 | learning rate: 3.152E-05 | global batch size: 256 | lm loss: 3.759600E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.194 | TFLOPs: 24.51 | +7: iteration 50620/ 60336 | consumed samples: 12958720 | consumed tokens: 26539458560 | elapsed time per iteration (s): 0.17 | learning rate: 3.150E-05 | global batch size: 256 | lm loss: 3.755335E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.289 | TFLOPs: 23.34 | +7: iteration 50630/ 60336 | consumed samples: 12961280 | consumed tokens: 26544701440 | elapsed time per iteration (s): 0.18 | learning rate: 3.147E-05 | global batch size: 256 | lm loss: 3.755651E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.252 | TFLOPs: 21.76 | +7: iteration 50640/ 60336 | consumed samples: 12963840 | consumed tokens: 26549944320 | elapsed time per iteration (s): 0.16 | learning rate: 3.145E-05 | global batch size: 256 | lm loss: 3.744524E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.052 | TFLOPs: 25.25 | +7: iteration 50650/ 60336 | consumed samples: 12966400 | consumed tokens: 26555187200 | elapsed time per iteration (s): 0.17 | learning rate: 3.143E-05 | global batch size: 256 | lm loss: 3.760103E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.662 | TFLOPs: 24.00 | +7: iteration 50660/ 60336 | consumed samples: 12968960 | consumed tokens: 26560430080 | elapsed time per iteration (s): 0.17 | learning rate: 3.140E-05 | global batch size: 256 | lm loss: 3.746883E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.138 | TFLOPs: 24.18 | +7: iteration 50670/ 60336 | consumed samples: 12971520 | consumed tokens: 26565672960 | elapsed time per iteration (s): 0.16 | learning rate: 3.138E-05 | global batch size: 256 | lm loss: 3.753836E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.197 | TFLOPs: 25.10 | +7: iteration 50680/ 60336 | consumed samples: 12974080 | consumed tokens: 26570915840 | elapsed time per iteration (s): 0.17 | learning rate: 3.136E-05 | global batch size: 256 | lm loss: 3.767957E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.624 | TFLOPs: 23.96 | +7: iteration 50690/ 60336 | consumed samples: 12976640 | consumed tokens: 26576158720 | elapsed time per iteration (s): 0.16 | learning rate: 3.134E-05 | global batch size: 256 | lm loss: 3.758554E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.367 | TFLOPs: 25.19 | +7: iteration 50700/ 60336 | consumed samples: 12979200 | consumed tokens: 26581401600 | elapsed time per iteration (s): 0.16 | learning rate: 3.131E-05 | global batch size: 256 | lm loss: 3.759126E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.688 | TFLOPs: 24.48 | +7: iteration 50710/ 60336 | consumed samples: 12981760 | consumed tokens: 26586644480 | elapsed time per iteration (s): 0.17 | learning rate: 3.129E-05 | global batch size: 256 | lm loss: 3.765651E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.023 | TFLOPs: 24.09 | +7: iteration 50720/ 60336 | consumed samples: 12984320 | consumed tokens: 26591887360 | elapsed time per iteration (s): 0.17 | learning rate: 3.127E-05 | global batch size: 256 | lm loss: 3.776535E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.680 | TFLOPs: 24.18 | +7: iteration 50730/ 60336 | consumed samples: 12986880 | consumed tokens: 26597130240 | elapsed time per iteration (s): 0.17 | learning rate: 3.124E-05 | global batch size: 256 | lm loss: 3.748500E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.621 | TFLOPs: 24.21 | +7: iteration 50740/ 60336 | consumed samples: 12989440 | consumed tokens: 26602373120 | elapsed time per iteration (s): 0.16 | learning rate: 3.122E-05 | global batch size: 256 | lm loss: 3.755278E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.336 | TFLOPs: 24.36 | +7: iteration 50750/ 60336 | consumed samples: 12992000 | consumed tokens: 26607616000 | elapsed time per iteration (s): 0.16 | learning rate: 3.120E-05 | global batch size: 256 | lm loss: 3.755372E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.549 | TFLOPs: 24.72 | +7: iteration 50760/ 60336 | consumed samples: 12994560 | consumed tokens: 26612858880 | elapsed time per iteration (s): 0.18 | learning rate: 3.118E-05 | global batch size: 256 | lm loss: 3.757955E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.961 | TFLOPs: 22.85 | +7: iteration 50770/ 60336 | consumed samples: 12997120 | consumed tokens: 26618101760 | elapsed time per iteration (s): 0.17 | learning rate: 3.115E-05 | global batch size: 256 | lm loss: 3.760377E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.495 | TFLOPs: 24.05 | +7: iteration 50780/ 60336 | consumed samples: 12999680 | consumed tokens: 26623344640 | elapsed time per iteration (s): 0.16 | learning rate: 3.113E-05 | global batch size: 256 | lm loss: 3.777813E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.965 | TFLOPs: 25.00 | +7: iteration 50790/ 60336 | consumed samples: 13002240 | consumed tokens: 26628587520 | elapsed time per iteration (s): 0.16 | learning rate: 3.111E-05 | global batch size: 256 | lm loss: 3.764820E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.012 | TFLOPs: 25.67 | +7: iteration 50800/ 60336 | consumed samples: 13004800 | consumed tokens: 26633830400 | elapsed time per iteration (s): 0.17 | learning rate: 3.108E-05 | global batch size: 256 | lm loss: 3.741262E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.898 | TFLOPs: 23.51 | +7: iteration 50810/ 60336 | consumed samples: 13007360 | consumed tokens: 26639073280 | elapsed time per iteration (s): 0.17 | learning rate: 3.106E-05 | global batch size: 256 | lm loss: 3.738883E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.191 | TFLOPs: 24.26 | +7: iteration 50820/ 60336 | consumed samples: 13009920 | consumed tokens: 26644316160 | elapsed time per iteration (s): 0.16 | learning rate: 3.104E-05 | global batch size: 256 | lm loss: 3.744987E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.375 | TFLOPs: 25.11 | +7: iteration 50830/ 60336 | consumed samples: 13012480 | consumed tokens: 26649559040 | elapsed time per iteration (s): 0.17 | learning rate: 3.102E-05 | global batch size: 256 | lm loss: 3.762167E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.401 | TFLOPs: 23.59 | +7: iteration 50840/ 60336 | consumed samples: 13015040 | consumed tokens: 26654801920 | elapsed time per iteration (s): 0.17 | learning rate: 3.099E-05 | global batch size: 256 | lm loss: 3.756016E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1499.544 | TFLOPs: 23.52 | +7: iteration 50850/ 60336 | consumed samples: 13017600 | consumed tokens: 26660044800 | elapsed time per iteration (s): 0.17 | learning rate: 3.097E-05 | global batch size: 256 | lm loss: 3.760599E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1471.447 | TFLOPs: 23.08 | +7: iteration 50860/ 60336 | consumed samples: 13020160 | consumed tokens: 26665287680 | elapsed time per iteration (s): 0.16 | learning rate: 3.095E-05 | global batch size: 256 | lm loss: 3.756189E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.930 | TFLOPs: 24.97 | +7: iteration 50870/ 60336 | consumed samples: 13022720 | consumed tokens: 26670530560 | elapsed time per iteration (s): 0.17 | learning rate: 3.093E-05 | global batch size: 256 | lm loss: 3.758468E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.119 | TFLOPs: 24.07 | +7: iteration 50880/ 60336 | consumed samples: 13025280 | consumed tokens: 26675773440 | elapsed time per iteration (s): 0.16 | learning rate: 3.090E-05 | global batch size: 256 | lm loss: 3.763777E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.918 | TFLOPs: 24.46 | +7: iteration 50890/ 60336 | consumed samples: 13027840 | consumed tokens: 26681016320 | elapsed time per iteration (s): 0.16 | learning rate: 3.088E-05 | global batch size: 256 | lm loss: 3.762362E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.852 | TFLOPs: 25.40 | +7: iteration 50900/ 60336 | consumed samples: 13030400 | consumed tokens: 26686259200 | elapsed time per iteration (s): 0.16 | learning rate: 3.086E-05 | global batch size: 256 | lm loss: 3.755750E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.643 | TFLOPs: 24.66 | +7: iteration 50910/ 60336 | consumed samples: 13032960 | consumed tokens: 26691502080 | elapsed time per iteration (s): 0.16 | learning rate: 3.084E-05 | global batch size: 256 | lm loss: 3.764315E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.236 | TFLOPs: 24.69 | +7: iteration 50920/ 60336 | consumed samples: 13035520 | consumed tokens: 26696744960 | elapsed time per iteration (s): 0.16 | learning rate: 3.081E-05 | global batch size: 256 | lm loss: 3.753691E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.180 | TFLOPs: 24.84 | +7: iteration 50930/ 60336 | consumed samples: 13038080 | consumed tokens: 26701987840 | elapsed time per iteration (s): 0.17 | learning rate: 3.079E-05 | global batch size: 256 | lm loss: 3.759743E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1514.785 | TFLOPs: 23.76 | +7: iteration 50940/ 60336 | consumed samples: 13040640 | consumed tokens: 26707230720 | elapsed time per iteration (s): 0.16 | learning rate: 3.077E-05 | global batch size: 256 | lm loss: 3.765001E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.749 | TFLOPs: 24.38 | +7: iteration 50950/ 60336 | consumed samples: 13043200 | consumed tokens: 26712473600 | elapsed time per iteration (s): 0.17 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 3.769392E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.593 | TFLOPs: 23.93 | +7: iteration 50960/ 60336 | consumed samples: 13045760 | consumed tokens: 26717716480 | elapsed time per iteration (s): 0.17 | learning rate: 3.072E-05 | global batch size: 256 | lm loss: 3.749707E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.603 | TFLOPs: 23.20 | +7: iteration 50970/ 60336 | consumed samples: 13048320 | consumed tokens: 26722959360 | elapsed time per iteration (s): 0.16 | learning rate: 3.070E-05 | global batch size: 256 | lm loss: 3.775040E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.156 | TFLOPs: 24.51 | +7: iteration 50980/ 60336 | consumed samples: 13050880 | consumed tokens: 26728202240 | elapsed time per iteration (s): 0.16 | learning rate: 3.068E-05 | global batch size: 256 | lm loss: 3.751999E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.846 | TFLOPs: 24.38 | +7: iteration 50990/ 60336 | consumed samples: 13053440 | consumed tokens: 26733445120 | elapsed time per iteration (s): 0.16 | learning rate: 3.066E-05 | global batch size: 256 | lm loss: 3.752536E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.492 | TFLOPs: 25.44 | +7: iteration 51000/ 60336 | consumed samples: 13056000 | consumed tokens: 26738688000 | elapsed time per iteration (s): 0.18 | learning rate: 3.063E-05 | global batch size: 256 | lm loss: 3.759908E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1454.946 | TFLOPs: 22.82 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 51000 | lm loss value: 3.901681E+00 | lm loss PPL: 4.948556E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 51000 to checkpoints_44m32b100m +0: [2023-03-17 02:30:56,869] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step51000 is begin to save! +0: [2023-03-17 02:30:56,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:30:56,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:30:56,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:30:56,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:30:56,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:30:56,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:30:56,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:30:56,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:30:56,965] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:30:56,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:30:56,973] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:30:56,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:30:56,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:30:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:30:56,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:30:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:30:56,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:30:57,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:30:57,006] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:30:57,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:30:57,008] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step51000/mp_rank_00_model_states.pt +0: [2023-03-17 02:30:57,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:30:57,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:57,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:57,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:57,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:30:57,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:57,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:57,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:57,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:57,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: successfully saved checkpoint at iteration 51000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 183.74 +7: iteration 51010/ 60336 | consumed samples: 13058560 | consumed tokens: 26743930880 | elapsed time per iteration (s): 0.18 | learning rate: 3.061E-05 | global batch size: 256 | lm loss: 3.756760E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1422.724 | TFLOPs: 22.31 | +7: iteration 51020/ 60336 | consumed samples: 13061120 | consumed tokens: 26749173760 | elapsed time per iteration (s): 0.17 | learning rate: 3.059E-05 | global batch size: 256 | lm loss: 3.769903E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.782 | TFLOPs: 24.26 | +7: iteration 51030/ 60336 | consumed samples: 13063680 | consumed tokens: 26754416640 | elapsed time per iteration (s): 0.16 | learning rate: 3.057E-05 | global batch size: 256 | lm loss: 3.761261E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.101 | TFLOPs: 24.76 | +7: iteration 51040/ 60336 | consumed samples: 13066240 | consumed tokens: 26759659520 | elapsed time per iteration (s): 0.17 | learning rate: 3.054E-05 | global batch size: 256 | lm loss: 3.762666E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.449 | TFLOPs: 23.70 | +7: iteration 51050/ 60336 | consumed samples: 13068800 | consumed tokens: 26764902400 | elapsed time per iteration (s): 0.17 | learning rate: 3.052E-05 | global batch size: 256 | lm loss: 3.766476E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.007 | TFLOPs: 23.71 | +7: iteration 51060/ 60336 | consumed samples: 13071360 | consumed tokens: 26770145280 | elapsed time per iteration (s): 0.17 | learning rate: 3.050E-05 | global batch size: 256 | lm loss: 3.752847E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.325 | TFLOPs: 24.28 | +7: iteration 51070/ 60336 | consumed samples: 13073920 | consumed tokens: 26775388160 | elapsed time per iteration (s): 0.16 | learning rate: 3.048E-05 | global batch size: 256 | lm loss: 3.761212E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.044 | TFLOPs: 25.39 | +7: iteration 51080/ 60336 | consumed samples: 13076480 | consumed tokens: 26780631040 | elapsed time per iteration (s): 0.16 | learning rate: 3.046E-05 | global batch size: 256 | lm loss: 3.763976E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.254 | TFLOPs: 24.78 | +7: iteration 51090/ 60336 | consumed samples: 13079040 | consumed tokens: 26785873920 | elapsed time per iteration (s): 0.16 | learning rate: 3.043E-05 | global batch size: 256 | lm loss: 3.763224E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.457 | TFLOPs: 24.68 | +7: iteration 51100/ 60336 | consumed samples: 13081600 | consumed tokens: 26791116800 | elapsed time per iteration (s): 0.16 | learning rate: 3.041E-05 | global batch size: 256 | lm loss: 3.762784E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.184 | TFLOPs: 24.37 | +7: iteration 51110/ 60336 | consumed samples: 13084160 | consumed tokens: 26796359680 | elapsed time per iteration (s): 0.16 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 3.744271E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.821 | TFLOPs: 25.28 | +7: iteration 51120/ 60336 | consumed samples: 13086720 | consumed tokens: 26801602560 | elapsed time per iteration (s): 0.16 | learning rate: 3.037E-05 | global batch size: 256 | lm loss: 3.764687E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.590 | TFLOPs: 24.91 | +7: iteration 51130/ 60336 | consumed samples: 13089280 | consumed tokens: 26806845440 | elapsed time per iteration (s): 0.16 | learning rate: 3.035E-05 | global batch size: 256 | lm loss: 3.762673E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.962 | TFLOPs: 24.81 | +7: iteration 51140/ 60336 | consumed samples: 13091840 | consumed tokens: 26812088320 | elapsed time per iteration (s): 0.17 | learning rate: 3.032E-05 | global batch size: 256 | lm loss: 3.754808E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.197 | TFLOPs: 24.30 | +7: iteration 51150/ 60336 | consumed samples: 13094400 | consumed tokens: 26817331200 | elapsed time per iteration (s): 0.17 | learning rate: 3.030E-05 | global batch size: 256 | lm loss: 3.760143E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1500.071 | TFLOPs: 23.52 | +7: iteration 51160/ 60336 | consumed samples: 13096960 | consumed tokens: 26822574080 | elapsed time per iteration (s): 0.16 | learning rate: 3.028E-05 | global batch size: 256 | lm loss: 3.751177E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.233 | TFLOPs: 25.82 | +7: iteration 51170/ 60336 | consumed samples: 13099520 | consumed tokens: 26827816960 | elapsed time per iteration (s): 0.16 | learning rate: 3.026E-05 | global batch size: 256 | lm loss: 3.746474E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.932 | TFLOPs: 25.20 | +7: iteration 51180/ 60336 | consumed samples: 13102080 | consumed tokens: 26833059840 | elapsed time per iteration (s): 0.17 | learning rate: 3.024E-05 | global batch size: 256 | lm loss: 3.751818E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.288 | TFLOPs: 24.17 | +7: iteration 51190/ 60336 | consumed samples: 13104640 | consumed tokens: 26838302720 | elapsed time per iteration (s): 0.17 | learning rate: 3.021E-05 | global batch size: 256 | lm loss: 3.763915E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.414 | TFLOPs: 23.81 | +7: iteration 51200/ 60336 | consumed samples: 13107200 | consumed tokens: 26843545600 | elapsed time per iteration (s): 0.16 | learning rate: 3.019E-05 | global batch size: 256 | lm loss: 3.761821E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.133 | TFLOPs: 24.62 | +7: iteration 51210/ 60336 | consumed samples: 13109760 | consumed tokens: 26848788480 | elapsed time per iteration (s): 0.17 | learning rate: 3.017E-05 | global batch size: 256 | lm loss: 3.780492E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.768 | TFLOPs: 24.30 | +7: iteration 51220/ 60336 | consumed samples: 13112320 | consumed tokens: 26854031360 | elapsed time per iteration (s): 0.16 | learning rate: 3.015E-05 | global batch size: 256 | lm loss: 3.770344E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.289 | TFLOPs: 24.39 | +7: iteration 51230/ 60336 | consumed samples: 13114880 | consumed tokens: 26859274240 | elapsed time per iteration (s): 0.16 | learning rate: 3.013E-05 | global batch size: 256 | lm loss: 3.750076E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.255 | TFLOPs: 25.52 | +7: iteration 51240/ 60336 | consumed samples: 13117440 | consumed tokens: 26864517120 | elapsed time per iteration (s): 0.16 | learning rate: 3.010E-05 | global batch size: 256 | lm loss: 3.765331E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.433 | TFLOPs: 24.97 | +7: iteration 51250/ 60336 | consumed samples: 13120000 | consumed tokens: 26869760000 | elapsed time per iteration (s): 0.16 | learning rate: 3.008E-05 | global batch size: 256 | lm loss: 3.758738E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.511 | TFLOPs: 25.34 | +7: iteration 51260/ 60336 | consumed samples: 13122560 | consumed tokens: 26875002880 | elapsed time per iteration (s): 0.16 | learning rate: 3.006E-05 | global batch size: 256 | lm loss: 3.754010E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.467 | TFLOPs: 24.80 | +7: iteration 51270/ 60336 | consumed samples: 13125120 | consumed tokens: 26880245760 | elapsed time per iteration (s): 0.16 | learning rate: 3.004E-05 | global batch size: 256 | lm loss: 3.764470E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.525 | TFLOPs: 25.30 | +7: iteration 51280/ 60336 | consumed samples: 13127680 | consumed tokens: 26885488640 | elapsed time per iteration (s): 0.17 | learning rate: 3.002E-05 | global batch size: 256 | lm loss: 3.759399E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.467 | TFLOPs: 24.22 | +7: iteration 51290/ 60336 | consumed samples: 13130240 | consumed tokens: 26890731520 | elapsed time per iteration (s): 0.16 | learning rate: 3.000E-05 | global batch size: 256 | lm loss: 3.748180E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.542 | TFLOPs: 24.96 | +7: iteration 51300/ 60336 | consumed samples: 13132800 | consumed tokens: 26895974400 | elapsed time per iteration (s): 0.16 | learning rate: 2.997E-05 | global batch size: 256 | lm loss: 3.762798E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.665 | TFLOPs: 24.90 | +7: iteration 51310/ 60336 | consumed samples: 13135360 | consumed tokens: 26901217280 | elapsed time per iteration (s): 0.16 | learning rate: 2.995E-05 | global batch size: 256 | lm loss: 3.760228E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.880 | TFLOPs: 25.06 | +7: iteration 51320/ 60336 | consumed samples: 13137920 | consumed tokens: 26906460160 | elapsed time per iteration (s): 0.16 | learning rate: 2.993E-05 | global batch size: 256 | lm loss: 3.757044E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.961 | TFLOPs: 24.54 | +7: iteration 51330/ 60336 | consumed samples: 13140480 | consumed tokens: 26911703040 | elapsed time per iteration (s): 0.16 | learning rate: 2.991E-05 | global batch size: 256 | lm loss: 3.755036E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.775 | TFLOPs: 24.59 | +7: iteration 51340/ 60336 | consumed samples: 13143040 | consumed tokens: 26916945920 | elapsed time per iteration (s): 0.16 | learning rate: 2.989E-05 | global batch size: 256 | lm loss: 3.750935E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.812 | TFLOPs: 25.43 | +7: iteration 51350/ 60336 | consumed samples: 13145600 | consumed tokens: 26922188800 | elapsed time per iteration (s): 0.16 | learning rate: 2.987E-05 | global batch size: 256 | lm loss: 3.750602E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.901 | TFLOPs: 25.00 | +7: iteration 51360/ 60336 | consumed samples: 13148160 | consumed tokens: 26927431680 | elapsed time per iteration (s): 0.16 | learning rate: 2.984E-05 | global batch size: 256 | lm loss: 3.770955E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.757 | TFLOPs: 25.03 | +7: iteration 51370/ 60336 | consumed samples: 13150720 | consumed tokens: 26932674560 | elapsed time per iteration (s): 0.16 | learning rate: 2.982E-05 | global batch size: 256 | lm loss: 3.760302E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.220 | TFLOPs: 25.03 | +7: iteration 51380/ 60336 | consumed samples: 13153280 | consumed tokens: 26937917440 | elapsed time per iteration (s): 0.16 | learning rate: 2.980E-05 | global batch size: 256 | lm loss: 3.761488E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.839 | TFLOPs: 24.62 | +7: iteration 51390/ 60336 | consumed samples: 13155840 | consumed tokens: 26943160320 | elapsed time per iteration (s): 0.17 | learning rate: 2.978E-05 | global batch size: 256 | lm loss: 3.748063E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.492 | TFLOPs: 23.33 | +7: iteration 51400/ 60336 | consumed samples: 13158400 | consumed tokens: 26948403200 | elapsed time per iteration (s): 0.16 | learning rate: 2.976E-05 | global batch size: 256 | lm loss: 3.756441E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.305 | TFLOPs: 24.77 | +7: iteration 51410/ 60336 | consumed samples: 13160960 | consumed tokens: 26953646080 | elapsed time per iteration (s): 0.17 | learning rate: 2.974E-05 | global batch size: 256 | lm loss: 3.758726E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.514 | TFLOPs: 23.30 | +7: iteration 51420/ 60336 | consumed samples: 13163520 | consumed tokens: 26958888960 | elapsed time per iteration (s): 0.16 | learning rate: 2.972E-05 | global batch size: 256 | lm loss: 3.752492E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.089 | TFLOPs: 24.86 | +7: iteration 51430/ 60336 | consumed samples: 13166080 | consumed tokens: 26964131840 | elapsed time per iteration (s): 0.16 | learning rate: 2.969E-05 | global batch size: 256 | lm loss: 3.744790E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.472 | TFLOPs: 25.44 | +7: iteration 51440/ 60336 | consumed samples: 13168640 | consumed tokens: 26969374720 | elapsed time per iteration (s): 0.16 | learning rate: 2.967E-05 | global batch size: 256 | lm loss: 3.775194E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.160 | TFLOPs: 24.81 | +7: iteration 51450/ 60336 | consumed samples: 13171200 | consumed tokens: 26974617600 | elapsed time per iteration (s): 0.17 | learning rate: 2.965E-05 | global batch size: 256 | lm loss: 3.755288E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.531 | TFLOPs: 24.10 | +7: iteration 51460/ 60336 | consumed samples: 13173760 | consumed tokens: 26979860480 | elapsed time per iteration (s): 0.16 | learning rate: 2.963E-05 | global batch size: 256 | lm loss: 3.757225E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.330 | TFLOPs: 25.33 | +7: iteration 51470/ 60336 | consumed samples: 13176320 | consumed tokens: 26985103360 | elapsed time per iteration (s): 0.16 | learning rate: 2.961E-05 | global batch size: 256 | lm loss: 3.755804E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.209 | TFLOPs: 25.49 | +7: iteration 51480/ 60336 | consumed samples: 13178880 | consumed tokens: 26990346240 | elapsed time per iteration (s): 0.16 | learning rate: 2.959E-05 | global batch size: 256 | lm loss: 3.768888E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.127 | TFLOPs: 25.38 | +7: iteration 51490/ 60336 | consumed samples: 13181440 | consumed tokens: 26995589120 | elapsed time per iteration (s): 0.15 | learning rate: 2.957E-05 | global batch size: 256 | lm loss: 3.757774E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.536 | TFLOPs: 26.23 | +7: iteration 51500/ 60336 | consumed samples: 13184000 | consumed tokens: 27000832000 | elapsed time per iteration (s): 0.17 | learning rate: 2.955E-05 | global batch size: 256 | lm loss: 3.764766E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.211 | TFLOPs: 24.20 | +7: iteration 51510/ 60336 | consumed samples: 13186560 | consumed tokens: 27006074880 | elapsed time per iteration (s): 0.16 | learning rate: 2.952E-05 | global batch size: 256 | lm loss: 3.772419E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.571 | TFLOPs: 25.18 | +7: iteration 51520/ 60336 | consumed samples: 13189120 | consumed tokens: 27011317760 | elapsed time per iteration (s): 0.16 | learning rate: 2.950E-05 | global batch size: 256 | lm loss: 3.747300E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.100 | TFLOPs: 25.28 | +7: iteration 51530/ 60336 | consumed samples: 13191680 | consumed tokens: 27016560640 | elapsed time per iteration (s): 0.16 | learning rate: 2.948E-05 | global batch size: 256 | lm loss: 3.750894E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.153 | TFLOPs: 25.67 | +7: iteration 51540/ 60336 | consumed samples: 13194240 | consumed tokens: 27021803520 | elapsed time per iteration (s): 0.16 | learning rate: 2.946E-05 | global batch size: 256 | lm loss: 3.758178E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.978 | TFLOPs: 25.47 | +7: iteration 51550/ 60336 | consumed samples: 13196800 | consumed tokens: 27027046400 | elapsed time per iteration (s): 0.16 | learning rate: 2.944E-05 | global batch size: 256 | lm loss: 3.752924E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.809 | TFLOPs: 24.79 | +7: iteration 51560/ 60336 | consumed samples: 13199360 | consumed tokens: 27032289280 | elapsed time per iteration (s): 0.16 | learning rate: 2.942E-05 | global batch size: 256 | lm loss: 3.750100E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.476 | TFLOPs: 25.79 | +7: iteration 51570/ 60336 | consumed samples: 13201920 | consumed tokens: 27037532160 | elapsed time per iteration (s): 0.16 | learning rate: 2.940E-05 | global batch size: 256 | lm loss: 3.755959E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.990 | TFLOPs: 25.84 | +7: iteration 51580/ 60336 | consumed samples: 13204480 | consumed tokens: 27042775040 | elapsed time per iteration (s): 0.16 | learning rate: 2.938E-05 | global batch size: 256 | lm loss: 3.753659E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.721 | TFLOPs: 25.04 | +7: iteration 51590/ 60336 | consumed samples: 13207040 | consumed tokens: 27048017920 | elapsed time per iteration (s): 0.16 | learning rate: 2.935E-05 | global batch size: 256 | lm loss: 3.764135E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.903 | TFLOPs: 25.75 | +7: iteration 51600/ 60336 | consumed samples: 13209600 | consumed tokens: 27053260800 | elapsed time per iteration (s): 0.16 | learning rate: 2.933E-05 | global batch size: 256 | lm loss: 3.750439E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.600 | TFLOPs: 25.43 | +7: iteration 51610/ 60336 | consumed samples: 13212160 | consumed tokens: 27058503680 | elapsed time per iteration (s): 0.16 | learning rate: 2.931E-05 | global batch size: 256 | lm loss: 3.761393E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.810 | TFLOPs: 25.83 | +7: iteration 51620/ 60336 | consumed samples: 13214720 | consumed tokens: 27063746560 | elapsed time per iteration (s): 0.16 | learning rate: 2.929E-05 | global batch size: 256 | lm loss: 3.756598E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.942 | TFLOPs: 25.72 | +7: iteration 51630/ 60336 | consumed samples: 13217280 | consumed tokens: 27068989440 | elapsed time per iteration (s): 0.16 | learning rate: 2.927E-05 | global batch size: 256 | lm loss: 3.761103E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.850 | TFLOPs: 24.76 | +7: iteration 51640/ 60336 | consumed samples: 13219840 | consumed tokens: 27074232320 | elapsed time per iteration (s): 0.16 | learning rate: 2.925E-05 | global batch size: 256 | lm loss: 3.751261E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.171 | TFLOPs: 24.89 | +7: iteration 51650/ 60336 | consumed samples: 13222400 | consumed tokens: 27079475200 | elapsed time per iteration (s): 0.16 | learning rate: 2.923E-05 | global batch size: 256 | lm loss: 3.764854E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.963 | TFLOPs: 25.75 | +7: iteration 51660/ 60336 | consumed samples: 13224960 | consumed tokens: 27084718080 | elapsed time per iteration (s): 0.16 | learning rate: 2.921E-05 | global batch size: 256 | lm loss: 3.768292E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.441 | TFLOPs: 25.00 | +7: iteration 51670/ 60336 | consumed samples: 13227520 | consumed tokens: 27089960960 | elapsed time per iteration (s): 0.15 | learning rate: 2.919E-05 | global batch size: 256 | lm loss: 3.769016E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.885 | TFLOPs: 26.05 | +7: iteration 51680/ 60336 | consumed samples: 13230080 | consumed tokens: 27095203840 | elapsed time per iteration (s): 0.16 | learning rate: 2.917E-05 | global batch size: 256 | lm loss: 3.753627E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.270 | TFLOPs: 25.10 | +7: iteration 51690/ 60336 | consumed samples: 13232640 | consumed tokens: 27100446720 | elapsed time per iteration (s): 0.16 | learning rate: 2.915E-05 | global batch size: 256 | lm loss: 3.764225E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.007 | TFLOPs: 24.68 | +7: iteration 51700/ 60336 | consumed samples: 13235200 | consumed tokens: 27105689600 | elapsed time per iteration (s): 0.16 | learning rate: 2.913E-05 | global batch size: 256 | lm loss: 3.753798E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.572 | TFLOPs: 24.87 | +7: iteration 51710/ 60336 | consumed samples: 13237760 | consumed tokens: 27110932480 | elapsed time per iteration (s): 0.16 | learning rate: 2.910E-05 | global batch size: 256 | lm loss: 3.766750E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.510 | TFLOPs: 25.34 | +7: iteration 51720/ 60336 | consumed samples: 13240320 | consumed tokens: 27116175360 | elapsed time per iteration (s): 0.16 | learning rate: 2.908E-05 | global batch size: 256 | lm loss: 3.768948E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.760 | TFLOPs: 25.20 | +7: iteration 51730/ 60336 | consumed samples: 13242880 | consumed tokens: 27121418240 | elapsed time per iteration (s): 0.15 | learning rate: 2.906E-05 | global batch size: 256 | lm loss: 3.764489E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.330 | TFLOPs: 26.07 | +7: iteration 51740/ 60336 | consumed samples: 13245440 | consumed tokens: 27126661120 | elapsed time per iteration (s): 0.15 | learning rate: 2.904E-05 | global batch size: 256 | lm loss: 3.740833E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.809 | TFLOPs: 25.95 | +7: iteration 51750/ 60336 | consumed samples: 13248000 | consumed tokens: 27131904000 | elapsed time per iteration (s): 0.16 | learning rate: 2.902E-05 | global batch size: 256 | lm loss: 3.763538E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.029 | TFLOPs: 25.12 | +7: iteration 51760/ 60336 | consumed samples: 13250560 | consumed tokens: 27137146880 | elapsed time per iteration (s): 0.16 | learning rate: 2.900E-05 | global batch size: 256 | lm loss: 3.760712E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.208 | TFLOPs: 25.36 | +7: iteration 51770/ 60336 | consumed samples: 13253120 | consumed tokens: 27142389760 | elapsed time per iteration (s): 0.16 | learning rate: 2.898E-05 | global batch size: 256 | lm loss: 3.768590E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.622 | TFLOPs: 25.46 | +7: iteration 51780/ 60336 | consumed samples: 13255680 | consumed tokens: 27147632640 | elapsed time per iteration (s): 0.16 | learning rate: 2.896E-05 | global batch size: 256 | lm loss: 3.755474E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.224 | TFLOPs: 25.21 | +7: iteration 51790/ 60336 | consumed samples: 13258240 | consumed tokens: 27152875520 | elapsed time per iteration (s): 0.16 | learning rate: 2.894E-05 | global batch size: 256 | lm loss: 3.764789E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.712 | TFLOPs: 24.98 | +7: iteration 51800/ 60336 | consumed samples: 13260800 | consumed tokens: 27158118400 | elapsed time per iteration (s): 0.17 | learning rate: 2.892E-05 | global batch size: 256 | lm loss: 3.767412E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.591 | TFLOPs: 24.03 | +7: iteration 51810/ 60336 | consumed samples: 13263360 | consumed tokens: 27163361280 | elapsed time per iteration (s): 0.16 | learning rate: 2.890E-05 | global batch size: 256 | lm loss: 3.760032E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.171 | TFLOPs: 25.11 | +7: iteration 51820/ 60336 | consumed samples: 13265920 | consumed tokens: 27168604160 | elapsed time per iteration (s): 0.16 | learning rate: 2.888E-05 | global batch size: 256 | lm loss: 3.753062E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.831 | TFLOPs: 25.47 | +7: iteration 51830/ 60336 | consumed samples: 13268480 | consumed tokens: 27173847040 | elapsed time per iteration (s): 0.16 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 3.759605E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.456 | TFLOPs: 25.43 | +7: iteration 51840/ 60336 | consumed samples: 13271040 | consumed tokens: 27179089920 | elapsed time per iteration (s): 0.16 | learning rate: 2.884E-05 | global batch size: 256 | lm loss: 3.767873E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.633 | TFLOPs: 24.84 | +7: iteration 51850/ 60336 | consumed samples: 13273600 | consumed tokens: 27184332800 | elapsed time per iteration (s): 0.17 | learning rate: 2.882E-05 | global batch size: 256 | lm loss: 3.762325E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1464.518 | TFLOPs: 22.97 | +7: iteration 51860/ 60336 | consumed samples: 13276160 | consumed tokens: 27189575680 | elapsed time per iteration (s): 0.16 | learning rate: 2.880E-05 | global batch size: 256 | lm loss: 3.748862E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.906 | TFLOPs: 24.70 | +7: iteration 51870/ 60336 | consumed samples: 13278720 | consumed tokens: 27194818560 | elapsed time per iteration (s): 0.16 | learning rate: 2.878E-05 | global batch size: 256 | lm loss: 3.756150E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.735 | TFLOPs: 25.70 | +7: iteration 51880/ 60336 | consumed samples: 13281280 | consumed tokens: 27200061440 | elapsed time per iteration (s): 0.17 | learning rate: 2.875E-05 | global batch size: 256 | lm loss: 3.758542E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.828 | TFLOPs: 24.13 | +7: iteration 51890/ 60336 | consumed samples: 13283840 | consumed tokens: 27205304320 | elapsed time per iteration (s): 0.16 | learning rate: 2.873E-05 | global batch size: 256 | lm loss: 3.749387E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.328 | TFLOPs: 25.71 | +7: iteration 51900/ 60336 | consumed samples: 13286400 | consumed tokens: 27210547200 | elapsed time per iteration (s): 0.16 | learning rate: 2.871E-05 | global batch size: 256 | lm loss: 3.760199E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.304 | TFLOPs: 24.80 | +7: iteration 51910/ 60336 | consumed samples: 13288960 | consumed tokens: 27215790080 | elapsed time per iteration (s): 0.15 | learning rate: 2.869E-05 | global batch size: 256 | lm loss: 3.768189E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.318 | TFLOPs: 25.93 | +7: iteration 51920/ 60336 | consumed samples: 13291520 | consumed tokens: 27221032960 | elapsed time per iteration (s): 0.16 | learning rate: 2.867E-05 | global batch size: 256 | lm loss: 3.767697E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.715 | TFLOPs: 25.35 | +7: iteration 51930/ 60336 | consumed samples: 13294080 | consumed tokens: 27226275840 | elapsed time per iteration (s): 0.16 | learning rate: 2.865E-05 | global batch size: 256 | lm loss: 3.740012E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.511 | TFLOPs: 25.41 | +7: iteration 51940/ 60336 | consumed samples: 13296640 | consumed tokens: 27231518720 | elapsed time per iteration (s): 0.16 | learning rate: 2.863E-05 | global batch size: 256 | lm loss: 3.757331E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.334 | TFLOPs: 25.66 | +7: iteration 51950/ 60336 | consumed samples: 13299200 | consumed tokens: 27236761600 | elapsed time per iteration (s): 0.16 | learning rate: 2.861E-05 | global batch size: 256 | lm loss: 3.764662E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.106 | TFLOPs: 25.22 | +7: iteration 51960/ 60336 | consumed samples: 13301760 | consumed tokens: 27242004480 | elapsed time per iteration (s): 0.17 | learning rate: 2.859E-05 | global batch size: 256 | lm loss: 3.765102E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.811 | TFLOPs: 23.00 | +7: iteration 51970/ 60336 | consumed samples: 13304320 | consumed tokens: 27247247360 | elapsed time per iteration (s): 0.16 | learning rate: 2.857E-05 | global batch size: 256 | lm loss: 3.764560E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.082 | TFLOPs: 25.67 | +7: iteration 51980/ 60336 | consumed samples: 13306880 | consumed tokens: 27252490240 | elapsed time per iteration (s): 0.16 | learning rate: 2.855E-05 | global batch size: 256 | lm loss: 3.760952E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.094 | TFLOPs: 25.50 | +7: iteration 51990/ 60336 | consumed samples: 13309440 | consumed tokens: 27257733120 | elapsed time per iteration (s): 0.15 | learning rate: 2.853E-05 | global batch size: 256 | lm loss: 3.758519E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.375 | TFLOPs: 25.91 | +0: [2023-03-17 02:33:37,820] [INFO] [logging.py:68:log_dist] [Rank 0] step=52000, skipped=0, lr=[2.8512224096287758e-05, 2.8512224096287758e-05, 2.8512224096287758e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 52000/ 60336 | consumed samples: 13312000 | consumed tokens: 27262976000 | elapsed time per iteration (s): 0.16 | learning rate: 2.851E-05 | global batch size: 256 | lm loss: 3.753205E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.503 | TFLOPs: 25.71 | +0: steps: 52000 loss: 3.7378 iter time (s): 0.161 samples/sec: 1587.096 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 52000 | lm loss value: 3.896060E+00 | lm loss PPL: 4.920819E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 52000 to checkpoints_44m32b100m +0: [2023-03-17 02:33:37,892] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step52000 is begin to save! +0: [2023-03-17 02:33:37,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:33:37,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:33:37,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:33:37,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:33:37,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:33:37,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:33:37,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:33:37,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:33:37,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:33:37,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:33:37,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:33:38,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:33:38,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:33:38,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:33:38,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:33:38,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:33:38,016] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:33:38,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:33:38,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:33:38,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:33:38,025] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step52000/mp_rank_00_model_states.pt +0: [2023-03-17 02:33:38,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:33:38,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:33:38,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:33:38,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:33:38,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:33:38,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:33:38,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 02:33:38,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 02:33:38,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:33:38,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:33:38,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:33:38,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:33:38,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: successfully saved checkpoint at iteration 52000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 182.15 +7: iteration 52010/ 60336 | consumed samples: 13314560 | consumed tokens: 27268218880 | elapsed time per iteration (s): 0.18 | learning rate: 2.849E-05 | global batch size: 256 | lm loss: 3.759045E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.702 | TFLOPs: 22.15 | +7: iteration 52020/ 60336 | consumed samples: 13317120 | consumed tokens: 27273461760 | elapsed time per iteration (s): 0.16 | learning rate: 2.847E-05 | global batch size: 256 | lm loss: 3.753455E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.255 | TFLOPs: 25.60 | +7: iteration 52030/ 60336 | consumed samples: 13319680 | consumed tokens: 27278704640 | elapsed time per iteration (s): 0.16 | learning rate: 2.845E-05 | global batch size: 256 | lm loss: 3.764777E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.976 | TFLOPs: 25.37 | +7: iteration 52040/ 60336 | consumed samples: 13322240 | consumed tokens: 27283947520 | elapsed time per iteration (s): 0.16 | learning rate: 2.843E-05 | global batch size: 256 | lm loss: 3.761769E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.981 | TFLOPs: 25.42 | +7: iteration 52050/ 60336 | consumed samples: 13324800 | consumed tokens: 27289190400 | elapsed time per iteration (s): 0.17 | learning rate: 2.841E-05 | global batch size: 256 | lm loss: 3.750634E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.479 | TFLOPs: 23.88 | +7: iteration 52060/ 60336 | consumed samples: 13327360 | consumed tokens: 27294433280 | elapsed time per iteration (s): 0.16 | learning rate: 2.839E-05 | global batch size: 256 | lm loss: 3.751255E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.639 | TFLOPs: 25.35 | +7: iteration 52070/ 60336 | consumed samples: 13329920 | consumed tokens: 27299676160 | elapsed time per iteration (s): 0.15 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 3.753329E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.424 | TFLOPs: 26.24 | +7: iteration 52080/ 60336 | consumed samples: 13332480 | consumed tokens: 27304919040 | elapsed time per iteration (s): 0.16 | learning rate: 2.835E-05 | global batch size: 256 | lm loss: 3.755120E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.400 | TFLOPs: 25.11 | +7: iteration 52090/ 60336 | consumed samples: 13335040 | consumed tokens: 27310161920 | elapsed time per iteration (s): 0.16 | learning rate: 2.833E-05 | global batch size: 256 | lm loss: 3.752063E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.633 | TFLOPs: 24.76 | +7: iteration 52100/ 60336 | consumed samples: 13337600 | consumed tokens: 27315404800 | elapsed time per iteration (s): 0.16 | learning rate: 2.831E-05 | global batch size: 256 | lm loss: 3.761017E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.263 | TFLOPs: 25.88 | +7: iteration 52110/ 60336 | consumed samples: 13340160 | consumed tokens: 27320647680 | elapsed time per iteration (s): 0.16 | learning rate: 2.829E-05 | global batch size: 256 | lm loss: 3.758113E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.112 | TFLOPs: 24.36 | +7: iteration 52120/ 60336 | consumed samples: 13342720 | consumed tokens: 27325890560 | elapsed time per iteration (s): 0.15 | learning rate: 2.827E-05 | global batch size: 256 | lm loss: 3.759979E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.449 | TFLOPs: 26.04 | +7: iteration 52130/ 60336 | consumed samples: 13345280 | consumed tokens: 27331133440 | elapsed time per iteration (s): 0.16 | learning rate: 2.825E-05 | global batch size: 256 | lm loss: 3.751929E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.576 | TFLOPs: 24.65 | +7: iteration 52140/ 60336 | consumed samples: 13347840 | consumed tokens: 27336376320 | elapsed time per iteration (s): 0.15 | learning rate: 2.823E-05 | global batch size: 256 | lm loss: 3.756285E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.663 | TFLOPs: 26.01 | +7: iteration 52150/ 60336 | consumed samples: 13350400 | consumed tokens: 27341619200 | elapsed time per iteration (s): 0.16 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 3.752823E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.000 | TFLOPs: 25.36 | +7: iteration 52160/ 60336 | consumed samples: 13352960 | consumed tokens: 27346862080 | elapsed time per iteration (s): 0.16 | learning rate: 2.819E-05 | global batch size: 256 | lm loss: 3.759898E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.172 | TFLOPs: 25.82 | +7: iteration 52170/ 60336 | consumed samples: 13355520 | consumed tokens: 27352104960 | elapsed time per iteration (s): 0.16 | learning rate: 2.817E-05 | global batch size: 256 | lm loss: 3.753404E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.849 | TFLOPs: 25.86 | +7: iteration 52180/ 60336 | consumed samples: 13358080 | consumed tokens: 27357347840 | elapsed time per iteration (s): 0.16 | learning rate: 2.815E-05 | global batch size: 256 | lm loss: 3.758469E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.484 | TFLOPs: 24.50 | +7: iteration 52190/ 60336 | consumed samples: 13360640 | consumed tokens: 27362590720 | elapsed time per iteration (s): 0.16 | learning rate: 2.813E-05 | global batch size: 256 | lm loss: 3.763657E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.310 | TFLOPs: 25.00 | +7: iteration 52200/ 60336 | consumed samples: 13363200 | consumed tokens: 27367833600 | elapsed time per iteration (s): 0.16 | learning rate: 2.811E-05 | global batch size: 256 | lm loss: 3.748637E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.889 | TFLOPs: 24.96 | +7: iteration 52210/ 60336 | consumed samples: 13365760 | consumed tokens: 27373076480 | elapsed time per iteration (s): 0.15 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 3.752118E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.871 | TFLOPs: 26.00 | +7: iteration 52220/ 60336 | consumed samples: 13368320 | consumed tokens: 27378319360 | elapsed time per iteration (s): 0.16 | learning rate: 2.808E-05 | global batch size: 256 | lm loss: 3.771071E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.063 | TFLOPs: 25.03 | +7: iteration 52230/ 60336 | consumed samples: 13370880 | consumed tokens: 27383562240 | elapsed time per iteration (s): 0.15 | learning rate: 2.806E-05 | global batch size: 256 | lm loss: 3.756971E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.760 | TFLOPs: 25.92 | +7: iteration 52240/ 60336 | consumed samples: 13373440 | consumed tokens: 27388805120 | elapsed time per iteration (s): 0.16 | learning rate: 2.804E-05 | global batch size: 256 | lm loss: 3.769440E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.983 | TFLOPs: 25.15 | +7: iteration 52250/ 60336 | consumed samples: 13376000 | consumed tokens: 27394048000 | elapsed time per iteration (s): 0.16 | learning rate: 2.802E-05 | global batch size: 256 | lm loss: 3.749845E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.039 | TFLOPs: 25.74 | +7: iteration 52260/ 60336 | consumed samples: 13378560 | consumed tokens: 27399290880 | elapsed time per iteration (s): 0.16 | learning rate: 2.800E-05 | global batch size: 256 | lm loss: 3.755120E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.311 | TFLOPs: 25.85 | +7: iteration 52270/ 60336 | consumed samples: 13381120 | consumed tokens: 27404533760 | elapsed time per iteration (s): 0.16 | learning rate: 2.798E-05 | global batch size: 256 | lm loss: 3.761995E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.572 | TFLOPs: 25.46 | +7: iteration 52280/ 60336 | consumed samples: 13383680 | consumed tokens: 27409776640 | elapsed time per iteration (s): 0.16 | learning rate: 2.796E-05 | global batch size: 256 | lm loss: 3.754926E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.793 | TFLOPs: 25.25 | +7: iteration 52290/ 60336 | consumed samples: 13386240 | consumed tokens: 27415019520 | elapsed time per iteration (s): 0.16 | learning rate: 2.794E-05 | global batch size: 256 | lm loss: 3.762732E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.615 | TFLOPs: 24.91 | +7: iteration 52300/ 60336 | consumed samples: 13388800 | consumed tokens: 27420262400 | elapsed time per iteration (s): 0.16 | learning rate: 2.792E-05 | global batch size: 256 | lm loss: 3.759598E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.832 | TFLOPs: 25.61 | +7: iteration 52310/ 60336 | consumed samples: 13391360 | consumed tokens: 27425505280 | elapsed time per iteration (s): 0.16 | learning rate: 2.790E-05 | global batch size: 256 | lm loss: 3.755704E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.912 | TFLOPs: 25.48 | +7: iteration 52320/ 60336 | consumed samples: 13393920 | consumed tokens: 27430748160 | elapsed time per iteration (s): 0.16 | learning rate: 2.788E-05 | global batch size: 256 | lm loss: 3.773127E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.205 | TFLOPs: 24.94 | +7: iteration 52330/ 60336 | consumed samples: 13396480 | consumed tokens: 27435991040 | elapsed time per iteration (s): 0.16 | learning rate: 2.786E-05 | global batch size: 256 | lm loss: 3.768524E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.062 | TFLOPs: 25.74 | +7: iteration 52340/ 60336 | consumed samples: 13399040 | consumed tokens: 27441233920 | elapsed time per iteration (s): 0.16 | learning rate: 2.784E-05 | global batch size: 256 | lm loss: 3.769815E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.419 | TFLOPs: 25.74 | +7: iteration 52350/ 60336 | consumed samples: 13401600 | consumed tokens: 27446476800 | elapsed time per iteration (s): 0.17 | learning rate: 2.782E-05 | global batch size: 256 | lm loss: 3.753014E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.504 | TFLOPs: 24.24 | +7: iteration 52360/ 60336 | consumed samples: 13404160 | consumed tokens: 27451719680 | elapsed time per iteration (s): 0.16 | learning rate: 2.780E-05 | global batch size: 256 | lm loss: 3.757485E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.873 | TFLOPs: 24.93 | +7: iteration 52370/ 60336 | consumed samples: 13406720 | consumed tokens: 27456962560 | elapsed time per iteration (s): 0.16 | learning rate: 2.778E-05 | global batch size: 256 | lm loss: 3.768938E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.824 | TFLOPs: 24.92 | +7: iteration 52380/ 60336 | consumed samples: 13409280 | consumed tokens: 27462205440 | elapsed time per iteration (s): 0.17 | learning rate: 2.776E-05 | global batch size: 256 | lm loss: 3.766895E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.232 | TFLOPs: 23.95 | +7: iteration 52390/ 60336 | consumed samples: 13411840 | consumed tokens: 27467448320 | elapsed time per iteration (s): 0.16 | learning rate: 2.775E-05 | global batch size: 256 | lm loss: 3.769544E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.071 | TFLOPs: 24.94 | +7: iteration 52400/ 60336 | consumed samples: 13414400 | consumed tokens: 27472691200 | elapsed time per iteration (s): 0.16 | learning rate: 2.773E-05 | global batch size: 256 | lm loss: 3.746229E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.220 | TFLOPs: 24.91 | +7: iteration 52410/ 60336 | consumed samples: 13416960 | consumed tokens: 27477934080 | elapsed time per iteration (s): 0.16 | learning rate: 2.771E-05 | global batch size: 256 | lm loss: 3.753632E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.475 | TFLOPs: 24.96 | +7: iteration 52420/ 60336 | consumed samples: 13419520 | consumed tokens: 27483176960 | elapsed time per iteration (s): 0.16 | learning rate: 2.769E-05 | global batch size: 256 | lm loss: 3.759775E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.707 | TFLOPs: 25.24 | +7: iteration 52430/ 60336 | consumed samples: 13422080 | consumed tokens: 27488419840 | elapsed time per iteration (s): 0.16 | learning rate: 2.767E-05 | global batch size: 256 | lm loss: 3.753164E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.547 | TFLOPs: 24.94 | +7: iteration 52440/ 60336 | consumed samples: 13424640 | consumed tokens: 27493662720 | elapsed time per iteration (s): 0.16 | learning rate: 2.765E-05 | global batch size: 256 | lm loss: 3.778223E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.656 | TFLOPs: 25.48 | +7: iteration 52450/ 60336 | consumed samples: 13427200 | consumed tokens: 27498905600 | elapsed time per iteration (s): 0.15 | learning rate: 2.763E-05 | global batch size: 256 | lm loss: 3.754586E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.809 | TFLOPs: 26.16 | +7: iteration 52460/ 60336 | consumed samples: 13429760 | consumed tokens: 27504148480 | elapsed time per iteration (s): 0.15 | learning rate: 2.761E-05 | global batch size: 256 | lm loss: 3.753541E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.477 | TFLOPs: 26.17 | +7: iteration 52470/ 60336 | consumed samples: 13432320 | consumed tokens: 27509391360 | elapsed time per iteration (s): 0.16 | learning rate: 2.759E-05 | global batch size: 256 | lm loss: 3.756878E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.005 | TFLOPs: 25.86 | +7: iteration 52480/ 60336 | consumed samples: 13434880 | consumed tokens: 27514634240 | elapsed time per iteration (s): 0.15 | learning rate: 2.757E-05 | global batch size: 256 | lm loss: 3.743889E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.026 | TFLOPs: 26.14 | +7: iteration 52490/ 60336 | consumed samples: 13437440 | consumed tokens: 27519877120 | elapsed time per iteration (s): 0.16 | learning rate: 2.755E-05 | global batch size: 256 | lm loss: 3.752011E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.133 | TFLOPs: 25.83 | +7: iteration 52500/ 60336 | consumed samples: 13440000 | consumed tokens: 27525120000 | elapsed time per iteration (s): 0.16 | learning rate: 2.754E-05 | global batch size: 256 | lm loss: 3.749468E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.725 | TFLOPs: 25.79 | +7: iteration 52510/ 60336 | consumed samples: 13442560 | consumed tokens: 27530362880 | elapsed time per iteration (s): 0.15 | learning rate: 2.752E-05 | global batch size: 256 | lm loss: 3.743306E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.190 | TFLOPs: 26.15 | +7: iteration 52520/ 60336 | consumed samples: 13445120 | consumed tokens: 27535605760 | elapsed time per iteration (s): 0.15 | learning rate: 2.750E-05 | global batch size: 256 | lm loss: 3.765649E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.192 | TFLOPs: 26.15 | +7: iteration 52530/ 60336 | consumed samples: 13447680 | consumed tokens: 27540848640 | elapsed time per iteration (s): 0.16 | learning rate: 2.748E-05 | global batch size: 256 | lm loss: 3.755092E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.335 | TFLOPs: 25.35 | +7: iteration 52540/ 60336 | consumed samples: 13450240 | consumed tokens: 27546091520 | elapsed time per iteration (s): 0.16 | learning rate: 2.746E-05 | global batch size: 256 | lm loss: 3.760162E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.235 | TFLOPs: 25.28 | +7: iteration 52550/ 60336 | consumed samples: 13452800 | consumed tokens: 27551334400 | elapsed time per iteration (s): 0.16 | learning rate: 2.744E-05 | global batch size: 256 | lm loss: 3.769991E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.991 | TFLOPs: 25.81 | +7: iteration 52560/ 60336 | consumed samples: 13455360 | consumed tokens: 27556577280 | elapsed time per iteration (s): 0.16 | learning rate: 2.742E-05 | global batch size: 256 | lm loss: 3.757261E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.107 | TFLOPs: 24.84 | +7: iteration 52570/ 60336 | consumed samples: 13457920 | consumed tokens: 27561820160 | elapsed time per iteration (s): 0.16 | learning rate: 2.740E-05 | global batch size: 256 | lm loss: 3.770458E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.902 | TFLOPs: 25.07 | +7: iteration 52580/ 60336 | consumed samples: 13460480 | consumed tokens: 27567063040 | elapsed time per iteration (s): 0.16 | learning rate: 2.738E-05 | global batch size: 256 | lm loss: 3.766494E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.539 | TFLOPs: 25.37 | +7: iteration 52590/ 60336 | consumed samples: 13463040 | consumed tokens: 27572305920 | elapsed time per iteration (s): 0.16 | learning rate: 2.737E-05 | global batch size: 256 | lm loss: 3.759317E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.236 | TFLOPs: 24.92 | +7: iteration 52600/ 60336 | consumed samples: 13465600 | consumed tokens: 27577548800 | elapsed time per iteration (s): 0.16 | learning rate: 2.735E-05 | global batch size: 256 | lm loss: 3.750903E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.836 | TFLOPs: 25.15 | +7: iteration 52610/ 60336 | consumed samples: 13468160 | consumed tokens: 27582791680 | elapsed time per iteration (s): 0.16 | learning rate: 2.733E-05 | global batch size: 256 | lm loss: 3.748627E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.178 | TFLOPs: 24.94 | +7: iteration 52620/ 60336 | consumed samples: 13470720 | consumed tokens: 27588034560 | elapsed time per iteration (s): 0.16 | learning rate: 2.731E-05 | global batch size: 256 | lm loss: 3.753976E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.808 | TFLOPs: 25.43 | +7: iteration 52630/ 60336 | consumed samples: 13473280 | consumed tokens: 27593277440 | elapsed time per iteration (s): 0.16 | learning rate: 2.729E-05 | global batch size: 256 | lm loss: 3.757747E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.321 | TFLOPs: 25.29 | +7: iteration 52640/ 60336 | consumed samples: 13475840 | consumed tokens: 27598520320 | elapsed time per iteration (s): 0.15 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 3.749743E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.197 | TFLOPs: 26.10 | +7: iteration 52650/ 60336 | consumed samples: 13478400 | consumed tokens: 27603763200 | elapsed time per iteration (s): 0.16 | learning rate: 2.725E-05 | global batch size: 256 | lm loss: 3.762555E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.816 | TFLOPs: 25.42 | +7: iteration 52660/ 60336 | consumed samples: 13480960 | consumed tokens: 27609006080 | elapsed time per iteration (s): 0.16 | learning rate: 2.724E-05 | global batch size: 256 | lm loss: 3.749902E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.638 | TFLOPs: 25.76 | +7: iteration 52670/ 60336 | consumed samples: 13483520 | consumed tokens: 27614248960 | elapsed time per iteration (s): 0.16 | learning rate: 2.722E-05 | global batch size: 256 | lm loss: 3.778695E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.268 | TFLOPs: 24.58 | +7: iteration 52680/ 60336 | consumed samples: 13486080 | consumed tokens: 27619491840 | elapsed time per iteration (s): 0.16 | learning rate: 2.720E-05 | global batch size: 256 | lm loss: 3.753442E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.322 | TFLOPs: 24.61 | +7: iteration 52690/ 60336 | consumed samples: 13488640 | consumed tokens: 27624734720 | elapsed time per iteration (s): 0.16 | learning rate: 2.718E-05 | global batch size: 256 | lm loss: 3.777864E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.861 | TFLOPs: 25.34 | +7: iteration 52700/ 60336 | consumed samples: 13491200 | consumed tokens: 27629977600 | elapsed time per iteration (s): 0.16 | learning rate: 2.716E-05 | global batch size: 256 | lm loss: 3.756548E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.071 | TFLOPs: 25.25 | +7: iteration 52710/ 60336 | consumed samples: 13493760 | consumed tokens: 27635220480 | elapsed time per iteration (s): 0.16 | learning rate: 2.714E-05 | global batch size: 256 | lm loss: 3.757611E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.331 | TFLOPs: 25.05 | +7: iteration 52720/ 60336 | consumed samples: 13496320 | consumed tokens: 27640463360 | elapsed time per iteration (s): 0.16 | learning rate: 2.712E-05 | global batch size: 256 | lm loss: 3.765228E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.199 | TFLOPs: 25.77 | +7: iteration 52730/ 60336 | consumed samples: 13498880 | consumed tokens: 27645706240 | elapsed time per iteration (s): 0.15 | learning rate: 2.711E-05 | global batch size: 256 | lm loss: 3.744102E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.620 | TFLOPs: 26.06 | +7: iteration 52740/ 60336 | consumed samples: 13501440 | consumed tokens: 27650949120 | elapsed time per iteration (s): 0.16 | learning rate: 2.709E-05 | global batch size: 256 | lm loss: 3.766889E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.191 | TFLOPs: 25.72 | +7: iteration 52750/ 60336 | consumed samples: 13504000 | consumed tokens: 27656192000 | elapsed time per iteration (s): 0.16 | learning rate: 2.707E-05 | global batch size: 256 | lm loss: 3.752311E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.817 | TFLOPs: 25.65 | +7: iteration 52760/ 60336 | consumed samples: 13506560 | consumed tokens: 27661434880 | elapsed time per iteration (s): 0.16 | learning rate: 2.705E-05 | global batch size: 256 | lm loss: 3.749273E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.972 | TFLOPs: 25.75 | +7: iteration 52770/ 60336 | consumed samples: 13509120 | consumed tokens: 27666677760 | elapsed time per iteration (s): 0.16 | learning rate: 2.703E-05 | global batch size: 256 | lm loss: 3.758869E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.192 | TFLOPs: 25.10 | +7: iteration 52780/ 60336 | consumed samples: 13511680 | consumed tokens: 27671920640 | elapsed time per iteration (s): 0.16 | learning rate: 2.701E-05 | global batch size: 256 | lm loss: 3.759903E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.472 | TFLOPs: 25.46 | +7: iteration 52790/ 60336 | consumed samples: 13514240 | consumed tokens: 27677163520 | elapsed time per iteration (s): 0.16 | learning rate: 2.700E-05 | global batch size: 256 | lm loss: 3.752142E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.553 | TFLOPs: 25.26 | +7: iteration 52800/ 60336 | consumed samples: 13516800 | consumed tokens: 27682406400 | elapsed time per iteration (s): 0.16 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 3.763306E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.098 | TFLOPs: 25.41 | +7: iteration 52810/ 60336 | consumed samples: 13519360 | consumed tokens: 27687649280 | elapsed time per iteration (s): 0.16 | learning rate: 2.696E-05 | global batch size: 256 | lm loss: 3.764165E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.480 | TFLOPs: 25.52 | +7: iteration 52820/ 60336 | consumed samples: 13521920 | consumed tokens: 27692892160 | elapsed time per iteration (s): 0.16 | learning rate: 2.694E-05 | global batch size: 256 | lm loss: 3.757237E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.691 | TFLOPs: 25.09 | +7: iteration 52830/ 60336 | consumed samples: 13524480 | consumed tokens: 27698135040 | elapsed time per iteration (s): 0.16 | learning rate: 2.692E-05 | global batch size: 256 | lm loss: 3.762649E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.658 | TFLOPs: 25.65 | +7: iteration 52840/ 60336 | consumed samples: 13527040 | consumed tokens: 27703377920 | elapsed time per iteration (s): 0.16 | learning rate: 2.690E-05 | global batch size: 256 | lm loss: 3.752584E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.988 | TFLOPs: 25.70 | +7: iteration 52850/ 60336 | consumed samples: 13529600 | consumed tokens: 27708620800 | elapsed time per iteration (s): 0.15 | learning rate: 2.689E-05 | global batch size: 256 | lm loss: 3.757492E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.409 | TFLOPs: 26.13 | +7: iteration 52860/ 60336 | consumed samples: 13532160 | consumed tokens: 27713863680 | elapsed time per iteration (s): 0.18 | learning rate: 2.687E-05 | global batch size: 256 | lm loss: 3.751476E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1426.124 | TFLOPs: 22.37 | +7: iteration 52870/ 60336 | consumed samples: 13534720 | consumed tokens: 27719106560 | elapsed time per iteration (s): 0.16 | learning rate: 2.685E-05 | global batch size: 256 | lm loss: 3.755622E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.332 | TFLOPs: 25.36 | +7: iteration 52880/ 60336 | consumed samples: 13537280 | consumed tokens: 27724349440 | elapsed time per iteration (s): 0.15 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.747026E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.395 | TFLOPs: 25.99 | +7: iteration 52890/ 60336 | consumed samples: 13539840 | consumed tokens: 27729592320 | elapsed time per iteration (s): 0.16 | learning rate: 2.681E-05 | global batch size: 256 | lm loss: 3.755845E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.942 | TFLOPs: 25.89 | +7: iteration 52900/ 60336 | consumed samples: 13542400 | consumed tokens: 27734835200 | elapsed time per iteration (s): 0.16 | learning rate: 2.680E-05 | global batch size: 256 | lm loss: 3.763675E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.167 | TFLOPs: 25.63 | +7: iteration 52910/ 60336 | consumed samples: 13544960 | consumed tokens: 27740078080 | elapsed time per iteration (s): 0.16 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 3.760722E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.054 | TFLOPs: 25.41 | +7: iteration 52920/ 60336 | consumed samples: 13547520 | consumed tokens: 27745320960 | elapsed time per iteration (s): 0.16 | learning rate: 2.676E-05 | global batch size: 256 | lm loss: 3.764761E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.720 | TFLOPs: 25.15 | +7: iteration 52930/ 60336 | consumed samples: 13550080 | consumed tokens: 27750563840 | elapsed time per iteration (s): 0.16 | learning rate: 2.674E-05 | global batch size: 256 | lm loss: 3.759997E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.750 | TFLOPs: 24.98 | +7: iteration 52940/ 60336 | consumed samples: 13552640 | consumed tokens: 27755806720 | elapsed time per iteration (s): 0.16 | learning rate: 2.672E-05 | global batch size: 256 | lm loss: 3.755086E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.881 | TFLOPs: 25.72 | +7: iteration 52950/ 60336 | consumed samples: 13555200 | consumed tokens: 27761049600 | elapsed time per iteration (s): 0.16 | learning rate: 2.671E-05 | global batch size: 256 | lm loss: 3.760433E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.078 | TFLOPs: 25.47 | +7: iteration 52960/ 60336 | consumed samples: 13557760 | consumed tokens: 27766292480 | elapsed time per iteration (s): 0.16 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 3.754902E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.786 | TFLOPs: 25.70 | +7: iteration 52970/ 60336 | consumed samples: 13560320 | consumed tokens: 27771535360 | elapsed time per iteration (s): 0.15 | learning rate: 2.667E-05 | global batch size: 256 | lm loss: 3.753456E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.451 | TFLOPs: 26.13 | +7: iteration 52980/ 60336 | consumed samples: 13562880 | consumed tokens: 27776778240 | elapsed time per iteration (s): 0.16 | learning rate: 2.665E-05 | global batch size: 256 | lm loss: 3.762295E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.215 | TFLOPs: 25.52 | +7: iteration 52990/ 60336 | consumed samples: 13565440 | consumed tokens: 27782021120 | elapsed time per iteration (s): 0.16 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 3.755816E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.564 | TFLOPs: 25.48 | +7: iteration 53000/ 60336 | consumed samples: 13568000 | consumed tokens: 27787264000 | elapsed time per iteration (s): 0.16 | learning rate: 2.662E-05 | global batch size: 256 | lm loss: 3.749886E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.909 | TFLOPs: 25.62 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 53000 | lm loss value: 3.908243E+00 | lm loss PPL: 4.981134E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 53000 to checkpoints_44m32b100m +0: [2023-03-17 02:36:16,417] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step53000 is begin to save! +0: [2023-03-17 02:36:16,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:36:16,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:36:16,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:36:16,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:36:16,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:36:16,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:36:16,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:36:16,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:36:16,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:36:16,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:36:16,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:36:16,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:36:16,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:36:16,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:36:16,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:36:16,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:36:16,542] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:36:16,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:36:16,550] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:36:16,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:36:16,552] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step53000/mp_rank_00_model_states.pt +0: [2023-03-17 02:36:16,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:36:16,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:36:16,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:36:16,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:36:16,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:36:16,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:36:16,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: successfully saved checkpoint at iteration 53000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 181.39 +7: iteration 53010/ 60336 | consumed samples: 13570560 | consumed tokens: 27792506880 | elapsed time per iteration (s): 0.18 | learning rate: 2.660E-05 | global batch size: 256 | lm loss: 3.753847E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.214 | TFLOPs: 22.13 | +7: iteration 53020/ 60336 | consumed samples: 13573120 | consumed tokens: 27797749760 | elapsed time per iteration (s): 0.16 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 3.761691E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.647 | TFLOPs: 24.55 | +7: iteration 53030/ 60336 | consumed samples: 13575680 | consumed tokens: 27802992640 | elapsed time per iteration (s): 0.16 | learning rate: 2.656E-05 | global batch size: 256 | lm loss: 3.749036E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.986 | TFLOPs: 25.61 | +7: iteration 53040/ 60336 | consumed samples: 13578240 | consumed tokens: 27808235520 | elapsed time per iteration (s): 0.15 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 3.761429E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.846 | TFLOPs: 26.16 | +7: iteration 53050/ 60336 | consumed samples: 13580800 | consumed tokens: 27813478400 | elapsed time per iteration (s): 0.16 | learning rate: 2.653E-05 | global batch size: 256 | lm loss: 3.768828E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.351 | TFLOPs: 25.68 | +7: iteration 53060/ 60336 | consumed samples: 13583360 | consumed tokens: 27818721280 | elapsed time per iteration (s): 0.16 | learning rate: 2.651E-05 | global batch size: 256 | lm loss: 3.753907E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.125 | TFLOPs: 25.53 | +7: iteration 53070/ 60336 | consumed samples: 13585920 | consumed tokens: 27823964160 | elapsed time per iteration (s): 0.15 | learning rate: 2.649E-05 | global batch size: 256 | lm loss: 3.762043E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.899 | TFLOPs: 26.13 | +7: iteration 53080/ 60336 | consumed samples: 13588480 | consumed tokens: 27829207040 | elapsed time per iteration (s): 0.16 | learning rate: 2.647E-05 | global batch size: 256 | lm loss: 3.753893E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.988 | TFLOPs: 24.56 | +7: iteration 53090/ 60336 | consumed samples: 13591040 | consumed tokens: 27834449920 | elapsed time per iteration (s): 0.16 | learning rate: 2.646E-05 | global batch size: 256 | lm loss: 3.766038E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.747 | TFLOPs: 25.29 | +7: iteration 53100/ 60336 | consumed samples: 13593600 | consumed tokens: 27839692800 | elapsed time per iteration (s): 0.16 | learning rate: 2.644E-05 | global batch size: 256 | lm loss: 3.735549E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.113 | TFLOPs: 25.83 | +7: iteration 53110/ 60336 | consumed samples: 13596160 | consumed tokens: 27844935680 | elapsed time per iteration (s): 0.16 | learning rate: 2.642E-05 | global batch size: 256 | lm loss: 3.768147E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.963 | TFLOPs: 25.81 | +7: iteration 53120/ 60336 | consumed samples: 13598720 | consumed tokens: 27850178560 | elapsed time per iteration (s): 0.16 | learning rate: 2.640E-05 | global batch size: 256 | lm loss: 3.758377E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.412 | TFLOPs: 25.87 | +7: iteration 53130/ 60336 | consumed samples: 13601280 | consumed tokens: 27855421440 | elapsed time per iteration (s): 0.15 | learning rate: 2.639E-05 | global batch size: 256 | lm loss: 3.769046E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.441 | TFLOPs: 26.10 | +7: iteration 53140/ 60336 | consumed samples: 13603840 | consumed tokens: 27860664320 | elapsed time per iteration (s): 0.15 | learning rate: 2.637E-05 | global batch size: 256 | lm loss: 3.765923E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.649 | TFLOPs: 26.11 | +7: iteration 53150/ 60336 | consumed samples: 13606400 | consumed tokens: 27865907200 | elapsed time per iteration (s): 0.16 | learning rate: 2.635E-05 | global batch size: 256 | lm loss: 3.750652E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.203 | TFLOPs: 25.77 | +7: iteration 53160/ 60336 | consumed samples: 13608960 | consumed tokens: 27871150080 | elapsed time per iteration (s): 0.15 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 3.746955E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.661 | TFLOPs: 26.14 | +7: iteration 53170/ 60336 | consumed samples: 13611520 | consumed tokens: 27876392960 | elapsed time per iteration (s): 0.16 | learning rate: 2.632E-05 | global batch size: 256 | lm loss: 3.753468E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.149 | TFLOPs: 25.82 | +7: iteration 53180/ 60336 | consumed samples: 13614080 | consumed tokens: 27881635840 | elapsed time per iteration (s): 0.15 | learning rate: 2.630E-05 | global batch size: 256 | lm loss: 3.754622E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.829 | TFLOPs: 26.17 | +7: iteration 53190/ 60336 | consumed samples: 13616640 | consumed tokens: 27886878720 | elapsed time per iteration (s): 0.15 | learning rate: 2.628E-05 | global batch size: 256 | lm loss: 3.748571E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.966 | TFLOPs: 26.17 | +7: iteration 53200/ 60336 | consumed samples: 13619200 | consumed tokens: 27892121600 | elapsed time per iteration (s): 0.15 | learning rate: 2.626E-05 | global batch size: 256 | lm loss: 3.744739E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.417 | TFLOPs: 26.15 | +7: iteration 53210/ 60336 | consumed samples: 13621760 | consumed tokens: 27897364480 | elapsed time per iteration (s): 0.15 | learning rate: 2.625E-05 | global batch size: 256 | lm loss: 3.761996E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.163 | TFLOPs: 26.10 | +7: iteration 53220/ 60336 | consumed samples: 13624320 | consumed tokens: 27902607360 | elapsed time per iteration (s): 0.15 | learning rate: 2.623E-05 | global batch size: 256 | lm loss: 3.747213E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.576 | TFLOPs: 26.04 | +7: iteration 53230/ 60336 | consumed samples: 13626880 | consumed tokens: 27907850240 | elapsed time per iteration (s): 0.15 | learning rate: 2.621E-05 | global batch size: 256 | lm loss: 3.757849E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.437 | TFLOPs: 25.95 | +7: iteration 53240/ 60336 | consumed samples: 13629440 | consumed tokens: 27913093120 | elapsed time per iteration (s): 0.15 | learning rate: 2.620E-05 | global batch size: 256 | lm loss: 3.749314E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.070 | TFLOPs: 26.13 | +7: iteration 53250/ 60336 | consumed samples: 13632000 | consumed tokens: 27918336000 | elapsed time per iteration (s): 0.15 | learning rate: 2.618E-05 | global batch size: 256 | lm loss: 3.754844E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.118 | TFLOPs: 26.03 | +7: iteration 53260/ 60336 | consumed samples: 13634560 | consumed tokens: 27923578880 | elapsed time per iteration (s): 0.15 | learning rate: 2.616E-05 | global batch size: 256 | lm loss: 3.749547E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.699 | TFLOPs: 26.08 | +7: iteration 53270/ 60336 | consumed samples: 13637120 | consumed tokens: 27928821760 | elapsed time per iteration (s): 0.16 | learning rate: 2.614E-05 | global batch size: 256 | lm loss: 3.752996E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.285 | TFLOPs: 25.55 | +7: iteration 53280/ 60336 | consumed samples: 13639680 | consumed tokens: 27934064640 | elapsed time per iteration (s): 0.16 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 3.750983E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.015 | TFLOPs: 25.81 | +7: iteration 53290/ 60336 | consumed samples: 13642240 | consumed tokens: 27939307520 | elapsed time per iteration (s): 0.15 | learning rate: 2.611E-05 | global batch size: 256 | lm loss: 3.750544E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.786 | TFLOPs: 26.16 | +7: iteration 53300/ 60336 | consumed samples: 13644800 | consumed tokens: 27944550400 | elapsed time per iteration (s): 0.15 | learning rate: 2.609E-05 | global batch size: 256 | lm loss: 3.753248E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.135 | TFLOPs: 26.14 | +7: iteration 53310/ 60336 | consumed samples: 13647360 | consumed tokens: 27949793280 | elapsed time per iteration (s): 0.15 | learning rate: 2.608E-05 | global batch size: 256 | lm loss: 3.749213E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.873 | TFLOPs: 26.14 | +7: iteration 53320/ 60336 | consumed samples: 13649920 | consumed tokens: 27955036160 | elapsed time per iteration (s): 0.15 | learning rate: 2.606E-05 | global batch size: 256 | lm loss: 3.753017E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.233 | TFLOPs: 26.15 | +7: iteration 53330/ 60336 | consumed samples: 13652480 | consumed tokens: 27960279040 | elapsed time per iteration (s): 0.16 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 3.757701E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.577 | TFLOPs: 25.71 | +7: iteration 53340/ 60336 | consumed samples: 13655040 | consumed tokens: 27965521920 | elapsed time per iteration (s): 0.15 | learning rate: 2.602E-05 | global batch size: 256 | lm loss: 3.771043E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.180 | TFLOPs: 26.16 | +7: iteration 53350/ 60336 | consumed samples: 13657600 | consumed tokens: 27970764800 | elapsed time per iteration (s): 0.15 | learning rate: 2.601E-05 | global batch size: 256 | lm loss: 3.761759E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.598 | TFLOPs: 26.18 | +7: iteration 53360/ 60336 | consumed samples: 13660160 | consumed tokens: 27976007680 | elapsed time per iteration (s): 0.15 | learning rate: 2.599E-05 | global batch size: 256 | lm loss: 3.761891E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.136 | TFLOPs: 26.16 | +7: iteration 53370/ 60336 | consumed samples: 13662720 | consumed tokens: 27981250560 | elapsed time per iteration (s): 0.16 | learning rate: 2.597E-05 | global batch size: 256 | lm loss: 3.751052E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.644 | TFLOPs: 25.71 | +7: iteration 53380/ 60336 | consumed samples: 13665280 | consumed tokens: 27986493440 | elapsed time per iteration (s): 0.16 | learning rate: 2.596E-05 | global batch size: 256 | lm loss: 3.750895E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.251 | TFLOPs: 25.83 | +7: iteration 53390/ 60336 | consumed samples: 13667840 | consumed tokens: 27991736320 | elapsed time per iteration (s): 0.15 | learning rate: 2.594E-05 | global batch size: 256 | lm loss: 3.753238E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.640 | TFLOPs: 26.15 | +7: iteration 53400/ 60336 | consumed samples: 13670400 | consumed tokens: 27996979200 | elapsed time per iteration (s): 0.15 | learning rate: 2.592E-05 | global batch size: 256 | lm loss: 3.747202E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.012 | TFLOPs: 26.16 | +7: iteration 53410/ 60336 | consumed samples: 13672960 | consumed tokens: 28002222080 | elapsed time per iteration (s): 0.16 | learning rate: 2.591E-05 | global batch size: 256 | lm loss: 3.765149E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.381 | TFLOPs: 25.54 | +7: iteration 53420/ 60336 | consumed samples: 13675520 | consumed tokens: 28007464960 | elapsed time per iteration (s): 0.16 | learning rate: 2.589E-05 | global batch size: 256 | lm loss: 3.753111E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.644 | TFLOPs: 25.60 | +7: iteration 53430/ 60336 | consumed samples: 13678080 | consumed tokens: 28012707840 | elapsed time per iteration (s): 0.15 | learning rate: 2.587E-05 | global batch size: 256 | lm loss: 3.749840E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.776 | TFLOPs: 26.17 | +7: iteration 53440/ 60336 | consumed samples: 13680640 | consumed tokens: 28017950720 | elapsed time per iteration (s): 0.15 | learning rate: 2.585E-05 | global batch size: 256 | lm loss: 3.763926E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.415 | TFLOPs: 26.18 | +7: iteration 53450/ 60336 | consumed samples: 13683200 | consumed tokens: 28023193600 | elapsed time per iteration (s): 0.15 | learning rate: 2.584E-05 | global batch size: 256 | lm loss: 3.763193E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.766 | TFLOPs: 26.17 | +7: iteration 53460/ 60336 | consumed samples: 13685760 | consumed tokens: 28028436480 | elapsed time per iteration (s): 0.15 | learning rate: 2.582E-05 | global batch size: 256 | lm loss: 3.759464E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.977 | TFLOPs: 26.19 | +7: iteration 53470/ 60336 | consumed samples: 13688320 | consumed tokens: 28033679360 | elapsed time per iteration (s): 0.16 | learning rate: 2.580E-05 | global batch size: 256 | lm loss: 3.758862E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.898 | TFLOPs: 25.83 | +7: iteration 53480/ 60336 | consumed samples: 13690880 | consumed tokens: 28038922240 | elapsed time per iteration (s): 0.15 | learning rate: 2.579E-05 | global batch size: 256 | lm loss: 3.753812E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.618 | TFLOPs: 26.14 | +7: iteration 53490/ 60336 | consumed samples: 13693440 | consumed tokens: 28044165120 | elapsed time per iteration (s): 0.15 | learning rate: 2.577E-05 | global batch size: 256 | lm loss: 3.764169E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.879 | TFLOPs: 26.17 | +7: iteration 53500/ 60336 | consumed samples: 13696000 | consumed tokens: 28049408000 | elapsed time per iteration (s): 0.16 | learning rate: 2.575E-05 | global batch size: 256 | lm loss: 3.760382E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.110 | TFLOPs: 25.80 | +7: iteration 53510/ 60336 | consumed samples: 13698560 | consumed tokens: 28054650880 | elapsed time per iteration (s): 0.15 | learning rate: 2.574E-05 | global batch size: 256 | lm loss: 3.750118E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.381 | TFLOPs: 26.18 | +7: iteration 53520/ 60336 | consumed samples: 13701120 | consumed tokens: 28059893760 | elapsed time per iteration (s): 0.15 | learning rate: 2.572E-05 | global batch size: 256 | lm loss: 3.755424E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.943 | TFLOPs: 26.19 | +7: iteration 53530/ 60336 | consumed samples: 13703680 | consumed tokens: 28065136640 | elapsed time per iteration (s): 0.15 | learning rate: 2.570E-05 | global batch size: 256 | lm loss: 3.765239E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.748 | TFLOPs: 26.17 | +7: iteration 53540/ 60336 | consumed samples: 13706240 | consumed tokens: 28070379520 | elapsed time per iteration (s): 0.15 | learning rate: 2.569E-05 | global batch size: 256 | lm loss: 3.761709E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.734 | TFLOPs: 26.09 | +7: iteration 53550/ 60336 | consumed samples: 13708800 | consumed tokens: 28075622400 | elapsed time per iteration (s): 0.16 | learning rate: 2.567E-05 | global batch size: 256 | lm loss: 3.747717E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.479 | TFLOPs: 25.73 | +7: iteration 53560/ 60336 | consumed samples: 13711360 | consumed tokens: 28080865280 | elapsed time per iteration (s): 0.15 | learning rate: 2.566E-05 | global batch size: 256 | lm loss: 3.762661E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.305 | TFLOPs: 26.12 | +7: iteration 53570/ 60336 | consumed samples: 13713920 | consumed tokens: 28086108160 | elapsed time per iteration (s): 0.15 | learning rate: 2.564E-05 | global batch size: 256 | lm loss: 3.757714E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.151 | TFLOPs: 26.08 | +7: iteration 53580/ 60336 | consumed samples: 13716480 | consumed tokens: 28091351040 | elapsed time per iteration (s): 0.15 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 3.762975E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.238 | TFLOPs: 26.12 | +7: iteration 53590/ 60336 | consumed samples: 13719040 | consumed tokens: 28096593920 | elapsed time per iteration (s): 0.16 | learning rate: 2.561E-05 | global batch size: 256 | lm loss: 3.747432E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.951 | TFLOPs: 25.39 | +7: iteration 53600/ 60336 | consumed samples: 13721600 | consumed tokens: 28101836800 | elapsed time per iteration (s): 0.15 | learning rate: 2.559E-05 | global batch size: 256 | lm loss: 3.756554E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.872 | TFLOPs: 26.14 | +7: iteration 53610/ 60336 | consumed samples: 13724160 | consumed tokens: 28107079680 | elapsed time per iteration (s): 0.16 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 3.753520E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.445 | TFLOPs: 25.54 | +7: iteration 53620/ 60336 | consumed samples: 13726720 | consumed tokens: 28112322560 | elapsed time per iteration (s): 0.15 | learning rate: 2.556E-05 | global batch size: 256 | lm loss: 3.743839E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.938 | TFLOPs: 26.11 | +7: iteration 53630/ 60336 | consumed samples: 13729280 | consumed tokens: 28117565440 | elapsed time per iteration (s): 0.16 | learning rate: 2.554E-05 | global batch size: 256 | lm loss: 3.747956E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.027 | TFLOPs: 25.61 | +7: iteration 53640/ 60336 | consumed samples: 13731840 | consumed tokens: 28122808320 | elapsed time per iteration (s): 0.15 | learning rate: 2.552E-05 | global batch size: 256 | lm loss: 3.774252E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.826 | TFLOPs: 26.12 | +7: iteration 53650/ 60336 | consumed samples: 13734400 | consumed tokens: 28128051200 | elapsed time per iteration (s): 0.15 | learning rate: 2.551E-05 | global batch size: 256 | lm loss: 3.746452E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.708 | TFLOPs: 26.12 | +7: iteration 53660/ 60336 | consumed samples: 13736960 | consumed tokens: 28133294080 | elapsed time per iteration (s): 0.15 | learning rate: 2.549E-05 | global batch size: 256 | lm loss: 3.763741E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.866 | TFLOPs: 26.11 | +7: iteration 53670/ 60336 | consumed samples: 13739520 | consumed tokens: 28138536960 | elapsed time per iteration (s): 0.15 | learning rate: 2.547E-05 | global batch size: 256 | lm loss: 3.763154E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.097 | TFLOPs: 26.11 | +7: iteration 53680/ 60336 | consumed samples: 13742080 | consumed tokens: 28143779840 | elapsed time per iteration (s): 0.16 | learning rate: 2.546E-05 | global batch size: 256 | lm loss: 3.765026E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.048 | TFLOPs: 24.97 | +7: iteration 53690/ 60336 | consumed samples: 13744640 | consumed tokens: 28149022720 | elapsed time per iteration (s): 0.15 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 3.752957E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.489 | TFLOPs: 26.13 | +7: iteration 53700/ 60336 | consumed samples: 13747200 | consumed tokens: 28154265600 | elapsed time per iteration (s): 0.17 | learning rate: 2.543E-05 | global batch size: 256 | lm loss: 3.752428E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.162 | TFLOPs: 24.14 | +7: iteration 53710/ 60336 | consumed samples: 13749760 | consumed tokens: 28159508480 | elapsed time per iteration (s): 0.16 | learning rate: 2.541E-05 | global batch size: 256 | lm loss: 3.762531E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.845 | TFLOPs: 25.78 | +7: iteration 53720/ 60336 | consumed samples: 13752320 | consumed tokens: 28164751360 | elapsed time per iteration (s): 0.15 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 3.766740E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.973 | TFLOPs: 26.19 | +7: iteration 53730/ 60336 | consumed samples: 13754880 | consumed tokens: 28169994240 | elapsed time per iteration (s): 0.15 | learning rate: 2.538E-05 | global batch size: 256 | lm loss: 3.758939E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.689 | TFLOPs: 26.20 | +7: iteration 53740/ 60336 | consumed samples: 13757440 | consumed tokens: 28175237120 | elapsed time per iteration (s): 0.15 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 3.774521E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.391 | TFLOPs: 26.20 | +7: iteration 53750/ 60336 | consumed samples: 13760000 | consumed tokens: 28180480000 | elapsed time per iteration (s): 0.15 | learning rate: 2.535E-05 | global batch size: 256 | lm loss: 3.762576E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.828 | TFLOPs: 26.19 | +7: iteration 53760/ 60336 | consumed samples: 13762560 | consumed tokens: 28185722880 | elapsed time per iteration (s): 0.15 | learning rate: 2.533E-05 | global batch size: 256 | lm loss: 3.753845E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.475 | TFLOPs: 26.20 | +7: iteration 53770/ 60336 | consumed samples: 13765120 | consumed tokens: 28190965760 | elapsed time per iteration (s): 0.15 | learning rate: 2.531E-05 | global batch size: 256 | lm loss: 3.759817E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.165 | TFLOPs: 26.19 | +7: iteration 53780/ 60336 | consumed samples: 13767680 | consumed tokens: 28196208640 | elapsed time per iteration (s): 0.15 | learning rate: 2.530E-05 | global batch size: 256 | lm loss: 3.760764E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.943 | TFLOPs: 26.14 | +7: iteration 53790/ 60336 | consumed samples: 13770240 | consumed tokens: 28201451520 | elapsed time per iteration (s): 0.15 | learning rate: 2.528E-05 | global batch size: 256 | lm loss: 3.772197E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.966 | TFLOPs: 26.14 | +7: iteration 53800/ 60336 | consumed samples: 13772800 | consumed tokens: 28206694400 | elapsed time per iteration (s): 0.15 | learning rate: 2.527E-05 | global batch size: 256 | lm loss: 3.762050E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.867 | TFLOPs: 26.19 | +7: iteration 53810/ 60336 | consumed samples: 13775360 | consumed tokens: 28211937280 | elapsed time per iteration (s): 0.15 | learning rate: 2.525E-05 | global batch size: 256 | lm loss: 3.749395E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.261 | TFLOPs: 26.16 | +7: iteration 53820/ 60336 | consumed samples: 13777920 | consumed tokens: 28217180160 | elapsed time per iteration (s): 0.16 | learning rate: 2.523E-05 | global batch size: 256 | lm loss: 3.764129E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.276 | TFLOPs: 25.61 | +7: iteration 53830/ 60336 | consumed samples: 13780480 | consumed tokens: 28222423040 | elapsed time per iteration (s): 0.15 | learning rate: 2.522E-05 | global batch size: 256 | lm loss: 3.753141E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.977 | TFLOPs: 26.17 | +7: iteration 53840/ 60336 | consumed samples: 13783040 | consumed tokens: 28227665920 | elapsed time per iteration (s): 0.15 | learning rate: 2.520E-05 | global batch size: 256 | lm loss: 3.749070E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.320 | TFLOPs: 26.09 | +7: iteration 53850/ 60336 | consumed samples: 13785600 | consumed tokens: 28232908800 | elapsed time per iteration (s): 0.16 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 3.737983E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.662 | TFLOPs: 25.75 | +7: iteration 53860/ 60336 | consumed samples: 13788160 | consumed tokens: 28238151680 | elapsed time per iteration (s): 0.15 | learning rate: 2.517E-05 | global batch size: 256 | lm loss: 3.759546E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.070 | TFLOPs: 26.07 | +7: iteration 53870/ 60336 | consumed samples: 13790720 | consumed tokens: 28243394560 | elapsed time per iteration (s): 0.15 | learning rate: 2.515E-05 | global batch size: 256 | lm loss: 3.758319E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.565 | TFLOPs: 26.07 | +7: iteration 53880/ 60336 | consumed samples: 13793280 | consumed tokens: 28248637440 | elapsed time per iteration (s): 0.16 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 3.756303E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.363 | TFLOPs: 25.62 | +7: iteration 53890/ 60336 | consumed samples: 13795840 | consumed tokens: 28253880320 | elapsed time per iteration (s): 0.15 | learning rate: 2.512E-05 | global batch size: 256 | lm loss: 3.761002E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.467 | TFLOPs: 26.15 | +7: iteration 53900/ 60336 | consumed samples: 13798400 | consumed tokens: 28259123200 | elapsed time per iteration (s): 0.15 | learning rate: 2.511E-05 | global batch size: 256 | lm loss: 3.756772E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.610 | TFLOPs: 26.09 | +7: iteration 53910/ 60336 | consumed samples: 13800960 | consumed tokens: 28264366080 | elapsed time per iteration (s): 0.15 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 3.754226E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.551 | TFLOPs: 26.07 | +7: iteration 53920/ 60336 | consumed samples: 13803520 | consumed tokens: 28269608960 | elapsed time per iteration (s): 0.15 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 3.744384E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.313 | TFLOPs: 26.08 | +7: iteration 53930/ 60336 | consumed samples: 13806080 | consumed tokens: 28274851840 | elapsed time per iteration (s): 0.15 | learning rate: 2.506E-05 | global batch size: 256 | lm loss: 3.758587E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.929 | TFLOPs: 26.09 | +7: iteration 53940/ 60336 | consumed samples: 13808640 | consumed tokens: 28280094720 | elapsed time per iteration (s): 0.15 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.757528E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.430 | TFLOPs: 26.09 | +7: iteration 53950/ 60336 | consumed samples: 13811200 | consumed tokens: 28285337600 | elapsed time per iteration (s): 0.15 | learning rate: 2.503E-05 | global batch size: 256 | lm loss: 3.758210E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.704 | TFLOPs: 26.09 | +7: iteration 53960/ 60336 | consumed samples: 13813760 | consumed tokens: 28290580480 | elapsed time per iteration (s): 0.16 | learning rate: 2.501E-05 | global batch size: 256 | lm loss: 3.767072E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.495 | TFLOPs: 25.60 | +7: iteration 53970/ 60336 | consumed samples: 13816320 | consumed tokens: 28295823360 | elapsed time per iteration (s): 0.15 | learning rate: 2.500E-05 | global batch size: 256 | lm loss: 3.760620E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.047 | TFLOPs: 26.11 | +7: iteration 53980/ 60336 | consumed samples: 13818880 | consumed tokens: 28301066240 | elapsed time per iteration (s): 0.16 | learning rate: 2.498E-05 | global batch size: 256 | lm loss: 3.763905E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.805 | TFLOPs: 25.47 | +7: iteration 53990/ 60336 | consumed samples: 13821440 | consumed tokens: 28306309120 | elapsed time per iteration (s): 0.16 | learning rate: 2.497E-05 | global batch size: 256 | lm loss: 3.759918E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.806 | TFLOPs: 25.64 | +0: [2023-03-17 02:38:51,501] [INFO] [logging.py:68:log_dist] [Rank 0] step=54000, skipped=0, lr=[2.495114368327582e-05, 2.495114368327582e-05, 2.495114368327582e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 54000/ 60336 | consumed samples: 13824000 | consumed tokens: 28311552000 | elapsed time per iteration (s): 0.16 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 3.745620E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.589 | TFLOPs: 25.73 | +0: steps: 54000 loss: 3.8018 iter time (s): 0.156 samples/sec: 1644.304 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 54000 | lm loss value: 3.891078E+00 | lm loss PPL: 4.896366E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 54000 to checkpoints_44m32b100m +0: [2023-03-17 02:38:51,574] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step54000 is begin to save! +0: [2023-03-17 02:38:51,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:38:51,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:38:51,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:38:51,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:38:51,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:38:51,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:38:51,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:38:51,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:38:51,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:38:51,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:38:51,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:38:51,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:38:51,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:38:51,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:38:51,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:38:51,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:38:51,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:38:51,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:38:51,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:38:51,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:38:51,714] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step54000/mp_rank_00_model_states.pt +0: [2023-03-17 02:38:51,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:38:51,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:51,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:51,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:51,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:51,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:38:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:38:51,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:51,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: successfully saved checkpoint at iteration 54000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 184.81 +7: iteration 54010/ 60336 | consumed samples: 13826560 | consumed tokens: 28316794880 | elapsed time per iteration (s): 0.18 | learning rate: 2.494E-05 | global batch size: 256 | lm loss: 3.760551E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.404 | TFLOPs: 22.50 | +7: iteration 54020/ 60336 | consumed samples: 13829120 | consumed tokens: 28322037760 | elapsed time per iteration (s): 0.16 | learning rate: 2.492E-05 | global batch size: 256 | lm loss: 3.759369E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.011 | TFLOPs: 25.58 | +7: iteration 54030/ 60336 | consumed samples: 13831680 | consumed tokens: 28327280640 | elapsed time per iteration (s): 0.15 | learning rate: 2.490E-05 | global batch size: 256 | lm loss: 3.752317E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.630 | TFLOPs: 26.18 | +7: iteration 54040/ 60336 | consumed samples: 13834240 | consumed tokens: 28332523520 | elapsed time per iteration (s): 0.15 | learning rate: 2.489E-05 | global batch size: 256 | lm loss: 3.761123E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.290 | TFLOPs: 26.16 | +7: iteration 54050/ 60336 | consumed samples: 13836800 | consumed tokens: 28337766400 | elapsed time per iteration (s): 0.15 | learning rate: 2.487E-05 | global batch size: 256 | lm loss: 3.745191E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.516 | TFLOPs: 26.18 | +7: iteration 54060/ 60336 | consumed samples: 13839360 | consumed tokens: 28343009280 | elapsed time per iteration (s): 0.15 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.755597E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.221 | TFLOPs: 26.22 | +7: iteration 54070/ 60336 | consumed samples: 13841920 | consumed tokens: 28348252160 | elapsed time per iteration (s): 0.15 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 3.753091E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.188 | TFLOPs: 26.22 | +7: iteration 54080/ 60336 | consumed samples: 13844480 | consumed tokens: 28353495040 | elapsed time per iteration (s): 0.15 | learning rate: 2.483E-05 | global batch size: 256 | lm loss: 3.756802E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.849 | TFLOPs: 26.19 | +7: iteration 54090/ 60336 | consumed samples: 13847040 | consumed tokens: 28358737920 | elapsed time per iteration (s): 0.15 | learning rate: 2.481E-05 | global batch size: 256 | lm loss: 3.745017E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.969 | TFLOPs: 26.17 | +7: iteration 54100/ 60336 | consumed samples: 13849600 | consumed tokens: 28363980800 | elapsed time per iteration (s): 0.15 | learning rate: 2.480E-05 | global batch size: 256 | lm loss: 3.763847E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.553 | TFLOPs: 26.07 | +7: iteration 54110/ 60336 | consumed samples: 13852160 | consumed tokens: 28369223680 | elapsed time per iteration (s): 0.15 | learning rate: 2.478E-05 | global batch size: 256 | lm loss: 3.752530E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.883 | TFLOPs: 26.19 | +7: iteration 54120/ 60336 | consumed samples: 13854720 | consumed tokens: 28374466560 | elapsed time per iteration (s): 0.15 | learning rate: 2.477E-05 | global batch size: 256 | lm loss: 3.764868E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.238 | TFLOPs: 26.18 | +7: iteration 54130/ 60336 | consumed samples: 13857280 | consumed tokens: 28379709440 | elapsed time per iteration (s): 0.15 | learning rate: 2.475E-05 | global batch size: 256 | lm loss: 3.754839E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.049 | TFLOPs: 26.17 | +7: iteration 54140/ 60336 | consumed samples: 13859840 | consumed tokens: 28384952320 | elapsed time per iteration (s): 0.15 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 3.759615E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.859 | TFLOPs: 26.17 | +7: iteration 54150/ 60336 | consumed samples: 13862400 | consumed tokens: 28390195200 | elapsed time per iteration (s): 0.15 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 3.761587E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.013 | TFLOPs: 26.17 | +7: iteration 54160/ 60336 | consumed samples: 13864960 | consumed tokens: 28395438080 | elapsed time per iteration (s): 0.15 | learning rate: 2.471E-05 | global batch size: 256 | lm loss: 3.747675E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.293 | TFLOPs: 26.16 | +7: iteration 54170/ 60336 | consumed samples: 13867520 | consumed tokens: 28400680960 | elapsed time per iteration (s): 0.15 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 3.753031E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.794 | TFLOPs: 26.16 | +7: iteration 54180/ 60336 | consumed samples: 13870080 | consumed tokens: 28405923840 | elapsed time per iteration (s): 0.15 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 3.762074E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.185 | TFLOPs: 26.15 | +7: iteration 54190/ 60336 | consumed samples: 13872640 | consumed tokens: 28411166720 | elapsed time per iteration (s): 0.15 | learning rate: 2.466E-05 | global batch size: 256 | lm loss: 3.764005E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.903 | TFLOPs: 26.22 | +7: iteration 54200/ 60336 | consumed samples: 13875200 | consumed tokens: 28416409600 | elapsed time per iteration (s): 0.15 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 3.751995E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.022 | TFLOPs: 26.21 | +7: iteration 54210/ 60336 | consumed samples: 13877760 | consumed tokens: 28421652480 | elapsed time per iteration (s): 0.15 | learning rate: 2.463E-05 | global batch size: 256 | lm loss: 3.764078E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.153 | TFLOPs: 25.94 | +7: iteration 54220/ 60336 | consumed samples: 13880320 | consumed tokens: 28426895360 | elapsed time per iteration (s): 0.15 | learning rate: 2.462E-05 | global batch size: 256 | lm loss: 3.764245E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.202 | TFLOPs: 26.19 | +7: iteration 54230/ 60336 | consumed samples: 13882880 | consumed tokens: 28432138240 | elapsed time per iteration (s): 0.15 | learning rate: 2.460E-05 | global batch size: 256 | lm loss: 3.769321E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.209 | TFLOPs: 26.19 | +7: iteration 54240/ 60336 | consumed samples: 13885440 | consumed tokens: 28437381120 | elapsed time per iteration (s): 0.15 | learning rate: 2.459E-05 | global batch size: 256 | lm loss: 3.754002E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.770 | TFLOPs: 26.22 | +7: iteration 54250/ 60336 | consumed samples: 13888000 | consumed tokens: 28442624000 | elapsed time per iteration (s): 0.15 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 3.758174E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.197 | TFLOPs: 26.21 | +7: iteration 54260/ 60336 | consumed samples: 13890560 | consumed tokens: 28447866880 | elapsed time per iteration (s): 0.15 | learning rate: 2.456E-05 | global batch size: 256 | lm loss: 3.749875E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.992 | TFLOPs: 26.21 | +7: iteration 54270/ 60336 | consumed samples: 13893120 | consumed tokens: 28453109760 | elapsed time per iteration (s): 0.15 | learning rate: 2.454E-05 | global batch size: 256 | lm loss: 3.755234E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.978 | TFLOPs: 26.21 | +7: iteration 54280/ 60336 | consumed samples: 13895680 | consumed tokens: 28458352640 | elapsed time per iteration (s): 0.15 | learning rate: 2.453E-05 | global batch size: 256 | lm loss: 3.752225E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.098 | TFLOPs: 26.21 | +7: iteration 54290/ 60336 | consumed samples: 13898240 | consumed tokens: 28463595520 | elapsed time per iteration (s): 0.16 | learning rate: 2.451E-05 | global batch size: 256 | lm loss: 3.754219E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.660 | TFLOPs: 25.81 | +7: iteration 54300/ 60336 | consumed samples: 13900800 | consumed tokens: 28468838400 | elapsed time per iteration (s): 0.15 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 3.755538E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.175 | TFLOPs: 26.21 | +7: iteration 54310/ 60336 | consumed samples: 13903360 | consumed tokens: 28474081280 | elapsed time per iteration (s): 0.15 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 3.768453E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.213 | TFLOPs: 26.21 | +7: iteration 54320/ 60336 | consumed samples: 13905920 | consumed tokens: 28479324160 | elapsed time per iteration (s): 0.15 | learning rate: 2.447E-05 | global batch size: 256 | lm loss: 3.747458E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.648 | TFLOPs: 26.20 | +7: iteration 54330/ 60336 | consumed samples: 13908480 | consumed tokens: 28484567040 | elapsed time per iteration (s): 0.16 | learning rate: 2.445E-05 | global batch size: 256 | lm loss: 3.759570E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.793 | TFLOPs: 25.78 | +7: iteration 54340/ 60336 | consumed samples: 13911040 | consumed tokens: 28489809920 | elapsed time per iteration (s): 0.15 | learning rate: 2.444E-05 | global batch size: 256 | lm loss: 3.746197E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.182 | TFLOPs: 26.21 | +7: iteration 54350/ 60336 | consumed samples: 13913600 | consumed tokens: 28495052800 | elapsed time per iteration (s): 0.15 | learning rate: 2.442E-05 | global batch size: 256 | lm loss: 3.746201E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.939 | TFLOPs: 26.16 | +7: iteration 54360/ 60336 | consumed samples: 13916160 | consumed tokens: 28500295680 | elapsed time per iteration (s): 0.15 | learning rate: 2.441E-05 | global batch size: 256 | lm loss: 3.749267E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.131 | TFLOPs: 26.14 | +7: iteration 54370/ 60336 | consumed samples: 13918720 | consumed tokens: 28505538560 | elapsed time per iteration (s): 0.15 | learning rate: 2.439E-05 | global batch size: 256 | lm loss: 3.761850E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.046 | TFLOPs: 26.14 | +7: iteration 54380/ 60336 | consumed samples: 13921280 | consumed tokens: 28510781440 | elapsed time per iteration (s): 0.15 | learning rate: 2.438E-05 | global batch size: 256 | lm loss: 3.752215E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.851 | TFLOPs: 26.19 | +7: iteration 54390/ 60336 | consumed samples: 13923840 | consumed tokens: 28516024320 | elapsed time per iteration (s): 0.16 | learning rate: 2.437E-05 | global batch size: 256 | lm loss: 3.752826E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.529 | TFLOPs: 24.50 | +7: iteration 54400/ 60336 | consumed samples: 13926400 | consumed tokens: 28521267200 | elapsed time per iteration (s): 0.15 | learning rate: 2.435E-05 | global batch size: 256 | lm loss: 3.767612E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.700 | TFLOPs: 26.20 | +7: iteration 54410/ 60336 | consumed samples: 13928960 | consumed tokens: 28526510080 | elapsed time per iteration (s): 0.15 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 3.755588E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.653 | TFLOPs: 26.20 | +7: iteration 54420/ 60336 | consumed samples: 13931520 | consumed tokens: 28531752960 | elapsed time per iteration (s): 0.15 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 3.757742E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.823 | TFLOPs: 26.20 | +7: iteration 54430/ 60336 | consumed samples: 13934080 | consumed tokens: 28536995840 | elapsed time per iteration (s): 0.15 | learning rate: 2.431E-05 | global batch size: 256 | lm loss: 3.776457E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.156 | TFLOPs: 26.22 | +7: iteration 54440/ 60336 | consumed samples: 13936640 | consumed tokens: 28542238720 | elapsed time per iteration (s): 0.15 | learning rate: 2.429E-05 | global batch size: 256 | lm loss: 3.761729E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.899 | TFLOPs: 26.02 | +7: iteration 54450/ 60336 | consumed samples: 13939200 | consumed tokens: 28547481600 | elapsed time per iteration (s): 0.15 | learning rate: 2.428E-05 | global batch size: 256 | lm loss: 3.749260E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.044 | TFLOPs: 26.02 | +7: iteration 54460/ 60336 | consumed samples: 13941760 | consumed tokens: 28552724480 | elapsed time per iteration (s): 0.15 | learning rate: 2.426E-05 | global batch size: 256 | lm loss: 3.764283E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.145 | TFLOPs: 26.02 | +7: iteration 54470/ 60336 | consumed samples: 13944320 | consumed tokens: 28557967360 | elapsed time per iteration (s): 0.15 | learning rate: 2.425E-05 | global batch size: 256 | lm loss: 3.768862E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.823 | TFLOPs: 26.03 | +7: iteration 54480/ 60336 | consumed samples: 13946880 | consumed tokens: 28563210240 | elapsed time per iteration (s): 0.15 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 3.760311E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.001 | TFLOPs: 26.02 | +7: iteration 54490/ 60336 | consumed samples: 13949440 | consumed tokens: 28568453120 | elapsed time per iteration (s): 0.15 | learning rate: 2.422E-05 | global batch size: 256 | lm loss: 3.741400E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.117 | TFLOPs: 26.02 | +7: iteration 54500/ 60336 | consumed samples: 13952000 | consumed tokens: 28573696000 | elapsed time per iteration (s): 0.15 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 3.764476E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.395 | TFLOPs: 26.05 | +7: iteration 54510/ 60336 | consumed samples: 13954560 | consumed tokens: 28578938880 | elapsed time per iteration (s): 0.15 | learning rate: 2.419E-05 | global batch size: 256 | lm loss: 3.766515E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.371 | TFLOPs: 26.05 | +7: iteration 54520/ 60336 | consumed samples: 13957120 | consumed tokens: 28584181760 | elapsed time per iteration (s): 0.15 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 3.752995E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.234 | TFLOPs: 26.04 | +7: iteration 54530/ 60336 | consumed samples: 13959680 | consumed tokens: 28589424640 | elapsed time per iteration (s): 0.15 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.749613E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.859 | TFLOPs: 26.06 | +7: iteration 54540/ 60336 | consumed samples: 13962240 | consumed tokens: 28594667520 | elapsed time per iteration (s): 0.15 | learning rate: 2.415E-05 | global batch size: 256 | lm loss: 3.757883E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.588 | TFLOPs: 26.07 | +7: iteration 54550/ 60336 | consumed samples: 13964800 | consumed tokens: 28599910400 | elapsed time per iteration (s): 0.15 | learning rate: 2.414E-05 | global batch size: 256 | lm loss: 3.762156E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.887 | TFLOPs: 26.08 | +7: iteration 54560/ 60336 | consumed samples: 13967360 | consumed tokens: 28605153280 | elapsed time per iteration (s): 0.15 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 3.759075E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.401 | TFLOPs: 26.07 | +7: iteration 54570/ 60336 | consumed samples: 13969920 | consumed tokens: 28610396160 | elapsed time per iteration (s): 0.16 | learning rate: 2.411E-05 | global batch size: 256 | lm loss: 3.761946E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.290 | TFLOPs: 25.69 | +7: iteration 54580/ 60336 | consumed samples: 13972480 | consumed tokens: 28615639040 | elapsed time per iteration (s): 0.15 | learning rate: 2.409E-05 | global batch size: 256 | lm loss: 3.756445E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.713 | TFLOPs: 26.08 | +7: iteration 54590/ 60336 | consumed samples: 13975040 | consumed tokens: 28620881920 | elapsed time per iteration (s): 0.15 | learning rate: 2.408E-05 | global batch size: 256 | lm loss: 3.766861E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.187 | TFLOPs: 26.08 | +7: iteration 54600/ 60336 | consumed samples: 13977600 | consumed tokens: 28626124800 | elapsed time per iteration (s): 0.15 | learning rate: 2.406E-05 | global batch size: 256 | lm loss: 3.754175E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.768 | TFLOPs: 26.00 | +7: iteration 54610/ 60336 | consumed samples: 13980160 | consumed tokens: 28631367680 | elapsed time per iteration (s): 0.16 | learning rate: 2.405E-05 | global batch size: 256 | lm loss: 3.772845E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.112 | TFLOPs: 25.80 | +7: iteration 54620/ 60336 | consumed samples: 13982720 | consumed tokens: 28636610560 | elapsed time per iteration (s): 0.16 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 3.769527E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.632 | TFLOPs: 25.60 | +7: iteration 54630/ 60336 | consumed samples: 13985280 | consumed tokens: 28641853440 | elapsed time per iteration (s): 0.15 | learning rate: 2.402E-05 | global batch size: 256 | lm loss: 3.749644E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.140 | TFLOPs: 26.14 | +7: iteration 54640/ 60336 | consumed samples: 13987840 | consumed tokens: 28647096320 | elapsed time per iteration (s): 0.15 | learning rate: 2.401E-05 | global batch size: 256 | lm loss: 3.751231E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.392 | TFLOPs: 26.15 | +7: iteration 54650/ 60336 | consumed samples: 13990400 | consumed tokens: 28652339200 | elapsed time per iteration (s): 0.15 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 3.750726E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.678 | TFLOPs: 26.18 | +7: iteration 54660/ 60336 | consumed samples: 13992960 | consumed tokens: 28657582080 | elapsed time per iteration (s): 0.15 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 3.751374E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.054 | TFLOPs: 26.19 | +7: iteration 54670/ 60336 | consumed samples: 13995520 | consumed tokens: 28662824960 | elapsed time per iteration (s): 0.16 | learning rate: 2.397E-05 | global batch size: 256 | lm loss: 3.762627E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.413 | TFLOPs: 25.77 | +7: iteration 54680/ 60336 | consumed samples: 13998080 | consumed tokens: 28668067840 | elapsed time per iteration (s): 0.15 | learning rate: 2.395E-05 | global batch size: 256 | lm loss: 3.762887E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.885 | TFLOPs: 26.11 | +7: iteration 54690/ 60336 | consumed samples: 14000640 | consumed tokens: 28673310720 | elapsed time per iteration (s): 0.15 | learning rate: 2.394E-05 | global batch size: 256 | lm loss: 3.755322E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.272 | TFLOPs: 26.19 | +7: iteration 54700/ 60336 | consumed samples: 14003200 | consumed tokens: 28678553600 | elapsed time per iteration (s): 0.15 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 3.762006E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.421 | TFLOPs: 26.20 | +7: iteration 54710/ 60336 | consumed samples: 14005760 | consumed tokens: 28683796480 | elapsed time per iteration (s): 0.15 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 3.751735E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.667 | TFLOPs: 26.20 | +7: iteration 54720/ 60336 | consumed samples: 14008320 | consumed tokens: 28689039360 | elapsed time per iteration (s): 0.15 | learning rate: 2.390E-05 | global batch size: 256 | lm loss: 3.757294E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.719 | TFLOPs: 26.20 | +7: iteration 54730/ 60336 | consumed samples: 14010880 | consumed tokens: 28694282240 | elapsed time per iteration (s): 0.15 | learning rate: 2.388E-05 | global batch size: 256 | lm loss: 3.755270E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.743 | TFLOPs: 26.19 | +7: iteration 54740/ 60336 | consumed samples: 14013440 | consumed tokens: 28699525120 | elapsed time per iteration (s): 0.15 | learning rate: 2.387E-05 | global batch size: 256 | lm loss: 3.760798E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.900 | TFLOPs: 26.19 | +7: iteration 54750/ 60336 | consumed samples: 14016000 | consumed tokens: 28704768000 | elapsed time per iteration (s): 0.15 | learning rate: 2.386E-05 | global batch size: 256 | lm loss: 3.752264E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.554 | TFLOPs: 26.20 | +7: iteration 54760/ 60336 | consumed samples: 14018560 | consumed tokens: 28710010880 | elapsed time per iteration (s): 0.15 | learning rate: 2.384E-05 | global batch size: 256 | lm loss: 3.742300E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.535 | TFLOPs: 26.18 | +7: iteration 54770/ 60336 | consumed samples: 14021120 | consumed tokens: 28715253760 | elapsed time per iteration (s): 0.15 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 3.761731E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.345 | TFLOPs: 26.18 | +7: iteration 54780/ 60336 | consumed samples: 14023680 | consumed tokens: 28720496640 | elapsed time per iteration (s): 0.15 | learning rate: 2.382E-05 | global batch size: 256 | lm loss: 3.751849E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.929 | TFLOPs: 26.20 | +7: iteration 54790/ 60336 | consumed samples: 14026240 | consumed tokens: 28725739520 | elapsed time per iteration (s): 0.15 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 3.758687E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.526 | TFLOPs: 26.20 | +7: iteration 54800/ 60336 | consumed samples: 14028800 | consumed tokens: 28730982400 | elapsed time per iteration (s): 0.15 | learning rate: 2.379E-05 | global batch size: 256 | lm loss: 3.750562E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.825 | TFLOPs: 26.19 | +7: iteration 54810/ 60336 | consumed samples: 14031360 | consumed tokens: 28736225280 | elapsed time per iteration (s): 0.15 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 3.768699E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.189 | TFLOPs: 26.19 | +7: iteration 54820/ 60336 | consumed samples: 14033920 | consumed tokens: 28741468160 | elapsed time per iteration (s): 0.15 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 3.757206E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.289 | TFLOPs: 26.15 | +7: iteration 54830/ 60336 | consumed samples: 14036480 | consumed tokens: 28746711040 | elapsed time per iteration (s): 0.15 | learning rate: 2.375E-05 | global batch size: 256 | lm loss: 3.747940E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.164 | TFLOPs: 26.21 | +7: iteration 54840/ 60336 | consumed samples: 14039040 | consumed tokens: 28751953920 | elapsed time per iteration (s): 0.15 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 3.767173E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.517 | TFLOPs: 26.23 | +7: iteration 54850/ 60336 | consumed samples: 14041600 | consumed tokens: 28757196800 | elapsed time per iteration (s): 0.15 | learning rate: 2.372E-05 | global batch size: 256 | lm loss: 3.755453E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.510 | TFLOPs: 26.20 | +7: iteration 54860/ 60336 | consumed samples: 14044160 | consumed tokens: 28762439680 | elapsed time per iteration (s): 0.15 | learning rate: 2.371E-05 | global batch size: 256 | lm loss: 3.752388E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.359 | TFLOPs: 26.21 | +7: iteration 54870/ 60336 | consumed samples: 14046720 | consumed tokens: 28767682560 | elapsed time per iteration (s): 0.15 | learning rate: 2.369E-05 | global batch size: 256 | lm loss: 3.747380E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.080 | TFLOPs: 26.21 | +7: iteration 54880/ 60336 | consumed samples: 14049280 | consumed tokens: 28772925440 | elapsed time per iteration (s): 0.15 | learning rate: 2.368E-05 | global batch size: 256 | lm loss: 3.737765E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.891 | TFLOPs: 26.19 | +7: iteration 54890/ 60336 | consumed samples: 14051840 | consumed tokens: 28778168320 | elapsed time per iteration (s): 0.15 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.762178E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.125 | TFLOPs: 26.22 | +7: iteration 54900/ 60336 | consumed samples: 14054400 | consumed tokens: 28783411200 | elapsed time per iteration (s): 0.16 | learning rate: 2.365E-05 | global batch size: 256 | lm loss: 3.760260E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.202 | TFLOPs: 25.74 | +7: iteration 54910/ 60336 | consumed samples: 14056960 | consumed tokens: 28788654080 | elapsed time per iteration (s): 0.15 | learning rate: 2.364E-05 | global batch size: 256 | lm loss: 3.750037E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.491 | TFLOPs: 26.21 | +7: iteration 54920/ 60336 | consumed samples: 14059520 | consumed tokens: 28793896960 | elapsed time per iteration (s): 0.16 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 3.748984E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.853 | TFLOPs: 25.81 | +7: iteration 54930/ 60336 | consumed samples: 14062080 | consumed tokens: 28799139840 | elapsed time per iteration (s): 0.15 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 3.746076E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.729 | TFLOPs: 26.20 | +7: iteration 54940/ 60336 | consumed samples: 14064640 | consumed tokens: 28804382720 | elapsed time per iteration (s): 0.15 | learning rate: 2.360E-05 | global batch size: 256 | lm loss: 3.747002E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.820 | TFLOPs: 26.20 | +7: iteration 54950/ 60336 | consumed samples: 14067200 | consumed tokens: 28809625600 | elapsed time per iteration (s): 0.15 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 3.766501E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.402 | TFLOPs: 26.18 | +7: iteration 54960/ 60336 | consumed samples: 14069760 | consumed tokens: 28814868480 | elapsed time per iteration (s): 0.16 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.749725E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.819 | TFLOPs: 25.48 | +7: iteration 54970/ 60336 | consumed samples: 14072320 | consumed tokens: 28820111360 | elapsed time per iteration (s): 0.15 | learning rate: 2.356E-05 | global batch size: 256 | lm loss: 3.755546E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.279 | TFLOPs: 26.08 | +7: iteration 54980/ 60336 | consumed samples: 14074880 | consumed tokens: 28825354240 | elapsed time per iteration (s): 0.15 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 3.748893E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.286 | TFLOPs: 26.07 | +7: iteration 54990/ 60336 | consumed samples: 14077440 | consumed tokens: 28830597120 | elapsed time per iteration (s): 0.15 | learning rate: 2.353E-05 | global batch size: 256 | lm loss: 3.756004E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.987 | TFLOPs: 26.06 | +7: iteration 55000/ 60336 | consumed samples: 14080000 | consumed tokens: 28835840000 | elapsed time per iteration (s): 0.15 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 3.753155E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.945 | TFLOPs: 26.06 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 55000 | lm loss value: 3.882610E+00 | lm loss PPL: 4.855077E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 55000 to checkpoints_44m32b100m +0: [2023-03-17 02:41:25,682] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step55000 is begin to save! +0: [2023-03-17 02:41:25,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:41:25,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:41:25,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:41:25,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:41:25,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:41:25,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:41:25,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:41:25,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:41:25,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:41:25,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:41:25,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:41:25,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:41:25,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:41:25,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:41:25,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:41:25,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:41:25,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:41:25,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:41:25,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:41:25,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:41:25,825] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step55000/mp_rank_00_model_states.pt +0: [2023-03-17 02:41:25,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:41:25,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:41:25,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:41:25,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:41:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:41:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:41:25,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:41:25,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: successfully saved checkpoint at iteration 55000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 186.75 +7: iteration 55010/ 60336 | consumed samples: 14082560 | consumed tokens: 28841082880 | elapsed time per iteration (s): 0.18 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.755704E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1447.344 | TFLOPs: 22.70 | +7: iteration 55020/ 60336 | consumed samples: 14085120 | consumed tokens: 28846325760 | elapsed time per iteration (s): 0.16 | learning rate: 2.349E-05 | global batch size: 256 | lm loss: 3.750843E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.069 | TFLOPs: 24.59 | +7: iteration 55030/ 60336 | consumed samples: 14087680 | consumed tokens: 28851568640 | elapsed time per iteration (s): 0.15 | learning rate: 2.348E-05 | global batch size: 256 | lm loss: 3.752121E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.621 | TFLOPs: 26.12 | +7: iteration 55040/ 60336 | consumed samples: 14090240 | consumed tokens: 28856811520 | elapsed time per iteration (s): 0.15 | learning rate: 2.347E-05 | global batch size: 256 | lm loss: 3.757097E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.777 | TFLOPs: 26.17 | +7: iteration 55050/ 60336 | consumed samples: 14092800 | consumed tokens: 28862054400 | elapsed time per iteration (s): 0.15 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.768331E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.815 | TFLOPs: 26.17 | +7: iteration 55060/ 60336 | consumed samples: 14095360 | consumed tokens: 28867297280 | elapsed time per iteration (s): 0.15 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 3.756482E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.738 | TFLOPs: 26.14 | +7: iteration 55070/ 60336 | consumed samples: 14097920 | consumed tokens: 28872540160 | elapsed time per iteration (s): 0.16 | learning rate: 2.343E-05 | global batch size: 256 | lm loss: 3.754288E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.867 | TFLOPs: 25.83 | +7: iteration 55080/ 60336 | consumed samples: 14100480 | consumed tokens: 28877783040 | elapsed time per iteration (s): 0.15 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 3.758772E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.541 | TFLOPs: 26.15 | +7: iteration 55090/ 60336 | consumed samples: 14103040 | consumed tokens: 28883025920 | elapsed time per iteration (s): 0.15 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 3.765080E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.881 | TFLOPs: 26.16 | +7: iteration 55100/ 60336 | consumed samples: 14105600 | consumed tokens: 28888268800 | elapsed time per iteration (s): 0.15 | learning rate: 2.339E-05 | global batch size: 256 | lm loss: 3.761522E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.686 | TFLOPs: 26.15 | +7: iteration 55110/ 60336 | consumed samples: 14108160 | consumed tokens: 28893511680 | elapsed time per iteration (s): 0.15 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 3.739854E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.914 | TFLOPs: 26.17 | +7: iteration 55120/ 60336 | consumed samples: 14110720 | consumed tokens: 28898754560 | elapsed time per iteration (s): 0.15 | learning rate: 2.337E-05 | global batch size: 256 | lm loss: 3.767704E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.269 | TFLOPs: 26.15 | +7: iteration 55130/ 60336 | consumed samples: 14113280 | consumed tokens: 28903997440 | elapsed time per iteration (s): 0.15 | learning rate: 2.335E-05 | global batch size: 256 | lm loss: 3.752760E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.206 | TFLOPs: 26.16 | +7: iteration 55140/ 60336 | consumed samples: 14115840 | consumed tokens: 28909240320 | elapsed time per iteration (s): 0.15 | learning rate: 2.334E-05 | global batch size: 256 | lm loss: 3.758185E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.325 | TFLOPs: 26.15 | +7: iteration 55150/ 60336 | consumed samples: 14118400 | consumed tokens: 28914483200 | elapsed time per iteration (s): 0.15 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 3.763729E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.529 | TFLOPs: 26.14 | +7: iteration 55160/ 60336 | consumed samples: 14120960 | consumed tokens: 28919726080 | elapsed time per iteration (s): 0.16 | learning rate: 2.331E-05 | global batch size: 256 | lm loss: 3.744722E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.138 | TFLOPs: 25.88 | +7: iteration 55170/ 60336 | consumed samples: 14123520 | consumed tokens: 28924968960 | elapsed time per iteration (s): 0.15 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 3.756379E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.763 | TFLOPs: 26.08 | +7: iteration 55180/ 60336 | consumed samples: 14126080 | consumed tokens: 28930211840 | elapsed time per iteration (s): 0.15 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 3.770374E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.578 | TFLOPs: 26.07 | +7: iteration 55190/ 60336 | consumed samples: 14128640 | consumed tokens: 28935454720 | elapsed time per iteration (s): 0.15 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 3.748424E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.039 | TFLOPs: 26.10 | +7: iteration 55200/ 60336 | consumed samples: 14131200 | consumed tokens: 28940697600 | elapsed time per iteration (s): 0.15 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.750140E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.335 | TFLOPs: 26.10 | +7: iteration 55210/ 60336 | consumed samples: 14133760 | consumed tokens: 28945940480 | elapsed time per iteration (s): 0.16 | learning rate: 2.325E-05 | global batch size: 256 | lm loss: 3.753545E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.994 | TFLOPs: 25.77 | +7: iteration 55220/ 60336 | consumed samples: 14136320 | consumed tokens: 28951183360 | elapsed time per iteration (s): 0.16 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 3.756345E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.691 | TFLOPs: 25.75 | +7: iteration 55230/ 60336 | consumed samples: 14138880 | consumed tokens: 28956426240 | elapsed time per iteration (s): 0.15 | learning rate: 2.323E-05 | global batch size: 256 | lm loss: 3.744966E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.456 | TFLOPs: 26.10 | +7: iteration 55240/ 60336 | consumed samples: 14141440 | consumed tokens: 28961669120 | elapsed time per iteration (s): 0.16 | learning rate: 2.321E-05 | global batch size: 256 | lm loss: 3.772599E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.382 | TFLOPs: 25.47 | +7: iteration 55250/ 60336 | consumed samples: 14144000 | consumed tokens: 28966912000 | elapsed time per iteration (s): 0.15 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 3.755404E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.458 | TFLOPs: 26.10 | +7: iteration 55260/ 60336 | consumed samples: 14146560 | consumed tokens: 28972154880 | elapsed time per iteration (s): 0.15 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 3.743110E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.968 | TFLOPs: 26.10 | +7: iteration 55270/ 60336 | consumed samples: 14149120 | consumed tokens: 28977397760 | elapsed time per iteration (s): 0.15 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 3.751365E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.614 | TFLOPs: 26.12 | +7: iteration 55280/ 60336 | consumed samples: 14151680 | consumed tokens: 28982640640 | elapsed time per iteration (s): 0.15 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.767793E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.027 | TFLOPs: 26.16 | +7: iteration 55290/ 60336 | consumed samples: 14154240 | consumed tokens: 28987883520 | elapsed time per iteration (s): 0.15 | learning rate: 2.315E-05 | global batch size: 256 | lm loss: 3.753241E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.776 | TFLOPs: 26.12 | +7: iteration 55300/ 60336 | consumed samples: 14156800 | consumed tokens: 28993126400 | elapsed time per iteration (s): 0.15 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 3.753880E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.937 | TFLOPs: 26.16 | +7: iteration 55310/ 60336 | consumed samples: 14159360 | consumed tokens: 28998369280 | elapsed time per iteration (s): 0.15 | learning rate: 2.313E-05 | global batch size: 256 | lm loss: 3.743122E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.958 | TFLOPs: 26.11 | +7: iteration 55320/ 60336 | consumed samples: 14161920 | consumed tokens: 29003612160 | elapsed time per iteration (s): 0.15 | learning rate: 2.311E-05 | global batch size: 256 | lm loss: 3.761553E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.760 | TFLOPs: 26.15 | +7: iteration 55330/ 60336 | consumed samples: 14164480 | consumed tokens: 29008855040 | elapsed time per iteration (s): 0.15 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 3.765827E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.334 | TFLOPs: 25.98 | +7: iteration 55340/ 60336 | consumed samples: 14167040 | consumed tokens: 29014097920 | elapsed time per iteration (s): 0.15 | learning rate: 2.309E-05 | global batch size: 256 | lm loss: 3.762698E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.542 | TFLOPs: 26.15 | +7: iteration 55350/ 60336 | consumed samples: 14169600 | consumed tokens: 29019340800 | elapsed time per iteration (s): 0.16 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 3.766613E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.597 | TFLOPs: 25.38 | +7: iteration 55360/ 60336 | consumed samples: 14172160 | consumed tokens: 29024583680 | elapsed time per iteration (s): 0.15 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 3.756982E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.465 | TFLOPs: 26.12 | +7: iteration 55370/ 60336 | consumed samples: 14174720 | consumed tokens: 29029826560 | elapsed time per iteration (s): 0.15 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 3.749565E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.507 | TFLOPs: 26.15 | +7: iteration 55380/ 60336 | consumed samples: 14177280 | consumed tokens: 29035069440 | elapsed time per iteration (s): 0.15 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 3.757060E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.303 | TFLOPs: 26.15 | +7: iteration 55390/ 60336 | consumed samples: 14179840 | consumed tokens: 29040312320 | elapsed time per iteration (s): 0.15 | learning rate: 2.303E-05 | global batch size: 256 | lm loss: 3.754512E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.557 | TFLOPs: 26.15 | +7: iteration 55400/ 60336 | consumed samples: 14182400 | consumed tokens: 29045555200 | elapsed time per iteration (s): 0.15 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 3.761346E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.288 | TFLOPs: 26.15 | +7: iteration 55410/ 60336 | consumed samples: 14184960 | consumed tokens: 29050798080 | elapsed time per iteration (s): 0.15 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 3.753613E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.778 | TFLOPs: 26.14 | +7: iteration 55420/ 60336 | consumed samples: 14187520 | consumed tokens: 29056040960 | elapsed time per iteration (s): 0.15 | learning rate: 2.299E-05 | global batch size: 256 | lm loss: 3.757821E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.713 | TFLOPs: 26.14 | +7: iteration 55430/ 60336 | consumed samples: 14190080 | consumed tokens: 29061283840 | elapsed time per iteration (s): 0.15 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 3.750270E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.744 | TFLOPs: 26.15 | +7: iteration 55440/ 60336 | consumed samples: 14192640 | consumed tokens: 29066526720 | elapsed time per iteration (s): 0.15 | learning rate: 2.297E-05 | global batch size: 256 | lm loss: 3.754972E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.155 | TFLOPs: 26.16 | +7: iteration 55450/ 60336 | consumed samples: 14195200 | consumed tokens: 29071769600 | elapsed time per iteration (s): 0.15 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 3.754268E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.501 | TFLOPs: 26.18 | +7: iteration 55460/ 60336 | consumed samples: 14197760 | consumed tokens: 29077012480 | elapsed time per iteration (s): 0.15 | learning rate: 2.294E-05 | global batch size: 256 | lm loss: 3.749959E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.726 | TFLOPs: 26.14 | +7: iteration 55470/ 60336 | consumed samples: 14200320 | consumed tokens: 29082255360 | elapsed time per iteration (s): 0.16 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 3.753460E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.724 | TFLOPs: 25.51 | +7: iteration 55480/ 60336 | consumed samples: 14202880 | consumed tokens: 29087498240 | elapsed time per iteration (s): 0.15 | learning rate: 2.292E-05 | global batch size: 256 | lm loss: 3.756130E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.618 | TFLOPs: 26.17 | +7: iteration 55490/ 60336 | consumed samples: 14205440 | consumed tokens: 29092741120 | elapsed time per iteration (s): 0.15 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.751803E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.047 | TFLOPs: 26.13 | +7: iteration 55500/ 60336 | consumed samples: 14208000 | consumed tokens: 29097984000 | elapsed time per iteration (s): 0.15 | learning rate: 2.290E-05 | global batch size: 256 | lm loss: 3.750474E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.344 | TFLOPs: 26.15 | +7: iteration 55510/ 60336 | consumed samples: 14210560 | consumed tokens: 29103226880 | elapsed time per iteration (s): 0.15 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 3.748346E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.968 | TFLOPs: 26.17 | +7: iteration 55520/ 60336 | consumed samples: 14213120 | consumed tokens: 29108469760 | elapsed time per iteration (s): 0.15 | learning rate: 2.287E-05 | global batch size: 256 | lm loss: 3.753118E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.502 | TFLOPs: 26.17 | +7: iteration 55530/ 60336 | consumed samples: 14215680 | consumed tokens: 29113712640 | elapsed time per iteration (s): 0.15 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 3.755412E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.824 | TFLOPs: 26.14 | +7: iteration 55540/ 60336 | consumed samples: 14218240 | consumed tokens: 29118955520 | elapsed time per iteration (s): 0.15 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 3.748107E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.795 | TFLOPs: 26.14 | +7: iteration 55550/ 60336 | consumed samples: 14220800 | consumed tokens: 29124198400 | elapsed time per iteration (s): 0.15 | learning rate: 2.284E-05 | global batch size: 256 | lm loss: 3.756422E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.504 | TFLOPs: 26.17 | +7: iteration 55560/ 60336 | consumed samples: 14223360 | consumed tokens: 29129441280 | elapsed time per iteration (s): 0.15 | learning rate: 2.282E-05 | global batch size: 256 | lm loss: 3.748904E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.873 | TFLOPs: 26.17 | +7: iteration 55570/ 60336 | consumed samples: 14225920 | consumed tokens: 29134684160 | elapsed time per iteration (s): 0.15 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.740716E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.857 | TFLOPs: 26.08 | +7: iteration 55580/ 60336 | consumed samples: 14228480 | consumed tokens: 29139927040 | elapsed time per iteration (s): 0.15 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 3.757215E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.091 | TFLOPs: 26.08 | +7: iteration 55590/ 60336 | consumed samples: 14231040 | consumed tokens: 29145169920 | elapsed time per iteration (s): 0.15 | learning rate: 2.279E-05 | global batch size: 256 | lm loss: 3.765254E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.300 | TFLOPs: 26.07 | +7: iteration 55600/ 60336 | consumed samples: 14233600 | consumed tokens: 29150412800 | elapsed time per iteration (s): 0.16 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.741995E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.643 | TFLOPs: 25.65 | +7: iteration 55610/ 60336 | consumed samples: 14236160 | consumed tokens: 29155655680 | elapsed time per iteration (s): 0.15 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 3.748655E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.760 | TFLOPs: 26.01 | +7: iteration 55620/ 60336 | consumed samples: 14238720 | consumed tokens: 29160898560 | elapsed time per iteration (s): 0.15 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 3.752777E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.151 | TFLOPs: 26.04 | +7: iteration 55630/ 60336 | consumed samples: 14241280 | consumed tokens: 29166141440 | elapsed time per iteration (s): 0.15 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 3.764187E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.233 | TFLOPs: 26.02 | +7: iteration 55640/ 60336 | consumed samples: 14243840 | consumed tokens: 29171384320 | elapsed time per iteration (s): 0.15 | learning rate: 2.273E-05 | global batch size: 256 | lm loss: 3.772552E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.197 | TFLOPs: 26.10 | +7: iteration 55650/ 60336 | consumed samples: 14246400 | consumed tokens: 29176627200 | elapsed time per iteration (s): 0.15 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 3.757215E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.071 | TFLOPs: 26.07 | +7: iteration 55660/ 60336 | consumed samples: 14248960 | consumed tokens: 29181870080 | elapsed time per iteration (s): 0.15 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 3.771452E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.189 | TFLOPs: 26.10 | +7: iteration 55670/ 60336 | consumed samples: 14251520 | consumed tokens: 29187112960 | elapsed time per iteration (s): 0.15 | learning rate: 2.270E-05 | global batch size: 256 | lm loss: 3.751351E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.538 | TFLOPs: 26.06 | +7: iteration 55680/ 60336 | consumed samples: 14254080 | consumed tokens: 29192355840 | elapsed time per iteration (s): 0.16 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 3.753934E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.642 | TFLOPs: 25.73 | +7: iteration 55690/ 60336 | consumed samples: 14256640 | consumed tokens: 29197598720 | elapsed time per iteration (s): 0.15 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 3.764547E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.131 | TFLOPs: 26.04 | +7: iteration 55700/ 60336 | consumed samples: 14259200 | consumed tokens: 29202841600 | elapsed time per iteration (s): 0.16 | learning rate: 2.266E-05 | global batch size: 256 | lm loss: 3.765900E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.011 | TFLOPs: 25.77 | +7: iteration 55710/ 60336 | consumed samples: 14261760 | consumed tokens: 29208084480 | elapsed time per iteration (s): 0.15 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.764534E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.157 | TFLOPs: 26.07 | +7: iteration 55720/ 60336 | consumed samples: 14264320 | consumed tokens: 29213327360 | elapsed time per iteration (s): 0.16 | learning rate: 2.264E-05 | global batch size: 256 | lm loss: 3.756031E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.333 | TFLOPs: 25.71 | +7: iteration 55730/ 60336 | consumed samples: 14266880 | consumed tokens: 29218570240 | elapsed time per iteration (s): 0.15 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 3.758586E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.760 | TFLOPs: 26.06 | +7: iteration 55740/ 60336 | consumed samples: 14269440 | consumed tokens: 29223813120 | elapsed time per iteration (s): 0.15 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 3.758395E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.692 | TFLOPs: 26.04 | +7: iteration 55750/ 60336 | consumed samples: 14272000 | consumed tokens: 29229056000 | elapsed time per iteration (s): 0.15 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 3.745744E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.681 | TFLOPs: 26.06 | +7: iteration 55760/ 60336 | consumed samples: 14274560 | consumed tokens: 29234298880 | elapsed time per iteration (s): 0.15 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.750987E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.021 | TFLOPs: 26.11 | +7: iteration 55770/ 60336 | consumed samples: 14277120 | consumed tokens: 29239541760 | elapsed time per iteration (s): 0.15 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.766341E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.254 | TFLOPs: 26.12 | +7: iteration 55780/ 60336 | consumed samples: 14279680 | consumed tokens: 29244784640 | elapsed time per iteration (s): 0.15 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 3.753326E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.366 | TFLOPs: 26.05 | +7: iteration 55790/ 60336 | consumed samples: 14282240 | consumed tokens: 29250027520 | elapsed time per iteration (s): 0.15 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.754729E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.267 | TFLOPs: 26.02 | +7: iteration 55800/ 60336 | consumed samples: 14284800 | consumed tokens: 29255270400 | elapsed time per iteration (s): 0.15 | learning rate: 2.255E-05 | global batch size: 256 | lm loss: 3.759232E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.286 | TFLOPs: 26.04 | +7: iteration 55810/ 60336 | consumed samples: 14287360 | consumed tokens: 29260513280 | elapsed time per iteration (s): 0.15 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 3.761988E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.387 | TFLOPs: 26.04 | +7: iteration 55820/ 60336 | consumed samples: 14289920 | consumed tokens: 29265756160 | elapsed time per iteration (s): 0.15 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 3.749475E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.792 | TFLOPs: 26.06 | +7: iteration 55830/ 60336 | consumed samples: 14292480 | consumed tokens: 29270999040 | elapsed time per iteration (s): 0.15 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 3.757133E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.729 | TFLOPs: 26.06 | +7: iteration 55840/ 60336 | consumed samples: 14295040 | consumed tokens: 29276241920 | elapsed time per iteration (s): 0.15 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 3.749365E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.231 | TFLOPs: 26.10 | +7: iteration 55850/ 60336 | consumed samples: 14297600 | consumed tokens: 29281484800 | elapsed time per iteration (s): 0.15 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.761786E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.418 | TFLOPs: 26.09 | +7: iteration 55860/ 60336 | consumed samples: 14300160 | consumed tokens: 29286727680 | elapsed time per iteration (s): 0.15 | learning rate: 2.248E-05 | global batch size: 256 | lm loss: 3.757463E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.888 | TFLOPs: 26.17 | +7: iteration 55870/ 60336 | consumed samples: 14302720 | consumed tokens: 29291970560 | elapsed time per iteration (s): 0.15 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 3.751601E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.116 | TFLOPs: 26.16 | +7: iteration 55880/ 60336 | consumed samples: 14305280 | consumed tokens: 29297213440 | elapsed time per iteration (s): 0.15 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 3.754890E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.102 | TFLOPs: 26.10 | +7: iteration 55890/ 60336 | consumed samples: 14307840 | consumed tokens: 29302456320 | elapsed time per iteration (s): 0.15 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 3.754061E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.603 | TFLOPs: 26.14 | +7: iteration 55900/ 60336 | consumed samples: 14310400 | consumed tokens: 29307699200 | elapsed time per iteration (s): 0.15 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 3.749479E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.409 | TFLOPs: 26.13 | +7: iteration 55910/ 60336 | consumed samples: 14312960 | consumed tokens: 29312942080 | elapsed time per iteration (s): 0.15 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 3.763105E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.713 | TFLOPs: 26.15 | +7: iteration 55920/ 60336 | consumed samples: 14315520 | consumed tokens: 29318184960 | elapsed time per iteration (s): 0.15 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 3.745605E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.290 | TFLOPs: 26.16 | +7: iteration 55930/ 60336 | consumed samples: 14318080 | consumed tokens: 29323427840 | elapsed time per iteration (s): 0.15 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 3.759572E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.557 | TFLOPs: 26.14 | +7: iteration 55940/ 60336 | consumed samples: 14320640 | consumed tokens: 29328670720 | elapsed time per iteration (s): 0.15 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 3.750378E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.237 | TFLOPs: 26.16 | +7: iteration 55950/ 60336 | consumed samples: 14323200 | consumed tokens: 29333913600 | elapsed time per iteration (s): 0.15 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 3.757064E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.820 | TFLOPs: 26.16 | +7: iteration 55960/ 60336 | consumed samples: 14325760 | consumed tokens: 29339156480 | elapsed time per iteration (s): 0.15 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 3.768200E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.718 | TFLOPs: 26.11 | +7: iteration 55970/ 60336 | consumed samples: 14328320 | consumed tokens: 29344399360 | elapsed time per iteration (s): 0.15 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 3.757297E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.354 | TFLOPs: 26.13 | +7: iteration 55980/ 60336 | consumed samples: 14330880 | consumed tokens: 29349642240 | elapsed time per iteration (s): 0.15 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 3.741397E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.415 | TFLOPs: 26.15 | +7: iteration 55990/ 60336 | consumed samples: 14333440 | consumed tokens: 29354885120 | elapsed time per iteration (s): 0.15 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 3.751177E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.399 | TFLOPs: 26.15 | +0: [2023-03-17 02:43:59,968] [INFO] [logging.py:68:log_dist] [Rank 0] step=56000, skipped=0, lr=[2.2330226429718313e-05, 2.2330226429718313e-05, 2.2330226429718313e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 56000/ 60336 | consumed samples: 14336000 | consumed tokens: 29360128000 | elapsed time per iteration (s): 0.15 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 3.746453E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.417 | TFLOPs: 26.16 | +0: steps: 56000 loss: 3.7326 iter time (s): 0.153 samples/sec: 1672.746 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 56000 | lm loss value: 3.903251E+00 | lm loss PPL: 4.956330E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 56000 to checkpoints_44m32b100m +0: [2023-03-17 02:44:00,042] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step56000 is begin to save! +0: [2023-03-17 02:44:00,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:44:00,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:44:00,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:44:00,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:44:00,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:44:00,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:44:00,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:44:00,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:44:00,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:44:00,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:44:00,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:44:00,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:44:00,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:44:00,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:44:00,158] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:44:00,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:44:00,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:44:00,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:44:00,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:44:00,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:44:00,176] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step56000/mp_rank_00_model_states.pt +0: [2023-03-17 02:44:00,176] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:44:00,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:44:00,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:44:00,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:44:00,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:44:00,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:44:00,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:44:00,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:44:00,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:44:00,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:44:00,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:44:00,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:44:00,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:44:00,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: successfully saved checkpoint at iteration 56000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.81 +7: iteration 56010/ 60336 | consumed samples: 14338560 | consumed tokens: 29365370880 | elapsed time per iteration (s): 0.18 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 3.745330E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.415 | TFLOPs: 22.54 | +7: iteration 56020/ 60336 | consumed samples: 14341120 | consumed tokens: 29370613760 | elapsed time per iteration (s): 0.15 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 3.761342E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.984 | TFLOPs: 26.16 | +7: iteration 56030/ 60336 | consumed samples: 14343680 | consumed tokens: 29375856640 | elapsed time per iteration (s): 0.15 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 3.754937E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.894 | TFLOPs: 26.14 | +7: iteration 56040/ 60336 | consumed samples: 14346240 | consumed tokens: 29381099520 | elapsed time per iteration (s): 0.15 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 3.761369E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.332 | TFLOPs: 26.12 | +7: iteration 56050/ 60336 | consumed samples: 14348800 | consumed tokens: 29386342400 | elapsed time per iteration (s): 0.15 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.744304E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.150 | TFLOPs: 26.15 | +7: iteration 56060/ 60336 | consumed samples: 14351360 | consumed tokens: 29391585280 | elapsed time per iteration (s): 0.15 | learning rate: 2.227E-05 | global batch size: 256 | lm loss: 3.738985E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.342 | TFLOPs: 26.10 | +7: iteration 56070/ 60336 | consumed samples: 14353920 | consumed tokens: 29396828160 | elapsed time per iteration (s): 0.15 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.762997E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.807 | TFLOPs: 26.12 | +7: iteration 56080/ 60336 | consumed samples: 14356480 | consumed tokens: 29402071040 | elapsed time per iteration (s): 0.15 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.754111E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.123 | TFLOPs: 26.10 | +7: iteration 56090/ 60336 | consumed samples: 14359040 | consumed tokens: 29407313920 | elapsed time per iteration (s): 0.15 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 3.758445E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.523 | TFLOPs: 26.10 | +7: iteration 56100/ 60336 | consumed samples: 14361600 | consumed tokens: 29412556800 | elapsed time per iteration (s): 0.15 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 3.759324E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.194 | TFLOPs: 26.10 | +7: iteration 56110/ 60336 | consumed samples: 14364160 | consumed tokens: 29417799680 | elapsed time per iteration (s): 0.15 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 3.755206E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.223 | TFLOPs: 25.99 | +7: iteration 56120/ 60336 | consumed samples: 14366720 | consumed tokens: 29423042560 | elapsed time per iteration (s): 0.16 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 3.763706E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.824 | TFLOPs: 25.43 | +7: iteration 56130/ 60336 | consumed samples: 14369280 | consumed tokens: 29428285440 | elapsed time per iteration (s): 0.16 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 3.758718E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.758 | TFLOPs: 25.79 | +7: iteration 56140/ 60336 | consumed samples: 14371840 | consumed tokens: 29433528320 | elapsed time per iteration (s): 0.15 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 3.760728E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.001 | TFLOPs: 26.13 | +7: iteration 56150/ 60336 | consumed samples: 14374400 | consumed tokens: 29438771200 | elapsed time per iteration (s): 0.15 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 3.764296E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.161 | TFLOPs: 26.10 | +7: iteration 56160/ 60336 | consumed samples: 14376960 | consumed tokens: 29444014080 | elapsed time per iteration (s): 0.15 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 3.743501E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.243 | TFLOPs: 26.13 | +7: iteration 56170/ 60336 | consumed samples: 14379520 | consumed tokens: 29449256960 | elapsed time per iteration (s): 0.15 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 3.765940E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.875 | TFLOPs: 26.11 | +7: iteration 56180/ 60336 | consumed samples: 14382080 | consumed tokens: 29454499840 | elapsed time per iteration (s): 0.15 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.747698E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.773 | TFLOPs: 26.11 | +7: iteration 56190/ 60336 | consumed samples: 14384640 | consumed tokens: 29459742720 | elapsed time per iteration (s): 0.16 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 3.750385E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.731 | TFLOPs: 25.82 | +7: iteration 56200/ 60336 | consumed samples: 14387200 | consumed tokens: 29464985600 | elapsed time per iteration (s): 0.15 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 3.762045E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.577 | TFLOPs: 26.07 | +7: iteration 56210/ 60336 | consumed samples: 14389760 | consumed tokens: 29470228480 | elapsed time per iteration (s): 0.15 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 3.765490E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.857 | TFLOPs: 26.00 | +7: iteration 56220/ 60336 | consumed samples: 14392320 | consumed tokens: 29475471360 | elapsed time per iteration (s): 0.15 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 3.752932E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.401 | TFLOPs: 26.10 | +7: iteration 56230/ 60336 | consumed samples: 14394880 | consumed tokens: 29480714240 | elapsed time per iteration (s): 0.15 | learning rate: 2.209E-05 | global batch size: 256 | lm loss: 3.744395E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.081 | TFLOPs: 25.92 | +7: iteration 56240/ 60336 | consumed samples: 14397440 | consumed tokens: 29485957120 | elapsed time per iteration (s): 0.15 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.748291E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.387 | TFLOPs: 26.15 | +7: iteration 56250/ 60336 | consumed samples: 14400000 | consumed tokens: 29491200000 | elapsed time per iteration (s): 0.15 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.741705E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.138 | TFLOPs: 26.14 | +7: iteration 56260/ 60336 | consumed samples: 14402560 | consumed tokens: 29496442880 | elapsed time per iteration (s): 0.15 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 3.741823E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.159 | TFLOPs: 26.11 | +7: iteration 56270/ 60336 | consumed samples: 14405120 | consumed tokens: 29501685760 | elapsed time per iteration (s): 0.15 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.752541E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.435 | TFLOPs: 26.13 | +7: iteration 56280/ 60336 | consumed samples: 14407680 | consumed tokens: 29506928640 | elapsed time per iteration (s): 0.15 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 3.754955E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.466 | TFLOPs: 26.15 | +7: iteration 56290/ 60336 | consumed samples: 14410240 | consumed tokens: 29512171520 | elapsed time per iteration (s): 0.15 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 3.755126E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.705 | TFLOPs: 26.15 | +7: iteration 56300/ 60336 | consumed samples: 14412800 | consumed tokens: 29517414400 | elapsed time per iteration (s): 0.15 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.748587E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.678 | TFLOPs: 26.14 | +7: iteration 56310/ 60336 | consumed samples: 14415360 | consumed tokens: 29522657280 | elapsed time per iteration (s): 0.15 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 3.752600E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.340 | TFLOPs: 26.15 | +7: iteration 56320/ 60336 | consumed samples: 14417920 | consumed tokens: 29527900160 | elapsed time per iteration (s): 0.15 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.748518E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.577 | TFLOPs: 26.15 | +7: iteration 56330/ 60336 | consumed samples: 14420480 | consumed tokens: 29533143040 | elapsed time per iteration (s): 0.15 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.759432E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.707 | TFLOPs: 26.14 | +7: iteration 56340/ 60336 | consumed samples: 14423040 | consumed tokens: 29538385920 | elapsed time per iteration (s): 0.15 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 3.755588E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.659 | TFLOPs: 26.14 | +7: iteration 56350/ 60336 | consumed samples: 14425600 | consumed tokens: 29543628800 | elapsed time per iteration (s): 0.16 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.742139E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.127 | TFLOPs: 25.72 | +7: iteration 56360/ 60336 | consumed samples: 14428160 | consumed tokens: 29548871680 | elapsed time per iteration (s): 0.15 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 3.736589E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.928 | TFLOPs: 26.13 | +7: iteration 56370/ 60336 | consumed samples: 14430720 | consumed tokens: 29554114560 | elapsed time per iteration (s): 0.15 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 3.756977E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.225 | TFLOPs: 26.15 | +7: iteration 56380/ 60336 | consumed samples: 14433280 | consumed tokens: 29559357440 | elapsed time per iteration (s): 0.15 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.755926E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.798 | TFLOPs: 25.95 | +7: iteration 56390/ 60336 | consumed samples: 14435840 | consumed tokens: 29564600320 | elapsed time per iteration (s): 0.15 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 3.744836E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.706 | TFLOPs: 26.12 | +7: iteration 56400/ 60336 | consumed samples: 14438400 | consumed tokens: 29569843200 | elapsed time per iteration (s): 0.15 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 3.754585E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.308 | TFLOPs: 26.13 | +7: iteration 56410/ 60336 | consumed samples: 14440960 | consumed tokens: 29575086080 | elapsed time per iteration (s): 0.15 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.755513E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.120 | TFLOPs: 26.13 | +7: iteration 56420/ 60336 | consumed samples: 14443520 | consumed tokens: 29580328960 | elapsed time per iteration (s): 0.15 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 3.750726E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.624 | TFLOPs: 26.12 | +7: iteration 56430/ 60336 | consumed samples: 14446080 | consumed tokens: 29585571840 | elapsed time per iteration (s): 0.15 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.751303E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.707 | TFLOPs: 26.14 | +7: iteration 56440/ 60336 | consumed samples: 14448640 | consumed tokens: 29590814720 | elapsed time per iteration (s): 0.15 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 3.749089E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.871 | TFLOPs: 26.09 | +7: iteration 56450/ 60336 | consumed samples: 14451200 | consumed tokens: 29596057600 | elapsed time per iteration (s): 0.17 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 3.760076E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1491.339 | TFLOPs: 23.39 | +7: iteration 56460/ 60336 | consumed samples: 14453760 | consumed tokens: 29601300480 | elapsed time per iteration (s): 0.15 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.754258E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.421 | TFLOPs: 26.20 | +7: iteration 56470/ 60336 | consumed samples: 14456320 | consumed tokens: 29606543360 | elapsed time per iteration (s): 0.15 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 3.747691E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.049 | TFLOPs: 26.24 | +7: iteration 56480/ 60336 | consumed samples: 14458880 | consumed tokens: 29611786240 | elapsed time per iteration (s): 0.15 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 3.748305E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.671 | TFLOPs: 26.22 | +7: iteration 56490/ 60336 | consumed samples: 14461440 | consumed tokens: 29617029120 | elapsed time per iteration (s): 0.15 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 3.755626E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.500 | TFLOPs: 26.23 | +7: iteration 56500/ 60336 | consumed samples: 14464000 | consumed tokens: 29622272000 | elapsed time per iteration (s): 0.15 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.749915E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.073 | TFLOPs: 26.24 | +7: iteration 56510/ 60336 | consumed samples: 14466560 | consumed tokens: 29627514880 | elapsed time per iteration (s): 0.15 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 3.764974E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.847 | TFLOPs: 26.23 | +7: iteration 56520/ 60336 | consumed samples: 14469120 | consumed tokens: 29632757760 | elapsed time per iteration (s): 0.15 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 3.762219E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.988 | TFLOPs: 26.22 | +7: iteration 56530/ 60336 | consumed samples: 14471680 | consumed tokens: 29638000640 | elapsed time per iteration (s): 0.15 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 3.762730E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.582 | TFLOPs: 26.23 | +7: iteration 56540/ 60336 | consumed samples: 14474240 | consumed tokens: 29643243520 | elapsed time per iteration (s): 0.15 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.752805E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.075 | TFLOPs: 26.22 | +7: iteration 56550/ 60336 | consumed samples: 14476800 | consumed tokens: 29648486400 | elapsed time per iteration (s): 0.15 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 3.744199E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.880 | TFLOPs: 26.22 | +7: iteration 56560/ 60336 | consumed samples: 14479360 | consumed tokens: 29653729280 | elapsed time per iteration (s): 0.15 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 3.740825E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.631 | TFLOPs: 26.22 | +7: iteration 56570/ 60336 | consumed samples: 14481920 | consumed tokens: 29658972160 | elapsed time per iteration (s): 0.15 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.759472E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.438 | TFLOPs: 26.23 | +7: iteration 56580/ 60336 | consumed samples: 14484480 | consumed tokens: 29664215040 | elapsed time per iteration (s): 0.15 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 3.746349E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.006 | TFLOPs: 26.24 | +7: iteration 56590/ 60336 | consumed samples: 14487040 | consumed tokens: 29669457920 | elapsed time per iteration (s): 0.15 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 3.755093E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.492 | TFLOPs: 26.21 | +7: iteration 56600/ 60336 | consumed samples: 14489600 | consumed tokens: 29674700800 | elapsed time per iteration (s): 0.15 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.751221E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.766 | TFLOPs: 26.20 | +7: iteration 56610/ 60336 | consumed samples: 14492160 | consumed tokens: 29679943680 | elapsed time per iteration (s): 0.15 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 3.773037E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.038 | TFLOPs: 26.19 | +7: iteration 56620/ 60336 | consumed samples: 14494720 | consumed tokens: 29685186560 | elapsed time per iteration (s): 0.15 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.756657E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.822 | TFLOPs: 26.22 | +7: iteration 56630/ 60336 | consumed samples: 14497280 | consumed tokens: 29690429440 | elapsed time per iteration (s): 0.15 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.754700E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.407 | TFLOPs: 26.24 | +7: iteration 56640/ 60336 | consumed samples: 14499840 | consumed tokens: 29695672320 | elapsed time per iteration (s): 0.15 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.765762E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.853 | TFLOPs: 26.23 | +7: iteration 56650/ 60336 | consumed samples: 14502400 | consumed tokens: 29700915200 | elapsed time per iteration (s): 0.15 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 3.745007E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.935 | TFLOPs: 26.24 | +7: iteration 56660/ 60336 | consumed samples: 14504960 | consumed tokens: 29706158080 | elapsed time per iteration (s): 0.15 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.754489E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.375 | TFLOPs: 26.26 | +7: iteration 56670/ 60336 | consumed samples: 14507520 | consumed tokens: 29711400960 | elapsed time per iteration (s): 0.15 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 3.743929E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.900 | TFLOPs: 26.27 | +7: iteration 56680/ 60336 | consumed samples: 14510080 | consumed tokens: 29716643840 | elapsed time per iteration (s): 0.15 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 3.749841E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.050 | TFLOPs: 26.27 | +7: iteration 56690/ 60336 | consumed samples: 14512640 | consumed tokens: 29721886720 | elapsed time per iteration (s): 0.15 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.763531E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.465 | TFLOPs: 26.26 | +7: iteration 56700/ 60336 | consumed samples: 14515200 | consumed tokens: 29727129600 | elapsed time per iteration (s): 0.15 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 3.740342E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.672 | TFLOPs: 26.29 | +7: iteration 56710/ 60336 | consumed samples: 14517760 | consumed tokens: 29732372480 | elapsed time per iteration (s): 0.18 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.776856E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.010 | TFLOPs: 22.82 | +7: iteration 56720/ 60336 | consumed samples: 14520320 | consumed tokens: 29737615360 | elapsed time per iteration (s): 0.15 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 3.764257E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.902 | TFLOPs: 26.27 | +7: iteration 56730/ 60336 | consumed samples: 14522880 | consumed tokens: 29742858240 | elapsed time per iteration (s): 0.15 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.758490E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.016 | TFLOPs: 26.30 | +7: iteration 56740/ 60336 | consumed samples: 14525440 | consumed tokens: 29748101120 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.769306E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.437 | TFLOPs: 26.24 | +7: iteration 56750/ 60336 | consumed samples: 14528000 | consumed tokens: 29753344000 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.753696E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.447 | TFLOPs: 26.28 | +7: iteration 56760/ 60336 | consumed samples: 14530560 | consumed tokens: 29758586880 | elapsed time per iteration (s): 0.15 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 3.755346E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.008 | TFLOPs: 26.28 | +7: iteration 56770/ 60336 | consumed samples: 14533120 | consumed tokens: 29763829760 | elapsed time per iteration (s): 0.15 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 3.746508E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.898 | TFLOPs: 26.25 | +7: iteration 56780/ 60336 | consumed samples: 14535680 | consumed tokens: 29769072640 | elapsed time per iteration (s): 0.15 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.753527E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.612 | TFLOPs: 26.25 | +7: iteration 56790/ 60336 | consumed samples: 14538240 | consumed tokens: 29774315520 | elapsed time per iteration (s): 0.15 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.766923E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.242 | TFLOPs: 26.29 | +7: iteration 56800/ 60336 | consumed samples: 14540800 | consumed tokens: 29779558400 | elapsed time per iteration (s): 0.15 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 3.753793E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.912 | TFLOPs: 26.28 | +7: iteration 56810/ 60336 | consumed samples: 14543360 | consumed tokens: 29784801280 | elapsed time per iteration (s): 0.15 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.748070E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.719 | TFLOPs: 26.28 | +7: iteration 56820/ 60336 | consumed samples: 14545920 | consumed tokens: 29790044160 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.757980E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.320 | TFLOPs: 26.30 | +7: iteration 56830/ 60336 | consumed samples: 14548480 | consumed tokens: 29795287040 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.749531E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.357 | TFLOPs: 26.27 | +7: iteration 56840/ 60336 | consumed samples: 14551040 | consumed tokens: 29800529920 | elapsed time per iteration (s): 0.17 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 3.749643E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.065 | TFLOPs: 23.32 | +7: iteration 56850/ 60336 | consumed samples: 14553600 | consumed tokens: 29805772800 | elapsed time per iteration (s): 0.15 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 3.755593E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.111 | TFLOPs: 26.29 | +7: iteration 56860/ 60336 | consumed samples: 14556160 | consumed tokens: 29811015680 | elapsed time per iteration (s): 0.15 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.752148E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.042 | TFLOPs: 26.28 | +7: iteration 56870/ 60336 | consumed samples: 14558720 | consumed tokens: 29816258560 | elapsed time per iteration (s): 0.15 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 3.745346E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.276 | TFLOPs: 26.27 | +7: iteration 56880/ 60336 | consumed samples: 14561280 | consumed tokens: 29821501440 | elapsed time per iteration (s): 0.16 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 3.758185E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.313 | TFLOPs: 25.00 | +7: iteration 56890/ 60336 | consumed samples: 14563840 | consumed tokens: 29826744320 | elapsed time per iteration (s): 0.15 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.755537E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.832 | TFLOPs: 26.30 | +7: iteration 56900/ 60336 | consumed samples: 14566400 | consumed tokens: 29831987200 | elapsed time per iteration (s): 0.15 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.764086E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.015 | TFLOPs: 26.30 | +7: iteration 56910/ 60336 | consumed samples: 14568960 | consumed tokens: 29837230080 | elapsed time per iteration (s): 0.15 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.761949E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.660 | TFLOPs: 26.29 | +7: iteration 56920/ 60336 | consumed samples: 14571520 | consumed tokens: 29842472960 | elapsed time per iteration (s): 0.15 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 3.741366E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.633 | TFLOPs: 26.28 | +7: iteration 56930/ 60336 | consumed samples: 14574080 | consumed tokens: 29847715840 | elapsed time per iteration (s): 0.18 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 3.755784E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1425.863 | TFLOPs: 22.36 | +7: iteration 56940/ 60336 | consumed samples: 14576640 | consumed tokens: 29852958720 | elapsed time per iteration (s): 0.16 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.746688E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.881 | TFLOPs: 25.17 | +7: iteration 56950/ 60336 | consumed samples: 14579200 | consumed tokens: 29858201600 | elapsed time per iteration (s): 0.15 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.746962E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.345 | TFLOPs: 26.21 | +7: iteration 56960/ 60336 | consumed samples: 14581760 | consumed tokens: 29863444480 | elapsed time per iteration (s): 0.17 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.755969E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.829 | TFLOPs: 23.69 | +7: iteration 56970/ 60336 | consumed samples: 14584320 | consumed tokens: 29868687360 | elapsed time per iteration (s): 0.17 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.752144E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.210 | TFLOPs: 23.34 | +7: iteration 56980/ 60336 | consumed samples: 14586880 | consumed tokens: 29873930240 | elapsed time per iteration (s): 0.15 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.751774E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.551 | TFLOPs: 26.21 | +7: iteration 56990/ 60336 | consumed samples: 14589440 | consumed tokens: 29879173120 | elapsed time per iteration (s): 0.15 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 3.757822E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.339 | TFLOPs: 26.21 | +7: iteration 57000/ 60336 | consumed samples: 14592000 | consumed tokens: 29884416000 | elapsed time per iteration (s): 0.15 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.752383E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.102 | TFLOPs: 26.19 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 57000 | lm loss value: 3.925637E+00 | lm loss PPL: 5.068535E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 57000 to checkpoints_44m32b100m +0: [2023-03-17 02:46:35,048] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step57000 is begin to save! +0: [2023-03-17 02:46:35,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:46:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:46:35,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:46:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:46:35,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:46:35,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:46:35,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:46:35,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:46:35,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:46:35,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:46:35,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:46:35,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:46:35,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:46:35,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:46:35,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:46:35,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:46:35,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:46:35,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:46:35,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:46:35,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:46:35,181] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step57000/mp_rank_00_model_states.pt +0: [2023-03-17 02:46:35,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:46:35,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:46:35,200] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:46:35,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:46:35,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:46:35,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:46:35,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:46:35,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:46:35,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:46:35,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:46:35,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:46:35,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:46:35,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: successfully saved checkpoint at iteration 57000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 179.90 +7: iteration 57010/ 60336 | consumed samples: 14594560 | consumed tokens: 29889658880 | elapsed time per iteration (s): 0.18 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.749654E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.687 | TFLOPs: 22.75 | +7: iteration 57020/ 60336 | consumed samples: 14597120 | consumed tokens: 29894901760 | elapsed time per iteration (s): 0.15 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.757783E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.902 | TFLOPs: 26.28 | +7: iteration 57030/ 60336 | consumed samples: 14599680 | consumed tokens: 29900144640 | elapsed time per iteration (s): 0.15 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.749794E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.955 | TFLOPs: 26.27 | +7: iteration 57040/ 60336 | consumed samples: 14602240 | consumed tokens: 29905387520 | elapsed time per iteration (s): 0.15 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 3.755017E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.216 | TFLOPs: 26.29 | +7: iteration 57050/ 60336 | consumed samples: 14604800 | consumed tokens: 29910630400 | elapsed time per iteration (s): 0.15 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.758510E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.584 | TFLOPs: 26.28 | +7: iteration 57060/ 60336 | consumed samples: 14607360 | consumed tokens: 29915873280 | elapsed time per iteration (s): 0.15 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 3.748928E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.347 | TFLOPs: 26.27 | +7: iteration 57070/ 60336 | consumed samples: 14609920 | consumed tokens: 29921116160 | elapsed time per iteration (s): 0.15 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.747973E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.144 | TFLOPs: 26.27 | +7: iteration 57080/ 60336 | consumed samples: 14612480 | consumed tokens: 29926359040 | elapsed time per iteration (s): 0.15 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.756128E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.064 | TFLOPs: 26.27 | +7: iteration 57090/ 60336 | consumed samples: 14615040 | consumed tokens: 29931601920 | elapsed time per iteration (s): 0.19 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 3.768420E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1329.188 | TFLOPs: 20.85 | +7: iteration 57100/ 60336 | consumed samples: 14617600 | consumed tokens: 29936844800 | elapsed time per iteration (s): 0.15 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.756205E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.501 | TFLOPs: 26.24 | +7: iteration 57110/ 60336 | consumed samples: 14620160 | consumed tokens: 29942087680 | elapsed time per iteration (s): 0.16 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.754533E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.886 | TFLOPs: 25.67 | +7: iteration 57120/ 60336 | consumed samples: 14622720 | consumed tokens: 29947330560 | elapsed time per iteration (s): 0.15 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.747885E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.618 | TFLOPs: 26.20 | +7: iteration 57130/ 60336 | consumed samples: 14625280 | consumed tokens: 29952573440 | elapsed time per iteration (s): 0.15 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.747439E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.063 | TFLOPs: 26.28 | +7: iteration 57140/ 60336 | consumed samples: 14627840 | consumed tokens: 29957816320 | elapsed time per iteration (s): 0.15 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.758239E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.491 | TFLOPs: 26.28 | +7: iteration 57150/ 60336 | consumed samples: 14630400 | consumed tokens: 29963059200 | elapsed time per iteration (s): 0.17 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 3.747442E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.400 | TFLOPs: 23.70 | +7: iteration 57160/ 60336 | consumed samples: 14632960 | consumed tokens: 29968302080 | elapsed time per iteration (s): 0.17 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.749289E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.549 | TFLOPs: 23.64 | +7: iteration 57170/ 60336 | consumed samples: 14635520 | consumed tokens: 29973544960 | elapsed time per iteration (s): 0.15 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.753217E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.877 | TFLOPs: 26.25 | +7: iteration 57180/ 60336 | consumed samples: 14638080 | consumed tokens: 29978787840 | elapsed time per iteration (s): 0.15 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.748398E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.896 | TFLOPs: 26.25 | +7: iteration 57190/ 60336 | consumed samples: 14640640 | consumed tokens: 29984030720 | elapsed time per iteration (s): 0.15 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.750390E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.249 | TFLOPs: 26.26 | +7: iteration 57200/ 60336 | consumed samples: 14643200 | consumed tokens: 29989273600 | elapsed time per iteration (s): 0.15 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.753352E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.061 | TFLOPs: 26.25 | +7: iteration 57210/ 60336 | consumed samples: 14645760 | consumed tokens: 29994516480 | elapsed time per iteration (s): 0.15 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.746039E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.184 | TFLOPs: 26.26 | +7: iteration 57220/ 60336 | consumed samples: 14648320 | consumed tokens: 29999759360 | elapsed time per iteration (s): 0.19 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.765250E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.310 | TFLOPs: 21.62 | +7: iteration 57230/ 60336 | consumed samples: 14650880 | consumed tokens: 30005002240 | elapsed time per iteration (s): 0.15 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.740919E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.004 | TFLOPs: 26.25 | +7: iteration 57240/ 60336 | consumed samples: 14653440 | consumed tokens: 30010245120 | elapsed time per iteration (s): 0.15 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.760159E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.041 | TFLOPs: 26.24 | +7: iteration 57250/ 60336 | consumed samples: 14656000 | consumed tokens: 30015488000 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.733814E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.258 | TFLOPs: 26.23 | +7: iteration 57260/ 60336 | consumed samples: 14658560 | consumed tokens: 30020730880 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.755675E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.431 | TFLOPs: 26.12 | +7: iteration 57270/ 60336 | consumed samples: 14661120 | consumed tokens: 30025973760 | elapsed time per iteration (s): 0.15 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.757425E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.428 | TFLOPs: 26.13 | +7: iteration 57280/ 60336 | consumed samples: 14663680 | consumed tokens: 30031216640 | elapsed time per iteration (s): 0.15 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.755563E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.196 | TFLOPs: 26.22 | +7: iteration 57290/ 60336 | consumed samples: 14666240 | consumed tokens: 30036459520 | elapsed time per iteration (s): 0.15 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.760951E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.125 | TFLOPs: 26.14 | +7: iteration 57300/ 60336 | consumed samples: 14668800 | consumed tokens: 30041702400 | elapsed time per iteration (s): 0.15 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.766608E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.748 | TFLOPs: 26.11 | +7: iteration 57310/ 60336 | consumed samples: 14671360 | consumed tokens: 30046945280 | elapsed time per iteration (s): 0.15 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.742573E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.221 | TFLOPs: 26.10 | +7: iteration 57320/ 60336 | consumed samples: 14673920 | consumed tokens: 30052188160 | elapsed time per iteration (s): 0.15 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.752828E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.887 | TFLOPs: 26.11 | +7: iteration 57330/ 60336 | consumed samples: 14676480 | consumed tokens: 30057431040 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.749399E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.032 | TFLOPs: 26.11 | +7: iteration 57340/ 60336 | consumed samples: 14679040 | consumed tokens: 30062673920 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.747700E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.160 | TFLOPs: 25.93 | +7: iteration 57350/ 60336 | consumed samples: 14681600 | consumed tokens: 30067916800 | elapsed time per iteration (s): 0.19 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.751410E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1366.028 | TFLOPs: 21.42 | +7: iteration 57360/ 60336 | consumed samples: 14684160 | consumed tokens: 30073159680 | elapsed time per iteration (s): 0.15 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.751989E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.532 | TFLOPs: 26.25 | +7: iteration 57370/ 60336 | consumed samples: 14686720 | consumed tokens: 30078402560 | elapsed time per iteration (s): 0.15 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.755520E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.461 | TFLOPs: 26.26 | +7: iteration 57380/ 60336 | consumed samples: 14689280 | consumed tokens: 30083645440 | elapsed time per iteration (s): 0.15 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.751128E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.934 | TFLOPs: 26.24 | +7: iteration 57390/ 60336 | consumed samples: 14691840 | consumed tokens: 30088888320 | elapsed time per iteration (s): 0.15 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.754052E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.812 | TFLOPs: 26.25 | +7: iteration 57400/ 60336 | consumed samples: 14694400 | consumed tokens: 30094131200 | elapsed time per iteration (s): 0.15 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.734962E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.077 | TFLOPs: 26.21 | +7: iteration 57410/ 60336 | consumed samples: 14696960 | consumed tokens: 30099374080 | elapsed time per iteration (s): 0.15 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.754134E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.983 | TFLOPs: 26.24 | +7: iteration 57420/ 60336 | consumed samples: 14699520 | consumed tokens: 30104616960 | elapsed time per iteration (s): 0.15 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.752822E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.134 | TFLOPs: 26.24 | +7: iteration 57430/ 60336 | consumed samples: 14702080 | consumed tokens: 30109859840 | elapsed time per iteration (s): 0.15 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.743798E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.523 | TFLOPs: 26.25 | +7: iteration 57440/ 60336 | consumed samples: 14704640 | consumed tokens: 30115102720 | elapsed time per iteration (s): 0.15 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.755277E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.046 | TFLOPs: 26.24 | +7: iteration 57450/ 60336 | consumed samples: 14707200 | consumed tokens: 30120345600 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.762840E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.885 | TFLOPs: 26.24 | +7: iteration 57460/ 60336 | consumed samples: 14709760 | consumed tokens: 30125588480 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.762673E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.867 | TFLOPs: 26.23 | +7: iteration 57470/ 60336 | consumed samples: 14712320 | consumed tokens: 30130831360 | elapsed time per iteration (s): 0.15 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.741633E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.900 | TFLOPs: 26.24 | +7: iteration 57480/ 60336 | consumed samples: 14714880 | consumed tokens: 30136074240 | elapsed time per iteration (s): 0.17 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.747978E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.563 | TFLOPs: 23.50 | +7: iteration 57490/ 60336 | consumed samples: 14717440 | consumed tokens: 30141317120 | elapsed time per iteration (s): 0.15 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.764930E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.596 | TFLOPs: 26.26 | +7: iteration 57500/ 60336 | consumed samples: 14720000 | consumed tokens: 30146560000 | elapsed time per iteration (s): 0.15 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.755544E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.593 | TFLOPs: 26.25 | +7: iteration 57510/ 60336 | consumed samples: 14722560 | consumed tokens: 30151802880 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.745083E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.869 | TFLOPs: 26.23 | +7: iteration 57520/ 60336 | consumed samples: 14725120 | consumed tokens: 30157045760 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.746855E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.854 | TFLOPs: 26.25 | +7: iteration 57530/ 60336 | consumed samples: 14727680 | consumed tokens: 30162288640 | elapsed time per iteration (s): 0.15 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.761364E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.721 | TFLOPs: 26.22 | +7: iteration 57540/ 60336 | consumed samples: 14730240 | consumed tokens: 30167531520 | elapsed time per iteration (s): 0.15 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.746795E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.453 | TFLOPs: 26.23 | +7: iteration 57550/ 60336 | consumed samples: 14732800 | consumed tokens: 30172774400 | elapsed time per iteration (s): 0.15 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.754651E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.340 | TFLOPs: 26.24 | +7: iteration 57560/ 60336 | consumed samples: 14735360 | consumed tokens: 30178017280 | elapsed time per iteration (s): 0.15 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.752450E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.007 | TFLOPs: 26.22 | +7: iteration 57570/ 60336 | consumed samples: 14737920 | consumed tokens: 30183260160 | elapsed time per iteration (s): 0.15 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.763675E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.613 | TFLOPs: 26.25 | +7: iteration 57580/ 60336 | consumed samples: 14740480 | consumed tokens: 30188503040 | elapsed time per iteration (s): 0.15 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.750221E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.776 | TFLOPs: 26.22 | +7: iteration 57590/ 60336 | consumed samples: 14743040 | consumed tokens: 30193745920 | elapsed time per iteration (s): 0.16 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.758909E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.633 | TFLOPs: 25.62 | +7: iteration 57600/ 60336 | consumed samples: 14745600 | consumed tokens: 30198988800 | elapsed time per iteration (s): 0.17 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.754742E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.448 | TFLOPs: 23.67 | +7: iteration 57610/ 60336 | consumed samples: 14748160 | consumed tokens: 30204231680 | elapsed time per iteration (s): 0.15 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.751983E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.751 | TFLOPs: 26.25 | +7: iteration 57620/ 60336 | consumed samples: 14750720 | consumed tokens: 30209474560 | elapsed time per iteration (s): 0.15 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.754939E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.516 | TFLOPs: 26.24 | +7: iteration 57630/ 60336 | consumed samples: 14753280 | consumed tokens: 30214717440 | elapsed time per iteration (s): 0.15 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.743615E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.365 | TFLOPs: 26.24 | +7: iteration 57640/ 60336 | consumed samples: 14755840 | consumed tokens: 30219960320 | elapsed time per iteration (s): 0.15 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.751144E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.138 | TFLOPs: 26.24 | +7: iteration 57650/ 60336 | consumed samples: 14758400 | consumed tokens: 30225203200 | elapsed time per iteration (s): 0.15 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.761131E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.867 | TFLOPs: 26.22 | +7: iteration 57660/ 60336 | consumed samples: 14760960 | consumed tokens: 30230446080 | elapsed time per iteration (s): 0.15 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.759184E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.183 | TFLOPs: 26.24 | +7: iteration 57670/ 60336 | consumed samples: 14763520 | consumed tokens: 30235688960 | elapsed time per iteration (s): 0.15 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.747856E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.636 | TFLOPs: 26.25 | +7: iteration 57680/ 60336 | consumed samples: 14766080 | consumed tokens: 30240931840 | elapsed time per iteration (s): 0.15 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.744201E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.857 | TFLOPs: 26.22 | +7: iteration 57690/ 60336 | consumed samples: 14768640 | consumed tokens: 30246174720 | elapsed time per iteration (s): 0.15 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.757093E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.607 | TFLOPs: 26.25 | +7: iteration 57700/ 60336 | consumed samples: 14771200 | consumed tokens: 30251417600 | elapsed time per iteration (s): 0.15 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.750648E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.010 | TFLOPs: 26.22 | +7: iteration 57710/ 60336 | consumed samples: 14773760 | consumed tokens: 30256660480 | elapsed time per iteration (s): 0.16 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.747445E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.352 | TFLOPs: 25.60 | +7: iteration 57720/ 60336 | consumed samples: 14776320 | consumed tokens: 30261903360 | elapsed time per iteration (s): 0.16 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.766940E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.267 | TFLOPs: 25.69 | +7: iteration 57730/ 60336 | consumed samples: 14778880 | consumed tokens: 30267146240 | elapsed time per iteration (s): 0.17 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.746259E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.674 | TFLOPs: 23.69 | +7: iteration 57740/ 60336 | consumed samples: 14781440 | consumed tokens: 30272389120 | elapsed time per iteration (s): 0.15 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.769273E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.024 | TFLOPs: 26.24 | +7: iteration 57750/ 60336 | consumed samples: 14784000 | consumed tokens: 30277632000 | elapsed time per iteration (s): 0.15 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.768096E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.805 | TFLOPs: 26.23 | +7: iteration 57760/ 60336 | consumed samples: 14786560 | consumed tokens: 30282874880 | elapsed time per iteration (s): 0.15 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.748692E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.430 | TFLOPs: 26.21 | +7: iteration 57770/ 60336 | consumed samples: 14789120 | consumed tokens: 30288117760 | elapsed time per iteration (s): 0.15 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.750075E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.915 | TFLOPs: 26.20 | +7: iteration 57780/ 60336 | consumed samples: 14791680 | consumed tokens: 30293360640 | elapsed time per iteration (s): 0.15 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.767509E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.443 | TFLOPs: 26.23 | +7: iteration 57790/ 60336 | consumed samples: 14794240 | consumed tokens: 30298603520 | elapsed time per iteration (s): 0.15 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.756493E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.042 | TFLOPs: 26.21 | +7: iteration 57800/ 60336 | consumed samples: 14796800 | consumed tokens: 30303846400 | elapsed time per iteration (s): 0.15 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.752799E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.190 | TFLOPs: 26.19 | +7: iteration 57810/ 60336 | consumed samples: 14799360 | consumed tokens: 30309089280 | elapsed time per iteration (s): 0.15 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.755242E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.434 | TFLOPs: 26.20 | +7: iteration 57820/ 60336 | consumed samples: 14801920 | consumed tokens: 30314332160 | elapsed time per iteration (s): 0.15 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.757887E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.738 | TFLOPs: 26.20 | +7: iteration 57830/ 60336 | consumed samples: 14804480 | consumed tokens: 30319575040 | elapsed time per iteration (s): 0.15 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.750008E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.820 | TFLOPs: 26.20 | +7: iteration 57840/ 60336 | consumed samples: 14807040 | consumed tokens: 30324817920 | elapsed time per iteration (s): 0.15 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.742090E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.035 | TFLOPs: 26.22 | +7: iteration 57850/ 60336 | consumed samples: 14809600 | consumed tokens: 30330060800 | elapsed time per iteration (s): 0.15 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.752602E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.605 | TFLOPs: 26.25 | +7: iteration 57860/ 60336 | consumed samples: 14812160 | consumed tokens: 30335303680 | elapsed time per iteration (s): 0.17 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.739988E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1500.904 | TFLOPs: 23.54 | +7: iteration 57870/ 60336 | consumed samples: 14814720 | consumed tokens: 30340546560 | elapsed time per iteration (s): 0.15 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.752514E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.086 | TFLOPs: 26.27 | +7: iteration 57880/ 60336 | consumed samples: 14817280 | consumed tokens: 30345789440 | elapsed time per iteration (s): 0.15 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.748291E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.392 | TFLOPs: 26.26 | +7: iteration 57890/ 60336 | consumed samples: 14819840 | consumed tokens: 30351032320 | elapsed time per iteration (s): 0.15 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.756984E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.398 | TFLOPs: 26.26 | +7: iteration 57900/ 60336 | consumed samples: 14822400 | consumed tokens: 30356275200 | elapsed time per iteration (s): 0.15 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.746221E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.379 | TFLOPs: 26.26 | +7: iteration 57910/ 60336 | consumed samples: 14824960 | consumed tokens: 30361518080 | elapsed time per iteration (s): 0.15 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.751292E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.805 | TFLOPs: 26.27 | +7: iteration 57920/ 60336 | consumed samples: 14827520 | consumed tokens: 30366760960 | elapsed time per iteration (s): 0.15 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.754043E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.915 | TFLOPs: 26.24 | +7: iteration 57930/ 60336 | consumed samples: 14830080 | consumed tokens: 30372003840 | elapsed time per iteration (s): 0.15 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.752988E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.275 | TFLOPs: 26.26 | +7: iteration 57940/ 60336 | consumed samples: 14832640 | consumed tokens: 30377246720 | elapsed time per iteration (s): 0.15 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.757141E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.096 | TFLOPs: 26.24 | +7: iteration 57950/ 60336 | consumed samples: 14835200 | consumed tokens: 30382489600 | elapsed time per iteration (s): 0.15 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.749950E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.829 | TFLOPs: 26.23 | +7: iteration 57960/ 60336 | consumed samples: 14837760 | consumed tokens: 30387732480 | elapsed time per iteration (s): 0.15 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.759950E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.067 | TFLOPs: 26.25 | +7: iteration 57970/ 60336 | consumed samples: 14840320 | consumed tokens: 30392975360 | elapsed time per iteration (s): 0.15 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.750650E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.749 | TFLOPs: 26.25 | +7: iteration 57980/ 60336 | consumed samples: 14842880 | consumed tokens: 30398218240 | elapsed time per iteration (s): 0.15 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.747147E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.362 | TFLOPs: 26.24 | +7: iteration 57990/ 60336 | consumed samples: 14845440 | consumed tokens: 30403461120 | elapsed time per iteration (s): 0.17 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.750822E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.187 | TFLOPs: 23.67 | +0: [2023-03-17 02:49:10,688] [INFO] [logging.py:68:log_dist] [Rank 0] step=58000, skipped=0, lr=[2.067844497501907e-05, 2.067844497501907e-05, 2.067844497501907e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 58000/ 60336 | consumed samples: 14848000 | consumed tokens: 30408704000 | elapsed time per iteration (s): 0.15 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.753077E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.228 | TFLOPs: 26.22 | +0: steps: 58000 loss: 3.7860 iter time (s): 0.154 samples/sec: 1662.667 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 58000 | lm loss value: 3.852386E+00 | lm loss PPL: 4.710531E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 58000 to checkpoints_44m32b100m +0: [2023-03-17 02:49:10,769] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step58000 is begin to save! +0: [2023-03-17 02:49:10,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:49:10,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:49:10,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:49:10,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:49:10,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:49:10,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:49:10,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:49:10,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:49:10,884] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:49:10,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:49:10,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:49:10,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:49:10,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:49:10,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:49:10,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:49:10,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:49:10,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:49:10,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:49:10,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:49:10,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:49:10,925] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step58000/mp_rank_00_model_states.pt +0: [2023-03-17 02:49:10,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:49:10,926] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:49:10,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:49:10,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:49:10,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:49:10,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:49:10,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:49:10,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:49:10,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:49:10,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:49:10,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:49:10,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:49:10,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: successfully saved checkpoint at iteration 58000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 202.80 +7: iteration 58010/ 60336 | consumed samples: 14850560 | consumed tokens: 30413946880 | elapsed time per iteration (s): 0.18 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.759016E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.156 | TFLOPs: 22.19 | +7: iteration 58020/ 60336 | consumed samples: 14853120 | consumed tokens: 30419189760 | elapsed time per iteration (s): 0.15 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.754873E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.895 | TFLOPs: 26.27 | +7: iteration 58030/ 60336 | consumed samples: 14855680 | consumed tokens: 30424432640 | elapsed time per iteration (s): 0.15 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.757034E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.942 | TFLOPs: 26.27 | +7: iteration 58040/ 60336 | consumed samples: 14858240 | consumed tokens: 30429675520 | elapsed time per iteration (s): 0.15 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.735770E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.455 | TFLOPs: 26.24 | +7: iteration 58050/ 60336 | consumed samples: 14860800 | consumed tokens: 30434918400 | elapsed time per iteration (s): 0.15 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.751105E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.260 | TFLOPs: 26.24 | +7: iteration 58060/ 60336 | consumed samples: 14863360 | consumed tokens: 30440161280 | elapsed time per iteration (s): 0.15 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.759649E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.300 | TFLOPs: 26.23 | +7: iteration 58070/ 60336 | consumed samples: 14865920 | consumed tokens: 30445404160 | elapsed time per iteration (s): 0.15 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.738699E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.328 | TFLOPs: 26.26 | +7: iteration 58080/ 60336 | consumed samples: 14868480 | consumed tokens: 30450647040 | elapsed time per iteration (s): 0.15 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.757816E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.732 | TFLOPs: 26.25 | +7: iteration 58090/ 60336 | consumed samples: 14871040 | consumed tokens: 30455889920 | elapsed time per iteration (s): 0.15 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.757588E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.762 | TFLOPs: 26.22 | +7: iteration 58100/ 60336 | consumed samples: 14873600 | consumed tokens: 30461132800 | elapsed time per iteration (s): 0.15 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.749247E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.935 | TFLOPs: 26.19 | +7: iteration 58110/ 60336 | consumed samples: 14876160 | consumed tokens: 30466375680 | elapsed time per iteration (s): 0.17 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.768082E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.416 | TFLOPs: 23.72 | +7: iteration 58120/ 60336 | consumed samples: 14878720 | consumed tokens: 30471618560 | elapsed time per iteration (s): 0.15 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.756546E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.396 | TFLOPs: 26.18 | +7: iteration 58130/ 60336 | consumed samples: 14881280 | consumed tokens: 30476861440 | elapsed time per iteration (s): 0.15 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.749184E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.651 | TFLOPs: 26.18 | +7: iteration 58140/ 60336 | consumed samples: 14883840 | consumed tokens: 30482104320 | elapsed time per iteration (s): 0.15 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.766823E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.894 | TFLOPs: 26.16 | +7: iteration 58150/ 60336 | consumed samples: 14886400 | consumed tokens: 30487347200 | elapsed time per iteration (s): 0.15 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.751837E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.143 | TFLOPs: 26.07 | +7: iteration 58160/ 60336 | consumed samples: 14888960 | consumed tokens: 30492590080 | elapsed time per iteration (s): 0.15 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.766101E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.716 | TFLOPs: 26.08 | +7: iteration 58170/ 60336 | consumed samples: 14891520 | consumed tokens: 30497832960 | elapsed time per iteration (s): 0.16 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.745596E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.313 | TFLOPs: 25.60 | +7: iteration 58180/ 60336 | consumed samples: 14894080 | consumed tokens: 30503075840 | elapsed time per iteration (s): 0.15 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.750227E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.667 | TFLOPs: 26.04 | +7: iteration 58190/ 60336 | consumed samples: 14896640 | consumed tokens: 30508318720 | elapsed time per iteration (s): 0.15 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.768335E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.174 | TFLOPs: 26.04 | +7: iteration 58200/ 60336 | consumed samples: 14899200 | consumed tokens: 30513561600 | elapsed time per iteration (s): 0.15 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.753812E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.926 | TFLOPs: 26.02 | +7: iteration 58210/ 60336 | consumed samples: 14901760 | consumed tokens: 30518804480 | elapsed time per iteration (s): 0.15 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.761065E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.581 | TFLOPs: 26.04 | +7: iteration 58220/ 60336 | consumed samples: 14904320 | consumed tokens: 30524047360 | elapsed time per iteration (s): 0.15 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.751842E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.103 | TFLOPs: 26.05 | +7: iteration 58230/ 60336 | consumed samples: 14906880 | consumed tokens: 30529290240 | elapsed time per iteration (s): 0.15 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.749603E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.179 | TFLOPs: 26.07 | +7: iteration 58240/ 60336 | consumed samples: 14909440 | consumed tokens: 30534533120 | elapsed time per iteration (s): 0.19 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.761131E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.351 | TFLOPs: 21.57 | +7: iteration 58250/ 60336 | consumed samples: 14912000 | consumed tokens: 30539776000 | elapsed time per iteration (s): 0.15 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.748161E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.284 | TFLOPs: 26.07 | +7: iteration 58260/ 60336 | consumed samples: 14914560 | consumed tokens: 30545018880 | elapsed time per iteration (s): 0.15 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.760356E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.982 | TFLOPs: 26.06 | +7: iteration 58270/ 60336 | consumed samples: 14917120 | consumed tokens: 30550261760 | elapsed time per iteration (s): 0.15 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.750200E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.074 | TFLOPs: 26.03 | +7: iteration 58280/ 60336 | consumed samples: 14919680 | consumed tokens: 30555504640 | elapsed time per iteration (s): 0.15 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.749752E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.555 | TFLOPs: 26.03 | +7: iteration 58290/ 60336 | consumed samples: 14922240 | consumed tokens: 30560747520 | elapsed time per iteration (s): 0.16 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.754622E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.565 | TFLOPs: 24.88 | +7: iteration 58300/ 60336 | consumed samples: 14924800 | consumed tokens: 30565990400 | elapsed time per iteration (s): 0.15 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.769953E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.615 | TFLOPs: 26.00 | +7: iteration 58310/ 60336 | consumed samples: 14927360 | consumed tokens: 30571233280 | elapsed time per iteration (s): 0.16 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.754071E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.049 | TFLOPs: 25.01 | +7: iteration 58320/ 60336 | consumed samples: 14929920 | consumed tokens: 30576476160 | elapsed time per iteration (s): 0.15 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.763300E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.632 | TFLOPs: 25.98 | +7: iteration 58330/ 60336 | consumed samples: 14932480 | consumed tokens: 30581719040 | elapsed time per iteration (s): 0.15 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.763680E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.114 | TFLOPs: 26.07 | +7: iteration 58340/ 60336 | consumed samples: 14935040 | consumed tokens: 30586961920 | elapsed time per iteration (s): 0.17 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.731086E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.840 | TFLOPs: 23.65 | +7: iteration 58350/ 60336 | consumed samples: 14937600 | consumed tokens: 30592204800 | elapsed time per iteration (s): 0.16 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.745555E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.550 | TFLOPs: 25.19 | +7: iteration 58360/ 60336 | consumed samples: 14940160 | consumed tokens: 30597447680 | elapsed time per iteration (s): 0.16 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.745641E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.162 | TFLOPs: 25.72 | +7: iteration 58370/ 60336 | consumed samples: 14942720 | consumed tokens: 30602690560 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.763110E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.812 | TFLOPs: 26.06 | +7: iteration 58380/ 60336 | consumed samples: 14945280 | consumed tokens: 30607933440 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.741438E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.702 | TFLOPs: 26.06 | +7: iteration 58390/ 60336 | consumed samples: 14947840 | consumed tokens: 30613176320 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.758868E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.633 | TFLOPs: 26.04 | +7: iteration 58400/ 60336 | consumed samples: 14950400 | consumed tokens: 30618419200 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.752753E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.134 | TFLOPs: 26.07 | +7: iteration 58410/ 60336 | consumed samples: 14952960 | consumed tokens: 30623662080 | elapsed time per iteration (s): 0.15 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.761961E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.353 | TFLOPs: 26.05 | +7: iteration 58420/ 60336 | consumed samples: 14955520 | consumed tokens: 30628904960 | elapsed time per iteration (s): 0.15 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.757802E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.237 | TFLOPs: 26.05 | +7: iteration 58430/ 60336 | consumed samples: 14958080 | consumed tokens: 30634147840 | elapsed time per iteration (s): 0.15 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.746022E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.857 | TFLOPs: 26.06 | +7: iteration 58440/ 60336 | consumed samples: 14960640 | consumed tokens: 30639390720 | elapsed time per iteration (s): 0.16 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.752917E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.497 | TFLOPs: 25.84 | +7: iteration 58450/ 60336 | consumed samples: 14963200 | consumed tokens: 30644633600 | elapsed time per iteration (s): 0.15 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.738453E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.504 | TFLOPs: 26.04 | +7: iteration 58460/ 60336 | consumed samples: 14965760 | consumed tokens: 30649876480 | elapsed time per iteration (s): 0.15 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.747477E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.563 | TFLOPs: 25.92 | +7: iteration 58470/ 60336 | consumed samples: 14968320 | consumed tokens: 30655119360 | elapsed time per iteration (s): 0.15 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.748551E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.370 | TFLOPs: 26.02 | +7: iteration 58480/ 60336 | consumed samples: 14970880 | consumed tokens: 30660362240 | elapsed time per iteration (s): 0.15 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.760286E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.004 | TFLOPs: 26.05 | +7: iteration 58490/ 60336 | consumed samples: 14973440 | consumed tokens: 30665605120 | elapsed time per iteration (s): 0.15 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.764082E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.069 | TFLOPs: 26.03 | +7: iteration 58500/ 60336 | consumed samples: 14976000 | consumed tokens: 30670848000 | elapsed time per iteration (s): 0.15 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.747401E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.973 | TFLOPs: 26.03 | +7: iteration 58510/ 60336 | consumed samples: 14978560 | consumed tokens: 30676090880 | elapsed time per iteration (s): 0.15 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.758585E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.308 | TFLOPs: 26.02 | +7: iteration 58520/ 60336 | consumed samples: 14981120 | consumed tokens: 30681333760 | elapsed time per iteration (s): 0.15 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.745059E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.719 | TFLOPs: 26.03 | +7: iteration 58530/ 60336 | consumed samples: 14983680 | consumed tokens: 30686576640 | elapsed time per iteration (s): 0.15 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.752182E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.909 | TFLOPs: 26.03 | +7: iteration 58540/ 60336 | consumed samples: 14986240 | consumed tokens: 30691819520 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.752726E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.341 | TFLOPs: 25.66 | +7: iteration 58550/ 60336 | consumed samples: 14988800 | consumed tokens: 30697062400 | elapsed time per iteration (s): 0.15 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.762418E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.707 | TFLOPs: 26.06 | +7: iteration 58560/ 60336 | consumed samples: 14991360 | consumed tokens: 30702305280 | elapsed time per iteration (s): 0.15 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.746254E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.793 | TFLOPs: 26.05 | +7: iteration 58570/ 60336 | consumed samples: 14993920 | consumed tokens: 30707548160 | elapsed time per iteration (s): 0.15 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.739215E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.661 | TFLOPs: 26.06 | +7: iteration 58580/ 60336 | consumed samples: 14996480 | consumed tokens: 30712791040 | elapsed time per iteration (s): 0.15 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.750024E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.363 | TFLOPs: 26.07 | +7: iteration 58590/ 60336 | consumed samples: 14999040 | consumed tokens: 30718033920 | elapsed time per iteration (s): 0.15 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.746538E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.199 | TFLOPs: 26.07 | +7: iteration 58600/ 60336 | consumed samples: 15001600 | consumed tokens: 30723276800 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.758530E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.494 | TFLOPs: 26.04 | +7: iteration 58610/ 60336 | consumed samples: 15004160 | consumed tokens: 30728519680 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.739462E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.942 | TFLOPs: 26.05 | +7: iteration 58620/ 60336 | consumed samples: 15006720 | consumed tokens: 30733762560 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.745064E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.101 | TFLOPs: 26.05 | +7: iteration 58630/ 60336 | consumed samples: 15009280 | consumed tokens: 30739005440 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.748212E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.581 | TFLOPs: 26.06 | +7: iteration 58640/ 60336 | consumed samples: 15011840 | consumed tokens: 30744248320 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.763442E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.845 | TFLOPs: 26.03 | +7: iteration 58650/ 60336 | consumed samples: 15014400 | consumed tokens: 30749491200 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.753623E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.645 | TFLOPs: 26.04 | +7: iteration 58660/ 60336 | consumed samples: 15016960 | consumed tokens: 30754734080 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.749396E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.247 | TFLOPs: 26.04 | +7: iteration 58670/ 60336 | consumed samples: 15019520 | consumed tokens: 30759976960 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.755500E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.150 | TFLOPs: 26.05 | +7: iteration 58680/ 60336 | consumed samples: 15022080 | consumed tokens: 30765219840 | elapsed time per iteration (s): 0.15 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.761907E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.603 | TFLOPs: 26.04 | +7: iteration 58690/ 60336 | consumed samples: 15024640 | consumed tokens: 30770462720 | elapsed time per iteration (s): 0.15 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.742260E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.016 | TFLOPs: 26.03 | +7: iteration 58700/ 60336 | consumed samples: 15027200 | consumed tokens: 30775705600 | elapsed time per iteration (s): 0.15 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.762062E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.236 | TFLOPs: 26.04 | +7: iteration 58710/ 60336 | consumed samples: 15029760 | consumed tokens: 30780948480 | elapsed time per iteration (s): 0.15 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.758136E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.974 | TFLOPs: 26.05 | +7: iteration 58720/ 60336 | consumed samples: 15032320 | consumed tokens: 30786191360 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.753799E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.204 | TFLOPs: 26.04 | +7: iteration 58730/ 60336 | consumed samples: 15034880 | consumed tokens: 30791434240 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.753089E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.786 | TFLOPs: 26.03 | +7: iteration 58740/ 60336 | consumed samples: 15037440 | consumed tokens: 30796677120 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.752756E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.648 | TFLOPs: 26.04 | +7: iteration 58750/ 60336 | consumed samples: 15040000 | consumed tokens: 30801920000 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.752463E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.079 | TFLOPs: 26.03 | +7: iteration 58760/ 60336 | consumed samples: 15042560 | consumed tokens: 30807162880 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.751677E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.733 | TFLOPs: 26.06 | +7: iteration 58770/ 60336 | consumed samples: 15045120 | consumed tokens: 30812405760 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.759261E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.343 | TFLOPs: 26.05 | +7: iteration 58780/ 60336 | consumed samples: 15047680 | consumed tokens: 30817648640 | elapsed time per iteration (s): 0.15 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.744547E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.167 | TFLOPs: 26.05 | +7: iteration 58790/ 60336 | consumed samples: 15050240 | consumed tokens: 30822891520 | elapsed time per iteration (s): 0.15 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.747911E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.430 | TFLOPs: 26.07 | +7: iteration 58800/ 60336 | consumed samples: 15052800 | consumed tokens: 30828134400 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.752545E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.289 | TFLOPs: 26.10 | +7: iteration 58810/ 60336 | consumed samples: 15055360 | consumed tokens: 30833377280 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.747919E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.203 | TFLOPs: 26.08 | +7: iteration 58820/ 60336 | consumed samples: 15057920 | consumed tokens: 30838620160 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.754386E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.916 | TFLOPs: 26.08 | +7: iteration 58830/ 60336 | consumed samples: 15060480 | consumed tokens: 30843863040 | elapsed time per iteration (s): 0.15 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.759174E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.356 | TFLOPs: 26.10 | +7: iteration 58840/ 60336 | consumed samples: 15063040 | consumed tokens: 30849105920 | elapsed time per iteration (s): 0.15 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.750137E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.860 | TFLOPs: 26.14 | +7: iteration 58850/ 60336 | consumed samples: 15065600 | consumed tokens: 30854348800 | elapsed time per iteration (s): 0.15 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.754270E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.840 | TFLOPs: 26.16 | +7: iteration 58860/ 60336 | consumed samples: 15068160 | consumed tokens: 30859591680 | elapsed time per iteration (s): 0.15 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.750636E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.051 | TFLOPs: 26.14 | +7: iteration 58870/ 60336 | consumed samples: 15070720 | consumed tokens: 30864834560 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.747212E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.004 | TFLOPs: 25.81 | +7: iteration 58880/ 60336 | consumed samples: 15073280 | consumed tokens: 30870077440 | elapsed time per iteration (s): 0.15 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.749995E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.801 | TFLOPs: 26.16 | +7: iteration 58890/ 60336 | consumed samples: 15075840 | consumed tokens: 30875320320 | elapsed time per iteration (s): 0.15 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.774392E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.671 | TFLOPs: 26.17 | +7: iteration 58900/ 60336 | consumed samples: 15078400 | consumed tokens: 30880563200 | elapsed time per iteration (s): 0.15 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.763304E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.814 | TFLOPs: 26.17 | +7: iteration 58910/ 60336 | consumed samples: 15080960 | consumed tokens: 30885806080 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.762212E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.002 | TFLOPs: 26.16 | +7: iteration 58920/ 60336 | consumed samples: 15083520 | consumed tokens: 30891048960 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.763331E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.113 | TFLOPs: 26.16 | +7: iteration 58930/ 60336 | consumed samples: 15086080 | consumed tokens: 30896291840 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.755003E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.398 | TFLOPs: 26.15 | +7: iteration 58940/ 60336 | consumed samples: 15088640 | consumed tokens: 30901534720 | elapsed time per iteration (s): 0.15 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.748117E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.016 | TFLOPs: 26.16 | +7: iteration 58950/ 60336 | consumed samples: 15091200 | consumed tokens: 30906777600 | elapsed time per iteration (s): 0.15 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.748550E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.089 | TFLOPs: 26.14 | +7: iteration 58960/ 60336 | consumed samples: 15093760 | consumed tokens: 30912020480 | elapsed time per iteration (s): 0.15 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.749277E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.317 | TFLOPs: 26.15 | +7: iteration 58970/ 60336 | consumed samples: 15096320 | consumed tokens: 30917263360 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.764625E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.061 | TFLOPs: 26.16 | +7: iteration 58980/ 60336 | consumed samples: 15098880 | consumed tokens: 30922506240 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.757960E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.775 | TFLOPs: 26.15 | +7: iteration 58990/ 60336 | consumed samples: 15101440 | consumed tokens: 30927749120 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.743528E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.112 | TFLOPs: 26.13 | +7: iteration 59000/ 60336 | consumed samples: 15104000 | consumed tokens: 30932992000 | elapsed time per iteration (s): 0.15 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.746297E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.601 | TFLOPs: 26.18 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 59000 | lm loss value: 3.860917E+00 | lm loss PPL: 4.750888E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 59000 to checkpoints_44m32b100m +0: [2023-03-17 02:51:45,833] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step59000 is begin to save! +0: [2023-03-17 02:51:45,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:51:45,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:51:45,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:51:45,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:51:45,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:51:45,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:51:45,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:51:45,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:51:45,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:51:45,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:51:45,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:51:45,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:51:45,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:51:45,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:51:45,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:51:45,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:51:45,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:51:45,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:51:45,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:51:45,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:51:45,964] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step59000/mp_rank_00_model_states.pt +0: [2023-03-17 02:51:45,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:51:45,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:45,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:45,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:45,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:45,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:45,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:45,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:45,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:45,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:45,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:45,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:45,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:45,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:45,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:45,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:46,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:46,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:46,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:46,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:46,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:46,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:46,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:46,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:46,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:51:46,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:51:46,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:46,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:51:46,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: successfully saved checkpoint at iteration 59000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 178.32 +7: iteration 59010/ 60336 | consumed samples: 15106560 | consumed tokens: 30938234880 | elapsed time per iteration (s): 0.18 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.760270E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1454.403 | TFLOPs: 22.81 | +7: iteration 59020/ 60336 | consumed samples: 15109120 | consumed tokens: 30943477760 | elapsed time per iteration (s): 0.15 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.751021E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.449 | TFLOPs: 26.17 | +7: iteration 59030/ 60336 | consumed samples: 15111680 | consumed tokens: 30948720640 | elapsed time per iteration (s): 0.15 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.740213E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.120 | TFLOPs: 26.18 | +7: iteration 59040/ 60336 | consumed samples: 15114240 | consumed tokens: 30953963520 | elapsed time per iteration (s): 0.15 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.758894E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.779 | TFLOPs: 26.14 | +7: iteration 59050/ 60336 | consumed samples: 15116800 | consumed tokens: 30959206400 | elapsed time per iteration (s): 0.15 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.746587E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.942 | TFLOPs: 26.14 | +7: iteration 59060/ 60336 | consumed samples: 15119360 | consumed tokens: 30964449280 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.752788E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.218 | TFLOPs: 26.11 | +7: iteration 59070/ 60336 | consumed samples: 15121920 | consumed tokens: 30969692160 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.749352E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.476 | TFLOPs: 26.12 | +7: iteration 59080/ 60336 | consumed samples: 15124480 | consumed tokens: 30974935040 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.766326E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.483 | TFLOPs: 26.13 | +7: iteration 59090/ 60336 | consumed samples: 15127040 | consumed tokens: 30980177920 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.752886E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.166 | TFLOPs: 26.15 | +7: iteration 59100/ 60336 | consumed samples: 15129600 | consumed tokens: 30985420800 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.752436E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.952 | TFLOPs: 26.16 | +7: iteration 59110/ 60336 | consumed samples: 15132160 | consumed tokens: 30990663680 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.759042E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.714 | TFLOPs: 26.15 | +7: iteration 59120/ 60336 | consumed samples: 15134720 | consumed tokens: 30995906560 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.742317E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.725 | TFLOPs: 26.14 | +7: iteration 59130/ 60336 | consumed samples: 15137280 | consumed tokens: 31001149440 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.745595E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.337 | TFLOPs: 26.16 | +7: iteration 59140/ 60336 | consumed samples: 15139840 | consumed tokens: 31006392320 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.759879E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.310 | TFLOPs: 26.15 | +7: iteration 59150/ 60336 | consumed samples: 15142400 | consumed tokens: 31011635200 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.757166E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.058 | TFLOPs: 26.14 | +7: iteration 59160/ 60336 | consumed samples: 15144960 | consumed tokens: 31016878080 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.754227E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.817 | TFLOPs: 26.16 | +7: iteration 59170/ 60336 | consumed samples: 15147520 | consumed tokens: 31022120960 | elapsed time per iteration (s): 0.16 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.748120E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.586 | TFLOPs: 25.35 | +7: iteration 59180/ 60336 | consumed samples: 15150080 | consumed tokens: 31027363840 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.753079E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.606 | TFLOPs: 26.15 | +7: iteration 59190/ 60336 | consumed samples: 15152640 | consumed tokens: 31032606720 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.766756E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.238 | TFLOPs: 26.12 | +7: iteration 59200/ 60336 | consumed samples: 15155200 | consumed tokens: 31037849600 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.763620E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.050 | TFLOPs: 26.00 | +7: iteration 59210/ 60336 | consumed samples: 15157760 | consumed tokens: 31043092480 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.756200E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.574 | TFLOPs: 26.12 | +7: iteration 59220/ 60336 | consumed samples: 15160320 | consumed tokens: 31048335360 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.763213E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.131 | TFLOPs: 26.16 | +7: iteration 59230/ 60336 | consumed samples: 15162880 | consumed tokens: 31053578240 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.747685E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.870 | TFLOPs: 26.16 | +7: iteration 59240/ 60336 | consumed samples: 15165440 | consumed tokens: 31058821120 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.754840E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.141 | TFLOPs: 26.21 | +7: iteration 59250/ 60336 | consumed samples: 15168000 | consumed tokens: 31064064000 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.756115E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.426 | TFLOPs: 26.26 | +7: iteration 59260/ 60336 | consumed samples: 15170560 | consumed tokens: 31069306880 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.762273E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.109 | TFLOPs: 26.29 | +7: iteration 59270/ 60336 | consumed samples: 15173120 | consumed tokens: 31074549760 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.760085E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.226 | TFLOPs: 26.27 | +7: iteration 59280/ 60336 | consumed samples: 15175680 | consumed tokens: 31079792640 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.761816E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.448 | TFLOPs: 26.26 | +7: iteration 59290/ 60336 | consumed samples: 15178240 | consumed tokens: 31085035520 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.757772E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.577 | TFLOPs: 26.29 | +7: iteration 59300/ 60336 | consumed samples: 15180800 | consumed tokens: 31090278400 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.756903E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.016 | TFLOPs: 26.22 | +7: iteration 59310/ 60336 | consumed samples: 15183360 | consumed tokens: 31095521280 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.762236E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.092 | TFLOPs: 26.24 | +7: iteration 59320/ 60336 | consumed samples: 15185920 | consumed tokens: 31100764160 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.758242E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.333 | TFLOPs: 26.26 | +7: iteration 59330/ 60336 | consumed samples: 15188480 | consumed tokens: 31106007040 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.757316E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.101 | TFLOPs: 26.29 | +7: iteration 59340/ 60336 | consumed samples: 15191040 | consumed tokens: 31111249920 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.747224E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.770 | TFLOPs: 25.75 | +7: iteration 59350/ 60336 | consumed samples: 15193600 | consumed tokens: 31116492800 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.742489E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.988 | TFLOPs: 26.24 | +7: iteration 59360/ 60336 | consumed samples: 15196160 | consumed tokens: 31121735680 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.763779E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.024 | TFLOPs: 26.28 | +7: iteration 59370/ 60336 | consumed samples: 15198720 | consumed tokens: 31126978560 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.752808E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.656 | TFLOPs: 25.64 | +7: iteration 59380/ 60336 | consumed samples: 15201280 | consumed tokens: 31132221440 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.768361E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.118 | TFLOPs: 26.22 | +7: iteration 59390/ 60336 | consumed samples: 15203840 | consumed tokens: 31137464320 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.766614E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.410 | TFLOPs: 26.27 | +7: iteration 59400/ 60336 | consumed samples: 15206400 | consumed tokens: 31142707200 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.751858E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.871 | TFLOPs: 26.23 | +7: iteration 59410/ 60336 | consumed samples: 15208960 | consumed tokens: 31147950080 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.761565E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.275 | TFLOPs: 26.21 | +7: iteration 59420/ 60336 | consumed samples: 15211520 | consumed tokens: 31153192960 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.743366E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.303 | TFLOPs: 26.19 | +7: iteration 59430/ 60336 | consumed samples: 15214080 | consumed tokens: 31158435840 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.751065E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.365 | TFLOPs: 26.12 | +7: iteration 59440/ 60336 | consumed samples: 15216640 | consumed tokens: 31163678720 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.759982E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.887 | TFLOPs: 26.13 | +7: iteration 59450/ 60336 | consumed samples: 15219200 | consumed tokens: 31168921600 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.741796E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.019 | TFLOPs: 26.13 | +7: iteration 59460/ 60336 | consumed samples: 15221760 | consumed tokens: 31174164480 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.747018E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.253 | TFLOPs: 26.10 | +7: iteration 59470/ 60336 | consumed samples: 15224320 | consumed tokens: 31179407360 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.751713E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.018 | TFLOPs: 26.13 | +7: iteration 59480/ 60336 | consumed samples: 15226880 | consumed tokens: 31184650240 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.766474E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.342 | TFLOPs: 26.13 | +7: iteration 59490/ 60336 | consumed samples: 15229440 | consumed tokens: 31189893120 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.748615E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.851 | TFLOPs: 26.12 | +7: iteration 59500/ 60336 | consumed samples: 15232000 | consumed tokens: 31195136000 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.764189E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.612 | TFLOPs: 26.11 | +7: iteration 59510/ 60336 | consumed samples: 15234560 | consumed tokens: 31200378880 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.749022E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.177 | TFLOPs: 26.10 | +7: iteration 59520/ 60336 | consumed samples: 15237120 | consumed tokens: 31205621760 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.763157E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.552 | TFLOPs: 26.12 | +7: iteration 59530/ 60336 | consumed samples: 15239680 | consumed tokens: 31210864640 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.753591E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.697 | TFLOPs: 26.06 | +7: iteration 59540/ 60336 | consumed samples: 15242240 | consumed tokens: 31216107520 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.757294E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.089 | TFLOPs: 26.05 | +7: iteration 59550/ 60336 | consumed samples: 15244800 | consumed tokens: 31221350400 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.752826E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.925 | TFLOPs: 26.05 | +7: iteration 59560/ 60336 | consumed samples: 15247360 | consumed tokens: 31226593280 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.760878E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.696 | TFLOPs: 26.04 | +7: iteration 59570/ 60336 | consumed samples: 15249920 | consumed tokens: 31231836160 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.757217E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.195 | TFLOPs: 26.04 | +7: iteration 59580/ 60336 | consumed samples: 15252480 | consumed tokens: 31237079040 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.759446E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.042 | TFLOPs: 26.06 | +7: iteration 59590/ 60336 | consumed samples: 15255040 | consumed tokens: 31242321920 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.753225E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.626 | TFLOPs: 26.07 | +7: iteration 59600/ 60336 | consumed samples: 15257600 | consumed tokens: 31247564800 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.751081E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.965 | TFLOPs: 26.20 | +7: iteration 59610/ 60336 | consumed samples: 15260160 | consumed tokens: 31252807680 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.766092E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.908 | TFLOPs: 26.22 | +7: iteration 59620/ 60336 | consumed samples: 15262720 | consumed tokens: 31258050560 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.751379E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.156 | TFLOPs: 26.22 | +7: iteration 59630/ 60336 | consumed samples: 15265280 | consumed tokens: 31263293440 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.753983E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.413 | TFLOPs: 26.20 | +7: iteration 59640/ 60336 | consumed samples: 15267840 | consumed tokens: 31268536320 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.757336E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.448 | TFLOPs: 26.21 | +7: iteration 59650/ 60336 | consumed samples: 15270400 | consumed tokens: 31273779200 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.735840E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.558 | TFLOPs: 26.18 | +7: iteration 59660/ 60336 | consumed samples: 15272960 | consumed tokens: 31279022080 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.746276E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.678 | TFLOPs: 26.17 | +7: iteration 59670/ 60336 | consumed samples: 15275520 | consumed tokens: 31284264960 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.753393E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.886 | TFLOPs: 26.16 | +7: iteration 59680/ 60336 | consumed samples: 15278080 | consumed tokens: 31289507840 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.764578E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.855 | TFLOPs: 26.14 | +7: iteration 59690/ 60336 | consumed samples: 15280640 | consumed tokens: 31294750720 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.756657E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.342 | TFLOPs: 26.13 | +7: iteration 59700/ 60336 | consumed samples: 15283200 | consumed tokens: 31299993600 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.765660E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.809 | TFLOPs: 26.14 | +7: iteration 59710/ 60336 | consumed samples: 15285760 | consumed tokens: 31305236480 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.762175E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.673 | TFLOPs: 26.12 | +7: iteration 59720/ 60336 | consumed samples: 15288320 | consumed tokens: 31310479360 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.755609E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.082 | TFLOPs: 26.14 | +7: iteration 59730/ 60336 | consumed samples: 15290880 | consumed tokens: 31315722240 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.761131E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.339 | TFLOPs: 26.13 | +7: iteration 59740/ 60336 | consumed samples: 15293440 | consumed tokens: 31320965120 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.754948E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.465 | TFLOPs: 26.13 | +7: iteration 59750/ 60336 | consumed samples: 15296000 | consumed tokens: 31326208000 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.759412E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.568 | TFLOPs: 26.15 | +7: iteration 59760/ 60336 | consumed samples: 15298560 | consumed tokens: 31331450880 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.755289E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.687 | TFLOPs: 26.15 | +7: iteration 59770/ 60336 | consumed samples: 15301120 | consumed tokens: 31336693760 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.758599E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.643 | TFLOPs: 26.17 | +7: iteration 59780/ 60336 | consumed samples: 15303680 | consumed tokens: 31341936640 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.751523E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.188 | TFLOPs: 26.21 | +7: iteration 59790/ 60336 | consumed samples: 15306240 | consumed tokens: 31347179520 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.756221E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.968 | TFLOPs: 26.16 | +7: iteration 59800/ 60336 | consumed samples: 15308800 | consumed tokens: 31352422400 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.751675E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.157 | TFLOPs: 26.15 | +7: iteration 59810/ 60336 | consumed samples: 15311360 | consumed tokens: 31357665280 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.757290E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.675 | TFLOPs: 26.18 | +7: iteration 59820/ 60336 | consumed samples: 15313920 | consumed tokens: 31362908160 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.749648E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.882 | TFLOPs: 26.17 | +7: iteration 59830/ 60336 | consumed samples: 15316480 | consumed tokens: 31368151040 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.755246E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.693 | TFLOPs: 26.20 | +7: iteration 59840/ 60336 | consumed samples: 15319040 | consumed tokens: 31373393920 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.762952E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.724 | TFLOPs: 26.15 | +7: iteration 59850/ 60336 | consumed samples: 15321600 | consumed tokens: 31378636800 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.762609E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.078 | TFLOPs: 26.19 | +7: iteration 59860/ 60336 | consumed samples: 15324160 | consumed tokens: 31383879680 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.753793E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.664 | TFLOPs: 26.20 | +7: iteration 59870/ 60336 | consumed samples: 15326720 | consumed tokens: 31389122560 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.744910E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.809 | TFLOPs: 25.83 | +7: iteration 59880/ 60336 | consumed samples: 15329280 | consumed tokens: 31394365440 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.762309E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.699 | TFLOPs: 26.19 | +7: iteration 59890/ 60336 | consumed samples: 15331840 | consumed tokens: 31399608320 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.753428E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.470 | TFLOPs: 26.23 | +7: iteration 59900/ 60336 | consumed samples: 15334400 | consumed tokens: 31404851200 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.759197E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.144 | TFLOPs: 26.21 | +7: iteration 59910/ 60336 | consumed samples: 15336960 | consumed tokens: 31410094080 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.750806E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.165 | TFLOPs: 26.22 | +7: iteration 59920/ 60336 | consumed samples: 15339520 | consumed tokens: 31415336960 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.752578E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.659 | TFLOPs: 25.02 | +7: iteration 59930/ 60336 | consumed samples: 15342080 | consumed tokens: 31420579840 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.760459E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.131 | TFLOPs: 26.21 | +7: iteration 59940/ 60336 | consumed samples: 15344640 | consumed tokens: 31425822720 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.744677E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.467 | TFLOPs: 26.21 | +7: iteration 59950/ 60336 | consumed samples: 15347200 | consumed tokens: 31431065600 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.751648E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.572 | TFLOPs: 26.23 | +7: iteration 59960/ 60336 | consumed samples: 15349760 | consumed tokens: 31436308480 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.753547E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.514 | TFLOPs: 26.21 | +7: iteration 59970/ 60336 | consumed samples: 15352320 | consumed tokens: 31441551360 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.756540E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.710 | TFLOPs: 26.20 | +7: iteration 59980/ 60336 | consumed samples: 15354880 | consumed tokens: 31446794240 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.747852E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.372 | TFLOPs: 26.21 | +7: iteration 59990/ 60336 | consumed samples: 15357440 | consumed tokens: 31452037120 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.752499E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.491 | TFLOPs: 26.21 | +0: [2023-03-17 02:54:19,622] [INFO] [logging.py:68:log_dist] [Rank 0] step=60000, skipped=0, lr=[2.0014058754265557e-05, 2.0014058754265557e-05, 2.0014058754265557e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 60000/ 60336 | consumed samples: 15360000 | consumed tokens: 31457280000 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.749803E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.472 | TFLOPs: 26.23 | +0: steps: 60000 loss: 3.7611 iter time (s): 0.153 samples/sec: 1670.195 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 60000 | lm loss value: 3.884921E+00 | lm loss PPL: 4.866310E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 60000 to checkpoints_44m32b100m +0: [2023-03-17 02:54:19,696] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step60000 is begin to save! +0: [2023-03-17 02:54:19,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:54:19,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:54:19,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:54:19,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:54:19,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:54:19,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:54:19,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:54:19,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:54:19,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:54:19,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:54:19,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:54:19,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:54:19,804] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:54:19,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:54:19,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:54:19,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:54:19,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:54:19,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:54:19,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:54:19,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:54:19,829] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step60000/mp_rank_00_model_states.pt +0: [2023-03-17 02:54:19,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:54:19,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:54:19,847] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:54:19,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:54:19,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:54:19,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:54:19,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:54:19,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: successfully saved checkpoint at iteration 60000 to checkpoints_44m32b100m +7: time (ms) | save-checkpoint: 177.93 +7: iteration 60010/ 60336 | consumed samples: 15362560 | consumed tokens: 31462522880 | elapsed time per iteration (s): 0.18 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.752154E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.048 | TFLOPs: 22.58 | +7: iteration 60020/ 60336 | consumed samples: 15365120 | consumed tokens: 31467765760 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.759135E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.898 | TFLOPs: 26.22 | +7: iteration 60030/ 60336 | consumed samples: 15367680 | consumed tokens: 31473008640 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.765541E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.486 | TFLOPs: 26.21 | +7: iteration 60040/ 60336 | consumed samples: 15370240 | consumed tokens: 31478251520 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.762001E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.192 | TFLOPs: 26.21 | +7: iteration 60050/ 60336 | consumed samples: 15372800 | consumed tokens: 31483494400 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.743983E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.239 | TFLOPs: 26.21 | +7: iteration 60060/ 60336 | consumed samples: 15375360 | consumed tokens: 31488737280 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.765790E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.599 | TFLOPs: 26.20 | +7: iteration 60070/ 60336 | consumed samples: 15377920 | consumed tokens: 31493980160 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.754570E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.265 | TFLOPs: 26.19 | +7: iteration 60080/ 60336 | consumed samples: 15380480 | consumed tokens: 31499223040 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.750166E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.457 | TFLOPs: 26.21 | +7: iteration 60090/ 60336 | consumed samples: 15383040 | consumed tokens: 31504465920 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.745874E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.998 | TFLOPs: 26.22 | +7: iteration 60100/ 60336 | consumed samples: 15385600 | consumed tokens: 31509708800 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.756356E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.911 | TFLOPs: 26.22 | +7: iteration 60110/ 60336 | consumed samples: 15388160 | consumed tokens: 31514951680 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.760110E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.863 | TFLOPs: 26.22 | +7: iteration 60120/ 60336 | consumed samples: 15390720 | consumed tokens: 31520194560 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.761229E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.493 | TFLOPs: 26.23 | +7: iteration 60130/ 60336 | consumed samples: 15393280 | consumed tokens: 31525437440 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.746731E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.123 | TFLOPs: 26.21 | +7: iteration 60140/ 60336 | consumed samples: 15395840 | consumed tokens: 31530680320 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.749392E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.742 | TFLOPs: 26.22 | +7: iteration 60150/ 60336 | consumed samples: 15398400 | consumed tokens: 31535923200 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.747928E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.402 | TFLOPs: 26.21 | +7: iteration 60160/ 60336 | consumed samples: 15400960 | consumed tokens: 31541166080 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.751093E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.617 | TFLOPs: 26.04 | +7: iteration 60170/ 60336 | consumed samples: 15403520 | consumed tokens: 31546408960 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.755727E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.839 | TFLOPs: 26.01 | +7: iteration 60180/ 60336 | consumed samples: 15406080 | consumed tokens: 31551651840 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.750329E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.310 | TFLOPs: 26.04 | +7: iteration 60190/ 60336 | consumed samples: 15408640 | consumed tokens: 31556894720 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.748953E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.491 | TFLOPs: 26.06 | +7: iteration 60200/ 60336 | consumed samples: 15411200 | consumed tokens: 31562137600 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.769680E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.797 | TFLOPs: 26.00 | +7: iteration 60210/ 60336 | consumed samples: 15413760 | consumed tokens: 31567380480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.748784E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.575 | TFLOPs: 25.73 | +7: iteration 60220/ 60336 | consumed samples: 15416320 | consumed tokens: 31572623360 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.735170E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.559 | TFLOPs: 26.06 | +7: iteration 60230/ 60336 | consumed samples: 15418880 | consumed tokens: 31577866240 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.755297E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.106 | TFLOPs: 26.05 | +7: iteration 60240/ 60336 | consumed samples: 15421440 | consumed tokens: 31583109120 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.758381E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.741 | TFLOPs: 26.01 | +7: iteration 60250/ 60336 | consumed samples: 15424000 | consumed tokens: 31588352000 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.743202E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.519 | TFLOPs: 26.03 | +7: iteration 60260/ 60336 | consumed samples: 15426560 | consumed tokens: 31593594880 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.745200E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.335 | TFLOPs: 26.05 | +7: iteration 60270/ 60336 | consumed samples: 15429120 | consumed tokens: 31598837760 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.743526E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.522 | TFLOPs: 26.07 | +7: iteration 60280/ 60336 | consumed samples: 15431680 | consumed tokens: 31604080640 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.748799E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.567 | TFLOPs: 26.06 | +7: iteration 60290/ 60336 | consumed samples: 15434240 | consumed tokens: 31609323520 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.763738E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.012 | TFLOPs: 26.06 | +7: iteration 60300/ 60336 | consumed samples: 15436800 | consumed tokens: 31614566400 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.758266E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.159 | TFLOPs: 26.07 | +7: iteration 60310/ 60336 | consumed samples: 15439360 | consumed tokens: 31619809280 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.755417E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.155 | TFLOPs: 26.07 | +7: iteration 60320/ 60336 | consumed samples: 15441920 | consumed tokens: 31625052160 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.748151E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.922 | TFLOPs: 26.05 | +7: iteration 60330/ 60336 | consumed samples: 15444480 | consumed tokens: 31630295040 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.736851E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.107 | TFLOPs: 26.05 | +0: [after training is done] datetime: 2023-03-17 02:55:11 +0: saving checkpoint at iteration 60336 to checkpoints_44m32b100m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.882590E+00 | lm loss PPL: 4.854978E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-17 02:55:11,585] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step60336 is begin to save! +0: [2023-03-17 02:55:11,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:55:11,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:55:11,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:55:11,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:55:11,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:55:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:55:11,666] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:55:11,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:55:11,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:55:11,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:55:11,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:55:11,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:55:11,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:55:11,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:55:11,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:55:11,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:55:11,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:55:11,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:55:11,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:55:11,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:55:11,715] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt +0: [2023-03-17 02:55:11,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 02:55:11,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:55:11,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:55:11,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +3: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:55:11,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +3: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +3: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:55:11,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +3: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +1: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +4: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +7: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +6: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +2: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +5: [2023-03-17 02:55:11,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60336 is ready now! +0: successfully saved checkpoint at iteration 60336 to checkpoints_44m32b100m +END 3327118: Fri 17 Mar 2023 02:55:36 AM EET diff --git a/44m32b100m/3328578.err b/44m32b100m/3328578.err new file mode 100644 index 0000000000000000000000000000000000000000..e22de414e7d3b53eb0bb039157662c3a5dc83a4b --- /dev/null +++ b/44m32b100m/3328578.err @@ -0,0 +1,306 @@ +0: 2023-03-17 09:45:28.498658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498704: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: 2023-03-17 09:45:28.498678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498686: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498686: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498675: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498682: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498709: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498711: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:28.498674: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498705: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498716: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:28.498719: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:30.139276: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-17 09:45:30.139307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:30.139284: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139294: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139288: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139292: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139325: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 09:45:30.139293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:30.139319: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:30.139957: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139971: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139973: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139966: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139965: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139963: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139965: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139969: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139972: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:30.139974: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139979: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139980: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139981: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139982: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139985: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:30.139987: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:35.130202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130217: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.130209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131189: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131196: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131197: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.131201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131742: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132856: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132857: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132858: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132868: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132868: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132869: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132872: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132871: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132873: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132896: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:35.132908: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:45:35.132913: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131761: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131761: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131761: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131765: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131767: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131765: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131830: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:45:35.131832: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:35.131847: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +1: +1: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +1: Building extension module utils... +1: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +0: +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/44m32b100m/3328578.out b/44m32b100m/3328578.out new file mode 100644 index 0000000000000000000000000000000000000000..48eeab1c0066dcfeb1e1d3bc39c2fdb65cbd1b9d --- /dev/null +++ b/44m32b100m/3328578.out @@ -0,0 +1,1367 @@ +Model parameters: d_model 512 ffw_size 2048 kv_size 64 n_heads 8 n_layers 8 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 8 --hidden-size 512 --num-attention-heads 8 --kv-channels 64 --ffn-hidden-size 2048 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 64 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-44m32b100mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_44m32b100mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_44m32b100m --load checkpoints_44m32b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3328578.json --zero-stage 0 +START 3328578: Fri 17 Mar 2023 09:45:06 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 42.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 37.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 42.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +1: Launching on nid007495 (1/2), master nid007494 port 9999, GPUs 8, CUDA: True +0: Launching on nid007494 (0/2), master nid007494 port 9999, GPUs 8, CUDA: True +0: using world size: 16, data-parallel-size: 16, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 16 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3328578.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2048 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 64 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 512 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-44m32b100mval +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_44m32b100m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 8 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 8 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_44m32b100m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_44m32b100mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 16 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +1: > setting tensorboard ... +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 09:45:53,835] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.100 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 25.913 seconds +0: time to initialize megatron (seconds): 76.506 +0: [after megatron is initialized] datetime: 2023-03-17 09:46:20 +0: building GPT model ... +0: [2023-03-17 09:46:20,640] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 09:46:20,640] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 09:46:20,641] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.67 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15} +0: [2023-03-17 09:46:21,118] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=15 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: undo +0: 12: MixedFusedLayerNorm +0: 13: EmbeddingPipe +0: 14: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 09:46:21,383] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 09:46:21,384] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:21,384] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.69 GB, percent = 6.1% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 09:46:21,385] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 09:46:28,430] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 09:46:28,431] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 09:46:28,431] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 09:46:28,433] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 09:46:28,433] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 09:46:28,549] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 09:46:28,550] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:28,550] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +1: ninja: no work to do. +1: Time to load utils op: 0.15442585945129395 seconds +1: Time to load utils op: 0.20387649536132812 seconds +1: Time to load utils op: 0.20440077781677246 seconds +1: Time to load utils op: 0.2043149471282959 seconds +1: Time to load utils op: 0.20441269874572754 seconds +1: Time to load utils op: 0.20457243919372559 seconds +1: Time to load utils op: 0.20498991012573242 seconds +1: Time to load utils op: 0.20502853393554688 seconds +0: Time to load utils op: 0.21097826957702637 secondsTime to load utils op: 0.21174144744873047 secondsTime to load utils op: 0.2113947868347168 seconds +0: +0: +0: Time to load utils op: 0.21172785758972168 seconds +0: Time to load utils op: 0.20850682258605957 seconds +0: Time to load utils op: 0.21164393424987793 seconds +0: Time to load utils op: 0.21158456802368164 seconds +0: Time to load utils op: 0.10217475891113281 seconds +1: Time to load utils op: 0.0005857944488525391 seconds +1: Time to load utils op: 0.0008189678192138672 secondsTime to load utils op: 0.00047898292541503906 seconds +1: +1: Time to load utils op: 0.0007109642028808594 seconds +1: Time to load utils op: 0.0004718303680419922 seconds +1: Time to load utils op: 0.00034117698669433594 seconds +1: Time to load utils op: 0.00033211708068847656 seconds +1: Time to load utils op: 0.0003204345703125 seconds +0: Time to load utils op: 0.0005950927734375 secondsTime to load utils op: 0.00061798095703125 secondsTime to load utils op: 0.0005733966827392578 seconds +0: +0: +0: Time to load utils op: 0.0005528926849365234 seconds +0: Time to load utils op: 0.0005950927734375 seconds +0: Time to load utils op: 0.0006499290466308594 seconds +0: Time to load utils op: 0.0006430149078369141 seconds +0: [2023-03-17 09:46:28,761] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 09:46:28,762] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:28,762] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:28,874] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 09:46:28,874] [INFO] [utils.py:828:see_memory_usage] MA 0.25 GB Max_MA 0.25 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 09:46:28,874] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:28,977] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 09:46:28,978] [INFO] [utils.py:828:see_memory_usage] MA 0.25 GB Max_MA 0.25 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 09:46:28,978] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,082] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 09:46:29,083] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,083] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,185] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 09:46:29,186] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,186] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,290] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 09:46:29,290] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,291] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,393] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 09:46:29,394] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,394] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,502] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 09:46:29,503] [INFO] [utils.py:828:see_memory_usage] MA 0.33 GB Max_MA 0.33 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,503] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,606] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 09:46:29,606] [INFO] [utils.py:828:see_memory_usage] MA 0.33 GB Max_MA 0.33 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:29,606] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-17 09:46:29,606] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 09:46:29,607] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 09:46:29,607] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 09:46:29,607] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 09:46:29,607] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 09:46:29,607] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 09:46:29,607] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 09:46:29,607] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 09:46:29,607] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 09:46:29,608] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] train_batch_size ............. 64 +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] world_size ................... 16 +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 09:46:29,609] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 09:46:29,609] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 64, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004241466522216797 seconds +0: [2023-03-17 09:46:29,610] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-17 09:46:29,620] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=52024320 (52.024M) TOTAL_PARAMS=52024320 (52.024M) UNIQUE_PARAMS=52024320 (52.024M) +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:29,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:29,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:29,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:29,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:29,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:29,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:29,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:29,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:29,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:29,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:29,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:29,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:29,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:29,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:30,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:30,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:30,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:30,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:30,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:30,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:30,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:30,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:30,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: > overriding learning rate value to 0.0002 +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:30,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:30,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:30,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,308] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +1: [2023-03-17 09:46:30,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,312] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +0: [2023-03-17 09:46:30,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,313] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +1: [2023-03-17 09:46:30,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,315] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +1: [2023-03-17 09:46:30,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,321] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +1: [2023-03-17 09:46:30,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,322] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +0: [2023-03-17 09:46:30,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,323] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +1: [2023-03-17 09:46:30,323] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +0: [2023-03-17 09:46:30,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,324] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +0: [2023-03-17 09:46:30,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +0: [2023-03-17 09:46:30,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +1: [2023-03-17 09:46:30,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,326] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +1: [2023-03-17 09:46:30,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,328] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +1: [2023-03-17 09:46:30,333] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +0: [2023-03-17 09:46:30,333] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +1: [2023-03-17 09:46:30,337] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +0: [2023-03-17 09:46:30,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,343] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +0: [2023-03-17 09:46:30,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:30,345] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +0: [2023-03-17 09:46:30,372] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +1: [2023-03-17 09:46:30,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:30,397] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +1: [2023-03-17 09:46:30,408] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +0: [2023-03-17 09:46:30,529] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +0: [2023-03-17 09:46:30,537] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +1: [2023-03-17 09:46:30,540] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +0: [2023-03-17 09:46:30,542] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +0: [2023-03-17 09:46:30,560] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +0: [2023-03-17 09:46:30,572] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +0: [2023-03-17 09:46:30,582] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +1: [2023-03-17 09:46:30,612] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +1: [2023-03-17 09:46:30,614] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +1: [2023-03-17 09:46:30,748] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +1: [2023-03-17 09:46:30,752] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +0: successfully loaded checkpoint from checkpoints_44m32b100m at iteration 0 +1: time (ms) | load-checkpoint: 1132.75 +0: estimated model parameters: 0.05202432 +0: estimated model parameters without embeddings: 0.025220096 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 09:46:31 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 6400 +0: test: 6400 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.007631 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.010 seconds +0: total number of samples: 48805 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.036716 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.009 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 09:46:43 +0: done with setup ... +0: training ... +1: time (ms) | model-and-optimizer-setup: 10577.99 | train/valid/test-data-iterators-setup: 11047.64 +0: [after training is done] datetime: 2023-03-17 09:46:43 +1: ----------------------------------------------------------------------------------------------------------------- +1: validation loss at the end of training for val data | lm loss value: 3.892240E+00 | lm loss PPL: 4.902058E+01 | +1: ----------------------------------------------------------------------------------------------------------------- +END 3328578: Fri 17 Mar 2023 09:47:01 AM EET diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdae5313979bd55b90b3e12b546b8f8104ed5409 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d46f1e0bccd1f0a5eff43e22ad596d948ee8164d377c00edbb4049fea5df0b3 +size 9758999 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..266d84da37de9a15fad17ed39764b66df915cf1e --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:644afea8282fe61b791f95744759f4353f9e2576b71c2b4f344d851cbf04fae3 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..255f675b0e50fb58df1569a707521796a2eaea3b --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261df812165ca2cb815e195615d1eb8eed557a0be4ebdf03c6328c1665531e69 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e72c32f2a049fa811ed7de6f854c1b70c21adfed --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61047a0bfe2a9f98a3d4d0fa6e414beafd3d76055659c85d1ae1adf6dcf9241b +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7499ab15c0c35ffa2f47d4454d438e6d3b6f9c81 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53a8067bd4dcdf07c3d6a80a5281a16d4c7585a459f56f96a0665e99677b10f5 +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f14d0f6fb1d80357237aff9e60bbf6095bb9005c --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2932eb5a2c39c57e0acd6d6140ead410c59cf1efe958aae1a261290b81cd3d5a +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e7f9d835334fd7d497774b0d2b0b0a70207974d --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a17698afc484f5cdbaa5b79494bf272a913cb461e1d676f65aa7a9cb464c4f5 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bc48cd5e8a7f438b7e72c118902cf15ae14a96e --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56478379deb8a67197da15f5d6b0ae9b6baa986b38d2a19128868b29d5981c6e +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14836cd84fcf261d4fdd4923fbae358db774a4d9 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b789d9d2f05d891d4ab7f45201e95aad941405ffeab4f697799a6d82834817b3 +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97e6c5ac7f331dc64393e9775ef8a56e8f3590bd --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24347ea4503e7a34199db54a42caef4740a24eb87e31e22e912717261ec12d54 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..925a03a4460d1f4e23bca956d04c783dcba7b0d6 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2898ea648df43426dff615e42284ed3032b4beb787f65ca932fa769fc49a6a93 +size 9759202 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16bcd6cccf9ce9826da3436a4dc4290b4b7ec641 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898deb657a70bfefbdaa56ebfd5a92a3c773721052a890859510906028871386 +size 9759063 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd8a198f203ad77a12f57a17f4eb13b11fc7f72 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad9f165577513246abcc21190c2f717b0a7130a2cf1553657d2d1b127f131301 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43f23ce4a012e2442533ca85054f30fcc120ce38 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db9b76a743c86f3cafdc303cfe56c29a66d2ec02d5e67326530ebeb6a17c6d0a +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cc3e7dcddd9681f81731e718447807f3f71f0d8 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4384422a968698db917ab028ba69a6d76ac777a000ba153ceab7045f59d649 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42b8c5cb1c83bf31363a461b0355489ab28f16a4 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d4a796b95d8a11051fbbd9d390d5ef1093ef204f64871e2ff583eb52676eea +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e04d0b27dd2c01f60e5eb9d16b71330c26e3255d --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a45ce3ee8896d53f72ecadac797eca6ea6e2d7640cfdb4f05c499d5a274275a3 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3de8d229c92f401cbb011741812ecec51db4a666 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c056695fb3f69a134e15e40eb20acc8d74186d78540804f9f7565f450e97e331 +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..122ee06b80643074604a14b09a444f5a50cb4b3b --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c16fb835a1fba8d8d081a08c6c1ec3f10557affe9ce7cfa801d070d6764f8de +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..900252028fd42602415dd436dc9b8db572bd58d1 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d70ab073c7b3cece09d76af35fd1108c830a1403c5e5e85e2725808f782cba25 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f3c5a83e0a566e27266f8deca752a72ebcd85b7 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c298a1bee762f02e1c81e83aaa5233fe213854d58d075192c8c0c6accee52cf1 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..459f9a577c9dc9d08c8d25c6506db4021744009a --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c7970a08ea5783001cdc22554407bd28b5894fdc8b32d4499cce17ed6d60416 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba34d395ee8e0617b4788b950f0b786666f79a51 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea42477a7a50aaf7fae929d714a1d0adf2d523573054fd00b74ab24fba75479 +size 9758999 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd9e2a223b057ff3c1ff08fda78a498c3b8a1351 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a984ab6b59168f202aa061bc8363f60e40703669228770870882c3d19eaced1 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..031cf98a1108cb053377962a4f95f8ecca939ed2 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa0213c9985c5d33667a25494af2e372070a4d70f5622a86a0c1723fa0d62a36 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2d58421e928b72ac06fb5b9e32a93756a8adae3 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2495c422ce2f3dc0232ab6da9d67fbacb863160dc8816cc1fd354e07200fc983 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5787eb681af4330842bdba0901b7e809d6de5adc --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5ae04137f5dd0d3c2696117965af1842fed0c33a3d5b1ffa39455e154f06f9 +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6db81307676412919cb544e9bb7003086eda801f --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38401e71f704392bcf4eadb022c1ee153c7f150895eb5e793904cea4ffd5a1ef +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8548db20711ae3b2432f698fba88ecd9c03d26ae --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c34de68c4e587b3e48702d69283bdb05a2924672951d56dc611d61fcae29e6 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8544af1dd6e9e8b5e0b944e2f2f78f40f7e875d6 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f65e6f204d60631f41c9d3da6dd7ea38e46b797b9c75377c30ab7b09a1f199a +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee2e859a2ff63029013656267ce7261ccc9433e9 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05e70b0e46458579a2066eea15a28642941670946d87a5239ac57e18d455fefe +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f489d063ad7d2008b57570e5f0d68c160516824 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507505197efa15b4cd312f72116a45ea588d6dbd985691644bb074db7ca2d895 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..467390567a387cc7065a831a5505491e02afa931 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c3ac88d8501a1d021363d3bc1eff31c89c6078c8c5d90fd965e4b7af837ac54 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b99570f751234717eb2eaa49cf3a9d04a5e3843c --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001e5f96c1f37f35604d06a21a04972fed8bad1577c7d02ca49beb7b1cd0ff5e +size 9759127 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a5a56eb3c114773ba33d301f424dc197caa8a4 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc7d0cbf93ddd9369a077ea301279d0b96cc41f4a401995db410e08185d9a9ed +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..886801eb288fc53e53085782e3a2e1ed3c315d31 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36c05c775011837c5ac0fc7dfd9e110504b9a8bc299cf4a5a8951956ab752428 +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc65661bbdf30aa14439f425bcebd9824891fdf0 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12852add30de87b8af8a2d8a87987405ca76b6f427a50648d74d35745077498a +size 9759202 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0705c4b8653eb930159dfe86ebf3e92f647da943 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed439364ecf0d58e2abefb7d278fae1d128811c1081a354290b2fc67a9889b81 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..919c44754d2874cefc0b154e4cdc1cf066c206d1 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdad477ffdac77f6ddfad5d330ff4b20f52be00815acd4b37df1f355aeedeeb3 +size 9759202 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..186d97a57d3f323650787291da26704fb030e7dd --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcc80497ebd494f2eee488b0464318b6c907708262938b12b814cbe1b56b349c +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f331f44b537d39dedb928540dfcc5ddcffd241f6 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab16266e42fed0d0e5c17ef626e422d056435dd43ef422fa7d3bc726b3c4845 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc04a163d7a09b6ab03cd5220606d52dee84a775 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4030dd554fede403a753d4ccb395950b967f169c7b20873e68d03ea63de3eb61 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..443a50e81e52dcddd9a49b9e541c97c6155256ff --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9807225b0ae7b8acb1b415050022e5836205440092f6f1bba2ee1f67e674c7b +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ee93a6e859a1151e9e99e1a3a3b0ff9d41b80aa --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22bc8e1bae83f7bd59281c7254a70a25f25825bce1b5022366b05430a21cf98a +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fa35cfd796847efebc2a271752781faf05f6dd4 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ecba17b62427fcc0338fea94b0c5d495e2327b9c2340ddb41b8a96a964409df +size 9759191 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b05094250626eb6259e3a54572cb42e0821f050c --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbf65da1e2afee91b41eaac0226c5c67bbf2ea162fa59b64c008e85486259ae +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9866b6b105b4ad03def7505a960bf10fd098ad16 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5e748dab6e2348aed10542622ba86755790ed6cd6804185df8290822042951 +size 9759266 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf6d4b9a124c84fa4ac225982aad891af361fed3 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2488be83515f4100f3332f977c9693ff53f0eb695fcafbc75efaf18f818504b1 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..400ec793eda8603373a9c9f3562688bdf9f864cb --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f099576cef94d625760b3320e6b35f86fd2feeaffecf99adc7cddca12e75ed2e +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..618c3eaf353400578f898c83a5d5a830bd0144a9 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9196396813fb879ab88ee915bd5ee97cbffc16996a850696a9d338bcdd89b42 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30248b841596bad8e92afc2d28f0b456c492e7c2 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4474ea759ec291a78c8e567f702242215f5d8243f2186b9e9f312ad1806108d +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdbeaae8fea3b64c9fdf8a6f0c9d52b7c02da5eb --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d74f9ba17438d15838b1f3d269b6ae561f150da3d514e5c7934159b5196294e +size 9759202 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fae8a7c7f06e99bf04c7318e9c57ceb46793a95 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eb815ee7f40a5f914a3e82530f09e5bbc636134aceb8f48390c9dfd990f9392 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38a505710935a14d695bd69ef7ed9beec56fd158 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20b23a60f939829b8419f5022d92a7aeb76b0d677cc8ab037c2c8726aca42d3 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c9305bf399ebfcd776a6f19a2d5bd2040ba5d19 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b58742502dec030abb4208c6b61b5ed962c9fa6e9e6b2704d30d6f9919c89e +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b79e72854ff54ed45a0dd7f3b0ef667af6be4469 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ba2377504484e8f28ece16c03d1e4437f340c4d495fb455ad4d21b61ff2817 +size 9758999 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21280130f7abd158e1912f5bada8fbead59749c5 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c2e14c94788d344113363e06c5028e89d48584acfed6b6b662dc6ef461828cb +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..942d307235a10291dc1809a5d7744ef3602e870a --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ab6a5c0e63deb584b2e44fcb06412699c7ba82ae317bac7a5744fa0dd5a0f66 +size 9759074 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1d206ddb7c344af6db252db461b6e648b3f4807 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6684b9cba9653004760fe960833bfee42b2764abdfeae6948031f55a7a1cba8 +size 9759138 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f388923b171f2be5fd4ead1abacacbef0d87a856 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c41cbb1ff6a0fd55d0ce620210daf3b66ef958cabdf0bdcc33f1a6376483c7a +size 9759010 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f34eea6ab0c9651be21a5132dbe4016eabaa589b --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219fa2056af7f5c87f4367b82f57374ca2b3cd74923f3f820ebf115d2bd4fd09 +size 9758999 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52dea5416a422d0c2e66401c2bab6ed48c7e228e --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3177e15e199a2968241e72bc9579eb4be13e82fd91bc4b780e1fca6af5d314d3 +size 9759127 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34fe749b698ed4ddfc9ee974b9b3bd4465885e55 --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eed9d7d180b5147c766d228e8199a73ee9a55be55f5f1b94e2711e77aeb7663f +size 9759127 diff --git a/44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5da2a20968bf1b771c7e9eac8832f0d5941819dc --- /dev/null +++ b/44m32b100m/global_step60336/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a3390782dd66aba2b522995981e40cb0f4021d7f1fd63d66fda190c5b343f0 +size 9759127 diff --git a/44m32b100m/global_step60336/layer_01-model_00-model_states.pt b/44m32b100m/global_step60336/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00c0d533f4fad5236becac40226bb7b0055f4543 --- /dev/null +++ b/44m32b100m/global_step60336/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe941801582c2ad215b01497ebc5a5b204fcdd22e066067d1ecfb3b09cb0ee7 +size 53609731 diff --git a/44m32b100m/global_step60336/layer_03-model_00-model_states.pt b/44m32b100m/global_step60336/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cac3b4aa093908b5e747fdc6f669b2c1dc31c78 --- /dev/null +++ b/44m32b100m/global_step60336/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce7a00ca31d97c65dfac9328fca23996faf6fa25e18e49a21842789377bb7099 +size 6309123 diff --git a/44m32b100m/global_step60336/layer_04-model_00-model_states.pt b/44m32b100m/global_step60336/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59b771a4c61d28a3d2649f42abd134427dae0b43 --- /dev/null +++ b/44m32b100m/global_step60336/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c05d2c2e1f47cb4d7790d13d8de1e773d44c02c1ed138ac84fbe2f4415005b +size 6309123 diff --git a/44m32b100m/global_step60336/layer_05-model_00-model_states.pt b/44m32b100m/global_step60336/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fad2c01efad034277dae5f1d14b2767ff770fc70 --- /dev/null +++ b/44m32b100m/global_step60336/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86879db582d83182f16bbab5f42ac69af92d3713de64c67a5974c4880385fd84 +size 6309123 diff --git a/44m32b100m/global_step60336/layer_06-model_00-model_states.pt b/44m32b100m/global_step60336/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c193b799f2367cf8b6ea46224b4b1e230d839c5 --- /dev/null +++ b/44m32b100m/global_step60336/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0d6df23f2977531e48d94d4b3af974a9d8a639953b7bdad3bda9bb7b73d5fcf +size 6309123 diff --git a/44m32b100m/global_step60336/layer_07-model_00-model_states.pt b/44m32b100m/global_step60336/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e59f619111d61e50b83314612bbcd2f608ec8b4a --- /dev/null +++ b/44m32b100m/global_step60336/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b9e2e1b2a7983d9a24e83883f5d4f14a0866bdc1e57a6acdb57e0ffbaceb09 +size 6309123 diff --git a/44m32b100m/global_step60336/layer_08-model_00-model_states.pt b/44m32b100m/global_step60336/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5166429d00135742e5486f79467ff93d5ca824fb --- /dev/null +++ b/44m32b100m/global_step60336/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d35b8c1f2f1cac9abcfec1c2dae5df1d0809ce02ac2105732e9ad3f279b41ac +size 6309123 diff --git a/44m32b100m/global_step60336/layer_09-model_00-model_states.pt b/44m32b100m/global_step60336/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ffc13ef535a9d97a95bde9a7483b490147f636a --- /dev/null +++ b/44m32b100m/global_step60336/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba989d68776792234ebb7fb0d4ecdad055cc37dca027a315371227a65c25046 +size 6309123 diff --git a/44m32b100m/global_step60336/layer_10-model_00-model_states.pt b/44m32b100m/global_step60336/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39b7063491d8df269f8ddef38a62d4d387a35195 --- /dev/null +++ b/44m32b100m/global_step60336/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51c8fc43879805da307da6311616cd2aadd519d92d4471aa4f150a9ad439fefb +size 6309123 diff --git a/44m32b100m/global_step60336/layer_12-model_00-model_states.pt b/44m32b100m/global_step60336/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dea9781911527b109ca26e44ced65e896086b52 --- /dev/null +++ b/44m32b100m/global_step60336/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3316e7527011a8595824d42c0e056c69561b7684e690ae096c27c79351cdb513 +size 3267 diff --git a/44m32b100m/global_step60336/mp_rank_00_model_states.pt b/44m32b100m/global_step60336/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..45a3a1c4e7d28aa761047ce93e7bfe277008b58b --- /dev/null +++ b/44m32b100m/global_step60336/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2962bdf63c5833142e942435327790fe13ec552cfd1c625328f7d0c8acdb3b +size 30131 diff --git a/44m32b100m/sbatch_44m32b100m.sh b/44m32b100m/sbatch_44m32b100m.sh new file mode 100644 index 0000000000000000000000000000000000000000..a6751b03f9cf167491a0077a4fb9e0b4a9de7aec --- /dev/null +++ b/44m32b100m/sbatch_44m32b100m.sh @@ -0,0 +1,175 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=44m32b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_44M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +# TRAIN_SAMPLES=9_703_701 +# Tokens: 90964260000 +# -> Samples: 44416143 +# TRAIN_SAMPLES=44_416_143 +# TRAIN_SAMPLES=9_703_701 +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=15_446_035 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 154_460 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/44m32b100m/sbatch_44m32b100mval.sh b/44m32b100m/sbatch_44m32b100mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..87f518efcdbd09661618d3653a7ce56e698cec40 --- /dev/null +++ b/44m32b100m/sbatch_44m32b100mval.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p small-g +#SBATCH -t 12:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=44m32b100mval +VARIANT_CKPT=44m32b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_44M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/44m32b100m/tensorboard_44m32b100m/events.out.tfevents.1679005068.nid005299.50953.0 b/44m32b100m/tensorboard_44m32b100m/events.out.tfevents.1679005068.nid005299.50953.0 new file mode 100644 index 0000000000000000000000000000000000000000..752f32b15f62dee4cc0dbebd7a7116451b519833 --- /dev/null +++ b/44m32b100m/tensorboard_44m32b100m/events.out.tfevents.1679005068.nid005299.50953.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e94101aeef5cb68f2bfe19ae50625cfbc64b5f5eabe97ff5e4abc632da6958 +size 107898548 diff --git a/44m32b100m/tensorboard_44m32b100mval/events.out.tfevents.1679039153.nid007495.86332.0 b/44m32b100m/tensorboard_44m32b100mval/events.out.tfevents.1679039153.nid007495.86332.0 new file mode 100644 index 0000000000000000000000000000000000000000..a9fde417129f0349545c2b499b8b749d89b3955a --- /dev/null +++ b/44m32b100m/tensorboard_44m32b100mval/events.out.tfevents.1679039153.nid007495.86332.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89d027884f6a909eedd76d06d60a13dc650988425c702273cc30d575c18eba0 +size 980 diff --git a/44m91b100m/3327057.err b/44m91b100m/3327057.err new file mode 100644 index 0000000000000000000000000000000000000000..2a3f9b43599a27fec614fc331387895d176ec06b --- /dev/null +++ b/44m91b100m/3327057.err @@ -0,0 +1,1123 @@ +7: 2023-03-17 00:11:58.267527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267518: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267543: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267627: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:11:58.267696: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268369: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268410: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268430: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: 2023-03-17 00:11:58.268440: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268462: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268466: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-17 00:11:58.268284: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268249: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268434: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268454: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268285: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268278: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268294: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: 2023-03-17 00:11:58.268089: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268108: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268110: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-17 00:11:58.268420: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268472: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268484: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268496: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268502: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268347: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268311: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268127: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268145: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:11:58.268503: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268493: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268306: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268308: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:11:58.268322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: 2023-03-17 00:11:58.268309: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:11:58.268327: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-17 00:11:58.268471: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268538: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:11:58.268538: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268382: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268483: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268493: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:11:58.268438: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268512: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:11:58.268572: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269330: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269340: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269341: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269342: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:11:58.269377: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:12:08.121600: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.121617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-17 00:12:08.121970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.122147: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.121629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-17 00:12:08.121997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.122160: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.121655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-17 00:12:08.121998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.121648: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-17 00:12:08.122017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.122174: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.121987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-17 00:12:08.121652: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:08.122187: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-17 00:12:08.121662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:08.122013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-17 00:12:08.121658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:08.122025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-17 00:12:08.122200: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:12:08.122204: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:12:08.122212: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:12:08.122217: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:08.122566: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122570: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122576: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122577: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122581: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122581: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122587: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:12:08.122585: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129567: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129856: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129567: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129582: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129879: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129882: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129568: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129895: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:08.129901: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129903: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129913: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:08.129923: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.138927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.138949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.139237: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.138944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.139247: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.138945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.138966: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.138974: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.138984: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.139264: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.139272: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.138973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:08.139279: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.139291: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.139296: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:12:08.139308: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139415: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139432: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139668: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139450: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139679: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139457: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:08.139695: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139697: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139704: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139716: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:12:08.139719: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.139987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140017: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140045: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140063: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:08.140047: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:08.140067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140644: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140673: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.140667: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140255: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:08.140690: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.140694: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.140698: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140518: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.140705: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:12:08.140709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140266: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140534: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:08.140684: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140667: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140278: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140287: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140544: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140689: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140691: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140698: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-17 00:12:08.140701: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140712: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:12:08.140712: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:08.140559: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140560: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140570: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140574: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:12:08.140576: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:12:34.061670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062252: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062257: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062265: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.062331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070579: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.070694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 00:12:34.070601: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-17 00:12:34.070603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070614: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.070811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 00:12:34.070720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.070779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 00:12:34.070738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-17 00:12:34.070752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.070751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.070762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 00:12:34.070769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070788: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 00:12:34.070874: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 00:12:34.070850: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-17 00:12:34.070803: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070788: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-17 00:12:34.070854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-17 00:12:34.070825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.070830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.070807: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-17 00:12:34.070845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.070905: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-17 00:12:34.070814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.070926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.070845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 00:12:34.070948: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.070862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 00:12:34.070973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.070871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 00:12:34.070982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.070991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.071002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073286: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073292: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073296: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073469: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-17 00:12:34.073482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:12:34.073290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.073471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:12:34.073295: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.073482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073469: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:12:34.073290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073308: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:12:34.073309: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073469: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:12:34.073313: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:12:34.073312: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:12:34.073315: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:12:34.073317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:12:34.073319: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073474: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.073482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073476: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.073491: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073479: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.073486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073487: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073483: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.073493: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073495: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:12:34.073496: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073489: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-17 00:12:34.073496: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073498: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:12:34.073502: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.073494: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:12:34.073501: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073501: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073502: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073510: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073511: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073510: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073513: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:12:34.073514: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073875: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073878: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073881: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073882: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:12:34.073991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073880: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:12:34.073990: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073884: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:12:34.073995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073886: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:12:34.073995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073894: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.074007: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073899: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073901: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074001: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-17 00:12:34.073901: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:12:34.073903: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:12:34.073998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:12:34.073964: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.073998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.074000: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:12:34.074017: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074018: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074021: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074025: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:12:34.074026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.081151: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.081181: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.081194: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.081220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.081227: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:12:34.081496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.081231: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:12:34.081235: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081528: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:12:34.081312: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081554: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.081723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083569: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083576: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083581: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083580: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:12:34.083587: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083589: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083590: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083596: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083595: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083597: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083599: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:12:34.083600: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084059: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084071: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084068: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084068: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:12:34.084081: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084083: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084085: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084086: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084087: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:12:34.084090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071246: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071247: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071259: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071251: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071256: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071261: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071258: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:12:34.071268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071269: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071273: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071275: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:12:34.071279: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +6: Successfully preprocessed all matching files. +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +3: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +3: Building extension module utils... +3: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +0: Loading extension module utils... +5: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils...Loading extension module utils... +3: +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: +0: Loading extension module utils...Loading extension module utils... +0: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils...Loading extension module utils... +2: +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...Loading extension module utils... +1: +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +1: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils...Loading extension module utils... +7: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/44m91b100m/3327057.out b/44m91b100m/3327057.out new file mode 100644 index 0000000000000000000000000000000000000000..1e02a90a222d9e68beb196e225095f5051daaa58 --- /dev/null +++ b/44m91b100m/3327057.out @@ -0,0 +1,68143 @@ +Model parameters: d_model 512 ffw_size 2048 kv_size 64 n_heads 8 n_layers 8 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 8 --hidden-size 512 --num-attention-heads 8 --kv-channels 64 --ffn-hidden-size 2048 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 44_416_143 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-44m91b100m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 44_416_143 --lr-warmup-samples 444_161 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_44m91b100m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_44m91b100m --load checkpoints_44m91b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3327057.json --zero-stage 0 +START 3327057: Fri 17 Mar 2023 12:11:06 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 50.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 50.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 48.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 40.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 40.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 39.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 45.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 50.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 49.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 43.0c 77.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 48.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +7: Launching on nid006236 (7/8), master nid006229 port 9999, GPUs 8, CUDA: True +5: Launching on nid006234 (5/8), master nid006229 port 9999, GPUs 8, CUDA: True +2: Launching on nid006231 (2/8), master nid006229 port 9999, GPUs 8, CUDA: True +6: Launching on nid006235 (6/8), master nid006229 port 9999, GPUs 8, CUDA: True +4: Launching on nid006233 (4/8), master nid006229 port 9999, GPUs 8, CUDA: True +1: Launching on nid006230 (1/8), master nid006229 port 9999, GPUs 8, CUDA: True +3: Launching on nid006232 (3/8), master nid006229 port 9999, GPUs 8, CUDA: True +0: Launching on nid006229 (0/8), master nid006229 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3327057.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2048 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 512 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-44m91b100m +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_44m91b100m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 44416143 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 444161 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 8 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 8 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_44m91b100m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_44m91b100m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 44416143 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 00:13:48,090] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.104 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 25.882 seconds +0: time to initialize megatron (seconds): 56.826 +0: [after megatron is initialized] datetime: 2023-03-17 00:14:16 +0: building GPT model ... +0: [2023-03-17 00:14:16,966] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 00:14:16,967] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 00:14:16,967] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.66 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-17 00:14:18,951] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=15 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: undo +0: 12: MixedFusedLayerNorm +0: 13: EmbeddingPipe +0: 14: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 00:14:19,216] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 00:14:19,216] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:14:19,217] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.68 GB, percent = 6.1% +0: setting training iterations to 173500 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 00:14:19,218] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 00:14:30,821] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 00:14:30,822] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 00:14:30,822] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 00:14:30,825] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 00:14:30,825] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 00:14:30,944] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 00:14:30,945] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:14:30,945] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.35 GB, percent = 6.2% +3: ninja: no work to do. +3: Time to load utils op: 0.25453686714172363 seconds +3: Time to load utils op: 0.0008056163787841797 seconds +3: ninja: no work to do. +3: Time to load utils op: 0.21546435356140137 seconds +3: Time to load utils op: 0.0006551742553710938 seconds +0: Time to load utils op: 0.4106309413909912 seconds +5: Time to load utils op: 0.5247793197631836 seconds +0: Time to load utils op: 0.5055019855499268 seconds +0: Time to load utils op: 0.5061633586883545 seconds +0: Time to load utils op: 0.5059621334075928 seconds +0: Time to load utils op: 0.505626916885376 seconds +0: Time to load utils op: 0.5060796737670898 seconds +0: Time to load utils op: 0.5069994926452637 seconds +0: Time to load utils op: 0.5061936378479004 seconds +3: Time to load utils op: 0.5046885013580322 seconds +3: Time to load utils op: 0.504981279373169 seconds +3: Time to load utils op: 0.5050342082977295 seconds +3: Time to load utils op: 0.5050628185272217 seconds +2: Time to load utils op: 0.5113101005554199 seconds +2: Time to load utils op: 0.5112278461456299 seconds +2: Time to load utils op: 0.5121457576751709 secondsTime to load utils op: 0.5110151767730713 seconds +2: +2: Time to load utils op: 0.5120561122894287 secondsTime to load utils op: 0.5121605396270752 seconds +2: +2: Time to load utils op: 0.511106014251709 seconds +2: Time to load utils op: 0.5116329193115234 seconds +1: Time to load utils op: 0.5114619731903076 seconds +1: Time to load utils op: 0.5114734172821045 seconds +1: Time to load utils op: 0.5114812850952148 seconds +1: Time to load utils op: 0.5115160942077637 seconds +1: Time to load utils op: 0.5115218162536621 seconds +1: Time to load utils op: 0.5115156173706055 secondsTime to load utils op: 0.5115177631378174 seconds +1: +1: Time to load utils op: 0.511526346206665 seconds +3: Time to load utils op: 0.0005195140838623047 seconds +3: Time to load utils op: 0.0005409717559814453 seconds +3: Time to load utils op: 0.0004787445068359375 seconds +3: Time to load utils op: 0.00045800209045410156 seconds +3: Time to load utils op: 0.3027799129486084 seconds +3: Time to load utils op: 0.3029518127441406 seconds +5: Time to load utils op: 0.3033294677734375 seconds +5: Time to load utils op: 0.30376338958740234 seconds +5: Time to load utils op: 0.3040134906768799 seconds +5: Time to load utils op: 0.3041970729827881 seconds +5: Time to load utils op: 0.30467653274536133 seconds +5: Time to load utils op: 0.30426740646362305 seconds +5: Time to load utils op: 0.3042447566986084 seconds +6: Time to load utils op: 0.31343817710876465 seconds +6: Time to load utils op: 0.31324267387390137 seconds +6: Time to load utils op: 0.3133091926574707 seconds +6: Time to load utils op: 0.3138742446899414 secondsTime to load utils op: 0.31330299377441406 seconds +6: +6: Time to load utils op: 0.31330084800720215 secondsTime to load utils op: 0.31333017349243164 seconds +6: +6: Time to load utils op: 0.3139808177947998 seconds +7: Time to load utils op: 0.3097836971282959 secondsTime to load utils op: 0.3098154067993164 seconds +7: +7: Time to load utils op: 0.30982089042663574 secondsTime to load utils op: 0.3098294734954834 seconds +7: Time to load utils op: 0.3098320960998535 seconds +7: Time to load utils op: 0.309826135635376 seconds +7: Time to load utils op: 0.30983519554138184 seconds +7: +7: Time to load utils op: 0.30983614921569824 seconds +4: Time to load utils op: 0.31423091888427734 seconds +4: Time to load utils op: 0.31423282623291016 seconds +4: Time to load utils op: 0.314267635345459 seconds +4: Time to load utils op: 0.3142738342285156 secondsTime to load utils op: 0.31430745124816895 secondsTime to load utils op: 0.3142855167388916 seconds +4: +4: +4: Time to load utils op: 0.31429171562194824 secondsTime to load utils op: 0.3142893314361572 seconds +4: +3: Time to load utils op: 0.0005340576171875 seconds +3: Time to load utils op: 0.0005323886871337891 seconds +0: Time to load utils op: 0.0004794597625732422 seconds +0: Time to load utils op: 0.00041747093200683594 seconds +0: Time to load utils op: 0.0004191398620605469 secondsTime to load utils op: 0.0005693435668945312 seconds +0: +0: Time to load utils op: 0.0005950927734375 secondsTime to load utils op: 0.00041747093200683594 secondsTime to load utils op: 0.0005645751953125 seconds +0: +0: +2: Time to load utils op: 0.0009548664093017578 seconds +2: Time to load utils op: 0.0008945465087890625 seconds +2: Time to load utils op: 0.0007920265197753906 seconds +2: Time to load utils op: 0.0009958744049072266 seconds +2: Time to load utils op: 0.0010972023010253906 secondsTime to load utils op: 0.0010993480682373047 seconds +2: +2: Time to load utils op: 0.0011529922485351562 seconds +2: Time to load utils op: 0.0011417865753173828 seconds +6: Time to load utils op: 0.0006539821624755859 seconds +6: Time to load utils op: 0.0008044242858886719 seconds +6: Time to load utils op: 0.0011839866638183594 seconds +6: Time to load utils op: 0.001161813735961914 seconds +6: Time to load utils op: 0.0011298656463623047 seconds +6: Time to load utils op: 0.0011806488037109375 seconds +6: Time to load utils op: 0.0010654926300048828 seconds +6: Time to load utils op: 0.001291513442993164 seconds +1: Time to load utils op: 0.0007917881011962891 seconds +1: Time to load utils op: 0.0007672309875488281 secondsTime to load utils op: 0.000843048095703125 seconds +1: +1: Time to load utils op: 0.0009324550628662109 seconds +5: Time to load utils op: 0.0004930496215820312 seconds +5: Time to load utils op: 0.0004963874816894531 seconds +5: Time to load utils op: 0.0005035400390625 seconds +5: Time to load utils op: 0.0005385875701904297 seconds +5: Time to load utils op: 0.0005581378936767578 seconds +1: Time to load utils op: 0.0011630058288574219 secondsTime to load utils op: 0.0010929107666015625 seconds +1: +5: Time to load utils op: 0.0005676746368408203 seconds +1: Time to load utils op: 0.0010347366333007812 seconds +1: Time to load utils op: 0.0011692047119140625 seconds +5: Time to load utils op: 0.0006387233734130859 seconds +5: Time to load utils op: 0.0006163120269775391 seconds +4: Time to load utils op: 0.0008265972137451172 seconds +7: Time to load utils op: 0.000782012939453125 seconds +7: Time to load utils op: 0.0009365081787109375 seconds +4: Time to load utils op: 0.0008540153503417969 seconds +7: Time to load utils op: 0.0009930133819580078 seconds +4: Time to load utils op: 0.0011229515075683594 seconds +4: Time to load utils op: 0.0010602474212646484 secondsTime to load utils op: 0.0010936260223388672 secondsTime to load utils op: 0.001130819320678711 seconds +4: +4: +4: Time to load utils op: 0.0011551380157470703 seconds +4: Time to load utils op: 0.0011582374572753906 seconds +7: Time to load utils op: 0.0011539459228515625 seconds +7: Time to load utils op: 0.0011432170867919922 seconds +7: Time to load utils op: 0.0011658668518066406 seconds +7: Time to load utils op: 0.0011341571807861328 seconds +7: Time to load utils op: 0.001245260238647461 seconds +0: [2023-03-17 00:14:31,467] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 00:14:31,468] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,468] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.47 GB, percent = 6.3% +0: [2023-03-17 00:14:31,582] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 00:14:31,583] [INFO] [utils.py:828:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,583] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:31,687] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 00:14:31,688] [INFO] [utils.py:828:see_memory_usage] MA 0.24 GB Max_MA 0.24 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,688] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:31,791] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 00:14:31,791] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,791] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:31,893] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 00:14:31,893] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,893] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:31,997] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 00:14:31,997] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:31,997] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:32,099] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 00:14:32,100] [INFO] [utils.py:828:see_memory_usage] MA 0.29 GB Max_MA 0.29 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:32,100] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:32,207] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 00:14:32,207] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:32,207] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:32,309] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 00:14:32,310] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 00:14:32,310] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-17 00:14:32,310] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 00:14:32,310] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 00:14:32,310] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 00:14:32,310] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 00:14:32,310] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 00:14:32,311] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-17 00:14:32,312] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 00:14:32,313] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 00:14:32,313] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 00:14:32,313] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 00:14:32,313] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00042700767517089844 seconds +0: [2023-03-17 00:14:32,313] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-17 00:14:32,365] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=52024320 (52.024M) TOTAL_PARAMS=52024320 (52.024M) UNIQUE_PARAMS=52024320 (52.024M) +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_44m91b100m +0: will not load any checkpoints and will start from random +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,371] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-17 00:14:32,372] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_44m91b100m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 7.26 +0: estimated model parameters: 0.05202432 +0: estimated model parameters without embeddings: 0.025220096 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 00:14:32 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 44416143 +0: validation: 44544 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.028636 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.094 seconds +0: total number of samples: 44461248 +0: total number of epochs: 911 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.049906 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_44544ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_44544ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_44544ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.058 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 00:14:46 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 16033.91 | train/valid/test-data-iterators-setup: 13319.66 +0: [000-000] 0.0520B / 0.0252B +0: [before the start of training step] datetime: 2023-03-17 00:14:46 +0: [2023-03-17 00:14:47,426] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-17 00:14:47,426] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-17 00:14:47,426] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-17 00:14:47,426] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-17 00:14:47,426] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 1986.51220703125 | max allocated: 4434.69921875 | reserved: 5476.0 | max reserved: 5476.0 +7: iteration 10/ 173500 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.23 | learning rate: 1.153E-06 | global batch size: 256 | lm loss: 1.094483E+01 | grad norm: 6.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 208.605 | TFLOPs: 3.27 | +7: iteration 20/ 173500 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.17 | learning rate: 2.305E-06 | global batch size: 256 | lm loss: 1.083033E+01 | grad norm: 5.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.863 | TFLOPs: 23.93 | +7: iteration 30/ 173500 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.17 | learning rate: 3.458E-06 | global batch size: 256 | lm loss: 1.054820E+01 | grad norm: 3.759 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.392 | TFLOPs: 23.80 | +7: iteration 40/ 173500 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.17 | learning rate: 4.611E-06 | global batch size: 256 | lm loss: 1.028292E+01 | grad norm: 2.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.205 | TFLOPs: 23.70 | +7: iteration 50/ 173500 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.17 | learning rate: 5.764E-06 | global batch size: 256 | lm loss: 1.010078E+01 | grad norm: 1.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.444 | TFLOPs: 23.15 | +7: iteration 60/ 173500 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.18 | learning rate: 6.916E-06 | global batch size: 256 | lm loss: 9.983430E+00 | grad norm: 1.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.305 | TFLOPs: 22.27 | +7: iteration 70/ 173500 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.17 | learning rate: 8.069E-06 | global batch size: 256 | lm loss: 9.888268E+00 | grad norm: 1.850 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.402 | TFLOPs: 23.67 | +7: iteration 80/ 173500 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.17 | learning rate: 9.222E-06 | global batch size: 256 | lm loss: 9.777607E+00 | grad norm: 1.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.514 | TFLOPs: 23.55 | +7: iteration 90/ 173500 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.20 | learning rate: 1.037E-05 | global batch size: 256 | lm loss: 9.669044E+00 | grad norm: 1.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1298.606 | TFLOPs: 20.37 | +7: iteration 100/ 173500 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.17 | learning rate: 1.153E-05 | global batch size: 256 | lm loss: 9.553539E+00 | grad norm: 1.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.943 | TFLOPs: 23.22 | +7: iteration 110/ 173500 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.17 | learning rate: 1.268E-05 | global batch size: 256 | lm loss: 9.437059E+00 | grad norm: 1.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.486 | TFLOPs: 23.99 | +7: iteration 120/ 173500 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.17 | learning rate: 1.383E-05 | global batch size: 256 | lm loss: 9.317985E+00 | grad norm: 1.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.654 | TFLOPs: 23.61 | +7: iteration 130/ 173500 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.17 | learning rate: 1.499E-05 | global batch size: 256 | lm loss: 9.200048E+00 | grad norm: 1.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.912 | TFLOPs: 23.59 | +7: iteration 140/ 173500 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.17 | learning rate: 1.614E-05 | global batch size: 256 | lm loss: 9.085298E+00 | grad norm: 1.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.490 | TFLOPs: 23.33 | +7: iteration 150/ 173500 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.16 | learning rate: 1.729E-05 | global batch size: 256 | lm loss: 8.974730E+00 | grad norm: 1.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.526 | TFLOPs: 24.47 | +7: iteration 160/ 173500 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.16 | learning rate: 1.844E-05 | global batch size: 256 | lm loss: 8.856667E+00 | grad norm: 1.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.677 | TFLOPs: 24.74 | +7: iteration 170/ 173500 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.16 | learning rate: 1.960E-05 | global batch size: 256 | lm loss: 8.734164E+00 | grad norm: 1.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.243 | TFLOPs: 24.64 | +7: iteration 180/ 173500 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.17 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 8.615869E+00 | grad norm: 1.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.236 | TFLOPs: 24.17 | +7: iteration 190/ 173500 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.17 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 8.496962E+00 | grad norm: 1.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.383 | TFLOPs: 24.22 | +7: iteration 200/ 173500 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.18 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 8.378725E+00 | grad norm: 1.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.858 | TFLOPs: 22.86 | +7: iteration 210/ 173500 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.18 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 8.253471E+00 | grad norm: 1.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1429.527 | TFLOPs: 22.42 | +7: iteration 220/ 173500 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.18 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 8.122841E+00 | grad norm: 1.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.207 | TFLOPs: 22.07 | +7: iteration 230/ 173500 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.16 | learning rate: 2.651E-05 | global batch size: 256 | lm loss: 8.008952E+00 | grad norm: 1.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.870 | TFLOPs: 24.57 | +7: iteration 240/ 173500 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.17 | learning rate: 2.767E-05 | global batch size: 256 | lm loss: 7.894707E+00 | grad norm: 1.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.795 | TFLOPs: 24.29 | +7: iteration 250/ 173500 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.18 | learning rate: 2.882E-05 | global batch size: 256 | lm loss: 7.773412E+00 | grad norm: 1.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.167 | TFLOPs: 22.48 | +7: iteration 260/ 173500 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.18 | learning rate: 2.997E-05 | global batch size: 256 | lm loss: 7.660309E+00 | grad norm: 1.102 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.091 | TFLOPs: 21.94 | +7: iteration 270/ 173500 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.17 | learning rate: 3.112E-05 | global batch size: 256 | lm loss: 7.580443E+00 | grad norm: 1.042 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.618 | TFLOPs: 23.86 | +7: iteration 280/ 173500 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.16 | learning rate: 3.228E-05 | global batch size: 256 | lm loss: 7.479124E+00 | grad norm: 0.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.877 | TFLOPs: 24.68 | +7: iteration 290/ 173500 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.16 | learning rate: 3.343E-05 | global batch size: 256 | lm loss: 7.389704E+00 | grad norm: 0.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.324 | TFLOPs: 24.85 | +7: iteration 300/ 173500 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.17 | learning rate: 3.458E-05 | global batch size: 256 | lm loss: 7.312852E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1499.503 | TFLOPs: 23.52 | +7: iteration 310/ 173500 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.17 | learning rate: 3.573E-05 | global batch size: 256 | lm loss: 7.247289E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.714 | TFLOPs: 24.12 | +7: iteration 320/ 173500 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.17 | learning rate: 3.689E-05 | global batch size: 256 | lm loss: 7.194901E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.396 | TFLOPs: 23.81 | +7: iteration 330/ 173500 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.16 | learning rate: 3.804E-05 | global batch size: 256 | lm loss: 7.124931E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.969 | TFLOPs: 24.72 | +7: iteration 340/ 173500 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.17 | learning rate: 3.919E-05 | global batch size: 256 | lm loss: 7.094804E+00 | grad norm: 0.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.679 | TFLOPs: 24.18 | +7: iteration 350/ 173500 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.16 | learning rate: 4.035E-05 | global batch size: 256 | lm loss: 7.042100E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.409 | TFLOPs: 25.65 | +7: iteration 360/ 173500 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.16 | learning rate: 4.150E-05 | global batch size: 256 | lm loss: 6.985632E+00 | grad norm: 0.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.361 | TFLOPs: 24.44 | +7: iteration 370/ 173500 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.16 | learning rate: 4.265E-05 | global batch size: 256 | lm loss: 6.958079E+00 | grad norm: 0.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.440 | TFLOPs: 24.61 | +7: iteration 380/ 173500 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.17 | learning rate: 4.380E-05 | global batch size: 256 | lm loss: 6.932171E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.980 | TFLOPs: 24.03 | +7: iteration 390/ 173500 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.16 | learning rate: 4.496E-05 | global batch size: 256 | lm loss: 6.873635E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.709 | TFLOPs: 25.50 | +7: iteration 400/ 173500 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.17 | learning rate: 4.611E-05 | global batch size: 256 | lm loss: 6.844902E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.878 | TFLOPs: 24.16 | +7: iteration 410/ 173500 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.16 | learning rate: 4.726E-05 | global batch size: 256 | lm loss: 6.812637E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.837 | TFLOPs: 24.70 | +7: iteration 420/ 173500 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.17 | learning rate: 4.841E-05 | global batch size: 256 | lm loss: 6.764165E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.376 | TFLOPs: 23.59 | +7: iteration 430/ 173500 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.16 | learning rate: 4.957E-05 | global batch size: 256 | lm loss: 6.749188E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.923 | TFLOPs: 24.62 | +7: iteration 440/ 173500 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.17 | learning rate: 5.072E-05 | global batch size: 256 | lm loss: 6.707951E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.461 | TFLOPs: 23.37 | +7: iteration 450/ 173500 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.16 | learning rate: 5.187E-05 | global batch size: 256 | lm loss: 6.682183E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.858 | TFLOPs: 25.40 | +7: iteration 460/ 173500 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.17 | learning rate: 5.303E-05 | global batch size: 256 | lm loss: 6.651023E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1468.365 | TFLOPs: 23.03 | +7: iteration 470/ 173500 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.17 | learning rate: 5.418E-05 | global batch size: 256 | lm loss: 6.633155E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1483.126 | TFLOPs: 23.26 | +7: iteration 480/ 173500 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.17 | learning rate: 5.533E-05 | global batch size: 256 | lm loss: 6.622745E+00 | grad norm: 0.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.911 | TFLOPs: 23.15 | +7: iteration 490/ 173500 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.16 | learning rate: 5.648E-05 | global batch size: 256 | lm loss: 6.606720E+00 | grad norm: 0.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.179 | TFLOPs: 25.09 | +7: iteration 500/ 173500 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.17 | learning rate: 5.764E-05 | global batch size: 256 | lm loss: 6.567258E+00 | grad norm: 0.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.203 | TFLOPs: 24.06 | +7: iteration 510/ 173500 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.17 | learning rate: 5.879E-05 | global batch size: 256 | lm loss: 6.538212E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.935 | TFLOPs: 23.85 | +7: iteration 520/ 173500 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.17 | learning rate: 5.994E-05 | global batch size: 256 | lm loss: 6.529171E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.407 | TFLOPs: 24.03 | +7: iteration 530/ 173500 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.16 | learning rate: 6.109E-05 | global batch size: 256 | lm loss: 6.519296E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.099 | TFLOPs: 24.36 | +7: iteration 540/ 173500 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.17 | learning rate: 6.225E-05 | global batch size: 256 | lm loss: 6.497031E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.127 | TFLOPs: 23.57 | +7: iteration 550/ 173500 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.17 | learning rate: 6.340E-05 | global batch size: 256 | lm loss: 6.486476E+00 | grad norm: 0.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.086 | TFLOPs: 23.95 | +7: iteration 560/ 173500 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.17 | learning rate: 6.455E-05 | global batch size: 256 | lm loss: 6.463375E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.554 | TFLOPs: 23.25 | +7: iteration 570/ 173500 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.17 | learning rate: 6.571E-05 | global batch size: 256 | lm loss: 6.438885E+00 | grad norm: 0.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.342 | TFLOPs: 24.02 | +7: iteration 580/ 173500 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.16 | learning rate: 6.686E-05 | global batch size: 256 | lm loss: 6.413338E+00 | grad norm: 0.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.863 | TFLOPs: 24.34 | +7: iteration 590/ 173500 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.16 | learning rate: 6.801E-05 | global batch size: 256 | lm loss: 6.420156E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.378 | TFLOPs: 24.94 | +7: iteration 600/ 173500 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.16 | learning rate: 6.916E-05 | global batch size: 256 | lm loss: 6.402865E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.238 | TFLOPs: 24.61 | +7: iteration 610/ 173500 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.17 | learning rate: 7.032E-05 | global batch size: 256 | lm loss: 6.375974E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.291 | TFLOPs: 24.16 | +7: iteration 620/ 173500 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.17 | learning rate: 7.147E-05 | global batch size: 256 | lm loss: 6.362081E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.747 | TFLOPs: 24.24 | +7: iteration 630/ 173500 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.16 | learning rate: 7.262E-05 | global batch size: 256 | lm loss: 6.351416E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.492 | TFLOPs: 24.41 | +7: iteration 640/ 173500 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.18 | learning rate: 7.378E-05 | global batch size: 256 | lm loss: 6.347457E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.918 | TFLOPs: 22.75 | +7: iteration 650/ 173500 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.17 | learning rate: 7.493E-05 | global batch size: 256 | lm loss: 6.332642E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.778 | TFLOPs: 23.32 | +7: iteration 660/ 173500 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.16 | learning rate: 7.608E-05 | global batch size: 256 | lm loss: 6.313762E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.474 | TFLOPs: 25.54 | +7: iteration 670/ 173500 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.17 | learning rate: 7.723E-05 | global batch size: 256 | lm loss: 6.297423E+00 | grad norm: 1.058 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.190 | TFLOPs: 24.14 | +7: iteration 680/ 173500 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.16 | learning rate: 7.839E-05 | global batch size: 256 | lm loss: 6.298869E+00 | grad norm: 0.972 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.419 | TFLOPs: 24.89 | +7: iteration 690/ 173500 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.17 | learning rate: 7.954E-05 | global batch size: 256 | lm loss: 6.274003E+00 | grad norm: 0.904 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.286 | TFLOPs: 23.98 | +7: iteration 700/ 173500 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.16 | learning rate: 8.069E-05 | global batch size: 256 | lm loss: 6.263961E+00 | grad norm: 0.845 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.473 | TFLOPs: 24.43 | +7: iteration 710/ 173500 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.17 | learning rate: 8.184E-05 | global batch size: 256 | lm loss: 6.247409E+00 | grad norm: 0.870 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.569 | TFLOPs: 23.97 | +7: iteration 720/ 173500 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.17 | learning rate: 8.300E-05 | global batch size: 256 | lm loss: 6.232645E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.751 | TFLOPs: 23.97 | +7: iteration 730/ 173500 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.18 | learning rate: 8.415E-05 | global batch size: 256 | lm loss: 6.215924E+00 | grad norm: 0.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.903 | TFLOPs: 22.22 | +7: iteration 740/ 173500 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.17 | learning rate: 8.530E-05 | global batch size: 256 | lm loss: 6.218312E+00 | grad norm: 0.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.179 | TFLOPs: 23.93 | +7: iteration 750/ 173500 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.16 | learning rate: 8.646E-05 | global batch size: 256 | lm loss: 6.200503E+00 | grad norm: 0.736 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.283 | TFLOPs: 24.92 | +7: iteration 760/ 173500 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.16 | learning rate: 8.761E-05 | global batch size: 256 | lm loss: 6.174608E+00 | grad norm: 0.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.226 | TFLOPs: 25.00 | +7: iteration 770/ 173500 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.17 | learning rate: 8.876E-05 | global batch size: 256 | lm loss: 6.143757E+00 | grad norm: 1.115 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.787 | TFLOPs: 23.32 | +7: iteration 780/ 173500 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.18 | learning rate: 8.991E-05 | global batch size: 256 | lm loss: 6.148958E+00 | grad norm: 0.919 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.374 | TFLOPs: 22.84 | +7: iteration 790/ 173500 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.17 | learning rate: 9.107E-05 | global batch size: 256 | lm loss: 6.130771E+00 | grad norm: 0.909 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1481.781 | TFLOPs: 23.24 | +7: iteration 800/ 173500 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.18 | learning rate: 9.222E-05 | global batch size: 256 | lm loss: 6.114100E+00 | grad norm: 0.900 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.988 | TFLOPs: 22.57 | +7: iteration 810/ 173500 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.17 | learning rate: 9.337E-05 | global batch size: 256 | lm loss: 6.100539E+00 | grad norm: 1.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1481.163 | TFLOPs: 23.23 | +7: iteration 820/ 173500 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.17 | learning rate: 9.452E-05 | global batch size: 256 | lm loss: 6.099314E+00 | grad norm: 1.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.235 | TFLOPs: 23.94 | +7: iteration 830/ 173500 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.17 | learning rate: 9.568E-05 | global batch size: 256 | lm loss: 6.070179E+00 | grad norm: 0.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.280 | TFLOPs: 23.32 | +7: iteration 840/ 173500 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.18 | learning rate: 9.683E-05 | global batch size: 256 | lm loss: 6.054445E+00 | grad norm: 1.019 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.243 | TFLOPs: 22.62 | +7: iteration 850/ 173500 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.17 | learning rate: 9.798E-05 | global batch size: 256 | lm loss: 6.039222E+00 | grad norm: 1.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1500.939 | TFLOPs: 23.54 | +7: iteration 860/ 173500 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.16 | learning rate: 9.914E-05 | global batch size: 256 | lm loss: 6.032740E+00 | grad norm: 1.025 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.124 | TFLOPs: 24.86 | +7: iteration 870/ 173500 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.17 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 6.011692E+00 | grad norm: 0.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.601 | TFLOPs: 24.16 | +7: iteration 880/ 173500 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.17 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 6.009389E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.141 | TFLOPs: 23.73 | +7: iteration 890/ 173500 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.16 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 5.985099E+00 | grad norm: 1.063 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.207 | TFLOPs: 24.75 | +7: iteration 900/ 173500 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.17 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 5.976472E+00 | grad norm: 0.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.041 | TFLOPs: 24.23 | +7: iteration 910/ 173500 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 5.965524E+00 | grad norm: 1.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.510 | TFLOPs: 24.46 | +7: iteration 920/ 173500 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.17 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 5.976131E+00 | grad norm: 1.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.662 | TFLOPs: 24.16 | +7: iteration 930/ 173500 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 5.942941E+00 | grad norm: 0.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.442 | TFLOPs: 24.42 | +7: iteration 940/ 173500 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 5.929107E+00 | grad norm: 1.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.977 | TFLOPs: 24.97 | +7: iteration 950/ 173500 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.17 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 5.914635E+00 | grad norm: 1.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.903 | TFLOPs: 23.99 | +7: iteration 960/ 173500 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.17 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 5.906785E+00 | grad norm: 0.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.227 | TFLOPs: 23.81 | +7: iteration 970/ 173500 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.16 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 5.889445E+00 | grad norm: 1.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.085 | TFLOPs: 24.98 | +7: iteration 980/ 173500 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.17 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 5.880866E+00 | grad norm: 1.043 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.226 | TFLOPs: 24.23 | +7: iteration 990/ 173500 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.17 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 5.873759E+00 | grad norm: 1.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.490 | TFLOPs: 24.00 | +7: iteration 1000/ 173500 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.16 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 5.865643E+00 | grad norm: 1.089 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.290 | TFLOPs: 25.27 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 5.818318E+00 | lm loss PPL: 3.364059E+02 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_44m91b100m +0: [2023-03-17 00:17:45,411] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-17 00:17:45,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:17:45,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:17:45,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:17:45,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:17:45,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:17:45,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:17:45,611] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:17:45,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:17:45,619] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:17:45,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:17:45,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:17:45,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:17:45,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:17:45,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:17:45,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:17:45,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:17:45,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:17:45,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:17:45,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:17:45,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:17:45,662] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-17 00:17:45,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:17:45,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:17:45,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:17:45,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-17 00:17:45,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:17:45,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:17:45,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:17:45,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 00:17:45,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:17:45,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 297.64 +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:17:45,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:17:45,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: iteration 1010/ 173500 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.20 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 5.846238E+00 | grad norm: 1.062 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1264.470 | TFLOPs: 19.83 | +7: iteration 1020/ 173500 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.16 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 5.839495E+00 | grad norm: 1.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.128 | TFLOPs: 24.40 | +7: iteration 1030/ 173500 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.17 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 5.827932E+00 | grad norm: 1.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.893 | TFLOPs: 23.02 | +7: iteration 1040/ 173500 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.17 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 5.815763E+00 | grad norm: 0.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.106 | TFLOPs: 24.11 | +7: iteration 1050/ 173500 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.17 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 5.801493E+00 | grad norm: 1.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.342 | TFLOPs: 23.17 | +7: iteration 1060/ 173500 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.16 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 5.788890E+00 | grad norm: 1.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.226 | TFLOPs: 24.48 | +7: iteration 1070/ 173500 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.19 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 5.774991E+00 | grad norm: 1.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.090 | TFLOPs: 21.46 | +7: iteration 1080/ 173500 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 5.773173E+00 | grad norm: 1.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.755 | TFLOPs: 25.14 | +7: iteration 1090/ 173500 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 5.748616E+00 | grad norm: 1.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.945 | TFLOPs: 25.73 | +7: iteration 1100/ 173500 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 5.751179E+00 | grad norm: 0.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.398 | TFLOPs: 24.86 | +7: iteration 1110/ 173500 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.17 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 5.737375E+00 | grad norm: 1.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.525 | TFLOPs: 24.13 | +7: iteration 1120/ 173500 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 5.713351E+00 | grad norm: 1.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.997 | TFLOPs: 25.42 | +7: iteration 1130/ 173500 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.16 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 5.715084E+00 | grad norm: 1.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.117 | TFLOPs: 25.30 | +7: iteration 1140/ 173500 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 5.691091E+00 | grad norm: 1.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.441 | TFLOPs: 24.38 | +7: iteration 1150/ 173500 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 5.693197E+00 | grad norm: 1.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.572 | TFLOPs: 24.63 | +7: iteration 1160/ 173500 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.17 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 5.678979E+00 | grad norm: 1.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.161 | TFLOPs: 23.71 | +7: iteration 1170/ 173500 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.16 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 5.672793E+00 | grad norm: 1.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.215 | TFLOPs: 24.53 | +7: iteration 1180/ 173500 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.17 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 5.664103E+00 | grad norm: 1.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.585 | TFLOPs: 24.21 | +7: iteration 1190/ 173500 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.16 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 5.648818E+00 | grad norm: 0.890 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.735 | TFLOPs: 24.37 | +7: iteration 1200/ 173500 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.16 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 5.637393E+00 | grad norm: 1.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.381 | TFLOPs: 25.30 | +7: iteration 1210/ 173500 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.17 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 5.630153E+00 | grad norm: 1.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1514.499 | TFLOPs: 23.75 | +7: iteration 1220/ 173500 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 5.616582E+00 | grad norm: 0.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.198 | TFLOPs: 25.46 | +7: iteration 1230/ 173500 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.16 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 5.596059E+00 | grad norm: 1.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.979 | TFLOPs: 24.84 | +7: iteration 1240/ 173500 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 5.589482E+00 | grad norm: 1.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.051 | TFLOPs: 25.08 | +7: iteration 1250/ 173500 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.17 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 5.579333E+00 | grad norm: 1.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.743 | TFLOPs: 23.96 | +7: iteration 1260/ 173500 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.16 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 5.570916E+00 | grad norm: 1.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.522 | TFLOPs: 24.99 | +7: iteration 1270/ 173500 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.17 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 5.565501E+00 | grad norm: 1.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.096 | TFLOPs: 23.96 | +7: iteration 1280/ 173500 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.16 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 5.548999E+00 | grad norm: 0.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.581 | TFLOPs: 24.49 | +7: iteration 1290/ 173500 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.17 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 5.540463E+00 | grad norm: 1.067 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.521 | TFLOPs: 23.20 | +7: iteration 1300/ 173500 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.17 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 5.532383E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.071 | TFLOPs: 24.31 | +7: iteration 1310/ 173500 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.17 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 5.528476E+00 | grad norm: 1.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.016 | TFLOPs: 23.81 | +7: iteration 1320/ 173500 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.16 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 5.507189E+00 | grad norm: 1.079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.279 | TFLOPs: 24.66 | +7: iteration 1330/ 173500 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.17 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 5.513039E+00 | grad norm: 0.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1484.614 | TFLOPs: 23.28 | +7: iteration 1340/ 173500 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.16 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 5.481489E+00 | grad norm: 1.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.677 | TFLOPs: 24.69 | +7: iteration 1350/ 173500 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.17 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 5.489944E+00 | grad norm: 1.037 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.329 | TFLOPs: 24.16 | +7: iteration 1360/ 173500 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.17 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 5.459400E+00 | grad norm: 1.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.522 | TFLOPs: 23.81 | +7: iteration 1370/ 173500 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.17 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 5.460046E+00 | grad norm: 1.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.874 | TFLOPs: 24.16 | +7: iteration 1380/ 173500 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.17 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 5.452454E+00 | grad norm: 0.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.149 | TFLOPs: 23.89 | +7: iteration 1390/ 173500 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.17 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 5.436159E+00 | grad norm: 1.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.553 | TFLOPs: 23.78 | +7: iteration 1400/ 173500 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.16 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 5.431487E+00 | grad norm: 0.896 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.528 | TFLOPs: 24.91 | +7: iteration 1410/ 173500 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 5.419676E+00 | grad norm: 0.831 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.689 | TFLOPs: 24.85 | +7: iteration 1420/ 173500 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.17 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 5.408024E+00 | grad norm: 1.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.697 | TFLOPs: 23.64 | +7: iteration 1430/ 173500 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 5.385776E+00 | grad norm: 1.061 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.564 | TFLOPs: 24.83 | +7: iteration 1440/ 173500 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.17 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 5.389522E+00 | grad norm: 1.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.037 | TFLOPs: 23.34 | +7: iteration 1450/ 173500 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.17 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 5.379199E+00 | grad norm: 1.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.142 | TFLOPs: 23.95 | +7: iteration 1460/ 173500 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.17 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 5.366122E+00 | grad norm: 0.928 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.178 | TFLOPs: 23.46 | +7: iteration 1470/ 173500 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.16 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 5.369260E+00 | grad norm: 0.907 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.944 | TFLOPs: 25.33 | +7: iteration 1480/ 173500 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.17 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 5.349372E+00 | grad norm: 0.854 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.725 | TFLOPs: 24.19 | +7: iteration 1490/ 173500 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.16 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 5.357907E+00 | grad norm: 1.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.688 | TFLOPs: 24.48 | +7: iteration 1500/ 173500 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.16 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 5.347930E+00 | grad norm: 0.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.770 | TFLOPs: 25.28 | +7: iteration 1510/ 173500 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.17 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 5.329758E+00 | grad norm: 0.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.936 | TFLOPs: 23.46 | +7: iteration 1520/ 173500 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.16 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 5.306782E+00 | grad norm: 1.104 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.622 | TFLOPs: 24.99 | +7: iteration 1530/ 173500 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.16 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 5.305640E+00 | grad norm: 1.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.636 | TFLOPs: 25.32 | +7: iteration 1540/ 173500 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.17 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 5.306722E+00 | grad norm: 0.954 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.341 | TFLOPs: 22.98 | +7: iteration 1550/ 173500 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.16 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 5.284884E+00 | grad norm: 1.044 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.479 | TFLOPs: 25.66 | +7: iteration 1560/ 173500 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 5.282154E+00 | grad norm: 1.035 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.652 | TFLOPs: 25.93 | +7: iteration 1570/ 173500 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.16 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 5.269820E+00 | grad norm: 1.121 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.903 | TFLOPs: 25.61 | +7: iteration 1580/ 173500 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.16 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 5.267624E+00 | grad norm: 0.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.776 | TFLOPs: 25.10 | +7: iteration 1590/ 173500 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.16 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 5.249804E+00 | grad norm: 1.029 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.049 | TFLOPs: 25.72 | +7: iteration 1600/ 173500 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.17 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 5.253963E+00 | grad norm: 0.772 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.373 | TFLOPs: 24.28 | +7: iteration 1610/ 173500 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.16 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 5.239935E+00 | grad norm: 0.895 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.649 | TFLOPs: 24.38 | +7: iteration 1620/ 173500 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.16 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 5.224144E+00 | grad norm: 1.080 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.798 | TFLOPs: 24.62 | +7: iteration 1630/ 173500 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.16 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 5.230456E+00 | grad norm: 0.979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.103 | TFLOPs: 25.02 | +7: iteration 1640/ 173500 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.17 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 5.204922E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.495 | TFLOPs: 23.11 | +7: iteration 1650/ 173500 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.17 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 5.210931E+00 | grad norm: 0.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.285 | TFLOPs: 24.27 | +7: iteration 1660/ 173500 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 5.193649E+00 | grad norm: 1.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.521 | TFLOPs: 24.90 | +7: iteration 1670/ 173500 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.17 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 5.172920E+00 | grad norm: 1.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.668 | TFLOPs: 24.15 | +7: iteration 1680/ 173500 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.17 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 5.175151E+00 | grad norm: 0.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.098 | TFLOPs: 24.09 | +7: iteration 1690/ 173500 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.16 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 5.157759E+00 | grad norm: 0.951 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.077 | TFLOPs: 24.64 | +7: iteration 1700/ 173500 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.17 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 5.157778E+00 | grad norm: 1.030 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.468 | TFLOPs: 23.31 | +7: iteration 1710/ 173500 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 5.146725E+00 | grad norm: 0.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.865 | TFLOPs: 24.68 | +7: iteration 1720/ 173500 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 5.139659E+00 | grad norm: 0.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.925 | TFLOPs: 24.84 | +7: iteration 1730/ 173500 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 5.137285E+00 | grad norm: 0.936 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.339 | TFLOPs: 24.55 | +7: iteration 1740/ 173500 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.118845E+00 | grad norm: 0.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.135 | TFLOPs: 24.22 | +7: iteration 1750/ 173500 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.123123E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.298 | TFLOPs: 24.50 | +7: iteration 1760/ 173500 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.104716E+00 | grad norm: 0.821 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.362 | TFLOPs: 24.72 | +7: iteration 1770/ 173500 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.087981E+00 | grad norm: 1.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.389 | TFLOPs: 24.49 | +7: iteration 1780/ 173500 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.081685E+00 | grad norm: 0.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.205 | TFLOPs: 24.50 | +7: iteration 1790/ 173500 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.091161E+00 | grad norm: 0.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.780 | TFLOPs: 25.34 | +7: iteration 1800/ 173500 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.066504E+00 | grad norm: 0.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.942 | TFLOPs: 24.07 | +7: iteration 1810/ 173500 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.047121E+00 | grad norm: 0.927 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.207 | TFLOPs: 24.00 | +7: iteration 1820/ 173500 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.047333E+00 | grad norm: 0.878 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.749 | TFLOPs: 23.86 | +7: iteration 1830/ 173500 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.025462E+00 | grad norm: 0.941 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.660 | TFLOPs: 24.05 | +7: iteration 1840/ 173500 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.021955E+00 | grad norm: 0.825 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.601 | TFLOPs: 24.46 | +7: iteration 1850/ 173500 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.027612E+00 | grad norm: 0.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.079 | TFLOPs: 24.59 | +7: iteration 1860/ 173500 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.009081E+00 | grad norm: 0.893 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1460.784 | TFLOPs: 22.91 | +7: iteration 1870/ 173500 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.005905E+00 | grad norm: 0.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.136 | TFLOPs: 24.48 | +7: iteration 1880/ 173500 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.987793E+00 | grad norm: 0.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.104 | TFLOPs: 23.96 | +7: iteration 1890/ 173500 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.985677E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.509 | TFLOPs: 25.41 | +7: iteration 1900/ 173500 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.974247E+00 | grad norm: 0.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.277 | TFLOPs: 24.50 | +7: iteration 1910/ 173500 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.961548E+00 | grad norm: 0.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.555 | TFLOPs: 24.19 | +7: iteration 1920/ 173500 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.955909E+00 | grad norm: 0.836 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.298 | TFLOPs: 24.91 | +7: iteration 1930/ 173500 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.954799E+00 | grad norm: 0.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.622 | TFLOPs: 24.41 | +7: iteration 1940/ 173500 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.934749E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.918 | TFLOPs: 24.35 | +7: iteration 1950/ 173500 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.927072E+00 | grad norm: 0.893 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.687 | TFLOPs: 25.15 | +7: iteration 1960/ 173500 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.943632E+00 | grad norm: 0.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.393 | TFLOPs: 25.15 | +7: iteration 1970/ 173500 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.917624E+00 | grad norm: 0.844 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.252 | TFLOPs: 24.83 | +7: iteration 1980/ 173500 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.914783E+00 | grad norm: 0.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.919 | TFLOPs: 25.29 | +7: iteration 1990/ 173500 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.915568E+00 | grad norm: 0.817 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.275 | TFLOPs: 24.99 | +0: [2023-03-17 00:20:30,218] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00019999894289482022, 0.00019999894289482022, 0.00019999894289482022], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 173500 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.901785E+00 | grad norm: 0.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.345 | TFLOPs: 24.89 | +0: steps: 2000 loss: 4.8924 iter time (s): 0.170 samples/sec: 1506.030 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 4.770314E+00 | lm loss PPL: 1.179563E+02 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_44m91b100m +0: [2023-03-17 00:20:30,291] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-17 00:20:30,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:20:30,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:20:30,351] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:20:30,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:20:30,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:20:30,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:20:30,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:20:30,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:20:30,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:20:30,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:20:30,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:20:30,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:20:30,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:20:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:20:30,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:20:30,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:20:30,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:20:30,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:20:30,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:20:30,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:20:30,420] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-17 00:20:30,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:20:30,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:20:30,439] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:20:30,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:20:30,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:20:30,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-17 00:20:30,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:20:30,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-17 00:20:30,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:20:30,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:20:30,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.74 +7: iteration 2010/ 173500 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.895624E+00 | grad norm: 1.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.741 | TFLOPs: 21.68 | +7: iteration 2020/ 173500 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.881416E+00 | grad norm: 0.787 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.796 | TFLOPs: 24.88 | +7: iteration 2030/ 173500 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.888567E+00 | grad norm: 0.791 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.650 | TFLOPs: 24.82 | +7: iteration 2040/ 173500 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.884373E+00 | grad norm: 0.754 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.055 | TFLOPs: 24.17 | +7: iteration 2050/ 173500 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.881538E+00 | grad norm: 0.898 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.373 | TFLOPs: 24.31 | +7: iteration 2060/ 173500 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.870856E+00 | grad norm: 0.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.346 | TFLOPs: 25.41 | +7: iteration 2070/ 173500 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.870568E+00 | grad norm: 0.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.979 | TFLOPs: 24.03 | +7: iteration 2080/ 173500 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.868797E+00 | grad norm: 0.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.184 | TFLOPs: 24.19 | +7: iteration 2090/ 173500 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.851057E+00 | grad norm: 0.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.589 | TFLOPs: 24.57 | +7: iteration 2100/ 173500 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.848610E+00 | grad norm: 0.998 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.264 | TFLOPs: 24.77 | +7: iteration 2110/ 173500 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.832885E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.668 | TFLOPs: 23.14 | +7: iteration 2120/ 173500 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.827758E+00 | grad norm: 0.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.857 | TFLOPs: 24.63 | +7: iteration 2130/ 173500 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.832937E+00 | grad norm: 0.937 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.276 | TFLOPs: 24.41 | +7: iteration 2140/ 173500 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.819087E+00 | grad norm: 0.797 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.296 | TFLOPs: 25.22 | +7: iteration 2150/ 173500 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.821680E+00 | grad norm: 1.008 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.291 | TFLOPs: 24.72 | +7: iteration 2160/ 173500 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.818949E+00 | grad norm: 0.981 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.976 | TFLOPs: 25.25 | +7: iteration 2170/ 173500 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.817940E+00 | grad norm: 0.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.638 | TFLOPs: 24.29 | +7: iteration 2180/ 173500 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.810749E+00 | grad norm: 0.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.074 | TFLOPs: 24.83 | +7: iteration 2190/ 173500 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.803121E+00 | grad norm: 0.752 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.138 | TFLOPs: 25.02 | +7: iteration 2200/ 173500 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.787683E+00 | grad norm: 0.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.969 | TFLOPs: 24.40 | +7: iteration 2210/ 173500 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.800041E+00 | grad norm: 0.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.882 | TFLOPs: 22.77 | +7: iteration 2220/ 173500 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.784797E+00 | grad norm: 0.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.802 | TFLOPs: 24.26 | +7: iteration 2230/ 173500 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.785894E+00 | grad norm: 0.881 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.173 | TFLOPs: 24.25 | +7: iteration 2240/ 173500 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.771210E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.414 | TFLOPs: 23.66 | +7: iteration 2250/ 173500 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.772678E+00 | grad norm: 0.912 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.554 | TFLOPs: 24.25 | +7: iteration 2260/ 173500 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.766884E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.006 | TFLOPs: 24.97 | +7: iteration 2270/ 173500 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.752319E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.313 | TFLOPs: 24.09 | +7: iteration 2280/ 173500 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.761189E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.210 | TFLOPs: 24.00 | +7: iteration 2290/ 173500 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.759950E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.487 | TFLOPs: 23.83 | +7: iteration 2300/ 173500 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.750616E+00 | grad norm: 0.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.487 | TFLOPs: 25.24 | +7: iteration 2310/ 173500 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.745750E+00 | grad norm: 0.868 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.297 | TFLOPs: 24.31 | +7: iteration 2320/ 173500 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.738653E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.567 | TFLOPs: 25.15 | +7: iteration 2330/ 173500 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.749062E+00 | grad norm: 0.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.259 | TFLOPs: 24.97 | +7: iteration 2340/ 173500 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.724596E+00 | grad norm: 0.682 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.489 | TFLOPs: 25.40 | +7: iteration 2350/ 173500 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.718946E+00 | grad norm: 0.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.589 | TFLOPs: 24.46 | +7: iteration 2360/ 173500 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.729007E+00 | grad norm: 0.932 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.044 | TFLOPs: 25.05 | +7: iteration 2370/ 173500 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.720943E+00 | grad norm: 0.830 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.533 | TFLOPs: 24.30 | +7: iteration 2380/ 173500 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.741734E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.340 | TFLOPs: 24.67 | +7: iteration 2390/ 173500 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.718837E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.447 | TFLOPs: 24.60 | +7: iteration 2400/ 173500 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.707828E+00 | grad norm: 0.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.197 | TFLOPs: 25.30 | +7: iteration 2410/ 173500 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.715247E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.319 | TFLOPs: 23.91 | +7: iteration 2420/ 173500 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.718927E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.649 | TFLOPs: 24.77 | +7: iteration 2430/ 173500 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.705016E+00 | grad norm: 0.689 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.013 | TFLOPs: 25.11 | +7: iteration 2440/ 173500 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.709760E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.620 | TFLOPs: 25.09 | +7: iteration 2450/ 173500 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.688106E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.747 | TFLOPs: 25.18 | +7: iteration 2460/ 173500 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.705506E+00 | grad norm: 0.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.027 | TFLOPs: 25.59 | +7: iteration 2470/ 173500 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.689796E+00 | grad norm: 0.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.628 | TFLOPs: 24.68 | +7: iteration 2480/ 173500 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.696059E+00 | grad norm: 0.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.850 | TFLOPs: 25.39 | +7: iteration 2490/ 173500 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.681205E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.411 | TFLOPs: 24.28 | +7: iteration 2500/ 173500 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.685329E+00 | grad norm: 0.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.397 | TFLOPs: 25.57 | +7: iteration 2510/ 173500 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.688219E+00 | grad norm: 1.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.571 | TFLOPs: 25.15 | +7: iteration 2520/ 173500 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.690613E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.205 | TFLOPs: 23.82 | +7: iteration 2530/ 173500 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.673780E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.740 | TFLOPs: 25.65 | +7: iteration 2540/ 173500 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.671458E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.473 | TFLOPs: 24.79 | +7: iteration 2550/ 173500 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.663474E+00 | grad norm: 0.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.096 | TFLOPs: 25.63 | +7: iteration 2560/ 173500 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.674040E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.856 | TFLOPs: 25.36 | +7: iteration 2570/ 173500 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.668623E+00 | grad norm: 0.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.581 | TFLOPs: 25.52 | +7: iteration 2580/ 173500 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.654866E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.852 | TFLOPs: 25.22 | +7: iteration 2590/ 173500 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.664891E+00 | grad norm: 0.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.026 | TFLOPs: 24.92 | +7: iteration 2600/ 173500 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.660134E+00 | grad norm: 0.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.743 | TFLOPs: 24.74 | +7: iteration 2610/ 173500 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.660636E+00 | grad norm: 0.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.068 | TFLOPs: 24.97 | +7: iteration 2620/ 173500 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.653830E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.517 | TFLOPs: 25.88 | +7: iteration 2630/ 173500 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.648596E+00 | grad norm: 0.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.823 | TFLOPs: 25.45 | +7: iteration 2640/ 173500 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.629882E+00 | grad norm: 0.698 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.031 | TFLOPs: 24.56 | +7: iteration 2650/ 173500 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.652204E+00 | grad norm: 0.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.136 | TFLOPs: 25.66 | +7: iteration 2660/ 173500 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.633822E+00 | grad norm: 0.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.247 | TFLOPs: 25.03 | +7: iteration 2670/ 173500 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.652399E+00 | grad norm: 0.739 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.324 | TFLOPs: 25.65 | +7: iteration 2680/ 173500 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.638916E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.560 | TFLOPs: 24.47 | +7: iteration 2690/ 173500 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.637772E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.235 | TFLOPs: 25.30 | +7: iteration 2700/ 173500 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.625155E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.665 | TFLOPs: 23.96 | +7: iteration 2710/ 173500 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.637306E+00 | grad norm: 0.836 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.554 | TFLOPs: 24.94 | +7: iteration 2720/ 173500 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.635348E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.100 | TFLOPs: 25.44 | +7: iteration 2730/ 173500 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.623655E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.631 | TFLOPs: 24.98 | +7: iteration 2740/ 173500 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.617526E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.447 | TFLOPs: 24.71 | +7: iteration 2750/ 173500 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.620889E+00 | grad norm: 1.028 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.850 | TFLOPs: 25.12 | +7: iteration 2760/ 173500 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.626520E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.037 | TFLOPs: 24.51 | +7: iteration 2770/ 173500 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.614426E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.195 | TFLOPs: 25.61 | +7: iteration 2780/ 173500 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.612776E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.383 | TFLOPs: 24.25 | +7: iteration 2790/ 173500 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.605572E+00 | grad norm: 0.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.500 | TFLOPs: 24.74 | +7: iteration 2800/ 173500 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.619040E+00 | grad norm: 0.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.218 | TFLOPs: 25.77 | +7: iteration 2810/ 173500 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.607331E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.742 | TFLOPs: 25.21 | +7: iteration 2820/ 173500 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.613523E+00 | grad norm: 0.789 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.032 | TFLOPs: 25.11 | +7: iteration 2830/ 173500 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.597322E+00 | grad norm: 0.796 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.652 | TFLOPs: 25.49 | +7: iteration 2840/ 173500 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.610088E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.038 | TFLOPs: 25.09 | +7: iteration 2850/ 173500 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.598260E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.178 | TFLOPs: 25.39 | +7: iteration 2860/ 173500 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.586481E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.905 | TFLOPs: 24.76 | +7: iteration 2870/ 173500 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.584216E+00 | grad norm: 0.699 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.230 | TFLOPs: 25.83 | +7: iteration 2880/ 173500 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.595593E+00 | grad norm: 0.816 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.077 | TFLOPs: 26.11 | +7: iteration 2890/ 173500 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.598312E+00 | grad norm: 0.848 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.823 | TFLOPs: 25.04 | +7: iteration 2900/ 173500 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.596328E+00 | grad norm: 0.748 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.192 | TFLOPs: 25.69 | +7: iteration 2910/ 173500 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.587759E+00 | grad norm: 0.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.021 | TFLOPs: 25.11 | +7: iteration 2920/ 173500 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.589122E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.658 | TFLOPs: 25.62 | +7: iteration 2930/ 173500 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.570257E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.091 | TFLOPs: 24.91 | +7: iteration 2940/ 173500 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.583744E+00 | grad norm: 0.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.708 | TFLOPs: 25.68 | +7: iteration 2950/ 173500 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.581396E+00 | grad norm: 0.788 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.538 | TFLOPs: 24.94 | +7: iteration 2960/ 173500 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.585202E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.974 | TFLOPs: 25.62 | +7: iteration 2970/ 173500 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.562978E+00 | grad norm: 1.105 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.370 | TFLOPs: 25.25 | +7: iteration 2980/ 173500 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.578896E+00 | grad norm: 0.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.174 | TFLOPs: 25.13 | +7: iteration 2990/ 173500 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.576628E+00 | grad norm: 0.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.670 | TFLOPs: 25.48 | +7: iteration 3000/ 173500 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.574781E+00 | grad norm: 0.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.006 | TFLOPs: 25.88 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 4.449742E+00 | lm loss PPL: 8.560484E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_44m91b100m +0: [2023-03-17 00:23:11,833] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-17 00:23:11,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:23:11,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:23:11,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:23:11,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:23:11,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:23:11,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:23:11,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:23:11,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:23:11,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:23:11,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:23:11,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:23:11,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:23:11,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:23:11,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:23:11,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:23:11,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:23:11,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:23:11,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:23:11,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:23:11,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:23:11,962] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-17 00:23:11,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:23:11,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:23:11,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:23:11,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:11,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:11,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:11,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:11,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:11,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:11,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:11,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:11,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:12,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:12,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:12,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:12,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:12,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:12,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:12,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:12,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:12,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:12,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:23:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:23:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 00:23:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-17 00:23:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:12,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:23:12,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:23:12,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:23:12,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-17 00:23:12,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:23:12,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:23:12,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.86 +7: iteration 3010/ 173500 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.559288E+00 | grad norm: 0.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.051 | TFLOPs: 22.22 | +7: iteration 3020/ 173500 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.574483E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.478 | TFLOPs: 25.16 | +7: iteration 3030/ 173500 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.558094E+00 | grad norm: 0.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.231 | TFLOPs: 25.27 | +7: iteration 3040/ 173500 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.568433E+00 | grad norm: 0.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.239 | TFLOPs: 25.80 | +7: iteration 3050/ 173500 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.569829E+00 | grad norm: 0.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.698 | TFLOPs: 25.78 | +7: iteration 3060/ 173500 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.558492E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.849 | TFLOPs: 24.96 | +7: iteration 3070/ 173500 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.552855E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.147 | TFLOPs: 24.66 | +7: iteration 3080/ 173500 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.548888E+00 | grad norm: 0.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.686 | TFLOPs: 25.45 | +7: iteration 3090/ 173500 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.566299E+00 | grad norm: 0.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.219 | TFLOPs: 25.86 | +7: iteration 3100/ 173500 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.543801E+00 | grad norm: 0.668 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.464 | TFLOPs: 24.94 | +7: iteration 3110/ 173500 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.545027E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.422 | TFLOPs: 24.91 | +7: iteration 3120/ 173500 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.550958E+00 | grad norm: 0.693 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.121 | TFLOPs: 25.69 | +7: iteration 3130/ 173500 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.554726E+00 | grad norm: 0.947 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.698 | TFLOPs: 25.21 | +7: iteration 3140/ 173500 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.546129E+00 | grad norm: 0.699 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.548 | TFLOPs: 24.76 | +7: iteration 3150/ 173500 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.547466E+00 | grad norm: 1.119 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.363 | TFLOPs: 25.58 | +7: iteration 3160/ 173500 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.554324E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.767 | TFLOPs: 25.12 | +7: iteration 3170/ 173500 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.529126E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.237 | TFLOPs: 25.08 | +7: iteration 3180/ 173500 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.524634E+00 | grad norm: 0.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.828 | TFLOPs: 25.40 | +7: iteration 3190/ 173500 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.546825E+00 | grad norm: 0.631 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.614 | TFLOPs: 25.67 | +7: iteration 3200/ 173500 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.515718E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.040 | TFLOPs: 25.20 | +7: iteration 3210/ 173500 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.534904E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.661 | TFLOPs: 25.18 | +7: iteration 3220/ 173500 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.527067E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.988 | TFLOPs: 25.37 | +7: iteration 3230/ 173500 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.530087E+00 | grad norm: 0.863 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.616 | TFLOPs: 26.22 | +7: iteration 3240/ 173500 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.516633E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.409 | TFLOPs: 25.57 | +7: iteration 3250/ 173500 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.511203E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.527 | TFLOPs: 24.65 | +7: iteration 3260/ 173500 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.515337E+00 | grad norm: 0.764 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.345 | TFLOPs: 25.58 | +7: iteration 3270/ 173500 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.499605E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.156 | TFLOPs: 26.16 | +7: iteration 3280/ 173500 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.500462E+00 | grad norm: 0.810 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.611 | TFLOPs: 24.52 | +7: iteration 3290/ 173500 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.511705E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.941 | TFLOPs: 24.12 | +7: iteration 3300/ 173500 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.513361E+00 | grad norm: 0.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.087 | TFLOPs: 25.16 | +7: iteration 3310/ 173500 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.497777E+00 | grad norm: 0.892 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.101 | TFLOPs: 23.71 | +7: iteration 3320/ 173500 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.506983E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.546 | TFLOPs: 25.04 | +7: iteration 3330/ 173500 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.502088E+00 | grad norm: 0.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.220 | TFLOPs: 23.61 | +7: iteration 3340/ 173500 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.506924E+00 | grad norm: 0.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.964 | TFLOPs: 25.94 | +7: iteration 3350/ 173500 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.489664E+00 | grad norm: 0.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.358 | TFLOPs: 25.46 | +7: iteration 3360/ 173500 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.489287E+00 | grad norm: 0.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.114 | TFLOPs: 25.53 | +7: iteration 3370/ 173500 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.483924E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.243 | TFLOPs: 25.30 | +7: iteration 3380/ 173500 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.499845E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.008 | TFLOPs: 25.12 | +7: iteration 3390/ 173500 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.478980E+00 | grad norm: 0.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.710 | TFLOPs: 23.79 | +7: iteration 3400/ 173500 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.485636E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.834 | TFLOPs: 25.48 | +7: iteration 3410/ 173500 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.501182E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.568 | TFLOPs: 25.71 | +7: iteration 3420/ 173500 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.474710E+00 | grad norm: 0.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.156 | TFLOPs: 25.13 | +7: iteration 3430/ 173500 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.471246E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.791 | TFLOPs: 24.56 | +7: iteration 3440/ 173500 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.475667E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.767 | TFLOPs: 24.52 | +7: iteration 3450/ 173500 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.477241E+00 | grad norm: 0.826 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.547 | TFLOPs: 25.37 | +7: iteration 3460/ 173500 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.478164E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.454 | TFLOPs: 25.30 | +7: iteration 3470/ 173500 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.470402E+00 | grad norm: 0.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.660 | TFLOPs: 25.35 | +7: iteration 3480/ 173500 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.17 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.477440E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.432 | TFLOPs: 24.31 | +7: iteration 3490/ 173500 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.461523E+00 | grad norm: 0.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.363 | TFLOPs: 25.25 | +7: iteration 3500/ 173500 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.472287E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.159 | TFLOPs: 24.64 | +7: iteration 3510/ 173500 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.453561E+00 | grad norm: 0.736 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.457 | TFLOPs: 24.83 | +7: iteration 3520/ 173500 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.450618E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.752 | TFLOPs: 24.95 | +7: iteration 3530/ 173500 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.471062E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.701 | TFLOPs: 25.18 | +7: iteration 3540/ 173500 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.463107E+00 | grad norm: 0.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.513 | TFLOPs: 25.10 | +7: iteration 3550/ 173500 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 4.453714E+00 | grad norm: 0.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.853 | TFLOPs: 24.73 | +7: iteration 3560/ 173500 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.439310E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.407 | TFLOPs: 24.77 | +7: iteration 3570/ 173500 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.454097E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.999 | TFLOPs: 24.95 | +7: iteration 3580/ 173500 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.453012E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.733 | TFLOPs: 24.92 | +7: iteration 3590/ 173500 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.457038E+00 | grad norm: 0.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.634 | TFLOPs: 25.37 | +7: iteration 3600/ 173500 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.452314E+00 | grad norm: 0.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.252 | TFLOPs: 24.81 | +7: iteration 3610/ 173500 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.449196E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.002 | TFLOPs: 24.12 | +7: iteration 3620/ 173500 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.423378E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.520 | TFLOPs: 24.49 | +7: iteration 3630/ 173500 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.460197E+00 | grad norm: 0.812 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.768 | TFLOPs: 24.51 | +7: iteration 3640/ 173500 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.436347E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.632 | TFLOPs: 25.70 | +7: iteration 3650/ 173500 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.451391E+00 | grad norm: 0.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.008 | TFLOPs: 24.73 | +7: iteration 3660/ 173500 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.440328E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.275 | TFLOPs: 24.58 | +7: iteration 3670/ 173500 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.433778E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.707 | TFLOPs: 24.98 | +7: iteration 3680/ 173500 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.434666E+00 | grad norm: 0.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.795 | TFLOPs: 24.98 | +7: iteration 3690/ 173500 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.437239E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.682 | TFLOPs: 24.60 | +7: iteration 3700/ 173500 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.433929E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.420 | TFLOPs: 24.91 | +7: iteration 3710/ 173500 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.432214E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1484.584 | TFLOPs: 23.28 | +7: iteration 3720/ 173500 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.427516E+00 | grad norm: 0.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.280 | TFLOPs: 24.69 | +7: iteration 3730/ 173500 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.425873E+00 | grad norm: 0.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.158 | TFLOPs: 25.50 | +7: iteration 3740/ 173500 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.429244E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.102 | TFLOPs: 24.98 | +7: iteration 3750/ 173500 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.423787E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.652 | TFLOPs: 24.49 | +7: iteration 3760/ 173500 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.431163E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.691 | TFLOPs: 25.95 | +7: iteration 3770/ 173500 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.412357E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.387 | TFLOPs: 25.15 | +7: iteration 3780/ 173500 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.399196E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.107 | TFLOPs: 25.75 | +7: iteration 3790/ 173500 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.422489E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.914 | TFLOPs: 24.81 | +7: iteration 3800/ 173500 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.416340E+00 | grad norm: 0.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.150 | TFLOPs: 25.52 | +7: iteration 3810/ 173500 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.421909E+00 | grad norm: 0.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.428 | TFLOPs: 25.48 | +7: iteration 3820/ 173500 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.419423E+00 | grad norm: 0.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.664 | TFLOPs: 24.90 | +7: iteration 3830/ 173500 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.419925E+00 | grad norm: 0.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.425 | TFLOPs: 25.02 | +7: iteration 3840/ 173500 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.421931E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.885 | TFLOPs: 24.40 | +7: iteration 3850/ 173500 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.402337E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.649 | TFLOPs: 25.45 | +7: iteration 3860/ 173500 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.425761E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.738 | TFLOPs: 25.37 | +7: iteration 3870/ 173500 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.394204E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.394 | TFLOPs: 24.64 | +7: iteration 3880/ 173500 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.397775E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.600 | TFLOPs: 25.70 | +7: iteration 3890/ 173500 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.399419E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.285 | TFLOPs: 24.56 | +7: iteration 3900/ 173500 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.407261E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.103 | TFLOPs: 24.75 | +7: iteration 3910/ 173500 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.404645E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.742 | TFLOPs: 26.04 | +7: iteration 3920/ 173500 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.393909E+00 | grad norm: 0.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.894 | TFLOPs: 24.71 | +7: iteration 3930/ 173500 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.393887E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.934 | TFLOPs: 25.40 | +7: iteration 3940/ 173500 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.392518E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.087 | TFLOPs: 25.56 | +7: iteration 3950/ 173500 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.373233E+00 | grad norm: 0.705 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.330 | TFLOPs: 25.16 | +7: iteration 3960/ 173500 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.406936E+00 | grad norm: 0.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.120 | TFLOPs: 24.31 | +7: iteration 3970/ 173500 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.392916E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.442 | TFLOPs: 24.53 | +7: iteration 3980/ 173500 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.389248E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.052 | TFLOPs: 24.28 | +7: iteration 3990/ 173500 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.392878E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.543 | TFLOPs: 23.50 | +0: [2023-03-17 00:25:52,409] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00019992278300259638, 0.00019992278300259638, 0.00019992278300259638], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 173500 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.380869E+00 | grad norm: 0.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.846 | TFLOPs: 25.36 | +0: steps: 4000 loss: 4.3783 iter time (s): 0.159 samples/sec: 1607.068 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 4.335633E+00 | lm loss PPL: 7.637327E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_44m91b100m +0: [2023-03-17 00:25:52,497] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-17 00:25:52,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:25:52,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:25:52,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:25:52,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:25:52,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:25:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:25:52,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:25:52,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:25:52,583] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:25:52,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:25:52,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:25:52,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:25:52,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:25:52,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:25:52,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:25:52,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:25:52,616] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:25:52,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:25:52,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:25:52,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:25:52,625] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-17 00:25:52,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:25:52,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:25:52,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:25:52,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 00:25:52,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:25:52,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:25:52,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 172.93 +7: iteration 4010/ 173500 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.18 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.381528E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.493 | TFLOPs: 22.03 | +7: iteration 4020/ 173500 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.386327E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.776 | TFLOPs: 25.46 | +7: iteration 4030/ 173500 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.385611E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.652 | TFLOPs: 24.74 | +7: iteration 4040/ 173500 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.373473E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.739 | TFLOPs: 25.32 | +7: iteration 4050/ 173500 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.377306E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.398 | TFLOPs: 25.21 | +7: iteration 4060/ 173500 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.384121E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.358 | TFLOPs: 25.30 | +7: iteration 4070/ 173500 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.383882E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.879 | TFLOPs: 25.06 | +7: iteration 4080/ 173500 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.380449E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.758 | TFLOPs: 25.57 | +7: iteration 4090/ 173500 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.379881E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.757 | TFLOPs: 24.99 | +7: iteration 4100/ 173500 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.381342E+00 | grad norm: 0.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.640 | TFLOPs: 24.88 | +7: iteration 4110/ 173500 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.372736E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.129 | TFLOPs: 25.42 | +7: iteration 4120/ 173500 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.374327E+00 | grad norm: 0.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.103 | TFLOPs: 24.62 | +7: iteration 4130/ 173500 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.376340E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.891 | TFLOPs: 24.90 | +7: iteration 4140/ 173500 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.366676E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.090 | TFLOPs: 24.73 | +7: iteration 4150/ 173500 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.366803E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.606 | TFLOPs: 24.71 | +7: iteration 4160/ 173500 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.364198E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.994 | TFLOPs: 24.06 | +7: iteration 4170/ 173500 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.362348E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.878 | TFLOPs: 26.16 | +7: iteration 4180/ 173500 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.366741E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.887 | TFLOPs: 25.44 | +7: iteration 4190/ 173500 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.367485E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.429 | TFLOPs: 25.84 | +7: iteration 4200/ 173500 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.366034E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.206 | TFLOPs: 23.95 | +7: iteration 4210/ 173500 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.363504E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.765 | TFLOPs: 24.24 | +7: iteration 4220/ 173500 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.367249E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.925 | TFLOPs: 25.39 | +7: iteration 4230/ 173500 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.360087E+00 | grad norm: 0.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.415 | TFLOPs: 24.72 | +7: iteration 4240/ 173500 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.368428E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.414 | TFLOPs: 25.11 | +7: iteration 4250/ 173500 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.362431E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.579 | TFLOPs: 25.87 | +7: iteration 4260/ 173500 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.346257E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.583 | TFLOPs: 24.46 | +7: iteration 4270/ 173500 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.353331E+00 | grad norm: 0.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.877 | TFLOPs: 25.17 | +7: iteration 4280/ 173500 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.363067E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.293 | TFLOPs: 25.72 | +7: iteration 4290/ 173500 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.355667E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.493 | TFLOPs: 26.20 | +7: iteration 4300/ 173500 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.361214E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.172 | TFLOPs: 24.70 | +7: iteration 4310/ 173500 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.349464E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.754 | TFLOPs: 26.20 | +7: iteration 4320/ 173500 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.349311E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.478 | TFLOPs: 24.66 | +7: iteration 4330/ 173500 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.344604E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.465 | TFLOPs: 25.21 | +7: iteration 4340/ 173500 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.354456E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.459 | TFLOPs: 25.84 | +7: iteration 4350/ 173500 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.349483E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.880 | TFLOPs: 25.04 | +7: iteration 4360/ 173500 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.343843E+00 | grad norm: 0.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.055 | TFLOPs: 25.80 | +7: iteration 4370/ 173500 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.340123E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.876 | TFLOPs: 24.68 | +7: iteration 4380/ 173500 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.359809E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.808 | TFLOPs: 24.78 | +7: iteration 4390/ 173500 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.333568E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.195 | TFLOPs: 25.75 | +7: iteration 4400/ 173500 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.353349E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.759 | TFLOPs: 24.29 | +7: iteration 4410/ 173500 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.330303E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.236 | TFLOPs: 24.50 | +7: iteration 4420/ 173500 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.326251E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.630 | TFLOPs: 25.21 | +7: iteration 4430/ 173500 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.335383E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.761 | TFLOPs: 25.46 | +7: iteration 4440/ 173500 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.324037E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.108 | TFLOPs: 25.27 | +7: iteration 4450/ 173500 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.342316E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.185 | TFLOPs: 24.81 | +7: iteration 4460/ 173500 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.333865E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.428 | TFLOPs: 24.25 | +7: iteration 4470/ 173500 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.328500E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.727 | TFLOPs: 25.34 | +7: iteration 4480/ 173500 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.331286E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.533 | TFLOPs: 25.27 | +7: iteration 4490/ 173500 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.335938E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.259 | TFLOPs: 24.72 | +7: iteration 4500/ 173500 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.326401E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.640 | TFLOPs: 25.20 | +7: iteration 4510/ 173500 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.331065E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.931 | TFLOPs: 25.39 | +7: iteration 4520/ 173500 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.335466E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.612 | TFLOPs: 24.90 | +7: iteration 4530/ 173500 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.315636E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.321 | TFLOPs: 24.42 | +7: iteration 4540/ 173500 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.310617E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.378 | TFLOPs: 25.13 | +7: iteration 4550/ 173500 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.333107E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.362 | TFLOPs: 25.43 | +7: iteration 4560/ 173500 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.329517E+00 | grad norm: 0.768 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.472 | TFLOPs: 25.49 | +7: iteration 4570/ 173500 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.324150E+00 | grad norm: 0.670 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.788 | TFLOPs: 24.95 | +7: iteration 4580/ 173500 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.308208E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.360 | TFLOPs: 25.18 | +7: iteration 4590/ 173500 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.327376E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.562 | TFLOPs: 24.19 | +7: iteration 4600/ 173500 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.321074E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.779 | TFLOPs: 23.99 | +7: iteration 4610/ 173500 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.321867E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.395 | TFLOPs: 25.87 | +7: iteration 4620/ 173500 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.309787E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.476 | TFLOPs: 25.41 | +7: iteration 4630/ 173500 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.309703E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.243 | TFLOPs: 25.46 | +7: iteration 4640/ 173500 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.321563E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.835 | TFLOPs: 25.94 | +7: iteration 4650/ 173500 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.321190E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.987 | TFLOPs: 24.03 | +7: iteration 4660/ 173500 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.314673E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.517 | TFLOPs: 25.43 | +7: iteration 4670/ 173500 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.312021E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.736 | TFLOPs: 25.81 | +7: iteration 4680/ 173500 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.313437E+00 | grad norm: 0.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.577 | TFLOPs: 26.17 | +7: iteration 4690/ 173500 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.314578E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.317 | TFLOPs: 25.72 | +7: iteration 4700/ 173500 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.312560E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.660 | TFLOPs: 24.57 | +7: iteration 4710/ 173500 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.314428E+00 | grad norm: 0.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.861 | TFLOPs: 24.95 | +7: iteration 4720/ 173500 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.308198E+00 | grad norm: 0.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.369 | TFLOPs: 25.65 | +7: iteration 4730/ 173500 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.301672E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.628 | TFLOPs: 25.23 | +7: iteration 4740/ 173500 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.305745E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.752 | TFLOPs: 25.48 | +7: iteration 4750/ 173500 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.17 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.299391E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.152 | TFLOPs: 24.33 | +7: iteration 4760/ 173500 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.299906E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.688 | TFLOPs: 24.46 | +7: iteration 4770/ 173500 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.301470E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.288 | TFLOPs: 25.03 | +7: iteration 4780/ 173500 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.308909E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.952 | TFLOPs: 26.11 | +7: iteration 4790/ 173500 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.321732E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.541 | TFLOPs: 25.21 | +7: iteration 4800/ 173500 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.293468E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.666 | TFLOPs: 25.32 | +7: iteration 4810/ 173500 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.303852E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.766 | TFLOPs: 25.46 | +7: iteration 4820/ 173500 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.304420E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.286 | TFLOPs: 24.55 | +7: iteration 4830/ 173500 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.308056E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.173 | TFLOPs: 24.91 | +7: iteration 4840/ 173500 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.303201E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.163 | TFLOPs: 24.62 | +7: iteration 4850/ 173500 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.303963E+00 | grad norm: 0.589 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.900 | TFLOPs: 24.76 | +7: iteration 4860/ 173500 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.297636E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.117 | TFLOPs: 24.86 | +7: iteration 4870/ 173500 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.296331E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.660 | TFLOPs: 25.54 | +7: iteration 4880/ 173500 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.15 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.290200E+00 | grad norm: 0.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.720 | TFLOPs: 25.90 | +7: iteration 4890/ 173500 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.16 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.288386E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.926 | TFLOPs: 25.25 | +7: iteration 4900/ 173500 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.284217E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.162 | TFLOPs: 25.85 | +7: iteration 4910/ 173500 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.285054E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.913 | TFLOPs: 25.59 | +7: iteration 4920/ 173500 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.286542E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.972 | TFLOPs: 24.87 | +7: iteration 4930/ 173500 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.291820E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.870 | TFLOPs: 25.22 | +7: iteration 4940/ 173500 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.283338E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.481 | TFLOPs: 25.27 | +7: iteration 4950/ 173500 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.274138E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.928 | TFLOPs: 25.45 | +7: iteration 4960/ 173500 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.267413E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.727 | TFLOPs: 24.74 | +7: iteration 4970/ 173500 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.289582E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.021 | TFLOPs: 25.59 | +7: iteration 4980/ 173500 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.284187E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.372 | TFLOPs: 26.05 | +7: iteration 4990/ 173500 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.278764E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.466 | TFLOPs: 25.91 | +7: iteration 5000/ 173500 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.290896E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.965 | TFLOPs: 25.44 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 4.224629E+00 | lm loss PPL: 6.834917E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_44m91b100m +0: [2023-03-17 00:28:32,320] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-17 00:28:32,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:28:32,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:28:32,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:28:32,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:28:32,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:28:32,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:28:32,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:28:32,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:28:32,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:28:32,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:28:32,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:28:32,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:28:32,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:28:32,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:28:32,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:28:32,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:28:32,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:28:32,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:28:32,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:28:32,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:28:32,449] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-17 00:28:32,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:28:32,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:28:32,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,481] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,483] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,483] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,484] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,485] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:28:32,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-17 00:28:32,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:28:32,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:28:32,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.10 +7: iteration 5010/ 173500 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.18 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.282945E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.103 | TFLOPs: 22.07 | +7: iteration 5020/ 173500 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.270976E+00 | grad norm: 0.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.294 | TFLOPs: 25.36 | +7: iteration 5030/ 173500 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.271939E+00 | grad norm: 0.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.249 | TFLOPs: 23.73 | +7: iteration 5040/ 173500 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.268835E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.186 | TFLOPs: 25.94 | +7: iteration 5050/ 173500 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.282562E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.547 | TFLOPs: 25.65 | +7: iteration 5060/ 173500 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.279058E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.846 | TFLOPs: 25.69 | +7: iteration 5070/ 173500 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.271614E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.045 | TFLOPs: 25.22 | +7: iteration 5080/ 173500 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.273038E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.166 | TFLOPs: 24.25 | +7: iteration 5090/ 173500 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.277202E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.399 | TFLOPs: 25.68 | +7: iteration 5100/ 173500 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.266716E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.719 | TFLOPs: 24.76 | +7: iteration 5110/ 173500 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.277773E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.061 | TFLOPs: 23.90 | +7: iteration 5120/ 173500 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.262894E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.508 | TFLOPs: 24.68 | +7: iteration 5130/ 173500 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.271663E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.475 | TFLOPs: 23.81 | +7: iteration 5140/ 173500 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.276266E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.205 | TFLOPs: 25.49 | +7: iteration 5150/ 173500 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.266846E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.765 | TFLOPs: 25.78 | +7: iteration 5160/ 173500 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.263109E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.548 | TFLOPs: 24.79 | +7: iteration 5170/ 173500 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.259918E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.662 | TFLOPs: 23.91 | +7: iteration 5180/ 173500 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.272869E+00 | grad norm: 0.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.587 | TFLOPs: 25.73 | +7: iteration 5190/ 173500 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.268277E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.520 | TFLOPs: 25.51 | +7: iteration 5200/ 173500 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.271320E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.711 | TFLOPs: 24.74 | +7: iteration 5210/ 173500 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.266899E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.126 | TFLOPs: 26.13 | +7: iteration 5220/ 173500 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.248439E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.261 | TFLOPs: 24.09 | +7: iteration 5230/ 173500 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.249125E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.989 | TFLOPs: 23.81 | +7: iteration 5240/ 173500 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.273280E+00 | grad norm: 0.582 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.462 | TFLOPs: 25.38 | +7: iteration 5250/ 173500 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.259441E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.475 | TFLOPs: 25.22 | +7: iteration 5260/ 173500 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.256974E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.193 | TFLOPs: 25.49 | +7: iteration 5270/ 173500 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.263890E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.222 | TFLOPs: 26.15 | +7: iteration 5280/ 173500 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.239389E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.773 | TFLOPs: 25.31 | +7: iteration 5290/ 173500 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.249954E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.339 | TFLOPs: 26.05 | +7: iteration 5300/ 173500 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.254392E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.525 | TFLOPs: 24.76 | +7: iteration 5310/ 173500 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.250150E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.099 | TFLOPs: 25.28 | +7: iteration 5320/ 173500 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.255304E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.060 | TFLOPs: 25.74 | +7: iteration 5330/ 173500 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.252963E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.678 | TFLOPs: 24.19 | +7: iteration 5340/ 173500 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.248698E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.947 | TFLOPs: 25.99 | +7: iteration 5350/ 173500 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.246369E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.614 | TFLOPs: 25.21 | +7: iteration 5360/ 173500 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.252633E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.526 | TFLOPs: 24.80 | +7: iteration 5370/ 173500 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.240697E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.092 | TFLOPs: 26.14 | +7: iteration 5380/ 173500 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.253864E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.393 | TFLOPs: 25.88 | +7: iteration 5390/ 173500 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.244581E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.810 | TFLOPs: 25.72 | +7: iteration 5400/ 173500 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.247735E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.389 | TFLOPs: 25.60 | +7: iteration 5410/ 173500 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.252432E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.608 | TFLOPs: 24.57 | +7: iteration 5420/ 173500 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.249079E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.079 | TFLOPs: 25.09 | +7: iteration 5430/ 173500 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.239896E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.968 | TFLOPs: 25.01 | +7: iteration 5440/ 173500 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.252482E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.539 | TFLOPs: 24.46 | +7: iteration 5450/ 173500 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.246865E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.582 | TFLOPs: 25.79 | +7: iteration 5460/ 173500 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.240828E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.442 | TFLOPs: 25.37 | +7: iteration 5470/ 173500 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.252991E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.861 | TFLOPs: 25.84 | +7: iteration 5480/ 173500 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.248114E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.040 | TFLOPs: 25.34 | +7: iteration 5490/ 173500 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.231872E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.374 | TFLOPs: 24.30 | +7: iteration 5500/ 173500 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.224513E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.960 | TFLOPs: 25.78 | +7: iteration 5510/ 173500 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.18 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.241156E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1444.140 | TFLOPs: 22.65 | +7: iteration 5520/ 173500 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.246474E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.006 | TFLOPs: 24.50 | +7: iteration 5530/ 173500 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.238325E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.688 | TFLOPs: 26.23 | +7: iteration 5540/ 173500 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.234059E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.251 | TFLOPs: 25.25 | +7: iteration 5550/ 173500 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.235087E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.812 | TFLOPs: 25.62 | +7: iteration 5560/ 173500 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.230919E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.290 | TFLOPs: 25.69 | +7: iteration 5570/ 173500 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.234488E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.957 | TFLOPs: 25.83 | +7: iteration 5580/ 173500 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.235519E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.800 | TFLOPs: 25.37 | +7: iteration 5590/ 173500 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.218289E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.467 | TFLOPs: 25.95 | +7: iteration 5600/ 173500 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.234829E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.925 | TFLOPs: 25.20 | +7: iteration 5610/ 173500 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.230844E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.606 | TFLOPs: 25.87 | +7: iteration 5620/ 173500 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.219764E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.100 | TFLOPs: 25.23 | +7: iteration 5630/ 173500 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.235776E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.977 | TFLOPs: 24.97 | +7: iteration 5640/ 173500 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.223318E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.464 | TFLOPs: 24.36 | +7: iteration 5650/ 173500 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.219100E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.025 | TFLOPs: 24.31 | +7: iteration 5660/ 173500 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.224673E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.754 | TFLOPs: 24.49 | +7: iteration 5670/ 173500 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.223928E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.532 | TFLOPs: 24.97 | +7: iteration 5680/ 173500 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.207486E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.021 | TFLOPs: 26.19 | +7: iteration 5690/ 173500 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.243430E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.150 | TFLOPs: 26.11 | +7: iteration 5700/ 173500 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.17 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.240891E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.975 | TFLOPs: 24.28 | +7: iteration 5710/ 173500 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.218859E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.941 | TFLOPs: 24.35 | +7: iteration 5720/ 173500 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.202140E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.082 | TFLOPs: 25.74 | +7: iteration 5730/ 173500 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.220676E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.996 | TFLOPs: 25.72 | +7: iteration 5740/ 173500 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.221947E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.963 | TFLOPs: 25.50 | +7: iteration 5750/ 173500 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.216116E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.992 | TFLOPs: 25.91 | +7: iteration 5760/ 173500 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.220772E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.775 | TFLOPs: 25.40 | +7: iteration 5770/ 173500 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.224982E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.599 | TFLOPs: 25.70 | +7: iteration 5780/ 173500 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.228254E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.401 | TFLOPs: 25.76 | +7: iteration 5790/ 173500 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.216599E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.035 | TFLOPs: 24.67 | +7: iteration 5800/ 173500 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.15 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.207811E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.611 | TFLOPs: 25.93 | +7: iteration 5810/ 173500 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.16 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.212132E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.514 | TFLOPs: 25.70 | +7: iteration 5820/ 173500 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.206187E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.803 | TFLOPs: 25.32 | +7: iteration 5830/ 173500 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.217830E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.754 | TFLOPs: 25.31 | +7: iteration 5840/ 173500 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.217570E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.259 | TFLOPs: 25.08 | +7: iteration 5850/ 173500 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.204027E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.046 | TFLOPs: 25.66 | +7: iteration 5860/ 173500 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.198805E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.148 | TFLOPs: 25.58 | +7: iteration 5870/ 173500 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.218866E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.196 | TFLOPs: 25.66 | +7: iteration 5880/ 173500 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.206989E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.088 | TFLOPs: 26.07 | +7: iteration 5890/ 173500 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.217931E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.603 | TFLOPs: 25.92 | +7: iteration 5900/ 173500 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.213926E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.561 | TFLOPs: 25.04 | +7: iteration 5910/ 173500 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.211504E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.169 | TFLOPs: 24.61 | +7: iteration 5920/ 173500 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.205340E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.430 | TFLOPs: 26.15 | +7: iteration 5930/ 173500 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.215207E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.847 | TFLOPs: 24.24 | +7: iteration 5940/ 173500 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.204767E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.885 | TFLOPs: 25.61 | +7: iteration 5950/ 173500 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.201903E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.905 | TFLOPs: 24.82 | +7: iteration 5960/ 173500 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.215334E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.274 | TFLOPs: 25.05 | +7: iteration 5970/ 173500 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.205336E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.224 | TFLOPs: 24.94 | +7: iteration 5980/ 173500 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.197027E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.905 | TFLOPs: 25.72 | +7: iteration 5990/ 173500 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.198197E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.858 | TFLOPs: 24.78 | +0: [2023-03-17 00:31:11,783] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.0001997263111243839, 0.0001997263111243839, 0.0001997263111243839], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 173500 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.204546E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.350 | TFLOPs: 25.00 | +0: steps: 6000 loss: 4.1805 iter time (s): 0.158 samples/sec: 1620.475 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 6000 | lm loss value: 4.207329E+00 | lm loss PPL: 6.717689E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 6000 to checkpoints_44m91b100m +0: [2023-03-17 00:31:11,856] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +0: [2023-03-17 00:31:11,859] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:31:11,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:31:11,921] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:31:11,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:31:11,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:31:11,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:31:11,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:31:11,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:31:11,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:31:11,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:31:11,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:31:11,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:31:11,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:31:11,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:31:11,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:31:11,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:31:11,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:31:11,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:31:11,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:31:11,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:31:11,989] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step6000/mp_rank_00_model_states.pt +0: [2023-03-17 00:31:11,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:31:11,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:31:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:31:12,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:31:12,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:31:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-17 00:31:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:31:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:31:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-17 00:31:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-17 00:31:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:31:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-17 00:31:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:31:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:31:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: successfully saved checkpoint at iteration 6000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.74 +7: iteration 6010/ 173500 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.18 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.197656E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.199 | TFLOPs: 22.02 | +7: iteration 6020/ 173500 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.204381E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.608 | TFLOPs: 24.91 | +7: iteration 6030/ 173500 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.209627E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.498 | TFLOPs: 25.30 | +7: iteration 6040/ 173500 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.191143E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.664 | TFLOPs: 26.00 | +7: iteration 6050/ 173500 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.199451E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.874 | TFLOPs: 24.70 | +7: iteration 6060/ 173500 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.210909E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.799 | TFLOPs: 25.39 | +7: iteration 6070/ 173500 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.193548E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.889 | TFLOPs: 25.29 | +7: iteration 6080/ 173500 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.186865E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.382 | TFLOPs: 25.22 | +7: iteration 6090/ 173500 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.192217E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.035 | TFLOPs: 25.39 | +7: iteration 6100/ 173500 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.206948E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.104 | TFLOPs: 24.58 | +7: iteration 6110/ 173500 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.200509E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.590 | TFLOPs: 25.56 | +7: iteration 6120/ 173500 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.181708E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.794 | TFLOPs: 25.50 | +7: iteration 6130/ 173500 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.192551E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.210 | TFLOPs: 25.60 | +7: iteration 6140/ 173500 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.195842E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.625 | TFLOPs: 25.54 | +7: iteration 6150/ 173500 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.195464E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.283 | TFLOPs: 24.70 | +7: iteration 6160/ 173500 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.177602E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.197 | TFLOPs: 25.50 | +7: iteration 6170/ 173500 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.181726E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.381 | TFLOPs: 24.63 | +7: iteration 6180/ 173500 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.181691E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.768 | TFLOPs: 25.81 | +7: iteration 6190/ 173500 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.201942E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.747 | TFLOPs: 25.95 | +7: iteration 6200/ 173500 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.195903E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.630 | TFLOPs: 24.63 | +7: iteration 6210/ 173500 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.194557E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.085 | TFLOPs: 23.46 | +7: iteration 6220/ 173500 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.188677E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.160 | TFLOPs: 24.53 | +7: iteration 6230/ 173500 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.191722E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.132 | TFLOPs: 25.27 | +7: iteration 6240/ 173500 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.197756E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.923 | TFLOPs: 25.23 | +7: iteration 6250/ 173500 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.185996E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.729 | TFLOPs: 24.08 | +7: iteration 6260/ 173500 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.176160E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.861 | TFLOPs: 25.09 | +7: iteration 6270/ 173500 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.185300E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.780 | TFLOPs: 25.18 | +7: iteration 6280/ 173500 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.178143E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.934 | TFLOPs: 25.14 | +7: iteration 6290/ 173500 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.17 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.180275E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.350 | TFLOPs: 23.65 | +7: iteration 6300/ 173500 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.183401E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.001 | TFLOPs: 25.39 | +7: iteration 6310/ 173500 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.179509E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.721 | TFLOPs: 25.07 | +7: iteration 6320/ 173500 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.172010E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.906 | TFLOPs: 25.62 | +7: iteration 6330/ 173500 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.193325E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.352 | TFLOPs: 25.65 | +7: iteration 6340/ 173500 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.182902E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.236 | TFLOPs: 26.01 | +7: iteration 6350/ 173500 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.173001E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.969 | TFLOPs: 25.30 | +7: iteration 6360/ 173500 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.169214E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.267 | TFLOPs: 25.82 | +7: iteration 6370/ 173500 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.179702E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.709 | TFLOPs: 25.97 | +7: iteration 6380/ 173500 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.177697E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.490 | TFLOPs: 25.57 | +7: iteration 6390/ 173500 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.183350E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.218 | TFLOPs: 25.55 | +7: iteration 6400/ 173500 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.168888E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.248 | TFLOPs: 25.79 | +7: iteration 6410/ 173500 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.174657E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.493 | TFLOPs: 25.51 | +7: iteration 6420/ 173500 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.174553E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.902 | TFLOPs: 26.00 | +7: iteration 6430/ 173500 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.174520E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.724 | TFLOPs: 25.50 | +7: iteration 6440/ 173500 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.162162E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.031 | TFLOPs: 25.74 | +7: iteration 6450/ 173500 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.170250E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.065 | TFLOPs: 26.14 | +7: iteration 6460/ 173500 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.170258E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.349 | TFLOPs: 26.13 | +7: iteration 6470/ 173500 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.170491E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.348 | TFLOPs: 26.18 | +7: iteration 6480/ 173500 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.170921E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.434 | TFLOPs: 25.08 | +7: iteration 6490/ 173500 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.174822E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.082 | TFLOPs: 24.42 | +7: iteration 6500/ 173500 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.186666E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.732 | TFLOPs: 24.48 | +7: iteration 6510/ 173500 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.167257E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.594 | TFLOPs: 26.18 | +7: iteration 6520/ 173500 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.174200E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.036 | TFLOPs: 25.69 | +7: iteration 6530/ 173500 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.169791E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.412 | TFLOPs: 24.78 | +7: iteration 6540/ 173500 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.16 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.157108E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.002 | TFLOPs: 25.14 | +7: iteration 6550/ 173500 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.15 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.165111E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.579 | TFLOPs: 26.01 | +7: iteration 6560/ 173500 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.181104E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.922 | TFLOPs: 25.53 | +7: iteration 6570/ 173500 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.173013E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.996 | TFLOPs: 26.05 | +7: iteration 6580/ 173500 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.165636E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.956 | TFLOPs: 25.00 | +7: iteration 6590/ 173500 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.159825E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.251 | TFLOPs: 25.61 | +7: iteration 6600/ 173500 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.177201E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.364 | TFLOPs: 25.46 | +7: iteration 6610/ 173500 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.163793E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.798 | TFLOPs: 24.98 | +7: iteration 6620/ 173500 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.154842E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.003 | TFLOPs: 25.58 | +7: iteration 6630/ 173500 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.155853E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.472 | TFLOPs: 23.66 | +7: iteration 6640/ 173500 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.158175E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.057 | TFLOPs: 25.23 | +7: iteration 6650/ 173500 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.160380E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.953 | TFLOPs: 25.70 | +7: iteration 6660/ 173500 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.160684E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.521 | TFLOPs: 25.35 | +7: iteration 6670/ 173500 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.153251E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.420 | TFLOPs: 25.18 | +7: iteration 6680/ 173500 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.166673E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.228 | TFLOPs: 25.71 | +7: iteration 6690/ 173500 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.148121E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.216 | TFLOPs: 25.75 | +7: iteration 6700/ 173500 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.168638E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.186 | TFLOPs: 26.08 | +7: iteration 6710/ 173500 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.157385E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.270 | TFLOPs: 26.13 | +7: iteration 6720/ 173500 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.171005E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.647 | TFLOPs: 25.75 | +7: iteration 6730/ 173500 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.154615E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.362 | TFLOPs: 25.79 | +7: iteration 6740/ 173500 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.159964E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.149 | TFLOPs: 25.24 | +7: iteration 6750/ 173500 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.158389E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.899 | TFLOPs: 26.11 | +7: iteration 6760/ 173500 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.162095E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.336 | TFLOPs: 26.09 | +7: iteration 6770/ 173500 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.149393E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.729 | TFLOPs: 25.54 | +7: iteration 6780/ 173500 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.152138E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.857 | TFLOPs: 24.40 | +7: iteration 6790/ 173500 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.147429E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.362 | TFLOPs: 25.07 | +7: iteration 6800/ 173500 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.138330E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.789 | TFLOPs: 25.09 | +7: iteration 6810/ 173500 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.145474E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.593 | TFLOPs: 25.38 | +7: iteration 6820/ 173500 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.156860E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.747 | TFLOPs: 25.67 | +7: iteration 6830/ 173500 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.142283E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.288 | TFLOPs: 24.83 | +7: iteration 6840/ 173500 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.159032E+00 | grad norm: 0.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.208 | TFLOPs: 24.70 | +7: iteration 6850/ 173500 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.152053E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.727 | TFLOPs: 25.04 | +7: iteration 6860/ 173500 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.157275E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.862 | TFLOPs: 25.80 | +7: iteration 6870/ 173500 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.144216E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.442 | TFLOPs: 24.64 | +7: iteration 6880/ 173500 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.149038E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.615 | TFLOPs: 24.82 | +7: iteration 6890/ 173500 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.146796E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.874 | TFLOPs: 26.16 | +7: iteration 6900/ 173500 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.145135E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.530 | TFLOPs: 25.93 | +7: iteration 6910/ 173500 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.152338E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.224 | TFLOPs: 25.14 | +7: iteration 6920/ 173500 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.144002E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.646 | TFLOPs: 25.73 | +7: iteration 6930/ 173500 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.153529E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.229 | TFLOPs: 25.17 | +7: iteration 6940/ 173500 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.140177E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.751 | TFLOPs: 25.62 | +7: iteration 6950/ 173500 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.154092E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.745 | TFLOPs: 25.48 | +7: iteration 6960/ 173500 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.137805E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.735 | TFLOPs: 25.06 | +7: iteration 6970/ 173500 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.139873E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.213 | TFLOPs: 25.30 | +7: iteration 6980/ 173500 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.143737E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.633 | TFLOPs: 26.18 | +7: iteration 6990/ 173500 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.141148E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.259 | TFLOPs: 25.38 | +7: iteration 7000/ 173500 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.135421E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.209 | TFLOPs: 26.22 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 7000 | lm loss value: 4.160453E+00 | lm loss PPL: 6.410054E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 7000 to checkpoints_44m91b100m +0: [2023-03-17 00:33:50,456] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! +0: [2023-03-17 00:33:50,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:33:50,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:33:50,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:33:50,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:33:50,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:33:50,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:33:50,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:33:50,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:33:50,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:33:50,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:33:50,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:33:50,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:33:50,562] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:33:50,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:33:50,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:33:50,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:33:50,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:33:50,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:33:50,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:33:50,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:33:50,587] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step7000/mp_rank_00_model_states.pt +0: [2023-03-17 00:33:50,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:33:50,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:50,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:50,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:50,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-17 00:33:50,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:50,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-17 00:33:50,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:50,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:50,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: successfully saved checkpoint at iteration 7000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.47 +7: iteration 7010/ 173500 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.18 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.142170E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.695 | TFLOPs: 22.14 | +7: iteration 7020/ 173500 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.141006E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.969 | TFLOPs: 26.25 | +7: iteration 7030/ 173500 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.141689E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.472 | TFLOPs: 25.85 | +7: iteration 7040/ 173500 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.143501E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.920 | TFLOPs: 25.91 | +7: iteration 7050/ 173500 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.143511E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.702 | TFLOPs: 25.35 | +7: iteration 7060/ 173500 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.144346E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.912 | TFLOPs: 26.22 | +7: iteration 7070/ 173500 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.141082E+00 | grad norm: 0.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.184 | TFLOPs: 25.74 | +7: iteration 7080/ 173500 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.148219E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.963 | TFLOPs: 25.37 | +7: iteration 7090/ 173500 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.133859E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.520 | TFLOPs: 25.49 | +7: iteration 7100/ 173500 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.138752E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.611 | TFLOPs: 25.54 | +7: iteration 7110/ 173500 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.135563E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.475 | TFLOPs: 25.98 | +7: iteration 7120/ 173500 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.15 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.135390E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.692 | TFLOPs: 26.04 | +7: iteration 7130/ 173500 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.143462E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.912 | TFLOPs: 25.81 | +7: iteration 7140/ 173500 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.148819E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.015 | TFLOPs: 25.81 | +7: iteration 7150/ 173500 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.141470E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.979 | TFLOPs: 25.56 | +7: iteration 7160/ 173500 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.127739E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.320 | TFLOPs: 25.57 | +7: iteration 7170/ 173500 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.129594E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.787 | TFLOPs: 25.64 | +7: iteration 7180/ 173500 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.124209E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.046 | TFLOPs: 25.01 | +7: iteration 7190/ 173500 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.17 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.123524E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.826 | TFLOPs: 24.15 | +7: iteration 7200/ 173500 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.16 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.129566E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.256 | TFLOPs: 25.54 | +7: iteration 7210/ 173500 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.127258E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.259 | TFLOPs: 25.22 | +7: iteration 7220/ 173500 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.126166E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.836 | TFLOPs: 25.83 | +7: iteration 7230/ 173500 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.148362E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.165 | TFLOPs: 24.81 | +7: iteration 7240/ 173500 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.17 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.142841E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.497 | TFLOPs: 24.02 | +7: iteration 7250/ 173500 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.134986E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.075 | TFLOPs: 24.42 | +7: iteration 7260/ 173500 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.139474E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.467 | TFLOPs: 24.35 | +7: iteration 7270/ 173500 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.125486E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.492 | TFLOPs: 24.94 | +7: iteration 7280/ 173500 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.131799E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.110 | TFLOPs: 24.72 | +7: iteration 7290/ 173500 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.131528E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.395 | TFLOPs: 25.11 | +7: iteration 7300/ 173500 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.123848E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.920 | TFLOPs: 26.16 | +7: iteration 7310/ 173500 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.110411E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.040 | TFLOPs: 26.14 | +7: iteration 7320/ 173500 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.119957E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.745 | TFLOPs: 25.53 | +7: iteration 7330/ 173500 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.130890E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.889 | TFLOPs: 26.16 | +7: iteration 7340/ 173500 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.131838E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.697 | TFLOPs: 26.14 | +7: iteration 7350/ 173500 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.116882E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.282 | TFLOPs: 25.65 | +7: iteration 7360/ 173500 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117639E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.511 | TFLOPs: 25.29 | +7: iteration 7370/ 173500 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.122882E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.743 | TFLOPs: 25.06 | +7: iteration 7380/ 173500 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.125943E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.940 | TFLOPs: 26.19 | +7: iteration 7390/ 173500 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.121154E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.583 | TFLOPs: 24.99 | +7: iteration 7400/ 173500 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117463E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.038 | TFLOPs: 25.96 | +7: iteration 7410/ 173500 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.114456E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.402 | TFLOPs: 26.09 | +7: iteration 7420/ 173500 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.132624E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.234 | TFLOPs: 25.10 | +7: iteration 7430/ 173500 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.121435E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.803 | TFLOPs: 25.59 | +7: iteration 7440/ 173500 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117576E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.014 | TFLOPs: 25.86 | +7: iteration 7450/ 173500 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.127491E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.077 | TFLOPs: 24.98 | +7: iteration 7460/ 173500 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.125328E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.051 | TFLOPs: 25.66 | +7: iteration 7470/ 173500 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117977E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.368 | TFLOPs: 25.91 | +7: iteration 7480/ 173500 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.119227E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.452 | TFLOPs: 25.66 | +7: iteration 7490/ 173500 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.116081E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.361 | TFLOPs: 24.94 | +7: iteration 7500/ 173500 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.112680E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.222 | TFLOPs: 24.72 | +7: iteration 7510/ 173500 | consumed samples: 1922560 | consumed tokens: 3937402880 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.114639E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.539 | TFLOPs: 26.14 | +7: iteration 7520/ 173500 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.126924E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.717 | TFLOPs: 25.20 | +7: iteration 7530/ 173500 | consumed samples: 1927680 | consumed tokens: 3947888640 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.114542E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.243 | TFLOPs: 26.08 | +7: iteration 7540/ 173500 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.113008E+00 | grad norm: 0.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.230 | TFLOPs: 26.05 | +7: iteration 7550/ 173500 | consumed samples: 1932800 | consumed tokens: 3958374400 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.110423E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.594 | TFLOPs: 25.02 | +7: iteration 7560/ 173500 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.115033E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.212 | TFLOPs: 25.66 | +7: iteration 7570/ 173500 | consumed samples: 1937920 | consumed tokens: 3968860160 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.128533E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.999 | TFLOPs: 24.34 | +7: iteration 7580/ 173500 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.122705E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.421 | TFLOPs: 26.12 | +7: iteration 7590/ 173500 | consumed samples: 1943040 | consumed tokens: 3979345920 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.101934E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.392 | TFLOPs: 24.86 | +7: iteration 7600/ 173500 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.119917E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.174 | TFLOPs: 26.05 | +7: iteration 7610/ 173500 | consumed samples: 1948160 | consumed tokens: 3989831680 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117211E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.804 | TFLOPs: 25.81 | +7: iteration 7620/ 173500 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.106017E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.650 | TFLOPs: 26.00 | +7: iteration 7630/ 173500 | consumed samples: 1953280 | consumed tokens: 4000317440 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.112021E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.240 | TFLOPs: 25.52 | +7: iteration 7640/ 173500 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.116360E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.810 | TFLOPs: 25.62 | +7: iteration 7650/ 173500 | consumed samples: 1958400 | consumed tokens: 4010803200 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.107391E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.476 | TFLOPs: 26.23 | +7: iteration 7660/ 173500 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.109236E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.653 | TFLOPs: 25.54 | +7: iteration 7670/ 173500 | consumed samples: 1963520 | consumed tokens: 4021288960 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117902E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.240 | TFLOPs: 25.80 | +7: iteration 7680/ 173500 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.114262E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.493 | TFLOPs: 26.21 | +7: iteration 7690/ 173500 | consumed samples: 1968640 | consumed tokens: 4031774720 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.107854E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.455 | TFLOPs: 26.02 | +7: iteration 7700/ 173500 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.105905E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.013 | TFLOPs: 26.17 | +7: iteration 7710/ 173500 | consumed samples: 1973760 | consumed tokens: 4042260480 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.113330E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.127 | TFLOPs: 25.42 | +7: iteration 7720/ 173500 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.110467E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.668 | TFLOPs: 25.62 | +7: iteration 7730/ 173500 | consumed samples: 1978880 | consumed tokens: 4052746240 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.099286E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.906 | TFLOPs: 25.76 | +7: iteration 7740/ 173500 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.117876E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.742 | TFLOPs: 25.14 | +7: iteration 7750/ 173500 | consumed samples: 1984000 | consumed tokens: 4063232000 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.109034E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.908 | TFLOPs: 24.62 | +7: iteration 7760/ 173500 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.101604E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.251 | TFLOPs: 25.74 | +7: iteration 7770/ 173500 | consumed samples: 1989120 | consumed tokens: 4073717760 | elapsed time per iteration (s): 0.15 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.107760E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.698 | TFLOPs: 25.97 | +7: iteration 7780/ 173500 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 0.16 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.106759E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.917 | TFLOPs: 25.70 | +7: iteration 7790/ 173500 | consumed samples: 1994240 | consumed tokens: 4084203520 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.113665E+00 | grad norm: 0.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.476 | TFLOPs: 26.17 | +7: iteration 7800/ 173500 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.094176E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.597 | TFLOPs: 25.92 | +7: iteration 7810/ 173500 | consumed samples: 1999360 | consumed tokens: 4094689280 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.091994E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.299 | TFLOPs: 25.71 | +7: iteration 7820/ 173500 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.116782E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.959 | TFLOPs: 25.48 | +7: iteration 7830/ 173500 | consumed samples: 2004480 | consumed tokens: 4105175040 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.111873E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.076 | TFLOPs: 24.94 | +7: iteration 7840/ 173500 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.099700E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.560 | TFLOPs: 24.33 | +7: iteration 7850/ 173500 | consumed samples: 2009600 | consumed tokens: 4115660800 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.097180E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.313 | TFLOPs: 26.18 | +7: iteration 7860/ 173500 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.099936E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.593 | TFLOPs: 26.17 | +7: iteration 7870/ 173500 | consumed samples: 2014720 | consumed tokens: 4126146560 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.104296E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.507 | TFLOPs: 24.52 | +7: iteration 7880/ 173500 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.103627E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.013 | TFLOPs: 25.48 | +7: iteration 7890/ 173500 | consumed samples: 2019840 | consumed tokens: 4136632320 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.100726E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.221 | TFLOPs: 26.02 | +7: iteration 7900/ 173500 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.097411E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.531 | TFLOPs: 25.48 | +7: iteration 7910/ 173500 | consumed samples: 2024960 | consumed tokens: 4147118080 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.096540E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.720 | TFLOPs: 25.61 | +7: iteration 7920/ 173500 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.113538E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.868 | TFLOPs: 25.47 | +7: iteration 7930/ 173500 | consumed samples: 2030080 | consumed tokens: 4157603840 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.098544E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.856 | TFLOPs: 25.78 | +7: iteration 7940/ 173500 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.110955E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.064 | TFLOPs: 25.50 | +7: iteration 7950/ 173500 | consumed samples: 2035200 | consumed tokens: 4168089600 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.102860E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.409 | TFLOPs: 26.10 | +7: iteration 7960/ 173500 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.092974E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.829 | TFLOPs: 25.56 | +7: iteration 7970/ 173500 | consumed samples: 2040320 | consumed tokens: 4178575360 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.088621E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.667 | TFLOPs: 25.71 | +7: iteration 7980/ 173500 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.096944E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.028 | TFLOPs: 25.55 | +7: iteration 7990/ 173500 | consumed samples: 2045440 | consumed tokens: 4189061120 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.087174E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.934 | TFLOPs: 25.36 | +0: [2023-03-17 00:36:27,875] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00019940979012929202, 0.00019940979012929202, 0.00019940979012929202], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 8000/ 173500 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.096262E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.673 | TFLOPs: 25.53 | +0: steps: 8000 loss: 4.1188 iter time (s): 0.156 samples/sec: 1638.387 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 8000 | lm loss value: 4.129919E+00 | lm loss PPL: 6.217286E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 8000 to checkpoints_44m91b100m +0: [2023-03-17 00:36:27,950] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is begin to save! +0: [2023-03-17 00:36:27,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:36:28,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:36:28,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:36:28,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:36:28,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:36:28,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:36:28,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:36:28,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:36:28,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:36:28,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:36:28,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:36:28,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:36:28,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:36:28,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:36:28,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:36:28,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:36:28,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:36:28,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:36:28,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:36:28,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:36:28,079] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step8000/mp_rank_00_model_states.pt +0: [2023-03-17 00:36:28,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:36:28,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:28,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:28,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:28,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:28,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-17 00:36:28,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:28,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:28,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: successfully saved checkpoint at iteration 8000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 172.71 +7: iteration 8010/ 173500 | consumed samples: 2050560 | consumed tokens: 4199546880 | elapsed time per iteration (s): 0.18 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.104758E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.863 | TFLOPs: 22.46 | +7: iteration 8020/ 173500 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.092039E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.483 | TFLOPs: 25.82 | +7: iteration 8030/ 173500 | consumed samples: 2055680 | consumed tokens: 4210032640 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.099465E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.468 | TFLOPs: 25.49 | +7: iteration 8040/ 173500 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.090158E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.985 | TFLOPs: 25.12 | +7: iteration 8050/ 173500 | consumed samples: 2060800 | consumed tokens: 4220518400 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.098856E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.618 | TFLOPs: 25.65 | +7: iteration 8060/ 173500 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.087963E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.651 | TFLOPs: 25.87 | +7: iteration 8070/ 173500 | consumed samples: 2065920 | consumed tokens: 4231004160 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.094295E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.075 | TFLOPs: 25.89 | +7: iteration 8080/ 173500 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.090576E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.788 | TFLOPs: 25.68 | +7: iteration 8090/ 173500 | consumed samples: 2071040 | consumed tokens: 4241489920 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.083211E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.624 | TFLOPs: 26.07 | +7: iteration 8100/ 173500 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.098610E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.699 | TFLOPs: 26.19 | +7: iteration 8110/ 173500 | consumed samples: 2076160 | consumed tokens: 4251975680 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.094547E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.398 | TFLOPs: 25.57 | +7: iteration 8120/ 173500 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.091361E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.982 | TFLOPs: 25.81 | +7: iteration 8130/ 173500 | consumed samples: 2081280 | consumed tokens: 4262461440 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.079279E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.306 | TFLOPs: 25.79 | +7: iteration 8140/ 173500 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.095522E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.838 | TFLOPs: 25.75 | +7: iteration 8150/ 173500 | consumed samples: 2086400 | consumed tokens: 4272947200 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.088921E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.317 | TFLOPs: 26.13 | +7: iteration 8160/ 173500 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.090751E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.333 | TFLOPs: 25.74 | +7: iteration 8170/ 173500 | consumed samples: 2091520 | consumed tokens: 4283432960 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.090153E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.044 | TFLOPs: 26.08 | +7: iteration 8180/ 173500 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.080426E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.000 | TFLOPs: 25.81 | +7: iteration 8190/ 173500 | consumed samples: 2096640 | consumed tokens: 4293918720 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.083642E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.551 | TFLOPs: 26.18 | +7: iteration 8200/ 173500 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.079383E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.647 | TFLOPs: 25.93 | +7: iteration 8210/ 173500 | consumed samples: 2101760 | consumed tokens: 4304404480 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.083269E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.942 | TFLOPs: 25.95 | +7: iteration 8220/ 173500 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.099202E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.705 | TFLOPs: 26.03 | +7: iteration 8230/ 173500 | consumed samples: 2106880 | consumed tokens: 4314890240 | elapsed time per iteration (s): 0.15 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.077742E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.203 | TFLOPs: 26.19 | +7: iteration 8240/ 173500 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.082212E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.227 | TFLOPs: 25.79 | +7: iteration 8250/ 173500 | consumed samples: 2112000 | consumed tokens: 4325376000 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.091632E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.944 | TFLOPs: 25.09 | +7: iteration 8260/ 173500 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.091272E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.174 | TFLOPs: 25.33 | +7: iteration 8270/ 173500 | consumed samples: 2117120 | consumed tokens: 4335861760 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.085390E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.958 | TFLOPs: 25.86 | +7: iteration 8280/ 173500 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.079582E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.299 | TFLOPs: 25.63 | +7: iteration 8290/ 173500 | consumed samples: 2122240 | consumed tokens: 4346347520 | elapsed time per iteration (s): 0.17 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.089085E+00 | grad norm: 0.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.814 | TFLOPs: 24.20 | +7: iteration 8300/ 173500 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.089854E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.276 | TFLOPs: 24.91 | +7: iteration 8310/ 173500 | consumed samples: 2127360 | consumed tokens: 4356833280 | elapsed time per iteration (s): 0.16 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.080276E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.213 | TFLOPs: 25.33 | +7: iteration 8320/ 173500 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.092165E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.096 | TFLOPs: 26.21 | +7: iteration 8330/ 173500 | consumed samples: 2132480 | consumed tokens: 4367319040 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.083077E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.770 | TFLOPs: 26.23 | +7: iteration 8340/ 173500 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.088510E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.181 | TFLOPs: 24.97 | +7: iteration 8350/ 173500 | consumed samples: 2137600 | consumed tokens: 4377804800 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.073430E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.068 | TFLOPs: 26.25 | +7: iteration 8360/ 173500 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.078502E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.706 | TFLOPs: 25.87 | +7: iteration 8370/ 173500 | consumed samples: 2142720 | consumed tokens: 4388290560 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.084919E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.017 | TFLOPs: 25.01 | +7: iteration 8380/ 173500 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.092158E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.433 | TFLOPs: 25.77 | +7: iteration 8390/ 173500 | consumed samples: 2147840 | consumed tokens: 4398776320 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.071537E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.856 | TFLOPs: 26.16 | +7: iteration 8400/ 173500 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.072785E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.562 | TFLOPs: 25.78 | +7: iteration 8410/ 173500 | consumed samples: 2152960 | consumed tokens: 4409262080 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.076147E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.242 | TFLOPs: 25.57 | +7: iteration 8420/ 173500 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.079434E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.482 | TFLOPs: 25.82 | +7: iteration 8430/ 173500 | consumed samples: 2158080 | consumed tokens: 4419747840 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.073061E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.673 | TFLOPs: 26.25 | +7: iteration 8440/ 173500 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.087445E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.823 | TFLOPs: 26.27 | +7: iteration 8450/ 173500 | consumed samples: 2163200 | consumed tokens: 4430233600 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.065827E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.138 | TFLOPs: 24.73 | +7: iteration 8460/ 173500 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.078890E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.251 | TFLOPs: 25.97 | +7: iteration 8470/ 173500 | consumed samples: 2168320 | consumed tokens: 4440719360 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.057988E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.894 | TFLOPs: 24.35 | +7: iteration 8480/ 173500 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 0.17 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.081421E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.469 | TFLOPs: 23.72 | +7: iteration 8490/ 173500 | consumed samples: 2173440 | consumed tokens: 4451205120 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.065660E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.432 | TFLOPs: 26.17 | +7: iteration 8500/ 173500 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.075177E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.617 | TFLOPs: 26.00 | +7: iteration 8510/ 173500 | consumed samples: 2178560 | consumed tokens: 4461690880 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.076474E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.055 | TFLOPs: 26.02 | +7: iteration 8520/ 173500 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.077406E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.757 | TFLOPs: 25.94 | +7: iteration 8530/ 173500 | consumed samples: 2183680 | consumed tokens: 4472176640 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.077154E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.189 | TFLOPs: 24.50 | +7: iteration 8540/ 173500 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 0.17 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.087217E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.295 | TFLOPs: 23.17 | +7: iteration 8550/ 173500 | consumed samples: 2188800 | consumed tokens: 4482662400 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.091423E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.722 | TFLOPs: 26.17 | +7: iteration 8560/ 173500 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 0.17 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.071427E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.137 | TFLOPs: 23.29 | +7: iteration 8570/ 173500 | consumed samples: 2193920 | consumed tokens: 4493148160 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.070252E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.943 | TFLOPs: 24.65 | +7: iteration 8580/ 173500 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.077481E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.495 | TFLOPs: 25.63 | +7: iteration 8590/ 173500 | consumed samples: 2199040 | consumed tokens: 4503633920 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.073838E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.277 | TFLOPs: 26.18 | +7: iteration 8600/ 173500 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.066792E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.417 | TFLOPs: 25.37 | +7: iteration 8610/ 173500 | consumed samples: 2204160 | consumed tokens: 4514119680 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.074838E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.104 | TFLOPs: 25.34 | +7: iteration 8620/ 173500 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.073875E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.797 | TFLOPs: 25.12 | +7: iteration 8630/ 173500 | consumed samples: 2209280 | consumed tokens: 4524605440 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.070885E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.104 | TFLOPs: 25.66 | +7: iteration 8640/ 173500 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.065390E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.803 | TFLOPs: 24.62 | +7: iteration 8650/ 173500 | consumed samples: 2214400 | consumed tokens: 4535091200 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.084276E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.239 | TFLOPs: 25.97 | +7: iteration 8660/ 173500 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.060088E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.549 | TFLOPs: 25.34 | +7: iteration 8670/ 173500 | consumed samples: 2219520 | consumed tokens: 4545576960 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.063709E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.885 | TFLOPs: 25.69 | +7: iteration 8680/ 173500 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.071241E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.855 | TFLOPs: 25.36 | +7: iteration 8690/ 173500 | consumed samples: 2224640 | consumed tokens: 4556062720 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.074462E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.965 | TFLOPs: 25.34 | +7: iteration 8700/ 173500 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.068884E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.917 | TFLOPs: 26.19 | +7: iteration 8710/ 173500 | consumed samples: 2229760 | consumed tokens: 4566548480 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.063952E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.824 | TFLOPs: 26.20 | +7: iteration 8720/ 173500 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.066431E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.582 | TFLOPs: 25.57 | +7: iteration 8730/ 173500 | consumed samples: 2234880 | consumed tokens: 4577034240 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.066296E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.362 | TFLOPs: 26.13 | +7: iteration 8740/ 173500 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.053278E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.404 | TFLOPs: 26.12 | +7: iteration 8750/ 173500 | consumed samples: 2240000 | consumed tokens: 4587520000 | elapsed time per iteration (s): 0.15 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.063055E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.421 | TFLOPs: 26.02 | +7: iteration 8760/ 173500 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.076325E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.994 | TFLOPs: 25.83 | +7: iteration 8770/ 173500 | consumed samples: 2245120 | consumed tokens: 4598005760 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.078349E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.695 | TFLOPs: 25.79 | +7: iteration 8780/ 173500 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.065242E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.451 | TFLOPs: 25.82 | +7: iteration 8790/ 173500 | consumed samples: 2250240 | consumed tokens: 4608491520 | elapsed time per iteration (s): 0.16 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.066074E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.613 | TFLOPs: 25.79 | +7: iteration 8800/ 173500 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.066947E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.057 | TFLOPs: 26.17 | +7: iteration 8810/ 173500 | consumed samples: 2255360 | consumed tokens: 4618977280 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.051939E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.494 | TFLOPs: 25.63 | +7: iteration 8820/ 173500 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.069700E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.737 | TFLOPs: 26.09 | +7: iteration 8830/ 173500 | consumed samples: 2260480 | consumed tokens: 4629463040 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.062411E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.336 | TFLOPs: 24.94 | +7: iteration 8840/ 173500 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.063836E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.605 | TFLOPs: 25.82 | +7: iteration 8850/ 173500 | consumed samples: 2265600 | consumed tokens: 4639948800 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.057393E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.123 | TFLOPs: 25.74 | +7: iteration 8860/ 173500 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.062631E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.914 | TFLOPs: 25.58 | +7: iteration 8870/ 173500 | consumed samples: 2270720 | consumed tokens: 4650434560 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.056741E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.004 | TFLOPs: 25.48 | +7: iteration 8880/ 173500 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.065710E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.937 | TFLOPs: 26.19 | +7: iteration 8890/ 173500 | consumed samples: 2275840 | consumed tokens: 4660920320 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.057049E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.717 | TFLOPs: 26.22 | +7: iteration 8900/ 173500 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.055849E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.858 | TFLOPs: 25.87 | +7: iteration 8910/ 173500 | consumed samples: 2280960 | consumed tokens: 4671406080 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.065371E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.410 | TFLOPs: 25.37 | +7: iteration 8920/ 173500 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.045064E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.378 | TFLOPs: 25.84 | +7: iteration 8930/ 173500 | consumed samples: 2286080 | consumed tokens: 4681891840 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.059965E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.469 | TFLOPs: 25.37 | +7: iteration 8940/ 173500 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.063033E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.886 | TFLOPs: 26.11 | +7: iteration 8950/ 173500 | consumed samples: 2291200 | consumed tokens: 4692377600 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.061212E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.460 | TFLOPs: 26.12 | +7: iteration 8960/ 173500 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.051363E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.076 | TFLOPs: 26.10 | +7: iteration 8970/ 173500 | consumed samples: 2296320 | consumed tokens: 4702863360 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.058509E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.438 | TFLOPs: 26.01 | +7: iteration 8980/ 173500 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.050720E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.803 | TFLOPs: 25.62 | +7: iteration 8990/ 173500 | consumed samples: 2301440 | consumed tokens: 4713349120 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.065630E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.131 | TFLOPs: 26.07 | +7: iteration 9000/ 173500 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.065751E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.852 | TFLOPs: 25.09 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 9000 | lm loss value: 4.059625E+00 | lm loss PPL: 5.795256E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 9000 to checkpoints_44m91b100m +0: [2023-03-17 00:39:04,834] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step9000 is begin to save! +0: [2023-03-17 00:39:04,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:39:04,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:39:04,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:39:04,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:39:04,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:39:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:39:04,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:39:04,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:39:04,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:39:04,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:39:04,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:39:04,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:39:04,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:39:04,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:39:04,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:39:04,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:39:04,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:39:04,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:39:04,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:39:04,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:39:04,964] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step9000/mp_rank_00_model_states.pt +0: [2023-03-17 00:39:04,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:39:04,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:04,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:04,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:04,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:04,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:04,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:04,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:04,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:04,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:04,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:04,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:04,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:04,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:04,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:04,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:04,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:04,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:04,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:04,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:05,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:05,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:05,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:05,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:05,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:05,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:05,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:39:05,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:05,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:39:05,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:05,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:05,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:39:05,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:05,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:05,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: successfully saved checkpoint at iteration 9000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.28 +7: iteration 9010/ 173500 | consumed samples: 2306560 | consumed tokens: 4723834880 | elapsed time per iteration (s): 0.18 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.054015E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1425.686 | TFLOPs: 22.36 | +7: iteration 9020/ 173500 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.051555E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.828 | TFLOPs: 26.06 | +7: iteration 9030/ 173500 | consumed samples: 2311680 | consumed tokens: 4734320640 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.062487E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.242 | TFLOPs: 26.08 | +7: iteration 9040/ 173500 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.050466E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.651 | TFLOPs: 25.15 | +7: iteration 9050/ 173500 | consumed samples: 2316800 | consumed tokens: 4744806400 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.057455E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.061 | TFLOPs: 25.16 | +7: iteration 9060/ 173500 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.047888E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.006 | TFLOPs: 26.21 | +7: iteration 9070/ 173500 | consumed samples: 2321920 | consumed tokens: 4755292160 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.056249E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.907 | TFLOPs: 26.19 | +7: iteration 9080/ 173500 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.045085E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.416 | TFLOPs: 26.15 | +7: iteration 9090/ 173500 | consumed samples: 2327040 | consumed tokens: 4765777920 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.048824E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.325 | TFLOPs: 26.18 | +7: iteration 9100/ 173500 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.042942E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.546 | TFLOPs: 25.46 | +7: iteration 9110/ 173500 | consumed samples: 2332160 | consumed tokens: 4776263680 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.060016E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.873 | TFLOPs: 26.06 | +7: iteration 9120/ 173500 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.046591E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.291 | TFLOPs: 26.13 | +7: iteration 9130/ 173500 | consumed samples: 2337280 | consumed tokens: 4786749440 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.054393E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.476 | TFLOPs: 25.73 | +7: iteration 9140/ 173500 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.054392E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.014 | TFLOPs: 25.89 | +7: iteration 9150/ 173500 | consumed samples: 2342400 | consumed tokens: 4797235200 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.050629E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.309 | TFLOPs: 25.74 | +7: iteration 9160/ 173500 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 0.17 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.045770E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.020 | TFLOPs: 24.14 | +7: iteration 9170/ 173500 | consumed samples: 2347520 | consumed tokens: 4807720960 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.038316E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.976 | TFLOPs: 25.58 | +7: iteration 9180/ 173500 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.059047E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.387 | TFLOPs: 26.23 | +7: iteration 9190/ 173500 | consumed samples: 2352640 | consumed tokens: 4818206720 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.063804E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.653 | TFLOPs: 25.53 | +7: iteration 9200/ 173500 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.047123E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.898 | TFLOPs: 25.11 | +7: iteration 9210/ 173500 | consumed samples: 2357760 | consumed tokens: 4828692480 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.045999E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.279 | TFLOPs: 25.93 | +7: iteration 9220/ 173500 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.051450E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.847 | TFLOPs: 25.43 | +7: iteration 9230/ 173500 | consumed samples: 2362880 | consumed tokens: 4839178240 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.050459E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.107 | TFLOPs: 26.05 | +7: iteration 9240/ 173500 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 0.16 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.055399E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.397 | TFLOPs: 25.51 | +7: iteration 9250/ 173500 | consumed samples: 2368000 | consumed tokens: 4849664000 | elapsed time per iteration (s): 0.15 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.043390E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.555 | TFLOPs: 26.17 | +7: iteration 9260/ 173500 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.056421E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.775 | TFLOPs: 25.90 | +7: iteration 9270/ 173500 | consumed samples: 2373120 | consumed tokens: 4860149760 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.038895E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.417 | TFLOPs: 26.12 | +7: iteration 9280/ 173500 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.047911E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.552 | TFLOPs: 25.27 | +7: iteration 9290/ 173500 | consumed samples: 2378240 | consumed tokens: 4870635520 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.045914E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.353 | TFLOPs: 26.18 | +7: iteration 9300/ 173500 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.046833E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.968 | TFLOPs: 25.14 | +7: iteration 9310/ 173500 | consumed samples: 2383360 | consumed tokens: 4881121280 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.028357E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.462 | TFLOPs: 25.71 | +7: iteration 9320/ 173500 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.044702E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.982 | TFLOPs: 25.31 | +7: iteration 9330/ 173500 | consumed samples: 2388480 | consumed tokens: 4891607040 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.035133E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.597 | TFLOPs: 25.49 | +7: iteration 9340/ 173500 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.052219E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.793 | TFLOPs: 26.14 | +7: iteration 9350/ 173500 | consumed samples: 2393600 | consumed tokens: 4902092800 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.046534E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.742 | TFLOPs: 25.48 | +7: iteration 9360/ 173500 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.032357E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.944 | TFLOPs: 25.66 | +7: iteration 9370/ 173500 | consumed samples: 2398720 | consumed tokens: 4912578560 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.047499E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.239 | TFLOPs: 26.15 | +7: iteration 9380/ 173500 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.047711E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.618 | TFLOPs: 25.65 | +7: iteration 9390/ 173500 | consumed samples: 2403840 | consumed tokens: 4923064320 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.045996E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.078 | TFLOPs: 25.66 | +7: iteration 9400/ 173500 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.042570E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.002 | TFLOPs: 25.75 | +7: iteration 9410/ 173500 | consumed samples: 2408960 | consumed tokens: 4933550080 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.035374E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.476 | TFLOPs: 26.23 | +7: iteration 9420/ 173500 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.023681E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.076 | TFLOPs: 25.39 | +7: iteration 9430/ 173500 | consumed samples: 2414080 | consumed tokens: 4944035840 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.037334E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.262 | TFLOPs: 25.93 | +7: iteration 9440/ 173500 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.042884E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.825 | TFLOPs: 26.22 | +7: iteration 9450/ 173500 | consumed samples: 2419200 | consumed tokens: 4954521600 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.047966E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.011 | TFLOPs: 25.30 | +7: iteration 9460/ 173500 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.044806E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.917 | TFLOPs: 25.22 | +7: iteration 9470/ 173500 | consumed samples: 2424320 | consumed tokens: 4965007360 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.032748E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.497 | TFLOPs: 25.81 | +7: iteration 9480/ 173500 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.043330E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.889 | TFLOPs: 26.24 | +7: iteration 9490/ 173500 | consumed samples: 2429440 | consumed tokens: 4975493120 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.039399E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.194 | TFLOPs: 26.19 | +7: iteration 9500/ 173500 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.035065E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.496 | TFLOPs: 26.21 | +7: iteration 9510/ 173500 | consumed samples: 2434560 | consumed tokens: 4985978880 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.034578E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.428 | TFLOPs: 24.96 | +7: iteration 9520/ 173500 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.022524E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.767 | TFLOPs: 26.20 | +7: iteration 9530/ 173500 | consumed samples: 2439680 | consumed tokens: 4996464640 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.039855E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.520 | TFLOPs: 26.18 | +7: iteration 9540/ 173500 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.041454E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.306 | TFLOPs: 26.18 | +7: iteration 9550/ 173500 | consumed samples: 2444800 | consumed tokens: 5006950400 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.033220E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.983 | TFLOPs: 26.19 | +7: iteration 9560/ 173500 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.034972E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.756 | TFLOPs: 26.14 | +7: iteration 9570/ 173500 | consumed samples: 2449920 | consumed tokens: 5017436160 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.032964E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.938 | TFLOPs: 26.19 | +7: iteration 9580/ 173500 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.035958E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.610 | TFLOPs: 25.95 | +7: iteration 9590/ 173500 | consumed samples: 2455040 | consumed tokens: 5027921920 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.028293E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.013 | TFLOPs: 26.21 | +7: iteration 9600/ 173500 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.039628E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.457 | TFLOPs: 25.52 | +7: iteration 9610/ 173500 | consumed samples: 2460160 | consumed tokens: 5038407680 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.051394E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.216 | TFLOPs: 26.19 | +7: iteration 9620/ 173500 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.043828E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.650 | TFLOPs: 25.70 | +7: iteration 9630/ 173500 | consumed samples: 2465280 | consumed tokens: 5048893440 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.038260E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.597 | TFLOPs: 26.09 | +7: iteration 9640/ 173500 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.039127E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.303 | TFLOPs: 25.66 | +7: iteration 9650/ 173500 | consumed samples: 2470400 | consumed tokens: 5059379200 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.026516E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.895 | TFLOPs: 25.75 | +7: iteration 9660/ 173500 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.040713E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.067 | TFLOPs: 26.16 | +7: iteration 9670/ 173500 | consumed samples: 2475520 | consumed tokens: 5069864960 | elapsed time per iteration (s): 0.16 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.040652E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.640 | TFLOPs: 25.29 | +7: iteration 9680/ 173500 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 0.15 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.046846E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.511 | TFLOPs: 26.09 | +7: iteration 9690/ 173500 | consumed samples: 2480640 | consumed tokens: 5080350720 | elapsed time per iteration (s): 0.17 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.032700E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.209 | TFLOPs: 24.25 | +7: iteration 9700/ 173500 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.016833E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.727 | TFLOPs: 25.45 | +7: iteration 9710/ 173500 | consumed samples: 2485760 | consumed tokens: 5090836480 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.017292E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.276 | TFLOPs: 25.75 | +7: iteration 9720/ 173500 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.033384E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.278 | TFLOPs: 26.08 | +7: iteration 9730/ 173500 | consumed samples: 2490880 | consumed tokens: 5101322240 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.027638E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.308 | TFLOPs: 25.11 | +7: iteration 9740/ 173500 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.037184E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.866 | TFLOPs: 26.17 | +7: iteration 9750/ 173500 | consumed samples: 2496000 | consumed tokens: 5111808000 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.021878E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.251 | TFLOPs: 25.80 | +7: iteration 9760/ 173500 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.031741E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.342 | TFLOPs: 26.18 | +7: iteration 9770/ 173500 | consumed samples: 2501120 | consumed tokens: 5122293760 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.034863E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.792 | TFLOPs: 25.47 | +7: iteration 9780/ 173500 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.042675E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.120 | TFLOPs: 26.21 | +7: iteration 9790/ 173500 | consumed samples: 2506240 | consumed tokens: 5132779520 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.022278E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.992 | TFLOPs: 26.22 | +7: iteration 9800/ 173500 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.037146E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.671 | TFLOPs: 25.97 | +7: iteration 9810/ 173500 | consumed samples: 2511360 | consumed tokens: 5143265280 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.028557E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.839 | TFLOPs: 25.18 | +7: iteration 9820/ 173500 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.026704E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.697 | TFLOPs: 26.22 | +7: iteration 9830/ 173500 | consumed samples: 2516480 | consumed tokens: 5153751040 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.034686E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.117 | TFLOPs: 25.97 | +7: iteration 9840/ 173500 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.028238E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.960 | TFLOPs: 25.75 | +7: iteration 9850/ 173500 | consumed samples: 2521600 | consumed tokens: 5164236800 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.024986E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.625 | TFLOPs: 25.29 | +7: iteration 9860/ 173500 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.026665E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.399 | TFLOPs: 26.24 | +7: iteration 9870/ 173500 | consumed samples: 2526720 | consumed tokens: 5174722560 | elapsed time per iteration (s): 0.17 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.024839E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.830 | TFLOPs: 23.10 | +7: iteration 9880/ 173500 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.028136E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.903 | TFLOPs: 25.33 | +7: iteration 9890/ 173500 | consumed samples: 2531840 | consumed tokens: 5185208320 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.052258E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.926 | TFLOPs: 25.29 | +7: iteration 9900/ 173500 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.025979E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.662 | TFLOPs: 25.20 | +7: iteration 9910/ 173500 | consumed samples: 2536960 | consumed tokens: 5195694080 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.028480E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.920 | TFLOPs: 26.14 | +7: iteration 9920/ 173500 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.018156E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.279 | TFLOPs: 26.15 | +7: iteration 9930/ 173500 | consumed samples: 2542080 | consumed tokens: 5206179840 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.024323E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.827 | TFLOPs: 26.14 | +7: iteration 9940/ 173500 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.014708E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.253 | TFLOPs: 26.15 | +7: iteration 9950/ 173500 | consumed samples: 2547200 | consumed tokens: 5216665600 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.021850E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.109 | TFLOPs: 26.13 | +7: iteration 9960/ 173500 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.022723E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.878 | TFLOPs: 26.11 | +7: iteration 9970/ 173500 | consumed samples: 2552320 | consumed tokens: 5227151360 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.019064E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.478 | TFLOPs: 25.59 | +7: iteration 9980/ 173500 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 0.17 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.024679E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.682 | TFLOPs: 24.15 | +7: iteration 9990/ 173500 | consumed samples: 2557440 | consumed tokens: 5237637120 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.028475E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.228 | TFLOPs: 26.07 | +0: [2023-03-17 00:41:40,992] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00019897364350587667, 0.00019897364350587667, 0.00019897364350587667], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 10000/ 173500 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.033152E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.525 | TFLOPs: 25.68 | +0: steps: 10000 loss: 4.0231 iter time (s): 0.155 samples/sec: 1652.742 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 10000 | lm loss value: 4.003658E+00 | lm loss PPL: 5.479825E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 10000 to checkpoints_44m91b100m +0: [2023-03-17 00:41:41,066] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! +0: [2023-03-17 00:41:41,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:41:41,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:41:41,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:41:41,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:41:41,137] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:41:41,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:41:41,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:41:41,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:41:41,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:41:41,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:41:41,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:41:41,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:41:41,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:41:41,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:41:41,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:41:41,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:41:41,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:41:41,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:41:41,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:41:41,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:41:41,195] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step10000/mp_rank_00_model_states.pt +0: [2023-03-17 00:41:41,195] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:41:41,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:41,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:41,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:41,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:41,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:41:41,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: successfully saved checkpoint at iteration 10000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.03 +7: iteration 10010/ 173500 | consumed samples: 2562560 | consumed tokens: 5248122880 | elapsed time per iteration (s): 0.18 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.033879E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.171 | TFLOPs: 22.18 | +7: iteration 10020/ 173500 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.027830E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.543 | TFLOPs: 26.17 | +7: iteration 10030/ 173500 | consumed samples: 2567680 | consumed tokens: 5258608640 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.024360E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.967 | TFLOPs: 26.16 | +7: iteration 10040/ 173500 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.022976E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.334 | TFLOPs: 26.15 | +7: iteration 10050/ 173500 | consumed samples: 2572800 | consumed tokens: 5269094400 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.010258E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.153 | TFLOPs: 24.67 | +7: iteration 10060/ 173500 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.015519E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.553 | TFLOPs: 26.01 | +7: iteration 10070/ 173500 | consumed samples: 2577920 | consumed tokens: 5279580160 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.025359E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.991 | TFLOPs: 25.56 | +7: iteration 10080/ 173500 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 0.16 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.017018E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.698 | TFLOPs: 24.40 | +7: iteration 10090/ 173500 | consumed samples: 2583040 | consumed tokens: 5290065920 | elapsed time per iteration (s): 0.15 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.025860E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.287 | TFLOPs: 26.07 | +7: iteration 10100/ 173500 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.021047E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.122 | TFLOPs: 26.03 | +7: iteration 10110/ 173500 | consumed samples: 2588160 | consumed tokens: 5300551680 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.019556E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.047 | TFLOPs: 24.73 | +7: iteration 10120/ 173500 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.017525E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.200 | TFLOPs: 25.80 | +7: iteration 10130/ 173500 | consumed samples: 2593280 | consumed tokens: 5311037440 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.027044E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.184 | TFLOPs: 25.78 | +7: iteration 10140/ 173500 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.014684E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.698 | TFLOPs: 25.79 | +7: iteration 10150/ 173500 | consumed samples: 2598400 | consumed tokens: 5321523200 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.025307E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.988 | TFLOPs: 26.03 | +7: iteration 10160/ 173500 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.017660E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.782 | TFLOPs: 26.19 | +7: iteration 10170/ 173500 | consumed samples: 2603520 | consumed tokens: 5332008960 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.018584E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.764 | TFLOPs: 26.17 | +7: iteration 10180/ 173500 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.021879E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.392 | TFLOPs: 26.18 | +7: iteration 10190/ 173500 | consumed samples: 2608640 | consumed tokens: 5342494720 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.025508E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.771 | TFLOPs: 25.20 | +7: iteration 10200/ 173500 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.020734E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.319 | TFLOPs: 25.58 | +7: iteration 10210/ 173500 | consumed samples: 2613760 | consumed tokens: 5352980480 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.015588E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.934 | TFLOPs: 26.09 | +7: iteration 10220/ 173500 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.025246E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.307 | TFLOPs: 24.42 | +7: iteration 10230/ 173500 | consumed samples: 2618880 | consumed tokens: 5363466240 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.016095E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.921 | TFLOPs: 24.62 | +7: iteration 10240/ 173500 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.017388E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.408 | TFLOPs: 26.18 | +7: iteration 10250/ 173500 | consumed samples: 2624000 | consumed tokens: 5373952000 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.019506E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.369 | TFLOPs: 26.23 | +7: iteration 10260/ 173500 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.000848E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.417 | TFLOPs: 26.20 | +7: iteration 10270/ 173500 | consumed samples: 2629120 | consumed tokens: 5384437760 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.015850E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.950 | TFLOPs: 26.16 | +7: iteration 10280/ 173500 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.025845E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.686 | TFLOPs: 24.85 | +7: iteration 10290/ 173500 | consumed samples: 2634240 | consumed tokens: 5394923520 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.008307E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.505 | TFLOPs: 25.02 | +7: iteration 10300/ 173500 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.022050E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.485 | TFLOPs: 26.21 | +7: iteration 10310/ 173500 | consumed samples: 2639360 | consumed tokens: 5405409280 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.017081E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.712 | TFLOPs: 26.23 | +7: iteration 10320/ 173500 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.023733E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.371 | TFLOPs: 26.23 | +7: iteration 10330/ 173500 | consumed samples: 2644480 | consumed tokens: 5415895040 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.012048E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.481 | TFLOPs: 26.12 | +7: iteration 10340/ 173500 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.024749E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.229 | TFLOPs: 26.11 | +7: iteration 10350/ 173500 | consumed samples: 2649600 | consumed tokens: 5426380800 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.019849E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.200 | TFLOPs: 25.35 | +7: iteration 10360/ 173500 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.030257E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.049 | TFLOPs: 25.88 | +7: iteration 10370/ 173500 | consumed samples: 2654720 | consumed tokens: 5436866560 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.013592E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.971 | TFLOPs: 25.53 | +7: iteration 10380/ 173500 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.009943E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.491 | TFLOPs: 26.02 | +7: iteration 10390/ 173500 | consumed samples: 2659840 | consumed tokens: 5447352320 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.013979E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.619 | TFLOPs: 25.98 | +7: iteration 10400/ 173500 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.012929E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.454 | TFLOPs: 25.99 | +7: iteration 10410/ 173500 | consumed samples: 2664960 | consumed tokens: 5457838080 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.024967E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.258 | TFLOPs: 25.82 | +7: iteration 10420/ 173500 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.009355E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.788 | TFLOPs: 26.08 | +7: iteration 10430/ 173500 | consumed samples: 2670080 | consumed tokens: 5468323840 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.015700E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.292 | TFLOPs: 25.17 | +7: iteration 10440/ 173500 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 3.999522E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.281 | TFLOPs: 26.08 | +7: iteration 10450/ 173500 | consumed samples: 2675200 | consumed tokens: 5478809600 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.012152E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.983 | TFLOPs: 25.64 | +7: iteration 10460/ 173500 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 0.15 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.017545E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.480 | TFLOPs: 26.04 | +7: iteration 10470/ 173500 | consumed samples: 2680320 | consumed tokens: 5489295360 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.014729E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.317 | TFLOPs: 25.66 | +7: iteration 10480/ 173500 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 0.16 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.009723E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.678 | TFLOPs: 25.62 | +7: iteration 10490/ 173500 | consumed samples: 2685440 | consumed tokens: 5499781120 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.015490E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.820 | TFLOPs: 25.34 | +7: iteration 10500/ 173500 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.007233E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.867 | TFLOPs: 25.86 | +7: iteration 10510/ 173500 | consumed samples: 2690560 | consumed tokens: 5510266880 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.998939E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.820 | TFLOPs: 25.89 | +7: iteration 10520/ 173500 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012233E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.037 | TFLOPs: 25.86 | +7: iteration 10530/ 173500 | consumed samples: 2695680 | consumed tokens: 5520752640 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.003140E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.476 | TFLOPs: 26.10 | +7: iteration 10540/ 173500 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012530E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.498 | TFLOPs: 26.07 | +7: iteration 10550/ 173500 | consumed samples: 2700800 | consumed tokens: 5531238400 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.009448E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.770 | TFLOPs: 25.84 | +7: iteration 10560/ 173500 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.004770E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.467 | TFLOPs: 25.88 | +7: iteration 10570/ 173500 | consumed samples: 2705920 | consumed tokens: 5541724160 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012962E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.756 | TFLOPs: 25.89 | +7: iteration 10580/ 173500 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.026337E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.287 | TFLOPs: 26.08 | +7: iteration 10590/ 173500 | consumed samples: 2711040 | consumed tokens: 5552209920 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.014952E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.114 | TFLOPs: 25.42 | +7: iteration 10600/ 173500 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012243E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.351 | TFLOPs: 25.71 | +7: iteration 10610/ 173500 | consumed samples: 2716160 | consumed tokens: 5562695680 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.997204E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.147 | TFLOPs: 25.42 | +7: iteration 10620/ 173500 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.005056E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.159 | TFLOPs: 26.15 | +7: iteration 10630/ 173500 | consumed samples: 2721280 | consumed tokens: 5573181440 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.011083E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.131 | TFLOPs: 26.13 | +7: iteration 10640/ 173500 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.004351E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.316 | TFLOPs: 25.71 | +7: iteration 10650/ 173500 | consumed samples: 2726400 | consumed tokens: 5583667200 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.999549E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.438 | TFLOPs: 26.10 | +7: iteration 10660/ 173500 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.001758E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.957 | TFLOPs: 26.05 | +7: iteration 10670/ 173500 | consumed samples: 2731520 | consumed tokens: 5594152960 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.005641E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.750 | TFLOPs: 26.08 | +7: iteration 10680/ 173500 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012239E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.601 | TFLOPs: 25.59 | +7: iteration 10690/ 173500 | consumed samples: 2736640 | consumed tokens: 5604638720 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.012922E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.766 | TFLOPs: 26.15 | +7: iteration 10700/ 173500 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.007940E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.778 | TFLOPs: 26.19 | +7: iteration 10710/ 173500 | consumed samples: 2741760 | consumed tokens: 5615124480 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.997750E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.758 | TFLOPs: 25.72 | +7: iteration 10720/ 173500 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.000763E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.920 | TFLOPs: 26.17 | +7: iteration 10730/ 173500 | consumed samples: 2746880 | consumed tokens: 5625610240 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.005934E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.571 | TFLOPs: 26.07 | +7: iteration 10740/ 173500 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.001192E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.166 | TFLOPs: 26.18 | +7: iteration 10750/ 173500 | consumed samples: 2752000 | consumed tokens: 5636096000 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.007195E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.454 | TFLOPs: 26.07 | +7: iteration 10760/ 173500 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.006392E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.603 | TFLOPs: 25.35 | +7: iteration 10770/ 173500 | consumed samples: 2757120 | consumed tokens: 5646581760 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.014078E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.136 | TFLOPs: 26.13 | +7: iteration 10780/ 173500 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.018630E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.836 | TFLOPs: 26.11 | +7: iteration 10790/ 173500 | consumed samples: 2762240 | consumed tokens: 5657067520 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.001925E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.083 | TFLOPs: 26.16 | +7: iteration 10800/ 173500 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.986240E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.058 | TFLOPs: 25.72 | +7: iteration 10810/ 173500 | consumed samples: 2767360 | consumed tokens: 5667553280 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.005749E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.757 | TFLOPs: 26.11 | +7: iteration 10820/ 173500 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 0.15 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.014031E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.488 | TFLOPs: 26.10 | +7: iteration 10830/ 173500 | consumed samples: 2772480 | consumed tokens: 5678039040 | elapsed time per iteration (s): 0.17 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 3.997735E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.144 | TFLOPs: 23.49 | +7: iteration 10840/ 173500 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.001778E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.975 | TFLOPs: 25.66 | +7: iteration 10850/ 173500 | consumed samples: 2777600 | consumed tokens: 5688524800 | elapsed time per iteration (s): 0.16 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.009351E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.846 | TFLOPs: 25.59 | +7: iteration 10860/ 173500 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.011676E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.701 | TFLOPs: 26.08 | +7: iteration 10870/ 173500 | consumed samples: 2782720 | consumed tokens: 5699010560 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.002608E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.126 | TFLOPs: 26.07 | +7: iteration 10880/ 173500 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.006560E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.878 | TFLOPs: 25.97 | +7: iteration 10890/ 173500 | consumed samples: 2787840 | consumed tokens: 5709496320 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.009409E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.345 | TFLOPs: 25.57 | +7: iteration 10900/ 173500 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.006177E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.474 | TFLOPs: 25.49 | +7: iteration 10910/ 173500 | consumed samples: 2792960 | consumed tokens: 5719982080 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.997005E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.859 | TFLOPs: 25.95 | +7: iteration 10920/ 173500 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.007258E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.352 | TFLOPs: 25.91 | +7: iteration 10930/ 173500 | consumed samples: 2798080 | consumed tokens: 5730467840 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.993137E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.593 | TFLOPs: 26.01 | +7: iteration 10940/ 173500 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.005411E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.453 | TFLOPs: 25.37 | +7: iteration 10950/ 173500 | consumed samples: 2803200 | consumed tokens: 5740953600 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.994614E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.802 | TFLOPs: 25.64 | +7: iteration 10960/ 173500 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.013696E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.739 | TFLOPs: 25.29 | +7: iteration 10970/ 173500 | consumed samples: 2808320 | consumed tokens: 5751439360 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.991594E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.052 | TFLOPs: 26.13 | +7: iteration 10980/ 173500 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.004858E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.769 | TFLOPs: 26.12 | +7: iteration 10990/ 173500 | consumed samples: 2813440 | consumed tokens: 5761925120 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.004072E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.429 | TFLOPs: 25.95 | +7: iteration 11000/ 173500 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.000592E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.763 | TFLOPs: 26.06 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 11000 | lm loss value: 4.081484E+00 | lm loss PPL: 5.923333E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 11000 to checkpoints_44m91b100m +0: [2023-03-17 00:44:16,980] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step11000 is begin to save! +0: [2023-03-17 00:44:16,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:44:17,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:44:17,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:44:17,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:44:17,052] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:44:17,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:44:17,060] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:44:17,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:44:17,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:44:17,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:44:17,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:44:17,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:44:17,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:44:17,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:44:17,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:44:17,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:44:17,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:44:17,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:44:17,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:44:17,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:44:17,110] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step11000/mp_rank_00_model_states.pt +0: [2023-03-17 00:44:17,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:44:17,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:44:17,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:44:17,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:44:17,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:44:17,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 00:44:17,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:44:17,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:44:17,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:44:17,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:44:17,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: successfully saved checkpoint at iteration 11000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.23 +7: iteration 11010/ 173500 | consumed samples: 2818560 | consumed tokens: 5772410880 | elapsed time per iteration (s): 0.18 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.006054E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1454.823 | TFLOPs: 22.82 | +7: iteration 11020/ 173500 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.006678E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.487 | TFLOPs: 26.13 | +7: iteration 11030/ 173500 | consumed samples: 2823680 | consumed tokens: 5782896640 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.991507E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.998 | TFLOPs: 25.84 | +7: iteration 11040/ 173500 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.001113E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.256 | TFLOPs: 26.18 | +7: iteration 11050/ 173500 | consumed samples: 2828800 | consumed tokens: 5793382400 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.994805E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.696 | TFLOPs: 26.19 | +7: iteration 11060/ 173500 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.994038E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.862 | TFLOPs: 25.75 | +7: iteration 11070/ 173500 | consumed samples: 2833920 | consumed tokens: 5803868160 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.995582E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.076 | TFLOPs: 26.18 | +7: iteration 11080/ 173500 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.995696E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.546 | TFLOPs: 25.71 | +7: iteration 11090/ 173500 | consumed samples: 2839040 | consumed tokens: 5814353920 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.993721E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.503 | TFLOPs: 25.88 | +7: iteration 11100/ 173500 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.991676E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.819 | TFLOPs: 25.61 | +7: iteration 11110/ 173500 | consumed samples: 2844160 | consumed tokens: 5824839680 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.999226E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.260 | TFLOPs: 25.54 | +7: iteration 11120/ 173500 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.997766E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.305 | TFLOPs: 25.72 | +7: iteration 11130/ 173500 | consumed samples: 2849280 | consumed tokens: 5835325440 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.987830E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.882 | TFLOPs: 26.14 | +7: iteration 11140/ 173500 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.990825E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.812 | TFLOPs: 25.75 | +7: iteration 11150/ 173500 | consumed samples: 2854400 | consumed tokens: 5845811200 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.991774E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.713 | TFLOPs: 26.17 | +7: iteration 11160/ 173500 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.979025E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.408 | TFLOPs: 26.16 | +7: iteration 11170/ 173500 | consumed samples: 2859520 | consumed tokens: 5856296960 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.993167E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.215 | TFLOPs: 26.08 | +7: iteration 11180/ 173500 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.004942E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.195 | TFLOPs: 25.68 | +7: iteration 11190/ 173500 | consumed samples: 2864640 | consumed tokens: 5866782720 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.995997E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.431 | TFLOPs: 26.18 | +7: iteration 11200/ 173500 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 0.15 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.003874E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.105 | TFLOPs: 25.92 | +7: iteration 11210/ 173500 | consumed samples: 2869760 | consumed tokens: 5877268480 | elapsed time per iteration (s): 0.16 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 3.986594E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.907 | TFLOPs: 25.75 | +7: iteration 11220/ 173500 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.990752E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.389 | TFLOPs: 25.85 | +7: iteration 11230/ 173500 | consumed samples: 2874880 | consumed tokens: 5887754240 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.994337E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.105 | TFLOPs: 26.07 | +7: iteration 11240/ 173500 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.992270E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.476 | TFLOPs: 26.07 | +7: iteration 11250/ 173500 | consumed samples: 2880000 | consumed tokens: 5898240000 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.995382E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.702 | TFLOPs: 25.76 | +7: iteration 11260/ 173500 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.994250E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.713 | TFLOPs: 26.17 | +7: iteration 11270/ 173500 | consumed samples: 2885120 | consumed tokens: 5908725760 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.994164E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.889 | TFLOPs: 25.42 | +7: iteration 11280/ 173500 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.980650E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.068 | TFLOPs: 25.45 | +7: iteration 11290/ 173500 | consumed samples: 2890240 | consumed tokens: 5919211520 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.980626E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.282 | TFLOPs: 26.19 | +7: iteration 11300/ 173500 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.985239E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.264 | TFLOPs: 26.19 | +7: iteration 11310/ 173500 | consumed samples: 2895360 | consumed tokens: 5929697280 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.990461E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.187 | TFLOPs: 26.18 | +7: iteration 11320/ 173500 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.995717E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.984 | TFLOPs: 26.19 | +7: iteration 11330/ 173500 | consumed samples: 2900480 | consumed tokens: 5940183040 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.988097E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.309 | TFLOPs: 25.27 | +7: iteration 11340/ 173500 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.993744E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.433 | TFLOPs: 26.21 | +7: iteration 11350/ 173500 | consumed samples: 2905600 | consumed tokens: 5950668800 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.980500E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.282 | TFLOPs: 26.19 | +7: iteration 11360/ 173500 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.988387E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.101 | TFLOPs: 26.19 | +7: iteration 11370/ 173500 | consumed samples: 2910720 | consumed tokens: 5961154560 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.985586E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.025 | TFLOPs: 26.21 | +7: iteration 11380/ 173500 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.987657E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.027 | TFLOPs: 26.10 | +7: iteration 11390/ 173500 | consumed samples: 2915840 | consumed tokens: 5971640320 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.979454E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.962 | TFLOPs: 25.67 | +7: iteration 11400/ 173500 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.999135E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.463 | TFLOPs: 26.09 | +7: iteration 11410/ 173500 | consumed samples: 2920960 | consumed tokens: 5982126080 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.982421E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.748 | TFLOPs: 25.84 | +7: iteration 11420/ 173500 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.988640E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.253 | TFLOPs: 25.66 | +7: iteration 11430/ 173500 | consumed samples: 2926080 | consumed tokens: 5992611840 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.986094E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.677 | TFLOPs: 24.98 | +7: iteration 11440/ 173500 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.977898E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.540 | TFLOPs: 26.09 | +7: iteration 11450/ 173500 | consumed samples: 2931200 | consumed tokens: 6003097600 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.975008E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.569 | TFLOPs: 26.09 | +7: iteration 11460/ 173500 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 0.16 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.980780E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.094 | TFLOPs: 25.60 | +7: iteration 11470/ 173500 | consumed samples: 2936320 | consumed tokens: 6013583360 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.981237E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.149 | TFLOPs: 26.08 | +7: iteration 11480/ 173500 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.994350E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.244 | TFLOPs: 26.10 | +7: iteration 11490/ 173500 | consumed samples: 2941440 | consumed tokens: 6024069120 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.986148E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.807 | TFLOPs: 25.90 | +7: iteration 11500/ 173500 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.983731E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.184 | TFLOPs: 26.07 | +7: iteration 11510/ 173500 | consumed samples: 2946560 | consumed tokens: 6034554880 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.980745E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.211 | TFLOPs: 26.07 | +7: iteration 11520/ 173500 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.985605E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.867 | TFLOPs: 26.09 | +7: iteration 11530/ 173500 | consumed samples: 2951680 | consumed tokens: 6045040640 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.983217E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.182 | TFLOPs: 26.07 | +7: iteration 11540/ 173500 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.986833E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.964 | TFLOPs: 26.10 | +7: iteration 11550/ 173500 | consumed samples: 2956800 | consumed tokens: 6055526400 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.984881E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.738 | TFLOPs: 26.09 | +7: iteration 11560/ 173500 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 0.15 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 3.978228E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.211 | TFLOPs: 26.02 | +7: iteration 11570/ 173500 | consumed samples: 2961920 | consumed tokens: 6066012160 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.988537E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.941 | TFLOPs: 25.15 | +7: iteration 11580/ 173500 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.976057E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.319 | TFLOPs: 26.02 | +7: iteration 11590/ 173500 | consumed samples: 2967040 | consumed tokens: 6076497920 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.985926E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.312 | TFLOPs: 25.77 | +7: iteration 11600/ 173500 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.995788E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.134 | TFLOPs: 26.08 | +7: iteration 11610/ 173500 | consumed samples: 2972160 | consumed tokens: 6086983680 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.982956E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.649 | TFLOPs: 26.11 | +7: iteration 11620/ 173500 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.982423E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.789 | TFLOPs: 26.09 | +7: iteration 11630/ 173500 | consumed samples: 2977280 | consumed tokens: 6097469440 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.986042E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.276 | TFLOPs: 26.07 | +7: iteration 11640/ 173500 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.981690E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.253 | TFLOPs: 26.08 | +7: iteration 11650/ 173500 | consumed samples: 2982400 | consumed tokens: 6107955200 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.976587E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.406 | TFLOPs: 25.65 | +7: iteration 11660/ 173500 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.983712E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.929 | TFLOPs: 26.03 | +7: iteration 11670/ 173500 | consumed samples: 2987520 | consumed tokens: 6118440960 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.974900E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.001 | TFLOPs: 26.03 | +7: iteration 11680/ 173500 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.986249E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.740 | TFLOPs: 26.01 | +7: iteration 11690/ 173500 | consumed samples: 2992640 | consumed tokens: 6128926720 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.983282E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.049 | TFLOPs: 25.66 | +7: iteration 11700/ 173500 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.991072E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.959 | TFLOPs: 25.99 | +7: iteration 11710/ 173500 | consumed samples: 2997760 | consumed tokens: 6139412480 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.977847E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.258 | TFLOPs: 26.10 | +7: iteration 11720/ 173500 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.986673E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.846 | TFLOPs: 26.06 | +7: iteration 11730/ 173500 | consumed samples: 3002880 | consumed tokens: 6149898240 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.970736E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.423 | TFLOPs: 24.52 | +7: iteration 11740/ 173500 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.976468E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.012 | TFLOPs: 26.02 | +7: iteration 11750/ 173500 | consumed samples: 3008000 | consumed tokens: 6160384000 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.973257E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.760 | TFLOPs: 26.06 | +7: iteration 11760/ 173500 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.987661E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.709 | TFLOPs: 26.06 | +7: iteration 11770/ 173500 | consumed samples: 3013120 | consumed tokens: 6170869760 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.994094E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.133 | TFLOPs: 26.07 | +7: iteration 11780/ 173500 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.973309E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.249 | TFLOPs: 26.07 | +7: iteration 11790/ 173500 | consumed samples: 3018240 | consumed tokens: 6181355520 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.971970E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.508 | TFLOPs: 24.69 | +7: iteration 11800/ 173500 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.977088E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.024 | TFLOPs: 26.05 | +7: iteration 11810/ 173500 | consumed samples: 3023360 | consumed tokens: 6191841280 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.968032E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.750 | TFLOPs: 26.01 | +7: iteration 11820/ 173500 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.979828E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.244 | TFLOPs: 26.01 | +7: iteration 11830/ 173500 | consumed samples: 3028480 | consumed tokens: 6202327040 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.979163E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.108 | TFLOPs: 26.10 | +7: iteration 11840/ 173500 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.990168E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.998 | TFLOPs: 26.10 | +7: iteration 11850/ 173500 | consumed samples: 3033600 | consumed tokens: 6212812800 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.969674E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.303 | TFLOPs: 25.83 | +7: iteration 11860/ 173500 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.965012E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.355 | TFLOPs: 25.69 | +7: iteration 11870/ 173500 | consumed samples: 3038720 | consumed tokens: 6223298560 | elapsed time per iteration (s): 0.16 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.982969E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.588 | TFLOPs: 25.21 | +7: iteration 11880/ 173500 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.969390E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.271 | TFLOPs: 26.12 | +7: iteration 11890/ 173500 | consumed samples: 3043840 | consumed tokens: 6233784320 | elapsed time per iteration (s): 0.15 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 3.968434E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.631 | TFLOPs: 26.07 | +7: iteration 11900/ 173500 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971052E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.726 | TFLOPs: 26.09 | +7: iteration 11910/ 173500 | consumed samples: 3048960 | consumed tokens: 6244270080 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.988543E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.355 | TFLOPs: 25.44 | +7: iteration 11920/ 173500 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.982371E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.621 | TFLOPs: 26.11 | +7: iteration 11930/ 173500 | consumed samples: 3054080 | consumed tokens: 6254755840 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.968110E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.692 | TFLOPs: 26.12 | +7: iteration 11940/ 173500 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971439E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.783 | TFLOPs: 26.12 | +7: iteration 11950/ 173500 | consumed samples: 3059200 | consumed tokens: 6265241600 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971892E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.572 | TFLOPs: 26.10 | +7: iteration 11960/ 173500 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971223E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.638 | TFLOPs: 26.04 | +7: iteration 11970/ 173500 | consumed samples: 3064320 | consumed tokens: 6275727360 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.961348E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.414 | TFLOPs: 26.13 | +7: iteration 11980/ 173500 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.983007E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.400 | TFLOPs: 26.13 | +7: iteration 11990/ 173500 | consumed samples: 3069440 | consumed tokens: 6286213120 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.977361E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.110 | TFLOPs: 26.13 | +0: [2023-03-17 00:46:52,031] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[0.0001984184547955352, 0.0001984184547955352, 0.0001984184547955352], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 12000/ 173500 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971919E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.737 | TFLOPs: 25.53 | +0: steps: 12000 loss: 3.9058 iter time (s): 0.154 samples/sec: 1664.322 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 12000 | lm loss value: 3.975328E+00 | lm loss PPL: 5.326761E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 12000 to checkpoints_44m91b100m +0: [2023-03-17 00:46:52,106] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is begin to save! +0: [2023-03-17 00:46:52,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:46:52,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:46:52,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:46:52,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:46:52,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:46:52,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:46:52,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:46:52,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:46:52,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:46:52,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:46:52,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:46:52,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:46:52,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:46:52,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:46:52,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:46:52,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:46:52,229] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:46:52,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:46:52,237] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:46:52,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:46:52,238] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step12000/mp_rank_00_model_states.pt +0: [2023-03-17 00:46:52,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:46:52,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:46:52,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:46:52,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:46:52,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:46:52,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:46:52,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:46:52,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: successfully saved checkpoint at iteration 12000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.54 +7: iteration 12010/ 173500 | consumed samples: 3074560 | consumed tokens: 6296698880 | elapsed time per iteration (s): 0.18 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.969925E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.949 | TFLOPs: 22.55 | +7: iteration 12020/ 173500 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.975365E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.824 | TFLOPs: 26.14 | +7: iteration 12030/ 173500 | consumed samples: 3079680 | consumed tokens: 6307184640 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.981357E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.427 | TFLOPs: 26.13 | +7: iteration 12040/ 173500 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.982099E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.541 | TFLOPs: 26.10 | +7: iteration 12050/ 173500 | consumed samples: 3084800 | consumed tokens: 6317670400 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.981548E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.792 | TFLOPs: 26.08 | +7: iteration 12060/ 173500 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.980299E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.371 | TFLOPs: 26.04 | +7: iteration 12070/ 173500 | consumed samples: 3089920 | consumed tokens: 6328156160 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971901E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.549 | TFLOPs: 26.06 | +7: iteration 12080/ 173500 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.976844E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.564 | TFLOPs: 26.03 | +7: iteration 12090/ 173500 | consumed samples: 3095040 | consumed tokens: 6338641920 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.988029E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.015 | TFLOPs: 25.92 | +7: iteration 12100/ 173500 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.969645E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.481 | TFLOPs: 25.60 | +7: iteration 12110/ 173500 | consumed samples: 3100160 | consumed tokens: 6349127680 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.969910E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.661 | TFLOPs: 26.09 | +7: iteration 12120/ 173500 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.976884E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.545 | TFLOPs: 26.10 | +7: iteration 12130/ 173500 | consumed samples: 3105280 | consumed tokens: 6359613440 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.971644E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.856 | TFLOPs: 26.08 | +7: iteration 12140/ 173500 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.980460E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.107 | TFLOPs: 25.71 | +7: iteration 12150/ 173500 | consumed samples: 3110400 | consumed tokens: 6370099200 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.962939E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.765 | TFLOPs: 26.08 | +7: iteration 12160/ 173500 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.966343E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.414 | TFLOPs: 25.66 | +7: iteration 12170/ 173500 | consumed samples: 3115520 | consumed tokens: 6380584960 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.969692E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.464 | TFLOPs: 26.02 | +7: iteration 12180/ 173500 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 0.16 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.980773E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.970 | TFLOPs: 25.59 | +7: iteration 12190/ 173500 | consumed samples: 3120640 | consumed tokens: 6391070720 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.959759E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.593 | TFLOPs: 26.04 | +7: iteration 12200/ 173500 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.964220E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.451 | TFLOPs: 25.96 | +7: iteration 12210/ 173500 | consumed samples: 3125760 | consumed tokens: 6401556480 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.976126E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.737 | TFLOPs: 26.09 | +7: iteration 12220/ 173500 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 0.15 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 3.965757E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.661 | TFLOPs: 26.11 | +7: iteration 12230/ 173500 | consumed samples: 3130880 | consumed tokens: 6412042240 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.970139E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.604 | TFLOPs: 25.05 | +7: iteration 12240/ 173500 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.968658E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.814 | TFLOPs: 26.09 | +7: iteration 12250/ 173500 | consumed samples: 3136000 | consumed tokens: 6422528000 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.966494E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.663 | TFLOPs: 26.09 | +7: iteration 12260/ 173500 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.957302E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.438 | TFLOPs: 25.41 | +7: iteration 12270/ 173500 | consumed samples: 3141120 | consumed tokens: 6433013760 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.978094E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.512 | TFLOPs: 26.07 | +7: iteration 12280/ 173500 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.956802E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.053 | TFLOPs: 26.11 | +7: iteration 12290/ 173500 | consumed samples: 3146240 | consumed tokens: 6443499520 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.965170E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.649 | TFLOPs: 26.11 | +7: iteration 12300/ 173500 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.963046E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.547 | TFLOPs: 26.12 | +7: iteration 12310/ 173500 | consumed samples: 3151360 | consumed tokens: 6453985280 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.969899E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.223 | TFLOPs: 26.10 | +7: iteration 12320/ 173500 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.952512E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.267 | TFLOPs: 26.10 | +7: iteration 12330/ 173500 | consumed samples: 3156480 | consumed tokens: 6464471040 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.976909E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.203 | TFLOPs: 26.11 | +7: iteration 12340/ 173500 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.977379E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.174 | TFLOPs: 26.13 | +7: iteration 12350/ 173500 | consumed samples: 3161600 | consumed tokens: 6474956800 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.962870E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.446 | TFLOPs: 25.76 | +7: iteration 12360/ 173500 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.969265E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.002 | TFLOPs: 25.48 | +7: iteration 12370/ 173500 | consumed samples: 3166720 | consumed tokens: 6485442560 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.956914E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.055 | TFLOPs: 25.63 | +7: iteration 12380/ 173500 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.953089E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.868 | TFLOPs: 26.11 | +7: iteration 12390/ 173500 | consumed samples: 3171840 | consumed tokens: 6495928320 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.961103E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.926 | TFLOPs: 26.08 | +7: iteration 12400/ 173500 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.958714E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.401 | TFLOPs: 25.08 | +7: iteration 12410/ 173500 | consumed samples: 3176960 | consumed tokens: 6506414080 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.959740E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.484 | TFLOPs: 25.43 | +7: iteration 12420/ 173500 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.968504E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.750 | TFLOPs: 25.50 | +7: iteration 12430/ 173500 | consumed samples: 3182080 | consumed tokens: 6516899840 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.960712E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.983 | TFLOPs: 26.10 | +7: iteration 12440/ 173500 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.961135E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.569 | TFLOPs: 26.07 | +7: iteration 12450/ 173500 | consumed samples: 3187200 | consumed tokens: 6527385600 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.963985E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.491 | TFLOPs: 26.06 | +7: iteration 12460/ 173500 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.955819E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.018 | TFLOPs: 25.95 | +7: iteration 12470/ 173500 | consumed samples: 3192320 | consumed tokens: 6537871360 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.961789E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.424 | TFLOPs: 25.35 | +7: iteration 12480/ 173500 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.959135E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.495 | TFLOPs: 25.98 | +7: iteration 12490/ 173500 | consumed samples: 3197440 | consumed tokens: 6548357120 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.965184E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.916 | TFLOPs: 26.09 | +7: iteration 12500/ 173500 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.962241E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.701 | TFLOPs: 26.06 | +7: iteration 12510/ 173500 | consumed samples: 3202560 | consumed tokens: 6558842880 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.982735E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.446 | TFLOPs: 25.96 | +7: iteration 12520/ 173500 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 0.15 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.951834E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.423 | TFLOPs: 26.01 | +7: iteration 12530/ 173500 | consumed samples: 3207680 | consumed tokens: 6569328640 | elapsed time per iteration (s): 0.16 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 3.952503E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.391 | TFLOPs: 25.80 | +7: iteration 12540/ 173500 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.963489E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.013 | TFLOPs: 25.75 | +7: iteration 12550/ 173500 | consumed samples: 3212800 | consumed tokens: 6579814400 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.957228E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.292 | TFLOPs: 26.08 | +7: iteration 12560/ 173500 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.955076E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.503 | TFLOPs: 26.07 | +7: iteration 12570/ 173500 | consumed samples: 3217920 | consumed tokens: 6590300160 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.960598E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.590 | TFLOPs: 25.27 | +7: iteration 12580/ 173500 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.964312E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.715 | TFLOPs: 26.03 | +7: iteration 12590/ 173500 | consumed samples: 3223040 | consumed tokens: 6600785920 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.967952E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.807 | TFLOPs: 26.05 | +7: iteration 12600/ 173500 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.956209E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.619 | TFLOPs: 26.17 | +7: iteration 12610/ 173500 | consumed samples: 3228160 | consumed tokens: 6611271680 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.964196E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.599 | TFLOPs: 25.90 | +7: iteration 12620/ 173500 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.960302E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.775 | TFLOPs: 26.22 | +7: iteration 12630/ 173500 | consumed samples: 3233280 | consumed tokens: 6621757440 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.957566E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.540 | TFLOPs: 25.79 | +7: iteration 12640/ 173500 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.962309E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.139 | TFLOPs: 25.80 | +7: iteration 12650/ 173500 | consumed samples: 3238400 | consumed tokens: 6632243200 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.959234E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.907 | TFLOPs: 26.25 | +7: iteration 12660/ 173500 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.970940E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.540 | TFLOPs: 25.38 | +7: iteration 12670/ 173500 | consumed samples: 3243520 | consumed tokens: 6642728960 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.956976E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.738 | TFLOPs: 25.53 | +7: iteration 12680/ 173500 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.951072E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.027 | TFLOPs: 26.32 | +7: iteration 12690/ 173500 | consumed samples: 3248640 | consumed tokens: 6653214720 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.966226E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.178 | TFLOPs: 25.06 | +7: iteration 12700/ 173500 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 0.17 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.964543E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.087 | TFLOPs: 23.60 | +7: iteration 12710/ 173500 | consumed samples: 3253760 | consumed tokens: 6663700480 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.956663E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.457 | TFLOPs: 24.83 | +7: iteration 12720/ 173500 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.969428E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.075 | TFLOPs: 25.61 | +7: iteration 12730/ 173500 | consumed samples: 3258880 | consumed tokens: 6674186240 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.957686E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.666 | TFLOPs: 26.20 | +7: iteration 12740/ 173500 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.949054E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.904 | TFLOPs: 26.24 | +7: iteration 12750/ 173500 | consumed samples: 3264000 | consumed tokens: 6684672000 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.955829E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.919 | TFLOPs: 26.27 | +7: iteration 12760/ 173500 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.955322E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.717 | TFLOPs: 25.92 | +7: iteration 12770/ 173500 | consumed samples: 3269120 | consumed tokens: 6695157760 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.952486E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.255 | TFLOPs: 26.23 | +7: iteration 12780/ 173500 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.950953E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.854 | TFLOPs: 25.73 | +7: iteration 12790/ 173500 | consumed samples: 3274240 | consumed tokens: 6705643520 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.966967E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.149 | TFLOPs: 25.67 | +7: iteration 12800/ 173500 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.953695E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.396 | TFLOPs: 26.07 | +7: iteration 12810/ 173500 | consumed samples: 3279360 | consumed tokens: 6716129280 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.957649E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.877 | TFLOPs: 25.65 | +7: iteration 12820/ 173500 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 0.16 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.955108E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.406 | TFLOPs: 25.63 | +7: iteration 12830/ 173500 | consumed samples: 3284480 | consumed tokens: 6726615040 | elapsed time per iteration (s): 0.15 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 3.950663E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.642 | TFLOPs: 26.18 | +7: iteration 12840/ 173500 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.953328E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.202 | TFLOPs: 25.61 | +7: iteration 12850/ 173500 | consumed samples: 3289600 | consumed tokens: 6737100800 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.955167E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.217 | TFLOPs: 26.30 | +7: iteration 12860/ 173500 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.970111E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.567 | TFLOPs: 26.28 | +7: iteration 12870/ 173500 | consumed samples: 3294720 | consumed tokens: 6747586560 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.973071E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.247 | TFLOPs: 26.29 | +7: iteration 12880/ 173500 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.960917E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.556 | TFLOPs: 26.29 | +7: iteration 12890/ 173500 | consumed samples: 3299840 | consumed tokens: 6758072320 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.955429E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.265 | TFLOPs: 26.29 | +7: iteration 12900/ 173500 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.968896E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.092 | TFLOPs: 26.25 | +7: iteration 12910/ 173500 | consumed samples: 3304960 | consumed tokens: 6768558080 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.948624E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.363 | TFLOPs: 26.20 | +7: iteration 12920/ 173500 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.953593E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.818 | TFLOPs: 26.23 | +7: iteration 12930/ 173500 | consumed samples: 3310080 | consumed tokens: 6779043840 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.936032E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.216 | TFLOPs: 26.32 | +7: iteration 12940/ 173500 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.950082E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.773 | TFLOPs: 26.31 | +7: iteration 12950/ 173500 | consumed samples: 3315200 | consumed tokens: 6789529600 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.962711E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.457 | TFLOPs: 26.01 | +7: iteration 12960/ 173500 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.955515E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.330 | TFLOPs: 26.29 | +7: iteration 12970/ 173500 | consumed samples: 3320320 | consumed tokens: 6800015360 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.958071E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.966 | TFLOPs: 26.20 | +7: iteration 12980/ 173500 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.946740E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.796 | TFLOPs: 26.19 | +7: iteration 12990/ 173500 | consumed samples: 3325440 | consumed tokens: 6810501120 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.960561E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.398 | TFLOPs: 26.20 | +7: iteration 13000/ 173500 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.939060E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.018 | TFLOPs: 26.17 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 13000 | lm loss value: 4.052712E+00 | lm loss PPL: 5.755336E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 13000 to checkpoints_44m91b100m +0: [2023-03-17 00:49:27,215] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step13000 is begin to save! +0: [2023-03-17 00:49:27,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:49:27,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:49:27,278] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:49:27,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:49:27,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:49:27,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:49:27,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:49:27,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:49:27,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:49:27,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:49:27,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:49:27,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:49:27,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:49:27,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:49:27,330] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:49:27,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:49:27,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:49:27,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:49:27,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:49:27,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:49:27,347] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step13000/mp_rank_00_model_states.pt +0: [2023-03-17 00:49:27,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:49:27,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:49:27,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:49:27,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:49:27,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:49:27,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:49:27,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:49:27,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: successfully saved checkpoint at iteration 13000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.90 +7: iteration 13010/ 173500 | consumed samples: 3330560 | consumed tokens: 6820986880 | elapsed time per iteration (s): 0.18 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.952606E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.165 | TFLOPs: 22.74 | +7: iteration 13020/ 173500 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.967734E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.455 | TFLOPs: 24.91 | +7: iteration 13030/ 173500 | consumed samples: 3335680 | consumed tokens: 6831472640 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.959278E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.737 | TFLOPs: 26.12 | +7: iteration 13040/ 173500 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.954390E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.656 | TFLOPs: 25.10 | +7: iteration 13050/ 173500 | consumed samples: 3340800 | consumed tokens: 6841958400 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.960681E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.830 | TFLOPs: 25.76 | +7: iteration 13060/ 173500 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 0.16 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.950705E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.032 | TFLOPs: 25.70 | +7: iteration 13070/ 173500 | consumed samples: 3345920 | consumed tokens: 6852444160 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.956205E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.631 | TFLOPs: 26.03 | +7: iteration 13080/ 173500 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.963001E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.370 | TFLOPs: 26.13 | +7: iteration 13090/ 173500 | consumed samples: 3351040 | consumed tokens: 6862929920 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.960700E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.254 | TFLOPs: 26.13 | +7: iteration 13100/ 173500 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.939856E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.042 | TFLOPs: 26.13 | +7: iteration 13110/ 173500 | consumed samples: 3356160 | consumed tokens: 6873415680 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.948359E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.934 | TFLOPs: 26.08 | +7: iteration 13120/ 173500 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.958109E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.411 | TFLOPs: 26.09 | +7: iteration 13130/ 173500 | consumed samples: 3361280 | consumed tokens: 6883901440 | elapsed time per iteration (s): 0.15 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 3.950503E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.101 | TFLOPs: 26.11 | +7: iteration 13140/ 173500 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.960570E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.805 | TFLOPs: 24.82 | +7: iteration 13150/ 173500 | consumed samples: 3366400 | consumed tokens: 6894387200 | elapsed time per iteration (s): 0.18 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.951172E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.340 | TFLOPs: 21.77 | +7: iteration 13160/ 173500 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.940846E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.280 | TFLOPs: 24.85 | +7: iteration 13170/ 173500 | consumed samples: 3371520 | consumed tokens: 6904872960 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.948911E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.660 | TFLOPs: 26.09 | +7: iteration 13180/ 173500 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.944032E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.801 | TFLOPs: 25.83 | +7: iteration 13190/ 173500 | consumed samples: 3376640 | consumed tokens: 6915358720 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.955518E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.808 | TFLOPs: 26.09 | +7: iteration 13200/ 173500 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.952452E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.748 | TFLOPs: 25.72 | +7: iteration 13210/ 173500 | consumed samples: 3381760 | consumed tokens: 6925844480 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.942937E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.769 | TFLOPs: 26.09 | +7: iteration 13220/ 173500 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.955018E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.059 | TFLOPs: 26.10 | +7: iteration 13230/ 173500 | consumed samples: 3386880 | consumed tokens: 6936330240 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.948803E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.604 | TFLOPs: 25.78 | +7: iteration 13240/ 173500 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.957309E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.438 | TFLOPs: 24.53 | +7: iteration 13250/ 173500 | consumed samples: 3392000 | consumed tokens: 6946816000 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.957121E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.247 | TFLOPs: 25.30 | +7: iteration 13260/ 173500 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.951474E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.184 | TFLOPs: 26.11 | +7: iteration 13270/ 173500 | consumed samples: 3397120 | consumed tokens: 6957301760 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.950521E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.784 | TFLOPs: 25.68 | +7: iteration 13280/ 173500 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.949582E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.487 | TFLOPs: 26.12 | +7: iteration 13290/ 173500 | consumed samples: 3402240 | consumed tokens: 6967787520 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.946886E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.525 | TFLOPs: 25.74 | +7: iteration 13300/ 173500 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.941482E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.250 | TFLOPs: 26.12 | +7: iteration 13310/ 173500 | consumed samples: 3407360 | consumed tokens: 6978273280 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.949723E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.520 | TFLOPs: 26.06 | +7: iteration 13320/ 173500 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.953863E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.611 | TFLOPs: 26.06 | +7: iteration 13330/ 173500 | consumed samples: 3412480 | consumed tokens: 6988759040 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.931443E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.227 | TFLOPs: 25.66 | +7: iteration 13340/ 173500 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.936206E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.696 | TFLOPs: 25.90 | +7: iteration 13350/ 173500 | consumed samples: 3417600 | consumed tokens: 6999244800 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.958173E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.148 | TFLOPs: 24.39 | +7: iteration 13360/ 173500 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 0.16 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.948956E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.498 | TFLOPs: 25.07 | +7: iteration 13370/ 173500 | consumed samples: 3422720 | consumed tokens: 7009730560 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.934000E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.708 | TFLOPs: 25.93 | +7: iteration 13380/ 173500 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.943657E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.431 | TFLOPs: 25.93 | +7: iteration 13390/ 173500 | consumed samples: 3427840 | consumed tokens: 7020216320 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.949978E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.335 | TFLOPs: 26.12 | +7: iteration 13400/ 173500 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.944292E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.023 | TFLOPs: 26.13 | +7: iteration 13410/ 173500 | consumed samples: 3432960 | consumed tokens: 7030702080 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.956277E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.219 | TFLOPs: 26.13 | +7: iteration 13420/ 173500 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 0.15 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 3.946297E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.152 | TFLOPs: 25.97 | +7: iteration 13430/ 173500 | consumed samples: 3438080 | consumed tokens: 7041187840 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.950159E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.913 | TFLOPs: 26.13 | +7: iteration 13440/ 173500 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.951279E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.049 | TFLOPs: 26.13 | +7: iteration 13450/ 173500 | consumed samples: 3443200 | consumed tokens: 7051673600 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.950916E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.795 | TFLOPs: 26.12 | +7: iteration 13460/ 173500 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.940516E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.416 | TFLOPs: 26.10 | +7: iteration 13470/ 173500 | consumed samples: 3448320 | consumed tokens: 7062159360 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.951340E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.804 | TFLOPs: 25.87 | +7: iteration 13480/ 173500 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.945158E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.331 | TFLOPs: 26.12 | +7: iteration 13490/ 173500 | consumed samples: 3453440 | consumed tokens: 7072645120 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.938853E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.079 | TFLOPs: 26.11 | +7: iteration 13500/ 173500 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.948367E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.695 | TFLOPs: 26.09 | +7: iteration 13510/ 173500 | consumed samples: 3458560 | consumed tokens: 7083130880 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.942701E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.300 | TFLOPs: 26.12 | +7: iteration 13520/ 173500 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.941769E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.238 | TFLOPs: 26.12 | +7: iteration 13530/ 173500 | consumed samples: 3463680 | consumed tokens: 7093616640 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.938212E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.592 | TFLOPs: 26.14 | +7: iteration 13540/ 173500 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.954361E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.606 | TFLOPs: 26.12 | +7: iteration 13550/ 173500 | consumed samples: 3468800 | consumed tokens: 7104102400 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.944997E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.264 | TFLOPs: 26.10 | +7: iteration 13560/ 173500 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.939166E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.649 | TFLOPs: 26.14 | +7: iteration 13570/ 173500 | consumed samples: 3473920 | consumed tokens: 7114588160 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.946187E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.941 | TFLOPs: 26.13 | +7: iteration 13580/ 173500 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.923356E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.910 | TFLOPs: 25.36 | +7: iteration 13590/ 173500 | consumed samples: 3479040 | consumed tokens: 7125073920 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.939416E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.409 | TFLOPs: 26.26 | +7: iteration 13600/ 173500 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.941514E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.896 | TFLOPs: 26.11 | +7: iteration 13610/ 173500 | consumed samples: 3484160 | consumed tokens: 7135559680 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.945865E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.318 | TFLOPs: 26.12 | +7: iteration 13620/ 173500 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.945795E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.644 | TFLOPs: 26.12 | +7: iteration 13630/ 173500 | consumed samples: 3489280 | consumed tokens: 7146045440 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.949334E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.846 | TFLOPs: 26.12 | +7: iteration 13640/ 173500 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 0.16 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.955190E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.489 | TFLOPs: 25.29 | +7: iteration 13650/ 173500 | consumed samples: 3494400 | consumed tokens: 7156531200 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.923676E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.878 | TFLOPs: 26.11 | +7: iteration 13660/ 173500 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.956677E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.398 | TFLOPs: 26.12 | +7: iteration 13670/ 173500 | consumed samples: 3499520 | consumed tokens: 7167016960 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.929965E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.020 | TFLOPs: 26.11 | +7: iteration 13680/ 173500 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.937473E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.798 | TFLOPs: 26.11 | +7: iteration 13690/ 173500 | consumed samples: 3504640 | consumed tokens: 7177502720 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.934143E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.431 | TFLOPs: 26.07 | +7: iteration 13700/ 173500 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 0.15 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 3.932253E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.676 | TFLOPs: 26.09 | +7: iteration 13710/ 173500 | consumed samples: 3509760 | consumed tokens: 7187988480 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.966194E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.653 | TFLOPs: 26.12 | +7: iteration 13720/ 173500 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.947808E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.007 | TFLOPs: 26.13 | +7: iteration 13730/ 173500 | consumed samples: 3514880 | consumed tokens: 7198474240 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.927412E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.410 | TFLOPs: 26.10 | +7: iteration 13740/ 173500 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.961273E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.166 | TFLOPs: 26.11 | +7: iteration 13750/ 173500 | consumed samples: 3520000 | consumed tokens: 7208960000 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.939798E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.477 | TFLOPs: 26.10 | +7: iteration 13760/ 173500 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.924806E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.212 | TFLOPs: 26.11 | +7: iteration 13770/ 173500 | consumed samples: 3525120 | consumed tokens: 7219445760 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.932705E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.517 | TFLOPs: 26.04 | +7: iteration 13780/ 173500 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.940134E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.761 | TFLOPs: 26.04 | +7: iteration 13790/ 173500 | consumed samples: 3530240 | consumed tokens: 7229931520 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.943594E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.094 | TFLOPs: 26.03 | +7: iteration 13800/ 173500 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.936540E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.891 | TFLOPs: 26.14 | +7: iteration 13810/ 173500 | consumed samples: 3535360 | consumed tokens: 7240417280 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.953320E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.477 | TFLOPs: 26.13 | +7: iteration 13820/ 173500 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 0.16 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.935278E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.869 | TFLOPs: 25.43 | +7: iteration 13830/ 173500 | consumed samples: 3540480 | consumed tokens: 7250903040 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.942757E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.916 | TFLOPs: 26.13 | +7: iteration 13840/ 173500 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 0.16 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.934153E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.879 | TFLOPs: 25.14 | +7: iteration 13850/ 173500 | consumed samples: 3545600 | consumed tokens: 7261388800 | elapsed time per iteration (s): 0.16 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.944263E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.261 | TFLOPs: 25.75 | +7: iteration 13860/ 173500 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.947504E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.043 | TFLOPs: 26.14 | +7: iteration 13870/ 173500 | consumed samples: 3550720 | consumed tokens: 7271874560 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.945232E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.396 | TFLOPs: 26.12 | +7: iteration 13880/ 173500 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.929824E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.417 | TFLOPs: 26.12 | +7: iteration 13890/ 173500 | consumed samples: 3555840 | consumed tokens: 7282360320 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.937605E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.037 | TFLOPs: 26.11 | +7: iteration 13900/ 173500 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.934575E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.044 | TFLOPs: 26.13 | +7: iteration 13910/ 173500 | consumed samples: 3560960 | consumed tokens: 7292846080 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.943563E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.904 | TFLOPs: 26.13 | +7: iteration 13920/ 173500 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.930400E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.541 | TFLOPs: 26.12 | +7: iteration 13930/ 173500 | consumed samples: 3566080 | consumed tokens: 7303331840 | elapsed time per iteration (s): 0.16 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.948007E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.375 | TFLOPs: 25.62 | +7: iteration 13940/ 173500 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.938298E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.441 | TFLOPs: 26.10 | +7: iteration 13950/ 173500 | consumed samples: 3571200 | consumed tokens: 7313817600 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.935242E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.640 | TFLOPs: 26.09 | +7: iteration 13960/ 173500 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.937976E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.520 | TFLOPs: 26.10 | +7: iteration 13970/ 173500 | consumed samples: 3576320 | consumed tokens: 7324303360 | elapsed time per iteration (s): 0.17 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.941327E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.930 | TFLOPs: 24.24 | +7: iteration 13980/ 173500 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 0.15 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 3.934718E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.893 | TFLOPs: 26.14 | +7: iteration 13990/ 173500 | consumed samples: 3581440 | consumed tokens: 7334789120 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.947765E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.807 | TFLOPs: 26.12 | +0: [2023-03-17 00:52:02,668] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[0.00019774496681175836, 0.00019774496681175836, 0.00019774496681175836], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 14000/ 173500 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.935374E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.191 | TFLOPs: 26.13 | +0: steps: 14000 loss: 3.9138 iter time (s): 0.153 samples/sec: 1668.593 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 14000 | lm loss value: 4.000070E+00 | lm loss PPL: 5.460198E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 14000 to checkpoints_44m91b100m +0: [2023-03-17 00:52:02,741] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is begin to save! +0: [2023-03-17 00:52:02,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:52:02,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:52:02,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:52:02,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:52:02,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:52:02,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:52:02,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:52:02,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:52:02,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:52:02,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:52:02,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:52:02,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:52:02,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:52:02,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:52:02,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:52:02,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:52:02,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:52:02,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:52:02,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:52:02,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:52:02,870] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step14000/mp_rank_00_model_states.pt +0: [2023-03-17 00:52:02,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:52:02,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:52:02,889] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:52:02,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:02,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:02,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:52:02,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:52:02,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:52:02,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:52:02,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:52:02,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:52:02,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:52:02,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:52:02,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: successfully saved checkpoint at iteration 14000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.81 +7: iteration 14010/ 173500 | consumed samples: 3586560 | consumed tokens: 7345274880 | elapsed time per iteration (s): 0.18 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.940016E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.583 | TFLOPs: 22.59 | +7: iteration 14020/ 173500 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.940024E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.877 | TFLOPs: 25.94 | +7: iteration 14030/ 173500 | consumed samples: 3591680 | consumed tokens: 7355760640 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.934124E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.601 | TFLOPs: 26.12 | +7: iteration 14040/ 173500 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.943771E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.061 | TFLOPs: 25.89 | +7: iteration 14050/ 173500 | consumed samples: 3596800 | consumed tokens: 7366246400 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.939526E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.346 | TFLOPs: 25.71 | +7: iteration 14060/ 173500 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.942378E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.049 | TFLOPs: 25.85 | +7: iteration 14070/ 173500 | consumed samples: 3601920 | consumed tokens: 7376732160 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.922932E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.017 | TFLOPs: 25.86 | +7: iteration 14080/ 173500 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.929910E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.736 | TFLOPs: 26.15 | +7: iteration 14090/ 173500 | consumed samples: 3607040 | consumed tokens: 7387217920 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.932227E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.733 | TFLOPs: 25.75 | +7: iteration 14100/ 173500 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.943651E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.418 | TFLOPs: 26.15 | +7: iteration 14110/ 173500 | consumed samples: 3612160 | consumed tokens: 7397703680 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.947393E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.507 | TFLOPs: 25.81 | +7: iteration 14120/ 173500 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.938959E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.986 | TFLOPs: 25.86 | +7: iteration 14130/ 173500 | consumed samples: 3617280 | consumed tokens: 7408189440 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.936974E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.328 | TFLOPs: 26.12 | +7: iteration 14140/ 173500 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.936292E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.974 | TFLOPs: 26.14 | +7: iteration 14150/ 173500 | consumed samples: 3622400 | consumed tokens: 7418675200 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.922199E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.268 | TFLOPs: 26.16 | +7: iteration 14160/ 173500 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.936311E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.673 | TFLOPs: 26.14 | +7: iteration 14170/ 173500 | consumed samples: 3627520 | consumed tokens: 7429160960 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.938260E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.762 | TFLOPs: 26.11 | +7: iteration 14180/ 173500 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.939394E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.937 | TFLOPs: 26.14 | +7: iteration 14190/ 173500 | consumed samples: 3632640 | consumed tokens: 7439646720 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.934050E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.996 | TFLOPs: 25.89 | +7: iteration 14200/ 173500 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.924087E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.658 | TFLOPs: 26.11 | +7: iteration 14210/ 173500 | consumed samples: 3637760 | consumed tokens: 7450132480 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.937837E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.934 | TFLOPs: 26.11 | +7: iteration 14220/ 173500 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.931371E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.669 | TFLOPs: 26.01 | +7: iteration 14230/ 173500 | consumed samples: 3642880 | consumed tokens: 7460618240 | elapsed time per iteration (s): 0.15 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.924915E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.454 | TFLOPs: 26.07 | +7: iteration 14240/ 173500 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.926050E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.209 | TFLOPs: 25.85 | +7: iteration 14250/ 173500 | consumed samples: 3648000 | consumed tokens: 7471104000 | elapsed time per iteration (s): 0.16 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 3.948038E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.754 | TFLOPs: 25.67 | +7: iteration 14260/ 173500 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.919603E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.744 | TFLOPs: 26.08 | +7: iteration 14270/ 173500 | consumed samples: 3653120 | consumed tokens: 7481589760 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.929932E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.378 | TFLOPs: 26.04 | +7: iteration 14280/ 173500 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.922545E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.559 | TFLOPs: 25.57 | +7: iteration 14290/ 173500 | consumed samples: 3658240 | consumed tokens: 7492075520 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.941244E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.101 | TFLOPs: 25.97 | +7: iteration 14300/ 173500 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.921735E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.356 | TFLOPs: 26.09 | +7: iteration 14310/ 173500 | consumed samples: 3663360 | consumed tokens: 7502561280 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.951213E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.535 | TFLOPs: 25.79 | +7: iteration 14320/ 173500 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.916580E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.566 | TFLOPs: 25.78 | +7: iteration 14330/ 173500 | consumed samples: 3668480 | consumed tokens: 7513047040 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.933843E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.965 | TFLOPs: 25.78 | +7: iteration 14340/ 173500 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.938636E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.837 | TFLOPs: 25.73 | +7: iteration 14350/ 173500 | consumed samples: 3673600 | consumed tokens: 7523532800 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.931106E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.826 | TFLOPs: 25.79 | +7: iteration 14360/ 173500 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.932642E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.263 | TFLOPs: 25.80 | +7: iteration 14370/ 173500 | consumed samples: 3678720 | consumed tokens: 7534018560 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.934888E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.672 | TFLOPs: 25.81 | +7: iteration 14380/ 173500 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.923343E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.133 | TFLOPs: 25.75 | +7: iteration 14390/ 173500 | consumed samples: 3683840 | consumed tokens: 7544504320 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.932812E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.351 | TFLOPs: 25.58 | +7: iteration 14400/ 173500 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.925225E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.505 | TFLOPs: 26.17 | +7: iteration 14410/ 173500 | consumed samples: 3688960 | consumed tokens: 7554990080 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.933081E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.975 | TFLOPs: 26.16 | +7: iteration 14420/ 173500 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.937807E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.781 | TFLOPs: 26.17 | +7: iteration 14430/ 173500 | consumed samples: 3694080 | consumed tokens: 7565475840 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.925422E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.930 | TFLOPs: 26.16 | +7: iteration 14440/ 173500 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.922935E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.513 | TFLOPs: 26.17 | +7: iteration 14450/ 173500 | consumed samples: 3699200 | consumed tokens: 7575961600 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.911962E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.252 | TFLOPs: 26.15 | +7: iteration 14460/ 173500 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.942316E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.986 | TFLOPs: 26.17 | +7: iteration 14470/ 173500 | consumed samples: 3704320 | consumed tokens: 7586447360 | elapsed time per iteration (s): 0.16 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.941395E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.282 | TFLOPs: 25.38 | +7: iteration 14480/ 173500 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.920319E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.063 | TFLOPs: 26.14 | +7: iteration 14490/ 173500 | consumed samples: 3709440 | consumed tokens: 7596933120 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.930197E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.119 | TFLOPs: 26.14 | +7: iteration 14500/ 173500 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.924079E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.503 | TFLOPs: 26.13 | +7: iteration 14510/ 173500 | consumed samples: 3714560 | consumed tokens: 7607418880 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.941177E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.306 | TFLOPs: 26.15 | +7: iteration 14520/ 173500 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 0.15 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 3.937159E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.420 | TFLOPs: 26.15 | +7: iteration 14530/ 173500 | consumed samples: 3719680 | consumed tokens: 7617904640 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.930550E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.006 | TFLOPs: 26.16 | +7: iteration 14540/ 173500 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.920304E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.475 | TFLOPs: 26.15 | +7: iteration 14550/ 173500 | consumed samples: 3724800 | consumed tokens: 7628390400 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.932256E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.906 | TFLOPs: 26.13 | +7: iteration 14560/ 173500 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.935147E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.756 | TFLOPs: 26.15 | +7: iteration 14570/ 173500 | consumed samples: 3729920 | consumed tokens: 7638876160 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.949799E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.236 | TFLOPs: 26.15 | +7: iteration 14580/ 173500 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.922716E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.850 | TFLOPs: 26.16 | +7: iteration 14590/ 173500 | consumed samples: 3735040 | consumed tokens: 7649361920 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.950190E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.644 | TFLOPs: 25.49 | +7: iteration 14600/ 173500 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.933362E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.814 | TFLOPs: 26.19 | +7: iteration 14610/ 173500 | consumed samples: 3740160 | consumed tokens: 7659847680 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.932991E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.876 | TFLOPs: 26.19 | +7: iteration 14620/ 173500 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.921736E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.981 | TFLOPs: 25.75 | +7: iteration 14630/ 173500 | consumed samples: 3745280 | consumed tokens: 7670333440 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.937482E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.077 | TFLOPs: 26.16 | +7: iteration 14640/ 173500 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.930206E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.834 | TFLOPs: 25.70 | +7: iteration 14650/ 173500 | consumed samples: 3750400 | consumed tokens: 7680819200 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.941409E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.182 | TFLOPs: 26.19 | +7: iteration 14660/ 173500 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.929768E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.055 | TFLOPs: 25.31 | +7: iteration 14670/ 173500 | consumed samples: 3755520 | consumed tokens: 7691304960 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.927546E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.163 | TFLOPs: 26.21 | +7: iteration 14680/ 173500 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 0.16 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.936269E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.794 | TFLOPs: 25.87 | +7: iteration 14690/ 173500 | consumed samples: 3760640 | consumed tokens: 7701790720 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.926248E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.095 | TFLOPs: 26.22 | +7: iteration 14700/ 173500 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.934475E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.448 | TFLOPs: 26.21 | +7: iteration 14710/ 173500 | consumed samples: 3765760 | consumed tokens: 7712276480 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.926453E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.627 | TFLOPs: 26.20 | +7: iteration 14720/ 173500 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.936430E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.458 | TFLOPs: 26.20 | +7: iteration 14730/ 173500 | consumed samples: 3770880 | consumed tokens: 7722762240 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.930309E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.194 | TFLOPs: 26.21 | +7: iteration 14740/ 173500 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.926488E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.515 | TFLOPs: 26.20 | +7: iteration 14750/ 173500 | consumed samples: 3776000 | consumed tokens: 7733248000 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.920881E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.890 | TFLOPs: 26.17 | +7: iteration 14760/ 173500 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.928311E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.923 | TFLOPs: 26.16 | +7: iteration 14770/ 173500 | consumed samples: 3781120 | consumed tokens: 7743733760 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.927363E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.731 | TFLOPs: 25.90 | +7: iteration 14780/ 173500 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 0.15 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 3.916738E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.723 | TFLOPs: 26.14 | +7: iteration 14790/ 173500 | consumed samples: 3786240 | consumed tokens: 7754219520 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.922535E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.210 | TFLOPs: 25.83 | +7: iteration 14800/ 173500 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.937926E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.375 | TFLOPs: 26.15 | +7: iteration 14810/ 173500 | consumed samples: 3791360 | consumed tokens: 7764705280 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.918738E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.229 | TFLOPs: 26.15 | +7: iteration 14820/ 173500 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.934854E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.724 | TFLOPs: 26.15 | +7: iteration 14830/ 173500 | consumed samples: 3796480 | consumed tokens: 7775191040 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.923055E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.652 | TFLOPs: 26.17 | +7: iteration 14840/ 173500 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.930086E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.065 | TFLOPs: 26.18 | +7: iteration 14850/ 173500 | consumed samples: 3801600 | consumed tokens: 7785676800 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.936818E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.612 | TFLOPs: 25.07 | +7: iteration 14860/ 173500 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.931491E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.294 | TFLOPs: 25.27 | +7: iteration 14870/ 173500 | consumed samples: 3806720 | consumed tokens: 7796162560 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.919571E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.800 | TFLOPs: 25.61 | +7: iteration 14880/ 173500 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.929684E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.999 | TFLOPs: 26.08 | +7: iteration 14890/ 173500 | consumed samples: 3811840 | consumed tokens: 7806648320 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.933247E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.123 | TFLOPs: 26.08 | +7: iteration 14900/ 173500 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.939151E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.948 | TFLOPs: 25.42 | +7: iteration 14910/ 173500 | consumed samples: 3816960 | consumed tokens: 7817134080 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.934811E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.172 | TFLOPs: 26.13 | +7: iteration 14920/ 173500 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.926297E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.637 | TFLOPs: 26.11 | +7: iteration 14930/ 173500 | consumed samples: 3822080 | consumed tokens: 7827619840 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.912779E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.298 | TFLOPs: 26.12 | +7: iteration 14940/ 173500 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.930076E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.624 | TFLOPs: 25.84 | +7: iteration 14950/ 173500 | consumed samples: 3827200 | consumed tokens: 7838105600 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.921643E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.400 | TFLOPs: 24.97 | +7: iteration 14960/ 173500 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.923338E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.139 | TFLOPs: 26.10 | +7: iteration 14970/ 173500 | consumed samples: 3832320 | consumed tokens: 7848591360 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.912946E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.072 | TFLOPs: 26.11 | +7: iteration 14980/ 173500 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 0.16 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.916824E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.882 | TFLOPs: 25.56 | +7: iteration 14990/ 173500 | consumed samples: 3837440 | consumed tokens: 7859077120 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.928578E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.767 | TFLOPs: 26.09 | +7: iteration 15000/ 173500 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.913956E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.436 | TFLOPs: 26.07 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 15000 | lm loss value: 3.987894E+00 | lm loss PPL: 5.394117E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 15000 to checkpoints_44m91b100m +0: [2023-03-17 00:54:37,550] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step15000 is begin to save! +0: [2023-03-17 00:54:37,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:54:37,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:54:37,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:54:37,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:54:37,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:54:37,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:54:37,634] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:54:37,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:54:37,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:54:37,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:54:37,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:54:37,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:54:37,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:54:37,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:54:37,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:54:37,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:54:37,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:54:37,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:54:37,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:54:37,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:54:37,684] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step15000/mp_rank_00_model_states.pt +0: [2023-03-17 00:54:37,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:54:37,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:54:37,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:54:37,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:54:37,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:54:37,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:54:37,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:54:37,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:54:37,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:54:37,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: successfully saved checkpoint at iteration 15000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.05 +7: iteration 15010/ 173500 | consumed samples: 3842560 | consumed tokens: 7869562880 | elapsed time per iteration (s): 0.18 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.935884E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.548 | TFLOPs: 22.01 | +7: iteration 15020/ 173500 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.920445E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.105 | TFLOPs: 26.11 | +7: iteration 15030/ 173500 | consumed samples: 3847680 | consumed tokens: 7880048640 | elapsed time per iteration (s): 0.15 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 3.926180E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.957 | TFLOPs: 26.14 | +7: iteration 15040/ 173500 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.925389E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.447 | TFLOPs: 26.17 | +7: iteration 15050/ 173500 | consumed samples: 3852800 | consumed tokens: 7890534400 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.913626E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.252 | TFLOPs: 26.21 | +7: iteration 15060/ 173500 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.920602E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.800 | TFLOPs: 25.42 | +7: iteration 15070/ 173500 | consumed samples: 3857920 | consumed tokens: 7901020160 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.917875E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.617 | TFLOPs: 25.85 | +7: iteration 15080/ 173500 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.920995E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.603 | TFLOPs: 26.17 | +7: iteration 15090/ 173500 | consumed samples: 3863040 | consumed tokens: 7911505920 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.911943E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.386 | TFLOPs: 26.18 | +7: iteration 15100/ 173500 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.929434E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.051 | TFLOPs: 26.19 | +7: iteration 15110/ 173500 | consumed samples: 3868160 | consumed tokens: 7921991680 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.924286E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.341 | TFLOPs: 26.16 | +7: iteration 15120/ 173500 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.920786E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.982 | TFLOPs: 25.37 | +7: iteration 15130/ 173500 | consumed samples: 3873280 | consumed tokens: 7932477440 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.933707E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.464 | TFLOPs: 26.20 | +7: iteration 15140/ 173500 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.916675E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.675 | TFLOPs: 26.18 | +7: iteration 15150/ 173500 | consumed samples: 3878400 | consumed tokens: 7942963200 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.915914E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.053 | TFLOPs: 26.14 | +7: iteration 15160/ 173500 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.905774E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.800 | TFLOPs: 25.10 | +7: iteration 15170/ 173500 | consumed samples: 3883520 | consumed tokens: 7953448960 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.921776E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.542 | TFLOPs: 26.15 | +7: iteration 15180/ 173500 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.923985E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.221 | TFLOPs: 26.08 | +7: iteration 15190/ 173500 | consumed samples: 3888640 | consumed tokens: 7963934720 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.913037E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.051 | TFLOPs: 26.07 | +7: iteration 15200/ 173500 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.932256E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.401 | TFLOPs: 25.68 | +7: iteration 15210/ 173500 | consumed samples: 3893760 | consumed tokens: 7974420480 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.924464E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.812 | TFLOPs: 26.08 | +7: iteration 15220/ 173500 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.915108E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.398 | TFLOPs: 25.44 | +7: iteration 15230/ 173500 | consumed samples: 3898880 | consumed tokens: 7984906240 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.921010E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.304 | TFLOPs: 26.08 | +7: iteration 15240/ 173500 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.910079E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.299 | TFLOPs: 26.10 | +7: iteration 15250/ 173500 | consumed samples: 3904000 | consumed tokens: 7995392000 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.913280E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.535 | TFLOPs: 26.09 | +7: iteration 15260/ 173500 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.918717E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.735 | TFLOPs: 26.08 | +7: iteration 15270/ 173500 | consumed samples: 3909120 | consumed tokens: 8005877760 | elapsed time per iteration (s): 0.15 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.920897E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.762 | TFLOPs: 26.04 | +7: iteration 15280/ 173500 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 0.16 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 3.916205E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.620 | TFLOPs: 25.48 | +7: iteration 15290/ 173500 | consumed samples: 3914240 | consumed tokens: 8016363520 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.923521E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.133 | TFLOPs: 25.77 | +7: iteration 15300/ 173500 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.940198E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.248 | TFLOPs: 25.83 | +7: iteration 15310/ 173500 | consumed samples: 3919360 | consumed tokens: 8026849280 | elapsed time per iteration (s): 0.16 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.910966E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.973 | TFLOPs: 25.77 | +7: iteration 15320/ 173500 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.928717E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.894 | TFLOPs: 26.09 | +7: iteration 15330/ 173500 | consumed samples: 3924480 | consumed tokens: 8037335040 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.928030E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.224 | TFLOPs: 26.13 | +7: iteration 15340/ 173500 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.920043E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.623 | TFLOPs: 26.12 | +7: iteration 15350/ 173500 | consumed samples: 3929600 | consumed tokens: 8047820800 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.911814E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.189 | TFLOPs: 26.15 | +7: iteration 15360/ 173500 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.910848E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.962 | TFLOPs: 26.14 | +7: iteration 15370/ 173500 | consumed samples: 3934720 | consumed tokens: 8058306560 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.917909E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.190 | TFLOPs: 26.18 | +7: iteration 15380/ 173500 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.931211E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.116 | TFLOPs: 26.18 | +7: iteration 15390/ 173500 | consumed samples: 3939840 | consumed tokens: 8068792320 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.918022E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.716 | TFLOPs: 26.19 | +7: iteration 15400/ 173500 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.917556E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.611 | TFLOPs: 26.14 | +7: iteration 15410/ 173500 | consumed samples: 3944960 | consumed tokens: 8079278080 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.921492E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.863 | TFLOPs: 26.17 | +7: iteration 15420/ 173500 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.914576E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.265 | TFLOPs: 26.16 | +7: iteration 15430/ 173500 | consumed samples: 3950080 | consumed tokens: 8089763840 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.918806E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.610 | TFLOPs: 26.17 | +7: iteration 15440/ 173500 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.912125E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.039 | TFLOPs: 26.16 | +7: iteration 15450/ 173500 | consumed samples: 3955200 | consumed tokens: 8100249600 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.922478E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.232 | TFLOPs: 26.16 | +7: iteration 15460/ 173500 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.905922E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.899 | TFLOPs: 26.19 | +7: iteration 15470/ 173500 | consumed samples: 3960320 | consumed tokens: 8110735360 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.918304E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.014 | TFLOPs: 26.14 | +7: iteration 15480/ 173500 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.911268E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.708 | TFLOPs: 26.12 | +7: iteration 15490/ 173500 | consumed samples: 3965440 | consumed tokens: 8121221120 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.919793E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.395 | TFLOPs: 26.15 | +7: iteration 15500/ 173500 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.908054E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.215 | TFLOPs: 26.13 | +7: iteration 15510/ 173500 | consumed samples: 3970560 | consumed tokens: 8131706880 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.914762E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.259 | TFLOPs: 26.13 | +7: iteration 15520/ 173500 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.904561E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.487 | TFLOPs: 26.09 | +7: iteration 15530/ 173500 | consumed samples: 3975680 | consumed tokens: 8142192640 | elapsed time per iteration (s): 0.15 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 3.920280E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.185 | TFLOPs: 26.15 | +7: iteration 15540/ 173500 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.929063E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.947 | TFLOPs: 25.45 | +7: iteration 15550/ 173500 | consumed samples: 3980800 | consumed tokens: 8152678400 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.906670E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.549 | TFLOPs: 26.14 | +7: iteration 15560/ 173500 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.918739E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.476 | TFLOPs: 26.13 | +7: iteration 15570/ 173500 | consumed samples: 3985920 | consumed tokens: 8163164160 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.921758E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.398 | TFLOPs: 26.09 | +7: iteration 15580/ 173500 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.930835E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.663 | TFLOPs: 26.11 | +7: iteration 15590/ 173500 | consumed samples: 3991040 | consumed tokens: 8173649920 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.917906E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.887 | TFLOPs: 26.13 | +7: iteration 15600/ 173500 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.918405E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.730 | TFLOPs: 26.12 | +7: iteration 15610/ 173500 | consumed samples: 3996160 | consumed tokens: 8184135680 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.920811E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.355 | TFLOPs: 26.12 | +7: iteration 15620/ 173500 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.903551E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.493 | TFLOPs: 26.13 | +7: iteration 15630/ 173500 | consumed samples: 4001280 | consumed tokens: 8194621440 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.911564E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.373 | TFLOPs: 26.04 | +7: iteration 15640/ 173500 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.921479E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.098 | TFLOPs: 26.11 | +7: iteration 15650/ 173500 | consumed samples: 4006400 | consumed tokens: 8205107200 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.899221E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.903 | TFLOPs: 26.09 | +7: iteration 15660/ 173500 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.924849E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.631 | TFLOPs: 26.12 | +7: iteration 15670/ 173500 | consumed samples: 4011520 | consumed tokens: 8215592960 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.929943E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.120 | TFLOPs: 26.10 | +7: iteration 15680/ 173500 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.897564E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.975 | TFLOPs: 26.13 | +7: iteration 15690/ 173500 | consumed samples: 4016640 | consumed tokens: 8226078720 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.941819E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.205 | TFLOPs: 26.16 | +7: iteration 15700/ 173500 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.911836E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.317 | TFLOPs: 25.49 | +7: iteration 15710/ 173500 | consumed samples: 4021760 | consumed tokens: 8236564480 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.918991E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.992 | TFLOPs: 25.50 | +7: iteration 15720/ 173500 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.906638E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.095 | TFLOPs: 26.16 | +7: iteration 15730/ 173500 | consumed samples: 4026880 | consumed tokens: 8247050240 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.926344E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.971 | TFLOPs: 26.16 | +7: iteration 15740/ 173500 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.915841E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.630 | TFLOPs: 26.15 | +7: iteration 15750/ 173500 | consumed samples: 4032000 | consumed tokens: 8257536000 | elapsed time per iteration (s): 0.15 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.910011E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.865 | TFLOPs: 26.17 | +7: iteration 15760/ 173500 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.910689E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.878 | TFLOPs: 25.61 | +7: iteration 15770/ 173500 | consumed samples: 4037120 | consumed tokens: 8268021760 | elapsed time per iteration (s): 0.16 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 3.912408E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.407 | TFLOPs: 25.60 | +7: iteration 15780/ 173500 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.925451E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.106 | TFLOPs: 26.19 | +7: iteration 15790/ 173500 | consumed samples: 4042240 | consumed tokens: 8278507520 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.915771E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.949 | TFLOPs: 26.13 | +7: iteration 15800/ 173500 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.905205E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.580 | TFLOPs: 26.12 | +7: iteration 15810/ 173500 | consumed samples: 4047360 | consumed tokens: 8288993280 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.915957E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.092 | TFLOPs: 26.18 | +7: iteration 15820/ 173500 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.913942E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.158 | TFLOPs: 26.19 | +7: iteration 15830/ 173500 | consumed samples: 4052480 | consumed tokens: 8299479040 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.910933E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.549 | TFLOPs: 26.10 | +7: iteration 15840/ 173500 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.921243E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.569 | TFLOPs: 26.12 | +7: iteration 15850/ 173500 | consumed samples: 4057600 | consumed tokens: 8309964800 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.917975E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.638 | TFLOPs: 26.14 | +7: iteration 15860/ 173500 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.916560E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.103 | TFLOPs: 26.14 | +7: iteration 15870/ 173500 | consumed samples: 4062720 | consumed tokens: 8320450560 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.904934E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.313 | TFLOPs: 26.18 | +7: iteration 15880/ 173500 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.918630E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.157 | TFLOPs: 26.15 | +7: iteration 15890/ 173500 | consumed samples: 4067840 | consumed tokens: 8330936320 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.926797E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.336 | TFLOPs: 26.16 | +7: iteration 15900/ 173500 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.908548E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.894 | TFLOPs: 26.16 | +7: iteration 15910/ 173500 | consumed samples: 4072960 | consumed tokens: 8341422080 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.900991E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.919 | TFLOPs: 26.16 | +7: iteration 15920/ 173500 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.890429E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.010 | TFLOPs: 26.17 | +7: iteration 15930/ 173500 | consumed samples: 4078080 | consumed tokens: 8351907840 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.925652E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.868 | TFLOPs: 26.17 | +7: iteration 15940/ 173500 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.922572E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.921 | TFLOPs: 26.14 | +7: iteration 15950/ 173500 | consumed samples: 4083200 | consumed tokens: 8362393600 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.903009E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.607 | TFLOPs: 26.17 | +7: iteration 15960/ 173500 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.910782E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.431 | TFLOPs: 26.10 | +7: iteration 15970/ 173500 | consumed samples: 4088320 | consumed tokens: 8372879360 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.907758E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.517 | TFLOPs: 26.15 | +7: iteration 15980/ 173500 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.898464E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.153 | TFLOPs: 26.15 | +7: iteration 15990/ 173500 | consumed samples: 4093440 | consumed tokens: 8383365120 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.902210E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.707 | TFLOPs: 26.15 | +0: [2023-03-17 00:57:11,902] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[0.00019695408064628468, 0.00019695408064628468, 0.00019695408064628468], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 16000/ 173500 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 0.15 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 3.913984E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.106 | TFLOPs: 26.16 | +0: steps: 16000 loss: 3.9357 iter time (s): 0.153 samples/sec: 1675.484 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 16000 | lm loss value: 4.020677E+00 | lm loss PPL: 5.573883E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 16000 to checkpoints_44m91b100m +0: [2023-03-17 00:57:11,976] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is begin to save! +0: [2023-03-17 00:57:11,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:57:12,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:57:12,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:57:12,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:57:12,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:57:12,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:57:12,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:57:12,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:57:12,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:57:12,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:57:12,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:57:12,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:57:12,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:57:12,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:57:12,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:57:12,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:57:12,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:57:12,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:57:12,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:57:12,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:57:12,111] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step16000/mp_rank_00_model_states.pt +0: [2023-03-17 00:57:12,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:57:12,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:57:12,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:57:12,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 00:57:12,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:57:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:57:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: successfully saved checkpoint at iteration 16000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.15 +7: iteration 16010/ 173500 | consumed samples: 4098560 | consumed tokens: 8393850880 | elapsed time per iteration (s): 0.18 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.909158E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.049 | TFLOPs: 22.55 | +7: iteration 16020/ 173500 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.916531E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.031 | TFLOPs: 26.17 | +7: iteration 16030/ 173500 | consumed samples: 4103680 | consumed tokens: 8404336640 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.910096E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.196 | TFLOPs: 26.18 | +7: iteration 16040/ 173500 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.915452E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.630 | TFLOPs: 26.17 | +7: iteration 16050/ 173500 | consumed samples: 4108800 | consumed tokens: 8414822400 | elapsed time per iteration (s): 0.16 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.913092E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.960 | TFLOPs: 25.84 | +7: iteration 16060/ 173500 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.906755E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.424 | TFLOPs: 26.20 | +7: iteration 16070/ 173500 | consumed samples: 4113920 | consumed tokens: 8425308160 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.908135E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.004 | TFLOPs: 26.19 | +7: iteration 16080/ 173500 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.909343E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.155 | TFLOPs: 26.18 | +7: iteration 16090/ 173500 | consumed samples: 4119040 | consumed tokens: 8435793920 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.904678E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.149 | TFLOPs: 26.13 | +7: iteration 16100/ 173500 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.917178E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.257 | TFLOPs: 26.18 | +7: iteration 16110/ 173500 | consumed samples: 4124160 | consumed tokens: 8446279680 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.907728E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.704 | TFLOPs: 26.19 | +7: iteration 16120/ 173500 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 0.16 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.904532E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.929 | TFLOPs: 25.80 | +7: iteration 16130/ 173500 | consumed samples: 4129280 | consumed tokens: 8456765440 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.896474E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.950 | TFLOPs: 26.11 | +7: iteration 16140/ 173500 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.915924E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.093 | TFLOPs: 26.16 | +7: iteration 16150/ 173500 | consumed samples: 4134400 | consumed tokens: 8467251200 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.909867E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.381 | TFLOPs: 26.15 | +7: iteration 16160/ 173500 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.902076E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.145 | TFLOPs: 26.15 | +7: iteration 16170/ 173500 | consumed samples: 4139520 | consumed tokens: 8477736960 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.908190E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.689 | TFLOPs: 26.06 | +7: iteration 16180/ 173500 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 0.16 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.903712E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.955 | TFLOPs: 25.80 | +7: iteration 16190/ 173500 | consumed samples: 4144640 | consumed tokens: 8488222720 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.906117E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.014 | TFLOPs: 26.17 | +7: iteration 16200/ 173500 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.897589E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.903 | TFLOPs: 25.95 | +7: iteration 16210/ 173500 | consumed samples: 4149760 | consumed tokens: 8498708480 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.905995E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.542 | TFLOPs: 26.17 | +7: iteration 16220/ 173500 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.898705E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.995 | TFLOPs: 26.17 | +7: iteration 16230/ 173500 | consumed samples: 4154880 | consumed tokens: 8509194240 | elapsed time per iteration (s): 0.16 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.905997E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.606 | TFLOPs: 25.79 | +7: iteration 16240/ 173500 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 0.15 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 3.914126E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.327 | TFLOPs: 26.16 | +7: iteration 16250/ 173500 | consumed samples: 4160000 | consumed tokens: 8519680000 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.909710E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.927 | TFLOPs: 25.62 | +7: iteration 16260/ 173500 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.901950E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.459 | TFLOPs: 26.09 | +7: iteration 16270/ 173500 | consumed samples: 4165120 | consumed tokens: 8530165760 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.914804E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.031 | TFLOPs: 26.10 | +7: iteration 16280/ 173500 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.901913E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.149 | TFLOPs: 26.10 | +7: iteration 16290/ 173500 | consumed samples: 4170240 | consumed tokens: 8540651520 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.897005E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.825 | TFLOPs: 26.09 | +7: iteration 16300/ 173500 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.922972E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.578 | TFLOPs: 26.10 | +7: iteration 16310/ 173500 | consumed samples: 4175360 | consumed tokens: 8551137280 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.908245E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.941 | TFLOPs: 26.09 | +7: iteration 16320/ 173500 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.907327E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.992 | TFLOPs: 26.11 | +7: iteration 16330/ 173500 | consumed samples: 4180480 | consumed tokens: 8561623040 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.907642E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.414 | TFLOPs: 26.09 | +7: iteration 16340/ 173500 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.912004E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.879 | TFLOPs: 26.11 | +7: iteration 16350/ 173500 | consumed samples: 4185600 | consumed tokens: 8572108800 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.907080E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.674 | TFLOPs: 26.09 | +7: iteration 16360/ 173500 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.902102E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.937 | TFLOPs: 25.69 | +7: iteration 16370/ 173500 | consumed samples: 4190720 | consumed tokens: 8582594560 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.902773E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.558 | TFLOPs: 26.06 | +7: iteration 16380/ 173500 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.913405E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.119 | TFLOPs: 25.75 | +7: iteration 16390/ 173500 | consumed samples: 4195840 | consumed tokens: 8593080320 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.916791E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.323 | TFLOPs: 26.07 | +7: iteration 16400/ 173500 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.918301E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.623 | TFLOPs: 26.07 | +7: iteration 16410/ 173500 | consumed samples: 4200960 | consumed tokens: 8603566080 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.902779E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.740 | TFLOPs: 26.08 | +7: iteration 16420/ 173500 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.886100E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.791 | TFLOPs: 26.06 | +7: iteration 16430/ 173500 | consumed samples: 4206080 | consumed tokens: 8614051840 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.906406E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.036 | TFLOPs: 26.06 | +7: iteration 16440/ 173500 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.899877E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.381 | TFLOPs: 26.07 | +7: iteration 16450/ 173500 | consumed samples: 4211200 | consumed tokens: 8624537600 | elapsed time per iteration (s): 0.15 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.896367E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.816 | TFLOPs: 26.03 | +7: iteration 16460/ 173500 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.910928E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.839 | TFLOPs: 25.64 | +7: iteration 16470/ 173500 | consumed samples: 4216320 | consumed tokens: 8635023360 | elapsed time per iteration (s): 0.16 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 3.906483E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.364 | TFLOPs: 25.69 | +7: iteration 16480/ 173500 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906847E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.058 | TFLOPs: 26.02 | +7: iteration 16490/ 173500 | consumed samples: 4221440 | consumed tokens: 8645509120 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.895634E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.231 | TFLOPs: 25.99 | +7: iteration 16500/ 173500 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906731E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.683 | TFLOPs: 26.03 | +7: iteration 16510/ 173500 | consumed samples: 4226560 | consumed tokens: 8655994880 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.899695E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.006 | TFLOPs: 26.11 | +7: iteration 16520/ 173500 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.894688E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.074 | TFLOPs: 26.10 | +7: iteration 16530/ 173500 | consumed samples: 4231680 | consumed tokens: 8666480640 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.904362E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.287 | TFLOPs: 26.18 | +7: iteration 16540/ 173500 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.893758E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.152 | TFLOPs: 26.24 | +7: iteration 16550/ 173500 | consumed samples: 4236800 | consumed tokens: 8676966400 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.898637E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.094 | TFLOPs: 26.25 | +7: iteration 16560/ 173500 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906059E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.202 | TFLOPs: 26.21 | +7: iteration 16570/ 173500 | consumed samples: 4241920 | consumed tokens: 8687452160 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.914468E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.733 | TFLOPs: 26.14 | +7: iteration 16580/ 173500 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.912661E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.214 | TFLOPs: 26.15 | +7: iteration 16590/ 173500 | consumed samples: 4247040 | consumed tokens: 8697937920 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.911721E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.567 | TFLOPs: 26.18 | +7: iteration 16600/ 173500 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.899592E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.173 | TFLOPs: 26.21 | +7: iteration 16610/ 173500 | consumed samples: 4252160 | consumed tokens: 8708423680 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.909437E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.098 | TFLOPs: 26.24 | +7: iteration 16620/ 173500 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.896521E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.497 | TFLOPs: 26.23 | +7: iteration 16630/ 173500 | consumed samples: 4257280 | consumed tokens: 8718909440 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906499E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.664 | TFLOPs: 26.22 | +7: iteration 16640/ 173500 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.905260E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.904 | TFLOPs: 26.24 | +7: iteration 16650/ 173500 | consumed samples: 4262400 | consumed tokens: 8729395200 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906844E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.291 | TFLOPs: 26.08 | +7: iteration 16660/ 173500 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.910600E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.123 | TFLOPs: 26.14 | +7: iteration 16670/ 173500 | consumed samples: 4267520 | consumed tokens: 8739880960 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.906799E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.737 | TFLOPs: 26.14 | +7: iteration 16680/ 173500 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.903686E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.303 | TFLOPs: 26.19 | +7: iteration 16690/ 173500 | consumed samples: 4272640 | consumed tokens: 8750366720 | elapsed time per iteration (s): 0.15 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 3.893005E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.817 | TFLOPs: 26.20 | +7: iteration 16700/ 173500 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.907109E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.076 | TFLOPs: 26.18 | +7: iteration 16710/ 173500 | consumed samples: 4277760 | consumed tokens: 8760852480 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.891032E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.966 | TFLOPs: 26.16 | +7: iteration 16720/ 173500 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.911937E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.959 | TFLOPs: 25.88 | +7: iteration 16730/ 173500 | consumed samples: 4282880 | consumed tokens: 8771338240 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.903642E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.319 | TFLOPs: 26.16 | +7: iteration 16740/ 173500 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.910284E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.810 | TFLOPs: 26.16 | +7: iteration 16750/ 173500 | consumed samples: 4288000 | consumed tokens: 8781824000 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.898286E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.095 | TFLOPs: 26.11 | +7: iteration 16760/ 173500 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.899505E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.393 | TFLOPs: 25.73 | +7: iteration 16770/ 173500 | consumed samples: 4293120 | consumed tokens: 8792309760 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.905991E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.638 | TFLOPs: 25.82 | +7: iteration 16780/ 173500 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.903530E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.842 | TFLOPs: 26.09 | +7: iteration 16790/ 173500 | consumed samples: 4298240 | consumed tokens: 8802795520 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.907937E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.102 | TFLOPs: 26.11 | +7: iteration 16800/ 173500 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.912312E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.625 | TFLOPs: 26.17 | +7: iteration 16810/ 173500 | consumed samples: 4303360 | consumed tokens: 8813281280 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.900598E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.083 | TFLOPs: 26.21 | +7: iteration 16820/ 173500 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.893468E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.923 | TFLOPs: 26.19 | +7: iteration 16830/ 173500 | consumed samples: 4308480 | consumed tokens: 8823767040 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.900110E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.925 | TFLOPs: 26.00 | +7: iteration 16840/ 173500 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.906225E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.286 | TFLOPs: 26.19 | +7: iteration 16850/ 173500 | consumed samples: 4313600 | consumed tokens: 8834252800 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.883930E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.401 | TFLOPs: 26.21 | +7: iteration 16860/ 173500 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.899930E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.989 | TFLOPs: 26.17 | +7: iteration 16870/ 173500 | consumed samples: 4318720 | consumed tokens: 8844738560 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.912397E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.806 | TFLOPs: 26.12 | +7: iteration 16880/ 173500 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.896082E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.696 | TFLOPs: 26.14 | +7: iteration 16890/ 173500 | consumed samples: 4323840 | consumed tokens: 8855224320 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.895215E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.389 | TFLOPs: 25.58 | +7: iteration 16900/ 173500 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.898575E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.571 | TFLOPs: 26.17 | +7: iteration 16910/ 173500 | consumed samples: 4328960 | consumed tokens: 8865710080 | elapsed time per iteration (s): 0.16 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.896894E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.664 | TFLOPs: 25.20 | +7: iteration 16920/ 173500 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 0.15 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 3.895596E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.568 | TFLOPs: 26.09 | +7: iteration 16930/ 173500 | consumed samples: 4334080 | consumed tokens: 8876195840 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.897960E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.732 | TFLOPs: 26.09 | +7: iteration 16940/ 173500 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.894640E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.555 | TFLOPs: 26.10 | +7: iteration 16950/ 173500 | consumed samples: 4339200 | consumed tokens: 8886681600 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.906691E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.935 | TFLOPs: 26.09 | +7: iteration 16960/ 173500 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.904238E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.201 | TFLOPs: 26.13 | +7: iteration 16970/ 173500 | consumed samples: 4344320 | consumed tokens: 8897167360 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.893734E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.673 | TFLOPs: 26.11 | +7: iteration 16980/ 173500 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.901320E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.531 | TFLOPs: 26.14 | +7: iteration 16990/ 173500 | consumed samples: 4349440 | consumed tokens: 8907653120 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.909631E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.437 | TFLOPs: 26.12 | +7: iteration 17000/ 173500 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.895542E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.794 | TFLOPs: 26.16 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 17000 | lm loss value: 3.964372E+00 | lm loss PPL: 5.268718E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 17000 to checkpoints_44m91b100m +0: [2023-03-17 00:59:46,188] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step17000 is begin to save! +0: [2023-03-17 00:59:46,192] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:59:46,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:59:46,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:59:46,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:59:46,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:59:46,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:59:46,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:59:46,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:59:46,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:59:46,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:59:46,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:59:46,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:59:46,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:59:46,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:59:46,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:59:46,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:59:46,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:59:46,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:59:46,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:59:46,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:59:46,322] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step17000/mp_rank_00_model_states.pt +0: [2023-03-17 00:59:46,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:59:46,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:59:46,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:59:46,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,361] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,361] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,362] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,363] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,363] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,364] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:59:46,364] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:59:46,365] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:59:46,365] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:59:46,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:59:46,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:59:46,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:59:46,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:59:46,367] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:59:46,367] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: successfully saved checkpoint at iteration 17000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.17 +7: iteration 17010/ 173500 | consumed samples: 4354560 | consumed tokens: 8918138880 | elapsed time per iteration (s): 0.18 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.894181E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.442 | TFLOPs: 22.75 | +7: iteration 17020/ 173500 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.890706E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.603 | TFLOPs: 26.20 | +7: iteration 17030/ 173500 | consumed samples: 4359680 | consumed tokens: 8928624640 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.889891E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.857 | TFLOPs: 26.20 | +7: iteration 17040/ 173500 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.898576E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.507 | TFLOPs: 26.21 | +7: iteration 17050/ 173500 | consumed samples: 4364800 | consumed tokens: 8939110400 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.896058E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.929 | TFLOPs: 26.09 | +7: iteration 17060/ 173500 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.910199E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.895 | TFLOPs: 26.06 | +7: iteration 17070/ 173500 | consumed samples: 4369920 | consumed tokens: 8949596160 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.898294E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.508 | TFLOPs: 26.07 | +7: iteration 17080/ 173500 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 0.16 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.902750E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.119 | TFLOPs: 25.38 | +7: iteration 17090/ 173500 | consumed samples: 4375040 | consumed tokens: 8960081920 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.900597E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.141 | TFLOPs: 26.02 | +7: iteration 17100/ 173500 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.900980E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.150 | TFLOPs: 26.08 | +7: iteration 17110/ 173500 | consumed samples: 4380160 | consumed tokens: 8970567680 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.891454E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.561 | TFLOPs: 26.07 | +7: iteration 17120/ 173500 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.893401E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.287 | TFLOPs: 26.01 | +7: iteration 17130/ 173500 | consumed samples: 4385280 | consumed tokens: 8981053440 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.901437E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.564 | TFLOPs: 26.01 | +7: iteration 17140/ 173500 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 0.15 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 3.904264E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.638 | TFLOPs: 25.98 | +7: iteration 17150/ 173500 | consumed samples: 4390400 | consumed tokens: 8991539200 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.904499E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.835 | TFLOPs: 26.00 | +7: iteration 17160/ 173500 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.898096E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.706 | TFLOPs: 26.00 | +7: iteration 17170/ 173500 | consumed samples: 4395520 | consumed tokens: 9002024960 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.910487E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.762 | TFLOPs: 26.00 | +7: iteration 17180/ 173500 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.891978E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.923 | TFLOPs: 26.02 | +7: iteration 17190/ 173500 | consumed samples: 4400640 | consumed tokens: 9012510720 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.893195E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.601 | TFLOPs: 26.01 | +7: iteration 17200/ 173500 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 0.16 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.900793E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.253 | TFLOPs: 25.86 | +7: iteration 17210/ 173500 | consumed samples: 4405760 | consumed tokens: 9022996480 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.905728E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.993 | TFLOPs: 26.08 | +7: iteration 17220/ 173500 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.898316E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.512 | TFLOPs: 26.09 | +7: iteration 17230/ 173500 | consumed samples: 4410880 | consumed tokens: 9033482240 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.902692E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.681 | TFLOPs: 26.09 | +7: iteration 17240/ 173500 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.888808E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.507 | TFLOPs: 26.09 | +7: iteration 17250/ 173500 | consumed samples: 4416000 | consumed tokens: 9043968000 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.900763E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.814 | TFLOPs: 26.08 | +7: iteration 17260/ 173500 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.896021E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.654 | TFLOPs: 26.03 | +7: iteration 17270/ 173500 | consumed samples: 4421120 | consumed tokens: 9054453760 | elapsed time per iteration (s): 0.16 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.889420E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.794 | TFLOPs: 25.75 | +7: iteration 17280/ 173500 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.894581E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.327 | TFLOPs: 26.07 | +7: iteration 17290/ 173500 | consumed samples: 4426240 | consumed tokens: 9064939520 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.898225E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.546 | TFLOPs: 26.07 | +7: iteration 17300/ 173500 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.884201E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.557 | TFLOPs: 26.09 | +7: iteration 17310/ 173500 | consumed samples: 4431360 | consumed tokens: 9075425280 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.876217E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.506 | TFLOPs: 26.09 | +7: iteration 17320/ 173500 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.902027E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.265 | TFLOPs: 26.10 | +7: iteration 17330/ 173500 | consumed samples: 4436480 | consumed tokens: 9085911040 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.893949E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.751 | TFLOPs: 26.08 | +7: iteration 17340/ 173500 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 0.16 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.891799E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.274 | TFLOPs: 24.83 | +7: iteration 17350/ 173500 | consumed samples: 4441600 | consumed tokens: 9096396800 | elapsed time per iteration (s): 0.15 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 3.896710E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.754 | TFLOPs: 26.14 | +7: iteration 17360/ 173500 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.891788E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.444 | TFLOPs: 26.13 | +7: iteration 17370/ 173500 | consumed samples: 4446720 | consumed tokens: 9106882560 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.908955E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.509 | TFLOPs: 26.14 | +7: iteration 17380/ 173500 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.884608E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.926 | TFLOPs: 26.14 | +7: iteration 17390/ 173500 | consumed samples: 4451840 | consumed tokens: 9117368320 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.903511E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.069 | TFLOPs: 26.13 | +7: iteration 17400/ 173500 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.891843E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.336 | TFLOPs: 26.12 | +7: iteration 17410/ 173500 | consumed samples: 4456960 | consumed tokens: 9127854080 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.883501E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.399 | TFLOPs: 26.09 | +7: iteration 17420/ 173500 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.884403E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.774 | TFLOPs: 26.06 | +7: iteration 17430/ 173500 | consumed samples: 4462080 | consumed tokens: 9138339840 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.884190E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.337 | TFLOPs: 26.07 | +7: iteration 17440/ 173500 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.892392E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.272 | TFLOPs: 26.08 | +7: iteration 17450/ 173500 | consumed samples: 4467200 | consumed tokens: 9148825600 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.888308E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.412 | TFLOPs: 26.09 | +7: iteration 17460/ 173500 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.887605E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.155 | TFLOPs: 25.77 | +7: iteration 17470/ 173500 | consumed samples: 4472320 | consumed tokens: 9159311360 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.905133E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.213 | TFLOPs: 26.11 | +7: iteration 17480/ 173500 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.886849E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.502 | TFLOPs: 26.10 | +7: iteration 17490/ 173500 | consumed samples: 4477440 | consumed tokens: 9169797120 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.909473E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.244 | TFLOPs: 26.10 | +7: iteration 17500/ 173500 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.897081E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.598 | TFLOPs: 26.11 | +7: iteration 17510/ 173500 | consumed samples: 4482560 | consumed tokens: 9180282880 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.903682E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.404 | TFLOPs: 26.10 | +7: iteration 17520/ 173500 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.891822E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.552 | TFLOPs: 26.09 | +7: iteration 17530/ 173500 | consumed samples: 4487680 | consumed tokens: 9190768640 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.908585E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.429 | TFLOPs: 26.09 | +7: iteration 17540/ 173500 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 0.17 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.894566E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.615 | TFLOPs: 23.88 | +7: iteration 17550/ 173500 | consumed samples: 4492800 | consumed tokens: 9201254400 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.902397E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.576 | TFLOPs: 25.71 | +7: iteration 17560/ 173500 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 0.16 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.891656E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.166 | TFLOPs: 25.89 | +7: iteration 17570/ 173500 | consumed samples: 4497920 | consumed tokens: 9211740160 | elapsed time per iteration (s): 0.15 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 3.884856E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.949 | TFLOPs: 26.11 | +7: iteration 17580/ 173500 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.880876E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.481 | TFLOPs: 26.07 | +7: iteration 17590/ 173500 | consumed samples: 4503040 | consumed tokens: 9222225920 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.891793E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.504 | TFLOPs: 26.12 | +7: iteration 17600/ 173500 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.883821E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.148 | TFLOPs: 26.05 | +7: iteration 17610/ 173500 | consumed samples: 4508160 | consumed tokens: 9232711680 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.900217E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.159 | TFLOPs: 26.10 | +7: iteration 17620/ 173500 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.886543E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.308 | TFLOPs: 26.07 | +7: iteration 17630/ 173500 | consumed samples: 4513280 | consumed tokens: 9243197440 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.893339E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.304 | TFLOPs: 26.08 | +7: iteration 17640/ 173500 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.897031E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.524 | TFLOPs: 26.07 | +7: iteration 17650/ 173500 | consumed samples: 4518400 | consumed tokens: 9253683200 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.900744E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.195 | TFLOPs: 26.11 | +7: iteration 17660/ 173500 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.893904E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.249 | TFLOPs: 26.10 | +7: iteration 17670/ 173500 | consumed samples: 4523520 | consumed tokens: 9264168960 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.888095E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.712 | TFLOPs: 26.11 | +7: iteration 17680/ 173500 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.897078E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.985 | TFLOPs: 26.10 | +7: iteration 17690/ 173500 | consumed samples: 4528640 | consumed tokens: 9274654720 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.894519E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.872 | TFLOPs: 26.11 | +7: iteration 17700/ 173500 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.891172E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.592 | TFLOPs: 26.09 | +7: iteration 17710/ 173500 | consumed samples: 4533760 | consumed tokens: 9285140480 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.877831E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.947 | TFLOPs: 26.08 | +7: iteration 17720/ 173500 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.896358E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.408 | TFLOPs: 26.09 | +7: iteration 17730/ 173500 | consumed samples: 4538880 | consumed tokens: 9295626240 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.892113E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.793 | TFLOPs: 26.09 | +7: iteration 17740/ 173500 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.890085E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.716 | TFLOPs: 26.08 | +7: iteration 17750/ 173500 | consumed samples: 4544000 | consumed tokens: 9306112000 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.895949E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.785 | TFLOPs: 26.06 | +7: iteration 17760/ 173500 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.878161E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.816 | TFLOPs: 26.08 | +7: iteration 17770/ 173500 | consumed samples: 4549120 | consumed tokens: 9316597760 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.879640E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.698 | TFLOPs: 26.08 | +7: iteration 17780/ 173500 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 0.15 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 3.897215E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.161 | TFLOPs: 26.10 | +7: iteration 17790/ 173500 | consumed samples: 4554240 | consumed tokens: 9327083520 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.897409E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.487 | TFLOPs: 26.06 | +7: iteration 17800/ 173500 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.876991E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.081 | TFLOPs: 26.10 | +7: iteration 17810/ 173500 | consumed samples: 4559360 | consumed tokens: 9337569280 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.890511E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.276 | TFLOPs: 26.08 | +7: iteration 17820/ 173500 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.902484E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.676 | TFLOPs: 26.11 | +7: iteration 17830/ 173500 | consumed samples: 4564480 | consumed tokens: 9348055040 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.895573E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.141 | TFLOPs: 26.11 | +7: iteration 17840/ 173500 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.894361E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.745 | TFLOPs: 26.09 | +7: iteration 17850/ 173500 | consumed samples: 4569600 | consumed tokens: 9358540800 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.879662E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.386 | TFLOPs: 26.12 | +7: iteration 17860/ 173500 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.897439E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.143 | TFLOPs: 26.13 | +7: iteration 17870/ 173500 | consumed samples: 4574720 | consumed tokens: 9369026560 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.875320E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.359 | TFLOPs: 26.12 | +7: iteration 17880/ 173500 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.892000E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.900 | TFLOPs: 26.11 | +7: iteration 17890/ 173500 | consumed samples: 4579840 | consumed tokens: 9379512320 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.896140E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.692 | TFLOPs: 26.12 | +7: iteration 17900/ 173500 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.902662E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.509 | TFLOPs: 26.10 | +7: iteration 17910/ 173500 | consumed samples: 4584960 | consumed tokens: 9389998080 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.888762E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.439 | TFLOPs: 26.10 | +7: iteration 17920/ 173500 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.883420E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.002 | TFLOPs: 26.11 | +7: iteration 17930/ 173500 | consumed samples: 4590080 | consumed tokens: 9400483840 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.889439E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.462 | TFLOPs: 26.12 | +7: iteration 17940/ 173500 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 0.16 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.900987E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.008 | TFLOPs: 25.70 | +7: iteration 17950/ 173500 | consumed samples: 4595200 | consumed tokens: 9410969600 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.879986E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.872 | TFLOPs: 26.09 | +7: iteration 17960/ 173500 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.893263E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.967 | TFLOPs: 26.08 | +7: iteration 17970/ 173500 | consumed samples: 4600320 | consumed tokens: 9421455360 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.893417E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.722 | TFLOPs: 26.11 | +7: iteration 17980/ 173500 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.900932E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.098 | TFLOPs: 26.11 | +7: iteration 17990/ 173500 | consumed samples: 4605440 | consumed tokens: 9431941120 | elapsed time per iteration (s): 0.15 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 3.885209E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.927 | TFLOPs: 26.09 | +0: [2023-03-17 01:02:20,623] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[0.00019604685446348677, 0.00019604685446348677, 0.00019604685446348677], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 18000/ 173500 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.869993E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.109 | TFLOPs: 26.10 | +0: steps: 18000 loss: 3.8677 iter time (s): 0.153 samples/sec: 1676.887 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 18000 | lm loss value: 3.965241E+00 | lm loss PPL: 5.273296E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 18000 to checkpoints_44m91b100m +0: [2023-03-17 01:02:20,698] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is begin to save! +0: [2023-03-17 01:02:20,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:02:20,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:02:20,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:02:20,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:02:20,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:02:20,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:02:20,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:02:20,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:02:20,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:02:20,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:02:20,795] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:02:20,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:02:20,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:02:20,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:02:20,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:02:20,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:02:20,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:02:20,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:02:20,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:02:20,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:02:20,829] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step18000/mp_rank_00_model_states.pt +0: [2023-03-17 01:02:20,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:02:20,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:02:20,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:02:20,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:02:20,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 01:02:20,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 01:02:20,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: successfully saved checkpoint at iteration 18000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.07 +7: iteration 18010/ 173500 | consumed samples: 4610560 | consumed tokens: 9442426880 | elapsed time per iteration (s): 0.18 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.877628E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.348 | TFLOPs: 22.54 | +7: iteration 18020/ 173500 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.894088E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.599 | TFLOPs: 26.12 | +7: iteration 18030/ 173500 | consumed samples: 4615680 | consumed tokens: 9452912640 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.886298E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.572 | TFLOPs: 26.10 | +7: iteration 18040/ 173500 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.880351E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.088 | TFLOPs: 26.13 | +7: iteration 18050/ 173500 | consumed samples: 4620800 | consumed tokens: 9463398400 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.888864E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.526 | TFLOPs: 26.10 | +7: iteration 18060/ 173500 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.878887E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.100 | TFLOPs: 26.11 | +7: iteration 18070/ 173500 | consumed samples: 4625920 | consumed tokens: 9473884160 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.887801E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.966 | TFLOPs: 26.11 | +7: iteration 18080/ 173500 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.886649E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.597 | TFLOPs: 26.12 | +7: iteration 18090/ 173500 | consumed samples: 4631040 | consumed tokens: 9484369920 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.881773E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.370 | TFLOPs: 26.12 | +7: iteration 18100/ 173500 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.891376E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.545 | TFLOPs: 26.12 | +7: iteration 18110/ 173500 | consumed samples: 4636160 | consumed tokens: 9494855680 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.900435E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.922 | TFLOPs: 26.13 | +7: iteration 18120/ 173500 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.899509E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.113 | TFLOPs: 26.13 | +7: iteration 18130/ 173500 | consumed samples: 4641280 | consumed tokens: 9505341440 | elapsed time per iteration (s): 0.16 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.887271E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.716 | TFLOPs: 25.75 | +7: iteration 18140/ 173500 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.872317E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.375 | TFLOPs: 26.12 | +7: iteration 18150/ 173500 | consumed samples: 4646400 | consumed tokens: 9515827200 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.870642E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.917 | TFLOPs: 26.11 | +7: iteration 18160/ 173500 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.878465E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.304 | TFLOPs: 26.12 | +7: iteration 18170/ 173500 | consumed samples: 4651520 | consumed tokens: 9526312960 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.885334E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.275 | TFLOPs: 26.12 | +7: iteration 18180/ 173500 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.887885E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.773 | TFLOPs: 26.12 | +7: iteration 18190/ 173500 | consumed samples: 4656640 | consumed tokens: 9536798720 | elapsed time per iteration (s): 0.15 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 3.888657E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.428 | TFLOPs: 26.12 | +7: iteration 18200/ 173500 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 0.16 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.893231E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.203 | TFLOPs: 25.75 | +7: iteration 18210/ 173500 | consumed samples: 4661760 | consumed tokens: 9547284480 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.878115E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.421 | TFLOPs: 26.12 | +7: iteration 18220/ 173500 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.876294E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.880 | TFLOPs: 26.14 | +7: iteration 18230/ 173500 | consumed samples: 4666880 | consumed tokens: 9557770240 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.883348E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.579 | TFLOPs: 26.14 | +7: iteration 18240/ 173500 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.890881E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.404 | TFLOPs: 26.13 | +7: iteration 18250/ 173500 | consumed samples: 4672000 | consumed tokens: 9568256000 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.894887E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.721 | TFLOPs: 26.11 | +7: iteration 18260/ 173500 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.885281E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.584 | TFLOPs: 26.07 | +7: iteration 18270/ 173500 | consumed samples: 4677120 | consumed tokens: 9578741760 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.894884E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.211 | TFLOPs: 26.11 | +7: iteration 18280/ 173500 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.894021E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.958 | TFLOPs: 26.11 | +7: iteration 18290/ 173500 | consumed samples: 4682240 | consumed tokens: 9589227520 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.886772E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.431 | TFLOPs: 26.13 | +7: iteration 18300/ 173500 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.871184E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.518 | TFLOPs: 26.14 | +7: iteration 18310/ 173500 | consumed samples: 4687360 | consumed tokens: 9599713280 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.881334E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.872 | TFLOPs: 26.13 | +7: iteration 18320/ 173500 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.873341E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.921 | TFLOPs: 26.13 | +7: iteration 18330/ 173500 | consumed samples: 4692480 | consumed tokens: 9610199040 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.895434E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.669 | TFLOPs: 26.12 | +7: iteration 18340/ 173500 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.880041E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.966 | TFLOPs: 26.13 | +7: iteration 18350/ 173500 | consumed samples: 4697600 | consumed tokens: 9620684800 | elapsed time per iteration (s): 0.16 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.891215E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.930 | TFLOPs: 25.69 | +7: iteration 18360/ 173500 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 0.16 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.886414E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.200 | TFLOPs: 25.74 | +7: iteration 18370/ 173500 | consumed samples: 4702720 | consumed tokens: 9631170560 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.883615E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.605 | TFLOPs: 26.09 | +7: iteration 18380/ 173500 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.892712E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.595 | TFLOPs: 26.09 | +7: iteration 18390/ 173500 | consumed samples: 4707840 | consumed tokens: 9641656320 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.879020E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.341 | TFLOPs: 26.07 | +7: iteration 18400/ 173500 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 0.15 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 3.884319E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.121 | TFLOPs: 26.08 | +7: iteration 18410/ 173500 | consumed samples: 4712960 | consumed tokens: 9652142080 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.872795E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.834 | TFLOPs: 26.09 | +7: iteration 18420/ 173500 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.879492E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.599 | TFLOPs: 26.09 | +7: iteration 18430/ 173500 | consumed samples: 4718080 | consumed tokens: 9662627840 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.887290E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.637 | TFLOPs: 26.09 | +7: iteration 18440/ 173500 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.888207E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.333 | TFLOPs: 26.10 | +7: iteration 18450/ 173500 | consumed samples: 4723200 | consumed tokens: 9673113600 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.885838E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.888 | TFLOPs: 26.08 | +7: iteration 18460/ 173500 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.896844E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.061 | TFLOPs: 26.10 | +7: iteration 18470/ 173500 | consumed samples: 4728320 | consumed tokens: 9683599360 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.883274E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.392 | TFLOPs: 26.10 | +7: iteration 18480/ 173500 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 0.16 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.876433E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.687 | TFLOPs: 25.48 | +7: iteration 18490/ 173500 | consumed samples: 4733440 | consumed tokens: 9694085120 | elapsed time per iteration (s): 0.16 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.882071E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.920 | TFLOPs: 25.70 | +7: iteration 18500/ 173500 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.880542E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.107 | TFLOPs: 26.10 | +7: iteration 18510/ 173500 | consumed samples: 4738560 | consumed tokens: 9704570880 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.882149E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.081 | TFLOPs: 26.11 | +7: iteration 18520/ 173500 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.904335E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.057 | TFLOPs: 26.13 | +7: iteration 18530/ 173500 | consumed samples: 4743680 | consumed tokens: 9715056640 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.895057E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.195 | TFLOPs: 26.13 | +7: iteration 18540/ 173500 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.884712E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.809 | TFLOPs: 26.11 | +7: iteration 18550/ 173500 | consumed samples: 4748800 | consumed tokens: 9725542400 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.878715E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.995 | TFLOPs: 26.13 | +7: iteration 18560/ 173500 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.888781E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.034 | TFLOPs: 26.13 | +7: iteration 18570/ 173500 | consumed samples: 4753920 | consumed tokens: 9736028160 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.885185E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.525 | TFLOPs: 26.10 | +7: iteration 18580/ 173500 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.872320E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.344 | TFLOPs: 26.12 | +7: iteration 18590/ 173500 | consumed samples: 4759040 | consumed tokens: 9746513920 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.878278E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.813 | TFLOPs: 26.11 | +7: iteration 18600/ 173500 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 0.15 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 3.884029E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.255 | TFLOPs: 26.12 | +7: iteration 18610/ 173500 | consumed samples: 4764160 | consumed tokens: 9756999680 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.891721E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.504 | TFLOPs: 26.12 | +7: iteration 18620/ 173500 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.877874E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.121 | TFLOPs: 26.14 | +7: iteration 18630/ 173500 | consumed samples: 4769280 | consumed tokens: 9767485440 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.891000E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.748 | TFLOPs: 26.17 | +7: iteration 18640/ 173500 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 0.16 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.870598E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.700 | TFLOPs: 25.73 | +7: iteration 18650/ 173500 | consumed samples: 4774400 | consumed tokens: 9777971200 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.887154E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.307 | TFLOPs: 26.18 | +7: iteration 18660/ 173500 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.885989E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.812 | TFLOPs: 26.17 | +7: iteration 18670/ 173500 | consumed samples: 4779520 | consumed tokens: 9788456960 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.886678E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.616 | TFLOPs: 26.14 | +7: iteration 18680/ 173500 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.878388E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.799 | TFLOPs: 26.16 | +7: iteration 18690/ 173500 | consumed samples: 4784640 | consumed tokens: 9798942720 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.882170E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.438 | TFLOPs: 26.17 | +7: iteration 18700/ 173500 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.868654E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.544 | TFLOPs: 26.15 | +7: iteration 18710/ 173500 | consumed samples: 4789760 | consumed tokens: 9809428480 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.876585E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.132 | TFLOPs: 26.11 | +7: iteration 18720/ 173500 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.881908E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.439 | TFLOPs: 26.15 | +7: iteration 18730/ 173500 | consumed samples: 4794880 | consumed tokens: 9819914240 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.884421E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.353 | TFLOPs: 26.16 | +7: iteration 18740/ 173500 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.880934E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.246 | TFLOPs: 26.15 | +7: iteration 18750/ 173500 | consumed samples: 4800000 | consumed tokens: 9830400000 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.869223E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.188 | TFLOPs: 26.13 | +7: iteration 18760/ 173500 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.881301E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.358 | TFLOPs: 26.15 | +7: iteration 18770/ 173500 | consumed samples: 4805120 | consumed tokens: 9840885760 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.873835E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.009 | TFLOPs: 26.14 | +7: iteration 18780/ 173500 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.890937E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.827 | TFLOPs: 26.14 | +7: iteration 18790/ 173500 | consumed samples: 4810240 | consumed tokens: 9851371520 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.876428E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.811 | TFLOPs: 26.12 | +7: iteration 18800/ 173500 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 0.15 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 3.873267E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.184 | TFLOPs: 26.13 | +7: iteration 18810/ 173500 | consumed samples: 4815360 | consumed tokens: 9861857280 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.873854E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.737 | TFLOPs: 26.15 | +7: iteration 18820/ 173500 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.876484E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.792 | TFLOPs: 26.17 | +7: iteration 18830/ 173500 | consumed samples: 4820480 | consumed tokens: 9872343040 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.878319E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.820 | TFLOPs: 26.17 | +7: iteration 18840/ 173500 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 0.16 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.878542E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.887 | TFLOPs: 25.15 | +7: iteration 18850/ 173500 | consumed samples: 4825600 | consumed tokens: 9882828800 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.864974E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.331 | TFLOPs: 26.18 | +7: iteration 18860/ 173500 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.876175E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.699 | TFLOPs: 26.17 | +7: iteration 18870/ 173500 | consumed samples: 4830720 | consumed tokens: 9893314560 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.882539E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.096 | TFLOPs: 26.18 | +7: iteration 18880/ 173500 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 0.16 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.869054E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.192 | TFLOPs: 25.14 | +7: iteration 18890/ 173500 | consumed samples: 4835840 | consumed tokens: 9903800320 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.875494E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.152 | TFLOPs: 26.19 | +7: iteration 18900/ 173500 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.880159E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.754 | TFLOPs: 26.17 | +7: iteration 18910/ 173500 | consumed samples: 4840960 | consumed tokens: 9914286080 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.891067E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.079 | TFLOPs: 26.16 | +7: iteration 18920/ 173500 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.883742E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.895 | TFLOPs: 26.14 | +7: iteration 18930/ 173500 | consumed samples: 4846080 | consumed tokens: 9924771840 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.877918E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.560 | TFLOPs: 26.15 | +7: iteration 18940/ 173500 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.882517E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.677 | TFLOPs: 26.15 | +7: iteration 18950/ 173500 | consumed samples: 4851200 | consumed tokens: 9935257600 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.871960E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.025 | TFLOPs: 26.17 | +7: iteration 18960/ 173500 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.881056E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.883 | TFLOPs: 26.17 | +7: iteration 18970/ 173500 | consumed samples: 4856320 | consumed tokens: 9945743360 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.875729E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.304 | TFLOPs: 26.07 | +7: iteration 18980/ 173500 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.860660E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.118 | TFLOPs: 26.10 | +7: iteration 18990/ 173500 | consumed samples: 4861440 | consumed tokens: 9956229120 | elapsed time per iteration (s): 0.15 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 3.881631E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.869 | TFLOPs: 26.16 | +7: iteration 19000/ 173500 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.882707E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.923 | TFLOPs: 26.14 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 19000 | lm loss value: 3.924659E+00 | lm loss PPL: 5.063582E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 19000 to checkpoints_44m91b100m +0: [2023-03-17 01:04:54,882] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step19000 is begin to save! +0: [2023-03-17 01:04:54,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:04:54,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:04:54,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:04:54,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:04:54,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:04:54,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:04:54,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:04:54,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:04:54,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:04:54,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:04:54,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:04:54,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:04:54,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:04:54,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:04:54,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:04:55,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:04:55,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:04:55,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:04:55,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:04:55,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:04:55,013] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step19000/mp_rank_00_model_states.pt +0: [2023-03-17 01:04:55,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:04:55,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:04:55,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:04:55,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:04:55,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 01:04:55,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:04:55,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:04:55,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: successfully saved checkpoint at iteration 19000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.96 +7: iteration 19010/ 173500 | consumed samples: 4866560 | consumed tokens: 9966714880 | elapsed time per iteration (s): 0.18 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.873190E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.061 | TFLOPs: 22.55 | +7: iteration 19020/ 173500 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.880931E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.637 | TFLOPs: 26.15 | +7: iteration 19030/ 173500 | consumed samples: 4871680 | consumed tokens: 9977200640 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.870174E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.992 | TFLOPs: 26.14 | +7: iteration 19040/ 173500 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.897915E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.474 | TFLOPs: 26.17 | +7: iteration 19050/ 173500 | consumed samples: 4876800 | consumed tokens: 9987686400 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.893495E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.634 | TFLOPs: 26.17 | +7: iteration 19060/ 173500 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.892906E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.105 | TFLOPs: 26.18 | +7: iteration 19070/ 173500 | consumed samples: 4881920 | consumed tokens: 9998172160 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.877776E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.773 | TFLOPs: 26.17 | +7: iteration 19080/ 173500 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.874280E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.492 | TFLOPs: 26.17 | +7: iteration 19090/ 173500 | consumed samples: 4887040 | consumed tokens: 10008657920 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.882817E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.377 | TFLOPs: 26.16 | +7: iteration 19100/ 173500 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.888562E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.290 | TFLOPs: 26.16 | +7: iteration 19110/ 173500 | consumed samples: 4892160 | consumed tokens: 10019143680 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.882235E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.995 | TFLOPs: 26.14 | +7: iteration 19120/ 173500 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.864838E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.881 | TFLOPs: 26.17 | +7: iteration 19130/ 173500 | consumed samples: 4897280 | consumed tokens: 10029629440 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.872203E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.051 | TFLOPs: 26.19 | +7: iteration 19140/ 173500 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.889420E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.604 | TFLOPs: 26.17 | +7: iteration 19150/ 173500 | consumed samples: 4902400 | consumed tokens: 10040115200 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.874136E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.942 | TFLOPs: 26.17 | +7: iteration 19160/ 173500 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.876239E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.633 | TFLOPs: 26.15 | +7: iteration 19170/ 173500 | consumed samples: 4907520 | consumed tokens: 10050600960 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.887289E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.358 | TFLOPs: 26.16 | +7: iteration 19180/ 173500 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 0.15 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.886748E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.593 | TFLOPs: 26.18 | +7: iteration 19190/ 173500 | consumed samples: 4912640 | consumed tokens: 10061086720 | elapsed time per iteration (s): 0.16 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 3.886364E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.288 | TFLOPs: 25.88 | +7: iteration 19200/ 173500 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.879698E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.376 | TFLOPs: 24.86 | +7: iteration 19210/ 173500 | consumed samples: 4917760 | consumed tokens: 10071572480 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.881818E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.540 | TFLOPs: 25.38 | +7: iteration 19220/ 173500 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.871323E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.334 | TFLOPs: 26.16 | +7: iteration 19230/ 173500 | consumed samples: 4922880 | consumed tokens: 10082058240 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.873197E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.484 | TFLOPs: 26.21 | +7: iteration 19240/ 173500 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.875826E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.146 | TFLOPs: 26.22 | +7: iteration 19250/ 173500 | consumed samples: 4928000 | consumed tokens: 10092544000 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.868866E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.641 | TFLOPs: 26.23 | +7: iteration 19260/ 173500 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.877802E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.417 | TFLOPs: 25.85 | +7: iteration 19270/ 173500 | consumed samples: 4933120 | consumed tokens: 10103029760 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.870293E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.300 | TFLOPs: 25.85 | +7: iteration 19280/ 173500 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.882182E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.567 | TFLOPs: 26.28 | +7: iteration 19290/ 173500 | consumed samples: 4938240 | consumed tokens: 10113515520 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.873209E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.063 | TFLOPs: 26.22 | +7: iteration 19300/ 173500 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.870305E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.534 | TFLOPs: 26.23 | +7: iteration 19310/ 173500 | consumed samples: 4943360 | consumed tokens: 10124001280 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.874667E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.724 | TFLOPs: 26.28 | +7: iteration 19320/ 173500 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.887338E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.204 | TFLOPs: 25.75 | +7: iteration 19330/ 173500 | consumed samples: 4948480 | consumed tokens: 10134487040 | elapsed time per iteration (s): 0.16 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.881965E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.038 | TFLOPs: 25.81 | +7: iteration 19340/ 173500 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.863261E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.321 | TFLOPs: 26.12 | +7: iteration 19350/ 173500 | consumed samples: 4953600 | consumed tokens: 10144972800 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.863578E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.993 | TFLOPs: 26.33 | +7: iteration 19360/ 173500 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.870960E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.238 | TFLOPs: 26.30 | +7: iteration 19370/ 173500 | consumed samples: 4958720 | consumed tokens: 10155458560 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.875742E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.014 | TFLOPs: 26.30 | +7: iteration 19380/ 173500 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 0.15 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 3.875657E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.894 | TFLOPs: 26.27 | +7: iteration 19390/ 173500 | consumed samples: 4963840 | consumed tokens: 10165944320 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.889228E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.569 | TFLOPs: 26.26 | +7: iteration 19400/ 173500 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.875370E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.485 | TFLOPs: 26.18 | +7: iteration 19410/ 173500 | consumed samples: 4968960 | consumed tokens: 10176430080 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.864244E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.036 | TFLOPs: 26.19 | +7: iteration 19420/ 173500 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.894075E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.393 | TFLOPs: 26.21 | +7: iteration 19430/ 173500 | consumed samples: 4974080 | consumed tokens: 10186915840 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.875364E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.146 | TFLOPs: 26.24 | +7: iteration 19440/ 173500 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.868720E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.979 | TFLOPs: 26.27 | +7: iteration 19450/ 173500 | consumed samples: 4979200 | consumed tokens: 10197401600 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.865002E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.311 | TFLOPs: 26.29 | +7: iteration 19460/ 173500 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.868596E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.594 | TFLOPs: 26.26 | +7: iteration 19470/ 173500 | consumed samples: 4984320 | consumed tokens: 10207887360 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.879576E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.853 | TFLOPs: 26.19 | +7: iteration 19480/ 173500 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.888453E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.909 | TFLOPs: 26.19 | +7: iteration 19490/ 173500 | consumed samples: 4989440 | consumed tokens: 10218373120 | elapsed time per iteration (s): 0.16 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.880561E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.014 | TFLOPs: 25.89 | +7: iteration 19500/ 173500 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.875627E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.713 | TFLOPs: 26.03 | +7: iteration 19510/ 173500 | consumed samples: 4994560 | consumed tokens: 10228858880 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.876942E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.657 | TFLOPs: 26.09 | +7: iteration 19520/ 173500 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.886981E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.803 | TFLOPs: 26.06 | +7: iteration 19530/ 173500 | consumed samples: 4999680 | consumed tokens: 10239344640 | elapsed time per iteration (s): 0.16 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.881104E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.023 | TFLOPs: 25.67 | +7: iteration 19540/ 173500 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.879679E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.208 | TFLOPs: 26.05 | +7: iteration 19550/ 173500 | consumed samples: 5004800 | consumed tokens: 10249830400 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.877220E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.173 | TFLOPs: 26.21 | +7: iteration 19560/ 173500 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.873952E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.845 | TFLOPs: 26.25 | +7: iteration 19570/ 173500 | consumed samples: 5009920 | consumed tokens: 10260316160 | elapsed time per iteration (s): 0.15 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 3.873987E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.319 | TFLOPs: 26.16 | +7: iteration 19580/ 173500 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.868782E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.600 | TFLOPs: 26.26 | +7: iteration 19590/ 173500 | consumed samples: 5015040 | consumed tokens: 10270801920 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.869339E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.690 | TFLOPs: 26.03 | +7: iteration 19600/ 173500 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.868062E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.351 | TFLOPs: 26.05 | +7: iteration 19610/ 173500 | consumed samples: 5020160 | consumed tokens: 10281287680 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.892812E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.338 | TFLOPs: 26.04 | +7: iteration 19620/ 173500 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.877468E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.903 | TFLOPs: 26.05 | +7: iteration 19630/ 173500 | consumed samples: 5025280 | consumed tokens: 10291773440 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.863726E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.697 | TFLOPs: 26.08 | +7: iteration 19640/ 173500 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.856042E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.856 | TFLOPs: 26.05 | +7: iteration 19650/ 173500 | consumed samples: 5030400 | consumed tokens: 10302259200 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.862924E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.946 | TFLOPs: 26.03 | +7: iteration 19660/ 173500 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.876111E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.101 | TFLOPs: 26.07 | +7: iteration 19670/ 173500 | consumed samples: 5035520 | consumed tokens: 10312744960 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.868233E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.429 | TFLOPs: 26.07 | +7: iteration 19680/ 173500 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.875325E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.012 | TFLOPs: 26.06 | +7: iteration 19690/ 173500 | consumed samples: 5040640 | consumed tokens: 10323230720 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.863543E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.567 | TFLOPs: 26.06 | +7: iteration 19700/ 173500 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.867470E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.623 | TFLOPs: 25.98 | +7: iteration 19710/ 173500 | consumed samples: 5045760 | consumed tokens: 10333716480 | elapsed time per iteration (s): 0.16 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.869898E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.155 | TFLOPs: 25.25 | +7: iteration 19720/ 173500 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.875269E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.872 | TFLOPs: 25.98 | +7: iteration 19730/ 173500 | consumed samples: 5050880 | consumed tokens: 10344202240 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.859779E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.285 | TFLOPs: 26.04 | +7: iteration 19740/ 173500 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.869448E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.704 | TFLOPs: 26.04 | +7: iteration 19750/ 173500 | consumed samples: 5056000 | consumed tokens: 10354688000 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.866148E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.136 | TFLOPs: 26.07 | +7: iteration 19760/ 173500 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 0.15 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 3.868968E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.355 | TFLOPs: 26.05 | +7: iteration 19770/ 173500 | consumed samples: 5061120 | consumed tokens: 10365173760 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.875231E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.516 | TFLOPs: 26.06 | +7: iteration 19780/ 173500 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 0.16 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.878172E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.049 | TFLOPs: 25.16 | +7: iteration 19790/ 173500 | consumed samples: 5066240 | consumed tokens: 10375659520 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.868859E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.471 | TFLOPs: 26.07 | +7: iteration 19800/ 173500 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.890591E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.352 | TFLOPs: 26.07 | +7: iteration 19810/ 173500 | consumed samples: 5071360 | consumed tokens: 10386145280 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.871018E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.228 | TFLOPs: 26.04 | +7: iteration 19820/ 173500 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.877643E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.163 | TFLOPs: 26.02 | +7: iteration 19830/ 173500 | consumed samples: 5076480 | consumed tokens: 10396631040 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.880076E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.347 | TFLOPs: 26.07 | +7: iteration 19840/ 173500 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.880094E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.494 | TFLOPs: 26.07 | +7: iteration 19850/ 173500 | consumed samples: 5081600 | consumed tokens: 10407116800 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.866485E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.294 | TFLOPs: 26.04 | +7: iteration 19860/ 173500 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.872762E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.797 | TFLOPs: 26.06 | +7: iteration 19870/ 173500 | consumed samples: 5086720 | consumed tokens: 10417602560 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.873584E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.841 | TFLOPs: 26.06 | +7: iteration 19880/ 173500 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.881378E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.188 | TFLOPs: 26.05 | +7: iteration 19890/ 173500 | consumed samples: 5091840 | consumed tokens: 10428088320 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.870704E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.025 | TFLOPs: 26.03 | +7: iteration 19900/ 173500 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.866751E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.399 | TFLOPs: 25.99 | +7: iteration 19910/ 173500 | consumed samples: 5096960 | consumed tokens: 10438574080 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.870343E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.609 | TFLOPs: 26.04 | +7: iteration 19920/ 173500 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.865171E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.337 | TFLOPs: 26.04 | +7: iteration 19930/ 173500 | consumed samples: 5102080 | consumed tokens: 10449059840 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.875233E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.858 | TFLOPs: 26.05 | +7: iteration 19940/ 173500 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.876722E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.850 | TFLOPs: 25.95 | +7: iteration 19950/ 173500 | consumed samples: 5107200 | consumed tokens: 10459545600 | elapsed time per iteration (s): 0.15 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 3.860523E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.300 | TFLOPs: 26.05 | +7: iteration 19960/ 173500 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.872282E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.493 | TFLOPs: 26.06 | +7: iteration 19970/ 173500 | consumed samples: 5112320 | consumed tokens: 10470031360 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.869628E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.426 | TFLOPs: 26.06 | +7: iteration 19980/ 173500 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.873597E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.308 | TFLOPs: 26.05 | +7: iteration 19990/ 173500 | consumed samples: 5117440 | consumed tokens: 10480517120 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.874958E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.500 | TFLOPs: 26.07 | +0: [2023-03-17 01:07:29,101] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[0.00019502450208460265, 0.00019502450208460265, 0.00019502450208460265], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 20000/ 173500 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.874545E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.587 | TFLOPs: 26.07 | +0: steps: 20000 loss: 3.9082 iter time (s): 0.153 samples/sec: 1677.309 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 20000 | lm loss value: 3.976546E+00 | lm loss PPL: 5.333251E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 20000 to checkpoints_44m91b100m +0: [2023-03-17 01:07:29,176] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! +0: [2023-03-17 01:07:29,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:07:29,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:07:29,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:07:29,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:07:29,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:07:29,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:07:29,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:07:29,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:07:29,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:07:29,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:07:29,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:07:29,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:07:29,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:07:29,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:07:29,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:07:29,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:07:29,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:07:29,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:07:29,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:07:29,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:07:29,308] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step20000/mp_rank_00_model_states.pt +0: [2023-03-17 01:07:29,308] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:07:29,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:07:29,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:07:29,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:07:29,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:07:29,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:07:29,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:07:29,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:07:29,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:07:29,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:07:29,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: successfully saved checkpoint at iteration 20000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.58 +7: iteration 20010/ 173500 | consumed samples: 5122560 | consumed tokens: 10491002880 | elapsed time per iteration (s): 0.18 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.857330E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.767 | TFLOPs: 22.52 | +7: iteration 20020/ 173500 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.869723E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.754 | TFLOPs: 26.11 | +7: iteration 20030/ 173500 | consumed samples: 5127680 | consumed tokens: 10501488640 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.874932E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.363 | TFLOPs: 26.13 | +7: iteration 20040/ 173500 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.881776E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.768 | TFLOPs: 26.11 | +7: iteration 20050/ 173500 | consumed samples: 5132800 | consumed tokens: 10511974400 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.874987E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.628 | TFLOPs: 26.09 | +7: iteration 20060/ 173500 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.880844E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.150 | TFLOPs: 26.10 | +7: iteration 20070/ 173500 | consumed samples: 5137920 | consumed tokens: 10522460160 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.872810E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.520 | TFLOPs: 26.10 | +7: iteration 20080/ 173500 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.866931E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.749 | TFLOPs: 26.11 | +7: iteration 20090/ 173500 | consumed samples: 5143040 | consumed tokens: 10532945920 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.860877E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.566 | TFLOPs: 26.14 | +7: iteration 20100/ 173500 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.877724E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.906 | TFLOPs: 26.11 | +7: iteration 20110/ 173500 | consumed samples: 5148160 | consumed tokens: 10543431680 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.859186E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.429 | TFLOPs: 26.13 | +7: iteration 20120/ 173500 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 0.15 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.856628E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.626 | TFLOPs: 26.12 | +7: iteration 20130/ 173500 | consumed samples: 5153280 | consumed tokens: 10553917440 | elapsed time per iteration (s): 0.16 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 3.872661E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.817 | TFLOPs: 25.61 | +7: iteration 20140/ 173500 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 0.16 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.863309E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.850 | TFLOPs: 25.65 | +7: iteration 20150/ 173500 | consumed samples: 5158400 | consumed tokens: 10564403200 | elapsed time per iteration (s): 0.16 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.875798E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.592 | TFLOPs: 25.79 | +7: iteration 20160/ 173500 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.870274E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.912 | TFLOPs: 26.09 | +7: iteration 20170/ 173500 | consumed samples: 5163520 | consumed tokens: 10574888960 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.867842E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.026 | TFLOPs: 26.16 | +7: iteration 20180/ 173500 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.869781E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.279 | TFLOPs: 26.16 | +7: iteration 20190/ 173500 | consumed samples: 5168640 | consumed tokens: 10585374720 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.864587E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.304 | TFLOPs: 26.15 | +7: iteration 20200/ 173500 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.871589E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.890 | TFLOPs: 25.92 | +7: iteration 20210/ 173500 | consumed samples: 5173760 | consumed tokens: 10595860480 | elapsed time per iteration (s): 0.16 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.866875E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.170 | TFLOPs: 25.75 | +7: iteration 20220/ 173500 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.868326E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.131 | TFLOPs: 26.04 | +7: iteration 20230/ 173500 | consumed samples: 5178880 | consumed tokens: 10606346240 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.879399E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.052 | TFLOPs: 25.97 | +7: iteration 20240/ 173500 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.865441E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.405 | TFLOPs: 26.01 | +7: iteration 20250/ 173500 | consumed samples: 5184000 | consumed tokens: 10616832000 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.866718E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.765 | TFLOPs: 26.08 | +7: iteration 20260/ 173500 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.876020E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.906 | TFLOPs: 26.05 | +7: iteration 20270/ 173500 | consumed samples: 5189120 | consumed tokens: 10627317760 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.874195E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.256 | TFLOPs: 26.07 | +7: iteration 20280/ 173500 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.875180E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.854 | TFLOPs: 26.09 | +7: iteration 20290/ 173500 | consumed samples: 5194240 | consumed tokens: 10637803520 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.872446E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.601 | TFLOPs: 26.09 | +7: iteration 20300/ 173500 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.865225E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.595 | TFLOPs: 26.07 | +7: iteration 20310/ 173500 | consumed samples: 5199360 | consumed tokens: 10648289280 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.871587E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.011 | TFLOPs: 26.08 | +7: iteration 20320/ 173500 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 0.15 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 3.863198E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.746 | TFLOPs: 26.08 | +7: iteration 20330/ 173500 | consumed samples: 5204480 | consumed tokens: 10658775040 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.860192E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.883 | TFLOPs: 26.14 | +7: iteration 20340/ 173500 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.865271E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.546 | TFLOPs: 26.18 | +7: iteration 20350/ 173500 | consumed samples: 5209600 | consumed tokens: 10669260800 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.865748E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.711 | TFLOPs: 26.19 | +7: iteration 20360/ 173500 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.880594E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.096 | TFLOPs: 26.19 | +7: iteration 20370/ 173500 | consumed samples: 5214720 | consumed tokens: 10679746560 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.864792E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.342 | TFLOPs: 26.20 | +7: iteration 20380/ 173500 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.869281E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.322 | TFLOPs: 26.23 | +7: iteration 20390/ 173500 | consumed samples: 5219840 | consumed tokens: 10690232320 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.875169E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.197 | TFLOPs: 26.08 | +7: iteration 20400/ 173500 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.875458E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.498 | TFLOPs: 26.09 | +7: iteration 20410/ 173500 | consumed samples: 5224960 | consumed tokens: 10700718080 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.851677E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.539 | TFLOPs: 26.18 | +7: iteration 20420/ 173500 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.870358E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.235 | TFLOPs: 26.18 | +7: iteration 20430/ 173500 | consumed samples: 5230080 | consumed tokens: 10711203840 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.857965E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.449 | TFLOPs: 26.18 | +7: iteration 20440/ 173500 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.858901E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.862 | TFLOPs: 26.19 | +7: iteration 20450/ 173500 | consumed samples: 5235200 | consumed tokens: 10721689600 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.888705E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.023 | TFLOPs: 26.17 | +7: iteration 20460/ 173500 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.866084E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.685 | TFLOPs: 26.20 | +7: iteration 20470/ 173500 | consumed samples: 5240320 | consumed tokens: 10732175360 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.869576E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.575 | TFLOPs: 26.17 | +7: iteration 20480/ 173500 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.882091E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.032 | TFLOPs: 26.17 | +7: iteration 20490/ 173500 | consumed samples: 5245440 | consumed tokens: 10742661120 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.874219E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.029 | TFLOPs: 26.19 | +7: iteration 20500/ 173500 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 0.15 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 3.862778E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.750 | TFLOPs: 26.25 | +7: iteration 20510/ 173500 | consumed samples: 5250560 | consumed tokens: 10753146880 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.853145E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.680 | TFLOPs: 26.23 | +7: iteration 20520/ 173500 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.872809E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.131 | TFLOPs: 26.22 | +7: iteration 20530/ 173500 | consumed samples: 5255680 | consumed tokens: 10763632640 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.861643E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.152 | TFLOPs: 26.22 | +7: iteration 20540/ 173500 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.876305E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.759 | TFLOPs: 26.23 | +7: iteration 20550/ 173500 | consumed samples: 5260800 | consumed tokens: 10774118400 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.860632E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.604 | TFLOPs: 26.23 | +7: iteration 20560/ 173500 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.872249E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.600 | TFLOPs: 26.25 | +7: iteration 20570/ 173500 | consumed samples: 5265920 | consumed tokens: 10784604160 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.877225E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.891 | TFLOPs: 26.25 | +7: iteration 20580/ 173500 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.860675E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.829 | TFLOPs: 26.25 | +7: iteration 20590/ 173500 | consumed samples: 5271040 | consumed tokens: 10795089920 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.859591E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.862 | TFLOPs: 26.23 | +7: iteration 20600/ 173500 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.869277E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.648 | TFLOPs: 26.26 | +7: iteration 20610/ 173500 | consumed samples: 5276160 | consumed tokens: 10805575680 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.862296E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.072 | TFLOPs: 26.27 | +7: iteration 20620/ 173500 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.866477E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.418 | TFLOPs: 26.26 | +7: iteration 20630/ 173500 | consumed samples: 5281280 | consumed tokens: 10816061440 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.876587E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.904 | TFLOPs: 26.27 | +7: iteration 20640/ 173500 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.875438E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.063 | TFLOPs: 26.22 | +7: iteration 20650/ 173500 | consumed samples: 5286400 | consumed tokens: 10826547200 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.864610E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.895 | TFLOPs: 26.03 | +7: iteration 20660/ 173500 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.860674E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.099 | TFLOPs: 26.21 | +7: iteration 20670/ 173500 | consumed samples: 5291520 | consumed tokens: 10837032960 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.864788E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.704 | TFLOPs: 26.15 | +7: iteration 20680/ 173500 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 0.15 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 3.860685E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.963 | TFLOPs: 26.14 | +7: iteration 20690/ 173500 | consumed samples: 5296640 | consumed tokens: 10847518720 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.867577E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.853 | TFLOPs: 26.16 | +7: iteration 20700/ 173500 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.851290E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.020 | TFLOPs: 26.16 | +7: iteration 20710/ 173500 | consumed samples: 5301760 | consumed tokens: 10858004480 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.862463E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.642 | TFLOPs: 26.07 | +7: iteration 20720/ 173500 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.871806E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.915 | TFLOPs: 26.09 | +7: iteration 20730/ 173500 | consumed samples: 5306880 | consumed tokens: 10868490240 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.865565E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.861 | TFLOPs: 26.11 | +7: iteration 20740/ 173500 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.873980E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.145 | TFLOPs: 26.14 | +7: iteration 20750/ 173500 | consumed samples: 5312000 | consumed tokens: 10878976000 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.861033E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.534 | TFLOPs: 26.17 | +7: iteration 20760/ 173500 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.869630E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.974 | TFLOPs: 26.17 | +7: iteration 20770/ 173500 | consumed samples: 5317120 | consumed tokens: 10889461760 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.863600E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.691 | TFLOPs: 26.18 | +7: iteration 20780/ 173500 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.855246E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.767 | TFLOPs: 26.17 | +7: iteration 20790/ 173500 | consumed samples: 5322240 | consumed tokens: 10899947520 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.859845E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.987 | TFLOPs: 26.17 | +7: iteration 20800/ 173500 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.855149E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.343 | TFLOPs: 26.18 | +7: iteration 20810/ 173500 | consumed samples: 5327360 | consumed tokens: 10910433280 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.859768E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.478 | TFLOPs: 26.20 | +7: iteration 20820/ 173500 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.888833E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.198 | TFLOPs: 26.19 | +7: iteration 20830/ 173500 | consumed samples: 5332480 | consumed tokens: 10920919040 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.877392E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.104 | TFLOPs: 26.11 | +7: iteration 20840/ 173500 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.853404E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.559 | TFLOPs: 26.06 | +7: iteration 20850/ 173500 | consumed samples: 5337600 | consumed tokens: 10931404800 | elapsed time per iteration (s): 0.15 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 3.863556E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.374 | TFLOPs: 26.09 | +7: iteration 20860/ 173500 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.875109E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.238 | TFLOPs: 26.18 | +7: iteration 20870/ 173500 | consumed samples: 5342720 | consumed tokens: 10941890560 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.864430E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.950 | TFLOPs: 26.19 | +7: iteration 20880/ 173500 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.868795E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.582 | TFLOPs: 26.18 | +7: iteration 20890/ 173500 | consumed samples: 5347840 | consumed tokens: 10952376320 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.861892E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.547 | TFLOPs: 26.17 | +7: iteration 20900/ 173500 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.868709E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.296 | TFLOPs: 26.13 | +7: iteration 20910/ 173500 | consumed samples: 5352960 | consumed tokens: 10962862080 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.862776E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.975 | TFLOPs: 26.13 | +7: iteration 20920/ 173500 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.859855E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.785 | TFLOPs: 26.14 | +7: iteration 20930/ 173500 | consumed samples: 5358080 | consumed tokens: 10973347840 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.859546E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.023 | TFLOPs: 26.14 | +7: iteration 20940/ 173500 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.857845E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.439 | TFLOPs: 26.13 | +7: iteration 20950/ 173500 | consumed samples: 5363200 | consumed tokens: 10983833600 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.866714E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.977 | TFLOPs: 26.27 | +7: iteration 20960/ 173500 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.869099E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.582 | TFLOPs: 26.31 | +7: iteration 20970/ 173500 | consumed samples: 5368320 | consumed tokens: 10994319360 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.871810E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.970 | TFLOPs: 26.30 | +7: iteration 20980/ 173500 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.843542E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.306 | TFLOPs: 26.30 | +7: iteration 20990/ 173500 | consumed samples: 5373440 | consumed tokens: 11004805120 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.853325E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.850 | TFLOPs: 26.28 | +7: iteration 21000/ 173500 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.865200E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.159 | TFLOPs: 26.25 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 21000 | lm loss value: 3.962696E+00 | lm loss PPL: 5.259895E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 21000 to checkpoints_44m91b100m +0: [2023-03-17 01:10:02,997] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21000 is begin to save! +0: [2023-03-17 01:10:03,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:10:03,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:10:03,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:10:03,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:10:03,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:10:03,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:10:03,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:10:03,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:10:03,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:10:03,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:10:03,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:10:03,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:10:03,105] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:10:03,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:10:03,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:10:03,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:10:03,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:10:03,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:10:03,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:10:03,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:10:03,131] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step21000/mp_rank_00_model_states.pt +0: [2023-03-17 01:10:03,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:10:03,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:10:03,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:10:03,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:10:03,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 01:10:03,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:10:03,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:10:03,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: successfully saved checkpoint at iteration 21000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.54 +7: iteration 21010/ 173500 | consumed samples: 5378560 | consumed tokens: 11015290880 | elapsed time per iteration (s): 0.18 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.867376E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.037 | TFLOPs: 22.40 | +7: iteration 21020/ 173500 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.852980E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.191 | TFLOPs: 26.13 | +7: iteration 21030/ 173500 | consumed samples: 5383680 | consumed tokens: 11025776640 | elapsed time per iteration (s): 0.15 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 3.867741E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.158 | TFLOPs: 26.11 | +7: iteration 21040/ 173500 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.851302E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.363 | TFLOPs: 26.12 | +7: iteration 21050/ 173500 | consumed samples: 5388800 | consumed tokens: 11036262400 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.864973E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.475 | TFLOPs: 26.12 | +7: iteration 21060/ 173500 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.871211E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.640 | TFLOPs: 26.09 | +7: iteration 21070/ 173500 | consumed samples: 5393920 | consumed tokens: 11046748160 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.840340E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.418 | TFLOPs: 26.13 | +7: iteration 21080/ 173500 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.863065E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.611 | TFLOPs: 26.17 | +7: iteration 21090/ 173500 | consumed samples: 5399040 | consumed tokens: 11057233920 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.863879E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.715 | TFLOPs: 26.19 | +7: iteration 21100/ 173500 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.872742E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.498 | TFLOPs: 26.18 | +7: iteration 21110/ 173500 | consumed samples: 5404160 | consumed tokens: 11067719680 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.858634E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.987 | TFLOPs: 26.21 | +7: iteration 21120/ 173500 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.856539E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.808 | TFLOPs: 26.19 | +7: iteration 21130/ 173500 | consumed samples: 5409280 | consumed tokens: 11078205440 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.852261E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.100 | TFLOPs: 26.18 | +7: iteration 21140/ 173500 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 0.16 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.873635E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.843 | TFLOPs: 25.65 | +7: iteration 21150/ 173500 | consumed samples: 5414400 | consumed tokens: 11088691200 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.866910E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.356 | TFLOPs: 26.21 | +7: iteration 21160/ 173500 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.857452E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.342 | TFLOPs: 26.20 | +7: iteration 21170/ 173500 | consumed samples: 5419520 | consumed tokens: 11099176960 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.853126E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.319 | TFLOPs: 26.15 | +7: iteration 21180/ 173500 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.851845E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.072 | TFLOPs: 26.22 | +7: iteration 21190/ 173500 | consumed samples: 5424640 | consumed tokens: 11109662720 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.863743E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.575 | TFLOPs: 26.20 | +7: iteration 21200/ 173500 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.870219E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.021 | TFLOPs: 26.03 | +7: iteration 21210/ 173500 | consumed samples: 5429760 | consumed tokens: 11120148480 | elapsed time per iteration (s): 0.15 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 3.870522E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.038 | TFLOPs: 26.24 | +7: iteration 21220/ 173500 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.861947E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.956 | TFLOPs: 26.30 | +7: iteration 21230/ 173500 | consumed samples: 5434880 | consumed tokens: 11130634240 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.852030E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.910 | TFLOPs: 26.28 | +7: iteration 21240/ 173500 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.862883E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.336 | TFLOPs: 26.26 | +7: iteration 21250/ 173500 | consumed samples: 5440000 | consumed tokens: 11141120000 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.854004E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.310 | TFLOPs: 26.26 | +7: iteration 21260/ 173500 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.880543E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.214 | TFLOPs: 26.27 | +7: iteration 21270/ 173500 | consumed samples: 5445120 | consumed tokens: 11151605760 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.851300E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.995 | TFLOPs: 26.28 | +7: iteration 21280/ 173500 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.848130E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.503 | TFLOPs: 26.32 | +7: iteration 21290/ 173500 | consumed samples: 5450240 | consumed tokens: 11162091520 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.862244E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.667 | TFLOPs: 26.29 | +7: iteration 21300/ 173500 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.853826E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.619 | TFLOPs: 26.31 | +7: iteration 21310/ 173500 | consumed samples: 5455360 | consumed tokens: 11172577280 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.853431E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.387 | TFLOPs: 26.31 | +7: iteration 21320/ 173500 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.863967E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.780 | TFLOPs: 26.26 | +7: iteration 21330/ 173500 | consumed samples: 5460480 | consumed tokens: 11183063040 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.848848E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.207 | TFLOPs: 26.26 | +7: iteration 21340/ 173500 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.841516E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.873 | TFLOPs: 26.19 | +7: iteration 21350/ 173500 | consumed samples: 5465600 | consumed tokens: 11193548800 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.860980E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.631 | TFLOPs: 26.18 | +7: iteration 21360/ 173500 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.852228E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.066 | TFLOPs: 26.19 | +7: iteration 21370/ 173500 | consumed samples: 5470720 | consumed tokens: 11204034560 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.845573E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.633 | TFLOPs: 26.17 | +7: iteration 21380/ 173500 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 0.15 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 3.861788E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.386 | TFLOPs: 26.16 | +7: iteration 21390/ 173500 | consumed samples: 5475840 | consumed tokens: 11214520320 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.845717E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.459 | TFLOPs: 26.18 | +7: iteration 21400/ 173500 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.841715E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.204 | TFLOPs: 26.13 | +7: iteration 21410/ 173500 | consumed samples: 5480960 | consumed tokens: 11225006080 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.859532E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.515 | TFLOPs: 26.12 | +7: iteration 21420/ 173500 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.847876E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.684 | TFLOPs: 26.12 | +7: iteration 21430/ 173500 | consumed samples: 5486080 | consumed tokens: 11235491840 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.877466E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.535 | TFLOPs: 26.29 | +7: iteration 21440/ 173500 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.845427E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.176 | TFLOPs: 26.21 | +7: iteration 21450/ 173500 | consumed samples: 5491200 | consumed tokens: 11245977600 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.850038E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.950 | TFLOPs: 26.31 | +7: iteration 21460/ 173500 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.872333E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.983 | TFLOPs: 26.31 | +7: iteration 21470/ 173500 | consumed samples: 5496320 | consumed tokens: 11256463360 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.851260E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.881 | TFLOPs: 26.33 | +7: iteration 21480/ 173500 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.856573E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.519 | TFLOPs: 26.32 | +7: iteration 21490/ 173500 | consumed samples: 5501440 | consumed tokens: 11266949120 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.866112E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.996 | TFLOPs: 26.32 | +7: iteration 21500/ 173500 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.875954E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.243 | TFLOPs: 26.30 | +7: iteration 21510/ 173500 | consumed samples: 5506560 | consumed tokens: 11277434880 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.843727E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.961 | TFLOPs: 26.31 | +7: iteration 21520/ 173500 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.854882E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.057 | TFLOPs: 26.32 | +7: iteration 21530/ 173500 | consumed samples: 5511680 | consumed tokens: 11287920640 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.865088E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.200 | TFLOPs: 26.13 | +7: iteration 21540/ 173500 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.853154E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.568 | TFLOPs: 26.14 | +7: iteration 21550/ 173500 | consumed samples: 5516800 | consumed tokens: 11298406400 | elapsed time per iteration (s): 0.15 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 3.856944E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.995 | TFLOPs: 26.16 | +7: iteration 21560/ 173500 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.857751E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.032 | TFLOPs: 26.14 | +7: iteration 21570/ 173500 | consumed samples: 5521920 | consumed tokens: 11308892160 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.853003E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.281 | TFLOPs: 26.15 | +7: iteration 21580/ 173500 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.856922E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.100 | TFLOPs: 26.14 | +7: iteration 21590/ 173500 | consumed samples: 5527040 | consumed tokens: 11319377920 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.867962E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.571 | TFLOPs: 26.14 | +7: iteration 21600/ 173500 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.872913E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.045 | TFLOPs: 26.14 | +7: iteration 21610/ 173500 | consumed samples: 5532160 | consumed tokens: 11329863680 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.861033E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.682 | TFLOPs: 26.14 | +7: iteration 21620/ 173500 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.861806E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.905 | TFLOPs: 26.14 | +7: iteration 21630/ 173500 | consumed samples: 5537280 | consumed tokens: 11340349440 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.849931E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.372 | TFLOPs: 26.13 | +7: iteration 21640/ 173500 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.852452E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.340 | TFLOPs: 26.09 | +7: iteration 21650/ 173500 | consumed samples: 5542400 | consumed tokens: 11350835200 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.849501E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.092 | TFLOPs: 26.10 | +7: iteration 21660/ 173500 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.850721E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.150 | TFLOPs: 26.10 | +7: iteration 21670/ 173500 | consumed samples: 5547520 | consumed tokens: 11361320960 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.849966E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.569 | TFLOPs: 26.17 | +7: iteration 21680/ 173500 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.856092E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.985 | TFLOPs: 26.17 | +7: iteration 21690/ 173500 | consumed samples: 5552640 | consumed tokens: 11371806720 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.853474E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.878 | TFLOPs: 26.17 | +7: iteration 21700/ 173500 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.867794E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.729 | TFLOPs: 26.17 | +7: iteration 21710/ 173500 | consumed samples: 5557760 | consumed tokens: 11382292480 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.861174E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.051 | TFLOPs: 26.17 | +7: iteration 21720/ 173500 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 0.15 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 3.843877E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.100 | TFLOPs: 26.18 | +7: iteration 21730/ 173500 | consumed samples: 5562880 | consumed tokens: 11392778240 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.858333E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.588 | TFLOPs: 26.18 | +7: iteration 21740/ 173500 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.853671E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.479 | TFLOPs: 26.17 | +7: iteration 21750/ 173500 | consumed samples: 5568000 | consumed tokens: 11403264000 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.866472E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.622 | TFLOPs: 26.17 | +7: iteration 21760/ 173500 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.858949E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.565 | TFLOPs: 26.17 | +7: iteration 21770/ 173500 | consumed samples: 5573120 | consumed tokens: 11413749760 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.859008E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.254 | TFLOPs: 26.18 | +7: iteration 21780/ 173500 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.848531E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.106 | TFLOPs: 26.18 | +7: iteration 21790/ 173500 | consumed samples: 5578240 | consumed tokens: 11424235520 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.867152E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.576 | TFLOPs: 26.17 | +7: iteration 21800/ 173500 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.857771E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.904 | TFLOPs: 26.19 | +7: iteration 21810/ 173500 | consumed samples: 5583360 | consumed tokens: 11434721280 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.853506E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.695 | TFLOPs: 26.15 | +7: iteration 21820/ 173500 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.861026E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.278 | TFLOPs: 26.18 | +7: iteration 21830/ 173500 | consumed samples: 5588480 | consumed tokens: 11445207040 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.854583E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.848 | TFLOPs: 26.17 | +7: iteration 21840/ 173500 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.851048E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.865 | TFLOPs: 26.17 | +7: iteration 21850/ 173500 | consumed samples: 5593600 | consumed tokens: 11455692800 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.856001E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.954 | TFLOPs: 26.17 | +7: iteration 21860/ 173500 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.858580E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.012 | TFLOPs: 26.17 | +7: iteration 21870/ 173500 | consumed samples: 5598720 | consumed tokens: 11466178560 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.853381E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.317 | TFLOPs: 26.18 | +7: iteration 21880/ 173500 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.868872E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.993 | TFLOPs: 26.17 | +7: iteration 21890/ 173500 | consumed samples: 5603840 | consumed tokens: 11476664320 | elapsed time per iteration (s): 0.15 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 3.844720E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.319 | TFLOPs: 26.18 | +7: iteration 21900/ 173500 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.870750E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.121 | TFLOPs: 26.21 | +7: iteration 21910/ 173500 | consumed samples: 5608960 | consumed tokens: 11487150080 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.843457E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.548 | TFLOPs: 26.20 | +7: iteration 21920/ 173500 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.871716E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.380 | TFLOPs: 26.20 | +7: iteration 21930/ 173500 | consumed samples: 5614080 | consumed tokens: 11497635840 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.842186E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.835 | TFLOPs: 26.20 | +7: iteration 21940/ 173500 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.844705E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.823 | TFLOPs: 26.20 | +7: iteration 21950/ 173500 | consumed samples: 5619200 | consumed tokens: 11508121600 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.868593E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.998 | TFLOPs: 26.19 | +7: iteration 21960/ 173500 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.855229E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.503 | TFLOPs: 26.20 | +7: iteration 21970/ 173500 | consumed samples: 5624320 | consumed tokens: 11518607360 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.853671E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.461 | TFLOPs: 26.20 | +7: iteration 21980/ 173500 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.847591E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.564 | TFLOPs: 26.20 | +7: iteration 21990/ 173500 | consumed samples: 5629440 | consumed tokens: 11529093120 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.860611E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.303 | TFLOPs: 26.19 | +0: [2023-03-17 01:12:36,510] [INFO] [logging.py:68:log_dist] [Rank 0] step=22000, skipped=0, lr=[0.00019388839136370641, 0.00019388839136370641, 0.00019388839136370641], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 22000/ 173500 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.858039E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.556 | TFLOPs: 26.20 | +0: steps: 22000 loss: 3.8506 iter time (s): 0.152 samples/sec: 1684.648 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 22000 | lm loss value: 3.960228E+00 | lm loss PPL: 5.246929E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 22000 to checkpoints_44m91b100m +0: [2023-03-17 01:12:36,583] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is begin to save! +0: [2023-03-17 01:12:36,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:12:36,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:12:36,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:12:36,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:12:36,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:12:36,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:12:36,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:12:36,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:12:36,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:12:36,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:12:36,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:12:36,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:12:36,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:12:36,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:12:36,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:12:36,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:12:36,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:12:36,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:12:36,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:12:36,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:12:36,713] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step22000/mp_rank_00_model_states.pt +0: [2023-03-17 01:12:36,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:12:36,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:12:36,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:12:36,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:12:36,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:12:36,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:12:36,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:12:36,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:12:36,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-17 01:12:36,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:12:36,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:12:36,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: successfully saved checkpoint at iteration 22000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.54 +7: iteration 22010/ 173500 | consumed samples: 5634560 | consumed tokens: 11539578880 | elapsed time per iteration (s): 0.18 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.862699E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.340 | TFLOPs: 22.62 | +7: iteration 22020/ 173500 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.855408E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.967 | TFLOPs: 26.22 | +7: iteration 22030/ 173500 | consumed samples: 5639680 | consumed tokens: 11550064640 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.849252E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.965 | TFLOPs: 26.20 | +7: iteration 22040/ 173500 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.843666E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.829 | TFLOPs: 26.20 | +7: iteration 22050/ 173500 | consumed samples: 5644800 | consumed tokens: 11560550400 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.850073E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.972 | TFLOPs: 26.21 | +7: iteration 22060/ 173500 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 0.15 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 3.856392E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.883 | TFLOPs: 26.19 | +7: iteration 22070/ 173500 | consumed samples: 5649920 | consumed tokens: 11571036160 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.856107E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.300 | TFLOPs: 26.21 | +7: iteration 22080/ 173500 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.859587E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.102 | TFLOPs: 26.22 | +7: iteration 22090/ 173500 | consumed samples: 5655040 | consumed tokens: 11581521920 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.846669E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.965 | TFLOPs: 26.20 | +7: iteration 22100/ 173500 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.856727E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.618 | TFLOPs: 26.22 | +7: iteration 22110/ 173500 | consumed samples: 5660160 | consumed tokens: 11592007680 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.865226E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.787 | TFLOPs: 26.20 | +7: iteration 22120/ 173500 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.854467E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.616 | TFLOPs: 26.20 | +7: iteration 22130/ 173500 | consumed samples: 5665280 | consumed tokens: 11602493440 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.843184E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.005 | TFLOPs: 25.95 | +7: iteration 22140/ 173500 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.836679E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.460 | TFLOPs: 26.23 | +7: iteration 22150/ 173500 | consumed samples: 5670400 | consumed tokens: 11612979200 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.851976E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.788 | TFLOPs: 26.22 | +7: iteration 22160/ 173500 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.855186E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.746 | TFLOPs: 26.22 | +7: iteration 22170/ 173500 | consumed samples: 5675520 | consumed tokens: 11623464960 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.858867E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.368 | TFLOPs: 26.21 | +7: iteration 22180/ 173500 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.853903E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.767 | TFLOPs: 26.20 | +7: iteration 22190/ 173500 | consumed samples: 5680640 | consumed tokens: 11633950720 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.850893E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.538 | TFLOPs: 26.21 | +7: iteration 22200/ 173500 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.849949E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.312 | TFLOPs: 26.21 | +7: iteration 22210/ 173500 | consumed samples: 5685760 | consumed tokens: 11644436480 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.851589E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.784 | TFLOPs: 26.16 | +7: iteration 22220/ 173500 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.850668E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.575 | TFLOPs: 26.21 | +7: iteration 22230/ 173500 | consumed samples: 5690880 | consumed tokens: 11654922240 | elapsed time per iteration (s): 0.15 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 3.859454E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.496 | TFLOPs: 25.95 | +7: iteration 22240/ 173500 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 0.16 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.850764E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.919 | TFLOPs: 25.86 | +7: iteration 22250/ 173500 | consumed samples: 5696000 | consumed tokens: 11665408000 | elapsed time per iteration (s): 0.16 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.855738E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.830 | TFLOPs: 25.89 | +7: iteration 22260/ 173500 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.853836E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.031 | TFLOPs: 26.19 | +7: iteration 22270/ 173500 | consumed samples: 5701120 | consumed tokens: 11675893760 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.850697E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.071 | TFLOPs: 26.21 | +7: iteration 22280/ 173500 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.865154E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.517 | TFLOPs: 26.23 | +7: iteration 22290/ 173500 | consumed samples: 5706240 | consumed tokens: 11686379520 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.855265E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.533 | TFLOPs: 26.23 | +7: iteration 22300/ 173500 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.863648E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.666 | TFLOPs: 26.23 | +7: iteration 22310/ 173500 | consumed samples: 5711360 | consumed tokens: 11696865280 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.850065E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.523 | TFLOPs: 26.20 | +7: iteration 22320/ 173500 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.849625E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.788 | TFLOPs: 26.22 | +7: iteration 22330/ 173500 | consumed samples: 5716480 | consumed tokens: 11707351040 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.858169E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.103 | TFLOPs: 26.29 | +7: iteration 22340/ 173500 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.851863E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.556 | TFLOPs: 26.25 | +7: iteration 22350/ 173500 | consumed samples: 5721600 | consumed tokens: 11717836800 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.847192E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.741 | TFLOPs: 26.30 | +7: iteration 22360/ 173500 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.851606E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.387 | TFLOPs: 26.34 | +7: iteration 22370/ 173500 | consumed samples: 5726720 | consumed tokens: 11728322560 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.858234E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.501 | TFLOPs: 26.34 | +7: iteration 22380/ 173500 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.852817E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.117 | TFLOPs: 26.32 | +7: iteration 22390/ 173500 | consumed samples: 5731840 | consumed tokens: 11738808320 | elapsed time per iteration (s): 0.15 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 3.852269E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.080 | TFLOPs: 26.32 | +7: iteration 22400/ 173500 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.846812E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.020 | TFLOPs: 26.32 | +7: iteration 22410/ 173500 | consumed samples: 5736960 | consumed tokens: 11749294080 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.855164E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.326 | TFLOPs: 26.30 | +7: iteration 22420/ 173500 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.855891E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.757 | TFLOPs: 26.28 | +7: iteration 22430/ 173500 | consumed samples: 5742080 | consumed tokens: 11759779840 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.842350E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.435 | TFLOPs: 26.29 | +7: iteration 22440/ 173500 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.857475E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.945 | TFLOPs: 26.28 | +7: iteration 22450/ 173500 | consumed samples: 5747200 | consumed tokens: 11770265600 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.850478E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.546 | TFLOPs: 26.28 | +7: iteration 22460/ 173500 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.835007E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.992 | TFLOPs: 26.27 | +7: iteration 22470/ 173500 | consumed samples: 5752320 | consumed tokens: 11780751360 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.859096E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.220 | TFLOPs: 26.27 | +7: iteration 22480/ 173500 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.851534E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.702 | TFLOPs: 26.29 | +7: iteration 22490/ 173500 | consumed samples: 5757440 | consumed tokens: 11791237120 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.843620E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.374 | TFLOPs: 26.20 | +7: iteration 22500/ 173500 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.849605E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.044 | TFLOPs: 26.17 | +7: iteration 22510/ 173500 | consumed samples: 5762560 | consumed tokens: 11801722880 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.841085E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.120 | TFLOPs: 26.18 | +7: iteration 22520/ 173500 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.852303E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.918 | TFLOPs: 26.17 | +7: iteration 22530/ 173500 | consumed samples: 5767680 | consumed tokens: 11812208640 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.847877E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.709 | TFLOPs: 26.17 | +7: iteration 22540/ 173500 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.845342E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.953 | TFLOPs: 26.17 | +7: iteration 22550/ 173500 | consumed samples: 5772800 | consumed tokens: 11822694400 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.847137E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.914 | TFLOPs: 26.17 | +7: iteration 22560/ 173500 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 0.15 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 3.859064E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.991 | TFLOPs: 26.17 | +7: iteration 22570/ 173500 | consumed samples: 5777920 | consumed tokens: 11833180160 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.835736E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.563 | TFLOPs: 26.15 | +7: iteration 22580/ 173500 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.859950E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.515 | TFLOPs: 26.14 | +7: iteration 22590/ 173500 | consumed samples: 5783040 | consumed tokens: 11843665920 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.854842E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.094 | TFLOPs: 26.14 | +7: iteration 22600/ 173500 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.868763E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.670 | TFLOPs: 26.09 | +7: iteration 22610/ 173500 | consumed samples: 5788160 | consumed tokens: 11854151680 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.852327E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.237 | TFLOPs: 26.15 | +7: iteration 22620/ 173500 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.840829E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.530 | TFLOPs: 26.15 | +7: iteration 22630/ 173500 | consumed samples: 5793280 | consumed tokens: 11864637440 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.853872E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.806 | TFLOPs: 26.14 | +7: iteration 22640/ 173500 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.853413E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.430 | TFLOPs: 26.15 | +7: iteration 22650/ 173500 | consumed samples: 5798400 | consumed tokens: 11875123200 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.848560E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.527 | TFLOPs: 26.14 | +7: iteration 22660/ 173500 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.852584E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.108 | TFLOPs: 26.14 | +7: iteration 22670/ 173500 | consumed samples: 5803520 | consumed tokens: 11885608960 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.848073E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.281 | TFLOPs: 26.13 | +7: iteration 22680/ 173500 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.853367E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.881 | TFLOPs: 26.14 | +7: iteration 22690/ 173500 | consumed samples: 5808640 | consumed tokens: 11896094720 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.848676E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.979 | TFLOPs: 26.13 | +7: iteration 22700/ 173500 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.851230E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.142 | TFLOPs: 26.13 | +7: iteration 22710/ 173500 | consumed samples: 5813760 | consumed tokens: 11906580480 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.841367E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.388 | TFLOPs: 26.13 | +7: iteration 22720/ 173500 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 0.15 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 3.851574E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.695 | TFLOPs: 26.15 | +7: iteration 22730/ 173500 | consumed samples: 5818880 | consumed tokens: 11917066240 | elapsed time per iteration (s): 0.17 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.862650E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.855 | TFLOPs: 24.21 | +7: iteration 22740/ 173500 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.843148E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.877 | TFLOPs: 26.16 | +7: iteration 22750/ 173500 | consumed samples: 5824000 | consumed tokens: 11927552000 | elapsed time per iteration (s): 0.16 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.839235E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.712 | TFLOPs: 25.32 | +7: iteration 22760/ 173500 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.859948E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.990 | TFLOPs: 26.13 | +7: iteration 22770/ 173500 | consumed samples: 5829120 | consumed tokens: 11938037760 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.845693E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.524 | TFLOPs: 26.12 | +7: iteration 22780/ 173500 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 0.16 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.864697E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.017 | TFLOPs: 25.85 | +7: iteration 22790/ 173500 | consumed samples: 5834240 | consumed tokens: 11948523520 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.845475E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.325 | TFLOPs: 26.13 | +7: iteration 22800/ 173500 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.862166E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.109 | TFLOPs: 26.14 | +7: iteration 22810/ 173500 | consumed samples: 5839360 | consumed tokens: 11959009280 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.852361E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.472 | TFLOPs: 26.13 | +7: iteration 22820/ 173500 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.829208E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.865 | TFLOPs: 26.12 | +7: iteration 22830/ 173500 | consumed samples: 5844480 | consumed tokens: 11969495040 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.839594E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.780 | TFLOPs: 26.12 | +7: iteration 22840/ 173500 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.842966E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.318 | TFLOPs: 26.13 | +7: iteration 22850/ 173500 | consumed samples: 5849600 | consumed tokens: 11979980800 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.850702E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.765 | TFLOPs: 26.14 | +7: iteration 22860/ 173500 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.843725E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.680 | TFLOPs: 26.12 | +7: iteration 22870/ 173500 | consumed samples: 5854720 | consumed tokens: 11990466560 | elapsed time per iteration (s): 0.15 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.848087E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.132 | TFLOPs: 26.11 | +7: iteration 22880/ 173500 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 0.16 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 3.848998E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.933 | TFLOPs: 25.44 | +7: iteration 22890/ 173500 | consumed samples: 5859840 | consumed tokens: 12000952320 | elapsed time per iteration (s): 0.16 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.855327E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.727 | TFLOPs: 25.79 | +7: iteration 22900/ 173500 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.845483E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.813 | TFLOPs: 26.14 | +7: iteration 22910/ 173500 | consumed samples: 5864960 | consumed tokens: 12011438080 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.848159E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.660 | TFLOPs: 26.14 | +7: iteration 22920/ 173500 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.850531E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.447 | TFLOPs: 26.13 | +7: iteration 22930/ 173500 | consumed samples: 5870080 | consumed tokens: 12021923840 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.856356E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.065 | TFLOPs: 26.14 | +7: iteration 22940/ 173500 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.847276E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.578 | TFLOPs: 26.10 | +7: iteration 22950/ 173500 | consumed samples: 5875200 | consumed tokens: 12032409600 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.839846E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.617 | TFLOPs: 26.14 | +7: iteration 22960/ 173500 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.845880E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.595 | TFLOPs: 26.14 | +7: iteration 22970/ 173500 | consumed samples: 5880320 | consumed tokens: 12042895360 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.847074E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.308 | TFLOPs: 26.15 | +7: iteration 22980/ 173500 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.847907E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.733 | TFLOPs: 26.14 | +7: iteration 22990/ 173500 | consumed samples: 5885440 | consumed tokens: 12053381120 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.848988E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.678 | TFLOPs: 26.22 | +7: iteration 23000/ 173500 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.860755E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.446 | TFLOPs: 26.21 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 23000 | lm loss value: 3.946279E+00 | lm loss PPL: 5.174248E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 23000 to checkpoints_44m91b100m +0: [2023-03-17 01:15:10,427] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step23000 is begin to save! +0: [2023-03-17 01:15:10,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:15:10,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:15:10,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:15:10,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:15:10,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:15:10,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:15:10,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:15:10,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:15:10,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:15:10,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:15:10,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:15:10,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:15:10,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:15:10,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:15:10,540] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:15:10,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:15:10,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:15:10,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:15:10,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:15:10,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:15:10,558] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step23000/mp_rank_00_model_states.pt +0: [2023-03-17 01:15:10,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:15:10,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:15:10,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:15:10,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:15:10,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 01:15:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 01:15:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 01:15:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-17 01:15:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:15:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:15:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 01:15:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-17 01:15:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: successfully saved checkpoint at iteration 23000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.05 +7: iteration 23010/ 173500 | consumed samples: 5890560 | consumed tokens: 12063866880 | elapsed time per iteration (s): 0.18 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.842582E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1459.279 | TFLOPs: 22.89 | +7: iteration 23020/ 173500 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.843507E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.438 | TFLOPs: 26.20 | +7: iteration 23030/ 173500 | consumed samples: 5895680 | consumed tokens: 12074352640 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.856663E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.822 | TFLOPs: 26.22 | +7: iteration 23040/ 173500 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 0.15 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 3.842397E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.302 | TFLOPs: 26.23 | +7: iteration 23050/ 173500 | consumed samples: 5900800 | consumed tokens: 12084838400 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.847929E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.340 | TFLOPs: 26.21 | +7: iteration 23060/ 173500 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.833427E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.268 | TFLOPs: 26.21 | +7: iteration 23070/ 173500 | consumed samples: 5905920 | consumed tokens: 12095324160 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.855169E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.377 | TFLOPs: 26.21 | +7: iteration 23080/ 173500 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.853618E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.402 | TFLOPs: 26.23 | +7: iteration 23090/ 173500 | consumed samples: 5911040 | consumed tokens: 12105809920 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.845613E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.312 | TFLOPs: 26.23 | +7: iteration 23100/ 173500 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.824297E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.688 | TFLOPs: 26.23 | +7: iteration 23110/ 173500 | consumed samples: 5916160 | consumed tokens: 12116295680 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.844964E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.765 | TFLOPs: 26.19 | +7: iteration 23120/ 173500 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.857603E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.018 | TFLOPs: 26.13 | +7: iteration 23130/ 173500 | consumed samples: 5921280 | consumed tokens: 12126781440 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.841287E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.972 | TFLOPs: 26.19 | +7: iteration 23140/ 173500 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.853085E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.194 | TFLOPs: 26.22 | +7: iteration 23150/ 173500 | consumed samples: 5926400 | consumed tokens: 12137267200 | elapsed time per iteration (s): 0.16 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.838037E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.731 | TFLOPs: 25.87 | +7: iteration 23160/ 173500 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.851240E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.880 | TFLOPs: 26.17 | +7: iteration 23170/ 173500 | consumed samples: 5931520 | consumed tokens: 12147752960 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.846093E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.793 | TFLOPs: 25.90 | +7: iteration 23180/ 173500 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 0.16 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.852548E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.267 | TFLOPs: 25.88 | +7: iteration 23190/ 173500 | consumed samples: 5936640 | consumed tokens: 12158238720 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.861158E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.386 | TFLOPs: 26.13 | +7: iteration 23200/ 173500 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 0.15 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 3.819292E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.505 | TFLOPs: 26.15 | +7: iteration 23210/ 173500 | consumed samples: 5941760 | consumed tokens: 12168724480 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.843387E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.238 | TFLOPs: 26.15 | +7: iteration 23220/ 173500 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.832458E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.293 | TFLOPs: 26.16 | +7: iteration 23230/ 173500 | consumed samples: 5946880 | consumed tokens: 12179210240 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.847950E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.031 | TFLOPs: 26.14 | +7: iteration 23240/ 173500 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.846537E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.883 | TFLOPs: 26.19 | +7: iteration 23250/ 173500 | consumed samples: 5952000 | consumed tokens: 12189696000 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.844359E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.600 | TFLOPs: 26.21 | +7: iteration 23260/ 173500 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.839732E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.917 | TFLOPs: 26.20 | +7: iteration 23270/ 173500 | consumed samples: 5957120 | consumed tokens: 12200181760 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.845883E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.715 | TFLOPs: 26.20 | +7: iteration 23280/ 173500 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.842616E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.360 | TFLOPs: 26.20 | +7: iteration 23290/ 173500 | consumed samples: 5962240 | consumed tokens: 12210667520 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.854962E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.395 | TFLOPs: 26.21 | +7: iteration 23300/ 173500 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.841940E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.718 | TFLOPs: 26.17 | +7: iteration 23310/ 173500 | consumed samples: 5967360 | consumed tokens: 12221153280 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.829336E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.904 | TFLOPs: 26.16 | +7: iteration 23320/ 173500 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.838009E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.920 | TFLOPs: 26.17 | +7: iteration 23330/ 173500 | consumed samples: 5972480 | consumed tokens: 12231639040 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.847808E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.496 | TFLOPs: 26.15 | +7: iteration 23340/ 173500 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.843752E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.718 | TFLOPs: 26.12 | +7: iteration 23350/ 173500 | consumed samples: 5977600 | consumed tokens: 12242124800 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.840747E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.150 | TFLOPs: 26.18 | +7: iteration 23360/ 173500 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 0.15 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 3.835474E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.486 | TFLOPs: 26.18 | +7: iteration 23370/ 173500 | consumed samples: 5982720 | consumed tokens: 12252610560 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.855316E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.742 | TFLOPs: 26.17 | +7: iteration 23380/ 173500 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 0.16 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.836454E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.606 | TFLOPs: 24.74 | +7: iteration 23390/ 173500 | consumed samples: 5987840 | consumed tokens: 12263096320 | elapsed time per iteration (s): 0.16 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.853619E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.876 | TFLOPs: 25.40 | +7: iteration 23400/ 173500 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 0.16 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.846743E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.925 | TFLOPs: 24.53 | +7: iteration 23410/ 173500 | consumed samples: 5992960 | consumed tokens: 12273582080 | elapsed time per iteration (s): 0.16 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.839554E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.931 | TFLOPs: 25.20 | +7: iteration 23420/ 173500 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 0.16 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.841669E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.863 | TFLOPs: 25.01 | +7: iteration 23430/ 173500 | consumed samples: 5998080 | consumed tokens: 12284067840 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.845534E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.778 | TFLOPs: 26.19 | +7: iteration 23440/ 173500 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 0.18 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.842441E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1423.521 | TFLOPs: 22.32 | +7: iteration 23450/ 173500 | consumed samples: 6003200 | consumed tokens: 12294553600 | elapsed time per iteration (s): 0.17 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.851031E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.822 | TFLOPs: 24.09 | +7: iteration 23460/ 173500 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.841893E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.664 | TFLOPs: 25.98 | +7: iteration 23470/ 173500 | consumed samples: 6008320 | consumed tokens: 12305039360 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.847469E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.407 | TFLOPs: 26.20 | +7: iteration 23480/ 173500 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.832607E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.238 | TFLOPs: 26.21 | +7: iteration 23490/ 173500 | consumed samples: 6013440 | consumed tokens: 12315525120 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.836868E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.152 | TFLOPs: 26.21 | +7: iteration 23500/ 173500 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.834854E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.315 | TFLOPs: 26.21 | +7: iteration 23510/ 173500 | consumed samples: 6018560 | consumed tokens: 12326010880 | elapsed time per iteration (s): 0.15 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 3.837537E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.611 | TFLOPs: 26.20 | +7: iteration 23520/ 173500 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.857212E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.439 | TFLOPs: 26.18 | +7: iteration 23530/ 173500 | consumed samples: 6023680 | consumed tokens: 12336496640 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.842088E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.088 | TFLOPs: 26.18 | +7: iteration 23540/ 173500 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.823330E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.621 | TFLOPs: 26.18 | +7: iteration 23550/ 173500 | consumed samples: 6028800 | consumed tokens: 12346982400 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.852508E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.119 | TFLOPs: 26.16 | +7: iteration 23560/ 173500 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.846317E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.686 | TFLOPs: 26.18 | +7: iteration 23570/ 173500 | consumed samples: 6033920 | consumed tokens: 12357468160 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.848502E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.636 | TFLOPs: 26.17 | +7: iteration 23580/ 173500 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.846433E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.266 | TFLOPs: 26.16 | +7: iteration 23590/ 173500 | consumed samples: 6039040 | consumed tokens: 12367953920 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.854763E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.571 | TFLOPs: 26.17 | +7: iteration 23600/ 173500 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.841373E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.320 | TFLOPs: 26.16 | +7: iteration 23610/ 173500 | consumed samples: 6044160 | consumed tokens: 12378439680 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.848061E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.208 | TFLOPs: 26.16 | +7: iteration 23620/ 173500 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.850505E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.906 | TFLOPs: 26.14 | +7: iteration 23630/ 173500 | consumed samples: 6049280 | consumed tokens: 12388925440 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.836613E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.391 | TFLOPs: 26.15 | +7: iteration 23640/ 173500 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.849525E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.168 | TFLOPs: 26.19 | +7: iteration 23650/ 173500 | consumed samples: 6054400 | consumed tokens: 12399411200 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.846723E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.426 | TFLOPs: 26.18 | +7: iteration 23660/ 173500 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 0.16 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.841688E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.046 | TFLOPs: 25.45 | +7: iteration 23670/ 173500 | consumed samples: 6059520 | consumed tokens: 12409896960 | elapsed time per iteration (s): 0.15 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 3.841776E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.918 | TFLOPs: 26.11 | +7: iteration 23680/ 173500 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.842222E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.835 | TFLOPs: 26.11 | +7: iteration 23690/ 173500 | consumed samples: 6064640 | consumed tokens: 12420382720 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.840964E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.181 | TFLOPs: 26.08 | +7: iteration 23700/ 173500 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.843389E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.661 | TFLOPs: 25.95 | +7: iteration 23710/ 173500 | consumed samples: 6069760 | consumed tokens: 12430868480 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.830787E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.600 | TFLOPs: 26.15 | +7: iteration 23720/ 173500 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.833282E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.192 | TFLOPs: 26.05 | +7: iteration 23730/ 173500 | consumed samples: 6074880 | consumed tokens: 12441354240 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.844995E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.198 | TFLOPs: 26.16 | +7: iteration 23740/ 173500 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.835866E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.026 | TFLOPs: 26.17 | +7: iteration 23750/ 173500 | consumed samples: 6080000 | consumed tokens: 12451840000 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.836670E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.153 | TFLOPs: 26.18 | +7: iteration 23760/ 173500 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.839804E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.225 | TFLOPs: 26.16 | +7: iteration 23770/ 173500 | consumed samples: 6085120 | consumed tokens: 12462325760 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.846648E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.023 | TFLOPs: 26.17 | +7: iteration 23780/ 173500 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.829420E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.255 | TFLOPs: 26.15 | +7: iteration 23790/ 173500 | consumed samples: 6090240 | consumed tokens: 12472811520 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.842660E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.476 | TFLOPs: 26.12 | +7: iteration 23800/ 173500 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.843040E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.333 | TFLOPs: 26.15 | +7: iteration 23810/ 173500 | consumed samples: 6095360 | consumed tokens: 12483297280 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.845587E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.580 | TFLOPs: 26.17 | +7: iteration 23820/ 173500 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.822173E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.698 | TFLOPs: 26.17 | +7: iteration 23830/ 173500 | consumed samples: 6100480 | consumed tokens: 12493783040 | elapsed time per iteration (s): 0.15 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 3.834757E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.690 | TFLOPs: 26.17 | +7: iteration 23840/ 173500 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.835083E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.150 | TFLOPs: 26.15 | +7: iteration 23850/ 173500 | consumed samples: 6105600 | consumed tokens: 12504268800 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.857097E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.995 | TFLOPs: 26.17 | +7: iteration 23860/ 173500 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.843844E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.789 | TFLOPs: 26.16 | +7: iteration 23870/ 173500 | consumed samples: 6110720 | consumed tokens: 12514754560 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.834733E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.006 | TFLOPs: 26.14 | +7: iteration 23880/ 173500 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.854360E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.456 | TFLOPs: 26.09 | +7: iteration 23890/ 173500 | consumed samples: 6115840 | consumed tokens: 12525240320 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.837179E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.966 | TFLOPs: 26.10 | +7: iteration 23900/ 173500 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.850891E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.443 | TFLOPs: 26.07 | +7: iteration 23910/ 173500 | consumed samples: 6120960 | consumed tokens: 12535726080 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.842254E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.625 | TFLOPs: 26.11 | +7: iteration 23920/ 173500 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.852293E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.046 | TFLOPs: 26.05 | +7: iteration 23930/ 173500 | consumed samples: 6126080 | consumed tokens: 12546211840 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.855217E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.537 | TFLOPs: 26.15 | +7: iteration 23940/ 173500 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.841822E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.114 | TFLOPs: 26.16 | +7: iteration 23950/ 173500 | consumed samples: 6131200 | consumed tokens: 12556697600 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.835884E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.013 | TFLOPs: 26.16 | +7: iteration 23960/ 173500 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.838691E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.174 | TFLOPs: 26.16 | +7: iteration 23970/ 173500 | consumed samples: 6136320 | consumed tokens: 12567183360 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.834099E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.133 | TFLOPs: 26.16 | +7: iteration 23980/ 173500 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 0.15 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 3.851372E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.045 | TFLOPs: 26.08 | +7: iteration 23990/ 173500 | consumed samples: 6141440 | consumed tokens: 12577669120 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.848470E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.782 | TFLOPs: 26.14 | +0: [2023-03-17 01:17:44,916] [INFO] [logging.py:68:log_dist] [Rank 0] step=24000, skipped=0, lr=[0.00019264004235759096, 0.00019264004235759096, 0.00019264004235759096], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 24000/ 173500 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.834013E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.660 | TFLOPs: 26.14 | +0: steps: 24000 loss: 3.8377 iter time (s): 0.152 samples/sec: 1681.272 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 24000 | lm loss value: 3.963075E+00 | lm loss PPL: 5.261888E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 24000 to checkpoints_44m91b100m +0: [2023-03-17 01:17:44,990] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is begin to save! +0: [2023-03-17 01:17:44,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:17:45,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:17:45,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:17:45,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:17:45,072] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:17:45,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:17:45,081] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:17:45,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:17:45,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:17:45,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:17:45,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:17:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:17:45,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:17:45,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:17:45,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:17:45,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:17:45,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:17:45,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:17:45,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:17:45,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:17:45,136] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step24000/mp_rank_00_model_states.pt +0: [2023-03-17 01:17:45,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:17:45,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:17:45,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:17:45,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-17 01:17:45,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-17 01:17:45,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:17:45,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:17:45,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:17:45,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-17 01:17:45,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:17:45,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: successfully saved checkpoint at iteration 24000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 190.80 +7: iteration 24010/ 173500 | consumed samples: 6146560 | consumed tokens: 12588154880 | elapsed time per iteration (s): 0.18 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.852270E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1426.867 | TFLOPs: 22.38 | +7: iteration 24020/ 173500 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.854147E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.330 | TFLOPs: 26.16 | +7: iteration 24030/ 173500 | consumed samples: 6151680 | consumed tokens: 12598640640 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.847772E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.066 | TFLOPs: 26.16 | +7: iteration 24040/ 173500 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.843170E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.295 | TFLOPs: 26.13 | +7: iteration 24050/ 173500 | consumed samples: 6156800 | consumed tokens: 12609126400 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.834832E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.295 | TFLOPs: 26.05 | +7: iteration 24060/ 173500 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.855968E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.872 | TFLOPs: 26.16 | +7: iteration 24070/ 173500 | consumed samples: 6161920 | consumed tokens: 12619612160 | elapsed time per iteration (s): 0.16 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.834642E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.729 | TFLOPs: 25.06 | +7: iteration 24080/ 173500 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.841412E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.799 | TFLOPs: 26.06 | +7: iteration 24090/ 173500 | consumed samples: 6167040 | consumed tokens: 12630097920 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.833630E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.911 | TFLOPs: 26.08 | +7: iteration 24100/ 173500 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.837300E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.874 | TFLOPs: 26.08 | +7: iteration 24110/ 173500 | consumed samples: 6172160 | consumed tokens: 12640583680 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.841350E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.036 | TFLOPs: 26.06 | +7: iteration 24120/ 173500 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.838364E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.311 | TFLOPs: 26.08 | +7: iteration 24130/ 173500 | consumed samples: 6177280 | consumed tokens: 12651069440 | elapsed time per iteration (s): 0.15 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 3.835274E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.616 | TFLOPs: 26.09 | +7: iteration 24140/ 173500 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.844179E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.349 | TFLOPs: 26.09 | +7: iteration 24150/ 173500 | consumed samples: 6182400 | consumed tokens: 12661555200 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.819304E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.370 | TFLOPs: 26.13 | +7: iteration 24160/ 173500 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.839750E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.223 | TFLOPs: 26.16 | +7: iteration 24170/ 173500 | consumed samples: 6187520 | consumed tokens: 12672040960 | elapsed time per iteration (s): 0.16 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.838879E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.375 | TFLOPs: 25.66 | +7: iteration 24180/ 173500 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.836565E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.851 | TFLOPs: 26.12 | +7: iteration 24190/ 173500 | consumed samples: 6192640 | consumed tokens: 12682526720 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.837709E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.829 | TFLOPs: 26.14 | +7: iteration 24200/ 173500 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 0.16 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.831646E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.520 | TFLOPs: 25.81 | +7: iteration 24210/ 173500 | consumed samples: 6197760 | consumed tokens: 12693012480 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.835433E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.331 | TFLOPs: 26.16 | +7: iteration 24220/ 173500 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 0.16 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.847370E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.499 | TFLOPs: 25.90 | +7: iteration 24230/ 173500 | consumed samples: 6202880 | consumed tokens: 12703498240 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.825393E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.324 | TFLOPs: 26.15 | +7: iteration 24240/ 173500 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.839298E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.859 | TFLOPs: 26.06 | +7: iteration 24250/ 173500 | consumed samples: 6208000 | consumed tokens: 12713984000 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.830573E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.446 | TFLOPs: 26.06 | +7: iteration 24260/ 173500 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.836809E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.802 | TFLOPs: 26.14 | +7: iteration 24270/ 173500 | consumed samples: 6213120 | consumed tokens: 12724469760 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.835464E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.130 | TFLOPs: 26.10 | +7: iteration 24280/ 173500 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 0.15 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 3.840318E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.041 | TFLOPs: 26.11 | +7: iteration 24290/ 173500 | consumed samples: 6218240 | consumed tokens: 12734955520 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.835419E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.283 | TFLOPs: 26.10 | +7: iteration 24300/ 173500 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.861214E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.418 | TFLOPs: 26.12 | +7: iteration 24310/ 173500 | consumed samples: 6223360 | consumed tokens: 12745441280 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.845868E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.772 | TFLOPs: 26.14 | +7: iteration 24320/ 173500 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.822592E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.490 | TFLOPs: 26.12 | +7: iteration 24330/ 173500 | consumed samples: 6228480 | consumed tokens: 12755927040 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.850094E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.100 | TFLOPs: 26.13 | +7: iteration 24340/ 173500 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.835110E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.440 | TFLOPs: 26.09 | +7: iteration 24350/ 173500 | consumed samples: 6233600 | consumed tokens: 12766412800 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.849290E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.548 | TFLOPs: 26.07 | +7: iteration 24360/ 173500 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.836187E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.556 | TFLOPs: 25.43 | +7: iteration 24370/ 173500 | consumed samples: 6238720 | consumed tokens: 12776898560 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.840008E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.902 | TFLOPs: 25.55 | +7: iteration 24380/ 173500 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.840391E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.510 | TFLOPs: 25.90 | +7: iteration 24390/ 173500 | consumed samples: 6243840 | consumed tokens: 12787384320 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.828507E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.551 | TFLOPs: 25.85 | +7: iteration 24400/ 173500 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 0.15 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.833025E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.604 | TFLOPs: 25.95 | +7: iteration 24410/ 173500 | consumed samples: 6248960 | consumed tokens: 12797870080 | elapsed time per iteration (s): 0.17 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.846260E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.821 | TFLOPs: 23.82 | +7: iteration 24420/ 173500 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.831001E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.366 | TFLOPs: 24.41 | +7: iteration 24430/ 173500 | consumed samples: 6254080 | consumed tokens: 12808355840 | elapsed time per iteration (s): 0.17 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.840038E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.747 | TFLOPs: 23.49 | +7: iteration 24440/ 173500 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 0.16 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 3.843045E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.401 | TFLOPs: 25.41 | +7: iteration 24450/ 173500 | consumed samples: 6259200 | consumed tokens: 12818841600 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.840274E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.128 | TFLOPs: 26.08 | +7: iteration 24460/ 173500 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 0.16 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.833786E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.340 | TFLOPs: 25.72 | +7: iteration 24470/ 173500 | consumed samples: 6264320 | consumed tokens: 12829327360 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.846753E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.539 | TFLOPs: 26.15 | +7: iteration 24480/ 173500 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.830639E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.763 | TFLOPs: 26.14 | +7: iteration 24490/ 173500 | consumed samples: 6269440 | consumed tokens: 12839813120 | elapsed time per iteration (s): 0.16 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.846527E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.141 | TFLOPs: 25.86 | +7: iteration 24500/ 173500 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.851359E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.638 | TFLOPs: 26.03 | +7: iteration 24510/ 173500 | consumed samples: 6274560 | consumed tokens: 12850298880 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.823658E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.563 | TFLOPs: 26.14 | +7: iteration 24520/ 173500 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.818929E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.738 | TFLOPs: 26.14 | +7: iteration 24530/ 173500 | consumed samples: 6279680 | consumed tokens: 12860784640 | elapsed time per iteration (s): 0.16 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.834826E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.225 | TFLOPs: 25.60 | +7: iteration 24540/ 173500 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.846761E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.074 | TFLOPs: 26.18 | +7: iteration 24550/ 173500 | consumed samples: 6284800 | consumed tokens: 12871270400 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.850561E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.530 | TFLOPs: 25.96 | +7: iteration 24560/ 173500 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.852585E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.614 | TFLOPs: 26.15 | +7: iteration 24570/ 173500 | consumed samples: 6289920 | consumed tokens: 12881756160 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.857997E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.588 | TFLOPs: 26.15 | +7: iteration 24580/ 173500 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 0.15 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.837397E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.145 | TFLOPs: 26.16 | +7: iteration 24590/ 173500 | consumed samples: 6295040 | consumed tokens: 12892241920 | elapsed time per iteration (s): 0.17 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 3.842048E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.514 | TFLOPs: 24.30 | +7: iteration 24600/ 173500 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 0.18 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.830019E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.321 | TFLOPs: 22.85 | +7: iteration 24610/ 173500 | consumed samples: 6300160 | consumed tokens: 12902727680 | elapsed time per iteration (s): 0.20 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.836846E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1257.062 | TFLOPs: 19.71 | +7: iteration 24620/ 173500 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 0.19 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.835233E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.095 | TFLOPs: 21.27 | +7: iteration 24630/ 173500 | consumed samples: 6305280 | consumed tokens: 12913213440 | elapsed time per iteration (s): 0.17 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.827354E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.333 | TFLOPs: 24.28 | +7: iteration 24640/ 173500 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 0.16 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.846226E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.159 | TFLOPs: 25.02 | +7: iteration 24650/ 173500 | consumed samples: 6310400 | consumed tokens: 12923699200 | elapsed time per iteration (s): 0.17 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.842477E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.508 | TFLOPs: 23.81 | +7: iteration 24660/ 173500 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.838214E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.852 | TFLOPs: 26.23 | +7: iteration 24670/ 173500 | consumed samples: 6315520 | consumed tokens: 12934184960 | elapsed time per iteration (s): 0.15 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.838775E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.803 | TFLOPs: 26.08 | +7: iteration 24680/ 173500 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 0.16 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.833295E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.026 | TFLOPs: 25.66 | +7: iteration 24690/ 173500 | consumed samples: 6320640 | consumed tokens: 12944670720 | elapsed time per iteration (s): 0.16 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.842937E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.825 | TFLOPs: 24.54 | +7: iteration 24700/ 173500 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 0.17 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.841491E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.729 | TFLOPs: 23.64 | +7: iteration 24710/ 173500 | consumed samples: 6325760 | consumed tokens: 12955156480 | elapsed time per iteration (s): 0.16 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.845615E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.828 | TFLOPs: 25.18 | +7: iteration 24720/ 173500 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 0.16 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.840215E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.544 | TFLOPs: 25.68 | +7: iteration 24730/ 173500 | consumed samples: 6330880 | consumed tokens: 12965642240 | elapsed time per iteration (s): 0.17 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.825276E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.437 | TFLOPs: 23.92 | +7: iteration 24740/ 173500 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 0.18 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 3.824261E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.857 | TFLOPs: 22.60 | +7: iteration 24750/ 173500 | consumed samples: 6336000 | consumed tokens: 12976128000 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.824003E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.547 | TFLOPs: 25.08 | +7: iteration 24760/ 173500 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 0.17 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.835631E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.029 | TFLOPs: 23.81 | +7: iteration 24770/ 173500 | consumed samples: 6341120 | consumed tokens: 12986613760 | elapsed time per iteration (s): 0.20 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.840348E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1253.945 | TFLOPs: 19.67 | +7: iteration 24780/ 173500 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 0.17 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.850000E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.183 | TFLOPs: 23.09 | +7: iteration 24790/ 173500 | consumed samples: 6346240 | consumed tokens: 12997099520 | elapsed time per iteration (s): 0.17 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.839865E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1478.237 | TFLOPs: 23.18 | +7: iteration 24800/ 173500 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 0.18 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.843749E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1423.709 | TFLOPs: 22.33 | +7: iteration 24810/ 173500 | consumed samples: 6351360 | consumed tokens: 13007585280 | elapsed time per iteration (s): 0.17 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.837295E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.078 | TFLOPs: 23.45 | +7: iteration 24820/ 173500 | consumed samples: 6353920 | consumed tokens: 13012828160 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.836631E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.417 | TFLOPs: 24.44 | +7: iteration 24830/ 173500 | consumed samples: 6356480 | consumed tokens: 13018071040 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.843554E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.148 | TFLOPs: 24.73 | +7: iteration 24840/ 173500 | consumed samples: 6359040 | consumed tokens: 13023313920 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.849398E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.846 | TFLOPs: 25.29 | +7: iteration 24850/ 173500 | consumed samples: 6361600 | consumed tokens: 13028556800 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.839767E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.905 | TFLOPs: 24.57 | +7: iteration 24860/ 173500 | consumed samples: 6364160 | consumed tokens: 13033799680 | elapsed time per iteration (s): 0.17 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.832645E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.613 | TFLOPs: 23.72 | +7: iteration 24870/ 173500 | consumed samples: 6366720 | consumed tokens: 13039042560 | elapsed time per iteration (s): 0.19 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.837870E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1330.735 | TFLOPs: 20.87 | +7: iteration 24880/ 173500 | consumed samples: 6369280 | consumed tokens: 13044285440 | elapsed time per iteration (s): 0.16 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 3.823336E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.152 | TFLOPs: 24.87 | +7: iteration 24890/ 173500 | consumed samples: 6371840 | consumed tokens: 13049528320 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.828019E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.398 | TFLOPs: 25.18 | +7: iteration 24900/ 173500 | consumed samples: 6374400 | consumed tokens: 13054771200 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.827264E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.232 | TFLOPs: 24.80 | +7: iteration 24910/ 173500 | consumed samples: 6376960 | consumed tokens: 13060014080 | elapsed time per iteration (s): 0.17 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.846742E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.966 | TFLOPs: 24.01 | +7: iteration 24920/ 173500 | consumed samples: 6379520 | consumed tokens: 13065256960 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.839584E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.024 | TFLOPs: 25.80 | +7: iteration 24930/ 173500 | consumed samples: 6382080 | consumed tokens: 13070499840 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.841488E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.391 | TFLOPs: 24.47 | +7: iteration 24940/ 173500 | consumed samples: 6384640 | consumed tokens: 13075742720 | elapsed time per iteration (s): 0.17 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.836459E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.110 | TFLOPs: 22.99 | +7: iteration 24950/ 173500 | consumed samples: 6387200 | consumed tokens: 13080985600 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.843986E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.140 | TFLOPs: 25.13 | +7: iteration 24960/ 173500 | consumed samples: 6389760 | consumed tokens: 13086228480 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.854590E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.160 | TFLOPs: 24.92 | +7: iteration 24970/ 173500 | consumed samples: 6392320 | consumed tokens: 13091471360 | elapsed time per iteration (s): 0.17 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.842149E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.494 | TFLOPs: 23.94 | +7: iteration 24980/ 173500 | consumed samples: 6394880 | consumed tokens: 13096714240 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.851999E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.387 | TFLOPs: 25.74 | +7: iteration 24990/ 173500 | consumed samples: 6397440 | consumed tokens: 13101957120 | elapsed time per iteration (s): 0.17 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.845169E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.644 | TFLOPs: 23.74 | +7: iteration 25000/ 173500 | consumed samples: 6400000 | consumed tokens: 13107200000 | elapsed time per iteration (s): 0.19 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.835337E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.868 | TFLOPs: 20.92 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 25000 | lm loss value: 3.910533E+00 | lm loss PPL: 4.992558E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 25000 to checkpoints_44m91b100m +0: [2023-03-17 01:20:25,951] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step25000 is begin to save! +0: [2023-03-17 01:20:25,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:20:26,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:20:26,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:20:26,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:20:26,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:20:26,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:20:26,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:20:26,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:20:26,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:20:26,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:20:26,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:20:26,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:20:26,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:20:26,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:20:26,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:20:26,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:20:26,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:20:26,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:20:26,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:20:26,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:20:26,104] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step25000/mp_rank_00_model_states.pt +0: [2023-03-17 01:20:26,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:20:26,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:20:26,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:20:26,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-17 01:20:26,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:20:26,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:20:26,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:20:26,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:20:26,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:20:26,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-17 01:20:26,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-17 01:20:26,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: successfully saved checkpoint at iteration 25000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 230.00 +7: iteration 25010/ 173500 | consumed samples: 6402560 | consumed tokens: 13112442880 | elapsed time per iteration (s): 0.19 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.830908E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.369 | TFLOPs: 21.58 | +7: iteration 25020/ 173500 | consumed samples: 6405120 | consumed tokens: 13117685760 | elapsed time per iteration (s): 0.17 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.852252E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.116 | TFLOPs: 24.25 | +7: iteration 25030/ 173500 | consumed samples: 6407680 | consumed tokens: 13122928640 | elapsed time per iteration (s): 0.16 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 3.838970E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.407 | TFLOPs: 24.69 | +7: iteration 25040/ 173500 | consumed samples: 6410240 | consumed tokens: 13128171520 | elapsed time per iteration (s): 0.18 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.830130E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1430.757 | TFLOPs: 22.44 | +7: iteration 25050/ 173500 | consumed samples: 6412800 | consumed tokens: 13133414400 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.835011E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.324 | TFLOPs: 24.16 | +7: iteration 25060/ 173500 | consumed samples: 6415360 | consumed tokens: 13138657280 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.836153E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.754 | TFLOPs: 25.10 | +7: iteration 25070/ 173500 | consumed samples: 6417920 | consumed tokens: 13143900160 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.845779E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1478.581 | TFLOPs: 23.19 | +7: iteration 25080/ 173500 | consumed samples: 6420480 | consumed tokens: 13149143040 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.837945E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.407 | TFLOPs: 24.91 | +7: iteration 25090/ 173500 | consumed samples: 6423040 | consumed tokens: 13154385920 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.838916E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.292 | TFLOPs: 25.49 | +7: iteration 25100/ 173500 | consumed samples: 6425600 | consumed tokens: 13159628800 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.823655E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.747 | TFLOPs: 24.02 | +7: iteration 25110/ 173500 | consumed samples: 6428160 | consumed tokens: 13164871680 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.836501E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.972 | TFLOPs: 24.81 | +7: iteration 25120/ 173500 | consumed samples: 6430720 | consumed tokens: 13170114560 | elapsed time per iteration (s): 0.20 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.832166E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1309.967 | TFLOPs: 20.54 | +7: iteration 25130/ 173500 | consumed samples: 6433280 | consumed tokens: 13175357440 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.824902E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.671 | TFLOPs: 23.79 | +7: iteration 25140/ 173500 | consumed samples: 6435840 | consumed tokens: 13180600320 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.834081E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.192 | TFLOPs: 23.64 | +7: iteration 25150/ 173500 | consumed samples: 6438400 | consumed tokens: 13185843200 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.837386E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.088 | TFLOPs: 24.58 | +7: iteration 25160/ 173500 | consumed samples: 6440960 | consumed tokens: 13191086080 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.827561E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.850 | TFLOPs: 24.71 | +7: iteration 25170/ 173500 | consumed samples: 6443520 | consumed tokens: 13196328960 | elapsed time per iteration (s): 0.16 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.832908E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.592 | TFLOPs: 24.49 | +7: iteration 25180/ 173500 | consumed samples: 6446080 | consumed tokens: 13201571840 | elapsed time per iteration (s): 0.17 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 3.827715E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.836 | TFLOPs: 24.12 | +7: iteration 25190/ 173500 | consumed samples: 6448640 | consumed tokens: 13206814720 | elapsed time per iteration (s): 0.19 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.834236E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1345.260 | TFLOPs: 21.10 | +7: iteration 25200/ 173500 | consumed samples: 6451200 | consumed tokens: 13212057600 | elapsed time per iteration (s): 0.17 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.825813E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.270 | TFLOPs: 23.54 | +7: iteration 25210/ 173500 | consumed samples: 6453760 | consumed tokens: 13217300480 | elapsed time per iteration (s): 0.18 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.848060E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1439.711 | TFLOPs: 22.58 | +7: iteration 25220/ 173500 | consumed samples: 6456320 | consumed tokens: 13222543360 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.843444E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.276 | TFLOPs: 24.97 | +7: iteration 25230/ 173500 | consumed samples: 6458880 | consumed tokens: 13227786240 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.824632E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.868 | TFLOPs: 24.76 | +7: iteration 25240/ 173500 | consumed samples: 6461440 | consumed tokens: 13233029120 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.830082E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.627 | TFLOPs: 25.40 | +7: iteration 25250/ 173500 | consumed samples: 6464000 | consumed tokens: 13238272000 | elapsed time per iteration (s): 0.19 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.844721E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1322.408 | TFLOPs: 20.74 | +7: iteration 25260/ 173500 | consumed samples: 6466560 | consumed tokens: 13243514880 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.830801E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.515 | TFLOPs: 24.36 | +7: iteration 25270/ 173500 | consumed samples: 6469120 | consumed tokens: 13248757760 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.839917E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.284 | TFLOPs: 24.38 | +7: iteration 25280/ 173500 | consumed samples: 6471680 | consumed tokens: 13254000640 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.844177E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.171 | TFLOPs: 25.17 | +7: iteration 25290/ 173500 | consumed samples: 6474240 | consumed tokens: 13259243520 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.844395E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.303 | TFLOPs: 25.50 | +7: iteration 25300/ 173500 | consumed samples: 6476800 | consumed tokens: 13264486400 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.837075E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.483 | TFLOPs: 25.04 | +7: iteration 25310/ 173500 | consumed samples: 6479360 | consumed tokens: 13269729280 | elapsed time per iteration (s): 0.16 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.829691E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.187 | TFLOPs: 24.50 | +7: iteration 25320/ 173500 | consumed samples: 6481920 | consumed tokens: 13274972160 | elapsed time per iteration (s): 0.18 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 3.838133E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.546 | TFLOPs: 22.54 | +7: iteration 25330/ 173500 | consumed samples: 6484480 | consumed tokens: 13280215040 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.842588E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.876 | TFLOPs: 25.09 | +7: iteration 25340/ 173500 | consumed samples: 6487040 | consumed tokens: 13285457920 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.832943E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.794 | TFLOPs: 24.90 | +7: iteration 25350/ 173500 | consumed samples: 6489600 | consumed tokens: 13290700800 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.814577E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.574 | TFLOPs: 25.23 | +7: iteration 25360/ 173500 | consumed samples: 6492160 | consumed tokens: 13295943680 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.828550E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.966 | TFLOPs: 26.22 | +7: iteration 25370/ 173500 | consumed samples: 6494720 | consumed tokens: 13301186560 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.820827E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.113 | TFLOPs: 24.42 | +7: iteration 25380/ 173500 | consumed samples: 6497280 | consumed tokens: 13306429440 | elapsed time per iteration (s): 0.18 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.833628E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.741 | TFLOPs: 22.83 | +7: iteration 25390/ 173500 | consumed samples: 6499840 | consumed tokens: 13311672320 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.829781E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.098 | TFLOPs: 25.88 | +7: iteration 25400/ 173500 | consumed samples: 6502400 | consumed tokens: 13316915200 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.841389E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.998 | TFLOPs: 25.59 | +7: iteration 25410/ 173500 | consumed samples: 6504960 | consumed tokens: 13322158080 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.833870E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.536 | TFLOPs: 25.59 | +7: iteration 25420/ 173500 | consumed samples: 6507520 | consumed tokens: 13327400960 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.833673E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.330 | TFLOPs: 25.27 | +7: iteration 25430/ 173500 | consumed samples: 6510080 | consumed tokens: 13332643840 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.823878E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.688 | TFLOPs: 25.84 | +7: iteration 25440/ 173500 | consumed samples: 6512640 | consumed tokens: 13337886720 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.837250E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.914 | TFLOPs: 25.20 | +7: iteration 25450/ 173500 | consumed samples: 6515200 | consumed tokens: 13343129600 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.840539E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.224 | TFLOPs: 24.36 | +7: iteration 25460/ 173500 | consumed samples: 6517760 | consumed tokens: 13348372480 | elapsed time per iteration (s): 0.16 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.838280E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.293 | TFLOPs: 25.43 | +7: iteration 25470/ 173500 | consumed samples: 6520320 | consumed tokens: 13353615360 | elapsed time per iteration (s): 0.15 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 3.836703E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.556 | TFLOPs: 26.29 | +7: iteration 25480/ 173500 | consumed samples: 6522880 | consumed tokens: 13358858240 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.841119E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.188 | TFLOPs: 25.83 | +7: iteration 25490/ 173500 | consumed samples: 6525440 | consumed tokens: 13364101120 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.835865E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.660 | TFLOPs: 25.27 | +7: iteration 25500/ 173500 | consumed samples: 6528000 | consumed tokens: 13369344000 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.836881E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.392 | TFLOPs: 25.69 | +7: iteration 25510/ 173500 | consumed samples: 6530560 | consumed tokens: 13374586880 | elapsed time per iteration (s): 0.19 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.838293E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.933 | TFLOPs: 21.28 | +7: iteration 25520/ 173500 | consumed samples: 6533120 | consumed tokens: 13379829760 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.835427E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.316 | TFLOPs: 25.68 | +7: iteration 25530/ 173500 | consumed samples: 6535680 | consumed tokens: 13385072640 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.838437E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.084 | TFLOPs: 25.30 | +7: iteration 25540/ 173500 | consumed samples: 6538240 | consumed tokens: 13390315520 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.830924E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.377 | TFLOPs: 25.80 | +7: iteration 25550/ 173500 | consumed samples: 6540800 | consumed tokens: 13395558400 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.825037E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.701 | TFLOPs: 25.34 | +7: iteration 25560/ 173500 | consumed samples: 6543360 | consumed tokens: 13400801280 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.834560E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.810 | TFLOPs: 25.07 | +7: iteration 25570/ 173500 | consumed samples: 6545920 | consumed tokens: 13406044160 | elapsed time per iteration (s): 0.17 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.835556E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.921 | TFLOPs: 23.22 | +7: iteration 25580/ 173500 | consumed samples: 6548480 | consumed tokens: 13411287040 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.833121E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.894 | TFLOPs: 25.86 | +7: iteration 25590/ 173500 | consumed samples: 6551040 | consumed tokens: 13416529920 | elapsed time per iteration (s): 0.15 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.834487E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.225 | TFLOPs: 26.24 | +7: iteration 25600/ 173500 | consumed samples: 6553600 | consumed tokens: 13421772800 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.836436E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.858 | TFLOPs: 25.70 | +7: iteration 25610/ 173500 | consumed samples: 6556160 | consumed tokens: 13427015680 | elapsed time per iteration (s): 0.16 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 3.843843E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.433 | TFLOPs: 25.43 | +7: iteration 25620/ 173500 | consumed samples: 6558720 | consumed tokens: 13432258560 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.823482E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.377 | TFLOPs: 26.23 | +7: iteration 25630/ 173500 | consumed samples: 6561280 | consumed tokens: 13437501440 | elapsed time per iteration (s): 0.16 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.836785E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.177 | TFLOPs: 24.75 | +7: iteration 25640/ 173500 | consumed samples: 6563840 | consumed tokens: 13442744320 | elapsed time per iteration (s): 0.17 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.838755E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.900 | TFLOPs: 23.66 | +7: iteration 25650/ 173500 | consumed samples: 6566400 | consumed tokens: 13447987200 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.829614E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.415 | TFLOPs: 26.31 | +7: iteration 25660/ 173500 | consumed samples: 6568960 | consumed tokens: 13453230080 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.835550E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.135 | TFLOPs: 25.91 | +7: iteration 25670/ 173500 | consumed samples: 6571520 | consumed tokens: 13458472960 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.835575E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.819 | TFLOPs: 26.30 | +7: iteration 25680/ 173500 | consumed samples: 6574080 | consumed tokens: 13463715840 | elapsed time per iteration (s): 0.16 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.832396E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.452 | TFLOPs: 25.55 | +7: iteration 25690/ 173500 | consumed samples: 6576640 | consumed tokens: 13468958720 | elapsed time per iteration (s): 0.16 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.848196E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.837 | TFLOPs: 25.81 | +7: iteration 25700/ 173500 | consumed samples: 6579200 | consumed tokens: 13474201600 | elapsed time per iteration (s): 0.16 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.838461E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.303 | TFLOPs: 25.60 | +7: iteration 25710/ 173500 | consumed samples: 6581760 | consumed tokens: 13479444480 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.823989E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.725 | TFLOPs: 26.14 | +7: iteration 25720/ 173500 | consumed samples: 6584320 | consumed tokens: 13484687360 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.832158E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.765 | TFLOPs: 26.26 | +7: iteration 25730/ 173500 | consumed samples: 6586880 | consumed tokens: 13489930240 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.836008E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.839 | TFLOPs: 26.25 | +7: iteration 25740/ 173500 | consumed samples: 6589440 | consumed tokens: 13495173120 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.832596E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.015 | TFLOPs: 25.99 | +7: iteration 25750/ 173500 | consumed samples: 6592000 | consumed tokens: 13500416000 | elapsed time per iteration (s): 0.15 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 3.830030E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.555 | TFLOPs: 26.23 | +7: iteration 25760/ 173500 | consumed samples: 6594560 | consumed tokens: 13505658880 | elapsed time per iteration (s): 0.17 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.839094E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1468.590 | TFLOPs: 23.03 | +7: iteration 25770/ 173500 | consumed samples: 6597120 | consumed tokens: 13510901760 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.828970E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.675 | TFLOPs: 25.20 | +7: iteration 25780/ 173500 | consumed samples: 6599680 | consumed tokens: 13516144640 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.834157E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.500 | TFLOPs: 25.01 | +7: iteration 25790/ 173500 | consumed samples: 6602240 | consumed tokens: 13521387520 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.828328E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.616 | TFLOPs: 25.23 | +7: iteration 25800/ 173500 | consumed samples: 6604800 | consumed tokens: 13526630400 | elapsed time per iteration (s): 0.17 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.831108E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1514.653 | TFLOPs: 23.75 | +7: iteration 25810/ 173500 | consumed samples: 6607360 | consumed tokens: 13531873280 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.827812E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.938 | TFLOPs: 26.16 | +7: iteration 25820/ 173500 | consumed samples: 6609920 | consumed tokens: 13537116160 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.835557E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.788 | TFLOPs: 25.83 | +7: iteration 25830/ 173500 | consumed samples: 6612480 | consumed tokens: 13542359040 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.827712E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.325 | TFLOPs: 26.27 | +7: iteration 25840/ 173500 | consumed samples: 6615040 | consumed tokens: 13547601920 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.828523E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.074 | TFLOPs: 25.80 | +7: iteration 25850/ 173500 | consumed samples: 6617600 | consumed tokens: 13552844800 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.831652E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.662 | TFLOPs: 26.25 | +7: iteration 25860/ 173500 | consumed samples: 6620160 | consumed tokens: 13558087680 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.826139E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.919 | TFLOPs: 25.31 | +7: iteration 25870/ 173500 | consumed samples: 6622720 | consumed tokens: 13563330560 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.837410E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.382 | TFLOPs: 26.20 | +7: iteration 25880/ 173500 | consumed samples: 6625280 | consumed tokens: 13568573440 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.843744E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.126 | TFLOPs: 26.24 | +7: iteration 25890/ 173500 | consumed samples: 6627840 | consumed tokens: 13573816320 | elapsed time per iteration (s): 0.15 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.845026E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.973 | TFLOPs: 25.91 | +7: iteration 25900/ 173500 | consumed samples: 6630400 | consumed tokens: 13579059200 | elapsed time per iteration (s): 0.16 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 3.832484E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.871 | TFLOPs: 25.23 | +7: iteration 25910/ 173500 | consumed samples: 6632960 | consumed tokens: 13584302080 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.833371E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.888 | TFLOPs: 25.59 | +7: iteration 25920/ 173500 | consumed samples: 6635520 | consumed tokens: 13589544960 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.822104E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.340 | TFLOPs: 25.69 | +7: iteration 25930/ 173500 | consumed samples: 6638080 | consumed tokens: 13594787840 | elapsed time per iteration (s): 0.17 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.809083E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.499 | TFLOPs: 24.24 | +7: iteration 25940/ 173500 | consumed samples: 6640640 | consumed tokens: 13600030720 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.839291E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.281 | TFLOPs: 26.01 | +7: iteration 25950/ 173500 | consumed samples: 6643200 | consumed tokens: 13605273600 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.834414E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.632 | TFLOPs: 25.37 | +7: iteration 25960/ 173500 | consumed samples: 6645760 | consumed tokens: 13610516480 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.832859E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.029 | TFLOPs: 26.28 | +7: iteration 25970/ 173500 | consumed samples: 6648320 | consumed tokens: 13615759360 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.811857E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.346 | TFLOPs: 25.90 | +7: iteration 25980/ 173500 | consumed samples: 6650880 | consumed tokens: 13621002240 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.828078E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.199 | TFLOPs: 26.11 | +7: iteration 25990/ 173500 | consumed samples: 6653440 | consumed tokens: 13626245120 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.833832E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.094 | TFLOPs: 25.83 | +0: [2023-03-17 01:23:07,311] [INFO] [logging.py:68:log_dist] [Rank 0] step=26000, skipped=0, lr=[0.00019128112529201118, 0.00019128112529201118, 0.00019128112529201118], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 26000/ 173500 | consumed samples: 6656000 | consumed tokens: 13631488000 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.833514E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.973 | TFLOPs: 26.33 | +0: steps: 26000 loss: 3.8379 iter time (s): 0.159 samples/sec: 1605.641 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 26000 | lm loss value: 3.923929E+00 | lm loss PPL: 5.059884E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 26000 to checkpoints_44m91b100m +0: [2023-03-17 01:23:07,385] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step26000 is begin to save! +0: [2023-03-17 01:23:07,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:23:07,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:23:07,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:23:07,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:23:07,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:23:07,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:23:07,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:23:07,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:23:07,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:23:07,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:23:07,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:23:07,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:23:07,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:23:07,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:23:07,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:23:07,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:23:07,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:23:07,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:23:07,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:23:07,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:23:07,518] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step26000/mp_rank_00_model_states.pt +0: [2023-03-17 01:23:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:23:07,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:23:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:23:07,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:23:07,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:23:07,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-17 01:23:07,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:23:07,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:23:07,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: successfully saved checkpoint at iteration 26000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.31 +7: iteration 26010/ 173500 | consumed samples: 6658560 | consumed tokens: 13636730880 | elapsed time per iteration (s): 0.18 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.838266E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.450 | TFLOPs: 22.62 | +7: iteration 26020/ 173500 | consumed samples: 6661120 | consumed tokens: 13641973760 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.840900E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.551 | TFLOPs: 26.12 | +7: iteration 26030/ 173500 | consumed samples: 6663680 | consumed tokens: 13647216640 | elapsed time per iteration (s): 0.15 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.834875E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.613 | TFLOPs: 25.92 | +7: iteration 26040/ 173500 | consumed samples: 6666240 | consumed tokens: 13652459520 | elapsed time per iteration (s): 0.16 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 3.835158E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.543 | TFLOPs: 25.63 | +7: iteration 26050/ 173500 | consumed samples: 6668800 | consumed tokens: 13657702400 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.834867E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.181 | TFLOPs: 26.16 | +7: iteration 26060/ 173500 | consumed samples: 6671360 | consumed tokens: 13662945280 | elapsed time per iteration (s): 0.16 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.829847E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.629 | TFLOPs: 25.16 | +7: iteration 26070/ 173500 | consumed samples: 6673920 | consumed tokens: 13668188160 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.841718E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.659 | TFLOPs: 26.07 | +7: iteration 26080/ 173500 | consumed samples: 6676480 | consumed tokens: 13673431040 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.833861E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.255 | TFLOPs: 26.21 | +7: iteration 26090/ 173500 | consumed samples: 6679040 | consumed tokens: 13678673920 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.830615E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.820 | TFLOPs: 26.17 | +7: iteration 26100/ 173500 | consumed samples: 6681600 | consumed tokens: 13683916800 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.839846E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.418 | TFLOPs: 26.16 | +7: iteration 26110/ 173500 | consumed samples: 6684160 | consumed tokens: 13689159680 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.833524E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.693 | TFLOPs: 26.25 | +7: iteration 26120/ 173500 | consumed samples: 6686720 | consumed tokens: 13694402560 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.831126E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.506 | TFLOPs: 26.31 | +7: iteration 26130/ 173500 | consumed samples: 6689280 | consumed tokens: 13699645440 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.837438E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.958 | TFLOPs: 26.24 | +7: iteration 26140/ 173500 | consumed samples: 6691840 | consumed tokens: 13704888320 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.834825E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.789 | TFLOPs: 26.20 | +7: iteration 26150/ 173500 | consumed samples: 6694400 | consumed tokens: 13710131200 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.829638E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.756 | TFLOPs: 25.97 | +7: iteration 26160/ 173500 | consumed samples: 6696960 | consumed tokens: 13715374080 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.833522E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.310 | TFLOPs: 26.27 | +7: iteration 26170/ 173500 | consumed samples: 6699520 | consumed tokens: 13720616960 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.829111E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.738 | TFLOPs: 26.26 | +7: iteration 26180/ 173500 | consumed samples: 6702080 | consumed tokens: 13725859840 | elapsed time per iteration (s): 0.15 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 3.826177E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.340 | TFLOPs: 26.27 | +7: iteration 26190/ 173500 | consumed samples: 6704640 | consumed tokens: 13731102720 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.838593E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.034 | TFLOPs: 26.25 | +7: iteration 26200/ 173500 | consumed samples: 6707200 | consumed tokens: 13736345600 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.819344E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.016 | TFLOPs: 25.88 | +7: iteration 26210/ 173500 | consumed samples: 6709760 | consumed tokens: 13741588480 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.824936E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.509 | TFLOPs: 25.90 | +7: iteration 26220/ 173500 | consumed samples: 6712320 | consumed tokens: 13746831360 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.826521E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.695 | TFLOPs: 25.62 | +7: iteration 26230/ 173500 | consumed samples: 6714880 | consumed tokens: 13752074240 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.819635E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.606 | TFLOPs: 26.11 | +7: iteration 26240/ 173500 | consumed samples: 6717440 | consumed tokens: 13757317120 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.834099E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.844 | TFLOPs: 26.11 | +7: iteration 26250/ 173500 | consumed samples: 6720000 | consumed tokens: 13762560000 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.824687E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.552 | TFLOPs: 26.09 | +7: iteration 26260/ 173500 | consumed samples: 6722560 | consumed tokens: 13767802880 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.830227E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.013 | TFLOPs: 26.08 | +7: iteration 26270/ 173500 | consumed samples: 6725120 | consumed tokens: 13773045760 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.838266E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.611 | TFLOPs: 25.90 | +7: iteration 26280/ 173500 | consumed samples: 6727680 | consumed tokens: 13778288640 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.820296E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.545 | TFLOPs: 26.10 | +7: iteration 26290/ 173500 | consumed samples: 6730240 | consumed tokens: 13783531520 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.827195E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.484 | TFLOPs: 26.12 | +7: iteration 26300/ 173500 | consumed samples: 6732800 | consumed tokens: 13788774400 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.826221E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.394 | TFLOPs: 26.09 | +7: iteration 26310/ 173500 | consumed samples: 6735360 | consumed tokens: 13794017280 | elapsed time per iteration (s): 0.16 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.831799E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.251 | TFLOPs: 25.74 | +7: iteration 26320/ 173500 | consumed samples: 6737920 | consumed tokens: 13799260160 | elapsed time per iteration (s): 0.15 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 3.834643E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.146 | TFLOPs: 26.05 | +7: iteration 26330/ 173500 | consumed samples: 6740480 | consumed tokens: 13804503040 | elapsed time per iteration (s): 0.16 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.831936E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.908 | TFLOPs: 25.62 | +7: iteration 26340/ 173500 | consumed samples: 6743040 | consumed tokens: 13809745920 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.822719E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.451 | TFLOPs: 26.09 | +7: iteration 26350/ 173500 | consumed samples: 6745600 | consumed tokens: 13814988800 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.828014E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.160 | TFLOPs: 26.08 | +7: iteration 26360/ 173500 | consumed samples: 6748160 | consumed tokens: 13820231680 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.823834E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.309 | TFLOPs: 26.05 | +7: iteration 26370/ 173500 | consumed samples: 6750720 | consumed tokens: 13825474560 | elapsed time per iteration (s): 0.16 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.823119E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.969 | TFLOPs: 25.50 | +7: iteration 26380/ 173500 | consumed samples: 6753280 | consumed tokens: 13830717440 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.811639E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.416 | TFLOPs: 26.02 | +7: iteration 26390/ 173500 | consumed samples: 6755840 | consumed tokens: 13835960320 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.828701E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.301 | TFLOPs: 26.07 | +7: iteration 26400/ 173500 | consumed samples: 6758400 | consumed tokens: 13841203200 | elapsed time per iteration (s): 0.16 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.831649E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.492 | TFLOPs: 25.65 | +7: iteration 26410/ 173500 | consumed samples: 6760960 | consumed tokens: 13846446080 | elapsed time per iteration (s): 0.16 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.823387E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.783 | TFLOPs: 25.73 | +7: iteration 26420/ 173500 | consumed samples: 6763520 | consumed tokens: 13851688960 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.829417E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.680 | TFLOPs: 26.06 | +7: iteration 26430/ 173500 | consumed samples: 6766080 | consumed tokens: 13856931840 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.830417E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.908 | TFLOPs: 26.08 | +7: iteration 26440/ 173500 | consumed samples: 6768640 | consumed tokens: 13862174720 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.831024E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.145 | TFLOPs: 26.08 | +7: iteration 26450/ 173500 | consumed samples: 6771200 | consumed tokens: 13867417600 | elapsed time per iteration (s): 0.16 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.827054E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.513 | TFLOPs: 25.76 | +7: iteration 26460/ 173500 | consumed samples: 6773760 | consumed tokens: 13872660480 | elapsed time per iteration (s): 0.15 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 3.826590E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.132 | TFLOPs: 26.25 | +7: iteration 26470/ 173500 | consumed samples: 6776320 | consumed tokens: 13877903360 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.826029E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.273 | TFLOPs: 26.21 | +7: iteration 26480/ 173500 | consumed samples: 6778880 | consumed tokens: 13883146240 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.832826E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.246 | TFLOPs: 26.15 | +7: iteration 26490/ 173500 | consumed samples: 6781440 | consumed tokens: 13888389120 | elapsed time per iteration (s): 0.16 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.819310E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.187 | TFLOPs: 25.78 | +7: iteration 26500/ 173500 | consumed samples: 6784000 | consumed tokens: 13893632000 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.833805E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.645 | TFLOPs: 26.18 | +7: iteration 26510/ 173500 | consumed samples: 6786560 | consumed tokens: 13898874880 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.827504E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.579 | TFLOPs: 26.18 | +7: iteration 26520/ 173500 | consumed samples: 6789120 | consumed tokens: 13904117760 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.820913E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.444 | TFLOPs: 26.18 | +7: iteration 26530/ 173500 | consumed samples: 6791680 | consumed tokens: 13909360640 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.820911E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.422 | TFLOPs: 26.18 | +7: iteration 26540/ 173500 | consumed samples: 6794240 | consumed tokens: 13914603520 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.816267E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.702 | TFLOPs: 26.19 | +7: iteration 26550/ 173500 | consumed samples: 6796800 | consumed tokens: 13919846400 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.824817E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.660 | TFLOPs: 25.90 | +7: iteration 26560/ 173500 | consumed samples: 6799360 | consumed tokens: 13925089280 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.842437E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.198 | TFLOPs: 26.08 | +7: iteration 26570/ 173500 | consumed samples: 6801920 | consumed tokens: 13930332160 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.819738E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.759 | TFLOPs: 26.06 | +7: iteration 26580/ 173500 | consumed samples: 6804480 | consumed tokens: 13935575040 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.836734E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.798 | TFLOPs: 26.05 | +7: iteration 26590/ 173500 | consumed samples: 6807040 | consumed tokens: 13940817920 | elapsed time per iteration (s): 0.15 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.827719E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.229 | TFLOPs: 26.07 | +7: iteration 26600/ 173500 | consumed samples: 6809600 | consumed tokens: 13946060800 | elapsed time per iteration (s): 0.16 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 3.820958E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.440 | TFLOPs: 25.73 | +7: iteration 26610/ 173500 | consumed samples: 6812160 | consumed tokens: 13951303680 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.834686E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.783 | TFLOPs: 26.06 | +7: iteration 26620/ 173500 | consumed samples: 6814720 | consumed tokens: 13956546560 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.836403E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.229 | TFLOPs: 26.08 | +7: iteration 26630/ 173500 | consumed samples: 6817280 | consumed tokens: 13961789440 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.831742E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.566 | TFLOPs: 26.09 | +7: iteration 26640/ 173500 | consumed samples: 6819840 | consumed tokens: 13967032320 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.835522E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.391 | TFLOPs: 26.09 | +7: iteration 26650/ 173500 | consumed samples: 6822400 | consumed tokens: 13972275200 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.822076E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.161 | TFLOPs: 26.10 | +7: iteration 26660/ 173500 | consumed samples: 6824960 | consumed tokens: 13977518080 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.826607E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.191 | TFLOPs: 26.10 | +7: iteration 26670/ 173500 | consumed samples: 6827520 | consumed tokens: 13982760960 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.831698E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.800 | TFLOPs: 26.09 | +7: iteration 26680/ 173500 | consumed samples: 6830080 | consumed tokens: 13988003840 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.846842E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.952 | TFLOPs: 26.09 | +7: iteration 26690/ 173500 | consumed samples: 6832640 | consumed tokens: 13993246720 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.829088E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.014 | TFLOPs: 26.10 | +7: iteration 26700/ 173500 | consumed samples: 6835200 | consumed tokens: 13998489600 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.828426E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.115 | TFLOPs: 26.10 | +7: iteration 26710/ 173500 | consumed samples: 6837760 | consumed tokens: 14003732480 | elapsed time per iteration (s): 0.16 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.828665E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.781 | TFLOPs: 25.89 | +7: iteration 26720/ 173500 | consumed samples: 6840320 | consumed tokens: 14008975360 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.836991E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.894 | TFLOPs: 26.08 | +7: iteration 26730/ 173500 | consumed samples: 6842880 | consumed tokens: 14014218240 | elapsed time per iteration (s): 0.16 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.824464E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.729 | TFLOPs: 25.72 | +7: iteration 26740/ 173500 | consumed samples: 6845440 | consumed tokens: 14019461120 | elapsed time per iteration (s): 0.15 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 3.835162E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.526 | TFLOPs: 26.10 | +7: iteration 26750/ 173500 | consumed samples: 6848000 | consumed tokens: 14024704000 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.821735E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.925 | TFLOPs: 26.08 | +7: iteration 26760/ 173500 | consumed samples: 6850560 | consumed tokens: 14029946880 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.822422E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.295 | TFLOPs: 26.01 | +7: iteration 26770/ 173500 | consumed samples: 6853120 | consumed tokens: 14035189760 | elapsed time per iteration (s): 0.16 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.825988E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.309 | TFLOPs: 25.85 | +7: iteration 26780/ 173500 | consumed samples: 6855680 | consumed tokens: 14040432640 | elapsed time per iteration (s): 0.16 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.828723E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.002 | TFLOPs: 25.55 | +7: iteration 26790/ 173500 | consumed samples: 6858240 | consumed tokens: 14045675520 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.818564E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.574 | TFLOPs: 26.10 | +7: iteration 26800/ 173500 | consumed samples: 6860800 | consumed tokens: 14050918400 | elapsed time per iteration (s): 0.16 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.832741E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.395 | TFLOPs: 25.26 | +7: iteration 26810/ 173500 | consumed samples: 6863360 | consumed tokens: 14056161280 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.831070E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.992 | TFLOPs: 26.08 | +7: iteration 26820/ 173500 | consumed samples: 6865920 | consumed tokens: 14061404160 | elapsed time per iteration (s): 0.16 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.832004E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.016 | TFLOPs: 25.67 | +7: iteration 26830/ 173500 | consumed samples: 6868480 | consumed tokens: 14066647040 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.840822E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.280 | TFLOPs: 26.08 | +7: iteration 26840/ 173500 | consumed samples: 6871040 | consumed tokens: 14071889920 | elapsed time per iteration (s): 0.16 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.826171E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.935 | TFLOPs: 25.89 | +7: iteration 26850/ 173500 | consumed samples: 6873600 | consumed tokens: 14077132800 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.833620E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.470 | TFLOPs: 25.99 | +7: iteration 26860/ 173500 | consumed samples: 6876160 | consumed tokens: 14082375680 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.836453E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.003 | TFLOPs: 26.32 | +7: iteration 26870/ 173500 | consumed samples: 6878720 | consumed tokens: 14087618560 | elapsed time per iteration (s): 0.15 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 3.838146E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.710 | TFLOPs: 26.31 | +7: iteration 26880/ 173500 | consumed samples: 6881280 | consumed tokens: 14092861440 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.828622E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.859 | TFLOPs: 26.31 | +7: iteration 26890/ 173500 | consumed samples: 6883840 | consumed tokens: 14098104320 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.815675E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.409 | TFLOPs: 26.29 | +7: iteration 26900/ 173500 | consumed samples: 6886400 | consumed tokens: 14103347200 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.821916E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.730 | TFLOPs: 26.26 | +7: iteration 26910/ 173500 | consumed samples: 6888960 | consumed tokens: 14108590080 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.817123E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.663 | TFLOPs: 26.28 | +7: iteration 26920/ 173500 | consumed samples: 6891520 | consumed tokens: 14113832960 | elapsed time per iteration (s): 0.16 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.809082E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.237 | TFLOPs: 25.27 | +7: iteration 26930/ 173500 | consumed samples: 6894080 | consumed tokens: 14119075840 | elapsed time per iteration (s): 0.16 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.822199E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.221 | TFLOPs: 25.85 | +7: iteration 26940/ 173500 | consumed samples: 6896640 | consumed tokens: 14124318720 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.820006E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.163 | TFLOPs: 26.33 | +7: iteration 26950/ 173500 | consumed samples: 6899200 | consumed tokens: 14129561600 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.817449E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.173 | TFLOPs: 26.35 | +7: iteration 26960/ 173500 | consumed samples: 6901760 | consumed tokens: 14134804480 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.815247E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.927 | TFLOPs: 26.31 | +7: iteration 26970/ 173500 | consumed samples: 6904320 | consumed tokens: 14140047360 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.848201E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.947 | TFLOPs: 26.28 | +7: iteration 26980/ 173500 | consumed samples: 6906880 | consumed tokens: 14145290240 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.818482E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.137 | TFLOPs: 26.32 | +7: iteration 26990/ 173500 | consumed samples: 6909440 | consumed tokens: 14150533120 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.828814E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.539 | TFLOPs: 26.32 | +7: iteration 27000/ 173500 | consumed samples: 6912000 | consumed tokens: 14155776000 | elapsed time per iteration (s): 0.15 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.815997E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.065 | TFLOPs: 26.28 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 27000 | lm loss value: 3.944165E+00 | lm loss PPL: 5.163318E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 27000 to checkpoints_44m91b100m +0: [2023-03-17 01:25:41,799] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step27000 is begin to save! +0: [2023-03-17 01:25:41,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:25:41,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:25:41,865] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:25:41,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:25:41,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:25:41,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:25:41,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:25:41,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:25:41,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:25:41,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:25:41,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:25:41,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:25:41,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:25:41,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:25:41,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:25:41,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:25:41,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:25:41,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:25:41,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:25:41,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:25:41,933] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step27000/mp_rank_00_model_states.pt +0: [2023-03-17 01:25:41,933] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:25:41,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:25:41,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:25:41,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:25:41,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:25:41,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:25:41,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-17 01:25:41,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:25:41,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-17 01:25:41,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:25:41,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:25:41,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-17 01:25:41,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:25:41,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:25:41,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: successfully saved checkpoint at iteration 27000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.51 +7: iteration 27010/ 173500 | consumed samples: 6914560 | consumed tokens: 14161018880 | elapsed time per iteration (s): 0.18 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 3.825689E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1461.166 | TFLOPs: 22.91 | +7: iteration 27020/ 173500 | consumed samples: 6917120 | consumed tokens: 14166261760 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.823190E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.470 | TFLOPs: 26.31 | +7: iteration 27030/ 173500 | consumed samples: 6919680 | consumed tokens: 14171504640 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.826411E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.625 | TFLOPs: 26.33 | +7: iteration 27040/ 173500 | consumed samples: 6922240 | consumed tokens: 14176747520 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.821899E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.535 | TFLOPs: 26.34 | +7: iteration 27050/ 173500 | consumed samples: 6924800 | consumed tokens: 14181990400 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.830993E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.179 | TFLOPs: 26.35 | +7: iteration 27060/ 173500 | consumed samples: 6927360 | consumed tokens: 14187233280 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.828485E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.295 | TFLOPs: 26.30 | +7: iteration 27070/ 173500 | consumed samples: 6929920 | consumed tokens: 14192476160 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.823568E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.769 | TFLOPs: 26.22 | +7: iteration 27080/ 173500 | consumed samples: 6932480 | consumed tokens: 14197719040 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.818692E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.108 | TFLOPs: 26.05 | +7: iteration 27090/ 173500 | consumed samples: 6935040 | consumed tokens: 14202961920 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.819130E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.217 | TFLOPs: 26.35 | +7: iteration 27100/ 173500 | consumed samples: 6937600 | consumed tokens: 14208204800 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.835415E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.224 | TFLOPs: 26.32 | +7: iteration 27110/ 173500 | consumed samples: 6940160 | consumed tokens: 14213447680 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.830063E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.946 | TFLOPs: 26.28 | +7: iteration 27120/ 173500 | consumed samples: 6942720 | consumed tokens: 14218690560 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.827972E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.150 | TFLOPs: 26.30 | +7: iteration 27130/ 173500 | consumed samples: 6945280 | consumed tokens: 14223933440 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.825235E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.764 | TFLOPs: 26.36 | +7: iteration 27140/ 173500 | consumed samples: 6947840 | consumed tokens: 14229176320 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.822094E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.849 | TFLOPs: 26.30 | +7: iteration 27150/ 173500 | consumed samples: 6950400 | consumed tokens: 14234419200 | elapsed time per iteration (s): 0.15 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 3.821276E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.751 | TFLOPs: 26.34 | +7: iteration 27160/ 173500 | consumed samples: 6952960 | consumed tokens: 14239662080 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.816386E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.925 | TFLOPs: 26.31 | +7: iteration 27170/ 173500 | consumed samples: 6955520 | consumed tokens: 14244904960 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.825942E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.753 | TFLOPs: 26.34 | +7: iteration 27180/ 173500 | consumed samples: 6958080 | consumed tokens: 14250147840 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.839871E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.589 | TFLOPs: 26.34 | +7: iteration 27190/ 173500 | consumed samples: 6960640 | consumed tokens: 14255390720 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.816167E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.328 | TFLOPs: 26.32 | +7: iteration 27200/ 173500 | consumed samples: 6963200 | consumed tokens: 14260633600 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.830260E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.166 | TFLOPs: 26.33 | +7: iteration 27210/ 173500 | consumed samples: 6965760 | consumed tokens: 14265876480 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.836141E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.696 | TFLOPs: 26.22 | +7: iteration 27220/ 173500 | consumed samples: 6968320 | consumed tokens: 14271119360 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.821899E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.296 | TFLOPs: 26.15 | +7: iteration 27230/ 173500 | consumed samples: 6970880 | consumed tokens: 14276362240 | elapsed time per iteration (s): 0.16 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.825153E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.019 | TFLOPs: 25.37 | +7: iteration 27240/ 173500 | consumed samples: 6973440 | consumed tokens: 14281605120 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.824820E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.337 | TFLOPs: 26.09 | +7: iteration 27250/ 173500 | consumed samples: 6976000 | consumed tokens: 14286848000 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.827835E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.922 | TFLOPs: 26.17 | +7: iteration 27260/ 173500 | consumed samples: 6978560 | consumed tokens: 14292090880 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.836617E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.387 | TFLOPs: 26.16 | +7: iteration 27270/ 173500 | consumed samples: 6981120 | consumed tokens: 14297333760 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.820599E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.645 | TFLOPs: 26.14 | +7: iteration 27280/ 173500 | consumed samples: 6983680 | consumed tokens: 14302576640 | elapsed time per iteration (s): 0.15 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 3.837725E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.691 | TFLOPs: 26.15 | +7: iteration 27290/ 173500 | consumed samples: 6986240 | consumed tokens: 14307819520 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.817720E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.112 | TFLOPs: 26.16 | +7: iteration 27300/ 173500 | consumed samples: 6988800 | consumed tokens: 14313062400 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.821509E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.485 | TFLOPs: 26.13 | +7: iteration 27310/ 173500 | consumed samples: 6991360 | consumed tokens: 14318305280 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.825007E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.644 | TFLOPs: 26.17 | +7: iteration 27320/ 173500 | consumed samples: 6993920 | consumed tokens: 14323548160 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.808111E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.443 | TFLOPs: 26.15 | +7: iteration 27330/ 173500 | consumed samples: 6996480 | consumed tokens: 14328791040 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.831113E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.818 | TFLOPs: 26.17 | +7: iteration 27340/ 173500 | consumed samples: 6999040 | consumed tokens: 14334033920 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.819394E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.913 | TFLOPs: 26.16 | +7: iteration 27350/ 173500 | consumed samples: 7001600 | consumed tokens: 14339276800 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.820484E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.206 | TFLOPs: 26.18 | +7: iteration 27360/ 173500 | consumed samples: 7004160 | consumed tokens: 14344519680 | elapsed time per iteration (s): 0.16 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.823257E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.792 | TFLOPs: 25.79 | +7: iteration 27370/ 173500 | consumed samples: 7006720 | consumed tokens: 14349762560 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.824199E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 27380/ 173500 | consumed samples: 7009280 | consumed tokens: 14355005440 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.815442E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.967 | TFLOPs: 26.14 | +7: iteration 27390/ 173500 | consumed samples: 7011840 | consumed tokens: 14360248320 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.821990E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.062 | TFLOPs: 26.14 | +7: iteration 27400/ 173500 | consumed samples: 7014400 | consumed tokens: 14365491200 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.816240E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.804 | TFLOPs: 26.17 | +7: iteration 27410/ 173500 | consumed samples: 7016960 | consumed tokens: 14370734080 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.830745E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.446 | TFLOPs: 26.13 | +7: iteration 27420/ 173500 | consumed samples: 7019520 | consumed tokens: 14375976960 | elapsed time per iteration (s): 0.15 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 3.821442E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.974 | TFLOPs: 26.17 | +7: iteration 27430/ 173500 | consumed samples: 7022080 | consumed tokens: 14381219840 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.819970E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.726 | TFLOPs: 26.17 | +7: iteration 27440/ 173500 | consumed samples: 7024640 | consumed tokens: 14386462720 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.817685E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.297 | TFLOPs: 26.16 | +7: iteration 27450/ 173500 | consumed samples: 7027200 | consumed tokens: 14391705600 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.820042E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.458 | TFLOPs: 26.13 | +7: iteration 27460/ 173500 | consumed samples: 7029760 | consumed tokens: 14396948480 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.816265E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.035 | TFLOPs: 26.11 | +7: iteration 27470/ 173500 | consumed samples: 7032320 | consumed tokens: 14402191360 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.838352E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.252 | TFLOPs: 26.27 | +7: iteration 27480/ 173500 | consumed samples: 7034880 | consumed tokens: 14407434240 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.815106E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.643 | TFLOPs: 26.29 | +7: iteration 27490/ 173500 | consumed samples: 7037440 | consumed tokens: 14412677120 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.823753E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.035 | TFLOPs: 26.28 | +7: iteration 27500/ 173500 | consumed samples: 7040000 | consumed tokens: 14417920000 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.832469E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.232 | TFLOPs: 26.30 | +7: iteration 27510/ 173500 | consumed samples: 7042560 | consumed tokens: 14423162880 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.814601E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.192 | TFLOPs: 26.30 | +7: iteration 27520/ 173500 | consumed samples: 7045120 | consumed tokens: 14428405760 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.826879E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.203 | TFLOPs: 26.32 | +7: iteration 27530/ 173500 | consumed samples: 7047680 | consumed tokens: 14433648640 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.820137E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.242 | TFLOPs: 26.29 | +7: iteration 27540/ 173500 | consumed samples: 7050240 | consumed tokens: 14438891520 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.825224E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.201 | TFLOPs: 26.32 | +7: iteration 27550/ 173500 | consumed samples: 7052800 | consumed tokens: 14444134400 | elapsed time per iteration (s): 0.15 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 3.822145E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.856 | TFLOPs: 25.94 | +7: iteration 27560/ 173500 | consumed samples: 7055360 | consumed tokens: 14449377280 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.811137E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.794 | TFLOPs: 26.30 | +7: iteration 27570/ 173500 | consumed samples: 7057920 | consumed tokens: 14454620160 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.820933E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.627 | TFLOPs: 26.25 | +7: iteration 27580/ 173500 | consumed samples: 7060480 | consumed tokens: 14459863040 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.812823E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.046 | TFLOPs: 26.21 | +7: iteration 27590/ 173500 | consumed samples: 7063040 | consumed tokens: 14465105920 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.824724E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.013 | TFLOPs: 26.17 | +7: iteration 27600/ 173500 | consumed samples: 7065600 | consumed tokens: 14470348800 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.807886E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.620 | TFLOPs: 26.20 | +7: iteration 27610/ 173500 | consumed samples: 7068160 | consumed tokens: 14475591680 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.828664E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.776 | TFLOPs: 26.22 | +7: iteration 27620/ 173500 | consumed samples: 7070720 | consumed tokens: 14480834560 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.822327E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.955 | TFLOPs: 26.24 | +7: iteration 27630/ 173500 | consumed samples: 7073280 | consumed tokens: 14486077440 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.824067E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.429 | TFLOPs: 26.23 | +7: iteration 27640/ 173500 | consumed samples: 7075840 | consumed tokens: 14491320320 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.820403E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.015 | TFLOPs: 26.22 | +7: iteration 27650/ 173500 | consumed samples: 7078400 | consumed tokens: 14496563200 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.832191E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.797 | TFLOPs: 26.20 | +7: iteration 27660/ 173500 | consumed samples: 7080960 | consumed tokens: 14501806080 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.816779E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.684 | TFLOPs: 26.20 | +7: iteration 27670/ 173500 | consumed samples: 7083520 | consumed tokens: 14507048960 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.825186E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.028 | TFLOPs: 26.22 | +7: iteration 27680/ 173500 | consumed samples: 7086080 | consumed tokens: 14512291840 | elapsed time per iteration (s): 0.15 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 3.826538E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.869 | TFLOPs: 26.22 | +7: iteration 27690/ 173500 | consumed samples: 7088640 | consumed tokens: 14517534720 | elapsed time per iteration (s): 0.16 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.835962E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.893 | TFLOPs: 25.86 | +7: iteration 27700/ 173500 | consumed samples: 7091200 | consumed tokens: 14522777600 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.827003E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.595 | TFLOPs: 26.20 | +7: iteration 27710/ 173500 | consumed samples: 7093760 | consumed tokens: 14528020480 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.819118E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.250 | TFLOPs: 26.23 | +7: iteration 27720/ 173500 | consumed samples: 7096320 | consumed tokens: 14533263360 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.830755E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.921 | TFLOPs: 26.22 | +7: iteration 27730/ 173500 | consumed samples: 7098880 | consumed tokens: 14538506240 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.826506E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.146 | TFLOPs: 26.21 | +7: iteration 27740/ 173500 | consumed samples: 7101440 | consumed tokens: 14543749120 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.830753E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.946 | TFLOPs: 26.24 | +7: iteration 27750/ 173500 | consumed samples: 7104000 | consumed tokens: 14548992000 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.827210E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.637 | TFLOPs: 26.23 | +7: iteration 27760/ 173500 | consumed samples: 7106560 | consumed tokens: 14554234880 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.834130E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.415 | TFLOPs: 26.21 | +7: iteration 27770/ 173500 | consumed samples: 7109120 | consumed tokens: 14559477760 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.819006E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.203 | TFLOPs: 26.22 | +7: iteration 27780/ 173500 | consumed samples: 7111680 | consumed tokens: 14564720640 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.825532E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.520 | TFLOPs: 26.21 | +7: iteration 27790/ 173500 | consumed samples: 7114240 | consumed tokens: 14569963520 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.826499E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.705 | TFLOPs: 26.23 | +7: iteration 27800/ 173500 | consumed samples: 7116800 | consumed tokens: 14575206400 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.824624E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.385 | TFLOPs: 26.23 | +7: iteration 27810/ 173500 | consumed samples: 7119360 | consumed tokens: 14580449280 | elapsed time per iteration (s): 0.15 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 3.829429E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.173 | TFLOPs: 26.21 | +7: iteration 27820/ 173500 | consumed samples: 7121920 | consumed tokens: 14585692160 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.814945E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.185 | TFLOPs: 26.22 | +7: iteration 27830/ 173500 | consumed samples: 7124480 | consumed tokens: 14590935040 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.840195E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.410 | TFLOPs: 26.20 | +7: iteration 27840/ 173500 | consumed samples: 7127040 | consumed tokens: 14596177920 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.819646E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.726 | TFLOPs: 26.20 | +7: iteration 27850/ 173500 | consumed samples: 7129600 | consumed tokens: 14601420800 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.821611E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.022 | TFLOPs: 26.21 | +7: iteration 27860/ 173500 | consumed samples: 7132160 | consumed tokens: 14606663680 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.829605E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.411 | TFLOPs: 26.16 | +7: iteration 27870/ 173500 | consumed samples: 7134720 | consumed tokens: 14611906560 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.824809E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.611 | TFLOPs: 26.34 | +7: iteration 27880/ 173500 | consumed samples: 7137280 | consumed tokens: 14617149440 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.819624E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.967 | TFLOPs: 26.08 | +7: iteration 27890/ 173500 | consumed samples: 7139840 | consumed tokens: 14622392320 | elapsed time per iteration (s): 0.16 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.819979E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.075 | TFLOPs: 25.49 | +7: iteration 27900/ 173500 | consumed samples: 7142400 | consumed tokens: 14627635200 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.824808E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.027 | TFLOPs: 26.33 | +7: iteration 27910/ 173500 | consumed samples: 7144960 | consumed tokens: 14632878080 | elapsed time per iteration (s): 0.16 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.819447E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.323 | TFLOPs: 25.40 | +7: iteration 27920/ 173500 | consumed samples: 7147520 | consumed tokens: 14638120960 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.816569E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.031 | TFLOPs: 26.16 | +7: iteration 27930/ 173500 | consumed samples: 7150080 | consumed tokens: 14643363840 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.824308E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.524 | TFLOPs: 26.17 | +7: iteration 27940/ 173500 | consumed samples: 7152640 | consumed tokens: 14648606720 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.824626E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.130 | TFLOPs: 26.16 | +7: iteration 27950/ 173500 | consumed samples: 7155200 | consumed tokens: 14653849600 | elapsed time per iteration (s): 0.15 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 3.828077E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.945 | TFLOPs: 26.20 | +7: iteration 27960/ 173500 | consumed samples: 7157760 | consumed tokens: 14659092480 | elapsed time per iteration (s): 0.16 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.815631E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.587 | TFLOPs: 25.85 | +7: iteration 27970/ 173500 | consumed samples: 7160320 | consumed tokens: 14664335360 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.813773E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.688 | TFLOPs: 26.14 | +7: iteration 27980/ 173500 | consumed samples: 7162880 | consumed tokens: 14669578240 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.835073E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.146 | TFLOPs: 26.13 | +7: iteration 27990/ 173500 | consumed samples: 7165440 | consumed tokens: 14674821120 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.839700E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.445 | TFLOPs: 26.13 | +0: [2023-03-17 01:28:15,308] [INFO] [logging.py:68:log_dist] [Rank 0] step=28000, skipped=0, lr=[0.00018981345832700956, 0.00018981345832700956, 0.00018981345832700956], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 28000/ 173500 | consumed samples: 7168000 | consumed tokens: 14680064000 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.827788E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.465 | TFLOPs: 26.17 | +0: steps: 28000 loss: 3.8076 iter time (s): 0.153 samples/sec: 1674.996 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 28000 | lm loss value: 3.929790E+00 | lm loss PPL: 5.089629E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 28000 to checkpoints_44m91b100m +0: [2023-03-17 01:28:15,383] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step28000 is begin to save! +0: [2023-03-17 01:28:15,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:28:15,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:28:15,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:28:15,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:28:15,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:28:15,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:28:15,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:28:15,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:28:15,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:28:15,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:28:15,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:28:15,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:28:15,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:28:15,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:28:15,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:28:15,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:28:15,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:28:15,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:28:15,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:28:15,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:28:15,519] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step28000/mp_rank_00_model_states.pt +0: [2023-03-17 01:28:15,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:28:15,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:28:15,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:28:15,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:28:15,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:28:15,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:28:15,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:28:15,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 01:28:15,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-17 01:28:15,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: successfully saved checkpoint at iteration 28000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.72 +7: iteration 28010/ 173500 | consumed samples: 7170560 | consumed tokens: 14685306880 | elapsed time per iteration (s): 0.18 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.821111E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.934 | TFLOPs: 22.46 | +7: iteration 28020/ 173500 | consumed samples: 7173120 | consumed tokens: 14690549760 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.825330E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.193 | TFLOPs: 26.15 | +7: iteration 28030/ 173500 | consumed samples: 7175680 | consumed tokens: 14695792640 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.818994E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.194 | TFLOPs: 26.15 | +7: iteration 28040/ 173500 | consumed samples: 7178240 | consumed tokens: 14701035520 | elapsed time per iteration (s): 0.16 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.812518E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.360 | TFLOPs: 25.80 | +7: iteration 28050/ 173500 | consumed samples: 7180800 | consumed tokens: 14706278400 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.836620E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.541 | TFLOPs: 26.18 | +7: iteration 28060/ 173500 | consumed samples: 7183360 | consumed tokens: 14711521280 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.821221E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.888 | TFLOPs: 26.19 | +7: iteration 28070/ 173500 | consumed samples: 7185920 | consumed tokens: 14716764160 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.827258E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.938 | TFLOPs: 26.17 | +7: iteration 28080/ 173500 | consumed samples: 7188480 | consumed tokens: 14722007040 | elapsed time per iteration (s): 0.15 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 3.812338E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.882 | TFLOPs: 26.19 | +7: iteration 28090/ 173500 | consumed samples: 7191040 | consumed tokens: 14727249920 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.819939E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.273 | TFLOPs: 25.91 | +7: iteration 28100/ 173500 | consumed samples: 7193600 | consumed tokens: 14732492800 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.828509E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.313 | TFLOPs: 26.18 | +7: iteration 28110/ 173500 | consumed samples: 7196160 | consumed tokens: 14737735680 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.823702E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.378 | TFLOPs: 26.18 | +7: iteration 28120/ 173500 | consumed samples: 7198720 | consumed tokens: 14742978560 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.807948E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.359 | TFLOPs: 26.18 | +7: iteration 28130/ 173500 | consumed samples: 7201280 | consumed tokens: 14748221440 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.821046E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.005 | TFLOPs: 26.17 | +7: iteration 28140/ 173500 | consumed samples: 7203840 | consumed tokens: 14753464320 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.819505E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.861 | TFLOPs: 26.16 | +7: iteration 28150/ 173500 | consumed samples: 7206400 | consumed tokens: 14758707200 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.828786E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.262 | TFLOPs: 26.15 | +7: iteration 28160/ 173500 | consumed samples: 7208960 | consumed tokens: 14763950080 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.818758E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.129 | TFLOPs: 26.14 | +7: iteration 28170/ 173500 | consumed samples: 7211520 | consumed tokens: 14769192960 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.826953E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.481 | TFLOPs: 26.18 | +7: iteration 28180/ 173500 | consumed samples: 7214080 | consumed tokens: 14774435840 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.816074E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.913 | TFLOPs: 26.19 | +7: iteration 28190/ 173500 | consumed samples: 7216640 | consumed tokens: 14779678720 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.832238E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.582 | TFLOPs: 26.18 | +7: iteration 28200/ 173500 | consumed samples: 7219200 | consumed tokens: 14784921600 | elapsed time per iteration (s): 0.15 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.819148E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.529 | TFLOPs: 26.17 | +7: iteration 28210/ 173500 | consumed samples: 7221760 | consumed tokens: 14790164480 | elapsed time per iteration (s): 0.17 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 3.802237E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.303 | TFLOPs: 23.95 | +7: iteration 28220/ 173500 | consumed samples: 7224320 | consumed tokens: 14795407360 | elapsed time per iteration (s): 0.16 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.814496E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.099 | TFLOPs: 25.80 | +7: iteration 28230/ 173500 | consumed samples: 7226880 | consumed tokens: 14800650240 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.814078E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.154 | TFLOPs: 25.97 | +7: iteration 28240/ 173500 | consumed samples: 7229440 | consumed tokens: 14805893120 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.818834E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.372 | TFLOPs: 26.16 | +7: iteration 28250/ 173500 | consumed samples: 7232000 | consumed tokens: 14811136000 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.821575E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.883 | TFLOPs: 26.17 | +7: iteration 28260/ 173500 | consumed samples: 7234560 | consumed tokens: 14816378880 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.813816E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.131 | TFLOPs: 26.19 | +7: iteration 28270/ 173500 | consumed samples: 7237120 | consumed tokens: 14821621760 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.830339E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.330 | TFLOPs: 26.19 | +7: iteration 28280/ 173500 | consumed samples: 7239680 | consumed tokens: 14826864640 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.817053E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.542 | TFLOPs: 25.98 | +7: iteration 28290/ 173500 | consumed samples: 7242240 | consumed tokens: 14832107520 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.812580E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.951 | TFLOPs: 26.19 | +7: iteration 28300/ 173500 | consumed samples: 7244800 | consumed tokens: 14837350400 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.808672E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.992 | TFLOPs: 26.22 | +7: iteration 28310/ 173500 | consumed samples: 7247360 | consumed tokens: 14842593280 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.809448E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.861 | TFLOPs: 26.23 | +7: iteration 28320/ 173500 | consumed samples: 7249920 | consumed tokens: 14847836160 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.823852E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.150 | TFLOPs: 26.22 | +7: iteration 28330/ 173500 | consumed samples: 7252480 | consumed tokens: 14853079040 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.813702E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.928 | TFLOPs: 25.94 | +7: iteration 28340/ 173500 | consumed samples: 7255040 | consumed tokens: 14858321920 | elapsed time per iteration (s): 0.15 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 3.807128E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.199 | TFLOPs: 26.21 | +7: iteration 28350/ 173500 | consumed samples: 7257600 | consumed tokens: 14863564800 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.822125E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.253 | TFLOPs: 26.19 | +7: iteration 28360/ 173500 | consumed samples: 7260160 | consumed tokens: 14868807680 | elapsed time per iteration (s): 0.16 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.818678E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.500 | TFLOPs: 25.74 | +7: iteration 28370/ 173500 | consumed samples: 7262720 | consumed tokens: 14874050560 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.821199E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.913 | TFLOPs: 26.22 | +7: iteration 28380/ 173500 | consumed samples: 7265280 | consumed tokens: 14879293440 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.834136E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.633 | TFLOPs: 26.22 | +7: iteration 28390/ 173500 | consumed samples: 7267840 | consumed tokens: 14884536320 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.805873E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.877 | TFLOPs: 26.23 | +7: iteration 28400/ 173500 | consumed samples: 7270400 | consumed tokens: 14889779200 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.830853E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.092 | TFLOPs: 26.22 | +7: iteration 28410/ 173500 | consumed samples: 7272960 | consumed tokens: 14895022080 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.811755E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.483 | TFLOPs: 26.24 | +7: iteration 28420/ 173500 | consumed samples: 7275520 | consumed tokens: 14900264960 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.807848E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.954 | TFLOPs: 26.17 | +7: iteration 28430/ 173500 | consumed samples: 7278080 | consumed tokens: 14905507840 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.814699E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.092 | TFLOPs: 26.27 | +7: iteration 28440/ 173500 | consumed samples: 7280640 | consumed tokens: 14910750720 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.808867E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.752 | TFLOPs: 26.31 | +7: iteration 28450/ 173500 | consumed samples: 7283200 | consumed tokens: 14915993600 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.805564E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.931 | TFLOPs: 26.33 | +7: iteration 28460/ 173500 | consumed samples: 7285760 | consumed tokens: 14921236480 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.835603E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.625 | TFLOPs: 26.31 | +7: iteration 28470/ 173500 | consumed samples: 7288320 | consumed tokens: 14926479360 | elapsed time per iteration (s): 0.15 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 3.817161E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.512 | TFLOPs: 26.29 | +7: iteration 28480/ 173500 | consumed samples: 7290880 | consumed tokens: 14931722240 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.809402E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.230 | TFLOPs: 26.29 | +7: iteration 28490/ 173500 | consumed samples: 7293440 | consumed tokens: 14936965120 | elapsed time per iteration (s): 0.16 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.813601E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.918 | TFLOPs: 25.72 | +7: iteration 28500/ 173500 | consumed samples: 7296000 | consumed tokens: 14942208000 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.822353E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.283 | TFLOPs: 26.18 | +7: iteration 28510/ 173500 | consumed samples: 7298560 | consumed tokens: 14947450880 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.823692E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.671 | TFLOPs: 26.15 | +7: iteration 28520/ 173500 | consumed samples: 7301120 | consumed tokens: 14952693760 | elapsed time per iteration (s): 0.16 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.822784E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.300 | TFLOPs: 25.61 | +7: iteration 28530/ 173500 | consumed samples: 7303680 | consumed tokens: 14957936640 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.821697E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.080 | TFLOPs: 25.94 | +7: iteration 28540/ 173500 | consumed samples: 7306240 | consumed tokens: 14963179520 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.816985E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.605 | TFLOPs: 26.21 | +7: iteration 28550/ 173500 | consumed samples: 7308800 | consumed tokens: 14968422400 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.806174E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.373 | TFLOPs: 26.21 | +7: iteration 28560/ 173500 | consumed samples: 7311360 | consumed tokens: 14973665280 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.817844E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.713 | TFLOPs: 26.25 | +7: iteration 28570/ 173500 | consumed samples: 7313920 | consumed tokens: 14978908160 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.831004E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.088 | TFLOPs: 26.16 | +7: iteration 28580/ 173500 | consumed samples: 7316480 | consumed tokens: 14984151040 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.820935E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.938 | TFLOPs: 26.09 | +7: iteration 28590/ 173500 | consumed samples: 7319040 | consumed tokens: 14989393920 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.819579E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.006 | TFLOPs: 26.14 | +7: iteration 28600/ 173500 | consumed samples: 7321600 | consumed tokens: 14994636800 | elapsed time per iteration (s): 0.15 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.811855E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.755 | TFLOPs: 26.14 | +7: iteration 28610/ 173500 | consumed samples: 7324160 | consumed tokens: 14999879680 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.814327E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.870 | TFLOPs: 26.11 | +7: iteration 28620/ 173500 | consumed samples: 7326720 | consumed tokens: 15005122560 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.810831E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.519 | TFLOPs: 26.17 | +7: iteration 28630/ 173500 | consumed samples: 7329280 | consumed tokens: 15010365440 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.823875E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.136 | TFLOPs: 26.25 | +7: iteration 28640/ 173500 | consumed samples: 7331840 | consumed tokens: 15015608320 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.815681E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.255 | TFLOPs: 26.21 | +7: iteration 28650/ 173500 | consumed samples: 7334400 | consumed tokens: 15020851200 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.814189E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.028 | TFLOPs: 26.21 | +7: iteration 28660/ 173500 | consumed samples: 7336960 | consumed tokens: 15026094080 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.826492E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.400 | TFLOPs: 26.24 | +7: iteration 28670/ 173500 | consumed samples: 7339520 | consumed tokens: 15031336960 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.824149E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.751 | TFLOPs: 26.25 | +7: iteration 28680/ 173500 | consumed samples: 7342080 | consumed tokens: 15036579840 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.810573E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.822 | TFLOPs: 26.28 | +7: iteration 28690/ 173500 | consumed samples: 7344640 | consumed tokens: 15041822720 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.820285E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.328 | TFLOPs: 26.29 | +7: iteration 28700/ 173500 | consumed samples: 7347200 | consumed tokens: 15047065600 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.828819E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.736 | TFLOPs: 26.23 | +7: iteration 28710/ 173500 | consumed samples: 7349760 | consumed tokens: 15052308480 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.824525E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.391 | TFLOPs: 26.23 | +7: iteration 28720/ 173500 | consumed samples: 7352320 | consumed tokens: 15057551360 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.826808E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.303 | TFLOPs: 26.23 | +7: iteration 28730/ 173500 | consumed samples: 7354880 | consumed tokens: 15062794240 | elapsed time per iteration (s): 0.15 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.823119E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.796 | TFLOPs: 26.20 | +7: iteration 28740/ 173500 | consumed samples: 7357440 | consumed tokens: 15068037120 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.804055E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.762 | TFLOPs: 26.20 | +7: iteration 28750/ 173500 | consumed samples: 7360000 | consumed tokens: 15073280000 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.818296E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.578 | TFLOPs: 26.20 | +7: iteration 28760/ 173500 | consumed samples: 7362560 | consumed tokens: 15078522880 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.816942E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.627 | TFLOPs: 26.22 | +7: iteration 28770/ 173500 | consumed samples: 7365120 | consumed tokens: 15083765760 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.823291E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.995 | TFLOPs: 26.24 | +7: iteration 28780/ 173500 | consumed samples: 7367680 | consumed tokens: 15089008640 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.825961E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.331 | TFLOPs: 26.19 | +7: iteration 28790/ 173500 | consumed samples: 7370240 | consumed tokens: 15094251520 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.819141E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.432 | TFLOPs: 26.21 | +7: iteration 28800/ 173500 | consumed samples: 7372800 | consumed tokens: 15099494400 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.812260E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.938 | TFLOPs: 26.20 | +7: iteration 28810/ 173500 | consumed samples: 7375360 | consumed tokens: 15104737280 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.820039E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.984 | TFLOPs: 26.19 | +7: iteration 28820/ 173500 | consumed samples: 7377920 | consumed tokens: 15109980160 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.828652E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.089 | TFLOPs: 26.19 | +7: iteration 28830/ 173500 | consumed samples: 7380480 | consumed tokens: 15115223040 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.816954E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.867 | TFLOPs: 26.19 | +7: iteration 28840/ 173500 | consumed samples: 7383040 | consumed tokens: 15120465920 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.822422E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.049 | TFLOPs: 26.17 | +7: iteration 28850/ 173500 | consumed samples: 7385600 | consumed tokens: 15125708800 | elapsed time per iteration (s): 0.15 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.822031E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.529 | TFLOPs: 26.20 | +7: iteration 28860/ 173500 | consumed samples: 7388160 | consumed tokens: 15130951680 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.806955E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.561 | TFLOPs: 26.18 | +7: iteration 28870/ 173500 | consumed samples: 7390720 | consumed tokens: 15136194560 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.820519E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.686 | TFLOPs: 26.22 | +7: iteration 28880/ 173500 | consumed samples: 7393280 | consumed tokens: 15141437440 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.812715E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.363 | TFLOPs: 26.16 | +7: iteration 28890/ 173500 | consumed samples: 7395840 | consumed tokens: 15146680320 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.823832E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.227 | TFLOPs: 26.19 | +7: iteration 28900/ 173500 | consumed samples: 7398400 | consumed tokens: 15151923200 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.812571E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.869 | TFLOPs: 26.16 | +7: iteration 28910/ 173500 | consumed samples: 7400960 | consumed tokens: 15157166080 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.811835E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.405 | TFLOPs: 26.15 | +7: iteration 28920/ 173500 | consumed samples: 7403520 | consumed tokens: 15162408960 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.805948E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.342 | TFLOPs: 26.18 | +7: iteration 28930/ 173500 | consumed samples: 7406080 | consumed tokens: 15167651840 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.815469E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.140 | TFLOPs: 26.18 | +7: iteration 28940/ 173500 | consumed samples: 7408640 | consumed tokens: 15172894720 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.815750E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.455 | TFLOPs: 26.17 | +7: iteration 28950/ 173500 | consumed samples: 7411200 | consumed tokens: 15178137600 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.819333E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.083 | TFLOPs: 26.18 | +7: iteration 28960/ 173500 | consumed samples: 7413760 | consumed tokens: 15183380480 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.815482E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.249 | TFLOPs: 26.15 | +7: iteration 28970/ 173500 | consumed samples: 7416320 | consumed tokens: 15188623360 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.813486E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.382 | TFLOPs: 26.20 | +7: iteration 28980/ 173500 | consumed samples: 7418880 | consumed tokens: 15193866240 | elapsed time per iteration (s): 0.15 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 3.799792E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.549 | TFLOPs: 26.20 | +7: iteration 28990/ 173500 | consumed samples: 7421440 | consumed tokens: 15199109120 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.817189E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.566 | TFLOPs: 26.21 | +7: iteration 29000/ 173500 | consumed samples: 7424000 | consumed tokens: 15204352000 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.825631E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.416 | TFLOPs: 26.21 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 29000 | lm loss value: 3.871333E+00 | lm loss PPL: 4.800635E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 29000 to checkpoints_44m91b100m +0: [2023-03-17 01:30:49,211] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step29000 is begin to save! +0: [2023-03-17 01:30:49,215] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:30:49,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:30:49,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:30:49,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:30:49,285] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:30:49,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:30:49,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:30:49,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:30:49,302] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:30:49,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:30:49,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:30:49,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:30:49,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:30:49,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:30:49,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:30:49,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:30:49,335] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:30:49,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:30:49,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:30:49,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:30:49,344] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step29000/mp_rank_00_model_states.pt +0: [2023-03-17 01:30:49,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:30:49,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:30:49,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: successfully saved checkpoint at iteration 29000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.94 +4: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-17 01:30:49,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:30:49,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:30:49,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: iteration 29010/ 173500 | consumed samples: 7426560 | consumed tokens: 15209594880 | elapsed time per iteration (s): 0.18 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.816322E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.366 | TFLOPs: 22.86 | +7: iteration 29020/ 173500 | consumed samples: 7429120 | consumed tokens: 15214837760 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.802035E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.165 | TFLOPs: 26.21 | +7: iteration 29030/ 173500 | consumed samples: 7431680 | consumed tokens: 15220080640 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.808098E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.816 | TFLOPs: 26.20 | +7: iteration 29040/ 173500 | consumed samples: 7434240 | consumed tokens: 15225323520 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.822709E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.441 | TFLOPs: 26.23 | +7: iteration 29050/ 173500 | consumed samples: 7436800 | consumed tokens: 15230566400 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.818518E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.376 | TFLOPs: 26.21 | +7: iteration 29060/ 173500 | consumed samples: 7439360 | consumed tokens: 15235809280 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.818975E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.089 | TFLOPs: 26.21 | +7: iteration 29070/ 173500 | consumed samples: 7441920 | consumed tokens: 15241052160 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.816056E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.470 | TFLOPs: 26.18 | +7: iteration 29080/ 173500 | consumed samples: 7444480 | consumed tokens: 15246295040 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.808315E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.169 | TFLOPs: 26.19 | +7: iteration 29090/ 173500 | consumed samples: 7447040 | consumed tokens: 15251537920 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.825733E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.456 | TFLOPs: 26.21 | +7: iteration 29100/ 173500 | consumed samples: 7449600 | consumed tokens: 15256780800 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.814100E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.333 | TFLOPs: 26.21 | +7: iteration 29110/ 173500 | consumed samples: 7452160 | consumed tokens: 15262023680 | elapsed time per iteration (s): 0.15 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.817849E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.444 | TFLOPs: 26.21 | +7: iteration 29120/ 173500 | consumed samples: 7454720 | consumed tokens: 15267266560 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.815799E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.118 | TFLOPs: 26.22 | +7: iteration 29130/ 173500 | consumed samples: 7457280 | consumed tokens: 15272509440 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.818967E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.727 | TFLOPs: 26.20 | +7: iteration 29140/ 173500 | consumed samples: 7459840 | consumed tokens: 15277752320 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.824802E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.009 | TFLOPs: 26.22 | +7: iteration 29150/ 173500 | consumed samples: 7462400 | consumed tokens: 15282995200 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.827549E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.587 | TFLOPs: 26.21 | +7: iteration 29160/ 173500 | consumed samples: 7464960 | consumed tokens: 15288238080 | elapsed time per iteration (s): 0.16 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.810027E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.026 | TFLOPs: 25.56 | +7: iteration 29170/ 173500 | consumed samples: 7467520 | consumed tokens: 15293480960 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.834023E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.932 | TFLOPs: 26.20 | +7: iteration 29180/ 173500 | consumed samples: 7470080 | consumed tokens: 15298723840 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.823351E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.755 | TFLOPs: 26.20 | +7: iteration 29190/ 173500 | consumed samples: 7472640 | consumed tokens: 15303966720 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.809926E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.346 | TFLOPs: 26.20 | +7: iteration 29200/ 173500 | consumed samples: 7475200 | consumed tokens: 15309209600 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.809754E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.713 | TFLOPs: 26.19 | +7: iteration 29210/ 173500 | consumed samples: 7477760 | consumed tokens: 15314452480 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.822493E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.522 | TFLOPs: 25.93 | +7: iteration 29220/ 173500 | consumed samples: 7480320 | consumed tokens: 15319695360 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.813083E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.416 | TFLOPs: 26.16 | +7: iteration 29230/ 173500 | consumed samples: 7482880 | consumed tokens: 15324938240 | elapsed time per iteration (s): 0.15 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 3.820959E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.806 | TFLOPs: 26.20 | +7: iteration 29240/ 173500 | consumed samples: 7485440 | consumed tokens: 15330181120 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.785648E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.741 | TFLOPs: 26.19 | +7: iteration 29250/ 173500 | consumed samples: 7488000 | consumed tokens: 15335424000 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.820845E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.664 | TFLOPs: 26.00 | +7: iteration 29260/ 173500 | consumed samples: 7490560 | consumed tokens: 15340666880 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.817155E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.338 | TFLOPs: 26.20 | +7: iteration 29270/ 173500 | consumed samples: 7493120 | consumed tokens: 15345909760 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.808931E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.435 | TFLOPs: 26.20 | +7: iteration 29280/ 173500 | consumed samples: 7495680 | consumed tokens: 15351152640 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.809697E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.804 | TFLOPs: 26.19 | +7: iteration 29290/ 173500 | consumed samples: 7498240 | consumed tokens: 15356395520 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.829674E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.734 | TFLOPs: 26.20 | +7: iteration 29300/ 173500 | consumed samples: 7500800 | consumed tokens: 15361638400 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.819662E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.896 | TFLOPs: 26.20 | +7: iteration 29310/ 173500 | consumed samples: 7503360 | consumed tokens: 15366881280 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.811406E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.074 | TFLOPs: 26.18 | +7: iteration 29320/ 173500 | consumed samples: 7505920 | consumed tokens: 15372124160 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.822110E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.794 | TFLOPs: 26.14 | +7: iteration 29330/ 173500 | consumed samples: 7508480 | consumed tokens: 15377367040 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.825027E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.424 | TFLOPs: 26.12 | +7: iteration 29340/ 173500 | consumed samples: 7511040 | consumed tokens: 15382609920 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.820728E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.119 | TFLOPs: 26.13 | +7: iteration 29350/ 173500 | consumed samples: 7513600 | consumed tokens: 15387852800 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.811561E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.915 | TFLOPs: 25.97 | +7: iteration 29360/ 173500 | consumed samples: 7516160 | consumed tokens: 15393095680 | elapsed time per iteration (s): 0.15 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.818915E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.939 | TFLOPs: 26.02 | +7: iteration 29370/ 173500 | consumed samples: 7518720 | consumed tokens: 15398338560 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.799283E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.315 | TFLOPs: 26.10 | +7: iteration 29380/ 173500 | consumed samples: 7521280 | consumed tokens: 15403581440 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.817620E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.926 | TFLOPs: 26.13 | +7: iteration 29390/ 173500 | consumed samples: 7523840 | consumed tokens: 15408824320 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.805290E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.228 | TFLOPs: 26.10 | +7: iteration 29400/ 173500 | consumed samples: 7526400 | consumed tokens: 15414067200 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.810954E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.010 | TFLOPs: 26.11 | +7: iteration 29410/ 173500 | consumed samples: 7528960 | consumed tokens: 15419310080 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.816399E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.415 | TFLOPs: 26.13 | +7: iteration 29420/ 173500 | consumed samples: 7531520 | consumed tokens: 15424552960 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.813343E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.963 | TFLOPs: 26.19 | +7: iteration 29430/ 173500 | consumed samples: 7534080 | consumed tokens: 15429795840 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.804599E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.746 | TFLOPs: 26.19 | +7: iteration 29440/ 173500 | consumed samples: 7536640 | consumed tokens: 15435038720 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.820692E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.072 | TFLOPs: 26.19 | +7: iteration 29450/ 173500 | consumed samples: 7539200 | consumed tokens: 15440281600 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.801850E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.381 | TFLOPs: 26.18 | +7: iteration 29460/ 173500 | consumed samples: 7541760 | consumed tokens: 15445524480 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.801833E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.702 | TFLOPs: 26.19 | +7: iteration 29470/ 173500 | consumed samples: 7544320 | consumed tokens: 15450767360 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.796391E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.387 | TFLOPs: 26.05 | +7: iteration 29480/ 173500 | consumed samples: 7546880 | consumed tokens: 15456010240 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.811930E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.352 | TFLOPs: 26.18 | +7: iteration 29490/ 173500 | consumed samples: 7549440 | consumed tokens: 15461253120 | elapsed time per iteration (s): 0.15 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 3.818104E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.942 | TFLOPs: 26.14 | +7: iteration 29500/ 173500 | consumed samples: 7552000 | consumed tokens: 15466496000 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.809384E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.832 | TFLOPs: 26.17 | +7: iteration 29510/ 173500 | consumed samples: 7554560 | consumed tokens: 15471738880 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.808286E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.417 | TFLOPs: 26.18 | +7: iteration 29520/ 173500 | consumed samples: 7557120 | consumed tokens: 15476981760 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.798236E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.299 | TFLOPs: 26.10 | +7: iteration 29530/ 173500 | consumed samples: 7559680 | consumed tokens: 15482224640 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.813187E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.692 | TFLOPs: 25.92 | +7: iteration 29540/ 173500 | consumed samples: 7562240 | consumed tokens: 15487467520 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.808319E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.675 | TFLOPs: 26.17 | +7: iteration 29550/ 173500 | consumed samples: 7564800 | consumed tokens: 15492710400 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.816513E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.316 | TFLOPs: 26.15 | +7: iteration 29560/ 173500 | consumed samples: 7567360 | consumed tokens: 15497953280 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.815575E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.119 | TFLOPs: 26.18 | +7: iteration 29570/ 173500 | consumed samples: 7569920 | consumed tokens: 15503196160 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.817125E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.023 | TFLOPs: 26.19 | +7: iteration 29580/ 173500 | consumed samples: 7572480 | consumed tokens: 15508439040 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.824282E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.175 | TFLOPs: 26.22 | +7: iteration 29590/ 173500 | consumed samples: 7575040 | consumed tokens: 15513681920 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.814748E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.792 | TFLOPs: 26.22 | +7: iteration 29600/ 173500 | consumed samples: 7577600 | consumed tokens: 15518924800 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.820994E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.386 | TFLOPs: 26.20 | +7: iteration 29610/ 173500 | consumed samples: 7580160 | consumed tokens: 15524167680 | elapsed time per iteration (s): 0.15 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.810363E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.561 | TFLOPs: 26.21 | +7: iteration 29620/ 173500 | consumed samples: 7582720 | consumed tokens: 15529410560 | elapsed time per iteration (s): 0.16 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.809136E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.331 | TFLOPs: 25.50 | +7: iteration 29630/ 173500 | consumed samples: 7585280 | consumed tokens: 15534653440 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.816589E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.415 | TFLOPs: 26.16 | +7: iteration 29640/ 173500 | consumed samples: 7587840 | consumed tokens: 15539896320 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.808643E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.054 | TFLOPs: 26.19 | +7: iteration 29650/ 173500 | consumed samples: 7590400 | consumed tokens: 15545139200 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.816962E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.585 | TFLOPs: 26.21 | +7: iteration 29660/ 173500 | consumed samples: 7592960 | consumed tokens: 15550382080 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.820352E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.532 | TFLOPs: 26.20 | +7: iteration 29670/ 173500 | consumed samples: 7595520 | consumed tokens: 15555624960 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.817690E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.945 | TFLOPs: 26.19 | +7: iteration 29680/ 173500 | consumed samples: 7598080 | consumed tokens: 15560867840 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.821733E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.231 | TFLOPs: 26.19 | +7: iteration 29690/ 173500 | consumed samples: 7600640 | consumed tokens: 15566110720 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.821933E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.074 | TFLOPs: 26.19 | +7: iteration 29700/ 173500 | consumed samples: 7603200 | consumed tokens: 15571353600 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.811441E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.896 | TFLOPs: 26.17 | +7: iteration 29710/ 173500 | consumed samples: 7605760 | consumed tokens: 15576596480 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.807261E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.185 | TFLOPs: 26.19 | +7: iteration 29720/ 173500 | consumed samples: 7608320 | consumed tokens: 15581839360 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.820193E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.808 | TFLOPs: 26.20 | +7: iteration 29730/ 173500 | consumed samples: 7610880 | consumed tokens: 15587082240 | elapsed time per iteration (s): 0.15 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 3.806988E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.086 | TFLOPs: 26.21 | +7: iteration 29740/ 173500 | consumed samples: 7613440 | consumed tokens: 15592325120 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.818546E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.819 | TFLOPs: 26.23 | +7: iteration 29750/ 173500 | consumed samples: 7616000 | consumed tokens: 15597568000 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.831818E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.546 | TFLOPs: 26.23 | +7: iteration 29760/ 173500 | consumed samples: 7618560 | consumed tokens: 15602810880 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.822491E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.769 | TFLOPs: 26.25 | +7: iteration 29770/ 173500 | consumed samples: 7621120 | consumed tokens: 15608053760 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.798859E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.640 | TFLOPs: 26.26 | +7: iteration 29780/ 173500 | consumed samples: 7623680 | consumed tokens: 15613296640 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.810445E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.220 | TFLOPs: 26.26 | +7: iteration 29790/ 173500 | consumed samples: 7626240 | consumed tokens: 15618539520 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.816339E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.777 | TFLOPs: 26.11 | +7: iteration 29800/ 173500 | consumed samples: 7628800 | consumed tokens: 15623782400 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.806971E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.959 | TFLOPs: 26.20 | +7: iteration 29810/ 173500 | consumed samples: 7631360 | consumed tokens: 15629025280 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.822467E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.159 | TFLOPs: 26.19 | +7: iteration 29820/ 173500 | consumed samples: 7633920 | consumed tokens: 15634268160 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.811716E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.818 | TFLOPs: 26.20 | +7: iteration 29830/ 173500 | consumed samples: 7636480 | consumed tokens: 15639511040 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.817849E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.793 | TFLOPs: 25.92 | +7: iteration 29840/ 173500 | consumed samples: 7639040 | consumed tokens: 15644753920 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.810626E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.652 | TFLOPs: 26.20 | +7: iteration 29850/ 173500 | consumed samples: 7641600 | consumed tokens: 15649996800 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.808313E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.840 | TFLOPs: 26.20 | +7: iteration 29860/ 173500 | consumed samples: 7644160 | consumed tokens: 15655239680 | elapsed time per iteration (s): 0.15 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.818044E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.085 | TFLOPs: 26.19 | +7: iteration 29870/ 173500 | consumed samples: 7646720 | consumed tokens: 15660482560 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.813644E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.168 | TFLOPs: 26.19 | +7: iteration 29880/ 173500 | consumed samples: 7649280 | consumed tokens: 15665725440 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.810537E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.159 | TFLOPs: 26.19 | +7: iteration 29890/ 173500 | consumed samples: 7651840 | consumed tokens: 15670968320 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.808171E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.116 | TFLOPs: 26.19 | +7: iteration 29900/ 173500 | consumed samples: 7654400 | consumed tokens: 15676211200 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.810394E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.382 | TFLOPs: 26.18 | +7: iteration 29910/ 173500 | consumed samples: 7656960 | consumed tokens: 15681454080 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.811755E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.233 | TFLOPs: 26.19 | +7: iteration 29920/ 173500 | consumed samples: 7659520 | consumed tokens: 15686696960 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.786976E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.063 | TFLOPs: 26.18 | +7: iteration 29930/ 173500 | consumed samples: 7662080 | consumed tokens: 15691939840 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.820051E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.068 | TFLOPs: 26.21 | +7: iteration 29940/ 173500 | consumed samples: 7664640 | consumed tokens: 15697182720 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.797823E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.365 | TFLOPs: 26.23 | +7: iteration 29950/ 173500 | consumed samples: 7667200 | consumed tokens: 15702425600 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.816796E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.051 | TFLOPs: 26.24 | +7: iteration 29960/ 173500 | consumed samples: 7669760 | consumed tokens: 15707668480 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.809457E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.855 | TFLOPs: 26.23 | +7: iteration 29970/ 173500 | consumed samples: 7672320 | consumed tokens: 15712911360 | elapsed time per iteration (s): 0.15 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.809441E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.865 | TFLOPs: 26.22 | +7: iteration 29980/ 173500 | consumed samples: 7674880 | consumed tokens: 15718154240 | elapsed time per iteration (s): 0.16 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.807413E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.188 | TFLOPs: 25.77 | +7: iteration 29990/ 173500 | consumed samples: 7677440 | consumed tokens: 15723397120 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.815391E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.997 | TFLOPs: 26.25 | +0: [2023-03-17 01:33:22,863] [INFO] [logging.py:68:log_dist] [Rank 0] step=30000, skipped=0, lr=[0.00018823900512431258, 0.00018823900512431258, 0.00018823900512431258], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 30000/ 173500 | consumed samples: 7680000 | consumed tokens: 15728640000 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.818346E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.154 | TFLOPs: 26.25 | +0: steps: 30000 loss: 3.8110 iter time (s): 0.153 samples/sec: 1677.471 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 30000 | lm loss value: 3.930682E+00 | lm loss PPL: 5.094169E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 30000 to checkpoints_44m91b100m +0: [2023-03-17 01:33:22,936] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is begin to save! +0: [2023-03-17 01:33:22,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:33:23,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:33:23,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:33:23,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:33:23,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:33:23,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:33:23,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:33:23,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:33:23,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:33:23,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:33:23,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:33:23,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:33:23,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:33:23,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:33:23,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:33:23,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:33:23,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:33:23,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:33:23,066] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:33:23,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:33:23,068] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step30000/mp_rank_00_model_states.pt +0: [2023-03-17 01:33:23,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:33:23,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:33:23,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:33:23,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:33:23,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-17 01:33:23,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:33:23,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-17 01:33:23,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-17 01:33:23,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:33:23,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: successfully saved checkpoint at iteration 30000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.64 +7: iteration 30010/ 173500 | consumed samples: 7682560 | consumed tokens: 15733882880 | elapsed time per iteration (s): 0.18 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.806642E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.899 | TFLOPs: 22.60 | +7: iteration 30020/ 173500 | consumed samples: 7685120 | consumed tokens: 15739125760 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.806859E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.620 | TFLOPs: 26.28 | +7: iteration 30030/ 173500 | consumed samples: 7687680 | consumed tokens: 15744368640 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.799398E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.888 | TFLOPs: 26.25 | +7: iteration 30040/ 173500 | consumed samples: 7690240 | consumed tokens: 15749611520 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.810458E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.774 | TFLOPs: 26.25 | +7: iteration 30050/ 173500 | consumed samples: 7692800 | consumed tokens: 15754854400 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.799451E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.908 | TFLOPs: 26.25 | +7: iteration 30060/ 173500 | consumed samples: 7695360 | consumed tokens: 15760097280 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.807978E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.995 | TFLOPs: 26.28 | +7: iteration 30070/ 173500 | consumed samples: 7697920 | consumed tokens: 15765340160 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.812558E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.779 | TFLOPs: 26.19 | +7: iteration 30080/ 173500 | consumed samples: 7700480 | consumed tokens: 15770583040 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.814770E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.527 | TFLOPs: 26.20 | +7: iteration 30090/ 173500 | consumed samples: 7703040 | consumed tokens: 15775825920 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.798878E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.832 | TFLOPs: 26.19 | +7: iteration 30100/ 173500 | consumed samples: 7705600 | consumed tokens: 15781068800 | elapsed time per iteration (s): 0.15 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.822963E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.786 | TFLOPs: 26.17 | +7: iteration 30110/ 173500 | consumed samples: 7708160 | consumed tokens: 15786311680 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.804705E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.011 | TFLOPs: 26.19 | +7: iteration 30120/ 173500 | consumed samples: 7710720 | consumed tokens: 15791554560 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.806699E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.045 | TFLOPs: 26.17 | +7: iteration 30130/ 173500 | consumed samples: 7713280 | consumed tokens: 15796797440 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.821088E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.312 | TFLOPs: 26.19 | +7: iteration 30140/ 173500 | consumed samples: 7715840 | consumed tokens: 15802040320 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.808342E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.424 | TFLOPs: 26.17 | +7: iteration 30150/ 173500 | consumed samples: 7718400 | consumed tokens: 15807283200 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.832444E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.651 | TFLOPs: 26.20 | +7: iteration 30160/ 173500 | consumed samples: 7720960 | consumed tokens: 15812526080 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.793854E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.755 | TFLOPs: 26.20 | +7: iteration 30170/ 173500 | consumed samples: 7723520 | consumed tokens: 15817768960 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.815309E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.217 | TFLOPs: 26.21 | +7: iteration 30180/ 173500 | consumed samples: 7726080 | consumed tokens: 15823011840 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.808034E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.597 | TFLOPs: 26.20 | +7: iteration 30190/ 173500 | consumed samples: 7728640 | consumed tokens: 15828254720 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.819464E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.009 | TFLOPs: 26.16 | +7: iteration 30200/ 173500 | consumed samples: 7731200 | consumed tokens: 15833497600 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.806418E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.251 | TFLOPs: 26.15 | +7: iteration 30210/ 173500 | consumed samples: 7733760 | consumed tokens: 15838740480 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.818342E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.451 | TFLOPs: 26.12 | +7: iteration 30220/ 173500 | consumed samples: 7736320 | consumed tokens: 15843983360 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.805897E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.240 | TFLOPs: 26.13 | +7: iteration 30230/ 173500 | consumed samples: 7738880 | consumed tokens: 15849226240 | elapsed time per iteration (s): 0.15 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.805896E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.625 | TFLOPs: 26.14 | +7: iteration 30240/ 173500 | consumed samples: 7741440 | consumed tokens: 15854469120 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.805763E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.072 | TFLOPs: 26.14 | +7: iteration 30250/ 173500 | consumed samples: 7744000 | consumed tokens: 15859712000 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.797723E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.281 | TFLOPs: 26.12 | +7: iteration 30260/ 173500 | consumed samples: 7746560 | consumed tokens: 15864954880 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.806129E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.888 | TFLOPs: 26.14 | +7: iteration 30270/ 173500 | consumed samples: 7749120 | consumed tokens: 15870197760 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.809927E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.516 | TFLOPs: 26.10 | +7: iteration 30280/ 173500 | consumed samples: 7751680 | consumed tokens: 15875440640 | elapsed time per iteration (s): 0.16 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.827500E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.121 | TFLOPs: 25.41 | +7: iteration 30290/ 173500 | consumed samples: 7754240 | consumed tokens: 15880683520 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.799003E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.825 | TFLOPs: 26.09 | +7: iteration 30300/ 173500 | consumed samples: 7756800 | consumed tokens: 15885926400 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.819287E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.996 | TFLOPs: 26.11 | +7: iteration 30310/ 173500 | consumed samples: 7759360 | consumed tokens: 15891169280 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.821481E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.248 | TFLOPs: 26.12 | +7: iteration 30320/ 173500 | consumed samples: 7761920 | consumed tokens: 15896412160 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.796892E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.055 | TFLOPs: 26.11 | +7: iteration 30330/ 173500 | consumed samples: 7764480 | consumed tokens: 15901655040 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.798228E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.927 | TFLOPs: 26.09 | +7: iteration 30340/ 173500 | consumed samples: 7767040 | consumed tokens: 15906897920 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.805382E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.833 | TFLOPs: 26.12 | +7: iteration 30350/ 173500 | consumed samples: 7769600 | consumed tokens: 15912140800 | elapsed time per iteration (s): 0.15 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.818874E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.256 | TFLOPs: 26.15 | +7: iteration 30360/ 173500 | consumed samples: 7772160 | consumed tokens: 15917383680 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.805763E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.248 | TFLOPs: 26.13 | +7: iteration 30370/ 173500 | consumed samples: 7774720 | consumed tokens: 15922626560 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.801818E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.907 | TFLOPs: 26.11 | +7: iteration 30380/ 173500 | consumed samples: 7777280 | consumed tokens: 15927869440 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.795620E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.002 | TFLOPs: 26.13 | +7: iteration 30390/ 173500 | consumed samples: 7779840 | consumed tokens: 15933112320 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.817218E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.812 | TFLOPs: 26.16 | +7: iteration 30400/ 173500 | consumed samples: 7782400 | consumed tokens: 15938355200 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.811664E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.837 | TFLOPs: 26.14 | +7: iteration 30410/ 173500 | consumed samples: 7784960 | consumed tokens: 15943598080 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.815837E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.931 | TFLOPs: 26.13 | +7: iteration 30420/ 173500 | consumed samples: 7787520 | consumed tokens: 15948840960 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.804958E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.240 | TFLOPs: 26.13 | +7: iteration 30430/ 173500 | consumed samples: 7790080 | consumed tokens: 15954083840 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.799273E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.390 | TFLOPs: 26.15 | +7: iteration 30440/ 173500 | consumed samples: 7792640 | consumed tokens: 15959326720 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.805092E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.769 | TFLOPs: 26.17 | +7: iteration 30450/ 173500 | consumed samples: 7795200 | consumed tokens: 15964569600 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.804856E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.830 | TFLOPs: 26.17 | +7: iteration 30460/ 173500 | consumed samples: 7797760 | consumed tokens: 15969812480 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.810608E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.287 | TFLOPs: 26.13 | +7: iteration 30470/ 173500 | consumed samples: 7800320 | consumed tokens: 15975055360 | elapsed time per iteration (s): 0.15 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.798916E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.939 | TFLOPs: 26.16 | +7: iteration 30480/ 173500 | consumed samples: 7802880 | consumed tokens: 15980298240 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.810913E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.204 | TFLOPs: 26.16 | +7: iteration 30490/ 173500 | consumed samples: 7805440 | consumed tokens: 15985541120 | elapsed time per iteration (s): 0.16 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.806344E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.218 | TFLOPs: 25.75 | +7: iteration 30500/ 173500 | consumed samples: 7808000 | consumed tokens: 15990784000 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.803821E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.365 | TFLOPs: 26.21 | +7: iteration 30510/ 173500 | consumed samples: 7810560 | consumed tokens: 15996026880 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.799889E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.356 | TFLOPs: 26.21 | +7: iteration 30520/ 173500 | consumed samples: 7813120 | consumed tokens: 16001269760 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.810846E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.838 | TFLOPs: 25.92 | +7: iteration 30530/ 173500 | consumed samples: 7815680 | consumed tokens: 16006512640 | elapsed time per iteration (s): 0.16 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.811416E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.721 | TFLOPs: 25.64 | +7: iteration 30540/ 173500 | consumed samples: 7818240 | consumed tokens: 16011755520 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.809089E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.036 | TFLOPs: 26.17 | +7: iteration 30550/ 173500 | consumed samples: 7820800 | consumed tokens: 16016998400 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.809435E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.407 | TFLOPs: 26.16 | +7: iteration 30560/ 173500 | consumed samples: 7823360 | consumed tokens: 16022241280 | elapsed time per iteration (s): 0.16 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.806467E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.159 | TFLOPs: 25.88 | +7: iteration 30570/ 173500 | consumed samples: 7825920 | consumed tokens: 16027484160 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.818338E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.981 | TFLOPs: 26.17 | +7: iteration 30580/ 173500 | consumed samples: 7828480 | consumed tokens: 16032727040 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.811723E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.445 | TFLOPs: 26.17 | +7: iteration 30590/ 173500 | consumed samples: 7831040 | consumed tokens: 16037969920 | elapsed time per iteration (s): 0.15 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.805158E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.696 | TFLOPs: 26.17 | +7: iteration 30600/ 173500 | consumed samples: 7833600 | consumed tokens: 16043212800 | elapsed time per iteration (s): 0.16 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.820620E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.424 | TFLOPs: 25.35 | +7: iteration 30610/ 173500 | consumed samples: 7836160 | consumed tokens: 16048455680 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.800626E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.578 | TFLOPs: 26.21 | +7: iteration 30620/ 173500 | consumed samples: 7838720 | consumed tokens: 16053698560 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.809225E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.283 | TFLOPs: 26.19 | +7: iteration 30630/ 173500 | consumed samples: 7841280 | consumed tokens: 16058941440 | elapsed time per iteration (s): 0.16 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.818164E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.788 | TFLOPs: 25.89 | +7: iteration 30640/ 173500 | consumed samples: 7843840 | consumed tokens: 16064184320 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.812718E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.978 | TFLOPs: 26.19 | +7: iteration 30650/ 173500 | consumed samples: 7846400 | consumed tokens: 16069427200 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.807416E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.826 | TFLOPs: 26.20 | +7: iteration 30660/ 173500 | consumed samples: 7848960 | consumed tokens: 16074670080 | elapsed time per iteration (s): 0.16 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.813522E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.066 | TFLOPs: 24.86 | +7: iteration 30670/ 173500 | consumed samples: 7851520 | consumed tokens: 16079912960 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.804633E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.833 | TFLOPs: 26.23 | +7: iteration 30680/ 173500 | consumed samples: 7854080 | consumed tokens: 16085155840 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.790113E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.219 | TFLOPs: 26.21 | +7: iteration 30690/ 173500 | consumed samples: 7856640 | consumed tokens: 16090398720 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.795230E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.233 | TFLOPs: 26.24 | +7: iteration 30700/ 173500 | consumed samples: 7859200 | consumed tokens: 16095641600 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.814464E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.120 | TFLOPs: 26.22 | +7: iteration 30710/ 173500 | consumed samples: 7861760 | consumed tokens: 16100884480 | elapsed time per iteration (s): 0.15 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.802318E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.253 | TFLOPs: 26.21 | +7: iteration 30720/ 173500 | consumed samples: 7864320 | consumed tokens: 16106127360 | elapsed time per iteration (s): 0.16 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.812858E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.297 | TFLOPs: 25.80 | +7: iteration 30730/ 173500 | consumed samples: 7866880 | consumed tokens: 16111370240 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.816661E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.190 | TFLOPs: 26.22 | +7: iteration 30740/ 173500 | consumed samples: 7869440 | consumed tokens: 16116613120 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.804037E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.188 | TFLOPs: 26.21 | +7: iteration 30750/ 173500 | consumed samples: 7872000 | consumed tokens: 16121856000 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.804063E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.805 | TFLOPs: 26.20 | +7: iteration 30760/ 173500 | consumed samples: 7874560 | consumed tokens: 16127098880 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.806763E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.616 | TFLOPs: 26.22 | +7: iteration 30770/ 173500 | consumed samples: 7877120 | consumed tokens: 16132341760 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.812866E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.216 | TFLOPs: 26.24 | +7: iteration 30780/ 173500 | consumed samples: 7879680 | consumed tokens: 16137584640 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.806195E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.180 | TFLOPs: 26.26 | +7: iteration 30790/ 173500 | consumed samples: 7882240 | consumed tokens: 16142827520 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.800778E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.748 | TFLOPs: 26.19 | +7: iteration 30800/ 173500 | consumed samples: 7884800 | consumed tokens: 16148070400 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.785093E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.479 | TFLOPs: 26.18 | +7: iteration 30810/ 173500 | consumed samples: 7887360 | consumed tokens: 16153313280 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.817477E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.963 | TFLOPs: 26.13 | +7: iteration 30820/ 173500 | consumed samples: 7889920 | consumed tokens: 16158556160 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.806327E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.865 | TFLOPs: 26.14 | +7: iteration 30830/ 173500 | consumed samples: 7892480 | consumed tokens: 16163799040 | elapsed time per iteration (s): 0.15 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.805679E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.473 | TFLOPs: 26.18 | +7: iteration 30840/ 173500 | consumed samples: 7895040 | consumed tokens: 16169041920 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.811482E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.714 | TFLOPs: 26.19 | +7: iteration 30850/ 173500 | consumed samples: 7897600 | consumed tokens: 16174284800 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.805541E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.993 | TFLOPs: 26.19 | +7: iteration 30860/ 173500 | consumed samples: 7900160 | consumed tokens: 16179527680 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.795120E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.728 | TFLOPs: 26.19 | +7: iteration 30870/ 173500 | consumed samples: 7902720 | consumed tokens: 16184770560 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.804968E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.964 | TFLOPs: 26.17 | +7: iteration 30880/ 173500 | consumed samples: 7905280 | consumed tokens: 16190013440 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.814954E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.801 | TFLOPs: 26.17 | +7: iteration 30890/ 173500 | consumed samples: 7907840 | consumed tokens: 16195256320 | elapsed time per iteration (s): 0.16 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.810276E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.943 | TFLOPs: 25.78 | +7: iteration 30900/ 173500 | consumed samples: 7910400 | consumed tokens: 16200499200 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.808578E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.952 | TFLOPs: 26.16 | +7: iteration 30910/ 173500 | consumed samples: 7912960 | consumed tokens: 16205742080 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.796331E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.936 | TFLOPs: 26.16 | +7: iteration 30920/ 173500 | consumed samples: 7915520 | consumed tokens: 16210984960 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.798146E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.896 | TFLOPs: 26.16 | +7: iteration 30930/ 173500 | consumed samples: 7918080 | consumed tokens: 16216227840 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.814646E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.136 | TFLOPs: 26.18 | +7: iteration 30940/ 173500 | consumed samples: 7920640 | consumed tokens: 16221470720 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.795570E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.117 | TFLOPs: 26.18 | +7: iteration 30950/ 173500 | consumed samples: 7923200 | consumed tokens: 16226713600 | elapsed time per iteration (s): 0.15 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.814358E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.216 | TFLOPs: 26.19 | +7: iteration 30960/ 173500 | consumed samples: 7925760 | consumed tokens: 16231956480 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.802131E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.457 | TFLOPs: 26.20 | +7: iteration 30970/ 173500 | consumed samples: 7928320 | consumed tokens: 16237199360 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.817063E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.073 | TFLOPs: 26.21 | +7: iteration 30980/ 173500 | consumed samples: 7930880 | consumed tokens: 16242442240 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.797407E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.561 | TFLOPs: 26.18 | +7: iteration 30990/ 173500 | consumed samples: 7933440 | consumed tokens: 16247685120 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.801694E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.033 | TFLOPs: 26.19 | +7: iteration 31000/ 173500 | consumed samples: 7936000 | consumed tokens: 16252928000 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.807866E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.343 | TFLOPs: 26.20 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 31000 | lm loss value: 3.916390E+00 | lm loss PPL: 5.021881E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 31000 to checkpoints_44m91b100m +0: [2023-03-17 01:35:56,864] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step31000 is begin to save! +0: [2023-03-17 01:35:56,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:35:56,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:35:56,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:35:56,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:35:56,937] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:35:56,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:35:56,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:35:56,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:35:56,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:35:56,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:35:56,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:35:56,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:35:56,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:35:56,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:35:56,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:35:56,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:35:56,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:35:56,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:35:56,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:35:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:35:56,994] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step31000/mp_rank_00_model_states.pt +0: [2023-03-17 01:35:56,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:35:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:35:57,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:35:57,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:35:57,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:35:57,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:35:57,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:35:57,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:35:57,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:35:57,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: successfully saved checkpoint at iteration 31000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.00 +7: iteration 31010/ 173500 | consumed samples: 7938560 | consumed tokens: 16258170880 | elapsed time per iteration (s): 0.18 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.803049E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1460.416 | TFLOPs: 22.90 | +7: iteration 31020/ 173500 | consumed samples: 7941120 | consumed tokens: 16263413760 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.809092E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.081 | TFLOPs: 26.21 | +7: iteration 31030/ 173500 | consumed samples: 7943680 | consumed tokens: 16268656640 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.814618E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.997 | TFLOPs: 26.19 | +7: iteration 31040/ 173500 | consumed samples: 7946240 | consumed tokens: 16273899520 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.805338E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.162 | TFLOPs: 26.21 | +7: iteration 31050/ 173500 | consumed samples: 7948800 | consumed tokens: 16279142400 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.804493E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.578 | TFLOPs: 26.21 | +7: iteration 31060/ 173500 | consumed samples: 7951360 | consumed tokens: 16284385280 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.817154E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.896 | TFLOPs: 26.19 | +7: iteration 31070/ 173500 | consumed samples: 7953920 | consumed tokens: 16289628160 | elapsed time per iteration (s): 0.15 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.816252E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.856 | TFLOPs: 26.17 | +7: iteration 31080/ 173500 | consumed samples: 7956480 | consumed tokens: 16294871040 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.799068E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.533 | TFLOPs: 26.12 | +7: iteration 31090/ 173500 | consumed samples: 7959040 | consumed tokens: 16300113920 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.810751E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.719 | TFLOPs: 26.12 | +7: iteration 31100/ 173500 | consumed samples: 7961600 | consumed tokens: 16305356800 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.819012E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.389 | TFLOPs: 26.12 | +7: iteration 31110/ 173500 | consumed samples: 7964160 | consumed tokens: 16310599680 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.796286E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.117 | TFLOPs: 26.16 | +7: iteration 31120/ 173500 | consumed samples: 7966720 | consumed tokens: 16315842560 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.819751E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.952 | TFLOPs: 26.17 | +7: iteration 31130/ 173500 | consumed samples: 7969280 | consumed tokens: 16321085440 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.825757E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.944 | TFLOPs: 26.19 | +7: iteration 31140/ 173500 | consumed samples: 7971840 | consumed tokens: 16326328320 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.810295E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.979 | TFLOPs: 26.16 | +7: iteration 31150/ 173500 | consumed samples: 7974400 | consumed tokens: 16331571200 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.808250E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.275 | TFLOPs: 26.18 | +7: iteration 31160/ 173500 | consumed samples: 7976960 | consumed tokens: 16336814080 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.810810E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.931 | TFLOPs: 26.19 | +7: iteration 31170/ 173500 | consumed samples: 7979520 | consumed tokens: 16342056960 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.802028E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.235 | TFLOPs: 26.18 | +7: iteration 31180/ 173500 | consumed samples: 7982080 | consumed tokens: 16347299840 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.806371E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.352 | TFLOPs: 26.20 | +7: iteration 31190/ 173500 | consumed samples: 7984640 | consumed tokens: 16352542720 | elapsed time per iteration (s): 0.15 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.819835E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.792 | TFLOPs: 26.20 | +7: iteration 31200/ 173500 | consumed samples: 7987200 | consumed tokens: 16357785600 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.817301E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.571 | TFLOPs: 26.18 | +7: iteration 31210/ 173500 | consumed samples: 7989760 | consumed tokens: 16363028480 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.798573E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.486 | TFLOPs: 25.93 | +7: iteration 31220/ 173500 | consumed samples: 7992320 | consumed tokens: 16368271360 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.812606E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.845 | TFLOPs: 26.17 | +7: iteration 31230/ 173500 | consumed samples: 7994880 | consumed tokens: 16373514240 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.812234E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.512 | TFLOPs: 26.18 | +7: iteration 31240/ 173500 | consumed samples: 7997440 | consumed tokens: 16378757120 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.820110E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.434 | TFLOPs: 26.17 | +7: iteration 31250/ 173500 | consumed samples: 8000000 | consumed tokens: 16384000000 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.806273E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.408 | TFLOPs: 26.18 | +7: iteration 31260/ 173500 | consumed samples: 8002560 | consumed tokens: 16389242880 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.810605E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.227 | TFLOPs: 26.19 | +7: iteration 31270/ 173500 | consumed samples: 8005120 | consumed tokens: 16394485760 | elapsed time per iteration (s): 0.16 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.808952E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.663 | TFLOPs: 25.76 | +7: iteration 31280/ 173500 | consumed samples: 8007680 | consumed tokens: 16399728640 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.802269E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.844 | TFLOPs: 26.19 | +7: iteration 31290/ 173500 | consumed samples: 8010240 | consumed tokens: 16404971520 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.795966E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.882 | TFLOPs: 26.19 | +7: iteration 31300/ 173500 | consumed samples: 8012800 | consumed tokens: 16410214400 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.798073E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.699 | TFLOPs: 26.19 | +7: iteration 31310/ 173500 | consumed samples: 8015360 | consumed tokens: 16415457280 | elapsed time per iteration (s): 0.15 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.796315E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.434 | TFLOPs: 26.18 | +7: iteration 31320/ 173500 | consumed samples: 8017920 | consumed tokens: 16420700160 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.818153E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.403 | TFLOPs: 26.18 | +7: iteration 31330/ 173500 | consumed samples: 8020480 | consumed tokens: 16425943040 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.796303E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.212 | TFLOPs: 26.19 | +7: iteration 31340/ 173500 | consumed samples: 8023040 | consumed tokens: 16431185920 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.804321E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.943 | TFLOPs: 26.17 | +7: iteration 31350/ 173500 | consumed samples: 8025600 | consumed tokens: 16436428800 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.815419E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.942 | TFLOPs: 26.17 | +7: iteration 31360/ 173500 | consumed samples: 8028160 | consumed tokens: 16441671680 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.796149E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.050 | TFLOPs: 26.19 | +7: iteration 31370/ 173500 | consumed samples: 8030720 | consumed tokens: 16446914560 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.808776E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.637 | TFLOPs: 26.18 | +7: iteration 31380/ 173500 | consumed samples: 8033280 | consumed tokens: 16452157440 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.814912E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.028 | TFLOPs: 26.17 | +7: iteration 31390/ 173500 | consumed samples: 8035840 | consumed tokens: 16457400320 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.822126E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.056 | TFLOPs: 26.19 | +7: iteration 31400/ 173500 | consumed samples: 8038400 | consumed tokens: 16462643200 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.793018E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.551 | TFLOPs: 26.18 | +7: iteration 31410/ 173500 | consumed samples: 8040960 | consumed tokens: 16467886080 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.813440E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.441 | TFLOPs: 26.18 | +7: iteration 31420/ 173500 | consumed samples: 8043520 | consumed tokens: 16473128960 | elapsed time per iteration (s): 0.15 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.804379E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.351 | TFLOPs: 26.18 | +7: iteration 31430/ 173500 | consumed samples: 8046080 | consumed tokens: 16478371840 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.805027E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.126 | TFLOPs: 26.19 | +7: iteration 31440/ 173500 | consumed samples: 8048640 | consumed tokens: 16483614720 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.822892E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.500 | TFLOPs: 26.17 | +7: iteration 31450/ 173500 | consumed samples: 8051200 | consumed tokens: 16488857600 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.807011E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.050 | TFLOPs: 26.16 | +7: iteration 31460/ 173500 | consumed samples: 8053760 | consumed tokens: 16494100480 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.799114E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.702 | TFLOPs: 26.17 | +7: iteration 31470/ 173500 | consumed samples: 8056320 | consumed tokens: 16499343360 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.812791E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.047 | TFLOPs: 26.16 | +7: iteration 31480/ 173500 | consumed samples: 8058880 | consumed tokens: 16504586240 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.799820E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.144 | TFLOPs: 26.18 | +7: iteration 31490/ 173500 | consumed samples: 8061440 | consumed tokens: 16509829120 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.798637E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.438 | TFLOPs: 26.18 | +7: iteration 31500/ 173500 | consumed samples: 8064000 | consumed tokens: 16515072000 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.803088E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.037 | TFLOPs: 25.94 | +7: iteration 31510/ 173500 | consumed samples: 8066560 | consumed tokens: 16520314880 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.807676E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.008 | TFLOPs: 26.17 | +7: iteration 31520/ 173500 | consumed samples: 8069120 | consumed tokens: 16525557760 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.802496E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.815 | TFLOPs: 26.17 | +7: iteration 31530/ 173500 | consumed samples: 8071680 | consumed tokens: 16530800640 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.805721E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.374 | TFLOPs: 26.16 | +7: iteration 31540/ 173500 | consumed samples: 8074240 | consumed tokens: 16536043520 | elapsed time per iteration (s): 0.15 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.804886E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.799 | TFLOPs: 26.17 | +7: iteration 31550/ 173500 | consumed samples: 8076800 | consumed tokens: 16541286400 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.817191E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.444 | TFLOPs: 26.18 | +7: iteration 31560/ 173500 | consumed samples: 8079360 | consumed tokens: 16546529280 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.801280E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.584 | TFLOPs: 26.18 | +7: iteration 31570/ 173500 | consumed samples: 8081920 | consumed tokens: 16551772160 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.814303E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.847 | TFLOPs: 26.17 | +7: iteration 31580/ 173500 | consumed samples: 8084480 | consumed tokens: 16557015040 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.811320E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.889 | TFLOPs: 26.17 | +7: iteration 31590/ 173500 | consumed samples: 8087040 | consumed tokens: 16562257920 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.796075E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.073 | TFLOPs: 26.16 | +7: iteration 31600/ 173500 | consumed samples: 8089600 | consumed tokens: 16567500800 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.790039E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.382 | TFLOPs: 26.18 | +7: iteration 31610/ 173500 | consumed samples: 8092160 | consumed tokens: 16572743680 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.812659E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.385 | TFLOPs: 26.16 | +7: iteration 31620/ 173500 | consumed samples: 8094720 | consumed tokens: 16577986560 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.803072E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.413 | TFLOPs: 26.16 | +7: iteration 31630/ 173500 | consumed samples: 8097280 | consumed tokens: 16583229440 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.806320E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.720 | TFLOPs: 26.17 | +7: iteration 31640/ 173500 | consumed samples: 8099840 | consumed tokens: 16588472320 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.808486E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.883 | TFLOPs: 26.17 | +7: iteration 31650/ 173500 | consumed samples: 8102400 | consumed tokens: 16593715200 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.797512E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.069 | TFLOPs: 26.18 | +7: iteration 31660/ 173500 | consumed samples: 8104960 | consumed tokens: 16598958080 | elapsed time per iteration (s): 0.15 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.806889E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.118 | TFLOPs: 26.18 | +7: iteration 31670/ 173500 | consumed samples: 8107520 | consumed tokens: 16604200960 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.798904E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.357 | TFLOPs: 26.18 | +7: iteration 31680/ 173500 | consumed samples: 8110080 | consumed tokens: 16609443840 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.823341E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.482 | TFLOPs: 26.18 | +7: iteration 31690/ 173500 | consumed samples: 8112640 | consumed tokens: 16614686720 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.800132E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.294 | TFLOPs: 26.19 | +7: iteration 31700/ 173500 | consumed samples: 8115200 | consumed tokens: 16619929600 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.800655E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.004 | TFLOPs: 26.19 | +7: iteration 31710/ 173500 | consumed samples: 8117760 | consumed tokens: 16625172480 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.799585E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.951 | TFLOPs: 26.11 | +7: iteration 31720/ 173500 | consumed samples: 8120320 | consumed tokens: 16630415360 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.810225E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.648 | TFLOPs: 26.14 | +7: iteration 31730/ 173500 | consumed samples: 8122880 | consumed tokens: 16635658240 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.807047E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.353 | TFLOPs: 26.12 | +7: iteration 31740/ 173500 | consumed samples: 8125440 | consumed tokens: 16640901120 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.806734E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.067 | TFLOPs: 26.16 | +7: iteration 31750/ 173500 | consumed samples: 8128000 | consumed tokens: 16646144000 | elapsed time per iteration (s): 0.15 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.806275E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.505 | TFLOPs: 26.23 | +7: iteration 31760/ 173500 | consumed samples: 8130560 | consumed tokens: 16651386880 | elapsed time per iteration (s): 0.16 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.807966E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.917 | TFLOPs: 25.37 | +7: iteration 31770/ 173500 | consumed samples: 8133120 | consumed tokens: 16656629760 | elapsed time per iteration (s): 0.16 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.795436E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.153 | TFLOPs: 25.82 | +7: iteration 31780/ 173500 | consumed samples: 8135680 | consumed tokens: 16661872640 | elapsed time per iteration (s): 0.16 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.814519E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.533 | TFLOPs: 25.41 | +7: iteration 31790/ 173500 | consumed samples: 8138240 | consumed tokens: 16667115520 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.810144E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.468 | TFLOPs: 26.13 | +7: iteration 31800/ 173500 | consumed samples: 8140800 | consumed tokens: 16672358400 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.806750E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.061 | TFLOPs: 25.92 | +7: iteration 31810/ 173500 | consumed samples: 8143360 | consumed tokens: 16677601280 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.816150E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.316 | TFLOPs: 26.16 | +7: iteration 31820/ 173500 | consumed samples: 8145920 | consumed tokens: 16682844160 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.805573E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.540 | TFLOPs: 26.23 | +7: iteration 31830/ 173500 | consumed samples: 8148480 | consumed tokens: 16688087040 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.813940E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.736 | TFLOPs: 26.22 | +7: iteration 31840/ 173500 | consumed samples: 8151040 | consumed tokens: 16693329920 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.803021E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.207 | TFLOPs: 26.18 | +7: iteration 31850/ 173500 | consumed samples: 8153600 | consumed tokens: 16698572800 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.804286E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.289 | TFLOPs: 26.13 | +7: iteration 31860/ 173500 | consumed samples: 8156160 | consumed tokens: 16703815680 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.814120E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.432 | TFLOPs: 26.15 | +7: iteration 31870/ 173500 | consumed samples: 8158720 | consumed tokens: 16709058560 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.811538E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.749 | TFLOPs: 26.15 | +7: iteration 31880/ 173500 | consumed samples: 8161280 | consumed tokens: 16714301440 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.794070E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.151 | TFLOPs: 26.16 | +7: iteration 31890/ 173500 | consumed samples: 8163840 | consumed tokens: 16719544320 | elapsed time per iteration (s): 0.15 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.804108E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.190 | TFLOPs: 26.19 | +7: iteration 31900/ 173500 | consumed samples: 8166400 | consumed tokens: 16724787200 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.820070E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.457 | TFLOPs: 26.20 | +7: iteration 31910/ 173500 | consumed samples: 8168960 | consumed tokens: 16730030080 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.805334E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.720 | TFLOPs: 26.20 | +7: iteration 31920/ 173500 | consumed samples: 8171520 | consumed tokens: 16735272960 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.806276E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.342 | TFLOPs: 26.21 | +7: iteration 31930/ 173500 | consumed samples: 8174080 | consumed tokens: 16740515840 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.792359E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.689 | TFLOPs: 26.20 | +7: iteration 31940/ 173500 | consumed samples: 8176640 | consumed tokens: 16745758720 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.831576E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.778 | TFLOPs: 26.17 | +7: iteration 31950/ 173500 | consumed samples: 8179200 | consumed tokens: 16751001600 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.801090E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.393 | TFLOPs: 26.20 | +7: iteration 31960/ 173500 | consumed samples: 8181760 | consumed tokens: 16756244480 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.793432E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.973 | TFLOPs: 26.19 | +7: iteration 31970/ 173500 | consumed samples: 8184320 | consumed tokens: 16761487360 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.792726E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.902 | TFLOPs: 26.16 | +7: iteration 31980/ 173500 | consumed samples: 8186880 | consumed tokens: 16766730240 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.808255E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.285 | TFLOPs: 26.15 | +7: iteration 31990/ 173500 | consumed samples: 8189440 | consumed tokens: 16771973120 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.801960E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.042 | TFLOPs: 26.16 | +0: [2023-03-17 01:38:30,592] [INFO] [logging.py:68:log_dist] [Rank 0] step=32000, skipped=0, lr=[0.00018655987222005428, 0.00018655987222005428, 0.00018655987222005428], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 32000/ 173500 | consumed samples: 8192000 | consumed tokens: 16777216000 | elapsed time per iteration (s): 0.15 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.799194E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.444 | TFLOPs: 26.18 | +0: steps: 32000 loss: 3.8376 iter time (s): 0.153 samples/sec: 1676.259 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 32000 | lm loss value: 3.906591E+00 | lm loss PPL: 4.972916E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 32000 to checkpoints_44m91b100m +0: [2023-03-17 01:38:30,666] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step32000 is begin to save! +0: [2023-03-17 01:38:30,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:38:30,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:38:30,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:38:30,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:38:30,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:38:30,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:38:30,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:38:30,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:38:30,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:38:30,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:38:30,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:38:30,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:38:30,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:38:30,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:38:30,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:38:30,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:38:30,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:38:30,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:38:30,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:38:30,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:38:30,799] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step32000/mp_rank_00_model_states.pt +0: [2023-03-17 01:38:30,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:38:30,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:38:30,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:38:30,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:38:30,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: successfully saved checkpoint at iteration 32000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.25 +7: iteration 32010/ 173500 | consumed samples: 8194560 | consumed tokens: 16782458880 | elapsed time per iteration (s): 0.18 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.815340E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.542 | TFLOPs: 22.59 | +7: iteration 32020/ 173500 | consumed samples: 8197120 | consumed tokens: 16787701760 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.802569E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.122 | TFLOPs: 26.22 | +7: iteration 32030/ 173500 | consumed samples: 8199680 | consumed tokens: 16792944640 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.813778E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.169 | TFLOPs: 26.21 | +7: iteration 32040/ 173500 | consumed samples: 8202240 | consumed tokens: 16798187520 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.811568E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.350 | TFLOPs: 26.23 | +7: iteration 32050/ 173500 | consumed samples: 8204800 | consumed tokens: 16803430400 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.811703E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.331 | TFLOPs: 26.23 | +7: iteration 32060/ 173500 | consumed samples: 8207360 | consumed tokens: 16808673280 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.807797E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.665 | TFLOPs: 26.23 | +7: iteration 32070/ 173500 | consumed samples: 8209920 | consumed tokens: 16813916160 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.806326E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.751 | TFLOPs: 26.20 | +7: iteration 32080/ 173500 | consumed samples: 8212480 | consumed tokens: 16819159040 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.800631E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.136 | TFLOPs: 26.19 | +7: iteration 32090/ 173500 | consumed samples: 8215040 | consumed tokens: 16824401920 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.806891E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.915 | TFLOPs: 26.17 | +7: iteration 32100/ 173500 | consumed samples: 8217600 | consumed tokens: 16829644800 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.799501E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.976 | TFLOPs: 26.17 | +7: iteration 32110/ 173500 | consumed samples: 8220160 | consumed tokens: 16834887680 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.806875E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.498 | TFLOPs: 26.18 | +7: iteration 32120/ 173500 | consumed samples: 8222720 | consumed tokens: 16840130560 | elapsed time per iteration (s): 0.15 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.803939E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.316 | TFLOPs: 26.18 | +7: iteration 32130/ 173500 | consumed samples: 8225280 | consumed tokens: 16845373440 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.784029E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.226 | TFLOPs: 26.19 | +7: iteration 32140/ 173500 | consumed samples: 8227840 | consumed tokens: 16850616320 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.807623E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.195 | TFLOPs: 26.18 | +7: iteration 32150/ 173500 | consumed samples: 8230400 | consumed tokens: 16855859200 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.804914E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.722 | TFLOPs: 26.20 | +7: iteration 32160/ 173500 | consumed samples: 8232960 | consumed tokens: 16861102080 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.792759E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.796 | TFLOPs: 26.20 | +7: iteration 32170/ 173500 | consumed samples: 8235520 | consumed tokens: 16866344960 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.812217E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.474 | TFLOPs: 26.18 | +7: iteration 32180/ 173500 | consumed samples: 8238080 | consumed tokens: 16871587840 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.813880E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.896 | TFLOPs: 26.19 | +7: iteration 32190/ 173500 | consumed samples: 8240640 | consumed tokens: 16876830720 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.816999E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.946 | TFLOPs: 26.19 | +7: iteration 32200/ 173500 | consumed samples: 8243200 | consumed tokens: 16882073600 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.819061E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.690 | TFLOPs: 26.18 | +7: iteration 32210/ 173500 | consumed samples: 8245760 | consumed tokens: 16887316480 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.812117E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.451 | TFLOPs: 25.91 | +7: iteration 32220/ 173500 | consumed samples: 8248320 | consumed tokens: 16892559360 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.809723E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.582 | TFLOPs: 26.18 | +7: iteration 32230/ 173500 | consumed samples: 8250880 | consumed tokens: 16897802240 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.812278E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.270 | TFLOPs: 26.19 | +7: iteration 32240/ 173500 | consumed samples: 8253440 | consumed tokens: 16903045120 | elapsed time per iteration (s): 0.15 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.791626E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.911 | TFLOPs: 26.14 | +7: iteration 32250/ 173500 | consumed samples: 8256000 | consumed tokens: 16908288000 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.811660E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.150 | TFLOPs: 26.19 | +7: iteration 32260/ 173500 | consumed samples: 8258560 | consumed tokens: 16913530880 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.805253E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.331 | TFLOPs: 26.18 | +7: iteration 32270/ 173500 | consumed samples: 8261120 | consumed tokens: 16918773760 | elapsed time per iteration (s): 0.16 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.796903E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.039 | TFLOPs: 25.67 | +7: iteration 32280/ 173500 | consumed samples: 8263680 | consumed tokens: 16924016640 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.812251E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.415 | TFLOPs: 26.20 | +7: iteration 32290/ 173500 | consumed samples: 8266240 | consumed tokens: 16929259520 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.803599E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.912 | TFLOPs: 26.19 | +7: iteration 32300/ 173500 | consumed samples: 8268800 | consumed tokens: 16934502400 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.805222E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.645 | TFLOPs: 26.18 | +7: iteration 32310/ 173500 | consumed samples: 8271360 | consumed tokens: 16939745280 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.805479E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.823 | TFLOPs: 26.19 | +7: iteration 32320/ 173500 | consumed samples: 8273920 | consumed tokens: 16944988160 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.807493E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.708 | TFLOPs: 26.15 | +7: iteration 32330/ 173500 | consumed samples: 8276480 | consumed tokens: 16950231040 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.790174E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.238 | TFLOPs: 26.16 | +7: iteration 32340/ 173500 | consumed samples: 8279040 | consumed tokens: 16955473920 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.804372E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.769 | TFLOPs: 26.19 | +7: iteration 32350/ 173500 | consumed samples: 8281600 | consumed tokens: 16960716800 | elapsed time per iteration (s): 0.15 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.812162E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.197 | TFLOPs: 26.18 | +7: iteration 32360/ 173500 | consumed samples: 8284160 | consumed tokens: 16965959680 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.801170E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.960 | TFLOPs: 26.19 | +7: iteration 32370/ 173500 | consumed samples: 8286720 | consumed tokens: 16971202560 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.792980E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.650 | TFLOPs: 26.20 | +7: iteration 32380/ 173500 | consumed samples: 8289280 | consumed tokens: 16976445440 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.801608E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.111 | TFLOPs: 26.19 | +7: iteration 32390/ 173500 | consumed samples: 8291840 | consumed tokens: 16981688320 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.804591E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.073 | TFLOPs: 26.19 | +7: iteration 32400/ 173500 | consumed samples: 8294400 | consumed tokens: 16986931200 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.800610E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.204 | TFLOPs: 26.21 | +7: iteration 32410/ 173500 | consumed samples: 8296960 | consumed tokens: 16992174080 | elapsed time per iteration (s): 0.16 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.806345E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.288 | TFLOPs: 25.90 | +7: iteration 32420/ 173500 | consumed samples: 8299520 | consumed tokens: 16997416960 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.799213E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.538 | TFLOPs: 26.21 | +7: iteration 32430/ 173500 | consumed samples: 8302080 | consumed tokens: 17002659840 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.811785E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.656 | TFLOPs: 26.22 | +7: iteration 32440/ 173500 | consumed samples: 8304640 | consumed tokens: 17007902720 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.800556E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.215 | TFLOPs: 25.99 | +7: iteration 32450/ 173500 | consumed samples: 8307200 | consumed tokens: 17013145600 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.810622E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.747 | TFLOPs: 26.17 | +7: iteration 32460/ 173500 | consumed samples: 8309760 | consumed tokens: 17018388480 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.795104E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.325 | TFLOPs: 26.19 | +7: iteration 32470/ 173500 | consumed samples: 8312320 | consumed tokens: 17023631360 | elapsed time per iteration (s): 0.15 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.795699E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.380 | TFLOPs: 26.16 | +7: iteration 32480/ 173500 | consumed samples: 8314880 | consumed tokens: 17028874240 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.811930E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.067 | TFLOPs: 26.18 | +7: iteration 32490/ 173500 | consumed samples: 8317440 | consumed tokens: 17034117120 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.796539E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.430 | TFLOPs: 26.17 | +7: iteration 32500/ 173500 | consumed samples: 8320000 | consumed tokens: 17039360000 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.800442E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.726 | TFLOPs: 26.17 | +7: iteration 32510/ 173500 | consumed samples: 8322560 | consumed tokens: 17044602880 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.789124E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.195 | TFLOPs: 26.18 | +7: iteration 32520/ 173500 | consumed samples: 8325120 | consumed tokens: 17049845760 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.803041E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.838 | TFLOPs: 26.16 | +7: iteration 32530/ 173500 | consumed samples: 8327680 | consumed tokens: 17055088640 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.800732E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.254 | TFLOPs: 26.15 | +7: iteration 32540/ 173500 | consumed samples: 8330240 | consumed tokens: 17060331520 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.797359E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.584 | TFLOPs: 26.17 | +7: iteration 32550/ 173500 | consumed samples: 8332800 | consumed tokens: 17065574400 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.793911E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.513 | TFLOPs: 26.20 | +7: iteration 32560/ 173500 | consumed samples: 8335360 | consumed tokens: 17070817280 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.800819E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.042 | TFLOPs: 26.21 | +7: iteration 32570/ 173500 | consumed samples: 8337920 | consumed tokens: 17076060160 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.815387E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.353 | TFLOPs: 26.21 | +7: iteration 32580/ 173500 | consumed samples: 8340480 | consumed tokens: 17081303040 | elapsed time per iteration (s): 0.15 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.804693E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.906 | TFLOPs: 26.11 | +7: iteration 32590/ 173500 | consumed samples: 8343040 | consumed tokens: 17086545920 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.806301E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.293 | TFLOPs: 26.08 | +7: iteration 32600/ 173500 | consumed samples: 8345600 | consumed tokens: 17091788800 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.805930E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.343 | TFLOPs: 26.09 | +7: iteration 32610/ 173500 | consumed samples: 8348160 | consumed tokens: 17097031680 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.793129E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.845 | TFLOPs: 26.19 | +7: iteration 32620/ 173500 | consumed samples: 8350720 | consumed tokens: 17102274560 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.802121E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.861 | TFLOPs: 26.19 | +7: iteration 32630/ 173500 | consumed samples: 8353280 | consumed tokens: 17107517440 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.805346E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.364 | TFLOPs: 26.16 | +7: iteration 32640/ 173500 | consumed samples: 8355840 | consumed tokens: 17112760320 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.807729E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.333 | TFLOPs: 26.18 | +7: iteration 32650/ 173500 | consumed samples: 8358400 | consumed tokens: 17118003200 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.791515E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.510 | TFLOPs: 26.14 | +7: iteration 32660/ 173500 | consumed samples: 8360960 | consumed tokens: 17123246080 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.793501E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.740 | TFLOPs: 26.12 | +7: iteration 32670/ 173500 | consumed samples: 8363520 | consumed tokens: 17128488960 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.808677E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.198 | TFLOPs: 26.21 | +7: iteration 32680/ 173500 | consumed samples: 8366080 | consumed tokens: 17133731840 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.793177E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.833 | TFLOPs: 26.19 | +7: iteration 32690/ 173500 | consumed samples: 8368640 | consumed tokens: 17138974720 | elapsed time per iteration (s): 0.15 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.803574E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.324 | TFLOPs: 26.21 | +7: iteration 32700/ 173500 | consumed samples: 8371200 | consumed tokens: 17144217600 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.803602E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.623 | TFLOPs: 26.17 | +7: iteration 32710/ 173500 | consumed samples: 8373760 | consumed tokens: 17149460480 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.794802E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.440 | TFLOPs: 26.20 | +7: iteration 32720/ 173500 | consumed samples: 8376320 | consumed tokens: 17154703360 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.791390E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.315 | TFLOPs: 26.19 | +7: iteration 32730/ 173500 | consumed samples: 8378880 | consumed tokens: 17159946240 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.788696E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.762 | TFLOPs: 26.20 | +7: iteration 32740/ 173500 | consumed samples: 8381440 | consumed tokens: 17165189120 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.791112E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.982 | TFLOPs: 26.21 | +7: iteration 32750/ 173500 | consumed samples: 8384000 | consumed tokens: 17170432000 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.789724E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.779 | TFLOPs: 26.26 | +7: iteration 32760/ 173500 | consumed samples: 8386560 | consumed tokens: 17175674880 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.795613E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.039 | TFLOPs: 26.24 | +7: iteration 32770/ 173500 | consumed samples: 8389120 | consumed tokens: 17180917760 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.789157E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.801 | TFLOPs: 26.22 | +7: iteration 32780/ 173500 | consumed samples: 8391680 | consumed tokens: 17186160640 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.800673E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.285 | TFLOPs: 26.21 | +7: iteration 32790/ 173500 | consumed samples: 8394240 | consumed tokens: 17191403520 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.787087E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.431 | TFLOPs: 26.23 | +7: iteration 32800/ 173500 | consumed samples: 8396800 | consumed tokens: 17196646400 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.812071E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.279 | TFLOPs: 26.24 | +7: iteration 32810/ 173500 | consumed samples: 8399360 | consumed tokens: 17201889280 | elapsed time per iteration (s): 0.15 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.794032E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.945 | TFLOPs: 26.24 | +7: iteration 32820/ 173500 | consumed samples: 8401920 | consumed tokens: 17207132160 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.792778E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.844 | TFLOPs: 26.23 | +7: iteration 32830/ 173500 | consumed samples: 8404480 | consumed tokens: 17212375040 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.785442E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.501 | TFLOPs: 26.24 | +7: iteration 32840/ 173500 | consumed samples: 8407040 | consumed tokens: 17217617920 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.793312E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.674 | TFLOPs: 26.23 | +7: iteration 32850/ 173500 | consumed samples: 8409600 | consumed tokens: 17222860800 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.803157E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.034 | TFLOPs: 26.24 | +7: iteration 32860/ 173500 | consumed samples: 8412160 | consumed tokens: 17228103680 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.800584E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.673 | TFLOPs: 26.22 | +7: iteration 32870/ 173500 | consumed samples: 8414720 | consumed tokens: 17233346560 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.813803E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.141 | TFLOPs: 26.22 | +7: iteration 32880/ 173500 | consumed samples: 8417280 | consumed tokens: 17238589440 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.801254E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.081 | TFLOPs: 26.22 | +7: iteration 32890/ 173500 | consumed samples: 8419840 | consumed tokens: 17243832320 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.801478E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.666 | TFLOPs: 26.22 | +7: iteration 32900/ 173500 | consumed samples: 8422400 | consumed tokens: 17249075200 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.806151E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.288 | TFLOPs: 26.21 | +7: iteration 32910/ 173500 | consumed samples: 8424960 | consumed tokens: 17254318080 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.793635E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.486 | TFLOPs: 26.21 | +7: iteration 32920/ 173500 | consumed samples: 8427520 | consumed tokens: 17259560960 | elapsed time per iteration (s): 0.15 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.816655E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.296 | TFLOPs: 26.21 | +7: iteration 32930/ 173500 | consumed samples: 8430080 | consumed tokens: 17264803840 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.803865E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.300 | TFLOPs: 26.21 | +7: iteration 32940/ 173500 | consumed samples: 8432640 | consumed tokens: 17270046720 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.805958E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 32950/ 173500 | consumed samples: 8435200 | consumed tokens: 17275289600 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.808927E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.886 | TFLOPs: 26.24 | +7: iteration 32960/ 173500 | consumed samples: 8437760 | consumed tokens: 17280532480 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.803532E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.469 | TFLOPs: 26.23 | +7: iteration 32970/ 173500 | consumed samples: 8440320 | consumed tokens: 17285775360 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.807310E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.900 | TFLOPs: 26.24 | +7: iteration 32980/ 173500 | consumed samples: 8442880 | consumed tokens: 17291018240 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.792919E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.564 | TFLOPs: 26.25 | +7: iteration 32990/ 173500 | consumed samples: 8445440 | consumed tokens: 17296261120 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.803426E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.660 | TFLOPs: 26.25 | +7: iteration 33000/ 173500 | consumed samples: 8448000 | consumed tokens: 17301504000 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.796079E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.919 | TFLOPs: 26.24 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 33000 | lm loss value: 3.909961E+00 | lm loss PPL: 4.989702E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 33000 to checkpoints_44m91b100m +0: [2023-03-17 01:41:04,232] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step33000 is begin to save! +0: [2023-03-17 01:41:04,235] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:41:04,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:41:04,293] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:41:04,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:41:04,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:41:04,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:41:04,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:41:04,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:41:04,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:41:04,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:41:04,329] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:41:04,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:41:04,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:41:04,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:41:04,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:41:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:41:04,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:41:04,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:41:04,361] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:41:04,362] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:41:04,363] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step33000/mp_rank_00_model_states.pt +0: [2023-03-17 01:41:04,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:41:04,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:41:04,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-17 01:41:04,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:41:04,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-17 01:41:04,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:41:04,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 01:41:04,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-17 01:41:04,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:41:04,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:41:04,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-17 01:41:04,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:41:04,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:41:04,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: successfully saved checkpoint at iteration 33000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.32 +7: iteration 33010/ 173500 | consumed samples: 8450560 | consumed tokens: 17306746880 | elapsed time per iteration (s): 0.18 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.797965E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.454 | TFLOPs: 22.86 | +7: iteration 33020/ 173500 | consumed samples: 8453120 | consumed tokens: 17311989760 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.805421E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.354 | TFLOPs: 26.16 | +7: iteration 33030/ 173500 | consumed samples: 8455680 | consumed tokens: 17317232640 | elapsed time per iteration (s): 0.15 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.797911E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.820 | TFLOPs: 26.14 | +7: iteration 33040/ 173500 | consumed samples: 8458240 | consumed tokens: 17322475520 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.803065E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.956 | TFLOPs: 26.14 | +7: iteration 33050/ 173500 | consumed samples: 8460800 | consumed tokens: 17327718400 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.810081E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.218 | TFLOPs: 26.15 | +7: iteration 33060/ 173500 | consumed samples: 8463360 | consumed tokens: 17332961280 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.793832E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.023 | TFLOPs: 26.11 | +7: iteration 33070/ 173500 | consumed samples: 8465920 | consumed tokens: 17338204160 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.789560E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.934 | TFLOPs: 26.03 | +7: iteration 33080/ 173500 | consumed samples: 8468480 | consumed tokens: 17343447040 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.796445E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.080 | TFLOPs: 26.18 | +7: iteration 33090/ 173500 | consumed samples: 8471040 | consumed tokens: 17348689920 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.807777E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.797 | TFLOPs: 26.17 | +7: iteration 33100/ 173500 | consumed samples: 8473600 | consumed tokens: 17353932800 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.796175E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.205 | TFLOPs: 26.19 | +7: iteration 33110/ 173500 | consumed samples: 8476160 | consumed tokens: 17359175680 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.801899E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.429 | TFLOPs: 26.20 | +7: iteration 33120/ 173500 | consumed samples: 8478720 | consumed tokens: 17364418560 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.800946E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.844 | TFLOPs: 26.16 | +7: iteration 33130/ 173500 | consumed samples: 8481280 | consumed tokens: 17369661440 | elapsed time per iteration (s): 0.16 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.810791E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.551 | TFLOPs: 25.77 | +7: iteration 33140/ 173500 | consumed samples: 8483840 | consumed tokens: 17374904320 | elapsed time per iteration (s): 0.15 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.802728E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.628 | TFLOPs: 26.07 | +7: iteration 33150/ 173500 | consumed samples: 8486400 | consumed tokens: 17380147200 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.800729E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.576 | TFLOPs: 26.10 | +7: iteration 33160/ 173500 | consumed samples: 8488960 | consumed tokens: 17385390080 | elapsed time per iteration (s): 0.16 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.796910E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.232 | TFLOPs: 25.83 | +7: iteration 33170/ 173500 | consumed samples: 8491520 | consumed tokens: 17390632960 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.801005E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.045 | TFLOPs: 26.22 | +7: iteration 33180/ 173500 | consumed samples: 8494080 | consumed tokens: 17395875840 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.795472E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.301 | TFLOPs: 26.18 | +7: iteration 33190/ 173500 | consumed samples: 8496640 | consumed tokens: 17401118720 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.794889E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.877 | TFLOPs: 26.14 | +7: iteration 33200/ 173500 | consumed samples: 8499200 | consumed tokens: 17406361600 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.802851E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.711 | TFLOPs: 26.08 | +7: iteration 33210/ 173500 | consumed samples: 8501760 | consumed tokens: 17411604480 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.787861E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.380 | TFLOPs: 26.05 | +7: iteration 33220/ 173500 | consumed samples: 8504320 | consumed tokens: 17416847360 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.801277E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.560 | TFLOPs: 26.15 | +7: iteration 33230/ 173500 | consumed samples: 8506880 | consumed tokens: 17422090240 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.805633E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.181 | TFLOPs: 26.16 | +7: iteration 33240/ 173500 | consumed samples: 8509440 | consumed tokens: 17427333120 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.795735E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.699 | TFLOPs: 26.20 | +7: iteration 33250/ 173500 | consumed samples: 8512000 | consumed tokens: 17432576000 | elapsed time per iteration (s): 0.15 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.803162E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.188 | TFLOPs: 26.21 | +7: iteration 33260/ 173500 | consumed samples: 8514560 | consumed tokens: 17437818880 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.816170E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.691 | TFLOPs: 26.22 | +7: iteration 33270/ 173500 | consumed samples: 8517120 | consumed tokens: 17443061760 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.803165E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.428 | TFLOPs: 26.21 | +7: iteration 33280/ 173500 | consumed samples: 8519680 | consumed tokens: 17448304640 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.790784E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.659 | TFLOPs: 26.22 | +7: iteration 33290/ 173500 | consumed samples: 8522240 | consumed tokens: 17453547520 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.804066E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.818 | TFLOPs: 26.16 | +7: iteration 33300/ 173500 | consumed samples: 8524800 | consumed tokens: 17458790400 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.813585E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.225 | TFLOPs: 26.13 | +7: iteration 33310/ 173500 | consumed samples: 8527360 | consumed tokens: 17464033280 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.806398E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.194 | TFLOPs: 26.16 | +7: iteration 33320/ 173500 | consumed samples: 8529920 | consumed tokens: 17469276160 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.791361E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.442 | TFLOPs: 26.15 | +7: iteration 33330/ 173500 | consumed samples: 8532480 | consumed tokens: 17474519040 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.798729E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.674 | TFLOPs: 26.18 | +7: iteration 33340/ 173500 | consumed samples: 8535040 | consumed tokens: 17479761920 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.805843E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.145 | TFLOPs: 26.21 | +7: iteration 33350/ 173500 | consumed samples: 8537600 | consumed tokens: 17485004800 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.803863E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.670 | TFLOPs: 26.22 | +7: iteration 33360/ 173500 | consumed samples: 8540160 | consumed tokens: 17490247680 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.806349E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.245 | TFLOPs: 26.19 | +7: iteration 33370/ 173500 | consumed samples: 8542720 | consumed tokens: 17495490560 | elapsed time per iteration (s): 0.15 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.779093E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.955 | TFLOPs: 26.19 | +7: iteration 33380/ 173500 | consumed samples: 8545280 | consumed tokens: 17500733440 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.790654E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.992 | TFLOPs: 26.19 | +7: iteration 33390/ 173500 | consumed samples: 8547840 | consumed tokens: 17505976320 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.803141E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.746 | TFLOPs: 26.19 | +7: iteration 33400/ 173500 | consumed samples: 8550400 | consumed tokens: 17511219200 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.797125E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.708 | TFLOPs: 26.22 | +7: iteration 33410/ 173500 | consumed samples: 8552960 | consumed tokens: 17516462080 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.797424E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.974 | TFLOPs: 26.21 | +7: iteration 33420/ 173500 | consumed samples: 8555520 | consumed tokens: 17521704960 | elapsed time per iteration (s): 0.16 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.810736E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.053 | TFLOPs: 25.55 | +7: iteration 33430/ 173500 | consumed samples: 8558080 | consumed tokens: 17526947840 | elapsed time per iteration (s): 0.16 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.800136E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.263 | TFLOPs: 25.58 | +7: iteration 33440/ 173500 | consumed samples: 8560640 | consumed tokens: 17532190720 | elapsed time per iteration (s): 0.16 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.793138E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.744 | TFLOPs: 25.84 | +7: iteration 33450/ 173500 | consumed samples: 8563200 | consumed tokens: 17537433600 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.789374E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.123 | TFLOPs: 26.21 | +7: iteration 33460/ 173500 | consumed samples: 8565760 | consumed tokens: 17542676480 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.787140E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.454 | TFLOPs: 26.20 | +7: iteration 33470/ 173500 | consumed samples: 8568320 | consumed tokens: 17547919360 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.794305E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.963 | TFLOPs: 26.17 | +7: iteration 33480/ 173500 | consumed samples: 8570880 | consumed tokens: 17553162240 | elapsed time per iteration (s): 0.15 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.792600E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.083 | TFLOPs: 26.18 | +7: iteration 33490/ 173500 | consumed samples: 8573440 | consumed tokens: 17558405120 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.782408E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.650 | TFLOPs: 26.18 | +7: iteration 33500/ 173500 | consumed samples: 8576000 | consumed tokens: 17563648000 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.803954E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.124 | TFLOPs: 26.18 | +7: iteration 33510/ 173500 | consumed samples: 8578560 | consumed tokens: 17568890880 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.778998E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.416 | TFLOPs: 26.21 | +7: iteration 33520/ 173500 | consumed samples: 8581120 | consumed tokens: 17574133760 | elapsed time per iteration (s): 0.16 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.787822E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.289 | TFLOPs: 25.57 | +7: iteration 33530/ 173500 | consumed samples: 8583680 | consumed tokens: 17579376640 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.783112E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.953 | TFLOPs: 26.22 | +7: iteration 33540/ 173500 | consumed samples: 8586240 | consumed tokens: 17584619520 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.803563E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.610 | TFLOPs: 26.20 | +7: iteration 33550/ 173500 | consumed samples: 8588800 | consumed tokens: 17589862400 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.798561E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.178 | TFLOPs: 26.21 | +7: iteration 33560/ 173500 | consumed samples: 8591360 | consumed tokens: 17595105280 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.800308E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.284 | TFLOPs: 26.18 | +7: iteration 33570/ 173500 | consumed samples: 8593920 | consumed tokens: 17600348160 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.802423E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.079 | TFLOPs: 26.21 | +7: iteration 33580/ 173500 | consumed samples: 8596480 | consumed tokens: 17605591040 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.799700E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.249 | TFLOPs: 26.19 | +7: iteration 33590/ 173500 | consumed samples: 8599040 | consumed tokens: 17610833920 | elapsed time per iteration (s): 0.15 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.781046E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.179 | TFLOPs: 26.19 | +7: iteration 33600/ 173500 | consumed samples: 8601600 | consumed tokens: 17616076800 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.803233E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.153 | TFLOPs: 26.18 | +7: iteration 33610/ 173500 | consumed samples: 8604160 | consumed tokens: 17621319680 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.789545E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.655 | TFLOPs: 26.18 | +7: iteration 33620/ 173500 | consumed samples: 8606720 | consumed tokens: 17626562560 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.800829E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.335 | TFLOPs: 26.20 | +7: iteration 33630/ 173500 | consumed samples: 8609280 | consumed tokens: 17631805440 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.805661E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.052 | TFLOPs: 26.21 | +7: iteration 33640/ 173500 | consumed samples: 8611840 | consumed tokens: 17637048320 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.802142E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.854 | TFLOPs: 26.22 | +7: iteration 33650/ 173500 | consumed samples: 8614400 | consumed tokens: 17642291200 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.797534E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.626 | TFLOPs: 26.20 | +7: iteration 33660/ 173500 | consumed samples: 8616960 | consumed tokens: 17647534080 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.795810E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.090 | TFLOPs: 26.18 | +7: iteration 33670/ 173500 | consumed samples: 8619520 | consumed tokens: 17652776960 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.786999E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.274 | TFLOPs: 26.18 | +7: iteration 33680/ 173500 | consumed samples: 8622080 | consumed tokens: 17658019840 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.796481E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.598 | TFLOPs: 26.18 | +7: iteration 33690/ 173500 | consumed samples: 8624640 | consumed tokens: 17663262720 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.799333E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.839 | TFLOPs: 26.19 | +7: iteration 33700/ 173500 | consumed samples: 8627200 | consumed tokens: 17668505600 | elapsed time per iteration (s): 0.15 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.796209E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.606 | TFLOPs: 26.18 | +7: iteration 33710/ 173500 | consumed samples: 8629760 | consumed tokens: 17673748480 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.799869E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.024 | TFLOPs: 26.10 | +7: iteration 33720/ 173500 | consumed samples: 8632320 | consumed tokens: 17678991360 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.798771E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.790 | TFLOPs: 26.06 | +7: iteration 33730/ 173500 | consumed samples: 8634880 | consumed tokens: 17684234240 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.791625E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.037 | TFLOPs: 26.06 | +7: iteration 33740/ 173500 | consumed samples: 8637440 | consumed tokens: 17689477120 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.795497E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.886 | TFLOPs: 26.05 | +7: iteration 33750/ 173500 | consumed samples: 8640000 | consumed tokens: 17694720000 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.791829E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.379 | TFLOPs: 26.01 | +7: iteration 33760/ 173500 | consumed samples: 8642560 | consumed tokens: 17699962880 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.797964E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.912 | TFLOPs: 26.05 | +7: iteration 33770/ 173500 | consumed samples: 8645120 | consumed tokens: 17705205760 | elapsed time per iteration (s): 0.16 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.794202E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.831 | TFLOPs: 25.81 | +7: iteration 33780/ 173500 | consumed samples: 8647680 | consumed tokens: 17710448640 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.803879E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.315 | TFLOPs: 26.05 | +7: iteration 33790/ 173500 | consumed samples: 8650240 | consumed tokens: 17715691520 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.806005E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.668 | TFLOPs: 26.07 | +7: iteration 33800/ 173500 | consumed samples: 8652800 | consumed tokens: 17720934400 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.788231E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.800 | TFLOPs: 26.08 | +7: iteration 33810/ 173500 | consumed samples: 8655360 | consumed tokens: 17726177280 | elapsed time per iteration (s): 0.15 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.799493E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.588 | TFLOPs: 26.04 | +7: iteration 33820/ 173500 | consumed samples: 8657920 | consumed tokens: 17731420160 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.794569E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.799 | TFLOPs: 26.08 | +7: iteration 33830/ 173500 | consumed samples: 8660480 | consumed tokens: 17736663040 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.799076E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.049 | TFLOPs: 26.08 | +7: iteration 33840/ 173500 | consumed samples: 8663040 | consumed tokens: 17741905920 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.801444E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.073 | TFLOPs: 26.08 | +7: iteration 33850/ 173500 | consumed samples: 8665600 | consumed tokens: 17747148800 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.789020E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.343 | TFLOPs: 26.13 | +7: iteration 33860/ 173500 | consumed samples: 8668160 | consumed tokens: 17752391680 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.787705E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.324 | TFLOPs: 26.19 | +7: iteration 33870/ 173500 | consumed samples: 8670720 | consumed tokens: 17757634560 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.790225E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.664 | TFLOPs: 26.20 | +7: iteration 33880/ 173500 | consumed samples: 8673280 | consumed tokens: 17762877440 | elapsed time per iteration (s): 0.16 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.796410E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.867 | TFLOPs: 25.81 | +7: iteration 33890/ 173500 | consumed samples: 8675840 | consumed tokens: 17768120320 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.799115E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.453 | TFLOPs: 26.20 | +7: iteration 33900/ 173500 | consumed samples: 8678400 | consumed tokens: 17773363200 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.788557E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.876 | TFLOPs: 26.17 | +7: iteration 33910/ 173500 | consumed samples: 8680960 | consumed tokens: 17778606080 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.810222E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.703 | TFLOPs: 26.19 | +7: iteration 33920/ 173500 | consumed samples: 8683520 | consumed tokens: 17783848960 | elapsed time per iteration (s): 0.15 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.813963E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.442 | TFLOPs: 26.20 | +7: iteration 33930/ 173500 | consumed samples: 8686080 | consumed tokens: 17789091840 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.793561E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.053 | TFLOPs: 26.17 | +7: iteration 33940/ 173500 | consumed samples: 8688640 | consumed tokens: 17794334720 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.800435E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.484 | TFLOPs: 26.21 | +7: iteration 33950/ 173500 | consumed samples: 8691200 | consumed tokens: 17799577600 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.799070E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.871 | TFLOPs: 26.20 | +7: iteration 33960/ 173500 | consumed samples: 8693760 | consumed tokens: 17804820480 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.808927E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.638 | TFLOPs: 26.20 | +7: iteration 33970/ 173500 | consumed samples: 8696320 | consumed tokens: 17810063360 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.791359E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.171 | TFLOPs: 26.19 | +7: iteration 33980/ 173500 | consumed samples: 8698880 | consumed tokens: 17815306240 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.803540E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.756 | TFLOPs: 26.19 | +7: iteration 33990/ 173500 | consumed samples: 8701440 | consumed tokens: 17820549120 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.798053E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.533 | TFLOPs: 26.20 | +0: [2023-03-17 01:43:38,072] [INFO] [logging.py:68:log_dist] [Rank 0] step=34000, skipped=0, lr=[0.00018477830620634072, 0.00018477830620634072, 0.00018477830620634072], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 34000/ 173500 | consumed samples: 8704000 | consumed tokens: 17825792000 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.804049E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.948 | TFLOPs: 26.19 | +0: steps: 34000 loss: 3.8070 iter time (s): 0.153 samples/sec: 1678.616 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 34000 | lm loss value: 3.943025E+00 | lm loss PPL: 5.157438E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 34000 to checkpoints_44m91b100m +0: [2023-03-17 01:43:38,145] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step34000 is begin to save! +0: [2023-03-17 01:43:38,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:43:38,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:43:38,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:43:38,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:43:38,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:43:38,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:43:38,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:43:38,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:43:38,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:43:38,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:43:38,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:43:38,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:43:38,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:43:38,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:43:38,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:43:38,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:43:38,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:43:38,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:43:38,281] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:43:38,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:43:38,282] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step34000/mp_rank_00_model_states.pt +0: [2023-03-17 01:43:38,282] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:43:38,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:43:38,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:43:38,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:43:38,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:43:38,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:43:38,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:43:38,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-17 01:43:38,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:43:38,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:43:38,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: successfully saved checkpoint at iteration 34000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.97 +7: iteration 34010/ 173500 | consumed samples: 8706560 | consumed tokens: 17831034880 | elapsed time per iteration (s): 0.18 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.809501E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.689 | TFLOPs: 22.53 | +7: iteration 34020/ 173500 | consumed samples: 8709120 | consumed tokens: 17836277760 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.795301E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.221 | TFLOPs: 26.21 | +7: iteration 34030/ 173500 | consumed samples: 8711680 | consumed tokens: 17841520640 | elapsed time per iteration (s): 0.15 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.801131E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.763 | TFLOPs: 26.20 | +7: iteration 34040/ 173500 | consumed samples: 8714240 | consumed tokens: 17846763520 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.786518E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.922 | TFLOPs: 26.22 | +7: iteration 34050/ 173500 | consumed samples: 8716800 | consumed tokens: 17852006400 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.795999E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.663 | TFLOPs: 26.20 | +7: iteration 34060/ 173500 | consumed samples: 8719360 | consumed tokens: 17857249280 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.797849E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.030 | TFLOPs: 26.21 | +7: iteration 34070/ 173500 | consumed samples: 8721920 | consumed tokens: 17862492160 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.789987E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.085 | TFLOPs: 26.21 | +7: iteration 34080/ 173500 | consumed samples: 8724480 | consumed tokens: 17867735040 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.805731E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.284 | TFLOPs: 26.19 | +7: iteration 34090/ 173500 | consumed samples: 8727040 | consumed tokens: 17872977920 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.801225E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.547 | TFLOPs: 26.21 | +7: iteration 34100/ 173500 | consumed samples: 8729600 | consumed tokens: 17878220800 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.787418E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.415 | TFLOPs: 26.23 | +7: iteration 34110/ 173500 | consumed samples: 8732160 | consumed tokens: 17883463680 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.816724E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.496 | TFLOPs: 26.21 | +7: iteration 34120/ 173500 | consumed samples: 8734720 | consumed tokens: 17888706560 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.789556E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.006 | TFLOPs: 26.21 | +7: iteration 34130/ 173500 | consumed samples: 8737280 | consumed tokens: 17893949440 | elapsed time per iteration (s): 0.15 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.783832E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.673 | TFLOPs: 26.20 | +7: iteration 34140/ 173500 | consumed samples: 8739840 | consumed tokens: 17899192320 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.793150E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.718 | TFLOPs: 26.22 | +7: iteration 34150/ 173500 | consumed samples: 8742400 | consumed tokens: 17904435200 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.785028E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.991 | TFLOPs: 26.21 | +7: iteration 34160/ 173500 | consumed samples: 8744960 | consumed tokens: 17909678080 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.808109E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.074 | TFLOPs: 26.22 | +7: iteration 34170/ 173500 | consumed samples: 8747520 | consumed tokens: 17914920960 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.800504E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.766 | TFLOPs: 26.22 | +7: iteration 34180/ 173500 | consumed samples: 8750080 | consumed tokens: 17920163840 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.791105E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.969 | TFLOPs: 26.20 | +7: iteration 34190/ 173500 | consumed samples: 8752640 | consumed tokens: 17925406720 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.791322E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.150 | TFLOPs: 26.21 | +7: iteration 34200/ 173500 | consumed samples: 8755200 | consumed tokens: 17930649600 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.802349E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.931 | TFLOPs: 26.22 | +7: iteration 34210/ 173500 | consumed samples: 8757760 | consumed tokens: 17935892480 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.801430E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.886 | TFLOPs: 26.17 | +7: iteration 34220/ 173500 | consumed samples: 8760320 | consumed tokens: 17941135360 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.799437E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.908 | TFLOPs: 26.22 | +7: iteration 34230/ 173500 | consumed samples: 8762880 | consumed tokens: 17946378240 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.791425E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.191 | TFLOPs: 26.21 | +7: iteration 34240/ 173500 | consumed samples: 8765440 | consumed tokens: 17951621120 | elapsed time per iteration (s): 0.15 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.812239E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.473 | TFLOPs: 26.21 | +7: iteration 34250/ 173500 | consumed samples: 8768000 | consumed tokens: 17956864000 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.801825E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.876 | TFLOPs: 26.22 | +7: iteration 34260/ 173500 | consumed samples: 8770560 | consumed tokens: 17962106880 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.798359E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.538 | TFLOPs: 26.21 | +7: iteration 34270/ 173500 | consumed samples: 8773120 | consumed tokens: 17967349760 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.808578E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.200 | TFLOPs: 26.19 | +7: iteration 34280/ 173500 | consumed samples: 8775680 | consumed tokens: 17972592640 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.803081E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.496 | TFLOPs: 26.18 | +7: iteration 34290/ 173500 | consumed samples: 8778240 | consumed tokens: 17977835520 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.791713E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.823 | TFLOPs: 26.20 | +7: iteration 34300/ 173500 | consumed samples: 8780800 | consumed tokens: 17983078400 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.813743E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.926 | TFLOPs: 26.19 | +7: iteration 34310/ 173500 | consumed samples: 8783360 | consumed tokens: 17988321280 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.796138E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.453 | TFLOPs: 26.20 | +7: iteration 34320/ 173500 | consumed samples: 8785920 | consumed tokens: 17993564160 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.792711E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.913 | TFLOPs: 26.22 | +7: iteration 34330/ 173500 | consumed samples: 8788480 | consumed tokens: 17998807040 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.794501E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.358 | TFLOPs: 26.23 | +7: iteration 34340/ 173500 | consumed samples: 8791040 | consumed tokens: 18004049920 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.797976E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.249 | TFLOPs: 26.23 | +7: iteration 34350/ 173500 | consumed samples: 8793600 | consumed tokens: 18009292800 | elapsed time per iteration (s): 0.15 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.780234E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.423 | TFLOPs: 26.21 | +7: iteration 34360/ 173500 | consumed samples: 8796160 | consumed tokens: 18014535680 | elapsed time per iteration (s): 0.16 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.802460E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.440 | TFLOPs: 25.77 | +7: iteration 34370/ 173500 | consumed samples: 8798720 | consumed tokens: 18019778560 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.799826E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.848 | TFLOPs: 26.17 | +7: iteration 34380/ 173500 | consumed samples: 8801280 | consumed tokens: 18025021440 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.792820E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.383 | TFLOPs: 26.21 | +7: iteration 34390/ 173500 | consumed samples: 8803840 | consumed tokens: 18030264320 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.814965E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.260 | TFLOPs: 26.21 | +7: iteration 34400/ 173500 | consumed samples: 8806400 | consumed tokens: 18035507200 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.789534E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.323 | TFLOPs: 26.19 | +7: iteration 34410/ 173500 | consumed samples: 8808960 | consumed tokens: 18040750080 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.793493E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.421 | TFLOPs: 26.21 | +7: iteration 34420/ 173500 | consumed samples: 8811520 | consumed tokens: 18045992960 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.793695E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.260 | TFLOPs: 26.19 | +7: iteration 34430/ 173500 | consumed samples: 8814080 | consumed tokens: 18051235840 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.805932E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.471 | TFLOPs: 26.20 | +7: iteration 34440/ 173500 | consumed samples: 8816640 | consumed tokens: 18056478720 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.784719E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.290 | TFLOPs: 26.16 | +7: iteration 34450/ 173500 | consumed samples: 8819200 | consumed tokens: 18061721600 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.790889E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.417 | TFLOPs: 26.18 | +7: iteration 34460/ 173500 | consumed samples: 8821760 | consumed tokens: 18066964480 | elapsed time per iteration (s): 0.15 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.785157E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.348 | TFLOPs: 26.20 | +7: iteration 34470/ 173500 | consumed samples: 8824320 | consumed tokens: 18072207360 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.780740E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.592 | TFLOPs: 26.20 | +7: iteration 34480/ 173500 | consumed samples: 8826880 | consumed tokens: 18077450240 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.780096E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.646 | TFLOPs: 26.18 | +7: iteration 34490/ 173500 | consumed samples: 8829440 | consumed tokens: 18082693120 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.795977E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.091 | TFLOPs: 26.19 | +7: iteration 34500/ 173500 | consumed samples: 8832000 | consumed tokens: 18087936000 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.804553E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.820 | TFLOPs: 26.19 | +7: iteration 34510/ 173500 | consumed samples: 8834560 | consumed tokens: 18093178880 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.795419E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.329 | TFLOPs: 26.18 | +7: iteration 34520/ 173500 | consumed samples: 8837120 | consumed tokens: 18098421760 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.781714E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.783 | TFLOPs: 26.19 | +7: iteration 34530/ 173500 | consumed samples: 8839680 | consumed tokens: 18103664640 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.815538E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.642 | TFLOPs: 26.18 | +7: iteration 34540/ 173500 | consumed samples: 8842240 | consumed tokens: 18108907520 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.785639E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.489 | TFLOPs: 26.17 | +7: iteration 34550/ 173500 | consumed samples: 8844800 | consumed tokens: 18114150400 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.799358E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.677 | TFLOPs: 26.15 | +7: iteration 34560/ 173500 | consumed samples: 8847360 | consumed tokens: 18119393280 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.793412E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.436 | TFLOPs: 26.15 | +7: iteration 34570/ 173500 | consumed samples: 8849920 | consumed tokens: 18124636160 | elapsed time per iteration (s): 0.15 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.800856E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.800 | TFLOPs: 26.14 | +7: iteration 34580/ 173500 | consumed samples: 8852480 | consumed tokens: 18129879040 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.792822E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.370 | TFLOPs: 26.15 | +7: iteration 34590/ 173500 | consumed samples: 8855040 | consumed tokens: 18135121920 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.786123E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.701 | TFLOPs: 26.15 | +7: iteration 34600/ 173500 | consumed samples: 8857600 | consumed tokens: 18140364800 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.791634E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.828 | TFLOPs: 26.16 | +7: iteration 34610/ 173500 | consumed samples: 8860160 | consumed tokens: 18145607680 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.789399E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.393 | TFLOPs: 26.15 | +7: iteration 34620/ 173500 | consumed samples: 8862720 | consumed tokens: 18150850560 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.797666E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.828 | TFLOPs: 26.08 | +7: iteration 34630/ 173500 | consumed samples: 8865280 | consumed tokens: 18156093440 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.789681E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.657 | TFLOPs: 26.14 | +7: iteration 34640/ 173500 | consumed samples: 8867840 | consumed tokens: 18161336320 | elapsed time per iteration (s): 0.16 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.799216E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.328 | TFLOPs: 25.55 | +7: iteration 34650/ 173500 | consumed samples: 8870400 | consumed tokens: 18166579200 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.782881E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.686 | TFLOPs: 26.14 | +7: iteration 34660/ 173500 | consumed samples: 8872960 | consumed tokens: 18171822080 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.785460E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.072 | TFLOPs: 26.13 | +7: iteration 34670/ 173500 | consumed samples: 8875520 | consumed tokens: 18177064960 | elapsed time per iteration (s): 0.15 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.791917E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.766 | TFLOPs: 26.12 | +7: iteration 34680/ 173500 | consumed samples: 8878080 | consumed tokens: 18182307840 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.790772E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.309 | TFLOPs: 26.15 | +7: iteration 34690/ 173500 | consumed samples: 8880640 | consumed tokens: 18187550720 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.795327E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.237 | TFLOPs: 26.15 | +7: iteration 34700/ 173500 | consumed samples: 8883200 | consumed tokens: 18192793600 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.799988E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.630 | TFLOPs: 26.14 | +7: iteration 34710/ 173500 | consumed samples: 8885760 | consumed tokens: 18198036480 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.790903E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.433 | TFLOPs: 26.13 | +7: iteration 34720/ 173500 | consumed samples: 8888320 | consumed tokens: 18203279360 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.774506E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.015 | TFLOPs: 26.16 | +7: iteration 34730/ 173500 | consumed samples: 8890880 | consumed tokens: 18208522240 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.784719E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.332 | TFLOPs: 26.18 | +7: iteration 34740/ 173500 | consumed samples: 8893440 | consumed tokens: 18213765120 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.800360E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.865 | TFLOPs: 26.17 | +7: iteration 34750/ 173500 | consumed samples: 8896000 | consumed tokens: 18219008000 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.790528E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.127 | TFLOPs: 26.16 | +7: iteration 34760/ 173500 | consumed samples: 8898560 | consumed tokens: 18224250880 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.796025E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.424 | TFLOPs: 26.17 | +7: iteration 34770/ 173500 | consumed samples: 8901120 | consumed tokens: 18229493760 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.791243E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.325 | TFLOPs: 26.15 | +7: iteration 34780/ 173500 | consumed samples: 8903680 | consumed tokens: 18234736640 | elapsed time per iteration (s): 0.15 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.787867E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.416 | TFLOPs: 26.15 | +7: iteration 34790/ 173500 | consumed samples: 8906240 | consumed tokens: 18239979520 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.790811E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.600 | TFLOPs: 26.17 | +7: iteration 34800/ 173500 | consumed samples: 8908800 | consumed tokens: 18245222400 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.806376E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.984 | TFLOPs: 26.17 | +7: iteration 34810/ 173500 | consumed samples: 8911360 | consumed tokens: 18250465280 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.787848E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.560 | TFLOPs: 26.15 | +7: iteration 34820/ 173500 | consumed samples: 8913920 | consumed tokens: 18255708160 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.798516E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.835 | TFLOPs: 26.16 | +7: iteration 34830/ 173500 | consumed samples: 8916480 | consumed tokens: 18260951040 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.803931E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.957 | TFLOPs: 26.16 | +7: iteration 34840/ 173500 | consumed samples: 8919040 | consumed tokens: 18266193920 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.813049E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.746 | TFLOPs: 26.14 | +7: iteration 34850/ 173500 | consumed samples: 8921600 | consumed tokens: 18271436800 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.795332E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.396 | TFLOPs: 26.16 | +7: iteration 34860/ 173500 | consumed samples: 8924160 | consumed tokens: 18276679680 | elapsed time per iteration (s): 0.16 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.792670E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.119 | TFLOPs: 25.49 | +7: iteration 34870/ 173500 | consumed samples: 8926720 | consumed tokens: 18281922560 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.784344E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.594 | TFLOPs: 26.14 | +7: iteration 34880/ 173500 | consumed samples: 8929280 | consumed tokens: 18287165440 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.799635E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.478 | TFLOPs: 26.13 | +7: iteration 34890/ 173500 | consumed samples: 8931840 | consumed tokens: 18292408320 | elapsed time per iteration (s): 0.15 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.806566E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.170 | TFLOPs: 26.13 | +7: iteration 34900/ 173500 | consumed samples: 8934400 | consumed tokens: 18297651200 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.795658E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.868 | TFLOPs: 26.12 | +7: iteration 34910/ 173500 | consumed samples: 8936960 | consumed tokens: 18302894080 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.784504E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.235 | TFLOPs: 26.13 | +7: iteration 34920/ 173500 | consumed samples: 8939520 | consumed tokens: 18308136960 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.796045E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.349 | TFLOPs: 26.12 | +7: iteration 34930/ 173500 | consumed samples: 8942080 | consumed tokens: 18313379840 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.801795E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.791 | TFLOPs: 26.11 | +7: iteration 34940/ 173500 | consumed samples: 8944640 | consumed tokens: 18318622720 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.795984E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.234 | TFLOPs: 26.13 | +7: iteration 34950/ 173500 | consumed samples: 8947200 | consumed tokens: 18323865600 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.785433E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.078 | TFLOPs: 26.13 | +7: iteration 34960/ 173500 | consumed samples: 8949760 | consumed tokens: 18329108480 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.775358E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.641 | TFLOPs: 26.14 | +7: iteration 34970/ 173500 | consumed samples: 8952320 | consumed tokens: 18334351360 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.784432E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.029 | TFLOPs: 26.14 | +7: iteration 34980/ 173500 | consumed samples: 8954880 | consumed tokens: 18339594240 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.803485E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.159 | TFLOPs: 26.15 | +7: iteration 34990/ 173500 | consumed samples: 8957440 | consumed tokens: 18344837120 | elapsed time per iteration (s): 0.15 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.783125E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.771 | TFLOPs: 26.14 | +7: iteration 35000/ 173500 | consumed samples: 8960000 | consumed tokens: 18350080000 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.773948E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.318 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 35000 | lm loss value: 3.897846E+00 | lm loss PPL: 4.929617E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 35000 to checkpoints_44m91b100m +0: [2023-03-17 01:46:11,860] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step35000 is begin to save! +0: [2023-03-17 01:46:11,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:46:11,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:46:11,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:46:11,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:46:11,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:46:11,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:46:11,940] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:46:11,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:46:11,948] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:46:11,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:46:11,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:46:11,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:46:11,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:46:11,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:46:11,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:46:11,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:46:11,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:46:11,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:46:11,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:46:11,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:46:11,990] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step35000/mp_rank_00_model_states.pt +0: [2023-03-17 01:46:11,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:46:11,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:46:12,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:46:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:46:12,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:46:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:46:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:46:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:46:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:46:12,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-17 01:46:12,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:46:12,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:46:12,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:46:12,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: successfully saved checkpoint at iteration 35000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.08 +7: iteration 35010/ 173500 | consumed samples: 8962560 | consumed tokens: 18355322880 | elapsed time per iteration (s): 0.18 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.792310E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.814 | TFLOPs: 22.77 | +7: iteration 35020/ 173500 | consumed samples: 8965120 | consumed tokens: 18360565760 | elapsed time per iteration (s): 0.16 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.807024E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.710 | TFLOPs: 25.86 | +7: iteration 35030/ 173500 | consumed samples: 8967680 | consumed tokens: 18365808640 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.799929E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.035 | TFLOPs: 26.14 | +7: iteration 35040/ 173500 | consumed samples: 8970240 | consumed tokens: 18371051520 | elapsed time per iteration (s): 0.16 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.785230E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.990 | TFLOPs: 25.81 | +7: iteration 35050/ 173500 | consumed samples: 8972800 | consumed tokens: 18376294400 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.796450E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.218 | TFLOPs: 26.13 | +7: iteration 35060/ 173500 | consumed samples: 8975360 | consumed tokens: 18381537280 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.791287E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.314 | TFLOPs: 26.13 | +7: iteration 35070/ 173500 | consumed samples: 8977920 | consumed tokens: 18386780160 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.795195E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.749 | TFLOPs: 26.12 | +7: iteration 35080/ 173500 | consumed samples: 8980480 | consumed tokens: 18392023040 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.800882E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.762 | TFLOPs: 26.08 | +7: iteration 35090/ 173500 | consumed samples: 8983040 | consumed tokens: 18397265920 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.801043E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.505 | TFLOPs: 26.10 | +7: iteration 35100/ 173500 | consumed samples: 8985600 | consumed tokens: 18402508800 | elapsed time per iteration (s): 0.15 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.785366E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.637 | TFLOPs: 26.09 | +7: iteration 35110/ 173500 | consumed samples: 8988160 | consumed tokens: 18407751680 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.780822E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.114 | TFLOPs: 26.08 | +7: iteration 35120/ 173500 | consumed samples: 8990720 | consumed tokens: 18412994560 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.802760E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.704 | TFLOPs: 26.11 | +7: iteration 35130/ 173500 | consumed samples: 8993280 | consumed tokens: 18418237440 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.783754E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.442 | TFLOPs: 26.10 | +7: iteration 35140/ 173500 | consumed samples: 8995840 | consumed tokens: 18423480320 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.797869E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.317 | TFLOPs: 26.08 | +7: iteration 35150/ 173500 | consumed samples: 8998400 | consumed tokens: 18428723200 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.792307E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.588 | TFLOPs: 26.10 | +7: iteration 35160/ 173500 | consumed samples: 9000960 | consumed tokens: 18433966080 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.789716E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.197 | TFLOPs: 26.07 | +7: iteration 35170/ 173500 | consumed samples: 9003520 | consumed tokens: 18439208960 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.801984E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.586 | TFLOPs: 26.14 | +7: iteration 35180/ 173500 | consumed samples: 9006080 | consumed tokens: 18444451840 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.790443E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.572 | TFLOPs: 26.14 | +7: iteration 35190/ 173500 | consumed samples: 9008640 | consumed tokens: 18449694720 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.787509E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.992 | TFLOPs: 26.14 | +7: iteration 35200/ 173500 | consumed samples: 9011200 | consumed tokens: 18454937600 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.789349E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.979 | TFLOPs: 25.97 | +7: iteration 35210/ 173500 | consumed samples: 9013760 | consumed tokens: 18460180480 | elapsed time per iteration (s): 0.15 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.794824E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.805 | TFLOPs: 26.12 | +7: iteration 35220/ 173500 | consumed samples: 9016320 | consumed tokens: 18465423360 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.804395E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.549 | TFLOPs: 26.12 | +7: iteration 35230/ 173500 | consumed samples: 9018880 | consumed tokens: 18470666240 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.804549E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.929 | TFLOPs: 26.17 | +7: iteration 35240/ 173500 | consumed samples: 9021440 | consumed tokens: 18475909120 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.785155E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.741 | TFLOPs: 26.17 | +7: iteration 35250/ 173500 | consumed samples: 9024000 | consumed tokens: 18481152000 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.792104E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.363 | TFLOPs: 26.15 | +7: iteration 35260/ 173500 | consumed samples: 9026560 | consumed tokens: 18486394880 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.794777E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.393 | TFLOPs: 26.16 | +7: iteration 35270/ 173500 | consumed samples: 9029120 | consumed tokens: 18491637760 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.781149E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.762 | TFLOPs: 26.15 | +7: iteration 35280/ 173500 | consumed samples: 9031680 | consumed tokens: 18496880640 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.782751E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.404 | TFLOPs: 26.15 | +7: iteration 35290/ 173500 | consumed samples: 9034240 | consumed tokens: 18502123520 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.778256E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.868 | TFLOPs: 26.14 | +7: iteration 35300/ 173500 | consumed samples: 9036800 | consumed tokens: 18507366400 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.794008E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.932 | TFLOPs: 26.14 | +7: iteration 35310/ 173500 | consumed samples: 9039360 | consumed tokens: 18512609280 | elapsed time per iteration (s): 0.15 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.786108E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.775 | TFLOPs: 26.15 | +7: iteration 35320/ 173500 | consumed samples: 9041920 | consumed tokens: 18517852160 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.796114E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.985 | TFLOPs: 26.14 | +7: iteration 35330/ 173500 | consumed samples: 9044480 | consumed tokens: 18523095040 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.799263E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.214 | TFLOPs: 26.16 | +7: iteration 35340/ 173500 | consumed samples: 9047040 | consumed tokens: 18528337920 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.781293E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.818 | TFLOPs: 26.16 | +7: iteration 35350/ 173500 | consumed samples: 9049600 | consumed tokens: 18533580800 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.790708E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.617 | TFLOPs: 26.15 | +7: iteration 35360/ 173500 | consumed samples: 9052160 | consumed tokens: 18538823680 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.791514E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.682 | TFLOPs: 26.17 | +7: iteration 35370/ 173500 | consumed samples: 9054720 | consumed tokens: 18544066560 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.785970E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.544 | TFLOPs: 26.17 | +7: iteration 35380/ 173500 | consumed samples: 9057280 | consumed tokens: 18549309440 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.800945E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.920 | TFLOPs: 26.17 | +7: iteration 35390/ 173500 | consumed samples: 9059840 | consumed tokens: 18554552320 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.787685E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.631 | TFLOPs: 26.15 | +7: iteration 35400/ 173500 | consumed samples: 9062400 | consumed tokens: 18559795200 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.789860E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.571 | TFLOPs: 26.17 | +7: iteration 35410/ 173500 | consumed samples: 9064960 | consumed tokens: 18565038080 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.782840E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.581 | TFLOPs: 26.17 | +7: iteration 35420/ 173500 | consumed samples: 9067520 | consumed tokens: 18570280960 | elapsed time per iteration (s): 0.15 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.789790E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.263 | TFLOPs: 26.19 | +7: iteration 35430/ 173500 | consumed samples: 9070080 | consumed tokens: 18575523840 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.788448E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.027 | TFLOPs: 26.21 | +7: iteration 35440/ 173500 | consumed samples: 9072640 | consumed tokens: 18580766720 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.782414E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.179 | TFLOPs: 26.21 | +7: iteration 35450/ 173500 | consumed samples: 9075200 | consumed tokens: 18586009600 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.788705E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.866 | TFLOPs: 26.22 | +7: iteration 35460/ 173500 | consumed samples: 9077760 | consumed tokens: 18591252480 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.801656E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.375 | TFLOPs: 26.21 | +7: iteration 35470/ 173500 | consumed samples: 9080320 | consumed tokens: 18596495360 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.784531E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.166 | TFLOPs: 26.21 | +7: iteration 35480/ 173500 | consumed samples: 9082880 | consumed tokens: 18601738240 | elapsed time per iteration (s): 0.17 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.773503E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.294 | TFLOPs: 24.20 | +7: iteration 35490/ 173500 | consumed samples: 9085440 | consumed tokens: 18606981120 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.772540E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.636 | TFLOPs: 26.20 | +7: iteration 35500/ 173500 | consumed samples: 9088000 | consumed tokens: 18612224000 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.789799E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.051 | TFLOPs: 26.21 | +7: iteration 35510/ 173500 | consumed samples: 9090560 | consumed tokens: 18617466880 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.802695E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.155 | TFLOPs: 26.19 | +7: iteration 35520/ 173500 | consumed samples: 9093120 | consumed tokens: 18622709760 | elapsed time per iteration (s): 0.15 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.797674E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.755 | TFLOPs: 26.20 | +7: iteration 35530/ 173500 | consumed samples: 9095680 | consumed tokens: 18627952640 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.791495E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.756 | TFLOPs: 26.12 | +7: iteration 35540/ 173500 | consumed samples: 9098240 | consumed tokens: 18633195520 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.792756E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.389 | TFLOPs: 26.09 | +7: iteration 35550/ 173500 | consumed samples: 9100800 | consumed tokens: 18638438400 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.792075E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.474 | TFLOPs: 26.13 | +7: iteration 35560/ 173500 | consumed samples: 9103360 | consumed tokens: 18643681280 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.793493E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.022 | TFLOPs: 26.13 | +7: iteration 35570/ 173500 | consumed samples: 9105920 | consumed tokens: 18648924160 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.787387E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.853 | TFLOPs: 26.14 | +7: iteration 35580/ 173500 | consumed samples: 9108480 | consumed tokens: 18654167040 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.806614E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.089 | TFLOPs: 26.11 | +7: iteration 35590/ 173500 | consumed samples: 9111040 | consumed tokens: 18659409920 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.787631E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.127 | TFLOPs: 26.11 | +7: iteration 35600/ 173500 | consumed samples: 9113600 | consumed tokens: 18664652800 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.799291E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.406 | TFLOPs: 26.09 | +7: iteration 35610/ 173500 | consumed samples: 9116160 | consumed tokens: 18669895680 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.798782E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.395 | TFLOPs: 26.07 | +7: iteration 35620/ 173500 | consumed samples: 9118720 | consumed tokens: 18675138560 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.796675E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.400 | TFLOPs: 26.07 | +7: iteration 35630/ 173500 | consumed samples: 9121280 | consumed tokens: 18680381440 | elapsed time per iteration (s): 0.15 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.801676E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.532 | TFLOPs: 26.09 | +7: iteration 35640/ 173500 | consumed samples: 9123840 | consumed tokens: 18685624320 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.788364E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.119 | TFLOPs: 26.10 | +7: iteration 35650/ 173500 | consumed samples: 9126400 | consumed tokens: 18690867200 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.785877E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.121 | TFLOPs: 26.13 | +7: iteration 35660/ 173500 | consumed samples: 9128960 | consumed tokens: 18696110080 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.796850E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.459 | TFLOPs: 26.13 | +7: iteration 35670/ 173500 | consumed samples: 9131520 | consumed tokens: 18701352960 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.789495E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.931 | TFLOPs: 26.13 | +7: iteration 35680/ 173500 | consumed samples: 9134080 | consumed tokens: 18706595840 | elapsed time per iteration (s): 0.16 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.793165E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.844 | TFLOPs: 25.61 | +7: iteration 35690/ 173500 | consumed samples: 9136640 | consumed tokens: 18711838720 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.792694E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.177 | TFLOPs: 26.10 | +7: iteration 35700/ 173500 | consumed samples: 9139200 | consumed tokens: 18717081600 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.800733E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.097 | TFLOPs: 26.05 | +7: iteration 35710/ 173500 | consumed samples: 9141760 | consumed tokens: 18722324480 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.802630E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.232 | TFLOPs: 26.10 | +7: iteration 35720/ 173500 | consumed samples: 9144320 | consumed tokens: 18727567360 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.792364E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.189 | TFLOPs: 26.10 | +7: iteration 35730/ 173500 | consumed samples: 9146880 | consumed tokens: 18732810240 | elapsed time per iteration (s): 0.15 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.785788E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.226 | TFLOPs: 26.08 | +7: iteration 35740/ 173500 | consumed samples: 9149440 | consumed tokens: 18738053120 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.799489E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.473 | TFLOPs: 26.18 | +7: iteration 35750/ 173500 | consumed samples: 9152000 | consumed tokens: 18743296000 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.796477E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.189 | TFLOPs: 26.19 | +7: iteration 35760/ 173500 | consumed samples: 9154560 | consumed tokens: 18748538880 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.784712E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.955 | TFLOPs: 26.17 | +7: iteration 35770/ 173500 | consumed samples: 9157120 | consumed tokens: 18753781760 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.782785E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.431 | TFLOPs: 26.12 | +7: iteration 35780/ 173500 | consumed samples: 9159680 | consumed tokens: 18759024640 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.777742E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.847 | TFLOPs: 26.16 | +7: iteration 35790/ 173500 | consumed samples: 9162240 | consumed tokens: 18764267520 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.782053E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.844 | TFLOPs: 26.16 | +7: iteration 35800/ 173500 | consumed samples: 9164800 | consumed tokens: 18769510400 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.795517E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.899 | TFLOPs: 26.09 | +7: iteration 35810/ 173500 | consumed samples: 9167360 | consumed tokens: 18774753280 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.790067E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.486 | TFLOPs: 26.12 | +7: iteration 35820/ 173500 | consumed samples: 9169920 | consumed tokens: 18779996160 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.792744E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.926 | TFLOPs: 26.19 | +7: iteration 35830/ 173500 | consumed samples: 9172480 | consumed tokens: 18785239040 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.794133E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.959 | TFLOPs: 26.17 | +7: iteration 35840/ 173500 | consumed samples: 9175040 | consumed tokens: 18790481920 | elapsed time per iteration (s): 0.15 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.795800E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.328 | TFLOPs: 26.10 | +7: iteration 35850/ 173500 | consumed samples: 9177600 | consumed tokens: 18795724800 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.786841E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.746 | TFLOPs: 26.11 | +7: iteration 35860/ 173500 | consumed samples: 9180160 | consumed tokens: 18800967680 | elapsed time per iteration (s): 0.16 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.783149E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.083 | TFLOPs: 25.80 | +7: iteration 35870/ 173500 | consumed samples: 9182720 | consumed tokens: 18806210560 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.787054E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.447 | TFLOPs: 26.10 | +7: iteration 35880/ 173500 | consumed samples: 9185280 | consumed tokens: 18811453440 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.806569E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.880 | TFLOPs: 26.16 | +7: iteration 35890/ 173500 | consumed samples: 9187840 | consumed tokens: 18816696320 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.777510E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.599 | TFLOPs: 26.18 | +7: iteration 35900/ 173500 | consumed samples: 9190400 | consumed tokens: 18821939200 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.783872E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.642 | TFLOPs: 26.18 | +7: iteration 35910/ 173500 | consumed samples: 9192960 | consumed tokens: 18827182080 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.789781E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.712 | TFLOPs: 26.20 | +7: iteration 35920/ 173500 | consumed samples: 9195520 | consumed tokens: 18832424960 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.785495E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.285 | TFLOPs: 26.19 | +7: iteration 35930/ 173500 | consumed samples: 9198080 | consumed tokens: 18837667840 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.785945E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.657 | TFLOPs: 26.18 | +7: iteration 35940/ 173500 | consumed samples: 9200640 | consumed tokens: 18842910720 | elapsed time per iteration (s): 0.15 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.780463E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.408 | TFLOPs: 26.18 | +7: iteration 35950/ 173500 | consumed samples: 9203200 | consumed tokens: 18848153600 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.780207E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.773 | TFLOPs: 26.15 | +7: iteration 35960/ 173500 | consumed samples: 9205760 | consumed tokens: 18853396480 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.794252E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.242 | TFLOPs: 26.18 | +7: iteration 35970/ 173500 | consumed samples: 9208320 | consumed tokens: 18858639360 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.799928E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.183 | TFLOPs: 26.18 | +7: iteration 35980/ 173500 | consumed samples: 9210880 | consumed tokens: 18863882240 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.790062E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.294 | TFLOPs: 26.18 | +7: iteration 35990/ 173500 | consumed samples: 9213440 | consumed tokens: 18869125120 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.798081E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.597 | TFLOPs: 26.18 | +0: [2023-03-17 01:48:45,825] [INFO] [logging.py:68:log_dist] [Rank 0] step=36000, skipped=0, lr=[0.00018289669072542715, 0.00018289669072542715, 0.00018289669072542715], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 36000/ 173500 | consumed samples: 9216000 | consumed tokens: 18874368000 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.799929E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.406 | TFLOPs: 26.18 | +0: steps: 36000 loss: 3.7690 iter time (s): 0.153 samples/sec: 1678.124 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 36000 | lm loss value: 3.965480E+00 | lm loss PPL: 5.274556E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 36000 to checkpoints_44m91b100m +0: [2023-03-17 01:48:45,899] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step36000 is begin to save! +0: [2023-03-17 01:48:45,902] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:48:45,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:48:45,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:48:45,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:48:45,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:48:45,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:48:45,980] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:48:45,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:48:45,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:48:45,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:48:45,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:48:46,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:48:46,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:48:46,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:48:46,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:48:46,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:48:46,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:48:46,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:48:46,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:48:46,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:48:46,030] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step36000/mp_rank_00_model_states.pt +0: [2023-03-17 01:48:46,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:48:46,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:48:46,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:48:46,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-17 01:48:46,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-17 01:48:46,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:48:46,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:48:46,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: successfully saved checkpoint at iteration 36000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.87 +7: iteration 36010/ 173500 | consumed samples: 9218560 | consumed tokens: 18879610880 | elapsed time per iteration (s): 0.18 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.795930E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1439.951 | TFLOPs: 22.58 | +7: iteration 36020/ 173500 | consumed samples: 9221120 | consumed tokens: 18884853760 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.789635E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.881 | TFLOPs: 26.20 | +7: iteration 36030/ 173500 | consumed samples: 9223680 | consumed tokens: 18890096640 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.782349E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.717 | TFLOPs: 26.20 | +7: iteration 36040/ 173500 | consumed samples: 9226240 | consumed tokens: 18895339520 | elapsed time per iteration (s): 0.15 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.787688E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.149 | TFLOPs: 26.21 | +7: iteration 36050/ 173500 | consumed samples: 9228800 | consumed tokens: 18900582400 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.796984E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.263 | TFLOPs: 26.19 | +7: iteration 36060/ 173500 | consumed samples: 9231360 | consumed tokens: 18905825280 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.795690E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.826 | TFLOPs: 26.20 | +7: iteration 36070/ 173500 | consumed samples: 9233920 | consumed tokens: 18911068160 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.786949E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.282 | TFLOPs: 26.21 | +7: iteration 36080/ 173500 | consumed samples: 9236480 | consumed tokens: 18916311040 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.771035E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.295 | TFLOPs: 26.19 | +7: iteration 36090/ 173500 | consumed samples: 9239040 | consumed tokens: 18921553920 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.795188E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.699 | TFLOPs: 26.20 | +7: iteration 36100/ 173500 | consumed samples: 9241600 | consumed tokens: 18926796800 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.788917E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.117 | TFLOPs: 26.19 | +7: iteration 36110/ 173500 | consumed samples: 9244160 | consumed tokens: 18932039680 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.791212E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.084 | TFLOPs: 26.18 | +7: iteration 36120/ 173500 | consumed samples: 9246720 | consumed tokens: 18937282560 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.789588E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.163 | TFLOPs: 26.21 | +7: iteration 36130/ 173500 | consumed samples: 9249280 | consumed tokens: 18942525440 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.788665E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.304 | TFLOPs: 26.19 | +7: iteration 36140/ 173500 | consumed samples: 9251840 | consumed tokens: 18947768320 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.782103E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.850 | TFLOPs: 26.16 | +7: iteration 36150/ 173500 | consumed samples: 9254400 | consumed tokens: 18953011200 | elapsed time per iteration (s): 0.15 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.790911E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.228 | TFLOPs: 26.11 | +7: iteration 36160/ 173500 | consumed samples: 9256960 | consumed tokens: 18958254080 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.767755E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.127 | TFLOPs: 26.08 | +7: iteration 36170/ 173500 | consumed samples: 9259520 | consumed tokens: 18963496960 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.780441E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.724 | TFLOPs: 26.06 | +7: iteration 36180/ 173500 | consumed samples: 9262080 | consumed tokens: 18968739840 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.787680E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.561 | TFLOPs: 26.04 | +7: iteration 36190/ 173500 | consumed samples: 9264640 | consumed tokens: 18973982720 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.795803E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.741 | TFLOPs: 26.09 | +7: iteration 36200/ 173500 | consumed samples: 9267200 | consumed tokens: 18979225600 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.780193E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.317 | TFLOPs: 26.12 | +7: iteration 36210/ 173500 | consumed samples: 9269760 | consumed tokens: 18984468480 | elapsed time per iteration (s): 0.16 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.777441E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.300 | TFLOPs: 25.82 | +7: iteration 36220/ 173500 | consumed samples: 9272320 | consumed tokens: 18989711360 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.784472E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.333 | TFLOPs: 26.15 | +7: iteration 36230/ 173500 | consumed samples: 9274880 | consumed tokens: 18994954240 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.795208E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.927 | TFLOPs: 26.14 | +7: iteration 36240/ 173500 | consumed samples: 9277440 | consumed tokens: 19000197120 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.784120E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.486 | TFLOPs: 26.13 | +7: iteration 36250/ 173500 | consumed samples: 9280000 | consumed tokens: 19005440000 | elapsed time per iteration (s): 0.15 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.776093E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.544 | TFLOPs: 26.14 | +7: iteration 36260/ 173500 | consumed samples: 9282560 | consumed tokens: 19010682880 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.806590E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.302 | TFLOPs: 26.13 | +7: iteration 36270/ 173500 | consumed samples: 9285120 | consumed tokens: 19015925760 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.787164E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.739 | TFLOPs: 26.12 | +7: iteration 36280/ 173500 | consumed samples: 9287680 | consumed tokens: 19021168640 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.789324E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.433 | TFLOPs: 26.12 | +7: iteration 36290/ 173500 | consumed samples: 9290240 | consumed tokens: 19026411520 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.780872E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.357 | TFLOPs: 26.13 | +7: iteration 36300/ 173500 | consumed samples: 9292800 | consumed tokens: 19031654400 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.785202E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.160 | TFLOPs: 26.15 | +7: iteration 36310/ 173500 | consumed samples: 9295360 | consumed tokens: 19036897280 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.794610E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.557 | TFLOPs: 26.14 | +7: iteration 36320/ 173500 | consumed samples: 9297920 | consumed tokens: 19042140160 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.794758E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.273 | TFLOPs: 26.13 | +7: iteration 36330/ 173500 | consumed samples: 9300480 | consumed tokens: 19047383040 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.789539E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.868 | TFLOPs: 26.12 | +7: iteration 36340/ 173500 | consumed samples: 9303040 | consumed tokens: 19052625920 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.788086E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.567 | TFLOPs: 26.14 | +7: iteration 36350/ 173500 | consumed samples: 9305600 | consumed tokens: 19057868800 | elapsed time per iteration (s): 0.15 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.787816E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.875 | TFLOPs: 26.14 | +7: iteration 36360/ 173500 | consumed samples: 9308160 | consumed tokens: 19063111680 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.788875E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.259 | TFLOPs: 26.02 | +7: iteration 36370/ 173500 | consumed samples: 9310720 | consumed tokens: 19068354560 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.799001E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.442 | TFLOPs: 26.13 | +7: iteration 36380/ 173500 | consumed samples: 9313280 | consumed tokens: 19073597440 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.780880E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.262 | TFLOPs: 26.13 | +7: iteration 36390/ 173500 | consumed samples: 9315840 | consumed tokens: 19078840320 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.780647E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.588 | TFLOPs: 26.14 | +7: iteration 36400/ 173500 | consumed samples: 9318400 | consumed tokens: 19084083200 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.793920E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.249 | TFLOPs: 26.13 | +7: iteration 36410/ 173500 | consumed samples: 9320960 | consumed tokens: 19089326080 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.794361E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.791 | TFLOPs: 26.11 | +7: iteration 36420/ 173500 | consumed samples: 9323520 | consumed tokens: 19094568960 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.786479E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.658 | TFLOPs: 26.12 | +7: iteration 36430/ 173500 | consumed samples: 9326080 | consumed tokens: 19099811840 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.794537E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.967 | TFLOPs: 26.08 | +7: iteration 36440/ 173500 | consumed samples: 9328640 | consumed tokens: 19105054720 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.784144E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.553 | TFLOPs: 26.12 | +7: iteration 36450/ 173500 | consumed samples: 9331200 | consumed tokens: 19110297600 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.787995E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.462 | TFLOPs: 26.12 | +7: iteration 36460/ 173500 | consumed samples: 9333760 | consumed tokens: 19115540480 | elapsed time per iteration (s): 0.15 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.792536E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.766 | TFLOPs: 26.09 | +7: iteration 36470/ 173500 | consumed samples: 9336320 | consumed tokens: 19120783360 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.785711E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.659 | TFLOPs: 26.09 | +7: iteration 36480/ 173500 | consumed samples: 9338880 | consumed tokens: 19126026240 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.802606E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.510 | TFLOPs: 26.09 | +7: iteration 36490/ 173500 | consumed samples: 9341440 | consumed tokens: 19131269120 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.787251E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.861 | TFLOPs: 26.12 | +7: iteration 36500/ 173500 | consumed samples: 9344000 | consumed tokens: 19136512000 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.783375E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.224 | TFLOPs: 26.11 | +7: iteration 36510/ 173500 | consumed samples: 9346560 | consumed tokens: 19141754880 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.792795E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.921 | TFLOPs: 26.13 | +7: iteration 36520/ 173500 | consumed samples: 9349120 | consumed tokens: 19146997760 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.773163E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.042 | TFLOPs: 26.13 | +7: iteration 36530/ 173500 | consumed samples: 9351680 | consumed tokens: 19152240640 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.784690E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.326 | TFLOPs: 26.13 | +7: iteration 36540/ 173500 | consumed samples: 9354240 | consumed tokens: 19157483520 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.789432E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.855 | TFLOPs: 26.12 | +7: iteration 36550/ 173500 | consumed samples: 9356800 | consumed tokens: 19162726400 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.791267E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.304 | TFLOPs: 26.13 | +7: iteration 36560/ 173500 | consumed samples: 9359360 | consumed tokens: 19167969280 | elapsed time per iteration (s): 0.15 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.792576E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.199 | TFLOPs: 26.13 | +7: iteration 36570/ 173500 | consumed samples: 9361920 | consumed tokens: 19173212160 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.789843E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.538 | TFLOPs: 26.12 | +7: iteration 36580/ 173500 | consumed samples: 9364480 | consumed tokens: 19178455040 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.788258E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.264 | TFLOPs: 26.15 | +7: iteration 36590/ 173500 | consumed samples: 9367040 | consumed tokens: 19183697920 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.780974E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.954 | TFLOPs: 26.14 | +7: iteration 36600/ 173500 | consumed samples: 9369600 | consumed tokens: 19188940800 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.788818E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.777 | TFLOPs: 26.12 | +7: iteration 36610/ 173500 | consumed samples: 9372160 | consumed tokens: 19194183680 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.774517E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.805 | TFLOPs: 26.12 | +7: iteration 36620/ 173500 | consumed samples: 9374720 | consumed tokens: 19199426560 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.795550E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.953 | TFLOPs: 26.13 | +7: iteration 36630/ 173500 | consumed samples: 9377280 | consumed tokens: 19204669440 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.795579E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.788 | TFLOPs: 26.12 | +7: iteration 36640/ 173500 | consumed samples: 9379840 | consumed tokens: 19209912320 | elapsed time per iteration (s): 0.15 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.803101E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.809 | TFLOPs: 26.14 | +7: iteration 36650/ 173500 | consumed samples: 9382400 | consumed tokens: 19215155200 | elapsed time per iteration (s): 0.16 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.776845E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.353 | TFLOPs: 25.08 | +7: iteration 36660/ 173500 | consumed samples: 9384960 | consumed tokens: 19220398080 | elapsed time per iteration (s): 0.16 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.789073E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.158 | TFLOPs: 25.71 | +7: iteration 36670/ 173500 | consumed samples: 9387520 | consumed tokens: 19225640960 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.792384E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.020 | TFLOPs: 26.11 | +7: iteration 36680/ 173500 | consumed samples: 9390080 | consumed tokens: 19230883840 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.789374E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.730 | TFLOPs: 26.12 | +7: iteration 36690/ 173500 | consumed samples: 9392640 | consumed tokens: 19236126720 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.803118E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.515 | TFLOPs: 26.12 | +7: iteration 36700/ 173500 | consumed samples: 9395200 | consumed tokens: 19241369600 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.788604E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.511 | TFLOPs: 26.10 | +7: iteration 36710/ 173500 | consumed samples: 9397760 | consumed tokens: 19246612480 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.795380E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.265 | TFLOPs: 26.12 | +7: iteration 36720/ 173500 | consumed samples: 9400320 | consumed tokens: 19251855360 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.785741E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.116 | TFLOPs: 26.13 | +7: iteration 36730/ 173500 | consumed samples: 9402880 | consumed tokens: 19257098240 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.790744E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.289 | TFLOPs: 26.12 | +7: iteration 36740/ 173500 | consumed samples: 9405440 | consumed tokens: 19262341120 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.794754E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.932 | TFLOPs: 26.14 | +7: iteration 36750/ 173500 | consumed samples: 9408000 | consumed tokens: 19267584000 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.793570E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.865 | TFLOPs: 26.14 | +7: iteration 36760/ 173500 | consumed samples: 9410560 | consumed tokens: 19272826880 | elapsed time per iteration (s): 0.15 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.793842E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.900 | TFLOPs: 26.14 | +7: iteration 36770/ 173500 | consumed samples: 9413120 | consumed tokens: 19278069760 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.796153E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.713 | TFLOPs: 26.15 | +7: iteration 36780/ 173500 | consumed samples: 9415680 | consumed tokens: 19283312640 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.782048E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.334 | TFLOPs: 26.20 | +7: iteration 36790/ 173500 | consumed samples: 9418240 | consumed tokens: 19288555520 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.777898E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.277 | TFLOPs: 26.19 | +7: iteration 36800/ 173500 | consumed samples: 9420800 | consumed tokens: 19293798400 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.790241E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.243 | TFLOPs: 26.21 | +7: iteration 36810/ 173500 | consumed samples: 9423360 | consumed tokens: 19299041280 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.780928E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.651 | TFLOPs: 26.20 | +7: iteration 36820/ 173500 | consumed samples: 9425920 | consumed tokens: 19304284160 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.776468E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.310 | TFLOPs: 26.19 | +7: iteration 36830/ 173500 | consumed samples: 9428480 | consumed tokens: 19309527040 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.795669E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.255 | TFLOPs: 26.18 | +7: iteration 36840/ 173500 | consumed samples: 9431040 | consumed tokens: 19314769920 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.795448E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.643 | TFLOPs: 26.20 | +7: iteration 36850/ 173500 | consumed samples: 9433600 | consumed tokens: 19320012800 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.794335E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.574 | TFLOPs: 26.10 | +7: iteration 36860/ 173500 | consumed samples: 9436160 | consumed tokens: 19325255680 | elapsed time per iteration (s): 0.15 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.783798E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.234 | TFLOPs: 26.10 | +7: iteration 36870/ 173500 | consumed samples: 9438720 | consumed tokens: 19330498560 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.786462E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.187 | TFLOPs: 26.08 | +7: iteration 36880/ 173500 | consumed samples: 9441280 | consumed tokens: 19335741440 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.790586E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.146 | TFLOPs: 26.11 | +7: iteration 36890/ 173500 | consumed samples: 9443840 | consumed tokens: 19340984320 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.777538E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.691 | TFLOPs: 26.11 | +7: iteration 36900/ 173500 | consumed samples: 9446400 | consumed tokens: 19346227200 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.799259E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.066 | TFLOPs: 26.10 | +7: iteration 36910/ 173500 | consumed samples: 9448960 | consumed tokens: 19351470080 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.779901E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.915 | TFLOPs: 26.09 | +7: iteration 36920/ 173500 | consumed samples: 9451520 | consumed tokens: 19356712960 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.780817E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.725 | TFLOPs: 26.11 | +7: iteration 36930/ 173500 | consumed samples: 9454080 | consumed tokens: 19361955840 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.785639E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.069 | TFLOPs: 26.11 | +7: iteration 36940/ 173500 | consumed samples: 9456640 | consumed tokens: 19367198720 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.786955E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.256 | TFLOPs: 26.10 | +7: iteration 36950/ 173500 | consumed samples: 9459200 | consumed tokens: 19372441600 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.775363E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.617 | TFLOPs: 26.14 | +7: iteration 36960/ 173500 | consumed samples: 9461760 | consumed tokens: 19377684480 | elapsed time per iteration (s): 0.15 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.786369E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.591 | TFLOPs: 26.18 | +7: iteration 36970/ 173500 | consumed samples: 9464320 | consumed tokens: 19382927360 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.787453E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.302 | TFLOPs: 26.18 | +7: iteration 36980/ 173500 | consumed samples: 9466880 | consumed tokens: 19388170240 | elapsed time per iteration (s): 0.16 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.783356E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.217 | TFLOPs: 25.33 | +7: iteration 36990/ 173500 | consumed samples: 9469440 | consumed tokens: 19393413120 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.782872E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.729 | TFLOPs: 25.97 | +7: iteration 37000/ 173500 | consumed samples: 9472000 | consumed tokens: 19398656000 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.796585E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.480 | TFLOPs: 26.20 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 37000 | lm loss value: 3.932555E+00 | lm loss PPL: 5.103723E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 37000 to checkpoints_44m91b100m +0: [2023-03-17 01:51:19,896] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step37000 is begin to save! +0: [2023-03-17 01:51:19,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:51:19,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:51:19,962] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:51:19,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:51:19,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:51:19,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:51:19,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:51:19,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:51:19,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:51:19,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:51:19,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:51:20,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:51:20,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:51:20,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:51:20,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:51:20,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:51:20,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:51:20,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:51:20,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:51:20,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:51:20,029] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step37000/mp_rank_00_model_states.pt +0: [2023-03-17 01:51:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:51:20,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:51:20,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 01:51:20,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:51:20,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:51:20,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: successfully saved checkpoint at iteration 37000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.93 +7: iteration 37010/ 173500 | consumed samples: 9474560 | consumed tokens: 19403898880 | elapsed time per iteration (s): 0.18 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.779217E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.695 | TFLOPs: 22.84 | +7: iteration 37020/ 173500 | consumed samples: 9477120 | consumed tokens: 19409141760 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.771437E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.196 | TFLOPs: 26.19 | +7: iteration 37030/ 173500 | consumed samples: 9479680 | consumed tokens: 19414384640 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.792347E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.873 | TFLOPs: 26.20 | +7: iteration 37040/ 173500 | consumed samples: 9482240 | consumed tokens: 19419627520 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.779290E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.062 | TFLOPs: 26.21 | +7: iteration 37050/ 173500 | consumed samples: 9484800 | consumed tokens: 19424870400 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.767732E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.755 | TFLOPs: 26.20 | +7: iteration 37060/ 173500 | consumed samples: 9487360 | consumed tokens: 19430113280 | elapsed time per iteration (s): 0.15 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.788442E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.657 | TFLOPs: 26.18 | +7: iteration 37070/ 173500 | consumed samples: 9489920 | consumed tokens: 19435356160 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.789226E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.088 | TFLOPs: 26.21 | +7: iteration 37080/ 173500 | consumed samples: 9492480 | consumed tokens: 19440599040 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.775266E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.605 | TFLOPs: 26.11 | +7: iteration 37090/ 173500 | consumed samples: 9495040 | consumed tokens: 19445841920 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.784274E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.413 | TFLOPs: 26.20 | +7: iteration 37100/ 173500 | consumed samples: 9497600 | consumed tokens: 19451084800 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.781962E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.030 | TFLOPs: 26.17 | +7: iteration 37110/ 173500 | consumed samples: 9500160 | consumed tokens: 19456327680 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.783150E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.365 | TFLOPs: 26.20 | +7: iteration 37120/ 173500 | consumed samples: 9502720 | consumed tokens: 19461570560 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.784495E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.556 | TFLOPs: 26.20 | +7: iteration 37130/ 173500 | consumed samples: 9505280 | consumed tokens: 19466813440 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.785886E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.856 | TFLOPs: 26.20 | +7: iteration 37140/ 173500 | consumed samples: 9507840 | consumed tokens: 19472056320 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.799039E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.342 | TFLOPs: 26.20 | +7: iteration 37150/ 173500 | consumed samples: 9510400 | consumed tokens: 19477299200 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.793906E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.726 | TFLOPs: 26.19 | +7: iteration 37160/ 173500 | consumed samples: 9512960 | consumed tokens: 19482542080 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.774704E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.259 | TFLOPs: 26.19 | +7: iteration 37170/ 173500 | consumed samples: 9515520 | consumed tokens: 19487784960 | elapsed time per iteration (s): 0.15 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.790913E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.461 | TFLOPs: 26.23 | +7: iteration 37180/ 173500 | consumed samples: 9518080 | consumed tokens: 19493027840 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.771399E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.836 | TFLOPs: 26.19 | +7: iteration 37190/ 173500 | consumed samples: 9520640 | consumed tokens: 19498270720 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.792228E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.827 | TFLOPs: 26.22 | +7: iteration 37200/ 173500 | consumed samples: 9523200 | consumed tokens: 19503513600 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.784521E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.238 | TFLOPs: 26.18 | +7: iteration 37210/ 173500 | consumed samples: 9525760 | consumed tokens: 19508756480 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.781204E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.526 | TFLOPs: 25.92 | +7: iteration 37220/ 173500 | consumed samples: 9528320 | consumed tokens: 19513999360 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.794638E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.968 | TFLOPs: 26.22 | +7: iteration 37230/ 173500 | consumed samples: 9530880 | consumed tokens: 19519242240 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.786481E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.258 | TFLOPs: 26.21 | +7: iteration 37240/ 173500 | consumed samples: 9533440 | consumed tokens: 19524485120 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.776845E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.420 | TFLOPs: 26.21 | +7: iteration 37250/ 173500 | consumed samples: 9536000 | consumed tokens: 19529728000 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.803268E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.329 | TFLOPs: 26.21 | +7: iteration 37260/ 173500 | consumed samples: 9538560 | consumed tokens: 19534970880 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.780622E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.208 | TFLOPs: 26.33 | +7: iteration 37270/ 173500 | consumed samples: 9541120 | consumed tokens: 19540213760 | elapsed time per iteration (s): 0.15 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.783913E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.110 | TFLOPs: 26.33 | +7: iteration 37280/ 173500 | consumed samples: 9543680 | consumed tokens: 19545456640 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.794444E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.432 | TFLOPs: 26.23 | +7: iteration 37290/ 173500 | consumed samples: 9546240 | consumed tokens: 19550699520 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.775024E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.365 | TFLOPs: 26.24 | +7: iteration 37300/ 173500 | consumed samples: 9548800 | consumed tokens: 19555942400 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.777699E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.940 | TFLOPs: 26.24 | +7: iteration 37310/ 173500 | consumed samples: 9551360 | consumed tokens: 19561185280 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.788231E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.126 | TFLOPs: 26.24 | +7: iteration 37320/ 173500 | consumed samples: 9553920 | consumed tokens: 19566428160 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.790128E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.551 | TFLOPs: 26.23 | +7: iteration 37330/ 173500 | consumed samples: 9556480 | consumed tokens: 19571671040 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.793255E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.672 | TFLOPs: 26.22 | +7: iteration 37340/ 173500 | consumed samples: 9559040 | consumed tokens: 19576913920 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.785535E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.703 | TFLOPs: 26.19 | +7: iteration 37350/ 173500 | consumed samples: 9561600 | consumed tokens: 19582156800 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.778364E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.597 | TFLOPs: 26.21 | +7: iteration 37360/ 173500 | consumed samples: 9564160 | consumed tokens: 19587399680 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.788268E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.205 | TFLOPs: 26.16 | +7: iteration 37370/ 173500 | consumed samples: 9566720 | consumed tokens: 19592642560 | elapsed time per iteration (s): 0.15 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.769693E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.196 | TFLOPs: 26.19 | +7: iteration 37380/ 173500 | consumed samples: 9569280 | consumed tokens: 19597885440 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.788393E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.000 | TFLOPs: 26.19 | +7: iteration 37390/ 173500 | consumed samples: 9571840 | consumed tokens: 19603128320 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.801554E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.411 | TFLOPs: 26.21 | +7: iteration 37400/ 173500 | consumed samples: 9574400 | consumed tokens: 19608371200 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.789386E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.847 | TFLOPs: 26.27 | +7: iteration 37410/ 173500 | consumed samples: 9576960 | consumed tokens: 19613614080 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.785242E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.089 | TFLOPs: 26.21 | +7: iteration 37420/ 173500 | consumed samples: 9579520 | consumed tokens: 19618856960 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.789528E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.348 | TFLOPs: 26.21 | +7: iteration 37430/ 173500 | consumed samples: 9582080 | consumed tokens: 19624099840 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.779930E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.453 | TFLOPs: 26.24 | +7: iteration 37440/ 173500 | consumed samples: 9584640 | consumed tokens: 19629342720 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.787829E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.885 | TFLOPs: 26.24 | +7: iteration 37450/ 173500 | consumed samples: 9587200 | consumed tokens: 19634585600 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.798181E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.874 | TFLOPs: 26.23 | +7: iteration 37460/ 173500 | consumed samples: 9589760 | consumed tokens: 19639828480 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.787632E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.886 | TFLOPs: 26.20 | +7: iteration 37470/ 173500 | consumed samples: 9592320 | consumed tokens: 19645071360 | elapsed time per iteration (s): 0.15 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.790434E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.503 | TFLOPs: 26.23 | +7: iteration 37480/ 173500 | consumed samples: 9594880 | consumed tokens: 19650314240 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.786539E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.858 | TFLOPs: 26.31 | +7: iteration 37490/ 173500 | consumed samples: 9597440 | consumed tokens: 19655557120 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.769141E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.477 | TFLOPs: 26.18 | +7: iteration 37500/ 173500 | consumed samples: 9600000 | consumed tokens: 19660800000 | elapsed time per iteration (s): 0.16 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.782796E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.707 | TFLOPs: 25.64 | +7: iteration 37510/ 173500 | consumed samples: 9602560 | consumed tokens: 19666042880 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.800164E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.807 | TFLOPs: 26.12 | +7: iteration 37520/ 173500 | consumed samples: 9605120 | consumed tokens: 19671285760 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.778162E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.457 | TFLOPs: 26.13 | +7: iteration 37530/ 173500 | consumed samples: 9607680 | consumed tokens: 19676528640 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.790385E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.440 | TFLOPs: 26.13 | +7: iteration 37540/ 173500 | consumed samples: 9610240 | consumed tokens: 19681771520 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.780589E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.643 | TFLOPs: 26.12 | +7: iteration 37550/ 173500 | consumed samples: 9612800 | consumed tokens: 19687014400 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.790799E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.009 | TFLOPs: 26.13 | +7: iteration 37560/ 173500 | consumed samples: 9615360 | consumed tokens: 19692257280 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.779776E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.659 | TFLOPs: 26.12 | +7: iteration 37570/ 173500 | consumed samples: 9617920 | consumed tokens: 19697500160 | elapsed time per iteration (s): 0.15 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.781477E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: iteration 37580/ 173500 | consumed samples: 9620480 | consumed tokens: 19702743040 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.789582E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.706 | TFLOPs: 26.15 | +7: iteration 37590/ 173500 | consumed samples: 9623040 | consumed tokens: 19707985920 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.779623E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.688 | TFLOPs: 26.15 | +7: iteration 37600/ 173500 | consumed samples: 9625600 | consumed tokens: 19713228800 | elapsed time per iteration (s): 0.16 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.788208E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.599 | TFLOPs: 25.78 | +7: iteration 37610/ 173500 | consumed samples: 9628160 | consumed tokens: 19718471680 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.782024E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.529 | TFLOPs: 26.12 | +7: iteration 37620/ 173500 | consumed samples: 9630720 | consumed tokens: 19723714560 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.786029E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.594 | TFLOPs: 26.14 | +7: iteration 37630/ 173500 | consumed samples: 9633280 | consumed tokens: 19728957440 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.786374E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.289 | TFLOPs: 26.13 | +7: iteration 37640/ 173500 | consumed samples: 9635840 | consumed tokens: 19734200320 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.785069E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.767 | TFLOPs: 26.15 | +7: iteration 37650/ 173500 | consumed samples: 9638400 | consumed tokens: 19739443200 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.783483E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.471 | TFLOPs: 26.12 | +7: iteration 37660/ 173500 | consumed samples: 9640960 | consumed tokens: 19744686080 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.784063E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.800 | TFLOPs: 26.16 | +7: iteration 37670/ 173500 | consumed samples: 9643520 | consumed tokens: 19749928960 | elapsed time per iteration (s): 0.15 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.790927E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.980 | TFLOPs: 26.16 | +7: iteration 37680/ 173500 | consumed samples: 9646080 | consumed tokens: 19755171840 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.782966E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.481 | TFLOPs: 26.15 | +7: iteration 37690/ 173500 | consumed samples: 9648640 | consumed tokens: 19760414720 | elapsed time per iteration (s): 0.16 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.792654E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.196 | TFLOPs: 25.75 | +7: iteration 37700/ 173500 | consumed samples: 9651200 | consumed tokens: 19765657600 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.788416E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.990 | TFLOPs: 26.14 | +7: iteration 37710/ 173500 | consumed samples: 9653760 | consumed tokens: 19770900480 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.785601E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.636 | TFLOPs: 26.15 | +7: iteration 37720/ 173500 | consumed samples: 9656320 | consumed tokens: 19776143360 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.780764E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.605 | TFLOPs: 26.15 | +7: iteration 37730/ 173500 | consumed samples: 9658880 | consumed tokens: 19781386240 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.794371E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.748 | TFLOPs: 26.14 | +7: iteration 37740/ 173500 | consumed samples: 9661440 | consumed tokens: 19786629120 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.781542E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.712 | TFLOPs: 26.12 | +7: iteration 37750/ 173500 | consumed samples: 9664000 | consumed tokens: 19791872000 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.787108E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.118 | TFLOPs: 26.10 | +7: iteration 37760/ 173500 | consumed samples: 9666560 | consumed tokens: 19797114880 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.790581E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.009 | TFLOPs: 26.14 | +7: iteration 37770/ 173500 | consumed samples: 9669120 | consumed tokens: 19802357760 | elapsed time per iteration (s): 0.15 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.773738E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.643 | TFLOPs: 26.14 | +7: iteration 37780/ 173500 | consumed samples: 9671680 | consumed tokens: 19807600640 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.787017E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.753 | TFLOPs: 26.14 | +7: iteration 37790/ 173500 | consumed samples: 9674240 | consumed tokens: 19812843520 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.783690E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.642 | TFLOPs: 26.14 | +7: iteration 37800/ 173500 | consumed samples: 9676800 | consumed tokens: 19818086400 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.791330E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.870 | TFLOPs: 26.13 | +7: iteration 37810/ 173500 | consumed samples: 9679360 | consumed tokens: 19823329280 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.774572E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.209 | TFLOPs: 26.11 | +7: iteration 37820/ 173500 | consumed samples: 9681920 | consumed tokens: 19828572160 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.784633E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.434 | TFLOPs: 26.15 | +7: iteration 37830/ 173500 | consumed samples: 9684480 | consumed tokens: 19833815040 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.793983E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.759 | TFLOPs: 26.12 | +7: iteration 37840/ 173500 | consumed samples: 9687040 | consumed tokens: 19839057920 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.779640E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.111 | TFLOPs: 26.14 | +7: iteration 37850/ 173500 | consumed samples: 9689600 | consumed tokens: 19844300800 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.782874E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.368 | TFLOPs: 26.15 | +7: iteration 37860/ 173500 | consumed samples: 9692160 | consumed tokens: 19849543680 | elapsed time per iteration (s): 0.15 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.777034E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.515 | TFLOPs: 26.15 | +7: iteration 37870/ 173500 | consumed samples: 9694720 | consumed tokens: 19854786560 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.773129E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.215 | TFLOPs: 26.11 | +7: iteration 37880/ 173500 | consumed samples: 9697280 | consumed tokens: 19860029440 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.785649E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.175 | TFLOPs: 26.15 | +7: iteration 37890/ 173500 | consumed samples: 9699840 | consumed tokens: 19865272320 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.776180E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.171 | TFLOPs: 26.13 | +7: iteration 37900/ 173500 | consumed samples: 9702400 | consumed tokens: 19870515200 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.795117E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.731 | TFLOPs: 26.14 | +7: iteration 37910/ 173500 | consumed samples: 9704960 | consumed tokens: 19875758080 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.779780E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.478 | TFLOPs: 26.13 | +7: iteration 37920/ 173500 | consumed samples: 9707520 | consumed tokens: 19881000960 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.774044E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.564 | TFLOPs: 26.14 | +7: iteration 37930/ 173500 | consumed samples: 9710080 | consumed tokens: 19886243840 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.783273E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.257 | TFLOPs: 26.13 | +7: iteration 37940/ 173500 | consumed samples: 9712640 | consumed tokens: 19891486720 | elapsed time per iteration (s): 0.15 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.801323E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.671 | TFLOPs: 26.14 | +7: iteration 37950/ 173500 | consumed samples: 9715200 | consumed tokens: 19896729600 | elapsed time per iteration (s): 0.16 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.782357E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.975 | TFLOPs: 25.37 | +7: iteration 37960/ 173500 | consumed samples: 9717760 | consumed tokens: 19901972480 | elapsed time per iteration (s): 0.16 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.784975E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.910 | TFLOPs: 25.56 | +7: iteration 37970/ 173500 | consumed samples: 9720320 | consumed tokens: 19907215360 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.786188E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.744 | TFLOPs: 26.12 | +7: iteration 37980/ 173500 | consumed samples: 9722880 | consumed tokens: 19912458240 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.786773E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.529 | TFLOPs: 26.10 | +7: iteration 37990/ 173500 | consumed samples: 9725440 | consumed tokens: 19917701120 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.757239E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.429 | TFLOPs: 26.06 | +0: [2023-03-17 01:53:53,637] [INFO] [logging.py:68:log_dist] [Rank 0] step=38000, skipped=0, lr=[0.00018091754328052937, 0.00018091754328052937, 0.00018091754328052937], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 38000/ 173500 | consumed samples: 9728000 | consumed tokens: 19922944000 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.784910E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.739 | TFLOPs: 26.12 | +0: steps: 38000 loss: 3.7780 iter time (s): 0.153 samples/sec: 1675.648 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 38000 | lm loss value: 3.891882E+00 | lm loss PPL: 4.900301E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 38000 to checkpoints_44m91b100m +0: [2023-03-17 01:53:53,711] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step38000 is begin to save! +0: [2023-03-17 01:53:53,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:53:53,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:53:53,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:53:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:53:53,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:53:53,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:53:53,792] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:53:53,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:53:53,800] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:53:53,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:53:53,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:53:53,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:53:53,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:53:53,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:53:53,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:53:53,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:53:53,833] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:53:53,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:53:53,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:53:53,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:53:53,842] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step38000/mp_rank_00_model_states.pt +0: [2023-03-17 01:53:53,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:53:53,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:53:53,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:53:53,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +6: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +3: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 01:53:53,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +5: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +4: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +7: [2023-03-17 01:53:53,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +1: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +2: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:53:53,884] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step38000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:53:53,884] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step38000 is ready now! +0: successfully saved checkpoint at iteration 38000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.09 +7: iteration 38010/ 173500 | consumed samples: 9730560 | consumed tokens: 19928186880 | elapsed time per iteration (s): 0.18 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.779467E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.059 | TFLOPs: 22.55 | +7: iteration 38020/ 173500 | consumed samples: 9733120 | consumed tokens: 19933429760 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.778976E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.764 | TFLOPs: 26.11 | +7: iteration 38030/ 173500 | consumed samples: 9735680 | consumed tokens: 19938672640 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.774556E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.108 | TFLOPs: 26.13 | +7: iteration 38040/ 173500 | consumed samples: 9738240 | consumed tokens: 19943915520 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.782874E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.714 | TFLOPs: 26.12 | +7: iteration 38050/ 173500 | consumed samples: 9740800 | consumed tokens: 19949158400 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.780130E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.945 | TFLOPs: 26.13 | +7: iteration 38060/ 173500 | consumed samples: 9743360 | consumed tokens: 19954401280 | elapsed time per iteration (s): 0.15 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.777510E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.419 | TFLOPs: 26.12 | +7: iteration 38070/ 173500 | consumed samples: 9745920 | consumed tokens: 19959644160 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.792133E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.278 | TFLOPs: 26.13 | +7: iteration 38080/ 173500 | consumed samples: 9748480 | consumed tokens: 19964887040 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.775243E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.495 | TFLOPs: 26.13 | +7: iteration 38090/ 173500 | consumed samples: 9751040 | consumed tokens: 19970129920 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.780164E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.010 | TFLOPs: 26.13 | +7: iteration 38100/ 173500 | consumed samples: 9753600 | consumed tokens: 19975372800 | elapsed time per iteration (s): 0.16 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.775146E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.414 | TFLOPs: 25.68 | +7: iteration 38110/ 173500 | consumed samples: 9756160 | consumed tokens: 19980615680 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.785253E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.754 | TFLOPs: 26.14 | +7: iteration 38120/ 173500 | consumed samples: 9758720 | consumed tokens: 19985858560 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.789502E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.690 | TFLOPs: 26.11 | +7: iteration 38130/ 173500 | consumed samples: 9761280 | consumed tokens: 19991101440 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.781866E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.643 | TFLOPs: 26.14 | +7: iteration 38140/ 173500 | consumed samples: 9763840 | consumed tokens: 19996344320 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.781462E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.171 | TFLOPs: 26.11 | +7: iteration 38150/ 173500 | consumed samples: 9766400 | consumed tokens: 20001587200 | elapsed time per iteration (s): 0.15 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.789062E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.974 | TFLOPs: 26.11 | +7: iteration 38160/ 173500 | consumed samples: 9768960 | consumed tokens: 20006830080 | elapsed time per iteration (s): 0.16 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.800431E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.118 | TFLOPs: 25.75 | +7: iteration 38170/ 173500 | consumed samples: 9771520 | consumed tokens: 20012072960 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.790437E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.498 | TFLOPs: 26.12 | +7: iteration 38180/ 173500 | consumed samples: 9774080 | consumed tokens: 20017315840 | elapsed time per iteration (s): 0.16 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.774341E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.366 | TFLOPs: 25.69 | +7: iteration 38190/ 173500 | consumed samples: 9776640 | consumed tokens: 20022558720 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.772740E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.232 | TFLOPs: 26.08 | +7: iteration 38200/ 173500 | consumed samples: 9779200 | consumed tokens: 20027801600 | elapsed time per iteration (s): 0.16 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.782611E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.729 | TFLOPs: 24.87 | +7: iteration 38210/ 173500 | consumed samples: 9781760 | consumed tokens: 20033044480 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.767439E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.310 | TFLOPs: 26.08 | +7: iteration 38220/ 173500 | consumed samples: 9784320 | consumed tokens: 20038287360 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.787935E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.163 | TFLOPs: 26.05 | +7: iteration 38230/ 173500 | consumed samples: 9786880 | consumed tokens: 20043530240 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.779867E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.969 | TFLOPs: 26.08 | +7: iteration 38240/ 173500 | consumed samples: 9789440 | consumed tokens: 20048773120 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.791278E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.512 | TFLOPs: 26.09 | +7: iteration 38250/ 173500 | consumed samples: 9792000 | consumed tokens: 20054016000 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.789617E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.029 | TFLOPs: 26.08 | +7: iteration 38260/ 173500 | consumed samples: 9794560 | consumed tokens: 20059258880 | elapsed time per iteration (s): 0.15 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.770131E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.602 | TFLOPs: 26.09 | +7: iteration 38270/ 173500 | consumed samples: 9797120 | consumed tokens: 20064501760 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.779815E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.404 | TFLOPs: 26.10 | +7: iteration 38280/ 173500 | consumed samples: 9799680 | consumed tokens: 20069744640 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.790527E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.976 | TFLOPs: 26.08 | +7: iteration 38290/ 173500 | consumed samples: 9802240 | consumed tokens: 20074987520 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.775266E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.546 | TFLOPs: 26.10 | +7: iteration 38300/ 173500 | consumed samples: 9804800 | consumed tokens: 20080230400 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.785461E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.569 | TFLOPs: 26.10 | +7: iteration 38310/ 173500 | consumed samples: 9807360 | consumed tokens: 20085473280 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.782421E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.171 | TFLOPs: 26.10 | +7: iteration 38320/ 173500 | consumed samples: 9809920 | consumed tokens: 20090716160 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.782006E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.947 | TFLOPs: 26.13 | +7: iteration 38330/ 173500 | consumed samples: 9812480 | consumed tokens: 20095959040 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.793731E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.094 | TFLOPs: 26.13 | +7: iteration 38340/ 173500 | consumed samples: 9815040 | consumed tokens: 20101201920 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.773820E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.053 | TFLOPs: 26.11 | +7: iteration 38350/ 173500 | consumed samples: 9817600 | consumed tokens: 20106444800 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.778429E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.840 | TFLOPs: 26.12 | +7: iteration 38360/ 173500 | consumed samples: 9820160 | consumed tokens: 20111687680 | elapsed time per iteration (s): 0.15 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.777697E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.022 | TFLOPs: 26.11 | +7: iteration 38370/ 173500 | consumed samples: 9822720 | consumed tokens: 20116930560 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.784901E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.518 | TFLOPs: 26.14 | +7: iteration 38380/ 173500 | consumed samples: 9825280 | consumed tokens: 20122173440 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.787608E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.741 | TFLOPs: 26.11 | +7: iteration 38390/ 173500 | consumed samples: 9827840 | consumed tokens: 20127416320 | elapsed time per iteration (s): 0.16 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.780376E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.170 | TFLOPs: 25.80 | +7: iteration 38400/ 173500 | consumed samples: 9830400 | consumed tokens: 20132659200 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.782527E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.279 | TFLOPs: 26.12 | +7: iteration 38410/ 173500 | consumed samples: 9832960 | consumed tokens: 20137902080 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.772479E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.656 | TFLOPs: 26.14 | +7: iteration 38420/ 173500 | consumed samples: 9835520 | consumed tokens: 20143144960 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.779113E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.544 | TFLOPs: 26.12 | +7: iteration 38430/ 173500 | consumed samples: 9838080 | consumed tokens: 20148387840 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.780423E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.070 | TFLOPs: 26.13 | +7: iteration 38440/ 173500 | consumed samples: 9840640 | consumed tokens: 20153630720 | elapsed time per iteration (s): 0.15 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.776441E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.629 | TFLOPs: 26.12 | +7: iteration 38450/ 173500 | consumed samples: 9843200 | consumed tokens: 20158873600 | elapsed time per iteration (s): 0.16 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.795298E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.653 | TFLOPs: 25.79 | +7: iteration 38460/ 173500 | consumed samples: 9845760 | consumed tokens: 20164116480 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.790651E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.433 | TFLOPs: 26.10 | +7: iteration 38470/ 173500 | consumed samples: 9848320 | consumed tokens: 20169359360 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.782046E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.277 | TFLOPs: 26.12 | +7: iteration 38480/ 173500 | consumed samples: 9850880 | consumed tokens: 20174602240 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.791348E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.163 | TFLOPs: 26.10 | +7: iteration 38490/ 173500 | consumed samples: 9853440 | consumed tokens: 20179845120 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.793815E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.807 | TFLOPs: 26.11 | +7: iteration 38500/ 173500 | consumed samples: 9856000 | consumed tokens: 20185088000 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.766557E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.505 | TFLOPs: 26.13 | +7: iteration 38510/ 173500 | consumed samples: 9858560 | consumed tokens: 20190330880 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.794945E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.111 | TFLOPs: 26.14 | +7: iteration 38520/ 173500 | consumed samples: 9861120 | consumed tokens: 20195573760 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.786345E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.198 | TFLOPs: 26.15 | +7: iteration 38530/ 173500 | consumed samples: 9863680 | consumed tokens: 20200816640 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.792964E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.222 | TFLOPs: 26.18 | +7: iteration 38540/ 173500 | consumed samples: 9866240 | consumed tokens: 20206059520 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.777202E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.150 | TFLOPs: 26.19 | +7: iteration 38550/ 173500 | consumed samples: 9868800 | consumed tokens: 20211302400 | elapsed time per iteration (s): 0.15 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.755584E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.040 | TFLOPs: 26.22 | +7: iteration 38560/ 173500 | consumed samples: 9871360 | consumed tokens: 20216545280 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.776116E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.640 | TFLOPs: 26.23 | +7: iteration 38570/ 173500 | consumed samples: 9873920 | consumed tokens: 20221788160 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.781948E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.382 | TFLOPs: 26.23 | +7: iteration 38580/ 173500 | consumed samples: 9876480 | consumed tokens: 20227031040 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.775828E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.795 | TFLOPs: 26.23 | +7: iteration 38590/ 173500 | consumed samples: 9879040 | consumed tokens: 20232273920 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.786559E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.953 | TFLOPs: 26.22 | +7: iteration 38600/ 173500 | consumed samples: 9881600 | consumed tokens: 20237516800 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.790587E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.116 | TFLOPs: 26.22 | +7: iteration 38610/ 173500 | consumed samples: 9884160 | consumed tokens: 20242759680 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.779905E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.810 | TFLOPs: 26.22 | +7: iteration 38620/ 173500 | consumed samples: 9886720 | consumed tokens: 20248002560 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.790035E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.312 | TFLOPs: 26.23 | +7: iteration 38630/ 173500 | consumed samples: 9889280 | consumed tokens: 20253245440 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.779480E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.401 | TFLOPs: 26.18 | +7: iteration 38640/ 173500 | consumed samples: 9891840 | consumed tokens: 20258488320 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.763781E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.751 | TFLOPs: 26.23 | +7: iteration 38650/ 173500 | consumed samples: 9894400 | consumed tokens: 20263731200 | elapsed time per iteration (s): 0.15 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.788137E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.075 | TFLOPs: 26.22 | +7: iteration 38660/ 173500 | consumed samples: 9896960 | consumed tokens: 20268974080 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.782917E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.192 | TFLOPs: 26.24 | +7: iteration 38670/ 173500 | consumed samples: 9899520 | consumed tokens: 20274216960 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.794994E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.716 | TFLOPs: 26.25 | +7: iteration 38680/ 173500 | consumed samples: 9902080 | consumed tokens: 20279459840 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.776319E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.262 | TFLOPs: 26.23 | +7: iteration 38690/ 173500 | consumed samples: 9904640 | consumed tokens: 20284702720 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.768665E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.499 | TFLOPs: 26.18 | +7: iteration 38700/ 173500 | consumed samples: 9907200 | consumed tokens: 20289945600 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.768016E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.352 | TFLOPs: 26.15 | +7: iteration 38710/ 173500 | consumed samples: 9909760 | consumed tokens: 20295188480 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.773067E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.859 | TFLOPs: 26.19 | +7: iteration 38720/ 173500 | consumed samples: 9912320 | consumed tokens: 20300431360 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.781282E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.920 | TFLOPs: 26.17 | +7: iteration 38730/ 173500 | consumed samples: 9914880 | consumed tokens: 20305674240 | elapsed time per iteration (s): 0.16 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.787695E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.608 | TFLOPs: 25.82 | +7: iteration 38740/ 173500 | consumed samples: 9917440 | consumed tokens: 20310917120 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.783759E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.197 | TFLOPs: 26.18 | +7: iteration 38750/ 173500 | consumed samples: 9920000 | consumed tokens: 20316160000 | elapsed time per iteration (s): 0.15 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.786028E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.674 | TFLOPs: 26.18 | +7: iteration 38760/ 173500 | consumed samples: 9922560 | consumed tokens: 20321402880 | elapsed time per iteration (s): 0.16 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.789324E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.180 | TFLOPs: 25.83 | +7: iteration 38770/ 173500 | consumed samples: 9925120 | consumed tokens: 20326645760 | elapsed time per iteration (s): 0.16 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.792988E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.894 | TFLOPs: 25.78 | +7: iteration 38780/ 173500 | consumed samples: 9927680 | consumed tokens: 20331888640 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.783185E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.254 | TFLOPs: 26.16 | +7: iteration 38790/ 173500 | consumed samples: 9930240 | consumed tokens: 20337131520 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.790277E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.536 | TFLOPs: 26.18 | +7: iteration 38800/ 173500 | consumed samples: 9932800 | consumed tokens: 20342374400 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.774555E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.959 | TFLOPs: 26.17 | +7: iteration 38810/ 173500 | consumed samples: 9935360 | consumed tokens: 20347617280 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.790992E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.199 | TFLOPs: 26.18 | +7: iteration 38820/ 173500 | consumed samples: 9937920 | consumed tokens: 20352860160 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.775769E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.156 | TFLOPs: 26.19 | +7: iteration 38830/ 173500 | consumed samples: 9940480 | consumed tokens: 20358103040 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.789272E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.785 | TFLOPs: 26.19 | +7: iteration 38840/ 173500 | consumed samples: 9943040 | consumed tokens: 20363345920 | elapsed time per iteration (s): 0.15 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.781815E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.934 | TFLOPs: 26.19 | +7: iteration 38850/ 173500 | consumed samples: 9945600 | consumed tokens: 20368588800 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.783767E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.153 | TFLOPs: 26.19 | +7: iteration 38860/ 173500 | consumed samples: 9948160 | consumed tokens: 20373831680 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.788722E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.301 | TFLOPs: 26.15 | +7: iteration 38870/ 173500 | consumed samples: 9950720 | consumed tokens: 20379074560 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.792119E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.073 | TFLOPs: 26.16 | +7: iteration 38880/ 173500 | consumed samples: 9953280 | consumed tokens: 20384317440 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.789454E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.451 | TFLOPs: 26.17 | +7: iteration 38890/ 173500 | consumed samples: 9955840 | consumed tokens: 20389560320 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.765843E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.978 | TFLOPs: 26.17 | +7: iteration 38900/ 173500 | consumed samples: 9958400 | consumed tokens: 20394803200 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.763413E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.815 | TFLOPs: 26.16 | +7: iteration 38910/ 173500 | consumed samples: 9960960 | consumed tokens: 20400046080 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.767510E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.791 | TFLOPs: 26.17 | +7: iteration 38920/ 173500 | consumed samples: 9963520 | consumed tokens: 20405288960 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.781839E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.889 | TFLOPs: 25.97 | +7: iteration 38930/ 173500 | consumed samples: 9966080 | consumed tokens: 20410531840 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.771368E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.108 | TFLOPs: 26.16 | +7: iteration 38940/ 173500 | consumed samples: 9968640 | consumed tokens: 20415774720 | elapsed time per iteration (s): 0.15 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.776918E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.293 | TFLOPs: 26.19 | +7: iteration 38950/ 173500 | consumed samples: 9971200 | consumed tokens: 20421017600 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.773169E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.967 | TFLOPs: 26.19 | +7: iteration 38960/ 173500 | consumed samples: 9973760 | consumed tokens: 20426260480 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.778117E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.814 | TFLOPs: 26.22 | +7: iteration 38970/ 173500 | consumed samples: 9976320 | consumed tokens: 20431503360 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.765917E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.947 | TFLOPs: 26.19 | +7: iteration 38980/ 173500 | consumed samples: 9978880 | consumed tokens: 20436746240 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.770532E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.886 | TFLOPs: 26.17 | +7: iteration 38990/ 173500 | consumed samples: 9981440 | consumed tokens: 20441989120 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.796719E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.036 | TFLOPs: 26.22 | +7: iteration 39000/ 173500 | consumed samples: 9984000 | consumed tokens: 20447232000 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.778172E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.242 | TFLOPs: 26.19 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 39000 | lm loss value: 3.917865E+00 | lm loss PPL: 5.029296E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 39000 to checkpoints_44m91b100m +0: [2023-03-17 01:56:27,720] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step39000 is begin to save! +0: [2023-03-17 01:56:27,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:56:27,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:56:27,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:56:27,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:56:27,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:56:27,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:56:27,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:56:27,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:56:27,810] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:56:27,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:56:27,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:56:27,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:56:27,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:56:27,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:56:27,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:56:27,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:56:27,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:56:27,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:56:27,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:56:27,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:56:27,852] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step39000/mp_rank_00_model_states.pt +0: [2023-03-17 01:56:27,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:56:27,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:56:27,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:56:27,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:56:27,882] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,882] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:56:27,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:56:27,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:56:27,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 01:56:27,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +5: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +2: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +1: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +7: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +4: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +3: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +6: [2023-03-17 01:56:27,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step39000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 01:56:27,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step39000 is ready now! +0: successfully saved checkpoint at iteration 39000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.61 +7: iteration 39010/ 173500 | consumed samples: 9986560 | consumed tokens: 20452474880 | elapsed time per iteration (s): 0.18 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.764858E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.913 | TFLOPs: 22.88 | +7: iteration 39020/ 173500 | consumed samples: 9989120 | consumed tokens: 20457717760 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.778421E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.240 | TFLOPs: 26.19 | +7: iteration 39030/ 173500 | consumed samples: 9991680 | consumed tokens: 20462960640 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.772311E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.302 | TFLOPs: 26.18 | +7: iteration 39040/ 173500 | consumed samples: 9994240 | consumed tokens: 20468203520 | elapsed time per iteration (s): 0.15 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.782958E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.845 | TFLOPs: 26.19 | +7: iteration 39050/ 173500 | consumed samples: 9996800 | consumed tokens: 20473446400 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.778396E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.428 | TFLOPs: 26.20 | +7: iteration 39060/ 173500 | consumed samples: 9999360 | consumed tokens: 20478689280 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.780010E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.315 | TFLOPs: 26.19 | +7: iteration 39070/ 173500 | consumed samples: 10001920 | consumed tokens: 20483932160 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.782796E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.130 | TFLOPs: 26.19 | +7: iteration 39080/ 173500 | consumed samples: 10004480 | consumed tokens: 20489175040 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.780901E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.220 | TFLOPs: 26.19 | +7: iteration 39090/ 173500 | consumed samples: 10007040 | consumed tokens: 20494417920 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.776766E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.938 | TFLOPs: 26.16 | +7: iteration 39100/ 173500 | consumed samples: 10009600 | consumed tokens: 20499660800 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.773030E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.770 | TFLOPs: 26.17 | +7: iteration 39110/ 173500 | consumed samples: 10012160 | consumed tokens: 20504903680 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.765993E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.521 | TFLOPs: 26.20 | +7: iteration 39120/ 173500 | consumed samples: 10014720 | consumed tokens: 20510146560 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.778083E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.364 | TFLOPs: 26.18 | +7: iteration 39130/ 173500 | consumed samples: 10017280 | consumed tokens: 20515389440 | elapsed time per iteration (s): 0.15 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.789907E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.082 | TFLOPs: 26.18 | +7: iteration 39140/ 173500 | consumed samples: 10019840 | consumed tokens: 20520632320 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.778531E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.126 | TFLOPs: 26.18 | +7: iteration 39150/ 173500 | consumed samples: 10022400 | consumed tokens: 20525875200 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.782544E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.276 | TFLOPs: 26.16 | +7: iteration 39160/ 173500 | consumed samples: 10024960 | consumed tokens: 20531118080 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.780715E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.044 | TFLOPs: 26.14 | +7: iteration 39170/ 173500 | consumed samples: 10027520 | consumed tokens: 20536360960 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.778523E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.265 | TFLOPs: 26.19 | +7: iteration 39180/ 173500 | consumed samples: 10030080 | consumed tokens: 20541603840 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.785556E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.384 | TFLOPs: 26.16 | +7: iteration 39190/ 173500 | consumed samples: 10032640 | consumed tokens: 20546846720 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.775116E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.272 | TFLOPs: 26.12 | +7: iteration 39200/ 173500 | consumed samples: 10035200 | consumed tokens: 20552089600 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.781898E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.433 | TFLOPs: 26.06 | +7: iteration 39210/ 173500 | consumed samples: 10037760 | consumed tokens: 20557332480 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.783531E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.618 | TFLOPs: 26.12 | +7: iteration 39220/ 173500 | consumed samples: 10040320 | consumed tokens: 20562575360 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.792274E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.649 | TFLOPs: 26.14 | +7: iteration 39230/ 173500 | consumed samples: 10042880 | consumed tokens: 20567818240 | elapsed time per iteration (s): 0.15 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.783191E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.022 | TFLOPs: 26.14 | +7: iteration 39240/ 173500 | consumed samples: 10045440 | consumed tokens: 20573061120 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.789859E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.636 | TFLOPs: 26.15 | +7: iteration 39250/ 173500 | consumed samples: 10048000 | consumed tokens: 20578304000 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.776881E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.646 | TFLOPs: 26.14 | +7: iteration 39260/ 173500 | consumed samples: 10050560 | consumed tokens: 20583546880 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.781712E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.288 | TFLOPs: 26.12 | +7: iteration 39270/ 173500 | consumed samples: 10053120 | consumed tokens: 20588789760 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.778459E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.396 | TFLOPs: 26.15 | +7: iteration 39280/ 173500 | consumed samples: 10055680 | consumed tokens: 20594032640 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.784849E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.646 | TFLOPs: 26.14 | +7: iteration 39290/ 173500 | consumed samples: 10058240 | consumed tokens: 20599275520 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.772740E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.761 | TFLOPs: 26.17 | +7: iteration 39300/ 173500 | consumed samples: 10060800 | consumed tokens: 20604518400 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.785928E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.464 | TFLOPs: 26.15 | +7: iteration 39310/ 173500 | consumed samples: 10063360 | consumed tokens: 20609761280 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.774694E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.988 | TFLOPs: 26.14 | +7: iteration 39320/ 173500 | consumed samples: 10065920 | consumed tokens: 20615004160 | elapsed time per iteration (s): 0.15 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.775124E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.085 | TFLOPs: 26.14 | +7: iteration 39330/ 173500 | consumed samples: 10068480 | consumed tokens: 20620247040 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.766949E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.125 | TFLOPs: 26.14 | +7: iteration 39340/ 173500 | consumed samples: 10071040 | consumed tokens: 20625489920 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.783966E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.340 | TFLOPs: 26.16 | +7: iteration 39350/ 173500 | consumed samples: 10073600 | consumed tokens: 20630732800 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.770539E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.183 | TFLOPs: 26.13 | +7: iteration 39360/ 173500 | consumed samples: 10076160 | consumed tokens: 20635975680 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.776110E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.889 | TFLOPs: 26.13 | +7: iteration 39370/ 173500 | consumed samples: 10078720 | consumed tokens: 20641218560 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.778479E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.945 | TFLOPs: 26.14 | +7: iteration 39380/ 173500 | consumed samples: 10081280 | consumed tokens: 20646461440 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.765154E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.948 | TFLOPs: 26.14 | +7: iteration 39390/ 173500 | consumed samples: 10083840 | consumed tokens: 20651704320 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.776630E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.719 | TFLOPs: 26.15 | +7: iteration 39400/ 173500 | consumed samples: 10086400 | consumed tokens: 20656947200 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.775651E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.918 | TFLOPs: 26.16 | +7: iteration 39410/ 173500 | consumed samples: 10088960 | consumed tokens: 20662190080 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.784684E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.027 | TFLOPs: 25.91 | +7: iteration 39420/ 173500 | consumed samples: 10091520 | consumed tokens: 20667432960 | elapsed time per iteration (s): 0.15 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.785019E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.031 | TFLOPs: 26.14 | +7: iteration 39430/ 173500 | consumed samples: 10094080 | consumed tokens: 20672675840 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.765615E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.986 | TFLOPs: 26.14 | +7: iteration 39440/ 173500 | consumed samples: 10096640 | consumed tokens: 20677918720 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.783742E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.993 | TFLOPs: 26.16 | +7: iteration 39450/ 173500 | consumed samples: 10099200 | consumed tokens: 20683161600 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.776241E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.181 | TFLOPs: 26.16 | +7: iteration 39460/ 173500 | consumed samples: 10101760 | consumed tokens: 20688404480 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.780169E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.270 | TFLOPs: 26.15 | +7: iteration 39470/ 173500 | consumed samples: 10104320 | consumed tokens: 20693647360 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.764289E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.617 | TFLOPs: 26.15 | +7: iteration 39480/ 173500 | consumed samples: 10106880 | consumed tokens: 20698890240 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.767213E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.162 | TFLOPs: 26.16 | +7: iteration 39490/ 173500 | consumed samples: 10109440 | consumed tokens: 20704133120 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.783654E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.731 | TFLOPs: 26.14 | +7: iteration 39500/ 173500 | consumed samples: 10112000 | consumed tokens: 20709376000 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.772849E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.404 | TFLOPs: 26.12 | +7: iteration 39510/ 173500 | consumed samples: 10114560 | consumed tokens: 20714618880 | elapsed time per iteration (s): 0.15 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.773917E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.326 | TFLOPs: 26.12 | +7: iteration 39520/ 173500 | consumed samples: 10117120 | consumed tokens: 20719861760 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.803214E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.480 | TFLOPs: 26.13 | +7: iteration 39530/ 173500 | consumed samples: 10119680 | consumed tokens: 20725104640 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.778279E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.553 | TFLOPs: 26.10 | +7: iteration 39540/ 173500 | consumed samples: 10122240 | consumed tokens: 20730347520 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.756700E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.725 | TFLOPs: 26.11 | +7: iteration 39550/ 173500 | consumed samples: 10124800 | consumed tokens: 20735590400 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.780450E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.480 | TFLOPs: 26.12 | +7: iteration 39560/ 173500 | consumed samples: 10127360 | consumed tokens: 20740833280 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.781219E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.240 | TFLOPs: 26.12 | +7: iteration 39570/ 173500 | consumed samples: 10129920 | consumed tokens: 20746076160 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.789802E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.244 | TFLOPs: 26.13 | +7: iteration 39580/ 173500 | consumed samples: 10132480 | consumed tokens: 20751319040 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.795332E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.994 | TFLOPs: 26.11 | +7: iteration 39590/ 173500 | consumed samples: 10135040 | consumed tokens: 20756561920 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.769393E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.348 | TFLOPs: 26.09 | +7: iteration 39600/ 173500 | consumed samples: 10137600 | consumed tokens: 20761804800 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.762293E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.491 | TFLOPs: 26.12 | +7: iteration 39610/ 173500 | consumed samples: 10140160 | consumed tokens: 20767047680 | elapsed time per iteration (s): 0.15 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.770788E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.491 | TFLOPs: 26.12 | +7: iteration 39620/ 173500 | consumed samples: 10142720 | consumed tokens: 20772290560 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.776258E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.922 | TFLOPs: 26.14 | +7: iteration 39630/ 173500 | consumed samples: 10145280 | consumed tokens: 20777533440 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.781682E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.426 | TFLOPs: 26.12 | +7: iteration 39640/ 173500 | consumed samples: 10147840 | consumed tokens: 20782776320 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.779636E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.338 | TFLOPs: 26.16 | +7: iteration 39650/ 173500 | consumed samples: 10150400 | consumed tokens: 20788019200 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.795033E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.110 | TFLOPs: 26.18 | +7: iteration 39660/ 173500 | consumed samples: 10152960 | consumed tokens: 20793262080 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.787900E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.356 | TFLOPs: 26.21 | +7: iteration 39670/ 173500 | consumed samples: 10155520 | consumed tokens: 20798504960 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.785381E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.212 | TFLOPs: 26.21 | +7: iteration 39680/ 173500 | consumed samples: 10158080 | consumed tokens: 20803747840 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.766154E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.867 | TFLOPs: 26.20 | +7: iteration 39690/ 173500 | consumed samples: 10160640 | consumed tokens: 20808990720 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.785783E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.186 | TFLOPs: 26.21 | +7: iteration 39700/ 173500 | consumed samples: 10163200 | consumed tokens: 20814233600 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.780314E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.405 | TFLOPs: 26.21 | +7: iteration 39710/ 173500 | consumed samples: 10165760 | consumed tokens: 20819476480 | elapsed time per iteration (s): 0.15 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.786750E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.945 | TFLOPs: 26.22 | +7: iteration 39720/ 173500 | consumed samples: 10168320 | consumed tokens: 20824719360 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.778347E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.123 | TFLOPs: 26.22 | +7: iteration 39730/ 173500 | consumed samples: 10170880 | consumed tokens: 20829962240 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.772294E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.551 | TFLOPs: 26.23 | +7: iteration 39740/ 173500 | consumed samples: 10173440 | consumed tokens: 20835205120 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.787953E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.818 | TFLOPs: 26.22 | +7: iteration 39750/ 173500 | consumed samples: 10176000 | consumed tokens: 20840448000 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.786698E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.153 | TFLOPs: 26.22 | +7: iteration 39760/ 173500 | consumed samples: 10178560 | consumed tokens: 20845690880 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.788663E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.997 | TFLOPs: 26.22 | +7: iteration 39770/ 173500 | consumed samples: 10181120 | consumed tokens: 20850933760 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.772104E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.784 | TFLOPs: 26.22 | +7: iteration 39780/ 173500 | consumed samples: 10183680 | consumed tokens: 20856176640 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.775948E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.611 | TFLOPs: 26.22 | +7: iteration 39790/ 173500 | consumed samples: 10186240 | consumed tokens: 20861419520 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.778861E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.677 | TFLOPs: 26.20 | +7: iteration 39800/ 173500 | consumed samples: 10188800 | consumed tokens: 20866662400 | elapsed time per iteration (s): 0.15 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.767865E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.804 | TFLOPs: 26.20 | +7: iteration 39810/ 173500 | consumed samples: 10191360 | consumed tokens: 20871905280 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.771333E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.171 | TFLOPs: 26.19 | +7: iteration 39820/ 173500 | consumed samples: 10193920 | consumed tokens: 20877148160 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.783667E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.288 | TFLOPs: 26.23 | +7: iteration 39830/ 173500 | consumed samples: 10196480 | consumed tokens: 20882391040 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.773224E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.134 | TFLOPs: 26.33 | +7: iteration 39840/ 173500 | consumed samples: 10199040 | consumed tokens: 20887633920 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.770606E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.486 | TFLOPs: 26.34 | +7: iteration 39850/ 173500 | consumed samples: 10201600 | consumed tokens: 20892876800 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.785186E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.937 | TFLOPs: 26.31 | +7: iteration 39860/ 173500 | consumed samples: 10204160 | consumed tokens: 20898119680 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.779161E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.963 | TFLOPs: 26.31 | +7: iteration 39870/ 173500 | consumed samples: 10206720 | consumed tokens: 20903362560 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.772981E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.335 | TFLOPs: 26.32 | +7: iteration 39880/ 173500 | consumed samples: 10209280 | consumed tokens: 20908605440 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.786769E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.868 | TFLOPs: 26.30 | +7: iteration 39890/ 173500 | consumed samples: 10211840 | consumed tokens: 20913848320 | elapsed time per iteration (s): 0.15 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.771298E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.957 | TFLOPs: 26.28 | +7: iteration 39900/ 173500 | consumed samples: 10214400 | consumed tokens: 20919091200 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.767098E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.353 | TFLOPs: 26.29 | +7: iteration 39910/ 173500 | consumed samples: 10216960 | consumed tokens: 20924334080 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.782626E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.055 | TFLOPs: 26.32 | +7: iteration 39920/ 173500 | consumed samples: 10219520 | consumed tokens: 20929576960 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.778582E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.375 | TFLOPs: 25.98 | +7: iteration 39930/ 173500 | consumed samples: 10222080 | consumed tokens: 20934819840 | elapsed time per iteration (s): 0.16 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.788698E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.414 | TFLOPs: 25.51 | +7: iteration 39940/ 173500 | consumed samples: 10224640 | consumed tokens: 20940062720 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.773574E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.644 | TFLOPs: 26.23 | +7: iteration 39950/ 173500 | consumed samples: 10227200 | consumed tokens: 20945305600 | elapsed time per iteration (s): 0.16 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.776692E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.295 | TFLOPs: 25.77 | +7: iteration 39960/ 173500 | consumed samples: 10229760 | consumed tokens: 20950548480 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.785313E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.818 | TFLOPs: 26.19 | +7: iteration 39970/ 173500 | consumed samples: 10232320 | consumed tokens: 20955791360 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.785898E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.228 | TFLOPs: 26.30 | +7: iteration 39980/ 173500 | consumed samples: 10234880 | consumed tokens: 20961034240 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.784983E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.616 | TFLOPs: 26.29 | +7: iteration 39990/ 173500 | consumed samples: 10237440 | consumed tokens: 20966277120 | elapsed time per iteration (s): 0.15 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.773450E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.795 | TFLOPs: 26.30 | +0: [2023-03-17 01:59:01,321] [INFO] [logging.py:68:log_dist] [Rank 0] step=40000, skipped=0, lr=[0.0001788435118675357, 0.0001788435118675357, 0.0001788435118675357], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 40000/ 173500 | consumed samples: 10240000 | consumed tokens: 20971520000 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.785066E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.205 | TFLOPs: 26.32 | +0: steps: 40000 loss: 3.8126 iter time (s): 0.153 samples/sec: 1676.510 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 40000 | lm loss value: 3.888198E+00 | lm loss PPL: 4.882284E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 40000 to checkpoints_44m91b100m +0: [2023-03-17 01:59:01,393] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step40000 is begin to save! +0: [2023-03-17 01:59:01,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_01-model_00-model_states.pt... +0: [2023-03-17 01:59:01,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_01-model_00-model_states.pt. +0: [2023-03-17 01:59:01,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_03-model_00-model_states.pt... +0: [2023-03-17 01:59:01,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_03-model_00-model_states.pt. +0: [2023-03-17 01:59:01,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_04-model_00-model_states.pt... +0: [2023-03-17 01:59:01,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_04-model_00-model_states.pt. +0: [2023-03-17 01:59:01,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_05-model_00-model_states.pt... +0: [2023-03-17 01:59:01,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_05-model_00-model_states.pt. +0: [2023-03-17 01:59:01,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_06-model_00-model_states.pt... +0: [2023-03-17 01:59:01,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_06-model_00-model_states.pt. +0: [2023-03-17 01:59:01,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_07-model_00-model_states.pt... +0: [2023-03-17 01:59:01,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_07-model_00-model_states.pt. +0: [2023-03-17 01:59:01,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_08-model_00-model_states.pt... +0: [2023-03-17 01:59:01,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_08-model_00-model_states.pt. +0: [2023-03-17 01:59:01,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_09-model_00-model_states.pt... +0: [2023-03-17 01:59:01,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_09-model_00-model_states.pt. +0: [2023-03-17 01:59:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_10-model_00-model_states.pt... +0: [2023-03-17 01:59:01,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_10-model_00-model_states.pt. +0: [2023-03-17 01:59:01,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/layer_12-model_00-model_states.pt... +0: [2023-03-17 01:59:01,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/layer_12-model_00-model_states.pt. +0: [2023-03-17 01:59:01,526] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step40000/mp_rank_00_model_states.pt +0: [2023-03-17 01:59:01,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/mp_rank_00_model_states.pt... +0: [2023-03-17 01:59:01,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/mp_rank_00_model_states.pt. +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 01:59:01,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +7: [2023-03-17 01:59:01,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +7: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 01:59:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +2: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +1: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +4: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +5: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +6: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +3: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 01:59:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step40000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 01:59:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step40000 is ready now! +0: successfully saved checkpoint at iteration 40000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.58 +7: iteration 40010/ 173500 | consumed samples: 10242560 | consumed tokens: 20976762880 | elapsed time per iteration (s): 0.18 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.775193E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.713 | TFLOPs: 22.12 | +7: iteration 40020/ 173500 | consumed samples: 10245120 | consumed tokens: 20982005760 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.786591E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.455 | TFLOPs: 26.34 | +7: iteration 40030/ 173500 | consumed samples: 10247680 | consumed tokens: 20987248640 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.776788E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.963 | TFLOPs: 26.33 | +7: iteration 40040/ 173500 | consumed samples: 10250240 | consumed tokens: 20992491520 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.765631E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.296 | TFLOPs: 26.35 | +7: iteration 40050/ 173500 | consumed samples: 10252800 | consumed tokens: 20997734400 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.769157E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.550 | TFLOPs: 26.32 | +7: iteration 40060/ 173500 | consumed samples: 10255360 | consumed tokens: 21002977280 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.765543E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.184 | TFLOPs: 26.33 | +7: iteration 40070/ 173500 | consumed samples: 10257920 | consumed tokens: 21008220160 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.781588E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.330 | TFLOPs: 26.35 | +7: iteration 40080/ 173500 | consumed samples: 10260480 | consumed tokens: 21013463040 | elapsed time per iteration (s): 0.15 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.773378E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.386 | TFLOPs: 26.35 | +7: iteration 40090/ 173500 | consumed samples: 10263040 | consumed tokens: 21018705920 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.779858E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.714 | TFLOPs: 26.34 | +7: iteration 40100/ 173500 | consumed samples: 10265600 | consumed tokens: 21023948800 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.762452E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.548 | TFLOPs: 26.34 | +7: iteration 40110/ 173500 | consumed samples: 10268160 | consumed tokens: 21029191680 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.780834E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.898 | TFLOPs: 26.33 | +7: iteration 40120/ 173500 | consumed samples: 10270720 | consumed tokens: 21034434560 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.772087E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.146 | TFLOPs: 26.35 | +7: iteration 40130/ 173500 | consumed samples: 10273280 | consumed tokens: 21039677440 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.779353E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.739 | TFLOPs: 26.33 | +7: iteration 40140/ 173500 | consumed samples: 10275840 | consumed tokens: 21044920320 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.784042E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.299 | TFLOPs: 26.23 | +7: iteration 40150/ 173500 | consumed samples: 10278400 | consumed tokens: 21050163200 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.777960E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.116 | TFLOPs: 26.22 | +7: iteration 40160/ 173500 | consumed samples: 10280960 | consumed tokens: 21055406080 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.772467E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.907 | TFLOPs: 26.24 | +7: iteration 40170/ 173500 | consumed samples: 10283520 | consumed tokens: 21060648960 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.783677E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.503 | TFLOPs: 26.32 | +7: iteration 40180/ 173500 | consumed samples: 10286080 | consumed tokens: 21065891840 | elapsed time per iteration (s): 0.15 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.783630E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.553 | TFLOPs: 26.31 | +7: iteration 40190/ 173500 | consumed samples: 10288640 | consumed tokens: 21071134720 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.787937E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.660 | TFLOPs: 26.31 | +7: iteration 40200/ 173500 | consumed samples: 10291200 | consumed tokens: 21076377600 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.778790E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.311 | TFLOPs: 26.30 | +7: iteration 40210/ 173500 | consumed samples: 10293760 | consumed tokens: 21081620480 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.786349E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.453 | TFLOPs: 26.32 | +7: iteration 40220/ 173500 | consumed samples: 10296320 | consumed tokens: 21086863360 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.771202E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.371 | TFLOPs: 25.99 | +7: iteration 40230/ 173500 | consumed samples: 10298880 | consumed tokens: 21092106240 | elapsed time per iteration (s): 0.16 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.775695E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.385 | TFLOPs: 25.84 | +7: iteration 40240/ 173500 | consumed samples: 10301440 | consumed tokens: 21097349120 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.776543E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.267 | TFLOPs: 26.15 | +7: iteration 40250/ 173500 | consumed samples: 10304000 | consumed tokens: 21102592000 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.788246E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.095 | TFLOPs: 26.13 | +7: iteration 40260/ 173500 | consumed samples: 10306560 | consumed tokens: 21107834880 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.774295E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.424 | TFLOPs: 26.15 | +7: iteration 40270/ 173500 | consumed samples: 10309120 | consumed tokens: 21113077760 | elapsed time per iteration (s): 0.15 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.773053E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.056 | TFLOPs: 26.17 | +7: iteration 40280/ 173500 | consumed samples: 10311680 | consumed tokens: 21118320640 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.788742E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.298 | TFLOPs: 26.15 | +7: iteration 40290/ 173500 | consumed samples: 10314240 | consumed tokens: 21123563520 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.784668E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.937 | TFLOPs: 26.09 | +7: iteration 40300/ 173500 | consumed samples: 10316800 | consumed tokens: 21128806400 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.779951E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.205 | TFLOPs: 26.10 | +7: iteration 40310/ 173500 | consumed samples: 10319360 | consumed tokens: 21134049280 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.767500E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.750 | TFLOPs: 26.11 | +7: iteration 40320/ 173500 | consumed samples: 10321920 | consumed tokens: 21139292160 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.782188E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.835 | TFLOPs: 26.09 | +7: iteration 40330/ 173500 | consumed samples: 10324480 | consumed tokens: 21144535040 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.772884E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.555 | TFLOPs: 26.09 | +7: iteration 40340/ 173500 | consumed samples: 10327040 | consumed tokens: 21149777920 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.773766E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.785 | TFLOPs: 26.09 | +7: iteration 40350/ 173500 | consumed samples: 10329600 | consumed tokens: 21155020800 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.777959E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.773 | TFLOPs: 26.14 | +7: iteration 40360/ 173500 | consumed samples: 10332160 | consumed tokens: 21160263680 | elapsed time per iteration (s): 0.15 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.784412E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.940 | TFLOPs: 26.13 | +7: iteration 40370/ 173500 | consumed samples: 10334720 | consumed tokens: 21165506560 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.771630E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.239 | TFLOPs: 26.13 | +7: iteration 40380/ 173500 | consumed samples: 10337280 | consumed tokens: 21170749440 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.770977E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.734 | TFLOPs: 26.17 | +7: iteration 40390/ 173500 | consumed samples: 10339840 | consumed tokens: 21175992320 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.766506E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.156 | TFLOPs: 26.18 | +7: iteration 40400/ 173500 | consumed samples: 10342400 | consumed tokens: 21181235200 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.783416E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.692 | TFLOPs: 26.17 | +7: iteration 40410/ 173500 | consumed samples: 10344960 | consumed tokens: 21186478080 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.771765E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.917 | TFLOPs: 26.14 | +7: iteration 40420/ 173500 | consumed samples: 10347520 | consumed tokens: 21191720960 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.796090E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.737 | TFLOPs: 26.15 | +7: iteration 40430/ 173500 | consumed samples: 10350080 | consumed tokens: 21196963840 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.778765E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.868 | TFLOPs: 26.19 | +7: iteration 40440/ 173500 | consumed samples: 10352640 | consumed tokens: 21202206720 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.770518E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.380 | TFLOPs: 26.18 | +7: iteration 40450/ 173500 | consumed samples: 10355200 | consumed tokens: 21207449600 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.771428E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.758 | TFLOPs: 26.09 | +7: iteration 40460/ 173500 | consumed samples: 10357760 | consumed tokens: 21212692480 | elapsed time per iteration (s): 0.15 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.785143E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.174 | TFLOPs: 26.10 | +7: iteration 40470/ 173500 | consumed samples: 10360320 | consumed tokens: 21217935360 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.774738E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.859 | TFLOPs: 26.08 | +7: iteration 40480/ 173500 | consumed samples: 10362880 | consumed tokens: 21223178240 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.778675E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.290 | TFLOPs: 26.10 | +7: iteration 40490/ 173500 | consumed samples: 10365440 | consumed tokens: 21228421120 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.774294E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.176 | TFLOPs: 26.04 | +7: iteration 40500/ 173500 | consumed samples: 10368000 | consumed tokens: 21233664000 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.767548E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.400 | TFLOPs: 26.13 | +7: iteration 40510/ 173500 | consumed samples: 10370560 | consumed tokens: 21238906880 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.768246E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.005 | TFLOPs: 26.13 | +7: iteration 40520/ 173500 | consumed samples: 10373120 | consumed tokens: 21244149760 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.780499E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.674 | TFLOPs: 26.11 | +7: iteration 40530/ 173500 | consumed samples: 10375680 | consumed tokens: 21249392640 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.773888E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.811 | TFLOPs: 26.11 | +7: iteration 40540/ 173500 | consumed samples: 10378240 | consumed tokens: 21254635520 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.774335E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.744 | TFLOPs: 26.11 | +7: iteration 40550/ 173500 | consumed samples: 10380800 | consumed tokens: 21259878400 | elapsed time per iteration (s): 0.15 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.763618E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.603 | TFLOPs: 26.12 | +7: iteration 40560/ 173500 | consumed samples: 10383360 | consumed tokens: 21265121280 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.781212E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.879 | TFLOPs: 26.11 | +7: iteration 40570/ 173500 | consumed samples: 10385920 | consumed tokens: 21270364160 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.766346E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.360 | TFLOPs: 26.12 | +7: iteration 40580/ 173500 | consumed samples: 10388480 | consumed tokens: 21275607040 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.763350E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.250 | TFLOPs: 26.10 | +7: iteration 40590/ 173500 | consumed samples: 10391040 | consumed tokens: 21280849920 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.790904E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.082 | TFLOPs: 26.10 | +7: iteration 40600/ 173500 | consumed samples: 10393600 | consumed tokens: 21286092800 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.774229E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.306 | TFLOPs: 26.13 | +7: iteration 40610/ 173500 | consumed samples: 10396160 | consumed tokens: 21291335680 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.771632E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.791 | TFLOPs: 26.11 | +7: iteration 40620/ 173500 | consumed samples: 10398720 | consumed tokens: 21296578560 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.764722E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.180 | TFLOPs: 26.11 | +7: iteration 40630/ 173500 | consumed samples: 10401280 | consumed tokens: 21301821440 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.770137E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.325 | TFLOPs: 26.29 | +7: iteration 40640/ 173500 | consumed samples: 10403840 | consumed tokens: 21307064320 | elapsed time per iteration (s): 0.15 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.771354E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.859 | TFLOPs: 26.30 | +7: iteration 40650/ 173500 | consumed samples: 10406400 | consumed tokens: 21312307200 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.769393E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.854 | TFLOPs: 26.31 | +7: iteration 40660/ 173500 | consumed samples: 10408960 | consumed tokens: 21317550080 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.768304E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.265 | TFLOPs: 26.32 | +7: iteration 40670/ 173500 | consumed samples: 10411520 | consumed tokens: 21322792960 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.782454E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.779 | TFLOPs: 26.34 | +7: iteration 40680/ 173500 | consumed samples: 10414080 | consumed tokens: 21328035840 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.762153E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.876 | TFLOPs: 26.31 | +7: iteration 40690/ 173500 | consumed samples: 10416640 | consumed tokens: 21333278720 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.767001E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.854 | TFLOPs: 26.34 | +7: iteration 40700/ 173500 | consumed samples: 10419200 | consumed tokens: 21338521600 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.781960E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.300 | TFLOPs: 26.34 | +7: iteration 40710/ 173500 | consumed samples: 10421760 | consumed tokens: 21343764480 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.766260E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.973 | TFLOPs: 26.33 | +7: iteration 40720/ 173500 | consumed samples: 10424320 | consumed tokens: 21349007360 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.762920E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.394 | TFLOPs: 26.32 | +7: iteration 40730/ 173500 | consumed samples: 10426880 | consumed tokens: 21354250240 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.768258E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.711 | TFLOPs: 26.31 | +7: iteration 40740/ 173500 | consumed samples: 10429440 | consumed tokens: 21359493120 | elapsed time per iteration (s): 0.15 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.782188E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.340 | TFLOPs: 26.34 | +7: iteration 40750/ 173500 | consumed samples: 10432000 | consumed tokens: 21364736000 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.778616E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.790 | TFLOPs: 26.33 | +7: iteration 40760/ 173500 | consumed samples: 10434560 | consumed tokens: 21369978880 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.773873E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.647 | TFLOPs: 26.33 | +7: iteration 40770/ 173500 | consumed samples: 10437120 | consumed tokens: 21375221760 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.775333E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.460 | TFLOPs: 26.32 | +7: iteration 40780/ 173500 | consumed samples: 10439680 | consumed tokens: 21380464640 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.770149E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.314 | TFLOPs: 26.34 | +7: iteration 40790/ 173500 | consumed samples: 10442240 | consumed tokens: 21385707520 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.779181E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.767 | TFLOPs: 26.33 | +7: iteration 40800/ 173500 | consumed samples: 10444800 | consumed tokens: 21390950400 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.779266E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.184 | TFLOPs: 26.35 | +7: iteration 40810/ 173500 | consumed samples: 10447360 | consumed tokens: 21396193280 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.772298E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.998 | TFLOPs: 26.33 | +7: iteration 40820/ 173500 | consumed samples: 10449920 | consumed tokens: 21401436160 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.783146E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.300 | TFLOPs: 26.32 | +7: iteration 40830/ 173500 | consumed samples: 10452480 | consumed tokens: 21406679040 | elapsed time per iteration (s): 0.15 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.770705E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.106 | TFLOPs: 26.33 | +7: iteration 40840/ 173500 | consumed samples: 10455040 | consumed tokens: 21411921920 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.785064E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.713 | TFLOPs: 26.34 | +7: iteration 40850/ 173500 | consumed samples: 10457600 | consumed tokens: 21417164800 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.771693E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.150 | TFLOPs: 26.33 | +7: iteration 40860/ 173500 | consumed samples: 10460160 | consumed tokens: 21422407680 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.775652E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.066 | TFLOPs: 26.35 | +7: iteration 40870/ 173500 | consumed samples: 10462720 | consumed tokens: 21427650560 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.772799E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.350 | TFLOPs: 25.99 | +7: iteration 40880/ 173500 | consumed samples: 10465280 | consumed tokens: 21432893440 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.778810E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.560 | TFLOPs: 26.26 | +7: iteration 40890/ 173500 | consumed samples: 10467840 | consumed tokens: 21438136320 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.791240E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.681 | TFLOPs: 26.25 | +7: iteration 40900/ 173500 | consumed samples: 10470400 | consumed tokens: 21443379200 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.770195E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.559 | TFLOPs: 26.32 | +7: iteration 40910/ 173500 | consumed samples: 10472960 | consumed tokens: 21448622080 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.759044E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.994 | TFLOPs: 26.30 | +7: iteration 40920/ 173500 | consumed samples: 10475520 | consumed tokens: 21453864960 | elapsed time per iteration (s): 0.15 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.766236E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.447 | TFLOPs: 26.34 | +7: iteration 40930/ 173500 | consumed samples: 10478080 | consumed tokens: 21459107840 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.769424E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.366 | TFLOPs: 26.32 | +7: iteration 40940/ 173500 | consumed samples: 10480640 | consumed tokens: 21464350720 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.774397E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.025 | TFLOPs: 26.32 | +7: iteration 40950/ 173500 | consumed samples: 10483200 | consumed tokens: 21469593600 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.779789E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.699 | TFLOPs: 25.98 | +7: iteration 40960/ 173500 | consumed samples: 10485760 | consumed tokens: 21474836480 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.785966E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.412 | TFLOPs: 26.13 | +7: iteration 40970/ 173500 | consumed samples: 10488320 | consumed tokens: 21480079360 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.773224E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.074 | TFLOPs: 26.33 | +7: iteration 40980/ 173500 | consumed samples: 10490880 | consumed tokens: 21485322240 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.769738E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.509 | TFLOPs: 26.32 | +7: iteration 40990/ 173500 | consumed samples: 10493440 | consumed tokens: 21490565120 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.769196E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.518 | TFLOPs: 26.32 | +7: iteration 41000/ 173500 | consumed samples: 10496000 | consumed tokens: 21495808000 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.763600E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.496 | TFLOPs: 26.32 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 41000 | lm loss value: 3.871658E+00 | lm loss PPL: 4.802192E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 41000 to checkpoints_44m91b100m +0: [2023-03-17 02:01:34,749] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step41000 is begin to save! +0: [2023-03-17 02:01:34,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:01:34,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:01:34,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:01:34,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:01:34,824] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:01:34,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:01:34,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:01:34,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:01:34,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:01:34,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:01:34,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:01:34,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:01:34,856] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:01:34,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:01:34,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:01:34,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:01:34,872] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:01:34,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:01:34,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:01:34,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:01:34,881] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step41000/mp_rank_00_model_states.pt +0: [2023-03-17 02:01:34,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:01:34,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:01:34,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:01:34,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +5: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +2: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +4: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +1: [2023-03-17 02:01:34,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:01:34,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +6: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:01:34,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +3: [2023-03-17 02:01:34,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +7: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:01:34,923] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step41000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:01:34,923] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step41000 is ready now! +0: successfully saved checkpoint at iteration 41000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.65 +7: iteration 41010/ 173500 | consumed samples: 10498560 | consumed tokens: 21501050880 | elapsed time per iteration (s): 0.18 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.772022E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.592 | TFLOPs: 22.86 | +7: iteration 41020/ 173500 | consumed samples: 10501120 | consumed tokens: 21506293760 | elapsed time per iteration (s): 0.15 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.759180E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.853 | TFLOPs: 26.23 | +7: iteration 41030/ 173500 | consumed samples: 10503680 | consumed tokens: 21511536640 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.787804E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.978 | TFLOPs: 26.24 | +7: iteration 41040/ 173500 | consumed samples: 10506240 | consumed tokens: 21516779520 | elapsed time per iteration (s): 0.16 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.771112E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.766 | TFLOPs: 25.56 | +7: iteration 41050/ 173500 | consumed samples: 10508800 | consumed tokens: 21522022400 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.780778E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.332 | TFLOPs: 26.26 | +7: iteration 41060/ 173500 | consumed samples: 10511360 | consumed tokens: 21527265280 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.770638E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.791 | TFLOPs: 26.25 | +7: iteration 41070/ 173500 | consumed samples: 10513920 | consumed tokens: 21532508160 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.775286E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.176 | TFLOPs: 26.24 | +7: iteration 41080/ 173500 | consumed samples: 10516480 | consumed tokens: 21537751040 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.784233E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.457 | TFLOPs: 26.24 | +7: iteration 41090/ 173500 | consumed samples: 10519040 | consumed tokens: 21542993920 | elapsed time per iteration (s): 0.17 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.773295E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.146 | TFLOPs: 24.25 | +7: iteration 41100/ 173500 | consumed samples: 10521600 | consumed tokens: 21548236800 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.748346E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.914 | TFLOPs: 26.22 | +7: iteration 41110/ 173500 | consumed samples: 10524160 | consumed tokens: 21553479680 | elapsed time per iteration (s): 0.15 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.770076E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.489 | TFLOPs: 26.23 | +7: iteration 41120/ 173500 | consumed samples: 10526720 | consumed tokens: 21558722560 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.773336E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.741 | TFLOPs: 26.22 | +7: iteration 41130/ 173500 | consumed samples: 10529280 | consumed tokens: 21563965440 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.764328E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.787 | TFLOPs: 26.26 | +7: iteration 41140/ 173500 | consumed samples: 10531840 | consumed tokens: 21569208320 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.774007E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.640 | TFLOPs: 26.23 | +7: iteration 41150/ 173500 | consumed samples: 10534400 | consumed tokens: 21574451200 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.765538E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.504 | TFLOPs: 26.28 | +7: iteration 41160/ 173500 | consumed samples: 10536960 | consumed tokens: 21579694080 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.794811E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.236 | TFLOPs: 26.29 | +7: iteration 41170/ 173500 | consumed samples: 10539520 | consumed tokens: 21584936960 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.772550E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.605 | TFLOPs: 26.28 | +7: iteration 41180/ 173500 | consumed samples: 10542080 | consumed tokens: 21590179840 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.780653E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.416 | TFLOPs: 26.31 | +7: iteration 41190/ 173500 | consumed samples: 10544640 | consumed tokens: 21595422720 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.793110E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.596 | TFLOPs: 26.31 | +7: iteration 41200/ 173500 | consumed samples: 10547200 | consumed tokens: 21600665600 | elapsed time per iteration (s): 0.15 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.769979E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.266 | TFLOPs: 26.32 | +7: iteration 41210/ 173500 | consumed samples: 10549760 | consumed tokens: 21605908480 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.767903E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.462 | TFLOPs: 26.20 | +7: iteration 41220/ 173500 | consumed samples: 10552320 | consumed tokens: 21611151360 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.780459E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.506 | TFLOPs: 26.17 | +7: iteration 41230/ 173500 | consumed samples: 10554880 | consumed tokens: 21616394240 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.780136E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.076 | TFLOPs: 26.11 | +7: iteration 41240/ 173500 | consumed samples: 10557440 | consumed tokens: 21621637120 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.770413E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.899 | TFLOPs: 26.13 | +7: iteration 41250/ 173500 | consumed samples: 10560000 | consumed tokens: 21626880000 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.772180E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.002 | TFLOPs: 26.13 | +7: iteration 41260/ 173500 | consumed samples: 10562560 | consumed tokens: 21632122880 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.777312E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.248 | TFLOPs: 26.13 | +7: iteration 41270/ 173500 | consumed samples: 10565120 | consumed tokens: 21637365760 | elapsed time per iteration (s): 0.16 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.783082E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.280 | TFLOPs: 25.77 | +7: iteration 41280/ 173500 | consumed samples: 10567680 | consumed tokens: 21642608640 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.789523E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.221 | TFLOPs: 26.13 | +7: iteration 41290/ 173500 | consumed samples: 10570240 | consumed tokens: 21647851520 | elapsed time per iteration (s): 0.15 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.770621E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.259 | TFLOPs: 26.13 | +7: iteration 41300/ 173500 | consumed samples: 10572800 | consumed tokens: 21653094400 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.768009E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.625 | TFLOPs: 26.15 | +7: iteration 41310/ 173500 | consumed samples: 10575360 | consumed tokens: 21658337280 | elapsed time per iteration (s): 0.16 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.771813E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.951 | TFLOPs: 25.75 | +7: iteration 41320/ 173500 | consumed samples: 10577920 | consumed tokens: 21663580160 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.771580E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.409 | TFLOPs: 26.16 | +7: iteration 41330/ 173500 | consumed samples: 10580480 | consumed tokens: 21668823040 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.759201E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.553 | TFLOPs: 26.15 | +7: iteration 41340/ 173500 | consumed samples: 10583040 | consumed tokens: 21674065920 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.768813E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.565 | TFLOPs: 26.15 | +7: iteration 41350/ 173500 | consumed samples: 10585600 | consumed tokens: 21679308800 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.784972E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.039 | TFLOPs: 26.16 | +7: iteration 41360/ 173500 | consumed samples: 10588160 | consumed tokens: 21684551680 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.762762E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.818 | TFLOPs: 26.16 | +7: iteration 41370/ 173500 | consumed samples: 10590720 | consumed tokens: 21689794560 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.776031E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.138 | TFLOPs: 26.18 | +7: iteration 41380/ 173500 | consumed samples: 10593280 | consumed tokens: 21695037440 | elapsed time per iteration (s): 0.15 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.768103E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.723 | TFLOPs: 26.15 | +7: iteration 41390/ 173500 | consumed samples: 10595840 | consumed tokens: 21700280320 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.776068E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.254 | TFLOPs: 26.15 | +7: iteration 41400/ 173500 | consumed samples: 10598400 | consumed tokens: 21705523200 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.774518E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.042 | TFLOPs: 26.16 | +7: iteration 41410/ 173500 | consumed samples: 10600960 | consumed tokens: 21710766080 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.766201E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.276 | TFLOPs: 26.15 | +7: iteration 41420/ 173500 | consumed samples: 10603520 | consumed tokens: 21716008960 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.784039E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.304 | TFLOPs: 26.15 | +7: iteration 41430/ 173500 | consumed samples: 10606080 | consumed tokens: 21721251840 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.767410E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.883 | TFLOPs: 26.14 | +7: iteration 41440/ 173500 | consumed samples: 10608640 | consumed tokens: 21726494720 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.769056E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.230 | TFLOPs: 26.13 | +7: iteration 41450/ 173500 | consumed samples: 10611200 | consumed tokens: 21731737600 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.767899E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.853 | TFLOPs: 26.11 | +7: iteration 41460/ 173500 | consumed samples: 10613760 | consumed tokens: 21736980480 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.762892E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.126 | TFLOPs: 26.13 | +7: iteration 41470/ 173500 | consumed samples: 10616320 | consumed tokens: 21742223360 | elapsed time per iteration (s): 0.15 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.772918E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.203 | TFLOPs: 26.13 | +7: iteration 41480/ 173500 | consumed samples: 10618880 | consumed tokens: 21747466240 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.770530E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.090 | TFLOPs: 26.10 | +7: iteration 41490/ 173500 | consumed samples: 10621440 | consumed tokens: 21752709120 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.767775E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.127 | TFLOPs: 26.10 | +7: iteration 41500/ 173500 | consumed samples: 10624000 | consumed tokens: 21757952000 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.773731E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.729 | TFLOPs: 26.12 | +7: iteration 41510/ 173500 | consumed samples: 10626560 | consumed tokens: 21763194880 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.773280E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.974 | TFLOPs: 26.08 | +7: iteration 41520/ 173500 | consumed samples: 10629120 | consumed tokens: 21768437760 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.782281E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.780 | TFLOPs: 26.12 | +7: iteration 41530/ 173500 | consumed samples: 10631680 | consumed tokens: 21773680640 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.770161E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.193 | TFLOPs: 26.13 | +7: iteration 41540/ 173500 | consumed samples: 10634240 | consumed tokens: 21778923520 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.775555E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.749 | TFLOPs: 25.98 | +7: iteration 41550/ 173500 | consumed samples: 10636800 | consumed tokens: 21784166400 | elapsed time per iteration (s): 0.16 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.769836E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.238 | TFLOPs: 25.35 | +7: iteration 41560/ 173500 | consumed samples: 10639360 | consumed tokens: 21789409280 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.778794E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.849 | TFLOPs: 26.12 | +7: iteration 41570/ 173500 | consumed samples: 10641920 | consumed tokens: 21794652160 | elapsed time per iteration (s): 0.15 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.778957E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.068 | TFLOPs: 25.92 | +7: iteration 41580/ 173500 | consumed samples: 10644480 | consumed tokens: 21799895040 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.765779E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.865 | TFLOPs: 26.08 | +7: iteration 41590/ 173500 | consumed samples: 10647040 | consumed tokens: 21805137920 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.759606E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.793 | TFLOPs: 26.11 | +7: iteration 41600/ 173500 | consumed samples: 10649600 | consumed tokens: 21810380800 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.762947E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.397 | TFLOPs: 26.10 | +7: iteration 41610/ 173500 | consumed samples: 10652160 | consumed tokens: 21815623680 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.775020E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.403 | TFLOPs: 26.09 | +7: iteration 41620/ 173500 | consumed samples: 10654720 | consumed tokens: 21820866560 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.776228E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.218 | TFLOPs: 26.10 | +7: iteration 41630/ 173500 | consumed samples: 10657280 | consumed tokens: 21826109440 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.765901E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.140 | TFLOPs: 26.10 | +7: iteration 41640/ 173500 | consumed samples: 10659840 | consumed tokens: 21831352320 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.767638E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.085 | TFLOPs: 26.11 | +7: iteration 41650/ 173500 | consumed samples: 10662400 | consumed tokens: 21836595200 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.771827E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.447 | TFLOPs: 26.12 | +7: iteration 41660/ 173500 | consumed samples: 10664960 | consumed tokens: 21841838080 | elapsed time per iteration (s): 0.15 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.767622E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.891 | TFLOPs: 26.13 | +7: iteration 41670/ 173500 | consumed samples: 10667520 | consumed tokens: 21847080960 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.767182E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.593 | TFLOPs: 26.10 | +7: iteration 41680/ 173500 | consumed samples: 10670080 | consumed tokens: 21852323840 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.774242E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.913 | TFLOPs: 26.11 | +7: iteration 41690/ 173500 | consumed samples: 10672640 | consumed tokens: 21857566720 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.760994E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.861 | TFLOPs: 26.05 | +7: iteration 41700/ 173500 | consumed samples: 10675200 | consumed tokens: 21862809600 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.764770E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.835 | TFLOPs: 26.03 | +7: iteration 41710/ 173500 | consumed samples: 10677760 | consumed tokens: 21868052480 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.775432E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.808 | TFLOPs: 26.06 | +7: iteration 41720/ 173500 | consumed samples: 10680320 | consumed tokens: 21873295360 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.768225E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.576 | TFLOPs: 26.04 | +7: iteration 41730/ 173500 | consumed samples: 10682880 | consumed tokens: 21878538240 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.760268E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.082 | TFLOPs: 26.05 | +7: iteration 41740/ 173500 | consumed samples: 10685440 | consumed tokens: 21883781120 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.772974E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.421 | TFLOPs: 26.07 | +7: iteration 41750/ 173500 | consumed samples: 10688000 | consumed tokens: 21889024000 | elapsed time per iteration (s): 0.15 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.764317E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.757 | TFLOPs: 26.06 | +7: iteration 41760/ 173500 | consumed samples: 10690560 | consumed tokens: 21894266880 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.765866E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.219 | TFLOPs: 26.05 | +7: iteration 41770/ 173500 | consumed samples: 10693120 | consumed tokens: 21899509760 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.770286E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.092 | TFLOPs: 26.05 | +7: iteration 41780/ 173500 | consumed samples: 10695680 | consumed tokens: 21904752640 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.780862E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.172 | TFLOPs: 26.07 | +7: iteration 41790/ 173500 | consumed samples: 10698240 | consumed tokens: 21909995520 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.777905E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.941 | TFLOPs: 26.06 | +7: iteration 41800/ 173500 | consumed samples: 10700800 | consumed tokens: 21915238400 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.766198E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.018 | TFLOPs: 26.00 | +7: iteration 41810/ 173500 | consumed samples: 10703360 | consumed tokens: 21920481280 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.776350E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.740 | TFLOPs: 26.06 | +7: iteration 41820/ 173500 | consumed samples: 10705920 | consumed tokens: 21925724160 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.773159E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.489 | TFLOPs: 26.04 | +7: iteration 41830/ 173500 | consumed samples: 10708480 | consumed tokens: 21930967040 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.769700E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.756 | TFLOPs: 26.06 | +7: iteration 41840/ 173500 | consumed samples: 10711040 | consumed tokens: 21936209920 | elapsed time per iteration (s): 0.15 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.775051E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.173 | TFLOPs: 26.11 | +7: iteration 41850/ 173500 | consumed samples: 10713600 | consumed tokens: 21941452800 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.759593E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.846 | TFLOPs: 26.12 | +7: iteration 41860/ 173500 | consumed samples: 10716160 | consumed tokens: 21946695680 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.769956E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.939 | TFLOPs: 26.11 | +7: iteration 41870/ 173500 | consumed samples: 10718720 | consumed tokens: 21951938560 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.778379E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.743 | TFLOPs: 26.11 | +7: iteration 41880/ 173500 | consumed samples: 10721280 | consumed tokens: 21957181440 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.770158E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.158 | TFLOPs: 26.11 | +7: iteration 41890/ 173500 | consumed samples: 10723840 | consumed tokens: 21962424320 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.771014E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.215 | TFLOPs: 26.13 | +7: iteration 41900/ 173500 | consumed samples: 10726400 | consumed tokens: 21967667200 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.762284E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.752 | TFLOPs: 26.12 | +7: iteration 41910/ 173500 | consumed samples: 10728960 | consumed tokens: 21972910080 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.771423E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.946 | TFLOPs: 26.13 | +7: iteration 41920/ 173500 | consumed samples: 10731520 | consumed tokens: 21978152960 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.765877E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.811 | TFLOPs: 26.09 | +7: iteration 41930/ 173500 | consumed samples: 10734080 | consumed tokens: 21983395840 | elapsed time per iteration (s): 0.15 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.775015E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.023 | TFLOPs: 26.06 | +7: iteration 41940/ 173500 | consumed samples: 10736640 | consumed tokens: 21988638720 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.774670E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.533 | TFLOPs: 26.07 | +7: iteration 41950/ 173500 | consumed samples: 10739200 | consumed tokens: 21993881600 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.763274E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.621 | TFLOPs: 26.07 | +7: iteration 41960/ 173500 | consumed samples: 10741760 | consumed tokens: 21999124480 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.769499E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.766 | TFLOPs: 26.08 | +7: iteration 41970/ 173500 | consumed samples: 10744320 | consumed tokens: 22004367360 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.777813E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.514 | TFLOPs: 26.07 | +7: iteration 41980/ 173500 | consumed samples: 10746880 | consumed tokens: 22009610240 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.753596E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.474 | TFLOPs: 26.09 | +7: iteration 41990/ 173500 | consumed samples: 10749440 | consumed tokens: 22014853120 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.775216E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.153 | TFLOPs: 26.13 | +0: [2023-03-17 02:04:08,798] [INFO] [logging.py:68:log_dist] [Rank 0] step=42000, skipped=0, lr=[0.00017667737143212697, 0.00017667737143212697, 0.00017667737143212697], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 42000/ 173500 | consumed samples: 10752000 | consumed tokens: 22020096000 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.761322E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.124 | TFLOPs: 26.13 | +0: steps: 42000 loss: 3.7633 iter time (s): 0.153 samples/sec: 1677.600 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 42000 | lm loss value: 3.942492E+00 | lm loss PPL: 5.154692E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 42000 to checkpoints_44m91b100m +0: [2023-03-17 02:04:08,871] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step42000 is begin to save! +0: [2023-03-17 02:04:08,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:04:08,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:04:08,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:04:08,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:04:08,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:04:08,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:04:08,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:04:08,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:04:08,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:04:08,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:04:08,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:04:08,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:04:08,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:04:08,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:04:08,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:04:08,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:04:08,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:04:09,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:04:09,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:04:09,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:04:09,002] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step42000/mp_rank_00_model_states.pt +0: [2023-03-17 02:04:09,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:04:09,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:04:09,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +2: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +6: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +3: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +1: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +4: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +5: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:04:09,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step42000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:04:09,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step42000 is ready now! +0: successfully saved checkpoint at iteration 42000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.39 +7: iteration 42010/ 173500 | consumed samples: 10754560 | consumed tokens: 22025338880 | elapsed time per iteration (s): 0.18 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.778886E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.930 | TFLOPs: 22.55 | +7: iteration 42020/ 173500 | consumed samples: 10757120 | consumed tokens: 22030581760 | elapsed time per iteration (s): 0.15 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.785018E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.503 | TFLOPs: 26.35 | +7: iteration 42030/ 173500 | consumed samples: 10759680 | consumed tokens: 22035824640 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.773461E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.594 | TFLOPs: 26.31 | +7: iteration 42040/ 173500 | consumed samples: 10762240 | consumed tokens: 22041067520 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.774233E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.046 | TFLOPs: 26.24 | +7: iteration 42050/ 173500 | consumed samples: 10764800 | consumed tokens: 22046310400 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.770884E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.497 | TFLOPs: 26.26 | +7: iteration 42060/ 173500 | consumed samples: 10767360 | consumed tokens: 22051553280 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.771770E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.003 | TFLOPs: 26.25 | +7: iteration 42070/ 173500 | consumed samples: 10769920 | consumed tokens: 22056796160 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.765043E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.120 | TFLOPs: 26.24 | +7: iteration 42080/ 173500 | consumed samples: 10772480 | consumed tokens: 22062039040 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.771655E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.813 | TFLOPs: 26.23 | +7: iteration 42090/ 173500 | consumed samples: 10775040 | consumed tokens: 22067281920 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.772166E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.832 | TFLOPs: 26.25 | +7: iteration 42100/ 173500 | consumed samples: 10777600 | consumed tokens: 22072524800 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.773833E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.824 | TFLOPs: 26.23 | +7: iteration 42110/ 173500 | consumed samples: 10780160 | consumed tokens: 22077767680 | elapsed time per iteration (s): 0.15 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.762841E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.339 | TFLOPs: 26.24 | +7: iteration 42120/ 173500 | consumed samples: 10782720 | consumed tokens: 22083010560 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.763326E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.402 | TFLOPs: 26.23 | +7: iteration 42130/ 173500 | consumed samples: 10785280 | consumed tokens: 22088253440 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.760345E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.252 | TFLOPs: 26.12 | +7: iteration 42140/ 173500 | consumed samples: 10787840 | consumed tokens: 22093496320 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.785303E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.808 | TFLOPs: 26.12 | +7: iteration 42150/ 173500 | consumed samples: 10790400 | consumed tokens: 22098739200 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.763060E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.008 | TFLOPs: 26.00 | +7: iteration 42160/ 173500 | consumed samples: 10792960 | consumed tokens: 22103982080 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.771762E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.909 | TFLOPs: 25.98 | +7: iteration 42170/ 173500 | consumed samples: 10795520 | consumed tokens: 22109224960 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.772163E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.424 | TFLOPs: 26.02 | +7: iteration 42180/ 173500 | consumed samples: 10798080 | consumed tokens: 22114467840 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.765473E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.760 | TFLOPs: 26.06 | +7: iteration 42190/ 173500 | consumed samples: 10800640 | consumed tokens: 22119710720 | elapsed time per iteration (s): 0.16 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.763479E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.094 | TFLOPs: 25.50 | +7: iteration 42200/ 173500 | consumed samples: 10803200 | consumed tokens: 22124953600 | elapsed time per iteration (s): 0.15 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.767023E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.955 | TFLOPs: 26.00 | +7: iteration 42210/ 173500 | consumed samples: 10805760 | consumed tokens: 22130196480 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.766662E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.878 | TFLOPs: 26.02 | +7: iteration 42220/ 173500 | consumed samples: 10808320 | consumed tokens: 22135439360 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.777234E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.546 | TFLOPs: 26.01 | +7: iteration 42230/ 173500 | consumed samples: 10810880 | consumed tokens: 22140682240 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.768335E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.938 | TFLOPs: 26.03 | +7: iteration 42240/ 173500 | consumed samples: 10813440 | consumed tokens: 22145925120 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.774829E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.471 | TFLOPs: 26.02 | +7: iteration 42250/ 173500 | consumed samples: 10816000 | consumed tokens: 22151168000 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.770574E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.977 | TFLOPs: 26.02 | +7: iteration 42260/ 173500 | consumed samples: 10818560 | consumed tokens: 22156410880 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.782129E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.711 | TFLOPs: 26.04 | +7: iteration 42270/ 173500 | consumed samples: 10821120 | consumed tokens: 22161653760 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.775522E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.095 | TFLOPs: 26.13 | +7: iteration 42280/ 173500 | consumed samples: 10823680 | consumed tokens: 22166896640 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.780036E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.221 | TFLOPs: 26.13 | +7: iteration 42290/ 173500 | consumed samples: 10826240 | consumed tokens: 22172139520 | elapsed time per iteration (s): 0.15 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.770446E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.542 | TFLOPs: 26.12 | +7: iteration 42300/ 173500 | consumed samples: 10828800 | consumed tokens: 22177382400 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.756243E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.313 | TFLOPs: 26.13 | +7: iteration 42310/ 173500 | consumed samples: 10831360 | consumed tokens: 22182625280 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.769590E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.971 | TFLOPs: 26.13 | +7: iteration 42320/ 173500 | consumed samples: 10833920 | consumed tokens: 22187868160 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.755880E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.099 | TFLOPs: 26.11 | +7: iteration 42330/ 173500 | consumed samples: 10836480 | consumed tokens: 22193111040 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.780654E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.059 | TFLOPs: 26.11 | +7: iteration 42340/ 173500 | consumed samples: 10839040 | consumed tokens: 22198353920 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.772046E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.539 | TFLOPs: 26.14 | +7: iteration 42350/ 173500 | consumed samples: 10841600 | consumed tokens: 22203596800 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.763912E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.853 | TFLOPs: 26.12 | +7: iteration 42360/ 173500 | consumed samples: 10844160 | consumed tokens: 22208839680 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.767803E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.489 | TFLOPs: 26.13 | +7: iteration 42370/ 173500 | consumed samples: 10846720 | consumed tokens: 22214082560 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.779966E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.158 | TFLOPs: 26.13 | +7: iteration 42380/ 173500 | consumed samples: 10849280 | consumed tokens: 22219325440 | elapsed time per iteration (s): 0.15 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.783298E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.472 | TFLOPs: 26.13 | +7: iteration 42390/ 173500 | consumed samples: 10851840 | consumed tokens: 22224568320 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.769799E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.206 | TFLOPs: 26.13 | +7: iteration 42400/ 173500 | consumed samples: 10854400 | consumed tokens: 22229811200 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.781988E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.080 | TFLOPs: 26.13 | +7: iteration 42410/ 173500 | consumed samples: 10856960 | consumed tokens: 22235054080 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.759029E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.912 | TFLOPs: 26.08 | +7: iteration 42420/ 173500 | consumed samples: 10859520 | consumed tokens: 22240296960 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.774751E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.183 | TFLOPs: 26.04 | +7: iteration 42430/ 173500 | consumed samples: 10862080 | consumed tokens: 22245539840 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.776879E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.433 | TFLOPs: 26.04 | +7: iteration 42440/ 173500 | consumed samples: 10864640 | consumed tokens: 22250782720 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.763479E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.092 | TFLOPs: 26.03 | +7: iteration 42450/ 173500 | consumed samples: 10867200 | consumed tokens: 22256025600 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.767605E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.161 | TFLOPs: 26.02 | +7: iteration 42460/ 173500 | consumed samples: 10869760 | consumed tokens: 22261268480 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.765341E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.110 | TFLOPs: 26.02 | +7: iteration 42470/ 173500 | consumed samples: 10872320 | consumed tokens: 22266511360 | elapsed time per iteration (s): 0.15 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.764230E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.529 | TFLOPs: 26.03 | +7: iteration 42480/ 173500 | consumed samples: 10874880 | consumed tokens: 22271754240 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.770311E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.589 | TFLOPs: 26.00 | +7: iteration 42490/ 173500 | consumed samples: 10877440 | consumed tokens: 22276997120 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.771531E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.118 | TFLOPs: 25.97 | +7: iteration 42500/ 173500 | consumed samples: 10880000 | consumed tokens: 22282240000 | elapsed time per iteration (s): 0.16 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.761440E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.065 | TFLOPs: 24.97 | +7: iteration 42510/ 173500 | consumed samples: 10882560 | consumed tokens: 22287482880 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.780347E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.124 | TFLOPs: 26.11 | +7: iteration 42520/ 173500 | consumed samples: 10885120 | consumed tokens: 22292725760 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.776920E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.602 | TFLOPs: 26.11 | +7: iteration 42530/ 173500 | consumed samples: 10887680 | consumed tokens: 22297968640 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.775060E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.267 | TFLOPs: 26.12 | +7: iteration 42540/ 173500 | consumed samples: 10890240 | consumed tokens: 22303211520 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.783408E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.475 | TFLOPs: 26.12 | +7: iteration 42550/ 173500 | consumed samples: 10892800 | consumed tokens: 22308454400 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.791636E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.616 | TFLOPs: 26.11 | +7: iteration 42560/ 173500 | consumed samples: 10895360 | consumed tokens: 22313697280 | elapsed time per iteration (s): 0.15 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.776377E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.697 | TFLOPs: 26.15 | +7: iteration 42570/ 173500 | consumed samples: 10897920 | consumed tokens: 22318940160 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.761855E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.567 | TFLOPs: 26.20 | +7: iteration 42580/ 173500 | consumed samples: 10900480 | consumed tokens: 22324183040 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.767966E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.349 | TFLOPs: 26.21 | +7: iteration 42590/ 173500 | consumed samples: 10903040 | consumed tokens: 22329425920 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.771627E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.023 | TFLOPs: 26.22 | +7: iteration 42600/ 173500 | consumed samples: 10905600 | consumed tokens: 22334668800 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.775806E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.139 | TFLOPs: 26.21 | +7: iteration 42610/ 173500 | consumed samples: 10908160 | consumed tokens: 22339911680 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.786840E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.085 | TFLOPs: 26.22 | +7: iteration 42620/ 173500 | consumed samples: 10910720 | consumed tokens: 22345154560 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.768697E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.353 | TFLOPs: 26.21 | +7: iteration 42630/ 173500 | consumed samples: 10913280 | consumed tokens: 22350397440 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.769181E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.926 | TFLOPs: 26.22 | +7: iteration 42640/ 173500 | consumed samples: 10915840 | consumed tokens: 22355640320 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.775187E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.090 | TFLOPs: 26.22 | +7: iteration 42650/ 173500 | consumed samples: 10918400 | consumed tokens: 22360883200 | elapsed time per iteration (s): 0.15 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.771844E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.922 | TFLOPs: 26.24 | +7: iteration 42660/ 173500 | consumed samples: 10920960 | consumed tokens: 22366126080 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.775782E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.577 | TFLOPs: 26.23 | +7: iteration 42670/ 173500 | consumed samples: 10923520 | consumed tokens: 22371368960 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.768660E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.797 | TFLOPs: 26.23 | +7: iteration 42680/ 173500 | consumed samples: 10926080 | consumed tokens: 22376611840 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.761254E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.395 | TFLOPs: 26.21 | +7: iteration 42690/ 173500 | consumed samples: 10928640 | consumed tokens: 22381854720 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.770799E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.768 | TFLOPs: 26.23 | +7: iteration 42700/ 173500 | consumed samples: 10931200 | consumed tokens: 22387097600 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.764573E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.090 | TFLOPs: 26.18 | +7: iteration 42710/ 173500 | consumed samples: 10933760 | consumed tokens: 22392340480 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.762055E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.466 | TFLOPs: 26.21 | +7: iteration 42720/ 173500 | consumed samples: 10936320 | consumed tokens: 22397583360 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.776590E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.836 | TFLOPs: 26.22 | +7: iteration 42730/ 173500 | consumed samples: 10938880 | consumed tokens: 22402826240 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.769559E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.142 | TFLOPs: 26.21 | +7: iteration 42740/ 173500 | consumed samples: 10941440 | consumed tokens: 22408069120 | elapsed time per iteration (s): 0.15 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.775299E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.374 | TFLOPs: 26.21 | +7: iteration 42750/ 173500 | consumed samples: 10944000 | consumed tokens: 22413312000 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.773375E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.377 | TFLOPs: 26.20 | +7: iteration 42760/ 173500 | consumed samples: 10946560 | consumed tokens: 22418554880 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.755467E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.077 | TFLOPs: 26.22 | +7: iteration 42770/ 173500 | consumed samples: 10949120 | consumed tokens: 22423797760 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.761136E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.576 | TFLOPs: 26.21 | +7: iteration 42780/ 173500 | consumed samples: 10951680 | consumed tokens: 22429040640 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.779004E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.762 | TFLOPs: 26.20 | +7: iteration 42790/ 173500 | consumed samples: 10954240 | consumed tokens: 22434283520 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.768667E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.639 | TFLOPs: 26.22 | +7: iteration 42800/ 173500 | consumed samples: 10956800 | consumed tokens: 22439526400 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.770912E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.946 | TFLOPs: 26.19 | +7: iteration 42810/ 173500 | consumed samples: 10959360 | consumed tokens: 22444769280 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.763816E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.287 | TFLOPs: 26.21 | +7: iteration 42820/ 173500 | consumed samples: 10961920 | consumed tokens: 22450012160 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.782789E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.358 | TFLOPs: 26.23 | +7: iteration 42830/ 173500 | consumed samples: 10964480 | consumed tokens: 22455255040 | elapsed time per iteration (s): 0.15 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.759705E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.860 | TFLOPs: 26.23 | +7: iteration 42840/ 173500 | consumed samples: 10967040 | consumed tokens: 22460497920 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.773620E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.674 | TFLOPs: 26.22 | +7: iteration 42850/ 173500 | consumed samples: 10969600 | consumed tokens: 22465740800 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.756322E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.546 | TFLOPs: 26.23 | +7: iteration 42860/ 173500 | consumed samples: 10972160 | consumed tokens: 22470983680 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.776498E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.680 | TFLOPs: 26.15 | +7: iteration 42870/ 173500 | consumed samples: 10974720 | consumed tokens: 22476226560 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.769247E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.864 | TFLOPs: 26.20 | +7: iteration 42880/ 173500 | consumed samples: 10977280 | consumed tokens: 22481469440 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.777585E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.446 | TFLOPs: 26.21 | +7: iteration 42890/ 173500 | consumed samples: 10979840 | consumed tokens: 22486712320 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.761242E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.620 | TFLOPs: 26.22 | +7: iteration 42900/ 173500 | consumed samples: 10982400 | consumed tokens: 22491955200 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.758506E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.914 | TFLOPs: 26.22 | +7: iteration 42910/ 173500 | consumed samples: 10984960 | consumed tokens: 22497198080 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.768756E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.583 | TFLOPs: 26.21 | +7: iteration 42920/ 173500 | consumed samples: 10987520 | consumed tokens: 22502440960 | elapsed time per iteration (s): 0.15 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.772639E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.934 | TFLOPs: 26.20 | +7: iteration 42930/ 173500 | consumed samples: 10990080 | consumed tokens: 22507683840 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.764684E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.147 | TFLOPs: 26.21 | +7: iteration 42940/ 173500 | consumed samples: 10992640 | consumed tokens: 22512926720 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.769930E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.191 | TFLOPs: 26.21 | +7: iteration 42950/ 173500 | consumed samples: 10995200 | consumed tokens: 22518169600 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.780301E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.290 | TFLOPs: 26.21 | +7: iteration 42960/ 173500 | consumed samples: 10997760 | consumed tokens: 22523412480 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.768087E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.311 | TFLOPs: 26.19 | +7: iteration 42970/ 173500 | consumed samples: 11000320 | consumed tokens: 22528655360 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.774847E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.609 | TFLOPs: 26.22 | +7: iteration 42980/ 173500 | consumed samples: 11002880 | consumed tokens: 22533898240 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.776054E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.954 | TFLOPs: 26.20 | +7: iteration 42990/ 173500 | consumed samples: 11005440 | consumed tokens: 22539141120 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.760663E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.427 | TFLOPs: 26.21 | +7: iteration 43000/ 173500 | consumed samples: 11008000 | consumed tokens: 22544384000 | elapsed time per iteration (s): 0.15 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.764605E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.361 | TFLOPs: 26.23 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 43000 | lm loss value: 3.858657E+00 | lm loss PPL: 4.740167E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 43000 to checkpoints_44m91b100m +0: [2023-03-17 02:06:42,692] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step43000 is begin to save! +0: [2023-03-17 02:06:42,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:06:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:06:42,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:06:42,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:06:42,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:06:42,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:06:42,776] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:06:42,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:06:42,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:06:42,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:06:42,793] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:06:42,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:06:42,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:06:42,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:06:42,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:06:42,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:06:42,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:06:42,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:06:42,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:06:42,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:06:42,827] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step43000/mp_rank_00_model_states.pt +0: [2023-03-17 02:06:42,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:06:42,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:06:42,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:06:42,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +7: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +2: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +6: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +5: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +1: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +4: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:06:42,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step43000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:06:42,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step43000 is ready now! +0: successfully saved checkpoint at iteration 43000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.44 +7: iteration 43010/ 173500 | consumed samples: 11010560 | consumed tokens: 22549626880 | elapsed time per iteration (s): 0.18 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.764291E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.215 | TFLOPs: 22.84 | +7: iteration 43020/ 173500 | consumed samples: 11013120 | consumed tokens: 22554869760 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.780118E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.674 | TFLOPs: 26.23 | +7: iteration 43030/ 173500 | consumed samples: 11015680 | consumed tokens: 22560112640 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.771434E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.510 | TFLOPs: 26.23 | +7: iteration 43040/ 173500 | consumed samples: 11018240 | consumed tokens: 22565355520 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.755486E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.955 | TFLOPs: 26.25 | +7: iteration 43050/ 173500 | consumed samples: 11020800 | consumed tokens: 22570598400 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.778111E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.761 | TFLOPs: 26.22 | +7: iteration 43060/ 173500 | consumed samples: 11023360 | consumed tokens: 22575841280 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.764407E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.425 | TFLOPs: 26.21 | +7: iteration 43070/ 173500 | consumed samples: 11025920 | consumed tokens: 22581084160 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.769664E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.584 | TFLOPs: 26.23 | +7: iteration 43080/ 173500 | consumed samples: 11028480 | consumed tokens: 22586327040 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.765106E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.800 | TFLOPs: 26.22 | +7: iteration 43090/ 173500 | consumed samples: 11031040 | consumed tokens: 22591569920 | elapsed time per iteration (s): 0.15 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.761075E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.295 | TFLOPs: 26.23 | +7: iteration 43100/ 173500 | consumed samples: 11033600 | consumed tokens: 22596812800 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.765705E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.028 | TFLOPs: 26.21 | +7: iteration 43110/ 173500 | consumed samples: 11036160 | consumed tokens: 22602055680 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.761806E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.455 | TFLOPs: 26.17 | +7: iteration 43120/ 173500 | consumed samples: 11038720 | consumed tokens: 22607298560 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.773509E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.671 | TFLOPs: 26.11 | +7: iteration 43130/ 173500 | consumed samples: 11041280 | consumed tokens: 22612541440 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.784784E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.142 | TFLOPs: 26.11 | +7: iteration 43140/ 173500 | consumed samples: 11043840 | consumed tokens: 22617784320 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.757838E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.889 | TFLOPs: 26.13 | +7: iteration 43150/ 173500 | consumed samples: 11046400 | consumed tokens: 22623027200 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.772979E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.215 | TFLOPs: 26.10 | +7: iteration 43160/ 173500 | consumed samples: 11048960 | consumed tokens: 22628270080 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.766294E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.198 | TFLOPs: 26.10 | +7: iteration 43170/ 173500 | consumed samples: 11051520 | consumed tokens: 22633512960 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.767272E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.703 | TFLOPs: 26.08 | +7: iteration 43180/ 173500 | consumed samples: 11054080 | consumed tokens: 22638755840 | elapsed time per iteration (s): 0.15 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.774564E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.360 | TFLOPs: 26.09 | +7: iteration 43190/ 173500 | consumed samples: 11056640 | consumed tokens: 22643998720 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.773821E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.910 | TFLOPs: 26.09 | +7: iteration 43200/ 173500 | consumed samples: 11059200 | consumed tokens: 22649241600 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.764253E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.586 | TFLOPs: 26.10 | +7: iteration 43210/ 173500 | consumed samples: 11061760 | consumed tokens: 22654484480 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.776177E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.705 | TFLOPs: 25.97 | +7: iteration 43220/ 173500 | consumed samples: 11064320 | consumed tokens: 22659727360 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.771447E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.879 | TFLOPs: 26.09 | +7: iteration 43230/ 173500 | consumed samples: 11066880 | consumed tokens: 22664970240 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.781448E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.815 | TFLOPs: 26.09 | +7: iteration 43240/ 173500 | consumed samples: 11069440 | consumed tokens: 22670213120 | elapsed time per iteration (s): 0.16 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.766214E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.852 | TFLOPs: 25.56 | +7: iteration 43250/ 173500 | consumed samples: 11072000 | consumed tokens: 22675456000 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.771994E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.399 | TFLOPs: 26.10 | +7: iteration 43260/ 173500 | consumed samples: 11074560 | consumed tokens: 22680698880 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.775006E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.426 | TFLOPs: 26.10 | +7: iteration 43270/ 173500 | consumed samples: 11077120 | consumed tokens: 22685941760 | elapsed time per iteration (s): 0.15 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.781596E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.910 | TFLOPs: 26.11 | +7: iteration 43280/ 173500 | consumed samples: 11079680 | consumed tokens: 22691184640 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.780033E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.040 | TFLOPs: 26.11 | +7: iteration 43290/ 173500 | consumed samples: 11082240 | consumed tokens: 22696427520 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.784810E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.466 | TFLOPs: 26.10 | +7: iteration 43300/ 173500 | consumed samples: 11084800 | consumed tokens: 22701670400 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.768544E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.403 | TFLOPs: 26.12 | +7: iteration 43310/ 173500 | consumed samples: 11087360 | consumed tokens: 22706913280 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.782831E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.929 | TFLOPs: 26.09 | +7: iteration 43320/ 173500 | consumed samples: 11089920 | consumed tokens: 22712156160 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.752990E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.367 | TFLOPs: 26.13 | +7: iteration 43330/ 173500 | consumed samples: 11092480 | consumed tokens: 22717399040 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.774812E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.067 | TFLOPs: 26.13 | +7: iteration 43340/ 173500 | consumed samples: 11095040 | consumed tokens: 22722641920 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.770216E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.517 | TFLOPs: 26.14 | +7: iteration 43350/ 173500 | consumed samples: 11097600 | consumed tokens: 22727884800 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.762809E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.460 | TFLOPs: 26.13 | +7: iteration 43360/ 173500 | consumed samples: 11100160 | consumed tokens: 22733127680 | elapsed time per iteration (s): 0.15 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.783357E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.926 | TFLOPs: 26.13 | +7: iteration 43370/ 173500 | consumed samples: 11102720 | consumed tokens: 22738370560 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.758930E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.097 | TFLOPs: 26.11 | +7: iteration 43380/ 173500 | consumed samples: 11105280 | consumed tokens: 22743613440 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.763086E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.847 | TFLOPs: 26.11 | +7: iteration 43390/ 173500 | consumed samples: 11107840 | consumed tokens: 22748856320 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.781781E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.207 | TFLOPs: 26.13 | +7: iteration 43400/ 173500 | consumed samples: 11110400 | consumed tokens: 22754099200 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.767507E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.741 | TFLOPs: 26.12 | +7: iteration 43410/ 173500 | consumed samples: 11112960 | consumed tokens: 22759342080 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.770344E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.843 | TFLOPs: 26.12 | +7: iteration 43420/ 173500 | consumed samples: 11115520 | consumed tokens: 22764584960 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.763000E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.184 | TFLOPs: 26.11 | +7: iteration 43430/ 173500 | consumed samples: 11118080 | consumed tokens: 22769827840 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.775186E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.432 | TFLOPs: 26.10 | +7: iteration 43440/ 173500 | consumed samples: 11120640 | consumed tokens: 22775070720 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.770694E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.560 | TFLOPs: 26.07 | +7: iteration 43450/ 173500 | consumed samples: 11123200 | consumed tokens: 22780313600 | elapsed time per iteration (s): 0.15 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.770158E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.584 | TFLOPs: 26.10 | +7: iteration 43460/ 173500 | consumed samples: 11125760 | consumed tokens: 22785556480 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.761637E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.703 | TFLOPs: 26.11 | +7: iteration 43470/ 173500 | consumed samples: 11128320 | consumed tokens: 22790799360 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.784073E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.965 | TFLOPs: 26.16 | +7: iteration 43480/ 173500 | consumed samples: 11130880 | consumed tokens: 22796042240 | elapsed time per iteration (s): 0.16 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.772409E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.982 | TFLOPs: 25.78 | +7: iteration 43490/ 173500 | consumed samples: 11133440 | consumed tokens: 22801285120 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.773210E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.173 | TFLOPs: 26.16 | +7: iteration 43500/ 173500 | consumed samples: 11136000 | consumed tokens: 22806528000 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.765180E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.823 | TFLOPs: 26.17 | +7: iteration 43510/ 173500 | consumed samples: 11138560 | consumed tokens: 22811770880 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.754080E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.123 | TFLOPs: 26.18 | +7: iteration 43520/ 173500 | consumed samples: 11141120 | consumed tokens: 22817013760 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.763026E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.778 | TFLOPs: 26.12 | +7: iteration 43530/ 173500 | consumed samples: 11143680 | consumed tokens: 22822256640 | elapsed time per iteration (s): 0.15 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.773107E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.978 | TFLOPs: 26.22 | +7: iteration 43540/ 173500 | consumed samples: 11146240 | consumed tokens: 22827499520 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.773924E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.964 | TFLOPs: 26.22 | +7: iteration 43550/ 173500 | consumed samples: 11148800 | consumed tokens: 22832742400 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.764650E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.758 | TFLOPs: 26.17 | +7: iteration 43560/ 173500 | consumed samples: 11151360 | consumed tokens: 22837985280 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.777947E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.589 | TFLOPs: 26.21 | +7: iteration 43570/ 173500 | consumed samples: 11153920 | consumed tokens: 22843228160 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.769672E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.727 | TFLOPs: 26.22 | +7: iteration 43580/ 173500 | consumed samples: 11156480 | consumed tokens: 22848471040 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.773683E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.874 | TFLOPs: 26.22 | +7: iteration 43590/ 173500 | consumed samples: 11159040 | consumed tokens: 22853713920 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.778745E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 43600/ 173500 | consumed samples: 11161600 | consumed tokens: 22858956800 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.761331E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.898 | TFLOPs: 26.20 | +7: iteration 43610/ 173500 | consumed samples: 11164160 | consumed tokens: 22864199680 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.762798E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.886 | TFLOPs: 26.19 | +7: iteration 43620/ 173500 | consumed samples: 11166720 | consumed tokens: 22869442560 | elapsed time per iteration (s): 0.15 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.768987E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.502 | TFLOPs: 26.18 | +7: iteration 43630/ 173500 | consumed samples: 11169280 | consumed tokens: 22874685440 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.771504E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.631 | TFLOPs: 26.17 | +7: iteration 43640/ 173500 | consumed samples: 11171840 | consumed tokens: 22879928320 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.764838E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.053 | TFLOPs: 26.17 | +7: iteration 43650/ 173500 | consumed samples: 11174400 | consumed tokens: 22885171200 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.776323E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.229 | TFLOPs: 26.21 | +7: iteration 43660/ 173500 | consumed samples: 11176960 | consumed tokens: 22890414080 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.767344E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.913 | TFLOPs: 26.20 | +7: iteration 43670/ 173500 | consumed samples: 11179520 | consumed tokens: 22895656960 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.761264E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.784 | TFLOPs: 26.19 | +7: iteration 43680/ 173500 | consumed samples: 11182080 | consumed tokens: 22900899840 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.754739E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.280 | TFLOPs: 26.16 | +7: iteration 43690/ 173500 | consumed samples: 11184640 | consumed tokens: 22906142720 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.770290E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.189 | TFLOPs: 26.13 | +7: iteration 43700/ 173500 | consumed samples: 11187200 | consumed tokens: 22911385600 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.763742E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.259 | TFLOPs: 26.12 | +7: iteration 43710/ 173500 | consumed samples: 11189760 | consumed tokens: 22916628480 | elapsed time per iteration (s): 0.15 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.778812E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.351 | TFLOPs: 26.16 | +7: iteration 43720/ 173500 | consumed samples: 11192320 | consumed tokens: 22921871360 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.759741E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.960 | TFLOPs: 26.17 | +7: iteration 43730/ 173500 | consumed samples: 11194880 | consumed tokens: 22927114240 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.765219E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.884 | TFLOPs: 26.13 | +7: iteration 43740/ 173500 | consumed samples: 11197440 | consumed tokens: 22932357120 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.758859E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.244 | TFLOPs: 26.15 | +7: iteration 43750/ 173500 | consumed samples: 11200000 | consumed tokens: 22937600000 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.760402E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.711 | TFLOPs: 26.15 | +7: iteration 43760/ 173500 | consumed samples: 11202560 | consumed tokens: 22942842880 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.774586E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.290 | TFLOPs: 26.13 | +7: iteration 43770/ 173500 | consumed samples: 11205120 | consumed tokens: 22948085760 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.755515E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.316 | TFLOPs: 26.16 | +7: iteration 43780/ 173500 | consumed samples: 11207680 | consumed tokens: 22953328640 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.770789E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.565 | TFLOPs: 26.18 | +7: iteration 43790/ 173500 | consumed samples: 11210240 | consumed tokens: 22958571520 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.769639E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.390 | TFLOPs: 26.18 | +7: iteration 43800/ 173500 | consumed samples: 11212800 | consumed tokens: 22963814400 | elapsed time per iteration (s): 0.15 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.769627E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.426 | TFLOPs: 26.18 | +7: iteration 43810/ 173500 | consumed samples: 11215360 | consumed tokens: 22969057280 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.760536E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.113 | TFLOPs: 26.18 | +7: iteration 43820/ 173500 | consumed samples: 11217920 | consumed tokens: 22974300160 | elapsed time per iteration (s): 0.16 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.769177E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.971 | TFLOPs: 25.78 | +7: iteration 43830/ 173500 | consumed samples: 11220480 | consumed tokens: 22979543040 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.771043E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.657 | TFLOPs: 26.17 | +7: iteration 43840/ 173500 | consumed samples: 11223040 | consumed tokens: 22984785920 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.779321E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.552 | TFLOPs: 26.17 | +7: iteration 43850/ 173500 | consumed samples: 11225600 | consumed tokens: 22990028800 | elapsed time per iteration (s): 0.16 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.767251E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.725 | TFLOPs: 25.64 | +7: iteration 43860/ 173500 | consumed samples: 11228160 | consumed tokens: 22995271680 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.754842E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.746 | TFLOPs: 26.15 | +7: iteration 43870/ 173500 | consumed samples: 11230720 | consumed tokens: 23000514560 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.750602E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.153 | TFLOPs: 26.18 | +7: iteration 43880/ 173500 | consumed samples: 11233280 | consumed tokens: 23005757440 | elapsed time per iteration (s): 0.15 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.769849E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.982 | TFLOPs: 26.16 | +7: iteration 43890/ 173500 | consumed samples: 11235840 | consumed tokens: 23011000320 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.769074E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.425 | TFLOPs: 26.18 | +7: iteration 43900/ 173500 | consumed samples: 11238400 | consumed tokens: 23016243200 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.766085E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.763 | TFLOPs: 26.20 | +7: iteration 43910/ 173500 | consumed samples: 11240960 | consumed tokens: 23021486080 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.752438E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.956 | TFLOPs: 26.20 | +7: iteration 43920/ 173500 | consumed samples: 11243520 | consumed tokens: 23026728960 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.778676E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.703 | TFLOPs: 26.22 | +7: iteration 43930/ 173500 | consumed samples: 11246080 | consumed tokens: 23031971840 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.759985E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.933 | TFLOPs: 26.20 | +7: iteration 43940/ 173500 | consumed samples: 11248640 | consumed tokens: 23037214720 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.768550E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.757 | TFLOPs: 26.22 | +7: iteration 43950/ 173500 | consumed samples: 11251200 | consumed tokens: 23042457600 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.773975E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.876 | TFLOPs: 26.20 | +7: iteration 43960/ 173500 | consumed samples: 11253760 | consumed tokens: 23047700480 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.774651E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.408 | TFLOPs: 26.20 | +7: iteration 43970/ 173500 | consumed samples: 11256320 | consumed tokens: 23052943360 | elapsed time per iteration (s): 0.15 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.774618E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.291 | TFLOPs: 26.15 | +7: iteration 43980/ 173500 | consumed samples: 11258880 | consumed tokens: 23058186240 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.769066E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.083 | TFLOPs: 26.16 | +7: iteration 43990/ 173500 | consumed samples: 11261440 | consumed tokens: 23063429120 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.775581E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.865 | TFLOPs: 26.16 | +0: [2023-03-17 02:09:16,470] [INFO] [logging.py:68:log_dist] [Rank 0] step=44000, skipped=0, lr=[0.00017442202015704406, 0.00017442202015704406, 0.00017442202015704406], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 44000/ 173500 | consumed samples: 11264000 | consumed tokens: 23068672000 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.750417E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.571 | TFLOPs: 26.15 | +0: steps: 44000 loss: 3.7524 iter time (s): 0.153 samples/sec: 1676.636 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 44000 | lm loss value: 3.925983E+00 | lm loss PPL: 5.070289E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 44000 to checkpoints_44m91b100m +0: [2023-03-17 02:09:16,545] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step44000 is begin to save! +0: [2023-03-17 02:09:16,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:09:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:09:16,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:09:16,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:09:16,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:09:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:09:16,627] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:09:16,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:09:16,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:09:16,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:09:16,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:09:16,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:09:16,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:09:16,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:09:16,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:09:16,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:09:16,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:09:16,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:09:16,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:09:16,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:09:16,677] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step44000/mp_rank_00_model_states.pt +0: [2023-03-17 02:09:16,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:09:16,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:09:16,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:09:16,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +3: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +5: [2023-03-17 02:09:16,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +1: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +2: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +4: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +6: [2023-03-17 02:09:16,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +7: [2023-03-17 02:09:16,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:09:16,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step44000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:09:16,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step44000 is ready now! +0: successfully saved checkpoint at iteration 44000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.24 +7: iteration 44010/ 173500 | consumed samples: 11266560 | consumed tokens: 23073914880 | elapsed time per iteration (s): 0.18 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.759798E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.015 | TFLOPs: 22.54 | +7: iteration 44020/ 173500 | consumed samples: 11269120 | consumed tokens: 23079157760 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.780201E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.343 | TFLOPs: 26.12 | +7: iteration 44030/ 173500 | consumed samples: 11271680 | consumed tokens: 23084400640 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.756022E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.430 | TFLOPs: 26.10 | +7: iteration 44040/ 173500 | consumed samples: 11274240 | consumed tokens: 23089643520 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.773905E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.516 | TFLOPs: 26.14 | +7: iteration 44050/ 173500 | consumed samples: 11276800 | consumed tokens: 23094886400 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.765106E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.074 | TFLOPs: 26.14 | +7: iteration 44060/ 173500 | consumed samples: 11279360 | consumed tokens: 23100129280 | elapsed time per iteration (s): 0.15 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.760098E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.870 | TFLOPs: 26.14 | +7: iteration 44070/ 173500 | consumed samples: 11281920 | consumed tokens: 23105372160 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.764960E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.925 | TFLOPs: 26.11 | +7: iteration 44080/ 173500 | consumed samples: 11284480 | consumed tokens: 23110615040 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.775446E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.808 | TFLOPs: 26.11 | +7: iteration 44090/ 173500 | consumed samples: 11287040 | consumed tokens: 23115857920 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.766189E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.426 | TFLOPs: 26.12 | +7: iteration 44100/ 173500 | consumed samples: 11289600 | consumed tokens: 23121100800 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.770153E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.653 | TFLOPs: 26.14 | +7: iteration 44110/ 173500 | consumed samples: 11292160 | consumed tokens: 23126343680 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.765288E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.895 | TFLOPs: 26.11 | +7: iteration 44120/ 173500 | consumed samples: 11294720 | consumed tokens: 23131586560 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.764588E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.574 | TFLOPs: 26.10 | +7: iteration 44130/ 173500 | consumed samples: 11297280 | consumed tokens: 23136829440 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.766079E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.164 | TFLOPs: 26.13 | +7: iteration 44140/ 173500 | consumed samples: 11299840 | consumed tokens: 23142072320 | elapsed time per iteration (s): 0.15 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.760314E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.774 | TFLOPs: 26.14 | +7: iteration 44150/ 173500 | consumed samples: 11302400 | consumed tokens: 23147315200 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.773278E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.003 | TFLOPs: 26.13 | +7: iteration 44160/ 173500 | consumed samples: 11304960 | consumed tokens: 23152558080 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.763387E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.157 | TFLOPs: 26.15 | +7: iteration 44170/ 173500 | consumed samples: 11307520 | consumed tokens: 23157800960 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.773545E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.707 | TFLOPs: 26.17 | +7: iteration 44180/ 173500 | consumed samples: 11310080 | consumed tokens: 23163043840 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.764257E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.831 | TFLOPs: 26.17 | +7: iteration 44190/ 173500 | consumed samples: 11312640 | consumed tokens: 23168286720 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.773521E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.008 | TFLOPs: 26.17 | +7: iteration 44200/ 173500 | consumed samples: 11315200 | consumed tokens: 23173529600 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.766676E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.388 | TFLOPs: 26.13 | +7: iteration 44210/ 173500 | consumed samples: 11317760 | consumed tokens: 23178772480 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.770600E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.723 | TFLOPs: 26.14 | +7: iteration 44220/ 173500 | consumed samples: 11320320 | consumed tokens: 23184015360 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.780167E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.893 | TFLOPs: 26.14 | +7: iteration 44230/ 173500 | consumed samples: 11322880 | consumed tokens: 23189258240 | elapsed time per iteration (s): 0.15 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.767191E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.953 | TFLOPs: 26.14 | +7: iteration 44240/ 173500 | consumed samples: 11325440 | consumed tokens: 23194501120 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.774403E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.277 | TFLOPs: 26.13 | +7: iteration 44250/ 173500 | consumed samples: 11328000 | consumed tokens: 23199744000 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.770839E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.793 | TFLOPs: 26.12 | +7: iteration 44260/ 173500 | consumed samples: 11330560 | consumed tokens: 23204986880 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.770225E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.059 | TFLOPs: 26.11 | +7: iteration 44270/ 173500 | consumed samples: 11333120 | consumed tokens: 23210229760 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.745450E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.887 | TFLOPs: 26.06 | +7: iteration 44280/ 173500 | consumed samples: 11335680 | consumed tokens: 23215472640 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.758747E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.026 | TFLOPs: 26.08 | +7: iteration 44290/ 173500 | consumed samples: 11338240 | consumed tokens: 23220715520 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.774511E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.643 | TFLOPs: 26.09 | +7: iteration 44300/ 173500 | consumed samples: 11340800 | consumed tokens: 23225958400 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.763120E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.520 | TFLOPs: 26.09 | +7: iteration 44310/ 173500 | consumed samples: 11343360 | consumed tokens: 23231201280 | elapsed time per iteration (s): 0.16 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.772038E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.083 | TFLOPs: 25.67 | +7: iteration 44320/ 173500 | consumed samples: 11345920 | consumed tokens: 23236444160 | elapsed time per iteration (s): 0.15 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.762933E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.225 | TFLOPs: 26.08 | +7: iteration 44330/ 173500 | consumed samples: 11348480 | consumed tokens: 23241687040 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.768145E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: iteration 44340/ 173500 | consumed samples: 11351040 | consumed tokens: 23246929920 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.768169E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.658 | TFLOPs: 26.09 | +7: iteration 44350/ 173500 | consumed samples: 11353600 | consumed tokens: 23252172800 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.768477E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.194 | TFLOPs: 26.11 | +7: iteration 44360/ 173500 | consumed samples: 11356160 | consumed tokens: 23257415680 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.771404E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.201 | TFLOPs: 26.10 | +7: iteration 44370/ 173500 | consumed samples: 11358720 | consumed tokens: 23262658560 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.770942E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.913 | TFLOPs: 26.09 | +7: iteration 44380/ 173500 | consumed samples: 11361280 | consumed tokens: 23267901440 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.764707E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.266 | TFLOPs: 26.12 | +7: iteration 44390/ 173500 | consumed samples: 11363840 | consumed tokens: 23273144320 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.782935E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.236 | TFLOPs: 26.12 | +7: iteration 44400/ 173500 | consumed samples: 11366400 | consumed tokens: 23278387200 | elapsed time per iteration (s): 0.15 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.764824E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.788 | TFLOPs: 26.12 | +7: iteration 44410/ 173500 | consumed samples: 11368960 | consumed tokens: 23283630080 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.762840E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.604 | TFLOPs: 26.14 | +7: iteration 44420/ 173500 | consumed samples: 11371520 | consumed tokens: 23288872960 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.773077E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.153 | TFLOPs: 26.11 | +7: iteration 44430/ 173500 | consumed samples: 11374080 | consumed tokens: 23294115840 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.753699E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.458 | TFLOPs: 26.12 | +7: iteration 44440/ 173500 | consumed samples: 11376640 | consumed tokens: 23299358720 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.765184E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.278 | TFLOPs: 26.12 | +7: iteration 44450/ 173500 | consumed samples: 11379200 | consumed tokens: 23304601600 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.760202E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.327 | TFLOPs: 26.13 | +7: iteration 44460/ 173500 | consumed samples: 11381760 | consumed tokens: 23309844480 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.775993E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.534 | TFLOPs: 26.14 | +7: iteration 44470/ 173500 | consumed samples: 11384320 | consumed tokens: 23315087360 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.765516E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.484 | TFLOPs: 26.13 | +7: iteration 44480/ 173500 | consumed samples: 11386880 | consumed tokens: 23320330240 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.767402E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.503 | TFLOPs: 26.13 | +7: iteration 44490/ 173500 | consumed samples: 11389440 | consumed tokens: 23325573120 | elapsed time per iteration (s): 0.15 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.764209E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.799 | TFLOPs: 26.14 | +7: iteration 44500/ 173500 | consumed samples: 11392000 | consumed tokens: 23330816000 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.763602E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.914 | TFLOPs: 26.13 | +7: iteration 44510/ 173500 | consumed samples: 11394560 | consumed tokens: 23336058880 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.753561E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.610 | TFLOPs: 26.03 | +7: iteration 44520/ 173500 | consumed samples: 11397120 | consumed tokens: 23341301760 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.767474E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.425 | TFLOPs: 26.02 | +7: iteration 44530/ 173500 | consumed samples: 11399680 | consumed tokens: 23346544640 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.757007E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.863 | TFLOPs: 26.03 | +7: iteration 44540/ 173500 | consumed samples: 11402240 | consumed tokens: 23351787520 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.774724E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.297 | TFLOPs: 26.04 | +7: iteration 44550/ 173500 | consumed samples: 11404800 | consumed tokens: 23357030400 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.764049E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.383 | TFLOPs: 26.02 | +7: iteration 44560/ 173500 | consumed samples: 11407360 | consumed tokens: 23362273280 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.753436E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.308 | TFLOPs: 25.98 | +7: iteration 44570/ 173500 | consumed samples: 11409920 | consumed tokens: 23367516160 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.755146E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.170 | TFLOPs: 25.99 | +7: iteration 44580/ 173500 | consumed samples: 11412480 | consumed tokens: 23372759040 | elapsed time per iteration (s): 0.15 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.751672E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.892 | TFLOPs: 26.00 | +7: iteration 44590/ 173500 | consumed samples: 11415040 | consumed tokens: 23378001920 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.770478E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.969 | TFLOPs: 26.03 | +7: iteration 44600/ 173500 | consumed samples: 11417600 | consumed tokens: 23383244800 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.780324E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.471 | TFLOPs: 26.02 | +7: iteration 44610/ 173500 | consumed samples: 11420160 | consumed tokens: 23388487680 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.777311E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.873 | TFLOPs: 26.05 | +7: iteration 44620/ 173500 | consumed samples: 11422720 | consumed tokens: 23393730560 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.758248E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.629 | TFLOPs: 26.03 | +7: iteration 44630/ 173500 | consumed samples: 11425280 | consumed tokens: 23398973440 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.759942E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.164 | TFLOPs: 26.00 | +7: iteration 44640/ 173500 | consumed samples: 11427840 | consumed tokens: 23404216320 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.770925E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.308 | TFLOPs: 26.01 | +7: iteration 44650/ 173500 | consumed samples: 11430400 | consumed tokens: 23409459200 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.761261E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.019 | TFLOPs: 26.06 | +7: iteration 44660/ 173500 | consumed samples: 11432960 | consumed tokens: 23414702080 | elapsed time per iteration (s): 0.15 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.764063E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.687 | TFLOPs: 26.09 | +7: iteration 44670/ 173500 | consumed samples: 11435520 | consumed tokens: 23419944960 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.766782E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.354 | TFLOPs: 26.10 | +7: iteration 44680/ 173500 | consumed samples: 11438080 | consumed tokens: 23425187840 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.783821E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.524 | TFLOPs: 26.07 | +7: iteration 44690/ 173500 | consumed samples: 11440640 | consumed tokens: 23430430720 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.756731E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.869 | TFLOPs: 26.06 | +7: iteration 44700/ 173500 | consumed samples: 11443200 | consumed tokens: 23435673600 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.759864E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.223 | TFLOPs: 26.08 | +7: iteration 44710/ 173500 | consumed samples: 11445760 | consumed tokens: 23440916480 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.755721E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.529 | TFLOPs: 26.07 | +7: iteration 44720/ 173500 | consumed samples: 11448320 | consumed tokens: 23446159360 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.780480E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.320 | TFLOPs: 26.05 | +7: iteration 44730/ 173500 | consumed samples: 11450880 | consumed tokens: 23451402240 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.757579E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.138 | TFLOPs: 26.07 | +7: iteration 44740/ 173500 | consumed samples: 11453440 | consumed tokens: 23456645120 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.757891E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.568 | TFLOPs: 26.12 | +7: iteration 44750/ 173500 | consumed samples: 11456000 | consumed tokens: 23461888000 | elapsed time per iteration (s): 0.15 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.755728E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.553 | TFLOPs: 26.12 | +7: iteration 44760/ 173500 | consumed samples: 11458560 | consumed tokens: 23467130880 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.765361E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.936 | TFLOPs: 26.13 | +7: iteration 44770/ 173500 | consumed samples: 11461120 | consumed tokens: 23472373760 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.754208E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.542 | TFLOPs: 26.15 | +7: iteration 44780/ 173500 | consumed samples: 11463680 | consumed tokens: 23477616640 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.771712E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.452 | TFLOPs: 26.15 | +7: iteration 44790/ 173500 | consumed samples: 11466240 | consumed tokens: 23482859520 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.767665E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.334 | TFLOPs: 26.15 | +7: iteration 44800/ 173500 | consumed samples: 11468800 | consumed tokens: 23488102400 | elapsed time per iteration (s): 0.16 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.759010E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.809 | TFLOPs: 25.67 | +7: iteration 44810/ 173500 | consumed samples: 11471360 | consumed tokens: 23493345280 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.770303E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.852 | TFLOPs: 26.16 | +7: iteration 44820/ 173500 | consumed samples: 11473920 | consumed tokens: 23498588160 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.774807E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.415 | TFLOPs: 26.13 | +7: iteration 44830/ 173500 | consumed samples: 11476480 | consumed tokens: 23503831040 | elapsed time per iteration (s): 0.15 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.763208E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.575 | TFLOPs: 26.12 | +7: iteration 44840/ 173500 | consumed samples: 11479040 | consumed tokens: 23509073920 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.767688E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.442 | TFLOPs: 26.12 | +7: iteration 44850/ 173500 | consumed samples: 11481600 | consumed tokens: 23514316800 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.767455E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.992 | TFLOPs: 26.13 | +7: iteration 44860/ 173500 | consumed samples: 11484160 | consumed tokens: 23519559680 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.770122E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.283 | TFLOPs: 26.13 | +7: iteration 44870/ 173500 | consumed samples: 11486720 | consumed tokens: 23524802560 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.763571E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.682 | TFLOPs: 26.12 | +7: iteration 44880/ 173500 | consumed samples: 11489280 | consumed tokens: 23530045440 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.761488E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.036 | TFLOPs: 26.11 | +7: iteration 44890/ 173500 | consumed samples: 11491840 | consumed tokens: 23535288320 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.754546E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.905 | TFLOPs: 26.13 | +7: iteration 44900/ 173500 | consumed samples: 11494400 | consumed tokens: 23540531200 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.766600E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.948 | TFLOPs: 26.13 | +7: iteration 44910/ 173500 | consumed samples: 11496960 | consumed tokens: 23545774080 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.760163E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.842 | TFLOPs: 26.11 | +7: iteration 44920/ 173500 | consumed samples: 11499520 | consumed tokens: 23551016960 | elapsed time per iteration (s): 0.15 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.769198E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.385 | TFLOPs: 26.13 | +7: iteration 44930/ 173500 | consumed samples: 11502080 | consumed tokens: 23556259840 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.764090E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.697 | TFLOPs: 26.12 | +7: iteration 44940/ 173500 | consumed samples: 11504640 | consumed tokens: 23561502720 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.765488E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.312 | TFLOPs: 26.12 | +7: iteration 44950/ 173500 | consumed samples: 11507200 | consumed tokens: 23566745600 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.764146E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.469 | TFLOPs: 26.15 | +7: iteration 44960/ 173500 | consumed samples: 11509760 | consumed tokens: 23571988480 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.770995E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.522 | TFLOPs: 26.15 | +7: iteration 44970/ 173500 | consumed samples: 11512320 | consumed tokens: 23577231360 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.760431E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.326 | TFLOPs: 26.13 | +7: iteration 44980/ 173500 | consumed samples: 11514880 | consumed tokens: 23582474240 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.767279E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.548 | TFLOPs: 26.14 | +7: iteration 44990/ 173500 | consumed samples: 11517440 | consumed tokens: 23587717120 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.761683E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.356 | TFLOPs: 26.09 | +7: iteration 45000/ 173500 | consumed samples: 11520000 | consumed tokens: 23592960000 | elapsed time per iteration (s): 0.15 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.751464E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.239 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 45000 | lm loss value: 3.884390E+00 | lm loss PPL: 4.863725E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 45000 to checkpoints_44m91b100m +0: [2023-03-17 02:11:50,617] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step45000 is begin to save! +0: [2023-03-17 02:11:50,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:11:50,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:11:50,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:11:50,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:11:50,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:11:50,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:11:50,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:11:50,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:11:50,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:11:50,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:11:50,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:11:50,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:11:50,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:11:50,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:11:50,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:11:50,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:11:50,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:11:50,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:11:50,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:11:50,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:11:50,746] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step45000/mp_rank_00_model_states.pt +0: [2023-03-17 02:11:50,746] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:11:50,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:11:50,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +1: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +6: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +3: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +4: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +7: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +5: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 02:11:50,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step45000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +2: [2023-03-17 02:11:50,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step45000 is ready now! +0: successfully saved checkpoint at iteration 45000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.39 +7: iteration 45010/ 173500 | consumed samples: 11522560 | consumed tokens: 23598202880 | elapsed time per iteration (s): 0.18 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.759579E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.237 | TFLOPs: 22.71 | +7: iteration 45020/ 173500 | consumed samples: 11525120 | consumed tokens: 23603445760 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.762910E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.076 | TFLOPs: 26.08 | +7: iteration 45030/ 173500 | consumed samples: 11527680 | consumed tokens: 23608688640 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.766454E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.754 | TFLOPs: 26.08 | +7: iteration 45040/ 173500 | consumed samples: 11530240 | consumed tokens: 23613931520 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.762516E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.537 | TFLOPs: 26.06 | +7: iteration 45050/ 173500 | consumed samples: 11532800 | consumed tokens: 23619174400 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.758597E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.067 | TFLOPs: 26.08 | +7: iteration 45060/ 173500 | consumed samples: 11535360 | consumed tokens: 23624417280 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.768670E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.112 | TFLOPs: 26.11 | +7: iteration 45070/ 173500 | consumed samples: 11537920 | consumed tokens: 23629660160 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.790397E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.652 | TFLOPs: 26.09 | +7: iteration 45080/ 173500 | consumed samples: 11540480 | consumed tokens: 23634903040 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.759876E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.555 | TFLOPs: 26.10 | +7: iteration 45090/ 173500 | consumed samples: 11543040 | consumed tokens: 23640145920 | elapsed time per iteration (s): 0.15 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.766362E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.612 | TFLOPs: 26.09 | +7: iteration 45100/ 173500 | consumed samples: 11545600 | consumed tokens: 23645388800 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.759037E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.768 | TFLOPs: 26.11 | +7: iteration 45110/ 173500 | consumed samples: 11548160 | consumed tokens: 23650631680 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.763160E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.006 | TFLOPs: 26.11 | +7: iteration 45120/ 173500 | consumed samples: 11550720 | consumed tokens: 23655874560 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.767364E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.751 | TFLOPs: 26.11 | +7: iteration 45130/ 173500 | consumed samples: 11553280 | consumed tokens: 23661117440 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.761120E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.343 | TFLOPs: 26.12 | +7: iteration 45140/ 173500 | consumed samples: 11555840 | consumed tokens: 23666360320 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.754407E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.560 | TFLOPs: 26.10 | +7: iteration 45150/ 173500 | consumed samples: 11558400 | consumed tokens: 23671603200 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.760070E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.487 | TFLOPs: 26.12 | +7: iteration 45160/ 173500 | consumed samples: 11560960 | consumed tokens: 23676846080 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.764558E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.789 | TFLOPs: 26.11 | +7: iteration 45170/ 173500 | consumed samples: 11563520 | consumed tokens: 23682088960 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.766706E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.201 | TFLOPs: 26.08 | +7: iteration 45180/ 173500 | consumed samples: 11566080 | consumed tokens: 23687331840 | elapsed time per iteration (s): 0.15 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.771946E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.924 | TFLOPs: 26.08 | +7: iteration 45190/ 173500 | consumed samples: 11568640 | consumed tokens: 23692574720 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.778540E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.013 | TFLOPs: 26.08 | +7: iteration 45200/ 173500 | consumed samples: 11571200 | consumed tokens: 23697817600 | elapsed time per iteration (s): 0.16 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.760640E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.080 | TFLOPs: 25.86 | +7: iteration 45210/ 173500 | consumed samples: 11573760 | consumed tokens: 23703060480 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.764214E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.144 | TFLOPs: 26.04 | +7: iteration 45220/ 173500 | consumed samples: 11576320 | consumed tokens: 23708303360 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.761963E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.402 | TFLOPs: 26.09 | +7: iteration 45230/ 173500 | consumed samples: 11578880 | consumed tokens: 23713546240 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.773058E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.385 | TFLOPs: 26.09 | +7: iteration 45240/ 173500 | consumed samples: 11581440 | consumed tokens: 23718789120 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.757490E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.125 | TFLOPs: 26.08 | +7: iteration 45250/ 173500 | consumed samples: 11584000 | consumed tokens: 23724032000 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.766413E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.081 | TFLOPs: 26.08 | +7: iteration 45260/ 173500 | consumed samples: 11586560 | consumed tokens: 23729274880 | elapsed time per iteration (s): 0.15 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.764451E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.706 | TFLOPs: 26.09 | +7: iteration 45270/ 173500 | consumed samples: 11589120 | consumed tokens: 23734517760 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.768845E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.501 | TFLOPs: 26.09 | +7: iteration 45280/ 173500 | consumed samples: 11591680 | consumed tokens: 23739760640 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.762610E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.415 | TFLOPs: 26.09 | +7: iteration 45290/ 173500 | consumed samples: 11594240 | consumed tokens: 23745003520 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.745578E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.937 | TFLOPs: 26.11 | +7: iteration 45300/ 173500 | consumed samples: 11596800 | consumed tokens: 23750246400 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.745026E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.904 | TFLOPs: 26.08 | +7: iteration 45310/ 173500 | consumed samples: 11599360 | consumed tokens: 23755489280 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.757753E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.849 | TFLOPs: 26.09 | +7: iteration 45320/ 173500 | consumed samples: 11601920 | consumed tokens: 23760732160 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.747482E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.796 | TFLOPs: 26.08 | +7: iteration 45330/ 173500 | consumed samples: 11604480 | consumed tokens: 23765975040 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.771857E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.652 | TFLOPs: 26.07 | +7: iteration 45340/ 173500 | consumed samples: 11607040 | consumed tokens: 23771217920 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.766278E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.108 | TFLOPs: 26.08 | +7: iteration 45350/ 173500 | consumed samples: 11609600 | consumed tokens: 23776460800 | elapsed time per iteration (s): 0.15 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.762527E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.723 | TFLOPs: 26.08 | +7: iteration 45360/ 173500 | consumed samples: 11612160 | consumed tokens: 23781703680 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.765120E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.743 | TFLOPs: 25.90 | +7: iteration 45370/ 173500 | consumed samples: 11614720 | consumed tokens: 23786946560 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.766441E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.608 | TFLOPs: 26.11 | +7: iteration 45380/ 173500 | consumed samples: 11617280 | consumed tokens: 23792189440 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.765916E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.936 | TFLOPs: 26.16 | +7: iteration 45390/ 173500 | consumed samples: 11619840 | consumed tokens: 23797432320 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.773006E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.874 | TFLOPs: 26.14 | +7: iteration 45400/ 173500 | consumed samples: 11622400 | consumed tokens: 23802675200 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.764770E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.908 | TFLOPs: 26.16 | +7: iteration 45410/ 173500 | consumed samples: 11624960 | consumed tokens: 23807918080 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.764375E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.627 | TFLOPs: 26.09 | +7: iteration 45420/ 173500 | consumed samples: 11627520 | consumed tokens: 23813160960 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.766549E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.114 | TFLOPs: 26.07 | +7: iteration 45430/ 173500 | consumed samples: 11630080 | consumed tokens: 23818403840 | elapsed time per iteration (s): 0.15 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.764056E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.172 | TFLOPs: 26.05 | +7: iteration 45440/ 173500 | consumed samples: 11632640 | consumed tokens: 23823646720 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.756072E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.417 | TFLOPs: 26.06 | +7: iteration 45450/ 173500 | consumed samples: 11635200 | consumed tokens: 23828889600 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.770968E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.351 | TFLOPs: 26.09 | +7: iteration 45460/ 173500 | consumed samples: 11637760 | consumed tokens: 23834132480 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.763638E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.096 | TFLOPs: 26.05 | +7: iteration 45470/ 173500 | consumed samples: 11640320 | consumed tokens: 23839375360 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.770318E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.944 | TFLOPs: 26.08 | +7: iteration 45480/ 173500 | consumed samples: 11642880 | consumed tokens: 23844618240 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.763467E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.200 | TFLOPs: 26.07 | +7: iteration 45490/ 173500 | consumed samples: 11645440 | consumed tokens: 23849861120 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.762654E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.330 | TFLOPs: 26.07 | +7: iteration 45500/ 173500 | consumed samples: 11648000 | consumed tokens: 23855104000 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.766060E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.045 | TFLOPs: 26.07 | +7: iteration 45510/ 173500 | consumed samples: 11650560 | consumed tokens: 23860346880 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.760959E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.543 | TFLOPs: 25.98 | +7: iteration 45520/ 173500 | consumed samples: 11653120 | consumed tokens: 23865589760 | elapsed time per iteration (s): 0.15 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.764514E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.540 | TFLOPs: 26.01 | +7: iteration 45530/ 173500 | consumed samples: 11655680 | consumed tokens: 23870832640 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.749553E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.827 | TFLOPs: 26.01 | +7: iteration 45540/ 173500 | consumed samples: 11658240 | consumed tokens: 23876075520 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.757190E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.339 | TFLOPs: 26.04 | +7: iteration 45550/ 173500 | consumed samples: 11660800 | consumed tokens: 23881318400 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.773222E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.284 | TFLOPs: 26.05 | +7: iteration 45560/ 173500 | consumed samples: 11663360 | consumed tokens: 23886561280 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.763068E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.429 | TFLOPs: 26.09 | +7: iteration 45570/ 173500 | consumed samples: 11665920 | consumed tokens: 23891804160 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.753856E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.269 | TFLOPs: 26.10 | +7: iteration 45580/ 173500 | consumed samples: 11668480 | consumed tokens: 23897047040 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.764663E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.001 | TFLOPs: 26.11 | +7: iteration 45590/ 173500 | consumed samples: 11671040 | consumed tokens: 23902289920 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.756864E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.563 | TFLOPs: 26.12 | +7: iteration 45600/ 173500 | consumed samples: 11673600 | consumed tokens: 23907532800 | elapsed time per iteration (s): 0.15 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.764660E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.703 | TFLOPs: 26.12 | +7: iteration 45610/ 173500 | consumed samples: 11676160 | consumed tokens: 23912775680 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.758059E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.082 | TFLOPs: 26.11 | +7: iteration 45620/ 173500 | consumed samples: 11678720 | consumed tokens: 23918018560 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.759679E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.218 | TFLOPs: 26.11 | +7: iteration 45630/ 173500 | consumed samples: 11681280 | consumed tokens: 23923261440 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.760989E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.939 | TFLOPs: 26.13 | +7: iteration 45640/ 173500 | consumed samples: 11683840 | consumed tokens: 23928504320 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.751023E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.805 | TFLOPs: 26.14 | +7: iteration 45650/ 173500 | consumed samples: 11686400 | consumed tokens: 23933747200 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.771622E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.938 | TFLOPs: 26.17 | +7: iteration 45660/ 173500 | consumed samples: 11688960 | consumed tokens: 23938990080 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.757250E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.025 | TFLOPs: 26.16 | +7: iteration 45670/ 173500 | consumed samples: 11691520 | consumed tokens: 23944232960 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.761988E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.556 | TFLOPs: 26.17 | +7: iteration 45680/ 173500 | consumed samples: 11694080 | consumed tokens: 23949475840 | elapsed time per iteration (s): 0.15 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.743061E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.545 | TFLOPs: 26.15 | +7: iteration 45690/ 173500 | consumed samples: 11696640 | consumed tokens: 23954718720 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.772054E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.832 | TFLOPs: 26.17 | +7: iteration 45700/ 173500 | consumed samples: 11699200 | consumed tokens: 23959961600 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.754260E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.024 | TFLOPs: 26.17 | +7: iteration 45710/ 173500 | consumed samples: 11701760 | consumed tokens: 23965204480 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.758569E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.282 | TFLOPs: 26.18 | +7: iteration 45720/ 173500 | consumed samples: 11704320 | consumed tokens: 23970447360 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.773928E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.203 | TFLOPs: 26.13 | +7: iteration 45730/ 173500 | consumed samples: 11706880 | consumed tokens: 23975690240 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.767523E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.857 | TFLOPs: 26.12 | +7: iteration 45740/ 173500 | consumed samples: 11709440 | consumed tokens: 23980933120 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.759726E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.345 | TFLOPs: 26.12 | +7: iteration 45750/ 173500 | consumed samples: 11712000 | consumed tokens: 23986176000 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.761620E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.190 | TFLOPs: 26.11 | +7: iteration 45760/ 173500 | consumed samples: 11714560 | consumed tokens: 23991418880 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.742057E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.224 | TFLOPs: 26.13 | +7: iteration 45770/ 173500 | consumed samples: 11717120 | consumed tokens: 23996661760 | elapsed time per iteration (s): 0.15 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.768798E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.137 | TFLOPs: 26.13 | +7: iteration 45780/ 173500 | consumed samples: 11719680 | consumed tokens: 24001904640 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.758134E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.659 | TFLOPs: 26.14 | +7: iteration 45790/ 173500 | consumed samples: 11722240 | consumed tokens: 24007147520 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.751369E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.718 | TFLOPs: 26.11 | +7: iteration 45800/ 173500 | consumed samples: 11724800 | consumed tokens: 24012390400 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.775980E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.968 | TFLOPs: 26.16 | +7: iteration 45810/ 173500 | consumed samples: 11727360 | consumed tokens: 24017633280 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.767345E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.870 | TFLOPs: 26.19 | +7: iteration 45820/ 173500 | consumed samples: 11729920 | consumed tokens: 24022876160 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.754488E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.662 | TFLOPs: 26.20 | +7: iteration 45830/ 173500 | consumed samples: 11732480 | consumed tokens: 24028119040 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.764801E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.728 | TFLOPs: 26.22 | +7: iteration 45840/ 173500 | consumed samples: 11735040 | consumed tokens: 24033361920 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.751145E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.204 | TFLOPs: 26.21 | +7: iteration 45850/ 173500 | consumed samples: 11737600 | consumed tokens: 24038604800 | elapsed time per iteration (s): 0.15 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.766367E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.294 | TFLOPs: 26.18 | +7: iteration 45860/ 173500 | consumed samples: 11740160 | consumed tokens: 24043847680 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.770621E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.066 | TFLOPs: 26.21 | +7: iteration 45870/ 173500 | consumed samples: 11742720 | consumed tokens: 24049090560 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.759166E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.726 | TFLOPs: 26.20 | +7: iteration 45880/ 173500 | consumed samples: 11745280 | consumed tokens: 24054333440 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.774514E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.600 | TFLOPs: 26.20 | +7: iteration 45890/ 173500 | consumed samples: 11747840 | consumed tokens: 24059576320 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.746409E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.785 | TFLOPs: 26.20 | +7: iteration 45900/ 173500 | consumed samples: 11750400 | consumed tokens: 24064819200 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.747198E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.483 | TFLOPs: 26.20 | +7: iteration 45910/ 173500 | consumed samples: 11752960 | consumed tokens: 24070062080 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.763722E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.737 | TFLOPs: 26.20 | +7: iteration 45920/ 173500 | consumed samples: 11755520 | consumed tokens: 24075304960 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.766850E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.211 | TFLOPs: 26.19 | +7: iteration 45930/ 173500 | consumed samples: 11758080 | consumed tokens: 24080547840 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.764544E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.237 | TFLOPs: 26.21 | +7: iteration 45940/ 173500 | consumed samples: 11760640 | consumed tokens: 24085790720 | elapsed time per iteration (s): 0.15 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.758783E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.562 | TFLOPs: 26.20 | +7: iteration 45950/ 173500 | consumed samples: 11763200 | consumed tokens: 24091033600 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.755682E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.398 | TFLOPs: 26.18 | +7: iteration 45960/ 173500 | consumed samples: 11765760 | consumed tokens: 24096276480 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.768331E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.728 | TFLOPs: 26.20 | +7: iteration 45970/ 173500 | consumed samples: 11768320 | consumed tokens: 24101519360 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.751221E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.333 | TFLOPs: 26.20 | +7: iteration 45980/ 173500 | consumed samples: 11770880 | consumed tokens: 24106762240 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.772892E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.571 | TFLOPs: 26.18 | +7: iteration 45990/ 173500 | consumed samples: 11773440 | consumed tokens: 24112005120 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.760933E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.433 | TFLOPs: 26.21 | +0: [2023-03-17 02:14:24,528] [INFO] [logging.py:68:log_dist] [Rank 0] step=46000, skipped=0, lr=[0.00017208047558447097, 0.00017208047558447097, 0.00017208047558447097], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 46000/ 173500 | consumed samples: 11776000 | consumed tokens: 24117248000 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.774883E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.676 | TFLOPs: 26.22 | +0: steps: 46000 loss: 3.7708 iter time (s): 0.153 samples/sec: 1674.708 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 46000 | lm loss value: 3.884946E+00 | lm loss PPL: 4.866433E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 46000 to checkpoints_44m91b100m +0: [2023-03-17 02:14:24,601] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step46000 is begin to save! +0: [2023-03-17 02:14:24,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:14:24,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:14:24,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:14:24,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:14:24,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:14:24,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:14:24,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:14:24,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:14:24,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:14:24,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:14:24,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:14:24,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:14:24,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:14:24,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:14:24,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:14:24,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:14:24,722] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:14:24,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:14:24,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:14:24,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:14:24,731] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step46000/mp_rank_00_model_states.pt +0: [2023-03-17 02:14:24,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:14:24,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:14:24,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:14:24,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +5: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +1: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +7: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +4: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +2: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:14:24,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +6: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:14:24,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:14:24,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step46000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +3: [2023-03-17 02:14:24,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step46000 is ready now! +0: successfully saved checkpoint at iteration 46000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.05 +7: iteration 46010/ 173500 | consumed samples: 11778560 | consumed tokens: 24122490880 | elapsed time per iteration (s): 0.18 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.748498E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.884 | TFLOPs: 22.64 | +7: iteration 46020/ 173500 | consumed samples: 11781120 | consumed tokens: 24127733760 | elapsed time per iteration (s): 0.15 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.764320E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.306 | TFLOPs: 26.24 | +7: iteration 46030/ 173500 | consumed samples: 11783680 | consumed tokens: 24132976640 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.769718E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.534 | TFLOPs: 26.23 | +7: iteration 46040/ 173500 | consumed samples: 11786240 | consumed tokens: 24138219520 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.767098E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.812 | TFLOPs: 26.23 | +7: iteration 46050/ 173500 | consumed samples: 11788800 | consumed tokens: 24143462400 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.741263E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.551 | TFLOPs: 26.25 | +7: iteration 46060/ 173500 | consumed samples: 11791360 | consumed tokens: 24148705280 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.762483E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.450 | TFLOPs: 26.24 | +7: iteration 46070/ 173500 | consumed samples: 11793920 | consumed tokens: 24153948160 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.763935E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.865 | TFLOPs: 26.23 | +7: iteration 46080/ 173500 | consumed samples: 11796480 | consumed tokens: 24159191040 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.771207E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.340 | TFLOPs: 26.24 | +7: iteration 46090/ 173500 | consumed samples: 11799040 | consumed tokens: 24164433920 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.770816E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.584 | TFLOPs: 26.25 | +7: iteration 46100/ 173500 | consumed samples: 11801600 | consumed tokens: 24169676800 | elapsed time per iteration (s): 0.15 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.759329E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.377 | TFLOPs: 26.21 | +7: iteration 46110/ 173500 | consumed samples: 11804160 | consumed tokens: 24174919680 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.763827E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.040 | TFLOPs: 26.19 | +7: iteration 46120/ 173500 | consumed samples: 11806720 | consumed tokens: 24180162560 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.770431E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 46130/ 173500 | consumed samples: 11809280 | consumed tokens: 24185405440 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.763586E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.471 | TFLOPs: 26.21 | +7: iteration 46140/ 173500 | consumed samples: 11811840 | consumed tokens: 24190648320 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.769634E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.965 | TFLOPs: 26.20 | +7: iteration 46150/ 173500 | consumed samples: 11814400 | consumed tokens: 24195891200 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.768369E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.441 | TFLOPs: 26.21 | +7: iteration 46160/ 173500 | consumed samples: 11816960 | consumed tokens: 24201134080 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.765636E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.704 | TFLOPs: 26.20 | +7: iteration 46170/ 173500 | consumed samples: 11819520 | consumed tokens: 24206376960 | elapsed time per iteration (s): 0.16 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.762535E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.564 | TFLOPs: 25.59 | +7: iteration 46180/ 173500 | consumed samples: 11822080 | consumed tokens: 24211619840 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.756798E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.702 | TFLOPs: 26.15 | +7: iteration 46190/ 173500 | consumed samples: 11824640 | consumed tokens: 24216862720 | elapsed time per iteration (s): 0.15 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.756892E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.015 | TFLOPs: 26.17 | +7: iteration 46200/ 173500 | consumed samples: 11827200 | consumed tokens: 24222105600 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.755451E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.499 | TFLOPs: 26.20 | +7: iteration 46210/ 173500 | consumed samples: 11829760 | consumed tokens: 24227348480 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.763813E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.297 | TFLOPs: 26.05 | +7: iteration 46220/ 173500 | consumed samples: 11832320 | consumed tokens: 24232591360 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.764098E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.549 | TFLOPs: 26.21 | +7: iteration 46230/ 173500 | consumed samples: 11834880 | consumed tokens: 24237834240 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.763428E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.804 | TFLOPs: 26.17 | +7: iteration 46240/ 173500 | consumed samples: 11837440 | consumed tokens: 24243077120 | elapsed time per iteration (s): 0.15 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.754884E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.843 | TFLOPs: 26.19 | +7: iteration 46250/ 173500 | consumed samples: 11840000 | consumed tokens: 24248320000 | elapsed time per iteration (s): 0.16 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.759419E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.380 | TFLOPs: 25.66 | +7: iteration 46260/ 173500 | consumed samples: 11842560 | consumed tokens: 24253562880 | elapsed time per iteration (s): 0.16 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.754206E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.879 | TFLOPs: 25.53 | +7: iteration 46270/ 173500 | consumed samples: 11845120 | consumed tokens: 24258805760 | elapsed time per iteration (s): 0.16 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.766271E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.620 | TFLOPs: 25.78 | +7: iteration 46280/ 173500 | consumed samples: 11847680 | consumed tokens: 24264048640 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.758017E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.690 | TFLOPs: 25.90 | +7: iteration 46290/ 173500 | consumed samples: 11850240 | consumed tokens: 24269291520 | elapsed time per iteration (s): 0.16 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.772729E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.778 | TFLOPs: 25.86 | +7: iteration 46300/ 173500 | consumed samples: 11852800 | consumed tokens: 24274534400 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.775798E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.613 | TFLOPs: 26.20 | +7: iteration 46310/ 173500 | consumed samples: 11855360 | consumed tokens: 24279777280 | elapsed time per iteration (s): 0.16 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.770384E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.180 | TFLOPs: 25.27 | +7: iteration 46320/ 173500 | consumed samples: 11857920 | consumed tokens: 24285020160 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.741599E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.166 | TFLOPs: 26.21 | +7: iteration 46330/ 173500 | consumed samples: 11860480 | consumed tokens: 24290263040 | elapsed time per iteration (s): 0.16 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.763520E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.917 | TFLOPs: 25.62 | +7: iteration 46340/ 173500 | consumed samples: 11863040 | consumed tokens: 24295505920 | elapsed time per iteration (s): 0.16 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.774534E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.829 | TFLOPs: 25.54 | +7: iteration 46350/ 173500 | consumed samples: 11865600 | consumed tokens: 24300748800 | elapsed time per iteration (s): 0.16 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.764092E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.402 | TFLOPs: 25.87 | +7: iteration 46360/ 173500 | consumed samples: 11868160 | consumed tokens: 24305991680 | elapsed time per iteration (s): 0.15 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.760252E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.969 | TFLOPs: 26.20 | +7: iteration 46370/ 173500 | consumed samples: 11870720 | consumed tokens: 24311234560 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.760117E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.908 | TFLOPs: 25.86 | +7: iteration 46380/ 173500 | consumed samples: 11873280 | consumed tokens: 24316477440 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.768266E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.856 | TFLOPs: 25.86 | +7: iteration 46390/ 173500 | consumed samples: 11875840 | consumed tokens: 24321720320 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.771364E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.559 | TFLOPs: 25.74 | +7: iteration 46400/ 173500 | consumed samples: 11878400 | consumed tokens: 24326963200 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.770630E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.846 | TFLOPs: 25.81 | +7: iteration 46410/ 173500 | consumed samples: 11880960 | consumed tokens: 24332206080 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.762291E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.000 | TFLOPs: 25.86 | +7: iteration 46420/ 173500 | consumed samples: 11883520 | consumed tokens: 24337448960 | elapsed time per iteration (s): 0.15 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.740868E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.173 | TFLOPs: 26.13 | +7: iteration 46430/ 173500 | consumed samples: 11886080 | consumed tokens: 24342691840 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.766460E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.200 | TFLOPs: 25.72 | +7: iteration 46440/ 173500 | consumed samples: 11888640 | consumed tokens: 24347934720 | elapsed time per iteration (s): 0.16 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.754553E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.829 | TFLOPs: 25.87 | +7: iteration 46450/ 173500 | consumed samples: 11891200 | consumed tokens: 24353177600 | elapsed time per iteration (s): 0.15 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.754416E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.548 | TFLOPs: 26.18 | +7: iteration 46460/ 173500 | consumed samples: 11893760 | consumed tokens: 24358420480 | elapsed time per iteration (s): 0.16 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.750430E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.679 | TFLOPs: 25.59 | +7: iteration 46470/ 173500 | consumed samples: 11896320 | consumed tokens: 24363663360 | elapsed time per iteration (s): 0.16 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.768697E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.770 | TFLOPs: 25.70 | +7: iteration 46480/ 173500 | consumed samples: 11898880 | consumed tokens: 24368906240 | elapsed time per iteration (s): 0.16 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.767826E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.017 | TFLOPs: 25.70 | +7: iteration 46490/ 173500 | consumed samples: 11901440 | consumed tokens: 24374149120 | elapsed time per iteration (s): 0.15 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.752975E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.140 | TFLOPs: 26.21 | +7: iteration 46500/ 173500 | consumed samples: 11904000 | consumed tokens: 24379392000 | elapsed time per iteration (s): 0.16 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.749604E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.829 | TFLOPs: 25.83 | +7: iteration 46510/ 173500 | consumed samples: 11906560 | consumed tokens: 24384634880 | elapsed time per iteration (s): 0.16 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.741046E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.218 | TFLOPs: 25.71 | +7: iteration 46520/ 173500 | consumed samples: 11909120 | consumed tokens: 24389877760 | elapsed time per iteration (s): 0.15 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.764820E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.218 | TFLOPs: 25.91 | +7: iteration 46530/ 173500 | consumed samples: 11911680 | consumed tokens: 24395120640 | elapsed time per iteration (s): 0.16 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.760719E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.048 | TFLOPs: 25.78 | +7: iteration 46540/ 173500 | consumed samples: 11914240 | consumed tokens: 24400363520 | elapsed time per iteration (s): 0.16 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.762463E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.777 | TFLOPs: 25.83 | +7: iteration 46550/ 173500 | consumed samples: 11916800 | consumed tokens: 24405606400 | elapsed time per iteration (s): 0.16 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.770269E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.337 | TFLOPs: 25.82 | +7: iteration 46560/ 173500 | consumed samples: 11919360 | consumed tokens: 24410849280 | elapsed time per iteration (s): 0.16 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.762133E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.426 | TFLOPs: 25.15 | +7: iteration 46570/ 173500 | consumed samples: 11921920 | consumed tokens: 24416092160 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.757279E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.162 | TFLOPs: 26.21 | +7: iteration 46580/ 173500 | consumed samples: 11924480 | consumed tokens: 24421335040 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.755891E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.438 | TFLOPs: 26.20 | +7: iteration 46590/ 173500 | consumed samples: 11927040 | consumed tokens: 24426577920 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.756071E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.508 | TFLOPs: 26.21 | +7: iteration 46600/ 173500 | consumed samples: 11929600 | consumed tokens: 24431820800 | elapsed time per iteration (s): 0.15 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.762679E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.750 | TFLOPs: 26.20 | +7: iteration 46610/ 173500 | consumed samples: 11932160 | consumed tokens: 24437063680 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.765218E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.376 | TFLOPs: 26.20 | +7: iteration 46620/ 173500 | consumed samples: 11934720 | consumed tokens: 24442306560 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.763366E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.644 | TFLOPs: 26.17 | +7: iteration 46630/ 173500 | consumed samples: 11937280 | consumed tokens: 24447549440 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.767401E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.306 | TFLOPs: 26.18 | +7: iteration 46640/ 173500 | consumed samples: 11939840 | consumed tokens: 24452792320 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.765072E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.698 | TFLOPs: 26.20 | +7: iteration 46650/ 173500 | consumed samples: 11942400 | consumed tokens: 24458035200 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.754598E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.544 | TFLOPs: 26.21 | +7: iteration 46660/ 173500 | consumed samples: 11944960 | consumed tokens: 24463278080 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.749176E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.444 | TFLOPs: 26.20 | +7: iteration 46670/ 173500 | consumed samples: 11947520 | consumed tokens: 24468520960 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.758501E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.615 | TFLOPs: 26.22 | +7: iteration 46680/ 173500 | consumed samples: 11950080 | consumed tokens: 24473763840 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.768515E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.293 | TFLOPs: 26.21 | +7: iteration 46690/ 173500 | consumed samples: 11952640 | consumed tokens: 24479006720 | elapsed time per iteration (s): 0.15 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.749454E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.261 | TFLOPs: 26.21 | +7: iteration 46700/ 173500 | consumed samples: 11955200 | consumed tokens: 24484249600 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.754182E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.073 | TFLOPs: 26.22 | +7: iteration 46710/ 173500 | consumed samples: 11957760 | consumed tokens: 24489492480 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.762071E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.608 | TFLOPs: 26.20 | +7: iteration 46720/ 173500 | consumed samples: 11960320 | consumed tokens: 24494735360 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.757948E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.877 | TFLOPs: 26.22 | +7: iteration 46730/ 173500 | consumed samples: 11962880 | consumed tokens: 24499978240 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.747625E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.280 | TFLOPs: 26.19 | +7: iteration 46740/ 173500 | consumed samples: 11965440 | consumed tokens: 24505221120 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.772429E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.383 | TFLOPs: 26.21 | +7: iteration 46750/ 173500 | consumed samples: 11968000 | consumed tokens: 24510464000 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.757866E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.731 | TFLOPs: 26.20 | +7: iteration 46760/ 173500 | consumed samples: 11970560 | consumed tokens: 24515706880 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.752161E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.895 | TFLOPs: 26.20 | +7: iteration 46770/ 173500 | consumed samples: 11973120 | consumed tokens: 24520949760 | elapsed time per iteration (s): 0.15 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.769144E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.318 | TFLOPs: 26.21 | +7: iteration 46780/ 173500 | consumed samples: 11975680 | consumed tokens: 24526192640 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.760921E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.104 | TFLOPs: 26.21 | +7: iteration 46790/ 173500 | consumed samples: 11978240 | consumed tokens: 24531435520 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.754853E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.609 | TFLOPs: 26.20 | +7: iteration 46800/ 173500 | consumed samples: 11980800 | consumed tokens: 24536678400 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.765148E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.264 | TFLOPs: 26.19 | +7: iteration 46810/ 173500 | consumed samples: 11983360 | consumed tokens: 24541921280 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.765478E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.777 | TFLOPs: 26.22 | +7: iteration 46820/ 173500 | consumed samples: 11985920 | consumed tokens: 24547164160 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.764787E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.762 | TFLOPs: 26.22 | +7: iteration 46830/ 173500 | consumed samples: 11988480 | consumed tokens: 24552407040 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.776065E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.147 | TFLOPs: 26.21 | +7: iteration 46840/ 173500 | consumed samples: 11991040 | consumed tokens: 24557649920 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.765335E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.406 | TFLOPs: 26.21 | +7: iteration 46850/ 173500 | consumed samples: 11993600 | consumed tokens: 24562892800 | elapsed time per iteration (s): 0.15 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.745537E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.017 | TFLOPs: 26.22 | +7: iteration 46860/ 173500 | consumed samples: 11996160 | consumed tokens: 24568135680 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.753209E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.264 | TFLOPs: 26.24 | +7: iteration 46870/ 173500 | consumed samples: 11998720 | consumed tokens: 24573378560 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.761423E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.721 | TFLOPs: 26.23 | +7: iteration 46880/ 173500 | consumed samples: 12001280 | consumed tokens: 24578621440 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.769467E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.402 | TFLOPs: 26.24 | +7: iteration 46890/ 173500 | consumed samples: 12003840 | consumed tokens: 24583864320 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.766991E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.067 | TFLOPs: 26.24 | +7: iteration 46900/ 173500 | consumed samples: 12006400 | consumed tokens: 24589107200 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.762042E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.352 | TFLOPs: 26.21 | +7: iteration 46910/ 173500 | consumed samples: 12008960 | consumed tokens: 24594350080 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.765251E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.914 | TFLOPs: 26.24 | +7: iteration 46920/ 173500 | consumed samples: 12011520 | consumed tokens: 24599592960 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.748439E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.544 | TFLOPs: 26.20 | +7: iteration 46930/ 173500 | consumed samples: 12014080 | consumed tokens: 24604835840 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.761342E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.985 | TFLOPs: 26.17 | +7: iteration 46940/ 173500 | consumed samples: 12016640 | consumed tokens: 24610078720 | elapsed time per iteration (s): 0.15 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.765023E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.059 | TFLOPs: 26.21 | +7: iteration 46950/ 173500 | consumed samples: 12019200 | consumed tokens: 24615321600 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.778573E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.404 | TFLOPs: 26.21 | +7: iteration 46960/ 173500 | consumed samples: 12021760 | consumed tokens: 24620564480 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.739806E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.074 | TFLOPs: 26.22 | +7: iteration 46970/ 173500 | consumed samples: 12024320 | consumed tokens: 24625807360 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.768683E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.690 | TFLOPs: 26.15 | +7: iteration 46980/ 173500 | consumed samples: 12026880 | consumed tokens: 24631050240 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.767240E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.434 | TFLOPs: 26.18 | +7: iteration 46990/ 173500 | consumed samples: 12029440 | consumed tokens: 24636293120 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.762993E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.117 | TFLOPs: 26.18 | +7: iteration 47000/ 173500 | consumed samples: 12032000 | consumed tokens: 24641536000 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.755427E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.973 | TFLOPs: 26.19 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 47000 | lm loss value: 3.883117E+00 | lm loss PPL: 4.857540E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 47000 to checkpoints_44m91b100m +0: [2023-03-17 02:16:58,820] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step47000 is begin to save! +0: [2023-03-17 02:16:58,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:16:58,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:16:58,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:16:58,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:16:58,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:16:58,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:16:58,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:16:58,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:16:58,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:16:58,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:16:58,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:16:58,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:16:58,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:16:58,932] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:16:58,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:16:58,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:16:58,940] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:16:58,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:16:58,948] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:16:58,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:16:58,949] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step47000/mp_rank_00_model_states.pt +0: [2023-03-17 02:16:58,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:16:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:16:58,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:16:58,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,983] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,983] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,985] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,986] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: [2023-03-17 02:16:58,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +5: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +3: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +7: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +1: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +2: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:16:58,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 02:16:58,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:16:58,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +6: [2023-03-17 02:16:58,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +4: [2023-03-17 02:16:58,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:16:58,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step47000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:16:58,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step47000 is ready now! +0: successfully saved checkpoint at iteration 47000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.30 +7: iteration 47010/ 173500 | consumed samples: 12034560 | consumed tokens: 24646778880 | elapsed time per iteration (s): 0.18 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.763473E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.209 | TFLOPs: 22.87 | +7: iteration 47020/ 173500 | consumed samples: 12037120 | consumed tokens: 24652021760 | elapsed time per iteration (s): 0.15 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.759188E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.897 | TFLOPs: 26.20 | +7: iteration 47030/ 173500 | consumed samples: 12039680 | consumed tokens: 24657264640 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.759381E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.254 | TFLOPs: 26.23 | +7: iteration 47040/ 173500 | consumed samples: 12042240 | consumed tokens: 24662507520 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.757690E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.690 | TFLOPs: 26.22 | +7: iteration 47050/ 173500 | consumed samples: 12044800 | consumed tokens: 24667750400 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.779805E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.687 | TFLOPs: 26.22 | +7: iteration 47060/ 173500 | consumed samples: 12047360 | consumed tokens: 24672993280 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.756644E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.900 | TFLOPs: 26.19 | +7: iteration 47070/ 173500 | consumed samples: 12049920 | consumed tokens: 24678236160 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.756852E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.776 | TFLOPs: 26.20 | +7: iteration 47080/ 173500 | consumed samples: 12052480 | consumed tokens: 24683479040 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.759421E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.653 | TFLOPs: 26.20 | +7: iteration 47090/ 173500 | consumed samples: 12055040 | consumed tokens: 24688721920 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.768102E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.079 | TFLOPs: 26.16 | +7: iteration 47100/ 173500 | consumed samples: 12057600 | consumed tokens: 24693964800 | elapsed time per iteration (s): 0.15 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.750010E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.517 | TFLOPs: 26.18 | +7: iteration 47110/ 173500 | consumed samples: 12060160 | consumed tokens: 24699207680 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.744903E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.080 | TFLOPs: 26.18 | +7: iteration 47120/ 173500 | consumed samples: 12062720 | consumed tokens: 24704450560 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.746581E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.345 | TFLOPs: 26.18 | +7: iteration 47130/ 173500 | consumed samples: 12065280 | consumed tokens: 24709693440 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.767274E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.870 | TFLOPs: 26.17 | +7: iteration 47140/ 173500 | consumed samples: 12067840 | consumed tokens: 24714936320 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.770328E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.703 | TFLOPs: 26.15 | +7: iteration 47150/ 173500 | consumed samples: 12070400 | consumed tokens: 24720179200 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.753067E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.535 | TFLOPs: 26.15 | +7: iteration 47160/ 173500 | consumed samples: 12072960 | consumed tokens: 24725422080 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.763195E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.228 | TFLOPs: 26.18 | +7: iteration 47170/ 173500 | consumed samples: 12075520 | consumed tokens: 24730664960 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.755321E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.733 | TFLOPs: 26.17 | +7: iteration 47180/ 173500 | consumed samples: 12078080 | consumed tokens: 24735907840 | elapsed time per iteration (s): 0.15 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.759335E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.979 | TFLOPs: 26.19 | +7: iteration 47190/ 173500 | consumed samples: 12080640 | consumed tokens: 24741150720 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.757775E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.486 | TFLOPs: 26.20 | +7: iteration 47200/ 173500 | consumed samples: 12083200 | consumed tokens: 24746393600 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.752680E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.476 | TFLOPs: 26.17 | +7: iteration 47210/ 173500 | consumed samples: 12085760 | consumed tokens: 24751636480 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.750014E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.172 | TFLOPs: 25.93 | +7: iteration 47220/ 173500 | consumed samples: 12088320 | consumed tokens: 24756879360 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.762639E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.737 | TFLOPs: 26.06 | +7: iteration 47230/ 173500 | consumed samples: 12090880 | consumed tokens: 24762122240 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.775222E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.707 | TFLOPs: 26.17 | +7: iteration 47240/ 173500 | consumed samples: 12093440 | consumed tokens: 24767365120 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.760179E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.857 | TFLOPs: 26.19 | +7: iteration 47250/ 173500 | consumed samples: 12096000 | consumed tokens: 24772608000 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.753921E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.034 | TFLOPs: 26.19 | +7: iteration 47260/ 173500 | consumed samples: 12098560 | consumed tokens: 24777850880 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.736026E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.471 | TFLOPs: 26.20 | +7: iteration 47270/ 173500 | consumed samples: 12101120 | consumed tokens: 24783093760 | elapsed time per iteration (s): 0.15 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.759463E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.405 | TFLOPs: 26.18 | +7: iteration 47280/ 173500 | consumed samples: 12103680 | consumed tokens: 24788336640 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.764728E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.833 | TFLOPs: 26.17 | +7: iteration 47290/ 173500 | consumed samples: 12106240 | consumed tokens: 24793579520 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.754676E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.560 | TFLOPs: 26.18 | +7: iteration 47300/ 173500 | consumed samples: 12108800 | consumed tokens: 24798822400 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.758998E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.109 | TFLOPs: 26.19 | +7: iteration 47310/ 173500 | consumed samples: 12111360 | consumed tokens: 24804065280 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.771023E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.997 | TFLOPs: 26.17 | +7: iteration 47320/ 173500 | consumed samples: 12113920 | consumed tokens: 24809308160 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.746719E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.595 | TFLOPs: 26.18 | +7: iteration 47330/ 173500 | consumed samples: 12116480 | consumed tokens: 24814551040 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.764026E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.168 | TFLOPs: 26.16 | +7: iteration 47340/ 173500 | consumed samples: 12119040 | consumed tokens: 24819793920 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.761810E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.894 | TFLOPs: 26.19 | +7: iteration 47350/ 173500 | consumed samples: 12121600 | consumed tokens: 24825036800 | elapsed time per iteration (s): 0.15 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.773573E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.306 | TFLOPs: 26.18 | +7: iteration 47360/ 173500 | consumed samples: 12124160 | consumed tokens: 24830279680 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.765257E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.669 | TFLOPs: 26.20 | +7: iteration 47370/ 173500 | consumed samples: 12126720 | consumed tokens: 24835522560 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.754260E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.611 | TFLOPs: 26.17 | +7: iteration 47380/ 173500 | consumed samples: 12129280 | consumed tokens: 24840765440 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.758032E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.679 | TFLOPs: 26.11 | +7: iteration 47390/ 173500 | consumed samples: 12131840 | consumed tokens: 24846008320 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.751382E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.830 | TFLOPs: 26.14 | +7: iteration 47400/ 173500 | consumed samples: 12134400 | consumed tokens: 24851251200 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.750050E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.053 | TFLOPs: 26.17 | +7: iteration 47410/ 173500 | consumed samples: 12136960 | consumed tokens: 24856494080 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.758779E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.530 | TFLOPs: 26.17 | +7: iteration 47420/ 173500 | consumed samples: 12139520 | consumed tokens: 24861736960 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.770417E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.060 | TFLOPs: 26.18 | +7: iteration 47430/ 173500 | consumed samples: 12142080 | consumed tokens: 24866979840 | elapsed time per iteration (s): 0.15 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.755869E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.837 | TFLOPs: 26.17 | +7: iteration 47440/ 173500 | consumed samples: 12144640 | consumed tokens: 24872222720 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.730767E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.035 | TFLOPs: 26.17 | +7: iteration 47450/ 173500 | consumed samples: 12147200 | consumed tokens: 24877465600 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.751145E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.725 | TFLOPs: 26.19 | +7: iteration 47460/ 173500 | consumed samples: 12149760 | consumed tokens: 24882708480 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.757703E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.603 | TFLOPs: 26.17 | +7: iteration 47470/ 173500 | consumed samples: 12152320 | consumed tokens: 24887951360 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.764148E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.380 | TFLOPs: 26.20 | +7: iteration 47480/ 173500 | consumed samples: 12154880 | consumed tokens: 24893194240 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.746930E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.901 | TFLOPs: 26.27 | +7: iteration 47490/ 173500 | consumed samples: 12157440 | consumed tokens: 24898437120 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.765503E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.850 | TFLOPs: 26.27 | +7: iteration 47500/ 173500 | consumed samples: 12160000 | consumed tokens: 24903680000 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.769506E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.421 | TFLOPs: 26.27 | +7: iteration 47510/ 173500 | consumed samples: 12162560 | consumed tokens: 24908922880 | elapsed time per iteration (s): 0.15 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.755659E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.152 | TFLOPs: 26.27 | +7: iteration 47520/ 173500 | consumed samples: 12165120 | consumed tokens: 24914165760 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.760629E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.442 | TFLOPs: 26.21 | +7: iteration 47530/ 173500 | consumed samples: 12167680 | consumed tokens: 24919408640 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.767941E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.560 | TFLOPs: 26.25 | +7: iteration 47540/ 173500 | consumed samples: 12170240 | consumed tokens: 24924651520 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.750883E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.599 | TFLOPs: 26.26 | +7: iteration 47550/ 173500 | consumed samples: 12172800 | consumed tokens: 24929894400 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.751807E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.646 | TFLOPs: 26.26 | +7: iteration 47560/ 173500 | consumed samples: 12175360 | consumed tokens: 24935137280 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.760477E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.976 | TFLOPs: 26.27 | +7: iteration 47570/ 173500 | consumed samples: 12177920 | consumed tokens: 24940380160 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.756408E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.567 | TFLOPs: 26.26 | +7: iteration 47580/ 173500 | consumed samples: 12180480 | consumed tokens: 24945623040 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.755531E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.302 | TFLOPs: 26.26 | +7: iteration 47590/ 173500 | consumed samples: 12183040 | consumed tokens: 24950865920 | elapsed time per iteration (s): 0.15 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.750422E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.350 | TFLOPs: 26.26 | +7: iteration 47600/ 173500 | consumed samples: 12185600 | consumed tokens: 24956108800 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.775377E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.933 | TFLOPs: 26.25 | +7: iteration 47610/ 173500 | consumed samples: 12188160 | consumed tokens: 24961351680 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.751993E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.537 | TFLOPs: 26.26 | +7: iteration 47620/ 173500 | consumed samples: 12190720 | consumed tokens: 24966594560 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.755341E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.551 | TFLOPs: 26.26 | +7: iteration 47630/ 173500 | consumed samples: 12193280 | consumed tokens: 24971837440 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.768020E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.066 | TFLOPs: 26.24 | +7: iteration 47640/ 173500 | consumed samples: 12195840 | consumed tokens: 24977080320 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.757781E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.114 | TFLOPs: 26.24 | +7: iteration 47650/ 173500 | consumed samples: 12198400 | consumed tokens: 24982323200 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.758988E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.244 | TFLOPs: 26.21 | +7: iteration 47660/ 173500 | consumed samples: 12200960 | consumed tokens: 24987566080 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.755247E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.680 | TFLOPs: 26.15 | +7: iteration 47670/ 173500 | consumed samples: 12203520 | consumed tokens: 24992808960 | elapsed time per iteration (s): 0.15 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.756054E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.909 | TFLOPs: 26.11 | +7: iteration 47680/ 173500 | consumed samples: 12206080 | consumed tokens: 24998051840 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.755803E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.598 | TFLOPs: 26.09 | +7: iteration 47690/ 173500 | consumed samples: 12208640 | consumed tokens: 25003294720 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.755864E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.919 | TFLOPs: 26.09 | +7: iteration 47700/ 173500 | consumed samples: 12211200 | consumed tokens: 25008537600 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.760910E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.888 | TFLOPs: 26.11 | +7: iteration 47710/ 173500 | consumed samples: 12213760 | consumed tokens: 25013780480 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.765136E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.555 | TFLOPs: 26.10 | +7: iteration 47720/ 173500 | consumed samples: 12216320 | consumed tokens: 25019023360 | elapsed time per iteration (s): 0.16 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.755878E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.323 | TFLOPs: 25.63 | +7: iteration 47730/ 173500 | consumed samples: 12218880 | consumed tokens: 25024266240 | elapsed time per iteration (s): 0.16 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.747168E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.556 | TFLOPs: 25.76 | +7: iteration 47740/ 173500 | consumed samples: 12221440 | consumed tokens: 25029509120 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.764309E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.654 | TFLOPs: 26.15 | +7: iteration 47750/ 173500 | consumed samples: 12224000 | consumed tokens: 25034752000 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.768422E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.232 | TFLOPs: 26.18 | +7: iteration 47760/ 173500 | consumed samples: 12226560 | consumed tokens: 25039994880 | elapsed time per iteration (s): 0.15 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.754585E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.019 | TFLOPs: 26.14 | +7: iteration 47770/ 173500 | consumed samples: 12229120 | consumed tokens: 25045237760 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.756704E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.255 | TFLOPs: 26.08 | +7: iteration 47780/ 173500 | consumed samples: 12231680 | consumed tokens: 25050480640 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.759717E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.785 | TFLOPs: 26.11 | +7: iteration 47790/ 173500 | consumed samples: 12234240 | consumed tokens: 25055723520 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.762115E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.348 | TFLOPs: 26.10 | +7: iteration 47800/ 173500 | consumed samples: 12236800 | consumed tokens: 25060966400 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.758731E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.355 | TFLOPs: 26.09 | +7: iteration 47810/ 173500 | consumed samples: 12239360 | consumed tokens: 25066209280 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.763600E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.304 | TFLOPs: 26.01 | +7: iteration 47820/ 173500 | consumed samples: 12241920 | consumed tokens: 25071452160 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.758170E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.455 | TFLOPs: 26.13 | +7: iteration 47830/ 173500 | consumed samples: 12244480 | consumed tokens: 25076695040 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.748238E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.662 | TFLOPs: 26.15 | +7: iteration 47840/ 173500 | consumed samples: 12247040 | consumed tokens: 25081937920 | elapsed time per iteration (s): 0.15 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.772354E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.332 | TFLOPs: 26.12 | +7: iteration 47850/ 173500 | consumed samples: 12249600 | consumed tokens: 25087180800 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.760641E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.921 | TFLOPs: 26.13 | +7: iteration 47860/ 173500 | consumed samples: 12252160 | consumed tokens: 25092423680 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.746150E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.763 | TFLOPs: 26.14 | +7: iteration 47870/ 173500 | consumed samples: 12254720 | consumed tokens: 25097666560 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.771867E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.740 | TFLOPs: 26.14 | +7: iteration 47880/ 173500 | consumed samples: 12257280 | consumed tokens: 25102909440 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.757490E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.160 | TFLOPs: 26.11 | +7: iteration 47890/ 173500 | consumed samples: 12259840 | consumed tokens: 25108152320 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.759782E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.743 | TFLOPs: 26.12 | +7: iteration 47900/ 173500 | consumed samples: 12262400 | consumed tokens: 25113395200 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.751138E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.662 | TFLOPs: 26.14 | +7: iteration 47910/ 173500 | consumed samples: 12264960 | consumed tokens: 25118638080 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.760640E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.436 | TFLOPs: 26.12 | +7: iteration 47920/ 173500 | consumed samples: 12267520 | consumed tokens: 25123880960 | elapsed time per iteration (s): 0.15 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.743815E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.579 | TFLOPs: 26.10 | +7: iteration 47930/ 173500 | consumed samples: 12270080 | consumed tokens: 25129123840 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.757720E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.611 | TFLOPs: 26.14 | +7: iteration 47940/ 173500 | consumed samples: 12272640 | consumed tokens: 25134366720 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.749469E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.657 | TFLOPs: 26.15 | +7: iteration 47950/ 173500 | consumed samples: 12275200 | consumed tokens: 25139609600 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.771553E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.901 | TFLOPs: 26.19 | +7: iteration 47960/ 173500 | consumed samples: 12277760 | consumed tokens: 25144852480 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.754573E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.154 | TFLOPs: 26.21 | +7: iteration 47970/ 173500 | consumed samples: 12280320 | consumed tokens: 25150095360 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.761060E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.120 | TFLOPs: 26.21 | +7: iteration 47980/ 173500 | consumed samples: 12282880 | consumed tokens: 25155338240 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.745483E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.974 | TFLOPs: 26.21 | +7: iteration 47990/ 173500 | consumed samples: 12285440 | consumed tokens: 25160581120 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.749870E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.462 | TFLOPs: 26.20 | +0: [2023-03-17 02:19:32,433] [INFO] [logging.py:68:log_dist] [Rank 0] step=48000, skipped=0, lr=[0.00016965587057872074, 0.00016965587057872074, 0.00016965587057872074], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 48000/ 173500 | consumed samples: 12288000 | consumed tokens: 25165824000 | elapsed time per iteration (s): 0.15 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.741844E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.045 | TFLOPs: 26.21 | +0: steps: 48000 loss: 3.7251 iter time (s): 0.153 samples/sec: 1674.834 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 48000 | lm loss value: 3.947184E+00 | lm loss PPL: 5.178930E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 48000 to checkpoints_44m91b100m +0: [2023-03-17 02:19:32,506] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step48000 is begin to save! +0: [2023-03-17 02:19:32,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:19:32,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:19:32,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:19:32,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:19:32,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:19:32,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:19:32,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:19:32,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:19:32,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:19:32,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:19:32,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:19:32,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:19:32,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:19:32,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:19:32,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:19:32,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:19:32,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:19:32,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:19:32,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:19:32,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:19:32,638] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step48000/mp_rank_00_model_states.pt +0: [2023-03-17 02:19:32,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:19:32,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:19:32,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:19:32,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +7: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +1: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +5: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +3: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +4: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +6: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:19:32,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +2: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:19:32,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step48000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:19:32,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step48000 is ready now! +0: successfully saved checkpoint at iteration 48000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.79 +7: iteration 48010/ 173500 | consumed samples: 12290560 | consumed tokens: 25171066880 | elapsed time per iteration (s): 0.18 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.749844E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1439.457 | TFLOPs: 22.57 | +7: iteration 48020/ 173500 | consumed samples: 12293120 | consumed tokens: 25176309760 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.749327E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.691 | TFLOPs: 25.97 | +7: iteration 48030/ 173500 | consumed samples: 12295680 | consumed tokens: 25181552640 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.746409E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.656 | TFLOPs: 26.15 | +7: iteration 48040/ 173500 | consumed samples: 12298240 | consumed tokens: 25186795520 | elapsed time per iteration (s): 0.16 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.741917E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.427 | TFLOPs: 25.85 | +7: iteration 48050/ 173500 | consumed samples: 12300800 | consumed tokens: 25192038400 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.764953E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.405 | TFLOPs: 26.15 | +7: iteration 48060/ 173500 | consumed samples: 12303360 | consumed tokens: 25197281280 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.748168E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.482 | TFLOPs: 26.12 | +7: iteration 48070/ 173500 | consumed samples: 12305920 | consumed tokens: 25202524160 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.740935E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.765 | TFLOPs: 26.15 | +7: iteration 48080/ 173500 | consumed samples: 12308480 | consumed tokens: 25207767040 | elapsed time per iteration (s): 0.15 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.767418E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.605 | TFLOPs: 26.15 | +7: iteration 48090/ 173500 | consumed samples: 12311040 | consumed tokens: 25213009920 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.750079E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.390 | TFLOPs: 26.15 | +7: iteration 48100/ 173500 | consumed samples: 12313600 | consumed tokens: 25218252800 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.770914E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.175 | TFLOPs: 26.16 | +7: iteration 48110/ 173500 | consumed samples: 12316160 | consumed tokens: 25223495680 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.758522E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.409 | TFLOPs: 26.15 | +7: iteration 48120/ 173500 | consumed samples: 12318720 | consumed tokens: 25228738560 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.766111E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.327 | TFLOPs: 26.04 | +7: iteration 48130/ 173500 | consumed samples: 12321280 | consumed tokens: 25233981440 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.750888E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.198 | TFLOPs: 26.07 | +7: iteration 48140/ 173500 | consumed samples: 12323840 | consumed tokens: 25239224320 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.760474E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.805 | TFLOPs: 26.06 | +7: iteration 48150/ 173500 | consumed samples: 12326400 | consumed tokens: 25244467200 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.756904E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.768 | TFLOPs: 26.05 | +7: iteration 48160/ 173500 | consumed samples: 12328960 | consumed tokens: 25249710080 | elapsed time per iteration (s): 0.15 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.754881E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.405 | TFLOPs: 26.05 | +7: iteration 48170/ 173500 | consumed samples: 12331520 | consumed tokens: 25254952960 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.746449E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.183 | TFLOPs: 26.07 | +7: iteration 48180/ 173500 | consumed samples: 12334080 | consumed tokens: 25260195840 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.762565E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.682 | TFLOPs: 26.06 | +7: iteration 48190/ 173500 | consumed samples: 12336640 | consumed tokens: 25265438720 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.762466E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.510 | TFLOPs: 26.07 | +7: iteration 48200/ 173500 | consumed samples: 12339200 | consumed tokens: 25270681600 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.763885E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.717 | TFLOPs: 26.01 | +7: iteration 48210/ 173500 | consumed samples: 12341760 | consumed tokens: 25275924480 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.755053E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.225 | TFLOPs: 26.05 | +7: iteration 48220/ 173500 | consumed samples: 12344320 | consumed tokens: 25281167360 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.776068E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.098 | TFLOPs: 26.02 | +7: iteration 48230/ 173500 | consumed samples: 12346880 | consumed tokens: 25286410240 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.763028E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.296 | TFLOPs: 26.05 | +7: iteration 48240/ 173500 | consumed samples: 12349440 | consumed tokens: 25291653120 | elapsed time per iteration (s): 0.15 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.755775E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.099 | TFLOPs: 26.07 | +7: iteration 48250/ 173500 | consumed samples: 12352000 | consumed tokens: 25296896000 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.763611E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.983 | TFLOPs: 26.05 | +7: iteration 48260/ 173500 | consumed samples: 12354560 | consumed tokens: 25302138880 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.752142E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.223 | TFLOPs: 26.07 | +7: iteration 48270/ 173500 | consumed samples: 12357120 | consumed tokens: 25307381760 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.756265E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.760 | TFLOPs: 26.04 | +7: iteration 48280/ 173500 | consumed samples: 12359680 | consumed tokens: 25312624640 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.742824E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.058 | TFLOPs: 26.05 | +7: iteration 48290/ 173500 | consumed samples: 12362240 | consumed tokens: 25317867520 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.758790E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.047 | TFLOPs: 26.05 | +7: iteration 48300/ 173500 | consumed samples: 12364800 | consumed tokens: 25323110400 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.753605E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.461 | TFLOPs: 26.04 | +7: iteration 48310/ 173500 | consumed samples: 12367360 | consumed tokens: 25328353280 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.765251E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.161 | TFLOPs: 26.05 | +7: iteration 48320/ 173500 | consumed samples: 12369920 | consumed tokens: 25333596160 | elapsed time per iteration (s): 0.15 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.759333E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.339 | TFLOPs: 26.07 | +7: iteration 48330/ 173500 | consumed samples: 12372480 | consumed tokens: 25338839040 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.738697E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.663 | TFLOPs: 26.06 | +7: iteration 48340/ 173500 | consumed samples: 12375040 | consumed tokens: 25344081920 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.764321E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.473 | TFLOPs: 26.02 | +7: iteration 48350/ 173500 | consumed samples: 12377600 | consumed tokens: 25349324800 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.770007E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.100 | TFLOPs: 26.07 | +7: iteration 48360/ 173500 | consumed samples: 12380160 | consumed tokens: 25354567680 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.764747E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.054 | TFLOPs: 26.05 | +7: iteration 48370/ 173500 | consumed samples: 12382720 | consumed tokens: 25359810560 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.761496E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.325 | TFLOPs: 26.07 | +7: iteration 48380/ 173500 | consumed samples: 12385280 | consumed tokens: 25365053440 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.762148E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.282 | TFLOPs: 26.02 | +7: iteration 48390/ 173500 | consumed samples: 12387840 | consumed tokens: 25370296320 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.744676E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.522 | TFLOPs: 26.06 | +7: iteration 48400/ 173500 | consumed samples: 12390400 | consumed tokens: 25375539200 | elapsed time per iteration (s): 0.15 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.752269E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.913 | TFLOPs: 26.05 | +7: iteration 48410/ 173500 | consumed samples: 12392960 | consumed tokens: 25380782080 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.744889E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.040 | TFLOPs: 26.06 | +7: iteration 48420/ 173500 | consumed samples: 12395520 | consumed tokens: 25386024960 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.751180E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.579 | TFLOPs: 26.07 | +7: iteration 48430/ 173500 | consumed samples: 12398080 | consumed tokens: 25391267840 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.754739E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.094 | TFLOPs: 26.05 | +7: iteration 48440/ 173500 | consumed samples: 12400640 | consumed tokens: 25396510720 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.766296E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.801 | TFLOPs: 26.05 | +7: iteration 48450/ 173500 | consumed samples: 12403200 | consumed tokens: 25401753600 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.762523E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.276 | TFLOPs: 26.05 | +7: iteration 48460/ 173500 | consumed samples: 12405760 | consumed tokens: 25406996480 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.761848E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.431 | TFLOPs: 26.09 | +7: iteration 48470/ 173500 | consumed samples: 12408320 | consumed tokens: 25412239360 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.753046E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.837 | TFLOPs: 26.12 | +7: iteration 48480/ 173500 | consumed samples: 12410880 | consumed tokens: 25417482240 | elapsed time per iteration (s): 0.15 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.756097E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.444 | TFLOPs: 26.13 | +7: iteration 48490/ 173500 | consumed samples: 12413440 | consumed tokens: 25422725120 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.748319E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.826 | TFLOPs: 26.14 | +7: iteration 48500/ 173500 | consumed samples: 12416000 | consumed tokens: 25427968000 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.756833E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.551 | TFLOPs: 26.12 | +7: iteration 48510/ 173500 | consumed samples: 12418560 | consumed tokens: 25433210880 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.767250E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.407 | TFLOPs: 26.12 | +7: iteration 48520/ 173500 | consumed samples: 12421120 | consumed tokens: 25438453760 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.757999E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.520 | TFLOPs: 26.15 | +7: iteration 48530/ 173500 | consumed samples: 12423680 | consumed tokens: 25443696640 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.767596E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.695 | TFLOPs: 26.11 | +7: iteration 48540/ 173500 | consumed samples: 12426240 | consumed tokens: 25448939520 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.760205E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.975 | TFLOPs: 26.16 | +7: iteration 48550/ 173500 | consumed samples: 12428800 | consumed tokens: 25454182400 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.749504E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.087 | TFLOPs: 26.16 | +7: iteration 48560/ 173500 | consumed samples: 12431360 | consumed tokens: 25459425280 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.755780E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.340 | TFLOPs: 26.15 | +7: iteration 48570/ 173500 | consumed samples: 12433920 | consumed tokens: 25464668160 | elapsed time per iteration (s): 0.15 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.757251E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.024 | TFLOPs: 26.14 | +7: iteration 48580/ 173500 | consumed samples: 12436480 | consumed tokens: 25469911040 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.750498E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.091 | TFLOPs: 26.16 | +7: iteration 48590/ 173500 | consumed samples: 12439040 | consumed tokens: 25475153920 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.751651E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.179 | TFLOPs: 26.16 | +7: iteration 48600/ 173500 | consumed samples: 12441600 | consumed tokens: 25480396800 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.770329E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.271 | TFLOPs: 26.15 | +7: iteration 48610/ 173500 | consumed samples: 12444160 | consumed tokens: 25485639680 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.756947E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.238 | TFLOPs: 26.16 | +7: iteration 48620/ 173500 | consumed samples: 12446720 | consumed tokens: 25490882560 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.752596E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.713 | TFLOPs: 26.15 | +7: iteration 48630/ 173500 | consumed samples: 12449280 | consumed tokens: 25496125440 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.749782E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.839 | TFLOPs: 26.16 | +7: iteration 48640/ 173500 | consumed samples: 12451840 | consumed tokens: 25501368320 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.752769E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.870 | TFLOPs: 26.13 | +7: iteration 48650/ 173500 | consumed samples: 12454400 | consumed tokens: 25506611200 | elapsed time per iteration (s): 0.15 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.762232E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.336 | TFLOPs: 26.16 | +7: iteration 48660/ 173500 | consumed samples: 12456960 | consumed tokens: 25511854080 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.750240E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.799 | TFLOPs: 26.20 | +7: iteration 48670/ 173500 | consumed samples: 12459520 | consumed tokens: 25517096960 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.770343E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.588 | TFLOPs: 26.21 | +7: iteration 48680/ 173500 | consumed samples: 12462080 | consumed tokens: 25522339840 | elapsed time per iteration (s): 0.16 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.746629E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.105 | TFLOPs: 25.80 | +7: iteration 48690/ 173500 | consumed samples: 12464640 | consumed tokens: 25527582720 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.767709E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.646 | TFLOPs: 25.92 | +7: iteration 48700/ 173500 | consumed samples: 12467200 | consumed tokens: 25532825600 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.755816E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.862 | TFLOPs: 26.25 | +7: iteration 48710/ 173500 | consumed samples: 12469760 | consumed tokens: 25538068480 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.753315E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.364 | TFLOPs: 26.24 | +7: iteration 48720/ 173500 | consumed samples: 12472320 | consumed tokens: 25543311360 | elapsed time per iteration (s): 0.15 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.772486E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.070 | TFLOPs: 26.00 | +7: iteration 48730/ 173500 | consumed samples: 12474880 | consumed tokens: 25548554240 | elapsed time per iteration (s): 0.16 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.757544E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.837 | TFLOPs: 25.89 | +7: iteration 48740/ 173500 | consumed samples: 12477440 | consumed tokens: 25553797120 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.757778E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.354 | TFLOPs: 26.24 | +7: iteration 48750/ 173500 | consumed samples: 12480000 | consumed tokens: 25559040000 | elapsed time per iteration (s): 0.16 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.750989E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.983 | TFLOPs: 25.86 | +7: iteration 48760/ 173500 | consumed samples: 12482560 | consumed tokens: 25564282880 | elapsed time per iteration (s): 0.16 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.763281E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.696 | TFLOPs: 25.75 | +7: iteration 48770/ 173500 | consumed samples: 12485120 | consumed tokens: 25569525760 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.755357E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.401 | TFLOPs: 26.23 | +7: iteration 48780/ 173500 | consumed samples: 12487680 | consumed tokens: 25574768640 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.762108E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.048 | TFLOPs: 26.22 | +7: iteration 48790/ 173500 | consumed samples: 12490240 | consumed tokens: 25580011520 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.755980E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.004 | TFLOPs: 26.22 | +7: iteration 48800/ 173500 | consumed samples: 12492800 | consumed tokens: 25585254400 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.748934E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.631 | TFLOPs: 26.23 | +7: iteration 48810/ 173500 | consumed samples: 12495360 | consumed tokens: 25590497280 | elapsed time per iteration (s): 0.15 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.760465E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.314 | TFLOPs: 26.26 | +7: iteration 48820/ 173500 | consumed samples: 12497920 | consumed tokens: 25595740160 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.752557E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.586 | TFLOPs: 26.25 | +7: iteration 48830/ 173500 | consumed samples: 12500480 | consumed tokens: 25600983040 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.764627E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.531 | TFLOPs: 26.28 | +7: iteration 48840/ 173500 | consumed samples: 12503040 | consumed tokens: 25606225920 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.759757E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.940 | TFLOPs: 26.25 | +7: iteration 48850/ 173500 | consumed samples: 12505600 | consumed tokens: 25611468800 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.752892E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.116 | TFLOPs: 26.27 | +7: iteration 48860/ 173500 | consumed samples: 12508160 | consumed tokens: 25616711680 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.757893E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.913 | TFLOPs: 26.25 | +7: iteration 48870/ 173500 | consumed samples: 12510720 | consumed tokens: 25621954560 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.752177E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.854 | TFLOPs: 26.22 | +7: iteration 48880/ 173500 | consumed samples: 12513280 | consumed tokens: 25627197440 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.741330E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.758 | TFLOPs: 26.12 | +7: iteration 48890/ 173500 | consumed samples: 12515840 | consumed tokens: 25632440320 | elapsed time per iteration (s): 0.15 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.760587E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.448 | TFLOPs: 26.10 | +7: iteration 48900/ 173500 | consumed samples: 12518400 | consumed tokens: 25637683200 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.761860E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.656 | TFLOPs: 26.17 | +7: iteration 48910/ 173500 | consumed samples: 12520960 | consumed tokens: 25642926080 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.765147E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.806 | TFLOPs: 26.12 | +7: iteration 48920/ 173500 | consumed samples: 12523520 | consumed tokens: 25648168960 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.752913E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.188 | TFLOPs: 26.07 | +7: iteration 48930/ 173500 | consumed samples: 12526080 | consumed tokens: 25653411840 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.762486E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.445 | TFLOPs: 26.13 | +7: iteration 48940/ 173500 | consumed samples: 12528640 | consumed tokens: 25658654720 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.752762E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.857 | TFLOPs: 26.14 | +7: iteration 48950/ 173500 | consumed samples: 12531200 | consumed tokens: 25663897600 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.768151E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.454 | TFLOPs: 26.09 | +7: iteration 48960/ 173500 | consumed samples: 12533760 | consumed tokens: 25669140480 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.761109E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.840 | TFLOPs: 26.03 | +7: iteration 48970/ 173500 | consumed samples: 12536320 | consumed tokens: 25674383360 | elapsed time per iteration (s): 0.15 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.751406E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.569 | TFLOPs: 25.95 | +7: iteration 48980/ 173500 | consumed samples: 12538880 | consumed tokens: 25679626240 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.761271E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.647 | TFLOPs: 26.15 | +7: iteration 48990/ 173500 | consumed samples: 12541440 | consumed tokens: 25684869120 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.755743E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.723 | TFLOPs: 25.90 | +7: iteration 49000/ 173500 | consumed samples: 12544000 | consumed tokens: 25690112000 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.757733E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.540 | TFLOPs: 26.14 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 49000 | lm loss value: 3.871442E+00 | lm loss PPL: 4.801157E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 49000 to checkpoints_44m91b100m +0: [2023-03-17 02:22:06,562] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step49000 is begin to save! +0: [2023-03-17 02:22:06,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:22:06,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:22:06,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:22:06,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:22:06,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:22:06,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:22:06,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:22:06,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:22:06,653] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:22:06,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:22:06,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:22:06,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:22:06,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:22:06,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:22:06,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:22:06,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:22:06,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:22:06,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:22:06,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:22:06,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:22:06,695] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step49000/mp_rank_00_model_states.pt +0: [2023-03-17 02:22:06,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:22:06,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:22:06,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +3: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +2: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +7: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +5: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +4: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:22:06,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:22:06,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +1: [2023-03-17 02:22:06,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:22:06,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:22:06,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +6: [2023-03-17 02:22:06,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:22:06,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step49000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:22:06,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step49000 is ready now! +0: successfully saved checkpoint at iteration 49000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.51 +7: iteration 49010/ 173500 | consumed samples: 12546560 | consumed tokens: 25695354880 | elapsed time per iteration (s): 0.18 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.761179E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.415 | TFLOPs: 22.40 | +7: iteration 49020/ 173500 | consumed samples: 12549120 | consumed tokens: 25700597760 | elapsed time per iteration (s): 0.16 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.764433E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.060 | TFLOPs: 25.69 | +7: iteration 49030/ 173500 | consumed samples: 12551680 | consumed tokens: 25705840640 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.740358E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.341 | TFLOPs: 26.12 | +7: iteration 49040/ 173500 | consumed samples: 12554240 | consumed tokens: 25711083520 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.762314E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.639 | TFLOPs: 26.12 | +7: iteration 49050/ 173500 | consumed samples: 12556800 | consumed tokens: 25716326400 | elapsed time per iteration (s): 0.15 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.777090E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.247 | TFLOPs: 26.15 | +7: iteration 49060/ 173500 | consumed samples: 12559360 | consumed tokens: 25721569280 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.761319E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.255 | TFLOPs: 26.18 | +7: iteration 49070/ 173500 | consumed samples: 12561920 | consumed tokens: 25726812160 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.762277E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.228 | TFLOPs: 26.16 | +7: iteration 49080/ 173500 | consumed samples: 12564480 | consumed tokens: 25732055040 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.747433E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.495 | TFLOPs: 26.17 | +7: iteration 49090/ 173500 | consumed samples: 12567040 | consumed tokens: 25737297920 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.751034E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.471 | TFLOPs: 25.91 | +7: iteration 49100/ 173500 | consumed samples: 12569600 | consumed tokens: 25742540800 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.764285E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.446 | TFLOPs: 26.15 | +7: iteration 49110/ 173500 | consumed samples: 12572160 | consumed tokens: 25747783680 | elapsed time per iteration (s): 0.16 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.748984E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.715 | TFLOPs: 25.37 | +7: iteration 49120/ 173500 | consumed samples: 12574720 | consumed tokens: 25753026560 | elapsed time per iteration (s): 0.15 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.764824E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.043 | TFLOPs: 26.16 | +7: iteration 49130/ 173500 | consumed samples: 12577280 | consumed tokens: 25758269440 | elapsed time per iteration (s): 0.16 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.756691E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.080 | TFLOPs: 25.88 | +7: iteration 49140/ 173500 | consumed samples: 12579840 | consumed tokens: 25763512320 | elapsed time per iteration (s): 0.15 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.744227E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.919 | TFLOPs: 25.98 | +7: iteration 49150/ 173500 | consumed samples: 12582400 | consumed tokens: 25768755200 | elapsed time per iteration (s): 0.16 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.749703E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.968 | TFLOPs: 25.47 | +7: iteration 49160/ 173500 | consumed samples: 12584960 | consumed tokens: 25773998080 | elapsed time per iteration (s): 0.16 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.756802E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.005 | TFLOPs: 25.50 | +7: iteration 49170/ 173500 | consumed samples: 12587520 | consumed tokens: 25779240960 | elapsed time per iteration (s): 0.16 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.766975E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.161 | TFLOPs: 25.82 | +7: iteration 49180/ 173500 | consumed samples: 12590080 | consumed tokens: 25784483840 | elapsed time per iteration (s): 0.16 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.764510E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.950 | TFLOPs: 25.39 | +7: iteration 49190/ 173500 | consumed samples: 12592640 | consumed tokens: 25789726720 | elapsed time per iteration (s): 0.16 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.746035E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.494 | TFLOPs: 25.62 | +7: iteration 49200/ 173500 | consumed samples: 12595200 | consumed tokens: 25794969600 | elapsed time per iteration (s): 0.17 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.749525E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1470.777 | TFLOPs: 23.07 | +7: iteration 49210/ 173500 | consumed samples: 12597760 | consumed tokens: 25800212480 | elapsed time per iteration (s): 0.15 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.744737E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.134 | TFLOPs: 26.04 | +7: iteration 49220/ 173500 | consumed samples: 12600320 | consumed tokens: 25805455360 | elapsed time per iteration (s): 0.16 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.766101E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.519 | TFLOPs: 25.15 | +7: iteration 49230/ 173500 | consumed samples: 12602880 | consumed tokens: 25810698240 | elapsed time per iteration (s): 0.16 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.762070E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.492 | TFLOPs: 25.18 | +7: iteration 49240/ 173500 | consumed samples: 12605440 | consumed tokens: 25815941120 | elapsed time per iteration (s): 0.16 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.757025E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.662 | TFLOPs: 25.15 | +7: iteration 49250/ 173500 | consumed samples: 12608000 | consumed tokens: 25821184000 | elapsed time per iteration (s): 0.16 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.750725E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.545 | TFLOPs: 25.65 | +7: iteration 49260/ 173500 | consumed samples: 12610560 | consumed tokens: 25826426880 | elapsed time per iteration (s): 0.17 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.762781E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.315 | TFLOPs: 23.81 | +7: iteration 49270/ 173500 | consumed samples: 12613120 | consumed tokens: 25831669760 | elapsed time per iteration (s): 0.17 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.736772E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.972 | TFLOPs: 24.12 | +7: iteration 49280/ 173500 | consumed samples: 12615680 | consumed tokens: 25836912640 | elapsed time per iteration (s): 0.16 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.750837E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.925 | TFLOPs: 25.04 | +7: iteration 49290/ 173500 | consumed samples: 12618240 | consumed tokens: 25842155520 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.757840E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.557 | TFLOPs: 25.21 | +7: iteration 49300/ 173500 | consumed samples: 12620800 | consumed tokens: 25847398400 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.757796E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.896 | TFLOPs: 24.43 | +7: iteration 49310/ 173500 | consumed samples: 12623360 | consumed tokens: 25852641280 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.754379E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.037 | TFLOPs: 25.05 | +7: iteration 49320/ 173500 | consumed samples: 12625920 | consumed tokens: 25857884160 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.754378E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.684 | TFLOPs: 25.04 | +7: iteration 49330/ 173500 | consumed samples: 12628480 | consumed tokens: 25863127040 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.767387E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.619 | TFLOPs: 25.18 | +7: iteration 49340/ 173500 | consumed samples: 12631040 | consumed tokens: 25868369920 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.757174E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.382 | TFLOPs: 24.47 | +7: iteration 49350/ 173500 | consumed samples: 12633600 | consumed tokens: 25873612800 | elapsed time per iteration (s): 0.16 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.751674E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.615 | TFLOPs: 25.10 | +7: iteration 49360/ 173500 | consumed samples: 12636160 | consumed tokens: 25878855680 | elapsed time per iteration (s): 0.17 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.756099E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.506 | TFLOPs: 24.13 | +7: iteration 49370/ 173500 | consumed samples: 12638720 | consumed tokens: 25884098560 | elapsed time per iteration (s): 0.16 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.758786E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.456 | TFLOPs: 24.86 | +7: iteration 49380/ 173500 | consumed samples: 12641280 | consumed tokens: 25889341440 | elapsed time per iteration (s): 0.18 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.757959E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.715 | TFLOPs: 22.88 | +7: iteration 49390/ 173500 | consumed samples: 12643840 | consumed tokens: 25894584320 | elapsed time per iteration (s): 0.17 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.739286E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.670 | TFLOPs: 23.31 | +7: iteration 49400/ 173500 | consumed samples: 12646400 | consumed tokens: 25899827200 | elapsed time per iteration (s): 0.17 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.748672E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.047 | TFLOPs: 23.65 | +7: iteration 49410/ 173500 | consumed samples: 12648960 | consumed tokens: 25905070080 | elapsed time per iteration (s): 0.16 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.753086E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.460 | TFLOPs: 25.70 | +7: iteration 49420/ 173500 | consumed samples: 12651520 | consumed tokens: 25910312960 | elapsed time per iteration (s): 0.17 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.755310E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.381 | TFLOPs: 23.84 | +7: iteration 49430/ 173500 | consumed samples: 12654080 | consumed tokens: 25915555840 | elapsed time per iteration (s): 0.16 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.749405E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.082 | TFLOPs: 24.75 | +7: iteration 49440/ 173500 | consumed samples: 12656640 | consumed tokens: 25920798720 | elapsed time per iteration (s): 0.18 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.759251E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.866 | TFLOPs: 22.52 | +7: iteration 49450/ 173500 | consumed samples: 12659200 | consumed tokens: 25926041600 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.762058E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.890 | TFLOPs: 25.50 | +7: iteration 49460/ 173500 | consumed samples: 12661760 | consumed tokens: 25931284480 | elapsed time per iteration (s): 0.17 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.756730E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1491.324 | TFLOPs: 23.39 | +7: iteration 49470/ 173500 | consumed samples: 12664320 | consumed tokens: 25936527360 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.763140E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.842 | TFLOPs: 25.12 | +7: iteration 49480/ 173500 | consumed samples: 12666880 | consumed tokens: 25941770240 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.752867E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.152 | TFLOPs: 24.55 | +7: iteration 49490/ 173500 | consumed samples: 12669440 | consumed tokens: 25947013120 | elapsed time per iteration (s): 0.17 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.747031E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.012 | TFLOPs: 24.10 | +7: iteration 49500/ 173500 | consumed samples: 12672000 | consumed tokens: 25952256000 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.761724E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.040 | TFLOPs: 25.11 | +7: iteration 49510/ 173500 | consumed samples: 12674560 | consumed tokens: 25957498880 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.755414E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.510 | TFLOPs: 24.57 | +7: iteration 49520/ 173500 | consumed samples: 12677120 | consumed tokens: 25962741760 | elapsed time per iteration (s): 0.16 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.761975E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.922 | TFLOPs: 25.36 | +7: iteration 49530/ 173500 | consumed samples: 12679680 | consumed tokens: 25967984640 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.736121E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.314 | TFLOPs: 24.60 | +7: iteration 49540/ 173500 | consumed samples: 12682240 | consumed tokens: 25973227520 | elapsed time per iteration (s): 0.15 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.757657E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.540 | TFLOPs: 25.96 | +7: iteration 49550/ 173500 | consumed samples: 12684800 | consumed tokens: 25978470400 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.765693E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.065 | TFLOPs: 25.08 | +7: iteration 49560/ 173500 | consumed samples: 12687360 | consumed tokens: 25983713280 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.751772E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.535 | TFLOPs: 24.74 | +7: iteration 49570/ 173500 | consumed samples: 12689920 | consumed tokens: 25988956160 | elapsed time per iteration (s): 0.17 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.760347E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.021 | TFLOPs: 24.14 | +7: iteration 49580/ 173500 | consumed samples: 12692480 | consumed tokens: 25994199040 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.753316E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.460 | TFLOPs: 24.85 | +7: iteration 49590/ 173500 | consumed samples: 12695040 | consumed tokens: 25999441920 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.770986E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.004 | TFLOPs: 25.22 | +7: iteration 49600/ 173500 | consumed samples: 12697600 | consumed tokens: 26004684800 | elapsed time per iteration (s): 0.16 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.761153E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.461 | TFLOPs: 25.29 | +7: iteration 49610/ 173500 | consumed samples: 12700160 | consumed tokens: 26009927680 | elapsed time per iteration (s): 0.16 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.739920E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.360 | TFLOPs: 24.93 | +7: iteration 49620/ 173500 | consumed samples: 12702720 | consumed tokens: 26015170560 | elapsed time per iteration (s): 0.17 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.754423E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.185 | TFLOPs: 23.93 | +7: iteration 49630/ 173500 | consumed samples: 12705280 | consumed tokens: 26020413440 | elapsed time per iteration (s): 0.16 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.755753E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.682 | TFLOPs: 25.23 | +7: iteration 49640/ 173500 | consumed samples: 12707840 | consumed tokens: 26025656320 | elapsed time per iteration (s): 0.17 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.764039E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.816 | TFLOPs: 24.18 | +7: iteration 49650/ 173500 | consumed samples: 12710400 | consumed tokens: 26030899200 | elapsed time per iteration (s): 0.16 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.766615E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.517 | TFLOPs: 24.35 | +7: iteration 49660/ 173500 | consumed samples: 12712960 | consumed tokens: 26036142080 | elapsed time per iteration (s): 0.16 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.745851E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.943 | TFLOPs: 24.76 | +7: iteration 49670/ 173500 | consumed samples: 12715520 | consumed tokens: 26041384960 | elapsed time per iteration (s): 0.17 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.763451E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.346 | TFLOPs: 23.65 | +7: iteration 49680/ 173500 | consumed samples: 12718080 | consumed tokens: 26046627840 | elapsed time per iteration (s): 0.17 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.764105E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.053 | TFLOPs: 23.92 | +7: iteration 49690/ 173500 | consumed samples: 12720640 | consumed tokens: 26051870720 | elapsed time per iteration (s): 0.17 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.756328E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.464 | TFLOPs: 24.22 | +7: iteration 49700/ 173500 | consumed samples: 12723200 | consumed tokens: 26057113600 | elapsed time per iteration (s): 0.16 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.745631E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.621 | TFLOPs: 24.76 | +7: iteration 49710/ 173500 | consumed samples: 12725760 | consumed tokens: 26062356480 | elapsed time per iteration (s): 0.17 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.759269E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.976 | TFLOPs: 23.87 | +7: iteration 49720/ 173500 | consumed samples: 12728320 | consumed tokens: 26067599360 | elapsed time per iteration (s): 0.17 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.752847E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.980 | TFLOPs: 24.07 | +7: iteration 49730/ 173500 | consumed samples: 12730880 | consumed tokens: 26072842240 | elapsed time per iteration (s): 0.16 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.764581E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.414 | TFLOPs: 25.24 | +7: iteration 49740/ 173500 | consumed samples: 12733440 | consumed tokens: 26078085120 | elapsed time per iteration (s): 0.17 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.762753E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.833 | TFLOPs: 23.85 | +7: iteration 49750/ 173500 | consumed samples: 12736000 | consumed tokens: 26083328000 | elapsed time per iteration (s): 0.16 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.756425E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.255 | TFLOPs: 25.10 | +7: iteration 49760/ 173500 | consumed samples: 12738560 | consumed tokens: 26088570880 | elapsed time per iteration (s): 0.17 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.757166E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.204 | TFLOPs: 24.14 | +7: iteration 49770/ 173500 | consumed samples: 12741120 | consumed tokens: 26093813760 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.751134E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.203 | TFLOPs: 25.19 | +7: iteration 49780/ 173500 | consumed samples: 12743680 | consumed tokens: 26099056640 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.751649E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.331 | TFLOPs: 25.10 | +7: iteration 49790/ 173500 | consumed samples: 12746240 | consumed tokens: 26104299520 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.755006E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.972 | TFLOPs: 25.22 | +7: iteration 49800/ 173500 | consumed samples: 12748800 | consumed tokens: 26109542400 | elapsed time per iteration (s): 0.17 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.756236E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.008 | TFLOPs: 23.29 | +7: iteration 49810/ 173500 | consumed samples: 12751360 | consumed tokens: 26114785280 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.758482E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.255 | TFLOPs: 24.52 | +7: iteration 49820/ 173500 | consumed samples: 12753920 | consumed tokens: 26120028160 | elapsed time per iteration (s): 0.17 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.755093E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.264 | TFLOPs: 24.20 | +7: iteration 49830/ 173500 | consumed samples: 12756480 | consumed tokens: 26125271040 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.764992E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.663 | TFLOPs: 24.98 | +7: iteration 49840/ 173500 | consumed samples: 12759040 | consumed tokens: 26130513920 | elapsed time per iteration (s): 0.16 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.764566E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.259 | TFLOPs: 24.41 | +7: iteration 49850/ 173500 | consumed samples: 12761600 | consumed tokens: 26135756800 | elapsed time per iteration (s): 0.16 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.739444E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.350 | TFLOPs: 25.77 | +7: iteration 49860/ 173500 | consumed samples: 12764160 | consumed tokens: 26140999680 | elapsed time per iteration (s): 0.17 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.743639E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1492.223 | TFLOPs: 23.40 | +7: iteration 49870/ 173500 | consumed samples: 12766720 | consumed tokens: 26146242560 | elapsed time per iteration (s): 0.17 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.749891E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.419 | TFLOPs: 23.78 | +7: iteration 49880/ 173500 | consumed samples: 12769280 | consumed tokens: 26151485440 | elapsed time per iteration (s): 0.16 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.753072E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.856 | TFLOPs: 24.89 | +7: iteration 49890/ 173500 | consumed samples: 12771840 | consumed tokens: 26156728320 | elapsed time per iteration (s): 0.18 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.747839E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1427.436 | TFLOPs: 22.39 | +7: iteration 49900/ 173500 | consumed samples: 12774400 | consumed tokens: 26161971200 | elapsed time per iteration (s): 0.16 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.764272E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.209 | TFLOPs: 25.90 | +7: iteration 49910/ 173500 | consumed samples: 12776960 | consumed tokens: 26167214080 | elapsed time per iteration (s): 0.16 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.754612E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.772 | TFLOPs: 24.85 | +7: iteration 49920/ 173500 | consumed samples: 12779520 | consumed tokens: 26172456960 | elapsed time per iteration (s): 0.16 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.741859E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.852 | TFLOPs: 24.90 | +7: iteration 49930/ 173500 | consumed samples: 12782080 | consumed tokens: 26177699840 | elapsed time per iteration (s): 0.16 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.765120E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.669 | TFLOPs: 24.96 | +7: iteration 49940/ 173500 | consumed samples: 12784640 | consumed tokens: 26182942720 | elapsed time per iteration (s): 0.15 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.758861E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.904 | TFLOPs: 25.92 | +7: iteration 49950/ 173500 | consumed samples: 12787200 | consumed tokens: 26188185600 | elapsed time per iteration (s): 0.18 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.745938E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.786 | TFLOPs: 22.83 | +7: iteration 49960/ 173500 | consumed samples: 12789760 | consumed tokens: 26193428480 | elapsed time per iteration (s): 0.17 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.745976E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.235 | TFLOPs: 24.12 | +7: iteration 49970/ 173500 | consumed samples: 12792320 | consumed tokens: 26198671360 | elapsed time per iteration (s): 0.16 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.738396E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.539 | TFLOPs: 24.38 | +7: iteration 49980/ 173500 | consumed samples: 12794880 | consumed tokens: 26203914240 | elapsed time per iteration (s): 0.17 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.758479E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.469 | TFLOPs: 23.47 | +7: iteration 49990/ 173500 | consumed samples: 12797440 | consumed tokens: 26209157120 | elapsed time per iteration (s): 0.17 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.749051E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.453 | TFLOPs: 23.67 | +0: [2023-03-17 02:24:48,930] [INFO] [logging.py:68:log_dist] [Rank 0] step=50000, skipped=0, lr=[0.00016715144913462704, 0.00016715144913462704, 0.00016715144913462704], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 50000/ 173500 | consumed samples: 12800000 | consumed tokens: 26214400000 | elapsed time per iteration (s): 0.17 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.748832E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.331 | TFLOPs: 23.97 | +0: steps: 50000 loss: 3.7788 iter time (s): 0.157 samples/sec: 1629.195 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 50000 | lm loss value: 3.876811E+00 | lm loss PPL: 4.827005E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 50000 to checkpoints_44m91b100m +0: [2023-03-17 02:24:49,006] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step50000 is begin to save! +0: [2023-03-17 02:24:49,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:24:49,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:24:49,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:24:49,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:24:49,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:24:49,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:24:49,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:24:49,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:24:49,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:24:49,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:24:49,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:24:49,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:24:49,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:24:49,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:24:49,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:24:49,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:24:49,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:24:49,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:24:49,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:24:49,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:24:49,158] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step50000/mp_rank_00_model_states.pt +0: [2023-03-17 02:24:49,158] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:24:49,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:24:49,177] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:24:49,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,193] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: [2023-03-17 02:24:49,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:24:49,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:24:49,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +7: [2023-03-17 02:24:49,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:24:49,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:24:49,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +1: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +3: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +5: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +2: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +6: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:24:49,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step50000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:24:49,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step50000 is ready now! +0: successfully saved checkpoint at iteration 50000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 225.31 +7: iteration 50010/ 173500 | consumed samples: 12802560 | consumed tokens: 26219642880 | elapsed time per iteration (s): 0.19 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.734861E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.415 | TFLOPs: 21.55 | +7: iteration 50020/ 173500 | consumed samples: 12805120 | consumed tokens: 26224885760 | elapsed time per iteration (s): 0.16 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.754558E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.085 | TFLOPs: 24.48 | +7: iteration 50030/ 173500 | consumed samples: 12807680 | consumed tokens: 26230128640 | elapsed time per iteration (s): 0.16 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.752202E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.877 | TFLOPs: 25.23 | +7: iteration 50040/ 173500 | consumed samples: 12810240 | consumed tokens: 26235371520 | elapsed time per iteration (s): 0.16 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.749823E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.728 | TFLOPs: 24.82 | +7: iteration 50050/ 173500 | consumed samples: 12812800 | consumed tokens: 26240614400 | elapsed time per iteration (s): 0.17 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.760722E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.976 | TFLOPs: 23.23 | +7: iteration 50060/ 173500 | consumed samples: 12815360 | consumed tokens: 26245857280 | elapsed time per iteration (s): 0.16 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.755114E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.133 | TFLOPs: 24.34 | +7: iteration 50070/ 173500 | consumed samples: 12817920 | consumed tokens: 26251100160 | elapsed time per iteration (s): 0.18 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.743415E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.975 | TFLOPs: 21.92 | +7: iteration 50080/ 173500 | consumed samples: 12820480 | consumed tokens: 26256343040 | elapsed time per iteration (s): 0.17 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.757794E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.345 | TFLOPs: 24.20 | +7: iteration 50090/ 173500 | consumed samples: 12823040 | consumed tokens: 26261585920 | elapsed time per iteration (s): 0.17 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.756454E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.085 | TFLOPs: 23.87 | +7: iteration 50100/ 173500 | consumed samples: 12825600 | consumed tokens: 26266828800 | elapsed time per iteration (s): 0.16 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.757642E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.700 | TFLOPs: 25.70 | +7: iteration 50110/ 173500 | consumed samples: 12828160 | consumed tokens: 26272071680 | elapsed time per iteration (s): 0.17 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.749381E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.473 | TFLOPs: 23.70 | +7: iteration 50120/ 173500 | consumed samples: 12830720 | consumed tokens: 26277314560 | elapsed time per iteration (s): 0.17 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.742643E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.538 | TFLOPs: 24.08 | +7: iteration 50130/ 173500 | consumed samples: 12833280 | consumed tokens: 26282557440 | elapsed time per iteration (s): 0.16 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.747602E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.718 | TFLOPs: 24.76 | +7: iteration 50140/ 173500 | consumed samples: 12835840 | consumed tokens: 26287800320 | elapsed time per iteration (s): 0.18 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.761187E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.999 | TFLOPs: 22.74 | +7: iteration 50150/ 173500 | consumed samples: 12838400 | consumed tokens: 26293043200 | elapsed time per iteration (s): 0.16 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.747489E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.668 | TFLOPs: 24.49 | +7: iteration 50160/ 173500 | consumed samples: 12840960 | consumed tokens: 26298286080 | elapsed time per iteration (s): 0.17 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.757374E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.346 | TFLOPs: 24.20 | +7: iteration 50170/ 173500 | consumed samples: 12843520 | consumed tokens: 26303528960 | elapsed time per iteration (s): 0.16 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.750510E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.852 | TFLOPs: 24.95 | +7: iteration 50180/ 173500 | consumed samples: 12846080 | consumed tokens: 26308771840 | elapsed time per iteration (s): 0.16 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.755188E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.398 | TFLOPs: 24.47 | +7: iteration 50190/ 173500 | consumed samples: 12848640 | consumed tokens: 26314014720 | elapsed time per iteration (s): 0.17 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.753265E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.027 | TFLOPs: 23.67 | +7: iteration 50200/ 173500 | consumed samples: 12851200 | consumed tokens: 26319257600 | elapsed time per iteration (s): 0.16 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.748620E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.087 | TFLOPs: 24.64 | +7: iteration 50210/ 173500 | consumed samples: 12853760 | consumed tokens: 26324500480 | elapsed time per iteration (s): 0.16 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.736715E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.192 | TFLOPs: 24.47 | +7: iteration 50220/ 173500 | consumed samples: 12856320 | consumed tokens: 26329743360 | elapsed time per iteration (s): 0.16 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.749142E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.766 | TFLOPs: 25.72 | +7: iteration 50230/ 173500 | consumed samples: 12858880 | consumed tokens: 26334986240 | elapsed time per iteration (s): 0.17 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.754556E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1463.031 | TFLOPs: 22.94 | +7: iteration 50240/ 173500 | consumed samples: 12861440 | consumed tokens: 26340229120 | elapsed time per iteration (s): 0.17 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.750830E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1464.688 | TFLOPs: 22.97 | +7: iteration 50250/ 173500 | consumed samples: 12864000 | consumed tokens: 26345472000 | elapsed time per iteration (s): 0.16 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.751801E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.018 | TFLOPs: 25.36 | +7: iteration 50260/ 173500 | consumed samples: 12866560 | consumed tokens: 26350714880 | elapsed time per iteration (s): 0.17 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.751986E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.629 | TFLOPs: 23.13 | +7: iteration 50270/ 173500 | consumed samples: 12869120 | consumed tokens: 26355957760 | elapsed time per iteration (s): 0.16 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.744951E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.970 | TFLOPs: 24.72 | +7: iteration 50280/ 173500 | consumed samples: 12871680 | consumed tokens: 26361200640 | elapsed time per iteration (s): 0.16 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.749618E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.517 | TFLOPs: 25.18 | +7: iteration 50290/ 173500 | consumed samples: 12874240 | consumed tokens: 26366443520 | elapsed time per iteration (s): 0.16 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.758099E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.284 | TFLOPs: 24.67 | +7: iteration 50300/ 173500 | consumed samples: 12876800 | consumed tokens: 26371686400 | elapsed time per iteration (s): 0.16 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.755627E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.343 | TFLOPs: 24.41 | +7: iteration 50310/ 173500 | consumed samples: 12879360 | consumed tokens: 26376929280 | elapsed time per iteration (s): 0.17 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.747287E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.206 | TFLOPs: 24.08 | +7: iteration 50320/ 173500 | consumed samples: 12881920 | consumed tokens: 26382172160 | elapsed time per iteration (s): 0.18 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.744960E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.010 | TFLOPs: 22.77 | +7: iteration 50330/ 173500 | consumed samples: 12884480 | consumed tokens: 26387415040 | elapsed time per iteration (s): 0.16 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.740927E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.933 | TFLOPs: 25.31 | +7: iteration 50340/ 173500 | consumed samples: 12887040 | consumed tokens: 26392657920 | elapsed time per iteration (s): 0.17 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.757630E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.069 | TFLOPs: 23.93 | +7: iteration 50350/ 173500 | consumed samples: 12889600 | consumed tokens: 26397900800 | elapsed time per iteration (s): 0.16 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.753161E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.633 | TFLOPs: 24.84 | +7: iteration 50360/ 173500 | consumed samples: 12892160 | consumed tokens: 26403143680 | elapsed time per iteration (s): 0.16 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.751774E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.039 | TFLOPs: 24.86 | +7: iteration 50370/ 173500 | consumed samples: 12894720 | consumed tokens: 26408386560 | elapsed time per iteration (s): 0.16 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.757944E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.234 | TFLOPs: 24.59 | +7: iteration 50380/ 173500 | consumed samples: 12897280 | consumed tokens: 26413629440 | elapsed time per iteration (s): 0.17 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.750584E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.066 | TFLOPs: 23.65 | +7: iteration 50390/ 173500 | consumed samples: 12899840 | consumed tokens: 26418872320 | elapsed time per iteration (s): 0.16 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.762383E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.969 | TFLOPs: 25.19 | +7: iteration 50400/ 173500 | consumed samples: 12902400 | consumed tokens: 26424115200 | elapsed time per iteration (s): 0.17 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.754371E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.030 | TFLOPs: 24.15 | +7: iteration 50410/ 173500 | consumed samples: 12904960 | consumed tokens: 26429358080 | elapsed time per iteration (s): 0.17 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.763976E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.826 | TFLOPs: 23.74 | +7: iteration 50420/ 173500 | consumed samples: 12907520 | consumed tokens: 26434600960 | elapsed time per iteration (s): 0.16 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.748135E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.654 | TFLOPs: 25.23 | +7: iteration 50430/ 173500 | consumed samples: 12910080 | consumed tokens: 26439843840 | elapsed time per iteration (s): 0.16 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.755212E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.753 | TFLOPs: 24.99 | +7: iteration 50440/ 173500 | consumed samples: 12912640 | consumed tokens: 26445086720 | elapsed time per iteration (s): 0.16 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.763385E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.350 | TFLOPs: 25.32 | +7: iteration 50450/ 173500 | consumed samples: 12915200 | consumed tokens: 26450329600 | elapsed time per iteration (s): 0.16 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.759288E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.099 | TFLOPs: 24.92 | +7: iteration 50460/ 173500 | consumed samples: 12917760 | consumed tokens: 26455572480 | elapsed time per iteration (s): 0.17 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.745571E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.701 | TFLOPs: 24.19 | +7: iteration 50470/ 173500 | consumed samples: 12920320 | consumed tokens: 26460815360 | elapsed time per iteration (s): 0.16 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.758603E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.374 | TFLOPs: 24.80 | +7: iteration 50480/ 173500 | consumed samples: 12922880 | consumed tokens: 26466058240 | elapsed time per iteration (s): 0.16 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.770825E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.016 | TFLOPs: 24.86 | +7: iteration 50490/ 173500 | consumed samples: 12925440 | consumed tokens: 26471301120 | elapsed time per iteration (s): 0.16 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.760444E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.823 | TFLOPs: 24.95 | +7: iteration 50500/ 173500 | consumed samples: 12928000 | consumed tokens: 26476544000 | elapsed time per iteration (s): 0.16 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.766412E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.986 | TFLOPs: 24.78 | +7: iteration 50510/ 173500 | consumed samples: 12930560 | consumed tokens: 26481786880 | elapsed time per iteration (s): 0.17 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.757133E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.031 | TFLOPs: 23.88 | +7: iteration 50520/ 173500 | consumed samples: 12933120 | consumed tokens: 26487029760 | elapsed time per iteration (s): 0.17 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.748336E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.341 | TFLOPs: 23.78 | +7: iteration 50530/ 173500 | consumed samples: 12935680 | consumed tokens: 26492272640 | elapsed time per iteration (s): 0.17 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.752896E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.463 | TFLOPs: 23.73 | +7: iteration 50540/ 173500 | consumed samples: 12938240 | consumed tokens: 26497515520 | elapsed time per iteration (s): 0.18 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.763861E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.812 | TFLOPs: 22.75 | +7: iteration 50550/ 173500 | consumed samples: 12940800 | consumed tokens: 26502758400 | elapsed time per iteration (s): 0.16 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.745852E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.184 | TFLOPs: 25.38 | +7: iteration 50560/ 173500 | consumed samples: 12943360 | consumed tokens: 26508001280 | elapsed time per iteration (s): 0.17 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.747556E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1484.210 | TFLOPs: 23.28 | +7: iteration 50570/ 173500 | consumed samples: 12945920 | consumed tokens: 26513244160 | elapsed time per iteration (s): 0.17 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.765272E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1483.422 | TFLOPs: 23.26 | +7: iteration 50580/ 173500 | consumed samples: 12948480 | consumed tokens: 26518487040 | elapsed time per iteration (s): 0.17 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.755057E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1468.132 | TFLOPs: 23.02 | +7: iteration 50590/ 173500 | consumed samples: 12951040 | consumed tokens: 26523729920 | elapsed time per iteration (s): 0.16 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.735104E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.681 | TFLOPs: 25.48 | +7: iteration 50600/ 173500 | consumed samples: 12953600 | consumed tokens: 26528972800 | elapsed time per iteration (s): 0.17 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.757647E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.609 | TFLOPs: 24.32 | +7: iteration 50610/ 173500 | consumed samples: 12956160 | consumed tokens: 26534215680 | elapsed time per iteration (s): 0.17 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.761202E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.370 | TFLOPs: 23.61 | +7: iteration 50620/ 173500 | consumed samples: 12958720 | consumed tokens: 26539458560 | elapsed time per iteration (s): 0.16 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.745977E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.087 | TFLOPs: 25.00 | +7: iteration 50630/ 173500 | consumed samples: 12961280 | consumed tokens: 26544701440 | elapsed time per iteration (s): 0.17 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.749565E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.997 | TFLOPs: 23.23 | +7: iteration 50640/ 173500 | consumed samples: 12963840 | consumed tokens: 26549944320 | elapsed time per iteration (s): 0.16 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.753698E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.708 | TFLOPs: 25.23 | +7: iteration 50650/ 173500 | consumed samples: 12966400 | consumed tokens: 26555187200 | elapsed time per iteration (s): 0.17 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.732399E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.542 | TFLOPs: 24.29 | +7: iteration 50660/ 173500 | consumed samples: 12968960 | consumed tokens: 26560430080 | elapsed time per iteration (s): 0.17 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.749082E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.172 | TFLOPs: 23.35 | +7: iteration 50670/ 173500 | consumed samples: 12971520 | consumed tokens: 26565672960 | elapsed time per iteration (s): 0.16 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.744116E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.106 | TFLOPs: 25.11 | +7: iteration 50680/ 173500 | consumed samples: 12974080 | consumed tokens: 26570915840 | elapsed time per iteration (s): 0.17 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.732848E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.128 | TFLOPs: 24.06 | +7: iteration 50690/ 173500 | consumed samples: 12976640 | consumed tokens: 26576158720 | elapsed time per iteration (s): 0.16 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.749391E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.672 | TFLOPs: 25.01 | +7: iteration 50700/ 173500 | consumed samples: 12979200 | consumed tokens: 26581401600 | elapsed time per iteration (s): 0.17 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.741993E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.665 | TFLOPs: 23.93 | +7: iteration 50710/ 173500 | consumed samples: 12981760 | consumed tokens: 26586644480 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.759796E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.753 | TFLOPs: 24.43 | +7: iteration 50720/ 173500 | consumed samples: 12984320 | consumed tokens: 26591887360 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.757146E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.924 | TFLOPs: 24.35 | +7: iteration 50730/ 173500 | consumed samples: 12986880 | consumed tokens: 26597130240 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.750149E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.159 | TFLOPs: 25.19 | +7: iteration 50740/ 173500 | consumed samples: 12989440 | consumed tokens: 26602373120 | elapsed time per iteration (s): 0.17 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.751468E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.617 | TFLOPs: 23.35 | +7: iteration 50750/ 173500 | consumed samples: 12992000 | consumed tokens: 26607616000 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.743585E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.249 | TFLOPs: 25.21 | +7: iteration 50760/ 173500 | consumed samples: 12994560 | consumed tokens: 26612858880 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.750003E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.860 | TFLOPs: 25.47 | +7: iteration 50770/ 173500 | consumed samples: 12997120 | consumed tokens: 26618101760 | elapsed time per iteration (s): 0.17 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.743961E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.961 | TFLOPs: 24.21 | +7: iteration 50780/ 173500 | consumed samples: 12999680 | consumed tokens: 26623344640 | elapsed time per iteration (s): 0.16 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.751054E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.963 | TFLOPs: 24.95 | +7: iteration 50790/ 173500 | consumed samples: 13002240 | consumed tokens: 26628587520 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.747956E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.782 | TFLOPs: 24.49 | +7: iteration 50800/ 173500 | consumed samples: 13004800 | consumed tokens: 26633830400 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.746590E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.875 | TFLOPs: 24.59 | +7: iteration 50810/ 173500 | consumed samples: 13007360 | consumed tokens: 26639073280 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.747685E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.872 | TFLOPs: 25.45 | +7: iteration 50820/ 173500 | consumed samples: 13009920 | consumed tokens: 26644316160 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.772332E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.524 | TFLOPs: 24.39 | +7: iteration 50830/ 173500 | consumed samples: 13012480 | consumed tokens: 26649559040 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.760392E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.919 | TFLOPs: 25.39 | +7: iteration 50840/ 173500 | consumed samples: 13015040 | consumed tokens: 26654801920 | elapsed time per iteration (s): 0.17 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.746609E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.747 | TFLOPs: 24.21 | +7: iteration 50850/ 173500 | consumed samples: 13017600 | consumed tokens: 26660044800 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.758434E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.058 | TFLOPs: 24.95 | +7: iteration 50860/ 173500 | consumed samples: 13020160 | consumed tokens: 26665287680 | elapsed time per iteration (s): 0.16 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.752157E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.866 | TFLOPs: 25.37 | +7: iteration 50870/ 173500 | consumed samples: 13022720 | consumed tokens: 26670530560 | elapsed time per iteration (s): 0.16 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.762712E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.840 | TFLOPs: 24.92 | +7: iteration 50880/ 173500 | consumed samples: 13025280 | consumed tokens: 26675773440 | elapsed time per iteration (s): 0.17 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.749861E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.110 | TFLOPs: 24.20 | +7: iteration 50890/ 173500 | consumed samples: 13027840 | consumed tokens: 26681016320 | elapsed time per iteration (s): 0.16 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.751326E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.713 | TFLOPs: 24.96 | +7: iteration 50900/ 173500 | consumed samples: 13030400 | consumed tokens: 26686259200 | elapsed time per iteration (s): 0.16 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.754739E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.330 | TFLOPs: 25.22 | +7: iteration 50910/ 173500 | consumed samples: 13032960 | consumed tokens: 26691502080 | elapsed time per iteration (s): 0.16 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.750232E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.936 | TFLOPs: 25.03 | +7: iteration 50920/ 173500 | consumed samples: 13035520 | consumed tokens: 26696744960 | elapsed time per iteration (s): 0.16 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.764179E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.616 | TFLOPs: 25.38 | +7: iteration 50930/ 173500 | consumed samples: 13038080 | consumed tokens: 26701987840 | elapsed time per iteration (s): 0.17 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.759793E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.736 | TFLOPs: 24.05 | +7: iteration 50940/ 173500 | consumed samples: 13040640 | consumed tokens: 26707230720 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.745444E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.102 | TFLOPs: 24.87 | +7: iteration 50950/ 173500 | consumed samples: 13043200 | consumed tokens: 26712473600 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.748981E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.619 | TFLOPs: 24.98 | +7: iteration 50960/ 173500 | consumed samples: 13045760 | consumed tokens: 26717716480 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.752239E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.880 | TFLOPs: 25.28 | +7: iteration 50970/ 173500 | consumed samples: 13048320 | consumed tokens: 26722959360 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.757340E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.334 | TFLOPs: 24.72 | +7: iteration 50980/ 173500 | consumed samples: 13050880 | consumed tokens: 26728202240 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.757242E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.802 | TFLOPs: 25.39 | +7: iteration 50990/ 173500 | consumed samples: 13053440 | consumed tokens: 26733445120 | elapsed time per iteration (s): 0.16 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.755161E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.723 | TFLOPs: 25.78 | +7: iteration 51000/ 173500 | consumed samples: 13056000 | consumed tokens: 26738688000 | elapsed time per iteration (s): 0.18 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.745765E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.317 | TFLOPs: 22.59 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 51000 | lm loss value: 3.901439E+00 | lm loss PPL: 4.947360E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 51000 to checkpoints_44m91b100m +0: [2023-03-17 02:27:33,684] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step51000 is begin to save! +0: [2023-03-17 02:27:33,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:27:33,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:27:33,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:27:33,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:27:33,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:27:33,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:27:33,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:27:33,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:27:33,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:27:33,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:27:33,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:27:33,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:27:33,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:27:33,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:27:33,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:27:33,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:27:33,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:27:33,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:27:33,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:27:33,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:27:33,823] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step51000/mp_rank_00_model_states.pt +0: [2023-03-17 02:27:33,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:27:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:27:33,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:27:33,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:27:33,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +3: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +4: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 02:27:33,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 02:27:33,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +5: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:27:33,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +1: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:27:33,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +2: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +6: [2023-03-17 02:27:33,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:27:33,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:27:33,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +7: [2023-03-17 02:27:33,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:27:33,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step51000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:27:33,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step51000 is ready now! +0: successfully saved checkpoint at iteration 51000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.90 +7: iteration 51010/ 173500 | consumed samples: 13058560 | consumed tokens: 26743930880 | elapsed time per iteration (s): 0.18 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.752633E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.977 | TFLOPs: 21.72 | +7: iteration 51020/ 173500 | consumed samples: 13061120 | consumed tokens: 26749173760 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.765292E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.921 | TFLOPs: 25.39 | +7: iteration 51030/ 173500 | consumed samples: 13063680 | consumed tokens: 26754416640 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.749048E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.090 | TFLOPs: 25.16 | +7: iteration 51040/ 173500 | consumed samples: 13066240 | consumed tokens: 26759659520 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.742947E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.737 | TFLOPs: 25.10 | +7: iteration 51050/ 173500 | consumed samples: 13068800 | consumed tokens: 26764902400 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.749212E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.933 | TFLOPs: 24.75 | +7: iteration 51060/ 173500 | consumed samples: 13071360 | consumed tokens: 26770145280 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.748835E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.785 | TFLOPs: 24.62 | +7: iteration 51070/ 173500 | consumed samples: 13073920 | consumed tokens: 26775388160 | elapsed time per iteration (s): 0.18 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.766222E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.319 | TFLOPs: 22.85 | +7: iteration 51080/ 173500 | consumed samples: 13076480 | consumed tokens: 26780631040 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.759053E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.439 | TFLOPs: 24.47 | +7: iteration 51090/ 173500 | consumed samples: 13079040 | consumed tokens: 26785873920 | elapsed time per iteration (s): 0.16 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.754977E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.650 | TFLOPs: 25.02 | +7: iteration 51100/ 173500 | consumed samples: 13081600 | consumed tokens: 26791116800 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.751103E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.311 | TFLOPs: 24.80 | +7: iteration 51110/ 173500 | consumed samples: 13084160 | consumed tokens: 26796359680 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.764486E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.312 | TFLOPs: 24.63 | +7: iteration 51120/ 173500 | consumed samples: 13086720 | consumed tokens: 26801602560 | elapsed time per iteration (s): 0.17 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.748018E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.860 | TFLOPs: 24.21 | +7: iteration 51130/ 173500 | consumed samples: 13089280 | consumed tokens: 26806845440 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.750023E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.077 | TFLOPs: 24.39 | +7: iteration 51140/ 173500 | consumed samples: 13091840 | consumed tokens: 26812088320 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.747517E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.571 | TFLOPs: 25.23 | +7: iteration 51150/ 173500 | consumed samples: 13094400 | consumed tokens: 26817331200 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.743377E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.103 | TFLOPs: 24.84 | +7: iteration 51160/ 173500 | consumed samples: 13096960 | consumed tokens: 26822574080 | elapsed time per iteration (s): 0.16 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.761638E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.560 | TFLOPs: 25.07 | +7: iteration 51170/ 173500 | consumed samples: 13099520 | consumed tokens: 26827816960 | elapsed time per iteration (s): 0.17 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.760473E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.926 | TFLOPs: 24.24 | +7: iteration 51180/ 173500 | consumed samples: 13102080 | consumed tokens: 26833059840 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.743599E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.292 | TFLOPs: 24.86 | +7: iteration 51190/ 173500 | consumed samples: 13104640 | consumed tokens: 26838302720 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.761964E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.884 | TFLOPs: 24.40 | +7: iteration 51200/ 173500 | consumed samples: 13107200 | consumed tokens: 26843545600 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.748300E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.542 | TFLOPs: 25.12 | +7: iteration 51210/ 173500 | consumed samples: 13109760 | consumed tokens: 26848788480 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.739611E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.242 | TFLOPs: 24.81 | +7: iteration 51220/ 173500 | consumed samples: 13112320 | consumed tokens: 26854031360 | elapsed time per iteration (s): 0.17 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.730265E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.755 | TFLOPs: 24.04 | +7: iteration 51230/ 173500 | consumed samples: 13114880 | consumed tokens: 26859274240 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.759826E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.073 | TFLOPs: 24.83 | +7: iteration 51240/ 173500 | consumed samples: 13117440 | consumed tokens: 26864517120 | elapsed time per iteration (s): 0.16 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.752829E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.969 | TFLOPs: 25.47 | +7: iteration 51250/ 173500 | consumed samples: 13120000 | consumed tokens: 26869760000 | elapsed time per iteration (s): 0.16 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.762143E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.749 | TFLOPs: 25.20 | +7: iteration 51260/ 173500 | consumed samples: 13122560 | consumed tokens: 26875002880 | elapsed time per iteration (s): 0.16 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.750864E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.774 | TFLOPs: 25.09 | +7: iteration 51270/ 173500 | consumed samples: 13125120 | consumed tokens: 26880245760 | elapsed time per iteration (s): 0.16 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.758064E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.379 | TFLOPs: 24.86 | +7: iteration 51280/ 173500 | consumed samples: 13127680 | consumed tokens: 26885488640 | elapsed time per iteration (s): 0.16 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.755750E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.347 | TFLOPs: 24.74 | +7: iteration 51290/ 173500 | consumed samples: 13130240 | consumed tokens: 26890731520 | elapsed time per iteration (s): 0.16 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.753962E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.762 | TFLOPs: 24.51 | +7: iteration 51300/ 173500 | consumed samples: 13132800 | consumed tokens: 26895974400 | elapsed time per iteration (s): 0.17 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.765143E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.198 | TFLOPs: 23.20 | +7: iteration 51310/ 173500 | consumed samples: 13135360 | consumed tokens: 26901217280 | elapsed time per iteration (s): 0.17 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.744443E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.799 | TFLOPs: 24.07 | +7: iteration 51320/ 173500 | consumed samples: 13137920 | consumed tokens: 26906460160 | elapsed time per iteration (s): 0.17 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.753176E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.893 | TFLOPs: 24.15 | +7: iteration 51330/ 173500 | consumed samples: 13140480 | consumed tokens: 26911703040 | elapsed time per iteration (s): 0.17 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.754061E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.195 | TFLOPs: 23.43 | +7: iteration 51340/ 173500 | consumed samples: 13143040 | consumed tokens: 26916945920 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.757840E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.579 | TFLOPs: 25.41 | +7: iteration 51350/ 173500 | consumed samples: 13145600 | consumed tokens: 26922188800 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.741363E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.783 | TFLOPs: 25.18 | +7: iteration 51360/ 173500 | consumed samples: 13148160 | consumed tokens: 26927431680 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.752530E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.225 | TFLOPs: 25.03 | +7: iteration 51370/ 173500 | consumed samples: 13150720 | consumed tokens: 26932674560 | elapsed time per iteration (s): 0.17 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.755480E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.618 | TFLOPs: 23.69 | +7: iteration 51380/ 173500 | consumed samples: 13153280 | consumed tokens: 26937917440 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.734730E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.431 | TFLOPs: 25.41 | +7: iteration 51390/ 173500 | consumed samples: 13155840 | consumed tokens: 26943160320 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.751487E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.718 | TFLOPs: 25.20 | +7: iteration 51400/ 173500 | consumed samples: 13158400 | consumed tokens: 26948403200 | elapsed time per iteration (s): 0.16 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.757320E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.582 | TFLOPs: 24.57 | +7: iteration 51410/ 173500 | consumed samples: 13160960 | consumed tokens: 26953646080 | elapsed time per iteration (s): 0.16 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.739663E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.442 | TFLOPs: 24.61 | +7: iteration 51420/ 173500 | consumed samples: 13163520 | consumed tokens: 26958888960 | elapsed time per iteration (s): 0.16 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.745678E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.631 | TFLOPs: 24.93 | +7: iteration 51430/ 173500 | consumed samples: 13166080 | consumed tokens: 26964131840 | elapsed time per iteration (s): 0.17 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.747518E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.192 | TFLOPs: 24.04 | +7: iteration 51440/ 173500 | consumed samples: 13168640 | consumed tokens: 26969374720 | elapsed time per iteration (s): 0.16 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.759730E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.409 | TFLOPs: 25.19 | +7: iteration 51450/ 173500 | consumed samples: 13171200 | consumed tokens: 26974617600 | elapsed time per iteration (s): 0.17 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.766335E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.881 | TFLOPs: 23.32 | +7: iteration 51460/ 173500 | consumed samples: 13173760 | consumed tokens: 26979860480 | elapsed time per iteration (s): 0.17 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.749942E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.274 | TFLOPs: 23.37 | +7: iteration 51470/ 173500 | consumed samples: 13176320 | consumed tokens: 26985103360 | elapsed time per iteration (s): 0.16 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.738223E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.329 | TFLOPs: 24.91 | +7: iteration 51480/ 173500 | consumed samples: 13178880 | consumed tokens: 26990346240 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.760027E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.963 | TFLOPs: 24.64 | +7: iteration 51490/ 173500 | consumed samples: 13181440 | consumed tokens: 26995589120 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.757637E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.031 | TFLOPs: 25.31 | +7: iteration 51500/ 173500 | consumed samples: 13184000 | consumed tokens: 27000832000 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.755051E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.659 | TFLOPs: 24.54 | +7: iteration 51510/ 173500 | consumed samples: 13186560 | consumed tokens: 27006074880 | elapsed time per iteration (s): 0.17 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.754514E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.325 | TFLOPs: 23.69 | +7: iteration 51520/ 173500 | consumed samples: 13189120 | consumed tokens: 27011317760 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.761341E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.667 | TFLOPs: 24.55 | +7: iteration 51530/ 173500 | consumed samples: 13191680 | consumed tokens: 27016560640 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.744317E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.255 | TFLOPs: 24.86 | +7: iteration 51540/ 173500 | consumed samples: 13194240 | consumed tokens: 27021803520 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.738270E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.176 | TFLOPs: 24.78 | +7: iteration 51550/ 173500 | consumed samples: 13196800 | consumed tokens: 27027046400 | elapsed time per iteration (s): 0.16 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.746028E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.530 | TFLOPs: 25.04 | +7: iteration 51560/ 173500 | consumed samples: 13199360 | consumed tokens: 27032289280 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.751545E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.123 | TFLOPs: 25.33 | +7: iteration 51570/ 173500 | consumed samples: 13201920 | consumed tokens: 27037532160 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.760458E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.045 | TFLOPs: 25.23 | +7: iteration 51580/ 173500 | consumed samples: 13204480 | consumed tokens: 27042775040 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.749902E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.896 | TFLOPs: 25.03 | +7: iteration 51590/ 173500 | consumed samples: 13207040 | consumed tokens: 27048017920 | elapsed time per iteration (s): 0.17 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.740036E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.713 | TFLOPs: 23.46 | +7: iteration 51600/ 173500 | consumed samples: 13209600 | consumed tokens: 27053260800 | elapsed time per iteration (s): 0.17 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.740409E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1470.261 | TFLOPs: 23.06 | +7: iteration 51610/ 173500 | consumed samples: 13212160 | consumed tokens: 27058503680 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.752843E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.864 | TFLOPs: 25.34 | +7: iteration 51620/ 173500 | consumed samples: 13214720 | consumed tokens: 27063746560 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.751737E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.870 | TFLOPs: 24.98 | +7: iteration 51630/ 173500 | consumed samples: 13217280 | consumed tokens: 27068989440 | elapsed time per iteration (s): 0.16 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.755823E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.404 | TFLOPs: 24.89 | +7: iteration 51640/ 173500 | consumed samples: 13219840 | consumed tokens: 27074232320 | elapsed time per iteration (s): 0.17 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.760410E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.320 | TFLOPs: 23.43 | +7: iteration 51650/ 173500 | consumed samples: 13222400 | consumed tokens: 27079475200 | elapsed time per iteration (s): 0.16 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.750398E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.498 | TFLOPs: 24.50 | +7: iteration 51660/ 173500 | consumed samples: 13224960 | consumed tokens: 27084718080 | elapsed time per iteration (s): 0.18 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.760265E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1447.455 | TFLOPs: 22.70 | +7: iteration 51670/ 173500 | consumed samples: 13227520 | consumed tokens: 27089960960 | elapsed time per iteration (s): 0.15 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.752087E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.684 | TFLOPs: 26.00 | +7: iteration 51680/ 173500 | consumed samples: 13230080 | consumed tokens: 27095203840 | elapsed time per iteration (s): 0.16 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.759482E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.494 | TFLOPs: 25.12 | +7: iteration 51690/ 173500 | consumed samples: 13232640 | consumed tokens: 27100446720 | elapsed time per iteration (s): 0.16 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.753641E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.649 | TFLOPs: 24.82 | +7: iteration 51700/ 173500 | consumed samples: 13235200 | consumed tokens: 27105689600 | elapsed time per iteration (s): 0.16 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.757167E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.207 | TFLOPs: 25.63 | +7: iteration 51710/ 173500 | consumed samples: 13237760 | consumed tokens: 27110932480 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.747348E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.457 | TFLOPs: 24.88 | +7: iteration 51720/ 173500 | consumed samples: 13240320 | consumed tokens: 27116175360 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.754646E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.653 | TFLOPs: 24.41 | +7: iteration 51730/ 173500 | consumed samples: 13242880 | consumed tokens: 27121418240 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.760942E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.785 | TFLOPs: 25.28 | +7: iteration 51740/ 173500 | consumed samples: 13245440 | consumed tokens: 27126661120 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.748642E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.099 | TFLOPs: 25.28 | +7: iteration 51750/ 173500 | consumed samples: 13248000 | consumed tokens: 27131904000 | elapsed time per iteration (s): 0.17 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.744765E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.840 | TFLOPs: 24.32 | +7: iteration 51760/ 173500 | consumed samples: 13250560 | consumed tokens: 27137146880 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.743529E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.357 | TFLOPs: 25.60 | +7: iteration 51770/ 173500 | consumed samples: 13253120 | consumed tokens: 27142389760 | elapsed time per iteration (s): 0.16 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.741930E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.400 | TFLOPs: 25.15 | +7: iteration 51780/ 173500 | consumed samples: 13255680 | consumed tokens: 27147632640 | elapsed time per iteration (s): 0.17 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.749805E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.051 | TFLOPs: 24.23 | +7: iteration 51790/ 173500 | consumed samples: 13258240 | consumed tokens: 27152875520 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.748454E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.235 | TFLOPs: 24.95 | +7: iteration 51800/ 173500 | consumed samples: 13260800 | consumed tokens: 27158118400 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.754854E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.194 | TFLOPs: 25.31 | +7: iteration 51810/ 173500 | consumed samples: 13263360 | consumed tokens: 27163361280 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.749825E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.382 | TFLOPs: 24.52 | +7: iteration 51820/ 173500 | consumed samples: 13265920 | consumed tokens: 27168604160 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.752168E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.483 | TFLOPs: 25.43 | +7: iteration 51830/ 173500 | consumed samples: 13268480 | consumed tokens: 27173847040 | elapsed time per iteration (s): 0.17 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.764107E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.098 | TFLOPs: 24.09 | +7: iteration 51840/ 173500 | consumed samples: 13271040 | consumed tokens: 27179089920 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.769861E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.130 | TFLOPs: 24.36 | +7: iteration 51850/ 173500 | consumed samples: 13273600 | consumed tokens: 27184332800 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.747295E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.889 | TFLOPs: 25.20 | +7: iteration 51860/ 173500 | consumed samples: 13276160 | consumed tokens: 27189575680 | elapsed time per iteration (s): 0.16 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.764513E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.433 | TFLOPs: 24.46 | +7: iteration 51870/ 173500 | consumed samples: 13278720 | consumed tokens: 27194818560 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.756629E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.162 | TFLOPs: 25.58 | +7: iteration 51880/ 173500 | consumed samples: 13281280 | consumed tokens: 27200061440 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.750616E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.071 | TFLOPs: 25.20 | +7: iteration 51890/ 173500 | consumed samples: 13283840 | consumed tokens: 27205304320 | elapsed time per iteration (s): 0.15 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.749886E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.285 | TFLOPs: 25.93 | +7: iteration 51900/ 173500 | consumed samples: 13286400 | consumed tokens: 27210547200 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.747931E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.578 | TFLOPs: 25.05 | +7: iteration 51910/ 173500 | consumed samples: 13288960 | consumed tokens: 27215790080 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.756346E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.552 | TFLOPs: 25.84 | +7: iteration 51920/ 173500 | consumed samples: 13291520 | consumed tokens: 27221032960 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.755326E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.714 | TFLOPs: 25.70 | +7: iteration 51930/ 173500 | consumed samples: 13294080 | consumed tokens: 27226275840 | elapsed time per iteration (s): 0.16 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.751418E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.750 | TFLOPs: 24.59 | +7: iteration 51940/ 173500 | consumed samples: 13296640 | consumed tokens: 27231518720 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.757512E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.426 | TFLOPs: 25.33 | +7: iteration 51950/ 173500 | consumed samples: 13299200 | consumed tokens: 27236761600 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.751838E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.479 | TFLOPs: 24.58 | +7: iteration 51960/ 173500 | consumed samples: 13301760 | consumed tokens: 27242004480 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.757434E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.190 | TFLOPs: 25.02 | +7: iteration 51970/ 173500 | consumed samples: 13304320 | consumed tokens: 27247247360 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.746781E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.841 | TFLOPs: 25.00 | +7: iteration 51980/ 173500 | consumed samples: 13306880 | consumed tokens: 27252490240 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.764211E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.369 | TFLOPs: 24.45 | +7: iteration 51990/ 173500 | consumed samples: 13309440 | consumed tokens: 27257733120 | elapsed time per iteration (s): 0.16 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.739883E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.240 | TFLOPs: 24.55 | +0: [2023-03-17 02:30:16,337] [INFO] [logging.py:68:log_dist] [Rank 0] step=52000, skipped=0, lr=[0.00016457056203724818, 0.00016457056203724818, 0.00016457056203724818], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 52000/ 173500 | consumed samples: 13312000 | consumed tokens: 27262976000 | elapsed time per iteration (s): 0.17 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.744650E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.836 | TFLOPs: 24.15 | +0: steps: 52000 loss: 3.7448 iter time (s): 0.163 samples/sec: 1574.924 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 52000 | lm loss value: 3.890474E+00 | lm loss PPL: 4.893408E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 52000 to checkpoints_44m91b100m +0: [2023-03-17 02:30:16,425] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step52000 is begin to save! +0: [2023-03-17 02:30:16,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:30:16,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:30:16,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:30:16,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:30:16,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:30:16,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:30:16,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:30:16,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:30:16,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:30:16,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:30:16,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:30:16,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:30:16,542] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:30:16,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:30:16,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:30:16,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:30:16,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:30:16,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:30:16,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:30:16,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:30:16,569] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step52000/mp_rank_00_model_states.pt +0: [2023-03-17 02:30:16,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:30:16,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:30:16,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:30:16,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +4: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +3: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:30:16,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +1: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +2: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +6: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +5: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +7: [2023-03-17 02:30:16,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step52000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:30:16,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step52000 is ready now! +0: successfully saved checkpoint at iteration 52000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.44 +7: iteration 52010/ 173500 | consumed samples: 13314560 | consumed tokens: 27268218880 | elapsed time per iteration (s): 0.19 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.757148E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1332.875 | TFLOPs: 20.90 | +7: iteration 52020/ 173500 | consumed samples: 13317120 | consumed tokens: 27273461760 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.758521E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.959 | TFLOPs: 25.55 | +7: iteration 52030/ 173500 | consumed samples: 13319680 | consumed tokens: 27278704640 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.739856E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.439 | TFLOPs: 25.59 | +7: iteration 52040/ 173500 | consumed samples: 13322240 | consumed tokens: 27283947520 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.744756E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.828 | TFLOPs: 24.63 | +7: iteration 52050/ 173500 | consumed samples: 13324800 | consumed tokens: 27289190400 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.754867E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.142 | TFLOPs: 25.31 | +7: iteration 52060/ 173500 | consumed samples: 13327360 | consumed tokens: 27294433280 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.737267E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.651 | TFLOPs: 25.01 | +7: iteration 52070/ 173500 | consumed samples: 13329920 | consumed tokens: 27299676160 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.765575E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.541 | TFLOPs: 25.13 | +7: iteration 52080/ 173500 | consumed samples: 13332480 | consumed tokens: 27304919040 | elapsed time per iteration (s): 0.17 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.760772E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.432 | TFLOPs: 24.25 | +7: iteration 52090/ 173500 | consumed samples: 13335040 | consumed tokens: 27310161920 | elapsed time per iteration (s): 0.16 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.765355E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.320 | TFLOPs: 25.08 | +7: iteration 52100/ 173500 | consumed samples: 13337600 | consumed tokens: 27315404800 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.738199E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.064 | TFLOPs: 24.42 | +7: iteration 52110/ 173500 | consumed samples: 13340160 | consumed tokens: 27320647680 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.733603E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.556 | TFLOPs: 25.10 | +7: iteration 52120/ 173500 | consumed samples: 13342720 | consumed tokens: 27325890560 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.743430E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.290 | TFLOPs: 25.58 | +7: iteration 52130/ 173500 | consumed samples: 13345280 | consumed tokens: 27331133440 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.758569E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.505 | TFLOPs: 24.65 | +7: iteration 52140/ 173500 | consumed samples: 13347840 | consumed tokens: 27336376320 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.744644E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.654 | TFLOPs: 25.01 | +7: iteration 52150/ 173500 | consumed samples: 13350400 | consumed tokens: 27341619200 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.753670E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.835 | TFLOPs: 24.59 | +7: iteration 52160/ 173500 | consumed samples: 13352960 | consumed tokens: 27346862080 | elapsed time per iteration (s): 0.16 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.745847E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.297 | TFLOPs: 25.21 | +7: iteration 52170/ 173500 | consumed samples: 13355520 | consumed tokens: 27352104960 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.742283E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.883 | TFLOPs: 25.44 | +7: iteration 52180/ 173500 | consumed samples: 13358080 | consumed tokens: 27357347840 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.727499E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.824 | TFLOPs: 24.40 | +7: iteration 52190/ 173500 | consumed samples: 13360640 | consumed tokens: 27362590720 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.760403E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.864 | TFLOPs: 25.39 | +7: iteration 52200/ 173500 | consumed samples: 13363200 | consumed tokens: 27367833600 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.757893E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.857 | TFLOPs: 25.87 | +7: iteration 52210/ 173500 | consumed samples: 13365760 | consumed tokens: 27373076480 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.751053E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.385 | TFLOPs: 25.84 | +7: iteration 52220/ 173500 | consumed samples: 13368320 | consumed tokens: 27378319360 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.759789E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.702 | TFLOPs: 25.73 | +7: iteration 52230/ 173500 | consumed samples: 13370880 | consumed tokens: 27383562240 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.739847E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.724 | TFLOPs: 24.70 | +7: iteration 52240/ 173500 | consumed samples: 13373440 | consumed tokens: 27388805120 | elapsed time per iteration (s): 0.16 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.738808E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.312 | TFLOPs: 25.66 | +7: iteration 52250/ 173500 | consumed samples: 13376000 | consumed tokens: 27394048000 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.743605E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.165 | TFLOPs: 24.40 | +7: iteration 52260/ 173500 | consumed samples: 13378560 | consumed tokens: 27399290880 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.759394E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.512 | TFLOPs: 25.71 | +7: iteration 52270/ 173500 | consumed samples: 13381120 | consumed tokens: 27404533760 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.744657E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.326 | TFLOPs: 25.33 | +7: iteration 52280/ 173500 | consumed samples: 13383680 | consumed tokens: 27409776640 | elapsed time per iteration (s): 0.15 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.747319E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.690 | TFLOPs: 26.20 | +7: iteration 52290/ 173500 | consumed samples: 13386240 | consumed tokens: 27415019520 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.763433E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.658 | TFLOPs: 25.86 | +7: iteration 52300/ 173500 | consumed samples: 13388800 | consumed tokens: 27420262400 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.753709E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.602 | TFLOPs: 25.82 | +7: iteration 52310/ 173500 | consumed samples: 13391360 | consumed tokens: 27425505280 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.748628E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.486 | TFLOPs: 25.90 | +7: iteration 52320/ 173500 | consumed samples: 13393920 | consumed tokens: 27430748160 | elapsed time per iteration (s): 0.16 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.736665E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.256 | TFLOPs: 25.75 | +7: iteration 52330/ 173500 | consumed samples: 13396480 | consumed tokens: 27435991040 | elapsed time per iteration (s): 0.15 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.759655E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.381 | TFLOPs: 26.32 | +7: iteration 52340/ 173500 | consumed samples: 13399040 | consumed tokens: 27441233920 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.759129E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.219 | TFLOPs: 25.05 | +7: iteration 52350/ 173500 | consumed samples: 13401600 | consumed tokens: 27446476800 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.754527E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.793 | TFLOPs: 25.40 | +7: iteration 52360/ 173500 | consumed samples: 13404160 | consumed tokens: 27451719680 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.746539E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.408 | TFLOPs: 25.37 | +7: iteration 52370/ 173500 | consumed samples: 13406720 | consumed tokens: 27456962560 | elapsed time per iteration (s): 0.15 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.739597E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.902 | TFLOPs: 26.24 | +7: iteration 52380/ 173500 | consumed samples: 13409280 | consumed tokens: 27462205440 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.759272E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.056 | TFLOPs: 25.38 | +7: iteration 52390/ 173500 | consumed samples: 13411840 | consumed tokens: 27467448320 | elapsed time per iteration (s): 0.16 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.746918E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.746 | TFLOPs: 25.24 | +7: iteration 52400/ 173500 | consumed samples: 13414400 | consumed tokens: 27472691200 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.747033E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.058 | TFLOPs: 26.08 | +7: iteration 52410/ 173500 | consumed samples: 13416960 | consumed tokens: 27477934080 | elapsed time per iteration (s): 0.16 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.742009E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.296 | TFLOPs: 25.33 | +7: iteration 52420/ 173500 | consumed samples: 13419520 | consumed tokens: 27483176960 | elapsed time per iteration (s): 0.16 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.757454E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.902 | TFLOPs: 25.81 | +7: iteration 52430/ 173500 | consumed samples: 13422080 | consumed tokens: 27488419840 | elapsed time per iteration (s): 0.16 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.759773E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.219 | TFLOPs: 25.64 | +7: iteration 52440/ 173500 | consumed samples: 13424640 | consumed tokens: 27493662720 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.749509E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.228 | TFLOPs: 26.29 | +7: iteration 52450/ 173500 | consumed samples: 13427200 | consumed tokens: 27498905600 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.750412E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.495 | TFLOPs: 26.20 | +7: iteration 52460/ 173500 | consumed samples: 13429760 | consumed tokens: 27504148480 | elapsed time per iteration (s): 0.16 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.755303E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.690 | TFLOPs: 25.49 | +7: iteration 52470/ 173500 | consumed samples: 13432320 | consumed tokens: 27509391360 | elapsed time per iteration (s): 0.15 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.753798E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.952 | TFLOPs: 25.91 | +7: iteration 52480/ 173500 | consumed samples: 13434880 | consumed tokens: 27514634240 | elapsed time per iteration (s): 0.15 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.760062E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.286 | TFLOPs: 25.94 | +7: iteration 52490/ 173500 | consumed samples: 13437440 | consumed tokens: 27519877120 | elapsed time per iteration (s): 0.16 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.758102E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.619 | TFLOPs: 24.87 | +7: iteration 52500/ 173500 | consumed samples: 13440000 | consumed tokens: 27525120000 | elapsed time per iteration (s): 0.16 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.745300E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.313 | TFLOPs: 25.57 | +7: iteration 52510/ 173500 | consumed samples: 13442560 | consumed tokens: 27530362880 | elapsed time per iteration (s): 0.15 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.756974E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.972 | TFLOPs: 25.92 | +7: iteration 52520/ 173500 | consumed samples: 13445120 | consumed tokens: 27535605760 | elapsed time per iteration (s): 0.16 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.760545E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.937 | TFLOPs: 25.31 | +7: iteration 52530/ 173500 | consumed samples: 13447680 | consumed tokens: 27540848640 | elapsed time per iteration (s): 0.16 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.759285E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.784 | TFLOPs: 25.06 | +7: iteration 52540/ 173500 | consumed samples: 13450240 | consumed tokens: 27546091520 | elapsed time per iteration (s): 0.16 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.742608E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.756 | TFLOPs: 25.79 | +7: iteration 52550/ 173500 | consumed samples: 13452800 | consumed tokens: 27551334400 | elapsed time per iteration (s): 0.17 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.749946E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.388 | TFLOPs: 23.81 | +7: iteration 52560/ 173500 | consumed samples: 13455360 | consumed tokens: 27556577280 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.753645E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.098 | TFLOPs: 26.33 | +7: iteration 52570/ 173500 | consumed samples: 13457920 | consumed tokens: 27561820160 | elapsed time per iteration (s): 0.16 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.749435E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.151 | TFLOPs: 25.71 | +7: iteration 52580/ 173500 | consumed samples: 13460480 | consumed tokens: 27567063040 | elapsed time per iteration (s): 0.16 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.757584E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.796 | TFLOPs: 25.39 | +7: iteration 52590/ 173500 | consumed samples: 13463040 | consumed tokens: 27572305920 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.742178E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.363 | TFLOPs: 26.12 | +7: iteration 52600/ 173500 | consumed samples: 13465600 | consumed tokens: 27577548800 | elapsed time per iteration (s): 0.16 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.738376E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.864 | TFLOPs: 25.58 | +7: iteration 52610/ 173500 | consumed samples: 13468160 | consumed tokens: 27582791680 | elapsed time per iteration (s): 0.15 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.756735E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.806 | TFLOPs: 26.03 | +7: iteration 52620/ 173500 | consumed samples: 13470720 | consumed tokens: 27588034560 | elapsed time per iteration (s): 0.16 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.751096E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.073 | TFLOPs: 25.70 | +7: iteration 52630/ 173500 | consumed samples: 13473280 | consumed tokens: 27593277440 | elapsed time per iteration (s): 0.16 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.739182E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.262 | TFLOPs: 25.32 | +7: iteration 52640/ 173500 | consumed samples: 13475840 | consumed tokens: 27598520320 | elapsed time per iteration (s): 0.16 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.747434E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.012 | TFLOPs: 25.33 | +7: iteration 52650/ 173500 | consumed samples: 13478400 | consumed tokens: 27603763200 | elapsed time per iteration (s): 0.16 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.758853E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.635 | TFLOPs: 25.67 | +7: iteration 52660/ 173500 | consumed samples: 13480960 | consumed tokens: 27609006080 | elapsed time per iteration (s): 0.15 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.750237E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.592 | TFLOPs: 26.15 | +7: iteration 52670/ 173500 | consumed samples: 13483520 | consumed tokens: 27614248960 | elapsed time per iteration (s): 0.16 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.749170E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.531 | TFLOPs: 25.84 | +7: iteration 52680/ 173500 | consumed samples: 13486080 | consumed tokens: 27619491840 | elapsed time per iteration (s): 0.16 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.747623E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.352 | TFLOPs: 25.51 | +7: iteration 52690/ 173500 | consumed samples: 13488640 | consumed tokens: 27624734720 | elapsed time per iteration (s): 0.15 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.744262E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.005 | TFLOPs: 26.22 | +7: iteration 52700/ 173500 | consumed samples: 13491200 | consumed tokens: 27629977600 | elapsed time per iteration (s): 0.17 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.742112E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.379 | TFLOPs: 23.70 | +7: iteration 52710/ 173500 | consumed samples: 13493760 | consumed tokens: 27635220480 | elapsed time per iteration (s): 0.16 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.757089E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.763 | TFLOPs: 25.83 | +7: iteration 52720/ 173500 | consumed samples: 13496320 | consumed tokens: 27640463360 | elapsed time per iteration (s): 0.15 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.738675E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.434 | TFLOPs: 26.21 | +7: iteration 52730/ 173500 | consumed samples: 13498880 | consumed tokens: 27645706240 | elapsed time per iteration (s): 0.17 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.749482E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.769 | TFLOPs: 23.82 | +7: iteration 52740/ 173500 | consumed samples: 13501440 | consumed tokens: 27650949120 | elapsed time per iteration (s): 0.16 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.746795E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.848 | TFLOPs: 24.82 | +7: iteration 52750/ 173500 | consumed samples: 13504000 | consumed tokens: 27656192000 | elapsed time per iteration (s): 0.15 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.744981E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.641 | TFLOPs: 26.23 | +7: iteration 52760/ 173500 | consumed samples: 13506560 | consumed tokens: 27661434880 | elapsed time per iteration (s): 0.16 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.737665E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.724 | TFLOPs: 25.54 | +7: iteration 52770/ 173500 | consumed samples: 13509120 | consumed tokens: 27666677760 | elapsed time per iteration (s): 0.16 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.753275E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.375 | TFLOPs: 25.77 | +7: iteration 52780/ 173500 | consumed samples: 13511680 | consumed tokens: 27671920640 | elapsed time per iteration (s): 0.15 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.755764E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.717 | TFLOPs: 26.09 | +7: iteration 52790/ 173500 | consumed samples: 13514240 | consumed tokens: 27677163520 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.746871E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.235 | TFLOPs: 25.58 | +7: iteration 52800/ 173500 | consumed samples: 13516800 | consumed tokens: 27682406400 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.753663E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.001 | TFLOPs: 25.61 | +7: iteration 52810/ 173500 | consumed samples: 13519360 | consumed tokens: 27687649280 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.738173E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.270 | TFLOPs: 25.43 | +7: iteration 52820/ 173500 | consumed samples: 13521920 | consumed tokens: 27692892160 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.760539E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.038 | TFLOPs: 25.78 | +7: iteration 52830/ 173500 | consumed samples: 13524480 | consumed tokens: 27698135040 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.749113E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.967 | TFLOPs: 25.22 | +7: iteration 52840/ 173500 | consumed samples: 13527040 | consumed tokens: 27703377920 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.746293E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.266 | TFLOPs: 25.63 | +7: iteration 52850/ 173500 | consumed samples: 13529600 | consumed tokens: 27708620800 | elapsed time per iteration (s): 0.16 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.747346E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.118 | TFLOPs: 25.56 | +7: iteration 52860/ 173500 | consumed samples: 13532160 | consumed tokens: 27713863680 | elapsed time per iteration (s): 0.16 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.753941E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.042 | TFLOPs: 25.33 | +7: iteration 52870/ 173500 | consumed samples: 13534720 | consumed tokens: 27719106560 | elapsed time per iteration (s): 0.16 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.748210E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.792 | TFLOPs: 25.23 | +7: iteration 52880/ 173500 | consumed samples: 13537280 | consumed tokens: 27724349440 | elapsed time per iteration (s): 0.16 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.749686E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.831 | TFLOPs: 25.62 | +7: iteration 52890/ 173500 | consumed samples: 13539840 | consumed tokens: 27729592320 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.757086E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.615 | TFLOPs: 26.12 | +7: iteration 52900/ 173500 | consumed samples: 13542400 | consumed tokens: 27734835200 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.755035E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.654 | TFLOPs: 25.95 | +7: iteration 52910/ 173500 | consumed samples: 13544960 | consumed tokens: 27740078080 | elapsed time per iteration (s): 0.15 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.748970E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.062 | TFLOPs: 25.92 | +7: iteration 52920/ 173500 | consumed samples: 13547520 | consumed tokens: 27745320960 | elapsed time per iteration (s): 0.16 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.752317E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.667 | TFLOPs: 25.17 | +7: iteration 52930/ 173500 | consumed samples: 13550080 | consumed tokens: 27750563840 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.755012E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.550 | TFLOPs: 25.90 | +7: iteration 52940/ 173500 | consumed samples: 13552640 | consumed tokens: 27755806720 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.744404E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.862 | TFLOPs: 25.01 | +7: iteration 52950/ 173500 | consumed samples: 13555200 | consumed tokens: 27761049600 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.747232E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.886 | TFLOPs: 25.39 | +7: iteration 52960/ 173500 | consumed samples: 13557760 | consumed tokens: 27766292480 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.734405E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.746 | TFLOPs: 25.45 | +7: iteration 52970/ 173500 | consumed samples: 13560320 | consumed tokens: 27771535360 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.746557E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.395 | TFLOPs: 25.76 | +7: iteration 52980/ 173500 | consumed samples: 13562880 | consumed tokens: 27776778240 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.761267E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.029 | TFLOPs: 24.97 | +7: iteration 52990/ 173500 | consumed samples: 13565440 | consumed tokens: 27782021120 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.754918E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.632 | TFLOPs: 25.79 | +7: iteration 53000/ 173500 | consumed samples: 13568000 | consumed tokens: 27787264000 | elapsed time per iteration (s): 0.16 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.743439E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.186 | TFLOPs: 25.28 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 53000 | lm loss value: 3.901969E+00 | lm loss PPL: 4.949984E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 53000 to checkpoints_44m91b100m +0: [2023-03-17 02:32:54,441] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step53000 is begin to save! +0: [2023-03-17 02:32:54,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:32:54,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:32:54,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:32:54,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:32:54,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:32:54,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:32:54,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:32:54,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:32:54,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:32:54,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:32:54,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:32:54,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:32:54,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:32:54,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:32:54,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:32:54,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:32:54,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:32:54,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:32:54,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:32:54,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:32:54,576] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step53000/mp_rank_00_model_states.pt +0: [2023-03-17 02:32:54,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:32:54,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:32:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:32:54,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:32:54,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:32:54,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +7: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +3: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:32:54,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +5: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +2: [2023-03-17 02:32:54,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +4: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:32:54,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:32:54,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:32:54,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +1: [2023-03-17 02:32:54,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:32:54,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:32:54,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +6: [2023-03-17 02:32:54,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:32:54,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step53000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:32:54,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step53000 is ready now! +0: successfully saved checkpoint at iteration 53000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.16 +7: iteration 53010/ 173500 | consumed samples: 13570560 | consumed tokens: 27792506880 | elapsed time per iteration (s): 0.18 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.754453E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.195 | TFLOPs: 22.44 | +7: iteration 53020/ 173500 | consumed samples: 13573120 | consumed tokens: 27797749760 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.752853E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.121 | TFLOPs: 24.84 | +7: iteration 53030/ 173500 | consumed samples: 13575680 | consumed tokens: 27802992640 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.748288E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.160 | TFLOPs: 25.47 | +7: iteration 53040/ 173500 | consumed samples: 13578240 | consumed tokens: 27808235520 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.754741E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.987 | TFLOPs: 25.69 | +7: iteration 53050/ 173500 | consumed samples: 13580800 | consumed tokens: 27813478400 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.749875E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.396 | TFLOPs: 24.75 | +7: iteration 53060/ 173500 | consumed samples: 13583360 | consumed tokens: 27818721280 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.733952E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.342 | TFLOPs: 24.49 | +7: iteration 53070/ 173500 | consumed samples: 13585920 | consumed tokens: 27823964160 | elapsed time per iteration (s): 0.16 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.741420E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.298 | TFLOPs: 24.85 | +7: iteration 53080/ 173500 | consumed samples: 13588480 | consumed tokens: 27829207040 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.750471E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.439 | TFLOPs: 25.44 | +7: iteration 53090/ 173500 | consumed samples: 13591040 | consumed tokens: 27834449920 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.748962E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.975 | TFLOPs: 25.48 | +7: iteration 53100/ 173500 | consumed samples: 13593600 | consumed tokens: 27839692800 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.748168E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.965 | TFLOPs: 25.41 | +7: iteration 53110/ 173500 | consumed samples: 13596160 | consumed tokens: 27844935680 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.744207E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.805 | TFLOPs: 25.14 | +7: iteration 53120/ 173500 | consumed samples: 13598720 | consumed tokens: 27850178560 | elapsed time per iteration (s): 0.15 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.751771E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.795 | TFLOPs: 25.92 | +7: iteration 53130/ 173500 | consumed samples: 13601280 | consumed tokens: 27855421440 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.762559E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.214 | TFLOPs: 25.58 | +7: iteration 53140/ 173500 | consumed samples: 13603840 | consumed tokens: 27860664320 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.763639E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.150 | TFLOPs: 25.42 | +7: iteration 53150/ 173500 | consumed samples: 13606400 | consumed tokens: 27865907200 | elapsed time per iteration (s): 0.16 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.750669E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.853 | TFLOPs: 25.61 | +7: iteration 53160/ 173500 | consumed samples: 13608960 | consumed tokens: 27871150080 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.751399E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.444 | TFLOPs: 25.71 | +7: iteration 53170/ 173500 | consumed samples: 13611520 | consumed tokens: 27876392960 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.745920E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.403 | TFLOPs: 24.93 | +7: iteration 53180/ 173500 | consumed samples: 13614080 | consumed tokens: 27881635840 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.757140E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.095 | TFLOPs: 25.77 | +7: iteration 53190/ 173500 | consumed samples: 13616640 | consumed tokens: 27886878720 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.739510E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.542 | TFLOPs: 25.76 | +7: iteration 53200/ 173500 | consumed samples: 13619200 | consumed tokens: 27892121600 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.751490E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.385 | TFLOPs: 25.47 | +7: iteration 53210/ 173500 | consumed samples: 13621760 | consumed tokens: 27897364480 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.753027E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.123 | TFLOPs: 25.52 | +7: iteration 53220/ 173500 | consumed samples: 13624320 | consumed tokens: 27902607360 | elapsed time per iteration (s): 0.16 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.751571E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.807 | TFLOPs: 25.23 | +7: iteration 53230/ 173500 | consumed samples: 13626880 | consumed tokens: 27907850240 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.755497E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.532 | TFLOPs: 25.21 | +7: iteration 53240/ 173500 | consumed samples: 13629440 | consumed tokens: 27913093120 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.748997E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.441 | TFLOPs: 25.10 | +7: iteration 53250/ 173500 | consumed samples: 13632000 | consumed tokens: 27918336000 | elapsed time per iteration (s): 0.15 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.744930E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.168 | TFLOPs: 25.93 | +7: iteration 53260/ 173500 | consumed samples: 13634560 | consumed tokens: 27923578880 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.747938E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.264 | TFLOPs: 25.16 | +7: iteration 53270/ 173500 | consumed samples: 13637120 | consumed tokens: 27928821760 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.756087E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.967 | TFLOPs: 25.64 | +7: iteration 53280/ 173500 | consumed samples: 13639680 | consumed tokens: 27934064640 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.744352E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.310 | TFLOPs: 25.72 | +7: iteration 53290/ 173500 | consumed samples: 13642240 | consumed tokens: 27939307520 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.760183E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.543 | TFLOPs: 25.51 | +7: iteration 53300/ 173500 | consumed samples: 13644800 | consumed tokens: 27944550400 | elapsed time per iteration (s): 0.16 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.739893E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.925 | TFLOPs: 25.48 | +7: iteration 53310/ 173500 | consumed samples: 13647360 | consumed tokens: 27949793280 | elapsed time per iteration (s): 0.16 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.751767E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.605 | TFLOPs: 25.78 | +7: iteration 53320/ 173500 | consumed samples: 13649920 | consumed tokens: 27955036160 | elapsed time per iteration (s): 0.16 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.749877E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.059 | TFLOPs: 25.74 | +7: iteration 53330/ 173500 | consumed samples: 13652480 | consumed tokens: 27960279040 | elapsed time per iteration (s): 0.17 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.744093E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.708 | TFLOPs: 23.91 | +7: iteration 53340/ 173500 | consumed samples: 13655040 | consumed tokens: 27965521920 | elapsed time per iteration (s): 0.16 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.753104E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.962 | TFLOPs: 25.77 | +7: iteration 53350/ 173500 | consumed samples: 13657600 | consumed tokens: 27970764800 | elapsed time per iteration (s): 0.15 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.737964E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.888 | TFLOPs: 25.91 | +7: iteration 53360/ 173500 | consumed samples: 13660160 | consumed tokens: 27976007680 | elapsed time per iteration (s): 0.16 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.731710E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.337 | TFLOPs: 25.05 | +7: iteration 53370/ 173500 | consumed samples: 13662720 | consumed tokens: 27981250560 | elapsed time per iteration (s): 0.16 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.740009E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.396 | TFLOPs: 25.49 | +7: iteration 53380/ 173500 | consumed samples: 13665280 | consumed tokens: 27986493440 | elapsed time per iteration (s): 0.16 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.747024E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.444 | TFLOPs: 25.54 | +7: iteration 53390/ 173500 | consumed samples: 13667840 | consumed tokens: 27991736320 | elapsed time per iteration (s): 0.16 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.749290E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.480 | TFLOPs: 25.79 | +7: iteration 53400/ 173500 | consumed samples: 13670400 | consumed tokens: 27996979200 | elapsed time per iteration (s): 0.16 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.752010E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.258 | TFLOPs: 25.90 | +7: iteration 53410/ 173500 | consumed samples: 13672960 | consumed tokens: 28002222080 | elapsed time per iteration (s): 0.15 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.751060E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.314 | TFLOPs: 25.94 | +7: iteration 53420/ 173500 | consumed samples: 13675520 | consumed tokens: 28007464960 | elapsed time per iteration (s): 0.15 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.752921E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.895 | TFLOPs: 26.24 | +7: iteration 53430/ 173500 | consumed samples: 13678080 | consumed tokens: 28012707840 | elapsed time per iteration (s): 0.16 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.744858E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.685 | TFLOPs: 25.51 | +7: iteration 53440/ 173500 | consumed samples: 13680640 | consumed tokens: 28017950720 | elapsed time per iteration (s): 0.16 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.751505E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.876 | TFLOPs: 25.84 | +7: iteration 53450/ 173500 | consumed samples: 13683200 | consumed tokens: 28023193600 | elapsed time per iteration (s): 0.15 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.757834E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.904 | TFLOPs: 26.13 | +7: iteration 53460/ 173500 | consumed samples: 13685760 | consumed tokens: 28028436480 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.744799E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.294 | TFLOPs: 26.24 | +7: iteration 53470/ 173500 | consumed samples: 13688320 | consumed tokens: 28033679360 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.744753E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.940 | TFLOPs: 26.20 | +7: iteration 53480/ 173500 | consumed samples: 13690880 | consumed tokens: 28038922240 | elapsed time per iteration (s): 0.16 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.753696E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.866 | TFLOPs: 25.84 | +7: iteration 53490/ 173500 | consumed samples: 13693440 | consumed tokens: 28044165120 | elapsed time per iteration (s): 0.16 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.736988E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.337 | TFLOPs: 24.82 | +7: iteration 53500/ 173500 | consumed samples: 13696000 | consumed tokens: 28049408000 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.743237E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.664 | TFLOPs: 26.14 | +7: iteration 53510/ 173500 | consumed samples: 13698560 | consumed tokens: 28054650880 | elapsed time per iteration (s): 0.15 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.747342E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.931 | TFLOPs: 26.03 | +7: iteration 53520/ 173500 | consumed samples: 13701120 | consumed tokens: 28059893760 | elapsed time per iteration (s): 0.16 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.753445E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.457 | TFLOPs: 25.79 | +7: iteration 53530/ 173500 | consumed samples: 13703680 | consumed tokens: 28065136640 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.763100E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.559 | TFLOPs: 25.65 | +7: iteration 53540/ 173500 | consumed samples: 13706240 | consumed tokens: 28070379520 | elapsed time per iteration (s): 0.15 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.742998E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.991 | TFLOPs: 26.11 | +7: iteration 53550/ 173500 | consumed samples: 13708800 | consumed tokens: 28075622400 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.758061E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.828 | TFLOPs: 24.74 | +7: iteration 53560/ 173500 | consumed samples: 13711360 | consumed tokens: 28080865280 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.740669E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.604 | TFLOPs: 25.04 | +7: iteration 53570/ 173500 | consumed samples: 13713920 | consumed tokens: 28086108160 | elapsed time per iteration (s): 0.15 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.746121E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.211 | TFLOPs: 26.10 | +7: iteration 53580/ 173500 | consumed samples: 13716480 | consumed tokens: 28091351040 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.738012E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.454 | TFLOPs: 25.74 | +7: iteration 53590/ 173500 | consumed samples: 13719040 | consumed tokens: 28096593920 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.736371E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.442 | TFLOPs: 25.69 | +7: iteration 53600/ 173500 | consumed samples: 13721600 | consumed tokens: 28101836800 | elapsed time per iteration (s): 0.16 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.752612E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.950 | TFLOPs: 25.72 | +7: iteration 53610/ 173500 | consumed samples: 13724160 | consumed tokens: 28107079680 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.750362E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.264 | TFLOPs: 25.49 | +7: iteration 53620/ 173500 | consumed samples: 13726720 | consumed tokens: 28112322560 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.738120E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.542 | TFLOPs: 25.79 | +7: iteration 53630/ 173500 | consumed samples: 13729280 | consumed tokens: 28117565440 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.744143E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.619 | TFLOPs: 25.81 | +7: iteration 53640/ 173500 | consumed samples: 13731840 | consumed tokens: 28122808320 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.760165E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.899 | TFLOPs: 25.26 | +7: iteration 53650/ 173500 | consumed samples: 13734400 | consumed tokens: 28128051200 | elapsed time per iteration (s): 0.15 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.741624E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.320 | TFLOPs: 26.13 | +7: iteration 53660/ 173500 | consumed samples: 13736960 | consumed tokens: 28133294080 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.734311E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.498 | TFLOPs: 25.55 | +7: iteration 53670/ 173500 | consumed samples: 13739520 | consumed tokens: 28138536960 | elapsed time per iteration (s): 0.16 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.735405E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.678 | TFLOPs: 25.82 | +7: iteration 53680/ 173500 | consumed samples: 13742080 | consumed tokens: 28143779840 | elapsed time per iteration (s): 0.16 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.749168E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.427 | TFLOPs: 25.80 | +7: iteration 53690/ 173500 | consumed samples: 13744640 | consumed tokens: 28149022720 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.738486E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.318 | TFLOPs: 26.15 | +7: iteration 53700/ 173500 | consumed samples: 13747200 | consumed tokens: 28154265600 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.752711E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.619 | TFLOPs: 26.12 | +7: iteration 53710/ 173500 | consumed samples: 13749760 | consumed tokens: 28159508480 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.747615E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.742 | TFLOPs: 26.12 | +7: iteration 53720/ 173500 | consumed samples: 13752320 | consumed tokens: 28164751360 | elapsed time per iteration (s): 0.16 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.741797E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.246 | TFLOPs: 25.21 | +7: iteration 53730/ 173500 | consumed samples: 13754880 | consumed tokens: 28169994240 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.742027E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.663 | TFLOPs: 26.09 | +7: iteration 53740/ 173500 | consumed samples: 13757440 | consumed tokens: 28175237120 | elapsed time per iteration (s): 0.16 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.742327E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.934 | TFLOPs: 25.28 | +7: iteration 53750/ 173500 | consumed samples: 13760000 | consumed tokens: 28180480000 | elapsed time per iteration (s): 0.15 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.754066E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.275 | TFLOPs: 25.97 | +7: iteration 53760/ 173500 | consumed samples: 13762560 | consumed tokens: 28185722880 | elapsed time per iteration (s): 0.17 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.742786E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1469.224 | TFLOPs: 23.04 | +7: iteration 53770/ 173500 | consumed samples: 13765120 | consumed tokens: 28190965760 | elapsed time per iteration (s): 0.16 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.742393E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.953 | TFLOPs: 25.78 | +7: iteration 53780/ 173500 | consumed samples: 13767680 | consumed tokens: 28196208640 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.758667E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.725 | TFLOPs: 26.15 | +7: iteration 53790/ 173500 | consumed samples: 13770240 | consumed tokens: 28201451520 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.747472E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.131 | TFLOPs: 26.14 | +7: iteration 53800/ 173500 | consumed samples: 13772800 | consumed tokens: 28206694400 | elapsed time per iteration (s): 0.16 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.756843E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.512 | TFLOPs: 25.87 | +7: iteration 53810/ 173500 | consumed samples: 13775360 | consumed tokens: 28211937280 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.739921E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.614 | TFLOPs: 26.09 | +7: iteration 53820/ 173500 | consumed samples: 13777920 | consumed tokens: 28217180160 | elapsed time per iteration (s): 0.15 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.754019E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.091 | TFLOPs: 26.22 | +7: iteration 53830/ 173500 | consumed samples: 13780480 | consumed tokens: 28222423040 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.731487E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.990 | TFLOPs: 26.27 | +7: iteration 53840/ 173500 | consumed samples: 13783040 | consumed tokens: 28227665920 | elapsed time per iteration (s): 0.16 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.752977E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.835 | TFLOPs: 25.59 | +7: iteration 53850/ 173500 | consumed samples: 13785600 | consumed tokens: 28232908800 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.744225E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.209 | TFLOPs: 26.18 | +7: iteration 53860/ 173500 | consumed samples: 13788160 | consumed tokens: 28238151680 | elapsed time per iteration (s): 0.16 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.753331E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.769 | TFLOPs: 25.87 | +7: iteration 53870/ 173500 | consumed samples: 13790720 | consumed tokens: 28243394560 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.744899E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.844 | TFLOPs: 26.17 | +7: iteration 53880/ 173500 | consumed samples: 13793280 | consumed tokens: 28248637440 | elapsed time per iteration (s): 0.16 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.751705E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.303 | TFLOPs: 25.60 | +7: iteration 53890/ 173500 | consumed samples: 13795840 | consumed tokens: 28253880320 | elapsed time per iteration (s): 0.15 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.741151E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.592 | TFLOPs: 26.26 | +7: iteration 53900/ 173500 | consumed samples: 13798400 | consumed tokens: 28259123200 | elapsed time per iteration (s): 0.16 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.752722E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.407 | TFLOPs: 25.24 | +7: iteration 53910/ 173500 | consumed samples: 13800960 | consumed tokens: 28264366080 | elapsed time per iteration (s): 0.15 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.758055E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.554 | TFLOPs: 26.23 | +7: iteration 53920/ 173500 | consumed samples: 13803520 | consumed tokens: 28269608960 | elapsed time per iteration (s): 0.16 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.755844E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.742 | TFLOPs: 25.87 | +7: iteration 53930/ 173500 | consumed samples: 13806080 | consumed tokens: 28274851840 | elapsed time per iteration (s): 0.16 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.734309E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.456 | TFLOPs: 25.73 | +7: iteration 53940/ 173500 | consumed samples: 13808640 | consumed tokens: 28280094720 | elapsed time per iteration (s): 0.16 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.758502E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.341 | TFLOPs: 25.87 | +7: iteration 53950/ 173500 | consumed samples: 13811200 | consumed tokens: 28285337600 | elapsed time per iteration (s): 0.16 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.742783E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.022 | TFLOPs: 25.69 | +7: iteration 53960/ 173500 | consumed samples: 13813760 | consumed tokens: 28290580480 | elapsed time per iteration (s): 0.15 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.747132E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.109 | TFLOPs: 25.91 | +7: iteration 53970/ 173500 | consumed samples: 13816320 | consumed tokens: 28295823360 | elapsed time per iteration (s): 0.16 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.733410E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.512 | TFLOPs: 25.65 | +7: iteration 53980/ 173500 | consumed samples: 13818880 | consumed tokens: 28301066240 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.748064E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.788 | TFLOPs: 26.20 | +7: iteration 53990/ 173500 | consumed samples: 13821440 | consumed tokens: 28306309120 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.741349E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.331 | TFLOPs: 26.30 | +0: [2023-03-17 02:35:31,194] [INFO] [logging.py:68:log_dist] [Rank 0] step=54000, skipped=0, lr=[0.00016191666237869197, 0.00016191666237869197, 0.00016191666237869197], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 54000/ 173500 | consumed samples: 13824000 | consumed tokens: 28311552000 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.766037E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.941 | TFLOPs: 26.28 | +0: steps: 54000 loss: 3.7369 iter time (s): 0.156 samples/sec: 1638.126 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 54000 | lm loss value: 3.883396E+00 | lm loss PPL: 4.858893E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 54000 to checkpoints_44m91b100m +0: [2023-03-17 02:35:31,293] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step54000 is begin to save! +0: [2023-03-17 02:35:31,298] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:35:31,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:35:31,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:35:31,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:35:31,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:35:31,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:35:31,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:35:31,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:35:31,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:35:31,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:35:31,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:35:31,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:35:31,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:35:31,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:35:31,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:35:31,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:35:31,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:35:31,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:35:31,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:35:31,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:35:31,429] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step54000/mp_rank_00_model_states.pt +0: [2023-03-17 02:35:31,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:35:31,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:35:31,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:35:31,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:35:31,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:35:31,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +4: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +5: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +1: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +7: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:35:31,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +6: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:35:31,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:35:31,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 02:35:31,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +3: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +2: [2023-03-17 02:35:31,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step54000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:35:31,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step54000 is ready now! +0: successfully saved checkpoint at iteration 54000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 208.65 +7: iteration 54010/ 173500 | consumed samples: 13826560 | consumed tokens: 28316794880 | elapsed time per iteration (s): 0.18 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.743393E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.969 | TFLOPs: 22.14 | +7: iteration 54020/ 173500 | consumed samples: 13829120 | consumed tokens: 28322037760 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.741704E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.267 | TFLOPs: 25.99 | +7: iteration 54030/ 173500 | consumed samples: 13831680 | consumed tokens: 28327280640 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.748726E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.497 | TFLOPs: 26.18 | +7: iteration 54040/ 173500 | consumed samples: 13834240 | consumed tokens: 28332523520 | elapsed time per iteration (s): 0.15 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.745159E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.239 | TFLOPs: 26.27 | +7: iteration 54050/ 173500 | consumed samples: 13836800 | consumed tokens: 28337766400 | elapsed time per iteration (s): 0.15 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.749737E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.495 | TFLOPs: 26.24 | +7: iteration 54060/ 173500 | consumed samples: 13839360 | consumed tokens: 28343009280 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.752985E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.973 | TFLOPs: 25.70 | +7: iteration 54070/ 173500 | consumed samples: 13841920 | consumed tokens: 28348252160 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.741281E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.232 | TFLOPs: 25.41 | +7: iteration 54080/ 173500 | consumed samples: 13844480 | consumed tokens: 28353495040 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.736403E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.784 | TFLOPs: 24.99 | +7: iteration 54090/ 173500 | consumed samples: 13847040 | consumed tokens: 28358737920 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.746069E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.577 | TFLOPs: 25.18 | +7: iteration 54100/ 173500 | consumed samples: 13849600 | consumed tokens: 28363980800 | elapsed time per iteration (s): 0.16 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.755473E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.000 | TFLOPs: 25.37 | +7: iteration 54110/ 173500 | consumed samples: 13852160 | consumed tokens: 28369223680 | elapsed time per iteration (s): 0.15 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.752368E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.166 | TFLOPs: 25.91 | +7: iteration 54120/ 173500 | consumed samples: 13854720 | consumed tokens: 28374466560 | elapsed time per iteration (s): 0.15 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.742603E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.589 | TFLOPs: 26.28 | +7: iteration 54130/ 173500 | consumed samples: 13857280 | consumed tokens: 28379709440 | elapsed time per iteration (s): 0.15 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.749806E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.385 | TFLOPs: 25.94 | +7: iteration 54140/ 173500 | consumed samples: 13859840 | consumed tokens: 28384952320 | elapsed time per iteration (s): 0.16 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.748938E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.995 | TFLOPs: 25.52 | +7: iteration 54150/ 173500 | consumed samples: 13862400 | consumed tokens: 28390195200 | elapsed time per iteration (s): 0.16 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.742702E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.533 | TFLOPs: 25.79 | +7: iteration 54160/ 173500 | consumed samples: 13864960 | consumed tokens: 28395438080 | elapsed time per iteration (s): 0.15 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.745612E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.796 | TFLOPs: 26.09 | +7: iteration 54170/ 173500 | consumed samples: 13867520 | consumed tokens: 28400680960 | elapsed time per iteration (s): 0.16 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.749689E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.737 | TFLOPs: 25.32 | +7: iteration 54180/ 173500 | consumed samples: 13870080 | consumed tokens: 28405923840 | elapsed time per iteration (s): 0.16 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.760137E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.172 | TFLOPs: 25.82 | +7: iteration 54190/ 173500 | consumed samples: 13872640 | consumed tokens: 28411166720 | elapsed time per iteration (s): 0.16 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.758423E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.073 | TFLOPs: 25.05 | +7: iteration 54200/ 173500 | consumed samples: 13875200 | consumed tokens: 28416409600 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.747686E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.096 | TFLOPs: 25.85 | +7: iteration 54210/ 173500 | consumed samples: 13877760 | consumed tokens: 28421652480 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.743901E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.467 | TFLOPs: 25.79 | +7: iteration 54220/ 173500 | consumed samples: 13880320 | consumed tokens: 28426895360 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.758485E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.774 | TFLOPs: 25.84 | +7: iteration 54230/ 173500 | consumed samples: 13882880 | consumed tokens: 28432138240 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.754035E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.306 | TFLOPs: 25.83 | +7: iteration 54240/ 173500 | consumed samples: 13885440 | consumed tokens: 28437381120 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.735078E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.072 | TFLOPs: 25.77 | +7: iteration 54250/ 173500 | consumed samples: 13888000 | consumed tokens: 28442624000 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.740300E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.587 | TFLOPs: 25.76 | +7: iteration 54260/ 173500 | consumed samples: 13890560 | consumed tokens: 28447866880 | elapsed time per iteration (s): 0.15 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.744193E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.688 | TFLOPs: 26.15 | +7: iteration 54270/ 173500 | consumed samples: 13893120 | consumed tokens: 28453109760 | elapsed time per iteration (s): 0.16 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.736199E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.551 | TFLOPs: 25.87 | +7: iteration 54280/ 173500 | consumed samples: 13895680 | consumed tokens: 28458352640 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.753939E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.404 | TFLOPs: 26.23 | +7: iteration 54290/ 173500 | consumed samples: 13898240 | consumed tokens: 28463595520 | elapsed time per iteration (s): 0.16 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.748254E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.541 | TFLOPs: 25.81 | +7: iteration 54300/ 173500 | consumed samples: 13900800 | consumed tokens: 28468838400 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.735835E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.491 | TFLOPs: 26.21 | +7: iteration 54310/ 173500 | consumed samples: 13903360 | consumed tokens: 28474081280 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.739843E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.401 | TFLOPs: 26.21 | +7: iteration 54320/ 173500 | consumed samples: 13905920 | consumed tokens: 28479324160 | elapsed time per iteration (s): 0.16 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.730257E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.905 | TFLOPs: 25.37 | +7: iteration 54330/ 173500 | consumed samples: 13908480 | consumed tokens: 28484567040 | elapsed time per iteration (s): 0.16 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.756931E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.063 | TFLOPs: 25.81 | +7: iteration 54340/ 173500 | consumed samples: 13911040 | consumed tokens: 28489809920 | elapsed time per iteration (s): 0.15 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.744246E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.920 | TFLOPs: 25.91 | +7: iteration 54350/ 173500 | consumed samples: 13913600 | consumed tokens: 28495052800 | elapsed time per iteration (s): 0.16 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.754052E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.811 | TFLOPs: 25.79 | +7: iteration 54360/ 173500 | consumed samples: 13916160 | consumed tokens: 28500295680 | elapsed time per iteration (s): 0.17 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.725863E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.503 | TFLOPs: 24.16 | +7: iteration 54370/ 173500 | consumed samples: 13918720 | consumed tokens: 28505538560 | elapsed time per iteration (s): 0.16 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.744738E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.384 | TFLOPs: 25.68 | +7: iteration 54380/ 173500 | consumed samples: 13921280 | consumed tokens: 28510781440 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.759090E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.132 | TFLOPs: 26.22 | +7: iteration 54390/ 173500 | consumed samples: 13923840 | consumed tokens: 28516024320 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.741742E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.031 | TFLOPs: 26.24 | +7: iteration 54400/ 173500 | consumed samples: 13926400 | consumed tokens: 28521267200 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.765483E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.434 | TFLOPs: 26.21 | +7: iteration 54410/ 173500 | consumed samples: 13928960 | consumed tokens: 28526510080 | elapsed time per iteration (s): 0.16 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.745195E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.616 | TFLOPs: 25.82 | +7: iteration 54420/ 173500 | consumed samples: 13931520 | consumed tokens: 28531752960 | elapsed time per iteration (s): 0.15 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.745288E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.818 | TFLOPs: 26.19 | +7: iteration 54430/ 173500 | consumed samples: 13934080 | consumed tokens: 28536995840 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.748795E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.409 | TFLOPs: 26.21 | +7: iteration 54440/ 173500 | consumed samples: 13936640 | consumed tokens: 28542238720 | elapsed time per iteration (s): 0.16 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.758435E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.215 | TFLOPs: 25.88 | +7: iteration 54450/ 173500 | consumed samples: 13939200 | consumed tokens: 28547481600 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.749591E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.050 | TFLOPs: 26.24 | +7: iteration 54460/ 173500 | consumed samples: 13941760 | consumed tokens: 28552724480 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.736624E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.158 | TFLOPs: 26.22 | +7: iteration 54470/ 173500 | consumed samples: 13944320 | consumed tokens: 28557967360 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.739458E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.269 | TFLOPs: 26.21 | +7: iteration 54480/ 173500 | consumed samples: 13946880 | consumed tokens: 28563210240 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.753696E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.963 | TFLOPs: 26.22 | +7: iteration 54490/ 173500 | consumed samples: 13949440 | consumed tokens: 28568453120 | elapsed time per iteration (s): 0.15 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.750681E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.511 | TFLOPs: 26.23 | +7: iteration 54500/ 173500 | consumed samples: 13952000 | consumed tokens: 28573696000 | elapsed time per iteration (s): 0.16 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.753761E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.394 | TFLOPs: 25.90 | +7: iteration 54510/ 173500 | consumed samples: 13954560 | consumed tokens: 28578938880 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.752388E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.418 | TFLOPs: 26.27 | +7: iteration 54520/ 173500 | consumed samples: 13957120 | consumed tokens: 28584181760 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.754236E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.158 | TFLOPs: 26.27 | +7: iteration 54530/ 173500 | consumed samples: 13959680 | consumed tokens: 28589424640 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.747953E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.286 | TFLOPs: 26.27 | +7: iteration 54540/ 173500 | consumed samples: 13962240 | consumed tokens: 28594667520 | elapsed time per iteration (s): 0.15 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.745954E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.837 | TFLOPs: 26.28 | +7: iteration 54550/ 173500 | consumed samples: 13964800 | consumed tokens: 28599910400 | elapsed time per iteration (s): 0.16 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.742236E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.243 | TFLOPs: 25.90 | +7: iteration 54560/ 173500 | consumed samples: 13967360 | consumed tokens: 28605153280 | elapsed time per iteration (s): 0.16 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.747394E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.295 | TFLOPs: 25.55 | +7: iteration 54570/ 173500 | consumed samples: 13969920 | consumed tokens: 28610396160 | elapsed time per iteration (s): 0.16 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.744706E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.780 | TFLOPs: 25.65 | +7: iteration 54580/ 173500 | consumed samples: 13972480 | consumed tokens: 28615639040 | elapsed time per iteration (s): 0.16 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.745458E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.183 | TFLOPs: 25.88 | +7: iteration 54590/ 173500 | consumed samples: 13975040 | consumed tokens: 28620881920 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.755106E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.100 | TFLOPs: 26.25 | +7: iteration 54600/ 173500 | consumed samples: 13977600 | consumed tokens: 28626124800 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.749826E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.239 | TFLOPs: 26.24 | +7: iteration 54610/ 173500 | consumed samples: 13980160 | consumed tokens: 28631367680 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.744692E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.725 | TFLOPs: 26.17 | +7: iteration 54620/ 173500 | consumed samples: 13982720 | consumed tokens: 28636610560 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.759497E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.076 | TFLOPs: 26.16 | +7: iteration 54630/ 173500 | consumed samples: 13985280 | consumed tokens: 28641853440 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.748957E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.051 | TFLOPs: 26.16 | +7: iteration 54640/ 173500 | consumed samples: 13987840 | consumed tokens: 28647096320 | elapsed time per iteration (s): 0.15 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.744461E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.309 | TFLOPs: 26.18 | +7: iteration 54650/ 173500 | consumed samples: 13990400 | consumed tokens: 28652339200 | elapsed time per iteration (s): 0.15 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.751327E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.480 | TFLOPs: 26.18 | +7: iteration 54660/ 173500 | consumed samples: 13992960 | consumed tokens: 28657582080 | elapsed time per iteration (s): 0.15 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.754481E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.408 | TFLOPs: 26.24 | +7: iteration 54670/ 173500 | consumed samples: 13995520 | consumed tokens: 28662824960 | elapsed time per iteration (s): 0.16 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.744051E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.918 | TFLOPs: 25.58 | +7: iteration 54680/ 173500 | consumed samples: 13998080 | consumed tokens: 28668067840 | elapsed time per iteration (s): 0.16 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.736809E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.768 | TFLOPs: 25.76 | +7: iteration 54690/ 173500 | consumed samples: 14000640 | consumed tokens: 28673310720 | elapsed time per iteration (s): 0.15 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.746749E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.818 | TFLOPs: 26.14 | +7: iteration 54700/ 173500 | consumed samples: 14003200 | consumed tokens: 28678553600 | elapsed time per iteration (s): 0.16 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.744446E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.090 | TFLOPs: 25.80 | +7: iteration 54710/ 173500 | consumed samples: 14005760 | consumed tokens: 28683796480 | elapsed time per iteration (s): 0.15 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.743433E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.803 | TFLOPs: 26.16 | +7: iteration 54720/ 173500 | consumed samples: 14008320 | consumed tokens: 28689039360 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.740271E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.013 | TFLOPs: 26.16 | +7: iteration 54730/ 173500 | consumed samples: 14010880 | consumed tokens: 28694282240 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.750871E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.442 | TFLOPs: 26.15 | +7: iteration 54740/ 173500 | consumed samples: 14013440 | consumed tokens: 28699525120 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.748276E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.456 | TFLOPs: 26.15 | +7: iteration 54750/ 173500 | consumed samples: 14016000 | consumed tokens: 28704768000 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.756942E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.074 | TFLOPs: 26.13 | +7: iteration 54760/ 173500 | consumed samples: 14018560 | consumed tokens: 28710010880 | elapsed time per iteration (s): 0.15 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.756396E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.547 | TFLOPs: 26.15 | +7: iteration 54770/ 173500 | consumed samples: 14021120 | consumed tokens: 28715253760 | elapsed time per iteration (s): 0.16 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.743019E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.575 | TFLOPs: 25.87 | +7: iteration 54780/ 173500 | consumed samples: 14023680 | consumed tokens: 28720496640 | elapsed time per iteration (s): 0.16 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.754931E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.270 | TFLOPs: 25.80 | +7: iteration 54790/ 173500 | consumed samples: 14026240 | consumed tokens: 28725739520 | elapsed time per iteration (s): 0.16 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.739473E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.069 | TFLOPs: 25.11 | +7: iteration 54800/ 173500 | consumed samples: 14028800 | consumed tokens: 28730982400 | elapsed time per iteration (s): 0.16 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.750262E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.509 | TFLOPs: 25.59 | +7: iteration 54810/ 173500 | consumed samples: 14031360 | consumed tokens: 28736225280 | elapsed time per iteration (s): 0.16 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.731702E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.816 | TFLOPs: 25.67 | +7: iteration 54820/ 173500 | consumed samples: 14033920 | consumed tokens: 28741468160 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.749397E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.374 | TFLOPs: 26.12 | +7: iteration 54830/ 173500 | consumed samples: 14036480 | consumed tokens: 28746711040 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.740782E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.192 | TFLOPs: 26.13 | +7: iteration 54840/ 173500 | consumed samples: 14039040 | consumed tokens: 28751953920 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.744558E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.269 | TFLOPs: 26.13 | +7: iteration 54850/ 173500 | consumed samples: 14041600 | consumed tokens: 28757196800 | elapsed time per iteration (s): 0.16 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.738017E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.055 | TFLOPs: 25.44 | +7: iteration 54860/ 173500 | consumed samples: 14044160 | consumed tokens: 28762439680 | elapsed time per iteration (s): 0.15 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.745326E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.028 | TFLOPs: 26.13 | +7: iteration 54870/ 173500 | consumed samples: 14046720 | consumed tokens: 28767682560 | elapsed time per iteration (s): 0.15 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.743132E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.025 | TFLOPs: 26.14 | +7: iteration 54880/ 173500 | consumed samples: 14049280 | consumed tokens: 28772925440 | elapsed time per iteration (s): 0.16 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.747575E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.137 | TFLOPs: 25.80 | +7: iteration 54890/ 173500 | consumed samples: 14051840 | consumed tokens: 28778168320 | elapsed time per iteration (s): 0.16 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.739571E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.512 | TFLOPs: 25.77 | +7: iteration 54900/ 173500 | consumed samples: 14054400 | consumed tokens: 28783411200 | elapsed time per iteration (s): 0.16 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.745531E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.795 | TFLOPs: 25.43 | +7: iteration 54910/ 173500 | consumed samples: 14056960 | consumed tokens: 28788654080 | elapsed time per iteration (s): 0.16 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.739347E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.556 | TFLOPs: 25.81 | +7: iteration 54920/ 173500 | consumed samples: 14059520 | consumed tokens: 28793896960 | elapsed time per iteration (s): 0.15 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.736127E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.715 | TFLOPs: 26.04 | +7: iteration 54930/ 173500 | consumed samples: 14062080 | consumed tokens: 28799139840 | elapsed time per iteration (s): 0.16 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.745645E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.123 | TFLOPs: 25.72 | +7: iteration 54940/ 173500 | consumed samples: 14064640 | consumed tokens: 28804382720 | elapsed time per iteration (s): 0.16 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.733115E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.460 | TFLOPs: 25.79 | +7: iteration 54950/ 173500 | consumed samples: 14067200 | consumed tokens: 28809625600 | elapsed time per iteration (s): 0.15 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.736623E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.900 | TFLOPs: 26.13 | +7: iteration 54960/ 173500 | consumed samples: 14069760 | consumed tokens: 28814868480 | elapsed time per iteration (s): 0.16 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.744662E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.049 | TFLOPs: 25.53 | +7: iteration 54970/ 173500 | consumed samples: 14072320 | consumed tokens: 28820111360 | elapsed time per iteration (s): 0.16 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.755481E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.651 | TFLOPs: 25.82 | +7: iteration 54980/ 173500 | consumed samples: 14074880 | consumed tokens: 28825354240 | elapsed time per iteration (s): 0.16 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.739453E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.570 | TFLOPs: 25.57 | +7: iteration 54990/ 173500 | consumed samples: 14077440 | consumed tokens: 28830597120 | elapsed time per iteration (s): 0.16 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.747982E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.160 | TFLOPs: 25.06 | +7: iteration 55000/ 173500 | consumed samples: 14080000 | consumed tokens: 28835840000 | elapsed time per iteration (s): 0.15 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.744922E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 55000 | lm loss value: 3.875039E+00 | lm loss PPL: 4.818457E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 55000 to checkpoints_44m91b100m +0: [2023-03-17 02:38:06,590] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step55000 is begin to save! +0: [2023-03-17 02:38:06,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:38:06,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:38:06,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:38:06,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:38:06,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:38:06,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:38:06,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:38:06,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:38:06,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:38:06,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:38:06,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:38:06,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:38:06,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:38:06,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:38:06,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:38:06,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:38:06,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:38:06,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:38:06,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:38:06,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:38:06,725] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step55000/mp_rank_00_model_states.pt +0: [2023-03-17 02:38:06,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:38:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:38:06,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:38:06,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +4: [2023-03-17 02:38:06,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +7: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +3: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 02:38:06,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +2: [2023-03-17 02:38:06,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +6: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:38:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +5: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +1: [2023-03-17 02:38:06,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step55000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:38:06,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step55000 is ready now! +0: successfully saved checkpoint at iteration 55000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.90 +7: iteration 55010/ 173500 | consumed samples: 14082560 | consumed tokens: 28841082880 | elapsed time per iteration (s): 0.18 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.748980E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1445.022 | TFLOPs: 22.66 | +7: iteration 55020/ 173500 | consumed samples: 14085120 | consumed tokens: 28846325760 | elapsed time per iteration (s): 0.16 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.735926E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.666 | TFLOPs: 25.67 | +7: iteration 55030/ 173500 | consumed samples: 14087680 | consumed tokens: 28851568640 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.741591E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.031 | TFLOPs: 26.22 | +7: iteration 55040/ 173500 | consumed samples: 14090240 | consumed tokens: 28856811520 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.757364E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.199 | TFLOPs: 26.24 | +7: iteration 55050/ 173500 | consumed samples: 14092800 | consumed tokens: 28862054400 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.747028E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.285 | TFLOPs: 26.08 | +7: iteration 55060/ 173500 | consumed samples: 14095360 | consumed tokens: 28867297280 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.751752E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.314 | TFLOPs: 26.21 | +7: iteration 55070/ 173500 | consumed samples: 14097920 | consumed tokens: 28872540160 | elapsed time per iteration (s): 0.16 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.749272E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.027 | TFLOPs: 25.59 | +7: iteration 55080/ 173500 | consumed samples: 14100480 | consumed tokens: 28877783040 | elapsed time per iteration (s): 0.15 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.750708E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.700 | TFLOPs: 26.08 | +7: iteration 55090/ 173500 | consumed samples: 14103040 | consumed tokens: 28883025920 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.752233E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.837 | TFLOPs: 26.16 | +7: iteration 55100/ 173500 | consumed samples: 14105600 | consumed tokens: 28888268800 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.755687E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.103 | TFLOPs: 26.08 | +7: iteration 55110/ 173500 | consumed samples: 14108160 | consumed tokens: 28893511680 | elapsed time per iteration (s): 0.16 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.733844E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.594 | TFLOPs: 25.70 | +7: iteration 55120/ 173500 | consumed samples: 14110720 | consumed tokens: 28898754560 | elapsed time per iteration (s): 0.16 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.750644E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.176 | TFLOPs: 25.72 | +7: iteration 55130/ 173500 | consumed samples: 14113280 | consumed tokens: 28903997440 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.745854E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.489 | TFLOPs: 26.07 | +7: iteration 55140/ 173500 | consumed samples: 14115840 | consumed tokens: 28909240320 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.755282E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.546 | TFLOPs: 26.10 | +7: iteration 55150/ 173500 | consumed samples: 14118400 | consumed tokens: 28914483200 | elapsed time per iteration (s): 0.15 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.739851E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.631 | TFLOPs: 26.14 | +7: iteration 55160/ 173500 | consumed samples: 14120960 | consumed tokens: 28919726080 | elapsed time per iteration (s): 0.16 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.737356E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.301 | TFLOPs: 25.41 | +7: iteration 55170/ 173500 | consumed samples: 14123520 | consumed tokens: 28924968960 | elapsed time per iteration (s): 0.15 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.727849E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.995 | TFLOPs: 26.24 | +7: iteration 55180/ 173500 | consumed samples: 14126080 | consumed tokens: 28930211840 | elapsed time per iteration (s): 0.15 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.747298E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.414 | TFLOPs: 26.23 | +7: iteration 55190/ 173500 | consumed samples: 14128640 | consumed tokens: 28935454720 | elapsed time per iteration (s): 0.16 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.748728E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.048 | TFLOPs: 25.70 | +7: iteration 55200/ 173500 | consumed samples: 14131200 | consumed tokens: 28940697600 | elapsed time per iteration (s): 0.16 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.739606E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.697 | TFLOPs: 25.09 | +7: iteration 55210/ 173500 | consumed samples: 14133760 | consumed tokens: 28945940480 | elapsed time per iteration (s): 0.16 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.750528E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.152 | TFLOPs: 25.74 | +7: iteration 55220/ 173500 | consumed samples: 14136320 | consumed tokens: 28951183360 | elapsed time per iteration (s): 0.15 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.717847E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.687 | TFLOPs: 26.01 | +7: iteration 55230/ 173500 | consumed samples: 14138880 | consumed tokens: 28956426240 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.747286E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.247 | TFLOPs: 25.27 | +7: iteration 55240/ 173500 | consumed samples: 14141440 | consumed tokens: 28961669120 | elapsed time per iteration (s): 0.17 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.744841E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.781 | TFLOPs: 23.22 | +7: iteration 55250/ 173500 | consumed samples: 14144000 | consumed tokens: 28966912000 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.756730E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.434 | TFLOPs: 25.62 | +7: iteration 55260/ 173500 | consumed samples: 14146560 | consumed tokens: 28972154880 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.750821E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.989 | TFLOPs: 25.52 | +7: iteration 55270/ 173500 | consumed samples: 14149120 | consumed tokens: 28977397760 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.735051E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.670 | TFLOPs: 25.79 | +7: iteration 55280/ 173500 | consumed samples: 14151680 | consumed tokens: 28982640640 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.745589E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.718 | TFLOPs: 25.70 | +7: iteration 55290/ 173500 | consumed samples: 14154240 | consumed tokens: 28987883520 | elapsed time per iteration (s): 0.15 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.735299E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.674 | TFLOPs: 26.18 | +7: iteration 55300/ 173500 | consumed samples: 14156800 | consumed tokens: 28993126400 | elapsed time per iteration (s): 0.16 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.743895E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.754 | TFLOPs: 25.84 | +7: iteration 55310/ 173500 | consumed samples: 14159360 | consumed tokens: 28998369280 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.752925E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.510 | TFLOPs: 26.28 | +7: iteration 55320/ 173500 | consumed samples: 14161920 | consumed tokens: 29003612160 | elapsed time per iteration (s): 0.16 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.724241E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.850 | TFLOPs: 25.72 | +7: iteration 55330/ 173500 | consumed samples: 14164480 | consumed tokens: 29008855040 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.748820E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.888 | TFLOPs: 26.14 | +7: iteration 55340/ 173500 | consumed samples: 14167040 | consumed tokens: 29014097920 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.748687E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.549 | TFLOPs: 26.15 | +7: iteration 55350/ 173500 | consumed samples: 14169600 | consumed tokens: 29019340800 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.745366E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.257 | TFLOPs: 26.13 | +7: iteration 55360/ 173500 | consumed samples: 14172160 | consumed tokens: 29024583680 | elapsed time per iteration (s): 0.15 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.742698E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.120 | TFLOPs: 26.13 | +7: iteration 55370/ 173500 | consumed samples: 14174720 | consumed tokens: 29029826560 | elapsed time per iteration (s): 0.16 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.748767E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.436 | TFLOPs: 25.85 | +7: iteration 55380/ 173500 | consumed samples: 14177280 | consumed tokens: 29035069440 | elapsed time per iteration (s): 0.16 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.737931E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.913 | TFLOPs: 25.67 | +7: iteration 55390/ 173500 | consumed samples: 14179840 | consumed tokens: 29040312320 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.742832E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.351 | TFLOPs: 26.07 | +7: iteration 55400/ 173500 | consumed samples: 14182400 | consumed tokens: 29045555200 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.750278E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.816 | TFLOPs: 26.08 | +7: iteration 55410/ 173500 | consumed samples: 14184960 | consumed tokens: 29050798080 | elapsed time per iteration (s): 0.16 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.748769E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.636 | TFLOPs: 25.84 | +7: iteration 55420/ 173500 | consumed samples: 14187520 | consumed tokens: 29056040960 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.728316E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.278 | TFLOPs: 26.13 | +7: iteration 55430/ 173500 | consumed samples: 14190080 | consumed tokens: 29061283840 | elapsed time per iteration (s): 0.16 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.741245E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.342 | TFLOPs: 25.68 | +7: iteration 55440/ 173500 | consumed samples: 14192640 | consumed tokens: 29066526720 | elapsed time per iteration (s): 0.15 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.736474E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.517 | TFLOPs: 26.14 | +7: iteration 55450/ 173500 | consumed samples: 14195200 | consumed tokens: 29071769600 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.745734E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.317 | TFLOPs: 26.10 | +7: iteration 55460/ 173500 | consumed samples: 14197760 | consumed tokens: 29077012480 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.743257E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.526 | TFLOPs: 26.15 | +7: iteration 55470/ 173500 | consumed samples: 14200320 | consumed tokens: 29082255360 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.752486E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.208 | TFLOPs: 26.11 | +7: iteration 55480/ 173500 | consumed samples: 14202880 | consumed tokens: 29087498240 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.750254E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.223 | TFLOPs: 26.15 | +7: iteration 55490/ 173500 | consumed samples: 14205440 | consumed tokens: 29092741120 | elapsed time per iteration (s): 0.16 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.741314E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.902 | TFLOPs: 25.33 | +7: iteration 55500/ 173500 | consumed samples: 14208000 | consumed tokens: 29097984000 | elapsed time per iteration (s): 0.16 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.740128E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.578 | TFLOPs: 25.79 | +7: iteration 55510/ 173500 | consumed samples: 14210560 | consumed tokens: 29103226880 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.741545E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.981 | TFLOPs: 26.11 | +7: iteration 55520/ 173500 | consumed samples: 14213120 | consumed tokens: 29108469760 | elapsed time per iteration (s): 0.15 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.749417E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.484 | TFLOPs: 26.12 | +7: iteration 55530/ 173500 | consumed samples: 14215680 | consumed tokens: 29113712640 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.739507E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.363 | TFLOPs: 25.91 | +7: iteration 55540/ 173500 | consumed samples: 14218240 | consumed tokens: 29118955520 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.733147E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.411 | TFLOPs: 25.99 | +7: iteration 55550/ 173500 | consumed samples: 14220800 | consumed tokens: 29124198400 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.744975E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.967 | TFLOPs: 26.00 | +7: iteration 55560/ 173500 | consumed samples: 14223360 | consumed tokens: 29129441280 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.750345E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.891 | TFLOPs: 26.00 | +7: iteration 55570/ 173500 | consumed samples: 14225920 | consumed tokens: 29134684160 | elapsed time per iteration (s): 0.16 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.733300E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.107 | TFLOPs: 25.44 | +7: iteration 55580/ 173500 | consumed samples: 14228480 | consumed tokens: 29139927040 | elapsed time per iteration (s): 0.15 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.736810E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.173 | TFLOPs: 25.99 | +7: iteration 55590/ 173500 | consumed samples: 14231040 | consumed tokens: 29145169920 | elapsed time per iteration (s): 0.16 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.739757E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.994 | TFLOPs: 25.78 | +7: iteration 55600/ 173500 | consumed samples: 14233600 | consumed tokens: 29150412800 | elapsed time per iteration (s): 0.16 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.745193E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.531 | TFLOPs: 25.76 | +7: iteration 55610/ 173500 | consumed samples: 14236160 | consumed tokens: 29155655680 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.744183E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.565 | TFLOPs: 26.01 | +7: iteration 55620/ 173500 | consumed samples: 14238720 | consumed tokens: 29160898560 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.740094E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.207 | TFLOPs: 26.00 | +7: iteration 55630/ 173500 | consumed samples: 14241280 | consumed tokens: 29166141440 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.745100E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.870 | TFLOPs: 26.00 | +7: iteration 55640/ 173500 | consumed samples: 14243840 | consumed tokens: 29171384320 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.749802E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.450 | TFLOPs: 25.99 | +7: iteration 55650/ 173500 | consumed samples: 14246400 | consumed tokens: 29176627200 | elapsed time per iteration (s): 0.16 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.746413E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.619 | TFLOPs: 25.67 | +7: iteration 55660/ 173500 | consumed samples: 14248960 | consumed tokens: 29181870080 | elapsed time per iteration (s): 0.15 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.746849E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.868 | TFLOPs: 25.98 | +7: iteration 55670/ 173500 | consumed samples: 14251520 | consumed tokens: 29187112960 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.755693E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.957 | TFLOPs: 26.00 | +7: iteration 55680/ 173500 | consumed samples: 14254080 | consumed tokens: 29192355840 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.742250E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.163 | TFLOPs: 26.08 | +7: iteration 55690/ 173500 | consumed samples: 14256640 | consumed tokens: 29197598720 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.751272E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.148 | TFLOPs: 26.13 | +7: iteration 55700/ 173500 | consumed samples: 14259200 | consumed tokens: 29202841600 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.741354E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.696 | TFLOPs: 26.14 | +7: iteration 55710/ 173500 | consumed samples: 14261760 | consumed tokens: 29208084480 | elapsed time per iteration (s): 0.16 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.736370E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.774 | TFLOPs: 25.51 | +7: iteration 55720/ 173500 | consumed samples: 14264320 | consumed tokens: 29213327360 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.735251E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.660 | TFLOPs: 26.09 | +7: iteration 55730/ 173500 | consumed samples: 14266880 | consumed tokens: 29218570240 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.746709E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.469 | TFLOPs: 26.10 | +7: iteration 55740/ 173500 | consumed samples: 14269440 | consumed tokens: 29223813120 | elapsed time per iteration (s): 0.15 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.742701E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.155 | TFLOPs: 26.11 | +7: iteration 55750/ 173500 | consumed samples: 14272000 | consumed tokens: 29229056000 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.739608E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.343 | TFLOPs: 26.12 | +7: iteration 55760/ 173500 | consumed samples: 14274560 | consumed tokens: 29234298880 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.747964E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.595 | TFLOPs: 26.12 | +7: iteration 55770/ 173500 | consumed samples: 14277120 | consumed tokens: 29239541760 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.747181E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.621 | TFLOPs: 26.07 | +7: iteration 55780/ 173500 | consumed samples: 14279680 | consumed tokens: 29244784640 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.752673E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.145 | TFLOPs: 26.10 | +7: iteration 55790/ 173500 | consumed samples: 14282240 | consumed tokens: 29250027520 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.748538E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.685 | TFLOPs: 26.11 | +7: iteration 55800/ 173500 | consumed samples: 14284800 | consumed tokens: 29255270400 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.745758E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.422 | TFLOPs: 26.09 | +7: iteration 55810/ 173500 | consumed samples: 14287360 | consumed tokens: 29260513280 | elapsed time per iteration (s): 0.15 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.748282E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.335 | TFLOPs: 26.12 | +7: iteration 55820/ 173500 | consumed samples: 14289920 | consumed tokens: 29265756160 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.741328E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.294 | TFLOPs: 26.12 | +7: iteration 55830/ 173500 | consumed samples: 14292480 | consumed tokens: 29270999040 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.742548E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.516 | TFLOPs: 26.14 | +7: iteration 55840/ 173500 | consumed samples: 14295040 | consumed tokens: 29276241920 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.743418E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.218 | TFLOPs: 26.11 | +7: iteration 55850/ 173500 | consumed samples: 14297600 | consumed tokens: 29281484800 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.749615E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.055 | TFLOPs: 26.10 | +7: iteration 55860/ 173500 | consumed samples: 14300160 | consumed tokens: 29286727680 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.736231E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.193 | TFLOPs: 26.10 | +7: iteration 55870/ 173500 | consumed samples: 14302720 | consumed tokens: 29291970560 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.750815E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.583 | TFLOPs: 26.12 | +7: iteration 55880/ 173500 | consumed samples: 14305280 | consumed tokens: 29297213440 | elapsed time per iteration (s): 0.15 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.746159E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.788 | TFLOPs: 26.11 | +7: iteration 55890/ 173500 | consumed samples: 14307840 | consumed tokens: 29302456320 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.740223E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.320 | TFLOPs: 26.10 | +7: iteration 55900/ 173500 | consumed samples: 14310400 | consumed tokens: 29307699200 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.741861E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.849 | TFLOPs: 26.09 | +7: iteration 55910/ 173500 | consumed samples: 14312960 | consumed tokens: 29312942080 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.727356E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.050 | TFLOPs: 26.13 | +7: iteration 55920/ 173500 | consumed samples: 14315520 | consumed tokens: 29318184960 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.737474E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.476 | TFLOPs: 26.12 | +7: iteration 55930/ 173500 | consumed samples: 14318080 | consumed tokens: 29323427840 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.756667E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.386 | TFLOPs: 25.98 | +7: iteration 55940/ 173500 | consumed samples: 14320640 | consumed tokens: 29328670720 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.745467E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.668 | TFLOPs: 26.01 | +7: iteration 55950/ 173500 | consumed samples: 14323200 | consumed tokens: 29333913600 | elapsed time per iteration (s): 0.15 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.739834E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.105 | TFLOPs: 26.03 | +7: iteration 55960/ 173500 | consumed samples: 14325760 | consumed tokens: 29339156480 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.750557E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.632 | TFLOPs: 26.14 | +7: iteration 55970/ 173500 | consumed samples: 14328320 | consumed tokens: 29344399360 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.745736E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.019 | TFLOPs: 26.14 | +7: iteration 55980/ 173500 | consumed samples: 14330880 | consumed tokens: 29349642240 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.742293E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.813 | TFLOPs: 26.11 | +7: iteration 55990/ 173500 | consumed samples: 14333440 | consumed tokens: 29354885120 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.760494E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.528 | TFLOPs: 26.12 | +0: [2023-03-17 02:40:41,537] [INFO] [logging.py:68:log_dist] [Rank 0] step=56000, skipped=0, lr=[0.0001591933009380588, 0.0001591933009380588, 0.0001591933009380588], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 56000/ 173500 | consumed samples: 14336000 | consumed tokens: 29360128000 | elapsed time per iteration (s): 0.16 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.741338E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.212 | TFLOPs: 25.82 | +0: steps: 56000 loss: 3.7637 iter time (s): 0.154 samples/sec: 1662.108 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 56000 | lm loss value: 3.892998E+00 | lm loss PPL: 4.905776E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 56000 to checkpoints_44m91b100m +0: [2023-03-17 02:40:41,611] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step56000 is begin to save! +0: [2023-03-17 02:40:41,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:40:41,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:40:41,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:40:41,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:40:41,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:40:41,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:40:41,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:40:41,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:40:41,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:40:41,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:40:41,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:40:41,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:40:41,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:40:41,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:40:41,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:40:41,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:40:41,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:40:41,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:40:41,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:40:41,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:40:41,741] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step56000/mp_rank_00_model_states.pt +0: [2023-03-17 02:40:41,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:40:41,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:40:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:40:41,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 02:40:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 02:40:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +2: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +7: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +5: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +4: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +6: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +3: [2023-03-17 02:40:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:40:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:40:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +1: [2023-03-17 02:40:41,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:40:41,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step56000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:40:41,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step56000 is ready now! +0: successfully saved checkpoint at iteration 56000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.46 +7: iteration 56010/ 173500 | consumed samples: 14338560 | consumed tokens: 29365370880 | elapsed time per iteration (s): 0.18 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.754102E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.380 | TFLOPs: 22.23 | +7: iteration 56020/ 173500 | consumed samples: 14341120 | consumed tokens: 29370613760 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.731625E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.671 | TFLOPs: 26.15 | +7: iteration 56030/ 173500 | consumed samples: 14343680 | consumed tokens: 29375856640 | elapsed time per iteration (s): 0.15 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.737650E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.306 | TFLOPs: 26.12 | +7: iteration 56040/ 173500 | consumed samples: 14346240 | consumed tokens: 29381099520 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.728219E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.563 | TFLOPs: 26.10 | +7: iteration 56050/ 173500 | consumed samples: 14348800 | consumed tokens: 29386342400 | elapsed time per iteration (s): 0.16 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.731014E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.817 | TFLOPs: 25.76 | +7: iteration 56060/ 173500 | consumed samples: 14351360 | consumed tokens: 29391585280 | elapsed time per iteration (s): 0.16 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.723788E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.043 | TFLOPs: 25.27 | +7: iteration 56070/ 173500 | consumed samples: 14353920 | consumed tokens: 29396828160 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.752177E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.698 | TFLOPs: 26.12 | +7: iteration 56080/ 173500 | consumed samples: 14356480 | consumed tokens: 29402071040 | elapsed time per iteration (s): 0.16 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.739978E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.125 | TFLOPs: 25.47 | +7: iteration 56090/ 173500 | consumed samples: 14359040 | consumed tokens: 29407313920 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.754499E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.106 | TFLOPs: 26.08 | +7: iteration 56100/ 173500 | consumed samples: 14361600 | consumed tokens: 29412556800 | elapsed time per iteration (s): 0.15 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.745805E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.596 | TFLOPs: 26.00 | +7: iteration 56110/ 173500 | consumed samples: 14364160 | consumed tokens: 29417799680 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.746250E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.520 | TFLOPs: 25.99 | +7: iteration 56120/ 173500 | consumed samples: 14366720 | consumed tokens: 29423042560 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.747228E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.757 | TFLOPs: 26.14 | +7: iteration 56130/ 173500 | consumed samples: 14369280 | consumed tokens: 29428285440 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.739969E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.493 | TFLOPs: 26.15 | +7: iteration 56140/ 173500 | consumed samples: 14371840 | consumed tokens: 29433528320 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.740179E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.605 | TFLOPs: 26.15 | +7: iteration 56150/ 173500 | consumed samples: 14374400 | consumed tokens: 29438771200 | elapsed time per iteration (s): 0.16 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.748064E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.051 | TFLOPs: 25.78 | +7: iteration 56160/ 173500 | consumed samples: 14376960 | consumed tokens: 29444014080 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.762208E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.281 | TFLOPs: 26.15 | +7: iteration 56170/ 173500 | consumed samples: 14379520 | consumed tokens: 29449256960 | elapsed time per iteration (s): 0.15 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.749300E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.858 | TFLOPs: 26.14 | +7: iteration 56180/ 173500 | consumed samples: 14382080 | consumed tokens: 29454499840 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.750771E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.147 | TFLOPs: 26.15 | +7: iteration 56190/ 173500 | consumed samples: 14384640 | consumed tokens: 29459742720 | elapsed time per iteration (s): 0.16 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.749078E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.616 | TFLOPs: 25.68 | +7: iteration 56200/ 173500 | consumed samples: 14387200 | consumed tokens: 29464985600 | elapsed time per iteration (s): 0.18 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.737942E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.795 | TFLOPs: 22.56 | +7: iteration 56210/ 173500 | consumed samples: 14389760 | consumed tokens: 29470228480 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.751602E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.060 | TFLOPs: 26.11 | +7: iteration 56220/ 173500 | consumed samples: 14392320 | consumed tokens: 29475471360 | elapsed time per iteration (s): 0.16 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.737259E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.303 | TFLOPs: 25.77 | +7: iteration 56230/ 173500 | consumed samples: 14394880 | consumed tokens: 29480714240 | elapsed time per iteration (s): 0.16 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.731455E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.684 | TFLOPs: 25.75 | +7: iteration 56240/ 173500 | consumed samples: 14397440 | consumed tokens: 29485957120 | elapsed time per iteration (s): 0.15 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.742689E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.278 | TFLOPs: 26.16 | +7: iteration 56250/ 173500 | consumed samples: 14400000 | consumed tokens: 29491200000 | elapsed time per iteration (s): 0.16 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.748254E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.004 | TFLOPs: 25.69 | +7: iteration 56260/ 173500 | consumed samples: 14402560 | consumed tokens: 29496442880 | elapsed time per iteration (s): 0.18 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.757635E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1460.533 | TFLOPs: 22.90 | +7: iteration 56270/ 173500 | consumed samples: 14405120 | consumed tokens: 29501685760 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.727445E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.779 | TFLOPs: 26.15 | +7: iteration 56280/ 173500 | consumed samples: 14407680 | consumed tokens: 29506928640 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.738789E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.419 | TFLOPs: 26.15 | +7: iteration 56290/ 173500 | consumed samples: 14410240 | consumed tokens: 29512171520 | elapsed time per iteration (s): 0.16 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.735296E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.112 | TFLOPs: 25.38 | +7: iteration 56300/ 173500 | consumed samples: 14412800 | consumed tokens: 29517414400 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.760018E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.690 | TFLOPs: 26.09 | +7: iteration 56310/ 173500 | consumed samples: 14415360 | consumed tokens: 29522657280 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.732951E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.078 | TFLOPs: 26.13 | +7: iteration 56320/ 173500 | consumed samples: 14417920 | consumed tokens: 29527900160 | elapsed time per iteration (s): 0.15 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.747077E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.434 | TFLOPs: 26.10 | +7: iteration 56330/ 173500 | consumed samples: 14420480 | consumed tokens: 29533143040 | elapsed time per iteration (s): 0.17 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.725688E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1484.102 | TFLOPs: 23.27 | +7: iteration 56340/ 173500 | consumed samples: 14423040 | consumed tokens: 29538385920 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.741517E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.341 | TFLOPs: 26.16 | +7: iteration 56350/ 173500 | consumed samples: 14425600 | consumed tokens: 29543628800 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.750875E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.325 | TFLOPs: 26.16 | +7: iteration 56360/ 173500 | consumed samples: 14428160 | consumed tokens: 29548871680 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.746708E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.558 | TFLOPs: 26.17 | +7: iteration 56370/ 173500 | consumed samples: 14430720 | consumed tokens: 29554114560 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.738199E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.122 | TFLOPs: 26.16 | +7: iteration 56380/ 173500 | consumed samples: 14433280 | consumed tokens: 29559357440 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.735089E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.034 | TFLOPs: 26.14 | +7: iteration 56390/ 173500 | consumed samples: 14435840 | consumed tokens: 29564600320 | elapsed time per iteration (s): 0.15 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.742638E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.620 | TFLOPs: 26.14 | +7: iteration 56400/ 173500 | consumed samples: 14438400 | consumed tokens: 29569843200 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.737620E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.581 | TFLOPs: 26.09 | +7: iteration 56410/ 173500 | consumed samples: 14440960 | consumed tokens: 29575086080 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.736546E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.391 | TFLOPs: 26.15 | +7: iteration 56420/ 173500 | consumed samples: 14443520 | consumed tokens: 29580328960 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.753714E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.270 | TFLOPs: 26.15 | +7: iteration 56430/ 173500 | consumed samples: 14446080 | consumed tokens: 29585571840 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.741702E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.112 | TFLOPs: 26.14 | +7: iteration 56440/ 173500 | consumed samples: 14448640 | consumed tokens: 29590814720 | elapsed time per iteration (s): 0.16 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.750355E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.946 | TFLOPs: 25.78 | +7: iteration 56450/ 173500 | consumed samples: 14451200 | consumed tokens: 29596057600 | elapsed time per iteration (s): 0.17 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.746067E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.355 | TFLOPs: 23.11 | +7: iteration 56460/ 173500 | consumed samples: 14453760 | consumed tokens: 29601300480 | elapsed time per iteration (s): 0.15 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.741869E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.689 | TFLOPs: 26.17 | +7: iteration 56470/ 173500 | consumed samples: 14456320 | consumed tokens: 29606543360 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.738876E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.318 | TFLOPs: 26.15 | +7: iteration 56480/ 173500 | consumed samples: 14458880 | consumed tokens: 29611786240 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.734609E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.511 | TFLOPs: 26.15 | +7: iteration 56490/ 173500 | consumed samples: 14461440 | consumed tokens: 29617029120 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.746291E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.924 | TFLOPs: 25.94 | +7: iteration 56500/ 173500 | consumed samples: 14464000 | consumed tokens: 29622272000 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.749715E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.513 | TFLOPs: 26.15 | +7: iteration 56510/ 173500 | consumed samples: 14466560 | consumed tokens: 29627514880 | elapsed time per iteration (s): 0.16 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.739906E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.496 | TFLOPs: 25.82 | +7: iteration 56520/ 173500 | consumed samples: 14469120 | consumed tokens: 29632757760 | elapsed time per iteration (s): 0.16 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.748149E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.558 | TFLOPs: 25.79 | +7: iteration 56530/ 173500 | consumed samples: 14471680 | consumed tokens: 29638000640 | elapsed time per iteration (s): 0.15 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.744647E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.998 | TFLOPs: 26.14 | +7: iteration 56540/ 173500 | consumed samples: 14474240 | consumed tokens: 29643243520 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.748651E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.325 | TFLOPs: 26.05 | +7: iteration 56550/ 173500 | consumed samples: 14476800 | consumed tokens: 29648486400 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.742463E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.806 | TFLOPs: 26.05 | +7: iteration 56560/ 173500 | consumed samples: 14479360 | consumed tokens: 29653729280 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.740292E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.217 | TFLOPs: 26.05 | +7: iteration 56570/ 173500 | consumed samples: 14481920 | consumed tokens: 29658972160 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.736813E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.254 | TFLOPs: 26.01 | +7: iteration 56580/ 173500 | consumed samples: 14484480 | consumed tokens: 29664215040 | elapsed time per iteration (s): 0.18 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.758475E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1457.834 | TFLOPs: 22.86 | +7: iteration 56590/ 173500 | consumed samples: 14487040 | consumed tokens: 29669457920 | elapsed time per iteration (s): 0.16 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.730871E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.075 | TFLOPs: 25.23 | +7: iteration 56600/ 173500 | consumed samples: 14489600 | consumed tokens: 29674700800 | elapsed time per iteration (s): 0.15 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.737404E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.790 | TFLOPs: 26.06 | +7: iteration 56610/ 173500 | consumed samples: 14492160 | consumed tokens: 29679943680 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.744070E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.212 | TFLOPs: 26.08 | +7: iteration 56620/ 173500 | consumed samples: 14494720 | consumed tokens: 29685186560 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.738065E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.363 | TFLOPs: 26.07 | +7: iteration 56630/ 173500 | consumed samples: 14497280 | consumed tokens: 29690429440 | elapsed time per iteration (s): 0.16 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.745439E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.192 | TFLOPs: 25.75 | +7: iteration 56640/ 173500 | consumed samples: 14499840 | consumed tokens: 29695672320 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.742174E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.896 | TFLOPs: 26.09 | +7: iteration 56650/ 173500 | consumed samples: 14502400 | consumed tokens: 29700915200 | elapsed time per iteration (s): 0.15 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.729899E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.766 | TFLOPs: 26.11 | +7: iteration 56660/ 173500 | consumed samples: 14504960 | consumed tokens: 29706158080 | elapsed time per iteration (s): 0.16 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.744370E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.220 | TFLOPs: 25.63 | +7: iteration 56670/ 173500 | consumed samples: 14507520 | consumed tokens: 29711400960 | elapsed time per iteration (s): 0.16 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.736522E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.594 | TFLOPs: 25.87 | +7: iteration 56680/ 173500 | consumed samples: 14510080 | consumed tokens: 29716643840 | elapsed time per iteration (s): 0.16 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.745538E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.636 | TFLOPs: 25.48 | +7: iteration 56690/ 173500 | consumed samples: 14512640 | consumed tokens: 29721886720 | elapsed time per iteration (s): 0.16 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.739911E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.932 | TFLOPs: 25.73 | +7: iteration 56700/ 173500 | consumed samples: 14515200 | consumed tokens: 29727129600 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.751757E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.087 | TFLOPs: 26.08 | +7: iteration 56710/ 173500 | consumed samples: 14517760 | consumed tokens: 29732372480 | elapsed time per iteration (s): 0.19 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.742467E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1338.316 | TFLOPs: 20.99 | +7: iteration 56720/ 173500 | consumed samples: 14520320 | consumed tokens: 29737615360 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.742817E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.712 | TFLOPs: 26.30 | +7: iteration 56730/ 173500 | consumed samples: 14522880 | consumed tokens: 29742858240 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.745754E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.739 | TFLOPs: 26.26 | +7: iteration 56740/ 173500 | consumed samples: 14525440 | consumed tokens: 29748101120 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.744205E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.324 | TFLOPs: 26.27 | +7: iteration 56750/ 173500 | consumed samples: 14528000 | consumed tokens: 29753344000 | elapsed time per iteration (s): 0.15 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.742097E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.011 | TFLOPs: 26.25 | +7: iteration 56760/ 173500 | consumed samples: 14530560 | consumed tokens: 29758586880 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.726616E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.710 | TFLOPs: 26.28 | +7: iteration 56770/ 173500 | consumed samples: 14533120 | consumed tokens: 29763829760 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.735772E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.706 | TFLOPs: 26.15 | +7: iteration 56780/ 173500 | consumed samples: 14535680 | consumed tokens: 29769072640 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.738708E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.273 | TFLOPs: 26.24 | +7: iteration 56790/ 173500 | consumed samples: 14538240 | consumed tokens: 29774315520 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.739415E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.037 | TFLOPs: 26.22 | +7: iteration 56800/ 173500 | consumed samples: 14540800 | consumed tokens: 29779558400 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.741196E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.756 | TFLOPs: 26.19 | +7: iteration 56810/ 173500 | consumed samples: 14543360 | consumed tokens: 29784801280 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.749169E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.250 | TFLOPs: 26.24 | +7: iteration 56820/ 173500 | consumed samples: 14545920 | consumed tokens: 29790044160 | elapsed time per iteration (s): 0.15 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.758308E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.154 | TFLOPs: 26.25 | +7: iteration 56830/ 173500 | consumed samples: 14548480 | consumed tokens: 29795287040 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.750039E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.213 | TFLOPs: 26.10 | +7: iteration 56840/ 173500 | consumed samples: 14551040 | consumed tokens: 29800529920 | elapsed time per iteration (s): 0.17 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.749194E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.544 | TFLOPs: 23.67 | +7: iteration 56850/ 173500 | consumed samples: 14553600 | consumed tokens: 29805772800 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.740666E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.152 | TFLOPs: 26.13 | +7: iteration 56860/ 173500 | consumed samples: 14556160 | consumed tokens: 29811015680 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.740361E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.513 | TFLOPs: 26.18 | +7: iteration 56870/ 173500 | consumed samples: 14558720 | consumed tokens: 29816258560 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.740899E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.415 | TFLOPs: 26.10 | +7: iteration 56880/ 173500 | consumed samples: 14561280 | consumed tokens: 29821501440 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.744520E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.954 | TFLOPs: 26.19 | +7: iteration 56890/ 173500 | consumed samples: 14563840 | consumed tokens: 29826744320 | elapsed time per iteration (s): 0.15 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.734546E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.864 | TFLOPs: 26.22 | +7: iteration 56900/ 173500 | consumed samples: 14566400 | consumed tokens: 29831987200 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.747184E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.956 | TFLOPs: 26.02 | +7: iteration 56910/ 173500 | consumed samples: 14568960 | consumed tokens: 29837230080 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.746365E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.425 | TFLOPs: 26.10 | +7: iteration 56920/ 173500 | consumed samples: 14571520 | consumed tokens: 29842472960 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.740494E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.126 | TFLOPs: 26.25 | +7: iteration 56930/ 173500 | consumed samples: 14574080 | consumed tokens: 29847715840 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.741594E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.715 | TFLOPs: 26.25 | +7: iteration 56940/ 173500 | consumed samples: 14576640 | consumed tokens: 29852958720 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.754421E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.601 | TFLOPs: 26.26 | +7: iteration 56950/ 173500 | consumed samples: 14579200 | consumed tokens: 29858201600 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.745711E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.336 | TFLOPs: 26.27 | +7: iteration 56960/ 173500 | consumed samples: 14581760 | consumed tokens: 29863444480 | elapsed time per iteration (s): 0.15 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.741126E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.010 | TFLOPs: 26.30 | +7: iteration 56970/ 173500 | consumed samples: 14584320 | consumed tokens: 29868687360 | elapsed time per iteration (s): 0.18 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.745065E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.163 | TFLOPs: 21.72 | +7: iteration 56980/ 173500 | consumed samples: 14586880 | consumed tokens: 29873930240 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.738263E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.760 | TFLOPs: 26.33 | +7: iteration 56990/ 173500 | consumed samples: 14589440 | consumed tokens: 29879173120 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.746667E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.768 | TFLOPs: 26.30 | +7: iteration 57000/ 173500 | consumed samples: 14592000 | consumed tokens: 29884416000 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.731970E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.535 | TFLOPs: 26.14 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 57000 | lm loss value: 3.917732E+00 | lm loss PPL: 5.028625E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 57000 to checkpoints_44m91b100m +0: [2023-03-17 02:43:17,861] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step57000 is begin to save! +0: [2023-03-17 02:43:17,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:43:17,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:43:17,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:43:17,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:43:17,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:43:17,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:43:17,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:43:17,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:43:17,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:43:17,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:43:17,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:43:17,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:43:17,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:43:17,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:43:17,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:43:17,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:43:17,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:43:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:43:17,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:43:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:43:17,992] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step57000/mp_rank_00_model_states.pt +0: [2023-03-17 02:43:17,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:43:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:43:18,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +5: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +2: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +7: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +6: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:43:18,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: [2023-03-17 02:43:18,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +1: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:43:18,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 02:43:18,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +3: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +4: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:43:18,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step57000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:43:18,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step57000 is ready now! +0: successfully saved checkpoint at iteration 57000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.14 +7: iteration 57010/ 173500 | consumed samples: 14594560 | consumed tokens: 29889658880 | elapsed time per iteration (s): 0.18 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.743328E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1453.720 | TFLOPs: 22.80 | +7: iteration 57020/ 173500 | consumed samples: 14597120 | consumed tokens: 29894901760 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.744332E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.459 | TFLOPs: 26.12 | +7: iteration 57030/ 173500 | consumed samples: 14599680 | consumed tokens: 29900144640 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.755897E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.603 | TFLOPs: 26.12 | +7: iteration 57040/ 173500 | consumed samples: 14602240 | consumed tokens: 29905387520 | elapsed time per iteration (s): 0.15 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.743155E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.367 | TFLOPs: 26.12 | +7: iteration 57050/ 173500 | consumed samples: 14604800 | consumed tokens: 29910630400 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.738590E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.737 | TFLOPs: 26.09 | +7: iteration 57060/ 173500 | consumed samples: 14607360 | consumed tokens: 29915873280 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.741084E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.032 | TFLOPs: 26.10 | +7: iteration 57070/ 173500 | consumed samples: 14609920 | consumed tokens: 29921116160 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.738744E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.724 | TFLOPs: 26.12 | +7: iteration 57080/ 173500 | consumed samples: 14612480 | consumed tokens: 29926359040 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.742955E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.572 | TFLOPs: 26.10 | +7: iteration 57090/ 173500 | consumed samples: 14615040 | consumed tokens: 29931601920 | elapsed time per iteration (s): 0.17 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.748027E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.156 | TFLOPs: 23.57 | +7: iteration 57100/ 173500 | consumed samples: 14617600 | consumed tokens: 29936844800 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.737869E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.445 | TFLOPs: 26.12 | +7: iteration 57110/ 173500 | consumed samples: 14620160 | consumed tokens: 29942087680 | elapsed time per iteration (s): 0.15 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.734811E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.898 | TFLOPs: 26.11 | +7: iteration 57120/ 173500 | consumed samples: 14622720 | consumed tokens: 29947330560 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.718456E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.425 | TFLOPs: 26.10 | +7: iteration 57130/ 173500 | consumed samples: 14625280 | consumed tokens: 29952573440 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.751189E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.813 | TFLOPs: 26.11 | +7: iteration 57140/ 173500 | consumed samples: 14627840 | consumed tokens: 29957816320 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.734508E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.920 | TFLOPs: 26.11 | +7: iteration 57150/ 173500 | consumed samples: 14630400 | consumed tokens: 29963059200 | elapsed time per iteration (s): 0.17 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.758736E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.652 | TFLOPs: 23.63 | +7: iteration 57160/ 173500 | consumed samples: 14632960 | consumed tokens: 29968302080 | elapsed time per iteration (s): 0.17 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.748786E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.229 | TFLOPs: 23.50 | +7: iteration 57170/ 173500 | consumed samples: 14635520 | consumed tokens: 29973544960 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.744161E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.263 | TFLOPs: 26.26 | +7: iteration 57180/ 173500 | consumed samples: 14638080 | consumed tokens: 29978787840 | elapsed time per iteration (s): 0.15 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.739463E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.650 | TFLOPs: 26.18 | +7: iteration 57190/ 173500 | consumed samples: 14640640 | consumed tokens: 29984030720 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.729843E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.605 | TFLOPs: 26.20 | +7: iteration 57200/ 173500 | consumed samples: 14643200 | consumed tokens: 29989273600 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.743843E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.889 | TFLOPs: 25.97 | +7: iteration 57210/ 173500 | consumed samples: 14645760 | consumed tokens: 29994516480 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.749976E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.893 | TFLOPs: 26.22 | +7: iteration 57220/ 173500 | consumed samples: 14648320 | consumed tokens: 29999759360 | elapsed time per iteration (s): 0.17 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.744323E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.493 | TFLOPs: 23.69 | +7: iteration 57230/ 173500 | consumed samples: 14650880 | consumed tokens: 30005002240 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.742086E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.184 | TFLOPs: 26.19 | +7: iteration 57240/ 173500 | consumed samples: 14653440 | consumed tokens: 30010245120 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.741638E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.993 | TFLOPs: 26.19 | +7: iteration 57250/ 173500 | consumed samples: 14656000 | consumed tokens: 30015488000 | elapsed time per iteration (s): 0.15 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.742615E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.397 | TFLOPs: 26.18 | +7: iteration 57260/ 173500 | consumed samples: 14658560 | consumed tokens: 30020730880 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.737573E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.151 | TFLOPs: 25.77 | +7: iteration 57270/ 173500 | consumed samples: 14661120 | consumed tokens: 30025973760 | elapsed time per iteration (s): 0.15 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.733472E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.496 | TFLOPs: 26.18 | +7: iteration 57280/ 173500 | consumed samples: 14663680 | consumed tokens: 30031216640 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.753311E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.437 | TFLOPs: 25.76 | +7: iteration 57290/ 173500 | consumed samples: 14666240 | consumed tokens: 30036459520 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.719041E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.944 | TFLOPs: 25.59 | +7: iteration 57300/ 173500 | consumed samples: 14668800 | consumed tokens: 30041702400 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.749532E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.163 | TFLOPs: 25.57 | +7: iteration 57310/ 173500 | consumed samples: 14671360 | consumed tokens: 30046945280 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.747231E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.357 | TFLOPs: 25.76 | +7: iteration 57320/ 173500 | consumed samples: 14673920 | consumed tokens: 30052188160 | elapsed time per iteration (s): 0.16 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.739669E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.825 | TFLOPs: 25.78 | +7: iteration 57330/ 173500 | consumed samples: 14676480 | consumed tokens: 30057431040 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.740691E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.993 | TFLOPs: 25.39 | +7: iteration 57340/ 173500 | consumed samples: 14679040 | consumed tokens: 30062673920 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.743778E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.084 | TFLOPs: 25.74 | +7: iteration 57350/ 173500 | consumed samples: 14681600 | consumed tokens: 30067916800 | elapsed time per iteration (s): 0.15 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.745906E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.986 | TFLOPs: 25.94 | +7: iteration 57360/ 173500 | consumed samples: 14684160 | consumed tokens: 30073159680 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.741743E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.902 | TFLOPs: 25.17 | +7: iteration 57370/ 173500 | consumed samples: 14686720 | consumed tokens: 30078402560 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.736855E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.807 | TFLOPs: 25.86 | +7: iteration 57380/ 173500 | consumed samples: 14689280 | consumed tokens: 30083645440 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.748840E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.930 | TFLOPs: 25.36 | +7: iteration 57390/ 173500 | consumed samples: 14691840 | consumed tokens: 30088888320 | elapsed time per iteration (s): 0.16 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.732991E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.602 | TFLOPs: 25.79 | +7: iteration 57400/ 173500 | consumed samples: 14694400 | consumed tokens: 30094131200 | elapsed time per iteration (s): 0.15 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.752901E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.852 | TFLOPs: 25.94 | +7: iteration 57410/ 173500 | consumed samples: 14696960 | consumed tokens: 30099374080 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.744611E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.862 | TFLOPs: 25.11 | +7: iteration 57420/ 173500 | consumed samples: 14699520 | consumed tokens: 30104616960 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.754015E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.130 | TFLOPs: 25.13 | +7: iteration 57430/ 173500 | consumed samples: 14702080 | consumed tokens: 30109859840 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.729308E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.683 | TFLOPs: 25.20 | +7: iteration 57440/ 173500 | consumed samples: 14704640 | consumed tokens: 30115102720 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.740864E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.921 | TFLOPs: 25.75 | +7: iteration 57450/ 173500 | consumed samples: 14707200 | consumed tokens: 30120345600 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.726410E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.862 | TFLOPs: 25.87 | +7: iteration 57460/ 173500 | consumed samples: 14709760 | consumed tokens: 30125588480 | elapsed time per iteration (s): 0.16 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.741833E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.222 | TFLOPs: 25.69 | +7: iteration 57470/ 173500 | consumed samples: 14712320 | consumed tokens: 30130831360 | elapsed time per iteration (s): 0.16 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.743842E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.840 | TFLOPs: 25.28 | +7: iteration 57480/ 173500 | consumed samples: 14714880 | consumed tokens: 30136074240 | elapsed time per iteration (s): 0.17 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.748393E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.335 | TFLOPs: 23.64 | +7: iteration 57490/ 173500 | consumed samples: 14717440 | consumed tokens: 30141317120 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.739336E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.043 | TFLOPs: 26.03 | +7: iteration 57500/ 173500 | consumed samples: 14720000 | consumed tokens: 30146560000 | elapsed time per iteration (s): 0.16 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.743138E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.016 | TFLOPs: 25.50 | +7: iteration 57510/ 173500 | consumed samples: 14722560 | consumed tokens: 30151802880 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.736708E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.010 | TFLOPs: 26.06 | +7: iteration 57520/ 173500 | consumed samples: 14725120 | consumed tokens: 30157045760 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.731538E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.785 | TFLOPs: 25.94 | +7: iteration 57530/ 173500 | consumed samples: 14727680 | consumed tokens: 30162288640 | elapsed time per iteration (s): 0.15 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.747261E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.611 | TFLOPs: 26.36 | +7: iteration 57540/ 173500 | consumed samples: 14730240 | consumed tokens: 30167531520 | elapsed time per iteration (s): 0.16 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.742616E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.764 | TFLOPs: 24.98 | +7: iteration 57550/ 173500 | consumed samples: 14732800 | consumed tokens: 30172774400 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.730774E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.020 | TFLOPs: 26.36 | +7: iteration 57560/ 173500 | consumed samples: 14735360 | consumed tokens: 30178017280 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.741789E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.503 | TFLOPs: 26.34 | +7: iteration 57570/ 173500 | consumed samples: 14737920 | consumed tokens: 30183260160 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.738184E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.260 | TFLOPs: 26.18 | +7: iteration 57580/ 173500 | consumed samples: 14740480 | consumed tokens: 30188503040 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.738031E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.945 | TFLOPs: 26.13 | +7: iteration 57590/ 173500 | consumed samples: 14743040 | consumed tokens: 30193745920 | elapsed time per iteration (s): 0.16 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.734690E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.927 | TFLOPs: 25.78 | +7: iteration 57600/ 173500 | consumed samples: 14745600 | consumed tokens: 30198988800 | elapsed time per iteration (s): 0.17 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.735748E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.501 | TFLOPs: 23.16 | +7: iteration 57610/ 173500 | consumed samples: 14748160 | consumed tokens: 30204231680 | elapsed time per iteration (s): 0.15 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.741160E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.937 | TFLOPs: 26.13 | +7: iteration 57620/ 173500 | consumed samples: 14750720 | consumed tokens: 30209474560 | elapsed time per iteration (s): 0.16 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.731240E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.801 | TFLOPs: 25.43 | +7: iteration 57630/ 173500 | consumed samples: 14753280 | consumed tokens: 30214717440 | elapsed time per iteration (s): 0.16 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.724805E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.435 | TFLOPs: 25.66 | +7: iteration 57640/ 173500 | consumed samples: 14755840 | consumed tokens: 30219960320 | elapsed time per iteration (s): 0.15 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.744222E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.580 | TFLOPs: 26.14 | +7: iteration 57650/ 173500 | consumed samples: 14758400 | consumed tokens: 30225203200 | elapsed time per iteration (s): 0.15 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.744512E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.127 | TFLOPs: 26.14 | +7: iteration 57660/ 173500 | consumed samples: 14760960 | consumed tokens: 30230446080 | elapsed time per iteration (s): 0.16 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.732364E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.881 | TFLOPs: 25.78 | +7: iteration 57670/ 173500 | consumed samples: 14763520 | consumed tokens: 30235688960 | elapsed time per iteration (s): 0.16 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.736878E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.651 | TFLOPs: 25.70 | +7: iteration 57680/ 173500 | consumed samples: 14766080 | consumed tokens: 30240931840 | elapsed time per iteration (s): 0.15 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.736103E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.025 | TFLOPs: 26.13 | +7: iteration 57690/ 173500 | consumed samples: 14768640 | consumed tokens: 30246174720 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.741058E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.498 | TFLOPs: 26.15 | +7: iteration 57700/ 173500 | consumed samples: 14771200 | consumed tokens: 30251417600 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.740316E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.386 | TFLOPs: 26.13 | +7: iteration 57710/ 173500 | consumed samples: 14773760 | consumed tokens: 30256660480 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.741729E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.487 | TFLOPs: 26.15 | +7: iteration 57720/ 173500 | consumed samples: 14776320 | consumed tokens: 30261903360 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.733325E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.881 | TFLOPs: 26.14 | +7: iteration 57730/ 173500 | consumed samples: 14778880 | consumed tokens: 30267146240 | elapsed time per iteration (s): 0.17 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.752283E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.694 | TFLOPs: 23.58 | +7: iteration 57740/ 173500 | consumed samples: 14781440 | consumed tokens: 30272389120 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.728486E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.328 | TFLOPs: 26.15 | +7: iteration 57750/ 173500 | consumed samples: 14784000 | consumed tokens: 30277632000 | elapsed time per iteration (s): 0.15 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.739087E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.869 | TFLOPs: 26.16 | +7: iteration 57760/ 173500 | consumed samples: 14786560 | consumed tokens: 30282874880 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.733562E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.189 | TFLOPs: 26.02 | +7: iteration 57770/ 173500 | consumed samples: 14789120 | consumed tokens: 30288117760 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.736006E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.868 | TFLOPs: 26.14 | +7: iteration 57780/ 173500 | consumed samples: 14791680 | consumed tokens: 30293360640 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.728941E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.838 | TFLOPs: 26.14 | +7: iteration 57790/ 173500 | consumed samples: 14794240 | consumed tokens: 30298603520 | elapsed time per iteration (s): 0.16 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.732222E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.746 | TFLOPs: 25.86 | +7: iteration 57800/ 173500 | consumed samples: 14796800 | consumed tokens: 30303846400 | elapsed time per iteration (s): 0.16 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.741060E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.991 | TFLOPs: 25.80 | +7: iteration 57810/ 173500 | consumed samples: 14799360 | consumed tokens: 30309089280 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.747824E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.816 | TFLOPs: 26.14 | +7: iteration 57820/ 173500 | consumed samples: 14801920 | consumed tokens: 30314332160 | elapsed time per iteration (s): 0.15 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.726170E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.139 | TFLOPs: 26.14 | +7: iteration 57830/ 173500 | consumed samples: 14804480 | consumed tokens: 30319575040 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.741277E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.107 | TFLOPs: 26.13 | +7: iteration 57840/ 173500 | consumed samples: 14807040 | consumed tokens: 30324817920 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.756682E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.500 | TFLOPs: 26.17 | +7: iteration 57850/ 173500 | consumed samples: 14809600 | consumed tokens: 30330060800 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.752284E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.663 | TFLOPs: 26.15 | +7: iteration 57860/ 173500 | consumed samples: 14812160 | consumed tokens: 30335303680 | elapsed time per iteration (s): 0.17 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.742123E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1500.254 | TFLOPs: 23.53 | +7: iteration 57870/ 173500 | consumed samples: 14814720 | consumed tokens: 30340546560 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.756178E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.980 | TFLOPs: 25.91 | +7: iteration 57880/ 173500 | consumed samples: 14817280 | consumed tokens: 30345789440 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.744194E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.961 | TFLOPs: 26.14 | +7: iteration 57890/ 173500 | consumed samples: 14819840 | consumed tokens: 30351032320 | elapsed time per iteration (s): 0.15 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.734789E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.583 | TFLOPs: 26.15 | +7: iteration 57900/ 173500 | consumed samples: 14822400 | consumed tokens: 30356275200 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.760633E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.395 | TFLOPs: 26.15 | +7: iteration 57910/ 173500 | consumed samples: 14824960 | consumed tokens: 30361518080 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.730435E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.651 | TFLOPs: 26.15 | +7: iteration 57920/ 173500 | consumed samples: 14827520 | consumed tokens: 30366760960 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.755894E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.115 | TFLOPs: 26.13 | +7: iteration 57930/ 173500 | consumed samples: 14830080 | consumed tokens: 30372003840 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.751788E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.903 | TFLOPs: 26.11 | +7: iteration 57940/ 173500 | consumed samples: 14832640 | consumed tokens: 30377246720 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.754721E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.383 | TFLOPs: 26.15 | +7: iteration 57950/ 173500 | consumed samples: 14835200 | consumed tokens: 30382489600 | elapsed time per iteration (s): 0.16 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.727857E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.642 | TFLOPs: 25.54 | +7: iteration 57960/ 173500 | consumed samples: 14837760 | consumed tokens: 30387732480 | elapsed time per iteration (s): 0.15 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.758022E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.828 | TFLOPs: 26.17 | +7: iteration 57970/ 173500 | consumed samples: 14840320 | consumed tokens: 30392975360 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.742716E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.738 | TFLOPs: 26.17 | +7: iteration 57980/ 173500 | consumed samples: 14842880 | consumed tokens: 30398218240 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.753962E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.556 | TFLOPs: 26.17 | +7: iteration 57990/ 173500 | consumed samples: 14845440 | consumed tokens: 30403461120 | elapsed time per iteration (s): 0.17 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.735987E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1470.998 | TFLOPs: 23.07 | +0: [2023-03-17 02:45:54,228] [INFO] [logging.py:68:log_dist] [Rank 0] step=58000, skipped=0, lr=[0.00015640412143068475, 0.00015640412143068475, 0.00015640412143068475], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 58000/ 173500 | consumed samples: 14848000 | consumed tokens: 30408704000 | elapsed time per iteration (s): 0.16 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.740577E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.405 | TFLOPs: 25.71 | +0: steps: 58000 loss: 3.7681 iter time (s): 0.155 samples/sec: 1649.371 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 58000 | lm loss value: 3.843901E+00 | lm loss PPL: 4.670731E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 58000 to checkpoints_44m91b100m +0: [2023-03-17 02:45:54,302] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step58000 is begin to save! +0: [2023-03-17 02:45:54,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:45:54,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:45:54,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:45:54,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:45:54,376] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:45:54,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:45:54,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:45:54,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:45:54,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:45:54,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:45:54,400] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:45:54,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:45:54,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:45:54,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:45:54,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:45:54,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:45:54,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:45:54,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:45:54,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:45:54,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:45:54,434] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step58000/mp_rank_00_model_states.pt +0: [2023-03-17 02:45:54,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:45:54,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:45:54,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:45:54,453] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:45:54,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:45:54,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:45:54,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:45:54,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +2: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +5: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +4: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +1: [2023-03-17 02:45:54,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: [2023-03-17 02:45:54,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +6: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +7: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:45:54,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +3: [2023-03-17 02:45:54,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:45:54,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step58000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:45:54,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step58000 is ready now! +0: successfully saved checkpoint at iteration 58000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.98 +7: iteration 58010/ 173500 | consumed samples: 14850560 | consumed tokens: 30413946880 | elapsed time per iteration (s): 0.18 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.756660E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.725 | TFLOPs: 22.61 | +7: iteration 58020/ 173500 | consumed samples: 14853120 | consumed tokens: 30419189760 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.748413E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.542 | TFLOPs: 26.18 | +7: iteration 58030/ 173500 | consumed samples: 14855680 | consumed tokens: 30424432640 | elapsed time per iteration (s): 0.15 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.741440E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.199 | TFLOPs: 26.18 | +7: iteration 58040/ 173500 | consumed samples: 14858240 | consumed tokens: 30429675520 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.736856E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.858 | TFLOPs: 26.22 | +7: iteration 58050/ 173500 | consumed samples: 14860800 | consumed tokens: 30434918400 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.744743E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.257 | TFLOPs: 26.19 | +7: iteration 58060/ 173500 | consumed samples: 14863360 | consumed tokens: 30440161280 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.735135E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.864 | TFLOPs: 26.19 | +7: iteration 58070/ 173500 | consumed samples: 14865920 | consumed tokens: 30445404160 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.739677E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.356 | TFLOPs: 26.18 | +7: iteration 58080/ 173500 | consumed samples: 14868480 | consumed tokens: 30450647040 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.747893E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.067 | TFLOPs: 26.16 | +7: iteration 58090/ 173500 | consumed samples: 14871040 | consumed tokens: 30455889920 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.734523E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.699 | TFLOPs: 26.17 | +7: iteration 58100/ 173500 | consumed samples: 14873600 | consumed tokens: 30461132800 | elapsed time per iteration (s): 0.15 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.747398E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.136 | TFLOPs: 26.02 | +7: iteration 58110/ 173500 | consumed samples: 14876160 | consumed tokens: 30466375680 | elapsed time per iteration (s): 0.17 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.738648E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.000 | TFLOPs: 23.66 | +7: iteration 58120/ 173500 | consumed samples: 14878720 | consumed tokens: 30471618560 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.740548E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.297 | TFLOPs: 26.18 | +7: iteration 58130/ 173500 | consumed samples: 14881280 | consumed tokens: 30476861440 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.743164E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.853 | TFLOPs: 26.16 | +7: iteration 58140/ 173500 | consumed samples: 14883840 | consumed tokens: 30482104320 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.745892E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.837 | TFLOPs: 26.17 | +7: iteration 58150/ 173500 | consumed samples: 14886400 | consumed tokens: 30487347200 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.748473E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.592 | TFLOPs: 26.17 | +7: iteration 58160/ 173500 | consumed samples: 14888960 | consumed tokens: 30492590080 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.730282E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.592 | TFLOPs: 26.31 | +7: iteration 58170/ 173500 | consumed samples: 14891520 | consumed tokens: 30497832960 | elapsed time per iteration (s): 0.15 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.746732E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.307 | TFLOPs: 26.32 | +7: iteration 58180/ 173500 | consumed samples: 14894080 | consumed tokens: 30503075840 | elapsed time per iteration (s): 0.17 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.716889E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.420 | TFLOPs: 23.67 | +7: iteration 58190/ 173500 | consumed samples: 14896640 | consumed tokens: 30508318720 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.755238E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.385 | TFLOPs: 26.27 | +7: iteration 58200/ 173500 | consumed samples: 14899200 | consumed tokens: 30513561600 | elapsed time per iteration (s): 0.16 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.744094E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.355 | TFLOPs: 25.79 | +7: iteration 58210/ 173500 | consumed samples: 14901760 | consumed tokens: 30518804480 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.734816E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.320 | TFLOPs: 26.16 | +7: iteration 58220/ 173500 | consumed samples: 14904320 | consumed tokens: 30524047360 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.755721E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.901 | TFLOPs: 26.19 | +7: iteration 58230/ 173500 | consumed samples: 14906880 | consumed tokens: 30529290240 | elapsed time per iteration (s): 0.15 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.743748E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.155 | TFLOPs: 26.18 | +7: iteration 58240/ 173500 | consumed samples: 14909440 | consumed tokens: 30534533120 | elapsed time per iteration (s): 0.17 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.730958E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.653 | TFLOPs: 23.64 | +7: iteration 58250/ 173500 | consumed samples: 14912000 | consumed tokens: 30539776000 | elapsed time per iteration (s): 0.16 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.744528E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.180 | TFLOPs: 25.83 | +7: iteration 58260/ 173500 | consumed samples: 14914560 | consumed tokens: 30545018880 | elapsed time per iteration (s): 0.16 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.743767E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.927 | TFLOPs: 24.39 | +7: iteration 58270/ 173500 | consumed samples: 14917120 | consumed tokens: 30550261760 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.748145E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.512 | TFLOPs: 26.20 | +7: iteration 58280/ 173500 | consumed samples: 14919680 | consumed tokens: 30555504640 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.754077E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.423 | TFLOPs: 26.20 | +7: iteration 58290/ 173500 | consumed samples: 14922240 | consumed tokens: 30560747520 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.732225E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.634 | TFLOPs: 26.17 | +7: iteration 58300/ 173500 | consumed samples: 14924800 | consumed tokens: 30565990400 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.736876E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.187 | TFLOPs: 26.18 | +7: iteration 58310/ 173500 | consumed samples: 14927360 | consumed tokens: 30571233280 | elapsed time per iteration (s): 0.16 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.741622E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.872 | TFLOPs: 24.82 | +7: iteration 58320/ 173500 | consumed samples: 14929920 | consumed tokens: 30576476160 | elapsed time per iteration (s): 0.15 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.738548E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.272 | TFLOPs: 26.24 | +7: iteration 58330/ 173500 | consumed samples: 14932480 | consumed tokens: 30581719040 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.756297E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.415 | TFLOPs: 26.24 | +7: iteration 58340/ 173500 | consumed samples: 14935040 | consumed tokens: 30586961920 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.738800E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.466 | TFLOPs: 26.24 | +7: iteration 58350/ 173500 | consumed samples: 14937600 | consumed tokens: 30592204800 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.740876E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.703 | TFLOPs: 26.23 | +7: iteration 58360/ 173500 | consumed samples: 14940160 | consumed tokens: 30597447680 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.738595E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.785 | TFLOPs: 26.23 | +7: iteration 58370/ 173500 | consumed samples: 14942720 | consumed tokens: 30602690560 | elapsed time per iteration (s): 0.17 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.737433E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.111 | TFLOPs: 23.78 | +7: iteration 58380/ 173500 | consumed samples: 14945280 | consumed tokens: 30607933440 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.740882E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.372 | TFLOPs: 26.26 | +7: iteration 58390/ 173500 | consumed samples: 14947840 | consumed tokens: 30613176320 | elapsed time per iteration (s): 0.15 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.732694E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.058 | TFLOPs: 26.36 | +7: iteration 58400/ 173500 | consumed samples: 14950400 | consumed tokens: 30618419200 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.742538E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.078 | TFLOPs: 26.35 | +7: iteration 58410/ 173500 | consumed samples: 14952960 | consumed tokens: 30623662080 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.739704E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.949 | TFLOPs: 26.35 | +7: iteration 58420/ 173500 | consumed samples: 14955520 | consumed tokens: 30628904960 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.747031E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.667 | TFLOPs: 26.31 | +7: iteration 58430/ 173500 | consumed samples: 14958080 | consumed tokens: 30634147840 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.744317E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.948 | TFLOPs: 26.38 | +7: iteration 58440/ 173500 | consumed samples: 14960640 | consumed tokens: 30639390720 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.733524E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.966 | TFLOPs: 26.38 | +7: iteration 58450/ 173500 | consumed samples: 14963200 | consumed tokens: 30644633600 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.737620E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.535 | TFLOPs: 26.35 | +7: iteration 58460/ 173500 | consumed samples: 14965760 | consumed tokens: 30649876480 | elapsed time per iteration (s): 0.15 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.742644E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.147 | TFLOPs: 26.36 | +7: iteration 58470/ 173500 | consumed samples: 14968320 | consumed tokens: 30655119360 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.727628E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.686 | TFLOPs: 26.37 | +7: iteration 58480/ 173500 | consumed samples: 14970880 | consumed tokens: 30660362240 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.745718E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.686 | TFLOPs: 26.39 | +7: iteration 58490/ 173500 | consumed samples: 14973440 | consumed tokens: 30665605120 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.750614E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.805 | TFLOPs: 26.36 | +7: iteration 58500/ 173500 | consumed samples: 14976000 | consumed tokens: 30670848000 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.732480E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.167 | TFLOPs: 26.36 | +7: iteration 58510/ 173500 | consumed samples: 14978560 | consumed tokens: 30676090880 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.759212E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.930 | TFLOPs: 26.35 | +7: iteration 58520/ 173500 | consumed samples: 14981120 | consumed tokens: 30681333760 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.736460E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.639 | TFLOPs: 26.36 | +7: iteration 58530/ 173500 | consumed samples: 14983680 | consumed tokens: 30686576640 | elapsed time per iteration (s): 0.15 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.744071E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.760 | TFLOPs: 26.34 | +7: iteration 58540/ 173500 | consumed samples: 14986240 | consumed tokens: 30691819520 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.739888E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.345 | TFLOPs: 26.38 | +7: iteration 58550/ 173500 | consumed samples: 14988800 | consumed tokens: 30697062400 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.740111E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.005 | TFLOPs: 26.35 | +7: iteration 58560/ 173500 | consumed samples: 14991360 | consumed tokens: 30702305280 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.728212E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.109 | TFLOPs: 26.33 | +7: iteration 58570/ 173500 | consumed samples: 14993920 | consumed tokens: 30707548160 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.733771E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.012 | TFLOPs: 26.33 | +7: iteration 58580/ 173500 | consumed samples: 14996480 | consumed tokens: 30712791040 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.739723E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.140 | TFLOPs: 26.32 | +7: iteration 58590/ 173500 | consumed samples: 14999040 | consumed tokens: 30718033920 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.744793E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.342 | TFLOPs: 26.34 | +7: iteration 58600/ 173500 | consumed samples: 15001600 | consumed tokens: 30723276800 | elapsed time per iteration (s): 0.15 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.726972E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.721 | TFLOPs: 26.33 | +7: iteration 58610/ 173500 | consumed samples: 15004160 | consumed tokens: 30728519680 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.731809E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.078 | TFLOPs: 26.33 | +7: iteration 58620/ 173500 | consumed samples: 15006720 | consumed tokens: 30733762560 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.745755E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.119 | TFLOPs: 26.32 | +7: iteration 58630/ 173500 | consumed samples: 15009280 | consumed tokens: 30739005440 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.728657E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.021 | TFLOPs: 25.97 | +7: iteration 58640/ 173500 | consumed samples: 15011840 | consumed tokens: 30744248320 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.733554E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.724 | TFLOPs: 26.33 | +7: iteration 58650/ 173500 | consumed samples: 15014400 | consumed tokens: 30749491200 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.748009E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.098 | TFLOPs: 26.32 | +7: iteration 58660/ 173500 | consumed samples: 15016960 | consumed tokens: 30754734080 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.748616E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.542 | TFLOPs: 26.34 | +7: iteration 58670/ 173500 | consumed samples: 15019520 | consumed tokens: 30759976960 | elapsed time per iteration (s): 0.15 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.744054E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.927 | TFLOPs: 26.33 | +7: iteration 58680/ 173500 | consumed samples: 15022080 | consumed tokens: 30765219840 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.734259E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.665 | TFLOPs: 26.33 | +7: iteration 58690/ 173500 | consumed samples: 15024640 | consumed tokens: 30770462720 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.732268E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.485 | TFLOPs: 26.35 | +7: iteration 58700/ 173500 | consumed samples: 15027200 | consumed tokens: 30775705600 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.736520E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.086 | TFLOPs: 26.38 | +7: iteration 58710/ 173500 | consumed samples: 15029760 | consumed tokens: 30780948480 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.734343E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.571 | TFLOPs: 26.39 | +7: iteration 58720/ 173500 | consumed samples: 15032320 | consumed tokens: 30786191360 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.736233E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.378 | TFLOPs: 26.37 | +7: iteration 58730/ 173500 | consumed samples: 15034880 | consumed tokens: 30791434240 | elapsed time per iteration (s): 0.16 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.750462E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.836 | TFLOPs: 25.78 | +7: iteration 58740/ 173500 | consumed samples: 15037440 | consumed tokens: 30796677120 | elapsed time per iteration (s): 0.15 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.744141E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.030 | TFLOPs: 25.94 | +7: iteration 58750/ 173500 | consumed samples: 15040000 | consumed tokens: 30801920000 | elapsed time per iteration (s): 0.16 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.740876E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.175 | TFLOPs: 25.55 | +7: iteration 58760/ 173500 | consumed samples: 15042560 | consumed tokens: 30807162880 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.748745E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.204 | TFLOPs: 26.30 | +7: iteration 58770/ 173500 | consumed samples: 15045120 | consumed tokens: 30812405760 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.736130E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.736 | TFLOPs: 26.33 | +7: iteration 58780/ 173500 | consumed samples: 15047680 | consumed tokens: 30817648640 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.730119E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.011 | TFLOPs: 26.14 | +7: iteration 58790/ 173500 | consumed samples: 15050240 | consumed tokens: 30822891520 | elapsed time per iteration (s): 0.16 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.744859E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.578 | TFLOPs: 25.84 | +7: iteration 58800/ 173500 | consumed samples: 15052800 | consumed tokens: 30828134400 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.728794E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.329 | TFLOPs: 26.32 | +7: iteration 58810/ 173500 | consumed samples: 15055360 | consumed tokens: 30833377280 | elapsed time per iteration (s): 0.15 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.738673E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.713 | TFLOPs: 26.28 | +7: iteration 58820/ 173500 | consumed samples: 15057920 | consumed tokens: 30838620160 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.756818E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.269 | TFLOPs: 26.12 | +7: iteration 58830/ 173500 | consumed samples: 15060480 | consumed tokens: 30843863040 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.743623E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.404 | TFLOPs: 26.12 | +7: iteration 58840/ 173500 | consumed samples: 15063040 | consumed tokens: 30849105920 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.748017E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.943 | TFLOPs: 26.11 | +7: iteration 58850/ 173500 | consumed samples: 15065600 | consumed tokens: 30854348800 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.750226E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.568 | TFLOPs: 26.12 | +7: iteration 58860/ 173500 | consumed samples: 15068160 | consumed tokens: 30859591680 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.740039E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.393 | TFLOPs: 26.10 | +7: iteration 58870/ 173500 | consumed samples: 15070720 | consumed tokens: 30864834560 | elapsed time per iteration (s): 0.16 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.732324E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.975 | TFLOPs: 25.89 | +7: iteration 58880/ 173500 | consumed samples: 15073280 | consumed tokens: 30870077440 | elapsed time per iteration (s): 0.15 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.745751E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.516 | TFLOPs: 26.07 | +7: iteration 58890/ 173500 | consumed samples: 15075840 | consumed tokens: 30875320320 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.742617E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.511 | TFLOPs: 26.12 | +7: iteration 58900/ 173500 | consumed samples: 15078400 | consumed tokens: 30880563200 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.752253E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.212 | TFLOPs: 26.10 | +7: iteration 58910/ 173500 | consumed samples: 15080960 | consumed tokens: 30885806080 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.744521E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.745 | TFLOPs: 26.12 | +7: iteration 58920/ 173500 | consumed samples: 15083520 | consumed tokens: 30891048960 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.742732E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.870 | TFLOPs: 26.11 | +7: iteration 58930/ 173500 | consumed samples: 15086080 | consumed tokens: 30896291840 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.733151E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.115 | TFLOPs: 26.14 | +7: iteration 58940/ 173500 | consumed samples: 15088640 | consumed tokens: 30901534720 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.743483E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.751 | TFLOPs: 26.11 | +7: iteration 58950/ 173500 | consumed samples: 15091200 | consumed tokens: 30906777600 | elapsed time per iteration (s): 0.15 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.748967E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.465 | TFLOPs: 26.12 | +7: iteration 58960/ 173500 | consumed samples: 15093760 | consumed tokens: 30912020480 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.738248E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.722 | TFLOPs: 26.12 | +7: iteration 58970/ 173500 | consumed samples: 15096320 | consumed tokens: 30917263360 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.734178E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.705 | TFLOPs: 26.12 | +7: iteration 58980/ 173500 | consumed samples: 15098880 | consumed tokens: 30922506240 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.743056E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.176 | TFLOPs: 26.05 | +7: iteration 58990/ 173500 | consumed samples: 15101440 | consumed tokens: 30927749120 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.733085E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.538 | TFLOPs: 26.06 | +7: iteration 59000/ 173500 | consumed samples: 15104000 | consumed tokens: 30932992000 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.726793E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.626 | TFLOPs: 26.07 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 59000 | lm loss value: 3.849137E+00 | lm loss PPL: 4.695253E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 59000 to checkpoints_44m91b100m +0: [2023-03-17 02:48:28,582] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step59000 is begin to save! +0: [2023-03-17 02:48:28,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:48:28,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:48:28,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:48:28,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:48:28,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:48:28,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:48:28,665] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:48:28,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:48:28,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:48:28,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:48:28,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:48:28,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:48:28,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:48:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:48:28,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:48:28,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:48:28,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:48:28,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:48:28,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:48:28,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:48:28,716] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step59000/mp_rank_00_model_states.pt +0: [2023-03-17 02:48:28,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:48:28,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:48:28,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:48:28,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +3: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +5: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +4: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +6: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +1: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +2: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step59000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +7: [2023-03-17 02:48:28,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step59000 is ready now! +0: successfully saved checkpoint at iteration 59000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.22 +7: iteration 59010/ 173500 | consumed samples: 15106560 | consumed tokens: 30938234880 | elapsed time per iteration (s): 0.18 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.750504E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.495 | TFLOPs: 22.72 | +7: iteration 59020/ 173500 | consumed samples: 15109120 | consumed tokens: 30943477760 | elapsed time per iteration (s): 0.15 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.734762E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.202 | TFLOPs: 26.05 | +7: iteration 59030/ 173500 | consumed samples: 15111680 | consumed tokens: 30948720640 | elapsed time per iteration (s): 0.16 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.726034E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.971 | TFLOPs: 25.67 | +7: iteration 59040/ 173500 | consumed samples: 15114240 | consumed tokens: 30953963520 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.724273E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.126 | TFLOPs: 26.13 | +7: iteration 59050/ 173500 | consumed samples: 15116800 | consumed tokens: 30959206400 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.730485E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.901 | TFLOPs: 26.14 | +7: iteration 59060/ 173500 | consumed samples: 15119360 | consumed tokens: 30964449280 | elapsed time per iteration (s): 0.15 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.735910E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.726 | TFLOPs: 26.12 | +7: iteration 59070/ 173500 | consumed samples: 15121920 | consumed tokens: 30969692160 | elapsed time per iteration (s): 0.16 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.723872E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.733 | TFLOPs: 25.68 | +7: iteration 59080/ 173500 | consumed samples: 15124480 | consumed tokens: 30974935040 | elapsed time per iteration (s): 0.16 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.729100E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.354 | TFLOPs: 25.69 | +7: iteration 59090/ 173500 | consumed samples: 15127040 | consumed tokens: 30980177920 | elapsed time per iteration (s): 0.16 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.742109E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.127 | TFLOPs: 25.61 | +7: iteration 59100/ 173500 | consumed samples: 15129600 | consumed tokens: 30985420800 | elapsed time per iteration (s): 0.16 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.735359E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.865 | TFLOPs: 25.56 | +7: iteration 59110/ 173500 | consumed samples: 15132160 | consumed tokens: 30990663680 | elapsed time per iteration (s): 0.16 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.740065E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.457 | TFLOPs: 25.59 | +7: iteration 59120/ 173500 | consumed samples: 15134720 | consumed tokens: 30995906560 | elapsed time per iteration (s): 0.16 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.737029E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.789 | TFLOPs: 25.28 | +7: iteration 59130/ 173500 | consumed samples: 15137280 | consumed tokens: 31001149440 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.742532E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.750 | TFLOPs: 26.17 | +7: iteration 59140/ 173500 | consumed samples: 15139840 | consumed tokens: 31006392320 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.722042E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.332 | TFLOPs: 26.16 | +7: iteration 59150/ 173500 | consumed samples: 15142400 | consumed tokens: 31011635200 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.732739E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.612 | TFLOPs: 26.17 | +7: iteration 59160/ 173500 | consumed samples: 15144960 | consumed tokens: 31016878080 | elapsed time per iteration (s): 0.15 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.723748E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.417 | TFLOPs: 26.20 | +7: iteration 59170/ 173500 | consumed samples: 15147520 | consumed tokens: 31022120960 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.766618E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.685 | TFLOPs: 26.15 | +7: iteration 59180/ 173500 | consumed samples: 15150080 | consumed tokens: 31027363840 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.740606E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.896 | TFLOPs: 26.17 | +7: iteration 59190/ 173500 | consumed samples: 15152640 | consumed tokens: 31032606720 | elapsed time per iteration (s): 0.16 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.737201E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.876 | TFLOPs: 25.62 | +7: iteration 59200/ 173500 | consumed samples: 15155200 | consumed tokens: 31037849600 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.728426E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.156 | TFLOPs: 25.96 | +7: iteration 59210/ 173500 | consumed samples: 15157760 | consumed tokens: 31043092480 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.747522E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.350 | TFLOPs: 26.23 | +7: iteration 59220/ 173500 | consumed samples: 15160320 | consumed tokens: 31048335360 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.743237E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.449 | TFLOPs: 26.21 | +7: iteration 59230/ 173500 | consumed samples: 15162880 | consumed tokens: 31053578240 | elapsed time per iteration (s): 0.15 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.756509E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.074 | TFLOPs: 26.22 | +7: iteration 59240/ 173500 | consumed samples: 15165440 | consumed tokens: 31058821120 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.749442E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.697 | TFLOPs: 26.17 | +7: iteration 59250/ 173500 | consumed samples: 15168000 | consumed tokens: 31064064000 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.736922E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.114 | TFLOPs: 26.18 | +7: iteration 59260/ 173500 | consumed samples: 15170560 | consumed tokens: 31069306880 | elapsed time per iteration (s): 0.16 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.731603E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.444 | TFLOPs: 25.77 | +7: iteration 59270/ 173500 | consumed samples: 15173120 | consumed tokens: 31074549760 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.754120E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.352 | TFLOPs: 26.18 | +7: iteration 59280/ 173500 | consumed samples: 15175680 | consumed tokens: 31079792640 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.721025E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.969 | TFLOPs: 26.22 | +7: iteration 59290/ 173500 | consumed samples: 15178240 | consumed tokens: 31085035520 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.732703E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.968 | TFLOPs: 26.17 | +7: iteration 59300/ 173500 | consumed samples: 15180800 | consumed tokens: 31090278400 | elapsed time per iteration (s): 0.15 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.743282E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.196 | TFLOPs: 26.22 | +7: iteration 59310/ 173500 | consumed samples: 15183360 | consumed tokens: 31095521280 | elapsed time per iteration (s): 0.16 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.745288E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.479 | TFLOPs: 25.81 | +7: iteration 59320/ 173500 | consumed samples: 15185920 | consumed tokens: 31100764160 | elapsed time per iteration (s): 0.16 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.733216E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.373 | TFLOPs: 25.46 | +7: iteration 59330/ 173500 | consumed samples: 15188480 | consumed tokens: 31106007040 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.747210E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.594 | TFLOPs: 26.01 | +7: iteration 59340/ 173500 | consumed samples: 15191040 | consumed tokens: 31111249920 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.743094E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.377 | TFLOPs: 26.23 | +7: iteration 59350/ 173500 | consumed samples: 15193600 | consumed tokens: 31116492800 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.734996E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.429 | TFLOPs: 26.21 | +7: iteration 59360/ 173500 | consumed samples: 15196160 | consumed tokens: 31121735680 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.730994E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.272 | TFLOPs: 26.24 | +7: iteration 59370/ 173500 | consumed samples: 15198720 | consumed tokens: 31126978560 | elapsed time per iteration (s): 0.15 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.733681E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.541 | TFLOPs: 26.21 | +7: iteration 59380/ 173500 | consumed samples: 15201280 | consumed tokens: 31132221440 | elapsed time per iteration (s): 0.16 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.745855E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.489 | TFLOPs: 25.84 | +7: iteration 59390/ 173500 | consumed samples: 15203840 | consumed tokens: 31137464320 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.746061E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.602 | TFLOPs: 25.92 | +7: iteration 59400/ 173500 | consumed samples: 15206400 | consumed tokens: 31142707200 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.743100E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.569 | TFLOPs: 26.26 | +7: iteration 59410/ 173500 | consumed samples: 15208960 | consumed tokens: 31147950080 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.730531E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.473 | TFLOPs: 26.26 | +7: iteration 59420/ 173500 | consumed samples: 15211520 | consumed tokens: 31153192960 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.736406E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.853 | TFLOPs: 26.23 | +7: iteration 59430/ 173500 | consumed samples: 15214080 | consumed tokens: 31158435840 | elapsed time per iteration (s): 0.16 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.744470E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.264 | TFLOPs: 25.88 | +7: iteration 59440/ 173500 | consumed samples: 15216640 | consumed tokens: 31163678720 | elapsed time per iteration (s): 0.15 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.737063E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.120 | TFLOPs: 26.27 | +7: iteration 59450/ 173500 | consumed samples: 15219200 | consumed tokens: 31168921600 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.746674E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.149 | TFLOPs: 26.25 | +7: iteration 59460/ 173500 | consumed samples: 15221760 | consumed tokens: 31174164480 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.743658E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.802 | TFLOPs: 26.25 | +7: iteration 59470/ 173500 | consumed samples: 15224320 | consumed tokens: 31179407360 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.728200E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.049 | TFLOPs: 26.24 | +7: iteration 59480/ 173500 | consumed samples: 15226880 | consumed tokens: 31184650240 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.724974E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.795 | TFLOPs: 26.23 | +7: iteration 59490/ 173500 | consumed samples: 15229440 | consumed tokens: 31189893120 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.734275E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.315 | TFLOPs: 26.24 | +7: iteration 59500/ 173500 | consumed samples: 15232000 | consumed tokens: 31195136000 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.730830E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.149 | TFLOPs: 26.21 | +7: iteration 59510/ 173500 | consumed samples: 15234560 | consumed tokens: 31200378880 | elapsed time per iteration (s): 0.15 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.737354E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.182 | TFLOPs: 26.24 | +7: iteration 59520/ 173500 | consumed samples: 15237120 | consumed tokens: 31205621760 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.733923E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.193 | TFLOPs: 26.26 | +7: iteration 59530/ 173500 | consumed samples: 15239680 | consumed tokens: 31210864640 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.743987E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.520 | TFLOPs: 26.24 | +7: iteration 59540/ 173500 | consumed samples: 15242240 | consumed tokens: 31216107520 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.733212E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.681 | TFLOPs: 26.23 | +7: iteration 59550/ 173500 | consumed samples: 15244800 | consumed tokens: 31221350400 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.731811E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.423 | TFLOPs: 26.23 | +7: iteration 59560/ 173500 | consumed samples: 15247360 | consumed tokens: 31226593280 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.740699E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.003 | TFLOPs: 26.24 | +7: iteration 59570/ 173500 | consumed samples: 15249920 | consumed tokens: 31231836160 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.738041E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.554 | TFLOPs: 26.23 | +7: iteration 59580/ 173500 | consumed samples: 15252480 | consumed tokens: 31237079040 | elapsed time per iteration (s): 0.15 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.726696E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.857 | TFLOPs: 26.22 | +7: iteration 59590/ 173500 | consumed samples: 15255040 | consumed tokens: 31242321920 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.727964E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.521 | TFLOPs: 26.24 | +7: iteration 59600/ 173500 | consumed samples: 15257600 | consumed tokens: 31247564800 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.741514E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.659 | TFLOPs: 26.23 | +7: iteration 59610/ 173500 | consumed samples: 15260160 | consumed tokens: 31252807680 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.733088E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.788 | TFLOPs: 26.26 | +7: iteration 59620/ 173500 | consumed samples: 15262720 | consumed tokens: 31258050560 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.735870E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.498 | TFLOPs: 26.26 | +7: iteration 59630/ 173500 | consumed samples: 15265280 | consumed tokens: 31263293440 | elapsed time per iteration (s): 0.16 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.740291E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.384 | TFLOPs: 25.63 | +7: iteration 59640/ 173500 | consumed samples: 15267840 | consumed tokens: 31268536320 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.750809E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.544 | TFLOPs: 26.21 | +7: iteration 59650/ 173500 | consumed samples: 15270400 | consumed tokens: 31273779200 | elapsed time per iteration (s): 0.15 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.744828E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.861 | TFLOPs: 26.02 | +7: iteration 59660/ 173500 | consumed samples: 15272960 | consumed tokens: 31279022080 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.728736E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.945 | TFLOPs: 26.25 | +7: iteration 59670/ 173500 | consumed samples: 15275520 | consumed tokens: 31284264960 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.734621E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.625 | TFLOPs: 26.25 | +7: iteration 59680/ 173500 | consumed samples: 15278080 | consumed tokens: 31289507840 | elapsed time per iteration (s): 0.16 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.741096E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.309 | TFLOPs: 25.58 | +7: iteration 59690/ 173500 | consumed samples: 15280640 | consumed tokens: 31294750720 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.737297E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.598 | TFLOPs: 26.26 | +7: iteration 59700/ 173500 | consumed samples: 15283200 | consumed tokens: 31299993600 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.730768E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.176 | TFLOPs: 26.27 | +7: iteration 59710/ 173500 | consumed samples: 15285760 | consumed tokens: 31305236480 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.737130E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.366 | TFLOPs: 26.27 | +7: iteration 59720/ 173500 | consumed samples: 15288320 | consumed tokens: 31310479360 | elapsed time per iteration (s): 0.15 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.752214E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.661 | TFLOPs: 26.28 | +7: iteration 59730/ 173500 | consumed samples: 15290880 | consumed tokens: 31315722240 | elapsed time per iteration (s): 0.16 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.736232E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.945 | TFLOPs: 25.58 | +7: iteration 59740/ 173500 | consumed samples: 15293440 | consumed tokens: 31320965120 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.732050E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.149 | TFLOPs: 26.27 | +7: iteration 59750/ 173500 | consumed samples: 15296000 | consumed tokens: 31326208000 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.731881E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.367 | TFLOPs: 26.27 | +7: iteration 59760/ 173500 | consumed samples: 15298560 | consumed tokens: 31331450880 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.730830E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.296 | TFLOPs: 26.30 | +7: iteration 59770/ 173500 | consumed samples: 15301120 | consumed tokens: 31336693760 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.733574E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.078 | TFLOPs: 26.30 | +7: iteration 59780/ 173500 | consumed samples: 15303680 | consumed tokens: 31341936640 | elapsed time per iteration (s): 0.15 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.748316E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.447 | TFLOPs: 26.31 | +7: iteration 59790/ 173500 | consumed samples: 15306240 | consumed tokens: 31347179520 | elapsed time per iteration (s): 0.16 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.739450E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.595 | TFLOPs: 25.85 | +7: iteration 59800/ 173500 | consumed samples: 15308800 | consumed tokens: 31352422400 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.735865E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.156 | TFLOPs: 26.30 | +7: iteration 59810/ 173500 | consumed samples: 15311360 | consumed tokens: 31357665280 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.738515E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.465 | TFLOPs: 26.34 | +7: iteration 59820/ 173500 | consumed samples: 15313920 | consumed tokens: 31362908160 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.722372E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.869 | TFLOPs: 26.39 | +7: iteration 59830/ 173500 | consumed samples: 15316480 | consumed tokens: 31368151040 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.730032E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.095 | TFLOPs: 26.38 | +7: iteration 59840/ 173500 | consumed samples: 15319040 | consumed tokens: 31373393920 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.736750E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.076 | TFLOPs: 26.38 | +7: iteration 59850/ 173500 | consumed samples: 15321600 | consumed tokens: 31378636800 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.740058E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.730 | TFLOPs: 26.39 | +7: iteration 59860/ 173500 | consumed samples: 15324160 | consumed tokens: 31383879680 | elapsed time per iteration (s): 0.15 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.736196E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.943 | TFLOPs: 26.02 | +7: iteration 59870/ 173500 | consumed samples: 15326720 | consumed tokens: 31389122560 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.730296E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.523 | TFLOPs: 26.39 | +7: iteration 59880/ 173500 | consumed samples: 15329280 | consumed tokens: 31394365440 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.724408E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.065 | TFLOPs: 26.33 | +7: iteration 59890/ 173500 | consumed samples: 15331840 | consumed tokens: 31399608320 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.740523E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.934 | TFLOPs: 26.30 | +7: iteration 59900/ 173500 | consumed samples: 15334400 | consumed tokens: 31404851200 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.742656E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.483 | TFLOPs: 26.20 | +7: iteration 59910/ 173500 | consumed samples: 15336960 | consumed tokens: 31410094080 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.735598E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.524 | TFLOPs: 26.18 | +7: iteration 59920/ 173500 | consumed samples: 15339520 | consumed tokens: 31415336960 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.735427E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.263 | TFLOPs: 26.16 | +7: iteration 59930/ 173500 | consumed samples: 15342080 | consumed tokens: 31420579840 | elapsed time per iteration (s): 0.15 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.747158E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.088 | TFLOPs: 26.16 | +7: iteration 59940/ 173500 | consumed samples: 15344640 | consumed tokens: 31425822720 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.743233E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.759 | TFLOPs: 26.17 | +7: iteration 59950/ 173500 | consumed samples: 15347200 | consumed tokens: 31431065600 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.737893E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.549 | TFLOPs: 26.20 | +7: iteration 59960/ 173500 | consumed samples: 15349760 | consumed tokens: 31436308480 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.740613E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.352 | TFLOPs: 26.18 | +7: iteration 59970/ 173500 | consumed samples: 15352320 | consumed tokens: 31441551360 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.724263E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.774 | TFLOPs: 26.25 | +7: iteration 59980/ 173500 | consumed samples: 15354880 | consumed tokens: 31446794240 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.750640E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.783 | TFLOPs: 26.37 | +7: iteration 59990/ 173500 | consumed samples: 15357440 | consumed tokens: 31452037120 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.743700E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.372 | TFLOPs: 26.38 | +0: [2023-03-17 02:51:02,434] [INFO] [logging.py:68:log_dist] [Rank 0] step=60000, skipped=0, lr=[0.00015355285563304073, 0.00015355285563304073, 0.00015355285563304073], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 60000/ 173500 | consumed samples: 15360000 | consumed tokens: 31457280000 | elapsed time per iteration (s): 0.15 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.730028E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.236 | TFLOPs: 26.37 | +0: steps: 60000 loss: 3.7690 iter time (s): 0.153 samples/sec: 1675.649 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 60000 | lm loss value: 3.873833E+00 | lm loss PPL: 4.812652E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 60000 to checkpoints_44m91b100m +0: [2023-03-17 02:51:02,508] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step60000 is begin to save! +0: [2023-03-17 02:51:02,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:51:02,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:51:02,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:51:02,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:51:02,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:51:02,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:51:02,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:51:02,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:51:02,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:51:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:51:02,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:51:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:51:02,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:51:02,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:51:02,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:51:02,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:51:02,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:51:02,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:51:02,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:51:02,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:51:02,645] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step60000/mp_rank_00_model_states.pt +0: [2023-03-17 02:51:02,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:51:02,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:51:02,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:51:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +3: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +5: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +2: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +6: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +4: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +1: [2023-03-17 02:51:02,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:51:02,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:51:02,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +7: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:51:02,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step60000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:51:02,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step60000 is ready now! +0: successfully saved checkpoint at iteration 60000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.26 +7: iteration 60010/ 173500 | consumed samples: 15362560 | consumed tokens: 31462522880 | elapsed time per iteration (s): 0.18 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.740187E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.901 | TFLOPs: 22.64 | +7: iteration 60020/ 173500 | consumed samples: 15365120 | consumed tokens: 31467765760 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.746294E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.120 | TFLOPs: 26.36 | +7: iteration 60030/ 173500 | consumed samples: 15367680 | consumed tokens: 31473008640 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.736745E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.452 | TFLOPs: 26.39 | +7: iteration 60040/ 173500 | consumed samples: 15370240 | consumed tokens: 31478251520 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.747928E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.707 | TFLOPs: 26.26 | +7: iteration 60050/ 173500 | consumed samples: 15372800 | consumed tokens: 31483494400 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.733759E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.859 | TFLOPs: 26.23 | +7: iteration 60060/ 173500 | consumed samples: 15375360 | consumed tokens: 31488737280 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.741664E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.329 | TFLOPs: 26.24 | +7: iteration 60070/ 173500 | consumed samples: 15377920 | consumed tokens: 31493980160 | elapsed time per iteration (s): 0.15 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.745222E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.046 | TFLOPs: 26.24 | +7: iteration 60080/ 173500 | consumed samples: 15380480 | consumed tokens: 31499223040 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.742461E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.653 | TFLOPs: 26.22 | +7: iteration 60090/ 173500 | consumed samples: 15383040 | consumed tokens: 31504465920 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.739772E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.727 | TFLOPs: 26.08 | +7: iteration 60100/ 173500 | consumed samples: 15385600 | consumed tokens: 31509708800 | elapsed time per iteration (s): 0.16 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.733442E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.980 | TFLOPs: 25.83 | +7: iteration 60110/ 173500 | consumed samples: 15388160 | consumed tokens: 31514951680 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.741120E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.848 | TFLOPs: 26.09 | +7: iteration 60120/ 173500 | consumed samples: 15390720 | consumed tokens: 31520194560 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.752998E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.368 | TFLOPs: 26.09 | +7: iteration 60130/ 173500 | consumed samples: 15393280 | consumed tokens: 31525437440 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.734942E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.107 | TFLOPs: 26.08 | +7: iteration 60140/ 173500 | consumed samples: 15395840 | consumed tokens: 31530680320 | elapsed time per iteration (s): 0.15 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.730334E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.797 | TFLOPs: 26.09 | +7: iteration 60150/ 173500 | consumed samples: 15398400 | consumed tokens: 31535923200 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.740294E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.782 | TFLOPs: 26.14 | +7: iteration 60160/ 173500 | consumed samples: 15400960 | consumed tokens: 31541166080 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.741777E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.683 | TFLOPs: 26.14 | +7: iteration 60170/ 173500 | consumed samples: 15403520 | consumed tokens: 31546408960 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.737680E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.255 | TFLOPs: 26.10 | +7: iteration 60180/ 173500 | consumed samples: 15406080 | consumed tokens: 31551651840 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.739143E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.715 | TFLOPs: 26.12 | +7: iteration 60190/ 173500 | consumed samples: 15408640 | consumed tokens: 31556894720 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.743224E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.560 | TFLOPs: 26.14 | +7: iteration 60200/ 173500 | consumed samples: 15411200 | consumed tokens: 31562137600 | elapsed time per iteration (s): 0.15 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.748081E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.706 | TFLOPs: 26.11 | +7: iteration 60210/ 173500 | consumed samples: 15413760 | consumed tokens: 31567380480 | elapsed time per iteration (s): 0.16 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.733337E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.261 | TFLOPs: 25.85 | +7: iteration 60220/ 173500 | consumed samples: 15416320 | consumed tokens: 31572623360 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.735223E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.189 | TFLOPs: 26.13 | +7: iteration 60230/ 173500 | consumed samples: 15418880 | consumed tokens: 31577866240 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.737683E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.221 | TFLOPs: 26.33 | +7: iteration 60240/ 173500 | consumed samples: 15421440 | consumed tokens: 31583109120 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.745882E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.971 | TFLOPs: 26.27 | +7: iteration 60250/ 173500 | consumed samples: 15424000 | consumed tokens: 31588352000 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.737273E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.375 | TFLOPs: 26.34 | +7: iteration 60260/ 173500 | consumed samples: 15426560 | consumed tokens: 31593594880 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.738110E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.846 | TFLOPs: 26.12 | +7: iteration 60270/ 173500 | consumed samples: 15429120 | consumed tokens: 31598837760 | elapsed time per iteration (s): 0.15 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.724662E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.743 | TFLOPs: 26.04 | +7: iteration 60280/ 173500 | consumed samples: 15431680 | consumed tokens: 31604080640 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.738960E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.127 | TFLOPs: 26.35 | +7: iteration 60290/ 173500 | consumed samples: 15434240 | consumed tokens: 31609323520 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.734547E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.692 | TFLOPs: 26.36 | +7: iteration 60300/ 173500 | consumed samples: 15436800 | consumed tokens: 31614566400 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.738469E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.670 | TFLOPs: 26.37 | +7: iteration 60310/ 173500 | consumed samples: 15439360 | consumed tokens: 31619809280 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.749726E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.445 | TFLOPs: 26.37 | +7: iteration 60320/ 173500 | consumed samples: 15441920 | consumed tokens: 31625052160 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.745119E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.447 | TFLOPs: 26.35 | +7: iteration 60330/ 173500 | consumed samples: 15444480 | consumed tokens: 31630295040 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.734422E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.044 | TFLOPs: 26.35 | +7: iteration 60340/ 173500 | consumed samples: 15447040 | consumed tokens: 31635537920 | elapsed time per iteration (s): 0.15 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.737363E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.593 | TFLOPs: 26.32 | +7: iteration 60350/ 173500 | consumed samples: 15449600 | consumed tokens: 31640780800 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.742427E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.083 | TFLOPs: 26.25 | +7: iteration 60360/ 173500 | consumed samples: 15452160 | consumed tokens: 31646023680 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.750653E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.186 | TFLOPs: 26.22 | +7: iteration 60370/ 173500 | consumed samples: 15454720 | consumed tokens: 31651266560 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.746684E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.808 | TFLOPs: 26.22 | +7: iteration 60380/ 173500 | consumed samples: 15457280 | consumed tokens: 31656509440 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.749225E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.380 | TFLOPs: 26.20 | +7: iteration 60390/ 173500 | consumed samples: 15459840 | consumed tokens: 31661752320 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.737038E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.826 | TFLOPs: 26.22 | +7: iteration 60400/ 173500 | consumed samples: 15462400 | consumed tokens: 31666995200 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.734032E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.156 | TFLOPs: 26.24 | +7: iteration 60410/ 173500 | consumed samples: 15464960 | consumed tokens: 31672238080 | elapsed time per iteration (s): 0.15 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.737999E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.822 | TFLOPs: 26.22 | +7: iteration 60420/ 173500 | consumed samples: 15467520 | consumed tokens: 31677480960 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.737685E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.098 | TFLOPs: 26.30 | +7: iteration 60430/ 173500 | consumed samples: 15470080 | consumed tokens: 31682723840 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.751667E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.234 | TFLOPs: 26.37 | +7: iteration 60440/ 173500 | consumed samples: 15472640 | consumed tokens: 31687966720 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.728915E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.404 | TFLOPs: 26.37 | +7: iteration 60450/ 173500 | consumed samples: 15475200 | consumed tokens: 31693209600 | elapsed time per iteration (s): 0.16 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.748152E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.347 | TFLOPs: 25.16 | +7: iteration 60460/ 173500 | consumed samples: 15477760 | consumed tokens: 31698452480 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.746778E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.015 | TFLOPs: 26.33 | +7: iteration 60470/ 173500 | consumed samples: 15480320 | consumed tokens: 31703695360 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.736798E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.654 | TFLOPs: 26.37 | +7: iteration 60480/ 173500 | consumed samples: 15482880 | consumed tokens: 31708938240 | elapsed time per iteration (s): 0.15 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.732130E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.538 | TFLOPs: 26.37 | +7: iteration 60490/ 173500 | consumed samples: 15485440 | consumed tokens: 31714181120 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.733829E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.107 | TFLOPs: 26.36 | +7: iteration 60500/ 173500 | consumed samples: 15488000 | consumed tokens: 31719424000 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.739770E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.648 | TFLOPs: 26.37 | +7: iteration 60510/ 173500 | consumed samples: 15490560 | consumed tokens: 31724666880 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.742493E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.494 | TFLOPs: 26.34 | +7: iteration 60520/ 173500 | consumed samples: 15493120 | consumed tokens: 31729909760 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.721600E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.665 | TFLOPs: 26.37 | +7: iteration 60530/ 173500 | consumed samples: 15495680 | consumed tokens: 31735152640 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.737158E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.615 | TFLOPs: 26.28 | +7: iteration 60540/ 173500 | consumed samples: 15498240 | consumed tokens: 31740395520 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.735718E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.129 | TFLOPs: 26.27 | +7: iteration 60550/ 173500 | consumed samples: 15500800 | consumed tokens: 31745638400 | elapsed time per iteration (s): 0.15 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.751517E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.646 | TFLOPs: 26.28 | +7: iteration 60560/ 173500 | consumed samples: 15503360 | consumed tokens: 31750881280 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.734808E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.932 | TFLOPs: 26.27 | +7: iteration 60570/ 173500 | consumed samples: 15505920 | consumed tokens: 31756124160 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.733523E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.319 | TFLOPs: 26.35 | +7: iteration 60580/ 173500 | consumed samples: 15508480 | consumed tokens: 31761367040 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.732703E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.879 | TFLOPs: 26.36 | +7: iteration 60590/ 173500 | consumed samples: 15511040 | consumed tokens: 31766609920 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.727394E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.169 | TFLOPs: 26.36 | +7: iteration 60600/ 173500 | consumed samples: 15513600 | consumed tokens: 31771852800 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.729929E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.849 | TFLOPs: 26.34 | +7: iteration 60610/ 173500 | consumed samples: 15516160 | consumed tokens: 31777095680 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.750335E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.027 | TFLOPs: 26.36 | +7: iteration 60620/ 173500 | consumed samples: 15518720 | consumed tokens: 31782338560 | elapsed time per iteration (s): 0.15 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.736595E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.975 | TFLOPs: 26.36 | +7: iteration 60630/ 173500 | consumed samples: 15521280 | consumed tokens: 31787581440 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.743131E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.728 | TFLOPs: 26.31 | +7: iteration 60640/ 173500 | consumed samples: 15523840 | consumed tokens: 31792824320 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.726400E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.503 | TFLOPs: 26.21 | +7: iteration 60650/ 173500 | consumed samples: 15526400 | consumed tokens: 31798067200 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.743491E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.901 | TFLOPs: 26.24 | +7: iteration 60660/ 173500 | consumed samples: 15528960 | consumed tokens: 31803310080 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.747237E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.827 | TFLOPs: 26.23 | +7: iteration 60670/ 173500 | consumed samples: 15531520 | consumed tokens: 31808552960 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.739788E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.812 | TFLOPs: 26.27 | +7: iteration 60680/ 173500 | consumed samples: 15534080 | consumed tokens: 31813795840 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.743958E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.994 | TFLOPs: 25.91 | +7: iteration 60690/ 173500 | consumed samples: 15536640 | consumed tokens: 31819038720 | elapsed time per iteration (s): 0.15 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.732103E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.295 | TFLOPs: 26.24 | +7: iteration 60700/ 173500 | consumed samples: 15539200 | consumed tokens: 31824281600 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.742036E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.550 | TFLOPs: 26.25 | +7: iteration 60710/ 173500 | consumed samples: 15541760 | consumed tokens: 31829524480 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.739612E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.435 | TFLOPs: 26.26 | +7: iteration 60720/ 173500 | consumed samples: 15544320 | consumed tokens: 31834767360 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.735188E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.035 | TFLOPs: 26.25 | +7: iteration 60730/ 173500 | consumed samples: 15546880 | consumed tokens: 31840010240 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.733709E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.230 | TFLOPs: 26.26 | +7: iteration 60740/ 173500 | consumed samples: 15549440 | consumed tokens: 31845253120 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.736690E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.833 | TFLOPs: 26.23 | +7: iteration 60750/ 173500 | consumed samples: 15552000 | consumed tokens: 31850496000 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.723178E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.615 | TFLOPs: 26.25 | +7: iteration 60760/ 173500 | consumed samples: 15554560 | consumed tokens: 31855738880 | elapsed time per iteration (s): 0.15 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.738626E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.729 | TFLOPs: 26.31 | +7: iteration 60770/ 173500 | consumed samples: 15557120 | consumed tokens: 31860981760 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.728372E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.112 | TFLOPs: 26.22 | +7: iteration 60780/ 173500 | consumed samples: 15559680 | consumed tokens: 31866224640 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.743001E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.558 | TFLOPs: 26.21 | +7: iteration 60790/ 173500 | consumed samples: 15562240 | consumed tokens: 31871467520 | elapsed time per iteration (s): 0.16 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.733105E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.956 | TFLOPs: 25.88 | +7: iteration 60800/ 173500 | consumed samples: 15564800 | consumed tokens: 31876710400 | elapsed time per iteration (s): 0.16 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.745902E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.084 | TFLOPs: 25.67 | +7: iteration 60810/ 173500 | consumed samples: 15567360 | consumed tokens: 31881953280 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.744283E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.434 | TFLOPs: 26.21 | +7: iteration 60820/ 173500 | consumed samples: 15569920 | consumed tokens: 31887196160 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.736974E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.103 | TFLOPs: 26.22 | +7: iteration 60830/ 173500 | consumed samples: 15572480 | consumed tokens: 31892439040 | elapsed time per iteration (s): 0.15 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.740479E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.870 | TFLOPs: 26.23 | +7: iteration 60840/ 173500 | consumed samples: 15575040 | consumed tokens: 31897681920 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.739582E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.370 | TFLOPs: 26.23 | +7: iteration 60850/ 173500 | consumed samples: 15577600 | consumed tokens: 31902924800 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.744634E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.834 | TFLOPs: 26.20 | +7: iteration 60860/ 173500 | consumed samples: 15580160 | consumed tokens: 31908167680 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.733201E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.203 | TFLOPs: 26.21 | +7: iteration 60870/ 173500 | consumed samples: 15582720 | consumed tokens: 31913410560 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.734771E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.971 | TFLOPs: 26.21 | +7: iteration 60880/ 173500 | consumed samples: 15585280 | consumed tokens: 31918653440 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.739947E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.726 | TFLOPs: 26.12 | +7: iteration 60890/ 173500 | consumed samples: 15587840 | consumed tokens: 31923896320 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.755990E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.907 | TFLOPs: 26.16 | +7: iteration 60900/ 173500 | consumed samples: 15590400 | consumed tokens: 31929139200 | elapsed time per iteration (s): 0.15 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.729454E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.503 | TFLOPs: 26.13 | +7: iteration 60910/ 173500 | consumed samples: 15592960 | consumed tokens: 31934382080 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.727169E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.167 | TFLOPs: 26.15 | +7: iteration 60920/ 173500 | consumed samples: 15595520 | consumed tokens: 31939624960 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.744196E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.591 | TFLOPs: 26.15 | +7: iteration 60930/ 173500 | consumed samples: 15598080 | consumed tokens: 31944867840 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.745480E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.053 | TFLOPs: 26.11 | +7: iteration 60940/ 173500 | consumed samples: 15600640 | consumed tokens: 31950110720 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.761725E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.593 | TFLOPs: 26.17 | +7: iteration 60950/ 173500 | consumed samples: 15603200 | consumed tokens: 31955353600 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.732193E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.390 | TFLOPs: 26.16 | +7: iteration 60960/ 173500 | consumed samples: 15605760 | consumed tokens: 31960596480 | elapsed time per iteration (s): 0.15 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.747373E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.454 | TFLOPs: 26.17 | +7: iteration 60970/ 173500 | consumed samples: 15608320 | consumed tokens: 31965839360 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.730068E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.007 | TFLOPs: 26.17 | +7: iteration 60980/ 173500 | consumed samples: 15610880 | consumed tokens: 31971082240 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.735115E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.009 | TFLOPs: 26.16 | +7: iteration 60990/ 173500 | consumed samples: 15613440 | consumed tokens: 31976325120 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.747012E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.429 | TFLOPs: 26.17 | +7: iteration 61000/ 173500 | consumed samples: 15616000 | consumed tokens: 31981568000 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.735130E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.470 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 61000 | lm loss value: 3.870688E+00 | lm loss PPL: 4.797537E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 61000 to checkpoints_44m91b100m +0: [2023-03-17 02:53:35,919] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step61000 is begin to save! +0: [2023-03-17 02:53:35,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:53:35,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:53:35,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:53:35,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:53:35,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:53:36,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:53:36,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:53:36,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:53:36,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:53:36,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:53:36,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:53:36,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:53:36,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:53:36,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:53:36,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:53:36,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:53:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:53:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:53:36,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:53:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:53:36,055] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step61000/mp_rank_00_model_states.pt +0: [2023-03-17 02:53:36,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:53:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:53:36,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:53:36,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:53:36,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +3: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +5: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 02:53:36,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +7: [2023-03-17 02:53:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +1: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:53:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 02:53:36,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +6: [2023-03-17 02:53:36,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +4: [2023-03-17 02:53:36,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:53:36,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:53:36,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +2: [2023-03-17 02:53:36,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:53:36,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step61000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:53:36,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step61000 is ready now! +0: successfully saved checkpoint at iteration 61000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.60 +7: iteration 61010/ 173500 | consumed samples: 15618560 | consumed tokens: 31986810880 | elapsed time per iteration (s): 0.18 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.743569E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.866 | TFLOPs: 22.08 | +7: iteration 61020/ 173500 | consumed samples: 15621120 | consumed tokens: 31992053760 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.737602E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.728 | TFLOPs: 26.14 | +7: iteration 61030/ 173500 | consumed samples: 15623680 | consumed tokens: 31997296640 | elapsed time per iteration (s): 0.15 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.731848E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.580 | TFLOPs: 26.18 | +7: iteration 61040/ 173500 | consumed samples: 15626240 | consumed tokens: 32002539520 | elapsed time per iteration (s): 0.16 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.730488E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.451 | TFLOPs: 25.68 | +7: iteration 61050/ 173500 | consumed samples: 15628800 | consumed tokens: 32007782400 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.742014E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.738 | TFLOPs: 26.17 | +7: iteration 61060/ 173500 | consumed samples: 15631360 | consumed tokens: 32013025280 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.730186E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.729 | TFLOPs: 26.17 | +7: iteration 61070/ 173500 | consumed samples: 15633920 | consumed tokens: 32018268160 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.732690E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.870 | TFLOPs: 26.16 | +7: iteration 61080/ 173500 | consumed samples: 15636480 | consumed tokens: 32023511040 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.740137E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.162 | TFLOPs: 26.16 | +7: iteration 61090/ 173500 | consumed samples: 15639040 | consumed tokens: 32028753920 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.736191E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.316 | TFLOPs: 26.13 | +7: iteration 61100/ 173500 | consumed samples: 15641600 | consumed tokens: 32033996800 | elapsed time per iteration (s): 0.15 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.741060E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.510 | TFLOPs: 26.12 | +7: iteration 61110/ 173500 | consumed samples: 15644160 | consumed tokens: 32039239680 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.722792E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.961 | TFLOPs: 26.13 | +7: iteration 61120/ 173500 | consumed samples: 15646720 | consumed tokens: 32044482560 | elapsed time per iteration (s): 0.16 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.722722E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.314 | TFLOPs: 25.76 | +7: iteration 61130/ 173500 | consumed samples: 15649280 | consumed tokens: 32049725440 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.742854E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.616 | TFLOPs: 26.17 | +7: iteration 61140/ 173500 | consumed samples: 15651840 | consumed tokens: 32054968320 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.726140E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.548 | TFLOPs: 26.18 | +7: iteration 61150/ 173500 | consumed samples: 15654400 | consumed tokens: 32060211200 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.743768E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.242 | TFLOPs: 26.16 | +7: iteration 61160/ 173500 | consumed samples: 15656960 | consumed tokens: 32065454080 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.732846E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.233 | TFLOPs: 26.16 | +7: iteration 61170/ 173500 | consumed samples: 15659520 | consumed tokens: 32070696960 | elapsed time per iteration (s): 0.15 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.752104E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.099 | TFLOPs: 26.18 | +7: iteration 61180/ 173500 | consumed samples: 15662080 | consumed tokens: 32075939840 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.729142E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.239 | TFLOPs: 26.15 | +7: iteration 61190/ 173500 | consumed samples: 15664640 | consumed tokens: 32081182720 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.729748E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.658 | TFLOPs: 26.20 | +7: iteration 61200/ 173500 | consumed samples: 15667200 | consumed tokens: 32086425600 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.746391E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.741 | TFLOPs: 26.09 | +7: iteration 61210/ 173500 | consumed samples: 15669760 | consumed tokens: 32091668480 | elapsed time per iteration (s): 0.16 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.749431E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.209 | TFLOPs: 25.85 | +7: iteration 61220/ 173500 | consumed samples: 15672320 | consumed tokens: 32096911360 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.736428E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.199 | TFLOPs: 26.19 | +7: iteration 61230/ 173500 | consumed samples: 15674880 | consumed tokens: 32102154240 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.730920E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.754 | TFLOPs: 26.19 | +7: iteration 61240/ 173500 | consumed samples: 15677440 | consumed tokens: 32107397120 | elapsed time per iteration (s): 0.15 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.729498E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.977 | TFLOPs: 26.19 | +7: iteration 61250/ 173500 | consumed samples: 15680000 | consumed tokens: 32112640000 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.723668E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.488 | TFLOPs: 26.13 | +7: iteration 61260/ 173500 | consumed samples: 15682560 | consumed tokens: 32117882880 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.736016E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.035 | TFLOPs: 26.16 | +7: iteration 61270/ 173500 | consumed samples: 15685120 | consumed tokens: 32123125760 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.741798E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.503 | TFLOPs: 26.17 | +7: iteration 61280/ 173500 | consumed samples: 15687680 | consumed tokens: 32128368640 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.724654E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.352 | TFLOPs: 26.13 | +7: iteration 61290/ 173500 | consumed samples: 15690240 | consumed tokens: 32133611520 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.743099E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.105 | TFLOPs: 25.97 | +7: iteration 61300/ 173500 | consumed samples: 15692800 | consumed tokens: 32138854400 | elapsed time per iteration (s): 0.16 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.740995E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.877 | TFLOPs: 25.40 | +7: iteration 61310/ 173500 | consumed samples: 15695360 | consumed tokens: 32144097280 | elapsed time per iteration (s): 0.15 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.736011E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.373 | TFLOPs: 26.18 | +7: iteration 61320/ 173500 | consumed samples: 15697920 | consumed tokens: 32149340160 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.735036E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.044 | TFLOPs: 26.19 | +7: iteration 61330/ 173500 | consumed samples: 15700480 | consumed tokens: 32154583040 | elapsed time per iteration (s): 0.16 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.742830E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.055 | TFLOPs: 25.03 | +7: iteration 61340/ 173500 | consumed samples: 15703040 | consumed tokens: 32159825920 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.742821E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.859 | TFLOPs: 26.20 | +7: iteration 61350/ 173500 | consumed samples: 15705600 | consumed tokens: 32165068800 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.731298E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.720 | TFLOPs: 26.19 | +7: iteration 61360/ 173500 | consumed samples: 15708160 | consumed tokens: 32170311680 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.727961E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.288 | TFLOPs: 26.19 | +7: iteration 61370/ 173500 | consumed samples: 15710720 | consumed tokens: 32175554560 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.733959E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.476 | TFLOPs: 26.20 | +7: iteration 61380/ 173500 | consumed samples: 15713280 | consumed tokens: 32180797440 | elapsed time per iteration (s): 0.15 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.755220E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.421 | TFLOPs: 26.20 | +7: iteration 61390/ 173500 | consumed samples: 15715840 | consumed tokens: 32186040320 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.729700E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.739 | TFLOPs: 26.20 | +7: iteration 61400/ 173500 | consumed samples: 15718400 | consumed tokens: 32191283200 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.723994E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.492 | TFLOPs: 26.18 | +7: iteration 61410/ 173500 | consumed samples: 15720960 | consumed tokens: 32196526080 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.734616E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.885 | TFLOPs: 26.20 | +7: iteration 61420/ 173500 | consumed samples: 15723520 | consumed tokens: 32201768960 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.730391E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.144 | TFLOPs: 26.19 | +7: iteration 61430/ 173500 | consumed samples: 15726080 | consumed tokens: 32207011840 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.737415E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.176 | TFLOPs: 26.19 | +7: iteration 61440/ 173500 | consumed samples: 15728640 | consumed tokens: 32212254720 | elapsed time per iteration (s): 0.15 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.735014E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.985 | TFLOPs: 26.16 | +7: iteration 61450/ 173500 | consumed samples: 15731200 | consumed tokens: 32217497600 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.732981E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.472 | TFLOPs: 26.15 | +7: iteration 61460/ 173500 | consumed samples: 15733760 | consumed tokens: 32222740480 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.735290E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.257 | TFLOPs: 26.15 | +7: iteration 61470/ 173500 | consumed samples: 15736320 | consumed tokens: 32227983360 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.732792E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.367 | TFLOPs: 26.13 | +7: iteration 61480/ 173500 | consumed samples: 15738880 | consumed tokens: 32233226240 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.722819E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.046 | TFLOPs: 26.16 | +7: iteration 61490/ 173500 | consumed samples: 15741440 | consumed tokens: 32238469120 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.737082E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.033 | TFLOPs: 26.14 | +7: iteration 61500/ 173500 | consumed samples: 15744000 | consumed tokens: 32243712000 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.747424E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.021 | TFLOPs: 26.14 | +7: iteration 61510/ 173500 | consumed samples: 15746560 | consumed tokens: 32248954880 | elapsed time per iteration (s): 0.15 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.736733E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.326 | TFLOPs: 26.15 | +7: iteration 61520/ 173500 | consumed samples: 15749120 | consumed tokens: 32254197760 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.737283E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.391 | TFLOPs: 26.16 | +7: iteration 61530/ 173500 | consumed samples: 15751680 | consumed tokens: 32259440640 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.727727E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.564 | TFLOPs: 26.17 | +7: iteration 61540/ 173500 | consumed samples: 15754240 | consumed tokens: 32264683520 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.722278E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.304 | TFLOPs: 26.15 | +7: iteration 61550/ 173500 | consumed samples: 15756800 | consumed tokens: 32269926400 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.743478E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.149 | TFLOPs: 26.15 | +7: iteration 61560/ 173500 | consumed samples: 15759360 | consumed tokens: 32275169280 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.737629E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.871 | TFLOPs: 26.16 | +7: iteration 61570/ 173500 | consumed samples: 15761920 | consumed tokens: 32280412160 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.736629E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.365 | TFLOPs: 26.13 | +7: iteration 61580/ 173500 | consumed samples: 15764480 | consumed tokens: 32285655040 | elapsed time per iteration (s): 0.15 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.728711E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.112 | TFLOPs: 26.14 | +7: iteration 61590/ 173500 | consumed samples: 15767040 | consumed tokens: 32290897920 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.742856E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.234 | TFLOPs: 26.16 | +7: iteration 61600/ 173500 | consumed samples: 15769600 | consumed tokens: 32296140800 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.732581E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.955 | TFLOPs: 26.14 | +7: iteration 61610/ 173500 | consumed samples: 15772160 | consumed tokens: 32301383680 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.744599E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.307 | TFLOPs: 26.16 | +7: iteration 61620/ 173500 | consumed samples: 15774720 | consumed tokens: 32306626560 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.736869E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.467 | TFLOPs: 26.15 | +7: iteration 61630/ 173500 | consumed samples: 15777280 | consumed tokens: 32311869440 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.731959E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.758 | TFLOPs: 26.15 | +7: iteration 61640/ 173500 | consumed samples: 15779840 | consumed tokens: 32317112320 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.750185E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.297 | TFLOPs: 26.16 | +7: iteration 61650/ 173500 | consumed samples: 15782400 | consumed tokens: 32322355200 | elapsed time per iteration (s): 0.15 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.732534E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.211 | TFLOPs: 26.11 | +7: iteration 61660/ 173500 | consumed samples: 15784960 | consumed tokens: 32327598080 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.723845E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.056 | TFLOPs: 26.11 | +7: iteration 61670/ 173500 | consumed samples: 15787520 | consumed tokens: 32332840960 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.727864E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.567 | TFLOPs: 26.10 | +7: iteration 61680/ 173500 | consumed samples: 15790080 | consumed tokens: 32338083840 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.732318E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.512 | TFLOPs: 26.10 | +7: iteration 61690/ 173500 | consumed samples: 15792640 | consumed tokens: 32343326720 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.724423E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.554 | TFLOPs: 26.12 | +7: iteration 61700/ 173500 | consumed samples: 15795200 | consumed tokens: 32348569600 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.731104E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.844 | TFLOPs: 26.14 | +7: iteration 61710/ 173500 | consumed samples: 15797760 | consumed tokens: 32353812480 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.732014E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.687 | TFLOPs: 26.12 | +7: iteration 61720/ 173500 | consumed samples: 15800320 | consumed tokens: 32359055360 | elapsed time per iteration (s): 0.15 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.733970E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.648 | TFLOPs: 26.14 | +7: iteration 61730/ 173500 | consumed samples: 15802880 | consumed tokens: 32364298240 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.741832E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.180 | TFLOPs: 26.15 | +7: iteration 61740/ 173500 | consumed samples: 15805440 | consumed tokens: 32369541120 | elapsed time per iteration (s): 0.16 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.726067E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.447 | TFLOPs: 25.32 | +7: iteration 61750/ 173500 | consumed samples: 15808000 | consumed tokens: 32374784000 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.742035E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.407 | TFLOPs: 26.15 | +7: iteration 61760/ 173500 | consumed samples: 15810560 | consumed tokens: 32380026880 | elapsed time per iteration (s): 0.16 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.745185E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.794 | TFLOPs: 25.20 | +7: iteration 61770/ 173500 | consumed samples: 15813120 | consumed tokens: 32385269760 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.742703E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.518 | TFLOPs: 26.15 | +7: iteration 61780/ 173500 | consumed samples: 15815680 | consumed tokens: 32390512640 | elapsed time per iteration (s): 0.16 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.741349E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.960 | TFLOPs: 25.80 | +7: iteration 61790/ 173500 | consumed samples: 15818240 | consumed tokens: 32395755520 | elapsed time per iteration (s): 0.15 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.727671E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.151 | TFLOPs: 26.13 | +7: iteration 61800/ 173500 | consumed samples: 15820800 | consumed tokens: 32400998400 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.740176E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.606 | TFLOPs: 26.11 | +7: iteration 61810/ 173500 | consumed samples: 15823360 | consumed tokens: 32406241280 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.747253E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.154 | TFLOPs: 26.11 | +7: iteration 61820/ 173500 | consumed samples: 15825920 | consumed tokens: 32411484160 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.722456E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.183 | TFLOPs: 26.08 | +7: iteration 61830/ 173500 | consumed samples: 15828480 | consumed tokens: 32416727040 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.738276E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.544 | TFLOPs: 26.12 | +7: iteration 61840/ 173500 | consumed samples: 15831040 | consumed tokens: 32421969920 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.740220E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.817 | TFLOPs: 26.14 | +7: iteration 61850/ 173500 | consumed samples: 15833600 | consumed tokens: 32427212800 | elapsed time per iteration (s): 0.15 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.731051E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.561 | TFLOPs: 26.14 | +7: iteration 61860/ 173500 | consumed samples: 15836160 | consumed tokens: 32432455680 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.732586E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.007 | TFLOPs: 26.14 | +7: iteration 61870/ 173500 | consumed samples: 15838720 | consumed tokens: 32437698560 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.727511E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.149 | TFLOPs: 26.19 | +7: iteration 61880/ 173500 | consumed samples: 15841280 | consumed tokens: 32442941440 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.739351E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.487 | TFLOPs: 26.24 | +7: iteration 61890/ 173500 | consumed samples: 15843840 | consumed tokens: 32448184320 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.728803E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.095 | TFLOPs: 26.25 | +7: iteration 61900/ 173500 | consumed samples: 15846400 | consumed tokens: 32453427200 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.747775E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.084 | TFLOPs: 26.25 | +7: iteration 61910/ 173500 | consumed samples: 15848960 | consumed tokens: 32458670080 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.733360E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.955 | TFLOPs: 26.22 | +7: iteration 61920/ 173500 | consumed samples: 15851520 | consumed tokens: 32463912960 | elapsed time per iteration (s): 0.15 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.733588E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.082 | TFLOPs: 26.22 | +7: iteration 61930/ 173500 | consumed samples: 15854080 | consumed tokens: 32469155840 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.735362E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.385 | TFLOPs: 26.16 | +7: iteration 61940/ 173500 | consumed samples: 15856640 | consumed tokens: 32474398720 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.726897E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.640 | TFLOPs: 26.18 | +7: iteration 61950/ 173500 | consumed samples: 15859200 | consumed tokens: 32479641600 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.734562E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.063 | TFLOPs: 26.18 | +7: iteration 61960/ 173500 | consumed samples: 15861760 | consumed tokens: 32484884480 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.745023E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.607 | TFLOPs: 26.14 | +7: iteration 61970/ 173500 | consumed samples: 15864320 | consumed tokens: 32490127360 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.722669E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.260 | TFLOPs: 26.13 | +7: iteration 61980/ 173500 | consumed samples: 15866880 | consumed tokens: 32495370240 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.730496E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.155 | TFLOPs: 26.13 | +7: iteration 61990/ 173500 | consumed samples: 15869440 | consumed tokens: 32500613120 | elapsed time per iteration (s): 0.15 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.734791E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.762 | TFLOPs: 26.12 | +0: [2023-03-17 02:56:09,954] [INFO] [logging.py:68:log_dist] [Rank 0] step=62000, skipped=0, lr=[0.00015064331838981058, 0.00015064331838981058, 0.00015064331838981058], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 62000/ 173500 | consumed samples: 15872000 | consumed tokens: 32505856000 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.725933E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.113 | TFLOPs: 26.14 | +0: steps: 62000 loss: 3.7338 iter time (s): 0.153 samples/sec: 1676.994 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 62000 | lm loss value: 3.895160E+00 | lm loss PPL: 4.916394E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 62000 to checkpoints_44m91b100m +0: [2023-03-17 02:56:10,027] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step62000 is begin to save! +0: [2023-03-17 02:56:10,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:56:10,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:56:10,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:56:10,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:56:10,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:56:10,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:56:10,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:56:10,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:56:10,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:56:10,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:56:10,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:56:10,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:56:10,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:56:10,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:56:10,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:56:10,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:56:10,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:56:10,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:56:10,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:56:10,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:56:10,164] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step62000/mp_rank_00_model_states.pt +0: [2023-03-17 02:56:10,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:56:10,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:56:10,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +6: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: [2023-03-17 02:56:10,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +3: [2023-03-17 02:56:10,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +5: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +4: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +7: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +2: [2023-03-17 02:56:10,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:56:10,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 02:56:10,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +1: [2023-03-17 02:56:10,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:56:10,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step62000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 02:56:10,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step62000 is ready now! +0: successfully saved checkpoint at iteration 62000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.52 +7: iteration 62010/ 173500 | consumed samples: 15874560 | consumed tokens: 32511098880 | elapsed time per iteration (s): 0.18 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.736766E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1435.225 | TFLOPs: 22.51 | +7: iteration 62020/ 173500 | consumed samples: 15877120 | consumed tokens: 32516341760 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.745836E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.970 | TFLOPs: 26.14 | +7: iteration 62030/ 173500 | consumed samples: 15879680 | consumed tokens: 32521584640 | elapsed time per iteration (s): 0.16 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.723114E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.827 | TFLOPs: 24.79 | +7: iteration 62040/ 173500 | consumed samples: 15882240 | consumed tokens: 32526827520 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.730760E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.835 | TFLOPs: 26.14 | +7: iteration 62050/ 173500 | consumed samples: 15884800 | consumed tokens: 32532070400 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.725488E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.942 | TFLOPs: 26.03 | +7: iteration 62060/ 173500 | consumed samples: 15887360 | consumed tokens: 32537313280 | elapsed time per iteration (s): 0.15 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.731800E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.995 | TFLOPs: 26.14 | +7: iteration 62070/ 173500 | consumed samples: 15889920 | consumed tokens: 32542556160 | elapsed time per iteration (s): 0.16 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.733435E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.173 | TFLOPs: 25.69 | +7: iteration 62080/ 173500 | consumed samples: 15892480 | consumed tokens: 32547799040 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.729528E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.427 | TFLOPs: 26.13 | +7: iteration 62090/ 173500 | consumed samples: 15895040 | consumed tokens: 32553041920 | elapsed time per iteration (s): 0.16 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.722765E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.571 | TFLOPs: 25.82 | +7: iteration 62100/ 173500 | consumed samples: 15897600 | consumed tokens: 32558284800 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.731521E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.819 | TFLOPs: 26.14 | +7: iteration 62110/ 173500 | consumed samples: 15900160 | consumed tokens: 32563527680 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.730254E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.239 | TFLOPs: 26.13 | +7: iteration 62120/ 173500 | consumed samples: 15902720 | consumed tokens: 32568770560 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.738614E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.554 | TFLOPs: 26.17 | +7: iteration 62130/ 173500 | consumed samples: 15905280 | consumed tokens: 32574013440 | elapsed time per iteration (s): 0.15 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.739730E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.447 | TFLOPs: 26.15 | +7: iteration 62140/ 173500 | consumed samples: 15907840 | consumed tokens: 32579256320 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.729744E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.645 | TFLOPs: 26.15 | +7: iteration 62150/ 173500 | consumed samples: 15910400 | consumed tokens: 32584499200 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.746632E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.038 | TFLOPs: 26.16 | +7: iteration 62160/ 173500 | consumed samples: 15912960 | consumed tokens: 32589742080 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.737739E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.935 | TFLOPs: 26.13 | +7: iteration 62170/ 173500 | consumed samples: 15915520 | consumed tokens: 32594984960 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.741603E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.622 | TFLOPs: 26.14 | +7: iteration 62180/ 173500 | consumed samples: 15918080 | consumed tokens: 32600227840 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.722411E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.000 | TFLOPs: 26.14 | +7: iteration 62190/ 173500 | consumed samples: 15920640 | consumed tokens: 32605470720 | elapsed time per iteration (s): 0.15 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.739207E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.332 | TFLOPs: 26.10 | +7: iteration 62200/ 173500 | consumed samples: 15923200 | consumed tokens: 32610713600 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.736006E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.366 | TFLOPs: 26.04 | +7: iteration 62210/ 173500 | consumed samples: 15925760 | consumed tokens: 32615956480 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.734259E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.156 | TFLOPs: 26.11 | +7: iteration 62220/ 173500 | consumed samples: 15928320 | consumed tokens: 32621199360 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.731247E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.118 | TFLOPs: 26.14 | +7: iteration 62230/ 173500 | consumed samples: 15930880 | consumed tokens: 32626442240 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.740145E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.441 | TFLOPs: 26.10 | +7: iteration 62240/ 173500 | consumed samples: 15933440 | consumed tokens: 32631685120 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.735639E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.790 | TFLOPs: 26.14 | +7: iteration 62250/ 173500 | consumed samples: 15936000 | consumed tokens: 32636928000 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.737941E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.956 | TFLOPs: 26.00 | +7: iteration 62260/ 173500 | consumed samples: 15938560 | consumed tokens: 32642170880 | elapsed time per iteration (s): 0.15 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.747231E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.493 | TFLOPs: 26.26 | +7: iteration 62270/ 173500 | consumed samples: 15941120 | consumed tokens: 32647413760 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.734915E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.380 | TFLOPs: 26.21 | +7: iteration 62280/ 173500 | consumed samples: 15943680 | consumed tokens: 32652656640 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.735798E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.695 | TFLOPs: 26.22 | +7: iteration 62290/ 173500 | consumed samples: 15946240 | consumed tokens: 32657899520 | elapsed time per iteration (s): 0.16 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.722046E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.940 | TFLOPs: 24.98 | +7: iteration 62300/ 173500 | consumed samples: 15948800 | consumed tokens: 32663142400 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.728394E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.405 | TFLOPs: 26.04 | +7: iteration 62310/ 173500 | consumed samples: 15951360 | consumed tokens: 32668385280 | elapsed time per iteration (s): 0.15 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.713676E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.441 | TFLOPs: 26.24 | +7: iteration 62320/ 173500 | consumed samples: 15953920 | consumed tokens: 32673628160 | elapsed time per iteration (s): 0.16 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.729724E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.124 | TFLOPs: 25.71 | +7: iteration 62330/ 173500 | consumed samples: 15956480 | consumed tokens: 32678871040 | elapsed time per iteration (s): 0.16 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.722529E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.658 | TFLOPs: 25.82 | +7: iteration 62340/ 173500 | consumed samples: 15959040 | consumed tokens: 32684113920 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.738767E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.749 | TFLOPs: 26.17 | +7: iteration 62350/ 173500 | consumed samples: 15961600 | consumed tokens: 32689356800 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.728086E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.282 | TFLOPs: 26.16 | +7: iteration 62360/ 173500 | consumed samples: 15964160 | consumed tokens: 32694599680 | elapsed time per iteration (s): 0.16 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.730558E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.846 | TFLOPs: 25.83 | +7: iteration 62370/ 173500 | consumed samples: 15966720 | consumed tokens: 32699842560 | elapsed time per iteration (s): 0.15 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.731455E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.344 | TFLOPs: 26.15 | +7: iteration 62380/ 173500 | consumed samples: 15969280 | consumed tokens: 32705085440 | elapsed time per iteration (s): 0.16 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.741740E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.291 | TFLOPs: 25.82 | +7: iteration 62390/ 173500 | consumed samples: 15971840 | consumed tokens: 32710328320 | elapsed time per iteration (s): 0.16 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.736804E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.227 | TFLOPs: 25.68 | +7: iteration 62400/ 173500 | consumed samples: 15974400 | consumed tokens: 32715571200 | elapsed time per iteration (s): 0.16 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.732678E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.644 | TFLOPs: 25.56 | +7: iteration 62410/ 173500 | consumed samples: 15976960 | consumed tokens: 32720814080 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.734475E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.479 | TFLOPs: 26.18 | +7: iteration 62420/ 173500 | consumed samples: 15979520 | consumed tokens: 32726056960 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.731174E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.426 | TFLOPs: 26.17 | +7: iteration 62430/ 173500 | consumed samples: 15982080 | consumed tokens: 32731299840 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.741695E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.374 | TFLOPs: 26.10 | +7: iteration 62440/ 173500 | consumed samples: 15984640 | consumed tokens: 32736542720 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.738481E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.796 | TFLOPs: 26.11 | +7: iteration 62450/ 173500 | consumed samples: 15987200 | consumed tokens: 32741785600 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.723175E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.786 | TFLOPs: 26.11 | +7: iteration 62460/ 173500 | consumed samples: 15989760 | consumed tokens: 32747028480 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.731795E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.165 | TFLOPs: 26.11 | +7: iteration 62470/ 173500 | consumed samples: 15992320 | consumed tokens: 32752271360 | elapsed time per iteration (s): 0.15 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.730009E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.233 | TFLOPs: 26.13 | +7: iteration 62480/ 173500 | consumed samples: 15994880 | consumed tokens: 32757514240 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.737801E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.856 | TFLOPs: 26.16 | +7: iteration 62490/ 173500 | consumed samples: 15997440 | consumed tokens: 32762757120 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.742150E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.656 | TFLOPs: 26.14 | +7: iteration 62500/ 173500 | consumed samples: 16000000 | consumed tokens: 32768000000 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.726877E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.170 | TFLOPs: 26.15 | +7: iteration 62510/ 173500 | consumed samples: 16002560 | consumed tokens: 32773242880 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.739038E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.192 | TFLOPs: 26.16 | +7: iteration 62520/ 173500 | consumed samples: 16005120 | consumed tokens: 32778485760 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.741467E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.968 | TFLOPs: 26.17 | +7: iteration 62530/ 173500 | consumed samples: 16007680 | consumed tokens: 32783728640 | elapsed time per iteration (s): 0.15 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.742543E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.983 | TFLOPs: 26.14 | +7: iteration 62540/ 173500 | consumed samples: 16010240 | consumed tokens: 32788971520 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.728366E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.376 | TFLOPs: 26.09 | +7: iteration 62550/ 173500 | consumed samples: 16012800 | consumed tokens: 32794214400 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.729043E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.175 | TFLOPs: 26.16 | +7: iteration 62560/ 173500 | consumed samples: 16015360 | consumed tokens: 32799457280 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.724829E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.217 | TFLOPs: 26.21 | +7: iteration 62570/ 173500 | consumed samples: 16017920 | consumed tokens: 32804700160 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.731157E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.860 | TFLOPs: 26.17 | +7: iteration 62580/ 173500 | consumed samples: 16020480 | consumed tokens: 32809943040 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.728125E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.324 | TFLOPs: 26.23 | +7: iteration 62590/ 173500 | consumed samples: 16023040 | consumed tokens: 32815185920 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.739945E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.283 | TFLOPs: 26.24 | +7: iteration 62600/ 173500 | consumed samples: 16025600 | consumed tokens: 32820428800 | elapsed time per iteration (s): 0.15 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.741892E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.466 | TFLOPs: 26.17 | +7: iteration 62610/ 173500 | consumed samples: 16028160 | consumed tokens: 32825671680 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.731558E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.877 | TFLOPs: 26.11 | +7: iteration 62620/ 173500 | consumed samples: 16030720 | consumed tokens: 32830914560 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.722442E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.948 | TFLOPs: 26.09 | +7: iteration 62630/ 173500 | consumed samples: 16033280 | consumed tokens: 32836157440 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.725894E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.985 | TFLOPs: 26.17 | +7: iteration 62640/ 173500 | consumed samples: 16035840 | consumed tokens: 32841400320 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.725361E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.040 | TFLOPs: 26.28 | +7: iteration 62650/ 173500 | consumed samples: 16038400 | consumed tokens: 32846643200 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.722672E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.975 | TFLOPs: 26.24 | +7: iteration 62660/ 173500 | consumed samples: 16040960 | consumed tokens: 32851886080 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.745395E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.846 | TFLOPs: 26.22 | +7: iteration 62670/ 173500 | consumed samples: 16043520 | consumed tokens: 32857128960 | elapsed time per iteration (s): 0.15 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.733216E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.890 | TFLOPs: 26.30 | +7: iteration 62680/ 173500 | consumed samples: 16046080 | consumed tokens: 32862371840 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.720506E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.124 | TFLOPs: 26.27 | +7: iteration 62690/ 173500 | consumed samples: 16048640 | consumed tokens: 32867614720 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.742923E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.295 | TFLOPs: 26.29 | +7: iteration 62700/ 173500 | consumed samples: 16051200 | consumed tokens: 32872857600 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.722350E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.431 | TFLOPs: 26.29 | +7: iteration 62710/ 173500 | consumed samples: 16053760 | consumed tokens: 32878100480 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.748287E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.801 | TFLOPs: 26.30 | +7: iteration 62720/ 173500 | consumed samples: 16056320 | consumed tokens: 32883343360 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.722847E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.558 | TFLOPs: 26.28 | +7: iteration 62730/ 173500 | consumed samples: 16058880 | consumed tokens: 32888586240 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.734303E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.579 | TFLOPs: 26.21 | +7: iteration 62740/ 173500 | consumed samples: 16061440 | consumed tokens: 32893829120 | elapsed time per iteration (s): 0.15 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.725549E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.714 | TFLOPs: 26.22 | +7: iteration 62750/ 173500 | consumed samples: 16064000 | consumed tokens: 32899072000 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.741818E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.038 | TFLOPs: 26.22 | +7: iteration 62760/ 173500 | consumed samples: 16066560 | consumed tokens: 32904314880 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.734004E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.843 | TFLOPs: 26.20 | +7: iteration 62770/ 173500 | consumed samples: 16069120 | consumed tokens: 32909557760 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.727872E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.856 | TFLOPs: 26.19 | +7: iteration 62780/ 173500 | consumed samples: 16071680 | consumed tokens: 32914800640 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.748428E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.555 | TFLOPs: 26.21 | +7: iteration 62790/ 173500 | consumed samples: 16074240 | consumed tokens: 32920043520 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.726320E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.004 | TFLOPs: 26.21 | +7: iteration 62800/ 173500 | consumed samples: 16076800 | consumed tokens: 32925286400 | elapsed time per iteration (s): 0.15 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.729896E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.862 | TFLOPs: 26.22 | +7: iteration 62810/ 173500 | consumed samples: 16079360 | consumed tokens: 32930529280 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.734079E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.502 | TFLOPs: 26.21 | +7: iteration 62820/ 173500 | consumed samples: 16081920 | consumed tokens: 32935772160 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.737944E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.556 | TFLOPs: 26.20 | +7: iteration 62830/ 173500 | consumed samples: 16084480 | consumed tokens: 32941015040 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.729623E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.065 | TFLOPs: 26.22 | +7: iteration 62840/ 173500 | consumed samples: 16087040 | consumed tokens: 32946257920 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.722368E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.761 | TFLOPs: 26.23 | +7: iteration 62850/ 173500 | consumed samples: 16089600 | consumed tokens: 32951500800 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.734082E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.594 | TFLOPs: 26.25 | +7: iteration 62860/ 173500 | consumed samples: 16092160 | consumed tokens: 32956743680 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.727804E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.483 | TFLOPs: 26.23 | +7: iteration 62870/ 173500 | consumed samples: 16094720 | consumed tokens: 32961986560 | elapsed time per iteration (s): 0.15 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.736309E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.873 | TFLOPs: 26.23 | +7: iteration 62880/ 173500 | consumed samples: 16097280 | consumed tokens: 32967229440 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.734127E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.623 | TFLOPs: 26.22 | +7: iteration 62890/ 173500 | consumed samples: 16099840 | consumed tokens: 32972472320 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.736350E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.027 | TFLOPs: 26.22 | +7: iteration 62900/ 173500 | consumed samples: 16102400 | consumed tokens: 32977715200 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.731255E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.971 | TFLOPs: 26.22 | +7: iteration 62910/ 173500 | consumed samples: 16104960 | consumed tokens: 32982958080 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.727095E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.236 | TFLOPs: 26.21 | +7: iteration 62920/ 173500 | consumed samples: 16107520 | consumed tokens: 32988200960 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.745142E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.153 | TFLOPs: 26.24 | +7: iteration 62930/ 173500 | consumed samples: 16110080 | consumed tokens: 32993443840 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.742312E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.200 | TFLOPs: 26.27 | +7: iteration 62940/ 173500 | consumed samples: 16112640 | consumed tokens: 32998686720 | elapsed time per iteration (s): 0.15 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.725191E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.579 | TFLOPs: 26.25 | +7: iteration 62950/ 173500 | consumed samples: 16115200 | consumed tokens: 33003929600 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.734973E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.605 | TFLOPs: 26.26 | +7: iteration 62960/ 173500 | consumed samples: 16117760 | consumed tokens: 33009172480 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.728321E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.854 | TFLOPs: 26.27 | +7: iteration 62970/ 173500 | consumed samples: 16120320 | consumed tokens: 33014415360 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.729252E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.549 | TFLOPs: 26.26 | +7: iteration 62980/ 173500 | consumed samples: 16122880 | consumed tokens: 33019658240 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.736221E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.682 | TFLOPs: 26.26 | +7: iteration 62990/ 173500 | consumed samples: 16125440 | consumed tokens: 33024901120 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.735499E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.597 | TFLOPs: 26.26 | +7: iteration 63000/ 173500 | consumed samples: 16128000 | consumed tokens: 33030144000 | elapsed time per iteration (s): 0.15 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.726317E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.944 | TFLOPs: 26.25 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 63000 | lm loss value: 3.873098E+00 | lm loss PPL: 4.809116E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 63000 to checkpoints_44m91b100m +0: [2023-03-17 02:58:43,961] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step63000 is begin to save! +0: [2023-03-17 02:58:43,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_01-model_00-model_states.pt... +0: [2023-03-17 02:58:44,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_01-model_00-model_states.pt. +0: [2023-03-17 02:58:44,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_03-model_00-model_states.pt... +0: [2023-03-17 02:58:44,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_03-model_00-model_states.pt. +0: [2023-03-17 02:58:44,035] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_04-model_00-model_states.pt... +0: [2023-03-17 02:58:44,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_04-model_00-model_states.pt. +0: [2023-03-17 02:58:44,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_05-model_00-model_states.pt... +0: [2023-03-17 02:58:44,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_05-model_00-model_states.pt. +0: [2023-03-17 02:58:44,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_06-model_00-model_states.pt... +0: [2023-03-17 02:58:44,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_06-model_00-model_states.pt. +0: [2023-03-17 02:58:44,059] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_07-model_00-model_states.pt... +0: [2023-03-17 02:58:44,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_07-model_00-model_states.pt. +0: [2023-03-17 02:58:44,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_08-model_00-model_states.pt... +0: [2023-03-17 02:58:44,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_08-model_00-model_states.pt. +0: [2023-03-17 02:58:44,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_09-model_00-model_states.pt... +0: [2023-03-17 02:58:44,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_09-model_00-model_states.pt. +0: [2023-03-17 02:58:44,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_10-model_00-model_states.pt... +0: [2023-03-17 02:58:44,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_10-model_00-model_states.pt. +0: [2023-03-17 02:58:44,091] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/layer_12-model_00-model_states.pt... +0: [2023-03-17 02:58:44,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/layer_12-model_00-model_states.pt. +0: [2023-03-17 02:58:44,093] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step63000/mp_rank_00_model_states.pt +0: [2023-03-17 02:58:44,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/mp_rank_00_model_states.pt... +0: [2023-03-17 02:58:44,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/mp_rank_00_model_states.pt. +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 02:58:44,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +0: [2023-03-17 02:58:44,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 02:58:44,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +1: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +2: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +3: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +4: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 02:58:44,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +5: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +6: [2023-03-17 02:58:44,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 02:58:44,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 02:58:44,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +7: [2023-03-17 02:58:44,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 02:58:44,135] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step63000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 02:58:44,135] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step63000 is ready now! +0: successfully saved checkpoint at iteration 63000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.36 +7: iteration 63010/ 173500 | consumed samples: 16130560 | consumed tokens: 33035386880 | elapsed time per iteration (s): 0.18 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.735118E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.265 | TFLOPs: 22.82 | +7: iteration 63020/ 173500 | consumed samples: 16133120 | consumed tokens: 33040629760 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.733652E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.948 | TFLOPs: 26.14 | +7: iteration 63030/ 173500 | consumed samples: 16135680 | consumed tokens: 33045872640 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.730813E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.137 | TFLOPs: 26.16 | +7: iteration 63040/ 173500 | consumed samples: 16138240 | consumed tokens: 33051115520 | elapsed time per iteration (s): 0.16 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.727661E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.799 | TFLOPs: 25.81 | +7: iteration 63050/ 173500 | consumed samples: 16140800 | consumed tokens: 33056358400 | elapsed time per iteration (s): 0.16 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.722400E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.416 | TFLOPs: 25.79 | +7: iteration 63060/ 173500 | consumed samples: 16143360 | consumed tokens: 33061601280 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.741254E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.242 | TFLOPs: 26.15 | +7: iteration 63070/ 173500 | consumed samples: 16145920 | consumed tokens: 33066844160 | elapsed time per iteration (s): 0.15 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.729655E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.650 | TFLOPs: 26.14 | +7: iteration 63080/ 173500 | consumed samples: 16148480 | consumed tokens: 33072087040 | elapsed time per iteration (s): 0.16 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.745561E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.740 | TFLOPs: 25.43 | +7: iteration 63090/ 173500 | consumed samples: 16151040 | consumed tokens: 33077329920 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.724930E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.137 | TFLOPs: 26.21 | +7: iteration 63100/ 173500 | consumed samples: 16153600 | consumed tokens: 33082572800 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.723757E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.273 | TFLOPs: 26.16 | +7: iteration 63110/ 173500 | consumed samples: 16156160 | consumed tokens: 33087815680 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.712887E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.612 | TFLOPs: 26.17 | +7: iteration 63120/ 173500 | consumed samples: 16158720 | consumed tokens: 33093058560 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.733994E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.830 | TFLOPs: 26.19 | +7: iteration 63130/ 173500 | consumed samples: 16161280 | consumed tokens: 33098301440 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.733459E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.096 | TFLOPs: 26.18 | +7: iteration 63140/ 173500 | consumed samples: 16163840 | consumed tokens: 33103544320 | elapsed time per iteration (s): 0.15 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.742193E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.035 | TFLOPs: 26.14 | +7: iteration 63150/ 173500 | consumed samples: 16166400 | consumed tokens: 33108787200 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.734348E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.297 | TFLOPs: 26.16 | +7: iteration 63160/ 173500 | consumed samples: 16168960 | consumed tokens: 33114030080 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.720810E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.692 | TFLOPs: 26.14 | +7: iteration 63170/ 173500 | consumed samples: 16171520 | consumed tokens: 33119272960 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.736343E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.208 | TFLOPs: 26.16 | +7: iteration 63180/ 173500 | consumed samples: 16174080 | consumed tokens: 33124515840 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.731762E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.277 | TFLOPs: 26.18 | +7: iteration 63190/ 173500 | consumed samples: 16176640 | consumed tokens: 33129758720 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.736488E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.526 | TFLOPs: 26.17 | +7: iteration 63200/ 173500 | consumed samples: 16179200 | consumed tokens: 33135001600 | elapsed time per iteration (s): 0.15 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.736639E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.694 | TFLOPs: 26.08 | +7: iteration 63210/ 173500 | consumed samples: 16181760 | consumed tokens: 33140244480 | elapsed time per iteration (s): 0.16 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.735833E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.853 | TFLOPs: 25.23 | +7: iteration 63220/ 173500 | consumed samples: 16184320 | consumed tokens: 33145487360 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.732011E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.209 | TFLOPs: 26.15 | +7: iteration 63230/ 173500 | consumed samples: 16186880 | consumed tokens: 33150730240 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.725590E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.419 | TFLOPs: 26.15 | +7: iteration 63240/ 173500 | consumed samples: 16189440 | consumed tokens: 33155973120 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.734058E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.042 | TFLOPs: 26.17 | +7: iteration 63250/ 173500 | consumed samples: 16192000 | consumed tokens: 33161216000 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.731596E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.923 | TFLOPs: 26.16 | +7: iteration 63260/ 173500 | consumed samples: 16194560 | consumed tokens: 33166458880 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.732318E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.026 | TFLOPs: 26.16 | +7: iteration 63270/ 173500 | consumed samples: 16197120 | consumed tokens: 33171701760 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.734583E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.494 | TFLOPs: 26.18 | +7: iteration 63280/ 173500 | consumed samples: 16199680 | consumed tokens: 33176944640 | elapsed time per iteration (s): 0.15 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.732241E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.313 | TFLOPs: 26.15 | +7: iteration 63290/ 173500 | consumed samples: 16202240 | consumed tokens: 33182187520 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.721292E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.957 | TFLOPs: 26.14 | +7: iteration 63300/ 173500 | consumed samples: 16204800 | consumed tokens: 33187430400 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.735885E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.784 | TFLOPs: 26.17 | +7: iteration 63310/ 173500 | consumed samples: 16207360 | consumed tokens: 33192673280 | elapsed time per iteration (s): 0.16 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.724043E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.291 | TFLOPs: 25.72 | +7: iteration 63320/ 173500 | consumed samples: 16209920 | consumed tokens: 33197916160 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.738268E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.846 | TFLOPs: 26.17 | +7: iteration 63330/ 173500 | consumed samples: 16212480 | consumed tokens: 33203159040 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.731353E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.997 | TFLOPs: 26.16 | +7: iteration 63340/ 173500 | consumed samples: 16215040 | consumed tokens: 33208401920 | elapsed time per iteration (s): 0.15 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.731286E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.192 | TFLOPs: 26.11 | +7: iteration 63350/ 173500 | consumed samples: 16217600 | consumed tokens: 33213644800 | elapsed time per iteration (s): 0.16 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.735818E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.341 | TFLOPs: 25.66 | +7: iteration 63360/ 173500 | consumed samples: 16220160 | consumed tokens: 33218887680 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.727460E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.147 | TFLOPs: 26.08 | +7: iteration 63370/ 173500 | consumed samples: 16222720 | consumed tokens: 33224130560 | elapsed time per iteration (s): 0.16 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.730994E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.986 | TFLOPs: 25.64 | +7: iteration 63380/ 173500 | consumed samples: 16225280 | consumed tokens: 33229373440 | elapsed time per iteration (s): 0.16 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.734623E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.304 | TFLOPs: 25.50 | +7: iteration 63390/ 173500 | consumed samples: 16227840 | consumed tokens: 33234616320 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.723674E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.248 | TFLOPs: 26.18 | +7: iteration 63400/ 173500 | consumed samples: 16230400 | consumed tokens: 33239859200 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.718682E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.433 | TFLOPs: 26.17 | +7: iteration 63410/ 173500 | consumed samples: 16232960 | consumed tokens: 33245102080 | elapsed time per iteration (s): 0.15 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.728682E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.227 | TFLOPs: 26.16 | +7: iteration 63420/ 173500 | consumed samples: 16235520 | consumed tokens: 33250344960 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.727206E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.108 | TFLOPs: 26.14 | +7: iteration 63430/ 173500 | consumed samples: 16238080 | consumed tokens: 33255587840 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.731205E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.531 | TFLOPs: 26.15 | +7: iteration 63440/ 173500 | consumed samples: 16240640 | consumed tokens: 33260830720 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.724691E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.883 | TFLOPs: 26.19 | +7: iteration 63450/ 173500 | consumed samples: 16243200 | consumed tokens: 33266073600 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.724175E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.902 | TFLOPs: 26.27 | +7: iteration 63460/ 173500 | consumed samples: 16245760 | consumed tokens: 33271316480 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.731913E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.747 | TFLOPs: 26.28 | +7: iteration 63470/ 173500 | consumed samples: 16248320 | consumed tokens: 33276559360 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.735560E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.634 | TFLOPs: 26.23 | +7: iteration 63480/ 173500 | consumed samples: 16250880 | consumed tokens: 33281802240 | elapsed time per iteration (s): 0.15 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.729882E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.097 | TFLOPs: 26.27 | +7: iteration 63490/ 173500 | consumed samples: 16253440 | consumed tokens: 33287045120 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.728022E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.449 | TFLOPs: 26.29 | +7: iteration 63500/ 173500 | consumed samples: 16256000 | consumed tokens: 33292288000 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.716454E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.636 | TFLOPs: 26.26 | +7: iteration 63510/ 173500 | consumed samples: 16258560 | consumed tokens: 33297530880 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.726032E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.628 | TFLOPs: 26.25 | +7: iteration 63520/ 173500 | consumed samples: 16261120 | consumed tokens: 33302773760 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.732473E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.729 | TFLOPs: 26.25 | +7: iteration 63530/ 173500 | consumed samples: 16263680 | consumed tokens: 33308016640 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.730616E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.262 | TFLOPs: 26.26 | +7: iteration 63540/ 173500 | consumed samples: 16266240 | consumed tokens: 33313259520 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.730784E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.981 | TFLOPs: 26.28 | +7: iteration 63550/ 173500 | consumed samples: 16268800 | consumed tokens: 33318502400 | elapsed time per iteration (s): 0.15 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.737077E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.818 | TFLOPs: 26.30 | +7: iteration 63560/ 173500 | consumed samples: 16271360 | consumed tokens: 33323745280 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.753030E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.017 | TFLOPs: 26.22 | +7: iteration 63570/ 173500 | consumed samples: 16273920 | consumed tokens: 33328988160 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.733461E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.455 | TFLOPs: 26.21 | +7: iteration 63580/ 173500 | consumed samples: 16276480 | consumed tokens: 33334231040 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.731537E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.630 | TFLOPs: 26.23 | +7: iteration 63590/ 173500 | consumed samples: 16279040 | consumed tokens: 33339473920 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.736079E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.801 | TFLOPs: 26.22 | +7: iteration 63600/ 173500 | consumed samples: 16281600 | consumed tokens: 33344716800 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.716325E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.606 | TFLOPs: 26.17 | +7: iteration 63610/ 173500 | consumed samples: 16284160 | consumed tokens: 33349959680 | elapsed time per iteration (s): 0.15 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.728095E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.293 | TFLOPs: 26.16 | +7: iteration 63620/ 173500 | consumed samples: 16286720 | consumed tokens: 33355202560 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.736605E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.289 | TFLOPs: 26.16 | +7: iteration 63630/ 173500 | consumed samples: 16289280 | consumed tokens: 33360445440 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.719040E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.492 | TFLOPs: 26.18 | +7: iteration 63640/ 173500 | consumed samples: 16291840 | consumed tokens: 33365688320 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.723959E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.309 | TFLOPs: 25.98 | +7: iteration 63650/ 173500 | consumed samples: 16294400 | consumed tokens: 33370931200 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.724289E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.756 | TFLOPs: 26.08 | +7: iteration 63660/ 173500 | consumed samples: 16296960 | consumed tokens: 33376174080 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.739188E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.396 | TFLOPs: 26.05 | +7: iteration 63670/ 173500 | consumed samples: 16299520 | consumed tokens: 33381416960 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.739932E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.124 | TFLOPs: 26.07 | +7: iteration 63680/ 173500 | consumed samples: 16302080 | consumed tokens: 33386659840 | elapsed time per iteration (s): 0.15 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.742192E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.599 | TFLOPs: 26.11 | +7: iteration 63690/ 173500 | consumed samples: 16304640 | consumed tokens: 33391902720 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.718325E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.818 | TFLOPs: 26.09 | +7: iteration 63700/ 173500 | consumed samples: 16307200 | consumed tokens: 33397145600 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.729962E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.247 | TFLOPs: 26.13 | +7: iteration 63710/ 173500 | consumed samples: 16309760 | consumed tokens: 33402388480 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.734298E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.268 | TFLOPs: 26.18 | +7: iteration 63720/ 173500 | consumed samples: 16312320 | consumed tokens: 33407631360 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.730299E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.021 | TFLOPs: 26.19 | +7: iteration 63730/ 173500 | consumed samples: 16314880 | consumed tokens: 33412874240 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.727547E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.432 | TFLOPs: 26.18 | +7: iteration 63740/ 173500 | consumed samples: 16317440 | consumed tokens: 33418117120 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.738445E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.129 | TFLOPs: 26.19 | +7: iteration 63750/ 173500 | consumed samples: 16320000 | consumed tokens: 33423360000 | elapsed time per iteration (s): 0.15 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.729624E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.991 | TFLOPs: 26.19 | +7: iteration 63760/ 173500 | consumed samples: 16322560 | consumed tokens: 33428602880 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.738838E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.889 | TFLOPs: 26.20 | +7: iteration 63770/ 173500 | consumed samples: 16325120 | consumed tokens: 33433845760 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.751143E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.787 | TFLOPs: 26.22 | +7: iteration 63780/ 173500 | consumed samples: 16327680 | consumed tokens: 33439088640 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.732257E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.031 | TFLOPs: 26.19 | +7: iteration 63790/ 173500 | consumed samples: 16330240 | consumed tokens: 33444331520 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.727366E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.641 | TFLOPs: 26.18 | +7: iteration 63800/ 173500 | consumed samples: 16332800 | consumed tokens: 33449574400 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.724861E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.774 | TFLOPs: 26.19 | +7: iteration 63810/ 173500 | consumed samples: 16335360 | consumed tokens: 33454817280 | elapsed time per iteration (s): 0.15 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.730535E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.271 | TFLOPs: 26.19 | +7: iteration 63820/ 173500 | consumed samples: 16337920 | consumed tokens: 33460060160 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.726981E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.054 | TFLOPs: 26.19 | +7: iteration 63830/ 173500 | consumed samples: 16340480 | consumed tokens: 33465303040 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.744611E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.913 | TFLOPs: 26.20 | +7: iteration 63840/ 173500 | consumed samples: 16343040 | consumed tokens: 33470545920 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.733200E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.579 | TFLOPs: 26.21 | +7: iteration 63850/ 173500 | consumed samples: 16345600 | consumed tokens: 33475788800 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.742474E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.099 | TFLOPs: 26.22 | +7: iteration 63860/ 173500 | consumed samples: 16348160 | consumed tokens: 33481031680 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.725631E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.016 | TFLOPs: 26.21 | +7: iteration 63870/ 173500 | consumed samples: 16350720 | consumed tokens: 33486274560 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.735392E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.349 | TFLOPs: 26.20 | +7: iteration 63880/ 173500 | consumed samples: 16353280 | consumed tokens: 33491517440 | elapsed time per iteration (s): 0.15 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.730165E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.536 | TFLOPs: 26.18 | +7: iteration 63890/ 173500 | consumed samples: 16355840 | consumed tokens: 33496760320 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.722229E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.831 | TFLOPs: 26.19 | +7: iteration 63900/ 173500 | consumed samples: 16358400 | consumed tokens: 33502003200 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.731171E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.932 | TFLOPs: 26.19 | +7: iteration 63910/ 173500 | consumed samples: 16360960 | consumed tokens: 33507246080 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.720308E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.919 | TFLOPs: 26.19 | +7: iteration 63920/ 173500 | consumed samples: 16363520 | consumed tokens: 33512488960 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.734497E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.991 | TFLOPs: 26.19 | +7: iteration 63930/ 173500 | consumed samples: 16366080 | consumed tokens: 33517731840 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.726826E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.309 | TFLOPs: 26.21 | +7: iteration 63940/ 173500 | consumed samples: 16368640 | consumed tokens: 33522974720 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.727713E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.927 | TFLOPs: 26.25 | +7: iteration 63950/ 173500 | consumed samples: 16371200 | consumed tokens: 33528217600 | elapsed time per iteration (s): 0.15 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.723160E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.857 | TFLOPs: 26.23 | +7: iteration 63960/ 173500 | consumed samples: 16373760 | consumed tokens: 33533460480 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.725377E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.748 | TFLOPs: 26.23 | +7: iteration 63970/ 173500 | consumed samples: 16376320 | consumed tokens: 33538703360 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.722466E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.261 | TFLOPs: 26.24 | +7: iteration 63980/ 173500 | consumed samples: 16378880 | consumed tokens: 33543946240 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.719382E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.344 | TFLOPs: 26.26 | +7: iteration 63990/ 173500 | consumed samples: 16381440 | consumed tokens: 33549189120 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.731441E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.510 | TFLOPs: 26.24 | +0: [2023-03-17 03:01:17,750] [INFO] [logging.py:68:log_dist] [Rank 0] step=64000, skipped=0, lr=[0.0001476794025098283, 0.0001476794025098283, 0.0001476794025098283], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 64000/ 173500 | consumed samples: 16384000 | consumed tokens: 33554432000 | elapsed time per iteration (s): 0.15 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.723621E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.269 | TFLOPs: 26.24 | +0: steps: 64000 loss: 3.7153 iter time (s): 0.153 samples/sec: 1675.259 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 64000 | lm loss value: 3.908602E+00 | lm loss PPL: 4.982922E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 64000 to checkpoints_44m91b100m +0: [2023-03-17 03:01:17,823] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step64000 is begin to save! +0: [2023-03-17 03:01:17,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:01:17,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:01:17,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:01:17,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:01:17,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:01:17,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:01:17,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:01:17,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:01:17,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:01:17,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:01:17,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:01:17,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:01:17,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:01:17,939] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:01:17,940] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:01:17,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:01:17,948] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:01:17,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:01:17,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:01:17,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:01:17,957] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step64000/mp_rank_00_model_states.pt +0: [2023-03-17 03:01:17,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:01:17,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:01:17,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +4: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +6: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +2: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +7: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +3: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +5: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +1: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:01:17,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step64000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:01:17,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step64000 is ready now! +0: successfully saved checkpoint at iteration 64000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.65 +7: iteration 64010/ 173500 | consumed samples: 16386560 | consumed tokens: 33559674880 | elapsed time per iteration (s): 0.18 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.726524E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.332 | TFLOPs: 22.62 | +7: iteration 64020/ 173500 | consumed samples: 16389120 | consumed tokens: 33564917760 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.729050E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.322 | TFLOPs: 26.24 | +7: iteration 64030/ 173500 | consumed samples: 16391680 | consumed tokens: 33570160640 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.743970E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.680 | TFLOPs: 26.26 | +7: iteration 64040/ 173500 | consumed samples: 16394240 | consumed tokens: 33575403520 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.740115E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.770 | TFLOPs: 26.25 | +7: iteration 64050/ 173500 | consumed samples: 16396800 | consumed tokens: 33580646400 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.736312E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.892 | TFLOPs: 26.24 | +7: iteration 64060/ 173500 | consumed samples: 16399360 | consumed tokens: 33585889280 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.731801E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.336 | TFLOPs: 26.26 | +7: iteration 64070/ 173500 | consumed samples: 16401920 | consumed tokens: 33591132160 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.738051E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.597 | TFLOPs: 26.25 | +7: iteration 64080/ 173500 | consumed samples: 16404480 | consumed tokens: 33596375040 | elapsed time per iteration (s): 0.15 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.740613E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.989 | TFLOPs: 26.24 | +7: iteration 64090/ 173500 | consumed samples: 16407040 | consumed tokens: 33601617920 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.733055E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.326 | TFLOPs: 26.23 | +7: iteration 64100/ 173500 | consumed samples: 16409600 | consumed tokens: 33606860800 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.730084E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.488 | TFLOPs: 26.26 | +7: iteration 64110/ 173500 | consumed samples: 16412160 | consumed tokens: 33612103680 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.721761E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.692 | TFLOPs: 26.25 | +7: iteration 64120/ 173500 | consumed samples: 16414720 | consumed tokens: 33617346560 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.727186E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.904 | TFLOPs: 26.25 | +7: iteration 64130/ 173500 | consumed samples: 16417280 | consumed tokens: 33622589440 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.727206E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.888 | TFLOPs: 26.25 | +7: iteration 64140/ 173500 | consumed samples: 16419840 | consumed tokens: 33627832320 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.738083E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.055 | TFLOPs: 26.24 | +7: iteration 64150/ 173500 | consumed samples: 16422400 | consumed tokens: 33633075200 | elapsed time per iteration (s): 0.15 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.731770E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.761 | TFLOPs: 26.23 | +7: iteration 64160/ 173500 | consumed samples: 16424960 | consumed tokens: 33638318080 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.725678E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.843 | TFLOPs: 26.27 | +7: iteration 64170/ 173500 | consumed samples: 16427520 | consumed tokens: 33643560960 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.732780E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.970 | TFLOPs: 26.24 | +7: iteration 64180/ 173500 | consumed samples: 16430080 | consumed tokens: 33648803840 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.739190E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.347 | TFLOPs: 26.18 | +7: iteration 64190/ 173500 | consumed samples: 16432640 | consumed tokens: 33654046720 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.728168E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.037 | TFLOPs: 26.17 | +7: iteration 64200/ 173500 | consumed samples: 16435200 | consumed tokens: 33659289600 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.730952E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.193 | TFLOPs: 26.19 | +7: iteration 64210/ 173500 | consumed samples: 16437760 | consumed tokens: 33664532480 | elapsed time per iteration (s): 0.16 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.722920E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.631 | TFLOPs: 25.74 | +7: iteration 64220/ 173500 | consumed samples: 16440320 | consumed tokens: 33669775360 | elapsed time per iteration (s): 0.15 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.728607E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.512 | TFLOPs: 26.24 | +7: iteration 64230/ 173500 | consumed samples: 16442880 | consumed tokens: 33675018240 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.738239E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.868 | TFLOPs: 26.22 | +7: iteration 64240/ 173500 | consumed samples: 16445440 | consumed tokens: 33680261120 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.741367E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.037 | TFLOPs: 26.22 | +7: iteration 64250/ 173500 | consumed samples: 16448000 | consumed tokens: 33685504000 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.735014E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.977 | TFLOPs: 26.22 | +7: iteration 64260/ 173500 | consumed samples: 16450560 | consumed tokens: 33690746880 | elapsed time per iteration (s): 0.16 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.730424E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.015 | TFLOPs: 25.72 | +7: iteration 64270/ 173500 | consumed samples: 16453120 | consumed tokens: 33695989760 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.711885E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.361 | TFLOPs: 26.18 | +7: iteration 64280/ 173500 | consumed samples: 16455680 | consumed tokens: 33701232640 | elapsed time per iteration (s): 0.15 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.728016E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.784 | TFLOPs: 26.23 | +7: iteration 64290/ 173500 | consumed samples: 16458240 | consumed tokens: 33706475520 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.725653E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.068 | TFLOPs: 26.18 | +7: iteration 64300/ 173500 | consumed samples: 16460800 | consumed tokens: 33711718400 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.738927E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.184 | TFLOPs: 26.21 | +7: iteration 64310/ 173500 | consumed samples: 16463360 | consumed tokens: 33716961280 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.726399E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.013 | TFLOPs: 26.25 | +7: iteration 64320/ 173500 | consumed samples: 16465920 | consumed tokens: 33722204160 | elapsed time per iteration (s): 0.16 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.725927E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.075 | TFLOPs: 24.67 | +7: iteration 64330/ 173500 | consumed samples: 16468480 | consumed tokens: 33727447040 | elapsed time per iteration (s): 0.16 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.729172E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.483 | TFLOPs: 25.81 | +7: iteration 64340/ 173500 | consumed samples: 16471040 | consumed tokens: 33732689920 | elapsed time per iteration (s): 0.15 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.723643E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.619 | TFLOPs: 26.25 | +7: iteration 64350/ 173500 | consumed samples: 16473600 | consumed tokens: 33737932800 | elapsed time per iteration (s): 0.16 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.733499E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.131 | TFLOPs: 25.72 | +7: iteration 64360/ 173500 | consumed samples: 16476160 | consumed tokens: 33743175680 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.728011E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.693 | TFLOPs: 25.71 | +7: iteration 64370/ 173500 | consumed samples: 16478720 | consumed tokens: 33748418560 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.726293E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.870 | TFLOPs: 25.75 | +7: iteration 64380/ 173500 | consumed samples: 16481280 | consumed tokens: 33753661440 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.721151E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.456 | TFLOPs: 25.24 | +7: iteration 64390/ 173500 | consumed samples: 16483840 | consumed tokens: 33758904320 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.727917E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.939 | TFLOPs: 25.62 | +7: iteration 64400/ 173500 | consumed samples: 16486400 | consumed tokens: 33764147200 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.734725E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.276 | TFLOPs: 25.55 | +7: iteration 64410/ 173500 | consumed samples: 16488960 | consumed tokens: 33769390080 | elapsed time per iteration (s): 0.15 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.722593E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.657 | TFLOPs: 26.03 | +7: iteration 64420/ 173500 | consumed samples: 16491520 | consumed tokens: 33774632960 | elapsed time per iteration (s): 0.16 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.725922E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.286 | TFLOPs: 25.33 | +7: iteration 64430/ 173500 | consumed samples: 16494080 | consumed tokens: 33779875840 | elapsed time per iteration (s): 0.15 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.705703E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.669 | TFLOPs: 25.95 | +7: iteration 64440/ 173500 | consumed samples: 16496640 | consumed tokens: 33785118720 | elapsed time per iteration (s): 0.16 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.731942E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.682 | TFLOPs: 25.51 | +7: iteration 64450/ 173500 | consumed samples: 16499200 | consumed tokens: 33790361600 | elapsed time per iteration (s): 0.16 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.713980E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.063 | TFLOPs: 25.23 | +7: iteration 64460/ 173500 | consumed samples: 16501760 | consumed tokens: 33795604480 | elapsed time per iteration (s): 0.16 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.725885E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.204 | TFLOPs: 24.97 | +7: iteration 64470/ 173500 | consumed samples: 16504320 | consumed tokens: 33800847360 | elapsed time per iteration (s): 0.16 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.738944E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.059 | TFLOPs: 25.72 | +7: iteration 64480/ 173500 | consumed samples: 16506880 | consumed tokens: 33806090240 | elapsed time per iteration (s): 0.15 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.722620E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.845 | TFLOPs: 25.95 | +7: iteration 64490/ 173500 | consumed samples: 16509440 | consumed tokens: 33811333120 | elapsed time per iteration (s): 0.16 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.738361E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.091 | TFLOPs: 25.71 | +7: iteration 64500/ 173500 | consumed samples: 16512000 | consumed tokens: 33816576000 | elapsed time per iteration (s): 0.16 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.725626E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.618 | TFLOPs: 25.71 | +7: iteration 64510/ 173500 | consumed samples: 16514560 | consumed tokens: 33821818880 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.740782E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.255 | TFLOPs: 25.99 | +7: iteration 64520/ 173500 | consumed samples: 16517120 | consumed tokens: 33827061760 | elapsed time per iteration (s): 0.16 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.736240E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.654 | TFLOPs: 25.86 | +7: iteration 64530/ 173500 | consumed samples: 16519680 | consumed tokens: 33832304640 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.727441E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.873 | TFLOPs: 26.19 | +7: iteration 64540/ 173500 | consumed samples: 16522240 | consumed tokens: 33837547520 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.714909E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.736 | TFLOPs: 26.03 | +7: iteration 64550/ 173500 | consumed samples: 16524800 | consumed tokens: 33842790400 | elapsed time per iteration (s): 0.15 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.719220E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.367 | TFLOPs: 26.20 | +7: iteration 64560/ 173500 | consumed samples: 16527360 | consumed tokens: 33848033280 | elapsed time per iteration (s): 0.15 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.732612E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.971 | TFLOPs: 25.94 | +7: iteration 64570/ 173500 | consumed samples: 16529920 | consumed tokens: 33853276160 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.716180E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.248 | TFLOPs: 24.86 | +7: iteration 64580/ 173500 | consumed samples: 16532480 | consumed tokens: 33858519040 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.733640E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.228 | TFLOPs: 24.99 | +7: iteration 64590/ 173500 | consumed samples: 16535040 | consumed tokens: 33863761920 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.731703E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.953 | TFLOPs: 25.83 | +7: iteration 64600/ 173500 | consumed samples: 16537600 | consumed tokens: 33869004800 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.733620E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.758 | TFLOPs: 25.12 | +7: iteration 64610/ 173500 | consumed samples: 16540160 | consumed tokens: 33874247680 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.734256E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.655 | TFLOPs: 25.62 | +7: iteration 64620/ 173500 | consumed samples: 16542720 | consumed tokens: 33879490560 | elapsed time per iteration (s): 0.16 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.731401E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.805 | TFLOPs: 25.65 | +7: iteration 64630/ 173500 | consumed samples: 16545280 | consumed tokens: 33884733440 | elapsed time per iteration (s): 0.16 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.737535E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.727 | TFLOPs: 25.31 | +7: iteration 64640/ 173500 | consumed samples: 16547840 | consumed tokens: 33889976320 | elapsed time per iteration (s): 0.16 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.726692E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.056 | TFLOPs: 25.72 | +7: iteration 64650/ 173500 | consumed samples: 16550400 | consumed tokens: 33895219200 | elapsed time per iteration (s): 0.16 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.733586E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.222 | TFLOPs: 25.55 | +7: iteration 64660/ 173500 | consumed samples: 16552960 | consumed tokens: 33900462080 | elapsed time per iteration (s): 0.15 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.744087E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.862 | TFLOPs: 25.97 | +7: iteration 64670/ 173500 | consumed samples: 16555520 | consumed tokens: 33905704960 | elapsed time per iteration (s): 0.16 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.734181E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.083 | TFLOPs: 25.63 | +7: iteration 64680/ 173500 | consumed samples: 16558080 | consumed tokens: 33910947840 | elapsed time per iteration (s): 0.16 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.750623E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.088 | TFLOPs: 25.39 | +7: iteration 64690/ 173500 | consumed samples: 16560640 | consumed tokens: 33916190720 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.732261E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.883 | TFLOPs: 25.54 | +7: iteration 64700/ 173500 | consumed samples: 16563200 | consumed tokens: 33921433600 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.734335E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.254 | TFLOPs: 24.89 | +7: iteration 64710/ 173500 | consumed samples: 16565760 | consumed tokens: 33926676480 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.722877E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.394 | TFLOPs: 25.30 | +7: iteration 64720/ 173500 | consumed samples: 16568320 | consumed tokens: 33931919360 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.741680E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.459 | TFLOPs: 25.74 | +7: iteration 64730/ 173500 | consumed samples: 16570880 | consumed tokens: 33937162240 | elapsed time per iteration (s): 0.15 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.729129E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.167 | TFLOPs: 26.07 | +7: iteration 64740/ 173500 | consumed samples: 16573440 | consumed tokens: 33942405120 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.736676E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.194 | TFLOPs: 25.68 | +7: iteration 64750/ 173500 | consumed samples: 16576000 | consumed tokens: 33947648000 | elapsed time per iteration (s): 0.16 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.737914E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.392 | TFLOPs: 25.82 | +7: iteration 64760/ 173500 | consumed samples: 16578560 | consumed tokens: 33952890880 | elapsed time per iteration (s): 0.16 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.734409E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.108 | TFLOPs: 24.94 | +7: iteration 64770/ 173500 | consumed samples: 16581120 | consumed tokens: 33958133760 | elapsed time per iteration (s): 0.16 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.728362E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.313 | TFLOPs: 25.60 | +7: iteration 64780/ 173500 | consumed samples: 16583680 | consumed tokens: 33963376640 | elapsed time per iteration (s): 0.16 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.732747E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.759 | TFLOPs: 24.87 | +7: iteration 64790/ 173500 | consumed samples: 16586240 | consumed tokens: 33968619520 | elapsed time per iteration (s): 0.16 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.722481E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.567 | TFLOPs: 24.87 | +7: iteration 64800/ 173500 | consumed samples: 16588800 | consumed tokens: 33973862400 | elapsed time per iteration (s): 0.16 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.719640E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.286 | TFLOPs: 25.74 | +7: iteration 64810/ 173500 | consumed samples: 16591360 | consumed tokens: 33979105280 | elapsed time per iteration (s): 0.15 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.731072E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.913 | TFLOPs: 26.11 | +7: iteration 64820/ 173500 | consumed samples: 16593920 | consumed tokens: 33984348160 | elapsed time per iteration (s): 0.16 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.723268E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.242 | TFLOPs: 25.60 | +7: iteration 64830/ 173500 | consumed samples: 16596480 | consumed tokens: 33989591040 | elapsed time per iteration (s): 0.15 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.730553E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.122 | TFLOPs: 25.94 | +7: iteration 64840/ 173500 | consumed samples: 16599040 | consumed tokens: 33994833920 | elapsed time per iteration (s): 0.15 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.734658E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.767 | TFLOPs: 26.12 | +7: iteration 64850/ 173500 | consumed samples: 16601600 | consumed tokens: 34000076800 | elapsed time per iteration (s): 0.15 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.729179E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.513 | TFLOPs: 25.92 | +7: iteration 64860/ 173500 | consumed samples: 16604160 | consumed tokens: 34005319680 | elapsed time per iteration (s): 0.16 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.731664E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.720 | TFLOPs: 25.75 | +7: iteration 64870/ 173500 | consumed samples: 16606720 | consumed tokens: 34010562560 | elapsed time per iteration (s): 0.16 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.736631E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.293 | TFLOPs: 25.85 | +7: iteration 64880/ 173500 | consumed samples: 16609280 | consumed tokens: 34015805440 | elapsed time per iteration (s): 0.16 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.736366E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.739 | TFLOPs: 25.79 | +7: iteration 64890/ 173500 | consumed samples: 16611840 | consumed tokens: 34021048320 | elapsed time per iteration (s): 0.16 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.732624E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.771 | TFLOPs: 25.10 | +7: iteration 64900/ 173500 | consumed samples: 16614400 | consumed tokens: 34026291200 | elapsed time per iteration (s): 0.16 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.720774E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.130 | TFLOPs: 25.24 | +7: iteration 64910/ 173500 | consumed samples: 16616960 | consumed tokens: 34031534080 | elapsed time per iteration (s): 0.15 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.740993E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.422 | TFLOPs: 26.15 | +7: iteration 64920/ 173500 | consumed samples: 16619520 | consumed tokens: 34036776960 | elapsed time per iteration (s): 0.16 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.725925E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.731 | TFLOPs: 25.06 | +7: iteration 64930/ 173500 | consumed samples: 16622080 | consumed tokens: 34042019840 | elapsed time per iteration (s): 0.16 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.726636E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.350 | TFLOPs: 25.74 | +7: iteration 64940/ 173500 | consumed samples: 16624640 | consumed tokens: 34047262720 | elapsed time per iteration (s): 0.15 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.721385E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.949 | TFLOPs: 26.14 | +7: iteration 64950/ 173500 | consumed samples: 16627200 | consumed tokens: 34052505600 | elapsed time per iteration (s): 0.16 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.720589E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.831 | TFLOPs: 24.63 | +7: iteration 64960/ 173500 | consumed samples: 16629760 | consumed tokens: 34057748480 | elapsed time per iteration (s): 0.16 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.725875E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.282 | TFLOPs: 25.35 | +7: iteration 64970/ 173500 | consumed samples: 16632320 | consumed tokens: 34062991360 | elapsed time per iteration (s): 0.15 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.738608E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.170 | TFLOPs: 26.16 | +7: iteration 64980/ 173500 | consumed samples: 16634880 | consumed tokens: 34068234240 | elapsed time per iteration (s): 0.16 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.727370E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.921 | TFLOPs: 25.77 | +7: iteration 64990/ 173500 | consumed samples: 16637440 | consumed tokens: 34073477120 | elapsed time per iteration (s): 0.16 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.733989E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.144 | TFLOPs: 25.41 | +7: iteration 65000/ 173500 | consumed samples: 16640000 | consumed tokens: 34078720000 | elapsed time per iteration (s): 0.15 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.746989E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.361 | TFLOPs: 26.13 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 65000 | lm loss value: 3.852672E+00 | lm loss PPL: 4.711878E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 65000 to checkpoints_44m91b100m +0: [2023-03-17 03:03:53,741] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step65000 is begin to save! +0: [2023-03-17 03:03:53,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:03:53,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:03:53,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:03:53,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:03:53,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:03:53,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:03:53,824] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:03:53,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:03:53,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:03:53,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:03:53,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:03:53,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:03:53,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:03:53,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:03:53,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:03:53,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:03:53,865] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:03:53,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:03:53,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:03:53,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:03:53,874] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step65000/mp_rank_00_model_states.pt +0: [2023-03-17 03:03:53,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:03:53,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:03:53,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:03:53,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:03:53,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:03:53,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +6: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +5: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +1: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +4: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +2: [2023-03-17 03:03:53,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:03:53,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:03:53,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +3: [2023-03-17 03:03:53,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:03:53,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:03:53,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: [2023-03-17 03:03:53,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:03:53,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step65000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:03:53,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step65000 is ready now! +0: successfully saved checkpoint at iteration 65000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.39 +7: iteration 65010/ 173500 | consumed samples: 16642560 | consumed tokens: 34083962880 | elapsed time per iteration (s): 0.18 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.727288E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.535 | TFLOPs: 21.93 | +7: iteration 65020/ 173500 | consumed samples: 16645120 | consumed tokens: 34089205760 | elapsed time per iteration (s): 0.16 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.720353E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.465 | TFLOPs: 25.37 | +7: iteration 65030/ 173500 | consumed samples: 16647680 | consumed tokens: 34094448640 | elapsed time per iteration (s): 0.16 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.739478E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.285 | TFLOPs: 25.13 | +7: iteration 65040/ 173500 | consumed samples: 16650240 | consumed tokens: 34099691520 | elapsed time per iteration (s): 0.16 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.723249E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.921 | TFLOPs: 25.72 | +7: iteration 65050/ 173500 | consumed samples: 16652800 | consumed tokens: 34104934400 | elapsed time per iteration (s): 0.15 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.710775E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.430 | TFLOPs: 26.17 | +7: iteration 65060/ 173500 | consumed samples: 16655360 | consumed tokens: 34110177280 | elapsed time per iteration (s): 0.16 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.735825E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.681 | TFLOPs: 25.45 | +7: iteration 65070/ 173500 | consumed samples: 16657920 | consumed tokens: 34115420160 | elapsed time per iteration (s): 0.16 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.727036E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.844 | TFLOPs: 25.73 | +7: iteration 65080/ 173500 | consumed samples: 16660480 | consumed tokens: 34120663040 | elapsed time per iteration (s): 0.15 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.738476E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.938 | TFLOPs: 26.14 | +7: iteration 65090/ 173500 | consumed samples: 16663040 | consumed tokens: 34125905920 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.735627E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.785 | TFLOPs: 25.09 | +7: iteration 65100/ 173500 | consumed samples: 16665600 | consumed tokens: 34131148800 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.734414E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.960 | TFLOPs: 25.70 | +7: iteration 65110/ 173500 | consumed samples: 16668160 | consumed tokens: 34136391680 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.731673E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.129 | TFLOPs: 25.64 | +7: iteration 65120/ 173500 | consumed samples: 16670720 | consumed tokens: 34141634560 | elapsed time per iteration (s): 0.15 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.730296E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.114 | TFLOPs: 26.14 | +7: iteration 65130/ 173500 | consumed samples: 16673280 | consumed tokens: 34146877440 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.731666E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.659 | TFLOPs: 25.34 | +7: iteration 65140/ 173500 | consumed samples: 16675840 | consumed tokens: 34152120320 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.723573E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.246 | TFLOPs: 24.37 | +7: iteration 65150/ 173500 | consumed samples: 16678400 | consumed tokens: 34157363200 | elapsed time per iteration (s): 0.16 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.724319E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.445 | TFLOPs: 25.69 | +7: iteration 65160/ 173500 | consumed samples: 16680960 | consumed tokens: 34162606080 | elapsed time per iteration (s): 0.16 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.742012E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.833 | TFLOPs: 25.34 | +7: iteration 65170/ 173500 | consumed samples: 16683520 | consumed tokens: 34167848960 | elapsed time per iteration (s): 0.16 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.743399E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.338 | TFLOPs: 25.85 | +7: iteration 65180/ 173500 | consumed samples: 16686080 | consumed tokens: 34173091840 | elapsed time per iteration (s): 0.16 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.725196E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.957 | TFLOPs: 25.84 | +7: iteration 65190/ 173500 | consumed samples: 16688640 | consumed tokens: 34178334720 | elapsed time per iteration (s): 0.16 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.727870E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.025 | TFLOPs: 25.30 | +7: iteration 65200/ 173500 | consumed samples: 16691200 | consumed tokens: 34183577600 | elapsed time per iteration (s): 0.15 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.730371E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.720 | TFLOPs: 26.00 | +7: iteration 65210/ 173500 | consumed samples: 16693760 | consumed tokens: 34188820480 | elapsed time per iteration (s): 0.16 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.726978E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.201 | TFLOPs: 25.74 | +7: iteration 65220/ 173500 | consumed samples: 16696320 | consumed tokens: 34194063360 | elapsed time per iteration (s): 0.15 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.722845E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.267 | TFLOPs: 26.10 | +7: iteration 65230/ 173500 | consumed samples: 16698880 | consumed tokens: 34199306240 | elapsed time per iteration (s): 0.16 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.738951E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.278 | TFLOPs: 25.65 | +7: iteration 65240/ 173500 | consumed samples: 16701440 | consumed tokens: 34204549120 | elapsed time per iteration (s): 0.15 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.738946E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.965 | TFLOPs: 26.05 | +7: iteration 65250/ 173500 | consumed samples: 16704000 | consumed tokens: 34209792000 | elapsed time per iteration (s): 0.16 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.728111E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.251 | TFLOPs: 25.60 | +7: iteration 65260/ 173500 | consumed samples: 16706560 | consumed tokens: 34215034880 | elapsed time per iteration (s): 0.16 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.734182E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.681 | TFLOPs: 24.44 | +7: iteration 65270/ 173500 | consumed samples: 16709120 | consumed tokens: 34220277760 | elapsed time per iteration (s): 0.16 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.727743E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.504 | TFLOPs: 24.60 | +7: iteration 65280/ 173500 | consumed samples: 16711680 | consumed tokens: 34225520640 | elapsed time per iteration (s): 0.16 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.729097E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.947 | TFLOPs: 25.09 | +7: iteration 65290/ 173500 | consumed samples: 16714240 | consumed tokens: 34230763520 | elapsed time per iteration (s): 0.16 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.717635E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.925 | TFLOPs: 25.86 | +7: iteration 65300/ 173500 | consumed samples: 16716800 | consumed tokens: 34236006400 | elapsed time per iteration (s): 0.15 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.724678E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.102 | TFLOPs: 26.11 | +7: iteration 65310/ 173500 | consumed samples: 16719360 | consumed tokens: 34241249280 | elapsed time per iteration (s): 0.15 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.721759E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.469 | TFLOPs: 26.13 | +7: iteration 65320/ 173500 | consumed samples: 16721920 | consumed tokens: 34246492160 | elapsed time per iteration (s): 0.16 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.723090E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.180 | TFLOPs: 25.39 | +7: iteration 65330/ 173500 | consumed samples: 16724480 | consumed tokens: 34251735040 | elapsed time per iteration (s): 0.16 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.738677E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.721 | TFLOPs: 25.84 | +7: iteration 65340/ 173500 | consumed samples: 16727040 | consumed tokens: 34256977920 | elapsed time per iteration (s): 0.16 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.728435E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.166 | TFLOPs: 25.13 | +7: iteration 65350/ 173500 | consumed samples: 16729600 | consumed tokens: 34262220800 | elapsed time per iteration (s): 0.16 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.740753E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.643 | TFLOPs: 25.82 | +7: iteration 65360/ 173500 | consumed samples: 16732160 | consumed tokens: 34267463680 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.731751E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.594 | TFLOPs: 25.82 | +7: iteration 65370/ 173500 | consumed samples: 16734720 | consumed tokens: 34272706560 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.727397E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.532 | TFLOPs: 24.97 | +7: iteration 65380/ 173500 | consumed samples: 16737280 | consumed tokens: 34277949440 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.732159E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.023 | TFLOPs: 25.36 | +7: iteration 65390/ 173500 | consumed samples: 16739840 | consumed tokens: 34283192320 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.726755E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.734 | TFLOPs: 25.86 | +7: iteration 65400/ 173500 | consumed samples: 16742400 | consumed tokens: 34288435200 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.732446E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.251 | TFLOPs: 25.85 | +7: iteration 65410/ 173500 | consumed samples: 16744960 | consumed tokens: 34293678080 | elapsed time per iteration (s): 0.16 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.739156E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.921 | TFLOPs: 25.47 | +7: iteration 65420/ 173500 | consumed samples: 16747520 | consumed tokens: 34298920960 | elapsed time per iteration (s): 0.16 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.740806E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.397 | TFLOPs: 24.52 | +7: iteration 65430/ 173500 | consumed samples: 16750080 | consumed tokens: 34304163840 | elapsed time per iteration (s): 0.16 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.714783E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.431 | TFLOPs: 25.22 | +7: iteration 65440/ 173500 | consumed samples: 16752640 | consumed tokens: 34309406720 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.723903E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.390 | TFLOPs: 26.16 | +7: iteration 65450/ 173500 | consumed samples: 16755200 | consumed tokens: 34314649600 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.731683E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.302 | TFLOPs: 26.13 | +7: iteration 65460/ 173500 | consumed samples: 16757760 | consumed tokens: 34319892480 | elapsed time per iteration (s): 0.15 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.729426E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.909 | TFLOPs: 26.16 | +7: iteration 65470/ 173500 | consumed samples: 16760320 | consumed tokens: 34325135360 | elapsed time per iteration (s): 0.16 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.736507E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.192 | TFLOPs: 25.82 | +7: iteration 65480/ 173500 | consumed samples: 16762880 | consumed tokens: 34330378240 | elapsed time per iteration (s): 0.16 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.728327E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.688 | TFLOPs: 25.89 | +7: iteration 65490/ 173500 | consumed samples: 16765440 | consumed tokens: 34335621120 | elapsed time per iteration (s): 0.16 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.727816E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.800 | TFLOPs: 24.46 | +7: iteration 65500/ 173500 | consumed samples: 16768000 | consumed tokens: 34340864000 | elapsed time per iteration (s): 0.16 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.725826E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.579 | TFLOPs: 25.43 | +7: iteration 65510/ 173500 | consumed samples: 16770560 | consumed tokens: 34346106880 | elapsed time per iteration (s): 0.15 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.733590E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.304 | TFLOPs: 26.13 | +7: iteration 65520/ 173500 | consumed samples: 16773120 | consumed tokens: 34351349760 | elapsed time per iteration (s): 0.16 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.729130E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.773 | TFLOPs: 25.70 | +7: iteration 65530/ 173500 | consumed samples: 16775680 | consumed tokens: 34356592640 | elapsed time per iteration (s): 0.16 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.735968E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.057 | TFLOPs: 25.34 | +7: iteration 65540/ 173500 | consumed samples: 16778240 | consumed tokens: 34361835520 | elapsed time per iteration (s): 0.15 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.723531E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.963 | TFLOPs: 26.10 | +7: iteration 65550/ 173500 | consumed samples: 16780800 | consumed tokens: 34367078400 | elapsed time per iteration (s): 0.16 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.736886E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.235 | TFLOPs: 25.69 | +7: iteration 65560/ 173500 | consumed samples: 16783360 | consumed tokens: 34372321280 | elapsed time per iteration (s): 0.16 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.729038E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.981 | TFLOPs: 25.41 | +7: iteration 65570/ 173500 | consumed samples: 16785920 | consumed tokens: 34377564160 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.730254E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.483 | TFLOPs: 26.07 | +7: iteration 65580/ 173500 | consumed samples: 16788480 | consumed tokens: 34382807040 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.723632E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.054 | TFLOPs: 26.08 | +7: iteration 65590/ 173500 | consumed samples: 16791040 | consumed tokens: 34388049920 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.716315E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.241 | TFLOPs: 26.01 | +7: iteration 65600/ 173500 | consumed samples: 16793600 | consumed tokens: 34393292800 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.729882E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.888 | TFLOPs: 26.16 | +7: iteration 65610/ 173500 | consumed samples: 16796160 | consumed tokens: 34398535680 | elapsed time per iteration (s): 0.15 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.736185E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.968 | TFLOPs: 26.30 | +7: iteration 65620/ 173500 | consumed samples: 16798720 | consumed tokens: 34403778560 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.724903E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.878 | TFLOPs: 25.91 | +7: iteration 65630/ 173500 | consumed samples: 16801280 | consumed tokens: 34409021440 | elapsed time per iteration (s): 0.16 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.728943E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.351 | TFLOPs: 25.32 | +7: iteration 65640/ 173500 | consumed samples: 16803840 | consumed tokens: 34414264320 | elapsed time per iteration (s): 0.16 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.744173E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.743 | TFLOPs: 25.54 | +7: iteration 65650/ 173500 | consumed samples: 16806400 | consumed tokens: 34419507200 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.728619E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.551 | TFLOPs: 26.23 | +7: iteration 65660/ 173500 | consumed samples: 16808960 | consumed tokens: 34424750080 | elapsed time per iteration (s): 0.15 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.727085E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.240 | TFLOPs: 26.29 | +7: iteration 65670/ 173500 | consumed samples: 16811520 | consumed tokens: 34429992960 | elapsed time per iteration (s): 0.16 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.729544E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.259 | TFLOPs: 25.39 | +7: iteration 65680/ 173500 | consumed samples: 16814080 | consumed tokens: 34435235840 | elapsed time per iteration (s): 0.16 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.725891E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.556 | TFLOPs: 25.60 | +7: iteration 65690/ 173500 | consumed samples: 16816640 | consumed tokens: 34440478720 | elapsed time per iteration (s): 0.15 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.728613E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.025 | TFLOPs: 25.94 | +7: iteration 65700/ 173500 | consumed samples: 16819200 | consumed tokens: 34445721600 | elapsed time per iteration (s): 0.15 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.730663E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.808 | TFLOPs: 25.90 | +7: iteration 65710/ 173500 | consumed samples: 16821760 | consumed tokens: 34450964480 | elapsed time per iteration (s): 0.16 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.739797E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.263 | TFLOPs: 25.71 | +7: iteration 65720/ 173500 | consumed samples: 16824320 | consumed tokens: 34456207360 | elapsed time per iteration (s): 0.15 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.728467E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.457 | TFLOPs: 25.95 | +7: iteration 65730/ 173500 | consumed samples: 16826880 | consumed tokens: 34461450240 | elapsed time per iteration (s): 0.16 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.725406E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.938 | TFLOPs: 25.58 | +7: iteration 65740/ 173500 | consumed samples: 16829440 | consumed tokens: 34466693120 | elapsed time per iteration (s): 0.16 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.728162E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.822 | TFLOPs: 25.42 | +7: iteration 65750/ 173500 | consumed samples: 16832000 | consumed tokens: 34471936000 | elapsed time per iteration (s): 0.15 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.721525E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.788 | TFLOPs: 26.16 | +7: iteration 65760/ 173500 | consumed samples: 16834560 | consumed tokens: 34477178880 | elapsed time per iteration (s): 0.16 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.717592E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.319 | TFLOPs: 24.88 | +7: iteration 65770/ 173500 | consumed samples: 16837120 | consumed tokens: 34482421760 | elapsed time per iteration (s): 0.15 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.729286E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.688 | TFLOPs: 26.15 | +7: iteration 65780/ 173500 | consumed samples: 16839680 | consumed tokens: 34487664640 | elapsed time per iteration (s): 0.15 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.719309E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.785 | TFLOPs: 26.14 | +7: iteration 65790/ 173500 | consumed samples: 16842240 | consumed tokens: 34492907520 | elapsed time per iteration (s): 0.16 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.728168E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.647 | TFLOPs: 24.90 | +7: iteration 65800/ 173500 | consumed samples: 16844800 | consumed tokens: 34498150400 | elapsed time per iteration (s): 0.16 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.732801E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.568 | TFLOPs: 25.82 | +7: iteration 65810/ 173500 | consumed samples: 16847360 | consumed tokens: 34503393280 | elapsed time per iteration (s): 0.16 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.724051E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.252 | TFLOPs: 25.49 | +7: iteration 65820/ 173500 | consumed samples: 16849920 | consumed tokens: 34508636160 | elapsed time per iteration (s): 0.15 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.724398E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.954 | TFLOPs: 26.16 | +7: iteration 65830/ 173500 | consumed samples: 16852480 | consumed tokens: 34513879040 | elapsed time per iteration (s): 0.16 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.728019E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.806 | TFLOPs: 25.14 | +7: iteration 65840/ 173500 | consumed samples: 16855040 | consumed tokens: 34519121920 | elapsed time per iteration (s): 0.16 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.726552E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.478 | TFLOPs: 25.70 | +7: iteration 65850/ 173500 | consumed samples: 16857600 | consumed tokens: 34524364800 | elapsed time per iteration (s): 0.16 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.733375E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.179 | TFLOPs: 25.74 | +7: iteration 65860/ 173500 | consumed samples: 16860160 | consumed tokens: 34529607680 | elapsed time per iteration (s): 0.15 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.722087E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.705 | TFLOPs: 26.17 | +7: iteration 65870/ 173500 | consumed samples: 16862720 | consumed tokens: 34534850560 | elapsed time per iteration (s): 0.16 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.725359E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.283 | TFLOPs: 25.74 | +7: iteration 65880/ 173500 | consumed samples: 16865280 | consumed tokens: 34540093440 | elapsed time per iteration (s): 0.15 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.730893E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.714 | TFLOPs: 26.22 | +7: iteration 65890/ 173500 | consumed samples: 16867840 | consumed tokens: 34545336320 | elapsed time per iteration (s): 0.16 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.732674E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.689 | TFLOPs: 25.39 | +7: iteration 65900/ 173500 | consumed samples: 16870400 | consumed tokens: 34550579200 | elapsed time per iteration (s): 0.16 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.725787E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.268 | TFLOPs: 25.88 | +7: iteration 65910/ 173500 | consumed samples: 16872960 | consumed tokens: 34555822080 | elapsed time per iteration (s): 0.15 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.741158E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.314 | TFLOPs: 25.93 | +7: iteration 65920/ 173500 | consumed samples: 16875520 | consumed tokens: 34561064960 | elapsed time per iteration (s): 0.16 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.730258E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.799 | TFLOPs: 25.25 | +7: iteration 65930/ 173500 | consumed samples: 16878080 | consumed tokens: 34566307840 | elapsed time per iteration (s): 0.16 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.716072E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.329 | TFLOPs: 25.63 | +7: iteration 65940/ 173500 | consumed samples: 16880640 | consumed tokens: 34571550720 | elapsed time per iteration (s): 0.16 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.717664E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.503 | TFLOPs: 25.51 | +7: iteration 65950/ 173500 | consumed samples: 16883200 | consumed tokens: 34576793600 | elapsed time per iteration (s): 0.16 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.727763E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.870 | TFLOPs: 25.31 | +7: iteration 65960/ 173500 | consumed samples: 16885760 | consumed tokens: 34582036480 | elapsed time per iteration (s): 0.16 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.734541E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.872 | TFLOPs: 25.07 | +7: iteration 65970/ 173500 | consumed samples: 16888320 | consumed tokens: 34587279360 | elapsed time per iteration (s): 0.15 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.735371E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.413 | TFLOPs: 26.21 | +7: iteration 65980/ 173500 | consumed samples: 16890880 | consumed tokens: 34592522240 | elapsed time per iteration (s): 0.16 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.721209E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.973 | TFLOPs: 25.80 | +7: iteration 65990/ 173500 | consumed samples: 16893440 | consumed tokens: 34597765120 | elapsed time per iteration (s): 0.15 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.720415E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.655 | TFLOPs: 25.95 | +0: [2023-03-17 03:06:30,464] [INFO] [logging.py:68:log_dist] [Rank 0] step=66000, skipped=0, lr=[0.00014466507355770288, 0.00014466507355770288, 0.00014466507355770288], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 66000/ 173500 | consumed samples: 16896000 | consumed tokens: 34603008000 | elapsed time per iteration (s): 0.16 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.714755E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.698 | TFLOPs: 25.53 | +0: steps: 66000 loss: 3.6286 iter time (s): 0.155 samples/sec: 1648.736 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 66000 | lm loss value: 3.903466E+00 | lm loss PPL: 4.957400E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 66000 to checkpoints_44m91b100m +0: [2023-03-17 03:06:30,537] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step66000 is begin to save! +0: [2023-03-17 03:06:30,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:06:30,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:06:30,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:06:30,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:06:30,616] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:06:30,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:06:30,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:06:30,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:06:30,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:06:30,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:06:30,639] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:06:30,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:06:30,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:06:30,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:06:30,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:06:30,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:06:30,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:06:30,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:06:30,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:06:30,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:06:30,673] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step66000/mp_rank_00_model_states.pt +0: [2023-03-17 03:06:30,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:06:30,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:06:30,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:06:30,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +6: [2023-03-17 03:06:30,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +6: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +6: [2023-03-17 03:06:30,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +6: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +2: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +4: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +1: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +6: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +3: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +7: [2023-03-17 03:06:30,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:06:30,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:06:30,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +5: [2023-03-17 03:06:30,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:06:30,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step66000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:06:30,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step66000 is ready now! +0: successfully saved checkpoint at iteration 66000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 185.26 +7: iteration 66010/ 173500 | consumed samples: 16898560 | consumed tokens: 34608250880 | elapsed time per iteration (s): 0.19 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.724079E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.048 | TFLOPs: 21.49 | +7: iteration 66020/ 173500 | consumed samples: 16901120 | consumed tokens: 34613493760 | elapsed time per iteration (s): 0.16 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.714078E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.580 | TFLOPs: 25.43 | +7: iteration 66030/ 173500 | consumed samples: 16903680 | consumed tokens: 34618736640 | elapsed time per iteration (s): 0.16 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.719412E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.861 | TFLOPs: 25.23 | +7: iteration 66040/ 173500 | consumed samples: 16906240 | consumed tokens: 34623979520 | elapsed time per iteration (s): 0.16 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.736488E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.787 | TFLOPs: 25.18 | +7: iteration 66050/ 173500 | consumed samples: 16908800 | consumed tokens: 34629222400 | elapsed time per iteration (s): 0.16 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.718571E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.785 | TFLOPs: 25.84 | +7: iteration 66060/ 173500 | consumed samples: 16911360 | consumed tokens: 34634465280 | elapsed time per iteration (s): 0.15 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.739609E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.899 | TFLOPs: 26.06 | +7: iteration 66070/ 173500 | consumed samples: 16913920 | consumed tokens: 34639708160 | elapsed time per iteration (s): 0.16 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.732133E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.845 | TFLOPs: 25.72 | +7: iteration 66080/ 173500 | consumed samples: 16916480 | consumed tokens: 34644951040 | elapsed time per iteration (s): 0.16 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.740215E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.313 | TFLOPs: 25.29 | +7: iteration 66090/ 173500 | consumed samples: 16919040 | consumed tokens: 34650193920 | elapsed time per iteration (s): 0.15 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.734786E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.952 | TFLOPs: 26.00 | +7: iteration 66100/ 173500 | consumed samples: 16921600 | consumed tokens: 34655436800 | elapsed time per iteration (s): 0.15 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.728486E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.521 | TFLOPs: 26.21 | +7: iteration 66110/ 173500 | consumed samples: 16924160 | consumed tokens: 34660679680 | elapsed time per iteration (s): 0.16 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.736100E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.611 | TFLOPs: 25.26 | +7: iteration 66120/ 173500 | consumed samples: 16926720 | consumed tokens: 34665922560 | elapsed time per iteration (s): 0.16 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.734498E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.310 | TFLOPs: 25.83 | +7: iteration 66130/ 173500 | consumed samples: 16929280 | consumed tokens: 34671165440 | elapsed time per iteration (s): 0.16 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.742342E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.275 | TFLOPs: 25.58 | +7: iteration 66140/ 173500 | consumed samples: 16931840 | consumed tokens: 34676408320 | elapsed time per iteration (s): 0.16 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.729825E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.949 | TFLOPs: 25.88 | +7: iteration 66150/ 173500 | consumed samples: 16934400 | consumed tokens: 34681651200 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.733660E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.511 | TFLOPs: 26.01 | +7: iteration 66160/ 173500 | consumed samples: 16936960 | consumed tokens: 34686894080 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.718122E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.784 | TFLOPs: 26.23 | +7: iteration 66170/ 173500 | consumed samples: 16939520 | consumed tokens: 34692136960 | elapsed time per iteration (s): 0.16 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.715494E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.930 | TFLOPs: 25.70 | +7: iteration 66180/ 173500 | consumed samples: 16942080 | consumed tokens: 34697379840 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.735642E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.036 | TFLOPs: 25.91 | +7: iteration 66190/ 173500 | consumed samples: 16944640 | consumed tokens: 34702622720 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.735625E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.719 | TFLOPs: 25.97 | +7: iteration 66200/ 173500 | consumed samples: 16947200 | consumed tokens: 34707865600 | elapsed time per iteration (s): 0.15 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.740053E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.877 | TFLOPs: 25.97 | +7: iteration 66210/ 173500 | consumed samples: 16949760 | consumed tokens: 34713108480 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.733351E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.892 | TFLOPs: 25.76 | +7: iteration 66220/ 173500 | consumed samples: 16952320 | consumed tokens: 34718351360 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.735580E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.943 | TFLOPs: 24.92 | +7: iteration 66230/ 173500 | consumed samples: 16954880 | consumed tokens: 34723594240 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.739793E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.023 | TFLOPs: 25.77 | +7: iteration 66240/ 173500 | consumed samples: 16957440 | consumed tokens: 34728837120 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.732322E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.621 | TFLOPs: 25.85 | +7: iteration 66250/ 173500 | consumed samples: 16960000 | consumed tokens: 34734080000 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.720669E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.450 | TFLOPs: 25.08 | +7: iteration 66260/ 173500 | consumed samples: 16962560 | consumed tokens: 34739322880 | elapsed time per iteration (s): 0.15 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.725009E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.433 | TFLOPs: 25.91 | +7: iteration 66270/ 173500 | consumed samples: 16965120 | consumed tokens: 34744565760 | elapsed time per iteration (s): 0.16 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.722072E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.169 | TFLOPs: 25.39 | +7: iteration 66280/ 173500 | consumed samples: 16967680 | consumed tokens: 34749808640 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.739409E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.944 | TFLOPs: 24.65 | +7: iteration 66290/ 173500 | consumed samples: 16970240 | consumed tokens: 34755051520 | elapsed time per iteration (s): 0.17 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.727674E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.782 | TFLOPs: 23.94 | +7: iteration 66300/ 173500 | consumed samples: 16972800 | consumed tokens: 34760294400 | elapsed time per iteration (s): 0.17 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.733610E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.520 | TFLOPs: 23.12 | +7: iteration 66310/ 173500 | consumed samples: 16975360 | consumed tokens: 34765537280 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.714465E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.954 | TFLOPs: 25.12 | +7: iteration 66320/ 173500 | consumed samples: 16977920 | consumed tokens: 34770780160 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.730816E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.520 | TFLOPs: 25.51 | +7: iteration 66330/ 173500 | consumed samples: 16980480 | consumed tokens: 34776023040 | elapsed time per iteration (s): 0.16 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.724845E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.854 | TFLOPs: 25.22 | +7: iteration 66340/ 173500 | consumed samples: 16983040 | consumed tokens: 34781265920 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.731744E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.063 | TFLOPs: 25.25 | +7: iteration 66350/ 173500 | consumed samples: 16985600 | consumed tokens: 34786508800 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.733607E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.337 | TFLOPs: 25.44 | +7: iteration 66360/ 173500 | consumed samples: 16988160 | consumed tokens: 34791751680 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.722248E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.868 | TFLOPs: 24.76 | +7: iteration 66370/ 173500 | consumed samples: 16990720 | consumed tokens: 34796994560 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.727408E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.846 | TFLOPs: 25.37 | +7: iteration 66380/ 173500 | consumed samples: 16993280 | consumed tokens: 34802237440 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.731704E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.659 | TFLOPs: 24.60 | +7: iteration 66390/ 173500 | consumed samples: 16995840 | consumed tokens: 34807480320 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.716270E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.835 | TFLOPs: 24.87 | +7: iteration 66400/ 173500 | consumed samples: 16998400 | consumed tokens: 34812723200 | elapsed time per iteration (s): 0.16 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.731448E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.591 | TFLOPs: 25.41 | +7: iteration 66410/ 173500 | consumed samples: 17000960 | consumed tokens: 34817966080 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.730590E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.961 | TFLOPs: 25.33 | +7: iteration 66420/ 173500 | consumed samples: 17003520 | consumed tokens: 34823208960 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.740563E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.490 | TFLOPs: 24.68 | +7: iteration 66430/ 173500 | consumed samples: 17006080 | consumed tokens: 34828451840 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.714376E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.556 | TFLOPs: 25.23 | +7: iteration 66440/ 173500 | consumed samples: 17008640 | consumed tokens: 34833694720 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.734701E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.775 | TFLOPs: 25.53 | +7: iteration 66450/ 173500 | consumed samples: 17011200 | consumed tokens: 34838937600 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.741957E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.583 | TFLOPs: 25.35 | +7: iteration 66460/ 173500 | consumed samples: 17013760 | consumed tokens: 34844180480 | elapsed time per iteration (s): 0.16 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.717717E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.355 | TFLOPs: 24.82 | +7: iteration 66470/ 173500 | consumed samples: 17016320 | consumed tokens: 34849423360 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.722707E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.087 | TFLOPs: 25.58 | +7: iteration 66480/ 173500 | consumed samples: 17018880 | consumed tokens: 34854666240 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.732272E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.192 | TFLOPs: 24.78 | +7: iteration 66490/ 173500 | consumed samples: 17021440 | consumed tokens: 34859909120 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.708538E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.163 | TFLOPs: 24.78 | +7: iteration 66500/ 173500 | consumed samples: 17024000 | consumed tokens: 34865152000 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.728317E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.977 | TFLOPs: 25.19 | +7: iteration 66510/ 173500 | consumed samples: 17026560 | consumed tokens: 34870394880 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.726805E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.988 | TFLOPs: 24.62 | +7: iteration 66520/ 173500 | consumed samples: 17029120 | consumed tokens: 34875637760 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.725566E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.478 | TFLOPs: 25.46 | +7: iteration 66530/ 173500 | consumed samples: 17031680 | consumed tokens: 34880880640 | elapsed time per iteration (s): 0.16 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.708335E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.210 | TFLOPs: 25.22 | +7: iteration 66540/ 173500 | consumed samples: 17034240 | consumed tokens: 34886123520 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.730646E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.150 | TFLOPs: 24.73 | +7: iteration 66550/ 173500 | consumed samples: 17036800 | consumed tokens: 34891366400 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.728373E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.598 | TFLOPs: 25.63 | +7: iteration 66560/ 173500 | consumed samples: 17039360 | consumed tokens: 34896609280 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.735347E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.863 | TFLOPs: 24.65 | +7: iteration 66570/ 173500 | consumed samples: 17041920 | consumed tokens: 34901852160 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.724971E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.530 | TFLOPs: 24.38 | +7: iteration 66580/ 173500 | consumed samples: 17044480 | consumed tokens: 34907095040 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.729765E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.483 | TFLOPs: 25.05 | +7: iteration 66590/ 173500 | consumed samples: 17047040 | consumed tokens: 34912337920 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.717112E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.834 | TFLOPs: 25.36 | +7: iteration 66600/ 173500 | consumed samples: 17049600 | consumed tokens: 34917580800 | elapsed time per iteration (s): 0.16 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.725474E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.214 | TFLOPs: 24.64 | +7: iteration 66610/ 173500 | consumed samples: 17052160 | consumed tokens: 34922823680 | elapsed time per iteration (s): 0.16 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.743631E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.369 | TFLOPs: 25.57 | +7: iteration 66620/ 173500 | consumed samples: 17054720 | consumed tokens: 34928066560 | elapsed time per iteration (s): 0.16 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.730853E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.971 | TFLOPs: 25.26 | +7: iteration 66630/ 173500 | consumed samples: 17057280 | consumed tokens: 34933309440 | elapsed time per iteration (s): 0.16 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.740992E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.675 | TFLOPs: 25.24 | +7: iteration 66640/ 173500 | consumed samples: 17059840 | consumed tokens: 34938552320 | elapsed time per iteration (s): 0.16 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.718132E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.406 | TFLOPs: 25.69 | +7: iteration 66650/ 173500 | consumed samples: 17062400 | consumed tokens: 34943795200 | elapsed time per iteration (s): 0.16 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.723042E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.882 | TFLOPs: 24.35 | +7: iteration 66660/ 173500 | consumed samples: 17064960 | consumed tokens: 34949038080 | elapsed time per iteration (s): 0.15 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.722879E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.947 | TFLOPs: 25.91 | +7: iteration 66670/ 173500 | consumed samples: 17067520 | consumed tokens: 34954280960 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.724300E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.837 | TFLOPs: 24.76 | +7: iteration 66680/ 173500 | consumed samples: 17070080 | consumed tokens: 34959523840 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.724361E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.901 | TFLOPs: 25.81 | +7: iteration 66690/ 173500 | consumed samples: 17072640 | consumed tokens: 34964766720 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.724931E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.157 | TFLOPs: 24.50 | +7: iteration 66700/ 173500 | consumed samples: 17075200 | consumed tokens: 34970009600 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.734572E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.317 | TFLOPs: 25.19 | +7: iteration 66710/ 173500 | consumed samples: 17077760 | consumed tokens: 34975252480 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.738156E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.687 | TFLOPs: 25.23 | +7: iteration 66720/ 173500 | consumed samples: 17080320 | consumed tokens: 34980495360 | elapsed time per iteration (s): 0.17 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.730016E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.979 | TFLOPs: 23.34 | +7: iteration 66730/ 173500 | consumed samples: 17082880 | consumed tokens: 34985738240 | elapsed time per iteration (s): 0.16 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.727949E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.783 | TFLOPs: 24.63 | +7: iteration 66740/ 173500 | consumed samples: 17085440 | consumed tokens: 34990981120 | elapsed time per iteration (s): 0.16 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.734639E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.273 | TFLOPs: 24.83 | +7: iteration 66750/ 173500 | consumed samples: 17088000 | consumed tokens: 34996224000 | elapsed time per iteration (s): 0.16 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.734370E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.373 | TFLOPs: 25.02 | +7: iteration 66760/ 173500 | consumed samples: 17090560 | consumed tokens: 35001466880 | elapsed time per iteration (s): 0.16 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.724325E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.881 | TFLOPs: 25.18 | +7: iteration 66770/ 173500 | consumed samples: 17093120 | consumed tokens: 35006709760 | elapsed time per iteration (s): 0.17 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.733681E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.534 | TFLOPs: 23.25 | +7: iteration 66780/ 173500 | consumed samples: 17095680 | consumed tokens: 35011952640 | elapsed time per iteration (s): 0.16 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.723964E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.701 | TFLOPs: 25.78 | +7: iteration 66790/ 173500 | consumed samples: 17098240 | consumed tokens: 35017195520 | elapsed time per iteration (s): 0.16 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.732143E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.791 | TFLOPs: 25.18 | +7: iteration 66800/ 173500 | consumed samples: 17100800 | consumed tokens: 35022438400 | elapsed time per iteration (s): 0.15 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.728934E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.398 | TFLOPs: 26.02 | +7: iteration 66810/ 173500 | consumed samples: 17103360 | consumed tokens: 35027681280 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.736808E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.702 | TFLOPs: 25.29 | +7: iteration 66820/ 173500 | consumed samples: 17105920 | consumed tokens: 35032924160 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.730400E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.410 | TFLOPs: 24.88 | +7: iteration 66830/ 173500 | consumed samples: 17108480 | consumed tokens: 35038167040 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.737163E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.711 | TFLOPs: 25.32 | +7: iteration 66840/ 173500 | consumed samples: 17111040 | consumed tokens: 35043409920 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.731974E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.031 | TFLOPs: 24.62 | +7: iteration 66850/ 173500 | consumed samples: 17113600 | consumed tokens: 35048652800 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.729123E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.745 | TFLOPs: 25.15 | +7: iteration 66860/ 173500 | consumed samples: 17116160 | consumed tokens: 35053895680 | elapsed time per iteration (s): 0.16 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.713527E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.188 | TFLOPs: 25.33 | +7: iteration 66870/ 173500 | consumed samples: 17118720 | consumed tokens: 35059138560 | elapsed time per iteration (s): 0.16 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.729220E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.147 | TFLOPs: 25.50 | +7: iteration 66880/ 173500 | consumed samples: 17121280 | consumed tokens: 35064381440 | elapsed time per iteration (s): 0.15 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.719348E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.519 | TFLOPs: 25.96 | +7: iteration 66890/ 173500 | consumed samples: 17123840 | consumed tokens: 35069624320 | elapsed time per iteration (s): 0.16 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.724903E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.234 | TFLOPs: 25.16 | +7: iteration 66900/ 173500 | consumed samples: 17126400 | consumed tokens: 35074867200 | elapsed time per iteration (s): 0.16 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.725875E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.758 | TFLOPs: 25.84 | +7: iteration 66910/ 173500 | consumed samples: 17128960 | consumed tokens: 35080110080 | elapsed time per iteration (s): 0.16 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.738831E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.803 | TFLOPs: 24.49 | +7: iteration 66920/ 173500 | consumed samples: 17131520 | consumed tokens: 35085352960 | elapsed time per iteration (s): 0.16 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.728326E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.668 | TFLOPs: 25.06 | +7: iteration 66930/ 173500 | consumed samples: 17134080 | consumed tokens: 35090595840 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.727158E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.402 | TFLOPs: 26.02 | +7: iteration 66940/ 173500 | consumed samples: 17136640 | consumed tokens: 35095838720 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.725249E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.562 | TFLOPs: 26.21 | +7: iteration 66950/ 173500 | consumed samples: 17139200 | consumed tokens: 35101081600 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.726578E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.945 | TFLOPs: 26.11 | +7: iteration 66960/ 173500 | consumed samples: 17141760 | consumed tokens: 35106324480 | elapsed time per iteration (s): 0.16 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.725690E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.826 | TFLOPs: 25.84 | +7: iteration 66970/ 173500 | consumed samples: 17144320 | consumed tokens: 35111567360 | elapsed time per iteration (s): 0.15 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.726870E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.071 | TFLOPs: 26.24 | +7: iteration 66980/ 173500 | consumed samples: 17146880 | consumed tokens: 35116810240 | elapsed time per iteration (s): 0.16 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.736143E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.152 | TFLOPs: 25.42 | +7: iteration 66990/ 173500 | consumed samples: 17149440 | consumed tokens: 35122053120 | elapsed time per iteration (s): 0.16 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.729519E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.142 | TFLOPs: 25.28 | +7: iteration 67000/ 173500 | consumed samples: 17152000 | consumed tokens: 35127296000 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.730985E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.512 | TFLOPs: 26.24 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 67000 | lm loss value: 3.861304E+00 | lm loss PPL: 4.752727E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 67000 to checkpoints_44m91b100m +0: [2023-03-17 03:09:09,691] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step67000 is begin to save! +0: [2023-03-17 03:09:09,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:09:09,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:09:09,754] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:09:09,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:09:09,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:09:09,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:09:09,775] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:09:09,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:09:09,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:09:09,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:09:09,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:09:09,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:09:09,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:09:09,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:09:09,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:09:09,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:09:09,815] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:09:09,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:09:09,823] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:09:09,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:09:09,825] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step67000/mp_rank_00_model_states.pt +0: [2023-03-17 03:09:09,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:09:09,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:09:09,845] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:09:09,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:09:09,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,863] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:09:09,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +2: [2023-03-17 03:09:09,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +3: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +4: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +6: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 03:09:09,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +1: [2023-03-17 03:09:09,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:09:09,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:09:09,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +7: [2023-03-17 03:09:09,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:09:09,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step67000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:09:09,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step67000 is ready now! +0: successfully saved checkpoint at iteration 67000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.18 +7: iteration 67010/ 173500 | consumed samples: 17154560 | consumed tokens: 35132538880 | elapsed time per iteration (s): 0.18 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.728635E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.786 | TFLOPs: 22.47 | +7: iteration 67020/ 173500 | consumed samples: 17157120 | consumed tokens: 35137781760 | elapsed time per iteration (s): 0.16 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.727372E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.186 | TFLOPs: 25.63 | +7: iteration 67030/ 173500 | consumed samples: 17159680 | consumed tokens: 35143024640 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.725151E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.764 | TFLOPs: 26.22 | +7: iteration 67040/ 173500 | consumed samples: 17162240 | consumed tokens: 35148267520 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.733004E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.608 | TFLOPs: 26.20 | +7: iteration 67050/ 173500 | consumed samples: 17164800 | consumed tokens: 35153510400 | elapsed time per iteration (s): 0.15 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.720260E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.391 | TFLOPs: 26.32 | +7: iteration 67060/ 173500 | consumed samples: 17167360 | consumed tokens: 35158753280 | elapsed time per iteration (s): 0.15 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.738492E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.339 | TFLOPs: 26.30 | +7: iteration 67070/ 173500 | consumed samples: 17169920 | consumed tokens: 35163996160 | elapsed time per iteration (s): 0.15 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.749992E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.873 | TFLOPs: 26.27 | +7: iteration 67080/ 173500 | consumed samples: 17172480 | consumed tokens: 35169239040 | elapsed time per iteration (s): 0.16 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.718752E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.158 | TFLOPs: 25.66 | +7: iteration 67090/ 173500 | consumed samples: 17175040 | consumed tokens: 35174481920 | elapsed time per iteration (s): 0.15 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.711908E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.872 | TFLOPs: 25.95 | +7: iteration 67100/ 173500 | consumed samples: 17177600 | consumed tokens: 35179724800 | elapsed time per iteration (s): 0.16 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.716943E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.888 | TFLOPs: 25.34 | +7: iteration 67110/ 173500 | consumed samples: 17180160 | consumed tokens: 35184967680 | elapsed time per iteration (s): 0.16 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.734547E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.998 | TFLOPs: 25.88 | +7: iteration 67120/ 173500 | consumed samples: 17182720 | consumed tokens: 35190210560 | elapsed time per iteration (s): 0.16 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.721650E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.680 | TFLOPs: 25.62 | +7: iteration 67130/ 173500 | consumed samples: 17185280 | consumed tokens: 35195453440 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.717463E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.577 | TFLOPs: 25.84 | +7: iteration 67140/ 173500 | consumed samples: 17187840 | consumed tokens: 35200696320 | elapsed time per iteration (s): 0.15 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.727281E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.129 | TFLOPs: 26.36 | +7: iteration 67150/ 173500 | consumed samples: 17190400 | consumed tokens: 35205939200 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.731018E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.229 | TFLOPs: 25.83 | +7: iteration 67160/ 173500 | consumed samples: 17192960 | consumed tokens: 35211182080 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.722062E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.869 | TFLOPs: 25.76 | +7: iteration 67170/ 173500 | consumed samples: 17195520 | consumed tokens: 35216424960 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.725982E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.278 | TFLOPs: 25.65 | +7: iteration 67180/ 173500 | consumed samples: 17198080 | consumed tokens: 35221667840 | elapsed time per iteration (s): 0.16 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.740503E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.582 | TFLOPs: 25.85 | +7: iteration 67190/ 173500 | consumed samples: 17200640 | consumed tokens: 35226910720 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.724389E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.262 | TFLOPs: 24.53 | +7: iteration 67200/ 173500 | consumed samples: 17203200 | consumed tokens: 35232153600 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.721629E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.409 | TFLOPs: 25.90 | +7: iteration 67210/ 173500 | consumed samples: 17205760 | consumed tokens: 35237396480 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.713411E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.654 | TFLOPs: 25.10 | +7: iteration 67220/ 173500 | consumed samples: 17208320 | consumed tokens: 35242639360 | elapsed time per iteration (s): 0.15 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.726933E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.104 | TFLOPs: 26.13 | +7: iteration 67230/ 173500 | consumed samples: 17210880 | consumed tokens: 35247882240 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.724520E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.477 | TFLOPs: 25.63 | +7: iteration 67240/ 173500 | consumed samples: 17213440 | consumed tokens: 35253125120 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.743664E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.946 | TFLOPs: 24.81 | +7: iteration 67250/ 173500 | consumed samples: 17216000 | consumed tokens: 35258368000 | elapsed time per iteration (s): 0.16 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.730400E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.507 | TFLOPs: 24.79 | +7: iteration 67260/ 173500 | consumed samples: 17218560 | consumed tokens: 35263610880 | elapsed time per iteration (s): 0.15 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.733615E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.791 | TFLOPs: 26.20 | +7: iteration 67270/ 173500 | consumed samples: 17221120 | consumed tokens: 35268853760 | elapsed time per iteration (s): 0.16 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.725410E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.760 | TFLOPs: 25.67 | +7: iteration 67280/ 173500 | consumed samples: 17223680 | consumed tokens: 35274096640 | elapsed time per iteration (s): 0.16 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.727702E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.910 | TFLOPs: 25.81 | +7: iteration 67290/ 173500 | consumed samples: 17226240 | consumed tokens: 35279339520 | elapsed time per iteration (s): 0.15 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.724833E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.400 | TFLOPs: 26.21 | +7: iteration 67300/ 173500 | consumed samples: 17228800 | consumed tokens: 35284582400 | elapsed time per iteration (s): 0.16 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.724909E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.927 | TFLOPs: 25.14 | +7: iteration 67310/ 173500 | consumed samples: 17231360 | consumed tokens: 35289825280 | elapsed time per iteration (s): 0.16 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.729590E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.841 | TFLOPs: 25.87 | +7: iteration 67320/ 173500 | consumed samples: 17233920 | consumed tokens: 35295068160 | elapsed time per iteration (s): 0.16 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.731632E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.916 | TFLOPs: 25.59 | +7: iteration 67330/ 173500 | consumed samples: 17236480 | consumed tokens: 35300311040 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.720193E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.273 | TFLOPs: 26.19 | +7: iteration 67340/ 173500 | consumed samples: 17239040 | consumed tokens: 35305553920 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.723197E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.836 | TFLOPs: 26.16 | +7: iteration 67350/ 173500 | consumed samples: 17241600 | consumed tokens: 35310796800 | elapsed time per iteration (s): 0.16 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.732700E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.260 | TFLOPs: 25.68 | +7: iteration 67360/ 173500 | consumed samples: 17244160 | consumed tokens: 35316039680 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.710405E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.744 | TFLOPs: 26.09 | +7: iteration 67370/ 173500 | consumed samples: 17246720 | consumed tokens: 35321282560 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.730512E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.529 | TFLOPs: 26.10 | +7: iteration 67380/ 173500 | consumed samples: 17249280 | consumed tokens: 35326525440 | elapsed time per iteration (s): 0.15 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.713252E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.060 | TFLOPs: 26.13 | +7: iteration 67390/ 173500 | consumed samples: 17251840 | consumed tokens: 35331768320 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.719361E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.171 | TFLOPs: 26.21 | +7: iteration 67400/ 173500 | consumed samples: 17254400 | consumed tokens: 35337011200 | elapsed time per iteration (s): 0.16 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.725823E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.533 | TFLOPs: 25.01 | +7: iteration 67410/ 173500 | consumed samples: 17256960 | consumed tokens: 35342254080 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.729686E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.666 | TFLOPs: 26.28 | +7: iteration 67420/ 173500 | consumed samples: 17259520 | consumed tokens: 35347496960 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.726137E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.980 | TFLOPs: 26.17 | +7: iteration 67430/ 173500 | consumed samples: 17262080 | consumed tokens: 35352739840 | elapsed time per iteration (s): 0.15 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.717917E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.070 | TFLOPs: 26.11 | +7: iteration 67440/ 173500 | consumed samples: 17264640 | consumed tokens: 35357982720 | elapsed time per iteration (s): 0.16 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.722033E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.297 | TFLOPs: 25.88 | +7: iteration 67450/ 173500 | consumed samples: 17267200 | consumed tokens: 35363225600 | elapsed time per iteration (s): 0.16 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.733363E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.466 | TFLOPs: 25.88 | +7: iteration 67460/ 173500 | consumed samples: 17269760 | consumed tokens: 35368468480 | elapsed time per iteration (s): 0.15 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.727806E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.518 | TFLOPs: 26.07 | +7: iteration 67470/ 173500 | consumed samples: 17272320 | consumed tokens: 35373711360 | elapsed time per iteration (s): 0.16 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.730115E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.996 | TFLOPs: 25.80 | +7: iteration 67480/ 173500 | consumed samples: 17274880 | consumed tokens: 35378954240 | elapsed time per iteration (s): 0.16 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.736428E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.750 | TFLOPs: 25.59 | +7: iteration 67490/ 173500 | consumed samples: 17277440 | consumed tokens: 35384197120 | elapsed time per iteration (s): 0.16 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.723906E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.120 | TFLOPs: 25.83 | +7: iteration 67500/ 173500 | consumed samples: 17280000 | consumed tokens: 35389440000 | elapsed time per iteration (s): 0.16 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.721651E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.216 | TFLOPs: 25.86 | +7: iteration 67510/ 173500 | consumed samples: 17282560 | consumed tokens: 35394682880 | elapsed time per iteration (s): 0.15 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.715889E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.969 | TFLOPs: 26.27 | +7: iteration 67520/ 173500 | consumed samples: 17285120 | consumed tokens: 35399925760 | elapsed time per iteration (s): 0.16 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.722473E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.777 | TFLOPs: 24.90 | +7: iteration 67530/ 173500 | consumed samples: 17287680 | consumed tokens: 35405168640 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.716941E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.195 | TFLOPs: 26.21 | +7: iteration 67540/ 173500 | consumed samples: 17290240 | consumed tokens: 35410411520 | elapsed time per iteration (s): 0.16 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.722711E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.722 | TFLOPs: 25.48 | +7: iteration 67550/ 173500 | consumed samples: 17292800 | consumed tokens: 35415654400 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.718974E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.279 | TFLOPs: 26.18 | +7: iteration 67560/ 173500 | consumed samples: 17295360 | consumed tokens: 35420897280 | elapsed time per iteration (s): 0.15 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.734762E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.887 | TFLOPs: 25.95 | +7: iteration 67570/ 173500 | consumed samples: 17297920 | consumed tokens: 35426140160 | elapsed time per iteration (s): 0.16 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.715635E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.695 | TFLOPs: 25.31 | +7: iteration 67580/ 173500 | consumed samples: 17300480 | consumed tokens: 35431383040 | elapsed time per iteration (s): 0.16 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.734012E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.550 | TFLOPs: 25.68 | +7: iteration 67590/ 173500 | consumed samples: 17303040 | consumed tokens: 35436625920 | elapsed time per iteration (s): 0.15 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.717266E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.927 | TFLOPs: 26.30 | +7: iteration 67600/ 173500 | consumed samples: 17305600 | consumed tokens: 35441868800 | elapsed time per iteration (s): 0.16 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.723631E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.290 | TFLOPs: 25.71 | +7: iteration 67610/ 173500 | consumed samples: 17308160 | consumed tokens: 35447111680 | elapsed time per iteration (s): 0.15 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.736819E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.049 | TFLOPs: 26.19 | +7: iteration 67620/ 173500 | consumed samples: 17310720 | consumed tokens: 35452354560 | elapsed time per iteration (s): 0.15 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.731134E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.330 | TFLOPs: 26.32 | +7: iteration 67630/ 173500 | consumed samples: 17313280 | consumed tokens: 35457597440 | elapsed time per iteration (s): 0.16 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.738135E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.274 | TFLOPs: 25.08 | +7: iteration 67640/ 173500 | consumed samples: 17315840 | consumed tokens: 35462840320 | elapsed time per iteration (s): 0.16 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.747078E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.739 | TFLOPs: 25.70 | +7: iteration 67650/ 173500 | consumed samples: 17318400 | consumed tokens: 35468083200 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.719762E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.181 | TFLOPs: 26.18 | +7: iteration 67660/ 173500 | consumed samples: 17320960 | consumed tokens: 35473326080 | elapsed time per iteration (s): 0.16 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.737360E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.710 | TFLOPs: 25.79 | +7: iteration 67670/ 173500 | consumed samples: 17323520 | consumed tokens: 35478568960 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.720004E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.959 | TFLOPs: 26.00 | +7: iteration 67680/ 173500 | consumed samples: 17326080 | consumed tokens: 35483811840 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.732333E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.159 | TFLOPs: 26.22 | +7: iteration 67690/ 173500 | consumed samples: 17328640 | consumed tokens: 35489054720 | elapsed time per iteration (s): 0.16 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.726989E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.649 | TFLOPs: 25.75 | +7: iteration 67700/ 173500 | consumed samples: 17331200 | consumed tokens: 35494297600 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.718266E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.120 | TFLOPs: 26.29 | +7: iteration 67710/ 173500 | consumed samples: 17333760 | consumed tokens: 35499540480 | elapsed time per iteration (s): 0.15 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.719395E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.157 | TFLOPs: 26.22 | +7: iteration 67720/ 173500 | consumed samples: 17336320 | consumed tokens: 35504783360 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.732362E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.546 | TFLOPs: 26.17 | +7: iteration 67730/ 173500 | consumed samples: 17338880 | consumed tokens: 35510026240 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.733157E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.588 | TFLOPs: 26.23 | +7: iteration 67740/ 173500 | consumed samples: 17341440 | consumed tokens: 35515269120 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.726646E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.276 | TFLOPs: 26.16 | +7: iteration 67750/ 173500 | consumed samples: 17344000 | consumed tokens: 35520512000 | elapsed time per iteration (s): 0.16 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.723566E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.574 | TFLOPs: 25.87 | +7: iteration 67760/ 173500 | consumed samples: 17346560 | consumed tokens: 35525754880 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.720259E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.418 | TFLOPs: 26.06 | +7: iteration 67770/ 173500 | consumed samples: 17349120 | consumed tokens: 35530997760 | elapsed time per iteration (s): 0.15 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.735385E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.364 | TFLOPs: 26.04 | +7: iteration 67780/ 173500 | consumed samples: 17351680 | consumed tokens: 35536240640 | elapsed time per iteration (s): 0.16 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.721260E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.591 | TFLOPs: 24.54 | +7: iteration 67790/ 173500 | consumed samples: 17354240 | consumed tokens: 35541483520 | elapsed time per iteration (s): 0.16 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.733997E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.568 | TFLOPs: 25.54 | +7: iteration 67800/ 173500 | consumed samples: 17356800 | consumed tokens: 35546726400 | elapsed time per iteration (s): 0.16 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.720135E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.861 | TFLOPs: 24.62 | +7: iteration 67810/ 173500 | consumed samples: 17359360 | consumed tokens: 35551969280 | elapsed time per iteration (s): 0.15 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.713294E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.752 | TFLOPs: 26.14 | +7: iteration 67820/ 173500 | consumed samples: 17361920 | consumed tokens: 35557212160 | elapsed time per iteration (s): 0.15 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.719009E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.926 | TFLOPs: 26.14 | +7: iteration 67830/ 173500 | consumed samples: 17364480 | consumed tokens: 35562455040 | elapsed time per iteration (s): 0.16 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.734712E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.300 | TFLOPs: 25.71 | +7: iteration 67840/ 173500 | consumed samples: 17367040 | consumed tokens: 35567697920 | elapsed time per iteration (s): 0.16 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.707958E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.940 | TFLOPs: 25.75 | +7: iteration 67850/ 173500 | consumed samples: 17369600 | consumed tokens: 35572940800 | elapsed time per iteration (s): 0.16 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.726445E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.827 | TFLOPs: 25.62 | +7: iteration 67860/ 173500 | consumed samples: 17372160 | consumed tokens: 35578183680 | elapsed time per iteration (s): 0.16 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.717852E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.207 | TFLOPs: 25.69 | +7: iteration 67870/ 173500 | consumed samples: 17374720 | consumed tokens: 35583426560 | elapsed time per iteration (s): 0.16 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.723052E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.094 | TFLOPs: 25.89 | +7: iteration 67880/ 173500 | consumed samples: 17377280 | consumed tokens: 35588669440 | elapsed time per iteration (s): 0.16 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.740683E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.390 | TFLOPs: 25.68 | +7: iteration 67890/ 173500 | consumed samples: 17379840 | consumed tokens: 35593912320 | elapsed time per iteration (s): 0.15 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.738379E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.164 | TFLOPs: 26.02 | +7: iteration 67900/ 173500 | consumed samples: 17382400 | consumed tokens: 35599155200 | elapsed time per iteration (s): 0.15 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.726071E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.485 | TFLOPs: 26.01 | +7: iteration 67910/ 173500 | consumed samples: 17384960 | consumed tokens: 35604398080 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.729607E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.391 | TFLOPs: 25.73 | +7: iteration 67920/ 173500 | consumed samples: 17387520 | consumed tokens: 35609640960 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.717644E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.553 | TFLOPs: 25.65 | +7: iteration 67930/ 173500 | consumed samples: 17390080 | consumed tokens: 35614883840 | elapsed time per iteration (s): 0.15 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.718353E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.661 | TFLOPs: 26.14 | +7: iteration 67940/ 173500 | consumed samples: 17392640 | consumed tokens: 35620126720 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.714509E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.850 | TFLOPs: 25.72 | +7: iteration 67950/ 173500 | consumed samples: 17395200 | consumed tokens: 35625369600 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.734196E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.049 | TFLOPs: 25.64 | +7: iteration 67960/ 173500 | consumed samples: 17397760 | consumed tokens: 35630612480 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.721342E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.539 | TFLOPs: 25.84 | +7: iteration 67970/ 173500 | consumed samples: 17400320 | consumed tokens: 35635855360 | elapsed time per iteration (s): 0.16 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.728095E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.423 | TFLOPs: 25.33 | +7: iteration 67980/ 173500 | consumed samples: 17402880 | consumed tokens: 35641098240 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.731993E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.332 | TFLOPs: 26.09 | +7: iteration 67990/ 173500 | consumed samples: 17405440 | consumed tokens: 35646341120 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.698262E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.077 | TFLOPs: 26.13 | +0: [2023-03-17 03:11:45,335] [INFO] [logging.py:68:log_dist] [Rank 0] step=68000, skipped=0, lr=[0.00014160436454810027, 0.00014160436454810027, 0.00014160436454810027], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 68000/ 173500 | consumed samples: 17408000 | consumed tokens: 35651584000 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.732761E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.353 | TFLOPs: 26.27 | +0: steps: 68000 loss: 3.7358 iter time (s): 0.156 samples/sec: 1639.472 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 68000 | lm loss value: 3.884776E+00 | lm loss PPL: 4.865605E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 68000 to checkpoints_44m91b100m +0: [2023-03-17 03:11:45,408] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step68000 is begin to save! +0: [2023-03-17 03:11:45,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:11:45,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:11:45,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:11:45,483] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:11:45,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:11:45,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:11:45,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:11:45,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:11:45,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:11:45,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:11:45,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:11:45,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:11:45,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:11:45,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:11:45,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:11:45,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:11:45,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:11:45,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:11:45,540] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:11:45,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:11:45,541] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step68000/mp_rank_00_model_states.pt +0: [2023-03-17 03:11:45,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:11:45,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:11:45,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:11:45,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:11:45,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:11:45,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:11:45,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +5: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +6: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 03:11:45,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +1: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +2: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +7: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +3: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:11:45,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:11:45,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +4: [2023-03-17 03:11:45,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:11:45,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step68000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:11:45,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step68000 is ready now! +0: successfully saved checkpoint at iteration 68000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.12 +7: iteration 68010/ 173500 | consumed samples: 17410560 | consumed tokens: 35656826880 | elapsed time per iteration (s): 0.18 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.720811E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.743 | TFLOPs: 22.22 | +7: iteration 68020/ 173500 | consumed samples: 17413120 | consumed tokens: 35662069760 | elapsed time per iteration (s): 0.16 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.735696E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.957 | TFLOPs: 25.66 | +7: iteration 68030/ 173500 | consumed samples: 17415680 | consumed tokens: 35667312640 | elapsed time per iteration (s): 0.15 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.740187E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.814 | TFLOPs: 26.03 | +7: iteration 68040/ 173500 | consumed samples: 17418240 | consumed tokens: 35672555520 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.715423E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.082 | TFLOPs: 26.10 | +7: iteration 68050/ 173500 | consumed samples: 17420800 | consumed tokens: 35677798400 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.731839E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.541 | TFLOPs: 26.07 | +7: iteration 68060/ 173500 | consumed samples: 17423360 | consumed tokens: 35683041280 | elapsed time per iteration (s): 0.16 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.725657E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.882 | TFLOPs: 25.83 | +7: iteration 68070/ 173500 | consumed samples: 17425920 | consumed tokens: 35688284160 | elapsed time per iteration (s): 0.16 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.724942E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.289 | TFLOPs: 25.33 | +7: iteration 68080/ 173500 | consumed samples: 17428480 | consumed tokens: 35693527040 | elapsed time per iteration (s): 0.16 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.719087E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.432 | TFLOPs: 25.66 | +7: iteration 68090/ 173500 | consumed samples: 17431040 | consumed tokens: 35698769920 | elapsed time per iteration (s): 0.16 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.725300E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.538 | TFLOPs: 25.63 | +7: iteration 68100/ 173500 | consumed samples: 17433600 | consumed tokens: 35704012800 | elapsed time per iteration (s): 0.15 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.726052E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.236 | TFLOPs: 26.32 | +7: iteration 68110/ 173500 | consumed samples: 17436160 | consumed tokens: 35709255680 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.724561E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.481 | TFLOPs: 25.93 | +7: iteration 68120/ 173500 | consumed samples: 17438720 | consumed tokens: 35714498560 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.718754E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.271 | TFLOPs: 26.29 | +7: iteration 68130/ 173500 | consumed samples: 17441280 | consumed tokens: 35719741440 | elapsed time per iteration (s): 0.16 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.738140E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.657 | TFLOPs: 25.84 | +7: iteration 68140/ 173500 | consumed samples: 17443840 | consumed tokens: 35724984320 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.722499E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.411 | TFLOPs: 26.07 | +7: iteration 68150/ 173500 | consumed samples: 17446400 | consumed tokens: 35730227200 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.728739E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.463 | TFLOPs: 26.29 | +7: iteration 68160/ 173500 | consumed samples: 17448960 | consumed tokens: 35735470080 | elapsed time per iteration (s): 0.15 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.725488E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.436 | TFLOPs: 26.35 | +7: iteration 68170/ 173500 | consumed samples: 17451520 | consumed tokens: 35740712960 | elapsed time per iteration (s): 0.16 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.716795E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.240 | TFLOPs: 25.36 | +7: iteration 68180/ 173500 | consumed samples: 17454080 | consumed tokens: 35745955840 | elapsed time per iteration (s): 0.16 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.727251E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.655 | TFLOPs: 25.21 | +7: iteration 68190/ 173500 | consumed samples: 17456640 | consumed tokens: 35751198720 | elapsed time per iteration (s): 0.16 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.727980E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.732 | TFLOPs: 24.76 | +7: iteration 68200/ 173500 | consumed samples: 17459200 | consumed tokens: 35756441600 | elapsed time per iteration (s): 0.16 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.723243E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.721 | TFLOPs: 24.38 | +7: iteration 68210/ 173500 | consumed samples: 17461760 | consumed tokens: 35761684480 | elapsed time per iteration (s): 0.15 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.727794E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.243 | TFLOPs: 26.10 | +7: iteration 68220/ 173500 | consumed samples: 17464320 | consumed tokens: 35766927360 | elapsed time per iteration (s): 0.16 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.739307E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.239 | TFLOPs: 25.41 | +7: iteration 68230/ 173500 | consumed samples: 17466880 | consumed tokens: 35772170240 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.727378E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.620 | TFLOPs: 26.14 | +7: iteration 68240/ 173500 | consumed samples: 17469440 | consumed tokens: 35777413120 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.734894E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.201 | TFLOPs: 26.35 | +7: iteration 68250/ 173500 | consumed samples: 17472000 | consumed tokens: 35782656000 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.727772E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.717 | TFLOPs: 26.06 | +7: iteration 68260/ 173500 | consumed samples: 17474560 | consumed tokens: 35787898880 | elapsed time per iteration (s): 0.15 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.726373E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.365 | TFLOPs: 26.12 | +7: iteration 68270/ 173500 | consumed samples: 17477120 | consumed tokens: 35793141760 | elapsed time per iteration (s): 0.16 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.737914E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.614 | TFLOPs: 25.87 | +7: iteration 68280/ 173500 | consumed samples: 17479680 | consumed tokens: 35798384640 | elapsed time per iteration (s): 0.16 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.732572E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.859 | TFLOPs: 25.78 | +7: iteration 68290/ 173500 | consumed samples: 17482240 | consumed tokens: 35803627520 | elapsed time per iteration (s): 0.16 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.732878E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.711 | TFLOPs: 25.78 | +7: iteration 68300/ 173500 | consumed samples: 17484800 | consumed tokens: 35808870400 | elapsed time per iteration (s): 0.15 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.715214E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.572 | TFLOPs: 25.92 | +7: iteration 68310/ 173500 | consumed samples: 17487360 | consumed tokens: 35814113280 | elapsed time per iteration (s): 0.16 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.719040E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.801 | TFLOPs: 25.47 | +7: iteration 68320/ 173500 | consumed samples: 17489920 | consumed tokens: 35819356160 | elapsed time per iteration (s): 0.15 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.722540E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.614 | TFLOPs: 26.17 | +7: iteration 68330/ 173500 | consumed samples: 17492480 | consumed tokens: 35824599040 | elapsed time per iteration (s): 0.16 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.732262E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.195 | TFLOPs: 25.13 | +7: iteration 68340/ 173500 | consumed samples: 17495040 | consumed tokens: 35829841920 | elapsed time per iteration (s): 0.16 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.731158E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.529 | TFLOPs: 25.73 | +7: iteration 68350/ 173500 | consumed samples: 17497600 | consumed tokens: 35835084800 | elapsed time per iteration (s): 0.16 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.722292E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.188 | TFLOPs: 25.33 | +7: iteration 68360/ 173500 | consumed samples: 17500160 | consumed tokens: 35840327680 | elapsed time per iteration (s): 0.16 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.743119E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.964 | TFLOPs: 25.86 | +7: iteration 68370/ 173500 | consumed samples: 17502720 | consumed tokens: 35845570560 | elapsed time per iteration (s): 0.16 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.726980E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.802 | TFLOPs: 25.07 | +7: iteration 68380/ 173500 | consumed samples: 17505280 | consumed tokens: 35850813440 | elapsed time per iteration (s): 0.16 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.723578E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.387 | TFLOPs: 25.08 | +7: iteration 68390/ 173500 | consumed samples: 17507840 | consumed tokens: 35856056320 | elapsed time per iteration (s): 0.15 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.726912E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.880 | TFLOPs: 26.11 | +7: iteration 68400/ 173500 | consumed samples: 17510400 | consumed tokens: 35861299200 | elapsed time per iteration (s): 0.16 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.728815E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.866 | TFLOPs: 25.09 | +7: iteration 68410/ 173500 | consumed samples: 17512960 | consumed tokens: 35866542080 | elapsed time per iteration (s): 0.15 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.714024E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.394 | TFLOPs: 26.21 | +7: iteration 68420/ 173500 | consumed samples: 17515520 | consumed tokens: 35871784960 | elapsed time per iteration (s): 0.16 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.727373E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.384 | TFLOPs: 25.90 | +7: iteration 68430/ 173500 | consumed samples: 17518080 | consumed tokens: 35877027840 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.730350E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.026 | TFLOPs: 26.21 | +7: iteration 68440/ 173500 | consumed samples: 17520640 | consumed tokens: 35882270720 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.724239E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.762 | TFLOPs: 26.20 | +7: iteration 68450/ 173500 | consumed samples: 17523200 | consumed tokens: 35887513600 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.725656E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.466 | TFLOPs: 26.23 | +7: iteration 68460/ 173500 | consumed samples: 17525760 | consumed tokens: 35892756480 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.728513E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.607 | TFLOPs: 26.32 | +7: iteration 68470/ 173500 | consumed samples: 17528320 | consumed tokens: 35897999360 | elapsed time per iteration (s): 0.15 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.737010E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.439 | TFLOPs: 26.12 | +7: iteration 68480/ 173500 | consumed samples: 17530880 | consumed tokens: 35903242240 | elapsed time per iteration (s): 0.16 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.729120E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.190 | TFLOPs: 25.14 | +7: iteration 68490/ 173500 | consumed samples: 17533440 | consumed tokens: 35908485120 | elapsed time per iteration (s): 0.16 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.732640E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.858 | TFLOPs: 25.01 | +7: iteration 68500/ 173500 | consumed samples: 17536000 | consumed tokens: 35913728000 | elapsed time per iteration (s): 0.16 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.731009E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.699 | TFLOPs: 25.50 | +7: iteration 68510/ 173500 | consumed samples: 17538560 | consumed tokens: 35918970880 | elapsed time per iteration (s): 0.16 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.729800E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.953 | TFLOPs: 25.72 | +7: iteration 68520/ 173500 | consumed samples: 17541120 | consumed tokens: 35924213760 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.727512E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.092 | TFLOPs: 26.24 | +7: iteration 68530/ 173500 | consumed samples: 17543680 | consumed tokens: 35929456640 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.736257E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.914 | TFLOPs: 26.28 | +7: iteration 68540/ 173500 | consumed samples: 17546240 | consumed tokens: 35934699520 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.713378E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.198 | TFLOPs: 26.26 | +7: iteration 68550/ 173500 | consumed samples: 17548800 | consumed tokens: 35939942400 | elapsed time per iteration (s): 0.15 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.718773E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.343 | TFLOPs: 26.30 | +7: iteration 68560/ 173500 | consumed samples: 17551360 | consumed tokens: 35945185280 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.725553E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.838 | TFLOPs: 26.33 | +7: iteration 68570/ 173500 | consumed samples: 17553920 | consumed tokens: 35950428160 | elapsed time per iteration (s): 0.16 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.708026E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.626 | TFLOPs: 25.71 | +7: iteration 68580/ 173500 | consumed samples: 17556480 | consumed tokens: 35955671040 | elapsed time per iteration (s): 0.16 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.710257E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.096 | TFLOPs: 25.60 | +7: iteration 68590/ 173500 | consumed samples: 17559040 | consumed tokens: 35960913920 | elapsed time per iteration (s): 0.16 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.716616E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.170 | TFLOPs: 25.86 | +7: iteration 68600/ 173500 | consumed samples: 17561600 | consumed tokens: 35966156800 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.727940E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.734 | TFLOPs: 26.22 | +7: iteration 68610/ 173500 | consumed samples: 17564160 | consumed tokens: 35971399680 | elapsed time per iteration (s): 0.15 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.722663E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.357 | TFLOPs: 26.24 | +7: iteration 68620/ 173500 | consumed samples: 17566720 | consumed tokens: 35976642560 | elapsed time per iteration (s): 0.15 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.728727E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.417 | TFLOPs: 25.98 | +7: iteration 68630/ 173500 | consumed samples: 17569280 | consumed tokens: 35981885440 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.722904E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.551 | TFLOPs: 25.81 | +7: iteration 68640/ 173500 | consumed samples: 17571840 | consumed tokens: 35987128320 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.721469E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.530 | TFLOPs: 25.48 | +7: iteration 68650/ 173500 | consumed samples: 17574400 | consumed tokens: 35992371200 | elapsed time per iteration (s): 0.15 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.725000E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.393 | TFLOPs: 25.96 | +7: iteration 68660/ 173500 | consumed samples: 17576960 | consumed tokens: 35997614080 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.730233E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.820 | TFLOPs: 25.62 | +7: iteration 68670/ 173500 | consumed samples: 17579520 | consumed tokens: 36002856960 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.719885E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.108 | TFLOPs: 25.52 | +7: iteration 68680/ 173500 | consumed samples: 17582080 | consumed tokens: 36008099840 | elapsed time per iteration (s): 0.16 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.737310E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.060 | TFLOPs: 24.83 | +7: iteration 68690/ 173500 | consumed samples: 17584640 | consumed tokens: 36013342720 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.715661E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.582 | TFLOPs: 25.76 | +7: iteration 68700/ 173500 | consumed samples: 17587200 | consumed tokens: 36018585600 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.717226E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.003 | TFLOPs: 25.52 | +7: iteration 68710/ 173500 | consumed samples: 17589760 | consumed tokens: 36023828480 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.719634E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.731 | TFLOPs: 25.70 | +7: iteration 68720/ 173500 | consumed samples: 17592320 | consumed tokens: 36029071360 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.718825E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.233 | TFLOPs: 25.63 | +7: iteration 68730/ 173500 | consumed samples: 17594880 | consumed tokens: 36034314240 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.718018E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.011 | TFLOPs: 25.66 | +7: iteration 68740/ 173500 | consumed samples: 17597440 | consumed tokens: 36039557120 | elapsed time per iteration (s): 0.16 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.723126E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.621 | TFLOPs: 25.56 | +7: iteration 68750/ 173500 | consumed samples: 17600000 | consumed tokens: 36044800000 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.727912E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.410 | TFLOPs: 26.27 | +7: iteration 68760/ 173500 | consumed samples: 17602560 | consumed tokens: 36050042880 | elapsed time per iteration (s): 0.16 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.728004E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.025 | TFLOPs: 25.81 | +7: iteration 68770/ 173500 | consumed samples: 17605120 | consumed tokens: 36055285760 | elapsed time per iteration (s): 0.16 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.732754E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.366 | TFLOPs: 25.83 | +7: iteration 68780/ 173500 | consumed samples: 17607680 | consumed tokens: 36060528640 | elapsed time per iteration (s): 0.16 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.724793E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.016 | TFLOPs: 25.75 | +7: iteration 68790/ 173500 | consumed samples: 17610240 | consumed tokens: 36065771520 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.737315E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.167 | TFLOPs: 26.13 | +7: iteration 68800/ 173500 | consumed samples: 17612800 | consumed tokens: 36071014400 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.733854E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.737 | TFLOPs: 26.15 | +7: iteration 68810/ 173500 | consumed samples: 17615360 | consumed tokens: 36076257280 | elapsed time per iteration (s): 0.15 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.738651E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.962 | TFLOPs: 25.92 | +7: iteration 68820/ 173500 | consumed samples: 17617920 | consumed tokens: 36081500160 | elapsed time per iteration (s): 0.16 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.727333E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.037 | TFLOPs: 24.68 | +7: iteration 68830/ 173500 | consumed samples: 17620480 | consumed tokens: 36086743040 | elapsed time per iteration (s): 0.16 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.711479E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.273 | TFLOPs: 24.86 | +7: iteration 68840/ 173500 | consumed samples: 17623040 | consumed tokens: 36091985920 | elapsed time per iteration (s): 0.16 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.723049E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.084 | TFLOPs: 25.74 | +7: iteration 68850/ 173500 | consumed samples: 17625600 | consumed tokens: 36097228800 | elapsed time per iteration (s): 0.16 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.724506E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.380 | TFLOPs: 25.07 | +7: iteration 68860/ 173500 | consumed samples: 17628160 | consumed tokens: 36102471680 | elapsed time per iteration (s): 0.15 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.728247E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.657 | TFLOPs: 26.15 | +7: iteration 68870/ 173500 | consumed samples: 17630720 | consumed tokens: 36107714560 | elapsed time per iteration (s): 0.15 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.730161E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.782 | TFLOPs: 26.14 | +7: iteration 68880/ 173500 | consumed samples: 17633280 | consumed tokens: 36112957440 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.720022E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.841 | TFLOPs: 25.95 | +7: iteration 68890/ 173500 | consumed samples: 17635840 | consumed tokens: 36118200320 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.730658E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.484 | TFLOPs: 25.96 | +7: iteration 68900/ 173500 | consumed samples: 17638400 | consumed tokens: 36123443200 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.721318E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.984 | TFLOPs: 26.16 | +7: iteration 68910/ 173500 | consumed samples: 17640960 | consumed tokens: 36128686080 | elapsed time per iteration (s): 0.16 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.734389E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.379 | TFLOPs: 25.30 | +7: iteration 68920/ 173500 | consumed samples: 17643520 | consumed tokens: 36133928960 | elapsed time per iteration (s): 0.16 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.729456E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.050 | TFLOPs: 25.25 | +7: iteration 68930/ 173500 | consumed samples: 17646080 | consumed tokens: 36139171840 | elapsed time per iteration (s): 0.16 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.711434E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.649 | TFLOPs: 25.51 | +7: iteration 68940/ 173500 | consumed samples: 17648640 | consumed tokens: 36144414720 | elapsed time per iteration (s): 0.15 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.718752E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.182 | TFLOPs: 25.97 | +7: iteration 68950/ 173500 | consumed samples: 17651200 | consumed tokens: 36149657600 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.713929E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.748 | TFLOPs: 25.84 | +7: iteration 68960/ 173500 | consumed samples: 17653760 | consumed tokens: 36154900480 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.729212E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.294 | TFLOPs: 25.77 | +7: iteration 68970/ 173500 | consumed samples: 17656320 | consumed tokens: 36160143360 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.725725E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.007 | TFLOPs: 25.89 | +7: iteration 68980/ 173500 | consumed samples: 17658880 | consumed tokens: 36165386240 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.743097E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.270 | TFLOPs: 25.80 | +7: iteration 68990/ 173500 | consumed samples: 17661440 | consumed tokens: 36170629120 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.722658E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.609 | TFLOPs: 25.05 | +7: iteration 69000/ 173500 | consumed samples: 17664000 | consumed tokens: 36175872000 | elapsed time per iteration (s): 0.16 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.720026E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.275 | TFLOPs: 25.63 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 69000 | lm loss value: 3.824410E+00 | lm loss PPL: 4.580575E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 69000 to checkpoints_44m91b100m +0: [2023-03-17 03:14:21,495] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step69000 is begin to save! +0: [2023-03-17 03:14:21,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:14:21,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:14:21,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:14:21,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:14:21,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:14:21,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:14:21,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:14:21,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:14:21,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:14:21,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:14:21,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:14:21,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:14:21,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:14:21,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:14:21,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:14:21,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:14:21,616] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:14:21,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:14:21,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:14:21,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:14:21,626] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step69000/mp_rank_00_model_states.pt +0: [2023-03-17 03:14:21,626] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:14:21,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:14:21,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:14:21,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:14:21,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:14:21,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:14:21,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +4: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +1: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +2: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +6: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:14:21,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +5: [2023-03-17 03:14:21,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +3: [2023-03-17 03:14:21,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:14:21,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:14:21,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +7: [2023-03-17 03:14:21,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:14:21,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step69000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:14:21,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step69000 is ready now! +0: successfully saved checkpoint at iteration 69000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.63 +7: iteration 69010/ 173500 | consumed samples: 17666560 | consumed tokens: 36181114880 | elapsed time per iteration (s): 0.18 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.718157E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.808 | TFLOPs: 22.49 | +7: iteration 69020/ 173500 | consumed samples: 17669120 | consumed tokens: 36186357760 | elapsed time per iteration (s): 0.16 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.730683E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.881 | TFLOPs: 25.62 | +7: iteration 69030/ 173500 | consumed samples: 17671680 | consumed tokens: 36191600640 | elapsed time per iteration (s): 0.15 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.737663E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.669 | TFLOPs: 25.92 | +7: iteration 69040/ 173500 | consumed samples: 17674240 | consumed tokens: 36196843520 | elapsed time per iteration (s): 0.16 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.722116E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.417 | TFLOPs: 25.15 | +7: iteration 69050/ 173500 | consumed samples: 17676800 | consumed tokens: 36202086400 | elapsed time per iteration (s): 0.16 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.729836E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.534 | TFLOPs: 25.04 | +7: iteration 69060/ 173500 | consumed samples: 17679360 | consumed tokens: 36207329280 | elapsed time per iteration (s): 0.16 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.723518E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.321 | TFLOPs: 25.49 | +7: iteration 69070/ 173500 | consumed samples: 17681920 | consumed tokens: 36212572160 | elapsed time per iteration (s): 0.15 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.727070E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.518 | TFLOPs: 26.14 | +7: iteration 69080/ 173500 | consumed samples: 17684480 | consumed tokens: 36217815040 | elapsed time per iteration (s): 0.16 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.725313E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.589 | TFLOPs: 25.68 | +7: iteration 69090/ 173500 | consumed samples: 17687040 | consumed tokens: 36223057920 | elapsed time per iteration (s): 0.15 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.725397E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.765 | TFLOPs: 26.15 | +7: iteration 69100/ 173500 | consumed samples: 17689600 | consumed tokens: 36228300800 | elapsed time per iteration (s): 0.16 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.731455E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.817 | TFLOPs: 25.64 | +7: iteration 69110/ 173500 | consumed samples: 17692160 | consumed tokens: 36233543680 | elapsed time per iteration (s): 0.16 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.729035E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.745 | TFLOPs: 25.61 | +7: iteration 69120/ 173500 | consumed samples: 17694720 | consumed tokens: 36238786560 | elapsed time per iteration (s): 0.17 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.727937E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.076 | TFLOPs: 24.00 | +7: iteration 69130/ 173500 | consumed samples: 17697280 | consumed tokens: 36244029440 | elapsed time per iteration (s): 0.16 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.718405E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.178 | TFLOPs: 25.78 | +7: iteration 69140/ 173500 | consumed samples: 17699840 | consumed tokens: 36249272320 | elapsed time per iteration (s): 0.15 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.714387E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.967 | TFLOPs: 26.16 | +7: iteration 69150/ 173500 | consumed samples: 17702400 | consumed tokens: 36254515200 | elapsed time per iteration (s): 0.15 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.723839E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.068 | TFLOPs: 25.94 | +7: iteration 69160/ 173500 | consumed samples: 17704960 | consumed tokens: 36259758080 | elapsed time per iteration (s): 0.16 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.711943E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.403 | TFLOPs: 25.73 | +7: iteration 69170/ 173500 | consumed samples: 17707520 | consumed tokens: 36265000960 | elapsed time per iteration (s): 0.17 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.717093E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.288 | TFLOPs: 23.83 | +7: iteration 69180/ 173500 | consumed samples: 17710080 | consumed tokens: 36270243840 | elapsed time per iteration (s): 0.16 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.712495E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.770 | TFLOPs: 25.17 | +7: iteration 69190/ 173500 | consumed samples: 17712640 | consumed tokens: 36275486720 | elapsed time per iteration (s): 0.15 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.715887E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.365 | TFLOPs: 25.91 | +7: iteration 69200/ 173500 | consumed samples: 17715200 | consumed tokens: 36280729600 | elapsed time per iteration (s): 0.18 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.715904E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.274 | TFLOPs: 22.23 | +7: iteration 69210/ 173500 | consumed samples: 17717760 | consumed tokens: 36285972480 | elapsed time per iteration (s): 0.16 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.736798E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.592 | TFLOPs: 25.38 | +7: iteration 69220/ 173500 | consumed samples: 17720320 | consumed tokens: 36291215360 | elapsed time per iteration (s): 0.15 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.731752E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.068 | TFLOPs: 26.30 | +7: iteration 69230/ 173500 | consumed samples: 17722880 | consumed tokens: 36296458240 | elapsed time per iteration (s): 0.16 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.727942E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.674 | TFLOPs: 24.90 | +7: iteration 69240/ 173500 | consumed samples: 17725440 | consumed tokens: 36301701120 | elapsed time per iteration (s): 0.16 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.719762E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.520 | TFLOPs: 25.49 | +7: iteration 69250/ 173500 | consumed samples: 17728000 | consumed tokens: 36306944000 | elapsed time per iteration (s): 0.16 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.724868E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.958 | TFLOPs: 25.86 | +7: iteration 69260/ 173500 | consumed samples: 17730560 | consumed tokens: 36312186880 | elapsed time per iteration (s): 0.16 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.733208E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.881 | TFLOPs: 25.61 | +7: iteration 69270/ 173500 | consumed samples: 17733120 | consumed tokens: 36317429760 | elapsed time per iteration (s): 0.15 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.725832E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.110 | TFLOPs: 25.96 | +7: iteration 69280/ 173500 | consumed samples: 17735680 | consumed tokens: 36322672640 | elapsed time per iteration (s): 0.16 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.725179E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.518 | TFLOPs: 25.51 | +7: iteration 69290/ 173500 | consumed samples: 17738240 | consumed tokens: 36327915520 | elapsed time per iteration (s): 0.15 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.732100E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.419 | TFLOPs: 26.13 | +7: iteration 69300/ 173500 | consumed samples: 17740800 | consumed tokens: 36333158400 | elapsed time per iteration (s): 0.16 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.703847E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.576 | TFLOPs: 24.83 | +7: iteration 69310/ 173500 | consumed samples: 17743360 | consumed tokens: 36338401280 | elapsed time per iteration (s): 0.16 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.721426E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.576 | TFLOPs: 25.65 | +7: iteration 69320/ 173500 | consumed samples: 17745920 | consumed tokens: 36343644160 | elapsed time per iteration (s): 0.15 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.728928E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.154 | TFLOPs: 26.11 | +7: iteration 69330/ 173500 | consumed samples: 17748480 | consumed tokens: 36348887040 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.719273E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.842 | TFLOPs: 25.95 | +7: iteration 69340/ 173500 | consumed samples: 17751040 | consumed tokens: 36354129920 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.733903E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.859 | TFLOPs: 25.92 | +7: iteration 69350/ 173500 | consumed samples: 17753600 | consumed tokens: 36359372800 | elapsed time per iteration (s): 0.16 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.732410E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.896 | TFLOPs: 25.87 | +7: iteration 69360/ 173500 | consumed samples: 17756160 | consumed tokens: 36364615680 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.729247E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.160 | TFLOPs: 25.94 | +7: iteration 69370/ 173500 | consumed samples: 17758720 | consumed tokens: 36369858560 | elapsed time per iteration (s): 0.16 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.714796E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.754 | TFLOPs: 25.53 | +7: iteration 69380/ 173500 | consumed samples: 17761280 | consumed tokens: 36375101440 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.721557E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.742 | TFLOPs: 26.09 | +7: iteration 69390/ 173500 | consumed samples: 17763840 | consumed tokens: 36380344320 | elapsed time per iteration (s): 0.15 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.718912E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.310 | TFLOPs: 26.08 | +7: iteration 69400/ 173500 | consumed samples: 17766400 | consumed tokens: 36385587200 | elapsed time per iteration (s): 0.15 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.726328E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.177 | TFLOPs: 25.94 | +7: iteration 69410/ 173500 | consumed samples: 17768960 | consumed tokens: 36390830080 | elapsed time per iteration (s): 0.15 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.733087E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.279 | TFLOPs: 26.12 | +7: iteration 69420/ 173500 | consumed samples: 17771520 | consumed tokens: 36396072960 | elapsed time per iteration (s): 0.16 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.730833E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.101 | TFLOPs: 25.71 | +7: iteration 69430/ 173500 | consumed samples: 17774080 | consumed tokens: 36401315840 | elapsed time per iteration (s): 0.16 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.724298E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.426 | TFLOPs: 25.58 | +7: iteration 69440/ 173500 | consumed samples: 17776640 | consumed tokens: 36406558720 | elapsed time per iteration (s): 0.16 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.722360E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.050 | TFLOPs: 25.75 | +7: iteration 69450/ 173500 | consumed samples: 17779200 | consumed tokens: 36411801600 | elapsed time per iteration (s): 0.15 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.721564E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.026 | TFLOPs: 25.95 | +7: iteration 69460/ 173500 | consumed samples: 17781760 | consumed tokens: 36417044480 | elapsed time per iteration (s): 0.16 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.715834E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.001 | TFLOPs: 25.72 | +7: iteration 69470/ 173500 | consumed samples: 17784320 | consumed tokens: 36422287360 | elapsed time per iteration (s): 0.15 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.729037E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.886 | TFLOPs: 25.92 | +7: iteration 69480/ 173500 | consumed samples: 17786880 | consumed tokens: 36427530240 | elapsed time per iteration (s): 0.15 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.732419E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.009 | TFLOPs: 26.11 | +7: iteration 69490/ 173500 | consumed samples: 17789440 | consumed tokens: 36432773120 | elapsed time per iteration (s): 0.16 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.722960E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.726 | TFLOPs: 25.87 | +7: iteration 69500/ 173500 | consumed samples: 17792000 | consumed tokens: 36438016000 | elapsed time per iteration (s): 0.16 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.725750E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.995 | TFLOPs: 25.81 | +7: iteration 69510/ 173500 | consumed samples: 17794560 | consumed tokens: 36443258880 | elapsed time per iteration (s): 0.15 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.721705E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.543 | TFLOPs: 26.14 | +7: iteration 69520/ 173500 | consumed samples: 17797120 | consumed tokens: 36448501760 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.718386E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.381 | TFLOPs: 24.93 | +7: iteration 69530/ 173500 | consumed samples: 17799680 | consumed tokens: 36453744640 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.719465E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.829 | TFLOPs: 25.23 | +7: iteration 69540/ 173500 | consumed samples: 17802240 | consumed tokens: 36458987520 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.711838E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.969 | TFLOPs: 25.36 | +7: iteration 69550/ 173500 | consumed samples: 17804800 | consumed tokens: 36464230400 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.715349E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.359 | TFLOPs: 25.54 | +7: iteration 69560/ 173500 | consumed samples: 17807360 | consumed tokens: 36469473280 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.720315E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.320 | TFLOPs: 25.69 | +7: iteration 69570/ 173500 | consumed samples: 17809920 | consumed tokens: 36474716160 | elapsed time per iteration (s): 0.16 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.728968E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.564 | TFLOPs: 25.43 | +7: iteration 69580/ 173500 | consumed samples: 17812480 | consumed tokens: 36479959040 | elapsed time per iteration (s): 0.15 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.715213E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.412 | TFLOPs: 25.91 | +7: iteration 69590/ 173500 | consumed samples: 17815040 | consumed tokens: 36485201920 | elapsed time per iteration (s): 0.16 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.721155E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.198 | TFLOPs: 25.80 | +7: iteration 69600/ 173500 | consumed samples: 17817600 | consumed tokens: 36490444800 | elapsed time per iteration (s): 0.16 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.716094E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.835 | TFLOPs: 25.75 | +7: iteration 69610/ 173500 | consumed samples: 17820160 | consumed tokens: 36495687680 | elapsed time per iteration (s): 0.16 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.718166E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.925 | TFLOPs: 25.56 | +7: iteration 69620/ 173500 | consumed samples: 17822720 | consumed tokens: 36500930560 | elapsed time per iteration (s): 0.15 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.721649E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.579 | TFLOPs: 26.17 | +7: iteration 69630/ 173500 | consumed samples: 17825280 | consumed tokens: 36506173440 | elapsed time per iteration (s): 0.16 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.728162E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.587 | TFLOPs: 24.47 | +7: iteration 69640/ 173500 | consumed samples: 17827840 | consumed tokens: 36511416320 | elapsed time per iteration (s): 0.16 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.720855E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.748 | TFLOPs: 25.81 | +7: iteration 69650/ 173500 | consumed samples: 17830400 | consumed tokens: 36516659200 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.715143E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.582 | TFLOPs: 25.85 | +7: iteration 69660/ 173500 | consumed samples: 17832960 | consumed tokens: 36521902080 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.730915E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.922 | TFLOPs: 25.70 | +7: iteration 69670/ 173500 | consumed samples: 17835520 | consumed tokens: 36527144960 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.730886E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.002 | TFLOPs: 25.89 | +7: iteration 69680/ 173500 | consumed samples: 17838080 | consumed tokens: 36532387840 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.722685E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.253 | TFLOPs: 25.30 | +7: iteration 69690/ 173500 | consumed samples: 17840640 | consumed tokens: 36537630720 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.723433E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.986 | TFLOPs: 25.89 | +7: iteration 69700/ 173500 | consumed samples: 17843200 | consumed tokens: 36542873600 | elapsed time per iteration (s): 0.16 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.720450E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.824 | TFLOPs: 24.98 | +7: iteration 69710/ 173500 | consumed samples: 17845760 | consumed tokens: 36548116480 | elapsed time per iteration (s): 0.15 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.734205E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.191 | TFLOPs: 26.11 | +7: iteration 69720/ 173500 | consumed samples: 17848320 | consumed tokens: 36553359360 | elapsed time per iteration (s): 0.16 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.724499E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.210 | TFLOPs: 25.22 | +7: iteration 69730/ 173500 | consumed samples: 17850880 | consumed tokens: 36558602240 | elapsed time per iteration (s): 0.15 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.729288E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.847 | TFLOPs: 25.92 | +7: iteration 69740/ 173500 | consumed samples: 17853440 | consumed tokens: 36563845120 | elapsed time per iteration (s): 0.15 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.726154E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.763 | TFLOPs: 26.23 | +7: iteration 69750/ 173500 | consumed samples: 17856000 | consumed tokens: 36569088000 | elapsed time per iteration (s): 0.16 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.715325E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.485 | TFLOPs: 25.77 | +7: iteration 69760/ 173500 | consumed samples: 17858560 | consumed tokens: 36574330880 | elapsed time per iteration (s): 0.16 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.733877E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.089 | TFLOPs: 25.55 | +7: iteration 69770/ 173500 | consumed samples: 17861120 | consumed tokens: 36579573760 | elapsed time per iteration (s): 0.16 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.722894E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.112 | TFLOPs: 25.22 | +7: iteration 69780/ 173500 | consumed samples: 17863680 | consumed tokens: 36584816640 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.714683E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.254 | TFLOPs: 25.66 | +7: iteration 69790/ 173500 | consumed samples: 17866240 | consumed tokens: 36590059520 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.733856E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.337 | TFLOPs: 25.68 | +7: iteration 69800/ 173500 | consumed samples: 17868800 | consumed tokens: 36595302400 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.715441E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.108 | TFLOPs: 25.56 | +7: iteration 69810/ 173500 | consumed samples: 17871360 | consumed tokens: 36600545280 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.723563E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.661 | TFLOPs: 25.18 | +7: iteration 69820/ 173500 | consumed samples: 17873920 | consumed tokens: 36605788160 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.709410E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.466 | TFLOPs: 25.76 | +7: iteration 69830/ 173500 | consumed samples: 17876480 | consumed tokens: 36611031040 | elapsed time per iteration (s): 0.16 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.738449E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.227 | TFLOPs: 24.86 | +7: iteration 69840/ 173500 | consumed samples: 17879040 | consumed tokens: 36616273920 | elapsed time per iteration (s): 0.15 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.721436E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.200 | TFLOPs: 26.07 | +7: iteration 69850/ 173500 | consumed samples: 17881600 | consumed tokens: 36621516800 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.735519E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.035 | TFLOPs: 26.14 | +7: iteration 69860/ 173500 | consumed samples: 17884160 | consumed tokens: 36626759680 | elapsed time per iteration (s): 0.16 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.729717E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.737 | TFLOPs: 25.62 | +7: iteration 69870/ 173500 | consumed samples: 17886720 | consumed tokens: 36632002560 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.708511E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.801 | TFLOPs: 26.16 | +7: iteration 69880/ 173500 | consumed samples: 17889280 | consumed tokens: 36637245440 | elapsed time per iteration (s): 0.16 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.732795E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.315 | TFLOPs: 24.63 | +7: iteration 69890/ 173500 | consumed samples: 17891840 | consumed tokens: 36642488320 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.726897E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.354 | TFLOPs: 26.18 | +7: iteration 69900/ 173500 | consumed samples: 17894400 | consumed tokens: 36647731200 | elapsed time per iteration (s): 0.15 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.727561E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.933 | TFLOPs: 25.95 | +7: iteration 69910/ 173500 | consumed samples: 17896960 | consumed tokens: 36652974080 | elapsed time per iteration (s): 0.16 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.718607E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.118 | TFLOPs: 25.39 | +7: iteration 69920/ 173500 | consumed samples: 17899520 | consumed tokens: 36658216960 | elapsed time per iteration (s): 0.16 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.715554E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.648 | TFLOPs: 25.62 | +7: iteration 69930/ 173500 | consumed samples: 17902080 | consumed tokens: 36663459840 | elapsed time per iteration (s): 0.15 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.725947E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.099 | TFLOPs: 26.16 | +7: iteration 69940/ 173500 | consumed samples: 17904640 | consumed tokens: 36668702720 | elapsed time per iteration (s): 0.16 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.712530E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.327 | TFLOPs: 25.54 | +7: iteration 69950/ 173500 | consumed samples: 17907200 | consumed tokens: 36673945600 | elapsed time per iteration (s): 0.15 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.712534E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.164 | TFLOPs: 26.16 | +7: iteration 69960/ 173500 | consumed samples: 17909760 | consumed tokens: 36679188480 | elapsed time per iteration (s): 0.16 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.717610E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.127 | TFLOPs: 25.63 | +7: iteration 69970/ 173500 | consumed samples: 17912320 | consumed tokens: 36684431360 | elapsed time per iteration (s): 0.16 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.718704E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.872 | TFLOPs: 24.54 | +7: iteration 69980/ 173500 | consumed samples: 17914880 | consumed tokens: 36689674240 | elapsed time per iteration (s): 0.15 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.721337E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.522 | TFLOPs: 26.15 | +7: iteration 69990/ 173500 | consumed samples: 17917440 | consumed tokens: 36694917120 | elapsed time per iteration (s): 0.16 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.706849E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.810 | TFLOPs: 25.70 | +0: [2023-03-17 03:16:58,525] [INFO] [logging.py:68:log_dist] [Rank 0] step=70000, skipped=0, lr=[0.0001385013705497804, 0.0001385013705497804, 0.0001385013705497804], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 70000/ 173500 | consumed samples: 17920000 | consumed tokens: 36700160000 | elapsed time per iteration (s): 0.16 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.723301E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.152 | TFLOPs: 24.61 | +0: steps: 70000 loss: 3.7364 iter time (s): 0.155 samples/sec: 1647.035 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 70000 | lm loss value: 3.876340E+00 | lm loss PPL: 4.824731E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 70000 to checkpoints_44m91b100m +0: [2023-03-17 03:16:58,600] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step70000 is begin to save! +0: [2023-03-17 03:16:58,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:16:58,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:16:58,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:16:58,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:16:58,678] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:16:58,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:16:58,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:16:58,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:16:58,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:16:58,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:16:58,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:16:58,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:16:58,711] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:16:58,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:16:58,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:16:58,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:16:58,727] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:16:58,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:16:58,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:16:58,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:16:58,737] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step70000/mp_rank_00_model_states.pt +0: [2023-03-17 03:16:58,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:16:58,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:16:58,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:16:58,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +6: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:16:58,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +4: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +2: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +5: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +3: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +1: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:16:58,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:16:58,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +7: [2023-03-17 03:16:58,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:16:58,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step70000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:16:58,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step70000 is ready now! +0: successfully saved checkpoint at iteration 70000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 197.20 +7: iteration 70010/ 173500 | consumed samples: 17922560 | consumed tokens: 36705402880 | elapsed time per iteration (s): 0.18 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.734367E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.704 | TFLOPs: 21.97 | +7: iteration 70020/ 173500 | consumed samples: 17925120 | consumed tokens: 36710645760 | elapsed time per iteration (s): 0.16 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.739021E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.118 | TFLOPs: 25.28 | +7: iteration 70030/ 173500 | consumed samples: 17927680 | consumed tokens: 36715888640 | elapsed time per iteration (s): 0.16 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.709362E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.256 | TFLOPs: 25.47 | +7: iteration 70040/ 173500 | consumed samples: 17930240 | consumed tokens: 36721131520 | elapsed time per iteration (s): 0.16 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.724812E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.750 | TFLOPs: 25.81 | +7: iteration 70050/ 173500 | consumed samples: 17932800 | consumed tokens: 36726374400 | elapsed time per iteration (s): 0.16 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.723021E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.758 | TFLOPs: 24.68 | +7: iteration 70060/ 173500 | consumed samples: 17935360 | consumed tokens: 36731617280 | elapsed time per iteration (s): 0.16 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.720863E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.592 | TFLOPs: 25.65 | +7: iteration 70070/ 173500 | consumed samples: 17937920 | consumed tokens: 36736860160 | elapsed time per iteration (s): 0.15 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.719519E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.530 | TFLOPs: 25.93 | +7: iteration 70080/ 173500 | consumed samples: 17940480 | consumed tokens: 36742103040 | elapsed time per iteration (s): 0.15 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.717367E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.673 | TFLOPs: 25.98 | +7: iteration 70090/ 173500 | consumed samples: 17943040 | consumed tokens: 36747345920 | elapsed time per iteration (s): 0.16 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.712092E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.001 | TFLOPs: 24.83 | +7: iteration 70100/ 173500 | consumed samples: 17945600 | consumed tokens: 36752588800 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.712862E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.888 | TFLOPs: 26.11 | +7: iteration 70110/ 173500 | consumed samples: 17948160 | consumed tokens: 36757831680 | elapsed time per iteration (s): 0.16 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.712717E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.506 | TFLOPs: 25.65 | +7: iteration 70120/ 173500 | consumed samples: 17950720 | consumed tokens: 36763074560 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.719695E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.157 | TFLOPs: 26.25 | +7: iteration 70130/ 173500 | consumed samples: 17953280 | consumed tokens: 36768317440 | elapsed time per iteration (s): 0.17 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.727998E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.846 | TFLOPs: 23.63 | +7: iteration 70140/ 173500 | consumed samples: 17955840 | consumed tokens: 36773560320 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.723760E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.360 | TFLOPs: 26.29 | +7: iteration 70150/ 173500 | consumed samples: 17958400 | consumed tokens: 36778803200 | elapsed time per iteration (s): 0.15 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.710559E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.619 | TFLOPs: 26.25 | +7: iteration 70160/ 173500 | consumed samples: 17960960 | consumed tokens: 36784046080 | elapsed time per iteration (s): 0.16 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.717194E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.705 | TFLOPs: 25.53 | +7: iteration 70170/ 173500 | consumed samples: 17963520 | consumed tokens: 36789288960 | elapsed time per iteration (s): 0.16 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.721017E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.483 | TFLOPs: 24.44 | +7: iteration 70180/ 173500 | consumed samples: 17966080 | consumed tokens: 36794531840 | elapsed time per iteration (s): 0.16 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.718216E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.445 | TFLOPs: 25.82 | +7: iteration 70190/ 173500 | consumed samples: 17968640 | consumed tokens: 36799774720 | elapsed time per iteration (s): 0.17 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.724256E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1464.406 | TFLOPs: 22.97 | +7: iteration 70200/ 173500 | consumed samples: 17971200 | consumed tokens: 36805017600 | elapsed time per iteration (s): 0.17 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.710322E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.198 | TFLOPs: 24.12 | +7: iteration 70210/ 173500 | consumed samples: 17973760 | consumed tokens: 36810260480 | elapsed time per iteration (s): 0.16 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.727100E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.870 | TFLOPs: 24.56 | +7: iteration 70220/ 173500 | consumed samples: 17976320 | consumed tokens: 36815503360 | elapsed time per iteration (s): 0.15 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.728169E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.136 | TFLOPs: 26.25 | +7: iteration 70230/ 173500 | consumed samples: 17978880 | consumed tokens: 36820746240 | elapsed time per iteration (s): 0.16 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.722588E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.465 | TFLOPs: 25.07 | +7: iteration 70240/ 173500 | consumed samples: 17981440 | consumed tokens: 36825989120 | elapsed time per iteration (s): 0.16 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.739434E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.319 | TFLOPs: 25.77 | +7: iteration 70250/ 173500 | consumed samples: 17984000 | consumed tokens: 36831232000 | elapsed time per iteration (s): 0.15 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.728094E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.897 | TFLOPs: 26.34 | +7: iteration 70260/ 173500 | consumed samples: 17986560 | consumed tokens: 36836474880 | elapsed time per iteration (s): 0.16 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.701747E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.476 | TFLOPs: 25.77 | +7: iteration 70270/ 173500 | consumed samples: 17989120 | consumed tokens: 36841717760 | elapsed time per iteration (s): 0.16 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.719786E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.479 | TFLOPs: 25.74 | +7: iteration 70280/ 173500 | consumed samples: 17991680 | consumed tokens: 36846960640 | elapsed time per iteration (s): 0.15 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.731468E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.633 | TFLOPs: 26.31 | +7: iteration 70290/ 173500 | consumed samples: 17994240 | consumed tokens: 36852203520 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.723748E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.275 | TFLOPs: 25.75 | +7: iteration 70300/ 173500 | consumed samples: 17996800 | consumed tokens: 36857446400 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.717175E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.180 | TFLOPs: 25.72 | +7: iteration 70310/ 173500 | consumed samples: 17999360 | consumed tokens: 36862689280 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.721020E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.915 | TFLOPs: 25.48 | +7: iteration 70320/ 173500 | consumed samples: 18001920 | consumed tokens: 36867932160 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.715969E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.520 | TFLOPs: 25.45 | +7: iteration 70330/ 173500 | consumed samples: 18004480 | consumed tokens: 36873175040 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.714280E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.289 | TFLOPs: 25.86 | +7: iteration 70340/ 173500 | consumed samples: 18007040 | consumed tokens: 36878417920 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.712975E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.136 | TFLOPs: 25.86 | +7: iteration 70350/ 173500 | consumed samples: 18009600 | consumed tokens: 36883660800 | elapsed time per iteration (s): 0.16 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.720410E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.093 | TFLOPs: 25.77 | +7: iteration 70360/ 173500 | consumed samples: 18012160 | consumed tokens: 36888903680 | elapsed time per iteration (s): 0.16 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.736769E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.837 | TFLOPs: 25.07 | +7: iteration 70370/ 173500 | consumed samples: 18014720 | consumed tokens: 36894146560 | elapsed time per iteration (s): 0.16 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.725087E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.634 | TFLOPs: 25.53 | +7: iteration 70380/ 173500 | consumed samples: 18017280 | consumed tokens: 36899389440 | elapsed time per iteration (s): 0.16 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.720599E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.494 | TFLOPs: 25.87 | +7: iteration 70390/ 173500 | consumed samples: 18019840 | consumed tokens: 36904632320 | elapsed time per iteration (s): 0.15 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.726672E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.415 | TFLOPs: 26.29 | +7: iteration 70400/ 173500 | consumed samples: 18022400 | consumed tokens: 36909875200 | elapsed time per iteration (s): 0.15 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.730424E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.404 | TFLOPs: 25.91 | +7: iteration 70410/ 173500 | consumed samples: 18024960 | consumed tokens: 36915118080 | elapsed time per iteration (s): 0.15 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.723077E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.504 | TFLOPs: 26.04 | +7: iteration 70420/ 173500 | consumed samples: 18027520 | consumed tokens: 36920360960 | elapsed time per iteration (s): 0.17 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.721769E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.730 | TFLOPs: 23.72 | +7: iteration 70430/ 173500 | consumed samples: 18030080 | consumed tokens: 36925603840 | elapsed time per iteration (s): 0.16 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.718329E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.668 | TFLOPs: 25.24 | +7: iteration 70440/ 173500 | consumed samples: 18032640 | consumed tokens: 36930846720 | elapsed time per iteration (s): 0.15 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.724473E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.412 | TFLOPs: 25.95 | +7: iteration 70450/ 173500 | consumed samples: 18035200 | consumed tokens: 36936089600 | elapsed time per iteration (s): 0.16 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.724926E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.218 | TFLOPs: 25.19 | +7: iteration 70460/ 173500 | consumed samples: 18037760 | consumed tokens: 36941332480 | elapsed time per iteration (s): 0.16 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.711394E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.941 | TFLOPs: 25.25 | +7: iteration 70470/ 173500 | consumed samples: 18040320 | consumed tokens: 36946575360 | elapsed time per iteration (s): 0.16 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.728952E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.706 | TFLOPs: 25.31 | +7: iteration 70480/ 173500 | consumed samples: 18042880 | consumed tokens: 36951818240 | elapsed time per iteration (s): 0.15 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.717154E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.088 | TFLOPs: 26.14 | +7: iteration 70490/ 173500 | consumed samples: 18045440 | consumed tokens: 36957061120 | elapsed time per iteration (s): 0.16 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.718795E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.130 | TFLOPs: 25.14 | +7: iteration 70500/ 173500 | consumed samples: 18048000 | consumed tokens: 36962304000 | elapsed time per iteration (s): 0.15 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.722013E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.253 | TFLOPs: 26.07 | +7: iteration 70510/ 173500 | consumed samples: 18050560 | consumed tokens: 36967546880 | elapsed time per iteration (s): 0.17 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.708901E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.584 | TFLOPs: 24.27 | +7: iteration 70520/ 173500 | consumed samples: 18053120 | consumed tokens: 36972789760 | elapsed time per iteration (s): 0.16 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.715490E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.019 | TFLOPs: 25.26 | +7: iteration 70530/ 173500 | consumed samples: 18055680 | consumed tokens: 36978032640 | elapsed time per iteration (s): 0.16 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.706287E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.822 | TFLOPs: 25.81 | +7: iteration 70540/ 173500 | consumed samples: 18058240 | consumed tokens: 36983275520 | elapsed time per iteration (s): 0.16 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.719044E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.280 | TFLOPs: 25.17 | +7: iteration 70550/ 173500 | consumed samples: 18060800 | consumed tokens: 36988518400 | elapsed time per iteration (s): 0.16 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.714758E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.291 | TFLOPs: 24.81 | +7: iteration 70560/ 173500 | consumed samples: 18063360 | consumed tokens: 36993761280 | elapsed time per iteration (s): 0.15 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.720200E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.349 | TFLOPs: 26.24 | +7: iteration 70570/ 173500 | consumed samples: 18065920 | consumed tokens: 36999004160 | elapsed time per iteration (s): 0.15 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.722697E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.677 | TFLOPs: 26.25 | +7: iteration 70580/ 173500 | consumed samples: 18068480 | consumed tokens: 37004247040 | elapsed time per iteration (s): 0.15 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.725426E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.243 | TFLOPs: 26.26 | +7: iteration 70590/ 173500 | consumed samples: 18071040 | consumed tokens: 37009489920 | elapsed time per iteration (s): 0.16 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.720222E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.918 | TFLOPs: 25.67 | +7: iteration 70600/ 173500 | consumed samples: 18073600 | consumed tokens: 37014732800 | elapsed time per iteration (s): 0.16 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.719929E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.481 | TFLOPs: 25.29 | +7: iteration 70610/ 173500 | consumed samples: 18076160 | consumed tokens: 37019975680 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.710922E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.628 | TFLOPs: 24.51 | +7: iteration 70620/ 173500 | consumed samples: 18078720 | consumed tokens: 37025218560 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.728194E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.637 | TFLOPs: 25.43 | +7: iteration 70630/ 173500 | consumed samples: 18081280 | consumed tokens: 37030461440 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.710403E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.976 | TFLOPs: 25.59 | +7: iteration 70640/ 173500 | consumed samples: 18083840 | consumed tokens: 37035704320 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.720483E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.907 | TFLOPs: 25.76 | +7: iteration 70650/ 173500 | consumed samples: 18086400 | consumed tokens: 37040947200 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.714757E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.791 | TFLOPs: 25.34 | +7: iteration 70660/ 173500 | consumed samples: 18088960 | consumed tokens: 37046190080 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.714941E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.241 | TFLOPs: 25.35 | +7: iteration 70670/ 173500 | consumed samples: 18091520 | consumed tokens: 37051432960 | elapsed time per iteration (s): 0.16 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.723311E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.528 | TFLOPs: 25.63 | +7: iteration 70680/ 173500 | consumed samples: 18094080 | consumed tokens: 37056675840 | elapsed time per iteration (s): 0.15 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.721341E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.655 | TFLOPs: 26.18 | +7: iteration 70690/ 173500 | consumed samples: 18096640 | consumed tokens: 37061918720 | elapsed time per iteration (s): 0.15 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.723354E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.345 | TFLOPs: 25.96 | +7: iteration 70700/ 173500 | consumed samples: 18099200 | consumed tokens: 37067161600 | elapsed time per iteration (s): 0.16 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.710843E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.881 | TFLOPs: 24.35 | +7: iteration 70710/ 173500 | consumed samples: 18101760 | consumed tokens: 37072404480 | elapsed time per iteration (s): 0.16 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.718169E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.245 | TFLOPs: 25.57 | +7: iteration 70720/ 173500 | consumed samples: 18104320 | consumed tokens: 37077647360 | elapsed time per iteration (s): 0.15 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.721264E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.593 | TFLOPs: 26.21 | +7: iteration 70730/ 173500 | consumed samples: 18106880 | consumed tokens: 37082890240 | elapsed time per iteration (s): 0.16 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.720882E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.155 | TFLOPs: 24.91 | +7: iteration 70740/ 173500 | consumed samples: 18109440 | consumed tokens: 37088133120 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.716620E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.848 | TFLOPs: 24.89 | +7: iteration 70750/ 173500 | consumed samples: 18112000 | consumed tokens: 37093376000 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.723799E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.171 | TFLOPs: 25.24 | +7: iteration 70760/ 173500 | consumed samples: 18114560 | consumed tokens: 37098618880 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.728017E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.543 | TFLOPs: 25.71 | +7: iteration 70770/ 173500 | consumed samples: 18117120 | consumed tokens: 37103861760 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.713369E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.004 | TFLOPs: 25.84 | +7: iteration 70780/ 173500 | consumed samples: 18119680 | consumed tokens: 37109104640 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.715526E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.764 | TFLOPs: 25.64 | +7: iteration 70790/ 173500 | consumed samples: 18122240 | consumed tokens: 37114347520 | elapsed time per iteration (s): 0.16 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.722657E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.487 | TFLOPs: 25.85 | +7: iteration 70800/ 173500 | consumed samples: 18124800 | consumed tokens: 37119590400 | elapsed time per iteration (s): 0.16 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.723455E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.486 | TFLOPs: 25.79 | +7: iteration 70810/ 173500 | consumed samples: 18127360 | consumed tokens: 37124833280 | elapsed time per iteration (s): 0.16 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.715275E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.499 | TFLOPs: 25.49 | +7: iteration 70820/ 173500 | consumed samples: 18129920 | consumed tokens: 37130076160 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.733585E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.850 | TFLOPs: 26.16 | +7: iteration 70830/ 173500 | consumed samples: 18132480 | consumed tokens: 37135319040 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.726244E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.920 | TFLOPs: 26.02 | +7: iteration 70840/ 173500 | consumed samples: 18135040 | consumed tokens: 37140561920 | elapsed time per iteration (s): 0.16 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.723946E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.568 | TFLOPs: 25.70 | +7: iteration 70850/ 173500 | consumed samples: 18137600 | consumed tokens: 37145804800 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.713212E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.634 | TFLOPs: 25.96 | +7: iteration 70860/ 173500 | consumed samples: 18140160 | consumed tokens: 37151047680 | elapsed time per iteration (s): 0.15 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.728967E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.842 | TFLOPs: 25.98 | +7: iteration 70870/ 173500 | consumed samples: 18142720 | consumed tokens: 37156290560 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.712687E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.116 | TFLOPs: 26.00 | +7: iteration 70880/ 173500 | consumed samples: 18145280 | consumed tokens: 37161533440 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.720702E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.774 | TFLOPs: 26.00 | +7: iteration 70890/ 173500 | consumed samples: 18147840 | consumed tokens: 37166776320 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.718128E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.042 | TFLOPs: 26.00 | +7: iteration 70900/ 173500 | consumed samples: 18150400 | consumed tokens: 37172019200 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.718211E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.559 | TFLOPs: 26.03 | +7: iteration 70910/ 173500 | consumed samples: 18152960 | consumed tokens: 37177262080 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.722911E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.436 | TFLOPs: 25.98 | +7: iteration 70920/ 173500 | consumed samples: 18155520 | consumed tokens: 37182504960 | elapsed time per iteration (s): 0.15 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.720916E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.073 | TFLOPs: 25.99 | +7: iteration 70930/ 173500 | consumed samples: 18158080 | consumed tokens: 37187747840 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.713264E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.857 | TFLOPs: 26.00 | +7: iteration 70940/ 173500 | consumed samples: 18160640 | consumed tokens: 37192990720 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.729035E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.287 | TFLOPs: 26.05 | +7: iteration 70950/ 173500 | consumed samples: 18163200 | consumed tokens: 37198233600 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.723431E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.102 | TFLOPs: 26.08 | +7: iteration 70960/ 173500 | consumed samples: 18165760 | consumed tokens: 37203476480 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.723870E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.561 | TFLOPs: 26.06 | +7: iteration 70970/ 173500 | consumed samples: 18168320 | consumed tokens: 37208719360 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.721930E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.506 | TFLOPs: 25.93 | +7: iteration 70980/ 173500 | consumed samples: 18170880 | consumed tokens: 37213962240 | elapsed time per iteration (s): 0.16 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.720054E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.798 | TFLOPs: 25.21 | +7: iteration 70990/ 173500 | consumed samples: 18173440 | consumed tokens: 37219205120 | elapsed time per iteration (s): 0.15 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.731532E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.514 | TFLOPs: 26.03 | +7: iteration 71000/ 173500 | consumed samples: 18176000 | consumed tokens: 37224448000 | elapsed time per iteration (s): 0.16 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.731886E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.319 | TFLOPs: 25.66 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 71000 | lm loss value: 3.869479E+00 | lm loss PPL: 4.791740E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 71000 to checkpoints_44m91b100m +0: [2023-03-17 03:19:35,896] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step71000 is begin to save! +0: [2023-03-17 03:19:35,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:19:35,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:19:35,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:19:35,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:19:35,970] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:19:35,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:19:35,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:19:35,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:19:35,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:19:35,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:19:35,994] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:19:36,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:19:36,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:19:36,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:19:36,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:19:36,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:19:36,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:19:36,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:19:36,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:19:36,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:19:36,028] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step71000/mp_rank_00_model_states.pt +0: [2023-03-17 03:19:36,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:19:36,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:19:36,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:19:36,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:19:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +7: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +5: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +4: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:19:36,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 03:19:36,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +1: [2023-03-17 03:19:36,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:19:36,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:19:36,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +3: [2023-03-17 03:19:36,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:19:36,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:19:36,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +6: [2023-03-17 03:19:36,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:19:36,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step71000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:19:36,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step71000 is ready now! +0: successfully saved checkpoint at iteration 71000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.96 +7: iteration 71010/ 173500 | consumed samples: 18178560 | consumed tokens: 37229690880 | elapsed time per iteration (s): 0.18 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.727708E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.748 | TFLOPs: 22.74 | +7: iteration 71020/ 173500 | consumed samples: 18181120 | consumed tokens: 37234933760 | elapsed time per iteration (s): 0.15 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.724350E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.530 | TFLOPs: 26.07 | +7: iteration 71030/ 173500 | consumed samples: 18183680 | consumed tokens: 37240176640 | elapsed time per iteration (s): 0.16 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.724876E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.862 | TFLOPs: 25.67 | +7: iteration 71040/ 173500 | consumed samples: 18186240 | consumed tokens: 37245419520 | elapsed time per iteration (s): 0.16 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.729490E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.994 | TFLOPs: 25.36 | +7: iteration 71050/ 173500 | consumed samples: 18188800 | consumed tokens: 37250662400 | elapsed time per iteration (s): 0.16 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.712115E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.512 | TFLOPs: 25.27 | +7: iteration 71060/ 173500 | consumed samples: 18191360 | consumed tokens: 37255905280 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.719038E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.974 | TFLOPs: 25.53 | +7: iteration 71070/ 173500 | consumed samples: 18193920 | consumed tokens: 37261148160 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.726609E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.587 | TFLOPs: 25.51 | +7: iteration 71080/ 173500 | consumed samples: 18196480 | consumed tokens: 37266391040 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.733475E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.513 | TFLOPs: 25.66 | +7: iteration 71090/ 173500 | consumed samples: 18199040 | consumed tokens: 37271633920 | elapsed time per iteration (s): 0.15 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.728815E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.775 | TFLOPs: 25.92 | +7: iteration 71100/ 173500 | consumed samples: 18201600 | consumed tokens: 37276876800 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.719780E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.981 | TFLOPs: 25.64 | +7: iteration 71110/ 173500 | consumed samples: 18204160 | consumed tokens: 37282119680 | elapsed time per iteration (s): 0.16 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.703953E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.357 | TFLOPs: 25.29 | +7: iteration 71120/ 173500 | consumed samples: 18206720 | consumed tokens: 37287362560 | elapsed time per iteration (s): 0.16 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.735279E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.363 | TFLOPs: 25.49 | +7: iteration 71130/ 173500 | consumed samples: 18209280 | consumed tokens: 37292605440 | elapsed time per iteration (s): 0.16 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.718029E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.427 | TFLOPs: 25.05 | +7: iteration 71140/ 173500 | consumed samples: 18211840 | consumed tokens: 37297848320 | elapsed time per iteration (s): 0.15 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.705965E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.566 | TFLOPs: 26.12 | +7: iteration 71150/ 173500 | consumed samples: 18214400 | consumed tokens: 37303091200 | elapsed time per iteration (s): 0.16 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.716911E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.183 | TFLOPs: 25.86 | +7: iteration 71160/ 173500 | consumed samples: 18216960 | consumed tokens: 37308334080 | elapsed time per iteration (s): 0.15 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.712163E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.286 | TFLOPs: 26.10 | +7: iteration 71170/ 173500 | consumed samples: 18219520 | consumed tokens: 37313576960 | elapsed time per iteration (s): 0.15 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.728582E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.734 | TFLOPs: 26.12 | +7: iteration 71180/ 173500 | consumed samples: 18222080 | consumed tokens: 37318819840 | elapsed time per iteration (s): 0.16 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.719645E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.239 | TFLOPs: 25.25 | +7: iteration 71190/ 173500 | consumed samples: 18224640 | consumed tokens: 37324062720 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.723987E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.528 | TFLOPs: 26.12 | +7: iteration 71200/ 173500 | consumed samples: 18227200 | consumed tokens: 37329305600 | elapsed time per iteration (s): 0.16 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.718262E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.976 | TFLOPs: 25.78 | +7: iteration 71210/ 173500 | consumed samples: 18229760 | consumed tokens: 37334548480 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.728524E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.058 | TFLOPs: 25.97 | +7: iteration 71220/ 173500 | consumed samples: 18232320 | consumed tokens: 37339791360 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.707832E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.270 | TFLOPs: 26.07 | +7: iteration 71230/ 173500 | consumed samples: 18234880 | consumed tokens: 37345034240 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.709470E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.035 | TFLOPs: 26.10 | +7: iteration 71240/ 173500 | consumed samples: 18237440 | consumed tokens: 37350277120 | elapsed time per iteration (s): 0.15 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.735510E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.905 | TFLOPs: 26.11 | +7: iteration 71250/ 173500 | consumed samples: 18240000 | consumed tokens: 37355520000 | elapsed time per iteration (s): 0.15 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.726723E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.511 | TFLOPs: 26.09 | +7: iteration 71260/ 173500 | consumed samples: 18242560 | consumed tokens: 37360762880 | elapsed time per iteration (s): 0.16 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.716805E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.247 | TFLOPs: 25.61 | +7: iteration 71270/ 173500 | consumed samples: 18245120 | consumed tokens: 37366005760 | elapsed time per iteration (s): 0.16 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.725051E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.489 | TFLOPs: 25.74 | +7: iteration 71280/ 173500 | consumed samples: 18247680 | consumed tokens: 37371248640 | elapsed time per iteration (s): 0.16 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.710560E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.595 | TFLOPs: 25.73 | +7: iteration 71290/ 173500 | consumed samples: 18250240 | consumed tokens: 37376491520 | elapsed time per iteration (s): 0.16 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.718579E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.356 | TFLOPs: 25.60 | +7: iteration 71300/ 173500 | consumed samples: 18252800 | consumed tokens: 37381734400 | elapsed time per iteration (s): 0.15 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.735531E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.132 | TFLOPs: 26.13 | +7: iteration 71310/ 173500 | consumed samples: 18255360 | consumed tokens: 37386977280 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.725630E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.301 | TFLOPs: 26.07 | +7: iteration 71320/ 173500 | consumed samples: 18257920 | consumed tokens: 37392220160 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.718246E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.336 | TFLOPs: 26.09 | +7: iteration 71330/ 173500 | consumed samples: 18260480 | consumed tokens: 37397463040 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.719139E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.002 | TFLOPs: 26.08 | +7: iteration 71340/ 173500 | consumed samples: 18263040 | consumed tokens: 37402705920 | elapsed time per iteration (s): 0.16 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.715279E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.464 | TFLOPs: 25.44 | +7: iteration 71350/ 173500 | consumed samples: 18265600 | consumed tokens: 37407948800 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.729372E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.537 | TFLOPs: 26.09 | +7: iteration 71360/ 173500 | consumed samples: 18268160 | consumed tokens: 37413191680 | elapsed time per iteration (s): 0.16 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.723063E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.571 | TFLOPs: 25.84 | +7: iteration 71370/ 173500 | consumed samples: 18270720 | consumed tokens: 37418434560 | elapsed time per iteration (s): 0.15 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.721931E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.322 | TFLOPs: 26.04 | +7: iteration 71380/ 173500 | consumed samples: 18273280 | consumed tokens: 37423677440 | elapsed time per iteration (s): 0.15 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.721939E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.080 | TFLOPs: 26.05 | +7: iteration 71390/ 173500 | consumed samples: 18275840 | consumed tokens: 37428920320 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.726955E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.927 | TFLOPs: 25.33 | +7: iteration 71400/ 173500 | consumed samples: 18278400 | consumed tokens: 37434163200 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.733615E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.128 | TFLOPs: 25.61 | +7: iteration 71410/ 173500 | consumed samples: 18280960 | consumed tokens: 37439406080 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.720460E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.653 | TFLOPs: 25.27 | +7: iteration 71420/ 173500 | consumed samples: 18283520 | consumed tokens: 37444648960 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.733544E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.612 | TFLOPs: 25.85 | +7: iteration 71430/ 173500 | consumed samples: 18286080 | consumed tokens: 37449891840 | elapsed time per iteration (s): 0.16 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.719270E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.890 | TFLOPs: 25.56 | +7: iteration 71440/ 173500 | consumed samples: 18288640 | consumed tokens: 37455134720 | elapsed time per iteration (s): 0.16 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.723373E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.678 | TFLOPs: 25.43 | +7: iteration 71450/ 173500 | consumed samples: 18291200 | consumed tokens: 37460377600 | elapsed time per iteration (s): 0.15 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.727859E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.760 | TFLOPs: 26.12 | +7: iteration 71460/ 173500 | consumed samples: 18293760 | consumed tokens: 37465620480 | elapsed time per iteration (s): 0.15 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.710466E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.559 | TFLOPs: 25.98 | +7: iteration 71470/ 173500 | consumed samples: 18296320 | consumed tokens: 37470863360 | elapsed time per iteration (s): 0.16 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.721573E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.553 | TFLOPs: 25.73 | +7: iteration 71480/ 173500 | consumed samples: 18298880 | consumed tokens: 37476106240 | elapsed time per iteration (s): 0.16 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.724037E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.539 | TFLOPs: 25.46 | +7: iteration 71490/ 173500 | consumed samples: 18301440 | consumed tokens: 37481349120 | elapsed time per iteration (s): 0.16 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.725312E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.670 | TFLOPs: 24.65 | +7: iteration 71500/ 173500 | consumed samples: 18304000 | consumed tokens: 37486592000 | elapsed time per iteration (s): 0.16 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.725584E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.441 | TFLOPs: 25.69 | +7: iteration 71510/ 173500 | consumed samples: 18306560 | consumed tokens: 37491834880 | elapsed time per iteration (s): 0.15 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.724218E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.590 | TFLOPs: 26.12 | +7: iteration 71520/ 173500 | consumed samples: 18309120 | consumed tokens: 37497077760 | elapsed time per iteration (s): 0.15 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.728365E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.721 | TFLOPs: 26.11 | +7: iteration 71530/ 173500 | consumed samples: 18311680 | consumed tokens: 37502320640 | elapsed time per iteration (s): 0.16 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.713309E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.077 | TFLOPs: 25.70 | +7: iteration 71540/ 173500 | consumed samples: 18314240 | consumed tokens: 37507563520 | elapsed time per iteration (s): 0.16 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.726254E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.673 | TFLOPs: 25.31 | +7: iteration 71550/ 173500 | consumed samples: 18316800 | consumed tokens: 37512806400 | elapsed time per iteration (s): 0.15 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.728394E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.346 | TFLOPs: 26.13 | +7: iteration 71560/ 173500 | consumed samples: 18319360 | consumed tokens: 37518049280 | elapsed time per iteration (s): 0.16 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.733699E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.787 | TFLOPs: 25.76 | +7: iteration 71570/ 173500 | consumed samples: 18321920 | consumed tokens: 37523292160 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.717754E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.076 | TFLOPs: 25.27 | +7: iteration 71580/ 173500 | consumed samples: 18324480 | consumed tokens: 37528535040 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.734033E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.482 | TFLOPs: 25.66 | +7: iteration 71590/ 173500 | consumed samples: 18327040 | consumed tokens: 37533777920 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.700894E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.417 | TFLOPs: 25.30 | +7: iteration 71600/ 173500 | consumed samples: 18329600 | consumed tokens: 37539020800 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.723799E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.493 | TFLOPs: 25.44 | +7: iteration 71610/ 173500 | consumed samples: 18332160 | consumed tokens: 37544263680 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.723665E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.996 | TFLOPs: 25.39 | +7: iteration 71620/ 173500 | consumed samples: 18334720 | consumed tokens: 37549506560 | elapsed time per iteration (s): 0.16 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.718177E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.743 | TFLOPs: 25.26 | +7: iteration 71630/ 173500 | consumed samples: 18337280 | consumed tokens: 37554749440 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.726155E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.572 | TFLOPs: 26.06 | +7: iteration 71640/ 173500 | consumed samples: 18339840 | consumed tokens: 37559992320 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.720117E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.839 | TFLOPs: 26.11 | +7: iteration 71650/ 173500 | consumed samples: 18342400 | consumed tokens: 37565235200 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.713702E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.664 | TFLOPs: 26.11 | +7: iteration 71660/ 173500 | consumed samples: 18344960 | consumed tokens: 37570478080 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.719072E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.552 | TFLOPs: 26.10 | +7: iteration 71670/ 173500 | consumed samples: 18347520 | consumed tokens: 37575720960 | elapsed time per iteration (s): 0.16 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.714581E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.959 | TFLOPs: 25.80 | +7: iteration 71680/ 173500 | consumed samples: 18350080 | consumed tokens: 37580963840 | elapsed time per iteration (s): 0.15 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.712248E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.452 | TFLOPs: 26.17 | +7: iteration 71690/ 173500 | consumed samples: 18352640 | consumed tokens: 37586206720 | elapsed time per iteration (s): 0.15 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.716114E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.392 | TFLOPs: 26.15 | +7: iteration 71700/ 173500 | consumed samples: 18355200 | consumed tokens: 37591449600 | elapsed time per iteration (s): 0.15 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.719986E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.624 | TFLOPs: 25.95 | +7: iteration 71710/ 173500 | consumed samples: 18357760 | consumed tokens: 37596692480 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.733298E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.625 | TFLOPs: 25.84 | +7: iteration 71720/ 173500 | consumed samples: 18360320 | consumed tokens: 37601935360 | elapsed time per iteration (s): 0.15 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.719882E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.664 | TFLOPs: 26.15 | +7: iteration 71730/ 173500 | consumed samples: 18362880 | consumed tokens: 37607178240 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.700057E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.750 | TFLOPs: 25.48 | +7: iteration 71740/ 173500 | consumed samples: 18365440 | consumed tokens: 37612421120 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.720949E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.531 | TFLOPs: 25.68 | +7: iteration 71750/ 173500 | consumed samples: 18368000 | consumed tokens: 37617664000 | elapsed time per iteration (s): 0.16 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.715301E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.460 | TFLOPs: 25.08 | +7: iteration 71760/ 173500 | consumed samples: 18370560 | consumed tokens: 37622906880 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.717993E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.466 | TFLOPs: 25.57 | +7: iteration 71770/ 173500 | consumed samples: 18373120 | consumed tokens: 37628149760 | elapsed time per iteration (s): 0.15 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.720926E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.431 | TFLOPs: 26.18 | +7: iteration 71780/ 173500 | consumed samples: 18375680 | consumed tokens: 37633392640 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.705959E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.289 | TFLOPs: 25.77 | +7: iteration 71790/ 173500 | consumed samples: 18378240 | consumed tokens: 37638635520 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.722739E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.621 | TFLOPs: 25.53 | +7: iteration 71800/ 173500 | consumed samples: 18380800 | consumed tokens: 37643878400 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.717879E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.889 | TFLOPs: 25.84 | +7: iteration 71810/ 173500 | consumed samples: 18383360 | consumed tokens: 37649121280 | elapsed time per iteration (s): 0.16 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.724032E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.943 | TFLOPs: 25.73 | +7: iteration 71820/ 173500 | consumed samples: 18385920 | consumed tokens: 37654364160 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.729519E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.681 | TFLOPs: 25.78 | +7: iteration 71830/ 173500 | consumed samples: 18388480 | consumed tokens: 37659607040 | elapsed time per iteration (s): 0.15 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.734295E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.394 | TFLOPs: 26.12 | +7: iteration 71840/ 173500 | consumed samples: 18391040 | consumed tokens: 37664849920 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.721377E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.490 | TFLOPs: 25.70 | +7: iteration 71850/ 173500 | consumed samples: 18393600 | consumed tokens: 37670092800 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.738360E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.157 | TFLOPs: 25.55 | +7: iteration 71860/ 173500 | consumed samples: 18396160 | consumed tokens: 37675335680 | elapsed time per iteration (s): 0.15 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.727034E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.985 | TFLOPs: 25.99 | +7: iteration 71870/ 173500 | consumed samples: 18398720 | consumed tokens: 37680578560 | elapsed time per iteration (s): 0.16 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.721797E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.929 | TFLOPs: 25.23 | +7: iteration 71880/ 173500 | consumed samples: 18401280 | consumed tokens: 37685821440 | elapsed time per iteration (s): 0.16 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.716419E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.788 | TFLOPs: 25.79 | +7: iteration 71890/ 173500 | consumed samples: 18403840 | consumed tokens: 37691064320 | elapsed time per iteration (s): 0.15 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.718024E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.948 | TFLOPs: 26.25 | +7: iteration 71900/ 173500 | consumed samples: 18406400 | consumed tokens: 37696307200 | elapsed time per iteration (s): 0.15 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.717481E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.026 | TFLOPs: 26.25 | +7: iteration 71910/ 173500 | consumed samples: 18408960 | consumed tokens: 37701550080 | elapsed time per iteration (s): 0.16 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.725941E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.455 | TFLOPs: 25.88 | +7: iteration 71920/ 173500 | consumed samples: 18411520 | consumed tokens: 37706792960 | elapsed time per iteration (s): 0.16 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.711944E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.647 | TFLOPs: 25.82 | +7: iteration 71930/ 173500 | consumed samples: 18414080 | consumed tokens: 37712035840 | elapsed time per iteration (s): 0.15 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.723289E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.045 | TFLOPs: 26.14 | +7: iteration 71940/ 173500 | consumed samples: 18416640 | consumed tokens: 37717278720 | elapsed time per iteration (s): 0.15 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.720821E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.077 | TFLOPs: 26.10 | +7: iteration 71950/ 173500 | consumed samples: 18419200 | consumed tokens: 37722521600 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.711665E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.249 | TFLOPs: 25.58 | +7: iteration 71960/ 173500 | consumed samples: 18421760 | consumed tokens: 37727764480 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.716580E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.484 | TFLOPs: 24.88 | +7: iteration 71970/ 173500 | consumed samples: 18424320 | consumed tokens: 37733007360 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.726325E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.049 | TFLOPs: 25.78 | +7: iteration 71980/ 173500 | consumed samples: 18426880 | consumed tokens: 37738250240 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.712010E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.932 | TFLOPs: 25.40 | +7: iteration 71990/ 173500 | consumed samples: 18429440 | consumed tokens: 37743493120 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.717577E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.968 | TFLOPs: 25.69 | +0: [2023-03-17 03:22:11,978] [INFO] [logging.py:68:log_dist] [Rank 0] step=72000, skipped=0, lr=[0.0001353602432066091, 0.0001353602432066091, 0.0001353602432066091], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 72000/ 173500 | consumed samples: 18432000 | consumed tokens: 37748736000 | elapsed time per iteration (s): 0.16 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.718382E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.242 | TFLOPs: 25.36 | +0: steps: 72000 loss: 3.7187 iter time (s): 0.155 samples/sec: 1650.333 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 72000 | lm loss value: 3.815428E+00 | lm loss PPL: 4.539619E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 72000 to checkpoints_44m91b100m +0: [2023-03-17 03:22:12,051] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step72000 is begin to save! +0: [2023-03-17 03:22:12,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:22:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:22:12,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:22:12,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:22:12,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:22:12,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:22:12,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:22:12,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:22:12,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:22:12,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:22:12,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:22:12,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:22:12,158] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:22:12,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:22:12,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:22:12,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:22:12,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:22:12,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:22:12,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:22:12,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:22:12,183] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step72000/mp_rank_00_model_states.pt +0: [2023-03-17 03:22:12,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:22:12,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:22:12,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +4: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +5: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +3: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +7: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +6: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +1: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +2: [2023-03-17 03:22:12,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:22:12,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step72000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:22:12,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step72000 is ready now! +0: successfully saved checkpoint at iteration 72000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.71 +7: iteration 72010/ 173500 | consumed samples: 18434560 | consumed tokens: 37753978880 | elapsed time per iteration (s): 0.18 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.712466E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.438 | TFLOPs: 22.50 | +7: iteration 72020/ 173500 | consumed samples: 18437120 | consumed tokens: 37759221760 | elapsed time per iteration (s): 0.15 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.721773E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.324 | TFLOPs: 26.24 | +7: iteration 72030/ 173500 | consumed samples: 18439680 | consumed tokens: 37764464640 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.709903E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.878 | TFLOPs: 25.87 | +7: iteration 72040/ 173500 | consumed samples: 18442240 | consumed tokens: 37769707520 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.708257E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.087 | TFLOPs: 25.63 | +7: iteration 72050/ 173500 | consumed samples: 18444800 | consumed tokens: 37774950400 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.717007E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.239 | TFLOPs: 25.75 | +7: iteration 72060/ 173500 | consumed samples: 18447360 | consumed tokens: 37780193280 | elapsed time per iteration (s): 0.16 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.711775E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.046 | TFLOPs: 25.59 | +7: iteration 72070/ 173500 | consumed samples: 18449920 | consumed tokens: 37785436160 | elapsed time per iteration (s): 0.16 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.717989E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.218 | TFLOPs: 25.80 | +7: iteration 72080/ 173500 | consumed samples: 18452480 | consumed tokens: 37790679040 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.723893E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.508 | TFLOPs: 25.93 | +7: iteration 72090/ 173500 | consumed samples: 18455040 | consumed tokens: 37795921920 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.714880E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.012 | TFLOPs: 25.94 | +7: iteration 72100/ 173500 | consumed samples: 18457600 | consumed tokens: 37801164800 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.727835E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.142 | TFLOPs: 26.25 | +7: iteration 72110/ 173500 | consumed samples: 18460160 | consumed tokens: 37806407680 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.734780E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.620 | TFLOPs: 26.04 | +7: iteration 72120/ 173500 | consumed samples: 18462720 | consumed tokens: 37811650560 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.733843E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.380 | TFLOPs: 26.29 | +7: iteration 72130/ 173500 | consumed samples: 18465280 | consumed tokens: 37816893440 | elapsed time per iteration (s): 0.15 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.723811E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.299 | TFLOPs: 26.01 | +7: iteration 72140/ 173500 | consumed samples: 18467840 | consumed tokens: 37822136320 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.729771E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.160 | TFLOPs: 24.94 | +7: iteration 72150/ 173500 | consumed samples: 18470400 | consumed tokens: 37827379200 | elapsed time per iteration (s): 0.15 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.728818E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.942 | TFLOPs: 25.94 | +7: iteration 72160/ 173500 | consumed samples: 18472960 | consumed tokens: 37832622080 | elapsed time per iteration (s): 0.15 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.720485E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.153 | TFLOPs: 26.13 | +7: iteration 72170/ 173500 | consumed samples: 18475520 | consumed tokens: 37837864960 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.721818E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.759 | TFLOPs: 25.20 | +7: iteration 72180/ 173500 | consumed samples: 18478080 | consumed tokens: 37843107840 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.727229E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.264 | TFLOPs: 25.00 | +7: iteration 72190/ 173500 | consumed samples: 18480640 | consumed tokens: 37848350720 | elapsed time per iteration (s): 0.16 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.731738E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.952 | TFLOPs: 25.88 | +7: iteration 72200/ 173500 | consumed samples: 18483200 | consumed tokens: 37853593600 | elapsed time per iteration (s): 0.16 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.729215E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.387 | TFLOPs: 25.79 | +7: iteration 72210/ 173500 | consumed samples: 18485760 | consumed tokens: 37858836480 | elapsed time per iteration (s): 0.15 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.729902E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.080 | TFLOPs: 26.18 | +7: iteration 72220/ 173500 | consumed samples: 18488320 | consumed tokens: 37864079360 | elapsed time per iteration (s): 0.15 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.723035E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.231 | TFLOPs: 25.93 | +7: iteration 72230/ 173500 | consumed samples: 18490880 | consumed tokens: 37869322240 | elapsed time per iteration (s): 0.15 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.705663E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.726 | TFLOPs: 25.97 | +7: iteration 72240/ 173500 | consumed samples: 18493440 | consumed tokens: 37874565120 | elapsed time per iteration (s): 0.16 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.720173E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.282 | TFLOPs: 25.80 | +7: iteration 72250/ 173500 | consumed samples: 18496000 | consumed tokens: 37879808000 | elapsed time per iteration (s): 0.15 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.721376E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.358 | TFLOPs: 25.98 | +7: iteration 72260/ 173500 | consumed samples: 18498560 | consumed tokens: 37885050880 | elapsed time per iteration (s): 0.15 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.715578E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.583 | TFLOPs: 26.06 | +7: iteration 72270/ 173500 | consumed samples: 18501120 | consumed tokens: 37890293760 | elapsed time per iteration (s): 0.15 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.709494E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.732 | TFLOPs: 25.93 | +7: iteration 72280/ 173500 | consumed samples: 18503680 | consumed tokens: 37895536640 | elapsed time per iteration (s): 0.15 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.742684E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.138 | TFLOPs: 25.91 | +7: iteration 72290/ 173500 | consumed samples: 18506240 | consumed tokens: 37900779520 | elapsed time per iteration (s): 0.16 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.717281E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.513 | TFLOPs: 25.65 | +7: iteration 72300/ 173500 | consumed samples: 18508800 | consumed tokens: 37906022400 | elapsed time per iteration (s): 0.15 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.718411E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.738 | TFLOPs: 25.97 | +7: iteration 72310/ 173500 | consumed samples: 18511360 | consumed tokens: 37911265280 | elapsed time per iteration (s): 0.15 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.727493E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.371 | TFLOPs: 26.20 | +7: iteration 72320/ 173500 | consumed samples: 18513920 | consumed tokens: 37916508160 | elapsed time per iteration (s): 0.16 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.721288E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.834 | TFLOPs: 25.37 | +7: iteration 72330/ 173500 | consumed samples: 18516480 | consumed tokens: 37921751040 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.729230E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.444 | TFLOPs: 25.46 | +7: iteration 72340/ 173500 | consumed samples: 18519040 | consumed tokens: 37926993920 | elapsed time per iteration (s): 0.15 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.727871E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.414 | TFLOPs: 25.95 | +7: iteration 72350/ 173500 | consumed samples: 18521600 | consumed tokens: 37932236800 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.713924E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.839 | TFLOPs: 25.65 | +7: iteration 72360/ 173500 | consumed samples: 18524160 | consumed tokens: 37937479680 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.717057E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.674 | TFLOPs: 25.64 | +7: iteration 72370/ 173500 | consumed samples: 18526720 | consumed tokens: 37942722560 | elapsed time per iteration (s): 0.16 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.711934E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.765 | TFLOPs: 25.61 | +7: iteration 72380/ 173500 | consumed samples: 18529280 | consumed tokens: 37947965440 | elapsed time per iteration (s): 0.15 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.727951E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.611 | TFLOPs: 26.09 | +7: iteration 72390/ 173500 | consumed samples: 18531840 | consumed tokens: 37953208320 | elapsed time per iteration (s): 0.16 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.712140E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.593 | TFLOPs: 25.78 | +7: iteration 72400/ 173500 | consumed samples: 18534400 | consumed tokens: 37958451200 | elapsed time per iteration (s): 0.15 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.717691E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.589 | TFLOPs: 26.10 | +7: iteration 72410/ 173500 | consumed samples: 18536960 | consumed tokens: 37963694080 | elapsed time per iteration (s): 0.15 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.733679E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.434 | TFLOPs: 26.32 | +7: iteration 72420/ 173500 | consumed samples: 18539520 | consumed tokens: 37968936960 | elapsed time per iteration (s): 0.15 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.725750E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.452 | TFLOPs: 26.31 | +7: iteration 72430/ 173500 | consumed samples: 18542080 | consumed tokens: 37974179840 | elapsed time per iteration (s): 0.15 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.713119E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.786 | TFLOPs: 26.17 | +7: iteration 72440/ 173500 | consumed samples: 18544640 | consumed tokens: 37979422720 | elapsed time per iteration (s): 0.15 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.718111E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.053 | TFLOPs: 26.11 | +7: iteration 72450/ 173500 | consumed samples: 18547200 | consumed tokens: 37984665600 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.710755E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.759 | TFLOPs: 25.92 | +7: iteration 72460/ 173500 | consumed samples: 18549760 | consumed tokens: 37989908480 | elapsed time per iteration (s): 0.16 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.722353E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.354 | TFLOPs: 25.85 | +7: iteration 72470/ 173500 | consumed samples: 18552320 | consumed tokens: 37995151360 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.724044E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.739 | TFLOPs: 26.23 | +7: iteration 72480/ 173500 | consumed samples: 18554880 | consumed tokens: 38000394240 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.711671E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.067 | TFLOPs: 26.25 | +7: iteration 72490/ 173500 | consumed samples: 18557440 | consumed tokens: 38005637120 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.723613E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.489 | TFLOPs: 26.29 | +7: iteration 72500/ 173500 | consumed samples: 18560000 | consumed tokens: 38010880000 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.733749E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.962 | TFLOPs: 26.20 | +7: iteration 72510/ 173500 | consumed samples: 18562560 | consumed tokens: 38016122880 | elapsed time per iteration (s): 0.15 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.734647E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.962 | TFLOPs: 26.24 | +7: iteration 72520/ 173500 | consumed samples: 18565120 | consumed tokens: 38021365760 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.711481E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.218 | TFLOPs: 26.27 | +7: iteration 72530/ 173500 | consumed samples: 18567680 | consumed tokens: 38026608640 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.712712E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.763 | TFLOPs: 26.25 | +7: iteration 72540/ 173500 | consumed samples: 18570240 | consumed tokens: 38031851520 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.716104E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.277 | TFLOPs: 26.27 | +7: iteration 72550/ 173500 | consumed samples: 18572800 | consumed tokens: 38037094400 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.712992E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.814 | TFLOPs: 26.22 | +7: iteration 72560/ 173500 | consumed samples: 18575360 | consumed tokens: 38042337280 | elapsed time per iteration (s): 0.15 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.719724E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.011 | TFLOPs: 26.24 | +7: iteration 72570/ 173500 | consumed samples: 18577920 | consumed tokens: 38047580160 | elapsed time per iteration (s): 0.16 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.719866E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.102 | TFLOPs: 25.27 | +7: iteration 72580/ 173500 | consumed samples: 18580480 | consumed tokens: 38052823040 | elapsed time per iteration (s): 0.16 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.717049E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.076 | TFLOPs: 25.74 | +7: iteration 72590/ 173500 | consumed samples: 18583040 | consumed tokens: 38058065920 | elapsed time per iteration (s): 0.15 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.714688E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.496 | TFLOPs: 26.24 | +7: iteration 72600/ 173500 | consumed samples: 18585600 | consumed tokens: 38063308800 | elapsed time per iteration (s): 0.16 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.704424E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.643 | TFLOPs: 25.78 | +7: iteration 72610/ 173500 | consumed samples: 18588160 | consumed tokens: 38068551680 | elapsed time per iteration (s): 0.15 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.722379E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.379 | TFLOPs: 26.15 | +7: iteration 72620/ 173500 | consumed samples: 18590720 | consumed tokens: 38073794560 | elapsed time per iteration (s): 0.16 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.729739E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.381 | TFLOPs: 25.24 | +7: iteration 72630/ 173500 | consumed samples: 18593280 | consumed tokens: 38079037440 | elapsed time per iteration (s): 0.15 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.706464E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.149 | TFLOPs: 26.18 | +7: iteration 72640/ 173500 | consumed samples: 18595840 | consumed tokens: 38084280320 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.701976E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.647 | TFLOPs: 25.38 | +7: iteration 72650/ 173500 | consumed samples: 18598400 | consumed tokens: 38089523200 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.718929E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.068 | TFLOPs: 25.63 | +7: iteration 72660/ 173500 | consumed samples: 18600960 | consumed tokens: 38094766080 | elapsed time per iteration (s): 0.15 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.725598E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.674 | TFLOPs: 26.18 | +7: iteration 72670/ 173500 | consumed samples: 18603520 | consumed tokens: 38100008960 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.699630E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.094 | TFLOPs: 25.83 | +7: iteration 72680/ 173500 | consumed samples: 18606080 | consumed tokens: 38105251840 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.713933E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.338 | TFLOPs: 25.87 | +7: iteration 72690/ 173500 | consumed samples: 18608640 | consumed tokens: 38110494720 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.730271E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.558 | TFLOPs: 25.73 | +7: iteration 72700/ 173500 | consumed samples: 18611200 | consumed tokens: 38115737600 | elapsed time per iteration (s): 0.16 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.721812E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.207 | TFLOPs: 25.74 | +7: iteration 72710/ 173500 | consumed samples: 18613760 | consumed tokens: 38120980480 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.698897E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.881 | TFLOPs: 25.58 | +7: iteration 72720/ 173500 | consumed samples: 18616320 | consumed tokens: 38126223360 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.704087E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.358 | TFLOPs: 25.79 | +7: iteration 72730/ 173500 | consumed samples: 18618880 | consumed tokens: 38131466240 | elapsed time per iteration (s): 0.15 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.725026E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.003 | TFLOPs: 26.17 | +7: iteration 72740/ 173500 | consumed samples: 18621440 | consumed tokens: 38136709120 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.714938E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.532 | TFLOPs: 25.15 | +7: iteration 72750/ 173500 | consumed samples: 18624000 | consumed tokens: 38141952000 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.726043E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.484 | TFLOPs: 25.30 | +7: iteration 72760/ 173500 | consumed samples: 18626560 | consumed tokens: 38147194880 | elapsed time per iteration (s): 0.16 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.724170E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.416 | TFLOPs: 25.32 | +7: iteration 72770/ 173500 | consumed samples: 18629120 | consumed tokens: 38152437760 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.708749E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.176 | TFLOPs: 25.66 | +7: iteration 72780/ 173500 | consumed samples: 18631680 | consumed tokens: 38157680640 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.724390E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.314 | TFLOPs: 25.52 | +7: iteration 72790/ 173500 | consumed samples: 18634240 | consumed tokens: 38162923520 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.721259E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.681 | TFLOPs: 25.60 | +7: iteration 72800/ 173500 | consumed samples: 18636800 | consumed tokens: 38168166400 | elapsed time per iteration (s): 0.15 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.719740E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.047 | TFLOPs: 26.13 | +7: iteration 72810/ 173500 | consumed samples: 18639360 | consumed tokens: 38173409280 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.729028E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.145 | TFLOPs: 25.06 | +7: iteration 72820/ 173500 | consumed samples: 18641920 | consumed tokens: 38178652160 | elapsed time per iteration (s): 0.16 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.713305E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.454 | TFLOPs: 24.68 | +7: iteration 72830/ 173500 | consumed samples: 18644480 | consumed tokens: 38183895040 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.691926E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.013 | TFLOPs: 24.95 | +7: iteration 72840/ 173500 | consumed samples: 18647040 | consumed tokens: 38189137920 | elapsed time per iteration (s): 0.15 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.722937E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.610 | TFLOPs: 25.96 | +7: iteration 72850/ 173500 | consumed samples: 18649600 | consumed tokens: 38194380800 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.715609E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.412 | TFLOPs: 25.32 | +7: iteration 72860/ 173500 | consumed samples: 18652160 | consumed tokens: 38199623680 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.720185E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.478 | TFLOPs: 24.72 | +7: iteration 72870/ 173500 | consumed samples: 18654720 | consumed tokens: 38204866560 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.730784E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.758 | TFLOPs: 25.51 | +7: iteration 72880/ 173500 | consumed samples: 18657280 | consumed tokens: 38210109440 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.724760E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.233 | TFLOPs: 25.35 | +7: iteration 72890/ 173500 | consumed samples: 18659840 | consumed tokens: 38215352320 | elapsed time per iteration (s): 0.16 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.724154E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.911 | TFLOPs: 25.39 | +7: iteration 72900/ 173500 | consumed samples: 18662400 | consumed tokens: 38220595200 | elapsed time per iteration (s): 0.15 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.724984E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.588 | TFLOPs: 25.95 | +7: iteration 72910/ 173500 | consumed samples: 18664960 | consumed tokens: 38225838080 | elapsed time per iteration (s): 0.15 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.710945E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.789 | TFLOPs: 26.19 | +7: iteration 72920/ 173500 | consumed samples: 18667520 | consumed tokens: 38231080960 | elapsed time per iteration (s): 0.17 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.711310E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.817 | TFLOPs: 24.05 | +7: iteration 72930/ 173500 | consumed samples: 18670080 | consumed tokens: 38236323840 | elapsed time per iteration (s): 0.16 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.711491E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.560 | TFLOPs: 24.77 | +7: iteration 72940/ 173500 | consumed samples: 18672640 | consumed tokens: 38241566720 | elapsed time per iteration (s): 0.16 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.720651E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.936 | TFLOPs: 25.70 | +7: iteration 72950/ 173500 | consumed samples: 18675200 | consumed tokens: 38246809600 | elapsed time per iteration (s): 0.16 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.706002E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.464 | TFLOPs: 25.65 | +7: iteration 72960/ 173500 | consumed samples: 18677760 | consumed tokens: 38252052480 | elapsed time per iteration (s): 0.16 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.708514E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.406 | TFLOPs: 25.66 | +7: iteration 72970/ 173500 | consumed samples: 18680320 | consumed tokens: 38257295360 | elapsed time per iteration (s): 0.15 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.718131E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.792 | TFLOPs: 25.95 | +7: iteration 72980/ 173500 | consumed samples: 18682880 | consumed tokens: 38262538240 | elapsed time per iteration (s): 0.16 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.716298E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.727 | TFLOPs: 25.40 | +7: iteration 72990/ 173500 | consumed samples: 18685440 | consumed tokens: 38267781120 | elapsed time per iteration (s): 0.16 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.722160E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.663 | TFLOPs: 25.43 | +7: iteration 73000/ 173500 | consumed samples: 18688000 | consumed tokens: 38273024000 | elapsed time per iteration (s): 0.16 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.737289E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.245 | TFLOPs: 25.68 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 73000 | lm loss value: 3.869070E+00 | lm loss PPL: 4.789780E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 73000 to checkpoints_44m91b100m +0: [2023-03-17 03:24:48,101] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step73000 is begin to save! +0: [2023-03-17 03:24:48,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:24:48,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:24:48,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:24:48,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:24:48,173] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:24:48,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:24:48,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:24:48,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:24:48,189] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:24:48,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:24:48,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:24:48,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:24:48,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:24:48,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:24:48,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:24:48,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:24:48,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:24:48,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:24:48,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:24:48,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:24:48,231] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step73000/mp_rank_00_model_states.pt +0: [2023-03-17 03:24:48,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:24:48,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:24:48,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:24:48,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,265] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,265] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,267] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,268] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,268] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:24:48,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +2: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +3: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +1: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:24:48,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +6: [2023-03-17 03:24:48,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:24:48,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:24:48,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +4: [2023-03-17 03:24:48,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:24:48,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:24:48,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +7: [2023-03-17 03:24:48,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:24:48,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step73000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:24:48,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step73000 is ready now! +0: successfully saved checkpoint at iteration 73000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.10 +7: iteration 73010/ 173500 | consumed samples: 18690560 | consumed tokens: 38278266880 | elapsed time per iteration (s): 0.18 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.726006E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.120 | TFLOPs: 22.47 | +7: iteration 73020/ 173500 | consumed samples: 18693120 | consumed tokens: 38283509760 | elapsed time per iteration (s): 0.15 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.712529E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.167 | TFLOPs: 26.16 | +7: iteration 73030/ 173500 | consumed samples: 18695680 | consumed tokens: 38288752640 | elapsed time per iteration (s): 0.16 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.714115E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.394 | TFLOPs: 25.46 | +7: iteration 73040/ 173500 | consumed samples: 18698240 | consumed tokens: 38293995520 | elapsed time per iteration (s): 0.16 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.718862E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.471 | TFLOPs: 25.70 | +7: iteration 73050/ 173500 | consumed samples: 18700800 | consumed tokens: 38299238400 | elapsed time per iteration (s): 0.15 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.726548E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.155 | TFLOPs: 26.04 | +7: iteration 73060/ 173500 | consumed samples: 18703360 | consumed tokens: 38304481280 | elapsed time per iteration (s): 0.17 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.713214E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.709 | TFLOPs: 23.55 | +7: iteration 73070/ 173500 | consumed samples: 18705920 | consumed tokens: 38309724160 | elapsed time per iteration (s): 0.16 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.716740E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.753 | TFLOPs: 25.53 | +7: iteration 73080/ 173500 | consumed samples: 18708480 | consumed tokens: 38314967040 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.710363E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.426 | TFLOPs: 25.74 | +7: iteration 73090/ 173500 | consumed samples: 18711040 | consumed tokens: 38320209920 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.716250E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.511 | TFLOPs: 25.43 | +7: iteration 73100/ 173500 | consumed samples: 18713600 | consumed tokens: 38325452800 | elapsed time per iteration (s): 0.15 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.715988E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.480 | TFLOPs: 26.07 | +7: iteration 73110/ 173500 | consumed samples: 18716160 | consumed tokens: 38330695680 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.714916E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.574 | TFLOPs: 25.56 | +7: iteration 73120/ 173500 | consumed samples: 18718720 | consumed tokens: 38335938560 | elapsed time per iteration (s): 0.15 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.719887E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.957 | TFLOPs: 26.19 | +7: iteration 73130/ 173500 | consumed samples: 18721280 | consumed tokens: 38341181440 | elapsed time per iteration (s): 0.16 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.720782E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.875 | TFLOPs: 25.59 | +7: iteration 73140/ 173500 | consumed samples: 18723840 | consumed tokens: 38346424320 | elapsed time per iteration (s): 0.15 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.717525E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.790 | TFLOPs: 25.94 | +7: iteration 73150/ 173500 | consumed samples: 18726400 | consumed tokens: 38351667200 | elapsed time per iteration (s): 0.15 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.714802E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.319 | TFLOPs: 26.16 | +7: iteration 73160/ 173500 | consumed samples: 18728960 | consumed tokens: 38356910080 | elapsed time per iteration (s): 0.16 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.721787E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.891 | TFLOPs: 25.80 | +7: iteration 73170/ 173500 | consumed samples: 18731520 | consumed tokens: 38362152960 | elapsed time per iteration (s): 0.15 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.720821E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.227 | TFLOPs: 26.16 | +7: iteration 73180/ 173500 | consumed samples: 18734080 | consumed tokens: 38367395840 | elapsed time per iteration (s): 0.16 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.712415E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.969 | TFLOPs: 25.41 | +7: iteration 73190/ 173500 | consumed samples: 18736640 | consumed tokens: 38372638720 | elapsed time per iteration (s): 0.16 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.732046E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.254 | TFLOPs: 25.88 | +7: iteration 73200/ 173500 | consumed samples: 18739200 | consumed tokens: 38377881600 | elapsed time per iteration (s): 0.16 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.716359E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.376 | TFLOPs: 25.62 | +7: iteration 73210/ 173500 | consumed samples: 18741760 | consumed tokens: 38383124480 | elapsed time per iteration (s): 0.16 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.725610E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.584 | TFLOPs: 25.32 | +7: iteration 73220/ 173500 | consumed samples: 18744320 | consumed tokens: 38388367360 | elapsed time per iteration (s): 0.15 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.720411E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.011 | TFLOPs: 26.11 | +7: iteration 73230/ 173500 | consumed samples: 18746880 | consumed tokens: 38393610240 | elapsed time per iteration (s): 0.16 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.718230E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.125 | TFLOPs: 25.25 | +7: iteration 73240/ 173500 | consumed samples: 18749440 | consumed tokens: 38398853120 | elapsed time per iteration (s): 0.16 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.712661E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.279 | TFLOPs: 25.69 | +7: iteration 73250/ 173500 | consumed samples: 18752000 | consumed tokens: 38404096000 | elapsed time per iteration (s): 0.16 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.712595E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.481 | TFLOPs: 25.66 | +7: iteration 73260/ 173500 | consumed samples: 18754560 | consumed tokens: 38409338880 | elapsed time per iteration (s): 0.15 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.707426E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.003 | TFLOPs: 26.13 | +7: iteration 73270/ 173500 | consumed samples: 18757120 | consumed tokens: 38414581760 | elapsed time per iteration (s): 0.15 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.712484E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.238 | TFLOPs: 26.13 | +7: iteration 73280/ 173500 | consumed samples: 18759680 | consumed tokens: 38419824640 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.721833E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.192 | TFLOPs: 25.64 | +7: iteration 73290/ 173500 | consumed samples: 18762240 | consumed tokens: 38425067520 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.729940E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.846 | TFLOPs: 25.37 | +7: iteration 73300/ 173500 | consumed samples: 18764800 | consumed tokens: 38430310400 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.726135E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.830 | TFLOPs: 25.62 | +7: iteration 73310/ 173500 | consumed samples: 18767360 | consumed tokens: 38435553280 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.732510E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.998 | TFLOPs: 24.98 | +7: iteration 73320/ 173500 | consumed samples: 18769920 | consumed tokens: 38440796160 | elapsed time per iteration (s): 0.16 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.715691E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.231 | TFLOPs: 25.03 | +7: iteration 73330/ 173500 | consumed samples: 18772480 | consumed tokens: 38446039040 | elapsed time per iteration (s): 0.17 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.701712E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.648 | TFLOPs: 23.80 | +7: iteration 73340/ 173500 | consumed samples: 18775040 | consumed tokens: 38451281920 | elapsed time per iteration (s): 0.17 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.716698E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.313 | TFLOPs: 24.30 | +7: iteration 73350/ 173500 | consumed samples: 18777600 | consumed tokens: 38456524800 | elapsed time per iteration (s): 0.17 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.726614E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.150 | TFLOPs: 24.29 | +7: iteration 73360/ 173500 | consumed samples: 18780160 | consumed tokens: 38461767680 | elapsed time per iteration (s): 0.16 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.710887E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.316 | TFLOPs: 25.74 | +7: iteration 73370/ 173500 | consumed samples: 18782720 | consumed tokens: 38467010560 | elapsed time per iteration (s): 0.15 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.726100E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.113 | TFLOPs: 26.18 | +7: iteration 73380/ 173500 | consumed samples: 18785280 | consumed tokens: 38472253440 | elapsed time per iteration (s): 0.16 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.708053E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.500 | TFLOPs: 25.13 | +7: iteration 73390/ 173500 | consumed samples: 18787840 | consumed tokens: 38477496320 | elapsed time per iteration (s): 0.15 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.724784E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.063 | TFLOPs: 25.96 | +7: iteration 73400/ 173500 | consumed samples: 18790400 | consumed tokens: 38482739200 | elapsed time per iteration (s): 0.16 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.720084E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.090 | TFLOPs: 25.42 | +7: iteration 73410/ 173500 | consumed samples: 18792960 | consumed tokens: 38487982080 | elapsed time per iteration (s): 0.15 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.725987E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.365 | TFLOPs: 26.24 | +7: iteration 73420/ 173500 | consumed samples: 18795520 | consumed tokens: 38493224960 | elapsed time per iteration (s): 0.16 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.718769E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.752 | TFLOPs: 25.76 | +7: iteration 73430/ 173500 | consumed samples: 18798080 | consumed tokens: 38498467840 | elapsed time per iteration (s): 0.15 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.715340E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.696 | TFLOPs: 26.20 | +7: iteration 73440/ 173500 | consumed samples: 18800640 | consumed tokens: 38503710720 | elapsed time per iteration (s): 0.16 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.725066E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.917 | TFLOPs: 25.55 | +7: iteration 73450/ 173500 | consumed samples: 18803200 | consumed tokens: 38508953600 | elapsed time per iteration (s): 0.15 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.713747E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.926 | TFLOPs: 26.06 | +7: iteration 73460/ 173500 | consumed samples: 18805760 | consumed tokens: 38514196480 | elapsed time per iteration (s): 0.15 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.716883E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.779 | TFLOPs: 26.09 | +7: iteration 73470/ 173500 | consumed samples: 18808320 | consumed tokens: 38519439360 | elapsed time per iteration (s): 0.15 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.704390E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.510 | TFLOPs: 26.24 | +7: iteration 73480/ 173500 | consumed samples: 18810880 | consumed tokens: 38524682240 | elapsed time per iteration (s): 0.16 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.719781E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.433 | TFLOPs: 24.53 | +7: iteration 73490/ 173500 | consumed samples: 18813440 | consumed tokens: 38529925120 | elapsed time per iteration (s): 0.16 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.723819E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.657 | TFLOPs: 24.54 | +7: iteration 73500/ 173500 | consumed samples: 18816000 | consumed tokens: 38535168000 | elapsed time per iteration (s): 0.17 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.714392E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.550 | TFLOPs: 23.99 | +7: iteration 73510/ 173500 | consumed samples: 18818560 | consumed tokens: 38540410880 | elapsed time per iteration (s): 0.16 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.731408E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.476 | TFLOPs: 24.49 | +7: iteration 73520/ 173500 | consumed samples: 18821120 | consumed tokens: 38545653760 | elapsed time per iteration (s): 0.16 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.716168E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.113 | TFLOPs: 25.16 | +7: iteration 73530/ 173500 | consumed samples: 18823680 | consumed tokens: 38550896640 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.713611E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.954 | TFLOPs: 25.78 | +7: iteration 73540/ 173500 | consumed samples: 18826240 | consumed tokens: 38556139520 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.725072E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.746 | TFLOPs: 25.75 | +7: iteration 73550/ 173500 | consumed samples: 18828800 | consumed tokens: 38561382400 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.713411E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.763 | TFLOPs: 25.87 | +7: iteration 73560/ 173500 | consumed samples: 18831360 | consumed tokens: 38566625280 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.715833E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.102 | TFLOPs: 25.77 | +7: iteration 73570/ 173500 | consumed samples: 18833920 | consumed tokens: 38571868160 | elapsed time per iteration (s): 0.15 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.720177E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.700 | TFLOPs: 25.93 | +7: iteration 73580/ 173500 | consumed samples: 18836480 | consumed tokens: 38577111040 | elapsed time per iteration (s): 0.16 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.724513E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.654 | TFLOPs: 25.62 | +7: iteration 73590/ 173500 | consumed samples: 18839040 | consumed tokens: 38582353920 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.714390E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.962 | TFLOPs: 24.87 | +7: iteration 73600/ 173500 | consumed samples: 18841600 | consumed tokens: 38587596800 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.710979E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.801 | TFLOPs: 25.42 | +7: iteration 73610/ 173500 | consumed samples: 18844160 | consumed tokens: 38592839680 | elapsed time per iteration (s): 0.15 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.733060E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.475 | TFLOPs: 26.04 | +7: iteration 73620/ 173500 | consumed samples: 18846720 | consumed tokens: 38598082560 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.710930E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.261 | TFLOPs: 25.71 | +7: iteration 73630/ 173500 | consumed samples: 18849280 | consumed tokens: 38603325440 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.721048E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.557 | TFLOPs: 25.15 | +7: iteration 73640/ 173500 | consumed samples: 18851840 | consumed tokens: 38608568320 | elapsed time per iteration (s): 0.16 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.700182E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.042 | TFLOPs: 25.48 | +7: iteration 73650/ 173500 | consumed samples: 18854400 | consumed tokens: 38613811200 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.712627E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.064 | TFLOPs: 25.39 | +7: iteration 73660/ 173500 | consumed samples: 18856960 | consumed tokens: 38619054080 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.724510E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.602 | TFLOPs: 24.94 | +7: iteration 73670/ 173500 | consumed samples: 18859520 | consumed tokens: 38624296960 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.715854E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.193 | TFLOPs: 25.66 | +7: iteration 73680/ 173500 | consumed samples: 18862080 | consumed tokens: 38629539840 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.727303E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.789 | TFLOPs: 25.68 | +7: iteration 73690/ 173500 | consumed samples: 18864640 | consumed tokens: 38634782720 | elapsed time per iteration (s): 0.15 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.729911E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.745 | TFLOPs: 26.08 | +7: iteration 73700/ 173500 | consumed samples: 18867200 | consumed tokens: 38640025600 | elapsed time per iteration (s): 0.16 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.718366E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.822 | TFLOPs: 25.73 | +7: iteration 73710/ 173500 | consumed samples: 18869760 | consumed tokens: 38645268480 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.706407E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.128 | TFLOPs: 25.24 | +7: iteration 73720/ 173500 | consumed samples: 18872320 | consumed tokens: 38650511360 | elapsed time per iteration (s): 0.15 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.727213E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.570 | TFLOPs: 26.07 | +7: iteration 73730/ 173500 | consumed samples: 18874880 | consumed tokens: 38655754240 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.733283E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.415 | TFLOPs: 25.43 | +7: iteration 73740/ 173500 | consumed samples: 18877440 | consumed tokens: 38660997120 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.711553E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.871 | TFLOPs: 25.69 | +7: iteration 73750/ 173500 | consumed samples: 18880000 | consumed tokens: 38666240000 | elapsed time per iteration (s): 0.15 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.733722E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.747 | TFLOPs: 26.09 | +7: iteration 73760/ 173500 | consumed samples: 18882560 | consumed tokens: 38671482880 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.709435E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.756 | TFLOPs: 25.26 | +7: iteration 73770/ 173500 | consumed samples: 18885120 | consumed tokens: 38676725760 | elapsed time per iteration (s): 0.16 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.717235E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.462 | TFLOPs: 25.55 | +7: iteration 73780/ 173500 | consumed samples: 18887680 | consumed tokens: 38681968640 | elapsed time per iteration (s): 0.15 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.711853E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.240 | TFLOPs: 26.13 | +7: iteration 73790/ 173500 | consumed samples: 18890240 | consumed tokens: 38687211520 | elapsed time per iteration (s): 0.15 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.707371E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.749 | TFLOPs: 25.98 | +7: iteration 73800/ 173500 | consumed samples: 18892800 | consumed tokens: 38692454400 | elapsed time per iteration (s): 0.16 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.718955E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.332 | TFLOPs: 25.41 | +7: iteration 73810/ 173500 | consumed samples: 18895360 | consumed tokens: 38697697280 | elapsed time per iteration (s): 0.15 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.721804E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.142 | TFLOPs: 26.00 | +7: iteration 73820/ 173500 | consumed samples: 18897920 | consumed tokens: 38702940160 | elapsed time per iteration (s): 0.16 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.728733E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.124 | TFLOPs: 25.85 | +7: iteration 73830/ 173500 | consumed samples: 18900480 | consumed tokens: 38708183040 | elapsed time per iteration (s): 0.16 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.730437E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.827 | TFLOPs: 25.58 | +7: iteration 73840/ 173500 | consumed samples: 18903040 | consumed tokens: 38713425920 | elapsed time per iteration (s): 0.16 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.713210E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.638 | TFLOPs: 25.78 | +7: iteration 73850/ 173500 | consumed samples: 18905600 | consumed tokens: 38718668800 | elapsed time per iteration (s): 0.16 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.726534E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.514 | TFLOPs: 25.23 | +7: iteration 73860/ 173500 | consumed samples: 18908160 | consumed tokens: 38723911680 | elapsed time per iteration (s): 0.15 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.731071E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.904 | TFLOPs: 25.94 | +7: iteration 73870/ 173500 | consumed samples: 18910720 | consumed tokens: 38729154560 | elapsed time per iteration (s): 0.15 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.708561E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.338 | TFLOPs: 25.99 | +7: iteration 73880/ 173500 | consumed samples: 18913280 | consumed tokens: 38734397440 | elapsed time per iteration (s): 0.16 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.716912E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.921 | TFLOPs: 25.80 | +7: iteration 73890/ 173500 | consumed samples: 18915840 | consumed tokens: 38739640320 | elapsed time per iteration (s): 0.16 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.726286E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.069 | TFLOPs: 25.52 | +7: iteration 73900/ 173500 | consumed samples: 18918400 | consumed tokens: 38744883200 | elapsed time per iteration (s): 0.15 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.713646E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.281 | TFLOPs: 26.08 | +7: iteration 73910/ 173500 | consumed samples: 18920960 | consumed tokens: 38750126080 | elapsed time per iteration (s): 0.15 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.717269E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.451 | TFLOPs: 26.02 | +7: iteration 73920/ 173500 | consumed samples: 18923520 | consumed tokens: 38755368960 | elapsed time per iteration (s): 0.16 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.712881E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.554 | TFLOPs: 25.82 | +7: iteration 73930/ 173500 | consumed samples: 18926080 | consumed tokens: 38760611840 | elapsed time per iteration (s): 0.16 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.718719E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.931 | TFLOPs: 25.06 | +7: iteration 73940/ 173500 | consumed samples: 18928640 | consumed tokens: 38765854720 | elapsed time per iteration (s): 0.15 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.714537E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.067 | TFLOPs: 25.92 | +7: iteration 73950/ 173500 | consumed samples: 18931200 | consumed tokens: 38771097600 | elapsed time per iteration (s): 0.16 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.720833E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.464 | TFLOPs: 24.97 | +7: iteration 73960/ 173500 | consumed samples: 18933760 | consumed tokens: 38776340480 | elapsed time per iteration (s): 0.15 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.726779E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.490 | TFLOPs: 26.07 | +7: iteration 73970/ 173500 | consumed samples: 18936320 | consumed tokens: 38781583360 | elapsed time per iteration (s): 0.15 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.713075E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.241 | TFLOPs: 25.96 | +7: iteration 73980/ 173500 | consumed samples: 18938880 | consumed tokens: 38786826240 | elapsed time per iteration (s): 0.17 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.728067E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.871 | TFLOPs: 24.16 | +7: iteration 73990/ 173500 | consumed samples: 18941440 | consumed tokens: 38792069120 | elapsed time per iteration (s): 0.15 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.722246E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.894 | TFLOPs: 26.14 | +0: [2023-03-17 03:27:25,321] [INFO] [logging.py:68:log_dist] [Rank 0] step=74000, skipped=0, lr=[0.0001321851851828754, 0.0001321851851828754, 0.0001321851851828754], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 74000/ 173500 | consumed samples: 18944000 | consumed tokens: 38797312000 | elapsed time per iteration (s): 0.16 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.712838E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.074 | TFLOPs: 25.70 | +0: steps: 74000 loss: 3.7258 iter time (s): 0.155 samples/sec: 1651.064 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 74000 | lm loss value: 3.873047E+00 | lm loss PPL: 4.808871E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 74000 to checkpoints_44m91b100m +0: [2023-03-17 03:27:25,394] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step74000 is begin to save! +0: [2023-03-17 03:27:25,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:27:25,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:27:25,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:27:25,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:27:25,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:27:25,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:27:25,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:27:25,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:27:25,489] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:27:25,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:27:25,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:27:25,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:27:25,505] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:27:25,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:27:25,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:27:25,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:27:25,521] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:27:25,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:27:25,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:27:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:27:25,531] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step74000/mp_rank_00_model_states.pt +0: [2023-03-17 03:27:25,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:27:25,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:27:25,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:27:25,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +5: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +3: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +4: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +1: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +2: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +7: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +6: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:27:25,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step74000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:27:25,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step74000 is ready now! +0: successfully saved checkpoint at iteration 74000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.70 +7: iteration 74010/ 173500 | consumed samples: 18946560 | consumed tokens: 38802554880 | elapsed time per iteration (s): 0.18 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.711814E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.284 | TFLOPs: 22.52 | +7: iteration 74020/ 173500 | consumed samples: 18949120 | consumed tokens: 38807797760 | elapsed time per iteration (s): 0.16 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.715120E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.892 | TFLOPs: 25.70 | +7: iteration 74030/ 173500 | consumed samples: 18951680 | consumed tokens: 38813040640 | elapsed time per iteration (s): 0.16 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.716047E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.914 | TFLOPs: 25.40 | +7: iteration 74040/ 173500 | consumed samples: 18954240 | consumed tokens: 38818283520 | elapsed time per iteration (s): 0.15 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.740204E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.557 | TFLOPs: 26.09 | +7: iteration 74050/ 173500 | consumed samples: 18956800 | consumed tokens: 38823526400 | elapsed time per iteration (s): 0.16 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.723888E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.514 | TFLOPs: 25.52 | +7: iteration 74060/ 173500 | consumed samples: 18959360 | consumed tokens: 38828769280 | elapsed time per iteration (s): 0.15 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.730534E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.360 | TFLOPs: 26.13 | +7: iteration 74070/ 173500 | consumed samples: 18961920 | consumed tokens: 38834012160 | elapsed time per iteration (s): 0.15 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.719701E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.623 | TFLOPs: 26.14 | +7: iteration 74080/ 173500 | consumed samples: 18964480 | consumed tokens: 38839255040 | elapsed time per iteration (s): 0.15 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.712506E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.764 | TFLOPs: 26.14 | +7: iteration 74090/ 173500 | consumed samples: 18967040 | consumed tokens: 38844497920 | elapsed time per iteration (s): 0.16 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.717113E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.699 | TFLOPs: 25.82 | +7: iteration 74100/ 173500 | consumed samples: 18969600 | consumed tokens: 38849740800 | elapsed time per iteration (s): 0.16 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.723968E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.240 | TFLOPs: 25.80 | +7: iteration 74110/ 173500 | consumed samples: 18972160 | consumed tokens: 38854983680 | elapsed time per iteration (s): 0.15 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.724924E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.425 | TFLOPs: 26.26 | +7: iteration 74120/ 173500 | consumed samples: 18974720 | consumed tokens: 38860226560 | elapsed time per iteration (s): 0.15 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.725723E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.671 | TFLOPs: 26.25 | +7: iteration 74130/ 173500 | consumed samples: 18977280 | consumed tokens: 38865469440 | elapsed time per iteration (s): 0.15 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.712753E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.294 | TFLOPs: 26.27 | +7: iteration 74140/ 173500 | consumed samples: 18979840 | consumed tokens: 38870712320 | elapsed time per iteration (s): 0.15 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.722044E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.103 | TFLOPs: 26.07 | +7: iteration 74150/ 173500 | consumed samples: 18982400 | consumed tokens: 38875955200 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.708839E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.335 | TFLOPs: 26.26 | +7: iteration 74160/ 173500 | consumed samples: 18984960 | consumed tokens: 38881198080 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.739242E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.433 | TFLOPs: 26.26 | +7: iteration 74170/ 173500 | consumed samples: 18987520 | consumed tokens: 38886440960 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.728280E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.277 | TFLOPs: 26.29 | +7: iteration 74180/ 173500 | consumed samples: 18990080 | consumed tokens: 38891683840 | elapsed time per iteration (s): 0.16 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.711090E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.347 | TFLOPs: 25.87 | +7: iteration 74190/ 173500 | consumed samples: 18992640 | consumed tokens: 38896926720 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.705912E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.632 | TFLOPs: 26.07 | +7: iteration 74200/ 173500 | consumed samples: 18995200 | consumed tokens: 38902169600 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.697824E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.982 | TFLOPs: 26.05 | +7: iteration 74210/ 173500 | consumed samples: 18997760 | consumed tokens: 38907412480 | elapsed time per iteration (s): 0.15 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.708277E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.875 | TFLOPs: 26.28 | +7: iteration 74220/ 173500 | consumed samples: 19000320 | consumed tokens: 38912655360 | elapsed time per iteration (s): 0.15 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.720766E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.904 | TFLOPs: 26.28 | +7: iteration 74230/ 173500 | consumed samples: 19002880 | consumed tokens: 38917898240 | elapsed time per iteration (s): 0.16 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.724697E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.740 | TFLOPs: 25.34 | +7: iteration 74240/ 173500 | consumed samples: 19005440 | consumed tokens: 38923141120 | elapsed time per iteration (s): 0.15 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.718456E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.292 | TFLOPs: 26.12 | +7: iteration 74250/ 173500 | consumed samples: 19008000 | consumed tokens: 38928384000 | elapsed time per iteration (s): 0.15 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.723563E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.767 | TFLOPs: 26.12 | +7: iteration 74260/ 173500 | consumed samples: 19010560 | consumed tokens: 38933626880 | elapsed time per iteration (s): 0.15 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.717645E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.568 | TFLOPs: 26.29 | +7: iteration 74270/ 173500 | consumed samples: 19013120 | consumed tokens: 38938869760 | elapsed time per iteration (s): 0.16 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.718776E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.928 | TFLOPs: 25.75 | +7: iteration 74280/ 173500 | consumed samples: 19015680 | consumed tokens: 38944112640 | elapsed time per iteration (s): 0.15 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.713882E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.511 | TFLOPs: 26.28 | +7: iteration 74290/ 173500 | consumed samples: 19018240 | consumed tokens: 38949355520 | elapsed time per iteration (s): 0.15 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.708237E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.689 | TFLOPs: 26.04 | +7: iteration 74300/ 173500 | consumed samples: 19020800 | consumed tokens: 38954598400 | elapsed time per iteration (s): 0.16 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.717738E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.350 | TFLOPs: 25.71 | +7: iteration 74310/ 173500 | consumed samples: 19023360 | consumed tokens: 38959841280 | elapsed time per iteration (s): 0.15 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.718038E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.667 | TFLOPs: 25.97 | +7: iteration 74320/ 173500 | consumed samples: 19025920 | consumed tokens: 38965084160 | elapsed time per iteration (s): 0.16 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.706010E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.497 | TFLOPs: 25.74 | +7: iteration 74330/ 173500 | consumed samples: 19028480 | consumed tokens: 38970327040 | elapsed time per iteration (s): 0.16 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.725035E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.302 | TFLOPs: 25.11 | +7: iteration 74340/ 173500 | consumed samples: 19031040 | consumed tokens: 38975569920 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.722070E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.449 | TFLOPs: 25.15 | +7: iteration 74350/ 173500 | consumed samples: 19033600 | consumed tokens: 38980812800 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.715059E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.985 | TFLOPs: 24.54 | +7: iteration 74360/ 173500 | consumed samples: 19036160 | consumed tokens: 38986055680 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.714212E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.494 | TFLOPs: 25.44 | +7: iteration 74370/ 173500 | consumed samples: 19038720 | consumed tokens: 38991298560 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.719839E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.506 | TFLOPs: 24.77 | +7: iteration 74380/ 173500 | consumed samples: 19041280 | consumed tokens: 38996541440 | elapsed time per iteration (s): 0.15 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.721671E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.623 | TFLOPs: 26.04 | +7: iteration 74390/ 173500 | consumed samples: 19043840 | consumed tokens: 39001784320 | elapsed time per iteration (s): 0.16 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.712174E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.481 | TFLOPs: 24.39 | +7: iteration 74400/ 173500 | consumed samples: 19046400 | consumed tokens: 39007027200 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.718616E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.382 | TFLOPs: 24.72 | +7: iteration 74410/ 173500 | consumed samples: 19048960 | consumed tokens: 39012270080 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.717733E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.024 | TFLOPs: 25.00 | +7: iteration 74420/ 173500 | consumed samples: 19051520 | consumed tokens: 39017512960 | elapsed time per iteration (s): 0.15 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.714263E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.315 | TFLOPs: 25.93 | +7: iteration 74430/ 173500 | consumed samples: 19054080 | consumed tokens: 39022755840 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.714828E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.670 | TFLOPs: 24.55 | +7: iteration 74440/ 173500 | consumed samples: 19056640 | consumed tokens: 39027998720 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.721947E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.882 | TFLOPs: 25.26 | +7: iteration 74450/ 173500 | consumed samples: 19059200 | consumed tokens: 39033241600 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.714739E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.177 | TFLOPs: 25.02 | +7: iteration 74460/ 173500 | consumed samples: 19061760 | consumed tokens: 39038484480 | elapsed time per iteration (s): 0.16 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.709798E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.971 | TFLOPs: 25.86 | +7: iteration 74470/ 173500 | consumed samples: 19064320 | consumed tokens: 39043727360 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.725763E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.998 | TFLOPs: 25.61 | +7: iteration 74480/ 173500 | consumed samples: 19066880 | consumed tokens: 39048970240 | elapsed time per iteration (s): 0.15 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.708047E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.818 | TFLOPs: 26.25 | +7: iteration 74490/ 173500 | consumed samples: 19069440 | consumed tokens: 39054213120 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.725488E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.955 | TFLOPs: 25.09 | +7: iteration 74500/ 173500 | consumed samples: 19072000 | consumed tokens: 39059456000 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.713132E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.264 | TFLOPs: 25.30 | +7: iteration 74510/ 173500 | consumed samples: 19074560 | consumed tokens: 39064698880 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.721667E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.114 | TFLOPs: 24.53 | +7: iteration 74520/ 173500 | consumed samples: 19077120 | consumed tokens: 39069941760 | elapsed time per iteration (s): 0.16 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.708176E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.944 | TFLOPs: 24.40 | +7: iteration 74530/ 173500 | consumed samples: 19079680 | consumed tokens: 39075184640 | elapsed time per iteration (s): 0.16 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.716445E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.824 | TFLOPs: 24.81 | +7: iteration 74540/ 173500 | consumed samples: 19082240 | consumed tokens: 39080427520 | elapsed time per iteration (s): 0.16 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.724097E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.951 | TFLOPs: 24.51 | +7: iteration 74550/ 173500 | consumed samples: 19084800 | consumed tokens: 39085670400 | elapsed time per iteration (s): 0.16 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.719708E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.673 | TFLOPs: 25.20 | +7: iteration 74560/ 173500 | consumed samples: 19087360 | consumed tokens: 39090913280 | elapsed time per iteration (s): 0.17 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.728561E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.358 | TFLOPs: 23.00 | +7: iteration 74570/ 173500 | consumed samples: 19089920 | consumed tokens: 39096156160 | elapsed time per iteration (s): 0.17 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.699694E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.276 | TFLOPs: 24.11 | +7: iteration 74580/ 173500 | consumed samples: 19092480 | consumed tokens: 39101399040 | elapsed time per iteration (s): 0.17 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.712606E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.153 | TFLOPs: 23.24 | +7: iteration 74590/ 173500 | consumed samples: 19095040 | consumed tokens: 39106641920 | elapsed time per iteration (s): 0.17 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.714732E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.229 | TFLOPs: 23.54 | +7: iteration 74600/ 173500 | consumed samples: 19097600 | consumed tokens: 39111884800 | elapsed time per iteration (s): 0.16 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.715826E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.959 | TFLOPs: 24.54 | +7: iteration 74610/ 173500 | consumed samples: 19100160 | consumed tokens: 39117127680 | elapsed time per iteration (s): 0.17 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.714025E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.304 | TFLOPs: 24.20 | +7: iteration 74620/ 173500 | consumed samples: 19102720 | consumed tokens: 39122370560 | elapsed time per iteration (s): 0.16 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.717841E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.125 | TFLOPs: 25.36 | +7: iteration 74630/ 173500 | consumed samples: 19105280 | consumed tokens: 39127613440 | elapsed time per iteration (s): 0.16 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.718200E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.795 | TFLOPs: 24.43 | +7: iteration 74640/ 173500 | consumed samples: 19107840 | consumed tokens: 39132856320 | elapsed time per iteration (s): 0.17 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.719556E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.861 | TFLOPs: 24.23 | +7: iteration 74650/ 173500 | consumed samples: 19110400 | consumed tokens: 39138099200 | elapsed time per iteration (s): 0.16 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.729511E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.513 | TFLOPs: 25.32 | +7: iteration 74660/ 173500 | consumed samples: 19112960 | consumed tokens: 39143342080 | elapsed time per iteration (s): 0.16 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.719600E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.369 | TFLOPs: 24.78 | +7: iteration 74670/ 173500 | consumed samples: 19115520 | consumed tokens: 39148584960 | elapsed time per iteration (s): 0.17 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.703896E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.854 | TFLOPs: 23.49 | +7: iteration 74680/ 173500 | consumed samples: 19118080 | consumed tokens: 39153827840 | elapsed time per iteration (s): 0.15 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.729232E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.959 | TFLOPs: 25.95 | +7: iteration 74690/ 173500 | consumed samples: 19120640 | consumed tokens: 39159070720 | elapsed time per iteration (s): 0.16 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.725532E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.988 | TFLOPs: 24.72 | +7: iteration 74700/ 173500 | consumed samples: 19123200 | consumed tokens: 39164313600 | elapsed time per iteration (s): 0.17 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.729624E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.081 | TFLOPs: 24.11 | +7: iteration 74710/ 173500 | consumed samples: 19125760 | consumed tokens: 39169556480 | elapsed time per iteration (s): 0.16 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.728122E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.346 | TFLOPs: 25.80 | +7: iteration 74720/ 173500 | consumed samples: 19128320 | consumed tokens: 39174799360 | elapsed time per iteration (s): 0.16 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.716358E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.631 | TFLOPs: 24.52 | +7: iteration 74730/ 173500 | consumed samples: 19130880 | consumed tokens: 39180042240 | elapsed time per iteration (s): 0.17 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.712928E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.989 | TFLOPs: 24.18 | +7: iteration 74740/ 173500 | consumed samples: 19133440 | consumed tokens: 39185285120 | elapsed time per iteration (s): 0.17 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.727474E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.674 | TFLOPs: 23.85 | +7: iteration 74750/ 173500 | consumed samples: 19136000 | consumed tokens: 39190528000 | elapsed time per iteration (s): 0.16 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.715665E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.861 | TFLOPs: 24.57 | +7: iteration 74760/ 173500 | consumed samples: 19138560 | consumed tokens: 39195770880 | elapsed time per iteration (s): 0.16 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.712492E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.832 | TFLOPs: 24.43 | +7: iteration 74770/ 173500 | consumed samples: 19141120 | consumed tokens: 39201013760 | elapsed time per iteration (s): 0.17 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.717255E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.150 | TFLOPs: 24.18 | +7: iteration 74780/ 173500 | consumed samples: 19143680 | consumed tokens: 39206256640 | elapsed time per iteration (s): 0.18 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.708153E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.178 | TFLOPs: 22.63 | +7: iteration 74790/ 173500 | consumed samples: 19146240 | consumed tokens: 39211499520 | elapsed time per iteration (s): 0.16 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.722604E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.690 | TFLOPs: 25.32 | +7: iteration 74800/ 173500 | consumed samples: 19148800 | consumed tokens: 39216742400 | elapsed time per iteration (s): 0.17 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.712251E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.680 | TFLOPs: 23.82 | +7: iteration 74810/ 173500 | consumed samples: 19151360 | consumed tokens: 39221985280 | elapsed time per iteration (s): 0.16 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.725515E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.205 | TFLOPs: 25.31 | +7: iteration 74820/ 173500 | consumed samples: 19153920 | consumed tokens: 39227228160 | elapsed time per iteration (s): 0.17 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.697752E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.338 | TFLOPs: 23.91 | +7: iteration 74830/ 173500 | consumed samples: 19156480 | consumed tokens: 39232471040 | elapsed time per iteration (s): 0.17 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.733619E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.248 | TFLOPs: 23.46 | +7: iteration 74840/ 173500 | consumed samples: 19159040 | consumed tokens: 39237713920 | elapsed time per iteration (s): 0.16 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.724889E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.409 | TFLOPs: 24.42 | +7: iteration 74850/ 173500 | consumed samples: 19161600 | consumed tokens: 39242956800 | elapsed time per iteration (s): 0.17 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.728873E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.799 | TFLOPs: 24.27 | +7: iteration 74860/ 173500 | consumed samples: 19164160 | consumed tokens: 39248199680 | elapsed time per iteration (s): 0.17 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.724501E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.204 | TFLOPs: 24.09 | +7: iteration 74870/ 173500 | consumed samples: 19166720 | consumed tokens: 39253442560 | elapsed time per iteration (s): 0.17 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.700198E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1471.971 | TFLOPs: 23.08 | +7: iteration 74880/ 173500 | consumed samples: 19169280 | consumed tokens: 39258685440 | elapsed time per iteration (s): 0.17 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.722867E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.781 | TFLOPs: 23.18 | +7: iteration 74890/ 173500 | consumed samples: 19171840 | consumed tokens: 39263928320 | elapsed time per iteration (s): 0.18 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.721437E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.913 | TFLOPs: 22.88 | +7: iteration 74900/ 173500 | consumed samples: 19174400 | consumed tokens: 39269171200 | elapsed time per iteration (s): 0.16 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.710750E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.827 | TFLOPs: 24.92 | +7: iteration 74910/ 173500 | consumed samples: 19176960 | consumed tokens: 39274414080 | elapsed time per iteration (s): 0.17 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.699859E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.525 | TFLOPs: 23.63 | +7: iteration 74920/ 173500 | consumed samples: 19179520 | consumed tokens: 39279656960 | elapsed time per iteration (s): 0.18 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.727924E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1460.350 | TFLOPs: 22.90 | +7: iteration 74930/ 173500 | consumed samples: 19182080 | consumed tokens: 39284899840 | elapsed time per iteration (s): 0.16 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.726404E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.963 | TFLOPs: 24.34 | +7: iteration 74940/ 173500 | consumed samples: 19184640 | consumed tokens: 39290142720 | elapsed time per iteration (s): 0.17 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.713365E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.994 | TFLOPs: 23.38 | +7: iteration 74950/ 173500 | consumed samples: 19187200 | consumed tokens: 39295385600 | elapsed time per iteration (s): 0.17 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.708121E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1483.723 | TFLOPs: 23.27 | +7: iteration 74960/ 173500 | consumed samples: 19189760 | consumed tokens: 39300628480 | elapsed time per iteration (s): 0.16 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.714529E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.215 | TFLOPs: 25.00 | +7: iteration 74970/ 173500 | consumed samples: 19192320 | consumed tokens: 39305871360 | elapsed time per iteration (s): 0.17 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.708154E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.663 | TFLOPs: 23.10 | +7: iteration 74980/ 173500 | consumed samples: 19194880 | consumed tokens: 39311114240 | elapsed time per iteration (s): 0.16 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.719279E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.168 | TFLOPs: 25.24 | +7: iteration 74990/ 173500 | consumed samples: 19197440 | consumed tokens: 39316357120 | elapsed time per iteration (s): 0.16 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.723891E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.646 | TFLOPs: 24.98 | +7: iteration 75000/ 173500 | consumed samples: 19200000 | consumed tokens: 39321600000 | elapsed time per iteration (s): 0.16 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.720084E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.022 | TFLOPs: 24.64 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 75000 | lm loss value: 3.845731E+00 | lm loss PPL: 4.679286E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 75000 to checkpoints_44m91b100m +0: [2023-03-17 03:30:06,593] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step75000 is begin to save! +0: [2023-03-17 03:30:06,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:30:06,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:30:06,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:30:06,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:30:06,678] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:30:06,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:30:06,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:30:06,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:30:06,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:30:06,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:30:06,703] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:30:06,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:30:06,711] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:30:06,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:30:06,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:30:06,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:30:06,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:30:06,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:30:06,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:30:06,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:30:06,737] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step75000/mp_rank_00_model_states.pt +0: [2023-03-17 03:30:06,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:30:06,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:30:06,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:30:06,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:30:06,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:30:06,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:30:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +1: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +2: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +4: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +7: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +6: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:30:06,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:30:06,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +3: [2023-03-17 03:30:06,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:30:06,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step75000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:30:06,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step75000 is ready now! +0: successfully saved checkpoint at iteration 75000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 190.13 +7: iteration 75010/ 173500 | consumed samples: 19202560 | consumed tokens: 39326842880 | elapsed time per iteration (s): 0.18 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.717577E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.942 | TFLOPs: 21.70 | +7: iteration 75020/ 173500 | consumed samples: 19205120 | consumed tokens: 39332085760 | elapsed time per iteration (s): 0.18 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.723995E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.568 | TFLOPs: 22.23 | +7: iteration 75030/ 173500 | consumed samples: 19207680 | consumed tokens: 39337328640 | elapsed time per iteration (s): 0.17 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.708658E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.700 | TFLOPs: 23.25 | +7: iteration 75040/ 173500 | consumed samples: 19210240 | consumed tokens: 39342571520 | elapsed time per iteration (s): 0.16 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.708072E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.977 | TFLOPs: 25.33 | +7: iteration 75050/ 173500 | consumed samples: 19212800 | consumed tokens: 39347814400 | elapsed time per iteration (s): 0.16 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.720486E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.154 | TFLOPs: 25.46 | +7: iteration 75060/ 173500 | consumed samples: 19215360 | consumed tokens: 39353057280 | elapsed time per iteration (s): 0.17 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.723964E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.490 | TFLOPs: 23.91 | +7: iteration 75070/ 173500 | consumed samples: 19217920 | consumed tokens: 39358300160 | elapsed time per iteration (s): 0.17 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.714894E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.464 | TFLOPs: 23.69 | +7: iteration 75080/ 173500 | consumed samples: 19220480 | consumed tokens: 39363543040 | elapsed time per iteration (s): 0.18 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.704262E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.255 | TFLOPs: 22.77 | +7: iteration 75090/ 173500 | consumed samples: 19223040 | consumed tokens: 39368785920 | elapsed time per iteration (s): 0.17 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.719446E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1463.526 | TFLOPs: 22.95 | +7: iteration 75100/ 173500 | consumed samples: 19225600 | consumed tokens: 39374028800 | elapsed time per iteration (s): 0.17 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.717940E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.346 | TFLOPs: 24.02 | +7: iteration 75110/ 173500 | consumed samples: 19228160 | consumed tokens: 39379271680 | elapsed time per iteration (s): 0.16 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.704905E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.839 | TFLOPs: 24.52 | +7: iteration 75120/ 173500 | consumed samples: 19230720 | consumed tokens: 39384514560 | elapsed time per iteration (s): 0.17 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.711982E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.112 | TFLOPs: 24.22 | +7: iteration 75130/ 173500 | consumed samples: 19233280 | consumed tokens: 39389757440 | elapsed time per iteration (s): 0.17 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.724506E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.803 | TFLOPs: 23.13 | +7: iteration 75140/ 173500 | consumed samples: 19235840 | consumed tokens: 39395000320 | elapsed time per iteration (s): 0.16 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.715044E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.362 | TFLOPs: 24.88 | +7: iteration 75150/ 173500 | consumed samples: 19238400 | consumed tokens: 39400243200 | elapsed time per iteration (s): 0.17 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.704608E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.396 | TFLOPs: 23.61 | +7: iteration 75160/ 173500 | consumed samples: 19240960 | consumed tokens: 39405486080 | elapsed time per iteration (s): 0.16 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.724822E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.554 | TFLOPs: 24.69 | +7: iteration 75170/ 173500 | consumed samples: 19243520 | consumed tokens: 39410728960 | elapsed time per iteration (s): 0.16 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.717204E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.634 | TFLOPs: 24.80 | +7: iteration 75180/ 173500 | consumed samples: 19246080 | consumed tokens: 39415971840 | elapsed time per iteration (s): 0.16 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.707715E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.854 | TFLOPs: 24.37 | +7: iteration 75190/ 173500 | consumed samples: 19248640 | consumed tokens: 39421214720 | elapsed time per iteration (s): 0.17 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.710442E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.568 | TFLOPs: 23.71 | +7: iteration 75200/ 173500 | consumed samples: 19251200 | consumed tokens: 39426457600 | elapsed time per iteration (s): 0.17 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.707227E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.191 | TFLOPs: 23.35 | +7: iteration 75210/ 173500 | consumed samples: 19253760 | consumed tokens: 39431700480 | elapsed time per iteration (s): 0.16 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.724070E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.371 | TFLOPs: 24.47 | +7: iteration 75220/ 173500 | consumed samples: 19256320 | consumed tokens: 39436943360 | elapsed time per iteration (s): 0.16 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.734131E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.232 | TFLOPs: 24.58 | +7: iteration 75230/ 173500 | consumed samples: 19258880 | consumed tokens: 39442186240 | elapsed time per iteration (s): 0.17 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.714666E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.758 | TFLOPs: 23.71 | +7: iteration 75240/ 173500 | consumed samples: 19261440 | consumed tokens: 39447429120 | elapsed time per iteration (s): 0.16 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.724091E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.863 | TFLOPs: 25.12 | +7: iteration 75250/ 173500 | consumed samples: 19264000 | consumed tokens: 39452672000 | elapsed time per iteration (s): 0.16 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.725804E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.644 | TFLOPs: 25.12 | +7: iteration 75260/ 173500 | consumed samples: 19266560 | consumed tokens: 39457914880 | elapsed time per iteration (s): 0.17 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.709690E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.727 | TFLOPs: 23.77 | +7: iteration 75270/ 173500 | consumed samples: 19269120 | consumed tokens: 39463157760 | elapsed time per iteration (s): 0.17 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.704036E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.508 | TFLOPs: 24.19 | +7: iteration 75280/ 173500 | consumed samples: 19271680 | consumed tokens: 39468400640 | elapsed time per iteration (s): 0.17 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.720886E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.870 | TFLOPs: 24.29 | +7: iteration 75290/ 173500 | consumed samples: 19274240 | consumed tokens: 39473643520 | elapsed time per iteration (s): 0.16 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.701408E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.068 | TFLOPs: 24.47 | +7: iteration 75300/ 173500 | consumed samples: 19276800 | consumed tokens: 39478886400 | elapsed time per iteration (s): 0.16 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.703122E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.878 | TFLOPs: 25.11 | +7: iteration 75310/ 173500 | consumed samples: 19279360 | consumed tokens: 39484129280 | elapsed time per iteration (s): 0.17 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.723120E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.269 | TFLOPs: 23.89 | +7: iteration 75320/ 173500 | consumed samples: 19281920 | consumed tokens: 39489372160 | elapsed time per iteration (s): 0.16 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.715398E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.418 | TFLOPs: 25.51 | +7: iteration 75330/ 173500 | consumed samples: 19284480 | consumed tokens: 39494615040 | elapsed time per iteration (s): 0.17 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.712162E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.578 | TFLOPs: 24.14 | +7: iteration 75340/ 173500 | consumed samples: 19287040 | consumed tokens: 39499857920 | elapsed time per iteration (s): 0.16 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.725553E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.084 | TFLOPs: 24.48 | +7: iteration 75350/ 173500 | consumed samples: 19289600 | consumed tokens: 39505100800 | elapsed time per iteration (s): 0.17 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.720523E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.867 | TFLOPs: 24.13 | +7: iteration 75360/ 173500 | consumed samples: 19292160 | consumed tokens: 39510343680 | elapsed time per iteration (s): 0.16 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.709779E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.380 | TFLOPs: 24.71 | +7: iteration 75370/ 173500 | consumed samples: 19294720 | consumed tokens: 39515586560 | elapsed time per iteration (s): 0.18 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.720468E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.634 | TFLOPs: 22.88 | +7: iteration 75380/ 173500 | consumed samples: 19297280 | consumed tokens: 39520829440 | elapsed time per iteration (s): 0.16 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.707368E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.567 | TFLOPs: 25.48 | +7: iteration 75390/ 173500 | consumed samples: 19299840 | consumed tokens: 39526072320 | elapsed time per iteration (s): 0.16 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.718003E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.835 | TFLOPs: 25.14 | +7: iteration 75400/ 173500 | consumed samples: 19302400 | consumed tokens: 39531315200 | elapsed time per iteration (s): 0.16 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.702693E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.857 | TFLOPs: 25.70 | +7: iteration 75410/ 173500 | consumed samples: 19304960 | consumed tokens: 39536558080 | elapsed time per iteration (s): 0.17 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.711665E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.656 | TFLOPs: 24.10 | +7: iteration 75420/ 173500 | consumed samples: 19307520 | consumed tokens: 39541800960 | elapsed time per iteration (s): 0.16 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.715590E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.390 | TFLOPs: 24.91 | +7: iteration 75430/ 173500 | consumed samples: 19310080 | consumed tokens: 39547043840 | elapsed time per iteration (s): 0.16 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.720999E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.439 | TFLOPs: 24.55 | +7: iteration 75440/ 173500 | consumed samples: 19312640 | consumed tokens: 39552286720 | elapsed time per iteration (s): 0.16 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.718350E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.563 | TFLOPs: 24.52 | +7: iteration 75450/ 173500 | consumed samples: 19315200 | consumed tokens: 39557529600 | elapsed time per iteration (s): 0.16 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.723409E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.670 | TFLOPs: 25.32 | +7: iteration 75460/ 173500 | consumed samples: 19317760 | consumed tokens: 39562772480 | elapsed time per iteration (s): 0.17 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.720329E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.116 | TFLOPs: 23.62 | +7: iteration 75470/ 173500 | consumed samples: 19320320 | consumed tokens: 39568015360 | elapsed time per iteration (s): 0.16 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.704502E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.829 | TFLOPs: 24.84 | +7: iteration 75480/ 173500 | consumed samples: 19322880 | consumed tokens: 39573258240 | elapsed time per iteration (s): 0.16 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.717162E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.531 | TFLOPs: 25.07 | +7: iteration 75490/ 173500 | consumed samples: 19325440 | consumed tokens: 39578501120 | elapsed time per iteration (s): 0.17 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.716818E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.892 | TFLOPs: 24.21 | +7: iteration 75500/ 173500 | consumed samples: 19328000 | consumed tokens: 39583744000 | elapsed time per iteration (s): 0.17 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.721180E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.229 | TFLOPs: 24.33 | +7: iteration 75510/ 173500 | consumed samples: 19330560 | consumed tokens: 39588986880 | elapsed time per iteration (s): 0.16 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.712333E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.053 | TFLOPs: 24.54 | +7: iteration 75520/ 173500 | consumed samples: 19333120 | consumed tokens: 39594229760 | elapsed time per iteration (s): 0.17 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.714815E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.904 | TFLOPs: 23.35 | +7: iteration 75530/ 173500 | consumed samples: 19335680 | consumed tokens: 39599472640 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.715422E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.511 | TFLOPs: 25.01 | +7: iteration 75540/ 173500 | consumed samples: 19338240 | consumed tokens: 39604715520 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.716844E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.345 | TFLOPs: 25.32 | +7: iteration 75550/ 173500 | consumed samples: 19340800 | consumed tokens: 39609958400 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.716493E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.940 | TFLOPs: 25.11 | +7: iteration 75560/ 173500 | consumed samples: 19343360 | consumed tokens: 39615201280 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.717578E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.170 | TFLOPs: 24.44 | +7: iteration 75570/ 173500 | consumed samples: 19345920 | consumed tokens: 39620444160 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.709430E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.552 | TFLOPs: 24.57 | +7: iteration 75580/ 173500 | consumed samples: 19348480 | consumed tokens: 39625687040 | elapsed time per iteration (s): 0.16 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.707943E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.258 | TFLOPs: 25.77 | +7: iteration 75590/ 173500 | consumed samples: 19351040 | consumed tokens: 39630929920 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.707848E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.187 | TFLOPs: 24.67 | +7: iteration 75600/ 173500 | consumed samples: 19353600 | consumed tokens: 39636172800 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.724950E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.874 | TFLOPs: 25.12 | +7: iteration 75610/ 173500 | consumed samples: 19356160 | consumed tokens: 39641415680 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.717882E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.541 | TFLOPs: 25.37 | +7: iteration 75620/ 173500 | consumed samples: 19358720 | consumed tokens: 39646658560 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.707515E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.827 | TFLOPs: 25.29 | +7: iteration 75630/ 173500 | consumed samples: 19361280 | consumed tokens: 39651901440 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.706823E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.744 | TFLOPs: 25.26 | +7: iteration 75640/ 173500 | consumed samples: 19363840 | consumed tokens: 39657144320 | elapsed time per iteration (s): 0.16 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.714840E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.065 | TFLOPs: 25.17 | +7: iteration 75650/ 173500 | consumed samples: 19366400 | consumed tokens: 39662387200 | elapsed time per iteration (s): 0.16 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.719115E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.824 | TFLOPs: 25.17 | +7: iteration 75660/ 173500 | consumed samples: 19368960 | consumed tokens: 39667630080 | elapsed time per iteration (s): 0.15 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.707414E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.962 | TFLOPs: 25.92 | +7: iteration 75670/ 173500 | consumed samples: 19371520 | consumed tokens: 39672872960 | elapsed time per iteration (s): 0.16 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.711182E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.709 | TFLOPs: 25.81 | +7: iteration 75680/ 173500 | consumed samples: 19374080 | consumed tokens: 39678115840 | elapsed time per iteration (s): 0.16 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.716210E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.185 | TFLOPs: 24.94 | +7: iteration 75690/ 173500 | consumed samples: 19376640 | consumed tokens: 39683358720 | elapsed time per iteration (s): 0.16 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.713417E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.457 | TFLOPs: 24.42 | +7: iteration 75700/ 173500 | consumed samples: 19379200 | consumed tokens: 39688601600 | elapsed time per iteration (s): 0.16 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.717623E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.611 | TFLOPs: 25.20 | +7: iteration 75710/ 173500 | consumed samples: 19381760 | consumed tokens: 39693844480 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.714155E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.503 | TFLOPs: 25.12 | +7: iteration 75720/ 173500 | consumed samples: 19384320 | consumed tokens: 39699087360 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.713413E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.146 | TFLOPs: 25.64 | +7: iteration 75730/ 173500 | consumed samples: 19386880 | consumed tokens: 39704330240 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.720667E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.755 | TFLOPs: 25.54 | +7: iteration 75740/ 173500 | consumed samples: 19389440 | consumed tokens: 39709573120 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.717835E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.532 | TFLOPs: 25.38 | +7: iteration 75750/ 173500 | consumed samples: 19392000 | consumed tokens: 39714816000 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.721525E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.942 | TFLOPs: 24.57 | +7: iteration 75760/ 173500 | consumed samples: 19394560 | consumed tokens: 39720058880 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.712542E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.020 | TFLOPs: 25.59 | +7: iteration 75770/ 173500 | consumed samples: 19397120 | consumed tokens: 39725301760 | elapsed time per iteration (s): 0.16 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.716883E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.654 | TFLOPs: 25.20 | +7: iteration 75780/ 173500 | consumed samples: 19399680 | consumed tokens: 39730544640 | elapsed time per iteration (s): 0.16 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.717076E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.427 | TFLOPs: 24.80 | +7: iteration 75790/ 173500 | consumed samples: 19402240 | consumed tokens: 39735787520 | elapsed time per iteration (s): 0.16 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.726631E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.365 | TFLOPs: 25.18 | +7: iteration 75800/ 173500 | consumed samples: 19404800 | consumed tokens: 39741030400 | elapsed time per iteration (s): 0.16 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.714593E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.649 | TFLOPs: 25.51 | +7: iteration 75810/ 173500 | consumed samples: 19407360 | consumed tokens: 39746273280 | elapsed time per iteration (s): 0.15 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.718571E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.907 | TFLOPs: 26.19 | +7: iteration 75820/ 173500 | consumed samples: 19409920 | consumed tokens: 39751516160 | elapsed time per iteration (s): 0.17 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.707814E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1470.407 | TFLOPs: 23.06 | +7: iteration 75830/ 173500 | consumed samples: 19412480 | consumed tokens: 39756759040 | elapsed time per iteration (s): 0.16 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.712374E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.733 | TFLOPs: 25.15 | +7: iteration 75840/ 173500 | consumed samples: 19415040 | consumed tokens: 39762001920 | elapsed time per iteration (s): 0.16 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.728403E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.080 | TFLOPs: 25.09 | +7: iteration 75850/ 173500 | consumed samples: 19417600 | consumed tokens: 39767244800 | elapsed time per iteration (s): 0.16 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.728447E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.790 | TFLOPs: 25.89 | +7: iteration 75860/ 173500 | consumed samples: 19420160 | consumed tokens: 39772487680 | elapsed time per iteration (s): 0.16 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.723759E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.889 | TFLOPs: 25.26 | +7: iteration 75870/ 173500 | consumed samples: 19422720 | consumed tokens: 39777730560 | elapsed time per iteration (s): 0.16 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.710026E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.705 | TFLOPs: 25.24 | +7: iteration 75880/ 173500 | consumed samples: 19425280 | consumed tokens: 39782973440 | elapsed time per iteration (s): 0.16 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.720554E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.091 | TFLOPs: 24.89 | +7: iteration 75890/ 173500 | consumed samples: 19427840 | consumed tokens: 39788216320 | elapsed time per iteration (s): 0.15 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.715142E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.806 | TFLOPs: 26.11 | +7: iteration 75900/ 173500 | consumed samples: 19430400 | consumed tokens: 39793459200 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.724269E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.816 | TFLOPs: 25.42 | +7: iteration 75910/ 173500 | consumed samples: 19432960 | consumed tokens: 39798702080 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.722881E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.902 | TFLOPs: 24.49 | +7: iteration 75920/ 173500 | consumed samples: 19435520 | consumed tokens: 39803944960 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.707474E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.200 | TFLOPs: 24.81 | +7: iteration 75930/ 173500 | consumed samples: 19438080 | consumed tokens: 39809187840 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.700212E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.351 | TFLOPs: 24.74 | +7: iteration 75940/ 173500 | consumed samples: 19440640 | consumed tokens: 39814430720 | elapsed time per iteration (s): 0.17 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.722517E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.112 | TFLOPs: 23.78 | +7: iteration 75950/ 173500 | consumed samples: 19443200 | consumed tokens: 39819673600 | elapsed time per iteration (s): 0.16 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.712983E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.451 | TFLOPs: 24.68 | +7: iteration 75960/ 173500 | consumed samples: 19445760 | consumed tokens: 39824916480 | elapsed time per iteration (s): 0.16 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.713938E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.608 | TFLOPs: 25.29 | +7: iteration 75970/ 173500 | consumed samples: 19448320 | consumed tokens: 39830159360 | elapsed time per iteration (s): 0.16 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.717810E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.662 | TFLOPs: 25.15 | +7: iteration 75980/ 173500 | consumed samples: 19450880 | consumed tokens: 39835402240 | elapsed time per iteration (s): 0.16 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.711537E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.723 | TFLOPs: 25.29 | +7: iteration 75990/ 173500 | consumed samples: 19453440 | consumed tokens: 39840645120 | elapsed time per iteration (s): 0.16 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.719780E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.059 | TFLOPs: 25.52 | +0: [2023-03-17 03:32:49,475] [INFO] [logging.py:68:log_dist] [Rank 0] step=76000, skipped=0, lr=[0.0001289804445403464, 0.0001289804445403464, 0.0001289804445403464], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 76000/ 173500 | consumed samples: 19456000 | consumed tokens: 39845888000 | elapsed time per iteration (s): 0.16 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.705700E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.405 | TFLOPs: 24.83 | +0: steps: 76000 loss: 3.7173 iter time (s): 0.160 samples/sec: 1598.375 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 76000 | lm loss value: 3.861235E+00 | lm loss PPL: 4.752403E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 76000 to checkpoints_44m91b100m +0: [2023-03-17 03:32:49,565] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step76000 is begin to save! +0: [2023-03-17 03:32:49,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:32:49,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:32:49,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:32:49,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:32:49,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:32:49,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:32:49,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:32:49,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:32:49,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:32:49,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:32:49,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:32:49,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:32:49,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:32:49,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:32:49,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:32:49,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:32:49,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:32:49,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:32:49,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:32:49,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:32:49,706] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step76000/mp_rank_00_model_states.pt +0: [2023-03-17 03:32:49,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:32:49,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:32:49,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:32:49,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:32:49,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:32:49,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:32:49,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +5: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +3: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +1: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +2: [2023-03-17 03:32:49,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +6: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:32:49,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:32:49,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +7: [2023-03-17 03:32:49,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:32:49,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step76000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:32:49,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step76000 is ready now! +0: successfully saved checkpoint at iteration 76000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.60 +7: iteration 76010/ 173500 | consumed samples: 19458560 | consumed tokens: 39851130880 | elapsed time per iteration (s): 0.18 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.716328E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.247 | TFLOPs: 21.85 | +7: iteration 76020/ 173500 | consumed samples: 19461120 | consumed tokens: 39856373760 | elapsed time per iteration (s): 0.16 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.706757E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.390 | TFLOPs: 24.49 | +7: iteration 76030/ 173500 | consumed samples: 19463680 | consumed tokens: 39861616640 | elapsed time per iteration (s): 0.17 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.707772E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.302 | TFLOPs: 24.16 | +7: iteration 76040/ 173500 | consumed samples: 19466240 | consumed tokens: 39866859520 | elapsed time per iteration (s): 0.16 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.725456E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.644 | TFLOPs: 24.33 | +7: iteration 76050/ 173500 | consumed samples: 19468800 | consumed tokens: 39872102400 | elapsed time per iteration (s): 0.17 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.716103E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.578 | TFLOPs: 24.21 | +7: iteration 76060/ 173500 | consumed samples: 19471360 | consumed tokens: 39877345280 | elapsed time per iteration (s): 0.16 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.727690E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.216 | TFLOPs: 25.30 | +7: iteration 76070/ 173500 | consumed samples: 19473920 | consumed tokens: 39882588160 | elapsed time per iteration (s): 0.16 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.717098E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.570 | TFLOPs: 25.41 | +7: iteration 76080/ 173500 | consumed samples: 19476480 | consumed tokens: 39887831040 | elapsed time per iteration (s): 0.16 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.713332E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.343 | TFLOPs: 24.50 | +7: iteration 76090/ 173500 | consumed samples: 19479040 | consumed tokens: 39893073920 | elapsed time per iteration (s): 0.16 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.722753E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.792 | TFLOPs: 25.54 | +7: iteration 76100/ 173500 | consumed samples: 19481600 | consumed tokens: 39898316800 | elapsed time per iteration (s): 0.16 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.717197E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.099 | TFLOPs: 25.14 | +7: iteration 76110/ 173500 | consumed samples: 19484160 | consumed tokens: 39903559680 | elapsed time per iteration (s): 0.16 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.716759E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.859 | TFLOPs: 25.42 | +7: iteration 76120/ 173500 | consumed samples: 19486720 | consumed tokens: 39908802560 | elapsed time per iteration (s): 0.16 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.727778E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.733 | TFLOPs: 24.88 | +7: iteration 76130/ 173500 | consumed samples: 19489280 | consumed tokens: 39914045440 | elapsed time per iteration (s): 0.15 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.711786E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.548 | TFLOPs: 26.21 | +7: iteration 76140/ 173500 | consumed samples: 19491840 | consumed tokens: 39919288320 | elapsed time per iteration (s): 0.16 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.703774E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.391 | TFLOPs: 24.50 | +7: iteration 76150/ 173500 | consumed samples: 19494400 | consumed tokens: 39924531200 | elapsed time per iteration (s): 0.16 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.719386E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.389 | TFLOPs: 25.69 | +7: iteration 76160/ 173500 | consumed samples: 19496960 | consumed tokens: 39929774080 | elapsed time per iteration (s): 0.15 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.709962E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.597 | TFLOPs: 26.26 | +7: iteration 76170/ 173500 | consumed samples: 19499520 | consumed tokens: 39935016960 | elapsed time per iteration (s): 0.16 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.702874E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.896 | TFLOPs: 25.65 | +7: iteration 76180/ 173500 | consumed samples: 19502080 | consumed tokens: 39940259840 | elapsed time per iteration (s): 0.15 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.710698E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.533 | TFLOPs: 26.25 | +7: iteration 76190/ 173500 | consumed samples: 19504640 | consumed tokens: 39945502720 | elapsed time per iteration (s): 0.16 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.719268E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.290 | TFLOPs: 25.46 | +7: iteration 76200/ 173500 | consumed samples: 19507200 | consumed tokens: 39950745600 | elapsed time per iteration (s): 0.16 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.721376E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.938 | TFLOPs: 24.48 | +7: iteration 76210/ 173500 | consumed samples: 19509760 | consumed tokens: 39955988480 | elapsed time per iteration (s): 0.16 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.730230E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.311 | TFLOPs: 25.35 | +7: iteration 76220/ 173500 | consumed samples: 19512320 | consumed tokens: 39961231360 | elapsed time per iteration (s): 0.16 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.720147E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.316 | TFLOPs: 25.46 | +7: iteration 76230/ 173500 | consumed samples: 19514880 | consumed tokens: 39966474240 | elapsed time per iteration (s): 0.17 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.711605E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.364 | TFLOPs: 24.22 | +7: iteration 76240/ 173500 | consumed samples: 19517440 | consumed tokens: 39971717120 | elapsed time per iteration (s): 0.16 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.702544E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.546 | TFLOPs: 25.07 | +7: iteration 76250/ 173500 | consumed samples: 19520000 | consumed tokens: 39976960000 | elapsed time per iteration (s): 0.16 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.712219E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.568 | TFLOPs: 25.38 | +7: iteration 76260/ 173500 | consumed samples: 19522560 | consumed tokens: 39982202880 | elapsed time per iteration (s): 0.16 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.707544E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.974 | TFLOPs: 25.56 | +7: iteration 76270/ 173500 | consumed samples: 19525120 | consumed tokens: 39987445760 | elapsed time per iteration (s): 0.16 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.718710E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.528 | TFLOPs: 25.62 | +7: iteration 76280/ 173500 | consumed samples: 19527680 | consumed tokens: 39992688640 | elapsed time per iteration (s): 0.18 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.687582E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.613 | TFLOPs: 22.45 | +7: iteration 76290/ 173500 | consumed samples: 19530240 | consumed tokens: 39997931520 | elapsed time per iteration (s): 0.17 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.713676E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.594 | TFLOPs: 24.11 | +7: iteration 76300/ 173500 | consumed samples: 19532800 | consumed tokens: 40003174400 | elapsed time per iteration (s): 0.16 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.708997E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.216 | TFLOPs: 25.50 | +7: iteration 76310/ 173500 | consumed samples: 19535360 | consumed tokens: 40008417280 | elapsed time per iteration (s): 0.17 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.707709E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.475 | TFLOPs: 23.97 | +7: iteration 76320/ 173500 | consumed samples: 19537920 | consumed tokens: 40013660160 | elapsed time per iteration (s): 0.16 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.723011E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.124 | TFLOPs: 24.65 | +7: iteration 76330/ 173500 | consumed samples: 19540480 | consumed tokens: 40018903040 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.713224E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.593 | TFLOPs: 24.62 | +7: iteration 76340/ 173500 | consumed samples: 19543040 | consumed tokens: 40024145920 | elapsed time per iteration (s): 0.18 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.716378E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1427.588 | TFLOPs: 22.39 | +7: iteration 76350/ 173500 | consumed samples: 19545600 | consumed tokens: 40029388800 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.709719E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.335 | TFLOPs: 25.27 | +7: iteration 76360/ 173500 | consumed samples: 19548160 | consumed tokens: 40034631680 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.724730E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.873 | TFLOPs: 24.48 | +7: iteration 76370/ 173500 | consumed samples: 19550720 | consumed tokens: 40039874560 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.705687E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.687 | TFLOPs: 25.64 | +7: iteration 76380/ 173500 | consumed samples: 19553280 | consumed tokens: 40045117440 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.709790E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.173 | TFLOPs: 25.46 | +7: iteration 76390/ 173500 | consumed samples: 19555840 | consumed tokens: 40050360320 | elapsed time per iteration (s): 0.16 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.720119E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.030 | TFLOPs: 25.56 | +7: iteration 76400/ 173500 | consumed samples: 19558400 | consumed tokens: 40055603200 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.716640E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.280 | TFLOPs: 25.77 | +7: iteration 76410/ 173500 | consumed samples: 19560960 | consumed tokens: 40060846080 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.704835E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.015 | TFLOPs: 24.81 | +7: iteration 76420/ 173500 | consumed samples: 19563520 | consumed tokens: 40066088960 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.709953E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.881 | TFLOPs: 25.00 | +7: iteration 76430/ 173500 | consumed samples: 19566080 | consumed tokens: 40071331840 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.711529E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.751 | TFLOPs: 24.65 | +7: iteration 76440/ 173500 | consumed samples: 19568640 | consumed tokens: 40076574720 | elapsed time per iteration (s): 0.16 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.713110E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.254 | TFLOPs: 25.38 | +7: iteration 76450/ 173500 | consumed samples: 19571200 | consumed tokens: 40081817600 | elapsed time per iteration (s): 0.17 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.722379E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.431 | TFLOPs: 23.95 | +7: iteration 76460/ 173500 | consumed samples: 19573760 | consumed tokens: 40087060480 | elapsed time per iteration (s): 0.16 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.723438E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.201 | TFLOPs: 25.05 | +7: iteration 76470/ 173500 | consumed samples: 19576320 | consumed tokens: 40092303360 | elapsed time per iteration (s): 0.16 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.719061E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.927 | TFLOPs: 25.20 | +7: iteration 76480/ 173500 | consumed samples: 19578880 | consumed tokens: 40097546240 | elapsed time per iteration (s): 0.16 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.718641E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.744 | TFLOPs: 25.21 | +7: iteration 76490/ 173500 | consumed samples: 19581440 | consumed tokens: 40102789120 | elapsed time per iteration (s): 0.16 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.699593E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.088 | TFLOPs: 25.44 | +7: iteration 76500/ 173500 | consumed samples: 19584000 | consumed tokens: 40108032000 | elapsed time per iteration (s): 0.16 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.720528E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.905 | TFLOPs: 25.34 | +7: iteration 76510/ 173500 | consumed samples: 19586560 | consumed tokens: 40113274880 | elapsed time per iteration (s): 0.17 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.713614E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.597 | TFLOPs: 23.78 | +7: iteration 76520/ 173500 | consumed samples: 19589120 | consumed tokens: 40118517760 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.705912E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.608 | TFLOPs: 25.60 | +7: iteration 76530/ 173500 | consumed samples: 19591680 | consumed tokens: 40123760640 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.719898E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.338 | TFLOPs: 25.41 | +7: iteration 76540/ 173500 | consumed samples: 19594240 | consumed tokens: 40129003520 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.719516E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.987 | TFLOPs: 25.20 | +7: iteration 76550/ 173500 | consumed samples: 19596800 | consumed tokens: 40134246400 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.722366E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.190 | TFLOPs: 25.80 | +7: iteration 76560/ 173500 | consumed samples: 19599360 | consumed tokens: 40139489280 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.712693E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.869 | TFLOPs: 24.96 | +7: iteration 76570/ 173500 | consumed samples: 19601920 | consumed tokens: 40144732160 | elapsed time per iteration (s): 0.16 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.715994E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.525 | TFLOPs: 24.99 | +7: iteration 76580/ 173500 | consumed samples: 19604480 | consumed tokens: 40149975040 | elapsed time per iteration (s): 0.16 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.708020E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.702 | TFLOPs: 24.59 | +7: iteration 76590/ 173500 | consumed samples: 19607040 | consumed tokens: 40155217920 | elapsed time per iteration (s): 0.16 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.721961E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.078 | TFLOPs: 24.95 | +7: iteration 76600/ 173500 | consumed samples: 19609600 | consumed tokens: 40160460800 | elapsed time per iteration (s): 0.16 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.717236E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.726 | TFLOPs: 25.51 | +7: iteration 76610/ 173500 | consumed samples: 19612160 | consumed tokens: 40165703680 | elapsed time per iteration (s): 0.15 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.717066E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.835 | TFLOPs: 25.90 | +7: iteration 76620/ 173500 | consumed samples: 19614720 | consumed tokens: 40170946560 | elapsed time per iteration (s): 0.16 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.701784E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.045 | TFLOPs: 24.48 | +7: iteration 76630/ 173500 | consumed samples: 19617280 | consumed tokens: 40176189440 | elapsed time per iteration (s): 0.17 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.711916E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.232 | TFLOPs: 24.28 | +7: iteration 76640/ 173500 | consumed samples: 19619840 | consumed tokens: 40181432320 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.710798E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.015 | TFLOPs: 25.64 | +7: iteration 76650/ 173500 | consumed samples: 19622400 | consumed tokens: 40186675200 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.728674E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.998 | TFLOPs: 25.20 | +7: iteration 76660/ 173500 | consumed samples: 19624960 | consumed tokens: 40191918080 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.724423E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.121 | TFLOPs: 24.86 | +7: iteration 76670/ 173500 | consumed samples: 19627520 | consumed tokens: 40197160960 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.715805E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.467 | TFLOPs: 25.48 | +7: iteration 76680/ 173500 | consumed samples: 19630080 | consumed tokens: 40202403840 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.709976E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.264 | TFLOPs: 25.22 | +7: iteration 76690/ 173500 | consumed samples: 19632640 | consumed tokens: 40207646720 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.709756E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.988 | TFLOPs: 25.30 | +7: iteration 76700/ 173500 | consumed samples: 19635200 | consumed tokens: 40212889600 | elapsed time per iteration (s): 0.16 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.708713E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.094 | TFLOPs: 25.17 | +7: iteration 76710/ 173500 | consumed samples: 19637760 | consumed tokens: 40218132480 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.710752E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.850 | TFLOPs: 25.31 | +7: iteration 76720/ 173500 | consumed samples: 19640320 | consumed tokens: 40223375360 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.713102E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.987 | TFLOPs: 24.87 | +7: iteration 76730/ 173500 | consumed samples: 19642880 | consumed tokens: 40228618240 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.714189E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.310 | TFLOPs: 25.83 | +7: iteration 76740/ 173500 | consumed samples: 19645440 | consumed tokens: 40233861120 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.723064E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.299 | TFLOPs: 25.79 | +7: iteration 76750/ 173500 | consumed samples: 19648000 | consumed tokens: 40239104000 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.713616E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.462 | TFLOPs: 24.69 | +7: iteration 76760/ 173500 | consumed samples: 19650560 | consumed tokens: 40244346880 | elapsed time per iteration (s): 0.16 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.724052E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.735 | TFLOPs: 25.24 | +7: iteration 76770/ 173500 | consumed samples: 19653120 | consumed tokens: 40249589760 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.711102E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.429 | TFLOPs: 25.33 | +7: iteration 76780/ 173500 | consumed samples: 19655680 | consumed tokens: 40254832640 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.700821E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.287 | TFLOPs: 24.92 | +7: iteration 76790/ 173500 | consumed samples: 19658240 | consumed tokens: 40260075520 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.723043E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.480 | TFLOPs: 25.44 | +7: iteration 76800/ 173500 | consumed samples: 19660800 | consumed tokens: 40265318400 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.711597E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.498 | TFLOPs: 25.59 | +7: iteration 76810/ 173500 | consumed samples: 19663360 | consumed tokens: 40270561280 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.721012E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.951 | TFLOPs: 25.36 | +7: iteration 76820/ 173500 | consumed samples: 19665920 | consumed tokens: 40275804160 | elapsed time per iteration (s): 0.16 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.713211E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.375 | TFLOPs: 25.44 | +7: iteration 76830/ 173500 | consumed samples: 19668480 | consumed tokens: 40281047040 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.719996E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.224 | TFLOPs: 25.85 | +7: iteration 76840/ 173500 | consumed samples: 19671040 | consumed tokens: 40286289920 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.712941E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.939 | TFLOPs: 24.68 | +7: iteration 76850/ 173500 | consumed samples: 19673600 | consumed tokens: 40291532800 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.720724E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.159 | TFLOPs: 25.20 | +7: iteration 76860/ 173500 | consumed samples: 19676160 | consumed tokens: 40296775680 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.715795E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.962 | TFLOPs: 24.53 | +7: iteration 76870/ 173500 | consumed samples: 19678720 | consumed tokens: 40302018560 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.723001E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.605 | TFLOPs: 25.49 | +7: iteration 76880/ 173500 | consumed samples: 19681280 | consumed tokens: 40307261440 | elapsed time per iteration (s): 0.16 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.725361E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.252 | TFLOPs: 25.44 | +7: iteration 76890/ 173500 | consumed samples: 19683840 | consumed tokens: 40312504320 | elapsed time per iteration (s): 0.16 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.716862E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.270 | TFLOPs: 25.69 | +7: iteration 76900/ 173500 | consumed samples: 19686400 | consumed tokens: 40317747200 | elapsed time per iteration (s): 0.16 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.711507E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.177 | TFLOPs: 25.75 | +7: iteration 76910/ 173500 | consumed samples: 19688960 | consumed tokens: 40322990080 | elapsed time per iteration (s): 0.15 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.712229E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.647 | TFLOPs: 25.90 | +7: iteration 76920/ 173500 | consumed samples: 19691520 | consumed tokens: 40328232960 | elapsed time per iteration (s): 0.17 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.714120E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.735 | TFLOPs: 24.27 | +7: iteration 76930/ 173500 | consumed samples: 19694080 | consumed tokens: 40333475840 | elapsed time per iteration (s): 0.16 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.704194E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.725 | TFLOPs: 24.90 | +7: iteration 76940/ 173500 | consumed samples: 19696640 | consumed tokens: 40338718720 | elapsed time per iteration (s): 0.16 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.709224E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.859 | TFLOPs: 25.18 | +7: iteration 76950/ 173500 | consumed samples: 19699200 | consumed tokens: 40343961600 | elapsed time per iteration (s): 0.16 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.715490E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.787 | TFLOPs: 24.74 | +7: iteration 76960/ 173500 | consumed samples: 19701760 | consumed tokens: 40349204480 | elapsed time per iteration (s): 0.16 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.717633E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.109 | TFLOPs: 24.80 | +7: iteration 76970/ 173500 | consumed samples: 19704320 | consumed tokens: 40354447360 | elapsed time per iteration (s): 0.17 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.718984E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.316 | TFLOPs: 24.01 | +7: iteration 76980/ 173500 | consumed samples: 19706880 | consumed tokens: 40359690240 | elapsed time per iteration (s): 0.17 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.712052E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.564 | TFLOPs: 23.83 | +7: iteration 76990/ 173500 | consumed samples: 19709440 | consumed tokens: 40364933120 | elapsed time per iteration (s): 0.16 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.705386E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.494 | TFLOPs: 24.66 | +7: iteration 77000/ 173500 | consumed samples: 19712000 | consumed tokens: 40370176000 | elapsed time per iteration (s): 0.16 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.710176E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.688 | TFLOPs: 25.28 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 77000 | lm loss value: 3.802668E+00 | lm loss PPL: 4.482059E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 77000 to checkpoints_44m91b100m +0: [2023-03-17 03:35:30,111] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step77000 is begin to save! +0: [2023-03-17 03:35:30,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:35:30,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:35:30,176] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:35:30,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:35:30,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:35:30,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:35:30,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:35:30,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:35:30,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:35:30,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:35:30,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:35:30,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:35:30,219] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:35:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:35:30,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:35:30,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:35:30,235] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:35:30,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:35:30,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:35:30,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:35:30,244] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step77000/mp_rank_00_model_states.pt +0: [2023-03-17 03:35:30,244] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:35:30,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:35:30,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:35:30,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:35:30,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +2: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:35:30,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +6: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +3: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +7: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +1: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +4: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:35:30,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:35:30,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +5: [2023-03-17 03:35:30,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:35:30,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step77000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:35:30,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step77000 is ready now! +0: successfully saved checkpoint at iteration 77000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.77 +7: iteration 77010/ 173500 | consumed samples: 19714560 | consumed tokens: 40375418880 | elapsed time per iteration (s): 0.18 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.709255E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.102 | TFLOPs: 21.88 | +7: iteration 77020/ 173500 | consumed samples: 19717120 | consumed tokens: 40380661760 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.726500E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.623 | TFLOPs: 24.68 | +7: iteration 77030/ 173500 | consumed samples: 19719680 | consumed tokens: 40385904640 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.728897E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.727 | TFLOPs: 25.21 | +7: iteration 77040/ 173500 | consumed samples: 19722240 | consumed tokens: 40391147520 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.714230E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.739 | TFLOPs: 24.81 | +7: iteration 77050/ 173500 | consumed samples: 19724800 | consumed tokens: 40396390400 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.709613E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.936 | TFLOPs: 25.31 | +7: iteration 77060/ 173500 | consumed samples: 19727360 | consumed tokens: 40401633280 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.726643E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.269 | TFLOPs: 25.39 | +7: iteration 77070/ 173500 | consumed samples: 19729920 | consumed tokens: 40406876160 | elapsed time per iteration (s): 0.16 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.706874E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.779 | TFLOPs: 25.73 | +7: iteration 77080/ 173500 | consumed samples: 19732480 | consumed tokens: 40412119040 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.724256E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.248 | TFLOPs: 25.11 | +7: iteration 77090/ 173500 | consumed samples: 19735040 | consumed tokens: 40417361920 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.720530E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.792 | TFLOPs: 25.79 | +7: iteration 77100/ 173500 | consumed samples: 19737600 | consumed tokens: 40422604800 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.718025E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.680 | TFLOPs: 24.44 | +7: iteration 77110/ 173500 | consumed samples: 19740160 | consumed tokens: 40427847680 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.715405E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.953 | TFLOPs: 25.40 | +7: iteration 77120/ 173500 | consumed samples: 19742720 | consumed tokens: 40433090560 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.708879E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.994 | TFLOPs: 25.00 | +7: iteration 77130/ 173500 | consumed samples: 19745280 | consumed tokens: 40438333440 | elapsed time per iteration (s): 0.16 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.715414E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.886 | TFLOPs: 24.71 | +7: iteration 77140/ 173500 | consumed samples: 19747840 | consumed tokens: 40443576320 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.712291E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.742 | TFLOPs: 24.35 | +7: iteration 77150/ 173500 | consumed samples: 19750400 | consumed tokens: 40448819200 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.712730E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.187 | TFLOPs: 25.14 | +7: iteration 77160/ 173500 | consumed samples: 19752960 | consumed tokens: 40454062080 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.727987E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.178 | TFLOPs: 25.14 | +7: iteration 77170/ 173500 | consumed samples: 19755520 | consumed tokens: 40459304960 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.706028E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.235 | TFLOPs: 25.22 | +7: iteration 77180/ 173500 | consumed samples: 19758080 | consumed tokens: 40464547840 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.708772E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.108 | TFLOPs: 24.48 | +7: iteration 77190/ 173500 | consumed samples: 19760640 | consumed tokens: 40469790720 | elapsed time per iteration (s): 0.16 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.708356E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.053 | TFLOPs: 25.25 | +7: iteration 77200/ 173500 | consumed samples: 19763200 | consumed tokens: 40475033600 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.696746E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.670 | TFLOPs: 25.29 | +7: iteration 77210/ 173500 | consumed samples: 19765760 | consumed tokens: 40480276480 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.728122E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.532 | TFLOPs: 24.66 | +7: iteration 77220/ 173500 | consumed samples: 19768320 | consumed tokens: 40485519360 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.710638E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.331 | TFLOPs: 25.32 | +7: iteration 77230/ 173500 | consumed samples: 19770880 | consumed tokens: 40490762240 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.716102E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.159 | TFLOPs: 25.02 | +7: iteration 77240/ 173500 | consumed samples: 19773440 | consumed tokens: 40496005120 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.715964E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.973 | TFLOPs: 25.06 | +7: iteration 77250/ 173500 | consumed samples: 19776000 | consumed tokens: 40501248000 | elapsed time per iteration (s): 0.16 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.722736E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.902 | TFLOPs: 25.61 | +7: iteration 77260/ 173500 | consumed samples: 19778560 | consumed tokens: 40506490880 | elapsed time per iteration (s): 0.17 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.715737E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.545 | TFLOPs: 24.29 | +7: iteration 77270/ 173500 | consumed samples: 19781120 | consumed tokens: 40511733760 | elapsed time per iteration (s): 0.17 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.712961E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.313 | TFLOPs: 24.22 | +7: iteration 77280/ 173500 | consumed samples: 19783680 | consumed tokens: 40516976640 | elapsed time per iteration (s): 0.16 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.707219E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.639 | TFLOPs: 24.44 | +7: iteration 77290/ 173500 | consumed samples: 19786240 | consumed tokens: 40522219520 | elapsed time per iteration (s): 0.17 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.723420E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.420 | TFLOPs: 24.24 | +7: iteration 77300/ 173500 | consumed samples: 19788800 | consumed tokens: 40527462400 | elapsed time per iteration (s): 0.16 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.701582E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.446 | TFLOPs: 25.19 | +7: iteration 77310/ 173500 | consumed samples: 19791360 | consumed tokens: 40532705280 | elapsed time per iteration (s): 0.16 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.724745E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.050 | TFLOPs: 25.19 | +7: iteration 77320/ 173500 | consumed samples: 19793920 | consumed tokens: 40537948160 | elapsed time per iteration (s): 0.16 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.722117E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.859 | TFLOPs: 24.62 | +7: iteration 77330/ 173500 | consumed samples: 19796480 | consumed tokens: 40543191040 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.706527E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.737 | TFLOPs: 25.07 | +7: iteration 77340/ 173500 | consumed samples: 19799040 | consumed tokens: 40548433920 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.726737E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.846 | TFLOPs: 25.36 | +7: iteration 77350/ 173500 | consumed samples: 19801600 | consumed tokens: 40553676800 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.711109E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.358 | TFLOPs: 24.61 | +7: iteration 77360/ 173500 | consumed samples: 19804160 | consumed tokens: 40558919680 | elapsed time per iteration (s): 0.17 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.711258E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.418 | TFLOPs: 23.77 | +7: iteration 77370/ 173500 | consumed samples: 19806720 | consumed tokens: 40564162560 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.716650E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.385 | TFLOPs: 25.27 | +7: iteration 77380/ 173500 | consumed samples: 19809280 | consumed tokens: 40569405440 | elapsed time per iteration (s): 0.16 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.720370E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.691 | TFLOPs: 24.59 | +7: iteration 77390/ 173500 | consumed samples: 19811840 | consumed tokens: 40574648320 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.713909E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.420 | TFLOPs: 24.52 | +7: iteration 77400/ 173500 | consumed samples: 19814400 | consumed tokens: 40579891200 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.708575E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.873 | TFLOPs: 24.85 | +7: iteration 77410/ 173500 | consumed samples: 19816960 | consumed tokens: 40585134080 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.718378E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.707 | TFLOPs: 25.13 | +7: iteration 77420/ 173500 | consumed samples: 19819520 | consumed tokens: 40590376960 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.707367E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.083 | TFLOPs: 25.30 | +7: iteration 77430/ 173500 | consumed samples: 19822080 | consumed tokens: 40595619840 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.724749E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.567 | TFLOPs: 25.10 | +7: iteration 77440/ 173500 | consumed samples: 19824640 | consumed tokens: 40600862720 | elapsed time per iteration (s): 0.16 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.716883E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.643 | TFLOPs: 25.01 | +7: iteration 77450/ 173500 | consumed samples: 19827200 | consumed tokens: 40606105600 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.709142E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.273 | TFLOPs: 24.70 | +7: iteration 77460/ 173500 | consumed samples: 19829760 | consumed tokens: 40611348480 | elapsed time per iteration (s): 0.17 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.701601E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.959 | TFLOPs: 24.09 | +7: iteration 77470/ 173500 | consumed samples: 19832320 | consumed tokens: 40616591360 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.726983E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.875 | TFLOPs: 25.53 | +7: iteration 77480/ 173500 | consumed samples: 19834880 | consumed tokens: 40621834240 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.708001E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.644 | TFLOPs: 25.26 | +7: iteration 77490/ 173500 | consumed samples: 19837440 | consumed tokens: 40627077120 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.714793E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.104 | TFLOPs: 25.52 | +7: iteration 77500/ 173500 | consumed samples: 19840000 | consumed tokens: 40632320000 | elapsed time per iteration (s): 0.16 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.717597E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.853 | TFLOPs: 24.54 | +7: iteration 77510/ 173500 | consumed samples: 19842560 | consumed tokens: 40637562880 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.721806E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.710 | TFLOPs: 25.50 | +7: iteration 77520/ 173500 | consumed samples: 19845120 | consumed tokens: 40642805760 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.718629E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.693 | TFLOPs: 25.34 | +7: iteration 77530/ 173500 | consumed samples: 19847680 | consumed tokens: 40648048640 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.704951E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.210 | TFLOPs: 25.46 | +7: iteration 77540/ 173500 | consumed samples: 19850240 | consumed tokens: 40653291520 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.718693E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.327 | TFLOPs: 24.92 | +7: iteration 77550/ 173500 | consumed samples: 19852800 | consumed tokens: 40658534400 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.720331E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.814 | TFLOPs: 25.25 | +7: iteration 77560/ 173500 | consumed samples: 19855360 | consumed tokens: 40663777280 | elapsed time per iteration (s): 0.16 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.718818E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.427 | TFLOPs: 25.02 | +7: iteration 77570/ 173500 | consumed samples: 19857920 | consumed tokens: 40669020160 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.703994E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.315 | TFLOPs: 25.24 | +7: iteration 77580/ 173500 | consumed samples: 19860480 | consumed tokens: 40674263040 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.713383E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.997 | TFLOPs: 25.09 | +7: iteration 77590/ 173500 | consumed samples: 19863040 | consumed tokens: 40679505920 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.699505E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.231 | TFLOPs: 25.36 | +7: iteration 77600/ 173500 | consumed samples: 19865600 | consumed tokens: 40684748800 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.703968E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.971 | TFLOPs: 24.34 | +7: iteration 77610/ 173500 | consumed samples: 19868160 | consumed tokens: 40689991680 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.689685E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.662 | TFLOPs: 25.46 | +7: iteration 77620/ 173500 | consumed samples: 19870720 | consumed tokens: 40695234560 | elapsed time per iteration (s): 0.16 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.720503E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.354 | TFLOPs: 25.29 | +7: iteration 77630/ 173500 | consumed samples: 19873280 | consumed tokens: 40700477440 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.722313E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.821 | TFLOPs: 24.89 | +7: iteration 77640/ 173500 | consumed samples: 19875840 | consumed tokens: 40705720320 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.720080E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.051 | TFLOPs: 25.16 | +7: iteration 77650/ 173500 | consumed samples: 19878400 | consumed tokens: 40710963200 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.720949E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.233 | TFLOPs: 25.27 | +7: iteration 77660/ 173500 | consumed samples: 19880960 | consumed tokens: 40716206080 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.718637E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.859 | TFLOPs: 24.92 | +7: iteration 77670/ 173500 | consumed samples: 19883520 | consumed tokens: 40721448960 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.721276E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.022 | TFLOPs: 25.09 | +7: iteration 77680/ 173500 | consumed samples: 19886080 | consumed tokens: 40726691840 | elapsed time per iteration (s): 0.16 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.696387E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.003 | TFLOPs: 24.92 | +7: iteration 77690/ 173500 | consumed samples: 19888640 | consumed tokens: 40731934720 | elapsed time per iteration (s): 0.17 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.717599E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1482.704 | TFLOPs: 23.25 | +7: iteration 77700/ 173500 | consumed samples: 19891200 | consumed tokens: 40737177600 | elapsed time per iteration (s): 0.16 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.705993E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.251 | TFLOPs: 24.55 | +7: iteration 77710/ 173500 | consumed samples: 19893760 | consumed tokens: 40742420480 | elapsed time per iteration (s): 0.16 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.706174E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.972 | TFLOPs: 25.69 | +7: iteration 77720/ 173500 | consumed samples: 19896320 | consumed tokens: 40747663360 | elapsed time per iteration (s): 0.16 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.711714E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.399 | TFLOPs: 25.85 | +7: iteration 77730/ 173500 | consumed samples: 19898880 | consumed tokens: 40752906240 | elapsed time per iteration (s): 0.15 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.716375E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.433 | TFLOPs: 26.15 | +7: iteration 77740/ 173500 | consumed samples: 19901440 | consumed tokens: 40758149120 | elapsed time per iteration (s): 0.16 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.726331E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.618 | TFLOPs: 25.76 | +7: iteration 77750/ 173500 | consumed samples: 19904000 | consumed tokens: 40763392000 | elapsed time per iteration (s): 0.16 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.708013E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.733 | TFLOPs: 24.99 | +7: iteration 77760/ 173500 | consumed samples: 19906560 | consumed tokens: 40768634880 | elapsed time per iteration (s): 0.16 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.716118E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.261 | TFLOPs: 25.27 | +7: iteration 77770/ 173500 | consumed samples: 19909120 | consumed tokens: 40773877760 | elapsed time per iteration (s): 0.16 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.722978E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.424 | TFLOPs: 25.04 | +7: iteration 77780/ 173500 | consumed samples: 19911680 | consumed tokens: 40779120640 | elapsed time per iteration (s): 0.16 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.720066E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.585 | TFLOPs: 25.57 | +7: iteration 77790/ 173500 | consumed samples: 19914240 | consumed tokens: 40784363520 | elapsed time per iteration (s): 0.15 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.711914E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.871 | TFLOPs: 26.23 | +7: iteration 77800/ 173500 | consumed samples: 19916800 | consumed tokens: 40789606400 | elapsed time per iteration (s): 0.16 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.717601E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.774 | TFLOPs: 25.84 | +7: iteration 77810/ 173500 | consumed samples: 19919360 | consumed tokens: 40794849280 | elapsed time per iteration (s): 0.16 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.714797E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.297 | TFLOPs: 25.61 | +7: iteration 77820/ 173500 | consumed samples: 19921920 | consumed tokens: 40800092160 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.714274E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.893 | TFLOPs: 25.76 | +7: iteration 77830/ 173500 | consumed samples: 19924480 | consumed tokens: 40805335040 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.719341E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.746 | TFLOPs: 25.17 | +7: iteration 77840/ 173500 | consumed samples: 19927040 | consumed tokens: 40810577920 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.702080E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.735 | TFLOPs: 25.24 | +7: iteration 77850/ 173500 | consumed samples: 19929600 | consumed tokens: 40815820800 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.711496E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.335 | TFLOPs: 25.19 | +7: iteration 77860/ 173500 | consumed samples: 19932160 | consumed tokens: 40821063680 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.725543E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.937 | TFLOPs: 24.97 | +7: iteration 77870/ 173500 | consumed samples: 19934720 | consumed tokens: 40826306560 | elapsed time per iteration (s): 0.16 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.708975E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.410 | TFLOPs: 25.40 | +7: iteration 77880/ 173500 | consumed samples: 19937280 | consumed tokens: 40831549440 | elapsed time per iteration (s): 0.16 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.702703E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.917 | TFLOPs: 25.48 | +7: iteration 77890/ 173500 | consumed samples: 19939840 | consumed tokens: 40836792320 | elapsed time per iteration (s): 0.17 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.712296E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.064 | TFLOPs: 24.31 | +7: iteration 77900/ 173500 | consumed samples: 19942400 | consumed tokens: 40842035200 | elapsed time per iteration (s): 0.16 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.701333E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.899 | TFLOPs: 25.06 | +7: iteration 77910/ 173500 | consumed samples: 19944960 | consumed tokens: 40847278080 | elapsed time per iteration (s): 0.17 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.698608E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.226 | TFLOPs: 23.87 | +7: iteration 77920/ 173500 | consumed samples: 19947520 | consumed tokens: 40852520960 | elapsed time per iteration (s): 0.16 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.713208E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.847 | TFLOPs: 25.09 | +7: iteration 77930/ 173500 | consumed samples: 19950080 | consumed tokens: 40857763840 | elapsed time per iteration (s): 0.16 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.721429E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.694 | TFLOPs: 25.06 | +7: iteration 77940/ 173500 | consumed samples: 19952640 | consumed tokens: 40863006720 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.704195E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.818 | TFLOPs: 25.76 | +7: iteration 77950/ 173500 | consumed samples: 19955200 | consumed tokens: 40868249600 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.699908E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.249 | TFLOPs: 24.99 | +7: iteration 77960/ 173500 | consumed samples: 19957760 | consumed tokens: 40873492480 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.707212E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.685 | TFLOPs: 25.26 | +7: iteration 77970/ 173500 | consumed samples: 19960320 | consumed tokens: 40878735360 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.722788E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.759 | TFLOPs: 25.72 | +7: iteration 77980/ 173500 | consumed samples: 19962880 | consumed tokens: 40883978240 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.724650E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.242 | TFLOPs: 25.74 | +7: iteration 77990/ 173500 | consumed samples: 19965440 | consumed tokens: 40889221120 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.714598E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.880 | TFLOPs: 25.76 | +0: [2023-03-17 03:38:10,357] [INFO] [logging.py:68:log_dist] [Rank 0] step=78000, skipped=0, lr=[0.00012575030905458257, 0.00012575030905458257, 0.00012575030905458257], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 78000/ 173500 | consumed samples: 19968000 | consumed tokens: 40894464000 | elapsed time per iteration (s): 0.16 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.712680E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.483 | TFLOPs: 25.48 | +0: steps: 78000 loss: 3.7013 iter time (s): 0.158 samples/sec: 1615.537 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 78000 | lm loss value: 3.813480E+00 | lm loss PPL: 4.530784E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 78000 to checkpoints_44m91b100m +0: [2023-03-17 03:38:10,431] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step78000 is begin to save! +0: [2023-03-17 03:38:10,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:38:10,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:38:10,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:38:10,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:38:10,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:38:10,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:38:10,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:38:10,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:38:10,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:38:10,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:38:10,530] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:38:10,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:38:10,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:38:10,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:38:10,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:38:10,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:38:10,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:38:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:38:10,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:38:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:38:10,563] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step78000/mp_rank_00_model_states.pt +0: [2023-03-17 03:38:10,563] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:38:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:38:10,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:38:10,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:38:10,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:38:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +2: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +7: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +5: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +1: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +4: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:38:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +6: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:38:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:38:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step78000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +3: [2023-03-17 03:38:10,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step78000 is ready now! +0: successfully saved checkpoint at iteration 78000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.04 +7: iteration 78010/ 173500 | consumed samples: 19970560 | consumed tokens: 40899706880 | elapsed time per iteration (s): 0.19 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.715255E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1363.504 | TFLOPs: 21.38 | +7: iteration 78020/ 173500 | consumed samples: 19973120 | consumed tokens: 40904949760 | elapsed time per iteration (s): 0.16 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.715137E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.730 | TFLOPs: 25.82 | +7: iteration 78030/ 173500 | consumed samples: 19975680 | consumed tokens: 40910192640 | elapsed time per iteration (s): 0.16 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.722261E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.978 | TFLOPs: 25.15 | +7: iteration 78040/ 173500 | consumed samples: 19978240 | consumed tokens: 40915435520 | elapsed time per iteration (s): 0.16 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.710159E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.262 | TFLOPs: 25.50 | +7: iteration 78050/ 173500 | consumed samples: 19980800 | consumed tokens: 40920678400 | elapsed time per iteration (s): 0.15 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.725565E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.931 | TFLOPs: 25.94 | +7: iteration 78060/ 173500 | consumed samples: 19983360 | consumed tokens: 40925921280 | elapsed time per iteration (s): 0.16 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.709492E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.057 | TFLOPs: 25.64 | +7: iteration 78070/ 173500 | consumed samples: 19985920 | consumed tokens: 40931164160 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.708941E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.780 | TFLOPs: 25.48 | +7: iteration 78080/ 173500 | consumed samples: 19988480 | consumed tokens: 40936407040 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.713854E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.514 | TFLOPs: 25.08 | +7: iteration 78090/ 173500 | consumed samples: 19991040 | consumed tokens: 40941649920 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.712058E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.465 | TFLOPs: 24.71 | +7: iteration 78100/ 173500 | consumed samples: 19993600 | consumed tokens: 40946892800 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.707428E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.138 | TFLOPs: 24.78 | +7: iteration 78110/ 173500 | consumed samples: 19996160 | consumed tokens: 40952135680 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.712670E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.308 | TFLOPs: 25.65 | +7: iteration 78120/ 173500 | consumed samples: 19998720 | consumed tokens: 40957378560 | elapsed time per iteration (s): 0.16 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.710423E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.417 | TFLOPs: 24.88 | +7: iteration 78130/ 173500 | consumed samples: 20001280 | consumed tokens: 40962621440 | elapsed time per iteration (s): 0.17 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.712666E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.222 | TFLOPs: 23.37 | +7: iteration 78140/ 173500 | consumed samples: 20003840 | consumed tokens: 40967864320 | elapsed time per iteration (s): 0.16 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.722792E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.797 | TFLOPs: 25.39 | +7: iteration 78150/ 173500 | consumed samples: 20006400 | consumed tokens: 40973107200 | elapsed time per iteration (s): 0.16 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.715247E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.465 | TFLOPs: 25.90 | +7: iteration 78160/ 173500 | consumed samples: 20008960 | consumed tokens: 40978350080 | elapsed time per iteration (s): 0.16 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.711748E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.837 | TFLOPs: 25.07 | +7: iteration 78170/ 173500 | consumed samples: 20011520 | consumed tokens: 40983592960 | elapsed time per iteration (s): 0.16 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.706731E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.415 | TFLOPs: 24.91 | +7: iteration 78180/ 173500 | consumed samples: 20014080 | consumed tokens: 40988835840 | elapsed time per iteration (s): 0.16 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.712406E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.920 | TFLOPs: 24.89 | +7: iteration 78190/ 173500 | consumed samples: 20016640 | consumed tokens: 40994078720 | elapsed time per iteration (s): 0.16 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.725725E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.767 | TFLOPs: 25.72 | +7: iteration 78200/ 173500 | consumed samples: 20019200 | consumed tokens: 40999321600 | elapsed time per iteration (s): 0.16 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.718387E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.571 | TFLOPs: 24.90 | +7: iteration 78210/ 173500 | consumed samples: 20021760 | consumed tokens: 41004564480 | elapsed time per iteration (s): 0.17 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.724222E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.297 | TFLOPs: 24.19 | +7: iteration 78220/ 173500 | consumed samples: 20024320 | consumed tokens: 41009807360 | elapsed time per iteration (s): 0.16 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.726004E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.220 | TFLOPs: 24.66 | +7: iteration 78230/ 173500 | consumed samples: 20026880 | consumed tokens: 41015050240 | elapsed time per iteration (s): 0.16 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.708156E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.769 | TFLOPs: 25.45 | +7: iteration 78240/ 173500 | consumed samples: 20029440 | consumed tokens: 41020293120 | elapsed time per iteration (s): 0.16 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.718071E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.155 | TFLOPs: 25.08 | +7: iteration 78250/ 173500 | consumed samples: 20032000 | consumed tokens: 41025536000 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.729388E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.724 | TFLOPs: 25.70 | +7: iteration 78260/ 173500 | consumed samples: 20034560 | consumed tokens: 41030778880 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.709178E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.688 | TFLOPs: 25.67 | +7: iteration 78270/ 173500 | consumed samples: 20037120 | consumed tokens: 41036021760 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.705061E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.954 | TFLOPs: 25.25 | +7: iteration 78280/ 173500 | consumed samples: 20039680 | consumed tokens: 41041264640 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.715149E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.064 | TFLOPs: 24.61 | +7: iteration 78290/ 173500 | consumed samples: 20042240 | consumed tokens: 41046507520 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.709813E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.447 | TFLOPs: 24.71 | +7: iteration 78300/ 173500 | consumed samples: 20044800 | consumed tokens: 41051750400 | elapsed time per iteration (s): 0.16 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.708399E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.238 | TFLOPs: 25.35 | +7: iteration 78310/ 173500 | consumed samples: 20047360 | consumed tokens: 41056993280 | elapsed time per iteration (s): 0.17 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.711772E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.304 | TFLOPs: 23.92 | +7: iteration 78320/ 173500 | consumed samples: 20049920 | consumed tokens: 41062236160 | elapsed time per iteration (s): 0.16 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.711874E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.815 | TFLOPs: 25.70 | +7: iteration 78330/ 173500 | consumed samples: 20052480 | consumed tokens: 41067479040 | elapsed time per iteration (s): 0.16 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.716153E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.799 | TFLOPs: 25.73 | +7: iteration 78340/ 173500 | consumed samples: 20055040 | consumed tokens: 41072721920 | elapsed time per iteration (s): 0.16 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.708641E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.917 | TFLOPs: 25.58 | +7: iteration 78350/ 173500 | consumed samples: 20057600 | consumed tokens: 41077964800 | elapsed time per iteration (s): 0.17 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.716146E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.674 | TFLOPs: 23.99 | +7: iteration 78360/ 173500 | consumed samples: 20060160 | consumed tokens: 41083207680 | elapsed time per iteration (s): 0.16 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.696605E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.635 | TFLOPs: 24.74 | +7: iteration 78370/ 173500 | consumed samples: 20062720 | consumed tokens: 41088450560 | elapsed time per iteration (s): 0.17 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.713818E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.476 | TFLOPs: 24.27 | +7: iteration 78380/ 173500 | consumed samples: 20065280 | consumed tokens: 41093693440 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.718766E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.955 | TFLOPs: 25.39 | +7: iteration 78390/ 173500 | consumed samples: 20067840 | consumed tokens: 41098936320 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.705871E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.722 | TFLOPs: 25.17 | +7: iteration 78400/ 173500 | consumed samples: 20070400 | consumed tokens: 41104179200 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.708728E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.282 | TFLOPs: 25.35 | +7: iteration 78410/ 173500 | consumed samples: 20072960 | consumed tokens: 41109422080 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.717972E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.926 | TFLOPs: 25.08 | +7: iteration 78420/ 173500 | consumed samples: 20075520 | consumed tokens: 41114664960 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.700385E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.595 | TFLOPs: 25.74 | +7: iteration 78430/ 173500 | consumed samples: 20078080 | consumed tokens: 41119907840 | elapsed time per iteration (s): 0.16 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.706685E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.409 | TFLOPs: 25.08 | +7: iteration 78440/ 173500 | consumed samples: 20080640 | consumed tokens: 41125150720 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.704491E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.399 | TFLOPs: 25.16 | +7: iteration 78450/ 173500 | consumed samples: 20083200 | consumed tokens: 41130393600 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.720436E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.191 | TFLOPs: 25.52 | +7: iteration 78460/ 173500 | consumed samples: 20085760 | consumed tokens: 41135636480 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.703527E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.934 | TFLOPs: 24.79 | +7: iteration 78470/ 173500 | consumed samples: 20088320 | consumed tokens: 41140879360 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.706821E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.561 | TFLOPs: 25.56 | +7: iteration 78480/ 173500 | consumed samples: 20090880 | consumed tokens: 41146122240 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.710983E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.056 | TFLOPs: 25.75 | +7: iteration 78490/ 173500 | consumed samples: 20093440 | consumed tokens: 41151365120 | elapsed time per iteration (s): 0.16 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.725396E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.232 | TFLOPs: 25.25 | +7: iteration 78500/ 173500 | consumed samples: 20096000 | consumed tokens: 41156608000 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.730213E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.566 | TFLOPs: 24.80 | +7: iteration 78510/ 173500 | consumed samples: 20098560 | consumed tokens: 41161850880 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.713581E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.964 | TFLOPs: 25.34 | +7: iteration 78520/ 173500 | consumed samples: 20101120 | consumed tokens: 41167093760 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.722382E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.969 | TFLOPs: 25.69 | +7: iteration 78530/ 173500 | consumed samples: 20103680 | consumed tokens: 41172336640 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.712395E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.412 | TFLOPs: 24.46 | +7: iteration 78540/ 173500 | consumed samples: 20106240 | consumed tokens: 41177579520 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.703316E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.843 | TFLOPs: 25.06 | +7: iteration 78550/ 173500 | consumed samples: 20108800 | consumed tokens: 41182822400 | elapsed time per iteration (s): 0.16 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.721617E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.034 | TFLOPs: 25.61 | +7: iteration 78560/ 173500 | consumed samples: 20111360 | consumed tokens: 41188065280 | elapsed time per iteration (s): 0.16 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.709730E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.951 | TFLOPs: 25.17 | +7: iteration 78570/ 173500 | consumed samples: 20113920 | consumed tokens: 41193308160 | elapsed time per iteration (s): 0.15 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.696811E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.547 | TFLOPs: 26.21 | +7: iteration 78580/ 173500 | consumed samples: 20116480 | consumed tokens: 41198551040 | elapsed time per iteration (s): 0.16 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.710115E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.481 | TFLOPs: 25.38 | +7: iteration 78590/ 173500 | consumed samples: 20119040 | consumed tokens: 41203793920 | elapsed time per iteration (s): 0.16 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.706959E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.038 | TFLOPs: 25.01 | +7: iteration 78600/ 173500 | consumed samples: 20121600 | consumed tokens: 41209036800 | elapsed time per iteration (s): 0.15 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.719649E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.764 | TFLOPs: 26.22 | +7: iteration 78610/ 173500 | consumed samples: 20124160 | consumed tokens: 41214279680 | elapsed time per iteration (s): 0.16 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.712568E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.626 | TFLOPs: 25.53 | +7: iteration 78620/ 173500 | consumed samples: 20126720 | consumed tokens: 41219522560 | elapsed time per iteration (s): 0.16 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.702995E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.743 | TFLOPs: 25.09 | +7: iteration 78630/ 173500 | consumed samples: 20129280 | consumed tokens: 41224765440 | elapsed time per iteration (s): 0.16 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.722130E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.206 | TFLOPs: 24.94 | +7: iteration 78640/ 173500 | consumed samples: 20131840 | consumed tokens: 41230008320 | elapsed time per iteration (s): 0.16 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.722642E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.973 | TFLOPs: 25.50 | +7: iteration 78650/ 173500 | consumed samples: 20134400 | consumed tokens: 41235251200 | elapsed time per iteration (s): 0.16 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.708838E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.297 | TFLOPs: 25.63 | +7: iteration 78660/ 173500 | consumed samples: 20136960 | consumed tokens: 41240494080 | elapsed time per iteration (s): 0.17 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.716637E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.918 | TFLOPs: 23.98 | +7: iteration 78670/ 173500 | consumed samples: 20139520 | consumed tokens: 41245736960 | elapsed time per iteration (s): 0.16 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.709800E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.655 | TFLOPs: 25.89 | +7: iteration 78680/ 173500 | consumed samples: 20142080 | consumed tokens: 41250979840 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.710449E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.121 | TFLOPs: 25.85 | +7: iteration 78690/ 173500 | consumed samples: 20144640 | consumed tokens: 41256222720 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.694865E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.987 | TFLOPs: 25.83 | +7: iteration 78700/ 173500 | consumed samples: 20147200 | consumed tokens: 41261465600 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.715339E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.264 | TFLOPs: 25.74 | +7: iteration 78710/ 173500 | consumed samples: 20149760 | consumed tokens: 41266708480 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.707435E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.879 | TFLOPs: 25.25 | +7: iteration 78720/ 173500 | consumed samples: 20152320 | consumed tokens: 41271951360 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.702767E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.839 | TFLOPs: 25.48 | +7: iteration 78730/ 173500 | consumed samples: 20154880 | consumed tokens: 41277194240 | elapsed time per iteration (s): 0.16 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.712413E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.208 | TFLOPs: 25.58 | +7: iteration 78740/ 173500 | consumed samples: 20157440 | consumed tokens: 41282437120 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.694983E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.329 | TFLOPs: 25.16 | +7: iteration 78750/ 173500 | consumed samples: 20160000 | consumed tokens: 41287680000 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.703456E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.830 | TFLOPs: 25.18 | +7: iteration 78760/ 173500 | consumed samples: 20162560 | consumed tokens: 41292922880 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.719179E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.773 | TFLOPs: 25.21 | +7: iteration 78770/ 173500 | consumed samples: 20165120 | consumed tokens: 41298165760 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.712008E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.248 | TFLOPs: 25.69 | +7: iteration 78780/ 173500 | consumed samples: 20167680 | consumed tokens: 41303408640 | elapsed time per iteration (s): 0.17 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.715519E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.247 | TFLOPs: 24.26 | +7: iteration 78790/ 173500 | consumed samples: 20170240 | consumed tokens: 41308651520 | elapsed time per iteration (s): 0.16 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.727419E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.406 | TFLOPs: 25.66 | +7: iteration 78800/ 173500 | consumed samples: 20172800 | consumed tokens: 41313894400 | elapsed time per iteration (s): 0.17 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.708765E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.704 | TFLOPs: 24.30 | +7: iteration 78810/ 173500 | consumed samples: 20175360 | consumed tokens: 41319137280 | elapsed time per iteration (s): 0.17 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.697433E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.807 | TFLOPs: 24.29 | +7: iteration 78820/ 173500 | consumed samples: 20177920 | consumed tokens: 41324380160 | elapsed time per iteration (s): 0.17 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.715794E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.628 | TFLOPs: 24.26 | +7: iteration 78830/ 173500 | consumed samples: 20180480 | consumed tokens: 41329623040 | elapsed time per iteration (s): 0.15 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.700224E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.419 | TFLOPs: 26.09 | +7: iteration 78840/ 173500 | consumed samples: 20183040 | consumed tokens: 41334865920 | elapsed time per iteration (s): 0.16 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.703875E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.143 | TFLOPs: 25.03 | +7: iteration 78850/ 173500 | consumed samples: 20185600 | consumed tokens: 41340108800 | elapsed time per iteration (s): 0.16 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.703345E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.313 | TFLOPs: 25.80 | +7: iteration 78860/ 173500 | consumed samples: 20188160 | consumed tokens: 41345351680 | elapsed time per iteration (s): 0.16 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.710956E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.271 | TFLOPs: 25.06 | +7: iteration 78870/ 173500 | consumed samples: 20190720 | consumed tokens: 41350594560 | elapsed time per iteration (s): 0.16 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.706240E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.505 | TFLOPs: 24.86 | +7: iteration 78880/ 173500 | consumed samples: 20193280 | consumed tokens: 41355837440 | elapsed time per iteration (s): 0.16 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.714116E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.034 | TFLOPs: 24.45 | +7: iteration 78890/ 173500 | consumed samples: 20195840 | consumed tokens: 41361080320 | elapsed time per iteration (s): 0.15 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.710815E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.043 | TFLOPs: 26.17 | +7: iteration 78900/ 173500 | consumed samples: 20198400 | consumed tokens: 41366323200 | elapsed time per iteration (s): 0.16 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.715158E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.522 | TFLOPs: 25.12 | +7: iteration 78910/ 173500 | consumed samples: 20200960 | consumed tokens: 41371566080 | elapsed time per iteration (s): 0.16 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.721979E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.303 | TFLOPs: 25.87 | +7: iteration 78920/ 173500 | consumed samples: 20203520 | consumed tokens: 41376808960 | elapsed time per iteration (s): 0.17 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.711517E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.230 | TFLOPs: 23.21 | +7: iteration 78930/ 173500 | consumed samples: 20206080 | consumed tokens: 41382051840 | elapsed time per iteration (s): 0.18 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.704742E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1424.560 | TFLOPs: 22.34 | +7: iteration 78940/ 173500 | consumed samples: 20208640 | consumed tokens: 41387294720 | elapsed time per iteration (s): 0.16 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.707565E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.517 | TFLOPs: 25.19 | +7: iteration 78950/ 173500 | consumed samples: 20211200 | consumed tokens: 41392537600 | elapsed time per iteration (s): 0.16 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.717812E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.466 | TFLOPs: 25.85 | +7: iteration 78960/ 173500 | consumed samples: 20213760 | consumed tokens: 41397780480 | elapsed time per iteration (s): 0.16 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.712903E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.775 | TFLOPs: 24.92 | +7: iteration 78970/ 173500 | consumed samples: 20216320 | consumed tokens: 41403023360 | elapsed time per iteration (s): 0.16 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.720760E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.193 | TFLOPs: 24.92 | +7: iteration 78980/ 173500 | consumed samples: 20218880 | consumed tokens: 41408266240 | elapsed time per iteration (s): 0.17 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.716573E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.761 | TFLOPs: 23.36 | +7: iteration 78990/ 173500 | consumed samples: 20221440 | consumed tokens: 41413509120 | elapsed time per iteration (s): 0.16 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.717557E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.641 | TFLOPs: 25.38 | +7: iteration 79000/ 173500 | consumed samples: 20224000 | consumed tokens: 41418752000 | elapsed time per iteration (s): 0.16 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.708987E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.029 | TFLOPs: 25.01 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 79000 | lm loss value: 3.849571E+00 | lm loss PPL: 4.697288E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 79000 to checkpoints_44m91b100m +0: [2023-03-17 03:40:50,492] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step79000 is begin to save! +0: [2023-03-17 03:40:50,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:40:50,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:40:50,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:40:50,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:40:50,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:40:50,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:40:50,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:40:50,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:40:50,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:40:50,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:40:50,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:40:50,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:40:50,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:40:50,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:40:50,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:40:50,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:40:50,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:40:50,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:40:50,631] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:40:50,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:40:50,632] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step79000/mp_rank_00_model_states.pt +0: [2023-03-17 03:40:50,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:40:50,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:40:50,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:40:50,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:40:50,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +6: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:40:50,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +3: [2023-03-17 03:40:50,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +5: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +4: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +7: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +1: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:40:50,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step79000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:40:50,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +2: [2023-03-17 03:40:50,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step79000 is ready now! +0: successfully saved checkpoint at iteration 79000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.66 +7: iteration 79010/ 173500 | consumed samples: 20226560 | consumed tokens: 41423994880 | elapsed time per iteration (s): 0.18 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.710518E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1422.764 | TFLOPs: 22.31 | +7: iteration 79020/ 173500 | consumed samples: 20229120 | consumed tokens: 41429237760 | elapsed time per iteration (s): 0.15 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.729342E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.022 | TFLOPs: 26.25 | +7: iteration 79030/ 173500 | consumed samples: 20231680 | consumed tokens: 41434480640 | elapsed time per iteration (s): 0.16 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.714334E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.164 | TFLOPs: 25.31 | +7: iteration 79040/ 173500 | consumed samples: 20234240 | consumed tokens: 41439723520 | elapsed time per iteration (s): 0.16 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.699733E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.460 | TFLOPs: 24.96 | +7: iteration 79050/ 173500 | consumed samples: 20236800 | consumed tokens: 41444966400 | elapsed time per iteration (s): 0.16 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.721734E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.280 | TFLOPs: 24.58 | +7: iteration 79060/ 173500 | consumed samples: 20239360 | consumed tokens: 41450209280 | elapsed time per iteration (s): 0.15 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.715723E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.150 | TFLOPs: 26.07 | +7: iteration 79070/ 173500 | consumed samples: 20241920 | consumed tokens: 41455452160 | elapsed time per iteration (s): 0.15 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.717622E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.007 | TFLOPs: 25.92 | +7: iteration 79080/ 173500 | consumed samples: 20244480 | consumed tokens: 41460695040 | elapsed time per iteration (s): 0.16 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.704531E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.820 | TFLOPs: 24.93 | +7: iteration 79090/ 173500 | consumed samples: 20247040 | consumed tokens: 41465937920 | elapsed time per iteration (s): 0.16 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.713989E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.291 | TFLOPs: 25.05 | +7: iteration 79100/ 173500 | consumed samples: 20249600 | consumed tokens: 41471180800 | elapsed time per iteration (s): 0.16 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.705403E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.172 | TFLOPs: 25.47 | +7: iteration 79110/ 173500 | consumed samples: 20252160 | consumed tokens: 41476423680 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.721523E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.932 | TFLOPs: 25.56 | +7: iteration 79120/ 173500 | consumed samples: 20254720 | consumed tokens: 41481666560 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.715947E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.198 | TFLOPs: 24.89 | +7: iteration 79130/ 173500 | consumed samples: 20257280 | consumed tokens: 41486909440 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.721225E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.386 | TFLOPs: 25.22 | +7: iteration 79140/ 173500 | consumed samples: 20259840 | consumed tokens: 41492152320 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.722209E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.275 | TFLOPs: 24.78 | +7: iteration 79150/ 173500 | consumed samples: 20262400 | consumed tokens: 41497395200 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.704320E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.889 | TFLOPs: 24.79 | +7: iteration 79160/ 173500 | consumed samples: 20264960 | consumed tokens: 41502638080 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.705779E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.309 | TFLOPs: 25.08 | +7: iteration 79170/ 173500 | consumed samples: 20267520 | consumed tokens: 41507880960 | elapsed time per iteration (s): 0.16 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.710879E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.576 | TFLOPs: 25.52 | +7: iteration 79180/ 173500 | consumed samples: 20270080 | consumed tokens: 41513123840 | elapsed time per iteration (s): 0.15 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.714568E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.217 | TFLOPs: 26.04 | +7: iteration 79190/ 173500 | consumed samples: 20272640 | consumed tokens: 41518366720 | elapsed time per iteration (s): 0.15 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.721976E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.893 | TFLOPs: 25.95 | +7: iteration 79200/ 173500 | consumed samples: 20275200 | consumed tokens: 41523609600 | elapsed time per iteration (s): 0.16 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.718335E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.770 | TFLOPs: 24.82 | +7: iteration 79210/ 173500 | consumed samples: 20277760 | consumed tokens: 41528852480 | elapsed time per iteration (s): 0.16 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.720450E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.619 | TFLOPs: 24.65 | +7: iteration 79220/ 173500 | consumed samples: 20280320 | consumed tokens: 41534095360 | elapsed time per iteration (s): 0.16 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.712275E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.999 | TFLOPs: 25.52 | +7: iteration 79230/ 173500 | consumed samples: 20282880 | consumed tokens: 41539338240 | elapsed time per iteration (s): 0.16 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.717059E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.552 | TFLOPs: 25.18 | +7: iteration 79240/ 173500 | consumed samples: 20285440 | consumed tokens: 41544581120 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.692472E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.262 | TFLOPs: 24.64 | +7: iteration 79250/ 173500 | consumed samples: 20288000 | consumed tokens: 41549824000 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.718656E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.452 | TFLOPs: 25.16 | +7: iteration 79260/ 173500 | consumed samples: 20290560 | consumed tokens: 41555066880 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.711681E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.533 | TFLOPs: 25.34 | +7: iteration 79270/ 173500 | consumed samples: 20293120 | consumed tokens: 41560309760 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.710484E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.946 | TFLOPs: 25.67 | +7: iteration 79280/ 173500 | consumed samples: 20295680 | consumed tokens: 41565552640 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.706526E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.914 | TFLOPs: 25.61 | +7: iteration 79290/ 173500 | consumed samples: 20298240 | consumed tokens: 41570795520 | elapsed time per iteration (s): 0.16 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.712454E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.500 | TFLOPs: 24.99 | +7: iteration 79300/ 173500 | consumed samples: 20300800 | consumed tokens: 41576038400 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.718016E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.962 | TFLOPs: 25.53 | +7: iteration 79310/ 173500 | consumed samples: 20303360 | consumed tokens: 41581281280 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.721867E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.666 | TFLOPs: 24.88 | +7: iteration 79320/ 173500 | consumed samples: 20305920 | consumed tokens: 41586524160 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.709797E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.412 | TFLOPs: 25.90 | +7: iteration 79330/ 173500 | consumed samples: 20308480 | consumed tokens: 41591767040 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.710971E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.167 | TFLOPs: 24.50 | +7: iteration 79340/ 173500 | consumed samples: 20311040 | consumed tokens: 41597009920 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.719204E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.698 | TFLOPs: 25.04 | +7: iteration 79350/ 173500 | consumed samples: 20313600 | consumed tokens: 41602252800 | elapsed time per iteration (s): 0.16 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.702623E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.356 | TFLOPs: 25.52 | +7: iteration 79360/ 173500 | consumed samples: 20316160 | consumed tokens: 41607495680 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.719053E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.733 | TFLOPs: 25.87 | +7: iteration 79370/ 173500 | consumed samples: 20318720 | consumed tokens: 41612738560 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.713718E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.664 | TFLOPs: 25.54 | +7: iteration 79380/ 173500 | consumed samples: 20321280 | consumed tokens: 41617981440 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.710892E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.959 | TFLOPs: 25.61 | +7: iteration 79390/ 173500 | consumed samples: 20323840 | consumed tokens: 41623224320 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.715806E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.827 | TFLOPs: 24.96 | +7: iteration 79400/ 173500 | consumed samples: 20326400 | consumed tokens: 41628467200 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.710156E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.207 | TFLOPs: 25.36 | +7: iteration 79410/ 173500 | consumed samples: 20328960 | consumed tokens: 41633710080 | elapsed time per iteration (s): 0.16 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.712754E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.811 | TFLOPs: 25.23 | +7: iteration 79420/ 173500 | consumed samples: 20331520 | consumed tokens: 41638952960 | elapsed time per iteration (s): 0.16 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.715457E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.374 | TFLOPs: 25.73 | +7: iteration 79430/ 173500 | consumed samples: 20334080 | consumed tokens: 41644195840 | elapsed time per iteration (s): 0.15 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.708551E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.216 | TFLOPs: 26.16 | +7: iteration 79440/ 173500 | consumed samples: 20336640 | consumed tokens: 41649438720 | elapsed time per iteration (s): 0.17 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.723236E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.186 | TFLOPs: 24.31 | +7: iteration 79450/ 173500 | consumed samples: 20339200 | consumed tokens: 41654681600 | elapsed time per iteration (s): 0.16 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.705453E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.548 | TFLOPs: 25.32 | +7: iteration 79460/ 173500 | consumed samples: 20341760 | consumed tokens: 41659924480 | elapsed time per iteration (s): 0.16 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.706466E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.941 | TFLOPs: 24.68 | +7: iteration 79470/ 173500 | consumed samples: 20344320 | consumed tokens: 41665167360 | elapsed time per iteration (s): 0.18 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.711950E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.961 | TFLOPs: 22.79 | +7: iteration 79480/ 173500 | consumed samples: 20346880 | consumed tokens: 41670410240 | elapsed time per iteration (s): 0.15 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.725388E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.075 | TFLOPs: 26.16 | +7: iteration 79490/ 173500 | consumed samples: 20349440 | consumed tokens: 41675653120 | elapsed time per iteration (s): 0.16 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.709036E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.535 | TFLOPs: 25.51 | +7: iteration 79500/ 173500 | consumed samples: 20352000 | consumed tokens: 41680896000 | elapsed time per iteration (s): 0.15 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.713435E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.020 | TFLOPs: 25.95 | +7: iteration 79510/ 173500 | consumed samples: 20354560 | consumed tokens: 41686138880 | elapsed time per iteration (s): 0.16 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.703011E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.999 | TFLOPs: 25.37 | +7: iteration 79520/ 173500 | consumed samples: 20357120 | consumed tokens: 41691381760 | elapsed time per iteration (s): 0.16 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.716365E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.080 | TFLOPs: 24.94 | +7: iteration 79530/ 173500 | consumed samples: 20359680 | consumed tokens: 41696624640 | elapsed time per iteration (s): 0.16 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.708608E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.339 | TFLOPs: 25.90 | +7: iteration 79540/ 173500 | consumed samples: 20362240 | consumed tokens: 41701867520 | elapsed time per iteration (s): 0.17 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.705738E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.795 | TFLOPs: 23.49 | +7: iteration 79550/ 173500 | consumed samples: 20364800 | consumed tokens: 41707110400 | elapsed time per iteration (s): 0.16 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.720253E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.866 | TFLOPs: 25.48 | +7: iteration 79560/ 173500 | consumed samples: 20367360 | consumed tokens: 41712353280 | elapsed time per iteration (s): 0.16 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.698246E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.085 | TFLOPs: 24.39 | +7: iteration 79570/ 173500 | consumed samples: 20369920 | consumed tokens: 41717596160 | elapsed time per iteration (s): 0.16 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.721513E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.957 | TFLOPs: 25.40 | +7: iteration 79580/ 173500 | consumed samples: 20372480 | consumed tokens: 41722839040 | elapsed time per iteration (s): 0.16 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.720615E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.385 | TFLOPs: 25.35 | +7: iteration 79590/ 173500 | consumed samples: 20375040 | consumed tokens: 41728081920 | elapsed time per iteration (s): 0.17 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.720062E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.778 | TFLOPs: 24.23 | +7: iteration 79600/ 173500 | consumed samples: 20377600 | consumed tokens: 41733324800 | elapsed time per iteration (s): 0.16 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.723495E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.966 | TFLOPs: 25.58 | +7: iteration 79610/ 173500 | consumed samples: 20380160 | consumed tokens: 41738567680 | elapsed time per iteration (s): 0.15 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.707125E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.934 | TFLOPs: 26.28 | +7: iteration 79620/ 173500 | consumed samples: 20382720 | consumed tokens: 41743810560 | elapsed time per iteration (s): 0.15 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.714627E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.874 | TFLOPs: 25.95 | +7: iteration 79630/ 173500 | consumed samples: 20385280 | consumed tokens: 41749053440 | elapsed time per iteration (s): 0.16 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.712390E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.628 | TFLOPs: 25.42 | +7: iteration 79640/ 173500 | consumed samples: 20387840 | consumed tokens: 41754296320 | elapsed time per iteration (s): 0.16 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.711566E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.947 | TFLOPs: 25.50 | +7: iteration 79650/ 173500 | consumed samples: 20390400 | consumed tokens: 41759539200 | elapsed time per iteration (s): 0.16 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.702272E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.566 | TFLOPs: 25.23 | +7: iteration 79660/ 173500 | consumed samples: 20392960 | consumed tokens: 41764782080 | elapsed time per iteration (s): 0.16 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.712831E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.097 | TFLOPs: 25.38 | +7: iteration 79670/ 173500 | consumed samples: 20395520 | consumed tokens: 41770024960 | elapsed time per iteration (s): 0.16 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.696710E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.203 | TFLOPs: 25.03 | +7: iteration 79680/ 173500 | consumed samples: 20398080 | consumed tokens: 41775267840 | elapsed time per iteration (s): 0.16 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.705721E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.874 | TFLOPs: 25.58 | +7: iteration 79690/ 173500 | consumed samples: 20400640 | consumed tokens: 41780510720 | elapsed time per iteration (s): 0.16 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.730000E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.699 | TFLOPs: 25.42 | +7: iteration 79700/ 173500 | consumed samples: 20403200 | consumed tokens: 41785753600 | elapsed time per iteration (s): 0.16 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.702147E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.462 | TFLOPs: 25.33 | +7: iteration 79710/ 173500 | consumed samples: 20405760 | consumed tokens: 41790996480 | elapsed time per iteration (s): 0.16 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.721664E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.195 | TFLOPs: 25.58 | +7: iteration 79720/ 173500 | consumed samples: 20408320 | consumed tokens: 41796239360 | elapsed time per iteration (s): 0.15 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.713356E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.992 | TFLOPs: 25.95 | +7: iteration 79730/ 173500 | consumed samples: 20410880 | consumed tokens: 41801482240 | elapsed time per iteration (s): 0.16 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.728435E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.067 | TFLOPs: 24.34 | +7: iteration 79740/ 173500 | consumed samples: 20413440 | consumed tokens: 41806725120 | elapsed time per iteration (s): 0.16 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.719519E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.690 | TFLOPs: 25.32 | +7: iteration 79750/ 173500 | consumed samples: 20416000 | consumed tokens: 41811968000 | elapsed time per iteration (s): 0.16 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.719462E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.629 | TFLOPs: 25.10 | +7: iteration 79760/ 173500 | consumed samples: 20418560 | consumed tokens: 41817210880 | elapsed time per iteration (s): 0.15 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.708187E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.004 | TFLOPs: 26.30 | +7: iteration 79770/ 173500 | consumed samples: 20421120 | consumed tokens: 41822453760 | elapsed time per iteration (s): 0.16 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.723743E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.165 | TFLOPs: 25.66 | +7: iteration 79780/ 173500 | consumed samples: 20423680 | consumed tokens: 41827696640 | elapsed time per iteration (s): 0.16 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.701908E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.934 | TFLOPs: 25.01 | +7: iteration 79790/ 173500 | consumed samples: 20426240 | consumed tokens: 41832939520 | elapsed time per iteration (s): 0.15 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.697113E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.417 | TFLOPs: 26.27 | +7: iteration 79800/ 173500 | consumed samples: 20428800 | consumed tokens: 41838182400 | elapsed time per iteration (s): 0.15 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.708807E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.210 | TFLOPs: 26.26 | +7: iteration 79810/ 173500 | consumed samples: 20431360 | consumed tokens: 41843425280 | elapsed time per iteration (s): 0.16 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.724630E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.164 | TFLOPs: 25.89 | +7: iteration 79820/ 173500 | consumed samples: 20433920 | consumed tokens: 41848668160 | elapsed time per iteration (s): 0.16 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.709740E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.602 | TFLOPs: 25.51 | +7: iteration 79830/ 173500 | consumed samples: 20436480 | consumed tokens: 41853911040 | elapsed time per iteration (s): 0.15 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.711283E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.375 | TFLOPs: 26.26 | +7: iteration 79840/ 173500 | consumed samples: 20439040 | consumed tokens: 41859153920 | elapsed time per iteration (s): 0.16 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.718679E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.289 | TFLOPs: 25.88 | +7: iteration 79850/ 173500 | consumed samples: 20441600 | consumed tokens: 41864396800 | elapsed time per iteration (s): 0.16 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.704283E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.730 | TFLOPs: 25.72 | +7: iteration 79860/ 173500 | consumed samples: 20444160 | consumed tokens: 41869639680 | elapsed time per iteration (s): 0.16 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.698268E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.296 | TFLOPs: 25.87 | +7: iteration 79870/ 173500 | consumed samples: 20446720 | consumed tokens: 41874882560 | elapsed time per iteration (s): 0.15 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.707461E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.202 | TFLOPs: 26.26 | +7: iteration 79880/ 173500 | consumed samples: 20449280 | consumed tokens: 41880125440 | elapsed time per iteration (s): 0.16 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.708878E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.180 | TFLOPs: 25.85 | +7: iteration 79890/ 173500 | consumed samples: 20451840 | consumed tokens: 41885368320 | elapsed time per iteration (s): 0.16 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.705562E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.520 | TFLOPs: 25.16 | +7: iteration 79900/ 173500 | consumed samples: 20454400 | consumed tokens: 41890611200 | elapsed time per iteration (s): 0.16 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.710423E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.986 | TFLOPs: 24.53 | +7: iteration 79910/ 173500 | consumed samples: 20456960 | consumed tokens: 41895854080 | elapsed time per iteration (s): 0.16 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.714520E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.966 | TFLOPs: 25.20 | +7: iteration 79920/ 173500 | consumed samples: 20459520 | consumed tokens: 41901096960 | elapsed time per iteration (s): 0.17 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.717960E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1493.874 | TFLOPs: 23.43 | +7: iteration 79930/ 173500 | consumed samples: 20462080 | consumed tokens: 41906339840 | elapsed time per iteration (s): 0.16 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.705485E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.080 | TFLOPs: 25.74 | +7: iteration 79940/ 173500 | consumed samples: 20464640 | consumed tokens: 41911582720 | elapsed time per iteration (s): 0.16 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.714185E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.801 | TFLOPs: 25.50 | +7: iteration 79950/ 173500 | consumed samples: 20467200 | consumed tokens: 41916825600 | elapsed time per iteration (s): 0.16 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.714118E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.651 | TFLOPs: 25.51 | +7: iteration 79960/ 173500 | consumed samples: 20469760 | consumed tokens: 41922068480 | elapsed time per iteration (s): 0.16 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.699133E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.117 | TFLOPs: 25.80 | +7: iteration 79970/ 173500 | consumed samples: 20472320 | consumed tokens: 41927311360 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.720181E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.317 | TFLOPs: 24.56 | +7: iteration 79980/ 173500 | consumed samples: 20474880 | consumed tokens: 41932554240 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.717355E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.600 | TFLOPs: 24.71 | +7: iteration 79990/ 173500 | consumed samples: 20477440 | consumed tokens: 41937797120 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.714381E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.749 | TFLOPs: 25.87 | +0: [2023-03-17 03:43:29,217] [INFO] [logging.py:68:log_dist] [Rank 0] step=80000, skipped=0, lr=[0.00012249910047811783, 0.00012249910047811783, 0.00012249910047811783], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 80000/ 173500 | consumed samples: 20480000 | consumed tokens: 41943040000 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.707005E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.169 | TFLOPs: 25.41 | +0: steps: 80000 loss: 3.6853 iter time (s): 0.158 samples/sec: 1624.560 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 80000 | lm loss value: 3.835573E+00 | lm loss PPL: 4.631995E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 80000 to checkpoints_44m91b100m +0: [2023-03-17 03:43:29,290] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step80000 is begin to save! +0: [2023-03-17 03:43:29,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:43:29,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:43:29,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:43:29,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:43:29,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:43:29,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:43:29,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:43:29,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:43:29,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:43:29,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:43:29,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:43:29,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:43:29,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:43:29,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:43:29,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:43:29,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:43:29,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:43:29,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:43:29,422] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:43:29,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:43:29,424] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step80000/mp_rank_00_model_states.pt +0: [2023-03-17 03:43:29,424] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:43:29,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:43:29,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:43:29,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +3: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +5: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +6: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +2: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +1: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +7: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 03:43:29,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step80000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +4: [2023-03-17 03:43:29,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step80000 is ready now! +0: successfully saved checkpoint at iteration 80000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.89 +7: iteration 80010/ 173500 | consumed samples: 20482560 | consumed tokens: 41948282880 | elapsed time per iteration (s): 0.18 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.725296E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1402.511 | TFLOPs: 21.99 | +7: iteration 80020/ 173500 | consumed samples: 20485120 | consumed tokens: 41953525760 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.717335E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.515 | TFLOPs: 25.52 | +7: iteration 80030/ 173500 | consumed samples: 20487680 | consumed tokens: 41958768640 | elapsed time per iteration (s): 0.16 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.705802E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.932 | TFLOPs: 24.98 | +7: iteration 80040/ 173500 | consumed samples: 20490240 | consumed tokens: 41964011520 | elapsed time per iteration (s): 0.16 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.722145E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.360 | TFLOPs: 24.56 | +7: iteration 80050/ 173500 | consumed samples: 20492800 | consumed tokens: 41969254400 | elapsed time per iteration (s): 0.16 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.720037E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.401 | TFLOPs: 25.68 | +7: iteration 80060/ 173500 | consumed samples: 20495360 | consumed tokens: 41974497280 | elapsed time per iteration (s): 0.16 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.721202E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.762 | TFLOPs: 25.54 | +7: iteration 80070/ 173500 | consumed samples: 20497920 | consumed tokens: 41979740160 | elapsed time per iteration (s): 0.15 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.710139E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.200 | TFLOPs: 25.94 | +7: iteration 80080/ 173500 | consumed samples: 20500480 | consumed tokens: 41984983040 | elapsed time per iteration (s): 0.16 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.714834E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.468 | TFLOPs: 25.54 | +7: iteration 80090/ 173500 | consumed samples: 20503040 | consumed tokens: 41990225920 | elapsed time per iteration (s): 0.16 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.715060E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.240 | TFLOPs: 25.00 | +7: iteration 80100/ 173500 | consumed samples: 20505600 | consumed tokens: 41995468800 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.715912E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.095 | TFLOPs: 24.89 | +7: iteration 80110/ 173500 | consumed samples: 20508160 | consumed tokens: 42000711680 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.714212E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.543 | TFLOPs: 25.16 | +7: iteration 80120/ 173500 | consumed samples: 20510720 | consumed tokens: 42005954560 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.708561E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.109 | TFLOPs: 25.75 | +7: iteration 80130/ 173500 | consumed samples: 20513280 | consumed tokens: 42011197440 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.709089E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.485 | TFLOPs: 25.65 | +7: iteration 80140/ 173500 | consumed samples: 20515840 | consumed tokens: 42016440320 | elapsed time per iteration (s): 0.16 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.690823E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.302 | TFLOPs: 25.14 | +7: iteration 80150/ 173500 | consumed samples: 20518400 | consumed tokens: 42021683200 | elapsed time per iteration (s): 0.15 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.708968E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.545 | TFLOPs: 25.95 | +7: iteration 80160/ 173500 | consumed samples: 20520960 | consumed tokens: 42026926080 | elapsed time per iteration (s): 0.16 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.709695E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.573 | TFLOPs: 25.35 | +7: iteration 80170/ 173500 | consumed samples: 20523520 | consumed tokens: 42032168960 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.711193E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.127 | TFLOPs: 26.30 | +7: iteration 80180/ 173500 | consumed samples: 20526080 | consumed tokens: 42037411840 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.707637E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.470 | TFLOPs: 26.17 | +7: iteration 80190/ 173500 | consumed samples: 20528640 | consumed tokens: 42042654720 | elapsed time per iteration (s): 0.16 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.707967E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.384 | TFLOPs: 24.60 | +7: iteration 80200/ 173500 | consumed samples: 20531200 | consumed tokens: 42047897600 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.711262E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.345 | TFLOPs: 25.98 | +7: iteration 80210/ 173500 | consumed samples: 20533760 | consumed tokens: 42053140480 | elapsed time per iteration (s): 0.15 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.717138E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.651 | TFLOPs: 26.06 | +7: iteration 80220/ 173500 | consumed samples: 20536320 | consumed tokens: 42058383360 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.712915E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.621 | TFLOPs: 25.57 | +7: iteration 80230/ 173500 | consumed samples: 20538880 | consumed tokens: 42063626240 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.717953E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.581 | TFLOPs: 25.12 | +7: iteration 80240/ 173500 | consumed samples: 20541440 | consumed tokens: 42068869120 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.706944E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.703 | TFLOPs: 25.62 | +7: iteration 80250/ 173500 | consumed samples: 20544000 | consumed tokens: 42074112000 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.705847E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.020 | TFLOPs: 24.87 | +7: iteration 80260/ 173500 | consumed samples: 20546560 | consumed tokens: 42079354880 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.713961E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.461 | TFLOPs: 24.71 | +7: iteration 80270/ 173500 | consumed samples: 20549120 | consumed tokens: 42084597760 | elapsed time per iteration (s): 0.16 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.686477E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.503 | TFLOPs: 25.71 | +7: iteration 80280/ 173500 | consumed samples: 20551680 | consumed tokens: 42089840640 | elapsed time per iteration (s): 0.16 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.713216E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.785 | TFLOPs: 25.87 | +7: iteration 80290/ 173500 | consumed samples: 20554240 | consumed tokens: 42095083520 | elapsed time per iteration (s): 0.16 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.721215E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.896 | TFLOPs: 25.65 | +7: iteration 80300/ 173500 | consumed samples: 20556800 | consumed tokens: 42100326400 | elapsed time per iteration (s): 0.15 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.700073E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.352 | TFLOPs: 26.31 | +7: iteration 80310/ 173500 | consumed samples: 20559360 | consumed tokens: 42105569280 | elapsed time per iteration (s): 0.16 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.716965E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.737 | TFLOPs: 25.28 | +7: iteration 80320/ 173500 | consumed samples: 20561920 | consumed tokens: 42110812160 | elapsed time per iteration (s): 0.16 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.716143E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.471 | TFLOPs: 25.49 | +7: iteration 80330/ 173500 | consumed samples: 20564480 | consumed tokens: 42116055040 | elapsed time per iteration (s): 0.16 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.708133E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.278 | TFLOPs: 25.61 | +7: iteration 80340/ 173500 | consumed samples: 20567040 | consumed tokens: 42121297920 | elapsed time per iteration (s): 0.16 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.713951E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.290 | TFLOPs: 25.74 | +7: iteration 80350/ 173500 | consumed samples: 20569600 | consumed tokens: 42126540800 | elapsed time per iteration (s): 0.15 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.717486E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.720 | TFLOPs: 26.26 | +7: iteration 80360/ 173500 | consumed samples: 20572160 | consumed tokens: 42131783680 | elapsed time per iteration (s): 0.16 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.716952E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.142 | TFLOPs: 25.28 | +7: iteration 80370/ 173500 | consumed samples: 20574720 | consumed tokens: 42137026560 | elapsed time per iteration (s): 0.16 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.706897E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.383 | TFLOPs: 25.63 | +7: iteration 80380/ 173500 | consumed samples: 20577280 | consumed tokens: 42142269440 | elapsed time per iteration (s): 0.16 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.714169E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.390 | TFLOPs: 24.91 | +7: iteration 80390/ 173500 | consumed samples: 20579840 | consumed tokens: 42147512320 | elapsed time per iteration (s): 0.15 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.708017E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.115 | TFLOPs: 26.27 | +7: iteration 80400/ 173500 | consumed samples: 20582400 | consumed tokens: 42152755200 | elapsed time per iteration (s): 0.16 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.714397E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.040 | TFLOPs: 25.39 | +7: iteration 80410/ 173500 | consumed samples: 20584960 | consumed tokens: 42157998080 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.708297E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.531 | TFLOPs: 26.31 | +7: iteration 80420/ 173500 | consumed samples: 20587520 | consumed tokens: 42163240960 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.715243E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.825 | TFLOPs: 25.98 | +7: iteration 80430/ 173500 | consumed samples: 20590080 | consumed tokens: 42168483840 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.704738E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.206 | TFLOPs: 26.29 | +7: iteration 80440/ 173500 | consumed samples: 20592640 | consumed tokens: 42173726720 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.712145E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.856 | TFLOPs: 26.28 | +7: iteration 80450/ 173500 | consumed samples: 20595200 | consumed tokens: 42178969600 | elapsed time per iteration (s): 0.15 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.705389E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.181 | TFLOPs: 26.05 | +7: iteration 80460/ 173500 | consumed samples: 20597760 | consumed tokens: 42184212480 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.713292E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.797 | TFLOPs: 25.90 | +7: iteration 80470/ 173500 | consumed samples: 20600320 | consumed tokens: 42189455360 | elapsed time per iteration (s): 0.16 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.710051E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.369 | TFLOPs: 25.60 | +7: iteration 80480/ 173500 | consumed samples: 20602880 | consumed tokens: 42194698240 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.714532E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.471 | TFLOPs: 26.01 | +7: iteration 80490/ 173500 | consumed samples: 20605440 | consumed tokens: 42199941120 | elapsed time per iteration (s): 0.16 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.708631E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.036 | TFLOPs: 25.22 | +7: iteration 80500/ 173500 | consumed samples: 20608000 | consumed tokens: 42205184000 | elapsed time per iteration (s): 0.16 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.723898E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.993 | TFLOPs: 25.36 | +7: iteration 80510/ 173500 | consumed samples: 20610560 | consumed tokens: 42210426880 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.712004E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.190 | TFLOPs: 25.93 | +7: iteration 80520/ 173500 | consumed samples: 20613120 | consumed tokens: 42215669760 | elapsed time per iteration (s): 0.15 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.715541E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.436 | TFLOPs: 26.20 | +7: iteration 80530/ 173500 | consumed samples: 20615680 | consumed tokens: 42220912640 | elapsed time per iteration (s): 0.15 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.718078E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.018 | TFLOPs: 25.97 | +7: iteration 80540/ 173500 | consumed samples: 20618240 | consumed tokens: 42226155520 | elapsed time per iteration (s): 0.16 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.711997E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.573 | TFLOPs: 24.82 | +7: iteration 80550/ 173500 | consumed samples: 20620800 | consumed tokens: 42231398400 | elapsed time per iteration (s): 0.15 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.713522E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.785 | TFLOPs: 26.03 | +7: iteration 80560/ 173500 | consumed samples: 20623360 | consumed tokens: 42236641280 | elapsed time per iteration (s): 0.16 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.704233E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.271 | TFLOPs: 25.60 | +7: iteration 80570/ 173500 | consumed samples: 20625920 | consumed tokens: 42241884160 | elapsed time per iteration (s): 0.15 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.720134E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.881 | TFLOPs: 26.09 | +7: iteration 80580/ 173500 | consumed samples: 20628480 | consumed tokens: 42247127040 | elapsed time per iteration (s): 0.16 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.714484E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.681 | TFLOPs: 25.82 | +7: iteration 80590/ 173500 | consumed samples: 20631040 | consumed tokens: 42252369920 | elapsed time per iteration (s): 0.15 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.708403E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.612 | TFLOPs: 26.18 | +7: iteration 80600/ 173500 | consumed samples: 20633600 | consumed tokens: 42257612800 | elapsed time per iteration (s): 0.16 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.695953E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.082 | TFLOPs: 25.55 | +7: iteration 80610/ 173500 | consumed samples: 20636160 | consumed tokens: 42262855680 | elapsed time per iteration (s): 0.16 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.719239E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.221 | TFLOPs: 25.66 | +7: iteration 80620/ 173500 | consumed samples: 20638720 | consumed tokens: 42268098560 | elapsed time per iteration (s): 0.16 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.711672E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.118 | TFLOPs: 25.66 | +7: iteration 80630/ 173500 | consumed samples: 20641280 | consumed tokens: 42273341440 | elapsed time per iteration (s): 0.16 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.695402E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.329 | TFLOPs: 25.72 | +7: iteration 80640/ 173500 | consumed samples: 20643840 | consumed tokens: 42278584320 | elapsed time per iteration (s): 0.16 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.703632E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.681 | TFLOPs: 25.40 | +7: iteration 80650/ 173500 | consumed samples: 20646400 | consumed tokens: 42283827200 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.700269E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.216 | TFLOPs: 25.72 | +7: iteration 80660/ 173500 | consumed samples: 20648960 | consumed tokens: 42289070080 | elapsed time per iteration (s): 0.15 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.722004E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.669 | TFLOPs: 25.98 | +7: iteration 80670/ 173500 | consumed samples: 20651520 | consumed tokens: 42294312960 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.713096E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.472 | TFLOPs: 25.48 | +7: iteration 80680/ 173500 | consumed samples: 20654080 | consumed tokens: 42299555840 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.713848E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.015 | TFLOPs: 24.95 | +7: iteration 80690/ 173500 | consumed samples: 20656640 | consumed tokens: 42304798720 | elapsed time per iteration (s): 0.15 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.709016E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.998 | TFLOPs: 25.95 | +7: iteration 80700/ 173500 | consumed samples: 20659200 | consumed tokens: 42310041600 | elapsed time per iteration (s): 0.16 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.707809E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.931 | TFLOPs: 24.62 | +7: iteration 80710/ 173500 | consumed samples: 20661760 | consumed tokens: 42315284480 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.725543E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.459 | TFLOPs: 25.79 | +7: iteration 80720/ 173500 | consumed samples: 20664320 | consumed tokens: 42320527360 | elapsed time per iteration (s): 0.15 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.710578E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.080 | TFLOPs: 26.32 | +7: iteration 80730/ 173500 | consumed samples: 20666880 | consumed tokens: 42325770240 | elapsed time per iteration (s): 0.15 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.703083E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.325 | TFLOPs: 26.21 | +7: iteration 80740/ 173500 | consumed samples: 20669440 | consumed tokens: 42331013120 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.718854E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.499 | TFLOPs: 25.62 | +7: iteration 80750/ 173500 | consumed samples: 20672000 | consumed tokens: 42336256000 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.707527E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.930 | TFLOPs: 25.37 | +7: iteration 80760/ 173500 | consumed samples: 20674560 | consumed tokens: 42341498880 | elapsed time per iteration (s): 0.16 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.711620E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.117 | TFLOPs: 25.82 | +7: iteration 80770/ 173500 | consumed samples: 20677120 | consumed tokens: 42346741760 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.709762E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.371 | TFLOPs: 25.58 | +7: iteration 80780/ 173500 | consumed samples: 20679680 | consumed tokens: 42351984640 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.708311E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.528 | TFLOPs: 25.81 | +7: iteration 80790/ 173500 | consumed samples: 20682240 | consumed tokens: 42357227520 | elapsed time per iteration (s): 0.18 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.699894E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1453.105 | TFLOPs: 22.79 | +7: iteration 80800/ 173500 | consumed samples: 20684800 | consumed tokens: 42362470400 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.706636E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.640 | TFLOPs: 25.45 | +7: iteration 80810/ 173500 | consumed samples: 20687360 | consumed tokens: 42367713280 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.714077E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.214 | TFLOPs: 25.82 | +7: iteration 80820/ 173500 | consumed samples: 20689920 | consumed tokens: 42372956160 | elapsed time per iteration (s): 0.16 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.702055E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.031 | TFLOPs: 25.88 | +7: iteration 80830/ 173500 | consumed samples: 20692480 | consumed tokens: 42378199040 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.714683E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.184 | TFLOPs: 25.83 | +7: iteration 80840/ 173500 | consumed samples: 20695040 | consumed tokens: 42383441920 | elapsed time per iteration (s): 0.17 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.705955E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.782 | TFLOPs: 24.02 | +7: iteration 80850/ 173500 | consumed samples: 20697600 | consumed tokens: 42388684800 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.713298E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.344 | TFLOPs: 25.47 | +7: iteration 80860/ 173500 | consumed samples: 20700160 | consumed tokens: 42393927680 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.713377E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.490 | TFLOPs: 25.81 | +7: iteration 80870/ 173500 | consumed samples: 20702720 | consumed tokens: 42399170560 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.702261E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.042 | TFLOPs: 25.20 | +7: iteration 80880/ 173500 | consumed samples: 20705280 | consumed tokens: 42404413440 | elapsed time per iteration (s): 0.16 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.699083E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.987 | TFLOPs: 25.80 | +7: iteration 80890/ 173500 | consumed samples: 20707840 | consumed tokens: 42409656320 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.701926E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.987 | TFLOPs: 25.12 | +7: iteration 80900/ 173500 | consumed samples: 20710400 | consumed tokens: 42414899200 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.707827E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.003 | TFLOPs: 25.03 | +7: iteration 80910/ 173500 | consumed samples: 20712960 | consumed tokens: 42420142080 | elapsed time per iteration (s): 0.16 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.704588E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.618 | TFLOPs: 25.62 | +7: iteration 80920/ 173500 | consumed samples: 20715520 | consumed tokens: 42425384960 | elapsed time per iteration (s): 0.15 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.708596E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.256 | TFLOPs: 26.32 | +7: iteration 80930/ 173500 | consumed samples: 20718080 | consumed tokens: 42430627840 | elapsed time per iteration (s): 0.15 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.714396E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.028 | TFLOPs: 25.97 | +7: iteration 80940/ 173500 | consumed samples: 20720640 | consumed tokens: 42435870720 | elapsed time per iteration (s): 0.15 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.729879E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.447 | TFLOPs: 26.31 | +7: iteration 80950/ 173500 | consumed samples: 20723200 | consumed tokens: 42441113600 | elapsed time per iteration (s): 0.16 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.711400E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.022 | TFLOPs: 25.22 | +7: iteration 80960/ 173500 | consumed samples: 20725760 | consumed tokens: 42446356480 | elapsed time per iteration (s): 0.16 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.711101E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.500 | TFLOPs: 24.90 | +7: iteration 80970/ 173500 | consumed samples: 20728320 | consumed tokens: 42451599360 | elapsed time per iteration (s): 0.15 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.702017E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.640 | TFLOPs: 26.07 | +7: iteration 80980/ 173500 | consumed samples: 20730880 | consumed tokens: 42456842240 | elapsed time per iteration (s): 0.15 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.712899E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.267 | TFLOPs: 25.99 | +7: iteration 80990/ 173500 | consumed samples: 20733440 | consumed tokens: 42462085120 | elapsed time per iteration (s): 0.15 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.709526E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.681 | TFLOPs: 26.29 | +7: iteration 81000/ 173500 | consumed samples: 20736000 | consumed tokens: 42467328000 | elapsed time per iteration (s): 0.15 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.718485E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.663 | TFLOPs: 25.93 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 81000 | lm loss value: 3.840738E+00 | lm loss PPL: 4.655983E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 81000 to checkpoints_44m91b100m +0: [2023-03-17 03:46:06,407] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step81000 is begin to save! +0: [2023-03-17 03:46:06,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:46:06,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:46:06,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:46:06,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:46:06,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:46:06,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:46:06,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:46:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:46:06,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:46:06,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:46:06,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:46:06,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:46:06,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:46:06,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:46:06,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:46:06,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:46:06,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:46:06,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:46:06,544] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:46:06,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:46:06,545] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step81000/mp_rank_00_model_states.pt +0: [2023-03-17 03:46:06,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:46:06,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:46:06,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:46:06,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +5: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +6: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +1: [2023-03-17 03:46:06,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +4: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:46:06,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:46:06,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +3: [2023-03-17 03:46:06,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +7: [2023-03-17 03:46:06,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:46:06,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:46:06,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:46:06,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step81000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:46:06,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step81000 is ready now! +0: successfully saved checkpoint at iteration 81000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 201.12 +7: iteration 81010/ 173500 | consumed samples: 20738560 | consumed tokens: 42472570880 | elapsed time per iteration (s): 0.18 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.716248E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1422.481 | TFLOPs: 22.31 | +7: iteration 81020/ 173500 | consumed samples: 20741120 | consumed tokens: 42477813760 | elapsed time per iteration (s): 0.16 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.704604E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.332 | TFLOPs: 25.82 | +7: iteration 81030/ 173500 | consumed samples: 20743680 | consumed tokens: 42483056640 | elapsed time per iteration (s): 0.15 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.701991E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.920 | TFLOPs: 26.11 | +7: iteration 81040/ 173500 | consumed samples: 20746240 | consumed tokens: 42488299520 | elapsed time per iteration (s): 0.15 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.728970E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.934 | TFLOPs: 26.11 | +7: iteration 81050/ 173500 | consumed samples: 20748800 | consumed tokens: 42493542400 | elapsed time per iteration (s): 0.16 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.702244E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.729 | TFLOPs: 25.81 | +7: iteration 81060/ 173500 | consumed samples: 20751360 | consumed tokens: 42498785280 | elapsed time per iteration (s): 0.16 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.717015E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.788 | TFLOPs: 25.50 | +7: iteration 81070/ 173500 | consumed samples: 20753920 | consumed tokens: 42504028160 | elapsed time per iteration (s): 0.17 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.707628E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.035 | TFLOPs: 23.78 | +7: iteration 81080/ 173500 | consumed samples: 20756480 | consumed tokens: 42509271040 | elapsed time per iteration (s): 0.15 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.720774E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.659 | TFLOPs: 26.28 | +7: iteration 81090/ 173500 | consumed samples: 20759040 | consumed tokens: 42514513920 | elapsed time per iteration (s): 0.15 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.701017E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.419 | TFLOPs: 26.27 | +7: iteration 81100/ 173500 | consumed samples: 20761600 | consumed tokens: 42519756800 | elapsed time per iteration (s): 0.16 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.693459E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.156 | TFLOPs: 25.72 | +7: iteration 81110/ 173500 | consumed samples: 20764160 | consumed tokens: 42524999680 | elapsed time per iteration (s): 0.15 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.705943E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.703 | TFLOPs: 26.09 | +7: iteration 81120/ 173500 | consumed samples: 20766720 | consumed tokens: 42530242560 | elapsed time per iteration (s): 0.16 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.703792E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.442 | TFLOPs: 25.65 | +7: iteration 81130/ 173500 | consumed samples: 20769280 | consumed tokens: 42535485440 | elapsed time per iteration (s): 0.16 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.690296E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.961 | TFLOPs: 25.31 | +7: iteration 81140/ 173500 | consumed samples: 20771840 | consumed tokens: 42540728320 | elapsed time per iteration (s): 0.16 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.722560E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.231 | TFLOPs: 25.46 | +7: iteration 81150/ 173500 | consumed samples: 20774400 | consumed tokens: 42545971200 | elapsed time per iteration (s): 0.16 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.710854E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.159 | TFLOPs: 25.36 | +7: iteration 81160/ 173500 | consumed samples: 20776960 | consumed tokens: 42551214080 | elapsed time per iteration (s): 0.15 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.709990E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.786 | TFLOPs: 26.26 | +7: iteration 81170/ 173500 | consumed samples: 20779520 | consumed tokens: 42556456960 | elapsed time per iteration (s): 0.15 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.703561E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.593 | TFLOPs: 26.10 | +7: iteration 81180/ 173500 | consumed samples: 20782080 | consumed tokens: 42561699840 | elapsed time per iteration (s): 0.16 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.710991E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.096 | TFLOPs: 25.83 | +7: iteration 81190/ 173500 | consumed samples: 20784640 | consumed tokens: 42566942720 | elapsed time per iteration (s): 0.15 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.714485E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.058 | TFLOPs: 26.28 | +7: iteration 81200/ 173500 | consumed samples: 20787200 | consumed tokens: 42572185600 | elapsed time per iteration (s): 0.16 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.704290E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.538 | TFLOPs: 25.41 | +7: iteration 81210/ 173500 | consumed samples: 20789760 | consumed tokens: 42577428480 | elapsed time per iteration (s): 0.16 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.705663E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.392 | TFLOPs: 25.90 | +7: iteration 81220/ 173500 | consumed samples: 20792320 | consumed tokens: 42582671360 | elapsed time per iteration (s): 0.15 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.727244E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.091 | TFLOPs: 26.32 | +7: iteration 81230/ 173500 | consumed samples: 20794880 | consumed tokens: 42587914240 | elapsed time per iteration (s): 0.15 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.725414E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.253 | TFLOPs: 26.13 | +7: iteration 81240/ 173500 | consumed samples: 20797440 | consumed tokens: 42593157120 | elapsed time per iteration (s): 0.15 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.714253E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.629 | TFLOPs: 26.23 | +7: iteration 81250/ 173500 | consumed samples: 20800000 | consumed tokens: 42598400000 | elapsed time per iteration (s): 0.16 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.723764E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.667 | TFLOPs: 25.46 | +7: iteration 81260/ 173500 | consumed samples: 20802560 | consumed tokens: 42603642880 | elapsed time per iteration (s): 0.16 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.704402E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.702 | TFLOPs: 25.70 | +7: iteration 81270/ 173500 | consumed samples: 20805120 | consumed tokens: 42608885760 | elapsed time per iteration (s): 0.16 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.712884E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.792 | TFLOPs: 25.72 | +7: iteration 81280/ 173500 | consumed samples: 20807680 | consumed tokens: 42614128640 | elapsed time per iteration (s): 0.15 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.707457E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.448 | TFLOPs: 26.04 | +7: iteration 81290/ 173500 | consumed samples: 20810240 | consumed tokens: 42619371520 | elapsed time per iteration (s): 0.16 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.712899E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.375 | TFLOPs: 25.41 | +7: iteration 81300/ 173500 | consumed samples: 20812800 | consumed tokens: 42624614400 | elapsed time per iteration (s): 0.15 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.710770E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.132 | TFLOPs: 26.11 | +7: iteration 81310/ 173500 | consumed samples: 20815360 | consumed tokens: 42629857280 | elapsed time per iteration (s): 0.15 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.716290E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.074 | TFLOPs: 26.13 | +7: iteration 81320/ 173500 | consumed samples: 20817920 | consumed tokens: 42635100160 | elapsed time per iteration (s): 0.16 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.716050E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.624 | TFLOPs: 25.43 | +7: iteration 81330/ 173500 | consumed samples: 20820480 | consumed tokens: 42640343040 | elapsed time per iteration (s): 0.15 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.702214E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.274 | TFLOPs: 26.23 | +7: iteration 81340/ 173500 | consumed samples: 20823040 | consumed tokens: 42645585920 | elapsed time per iteration (s): 0.16 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.711919E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.759 | TFLOPs: 25.87 | +7: iteration 81350/ 173500 | consumed samples: 20825600 | consumed tokens: 42650828800 | elapsed time per iteration (s): 0.16 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.684083E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.119 | TFLOPs: 25.27 | +7: iteration 81360/ 173500 | consumed samples: 20828160 | consumed tokens: 42656071680 | elapsed time per iteration (s): 0.16 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.714080E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.978 | TFLOPs: 25.78 | +7: iteration 81370/ 173500 | consumed samples: 20830720 | consumed tokens: 42661314560 | elapsed time per iteration (s): 0.16 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.705080E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.369 | TFLOPs: 25.29 | +7: iteration 81380/ 173500 | consumed samples: 20833280 | consumed tokens: 42666557440 | elapsed time per iteration (s): 0.16 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.715426E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.908 | TFLOPs: 25.64 | +7: iteration 81390/ 173500 | consumed samples: 20835840 | consumed tokens: 42671800320 | elapsed time per iteration (s): 0.16 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.718999E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.649 | TFLOPs: 25.24 | +7: iteration 81400/ 173500 | consumed samples: 20838400 | consumed tokens: 42677043200 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.692046E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.859 | TFLOPs: 25.92 | +7: iteration 81410/ 173500 | consumed samples: 20840960 | consumed tokens: 42682286080 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.704044E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.259 | TFLOPs: 26.26 | +7: iteration 81420/ 173500 | consumed samples: 20843520 | consumed tokens: 42687528960 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.706062E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.670 | TFLOPs: 26.17 | +7: iteration 81430/ 173500 | consumed samples: 20846080 | consumed tokens: 42692771840 | elapsed time per iteration (s): 0.15 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.703426E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.410 | TFLOPs: 26.23 | +7: iteration 81440/ 173500 | consumed samples: 20848640 | consumed tokens: 42698014720 | elapsed time per iteration (s): 0.16 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.708005E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.289 | TFLOPs: 25.44 | +7: iteration 81450/ 173500 | consumed samples: 20851200 | consumed tokens: 42703257600 | elapsed time per iteration (s): 0.16 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.722941E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.034 | TFLOPs: 25.85 | +7: iteration 81460/ 173500 | consumed samples: 20853760 | consumed tokens: 42708500480 | elapsed time per iteration (s): 0.16 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.707550E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.214 | TFLOPs: 25.88 | +7: iteration 81470/ 173500 | consumed samples: 20856320 | consumed tokens: 42713743360 | elapsed time per iteration (s): 0.15 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.702410E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.573 | TFLOPs: 26.10 | +7: iteration 81480/ 173500 | consumed samples: 20858880 | consumed tokens: 42718986240 | elapsed time per iteration (s): 0.16 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.696928E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.161 | TFLOPs: 25.72 | +7: iteration 81490/ 173500 | consumed samples: 20861440 | consumed tokens: 42724229120 | elapsed time per iteration (s): 0.15 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.715355E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.921 | TFLOPs: 26.05 | +7: iteration 81500/ 173500 | consumed samples: 20864000 | consumed tokens: 42729472000 | elapsed time per iteration (s): 0.16 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.706068E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.076 | TFLOPs: 25.63 | +7: iteration 81510/ 173500 | consumed samples: 20866560 | consumed tokens: 42734714880 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.715159E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.340 | TFLOPs: 26.07 | +7: iteration 81520/ 173500 | consumed samples: 20869120 | consumed tokens: 42739957760 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.706245E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.155 | TFLOPs: 26.07 | +7: iteration 81530/ 173500 | consumed samples: 20871680 | consumed tokens: 42745200640 | elapsed time per iteration (s): 0.16 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.698634E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.152 | TFLOPs: 25.71 | +7: iteration 81540/ 173500 | consumed samples: 20874240 | consumed tokens: 42750443520 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.701856E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.940 | TFLOPs: 25.92 | +7: iteration 81550/ 173500 | consumed samples: 20876800 | consumed tokens: 42755686400 | elapsed time per iteration (s): 0.16 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.723820E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.982 | TFLOPs: 25.59 | +7: iteration 81560/ 173500 | consumed samples: 20879360 | consumed tokens: 42760929280 | elapsed time per iteration (s): 0.15 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.707359E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.478 | TFLOPs: 26.32 | +7: iteration 81570/ 173500 | consumed samples: 20881920 | consumed tokens: 42766172160 | elapsed time per iteration (s): 0.15 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.704749E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.529 | TFLOPs: 26.31 | +7: iteration 81580/ 173500 | consumed samples: 20884480 | consumed tokens: 42771415040 | elapsed time per iteration (s): 0.15 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.690056E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.727 | TFLOPs: 26.20 | +7: iteration 81590/ 173500 | consumed samples: 20887040 | consumed tokens: 42776657920 | elapsed time per iteration (s): 0.15 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.714711E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.448 | TFLOPs: 26.17 | +7: iteration 81600/ 173500 | consumed samples: 20889600 | consumed tokens: 42781900800 | elapsed time per iteration (s): 0.16 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.705359E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.268 | TFLOPs: 25.77 | +7: iteration 81610/ 173500 | consumed samples: 20892160 | consumed tokens: 42787143680 | elapsed time per iteration (s): 0.16 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.708691E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.532 | TFLOPs: 25.63 | +7: iteration 81620/ 173500 | consumed samples: 20894720 | consumed tokens: 42792386560 | elapsed time per iteration (s): 0.15 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.706217E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.730 | TFLOPs: 26.19 | +7: iteration 81630/ 173500 | consumed samples: 20897280 | consumed tokens: 42797629440 | elapsed time per iteration (s): 0.15 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.713276E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.928 | TFLOPs: 26.20 | +7: iteration 81640/ 173500 | consumed samples: 20899840 | consumed tokens: 42802872320 | elapsed time per iteration (s): 0.15 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.700330E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.691 | TFLOPs: 26.22 | +7: iteration 81650/ 173500 | consumed samples: 20902400 | consumed tokens: 42808115200 | elapsed time per iteration (s): 0.16 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.704633E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.931 | TFLOPs: 25.50 | +7: iteration 81660/ 173500 | consumed samples: 20904960 | consumed tokens: 42813358080 | elapsed time per iteration (s): 0.16 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.716536E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.923 | TFLOPs: 25.86 | +7: iteration 81670/ 173500 | consumed samples: 20907520 | consumed tokens: 42818600960 | elapsed time per iteration (s): 0.15 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.702346E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.567 | TFLOPs: 26.25 | +7: iteration 81680/ 173500 | consumed samples: 20910080 | consumed tokens: 42823843840 | elapsed time per iteration (s): 0.16 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.720436E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.798 | TFLOPs: 25.87 | +7: iteration 81690/ 173500 | consumed samples: 20912640 | consumed tokens: 42829086720 | elapsed time per iteration (s): 0.16 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.712527E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.591 | TFLOPs: 25.62 | +7: iteration 81700/ 173500 | consumed samples: 20915200 | consumed tokens: 42834329600 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.707245E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.741 | TFLOPs: 25.92 | +7: iteration 81710/ 173500 | consumed samples: 20917760 | consumed tokens: 42839572480 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.695577E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.166 | TFLOPs: 26.22 | +7: iteration 81720/ 173500 | consumed samples: 20920320 | consumed tokens: 42844815360 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.695394E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.359 | TFLOPs: 26.18 | +7: iteration 81730/ 173500 | consumed samples: 20922880 | consumed tokens: 42850058240 | elapsed time per iteration (s): 0.16 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.706685E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.977 | TFLOPs: 25.56 | +7: iteration 81740/ 173500 | consumed samples: 20925440 | consumed tokens: 42855301120 | elapsed time per iteration (s): 0.15 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.711490E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.926 | TFLOPs: 26.05 | +7: iteration 81750/ 173500 | consumed samples: 20928000 | consumed tokens: 42860544000 | elapsed time per iteration (s): 0.15 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.697922E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.299 | TFLOPs: 26.26 | +7: iteration 81760/ 173500 | consumed samples: 20930560 | consumed tokens: 42865786880 | elapsed time per iteration (s): 0.15 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.703440E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.406 | TFLOPs: 25.91 | +7: iteration 81770/ 173500 | consumed samples: 20933120 | consumed tokens: 42871029760 | elapsed time per iteration (s): 0.16 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.704628E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.102 | TFLOPs: 25.08 | +7: iteration 81780/ 173500 | consumed samples: 20935680 | consumed tokens: 42876272640 | elapsed time per iteration (s): 0.16 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.710537E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.879 | TFLOPs: 25.53 | +7: iteration 81790/ 173500 | consumed samples: 20938240 | consumed tokens: 42881515520 | elapsed time per iteration (s): 0.15 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.705658E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.522 | TFLOPs: 26.28 | +7: iteration 81800/ 173500 | consumed samples: 20940800 | consumed tokens: 42886758400 | elapsed time per iteration (s): 0.16 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.715038E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.537 | TFLOPs: 25.76 | +7: iteration 81810/ 173500 | consumed samples: 20943360 | consumed tokens: 42892001280 | elapsed time per iteration (s): 0.15 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.713868E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.851 | TFLOPs: 26.14 | +7: iteration 81820/ 173500 | consumed samples: 20945920 | consumed tokens: 42897244160 | elapsed time per iteration (s): 0.16 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.705042E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.230 | TFLOPs: 25.44 | +7: iteration 81830/ 173500 | consumed samples: 20948480 | consumed tokens: 42902487040 | elapsed time per iteration (s): 0.15 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.711956E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.268 | TFLOPs: 26.10 | +7: iteration 81840/ 173500 | consumed samples: 20951040 | consumed tokens: 42907729920 | elapsed time per iteration (s): 0.16 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.720500E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.398 | TFLOPs: 25.54 | +7: iteration 81850/ 173500 | consumed samples: 20953600 | consumed tokens: 42912972800 | elapsed time per iteration (s): 0.15 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.713051E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.809 | TFLOPs: 26.09 | +7: iteration 81860/ 173500 | consumed samples: 20956160 | consumed tokens: 42918215680 | elapsed time per iteration (s): 0.16 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.693089E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.227 | TFLOPs: 25.16 | +7: iteration 81870/ 173500 | consumed samples: 20958720 | consumed tokens: 42923458560 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.701302E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.974 | TFLOPs: 26.10 | +7: iteration 81880/ 173500 | consumed samples: 20961280 | consumed tokens: 42928701440 | elapsed time per iteration (s): 0.16 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.708285E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.518 | TFLOPs: 25.73 | +7: iteration 81890/ 173500 | consumed samples: 20963840 | consumed tokens: 42933944320 | elapsed time per iteration (s): 0.16 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.722417E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.626 | TFLOPs: 25.23 | +7: iteration 81900/ 173500 | consumed samples: 20966400 | consumed tokens: 42939187200 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.696191E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.257 | TFLOPs: 26.13 | +7: iteration 81910/ 173500 | consumed samples: 20968960 | consumed tokens: 42944430080 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.700715E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.697 | TFLOPs: 26.14 | +7: iteration 81920/ 173500 | consumed samples: 20971520 | consumed tokens: 42949672960 | elapsed time per iteration (s): 0.15 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.715385E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.088 | TFLOPs: 25.91 | +7: iteration 81930/ 173500 | consumed samples: 20974080 | consumed tokens: 42954915840 | elapsed time per iteration (s): 0.16 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.727433E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.870 | TFLOPs: 25.70 | +7: iteration 81940/ 173500 | consumed samples: 20976640 | consumed tokens: 42960158720 | elapsed time per iteration (s): 0.16 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.687853E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.783 | TFLOPs: 25.64 | +7: iteration 81950/ 173500 | consumed samples: 20979200 | consumed tokens: 42965401600 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.709962E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.773 | TFLOPs: 26.37 | +7: iteration 81960/ 173500 | consumed samples: 20981760 | consumed tokens: 42970644480 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.714452E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.169 | TFLOPs: 26.35 | +7: iteration 81970/ 173500 | consumed samples: 20984320 | consumed tokens: 42975887360 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.719695E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.251 | TFLOPs: 26.01 | +7: iteration 81980/ 173500 | consumed samples: 20986880 | consumed tokens: 42981130240 | elapsed time per iteration (s): 0.15 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.714407E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.441 | TFLOPs: 26.12 | +7: iteration 81990/ 173500 | consumed samples: 20989440 | consumed tokens: 42986373120 | elapsed time per iteration (s): 0.15 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.700420E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.552 | TFLOPs: 26.36 | +0: [2023-03-17 03:48:41,867] [INFO] [logging.py:68:log_dist] [Rank 0] step=82000, skipped=0, lr=[0.00011923116875818059, 0.00011923116875818059, 0.00011923116875818059], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 82000/ 173500 | consumed samples: 20992000 | consumed tokens: 42991616000 | elapsed time per iteration (s): 0.16 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.719405E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.830 | TFLOPs: 25.73 | +0: steps: 82000 loss: 3.7327 iter time (s): 0.155 samples/sec: 1656.599 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 82000 | lm loss value: 3.845752E+00 | lm loss PPL: 4.679385E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 82000 to checkpoints_44m91b100m +0: [2023-03-17 03:48:41,941] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step82000 is begin to save! +0: [2023-03-17 03:48:41,945] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:48:42,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:48:42,024] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:48:42,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:48:42,033] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:48:42,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:48:42,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:48:42,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:48:42,049] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:48:42,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:48:42,057] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:48:42,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:48:42,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:48:42,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:48:42,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:48:42,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:48:42,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:48:42,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:48:42,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:48:42,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:48:42,091] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step82000/mp_rank_00_model_states.pt +0: [2023-03-17 03:48:42,091] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:48:42,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +3: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +1: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +6: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:48:42,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +7: [2023-03-17 03:48:42,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:48:42,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 03:48:42,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +5: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +2: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:48:42,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +4: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:48:42,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step82000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:48:42,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step82000 is ready now! +0: successfully saved checkpoint at iteration 82000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 197.40 +7: iteration 82010/ 173500 | consumed samples: 20994560 | consumed tokens: 42996858880 | elapsed time per iteration (s): 0.18 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.710919E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.952 | TFLOPs: 22.06 | +7: iteration 82020/ 173500 | consumed samples: 20997120 | consumed tokens: 43002101760 | elapsed time per iteration (s): 0.16 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.699850E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.294 | TFLOPs: 25.52 | +7: iteration 82030/ 173500 | consumed samples: 20999680 | consumed tokens: 43007344640 | elapsed time per iteration (s): 0.15 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.710537E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.917 | TFLOPs: 26.35 | +7: iteration 82040/ 173500 | consumed samples: 21002240 | consumed tokens: 43012587520 | elapsed time per iteration (s): 0.15 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.711951E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.404 | TFLOPs: 26.32 | +7: iteration 82050/ 173500 | consumed samples: 21004800 | consumed tokens: 43017830400 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.704717E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.882 | TFLOPs: 26.38 | +7: iteration 82060/ 173500 | consumed samples: 21007360 | consumed tokens: 43023073280 | elapsed time per iteration (s): 0.16 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.715796E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.026 | TFLOPs: 25.80 | +7: iteration 82070/ 173500 | consumed samples: 21009920 | consumed tokens: 43028316160 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.713353E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.456 | TFLOPs: 26.15 | +7: iteration 82080/ 173500 | consumed samples: 21012480 | consumed tokens: 43033559040 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.705754E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.258 | TFLOPs: 26.16 | +7: iteration 82090/ 173500 | consumed samples: 21015040 | consumed tokens: 43038801920 | elapsed time per iteration (s): 0.16 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.701349E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.156 | TFLOPs: 25.16 | +7: iteration 82100/ 173500 | consumed samples: 21017600 | consumed tokens: 43044044800 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.713478E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.288 | TFLOPs: 26.16 | +7: iteration 82110/ 173500 | consumed samples: 21020160 | consumed tokens: 43049287680 | elapsed time per iteration (s): 0.15 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.710101E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.526 | TFLOPs: 26.15 | +7: iteration 82120/ 173500 | consumed samples: 21022720 | consumed tokens: 43054530560 | elapsed time per iteration (s): 0.16 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.697120E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.047 | TFLOPs: 25.85 | +7: iteration 82130/ 173500 | consumed samples: 21025280 | consumed tokens: 43059773440 | elapsed time per iteration (s): 0.15 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.701311E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.502 | TFLOPs: 26.35 | +7: iteration 82140/ 173500 | consumed samples: 21027840 | consumed tokens: 43065016320 | elapsed time per iteration (s): 0.16 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.721784E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.254 | TFLOPs: 25.63 | +7: iteration 82150/ 173500 | consumed samples: 21030400 | consumed tokens: 43070259200 | elapsed time per iteration (s): 0.16 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.710545E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.833 | TFLOPs: 25.62 | +7: iteration 82160/ 173500 | consumed samples: 21032960 | consumed tokens: 43075502080 | elapsed time per iteration (s): 0.16 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.695871E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.869 | TFLOPs: 25.62 | +7: iteration 82170/ 173500 | consumed samples: 21035520 | consumed tokens: 43080744960 | elapsed time per iteration (s): 0.16 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.710802E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.969 | TFLOPs: 25.86 | +7: iteration 82180/ 173500 | consumed samples: 21038080 | consumed tokens: 43085987840 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.711448E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.692 | TFLOPs: 26.33 | +7: iteration 82190/ 173500 | consumed samples: 21040640 | consumed tokens: 43091230720 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.705956E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.108 | TFLOPs: 26.32 | +7: iteration 82200/ 173500 | consumed samples: 21043200 | consumed tokens: 43096473600 | elapsed time per iteration (s): 0.16 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.700631E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.564 | TFLOPs: 25.81 | +7: iteration 82210/ 173500 | consumed samples: 21045760 | consumed tokens: 43101716480 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.705857E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.740 | TFLOPs: 26.36 | +7: iteration 82220/ 173500 | consumed samples: 21048320 | consumed tokens: 43106959360 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.716655E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.754 | TFLOPs: 26.01 | +7: iteration 82230/ 173500 | consumed samples: 21050880 | consumed tokens: 43112202240 | elapsed time per iteration (s): 0.15 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.704351E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.478 | TFLOPs: 26.37 | +7: iteration 82240/ 173500 | consumed samples: 21053440 | consumed tokens: 43117445120 | elapsed time per iteration (s): 0.16 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.698914E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.461 | TFLOPs: 25.63 | +7: iteration 82250/ 173500 | consumed samples: 21056000 | consumed tokens: 43122688000 | elapsed time per iteration (s): 0.16 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.713445E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.934 | TFLOPs: 25.83 | +7: iteration 82260/ 173500 | consumed samples: 21058560 | consumed tokens: 43127930880 | elapsed time per iteration (s): 0.15 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.704228E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.021 | TFLOPs: 26.05 | +7: iteration 82270/ 173500 | consumed samples: 21061120 | consumed tokens: 43133173760 | elapsed time per iteration (s): 0.15 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.710946E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.749 | TFLOPs: 26.39 | +7: iteration 82280/ 173500 | consumed samples: 21063680 | consumed tokens: 43138416640 | elapsed time per iteration (s): 0.15 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.713553E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.312 | TFLOPs: 25.93 | +7: iteration 82290/ 173500 | consumed samples: 21066240 | consumed tokens: 43143659520 | elapsed time per iteration (s): 0.16 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.721337E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.065 | TFLOPs: 25.30 | +7: iteration 82300/ 173500 | consumed samples: 21068800 | consumed tokens: 43148902400 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.719635E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.574 | TFLOPs: 26.14 | +7: iteration 82310/ 173500 | consumed samples: 21071360 | consumed tokens: 43154145280 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.709640E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.822 | TFLOPs: 26.33 | +7: iteration 82320/ 173500 | consumed samples: 21073920 | consumed tokens: 43159388160 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.716876E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.853 | TFLOPs: 26.31 | +7: iteration 82330/ 173500 | consumed samples: 21076480 | consumed tokens: 43164631040 | elapsed time per iteration (s): 0.16 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.713630E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.293 | TFLOPs: 25.65 | +7: iteration 82340/ 173500 | consumed samples: 21079040 | consumed tokens: 43169873920 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.719275E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.348 | TFLOPs: 26.34 | +7: iteration 82350/ 173500 | consumed samples: 21081600 | consumed tokens: 43175116800 | elapsed time per iteration (s): 0.15 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.711539E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.504 | TFLOPs: 26.17 | +7: iteration 82360/ 173500 | consumed samples: 21084160 | consumed tokens: 43180359680 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.705799E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.270 | TFLOPs: 26.24 | +7: iteration 82370/ 173500 | consumed samples: 21086720 | consumed tokens: 43185602560 | elapsed time per iteration (s): 0.16 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.696725E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.013 | TFLOPs: 25.74 | +7: iteration 82380/ 173500 | consumed samples: 21089280 | consumed tokens: 43190845440 | elapsed time per iteration (s): 0.16 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.711804E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.980 | TFLOPs: 25.72 | +7: iteration 82390/ 173500 | consumed samples: 21091840 | consumed tokens: 43196088320 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.721656E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.468 | TFLOPs: 26.34 | +7: iteration 82400/ 173500 | consumed samples: 21094400 | consumed tokens: 43201331200 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.702939E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.368 | TFLOPs: 26.12 | +7: iteration 82410/ 173500 | consumed samples: 21096960 | consumed tokens: 43206574080 | elapsed time per iteration (s): 0.15 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.718555E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.539 | TFLOPs: 26.34 | +7: iteration 82420/ 173500 | consumed samples: 21099520 | consumed tokens: 43211816960 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.709580E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.589 | TFLOPs: 26.03 | +7: iteration 82430/ 173500 | consumed samples: 21102080 | consumed tokens: 43217059840 | elapsed time per iteration (s): 0.16 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.716964E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.493 | TFLOPs: 25.76 | +7: iteration 82440/ 173500 | consumed samples: 21104640 | consumed tokens: 43222302720 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.707010E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.862 | TFLOPs: 26.27 | +7: iteration 82450/ 173500 | consumed samples: 21107200 | consumed tokens: 43227545600 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.699502E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.993 | TFLOPs: 26.28 | +7: iteration 82460/ 173500 | consumed samples: 21109760 | consumed tokens: 43232788480 | elapsed time per iteration (s): 0.16 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.697940E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.309 | TFLOPs: 25.76 | +7: iteration 82470/ 173500 | consumed samples: 21112320 | consumed tokens: 43238031360 | elapsed time per iteration (s): 0.15 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.713488E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.036 | TFLOPs: 26.27 | +7: iteration 82480/ 173500 | consumed samples: 21114880 | consumed tokens: 43243274240 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.705910E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.159 | TFLOPs: 26.25 | +7: iteration 82490/ 173500 | consumed samples: 21117440 | consumed tokens: 43248517120 | elapsed time per iteration (s): 0.16 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.709254E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.827 | TFLOPs: 25.34 | +7: iteration 82500/ 173500 | consumed samples: 21120000 | consumed tokens: 43253760000 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.711641E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.802 | TFLOPs: 26.31 | +7: iteration 82510/ 173500 | consumed samples: 21122560 | consumed tokens: 43259002880 | elapsed time per iteration (s): 0.16 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.711554E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.716 | TFLOPs: 25.51 | +7: iteration 82520/ 173500 | consumed samples: 21125120 | consumed tokens: 43264245760 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.707664E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.817 | TFLOPs: 26.31 | +7: iteration 82530/ 173500 | consumed samples: 21127680 | consumed tokens: 43269488640 | elapsed time per iteration (s): 0.15 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.699973E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.897 | TFLOPs: 26.00 | +7: iteration 82540/ 173500 | consumed samples: 21130240 | consumed tokens: 43274731520 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.696132E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.910 | TFLOPs: 26.05 | +7: iteration 82550/ 173500 | consumed samples: 21132800 | consumed tokens: 43279974400 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.701497E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.305 | TFLOPs: 26.30 | +7: iteration 82560/ 173500 | consumed samples: 21135360 | consumed tokens: 43285217280 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.703288E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.189 | TFLOPs: 26.29 | +7: iteration 82570/ 173500 | consumed samples: 21137920 | consumed tokens: 43290460160 | elapsed time per iteration (s): 0.16 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.713793E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.122 | TFLOPs: 25.78 | +7: iteration 82580/ 173500 | consumed samples: 21140480 | consumed tokens: 43295703040 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.712940E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.433 | TFLOPs: 26.31 | +7: iteration 82590/ 173500 | consumed samples: 21143040 | consumed tokens: 43300945920 | elapsed time per iteration (s): 0.15 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.706998E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.623 | TFLOPs: 26.33 | +7: iteration 82600/ 173500 | consumed samples: 21145600 | consumed tokens: 43306188800 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.719970E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.802 | TFLOPs: 26.33 | +7: iteration 82610/ 173500 | consumed samples: 21148160 | consumed tokens: 43311431680 | elapsed time per iteration (s): 0.16 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.707485E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.278 | TFLOPs: 25.83 | +7: iteration 82620/ 173500 | consumed samples: 21150720 | consumed tokens: 43316674560 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.715580E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.389 | TFLOPs: 26.34 | +7: iteration 82630/ 173500 | consumed samples: 21153280 | consumed tokens: 43321917440 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.702662E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.242 | TFLOPs: 26.37 | +7: iteration 82640/ 173500 | consumed samples: 21155840 | consumed tokens: 43327160320 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.704908E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.067 | TFLOPs: 26.36 | +7: iteration 82650/ 173500 | consumed samples: 21158400 | consumed tokens: 43332403200 | elapsed time per iteration (s): 0.15 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.709938E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.914 | TFLOPs: 26.33 | +7: iteration 82660/ 173500 | consumed samples: 21160960 | consumed tokens: 43337646080 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.718370E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.409 | TFLOPs: 26.34 | +7: iteration 82670/ 173500 | consumed samples: 21163520 | consumed tokens: 43342888960 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.704863E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.920 | TFLOPs: 26.31 | +7: iteration 82680/ 173500 | consumed samples: 21166080 | consumed tokens: 43348131840 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.715093E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.197 | TFLOPs: 26.37 | +7: iteration 82690/ 173500 | consumed samples: 21168640 | consumed tokens: 43353374720 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.708744E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.649 | TFLOPs: 26.36 | +7: iteration 82700/ 173500 | consumed samples: 21171200 | consumed tokens: 43358617600 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.699110E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.519 | TFLOPs: 26.35 | +7: iteration 82710/ 173500 | consumed samples: 21173760 | consumed tokens: 43363860480 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.717327E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.312 | TFLOPs: 26.37 | +7: iteration 82720/ 173500 | consumed samples: 21176320 | consumed tokens: 43369103360 | elapsed time per iteration (s): 0.15 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.704501E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.557 | TFLOPs: 26.36 | +7: iteration 82730/ 173500 | consumed samples: 21178880 | consumed tokens: 43374346240 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.707066E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.849 | TFLOPs: 26.36 | +7: iteration 82740/ 173500 | consumed samples: 21181440 | consumed tokens: 43379589120 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.699224E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.835 | TFLOPs: 26.33 | +7: iteration 82750/ 173500 | consumed samples: 21184000 | consumed tokens: 43384832000 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.701802E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.116 | TFLOPs: 26.33 | +7: iteration 82760/ 173500 | consumed samples: 21186560 | consumed tokens: 43390074880 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.710294E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.482 | TFLOPs: 26.20 | +7: iteration 82770/ 173500 | consumed samples: 21189120 | consumed tokens: 43395317760 | elapsed time per iteration (s): 0.15 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.707091E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.646 | TFLOPs: 26.29 | +7: iteration 82780/ 173500 | consumed samples: 21191680 | consumed tokens: 43400560640 | elapsed time per iteration (s): 0.16 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.705191E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.561 | TFLOPs: 25.67 | +7: iteration 82790/ 173500 | consumed samples: 21194240 | consumed tokens: 43405803520 | elapsed time per iteration (s): 0.16 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.699079E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.096 | TFLOPs: 25.30 | +7: iteration 82800/ 173500 | consumed samples: 21196800 | consumed tokens: 43411046400 | elapsed time per iteration (s): 0.15 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.720121E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.812 | TFLOPs: 26.31 | +7: iteration 82810/ 173500 | consumed samples: 21199360 | consumed tokens: 43416289280 | elapsed time per iteration (s): 0.16 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.727034E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.300 | TFLOPs: 25.47 | +7: iteration 82820/ 173500 | consumed samples: 21201920 | consumed tokens: 43421532160 | elapsed time per iteration (s): 0.15 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.697039E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.161 | TFLOPs: 26.27 | +7: iteration 82830/ 173500 | consumed samples: 21204480 | consumed tokens: 43426775040 | elapsed time per iteration (s): 0.16 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.697944E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.748 | TFLOPs: 25.51 | +7: iteration 82840/ 173500 | consumed samples: 21207040 | consumed tokens: 43432017920 | elapsed time per iteration (s): 0.15 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.709796E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.947 | TFLOPs: 26.27 | +7: iteration 82850/ 173500 | consumed samples: 21209600 | consumed tokens: 43437260800 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.713010E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.744 | TFLOPs: 25.97 | +7: iteration 82860/ 173500 | consumed samples: 21212160 | consumed tokens: 43442503680 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.722264E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.609 | TFLOPs: 26.32 | +7: iteration 82870/ 173500 | consumed samples: 21214720 | consumed tokens: 43447746560 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.709471E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.164 | TFLOPs: 26.36 | +7: iteration 82880/ 173500 | consumed samples: 21217280 | consumed tokens: 43452989440 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.710224E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.341 | TFLOPs: 26.37 | +7: iteration 82890/ 173500 | consumed samples: 21219840 | consumed tokens: 43458232320 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.700972E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.957 | TFLOPs: 25.99 | +7: iteration 82900/ 173500 | consumed samples: 21222400 | consumed tokens: 43463475200 | elapsed time per iteration (s): 0.15 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.720021E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.493 | TFLOPs: 26.29 | +7: iteration 82910/ 173500 | consumed samples: 21224960 | consumed tokens: 43468718080 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.703744E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.159 | TFLOPs: 26.35 | +7: iteration 82920/ 173500 | consumed samples: 21227520 | consumed tokens: 43473960960 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.706678E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.985 | TFLOPs: 26.35 | +7: iteration 82930/ 173500 | consumed samples: 21230080 | consumed tokens: 43479203840 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.711961E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.106 | TFLOPs: 26.36 | +7: iteration 82940/ 173500 | consumed samples: 21232640 | consumed tokens: 43484446720 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.703504E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.257 | TFLOPs: 26.38 | +7: iteration 82950/ 173500 | consumed samples: 21235200 | consumed tokens: 43489689600 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.705597E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.560 | TFLOPs: 26.36 | +7: iteration 82960/ 173500 | consumed samples: 21237760 | consumed tokens: 43494932480 | elapsed time per iteration (s): 0.15 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.716870E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.286 | TFLOPs: 26.37 | +7: iteration 82970/ 173500 | consumed samples: 21240320 | consumed tokens: 43500175360 | elapsed time per iteration (s): 0.15 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.699244E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.563 | TFLOPs: 26.37 | +7: iteration 82980/ 173500 | consumed samples: 21242880 | consumed tokens: 43505418240 | elapsed time per iteration (s): 0.15 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.697048E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.765 | TFLOPs: 26.36 | +7: iteration 82990/ 173500 | consumed samples: 21245440 | consumed tokens: 43510661120 | elapsed time per iteration (s): 0.15 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.690025E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.016 | TFLOPs: 26.38 | +7: iteration 83000/ 173500 | consumed samples: 21248000 | consumed tokens: 43515904000 | elapsed time per iteration (s): 0.16 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.695656E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.648 | TFLOPs: 25.87 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 83000 | lm loss value: 3.853893E+00 | lm loss PPL: 4.717634E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 83000 to checkpoints_44m91b100m +0: [2023-03-17 03:51:15,998] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step83000 is begin to save! +0: [2023-03-17 03:51:16,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:51:16,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:51:16,060] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:51:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:51:16,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:51:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:51:16,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:51:16,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:51:16,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:51:16,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:51:16,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:51:16,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:51:16,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:51:16,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:51:16,111] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:51:16,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:51:16,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:51:16,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:51:16,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:51:16,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:51:16,128] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step83000/mp_rank_00_model_states.pt +0: [2023-03-17 03:51:16,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:51:16,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:51:16,146] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:51:16,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:51:16,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 03:51:16,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +7: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +1: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +5: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +2: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +4: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +3: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +6: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:51:16,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step83000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:51:16,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step83000 is ready now! +0: successfully saved checkpoint at iteration 83000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.75 +7: iteration 83010/ 173500 | consumed samples: 21250560 | consumed tokens: 43521146880 | elapsed time per iteration (s): 0.18 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.697835E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1459.851 | TFLOPs: 22.89 | +7: iteration 83020/ 173500 | consumed samples: 21253120 | consumed tokens: 43526389760 | elapsed time per iteration (s): 0.16 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.699783E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.438 | TFLOPs: 25.82 | +7: iteration 83030/ 173500 | consumed samples: 21255680 | consumed tokens: 43531632640 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.717965E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.297 | TFLOPs: 26.16 | +7: iteration 83040/ 173500 | consumed samples: 21258240 | consumed tokens: 43536875520 | elapsed time per iteration (s): 0.16 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.706800E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.997 | TFLOPs: 25.14 | +7: iteration 83050/ 173500 | consumed samples: 21260800 | consumed tokens: 43542118400 | elapsed time per iteration (s): 0.16 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.714359E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.115 | TFLOPs: 24.56 | +7: iteration 83060/ 173500 | consumed samples: 21263360 | consumed tokens: 43547361280 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.697622E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.128 | TFLOPs: 26.22 | +7: iteration 83070/ 173500 | consumed samples: 21265920 | consumed tokens: 43552604160 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.721506E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.670 | TFLOPs: 26.15 | +7: iteration 83080/ 173500 | consumed samples: 21268480 | consumed tokens: 43557847040 | elapsed time per iteration (s): 0.15 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.706432E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.106 | TFLOPs: 26.21 | +7: iteration 83090/ 173500 | consumed samples: 21271040 | consumed tokens: 43563089920 | elapsed time per iteration (s): 0.16 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.699012E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.756 | TFLOPs: 25.89 | +7: iteration 83100/ 173500 | consumed samples: 21273600 | consumed tokens: 43568332800 | elapsed time per iteration (s): 0.16 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.693135E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.119 | TFLOPs: 25.36 | +7: iteration 83110/ 173500 | consumed samples: 21276160 | consumed tokens: 43573575680 | elapsed time per iteration (s): 0.16 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.707954E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.770 | TFLOPs: 25.04 | +7: iteration 83120/ 173500 | consumed samples: 21278720 | consumed tokens: 43578818560 | elapsed time per iteration (s): 0.15 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.701039E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.990 | TFLOPs: 25.97 | +7: iteration 83130/ 173500 | consumed samples: 21281280 | consumed tokens: 43584061440 | elapsed time per iteration (s): 0.15 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.718623E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.890 | TFLOPs: 25.92 | +7: iteration 83140/ 173500 | consumed samples: 21283840 | consumed tokens: 43589304320 | elapsed time per iteration (s): 0.15 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.709801E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.721 | TFLOPs: 25.93 | +7: iteration 83150/ 173500 | consumed samples: 21286400 | consumed tokens: 43594547200 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.704391E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.812 | TFLOPs: 26.14 | +7: iteration 83160/ 173500 | consumed samples: 21288960 | consumed tokens: 43599790080 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.708909E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.379 | TFLOPs: 26.15 | +7: iteration 83170/ 173500 | consumed samples: 21291520 | consumed tokens: 43605032960 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.696349E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.891 | TFLOPs: 26.19 | +7: iteration 83180/ 173500 | consumed samples: 21294080 | consumed tokens: 43610275840 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.697229E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.651 | TFLOPs: 26.18 | +7: iteration 83190/ 173500 | consumed samples: 21296640 | consumed tokens: 43615518720 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.708616E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.172 | TFLOPs: 26.18 | +7: iteration 83200/ 173500 | consumed samples: 21299200 | consumed tokens: 43620761600 | elapsed time per iteration (s): 0.15 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.711010E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.257 | TFLOPs: 26.18 | +7: iteration 83210/ 173500 | consumed samples: 21301760 | consumed tokens: 43626004480 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.712960E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.033 | TFLOPs: 26.19 | +7: iteration 83220/ 173500 | consumed samples: 21304320 | consumed tokens: 43631247360 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.707336E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.947 | TFLOPs: 26.17 | +7: iteration 83230/ 173500 | consumed samples: 21306880 | consumed tokens: 43636490240 | elapsed time per iteration (s): 0.16 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.714413E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.363 | TFLOPs: 24.96 | +7: iteration 83240/ 173500 | consumed samples: 21309440 | consumed tokens: 43641733120 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.704089E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.624 | TFLOPs: 26.20 | +7: iteration 83250/ 173500 | consumed samples: 21312000 | consumed tokens: 43646976000 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.713938E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.161 | TFLOPs: 26.21 | +7: iteration 83260/ 173500 | consumed samples: 21314560 | consumed tokens: 43652218880 | elapsed time per iteration (s): 0.15 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.706898E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.310 | TFLOPs: 26.21 | +7: iteration 83270/ 173500 | consumed samples: 21317120 | consumed tokens: 43657461760 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.714468E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.332 | TFLOPs: 26.21 | +7: iteration 83280/ 173500 | consumed samples: 21319680 | consumed tokens: 43662704640 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.703668E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.800 | TFLOPs: 26.19 | +7: iteration 83290/ 173500 | consumed samples: 21322240 | consumed tokens: 43667947520 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.718932E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.622 | TFLOPs: 26.23 | +7: iteration 83300/ 173500 | consumed samples: 21324800 | consumed tokens: 43673190400 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.707201E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.032 | TFLOPs: 26.05 | +7: iteration 83310/ 173500 | consumed samples: 21327360 | consumed tokens: 43678433280 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.714666E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.991 | TFLOPs: 26.25 | +7: iteration 83320/ 173500 | consumed samples: 21329920 | consumed tokens: 43683676160 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.697417E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.381 | TFLOPs: 26.18 | +7: iteration 83330/ 173500 | consumed samples: 21332480 | consumed tokens: 43688919040 | elapsed time per iteration (s): 0.15 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.706343E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.352 | TFLOPs: 26.21 | +7: iteration 83340/ 173500 | consumed samples: 21335040 | consumed tokens: 43694161920 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.699748E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.405 | TFLOPs: 26.21 | +7: iteration 83350/ 173500 | consumed samples: 21337600 | consumed tokens: 43699404800 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.709086E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.967 | TFLOPs: 26.20 | +7: iteration 83360/ 173500 | consumed samples: 21340160 | consumed tokens: 43704647680 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.704918E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.210 | TFLOPs: 25.91 | +7: iteration 83370/ 173500 | consumed samples: 21342720 | consumed tokens: 43709890560 | elapsed time per iteration (s): 0.15 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.705336E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.717 | TFLOPs: 26.15 | +7: iteration 83380/ 173500 | consumed samples: 21345280 | consumed tokens: 43715133440 | elapsed time per iteration (s): 0.17 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.705486E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.686 | TFLOPs: 23.11 | +7: iteration 83390/ 173500 | consumed samples: 21347840 | consumed tokens: 43720376320 | elapsed time per iteration (s): 0.16 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.701842E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.190 | TFLOPs: 25.27 | +7: iteration 83400/ 173500 | consumed samples: 21350400 | consumed tokens: 43725619200 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.707361E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.613 | TFLOPs: 26.11 | +7: iteration 83410/ 173500 | consumed samples: 21352960 | consumed tokens: 43730862080 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.709401E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.040 | TFLOPs: 26.00 | +7: iteration 83420/ 173500 | consumed samples: 21355520 | consumed tokens: 43736104960 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.700446E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.476 | TFLOPs: 26.18 | +7: iteration 83430/ 173500 | consumed samples: 21358080 | consumed tokens: 43741347840 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.716265E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.205 | TFLOPs: 26.19 | +7: iteration 83440/ 173500 | consumed samples: 21360640 | consumed tokens: 43746590720 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.703581E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.150 | TFLOPs: 26.16 | +7: iteration 83450/ 173500 | consumed samples: 21363200 | consumed tokens: 43751833600 | elapsed time per iteration (s): 0.15 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.702393E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.923 | TFLOPs: 26.16 | +7: iteration 83460/ 173500 | consumed samples: 21365760 | consumed tokens: 43757076480 | elapsed time per iteration (s): 0.15 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.689386E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.649 | TFLOPs: 26.15 | +7: iteration 83470/ 173500 | consumed samples: 21368320 | consumed tokens: 43762319360 | elapsed time per iteration (s): 0.16 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.718204E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.855 | TFLOPs: 25.43 | +7: iteration 83480/ 173500 | consumed samples: 21370880 | consumed tokens: 43767562240 | elapsed time per iteration (s): 0.15 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.696336E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.989 | TFLOPs: 26.02 | +7: iteration 83490/ 173500 | consumed samples: 21373440 | consumed tokens: 43772805120 | elapsed time per iteration (s): 0.16 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.711914E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.944 | TFLOPs: 25.67 | +7: iteration 83500/ 173500 | consumed samples: 21376000 | consumed tokens: 43778048000 | elapsed time per iteration (s): 0.15 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.715718E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.705 | TFLOPs: 26.09 | +7: iteration 83510/ 173500 | consumed samples: 21378560 | consumed tokens: 43783290880 | elapsed time per iteration (s): 0.16 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.706873E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.292 | TFLOPs: 25.76 | +7: iteration 83520/ 173500 | consumed samples: 21381120 | consumed tokens: 43788533760 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.712577E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.635 | TFLOPs: 26.15 | +7: iteration 83530/ 173500 | consumed samples: 21383680 | consumed tokens: 43793776640 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.709164E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.432 | TFLOPs: 26.13 | +7: iteration 83540/ 173500 | consumed samples: 21386240 | consumed tokens: 43799019520 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.704965E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.312 | TFLOPs: 26.16 | +7: iteration 83550/ 173500 | consumed samples: 21388800 | consumed tokens: 43804262400 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.719928E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.351 | TFLOPs: 26.12 | +7: iteration 83560/ 173500 | consumed samples: 21391360 | consumed tokens: 43809505280 | elapsed time per iteration (s): 0.15 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.722475E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.118 | TFLOPs: 26.10 | +7: iteration 83570/ 173500 | consumed samples: 21393920 | consumed tokens: 43814748160 | elapsed time per iteration (s): 0.16 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.705486E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.233 | TFLOPs: 25.66 | +7: iteration 83580/ 173500 | consumed samples: 21396480 | consumed tokens: 43819991040 | elapsed time per iteration (s): 0.16 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.709738E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.289 | TFLOPs: 25.66 | +7: iteration 83590/ 173500 | consumed samples: 21399040 | consumed tokens: 43825233920 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.708517E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.390 | TFLOPs: 26.04 | +7: iteration 83600/ 173500 | consumed samples: 21401600 | consumed tokens: 43830476800 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.716487E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.233 | TFLOPs: 26.15 | +7: iteration 83610/ 173500 | consumed samples: 21404160 | consumed tokens: 43835719680 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.711652E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.155 | TFLOPs: 26.15 | +7: iteration 83620/ 173500 | consumed samples: 21406720 | consumed tokens: 43840962560 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.698527E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.589 | TFLOPs: 26.14 | +7: iteration 83630/ 173500 | consumed samples: 21409280 | consumed tokens: 43846205440 | elapsed time per iteration (s): 0.15 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.710107E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.887 | TFLOPs: 26.16 | +7: iteration 83640/ 173500 | consumed samples: 21411840 | consumed tokens: 43851448320 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.706957E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.105 | TFLOPs: 26.14 | +7: iteration 83650/ 173500 | consumed samples: 21414400 | consumed tokens: 43856691200 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.707343E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.172 | TFLOPs: 26.15 | +7: iteration 83660/ 173500 | consumed samples: 21416960 | consumed tokens: 43861934080 | elapsed time per iteration (s): 0.16 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.700775E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.999 | TFLOPs: 25.34 | +7: iteration 83670/ 173500 | consumed samples: 21419520 | consumed tokens: 43867176960 | elapsed time per iteration (s): 0.16 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.703643E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.943 | TFLOPs: 25.55 | +7: iteration 83680/ 173500 | consumed samples: 21422080 | consumed tokens: 43872419840 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.698839E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.039 | TFLOPs: 26.11 | +7: iteration 83690/ 173500 | consumed samples: 21424640 | consumed tokens: 43877662720 | elapsed time per iteration (s): 0.15 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.711219E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.662 | TFLOPs: 26.12 | +7: iteration 83700/ 173500 | consumed samples: 21427200 | consumed tokens: 43882905600 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.704887E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.359 | TFLOPs: 26.15 | +7: iteration 83710/ 173500 | consumed samples: 21429760 | consumed tokens: 43888148480 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.700571E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.720 | TFLOPs: 26.14 | +7: iteration 83720/ 173500 | consumed samples: 21432320 | consumed tokens: 43893391360 | elapsed time per iteration (s): 0.16 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.718821E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.227 | TFLOPs: 25.47 | +7: iteration 83730/ 173500 | consumed samples: 21434880 | consumed tokens: 43898634240 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.710542E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.159 | TFLOPs: 26.08 | +7: iteration 83740/ 173500 | consumed samples: 21437440 | consumed tokens: 43903877120 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.722855E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.673 | TFLOPs: 26.07 | +7: iteration 83750/ 173500 | consumed samples: 21440000 | consumed tokens: 43909120000 | elapsed time per iteration (s): 0.15 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.712672E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.404 | TFLOPs: 26.05 | +7: iteration 83760/ 173500 | consumed samples: 21442560 | consumed tokens: 43914362880 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.716785E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.380 | TFLOPs: 26.07 | +7: iteration 83770/ 173500 | consumed samples: 21445120 | consumed tokens: 43919605760 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.694400E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.293 | TFLOPs: 26.12 | +7: iteration 83780/ 173500 | consumed samples: 21447680 | consumed tokens: 43924848640 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.702905E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.219 | TFLOPs: 26.11 | +7: iteration 83790/ 173500 | consumed samples: 21450240 | consumed tokens: 43930091520 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.713597E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.398 | TFLOPs: 26.13 | +7: iteration 83800/ 173500 | consumed samples: 21452800 | consumed tokens: 43935334400 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.699171E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.336 | TFLOPs: 25.99 | +7: iteration 83810/ 173500 | consumed samples: 21455360 | consumed tokens: 43940577280 | elapsed time per iteration (s): 0.15 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.715243E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.557 | TFLOPs: 26.07 | +7: iteration 83820/ 173500 | consumed samples: 21457920 | consumed tokens: 43945820160 | elapsed time per iteration (s): 0.16 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.716218E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.998 | TFLOPs: 25.84 | +7: iteration 83830/ 173500 | consumed samples: 21460480 | consumed tokens: 43951063040 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.702302E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.763 | TFLOPs: 26.12 | +7: iteration 83840/ 173500 | consumed samples: 21463040 | consumed tokens: 43956305920 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.708040E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.424 | TFLOPs: 26.13 | +7: iteration 83850/ 173500 | consumed samples: 21465600 | consumed tokens: 43961548800 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.704192E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.281 | TFLOPs: 26.13 | +7: iteration 83860/ 173500 | consumed samples: 21468160 | consumed tokens: 43966791680 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.698074E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.243 | TFLOPs: 26.13 | +7: iteration 83870/ 173500 | consumed samples: 21470720 | consumed tokens: 43972034560 | elapsed time per iteration (s): 0.15 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.710486E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.122 | TFLOPs: 26.14 | +7: iteration 83880/ 173500 | consumed samples: 21473280 | consumed tokens: 43977277440 | elapsed time per iteration (s): 0.16 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.714072E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.997 | TFLOPs: 25.45 | +7: iteration 83890/ 173500 | consumed samples: 21475840 | consumed tokens: 43982520320 | elapsed time per iteration (s): 0.16 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.707441E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.449 | TFLOPs: 25.62 | +7: iteration 83900/ 173500 | consumed samples: 21478400 | consumed tokens: 43987763200 | elapsed time per iteration (s): 0.15 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.698907E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.091 | TFLOPs: 26.13 | +7: iteration 83910/ 173500 | consumed samples: 21480960 | consumed tokens: 43993006080 | elapsed time per iteration (s): 0.16 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.701776E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.734 | TFLOPs: 25.79 | +7: iteration 83920/ 173500 | consumed samples: 21483520 | consumed tokens: 43998248960 | elapsed time per iteration (s): 0.15 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.707267E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.928 | TFLOPs: 26.17 | +7: iteration 83930/ 173500 | consumed samples: 21486080 | consumed tokens: 44003491840 | elapsed time per iteration (s): 0.15 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.700541E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.178 | TFLOPs: 26.15 | +7: iteration 83940/ 173500 | consumed samples: 21488640 | consumed tokens: 44008734720 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.709337E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.410 | TFLOPs: 26.15 | +7: iteration 83950/ 173500 | consumed samples: 21491200 | consumed tokens: 44013977600 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.707813E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.791 | TFLOPs: 26.16 | +7: iteration 83960/ 173500 | consumed samples: 21493760 | consumed tokens: 44019220480 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.705237E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.063 | TFLOPs: 26.14 | +7: iteration 83970/ 173500 | consumed samples: 21496320 | consumed tokens: 44024463360 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.711328E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.087 | TFLOPs: 26.16 | +7: iteration 83980/ 173500 | consumed samples: 21498880 | consumed tokens: 44029706240 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.709842E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.492 | TFLOPs: 26.13 | +7: iteration 83990/ 173500 | consumed samples: 21501440 | consumed tokens: 44034949120 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.690543E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.531 | TFLOPs: 26.10 | +0: [2023-03-17 03:53:50,806] [INFO] [logging.py:68:log_dist] [Rank 0] step=84000, skipped=0, lr=[0.00011595088621669176, 0.00011595088621669176, 0.00011595088621669176], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 84000/ 173500 | consumed samples: 21504000 | consumed tokens: 44040192000 | elapsed time per iteration (s): 0.15 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.708628E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.321 | TFLOPs: 26.13 | +0: steps: 84000 loss: 3.6957 iter time (s): 0.153 samples/sec: 1676.754 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 84000 | lm loss value: 3.887584E+00 | lm loss PPL: 4.879287E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 84000 to checkpoints_44m91b100m +0: [2023-03-17 03:53:50,880] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step84000 is begin to save! +0: [2023-03-17 03:53:50,884] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:53:50,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:53:50,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:53:50,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:53:50,955] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:53:50,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:53:50,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:53:50,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:53:50,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:53:50,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:53:50,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:53:50,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:53:50,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:53:50,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:53:50,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:53:51,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:53:51,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:53:51,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:53:51,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:53:51,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:53:51,012] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step84000/mp_rank_00_model_states.pt +0: [2023-03-17 03:53:51,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:53:51,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:53:51,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:53:51,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +1: [2023-03-17 03:53:51,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:53:51,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +1: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +1: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 03:53:51,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +1: [2023-03-17 03:53:51,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +5: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +1: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +3: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +7: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:53:51,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:53:51,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +6: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:53:51,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 03:53:51,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +2: [2023-03-17 03:53:51,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:53:51,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:53:51,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +4: [2023-03-17 03:53:51,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:53:51,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step84000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:53:51,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step84000 is ready now! +0: successfully saved checkpoint at iteration 84000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.35 +7: iteration 84010/ 173500 | consumed samples: 21506560 | consumed tokens: 44045434880 | elapsed time per iteration (s): 0.18 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.723848E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.251 | TFLOPs: 22.52 | +7: iteration 84020/ 173500 | consumed samples: 21509120 | consumed tokens: 44050677760 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.707951E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.958 | TFLOPs: 26.16 | +7: iteration 84030/ 173500 | consumed samples: 21511680 | consumed tokens: 44055920640 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.709208E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.472 | TFLOPs: 26.15 | +7: iteration 84040/ 173500 | consumed samples: 21514240 | consumed tokens: 44061163520 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.722294E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.951 | TFLOPs: 26.16 | +7: iteration 84050/ 173500 | consumed samples: 21516800 | consumed tokens: 44066406400 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.702711E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.371 | TFLOPs: 26.26 | +7: iteration 84060/ 173500 | consumed samples: 21519360 | consumed tokens: 44071649280 | elapsed time per iteration (s): 0.15 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.708296E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.058 | TFLOPs: 26.25 | +7: iteration 84070/ 173500 | consumed samples: 21521920 | consumed tokens: 44076892160 | elapsed time per iteration (s): 0.16 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.722966E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.398 | TFLOPs: 25.13 | +7: iteration 84080/ 173500 | consumed samples: 21524480 | consumed tokens: 44082135040 | elapsed time per iteration (s): 0.16 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.711162E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.312 | TFLOPs: 25.77 | +7: iteration 84090/ 173500 | consumed samples: 21527040 | consumed tokens: 44087377920 | elapsed time per iteration (s): 0.16 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.705612E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.001 | TFLOPs: 25.78 | +7: iteration 84100/ 173500 | consumed samples: 21529600 | consumed tokens: 44092620800 | elapsed time per iteration (s): 0.15 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.703126E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.235 | TFLOPs: 26.15 | +7: iteration 84110/ 173500 | consumed samples: 21532160 | consumed tokens: 44097863680 | elapsed time per iteration (s): 0.15 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.699397E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.938 | TFLOPs: 25.92 | +7: iteration 84120/ 173500 | consumed samples: 21534720 | consumed tokens: 44103106560 | elapsed time per iteration (s): 0.15 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.704722E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.506 | TFLOPs: 26.26 | +7: iteration 84130/ 173500 | consumed samples: 21537280 | consumed tokens: 44108349440 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.705062E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.893 | TFLOPs: 26.27 | +7: iteration 84140/ 173500 | consumed samples: 21539840 | consumed tokens: 44113592320 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.711260E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.191 | TFLOPs: 26.27 | +7: iteration 84150/ 173500 | consumed samples: 21542400 | consumed tokens: 44118835200 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.706936E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.437 | TFLOPs: 26.26 | +7: iteration 84160/ 173500 | consumed samples: 21544960 | consumed tokens: 44124078080 | elapsed time per iteration (s): 0.16 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.710708E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.354 | TFLOPs: 25.60 | +7: iteration 84170/ 173500 | consumed samples: 21547520 | consumed tokens: 44129320960 | elapsed time per iteration (s): 0.15 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.707551E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.604 | TFLOPs: 26.28 | +7: iteration 84180/ 173500 | consumed samples: 21550080 | consumed tokens: 44134563840 | elapsed time per iteration (s): 0.16 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.702281E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.791 | TFLOPs: 25.43 | +7: iteration 84190/ 173500 | consumed samples: 21552640 | consumed tokens: 44139806720 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.700679E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.239 | TFLOPs: 26.29 | +7: iteration 84200/ 173500 | consumed samples: 21555200 | consumed tokens: 44145049600 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.691222E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.998 | TFLOPs: 26.17 | +7: iteration 84210/ 173500 | consumed samples: 21557760 | consumed tokens: 44150292480 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.706002E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.113 | TFLOPs: 26.27 | +7: iteration 84220/ 173500 | consumed samples: 21560320 | consumed tokens: 44155535360 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.708914E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.898 | TFLOPs: 26.27 | +7: iteration 84230/ 173500 | consumed samples: 21562880 | consumed tokens: 44160778240 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.705988E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.569 | TFLOPs: 26.28 | +7: iteration 84240/ 173500 | consumed samples: 21565440 | consumed tokens: 44166021120 | elapsed time per iteration (s): 0.15 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.721843E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.229 | TFLOPs: 26.24 | +7: iteration 84250/ 173500 | consumed samples: 21568000 | consumed tokens: 44171264000 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.706544E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.120 | TFLOPs: 26.27 | +7: iteration 84260/ 173500 | consumed samples: 21570560 | consumed tokens: 44176506880 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.693287E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.763 | TFLOPs: 26.25 | +7: iteration 84270/ 173500 | consumed samples: 21573120 | consumed tokens: 44181749760 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.701297E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.602 | TFLOPs: 26.25 | +7: iteration 84280/ 173500 | consumed samples: 21575680 | consumed tokens: 44186992640 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.713689E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.292 | TFLOPs: 26.24 | +7: iteration 84290/ 173500 | consumed samples: 21578240 | consumed tokens: 44192235520 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.717768E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.574 | TFLOPs: 26.25 | +7: iteration 84300/ 173500 | consumed samples: 21580800 | consumed tokens: 44197478400 | elapsed time per iteration (s): 0.15 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.706815E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.785 | TFLOPs: 26.25 | +7: iteration 84310/ 173500 | consumed samples: 21583360 | consumed tokens: 44202721280 | elapsed time per iteration (s): 0.16 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.698398E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.852 | TFLOPs: 25.81 | +7: iteration 84320/ 173500 | consumed samples: 21585920 | consumed tokens: 44207964160 | elapsed time per iteration (s): 0.16 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.703321E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.163 | TFLOPs: 25.89 | +7: iteration 84330/ 173500 | consumed samples: 21588480 | consumed tokens: 44213207040 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.705334E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.156 | TFLOPs: 26.24 | +7: iteration 84340/ 173500 | consumed samples: 21591040 | consumed tokens: 44218449920 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.704956E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.931 | TFLOPs: 26.25 | +7: iteration 84350/ 173500 | consumed samples: 21593600 | consumed tokens: 44223692800 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.717710E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.711 | TFLOPs: 26.25 | +7: iteration 84360/ 173500 | consumed samples: 21596160 | consumed tokens: 44228935680 | elapsed time per iteration (s): 0.15 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.722806E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.238 | TFLOPs: 26.24 | +7: iteration 84370/ 173500 | consumed samples: 21598720 | consumed tokens: 44234178560 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.703466E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.248 | TFLOPs: 26.24 | +7: iteration 84380/ 173500 | consumed samples: 21601280 | consumed tokens: 44239421440 | elapsed time per iteration (s): 0.16 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.696491E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.538 | TFLOPs: 25.84 | +7: iteration 84390/ 173500 | consumed samples: 21603840 | consumed tokens: 44244664320 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.703251E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.831 | TFLOPs: 25.90 | +7: iteration 84400/ 173500 | consumed samples: 21606400 | consumed tokens: 44249907200 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.706683E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.554 | TFLOPs: 26.23 | +7: iteration 84410/ 173500 | consumed samples: 21608960 | consumed tokens: 44255150080 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.701286E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.291 | TFLOPs: 26.24 | +7: iteration 84420/ 173500 | consumed samples: 21611520 | consumed tokens: 44260392960 | elapsed time per iteration (s): 0.15 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.717235E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.752 | TFLOPs: 26.23 | +7: iteration 84430/ 173500 | consumed samples: 21614080 | consumed tokens: 44265635840 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.691348E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.148 | TFLOPs: 26.19 | +7: iteration 84440/ 173500 | consumed samples: 21616640 | consumed tokens: 44270878720 | elapsed time per iteration (s): 0.16 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.713969E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.283 | TFLOPs: 25.77 | +7: iteration 84450/ 173500 | consumed samples: 21619200 | consumed tokens: 44276121600 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.715919E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.639 | TFLOPs: 26.25 | +7: iteration 84460/ 173500 | consumed samples: 21621760 | consumed tokens: 44281364480 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.704165E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.729 | TFLOPs: 26.28 | +7: iteration 84470/ 173500 | consumed samples: 21624320 | consumed tokens: 44286607360 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.704348E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.748 | TFLOPs: 26.25 | +7: iteration 84480/ 173500 | consumed samples: 21626880 | consumed tokens: 44291850240 | elapsed time per iteration (s): 0.15 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.705032E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.852 | TFLOPs: 26.25 | +7: iteration 84490/ 173500 | consumed samples: 21629440 | consumed tokens: 44297093120 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.701567E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.625 | TFLOPs: 26.26 | +7: iteration 84500/ 173500 | consumed samples: 21632000 | consumed tokens: 44302336000 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.705888E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.022 | TFLOPs: 26.27 | +7: iteration 84510/ 173500 | consumed samples: 21634560 | consumed tokens: 44307578880 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.704364E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.295 | TFLOPs: 26.37 | +7: iteration 84520/ 173500 | consumed samples: 21637120 | consumed tokens: 44312821760 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.709196E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.354 | TFLOPs: 26.35 | +7: iteration 84530/ 173500 | consumed samples: 21639680 | consumed tokens: 44318064640 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.701012E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.168 | TFLOPs: 26.36 | +7: iteration 84540/ 173500 | consumed samples: 21642240 | consumed tokens: 44323307520 | elapsed time per iteration (s): 0.15 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.696003E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.885 | TFLOPs: 26.36 | +7: iteration 84550/ 173500 | consumed samples: 21644800 | consumed tokens: 44328550400 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.711502E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.069 | TFLOPs: 26.36 | +7: iteration 84560/ 173500 | consumed samples: 21647360 | consumed tokens: 44333793280 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.704048E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.488 | TFLOPs: 26.35 | +7: iteration 84570/ 173500 | consumed samples: 21649920 | consumed tokens: 44339036160 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.712262E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.047 | TFLOPs: 26.33 | +7: iteration 84580/ 173500 | consumed samples: 21652480 | consumed tokens: 44344279040 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.691615E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.201 | TFLOPs: 26.33 | +7: iteration 84590/ 173500 | consumed samples: 21655040 | consumed tokens: 44349521920 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.699433E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.292 | TFLOPs: 26.35 | +7: iteration 84600/ 173500 | consumed samples: 21657600 | consumed tokens: 44354764800 | elapsed time per iteration (s): 0.15 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.711921E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.242 | TFLOPs: 26.38 | +7: iteration 84610/ 173500 | consumed samples: 21660160 | consumed tokens: 44360007680 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.710101E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.787 | TFLOPs: 26.37 | +7: iteration 84620/ 173500 | consumed samples: 21662720 | consumed tokens: 44365250560 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.698272E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.921 | TFLOPs: 26.36 | +7: iteration 84630/ 173500 | consumed samples: 21665280 | consumed tokens: 44370493440 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.712457E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.572 | TFLOPs: 26.32 | +7: iteration 84640/ 173500 | consumed samples: 21667840 | consumed tokens: 44375736320 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.710750E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.593 | TFLOPs: 26.26 | +7: iteration 84650/ 173500 | consumed samples: 21670400 | consumed tokens: 44380979200 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.696396E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.622 | TFLOPs: 26.28 | +7: iteration 84660/ 173500 | consumed samples: 21672960 | consumed tokens: 44386222080 | elapsed time per iteration (s): 0.15 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.699352E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.937 | TFLOPs: 26.25 | +7: iteration 84670/ 173500 | consumed samples: 21675520 | consumed tokens: 44391464960 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.713773E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.551 | TFLOPs: 26.25 | +7: iteration 84680/ 173500 | consumed samples: 21678080 | consumed tokens: 44396707840 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.715363E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.241 | TFLOPs: 26.27 | +7: iteration 84690/ 173500 | consumed samples: 21680640 | consumed tokens: 44401950720 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.717930E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.937 | TFLOPs: 26.25 | +7: iteration 84700/ 173500 | consumed samples: 21683200 | consumed tokens: 44407193600 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.706110E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.307 | TFLOPs: 26.26 | +7: iteration 84710/ 173500 | consumed samples: 21685760 | consumed tokens: 44412436480 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.715903E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.720 | TFLOPs: 26.22 | +7: iteration 84720/ 173500 | consumed samples: 21688320 | consumed tokens: 44417679360 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.713841E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.652 | TFLOPs: 26.25 | +7: iteration 84730/ 173500 | consumed samples: 21690880 | consumed tokens: 44422922240 | elapsed time per iteration (s): 0.15 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.727928E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.305 | TFLOPs: 26.26 | +7: iteration 84740/ 173500 | consumed samples: 21693440 | consumed tokens: 44428165120 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.695700E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.200 | TFLOPs: 26.24 | +7: iteration 84750/ 173500 | consumed samples: 21696000 | consumed tokens: 44433408000 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.711053E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.666 | TFLOPs: 26.22 | +7: iteration 84760/ 173500 | consumed samples: 21698560 | consumed tokens: 44438650880 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.696984E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.882 | TFLOPs: 26.23 | +7: iteration 84770/ 173500 | consumed samples: 21701120 | consumed tokens: 44443893760 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.716412E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.541 | TFLOPs: 26.21 | +7: iteration 84780/ 173500 | consumed samples: 21703680 | consumed tokens: 44449136640 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.702851E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.167 | TFLOPs: 26.19 | +7: iteration 84790/ 173500 | consumed samples: 21706240 | consumed tokens: 44454379520 | elapsed time per iteration (s): 0.15 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.709428E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.835 | TFLOPs: 26.23 | +7: iteration 84800/ 173500 | consumed samples: 21708800 | consumed tokens: 44459622400 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.704075E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.473 | TFLOPs: 26.23 | +7: iteration 84810/ 173500 | consumed samples: 21711360 | consumed tokens: 44464865280 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.707887E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.066 | TFLOPs: 26.22 | +7: iteration 84820/ 173500 | consumed samples: 21713920 | consumed tokens: 44470108160 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.705999E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.771 | TFLOPs: 26.23 | +7: iteration 84830/ 173500 | consumed samples: 21716480 | consumed tokens: 44475351040 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.714238E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.754 | TFLOPs: 26.25 | +7: iteration 84840/ 173500 | consumed samples: 21719040 | consumed tokens: 44480593920 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.711597E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.305 | TFLOPs: 26.24 | +7: iteration 84850/ 173500 | consumed samples: 21721600 | consumed tokens: 44485836800 | elapsed time per iteration (s): 0.15 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.721005E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.274 | TFLOPs: 26.24 | +7: iteration 84860/ 173500 | consumed samples: 21724160 | consumed tokens: 44491079680 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.722192E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.525 | TFLOPs: 26.09 | +7: iteration 84870/ 173500 | consumed samples: 21726720 | consumed tokens: 44496322560 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.698681E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.023 | TFLOPs: 26.16 | +7: iteration 84880/ 173500 | consumed samples: 21729280 | consumed tokens: 44501565440 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.716060E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.928 | TFLOPs: 26.25 | +7: iteration 84890/ 173500 | consumed samples: 21731840 | consumed tokens: 44506808320 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.707509E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.675 | TFLOPs: 26.23 | +7: iteration 84900/ 173500 | consumed samples: 21734400 | consumed tokens: 44512051200 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.700239E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.445 | TFLOPs: 26.23 | +7: iteration 84910/ 173500 | consumed samples: 21736960 | consumed tokens: 44517294080 | elapsed time per iteration (s): 0.15 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.716461E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.115 | TFLOPs: 26.24 | +7: iteration 84920/ 173500 | consumed samples: 21739520 | consumed tokens: 44522536960 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.715186E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.687 | TFLOPs: 26.23 | +7: iteration 84930/ 173500 | consumed samples: 21742080 | consumed tokens: 44527779840 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.709992E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.051 | TFLOPs: 26.25 | +7: iteration 84940/ 173500 | consumed samples: 21744640 | consumed tokens: 44533022720 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.711503E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.099 | TFLOPs: 26.24 | +7: iteration 84950/ 173500 | consumed samples: 21747200 | consumed tokens: 44538265600 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.695535E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.092 | TFLOPs: 26.25 | +7: iteration 84960/ 173500 | consumed samples: 21749760 | consumed tokens: 44543508480 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.704425E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.355 | TFLOPs: 26.23 | +7: iteration 84970/ 173500 | consumed samples: 21752320 | consumed tokens: 44548751360 | elapsed time per iteration (s): 0.15 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.701803E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.686 | TFLOPs: 26.25 | +7: iteration 84980/ 173500 | consumed samples: 21754880 | consumed tokens: 44553994240 | elapsed time per iteration (s): 0.15 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.693180E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.721 | TFLOPs: 26.23 | +7: iteration 84990/ 173500 | consumed samples: 21757440 | consumed tokens: 44559237120 | elapsed time per iteration (s): 0.15 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.698436E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.263 | TFLOPs: 26.26 | +7: iteration 85000/ 173500 | consumed samples: 21760000 | consumed tokens: 44564480000 | elapsed time per iteration (s): 0.15 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.697395E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.855 | TFLOPs: 26.27 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 85000 | lm loss value: 3.848554E+00 | lm loss PPL: 4.692514E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 85000 to checkpoints_44m91b100m +0: [2023-03-17 03:56:24,383] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step85000 is begin to save! +0: [2023-03-17 03:56:24,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:56:24,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:56:24,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:56:24,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:56:24,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:56:24,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:56:24,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:56:24,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:56:24,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:56:24,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:56:24,492] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:56:24,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:56:24,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:56:24,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:56:24,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:56:24,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:56:24,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:56:24,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:56:24,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:56:24,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:56:24,528] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step85000/mp_rank_00_model_states.pt +0: [2023-03-17 03:56:24,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:56:24,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:56:24,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:56:24,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:56:24,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +2: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +7: [2023-03-17 03:56:24,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +3: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +1: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +6: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +5: [2023-03-17 03:56:24,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:56:24,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: [2023-03-17 03:56:24,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:56:24,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step85000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:56:24,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85000 is ready now! +0: successfully saved checkpoint at iteration 85000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 191.68 +7: iteration 85010/ 173500 | consumed samples: 21762560 | consumed tokens: 44569722880 | elapsed time per iteration (s): 0.18 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.728591E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1444.623 | TFLOPs: 22.66 | +7: iteration 85020/ 173500 | consumed samples: 21765120 | consumed tokens: 44574965760 | elapsed time per iteration (s): 0.16 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.708535E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.112 | TFLOPs: 25.53 | +7: iteration 85030/ 173500 | consumed samples: 21767680 | consumed tokens: 44580208640 | elapsed time per iteration (s): 0.16 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.719756E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.672 | TFLOPs: 25.56 | +7: iteration 85040/ 173500 | consumed samples: 21770240 | consumed tokens: 44585451520 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.711943E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.336 | TFLOPs: 26.26 | +7: iteration 85050/ 173500 | consumed samples: 21772800 | consumed tokens: 44590694400 | elapsed time per iteration (s): 0.16 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.721159E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.911 | TFLOPs: 25.59 | +7: iteration 85060/ 173500 | consumed samples: 21775360 | consumed tokens: 44595937280 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.725428E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.756 | TFLOPs: 26.25 | +7: iteration 85070/ 173500 | consumed samples: 21777920 | consumed tokens: 44601180160 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.702972E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.647 | TFLOPs: 26.23 | +7: iteration 85080/ 173500 | consumed samples: 21780480 | consumed tokens: 44606423040 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.716529E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.492 | TFLOPs: 26.18 | +7: iteration 85090/ 173500 | consumed samples: 21783040 | consumed tokens: 44611665920 | elapsed time per iteration (s): 0.15 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.710413E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.292 | TFLOPs: 26.07 | +7: iteration 85100/ 173500 | consumed samples: 21785600 | consumed tokens: 44616908800 | elapsed time per iteration (s): 0.16 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.700644E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.335 | TFLOPs: 25.24 | +7: iteration 85110/ 173500 | consumed samples: 21788160 | consumed tokens: 44622151680 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.703672E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.912 | TFLOPs: 26.17 | +7: iteration 85120/ 173500 | consumed samples: 21790720 | consumed tokens: 44627394560 | elapsed time per iteration (s): 0.16 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.717304E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.678 | TFLOPs: 25.76 | +7: iteration 85130/ 173500 | consumed samples: 21793280 | consumed tokens: 44632637440 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.715803E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.567 | TFLOPs: 26.17 | +7: iteration 85140/ 173500 | consumed samples: 21795840 | consumed tokens: 44637880320 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.707060E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.854 | TFLOPs: 26.19 | +7: iteration 85150/ 173500 | consumed samples: 21798400 | consumed tokens: 44643123200 | elapsed time per iteration (s): 0.15 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.699004E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.228 | TFLOPs: 26.19 | +7: iteration 85160/ 173500 | consumed samples: 21800960 | consumed tokens: 44648366080 | elapsed time per iteration (s): 0.16 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.700924E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.135 | TFLOPs: 25.63 | +7: iteration 85170/ 173500 | consumed samples: 21803520 | consumed tokens: 44653608960 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.711643E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.875 | TFLOPs: 26.14 | +7: iteration 85180/ 173500 | consumed samples: 21806080 | consumed tokens: 44658851840 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.704828E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.656 | TFLOPs: 26.11 | +7: iteration 85190/ 173500 | consumed samples: 21808640 | consumed tokens: 44664094720 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.702016E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.130 | TFLOPs: 26.13 | +7: iteration 85200/ 173500 | consumed samples: 21811200 | consumed tokens: 44669337600 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.709188E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.490 | TFLOPs: 26.10 | +7: iteration 85210/ 173500 | consumed samples: 21813760 | consumed tokens: 44674580480 | elapsed time per iteration (s): 0.15 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.704045E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.948 | TFLOPs: 26.17 | +7: iteration 85220/ 173500 | consumed samples: 21816320 | consumed tokens: 44679823360 | elapsed time per iteration (s): 0.16 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.709806E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.220 | TFLOPs: 25.42 | +7: iteration 85230/ 173500 | consumed samples: 21818880 | consumed tokens: 44685066240 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.720765E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.207 | TFLOPs: 26.16 | +7: iteration 85240/ 173500 | consumed samples: 21821440 | consumed tokens: 44690309120 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.701004E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.523 | TFLOPs: 26.17 | +7: iteration 85250/ 173500 | consumed samples: 21824000 | consumed tokens: 44695552000 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.699667E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.490 | TFLOPs: 26.17 | +7: iteration 85260/ 173500 | consumed samples: 21826560 | consumed tokens: 44700794880 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.705941E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.292 | TFLOPs: 26.16 | +7: iteration 85270/ 173500 | consumed samples: 21829120 | consumed tokens: 44706037760 | elapsed time per iteration (s): 0.15 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.710251E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.657 | TFLOPs: 26.15 | +7: iteration 85280/ 173500 | consumed samples: 21831680 | consumed tokens: 44711280640 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.708261E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.563 | TFLOPs: 26.18 | +7: iteration 85290/ 173500 | consumed samples: 21834240 | consumed tokens: 44716523520 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.702487E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.669 | TFLOPs: 26.22 | +7: iteration 85300/ 173500 | consumed samples: 21836800 | consumed tokens: 44721766400 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.702991E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.839 | TFLOPs: 26.25 | +7: iteration 85310/ 173500 | consumed samples: 21839360 | consumed tokens: 44727009280 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.722878E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.060 | TFLOPs: 26.11 | +7: iteration 85320/ 173500 | consumed samples: 21841920 | consumed tokens: 44732252160 | elapsed time per iteration (s): 0.15 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.702706E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.628 | TFLOPs: 26.17 | +7: iteration 85330/ 173500 | consumed samples: 21844480 | consumed tokens: 44737495040 | elapsed time per iteration (s): 0.16 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.707154E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.230 | TFLOPs: 25.63 | +7: iteration 85340/ 173500 | consumed samples: 21847040 | consumed tokens: 44742737920 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.691718E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.458 | TFLOPs: 26.24 | +7: iteration 85350/ 173500 | consumed samples: 21849600 | consumed tokens: 44747980800 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.704908E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.285 | TFLOPs: 26.26 | +7: iteration 85360/ 173500 | consumed samples: 21852160 | consumed tokens: 44753223680 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.707586E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.805 | TFLOPs: 26.25 | +7: iteration 85370/ 173500 | consumed samples: 21854720 | consumed tokens: 44758466560 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.701559E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.334 | TFLOPs: 25.93 | +7: iteration 85380/ 173500 | consumed samples: 21857280 | consumed tokens: 44763709440 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.706242E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.689 | TFLOPs: 26.26 | +7: iteration 85390/ 173500 | consumed samples: 21859840 | consumed tokens: 44768952320 | elapsed time per iteration (s): 0.15 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.711674E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.291 | TFLOPs: 26.26 | +7: iteration 85400/ 173500 | consumed samples: 21862400 | consumed tokens: 44774195200 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.701753E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.195 | TFLOPs: 26.26 | +7: iteration 85410/ 173500 | consumed samples: 21864960 | consumed tokens: 44779438080 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.719556E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.886 | TFLOPs: 26.27 | +7: iteration 85420/ 173500 | consumed samples: 21867520 | consumed tokens: 44784680960 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.695527E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.948 | TFLOPs: 26.22 | +7: iteration 85430/ 173500 | consumed samples: 21870080 | consumed tokens: 44789923840 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.720248E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.096 | TFLOPs: 26.24 | +7: iteration 85440/ 173500 | consumed samples: 21872640 | consumed tokens: 44795166720 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.701408E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.526 | TFLOPs: 26.25 | +7: iteration 85450/ 173500 | consumed samples: 21875200 | consumed tokens: 44800409600 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.697348E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.161 | TFLOPs: 26.27 | +7: iteration 85460/ 173500 | consumed samples: 21877760 | consumed tokens: 44805652480 | elapsed time per iteration (s): 0.15 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.710240E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.553 | TFLOPs: 26.26 | +7: iteration 85470/ 173500 | consumed samples: 21880320 | consumed tokens: 44810895360 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.700460E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.625 | TFLOPs: 26.28 | +7: iteration 85480/ 173500 | consumed samples: 21882880 | consumed tokens: 44816138240 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.708859E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.706 | TFLOPs: 26.26 | +7: iteration 85490/ 173500 | consumed samples: 21885440 | consumed tokens: 44821381120 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.689928E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.569 | TFLOPs: 26.26 | +7: iteration 85500/ 173500 | consumed samples: 21888000 | consumed tokens: 44826624000 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.714124E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.553 | TFLOPs: 26.20 | +7: iteration 85510/ 173500 | consumed samples: 21890560 | consumed tokens: 44831866880 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.698897E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.603 | TFLOPs: 26.25 | +7: iteration 85520/ 173500 | consumed samples: 21893120 | consumed tokens: 44837109760 | elapsed time per iteration (s): 0.15 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.721816E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.120 | TFLOPs: 26.21 | +7: iteration 85530/ 173500 | consumed samples: 21895680 | consumed tokens: 44842352640 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.705294E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.150 | TFLOPs: 26.21 | +7: iteration 85540/ 173500 | consumed samples: 21898240 | consumed tokens: 44847595520 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.692229E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.025 | TFLOPs: 25.92 | +7: iteration 85550/ 173500 | consumed samples: 21900800 | consumed tokens: 44852838400 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.714177E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.179 | TFLOPs: 26.26 | +7: iteration 85560/ 173500 | consumed samples: 21903360 | consumed tokens: 44858081280 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.703174E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.410 | TFLOPs: 26.26 | +7: iteration 85570/ 173500 | consumed samples: 21905920 | consumed tokens: 44863324160 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.692931E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.138 | TFLOPs: 26.25 | +7: iteration 85580/ 173500 | consumed samples: 21908480 | consumed tokens: 44868567040 | elapsed time per iteration (s): 0.15 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.702597E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.705 | TFLOPs: 26.22 | +7: iteration 85590/ 173500 | consumed samples: 21911040 | consumed tokens: 44873809920 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.707777E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.258 | TFLOPs: 26.24 | +7: iteration 85600/ 173500 | consumed samples: 21913600 | consumed tokens: 44879052800 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.690505E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.894 | TFLOPs: 26.20 | +7: iteration 85610/ 173500 | consumed samples: 21916160 | consumed tokens: 44884295680 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.712682E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.561 | TFLOPs: 26.20 | +7: iteration 85620/ 173500 | consumed samples: 21918720 | consumed tokens: 44889538560 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.716349E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.841 | TFLOPs: 26.09 | +7: iteration 85630/ 173500 | consumed samples: 21921280 | consumed tokens: 44894781440 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.697308E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.265 | TFLOPs: 26.13 | +7: iteration 85640/ 173500 | consumed samples: 21923840 | consumed tokens: 44900024320 | elapsed time per iteration (s): 0.15 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.708827E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.226 | TFLOPs: 26.21 | +7: iteration 85650/ 173500 | consumed samples: 21926400 | consumed tokens: 44905267200 | elapsed time per iteration (s): 0.16 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.701131E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.065 | TFLOPs: 24.54 | +7: iteration 85660/ 173500 | consumed samples: 21928960 | consumed tokens: 44910510080 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.696198E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.251 | TFLOPs: 26.21 | +7: iteration 85670/ 173500 | consumed samples: 21931520 | consumed tokens: 44915752960 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.709459E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.253 | TFLOPs: 26.21 | +7: iteration 85680/ 173500 | consumed samples: 21934080 | consumed tokens: 44920995840 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.695749E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.255 | TFLOPs: 26.18 | +7: iteration 85690/ 173500 | consumed samples: 21936640 | consumed tokens: 44926238720 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.707776E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.428 | TFLOPs: 26.18 | +7: iteration 85700/ 173500 | consumed samples: 21939200 | consumed tokens: 44931481600 | elapsed time per iteration (s): 0.15 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.700198E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.445 | TFLOPs: 26.20 | +7: iteration 85710/ 173500 | consumed samples: 21941760 | consumed tokens: 44936724480 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.700646E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.086 | TFLOPs: 26.24 | +7: iteration 85720/ 173500 | consumed samples: 21944320 | consumed tokens: 44941967360 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.693499E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.601 | TFLOPs: 26.25 | +7: iteration 85730/ 173500 | consumed samples: 21946880 | consumed tokens: 44947210240 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.708348E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.714 | TFLOPs: 26.23 | +7: iteration 85740/ 173500 | consumed samples: 21949440 | consumed tokens: 44952453120 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.713601E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.804 | TFLOPs: 26.22 | +7: iteration 85750/ 173500 | consumed samples: 21952000 | consumed tokens: 44957696000 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.714301E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.607 | TFLOPs: 26.20 | +7: iteration 85760/ 173500 | consumed samples: 21954560 | consumed tokens: 44962938880 | elapsed time per iteration (s): 0.15 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.701435E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.639 | TFLOPs: 26.22 | +7: iteration 85770/ 173500 | consumed samples: 21957120 | consumed tokens: 44968181760 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.710516E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.050 | TFLOPs: 26.24 | +7: iteration 85780/ 173500 | consumed samples: 21959680 | consumed tokens: 44973424640 | elapsed time per iteration (s): 0.16 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.692929E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.060 | TFLOPs: 25.81 | +7: iteration 85790/ 173500 | consumed samples: 21962240 | consumed tokens: 44978667520 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.721676E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.882 | TFLOPs: 26.19 | +7: iteration 85800/ 173500 | consumed samples: 21964800 | consumed tokens: 44983910400 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.709163E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.823 | TFLOPs: 26.25 | +7: iteration 85810/ 173500 | consumed samples: 21967360 | consumed tokens: 44989153280 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.703820E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.045 | TFLOPs: 26.33 | +7: iteration 85820/ 173500 | consumed samples: 21969920 | consumed tokens: 44994396160 | elapsed time per iteration (s): 0.15 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.701774E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.333 | TFLOPs: 26.34 | +7: iteration 85830/ 173500 | consumed samples: 21972480 | consumed tokens: 44999639040 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.703786E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.310 | TFLOPs: 26.26 | +7: iteration 85840/ 173500 | consumed samples: 21975040 | consumed tokens: 45004881920 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.708002E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.016 | TFLOPs: 26.27 | +7: iteration 85850/ 173500 | consumed samples: 21977600 | consumed tokens: 45010124800 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.705967E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.705 | TFLOPs: 26.08 | +7: iteration 85860/ 173500 | consumed samples: 21980160 | consumed tokens: 45015367680 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.702166E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.482 | TFLOPs: 26.34 | +7: iteration 85870/ 173500 | consumed samples: 21982720 | consumed tokens: 45020610560 | elapsed time per iteration (s): 0.15 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.704299E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.031 | TFLOPs: 26.33 | +7: iteration 85880/ 173500 | consumed samples: 21985280 | consumed tokens: 45025853440 | elapsed time per iteration (s): 0.16 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.704628E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.394 | TFLOPs: 25.62 | +7: iteration 85890/ 173500 | consumed samples: 21987840 | consumed tokens: 45031096320 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.701212E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.157 | TFLOPs: 26.33 | +7: iteration 85900/ 173500 | consumed samples: 21990400 | consumed tokens: 45036339200 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.710844E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.322 | TFLOPs: 26.35 | +7: iteration 85910/ 173500 | consumed samples: 21992960 | consumed tokens: 45041582080 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.697822E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.745 | TFLOPs: 26.34 | +7: iteration 85920/ 173500 | consumed samples: 21995520 | consumed tokens: 45046824960 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.700251E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.344 | TFLOPs: 26.32 | +7: iteration 85930/ 173500 | consumed samples: 21998080 | consumed tokens: 45052067840 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.708097E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.090 | TFLOPs: 26.33 | +7: iteration 85940/ 173500 | consumed samples: 22000640 | consumed tokens: 45057310720 | elapsed time per iteration (s): 0.15 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.711955E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.713 | TFLOPs: 26.36 | +7: iteration 85950/ 173500 | consumed samples: 22003200 | consumed tokens: 45062553600 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.702823E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.101 | TFLOPs: 26.33 | +7: iteration 85960/ 173500 | consumed samples: 22005760 | consumed tokens: 45067796480 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.699628E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.100 | TFLOPs: 26.32 | +7: iteration 85970/ 173500 | consumed samples: 22008320 | consumed tokens: 45073039360 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.701127E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.930 | TFLOPs: 26.35 | +7: iteration 85980/ 173500 | consumed samples: 22010880 | consumed tokens: 45078282240 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.699754E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.778 | TFLOPs: 26.01 | +7: iteration 85990/ 173500 | consumed samples: 22013440 | consumed tokens: 45083525120 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.712743E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.776 | TFLOPs: 26.37 | +0: [2023-03-17 03:58:58,174] [INFO] [logging.py:68:log_dist] [Rank 0] step=86000, skipped=0, lr=[0.0001126626417003261, 0.0001126626417003261, 0.0001126626417003261], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 86000/ 173500 | consumed samples: 22016000 | consumed tokens: 45088768000 | elapsed time per iteration (s): 0.15 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.705635E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.769 | TFLOPs: 26.37 | +0: steps: 86000 loss: 3.7148 iter time (s): 0.152 samples/sec: 1685.275 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 86000 | lm loss value: 3.846049E+00 | lm loss PPL: 4.680775E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 86000 to checkpoints_44m91b100m +0: [2023-03-17 03:58:58,246] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step86000 is begin to save! +0: [2023-03-17 03:58:58,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_01-model_00-model_states.pt... +0: [2023-03-17 03:58:58,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_01-model_00-model_states.pt. +0: [2023-03-17 03:58:58,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_03-model_00-model_states.pt... +0: [2023-03-17 03:58:58,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_03-model_00-model_states.pt. +0: [2023-03-17 03:58:58,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_04-model_00-model_states.pt... +0: [2023-03-17 03:58:58,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_04-model_00-model_states.pt. +0: [2023-03-17 03:58:58,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_05-model_00-model_states.pt... +0: [2023-03-17 03:58:58,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_05-model_00-model_states.pt. +0: [2023-03-17 03:58:58,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_06-model_00-model_states.pt... +0: [2023-03-17 03:58:58,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_06-model_00-model_states.pt. +0: [2023-03-17 03:58:58,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_07-model_00-model_states.pt... +0: [2023-03-17 03:58:58,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_07-model_00-model_states.pt. +0: [2023-03-17 03:58:58,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_08-model_00-model_states.pt... +0: [2023-03-17 03:58:58,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_08-model_00-model_states.pt. +0: [2023-03-17 03:58:58,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_09-model_00-model_states.pt... +0: [2023-03-17 03:58:58,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_09-model_00-model_states.pt. +0: [2023-03-17 03:58:58,376] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_10-model_00-model_states.pt... +0: [2023-03-17 03:58:58,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_10-model_00-model_states.pt. +0: [2023-03-17 03:58:58,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/layer_12-model_00-model_states.pt... +0: [2023-03-17 03:58:58,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/layer_12-model_00-model_states.pt. +0: [2023-03-17 03:58:58,386] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step86000/mp_rank_00_model_states.pt +0: [2023-03-17 03:58:58,386] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/mp_rank_00_model_states.pt... +0: [2023-03-17 03:58:58,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/mp_rank_00_model_states.pt. +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +7: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 03:58:58,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 03:58:58,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +1: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +6: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +5: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +7: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +2: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +4: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +3: [2023-03-17 03:58:58,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step86000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 03:58:58,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step86000 is ready now! +0: successfully saved checkpoint at iteration 86000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.38 +7: iteration 86010/ 173500 | consumed samples: 22018560 | consumed tokens: 45094010880 | elapsed time per iteration (s): 0.18 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.697184E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.714 | TFLOPs: 22.64 | +7: iteration 86020/ 173500 | consumed samples: 22021120 | consumed tokens: 45099253760 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.716216E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.793 | TFLOPs: 26.37 | +7: iteration 86030/ 173500 | consumed samples: 22023680 | consumed tokens: 45104496640 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.692979E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.342 | TFLOPs: 26.34 | +7: iteration 86040/ 173500 | consumed samples: 22026240 | consumed tokens: 45109739520 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.702248E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.304 | TFLOPs: 26.38 | +7: iteration 86050/ 173500 | consumed samples: 22028800 | consumed tokens: 45114982400 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.704697E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.254 | TFLOPs: 26.37 | +7: iteration 86060/ 173500 | consumed samples: 22031360 | consumed tokens: 45120225280 | elapsed time per iteration (s): 0.15 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.707469E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.898 | TFLOPs: 26.38 | +7: iteration 86070/ 173500 | consumed samples: 22033920 | consumed tokens: 45125468160 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.706154E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.451 | TFLOPs: 26.35 | +7: iteration 86080/ 173500 | consumed samples: 22036480 | consumed tokens: 45130711040 | elapsed time per iteration (s): 0.16 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.710545E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.785 | TFLOPs: 25.28 | +7: iteration 86090/ 173500 | consumed samples: 22039040 | consumed tokens: 45135953920 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.704396E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.806 | TFLOPs: 26.33 | +7: iteration 86100/ 173500 | consumed samples: 22041600 | consumed tokens: 45141196800 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.706096E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.608 | TFLOPs: 26.31 | +7: iteration 86110/ 173500 | consumed samples: 22044160 | consumed tokens: 45146439680 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.718192E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.804 | TFLOPs: 26.36 | +7: iteration 86120/ 173500 | consumed samples: 22046720 | consumed tokens: 45151682560 | elapsed time per iteration (s): 0.15 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.706803E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.381 | TFLOPs: 26.37 | +7: iteration 86130/ 173500 | consumed samples: 22049280 | consumed tokens: 45156925440 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.707254E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.859 | TFLOPs: 26.34 | +7: iteration 86140/ 173500 | consumed samples: 22051840 | consumed tokens: 45162168320 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.693173E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.722 | TFLOPs: 26.12 | +7: iteration 86150/ 173500 | consumed samples: 22054400 | consumed tokens: 45167411200 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.704622E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.613 | TFLOPs: 26.36 | +7: iteration 86160/ 173500 | consumed samples: 22056960 | consumed tokens: 45172654080 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.707948E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.741 | TFLOPs: 26.31 | +7: iteration 86170/ 173500 | consumed samples: 22059520 | consumed tokens: 45177896960 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.706775E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.522 | TFLOPs: 26.25 | +7: iteration 86180/ 173500 | consumed samples: 22062080 | consumed tokens: 45183139840 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.702429E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.288 | TFLOPs: 26.24 | +7: iteration 86190/ 173500 | consumed samples: 22064640 | consumed tokens: 45188382720 | elapsed time per iteration (s): 0.15 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.717461E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.655 | TFLOPs: 26.22 | +7: iteration 86200/ 173500 | consumed samples: 22067200 | consumed tokens: 45193625600 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.711758E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.493 | TFLOPs: 26.17 | +7: iteration 86210/ 173500 | consumed samples: 22069760 | consumed tokens: 45198868480 | elapsed time per iteration (s): 0.16 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.705497E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.899 | TFLOPs: 25.86 | +7: iteration 86220/ 173500 | consumed samples: 22072320 | consumed tokens: 45204111360 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.705494E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.200 | TFLOPs: 26.19 | +7: iteration 86230/ 173500 | consumed samples: 22074880 | consumed tokens: 45209354240 | elapsed time per iteration (s): 0.16 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.711244E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.550 | TFLOPs: 24.91 | +7: iteration 86240/ 173500 | consumed samples: 22077440 | consumed tokens: 45214597120 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.705350E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.112 | TFLOPs: 26.18 | +7: iteration 86250/ 173500 | consumed samples: 22080000 | consumed tokens: 45219840000 | elapsed time per iteration (s): 0.15 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.707562E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.105 | TFLOPs: 26.21 | +7: iteration 86260/ 173500 | consumed samples: 22082560 | consumed tokens: 45225082880 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.699978E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.920 | TFLOPs: 26.19 | +7: iteration 86270/ 173500 | consumed samples: 22085120 | consumed tokens: 45230325760 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.708672E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.605 | TFLOPs: 26.20 | +7: iteration 86280/ 173500 | consumed samples: 22087680 | consumed tokens: 45235568640 | elapsed time per iteration (s): 0.16 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.692555E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.995 | TFLOPs: 25.59 | +7: iteration 86290/ 173500 | consumed samples: 22090240 | consumed tokens: 45240811520 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.701380E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.854 | TFLOPs: 26.27 | +7: iteration 86300/ 173500 | consumed samples: 22092800 | consumed tokens: 45246054400 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.705484E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.133 | TFLOPs: 26.29 | +7: iteration 86310/ 173500 | consumed samples: 22095360 | consumed tokens: 45251297280 | elapsed time per iteration (s): 0.15 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.707888E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.311 | TFLOPs: 26.26 | +7: iteration 86320/ 173500 | consumed samples: 22097920 | consumed tokens: 45256540160 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.699746E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.446 | TFLOPs: 26.28 | +7: iteration 86330/ 173500 | consumed samples: 22100480 | consumed tokens: 45261783040 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.711512E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.977 | TFLOPs: 26.27 | +7: iteration 86340/ 173500 | consumed samples: 22103040 | consumed tokens: 45267025920 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.710160E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.851 | TFLOPs: 26.28 | +7: iteration 86350/ 173500 | consumed samples: 22105600 | consumed tokens: 45272268800 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.701359E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.813 | TFLOPs: 26.27 | +7: iteration 86360/ 173500 | consumed samples: 22108160 | consumed tokens: 45277511680 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.707824E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.671 | TFLOPs: 26.26 | +7: iteration 86370/ 173500 | consumed samples: 22110720 | consumed tokens: 45282754560 | elapsed time per iteration (s): 0.15 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.705364E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.693 | TFLOPs: 26.25 | +7: iteration 86380/ 173500 | consumed samples: 22113280 | consumed tokens: 45287997440 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.712484E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.774 | TFLOPs: 26.23 | +7: iteration 86390/ 173500 | consumed samples: 22115840 | consumed tokens: 45293240320 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.703495E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.375 | TFLOPs: 26.26 | +7: iteration 86400/ 173500 | consumed samples: 22118400 | consumed tokens: 45298483200 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.706875E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.203 | TFLOPs: 26.15 | +7: iteration 86410/ 173500 | consumed samples: 22120960 | consumed tokens: 45303726080 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.698077E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.495 | TFLOPs: 26.18 | +7: iteration 86420/ 173500 | consumed samples: 22123520 | consumed tokens: 45308968960 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.711502E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.583 | TFLOPs: 26.20 | +7: iteration 86430/ 173500 | consumed samples: 22126080 | consumed tokens: 45314211840 | elapsed time per iteration (s): 0.15 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.715795E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.239 | TFLOPs: 26.19 | +7: iteration 86440/ 173500 | consumed samples: 22128640 | consumed tokens: 45319454720 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.711374E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.388 | TFLOPs: 26.24 | +7: iteration 86450/ 173500 | consumed samples: 22131200 | consumed tokens: 45324697600 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.705080E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.884 | TFLOPs: 26.22 | +7: iteration 86460/ 173500 | consumed samples: 22133760 | consumed tokens: 45329940480 | elapsed time per iteration (s): 0.16 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.712862E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.034 | TFLOPs: 25.63 | +7: iteration 86470/ 173500 | consumed samples: 22136320 | consumed tokens: 45335183360 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.702030E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.511 | TFLOPs: 26.18 | +7: iteration 86480/ 173500 | consumed samples: 22138880 | consumed tokens: 45340426240 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.701135E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.193 | TFLOPs: 26.29 | +7: iteration 86490/ 173500 | consumed samples: 22141440 | consumed tokens: 45345669120 | elapsed time per iteration (s): 0.15 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.710909E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.834 | TFLOPs: 26.28 | +7: iteration 86500/ 173500 | consumed samples: 22144000 | consumed tokens: 45350912000 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.690315E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.119 | TFLOPs: 26.27 | +7: iteration 86510/ 173500 | consumed samples: 22146560 | consumed tokens: 45356154880 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.700885E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.927 | TFLOPs: 26.28 | +7: iteration 86520/ 173500 | consumed samples: 22149120 | consumed tokens: 45361397760 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.716489E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.019 | TFLOPs: 26.28 | +7: iteration 86530/ 173500 | consumed samples: 22151680 | consumed tokens: 45366640640 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.698703E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.360 | TFLOPs: 26.27 | +7: iteration 86540/ 173500 | consumed samples: 22154240 | consumed tokens: 45371883520 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.699533E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.029 | TFLOPs: 26.30 | +7: iteration 86550/ 173500 | consumed samples: 22156800 | consumed tokens: 45377126400 | elapsed time per iteration (s): 0.15 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.709712E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.000 | TFLOPs: 26.28 | +7: iteration 86560/ 173500 | consumed samples: 22159360 | consumed tokens: 45382369280 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.706725E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.941 | TFLOPs: 26.28 | +7: iteration 86570/ 173500 | consumed samples: 22161920 | consumed tokens: 45387612160 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.696260E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.998 | TFLOPs: 26.27 | +7: iteration 86580/ 173500 | consumed samples: 22164480 | consumed tokens: 45392855040 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.706442E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.805 | TFLOPs: 26.28 | +7: iteration 86590/ 173500 | consumed samples: 22167040 | consumed tokens: 45398097920 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.712104E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.484 | TFLOPs: 26.28 | +7: iteration 86600/ 173500 | consumed samples: 22169600 | consumed tokens: 45403340800 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.690990E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.863 | TFLOPs: 26.27 | +7: iteration 86610/ 173500 | consumed samples: 22172160 | consumed tokens: 45408583680 | elapsed time per iteration (s): 0.15 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.696059E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.273 | TFLOPs: 26.27 | +7: iteration 86620/ 173500 | consumed samples: 22174720 | consumed tokens: 45413826560 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.713897E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.416 | TFLOPs: 26.26 | +7: iteration 86630/ 173500 | consumed samples: 22177280 | consumed tokens: 45419069440 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.698492E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.179 | TFLOPs: 26.26 | +7: iteration 86640/ 173500 | consumed samples: 22179840 | consumed tokens: 45424312320 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.699814E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.507 | TFLOPs: 26.26 | +7: iteration 86650/ 173500 | consumed samples: 22182400 | consumed tokens: 45429555200 | elapsed time per iteration (s): 0.16 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.706482E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.497 | TFLOPs: 25.87 | +7: iteration 86660/ 173500 | consumed samples: 22184960 | consumed tokens: 45434798080 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.711579E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.799 | TFLOPs: 26.28 | +7: iteration 86670/ 173500 | consumed samples: 22187520 | consumed tokens: 45440040960 | elapsed time per iteration (s): 0.15 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.714140E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.246 | TFLOPs: 26.27 | +7: iteration 86680/ 173500 | consumed samples: 22190080 | consumed tokens: 45445283840 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.708219E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.503 | TFLOPs: 26.26 | +7: iteration 86690/ 173500 | consumed samples: 22192640 | consumed tokens: 45450526720 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.707831E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.494 | TFLOPs: 26.28 | +7: iteration 86700/ 173500 | consumed samples: 22195200 | consumed tokens: 45455769600 | elapsed time per iteration (s): 0.16 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.691414E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.557 | TFLOPs: 25.54 | +7: iteration 86710/ 173500 | consumed samples: 22197760 | consumed tokens: 45461012480 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.696756E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.296 | TFLOPs: 26.26 | +7: iteration 86720/ 173500 | consumed samples: 22200320 | consumed tokens: 45466255360 | elapsed time per iteration (s): 0.16 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.685647E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.656 | TFLOPs: 25.34 | +7: iteration 86730/ 173500 | consumed samples: 22202880 | consumed tokens: 45471498240 | elapsed time per iteration (s): 0.15 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.714073E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.028 | TFLOPs: 26.27 | +7: iteration 86740/ 173500 | consumed samples: 22205440 | consumed tokens: 45476741120 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.696446E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.911 | TFLOPs: 26.35 | +7: iteration 86750/ 173500 | consumed samples: 22208000 | consumed tokens: 45481984000 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.703476E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.343 | TFLOPs: 26.32 | +7: iteration 86760/ 173500 | consumed samples: 22210560 | consumed tokens: 45487226880 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.704140E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.369 | TFLOPs: 26.37 | +7: iteration 86770/ 173500 | consumed samples: 22213120 | consumed tokens: 45492469760 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.693194E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.386 | TFLOPs: 26.35 | +7: iteration 86780/ 173500 | consumed samples: 22215680 | consumed tokens: 45497712640 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.707633E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.883 | TFLOPs: 26.36 | +7: iteration 86790/ 173500 | consumed samples: 22218240 | consumed tokens: 45502955520 | elapsed time per iteration (s): 0.15 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.717966E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.672 | TFLOPs: 26.37 | +7: iteration 86800/ 173500 | consumed samples: 22220800 | consumed tokens: 45508198400 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.702031E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.418 | TFLOPs: 26.37 | +7: iteration 86810/ 173500 | consumed samples: 22223360 | consumed tokens: 45513441280 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.699905E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.195 | TFLOPs: 26.38 | +7: iteration 86820/ 173500 | consumed samples: 22225920 | consumed tokens: 45518684160 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.702444E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.705 | TFLOPs: 26.36 | +7: iteration 86830/ 173500 | consumed samples: 22228480 | consumed tokens: 45523927040 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.697236E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.800 | TFLOPs: 26.36 | +7: iteration 86840/ 173500 | consumed samples: 22231040 | consumed tokens: 45529169920 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.700440E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.889 | TFLOPs: 26.34 | +7: iteration 86850/ 173500 | consumed samples: 22233600 | consumed tokens: 45534412800 | elapsed time per iteration (s): 0.15 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.706763E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.126 | TFLOPs: 26.36 | +7: iteration 86860/ 173500 | consumed samples: 22236160 | consumed tokens: 45539655680 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.709846E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.182 | TFLOPs: 26.35 | +7: iteration 86870/ 173500 | consumed samples: 22238720 | consumed tokens: 45544898560 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.705229E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.361 | TFLOPs: 26.35 | +7: iteration 86880/ 173500 | consumed samples: 22241280 | consumed tokens: 45550141440 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.701595E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.575 | TFLOPs: 26.36 | +7: iteration 86890/ 173500 | consumed samples: 22243840 | consumed tokens: 45555384320 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.708311E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.232 | TFLOPs: 26.37 | +7: iteration 86900/ 173500 | consumed samples: 22246400 | consumed tokens: 45560627200 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.706155E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.162 | TFLOPs: 26.36 | +7: iteration 86910/ 173500 | consumed samples: 22248960 | consumed tokens: 45565870080 | elapsed time per iteration (s): 0.15 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.698892E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.587 | TFLOPs: 26.37 | +7: iteration 86920/ 173500 | consumed samples: 22251520 | consumed tokens: 45571112960 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.686795E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.348 | TFLOPs: 26.37 | +7: iteration 86930/ 173500 | consumed samples: 22254080 | consumed tokens: 45576355840 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.697985E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.340 | TFLOPs: 26.34 | +7: iteration 86940/ 173500 | consumed samples: 22256640 | consumed tokens: 45581598720 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.706192E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.833 | TFLOPs: 26.33 | +7: iteration 86950/ 173500 | consumed samples: 22259200 | consumed tokens: 45586841600 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.708047E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.502 | TFLOPs: 26.35 | +7: iteration 86960/ 173500 | consumed samples: 22261760 | consumed tokens: 45592084480 | elapsed time per iteration (s): 0.16 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.701379E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.814 | TFLOPs: 25.75 | +7: iteration 86970/ 173500 | consumed samples: 22264320 | consumed tokens: 45597327360 | elapsed time per iteration (s): 0.15 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.707907E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.050 | TFLOPs: 26.35 | +7: iteration 86980/ 173500 | consumed samples: 22266880 | consumed tokens: 45602570240 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.700906E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.220 | TFLOPs: 26.35 | +7: iteration 86990/ 173500 | consumed samples: 22269440 | consumed tokens: 45607813120 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.707059E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.765 | TFLOPs: 26.34 | +7: iteration 87000/ 173500 | consumed samples: 22272000 | consumed tokens: 45613056000 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.715434E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.214 | TFLOPs: 26.37 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 87000 | lm loss value: 3.895911E+00 | lm loss PPL: 4.920083E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 87000 to checkpoints_44m91b100m +0: [2023-03-17 04:01:31,582] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step87000 is begin to save! +0: [2023-03-17 04:01:31,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:01:31,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:01:31,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:01:31,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:01:31,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:01:31,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:01:31,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:01:31,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:01:31,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:01:31,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:01:31,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:01:31,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:01:31,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:01:31,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:01:31,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:01:31,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:01:31,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:01:31,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:01:31,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:01:31,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:01:31,718] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step87000/mp_rank_00_model_states.pt +0: [2023-03-17 04:01:31,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:01:31,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:01:31,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:01:31,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:01:31,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +2: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +4: [2023-03-17 04:01:31,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +3: [2023-03-17 04:01:31,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +7: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +1: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +6: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +5: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:01:31,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step87000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:01:31,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step87000 is ready now! +0: successfully saved checkpoint at iteration 87000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.57 +7: iteration 87010/ 173500 | consumed samples: 22274560 | consumed tokens: 45618298880 | elapsed time per iteration (s): 0.18 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.714029E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.211 | TFLOPs: 22.87 | +7: iteration 87020/ 173500 | consumed samples: 22277120 | consumed tokens: 45623541760 | elapsed time per iteration (s): 0.15 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.711595E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.887 | TFLOPs: 26.25 | +7: iteration 87030/ 173500 | consumed samples: 22279680 | consumed tokens: 45628784640 | elapsed time per iteration (s): 0.16 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.703689E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.116 | TFLOPs: 25.56 | +7: iteration 87040/ 173500 | consumed samples: 22282240 | consumed tokens: 45634027520 | elapsed time per iteration (s): 0.16 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.716496E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.609 | TFLOPs: 25.67 | +7: iteration 87050/ 173500 | consumed samples: 22284800 | consumed tokens: 45639270400 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.716004E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.534 | TFLOPs: 26.28 | +7: iteration 87060/ 173500 | consumed samples: 22287360 | consumed tokens: 45644513280 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.701532E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.296 | TFLOPs: 26.26 | +7: iteration 87070/ 173500 | consumed samples: 22289920 | consumed tokens: 45649756160 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.710940E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.120 | TFLOPs: 26.25 | +7: iteration 87080/ 173500 | consumed samples: 22292480 | consumed tokens: 45654999040 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.713137E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.037 | TFLOPs: 26.25 | +7: iteration 87090/ 173500 | consumed samples: 22295040 | consumed tokens: 45660241920 | elapsed time per iteration (s): 0.15 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.694466E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.748 | TFLOPs: 26.26 | +7: iteration 87100/ 173500 | consumed samples: 22297600 | consumed tokens: 45665484800 | elapsed time per iteration (s): 0.16 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.698209E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.278 | TFLOPs: 24.48 | +7: iteration 87110/ 173500 | consumed samples: 22300160 | consumed tokens: 45670727680 | elapsed time per iteration (s): 0.16 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.704028E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.894 | TFLOPs: 25.73 | +7: iteration 87120/ 173500 | consumed samples: 22302720 | consumed tokens: 45675970560 | elapsed time per iteration (s): 0.15 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.695602E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.219 | TFLOPs: 26.27 | +7: iteration 87130/ 173500 | consumed samples: 22305280 | consumed tokens: 45681213440 | elapsed time per iteration (s): 0.16 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.705248E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.780 | TFLOPs: 25.20 | +7: iteration 87140/ 173500 | consumed samples: 22307840 | consumed tokens: 45686456320 | elapsed time per iteration (s): 0.16 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.702679E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.882 | TFLOPs: 25.42 | +7: iteration 87150/ 173500 | consumed samples: 22310400 | consumed tokens: 45691699200 | elapsed time per iteration (s): 0.16 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.700665E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.923 | TFLOPs: 25.31 | +7: iteration 87160/ 173500 | consumed samples: 22312960 | consumed tokens: 45696942080 | elapsed time per iteration (s): 0.15 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.712465E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.141 | TFLOPs: 26.25 | +7: iteration 87170/ 173500 | consumed samples: 22315520 | consumed tokens: 45702184960 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.698585E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.956 | TFLOPs: 26.25 | +7: iteration 87180/ 173500 | consumed samples: 22318080 | consumed tokens: 45707427840 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.702770E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.608 | TFLOPs: 26.26 | +7: iteration 87190/ 173500 | consumed samples: 22320640 | consumed tokens: 45712670720 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.701907E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.080 | TFLOPs: 25.94 | +7: iteration 87200/ 173500 | consumed samples: 22323200 | consumed tokens: 45717913600 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.695425E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.695 | TFLOPs: 26.15 | +7: iteration 87210/ 173500 | consumed samples: 22325760 | consumed tokens: 45723156480 | elapsed time per iteration (s): 0.15 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.699414E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.806 | TFLOPs: 26.25 | +7: iteration 87220/ 173500 | consumed samples: 22328320 | consumed tokens: 45728399360 | elapsed time per iteration (s): 0.16 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.696303E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.900 | TFLOPs: 25.69 | +7: iteration 87230/ 173500 | consumed samples: 22330880 | consumed tokens: 45733642240 | elapsed time per iteration (s): 0.15 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.703412E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.821 | TFLOPs: 26.25 | +7: iteration 87240/ 173500 | consumed samples: 22333440 | consumed tokens: 45738885120 | elapsed time per iteration (s): 0.16 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.715210E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.257 | TFLOPs: 25.72 | +7: iteration 87250/ 173500 | consumed samples: 22336000 | consumed tokens: 45744128000 | elapsed time per iteration (s): 0.16 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.707071E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.141 | TFLOPs: 25.82 | +7: iteration 87260/ 173500 | consumed samples: 22338560 | consumed tokens: 45749370880 | elapsed time per iteration (s): 0.16 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.707090E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.543 | TFLOPs: 25.62 | +7: iteration 87270/ 173500 | consumed samples: 22341120 | consumed tokens: 45754613760 | elapsed time per iteration (s): 0.15 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.691898E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.079 | TFLOPs: 26.27 | +7: iteration 87280/ 173500 | consumed samples: 22343680 | consumed tokens: 45759856640 | elapsed time per iteration (s): 0.16 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.699957E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.764 | TFLOPs: 25.87 | +7: iteration 87290/ 173500 | consumed samples: 22346240 | consumed tokens: 45765099520 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.701770E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.821 | TFLOPs: 26.25 | +7: iteration 87300/ 173500 | consumed samples: 22348800 | consumed tokens: 45770342400 | elapsed time per iteration (s): 0.17 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.705539E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.866 | TFLOPs: 23.82 | +7: iteration 87310/ 173500 | consumed samples: 22351360 | consumed tokens: 45775585280 | elapsed time per iteration (s): 0.16 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.721461E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.196 | TFLOPs: 24.99 | +7: iteration 87320/ 173500 | consumed samples: 22353920 | consumed tokens: 45780828160 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.714634E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.962 | TFLOPs: 26.20 | +7: iteration 87330/ 173500 | consumed samples: 22356480 | consumed tokens: 45786071040 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.715034E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.116 | TFLOPs: 26.27 | +7: iteration 87340/ 173500 | consumed samples: 22359040 | consumed tokens: 45791313920 | elapsed time per iteration (s): 0.15 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.697187E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.829 | TFLOPs: 26.25 | +7: iteration 87350/ 173500 | consumed samples: 22361600 | consumed tokens: 45796556800 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.700184E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.037 | TFLOPs: 26.16 | +7: iteration 87360/ 173500 | consumed samples: 22364160 | consumed tokens: 45801799680 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.697231E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.324 | TFLOPs: 26.23 | +7: iteration 87370/ 173500 | consumed samples: 22366720 | consumed tokens: 45807042560 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.713976E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.855 | TFLOPs: 26.20 | +7: iteration 87380/ 173500 | consumed samples: 22369280 | consumed tokens: 45812285440 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.713092E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.643 | TFLOPs: 26.23 | +7: iteration 87390/ 173500 | consumed samples: 22371840 | consumed tokens: 45817528320 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.697789E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.618 | TFLOPs: 26.23 | +7: iteration 87400/ 173500 | consumed samples: 22374400 | consumed tokens: 45822771200 | elapsed time per iteration (s): 0.15 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.696097E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.883 | TFLOPs: 26.25 | +7: iteration 87410/ 173500 | consumed samples: 22376960 | consumed tokens: 45828014080 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.705998E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.300 | TFLOPs: 26.24 | +7: iteration 87420/ 173500 | consumed samples: 22379520 | consumed tokens: 45833256960 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.695525E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.324 | TFLOPs: 26.26 | +7: iteration 87430/ 173500 | consumed samples: 22382080 | consumed tokens: 45838499840 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.694050E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.300 | TFLOPs: 26.26 | +7: iteration 87440/ 173500 | consumed samples: 22384640 | consumed tokens: 45843742720 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.700758E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.731 | TFLOPs: 26.25 | +7: iteration 87450/ 173500 | consumed samples: 22387200 | consumed tokens: 45848985600 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.708309E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.126 | TFLOPs: 26.21 | +7: iteration 87460/ 173500 | consumed samples: 22389760 | consumed tokens: 45854228480 | elapsed time per iteration (s): 0.15 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.691626E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.918 | TFLOPs: 26.20 | +7: iteration 87470/ 173500 | consumed samples: 22392320 | consumed tokens: 45859471360 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.695018E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.627 | TFLOPs: 26.22 | +7: iteration 87480/ 173500 | consumed samples: 22394880 | consumed tokens: 45864714240 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.706938E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.557 | TFLOPs: 26.23 | +7: iteration 87490/ 173500 | consumed samples: 22397440 | consumed tokens: 45869957120 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.699774E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.362 | TFLOPs: 26.24 | +7: iteration 87500/ 173500 | consumed samples: 22400000 | consumed tokens: 45875200000 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.697379E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.255 | TFLOPs: 26.23 | +7: iteration 87510/ 173500 | consumed samples: 22402560 | consumed tokens: 45880442880 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.697552E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.934 | TFLOPs: 26.22 | +7: iteration 87520/ 173500 | consumed samples: 22405120 | consumed tokens: 45885685760 | elapsed time per iteration (s): 0.15 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.700543E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.889 | TFLOPs: 26.24 | +7: iteration 87530/ 173500 | consumed samples: 22407680 | consumed tokens: 45890928640 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.705952E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.741 | TFLOPs: 26.25 | +7: iteration 87540/ 173500 | consumed samples: 22410240 | consumed tokens: 45896171520 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.712155E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.211 | TFLOPs: 26.24 | +7: iteration 87550/ 173500 | consumed samples: 22412800 | consumed tokens: 45901414400 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.693031E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.772 | TFLOPs: 26.23 | +7: iteration 87560/ 173500 | consumed samples: 22415360 | consumed tokens: 45906657280 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.694120E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.903 | TFLOPs: 26.24 | +7: iteration 87570/ 173500 | consumed samples: 22417920 | consumed tokens: 45911900160 | elapsed time per iteration (s): 0.16 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.704340E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.332 | TFLOPs: 25.57 | +7: iteration 87580/ 173500 | consumed samples: 22420480 | consumed tokens: 45917143040 | elapsed time per iteration (s): 0.15 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.699938E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.083 | TFLOPs: 26.24 | +7: iteration 87590/ 173500 | consumed samples: 22423040 | consumed tokens: 45922385920 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.703980E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.836 | TFLOPs: 26.25 | +7: iteration 87600/ 173500 | consumed samples: 22425600 | consumed tokens: 45927628800 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.703098E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.306 | TFLOPs: 26.23 | +7: iteration 87610/ 173500 | consumed samples: 22428160 | consumed tokens: 45932871680 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.707085E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.974 | TFLOPs: 26.25 | +7: iteration 87620/ 173500 | consumed samples: 22430720 | consumed tokens: 45938114560 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.704883E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.180 | TFLOPs: 26.22 | +7: iteration 87630/ 173500 | consumed samples: 22433280 | consumed tokens: 45943357440 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.697792E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.646 | TFLOPs: 26.23 | +7: iteration 87640/ 173500 | consumed samples: 22435840 | consumed tokens: 45948600320 | elapsed time per iteration (s): 0.15 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.702154E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.163 | TFLOPs: 26.26 | +7: iteration 87650/ 173500 | consumed samples: 22438400 | consumed tokens: 45953843200 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.703297E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.452 | TFLOPs: 26.23 | +7: iteration 87660/ 173500 | consumed samples: 22440960 | consumed tokens: 45959086080 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.697881E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.775 | TFLOPs: 26.23 | +7: iteration 87670/ 173500 | consumed samples: 22443520 | consumed tokens: 45964328960 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.712106E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.407 | TFLOPs: 26.24 | +7: iteration 87680/ 173500 | consumed samples: 22446080 | consumed tokens: 45969571840 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.703894E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.987 | TFLOPs: 26.25 | +7: iteration 87690/ 173500 | consumed samples: 22448640 | consumed tokens: 45974814720 | elapsed time per iteration (s): 0.15 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.698330E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.900 | TFLOPs: 26.25 | +7: iteration 87700/ 173500 | consumed samples: 22451200 | consumed tokens: 45980057600 | elapsed time per iteration (s): 0.16 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.716927E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.612 | TFLOPs: 25.59 | +7: iteration 87710/ 173500 | consumed samples: 22453760 | consumed tokens: 45985300480 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.714924E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.787 | TFLOPs: 26.23 | +7: iteration 87720/ 173500 | consumed samples: 22456320 | consumed tokens: 45990543360 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.706318E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.618 | TFLOPs: 26.25 | +7: iteration 87730/ 173500 | consumed samples: 22458880 | consumed tokens: 45995786240 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.709970E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.289 | TFLOPs: 26.24 | +7: iteration 87740/ 173500 | consumed samples: 22461440 | consumed tokens: 46001029120 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.706643E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.543 | TFLOPs: 26.25 | +7: iteration 87750/ 173500 | consumed samples: 22464000 | consumed tokens: 46006272000 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.712186E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.333 | TFLOPs: 26.26 | +7: iteration 87760/ 173500 | consumed samples: 22466560 | consumed tokens: 46011514880 | elapsed time per iteration (s): 0.15 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.707801E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.925 | TFLOPs: 26.22 | +7: iteration 87770/ 173500 | consumed samples: 22469120 | consumed tokens: 46016757760 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.711341E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.286 | TFLOPs: 26.24 | +7: iteration 87780/ 173500 | consumed samples: 22471680 | consumed tokens: 46022000640 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.704742E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.486 | TFLOPs: 26.20 | +7: iteration 87790/ 173500 | consumed samples: 22474240 | consumed tokens: 46027243520 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.700507E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.870 | TFLOPs: 26.19 | +7: iteration 87800/ 173500 | consumed samples: 22476800 | consumed tokens: 46032486400 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.695766E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.237 | TFLOPs: 26.18 | +7: iteration 87810/ 173500 | consumed samples: 22479360 | consumed tokens: 46037729280 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.692235E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.614 | TFLOPs: 26.18 | +7: iteration 87820/ 173500 | consumed samples: 22481920 | consumed tokens: 46042972160 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.699071E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.836 | TFLOPs: 26.17 | +7: iteration 87830/ 173500 | consumed samples: 22484480 | consumed tokens: 46048215040 | elapsed time per iteration (s): 0.15 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.699760E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.961 | TFLOPs: 26.19 | +7: iteration 87840/ 173500 | consumed samples: 22487040 | consumed tokens: 46053457920 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.696457E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.219 | TFLOPs: 26.16 | +7: iteration 87850/ 173500 | consumed samples: 22489600 | consumed tokens: 46058700800 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.691365E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.669 | TFLOPs: 26.18 | +7: iteration 87860/ 173500 | consumed samples: 22492160 | consumed tokens: 46063943680 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.696981E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.774 | TFLOPs: 26.19 | +7: iteration 87870/ 173500 | consumed samples: 22494720 | consumed tokens: 46069186560 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.700211E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.109 | TFLOPs: 26.19 | +7: iteration 87880/ 173500 | consumed samples: 22497280 | consumed tokens: 46074429440 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.701625E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.537 | TFLOPs: 26.17 | +7: iteration 87890/ 173500 | consumed samples: 22499840 | consumed tokens: 46079672320 | elapsed time per iteration (s): 0.15 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.714996E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.429 | TFLOPs: 26.15 | +7: iteration 87900/ 173500 | consumed samples: 22502400 | consumed tokens: 46084915200 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.704661E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.628 | TFLOPs: 26.17 | +7: iteration 87910/ 173500 | consumed samples: 22504960 | consumed tokens: 46090158080 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.702556E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.977 | TFLOPs: 26.16 | +7: iteration 87920/ 173500 | consumed samples: 22507520 | consumed tokens: 46095400960 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.715099E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.973 | TFLOPs: 26.16 | +7: iteration 87930/ 173500 | consumed samples: 22510080 | consumed tokens: 46100643840 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.695689E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.332 | TFLOPs: 26.16 | +7: iteration 87940/ 173500 | consumed samples: 22512640 | consumed tokens: 46105886720 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.696276E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.799 | TFLOPs: 26.16 | +7: iteration 87950/ 173500 | consumed samples: 22515200 | consumed tokens: 46111129600 | elapsed time per iteration (s): 0.15 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.704028E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.457 | TFLOPs: 26.17 | +7: iteration 87960/ 173500 | consumed samples: 22517760 | consumed tokens: 46116372480 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.692296E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.704 | TFLOPs: 26.17 | +7: iteration 87970/ 173500 | consumed samples: 22520320 | consumed tokens: 46121615360 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.689011E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.759 | TFLOPs: 26.14 | +7: iteration 87980/ 173500 | consumed samples: 22522880 | consumed tokens: 46126858240 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.699323E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.780 | TFLOPs: 26.15 | +7: iteration 87990/ 173500 | consumed samples: 22525440 | consumed tokens: 46132101120 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.704799E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.263 | TFLOPs: 26.16 | +0: [2023-03-17 04:04:05,713] [INFO] [logging.py:68:log_dist] [Rank 0] step=88000, skipped=0, lr=[0.00010937083470846484, 0.00010937083470846484, 0.00010937083470846484], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 88000/ 173500 | consumed samples: 22528000 | consumed tokens: 46137344000 | elapsed time per iteration (s): 0.15 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.712053E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.452 | TFLOPs: 26.17 | +0: steps: 88000 loss: 3.6768 iter time (s): 0.152 samples/sec: 1685.189 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 88000 | lm loss value: 3.839232E+00 | lm loss PPL: 4.648977E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 88000 to checkpoints_44m91b100m +0: [2023-03-17 04:04:05,788] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step88000 is begin to save! +0: [2023-03-17 04:04:05,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:04:05,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:04:05,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:04:05,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:04:05,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:04:05,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:04:05,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:04:05,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:04:05,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:04:05,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:04:05,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:04:05,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:04:05,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:04:05,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:04:05,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:04:05,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:04:05,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:04:05,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:04:05,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:04:05,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:04:05,923] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step88000/mp_rank_00_model_states.pt +0: [2023-03-17 04:04:05,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:04:05,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:04:05,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:04:05,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:04:05,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +4: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +5: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +7: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +2: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +6: [2023-03-17 04:04:05,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:04:05,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +3: [2023-03-17 04:04:05,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step88000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:04:05,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step88000 is ready now! +0: successfully saved checkpoint at iteration 88000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.71 +7: iteration 88010/ 173500 | consumed samples: 22530560 | consumed tokens: 46142586880 | elapsed time per iteration (s): 0.18 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.696785E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.379 | TFLOPs: 22.49 | +7: iteration 88020/ 173500 | consumed samples: 22533120 | consumed tokens: 46147829760 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.719997E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.931 | TFLOPs: 26.16 | +7: iteration 88030/ 173500 | consumed samples: 22535680 | consumed tokens: 46153072640 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.697346E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.819 | TFLOPs: 26.16 | +7: iteration 88040/ 173500 | consumed samples: 22538240 | consumed tokens: 46158315520 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.706015E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.593 | TFLOPs: 26.17 | +7: iteration 88050/ 173500 | consumed samples: 22540800 | consumed tokens: 46163558400 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.719970E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.916 | TFLOPs: 26.17 | +7: iteration 88060/ 173500 | consumed samples: 22543360 | consumed tokens: 46168801280 | elapsed time per iteration (s): 0.15 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.693576E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.854 | TFLOPs: 26.16 | +7: iteration 88070/ 173500 | consumed samples: 22545920 | consumed tokens: 46174044160 | elapsed time per iteration (s): 0.16 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.709472E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.621 | TFLOPs: 25.60 | +7: iteration 88080/ 173500 | consumed samples: 22548480 | consumed tokens: 46179287040 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.690189E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.350 | TFLOPs: 26.16 | +7: iteration 88090/ 173500 | consumed samples: 22551040 | consumed tokens: 46184529920 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.703580E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.552 | TFLOPs: 26.12 | +7: iteration 88100/ 173500 | consumed samples: 22553600 | consumed tokens: 46189772800 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.705672E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.011 | TFLOPs: 26.13 | +7: iteration 88110/ 173500 | consumed samples: 22556160 | consumed tokens: 46195015680 | elapsed time per iteration (s): 0.15 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.709978E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.918 | TFLOPs: 26.13 | +7: iteration 88120/ 173500 | consumed samples: 22558720 | consumed tokens: 46200258560 | elapsed time per iteration (s): 0.16 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.708123E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.881 | TFLOPs: 25.53 | +7: iteration 88130/ 173500 | consumed samples: 22561280 | consumed tokens: 46205501440 | elapsed time per iteration (s): 0.16 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.712109E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.270 | TFLOPs: 25.19 | +7: iteration 88140/ 173500 | consumed samples: 22563840 | consumed tokens: 46210744320 | elapsed time per iteration (s): 0.16 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.698566E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.818 | TFLOPs: 25.89 | +7: iteration 88150/ 173500 | consumed samples: 22566400 | consumed tokens: 46215987200 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.701304E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.539 | TFLOPs: 25.95 | +7: iteration 88160/ 173500 | consumed samples: 22568960 | consumed tokens: 46221230080 | elapsed time per iteration (s): 0.16 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.705622E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.561 | TFLOPs: 25.46 | +7: iteration 88170/ 173500 | consumed samples: 22571520 | consumed tokens: 46226472960 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.702074E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.395 | TFLOPs: 25.91 | +7: iteration 88180/ 173500 | consumed samples: 22574080 | consumed tokens: 46231715840 | elapsed time per iteration (s): 0.15 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.702898E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.820 | TFLOPs: 26.12 | +7: iteration 88190/ 173500 | consumed samples: 22576640 | consumed tokens: 46236958720 | elapsed time per iteration (s): 0.16 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.707459E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.783 | TFLOPs: 25.64 | +7: iteration 88200/ 173500 | consumed samples: 22579200 | consumed tokens: 46242201600 | elapsed time per iteration (s): 0.16 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.710102E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.746 | TFLOPs: 25.57 | +7: iteration 88210/ 173500 | consumed samples: 22581760 | consumed tokens: 46247444480 | elapsed time per iteration (s): 0.15 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.699475E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.731 | TFLOPs: 25.90 | +7: iteration 88220/ 173500 | consumed samples: 22584320 | consumed tokens: 46252687360 | elapsed time per iteration (s): 0.16 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.704614E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.420 | TFLOPs: 25.41 | +7: iteration 88230/ 173500 | consumed samples: 22586880 | consumed tokens: 46257930240 | elapsed time per iteration (s): 0.16 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.683677E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.025 | TFLOPs: 25.67 | +7: iteration 88240/ 173500 | consumed samples: 22589440 | consumed tokens: 46263173120 | elapsed time per iteration (s): 0.16 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.696329E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.262 | TFLOPs: 25.71 | +7: iteration 88250/ 173500 | consumed samples: 22592000 | consumed tokens: 46268416000 | elapsed time per iteration (s): 0.15 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.697587E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.062 | TFLOPs: 26.11 | +7: iteration 88260/ 173500 | consumed samples: 22594560 | consumed tokens: 46273658880 | elapsed time per iteration (s): 0.16 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.709056E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.269 | TFLOPs: 25.80 | +7: iteration 88270/ 173500 | consumed samples: 22597120 | consumed tokens: 46278901760 | elapsed time per iteration (s): 0.16 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.709866E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.461 | TFLOPs: 25.43 | +7: iteration 88280/ 173500 | consumed samples: 22599680 | consumed tokens: 46284144640 | elapsed time per iteration (s): 0.15 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.699278E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.013 | TFLOPs: 26.17 | +7: iteration 88290/ 173500 | consumed samples: 22602240 | consumed tokens: 46289387520 | elapsed time per iteration (s): 0.16 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.693300E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.268 | TFLOPs: 25.55 | +7: iteration 88300/ 173500 | consumed samples: 22604800 | consumed tokens: 46294630400 | elapsed time per iteration (s): 0.16 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.696899E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.968 | TFLOPs: 25.78 | +7: iteration 88310/ 173500 | consumed samples: 22607360 | consumed tokens: 46299873280 | elapsed time per iteration (s): 0.16 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.711374E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.620 | TFLOPs: 25.59 | +7: iteration 88320/ 173500 | consumed samples: 22609920 | consumed tokens: 46305116160 | elapsed time per iteration (s): 0.16 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.703458E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.985 | TFLOPs: 25.39 | +7: iteration 88330/ 173500 | consumed samples: 22612480 | consumed tokens: 46310359040 | elapsed time per iteration (s): 0.15 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.707371E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.317 | TFLOPs: 26.15 | +7: iteration 88340/ 173500 | consumed samples: 22615040 | consumed tokens: 46315601920 | elapsed time per iteration (s): 0.16 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.704191E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.521 | TFLOPs: 25.55 | +7: iteration 88350/ 173500 | consumed samples: 22617600 | consumed tokens: 46320844800 | elapsed time per iteration (s): 0.16 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.691760E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.810 | TFLOPs: 25.09 | +7: iteration 88360/ 173500 | consumed samples: 22620160 | consumed tokens: 46326087680 | elapsed time per iteration (s): 0.15 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.706366E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.580 | TFLOPs: 26.23 | +7: iteration 88370/ 173500 | consumed samples: 22622720 | consumed tokens: 46331330560 | elapsed time per iteration (s): 0.16 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.706983E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.577 | TFLOPs: 25.32 | +7: iteration 88380/ 173500 | consumed samples: 22625280 | consumed tokens: 46336573440 | elapsed time per iteration (s): 0.16 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.701314E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.042 | TFLOPs: 25.30 | +7: iteration 88390/ 173500 | consumed samples: 22627840 | consumed tokens: 46341816320 | elapsed time per iteration (s): 0.16 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.697332E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.192 | TFLOPs: 25.88 | +7: iteration 88400/ 173500 | consumed samples: 22630400 | consumed tokens: 46347059200 | elapsed time per iteration (s): 0.16 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.696400E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.359 | TFLOPs: 25.60 | +7: iteration 88410/ 173500 | consumed samples: 22632960 | consumed tokens: 46352302080 | elapsed time per iteration (s): 0.15 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.701648E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.474 | TFLOPs: 26.21 | +7: iteration 88420/ 173500 | consumed samples: 22635520 | consumed tokens: 46357544960 | elapsed time per iteration (s): 0.15 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.705231E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.658 | TFLOPs: 26.25 | +7: iteration 88430/ 173500 | consumed samples: 22638080 | consumed tokens: 46362787840 | elapsed time per iteration (s): 0.16 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.686212E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.006 | TFLOPs: 25.86 | +7: iteration 88440/ 173500 | consumed samples: 22640640 | consumed tokens: 46368030720 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.699864E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.819 | TFLOPs: 25.64 | +7: iteration 88450/ 173500 | consumed samples: 22643200 | consumed tokens: 46373273600 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.703735E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.265 | TFLOPs: 25.88 | +7: iteration 88460/ 173500 | consumed samples: 22645760 | consumed tokens: 46378516480 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.700442E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.436 | TFLOPs: 25.19 | +7: iteration 88470/ 173500 | consumed samples: 22648320 | consumed tokens: 46383759360 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.715423E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.852 | TFLOPs: 25.86 | +7: iteration 88480/ 173500 | consumed samples: 22650880 | consumed tokens: 46389002240 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.703797E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.404 | TFLOPs: 25.88 | +7: iteration 88490/ 173500 | consumed samples: 22653440 | consumed tokens: 46394245120 | elapsed time per iteration (s): 0.16 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.683730E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.952 | TFLOPs: 25.20 | +7: iteration 88500/ 173500 | consumed samples: 22656000 | consumed tokens: 46399488000 | elapsed time per iteration (s): 0.15 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.707320E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.137 | TFLOPs: 26.24 | +7: iteration 88510/ 173500 | consumed samples: 22658560 | consumed tokens: 46404730880 | elapsed time per iteration (s): 0.16 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.685977E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.217 | TFLOPs: 25.52 | +7: iteration 88520/ 173500 | consumed samples: 22661120 | consumed tokens: 46409973760 | elapsed time per iteration (s): 0.15 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.710343E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.403 | TFLOPs: 25.93 | +7: iteration 88530/ 173500 | consumed samples: 22663680 | consumed tokens: 46415216640 | elapsed time per iteration (s): 0.16 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.691954E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.131 | TFLOPs: 25.60 | +7: iteration 88540/ 173500 | consumed samples: 22666240 | consumed tokens: 46420459520 | elapsed time per iteration (s): 0.16 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.704659E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.043 | TFLOPs: 25.52 | +7: iteration 88550/ 173500 | consumed samples: 22668800 | consumed tokens: 46425702400 | elapsed time per iteration (s): 0.16 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.695963E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.815 | TFLOPs: 25.81 | +7: iteration 88560/ 173500 | consumed samples: 22671360 | consumed tokens: 46430945280 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.709110E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.827 | TFLOPs: 25.69 | +7: iteration 88570/ 173500 | consumed samples: 22673920 | consumed tokens: 46436188160 | elapsed time per iteration (s): 0.15 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.711097E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.897 | TFLOPs: 26.19 | +7: iteration 88580/ 173500 | consumed samples: 22676480 | consumed tokens: 46441431040 | elapsed time per iteration (s): 0.15 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.710112E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.514 | TFLOPs: 26.18 | +7: iteration 88590/ 173500 | consumed samples: 22679040 | consumed tokens: 46446673920 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.712630E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.781 | TFLOPs: 25.51 | +7: iteration 88600/ 173500 | consumed samples: 22681600 | consumed tokens: 46451916800 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.700540E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.340 | TFLOPs: 25.85 | +7: iteration 88610/ 173500 | consumed samples: 22684160 | consumed tokens: 46457159680 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.704052E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.700 | TFLOPs: 25.56 | +7: iteration 88620/ 173500 | consumed samples: 22686720 | consumed tokens: 46462402560 | elapsed time per iteration (s): 0.16 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.695482E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.234 | TFLOPs: 25.30 | +7: iteration 88630/ 173500 | consumed samples: 22689280 | consumed tokens: 46467645440 | elapsed time per iteration (s): 0.16 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.700764E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.170 | TFLOPs: 25.74 | +7: iteration 88640/ 173500 | consumed samples: 22691840 | consumed tokens: 46472888320 | elapsed time per iteration (s): 0.16 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.700501E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.225 | TFLOPs: 25.47 | +7: iteration 88650/ 173500 | consumed samples: 22694400 | consumed tokens: 46478131200 | elapsed time per iteration (s): 0.16 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.704150E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.495 | TFLOPs: 25.44 | +7: iteration 88660/ 173500 | consumed samples: 22696960 | consumed tokens: 46483374080 | elapsed time per iteration (s): 0.15 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.705918E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.839 | TFLOPs: 26.19 | +7: iteration 88670/ 173500 | consumed samples: 22699520 | consumed tokens: 46488616960 | elapsed time per iteration (s): 0.16 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.697406E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.737 | TFLOPs: 25.61 | +7: iteration 88680/ 173500 | consumed samples: 22702080 | consumed tokens: 46493859840 | elapsed time per iteration (s): 0.16 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.703213E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.760 | TFLOPs: 25.12 | +7: iteration 88690/ 173500 | consumed samples: 22704640 | consumed tokens: 46499102720 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.698021E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.161 | TFLOPs: 25.99 | +7: iteration 88700/ 173500 | consumed samples: 22707200 | consumed tokens: 46504345600 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.701603E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.365 | TFLOPs: 26.20 | +7: iteration 88710/ 173500 | consumed samples: 22709760 | consumed tokens: 46509588480 | elapsed time per iteration (s): 0.16 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.696743E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.452 | TFLOPs: 24.85 | +7: iteration 88720/ 173500 | consumed samples: 22712320 | consumed tokens: 46514831360 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.692836E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.132 | TFLOPs: 26.05 | +7: iteration 88730/ 173500 | consumed samples: 22714880 | consumed tokens: 46520074240 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.711444E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.867 | TFLOPs: 26.19 | +7: iteration 88740/ 173500 | consumed samples: 22717440 | consumed tokens: 46525317120 | elapsed time per iteration (s): 0.15 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.708473E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.247 | TFLOPs: 26.18 | +7: iteration 88750/ 173500 | consumed samples: 22720000 | consumed tokens: 46530560000 | elapsed time per iteration (s): 0.16 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.692953E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.409 | TFLOPs: 25.63 | +7: iteration 88760/ 173500 | consumed samples: 22722560 | consumed tokens: 46535802880 | elapsed time per iteration (s): 0.15 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.698035E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.594 | TFLOPs: 26.18 | +7: iteration 88770/ 173500 | consumed samples: 22725120 | consumed tokens: 46541045760 | elapsed time per iteration (s): 0.15 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.694255E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.770 | TFLOPs: 26.19 | +7: iteration 88780/ 173500 | consumed samples: 22727680 | consumed tokens: 46546288640 | elapsed time per iteration (s): 0.16 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.699609E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.860 | TFLOPs: 25.31 | +7: iteration 88790/ 173500 | consumed samples: 22730240 | consumed tokens: 46551531520 | elapsed time per iteration (s): 0.16 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.702354E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.775 | TFLOPs: 25.87 | +7: iteration 88800/ 173500 | consumed samples: 22732800 | consumed tokens: 46556774400 | elapsed time per iteration (s): 0.16 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.685699E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.834 | TFLOPs: 25.76 | +7: iteration 88810/ 173500 | consumed samples: 22735360 | consumed tokens: 46562017280 | elapsed time per iteration (s): 0.15 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.701627E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.774 | TFLOPs: 26.22 | +7: iteration 88820/ 173500 | consumed samples: 22737920 | consumed tokens: 46567260160 | elapsed time per iteration (s): 0.16 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.707430E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.466 | TFLOPs: 25.87 | +7: iteration 88830/ 173500 | consumed samples: 22740480 | consumed tokens: 46572503040 | elapsed time per iteration (s): 0.16 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.696891E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.415 | TFLOPs: 25.80 | +7: iteration 88840/ 173500 | consumed samples: 22743040 | consumed tokens: 46577745920 | elapsed time per iteration (s): 0.16 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.711857E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.663 | TFLOPs: 25.82 | +7: iteration 88850/ 173500 | consumed samples: 22745600 | consumed tokens: 46582988800 | elapsed time per iteration (s): 0.15 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.712428E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.727 | TFLOPs: 26.20 | +7: iteration 88860/ 173500 | consumed samples: 22748160 | consumed tokens: 46588231680 | elapsed time per iteration (s): 0.16 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.691524E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.345 | TFLOPs: 25.80 | +7: iteration 88870/ 173500 | consumed samples: 22750720 | consumed tokens: 46593474560 | elapsed time per iteration (s): 0.16 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.704490E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.002 | TFLOPs: 25.81 | +7: iteration 88880/ 173500 | consumed samples: 22753280 | consumed tokens: 46598717440 | elapsed time per iteration (s): 0.16 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.692879E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.237 | TFLOPs: 25.88 | +7: iteration 88890/ 173500 | consumed samples: 22755840 | consumed tokens: 46603960320 | elapsed time per iteration (s): 0.15 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.701567E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.914 | TFLOPs: 26.19 | +7: iteration 88900/ 173500 | consumed samples: 22758400 | consumed tokens: 46609203200 | elapsed time per iteration (s): 0.15 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.706765E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.129 | TFLOPs: 26.18 | +7: iteration 88910/ 173500 | consumed samples: 22760960 | consumed tokens: 46614446080 | elapsed time per iteration (s): 0.16 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.697579E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.808 | TFLOPs: 25.21 | +7: iteration 88920/ 173500 | consumed samples: 22763520 | consumed tokens: 46619688960 | elapsed time per iteration (s): 0.15 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.695334E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.135 | TFLOPs: 25.94 | +7: iteration 88930/ 173500 | consumed samples: 22766080 | consumed tokens: 46624931840 | elapsed time per iteration (s): 0.16 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.698390E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.596 | TFLOPs: 25.37 | +7: iteration 88940/ 173500 | consumed samples: 22768640 | consumed tokens: 46630174720 | elapsed time per iteration (s): 0.15 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.705890E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.709 | TFLOPs: 26.14 | +7: iteration 88950/ 173500 | consumed samples: 22771200 | consumed tokens: 46635417600 | elapsed time per iteration (s): 0.15 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.713425E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.462 | TFLOPs: 26.15 | +7: iteration 88960/ 173500 | consumed samples: 22773760 | consumed tokens: 46640660480 | elapsed time per iteration (s): 0.15 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.709663E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.506 | TFLOPs: 26.15 | +7: iteration 88970/ 173500 | consumed samples: 22776320 | consumed tokens: 46645903360 | elapsed time per iteration (s): 0.16 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.705016E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.157 | TFLOPs: 25.63 | +7: iteration 88980/ 173500 | consumed samples: 22778880 | consumed tokens: 46651146240 | elapsed time per iteration (s): 0.16 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.696773E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.430 | TFLOPs: 25.63 | +7: iteration 88990/ 173500 | consumed samples: 22781440 | consumed tokens: 46656389120 | elapsed time per iteration (s): 0.16 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.699867E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.945 | TFLOPs: 25.80 | +7: iteration 89000/ 173500 | consumed samples: 22784000 | consumed tokens: 46661632000 | elapsed time per iteration (s): 0.15 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.711557E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.545 | TFLOPs: 26.18 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 89000 | lm loss value: 3.868203E+00 | lm loss PPL: 4.785633E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 89000 to checkpoints_44m91b100m +0: [2023-03-17 04:06:41,607] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step89000 is begin to save! +0: [2023-03-17 04:06:41,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:06:41,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:06:41,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:06:41,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:06:41,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:06:41,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:06:41,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:06:41,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:06:41,698] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:06:41,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:06:41,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:06:41,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:06:41,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:06:41,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:06:41,722] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:06:41,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:06:41,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:06:41,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:06:41,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:06:41,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:06:41,740] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step89000/mp_rank_00_model_states.pt +0: [2023-03-17 04:06:41,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:06:41,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:06:41,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:06:41,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +3: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +2: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +5: [2023-03-17 04:06:41,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +7: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +1: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +4: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +6: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:06:41,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step89000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:06:41,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step89000 is ready now! +0: successfully saved checkpoint at iteration 89000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.28 +7: iteration 89010/ 173500 | consumed samples: 22786560 | consumed tokens: 46666874880 | elapsed time per iteration (s): 0.18 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.711208E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1437.762 | TFLOPs: 22.55 | +7: iteration 89020/ 173500 | consumed samples: 22789120 | consumed tokens: 46672117760 | elapsed time per iteration (s): 0.16 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.707455E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.738 | TFLOPs: 24.70 | +7: iteration 89030/ 173500 | consumed samples: 22791680 | consumed tokens: 46677360640 | elapsed time per iteration (s): 0.15 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.701928E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.235 | TFLOPs: 26.15 | +7: iteration 89040/ 173500 | consumed samples: 22794240 | consumed tokens: 46682603520 | elapsed time per iteration (s): 0.16 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.700694E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.827 | TFLOPs: 25.61 | +7: iteration 89050/ 173500 | consumed samples: 22796800 | consumed tokens: 46687846400 | elapsed time per iteration (s): 0.16 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.699109E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.698 | TFLOPs: 25.31 | +7: iteration 89060/ 173500 | consumed samples: 22799360 | consumed tokens: 46693089280 | elapsed time per iteration (s): 0.16 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.698095E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.840 | TFLOPs: 25.61 | +7: iteration 89070/ 173500 | consumed samples: 22801920 | consumed tokens: 46698332160 | elapsed time per iteration (s): 0.16 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.705397E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.087 | TFLOPs: 25.69 | +7: iteration 89080/ 173500 | consumed samples: 22804480 | consumed tokens: 46703575040 | elapsed time per iteration (s): 0.16 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.705729E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.622 | TFLOPs: 25.23 | +7: iteration 89090/ 173500 | consumed samples: 22807040 | consumed tokens: 46708817920 | elapsed time per iteration (s): 0.15 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.694793E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.960 | TFLOPs: 26.02 | +7: iteration 89100/ 173500 | consumed samples: 22809600 | consumed tokens: 46714060800 | elapsed time per iteration (s): 0.16 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.695440E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.170 | TFLOPs: 25.86 | +7: iteration 89110/ 173500 | consumed samples: 22812160 | consumed tokens: 46719303680 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.701701E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.864 | TFLOPs: 26.22 | +7: iteration 89120/ 173500 | consumed samples: 22814720 | consumed tokens: 46724546560 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.693335E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.099 | TFLOPs: 26.22 | +7: iteration 89130/ 173500 | consumed samples: 22817280 | consumed tokens: 46729789440 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.705575E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.148 | TFLOPs: 26.22 | +7: iteration 89140/ 173500 | consumed samples: 22819840 | consumed tokens: 46735032320 | elapsed time per iteration (s): 0.16 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.715757E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.724 | TFLOPs: 25.15 | +7: iteration 89150/ 173500 | consumed samples: 22822400 | consumed tokens: 46740275200 | elapsed time per iteration (s): 0.16 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.701315E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.539 | TFLOPs: 25.48 | +7: iteration 89160/ 173500 | consumed samples: 22824960 | consumed tokens: 46745518080 | elapsed time per iteration (s): 0.15 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.699959E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.448 | TFLOPs: 26.18 | +7: iteration 89170/ 173500 | consumed samples: 22827520 | consumed tokens: 46750760960 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.692548E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.947 | TFLOPs: 25.73 | +7: iteration 89180/ 173500 | consumed samples: 22830080 | consumed tokens: 46756003840 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.708479E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.875 | TFLOPs: 24.46 | +7: iteration 89190/ 173500 | consumed samples: 22832640 | consumed tokens: 46761246720 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.701046E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.758 | TFLOPs: 25.40 | +7: iteration 89200/ 173500 | consumed samples: 22835200 | consumed tokens: 46766489600 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.697350E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.554 | TFLOPs: 25.24 | +7: iteration 89210/ 173500 | consumed samples: 22837760 | consumed tokens: 46771732480 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.699448E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.284 | TFLOPs: 24.96 | +7: iteration 89220/ 173500 | consumed samples: 22840320 | consumed tokens: 46776975360 | elapsed time per iteration (s): 0.16 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.704306E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.437 | TFLOPs: 25.84 | +7: iteration 89230/ 173500 | consumed samples: 22842880 | consumed tokens: 46782218240 | elapsed time per iteration (s): 0.16 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.698021E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.203 | TFLOPs: 24.69 | +7: iteration 89240/ 173500 | consumed samples: 22845440 | consumed tokens: 46787461120 | elapsed time per iteration (s): 0.17 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.699155E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.039 | TFLOPs: 23.90 | +7: iteration 89250/ 173500 | consumed samples: 22848000 | consumed tokens: 46792704000 | elapsed time per iteration (s): 0.15 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.717076E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.751 | TFLOPs: 26.00 | +7: iteration 89260/ 173500 | consumed samples: 22850560 | consumed tokens: 46797946880 | elapsed time per iteration (s): 0.16 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.715598E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.346 | TFLOPs: 25.43 | +7: iteration 89270/ 173500 | consumed samples: 22853120 | consumed tokens: 46803189760 | elapsed time per iteration (s): 0.15 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.693382E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.931 | TFLOPs: 26.20 | +7: iteration 89280/ 173500 | consumed samples: 22855680 | consumed tokens: 46808432640 | elapsed time per iteration (s): 0.16 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.700854E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.183 | TFLOPs: 25.11 | +7: iteration 89290/ 173500 | consumed samples: 22858240 | consumed tokens: 46813675520 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.704655E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.979 | TFLOPs: 25.64 | +7: iteration 89300/ 173500 | consumed samples: 22860800 | consumed tokens: 46818918400 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.699001E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.669 | TFLOPs: 25.35 | +7: iteration 89310/ 173500 | consumed samples: 22863360 | consumed tokens: 46824161280 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.707798E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.741 | TFLOPs: 25.86 | +7: iteration 89320/ 173500 | consumed samples: 22865920 | consumed tokens: 46829404160 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.709306E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.734 | TFLOPs: 25.78 | +7: iteration 89330/ 173500 | consumed samples: 22868480 | consumed tokens: 46834647040 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.696960E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.453 | TFLOPs: 25.13 | +7: iteration 89340/ 173500 | consumed samples: 22871040 | consumed tokens: 46839889920 | elapsed time per iteration (s): 0.16 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.703494E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.698 | TFLOPs: 24.37 | +7: iteration 89350/ 173500 | consumed samples: 22873600 | consumed tokens: 46845132800 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.699038E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.810 | TFLOPs: 25.17 | +7: iteration 89360/ 173500 | consumed samples: 22876160 | consumed tokens: 46850375680 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.691988E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.943 | TFLOPs: 25.58 | +7: iteration 89370/ 173500 | consumed samples: 22878720 | consumed tokens: 46855618560 | elapsed time per iteration (s): 0.15 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.704639E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.872 | TFLOPs: 26.19 | +7: iteration 89380/ 173500 | consumed samples: 22881280 | consumed tokens: 46860861440 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.695678E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.434 | TFLOPs: 25.80 | +7: iteration 89390/ 173500 | consumed samples: 22883840 | consumed tokens: 46866104320 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.716556E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.079 | TFLOPs: 25.75 | +7: iteration 89400/ 173500 | consumed samples: 22886400 | consumed tokens: 46871347200 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.713039E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.563 | TFLOPs: 25.01 | +7: iteration 89410/ 173500 | consumed samples: 22888960 | consumed tokens: 46876590080 | elapsed time per iteration (s): 0.16 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.705737E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.910 | TFLOPs: 25.62 | +7: iteration 89420/ 173500 | consumed samples: 22891520 | consumed tokens: 46881832960 | elapsed time per iteration (s): 0.16 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.709473E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.629 | TFLOPs: 25.27 | +7: iteration 89430/ 173500 | consumed samples: 22894080 | consumed tokens: 46887075840 | elapsed time per iteration (s): 0.15 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.702684E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.135 | TFLOPs: 25.93 | +7: iteration 89440/ 173500 | consumed samples: 22896640 | consumed tokens: 46892318720 | elapsed time per iteration (s): 0.16 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.706084E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.563 | TFLOPs: 25.62 | +7: iteration 89450/ 173500 | consumed samples: 22899200 | consumed tokens: 46897561600 | elapsed time per iteration (s): 0.17 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.702405E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.806 | TFLOPs: 24.15 | +7: iteration 89460/ 173500 | consumed samples: 22901760 | consumed tokens: 46902804480 | elapsed time per iteration (s): 0.16 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.695347E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.651 | TFLOPs: 25.51 | +7: iteration 89470/ 173500 | consumed samples: 22904320 | consumed tokens: 46908047360 | elapsed time per iteration (s): 0.16 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.682011E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.693 | TFLOPs: 25.12 | +7: iteration 89480/ 173500 | consumed samples: 22906880 | consumed tokens: 46913290240 | elapsed time per iteration (s): 0.15 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.705181E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.424 | TFLOPs: 26.21 | +7: iteration 89490/ 173500 | consumed samples: 22909440 | consumed tokens: 46918533120 | elapsed time per iteration (s): 0.16 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.698110E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.964 | TFLOPs: 25.66 | +7: iteration 89500/ 173500 | consumed samples: 22912000 | consumed tokens: 46923776000 | elapsed time per iteration (s): 0.16 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.706070E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.634 | TFLOPs: 25.49 | +7: iteration 89510/ 173500 | consumed samples: 22914560 | consumed tokens: 46929018880 | elapsed time per iteration (s): 0.16 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.705037E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.073 | TFLOPs: 25.30 | +7: iteration 89520/ 173500 | consumed samples: 22917120 | consumed tokens: 46934261760 | elapsed time per iteration (s): 0.16 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.690859E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.095 | TFLOPs: 25.19 | +7: iteration 89530/ 173500 | consumed samples: 22919680 | consumed tokens: 46939504640 | elapsed time per iteration (s): 0.15 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.697377E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.593 | TFLOPs: 26.21 | +7: iteration 89540/ 173500 | consumed samples: 22922240 | consumed tokens: 46944747520 | elapsed time per iteration (s): 0.15 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.709575E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.514 | TFLOPs: 25.93 | +7: iteration 89550/ 173500 | consumed samples: 22924800 | consumed tokens: 46949990400 | elapsed time per iteration (s): 0.16 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.698452E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.529 | TFLOPs: 25.79 | +7: iteration 89560/ 173500 | consumed samples: 22927360 | consumed tokens: 46955233280 | elapsed time per iteration (s): 0.16 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.697398E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.707 | TFLOPs: 25.62 | +7: iteration 89570/ 173500 | consumed samples: 22929920 | consumed tokens: 46960476160 | elapsed time per iteration (s): 0.15 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.716598E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.542 | TFLOPs: 26.20 | +7: iteration 89580/ 173500 | consumed samples: 22932480 | consumed tokens: 46965719040 | elapsed time per iteration (s): 0.16 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.684028E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.820 | TFLOPs: 25.81 | +7: iteration 89590/ 173500 | consumed samples: 22935040 | consumed tokens: 46970961920 | elapsed time per iteration (s): 0.16 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.696148E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.725 | TFLOPs: 25.43 | +7: iteration 89600/ 173500 | consumed samples: 22937600 | consumed tokens: 46976204800 | elapsed time per iteration (s): 0.16 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.702783E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.242 | TFLOPs: 25.43 | +7: iteration 89610/ 173500 | consumed samples: 22940160 | consumed tokens: 46981447680 | elapsed time per iteration (s): 0.16 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.694017E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.442 | TFLOPs: 25.49 | +7: iteration 89620/ 173500 | consumed samples: 22942720 | consumed tokens: 46986690560 | elapsed time per iteration (s): 0.16 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.701201E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.234 | TFLOPs: 25.44 | +7: iteration 89630/ 173500 | consumed samples: 22945280 | consumed tokens: 46991933440 | elapsed time per iteration (s): 0.15 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.711837E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.918 | TFLOPs: 26.16 | +7: iteration 89640/ 173500 | consumed samples: 22947840 | consumed tokens: 46997176320 | elapsed time per iteration (s): 0.16 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.710063E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.446 | TFLOPs: 25.65 | +7: iteration 89650/ 173500 | consumed samples: 22950400 | consumed tokens: 47002419200 | elapsed time per iteration (s): 0.16 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.707369E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.374 | TFLOPs: 25.30 | +7: iteration 89660/ 173500 | consumed samples: 22952960 | consumed tokens: 47007662080 | elapsed time per iteration (s): 0.16 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.697770E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.377 | TFLOPs: 25.90 | +7: iteration 89670/ 173500 | consumed samples: 22955520 | consumed tokens: 47012904960 | elapsed time per iteration (s): 0.16 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.711240E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.724 | TFLOPs: 25.12 | +7: iteration 89680/ 173500 | consumed samples: 22958080 | consumed tokens: 47018147840 | elapsed time per iteration (s): 0.16 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.706419E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.677 | TFLOPs: 24.43 | +7: iteration 89690/ 173500 | consumed samples: 22960640 | consumed tokens: 47023390720 | elapsed time per iteration (s): 0.15 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.690012E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.592 | TFLOPs: 26.20 | +7: iteration 89700/ 173500 | consumed samples: 22963200 | consumed tokens: 47028633600 | elapsed time per iteration (s): 0.16 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.700864E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.780 | TFLOPs: 24.56 | +7: iteration 89710/ 173500 | consumed samples: 22965760 | consumed tokens: 47033876480 | elapsed time per iteration (s): 0.16 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.709325E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.458 | TFLOPs: 25.79 | +7: iteration 89720/ 173500 | consumed samples: 22968320 | consumed tokens: 47039119360 | elapsed time per iteration (s): 0.15 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.712049E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.891 | TFLOPs: 26.20 | +7: iteration 89730/ 173500 | consumed samples: 22970880 | consumed tokens: 47044362240 | elapsed time per iteration (s): 0.16 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.718510E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.353 | TFLOPs: 25.16 | +7: iteration 89740/ 173500 | consumed samples: 22973440 | consumed tokens: 47049605120 | elapsed time per iteration (s): 0.16 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.698821E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.651 | TFLOPs: 25.53 | +7: iteration 89750/ 173500 | consumed samples: 22976000 | consumed tokens: 47054848000 | elapsed time per iteration (s): 0.16 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.700306E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.246 | TFLOPs: 25.53 | +7: iteration 89760/ 173500 | consumed samples: 22978560 | consumed tokens: 47060090880 | elapsed time per iteration (s): 0.16 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.680713E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.918 | TFLOPs: 25.80 | +7: iteration 89770/ 173500 | consumed samples: 22981120 | consumed tokens: 47065333760 | elapsed time per iteration (s): 0.16 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.690543E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.381 | TFLOPs: 25.84 | +7: iteration 89780/ 173500 | consumed samples: 22983680 | consumed tokens: 47070576640 | elapsed time per iteration (s): 0.16 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.697899E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.956 | TFLOPs: 25.69 | +7: iteration 89790/ 173500 | consumed samples: 22986240 | consumed tokens: 47075819520 | elapsed time per iteration (s): 0.16 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.708067E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.102 | TFLOPs: 25.11 | +7: iteration 89800/ 173500 | consumed samples: 22988800 | consumed tokens: 47081062400 | elapsed time per iteration (s): 0.15 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.703511E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.446 | TFLOPs: 26.18 | +7: iteration 89810/ 173500 | consumed samples: 22991360 | consumed tokens: 47086305280 | elapsed time per iteration (s): 0.16 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.701305E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.402 | TFLOPs: 25.21 | +7: iteration 89820/ 173500 | consumed samples: 22993920 | consumed tokens: 47091548160 | elapsed time per iteration (s): 0.16 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.681329E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.075 | TFLOPs: 25.66 | +7: iteration 89830/ 173500 | consumed samples: 22996480 | consumed tokens: 47096791040 | elapsed time per iteration (s): 0.16 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.716563E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.892 | TFLOPs: 25.84 | +7: iteration 89840/ 173500 | consumed samples: 22999040 | consumed tokens: 47102033920 | elapsed time per iteration (s): 0.16 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.693627E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.874 | TFLOPs: 25.87 | +7: iteration 89850/ 173500 | consumed samples: 23001600 | consumed tokens: 47107276800 | elapsed time per iteration (s): 0.16 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.702817E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.482 | TFLOPs: 25.52 | +7: iteration 89860/ 173500 | consumed samples: 23004160 | consumed tokens: 47112519680 | elapsed time per iteration (s): 0.16 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.691962E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.938 | TFLOPs: 25.22 | +7: iteration 89870/ 173500 | consumed samples: 23006720 | consumed tokens: 47117762560 | elapsed time per iteration (s): 0.15 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.703468E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.362 | TFLOPs: 26.12 | +7: iteration 89880/ 173500 | consumed samples: 23009280 | consumed tokens: 47123005440 | elapsed time per iteration (s): 0.17 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.707614E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.445 | TFLOPs: 24.06 | +7: iteration 89890/ 173500 | consumed samples: 23011840 | consumed tokens: 47128248320 | elapsed time per iteration (s): 0.16 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.695343E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.999 | TFLOPs: 25.50 | +7: iteration 89900/ 173500 | consumed samples: 23014400 | consumed tokens: 47133491200 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.699067E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.514 | TFLOPs: 25.45 | +7: iteration 89910/ 173500 | consumed samples: 23016960 | consumed tokens: 47138734080 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.706322E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.068 | TFLOPs: 24.94 | +7: iteration 89920/ 173500 | consumed samples: 23019520 | consumed tokens: 47143976960 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.698191E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.717 | TFLOPs: 25.07 | +7: iteration 89930/ 173500 | consumed samples: 23022080 | consumed tokens: 47149219840 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.702171E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.062 | TFLOPs: 25.81 | +7: iteration 89940/ 173500 | consumed samples: 23024640 | consumed tokens: 47154462720 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.699630E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.043 | TFLOPs: 25.41 | +7: iteration 89950/ 173500 | consumed samples: 23027200 | consumed tokens: 47159705600 | elapsed time per iteration (s): 0.16 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.694416E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.626 | TFLOPs: 25.74 | +7: iteration 89960/ 173500 | consumed samples: 23029760 | consumed tokens: 47164948480 | elapsed time per iteration (s): 0.16 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.698975E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.139 | TFLOPs: 25.42 | +7: iteration 89970/ 173500 | consumed samples: 23032320 | consumed tokens: 47170191360 | elapsed time per iteration (s): 0.16 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.695530E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.204 | TFLOPs: 25.46 | +7: iteration 89980/ 173500 | consumed samples: 23034880 | consumed tokens: 47175434240 | elapsed time per iteration (s): 0.15 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.696705E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.841 | TFLOPs: 26.12 | +7: iteration 89990/ 173500 | consumed samples: 23037440 | consumed tokens: 47180677120 | elapsed time per iteration (s): 0.16 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.705338E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.101 | TFLOPs: 24.59 | +0: [2023-03-17 04:09:19,190] [INFO] [logging.py:68:log_dist] [Rank 0] step=90000, skipped=0, lr=[0.00010607986950689534, 0.00010607986950689534, 0.00010607986950689534], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 90000/ 173500 | consumed samples: 23040000 | consumed tokens: 47185920000 | elapsed time per iteration (s): 0.15 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.702509E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.498 | TFLOPs: 25.92 | +0: steps: 90000 loss: 3.7062 iter time (s): 0.155 samples/sec: 1652.737 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 90000 | lm loss value: 3.865008E+00 | lm loss PPL: 4.770364E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 90000 to checkpoints_44m91b100m +0: [2023-03-17 04:09:19,263] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step90000 is begin to save! +0: [2023-03-17 04:09:19,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:09:19,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:09:19,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:09:19,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:09:19,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:09:19,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:09:19,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:09:19,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:09:19,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:09:19,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:09:19,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:09:19,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:09:19,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:09:19,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:09:19,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:09:19,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:09:19,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:09:19,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:09:19,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:09:19,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:09:19,397] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step90000/mp_rank_00_model_states.pt +0: [2023-03-17 04:09:19,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:09:19,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:09:19,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:09:19,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:09:19,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +6: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +3: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +4: [2023-03-17 04:09:19,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +5: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +2: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +1: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +7: [2023-03-17 04:09:19,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:09:19,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step90000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:09:19,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step90000 is ready now! +0: successfully saved checkpoint at iteration 90000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.32 +7: iteration 90010/ 173500 | consumed samples: 23042560 | consumed tokens: 47191162880 | elapsed time per iteration (s): 0.18 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.700297E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1421.329 | TFLOPs: 22.29 | +7: iteration 90020/ 173500 | consumed samples: 23045120 | consumed tokens: 47196405760 | elapsed time per iteration (s): 0.15 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.703238E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.829 | TFLOPs: 26.20 | +7: iteration 90030/ 173500 | consumed samples: 23047680 | consumed tokens: 47201648640 | elapsed time per iteration (s): 0.17 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.706353E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.291 | TFLOPs: 24.14 | +7: iteration 90040/ 173500 | consumed samples: 23050240 | consumed tokens: 47206891520 | elapsed time per iteration (s): 0.15 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.705336E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.877 | TFLOPs: 26.17 | +7: iteration 90050/ 173500 | consumed samples: 23052800 | consumed tokens: 47212134400 | elapsed time per iteration (s): 0.16 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.706686E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.381 | TFLOPs: 25.44 | +7: iteration 90060/ 173500 | consumed samples: 23055360 | consumed tokens: 47217377280 | elapsed time per iteration (s): 0.16 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.699536E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.576 | TFLOPs: 25.02 | +7: iteration 90070/ 173500 | consumed samples: 23057920 | consumed tokens: 47222620160 | elapsed time per iteration (s): 0.16 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.702220E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.108 | TFLOPs: 24.73 | +7: iteration 90080/ 173500 | consumed samples: 23060480 | consumed tokens: 47227863040 | elapsed time per iteration (s): 0.22 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.706562E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1187.936 | TFLOPs: 18.63 | +7: iteration 90090/ 173500 | consumed samples: 23063040 | consumed tokens: 47233105920 | elapsed time per iteration (s): 0.16 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.703302E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.430 | TFLOPs: 24.49 | +7: iteration 90100/ 173500 | consumed samples: 23065600 | consumed tokens: 47238348800 | elapsed time per iteration (s): 0.16 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.692616E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.609 | TFLOPs: 25.23 | +7: iteration 90110/ 173500 | consumed samples: 23068160 | consumed tokens: 47243591680 | elapsed time per iteration (s): 0.16 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.707473E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.617 | TFLOPs: 25.54 | +7: iteration 90120/ 173500 | consumed samples: 23070720 | consumed tokens: 47248834560 | elapsed time per iteration (s): 0.16 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.696312E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.678 | TFLOPs: 24.95 | +7: iteration 90130/ 173500 | consumed samples: 23073280 | consumed tokens: 47254077440 | elapsed time per iteration (s): 0.16 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.703555E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.444 | TFLOPs: 25.52 | +7: iteration 90140/ 173500 | consumed samples: 23075840 | consumed tokens: 47259320320 | elapsed time per iteration (s): 0.16 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.703343E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.485 | TFLOPs: 25.57 | +7: iteration 90150/ 173500 | consumed samples: 23078400 | consumed tokens: 47264563200 | elapsed time per iteration (s): 0.15 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.716427E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.068 | TFLOPs: 26.22 | +7: iteration 90160/ 173500 | consumed samples: 23080960 | consumed tokens: 47269806080 | elapsed time per iteration (s): 0.16 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.706604E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.279 | TFLOPs: 25.46 | +7: iteration 90170/ 173500 | consumed samples: 23083520 | consumed tokens: 47275048960 | elapsed time per iteration (s): 0.15 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.708533E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.739 | TFLOPs: 26.11 | +7: iteration 90180/ 173500 | consumed samples: 23086080 | consumed tokens: 47280291840 | elapsed time per iteration (s): 0.16 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.697839E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.021 | TFLOPs: 25.63 | +7: iteration 90190/ 173500 | consumed samples: 23088640 | consumed tokens: 47285534720 | elapsed time per iteration (s): 0.16 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.700869E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.058 | TFLOPs: 25.28 | +7: iteration 90200/ 173500 | consumed samples: 23091200 | consumed tokens: 47290777600 | elapsed time per iteration (s): 0.15 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.721957E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.877 | TFLOPs: 26.11 | +7: iteration 90210/ 173500 | consumed samples: 23093760 | consumed tokens: 47296020480 | elapsed time per iteration (s): 0.16 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.704066E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.520 | TFLOPs: 25.77 | +7: iteration 90220/ 173500 | consumed samples: 23096320 | consumed tokens: 47301263360 | elapsed time per iteration (s): 0.17 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.692421E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1512.681 | TFLOPs: 23.72 | +7: iteration 90230/ 173500 | consumed samples: 23098880 | consumed tokens: 47306506240 | elapsed time per iteration (s): 0.15 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.716396E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.609 | TFLOPs: 26.09 | +7: iteration 90240/ 173500 | consumed samples: 23101440 | consumed tokens: 47311749120 | elapsed time per iteration (s): 0.16 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.704427E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.352 | TFLOPs: 25.41 | +7: iteration 90250/ 173500 | consumed samples: 23104000 | consumed tokens: 47316992000 | elapsed time per iteration (s): 0.16 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.701856E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.272 | TFLOPs: 25.55 | +7: iteration 90260/ 173500 | consumed samples: 23106560 | consumed tokens: 47322234880 | elapsed time per iteration (s): 0.16 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.692764E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.209 | TFLOPs: 25.75 | +7: iteration 90270/ 173500 | consumed samples: 23109120 | consumed tokens: 47327477760 | elapsed time per iteration (s): 0.15 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.714115E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.186 | TFLOPs: 26.11 | +7: iteration 90280/ 173500 | consumed samples: 23111680 | consumed tokens: 47332720640 | elapsed time per iteration (s): 0.16 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.709713E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.239 | TFLOPs: 24.58 | +7: iteration 90290/ 173500 | consumed samples: 23114240 | consumed tokens: 47337963520 | elapsed time per iteration (s): 0.15 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.693740E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.693 | TFLOPs: 26.11 | +7: iteration 90300/ 173500 | consumed samples: 23116800 | consumed tokens: 47343206400 | elapsed time per iteration (s): 0.16 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.697159E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.786 | TFLOPs: 25.40 | +7: iteration 90310/ 173500 | consumed samples: 23119360 | consumed tokens: 47348449280 | elapsed time per iteration (s): 0.16 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.707236E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.676 | TFLOPs: 25.43 | +7: iteration 90320/ 173500 | consumed samples: 23121920 | consumed tokens: 47353692160 | elapsed time per iteration (s): 0.16 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.697172E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.615 | TFLOPs: 25.16 | +7: iteration 90330/ 173500 | consumed samples: 23124480 | consumed tokens: 47358935040 | elapsed time per iteration (s): 0.17 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.701408E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.185 | TFLOPs: 24.01 | +7: iteration 90340/ 173500 | consumed samples: 23127040 | consumed tokens: 47364177920 | elapsed time per iteration (s): 0.16 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.713091E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.220 | TFLOPs: 25.82 | +7: iteration 90350/ 173500 | consumed samples: 23129600 | consumed tokens: 47369420800 | elapsed time per iteration (s): 0.16 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.705334E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.371 | TFLOPs: 25.88 | +7: iteration 90360/ 173500 | consumed samples: 23132160 | consumed tokens: 47374663680 | elapsed time per iteration (s): 0.16 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.703917E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.265 | TFLOPs: 25.83 | +7: iteration 90370/ 173500 | consumed samples: 23134720 | consumed tokens: 47379906560 | elapsed time per iteration (s): 0.16 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.699075E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.266 | TFLOPs: 25.00 | +7: iteration 90380/ 173500 | consumed samples: 23137280 | consumed tokens: 47385149440 | elapsed time per iteration (s): 0.15 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.708170E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.479 | TFLOPs: 26.13 | +7: iteration 90390/ 173500 | consumed samples: 23139840 | consumed tokens: 47390392320 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.700005E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.310 | TFLOPs: 25.69 | +7: iteration 90400/ 173500 | consumed samples: 23142400 | consumed tokens: 47395635200 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.701208E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.548 | TFLOPs: 25.30 | +7: iteration 90410/ 173500 | consumed samples: 23144960 | consumed tokens: 47400878080 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.703964E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.011 | TFLOPs: 25.34 | +7: iteration 90420/ 173500 | consumed samples: 23147520 | consumed tokens: 47406120960 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.705103E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.341 | TFLOPs: 25.79 | +7: iteration 90430/ 173500 | consumed samples: 23150080 | consumed tokens: 47411363840 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.695611E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.573 | TFLOPs: 25.10 | +7: iteration 90440/ 173500 | consumed samples: 23152640 | consumed tokens: 47416606720 | elapsed time per iteration (s): 0.16 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.701465E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.611 | TFLOPs: 25.82 | +7: iteration 90450/ 173500 | consumed samples: 23155200 | consumed tokens: 47421849600 | elapsed time per iteration (s): 0.16 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.700777E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.031 | TFLOPs: 25.23 | +7: iteration 90460/ 173500 | consumed samples: 23157760 | consumed tokens: 47427092480 | elapsed time per iteration (s): 0.16 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.687986E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.953 | TFLOPs: 25.75 | +7: iteration 90470/ 173500 | consumed samples: 23160320 | consumed tokens: 47432335360 | elapsed time per iteration (s): 0.15 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.705746E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.000 | TFLOPs: 26.25 | +7: iteration 90480/ 173500 | consumed samples: 23162880 | consumed tokens: 47437578240 | elapsed time per iteration (s): 0.16 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.692612E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.134 | TFLOPs: 24.87 | +7: iteration 90490/ 173500 | consumed samples: 23165440 | consumed tokens: 47442821120 | elapsed time per iteration (s): 0.16 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.703186E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.290 | TFLOPs: 25.61 | +7: iteration 90500/ 173500 | consumed samples: 23168000 | consumed tokens: 47448064000 | elapsed time per iteration (s): 0.15 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.706494E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.737 | TFLOPs: 26.17 | +7: iteration 90510/ 173500 | consumed samples: 23170560 | consumed tokens: 47453306880 | elapsed time per iteration (s): 0.16 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.709893E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.445 | TFLOPs: 25.26 | +7: iteration 90520/ 173500 | consumed samples: 23173120 | consumed tokens: 47458549760 | elapsed time per iteration (s): 0.17 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.691228E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.548 | TFLOPs: 23.89 | +7: iteration 90530/ 173500 | consumed samples: 23175680 | consumed tokens: 47463792640 | elapsed time per iteration (s): 0.16 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.695359E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.309 | TFLOPs: 25.57 | +7: iteration 90540/ 173500 | consumed samples: 23178240 | consumed tokens: 47469035520 | elapsed time per iteration (s): 0.15 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.696405E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.212 | TFLOPs: 25.93 | +7: iteration 90550/ 173500 | consumed samples: 23180800 | consumed tokens: 47474278400 | elapsed time per iteration (s): 0.15 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.700751E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.368 | TFLOPs: 25.99 | +7: iteration 90560/ 173500 | consumed samples: 23183360 | consumed tokens: 47479521280 | elapsed time per iteration (s): 0.16 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.706350E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.986 | TFLOPs: 25.56 | +7: iteration 90570/ 173500 | consumed samples: 23185920 | consumed tokens: 47484764160 | elapsed time per iteration (s): 0.15 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.689885E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.661 | TFLOPs: 26.11 | +7: iteration 90580/ 173500 | consumed samples: 23188480 | consumed tokens: 47490007040 | elapsed time per iteration (s): 0.16 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.697010E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.409 | TFLOPs: 25.51 | +7: iteration 90590/ 173500 | consumed samples: 23191040 | consumed tokens: 47495249920 | elapsed time per iteration (s): 0.16 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.700116E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.026 | TFLOPs: 24.87 | +7: iteration 90600/ 173500 | consumed samples: 23193600 | consumed tokens: 47500492800 | elapsed time per iteration (s): 0.17 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.704768E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.707 | TFLOPs: 23.82 | +7: iteration 90610/ 173500 | consumed samples: 23196160 | consumed tokens: 47505735680 | elapsed time per iteration (s): 0.15 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.700831E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.787 | TFLOPs: 25.94 | +7: iteration 90620/ 173500 | consumed samples: 23198720 | consumed tokens: 47510978560 | elapsed time per iteration (s): 0.16 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.692897E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.141 | TFLOPs: 25.31 | +7: iteration 90630/ 173500 | consumed samples: 23201280 | consumed tokens: 47516221440 | elapsed time per iteration (s): 0.16 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.703707E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.806 | TFLOPs: 25.87 | +7: iteration 90640/ 173500 | consumed samples: 23203840 | consumed tokens: 47521464320 | elapsed time per iteration (s): 0.16 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.702230E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.938 | TFLOPs: 25.01 | +7: iteration 90650/ 173500 | consumed samples: 23206400 | consumed tokens: 47526707200 | elapsed time per iteration (s): 0.16 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.705453E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.833 | TFLOPs: 25.56 | +7: iteration 90660/ 173500 | consumed samples: 23208960 | consumed tokens: 47531950080 | elapsed time per iteration (s): 0.15 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.702586E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.492 | TFLOPs: 26.26 | +7: iteration 90670/ 173500 | consumed samples: 23211520 | consumed tokens: 47537192960 | elapsed time per iteration (s): 0.16 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.699123E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.140 | TFLOPs: 25.75 | +7: iteration 90680/ 173500 | consumed samples: 23214080 | consumed tokens: 47542435840 | elapsed time per iteration (s): 0.16 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.696096E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.367 | TFLOPs: 25.18 | +7: iteration 90690/ 173500 | consumed samples: 23216640 | consumed tokens: 47547678720 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.704052E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.155 | TFLOPs: 24.37 | +7: iteration 90700/ 173500 | consumed samples: 23219200 | consumed tokens: 47552921600 | elapsed time per iteration (s): 0.15 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.706239E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.193 | TFLOPs: 26.22 | +7: iteration 90710/ 173500 | consumed samples: 23221760 | consumed tokens: 47558164480 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.693050E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.931 | TFLOPs: 25.66 | +7: iteration 90720/ 173500 | consumed samples: 23224320 | consumed tokens: 47563407360 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.701581E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.315 | TFLOPs: 25.85 | +7: iteration 90730/ 173500 | consumed samples: 23226880 | consumed tokens: 47568650240 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.695816E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.059 | TFLOPs: 24.72 | +7: iteration 90740/ 173500 | consumed samples: 23229440 | consumed tokens: 47573893120 | elapsed time per iteration (s): 0.16 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.695105E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.207 | TFLOPs: 25.83 | +7: iteration 90750/ 173500 | consumed samples: 23232000 | consumed tokens: 47579136000 | elapsed time per iteration (s): 0.16 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.684505E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.830 | TFLOPs: 24.46 | +7: iteration 90760/ 173500 | consumed samples: 23234560 | consumed tokens: 47584378880 | elapsed time per iteration (s): 0.16 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.706923E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.049 | TFLOPs: 25.20 | +7: iteration 90770/ 173500 | consumed samples: 23237120 | consumed tokens: 47589621760 | elapsed time per iteration (s): 0.16 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.703625E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.110 | TFLOPs: 25.77 | +7: iteration 90780/ 173500 | consumed samples: 23239680 | consumed tokens: 47594864640 | elapsed time per iteration (s): 0.15 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.691694E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.507 | TFLOPs: 25.98 | +7: iteration 90790/ 173500 | consumed samples: 23242240 | consumed tokens: 47600107520 | elapsed time per iteration (s): 0.15 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.709402E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.046 | TFLOPs: 25.97 | +7: iteration 90800/ 173500 | consumed samples: 23244800 | consumed tokens: 47605350400 | elapsed time per iteration (s): 0.16 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.702922E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.038 | TFLOPs: 24.83 | +7: iteration 90810/ 173500 | consumed samples: 23247360 | consumed tokens: 47610593280 | elapsed time per iteration (s): 0.16 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.687597E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.569 | TFLOPs: 25.38 | +7: iteration 90820/ 173500 | consumed samples: 23249920 | consumed tokens: 47615836160 | elapsed time per iteration (s): 0.16 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.702436E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.248 | TFLOPs: 25.38 | +7: iteration 90830/ 173500 | consumed samples: 23252480 | consumed tokens: 47621079040 | elapsed time per iteration (s): 0.16 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.698452E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.504 | TFLOPs: 25.66 | +7: iteration 90840/ 173500 | consumed samples: 23255040 | consumed tokens: 47626321920 | elapsed time per iteration (s): 0.16 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.692672E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.408 | TFLOPs: 24.69 | +7: iteration 90850/ 173500 | consumed samples: 23257600 | consumed tokens: 47631564800 | elapsed time per iteration (s): 0.15 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.695926E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.126 | TFLOPs: 25.99 | +7: iteration 90860/ 173500 | consumed samples: 23260160 | consumed tokens: 47636807680 | elapsed time per iteration (s): 0.16 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.705858E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.746 | TFLOPs: 25.34 | +7: iteration 90870/ 173500 | consumed samples: 23262720 | consumed tokens: 47642050560 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.697973E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.777 | TFLOPs: 25.81 | +7: iteration 90880/ 173500 | consumed samples: 23265280 | consumed tokens: 47647293440 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.717968E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.334 | TFLOPs: 25.33 | +7: iteration 90890/ 173500 | consumed samples: 23267840 | consumed tokens: 47652536320 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.689374E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.423 | TFLOPs: 25.15 | +7: iteration 90900/ 173500 | consumed samples: 23270400 | consumed tokens: 47657779200 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.703815E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.960 | TFLOPs: 25.72 | +7: iteration 90910/ 173500 | consumed samples: 23272960 | consumed tokens: 47663022080 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.691109E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.373 | TFLOPs: 25.57 | +7: iteration 90920/ 173500 | consumed samples: 23275520 | consumed tokens: 47668264960 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.709791E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.903 | TFLOPs: 25.34 | +7: iteration 90930/ 173500 | consumed samples: 23278080 | consumed tokens: 47673507840 | elapsed time per iteration (s): 0.16 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.710336E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.966 | TFLOPs: 25.64 | +7: iteration 90940/ 173500 | consumed samples: 23280640 | consumed tokens: 47678750720 | elapsed time per iteration (s): 0.16 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.711384E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.386 | TFLOPs: 24.97 | +7: iteration 90950/ 173500 | consumed samples: 23283200 | consumed tokens: 47683993600 | elapsed time per iteration (s): 0.16 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.699038E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.654 | TFLOPs: 25.65 | +7: iteration 90960/ 173500 | consumed samples: 23285760 | consumed tokens: 47689236480 | elapsed time per iteration (s): 0.16 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.700536E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.820 | TFLOPs: 25.14 | +7: iteration 90970/ 173500 | consumed samples: 23288320 | consumed tokens: 47694479360 | elapsed time per iteration (s): 0.16 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.711661E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.677 | TFLOPs: 25.46 | +7: iteration 90980/ 173500 | consumed samples: 23290880 | consumed tokens: 47699722240 | elapsed time per iteration (s): 0.15 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.692717E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.348 | TFLOPs: 26.24 | +7: iteration 90990/ 173500 | consumed samples: 23293440 | consumed tokens: 47704965120 | elapsed time per iteration (s): 0.16 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.694298E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.617 | TFLOPs: 25.54 | +7: iteration 91000/ 173500 | consumed samples: 23296000 | consumed tokens: 47710208000 | elapsed time per iteration (s): 0.16 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.696503E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.009 | TFLOPs: 24.98 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 91000 | lm loss value: 3.841849E+00 | lm loss PPL: 4.661156E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 91000 to checkpoints_44m91b100m +0: [2023-03-17 04:11:57,906] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step91000 is begin to save! +0: [2023-03-17 04:11:57,911] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:11:57,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:11:57,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:11:57,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:11:57,985] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:11:57,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:11:57,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:11:58,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:11:58,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:11:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:11:58,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:11:58,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:11:58,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:11:58,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:11:58,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:11:58,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:11:58,033] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:11:58,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:11:58,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:11:58,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:11:58,043] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step91000/mp_rank_00_model_states.pt +0: [2023-03-17 04:11:58,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:11:58,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:11:58,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:11:58,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +0: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +0: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +0: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +0: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +1: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +5: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +2: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +3: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +7: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +6: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:11:58,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:11:58,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +4: [2023-03-17 04:11:58,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:11:58,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step91000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:11:58,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step91000 is ready now! +0: successfully saved checkpoint at iteration 91000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 191.49 +7: iteration 91010/ 173500 | consumed samples: 23298560 | consumed tokens: 47715450880 | elapsed time per iteration (s): 0.18 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.696805E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.713 | TFLOPs: 22.50 | +7: iteration 91020/ 173500 | consumed samples: 23301120 | consumed tokens: 47720693760 | elapsed time per iteration (s): 0.16 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.697001E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.757 | TFLOPs: 25.67 | +7: iteration 91030/ 173500 | consumed samples: 23303680 | consumed tokens: 47725936640 | elapsed time per iteration (s): 0.15 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.686141E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.287 | TFLOPs: 25.94 | +7: iteration 91040/ 173500 | consumed samples: 23306240 | consumed tokens: 47731179520 | elapsed time per iteration (s): 0.15 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.705312E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.107 | TFLOPs: 26.05 | +7: iteration 91050/ 173500 | consumed samples: 23308800 | consumed tokens: 47736422400 | elapsed time per iteration (s): 0.16 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.700647E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.055 | TFLOPs: 25.66 | +7: iteration 91060/ 173500 | consumed samples: 23311360 | consumed tokens: 47741665280 | elapsed time per iteration (s): 0.16 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.704580E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.177 | TFLOPs: 25.27 | +7: iteration 91070/ 173500 | consumed samples: 23313920 | consumed tokens: 47746908160 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.691129E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.655 | TFLOPs: 26.22 | +7: iteration 91080/ 173500 | consumed samples: 23316480 | consumed tokens: 47752151040 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.698577E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.904 | TFLOPs: 26.20 | +7: iteration 91090/ 173500 | consumed samples: 23319040 | consumed tokens: 47757393920 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.698052E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.380 | TFLOPs: 26.21 | +7: iteration 91100/ 173500 | consumed samples: 23321600 | consumed tokens: 47762636800 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.699380E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.352 | TFLOPs: 26.04 | +7: iteration 91110/ 173500 | consumed samples: 23324160 | consumed tokens: 47767879680 | elapsed time per iteration (s): 0.15 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.702220E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.293 | TFLOPs: 26.23 | +7: iteration 91120/ 173500 | consumed samples: 23326720 | consumed tokens: 47773122560 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.689986E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.413 | TFLOPs: 26.16 | +7: iteration 91130/ 173500 | consumed samples: 23329280 | consumed tokens: 47778365440 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.707047E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.191 | TFLOPs: 26.26 | +7: iteration 91140/ 173500 | consumed samples: 23331840 | consumed tokens: 47783608320 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.706276E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.651 | TFLOPs: 26.23 | +7: iteration 91150/ 173500 | consumed samples: 23334400 | consumed tokens: 47788851200 | elapsed time per iteration (s): 0.16 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.709875E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.771 | TFLOPs: 25.56 | +7: iteration 91160/ 173500 | consumed samples: 23336960 | consumed tokens: 47794094080 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.706655E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.991 | TFLOPs: 25.94 | +7: iteration 91170/ 173500 | consumed samples: 23339520 | consumed tokens: 47799336960 | elapsed time per iteration (s): 0.15 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.690331E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.151 | TFLOPs: 26.13 | +7: iteration 91180/ 173500 | consumed samples: 23342080 | consumed tokens: 47804579840 | elapsed time per iteration (s): 0.16 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.694041E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.945 | TFLOPs: 25.75 | +7: iteration 91190/ 173500 | consumed samples: 23344640 | consumed tokens: 47809822720 | elapsed time per iteration (s): 0.16 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.702456E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.851 | TFLOPs: 25.36 | +7: iteration 91200/ 173500 | consumed samples: 23347200 | consumed tokens: 47815065600 | elapsed time per iteration (s): 0.16 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.698874E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.287 | TFLOPs: 24.97 | +7: iteration 91210/ 173500 | consumed samples: 23349760 | consumed tokens: 47820308480 | elapsed time per iteration (s): 0.15 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.715263E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.432 | TFLOPs: 26.18 | +7: iteration 91220/ 173500 | consumed samples: 23352320 | consumed tokens: 47825551360 | elapsed time per iteration (s): 0.16 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.694541E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.542 | TFLOPs: 25.38 | +7: iteration 91230/ 173500 | consumed samples: 23354880 | consumed tokens: 47830794240 | elapsed time per iteration (s): 0.15 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.699358E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.121 | TFLOPs: 26.21 | +7: iteration 91240/ 173500 | consumed samples: 23357440 | consumed tokens: 47836037120 | elapsed time per iteration (s): 0.16 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.695538E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.632 | TFLOPs: 24.54 | +7: iteration 91250/ 173500 | consumed samples: 23360000 | consumed tokens: 47841280000 | elapsed time per iteration (s): 0.16 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.699473E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.629 | TFLOPs: 25.84 | +7: iteration 91260/ 173500 | consumed samples: 23362560 | consumed tokens: 47846522880 | elapsed time per iteration (s): 0.16 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.685194E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.053 | TFLOPs: 25.74 | +7: iteration 91270/ 173500 | consumed samples: 23365120 | consumed tokens: 47851765760 | elapsed time per iteration (s): 0.17 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.682395E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.082 | TFLOPs: 24.09 | +7: iteration 91280/ 173500 | consumed samples: 23367680 | consumed tokens: 47857008640 | elapsed time per iteration (s): 0.17 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.698938E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.587 | TFLOPs: 23.88 | +7: iteration 91290/ 173500 | consumed samples: 23370240 | consumed tokens: 47862251520 | elapsed time per iteration (s): 0.16 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.693797E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.070 | TFLOPs: 24.43 | +7: iteration 91300/ 173500 | consumed samples: 23372800 | consumed tokens: 47867494400 | elapsed time per iteration (s): 0.16 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.701184E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.045 | TFLOPs: 24.34 | +7: iteration 91310/ 173500 | consumed samples: 23375360 | consumed tokens: 47872737280 | elapsed time per iteration (s): 0.15 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.698201E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.500 | TFLOPs: 26.17 | +7: iteration 91320/ 173500 | consumed samples: 23377920 | consumed tokens: 47877980160 | elapsed time per iteration (s): 0.15 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.707614E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.542 | TFLOPs: 26.17 | +7: iteration 91330/ 173500 | consumed samples: 23380480 | consumed tokens: 47883223040 | elapsed time per iteration (s): 0.16 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.714056E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.236 | TFLOPs: 25.43 | +7: iteration 91340/ 173500 | consumed samples: 23383040 | consumed tokens: 47888465920 | elapsed time per iteration (s): 0.16 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.688761E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.341 | TFLOPs: 25.14 | +7: iteration 91350/ 173500 | consumed samples: 23385600 | consumed tokens: 47893708800 | elapsed time per iteration (s): 0.15 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.705357E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.172 | TFLOPs: 26.02 | +7: iteration 91360/ 173500 | consumed samples: 23388160 | consumed tokens: 47898951680 | elapsed time per iteration (s): 0.16 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.700527E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.036 | TFLOPs: 25.83 | +7: iteration 91370/ 173500 | consumed samples: 23390720 | consumed tokens: 47904194560 | elapsed time per iteration (s): 0.16 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.705602E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.191 | TFLOPs: 25.88 | +7: iteration 91380/ 173500 | consumed samples: 23393280 | consumed tokens: 47909437440 | elapsed time per iteration (s): 0.16 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.694377E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.968 | TFLOPs: 25.33 | +7: iteration 91390/ 173500 | consumed samples: 23395840 | consumed tokens: 47914680320 | elapsed time per iteration (s): 0.17 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.681868E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.891 | TFLOPs: 24.23 | +7: iteration 91400/ 173500 | consumed samples: 23398400 | consumed tokens: 47919923200 | elapsed time per iteration (s): 0.16 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.697067E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.985 | TFLOPs: 25.11 | +7: iteration 91410/ 173500 | consumed samples: 23400960 | consumed tokens: 47925166080 | elapsed time per iteration (s): 0.16 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.693961E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.630 | TFLOPs: 25.60 | +7: iteration 91420/ 173500 | consumed samples: 23403520 | consumed tokens: 47930408960 | elapsed time per iteration (s): 0.15 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.706885E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.886 | TFLOPs: 25.97 | +7: iteration 91430/ 173500 | consumed samples: 23406080 | consumed tokens: 47935651840 | elapsed time per iteration (s): 0.16 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.704812E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.037 | TFLOPs: 25.72 | +7: iteration 91440/ 173500 | consumed samples: 23408640 | consumed tokens: 47940894720 | elapsed time per iteration (s): 0.16 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.691733E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.985 | TFLOPs: 25.56 | +7: iteration 91450/ 173500 | consumed samples: 23411200 | consumed tokens: 47946137600 | elapsed time per iteration (s): 0.16 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.697784E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.353 | TFLOPs: 25.54 | +7: iteration 91460/ 173500 | consumed samples: 23413760 | consumed tokens: 47951380480 | elapsed time per iteration (s): 0.16 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.705461E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.774 | TFLOPs: 25.65 | +7: iteration 91470/ 173500 | consumed samples: 23416320 | consumed tokens: 47956623360 | elapsed time per iteration (s): 0.15 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.696412E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.837 | TFLOPs: 26.00 | +7: iteration 91480/ 173500 | consumed samples: 23418880 | consumed tokens: 47961866240 | elapsed time per iteration (s): 0.15 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.664697E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.115 | TFLOPs: 26.00 | +7: iteration 91490/ 173500 | consumed samples: 23421440 | consumed tokens: 47967109120 | elapsed time per iteration (s): 0.16 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.696445E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.588 | TFLOPs: 25.49 | +7: iteration 91500/ 173500 | consumed samples: 23424000 | consumed tokens: 47972352000 | elapsed time per iteration (s): 0.15 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.698084E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.263 | TFLOPs: 26.15 | +7: iteration 91510/ 173500 | consumed samples: 23426560 | consumed tokens: 47977594880 | elapsed time per iteration (s): 0.16 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.703035E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.390 | TFLOPs: 25.25 | +7: iteration 91520/ 173500 | consumed samples: 23429120 | consumed tokens: 47982837760 | elapsed time per iteration (s): 0.16 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.707436E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.242 | TFLOPs: 25.21 | +7: iteration 91530/ 173500 | consumed samples: 23431680 | consumed tokens: 47988080640 | elapsed time per iteration (s): 0.16 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.705389E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.290 | TFLOPs: 25.65 | +7: iteration 91540/ 173500 | consumed samples: 23434240 | consumed tokens: 47993323520 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.699625E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.977 | TFLOPs: 25.15 | +7: iteration 91550/ 173500 | consumed samples: 23436800 | consumed tokens: 47998566400 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.696968E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.463 | TFLOPs: 25.84 | +7: iteration 91560/ 173500 | consumed samples: 23439360 | consumed tokens: 48003809280 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.705658E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.550 | TFLOPs: 24.74 | +7: iteration 91570/ 173500 | consumed samples: 23441920 | consumed tokens: 48009052160 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.697405E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.652 | TFLOPs: 25.01 | +7: iteration 91580/ 173500 | consumed samples: 23444480 | consumed tokens: 48014295040 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.706707E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.094 | TFLOPs: 25.11 | +7: iteration 91590/ 173500 | consumed samples: 23447040 | consumed tokens: 48019537920 | elapsed time per iteration (s): 0.16 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.694600E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.047 | TFLOPs: 24.83 | +7: iteration 91600/ 173500 | consumed samples: 23449600 | consumed tokens: 48024780800 | elapsed time per iteration (s): 0.15 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.697107E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.870 | TFLOPs: 25.98 | +7: iteration 91610/ 173500 | consumed samples: 23452160 | consumed tokens: 48030023680 | elapsed time per iteration (s): 0.15 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.689297E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.044 | TFLOPs: 26.05 | +7: iteration 91620/ 173500 | consumed samples: 23454720 | consumed tokens: 48035266560 | elapsed time per iteration (s): 0.16 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.691069E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.152 | TFLOPs: 25.50 | +7: iteration 91630/ 173500 | consumed samples: 23457280 | consumed tokens: 48040509440 | elapsed time per iteration (s): 0.16 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.696529E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.510 | TFLOPs: 25.79 | +7: iteration 91640/ 173500 | consumed samples: 23459840 | consumed tokens: 48045752320 | elapsed time per iteration (s): 0.15 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.707224E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.015 | TFLOPs: 25.99 | +7: iteration 91650/ 173500 | consumed samples: 23462400 | consumed tokens: 48050995200 | elapsed time per iteration (s): 0.16 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.682090E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.041 | TFLOPs: 25.28 | +7: iteration 91660/ 173500 | consumed samples: 23464960 | consumed tokens: 48056238080 | elapsed time per iteration (s): 0.16 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.697762E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.073 | TFLOPs: 25.23 | +7: iteration 91670/ 173500 | consumed samples: 23467520 | consumed tokens: 48061480960 | elapsed time per iteration (s): 0.16 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.690048E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.886 | TFLOPs: 25.70 | +7: iteration 91680/ 173500 | consumed samples: 23470080 | consumed tokens: 48066723840 | elapsed time per iteration (s): 0.16 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.703073E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.440 | TFLOPs: 25.38 | +7: iteration 91690/ 173500 | consumed samples: 23472640 | consumed tokens: 48071966720 | elapsed time per iteration (s): 0.15 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.719984E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.490 | TFLOPs: 26.24 | +7: iteration 91700/ 173500 | consumed samples: 23475200 | consumed tokens: 48077209600 | elapsed time per iteration (s): 0.16 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.697374E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.851 | TFLOPs: 25.53 | +7: iteration 91710/ 173500 | consumed samples: 23477760 | consumed tokens: 48082452480 | elapsed time per iteration (s): 0.15 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.709316E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.475 | TFLOPs: 26.26 | +7: iteration 91720/ 173500 | consumed samples: 23480320 | consumed tokens: 48087695360 | elapsed time per iteration (s): 0.16 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.699062E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.325 | TFLOPs: 25.55 | +7: iteration 91730/ 173500 | consumed samples: 23482880 | consumed tokens: 48092938240 | elapsed time per iteration (s): 0.15 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.697744E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.792 | TFLOPs: 25.90 | +7: iteration 91740/ 173500 | consumed samples: 23485440 | consumed tokens: 48098181120 | elapsed time per iteration (s): 0.16 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.697877E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.301 | TFLOPs: 25.87 | +7: iteration 91750/ 173500 | consumed samples: 23488000 | consumed tokens: 48103424000 | elapsed time per iteration (s): 0.16 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.696991E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.497 | TFLOPs: 25.26 | +7: iteration 91760/ 173500 | consumed samples: 23490560 | consumed tokens: 48108666880 | elapsed time per iteration (s): 0.16 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.690015E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.439 | TFLOPs: 25.69 | +7: iteration 91770/ 173500 | consumed samples: 23493120 | consumed tokens: 48113909760 | elapsed time per iteration (s): 0.15 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.690128E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.427 | TFLOPs: 26.26 | +7: iteration 91780/ 173500 | consumed samples: 23495680 | consumed tokens: 48119152640 | elapsed time per iteration (s): 0.15 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.696117E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.465 | TFLOPs: 26.28 | +7: iteration 91790/ 173500 | consumed samples: 23498240 | consumed tokens: 48124395520 | elapsed time per iteration (s): 0.16 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.703185E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.485 | TFLOPs: 25.84 | +7: iteration 91800/ 173500 | consumed samples: 23500800 | consumed tokens: 48129638400 | elapsed time per iteration (s): 0.16 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.692133E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.532 | TFLOPs: 24.39 | +7: iteration 91810/ 173500 | consumed samples: 23503360 | consumed tokens: 48134881280 | elapsed time per iteration (s): 0.15 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.697098E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.987 | TFLOPs: 26.28 | +7: iteration 91820/ 173500 | consumed samples: 23505920 | consumed tokens: 48140124160 | elapsed time per iteration (s): 0.16 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.699877E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.942 | TFLOPs: 25.37 | +7: iteration 91830/ 173500 | consumed samples: 23508480 | consumed tokens: 48145367040 | elapsed time per iteration (s): 0.15 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.709112E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.239 | TFLOPs: 26.26 | +7: iteration 91840/ 173500 | consumed samples: 23511040 | consumed tokens: 48150609920 | elapsed time per iteration (s): 0.16 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.709697E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.646 | TFLOPs: 25.31 | +7: iteration 91850/ 173500 | consumed samples: 23513600 | consumed tokens: 48155852800 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.703221E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.152 | TFLOPs: 26.25 | +7: iteration 91860/ 173500 | consumed samples: 23516160 | consumed tokens: 48161095680 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.713295E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.842 | TFLOPs: 26.25 | +7: iteration 91870/ 173500 | consumed samples: 23518720 | consumed tokens: 48166338560 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.700573E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.817 | TFLOPs: 26.25 | +7: iteration 91880/ 173500 | consumed samples: 23521280 | consumed tokens: 48171581440 | elapsed time per iteration (s): 0.16 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.711676E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.105 | TFLOPs: 25.75 | +7: iteration 91890/ 173500 | consumed samples: 23523840 | consumed tokens: 48176824320 | elapsed time per iteration (s): 0.15 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.703643E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.438 | TFLOPs: 26.07 | +7: iteration 91900/ 173500 | consumed samples: 23526400 | consumed tokens: 48182067200 | elapsed time per iteration (s): 0.16 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.711857E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.550 | TFLOPs: 25.63 | +7: iteration 91910/ 173500 | consumed samples: 23528960 | consumed tokens: 48187310080 | elapsed time per iteration (s): 0.16 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.701670E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.100 | TFLOPs: 25.31 | +7: iteration 91920/ 173500 | consumed samples: 23531520 | consumed tokens: 48192552960 | elapsed time per iteration (s): 0.15 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.703626E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.490 | TFLOPs: 25.96 | +7: iteration 91930/ 173500 | consumed samples: 23534080 | consumed tokens: 48197795840 | elapsed time per iteration (s): 0.16 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.696091E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.839 | TFLOPs: 25.70 | +7: iteration 91940/ 173500 | consumed samples: 23536640 | consumed tokens: 48203038720 | elapsed time per iteration (s): 0.16 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.716597E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.652 | TFLOPs: 24.63 | +7: iteration 91950/ 173500 | consumed samples: 23539200 | consumed tokens: 48208281600 | elapsed time per iteration (s): 0.16 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.693468E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.392 | TFLOPs: 25.11 | +7: iteration 91960/ 173500 | consumed samples: 23541760 | consumed tokens: 48213524480 | elapsed time per iteration (s): 0.16 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.699983E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.532 | TFLOPs: 24.90 | +7: iteration 91970/ 173500 | consumed samples: 23544320 | consumed tokens: 48218767360 | elapsed time per iteration (s): 0.15 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.706132E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.087 | TFLOPs: 26.13 | +7: iteration 91980/ 173500 | consumed samples: 23546880 | consumed tokens: 48224010240 | elapsed time per iteration (s): 0.16 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.697248E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.281 | TFLOPs: 25.71 | +7: iteration 91990/ 173500 | consumed samples: 23549440 | consumed tokens: 48229253120 | elapsed time per iteration (s): 0.15 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.686916E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.787 | TFLOPs: 25.92 | +0: [2023-03-17 04:14:34,880] [INFO] [logging.py:68:log_dist] [Rank 0] step=92000, skipped=0, lr=[0.0001027941492351335, 0.0001027941492351335, 0.0001027941492351335], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 92000/ 173500 | consumed samples: 23552000 | consumed tokens: 48234496000 | elapsed time per iteration (s): 0.16 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.703879E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.055 | TFLOPs: 24.72 | +0: steps: 92000 loss: 3.7118 iter time (s): 0.156 samples/sec: 1641.005 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 92000 | lm loss value: 3.833118E+00 | lm loss PPL: 4.620641E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 92000 to checkpoints_44m91b100m +0: [2023-03-17 04:14:34,956] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step92000 is begin to save! +0: [2023-03-17 04:14:34,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:14:35,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:14:35,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:14:35,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:14:35,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:14:35,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:14:35,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:14:35,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:14:35,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:14:35,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:14:35,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:14:35,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:14:35,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:14:35,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:14:35,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:14:35,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:14:35,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:14:35,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:14:35,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:14:35,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:14:35,087] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step92000/mp_rank_00_model_states.pt +0: [2023-03-17 04:14:35,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:14:35,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:14:35,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:14:35,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:14:35,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:14:35,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:14:35,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 04:14:35,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +7: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +3: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +4: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +1: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +5: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +2: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +6: [2023-03-17 04:14:35,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step92000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:14:35,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step92000 is ready now! +0: successfully saved checkpoint at iteration 92000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.53 +7: iteration 92010/ 173500 | consumed samples: 23554560 | consumed tokens: 48239738880 | elapsed time per iteration (s): 0.18 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.716109E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.637 | TFLOPs: 21.97 | +7: iteration 92020/ 173500 | consumed samples: 23557120 | consumed tokens: 48244981760 | elapsed time per iteration (s): 0.16 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.696463E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.854 | TFLOPs: 25.14 | +7: iteration 92030/ 173500 | consumed samples: 23559680 | consumed tokens: 48250224640 | elapsed time per iteration (s): 0.15 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.709293E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.086 | TFLOPs: 26.16 | +7: iteration 92040/ 173500 | consumed samples: 23562240 | consumed tokens: 48255467520 | elapsed time per iteration (s): 0.16 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.687286E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.961 | TFLOPs: 25.80 | +7: iteration 92050/ 173500 | consumed samples: 23564800 | consumed tokens: 48260710400 | elapsed time per iteration (s): 0.15 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.710342E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.417 | TFLOPs: 26.16 | +7: iteration 92060/ 173500 | consumed samples: 23567360 | consumed tokens: 48265953280 | elapsed time per iteration (s): 0.15 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.691616E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.795 | TFLOPs: 25.97 | +7: iteration 92070/ 173500 | consumed samples: 23569920 | consumed tokens: 48271196160 | elapsed time per iteration (s): 0.16 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.693857E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.584 | TFLOPs: 25.15 | +7: iteration 92080/ 173500 | consumed samples: 23572480 | consumed tokens: 48276439040 | elapsed time per iteration (s): 0.15 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.696455E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.224 | TFLOPs: 26.13 | +7: iteration 92090/ 173500 | consumed samples: 23575040 | consumed tokens: 48281681920 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.699312E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.748 | TFLOPs: 26.19 | +7: iteration 92100/ 173500 | consumed samples: 23577600 | consumed tokens: 48286924800 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.702456E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.300 | TFLOPs: 26.15 | +7: iteration 92110/ 173500 | consumed samples: 23580160 | consumed tokens: 48292167680 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.692375E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.612 | TFLOPs: 26.09 | +7: iteration 92120/ 173500 | consumed samples: 23582720 | consumed tokens: 48297410560 | elapsed time per iteration (s): 0.16 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.683138E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.774 | TFLOPs: 25.51 | +7: iteration 92130/ 173500 | consumed samples: 23585280 | consumed tokens: 48302653440 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.698573E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.844 | TFLOPs: 26.20 | +7: iteration 92140/ 173500 | consumed samples: 23587840 | consumed tokens: 48307896320 | elapsed time per iteration (s): 0.15 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.692040E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.360 | TFLOPs: 26.24 | +7: iteration 92150/ 173500 | consumed samples: 23590400 | consumed tokens: 48313139200 | elapsed time per iteration (s): 0.16 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.700600E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.036 | TFLOPs: 25.59 | +7: iteration 92160/ 173500 | consumed samples: 23592960 | consumed tokens: 48318382080 | elapsed time per iteration (s): 0.16 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.705561E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.167 | TFLOPs: 25.83 | +7: iteration 92170/ 173500 | consumed samples: 23595520 | consumed tokens: 48323624960 | elapsed time per iteration (s): 0.15 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.705880E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.788 | TFLOPs: 26.23 | +7: iteration 92180/ 173500 | consumed samples: 23598080 | consumed tokens: 48328867840 | elapsed time per iteration (s): 0.16 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.713037E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.302 | TFLOPs: 25.61 | +7: iteration 92190/ 173500 | consumed samples: 23600640 | consumed tokens: 48334110720 | elapsed time per iteration (s): 0.16 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.694852E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.716 | TFLOPs: 25.75 | +7: iteration 92200/ 173500 | consumed samples: 23603200 | consumed tokens: 48339353600 | elapsed time per iteration (s): 0.16 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.697861E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.830 | TFLOPs: 24.67 | +7: iteration 92210/ 173500 | consumed samples: 23605760 | consumed tokens: 48344596480 | elapsed time per iteration (s): 0.16 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.687022E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.765 | TFLOPs: 25.06 | +7: iteration 92220/ 173500 | consumed samples: 23608320 | consumed tokens: 48349839360 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.690532E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.201 | TFLOPs: 26.13 | +7: iteration 92230/ 173500 | consumed samples: 23610880 | consumed tokens: 48355082240 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.717225E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.944 | TFLOPs: 26.06 | +7: iteration 92240/ 173500 | consumed samples: 23613440 | consumed tokens: 48360325120 | elapsed time per iteration (s): 0.16 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.688552E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.310 | TFLOPs: 25.74 | +7: iteration 92250/ 173500 | consumed samples: 23616000 | consumed tokens: 48365568000 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.693528E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.131 | TFLOPs: 25.96 | +7: iteration 92260/ 173500 | consumed samples: 23618560 | consumed tokens: 48370810880 | elapsed time per iteration (s): 0.15 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.696466E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.776 | TFLOPs: 25.94 | +7: iteration 92270/ 173500 | consumed samples: 23621120 | consumed tokens: 48376053760 | elapsed time per iteration (s): 0.16 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.704619E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.000 | TFLOPs: 24.51 | +7: iteration 92280/ 173500 | consumed samples: 23623680 | consumed tokens: 48381296640 | elapsed time per iteration (s): 0.16 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.699194E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.629 | TFLOPs: 25.84 | +7: iteration 92290/ 173500 | consumed samples: 23626240 | consumed tokens: 48386539520 | elapsed time per iteration (s): 0.16 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.681853E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.475 | TFLOPs: 25.66 | +7: iteration 92300/ 173500 | consumed samples: 23628800 | consumed tokens: 48391782400 | elapsed time per iteration (s): 0.16 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.694717E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.310 | TFLOPs: 25.69 | +7: iteration 92310/ 173500 | consumed samples: 23631360 | consumed tokens: 48397025280 | elapsed time per iteration (s): 0.15 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.698164E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.765 | TFLOPs: 26.20 | +7: iteration 92320/ 173500 | consumed samples: 23633920 | consumed tokens: 48402268160 | elapsed time per iteration (s): 0.16 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.709280E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.502 | TFLOPs: 25.08 | +7: iteration 92330/ 173500 | consumed samples: 23636480 | consumed tokens: 48407511040 | elapsed time per iteration (s): 0.16 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.698998E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.262 | TFLOPs: 25.63 | +7: iteration 92340/ 173500 | consumed samples: 23639040 | consumed tokens: 48412753920 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.701033E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.294 | TFLOPs: 26.18 | +7: iteration 92350/ 173500 | consumed samples: 23641600 | consumed tokens: 48417996800 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.693092E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.735 | TFLOPs: 26.15 | +7: iteration 92360/ 173500 | consumed samples: 23644160 | consumed tokens: 48423239680 | elapsed time per iteration (s): 0.16 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.704217E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.241 | TFLOPs: 25.16 | +7: iteration 92370/ 173500 | consumed samples: 23646720 | consumed tokens: 48428482560 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.688169E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.376 | TFLOPs: 26.15 | +7: iteration 92380/ 173500 | consumed samples: 23649280 | consumed tokens: 48433725440 | elapsed time per iteration (s): 0.15 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.703059E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.695 | TFLOPs: 26.17 | +7: iteration 92390/ 173500 | consumed samples: 23651840 | consumed tokens: 48438968320 | elapsed time per iteration (s): 0.16 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.695406E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.848 | TFLOPs: 25.64 | +7: iteration 92400/ 173500 | consumed samples: 23654400 | consumed tokens: 48444211200 | elapsed time per iteration (s): 0.16 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.699271E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.392 | TFLOPs: 25.55 | +7: iteration 92410/ 173500 | consumed samples: 23656960 | consumed tokens: 48449454080 | elapsed time per iteration (s): 0.16 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.699808E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.231 | TFLOPs: 25.83 | +7: iteration 92420/ 173500 | consumed samples: 23659520 | consumed tokens: 48454696960 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.703151E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.129 | TFLOPs: 25.96 | +7: iteration 92430/ 173500 | consumed samples: 23662080 | consumed tokens: 48459939840 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.690047E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.740 | TFLOPs: 25.97 | +7: iteration 92440/ 173500 | consumed samples: 23664640 | consumed tokens: 48465182720 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.688566E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.676 | TFLOPs: 26.22 | +7: iteration 92450/ 173500 | consumed samples: 23667200 | consumed tokens: 48470425600 | elapsed time per iteration (s): 0.15 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.711560E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.046 | TFLOPs: 26.22 | +7: iteration 92460/ 173500 | consumed samples: 23669760 | consumed tokens: 48475668480 | elapsed time per iteration (s): 0.15 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.707473E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.540 | TFLOPs: 26.21 | +7: iteration 92470/ 173500 | consumed samples: 23672320 | consumed tokens: 48480911360 | elapsed time per iteration (s): 0.16 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.689683E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.665 | TFLOPs: 25.79 | +7: iteration 92480/ 173500 | consumed samples: 23674880 | consumed tokens: 48486154240 | elapsed time per iteration (s): 0.15 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.688923E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.894 | TFLOPs: 26.22 | +7: iteration 92490/ 173500 | consumed samples: 23677440 | consumed tokens: 48491397120 | elapsed time per iteration (s): 0.16 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.702441E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.075 | TFLOPs: 25.88 | +7: iteration 92500/ 173500 | consumed samples: 23680000 | consumed tokens: 48496640000 | elapsed time per iteration (s): 0.16 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.705472E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.177 | TFLOPs: 25.68 | +7: iteration 92510/ 173500 | consumed samples: 23682560 | consumed tokens: 48501882880 | elapsed time per iteration (s): 0.15 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.696381E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.092 | TFLOPs: 26.22 | +7: iteration 92520/ 173500 | consumed samples: 23685120 | consumed tokens: 48507125760 | elapsed time per iteration (s): 0.16 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.707088E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.846 | TFLOPs: 24.93 | +7: iteration 92530/ 173500 | consumed samples: 23687680 | consumed tokens: 48512368640 | elapsed time per iteration (s): 0.16 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.698160E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.701 | TFLOPs: 25.89 | +7: iteration 92540/ 173500 | consumed samples: 23690240 | consumed tokens: 48517611520 | elapsed time per iteration (s): 0.16 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.693315E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.074 | TFLOPs: 25.80 | +7: iteration 92550/ 173500 | consumed samples: 23692800 | consumed tokens: 48522854400 | elapsed time per iteration (s): 0.15 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.697396E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.101 | TFLOPs: 26.13 | +7: iteration 92560/ 173500 | consumed samples: 23695360 | consumed tokens: 48528097280 | elapsed time per iteration (s): 0.15 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.694854E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.580 | TFLOPs: 26.12 | +7: iteration 92570/ 173500 | consumed samples: 23697920 | consumed tokens: 48533340160 | elapsed time per iteration (s): 0.15 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.689988E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.493 | TFLOPs: 26.17 | +7: iteration 92580/ 173500 | consumed samples: 23700480 | consumed tokens: 48538583040 | elapsed time per iteration (s): 0.16 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.702251E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.992 | TFLOPs: 25.33 | +7: iteration 92590/ 173500 | consumed samples: 23703040 | consumed tokens: 48543825920 | elapsed time per iteration (s): 0.16 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.701800E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.729 | TFLOPs: 25.81 | +7: iteration 92600/ 173500 | consumed samples: 23705600 | consumed tokens: 48549068800 | elapsed time per iteration (s): 0.16 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.701729E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.814 | TFLOPs: 25.65 | +7: iteration 92610/ 173500 | consumed samples: 23708160 | consumed tokens: 48554311680 | elapsed time per iteration (s): 0.16 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.688765E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.428 | TFLOPs: 25.79 | +7: iteration 92620/ 173500 | consumed samples: 23710720 | consumed tokens: 48559554560 | elapsed time per iteration (s): 0.15 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.698863E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.625 | TFLOPs: 26.23 | +7: iteration 92630/ 173500 | consumed samples: 23713280 | consumed tokens: 48564797440 | elapsed time per iteration (s): 0.15 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.689423E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.385 | TFLOPs: 26.24 | +7: iteration 92640/ 173500 | consumed samples: 23715840 | consumed tokens: 48570040320 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.702312E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.858 | TFLOPs: 26.23 | +7: iteration 92650/ 173500 | consumed samples: 23718400 | consumed tokens: 48575283200 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.689722E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.968 | TFLOPs: 26.24 | +7: iteration 92660/ 173500 | consumed samples: 23720960 | consumed tokens: 48580526080 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.695626E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.417 | TFLOPs: 26.24 | +7: iteration 92670/ 173500 | consumed samples: 23723520 | consumed tokens: 48585768960 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.690873E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.752 | TFLOPs: 26.06 | +7: iteration 92680/ 173500 | consumed samples: 23726080 | consumed tokens: 48591011840 | elapsed time per iteration (s): 0.15 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.706238E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.693 | TFLOPs: 26.23 | +7: iteration 92690/ 173500 | consumed samples: 23728640 | consumed tokens: 48596254720 | elapsed time per iteration (s): 0.16 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.694136E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.943 | TFLOPs: 25.83 | +7: iteration 92700/ 173500 | consumed samples: 23731200 | consumed tokens: 48601497600 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.715297E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.359 | TFLOPs: 26.24 | +7: iteration 92710/ 173500 | consumed samples: 23733760 | consumed tokens: 48606740480 | elapsed time per iteration (s): 0.16 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.710929E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.235 | TFLOPs: 25.83 | +7: iteration 92720/ 173500 | consumed samples: 23736320 | consumed tokens: 48611983360 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.711166E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.383 | TFLOPs: 26.21 | +7: iteration 92730/ 173500 | consumed samples: 23738880 | consumed tokens: 48617226240 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.695545E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.295 | TFLOPs: 26.23 | +7: iteration 92740/ 173500 | consumed samples: 23741440 | consumed tokens: 48622469120 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.694625E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.261 | TFLOPs: 26.24 | +7: iteration 92750/ 173500 | consumed samples: 23744000 | consumed tokens: 48627712000 | elapsed time per iteration (s): 0.15 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.688454E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.219 | TFLOPs: 26.24 | +7: iteration 92760/ 173500 | consumed samples: 23746560 | consumed tokens: 48632954880 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.699815E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.058 | TFLOPs: 26.24 | +7: iteration 92770/ 173500 | consumed samples: 23749120 | consumed tokens: 48638197760 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.696016E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.283 | TFLOPs: 26.24 | +7: iteration 92780/ 173500 | consumed samples: 23751680 | consumed tokens: 48643440640 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.705014E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.895 | TFLOPs: 26.22 | +7: iteration 92790/ 173500 | consumed samples: 23754240 | consumed tokens: 48648683520 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.696287E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.780 | TFLOPs: 26.23 | +7: iteration 92800/ 173500 | consumed samples: 23756800 | consumed tokens: 48653926400 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.698985E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.391 | TFLOPs: 26.24 | +7: iteration 92810/ 173500 | consumed samples: 23759360 | consumed tokens: 48659169280 | elapsed time per iteration (s): 0.15 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.710288E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.608 | TFLOPs: 26.17 | +7: iteration 92820/ 173500 | consumed samples: 23761920 | consumed tokens: 48664412160 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.696470E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.497 | TFLOPs: 26.18 | +7: iteration 92830/ 173500 | consumed samples: 23764480 | consumed tokens: 48669655040 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.710000E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.547 | TFLOPs: 26.17 | +7: iteration 92840/ 173500 | consumed samples: 23767040 | consumed tokens: 48674897920 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.706709E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.795 | TFLOPs: 26.16 | +7: iteration 92850/ 173500 | consumed samples: 23769600 | consumed tokens: 48680140800 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.702656E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.345 | TFLOPs: 26.23 | +7: iteration 92860/ 173500 | consumed samples: 23772160 | consumed tokens: 48685383680 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.702961E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.927 | TFLOPs: 26.25 | +7: iteration 92870/ 173500 | consumed samples: 23774720 | consumed tokens: 48690626560 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.693740E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.813 | TFLOPs: 26.25 | +7: iteration 92880/ 173500 | consumed samples: 23777280 | consumed tokens: 48695869440 | elapsed time per iteration (s): 0.15 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.708000E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.197 | TFLOPs: 26.22 | +7: iteration 92890/ 173500 | consumed samples: 23779840 | consumed tokens: 48701112320 | elapsed time per iteration (s): 0.16 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.700336E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.511 | TFLOPs: 25.60 | +7: iteration 92900/ 173500 | consumed samples: 23782400 | consumed tokens: 48706355200 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.705766E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.005 | TFLOPs: 26.24 | +7: iteration 92910/ 173500 | consumed samples: 23784960 | consumed tokens: 48711598080 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.711685E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.077 | TFLOPs: 26.25 | +7: iteration 92920/ 173500 | consumed samples: 23787520 | consumed tokens: 48716840960 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.706274E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.342 | TFLOPs: 26.24 | +7: iteration 92930/ 173500 | consumed samples: 23790080 | consumed tokens: 48722083840 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.693379E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.155 | TFLOPs: 26.24 | +7: iteration 92940/ 173500 | consumed samples: 23792640 | consumed tokens: 48727326720 | elapsed time per iteration (s): 0.15 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.694978E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.714 | TFLOPs: 26.23 | +7: iteration 92950/ 173500 | consumed samples: 23795200 | consumed tokens: 48732569600 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.694529E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.371 | TFLOPs: 26.23 | +7: iteration 92960/ 173500 | consumed samples: 23797760 | consumed tokens: 48737812480 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.696601E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.977 | TFLOPs: 26.24 | +7: iteration 92970/ 173500 | consumed samples: 23800320 | consumed tokens: 48743055360 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.697998E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.862 | TFLOPs: 26.23 | +7: iteration 92980/ 173500 | consumed samples: 23802880 | consumed tokens: 48748298240 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.694891E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.826 | TFLOPs: 26.25 | +7: iteration 92990/ 173500 | consumed samples: 23805440 | consumed tokens: 48753541120 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.699931E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.993 | TFLOPs: 26.24 | +7: iteration 93000/ 173500 | consumed samples: 23808000 | consumed tokens: 48758784000 | elapsed time per iteration (s): 0.15 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.692010E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.089 | TFLOPs: 26.24 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 93000 | lm loss value: 3.873816E+00 | lm loss PPL: 4.812566E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 93000 to checkpoints_44m91b100m +0: [2023-03-17 04:17:09,893] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step93000 is begin to save! +0: [2023-03-17 04:17:09,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:17:09,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:17:09,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:17:09,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:17:09,965] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:17:09,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:17:09,973] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:17:09,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:17:09,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:17:09,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:17:09,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:17:09,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:17:09,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:17:10,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:17:10,004] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:17:10,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:17:10,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:17:10,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:17:10,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:17:10,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:17:10,022] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step93000/mp_rank_00_model_states.pt +0: [2023-03-17 04:17:10,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:17:10,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:17:10,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +3: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +5: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +6: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:17:10,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:17:10,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:17:10,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +2: [2023-03-17 04:17:10,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:17:10,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +1: [2023-03-17 04:17:10,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:17:10,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:17:10,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +4: [2023-03-17 04:17:10,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:17:10,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:17:10,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +7: [2023-03-17 04:17:10,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:17:10,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step93000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:17:10,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step93000 is ready now! +0: successfully saved checkpoint at iteration 93000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.44 +7: iteration 93010/ 173500 | consumed samples: 23810560 | consumed tokens: 48764026880 | elapsed time per iteration (s): 0.18 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.698224E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.506 | TFLOPs: 22.83 | +7: iteration 93020/ 173500 | consumed samples: 23813120 | consumed tokens: 48769269760 | elapsed time per iteration (s): 0.15 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.692286E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.183 | TFLOPs: 26.24 | +7: iteration 93030/ 173500 | consumed samples: 23815680 | consumed tokens: 48774512640 | elapsed time per iteration (s): 0.15 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.692276E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.119 | TFLOPs: 26.24 | +7: iteration 93040/ 173500 | consumed samples: 23818240 | consumed tokens: 48779755520 | elapsed time per iteration (s): 0.16 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.700307E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.201 | TFLOPs: 25.77 | +7: iteration 93050/ 173500 | consumed samples: 23820800 | consumed tokens: 48784998400 | elapsed time per iteration (s): 0.16 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.697517E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.548 | TFLOPs: 25.74 | +7: iteration 93060/ 173500 | consumed samples: 23823360 | consumed tokens: 48790241280 | elapsed time per iteration (s): 0.15 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.695180E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.840 | TFLOPs: 26.20 | +7: iteration 93070/ 173500 | consumed samples: 23825920 | consumed tokens: 48795484160 | elapsed time per iteration (s): 0.16 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.705970E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.692 | TFLOPs: 25.54 | +7: iteration 93080/ 173500 | consumed samples: 23828480 | consumed tokens: 48800727040 | elapsed time per iteration (s): 0.16 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.689327E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.357 | TFLOPs: 25.66 | +7: iteration 93090/ 173500 | consumed samples: 23831040 | consumed tokens: 48805969920 | elapsed time per iteration (s): 0.15 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.688855E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.474 | TFLOPs: 26.06 | +7: iteration 93100/ 173500 | consumed samples: 23833600 | consumed tokens: 48811212800 | elapsed time per iteration (s): 0.15 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.698669E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.436 | TFLOPs: 26.02 | +7: iteration 93110/ 173500 | consumed samples: 23836160 | consumed tokens: 48816455680 | elapsed time per iteration (s): 0.15 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.693121E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.728 | TFLOPs: 26.12 | +7: iteration 93120/ 173500 | consumed samples: 23838720 | consumed tokens: 48821698560 | elapsed time per iteration (s): 0.16 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.718151E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.212 | TFLOPs: 25.46 | +7: iteration 93130/ 173500 | consumed samples: 23841280 | consumed tokens: 48826941440 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.682207E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.944 | TFLOPs: 26.19 | +7: iteration 93140/ 173500 | consumed samples: 23843840 | consumed tokens: 48832184320 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.701466E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.559 | TFLOPs: 26.18 | +7: iteration 93150/ 173500 | consumed samples: 23846400 | consumed tokens: 48837427200 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.700983E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.044 | TFLOPs: 26.16 | +7: iteration 93160/ 173500 | consumed samples: 23848960 | consumed tokens: 48842670080 | elapsed time per iteration (s): 0.16 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.692958E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.284 | TFLOPs: 25.85 | +7: iteration 93170/ 173500 | consumed samples: 23851520 | consumed tokens: 48847912960 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.695432E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.284 | TFLOPs: 26.19 | +7: iteration 93180/ 173500 | consumed samples: 23854080 | consumed tokens: 48853155840 | elapsed time per iteration (s): 0.15 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.702721E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.529 | TFLOPs: 26.21 | +7: iteration 93190/ 173500 | consumed samples: 23856640 | consumed tokens: 48858398720 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.699013E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.972 | TFLOPs: 26.21 | +7: iteration 93200/ 173500 | consumed samples: 23859200 | consumed tokens: 48863641600 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.704094E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.351 | TFLOPs: 26.20 | +7: iteration 93210/ 173500 | consumed samples: 23861760 | consumed tokens: 48868884480 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.692742E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.050 | TFLOPs: 26.19 | +7: iteration 93220/ 173500 | consumed samples: 23864320 | consumed tokens: 48874127360 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.701741E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.104 | TFLOPs: 26.21 | +7: iteration 93230/ 173500 | consumed samples: 23866880 | consumed tokens: 48879370240 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.698780E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.956 | TFLOPs: 26.20 | +7: iteration 93240/ 173500 | consumed samples: 23869440 | consumed tokens: 48884613120 | elapsed time per iteration (s): 0.15 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.705666E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.684 | TFLOPs: 26.22 | +7: iteration 93250/ 173500 | consumed samples: 23872000 | consumed tokens: 48889856000 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.706831E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.648 | TFLOPs: 26.17 | +7: iteration 93260/ 173500 | consumed samples: 23874560 | consumed tokens: 48895098880 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.702245E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.234 | TFLOPs: 26.13 | +7: iteration 93270/ 173500 | consumed samples: 23877120 | consumed tokens: 48900341760 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.698899E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.837 | TFLOPs: 26.14 | +7: iteration 93280/ 173500 | consumed samples: 23879680 | consumed tokens: 48905584640 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.711656E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.247 | TFLOPs: 26.12 | +7: iteration 93290/ 173500 | consumed samples: 23882240 | consumed tokens: 48910827520 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.703838E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.156 | TFLOPs: 26.15 | +7: iteration 93300/ 173500 | consumed samples: 23884800 | consumed tokens: 48916070400 | elapsed time per iteration (s): 0.15 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.695148E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.860 | TFLOPs: 26.09 | +7: iteration 93310/ 173500 | consumed samples: 23887360 | consumed tokens: 48921313280 | elapsed time per iteration (s): 0.15 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.697620E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.432 | TFLOPs: 26.10 | +7: iteration 93320/ 173500 | consumed samples: 23889920 | consumed tokens: 48926556160 | elapsed time per iteration (s): 0.15 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.694535E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.316 | TFLOPs: 26.10 | +7: iteration 93330/ 173500 | consumed samples: 23892480 | consumed tokens: 48931799040 | elapsed time per iteration (s): 0.16 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.704169E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.964 | TFLOPs: 25.83 | +7: iteration 93340/ 173500 | consumed samples: 23895040 | consumed tokens: 48937041920 | elapsed time per iteration (s): 0.16 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.697332E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.482 | TFLOPs: 25.81 | +7: iteration 93350/ 173500 | consumed samples: 23897600 | consumed tokens: 48942284800 | elapsed time per iteration (s): 0.16 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.703263E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.329 | TFLOPs: 25.80 | +7: iteration 93360/ 173500 | consumed samples: 23900160 | consumed tokens: 48947527680 | elapsed time per iteration (s): 0.15 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.700232E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.065 | TFLOPs: 26.14 | +7: iteration 93370/ 173500 | consumed samples: 23902720 | consumed tokens: 48952770560 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.686825E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.900 | TFLOPs: 25.95 | +7: iteration 93380/ 173500 | consumed samples: 23905280 | consumed tokens: 48958013440 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.693270E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.249 | TFLOPs: 26.15 | +7: iteration 93390/ 173500 | consumed samples: 23907840 | consumed tokens: 48963256320 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.698195E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.827 | TFLOPs: 26.14 | +7: iteration 93400/ 173500 | consumed samples: 23910400 | consumed tokens: 48968499200 | elapsed time per iteration (s): 0.16 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.699662E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.015 | TFLOPs: 25.69 | +7: iteration 93410/ 173500 | consumed samples: 23912960 | consumed tokens: 48973742080 | elapsed time per iteration (s): 0.15 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.702498E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.563 | TFLOPs: 26.17 | +7: iteration 93420/ 173500 | consumed samples: 23915520 | consumed tokens: 48978984960 | elapsed time per iteration (s): 0.16 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.707879E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.859 | TFLOPs: 25.45 | +7: iteration 93430/ 173500 | consumed samples: 23918080 | consumed tokens: 48984227840 | elapsed time per iteration (s): 0.16 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.691954E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.352 | TFLOPs: 25.55 | +7: iteration 93440/ 173500 | consumed samples: 23920640 | consumed tokens: 48989470720 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.705581E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.692 | TFLOPs: 26.14 | +7: iteration 93450/ 173500 | consumed samples: 23923200 | consumed tokens: 48994713600 | elapsed time per iteration (s): 0.16 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.695081E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.276 | TFLOPs: 25.65 | +7: iteration 93460/ 173500 | consumed samples: 23925760 | consumed tokens: 48999956480 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.697651E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.619 | TFLOPs: 26.09 | +7: iteration 93470/ 173500 | consumed samples: 23928320 | consumed tokens: 49005199360 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.685653E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.690 | TFLOPs: 26.14 | +7: iteration 93480/ 173500 | consumed samples: 23930880 | consumed tokens: 49010442240 | elapsed time per iteration (s): 0.15 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.709769E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.596 | TFLOPs: 26.17 | +7: iteration 93490/ 173500 | consumed samples: 23933440 | consumed tokens: 49015685120 | elapsed time per iteration (s): 0.16 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.699314E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.033 | TFLOPs: 25.61 | +7: iteration 93500/ 173500 | consumed samples: 23936000 | consumed tokens: 49020928000 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.687132E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.271 | TFLOPs: 25.99 | +7: iteration 93510/ 173500 | consumed samples: 23938560 | consumed tokens: 49026170880 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.695725E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.966 | TFLOPs: 26.00 | +7: iteration 93520/ 173500 | consumed samples: 23941120 | consumed tokens: 49031413760 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.704150E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.576 | TFLOPs: 25.96 | +7: iteration 93530/ 173500 | consumed samples: 23943680 | consumed tokens: 49036656640 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.697263E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.673 | TFLOPs: 26.00 | +7: iteration 93540/ 173500 | consumed samples: 23946240 | consumed tokens: 49041899520 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.703301E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.558 | TFLOPs: 26.01 | +7: iteration 93550/ 173500 | consumed samples: 23948800 | consumed tokens: 49047142400 | elapsed time per iteration (s): 0.15 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.701302E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.345 | TFLOPs: 26.01 | +7: iteration 93560/ 173500 | consumed samples: 23951360 | consumed tokens: 49052385280 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.688592E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.988 | TFLOPs: 26.08 | +7: iteration 93570/ 173500 | consumed samples: 23953920 | consumed tokens: 49057628160 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.691882E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.483 | TFLOPs: 26.07 | +7: iteration 93580/ 173500 | consumed samples: 23956480 | consumed tokens: 49062871040 | elapsed time per iteration (s): 0.16 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.714217E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.155 | TFLOPs: 25.83 | +7: iteration 93590/ 173500 | consumed samples: 23959040 | consumed tokens: 49068113920 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.701944E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.930 | TFLOPs: 26.00 | +7: iteration 93600/ 173500 | consumed samples: 23961600 | consumed tokens: 49073356800 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.685652E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.060 | TFLOPs: 26.00 | +7: iteration 93610/ 173500 | consumed samples: 23964160 | consumed tokens: 49078599680 | elapsed time per iteration (s): 0.15 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.704452E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.890 | TFLOPs: 26.02 | +7: iteration 93620/ 173500 | consumed samples: 23966720 | consumed tokens: 49083842560 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.692938E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.030 | TFLOPs: 25.99 | +7: iteration 93630/ 173500 | consumed samples: 23969280 | consumed tokens: 49089085440 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.700122E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.023 | TFLOPs: 25.94 | +7: iteration 93640/ 173500 | consumed samples: 23971840 | consumed tokens: 49094328320 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.693891E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.335 | TFLOPs: 25.99 | +7: iteration 93650/ 173500 | consumed samples: 23974400 | consumed tokens: 49099571200 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.707710E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.577 | TFLOPs: 25.99 | +7: iteration 93660/ 173500 | consumed samples: 23976960 | consumed tokens: 49104814080 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.682718E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.364 | TFLOPs: 25.98 | +7: iteration 93670/ 173500 | consumed samples: 23979520 | consumed tokens: 49110056960 | elapsed time per iteration (s): 0.15 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.698495E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.370 | TFLOPs: 26.05 | +7: iteration 93680/ 173500 | consumed samples: 23982080 | consumed tokens: 49115299840 | elapsed time per iteration (s): 0.15 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.682023E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.340 | TFLOPs: 26.09 | +7: iteration 93690/ 173500 | consumed samples: 23984640 | consumed tokens: 49120542720 | elapsed time per iteration (s): 0.16 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.701134E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.600 | TFLOPs: 25.76 | +7: iteration 93700/ 173500 | consumed samples: 23987200 | consumed tokens: 49125785600 | elapsed time per iteration (s): 0.15 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.688487E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.859 | TFLOPs: 26.12 | +7: iteration 93710/ 173500 | consumed samples: 23989760 | consumed tokens: 49131028480 | elapsed time per iteration (s): 0.15 | learning rate: 9.999E-05 | global batch size: 256 | lm loss: 3.693787E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.856 | TFLOPs: 26.09 | +7: iteration 93720/ 173500 | consumed samples: 23992320 | consumed tokens: 49136271360 | elapsed time per iteration (s): 0.15 | learning rate: 9.998E-05 | global batch size: 256 | lm loss: 3.698664E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.960 | TFLOPs: 26.11 | +7: iteration 93730/ 173500 | consumed samples: 23994880 | consumed tokens: 49141514240 | elapsed time per iteration (s): 0.15 | learning rate: 9.996E-05 | global batch size: 256 | lm loss: 3.710793E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.710 | TFLOPs: 26.11 | +7: iteration 93740/ 173500 | consumed samples: 23997440 | consumed tokens: 49146757120 | elapsed time per iteration (s): 0.15 | learning rate: 9.994E-05 | global batch size: 256 | lm loss: 3.694901E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.359 | TFLOPs: 26.12 | +7: iteration 93750/ 173500 | consumed samples: 24000000 | consumed tokens: 49152000000 | elapsed time per iteration (s): 0.15 | learning rate: 9.993E-05 | global batch size: 256 | lm loss: 3.704514E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.183 | TFLOPs: 26.10 | +7: iteration 93760/ 173500 | consumed samples: 24002560 | consumed tokens: 49157242880 | elapsed time per iteration (s): 0.15 | learning rate: 9.991E-05 | global batch size: 256 | lm loss: 3.705144E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.310 | TFLOPs: 26.08 | +7: iteration 93770/ 173500 | consumed samples: 24005120 | consumed tokens: 49162485760 | elapsed time per iteration (s): 0.15 | learning rate: 9.989E-05 | global batch size: 256 | lm loss: 3.696119E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.852 | TFLOPs: 26.09 | +7: iteration 93780/ 173500 | consumed samples: 24007680 | consumed tokens: 49167728640 | elapsed time per iteration (s): 0.15 | learning rate: 9.988E-05 | global batch size: 256 | lm loss: 3.704485E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.673 | TFLOPs: 26.09 | +7: iteration 93790/ 173500 | consumed samples: 24010240 | consumed tokens: 49172971520 | elapsed time per iteration (s): 0.15 | learning rate: 9.986E-05 | global batch size: 256 | lm loss: 3.699097E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.272 | TFLOPs: 26.10 | +7: iteration 93800/ 173500 | consumed samples: 24012800 | consumed tokens: 49178214400 | elapsed time per iteration (s): 0.15 | learning rate: 9.985E-05 | global batch size: 256 | lm loss: 3.698777E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.326 | TFLOPs: 26.10 | +7: iteration 93810/ 173500 | consumed samples: 24015360 | consumed tokens: 49183457280 | elapsed time per iteration (s): 0.15 | learning rate: 9.983E-05 | global batch size: 256 | lm loss: 3.690567E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.006 | TFLOPs: 26.10 | +7: iteration 93820/ 173500 | consumed samples: 24017920 | consumed tokens: 49188700160 | elapsed time per iteration (s): 0.15 | learning rate: 9.981E-05 | global batch size: 256 | lm loss: 3.708172E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.178 | TFLOPs: 26.10 | +7: iteration 93830/ 173500 | consumed samples: 24020480 | consumed tokens: 49193943040 | elapsed time per iteration (s): 0.15 | learning rate: 9.980E-05 | global batch size: 256 | lm loss: 3.698403E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.354 | TFLOPs: 26.09 | +7: iteration 93840/ 173500 | consumed samples: 24023040 | consumed tokens: 49199185920 | elapsed time per iteration (s): 0.15 | learning rate: 9.978E-05 | global batch size: 256 | lm loss: 3.705331E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.881 | TFLOPs: 26.11 | +7: iteration 93850/ 173500 | consumed samples: 24025600 | consumed tokens: 49204428800 | elapsed time per iteration (s): 0.15 | learning rate: 9.976E-05 | global batch size: 256 | lm loss: 3.693259E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.677 | TFLOPs: 26.07 | +7: iteration 93860/ 173500 | consumed samples: 24028160 | consumed tokens: 49209671680 | elapsed time per iteration (s): 0.15 | learning rate: 9.975E-05 | global batch size: 256 | lm loss: 3.698610E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.868 | TFLOPs: 25.91 | +7: iteration 93870/ 173500 | consumed samples: 24030720 | consumed tokens: 49214914560 | elapsed time per iteration (s): 0.15 | learning rate: 9.973E-05 | global batch size: 256 | lm loss: 3.690042E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.906 | TFLOPs: 26.13 | +7: iteration 93880/ 173500 | consumed samples: 24033280 | consumed tokens: 49220157440 | elapsed time per iteration (s): 0.16 | learning rate: 9.971E-05 | global batch size: 256 | lm loss: 3.677077E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.535 | TFLOPs: 25.73 | +7: iteration 93890/ 173500 | consumed samples: 24035840 | consumed tokens: 49225400320 | elapsed time per iteration (s): 0.15 | learning rate: 9.970E-05 | global batch size: 256 | lm loss: 3.698153E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.640 | TFLOPs: 26.09 | +7: iteration 93900/ 173500 | consumed samples: 24038400 | consumed tokens: 49230643200 | elapsed time per iteration (s): 0.15 | learning rate: 9.968E-05 | global batch size: 256 | lm loss: 3.698212E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.733 | TFLOPs: 26.11 | +7: iteration 93910/ 173500 | consumed samples: 24040960 | consumed tokens: 49235886080 | elapsed time per iteration (s): 0.15 | learning rate: 9.967E-05 | global batch size: 256 | lm loss: 3.711087E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.422 | TFLOPs: 26.09 | +7: iteration 93920/ 173500 | consumed samples: 24043520 | consumed tokens: 49241128960 | elapsed time per iteration (s): 0.15 | learning rate: 9.965E-05 | global batch size: 256 | lm loss: 3.693185E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.994 | TFLOPs: 26.11 | +7: iteration 93930/ 173500 | consumed samples: 24046080 | consumed tokens: 49246371840 | elapsed time per iteration (s): 0.15 | learning rate: 9.963E-05 | global batch size: 256 | lm loss: 3.697604E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.028 | TFLOPs: 26.11 | +7: iteration 93940/ 173500 | consumed samples: 24048640 | consumed tokens: 49251614720 | elapsed time per iteration (s): 0.15 | learning rate: 9.962E-05 | global batch size: 256 | lm loss: 3.706125E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.220 | TFLOPs: 26.11 | +7: iteration 93950/ 173500 | consumed samples: 24051200 | consumed tokens: 49256857600 | elapsed time per iteration (s): 0.15 | learning rate: 9.960E-05 | global batch size: 256 | lm loss: 3.694877E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.104 | TFLOPs: 26.10 | +7: iteration 93960/ 173500 | consumed samples: 24053760 | consumed tokens: 49262100480 | elapsed time per iteration (s): 0.15 | learning rate: 9.958E-05 | global batch size: 256 | lm loss: 3.673564E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.610 | TFLOPs: 26.09 | +7: iteration 93970/ 173500 | consumed samples: 24056320 | consumed tokens: 49267343360 | elapsed time per iteration (s): 0.15 | learning rate: 9.957E-05 | global batch size: 256 | lm loss: 3.692831E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.315 | TFLOPs: 26.08 | +7: iteration 93980/ 173500 | consumed samples: 24058880 | consumed tokens: 49272586240 | elapsed time per iteration (s): 0.15 | learning rate: 9.955E-05 | global batch size: 256 | lm loss: 3.693396E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.751 | TFLOPs: 26.08 | +7: iteration 93990/ 173500 | consumed samples: 24061440 | consumed tokens: 49277829120 | elapsed time per iteration (s): 0.16 | learning rate: 9.953E-05 | global batch size: 256 | lm loss: 3.699092E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.247 | TFLOPs: 25.50 | +0: [2023-03-17 04:19:44,343] [INFO] [logging.py:68:log_dist] [Rank 0] step=94000, skipped=0, lr=[9.951807001525316e-05, 9.951807001525316e-05, 9.951807001525316e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 94000/ 173500 | consumed samples: 24064000 | consumed tokens: 49283072000 | elapsed time per iteration (s): 0.15 | learning rate: 9.952E-05 | global batch size: 256 | lm loss: 3.699812E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.809 | TFLOPs: 26.09 | +0: steps: 94000 loss: 3.7005 iter time (s): 0.153 samples/sec: 1674.072 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 94000 | lm loss value: 3.833609E+00 | lm loss PPL: 4.622908E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 94000 to checkpoints_44m91b100m +0: [2023-03-17 04:19:44,417] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step94000 is begin to save! +0: [2023-03-17 04:19:44,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:19:44,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:19:44,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:19:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:19:44,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:19:44,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:19:44,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:19:44,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:19:44,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:19:44,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:19:44,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:19:44,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:19:44,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:19:44,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:19:44,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:19:44,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:19:44,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:19:44,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:19:44,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:19:44,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:19:44,545] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step94000/mp_rank_00_model_states.pt +0: [2023-03-17 04:19:44,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:19:44,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:19:44,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:19:44,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: [2023-03-17 04:19:44,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +6: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +3: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +1: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +2: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +4: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +5: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 04:19:44,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step94000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +7: [2023-03-17 04:19:44,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step94000 is ready now! +0: successfully saved checkpoint at iteration 94000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 172.83 +7: iteration 94010/ 173500 | consumed samples: 24066560 | consumed tokens: 49288314880 | elapsed time per iteration (s): 0.18 | learning rate: 9.950E-05 | global batch size: 256 | lm loss: 3.709005E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.524 | TFLOPs: 22.56 | +7: iteration 94020/ 173500 | consumed samples: 24069120 | consumed tokens: 49293557760 | elapsed time per iteration (s): 0.15 | learning rate: 9.949E-05 | global batch size: 256 | lm loss: 3.701556E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.529 | TFLOPs: 26.09 | +7: iteration 94030/ 173500 | consumed samples: 24071680 | consumed tokens: 49298800640 | elapsed time per iteration (s): 0.15 | learning rate: 9.947E-05 | global batch size: 256 | lm loss: 3.700038E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.818 | TFLOPs: 26.08 | +7: iteration 94040/ 173500 | consumed samples: 24074240 | consumed tokens: 49304043520 | elapsed time per iteration (s): 0.15 | learning rate: 9.945E-05 | global batch size: 256 | lm loss: 3.690242E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.198 | TFLOPs: 26.08 | +7: iteration 94050/ 173500 | consumed samples: 24076800 | consumed tokens: 49309286400 | elapsed time per iteration (s): 0.15 | learning rate: 9.944E-05 | global batch size: 256 | lm loss: 3.699340E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.659 | TFLOPs: 25.98 | +7: iteration 94060/ 173500 | consumed samples: 24079360 | consumed tokens: 49314529280 | elapsed time per iteration (s): 0.16 | learning rate: 9.942E-05 | global batch size: 256 | lm loss: 3.697897E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.655 | TFLOPs: 25.78 | +7: iteration 94070/ 173500 | consumed samples: 24081920 | consumed tokens: 49319772160 | elapsed time per iteration (s): 0.17 | learning rate: 9.940E-05 | global batch size: 256 | lm loss: 3.696894E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1483.211 | TFLOPs: 23.26 | +7: iteration 94080/ 173500 | consumed samples: 24084480 | consumed tokens: 49325015040 | elapsed time per iteration (s): 0.15 | learning rate: 9.939E-05 | global batch size: 256 | lm loss: 3.697172E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.333 | TFLOPs: 26.13 | +7: iteration 94090/ 173500 | consumed samples: 24087040 | consumed tokens: 49330257920 | elapsed time per iteration (s): 0.15 | learning rate: 9.937E-05 | global batch size: 256 | lm loss: 3.683836E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.313 | TFLOPs: 26.24 | +7: iteration 94100/ 173500 | consumed samples: 24089600 | consumed tokens: 49335500800 | elapsed time per iteration (s): 0.15 | learning rate: 9.935E-05 | global batch size: 256 | lm loss: 3.698703E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.392 | TFLOPs: 26.23 | +7: iteration 94110/ 173500 | consumed samples: 24092160 | consumed tokens: 49340743680 | elapsed time per iteration (s): 0.15 | learning rate: 9.934E-05 | global batch size: 256 | lm loss: 3.703866E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.191 | TFLOPs: 26.24 | +7: iteration 94120/ 173500 | consumed samples: 24094720 | consumed tokens: 49345986560 | elapsed time per iteration (s): 0.15 | learning rate: 9.932E-05 | global batch size: 256 | lm loss: 3.696320E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.574 | TFLOPs: 26.21 | +7: iteration 94130/ 173500 | consumed samples: 24097280 | consumed tokens: 49351229440 | elapsed time per iteration (s): 0.15 | learning rate: 9.931E-05 | global batch size: 256 | lm loss: 3.680568E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.719 | TFLOPs: 26.23 | +7: iteration 94140/ 173500 | consumed samples: 24099840 | consumed tokens: 49356472320 | elapsed time per iteration (s): 0.15 | learning rate: 9.929E-05 | global batch size: 256 | lm loss: 3.702531E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.919 | TFLOPs: 26.22 | +7: iteration 94150/ 173500 | consumed samples: 24102400 | consumed tokens: 49361715200 | elapsed time per iteration (s): 0.15 | learning rate: 9.927E-05 | global batch size: 256 | lm loss: 3.707446E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.118 | TFLOPs: 26.21 | +7: iteration 94160/ 173500 | consumed samples: 24104960 | consumed tokens: 49366958080 | elapsed time per iteration (s): 0.15 | learning rate: 9.926E-05 | global batch size: 256 | lm loss: 3.710125E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.255 | TFLOPs: 26.19 | +7: iteration 94170/ 173500 | consumed samples: 24107520 | consumed tokens: 49372200960 | elapsed time per iteration (s): 0.15 | learning rate: 9.924E-05 | global batch size: 256 | lm loss: 3.698457E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.454 | TFLOPs: 26.21 | +7: iteration 94180/ 173500 | consumed samples: 24110080 | consumed tokens: 49377443840 | elapsed time per iteration (s): 0.15 | learning rate: 9.922E-05 | global batch size: 256 | lm loss: 3.695911E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.414 | TFLOPs: 26.18 | +7: iteration 94190/ 173500 | consumed samples: 24112640 | consumed tokens: 49382686720 | elapsed time per iteration (s): 0.15 | learning rate: 9.921E-05 | global batch size: 256 | lm loss: 3.702725E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.950 | TFLOPs: 26.17 | +7: iteration 94200/ 173500 | consumed samples: 24115200 | consumed tokens: 49387929600 | elapsed time per iteration (s): 0.15 | learning rate: 9.919E-05 | global batch size: 256 | lm loss: 3.701649E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.867 | TFLOPs: 26.05 | +7: iteration 94210/ 173500 | consumed samples: 24117760 | consumed tokens: 49393172480 | elapsed time per iteration (s): 0.15 | learning rate: 9.917E-05 | global batch size: 256 | lm loss: 3.703128E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.836 | TFLOPs: 26.17 | +7: iteration 94220/ 173500 | consumed samples: 24120320 | consumed tokens: 49398415360 | elapsed time per iteration (s): 0.15 | learning rate: 9.916E-05 | global batch size: 256 | lm loss: 3.705536E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.107 | TFLOPs: 26.22 | +7: iteration 94230/ 173500 | consumed samples: 24122880 | consumed tokens: 49403658240 | elapsed time per iteration (s): 0.15 | learning rate: 9.914E-05 | global batch size: 256 | lm loss: 3.684777E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.666 | TFLOPs: 26.25 | +7: iteration 94240/ 173500 | consumed samples: 24125440 | consumed tokens: 49408901120 | elapsed time per iteration (s): 0.15 | learning rate: 9.913E-05 | global batch size: 256 | lm loss: 3.701416E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.612 | TFLOPs: 26.22 | +7: iteration 94250/ 173500 | consumed samples: 24128000 | consumed tokens: 49414144000 | elapsed time per iteration (s): 0.15 | learning rate: 9.911E-05 | global batch size: 256 | lm loss: 3.699332E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.665 | TFLOPs: 26.22 | +7: iteration 94260/ 173500 | consumed samples: 24130560 | consumed tokens: 49419386880 | elapsed time per iteration (s): 0.15 | learning rate: 9.909E-05 | global batch size: 256 | lm loss: 3.693471E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.894 | TFLOPs: 26.19 | +7: iteration 94270/ 173500 | consumed samples: 24133120 | consumed tokens: 49424629760 | elapsed time per iteration (s): 0.15 | learning rate: 9.908E-05 | global batch size: 256 | lm loss: 3.699508E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.199 | TFLOPs: 26.18 | +7: iteration 94280/ 173500 | consumed samples: 24135680 | consumed tokens: 49429872640 | elapsed time per iteration (s): 0.16 | learning rate: 9.906E-05 | global batch size: 256 | lm loss: 3.699183E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.064 | TFLOPs: 25.53 | +7: iteration 94290/ 173500 | consumed samples: 24138240 | consumed tokens: 49435115520 | elapsed time per iteration (s): 0.15 | learning rate: 9.904E-05 | global batch size: 256 | lm loss: 3.698996E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.307 | TFLOPs: 26.15 | +7: iteration 94300/ 173500 | consumed samples: 24140800 | consumed tokens: 49440358400 | elapsed time per iteration (s): 0.15 | learning rate: 9.903E-05 | global batch size: 256 | lm loss: 3.691176E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.681 | TFLOPs: 26.15 | +7: iteration 94310/ 173500 | consumed samples: 24143360 | consumed tokens: 49445601280 | elapsed time per iteration (s): 0.15 | learning rate: 9.901E-05 | global batch size: 256 | lm loss: 3.688567E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.240 | TFLOPs: 26.15 | +7: iteration 94320/ 173500 | consumed samples: 24145920 | consumed tokens: 49450844160 | elapsed time per iteration (s): 0.16 | learning rate: 9.900E-05 | global batch size: 256 | lm loss: 3.696214E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.589 | TFLOPs: 25.79 | +7: iteration 94330/ 173500 | consumed samples: 24148480 | consumed tokens: 49456087040 | elapsed time per iteration (s): 0.15 | learning rate: 9.898E-05 | global batch size: 256 | lm loss: 3.690293E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.906 | TFLOPs: 26.13 | +7: iteration 94340/ 173500 | consumed samples: 24151040 | consumed tokens: 49461329920 | elapsed time per iteration (s): 0.15 | learning rate: 9.896E-05 | global batch size: 256 | lm loss: 3.694213E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.628 | TFLOPs: 26.17 | +7: iteration 94350/ 173500 | consumed samples: 24153600 | consumed tokens: 49466572800 | elapsed time per iteration (s): 0.15 | learning rate: 9.895E-05 | global batch size: 256 | lm loss: 3.694341E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.157 | TFLOPs: 26.18 | +7: iteration 94360/ 173500 | consumed samples: 24156160 | consumed tokens: 49471815680 | elapsed time per iteration (s): 0.16 | learning rate: 9.893E-05 | global batch size: 256 | lm loss: 3.686639E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.402 | TFLOPs: 25.69 | +7: iteration 94370/ 173500 | consumed samples: 24158720 | consumed tokens: 49477058560 | elapsed time per iteration (s): 0.16 | learning rate: 9.891E-05 | global batch size: 256 | lm loss: 3.696017E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.295 | TFLOPs: 24.75 | +7: iteration 94380/ 173500 | consumed samples: 24161280 | consumed tokens: 49482301440 | elapsed time per iteration (s): 0.16 | learning rate: 9.890E-05 | global batch size: 256 | lm loss: 3.705064E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.535 | TFLOPs: 25.29 | +7: iteration 94390/ 173500 | consumed samples: 24163840 | consumed tokens: 49487544320 | elapsed time per iteration (s): 0.15 | learning rate: 9.888E-05 | global batch size: 256 | lm loss: 3.696215E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.465 | TFLOPs: 26.23 | +7: iteration 94400/ 173500 | consumed samples: 24166400 | consumed tokens: 49492787200 | elapsed time per iteration (s): 0.15 | learning rate: 9.886E-05 | global batch size: 256 | lm loss: 3.692108E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.196 | TFLOPs: 25.99 | +7: iteration 94410/ 173500 | consumed samples: 24168960 | consumed tokens: 49498030080 | elapsed time per iteration (s): 0.16 | learning rate: 9.885E-05 | global batch size: 256 | lm loss: 3.705486E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.753 | TFLOPs: 25.81 | +7: iteration 94420/ 173500 | consumed samples: 24171520 | consumed tokens: 49503272960 | elapsed time per iteration (s): 0.15 | learning rate: 9.883E-05 | global batch size: 256 | lm loss: 3.701820E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.274 | TFLOPs: 26.23 | +7: iteration 94430/ 173500 | consumed samples: 24174080 | consumed tokens: 49508515840 | elapsed time per iteration (s): 0.15 | learning rate: 9.882E-05 | global batch size: 256 | lm loss: 3.693041E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.555 | TFLOPs: 26.23 | +7: iteration 94440/ 173500 | consumed samples: 24176640 | consumed tokens: 49513758720 | elapsed time per iteration (s): 0.15 | learning rate: 9.880E-05 | global batch size: 256 | lm loss: 3.705512E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.828 | TFLOPs: 26.22 | +7: iteration 94450/ 173500 | consumed samples: 24179200 | consumed tokens: 49519001600 | elapsed time per iteration (s): 0.15 | learning rate: 9.878E-05 | global batch size: 256 | lm loss: 3.704399E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.838 | TFLOPs: 26.23 | +7: iteration 94460/ 173500 | consumed samples: 24181760 | consumed tokens: 49524244480 | elapsed time per iteration (s): 0.15 | learning rate: 9.877E-05 | global batch size: 256 | lm loss: 3.690323E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.418 | TFLOPs: 26.21 | +7: iteration 94470/ 173500 | consumed samples: 24184320 | consumed tokens: 49529487360 | elapsed time per iteration (s): 0.15 | learning rate: 9.875E-05 | global batch size: 256 | lm loss: 3.692960E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.465 | TFLOPs: 26.21 | +7: iteration 94480/ 173500 | consumed samples: 24186880 | consumed tokens: 49534730240 | elapsed time per iteration (s): 0.15 | learning rate: 9.873E-05 | global batch size: 256 | lm loss: 3.708749E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.920 | TFLOPs: 26.20 | +7: iteration 94490/ 173500 | consumed samples: 24189440 | consumed tokens: 49539973120 | elapsed time per iteration (s): 0.16 | learning rate: 9.872E-05 | global batch size: 256 | lm loss: 3.710402E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.987 | TFLOPs: 25.89 | +7: iteration 94500/ 173500 | consumed samples: 24192000 | consumed tokens: 49545216000 | elapsed time per iteration (s): 0.16 | learning rate: 9.870E-05 | global batch size: 256 | lm loss: 3.704627E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.336 | TFLOPs: 25.85 | +7: iteration 94510/ 173500 | consumed samples: 24194560 | consumed tokens: 49550458880 | elapsed time per iteration (s): 0.15 | learning rate: 9.868E-05 | global batch size: 256 | lm loss: 3.686005E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.085 | TFLOPs: 26.10 | +7: iteration 94520/ 173500 | consumed samples: 24197120 | consumed tokens: 49555701760 | elapsed time per iteration (s): 0.15 | learning rate: 9.867E-05 | global batch size: 256 | lm loss: 3.693927E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.878 | TFLOPs: 26.09 | +7: iteration 94530/ 173500 | consumed samples: 24199680 | consumed tokens: 49560944640 | elapsed time per iteration (s): 0.15 | learning rate: 9.865E-05 | global batch size: 256 | lm loss: 3.690403E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.135 | TFLOPs: 26.16 | +7: iteration 94540/ 173500 | consumed samples: 24202240 | consumed tokens: 49566187520 | elapsed time per iteration (s): 0.15 | learning rate: 9.864E-05 | global batch size: 256 | lm loss: 3.688347E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.460 | TFLOPs: 25.99 | +7: iteration 94550/ 173500 | consumed samples: 24204800 | consumed tokens: 49571430400 | elapsed time per iteration (s): 0.15 | learning rate: 9.862E-05 | global batch size: 256 | lm loss: 3.705981E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.645 | TFLOPs: 26.22 | +7: iteration 94560/ 173500 | consumed samples: 24207360 | consumed tokens: 49576673280 | elapsed time per iteration (s): 0.15 | learning rate: 9.860E-05 | global batch size: 256 | lm loss: 3.705780E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.043 | TFLOPs: 26.21 | +7: iteration 94570/ 173500 | consumed samples: 24209920 | consumed tokens: 49581916160 | elapsed time per iteration (s): 0.15 | learning rate: 9.859E-05 | global batch size: 256 | lm loss: 3.686953E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.082 | TFLOPs: 26.16 | +7: iteration 94580/ 173500 | consumed samples: 24212480 | consumed tokens: 49587159040 | elapsed time per iteration (s): 0.15 | learning rate: 9.857E-05 | global batch size: 256 | lm loss: 3.714198E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.214 | TFLOPs: 26.22 | +7: iteration 94590/ 173500 | consumed samples: 24215040 | consumed tokens: 49592401920 | elapsed time per iteration (s): 0.15 | learning rate: 9.855E-05 | global batch size: 256 | lm loss: 3.689648E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.549 | TFLOPs: 26.21 | +7: iteration 94600/ 173500 | consumed samples: 24217600 | consumed tokens: 49597644800 | elapsed time per iteration (s): 0.15 | learning rate: 9.854E-05 | global batch size: 256 | lm loss: 3.694815E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.957 | TFLOPs: 26.17 | +7: iteration 94610/ 173500 | consumed samples: 24220160 | consumed tokens: 49602887680 | elapsed time per iteration (s): 0.15 | learning rate: 9.852E-05 | global batch size: 256 | lm loss: 3.684840E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.929 | TFLOPs: 26.19 | +7: iteration 94620/ 173500 | consumed samples: 24222720 | consumed tokens: 49608130560 | elapsed time per iteration (s): 0.15 | learning rate: 9.851E-05 | global batch size: 256 | lm loss: 3.693936E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.376 | TFLOPs: 26.21 | +7: iteration 94630/ 173500 | consumed samples: 24225280 | consumed tokens: 49613373440 | elapsed time per iteration (s): 0.15 | learning rate: 9.849E-05 | global batch size: 256 | lm loss: 3.707942E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.394 | TFLOPs: 26.15 | +7: iteration 94640/ 173500 | consumed samples: 24227840 | consumed tokens: 49618616320 | elapsed time per iteration (s): 0.15 | learning rate: 9.847E-05 | global batch size: 256 | lm loss: 3.699297E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.359 | TFLOPs: 26.21 | +7: iteration 94650/ 173500 | consumed samples: 24230400 | consumed tokens: 49623859200 | elapsed time per iteration (s): 0.16 | learning rate: 9.846E-05 | global batch size: 256 | lm loss: 3.694868E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.502 | TFLOPs: 25.51 | +7: iteration 94660/ 173500 | consumed samples: 24232960 | consumed tokens: 49629102080 | elapsed time per iteration (s): 0.15 | learning rate: 9.844E-05 | global batch size: 256 | lm loss: 3.695305E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.099 | TFLOPs: 26.19 | +7: iteration 94670/ 173500 | consumed samples: 24235520 | consumed tokens: 49634344960 | elapsed time per iteration (s): 0.15 | learning rate: 9.842E-05 | global batch size: 256 | lm loss: 3.705392E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.606 | TFLOPs: 26.21 | +7: iteration 94680/ 173500 | consumed samples: 24238080 | consumed tokens: 49639587840 | elapsed time per iteration (s): 0.15 | learning rate: 9.841E-05 | global batch size: 256 | lm loss: 3.685152E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.829 | TFLOPs: 26.23 | +7: iteration 94690/ 173500 | consumed samples: 24240640 | consumed tokens: 49644830720 | elapsed time per iteration (s): 0.18 | learning rate: 9.839E-05 | global batch size: 256 | lm loss: 3.692048E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.800 | TFLOPs: 22.83 | +7: iteration 94700/ 173500 | consumed samples: 24243200 | consumed tokens: 49650073600 | elapsed time per iteration (s): 0.15 | learning rate: 9.837E-05 | global batch size: 256 | lm loss: 3.705378E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.661 | TFLOPs: 25.95 | +7: iteration 94710/ 173500 | consumed samples: 24245760 | consumed tokens: 49655316480 | elapsed time per iteration (s): 0.15 | learning rate: 9.836E-05 | global batch size: 256 | lm loss: 3.687854E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.118 | TFLOPs: 26.21 | +7: iteration 94720/ 173500 | consumed samples: 24248320 | consumed tokens: 49660559360 | elapsed time per iteration (s): 0.15 | learning rate: 9.834E-05 | global batch size: 256 | lm loss: 3.695692E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.848 | TFLOPs: 26.22 | +7: iteration 94730/ 173500 | consumed samples: 24250880 | consumed tokens: 49665802240 | elapsed time per iteration (s): 0.15 | learning rate: 9.833E-05 | global batch size: 256 | lm loss: 3.703011E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.099 | TFLOPs: 26.22 | +7: iteration 94740/ 173500 | consumed samples: 24253440 | consumed tokens: 49671045120 | elapsed time per iteration (s): 0.15 | learning rate: 9.831E-05 | global batch size: 256 | lm loss: 3.685274E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.088 | TFLOPs: 26.18 | +7: iteration 94750/ 173500 | consumed samples: 24256000 | consumed tokens: 49676288000 | elapsed time per iteration (s): 0.15 | learning rate: 9.829E-05 | global batch size: 256 | lm loss: 3.701205E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.518 | TFLOPs: 26.18 | +7: iteration 94760/ 173500 | consumed samples: 24258560 | consumed tokens: 49681530880 | elapsed time per iteration (s): 0.15 | learning rate: 9.828E-05 | global batch size: 256 | lm loss: 3.699247E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.723 | TFLOPs: 26.19 | +7: iteration 94770/ 173500 | consumed samples: 24261120 | consumed tokens: 49686773760 | elapsed time per iteration (s): 0.15 | learning rate: 9.826E-05 | global batch size: 256 | lm loss: 3.693412E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.086 | TFLOPs: 26.21 | +7: iteration 94780/ 173500 | consumed samples: 24263680 | consumed tokens: 49692016640 | elapsed time per iteration (s): 0.16 | learning rate: 9.824E-05 | global batch size: 256 | lm loss: 3.704290E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.190 | TFLOPs: 25.50 | +7: iteration 94790/ 173500 | consumed samples: 24266240 | consumed tokens: 49697259520 | elapsed time per iteration (s): 0.15 | learning rate: 9.823E-05 | global batch size: 256 | lm loss: 3.706881E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.539 | TFLOPs: 26.21 | +7: iteration 94800/ 173500 | consumed samples: 24268800 | consumed tokens: 49702502400 | elapsed time per iteration (s): 0.15 | learning rate: 9.821E-05 | global batch size: 256 | lm loss: 3.682087E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.565 | TFLOPs: 26.21 | +7: iteration 94810/ 173500 | consumed samples: 24271360 | consumed tokens: 49707745280 | elapsed time per iteration (s): 0.16 | learning rate: 9.820E-05 | global batch size: 256 | lm loss: 3.684677E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.039 | TFLOPs: 25.81 | +7: iteration 94820/ 173500 | consumed samples: 24273920 | consumed tokens: 49712988160 | elapsed time per iteration (s): 0.15 | learning rate: 9.818E-05 | global batch size: 256 | lm loss: 3.694360E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.092 | TFLOPs: 26.22 | +7: iteration 94830/ 173500 | consumed samples: 24276480 | consumed tokens: 49718231040 | elapsed time per iteration (s): 0.15 | learning rate: 9.816E-05 | global batch size: 256 | lm loss: 3.697272E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.563 | TFLOPs: 26.23 | +7: iteration 94840/ 173500 | consumed samples: 24279040 | consumed tokens: 49723473920 | elapsed time per iteration (s): 0.15 | learning rate: 9.815E-05 | global batch size: 256 | lm loss: 3.703712E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.196 | TFLOPs: 26.24 | +7: iteration 94850/ 173500 | consumed samples: 24281600 | consumed tokens: 49728716800 | elapsed time per iteration (s): 0.15 | learning rate: 9.813E-05 | global batch size: 256 | lm loss: 3.707008E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.352 | TFLOPs: 26.23 | +7: iteration 94860/ 173500 | consumed samples: 24284160 | consumed tokens: 49733959680 | elapsed time per iteration (s): 0.15 | learning rate: 9.811E-05 | global batch size: 256 | lm loss: 3.700175E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.128 | TFLOPs: 26.24 | +7: iteration 94870/ 173500 | consumed samples: 24286720 | consumed tokens: 49739202560 | elapsed time per iteration (s): 0.15 | learning rate: 9.810E-05 | global batch size: 256 | lm loss: 3.693824E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.659 | TFLOPs: 26.25 | +7: iteration 94880/ 173500 | consumed samples: 24289280 | consumed tokens: 49744445440 | elapsed time per iteration (s): 0.16 | learning rate: 9.808E-05 | global batch size: 256 | lm loss: 3.700185E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.326 | TFLOPs: 25.87 | +7: iteration 94890/ 173500 | consumed samples: 24291840 | consumed tokens: 49749688320 | elapsed time per iteration (s): 0.15 | learning rate: 9.806E-05 | global batch size: 256 | lm loss: 3.693580E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.050 | TFLOPs: 26.21 | +7: iteration 94900/ 173500 | consumed samples: 24294400 | consumed tokens: 49754931200 | elapsed time per iteration (s): 0.15 | learning rate: 9.805E-05 | global batch size: 256 | lm loss: 3.680710E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.973 | TFLOPs: 26.17 | +7: iteration 94910/ 173500 | consumed samples: 24296960 | consumed tokens: 49760174080 | elapsed time per iteration (s): 0.16 | learning rate: 9.803E-05 | global batch size: 256 | lm loss: 3.706758E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.286 | TFLOPs: 24.92 | +7: iteration 94920/ 173500 | consumed samples: 24299520 | consumed tokens: 49765416960 | elapsed time per iteration (s): 0.15 | learning rate: 9.802E-05 | global batch size: 256 | lm loss: 3.685888E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.375 | TFLOPs: 26.16 | +7: iteration 94930/ 173500 | consumed samples: 24302080 | consumed tokens: 49770659840 | elapsed time per iteration (s): 0.16 | learning rate: 9.800E-05 | global batch size: 256 | lm loss: 3.702499E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.326 | TFLOPs: 25.80 | +7: iteration 94940/ 173500 | consumed samples: 24304640 | consumed tokens: 49775902720 | elapsed time per iteration (s): 0.18 | learning rate: 9.798E-05 | global batch size: 256 | lm loss: 3.695067E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.302 | TFLOPs: 22.15 | +7: iteration 94950/ 173500 | consumed samples: 24307200 | consumed tokens: 49781145600 | elapsed time per iteration (s): 0.15 | learning rate: 9.797E-05 | global batch size: 256 | lm loss: 3.701603E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.538 | TFLOPs: 26.01 | +7: iteration 94960/ 173500 | consumed samples: 24309760 | consumed tokens: 49786388480 | elapsed time per iteration (s): 0.15 | learning rate: 9.795E-05 | global batch size: 256 | lm loss: 3.698715E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.011 | TFLOPs: 26.25 | +7: iteration 94970/ 173500 | consumed samples: 24312320 | consumed tokens: 49791631360 | elapsed time per iteration (s): 0.15 | learning rate: 9.793E-05 | global batch size: 256 | lm loss: 3.704436E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.697 | TFLOPs: 26.23 | +7: iteration 94980/ 173500 | consumed samples: 24314880 | consumed tokens: 49796874240 | elapsed time per iteration (s): 0.15 | learning rate: 9.792E-05 | global batch size: 256 | lm loss: 3.700744E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.075 | TFLOPs: 26.24 | +7: iteration 94990/ 173500 | consumed samples: 24317440 | consumed tokens: 49802117120 | elapsed time per iteration (s): 0.15 | learning rate: 9.790E-05 | global batch size: 256 | lm loss: 3.695500E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.809 | TFLOPs: 26.23 | +7: iteration 95000/ 173500 | consumed samples: 24320000 | consumed tokens: 49807360000 | elapsed time per iteration (s): 0.15 | learning rate: 9.789E-05 | global batch size: 256 | lm loss: 3.703733E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.705 | TFLOPs: 26.22 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 95000 | lm loss value: 3.837783E+00 | lm loss PPL: 4.642244E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 95000 to checkpoints_44m91b100m +0: [2023-03-17 04:22:19,224] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step95000 is begin to save! +0: [2023-03-17 04:22:19,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:22:19,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:22:19,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:22:19,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:22:19,308] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:22:19,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:22:19,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:22:19,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:22:19,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:22:19,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:22:19,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:22:19,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:22:19,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:22:19,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:22:19,349] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:22:19,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:22:19,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:22:19,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:22:19,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:22:19,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:22:19,367] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step95000/mp_rank_00_model_states.pt +0: [2023-03-17 04:22:19,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:22:19,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:22:19,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 04:22:19,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +4: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: [2023-03-17 04:22:19,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +6: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +2: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +5: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +3: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +7: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:22:19,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +1: [2023-03-17 04:22:19,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:22:19,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step95000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:22:19,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step95000 is ready now! +0: successfully saved checkpoint at iteration 95000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.99 +7: iteration 95010/ 173500 | consumed samples: 24322560 | consumed tokens: 49812602880 | elapsed time per iteration (s): 0.18 | learning rate: 9.787E-05 | global batch size: 256 | lm loss: 3.700142E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.047 | TFLOPs: 22.72 | +7: iteration 95020/ 173500 | consumed samples: 24325120 | consumed tokens: 49817845760 | elapsed time per iteration (s): 0.15 | learning rate: 9.785E-05 | global batch size: 256 | lm loss: 3.685449E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.391 | TFLOPs: 26.23 | +7: iteration 95030/ 173500 | consumed samples: 24327680 | consumed tokens: 49823088640 | elapsed time per iteration (s): 0.15 | learning rate: 9.784E-05 | global batch size: 256 | lm loss: 3.698303E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.510 | TFLOPs: 26.24 | +7: iteration 95040/ 173500 | consumed samples: 24330240 | consumed tokens: 49828331520 | elapsed time per iteration (s): 0.16 | learning rate: 9.782E-05 | global batch size: 256 | lm loss: 3.696912E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.206 | TFLOPs: 25.82 | +7: iteration 95050/ 173500 | consumed samples: 24332800 | consumed tokens: 49833574400 | elapsed time per iteration (s): 0.15 | learning rate: 9.780E-05 | global batch size: 256 | lm loss: 3.695607E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.838 | TFLOPs: 26.19 | +7: iteration 95060/ 173500 | consumed samples: 24335360 | consumed tokens: 49838817280 | elapsed time per iteration (s): 0.15 | learning rate: 9.779E-05 | global batch size: 256 | lm loss: 3.702773E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.690 | TFLOPs: 26.01 | +7: iteration 95070/ 173500 | consumed samples: 24337920 | consumed tokens: 49844060160 | elapsed time per iteration (s): 0.16 | learning rate: 9.777E-05 | global batch size: 256 | lm loss: 3.693940E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.229 | TFLOPs: 25.03 | +7: iteration 95080/ 173500 | consumed samples: 24340480 | consumed tokens: 49849303040 | elapsed time per iteration (s): 0.15 | learning rate: 9.775E-05 | global batch size: 256 | lm loss: 3.688150E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.877 | TFLOPs: 26.19 | +7: iteration 95090/ 173500 | consumed samples: 24343040 | consumed tokens: 49854545920 | elapsed time per iteration (s): 0.16 | learning rate: 9.774E-05 | global batch size: 256 | lm loss: 3.683091E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.287 | TFLOPs: 25.79 | +7: iteration 95100/ 173500 | consumed samples: 24345600 | consumed tokens: 49859788800 | elapsed time per iteration (s): 0.15 | learning rate: 9.772E-05 | global batch size: 256 | lm loss: 3.706466E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.679 | TFLOPs: 26.20 | +7: iteration 95110/ 173500 | consumed samples: 24348160 | consumed tokens: 49865031680 | elapsed time per iteration (s): 0.15 | learning rate: 9.771E-05 | global batch size: 256 | lm loss: 3.713883E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.065 | TFLOPs: 26.22 | +7: iteration 95120/ 173500 | consumed samples: 24350720 | consumed tokens: 49870274560 | elapsed time per iteration (s): 0.15 | learning rate: 9.769E-05 | global batch size: 256 | lm loss: 3.709439E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.098 | TFLOPs: 26.22 | +7: iteration 95130/ 173500 | consumed samples: 24353280 | consumed tokens: 49875517440 | elapsed time per iteration (s): 0.15 | learning rate: 9.767E-05 | global batch size: 256 | lm loss: 3.688139E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.346 | TFLOPs: 26.29 | +7: iteration 95140/ 173500 | consumed samples: 24355840 | consumed tokens: 49880760320 | elapsed time per iteration (s): 0.15 | learning rate: 9.766E-05 | global batch size: 256 | lm loss: 3.681371E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.408 | TFLOPs: 26.26 | +7: iteration 95150/ 173500 | consumed samples: 24358400 | consumed tokens: 49886003200 | elapsed time per iteration (s): 0.15 | learning rate: 9.764E-05 | global batch size: 256 | lm loss: 3.697306E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.410 | TFLOPs: 26.24 | +7: iteration 95160/ 173500 | consumed samples: 24360960 | consumed tokens: 49891246080 | elapsed time per iteration (s): 0.15 | learning rate: 9.762E-05 | global batch size: 256 | lm loss: 3.693094E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.421 | TFLOPs: 26.24 | +7: iteration 95170/ 173500 | consumed samples: 24363520 | consumed tokens: 49896488960 | elapsed time per iteration (s): 0.15 | learning rate: 9.761E-05 | global batch size: 256 | lm loss: 3.690188E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.193 | TFLOPs: 26.22 | +7: iteration 95180/ 173500 | consumed samples: 24366080 | consumed tokens: 49901731840 | elapsed time per iteration (s): 0.15 | learning rate: 9.759E-05 | global batch size: 256 | lm loss: 3.695203E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.094 | TFLOPs: 26.22 | +7: iteration 95190/ 173500 | consumed samples: 24368640 | consumed tokens: 49906974720 | elapsed time per iteration (s): 0.15 | learning rate: 9.758E-05 | global batch size: 256 | lm loss: 3.697055E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.915 | TFLOPs: 26.22 | +7: iteration 95200/ 173500 | consumed samples: 24371200 | consumed tokens: 49912217600 | elapsed time per iteration (s): 0.15 | learning rate: 9.756E-05 | global batch size: 256 | lm loss: 3.680556E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.164 | TFLOPs: 26.11 | +7: iteration 95210/ 173500 | consumed samples: 24373760 | consumed tokens: 49917460480 | elapsed time per iteration (s): 0.16 | learning rate: 9.754E-05 | global batch size: 256 | lm loss: 3.709966E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.099 | TFLOPs: 25.89 | +7: iteration 95220/ 173500 | consumed samples: 24376320 | consumed tokens: 49922703360 | elapsed time per iteration (s): 0.15 | learning rate: 9.753E-05 | global batch size: 256 | lm loss: 3.690368E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.156 | TFLOPs: 26.19 | +7: iteration 95230/ 173500 | consumed samples: 24378880 | consumed tokens: 49927946240 | elapsed time per iteration (s): 0.15 | learning rate: 9.751E-05 | global batch size: 256 | lm loss: 3.696310E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.600 | TFLOPs: 26.20 | +7: iteration 95240/ 173500 | consumed samples: 24381440 | consumed tokens: 49933189120 | elapsed time per iteration (s): 0.15 | learning rate: 9.749E-05 | global batch size: 256 | lm loss: 3.685151E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.975 | TFLOPs: 26.22 | +7: iteration 95250/ 173500 | consumed samples: 24384000 | consumed tokens: 49938432000 | elapsed time per iteration (s): 0.15 | learning rate: 9.748E-05 | global batch size: 256 | lm loss: 3.694371E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.891 | TFLOPs: 26.19 | +7: iteration 95260/ 173500 | consumed samples: 24386560 | consumed tokens: 49943674880 | elapsed time per iteration (s): 0.15 | learning rate: 9.746E-05 | global batch size: 256 | lm loss: 3.691129E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.070 | TFLOPs: 26.18 | +7: iteration 95270/ 173500 | consumed samples: 24389120 | consumed tokens: 49948917760 | elapsed time per iteration (s): 0.15 | learning rate: 9.744E-05 | global batch size: 256 | lm loss: 3.689576E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.692 | TFLOPs: 26.20 | +7: iteration 95280/ 173500 | consumed samples: 24391680 | consumed tokens: 49954160640 | elapsed time per iteration (s): 0.15 | learning rate: 9.743E-05 | global batch size: 256 | lm loss: 3.698095E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.905 | TFLOPs: 25.92 | +7: iteration 95290/ 173500 | consumed samples: 24394240 | consumed tokens: 49959403520 | elapsed time per iteration (s): 0.15 | learning rate: 9.741E-05 | global batch size: 256 | lm loss: 3.696907E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.942 | TFLOPs: 26.19 | +7: iteration 95300/ 173500 | consumed samples: 24396800 | consumed tokens: 49964646400 | elapsed time per iteration (s): 0.15 | learning rate: 9.740E-05 | global batch size: 256 | lm loss: 3.682930E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.637 | TFLOPs: 26.15 | +7: iteration 95310/ 173500 | consumed samples: 24399360 | consumed tokens: 49969889280 | elapsed time per iteration (s): 0.15 | learning rate: 9.738E-05 | global batch size: 256 | lm loss: 3.690556E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.134 | TFLOPs: 26.13 | +7: iteration 95320/ 173500 | consumed samples: 24401920 | consumed tokens: 49975132160 | elapsed time per iteration (s): 0.15 | learning rate: 9.736E-05 | global batch size: 256 | lm loss: 3.693587E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.159 | TFLOPs: 26.15 | +7: iteration 95330/ 173500 | consumed samples: 24404480 | consumed tokens: 49980375040 | elapsed time per iteration (s): 0.18 | learning rate: 9.735E-05 | global batch size: 256 | lm loss: 3.697406E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1427.784 | TFLOPs: 22.39 | +7: iteration 95340/ 173500 | consumed samples: 24407040 | consumed tokens: 49985617920 | elapsed time per iteration (s): 0.15 | learning rate: 9.733E-05 | global batch size: 256 | lm loss: 3.690721E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.916 | TFLOPs: 26.20 | +7: iteration 95350/ 173500 | consumed samples: 24409600 | consumed tokens: 49990860800 | elapsed time per iteration (s): 0.15 | learning rate: 9.731E-05 | global batch size: 256 | lm loss: 3.687677E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.456 | TFLOPs: 26.23 | +7: iteration 95360/ 173500 | consumed samples: 24412160 | consumed tokens: 49996103680 | elapsed time per iteration (s): 0.15 | learning rate: 9.730E-05 | global batch size: 256 | lm loss: 3.696896E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.828 | TFLOPs: 26.25 | +7: iteration 95370/ 173500 | consumed samples: 24414720 | consumed tokens: 50001346560 | elapsed time per iteration (s): 0.15 | learning rate: 9.728E-05 | global batch size: 256 | lm loss: 3.699429E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.814 | TFLOPs: 26.20 | +7: iteration 95380/ 173500 | consumed samples: 24417280 | consumed tokens: 50006589440 | elapsed time per iteration (s): 0.16 | learning rate: 9.727E-05 | global batch size: 256 | lm loss: 3.689669E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.471 | TFLOPs: 25.73 | +7: iteration 95390/ 173500 | consumed samples: 24419840 | consumed tokens: 50011832320 | elapsed time per iteration (s): 0.15 | learning rate: 9.725E-05 | global batch size: 256 | lm loss: 3.688176E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.133 | TFLOPs: 26.18 | +7: iteration 95400/ 173500 | consumed samples: 24422400 | consumed tokens: 50017075200 | elapsed time per iteration (s): 0.15 | learning rate: 9.723E-05 | global batch size: 256 | lm loss: 3.685119E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.666 | TFLOPs: 26.15 | +7: iteration 95410/ 173500 | consumed samples: 24424960 | consumed tokens: 50022318080 | elapsed time per iteration (s): 0.15 | learning rate: 9.722E-05 | global batch size: 256 | lm loss: 3.701520E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.052 | TFLOPs: 26.17 | +7: iteration 95420/ 173500 | consumed samples: 24427520 | consumed tokens: 50027560960 | elapsed time per iteration (s): 0.16 | learning rate: 9.720E-05 | global batch size: 256 | lm loss: 3.671970E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.211 | TFLOPs: 25.50 | +7: iteration 95430/ 173500 | consumed samples: 24430080 | consumed tokens: 50032803840 | elapsed time per iteration (s): 0.16 | learning rate: 9.718E-05 | global batch size: 256 | lm loss: 3.702729E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.747 | TFLOPs: 24.88 | +7: iteration 95440/ 173500 | consumed samples: 24432640 | consumed tokens: 50038046720 | elapsed time per iteration (s): 0.15 | learning rate: 9.717E-05 | global batch size: 256 | lm loss: 3.690463E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.110 | TFLOPs: 25.94 | +7: iteration 95450/ 173500 | consumed samples: 24435200 | consumed tokens: 50043289600 | elapsed time per iteration (s): 0.18 | learning rate: 9.715E-05 | global batch size: 256 | lm loss: 3.696329E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1453.800 | TFLOPs: 22.80 | +7: iteration 95460/ 173500 | consumed samples: 24437760 | consumed tokens: 50048532480 | elapsed time per iteration (s): 0.18 | learning rate: 9.714E-05 | global batch size: 256 | lm loss: 3.701786E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1461.532 | TFLOPs: 22.92 | +7: iteration 95470/ 173500 | consumed samples: 24440320 | consumed tokens: 50053775360 | elapsed time per iteration (s): 0.15 | learning rate: 9.712E-05 | global batch size: 256 | lm loss: 3.691182E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.929 | TFLOPs: 26.24 | +7: iteration 95480/ 173500 | consumed samples: 24442880 | consumed tokens: 50059018240 | elapsed time per iteration (s): 0.15 | learning rate: 9.710E-05 | global batch size: 256 | lm loss: 3.689301E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.654 | TFLOPs: 26.22 | +7: iteration 95490/ 173500 | consumed samples: 24445440 | consumed tokens: 50064261120 | elapsed time per iteration (s): 0.16 | learning rate: 9.709E-05 | global batch size: 256 | lm loss: 3.698031E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.594 | TFLOPs: 25.78 | +7: iteration 95500/ 173500 | consumed samples: 24448000 | consumed tokens: 50069504000 | elapsed time per iteration (s): 0.15 | learning rate: 9.707E-05 | global batch size: 256 | lm loss: 3.690973E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.281 | TFLOPs: 25.93 | +7: iteration 95510/ 173500 | consumed samples: 24450560 | consumed tokens: 50074746880 | elapsed time per iteration (s): 0.15 | learning rate: 9.705E-05 | global batch size: 256 | lm loss: 3.695803E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.773 | TFLOPs: 26.23 | +7: iteration 95520/ 173500 | consumed samples: 24453120 | consumed tokens: 50079989760 | elapsed time per iteration (s): 0.15 | learning rate: 9.704E-05 | global batch size: 256 | lm loss: 3.698371E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.494 | TFLOPs: 26.24 | +7: iteration 95530/ 173500 | consumed samples: 24455680 | consumed tokens: 50085232640 | elapsed time per iteration (s): 0.15 | learning rate: 9.702E-05 | global batch size: 256 | lm loss: 3.693961E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.382 | TFLOPs: 26.23 | +7: iteration 95540/ 173500 | consumed samples: 24458240 | consumed tokens: 50090475520 | elapsed time per iteration (s): 0.15 | learning rate: 9.700E-05 | global batch size: 256 | lm loss: 3.694421E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.280 | TFLOPs: 26.01 | +7: iteration 95550/ 173500 | consumed samples: 24460800 | consumed tokens: 50095718400 | elapsed time per iteration (s): 0.15 | learning rate: 9.699E-05 | global batch size: 256 | lm loss: 3.692155E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.772 | TFLOPs: 26.25 | +7: iteration 95560/ 173500 | consumed samples: 24463360 | consumed tokens: 50100961280 | elapsed time per iteration (s): 0.15 | learning rate: 9.697E-05 | global batch size: 256 | lm loss: 3.686264E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.997 | TFLOPs: 26.24 | +7: iteration 95570/ 173500 | consumed samples: 24465920 | consumed tokens: 50106204160 | elapsed time per iteration (s): 0.15 | learning rate: 9.696E-05 | global batch size: 256 | lm loss: 3.695874E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.392 | TFLOPs: 26.23 | +7: iteration 95580/ 173500 | consumed samples: 24468480 | consumed tokens: 50111447040 | elapsed time per iteration (s): 0.20 | learning rate: 9.694E-05 | global batch size: 256 | lm loss: 3.701872E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1290.081 | TFLOPs: 20.23 | +7: iteration 95590/ 173500 | consumed samples: 24471040 | consumed tokens: 50116689920 | elapsed time per iteration (s): 0.15 | learning rate: 9.692E-05 | global batch size: 256 | lm loss: 3.710037E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.680 | TFLOPs: 26.23 | +7: iteration 95600/ 173500 | consumed samples: 24473600 | consumed tokens: 50121932800 | elapsed time per iteration (s): 0.15 | learning rate: 9.691E-05 | global batch size: 256 | lm loss: 3.682599E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.010 | TFLOPs: 26.25 | +7: iteration 95610/ 173500 | consumed samples: 24476160 | consumed tokens: 50127175680 | elapsed time per iteration (s): 0.15 | learning rate: 9.689E-05 | global batch size: 256 | lm loss: 3.689704E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.411 | TFLOPs: 26.26 | +7: iteration 95620/ 173500 | consumed samples: 24478720 | consumed tokens: 50132418560 | elapsed time per iteration (s): 0.16 | learning rate: 9.687E-05 | global batch size: 256 | lm loss: 3.705224E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.132 | TFLOPs: 25.80 | +7: iteration 95630/ 173500 | consumed samples: 24481280 | consumed tokens: 50137661440 | elapsed time per iteration (s): 0.16 | learning rate: 9.686E-05 | global batch size: 256 | lm loss: 3.685504E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.436 | TFLOPs: 25.88 | +7: iteration 95640/ 173500 | consumed samples: 24483840 | consumed tokens: 50142904320 | elapsed time per iteration (s): 0.15 | learning rate: 9.684E-05 | global batch size: 256 | lm loss: 3.685018E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.325 | TFLOPs: 26.23 | +7: iteration 95650/ 173500 | consumed samples: 24486400 | consumed tokens: 50148147200 | elapsed time per iteration (s): 0.18 | learning rate: 9.683E-05 | global batch size: 256 | lm loss: 3.706280E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.207 | TFLOPs: 22.84 | +7: iteration 95660/ 173500 | consumed samples: 24488960 | consumed tokens: 50153390080 | elapsed time per iteration (s): 0.15 | learning rate: 9.681E-05 | global batch size: 256 | lm loss: 3.688859E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.700 | TFLOPs: 26.11 | +7: iteration 95670/ 173500 | consumed samples: 24491520 | consumed tokens: 50158632960 | elapsed time per iteration (s): 0.15 | learning rate: 9.679E-05 | global batch size: 256 | lm loss: 3.694554E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.763 | TFLOPs: 26.09 | +7: iteration 95680/ 173500 | consumed samples: 24494080 | consumed tokens: 50163875840 | elapsed time per iteration (s): 0.15 | learning rate: 9.678E-05 | global batch size: 256 | lm loss: 3.696715E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.741 | TFLOPs: 26.09 | +7: iteration 95690/ 173500 | consumed samples: 24496640 | consumed tokens: 50169118720 | elapsed time per iteration (s): 0.15 | learning rate: 9.676E-05 | global batch size: 256 | lm loss: 3.689124E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.234 | TFLOPs: 26.08 | +7: iteration 95700/ 173500 | consumed samples: 24499200 | consumed tokens: 50174361600 | elapsed time per iteration (s): 0.15 | learning rate: 9.674E-05 | global batch size: 256 | lm loss: 3.697147E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.668 | TFLOPs: 26.11 | +7: iteration 95710/ 173500 | consumed samples: 24501760 | consumed tokens: 50179604480 | elapsed time per iteration (s): 0.18 | learning rate: 9.673E-05 | global batch size: 256 | lm loss: 3.691022E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.540 | TFLOPs: 22.72 | +7: iteration 95720/ 173500 | consumed samples: 24504320 | consumed tokens: 50184847360 | elapsed time per iteration (s): 0.15 | learning rate: 9.671E-05 | global batch size: 256 | lm loss: 3.692413E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.128 | TFLOPs: 26.18 | +7: iteration 95730/ 173500 | consumed samples: 24506880 | consumed tokens: 50190090240 | elapsed time per iteration (s): 0.15 | learning rate: 9.670E-05 | global batch size: 256 | lm loss: 3.702273E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.647 | TFLOPs: 26.22 | +7: iteration 95740/ 173500 | consumed samples: 24509440 | consumed tokens: 50195333120 | elapsed time per iteration (s): 0.15 | learning rate: 9.668E-05 | global batch size: 256 | lm loss: 3.706936E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.289 | TFLOPs: 26.21 | +7: iteration 95750/ 173500 | consumed samples: 24512000 | consumed tokens: 50200576000 | elapsed time per iteration (s): 0.15 | learning rate: 9.666E-05 | global batch size: 256 | lm loss: 3.686787E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.787 | TFLOPs: 26.22 | +7: iteration 95760/ 173500 | consumed samples: 24514560 | consumed tokens: 50205818880 | elapsed time per iteration (s): 0.16 | learning rate: 9.665E-05 | global batch size: 256 | lm loss: 3.695212E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.436 | TFLOPs: 25.62 | +7: iteration 95770/ 173500 | consumed samples: 24517120 | consumed tokens: 50211061760 | elapsed time per iteration (s): 0.15 | learning rate: 9.663E-05 | global batch size: 256 | lm loss: 3.696059E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.731 | TFLOPs: 26.22 | +7: iteration 95780/ 173500 | consumed samples: 24519680 | consumed tokens: 50216304640 | elapsed time per iteration (s): 0.15 | learning rate: 9.661E-05 | global batch size: 256 | lm loss: 3.694540E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.550 | TFLOPs: 26.21 | +7: iteration 95790/ 173500 | consumed samples: 24522240 | consumed tokens: 50221547520 | elapsed time per iteration (s): 0.15 | learning rate: 9.660E-05 | global batch size: 256 | lm loss: 3.700888E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.962 | TFLOPs: 26.20 | +7: iteration 95800/ 173500 | consumed samples: 24524800 | consumed tokens: 50226790400 | elapsed time per iteration (s): 0.15 | learning rate: 9.658E-05 | global batch size: 256 | lm loss: 3.682834E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.441 | TFLOPs: 26.20 | +7: iteration 95810/ 173500 | consumed samples: 24527360 | consumed tokens: 50232033280 | elapsed time per iteration (s): 0.15 | learning rate: 9.657E-05 | global batch size: 256 | lm loss: 3.699931E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.463 | TFLOPs: 26.20 | +7: iteration 95820/ 173500 | consumed samples: 24529920 | consumed tokens: 50237276160 | elapsed time per iteration (s): 0.15 | learning rate: 9.655E-05 | global batch size: 256 | lm loss: 3.698495E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.319 | TFLOPs: 26.23 | +7: iteration 95830/ 173500 | consumed samples: 24532480 | consumed tokens: 50242519040 | elapsed time per iteration (s): 0.15 | learning rate: 9.653E-05 | global batch size: 256 | lm loss: 3.688248E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.140 | TFLOPs: 26.24 | +7: iteration 95840/ 173500 | consumed samples: 24535040 | consumed tokens: 50247761920 | elapsed time per iteration (s): 0.17 | learning rate: 9.652E-05 | global batch size: 256 | lm loss: 3.689409E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1478.692 | TFLOPs: 23.19 | +7: iteration 95850/ 173500 | consumed samples: 24537600 | consumed tokens: 50253004800 | elapsed time per iteration (s): 0.15 | learning rate: 9.650E-05 | global batch size: 256 | lm loss: 3.680327E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.470 | TFLOPs: 26.28 | +7: iteration 95860/ 173500 | consumed samples: 24540160 | consumed tokens: 50258247680 | elapsed time per iteration (s): 0.15 | learning rate: 9.648E-05 | global batch size: 256 | lm loss: 3.698471E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.631 | TFLOPs: 26.26 | +7: iteration 95870/ 173500 | consumed samples: 24542720 | consumed tokens: 50263490560 | elapsed time per iteration (s): 0.15 | learning rate: 9.647E-05 | global batch size: 256 | lm loss: 3.699390E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.475 | TFLOPs: 26.28 | +7: iteration 95880/ 173500 | consumed samples: 24545280 | consumed tokens: 50268733440 | elapsed time per iteration (s): 0.15 | learning rate: 9.645E-05 | global batch size: 256 | lm loss: 3.689926E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.933 | TFLOPs: 26.27 | +7: iteration 95890/ 173500 | consumed samples: 24547840 | consumed tokens: 50273976320 | elapsed time per iteration (s): 0.15 | learning rate: 9.643E-05 | global batch size: 256 | lm loss: 3.704646E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.507 | TFLOPs: 26.26 | +7: iteration 95900/ 173500 | consumed samples: 24550400 | consumed tokens: 50279219200 | elapsed time per iteration (s): 0.17 | learning rate: 9.642E-05 | global batch size: 256 | lm loss: 3.698415E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.620 | TFLOPs: 23.14 | +7: iteration 95910/ 173500 | consumed samples: 24552960 | consumed tokens: 50284462080 | elapsed time per iteration (s): 0.15 | learning rate: 9.640E-05 | global batch size: 256 | lm loss: 3.707030E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.860 | TFLOPs: 26.22 | +7: iteration 95920/ 173500 | consumed samples: 24555520 | consumed tokens: 50289704960 | elapsed time per iteration (s): 0.15 | learning rate: 9.639E-05 | global batch size: 256 | lm loss: 3.702984E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.412 | TFLOPs: 26.27 | +7: iteration 95930/ 173500 | consumed samples: 24558080 | consumed tokens: 50294947840 | elapsed time per iteration (s): 0.15 | learning rate: 9.637E-05 | global batch size: 256 | lm loss: 3.690615E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.761 | TFLOPs: 26.26 | +7: iteration 95940/ 173500 | consumed samples: 24560640 | consumed tokens: 50300190720 | elapsed time per iteration (s): 0.15 | learning rate: 9.635E-05 | global batch size: 256 | lm loss: 3.698714E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.091 | TFLOPs: 26.25 | +7: iteration 95950/ 173500 | consumed samples: 24563200 | consumed tokens: 50305433600 | elapsed time per iteration (s): 0.15 | learning rate: 9.634E-05 | global batch size: 256 | lm loss: 3.694074E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.597 | TFLOPs: 26.26 | +7: iteration 95960/ 173500 | consumed samples: 24565760 | consumed tokens: 50310676480 | elapsed time per iteration (s): 0.15 | learning rate: 9.632E-05 | global batch size: 256 | lm loss: 3.693640E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.193 | TFLOPs: 26.21 | +7: iteration 95970/ 173500 | consumed samples: 24568320 | consumed tokens: 50315919360 | elapsed time per iteration (s): 0.17 | learning rate: 9.630E-05 | global batch size: 256 | lm loss: 3.701349E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.207 | TFLOPs: 23.13 | +7: iteration 95980/ 173500 | consumed samples: 24570880 | consumed tokens: 50321162240 | elapsed time per iteration (s): 0.15 | learning rate: 9.629E-05 | global batch size: 256 | lm loss: 3.690384E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.630 | TFLOPs: 26.18 | +7: iteration 95990/ 173500 | consumed samples: 24573440 | consumed tokens: 50326405120 | elapsed time per iteration (s): 0.15 | learning rate: 9.627E-05 | global batch size: 256 | lm loss: 3.708689E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.648 | TFLOPs: 26.20 | +0: [2023-03-17 04:24:55,281] [INFO] [logging.py:68:log_dist] [Rank 0] step=96000, skipped=0, lr=[9.625601507010446e-05, 9.625601507010446e-05, 9.625601507010446e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 96000/ 173500 | consumed samples: 24576000 | consumed tokens: 50331648000 | elapsed time per iteration (s): 0.15 | learning rate: 9.626E-05 | global batch size: 256 | lm loss: 3.697423E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.497 | TFLOPs: 26.23 | +0: steps: 96000 loss: 3.7183 iter time (s): 0.154 samples/sec: 1662.972 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 96000 | lm loss value: 3.836501E+00 | lm loss PPL: 4.636296E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 96000 to checkpoints_44m91b100m +0: [2023-03-17 04:24:55,355] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step96000 is begin to save! +0: [2023-03-17 04:24:55,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:24:55,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:24:55,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:24:55,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:24:55,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:24:55,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:24:55,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:24:55,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:24:55,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:24:55,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:24:55,456] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:24:55,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:24:55,465] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:24:55,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:24:55,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:24:55,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:24:55,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:24:55,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:24:55,489] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:24:55,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:24:55,490] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step96000/mp_rank_00_model_states.pt +0: [2023-03-17 04:24:55,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:24:55,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:24:55,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:24:55,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:24:55,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +7: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +1: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +2: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +5: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +6: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:24:55,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +3: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:24:55,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:24:55,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step96000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +4: [2023-03-17 04:24:55,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step96000 is ready now! +0: successfully saved checkpoint at iteration 96000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.57 +7: iteration 96010/ 173500 | consumed samples: 24578560 | consumed tokens: 50336890880 | elapsed time per iteration (s): 0.18 | learning rate: 9.624E-05 | global batch size: 256 | lm loss: 3.697787E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.069 | TFLOPs: 22.55 | +7: iteration 96020/ 173500 | consumed samples: 24581120 | consumed tokens: 50342133760 | elapsed time per iteration (s): 0.15 | learning rate: 9.622E-05 | global batch size: 256 | lm loss: 3.694368E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.770 | TFLOPs: 26.17 | +7: iteration 96030/ 173500 | consumed samples: 24583680 | consumed tokens: 50347376640 | elapsed time per iteration (s): 0.15 | learning rate: 9.621E-05 | global batch size: 256 | lm loss: 3.689344E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.044 | TFLOPs: 26.13 | +7: iteration 96040/ 173500 | consumed samples: 24586240 | consumed tokens: 50352619520 | elapsed time per iteration (s): 0.15 | learning rate: 9.619E-05 | global batch size: 256 | lm loss: 3.698168E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.443 | TFLOPs: 26.02 | +7: iteration 96050/ 173500 | consumed samples: 24588800 | consumed tokens: 50357862400 | elapsed time per iteration (s): 0.15 | learning rate: 9.617E-05 | global batch size: 256 | lm loss: 3.692017E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.587 | TFLOPs: 26.14 | +7: iteration 96060/ 173500 | consumed samples: 24591360 | consumed tokens: 50363105280 | elapsed time per iteration (s): 0.15 | learning rate: 9.616E-05 | global batch size: 256 | lm loss: 3.694310E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.535 | TFLOPs: 26.15 | +7: iteration 96070/ 173500 | consumed samples: 24593920 | consumed tokens: 50368348160 | elapsed time per iteration (s): 0.15 | learning rate: 9.614E-05 | global batch size: 256 | lm loss: 3.678672E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.414 | TFLOPs: 26.15 | +7: iteration 96080/ 173500 | consumed samples: 24596480 | consumed tokens: 50373591040 | elapsed time per iteration (s): 0.15 | learning rate: 9.613E-05 | global batch size: 256 | lm loss: 3.687652E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.464 | TFLOPs: 26.15 | +7: iteration 96090/ 173500 | consumed samples: 24599040 | consumed tokens: 50378833920 | elapsed time per iteration (s): 0.17 | learning rate: 9.611E-05 | global batch size: 256 | lm loss: 3.690846E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.157 | TFLOPs: 23.20 | +7: iteration 96100/ 173500 | consumed samples: 24601600 | consumed tokens: 50384076800 | elapsed time per iteration (s): 0.15 | learning rate: 9.609E-05 | global batch size: 256 | lm loss: 3.683204E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.661 | TFLOPs: 26.15 | +7: iteration 96110/ 173500 | consumed samples: 24604160 | consumed tokens: 50389319680 | elapsed time per iteration (s): 0.16 | learning rate: 9.608E-05 | global batch size: 256 | lm loss: 3.706797E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.203 | TFLOPs: 25.36 | +7: iteration 96120/ 173500 | consumed samples: 24606720 | consumed tokens: 50394562560 | elapsed time per iteration (s): 0.15 | learning rate: 9.606E-05 | global batch size: 256 | lm loss: 3.679331E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.321 | TFLOPs: 26.10 | +7: iteration 96130/ 173500 | consumed samples: 24609280 | consumed tokens: 50399805440 | elapsed time per iteration (s): 0.15 | learning rate: 9.604E-05 | global batch size: 256 | lm loss: 3.687175E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.622 | TFLOPs: 26.14 | +7: iteration 96140/ 173500 | consumed samples: 24611840 | consumed tokens: 50405048320 | elapsed time per iteration (s): 0.15 | learning rate: 9.603E-05 | global batch size: 256 | lm loss: 3.688028E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.004 | TFLOPs: 26.16 | +7: iteration 96150/ 173500 | consumed samples: 24614400 | consumed tokens: 50410291200 | elapsed time per iteration (s): 0.15 | learning rate: 9.601E-05 | global batch size: 256 | lm loss: 3.668004E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.005 | TFLOPs: 26.14 | +7: iteration 96160/ 173500 | consumed samples: 24616960 | consumed tokens: 50415534080 | elapsed time per iteration (s): 0.17 | learning rate: 9.600E-05 | global batch size: 256 | lm loss: 3.694711E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1470.309 | TFLOPs: 23.06 | +7: iteration 96170/ 173500 | consumed samples: 24619520 | consumed tokens: 50420776960 | elapsed time per iteration (s): 0.15 | learning rate: 9.598E-05 | global batch size: 256 | lm loss: 3.713927E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.566 | TFLOPs: 26.17 | +7: iteration 96180/ 173500 | consumed samples: 24622080 | consumed tokens: 50426019840 | elapsed time per iteration (s): 0.15 | learning rate: 9.596E-05 | global batch size: 256 | lm loss: 3.694817E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.346 | TFLOPs: 26.15 | +7: iteration 96190/ 173500 | consumed samples: 24624640 | consumed tokens: 50431262720 | elapsed time per iteration (s): 0.15 | learning rate: 9.595E-05 | global batch size: 256 | lm loss: 3.702801E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.483 | TFLOPs: 26.10 | +7: iteration 96200/ 173500 | consumed samples: 24627200 | consumed tokens: 50436505600 | elapsed time per iteration (s): 0.15 | learning rate: 9.593E-05 | global batch size: 256 | lm loss: 3.682637E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.029 | TFLOPs: 26.10 | +7: iteration 96210/ 173500 | consumed samples: 24629760 | consumed tokens: 50441748480 | elapsed time per iteration (s): 0.16 | learning rate: 9.591E-05 | global batch size: 256 | lm loss: 3.699847E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.100 | TFLOPs: 25.83 | +7: iteration 96220/ 173500 | consumed samples: 24632320 | consumed tokens: 50446991360 | elapsed time per iteration (s): 0.17 | learning rate: 9.590E-05 | global batch size: 256 | lm loss: 3.700089E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1468.428 | TFLOPs: 23.03 | +7: iteration 96230/ 173500 | consumed samples: 24634880 | consumed tokens: 50452234240 | elapsed time per iteration (s): 0.15 | learning rate: 9.588E-05 | global batch size: 256 | lm loss: 3.692213E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.300 | TFLOPs: 26.18 | +7: iteration 96240/ 173500 | consumed samples: 24637440 | consumed tokens: 50457477120 | elapsed time per iteration (s): 0.15 | learning rate: 9.587E-05 | global batch size: 256 | lm loss: 3.689224E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.579 | TFLOPs: 26.18 | +7: iteration 96250/ 173500 | consumed samples: 24640000 | consumed tokens: 50462720000 | elapsed time per iteration (s): 0.15 | learning rate: 9.585E-05 | global batch size: 256 | lm loss: 3.709312E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.153 | TFLOPs: 26.16 | +7: iteration 96260/ 173500 | consumed samples: 24642560 | consumed tokens: 50467962880 | elapsed time per iteration (s): 0.15 | learning rate: 9.583E-05 | global batch size: 256 | lm loss: 3.700988E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.264 | TFLOPs: 26.16 | +7: iteration 96270/ 173500 | consumed samples: 24645120 | consumed tokens: 50473205760 | elapsed time per iteration (s): 0.15 | learning rate: 9.582E-05 | global batch size: 256 | lm loss: 3.699785E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.649 | TFLOPs: 26.18 | +7: iteration 96280/ 173500 | consumed samples: 24647680 | consumed tokens: 50478448640 | elapsed time per iteration (s): 0.17 | learning rate: 9.580E-05 | global batch size: 256 | lm loss: 3.690284E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1469.095 | TFLOPs: 23.04 | +7: iteration 96290/ 173500 | consumed samples: 24650240 | consumed tokens: 50483691520 | elapsed time per iteration (s): 0.15 | learning rate: 9.578E-05 | global batch size: 256 | lm loss: 3.701030E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.934 | TFLOPs: 26.25 | +7: iteration 96300/ 173500 | consumed samples: 24652800 | consumed tokens: 50488934400 | elapsed time per iteration (s): 0.15 | learning rate: 9.577E-05 | global batch size: 256 | lm loss: 3.707877E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.916 | TFLOPs: 26.25 | +7: iteration 96310/ 173500 | consumed samples: 24655360 | consumed tokens: 50494177280 | elapsed time per iteration (s): 0.15 | learning rate: 9.575E-05 | global batch size: 256 | lm loss: 3.701362E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.106 | TFLOPs: 26.22 | +7: iteration 96320/ 173500 | consumed samples: 24657920 | consumed tokens: 50499420160 | elapsed time per iteration (s): 0.16 | learning rate: 9.574E-05 | global batch size: 256 | lm loss: 3.689013E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.358 | TFLOPs: 25.88 | +7: iteration 96330/ 173500 | consumed samples: 24660480 | consumed tokens: 50504663040 | elapsed time per iteration (s): 0.15 | learning rate: 9.572E-05 | global batch size: 256 | lm loss: 3.708723E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.366 | TFLOPs: 26.27 | +7: iteration 96340/ 173500 | consumed samples: 24663040 | consumed tokens: 50509905920 | elapsed time per iteration (s): 0.15 | learning rate: 9.570E-05 | global batch size: 256 | lm loss: 3.683913E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.554 | TFLOPs: 26.25 | +7: iteration 96350/ 173500 | consumed samples: 24665600 | consumed tokens: 50515148800 | elapsed time per iteration (s): 0.19 | learning rate: 9.569E-05 | global batch size: 256 | lm loss: 3.692765E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1323.979 | TFLOPs: 20.76 | +7: iteration 96360/ 173500 | consumed samples: 24668160 | consumed tokens: 50520391680 | elapsed time per iteration (s): 0.15 | learning rate: 9.567E-05 | global batch size: 256 | lm loss: 3.701216E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.877 | TFLOPs: 26.27 | +7: iteration 96370/ 173500 | consumed samples: 24670720 | consumed tokens: 50525634560 | elapsed time per iteration (s): 0.15 | learning rate: 9.565E-05 | global batch size: 256 | lm loss: 3.679568E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.602 | TFLOPs: 26.23 | +7: iteration 96380/ 173500 | consumed samples: 24673280 | consumed tokens: 50530877440 | elapsed time per iteration (s): 0.15 | learning rate: 9.564E-05 | global batch size: 256 | lm loss: 3.700942E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.502 | TFLOPs: 26.20 | +7: iteration 96390/ 173500 | consumed samples: 24675840 | consumed tokens: 50536120320 | elapsed time per iteration (s): 0.15 | learning rate: 9.562E-05 | global batch size: 256 | lm loss: 3.699098E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.851 | TFLOPs: 26.20 | +7: iteration 96400/ 173500 | consumed samples: 24678400 | consumed tokens: 50541363200 | elapsed time per iteration (s): 0.15 | learning rate: 9.561E-05 | global batch size: 256 | lm loss: 3.706826E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.517 | TFLOPs: 26.20 | +7: iteration 96410/ 173500 | consumed samples: 24680960 | consumed tokens: 50546606080 | elapsed time per iteration (s): 0.15 | learning rate: 9.559E-05 | global batch size: 256 | lm loss: 3.691144E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.691 | TFLOPs: 26.20 | +7: iteration 96420/ 173500 | consumed samples: 24683520 | consumed tokens: 50551848960 | elapsed time per iteration (s): 0.15 | learning rate: 9.557E-05 | global batch size: 256 | lm loss: 3.694546E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.652 | TFLOPs: 26.20 | +7: iteration 96430/ 173500 | consumed samples: 24686080 | consumed tokens: 50557091840 | elapsed time per iteration (s): 0.15 | learning rate: 9.556E-05 | global batch size: 256 | lm loss: 3.704283E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.873 | TFLOPs: 26.17 | +7: iteration 96440/ 173500 | consumed samples: 24688640 | consumed tokens: 50562334720 | elapsed time per iteration (s): 0.15 | learning rate: 9.554E-05 | global batch size: 256 | lm loss: 3.680909E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.556 | TFLOPs: 26.20 | +7: iteration 96450/ 173500 | consumed samples: 24691200 | consumed tokens: 50567577600 | elapsed time per iteration (s): 0.15 | learning rate: 9.552E-05 | global batch size: 256 | lm loss: 3.700557E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.945 | TFLOPs: 26.17 | +7: iteration 96460/ 173500 | consumed samples: 24693760 | consumed tokens: 50572820480 | elapsed time per iteration (s): 0.15 | learning rate: 9.551E-05 | global batch size: 256 | lm loss: 3.691953E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.159 | TFLOPs: 26.19 | +7: iteration 96470/ 173500 | consumed samples: 24696320 | consumed tokens: 50578063360 | elapsed time per iteration (s): 0.15 | learning rate: 9.549E-05 | global batch size: 256 | lm loss: 3.692067E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.488 | TFLOPs: 26.20 | +7: iteration 96480/ 173500 | consumed samples: 24698880 | consumed tokens: 50583306240 | elapsed time per iteration (s): 0.19 | learning rate: 9.548E-05 | global batch size: 256 | lm loss: 3.700816E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1318.220 | TFLOPs: 20.67 | +7: iteration 96490/ 173500 | consumed samples: 24701440 | consumed tokens: 50588549120 | elapsed time per iteration (s): 0.15 | learning rate: 9.546E-05 | global batch size: 256 | lm loss: 3.699535E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.068 | TFLOPs: 26.27 | +7: iteration 96500/ 173500 | consumed samples: 24704000 | consumed tokens: 50593792000 | elapsed time per iteration (s): 0.15 | learning rate: 9.544E-05 | global batch size: 256 | lm loss: 3.691296E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.058 | TFLOPs: 26.24 | +7: iteration 96510/ 173500 | consumed samples: 24706560 | consumed tokens: 50599034880 | elapsed time per iteration (s): 0.15 | learning rate: 9.543E-05 | global batch size: 256 | lm loss: 3.697288E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.124 | TFLOPs: 26.24 | +7: iteration 96520/ 173500 | consumed samples: 24709120 | consumed tokens: 50604277760 | elapsed time per iteration (s): 0.15 | learning rate: 9.541E-05 | global batch size: 256 | lm loss: 3.685695E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.829 | TFLOPs: 26.22 | +7: iteration 96530/ 173500 | consumed samples: 24711680 | consumed tokens: 50609520640 | elapsed time per iteration (s): 0.16 | learning rate: 9.539E-05 | global batch size: 256 | lm loss: 3.691179E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.740 | TFLOPs: 25.10 | +7: iteration 96540/ 173500 | consumed samples: 24714240 | consumed tokens: 50614763520 | elapsed time per iteration (s): 0.17 | learning rate: 9.538E-05 | global batch size: 256 | lm loss: 3.683194E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.532 | TFLOPs: 23.20 | +7: iteration 96550/ 173500 | consumed samples: 24716800 | consumed tokens: 50620006400 | elapsed time per iteration (s): 0.15 | learning rate: 9.536E-05 | global batch size: 256 | lm loss: 3.694276E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.171 | TFLOPs: 26.21 | +7: iteration 96560/ 173500 | consumed samples: 24719360 | consumed tokens: 50625249280 | elapsed time per iteration (s): 0.15 | learning rate: 9.535E-05 | global batch size: 256 | lm loss: 3.696715E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.593 | TFLOPs: 26.21 | +7: iteration 96570/ 173500 | consumed samples: 24721920 | consumed tokens: 50630492160 | elapsed time per iteration (s): 0.15 | learning rate: 9.533E-05 | global batch size: 256 | lm loss: 3.709768E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.583 | TFLOPs: 26.21 | +7: iteration 96580/ 173500 | consumed samples: 24724480 | consumed tokens: 50635735040 | elapsed time per iteration (s): 0.15 | learning rate: 9.531E-05 | global batch size: 256 | lm loss: 3.703978E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.804 | TFLOPs: 25.97 | +7: iteration 96590/ 173500 | consumed samples: 24727040 | consumed tokens: 50640977920 | elapsed time per iteration (s): 0.15 | learning rate: 9.530E-05 | global batch size: 256 | lm loss: 3.687893E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.096 | TFLOPs: 26.18 | +7: iteration 96600/ 173500 | consumed samples: 24729600 | consumed tokens: 50646220800 | elapsed time per iteration (s): 0.15 | learning rate: 9.528E-05 | global batch size: 256 | lm loss: 3.689545E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.854 | TFLOPs: 26.22 | +7: iteration 96610/ 173500 | consumed samples: 24732160 | consumed tokens: 50651463680 | elapsed time per iteration (s): 0.20 | learning rate: 9.526E-05 | global batch size: 256 | lm loss: 3.709111E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1308.248 | TFLOPs: 20.52 | +7: iteration 96620/ 173500 | consumed samples: 24734720 | consumed tokens: 50656706560 | elapsed time per iteration (s): 0.15 | learning rate: 9.525E-05 | global batch size: 256 | lm loss: 3.704863E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.257 | TFLOPs: 26.19 | +7: iteration 96630/ 173500 | consumed samples: 24737280 | consumed tokens: 50661949440 | elapsed time per iteration (s): 0.16 | learning rate: 9.523E-05 | global batch size: 256 | lm loss: 3.687037E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.748 | TFLOPs: 25.46 | +7: iteration 96640/ 173500 | consumed samples: 24739840 | consumed tokens: 50667192320 | elapsed time per iteration (s): 0.15 | learning rate: 9.522E-05 | global batch size: 256 | lm loss: 3.698819E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.125 | TFLOPs: 26.27 | +7: iteration 96650/ 173500 | consumed samples: 24742400 | consumed tokens: 50672435200 | elapsed time per iteration (s): 0.15 | learning rate: 9.520E-05 | global batch size: 256 | lm loss: 3.688935E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.006 | TFLOPs: 26.27 | +7: iteration 96660/ 173500 | consumed samples: 24744960 | consumed tokens: 50677678080 | elapsed time per iteration (s): 0.16 | learning rate: 9.518E-05 | global batch size: 256 | lm loss: 3.702325E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.459 | TFLOPs: 25.24 | +7: iteration 96670/ 173500 | consumed samples: 24747520 | consumed tokens: 50682920960 | elapsed time per iteration (s): 0.17 | learning rate: 9.517E-05 | global batch size: 256 | lm loss: 3.688636E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1479.690 | TFLOPs: 23.21 | +7: iteration 96680/ 173500 | consumed samples: 24750080 | consumed tokens: 50688163840 | elapsed time per iteration (s): 0.15 | learning rate: 9.515E-05 | global batch size: 256 | lm loss: 3.693392E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.385 | TFLOPs: 26.26 | +7: iteration 96690/ 173500 | consumed samples: 24752640 | consumed tokens: 50693406720 | elapsed time per iteration (s): 0.15 | learning rate: 9.513E-05 | global batch size: 256 | lm loss: 3.711255E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.882 | TFLOPs: 26.03 | +7: iteration 96700/ 173500 | consumed samples: 24755200 | consumed tokens: 50698649600 | elapsed time per iteration (s): 0.15 | learning rate: 9.512E-05 | global batch size: 256 | lm loss: 3.707366E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.919 | TFLOPs: 26.27 | +7: iteration 96710/ 173500 | consumed samples: 24757760 | consumed tokens: 50703892480 | elapsed time per iteration (s): 0.15 | learning rate: 9.510E-05 | global batch size: 256 | lm loss: 3.695241E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.756 | TFLOPs: 26.23 | +7: iteration 96720/ 173500 | consumed samples: 24760320 | consumed tokens: 50709135360 | elapsed time per iteration (s): 0.16 | learning rate: 9.509E-05 | global batch size: 256 | lm loss: 3.686754E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.737 | TFLOPs: 25.76 | +7: iteration 96730/ 173500 | consumed samples: 24762880 | consumed tokens: 50714378240 | elapsed time per iteration (s): 0.18 | learning rate: 9.507E-05 | global batch size: 256 | lm loss: 3.713703E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.125 | TFLOPs: 22.73 | +7: iteration 96740/ 173500 | consumed samples: 24765440 | consumed tokens: 50719621120 | elapsed time per iteration (s): 0.15 | learning rate: 9.505E-05 | global batch size: 256 | lm loss: 3.687077E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.195 | TFLOPs: 26.27 | +7: iteration 96750/ 173500 | consumed samples: 24768000 | consumed tokens: 50724864000 | elapsed time per iteration (s): 0.15 | learning rate: 9.504E-05 | global batch size: 256 | lm loss: 3.702811E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.371 | TFLOPs: 26.04 | +7: iteration 96760/ 173500 | consumed samples: 24770560 | consumed tokens: 50730106880 | elapsed time per iteration (s): 0.16 | learning rate: 9.502E-05 | global batch size: 256 | lm loss: 3.683894E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.664 | TFLOPs: 25.35 | +7: iteration 96770/ 173500 | consumed samples: 24773120 | consumed tokens: 50735349760 | elapsed time per iteration (s): 0.15 | learning rate: 9.500E-05 | global batch size: 256 | lm loss: 3.681063E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.813 | TFLOPs: 26.08 | +7: iteration 96780/ 173500 | consumed samples: 24775680 | consumed tokens: 50740592640 | elapsed time per iteration (s): 0.16 | learning rate: 9.499E-05 | global batch size: 256 | lm loss: 3.699636E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.243 | TFLOPs: 25.80 | +7: iteration 96790/ 173500 | consumed samples: 24778240 | consumed tokens: 50745835520 | elapsed time per iteration (s): 0.15 | learning rate: 9.497E-05 | global batch size: 256 | lm loss: 3.681416E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.464 | TFLOPs: 26.26 | +7: iteration 96800/ 173500 | consumed samples: 24780800 | consumed tokens: 50751078400 | elapsed time per iteration (s): 0.15 | learning rate: 9.496E-05 | global batch size: 256 | lm loss: 3.695164E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.406 | TFLOPs: 26.26 | +7: iteration 96810/ 173500 | consumed samples: 24783360 | consumed tokens: 50756321280 | elapsed time per iteration (s): 0.15 | learning rate: 9.494E-05 | global batch size: 256 | lm loss: 3.690351E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.224 | TFLOPs: 26.27 | +7: iteration 96820/ 173500 | consumed samples: 24785920 | consumed tokens: 50761564160 | elapsed time per iteration (s): 0.15 | learning rate: 9.492E-05 | global batch size: 256 | lm loss: 3.703811E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.797 | TFLOPs: 26.25 | +7: iteration 96830/ 173500 | consumed samples: 24788480 | consumed tokens: 50766807040 | elapsed time per iteration (s): 0.15 | learning rate: 9.491E-05 | global batch size: 256 | lm loss: 3.694968E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.624 | TFLOPs: 26.12 | +7: iteration 96840/ 173500 | consumed samples: 24791040 | consumed tokens: 50772049920 | elapsed time per iteration (s): 0.15 | learning rate: 9.489E-05 | global batch size: 256 | lm loss: 3.701039E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.880 | TFLOPs: 26.11 | +7: iteration 96850/ 173500 | consumed samples: 24793600 | consumed tokens: 50777292800 | elapsed time per iteration (s): 0.15 | learning rate: 9.487E-05 | global batch size: 256 | lm loss: 3.694816E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.406 | TFLOPs: 26.12 | +7: iteration 96860/ 173500 | consumed samples: 24796160 | consumed tokens: 50782535680 | elapsed time per iteration (s): 0.16 | learning rate: 9.486E-05 | global batch size: 256 | lm loss: 3.675350E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.698 | TFLOPs: 25.78 | +7: iteration 96870/ 173500 | consumed samples: 24798720 | consumed tokens: 50787778560 | elapsed time per iteration (s): 0.15 | learning rate: 9.484E-05 | global batch size: 256 | lm loss: 3.699553E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.099 | TFLOPs: 26.13 | +7: iteration 96880/ 173500 | consumed samples: 24801280 | consumed tokens: 50793021440 | elapsed time per iteration (s): 0.15 | learning rate: 9.483E-05 | global batch size: 256 | lm loss: 3.696736E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.573 | TFLOPs: 26.12 | +7: iteration 96890/ 173500 | consumed samples: 24803840 | consumed tokens: 50798264320 | elapsed time per iteration (s): 0.16 | learning rate: 9.481E-05 | global batch size: 256 | lm loss: 3.701861E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.011 | TFLOPs: 25.64 | +7: iteration 96900/ 173500 | consumed samples: 24806400 | consumed tokens: 50803507200 | elapsed time per iteration (s): 0.16 | learning rate: 9.479E-05 | global batch size: 256 | lm loss: 3.695950E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.924 | TFLOPs: 25.25 | +7: iteration 96910/ 173500 | consumed samples: 24808960 | consumed tokens: 50808750080 | elapsed time per iteration (s): 0.15 | learning rate: 9.478E-05 | global batch size: 256 | lm loss: 3.698796E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.408 | TFLOPs: 26.13 | +7: iteration 96920/ 173500 | consumed samples: 24811520 | consumed tokens: 50813992960 | elapsed time per iteration (s): 0.16 | learning rate: 9.476E-05 | global batch size: 256 | lm loss: 3.700630E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.540 | TFLOPs: 25.43 | +7: iteration 96930/ 173500 | consumed samples: 24814080 | consumed tokens: 50819235840 | elapsed time per iteration (s): 0.15 | learning rate: 9.475E-05 | global batch size: 256 | lm loss: 3.701828E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.338 | TFLOPs: 26.10 | +7: iteration 96940/ 173500 | consumed samples: 24816640 | consumed tokens: 50824478720 | elapsed time per iteration (s): 0.16 | learning rate: 9.473E-05 | global batch size: 256 | lm loss: 3.686850E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.562 | TFLOPs: 25.62 | +7: iteration 96950/ 173500 | consumed samples: 24819200 | consumed tokens: 50829721600 | elapsed time per iteration (s): 0.16 | learning rate: 9.471E-05 | global batch size: 256 | lm loss: 3.698603E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.185 | TFLOPs: 25.49 | +7: iteration 96960/ 173500 | consumed samples: 24821760 | consumed tokens: 50834964480 | elapsed time per iteration (s): 0.16 | learning rate: 9.470E-05 | global batch size: 256 | lm loss: 3.693108E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.030 | TFLOPs: 25.42 | +7: iteration 96970/ 173500 | consumed samples: 24824320 | consumed tokens: 50840207360 | elapsed time per iteration (s): 0.16 | learning rate: 9.468E-05 | global batch size: 256 | lm loss: 3.693038E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.601 | TFLOPs: 25.85 | +7: iteration 96980/ 173500 | consumed samples: 24826880 | consumed tokens: 50845450240 | elapsed time per iteration (s): 0.16 | learning rate: 9.466E-05 | global batch size: 256 | lm loss: 3.692692E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.340 | TFLOPs: 25.29 | +7: iteration 96990/ 173500 | consumed samples: 24829440 | consumed tokens: 50850693120 | elapsed time per iteration (s): 0.17 | learning rate: 9.465E-05 | global batch size: 256 | lm loss: 3.695847E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.305 | TFLOPs: 23.09 | +7: iteration 97000/ 173500 | consumed samples: 24832000 | consumed tokens: 50855936000 | elapsed time per iteration (s): 0.16 | learning rate: 9.463E-05 | global batch size: 256 | lm loss: 3.692131E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.845 | TFLOPs: 25.51 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 97000 | lm loss value: 3.831567E+00 | lm loss PPL: 4.613476E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 97000 to checkpoints_44m91b100m +0: [2023-03-17 04:27:32,567] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step97000 is begin to save! +0: [2023-03-17 04:27:32,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:27:32,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:27:32,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:27:32,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:27:32,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:27:32,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:27:32,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:27:32,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:27:32,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:27:32,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:27:32,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:27:32,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:27:32,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:27:32,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:27:32,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:27:32,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:27:32,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:27:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:27:32,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:27:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:27:32,701] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step97000/mp_rank_00_model_states.pt +0: [2023-03-17 04:27:32,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:27:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:27:32,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:27:32,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:27:32,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +3: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:27:32,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:27:32,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +7: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +5: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +2: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +4: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +1: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: [2023-03-17 04:27:32,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:27:32,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step97000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:27:32,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step97000 is ready now! +0: successfully saved checkpoint at iteration 97000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.62 +7: iteration 97010/ 173500 | consumed samples: 24834560 | consumed tokens: 50861178880 | elapsed time per iteration (s): 0.18 | learning rate: 9.462E-05 | global batch size: 256 | lm loss: 3.686964E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1458.625 | TFLOPs: 22.87 | +7: iteration 97020/ 173500 | consumed samples: 24837120 | consumed tokens: 50866421760 | elapsed time per iteration (s): 0.15 | learning rate: 9.460E-05 | global batch size: 256 | lm loss: 3.703405E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.998 | TFLOPs: 26.10 | +7: iteration 97030/ 173500 | consumed samples: 24839680 | consumed tokens: 50871664640 | elapsed time per iteration (s): 0.15 | learning rate: 9.458E-05 | global batch size: 256 | lm loss: 3.688564E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.684 | TFLOPs: 26.26 | +7: iteration 97040/ 173500 | consumed samples: 24842240 | consumed tokens: 50876907520 | elapsed time per iteration (s): 0.16 | learning rate: 9.457E-05 | global batch size: 256 | lm loss: 3.697355E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.379 | TFLOPs: 25.65 | +7: iteration 97050/ 173500 | consumed samples: 24844800 | consumed tokens: 50882150400 | elapsed time per iteration (s): 0.16 | learning rate: 9.455E-05 | global batch size: 256 | lm loss: 3.684048E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.489 | TFLOPs: 25.87 | +7: iteration 97060/ 173500 | consumed samples: 24847360 | consumed tokens: 50887393280 | elapsed time per iteration (s): 0.16 | learning rate: 9.453E-05 | global batch size: 256 | lm loss: 3.696410E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.745 | TFLOPs: 25.34 | +7: iteration 97070/ 173500 | consumed samples: 24849920 | consumed tokens: 50892636160 | elapsed time per iteration (s): 0.16 | learning rate: 9.452E-05 | global batch size: 256 | lm loss: 3.699132E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.901 | TFLOPs: 24.82 | +7: iteration 97080/ 173500 | consumed samples: 24852480 | consumed tokens: 50897879040 | elapsed time per iteration (s): 0.16 | learning rate: 9.450E-05 | global batch size: 256 | lm loss: 3.703671E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.863 | TFLOPs: 25.59 | +7: iteration 97090/ 173500 | consumed samples: 24855040 | consumed tokens: 50903121920 | elapsed time per iteration (s): 0.16 | learning rate: 9.449E-05 | global batch size: 256 | lm loss: 3.685691E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.390 | TFLOPs: 25.84 | +7: iteration 97100/ 173500 | consumed samples: 24857600 | consumed tokens: 50908364800 | elapsed time per iteration (s): 0.15 | learning rate: 9.447E-05 | global batch size: 256 | lm loss: 3.695298E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.510 | TFLOPs: 25.92 | +7: iteration 97110/ 173500 | consumed samples: 24860160 | consumed tokens: 50913607680 | elapsed time per iteration (s): 0.16 | learning rate: 9.445E-05 | global batch size: 256 | lm loss: 3.690250E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.597 | TFLOPs: 25.87 | +7: iteration 97120/ 173500 | consumed samples: 24862720 | consumed tokens: 50918850560 | elapsed time per iteration (s): 0.19 | learning rate: 9.444E-05 | global batch size: 256 | lm loss: 3.689878E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1319.094 | TFLOPs: 20.69 | +7: iteration 97130/ 173500 | consumed samples: 24865280 | consumed tokens: 50924093440 | elapsed time per iteration (s): 0.15 | learning rate: 9.442E-05 | global batch size: 256 | lm loss: 3.705554E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.522 | TFLOPs: 26.29 | +7: iteration 97140/ 173500 | consumed samples: 24867840 | consumed tokens: 50929336320 | elapsed time per iteration (s): 0.16 | learning rate: 9.440E-05 | global batch size: 256 | lm loss: 3.693961E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.918 | TFLOPs: 25.36 | +7: iteration 97150/ 173500 | consumed samples: 24870400 | consumed tokens: 50934579200 | elapsed time per iteration (s): 0.15 | learning rate: 9.439E-05 | global batch size: 256 | lm loss: 3.696618E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.883 | TFLOPs: 26.23 | +7: iteration 97160/ 173500 | consumed samples: 24872960 | consumed tokens: 50939822080 | elapsed time per iteration (s): 0.15 | learning rate: 9.437E-05 | global batch size: 256 | lm loss: 3.708148E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.283 | TFLOPs: 26.23 | +7: iteration 97170/ 173500 | consumed samples: 24875520 | consumed tokens: 50945064960 | elapsed time per iteration (s): 0.16 | learning rate: 9.436E-05 | global batch size: 256 | lm loss: 3.709247E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.764 | TFLOPs: 25.87 | +7: iteration 97180/ 173500 | consumed samples: 24878080 | consumed tokens: 50950307840 | elapsed time per iteration (s): 0.15 | learning rate: 9.434E-05 | global batch size: 256 | lm loss: 3.693280E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.300 | TFLOPs: 26.23 | +7: iteration 97190/ 173500 | consumed samples: 24880640 | consumed tokens: 50955550720 | elapsed time per iteration (s): 0.16 | learning rate: 9.432E-05 | global batch size: 256 | lm loss: 3.687639E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.460 | TFLOPs: 25.65 | +7: iteration 97200/ 173500 | consumed samples: 24883200 | consumed tokens: 50960793600 | elapsed time per iteration (s): 0.16 | learning rate: 9.431E-05 | global batch size: 256 | lm loss: 3.697458E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.263 | TFLOPs: 25.33 | +7: iteration 97210/ 173500 | consumed samples: 24885760 | consumed tokens: 50966036480 | elapsed time per iteration (s): 0.16 | learning rate: 9.429E-05 | global batch size: 256 | lm loss: 3.706835E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.937 | TFLOPs: 25.73 | +7: iteration 97220/ 173500 | consumed samples: 24888320 | consumed tokens: 50971279360 | elapsed time per iteration (s): 0.15 | learning rate: 9.427E-05 | global batch size: 256 | lm loss: 3.690170E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.134 | TFLOPs: 25.97 | +7: iteration 97230/ 173500 | consumed samples: 24890880 | consumed tokens: 50976522240 | elapsed time per iteration (s): 0.15 | learning rate: 9.426E-05 | global batch size: 256 | lm loss: 3.696476E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.424 | TFLOPs: 26.24 | +7: iteration 97240/ 173500 | consumed samples: 24893440 | consumed tokens: 50981765120 | elapsed time per iteration (s): 0.17 | learning rate: 9.424E-05 | global batch size: 256 | lm loss: 3.696610E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.545 | TFLOPs: 23.14 | +7: iteration 97250/ 173500 | consumed samples: 24896000 | consumed tokens: 50987008000 | elapsed time per iteration (s): 0.16 | learning rate: 9.423E-05 | global batch size: 256 | lm loss: 3.692382E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.369 | TFLOPs: 25.82 | +7: iteration 97260/ 173500 | consumed samples: 24898560 | consumed tokens: 50992250880 | elapsed time per iteration (s): 0.15 | learning rate: 9.421E-05 | global batch size: 256 | lm loss: 3.688740E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.083 | TFLOPs: 26.25 | +7: iteration 97270/ 173500 | consumed samples: 24901120 | consumed tokens: 50997493760 | elapsed time per iteration (s): 0.15 | learning rate: 9.419E-05 | global batch size: 256 | lm loss: 3.689705E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.969 | TFLOPs: 26.25 | +7: iteration 97280/ 173500 | consumed samples: 24903680 | consumed tokens: 51002736640 | elapsed time per iteration (s): 0.15 | learning rate: 9.418E-05 | global batch size: 256 | lm loss: 3.700961E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.446 | TFLOPs: 26.26 | +7: iteration 97290/ 173500 | consumed samples: 24906240 | consumed tokens: 51007979520 | elapsed time per iteration (s): 0.15 | learning rate: 9.416E-05 | global batch size: 256 | lm loss: 3.697601E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.494 | TFLOPs: 26.01 | +7: iteration 97300/ 173500 | consumed samples: 24908800 | consumed tokens: 51013222400 | elapsed time per iteration (s): 0.15 | learning rate: 9.415E-05 | global batch size: 256 | lm loss: 3.704902E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.020 | TFLOPs: 26.25 | +7: iteration 97310/ 173500 | consumed samples: 24911360 | consumed tokens: 51018465280 | elapsed time per iteration (s): 0.16 | learning rate: 9.413E-05 | global batch size: 256 | lm loss: 3.692154E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.284 | TFLOPs: 25.83 | +7: iteration 97320/ 173500 | consumed samples: 24913920 | consumed tokens: 51023708160 | elapsed time per iteration (s): 0.15 | learning rate: 9.411E-05 | global batch size: 256 | lm loss: 3.686034E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.736 | TFLOPs: 26.23 | +7: iteration 97330/ 173500 | consumed samples: 24916480 | consumed tokens: 51028951040 | elapsed time per iteration (s): 0.15 | learning rate: 9.410E-05 | global batch size: 256 | lm loss: 3.691423E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.888 | TFLOPs: 26.25 | +7: iteration 97340/ 173500 | consumed samples: 24919040 | consumed tokens: 51034193920 | elapsed time per iteration (s): 0.15 | learning rate: 9.408E-05 | global batch size: 256 | lm loss: 3.687262E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.332 | TFLOPs: 26.23 | +7: iteration 97350/ 173500 | consumed samples: 24921600 | consumed tokens: 51039436800 | elapsed time per iteration (s): 0.15 | learning rate: 9.406E-05 | global batch size: 256 | lm loss: 3.698605E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.242 | TFLOPs: 26.24 | +7: iteration 97360/ 173500 | consumed samples: 24924160 | consumed tokens: 51044679680 | elapsed time per iteration (s): 0.15 | learning rate: 9.405E-05 | global batch size: 256 | lm loss: 3.672868E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.100 | TFLOPs: 26.25 | +7: iteration 97370/ 173500 | consumed samples: 24926720 | consumed tokens: 51049922560 | elapsed time per iteration (s): 0.17 | learning rate: 9.403E-05 | global batch size: 256 | lm loss: 3.705269E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.624 | TFLOPs: 23.17 | +7: iteration 97380/ 173500 | consumed samples: 24929280 | consumed tokens: 51055165440 | elapsed time per iteration (s): 0.15 | learning rate: 9.402E-05 | global batch size: 256 | lm loss: 3.696886E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.558 | TFLOPs: 26.21 | +7: iteration 97390/ 173500 | consumed samples: 24931840 | consumed tokens: 51060408320 | elapsed time per iteration (s): 0.15 | learning rate: 9.400E-05 | global batch size: 256 | lm loss: 3.690118E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.322 | TFLOPs: 26.23 | +7: iteration 97400/ 173500 | consumed samples: 24934400 | consumed tokens: 51065651200 | elapsed time per iteration (s): 0.15 | learning rate: 9.398E-05 | global batch size: 256 | lm loss: 3.694464E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.954 | TFLOPs: 26.19 | +7: iteration 97410/ 173500 | consumed samples: 24936960 | consumed tokens: 51070894080 | elapsed time per iteration (s): 0.15 | learning rate: 9.397E-05 | global batch size: 256 | lm loss: 3.692212E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.377 | TFLOPs: 26.26 | +7: iteration 97420/ 173500 | consumed samples: 24939520 | consumed tokens: 51076136960 | elapsed time per iteration (s): 0.15 | learning rate: 9.395E-05 | global batch size: 256 | lm loss: 3.697761E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.647 | TFLOPs: 26.22 | +7: iteration 97430/ 173500 | consumed samples: 24942080 | consumed tokens: 51081379840 | elapsed time per iteration (s): 0.15 | learning rate: 9.393E-05 | global batch size: 256 | lm loss: 3.692107E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.112 | TFLOPs: 26.27 | +7: iteration 97440/ 173500 | consumed samples: 24944640 | consumed tokens: 51086622720 | elapsed time per iteration (s): 0.15 | learning rate: 9.392E-05 | global batch size: 256 | lm loss: 3.691027E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.122 | TFLOPs: 26.24 | +7: iteration 97450/ 173500 | consumed samples: 24947200 | consumed tokens: 51091865600 | elapsed time per iteration (s): 0.15 | learning rate: 9.390E-05 | global batch size: 256 | lm loss: 3.706527E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.048 | TFLOPs: 26.25 | +7: iteration 97460/ 173500 | consumed samples: 24949760 | consumed tokens: 51097108480 | elapsed time per iteration (s): 0.15 | learning rate: 9.389E-05 | global batch size: 256 | lm loss: 3.701445E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.253 | TFLOPs: 26.27 | +7: iteration 97470/ 173500 | consumed samples: 24952320 | consumed tokens: 51102351360 | elapsed time per iteration (s): 0.15 | learning rate: 9.387E-05 | global batch size: 256 | lm loss: 3.693218E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.005 | TFLOPs: 26.27 | +7: iteration 97480/ 173500 | consumed samples: 24954880 | consumed tokens: 51107594240 | elapsed time per iteration (s): 0.15 | learning rate: 9.385E-05 | global batch size: 256 | lm loss: 3.694209E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.122 | TFLOPs: 26.25 | +7: iteration 97490/ 173500 | consumed samples: 24957440 | consumed tokens: 51112837120 | elapsed time per iteration (s): 0.15 | learning rate: 9.384E-05 | global batch size: 256 | lm loss: 3.696249E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.618 | TFLOPs: 26.26 | +7: iteration 97500/ 173500 | consumed samples: 24960000 | consumed tokens: 51118080000 | elapsed time per iteration (s): 0.17 | learning rate: 9.382E-05 | global batch size: 256 | lm loss: 3.687426E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.513 | TFLOPs: 22.98 | +7: iteration 97510/ 173500 | consumed samples: 24962560 | consumed tokens: 51123322880 | elapsed time per iteration (s): 0.15 | learning rate: 9.381E-05 | global batch size: 256 | lm loss: 3.695989E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.976 | TFLOPs: 26.24 | +7: iteration 97520/ 173500 | consumed samples: 24965120 | consumed tokens: 51128565760 | elapsed time per iteration (s): 0.15 | learning rate: 9.379E-05 | global batch size: 256 | lm loss: 3.682030E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.013 | TFLOPs: 26.27 | +7: iteration 97530/ 173500 | consumed samples: 24967680 | consumed tokens: 51133808640 | elapsed time per iteration (s): 0.15 | learning rate: 9.377E-05 | global batch size: 256 | lm loss: 3.698393E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.575 | TFLOPs: 26.25 | +7: iteration 97540/ 173500 | consumed samples: 24970240 | consumed tokens: 51139051520 | elapsed time per iteration (s): 0.15 | learning rate: 9.376E-05 | global batch size: 256 | lm loss: 3.690681E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.315 | TFLOPs: 26.19 | +7: iteration 97550/ 173500 | consumed samples: 24972800 | consumed tokens: 51144294400 | elapsed time per iteration (s): 0.15 | learning rate: 9.374E-05 | global batch size: 256 | lm loss: 3.681323E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.698 | TFLOPs: 26.23 | +7: iteration 97560/ 173500 | consumed samples: 24975360 | consumed tokens: 51149537280 | elapsed time per iteration (s): 0.17 | learning rate: 9.372E-05 | global batch size: 256 | lm loss: 3.686465E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.106 | TFLOPs: 23.16 | +7: iteration 97570/ 173500 | consumed samples: 24977920 | consumed tokens: 51154780160 | elapsed time per iteration (s): 0.15 | learning rate: 9.371E-05 | global batch size: 256 | lm loss: 3.690093E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.810 | TFLOPs: 26.25 | +7: iteration 97580/ 173500 | consumed samples: 24980480 | consumed tokens: 51160023040 | elapsed time per iteration (s): 0.15 | learning rate: 9.369E-05 | global batch size: 256 | lm loss: 3.695791E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.413 | TFLOPs: 26.23 | +7: iteration 97590/ 173500 | consumed samples: 24983040 | consumed tokens: 51165265920 | elapsed time per iteration (s): 0.15 | learning rate: 9.368E-05 | global batch size: 256 | lm loss: 3.684420E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.720 | TFLOPs: 26.25 | +7: iteration 97600/ 173500 | consumed samples: 24985600 | consumed tokens: 51170508800 | elapsed time per iteration (s): 0.15 | learning rate: 9.366E-05 | global batch size: 256 | lm loss: 3.697213E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.720 | TFLOPs: 26.22 | +7: iteration 97610/ 173500 | consumed samples: 24988160 | consumed tokens: 51175751680 | elapsed time per iteration (s): 0.15 | learning rate: 9.364E-05 | global batch size: 256 | lm loss: 3.704126E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.828 | TFLOPs: 26.20 | +7: iteration 97620/ 173500 | consumed samples: 24990720 | consumed tokens: 51180994560 | elapsed time per iteration (s): 0.15 | learning rate: 9.363E-05 | global batch size: 256 | lm loss: 3.695355E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.564 | TFLOPs: 26.14 | +7: iteration 97630/ 173500 | consumed samples: 24993280 | consumed tokens: 51186237440 | elapsed time per iteration (s): 0.17 | learning rate: 9.361E-05 | global batch size: 256 | lm loss: 3.698515E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.480 | TFLOPs: 23.11 | +7: iteration 97640/ 173500 | consumed samples: 24995840 | consumed tokens: 51191480320 | elapsed time per iteration (s): 0.15 | learning rate: 9.359E-05 | global batch size: 256 | lm loss: 3.703314E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.610 | TFLOPs: 26.15 | +7: iteration 97650/ 173500 | consumed samples: 24998400 | consumed tokens: 51196723200 | elapsed time per iteration (s): 0.15 | learning rate: 9.358E-05 | global batch size: 256 | lm loss: 3.703441E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.752 | TFLOPs: 26.14 | +7: iteration 97660/ 173500 | consumed samples: 25000960 | consumed tokens: 51201966080 | elapsed time per iteration (s): 0.15 | learning rate: 9.356E-05 | global batch size: 256 | lm loss: 3.693542E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.107 | TFLOPs: 26.13 | +7: iteration 97670/ 173500 | consumed samples: 25003520 | consumed tokens: 51207208960 | elapsed time per iteration (s): 0.15 | learning rate: 9.355E-05 | global batch size: 256 | lm loss: 3.690676E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.515 | TFLOPs: 26.10 | +7: iteration 97680/ 173500 | consumed samples: 25006080 | consumed tokens: 51212451840 | elapsed time per iteration (s): 0.15 | learning rate: 9.353E-05 | global batch size: 256 | lm loss: 3.682602E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.677 | TFLOPs: 26.11 | +7: iteration 97690/ 173500 | consumed samples: 25008640 | consumed tokens: 51217694720 | elapsed time per iteration (s): 0.16 | learning rate: 9.351E-05 | global batch size: 256 | lm loss: 3.707919E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.471 | TFLOPs: 25.51 | +7: iteration 97700/ 173500 | consumed samples: 25011200 | consumed tokens: 51222937600 | elapsed time per iteration (s): 0.15 | learning rate: 9.350E-05 | global batch size: 256 | lm loss: 3.694184E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.915 | TFLOPs: 26.27 | +7: iteration 97710/ 173500 | consumed samples: 25013760 | consumed tokens: 51228180480 | elapsed time per iteration (s): 0.15 | learning rate: 9.348E-05 | global batch size: 256 | lm loss: 3.693448E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.284 | TFLOPs: 26.27 | +7: iteration 97720/ 173500 | consumed samples: 25016320 | consumed tokens: 51233423360 | elapsed time per iteration (s): 0.15 | learning rate: 9.347E-05 | global batch size: 256 | lm loss: 3.701686E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.365 | TFLOPs: 26.27 | +7: iteration 97730/ 173500 | consumed samples: 25018880 | consumed tokens: 51238666240 | elapsed time per iteration (s): 0.15 | learning rate: 9.345E-05 | global batch size: 256 | lm loss: 3.705412E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.424 | TFLOPs: 26.31 | +7: iteration 97740/ 173500 | consumed samples: 25021440 | consumed tokens: 51243909120 | elapsed time per iteration (s): 0.16 | learning rate: 9.343E-05 | global batch size: 256 | lm loss: 3.702143E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.839 | TFLOPs: 25.28 | +7: iteration 97750/ 173500 | consumed samples: 25024000 | consumed tokens: 51249152000 | elapsed time per iteration (s): 0.15 | learning rate: 9.342E-05 | global batch size: 256 | lm loss: 3.696901E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.114 | TFLOPs: 26.38 | +7: iteration 97760/ 173500 | consumed samples: 25026560 | consumed tokens: 51254394880 | elapsed time per iteration (s): 0.15 | learning rate: 9.340E-05 | global batch size: 256 | lm loss: 3.690317E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.978 | TFLOPs: 25.95 | +7: iteration 97770/ 173500 | consumed samples: 25029120 | consumed tokens: 51259637760 | elapsed time per iteration (s): 0.15 | learning rate: 9.338E-05 | global batch size: 256 | lm loss: 3.692187E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.966 | TFLOPs: 26.38 | +7: iteration 97780/ 173500 | consumed samples: 25031680 | consumed tokens: 51264880640 | elapsed time per iteration (s): 0.16 | learning rate: 9.337E-05 | global batch size: 256 | lm loss: 3.701775E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.387 | TFLOPs: 25.90 | +7: iteration 97790/ 173500 | consumed samples: 25034240 | consumed tokens: 51270123520 | elapsed time per iteration (s): 0.16 | learning rate: 9.335E-05 | global batch size: 256 | lm loss: 3.689352E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.531 | TFLOPs: 25.38 | +7: iteration 97800/ 173500 | consumed samples: 25036800 | consumed tokens: 51275366400 | elapsed time per iteration (s): 0.15 | learning rate: 9.334E-05 | global batch size: 256 | lm loss: 3.696066E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.082 | TFLOPs: 26.38 | +7: iteration 97810/ 173500 | consumed samples: 25039360 | consumed tokens: 51280609280 | elapsed time per iteration (s): 0.15 | learning rate: 9.332E-05 | global batch size: 256 | lm loss: 3.693048E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.190 | TFLOPs: 26.37 | +7: iteration 97820/ 173500 | consumed samples: 25041920 | consumed tokens: 51285852160 | elapsed time per iteration (s): 0.15 | learning rate: 9.330E-05 | global batch size: 256 | lm loss: 3.683490E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.211 | TFLOPs: 26.37 | +7: iteration 97830/ 173500 | consumed samples: 25044480 | consumed tokens: 51291095040 | elapsed time per iteration (s): 0.15 | learning rate: 9.329E-05 | global batch size: 256 | lm loss: 3.684456E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.291 | TFLOPs: 26.35 | +7: iteration 97840/ 173500 | consumed samples: 25047040 | consumed tokens: 51296337920 | elapsed time per iteration (s): 0.15 | learning rate: 9.327E-05 | global batch size: 256 | lm loss: 3.697924E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.841 | TFLOPs: 26.36 | +7: iteration 97850/ 173500 | consumed samples: 25049600 | consumed tokens: 51301580800 | elapsed time per iteration (s): 0.15 | learning rate: 9.325E-05 | global batch size: 256 | lm loss: 3.703236E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.266 | TFLOPs: 26.15 | +7: iteration 97860/ 173500 | consumed samples: 25052160 | consumed tokens: 51306823680 | elapsed time per iteration (s): 0.15 | learning rate: 9.324E-05 | global batch size: 256 | lm loss: 3.695835E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.712 | TFLOPs: 26.28 | +7: iteration 97870/ 173500 | consumed samples: 25054720 | consumed tokens: 51312066560 | elapsed time per iteration (s): 0.15 | learning rate: 9.322E-05 | global batch size: 256 | lm loss: 3.685863E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.624 | TFLOPs: 26.34 | +7: iteration 97880/ 173500 | consumed samples: 25057280 | consumed tokens: 51317309440 | elapsed time per iteration (s): 0.17 | learning rate: 9.321E-05 | global batch size: 256 | lm loss: 3.707317E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1481.241 | TFLOPs: 23.23 | +7: iteration 97890/ 173500 | consumed samples: 25059840 | consumed tokens: 51322552320 | elapsed time per iteration (s): 0.15 | learning rate: 9.319E-05 | global batch size: 256 | lm loss: 3.694412E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.169 | TFLOPs: 26.32 | +7: iteration 97900/ 173500 | consumed samples: 25062400 | consumed tokens: 51327795200 | elapsed time per iteration (s): 0.15 | learning rate: 9.317E-05 | global batch size: 256 | lm loss: 3.680565E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.121 | TFLOPs: 26.35 | +7: iteration 97910/ 173500 | consumed samples: 25064960 | consumed tokens: 51333038080 | elapsed time per iteration (s): 0.15 | learning rate: 9.316E-05 | global batch size: 256 | lm loss: 3.696269E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.353 | TFLOPs: 26.37 | +7: iteration 97920/ 173500 | consumed samples: 25067520 | consumed tokens: 51338280960 | elapsed time per iteration (s): 0.15 | learning rate: 9.314E-05 | global batch size: 256 | lm loss: 3.690447E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.251 | TFLOPs: 26.37 | +7: iteration 97930/ 173500 | consumed samples: 25070080 | consumed tokens: 51343523840 | elapsed time per iteration (s): 0.15 | learning rate: 9.313E-05 | global batch size: 256 | lm loss: 3.694981E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.193 | TFLOPs: 26.37 | +7: iteration 97940/ 173500 | consumed samples: 25072640 | consumed tokens: 51348766720 | elapsed time per iteration (s): 0.16 | learning rate: 9.311E-05 | global batch size: 256 | lm loss: 3.688877E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.118 | TFLOPs: 25.89 | +7: iteration 97950/ 173500 | consumed samples: 25075200 | consumed tokens: 51354009600 | elapsed time per iteration (s): 0.15 | learning rate: 9.309E-05 | global batch size: 256 | lm loss: 3.691724E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.990 | TFLOPs: 26.33 | +7: iteration 97960/ 173500 | consumed samples: 25077760 | consumed tokens: 51359252480 | elapsed time per iteration (s): 0.15 | learning rate: 9.308E-05 | global batch size: 256 | lm loss: 3.692723E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.895 | TFLOPs: 26.34 | +7: iteration 97970/ 173500 | consumed samples: 25080320 | consumed tokens: 51364495360 | elapsed time per iteration (s): 0.15 | learning rate: 9.306E-05 | global batch size: 256 | lm loss: 3.686969E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.682 | TFLOPs: 26.34 | +7: iteration 97980/ 173500 | consumed samples: 25082880 | consumed tokens: 51369738240 | elapsed time per iteration (s): 0.15 | learning rate: 9.304E-05 | global batch size: 256 | lm loss: 3.696715E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.846 | TFLOPs: 26.34 | +7: iteration 97990/ 173500 | consumed samples: 25085440 | consumed tokens: 51374981120 | elapsed time per iteration (s): 0.15 | learning rate: 9.303E-05 | global batch size: 256 | lm loss: 3.696213E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.913 | TFLOPs: 26.35 | +0: [2023-03-17 04:30:08,084] [INFO] [logging.py:68:log_dist] [Rank 0] step=98000, skipped=0, lr=[9.301234885879047e-05, 9.301234885879047e-05, 9.301234885879047e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 98000/ 173500 | consumed samples: 25088000 | consumed tokens: 51380224000 | elapsed time per iteration (s): 0.15 | learning rate: 9.301E-05 | global batch size: 256 | lm loss: 3.691766E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.598 | TFLOPs: 26.34 | +0: steps: 98000 loss: 3.7043 iter time (s): 0.155 samples/sec: 1649.789 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 98000 | lm loss value: 3.821879E+00 | lm loss PPL: 4.569000E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 98000 to checkpoints_44m91b100m +0: [2023-03-17 04:30:08,157] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step98000 is begin to save! +0: [2023-03-17 04:30:08,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:30:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:30:08,229] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:30:08,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:30:08,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:30:08,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:30:08,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:30:08,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:30:08,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:30:08,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:30:08,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:30:08,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:30:08,278] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:30:08,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:30:08,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:30:08,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:30:08,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:30:08,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:30:08,303] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:30:08,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:30:08,305] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step98000/mp_rank_00_model_states.pt +0: [2023-03-17 04:30:08,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:30:08,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:30:08,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:30:08,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,336] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,338] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,339] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,341] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,341] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,343] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +6: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,344] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +4: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +1: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:30:08,345] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +2: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:30:08,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +7: [2023-03-17 04:30:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +5: [2023-03-17 04:30:08,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +3: [2023-03-17 04:30:08,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step98000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:30:08,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step98000 is ready now! +0: successfully saved checkpoint at iteration 98000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 192.53 +7: iteration 98010/ 173500 | consumed samples: 25090560 | consumed tokens: 51385466880 | elapsed time per iteration (s): 0.20 | learning rate: 9.300E-05 | global batch size: 256 | lm loss: 3.677183E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1287.919 | TFLOPs: 20.20 | +7: iteration 98020/ 173500 | consumed samples: 25093120 | consumed tokens: 51390709760 | elapsed time per iteration (s): 0.15 | learning rate: 9.298E-05 | global batch size: 256 | lm loss: 3.704305E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.461 | TFLOPs: 26.34 | +7: iteration 98030/ 173500 | consumed samples: 25095680 | consumed tokens: 51395952640 | elapsed time per iteration (s): 0.16 | learning rate: 9.296E-05 | global batch size: 256 | lm loss: 3.697739E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.946 | TFLOPs: 25.72 | +7: iteration 98040/ 173500 | consumed samples: 25098240 | consumed tokens: 51401195520 | elapsed time per iteration (s): 0.15 | learning rate: 9.295E-05 | global batch size: 256 | lm loss: 3.697328E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.073 | TFLOPs: 26.30 | +7: iteration 98050/ 173500 | consumed samples: 25100800 | consumed tokens: 51406438400 | elapsed time per iteration (s): 0.15 | learning rate: 9.293E-05 | global batch size: 256 | lm loss: 3.699067E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.131 | TFLOPs: 26.36 | +7: iteration 98060/ 173500 | consumed samples: 25103360 | consumed tokens: 51411681280 | elapsed time per iteration (s): 0.16 | learning rate: 9.292E-05 | global batch size: 256 | lm loss: 3.697728E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.267 | TFLOPs: 25.65 | +7: iteration 98070/ 173500 | consumed samples: 25105920 | consumed tokens: 51416924160 | elapsed time per iteration (s): 0.15 | learning rate: 9.290E-05 | global batch size: 256 | lm loss: 3.706366E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.363 | TFLOPs: 26.35 | +7: iteration 98080/ 173500 | consumed samples: 25108480 | consumed tokens: 51422167040 | elapsed time per iteration (s): 0.15 | learning rate: 9.288E-05 | global batch size: 256 | lm loss: 3.687399E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.205 | TFLOPs: 26.33 | +7: iteration 98090/ 173500 | consumed samples: 25111040 | consumed tokens: 51427409920 | elapsed time per iteration (s): 0.15 | learning rate: 9.287E-05 | global batch size: 256 | lm loss: 3.691345E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.766 | TFLOPs: 26.09 | +7: iteration 98100/ 173500 | consumed samples: 25113600 | consumed tokens: 51432652800 | elapsed time per iteration (s): 0.16 | learning rate: 9.285E-05 | global batch size: 256 | lm loss: 3.690316E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.977 | TFLOPs: 25.86 | +7: iteration 98110/ 173500 | consumed samples: 25116160 | consumed tokens: 51437895680 | elapsed time per iteration (s): 0.15 | learning rate: 9.283E-05 | global batch size: 256 | lm loss: 3.683803E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.461 | TFLOPs: 26.31 | +7: iteration 98120/ 173500 | consumed samples: 25118720 | consumed tokens: 51443138560 | elapsed time per iteration (s): 0.15 | learning rate: 9.282E-05 | global batch size: 256 | lm loss: 3.696691E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.354 | TFLOPs: 26.02 | +7: iteration 98130/ 173500 | consumed samples: 25121280 | consumed tokens: 51448381440 | elapsed time per iteration (s): 0.16 | learning rate: 9.280E-05 | global batch size: 256 | lm loss: 3.685902E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.469 | TFLOPs: 25.65 | +7: iteration 98140/ 173500 | consumed samples: 25123840 | consumed tokens: 51453624320 | elapsed time per iteration (s): 0.17 | learning rate: 9.279E-05 | global batch size: 256 | lm loss: 3.685654E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.865 | TFLOPs: 23.16 | +7: iteration 98150/ 173500 | consumed samples: 25126400 | consumed tokens: 51458867200 | elapsed time per iteration (s): 0.15 | learning rate: 9.277E-05 | global batch size: 256 | lm loss: 3.696902E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.725 | TFLOPs: 26.28 | +7: iteration 98160/ 173500 | consumed samples: 25128960 | consumed tokens: 51464110080 | elapsed time per iteration (s): 0.15 | learning rate: 9.275E-05 | global batch size: 256 | lm loss: 3.697874E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.514 | TFLOPs: 26.28 | +7: iteration 98170/ 173500 | consumed samples: 25131520 | consumed tokens: 51469352960 | elapsed time per iteration (s): 0.15 | learning rate: 9.274E-05 | global batch size: 256 | lm loss: 3.688100E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.535 | TFLOPs: 26.28 | +7: iteration 98180/ 173500 | consumed samples: 25134080 | consumed tokens: 51474595840 | elapsed time per iteration (s): 0.15 | learning rate: 9.272E-05 | global batch size: 256 | lm loss: 3.690703E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.612 | TFLOPs: 26.26 | +7: iteration 98190/ 173500 | consumed samples: 25136640 | consumed tokens: 51479838720 | elapsed time per iteration (s): 0.15 | learning rate: 9.271E-05 | global batch size: 256 | lm loss: 3.688309E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.497 | TFLOPs: 26.24 | +7: iteration 98200/ 173500 | consumed samples: 25139200 | consumed tokens: 51485081600 | elapsed time per iteration (s): 0.16 | learning rate: 9.269E-05 | global batch size: 256 | lm loss: 3.693439E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.048 | TFLOPs: 25.74 | +7: iteration 98210/ 173500 | consumed samples: 25141760 | consumed tokens: 51490324480 | elapsed time per iteration (s): 0.16 | learning rate: 9.267E-05 | global batch size: 256 | lm loss: 3.692432E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.403 | TFLOPs: 24.69 | +7: iteration 98220/ 173500 | consumed samples: 25144320 | consumed tokens: 51495567360 | elapsed time per iteration (s): 0.15 | learning rate: 9.266E-05 | global batch size: 256 | lm loss: 3.693315E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.737 | TFLOPs: 26.31 | +7: iteration 98230/ 173500 | consumed samples: 25146880 | consumed tokens: 51500810240 | elapsed time per iteration (s): 0.15 | learning rate: 9.264E-05 | global batch size: 256 | lm loss: 3.695548E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.216 | TFLOPs: 26.26 | +7: iteration 98240/ 173500 | consumed samples: 25149440 | consumed tokens: 51506053120 | elapsed time per iteration (s): 0.15 | learning rate: 9.262E-05 | global batch size: 256 | lm loss: 3.685241E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.110 | TFLOPs: 26.22 | +7: iteration 98250/ 173500 | consumed samples: 25152000 | consumed tokens: 51511296000 | elapsed time per iteration (s): 0.15 | learning rate: 9.261E-05 | global batch size: 256 | lm loss: 3.696684E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.116 | TFLOPs: 26.19 | +7: iteration 98260/ 173500 | consumed samples: 25154560 | consumed tokens: 51516538880 | elapsed time per iteration (s): 0.15 | learning rate: 9.259E-05 | global batch size: 256 | lm loss: 3.683394E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.920 | TFLOPs: 25.92 | +7: iteration 98270/ 173500 | consumed samples: 25157120 | consumed tokens: 51521781760 | elapsed time per iteration (s): 0.15 | learning rate: 9.258E-05 | global batch size: 256 | lm loss: 3.698131E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.410 | TFLOPs: 26.12 | +7: iteration 98280/ 173500 | consumed samples: 25159680 | consumed tokens: 51527024640 | elapsed time per iteration (s): 0.15 | learning rate: 9.256E-05 | global batch size: 256 | lm loss: 3.699388E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.454 | TFLOPs: 26.12 | +7: iteration 98290/ 173500 | consumed samples: 25162240 | consumed tokens: 51532267520 | elapsed time per iteration (s): 0.15 | learning rate: 9.254E-05 | global batch size: 256 | lm loss: 3.690535E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.287 | TFLOPs: 26.12 | +7: iteration 98300/ 173500 | consumed samples: 25164800 | consumed tokens: 51537510400 | elapsed time per iteration (s): 0.15 | learning rate: 9.253E-05 | global batch size: 256 | lm loss: 3.693100E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.040 | TFLOPs: 26.00 | +7: iteration 98310/ 173500 | consumed samples: 25167360 | consumed tokens: 51542753280 | elapsed time per iteration (s): 0.15 | learning rate: 9.251E-05 | global batch size: 256 | lm loss: 3.696228E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.050 | TFLOPs: 25.99 | +7: iteration 98320/ 173500 | consumed samples: 25169920 | consumed tokens: 51547996160 | elapsed time per iteration (s): 0.16 | learning rate: 9.250E-05 | global batch size: 256 | lm loss: 3.691759E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.994 | TFLOPs: 25.52 | +7: iteration 98330/ 173500 | consumed samples: 25172480 | consumed tokens: 51553239040 | elapsed time per iteration (s): 0.15 | learning rate: 9.248E-05 | global batch size: 256 | lm loss: 3.685622E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.293 | TFLOPs: 26.12 | +7: iteration 98340/ 173500 | consumed samples: 25175040 | consumed tokens: 51558481920 | elapsed time per iteration (s): 0.15 | learning rate: 9.246E-05 | global batch size: 256 | lm loss: 3.692210E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.285 | TFLOPs: 26.13 | +7: iteration 98350/ 173500 | consumed samples: 25177600 | consumed tokens: 51563724800 | elapsed time per iteration (s): 0.15 | learning rate: 9.245E-05 | global batch size: 256 | lm loss: 3.693753E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.036 | TFLOPs: 26.11 | +7: iteration 98360/ 173500 | consumed samples: 25180160 | consumed tokens: 51568967680 | elapsed time per iteration (s): 0.15 | learning rate: 9.243E-05 | global batch size: 256 | lm loss: 3.702775E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.432 | TFLOPs: 26.12 | +7: iteration 98370/ 173500 | consumed samples: 25182720 | consumed tokens: 51574210560 | elapsed time per iteration (s): 0.15 | learning rate: 9.241E-05 | global batch size: 256 | lm loss: 3.687369E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.749 | TFLOPs: 26.12 | +7: iteration 98380/ 173500 | consumed samples: 25185280 | consumed tokens: 51579453440 | elapsed time per iteration (s): 0.15 | learning rate: 9.240E-05 | global batch size: 256 | lm loss: 3.684672E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.061 | TFLOPs: 26.11 | +7: iteration 98390/ 173500 | consumed samples: 25187840 | consumed tokens: 51584696320 | elapsed time per iteration (s): 0.15 | learning rate: 9.238E-05 | global batch size: 256 | lm loss: 3.694234E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.651 | TFLOPs: 26.12 | +7: iteration 98400/ 173500 | consumed samples: 25190400 | consumed tokens: 51589939200 | elapsed time per iteration (s): 0.15 | learning rate: 9.237E-05 | global batch size: 256 | lm loss: 3.696132E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.859 | TFLOPs: 26.11 | +7: iteration 98410/ 173500 | consumed samples: 25192960 | consumed tokens: 51595182080 | elapsed time per iteration (s): 0.15 | learning rate: 9.235E-05 | global batch size: 256 | lm loss: 3.691887E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.142 | TFLOPs: 26.11 | +7: iteration 98420/ 173500 | consumed samples: 25195520 | consumed tokens: 51600424960 | elapsed time per iteration (s): 0.16 | learning rate: 9.233E-05 | global batch size: 256 | lm loss: 3.696045E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.377 | TFLOPs: 25.58 | +7: iteration 98430/ 173500 | consumed samples: 25198080 | consumed tokens: 51605667840 | elapsed time per iteration (s): 0.15 | learning rate: 9.232E-05 | global batch size: 256 | lm loss: 3.698802E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.685 | TFLOPs: 26.11 | +7: iteration 98440/ 173500 | consumed samples: 25200640 | consumed tokens: 51610910720 | elapsed time per iteration (s): 0.15 | learning rate: 9.230E-05 | global batch size: 256 | lm loss: 3.685397E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.911 | TFLOPs: 25.92 | +7: iteration 98450/ 173500 | consumed samples: 25203200 | consumed tokens: 51616153600 | elapsed time per iteration (s): 0.15 | learning rate: 9.229E-05 | global batch size: 256 | lm loss: 3.681165E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.549 | TFLOPs: 26.14 | +7: iteration 98460/ 173500 | consumed samples: 25205760 | consumed tokens: 51621396480 | elapsed time per iteration (s): 0.15 | learning rate: 9.227E-05 | global batch size: 256 | lm loss: 3.701030E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.430 | TFLOPs: 26.10 | +7: iteration 98470/ 173500 | consumed samples: 25208320 | consumed tokens: 51626639360 | elapsed time per iteration (s): 0.15 | learning rate: 9.225E-05 | global batch size: 256 | lm loss: 3.713462E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.268 | TFLOPs: 26.13 | +7: iteration 98480/ 173500 | consumed samples: 25210880 | consumed tokens: 51631882240 | elapsed time per iteration (s): 0.16 | learning rate: 9.224E-05 | global batch size: 256 | lm loss: 3.696946E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.040 | TFLOPs: 25.44 | +7: iteration 98490/ 173500 | consumed samples: 25213440 | consumed tokens: 51637125120 | elapsed time per iteration (s): 0.15 | learning rate: 9.222E-05 | global batch size: 256 | lm loss: 3.692556E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.405 | TFLOPs: 26.15 | +7: iteration 98500/ 173500 | consumed samples: 25216000 | consumed tokens: 51642368000 | elapsed time per iteration (s): 0.16 | learning rate: 9.220E-05 | global batch size: 256 | lm loss: 3.688917E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.141 | TFLOPs: 25.89 | +7: iteration 98510/ 173500 | consumed samples: 25218560 | consumed tokens: 51647610880 | elapsed time per iteration (s): 0.15 | learning rate: 9.219E-05 | global batch size: 256 | lm loss: 3.679953E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.759 | TFLOPs: 26.12 | +7: iteration 98520/ 173500 | consumed samples: 25221120 | consumed tokens: 51652853760 | elapsed time per iteration (s): 0.16 | learning rate: 9.217E-05 | global batch size: 256 | lm loss: 3.687539E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.958 | TFLOPs: 25.33 | +7: iteration 98530/ 173500 | consumed samples: 25223680 | consumed tokens: 51658096640 | elapsed time per iteration (s): 0.15 | learning rate: 9.216E-05 | global batch size: 256 | lm loss: 3.691923E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.426 | TFLOPs: 26.09 | +7: iteration 98540/ 173500 | consumed samples: 25226240 | consumed tokens: 51663339520 | elapsed time per iteration (s): 0.15 | learning rate: 9.214E-05 | global batch size: 256 | lm loss: 3.702138E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.006 | TFLOPs: 26.11 | +7: iteration 98550/ 173500 | consumed samples: 25228800 | consumed tokens: 51668582400 | elapsed time per iteration (s): 0.15 | learning rate: 9.212E-05 | global batch size: 256 | lm loss: 3.693069E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.552 | TFLOPs: 26.17 | +7: iteration 98560/ 173500 | consumed samples: 25231360 | consumed tokens: 51673825280 | elapsed time per iteration (s): 0.15 | learning rate: 9.211E-05 | global batch size: 256 | lm loss: 3.696415E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.996 | TFLOPs: 26.16 | +7: iteration 98570/ 173500 | consumed samples: 25233920 | consumed tokens: 51679068160 | elapsed time per iteration (s): 0.16 | learning rate: 9.209E-05 | global batch size: 256 | lm loss: 3.691138E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.663 | TFLOPs: 25.67 | +7: iteration 98580/ 173500 | consumed samples: 25236480 | consumed tokens: 51684311040 | elapsed time per iteration (s): 0.15 | learning rate: 9.208E-05 | global batch size: 256 | lm loss: 3.696549E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.742 | TFLOPs: 26.15 | +7: iteration 98590/ 173500 | consumed samples: 25239040 | consumed tokens: 51689553920 | elapsed time per iteration (s): 0.16 | learning rate: 9.206E-05 | global batch size: 256 | lm loss: 3.691611E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.686 | TFLOPs: 25.67 | +7: iteration 98600/ 173500 | consumed samples: 25241600 | consumed tokens: 51694796800 | elapsed time per iteration (s): 0.16 | learning rate: 9.204E-05 | global batch size: 256 | lm loss: 3.702886E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.591 | TFLOPs: 25.67 | +7: iteration 98610/ 173500 | consumed samples: 25244160 | consumed tokens: 51700039680 | elapsed time per iteration (s): 0.16 | learning rate: 9.203E-05 | global batch size: 256 | lm loss: 3.691103E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.930 | TFLOPs: 25.86 | +7: iteration 98620/ 173500 | consumed samples: 25246720 | consumed tokens: 51705282560 | elapsed time per iteration (s): 0.15 | learning rate: 9.201E-05 | global batch size: 256 | lm loss: 3.690581E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.126 | TFLOPs: 26.18 | +7: iteration 98630/ 173500 | consumed samples: 25249280 | consumed tokens: 51710525440 | elapsed time per iteration (s): 0.16 | learning rate: 9.200E-05 | global batch size: 256 | lm loss: 3.712751E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.418 | TFLOPs: 25.84 | +7: iteration 98640/ 173500 | consumed samples: 25251840 | consumed tokens: 51715768320 | elapsed time per iteration (s): 0.15 | learning rate: 9.198E-05 | global batch size: 256 | lm loss: 3.694640E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.798 | TFLOPs: 26.20 | +7: iteration 98650/ 173500 | consumed samples: 25254400 | consumed tokens: 51721011200 | elapsed time per iteration (s): 0.17 | learning rate: 9.196E-05 | global batch size: 256 | lm loss: 3.692641E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.887 | TFLOPs: 23.00 | +7: iteration 98660/ 173500 | consumed samples: 25256960 | consumed tokens: 51726254080 | elapsed time per iteration (s): 0.15 | learning rate: 9.195E-05 | global batch size: 256 | lm loss: 3.693931E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.134 | TFLOPs: 26.18 | +7: iteration 98670/ 173500 | consumed samples: 25259520 | consumed tokens: 51731496960 | elapsed time per iteration (s): 0.15 | learning rate: 9.193E-05 | global batch size: 256 | lm loss: 3.687745E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.181 | TFLOPs: 26.18 | +7: iteration 98680/ 173500 | consumed samples: 25262080 | consumed tokens: 51736739840 | elapsed time per iteration (s): 0.16 | learning rate: 9.191E-05 | global batch size: 256 | lm loss: 3.710586E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.204 | TFLOPs: 25.75 | +7: iteration 98690/ 173500 | consumed samples: 25264640 | consumed tokens: 51741982720 | elapsed time per iteration (s): 0.15 | learning rate: 9.190E-05 | global batch size: 256 | lm loss: 3.683994E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.544 | TFLOPs: 26.04 | +7: iteration 98700/ 173500 | consumed samples: 25267200 | consumed tokens: 51747225600 | elapsed time per iteration (s): 0.15 | learning rate: 9.188E-05 | global batch size: 256 | lm loss: 3.688490E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.014 | TFLOPs: 26.27 | +7: iteration 98710/ 173500 | consumed samples: 25269760 | consumed tokens: 51752468480 | elapsed time per iteration (s): 0.16 | learning rate: 9.187E-05 | global batch size: 256 | lm loss: 3.681735E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.263 | TFLOPs: 25.88 | +7: iteration 98720/ 173500 | consumed samples: 25272320 | consumed tokens: 51757711360 | elapsed time per iteration (s): 0.15 | learning rate: 9.185E-05 | global batch size: 256 | lm loss: 3.697466E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.851 | TFLOPs: 26.28 | +7: iteration 98730/ 173500 | consumed samples: 25274880 | consumed tokens: 51762954240 | elapsed time per iteration (s): 0.15 | learning rate: 9.183E-05 | global batch size: 256 | lm loss: 3.705377E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.438 | TFLOPs: 25.91 | +7: iteration 98740/ 173500 | consumed samples: 25277440 | consumed tokens: 51768197120 | elapsed time per iteration (s): 0.15 | learning rate: 9.182E-05 | global batch size: 256 | lm loss: 3.692592E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.201 | TFLOPs: 26.37 | +7: iteration 98750/ 173500 | consumed samples: 25280000 | consumed tokens: 51773440000 | elapsed time per iteration (s): 0.15 | learning rate: 9.180E-05 | global batch size: 256 | lm loss: 3.695513E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.120 | TFLOPs: 26.35 | +7: iteration 98760/ 173500 | consumed samples: 25282560 | consumed tokens: 51778682880 | elapsed time per iteration (s): 0.15 | learning rate: 9.179E-05 | global batch size: 256 | lm loss: 3.711008E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.047 | TFLOPs: 26.38 | +7: iteration 98770/ 173500 | consumed samples: 25285120 | consumed tokens: 51783925760 | elapsed time per iteration (s): 0.15 | learning rate: 9.177E-05 | global batch size: 256 | lm loss: 3.690196E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.132 | TFLOPs: 26.36 | +7: iteration 98780/ 173500 | consumed samples: 25287680 | consumed tokens: 51789168640 | elapsed time per iteration (s): 0.16 | learning rate: 9.175E-05 | global batch size: 256 | lm loss: 3.697435E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.090 | TFLOPs: 25.89 | +7: iteration 98790/ 173500 | consumed samples: 25290240 | consumed tokens: 51794411520 | elapsed time per iteration (s): 0.15 | learning rate: 9.174E-05 | global batch size: 256 | lm loss: 3.701673E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.124 | TFLOPs: 26.36 | +7: iteration 98800/ 173500 | consumed samples: 25292800 | consumed tokens: 51799654400 | elapsed time per iteration (s): 0.15 | learning rate: 9.172E-05 | global batch size: 256 | lm loss: 3.700924E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.153 | TFLOPs: 26.36 | +7: iteration 98810/ 173500 | consumed samples: 25295360 | consumed tokens: 51804897280 | elapsed time per iteration (s): 0.16 | learning rate: 9.170E-05 | global batch size: 256 | lm loss: 3.681663E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.245 | TFLOPs: 25.85 | +7: iteration 98820/ 173500 | consumed samples: 25297920 | consumed tokens: 51810140160 | elapsed time per iteration (s): 0.15 | learning rate: 9.169E-05 | global batch size: 256 | lm loss: 3.694395E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.420 | TFLOPs: 26.35 | +7: iteration 98830/ 173500 | consumed samples: 25300480 | consumed tokens: 51815383040 | elapsed time per iteration (s): 0.15 | learning rate: 9.167E-05 | global batch size: 256 | lm loss: 3.698090E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.015 | TFLOPs: 26.21 | +7: iteration 98840/ 173500 | consumed samples: 25303040 | consumed tokens: 51820625920 | elapsed time per iteration (s): 0.17 | learning rate: 9.166E-05 | global batch size: 256 | lm loss: 3.708075E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.468 | TFLOPs: 24.10 | +7: iteration 98850/ 173500 | consumed samples: 25305600 | consumed tokens: 51825868800 | elapsed time per iteration (s): 0.17 | learning rate: 9.164E-05 | global batch size: 256 | lm loss: 3.704750E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.212 | TFLOPs: 23.98 | +7: iteration 98860/ 173500 | consumed samples: 25308160 | consumed tokens: 51831111680 | elapsed time per iteration (s): 0.15 | learning rate: 9.162E-05 | global batch size: 256 | lm loss: 3.689073E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.910 | TFLOPs: 26.22 | +7: iteration 98870/ 173500 | consumed samples: 25310720 | consumed tokens: 51836354560 | elapsed time per iteration (s): 0.15 | learning rate: 9.161E-05 | global batch size: 256 | lm loss: 3.689547E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.915 | TFLOPs: 26.33 | +7: iteration 98880/ 173500 | consumed samples: 25313280 | consumed tokens: 51841597440 | elapsed time per iteration (s): 0.15 | learning rate: 9.159E-05 | global batch size: 256 | lm loss: 3.683710E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.993 | TFLOPs: 25.94 | +7: iteration 98890/ 173500 | consumed samples: 25315840 | consumed tokens: 51846840320 | elapsed time per iteration (s): 0.15 | learning rate: 9.158E-05 | global batch size: 256 | lm loss: 3.701784E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.116 | TFLOPs: 26.36 | +7: iteration 98900/ 173500 | consumed samples: 25318400 | consumed tokens: 51852083200 | elapsed time per iteration (s): 0.16 | learning rate: 9.156E-05 | global batch size: 256 | lm loss: 3.689403E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.795 | TFLOPs: 25.89 | +7: iteration 98910/ 173500 | consumed samples: 25320960 | consumed tokens: 51857326080 | elapsed time per iteration (s): 0.15 | learning rate: 9.154E-05 | global batch size: 256 | lm loss: 3.685526E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.330 | TFLOPs: 26.37 | +7: iteration 98920/ 173500 | consumed samples: 25323520 | consumed tokens: 51862568960 | elapsed time per iteration (s): 0.15 | learning rate: 9.153E-05 | global batch size: 256 | lm loss: 3.706026E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.199 | TFLOPs: 26.35 | +7: iteration 98930/ 173500 | consumed samples: 25326080 | consumed tokens: 51867811840 | elapsed time per iteration (s): 0.15 | learning rate: 9.151E-05 | global batch size: 256 | lm loss: 3.697129E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.198 | TFLOPs: 26.37 | +7: iteration 98940/ 173500 | consumed samples: 25328640 | consumed tokens: 51873054720 | elapsed time per iteration (s): 0.15 | learning rate: 9.150E-05 | global batch size: 256 | lm loss: 3.693085E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.904 | TFLOPs: 26.14 | +7: iteration 98950/ 173500 | consumed samples: 25331200 | consumed tokens: 51878297600 | elapsed time per iteration (s): 0.15 | learning rate: 9.148E-05 | global batch size: 256 | lm loss: 3.697080E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.988 | TFLOPs: 26.35 | +7: iteration 98960/ 173500 | consumed samples: 25333760 | consumed tokens: 51883540480 | elapsed time per iteration (s): 0.15 | learning rate: 9.146E-05 | global batch size: 256 | lm loss: 3.692261E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.089 | TFLOPs: 26.35 | +7: iteration 98970/ 173500 | consumed samples: 25336320 | consumed tokens: 51888783360 | elapsed time per iteration (s): 0.15 | learning rate: 9.145E-05 | global batch size: 256 | lm loss: 3.694191E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.655 | TFLOPs: 26.37 | +7: iteration 98980/ 173500 | consumed samples: 25338880 | consumed tokens: 51894026240 | elapsed time per iteration (s): 0.16 | learning rate: 9.143E-05 | global batch size: 256 | lm loss: 3.694048E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.633 | TFLOPs: 25.60 | +7: iteration 98990/ 173500 | consumed samples: 25341440 | consumed tokens: 51899269120 | elapsed time per iteration (s): 0.15 | learning rate: 9.141E-05 | global batch size: 256 | lm loss: 3.692128E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.727 | TFLOPs: 26.37 | +7: iteration 99000/ 173500 | consumed samples: 25344000 | consumed tokens: 51904512000 | elapsed time per iteration (s): 0.15 | learning rate: 9.140E-05 | global batch size: 256 | lm loss: 3.689483E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.950 | TFLOPs: 26.36 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 99000 | lm loss value: 3.884532E+00 | lm loss PPL: 4.864419E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 99000 to checkpoints_44m91b100m +0: [2023-03-17 04:32:43,206] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step99000 is begin to save! +0: [2023-03-17 04:32:43,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:32:43,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:32:43,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:32:43,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:32:43,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:32:43,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:32:43,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:32:43,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:32:43,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:32:43,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:32:43,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:32:43,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:32:43,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:32:43,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:32:43,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:32:43,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:32:43,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:32:43,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:32:43,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:32:43,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:32:43,341] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step99000/mp_rank_00_model_states.pt +0: [2023-03-17 04:32:43,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:32:43,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:32:43,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:32:43,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +1: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +7: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +5: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +6: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +3: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:32:43,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +2: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 04:32:43,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +4: [2023-03-17 04:32:43,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:32:43,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step99000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:32:43,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step99000 is ready now! +0: successfully saved checkpoint at iteration 99000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.63 +7: iteration 99010/ 173500 | consumed samples: 25346560 | consumed tokens: 51909754880 | elapsed time per iteration (s): 0.18 | learning rate: 9.138E-05 | global batch size: 256 | lm loss: 3.695876E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1446.135 | TFLOPs: 22.68 | +7: iteration 99020/ 173500 | consumed samples: 25349120 | consumed tokens: 51914997760 | elapsed time per iteration (s): 0.15 | learning rate: 9.137E-05 | global batch size: 256 | lm loss: 3.693162E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.128 | TFLOPs: 26.22 | +7: iteration 99030/ 173500 | consumed samples: 25351680 | consumed tokens: 51920240640 | elapsed time per iteration (s): 0.15 | learning rate: 9.135E-05 | global batch size: 256 | lm loss: 3.700241E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.641 | TFLOPs: 26.22 | +7: iteration 99040/ 173500 | consumed samples: 25354240 | consumed tokens: 51925483520 | elapsed time per iteration (s): 0.15 | learning rate: 9.133E-05 | global batch size: 256 | lm loss: 3.684938E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.991 | TFLOPs: 26.24 | +7: iteration 99050/ 173500 | consumed samples: 25356800 | consumed tokens: 51930726400 | elapsed time per iteration (s): 0.15 | learning rate: 9.132E-05 | global batch size: 256 | lm loss: 3.684104E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.546 | TFLOPs: 26.21 | +7: iteration 99060/ 173500 | consumed samples: 25359360 | consumed tokens: 51935969280 | elapsed time per iteration (s): 0.15 | learning rate: 9.130E-05 | global batch size: 256 | lm loss: 3.683884E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.489 | TFLOPs: 26.23 | +7: iteration 99070/ 173500 | consumed samples: 25361920 | consumed tokens: 51941212160 | elapsed time per iteration (s): 0.16 | learning rate: 9.129E-05 | global batch size: 256 | lm loss: 3.689339E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.287 | TFLOPs: 25.71 | +7: iteration 99080/ 173500 | consumed samples: 25364480 | consumed tokens: 51946455040 | elapsed time per iteration (s): 0.15 | learning rate: 9.127E-05 | global batch size: 256 | lm loss: 3.693206E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.167 | TFLOPs: 26.13 | +7: iteration 99090/ 173500 | consumed samples: 25367040 | consumed tokens: 51951697920 | elapsed time per iteration (s): 0.15 | learning rate: 9.125E-05 | global batch size: 256 | lm loss: 3.683909E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.476 | TFLOPs: 26.18 | +7: iteration 99100/ 173500 | consumed samples: 25369600 | consumed tokens: 51956940800 | elapsed time per iteration (s): 0.16 | learning rate: 9.124E-05 | global batch size: 256 | lm loss: 3.679803E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.346 | TFLOPs: 25.77 | +7: iteration 99110/ 173500 | consumed samples: 25372160 | consumed tokens: 51962183680 | elapsed time per iteration (s): 0.16 | learning rate: 9.122E-05 | global batch size: 256 | lm loss: 3.681248E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.396 | TFLOPs: 25.73 | +7: iteration 99120/ 173500 | consumed samples: 25374720 | consumed tokens: 51967426560 | elapsed time per iteration (s): 0.15 | learning rate: 9.121E-05 | global batch size: 256 | lm loss: 3.692688E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.602 | TFLOPs: 26.15 | +7: iteration 99130/ 173500 | consumed samples: 25377280 | consumed tokens: 51972669440 | elapsed time per iteration (s): 0.15 | learning rate: 9.119E-05 | global batch size: 256 | lm loss: 3.701564E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.972 | TFLOPs: 26.17 | +7: iteration 99140/ 173500 | consumed samples: 25379840 | consumed tokens: 51977912320 | elapsed time per iteration (s): 0.17 | learning rate: 9.117E-05 | global batch size: 256 | lm loss: 3.692970E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.129 | TFLOPs: 23.54 | +7: iteration 99150/ 173500 | consumed samples: 25382400 | consumed tokens: 51983155200 | elapsed time per iteration (s): 0.15 | learning rate: 9.116E-05 | global batch size: 256 | lm loss: 3.689665E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.681 | TFLOPs: 26.14 | +7: iteration 99160/ 173500 | consumed samples: 25384960 | consumed tokens: 51988398080 | elapsed time per iteration (s): 0.15 | learning rate: 9.114E-05 | global batch size: 256 | lm loss: 3.703426E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.028 | TFLOPs: 26.33 | +7: iteration 99170/ 173500 | consumed samples: 25387520 | consumed tokens: 51993640960 | elapsed time per iteration (s): 0.15 | learning rate: 9.113E-05 | global batch size: 256 | lm loss: 3.693896E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.411 | TFLOPs: 26.21 | +7: iteration 99180/ 173500 | consumed samples: 25390080 | consumed tokens: 51998883840 | elapsed time per iteration (s): 0.15 | learning rate: 9.111E-05 | global batch size: 256 | lm loss: 3.695792E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.108 | TFLOPs: 26.29 | +7: iteration 99190/ 173500 | consumed samples: 25392640 | consumed tokens: 52004126720 | elapsed time per iteration (s): 0.15 | learning rate: 9.109E-05 | global batch size: 256 | lm loss: 3.689867E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.142 | TFLOPs: 26.32 | +7: iteration 99200/ 173500 | consumed samples: 25395200 | consumed tokens: 52009369600 | elapsed time per iteration (s): 0.15 | learning rate: 9.108E-05 | global batch size: 256 | lm loss: 3.688979E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.534 | TFLOPs: 26.07 | +7: iteration 99210/ 173500 | consumed samples: 25397760 | consumed tokens: 52014612480 | elapsed time per iteration (s): 0.15 | learning rate: 9.106E-05 | global batch size: 256 | lm loss: 3.695723E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.364 | TFLOPs: 26.04 | +7: iteration 99220/ 173500 | consumed samples: 25400320 | consumed tokens: 52019855360 | elapsed time per iteration (s): 0.15 | learning rate: 9.104E-05 | global batch size: 256 | lm loss: 3.695845E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.502 | TFLOPs: 26.06 | +7: iteration 99230/ 173500 | consumed samples: 25402880 | consumed tokens: 52025098240 | elapsed time per iteration (s): 0.15 | learning rate: 9.103E-05 | global batch size: 256 | lm loss: 3.699369E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.710 | TFLOPs: 26.17 | +7: iteration 99240/ 173500 | consumed samples: 25405440 | consumed tokens: 52030341120 | elapsed time per iteration (s): 0.15 | learning rate: 9.101E-05 | global batch size: 256 | lm loss: 3.710464E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.497 | TFLOPs: 25.95 | +7: iteration 99250/ 173500 | consumed samples: 25408000 | consumed tokens: 52035584000 | elapsed time per iteration (s): 0.16 | learning rate: 9.100E-05 | global batch size: 256 | lm loss: 3.692677E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.064 | TFLOPs: 25.67 | +7: iteration 99260/ 173500 | consumed samples: 25410560 | consumed tokens: 52040826880 | elapsed time per iteration (s): 0.15 | learning rate: 9.098E-05 | global batch size: 256 | lm loss: 3.694392E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.611 | TFLOPs: 26.28 | +7: iteration 99270/ 173500 | consumed samples: 25413120 | consumed tokens: 52046069760 | elapsed time per iteration (s): 0.16 | learning rate: 9.096E-05 | global batch size: 256 | lm loss: 3.695745E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.467 | TFLOPs: 25.21 | +7: iteration 99280/ 173500 | consumed samples: 25415680 | consumed tokens: 52051312640 | elapsed time per iteration (s): 0.16 | learning rate: 9.095E-05 | global batch size: 256 | lm loss: 3.687273E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.282 | TFLOPs: 24.64 | +7: iteration 99290/ 173500 | consumed samples: 25418240 | consumed tokens: 52056555520 | elapsed time per iteration (s): 0.15 | learning rate: 9.093E-05 | global batch size: 256 | lm loss: 3.675222E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.996 | TFLOPs: 25.91 | +7: iteration 99300/ 173500 | consumed samples: 25420800 | consumed tokens: 52061798400 | elapsed time per iteration (s): 0.15 | learning rate: 9.092E-05 | global batch size: 256 | lm loss: 3.701516E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.216 | TFLOPs: 25.93 | +7: iteration 99310/ 173500 | consumed samples: 25423360 | consumed tokens: 52067041280 | elapsed time per iteration (s): 0.16 | learning rate: 9.090E-05 | global batch size: 256 | lm loss: 3.701421E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.763 | TFLOPs: 25.89 | +7: iteration 99320/ 173500 | consumed samples: 25425920 | consumed tokens: 52072284160 | elapsed time per iteration (s): 0.16 | learning rate: 9.088E-05 | global batch size: 256 | lm loss: 3.700315E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.556 | TFLOPs: 25.65 | +7: iteration 99330/ 173500 | consumed samples: 25428480 | consumed tokens: 52077527040 | elapsed time per iteration (s): 0.16 | learning rate: 9.087E-05 | global batch size: 256 | lm loss: 3.691655E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.982 | TFLOPs: 25.61 | +7: iteration 99340/ 173500 | consumed samples: 25431040 | consumed tokens: 52082769920 | elapsed time per iteration (s): 0.16 | learning rate: 9.085E-05 | global batch size: 256 | lm loss: 3.687410E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.555 | TFLOPs: 24.82 | +7: iteration 99350/ 173500 | consumed samples: 25433600 | consumed tokens: 52088012800 | elapsed time per iteration (s): 0.16 | learning rate: 9.084E-05 | global batch size: 256 | lm loss: 3.684709E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.876 | TFLOPs: 24.64 | +7: iteration 99360/ 173500 | consumed samples: 25436160 | consumed tokens: 52093255680 | elapsed time per iteration (s): 0.16 | learning rate: 9.082E-05 | global batch size: 256 | lm loss: 3.698113E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.345 | TFLOPs: 25.41 | +7: iteration 99370/ 173500 | consumed samples: 25438720 | consumed tokens: 52098498560 | elapsed time per iteration (s): 0.16 | learning rate: 9.080E-05 | global batch size: 256 | lm loss: 3.684251E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.034 | TFLOPs: 24.95 | +7: iteration 99380/ 173500 | consumed samples: 25441280 | consumed tokens: 52103741440 | elapsed time per iteration (s): 0.16 | learning rate: 9.079E-05 | global batch size: 256 | lm loss: 3.693835E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.503 | TFLOPs: 25.10 | +7: iteration 99390/ 173500 | consumed samples: 25443840 | consumed tokens: 52108984320 | elapsed time per iteration (s): 0.16 | learning rate: 9.077E-05 | global batch size: 256 | lm loss: 3.693799E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.336 | TFLOPs: 25.10 | +7: iteration 99400/ 173500 | consumed samples: 25446400 | consumed tokens: 52114227200 | elapsed time per iteration (s): 0.16 | learning rate: 9.076E-05 | global batch size: 256 | lm loss: 3.698820E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.700 | TFLOPs: 24.99 | +7: iteration 99410/ 173500 | consumed samples: 25448960 | consumed tokens: 52119470080 | elapsed time per iteration (s): 0.17 | learning rate: 9.074E-05 | global batch size: 256 | lm loss: 3.696705E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.092 | TFLOPs: 23.95 | +7: iteration 99420/ 173500 | consumed samples: 25451520 | consumed tokens: 52124712960 | elapsed time per iteration (s): 0.16 | learning rate: 9.072E-05 | global batch size: 256 | lm loss: 3.696002E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.892 | TFLOPs: 24.49 | +7: iteration 99430/ 173500 | consumed samples: 25454080 | consumed tokens: 52129955840 | elapsed time per iteration (s): 0.16 | learning rate: 9.071E-05 | global batch size: 256 | lm loss: 3.691663E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.852 | TFLOPs: 24.92 | +7: iteration 99440/ 173500 | consumed samples: 25456640 | consumed tokens: 52135198720 | elapsed time per iteration (s): 0.17 | learning rate: 9.069E-05 | global batch size: 256 | lm loss: 3.699519E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.805 | TFLOPs: 24.23 | +7: iteration 99450/ 173500 | consumed samples: 25459200 | consumed tokens: 52140441600 | elapsed time per iteration (s): 0.16 | learning rate: 9.067E-05 | global batch size: 256 | lm loss: 3.688912E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.271 | TFLOPs: 24.75 | +7: iteration 99460/ 173500 | consumed samples: 25461760 | consumed tokens: 52145684480 | elapsed time per iteration (s): 0.19 | learning rate: 9.066E-05 | global batch size: 256 | lm loss: 3.691801E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.275 | TFLOPs: 21.21 | +7: iteration 99470/ 173500 | consumed samples: 25464320 | consumed tokens: 52150927360 | elapsed time per iteration (s): 0.16 | learning rate: 9.064E-05 | global batch size: 256 | lm loss: 3.693765E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.594 | TFLOPs: 24.66 | +7: iteration 99480/ 173500 | consumed samples: 25466880 | consumed tokens: 52156170240 | elapsed time per iteration (s): 0.16 | learning rate: 9.063E-05 | global batch size: 256 | lm loss: 3.680729E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.785 | TFLOPs: 25.21 | +7: iteration 99490/ 173500 | consumed samples: 25469440 | consumed tokens: 52161413120 | elapsed time per iteration (s): 0.16 | learning rate: 9.061E-05 | global batch size: 256 | lm loss: 3.686656E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.718 | TFLOPs: 24.59 | +7: iteration 99500/ 173500 | consumed samples: 25472000 | consumed tokens: 52166656000 | elapsed time per iteration (s): 0.16 | learning rate: 9.059E-05 | global batch size: 256 | lm loss: 3.700002E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.976 | TFLOPs: 24.89 | +7: iteration 99510/ 173500 | consumed samples: 25474560 | consumed tokens: 52171898880 | elapsed time per iteration (s): 0.16 | learning rate: 9.058E-05 | global batch size: 256 | lm loss: 3.685414E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.976 | TFLOPs: 24.95 | +7: iteration 99520/ 173500 | consumed samples: 25477120 | consumed tokens: 52177141760 | elapsed time per iteration (s): 0.16 | learning rate: 9.056E-05 | global batch size: 256 | lm loss: 3.697819E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.802 | TFLOPs: 24.70 | +7: iteration 99530/ 173500 | consumed samples: 25479680 | consumed tokens: 52182384640 | elapsed time per iteration (s): 0.17 | learning rate: 9.055E-05 | global batch size: 256 | lm loss: 3.684817E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.307 | TFLOPs: 23.32 | +7: iteration 99540/ 173500 | consumed samples: 25482240 | consumed tokens: 52187627520 | elapsed time per iteration (s): 0.16 | learning rate: 9.053E-05 | global batch size: 256 | lm loss: 3.687444E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.512 | TFLOPs: 25.24 | +7: iteration 99550/ 173500 | consumed samples: 25484800 | consumed tokens: 52192870400 | elapsed time per iteration (s): 0.16 | learning rate: 9.051E-05 | global batch size: 256 | lm loss: 3.696748E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.137 | TFLOPs: 24.98 | +7: iteration 99560/ 173500 | consumed samples: 25487360 | consumed tokens: 52198113280 | elapsed time per iteration (s): 0.16 | learning rate: 9.050E-05 | global batch size: 256 | lm loss: 3.691127E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.788 | TFLOPs: 24.35 | +7: iteration 99570/ 173500 | consumed samples: 25489920 | consumed tokens: 52203356160 | elapsed time per iteration (s): 0.17 | learning rate: 9.048E-05 | global batch size: 256 | lm loss: 3.687708E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.554 | TFLOPs: 24.22 | +7: iteration 99580/ 173500 | consumed samples: 25492480 | consumed tokens: 52208599040 | elapsed time per iteration (s): 0.16 | learning rate: 9.047E-05 | global batch size: 256 | lm loss: 3.702854E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.458 | TFLOPs: 25.52 | +7: iteration 99590/ 173500 | consumed samples: 25495040 | consumed tokens: 52213841920 | elapsed time per iteration (s): 0.17 | learning rate: 9.045E-05 | global batch size: 256 | lm loss: 3.696147E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.127 | TFLOPs: 23.57 | +7: iteration 99600/ 173500 | consumed samples: 25497600 | consumed tokens: 52219084800 | elapsed time per iteration (s): 0.17 | learning rate: 9.043E-05 | global batch size: 256 | lm loss: 3.681242E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.361 | TFLOPs: 24.08 | +7: iteration 99610/ 173500 | consumed samples: 25500160 | consumed tokens: 52224327680 | elapsed time per iteration (s): 0.16 | learning rate: 9.042E-05 | global batch size: 256 | lm loss: 3.688401E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.685 | TFLOPs: 25.37 | +7: iteration 99620/ 173500 | consumed samples: 25502720 | consumed tokens: 52229570560 | elapsed time per iteration (s): 0.16 | learning rate: 9.040E-05 | global batch size: 256 | lm loss: 3.683175E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.762 | TFLOPs: 24.73 | +7: iteration 99630/ 173500 | consumed samples: 25505280 | consumed tokens: 52234813440 | elapsed time per iteration (s): 0.16 | learning rate: 9.039E-05 | global batch size: 256 | lm loss: 3.696993E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.379 | TFLOPs: 24.94 | +7: iteration 99640/ 173500 | consumed samples: 25507840 | consumed tokens: 52240056320 | elapsed time per iteration (s): 0.16 | learning rate: 9.037E-05 | global batch size: 256 | lm loss: 3.681205E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.442 | TFLOPs: 25.10 | +7: iteration 99650/ 173500 | consumed samples: 25510400 | consumed tokens: 52245299200 | elapsed time per iteration (s): 0.16 | learning rate: 9.035E-05 | global batch size: 256 | lm loss: 3.688958E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.421 | TFLOPs: 24.78 | +7: iteration 99660/ 173500 | consumed samples: 25512960 | consumed tokens: 52250542080 | elapsed time per iteration (s): 0.16 | learning rate: 9.034E-05 | global batch size: 256 | lm loss: 3.703008E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.044 | TFLOPs: 25.69 | +7: iteration 99670/ 173500 | consumed samples: 25515520 | consumed tokens: 52255784960 | elapsed time per iteration (s): 0.16 | learning rate: 9.032E-05 | global batch size: 256 | lm loss: 3.704115E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.821 | TFLOPs: 25.54 | +7: iteration 99680/ 173500 | consumed samples: 25518080 | consumed tokens: 52261027840 | elapsed time per iteration (s): 0.16 | learning rate: 9.031E-05 | global batch size: 256 | lm loss: 3.688676E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.701 | TFLOPs: 24.84 | +7: iteration 99690/ 173500 | consumed samples: 25520640 | consumed tokens: 52266270720 | elapsed time per iteration (s): 0.16 | learning rate: 9.029E-05 | global batch size: 256 | lm loss: 3.696938E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.454 | TFLOPs: 24.80 | +7: iteration 99700/ 173500 | consumed samples: 25523200 | consumed tokens: 52271513600 | elapsed time per iteration (s): 0.16 | learning rate: 9.027E-05 | global batch size: 256 | lm loss: 3.696619E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.649 | TFLOPs: 24.68 | +7: iteration 99710/ 173500 | consumed samples: 25525760 | consumed tokens: 52276756480 | elapsed time per iteration (s): 0.17 | learning rate: 9.026E-05 | global batch size: 256 | lm loss: 3.693032E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.863 | TFLOPs: 24.23 | +7: iteration 99720/ 173500 | consumed samples: 25528320 | consumed tokens: 52281999360 | elapsed time per iteration (s): 0.16 | learning rate: 9.024E-05 | global batch size: 256 | lm loss: 3.700284E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.488 | TFLOPs: 25.40 | +7: iteration 99730/ 173500 | consumed samples: 25530880 | consumed tokens: 52287242240 | elapsed time per iteration (s): 0.16 | learning rate: 9.022E-05 | global batch size: 256 | lm loss: 3.691723E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.321 | TFLOPs: 24.91 | +7: iteration 99740/ 173500 | consumed samples: 25533440 | consumed tokens: 52292485120 | elapsed time per iteration (s): 0.16 | learning rate: 9.021E-05 | global batch size: 256 | lm loss: 3.691835E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.180 | TFLOPs: 24.64 | +7: iteration 99750/ 173500 | consumed samples: 25536000 | consumed tokens: 52297728000 | elapsed time per iteration (s): 0.16 | learning rate: 9.019E-05 | global batch size: 256 | lm loss: 3.697234E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.835 | TFLOPs: 24.82 | +7: iteration 99760/ 173500 | consumed samples: 25538560 | consumed tokens: 52302970880 | elapsed time per iteration (s): 0.17 | learning rate: 9.018E-05 | global batch size: 256 | lm loss: 3.689890E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.644 | TFLOPs: 23.50 | +7: iteration 99770/ 173500 | consumed samples: 25541120 | consumed tokens: 52308213760 | elapsed time per iteration (s): 0.17 | learning rate: 9.016E-05 | global batch size: 256 | lm loss: 3.693725E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1496.940 | TFLOPs: 23.48 | +7: iteration 99780/ 173500 | consumed samples: 25543680 | consumed tokens: 52313456640 | elapsed time per iteration (s): 0.16 | learning rate: 9.014E-05 | global batch size: 256 | lm loss: 3.678696E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.057 | TFLOPs: 25.06 | +7: iteration 99790/ 173500 | consumed samples: 25546240 | consumed tokens: 52318699520 | elapsed time per iteration (s): 0.16 | learning rate: 9.013E-05 | global batch size: 256 | lm loss: 3.699739E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.624 | TFLOPs: 25.51 | +7: iteration 99800/ 173500 | consumed samples: 25548800 | consumed tokens: 52323942400 | elapsed time per iteration (s): 0.16 | learning rate: 9.011E-05 | global batch size: 256 | lm loss: 3.702535E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.240 | TFLOPs: 24.58 | +7: iteration 99810/ 173500 | consumed samples: 25551360 | consumed tokens: 52329185280 | elapsed time per iteration (s): 0.16 | learning rate: 9.010E-05 | global batch size: 256 | lm loss: 3.690553E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.303 | TFLOPs: 25.54 | +7: iteration 99820/ 173500 | consumed samples: 25553920 | consumed tokens: 52334428160 | elapsed time per iteration (s): 0.16 | learning rate: 9.008E-05 | global batch size: 256 | lm loss: 3.693005E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.587 | TFLOPs: 25.68 | +7: iteration 99830/ 173500 | consumed samples: 25556480 | consumed tokens: 52339671040 | elapsed time per iteration (s): 0.16 | learning rate: 9.006E-05 | global batch size: 256 | lm loss: 3.700278E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.456 | TFLOPs: 24.96 | +7: iteration 99840/ 173500 | consumed samples: 25559040 | consumed tokens: 52344913920 | elapsed time per iteration (s): 0.16 | learning rate: 9.005E-05 | global batch size: 256 | lm loss: 3.692273E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.713 | TFLOPs: 25.56 | +7: iteration 99850/ 173500 | consumed samples: 25561600 | consumed tokens: 52350156800 | elapsed time per iteration (s): 0.16 | learning rate: 9.003E-05 | global batch size: 256 | lm loss: 3.691551E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.521 | TFLOPs: 24.49 | +7: iteration 99860/ 173500 | consumed samples: 25564160 | consumed tokens: 52355399680 | elapsed time per iteration (s): 0.16 | learning rate: 9.002E-05 | global batch size: 256 | lm loss: 3.681015E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.067 | TFLOPs: 25.23 | +7: iteration 99870/ 173500 | consumed samples: 25566720 | consumed tokens: 52360642560 | elapsed time per iteration (s): 0.16 | learning rate: 9.000E-05 | global batch size: 256 | lm loss: 3.674720E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.865 | TFLOPs: 24.84 | +7: iteration 99880/ 173500 | consumed samples: 25569280 | consumed tokens: 52365885440 | elapsed time per iteration (s): 0.16 | learning rate: 8.998E-05 | global batch size: 256 | lm loss: 3.683048E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.715 | TFLOPs: 24.74 | +7: iteration 99890/ 173500 | consumed samples: 25571840 | consumed tokens: 52371128320 | elapsed time per iteration (s): 0.17 | learning rate: 8.997E-05 | global batch size: 256 | lm loss: 3.691307E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.119 | TFLOPs: 24.33 | +7: iteration 99900/ 173500 | consumed samples: 25574400 | consumed tokens: 52376371200 | elapsed time per iteration (s): 0.16 | learning rate: 8.995E-05 | global batch size: 256 | lm loss: 3.698419E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.314 | TFLOPs: 24.94 | +7: iteration 99910/ 173500 | consumed samples: 25576960 | consumed tokens: 52381614080 | elapsed time per iteration (s): 0.16 | learning rate: 8.994E-05 | global batch size: 256 | lm loss: 3.698673E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.726 | TFLOPs: 25.46 | +7: iteration 99920/ 173500 | consumed samples: 25579520 | consumed tokens: 52386856960 | elapsed time per iteration (s): 0.16 | learning rate: 8.992E-05 | global batch size: 256 | lm loss: 3.696707E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.928 | TFLOPs: 24.90 | +7: iteration 99930/ 173500 | consumed samples: 25582080 | consumed tokens: 52392099840 | elapsed time per iteration (s): 0.17 | learning rate: 8.990E-05 | global batch size: 256 | lm loss: 3.691114E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.030 | TFLOPs: 23.48 | +7: iteration 99940/ 173500 | consumed samples: 25584640 | consumed tokens: 52397342720 | elapsed time per iteration (s): 0.16 | learning rate: 8.989E-05 | global batch size: 256 | lm loss: 3.679728E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.936 | TFLOPs: 25.59 | +7: iteration 99950/ 173500 | consumed samples: 25587200 | consumed tokens: 52402585600 | elapsed time per iteration (s): 0.19 | learning rate: 8.987E-05 | global batch size: 256 | lm loss: 3.688393E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1316.248 | TFLOPs: 20.64 | +7: iteration 99960/ 173500 | consumed samples: 25589760 | consumed tokens: 52407828480 | elapsed time per iteration (s): 0.16 | learning rate: 8.986E-05 | global batch size: 256 | lm loss: 3.694709E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.307 | TFLOPs: 25.58 | +7: iteration 99970/ 173500 | consumed samples: 25592320 | consumed tokens: 52413071360 | elapsed time per iteration (s): 0.16 | learning rate: 8.984E-05 | global batch size: 256 | lm loss: 3.686813E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.469 | TFLOPs: 24.75 | +7: iteration 99980/ 173500 | consumed samples: 25594880 | consumed tokens: 52418314240 | elapsed time per iteration (s): 0.16 | learning rate: 8.982E-05 | global batch size: 256 | lm loss: 3.690956E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.546 | TFLOPs: 25.65 | +7: iteration 99990/ 173500 | consumed samples: 25597440 | consumed tokens: 52423557120 | elapsed time per iteration (s): 0.17 | learning rate: 8.981E-05 | global batch size: 256 | lm loss: 3.689606E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.385 | TFLOPs: 24.00 | +0: [2023-03-17 04:35:23,570] [INFO] [logging.py:68:log_dist] [Rank 0] step=100000, skipped=0, lr=[8.979141123724914e-05, 8.979141123724914e-05, 8.979141123724914e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 100000/ 173500 | consumed samples: 25600000 | consumed tokens: 52428800000 | elapsed time per iteration (s): 0.16 | learning rate: 8.979E-05 | global batch size: 256 | lm loss: 3.696239E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.941 | TFLOPs: 25.58 | +0: steps: 100000 loss: 3.6852 iter time (s): 0.157 samples/sec: 1634.857 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 100000 | lm loss value: 3.807976E+00 | lm loss PPL: 4.505913E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 100000 to checkpoints_44m91b100m +0: [2023-03-17 04:35:23,645] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step100000 is begin to save! +0: [2023-03-17 04:35:23,648] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:35:23,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:35:23,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:35:23,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:35:23,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:35:23,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:35:23,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:35:23,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:35:23,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:35:23,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:35:23,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:35:23,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:35:23,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:35:23,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:35:23,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:35:23,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:35:23,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:35:23,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:35:23,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:35:23,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:35:23,779] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step100000/mp_rank_00_model_states.pt +0: [2023-03-17 04:35:23,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:35:23,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:35:23,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:35:23,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,813] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,813] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,815] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,815] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,817] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,819] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +7: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:35:23,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:35:23,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +1: [2023-03-17 04:35:23,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +2: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +6: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +4: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +5: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:35:23,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step100000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:35:23,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step100000 is ready now! +0: successfully saved checkpoint at iteration 100000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 181.35 +7: iteration 100010/ 173500 | consumed samples: 25602560 | consumed tokens: 52434042880 | elapsed time per iteration (s): 0.18 | learning rate: 8.978E-05 | global batch size: 256 | lm loss: 3.692586E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.575 | TFLOPs: 22.28 | +7: iteration 100020/ 173500 | consumed samples: 25605120 | consumed tokens: 52439285760 | elapsed time per iteration (s): 0.17 | learning rate: 8.976E-05 | global batch size: 256 | lm loss: 3.697164E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.705 | TFLOPs: 24.26 | +7: iteration 100030/ 173500 | consumed samples: 25607680 | consumed tokens: 52444528640 | elapsed time per iteration (s): 0.16 | learning rate: 8.974E-05 | global batch size: 256 | lm loss: 3.687800E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.209 | TFLOPs: 24.86 | +7: iteration 100040/ 173500 | consumed samples: 25610240 | consumed tokens: 52449771520 | elapsed time per iteration (s): 0.16 | learning rate: 8.973E-05 | global batch size: 256 | lm loss: 3.703522E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.080 | TFLOPs: 24.80 | +7: iteration 100050/ 173500 | consumed samples: 25612800 | consumed tokens: 52455014400 | elapsed time per iteration (s): 0.17 | learning rate: 8.971E-05 | global batch size: 256 | lm loss: 3.695506E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.230 | TFLOPs: 23.81 | +7: iteration 100060/ 173500 | consumed samples: 25615360 | consumed tokens: 52460257280 | elapsed time per iteration (s): 0.16 | learning rate: 8.970E-05 | global batch size: 256 | lm loss: 3.687539E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.616 | TFLOPs: 25.02 | +7: iteration 100070/ 173500 | consumed samples: 25617920 | consumed tokens: 52465500160 | elapsed time per iteration (s): 0.16 | learning rate: 8.968E-05 | global batch size: 256 | lm loss: 3.684255E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.675 | TFLOPs: 24.57 | +7: iteration 100080/ 173500 | consumed samples: 25620480 | consumed tokens: 52470743040 | elapsed time per iteration (s): 0.16 | learning rate: 8.966E-05 | global batch size: 256 | lm loss: 3.689722E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.954 | TFLOPs: 25.64 | +7: iteration 100090/ 173500 | consumed samples: 25623040 | consumed tokens: 52475985920 | elapsed time per iteration (s): 0.16 | learning rate: 8.965E-05 | global batch size: 256 | lm loss: 3.693050E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.452 | TFLOPs: 24.75 | +7: iteration 100100/ 173500 | consumed samples: 25625600 | consumed tokens: 52481228800 | elapsed time per iteration (s): 0.17 | learning rate: 8.963E-05 | global batch size: 256 | lm loss: 3.683827E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.919 | TFLOPs: 24.07 | +7: iteration 100110/ 173500 | consumed samples: 25628160 | consumed tokens: 52486471680 | elapsed time per iteration (s): 0.17 | learning rate: 8.962E-05 | global batch size: 256 | lm loss: 3.684287E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.514 | TFLOPs: 22.98 | +7: iteration 100120/ 173500 | consumed samples: 25630720 | consumed tokens: 52491714560 | elapsed time per iteration (s): 0.17 | learning rate: 8.960E-05 | global batch size: 256 | lm loss: 3.702304E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.256 | TFLOPs: 24.20 | +7: iteration 100130/ 173500 | consumed samples: 25633280 | consumed tokens: 52496957440 | elapsed time per iteration (s): 0.16 | learning rate: 8.958E-05 | global batch size: 256 | lm loss: 3.693504E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.813 | TFLOPs: 24.87 | +7: iteration 100140/ 173500 | consumed samples: 25635840 | consumed tokens: 52502200320 | elapsed time per iteration (s): 0.16 | learning rate: 8.957E-05 | global batch size: 256 | lm loss: 3.687023E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.794 | TFLOPs: 25.83 | +7: iteration 100150/ 173500 | consumed samples: 25638400 | consumed tokens: 52507443200 | elapsed time per iteration (s): 0.16 | learning rate: 8.955E-05 | global batch size: 256 | lm loss: 3.695138E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.569 | TFLOPs: 25.07 | +7: iteration 100160/ 173500 | consumed samples: 25640960 | consumed tokens: 52512686080 | elapsed time per iteration (s): 0.16 | learning rate: 8.953E-05 | global batch size: 256 | lm loss: 3.698788E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.398 | TFLOPs: 24.52 | +7: iteration 100170/ 173500 | consumed samples: 25643520 | consumed tokens: 52517928960 | elapsed time per iteration (s): 0.16 | learning rate: 8.952E-05 | global batch size: 256 | lm loss: 3.694254E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.025 | TFLOPs: 24.61 | +7: iteration 100180/ 173500 | consumed samples: 25646080 | consumed tokens: 52523171840 | elapsed time per iteration (s): 0.16 | learning rate: 8.950E-05 | global batch size: 256 | lm loss: 3.698699E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.660 | TFLOPs: 25.79 | +7: iteration 100190/ 173500 | consumed samples: 25648640 | consumed tokens: 52528414720 | elapsed time per iteration (s): 0.16 | learning rate: 8.949E-05 | global batch size: 256 | lm loss: 3.702386E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.479 | TFLOPs: 24.77 | +7: iteration 100200/ 173500 | consumed samples: 25651200 | consumed tokens: 52533657600 | elapsed time per iteration (s): 0.16 | learning rate: 8.947E-05 | global batch size: 256 | lm loss: 3.694632E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.789 | TFLOPs: 25.64 | +7: iteration 100210/ 173500 | consumed samples: 25653760 | consumed tokens: 52538900480 | elapsed time per iteration (s): 0.17 | learning rate: 8.945E-05 | global batch size: 256 | lm loss: 3.677438E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1499.148 | TFLOPs: 23.51 | +7: iteration 100220/ 173500 | consumed samples: 25656320 | consumed tokens: 52544143360 | elapsed time per iteration (s): 0.16 | learning rate: 8.944E-05 | global batch size: 256 | lm loss: 3.680401E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.341 | TFLOPs: 25.43 | +7: iteration 100230/ 173500 | consumed samples: 25658880 | consumed tokens: 52549386240 | elapsed time per iteration (s): 0.16 | learning rate: 8.942E-05 | global batch size: 256 | lm loss: 3.688683E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.832 | TFLOPs: 24.73 | +7: iteration 100240/ 173500 | consumed samples: 25661440 | consumed tokens: 52554629120 | elapsed time per iteration (s): 0.16 | learning rate: 8.941E-05 | global batch size: 256 | lm loss: 3.681051E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.014 | TFLOPs: 25.81 | +7: iteration 100250/ 173500 | consumed samples: 25664000 | consumed tokens: 52559872000 | elapsed time per iteration (s): 0.16 | learning rate: 8.939E-05 | global batch size: 256 | lm loss: 3.688509E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.956 | TFLOPs: 25.56 | +7: iteration 100260/ 173500 | consumed samples: 25666560 | consumed tokens: 52565114880 | elapsed time per iteration (s): 0.16 | learning rate: 8.937E-05 | global batch size: 256 | lm loss: 3.677443E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.747 | TFLOPs: 25.39 | +7: iteration 100270/ 173500 | consumed samples: 25669120 | consumed tokens: 52570357760 | elapsed time per iteration (s): 0.16 | learning rate: 8.936E-05 | global batch size: 256 | lm loss: 3.685776E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.039 | TFLOPs: 25.77 | +7: iteration 100280/ 173500 | consumed samples: 25671680 | consumed tokens: 52575600640 | elapsed time per iteration (s): 0.16 | learning rate: 8.934E-05 | global batch size: 256 | lm loss: 3.695908E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.597 | TFLOPs: 25.71 | +7: iteration 100290/ 173500 | consumed samples: 25674240 | consumed tokens: 52580843520 | elapsed time per iteration (s): 0.15 | learning rate: 8.933E-05 | global batch size: 256 | lm loss: 3.686376E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.582 | TFLOPs: 26.23 | +7: iteration 100300/ 173500 | consumed samples: 25676800 | consumed tokens: 52586086400 | elapsed time per iteration (s): 0.16 | learning rate: 8.931E-05 | global batch size: 256 | lm loss: 3.688743E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.064 | TFLOPs: 24.90 | +7: iteration 100310/ 173500 | consumed samples: 25679360 | consumed tokens: 52591329280 | elapsed time per iteration (s): 0.16 | learning rate: 8.929E-05 | global batch size: 256 | lm loss: 3.689942E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.217 | TFLOPs: 25.63 | +7: iteration 100320/ 173500 | consumed samples: 25681920 | consumed tokens: 52596572160 | elapsed time per iteration (s): 0.15 | learning rate: 8.928E-05 | global batch size: 256 | lm loss: 3.699747E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.476 | TFLOPs: 26.21 | +7: iteration 100330/ 173500 | consumed samples: 25684480 | consumed tokens: 52601815040 | elapsed time per iteration (s): 0.16 | learning rate: 8.926E-05 | global batch size: 256 | lm loss: 3.696213E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.833 | TFLOPs: 25.83 | +7: iteration 100340/ 173500 | consumed samples: 25687040 | consumed tokens: 52607057920 | elapsed time per iteration (s): 0.16 | learning rate: 8.925E-05 | global batch size: 256 | lm loss: 3.677301E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.593 | TFLOPs: 25.49 | +7: iteration 100350/ 173500 | consumed samples: 25689600 | consumed tokens: 52612300800 | elapsed time per iteration (s): 0.16 | learning rate: 8.923E-05 | global batch size: 256 | lm loss: 3.684837E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.081 | TFLOPs: 25.33 | +7: iteration 100360/ 173500 | consumed samples: 25692160 | consumed tokens: 52617543680 | elapsed time per iteration (s): 0.16 | learning rate: 8.921E-05 | global batch size: 256 | lm loss: 3.692026E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.412 | TFLOPs: 25.66 | +7: iteration 100370/ 173500 | consumed samples: 25694720 | consumed tokens: 52622786560 | elapsed time per iteration (s): 0.16 | learning rate: 8.920E-05 | global batch size: 256 | lm loss: 3.703318E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.754 | TFLOPs: 25.31 | +7: iteration 100380/ 173500 | consumed samples: 25697280 | consumed tokens: 52628029440 | elapsed time per iteration (s): 0.16 | learning rate: 8.918E-05 | global batch size: 256 | lm loss: 3.705591E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.530 | TFLOPs: 25.48 | +7: iteration 100390/ 173500 | consumed samples: 25699840 | consumed tokens: 52633272320 | elapsed time per iteration (s): 0.17 | learning rate: 8.917E-05 | global batch size: 256 | lm loss: 3.689265E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.645 | TFLOPs: 23.99 | +7: iteration 100400/ 173500 | consumed samples: 25702400 | consumed tokens: 52638515200 | elapsed time per iteration (s): 0.16 | learning rate: 8.915E-05 | global batch size: 256 | lm loss: 3.706003E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.715 | TFLOPs: 25.67 | +7: iteration 100410/ 173500 | consumed samples: 25704960 | consumed tokens: 52643758080 | elapsed time per iteration (s): 0.16 | learning rate: 8.913E-05 | global batch size: 256 | lm loss: 3.698683E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.226 | TFLOPs: 25.66 | +7: iteration 100420/ 173500 | consumed samples: 25707520 | consumed tokens: 52649000960 | elapsed time per iteration (s): 0.17 | learning rate: 8.912E-05 | global batch size: 256 | lm loss: 3.695824E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.301 | TFLOPs: 24.30 | +7: iteration 100430/ 173500 | consumed samples: 25710080 | consumed tokens: 52654243840 | elapsed time per iteration (s): 0.16 | learning rate: 8.910E-05 | global batch size: 256 | lm loss: 3.695951E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.103 | TFLOPs: 25.00 | +7: iteration 100440/ 173500 | consumed samples: 25712640 | consumed tokens: 52659486720 | elapsed time per iteration (s): 0.16 | learning rate: 8.909E-05 | global batch size: 256 | lm loss: 3.689383E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.251 | TFLOPs: 25.86 | +7: iteration 100450/ 173500 | consumed samples: 25715200 | consumed tokens: 52664729600 | elapsed time per iteration (s): 0.16 | learning rate: 8.907E-05 | global batch size: 256 | lm loss: 3.696419E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.705 | TFLOPs: 25.13 | +7: iteration 100460/ 173500 | consumed samples: 25717760 | consumed tokens: 52669972480 | elapsed time per iteration (s): 0.16 | learning rate: 8.905E-05 | global batch size: 256 | lm loss: 3.699434E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.790 | TFLOPs: 24.57 | +7: iteration 100470/ 173500 | consumed samples: 25720320 | consumed tokens: 52675215360 | elapsed time per iteration (s): 0.16 | learning rate: 8.904E-05 | global batch size: 256 | lm loss: 3.679096E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.771 | TFLOPs: 25.40 | +7: iteration 100480/ 173500 | consumed samples: 25722880 | consumed tokens: 52680458240 | elapsed time per iteration (s): 0.16 | learning rate: 8.902E-05 | global batch size: 256 | lm loss: 3.697622E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.041 | TFLOPs: 24.53 | +7: iteration 100490/ 173500 | consumed samples: 25725440 | consumed tokens: 52685701120 | elapsed time per iteration (s): 0.16 | learning rate: 8.901E-05 | global batch size: 256 | lm loss: 3.684186E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.526 | TFLOPs: 25.73 | +7: iteration 100500/ 173500 | consumed samples: 25728000 | consumed tokens: 52690944000 | elapsed time per iteration (s): 0.17 | learning rate: 8.899E-05 | global batch size: 256 | lm loss: 3.702264E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.432 | TFLOPs: 23.15 | +7: iteration 100510/ 173500 | consumed samples: 25730560 | consumed tokens: 52696186880 | elapsed time per iteration (s): 0.17 | learning rate: 8.897E-05 | global batch size: 256 | lm loss: 3.693097E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.443 | TFLOPs: 23.99 | +7: iteration 100520/ 173500 | consumed samples: 25733120 | consumed tokens: 52701429760 | elapsed time per iteration (s): 0.17 | learning rate: 8.896E-05 | global batch size: 256 | lm loss: 3.697185E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.482 | TFLOPs: 24.32 | +7: iteration 100530/ 173500 | consumed samples: 25735680 | consumed tokens: 52706672640 | elapsed time per iteration (s): 0.16 | learning rate: 8.894E-05 | global batch size: 256 | lm loss: 3.704293E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.029 | TFLOPs: 25.22 | +7: iteration 100540/ 173500 | consumed samples: 25738240 | consumed tokens: 52711915520 | elapsed time per iteration (s): 0.16 | learning rate: 8.893E-05 | global batch size: 256 | lm loss: 3.694640E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.816 | TFLOPs: 25.83 | +7: iteration 100550/ 173500 | consumed samples: 25740800 | consumed tokens: 52717158400 | elapsed time per iteration (s): 0.16 | learning rate: 8.891E-05 | global batch size: 256 | lm loss: 3.696002E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.636 | TFLOPs: 24.51 | +7: iteration 100560/ 173500 | consumed samples: 25743360 | consumed tokens: 52722401280 | elapsed time per iteration (s): 0.16 | learning rate: 8.889E-05 | global batch size: 256 | lm loss: 3.675915E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.359 | TFLOPs: 25.80 | +7: iteration 100570/ 173500 | consumed samples: 25745920 | consumed tokens: 52727644160 | elapsed time per iteration (s): 0.17 | learning rate: 8.888E-05 | global batch size: 256 | lm loss: 3.693029E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.441 | TFLOPs: 24.24 | +7: iteration 100580/ 173500 | consumed samples: 25748480 | consumed tokens: 52732887040 | elapsed time per iteration (s): 0.17 | learning rate: 8.886E-05 | global batch size: 256 | lm loss: 3.695623E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.488 | TFLOPs: 24.24 | +7: iteration 100590/ 173500 | consumed samples: 25751040 | consumed tokens: 52738129920 | elapsed time per iteration (s): 0.15 | learning rate: 8.885E-05 | global batch size: 256 | lm loss: 3.676076E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.781 | TFLOPs: 26.23 | +7: iteration 100600/ 173500 | consumed samples: 25753600 | consumed tokens: 52743372800 | elapsed time per iteration (s): 0.16 | learning rate: 8.883E-05 | global batch size: 256 | lm loss: 3.674650E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.805 | TFLOPs: 25.26 | +7: iteration 100610/ 173500 | consumed samples: 25756160 | consumed tokens: 52748615680 | elapsed time per iteration (s): 0.16 | learning rate: 8.881E-05 | global batch size: 256 | lm loss: 3.696929E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.518 | TFLOPs: 25.62 | +7: iteration 100620/ 173500 | consumed samples: 25758720 | consumed tokens: 52753858560 | elapsed time per iteration (s): 0.16 | learning rate: 8.880E-05 | global batch size: 256 | lm loss: 3.698008E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.320 | TFLOPs: 24.41 | +7: iteration 100630/ 173500 | consumed samples: 25761280 | consumed tokens: 52759101440 | elapsed time per iteration (s): 0.16 | learning rate: 8.878E-05 | global batch size: 256 | lm loss: 3.680331E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.400 | TFLOPs: 25.88 | +7: iteration 100640/ 173500 | consumed samples: 25763840 | consumed tokens: 52764344320 | elapsed time per iteration (s): 0.16 | learning rate: 8.877E-05 | global batch size: 256 | lm loss: 3.686273E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.652 | TFLOPs: 25.37 | +7: iteration 100650/ 173500 | consumed samples: 25766400 | consumed tokens: 52769587200 | elapsed time per iteration (s): 0.16 | learning rate: 8.875E-05 | global batch size: 256 | lm loss: 3.692594E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.676 | TFLOPs: 25.43 | +7: iteration 100660/ 173500 | consumed samples: 25768960 | consumed tokens: 52774830080 | elapsed time per iteration (s): 0.16 | learning rate: 8.873E-05 | global batch size: 256 | lm loss: 3.682938E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.356 | TFLOPs: 25.80 | +7: iteration 100670/ 173500 | consumed samples: 25771520 | consumed tokens: 52780072960 | elapsed time per iteration (s): 0.16 | learning rate: 8.872E-05 | global batch size: 256 | lm loss: 3.686238E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.468 | TFLOPs: 25.10 | +7: iteration 100680/ 173500 | consumed samples: 25774080 | consumed tokens: 52785315840 | elapsed time per iteration (s): 0.16 | learning rate: 8.870E-05 | global batch size: 256 | lm loss: 3.684170E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.052 | TFLOPs: 24.34 | +7: iteration 100690/ 173500 | consumed samples: 25776640 | consumed tokens: 52790558720 | elapsed time per iteration (s): 0.16 | learning rate: 8.869E-05 | global batch size: 256 | lm loss: 3.676593E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.632 | TFLOPs: 24.43 | +7: iteration 100700/ 173500 | consumed samples: 25779200 | consumed tokens: 52795801600 | elapsed time per iteration (s): 0.16 | learning rate: 8.867E-05 | global batch size: 256 | lm loss: 3.675373E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.677 | TFLOPs: 25.75 | +7: iteration 100710/ 173500 | consumed samples: 25781760 | consumed tokens: 52801044480 | elapsed time per iteration (s): 0.16 | learning rate: 8.865E-05 | global batch size: 256 | lm loss: 3.690359E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.804 | TFLOPs: 25.18 | +7: iteration 100720/ 173500 | consumed samples: 25784320 | consumed tokens: 52806287360 | elapsed time per iteration (s): 0.16 | learning rate: 8.864E-05 | global batch size: 256 | lm loss: 3.692598E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.907 | TFLOPs: 25.53 | +7: iteration 100730/ 173500 | consumed samples: 25786880 | consumed tokens: 52811530240 | elapsed time per iteration (s): 0.16 | learning rate: 8.862E-05 | global batch size: 256 | lm loss: 3.683163E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.762 | TFLOPs: 24.77 | +7: iteration 100740/ 173500 | consumed samples: 25789440 | consumed tokens: 52816773120 | elapsed time per iteration (s): 0.16 | learning rate: 8.861E-05 | global batch size: 256 | lm loss: 3.681477E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.253 | TFLOPs: 25.30 | +7: iteration 100750/ 173500 | consumed samples: 25792000 | consumed tokens: 52822016000 | elapsed time per iteration (s): 0.16 | learning rate: 8.859E-05 | global batch size: 256 | lm loss: 3.690661E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.491 | TFLOPs: 25.15 | +7: iteration 100760/ 173500 | consumed samples: 25794560 | consumed tokens: 52827258880 | elapsed time per iteration (s): 0.16 | learning rate: 8.857E-05 | global batch size: 256 | lm loss: 3.690496E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.178 | TFLOPs: 25.35 | +7: iteration 100770/ 173500 | consumed samples: 25797120 | consumed tokens: 52832501760 | elapsed time per iteration (s): 0.16 | learning rate: 8.856E-05 | global batch size: 256 | lm loss: 3.682582E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.472 | TFLOPs: 24.60 | +7: iteration 100780/ 173500 | consumed samples: 25799680 | consumed tokens: 52837744640 | elapsed time per iteration (s): 0.16 | learning rate: 8.854E-05 | global batch size: 256 | lm loss: 3.688945E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.748 | TFLOPs: 25.81 | +7: iteration 100790/ 173500 | consumed samples: 25802240 | consumed tokens: 52842987520 | elapsed time per iteration (s): 0.16 | learning rate: 8.853E-05 | global batch size: 256 | lm loss: 3.685967E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.154 | TFLOPs: 24.83 | +7: iteration 100800/ 173500 | consumed samples: 25804800 | consumed tokens: 52848230400 | elapsed time per iteration (s): 0.16 | learning rate: 8.851E-05 | global batch size: 256 | lm loss: 3.698097E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.588 | TFLOPs: 25.40 | +7: iteration 100810/ 173500 | consumed samples: 25807360 | consumed tokens: 52853473280 | elapsed time per iteration (s): 0.16 | learning rate: 8.849E-05 | global batch size: 256 | lm loss: 3.681532E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.089 | TFLOPs: 25.09 | +7: iteration 100820/ 173500 | consumed samples: 25809920 | consumed tokens: 52858716160 | elapsed time per iteration (s): 0.16 | learning rate: 8.848E-05 | global batch size: 256 | lm loss: 3.696126E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.894 | TFLOPs: 25.42 | +7: iteration 100830/ 173500 | consumed samples: 25812480 | consumed tokens: 52863959040 | elapsed time per iteration (s): 0.16 | learning rate: 8.846E-05 | global batch size: 256 | lm loss: 3.700158E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.506 | TFLOPs: 25.84 | +7: iteration 100840/ 173500 | consumed samples: 25815040 | consumed tokens: 52869201920 | elapsed time per iteration (s): 0.16 | learning rate: 8.845E-05 | global batch size: 256 | lm loss: 3.700875E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.431 | TFLOPs: 25.13 | +7: iteration 100850/ 173500 | consumed samples: 25817600 | consumed tokens: 52874444800 | elapsed time per iteration (s): 0.17 | learning rate: 8.843E-05 | global batch size: 256 | lm loss: 3.689664E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.340 | TFLOPs: 24.20 | +7: iteration 100860/ 173500 | consumed samples: 25820160 | consumed tokens: 52879687680 | elapsed time per iteration (s): 0.16 | learning rate: 8.841E-05 | global batch size: 256 | lm loss: 3.676470E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.394 | TFLOPs: 25.58 | +7: iteration 100870/ 173500 | consumed samples: 25822720 | consumed tokens: 52884930560 | elapsed time per iteration (s): 0.16 | learning rate: 8.840E-05 | global batch size: 256 | lm loss: 3.695511E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.006 | TFLOPs: 24.48 | +7: iteration 100880/ 173500 | consumed samples: 25825280 | consumed tokens: 52890173440 | elapsed time per iteration (s): 0.16 | learning rate: 8.838E-05 | global batch size: 256 | lm loss: 3.686354E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.779 | TFLOPs: 24.40 | +7: iteration 100890/ 173500 | consumed samples: 25827840 | consumed tokens: 52895416320 | elapsed time per iteration (s): 0.16 | learning rate: 8.837E-05 | global batch size: 256 | lm loss: 3.688928E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.301 | TFLOPs: 25.30 | +7: iteration 100900/ 173500 | consumed samples: 25830400 | consumed tokens: 52900659200 | elapsed time per iteration (s): 0.16 | learning rate: 8.835E-05 | global batch size: 256 | lm loss: 3.696218E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.108 | TFLOPs: 25.25 | +7: iteration 100910/ 173500 | consumed samples: 25832960 | consumed tokens: 52905902080 | elapsed time per iteration (s): 0.16 | learning rate: 8.833E-05 | global batch size: 256 | lm loss: 3.683615E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.467 | TFLOPs: 24.74 | +7: iteration 100920/ 173500 | consumed samples: 25835520 | consumed tokens: 52911144960 | elapsed time per iteration (s): 0.16 | learning rate: 8.832E-05 | global batch size: 256 | lm loss: 3.695922E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.301 | TFLOPs: 25.30 | +7: iteration 100930/ 173500 | consumed samples: 25838080 | consumed tokens: 52916387840 | elapsed time per iteration (s): 0.16 | learning rate: 8.830E-05 | global batch size: 256 | lm loss: 3.688343E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.477 | TFLOPs: 25.85 | +7: iteration 100940/ 173500 | consumed samples: 25840640 | consumed tokens: 52921630720 | elapsed time per iteration (s): 0.17 | learning rate: 8.829E-05 | global batch size: 256 | lm loss: 3.700983E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.336 | TFLOPs: 23.09 | +7: iteration 100950/ 173500 | consumed samples: 25843200 | consumed tokens: 52926873600 | elapsed time per iteration (s): 0.16 | learning rate: 8.827E-05 | global batch size: 256 | lm loss: 3.689495E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.875 | TFLOPs: 24.40 | +7: iteration 100960/ 173500 | consumed samples: 25845760 | consumed tokens: 52932116480 | elapsed time per iteration (s): 0.16 | learning rate: 8.825E-05 | global batch size: 256 | lm loss: 3.677990E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.639 | TFLOPs: 24.73 | +7: iteration 100970/ 173500 | consumed samples: 25848320 | consumed tokens: 52937359360 | elapsed time per iteration (s): 0.16 | learning rate: 8.824E-05 | global batch size: 256 | lm loss: 3.689193E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.303 | TFLOPs: 25.82 | +7: iteration 100980/ 173500 | consumed samples: 25850880 | consumed tokens: 52942602240 | elapsed time per iteration (s): 0.16 | learning rate: 8.822E-05 | global batch size: 256 | lm loss: 3.687910E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.307 | TFLOPs: 24.36 | +7: iteration 100990/ 173500 | consumed samples: 25853440 | consumed tokens: 52947845120 | elapsed time per iteration (s): 0.15 | learning rate: 8.821E-05 | global batch size: 256 | lm loss: 3.691291E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.651 | TFLOPs: 25.96 | +7: iteration 101000/ 173500 | consumed samples: 25856000 | consumed tokens: 52953088000 | elapsed time per iteration (s): 0.16 | learning rate: 8.819E-05 | global batch size: 256 | lm loss: 3.694573E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.630 | TFLOPs: 25.04 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 101000 | lm loss value: 3.828295E+00 | lm loss PPL: 4.598405E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 101000 to checkpoints_44m91b100m +0: [2023-03-17 04:38:04,175] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step101000 is begin to save! +0: [2023-03-17 04:38:04,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:38:04,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:38:04,247] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:38:04,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:38:04,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:38:04,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:38:04,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:38:04,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:38:04,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:38:04,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:38:04,284] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:38:04,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:38:04,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:38:04,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:38:04,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:38:04,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:38:04,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:38:04,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:38:04,317] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:38:04,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:38:04,319] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step101000/mp_rank_00_model_states.pt +0: [2023-03-17 04:38:04,319] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:38:04,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:38:04,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:38:04,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:38:04,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:38:04,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +1: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +4: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +5: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +3: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +7: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +2: [2023-03-17 04:38:04,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step101000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:38:04,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step101000 is ready now! +0: successfully saved checkpoint at iteration 101000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 191.64 +7: iteration 101010/ 173500 | consumed samples: 25858560 | consumed tokens: 52958330880 | elapsed time per iteration (s): 0.18 | learning rate: 8.817E-05 | global batch size: 256 | lm loss: 3.699067E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.225 | TFLOPs: 21.72 | +7: iteration 101020/ 173500 | consumed samples: 25861120 | consumed tokens: 52963573760 | elapsed time per iteration (s): 0.16 | learning rate: 8.816E-05 | global batch size: 256 | lm loss: 3.687544E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.408 | TFLOPs: 25.21 | +7: iteration 101030/ 173500 | consumed samples: 25863680 | consumed tokens: 52968816640 | elapsed time per iteration (s): 0.16 | learning rate: 8.814E-05 | global batch size: 256 | lm loss: 3.686686E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.906 | TFLOPs: 24.34 | +7: iteration 101040/ 173500 | consumed samples: 25866240 | consumed tokens: 52974059520 | elapsed time per iteration (s): 0.16 | learning rate: 8.813E-05 | global batch size: 256 | lm loss: 3.693330E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.853 | TFLOPs: 25.07 | +7: iteration 101050/ 173500 | consumed samples: 25868800 | consumed tokens: 52979302400 | elapsed time per iteration (s): 0.16 | learning rate: 8.811E-05 | global batch size: 256 | lm loss: 3.691914E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.066 | TFLOPs: 24.86 | +7: iteration 101060/ 173500 | consumed samples: 25871360 | consumed tokens: 52984545280 | elapsed time per iteration (s): 0.16 | learning rate: 8.810E-05 | global batch size: 256 | lm loss: 3.693153E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.376 | TFLOPs: 24.50 | +7: iteration 101070/ 173500 | consumed samples: 25873920 | consumed tokens: 52989788160 | elapsed time per iteration (s): 0.16 | learning rate: 8.808E-05 | global batch size: 256 | lm loss: 3.686061E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.876 | TFLOPs: 24.49 | +7: iteration 101080/ 173500 | consumed samples: 25876480 | consumed tokens: 52995031040 | elapsed time per iteration (s): 0.16 | learning rate: 8.806E-05 | global batch size: 256 | lm loss: 3.692542E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.138 | TFLOPs: 24.83 | +7: iteration 101090/ 173500 | consumed samples: 25879040 | consumed tokens: 53000273920 | elapsed time per iteration (s): 0.16 | learning rate: 8.805E-05 | global batch size: 256 | lm loss: 3.697049E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.869 | TFLOPs: 24.95 | +7: iteration 101100/ 173500 | consumed samples: 25881600 | consumed tokens: 53005516800 | elapsed time per iteration (s): 0.16 | learning rate: 8.803E-05 | global batch size: 256 | lm loss: 3.693052E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.078 | TFLOPs: 24.98 | +7: iteration 101110/ 173500 | consumed samples: 25884160 | consumed tokens: 53010759680 | elapsed time per iteration (s): 0.17 | learning rate: 8.802E-05 | global batch size: 256 | lm loss: 3.698802E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.424 | TFLOPs: 24.13 | +7: iteration 101120/ 173500 | consumed samples: 25886720 | consumed tokens: 53016002560 | elapsed time per iteration (s): 0.16 | learning rate: 8.800E-05 | global batch size: 256 | lm loss: 3.697140E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.387 | TFLOPs: 24.94 | +7: iteration 101130/ 173500 | consumed samples: 25889280 | consumed tokens: 53021245440 | elapsed time per iteration (s): 0.16 | learning rate: 8.798E-05 | global batch size: 256 | lm loss: 3.679213E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.752 | TFLOPs: 25.24 | +7: iteration 101140/ 173500 | consumed samples: 25891840 | consumed tokens: 53026488320 | elapsed time per iteration (s): 0.16 | learning rate: 8.797E-05 | global batch size: 256 | lm loss: 3.704519E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.034 | TFLOPs: 24.34 | +7: iteration 101150/ 173500 | consumed samples: 25894400 | consumed tokens: 53031731200 | elapsed time per iteration (s): 0.16 | learning rate: 8.795E-05 | global batch size: 256 | lm loss: 3.695240E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.132 | TFLOPs: 24.86 | +7: iteration 101160/ 173500 | consumed samples: 25896960 | consumed tokens: 53036974080 | elapsed time per iteration (s): 0.16 | learning rate: 8.794E-05 | global batch size: 256 | lm loss: 3.688868E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.286 | TFLOPs: 24.97 | +7: iteration 101170/ 173500 | consumed samples: 25899520 | consumed tokens: 53042216960 | elapsed time per iteration (s): 0.16 | learning rate: 8.792E-05 | global batch size: 256 | lm loss: 3.690372E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.912 | TFLOPs: 24.64 | +7: iteration 101180/ 173500 | consumed samples: 25902080 | consumed tokens: 53047459840 | elapsed time per iteration (s): 0.16 | learning rate: 8.790E-05 | global batch size: 256 | lm loss: 3.693133E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.011 | TFLOPs: 25.37 | +7: iteration 101190/ 173500 | consumed samples: 25904640 | consumed tokens: 53052702720 | elapsed time per iteration (s): 0.17 | learning rate: 8.789E-05 | global batch size: 256 | lm loss: 3.693017E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.855 | TFLOPs: 24.04 | +7: iteration 101200/ 173500 | consumed samples: 25907200 | consumed tokens: 53057945600 | elapsed time per iteration (s): 0.16 | learning rate: 8.787E-05 | global batch size: 256 | lm loss: 3.687846E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.003 | TFLOPs: 25.81 | +7: iteration 101210/ 173500 | consumed samples: 25909760 | consumed tokens: 53063188480 | elapsed time per iteration (s): 0.16 | learning rate: 8.786E-05 | global batch size: 256 | lm loss: 3.700613E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.919 | TFLOPs: 25.01 | +7: iteration 101220/ 173500 | consumed samples: 25912320 | consumed tokens: 53068431360 | elapsed time per iteration (s): 0.16 | learning rate: 8.784E-05 | global batch size: 256 | lm loss: 3.701566E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.821 | TFLOPs: 24.73 | +7: iteration 101230/ 173500 | consumed samples: 25914880 | consumed tokens: 53073674240 | elapsed time per iteration (s): 0.16 | learning rate: 8.782E-05 | global batch size: 256 | lm loss: 3.697038E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.792 | TFLOPs: 25.10 | +7: iteration 101240/ 173500 | consumed samples: 25917440 | consumed tokens: 53078917120 | elapsed time per iteration (s): 0.17 | learning rate: 8.781E-05 | global batch size: 256 | lm loss: 3.687814E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.376 | TFLOPs: 23.94 | +7: iteration 101250/ 173500 | consumed samples: 25920000 | consumed tokens: 53084160000 | elapsed time per iteration (s): 0.17 | learning rate: 8.779E-05 | global batch size: 256 | lm loss: 3.703432E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.021 | TFLOPs: 24.07 | +7: iteration 101260/ 173500 | consumed samples: 25922560 | consumed tokens: 53089402880 | elapsed time per iteration (s): 0.16 | learning rate: 8.778E-05 | global batch size: 256 | lm loss: 3.690498E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.626 | TFLOPs: 24.44 | +7: iteration 101270/ 173500 | consumed samples: 25925120 | consumed tokens: 53094645760 | elapsed time per iteration (s): 0.17 | learning rate: 8.776E-05 | global batch size: 256 | lm loss: 3.675023E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.316 | TFLOPs: 24.28 | +7: iteration 101280/ 173500 | consumed samples: 25927680 | consumed tokens: 53099888640 | elapsed time per iteration (s): 0.16 | learning rate: 8.774E-05 | global batch size: 256 | lm loss: 3.686346E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.347 | TFLOPs: 24.63 | +7: iteration 101290/ 173500 | consumed samples: 25930240 | consumed tokens: 53105131520 | elapsed time per iteration (s): 0.16 | learning rate: 8.773E-05 | global batch size: 256 | lm loss: 3.697459E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.228 | TFLOPs: 25.17 | +7: iteration 101300/ 173500 | consumed samples: 25932800 | consumed tokens: 53110374400 | elapsed time per iteration (s): 0.16 | learning rate: 8.771E-05 | global batch size: 256 | lm loss: 3.694096E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.262 | TFLOPs: 24.47 | +7: iteration 101310/ 173500 | consumed samples: 25935360 | consumed tokens: 53115617280 | elapsed time per iteration (s): 0.16 | learning rate: 8.770E-05 | global batch size: 256 | lm loss: 3.685535E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.930 | TFLOPs: 24.73 | +7: iteration 101320/ 173500 | consumed samples: 25937920 | consumed tokens: 53120860160 | elapsed time per iteration (s): 0.18 | learning rate: 8.768E-05 | global batch size: 256 | lm loss: 3.697848E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.126 | TFLOPs: 22.13 | +7: iteration 101330/ 173500 | consumed samples: 25940480 | consumed tokens: 53126103040 | elapsed time per iteration (s): 0.16 | learning rate: 8.766E-05 | global batch size: 256 | lm loss: 3.694024E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.027 | TFLOPs: 25.23 | +7: iteration 101340/ 173500 | consumed samples: 25943040 | consumed tokens: 53131345920 | elapsed time per iteration (s): 0.16 | learning rate: 8.765E-05 | global batch size: 256 | lm loss: 3.670647E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.378 | TFLOPs: 25.79 | +7: iteration 101350/ 173500 | consumed samples: 25945600 | consumed tokens: 53136588800 | elapsed time per iteration (s): 0.16 | learning rate: 8.763E-05 | global batch size: 256 | lm loss: 3.689267E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.039 | TFLOPs: 25.77 | +7: iteration 101360/ 173500 | consumed samples: 25948160 | consumed tokens: 53141831680 | elapsed time per iteration (s): 0.16 | learning rate: 8.762E-05 | global batch size: 256 | lm loss: 3.689196E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.737 | TFLOPs: 25.64 | +7: iteration 101370/ 173500 | consumed samples: 25950720 | consumed tokens: 53147074560 | elapsed time per iteration (s): 0.15 | learning rate: 8.760E-05 | global batch size: 256 | lm loss: 3.699004E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.330 | TFLOPs: 26.09 | +7: iteration 101380/ 173500 | consumed samples: 25953280 | consumed tokens: 53152317440 | elapsed time per iteration (s): 0.16 | learning rate: 8.758E-05 | global batch size: 256 | lm loss: 3.689777E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.483 | TFLOPs: 24.60 | +7: iteration 101390/ 173500 | consumed samples: 25955840 | consumed tokens: 53157560320 | elapsed time per iteration (s): 0.16 | learning rate: 8.757E-05 | global batch size: 256 | lm loss: 3.699752E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.361 | TFLOPs: 25.14 | +7: iteration 101400/ 173500 | consumed samples: 25958400 | consumed tokens: 53162803200 | elapsed time per iteration (s): 0.16 | learning rate: 8.755E-05 | global batch size: 256 | lm loss: 3.691599E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.186 | TFLOPs: 25.46 | +7: iteration 101410/ 173500 | consumed samples: 25960960 | consumed tokens: 53168046080 | elapsed time per iteration (s): 0.16 | learning rate: 8.754E-05 | global batch size: 256 | lm loss: 3.686507E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.699 | TFLOPs: 25.84 | +7: iteration 101420/ 173500 | consumed samples: 25963520 | consumed tokens: 53173288960 | elapsed time per iteration (s): 0.17 | learning rate: 8.752E-05 | global batch size: 256 | lm loss: 3.685452E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.968 | TFLOPs: 23.77 | +7: iteration 101430/ 173500 | consumed samples: 25966080 | consumed tokens: 53178531840 | elapsed time per iteration (s): 0.16 | learning rate: 8.750E-05 | global batch size: 256 | lm loss: 3.683823E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.012 | TFLOPs: 25.28 | +7: iteration 101440/ 173500 | consumed samples: 25968640 | consumed tokens: 53183774720 | elapsed time per iteration (s): 0.17 | learning rate: 8.749E-05 | global batch size: 256 | lm loss: 3.686335E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.519 | TFLOPs: 23.80 | +7: iteration 101450/ 173500 | consumed samples: 25971200 | consumed tokens: 53189017600 | elapsed time per iteration (s): 0.16 | learning rate: 8.747E-05 | global batch size: 256 | lm loss: 3.697300E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.623 | TFLOPs: 24.84 | +7: iteration 101460/ 173500 | consumed samples: 25973760 | consumed tokens: 53194260480 | elapsed time per iteration (s): 0.16 | learning rate: 8.746E-05 | global batch size: 256 | lm loss: 3.693851E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.431 | TFLOPs: 25.55 | +7: iteration 101470/ 173500 | consumed samples: 25976320 | consumed tokens: 53199503360 | elapsed time per iteration (s): 0.16 | learning rate: 8.744E-05 | global batch size: 256 | lm loss: 3.708421E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.105 | TFLOPs: 24.83 | +7: iteration 101480/ 173500 | consumed samples: 25978880 | consumed tokens: 53204746240 | elapsed time per iteration (s): 0.17 | learning rate: 8.743E-05 | global batch size: 256 | lm loss: 3.704176E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1497.957 | TFLOPs: 23.49 | +7: iteration 101490/ 173500 | consumed samples: 25981440 | consumed tokens: 53209989120 | elapsed time per iteration (s): 0.16 | learning rate: 8.741E-05 | global batch size: 256 | lm loss: 3.690947E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.566 | TFLOPs: 25.01 | +7: iteration 101500/ 173500 | consumed samples: 25984000 | consumed tokens: 53215232000 | elapsed time per iteration (s): 0.16 | learning rate: 8.739E-05 | global batch size: 256 | lm loss: 3.692568E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.485 | TFLOPs: 25.54 | +7: iteration 101510/ 173500 | consumed samples: 25986560 | consumed tokens: 53220474880 | elapsed time per iteration (s): 0.16 | learning rate: 8.738E-05 | global batch size: 256 | lm loss: 3.686906E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.499 | TFLOPs: 24.90 | +7: iteration 101520/ 173500 | consumed samples: 25989120 | consumed tokens: 53225717760 | elapsed time per iteration (s): 0.16 | learning rate: 8.736E-05 | global batch size: 256 | lm loss: 3.691985E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.927 | TFLOPs: 24.89 | +7: iteration 101530/ 173500 | consumed samples: 25991680 | consumed tokens: 53230960640 | elapsed time per iteration (s): 0.16 | learning rate: 8.735E-05 | global batch size: 256 | lm loss: 3.684223E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.039 | TFLOPs: 24.97 | +7: iteration 101540/ 173500 | consumed samples: 25994240 | consumed tokens: 53236203520 | elapsed time per iteration (s): 0.16 | learning rate: 8.733E-05 | global batch size: 256 | lm loss: 3.696919E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.402 | TFLOPs: 25.02 | +7: iteration 101550/ 173500 | consumed samples: 25996800 | consumed tokens: 53241446400 | elapsed time per iteration (s): 0.16 | learning rate: 8.731E-05 | global batch size: 256 | lm loss: 3.682879E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.064 | TFLOPs: 24.72 | +7: iteration 101560/ 173500 | consumed samples: 25999360 | consumed tokens: 53246689280 | elapsed time per iteration (s): 0.17 | learning rate: 8.730E-05 | global batch size: 256 | lm loss: 3.697359E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.619 | TFLOPs: 24.30 | +7: iteration 101570/ 173500 | consumed samples: 26001920 | consumed tokens: 53251932160 | elapsed time per iteration (s): 0.17 | learning rate: 8.728E-05 | global batch size: 256 | lm loss: 3.696296E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.687 | TFLOPs: 24.26 | +7: iteration 101580/ 173500 | consumed samples: 26004480 | consumed tokens: 53257175040 | elapsed time per iteration (s): 0.16 | learning rate: 8.727E-05 | global batch size: 256 | lm loss: 3.693090E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.377 | TFLOPs: 24.96 | +7: iteration 101590/ 173500 | consumed samples: 26007040 | consumed tokens: 53262417920 | elapsed time per iteration (s): 0.16 | learning rate: 8.725E-05 | global batch size: 256 | lm loss: 3.686048E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.686 | TFLOPs: 24.79 | +7: iteration 101600/ 173500 | consumed samples: 26009600 | consumed tokens: 53267660800 | elapsed time per iteration (s): 0.16 | learning rate: 8.723E-05 | global batch size: 256 | lm loss: 3.692301E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.804 | TFLOPs: 25.68 | +7: iteration 101610/ 173500 | consumed samples: 26012160 | consumed tokens: 53272903680 | elapsed time per iteration (s): 0.16 | learning rate: 8.722E-05 | global batch size: 256 | lm loss: 3.689120E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.879 | TFLOPs: 24.45 | +7: iteration 101620/ 173500 | consumed samples: 26014720 | consumed tokens: 53278146560 | elapsed time per iteration (s): 0.16 | learning rate: 8.720E-05 | global batch size: 256 | lm loss: 3.682255E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.897 | TFLOPs: 25.48 | +7: iteration 101630/ 173500 | consumed samples: 26017280 | consumed tokens: 53283389440 | elapsed time per iteration (s): 0.16 | learning rate: 8.719E-05 | global batch size: 256 | lm loss: 3.691716E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.214 | TFLOPs: 25.75 | +7: iteration 101640/ 173500 | consumed samples: 26019840 | consumed tokens: 53288632320 | elapsed time per iteration (s): 0.17 | learning rate: 8.717E-05 | global batch size: 256 | lm loss: 3.680365E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.221 | TFLOPs: 24.12 | +7: iteration 101650/ 173500 | consumed samples: 26022400 | consumed tokens: 53293875200 | elapsed time per iteration (s): 0.16 | learning rate: 8.715E-05 | global batch size: 256 | lm loss: 3.695110E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.721 | TFLOPs: 24.46 | +7: iteration 101660/ 173500 | consumed samples: 26024960 | consumed tokens: 53299118080 | elapsed time per iteration (s): 0.16 | learning rate: 8.714E-05 | global batch size: 256 | lm loss: 3.692545E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.928 | TFLOPs: 24.43 | +7: iteration 101670/ 173500 | consumed samples: 26027520 | consumed tokens: 53304360960 | elapsed time per iteration (s): 0.16 | learning rate: 8.712E-05 | global batch size: 256 | lm loss: 3.683776E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.323 | TFLOPs: 25.19 | +7: iteration 101680/ 173500 | consumed samples: 26030080 | consumed tokens: 53309603840 | elapsed time per iteration (s): 0.16 | learning rate: 8.711E-05 | global batch size: 256 | lm loss: 3.690076E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.550 | TFLOPs: 25.40 | +7: iteration 101690/ 173500 | consumed samples: 26032640 | consumed tokens: 53314846720 | elapsed time per iteration (s): 0.16 | learning rate: 8.709E-05 | global batch size: 256 | lm loss: 3.701099E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.454 | TFLOPs: 25.55 | +7: iteration 101700/ 173500 | consumed samples: 26035200 | consumed tokens: 53320089600 | elapsed time per iteration (s): 0.16 | learning rate: 8.707E-05 | global batch size: 256 | lm loss: 3.686327E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.110 | TFLOPs: 25.50 | +7: iteration 101710/ 173500 | consumed samples: 26037760 | consumed tokens: 53325332480 | elapsed time per iteration (s): 0.16 | learning rate: 8.706E-05 | global batch size: 256 | lm loss: 3.690173E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.718 | TFLOPs: 24.66 | +7: iteration 101720/ 173500 | consumed samples: 26040320 | consumed tokens: 53330575360 | elapsed time per iteration (s): 0.16 | learning rate: 8.704E-05 | global batch size: 256 | lm loss: 3.682536E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.578 | TFLOPs: 25.32 | +7: iteration 101730/ 173500 | consumed samples: 26042880 | consumed tokens: 53335818240 | elapsed time per iteration (s): 0.16 | learning rate: 8.703E-05 | global batch size: 256 | lm loss: 3.692810E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.384 | TFLOPs: 24.89 | +7: iteration 101740/ 173500 | consumed samples: 26045440 | consumed tokens: 53341061120 | elapsed time per iteration (s): 0.17 | learning rate: 8.701E-05 | global batch size: 256 | lm loss: 3.685708E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.987 | TFLOPs: 24.17 | +7: iteration 101750/ 173500 | consumed samples: 26048000 | consumed tokens: 53346304000 | elapsed time per iteration (s): 0.16 | learning rate: 8.700E-05 | global batch size: 256 | lm loss: 3.686630E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.863 | TFLOPs: 25.18 | +7: iteration 101760/ 173500 | consumed samples: 26050560 | consumed tokens: 53351546880 | elapsed time per iteration (s): 0.16 | learning rate: 8.698E-05 | global batch size: 256 | lm loss: 3.692803E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.706 | TFLOPs: 24.66 | +7: iteration 101770/ 173500 | consumed samples: 26053120 | consumed tokens: 53356789760 | elapsed time per iteration (s): 0.16 | learning rate: 8.696E-05 | global batch size: 256 | lm loss: 3.696592E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.535 | TFLOPs: 25.32 | +7: iteration 101780/ 173500 | consumed samples: 26055680 | consumed tokens: 53362032640 | elapsed time per iteration (s): 0.16 | learning rate: 8.695E-05 | global batch size: 256 | lm loss: 3.693614E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.702 | TFLOPs: 25.26 | +7: iteration 101790/ 173500 | consumed samples: 26058240 | consumed tokens: 53367275520 | elapsed time per iteration (s): 0.16 | learning rate: 8.693E-05 | global batch size: 256 | lm loss: 3.698080E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.765 | TFLOPs: 25.54 | +7: iteration 101800/ 173500 | consumed samples: 26060800 | consumed tokens: 53372518400 | elapsed time per iteration (s): 0.16 | learning rate: 8.692E-05 | global batch size: 256 | lm loss: 3.691799E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.003 | TFLOPs: 25.50 | +7: iteration 101810/ 173500 | consumed samples: 26063360 | consumed tokens: 53377761280 | elapsed time per iteration (s): 0.16 | learning rate: 8.690E-05 | global batch size: 256 | lm loss: 3.693151E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.435 | TFLOPs: 25.29 | +7: iteration 101820/ 173500 | consumed samples: 26065920 | consumed tokens: 53383004160 | elapsed time per iteration (s): 0.16 | learning rate: 8.688E-05 | global batch size: 256 | lm loss: 3.693494E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.255 | TFLOPs: 25.02 | +7: iteration 101830/ 173500 | consumed samples: 26068480 | consumed tokens: 53388247040 | elapsed time per iteration (s): 0.16 | learning rate: 8.687E-05 | global batch size: 256 | lm loss: 3.686279E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.529 | TFLOPs: 25.23 | +7: iteration 101840/ 173500 | consumed samples: 26071040 | consumed tokens: 53393489920 | elapsed time per iteration (s): 0.16 | learning rate: 8.685E-05 | global batch size: 256 | lm loss: 3.683257E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.861 | TFLOPs: 24.60 | +7: iteration 101850/ 173500 | consumed samples: 26073600 | consumed tokens: 53398732800 | elapsed time per iteration (s): 0.16 | learning rate: 8.684E-05 | global batch size: 256 | lm loss: 3.687279E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.667 | TFLOPs: 24.99 | +7: iteration 101860/ 173500 | consumed samples: 26076160 | consumed tokens: 53403975680 | elapsed time per iteration (s): 0.16 | learning rate: 8.682E-05 | global batch size: 256 | lm loss: 3.696306E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.718 | TFLOPs: 25.28 | +7: iteration 101870/ 173500 | consumed samples: 26078720 | consumed tokens: 53409218560 | elapsed time per iteration (s): 0.16 | learning rate: 8.680E-05 | global batch size: 256 | lm loss: 3.699440E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.466 | TFLOPs: 25.57 | +7: iteration 101880/ 173500 | consumed samples: 26081280 | consumed tokens: 53414461440 | elapsed time per iteration (s): 0.16 | learning rate: 8.679E-05 | global batch size: 256 | lm loss: 3.685173E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.170 | TFLOPs: 24.81 | +7: iteration 101890/ 173500 | consumed samples: 26083840 | consumed tokens: 53419704320 | elapsed time per iteration (s): 0.16 | learning rate: 8.677E-05 | global batch size: 256 | lm loss: 3.696265E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.845 | TFLOPs: 24.81 | +7: iteration 101900/ 173500 | consumed samples: 26086400 | consumed tokens: 53424947200 | elapsed time per iteration (s): 0.16 | learning rate: 8.676E-05 | global batch size: 256 | lm loss: 3.688768E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.794 | TFLOPs: 25.61 | +7: iteration 101910/ 173500 | consumed samples: 26088960 | consumed tokens: 53430190080 | elapsed time per iteration (s): 0.16 | learning rate: 8.674E-05 | global batch size: 256 | lm loss: 3.691864E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.444 | TFLOPs: 25.41 | +7: iteration 101920/ 173500 | consumed samples: 26091520 | consumed tokens: 53435432960 | elapsed time per iteration (s): 0.16 | learning rate: 8.672E-05 | global batch size: 256 | lm loss: 3.688010E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.873 | TFLOPs: 25.00 | +7: iteration 101930/ 173500 | consumed samples: 26094080 | consumed tokens: 53440675840 | elapsed time per iteration (s): 0.16 | learning rate: 8.671E-05 | global batch size: 256 | lm loss: 3.695024E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.642 | TFLOPs: 25.29 | +7: iteration 101940/ 173500 | consumed samples: 26096640 | consumed tokens: 53445918720 | elapsed time per iteration (s): 0.17 | learning rate: 8.669E-05 | global batch size: 256 | lm loss: 3.702315E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.836 | TFLOPs: 24.20 | +7: iteration 101950/ 173500 | consumed samples: 26099200 | consumed tokens: 53451161600 | elapsed time per iteration (s): 0.16 | learning rate: 8.668E-05 | global batch size: 256 | lm loss: 3.701891E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.322 | TFLOPs: 24.89 | +7: iteration 101960/ 173500 | consumed samples: 26101760 | consumed tokens: 53456404480 | elapsed time per iteration (s): 0.16 | learning rate: 8.666E-05 | global batch size: 256 | lm loss: 3.710125E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.704 | TFLOPs: 24.81 | +7: iteration 101970/ 173500 | consumed samples: 26104320 | consumed tokens: 53461647360 | elapsed time per iteration (s): 0.16 | learning rate: 8.665E-05 | global batch size: 256 | lm loss: 3.701653E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.559 | TFLOPs: 24.72 | +7: iteration 101980/ 173500 | consumed samples: 26106880 | consumed tokens: 53466890240 | elapsed time per iteration (s): 0.16 | learning rate: 8.663E-05 | global batch size: 256 | lm loss: 3.695518E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.434 | TFLOPs: 24.61 | +7: iteration 101990/ 173500 | consumed samples: 26109440 | consumed tokens: 53472133120 | elapsed time per iteration (s): 0.16 | learning rate: 8.661E-05 | global batch size: 256 | lm loss: 3.679590E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.900 | TFLOPs: 25.14 | +0: [2023-03-17 04:40:45,584] [INFO] [logging.py:68:log_dist] [Rank 0] step=102000, skipped=0, lr=[8.659751165175261e-05, 8.659751165175261e-05, 8.659751165175261e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 102000/ 173500 | consumed samples: 26112000 | consumed tokens: 53477376000 | elapsed time per iteration (s): 0.15 | learning rate: 8.660E-05 | global batch size: 256 | lm loss: 3.689900E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.249 | TFLOPs: 26.10 | +0: steps: 102000 loss: 3.6905 iter time (s): 0.160 samples/sec: 1601.475 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 102000 | lm loss value: 3.843246E+00 | lm loss PPL: 4.667673E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 102000 to checkpoints_44m91b100m +0: [2023-03-17 04:40:45,659] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step102000 is begin to save! +0: [2023-03-17 04:40:45,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:40:45,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:40:45,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:40:45,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:40:45,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:40:45,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:40:45,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:40:45,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:40:45,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:40:45,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:40:45,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:40:45,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:40:45,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:40:45,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:40:45,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:40:45,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:40:45,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:40:45,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:40:45,795] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:40:45,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:40:45,797] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step102000/mp_rank_00_model_states.pt +0: [2023-03-17 04:40:45,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:40:45,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:40:45,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:40:45,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:40:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:40:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +4: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +2: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +5: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +3: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +7: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +6: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step102000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:40:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +1: [2023-03-17 04:40:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step102000 is ready now! +0: successfully saved checkpoint at iteration 102000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.02 +7: iteration 102010/ 173500 | consumed samples: 26114560 | consumed tokens: 53482618880 | elapsed time per iteration (s): 0.19 | learning rate: 8.658E-05 | global batch size: 256 | lm loss: 3.685949E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.994 | TFLOPs: 21.52 | +7: iteration 102020/ 173500 | consumed samples: 26117120 | consumed tokens: 53487861760 | elapsed time per iteration (s): 0.16 | learning rate: 8.657E-05 | global batch size: 256 | lm loss: 3.699379E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.077 | TFLOPs: 24.80 | +7: iteration 102030/ 173500 | consumed samples: 26119680 | consumed tokens: 53493104640 | elapsed time per iteration (s): 0.16 | learning rate: 8.655E-05 | global batch size: 256 | lm loss: 3.688873E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.549 | TFLOPs: 25.08 | +7: iteration 102040/ 173500 | consumed samples: 26122240 | consumed tokens: 53498347520 | elapsed time per iteration (s): 0.17 | learning rate: 8.653E-05 | global batch size: 256 | lm loss: 3.684020E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.811 | TFLOPs: 23.90 | +7: iteration 102050/ 173500 | consumed samples: 26124800 | consumed tokens: 53503590400 | elapsed time per iteration (s): 0.16 | learning rate: 8.652E-05 | global batch size: 256 | lm loss: 3.696553E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.061 | TFLOPs: 24.76 | +7: iteration 102060/ 173500 | consumed samples: 26127360 | consumed tokens: 53508833280 | elapsed time per iteration (s): 0.16 | learning rate: 8.650E-05 | global batch size: 256 | lm loss: 3.689884E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.474 | TFLOPs: 25.88 | +7: iteration 102070/ 173500 | consumed samples: 26129920 | consumed tokens: 53514076160 | elapsed time per iteration (s): 0.16 | learning rate: 8.649E-05 | global batch size: 256 | lm loss: 3.685418E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.078 | TFLOPs: 24.50 | +7: iteration 102080/ 173500 | consumed samples: 26132480 | consumed tokens: 53519319040 | elapsed time per iteration (s): 0.15 | learning rate: 8.647E-05 | global batch size: 256 | lm loss: 3.694011E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.685 | TFLOPs: 26.26 | +7: iteration 102090/ 173500 | consumed samples: 26135040 | consumed tokens: 53524561920 | elapsed time per iteration (s): 0.16 | learning rate: 8.645E-05 | global batch size: 256 | lm loss: 3.706762E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.426 | TFLOPs: 25.49 | +7: iteration 102100/ 173500 | consumed samples: 26137600 | consumed tokens: 53529804800 | elapsed time per iteration (s): 0.16 | learning rate: 8.644E-05 | global batch size: 256 | lm loss: 3.689775E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.096 | TFLOPs: 24.54 | +7: iteration 102110/ 173500 | consumed samples: 26140160 | consumed tokens: 53535047680 | elapsed time per iteration (s): 0.16 | learning rate: 8.642E-05 | global batch size: 256 | lm loss: 3.686522E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.628 | TFLOPs: 24.91 | +7: iteration 102120/ 173500 | consumed samples: 26142720 | consumed tokens: 53540290560 | elapsed time per iteration (s): 0.16 | learning rate: 8.641E-05 | global batch size: 256 | lm loss: 3.694115E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.232 | TFLOPs: 24.63 | +7: iteration 102130/ 173500 | consumed samples: 26145280 | consumed tokens: 53545533440 | elapsed time per iteration (s): 0.16 | learning rate: 8.639E-05 | global batch size: 256 | lm loss: 3.704712E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.582 | TFLOPs: 25.04 | +7: iteration 102140/ 173500 | consumed samples: 26147840 | consumed tokens: 53550776320 | elapsed time per iteration (s): 0.16 | learning rate: 8.638E-05 | global batch size: 256 | lm loss: 3.686416E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.921 | TFLOPs: 25.31 | +7: iteration 102150/ 173500 | consumed samples: 26150400 | consumed tokens: 53556019200 | elapsed time per iteration (s): 0.16 | learning rate: 8.636E-05 | global batch size: 256 | lm loss: 3.687812E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.184 | TFLOPs: 24.81 | +7: iteration 102160/ 173500 | consumed samples: 26152960 | consumed tokens: 53561262080 | elapsed time per iteration (s): 0.16 | learning rate: 8.634E-05 | global batch size: 256 | lm loss: 3.693670E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.749 | TFLOPs: 25.86 | +7: iteration 102170/ 173500 | consumed samples: 26155520 | consumed tokens: 53566504960 | elapsed time per iteration (s): 0.16 | learning rate: 8.633E-05 | global batch size: 256 | lm loss: 3.695243E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.873 | TFLOPs: 25.62 | +7: iteration 102180/ 173500 | consumed samples: 26158080 | consumed tokens: 53571747840 | elapsed time per iteration (s): 0.16 | learning rate: 8.631E-05 | global batch size: 256 | lm loss: 3.686357E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.573 | TFLOPs: 24.98 | +7: iteration 102190/ 173500 | consumed samples: 26160640 | consumed tokens: 53576990720 | elapsed time per iteration (s): 0.16 | learning rate: 8.630E-05 | global batch size: 256 | lm loss: 3.695074E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.246 | TFLOPs: 25.16 | +7: iteration 102200/ 173500 | consumed samples: 26163200 | consumed tokens: 53582233600 | elapsed time per iteration (s): 0.16 | learning rate: 8.628E-05 | global batch size: 256 | lm loss: 3.691268E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.770 | TFLOPs: 25.28 | +7: iteration 102210/ 173500 | consumed samples: 26165760 | consumed tokens: 53587476480 | elapsed time per iteration (s): 0.16 | learning rate: 8.626E-05 | global batch size: 256 | lm loss: 3.700446E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.460 | TFLOPs: 24.94 | +7: iteration 102220/ 173500 | consumed samples: 26168320 | consumed tokens: 53592719360 | elapsed time per iteration (s): 0.16 | learning rate: 8.625E-05 | global batch size: 256 | lm loss: 3.708804E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.360 | TFLOPs: 25.27 | +7: iteration 102230/ 173500 | consumed samples: 26170880 | consumed tokens: 53597962240 | elapsed time per iteration (s): 0.17 | learning rate: 8.623E-05 | global batch size: 256 | lm loss: 3.680950E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.147 | TFLOPs: 23.98 | +7: iteration 102240/ 173500 | consumed samples: 26173440 | consumed tokens: 53603205120 | elapsed time per iteration (s): 0.15 | learning rate: 8.622E-05 | global batch size: 256 | lm loss: 3.687102E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.598 | TFLOPs: 26.12 | +7: iteration 102250/ 173500 | consumed samples: 26176000 | consumed tokens: 53608448000 | elapsed time per iteration (s): 0.16 | learning rate: 8.620E-05 | global batch size: 256 | lm loss: 3.686528E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.549 | TFLOPs: 24.74 | +7: iteration 102260/ 173500 | consumed samples: 26178560 | consumed tokens: 53613690880 | elapsed time per iteration (s): 0.16 | learning rate: 8.618E-05 | global batch size: 256 | lm loss: 3.698039E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.015 | TFLOPs: 25.22 | +7: iteration 102270/ 173500 | consumed samples: 26181120 | consumed tokens: 53618933760 | elapsed time per iteration (s): 0.16 | learning rate: 8.617E-05 | global batch size: 256 | lm loss: 3.672997E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.109 | TFLOPs: 24.94 | +7: iteration 102280/ 173500 | consumed samples: 26183680 | consumed tokens: 53624176640 | elapsed time per iteration (s): 0.15 | learning rate: 8.615E-05 | global batch size: 256 | lm loss: 3.688800E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.049 | TFLOPs: 25.91 | +7: iteration 102290/ 173500 | consumed samples: 26186240 | consumed tokens: 53629419520 | elapsed time per iteration (s): 0.16 | learning rate: 8.614E-05 | global batch size: 256 | lm loss: 3.672507E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.595 | TFLOPs: 24.74 | +7: iteration 102300/ 173500 | consumed samples: 26188800 | consumed tokens: 53634662400 | elapsed time per iteration (s): 0.15 | learning rate: 8.612E-05 | global batch size: 256 | lm loss: 3.691187E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.107 | TFLOPs: 26.24 | +7: iteration 102310/ 173500 | consumed samples: 26191360 | consumed tokens: 53639905280 | elapsed time per iteration (s): 0.16 | learning rate: 8.611E-05 | global batch size: 256 | lm loss: 3.685944E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.229 | TFLOPs: 25.02 | +7: iteration 102320/ 173500 | consumed samples: 26193920 | consumed tokens: 53645148160 | elapsed time per iteration (s): 0.16 | learning rate: 8.609E-05 | global batch size: 256 | lm loss: 3.689146E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.374 | TFLOPs: 25.14 | +7: iteration 102330/ 173500 | consumed samples: 26196480 | consumed tokens: 53650391040 | elapsed time per iteration (s): 0.16 | learning rate: 8.607E-05 | global batch size: 256 | lm loss: 3.680184E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.805 | TFLOPs: 25.32 | +7: iteration 102340/ 173500 | consumed samples: 26199040 | consumed tokens: 53655633920 | elapsed time per iteration (s): 0.16 | learning rate: 8.606E-05 | global batch size: 256 | lm loss: 3.707817E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.904 | TFLOPs: 25.07 | +7: iteration 102350/ 173500 | consumed samples: 26201600 | consumed tokens: 53660876800 | elapsed time per iteration (s): 0.16 | learning rate: 8.604E-05 | global batch size: 256 | lm loss: 3.693702E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.940 | TFLOPs: 25.88 | +7: iteration 102360/ 173500 | consumed samples: 26204160 | consumed tokens: 53666119680 | elapsed time per iteration (s): 0.16 | learning rate: 8.603E-05 | global batch size: 256 | lm loss: 3.698634E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.374 | TFLOPs: 25.47 | +7: iteration 102370/ 173500 | consumed samples: 26206720 | consumed tokens: 53671362560 | elapsed time per iteration (s): 0.15 | learning rate: 8.601E-05 | global batch size: 256 | lm loss: 3.686958E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.977 | TFLOPs: 25.91 | +7: iteration 102380/ 173500 | consumed samples: 26209280 | consumed tokens: 53676605440 | elapsed time per iteration (s): 0.16 | learning rate: 8.599E-05 | global batch size: 256 | lm loss: 3.686465E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.226 | TFLOPs: 25.53 | +7: iteration 102390/ 173500 | consumed samples: 26211840 | consumed tokens: 53681848320 | elapsed time per iteration (s): 0.16 | learning rate: 8.598E-05 | global batch size: 256 | lm loss: 3.693878E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.444 | TFLOPs: 25.27 | +7: iteration 102400/ 173500 | consumed samples: 26214400 | consumed tokens: 53687091200 | elapsed time per iteration (s): 0.16 | learning rate: 8.596E-05 | global batch size: 256 | lm loss: 3.697435E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.187 | TFLOPs: 25.78 | +7: iteration 102410/ 173500 | consumed samples: 26216960 | consumed tokens: 53692334080 | elapsed time per iteration (s): 0.16 | learning rate: 8.595E-05 | global batch size: 256 | lm loss: 3.696525E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.877 | TFLOPs: 24.49 | +7: iteration 102420/ 173500 | consumed samples: 26219520 | consumed tokens: 53697576960 | elapsed time per iteration (s): 0.16 | learning rate: 8.593E-05 | global batch size: 256 | lm loss: 3.690873E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.622 | TFLOPs: 24.38 | +7: iteration 102430/ 173500 | consumed samples: 26222080 | consumed tokens: 53702819840 | elapsed time per iteration (s): 0.16 | learning rate: 8.591E-05 | global batch size: 256 | lm loss: 3.686184E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.867 | TFLOPs: 24.73 | +7: iteration 102440/ 173500 | consumed samples: 26224640 | consumed tokens: 53708062720 | elapsed time per iteration (s): 0.17 | learning rate: 8.590E-05 | global batch size: 256 | lm loss: 3.691910E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.991 | TFLOPs: 24.14 | +7: iteration 102450/ 173500 | consumed samples: 26227200 | consumed tokens: 53713305600 | elapsed time per iteration (s): 0.16 | learning rate: 8.588E-05 | global batch size: 256 | lm loss: 3.694119E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.081 | TFLOPs: 25.08 | +7: iteration 102460/ 173500 | consumed samples: 26229760 | consumed tokens: 53718548480 | elapsed time per iteration (s): 0.16 | learning rate: 8.587E-05 | global batch size: 256 | lm loss: 3.697251E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.409 | TFLOPs: 25.11 | +7: iteration 102470/ 173500 | consumed samples: 26232320 | consumed tokens: 53723791360 | elapsed time per iteration (s): 0.17 | learning rate: 8.585E-05 | global batch size: 256 | lm loss: 3.693984E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.275 | TFLOPs: 23.89 | +7: iteration 102480/ 173500 | consumed samples: 26234880 | consumed tokens: 53729034240 | elapsed time per iteration (s): 0.16 | learning rate: 8.584E-05 | global batch size: 256 | lm loss: 3.691064E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.277 | TFLOPs: 25.13 | +7: iteration 102490/ 173500 | consumed samples: 26237440 | consumed tokens: 53734277120 | elapsed time per iteration (s): 0.16 | learning rate: 8.582E-05 | global batch size: 256 | lm loss: 3.685905E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.558 | TFLOPs: 25.37 | +7: iteration 102500/ 173500 | consumed samples: 26240000 | consumed tokens: 53739520000 | elapsed time per iteration (s): 0.16 | learning rate: 8.580E-05 | global batch size: 256 | lm loss: 3.683144E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.788 | TFLOPs: 25.10 | +7: iteration 102510/ 173500 | consumed samples: 26242560 | consumed tokens: 53744762880 | elapsed time per iteration (s): 0.16 | learning rate: 8.579E-05 | global batch size: 256 | lm loss: 3.681677E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.703 | TFLOPs: 25.50 | +7: iteration 102520/ 173500 | consumed samples: 26245120 | consumed tokens: 53750005760 | elapsed time per iteration (s): 0.16 | learning rate: 8.577E-05 | global batch size: 256 | lm loss: 3.693364E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.832 | TFLOPs: 25.80 | +7: iteration 102530/ 173500 | consumed samples: 26247680 | consumed tokens: 53755248640 | elapsed time per iteration (s): 0.16 | learning rate: 8.576E-05 | global batch size: 256 | lm loss: 3.689096E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.334 | TFLOPs: 24.36 | +7: iteration 102540/ 173500 | consumed samples: 26250240 | consumed tokens: 53760491520 | elapsed time per iteration (s): 0.16 | learning rate: 8.574E-05 | global batch size: 256 | lm loss: 3.691433E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.340 | TFLOPs: 25.13 | +7: iteration 102550/ 173500 | consumed samples: 26252800 | consumed tokens: 53765734400 | elapsed time per iteration (s): 0.16 | learning rate: 8.572E-05 | global batch size: 256 | lm loss: 3.690042E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.704 | TFLOPs: 25.42 | +7: iteration 102560/ 173500 | consumed samples: 26255360 | consumed tokens: 53770977280 | elapsed time per iteration (s): 0.16 | learning rate: 8.571E-05 | global batch size: 256 | lm loss: 3.671975E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.706 | TFLOPs: 25.40 | +7: iteration 102570/ 173500 | consumed samples: 26257920 | consumed tokens: 53776220160 | elapsed time per iteration (s): 0.16 | learning rate: 8.569E-05 | global batch size: 256 | lm loss: 3.686539E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.077 | TFLOPs: 24.95 | +7: iteration 102580/ 173500 | consumed samples: 26260480 | consumed tokens: 53781463040 | elapsed time per iteration (s): 0.16 | learning rate: 8.568E-05 | global batch size: 256 | lm loss: 3.693097E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.167 | TFLOPs: 25.31 | +7: iteration 102590/ 173500 | consumed samples: 26263040 | consumed tokens: 53786705920 | elapsed time per iteration (s): 0.16 | learning rate: 8.566E-05 | global batch size: 256 | lm loss: 3.686827E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.258 | TFLOPs: 25.57 | +7: iteration 102600/ 173500 | consumed samples: 26265600 | consumed tokens: 53791948800 | elapsed time per iteration (s): 0.15 | learning rate: 8.565E-05 | global batch size: 256 | lm loss: 3.697579E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.004 | TFLOPs: 25.95 | +7: iteration 102610/ 173500 | consumed samples: 26268160 | consumed tokens: 53797191680 | elapsed time per iteration (s): 0.16 | learning rate: 8.563E-05 | global batch size: 256 | lm loss: 3.690276E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.911 | TFLOPs: 25.11 | +7: iteration 102620/ 173500 | consumed samples: 26270720 | consumed tokens: 53802434560 | elapsed time per iteration (s): 0.16 | learning rate: 8.561E-05 | global batch size: 256 | lm loss: 3.687328E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.083 | TFLOPs: 25.20 | +7: iteration 102630/ 173500 | consumed samples: 26273280 | consumed tokens: 53807677440 | elapsed time per iteration (s): 0.16 | learning rate: 8.560E-05 | global batch size: 256 | lm loss: 3.679195E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.017 | TFLOPs: 24.90 | +7: iteration 102640/ 173500 | consumed samples: 26275840 | consumed tokens: 53812920320 | elapsed time per iteration (s): 0.16 | learning rate: 8.558E-05 | global batch size: 256 | lm loss: 3.693163E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.116 | TFLOPs: 24.67 | +7: iteration 102650/ 173500 | consumed samples: 26278400 | consumed tokens: 53818163200 | elapsed time per iteration (s): 0.16 | learning rate: 8.557E-05 | global batch size: 256 | lm loss: 3.706189E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.531 | TFLOPs: 25.84 | +7: iteration 102660/ 173500 | consumed samples: 26280960 | consumed tokens: 53823406080 | elapsed time per iteration (s): 0.16 | learning rate: 8.555E-05 | global batch size: 256 | lm loss: 3.699909E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.422 | TFLOPs: 25.71 | +7: iteration 102670/ 173500 | consumed samples: 26283520 | consumed tokens: 53828648960 | elapsed time per iteration (s): 0.16 | learning rate: 8.553E-05 | global batch size: 256 | lm loss: 3.688940E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.771 | TFLOPs: 24.76 | +7: iteration 102680/ 173500 | consumed samples: 26286080 | consumed tokens: 53833891840 | elapsed time per iteration (s): 0.16 | learning rate: 8.552E-05 | global batch size: 256 | lm loss: 3.701150E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.419 | TFLOPs: 24.97 | +7: iteration 102690/ 173500 | consumed samples: 26288640 | consumed tokens: 53839134720 | elapsed time per iteration (s): 0.16 | learning rate: 8.550E-05 | global batch size: 256 | lm loss: 3.680953E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.406 | TFLOPs: 24.85 | +7: iteration 102700/ 173500 | consumed samples: 26291200 | consumed tokens: 53844377600 | elapsed time per iteration (s): 0.16 | learning rate: 8.549E-05 | global batch size: 256 | lm loss: 3.689557E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.608 | TFLOPs: 25.62 | +7: iteration 102710/ 173500 | consumed samples: 26293760 | consumed tokens: 53849620480 | elapsed time per iteration (s): 0.16 | learning rate: 8.547E-05 | global batch size: 256 | lm loss: 3.693215E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.792 | TFLOPs: 25.51 | +7: iteration 102720/ 173500 | consumed samples: 26296320 | consumed tokens: 53854863360 | elapsed time per iteration (s): 0.16 | learning rate: 8.546E-05 | global batch size: 256 | lm loss: 3.695230E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.062 | TFLOPs: 25.28 | +7: iteration 102730/ 173500 | consumed samples: 26298880 | consumed tokens: 53860106240 | elapsed time per iteration (s): 0.16 | learning rate: 8.544E-05 | global batch size: 256 | lm loss: 3.697527E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.936 | TFLOPs: 24.93 | +7: iteration 102740/ 173500 | consumed samples: 26301440 | consumed tokens: 53865349120 | elapsed time per iteration (s): 0.16 | learning rate: 8.542E-05 | global batch size: 256 | lm loss: 3.688450E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.267 | TFLOPs: 24.47 | +7: iteration 102750/ 173500 | consumed samples: 26304000 | consumed tokens: 53870592000 | elapsed time per iteration (s): 0.16 | learning rate: 8.541E-05 | global batch size: 256 | lm loss: 3.697655E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.770 | TFLOPs: 25.32 | +7: iteration 102760/ 173500 | consumed samples: 26306560 | consumed tokens: 53875834880 | elapsed time per iteration (s): 0.16 | learning rate: 8.539E-05 | global batch size: 256 | lm loss: 3.694056E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.176 | TFLOPs: 24.69 | +7: iteration 102770/ 173500 | consumed samples: 26309120 | consumed tokens: 53881077760 | elapsed time per iteration (s): 0.16 | learning rate: 8.538E-05 | global batch size: 256 | lm loss: 3.684109E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.692 | TFLOPs: 25.07 | +7: iteration 102780/ 173500 | consumed samples: 26311680 | consumed tokens: 53886320640 | elapsed time per iteration (s): 0.16 | learning rate: 8.536E-05 | global batch size: 256 | lm loss: 3.706355E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.477 | TFLOPs: 25.88 | +7: iteration 102790/ 173500 | consumed samples: 26314240 | consumed tokens: 53891563520 | elapsed time per iteration (s): 0.16 | learning rate: 8.534E-05 | global batch size: 256 | lm loss: 3.694316E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.441 | TFLOPs: 25.43 | +7: iteration 102800/ 173500 | consumed samples: 26316800 | consumed tokens: 53896806400 | elapsed time per iteration (s): 0.16 | learning rate: 8.533E-05 | global batch size: 256 | lm loss: 3.682734E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.118 | TFLOPs: 25.86 | +7: iteration 102810/ 173500 | consumed samples: 26319360 | consumed tokens: 53902049280 | elapsed time per iteration (s): 0.16 | learning rate: 8.531E-05 | global batch size: 256 | lm loss: 3.693362E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.418 | TFLOPs: 25.05 | +7: iteration 102820/ 173500 | consumed samples: 26321920 | consumed tokens: 53907292160 | elapsed time per iteration (s): 0.16 | learning rate: 8.530E-05 | global batch size: 256 | lm loss: 3.696699E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.979 | TFLOPs: 24.84 | +7: iteration 102830/ 173500 | consumed samples: 26324480 | consumed tokens: 53912535040 | elapsed time per iteration (s): 0.16 | learning rate: 8.528E-05 | global batch size: 256 | lm loss: 3.681951E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.296 | TFLOPs: 25.76 | +7: iteration 102840/ 173500 | consumed samples: 26327040 | consumed tokens: 53917777920 | elapsed time per iteration (s): 0.16 | learning rate: 8.527E-05 | global batch size: 256 | lm loss: 3.681064E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.446 | TFLOPs: 25.88 | +7: iteration 102850/ 173500 | consumed samples: 26329600 | consumed tokens: 53923020800 | elapsed time per iteration (s): 0.16 | learning rate: 8.525E-05 | global batch size: 256 | lm loss: 3.697548E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.790 | TFLOPs: 25.42 | +7: iteration 102860/ 173500 | consumed samples: 26332160 | consumed tokens: 53928263680 | elapsed time per iteration (s): 0.16 | learning rate: 8.523E-05 | global batch size: 256 | lm loss: 3.697417E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.014 | TFLOPs: 25.55 | +7: iteration 102870/ 173500 | consumed samples: 26334720 | consumed tokens: 53933506560 | elapsed time per iteration (s): 0.16 | learning rate: 8.522E-05 | global batch size: 256 | lm loss: 3.679123E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.774 | TFLOPs: 25.68 | +7: iteration 102880/ 173500 | consumed samples: 26337280 | consumed tokens: 53938749440 | elapsed time per iteration (s): 0.16 | learning rate: 8.520E-05 | global batch size: 256 | lm loss: 3.705893E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.628 | TFLOPs: 25.42 | +7: iteration 102890/ 173500 | consumed samples: 26339840 | consumed tokens: 53943992320 | elapsed time per iteration (s): 0.16 | learning rate: 8.519E-05 | global batch size: 256 | lm loss: 3.685938E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.415 | TFLOPs: 25.44 | +7: iteration 102900/ 173500 | consumed samples: 26342400 | consumed tokens: 53949235200 | elapsed time per iteration (s): 0.16 | learning rate: 8.517E-05 | global batch size: 256 | lm loss: 3.672661E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.913 | TFLOPs: 25.69 | +7: iteration 102910/ 173500 | consumed samples: 26344960 | consumed tokens: 53954478080 | elapsed time per iteration (s): 0.16 | learning rate: 8.515E-05 | global batch size: 256 | lm loss: 3.679941E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.301 | TFLOPs: 25.80 | +7: iteration 102920/ 173500 | consumed samples: 26347520 | consumed tokens: 53959720960 | elapsed time per iteration (s): 0.16 | learning rate: 8.514E-05 | global batch size: 256 | lm loss: 3.687965E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.477 | TFLOPs: 25.52 | +7: iteration 102930/ 173500 | consumed samples: 26350080 | consumed tokens: 53964963840 | elapsed time per iteration (s): 0.16 | learning rate: 8.512E-05 | global batch size: 256 | lm loss: 3.687023E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.132 | TFLOPs: 25.30 | +7: iteration 102940/ 173500 | consumed samples: 26352640 | consumed tokens: 53970206720 | elapsed time per iteration (s): 0.16 | learning rate: 8.511E-05 | global batch size: 256 | lm loss: 3.703725E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.882 | TFLOPs: 25.15 | +7: iteration 102950/ 173500 | consumed samples: 26355200 | consumed tokens: 53975449600 | elapsed time per iteration (s): 0.16 | learning rate: 8.509E-05 | global batch size: 256 | lm loss: 3.692603E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.287 | TFLOPs: 24.66 | +7: iteration 102960/ 173500 | consumed samples: 26357760 | consumed tokens: 53980692480 | elapsed time per iteration (s): 0.16 | learning rate: 8.508E-05 | global batch size: 256 | lm loss: 3.682840E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.737 | TFLOPs: 25.64 | +7: iteration 102970/ 173500 | consumed samples: 26360320 | consumed tokens: 53985935360 | elapsed time per iteration (s): 0.16 | learning rate: 8.506E-05 | global batch size: 256 | lm loss: 3.698845E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.415 | TFLOPs: 25.60 | +7: iteration 102980/ 173500 | consumed samples: 26362880 | consumed tokens: 53991178240 | elapsed time per iteration (s): 0.16 | learning rate: 8.504E-05 | global batch size: 256 | lm loss: 3.667924E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.976 | TFLOPs: 24.98 | +7: iteration 102990/ 173500 | consumed samples: 26365440 | consumed tokens: 53996421120 | elapsed time per iteration (s): 0.16 | learning rate: 8.503E-05 | global batch size: 256 | lm loss: 3.679007E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.297 | TFLOPs: 25.82 | +7: iteration 103000/ 173500 | consumed samples: 26368000 | consumed tokens: 54001664000 | elapsed time per iteration (s): 0.16 | learning rate: 8.501E-05 | global batch size: 256 | lm loss: 3.685973E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.811 | TFLOPs: 25.89 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 103000 | lm loss value: 3.843015E+00 | lm loss PPL: 4.666597E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 103000 to checkpoints_44m91b100m +0: [2023-03-17 04:43:25,135] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step103000 is begin to save! +0: [2023-03-17 04:43:25,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:43:25,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:43:25,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:43:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:43:25,210] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:43:25,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:43:25,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:43:25,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:43:25,226] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:43:25,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:43:25,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:43:25,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:43:25,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:43:25,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:43:25,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:43:25,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:43:25,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:43:25,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:43:25,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:43:25,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:43:25,268] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step103000/mp_rank_00_model_states.pt +0: [2023-03-17 04:43:25,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:43:25,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:43:25,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:43:25,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:43:25,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +1: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:43:25,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:43:25,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +1: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +1: [2023-03-17 04:43:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:43:25,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +1: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +1: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +2: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +6: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +4: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 04:43:25,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +7: [2023-03-17 04:43:25,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +3: [2023-03-17 04:43:25,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:43:25,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:43:25,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 04:43:25,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step103000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 04:43:25,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +5: [2023-03-17 04:43:25,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step103000 is ready now! +0: successfully saved checkpoint at iteration 103000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.91 +7: iteration 103010/ 173500 | consumed samples: 26370560 | consumed tokens: 54006906880 | elapsed time per iteration (s): 0.18 | learning rate: 8.500E-05 | global batch size: 256 | lm loss: 3.699070E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.343 | TFLOPs: 21.73 | +7: iteration 103020/ 173500 | consumed samples: 26373120 | consumed tokens: 54012149760 | elapsed time per iteration (s): 0.16 | learning rate: 8.498E-05 | global batch size: 256 | lm loss: 3.696965E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.982 | TFLOPs: 25.11 | +7: iteration 103030/ 173500 | consumed samples: 26375680 | consumed tokens: 54017392640 | elapsed time per iteration (s): 0.16 | learning rate: 8.496E-05 | global batch size: 256 | lm loss: 3.678674E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.437 | TFLOPs: 25.82 | +7: iteration 103040/ 173500 | consumed samples: 26378240 | consumed tokens: 54022635520 | elapsed time per iteration (s): 0.16 | learning rate: 8.495E-05 | global batch size: 256 | lm loss: 3.689017E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.764 | TFLOPs: 24.54 | +7: iteration 103050/ 173500 | consumed samples: 26380800 | consumed tokens: 54027878400 | elapsed time per iteration (s): 0.15 | learning rate: 8.493E-05 | global batch size: 256 | lm loss: 3.691866E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.007 | TFLOPs: 26.24 | +7: iteration 103060/ 173500 | consumed samples: 26383360 | consumed tokens: 54033121280 | elapsed time per iteration (s): 0.16 | learning rate: 8.492E-05 | global batch size: 256 | lm loss: 3.687707E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.909 | TFLOPs: 25.44 | +7: iteration 103070/ 173500 | consumed samples: 26385920 | consumed tokens: 54038364160 | elapsed time per iteration (s): 0.16 | learning rate: 8.490E-05 | global batch size: 256 | lm loss: 3.681647E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.109 | TFLOPs: 25.64 | +7: iteration 103080/ 173500 | consumed samples: 26388480 | consumed tokens: 54043607040 | elapsed time per iteration (s): 0.16 | learning rate: 8.489E-05 | global batch size: 256 | lm loss: 3.675108E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.167 | TFLOPs: 25.03 | +7: iteration 103090/ 173500 | consumed samples: 26391040 | consumed tokens: 54048849920 | elapsed time per iteration (s): 0.16 | learning rate: 8.487E-05 | global batch size: 256 | lm loss: 3.706046E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.923 | TFLOPs: 24.48 | +7: iteration 103100/ 173500 | consumed samples: 26393600 | consumed tokens: 54054092800 | elapsed time per iteration (s): 0.17 | learning rate: 8.485E-05 | global batch size: 256 | lm loss: 3.705325E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.039 | TFLOPs: 24.10 | +7: iteration 103110/ 173500 | consumed samples: 26396160 | consumed tokens: 54059335680 | elapsed time per iteration (s): 0.16 | learning rate: 8.484E-05 | global batch size: 256 | lm loss: 3.690342E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.295 | TFLOPs: 24.59 | +7: iteration 103120/ 173500 | consumed samples: 26398720 | consumed tokens: 54064578560 | elapsed time per iteration (s): 0.16 | learning rate: 8.482E-05 | global batch size: 256 | lm loss: 3.692805E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.247 | TFLOPs: 25.57 | +7: iteration 103130/ 173500 | consumed samples: 26401280 | consumed tokens: 54069821440 | elapsed time per iteration (s): 0.17 | learning rate: 8.481E-05 | global batch size: 256 | lm loss: 3.697138E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.892 | TFLOPs: 24.24 | +7: iteration 103140/ 173500 | consumed samples: 26403840 | consumed tokens: 54075064320 | elapsed time per iteration (s): 0.16 | learning rate: 8.479E-05 | global batch size: 256 | lm loss: 3.690414E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.262 | TFLOPs: 25.06 | +7: iteration 103150/ 173500 | consumed samples: 26406400 | consumed tokens: 54080307200 | elapsed time per iteration (s): 0.16 | learning rate: 8.477E-05 | global batch size: 256 | lm loss: 3.692555E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.459 | TFLOPs: 25.74 | +7: iteration 103160/ 173500 | consumed samples: 26408960 | consumed tokens: 54085550080 | elapsed time per iteration (s): 0.16 | learning rate: 8.476E-05 | global batch size: 256 | lm loss: 3.692315E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.473 | TFLOPs: 25.68 | +7: iteration 103170/ 173500 | consumed samples: 26411520 | consumed tokens: 54090792960 | elapsed time per iteration (s): 0.15 | learning rate: 8.474E-05 | global batch size: 256 | lm loss: 3.695428E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.292 | TFLOPs: 26.15 | +7: iteration 103180/ 173500 | consumed samples: 26414080 | consumed tokens: 54096035840 | elapsed time per iteration (s): 0.16 | learning rate: 8.473E-05 | global batch size: 256 | lm loss: 3.700407E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.045 | TFLOPs: 25.42 | +7: iteration 103190/ 173500 | consumed samples: 26416640 | consumed tokens: 54101278720 | elapsed time per iteration (s): 0.16 | learning rate: 8.471E-05 | global batch size: 256 | lm loss: 3.689239E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.841 | TFLOPs: 25.03 | +7: iteration 103200/ 173500 | consumed samples: 26419200 | consumed tokens: 54106521600 | elapsed time per iteration (s): 0.16 | learning rate: 8.470E-05 | global batch size: 256 | lm loss: 3.694944E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.114 | TFLOPs: 24.89 | +7: iteration 103210/ 173500 | consumed samples: 26421760 | consumed tokens: 54111764480 | elapsed time per iteration (s): 0.16 | learning rate: 8.468E-05 | global batch size: 256 | lm loss: 3.687514E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.698 | TFLOPs: 25.65 | +7: iteration 103220/ 173500 | consumed samples: 26424320 | consumed tokens: 54117007360 | elapsed time per iteration (s): 0.16 | learning rate: 8.466E-05 | global batch size: 256 | lm loss: 3.675940E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.734 | TFLOPs: 25.43 | +7: iteration 103230/ 173500 | consumed samples: 26426880 | consumed tokens: 54122250240 | elapsed time per iteration (s): 0.16 | learning rate: 8.465E-05 | global batch size: 256 | lm loss: 3.684209E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.878 | TFLOPs: 25.39 | +7: iteration 103240/ 173500 | consumed samples: 26429440 | consumed tokens: 54127493120 | elapsed time per iteration (s): 0.17 | learning rate: 8.463E-05 | global batch size: 256 | lm loss: 3.702729E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.648 | TFLOPs: 24.13 | +7: iteration 103250/ 173500 | consumed samples: 26432000 | consumed tokens: 54132736000 | elapsed time per iteration (s): 0.16 | learning rate: 8.462E-05 | global batch size: 256 | lm loss: 3.673653E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.635 | TFLOPs: 25.53 | +7: iteration 103260/ 173500 | consumed samples: 26434560 | consumed tokens: 54137978880 | elapsed time per iteration (s): 0.16 | learning rate: 8.460E-05 | global batch size: 256 | lm loss: 3.687852E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.737 | TFLOPs: 25.76 | +7: iteration 103270/ 173500 | consumed samples: 26437120 | consumed tokens: 54143221760 | elapsed time per iteration (s): 0.16 | learning rate: 8.459E-05 | global batch size: 256 | lm loss: 3.699121E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.964 | TFLOPs: 25.78 | +7: iteration 103280/ 173500 | consumed samples: 26439680 | consumed tokens: 54148464640 | elapsed time per iteration (s): 0.17 | learning rate: 8.457E-05 | global batch size: 256 | lm loss: 3.697530E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1507.349 | TFLOPs: 23.64 | +7: iteration 103290/ 173500 | consumed samples: 26442240 | consumed tokens: 54153707520 | elapsed time per iteration (s): 0.16 | learning rate: 8.455E-05 | global batch size: 256 | lm loss: 3.690658E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.793 | TFLOPs: 25.78 | +7: iteration 103300/ 173500 | consumed samples: 26444800 | consumed tokens: 54158950400 | elapsed time per iteration (s): 0.16 | learning rate: 8.454E-05 | global batch size: 256 | lm loss: 3.689429E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.454 | TFLOPs: 24.83 | +7: iteration 103310/ 173500 | consumed samples: 26447360 | consumed tokens: 54164193280 | elapsed time per iteration (s): 0.16 | learning rate: 8.452E-05 | global batch size: 256 | lm loss: 3.696152E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.085 | TFLOPs: 25.63 | +7: iteration 103320/ 173500 | consumed samples: 26449920 | consumed tokens: 54169436160 | elapsed time per iteration (s): 0.16 | learning rate: 8.451E-05 | global batch size: 256 | lm loss: 3.691590E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.886 | TFLOPs: 24.53 | +7: iteration 103330/ 173500 | consumed samples: 26452480 | consumed tokens: 54174679040 | elapsed time per iteration (s): 0.16 | learning rate: 8.449E-05 | global batch size: 256 | lm loss: 3.687083E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.637 | TFLOPs: 25.26 | +7: iteration 103340/ 173500 | consumed samples: 26455040 | consumed tokens: 54179921920 | elapsed time per iteration (s): 0.16 | learning rate: 8.447E-05 | global batch size: 256 | lm loss: 3.692142E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.674 | TFLOPs: 24.88 | +7: iteration 103350/ 173500 | consumed samples: 26457600 | consumed tokens: 54185164800 | elapsed time per iteration (s): 0.16 | learning rate: 8.446E-05 | global batch size: 256 | lm loss: 3.687745E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.918 | TFLOPs: 24.56 | +7: iteration 103360/ 173500 | consumed samples: 26460160 | consumed tokens: 54190407680 | elapsed time per iteration (s): 0.16 | learning rate: 8.444E-05 | global batch size: 256 | lm loss: 3.688246E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.109 | TFLOPs: 25.71 | +7: iteration 103370/ 173500 | consumed samples: 26462720 | consumed tokens: 54195650560 | elapsed time per iteration (s): 0.16 | learning rate: 8.443E-05 | global batch size: 256 | lm loss: 3.676711E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.046 | TFLOPs: 25.58 | +7: iteration 103380/ 173500 | consumed samples: 26465280 | consumed tokens: 54200893440 | elapsed time per iteration (s): 0.16 | learning rate: 8.441E-05 | global batch size: 256 | lm loss: 3.691072E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.555 | TFLOPs: 25.32 | +7: iteration 103390/ 173500 | consumed samples: 26467840 | consumed tokens: 54206136320 | elapsed time per iteration (s): 0.16 | learning rate: 8.440E-05 | global batch size: 256 | lm loss: 3.673484E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.011 | TFLOPs: 25.58 | +7: iteration 103400/ 173500 | consumed samples: 26470400 | consumed tokens: 54211379200 | elapsed time per iteration (s): 0.16 | learning rate: 8.438E-05 | global batch size: 256 | lm loss: 3.688667E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.359 | TFLOPs: 24.82 | +7: iteration 103410/ 173500 | consumed samples: 26472960 | consumed tokens: 54216622080 | elapsed time per iteration (s): 0.16 | learning rate: 8.436E-05 | global batch size: 256 | lm loss: 3.692145E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.259 | TFLOPs: 25.44 | +7: iteration 103420/ 173500 | consumed samples: 26475520 | consumed tokens: 54221864960 | elapsed time per iteration (s): 0.16 | learning rate: 8.435E-05 | global batch size: 256 | lm loss: 3.686900E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.333 | TFLOPs: 25.25 | +7: iteration 103430/ 173500 | consumed samples: 26478080 | consumed tokens: 54227107840 | elapsed time per iteration (s): 0.16 | learning rate: 8.433E-05 | global batch size: 256 | lm loss: 3.679927E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.510 | TFLOPs: 25.02 | +7: iteration 103440/ 173500 | consumed samples: 26480640 | consumed tokens: 54232350720 | elapsed time per iteration (s): 0.16 | learning rate: 8.432E-05 | global batch size: 256 | lm loss: 3.684657E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.651 | TFLOPs: 25.04 | +7: iteration 103450/ 173500 | consumed samples: 26483200 | consumed tokens: 54237593600 | elapsed time per iteration (s): 0.16 | learning rate: 8.430E-05 | global batch size: 256 | lm loss: 3.685770E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.677 | TFLOPs: 25.70 | +7: iteration 103460/ 173500 | consumed samples: 26485760 | consumed tokens: 54242836480 | elapsed time per iteration (s): 0.16 | learning rate: 8.429E-05 | global batch size: 256 | lm loss: 3.710861E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.450 | TFLOPs: 25.02 | +7: iteration 103470/ 173500 | consumed samples: 26488320 | consumed tokens: 54248079360 | elapsed time per iteration (s): 0.16 | learning rate: 8.427E-05 | global batch size: 256 | lm loss: 3.695809E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.762 | TFLOPs: 25.45 | +7: iteration 103480/ 173500 | consumed samples: 26490880 | consumed tokens: 54253322240 | elapsed time per iteration (s): 0.15 | learning rate: 8.425E-05 | global batch size: 256 | lm loss: 3.699916E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.924 | TFLOPs: 25.95 | +7: iteration 103490/ 173500 | consumed samples: 26493440 | consumed tokens: 54258565120 | elapsed time per iteration (s): 0.16 | learning rate: 8.424E-05 | global batch size: 256 | lm loss: 3.690505E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.828 | TFLOPs: 25.25 | +7: iteration 103500/ 173500 | consumed samples: 26496000 | consumed tokens: 54263808000 | elapsed time per iteration (s): 0.16 | learning rate: 8.422E-05 | global batch size: 256 | lm loss: 3.691431E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.231 | TFLOPs: 24.36 | +7: iteration 103510/ 173500 | consumed samples: 26498560 | consumed tokens: 54269050880 | elapsed time per iteration (s): 0.17 | learning rate: 8.421E-05 | global batch size: 256 | lm loss: 3.693187E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.099 | TFLOPs: 23.67 | +7: iteration 103520/ 173500 | consumed samples: 26501120 | consumed tokens: 54274293760 | elapsed time per iteration (s): 0.16 | learning rate: 8.419E-05 | global batch size: 256 | lm loss: 3.689020E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.961 | TFLOPs: 25.33 | +7: iteration 103530/ 173500 | consumed samples: 26503680 | consumed tokens: 54279536640 | elapsed time per iteration (s): 0.17 | learning rate: 8.418E-05 | global batch size: 256 | lm loss: 3.695942E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.497 | TFLOPs: 23.55 | +7: iteration 103540/ 173500 | consumed samples: 26506240 | consumed tokens: 54284779520 | elapsed time per iteration (s): 0.16 | learning rate: 8.416E-05 | global batch size: 256 | lm loss: 3.688477E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.097 | TFLOPs: 25.63 | +7: iteration 103550/ 173500 | consumed samples: 26508800 | consumed tokens: 54290022400 | elapsed time per iteration (s): 0.16 | learning rate: 8.414E-05 | global batch size: 256 | lm loss: 3.687219E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.387 | TFLOPs: 25.55 | +7: iteration 103560/ 173500 | consumed samples: 26511360 | consumed tokens: 54295265280 | elapsed time per iteration (s): 0.16 | learning rate: 8.413E-05 | global batch size: 256 | lm loss: 3.702541E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.553 | TFLOPs: 25.41 | +7: iteration 103570/ 173500 | consumed samples: 26513920 | consumed tokens: 54300508160 | elapsed time per iteration (s): 0.17 | learning rate: 8.411E-05 | global batch size: 256 | lm loss: 3.697606E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.234 | TFLOPs: 24.26 | +7: iteration 103580/ 173500 | consumed samples: 26516480 | consumed tokens: 54305751040 | elapsed time per iteration (s): 0.16 | learning rate: 8.410E-05 | global batch size: 256 | lm loss: 3.678429E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.322 | TFLOPs: 24.96 | +7: iteration 103590/ 173500 | consumed samples: 26519040 | consumed tokens: 54310993920 | elapsed time per iteration (s): 0.16 | learning rate: 8.408E-05 | global batch size: 256 | lm loss: 3.684511E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.146 | TFLOPs: 25.69 | +7: iteration 103600/ 173500 | consumed samples: 26521600 | consumed tokens: 54316236800 | elapsed time per iteration (s): 0.16 | learning rate: 8.406E-05 | global batch size: 256 | lm loss: 3.701156E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.780 | TFLOPs: 24.57 | +7: iteration 103610/ 173500 | consumed samples: 26524160 | consumed tokens: 54321479680 | elapsed time per iteration (s): 0.16 | learning rate: 8.405E-05 | global batch size: 256 | lm loss: 3.692794E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.961 | TFLOPs: 25.39 | +7: iteration 103620/ 173500 | consumed samples: 26526720 | consumed tokens: 54326722560 | elapsed time per iteration (s): 0.17 | learning rate: 8.403E-05 | global batch size: 256 | lm loss: 3.687531E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.474 | TFLOPs: 23.89 | +7: iteration 103630/ 173500 | consumed samples: 26529280 | consumed tokens: 54331965440 | elapsed time per iteration (s): 0.17 | learning rate: 8.402E-05 | global batch size: 256 | lm loss: 3.679686E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.820 | TFLOPs: 24.16 | +7: iteration 103640/ 173500 | consumed samples: 26531840 | consumed tokens: 54337208320 | elapsed time per iteration (s): 0.16 | learning rate: 8.400E-05 | global batch size: 256 | lm loss: 3.688272E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.472 | TFLOPs: 25.26 | +7: iteration 103650/ 173500 | consumed samples: 26534400 | consumed tokens: 54342451200 | elapsed time per iteration (s): 0.16 | learning rate: 8.399E-05 | global batch size: 256 | lm loss: 3.682619E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.413 | TFLOPs: 24.85 | +7: iteration 103660/ 173500 | consumed samples: 26536960 | consumed tokens: 54347694080 | elapsed time per iteration (s): 0.16 | learning rate: 8.397E-05 | global batch size: 256 | lm loss: 3.685905E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.921 | TFLOPs: 25.39 | +7: iteration 103670/ 173500 | consumed samples: 26539520 | consumed tokens: 54352936960 | elapsed time per iteration (s): 0.16 | learning rate: 8.395E-05 | global batch size: 256 | lm loss: 3.684487E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.709 | TFLOPs: 25.67 | +7: iteration 103680/ 173500 | consumed samples: 26542080 | consumed tokens: 54358179840 | elapsed time per iteration (s): 0.17 | learning rate: 8.394E-05 | global batch size: 256 | lm loss: 3.682715E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.801 | TFLOPs: 24.05 | +7: iteration 103690/ 173500 | consumed samples: 26544640 | consumed tokens: 54363422720 | elapsed time per iteration (s): 0.16 | learning rate: 8.392E-05 | global batch size: 256 | lm loss: 3.700668E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.598 | TFLOPs: 25.68 | +7: iteration 103700/ 173500 | consumed samples: 26547200 | consumed tokens: 54368665600 | elapsed time per iteration (s): 0.16 | learning rate: 8.391E-05 | global batch size: 256 | lm loss: 3.687019E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.283 | TFLOPs: 25.10 | +7: iteration 103710/ 173500 | consumed samples: 26549760 | consumed tokens: 54373908480 | elapsed time per iteration (s): 0.16 | learning rate: 8.389E-05 | global batch size: 256 | lm loss: 3.675340E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.248 | TFLOPs: 25.22 | +7: iteration 103720/ 173500 | consumed samples: 26552320 | consumed tokens: 54379151360 | elapsed time per iteration (s): 0.16 | learning rate: 8.388E-05 | global batch size: 256 | lm loss: 3.691039E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.251 | TFLOPs: 25.79 | +7: iteration 103730/ 173500 | consumed samples: 26554880 | consumed tokens: 54384394240 | elapsed time per iteration (s): 0.16 | learning rate: 8.386E-05 | global batch size: 256 | lm loss: 3.686982E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.363 | TFLOPs: 25.77 | +7: iteration 103740/ 173500 | consumed samples: 26557440 | consumed tokens: 54389637120 | elapsed time per iteration (s): 0.16 | learning rate: 8.384E-05 | global batch size: 256 | lm loss: 3.692817E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.934 | TFLOPs: 25.40 | +7: iteration 103750/ 173500 | consumed samples: 26560000 | consumed tokens: 54394880000 | elapsed time per iteration (s): 0.16 | learning rate: 8.383E-05 | global batch size: 256 | lm loss: 3.679506E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.778 | TFLOPs: 25.17 | +7: iteration 103760/ 173500 | consumed samples: 26562560 | consumed tokens: 54400122880 | elapsed time per iteration (s): 0.16 | learning rate: 8.381E-05 | global batch size: 256 | lm loss: 3.685651E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.668 | TFLOPs: 25.84 | +7: iteration 103770/ 173500 | consumed samples: 26565120 | consumed tokens: 54405365760 | elapsed time per iteration (s): 0.15 | learning rate: 8.380E-05 | global batch size: 256 | lm loss: 3.687703E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.565 | TFLOPs: 26.01 | +7: iteration 103780/ 173500 | consumed samples: 26567680 | consumed tokens: 54410608640 | elapsed time per iteration (s): 0.16 | learning rate: 8.378E-05 | global batch size: 256 | lm loss: 3.690879E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.106 | TFLOPs: 25.05 | +7: iteration 103790/ 173500 | consumed samples: 26570240 | consumed tokens: 54415851520 | elapsed time per iteration (s): 0.15 | learning rate: 8.377E-05 | global batch size: 256 | lm loss: 3.685749E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.891 | TFLOPs: 26.08 | +7: iteration 103800/ 173500 | consumed samples: 26572800 | consumed tokens: 54421094400 | elapsed time per iteration (s): 0.16 | learning rate: 8.375E-05 | global batch size: 256 | lm loss: 3.694575E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.830 | TFLOPs: 25.69 | +7: iteration 103810/ 173500 | consumed samples: 26575360 | consumed tokens: 54426337280 | elapsed time per iteration (s): 0.16 | learning rate: 8.373E-05 | global batch size: 256 | lm loss: 3.688845E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.765 | TFLOPs: 25.51 | +7: iteration 103820/ 173500 | consumed samples: 26577920 | consumed tokens: 54431580160 | elapsed time per iteration (s): 0.16 | learning rate: 8.372E-05 | global batch size: 256 | lm loss: 3.669473E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.767 | TFLOPs: 25.79 | +7: iteration 103830/ 173500 | consumed samples: 26580480 | consumed tokens: 54436823040 | elapsed time per iteration (s): 0.16 | learning rate: 8.370E-05 | global batch size: 256 | lm loss: 3.671975E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.701 | TFLOPs: 24.99 | +7: iteration 103840/ 173500 | consumed samples: 26583040 | consumed tokens: 54442065920 | elapsed time per iteration (s): 0.16 | learning rate: 8.369E-05 | global batch size: 256 | lm loss: 3.687333E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.167 | TFLOPs: 25.57 | +7: iteration 103850/ 173500 | consumed samples: 26585600 | consumed tokens: 54447308800 | elapsed time per iteration (s): 0.16 | learning rate: 8.367E-05 | global batch size: 256 | lm loss: 3.681433E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.255 | TFLOPs: 25.86 | +7: iteration 103860/ 173500 | consumed samples: 26588160 | consumed tokens: 54452551680 | elapsed time per iteration (s): 0.16 | learning rate: 8.366E-05 | global batch size: 256 | lm loss: 3.688450E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.330 | TFLOPs: 24.74 | +7: iteration 103870/ 173500 | consumed samples: 26590720 | consumed tokens: 54457794560 | elapsed time per iteration (s): 0.15 | learning rate: 8.364E-05 | global batch size: 256 | lm loss: 3.689870E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.164 | TFLOPs: 26.08 | +7: iteration 103880/ 173500 | consumed samples: 26593280 | consumed tokens: 54463037440 | elapsed time per iteration (s): 0.16 | learning rate: 8.362E-05 | global batch size: 256 | lm loss: 3.698460E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.325 | TFLOPs: 25.50 | +7: iteration 103890/ 173500 | consumed samples: 26595840 | consumed tokens: 54468280320 | elapsed time per iteration (s): 0.15 | learning rate: 8.361E-05 | global batch size: 256 | lm loss: 3.685813E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.738 | TFLOPs: 26.08 | +7: iteration 103900/ 173500 | consumed samples: 26598400 | consumed tokens: 54473523200 | elapsed time per iteration (s): 0.15 | learning rate: 8.359E-05 | global batch size: 256 | lm loss: 3.691809E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.488 | TFLOPs: 26.01 | +7: iteration 103910/ 173500 | consumed samples: 26600960 | consumed tokens: 54478766080 | elapsed time per iteration (s): 0.16 | learning rate: 8.358E-05 | global batch size: 256 | lm loss: 3.683021E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.234 | TFLOPs: 25.13 | +7: iteration 103920/ 173500 | consumed samples: 26603520 | consumed tokens: 54484008960 | elapsed time per iteration (s): 0.15 | learning rate: 8.356E-05 | global batch size: 256 | lm loss: 3.689666E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.174 | TFLOPs: 26.11 | +7: iteration 103930/ 173500 | consumed samples: 26606080 | consumed tokens: 54489251840 | elapsed time per iteration (s): 0.16 | learning rate: 8.355E-05 | global batch size: 256 | lm loss: 3.684136E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.124 | TFLOPs: 24.81 | +7: iteration 103940/ 173500 | consumed samples: 26608640 | consumed tokens: 54494494720 | elapsed time per iteration (s): 0.16 | learning rate: 8.353E-05 | global batch size: 256 | lm loss: 3.676777E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.263 | TFLOPs: 24.86 | +7: iteration 103950/ 173500 | consumed samples: 26611200 | consumed tokens: 54499737600 | elapsed time per iteration (s): 0.17 | learning rate: 8.351E-05 | global batch size: 256 | lm loss: 3.684935E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.110 | TFLOPs: 24.04 | +7: iteration 103960/ 173500 | consumed samples: 26613760 | consumed tokens: 54504980480 | elapsed time per iteration (s): 0.15 | learning rate: 8.350E-05 | global batch size: 256 | lm loss: 3.696485E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.744 | TFLOPs: 26.19 | +7: iteration 103970/ 173500 | consumed samples: 26616320 | consumed tokens: 54510223360 | elapsed time per iteration (s): 0.16 | learning rate: 8.348E-05 | global batch size: 256 | lm loss: 3.692892E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.969 | TFLOPs: 25.28 | +7: iteration 103980/ 173500 | consumed samples: 26618880 | consumed tokens: 54515466240 | elapsed time per iteration (s): 0.16 | learning rate: 8.347E-05 | global batch size: 256 | lm loss: 3.691190E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.773 | TFLOPs: 24.52 | +7: iteration 103990/ 173500 | consumed samples: 26621440 | consumed tokens: 54520709120 | elapsed time per iteration (s): 0.17 | learning rate: 8.345E-05 | global batch size: 256 | lm loss: 3.689677E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.388 | TFLOPs: 24.20 | +0: [2023-03-17 04:46:04,662] [INFO] [logging.py:68:log_dist] [Rank 0] step=104000, skipped=0, lr=[8.343492337309329e-05, 8.343492337309329e-05, 8.343492337309329e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 104000/ 173500 | consumed samples: 26624000 | consumed tokens: 54525952000 | elapsed time per iteration (s): 0.16 | learning rate: 8.343E-05 | global batch size: 256 | lm loss: 3.694291E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.150 | TFLOPs: 25.83 | +0: steps: 104000 loss: 3.6878 iter time (s): 0.158 samples/sec: 1616.302 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 104000 | lm loss value: 3.827978E+00 | lm loss PPL: 4.596950E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 104000 to checkpoints_44m91b100m +0: [2023-03-17 04:46:04,737] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step104000 is begin to save! +0: [2023-03-17 04:46:04,741] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:46:04,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:46:04,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:46:04,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:46:04,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:46:04,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:46:04,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:46:04,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:46:04,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:46:04,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:46:04,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:46:04,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:46:04,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:46:04,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:46:04,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:46:04,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:46:04,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:46:04,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:46:04,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:46:04,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:46:04,878] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step104000/mp_rank_00_model_states.pt +0: [2023-03-17 04:46:04,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:46:04,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:46:04,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:46:04,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:46:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:46:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:46:04,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:46:04,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,918] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,918] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:46:04,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +5: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +1: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +4: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:46:04,920] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +3: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +6: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +7: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:46:04,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step104000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:46:04,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step104000 is ready now! +0: successfully saved checkpoint at iteration 104000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.98 +7: iteration 104010/ 173500 | consumed samples: 26626560 | consumed tokens: 54531194880 | elapsed time per iteration (s): 0.18 | learning rate: 8.342E-05 | global batch size: 256 | lm loss: 3.685947E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.772 | TFLOPs: 21.83 | +7: iteration 104020/ 173500 | consumed samples: 26629120 | consumed tokens: 54536437760 | elapsed time per iteration (s): 0.16 | learning rate: 8.340E-05 | global batch size: 256 | lm loss: 3.690332E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.719 | TFLOPs: 25.62 | +7: iteration 104030/ 173500 | consumed samples: 26631680 | consumed tokens: 54541680640 | elapsed time per iteration (s): 0.15 | learning rate: 8.339E-05 | global batch size: 256 | lm loss: 3.675819E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.145 | TFLOPs: 26.16 | +7: iteration 104040/ 173500 | consumed samples: 26634240 | consumed tokens: 54546923520 | elapsed time per iteration (s): 0.16 | learning rate: 8.337E-05 | global batch size: 256 | lm loss: 3.676084E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.042 | TFLOPs: 25.19 | +7: iteration 104050/ 173500 | consumed samples: 26636800 | consumed tokens: 54552166400 | elapsed time per iteration (s): 0.16 | learning rate: 8.336E-05 | global batch size: 256 | lm loss: 3.687827E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.169 | TFLOPs: 25.17 | +7: iteration 104060/ 173500 | consumed samples: 26639360 | consumed tokens: 54557409280 | elapsed time per iteration (s): 0.16 | learning rate: 8.334E-05 | global batch size: 256 | lm loss: 3.681933E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.293 | TFLOPs: 25.50 | +7: iteration 104070/ 173500 | consumed samples: 26641920 | consumed tokens: 54562652160 | elapsed time per iteration (s): 0.16 | learning rate: 8.332E-05 | global batch size: 256 | lm loss: 3.693312E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.353 | TFLOPs: 25.54 | +7: iteration 104080/ 173500 | consumed samples: 26644480 | consumed tokens: 54567895040 | elapsed time per iteration (s): 0.16 | learning rate: 8.331E-05 | global batch size: 256 | lm loss: 3.695933E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.060 | TFLOPs: 25.50 | +7: iteration 104090/ 173500 | consumed samples: 26647040 | consumed tokens: 54573137920 | elapsed time per iteration (s): 0.16 | learning rate: 8.329E-05 | global batch size: 256 | lm loss: 3.687920E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.146 | TFLOPs: 25.78 | +7: iteration 104100/ 173500 | consumed samples: 26649600 | consumed tokens: 54578380800 | elapsed time per iteration (s): 0.15 | learning rate: 8.328E-05 | global batch size: 256 | lm loss: 3.690158E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.519 | TFLOPs: 25.95 | +7: iteration 104110/ 173500 | consumed samples: 26652160 | consumed tokens: 54583623680 | elapsed time per iteration (s): 0.16 | learning rate: 8.326E-05 | global batch size: 256 | lm loss: 3.678173E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.880 | TFLOPs: 24.90 | +7: iteration 104120/ 173500 | consumed samples: 26654720 | consumed tokens: 54588866560 | elapsed time per iteration (s): 0.16 | learning rate: 8.325E-05 | global batch size: 256 | lm loss: 3.689961E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.766 | TFLOPs: 25.89 | +7: iteration 104130/ 173500 | consumed samples: 26657280 | consumed tokens: 54594109440 | elapsed time per iteration (s): 0.16 | learning rate: 8.323E-05 | global batch size: 256 | lm loss: 3.694339E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.784 | TFLOPs: 25.46 | +7: iteration 104140/ 173500 | consumed samples: 26659840 | consumed tokens: 54599352320 | elapsed time per iteration (s): 0.18 | learning rate: 8.321E-05 | global batch size: 256 | lm loss: 3.693122E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.400 | TFLOPs: 22.82 | +7: iteration 104150/ 173500 | consumed samples: 26662400 | consumed tokens: 54604595200 | elapsed time per iteration (s): 0.16 | learning rate: 8.320E-05 | global batch size: 256 | lm loss: 3.680777E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.682 | TFLOPs: 24.69 | +7: iteration 104160/ 173500 | consumed samples: 26664960 | consumed tokens: 54609838080 | elapsed time per iteration (s): 0.16 | learning rate: 8.318E-05 | global batch size: 256 | lm loss: 3.692413E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.681 | TFLOPs: 25.28 | +7: iteration 104170/ 173500 | consumed samples: 26667520 | consumed tokens: 54615080960 | elapsed time per iteration (s): 0.16 | learning rate: 8.317E-05 | global batch size: 256 | lm loss: 3.694131E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.974 | TFLOPs: 25.64 | +7: iteration 104180/ 173500 | consumed samples: 26670080 | consumed tokens: 54620323840 | elapsed time per iteration (s): 0.16 | learning rate: 8.315E-05 | global batch size: 256 | lm loss: 3.709868E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.597 | TFLOPs: 25.60 | +7: iteration 104190/ 173500 | consumed samples: 26672640 | consumed tokens: 54625566720 | elapsed time per iteration (s): 0.16 | learning rate: 8.314E-05 | global batch size: 256 | lm loss: 3.683066E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.462 | TFLOPs: 24.64 | +7: iteration 104200/ 173500 | consumed samples: 26675200 | consumed tokens: 54630809600 | elapsed time per iteration (s): 0.15 | learning rate: 8.312E-05 | global batch size: 256 | lm loss: 3.677563E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.904 | TFLOPs: 25.91 | +7: iteration 104210/ 173500 | consumed samples: 26677760 | consumed tokens: 54636052480 | elapsed time per iteration (s): 0.16 | learning rate: 8.310E-05 | global batch size: 256 | lm loss: 3.686096E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.380 | TFLOPs: 24.94 | +7: iteration 104220/ 173500 | consumed samples: 26680320 | consumed tokens: 54641295360 | elapsed time per iteration (s): 0.16 | learning rate: 8.309E-05 | global batch size: 256 | lm loss: 3.680408E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.686 | TFLOPs: 25.71 | +7: iteration 104230/ 173500 | consumed samples: 26682880 | consumed tokens: 54646538240 | elapsed time per iteration (s): 0.16 | learning rate: 8.307E-05 | global batch size: 256 | lm loss: 3.687459E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.192 | TFLOPs: 25.47 | +7: iteration 104240/ 173500 | consumed samples: 26685440 | consumed tokens: 54651781120 | elapsed time per iteration (s): 0.16 | learning rate: 8.306E-05 | global batch size: 256 | lm loss: 3.680479E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.856 | TFLOPs: 24.96 | +7: iteration 104250/ 173500 | consumed samples: 26688000 | consumed tokens: 54657024000 | elapsed time per iteration (s): 0.15 | learning rate: 8.304E-05 | global batch size: 256 | lm loss: 3.682230E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.876 | TFLOPs: 26.17 | +7: iteration 104260/ 173500 | consumed samples: 26690560 | consumed tokens: 54662266880 | elapsed time per iteration (s): 0.16 | learning rate: 8.303E-05 | global batch size: 256 | lm loss: 3.689767E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.359 | TFLOPs: 25.49 | +7: iteration 104270/ 173500 | consumed samples: 26693120 | consumed tokens: 54667509760 | elapsed time per iteration (s): 0.16 | learning rate: 8.301E-05 | global batch size: 256 | lm loss: 3.683315E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.507 | TFLOPs: 25.02 | +7: iteration 104280/ 173500 | consumed samples: 26695680 | consumed tokens: 54672752640 | elapsed time per iteration (s): 0.16 | learning rate: 8.299E-05 | global batch size: 256 | lm loss: 3.676612E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.314 | TFLOPs: 24.66 | +7: iteration 104290/ 173500 | consumed samples: 26698240 | consumed tokens: 54677995520 | elapsed time per iteration (s): 0.15 | learning rate: 8.298E-05 | global batch size: 256 | lm loss: 3.694059E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.545 | TFLOPs: 25.95 | +7: iteration 104300/ 173500 | consumed samples: 26700800 | consumed tokens: 54683238400 | elapsed time per iteration (s): 0.16 | learning rate: 8.296E-05 | global batch size: 256 | lm loss: 3.678481E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.728 | TFLOPs: 25.79 | +7: iteration 104310/ 173500 | consumed samples: 26703360 | consumed tokens: 54688481280 | elapsed time per iteration (s): 0.16 | learning rate: 8.295E-05 | global batch size: 256 | lm loss: 3.696689E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.581 | TFLOPs: 25.43 | +7: iteration 104320/ 173500 | consumed samples: 26705920 | consumed tokens: 54693724160 | elapsed time per iteration (s): 0.16 | learning rate: 8.293E-05 | global batch size: 256 | lm loss: 3.673353E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.145 | TFLOPs: 24.70 | +7: iteration 104330/ 173500 | consumed samples: 26708480 | consumed tokens: 54698967040 | elapsed time per iteration (s): 0.16 | learning rate: 8.292E-05 | global batch size: 256 | lm loss: 3.676574E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.913 | TFLOPs: 25.26 | +7: iteration 104340/ 173500 | consumed samples: 26711040 | consumed tokens: 54704209920 | elapsed time per iteration (s): 0.16 | learning rate: 8.290E-05 | global batch size: 256 | lm loss: 3.668804E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.804 | TFLOPs: 25.28 | +7: iteration 104350/ 173500 | consumed samples: 26713600 | consumed tokens: 54709452800 | elapsed time per iteration (s): 0.16 | learning rate: 8.289E-05 | global batch size: 256 | lm loss: 3.686871E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.460 | TFLOPs: 25.15 | +7: iteration 104360/ 173500 | consumed samples: 26716160 | consumed tokens: 54714695680 | elapsed time per iteration (s): 0.16 | learning rate: 8.287E-05 | global batch size: 256 | lm loss: 3.689457E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.164 | TFLOPs: 24.58 | +7: iteration 104370/ 173500 | consumed samples: 26718720 | consumed tokens: 54719938560 | elapsed time per iteration (s): 0.16 | learning rate: 8.285E-05 | global batch size: 256 | lm loss: 3.687525E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.298 | TFLOPs: 25.36 | +7: iteration 104380/ 173500 | consumed samples: 26721280 | consumed tokens: 54725181440 | elapsed time per iteration (s): 0.17 | learning rate: 8.284E-05 | global batch size: 256 | lm loss: 3.690336E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.937 | TFLOPs: 24.28 | +7: iteration 104390/ 173500 | consumed samples: 26723840 | consumed tokens: 54730424320 | elapsed time per iteration (s): 0.17 | learning rate: 8.282E-05 | global batch size: 256 | lm loss: 3.695397E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.198 | TFLOPs: 23.84 | +7: iteration 104400/ 173500 | consumed samples: 26726400 | consumed tokens: 54735667200 | elapsed time per iteration (s): 0.16 | learning rate: 8.281E-05 | global batch size: 256 | lm loss: 3.684665E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.417 | TFLOPs: 25.73 | +7: iteration 104410/ 173500 | consumed samples: 26728960 | consumed tokens: 54740910080 | elapsed time per iteration (s): 0.17 | learning rate: 8.279E-05 | global batch size: 256 | lm loss: 3.674761E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.571 | TFLOPs: 23.91 | +7: iteration 104420/ 173500 | consumed samples: 26731520 | consumed tokens: 54746152960 | elapsed time per iteration (s): 0.16 | learning rate: 8.278E-05 | global batch size: 256 | lm loss: 3.688600E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.929 | TFLOPs: 24.34 | +7: iteration 104430/ 173500 | consumed samples: 26734080 | consumed tokens: 54751395840 | elapsed time per iteration (s): 0.16 | learning rate: 8.276E-05 | global batch size: 256 | lm loss: 3.687198E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.665 | TFLOPs: 24.80 | +7: iteration 104440/ 173500 | consumed samples: 26736640 | consumed tokens: 54756638720 | elapsed time per iteration (s): 0.16 | learning rate: 8.274E-05 | global batch size: 256 | lm loss: 3.688046E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.747 | TFLOPs: 25.39 | +7: iteration 104450/ 173500 | consumed samples: 26739200 | consumed tokens: 54761881600 | elapsed time per iteration (s): 0.16 | learning rate: 8.273E-05 | global batch size: 256 | lm loss: 3.693379E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.506 | TFLOPs: 25.51 | +7: iteration 104460/ 173500 | consumed samples: 26741760 | consumed tokens: 54767124480 | elapsed time per iteration (s): 0.16 | learning rate: 8.271E-05 | global batch size: 256 | lm loss: 3.692676E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.857 | TFLOPs: 25.17 | +7: iteration 104470/ 173500 | consumed samples: 26744320 | consumed tokens: 54772367360 | elapsed time per iteration (s): 0.15 | learning rate: 8.270E-05 | global batch size: 256 | lm loss: 3.687033E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.791 | TFLOPs: 25.90 | +7: iteration 104480/ 173500 | consumed samples: 26746880 | consumed tokens: 54777610240 | elapsed time per iteration (s): 0.16 | learning rate: 8.268E-05 | global batch size: 256 | lm loss: 3.683422E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.860 | TFLOPs: 25.65 | +7: iteration 104490/ 173500 | consumed samples: 26749440 | consumed tokens: 54782853120 | elapsed time per iteration (s): 0.16 | learning rate: 8.267E-05 | global batch size: 256 | lm loss: 3.693466E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.520 | TFLOPs: 25.71 | +7: iteration 104500/ 173500 | consumed samples: 26752000 | consumed tokens: 54788096000 | elapsed time per iteration (s): 0.17 | learning rate: 8.265E-05 | global batch size: 256 | lm loss: 3.694316E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.140 | TFLOPs: 23.45 | +7: iteration 104510/ 173500 | consumed samples: 26754560 | consumed tokens: 54793338880 | elapsed time per iteration (s): 0.17 | learning rate: 8.263E-05 | global batch size: 256 | lm loss: 3.691492E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.578 | TFLOPs: 24.00 | +7: iteration 104520/ 173500 | consumed samples: 26757120 | consumed tokens: 54798581760 | elapsed time per iteration (s): 0.17 | learning rate: 8.262E-05 | global batch size: 256 | lm loss: 3.699668E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.458 | TFLOPs: 24.22 | +7: iteration 104530/ 173500 | consumed samples: 26759680 | consumed tokens: 54803824640 | elapsed time per iteration (s): 0.17 | learning rate: 8.260E-05 | global batch size: 256 | lm loss: 3.688684E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.741 | TFLOPs: 24.21 | +7: iteration 104540/ 173500 | consumed samples: 26762240 | consumed tokens: 54809067520 | elapsed time per iteration (s): 0.16 | learning rate: 8.259E-05 | global batch size: 256 | lm loss: 3.684834E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.209 | TFLOPs: 25.33 | +7: iteration 104550/ 173500 | consumed samples: 26764800 | consumed tokens: 54814310400 | elapsed time per iteration (s): 0.16 | learning rate: 8.257E-05 | global batch size: 256 | lm loss: 3.687270E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.861 | TFLOPs: 24.60 | +7: iteration 104560/ 173500 | consumed samples: 26767360 | consumed tokens: 54819553280 | elapsed time per iteration (s): 0.16 | learning rate: 8.256E-05 | global batch size: 256 | lm loss: 3.698031E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.740 | TFLOPs: 25.07 | +7: iteration 104570/ 173500 | consumed samples: 26769920 | consumed tokens: 54824796160 | elapsed time per iteration (s): 0.16 | learning rate: 8.254E-05 | global batch size: 256 | lm loss: 3.689107E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.988 | TFLOPs: 25.89 | +7: iteration 104580/ 173500 | consumed samples: 26772480 | consumed tokens: 54830039040 | elapsed time per iteration (s): 0.16 | learning rate: 8.252E-05 | global batch size: 256 | lm loss: 3.707119E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.804 | TFLOPs: 25.18 | +7: iteration 104590/ 173500 | consumed samples: 26775040 | consumed tokens: 54835281920 | elapsed time per iteration (s): 0.16 | learning rate: 8.251E-05 | global batch size: 256 | lm loss: 3.681528E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.371 | TFLOPs: 25.69 | +7: iteration 104600/ 173500 | consumed samples: 26777600 | consumed tokens: 54840524800 | elapsed time per iteration (s): 0.16 | learning rate: 8.249E-05 | global batch size: 256 | lm loss: 3.707699E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.026 | TFLOPs: 25.06 | +7: iteration 104610/ 173500 | consumed samples: 26780160 | consumed tokens: 54845767680 | elapsed time per iteration (s): 0.16 | learning rate: 8.248E-05 | global batch size: 256 | lm loss: 3.691588E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.268 | TFLOPs: 25.41 | +7: iteration 104620/ 173500 | consumed samples: 26782720 | consumed tokens: 54851010560 | elapsed time per iteration (s): 0.16 | learning rate: 8.246E-05 | global batch size: 256 | lm loss: 3.683392E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.640 | TFLOPs: 24.41 | +7: iteration 104630/ 173500 | consumed samples: 26785280 | consumed tokens: 54856253440 | elapsed time per iteration (s): 0.16 | learning rate: 8.245E-05 | global batch size: 256 | lm loss: 3.694122E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.675 | TFLOPs: 25.09 | +7: iteration 104640/ 173500 | consumed samples: 26787840 | consumed tokens: 54861496320 | elapsed time per iteration (s): 0.17 | learning rate: 8.243E-05 | global batch size: 256 | lm loss: 3.692013E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.928 | TFLOPs: 24.24 | +7: iteration 104650/ 173500 | consumed samples: 26790400 | consumed tokens: 54866739200 | elapsed time per iteration (s): 0.17 | learning rate: 8.241E-05 | global batch size: 256 | lm loss: 3.688534E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.431 | TFLOPs: 23.00 | +7: iteration 104660/ 173500 | consumed samples: 26792960 | consumed tokens: 54871982080 | elapsed time per iteration (s): 0.16 | learning rate: 8.240E-05 | global batch size: 256 | lm loss: 3.689350E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.001 | TFLOPs: 25.17 | +7: iteration 104670/ 173500 | consumed samples: 26795520 | consumed tokens: 54877224960 | elapsed time per iteration (s): 0.16 | learning rate: 8.238E-05 | global batch size: 256 | lm loss: 3.691576E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.704 | TFLOPs: 24.73 | +7: iteration 104680/ 173500 | consumed samples: 26798080 | consumed tokens: 54882467840 | elapsed time per iteration (s): 0.17 | learning rate: 8.237E-05 | global batch size: 256 | lm loss: 3.682304E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.782 | TFLOPs: 23.96 | +7: iteration 104690/ 173500 | consumed samples: 26800640 | consumed tokens: 54887710720 | elapsed time per iteration (s): 0.16 | learning rate: 8.235E-05 | global batch size: 256 | lm loss: 3.677221E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.922 | TFLOPs: 25.22 | +7: iteration 104700/ 173500 | consumed samples: 26803200 | consumed tokens: 54892953600 | elapsed time per iteration (s): 0.16 | learning rate: 8.234E-05 | global batch size: 256 | lm loss: 3.694469E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.260 | TFLOPs: 24.56 | +7: iteration 104710/ 173500 | consumed samples: 26805760 | consumed tokens: 54898196480 | elapsed time per iteration (s): 0.16 | learning rate: 8.232E-05 | global batch size: 256 | lm loss: 3.711398E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.842 | TFLOPs: 24.54 | +7: iteration 104720/ 173500 | consumed samples: 26808320 | consumed tokens: 54903439360 | elapsed time per iteration (s): 0.15 | learning rate: 8.230E-05 | global batch size: 256 | lm loss: 3.678171E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.867 | TFLOPs: 26.06 | +7: iteration 104730/ 173500 | consumed samples: 26810880 | consumed tokens: 54908682240 | elapsed time per iteration (s): 0.16 | learning rate: 8.229E-05 | global batch size: 256 | lm loss: 3.688600E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.850 | TFLOPs: 24.71 | +7: iteration 104740/ 173500 | consumed samples: 26813440 | consumed tokens: 54913925120 | elapsed time per iteration (s): 0.16 | learning rate: 8.227E-05 | global batch size: 256 | lm loss: 3.686787E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.127 | TFLOPs: 25.41 | +7: iteration 104750/ 173500 | consumed samples: 26816000 | consumed tokens: 54919168000 | elapsed time per iteration (s): 0.16 | learning rate: 8.226E-05 | global batch size: 256 | lm loss: 3.681052E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.061 | TFLOPs: 24.42 | +7: iteration 104760/ 173500 | consumed samples: 26818560 | consumed tokens: 54924410880 | elapsed time per iteration (s): 0.16 | learning rate: 8.224E-05 | global batch size: 256 | lm loss: 3.702224E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.413 | TFLOPs: 24.96 | +7: iteration 104770/ 173500 | consumed samples: 26821120 | consumed tokens: 54929653760 | elapsed time per iteration (s): 0.16 | learning rate: 8.223E-05 | global batch size: 256 | lm loss: 3.681723E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.025 | TFLOPs: 25.33 | +7: iteration 104780/ 173500 | consumed samples: 26823680 | consumed tokens: 54934896640 | elapsed time per iteration (s): 0.17 | learning rate: 8.221E-05 | global batch size: 256 | lm loss: 3.699469E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.904 | TFLOPs: 23.69 | +7: iteration 104790/ 173500 | consumed samples: 26826240 | consumed tokens: 54940139520 | elapsed time per iteration (s): 0.16 | learning rate: 8.220E-05 | global batch size: 256 | lm loss: 3.698458E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.006 | TFLOPs: 25.30 | +7: iteration 104800/ 173500 | consumed samples: 26828800 | consumed tokens: 54945382400 | elapsed time per iteration (s): 0.16 | learning rate: 8.218E-05 | global batch size: 256 | lm loss: 3.685128E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.328 | TFLOPs: 24.88 | +7: iteration 104810/ 173500 | consumed samples: 26831360 | consumed tokens: 54950625280 | elapsed time per iteration (s): 0.16 | learning rate: 8.216E-05 | global batch size: 256 | lm loss: 3.702464E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.218 | TFLOPs: 25.75 | +7: iteration 104820/ 173500 | consumed samples: 26833920 | consumed tokens: 54955868160 | elapsed time per iteration (s): 0.16 | learning rate: 8.215E-05 | global batch size: 256 | lm loss: 3.695364E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.235 | TFLOPs: 24.91 | +7: iteration 104830/ 173500 | consumed samples: 26836480 | consumed tokens: 54961111040 | elapsed time per iteration (s): 0.16 | learning rate: 8.213E-05 | global batch size: 256 | lm loss: 3.703202E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.224 | TFLOPs: 25.17 | +7: iteration 104840/ 173500 | consumed samples: 26839040 | consumed tokens: 54966353920 | elapsed time per iteration (s): 0.16 | learning rate: 8.212E-05 | global batch size: 256 | lm loss: 3.696680E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.421 | TFLOPs: 25.38 | +7: iteration 104850/ 173500 | consumed samples: 26841600 | consumed tokens: 54971596800 | elapsed time per iteration (s): 0.16 | learning rate: 8.210E-05 | global batch size: 256 | lm loss: 3.691150E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.108 | TFLOPs: 24.83 | +7: iteration 104860/ 173500 | consumed samples: 26844160 | consumed tokens: 54976839680 | elapsed time per iteration (s): 0.15 | learning rate: 8.209E-05 | global batch size: 256 | lm loss: 3.687280E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.670 | TFLOPs: 26.28 | +7: iteration 104870/ 173500 | consumed samples: 26846720 | consumed tokens: 54982082560 | elapsed time per iteration (s): 0.16 | learning rate: 8.207E-05 | global batch size: 256 | lm loss: 3.677857E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.967 | TFLOPs: 24.78 | +7: iteration 104880/ 173500 | consumed samples: 26849280 | consumed tokens: 54987325440 | elapsed time per iteration (s): 0.16 | learning rate: 8.205E-05 | global batch size: 256 | lm loss: 3.695297E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.092 | TFLOPs: 25.05 | +7: iteration 104890/ 173500 | consumed samples: 26851840 | consumed tokens: 54992568320 | elapsed time per iteration (s): 0.16 | learning rate: 8.204E-05 | global batch size: 256 | lm loss: 3.693813E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.560 | TFLOPs: 24.77 | +7: iteration 104900/ 173500 | consumed samples: 26854400 | consumed tokens: 54997811200 | elapsed time per iteration (s): 0.16 | learning rate: 8.202E-05 | global batch size: 256 | lm loss: 3.702434E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.851 | TFLOPs: 25.72 | +7: iteration 104910/ 173500 | consumed samples: 26856960 | consumed tokens: 55003054080 | elapsed time per iteration (s): 0.15 | learning rate: 8.201E-05 | global batch size: 256 | lm loss: 3.686590E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.107 | TFLOPs: 26.18 | +7: iteration 104920/ 173500 | consumed samples: 26859520 | consumed tokens: 55008296960 | elapsed time per iteration (s): 0.16 | learning rate: 8.199E-05 | global batch size: 256 | lm loss: 3.683741E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.377 | TFLOPs: 25.52 | +7: iteration 104930/ 173500 | consumed samples: 26862080 | consumed tokens: 55013539840 | elapsed time per iteration (s): 0.16 | learning rate: 8.198E-05 | global batch size: 256 | lm loss: 3.669968E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.220 | TFLOPs: 25.63 | +7: iteration 104940/ 173500 | consumed samples: 26864640 | consumed tokens: 55018782720 | elapsed time per iteration (s): 0.16 | learning rate: 8.196E-05 | global batch size: 256 | lm loss: 3.690130E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.630 | TFLOPs: 24.91 | +7: iteration 104950/ 173500 | consumed samples: 26867200 | consumed tokens: 55024025600 | elapsed time per iteration (s): 0.16 | learning rate: 8.194E-05 | global batch size: 256 | lm loss: 3.692306E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.320 | TFLOPs: 25.22 | +7: iteration 104960/ 173500 | consumed samples: 26869760 | consumed tokens: 55029268480 | elapsed time per iteration (s): 0.16 | learning rate: 8.193E-05 | global batch size: 256 | lm loss: 3.680896E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.865 | TFLOPs: 25.62 | +7: iteration 104970/ 173500 | consumed samples: 26872320 | consumed tokens: 55034511360 | elapsed time per iteration (s): 0.16 | learning rate: 8.191E-05 | global batch size: 256 | lm loss: 3.684523E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.396 | TFLOPs: 25.62 | +7: iteration 104980/ 173500 | consumed samples: 26874880 | consumed tokens: 55039754240 | elapsed time per iteration (s): 0.16 | learning rate: 8.190E-05 | global batch size: 256 | lm loss: 3.700846E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.647 | TFLOPs: 25.87 | +7: iteration 104990/ 173500 | consumed samples: 26877440 | consumed tokens: 55044997120 | elapsed time per iteration (s): 0.16 | learning rate: 8.188E-05 | global batch size: 256 | lm loss: 3.680323E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.659 | TFLOPs: 25.68 | +7: iteration 105000/ 173500 | consumed samples: 26880000 | consumed tokens: 55050240000 | elapsed time per iteration (s): 0.16 | learning rate: 8.187E-05 | global batch size: 256 | lm loss: 3.675390E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.095 | TFLOPs: 24.61 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 105000 | lm loss value: 3.832714E+00 | lm loss PPL: 4.618773E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 105000 to checkpoints_44m91b100m +0: [2023-03-17 04:48:44,932] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step105000 is begin to save! +0: [2023-03-17 04:48:44,937] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:48:44,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:48:44,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:48:45,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:48:45,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:48:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:48:45,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:48:45,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:48:45,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:48:45,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:48:45,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:48:45,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:48:45,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:48:45,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:48:45,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:48:45,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:48:45,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:48:45,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:48:45,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:48:45,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:48:45,073] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step105000/mp_rank_00_model_states.pt +0: [2023-03-17 04:48:45,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:48:45,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:48:45,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:48:45,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:48:45,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +3: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +4: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 04:48:45,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +2: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +5: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +6: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +7: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:48:45,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:48:45,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +1: [2023-03-17 04:48:45,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:48:45,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step105000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:48:45,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step105000 is ready now! +0: successfully saved checkpoint at iteration 105000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 211.73 +7: iteration 105010/ 173500 | consumed samples: 26882560 | consumed tokens: 55055482880 | elapsed time per iteration (s): 0.18 | learning rate: 8.185E-05 | global batch size: 256 | lm loss: 3.689626E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.014 | TFLOPs: 22.05 | +7: iteration 105020/ 173500 | consumed samples: 26885120 | consumed tokens: 55060725760 | elapsed time per iteration (s): 0.16 | learning rate: 8.184E-05 | global batch size: 256 | lm loss: 3.686430E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.040 | TFLOPs: 25.23 | +7: iteration 105030/ 173500 | consumed samples: 26887680 | consumed tokens: 55065968640 | elapsed time per iteration (s): 0.16 | learning rate: 8.182E-05 | global batch size: 256 | lm loss: 3.689056E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.497 | TFLOPs: 25.34 | +7: iteration 105040/ 173500 | consumed samples: 26890240 | consumed tokens: 55071211520 | elapsed time per iteration (s): 0.16 | learning rate: 8.180E-05 | global batch size: 256 | lm loss: 3.686932E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.311 | TFLOPs: 25.72 | +7: iteration 105050/ 173500 | consumed samples: 26892800 | consumed tokens: 55076454400 | elapsed time per iteration (s): 0.16 | learning rate: 8.179E-05 | global batch size: 256 | lm loss: 3.681675E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.453 | TFLOPs: 25.80 | +7: iteration 105060/ 173500 | consumed samples: 26895360 | consumed tokens: 55081697280 | elapsed time per iteration (s): 0.16 | learning rate: 8.177E-05 | global batch size: 256 | lm loss: 3.697353E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.536 | TFLOPs: 25.52 | +7: iteration 105070/ 173500 | consumed samples: 26897920 | consumed tokens: 55086940160 | elapsed time per iteration (s): 0.16 | learning rate: 8.176E-05 | global batch size: 256 | lm loss: 3.680601E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.331 | TFLOPs: 25.71 | +7: iteration 105080/ 173500 | consumed samples: 26900480 | consumed tokens: 55092183040 | elapsed time per iteration (s): 0.16 | learning rate: 8.174E-05 | global batch size: 256 | lm loss: 3.682792E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.388 | TFLOPs: 25.76 | +7: iteration 105090/ 173500 | consumed samples: 26903040 | consumed tokens: 55097425920 | elapsed time per iteration (s): 0.16 | learning rate: 8.173E-05 | global batch size: 256 | lm loss: 3.675476E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.579 | TFLOPs: 24.38 | +7: iteration 105100/ 173500 | consumed samples: 26905600 | consumed tokens: 55102668800 | elapsed time per iteration (s): 0.15 | learning rate: 8.171E-05 | global batch size: 256 | lm loss: 3.685157E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.214 | TFLOPs: 26.08 | +7: iteration 105110/ 173500 | consumed samples: 26908160 | consumed tokens: 55107911680 | elapsed time per iteration (s): 0.16 | learning rate: 8.169E-05 | global batch size: 256 | lm loss: 3.698382E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.953 | TFLOPs: 25.36 | +7: iteration 105120/ 173500 | consumed samples: 26910720 | consumed tokens: 55113154560 | elapsed time per iteration (s): 0.15 | learning rate: 8.168E-05 | global batch size: 256 | lm loss: 3.684943E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.663 | TFLOPs: 26.12 | +7: iteration 105130/ 173500 | consumed samples: 26913280 | consumed tokens: 55118397440 | elapsed time per iteration (s): 0.16 | learning rate: 8.166E-05 | global batch size: 256 | lm loss: 3.677468E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.990 | TFLOPs: 25.00 | +7: iteration 105140/ 173500 | consumed samples: 26915840 | consumed tokens: 55123640320 | elapsed time per iteration (s): 0.16 | learning rate: 8.165E-05 | global batch size: 256 | lm loss: 3.699124E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.751 | TFLOPs: 24.81 | +7: iteration 105150/ 173500 | consumed samples: 26918400 | consumed tokens: 55128883200 | elapsed time per iteration (s): 0.16 | learning rate: 8.163E-05 | global batch size: 256 | lm loss: 3.685915E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.894 | TFLOPs: 25.15 | +7: iteration 105160/ 173500 | consumed samples: 26920960 | consumed tokens: 55134126080 | elapsed time per iteration (s): 0.17 | learning rate: 8.162E-05 | global batch size: 256 | lm loss: 3.686595E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.084 | TFLOPs: 24.04 | +7: iteration 105170/ 173500 | consumed samples: 26923520 | consumed tokens: 55139368960 | elapsed time per iteration (s): 0.16 | learning rate: 8.160E-05 | global batch size: 256 | lm loss: 3.682612E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.196 | TFLOPs: 25.75 | +7: iteration 105180/ 173500 | consumed samples: 26926080 | consumed tokens: 55144611840 | elapsed time per iteration (s): 0.16 | learning rate: 8.159E-05 | global batch size: 256 | lm loss: 3.694504E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.937 | TFLOPs: 25.08 | +7: iteration 105190/ 173500 | consumed samples: 26928640 | consumed tokens: 55149854720 | elapsed time per iteration (s): 0.17 | learning rate: 8.157E-05 | global batch size: 256 | lm loss: 3.689022E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.782 | TFLOPs: 23.94 | +7: iteration 105200/ 173500 | consumed samples: 26931200 | consumed tokens: 55155097600 | elapsed time per iteration (s): 0.16 | learning rate: 8.155E-05 | global batch size: 256 | lm loss: 3.676632E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.451 | TFLOPs: 25.84 | +7: iteration 105210/ 173500 | consumed samples: 26933760 | consumed tokens: 55160340480 | elapsed time per iteration (s): 0.16 | learning rate: 8.154E-05 | global batch size: 256 | lm loss: 3.679254E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.660 | TFLOPs: 25.27 | +7: iteration 105220/ 173500 | consumed samples: 26936320 | consumed tokens: 55165583360 | elapsed time per iteration (s): 0.16 | learning rate: 8.152E-05 | global batch size: 256 | lm loss: 3.670638E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.555 | TFLOPs: 25.82 | +7: iteration 105230/ 173500 | consumed samples: 26938880 | consumed tokens: 55170826240 | elapsed time per iteration (s): 0.15 | learning rate: 8.151E-05 | global batch size: 256 | lm loss: 3.688817E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.382 | TFLOPs: 26.15 | +7: iteration 105240/ 173500 | consumed samples: 26941440 | consumed tokens: 55176069120 | elapsed time per iteration (s): 0.16 | learning rate: 8.149E-05 | global batch size: 256 | lm loss: 3.695208E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.102 | TFLOPs: 24.70 | +7: iteration 105250/ 173500 | consumed samples: 26944000 | consumed tokens: 55181312000 | elapsed time per iteration (s): 0.15 | learning rate: 8.148E-05 | global batch size: 256 | lm loss: 3.696208E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.287 | TFLOPs: 26.13 | +7: iteration 105260/ 173500 | consumed samples: 26946560 | consumed tokens: 55186554880 | elapsed time per iteration (s): 0.15 | learning rate: 8.146E-05 | global batch size: 256 | lm loss: 3.701081E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.251 | TFLOPs: 26.13 | +7: iteration 105270/ 173500 | consumed samples: 26949120 | consumed tokens: 55191797760 | elapsed time per iteration (s): 0.16 | learning rate: 8.144E-05 | global batch size: 256 | lm loss: 3.684756E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.273 | TFLOPs: 25.16 | +7: iteration 105280/ 173500 | consumed samples: 26951680 | consumed tokens: 55197040640 | elapsed time per iteration (s): 0.16 | learning rate: 8.143E-05 | global batch size: 256 | lm loss: 3.680636E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.808 | TFLOPs: 25.62 | +7: iteration 105290/ 173500 | consumed samples: 26954240 | consumed tokens: 55202283520 | elapsed time per iteration (s): 0.16 | learning rate: 8.141E-05 | global batch size: 256 | lm loss: 3.680957E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.776 | TFLOPs: 25.01 | +7: iteration 105300/ 173500 | consumed samples: 26956800 | consumed tokens: 55207526400 | elapsed time per iteration (s): 0.16 | learning rate: 8.140E-05 | global batch size: 256 | lm loss: 3.690407E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.679 | TFLOPs: 25.78 | +7: iteration 105310/ 173500 | consumed samples: 26959360 | consumed tokens: 55212769280 | elapsed time per iteration (s): 0.16 | learning rate: 8.138E-05 | global batch size: 256 | lm loss: 3.681144E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.228 | TFLOPs: 24.63 | +7: iteration 105320/ 173500 | consumed samples: 26961920 | consumed tokens: 55218012160 | elapsed time per iteration (s): 0.17 | learning rate: 8.137E-05 | global batch size: 256 | lm loss: 3.681494E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.598 | TFLOPs: 24.10 | +7: iteration 105330/ 173500 | consumed samples: 26964480 | consumed tokens: 55223255040 | elapsed time per iteration (s): 0.15 | learning rate: 8.135E-05 | global batch size: 256 | lm loss: 3.673952E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.802 | TFLOPs: 26.14 | +7: iteration 105340/ 173500 | consumed samples: 26967040 | consumed tokens: 55228497920 | elapsed time per iteration (s): 0.15 | learning rate: 8.134E-05 | global batch size: 256 | lm loss: 3.686204E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.474 | TFLOPs: 26.07 | +7: iteration 105350/ 173500 | consumed samples: 26969600 | consumed tokens: 55233740800 | elapsed time per iteration (s): 0.15 | learning rate: 8.132E-05 | global batch size: 256 | lm loss: 3.676127E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.719 | TFLOPs: 26.08 | +7: iteration 105360/ 173500 | consumed samples: 26972160 | consumed tokens: 55238983680 | elapsed time per iteration (s): 0.16 | learning rate: 8.130E-05 | global batch size: 256 | lm loss: 3.697675E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.468 | TFLOPs: 24.96 | +7: iteration 105370/ 173500 | consumed samples: 26974720 | consumed tokens: 55244226560 | elapsed time per iteration (s): 0.16 | learning rate: 8.129E-05 | global batch size: 256 | lm loss: 3.684859E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.018 | TFLOPs: 25.53 | +7: iteration 105380/ 173500 | consumed samples: 26977280 | consumed tokens: 55249469440 | elapsed time per iteration (s): 0.16 | learning rate: 8.127E-05 | global batch size: 256 | lm loss: 3.683720E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.615 | TFLOPs: 24.76 | +7: iteration 105390/ 173500 | consumed samples: 26979840 | consumed tokens: 55254712320 | elapsed time per iteration (s): 0.15 | learning rate: 8.126E-05 | global batch size: 256 | lm loss: 3.692003E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.261 | TFLOPs: 26.10 | +7: iteration 105400/ 173500 | consumed samples: 26982400 | consumed tokens: 55259955200 | elapsed time per iteration (s): 0.16 | learning rate: 8.124E-05 | global batch size: 256 | lm loss: 3.695423E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.304 | TFLOPs: 25.36 | +7: iteration 105410/ 173500 | consumed samples: 26984960 | consumed tokens: 55265198080 | elapsed time per iteration (s): 0.16 | learning rate: 8.123E-05 | global batch size: 256 | lm loss: 3.683405E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.223 | TFLOPs: 25.47 | +7: iteration 105420/ 173500 | consumed samples: 26987520 | consumed tokens: 55270440960 | elapsed time per iteration (s): 0.15 | learning rate: 8.121E-05 | global batch size: 256 | lm loss: 3.675178E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.411 | TFLOPs: 26.09 | +7: iteration 105430/ 173500 | consumed samples: 26990080 | consumed tokens: 55275683840 | elapsed time per iteration (s): 0.18 | learning rate: 8.120E-05 | global batch size: 256 | lm loss: 3.678996E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.252 | TFLOPs: 22.76 | +7: iteration 105440/ 173500 | consumed samples: 26992640 | consumed tokens: 55280926720 | elapsed time per iteration (s): 0.16 | learning rate: 8.118E-05 | global batch size: 256 | lm loss: 3.685622E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.467 | TFLOPs: 25.73 | +7: iteration 105450/ 173500 | consumed samples: 26995200 | consumed tokens: 55286169600 | elapsed time per iteration (s): 0.15 | learning rate: 8.116E-05 | global batch size: 256 | lm loss: 3.695885E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.411 | TFLOPs: 26.09 | +7: iteration 105460/ 173500 | consumed samples: 26997760 | consumed tokens: 55291412480 | elapsed time per iteration (s): 0.16 | learning rate: 8.115E-05 | global batch size: 256 | lm loss: 3.678127E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.997 | TFLOPs: 25.70 | +7: iteration 105470/ 173500 | consumed samples: 27000320 | consumed tokens: 55296655360 | elapsed time per iteration (s): 0.16 | learning rate: 8.113E-05 | global batch size: 256 | lm loss: 3.687993E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.949 | TFLOPs: 25.62 | +7: iteration 105480/ 173500 | consumed samples: 27002880 | consumed tokens: 55301898240 | elapsed time per iteration (s): 0.16 | learning rate: 8.112E-05 | global batch size: 256 | lm loss: 3.689400E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.541 | TFLOPs: 25.40 | +7: iteration 105490/ 173500 | consumed samples: 27005440 | consumed tokens: 55307141120 | elapsed time per iteration (s): 0.16 | learning rate: 8.110E-05 | global batch size: 256 | lm loss: 3.688821E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.184 | TFLOPs: 25.74 | +7: iteration 105500/ 173500 | consumed samples: 27008000 | consumed tokens: 55312384000 | elapsed time per iteration (s): 0.16 | learning rate: 8.109E-05 | global batch size: 256 | lm loss: 3.686317E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.558 | TFLOPs: 25.78 | +7: iteration 105510/ 173500 | consumed samples: 27010560 | consumed tokens: 55317626880 | elapsed time per iteration (s): 0.16 | learning rate: 8.107E-05 | global batch size: 256 | lm loss: 3.684095E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.003 | TFLOPs: 25.56 | +7: iteration 105520/ 173500 | consumed samples: 27013120 | consumed tokens: 55322869760 | elapsed time per iteration (s): 0.16 | learning rate: 8.105E-05 | global batch size: 256 | lm loss: 3.694281E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.775 | TFLOPs: 25.53 | +7: iteration 105530/ 173500 | consumed samples: 27015680 | consumed tokens: 55328112640 | elapsed time per iteration (s): 0.16 | learning rate: 8.104E-05 | global batch size: 256 | lm loss: 3.685783E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.338 | TFLOPs: 25.77 | +7: iteration 105540/ 173500 | consumed samples: 27018240 | consumed tokens: 55333355520 | elapsed time per iteration (s): 0.16 | learning rate: 8.102E-05 | global batch size: 256 | lm loss: 3.681023E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.705 | TFLOPs: 24.49 | +7: iteration 105550/ 173500 | consumed samples: 27020800 | consumed tokens: 55338598400 | elapsed time per iteration (s): 0.16 | learning rate: 8.101E-05 | global batch size: 256 | lm loss: 3.699970E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.072 | TFLOPs: 25.59 | +7: iteration 105560/ 173500 | consumed samples: 27023360 | consumed tokens: 55343841280 | elapsed time per iteration (s): 0.16 | learning rate: 8.099E-05 | global batch size: 256 | lm loss: 3.693443E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.184 | TFLOPs: 25.14 | +7: iteration 105570/ 173500 | consumed samples: 27025920 | consumed tokens: 55349084160 | elapsed time per iteration (s): 0.16 | learning rate: 8.098E-05 | global batch size: 256 | lm loss: 3.700940E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.448 | TFLOPs: 24.94 | +7: iteration 105580/ 173500 | consumed samples: 27028480 | consumed tokens: 55354327040 | elapsed time per iteration (s): 0.16 | learning rate: 8.096E-05 | global batch size: 256 | lm loss: 3.685555E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.590 | TFLOPs: 25.87 | +7: iteration 105590/ 173500 | consumed samples: 27031040 | consumed tokens: 55359569920 | elapsed time per iteration (s): 0.16 | learning rate: 8.095E-05 | global batch size: 256 | lm loss: 3.684076E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.964 | TFLOPs: 25.80 | +7: iteration 105600/ 173500 | consumed samples: 27033600 | consumed tokens: 55364812800 | elapsed time per iteration (s): 0.16 | learning rate: 8.093E-05 | global batch size: 256 | lm loss: 3.700507E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.238 | TFLOPs: 25.49 | +7: iteration 105610/ 173500 | consumed samples: 27036160 | consumed tokens: 55370055680 | elapsed time per iteration (s): 0.15 | learning rate: 8.091E-05 | global batch size: 256 | lm loss: 3.695496E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.473 | TFLOPs: 26.32 | +7: iteration 105620/ 173500 | consumed samples: 27038720 | consumed tokens: 55375298560 | elapsed time per iteration (s): 0.15 | learning rate: 8.090E-05 | global batch size: 256 | lm loss: 3.683037E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.803 | TFLOPs: 26.31 | +7: iteration 105630/ 173500 | consumed samples: 27041280 | consumed tokens: 55380541440 | elapsed time per iteration (s): 0.15 | learning rate: 8.088E-05 | global batch size: 256 | lm loss: 3.685331E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.855 | TFLOPs: 25.91 | +7: iteration 105640/ 173500 | consumed samples: 27043840 | consumed tokens: 55385784320 | elapsed time per iteration (s): 0.15 | learning rate: 8.087E-05 | global batch size: 256 | lm loss: 3.702099E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.530 | TFLOPs: 25.98 | +7: iteration 105650/ 173500 | consumed samples: 27046400 | consumed tokens: 55391027200 | elapsed time per iteration (s): 0.15 | learning rate: 8.085E-05 | global batch size: 256 | lm loss: 3.692692E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.193 | TFLOPs: 26.02 | +7: iteration 105660/ 173500 | consumed samples: 27048960 | consumed tokens: 55396270080 | elapsed time per iteration (s): 0.16 | learning rate: 8.084E-05 | global batch size: 256 | lm loss: 3.682861E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.166 | TFLOPs: 24.89 | +7: iteration 105670/ 173500 | consumed samples: 27051520 | consumed tokens: 55401512960 | elapsed time per iteration (s): 0.16 | learning rate: 8.082E-05 | global batch size: 256 | lm loss: 3.670585E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.285 | TFLOPs: 24.36 | +7: iteration 105680/ 173500 | consumed samples: 27054080 | consumed tokens: 55406755840 | elapsed time per iteration (s): 0.16 | learning rate: 8.081E-05 | global batch size: 256 | lm loss: 3.693155E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.006 | TFLOPs: 25.86 | +7: iteration 105690/ 173500 | consumed samples: 27056640 | consumed tokens: 55411998720 | elapsed time per iteration (s): 0.15 | learning rate: 8.079E-05 | global batch size: 256 | lm loss: 3.678154E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.836 | TFLOPs: 26.14 | +7: iteration 105700/ 173500 | consumed samples: 27059200 | consumed tokens: 55417241600 | elapsed time per iteration (s): 0.16 | learning rate: 8.077E-05 | global batch size: 256 | lm loss: 3.689563E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.982 | TFLOPs: 24.81 | +7: iteration 105710/ 173500 | consumed samples: 27061760 | consumed tokens: 55422484480 | elapsed time per iteration (s): 0.15 | learning rate: 8.076E-05 | global batch size: 256 | lm loss: 3.692801E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.048 | TFLOPs: 25.91 | +7: iteration 105720/ 173500 | consumed samples: 27064320 | consumed tokens: 55427727360 | elapsed time per iteration (s): 0.15 | learning rate: 8.074E-05 | global batch size: 256 | lm loss: 3.674519E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.314 | TFLOPs: 26.10 | +7: iteration 105730/ 173500 | consumed samples: 27066880 | consumed tokens: 55432970240 | elapsed time per iteration (s): 0.16 | learning rate: 8.073E-05 | global batch size: 256 | lm loss: 3.689132E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.833 | TFLOPs: 25.00 | +7: iteration 105740/ 173500 | consumed samples: 27069440 | consumed tokens: 55438213120 | elapsed time per iteration (s): 0.16 | learning rate: 8.071E-05 | global batch size: 256 | lm loss: 3.685758E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.628 | TFLOPs: 25.56 | +7: iteration 105750/ 173500 | consumed samples: 27072000 | consumed tokens: 55443456000 | elapsed time per iteration (s): 0.16 | learning rate: 8.070E-05 | global batch size: 256 | lm loss: 3.686139E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.793 | TFLOPs: 25.54 | +7: iteration 105760/ 173500 | consumed samples: 27074560 | consumed tokens: 55448698880 | elapsed time per iteration (s): 0.16 | learning rate: 8.068E-05 | global batch size: 256 | lm loss: 3.672723E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.375 | TFLOPs: 25.22 | +7: iteration 105770/ 173500 | consumed samples: 27077120 | consumed tokens: 55453941760 | elapsed time per iteration (s): 0.15 | learning rate: 8.067E-05 | global batch size: 256 | lm loss: 3.697941E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.945 | TFLOPs: 25.97 | +7: iteration 105780/ 173500 | consumed samples: 27079680 | consumed tokens: 55459184640 | elapsed time per iteration (s): 0.15 | learning rate: 8.065E-05 | global batch size: 256 | lm loss: 3.684248E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.824 | TFLOPs: 25.95 | +7: iteration 105790/ 173500 | consumed samples: 27082240 | consumed tokens: 55464427520 | elapsed time per iteration (s): 0.16 | learning rate: 8.063E-05 | global batch size: 256 | lm loss: 3.690319E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.284 | TFLOPs: 25.86 | +7: iteration 105800/ 173500 | consumed samples: 27084800 | consumed tokens: 55469670400 | elapsed time per iteration (s): 0.16 | learning rate: 8.062E-05 | global batch size: 256 | lm loss: 3.693901E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.047 | TFLOPs: 25.14 | +7: iteration 105810/ 173500 | consumed samples: 27087360 | consumed tokens: 55474913280 | elapsed time per iteration (s): 0.16 | learning rate: 8.060E-05 | global batch size: 256 | lm loss: 3.687318E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.544 | TFLOPs: 25.88 | +7: iteration 105820/ 173500 | consumed samples: 27089920 | consumed tokens: 55480156160 | elapsed time per iteration (s): 0.16 | learning rate: 8.059E-05 | global batch size: 256 | lm loss: 3.688123E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.908 | TFLOPs: 25.89 | +7: iteration 105830/ 173500 | consumed samples: 27092480 | consumed tokens: 55485399040 | elapsed time per iteration (s): 0.15 | learning rate: 8.057E-05 | global batch size: 256 | lm loss: 3.680682E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.018 | TFLOPs: 26.22 | +7: iteration 105840/ 173500 | consumed samples: 27095040 | consumed tokens: 55490641920 | elapsed time per iteration (s): 0.16 | learning rate: 8.056E-05 | global batch size: 256 | lm loss: 3.689381E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.697 | TFLOPs: 24.65 | +7: iteration 105850/ 173500 | consumed samples: 27097600 | consumed tokens: 55495884800 | elapsed time per iteration (s): 0.16 | learning rate: 8.054E-05 | global batch size: 256 | lm loss: 3.692234E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.494 | TFLOPs: 25.73 | +7: iteration 105860/ 173500 | consumed samples: 27100160 | consumed tokens: 55501127680 | elapsed time per iteration (s): 0.15 | learning rate: 8.053E-05 | global batch size: 256 | lm loss: 3.692601E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.221 | TFLOPs: 26.19 | +7: iteration 105870/ 173500 | consumed samples: 27102720 | consumed tokens: 55506370560 | elapsed time per iteration (s): 0.16 | learning rate: 8.051E-05 | global batch size: 256 | lm loss: 3.678701E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.031 | TFLOPs: 25.36 | +7: iteration 105880/ 173500 | consumed samples: 27105280 | consumed tokens: 55511613440 | elapsed time per iteration (s): 0.15 | learning rate: 8.049E-05 | global batch size: 256 | lm loss: 3.687000E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.880 | TFLOPs: 26.20 | +7: iteration 105890/ 173500 | consumed samples: 27107840 | consumed tokens: 55516856320 | elapsed time per iteration (s): 0.16 | learning rate: 8.048E-05 | global batch size: 256 | lm loss: 3.692344E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.530 | TFLOPs: 25.77 | +7: iteration 105900/ 173500 | consumed samples: 27110400 | consumed tokens: 55522099200 | elapsed time per iteration (s): 0.15 | learning rate: 8.046E-05 | global batch size: 256 | lm loss: 3.673486E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.571 | TFLOPs: 26.21 | +7: iteration 105910/ 173500 | consumed samples: 27112960 | consumed tokens: 55527342080 | elapsed time per iteration (s): 0.16 | learning rate: 8.045E-05 | global batch size: 256 | lm loss: 3.672160E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.975 | TFLOPs: 25.36 | +7: iteration 105920/ 173500 | consumed samples: 27115520 | consumed tokens: 55532584960 | elapsed time per iteration (s): 0.15 | learning rate: 8.043E-05 | global batch size: 256 | lm loss: 3.694139E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.030 | TFLOPs: 26.00 | +7: iteration 105930/ 173500 | consumed samples: 27118080 | consumed tokens: 55537827840 | elapsed time per iteration (s): 0.15 | learning rate: 8.042E-05 | global batch size: 256 | lm loss: 3.678303E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.345 | TFLOPs: 26.02 | +7: iteration 105940/ 173500 | consumed samples: 27120640 | consumed tokens: 55543070720 | elapsed time per iteration (s): 0.16 | learning rate: 8.040E-05 | global batch size: 256 | lm loss: 3.693720E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.462 | TFLOPs: 25.62 | +7: iteration 105950/ 173500 | consumed samples: 27123200 | consumed tokens: 55548313600 | elapsed time per iteration (s): 0.15 | learning rate: 8.039E-05 | global batch size: 256 | lm loss: 3.693774E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.369 | TFLOPs: 26.02 | +7: iteration 105960/ 173500 | consumed samples: 27125760 | consumed tokens: 55553556480 | elapsed time per iteration (s): 0.16 | learning rate: 8.037E-05 | global batch size: 256 | lm loss: 3.680452E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.372 | TFLOPs: 25.66 | +7: iteration 105970/ 173500 | consumed samples: 27128320 | consumed tokens: 55558799360 | elapsed time per iteration (s): 0.16 | learning rate: 8.035E-05 | global batch size: 256 | lm loss: 3.691845E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.955 | TFLOPs: 25.73 | +7: iteration 105980/ 173500 | consumed samples: 27130880 | consumed tokens: 55564042240 | elapsed time per iteration (s): 0.16 | learning rate: 8.034E-05 | global batch size: 256 | lm loss: 3.684346E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.771 | TFLOPs: 25.65 | +7: iteration 105990/ 173500 | consumed samples: 27133440 | consumed tokens: 55569285120 | elapsed time per iteration (s): 0.15 | learning rate: 8.032E-05 | global batch size: 256 | lm loss: 3.680494E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.123 | TFLOPs: 25.91 | +0: [2023-03-17 04:51:22,436] [INFO] [logging.py:68:log_dist] [Rank 0] step=106000, skipped=0, lr=[8.030787777917086e-05, 8.030787777917086e-05, 8.030787777917086e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 106000/ 173500 | consumed samples: 27136000 | consumed tokens: 55574528000 | elapsed time per iteration (s): 0.15 | learning rate: 8.031E-05 | global batch size: 256 | lm loss: 3.670278E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.222 | TFLOPs: 26.11 | +0: steps: 106000 loss: 3.6486 iter time (s): 0.158 samples/sec: 1625.276 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 106000 | lm loss value: 3.824562E+00 | lm loss PPL: 4.581274E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 106000 to checkpoints_44m91b100m +0: [2023-03-17 04:51:22,509] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step106000 is begin to save! +0: [2023-03-17 04:51:22,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:51:22,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:51:22,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:51:22,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:51:22,584] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:51:22,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:51:22,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:51:22,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:51:22,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:51:22,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:51:22,609] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:51:22,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:51:22,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:51:22,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:51:22,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:51:22,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:51:22,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:51:22,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:51:22,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:51:22,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:51:22,642] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step106000/mp_rank_00_model_states.pt +0: [2023-03-17 04:51:22,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:51:22,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:51:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:51:22,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +4: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +2: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:51:22,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +1: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +7: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +6: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:51:22,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:51:22,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +3: [2023-03-17 04:51:22,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:51:22,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step106000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:51:22,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step106000 is ready now! +0: successfully saved checkpoint at iteration 106000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.23 +7: iteration 106010/ 173500 | consumed samples: 27138560 | consumed tokens: 55579770880 | elapsed time per iteration (s): 0.18 | learning rate: 8.029E-05 | global batch size: 256 | lm loss: 3.690964E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.658 | TFLOPs: 22.20 | +7: iteration 106020/ 173500 | consumed samples: 27141120 | consumed tokens: 55585013760 | elapsed time per iteration (s): 0.17 | learning rate: 8.028E-05 | global batch size: 256 | lm loss: 3.685792E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1510.945 | TFLOPs: 23.70 | +7: iteration 106030/ 173500 | consumed samples: 27143680 | consumed tokens: 55590256640 | elapsed time per iteration (s): 0.16 | learning rate: 8.026E-05 | global batch size: 256 | lm loss: 3.695691E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.877 | TFLOPs: 25.59 | +7: iteration 106040/ 173500 | consumed samples: 27146240 | consumed tokens: 55595499520 | elapsed time per iteration (s): 0.16 | learning rate: 8.025E-05 | global batch size: 256 | lm loss: 3.680300E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.185 | TFLOPs: 24.58 | +7: iteration 106050/ 173500 | consumed samples: 27148800 | consumed tokens: 55600742400 | elapsed time per iteration (s): 0.16 | learning rate: 8.023E-05 | global batch size: 256 | lm loss: 3.677372E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.013 | TFLOPs: 25.22 | +7: iteration 106060/ 173500 | consumed samples: 27151360 | consumed tokens: 55605985280 | elapsed time per iteration (s): 0.15 | learning rate: 8.021E-05 | global batch size: 256 | lm loss: 3.690790E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.351 | TFLOPs: 25.96 | +7: iteration 106070/ 173500 | consumed samples: 27153920 | consumed tokens: 55611228160 | elapsed time per iteration (s): 0.16 | learning rate: 8.020E-05 | global batch size: 256 | lm loss: 3.693037E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.152 | TFLOPs: 25.78 | +7: iteration 106080/ 173500 | consumed samples: 27156480 | consumed tokens: 55616471040 | elapsed time per iteration (s): 0.15 | learning rate: 8.018E-05 | global batch size: 256 | lm loss: 3.673879E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.983 | TFLOPs: 26.30 | +7: iteration 106090/ 173500 | consumed samples: 27159040 | consumed tokens: 55621713920 | elapsed time per iteration (s): 0.16 | learning rate: 8.017E-05 | global batch size: 256 | lm loss: 3.703320E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.622 | TFLOPs: 25.82 | +7: iteration 106100/ 173500 | consumed samples: 27161600 | consumed tokens: 55626956800 | elapsed time per iteration (s): 0.16 | learning rate: 8.015E-05 | global batch size: 256 | lm loss: 3.691591E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.727 | TFLOPs: 25.68 | +7: iteration 106110/ 173500 | consumed samples: 27164160 | consumed tokens: 55632199680 | elapsed time per iteration (s): 0.16 | learning rate: 8.014E-05 | global batch size: 256 | lm loss: 3.675953E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.701 | TFLOPs: 25.87 | +7: iteration 106120/ 173500 | consumed samples: 27166720 | consumed tokens: 55637442560 | elapsed time per iteration (s): 0.16 | learning rate: 8.012E-05 | global batch size: 256 | lm loss: 3.698250E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.732 | TFLOPs: 25.39 | +7: iteration 106130/ 173500 | consumed samples: 27169280 | consumed tokens: 55642685440 | elapsed time per iteration (s): 0.16 | learning rate: 8.011E-05 | global batch size: 256 | lm loss: 3.682614E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.323 | TFLOPs: 24.88 | +7: iteration 106140/ 173500 | consumed samples: 27171840 | consumed tokens: 55647928320 | elapsed time per iteration (s): 0.16 | learning rate: 8.009E-05 | global batch size: 256 | lm loss: 3.692799E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.345 | TFLOPs: 25.43 | +7: iteration 106150/ 173500 | consumed samples: 27174400 | consumed tokens: 55653171200 | elapsed time per iteration (s): 0.16 | learning rate: 8.007E-05 | global batch size: 256 | lm loss: 3.691439E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.317 | TFLOPs: 25.82 | +7: iteration 106160/ 173500 | consumed samples: 27176960 | consumed tokens: 55658414080 | elapsed time per iteration (s): 0.15 | learning rate: 8.006E-05 | global batch size: 256 | lm loss: 3.693165E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.759 | TFLOPs: 26.15 | +7: iteration 106170/ 173500 | consumed samples: 27179520 | consumed tokens: 55663656960 | elapsed time per iteration (s): 0.15 | learning rate: 8.004E-05 | global batch size: 256 | lm loss: 3.680136E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.784 | TFLOPs: 26.16 | +7: iteration 106180/ 173500 | consumed samples: 27182080 | consumed tokens: 55668899840 | elapsed time per iteration (s): 0.15 | learning rate: 8.003E-05 | global batch size: 256 | lm loss: 3.698531E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.754 | TFLOPs: 26.04 | +7: iteration 106190/ 173500 | consumed samples: 27184640 | consumed tokens: 55674142720 | elapsed time per iteration (s): 0.15 | learning rate: 8.001E-05 | global batch size: 256 | lm loss: 3.681516E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.529 | TFLOPs: 26.15 | +7: iteration 106200/ 173500 | consumed samples: 27187200 | consumed tokens: 55679385600 | elapsed time per iteration (s): 0.16 | learning rate: 8.000E-05 | global batch size: 256 | lm loss: 3.684602E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.146 | TFLOPs: 25.74 | +7: iteration 106210/ 173500 | consumed samples: 27189760 | consumed tokens: 55684628480 | elapsed time per iteration (s): 0.16 | learning rate: 7.998E-05 | global batch size: 256 | lm loss: 3.688943E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.013 | TFLOPs: 25.03 | +7: iteration 106220/ 173500 | consumed samples: 27192320 | consumed tokens: 55689871360 | elapsed time per iteration (s): 0.16 | learning rate: 7.997E-05 | global batch size: 256 | lm loss: 3.683943E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.964 | TFLOPs: 25.59 | +7: iteration 106230/ 173500 | consumed samples: 27194880 | consumed tokens: 55695114240 | elapsed time per iteration (s): 0.16 | learning rate: 7.995E-05 | global batch size: 256 | lm loss: 3.685023E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.506 | TFLOPs: 25.43 | +7: iteration 106240/ 173500 | consumed samples: 27197440 | consumed tokens: 55700357120 | elapsed time per iteration (s): 0.15 | learning rate: 7.994E-05 | global batch size: 256 | lm loss: 3.696481E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.854 | TFLOPs: 26.14 | +7: iteration 106250/ 173500 | consumed samples: 27200000 | consumed tokens: 55705600000 | elapsed time per iteration (s): 0.15 | learning rate: 7.992E-05 | global batch size: 256 | lm loss: 3.678287E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.432 | TFLOPs: 26.17 | +7: iteration 106260/ 173500 | consumed samples: 27202560 | consumed tokens: 55710842880 | elapsed time per iteration (s): 0.16 | learning rate: 7.990E-05 | global batch size: 256 | lm loss: 3.689200E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.143 | TFLOPs: 25.41 | +7: iteration 106270/ 173500 | consumed samples: 27205120 | consumed tokens: 55716085760 | elapsed time per iteration (s): 0.16 | learning rate: 7.989E-05 | global batch size: 256 | lm loss: 3.684231E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.430 | TFLOPs: 25.76 | +7: iteration 106280/ 173500 | consumed samples: 27207680 | consumed tokens: 55721328640 | elapsed time per iteration (s): 0.15 | learning rate: 7.987E-05 | global batch size: 256 | lm loss: 3.685617E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.149 | TFLOPs: 26.13 | +7: iteration 106290/ 173500 | consumed samples: 27210240 | consumed tokens: 55726571520 | elapsed time per iteration (s): 0.15 | learning rate: 7.986E-05 | global batch size: 256 | lm loss: 3.682393E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.380 | TFLOPs: 25.98 | +7: iteration 106300/ 173500 | consumed samples: 27212800 | consumed tokens: 55731814400 | elapsed time per iteration (s): 0.16 | learning rate: 7.984E-05 | global batch size: 256 | lm loss: 3.686083E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.930 | TFLOPs: 25.66 | +7: iteration 106310/ 173500 | consumed samples: 27215360 | consumed tokens: 55737057280 | elapsed time per iteration (s): 0.15 | learning rate: 7.983E-05 | global batch size: 256 | lm loss: 3.698825E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.027 | TFLOPs: 26.14 | +7: iteration 106320/ 173500 | consumed samples: 27217920 | consumed tokens: 55742300160 | elapsed time per iteration (s): 0.16 | learning rate: 7.981E-05 | global batch size: 256 | lm loss: 3.687751E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.903 | TFLOPs: 25.40 | +7: iteration 106330/ 173500 | consumed samples: 27220480 | consumed tokens: 55747543040 | elapsed time per iteration (s): 0.16 | learning rate: 7.980E-05 | global batch size: 256 | lm loss: 3.691016E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.924 | TFLOPs: 25.69 | +7: iteration 106340/ 173500 | consumed samples: 27223040 | consumed tokens: 55752785920 | elapsed time per iteration (s): 0.15 | learning rate: 7.978E-05 | global batch size: 256 | lm loss: 3.692532E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.256 | TFLOPs: 26.12 | +7: iteration 106350/ 173500 | consumed samples: 27225600 | consumed tokens: 55758028800 | elapsed time per iteration (s): 0.16 | learning rate: 7.976E-05 | global batch size: 256 | lm loss: 3.691269E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.232 | TFLOPs: 25.38 | +7: iteration 106360/ 173500 | consumed samples: 27228160 | consumed tokens: 55763271680 | elapsed time per iteration (s): 0.15 | learning rate: 7.975E-05 | global batch size: 256 | lm loss: 3.684735E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.277 | TFLOPs: 26.13 | +7: iteration 106370/ 173500 | consumed samples: 27230720 | consumed tokens: 55768514560 | elapsed time per iteration (s): 0.16 | learning rate: 7.973E-05 | global batch size: 256 | lm loss: 3.673521E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.742 | TFLOPs: 24.44 | +7: iteration 106380/ 173500 | consumed samples: 27233280 | consumed tokens: 55773757440 | elapsed time per iteration (s): 0.15 | learning rate: 7.972E-05 | global batch size: 256 | lm loss: 3.682637E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.830 | TFLOPs: 26.08 | +7: iteration 106390/ 173500 | consumed samples: 27235840 | consumed tokens: 55779000320 | elapsed time per iteration (s): 0.15 | learning rate: 7.970E-05 | global batch size: 256 | lm loss: 3.696941E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.463 | TFLOPs: 26.07 | +7: iteration 106400/ 173500 | consumed samples: 27238400 | consumed tokens: 55784243200 | elapsed time per iteration (s): 0.16 | learning rate: 7.969E-05 | global batch size: 256 | lm loss: 3.688781E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.080 | TFLOPs: 25.81 | +7: iteration 106410/ 173500 | consumed samples: 27240960 | consumed tokens: 55789486080 | elapsed time per iteration (s): 0.15 | learning rate: 7.967E-05 | global batch size: 256 | lm loss: 3.688039E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.046 | TFLOPs: 26.05 | +7: iteration 106420/ 173500 | consumed samples: 27243520 | consumed tokens: 55794728960 | elapsed time per iteration (s): 0.20 | learning rate: 7.966E-05 | global batch size: 256 | lm loss: 3.691860E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1311.122 | TFLOPs: 20.56 | +7: iteration 106430/ 173500 | consumed samples: 27246080 | consumed tokens: 55799971840 | elapsed time per iteration (s): 0.16 | learning rate: 7.964E-05 | global batch size: 256 | lm loss: 3.693360E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.964 | TFLOPs: 24.65 | +7: iteration 106440/ 173500 | consumed samples: 27248640 | consumed tokens: 55805214720 | elapsed time per iteration (s): 0.15 | learning rate: 7.963E-05 | global batch size: 256 | lm loss: 3.686759E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.525 | TFLOPs: 26.17 | +7: iteration 106450/ 173500 | consumed samples: 27251200 | consumed tokens: 55810457600 | elapsed time per iteration (s): 0.16 | learning rate: 7.961E-05 | global batch size: 256 | lm loss: 3.685690E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.311 | TFLOPs: 25.82 | +7: iteration 106460/ 173500 | consumed samples: 27253760 | consumed tokens: 55815700480 | elapsed time per iteration (s): 0.16 | learning rate: 7.959E-05 | global batch size: 256 | lm loss: 3.691990E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.396 | TFLOPs: 24.89 | +7: iteration 106470/ 173500 | consumed samples: 27256320 | consumed tokens: 55820943360 | elapsed time per iteration (s): 0.17 | learning rate: 7.958E-05 | global batch size: 256 | lm loss: 3.684906E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.474 | TFLOPs: 24.28 | +7: iteration 106480/ 173500 | consumed samples: 27258880 | consumed tokens: 55826186240 | elapsed time per iteration (s): 0.15 | learning rate: 7.956E-05 | global batch size: 256 | lm loss: 3.683739E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.856 | TFLOPs: 26.16 | +7: iteration 106490/ 173500 | consumed samples: 27261440 | consumed tokens: 55831429120 | elapsed time per iteration (s): 0.15 | learning rate: 7.955E-05 | global batch size: 256 | lm loss: 3.684114E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.317 | TFLOPs: 26.10 | +7: iteration 106500/ 173500 | consumed samples: 27264000 | consumed tokens: 55836672000 | elapsed time per iteration (s): 0.15 | learning rate: 7.953E-05 | global batch size: 256 | lm loss: 3.688131E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.679 | TFLOPs: 26.15 | +7: iteration 106510/ 173500 | consumed samples: 27266560 | consumed tokens: 55841914880 | elapsed time per iteration (s): 0.16 | learning rate: 7.952E-05 | global batch size: 256 | lm loss: 3.696113E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.649 | TFLOPs: 25.85 | +7: iteration 106520/ 173500 | consumed samples: 27269120 | consumed tokens: 55847157760 | elapsed time per iteration (s): 0.15 | learning rate: 7.950E-05 | global batch size: 256 | lm loss: 3.685941E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.831 | TFLOPs: 26.17 | +7: iteration 106530/ 173500 | consumed samples: 27271680 | consumed tokens: 55852400640 | elapsed time per iteration (s): 0.15 | learning rate: 7.949E-05 | global batch size: 256 | lm loss: 3.686607E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.826 | TFLOPs: 26.17 | +7: iteration 106540/ 173500 | consumed samples: 27274240 | consumed tokens: 55857643520 | elapsed time per iteration (s): 0.15 | learning rate: 7.947E-05 | global batch size: 256 | lm loss: 3.680972E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.166 | TFLOPs: 26.13 | +7: iteration 106550/ 173500 | consumed samples: 27276800 | consumed tokens: 55862886400 | elapsed time per iteration (s): 0.17 | learning rate: 7.945E-05 | global batch size: 256 | lm loss: 3.691014E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.936 | TFLOPs: 23.88 | +7: iteration 106560/ 173500 | consumed samples: 27279360 | consumed tokens: 55868129280 | elapsed time per iteration (s): 0.16 | learning rate: 7.944E-05 | global batch size: 256 | lm loss: 3.694084E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.252 | TFLOPs: 24.86 | +7: iteration 106570/ 173500 | consumed samples: 27281920 | consumed tokens: 55873372160 | elapsed time per iteration (s): 0.15 | learning rate: 7.942E-05 | global batch size: 256 | lm loss: 3.669187E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.226 | TFLOPs: 26.15 | +7: iteration 106580/ 173500 | consumed samples: 27284480 | consumed tokens: 55878615040 | elapsed time per iteration (s): 0.15 | learning rate: 7.941E-05 | global batch size: 256 | lm loss: 3.673205E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.968 | TFLOPs: 26.17 | +7: iteration 106590/ 173500 | consumed samples: 27287040 | consumed tokens: 55883857920 | elapsed time per iteration (s): 0.16 | learning rate: 7.939E-05 | global batch size: 256 | lm loss: 3.695626E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.131 | TFLOPs: 25.17 | +7: iteration 106600/ 173500 | consumed samples: 27289600 | consumed tokens: 55889100800 | elapsed time per iteration (s): 0.15 | learning rate: 7.938E-05 | global batch size: 256 | lm loss: 3.691830E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.260 | TFLOPs: 26.32 | +7: iteration 106610/ 173500 | consumed samples: 27292160 | consumed tokens: 55894343680 | elapsed time per iteration (s): 0.16 | learning rate: 7.936E-05 | global batch size: 256 | lm loss: 3.673919E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.841 | TFLOPs: 25.76 | +7: iteration 106620/ 173500 | consumed samples: 27294720 | consumed tokens: 55899586560 | elapsed time per iteration (s): 0.16 | learning rate: 7.935E-05 | global batch size: 256 | lm loss: 3.686895E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.653 | TFLOPs: 25.42 | +7: iteration 106630/ 173500 | consumed samples: 27297280 | consumed tokens: 55904829440 | elapsed time per iteration (s): 0.15 | learning rate: 7.933E-05 | global batch size: 256 | lm loss: 3.689238E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.102 | TFLOPs: 26.33 | +7: iteration 106640/ 173500 | consumed samples: 27299840 | consumed tokens: 55910072320 | elapsed time per iteration (s): 0.15 | learning rate: 7.932E-05 | global batch size: 256 | lm loss: 3.680161E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.337 | TFLOPs: 26.10 | +7: iteration 106650/ 173500 | consumed samples: 27302400 | consumed tokens: 55915315200 | elapsed time per iteration (s): 0.15 | learning rate: 7.930E-05 | global batch size: 256 | lm loss: 3.691848E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.477 | TFLOPs: 26.21 | +7: iteration 106660/ 173500 | consumed samples: 27304960 | consumed tokens: 55920558080 | elapsed time per iteration (s): 0.15 | learning rate: 7.928E-05 | global batch size: 256 | lm loss: 3.694555E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.374 | TFLOPs: 26.21 | +7: iteration 106670/ 173500 | consumed samples: 27307520 | consumed tokens: 55925800960 | elapsed time per iteration (s): 0.16 | learning rate: 7.927E-05 | global batch size: 256 | lm loss: 3.702670E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.516 | TFLOPs: 25.12 | +7: iteration 106680/ 173500 | consumed samples: 27310080 | consumed tokens: 55931043840 | elapsed time per iteration (s): 0.17 | learning rate: 7.925E-05 | global batch size: 256 | lm loss: 3.674222E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.910 | TFLOPs: 24.06 | +7: iteration 106690/ 173500 | consumed samples: 27312640 | consumed tokens: 55936286720 | elapsed time per iteration (s): 0.15 | learning rate: 7.924E-05 | global batch size: 256 | lm loss: 3.688991E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.242 | TFLOPs: 26.01 | +7: iteration 106700/ 173500 | consumed samples: 27315200 | consumed tokens: 55941529600 | elapsed time per iteration (s): 0.15 | learning rate: 7.922E-05 | global batch size: 256 | lm loss: 3.685691E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.355 | TFLOPs: 26.34 | +7: iteration 106710/ 173500 | consumed samples: 27317760 | consumed tokens: 55946772480 | elapsed time per iteration (s): 0.15 | learning rate: 7.921E-05 | global batch size: 256 | lm loss: 3.690056E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.329 | TFLOPs: 26.29 | +7: iteration 106720/ 173500 | consumed samples: 27320320 | consumed tokens: 55952015360 | elapsed time per iteration (s): 0.15 | learning rate: 7.919E-05 | global batch size: 256 | lm loss: 3.682925E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.400 | TFLOPs: 26.27 | +7: iteration 106730/ 173500 | consumed samples: 27322880 | consumed tokens: 55957258240 | elapsed time per iteration (s): 0.15 | learning rate: 7.918E-05 | global batch size: 256 | lm loss: 3.683740E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.859 | TFLOPs: 26.30 | +7: iteration 106740/ 173500 | consumed samples: 27325440 | consumed tokens: 55962501120 | elapsed time per iteration (s): 0.15 | learning rate: 7.916E-05 | global batch size: 256 | lm loss: 3.687050E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.468 | TFLOPs: 26.29 | +7: iteration 106750/ 173500 | consumed samples: 27328000 | consumed tokens: 55967744000 | elapsed time per iteration (s): 0.16 | learning rate: 7.915E-05 | global batch size: 256 | lm loss: 3.666494E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.425 | TFLOPs: 25.77 | +7: iteration 106760/ 173500 | consumed samples: 27330560 | consumed tokens: 55972986880 | elapsed time per iteration (s): 0.15 | learning rate: 7.913E-05 | global batch size: 256 | lm loss: 3.700515E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.064 | TFLOPs: 26.02 | +7: iteration 106770/ 173500 | consumed samples: 27333120 | consumed tokens: 55978229760 | elapsed time per iteration (s): 0.15 | learning rate: 7.911E-05 | global batch size: 256 | lm loss: 3.691778E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.948 | TFLOPs: 26.33 | +7: iteration 106780/ 173500 | consumed samples: 27335680 | consumed tokens: 55983472640 | elapsed time per iteration (s): 0.16 | learning rate: 7.910E-05 | global batch size: 256 | lm loss: 3.663580E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.024 | TFLOPs: 25.78 | +7: iteration 106790/ 173500 | consumed samples: 27338240 | consumed tokens: 55988715520 | elapsed time per iteration (s): 0.16 | learning rate: 7.908E-05 | global batch size: 256 | lm loss: 3.696455E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.163 | TFLOPs: 25.72 | +7: iteration 106800/ 173500 | consumed samples: 27340800 | consumed tokens: 55993958400 | elapsed time per iteration (s): 0.15 | learning rate: 7.907E-05 | global batch size: 256 | lm loss: 3.692671E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.367 | TFLOPs: 26.15 | +7: iteration 106810/ 173500 | consumed samples: 27343360 | consumed tokens: 55999201280 | elapsed time per iteration (s): 0.16 | learning rate: 7.905E-05 | global batch size: 256 | lm loss: 3.693223E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.178 | TFLOPs: 25.69 | +7: iteration 106820/ 173500 | consumed samples: 27345920 | consumed tokens: 56004444160 | elapsed time per iteration (s): 0.15 | learning rate: 7.904E-05 | global batch size: 256 | lm loss: 3.673922E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.375 | TFLOPs: 26.32 | +7: iteration 106830/ 173500 | consumed samples: 27348480 | consumed tokens: 56009687040 | elapsed time per iteration (s): 0.15 | learning rate: 7.902E-05 | global batch size: 256 | lm loss: 3.683990E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.326 | TFLOPs: 26.34 | +7: iteration 106840/ 173500 | consumed samples: 27351040 | consumed tokens: 56014929920 | elapsed time per iteration (s): 0.15 | learning rate: 7.901E-05 | global batch size: 256 | lm loss: 3.688420E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.920 | TFLOPs: 26.00 | +7: iteration 106850/ 173500 | consumed samples: 27353600 | consumed tokens: 56020172800 | elapsed time per iteration (s): 0.16 | learning rate: 7.899E-05 | global batch size: 256 | lm loss: 3.693755E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.653 | TFLOPs: 25.81 | +7: iteration 106860/ 173500 | consumed samples: 27356160 | consumed tokens: 56025415680 | elapsed time per iteration (s): 0.15 | learning rate: 7.898E-05 | global batch size: 256 | lm loss: 3.687928E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.259 | TFLOPs: 25.96 | +7: iteration 106870/ 173500 | consumed samples: 27358720 | consumed tokens: 56030658560 | elapsed time per iteration (s): 0.16 | learning rate: 7.896E-05 | global batch size: 256 | lm loss: 3.693479E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.759 | TFLOPs: 25.26 | +7: iteration 106880/ 173500 | consumed samples: 27361280 | consumed tokens: 56035901440 | elapsed time per iteration (s): 0.15 | learning rate: 7.894E-05 | global batch size: 256 | lm loss: 3.691930E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.343 | TFLOPs: 26.37 | +7: iteration 106890/ 173500 | consumed samples: 27363840 | consumed tokens: 56041144320 | elapsed time per iteration (s): 0.15 | learning rate: 7.893E-05 | global batch size: 256 | lm loss: 3.687110E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.858 | TFLOPs: 26.33 | +7: iteration 106900/ 173500 | consumed samples: 27366400 | consumed tokens: 56046387200 | elapsed time per iteration (s): 0.15 | learning rate: 7.891E-05 | global batch size: 256 | lm loss: 3.676054E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.662 | TFLOPs: 26.31 | +7: iteration 106910/ 173500 | consumed samples: 27368960 | consumed tokens: 56051630080 | elapsed time per iteration (s): 0.15 | learning rate: 7.890E-05 | global batch size: 256 | lm loss: 3.686697E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.294 | TFLOPs: 26.32 | +7: iteration 106920/ 173500 | consumed samples: 27371520 | consumed tokens: 56056872960 | elapsed time per iteration (s): 0.15 | learning rate: 7.888E-05 | global batch size: 256 | lm loss: 3.697459E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.849 | TFLOPs: 26.31 | +7: iteration 106930/ 173500 | consumed samples: 27374080 | consumed tokens: 56062115840 | elapsed time per iteration (s): 0.15 | learning rate: 7.887E-05 | global batch size: 256 | lm loss: 3.690985E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.331 | TFLOPs: 26.29 | +7: iteration 106940/ 173500 | consumed samples: 27376640 | consumed tokens: 56067358720 | elapsed time per iteration (s): 0.15 | learning rate: 7.885E-05 | global batch size: 256 | lm loss: 3.687595E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.513 | TFLOPs: 26.32 | +7: iteration 106950/ 173500 | consumed samples: 27379200 | consumed tokens: 56072601600 | elapsed time per iteration (s): 0.15 | learning rate: 7.884E-05 | global batch size: 256 | lm loss: 3.686088E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.323 | TFLOPs: 26.32 | +7: iteration 106960/ 173500 | consumed samples: 27381760 | consumed tokens: 56077844480 | elapsed time per iteration (s): 0.15 | learning rate: 7.882E-05 | global batch size: 256 | lm loss: 3.687214E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.606 | TFLOPs: 26.34 | +7: iteration 106970/ 173500 | consumed samples: 27384320 | consumed tokens: 56083087360 | elapsed time per iteration (s): 0.15 | learning rate: 7.881E-05 | global batch size: 256 | lm loss: 3.698147E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.309 | TFLOPs: 26.32 | +7: iteration 106980/ 173500 | consumed samples: 27386880 | consumed tokens: 56088330240 | elapsed time per iteration (s): 0.15 | learning rate: 7.879E-05 | global batch size: 256 | lm loss: 3.685082E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.026 | TFLOPs: 26.35 | +7: iteration 106990/ 173500 | consumed samples: 27389440 | consumed tokens: 56093573120 | elapsed time per iteration (s): 0.15 | learning rate: 7.877E-05 | global batch size: 256 | lm loss: 3.691672E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.548 | TFLOPs: 26.28 | +7: iteration 107000/ 173500 | consumed samples: 27392000 | consumed tokens: 56098816000 | elapsed time per iteration (s): 0.15 | learning rate: 7.876E-05 | global batch size: 256 | lm loss: 3.673122E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.603 | TFLOPs: 26.25 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 107000 | lm loss value: 3.834688E+00 | lm loss PPL: 4.627901E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 107000 to checkpoints_44m91b100m +0: [2023-03-17 04:53:58,739] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step107000 is begin to save! +0: [2023-03-17 04:53:58,743] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:53:58,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:53:58,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:53:58,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:53:58,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:53:58,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:53:58,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:53:58,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:53:58,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:53:58,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:53:58,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:53:58,884] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:53:58,885] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:53:58,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:53:58,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:53:58,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:53:58,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:53:58,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:53:58,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:53:58,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:53:58,919] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step107000/mp_rank_00_model_states.pt +0: [2023-03-17 04:53:58,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:53:58,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:53:58,939] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:53:58,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,955] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,957] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,957] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +2: [2023-03-17 04:53:58,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +1: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:53:58,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:53:58,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 04:53:58,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 04:53:58,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +5: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +4: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:53:58,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +3: [2023-03-17 04:53:58,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:53:58,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:53:58,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +0: successfully saved checkpoint at iteration 107000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 228.01 +6: [2023-03-17 04:53:58,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +6: [2023-03-17 04:53:58,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:53:58,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step107000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:53:58,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step107000 is ready now! +7: iteration 107010/ 173500 | consumed samples: 27394560 | consumed tokens: 56104058880 | elapsed time per iteration (s): 0.18 | learning rate: 7.874E-05 | global batch size: 256 | lm loss: 3.693725E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.798 | TFLOPs: 22.23 | +7: iteration 107020/ 173500 | consumed samples: 27397120 | consumed tokens: 56109301760 | elapsed time per iteration (s): 0.15 | learning rate: 7.873E-05 | global batch size: 256 | lm loss: 3.683741E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.225 | TFLOPs: 26.26 | +7: iteration 107030/ 173500 | consumed samples: 27399680 | consumed tokens: 56114544640 | elapsed time per iteration (s): 0.15 | learning rate: 7.871E-05 | global batch size: 256 | lm loss: 3.680207E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.940 | TFLOPs: 26.25 | +7: iteration 107040/ 173500 | consumed samples: 27402240 | consumed tokens: 56119787520 | elapsed time per iteration (s): 0.16 | learning rate: 7.870E-05 | global batch size: 256 | lm loss: 3.684970E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.145 | TFLOPs: 25.82 | +7: iteration 107050/ 173500 | consumed samples: 27404800 | consumed tokens: 56125030400 | elapsed time per iteration (s): 0.15 | learning rate: 7.868E-05 | global batch size: 256 | lm loss: 3.686868E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.582 | TFLOPs: 26.23 | +7: iteration 107060/ 173500 | consumed samples: 27407360 | consumed tokens: 56130273280 | elapsed time per iteration (s): 0.15 | learning rate: 7.867E-05 | global batch size: 256 | lm loss: 3.697988E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.967 | TFLOPs: 26.30 | +7: iteration 107070/ 173500 | consumed samples: 27409920 | consumed tokens: 56135516160 | elapsed time per iteration (s): 0.15 | learning rate: 7.865E-05 | global batch size: 256 | lm loss: 3.690153E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.163 | TFLOPs: 26.35 | +7: iteration 107080/ 173500 | consumed samples: 27412480 | consumed tokens: 56140759040 | elapsed time per iteration (s): 0.16 | learning rate: 7.864E-05 | global batch size: 256 | lm loss: 3.689635E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.722 | TFLOPs: 25.64 | +7: iteration 107090/ 173500 | consumed samples: 27415040 | consumed tokens: 56146001920 | elapsed time per iteration (s): 0.15 | learning rate: 7.862E-05 | global batch size: 256 | lm loss: 3.703359E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.612 | TFLOPs: 26.31 | +7: iteration 107100/ 173500 | consumed samples: 27417600 | consumed tokens: 56151244800 | elapsed time per iteration (s): 0.16 | learning rate: 7.860E-05 | global batch size: 256 | lm loss: 3.688961E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.989 | TFLOPs: 25.73 | +7: iteration 107110/ 173500 | consumed samples: 27420160 | consumed tokens: 56156487680 | elapsed time per iteration (s): 0.15 | learning rate: 7.859E-05 | global batch size: 256 | lm loss: 3.680319E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.605 | TFLOPs: 26.12 | +7: iteration 107120/ 173500 | consumed samples: 27422720 | consumed tokens: 56161730560 | elapsed time per iteration (s): 0.15 | learning rate: 7.857E-05 | global batch size: 256 | lm loss: 3.683451E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.307 | TFLOPs: 26.23 | +7: iteration 107130/ 173500 | consumed samples: 27425280 | consumed tokens: 56166973440 | elapsed time per iteration (s): 0.15 | learning rate: 7.856E-05 | global batch size: 256 | lm loss: 3.704148E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.109 | TFLOPs: 26.25 | +7: iteration 107140/ 173500 | consumed samples: 27427840 | consumed tokens: 56172216320 | elapsed time per iteration (s): 0.15 | learning rate: 7.854E-05 | global batch size: 256 | lm loss: 3.682214E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.478 | TFLOPs: 26.02 | +7: iteration 107150/ 173500 | consumed samples: 27430400 | consumed tokens: 56177459200 | elapsed time per iteration (s): 0.15 | learning rate: 7.853E-05 | global batch size: 256 | lm loss: 3.666020E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.570 | TFLOPs: 26.25 | +7: iteration 107160/ 173500 | consumed samples: 27432960 | consumed tokens: 56182702080 | elapsed time per iteration (s): 0.15 | learning rate: 7.851E-05 | global batch size: 256 | lm loss: 3.686520E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.034 | TFLOPs: 26.24 | +7: iteration 107170/ 173500 | consumed samples: 27435520 | consumed tokens: 56187944960 | elapsed time per iteration (s): 0.15 | learning rate: 7.850E-05 | global batch size: 256 | lm loss: 3.679495E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.250 | TFLOPs: 26.23 | +7: iteration 107180/ 173500 | consumed samples: 27438080 | consumed tokens: 56193187840 | elapsed time per iteration (s): 0.15 | learning rate: 7.848E-05 | global batch size: 256 | lm loss: 3.692445E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.630 | TFLOPs: 26.22 | +7: iteration 107190/ 173500 | consumed samples: 27440640 | consumed tokens: 56198430720 | elapsed time per iteration (s): 0.15 | learning rate: 7.847E-05 | global batch size: 256 | lm loss: 3.686805E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.870 | TFLOPs: 26.22 | +7: iteration 107200/ 173500 | consumed samples: 27443200 | consumed tokens: 56203673600 | elapsed time per iteration (s): 0.15 | learning rate: 7.845E-05 | global batch size: 256 | lm loss: 3.677313E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.975 | TFLOPs: 26.25 | +7: iteration 107210/ 173500 | consumed samples: 27445760 | consumed tokens: 56208916480 | elapsed time per iteration (s): 0.16 | learning rate: 7.844E-05 | global batch size: 256 | lm loss: 3.676345E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.802 | TFLOPs: 25.42 | +7: iteration 107220/ 173500 | consumed samples: 27448320 | consumed tokens: 56214159360 | elapsed time per iteration (s): 0.15 | learning rate: 7.842E-05 | global batch size: 256 | lm loss: 3.673059E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.449 | TFLOPs: 26.24 | +7: iteration 107230/ 173500 | consumed samples: 27450880 | consumed tokens: 56219402240 | elapsed time per iteration (s): 0.15 | learning rate: 7.840E-05 | global batch size: 256 | lm loss: 3.679661E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.442 | TFLOPs: 26.23 | +7: iteration 107240/ 173500 | consumed samples: 27453440 | consumed tokens: 56224645120 | elapsed time per iteration (s): 0.15 | learning rate: 7.839E-05 | global batch size: 256 | lm loss: 3.697500E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.875 | TFLOPs: 26.25 | +7: iteration 107250/ 173500 | consumed samples: 27456000 | consumed tokens: 56229888000 | elapsed time per iteration (s): 0.15 | learning rate: 7.837E-05 | global batch size: 256 | lm loss: 3.695849E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.910 | TFLOPs: 26.24 | +7: iteration 107260/ 173500 | consumed samples: 27458560 | consumed tokens: 56235130880 | elapsed time per iteration (s): 0.15 | learning rate: 7.836E-05 | global batch size: 256 | lm loss: 3.695316E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.151 | TFLOPs: 26.24 | +7: iteration 107270/ 173500 | consumed samples: 27461120 | consumed tokens: 56240373760 | elapsed time per iteration (s): 0.15 | learning rate: 7.834E-05 | global batch size: 256 | lm loss: 3.688810E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.634 | TFLOPs: 26.25 | +7: iteration 107280/ 173500 | consumed samples: 27463680 | consumed tokens: 56245616640 | elapsed time per iteration (s): 0.16 | learning rate: 7.833E-05 | global batch size: 256 | lm loss: 3.693055E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.541 | TFLOPs: 25.74 | +7: iteration 107290/ 173500 | consumed samples: 27466240 | consumed tokens: 56250859520 | elapsed time per iteration (s): 0.15 | learning rate: 7.831E-05 | global batch size: 256 | lm loss: 3.678050E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.836 | TFLOPs: 26.23 | +7: iteration 107300/ 173500 | consumed samples: 27468800 | consumed tokens: 56256102400 | elapsed time per iteration (s): 0.15 | learning rate: 7.830E-05 | global batch size: 256 | lm loss: 3.692789E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.147 | TFLOPs: 26.24 | +7: iteration 107310/ 173500 | consumed samples: 27471360 | consumed tokens: 56261345280 | elapsed time per iteration (s): 0.15 | learning rate: 7.828E-05 | global batch size: 256 | lm loss: 3.673794E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.535 | TFLOPs: 26.12 | +7: iteration 107320/ 173500 | consumed samples: 27473920 | consumed tokens: 56266588160 | elapsed time per iteration (s): 0.16 | learning rate: 7.827E-05 | global batch size: 256 | lm loss: 3.690194E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.230 | TFLOPs: 25.79 | +7: iteration 107330/ 173500 | consumed samples: 27476480 | consumed tokens: 56271831040 | elapsed time per iteration (s): 0.15 | learning rate: 7.825E-05 | global batch size: 256 | lm loss: 3.687130E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.356 | TFLOPs: 26.20 | +7: iteration 107340/ 173500 | consumed samples: 27479040 | consumed tokens: 56277073920 | elapsed time per iteration (s): 0.15 | learning rate: 7.823E-05 | global batch size: 256 | lm loss: 3.675280E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.599 | TFLOPs: 26.18 | +7: iteration 107350/ 173500 | consumed samples: 27481600 | consumed tokens: 56282316800 | elapsed time per iteration (s): 0.17 | learning rate: 7.822E-05 | global batch size: 256 | lm loss: 3.684084E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.423 | TFLOPs: 23.89 | +7: iteration 107360/ 173500 | consumed samples: 27484160 | consumed tokens: 56287559680 | elapsed time per iteration (s): 0.17 | learning rate: 7.820E-05 | global batch size: 256 | lm loss: 3.681477E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1492.819 | TFLOPs: 23.41 | +7: iteration 107370/ 173500 | consumed samples: 27486720 | consumed tokens: 56292802560 | elapsed time per iteration (s): 0.15 | learning rate: 7.819E-05 | global batch size: 256 | lm loss: 3.689170E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.318 | TFLOPs: 26.23 | +7: iteration 107380/ 173500 | consumed samples: 27489280 | consumed tokens: 56298045440 | elapsed time per iteration (s): 0.15 | learning rate: 7.817E-05 | global batch size: 256 | lm loss: 3.689542E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.050 | TFLOPs: 26.21 | +7: iteration 107390/ 173500 | consumed samples: 27491840 | consumed tokens: 56303288320 | elapsed time per iteration (s): 0.15 | learning rate: 7.816E-05 | global batch size: 256 | lm loss: 3.684924E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.477 | TFLOPs: 26.20 | +7: iteration 107400/ 173500 | consumed samples: 27494400 | consumed tokens: 56308531200 | elapsed time per iteration (s): 0.16 | learning rate: 7.814E-05 | global batch size: 256 | lm loss: 3.681446E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.631 | TFLOPs: 25.26 | +7: iteration 107410/ 173500 | consumed samples: 27496960 | consumed tokens: 56313774080 | elapsed time per iteration (s): 0.15 | learning rate: 7.813E-05 | global batch size: 256 | lm loss: 3.686611E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.991 | TFLOPs: 26.22 | +7: iteration 107420/ 173500 | consumed samples: 27499520 | consumed tokens: 56319016960 | elapsed time per iteration (s): 0.15 | learning rate: 7.811E-05 | global batch size: 256 | lm loss: 3.678222E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.142 | TFLOPs: 26.30 | +7: iteration 107430/ 173500 | consumed samples: 27502080 | consumed tokens: 56324259840 | elapsed time per iteration (s): 0.15 | learning rate: 7.810E-05 | global batch size: 256 | lm loss: 3.683991E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.639 | TFLOPs: 26.33 | +7: iteration 107440/ 173500 | consumed samples: 27504640 | consumed tokens: 56329502720 | elapsed time per iteration (s): 0.16 | learning rate: 7.808E-05 | global batch size: 256 | lm loss: 3.690237E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.438 | TFLOPs: 25.11 | +7: iteration 107450/ 173500 | consumed samples: 27507200 | consumed tokens: 56334745600 | elapsed time per iteration (s): 0.16 | learning rate: 7.807E-05 | global batch size: 256 | lm loss: 3.694032E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.064 | TFLOPs: 24.81 | +7: iteration 107460/ 173500 | consumed samples: 27509760 | consumed tokens: 56339988480 | elapsed time per iteration (s): 0.16 | learning rate: 7.805E-05 | global batch size: 256 | lm loss: 3.696432E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.028 | TFLOPs: 25.83 | +7: iteration 107470/ 173500 | consumed samples: 27512320 | consumed tokens: 56345231360 | elapsed time per iteration (s): 0.15 | learning rate: 7.803E-05 | global batch size: 256 | lm loss: 3.672523E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.001 | TFLOPs: 26.33 | +7: iteration 107480/ 173500 | consumed samples: 27514880 | consumed tokens: 56350474240 | elapsed time per iteration (s): 0.15 | learning rate: 7.802E-05 | global batch size: 256 | lm loss: 3.681973E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.562 | TFLOPs: 26.34 | +7: iteration 107490/ 173500 | consumed samples: 27517440 | consumed tokens: 56355717120 | elapsed time per iteration (s): 0.15 | learning rate: 7.800E-05 | global batch size: 256 | lm loss: 3.677700E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.983 | TFLOPs: 26.33 | +7: iteration 107500/ 173500 | consumed samples: 27520000 | consumed tokens: 56360960000 | elapsed time per iteration (s): 0.15 | learning rate: 7.799E-05 | global batch size: 256 | lm loss: 3.679844E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.952 | TFLOPs: 26.33 | +7: iteration 107510/ 173500 | consumed samples: 27522560 | consumed tokens: 56366202880 | elapsed time per iteration (s): 0.15 | learning rate: 7.797E-05 | global batch size: 256 | lm loss: 3.699427E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.685 | TFLOPs: 26.29 | +7: iteration 107520/ 173500 | consumed samples: 27525120 | consumed tokens: 56371445760 | elapsed time per iteration (s): 0.16 | learning rate: 7.796E-05 | global batch size: 256 | lm loss: 3.685756E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.079 | TFLOPs: 25.61 | +7: iteration 107530/ 173500 | consumed samples: 27527680 | consumed tokens: 56376688640 | elapsed time per iteration (s): 0.15 | learning rate: 7.794E-05 | global batch size: 256 | lm loss: 3.682954E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.815 | TFLOPs: 25.90 | +7: iteration 107540/ 173500 | consumed samples: 27530240 | consumed tokens: 56381931520 | elapsed time per iteration (s): 0.15 | learning rate: 7.793E-05 | global batch size: 256 | lm loss: 3.694236E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.863 | TFLOPs: 26.31 | +7: iteration 107550/ 173500 | consumed samples: 27532800 | consumed tokens: 56387174400 | elapsed time per iteration (s): 0.15 | learning rate: 7.791E-05 | global batch size: 256 | lm loss: 3.697141E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.041 | TFLOPs: 26.25 | +7: iteration 107560/ 173500 | consumed samples: 27535360 | consumed tokens: 56392417280 | elapsed time per iteration (s): 0.16 | learning rate: 7.790E-05 | global batch size: 256 | lm loss: 3.679033E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.615 | TFLOPs: 25.79 | +7: iteration 107570/ 173500 | consumed samples: 27537920 | consumed tokens: 56397660160 | elapsed time per iteration (s): 0.15 | learning rate: 7.788E-05 | global batch size: 256 | lm loss: 3.682951E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.011 | TFLOPs: 26.30 | +7: iteration 107580/ 173500 | consumed samples: 27540480 | consumed tokens: 56402903040 | elapsed time per iteration (s): 0.16 | learning rate: 7.787E-05 | global batch size: 256 | lm loss: 3.675087E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.327 | TFLOPs: 25.63 | +7: iteration 107590/ 173500 | consumed samples: 27543040 | consumed tokens: 56408145920 | elapsed time per iteration (s): 0.16 | learning rate: 7.785E-05 | global batch size: 256 | lm loss: 3.690545E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.236 | TFLOPs: 25.85 | +7: iteration 107600/ 173500 | consumed samples: 27545600 | consumed tokens: 56413388800 | elapsed time per iteration (s): 0.15 | learning rate: 7.783E-05 | global batch size: 256 | lm loss: 3.683540E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.636 | TFLOPs: 26.18 | +7: iteration 107610/ 173500 | consumed samples: 27548160 | consumed tokens: 56418631680 | elapsed time per iteration (s): 0.15 | learning rate: 7.782E-05 | global batch size: 256 | lm loss: 3.686884E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.325 | TFLOPs: 26.16 | +7: iteration 107620/ 173500 | consumed samples: 27550720 | consumed tokens: 56423874560 | elapsed time per iteration (s): 0.15 | learning rate: 7.780E-05 | global batch size: 256 | lm loss: 3.687063E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.833 | TFLOPs: 25.94 | +7: iteration 107630/ 173500 | consumed samples: 27553280 | consumed tokens: 56429117440 | elapsed time per iteration (s): 0.15 | learning rate: 7.779E-05 | global batch size: 256 | lm loss: 3.688135E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.423 | TFLOPs: 26.17 | +7: iteration 107640/ 173500 | consumed samples: 27555840 | consumed tokens: 56434360320 | elapsed time per iteration (s): 0.15 | learning rate: 7.777E-05 | global batch size: 256 | lm loss: 3.688366E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.586 | TFLOPs: 26.14 | +7: iteration 107650/ 173500 | consumed samples: 27558400 | consumed tokens: 56439603200 | elapsed time per iteration (s): 0.15 | learning rate: 7.776E-05 | global batch size: 256 | lm loss: 3.687933E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.304 | TFLOPs: 26.13 | +7: iteration 107660/ 173500 | consumed samples: 27560960 | consumed tokens: 56444846080 | elapsed time per iteration (s): 0.15 | learning rate: 7.774E-05 | global batch size: 256 | lm loss: 3.673697E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.213 | TFLOPs: 26.15 | +7: iteration 107670/ 173500 | consumed samples: 27563520 | consumed tokens: 56450088960 | elapsed time per iteration (s): 0.15 | learning rate: 7.773E-05 | global batch size: 256 | lm loss: 3.688913E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.920 | TFLOPs: 26.16 | +7: iteration 107680/ 173500 | consumed samples: 27566080 | consumed tokens: 56455331840 | elapsed time per iteration (s): 0.15 | learning rate: 7.771E-05 | global batch size: 256 | lm loss: 3.680701E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.981 | TFLOPs: 26.19 | +7: iteration 107690/ 173500 | consumed samples: 27568640 | consumed tokens: 56460574720 | elapsed time per iteration (s): 0.16 | learning rate: 7.770E-05 | global batch size: 256 | lm loss: 3.678792E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.311 | TFLOPs: 25.82 | +7: iteration 107700/ 173500 | consumed samples: 27571200 | consumed tokens: 56465817600 | elapsed time per iteration (s): 0.15 | learning rate: 7.768E-05 | global batch size: 256 | lm loss: 3.690280E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.999 | TFLOPs: 26.19 | +7: iteration 107710/ 173500 | consumed samples: 27573760 | consumed tokens: 56471060480 | elapsed time per iteration (s): 0.15 | learning rate: 7.767E-05 | global batch size: 256 | lm loss: 3.688133E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.864 | TFLOPs: 26.19 | +7: iteration 107720/ 173500 | consumed samples: 27576320 | consumed tokens: 56476303360 | elapsed time per iteration (s): 0.15 | learning rate: 7.765E-05 | global batch size: 256 | lm loss: 3.692527E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.644 | TFLOPs: 26.20 | +7: iteration 107730/ 173500 | consumed samples: 27578880 | consumed tokens: 56481546240 | elapsed time per iteration (s): 0.15 | learning rate: 7.763E-05 | global batch size: 256 | lm loss: 3.690424E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.305 | TFLOPs: 26.19 | +7: iteration 107740/ 173500 | consumed samples: 27581440 | consumed tokens: 56486789120 | elapsed time per iteration (s): 0.15 | learning rate: 7.762E-05 | global batch size: 256 | lm loss: 3.686312E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.780 | TFLOPs: 26.20 | +7: iteration 107750/ 173500 | consumed samples: 27584000 | consumed tokens: 56492032000 | elapsed time per iteration (s): 0.15 | learning rate: 7.760E-05 | global batch size: 256 | lm loss: 3.679787E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.954 | TFLOPs: 26.20 | +7: iteration 107760/ 173500 | consumed samples: 27586560 | consumed tokens: 56497274880 | elapsed time per iteration (s): 0.15 | learning rate: 7.759E-05 | global batch size: 256 | lm loss: 3.706338E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.185 | TFLOPs: 26.19 | +7: iteration 107770/ 173500 | consumed samples: 27589120 | consumed tokens: 56502517760 | elapsed time per iteration (s): 0.15 | learning rate: 7.757E-05 | global batch size: 256 | lm loss: 3.697519E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.685 | TFLOPs: 26.18 | +7: iteration 107780/ 173500 | consumed samples: 27591680 | consumed tokens: 56507760640 | elapsed time per iteration (s): 0.15 | learning rate: 7.756E-05 | global batch size: 256 | lm loss: 3.686120E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.131 | TFLOPs: 26.18 | +7: iteration 107790/ 173500 | consumed samples: 27594240 | consumed tokens: 56513003520 | elapsed time per iteration (s): 0.15 | learning rate: 7.754E-05 | global batch size: 256 | lm loss: 3.681615E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.758 | TFLOPs: 26.20 | +7: iteration 107800/ 173500 | consumed samples: 27596800 | consumed tokens: 56518246400 | elapsed time per iteration (s): 0.15 | learning rate: 7.753E-05 | global batch size: 256 | lm loss: 3.673659E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.847 | TFLOPs: 26.20 | +7: iteration 107810/ 173500 | consumed samples: 27599360 | consumed tokens: 56523489280 | elapsed time per iteration (s): 0.15 | learning rate: 7.751E-05 | global batch size: 256 | lm loss: 3.676323E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.203 | TFLOPs: 26.19 | +7: iteration 107820/ 173500 | consumed samples: 27601920 | consumed tokens: 56528732160 | elapsed time per iteration (s): 0.15 | learning rate: 7.750E-05 | global batch size: 256 | lm loss: 3.682061E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.867 | TFLOPs: 26.22 | +7: iteration 107830/ 173500 | consumed samples: 27604480 | consumed tokens: 56533975040 | elapsed time per iteration (s): 0.15 | learning rate: 7.748E-05 | global batch size: 256 | lm loss: 3.696531E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.280 | TFLOPs: 26.34 | +7: iteration 107840/ 173500 | consumed samples: 27607040 | consumed tokens: 56539217920 | elapsed time per iteration (s): 0.15 | learning rate: 7.747E-05 | global batch size: 256 | lm loss: 3.697443E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.344 | TFLOPs: 26.24 | +7: iteration 107850/ 173500 | consumed samples: 27609600 | consumed tokens: 56544460800 | elapsed time per iteration (s): 0.15 | learning rate: 7.745E-05 | global batch size: 256 | lm loss: 3.688981E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.872 | TFLOPs: 25.91 | +7: iteration 107860/ 173500 | consumed samples: 27612160 | consumed tokens: 56549703680 | elapsed time per iteration (s): 0.15 | learning rate: 7.744E-05 | global batch size: 256 | lm loss: 3.692274E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.233 | TFLOPs: 26.12 | +7: iteration 107870/ 173500 | consumed samples: 27614720 | consumed tokens: 56554946560 | elapsed time per iteration (s): 0.15 | learning rate: 7.742E-05 | global batch size: 256 | lm loss: 3.664409E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.022 | TFLOPs: 26.13 | +7: iteration 107880/ 173500 | consumed samples: 27617280 | consumed tokens: 56560189440 | elapsed time per iteration (s): 0.15 | learning rate: 7.740E-05 | global batch size: 256 | lm loss: 3.688347E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.974 | TFLOPs: 26.11 | +7: iteration 107890/ 173500 | consumed samples: 27619840 | consumed tokens: 56565432320 | elapsed time per iteration (s): 0.16 | learning rate: 7.739E-05 | global batch size: 256 | lm loss: 3.685178E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.353 | TFLOPs: 25.77 | +7: iteration 107900/ 173500 | consumed samples: 27622400 | consumed tokens: 56570675200 | elapsed time per iteration (s): 0.15 | learning rate: 7.737E-05 | global batch size: 256 | lm loss: 3.684771E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.188 | TFLOPs: 26.11 | +7: iteration 107910/ 173500 | consumed samples: 27624960 | consumed tokens: 56575918080 | elapsed time per iteration (s): 0.15 | learning rate: 7.736E-05 | global batch size: 256 | lm loss: 3.683361E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.993 | TFLOPs: 26.11 | +7: iteration 107920/ 173500 | consumed samples: 27627520 | consumed tokens: 56581160960 | elapsed time per iteration (s): 0.15 | learning rate: 7.734E-05 | global batch size: 256 | lm loss: 3.685570E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.522 | TFLOPs: 26.12 | +7: iteration 107930/ 173500 | consumed samples: 27630080 | consumed tokens: 56586403840 | elapsed time per iteration (s): 0.15 | learning rate: 7.733E-05 | global batch size: 256 | lm loss: 3.684137E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.443 | TFLOPs: 26.10 | +7: iteration 107940/ 173500 | consumed samples: 27632640 | consumed tokens: 56591646720 | elapsed time per iteration (s): 0.15 | learning rate: 7.731E-05 | global batch size: 256 | lm loss: 3.687539E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.484 | TFLOPs: 26.10 | +7: iteration 107950/ 173500 | consumed samples: 27635200 | consumed tokens: 56596889600 | elapsed time per iteration (s): 0.15 | learning rate: 7.730E-05 | global batch size: 256 | lm loss: 3.685887E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.662 | TFLOPs: 26.11 | +7: iteration 107960/ 173500 | consumed samples: 27637760 | consumed tokens: 56602132480 | elapsed time per iteration (s): 0.15 | learning rate: 7.728E-05 | global batch size: 256 | lm loss: 3.682919E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.036 | TFLOPs: 26.22 | +7: iteration 107970/ 173500 | consumed samples: 27640320 | consumed tokens: 56607375360 | elapsed time per iteration (s): 0.15 | learning rate: 7.727E-05 | global batch size: 256 | lm loss: 3.691400E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.716 | TFLOPs: 26.28 | +7: iteration 107980/ 173500 | consumed samples: 27642880 | consumed tokens: 56612618240 | elapsed time per iteration (s): 0.15 | learning rate: 7.725E-05 | global batch size: 256 | lm loss: 3.683742E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.427 | TFLOPs: 26.27 | +7: iteration 107990/ 173500 | consumed samples: 27645440 | consumed tokens: 56617861120 | elapsed time per iteration (s): 0.15 | learning rate: 7.724E-05 | global batch size: 256 | lm loss: 3.689877E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.932 | TFLOPs: 26.27 | +0: [2023-03-17 04:56:33,080] [INFO] [logging.py:68:log_dist] [Rank 0] step=108000, skipped=0, lr=[7.722055869362951e-05, 7.722055869362951e-05, 7.722055869362951e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 108000/ 173500 | consumed samples: 27648000 | consumed tokens: 56623104000 | elapsed time per iteration (s): 0.15 | learning rate: 7.722E-05 | global batch size: 256 | lm loss: 3.693958E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.436 | TFLOPs: 26.28 | +0: steps: 108000 loss: 3.6694 iter time (s): 0.154 samples/sec: 1661.987 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 108000 | lm loss value: 3.894175E+00 | lm loss PPL: 4.911552E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 108000 to checkpoints_44m91b100m +0: [2023-03-17 04:56:33,154] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step108000 is begin to save! +0: [2023-03-17 04:56:33,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:56:33,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:56:33,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:56:33,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:56:33,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:56:33,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:56:33,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:56:33,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:56:33,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:56:33,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:56:33,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:56:33,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:56:33,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:56:33,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:56:33,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:56:33,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:56:33,278] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:56:33,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:56:33,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:56:33,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:56:33,287] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step108000/mp_rank_00_model_states.pt +0: [2023-03-17 04:56:33,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:56:33,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:56:33,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:56:33,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:56:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:56:33,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +5: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +4: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +6: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +3: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 04:56:33,329] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +7: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +2: [2023-03-17 04:56:33,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:56:33,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step108000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +1: [2023-03-17 04:56:33,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step108000 is ready now! +0: successfully saved checkpoint at iteration 108000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.08 +7: iteration 108010/ 173500 | consumed samples: 27650560 | consumed tokens: 56628346880 | elapsed time per iteration (s): 0.18 | learning rate: 7.721E-05 | global batch size: 256 | lm loss: 3.690246E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.269 | TFLOPs: 22.62 | +7: iteration 108020/ 173500 | consumed samples: 27653120 | consumed tokens: 56633589760 | elapsed time per iteration (s): 0.15 | learning rate: 7.719E-05 | global batch size: 256 | lm loss: 3.694041E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.092 | TFLOPs: 26.29 | +7: iteration 108030/ 173500 | consumed samples: 27655680 | consumed tokens: 56638832640 | elapsed time per iteration (s): 0.15 | learning rate: 7.717E-05 | global batch size: 256 | lm loss: 3.674285E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.502 | TFLOPs: 26.35 | +7: iteration 108040/ 173500 | consumed samples: 27658240 | consumed tokens: 56644075520 | elapsed time per iteration (s): 0.15 | learning rate: 7.716E-05 | global batch size: 256 | lm loss: 3.697742E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.842 | TFLOPs: 26.34 | +7: iteration 108050/ 173500 | consumed samples: 27660800 | consumed tokens: 56649318400 | elapsed time per iteration (s): 0.15 | learning rate: 7.714E-05 | global batch size: 256 | lm loss: 3.672499E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.032 | TFLOPs: 26.35 | +7: iteration 108060/ 173500 | consumed samples: 27663360 | consumed tokens: 56654561280 | elapsed time per iteration (s): 0.15 | learning rate: 7.713E-05 | global batch size: 256 | lm loss: 3.678924E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.747 | TFLOPs: 26.34 | +7: iteration 108070/ 173500 | consumed samples: 27665920 | consumed tokens: 56659804160 | elapsed time per iteration (s): 0.15 | learning rate: 7.711E-05 | global batch size: 256 | lm loss: 3.692945E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.517 | TFLOPs: 26.34 | +7: iteration 108080/ 173500 | consumed samples: 27668480 | consumed tokens: 56665047040 | elapsed time per iteration (s): 0.15 | learning rate: 7.710E-05 | global batch size: 256 | lm loss: 3.685929E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.217 | TFLOPs: 26.35 | +7: iteration 108090/ 173500 | consumed samples: 27671040 | consumed tokens: 56670289920 | elapsed time per iteration (s): 0.15 | learning rate: 7.708E-05 | global batch size: 256 | lm loss: 3.685044E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.711 | TFLOPs: 26.22 | +7: iteration 108100/ 173500 | consumed samples: 27673600 | consumed tokens: 56675532800 | elapsed time per iteration (s): 0.15 | learning rate: 7.707E-05 | global batch size: 256 | lm loss: 3.682516E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.266 | TFLOPs: 26.16 | +7: iteration 108110/ 173500 | consumed samples: 27676160 | consumed tokens: 56680775680 | elapsed time per iteration (s): 0.15 | learning rate: 7.705E-05 | global batch size: 256 | lm loss: 3.694699E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.252 | TFLOPs: 26.16 | +7: iteration 108120/ 173500 | consumed samples: 27678720 | consumed tokens: 56686018560 | elapsed time per iteration (s): 0.16 | learning rate: 7.704E-05 | global batch size: 256 | lm loss: 3.683484E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.697 | TFLOPs: 25.89 | +7: iteration 108130/ 173500 | consumed samples: 27681280 | consumed tokens: 56691261440 | elapsed time per iteration (s): 0.15 | learning rate: 7.702E-05 | global batch size: 256 | lm loss: 3.679239E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.836 | TFLOPs: 26.05 | +7: iteration 108140/ 173500 | consumed samples: 27683840 | consumed tokens: 56696504320 | elapsed time per iteration (s): 0.15 | learning rate: 7.701E-05 | global batch size: 256 | lm loss: 3.697033E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.127 | TFLOPs: 26.03 | +7: iteration 108150/ 173500 | consumed samples: 27686400 | consumed tokens: 56701747200 | elapsed time per iteration (s): 0.15 | learning rate: 7.699E-05 | global batch size: 256 | lm loss: 3.684739E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.097 | TFLOPs: 26.07 | +7: iteration 108160/ 173500 | consumed samples: 27688960 | consumed tokens: 56706990080 | elapsed time per iteration (s): 0.16 | learning rate: 7.698E-05 | global batch size: 256 | lm loss: 3.678605E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.926 | TFLOPs: 25.29 | +7: iteration 108170/ 173500 | consumed samples: 27691520 | consumed tokens: 56712232960 | elapsed time per iteration (s): 0.15 | learning rate: 7.696E-05 | global batch size: 256 | lm loss: 3.696157E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.970 | TFLOPs: 26.10 | +7: iteration 108180/ 173500 | consumed samples: 27694080 | consumed tokens: 56717475840 | elapsed time per iteration (s): 0.15 | learning rate: 7.694E-05 | global batch size: 256 | lm loss: 3.690490E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.322 | TFLOPs: 26.09 | +7: iteration 108190/ 173500 | consumed samples: 27696640 | consumed tokens: 56722718720 | elapsed time per iteration (s): 0.15 | learning rate: 7.693E-05 | global batch size: 256 | lm loss: 3.680978E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.906 | TFLOPs: 26.08 | +7: iteration 108200/ 173500 | consumed samples: 27699200 | consumed tokens: 56727961600 | elapsed time per iteration (s): 0.15 | learning rate: 7.691E-05 | global batch size: 256 | lm loss: 3.671687E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.837 | TFLOPs: 26.09 | +7: iteration 108210/ 173500 | consumed samples: 27701760 | consumed tokens: 56733204480 | elapsed time per iteration (s): 0.15 | learning rate: 7.690E-05 | global batch size: 256 | lm loss: 3.694170E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.379 | TFLOPs: 26.01 | +7: iteration 108220/ 173500 | consumed samples: 27704320 | consumed tokens: 56738447360 | elapsed time per iteration (s): 0.15 | learning rate: 7.688E-05 | global batch size: 256 | lm loss: 3.668414E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.075 | TFLOPs: 26.10 | +7: iteration 108230/ 173500 | consumed samples: 27706880 | consumed tokens: 56743690240 | elapsed time per iteration (s): 0.15 | learning rate: 7.687E-05 | global batch size: 256 | lm loss: 3.693047E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.582 | TFLOPs: 26.09 | +7: iteration 108240/ 173500 | consumed samples: 27709440 | consumed tokens: 56748933120 | elapsed time per iteration (s): 0.15 | learning rate: 7.685E-05 | global batch size: 256 | lm loss: 3.697049E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.907 | TFLOPs: 26.09 | +7: iteration 108250/ 173500 | consumed samples: 27712000 | consumed tokens: 56754176000 | elapsed time per iteration (s): 0.15 | learning rate: 7.684E-05 | global batch size: 256 | lm loss: 3.689404E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.333 | TFLOPs: 26.04 | +7: iteration 108260/ 173500 | consumed samples: 27714560 | consumed tokens: 56759418880 | elapsed time per iteration (s): 0.15 | learning rate: 7.682E-05 | global batch size: 256 | lm loss: 3.698983E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.497 | TFLOPs: 25.95 | +7: iteration 108270/ 173500 | consumed samples: 27717120 | consumed tokens: 56764661760 | elapsed time per iteration (s): 0.15 | learning rate: 7.681E-05 | global batch size: 256 | lm loss: 3.703396E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.916 | TFLOPs: 26.05 | +7: iteration 108280/ 173500 | consumed samples: 27719680 | consumed tokens: 56769904640 | elapsed time per iteration (s): 0.15 | learning rate: 7.679E-05 | global batch size: 256 | lm loss: 3.678662E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.989 | TFLOPs: 26.08 | +7: iteration 108290/ 173500 | consumed samples: 27722240 | consumed tokens: 56775147520 | elapsed time per iteration (s): 0.15 | learning rate: 7.678E-05 | global batch size: 256 | lm loss: 3.696242E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.010 | TFLOPs: 26.10 | +7: iteration 108300/ 173500 | consumed samples: 27724800 | consumed tokens: 56780390400 | elapsed time per iteration (s): 0.15 | learning rate: 7.676E-05 | global batch size: 256 | lm loss: 3.688587E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.131 | TFLOPs: 26.10 | +7: iteration 108310/ 173500 | consumed samples: 27727360 | consumed tokens: 56785633280 | elapsed time per iteration (s): 0.15 | learning rate: 7.675E-05 | global batch size: 256 | lm loss: 3.680007E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.834 | TFLOPs: 26.06 | +7: iteration 108320/ 173500 | consumed samples: 27729920 | consumed tokens: 56790876160 | elapsed time per iteration (s): 0.15 | learning rate: 7.673E-05 | global batch size: 256 | lm loss: 3.668489E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.350 | TFLOPs: 26.07 | +7: iteration 108330/ 173500 | consumed samples: 27732480 | consumed tokens: 56796119040 | elapsed time per iteration (s): 0.15 | learning rate: 7.672E-05 | global batch size: 256 | lm loss: 3.689040E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.750 | TFLOPs: 26.06 | +7: iteration 108340/ 173500 | consumed samples: 27735040 | consumed tokens: 56801361920 | elapsed time per iteration (s): 0.15 | learning rate: 7.670E-05 | global batch size: 256 | lm loss: 3.700903E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.686 | TFLOPs: 26.08 | +7: iteration 108350/ 173500 | consumed samples: 27737600 | consumed tokens: 56806604800 | elapsed time per iteration (s): 0.15 | learning rate: 7.668E-05 | global batch size: 256 | lm loss: 3.697596E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.261 | TFLOPs: 26.07 | +7: iteration 108360/ 173500 | consumed samples: 27740160 | consumed tokens: 56811847680 | elapsed time per iteration (s): 0.15 | learning rate: 7.667E-05 | global batch size: 256 | lm loss: 3.686374E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.210 | TFLOPs: 26.08 | +7: iteration 108370/ 173500 | consumed samples: 27742720 | consumed tokens: 56817090560 | elapsed time per iteration (s): 0.15 | learning rate: 7.665E-05 | global batch size: 256 | lm loss: 3.682580E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.590 | TFLOPs: 26.07 | +7: iteration 108380/ 173500 | consumed samples: 27745280 | consumed tokens: 56822333440 | elapsed time per iteration (s): 0.15 | learning rate: 7.664E-05 | global batch size: 256 | lm loss: 3.683475E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.393 | TFLOPs: 26.09 | +7: iteration 108390/ 173500 | consumed samples: 27747840 | consumed tokens: 56827576320 | elapsed time per iteration (s): 0.15 | learning rate: 7.662E-05 | global batch size: 256 | lm loss: 3.682790E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.226 | TFLOPs: 26.08 | +7: iteration 108400/ 173500 | consumed samples: 27750400 | consumed tokens: 56832819200 | elapsed time per iteration (s): 0.15 | learning rate: 7.661E-05 | global batch size: 256 | lm loss: 3.685346E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.289 | TFLOPs: 26.10 | +7: iteration 108410/ 173500 | consumed samples: 27752960 | consumed tokens: 56838062080 | elapsed time per iteration (s): 0.16 | learning rate: 7.659E-05 | global batch size: 256 | lm loss: 3.690282E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.421 | TFLOPs: 25.69 | +7: iteration 108420/ 173500 | consumed samples: 27755520 | consumed tokens: 56843304960 | elapsed time per iteration (s): 0.16 | learning rate: 7.658E-05 | global batch size: 256 | lm loss: 3.675990E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.843 | TFLOPs: 24.76 | +7: iteration 108430/ 173500 | consumed samples: 27758080 | consumed tokens: 56848547840 | elapsed time per iteration (s): 0.15 | learning rate: 7.656E-05 | global batch size: 256 | lm loss: 3.675988E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.500 | TFLOPs: 26.06 | +7: iteration 108440/ 173500 | consumed samples: 27760640 | consumed tokens: 56853790720 | elapsed time per iteration (s): 0.15 | learning rate: 7.655E-05 | global batch size: 256 | lm loss: 3.675786E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.812 | TFLOPs: 26.00 | +7: iteration 108450/ 173500 | consumed samples: 27763200 | consumed tokens: 56859033600 | elapsed time per iteration (s): 0.15 | learning rate: 7.653E-05 | global batch size: 256 | lm loss: 3.692936E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.649 | TFLOPs: 26.04 | +7: iteration 108460/ 173500 | consumed samples: 27765760 | consumed tokens: 56864276480 | elapsed time per iteration (s): 0.15 | learning rate: 7.652E-05 | global batch size: 256 | lm loss: 3.692584E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.002 | TFLOPs: 26.05 | +7: iteration 108470/ 173500 | consumed samples: 27768320 | consumed tokens: 56869519360 | elapsed time per iteration (s): 0.15 | learning rate: 7.650E-05 | global batch size: 256 | lm loss: 3.681854E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.876 | TFLOPs: 26.14 | +7: iteration 108480/ 173500 | consumed samples: 27770880 | consumed tokens: 56874762240 | elapsed time per iteration (s): 0.15 | learning rate: 7.649E-05 | global batch size: 256 | lm loss: 3.663222E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.243 | TFLOPs: 26.15 | +7: iteration 108490/ 173500 | consumed samples: 27773440 | consumed tokens: 56880005120 | elapsed time per iteration (s): 0.15 | learning rate: 7.647E-05 | global batch size: 256 | lm loss: 3.676704E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.559 | TFLOPs: 26.17 | +7: iteration 108500/ 173500 | consumed samples: 27776000 | consumed tokens: 56885248000 | elapsed time per iteration (s): 0.15 | learning rate: 7.646E-05 | global batch size: 256 | lm loss: 3.686066E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.795 | TFLOPs: 26.17 | +7: iteration 108510/ 173500 | consumed samples: 27778560 | consumed tokens: 56890490880 | elapsed time per iteration (s): 0.17 | learning rate: 7.644E-05 | global batch size: 256 | lm loss: 3.689473E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.964 | TFLOPs: 24.18 | +7: iteration 108520/ 173500 | consumed samples: 27781120 | consumed tokens: 56895733760 | elapsed time per iteration (s): 0.15 | learning rate: 7.642E-05 | global batch size: 256 | lm loss: 3.682723E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.930 | TFLOPs: 26.17 | +7: iteration 108530/ 173500 | consumed samples: 27783680 | consumed tokens: 56900976640 | elapsed time per iteration (s): 0.16 | learning rate: 7.641E-05 | global batch size: 256 | lm loss: 3.683642E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.000 | TFLOPs: 25.77 | +7: iteration 108540/ 173500 | consumed samples: 27786240 | consumed tokens: 56906219520 | elapsed time per iteration (s): 0.15 | learning rate: 7.639E-05 | global batch size: 256 | lm loss: 3.685682E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.069 | TFLOPs: 26.18 | +7: iteration 108550/ 173500 | consumed samples: 27788800 | consumed tokens: 56911462400 | elapsed time per iteration (s): 0.15 | learning rate: 7.638E-05 | global batch size: 256 | lm loss: 3.676656E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.936 | TFLOPs: 26.17 | +7: iteration 108560/ 173500 | consumed samples: 27791360 | consumed tokens: 56916705280 | elapsed time per iteration (s): 0.15 | learning rate: 7.636E-05 | global batch size: 256 | lm loss: 3.667434E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.837 | TFLOPs: 26.20 | +7: iteration 108570/ 173500 | consumed samples: 27793920 | consumed tokens: 56921948160 | elapsed time per iteration (s): 0.15 | learning rate: 7.635E-05 | global batch size: 256 | lm loss: 3.686413E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.971 | TFLOPs: 26.19 | +7: iteration 108580/ 173500 | consumed samples: 27796480 | consumed tokens: 56927191040 | elapsed time per iteration (s): 0.16 | learning rate: 7.633E-05 | global batch size: 256 | lm loss: 3.681371E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.518 | TFLOPs: 24.90 | +7: iteration 108590/ 173500 | consumed samples: 27799040 | consumed tokens: 56932433920 | elapsed time per iteration (s): 0.15 | learning rate: 7.632E-05 | global batch size: 256 | lm loss: 3.699653E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.354 | TFLOPs: 26.15 | +7: iteration 108600/ 173500 | consumed samples: 27801600 | consumed tokens: 56937676800 | elapsed time per iteration (s): 0.15 | learning rate: 7.630E-05 | global batch size: 256 | lm loss: 3.695240E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.734 | TFLOPs: 25.97 | +7: iteration 108610/ 173500 | consumed samples: 27804160 | consumed tokens: 56942919680 | elapsed time per iteration (s): 0.15 | learning rate: 7.629E-05 | global batch size: 256 | lm loss: 3.687188E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.606 | TFLOPs: 26.17 | +7: iteration 108620/ 173500 | consumed samples: 27806720 | consumed tokens: 56948162560 | elapsed time per iteration (s): 0.15 | learning rate: 7.627E-05 | global batch size: 256 | lm loss: 3.685521E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.069 | TFLOPs: 26.16 | +7: iteration 108630/ 173500 | consumed samples: 27809280 | consumed tokens: 56953405440 | elapsed time per iteration (s): 0.15 | learning rate: 7.626E-05 | global batch size: 256 | lm loss: 3.681066E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.718 | TFLOPs: 26.12 | +7: iteration 108640/ 173500 | consumed samples: 27811840 | consumed tokens: 56958648320 | elapsed time per iteration (s): 0.15 | learning rate: 7.624E-05 | global batch size: 256 | lm loss: 3.677554E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.204 | TFLOPs: 26.15 | +7: iteration 108650/ 173500 | consumed samples: 27814400 | consumed tokens: 56963891200 | elapsed time per iteration (s): 0.15 | learning rate: 7.623E-05 | global batch size: 256 | lm loss: 3.684571E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.378 | TFLOPs: 26.18 | +7: iteration 108660/ 173500 | consumed samples: 27816960 | consumed tokens: 56969134080 | elapsed time per iteration (s): 0.15 | learning rate: 7.621E-05 | global batch size: 256 | lm loss: 3.688106E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.093 | TFLOPs: 26.19 | +7: iteration 108670/ 173500 | consumed samples: 27819520 | consumed tokens: 56974376960 | elapsed time per iteration (s): 0.15 | learning rate: 7.620E-05 | global batch size: 256 | lm loss: 3.694751E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.189 | TFLOPs: 26.21 | +7: iteration 108680/ 173500 | consumed samples: 27822080 | consumed tokens: 56979619840 | elapsed time per iteration (s): 0.15 | learning rate: 7.618E-05 | global batch size: 256 | lm loss: 3.689017E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.065 | TFLOPs: 26.22 | +7: iteration 108690/ 173500 | consumed samples: 27824640 | consumed tokens: 56984862720 | elapsed time per iteration (s): 0.15 | learning rate: 7.617E-05 | global batch size: 256 | lm loss: 3.691399E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.857 | TFLOPs: 26.22 | +7: iteration 108700/ 173500 | consumed samples: 27827200 | consumed tokens: 56990105600 | elapsed time per iteration (s): 0.15 | learning rate: 7.615E-05 | global batch size: 256 | lm loss: 3.687733E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.726 | TFLOPs: 26.17 | +7: iteration 108710/ 173500 | consumed samples: 27829760 | consumed tokens: 56995348480 | elapsed time per iteration (s): 0.15 | learning rate: 7.613E-05 | global batch size: 256 | lm loss: 3.680119E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.022 | TFLOPs: 26.17 | +7: iteration 108720/ 173500 | consumed samples: 27832320 | consumed tokens: 57000591360 | elapsed time per iteration (s): 0.15 | learning rate: 7.612E-05 | global batch size: 256 | lm loss: 3.675285E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.301 | TFLOPs: 26.19 | +7: iteration 108730/ 173500 | consumed samples: 27834880 | consumed tokens: 57005834240 | elapsed time per iteration (s): 0.15 | learning rate: 7.610E-05 | global batch size: 256 | lm loss: 3.680388E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.254 | TFLOPs: 26.18 | +7: iteration 108740/ 173500 | consumed samples: 27837440 | consumed tokens: 57011077120 | elapsed time per iteration (s): 0.15 | learning rate: 7.609E-05 | global batch size: 256 | lm loss: 3.684762E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.426 | TFLOPs: 26.20 | +7: iteration 108750/ 173500 | consumed samples: 27840000 | consumed tokens: 57016320000 | elapsed time per iteration (s): 0.15 | learning rate: 7.607E-05 | global batch size: 256 | lm loss: 3.684724E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.833 | TFLOPs: 26.16 | +7: iteration 108760/ 173500 | consumed samples: 27842560 | consumed tokens: 57021562880 | elapsed time per iteration (s): 0.15 | learning rate: 7.606E-05 | global batch size: 256 | lm loss: 3.687616E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.738 | TFLOPs: 26.14 | +7: iteration 108770/ 173500 | consumed samples: 27845120 | consumed tokens: 57026805760 | elapsed time per iteration (s): 0.15 | learning rate: 7.604E-05 | global batch size: 256 | lm loss: 3.675212E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.117 | TFLOPs: 26.13 | +7: iteration 108780/ 173500 | consumed samples: 27847680 | consumed tokens: 57032048640 | elapsed time per iteration (s): 0.15 | learning rate: 7.603E-05 | global batch size: 256 | lm loss: 3.676041E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.629 | TFLOPs: 26.14 | +7: iteration 108790/ 173500 | consumed samples: 27850240 | consumed tokens: 57037291520 | elapsed time per iteration (s): 0.15 | learning rate: 7.601E-05 | global batch size: 256 | lm loss: 3.682314E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.193 | TFLOPs: 26.16 | +7: iteration 108800/ 173500 | consumed samples: 27852800 | consumed tokens: 57042534400 | elapsed time per iteration (s): 0.15 | learning rate: 7.600E-05 | global batch size: 256 | lm loss: 3.680603E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.794 | TFLOPs: 26.22 | +7: iteration 108810/ 173500 | consumed samples: 27855360 | consumed tokens: 57047777280 | elapsed time per iteration (s): 0.15 | learning rate: 7.598E-05 | global batch size: 256 | lm loss: 3.684727E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.665 | TFLOPs: 26.20 | +7: iteration 108820/ 173500 | consumed samples: 27857920 | consumed tokens: 57053020160 | elapsed time per iteration (s): 0.15 | learning rate: 7.597E-05 | global batch size: 256 | lm loss: 3.698455E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.571 | TFLOPs: 26.18 | +7: iteration 108830/ 173500 | consumed samples: 27860480 | consumed tokens: 57058263040 | elapsed time per iteration (s): 0.15 | learning rate: 7.595E-05 | global batch size: 256 | lm loss: 3.676868E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.819 | TFLOPs: 26.19 | +7: iteration 108840/ 173500 | consumed samples: 27863040 | consumed tokens: 57063505920 | elapsed time per iteration (s): 0.15 | learning rate: 7.594E-05 | global batch size: 256 | lm loss: 3.687840E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.580 | TFLOPs: 26.20 | +7: iteration 108850/ 173500 | consumed samples: 27865600 | consumed tokens: 57068748800 | elapsed time per iteration (s): 0.15 | learning rate: 7.592E-05 | global batch size: 256 | lm loss: 3.706062E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.348 | TFLOPs: 26.20 | +7: iteration 108860/ 173500 | consumed samples: 27868160 | consumed tokens: 57073991680 | elapsed time per iteration (s): 0.15 | learning rate: 7.591E-05 | global batch size: 256 | lm loss: 3.677039E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.461 | TFLOPs: 26.21 | +7: iteration 108870/ 173500 | consumed samples: 27870720 | consumed tokens: 57079234560 | elapsed time per iteration (s): 0.15 | learning rate: 7.589E-05 | global batch size: 256 | lm loss: 3.701633E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.748 | TFLOPs: 26.22 | +7: iteration 108880/ 173500 | consumed samples: 27873280 | consumed tokens: 57084477440 | elapsed time per iteration (s): 0.16 | learning rate: 7.588E-05 | global batch size: 256 | lm loss: 3.684029E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.007 | TFLOPs: 25.81 | +7: iteration 108890/ 173500 | consumed samples: 27875840 | consumed tokens: 57089720320 | elapsed time per iteration (s): 0.15 | learning rate: 7.586E-05 | global batch size: 256 | lm loss: 3.679376E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.211 | TFLOPs: 26.18 | +7: iteration 108900/ 173500 | consumed samples: 27878400 | consumed tokens: 57094963200 | elapsed time per iteration (s): 0.15 | learning rate: 7.585E-05 | global batch size: 256 | lm loss: 3.679980E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.372 | TFLOPs: 26.18 | +7: iteration 108910/ 173500 | consumed samples: 27880960 | consumed tokens: 57100206080 | elapsed time per iteration (s): 0.15 | learning rate: 7.583E-05 | global batch size: 256 | lm loss: 3.693252E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.839 | TFLOPs: 26.19 | +7: iteration 108920/ 173500 | consumed samples: 27883520 | consumed tokens: 57105448960 | elapsed time per iteration (s): 0.15 | learning rate: 7.581E-05 | global batch size: 256 | lm loss: 3.693737E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.983 | TFLOPs: 26.16 | +7: iteration 108930/ 173500 | consumed samples: 27886080 | consumed tokens: 57110691840 | elapsed time per iteration (s): 0.15 | learning rate: 7.580E-05 | global batch size: 256 | lm loss: 3.694519E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.201 | TFLOPs: 26.18 | +7: iteration 108940/ 173500 | consumed samples: 27888640 | consumed tokens: 57115934720 | elapsed time per iteration (s): 0.15 | learning rate: 7.578E-05 | global batch size: 256 | lm loss: 3.692032E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.973 | TFLOPs: 26.19 | +7: iteration 108950/ 173500 | consumed samples: 27891200 | consumed tokens: 57121177600 | elapsed time per iteration (s): 0.15 | learning rate: 7.577E-05 | global batch size: 256 | lm loss: 3.683347E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.342 | TFLOPs: 26.18 | +7: iteration 108960/ 173500 | consumed samples: 27893760 | consumed tokens: 57126420480 | elapsed time per iteration (s): 0.15 | learning rate: 7.575E-05 | global batch size: 256 | lm loss: 3.691013E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.997 | TFLOPs: 26.14 | +7: iteration 108970/ 173500 | consumed samples: 27896320 | consumed tokens: 57131663360 | elapsed time per iteration (s): 0.16 | learning rate: 7.574E-05 | global batch size: 256 | lm loss: 3.691543E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.274 | TFLOPs: 25.85 | +7: iteration 108980/ 173500 | consumed samples: 27898880 | consumed tokens: 57136906240 | elapsed time per iteration (s): 0.15 | learning rate: 7.572E-05 | global batch size: 256 | lm loss: 3.696774E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.145 | TFLOPs: 26.19 | +7: iteration 108990/ 173500 | consumed samples: 27901440 | consumed tokens: 57142149120 | elapsed time per iteration (s): 0.15 | learning rate: 7.571E-05 | global batch size: 256 | lm loss: 3.686098E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.600 | TFLOPs: 26.18 | +7: iteration 109000/ 173500 | consumed samples: 27904000 | consumed tokens: 57147392000 | elapsed time per iteration (s): 0.15 | learning rate: 7.569E-05 | global batch size: 256 | lm loss: 3.681726E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.376 | TFLOPs: 26.18 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 109000 | lm loss value: 3.832494E+00 | lm loss PPL: 4.617754E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 109000 to checkpoints_44m91b100m +0: [2023-03-17 04:59:07,357] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step109000 is begin to save! +0: [2023-03-17 04:59:07,361] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_01-model_00-model_states.pt... +0: [2023-03-17 04:59:07,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_01-model_00-model_states.pt. +0: [2023-03-17 04:59:07,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_03-model_00-model_states.pt... +0: [2023-03-17 04:59:07,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_03-model_00-model_states.pt. +0: [2023-03-17 04:59:07,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_04-model_00-model_states.pt... +0: [2023-03-17 04:59:07,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_04-model_00-model_states.pt. +0: [2023-03-17 04:59:07,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_05-model_00-model_states.pt... +0: [2023-03-17 04:59:07,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_05-model_00-model_states.pt. +0: [2023-03-17 04:59:07,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_06-model_00-model_states.pt... +0: [2023-03-17 04:59:07,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_06-model_00-model_states.pt. +0: [2023-03-17 04:59:07,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_07-model_00-model_states.pt... +0: [2023-03-17 04:59:07,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_07-model_00-model_states.pt. +0: [2023-03-17 04:59:07,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_08-model_00-model_states.pt... +0: [2023-03-17 04:59:07,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_08-model_00-model_states.pt. +0: [2023-03-17 04:59:07,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_09-model_00-model_states.pt... +0: [2023-03-17 04:59:07,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_09-model_00-model_states.pt. +0: [2023-03-17 04:59:07,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_10-model_00-model_states.pt... +0: [2023-03-17 04:59:07,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_10-model_00-model_states.pt. +0: [2023-03-17 04:59:07,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/layer_12-model_00-model_states.pt... +0: [2023-03-17 04:59:07,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/layer_12-model_00-model_states.pt. +0: [2023-03-17 04:59:07,497] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step109000/mp_rank_00_model_states.pt +0: [2023-03-17 04:59:07,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/mp_rank_00_model_states.pt... +0: [2023-03-17 04:59:07,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/mp_rank_00_model_states.pt. +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 04:59:07,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 04:59:07,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:59:07,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +4: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 04:59:07,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 04:59:07,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +5: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +7: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +2: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +1: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +6: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +3: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 04:59:07,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step109000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 04:59:07,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step109000 is ready now! +0: successfully saved checkpoint at iteration 109000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 189.34 +7: iteration 109010/ 173500 | consumed samples: 27906560 | consumed tokens: 57152634880 | elapsed time per iteration (s): 0.18 | learning rate: 7.568E-05 | global batch size: 256 | lm loss: 3.678022E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1424.914 | TFLOPs: 22.35 | +7: iteration 109020/ 173500 | consumed samples: 27909120 | consumed tokens: 57157877760 | elapsed time per iteration (s): 0.15 | learning rate: 7.566E-05 | global batch size: 256 | lm loss: 3.689116E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.367 | TFLOPs: 26.16 | +7: iteration 109030/ 173500 | consumed samples: 27911680 | consumed tokens: 57163120640 | elapsed time per iteration (s): 0.15 | learning rate: 7.565E-05 | global batch size: 256 | lm loss: 3.675474E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.507 | TFLOPs: 26.18 | +7: iteration 109040/ 173500 | consumed samples: 27914240 | consumed tokens: 57168363520 | elapsed time per iteration (s): 0.15 | learning rate: 7.563E-05 | global batch size: 256 | lm loss: 3.694467E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.828 | TFLOPs: 26.17 | +7: iteration 109050/ 173500 | consumed samples: 27916800 | consumed tokens: 57173606400 | elapsed time per iteration (s): 0.15 | learning rate: 7.562E-05 | global batch size: 256 | lm loss: 3.688978E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.551 | TFLOPs: 26.17 | +7: iteration 109060/ 173500 | consumed samples: 27919360 | consumed tokens: 57178849280 | elapsed time per iteration (s): 0.15 | learning rate: 7.560E-05 | global batch size: 256 | lm loss: 3.680954E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.673 | TFLOPs: 26.17 | +7: iteration 109070/ 173500 | consumed samples: 27921920 | consumed tokens: 57184092160 | elapsed time per iteration (s): 0.16 | learning rate: 7.559E-05 | global batch size: 256 | lm loss: 3.672602E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.056 | TFLOPs: 25.64 | +7: iteration 109080/ 173500 | consumed samples: 27924480 | consumed tokens: 57189335040 | elapsed time per iteration (s): 0.15 | learning rate: 7.557E-05 | global batch size: 256 | lm loss: 3.681181E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.980 | TFLOPs: 26.16 | +7: iteration 109090/ 173500 | consumed samples: 27927040 | consumed tokens: 57194577920 | elapsed time per iteration (s): 0.15 | learning rate: 7.556E-05 | global batch size: 256 | lm loss: 3.672542E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.264 | TFLOPs: 26.15 | +7: iteration 109100/ 173500 | consumed samples: 27929600 | consumed tokens: 57199820800 | elapsed time per iteration (s): 0.15 | learning rate: 7.554E-05 | global batch size: 256 | lm loss: 3.690403E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.804 | TFLOPs: 26.16 | +7: iteration 109110/ 173500 | consumed samples: 27932160 | consumed tokens: 57205063680 | elapsed time per iteration (s): 0.15 | learning rate: 7.553E-05 | global batch size: 256 | lm loss: 3.687620E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.998 | TFLOPs: 26.14 | +7: iteration 109120/ 173500 | consumed samples: 27934720 | consumed tokens: 57210306560 | elapsed time per iteration (s): 0.19 | learning rate: 7.551E-05 | global batch size: 256 | lm loss: 3.659236E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.547 | TFLOPs: 21.37 | +7: iteration 109130/ 173500 | consumed samples: 27937280 | consumed tokens: 57215549440 | elapsed time per iteration (s): 0.16 | learning rate: 7.550E-05 | global batch size: 256 | lm loss: 3.683983E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.190 | TFLOPs: 24.80 | +7: iteration 109140/ 173500 | consumed samples: 27939840 | consumed tokens: 57220792320 | elapsed time per iteration (s): 0.16 | learning rate: 7.548E-05 | global batch size: 256 | lm loss: 3.690781E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.989 | TFLOPs: 25.86 | +7: iteration 109150/ 173500 | consumed samples: 27942400 | consumed tokens: 57226035200 | elapsed time per iteration (s): 0.15 | learning rate: 7.546E-05 | global batch size: 256 | lm loss: 3.689327E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.378 | TFLOPs: 26.01 | +7: iteration 109160/ 173500 | consumed samples: 27944960 | consumed tokens: 57231278080 | elapsed time per iteration (s): 0.16 | learning rate: 7.545E-05 | global batch size: 256 | lm loss: 3.696225E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.140 | TFLOPs: 24.80 | +7: iteration 109170/ 173500 | consumed samples: 27947520 | consumed tokens: 57236520960 | elapsed time per iteration (s): 0.16 | learning rate: 7.543E-05 | global batch size: 256 | lm loss: 3.678999E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.156 | TFLOPs: 25.60 | +7: iteration 109180/ 173500 | consumed samples: 27950080 | consumed tokens: 57241763840 | elapsed time per iteration (s): 0.16 | learning rate: 7.542E-05 | global batch size: 256 | lm loss: 3.675519E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.690 | TFLOPs: 25.53 | +7: iteration 109190/ 173500 | consumed samples: 27952640 | consumed tokens: 57247006720 | elapsed time per iteration (s): 0.16 | learning rate: 7.540E-05 | global batch size: 256 | lm loss: 3.687004E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.439 | TFLOPs: 25.32 | +7: iteration 109200/ 173500 | consumed samples: 27955200 | consumed tokens: 57252249600 | elapsed time per iteration (s): 0.16 | learning rate: 7.539E-05 | global batch size: 256 | lm loss: 3.684372E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.723 | TFLOPs: 24.66 | +7: iteration 109210/ 173500 | consumed samples: 27957760 | consumed tokens: 57257492480 | elapsed time per iteration (s): 0.16 | learning rate: 7.537E-05 | global batch size: 256 | lm loss: 3.680199E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.433 | TFLOPs: 25.38 | +7: iteration 109220/ 173500 | consumed samples: 27960320 | consumed tokens: 57262735360 | elapsed time per iteration (s): 0.16 | learning rate: 7.536E-05 | global batch size: 256 | lm loss: 3.676940E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.024 | TFLOPs: 25.48 | +7: iteration 109230/ 173500 | consumed samples: 27962880 | consumed tokens: 57267978240 | elapsed time per iteration (s): 0.16 | learning rate: 7.534E-05 | global batch size: 256 | lm loss: 3.669339E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.233 | TFLOPs: 25.63 | +7: iteration 109240/ 173500 | consumed samples: 27965440 | consumed tokens: 57273221120 | elapsed time per iteration (s): 0.16 | learning rate: 7.533E-05 | global batch size: 256 | lm loss: 3.689108E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.064 | TFLOPs: 25.38 | +7: iteration 109250/ 173500 | consumed samples: 27968000 | consumed tokens: 57278464000 | elapsed time per iteration (s): 0.16 | learning rate: 7.531E-05 | global batch size: 256 | lm loss: 3.683632E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.508 | TFLOPs: 25.24 | +7: iteration 109260/ 173500 | consumed samples: 27970560 | consumed tokens: 57283706880 | elapsed time per iteration (s): 0.15 | learning rate: 7.530E-05 | global batch size: 256 | lm loss: 3.686318E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.814 | TFLOPs: 26.14 | +7: iteration 109270/ 173500 | consumed samples: 27973120 | consumed tokens: 57288949760 | elapsed time per iteration (s): 0.16 | learning rate: 7.528E-05 | global batch size: 256 | lm loss: 3.672572E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.685 | TFLOPs: 24.65 | +7: iteration 109280/ 173500 | consumed samples: 27975680 | consumed tokens: 57294192640 | elapsed time per iteration (s): 0.16 | learning rate: 7.527E-05 | global batch size: 256 | lm loss: 3.672939E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.004 | TFLOPs: 24.98 | +7: iteration 109290/ 173500 | consumed samples: 27978240 | consumed tokens: 57299435520 | elapsed time per iteration (s): 0.16 | learning rate: 7.525E-05 | global batch size: 256 | lm loss: 3.683756E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.567 | TFLOPs: 24.96 | +7: iteration 109300/ 173500 | consumed samples: 27980800 | consumed tokens: 57304678400 | elapsed time per iteration (s): 0.16 | learning rate: 7.524E-05 | global batch size: 256 | lm loss: 3.698729E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.698 | TFLOPs: 25.12 | +7: iteration 109310/ 173500 | consumed samples: 27983360 | consumed tokens: 57309921280 | elapsed time per iteration (s): 0.16 | learning rate: 7.522E-05 | global batch size: 256 | lm loss: 3.696727E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.585 | TFLOPs: 24.57 | +7: iteration 109320/ 173500 | consumed samples: 27985920 | consumed tokens: 57315164160 | elapsed time per iteration (s): 0.16 | learning rate: 7.521E-05 | global batch size: 256 | lm loss: 3.696286E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.302 | TFLOPs: 25.21 | +7: iteration 109330/ 173500 | consumed samples: 27988480 | consumed tokens: 57320407040 | elapsed time per iteration (s): 0.16 | learning rate: 7.519E-05 | global batch size: 256 | lm loss: 3.689166E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.577 | TFLOPs: 25.23 | +7: iteration 109340/ 173500 | consumed samples: 27991040 | consumed tokens: 57325649920 | elapsed time per iteration (s): 0.15 | learning rate: 7.518E-05 | global batch size: 256 | lm loss: 3.687420E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.449 | TFLOPs: 26.10 | +7: iteration 109350/ 173500 | consumed samples: 27993600 | consumed tokens: 57330892800 | elapsed time per iteration (s): 0.16 | learning rate: 7.516E-05 | global batch size: 256 | lm loss: 3.688139E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.448 | TFLOPs: 25.27 | +7: iteration 109360/ 173500 | consumed samples: 27996160 | consumed tokens: 57336135680 | elapsed time per iteration (s): 0.16 | learning rate: 7.515E-05 | global batch size: 256 | lm loss: 3.688391E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.787 | TFLOPs: 25.14 | +7: iteration 109370/ 173500 | consumed samples: 27998720 | consumed tokens: 57341378560 | elapsed time per iteration (s): 0.16 | learning rate: 7.513E-05 | global batch size: 256 | lm loss: 3.698114E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.583 | TFLOPs: 25.73 | +7: iteration 109380/ 173500 | consumed samples: 28001280 | consumed tokens: 57346621440 | elapsed time per iteration (s): 0.16 | learning rate: 7.512E-05 | global batch size: 256 | lm loss: 3.683882E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.469 | TFLOPs: 25.40 | +7: iteration 109390/ 173500 | consumed samples: 28003840 | consumed tokens: 57351864320 | elapsed time per iteration (s): 0.16 | learning rate: 7.510E-05 | global batch size: 256 | lm loss: 3.682230E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.576 | TFLOPs: 25.20 | +7: iteration 109400/ 173500 | consumed samples: 28006400 | consumed tokens: 57357107200 | elapsed time per iteration (s): 0.16 | learning rate: 7.509E-05 | global batch size: 256 | lm loss: 3.679903E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.031 | TFLOPs: 25.34 | +7: iteration 109410/ 173500 | consumed samples: 28008960 | consumed tokens: 57362350080 | elapsed time per iteration (s): 0.16 | learning rate: 7.507E-05 | global batch size: 256 | lm loss: 3.679596E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.910 | TFLOPs: 25.23 | +7: iteration 109420/ 173500 | consumed samples: 28011520 | consumed tokens: 57367592960 | elapsed time per iteration (s): 0.16 | learning rate: 7.505E-05 | global batch size: 256 | lm loss: 3.678637E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.353 | TFLOPs: 25.30 | +7: iteration 109430/ 173500 | consumed samples: 28014080 | consumed tokens: 57372835840 | elapsed time per iteration (s): 0.16 | learning rate: 7.504E-05 | global batch size: 256 | lm loss: 3.696128E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.498 | TFLOPs: 25.44 | +7: iteration 109440/ 173500 | consumed samples: 28016640 | consumed tokens: 57378078720 | elapsed time per iteration (s): 0.16 | learning rate: 7.502E-05 | global batch size: 256 | lm loss: 3.685704E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.319 | TFLOPs: 25.55 | +7: iteration 109450/ 173500 | consumed samples: 28019200 | consumed tokens: 57383321600 | elapsed time per iteration (s): 0.16 | learning rate: 7.501E-05 | global batch size: 256 | lm loss: 3.698407E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.670 | TFLOPs: 25.29 | +7: iteration 109460/ 173500 | consumed samples: 28021760 | consumed tokens: 57388564480 | elapsed time per iteration (s): 0.16 | learning rate: 7.499E-05 | global batch size: 256 | lm loss: 3.681975E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.703 | TFLOPs: 25.13 | +7: iteration 109470/ 173500 | consumed samples: 28024320 | consumed tokens: 57393807360 | elapsed time per iteration (s): 0.15 | learning rate: 7.498E-05 | global batch size: 256 | lm loss: 3.685636E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.934 | TFLOPs: 25.94 | +7: iteration 109480/ 173500 | consumed samples: 28026880 | consumed tokens: 57399050240 | elapsed time per iteration (s): 0.16 | learning rate: 7.496E-05 | global batch size: 256 | lm loss: 3.677439E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.270 | TFLOPs: 24.80 | +7: iteration 109490/ 173500 | consumed samples: 28029440 | consumed tokens: 57404293120 | elapsed time per iteration (s): 0.16 | learning rate: 7.495E-05 | global batch size: 256 | lm loss: 3.676522E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.545 | TFLOPs: 25.81 | +7: iteration 109500/ 173500 | consumed samples: 28032000 | consumed tokens: 57409536000 | elapsed time per iteration (s): 0.15 | learning rate: 7.493E-05 | global batch size: 256 | lm loss: 3.678809E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.083 | TFLOPs: 26.08 | +7: iteration 109510/ 173500 | consumed samples: 28034560 | consumed tokens: 57414778880 | elapsed time per iteration (s): 0.16 | learning rate: 7.492E-05 | global batch size: 256 | lm loss: 3.690152E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.244 | TFLOPs: 25.82 | +7: iteration 109520/ 173500 | consumed samples: 28037120 | consumed tokens: 57420021760 | elapsed time per iteration (s): 0.16 | learning rate: 7.490E-05 | global batch size: 256 | lm loss: 3.689768E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.055 | TFLOPs: 25.17 | +7: iteration 109530/ 173500 | consumed samples: 28039680 | consumed tokens: 57425264640 | elapsed time per iteration (s): 0.16 | learning rate: 7.489E-05 | global batch size: 256 | lm loss: 3.682679E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.769 | TFLOPs: 25.64 | +7: iteration 109540/ 173500 | consumed samples: 28042240 | consumed tokens: 57430507520 | elapsed time per iteration (s): 0.15 | learning rate: 7.487E-05 | global batch size: 256 | lm loss: 3.695960E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.267 | TFLOPs: 26.12 | +7: iteration 109550/ 173500 | consumed samples: 28044800 | consumed tokens: 57435750400 | elapsed time per iteration (s): 0.16 | learning rate: 7.486E-05 | global batch size: 256 | lm loss: 3.682807E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.911 | TFLOPs: 25.31 | +7: iteration 109560/ 173500 | consumed samples: 28047360 | consumed tokens: 57440993280 | elapsed time per iteration (s): 0.16 | learning rate: 7.484E-05 | global batch size: 256 | lm loss: 3.688624E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.914 | TFLOPs: 24.95 | +7: iteration 109570/ 173500 | consumed samples: 28049920 | consumed tokens: 57446236160 | elapsed time per iteration (s): 0.16 | learning rate: 7.483E-05 | global batch size: 256 | lm loss: 3.667574E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.172 | TFLOPs: 25.75 | +7: iteration 109580/ 173500 | consumed samples: 28052480 | consumed tokens: 57451479040 | elapsed time per iteration (s): 0.16 | learning rate: 7.481E-05 | global batch size: 256 | lm loss: 3.683312E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.447 | TFLOPs: 25.38 | +7: iteration 109590/ 173500 | consumed samples: 28055040 | consumed tokens: 57456721920 | elapsed time per iteration (s): 0.16 | learning rate: 7.480E-05 | global batch size: 256 | lm loss: 3.707289E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.256 | TFLOPs: 25.88 | +7: iteration 109600/ 173500 | consumed samples: 28057600 | consumed tokens: 57461964800 | elapsed time per iteration (s): 0.16 | learning rate: 7.478E-05 | global batch size: 256 | lm loss: 3.675197E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.026 | TFLOPs: 25.78 | +7: iteration 109610/ 173500 | consumed samples: 28060160 | consumed tokens: 57467207680 | elapsed time per iteration (s): 0.15 | learning rate: 7.477E-05 | global batch size: 256 | lm loss: 3.689539E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.138 | TFLOPs: 26.18 | +7: iteration 109620/ 173500 | consumed samples: 28062720 | consumed tokens: 57472450560 | elapsed time per iteration (s): 0.15 | learning rate: 7.475E-05 | global batch size: 256 | lm loss: 3.689014E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.849 | TFLOPs: 26.17 | +7: iteration 109630/ 173500 | consumed samples: 28065280 | consumed tokens: 57477693440 | elapsed time per iteration (s): 0.15 | learning rate: 7.474E-05 | global batch size: 256 | lm loss: 3.682910E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.437 | TFLOPs: 26.15 | +7: iteration 109640/ 173500 | consumed samples: 28067840 | consumed tokens: 57482936320 | elapsed time per iteration (s): 0.16 | learning rate: 7.472E-05 | global batch size: 256 | lm loss: 3.688173E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.157 | TFLOPs: 25.60 | +7: iteration 109650/ 173500 | consumed samples: 28070400 | consumed tokens: 57488179200 | elapsed time per iteration (s): 0.16 | learning rate: 7.471E-05 | global batch size: 256 | lm loss: 3.695844E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.710 | TFLOPs: 25.60 | +7: iteration 109660/ 173500 | consumed samples: 28072960 | consumed tokens: 57493422080 | elapsed time per iteration (s): 0.16 | learning rate: 7.469E-05 | global batch size: 256 | lm loss: 3.693194E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.258 | TFLOPs: 25.83 | +7: iteration 109670/ 173500 | consumed samples: 28075520 | consumed tokens: 57498664960 | elapsed time per iteration (s): 0.15 | learning rate: 7.468E-05 | global batch size: 256 | lm loss: 3.694205E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.916 | TFLOPs: 26.20 | +7: iteration 109680/ 173500 | consumed samples: 28078080 | consumed tokens: 57503907840 | elapsed time per iteration (s): 0.16 | learning rate: 7.466E-05 | global batch size: 256 | lm loss: 3.687317E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.344 | TFLOPs: 25.87 | +7: iteration 109690/ 173500 | consumed samples: 28080640 | consumed tokens: 57509150720 | elapsed time per iteration (s): 0.16 | learning rate: 7.465E-05 | global batch size: 256 | lm loss: 3.688170E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.324 | TFLOPs: 25.85 | +7: iteration 109700/ 173500 | consumed samples: 28083200 | consumed tokens: 57514393600 | elapsed time per iteration (s): 0.15 | learning rate: 7.463E-05 | global batch size: 256 | lm loss: 3.689488E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.115 | TFLOPs: 26.21 | +7: iteration 109710/ 173500 | consumed samples: 28085760 | consumed tokens: 57519636480 | elapsed time per iteration (s): 0.16 | learning rate: 7.462E-05 | global batch size: 256 | lm loss: 3.667246E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.740 | TFLOPs: 25.48 | +7: iteration 109720/ 173500 | consumed samples: 28088320 | consumed tokens: 57524879360 | elapsed time per iteration (s): 0.16 | learning rate: 7.460E-05 | global batch size: 256 | lm loss: 3.685634E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.333 | TFLOPs: 25.80 | +7: iteration 109730/ 173500 | consumed samples: 28090880 | consumed tokens: 57530122240 | elapsed time per iteration (s): 0.15 | learning rate: 7.459E-05 | global batch size: 256 | lm loss: 3.686586E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.114 | TFLOPs: 26.18 | +7: iteration 109740/ 173500 | consumed samples: 28093440 | consumed tokens: 57535365120 | elapsed time per iteration (s): 0.15 | learning rate: 7.457E-05 | global batch size: 256 | lm loss: 3.689965E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.543 | TFLOPs: 26.20 | +7: iteration 109750/ 173500 | consumed samples: 28096000 | consumed tokens: 57540608000 | elapsed time per iteration (s): 0.16 | learning rate: 7.455E-05 | global batch size: 256 | lm loss: 3.682039E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.399 | TFLOPs: 25.73 | +7: iteration 109760/ 173500 | consumed samples: 28098560 | consumed tokens: 57545850880 | elapsed time per iteration (s): 0.15 | learning rate: 7.454E-05 | global batch size: 256 | lm loss: 3.689209E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.656 | TFLOPs: 26.25 | +7: iteration 109770/ 173500 | consumed samples: 28101120 | consumed tokens: 57551093760 | elapsed time per iteration (s): 0.15 | learning rate: 7.452E-05 | global batch size: 256 | lm loss: 3.684229E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.485 | TFLOPs: 26.23 | +7: iteration 109780/ 173500 | consumed samples: 28103680 | consumed tokens: 57556336640 | elapsed time per iteration (s): 0.15 | learning rate: 7.451E-05 | global batch size: 256 | lm loss: 3.668328E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.172 | TFLOPs: 26.24 | +7: iteration 109790/ 173500 | consumed samples: 28106240 | consumed tokens: 57561579520 | elapsed time per iteration (s): 0.15 | learning rate: 7.449E-05 | global batch size: 256 | lm loss: 3.687272E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.797 | TFLOPs: 25.92 | +7: iteration 109800/ 173500 | consumed samples: 28108800 | consumed tokens: 57566822400 | elapsed time per iteration (s): 0.15 | learning rate: 7.448E-05 | global batch size: 256 | lm loss: 3.668386E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.742 | TFLOPs: 26.19 | +7: iteration 109810/ 173500 | consumed samples: 28111360 | consumed tokens: 57572065280 | elapsed time per iteration (s): 0.15 | learning rate: 7.446E-05 | global batch size: 256 | lm loss: 3.692547E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.876 | TFLOPs: 26.19 | +7: iteration 109820/ 173500 | consumed samples: 28113920 | consumed tokens: 57577308160 | elapsed time per iteration (s): 0.15 | learning rate: 7.445E-05 | global batch size: 256 | lm loss: 3.682011E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.308 | TFLOPs: 26.23 | +7: iteration 109830/ 173500 | consumed samples: 28116480 | consumed tokens: 57582551040 | elapsed time per iteration (s): 0.15 | learning rate: 7.443E-05 | global batch size: 256 | lm loss: 3.676458E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.712 | TFLOPs: 26.20 | +7: iteration 109840/ 173500 | consumed samples: 28119040 | consumed tokens: 57587793920 | elapsed time per iteration (s): 0.15 | learning rate: 7.442E-05 | global batch size: 256 | lm loss: 3.684850E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.097 | TFLOPs: 26.24 | +7: iteration 109850/ 173500 | consumed samples: 28121600 | consumed tokens: 57593036800 | elapsed time per iteration (s): 0.15 | learning rate: 7.440E-05 | global batch size: 256 | lm loss: 3.688423E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.242 | TFLOPs: 26.21 | +7: iteration 109860/ 173500 | consumed samples: 28124160 | consumed tokens: 57598279680 | elapsed time per iteration (s): 0.16 | learning rate: 7.439E-05 | global batch size: 256 | lm loss: 3.683417E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.354 | TFLOPs: 25.47 | +7: iteration 109870/ 173500 | consumed samples: 28126720 | consumed tokens: 57603522560 | elapsed time per iteration (s): 0.15 | learning rate: 7.437E-05 | global batch size: 256 | lm loss: 3.678205E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.173 | TFLOPs: 26.21 | +7: iteration 109880/ 173500 | consumed samples: 28129280 | consumed tokens: 57608765440 | elapsed time per iteration (s): 0.15 | learning rate: 7.436E-05 | global batch size: 256 | lm loss: 3.693126E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.790 | TFLOPs: 26.00 | +7: iteration 109890/ 173500 | consumed samples: 28131840 | consumed tokens: 57614008320 | elapsed time per iteration (s): 0.16 | learning rate: 7.434E-05 | global batch size: 256 | lm loss: 3.671417E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.242 | TFLOPs: 25.77 | +7: iteration 109900/ 173500 | consumed samples: 28134400 | consumed tokens: 57619251200 | elapsed time per iteration (s): 0.15 | learning rate: 7.433E-05 | global batch size: 256 | lm loss: 3.674350E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.222 | TFLOPs: 26.22 | +7: iteration 109910/ 173500 | consumed samples: 28136960 | consumed tokens: 57624494080 | elapsed time per iteration (s): 0.15 | learning rate: 7.431E-05 | global batch size: 256 | lm loss: 3.686682E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.705 | TFLOPs: 26.23 | +7: iteration 109920/ 173500 | consumed samples: 28139520 | consumed tokens: 57629736960 | elapsed time per iteration (s): 0.15 | learning rate: 7.430E-05 | global batch size: 256 | lm loss: 3.676166E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.708 | TFLOPs: 26.22 | +7: iteration 109930/ 173500 | consumed samples: 28142080 | consumed tokens: 57634979840 | elapsed time per iteration (s): 0.16 | learning rate: 7.428E-05 | global batch size: 256 | lm loss: 3.675386E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.938 | TFLOPs: 25.81 | +7: iteration 109940/ 173500 | consumed samples: 28144640 | consumed tokens: 57640222720 | elapsed time per iteration (s): 0.15 | learning rate: 7.427E-05 | global batch size: 256 | lm loss: 3.690181E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.881 | TFLOPs: 26.20 | +7: iteration 109950/ 173500 | consumed samples: 28147200 | consumed tokens: 57645465600 | elapsed time per iteration (s): 0.16 | learning rate: 7.425E-05 | global batch size: 256 | lm loss: 3.691742E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.717 | TFLOPs: 25.82 | +7: iteration 109960/ 173500 | consumed samples: 28149760 | consumed tokens: 57650708480 | elapsed time per iteration (s): 0.15 | learning rate: 7.424E-05 | global batch size: 256 | lm loss: 3.677525E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.880 | TFLOPs: 26.20 | +7: iteration 109970/ 173500 | consumed samples: 28152320 | consumed tokens: 57655951360 | elapsed time per iteration (s): 0.15 | learning rate: 7.422E-05 | global batch size: 256 | lm loss: 3.675027E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.125 | TFLOPs: 26.22 | +7: iteration 109980/ 173500 | consumed samples: 28154880 | consumed tokens: 57661194240 | elapsed time per iteration (s): 0.15 | learning rate: 7.421E-05 | global batch size: 256 | lm loss: 3.686514E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.408 | TFLOPs: 26.21 | +7: iteration 109990/ 173500 | consumed samples: 28157440 | consumed tokens: 57666437120 | elapsed time per iteration (s): 0.16 | learning rate: 7.419E-05 | global batch size: 256 | lm loss: 3.686345E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.862 | TFLOPs: 25.67 | +0: [2023-03-17 05:01:43,939] [INFO] [logging.py:68:log_dist] [Rank 0] step=110000, skipped=0, lr=[7.417709678812063e-05, 7.417709678812063e-05, 7.417709678812063e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 110000/ 173500 | consumed samples: 28160000 | consumed tokens: 57671680000 | elapsed time per iteration (s): 0.15 | learning rate: 7.418E-05 | global batch size: 256 | lm loss: 3.675821E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.631 | TFLOPs: 26.20 | +0: steps: 110000 loss: 3.6541 iter time (s): 0.154 samples/sec: 1659.177 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 110000 | lm loss value: 3.806343E+00 | lm loss PPL: 4.498562E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 110000 to checkpoints_44m91b100m +0: [2023-03-17 05:01:44,013] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step110000 is begin to save! +0: [2023-03-17 05:01:44,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:01:44,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:01:44,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:01:44,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:01:44,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:01:44,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:01:44,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:01:44,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:01:44,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:01:44,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:01:44,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:01:44,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:01:44,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:01:44,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:01:44,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:01:44,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:01:44,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:01:44,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:01:44,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:01:44,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:01:44,149] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step110000/mp_rank_00_model_states.pt +0: [2023-03-17 05:01:44,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:01:44,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:01:44,167] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:01:44,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:01:44,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +6: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +4: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:01:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 05:01:44,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:01:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +5: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +1: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:01:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 05:01:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +2: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +3: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:01:44,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +7: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:01:44,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step110000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:01:44,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step110000 is ready now! +0: successfully saved checkpoint at iteration 110000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.52 +7: iteration 110010/ 173500 | consumed samples: 28162560 | consumed tokens: 57676922880 | elapsed time per iteration (s): 0.18 | learning rate: 7.416E-05 | global batch size: 256 | lm loss: 3.680448E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1436.680 | TFLOPs: 22.53 | +7: iteration 110020/ 173500 | consumed samples: 28165120 | consumed tokens: 57682165760 | elapsed time per iteration (s): 0.15 | learning rate: 7.415E-05 | global batch size: 256 | lm loss: 3.677625E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.656 | TFLOPs: 26.18 | +7: iteration 110030/ 173500 | consumed samples: 28167680 | consumed tokens: 57687408640 | elapsed time per iteration (s): 0.15 | learning rate: 7.413E-05 | global batch size: 256 | lm loss: 3.688293E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.286 | TFLOPs: 26.19 | +7: iteration 110040/ 173500 | consumed samples: 28170240 | consumed tokens: 57692651520 | elapsed time per iteration (s): 0.15 | learning rate: 7.412E-05 | global batch size: 256 | lm loss: 3.682252E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.065 | TFLOPs: 26.19 | +7: iteration 110050/ 173500 | consumed samples: 28172800 | consumed tokens: 57697894400 | elapsed time per iteration (s): 0.15 | learning rate: 7.410E-05 | global batch size: 256 | lm loss: 3.682297E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.174 | TFLOPs: 26.16 | +7: iteration 110060/ 173500 | consumed samples: 28175360 | consumed tokens: 57703137280 | elapsed time per iteration (s): 0.16 | learning rate: 7.409E-05 | global batch size: 256 | lm loss: 3.680033E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.146 | TFLOPs: 25.60 | +7: iteration 110070/ 173500 | consumed samples: 28177920 | consumed tokens: 57708380160 | elapsed time per iteration (s): 0.15 | learning rate: 7.407E-05 | global batch size: 256 | lm loss: 3.702745E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.190 | TFLOPs: 26.16 | +7: iteration 110080/ 173500 | consumed samples: 28180480 | consumed tokens: 57713623040 | elapsed time per iteration (s): 0.15 | learning rate: 7.406E-05 | global batch size: 256 | lm loss: 3.696124E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.641 | TFLOPs: 26.17 | +7: iteration 110090/ 173500 | consumed samples: 28183040 | consumed tokens: 57718865920 | elapsed time per iteration (s): 0.16 | learning rate: 7.404E-05 | global batch size: 256 | lm loss: 3.687301E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.317 | TFLOPs: 24.67 | +7: iteration 110100/ 173500 | consumed samples: 28185600 | consumed tokens: 57724108800 | elapsed time per iteration (s): 0.15 | learning rate: 7.403E-05 | global batch size: 256 | lm loss: 3.685905E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.485 | TFLOPs: 26.20 | +7: iteration 110110/ 173500 | consumed samples: 28188160 | consumed tokens: 57729351680 | elapsed time per iteration (s): 0.16 | learning rate: 7.401E-05 | global batch size: 256 | lm loss: 3.686172E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.702 | TFLOPs: 25.51 | +7: iteration 110120/ 173500 | consumed samples: 28190720 | consumed tokens: 57734594560 | elapsed time per iteration (s): 0.15 | learning rate: 7.400E-05 | global batch size: 256 | lm loss: 3.684897E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.743 | TFLOPs: 26.19 | +7: iteration 110130/ 173500 | consumed samples: 28193280 | consumed tokens: 57739837440 | elapsed time per iteration (s): 0.15 | learning rate: 7.398E-05 | global batch size: 256 | lm loss: 3.686615E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.336 | TFLOPs: 26.20 | +7: iteration 110140/ 173500 | consumed samples: 28195840 | consumed tokens: 57745080320 | elapsed time per iteration (s): 0.16 | learning rate: 7.397E-05 | global batch size: 256 | lm loss: 3.689580E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.618 | TFLOPs: 25.79 | +7: iteration 110150/ 173500 | consumed samples: 28198400 | consumed tokens: 57750323200 | elapsed time per iteration (s): 0.15 | learning rate: 7.395E-05 | global batch size: 256 | lm loss: 3.683337E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.775 | TFLOPs: 26.17 | +7: iteration 110160/ 173500 | consumed samples: 28200960 | consumed tokens: 57755566080 | elapsed time per iteration (s): 0.15 | learning rate: 7.394E-05 | global batch size: 256 | lm loss: 3.685273E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.773 | TFLOPs: 26.17 | +7: iteration 110170/ 173500 | consumed samples: 28203520 | consumed tokens: 57760808960 | elapsed time per iteration (s): 0.15 | learning rate: 7.392E-05 | global batch size: 256 | lm loss: 3.689767E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.448 | TFLOPs: 26.17 | +7: iteration 110180/ 173500 | consumed samples: 28206080 | consumed tokens: 57766051840 | elapsed time per iteration (s): 0.15 | learning rate: 7.391E-05 | global batch size: 256 | lm loss: 3.675063E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.024 | TFLOPs: 26.19 | +7: iteration 110190/ 173500 | consumed samples: 28208640 | consumed tokens: 57771294720 | elapsed time per iteration (s): 0.15 | learning rate: 7.389E-05 | global batch size: 256 | lm loss: 3.693000E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.684 | TFLOPs: 26.18 | +7: iteration 110200/ 173500 | consumed samples: 28211200 | consumed tokens: 57776537600 | elapsed time per iteration (s): 0.16 | learning rate: 7.388E-05 | global batch size: 256 | lm loss: 3.690314E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.607 | TFLOPs: 25.51 | +7: iteration 110210/ 173500 | consumed samples: 28213760 | consumed tokens: 57781780480 | elapsed time per iteration (s): 0.15 | learning rate: 7.386E-05 | global batch size: 256 | lm loss: 3.675681E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.371 | TFLOPs: 26.13 | +7: iteration 110220/ 173500 | consumed samples: 28216320 | consumed tokens: 57787023360 | elapsed time per iteration (s): 0.15 | learning rate: 7.385E-05 | global batch size: 256 | lm loss: 3.684896E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.101 | TFLOPs: 26.13 | +7: iteration 110230/ 173500 | consumed samples: 28218880 | consumed tokens: 57792266240 | elapsed time per iteration (s): 0.15 | learning rate: 7.383E-05 | global batch size: 256 | lm loss: 3.704688E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.199 | TFLOPs: 26.13 | +7: iteration 110240/ 173500 | consumed samples: 28221440 | consumed tokens: 57797509120 | elapsed time per iteration (s): 0.15 | learning rate: 7.382E-05 | global batch size: 256 | lm loss: 3.683784E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.782 | TFLOPs: 26.17 | +7: iteration 110250/ 173500 | consumed samples: 28224000 | consumed tokens: 57802752000 | elapsed time per iteration (s): 0.15 | learning rate: 7.380E-05 | global batch size: 256 | lm loss: 3.678366E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.728 | TFLOPs: 26.14 | +7: iteration 110260/ 173500 | consumed samples: 28226560 | consumed tokens: 57807994880 | elapsed time per iteration (s): 0.16 | learning rate: 7.378E-05 | global batch size: 256 | lm loss: 3.688839E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.586 | TFLOPs: 25.52 | +7: iteration 110270/ 173500 | consumed samples: 28229120 | consumed tokens: 57813237760 | elapsed time per iteration (s): 0.15 | learning rate: 7.377E-05 | global batch size: 256 | lm loss: 3.687240E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.200 | TFLOPs: 26.13 | +7: iteration 110280/ 173500 | consumed samples: 28231680 | consumed tokens: 57818480640 | elapsed time per iteration (s): 0.16 | learning rate: 7.375E-05 | global batch size: 256 | lm loss: 3.673457E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.728 | TFLOPs: 25.34 | +7: iteration 110290/ 173500 | consumed samples: 28234240 | consumed tokens: 57823723520 | elapsed time per iteration (s): 0.15 | learning rate: 7.374E-05 | global batch size: 256 | lm loss: 3.678378E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.559 | TFLOPs: 26.18 | +7: iteration 110300/ 173500 | consumed samples: 28236800 | consumed tokens: 57828966400 | elapsed time per iteration (s): 0.15 | learning rate: 7.372E-05 | global batch size: 256 | lm loss: 3.688476E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.810 | TFLOPs: 26.17 | +7: iteration 110310/ 173500 | consumed samples: 28239360 | consumed tokens: 57834209280 | elapsed time per iteration (s): 0.15 | learning rate: 7.371E-05 | global batch size: 256 | lm loss: 3.681409E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.805 | TFLOPs: 26.17 | +7: iteration 110320/ 173500 | consumed samples: 28241920 | consumed tokens: 57839452160 | elapsed time per iteration (s): 0.15 | learning rate: 7.369E-05 | global batch size: 256 | lm loss: 3.679627E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.148 | TFLOPs: 26.18 | +7: iteration 110330/ 173500 | consumed samples: 28244480 | consumed tokens: 57844695040 | elapsed time per iteration (s): 0.16 | learning rate: 7.368E-05 | global batch size: 256 | lm loss: 3.703759E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.237 | TFLOPs: 25.49 | +7: iteration 110340/ 173500 | consumed samples: 28247040 | consumed tokens: 57849937920 | elapsed time per iteration (s): 0.15 | learning rate: 7.366E-05 | global batch size: 256 | lm loss: 3.680914E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.043 | TFLOPs: 26.19 | +7: iteration 110350/ 173500 | consumed samples: 28249600 | consumed tokens: 57855180800 | elapsed time per iteration (s): 0.16 | learning rate: 7.365E-05 | global batch size: 256 | lm loss: 3.674488E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.465 | TFLOPs: 25.65 | +7: iteration 110360/ 173500 | consumed samples: 28252160 | consumed tokens: 57860423680 | elapsed time per iteration (s): 0.16 | learning rate: 7.363E-05 | global batch size: 256 | lm loss: 3.686195E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.059 | TFLOPs: 25.36 | +7: iteration 110370/ 173500 | consumed samples: 28254720 | consumed tokens: 57865666560 | elapsed time per iteration (s): 0.15 | learning rate: 7.362E-05 | global batch size: 256 | lm loss: 3.695177E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.154 | TFLOPs: 26.15 | +7: iteration 110380/ 173500 | consumed samples: 28257280 | consumed tokens: 57870909440 | elapsed time per iteration (s): 0.15 | learning rate: 7.360E-05 | global batch size: 256 | lm loss: 3.695209E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.405 | TFLOPs: 26.13 | +7: iteration 110390/ 173500 | consumed samples: 28259840 | consumed tokens: 57876152320 | elapsed time per iteration (s): 0.16 | learning rate: 7.359E-05 | global batch size: 256 | lm loss: 3.688130E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.519 | TFLOPs: 25.54 | +7: iteration 110400/ 173500 | consumed samples: 28262400 | consumed tokens: 57881395200 | elapsed time per iteration (s): 0.15 | learning rate: 7.357E-05 | global batch size: 256 | lm loss: 3.679203E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.278 | TFLOPs: 26.18 | +7: iteration 110410/ 173500 | consumed samples: 28264960 | consumed tokens: 57886638080 | elapsed time per iteration (s): 0.15 | learning rate: 7.356E-05 | global batch size: 256 | lm loss: 3.694493E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.701 | TFLOPs: 26.19 | +7: iteration 110420/ 173500 | consumed samples: 28267520 | consumed tokens: 57891880960 | elapsed time per iteration (s): 0.15 | learning rate: 7.354E-05 | global batch size: 256 | lm loss: 3.679255E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.654 | TFLOPs: 26.18 | +7: iteration 110430/ 173500 | consumed samples: 28270080 | consumed tokens: 57897123840 | elapsed time per iteration (s): 0.15 | learning rate: 7.353E-05 | global batch size: 256 | lm loss: 3.676193E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.899 | TFLOPs: 26.19 | +7: iteration 110440/ 173500 | consumed samples: 28272640 | consumed tokens: 57902366720 | elapsed time per iteration (s): 0.15 | learning rate: 7.351E-05 | global batch size: 256 | lm loss: 3.667452E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.250 | TFLOPs: 26.18 | +7: iteration 110450/ 173500 | consumed samples: 28275200 | consumed tokens: 57907609600 | elapsed time per iteration (s): 0.15 | learning rate: 7.350E-05 | global batch size: 256 | lm loss: 3.689811E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.139 | TFLOPs: 26.18 | +7: iteration 110460/ 173500 | consumed samples: 28277760 | consumed tokens: 57912852480 | elapsed time per iteration (s): 0.15 | learning rate: 7.348E-05 | global batch size: 256 | lm loss: 3.686637E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.069 | TFLOPs: 26.16 | +7: iteration 110470/ 173500 | consumed samples: 28280320 | consumed tokens: 57918095360 | elapsed time per iteration (s): 0.15 | learning rate: 7.347E-05 | global batch size: 256 | lm loss: 3.679949E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.955 | TFLOPs: 26.17 | +7: iteration 110480/ 173500 | consumed samples: 28282880 | consumed tokens: 57923338240 | elapsed time per iteration (s): 0.15 | learning rate: 7.345E-05 | global batch size: 256 | lm loss: 3.677456E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.564 | TFLOPs: 26.15 | +7: iteration 110490/ 173500 | consumed samples: 28285440 | consumed tokens: 57928581120 | elapsed time per iteration (s): 0.15 | learning rate: 7.344E-05 | global batch size: 256 | lm loss: 3.702480E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.621 | TFLOPs: 26.14 | +7: iteration 110500/ 173500 | consumed samples: 28288000 | consumed tokens: 57933824000 | elapsed time per iteration (s): 0.15 | learning rate: 7.342E-05 | global batch size: 256 | lm loss: 3.681382E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.344 | TFLOPs: 26.15 | +7: iteration 110510/ 173500 | consumed samples: 28290560 | consumed tokens: 57939066880 | elapsed time per iteration (s): 0.15 | learning rate: 7.341E-05 | global batch size: 256 | lm loss: 3.686274E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.938 | TFLOPs: 26.14 | +7: iteration 110520/ 173500 | consumed samples: 28293120 | consumed tokens: 57944309760 | elapsed time per iteration (s): 0.15 | learning rate: 7.339E-05 | global batch size: 256 | lm loss: 3.681732E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.971 | TFLOPs: 26.22 | +7: iteration 110530/ 173500 | consumed samples: 28295680 | consumed tokens: 57949552640 | elapsed time per iteration (s): 0.15 | learning rate: 7.338E-05 | global batch size: 256 | lm loss: 3.688165E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.124 | TFLOPs: 26.24 | +7: iteration 110540/ 173500 | consumed samples: 28298240 | consumed tokens: 57954795520 | elapsed time per iteration (s): 0.15 | learning rate: 7.336E-05 | global batch size: 256 | lm loss: 3.676700E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.580 | TFLOPs: 26.23 | +7: iteration 110550/ 173500 | consumed samples: 28300800 | consumed tokens: 57960038400 | elapsed time per iteration (s): 0.15 | learning rate: 7.335E-05 | global batch size: 256 | lm loss: 3.674279E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.356 | TFLOPs: 26.23 | +7: iteration 110560/ 173500 | consumed samples: 28303360 | consumed tokens: 57965281280 | elapsed time per iteration (s): 0.15 | learning rate: 7.333E-05 | global batch size: 256 | lm loss: 3.682434E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.425 | TFLOPs: 26.23 | +7: iteration 110570/ 173500 | consumed samples: 28305920 | consumed tokens: 57970524160 | elapsed time per iteration (s): 0.15 | learning rate: 7.332E-05 | global batch size: 256 | lm loss: 3.674786E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.918 | TFLOPs: 26.24 | +7: iteration 110580/ 173500 | consumed samples: 28308480 | consumed tokens: 57975767040 | elapsed time per iteration (s): 0.15 | learning rate: 7.330E-05 | global batch size: 256 | lm loss: 3.679994E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.008 | TFLOPs: 26.25 | +7: iteration 110590/ 173500 | consumed samples: 28311040 | consumed tokens: 57981009920 | elapsed time per iteration (s): 0.15 | learning rate: 7.329E-05 | global batch size: 256 | lm loss: 3.673312E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.398 | TFLOPs: 26.23 | +7: iteration 110600/ 173500 | consumed samples: 28313600 | consumed tokens: 57986252800 | elapsed time per iteration (s): 0.15 | learning rate: 7.327E-05 | global batch size: 256 | lm loss: 3.690241E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.077 | TFLOPs: 26.24 | +7: iteration 110610/ 173500 | consumed samples: 28316160 | consumed tokens: 57991495680 | elapsed time per iteration (s): 0.15 | learning rate: 7.326E-05 | global batch size: 256 | lm loss: 3.680633E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.143 | TFLOPs: 26.19 | +7: iteration 110620/ 173500 | consumed samples: 28318720 | consumed tokens: 57996738560 | elapsed time per iteration (s): 0.15 | learning rate: 7.324E-05 | global batch size: 256 | lm loss: 3.697853E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.725 | TFLOPs: 26.23 | +7: iteration 110630/ 173500 | consumed samples: 28321280 | consumed tokens: 58001981440 | elapsed time per iteration (s): 0.15 | learning rate: 7.323E-05 | global batch size: 256 | lm loss: 3.672152E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.876 | TFLOPs: 26.13 | +7: iteration 110640/ 173500 | consumed samples: 28323840 | consumed tokens: 58007224320 | elapsed time per iteration (s): 0.15 | learning rate: 7.321E-05 | global batch size: 256 | lm loss: 3.683226E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.346 | TFLOPs: 26.16 | +7: iteration 110650/ 173500 | consumed samples: 28326400 | consumed tokens: 58012467200 | elapsed time per iteration (s): 0.15 | learning rate: 7.320E-05 | global batch size: 256 | lm loss: 3.677994E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.206 | TFLOPs: 26.26 | +7: iteration 110660/ 173500 | consumed samples: 28328960 | consumed tokens: 58017710080 | elapsed time per iteration (s): 0.15 | learning rate: 7.318E-05 | global batch size: 256 | lm loss: 3.689998E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.754 | TFLOPs: 26.25 | +7: iteration 110670/ 173500 | consumed samples: 28331520 | consumed tokens: 58022952960 | elapsed time per iteration (s): 0.15 | learning rate: 7.317E-05 | global batch size: 256 | lm loss: 3.675312E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.629 | TFLOPs: 26.18 | +7: iteration 110680/ 173500 | consumed samples: 28334080 | consumed tokens: 58028195840 | elapsed time per iteration (s): 0.15 | learning rate: 7.315E-05 | global batch size: 256 | lm loss: 3.680127E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.131 | TFLOPs: 26.14 | +7: iteration 110690/ 173500 | consumed samples: 28336640 | consumed tokens: 58033438720 | elapsed time per iteration (s): 0.16 | learning rate: 7.314E-05 | global batch size: 256 | lm loss: 3.692674E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.950 | TFLOPs: 25.73 | +7: iteration 110700/ 173500 | consumed samples: 28339200 | consumed tokens: 58038681600 | elapsed time per iteration (s): 0.15 | learning rate: 7.312E-05 | global batch size: 256 | lm loss: 3.685459E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.974 | TFLOPs: 26.19 | +7: iteration 110710/ 173500 | consumed samples: 28341760 | consumed tokens: 58043924480 | elapsed time per iteration (s): 0.15 | learning rate: 7.311E-05 | global batch size: 256 | lm loss: 3.693333E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.286 | TFLOPs: 26.19 | +7: iteration 110720/ 173500 | consumed samples: 28344320 | consumed tokens: 58049167360 | elapsed time per iteration (s): 0.15 | learning rate: 7.309E-05 | global batch size: 256 | lm loss: 3.683753E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.797 | TFLOPs: 26.00 | +7: iteration 110730/ 173500 | consumed samples: 28346880 | consumed tokens: 58054410240 | elapsed time per iteration (s): 0.16 | learning rate: 7.308E-05 | global batch size: 256 | lm loss: 3.681175E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.782 | TFLOPs: 24.37 | +7: iteration 110740/ 173500 | consumed samples: 28349440 | consumed tokens: 58059653120 | elapsed time per iteration (s): 0.15 | learning rate: 7.306E-05 | global batch size: 256 | lm loss: 3.694711E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.611 | TFLOPs: 26.20 | +7: iteration 110750/ 173500 | consumed samples: 28352000 | consumed tokens: 58064896000 | elapsed time per iteration (s): 0.15 | learning rate: 7.305E-05 | global batch size: 256 | lm loss: 3.685172E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.759 | TFLOPs: 26.20 | +7: iteration 110760/ 173500 | consumed samples: 28354560 | consumed tokens: 58070138880 | elapsed time per iteration (s): 0.15 | learning rate: 7.303E-05 | global batch size: 256 | lm loss: 3.684654E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.568 | TFLOPs: 26.20 | +7: iteration 110770/ 173500 | consumed samples: 28357120 | consumed tokens: 58075381760 | elapsed time per iteration (s): 0.15 | learning rate: 7.302E-05 | global batch size: 256 | lm loss: 3.699615E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.732 | TFLOPs: 26.20 | +7: iteration 110780/ 173500 | consumed samples: 28359680 | consumed tokens: 58080624640 | elapsed time per iteration (s): 0.15 | learning rate: 7.300E-05 | global batch size: 256 | lm loss: 3.680674E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.549 | TFLOPs: 26.20 | +7: iteration 110790/ 173500 | consumed samples: 28362240 | consumed tokens: 58085867520 | elapsed time per iteration (s): 0.15 | learning rate: 7.299E-05 | global batch size: 256 | lm loss: 3.694647E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.033 | TFLOPs: 26.16 | +7: iteration 110800/ 173500 | consumed samples: 28364800 | consumed tokens: 58091110400 | elapsed time per iteration (s): 0.15 | learning rate: 7.297E-05 | global batch size: 256 | lm loss: 3.694661E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.778 | TFLOPs: 26.17 | +7: iteration 110810/ 173500 | consumed samples: 28367360 | consumed tokens: 58096353280 | elapsed time per iteration (s): 0.15 | learning rate: 7.296E-05 | global batch size: 256 | lm loss: 3.685501E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.008 | TFLOPs: 26.17 | +7: iteration 110820/ 173500 | consumed samples: 28369920 | consumed tokens: 58101596160 | elapsed time per iteration (s): 0.15 | learning rate: 7.294E-05 | global batch size: 256 | lm loss: 3.679029E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.281 | TFLOPs: 26.21 | +7: iteration 110830/ 173500 | consumed samples: 28372480 | consumed tokens: 58106839040 | elapsed time per iteration (s): 0.15 | learning rate: 7.293E-05 | global batch size: 256 | lm loss: 3.692759E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.291 | TFLOPs: 26.23 | +7: iteration 110840/ 173500 | consumed samples: 28375040 | consumed tokens: 58112081920 | elapsed time per iteration (s): 0.16 | learning rate: 7.291E-05 | global batch size: 256 | lm loss: 3.695407E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.982 | TFLOPs: 25.61 | +7: iteration 110850/ 173500 | consumed samples: 28377600 | consumed tokens: 58117324800 | elapsed time per iteration (s): 0.15 | learning rate: 7.290E-05 | global batch size: 256 | lm loss: 3.672054E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.915 | TFLOPs: 26.22 | +7: iteration 110860/ 173500 | consumed samples: 28380160 | consumed tokens: 58122567680 | elapsed time per iteration (s): 0.15 | learning rate: 7.288E-05 | global batch size: 256 | lm loss: 3.690078E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.245 | TFLOPs: 26.22 | +7: iteration 110870/ 173500 | consumed samples: 28382720 | consumed tokens: 58127810560 | elapsed time per iteration (s): 0.15 | learning rate: 7.287E-05 | global batch size: 256 | lm loss: 3.686752E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.213 | TFLOPs: 26.22 | +7: iteration 110880/ 173500 | consumed samples: 28385280 | consumed tokens: 58133053440 | elapsed time per iteration (s): 0.15 | learning rate: 7.285E-05 | global batch size: 256 | lm loss: 3.688079E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.925 | TFLOPs: 26.20 | +7: iteration 110890/ 173500 | consumed samples: 28387840 | consumed tokens: 58138296320 | elapsed time per iteration (s): 0.15 | learning rate: 7.284E-05 | global batch size: 256 | lm loss: 3.690094E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.354 | TFLOPs: 26.21 | +7: iteration 110900/ 173500 | consumed samples: 28390400 | consumed tokens: 58143539200 | elapsed time per iteration (s): 0.15 | learning rate: 7.282E-05 | global batch size: 256 | lm loss: 3.680554E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.627 | TFLOPs: 26.22 | +7: iteration 110910/ 173500 | consumed samples: 28392960 | consumed tokens: 58148782080 | elapsed time per iteration (s): 0.15 | learning rate: 7.281E-05 | global batch size: 256 | lm loss: 3.686576E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.803 | TFLOPs: 26.20 | +7: iteration 110920/ 173500 | consumed samples: 28395520 | consumed tokens: 58154024960 | elapsed time per iteration (s): 0.15 | learning rate: 7.279E-05 | global batch size: 256 | lm loss: 3.690842E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.187 | TFLOPs: 26.18 | +7: iteration 110930/ 173500 | consumed samples: 28398080 | consumed tokens: 58159267840 | elapsed time per iteration (s): 0.15 | learning rate: 7.278E-05 | global batch size: 256 | lm loss: 3.684683E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.190 | TFLOPs: 26.15 | +7: iteration 110940/ 173500 | consumed samples: 28400640 | consumed tokens: 58164510720 | elapsed time per iteration (s): 0.15 | learning rate: 7.276E-05 | global batch size: 256 | lm loss: 3.672591E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.156 | TFLOPs: 26.15 | +7: iteration 110950/ 173500 | consumed samples: 28403200 | consumed tokens: 58169753600 | elapsed time per iteration (s): 0.15 | learning rate: 7.275E-05 | global batch size: 256 | lm loss: 3.692313E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.668 | TFLOPs: 26.15 | +7: iteration 110960/ 173500 | consumed samples: 28405760 | consumed tokens: 58174996480 | elapsed time per iteration (s): 0.15 | learning rate: 7.273E-05 | global batch size: 256 | lm loss: 3.684803E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.894 | TFLOPs: 26.09 | +7: iteration 110970/ 173500 | consumed samples: 28408320 | consumed tokens: 58180239360 | elapsed time per iteration (s): 0.15 | learning rate: 7.272E-05 | global batch size: 256 | lm loss: 3.688503E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.486 | TFLOPs: 26.20 | +7: iteration 110980/ 173500 | consumed samples: 28410880 | consumed tokens: 58185482240 | elapsed time per iteration (s): 0.15 | learning rate: 7.270E-05 | global batch size: 256 | lm loss: 3.695430E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.926 | TFLOPs: 26.19 | +7: iteration 110990/ 173500 | consumed samples: 28413440 | consumed tokens: 58190725120 | elapsed time per iteration (s): 0.15 | learning rate: 7.269E-05 | global batch size: 256 | lm loss: 3.682878E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.474 | TFLOPs: 26.20 | +7: iteration 111000/ 173500 | consumed samples: 28416000 | consumed tokens: 58195968000 | elapsed time per iteration (s): 0.15 | learning rate: 7.267E-05 | global batch size: 256 | lm loss: 3.684438E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.608 | TFLOPs: 26.20 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 111000 | lm loss value: 3.820224E+00 | lm loss PPL: 4.561441E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 111000 to checkpoints_44m91b100m +0: [2023-03-17 05:04:18,241] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step111000 is begin to save! +0: [2023-03-17 05:04:18,244] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:04:18,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:04:18,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:04:18,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:04:18,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:04:18,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:04:18,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:04:18,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:04:18,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:04:18,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:04:18,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:04:18,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:04:18,351] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:04:18,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:04:18,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:04:18,367] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:04:18,367] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:04:18,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:04:18,375] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:04:18,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:04:18,377] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step111000/mp_rank_00_model_states.pt +0: [2023-03-17 05:04:18,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:04:18,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:04:18,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:04:18,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +1: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +4: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +7: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +2: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +3: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:04:18,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:04:18,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +5: [2023-03-17 05:04:18,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:04:18,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step111000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:04:18,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step111000 is ready now! +0: successfully saved checkpoint at iteration 111000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 196.15 +7: iteration 111010/ 173500 | consumed samples: 28418560 | consumed tokens: 58201210880 | elapsed time per iteration (s): 0.18 | learning rate: 7.266E-05 | global batch size: 256 | lm loss: 3.685085E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.636 | TFLOPs: 22.64 | +7: iteration 111020/ 173500 | consumed samples: 28421120 | consumed tokens: 58206453760 | elapsed time per iteration (s): 0.15 | learning rate: 7.264E-05 | global batch size: 256 | lm loss: 3.666844E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.393 | TFLOPs: 26.21 | +7: iteration 111030/ 173500 | consumed samples: 28423680 | consumed tokens: 58211696640 | elapsed time per iteration (s): 0.15 | learning rate: 7.263E-05 | global batch size: 256 | lm loss: 3.698933E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.469 | TFLOPs: 26.10 | +7: iteration 111040/ 173500 | consumed samples: 28426240 | consumed tokens: 58216939520 | elapsed time per iteration (s): 0.15 | learning rate: 7.261E-05 | global batch size: 256 | lm loss: 3.686361E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.717 | TFLOPs: 26.23 | +7: iteration 111050/ 173500 | consumed samples: 28428800 | consumed tokens: 58222182400 | elapsed time per iteration (s): 0.15 | learning rate: 7.260E-05 | global batch size: 256 | lm loss: 3.677176E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.815 | TFLOPs: 26.20 | +7: iteration 111060/ 173500 | consumed samples: 28431360 | consumed tokens: 58227425280 | elapsed time per iteration (s): 0.15 | learning rate: 7.258E-05 | global batch size: 256 | lm loss: 3.692442E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.548 | TFLOPs: 26.18 | +7: iteration 111070/ 173500 | consumed samples: 28433920 | consumed tokens: 58232668160 | elapsed time per iteration (s): 0.16 | learning rate: 7.257E-05 | global batch size: 256 | lm loss: 3.681958E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.524 | TFLOPs: 25.77 | +7: iteration 111080/ 173500 | consumed samples: 28436480 | consumed tokens: 58237911040 | elapsed time per iteration (s): 0.15 | learning rate: 7.255E-05 | global batch size: 256 | lm loss: 3.694995E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.842 | TFLOPs: 26.14 | +7: iteration 111090/ 173500 | consumed samples: 28439040 | consumed tokens: 58243153920 | elapsed time per iteration (s): 0.16 | learning rate: 7.254E-05 | global batch size: 256 | lm loss: 3.676078E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.099 | TFLOPs: 25.80 | +7: iteration 111100/ 173500 | consumed samples: 28441600 | consumed tokens: 58248396800 | elapsed time per iteration (s): 0.15 | learning rate: 7.252E-05 | global batch size: 256 | lm loss: 3.683813E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.635 | TFLOPs: 26.17 | +7: iteration 111110/ 173500 | consumed samples: 28444160 | consumed tokens: 58253639680 | elapsed time per iteration (s): 0.16 | learning rate: 7.251E-05 | global batch size: 256 | lm loss: 3.683191E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.622 | TFLOPs: 25.87 | +7: iteration 111120/ 173500 | consumed samples: 28446720 | consumed tokens: 58258882560 | elapsed time per iteration (s): 0.16 | learning rate: 7.249E-05 | global batch size: 256 | lm loss: 3.683909E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.477 | TFLOPs: 25.10 | +7: iteration 111130/ 173500 | consumed samples: 28449280 | consumed tokens: 58264125440 | elapsed time per iteration (s): 0.16 | learning rate: 7.248E-05 | global batch size: 256 | lm loss: 3.678916E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.290 | TFLOPs: 25.74 | +7: iteration 111140/ 173500 | consumed samples: 28451840 | consumed tokens: 58269368320 | elapsed time per iteration (s): 0.16 | learning rate: 7.246E-05 | global batch size: 256 | lm loss: 3.682303E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.906 | TFLOPs: 25.39 | +7: iteration 111150/ 173500 | consumed samples: 28454400 | consumed tokens: 58274611200 | elapsed time per iteration (s): 0.15 | learning rate: 7.245E-05 | global batch size: 256 | lm loss: 3.677649E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.659 | TFLOPs: 25.93 | +7: iteration 111160/ 173500 | consumed samples: 28456960 | consumed tokens: 58279854080 | elapsed time per iteration (s): 0.15 | learning rate: 7.243E-05 | global batch size: 256 | lm loss: 3.705386E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.106 | TFLOPs: 26.10 | +7: iteration 111170/ 173500 | consumed samples: 28459520 | consumed tokens: 58285096960 | elapsed time per iteration (s): 0.15 | learning rate: 7.242E-05 | global batch size: 256 | lm loss: 3.685279E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.828 | TFLOPs: 26.16 | +7: iteration 111180/ 173500 | consumed samples: 28462080 | consumed tokens: 58290339840 | elapsed time per iteration (s): 0.15 | learning rate: 7.240E-05 | global batch size: 256 | lm loss: 3.698277E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.141 | TFLOPs: 26.14 | +7: iteration 111190/ 173500 | consumed samples: 28464640 | consumed tokens: 58295582720 | elapsed time per iteration (s): 0.15 | learning rate: 7.239E-05 | global batch size: 256 | lm loss: 3.677866E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.929 | TFLOPs: 26.14 | +7: iteration 111200/ 173500 | consumed samples: 28467200 | consumed tokens: 58300825600 | elapsed time per iteration (s): 0.16 | learning rate: 7.237E-05 | global batch size: 256 | lm loss: 3.685187E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.126 | TFLOPs: 25.80 | +7: iteration 111210/ 173500 | consumed samples: 28469760 | consumed tokens: 58306068480 | elapsed time per iteration (s): 0.15 | learning rate: 7.236E-05 | global batch size: 256 | lm loss: 3.671060E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.056 | TFLOPs: 26.11 | +7: iteration 111220/ 173500 | consumed samples: 28472320 | consumed tokens: 58311311360 | elapsed time per iteration (s): 0.15 | learning rate: 7.234E-05 | global batch size: 256 | lm loss: 3.680903E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.102 | TFLOPs: 26.14 | +7: iteration 111230/ 173500 | consumed samples: 28474880 | consumed tokens: 58316554240 | elapsed time per iteration (s): 0.15 | learning rate: 7.233E-05 | global batch size: 256 | lm loss: 3.687421E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.907 | TFLOPs: 26.19 | +7: iteration 111240/ 173500 | consumed samples: 28477440 | consumed tokens: 58321797120 | elapsed time per iteration (s): 0.15 | learning rate: 7.231E-05 | global batch size: 256 | lm loss: 3.680454E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.347 | TFLOPs: 26.18 | +7: iteration 111250/ 173500 | consumed samples: 28480000 | consumed tokens: 58327040000 | elapsed time per iteration (s): 0.15 | learning rate: 7.230E-05 | global batch size: 256 | lm loss: 3.676377E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.326 | TFLOPs: 26.18 | +7: iteration 111260/ 173500 | consumed samples: 28482560 | consumed tokens: 58332282880 | elapsed time per iteration (s): 0.15 | learning rate: 7.228E-05 | global batch size: 256 | lm loss: 3.696401E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.347 | TFLOPs: 26.18 | +7: iteration 111270/ 173500 | consumed samples: 28485120 | consumed tokens: 58337525760 | elapsed time per iteration (s): 0.15 | learning rate: 7.227E-05 | global batch size: 256 | lm loss: 3.691666E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.981 | TFLOPs: 26.17 | +7: iteration 111280/ 173500 | consumed samples: 28487680 | consumed tokens: 58342768640 | elapsed time per iteration (s): 0.15 | learning rate: 7.225E-05 | global batch size: 256 | lm loss: 3.684412E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.824 | TFLOPs: 26.17 | +7: iteration 111290/ 173500 | consumed samples: 28490240 | consumed tokens: 58348011520 | elapsed time per iteration (s): 0.15 | learning rate: 7.224E-05 | global batch size: 256 | lm loss: 3.682195E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.735 | TFLOPs: 26.15 | +7: iteration 111300/ 173500 | consumed samples: 28492800 | consumed tokens: 58353254400 | elapsed time per iteration (s): 0.15 | learning rate: 7.222E-05 | global batch size: 256 | lm loss: 3.673634E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.337 | TFLOPs: 26.18 | +7: iteration 111310/ 173500 | consumed samples: 28495360 | consumed tokens: 58358497280 | elapsed time per iteration (s): 0.15 | learning rate: 7.221E-05 | global batch size: 256 | lm loss: 3.698311E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.298 | TFLOPs: 26.18 | +7: iteration 111320/ 173500 | consumed samples: 28497920 | consumed tokens: 58363740160 | elapsed time per iteration (s): 0.15 | learning rate: 7.219E-05 | global batch size: 256 | lm loss: 3.682840E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.684 | TFLOPs: 26.17 | +7: iteration 111330/ 173500 | consumed samples: 28500480 | consumed tokens: 58368983040 | elapsed time per iteration (s): 0.16 | learning rate: 7.218E-05 | global batch size: 256 | lm loss: 3.679152E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.672 | TFLOPs: 25.81 | +7: iteration 111340/ 173500 | consumed samples: 28503040 | consumed tokens: 58374225920 | elapsed time per iteration (s): 0.15 | learning rate: 7.216E-05 | global batch size: 256 | lm loss: 3.692537E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.778 | TFLOPs: 26.17 | +7: iteration 111350/ 173500 | consumed samples: 28505600 | consumed tokens: 58379468800 | elapsed time per iteration (s): 0.15 | learning rate: 7.215E-05 | global batch size: 256 | lm loss: 3.674368E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.134 | TFLOPs: 26.16 | +7: iteration 111360/ 173500 | consumed samples: 28508160 | consumed tokens: 58384711680 | elapsed time per iteration (s): 0.16 | learning rate: 7.213E-05 | global batch size: 256 | lm loss: 3.688916E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.520 | TFLOPs: 25.73 | +7: iteration 111370/ 173500 | consumed samples: 28510720 | consumed tokens: 58389954560 | elapsed time per iteration (s): 0.15 | learning rate: 7.212E-05 | global batch size: 256 | lm loss: 3.696835E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.298 | TFLOPs: 26.23 | +7: iteration 111380/ 173500 | consumed samples: 28513280 | consumed tokens: 58395197440 | elapsed time per iteration (s): 0.15 | learning rate: 7.210E-05 | global batch size: 256 | lm loss: 3.662437E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.196 | TFLOPs: 26.24 | +7: iteration 111390/ 173500 | consumed samples: 28515840 | consumed tokens: 58400440320 | elapsed time per iteration (s): 0.15 | learning rate: 7.209E-05 | global batch size: 256 | lm loss: 3.682893E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.172 | TFLOPs: 26.22 | +7: iteration 111400/ 173500 | consumed samples: 28518400 | consumed tokens: 58405683200 | elapsed time per iteration (s): 0.15 | learning rate: 7.207E-05 | global batch size: 256 | lm loss: 3.681319E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.052 | TFLOPs: 26.22 | +7: iteration 111410/ 173500 | consumed samples: 28520960 | consumed tokens: 58410926080 | elapsed time per iteration (s): 0.15 | learning rate: 7.206E-05 | global batch size: 256 | lm loss: 3.680560E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.530 | TFLOPs: 26.23 | +7: iteration 111420/ 173500 | consumed samples: 28523520 | consumed tokens: 58416168960 | elapsed time per iteration (s): 0.15 | learning rate: 7.205E-05 | global batch size: 256 | lm loss: 3.685476E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.727 | TFLOPs: 26.22 | +7: iteration 111430/ 173500 | consumed samples: 28526080 | consumed tokens: 58421411840 | elapsed time per iteration (s): 0.15 | learning rate: 7.203E-05 | global batch size: 256 | lm loss: 3.686041E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.813 | TFLOPs: 26.23 | +7: iteration 111440/ 173500 | consumed samples: 28528640 | consumed tokens: 58426654720 | elapsed time per iteration (s): 0.15 | learning rate: 7.202E-05 | global batch size: 256 | lm loss: 3.685922E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.636 | TFLOPs: 26.23 | +7: iteration 111450/ 173500 | consumed samples: 28531200 | consumed tokens: 58431897600 | elapsed time per iteration (s): 0.15 | learning rate: 7.200E-05 | global batch size: 256 | lm loss: 3.677629E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.636 | TFLOPs: 26.25 | +7: iteration 111460/ 173500 | consumed samples: 28533760 | consumed tokens: 58437140480 | elapsed time per iteration (s): 0.15 | learning rate: 7.199E-05 | global batch size: 256 | lm loss: 3.682259E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.583 | TFLOPs: 26.25 | +7: iteration 111470/ 173500 | consumed samples: 28536320 | consumed tokens: 58442383360 | elapsed time per iteration (s): 0.15 | learning rate: 7.197E-05 | global batch size: 256 | lm loss: 3.691417E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.004 | TFLOPs: 26.11 | +7: iteration 111480/ 173500 | consumed samples: 28538880 | consumed tokens: 58447626240 | elapsed time per iteration (s): 0.15 | learning rate: 7.196E-05 | global batch size: 256 | lm loss: 3.682347E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.460 | TFLOPs: 26.20 | +7: iteration 111490/ 173500 | consumed samples: 28541440 | consumed tokens: 58452869120 | elapsed time per iteration (s): 0.16 | learning rate: 7.194E-05 | global batch size: 256 | lm loss: 3.679418E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.426 | TFLOPs: 25.84 | +7: iteration 111500/ 173500 | consumed samples: 28544000 | consumed tokens: 58458112000 | elapsed time per iteration (s): 0.15 | learning rate: 7.193E-05 | global batch size: 256 | lm loss: 3.674571E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.866 | TFLOPs: 26.17 | +7: iteration 111510/ 173500 | consumed samples: 28546560 | consumed tokens: 58463354880 | elapsed time per iteration (s): 0.15 | learning rate: 7.191E-05 | global batch size: 256 | lm loss: 3.681689E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.496 | TFLOPs: 26.17 | +7: iteration 111520/ 173500 | consumed samples: 28549120 | consumed tokens: 58468597760 | elapsed time per iteration (s): 0.15 | learning rate: 7.190E-05 | global batch size: 256 | lm loss: 3.674768E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.893 | TFLOPs: 26.00 | +7: iteration 111530/ 173500 | consumed samples: 28551680 | consumed tokens: 58473840640 | elapsed time per iteration (s): 0.15 | learning rate: 7.188E-05 | global batch size: 256 | lm loss: 3.679744E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.407 | TFLOPs: 26.21 | +7: iteration 111540/ 173500 | consumed samples: 28554240 | consumed tokens: 58479083520 | elapsed time per iteration (s): 0.15 | learning rate: 7.187E-05 | global batch size: 256 | lm loss: 3.687205E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.711 | TFLOPs: 26.20 | +7: iteration 111550/ 173500 | consumed samples: 28556800 | consumed tokens: 58484326400 | elapsed time per iteration (s): 0.15 | learning rate: 7.185E-05 | global batch size: 256 | lm loss: 3.687016E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.023 | TFLOPs: 26.02 | +7: iteration 111560/ 173500 | consumed samples: 28559360 | consumed tokens: 58489569280 | elapsed time per iteration (s): 0.15 | learning rate: 7.184E-05 | global batch size: 256 | lm loss: 3.690605E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.106 | TFLOPs: 26.21 | +7: iteration 111570/ 173500 | consumed samples: 28561920 | consumed tokens: 58494812160 | elapsed time per iteration (s): 0.16 | learning rate: 7.182E-05 | global batch size: 256 | lm loss: 3.684860E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.033 | TFLOPs: 25.56 | +7: iteration 111580/ 173500 | consumed samples: 28564480 | consumed tokens: 58500055040 | elapsed time per iteration (s): 0.15 | learning rate: 7.181E-05 | global batch size: 256 | lm loss: 3.676564E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.526 | TFLOPs: 26.23 | +7: iteration 111590/ 173500 | consumed samples: 28567040 | consumed tokens: 58505297920 | elapsed time per iteration (s): 0.16 | learning rate: 7.179E-05 | global batch size: 256 | lm loss: 3.679984E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.983 | TFLOPs: 25.55 | +7: iteration 111600/ 173500 | consumed samples: 28569600 | consumed tokens: 58510540800 | elapsed time per iteration (s): 0.15 | learning rate: 7.178E-05 | global batch size: 256 | lm loss: 3.680647E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.399 | TFLOPs: 26.18 | +7: iteration 111610/ 173500 | consumed samples: 28572160 | consumed tokens: 58515783680 | elapsed time per iteration (s): 0.15 | learning rate: 7.176E-05 | global batch size: 256 | lm loss: 3.669588E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.885 | TFLOPs: 26.14 | +7: iteration 111620/ 173500 | consumed samples: 28574720 | consumed tokens: 58521026560 | elapsed time per iteration (s): 0.15 | learning rate: 7.175E-05 | global batch size: 256 | lm loss: 3.680675E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.781 | TFLOPs: 26.14 | +7: iteration 111630/ 173500 | consumed samples: 28577280 | consumed tokens: 58526269440 | elapsed time per iteration (s): 0.15 | learning rate: 7.173E-05 | global batch size: 256 | lm loss: 3.668315E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.404 | TFLOPs: 26.18 | +7: iteration 111640/ 173500 | consumed samples: 28579840 | consumed tokens: 58531512320 | elapsed time per iteration (s): 0.15 | learning rate: 7.172E-05 | global batch size: 256 | lm loss: 3.683440E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.661 | TFLOPs: 26.20 | +7: iteration 111650/ 173500 | consumed samples: 28582400 | consumed tokens: 58536755200 | elapsed time per iteration (s): 0.15 | learning rate: 7.170E-05 | global batch size: 256 | lm loss: 3.687870E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.892 | TFLOPs: 26.17 | +7: iteration 111660/ 173500 | consumed samples: 28584960 | consumed tokens: 58541998080 | elapsed time per iteration (s): 0.15 | learning rate: 7.169E-05 | global batch size: 256 | lm loss: 3.687035E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.238 | TFLOPs: 26.19 | +7: iteration 111670/ 173500 | consumed samples: 28587520 | consumed tokens: 58547240960 | elapsed time per iteration (s): 0.15 | learning rate: 7.167E-05 | global batch size: 256 | lm loss: 3.679990E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.153 | TFLOPs: 26.22 | +7: iteration 111680/ 173500 | consumed samples: 28590080 | consumed tokens: 58552483840 | elapsed time per iteration (s): 0.15 | learning rate: 7.166E-05 | global batch size: 256 | lm loss: 3.682948E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.978 | TFLOPs: 26.22 | +7: iteration 111690/ 173500 | consumed samples: 28592640 | consumed tokens: 58557726720 | elapsed time per iteration (s): 0.16 | learning rate: 7.164E-05 | global batch size: 256 | lm loss: 3.675718E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.658 | TFLOPs: 25.07 | +7: iteration 111700/ 173500 | consumed samples: 28595200 | consumed tokens: 58562969600 | elapsed time per iteration (s): 0.16 | learning rate: 7.163E-05 | global batch size: 256 | lm loss: 3.673985E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.882 | TFLOPs: 25.78 | +7: iteration 111710/ 173500 | consumed samples: 28597760 | consumed tokens: 58568212480 | elapsed time per iteration (s): 0.16 | learning rate: 7.161E-05 | global batch size: 256 | lm loss: 3.690906E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.060 | TFLOPs: 25.83 | +7: iteration 111720/ 173500 | consumed samples: 28600320 | consumed tokens: 58573455360 | elapsed time per iteration (s): 0.16 | learning rate: 7.160E-05 | global batch size: 256 | lm loss: 3.678307E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.659 | TFLOPs: 25.87 | +7: iteration 111730/ 173500 | consumed samples: 28602880 | consumed tokens: 58578698240 | elapsed time per iteration (s): 0.15 | learning rate: 7.158E-05 | global batch size: 256 | lm loss: 3.691317E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.045 | TFLOPs: 26.22 | +7: iteration 111740/ 173500 | consumed samples: 28605440 | consumed tokens: 58583941120 | elapsed time per iteration (s): 0.15 | learning rate: 7.157E-05 | global batch size: 256 | lm loss: 3.669541E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.222 | TFLOPs: 26.16 | +7: iteration 111750/ 173500 | consumed samples: 28608000 | consumed tokens: 58589184000 | elapsed time per iteration (s): 0.15 | learning rate: 7.155E-05 | global batch size: 256 | lm loss: 3.686140E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.040 | TFLOPs: 26.14 | +7: iteration 111760/ 173500 | consumed samples: 28610560 | consumed tokens: 58594426880 | elapsed time per iteration (s): 0.15 | learning rate: 7.154E-05 | global batch size: 256 | lm loss: 3.675922E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.548 | TFLOPs: 26.20 | +7: iteration 111770/ 173500 | consumed samples: 28613120 | consumed tokens: 58599669760 | elapsed time per iteration (s): 0.15 | learning rate: 7.152E-05 | global batch size: 256 | lm loss: 3.683788E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.623 | TFLOPs: 26.18 | +7: iteration 111780/ 173500 | consumed samples: 28615680 | consumed tokens: 58604912640 | elapsed time per iteration (s): 0.15 | learning rate: 7.151E-05 | global batch size: 256 | lm loss: 3.690267E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.357 | TFLOPs: 26.16 | +7: iteration 111790/ 173500 | consumed samples: 28618240 | consumed tokens: 58610155520 | elapsed time per iteration (s): 0.15 | learning rate: 7.149E-05 | global batch size: 256 | lm loss: 3.680677E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.419 | TFLOPs: 26.10 | +7: iteration 111800/ 173500 | consumed samples: 28620800 | consumed tokens: 58615398400 | elapsed time per iteration (s): 0.15 | learning rate: 7.148E-05 | global batch size: 256 | lm loss: 3.673130E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.685 | TFLOPs: 26.17 | +7: iteration 111810/ 173500 | consumed samples: 28623360 | consumed tokens: 58620641280 | elapsed time per iteration (s): 0.15 | learning rate: 7.146E-05 | global batch size: 256 | lm loss: 3.691493E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.805 | TFLOPs: 26.17 | +7: iteration 111820/ 173500 | consumed samples: 28625920 | consumed tokens: 58625884160 | elapsed time per iteration (s): 0.15 | learning rate: 7.145E-05 | global batch size: 256 | lm loss: 3.697684E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.603 | TFLOPs: 26.17 | +7: iteration 111830/ 173500 | consumed samples: 28628480 | consumed tokens: 58631127040 | elapsed time per iteration (s): 0.15 | learning rate: 7.143E-05 | global batch size: 256 | lm loss: 3.681565E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.954 | TFLOPs: 26.17 | +7: iteration 111840/ 173500 | consumed samples: 28631040 | consumed tokens: 58636369920 | elapsed time per iteration (s): 0.15 | learning rate: 7.142E-05 | global batch size: 256 | lm loss: 3.683949E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.228 | TFLOPs: 26.18 | +7: iteration 111850/ 173500 | consumed samples: 28633600 | consumed tokens: 58641612800 | elapsed time per iteration (s): 0.15 | learning rate: 7.140E-05 | global batch size: 256 | lm loss: 3.691206E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.156 | TFLOPs: 26.18 | +7: iteration 111860/ 173500 | consumed samples: 28636160 | consumed tokens: 58646855680 | elapsed time per iteration (s): 0.15 | learning rate: 7.139E-05 | global batch size: 256 | lm loss: 3.685698E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.106 | TFLOPs: 26.19 | +7: iteration 111870/ 173500 | consumed samples: 28638720 | consumed tokens: 58652098560 | elapsed time per iteration (s): 0.15 | learning rate: 7.137E-05 | global batch size: 256 | lm loss: 3.695164E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.190 | TFLOPs: 26.22 | +7: iteration 111880/ 173500 | consumed samples: 28641280 | consumed tokens: 58657341440 | elapsed time per iteration (s): 0.15 | learning rate: 7.136E-05 | global batch size: 256 | lm loss: 3.689881E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.377 | TFLOPs: 26.23 | +7: iteration 111890/ 173500 | consumed samples: 28643840 | consumed tokens: 58662584320 | elapsed time per iteration (s): 0.15 | learning rate: 7.135E-05 | global batch size: 256 | lm loss: 3.691961E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.743 | TFLOPs: 26.23 | +7: iteration 111900/ 173500 | consumed samples: 28646400 | consumed tokens: 58667827200 | elapsed time per iteration (s): 0.15 | learning rate: 7.133E-05 | global batch size: 256 | lm loss: 3.690754E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.633 | TFLOPs: 26.23 | +7: iteration 111910/ 173500 | consumed samples: 28648960 | consumed tokens: 58673070080 | elapsed time per iteration (s): 0.15 | learning rate: 7.132E-05 | global batch size: 256 | lm loss: 3.687478E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.484 | TFLOPs: 26.23 | +7: iteration 111920/ 173500 | consumed samples: 28651520 | consumed tokens: 58678312960 | elapsed time per iteration (s): 0.15 | learning rate: 7.130E-05 | global batch size: 256 | lm loss: 3.688763E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.617 | TFLOPs: 26.23 | +7: iteration 111930/ 173500 | consumed samples: 28654080 | consumed tokens: 58683555840 | elapsed time per iteration (s): 0.15 | learning rate: 7.129E-05 | global batch size: 256 | lm loss: 3.682219E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.657 | TFLOPs: 26.25 | +7: iteration 111940/ 173500 | consumed samples: 28656640 | consumed tokens: 58688798720 | elapsed time per iteration (s): 0.15 | learning rate: 7.127E-05 | global batch size: 256 | lm loss: 3.671158E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.363 | TFLOPs: 26.20 | +7: iteration 111950/ 173500 | consumed samples: 28659200 | consumed tokens: 58694041600 | elapsed time per iteration (s): 0.15 | learning rate: 7.126E-05 | global batch size: 256 | lm loss: 3.685537E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.665 | TFLOPs: 26.23 | +7: iteration 111960/ 173500 | consumed samples: 28661760 | consumed tokens: 58699284480 | elapsed time per iteration (s): 0.15 | learning rate: 7.124E-05 | global batch size: 256 | lm loss: 3.685286E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.844 | TFLOPs: 26.23 | +7: iteration 111970/ 173500 | consumed samples: 28664320 | consumed tokens: 58704527360 | elapsed time per iteration (s): 0.15 | learning rate: 7.123E-05 | global batch size: 256 | lm loss: 3.674645E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.761 | TFLOPs: 26.23 | +7: iteration 111980/ 173500 | consumed samples: 28666880 | consumed tokens: 58709770240 | elapsed time per iteration (s): 0.16 | learning rate: 7.121E-05 | global batch size: 256 | lm loss: 3.675411E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.415 | TFLOPs: 25.80 | +7: iteration 111990/ 173500 | consumed samples: 28669440 | consumed tokens: 58715013120 | elapsed time per iteration (s): 0.16 | learning rate: 7.120E-05 | global batch size: 256 | lm loss: 3.667115E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.962 | TFLOPs: 25.70 | +0: [2023-03-17 05:06:52,331] [INFO] [logging.py:68:log_dist] [Rank 0] step=112000, skipped=0, lr=[7.118156405567987e-05, 7.118156405567987e-05, 7.118156405567987e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 112000/ 173500 | consumed samples: 28672000 | consumed tokens: 58720256000 | elapsed time per iteration (s): 0.15 | learning rate: 7.118E-05 | global batch size: 256 | lm loss: 3.693285E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.263 | TFLOPs: 26.18 | +0: steps: 112000 loss: 3.6602 iter time (s): 0.153 samples/sec: 1675.677 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 112000 | lm loss value: 3.840583E+00 | lm loss PPL: 4.655262E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 112000 to checkpoints_44m91b100m +0: [2023-03-17 05:06:52,405] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step112000 is begin to save! +0: [2023-03-17 05:06:52,408] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:06:52,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:06:52,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:06:52,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:06:52,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:06:52,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:06:52,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:06:52,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:06:52,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:06:52,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:06:52,503] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:06:52,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:06:52,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:06:52,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:06:52,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:06:52,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:06:52,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:06:52,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:06:52,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:06:52,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:06:52,536] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step112000/mp_rank_00_model_states.pt +0: [2023-03-17 05:06:52,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:06:52,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:06:52,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:06:52,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +5: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +6: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +1: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 05:06:52,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +4: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +2: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +7: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +3: [2023-03-17 05:06:52,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step112000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:06:52,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step112000 is ready now! +0: successfully saved checkpoint at iteration 112000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 175.40 +7: iteration 112010/ 173500 | consumed samples: 28674560 | consumed tokens: 58725498880 | elapsed time per iteration (s): 0.18 | learning rate: 7.117E-05 | global batch size: 256 | lm loss: 3.699460E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.096 | TFLOPs: 22.60 | +7: iteration 112020/ 173500 | consumed samples: 28677120 | consumed tokens: 58730741760 | elapsed time per iteration (s): 0.15 | learning rate: 7.115E-05 | global batch size: 256 | lm loss: 3.689647E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.818 | TFLOPs: 26.17 | +7: iteration 112030/ 173500 | consumed samples: 28679680 | consumed tokens: 58735984640 | elapsed time per iteration (s): 0.15 | learning rate: 7.114E-05 | global batch size: 256 | lm loss: 3.678005E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.670 | TFLOPs: 26.17 | +7: iteration 112040/ 173500 | consumed samples: 28682240 | consumed tokens: 58741227520 | elapsed time per iteration (s): 0.15 | learning rate: 7.112E-05 | global batch size: 256 | lm loss: 3.679259E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.707 | TFLOPs: 25.93 | +7: iteration 112050/ 173500 | consumed samples: 28684800 | consumed tokens: 58746470400 | elapsed time per iteration (s): 0.15 | learning rate: 7.111E-05 | global batch size: 256 | lm loss: 3.673165E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.896 | TFLOPs: 26.14 | +7: iteration 112060/ 173500 | consumed samples: 28687360 | consumed tokens: 58751713280 | elapsed time per iteration (s): 0.15 | learning rate: 7.109E-05 | global batch size: 256 | lm loss: 3.678527E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.810 | TFLOPs: 26.17 | +7: iteration 112070/ 173500 | consumed samples: 28689920 | consumed tokens: 58756956160 | elapsed time per iteration (s): 0.15 | learning rate: 7.108E-05 | global batch size: 256 | lm loss: 3.682492E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.549 | TFLOPs: 26.03 | +7: iteration 112080/ 173500 | consumed samples: 28692480 | consumed tokens: 58762199040 | elapsed time per iteration (s): 0.15 | learning rate: 7.106E-05 | global batch size: 256 | lm loss: 3.679608E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.560 | TFLOPs: 26.15 | +7: iteration 112090/ 173500 | consumed samples: 28695040 | consumed tokens: 58767441920 | elapsed time per iteration (s): 0.16 | learning rate: 7.105E-05 | global batch size: 256 | lm loss: 3.678176E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.432 | TFLOPs: 25.71 | +7: iteration 112100/ 173500 | consumed samples: 28697600 | consumed tokens: 58772684800 | elapsed time per iteration (s): 0.15 | learning rate: 7.103E-05 | global batch size: 256 | lm loss: 3.682586E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.703 | TFLOPs: 26.15 | +7: iteration 112110/ 173500 | consumed samples: 28700160 | consumed tokens: 58777927680 | elapsed time per iteration (s): 0.16 | learning rate: 7.102E-05 | global batch size: 256 | lm loss: 3.687006E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.002 | TFLOPs: 25.36 | +7: iteration 112120/ 173500 | consumed samples: 28702720 | consumed tokens: 58783170560 | elapsed time per iteration (s): 0.15 | learning rate: 7.100E-05 | global batch size: 256 | lm loss: 3.668732E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.888 | TFLOPs: 26.14 | +7: iteration 112130/ 173500 | consumed samples: 28705280 | consumed tokens: 58788413440 | elapsed time per iteration (s): 0.16 | learning rate: 7.099E-05 | global batch size: 256 | lm loss: 3.682358E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.005 | TFLOPs: 25.22 | +7: iteration 112140/ 173500 | consumed samples: 28707840 | consumed tokens: 58793656320 | elapsed time per iteration (s): 0.19 | learning rate: 7.097E-05 | global batch size: 256 | lm loss: 3.680394E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1341.832 | TFLOPs: 21.04 | +7: iteration 112150/ 173500 | consumed samples: 28710400 | consumed tokens: 58798899200 | elapsed time per iteration (s): 0.26 | learning rate: 7.096E-05 | global batch size: 256 | lm loss: 3.686413E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1002.892 | TFLOPs: 15.73 | +7: iteration 112160/ 173500 | consumed samples: 28712960 | consumed tokens: 58804142080 | elapsed time per iteration (s): 0.15 | learning rate: 7.094E-05 | global batch size: 256 | lm loss: 3.682600E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.033 | TFLOPs: 26.00 | +7: iteration 112170/ 173500 | consumed samples: 28715520 | consumed tokens: 58809384960 | elapsed time per iteration (s): 0.27 | learning rate: 7.093E-05 | global batch size: 256 | lm loss: 3.677643E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 940.728 | TFLOPs: 14.75 | +7: iteration 112180/ 173500 | consumed samples: 28718080 | consumed tokens: 58814627840 | elapsed time per iteration (s): 0.39 | learning rate: 7.091E-05 | global batch size: 256 | lm loss: 3.684814E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 652.766 | TFLOPs: 10.24 | +7: iteration 112190/ 173500 | consumed samples: 28720640 | consumed tokens: 58819870720 | elapsed time per iteration (s): 0.33 | learning rate: 7.090E-05 | global batch size: 256 | lm loss: 3.678202E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 781.020 | TFLOPs: 12.25 | +7: iteration 112200/ 173500 | consumed samples: 28723200 | consumed tokens: 58825113600 | elapsed time per iteration (s): 0.51 | learning rate: 7.088E-05 | global batch size: 256 | lm loss: 3.681369E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 503.868 | TFLOPs: 7.90 | +7: iteration 112210/ 173500 | consumed samples: 28725760 | consumed tokens: 58830356480 | elapsed time per iteration (s): 0.23 | learning rate: 7.087E-05 | global batch size: 256 | lm loss: 3.686680E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.568 | TFLOPs: 17.84 | +7: iteration 112220/ 173500 | consumed samples: 28728320 | consumed tokens: 58835599360 | elapsed time per iteration (s): 0.33 | learning rate: 7.086E-05 | global batch size: 256 | lm loss: 3.676296E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 769.039 | TFLOPs: 12.06 | +7: iteration 112230/ 173500 | consumed samples: 28730880 | consumed tokens: 58840842240 | elapsed time per iteration (s): 0.27 | learning rate: 7.084E-05 | global batch size: 256 | lm loss: 3.687416E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 940.075 | TFLOPs: 14.74 | +7: iteration 112240/ 173500 | consumed samples: 28733440 | consumed tokens: 58846085120 | elapsed time per iteration (s): 0.15 | learning rate: 7.083E-05 | global batch size: 256 | lm loss: 3.683999E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.139 | TFLOPs: 26.11 | +7: iteration 112250/ 173500 | consumed samples: 28736000 | consumed tokens: 58851328000 | elapsed time per iteration (s): 0.15 | learning rate: 7.081E-05 | global batch size: 256 | lm loss: 3.689975E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.614 | TFLOPs: 26.22 | +7: iteration 112260/ 173500 | consumed samples: 28738560 | consumed tokens: 58856570880 | elapsed time per iteration (s): 0.15 | learning rate: 7.080E-05 | global batch size: 256 | lm loss: 3.682214E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.116 | TFLOPs: 26.21 | +7: iteration 112270/ 173500 | consumed samples: 28741120 | consumed tokens: 58861813760 | elapsed time per iteration (s): 0.15 | learning rate: 7.078E-05 | global batch size: 256 | lm loss: 3.690825E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.411 | TFLOPs: 26.21 | +7: iteration 112280/ 173500 | consumed samples: 28743680 | consumed tokens: 58867056640 | elapsed time per iteration (s): 0.15 | learning rate: 7.077E-05 | global batch size: 256 | lm loss: 3.694499E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.660 | TFLOPs: 26.20 | +7: iteration 112290/ 173500 | consumed samples: 28746240 | consumed tokens: 58872299520 | elapsed time per iteration (s): 0.15 | learning rate: 7.075E-05 | global batch size: 256 | lm loss: 3.677069E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.016 | TFLOPs: 26.21 | +7: iteration 112300/ 173500 | consumed samples: 28748800 | consumed tokens: 58877542400 | elapsed time per iteration (s): 0.15 | learning rate: 7.074E-05 | global batch size: 256 | lm loss: 3.684766E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.056 | TFLOPs: 26.21 | +7: iteration 112310/ 173500 | consumed samples: 28751360 | consumed tokens: 58882785280 | elapsed time per iteration (s): 0.15 | learning rate: 7.072E-05 | global batch size: 256 | lm loss: 3.696976E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.683 | TFLOPs: 26.20 | +7: iteration 112320/ 173500 | consumed samples: 28753920 | consumed tokens: 58888028160 | elapsed time per iteration (s): 0.15 | learning rate: 7.071E-05 | global batch size: 256 | lm loss: 3.670133E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.913 | TFLOPs: 26.19 | +7: iteration 112330/ 173500 | consumed samples: 28756480 | consumed tokens: 58893271040 | elapsed time per iteration (s): 0.15 | learning rate: 7.069E-05 | global batch size: 256 | lm loss: 3.679322E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.679 | TFLOPs: 26.20 | +7: iteration 112340/ 173500 | consumed samples: 28759040 | consumed tokens: 58898513920 | elapsed time per iteration (s): 0.15 | learning rate: 7.068E-05 | global batch size: 256 | lm loss: 3.694151E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.677 | TFLOPs: 26.18 | +7: iteration 112350/ 173500 | consumed samples: 28761600 | consumed tokens: 58903756800 | elapsed time per iteration (s): 0.15 | learning rate: 7.066E-05 | global batch size: 256 | lm loss: 3.692842E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.509 | TFLOPs: 26.24 | +7: iteration 112360/ 173500 | consumed samples: 28764160 | consumed tokens: 58908999680 | elapsed time per iteration (s): 0.15 | learning rate: 7.065E-05 | global batch size: 256 | lm loss: 3.676047E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.375 | TFLOPs: 26.24 | +7: iteration 112370/ 173500 | consumed samples: 28766720 | consumed tokens: 58914242560 | elapsed time per iteration (s): 0.15 | learning rate: 7.063E-05 | global batch size: 256 | lm loss: 3.691015E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.977 | TFLOPs: 26.24 | +7: iteration 112380/ 173500 | consumed samples: 28769280 | consumed tokens: 58919485440 | elapsed time per iteration (s): 0.15 | learning rate: 7.062E-05 | global batch size: 256 | lm loss: 3.686189E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.566 | TFLOPs: 26.23 | +7: iteration 112390/ 173500 | consumed samples: 28771840 | consumed tokens: 58924728320 | elapsed time per iteration (s): 0.15 | learning rate: 7.060E-05 | global batch size: 256 | lm loss: 3.679639E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.529 | TFLOPs: 26.23 | +7: iteration 112400/ 173500 | consumed samples: 28774400 | consumed tokens: 58929971200 | elapsed time per iteration (s): 0.15 | learning rate: 7.059E-05 | global batch size: 256 | lm loss: 3.694389E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.112 | TFLOPs: 26.25 | +7: iteration 112410/ 173500 | consumed samples: 28776960 | consumed tokens: 58935214080 | elapsed time per iteration (s): 0.15 | learning rate: 7.057E-05 | global batch size: 256 | lm loss: 3.677438E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.285 | TFLOPs: 26.23 | +7: iteration 112420/ 173500 | consumed samples: 28779520 | consumed tokens: 58940456960 | elapsed time per iteration (s): 0.15 | learning rate: 7.056E-05 | global batch size: 256 | lm loss: 3.672672E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.031 | TFLOPs: 26.25 | +7: iteration 112430/ 173500 | consumed samples: 28782080 | consumed tokens: 58945699840 | elapsed time per iteration (s): 0.15 | learning rate: 7.054E-05 | global batch size: 256 | lm loss: 3.680216E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.671 | TFLOPs: 26.25 | +7: iteration 112440/ 173500 | consumed samples: 28784640 | consumed tokens: 58950942720 | elapsed time per iteration (s): 0.15 | learning rate: 7.053E-05 | global batch size: 256 | lm loss: 3.673813E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.770 | TFLOPs: 26.09 | +7: iteration 112450/ 173500 | consumed samples: 28787200 | consumed tokens: 58956185600 | elapsed time per iteration (s): 0.15 | learning rate: 7.051E-05 | global batch size: 256 | lm loss: 3.672373E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.019 | TFLOPs: 26.21 | +7: iteration 112460/ 173500 | consumed samples: 28789760 | consumed tokens: 58961428480 | elapsed time per iteration (s): 0.15 | learning rate: 7.050E-05 | global batch size: 256 | lm loss: 3.681546E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.137 | TFLOPs: 26.24 | +7: iteration 112470/ 173500 | consumed samples: 28792320 | consumed tokens: 58966671360 | elapsed time per iteration (s): 0.15 | learning rate: 7.049E-05 | global batch size: 256 | lm loss: 3.691470E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.173 | TFLOPs: 26.22 | +7: iteration 112480/ 173500 | consumed samples: 28794880 | consumed tokens: 58971914240 | elapsed time per iteration (s): 0.15 | learning rate: 7.047E-05 | global batch size: 256 | lm loss: 3.677461E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.028 | TFLOPs: 26.24 | +7: iteration 112490/ 173500 | consumed samples: 28797440 | consumed tokens: 58977157120 | elapsed time per iteration (s): 0.15 | learning rate: 7.046E-05 | global batch size: 256 | lm loss: 3.695395E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.129 | TFLOPs: 26.24 | +7: iteration 112500/ 173500 | consumed samples: 28800000 | consumed tokens: 58982400000 | elapsed time per iteration (s): 0.15 | learning rate: 7.044E-05 | global batch size: 256 | lm loss: 3.688529E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.504 | TFLOPs: 26.24 | +7: iteration 112510/ 173500 | consumed samples: 28802560 | consumed tokens: 58987642880 | elapsed time per iteration (s): 0.15 | learning rate: 7.043E-05 | global batch size: 256 | lm loss: 3.673378E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.005 | TFLOPs: 26.24 | +7: iteration 112520/ 173500 | consumed samples: 28805120 | consumed tokens: 58992885760 | elapsed time per iteration (s): 0.15 | learning rate: 7.041E-05 | global batch size: 256 | lm loss: 3.696783E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.313 | TFLOPs: 26.24 | +7: iteration 112530/ 173500 | consumed samples: 28807680 | consumed tokens: 58998128640 | elapsed time per iteration (s): 0.15 | learning rate: 7.040E-05 | global batch size: 256 | lm loss: 3.681718E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.870 | TFLOPs: 26.22 | +7: iteration 112540/ 173500 | consumed samples: 28810240 | consumed tokens: 59003371520 | elapsed time per iteration (s): 0.15 | learning rate: 7.038E-05 | global batch size: 256 | lm loss: 3.671834E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.146 | TFLOPs: 26.22 | +7: iteration 112550/ 173500 | consumed samples: 28812800 | consumed tokens: 59008614400 | elapsed time per iteration (s): 0.15 | learning rate: 7.037E-05 | global batch size: 256 | lm loss: 3.690671E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.263 | TFLOPs: 26.23 | +7: iteration 112560/ 173500 | consumed samples: 28815360 | consumed tokens: 59013857280 | elapsed time per iteration (s): 0.15 | learning rate: 7.035E-05 | global batch size: 256 | lm loss: 3.684793E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.931 | TFLOPs: 26.19 | +7: iteration 112570/ 173500 | consumed samples: 28817920 | consumed tokens: 59019100160 | elapsed time per iteration (s): 0.15 | learning rate: 7.034E-05 | global batch size: 256 | lm loss: 3.671536E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.295 | TFLOPs: 26.21 | +7: iteration 112580/ 173500 | consumed samples: 28820480 | consumed tokens: 59024343040 | elapsed time per iteration (s): 0.15 | learning rate: 7.032E-05 | global batch size: 256 | lm loss: 3.682682E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.342 | TFLOPs: 26.21 | +7: iteration 112590/ 173500 | consumed samples: 28823040 | consumed tokens: 59029585920 | elapsed time per iteration (s): 0.15 | learning rate: 7.031E-05 | global batch size: 256 | lm loss: 3.682906E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.425 | TFLOPs: 26.21 | +7: iteration 112600/ 173500 | consumed samples: 28825600 | consumed tokens: 59034828800 | elapsed time per iteration (s): 0.15 | learning rate: 7.029E-05 | global batch size: 256 | lm loss: 3.685647E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.876 | TFLOPs: 26.23 | +7: iteration 112610/ 173500 | consumed samples: 28828160 | consumed tokens: 59040071680 | elapsed time per iteration (s): 0.15 | learning rate: 7.028E-05 | global batch size: 256 | lm loss: 3.686718E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.941 | TFLOPs: 26.25 | +7: iteration 112620/ 173500 | consumed samples: 28830720 | consumed tokens: 59045314560 | elapsed time per iteration (s): 0.15 | learning rate: 7.026E-05 | global batch size: 256 | lm loss: 3.699373E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.831 | TFLOPs: 26.27 | +7: iteration 112630/ 173500 | consumed samples: 28833280 | consumed tokens: 59050557440 | elapsed time per iteration (s): 0.16 | learning rate: 7.025E-05 | global batch size: 256 | lm loss: 3.676807E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.225 | TFLOPs: 25.80 | +7: iteration 112640/ 173500 | consumed samples: 28835840 | consumed tokens: 59055800320 | elapsed time per iteration (s): 0.15 | learning rate: 7.023E-05 | global batch size: 256 | lm loss: 3.684125E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.724 | TFLOPs: 26.06 | +7: iteration 112650/ 173500 | consumed samples: 28838400 | consumed tokens: 59061043200 | elapsed time per iteration (s): 0.15 | learning rate: 7.022E-05 | global batch size: 256 | lm loss: 3.674318E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.551 | TFLOPs: 26.06 | +7: iteration 112660/ 173500 | consumed samples: 28840960 | consumed tokens: 59066286080 | elapsed time per iteration (s): 0.15 | learning rate: 7.020E-05 | global batch size: 256 | lm loss: 3.687273E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.758 | TFLOPs: 26.11 | +7: iteration 112670/ 173500 | consumed samples: 28843520 | consumed tokens: 59071528960 | elapsed time per iteration (s): 0.15 | learning rate: 7.019E-05 | global batch size: 256 | lm loss: 3.677582E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.634 | TFLOPs: 26.12 | +7: iteration 112680/ 173500 | consumed samples: 28846080 | consumed tokens: 59076771840 | elapsed time per iteration (s): 0.15 | learning rate: 7.017E-05 | global batch size: 256 | lm loss: 3.686401E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.876 | TFLOPs: 26.09 | +7: iteration 112690/ 173500 | consumed samples: 28848640 | consumed tokens: 59082014720 | elapsed time per iteration (s): 0.15 | learning rate: 7.016E-05 | global batch size: 256 | lm loss: 3.688935E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.435 | TFLOPs: 26.10 | +7: iteration 112700/ 173500 | consumed samples: 28851200 | consumed tokens: 59087257600 | elapsed time per iteration (s): 0.16 | learning rate: 7.015E-05 | global batch size: 256 | lm loss: 3.686761E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.469 | TFLOPs: 25.66 | +7: iteration 112710/ 173500 | consumed samples: 28853760 | consumed tokens: 59092500480 | elapsed time per iteration (s): 0.15 | learning rate: 7.013E-05 | global batch size: 256 | lm loss: 3.667681E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.675 | TFLOPs: 26.12 | +7: iteration 112720/ 173500 | consumed samples: 28856320 | consumed tokens: 59097743360 | elapsed time per iteration (s): 0.15 | learning rate: 7.012E-05 | global batch size: 256 | lm loss: 3.671033E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.459 | TFLOPs: 26.17 | +7: iteration 112730/ 173500 | consumed samples: 28858880 | consumed tokens: 59102986240 | elapsed time per iteration (s): 0.15 | learning rate: 7.010E-05 | global batch size: 256 | lm loss: 3.685880E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.141 | TFLOPs: 26.22 | +7: iteration 112740/ 173500 | consumed samples: 28861440 | consumed tokens: 59108229120 | elapsed time per iteration (s): 0.15 | learning rate: 7.009E-05 | global batch size: 256 | lm loss: 3.685951E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.756 | TFLOPs: 26.23 | +7: iteration 112750/ 173500 | consumed samples: 28864000 | consumed tokens: 59113472000 | elapsed time per iteration (s): 0.15 | learning rate: 7.007E-05 | global batch size: 256 | lm loss: 3.692259E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.044 | TFLOPs: 26.24 | +7: iteration 112760/ 173500 | consumed samples: 28866560 | consumed tokens: 59118714880 | elapsed time per iteration (s): 0.15 | learning rate: 7.006E-05 | global batch size: 256 | lm loss: 3.692220E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.065 | TFLOPs: 26.22 | +7: iteration 112770/ 173500 | consumed samples: 28869120 | consumed tokens: 59123957760 | elapsed time per iteration (s): 0.15 | learning rate: 7.004E-05 | global batch size: 256 | lm loss: 3.684748E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.902 | TFLOPs: 26.20 | +7: iteration 112780/ 173500 | consumed samples: 28871680 | consumed tokens: 59129200640 | elapsed time per iteration (s): 0.15 | learning rate: 7.003E-05 | global batch size: 256 | lm loss: 3.687180E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.856 | TFLOPs: 26.22 | +7: iteration 112790/ 173500 | consumed samples: 28874240 | consumed tokens: 59134443520 | elapsed time per iteration (s): 0.15 | learning rate: 7.001E-05 | global batch size: 256 | lm loss: 3.683596E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.550 | TFLOPs: 26.12 | +7: iteration 112800/ 173500 | consumed samples: 28876800 | consumed tokens: 59139686400 | elapsed time per iteration (s): 0.15 | learning rate: 7.000E-05 | global batch size: 256 | lm loss: 3.687590E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.584 | TFLOPs: 26.20 | +7: iteration 112810/ 173500 | consumed samples: 28879360 | consumed tokens: 59144929280 | elapsed time per iteration (s): 0.15 | learning rate: 6.998E-05 | global batch size: 256 | lm loss: 3.679240E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.512 | TFLOPs: 26.21 | +7: iteration 112820/ 173500 | consumed samples: 28881920 | consumed tokens: 59150172160 | elapsed time per iteration (s): 0.15 | learning rate: 6.997E-05 | global batch size: 256 | lm loss: 3.677805E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.393 | TFLOPs: 26.21 | +7: iteration 112830/ 173500 | consumed samples: 28884480 | consumed tokens: 59155415040 | elapsed time per iteration (s): 0.15 | learning rate: 6.995E-05 | global batch size: 256 | lm loss: 3.682986E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.322 | TFLOPs: 26.23 | +7: iteration 112840/ 173500 | consumed samples: 28887040 | consumed tokens: 59160657920 | elapsed time per iteration (s): 0.15 | learning rate: 6.994E-05 | global batch size: 256 | lm loss: 3.684771E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.072 | TFLOPs: 26.22 | +7: iteration 112850/ 173500 | consumed samples: 28889600 | consumed tokens: 59165900800 | elapsed time per iteration (s): 0.15 | learning rate: 6.992E-05 | global batch size: 256 | lm loss: 3.697298E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.494 | TFLOPs: 25.95 | +7: iteration 112860/ 173500 | consumed samples: 28892160 | consumed tokens: 59171143680 | elapsed time per iteration (s): 0.16 | learning rate: 6.991E-05 | global batch size: 256 | lm loss: 3.684681E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.407 | TFLOPs: 25.85 | +7: iteration 112870/ 173500 | consumed samples: 28894720 | consumed tokens: 59176386560 | elapsed time per iteration (s): 0.15 | learning rate: 6.989E-05 | global batch size: 256 | lm loss: 3.682036E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.694 | TFLOPs: 26.18 | +7: iteration 112880/ 173500 | consumed samples: 28897280 | consumed tokens: 59181629440 | elapsed time per iteration (s): 0.15 | learning rate: 6.988E-05 | global batch size: 256 | lm loss: 3.673319E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.110 | TFLOPs: 26.18 | +7: iteration 112890/ 173500 | consumed samples: 28899840 | consumed tokens: 59186872320 | elapsed time per iteration (s): 0.15 | learning rate: 6.987E-05 | global batch size: 256 | lm loss: 3.676786E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.000 | TFLOPs: 26.17 | +7: iteration 112900/ 173500 | consumed samples: 28902400 | consumed tokens: 59192115200 | elapsed time per iteration (s): 0.15 | learning rate: 6.985E-05 | global batch size: 256 | lm loss: 3.687102E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.344 | TFLOPs: 26.20 | +7: iteration 112910/ 173500 | consumed samples: 28904960 | consumed tokens: 59197358080 | elapsed time per iteration (s): 0.15 | learning rate: 6.984E-05 | global batch size: 256 | lm loss: 3.687899E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.010 | TFLOPs: 26.22 | +7: iteration 112920/ 173500 | consumed samples: 28907520 | consumed tokens: 59202600960 | elapsed time per iteration (s): 0.15 | learning rate: 6.982E-05 | global batch size: 256 | lm loss: 3.686589E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.086 | TFLOPs: 26.22 | +7: iteration 112930/ 173500 | consumed samples: 28910080 | consumed tokens: 59207843840 | elapsed time per iteration (s): 0.15 | learning rate: 6.981E-05 | global batch size: 256 | lm loss: 3.677257E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.298 | TFLOPs: 25.91 | +7: iteration 112940/ 173500 | consumed samples: 28912640 | consumed tokens: 59213086720 | elapsed time per iteration (s): 0.15 | learning rate: 6.979E-05 | global batch size: 256 | lm loss: 3.684966E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.554 | TFLOPs: 26.03 | +7: iteration 112950/ 173500 | consumed samples: 28915200 | consumed tokens: 59218329600 | elapsed time per iteration (s): 0.16 | learning rate: 6.978E-05 | global batch size: 256 | lm loss: 3.676230E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.232 | TFLOPs: 25.83 | +7: iteration 112960/ 173500 | consumed samples: 28917760 | consumed tokens: 59223572480 | elapsed time per iteration (s): 0.16 | learning rate: 6.976E-05 | global batch size: 256 | lm loss: 3.680206E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.591 | TFLOPs: 25.32 | +7: iteration 112970/ 173500 | consumed samples: 28920320 | consumed tokens: 59228815360 | elapsed time per iteration (s): 0.16 | learning rate: 6.975E-05 | global batch size: 256 | lm loss: 3.691889E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.552 | TFLOPs: 25.87 | +7: iteration 112980/ 173500 | consumed samples: 28922880 | consumed tokens: 59234058240 | elapsed time per iteration (s): 0.16 | learning rate: 6.973E-05 | global batch size: 256 | lm loss: 3.672266E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.175 | TFLOPs: 25.24 | +7: iteration 112990/ 173500 | consumed samples: 28925440 | consumed tokens: 59239301120 | elapsed time per iteration (s): 0.15 | learning rate: 6.972E-05 | global batch size: 256 | lm loss: 3.677422E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.775 | TFLOPs: 25.97 | +7: iteration 113000/ 173500 | consumed samples: 28928000 | consumed tokens: 59244544000 | elapsed time per iteration (s): 0.16 | learning rate: 6.970E-05 | global batch size: 256 | lm loss: 3.686703E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.146 | TFLOPs: 25.53 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 113000 | lm loss value: 3.833106E+00 | lm loss PPL: 4.620583E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 113000 to checkpoints_44m91b100m +0: [2023-03-17 05:09:40,357] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step113000 is begin to save! +0: [2023-03-17 05:09:40,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:09:40,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:09:40,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:09:40,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:09:40,437] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:09:40,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:09:40,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:09:40,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:09:40,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:09:40,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:09:40,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:09:40,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:09:40,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:09:40,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:09:40,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:09:40,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:09:40,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:09:40,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:09:40,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:09:40,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:09:40,496] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step113000/mp_rank_00_model_states.pt +0: [2023-03-17 05:09:40,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:09:40,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:09:40,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:09:40,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +5: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +6: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +7: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:09:40,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +2: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:09:40,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +4: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:09:40,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 05:09:40,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +3: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:09:40,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:09:40,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +1: [2023-03-17 05:09:40,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:09:40,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step113000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:09:40,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step113000 is ready now! +0: successfully saved checkpoint at iteration 113000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.59 +7: iteration 113010/ 173500 | consumed samples: 28930560 | consumed tokens: 59249786880 | elapsed time per iteration (s): 0.18 | learning rate: 6.969E-05 | global batch size: 256 | lm loss: 3.676402E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1425.377 | TFLOPs: 22.35 | +7: iteration 113020/ 173500 | consumed samples: 28933120 | consumed tokens: 59255029760 | elapsed time per iteration (s): 0.16 | learning rate: 6.967E-05 | global batch size: 256 | lm loss: 3.673513E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.210 | TFLOPs: 25.49 | +7: iteration 113030/ 173500 | consumed samples: 28935680 | consumed tokens: 59260272640 | elapsed time per iteration (s): 0.16 | learning rate: 6.966E-05 | global batch size: 256 | lm loss: 3.666355E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.263 | TFLOPs: 25.54 | +7: iteration 113040/ 173500 | consumed samples: 28938240 | consumed tokens: 59265515520 | elapsed time per iteration (s): 0.15 | learning rate: 6.964E-05 | global batch size: 256 | lm loss: 3.691648E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.891 | TFLOPs: 26.17 | +7: iteration 113050/ 173500 | consumed samples: 28940800 | consumed tokens: 59270758400 | elapsed time per iteration (s): 0.16 | learning rate: 6.963E-05 | global batch size: 256 | lm loss: 3.679565E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.253 | TFLOPs: 25.71 | +7: iteration 113060/ 173500 | consumed samples: 28943360 | consumed tokens: 59276001280 | elapsed time per iteration (s): 0.15 | learning rate: 6.961E-05 | global batch size: 256 | lm loss: 3.686448E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.419 | TFLOPs: 26.16 | +7: iteration 113070/ 173500 | consumed samples: 28945920 | consumed tokens: 59281244160 | elapsed time per iteration (s): 0.16 | learning rate: 6.960E-05 | global batch size: 256 | lm loss: 3.676278E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.373 | TFLOPs: 25.79 | +7: iteration 113080/ 173500 | consumed samples: 28948480 | consumed tokens: 59286487040 | elapsed time per iteration (s): 0.15 | learning rate: 6.959E-05 | global batch size: 256 | lm loss: 3.677759E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.161 | TFLOPs: 25.99 | +7: iteration 113090/ 173500 | consumed samples: 28951040 | consumed tokens: 59291729920 | elapsed time per iteration (s): 0.15 | learning rate: 6.957E-05 | global batch size: 256 | lm loss: 3.684558E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.835 | TFLOPs: 25.95 | +7: iteration 113100/ 173500 | consumed samples: 28953600 | consumed tokens: 59296972800 | elapsed time per iteration (s): 0.15 | learning rate: 6.956E-05 | global batch size: 256 | lm loss: 3.677695E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.014 | TFLOPs: 26.19 | +7: iteration 113110/ 173500 | consumed samples: 28956160 | consumed tokens: 59302215680 | elapsed time per iteration (s): 0.15 | learning rate: 6.954E-05 | global batch size: 256 | lm loss: 3.676871E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.008 | TFLOPs: 26.17 | +7: iteration 113120/ 173500 | consumed samples: 28958720 | consumed tokens: 59307458560 | elapsed time per iteration (s): 0.16 | learning rate: 6.953E-05 | global batch size: 256 | lm loss: 3.679601E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.862 | TFLOPs: 25.33 | +7: iteration 113130/ 173500 | consumed samples: 28961280 | consumed tokens: 59312701440 | elapsed time per iteration (s): 0.16 | learning rate: 6.951E-05 | global batch size: 256 | lm loss: 3.680872E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.170 | TFLOPs: 25.75 | +7: iteration 113140/ 173500 | consumed samples: 28963840 | consumed tokens: 59317944320 | elapsed time per iteration (s): 0.16 | learning rate: 6.950E-05 | global batch size: 256 | lm loss: 3.678734E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.182 | TFLOPs: 25.72 | +7: iteration 113150/ 173500 | consumed samples: 28966400 | consumed tokens: 59323187200 | elapsed time per iteration (s): 0.16 | learning rate: 6.948E-05 | global batch size: 256 | lm loss: 3.683394E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.434 | TFLOPs: 25.60 | +7: iteration 113160/ 173500 | consumed samples: 28968960 | consumed tokens: 59328430080 | elapsed time per iteration (s): 0.16 | learning rate: 6.947E-05 | global batch size: 256 | lm loss: 3.679382E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.847 | TFLOPs: 25.47 | +7: iteration 113170/ 173500 | consumed samples: 28971520 | consumed tokens: 59333672960 | elapsed time per iteration (s): 0.17 | learning rate: 6.945E-05 | global batch size: 256 | lm loss: 3.684355E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.394 | TFLOPs: 24.05 | +7: iteration 113180/ 173500 | consumed samples: 28974080 | consumed tokens: 59338915840 | elapsed time per iteration (s): 0.16 | learning rate: 6.944E-05 | global batch size: 256 | lm loss: 3.674124E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.947 | TFLOPs: 25.42 | +7: iteration 113190/ 173500 | consumed samples: 28976640 | consumed tokens: 59344158720 | elapsed time per iteration (s): 0.16 | learning rate: 6.942E-05 | global batch size: 256 | lm loss: 3.695164E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.836 | TFLOPs: 25.81 | +7: iteration 113200/ 173500 | consumed samples: 28979200 | consumed tokens: 59349401600 | elapsed time per iteration (s): 0.16 | learning rate: 6.941E-05 | global batch size: 256 | lm loss: 3.687486E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.250 | TFLOPs: 25.60 | +7: iteration 113210/ 173500 | consumed samples: 28981760 | consumed tokens: 59354644480 | elapsed time per iteration (s): 0.16 | learning rate: 6.939E-05 | global batch size: 256 | lm loss: 3.682840E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.939 | TFLOPs: 25.70 | +7: iteration 113220/ 173500 | consumed samples: 28984320 | consumed tokens: 59359887360 | elapsed time per iteration (s): 0.15 | learning rate: 6.938E-05 | global batch size: 256 | lm loss: 3.687065E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.112 | TFLOPs: 26.13 | +7: iteration 113230/ 173500 | consumed samples: 28986880 | consumed tokens: 59365130240 | elapsed time per iteration (s): 0.16 | learning rate: 6.936E-05 | global batch size: 256 | lm loss: 3.677142E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.940 | TFLOPs: 25.61 | +7: iteration 113240/ 173500 | consumed samples: 28989440 | consumed tokens: 59370373120 | elapsed time per iteration (s): 0.16 | learning rate: 6.935E-05 | global batch size: 256 | lm loss: 3.676550E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.468 | TFLOPs: 25.71 | +7: iteration 113250/ 173500 | consumed samples: 28992000 | consumed tokens: 59375616000 | elapsed time per iteration (s): 0.15 | learning rate: 6.934E-05 | global batch size: 256 | lm loss: 3.673055E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.067 | TFLOPs: 26.10 | +7: iteration 113260/ 173500 | consumed samples: 28994560 | consumed tokens: 59380858880 | elapsed time per iteration (s): 0.16 | learning rate: 6.932E-05 | global batch size: 256 | lm loss: 3.684265E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.733 | TFLOPs: 25.67 | +7: iteration 113270/ 173500 | consumed samples: 28997120 | consumed tokens: 59386101760 | elapsed time per iteration (s): 0.16 | learning rate: 6.931E-05 | global batch size: 256 | lm loss: 3.681753E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.207 | TFLOPs: 25.77 | +7: iteration 113280/ 173500 | consumed samples: 28999680 | consumed tokens: 59391344640 | elapsed time per iteration (s): 0.16 | learning rate: 6.929E-05 | global batch size: 256 | lm loss: 3.691175E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.866 | TFLOPs: 25.73 | +7: iteration 113290/ 173500 | consumed samples: 29002240 | consumed tokens: 59396587520 | elapsed time per iteration (s): 0.15 | learning rate: 6.928E-05 | global batch size: 256 | lm loss: 3.683065E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.186 | TFLOPs: 26.15 | +7: iteration 113300/ 173500 | consumed samples: 29004800 | consumed tokens: 59401830400 | elapsed time per iteration (s): 0.16 | learning rate: 6.926E-05 | global batch size: 256 | lm loss: 3.697034E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.168 | TFLOPs: 25.31 | +7: iteration 113310/ 173500 | consumed samples: 29007360 | consumed tokens: 59407073280 | elapsed time per iteration (s): 0.16 | learning rate: 6.925E-05 | global batch size: 256 | lm loss: 3.673807E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.584 | TFLOPs: 25.26 | +7: iteration 113320/ 173500 | consumed samples: 29009920 | consumed tokens: 59412316160 | elapsed time per iteration (s): 0.16 | learning rate: 6.923E-05 | global batch size: 256 | lm loss: 3.666334E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.271 | TFLOPs: 25.75 | +7: iteration 113330/ 173500 | consumed samples: 29012480 | consumed tokens: 59417559040 | elapsed time per iteration (s): 0.16 | learning rate: 6.922E-05 | global batch size: 256 | lm loss: 3.684615E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.578 | TFLOPs: 25.49 | +7: iteration 113340/ 173500 | consumed samples: 29015040 | consumed tokens: 59422801920 | elapsed time per iteration (s): 0.16 | learning rate: 6.920E-05 | global batch size: 256 | lm loss: 3.680657E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.653 | TFLOPs: 25.31 | +7: iteration 113350/ 173500 | consumed samples: 29017600 | consumed tokens: 59428044800 | elapsed time per iteration (s): 0.15 | learning rate: 6.919E-05 | global batch size: 256 | lm loss: 3.678833E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.802 | TFLOPs: 25.95 | +7: iteration 113360/ 173500 | consumed samples: 29020160 | consumed tokens: 59433287680 | elapsed time per iteration (s): 0.16 | learning rate: 6.917E-05 | global batch size: 256 | lm loss: 3.689938E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.185 | TFLOPs: 25.35 | +7: iteration 113370/ 173500 | consumed samples: 29022720 | consumed tokens: 59438530560 | elapsed time per iteration (s): 0.16 | learning rate: 6.916E-05 | global batch size: 256 | lm loss: 3.680019E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.444 | TFLOPs: 25.54 | +7: iteration 113380/ 173500 | consumed samples: 29025280 | consumed tokens: 59443773440 | elapsed time per iteration (s): 0.15 | learning rate: 6.914E-05 | global batch size: 256 | lm loss: 3.682363E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.560 | TFLOPs: 25.95 | +7: iteration 113390/ 173500 | consumed samples: 29027840 | consumed tokens: 59449016320 | elapsed time per iteration (s): 0.15 | learning rate: 6.913E-05 | global batch size: 256 | lm loss: 3.686086E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.068 | TFLOPs: 26.16 | +7: iteration 113400/ 173500 | consumed samples: 29030400 | consumed tokens: 59454259200 | elapsed time per iteration (s): 0.16 | learning rate: 6.912E-05 | global batch size: 256 | lm loss: 3.664272E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.744 | TFLOPs: 25.43 | +7: iteration 113410/ 173500 | consumed samples: 29032960 | consumed tokens: 59459502080 | elapsed time per iteration (s): 0.16 | learning rate: 6.910E-05 | global batch size: 256 | lm loss: 3.677424E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.842 | TFLOPs: 25.39 | +7: iteration 113420/ 173500 | consumed samples: 29035520 | consumed tokens: 59464744960 | elapsed time per iteration (s): 0.15 | learning rate: 6.909E-05 | global batch size: 256 | lm loss: 3.680209E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.160 | TFLOPs: 26.21 | +7: iteration 113430/ 173500 | consumed samples: 29038080 | consumed tokens: 59469987840 | elapsed time per iteration (s): 0.16 | learning rate: 6.907E-05 | global batch size: 256 | lm loss: 3.678072E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.145 | TFLOPs: 25.11 | +7: iteration 113440/ 173500 | consumed samples: 29040640 | consumed tokens: 59475230720 | elapsed time per iteration (s): 0.16 | learning rate: 6.906E-05 | global batch size: 256 | lm loss: 3.686180E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.241 | TFLOPs: 25.71 | +7: iteration 113450/ 173500 | consumed samples: 29043200 | consumed tokens: 59480473600 | elapsed time per iteration (s): 0.16 | learning rate: 6.904E-05 | global batch size: 256 | lm loss: 3.692362E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.386 | TFLOPs: 25.07 | +7: iteration 113460/ 173500 | consumed samples: 29045760 | consumed tokens: 59485716480 | elapsed time per iteration (s): 0.16 | learning rate: 6.903E-05 | global batch size: 256 | lm loss: 3.678339E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.591 | TFLOPs: 25.85 | +7: iteration 113470/ 173500 | consumed samples: 29048320 | consumed tokens: 59490959360 | elapsed time per iteration (s): 0.16 | learning rate: 6.901E-05 | global batch size: 256 | lm loss: 3.695848E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.520 | TFLOPs: 25.74 | +7: iteration 113480/ 173500 | consumed samples: 29050880 | consumed tokens: 59496202240 | elapsed time per iteration (s): 0.16 | learning rate: 6.900E-05 | global batch size: 256 | lm loss: 3.685976E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.027 | TFLOPs: 25.77 | +7: iteration 113490/ 173500 | consumed samples: 29053440 | consumed tokens: 59501445120 | elapsed time per iteration (s): 0.16 | learning rate: 6.898E-05 | global batch size: 256 | lm loss: 3.687458E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.964 | TFLOPs: 25.56 | +7: iteration 113500/ 173500 | consumed samples: 29056000 | consumed tokens: 59506688000 | elapsed time per iteration (s): 0.15 | learning rate: 6.897E-05 | global batch size: 256 | lm loss: 3.678830E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.887 | TFLOPs: 26.19 | +7: iteration 113510/ 173500 | consumed samples: 29058560 | consumed tokens: 59511930880 | elapsed time per iteration (s): 0.16 | learning rate: 6.895E-05 | global batch size: 256 | lm loss: 3.669178E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.352 | TFLOPs: 25.83 | +7: iteration 113520/ 173500 | consumed samples: 29061120 | consumed tokens: 59517173760 | elapsed time per iteration (s): 0.16 | learning rate: 6.894E-05 | global batch size: 256 | lm loss: 3.681036E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.445 | TFLOPs: 25.27 | +7: iteration 113530/ 173500 | consumed samples: 29063680 | consumed tokens: 59522416640 | elapsed time per iteration (s): 0.15 | learning rate: 6.892E-05 | global batch size: 256 | lm loss: 3.676023E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.932 | TFLOPs: 26.24 | +7: iteration 113540/ 173500 | consumed samples: 29066240 | consumed tokens: 59527659520 | elapsed time per iteration (s): 0.15 | learning rate: 6.891E-05 | global batch size: 256 | lm loss: 3.693593E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.394 | TFLOPs: 26.15 | +7: iteration 113550/ 173500 | consumed samples: 29068800 | consumed tokens: 59532902400 | elapsed time per iteration (s): 0.16 | learning rate: 6.890E-05 | global batch size: 256 | lm loss: 3.676043E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.390 | TFLOPs: 25.79 | +7: iteration 113560/ 173500 | consumed samples: 29071360 | consumed tokens: 59538145280 | elapsed time per iteration (s): 0.16 | learning rate: 6.888E-05 | global batch size: 256 | lm loss: 3.674154E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.747 | TFLOPs: 25.73 | +7: iteration 113570/ 173500 | consumed samples: 29073920 | consumed tokens: 59543388160 | elapsed time per iteration (s): 0.15 | learning rate: 6.887E-05 | global batch size: 256 | lm loss: 3.676767E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.303 | TFLOPs: 26.19 | +7: iteration 113580/ 173500 | consumed samples: 29076480 | consumed tokens: 59548631040 | elapsed time per iteration (s): 0.16 | learning rate: 6.885E-05 | global batch size: 256 | lm loss: 3.692268E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.727 | TFLOPs: 25.65 | +7: iteration 113590/ 173500 | consumed samples: 29079040 | consumed tokens: 59553873920 | elapsed time per iteration (s): 0.16 | learning rate: 6.884E-05 | global batch size: 256 | lm loss: 3.677682E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.081 | TFLOPs: 25.81 | +7: iteration 113600/ 173500 | consumed samples: 29081600 | consumed tokens: 59559116800 | elapsed time per iteration (s): 0.16 | learning rate: 6.882E-05 | global batch size: 256 | lm loss: 3.682524E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.476 | TFLOPs: 25.38 | +7: iteration 113610/ 173500 | consumed samples: 29084160 | consumed tokens: 59564359680 | elapsed time per iteration (s): 0.16 | learning rate: 6.881E-05 | global batch size: 256 | lm loss: 3.688451E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.080 | TFLOPs: 25.78 | +7: iteration 113620/ 173500 | consumed samples: 29086720 | consumed tokens: 59569602560 | elapsed time per iteration (s): 0.16 | learning rate: 6.879E-05 | global batch size: 256 | lm loss: 3.691104E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.630 | TFLOPs: 24.84 | +7: iteration 113630/ 173500 | consumed samples: 29089280 | consumed tokens: 59574845440 | elapsed time per iteration (s): 0.16 | learning rate: 6.878E-05 | global batch size: 256 | lm loss: 3.691027E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.807 | TFLOPs: 25.73 | +7: iteration 113640/ 173500 | consumed samples: 29091840 | consumed tokens: 59580088320 | elapsed time per iteration (s): 0.15 | learning rate: 6.876E-05 | global batch size: 256 | lm loss: 3.685123E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.465 | TFLOPs: 26.24 | +7: iteration 113650/ 173500 | consumed samples: 29094400 | consumed tokens: 59585331200 | elapsed time per iteration (s): 0.16 | learning rate: 6.875E-05 | global batch size: 256 | lm loss: 3.692374E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.363 | TFLOPs: 24.80 | +7: iteration 113660/ 173500 | consumed samples: 29096960 | consumed tokens: 59590574080 | elapsed time per iteration (s): 0.16 | learning rate: 6.873E-05 | global batch size: 256 | lm loss: 3.668871E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.981 | TFLOPs: 25.86 | +7: iteration 113670/ 173500 | consumed samples: 29099520 | consumed tokens: 59595816960 | elapsed time per iteration (s): 0.16 | learning rate: 6.872E-05 | global batch size: 256 | lm loss: 3.674460E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.086 | TFLOPs: 25.61 | +7: iteration 113680/ 173500 | consumed samples: 29102080 | consumed tokens: 59601059840 | elapsed time per iteration (s): 0.15 | learning rate: 6.871E-05 | global batch size: 256 | lm loss: 3.691378E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.523 | TFLOPs: 25.96 | +7: iteration 113690/ 173500 | consumed samples: 29104640 | consumed tokens: 59606302720 | elapsed time per iteration (s): 0.15 | learning rate: 6.869E-05 | global batch size: 256 | lm loss: 3.673782E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.119 | TFLOPs: 26.18 | +7: iteration 113700/ 173500 | consumed samples: 29107200 | consumed tokens: 59611545600 | elapsed time per iteration (s): 0.16 | learning rate: 6.868E-05 | global batch size: 256 | lm loss: 3.685698E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.708 | TFLOPs: 25.46 | +7: iteration 113710/ 173500 | consumed samples: 29109760 | consumed tokens: 59616788480 | elapsed time per iteration (s): 0.16 | learning rate: 6.866E-05 | global batch size: 256 | lm loss: 3.679539E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.043 | TFLOPs: 25.34 | +7: iteration 113720/ 173500 | consumed samples: 29112320 | consumed tokens: 59622031360 | elapsed time per iteration (s): 0.16 | learning rate: 6.865E-05 | global batch size: 256 | lm loss: 3.686896E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.693 | TFLOPs: 25.76 | +7: iteration 113730/ 173500 | consumed samples: 29114880 | consumed tokens: 59627274240 | elapsed time per iteration (s): 0.16 | learning rate: 6.863E-05 | global batch size: 256 | lm loss: 3.686831E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.379 | TFLOPs: 25.35 | +7: iteration 113740/ 173500 | consumed samples: 29117440 | consumed tokens: 59632517120 | elapsed time per iteration (s): 0.16 | learning rate: 6.862E-05 | global batch size: 256 | lm loss: 3.671462E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.517 | TFLOPs: 25.68 | +7: iteration 113750/ 173500 | consumed samples: 29120000 | consumed tokens: 59637760000 | elapsed time per iteration (s): 0.16 | learning rate: 6.860E-05 | global batch size: 256 | lm loss: 3.678453E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.518 | TFLOPs: 25.30 | +7: iteration 113760/ 173500 | consumed samples: 29122560 | consumed tokens: 59643002880 | elapsed time per iteration (s): 0.16 | learning rate: 6.859E-05 | global batch size: 256 | lm loss: 3.672358E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.549 | TFLOPs: 25.68 | +7: iteration 113770/ 173500 | consumed samples: 29125120 | consumed tokens: 59648245760 | elapsed time per iteration (s): 0.16 | learning rate: 6.857E-05 | global batch size: 256 | lm loss: 3.692488E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.241 | TFLOPs: 25.55 | +7: iteration 113780/ 173500 | consumed samples: 29127680 | consumed tokens: 59653488640 | elapsed time per iteration (s): 0.15 | learning rate: 6.856E-05 | global batch size: 256 | lm loss: 3.690110E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.516 | TFLOPs: 26.09 | +7: iteration 113790/ 173500 | consumed samples: 29130240 | consumed tokens: 59658731520 | elapsed time per iteration (s): 0.16 | learning rate: 6.854E-05 | global batch size: 256 | lm loss: 3.678142E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.362 | TFLOPs: 25.35 | +7: iteration 113800/ 173500 | consumed samples: 29132800 | consumed tokens: 59663974400 | elapsed time per iteration (s): 0.15 | learning rate: 6.853E-05 | global batch size: 256 | lm loss: 3.681834E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.180 | TFLOPs: 26.11 | +7: iteration 113810/ 173500 | consumed samples: 29135360 | consumed tokens: 59669217280 | elapsed time per iteration (s): 0.16 | learning rate: 6.852E-05 | global batch size: 256 | lm loss: 3.690027E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.968 | TFLOPs: 25.77 | +7: iteration 113820/ 173500 | consumed samples: 29137920 | consumed tokens: 59674460160 | elapsed time per iteration (s): 0.15 | learning rate: 6.850E-05 | global batch size: 256 | lm loss: 3.692403E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.647 | TFLOPs: 26.12 | +7: iteration 113830/ 173500 | consumed samples: 29140480 | consumed tokens: 59679703040 | elapsed time per iteration (s): 0.16 | learning rate: 6.849E-05 | global batch size: 256 | lm loss: 3.675468E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.671 | TFLOPs: 25.53 | +7: iteration 113840/ 173500 | consumed samples: 29143040 | consumed tokens: 59684945920 | elapsed time per iteration (s): 0.15 | learning rate: 6.847E-05 | global batch size: 256 | lm loss: 3.664146E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.691 | TFLOPs: 26.12 | +7: iteration 113850/ 173500 | consumed samples: 29145600 | consumed tokens: 59690188800 | elapsed time per iteration (s): 0.16 | learning rate: 6.846E-05 | global batch size: 256 | lm loss: 3.683368E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.544 | TFLOPs: 25.35 | +7: iteration 113860/ 173500 | consumed samples: 29148160 | consumed tokens: 59695431680 | elapsed time per iteration (s): 0.16 | learning rate: 6.844E-05 | global batch size: 256 | lm loss: 3.681444E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.412 | TFLOPs: 25.77 | +7: iteration 113870/ 173500 | consumed samples: 29150720 | consumed tokens: 59700674560 | elapsed time per iteration (s): 0.16 | learning rate: 6.843E-05 | global batch size: 256 | lm loss: 3.686775E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.326 | TFLOPs: 25.43 | +7: iteration 113880/ 173500 | consumed samples: 29153280 | consumed tokens: 59705917440 | elapsed time per iteration (s): 0.15 | learning rate: 6.841E-05 | global batch size: 256 | lm loss: 3.692421E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.714 | TFLOPs: 26.12 | +7: iteration 113890/ 173500 | consumed samples: 29155840 | consumed tokens: 59711160320 | elapsed time per iteration (s): 0.17 | learning rate: 6.840E-05 | global batch size: 256 | lm loss: 3.673387E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.449 | TFLOPs: 24.05 | +7: iteration 113900/ 173500 | consumed samples: 29158400 | consumed tokens: 59716403200 | elapsed time per iteration (s): 0.17 | learning rate: 6.838E-05 | global batch size: 256 | lm loss: 3.675177E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.495 | TFLOPs: 23.59 | +7: iteration 113910/ 173500 | consumed samples: 29160960 | consumed tokens: 59721646080 | elapsed time per iteration (s): 0.16 | learning rate: 6.837E-05 | global batch size: 256 | lm loss: 3.681269E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.724 | TFLOPs: 24.93 | +7: iteration 113920/ 173500 | consumed samples: 29163520 | consumed tokens: 59726888960 | elapsed time per iteration (s): 0.16 | learning rate: 6.835E-05 | global batch size: 256 | lm loss: 3.674386E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.207 | TFLOPs: 25.69 | +7: iteration 113930/ 173500 | consumed samples: 29166080 | consumed tokens: 59732131840 | elapsed time per iteration (s): 0.16 | learning rate: 6.834E-05 | global batch size: 256 | lm loss: 3.683545E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.011 | TFLOPs: 25.80 | +7: iteration 113940/ 173500 | consumed samples: 29168640 | consumed tokens: 59737374720 | elapsed time per iteration (s): 0.16 | learning rate: 6.833E-05 | global batch size: 256 | lm loss: 3.674775E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.212 | TFLOPs: 25.39 | +7: iteration 113950/ 173500 | consumed samples: 29171200 | consumed tokens: 59742617600 | elapsed time per iteration (s): 0.16 | learning rate: 6.831E-05 | global batch size: 256 | lm loss: 3.685729E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.195 | TFLOPs: 25.80 | +7: iteration 113960/ 173500 | consumed samples: 29173760 | consumed tokens: 59747860480 | elapsed time per iteration (s): 0.16 | learning rate: 6.830E-05 | global batch size: 256 | lm loss: 3.687895E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.605 | TFLOPs: 25.74 | +7: iteration 113970/ 173500 | consumed samples: 29176320 | consumed tokens: 59753103360 | elapsed time per iteration (s): 0.16 | learning rate: 6.828E-05 | global batch size: 256 | lm loss: 3.698817E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.288 | TFLOPs: 25.14 | +7: iteration 113980/ 173500 | consumed samples: 29178880 | consumed tokens: 59758346240 | elapsed time per iteration (s): 0.17 | learning rate: 6.827E-05 | global batch size: 256 | lm loss: 3.695872E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.702 | TFLOPs: 24.19 | +7: iteration 113990/ 173500 | consumed samples: 29181440 | consumed tokens: 59763589120 | elapsed time per iteration (s): 0.16 | learning rate: 6.825E-05 | global batch size: 256 | lm loss: 3.683850E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.491 | TFLOPs: 25.73 | +0: [2023-03-17 05:12:17,244] [INFO] [logging.py:68:log_dist] [Rank 0] step=114000, skipped=0, lr=[6.823796836261315e-05, 6.823796836261315e-05, 6.823796836261315e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 114000/ 173500 | consumed samples: 29184000 | consumed tokens: 59768832000 | elapsed time per iteration (s): 0.16 | learning rate: 6.824E-05 | global batch size: 256 | lm loss: 3.680893E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.137 | TFLOPs: 25.56 | +0: steps: 114000 loss: 3.6796 iter time (s): 0.161 samples/sec: 1587.386 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 114000 | lm loss value: 3.865363E+00 | lm loss PPL: 4.772060E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 114000 to checkpoints_44m91b100m +0: [2023-03-17 05:12:17,319] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step114000 is begin to save! +0: [2023-03-17 05:12:17,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:12:17,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:12:17,386] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:12:17,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:12:17,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:12:17,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:12:17,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:12:17,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:12:17,412] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:12:17,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:12:17,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:12:17,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:12:17,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:12:17,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:12:17,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:12:17,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:12:17,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:12:17,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:12:17,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:12:17,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:12:17,454] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step114000/mp_rank_00_model_states.pt +0: [2023-03-17 05:12:17,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:12:17,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:12:17,472] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:12:17,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,490] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,491] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,492] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,492] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,493] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,493] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +4: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 05:12:17,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +7: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +6: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +5: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:12:17,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +2: [2023-03-17 05:12:17,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +3: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:12:17,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:12:17,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +1: [2023-03-17 05:12:17,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:12:17,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step114000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:12:17,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step114000 is ready now! +0: successfully saved checkpoint at iteration 114000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 185.93 +7: iteration 114010/ 173500 | consumed samples: 29186560 | consumed tokens: 59774074880 | elapsed time per iteration (s): 0.18 | learning rate: 6.822E-05 | global batch size: 256 | lm loss: 3.679481E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.218 | TFLOPs: 22.12 | +7: iteration 114020/ 173500 | consumed samples: 29189120 | consumed tokens: 59779317760 | elapsed time per iteration (s): 0.16 | learning rate: 6.821E-05 | global batch size: 256 | lm loss: 3.676868E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.325 | TFLOPs: 25.27 | +7: iteration 114030/ 173500 | consumed samples: 29191680 | consumed tokens: 59784560640 | elapsed time per iteration (s): 0.15 | learning rate: 6.819E-05 | global batch size: 256 | lm loss: 3.680155E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.762 | TFLOPs: 26.15 | +7: iteration 114040/ 173500 | consumed samples: 29194240 | consumed tokens: 59789803520 | elapsed time per iteration (s): 0.16 | learning rate: 6.818E-05 | global batch size: 256 | lm loss: 3.684406E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.966 | TFLOPs: 24.98 | +7: iteration 114050/ 173500 | consumed samples: 29196800 | consumed tokens: 59795046400 | elapsed time per iteration (s): 0.16 | learning rate: 6.817E-05 | global batch size: 256 | lm loss: 3.686530E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.337 | TFLOPs: 25.79 | +7: iteration 114060/ 173500 | consumed samples: 29199360 | consumed tokens: 59800289280 | elapsed time per iteration (s): 0.16 | learning rate: 6.815E-05 | global batch size: 256 | lm loss: 3.680125E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.309 | TFLOPs: 25.14 | +7: iteration 114070/ 173500 | consumed samples: 29201920 | consumed tokens: 59805532160 | elapsed time per iteration (s): 0.16 | learning rate: 6.814E-05 | global batch size: 256 | lm loss: 3.669230E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.443 | TFLOPs: 25.51 | +7: iteration 114080/ 173500 | consumed samples: 29204480 | consumed tokens: 59810775040 | elapsed time per iteration (s): 0.16 | learning rate: 6.812E-05 | global batch size: 256 | lm loss: 3.689075E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.693 | TFLOPs: 25.75 | +7: iteration 114090/ 173500 | consumed samples: 29207040 | consumed tokens: 59816017920 | elapsed time per iteration (s): 0.16 | learning rate: 6.811E-05 | global batch size: 256 | lm loss: 3.670902E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.761 | TFLOPs: 25.76 | +7: iteration 114100/ 173500 | consumed samples: 29209600 | consumed tokens: 59821260800 | elapsed time per iteration (s): 0.17 | learning rate: 6.809E-05 | global batch size: 256 | lm loss: 3.678518E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.137 | TFLOPs: 24.18 | +7: iteration 114110/ 173500 | consumed samples: 29212160 | consumed tokens: 59826503680 | elapsed time per iteration (s): 0.16 | learning rate: 6.808E-05 | global batch size: 256 | lm loss: 3.683345E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.197 | TFLOPs: 25.75 | +7: iteration 114120/ 173500 | consumed samples: 29214720 | consumed tokens: 59831746560 | elapsed time per iteration (s): 0.16 | learning rate: 6.806E-05 | global batch size: 256 | lm loss: 3.686750E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.872 | TFLOPs: 25.83 | +7: iteration 114130/ 173500 | consumed samples: 29217280 | consumed tokens: 59836989440 | elapsed time per iteration (s): 0.16 | learning rate: 6.805E-05 | global batch size: 256 | lm loss: 3.687888E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.330 | TFLOPs: 25.02 | +7: iteration 114140/ 173500 | consumed samples: 29219840 | consumed tokens: 59842232320 | elapsed time per iteration (s): 0.16 | learning rate: 6.803E-05 | global batch size: 256 | lm loss: 3.677836E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.267 | TFLOPs: 25.14 | +7: iteration 114150/ 173500 | consumed samples: 29222400 | consumed tokens: 59847475200 | elapsed time per iteration (s): 0.16 | learning rate: 6.802E-05 | global batch size: 256 | lm loss: 3.679882E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.843 | TFLOPs: 25.01 | +7: iteration 114160/ 173500 | consumed samples: 29224960 | consumed tokens: 59852718080 | elapsed time per iteration (s): 0.16 | learning rate: 6.800E-05 | global batch size: 256 | lm loss: 3.679079E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.380 | TFLOPs: 25.07 | +7: iteration 114170/ 173500 | consumed samples: 29227520 | consumed tokens: 59857960960 | elapsed time per iteration (s): 0.16 | learning rate: 6.799E-05 | global batch size: 256 | lm loss: 3.682919E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.775 | TFLOPs: 25.59 | +7: iteration 114180/ 173500 | consumed samples: 29230080 | consumed tokens: 59863203840 | elapsed time per iteration (s): 0.15 | learning rate: 6.798E-05 | global batch size: 256 | lm loss: 3.679136E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.266 | TFLOPs: 26.10 | +7: iteration 114190/ 173500 | consumed samples: 29232640 | consumed tokens: 59868446720 | elapsed time per iteration (s): 0.16 | learning rate: 6.796E-05 | global batch size: 256 | lm loss: 3.682083E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.428 | TFLOPs: 25.76 | +7: iteration 114200/ 173500 | consumed samples: 29235200 | consumed tokens: 59873689600 | elapsed time per iteration (s): 0.16 | learning rate: 6.795E-05 | global batch size: 256 | lm loss: 3.680340E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.727 | TFLOPs: 24.49 | +7: iteration 114210/ 173500 | consumed samples: 29237760 | consumed tokens: 59878932480 | elapsed time per iteration (s): 0.16 | learning rate: 6.793E-05 | global batch size: 256 | lm loss: 3.663615E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.041 | TFLOPs: 24.97 | +7: iteration 114220/ 173500 | consumed samples: 29240320 | consumed tokens: 59884175360 | elapsed time per iteration (s): 0.16 | learning rate: 6.792E-05 | global batch size: 256 | lm loss: 3.677361E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.046 | TFLOPs: 25.27 | +7: iteration 114230/ 173500 | consumed samples: 29242880 | consumed tokens: 59889418240 | elapsed time per iteration (s): 0.16 | learning rate: 6.790E-05 | global batch size: 256 | lm loss: 3.684164E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.988 | TFLOPs: 25.01 | +7: iteration 114240/ 173500 | consumed samples: 29245440 | consumed tokens: 59894661120 | elapsed time per iteration (s): 0.16 | learning rate: 6.789E-05 | global batch size: 256 | lm loss: 3.683542E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.945 | TFLOPs: 24.86 | +7: iteration 114250/ 173500 | consumed samples: 29248000 | consumed tokens: 59899904000 | elapsed time per iteration (s): 0.16 | learning rate: 6.787E-05 | global batch size: 256 | lm loss: 3.678912E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.626 | TFLOPs: 25.57 | +7: iteration 114260/ 173500 | consumed samples: 29250560 | consumed tokens: 59905146880 | elapsed time per iteration (s): 0.16 | learning rate: 6.786E-05 | global batch size: 256 | lm loss: 3.677229E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.802 | TFLOPs: 25.14 | +7: iteration 114270/ 173500 | consumed samples: 29253120 | consumed tokens: 59910389760 | elapsed time per iteration (s): 0.16 | learning rate: 6.784E-05 | global batch size: 256 | lm loss: 3.678463E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.642 | TFLOPs: 25.42 | +7: iteration 114280/ 173500 | consumed samples: 29255680 | consumed tokens: 59915632640 | elapsed time per iteration (s): 0.16 | learning rate: 6.783E-05 | global batch size: 256 | lm loss: 3.678884E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.510 | TFLOPs: 25.43 | +7: iteration 114290/ 173500 | consumed samples: 29258240 | consumed tokens: 59920875520 | elapsed time per iteration (s): 0.16 | learning rate: 6.782E-05 | global batch size: 256 | lm loss: 3.679934E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.845 | TFLOPs: 25.04 | +7: iteration 114300/ 173500 | consumed samples: 29260800 | consumed tokens: 59926118400 | elapsed time per iteration (s): 0.16 | learning rate: 6.780E-05 | global batch size: 256 | lm loss: 3.678727E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.760 | TFLOPs: 24.96 | +7: iteration 114310/ 173500 | consumed samples: 29263360 | consumed tokens: 59931361280 | elapsed time per iteration (s): 0.15 | learning rate: 6.779E-05 | global batch size: 256 | lm loss: 3.674818E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.146 | TFLOPs: 26.24 | +7: iteration 114320/ 173500 | consumed samples: 29265920 | consumed tokens: 59936604160 | elapsed time per iteration (s): 0.16 | learning rate: 6.777E-05 | global batch size: 256 | lm loss: 3.676635E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.667 | TFLOPs: 25.84 | +7: iteration 114330/ 173500 | consumed samples: 29268480 | consumed tokens: 59941847040 | elapsed time per iteration (s): 0.15 | learning rate: 6.776E-05 | global batch size: 256 | lm loss: 3.679546E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.516 | TFLOPs: 25.96 | +7: iteration 114340/ 173500 | consumed samples: 29271040 | consumed tokens: 59947089920 | elapsed time per iteration (s): 0.16 | learning rate: 6.774E-05 | global batch size: 256 | lm loss: 3.687567E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.266 | TFLOPs: 25.54 | +7: iteration 114350/ 173500 | consumed samples: 29273600 | consumed tokens: 59952332800 | elapsed time per iteration (s): 0.16 | learning rate: 6.773E-05 | global batch size: 256 | lm loss: 3.677203E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.413 | TFLOPs: 25.58 | +7: iteration 114360/ 173500 | consumed samples: 29276160 | consumed tokens: 59957575680 | elapsed time per iteration (s): 0.16 | learning rate: 6.771E-05 | global batch size: 256 | lm loss: 3.675650E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.906 | TFLOPs: 25.31 | +7: iteration 114370/ 173500 | consumed samples: 29278720 | consumed tokens: 59962818560 | elapsed time per iteration (s): 0.16 | learning rate: 6.770E-05 | global batch size: 256 | lm loss: 3.669761E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.979 | TFLOPs: 25.12 | +7: iteration 114380/ 173500 | consumed samples: 29281280 | consumed tokens: 59968061440 | elapsed time per iteration (s): 0.16 | learning rate: 6.768E-05 | global batch size: 256 | lm loss: 3.683166E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.018 | TFLOPs: 25.06 | +7: iteration 114390/ 173500 | consumed samples: 29283840 | consumed tokens: 59973304320 | elapsed time per iteration (s): 0.16 | learning rate: 6.767E-05 | global batch size: 256 | lm loss: 3.689629E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.497 | TFLOPs: 24.36 | +7: iteration 114400/ 173500 | consumed samples: 29286400 | consumed tokens: 59978547200 | elapsed time per iteration (s): 0.15 | learning rate: 6.766E-05 | global batch size: 256 | lm loss: 3.684732E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.244 | TFLOPs: 26.01 | +7: iteration 114410/ 173500 | consumed samples: 29288960 | consumed tokens: 59983790080 | elapsed time per iteration (s): 0.16 | learning rate: 6.764E-05 | global batch size: 256 | lm loss: 3.690680E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.312 | TFLOPs: 25.83 | +7: iteration 114420/ 173500 | consumed samples: 29291520 | consumed tokens: 59989032960 | elapsed time per iteration (s): 0.16 | learning rate: 6.763E-05 | global batch size: 256 | lm loss: 3.676516E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.443 | TFLOPs: 24.75 | +7: iteration 114430/ 173500 | consumed samples: 29294080 | consumed tokens: 59994275840 | elapsed time per iteration (s): 0.16 | learning rate: 6.761E-05 | global batch size: 256 | lm loss: 3.693211E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.573 | TFLOPs: 25.56 | +7: iteration 114440/ 173500 | consumed samples: 29296640 | consumed tokens: 59999518720 | elapsed time per iteration (s): 0.16 | learning rate: 6.760E-05 | global batch size: 256 | lm loss: 3.680814E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.160 | TFLOPs: 25.13 | +7: iteration 114450/ 173500 | consumed samples: 29299200 | consumed tokens: 60004761600 | elapsed time per iteration (s): 0.15 | learning rate: 6.758E-05 | global batch size: 256 | lm loss: 3.673270E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.079 | TFLOPs: 26.08 | +7: iteration 114460/ 173500 | consumed samples: 29301760 | consumed tokens: 60010004480 | elapsed time per iteration (s): 0.16 | learning rate: 6.757E-05 | global batch size: 256 | lm loss: 3.671454E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.718 | TFLOPs: 25.62 | +7: iteration 114470/ 173500 | consumed samples: 29304320 | consumed tokens: 60015247360 | elapsed time per iteration (s): 0.16 | learning rate: 6.755E-05 | global batch size: 256 | lm loss: 3.679156E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.298 | TFLOPs: 25.33 | +7: iteration 114480/ 173500 | consumed samples: 29306880 | consumed tokens: 60020490240 | elapsed time per iteration (s): 0.16 | learning rate: 6.754E-05 | global batch size: 256 | lm loss: 3.696646E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.377 | TFLOPs: 25.32 | +7: iteration 114490/ 173500 | consumed samples: 29309440 | consumed tokens: 60025733120 | elapsed time per iteration (s): 0.15 | learning rate: 6.753E-05 | global batch size: 256 | lm loss: 3.671689E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.077 | TFLOPs: 26.16 | +7: iteration 114500/ 173500 | consumed samples: 29312000 | consumed tokens: 60030976000 | elapsed time per iteration (s): 0.15 | learning rate: 6.751E-05 | global batch size: 256 | lm loss: 3.671428E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.970 | TFLOPs: 26.16 | +7: iteration 114510/ 173500 | consumed samples: 29314560 | consumed tokens: 60036218880 | elapsed time per iteration (s): 0.16 | learning rate: 6.750E-05 | global batch size: 256 | lm loss: 3.688698E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.444 | TFLOPs: 25.54 | +7: iteration 114520/ 173500 | consumed samples: 29317120 | consumed tokens: 60041461760 | elapsed time per iteration (s): 0.16 | learning rate: 6.748E-05 | global batch size: 256 | lm loss: 3.686432E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.940 | TFLOPs: 25.34 | +7: iteration 114530/ 173500 | consumed samples: 29319680 | consumed tokens: 60046704640 | elapsed time per iteration (s): 0.16 | learning rate: 6.747E-05 | global batch size: 256 | lm loss: 3.674583E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.948 | TFLOPs: 25.62 | +7: iteration 114540/ 173500 | consumed samples: 29322240 | consumed tokens: 60051947520 | elapsed time per iteration (s): 0.16 | learning rate: 6.745E-05 | global batch size: 256 | lm loss: 3.685044E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.968 | TFLOPs: 25.12 | +7: iteration 114550/ 173500 | consumed samples: 29324800 | consumed tokens: 60057190400 | elapsed time per iteration (s): 0.16 | learning rate: 6.744E-05 | global batch size: 256 | lm loss: 3.668399E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.241 | TFLOPs: 25.82 | +7: iteration 114560/ 173500 | consumed samples: 29327360 | consumed tokens: 60062433280 | elapsed time per iteration (s): 0.16 | learning rate: 6.742E-05 | global batch size: 256 | lm loss: 3.675661E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.204 | TFLOPs: 25.20 | +7: iteration 114570/ 173500 | consumed samples: 29329920 | consumed tokens: 60067676160 | elapsed time per iteration (s): 0.15 | learning rate: 6.741E-05 | global batch size: 256 | lm loss: 3.685295E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.345 | TFLOPs: 26.20 | +7: iteration 114580/ 173500 | consumed samples: 29332480 | consumed tokens: 60072919040 | elapsed time per iteration (s): 0.16 | learning rate: 6.739E-05 | global batch size: 256 | lm loss: 3.685434E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.824 | TFLOPs: 25.81 | +7: iteration 114590/ 173500 | consumed samples: 29335040 | consumed tokens: 60078161920 | elapsed time per iteration (s): 0.16 | learning rate: 6.738E-05 | global batch size: 256 | lm loss: 3.681997E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.991 | TFLOPs: 25.28 | +7: iteration 114600/ 173500 | consumed samples: 29337600 | consumed tokens: 60083404800 | elapsed time per iteration (s): 0.16 | learning rate: 6.737E-05 | global batch size: 256 | lm loss: 3.675305E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.712 | TFLOPs: 25.37 | +7: iteration 114610/ 173500 | consumed samples: 29340160 | consumed tokens: 60088647680 | elapsed time per iteration (s): 0.16 | learning rate: 6.735E-05 | global batch size: 256 | lm loss: 3.693628E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.137 | TFLOPs: 25.39 | +7: iteration 114620/ 173500 | consumed samples: 29342720 | consumed tokens: 60093890560 | elapsed time per iteration (s): 0.16 | learning rate: 6.734E-05 | global batch size: 256 | lm loss: 3.667210E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.093 | TFLOPs: 25.52 | +7: iteration 114630/ 173500 | consumed samples: 29345280 | consumed tokens: 60099133440 | elapsed time per iteration (s): 0.16 | learning rate: 6.732E-05 | global batch size: 256 | lm loss: 3.677247E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.613 | TFLOPs: 25.37 | +7: iteration 114640/ 173500 | consumed samples: 29347840 | consumed tokens: 60104376320 | elapsed time per iteration (s): 0.17 | learning rate: 6.731E-05 | global batch size: 256 | lm loss: 3.684398E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.432 | TFLOPs: 24.08 | +7: iteration 114650/ 173500 | consumed samples: 29350400 | consumed tokens: 60109619200 | elapsed time per iteration (s): 0.16 | learning rate: 6.729E-05 | global batch size: 256 | lm loss: 3.673490E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.901 | TFLOPs: 25.83 | +7: iteration 114660/ 173500 | consumed samples: 29352960 | consumed tokens: 60114862080 | elapsed time per iteration (s): 0.16 | learning rate: 6.728E-05 | global batch size: 256 | lm loss: 3.680952E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.982 | TFLOPs: 25.77 | +7: iteration 114670/ 173500 | consumed samples: 29355520 | consumed tokens: 60120104960 | elapsed time per iteration (s): 0.16 | learning rate: 6.726E-05 | global batch size: 256 | lm loss: 3.674997E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.476 | TFLOPs: 24.72 | +7: iteration 114680/ 173500 | consumed samples: 29358080 | consumed tokens: 60125347840 | elapsed time per iteration (s): 0.16 | learning rate: 6.725E-05 | global batch size: 256 | lm loss: 3.662444E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.867 | TFLOPs: 25.53 | +7: iteration 114690/ 173500 | consumed samples: 29360640 | consumed tokens: 60130590720 | elapsed time per iteration (s): 0.16 | learning rate: 6.724E-05 | global batch size: 256 | lm loss: 3.682612E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.799 | TFLOPs: 24.87 | +7: iteration 114700/ 173500 | consumed samples: 29363200 | consumed tokens: 60135833600 | elapsed time per iteration (s): 0.16 | learning rate: 6.722E-05 | global batch size: 256 | lm loss: 3.694489E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.440 | TFLOPs: 25.07 | +7: iteration 114710/ 173500 | consumed samples: 29365760 | consumed tokens: 60141076480 | elapsed time per iteration (s): 0.16 | learning rate: 6.721E-05 | global batch size: 256 | lm loss: 3.677947E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.245 | TFLOPs: 25.41 | +7: iteration 114720/ 173500 | consumed samples: 29368320 | consumed tokens: 60146319360 | elapsed time per iteration (s): 0.16 | learning rate: 6.719E-05 | global batch size: 256 | lm loss: 3.695150E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.466 | TFLOPs: 25.84 | +7: iteration 114730/ 173500 | consumed samples: 29370880 | consumed tokens: 60151562240 | elapsed time per iteration (s): 0.16 | learning rate: 6.718E-05 | global batch size: 256 | lm loss: 3.684867E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.603 | TFLOPs: 24.49 | +7: iteration 114740/ 173500 | consumed samples: 29373440 | consumed tokens: 60156805120 | elapsed time per iteration (s): 0.16 | learning rate: 6.716E-05 | global batch size: 256 | lm loss: 3.687657E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.652 | TFLOPs: 24.96 | +7: iteration 114750/ 173500 | consumed samples: 29376000 | consumed tokens: 60162048000 | elapsed time per iteration (s): 0.16 | learning rate: 6.715E-05 | global batch size: 256 | lm loss: 3.675413E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.955 | TFLOPs: 25.36 | +7: iteration 114760/ 173500 | consumed samples: 29378560 | consumed tokens: 60167290880 | elapsed time per iteration (s): 0.16 | learning rate: 6.713E-05 | global batch size: 256 | lm loss: 3.677188E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.782 | TFLOPs: 24.79 | +7: iteration 114770/ 173500 | consumed samples: 29381120 | consumed tokens: 60172533760 | elapsed time per iteration (s): 0.16 | learning rate: 6.712E-05 | global batch size: 256 | lm loss: 3.691297E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.800 | TFLOPs: 24.56 | +7: iteration 114780/ 173500 | consumed samples: 29383680 | consumed tokens: 60177776640 | elapsed time per iteration (s): 0.16 | learning rate: 6.710E-05 | global batch size: 256 | lm loss: 3.674953E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.997 | TFLOPs: 25.41 | +7: iteration 114790/ 173500 | consumed samples: 29386240 | consumed tokens: 60183019520 | elapsed time per iteration (s): 0.16 | learning rate: 6.709E-05 | global batch size: 256 | lm loss: 3.689559E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.866 | TFLOPs: 24.96 | +7: iteration 114800/ 173500 | consumed samples: 29388800 | consumed tokens: 60188262400 | elapsed time per iteration (s): 0.16 | learning rate: 6.708E-05 | global batch size: 256 | lm loss: 3.682907E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.358 | TFLOPs: 25.29 | +7: iteration 114810/ 173500 | consumed samples: 29391360 | consumed tokens: 60193505280 | elapsed time per iteration (s): 0.15 | learning rate: 6.706E-05 | global batch size: 256 | lm loss: 3.679655E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.407 | TFLOPs: 25.99 | +7: iteration 114820/ 173500 | consumed samples: 29393920 | consumed tokens: 60198748160 | elapsed time per iteration (s): 0.16 | learning rate: 6.705E-05 | global batch size: 256 | lm loss: 3.675937E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.015 | TFLOPs: 25.67 | +7: iteration 114830/ 173500 | consumed samples: 29396480 | consumed tokens: 60203991040 | elapsed time per iteration (s): 0.16 | learning rate: 6.703E-05 | global batch size: 256 | lm loss: 3.694622E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.097 | TFLOPs: 24.91 | +7: iteration 114840/ 173500 | consumed samples: 29399040 | consumed tokens: 60209233920 | elapsed time per iteration (s): 0.16 | learning rate: 6.702E-05 | global batch size: 256 | lm loss: 3.693524E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.108 | TFLOPs: 24.98 | +7: iteration 114850/ 173500 | consumed samples: 29401600 | consumed tokens: 60214476800 | elapsed time per iteration (s): 0.16 | learning rate: 6.700E-05 | global batch size: 256 | lm loss: 3.689201E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.354 | TFLOPs: 25.62 | +7: iteration 114860/ 173500 | consumed samples: 29404160 | consumed tokens: 60219719680 | elapsed time per iteration (s): 0.16 | learning rate: 6.699E-05 | global batch size: 256 | lm loss: 3.674701E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.477 | TFLOPs: 25.62 | +7: iteration 114870/ 173500 | consumed samples: 29406720 | consumed tokens: 60224962560 | elapsed time per iteration (s): 0.16 | learning rate: 6.697E-05 | global batch size: 256 | lm loss: 3.685586E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.447 | TFLOPs: 25.21 | +7: iteration 114880/ 173500 | consumed samples: 29409280 | consumed tokens: 60230205440 | elapsed time per iteration (s): 0.16 | learning rate: 6.696E-05 | global batch size: 256 | lm loss: 3.694962E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.782 | TFLOPs: 24.59 | +7: iteration 114890/ 173500 | consumed samples: 29411840 | consumed tokens: 60235448320 | elapsed time per iteration (s): 0.15 | learning rate: 6.695E-05 | global batch size: 256 | lm loss: 3.683899E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.246 | TFLOPs: 25.93 | +7: iteration 114900/ 173500 | consumed samples: 29414400 | consumed tokens: 60240691200 | elapsed time per iteration (s): 0.15 | learning rate: 6.693E-05 | global batch size: 256 | lm loss: 3.689005E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.380 | TFLOPs: 25.96 | +7: iteration 114910/ 173500 | consumed samples: 29416960 | consumed tokens: 60245934080 | elapsed time per iteration (s): 0.16 | learning rate: 6.692E-05 | global batch size: 256 | lm loss: 3.676818E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.789 | TFLOPs: 25.83 | +7: iteration 114920/ 173500 | consumed samples: 29419520 | consumed tokens: 60251176960 | elapsed time per iteration (s): 0.16 | learning rate: 6.690E-05 | global batch size: 256 | lm loss: 3.680483E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.085 | TFLOPs: 25.86 | +7: iteration 114930/ 173500 | consumed samples: 29422080 | consumed tokens: 60256419840 | elapsed time per iteration (s): 0.16 | learning rate: 6.689E-05 | global batch size: 256 | lm loss: 3.690391E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.812 | TFLOPs: 24.95 | +7: iteration 114940/ 173500 | consumed samples: 29424640 | consumed tokens: 60261662720 | elapsed time per iteration (s): 0.16 | learning rate: 6.687E-05 | global batch size: 256 | lm loss: 3.681602E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.510 | TFLOPs: 25.82 | +7: iteration 114950/ 173500 | consumed samples: 29427200 | consumed tokens: 60266905600 | elapsed time per iteration (s): 0.16 | learning rate: 6.686E-05 | global batch size: 256 | lm loss: 3.686497E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.356 | TFLOPs: 25.40 | +7: iteration 114960/ 173500 | consumed samples: 29429760 | consumed tokens: 60272148480 | elapsed time per iteration (s): 0.16 | learning rate: 6.684E-05 | global batch size: 256 | lm loss: 3.696889E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.413 | TFLOPs: 25.84 | +7: iteration 114970/ 173500 | consumed samples: 29432320 | consumed tokens: 60277391360 | elapsed time per iteration (s): 0.16 | learning rate: 6.683E-05 | global batch size: 256 | lm loss: 3.687340E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.944 | TFLOPs: 25.72 | +7: iteration 114980/ 173500 | consumed samples: 29434880 | consumed tokens: 60282634240 | elapsed time per iteration (s): 0.16 | learning rate: 6.682E-05 | global batch size: 256 | lm loss: 3.670160E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.827 | TFLOPs: 25.70 | +7: iteration 114990/ 173500 | consumed samples: 29437440 | consumed tokens: 60287877120 | elapsed time per iteration (s): 0.16 | learning rate: 6.680E-05 | global batch size: 256 | lm loss: 3.676189E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.634 | TFLOPs: 25.81 | +7: iteration 115000/ 173500 | consumed samples: 29440000 | consumed tokens: 60293120000 | elapsed time per iteration (s): 0.16 | learning rate: 6.679E-05 | global batch size: 256 | lm loss: 3.681696E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.841 | TFLOPs: 24.52 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 115000 | lm loss value: 3.838017E+00 | lm loss PPL: 4.643328E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 115000 to checkpoints_44m91b100m +0: [2023-03-17 05:14:55,728] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step115000 is begin to save! +0: [2023-03-17 05:14:55,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:14:55,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:14:55,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:14:55,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:14:55,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:14:55,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:14:55,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:14:55,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:14:55,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:14:55,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:14:55,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:14:55,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:14:55,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:14:55,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:14:55,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:14:55,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:14:55,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:14:55,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:14:55,859] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:14:55,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:14:55,860] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step115000/mp_rank_00_model_states.pt +0: [2023-03-17 05:14:55,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:14:55,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:14:55,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:14:55,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:14:55,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +6: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +5: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +4: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +2: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +1: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +3: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:14:55,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +7: [2023-03-17 05:14:55,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:14:55,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step115000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:14:55,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step115000 is ready now! +0: successfully saved checkpoint at iteration 115000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.53 +7: iteration 115010/ 173500 | consumed samples: 29442560 | consumed tokens: 60298362880 | elapsed time per iteration (s): 0.18 | learning rate: 6.677E-05 | global batch size: 256 | lm loss: 3.673785E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.903 | TFLOPs: 22.79 | +7: iteration 115020/ 173500 | consumed samples: 29445120 | consumed tokens: 60303605760 | elapsed time per iteration (s): 0.16 | learning rate: 6.676E-05 | global batch size: 256 | lm loss: 3.688394E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.186 | TFLOPs: 25.78 | +7: iteration 115030/ 173500 | consumed samples: 29447680 | consumed tokens: 60308848640 | elapsed time per iteration (s): 0.16 | learning rate: 6.674E-05 | global batch size: 256 | lm loss: 3.675830E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.512 | TFLOPs: 25.48 | +7: iteration 115040/ 173500 | consumed samples: 29450240 | consumed tokens: 60314091520 | elapsed time per iteration (s): 0.15 | learning rate: 6.673E-05 | global batch size: 256 | lm loss: 3.676785E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.247 | TFLOPs: 26.26 | +7: iteration 115050/ 173500 | consumed samples: 29452800 | consumed tokens: 60319334400 | elapsed time per iteration (s): 0.16 | learning rate: 6.671E-05 | global batch size: 256 | lm loss: 3.689479E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.996 | TFLOPs: 25.58 | +7: iteration 115060/ 173500 | consumed samples: 29455360 | consumed tokens: 60324577280 | elapsed time per iteration (s): 0.16 | learning rate: 6.670E-05 | global batch size: 256 | lm loss: 3.680207E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.842 | TFLOPs: 24.52 | +7: iteration 115070/ 173500 | consumed samples: 29457920 | consumed tokens: 60329820160 | elapsed time per iteration (s): 0.16 | learning rate: 6.669E-05 | global batch size: 256 | lm loss: 3.674228E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.866 | TFLOPs: 24.85 | +7: iteration 115080/ 173500 | consumed samples: 29460480 | consumed tokens: 60335063040 | elapsed time per iteration (s): 0.16 | learning rate: 6.667E-05 | global batch size: 256 | lm loss: 3.674112E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.421 | TFLOPs: 25.65 | +7: iteration 115090/ 173500 | consumed samples: 29463040 | consumed tokens: 60340305920 | elapsed time per iteration (s): 0.16 | learning rate: 6.666E-05 | global batch size: 256 | lm loss: 3.677946E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.551 | TFLOPs: 25.32 | +7: iteration 115100/ 173500 | consumed samples: 29465600 | consumed tokens: 60345548800 | elapsed time per iteration (s): 0.16 | learning rate: 6.664E-05 | global batch size: 256 | lm loss: 3.679626E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.617 | TFLOPs: 24.94 | +7: iteration 115110/ 173500 | consumed samples: 29468160 | consumed tokens: 60350791680 | elapsed time per iteration (s): 0.16 | learning rate: 6.663E-05 | global batch size: 256 | lm loss: 3.681721E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.314 | TFLOPs: 24.75 | +7: iteration 115120/ 173500 | consumed samples: 29470720 | consumed tokens: 60356034560 | elapsed time per iteration (s): 0.15 | learning rate: 6.661E-05 | global batch size: 256 | lm loss: 3.697927E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.540 | TFLOPs: 25.96 | +7: iteration 115130/ 173500 | consumed samples: 29473280 | consumed tokens: 60361277440 | elapsed time per iteration (s): 0.16 | learning rate: 6.660E-05 | global batch size: 256 | lm loss: 3.676194E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.755 | TFLOPs: 25.17 | +7: iteration 115140/ 173500 | consumed samples: 29475840 | consumed tokens: 60366520320 | elapsed time per iteration (s): 0.15 | learning rate: 6.658E-05 | global batch size: 256 | lm loss: 3.667011E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.228 | TFLOPs: 25.97 | +7: iteration 115150/ 173500 | consumed samples: 29478400 | consumed tokens: 60371763200 | elapsed time per iteration (s): 0.16 | learning rate: 6.657E-05 | global batch size: 256 | lm loss: 3.686896E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.240 | TFLOPs: 25.90 | +7: iteration 115160/ 173500 | consumed samples: 29480960 | consumed tokens: 60377006080 | elapsed time per iteration (s): 0.16 | learning rate: 6.656E-05 | global batch size: 256 | lm loss: 3.672321E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.303 | TFLOPs: 24.97 | +7: iteration 115170/ 173500 | consumed samples: 29483520 | consumed tokens: 60382248960 | elapsed time per iteration (s): 0.16 | learning rate: 6.654E-05 | global batch size: 256 | lm loss: 3.675967E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.864 | TFLOPs: 25.73 | +7: iteration 115180/ 173500 | consumed samples: 29486080 | consumed tokens: 60387491840 | elapsed time per iteration (s): 0.16 | learning rate: 6.653E-05 | global batch size: 256 | lm loss: 3.695464E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.335 | TFLOPs: 25.60 | +7: iteration 115190/ 173500 | consumed samples: 29488640 | consumed tokens: 60392734720 | elapsed time per iteration (s): 0.15 | learning rate: 6.651E-05 | global batch size: 256 | lm loss: 3.687859E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.722 | TFLOPs: 25.93 | +7: iteration 115200/ 173500 | consumed samples: 29491200 | consumed tokens: 60397977600 | elapsed time per iteration (s): 0.16 | learning rate: 6.650E-05 | global batch size: 256 | lm loss: 3.692309E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.955 | TFLOPs: 25.83 | +7: iteration 115210/ 173500 | consumed samples: 29493760 | consumed tokens: 60403220480 | elapsed time per iteration (s): 0.16 | learning rate: 6.648E-05 | global batch size: 256 | lm loss: 3.685524E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.033 | TFLOPs: 25.72 | +7: iteration 115220/ 173500 | consumed samples: 29496320 | consumed tokens: 60408463360 | elapsed time per iteration (s): 0.16 | learning rate: 6.647E-05 | global batch size: 256 | lm loss: 3.668920E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.158 | TFLOPs: 25.08 | +7: iteration 115230/ 173500 | consumed samples: 29498880 | consumed tokens: 60413706240 | elapsed time per iteration (s): 0.16 | learning rate: 6.646E-05 | global batch size: 256 | lm loss: 3.671145E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.089 | TFLOPs: 25.71 | +7: iteration 115240/ 173500 | consumed samples: 29501440 | consumed tokens: 60418949120 | elapsed time per iteration (s): 0.16 | learning rate: 6.644E-05 | global batch size: 256 | lm loss: 3.678724E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.492 | TFLOPs: 25.49 | +7: iteration 115250/ 173500 | consumed samples: 29504000 | consumed tokens: 60424192000 | elapsed time per iteration (s): 0.16 | learning rate: 6.643E-05 | global batch size: 256 | lm loss: 3.689810E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.189 | TFLOPs: 25.55 | +7: iteration 115260/ 173500 | consumed samples: 29506560 | consumed tokens: 60429434880 | elapsed time per iteration (s): 0.15 | learning rate: 6.641E-05 | global batch size: 256 | lm loss: 3.691385E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.000 | TFLOPs: 26.02 | +7: iteration 115270/ 173500 | consumed samples: 29509120 | consumed tokens: 60434677760 | elapsed time per iteration (s): 0.16 | learning rate: 6.640E-05 | global batch size: 256 | lm loss: 3.692859E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.986 | TFLOPs: 25.39 | +7: iteration 115280/ 173500 | consumed samples: 29511680 | consumed tokens: 60439920640 | elapsed time per iteration (s): 0.16 | learning rate: 6.638E-05 | global batch size: 256 | lm loss: 3.690311E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.661 | TFLOPs: 25.79 | +7: iteration 115290/ 173500 | consumed samples: 29514240 | consumed tokens: 60445163520 | elapsed time per iteration (s): 0.16 | learning rate: 6.637E-05 | global batch size: 256 | lm loss: 3.675513E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.385 | TFLOPs: 24.52 | +7: iteration 115300/ 173500 | consumed samples: 29516800 | consumed tokens: 60450406400 | elapsed time per iteration (s): 0.15 | learning rate: 6.635E-05 | global batch size: 256 | lm loss: 3.683341E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.505 | TFLOPs: 26.13 | +7: iteration 115310/ 173500 | consumed samples: 29519360 | consumed tokens: 60455649280 | elapsed time per iteration (s): 0.16 | learning rate: 6.634E-05 | global batch size: 256 | lm loss: 3.702714E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.326 | TFLOPs: 25.47 | +7: iteration 115320/ 173500 | consumed samples: 29521920 | consumed tokens: 60460892160 | elapsed time per iteration (s): 0.16 | learning rate: 6.633E-05 | global batch size: 256 | lm loss: 3.675092E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.581 | TFLOPs: 25.40 | +7: iteration 115330/ 173500 | consumed samples: 29524480 | consumed tokens: 60466135040 | elapsed time per iteration (s): 0.16 | learning rate: 6.631E-05 | global batch size: 256 | lm loss: 3.687767E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.632 | TFLOPs: 25.78 | +7: iteration 115340/ 173500 | consumed samples: 29527040 | consumed tokens: 60471377920 | elapsed time per iteration (s): 0.16 | learning rate: 6.630E-05 | global batch size: 256 | lm loss: 3.675409E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.172 | TFLOPs: 25.60 | +7: iteration 115350/ 173500 | consumed samples: 29529600 | consumed tokens: 60476620800 | elapsed time per iteration (s): 0.15 | learning rate: 6.628E-05 | global batch size: 256 | lm loss: 3.683442E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.681 | TFLOPs: 26.29 | +7: iteration 115360/ 173500 | consumed samples: 29532160 | consumed tokens: 60481863680 | elapsed time per iteration (s): 0.15 | learning rate: 6.627E-05 | global batch size: 256 | lm loss: 3.697608E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.833 | TFLOPs: 26.28 | +7: iteration 115370/ 173500 | consumed samples: 29534720 | consumed tokens: 60487106560 | elapsed time per iteration (s): 0.16 | learning rate: 6.625E-05 | global batch size: 256 | lm loss: 3.684783E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.602 | TFLOPs: 25.65 | +7: iteration 115380/ 173500 | consumed samples: 29537280 | consumed tokens: 60492349440 | elapsed time per iteration (s): 0.16 | learning rate: 6.624E-05 | global batch size: 256 | lm loss: 3.685201E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.148 | TFLOPs: 25.41 | +7: iteration 115390/ 173500 | consumed samples: 29539840 | consumed tokens: 60497592320 | elapsed time per iteration (s): 0.16 | learning rate: 6.622E-05 | global batch size: 256 | lm loss: 3.667524E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.880 | TFLOPs: 25.37 | +7: iteration 115400/ 173500 | consumed samples: 29542400 | consumed tokens: 60502835200 | elapsed time per iteration (s): 0.15 | learning rate: 6.621E-05 | global batch size: 256 | lm loss: 3.662640E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.504 | TFLOPs: 26.06 | +7: iteration 115410/ 173500 | consumed samples: 29544960 | consumed tokens: 60508078080 | elapsed time per iteration (s): 0.16 | learning rate: 6.620E-05 | global batch size: 256 | lm loss: 3.683366E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.546 | TFLOPs: 25.54 | +7: iteration 115420/ 173500 | consumed samples: 29547520 | consumed tokens: 60513320960 | elapsed time per iteration (s): 0.16 | learning rate: 6.618E-05 | global batch size: 256 | lm loss: 3.687189E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.037 | TFLOPs: 25.56 | +7: iteration 115430/ 173500 | consumed samples: 29550080 | consumed tokens: 60518563840 | elapsed time per iteration (s): 0.16 | learning rate: 6.617E-05 | global batch size: 256 | lm loss: 3.678510E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.658 | TFLOPs: 25.70 | +7: iteration 115440/ 173500 | consumed samples: 29552640 | consumed tokens: 60523806720 | elapsed time per iteration (s): 0.16 | learning rate: 6.615E-05 | global batch size: 256 | lm loss: 3.673914E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.933 | TFLOPs: 25.77 | +7: iteration 115450/ 173500 | consumed samples: 29555200 | consumed tokens: 60529049600 | elapsed time per iteration (s): 0.15 | learning rate: 6.614E-05 | global batch size: 256 | lm loss: 3.690726E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.573 | TFLOPs: 25.93 | +7: iteration 115460/ 173500 | consumed samples: 29557760 | consumed tokens: 60534292480 | elapsed time per iteration (s): 0.16 | learning rate: 6.612E-05 | global batch size: 256 | lm loss: 3.671444E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.181 | TFLOPs: 25.72 | +7: iteration 115470/ 173500 | consumed samples: 29560320 | consumed tokens: 60539535360 | elapsed time per iteration (s): 0.16 | learning rate: 6.611E-05 | global batch size: 256 | lm loss: 3.687582E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.505 | TFLOPs: 25.62 | +7: iteration 115480/ 173500 | consumed samples: 29562880 | consumed tokens: 60544778240 | elapsed time per iteration (s): 0.15 | learning rate: 6.610E-05 | global batch size: 256 | lm loss: 3.679815E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.897 | TFLOPs: 25.94 | +7: iteration 115490/ 173500 | consumed samples: 29565440 | consumed tokens: 60550021120 | elapsed time per iteration (s): 0.15 | learning rate: 6.608E-05 | global batch size: 256 | lm loss: 3.679564E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.899 | TFLOPs: 25.91 | +7: iteration 115500/ 173500 | consumed samples: 29568000 | consumed tokens: 60555264000 | elapsed time per iteration (s): 0.16 | learning rate: 6.607E-05 | global batch size: 256 | lm loss: 3.664336E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.027 | TFLOPs: 25.03 | +7: iteration 115510/ 173500 | consumed samples: 29570560 | consumed tokens: 60560506880 | elapsed time per iteration (s): 0.15 | learning rate: 6.605E-05 | global batch size: 256 | lm loss: 3.683331E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.487 | TFLOPs: 26.31 | +7: iteration 115520/ 173500 | consumed samples: 29573120 | consumed tokens: 60565749760 | elapsed time per iteration (s): 0.15 | learning rate: 6.604E-05 | global batch size: 256 | lm loss: 3.684872E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.065 | TFLOPs: 25.91 | +7: iteration 115530/ 173500 | consumed samples: 29575680 | consumed tokens: 60570992640 | elapsed time per iteration (s): 0.15 | learning rate: 6.602E-05 | global batch size: 256 | lm loss: 3.691175E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.280 | TFLOPs: 25.93 | +7: iteration 115540/ 173500 | consumed samples: 29578240 | consumed tokens: 60576235520 | elapsed time per iteration (s): 0.15 | learning rate: 6.601E-05 | global batch size: 256 | lm loss: 3.683614E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.764 | TFLOPs: 26.01 | +7: iteration 115550/ 173500 | consumed samples: 29580800 | consumed tokens: 60581478400 | elapsed time per iteration (s): 0.15 | learning rate: 6.599E-05 | global batch size: 256 | lm loss: 3.679109E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.371 | TFLOPs: 25.96 | +7: iteration 115560/ 173500 | consumed samples: 29583360 | consumed tokens: 60586721280 | elapsed time per iteration (s): 0.16 | learning rate: 6.598E-05 | global batch size: 256 | lm loss: 3.684796E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.381 | TFLOPs: 25.84 | +7: iteration 115570/ 173500 | consumed samples: 29585920 | consumed tokens: 60591964160 | elapsed time per iteration (s): 0.16 | learning rate: 6.597E-05 | global batch size: 256 | lm loss: 3.696206E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.762 | TFLOPs: 25.42 | +7: iteration 115580/ 173500 | consumed samples: 29588480 | consumed tokens: 60597207040 | elapsed time per iteration (s): 0.16 | learning rate: 6.595E-05 | global batch size: 256 | lm loss: 3.683061E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.100 | TFLOPs: 24.42 | +7: iteration 115590/ 173500 | consumed samples: 29591040 | consumed tokens: 60602449920 | elapsed time per iteration (s): 0.16 | learning rate: 6.594E-05 | global batch size: 256 | lm loss: 3.676176E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.955 | TFLOPs: 25.15 | +7: iteration 115600/ 173500 | consumed samples: 29593600 | consumed tokens: 60607692800 | elapsed time per iteration (s): 0.16 | learning rate: 6.592E-05 | global batch size: 256 | lm loss: 3.666389E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.062 | TFLOPs: 25.88 | +7: iteration 115610/ 173500 | consumed samples: 29596160 | consumed tokens: 60612935680 | elapsed time per iteration (s): 0.16 | learning rate: 6.591E-05 | global batch size: 256 | lm loss: 3.675969E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.971 | TFLOPs: 24.89 | +7: iteration 115620/ 173500 | consumed samples: 29598720 | consumed tokens: 60618178560 | elapsed time per iteration (s): 0.16 | learning rate: 6.589E-05 | global batch size: 256 | lm loss: 3.688547E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.632 | TFLOPs: 24.82 | +7: iteration 115630/ 173500 | consumed samples: 29601280 | consumed tokens: 60623421440 | elapsed time per iteration (s): 0.16 | learning rate: 6.588E-05 | global batch size: 256 | lm loss: 3.692163E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.567 | TFLOPs: 25.46 | +7: iteration 115640/ 173500 | consumed samples: 29603840 | consumed tokens: 60628664320 | elapsed time per iteration (s): 0.16 | learning rate: 6.587E-05 | global batch size: 256 | lm loss: 3.698896E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.028 | TFLOPs: 24.54 | +7: iteration 115650/ 173500 | consumed samples: 29606400 | consumed tokens: 60633907200 | elapsed time per iteration (s): 0.16 | learning rate: 6.585E-05 | global batch size: 256 | lm loss: 3.689651E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.722 | TFLOPs: 24.90 | +7: iteration 115660/ 173500 | consumed samples: 29608960 | consumed tokens: 60639150080 | elapsed time per iteration (s): 0.16 | learning rate: 6.584E-05 | global batch size: 256 | lm loss: 3.674474E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.537 | TFLOPs: 25.70 | +7: iteration 115670/ 173500 | consumed samples: 29611520 | consumed tokens: 60644392960 | elapsed time per iteration (s): 0.16 | learning rate: 6.582E-05 | global batch size: 256 | lm loss: 3.687011E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.272 | TFLOPs: 24.77 | +7: iteration 115680/ 173500 | consumed samples: 29614080 | consumed tokens: 60649635840 | elapsed time per iteration (s): 0.16 | learning rate: 6.581E-05 | global batch size: 256 | lm loss: 3.689815E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.629 | TFLOPs: 24.58 | +7: iteration 115690/ 173500 | consumed samples: 29616640 | consumed tokens: 60654878720 | elapsed time per iteration (s): 0.16 | learning rate: 6.579E-05 | global batch size: 256 | lm loss: 3.673335E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.056 | TFLOPs: 24.95 | +7: iteration 115700/ 173500 | consumed samples: 29619200 | consumed tokens: 60660121600 | elapsed time per iteration (s): 0.16 | learning rate: 6.578E-05 | global batch size: 256 | lm loss: 3.680625E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.572 | TFLOPs: 25.70 | +7: iteration 115710/ 173500 | consumed samples: 29621760 | consumed tokens: 60665364480 | elapsed time per iteration (s): 0.16 | learning rate: 6.577E-05 | global batch size: 256 | lm loss: 3.690816E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.463 | TFLOPs: 24.68 | +7: iteration 115720/ 173500 | consumed samples: 29624320 | consumed tokens: 60670607360 | elapsed time per iteration (s): 0.16 | learning rate: 6.575E-05 | global batch size: 256 | lm loss: 3.687406E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.823 | TFLOPs: 25.54 | +7: iteration 115730/ 173500 | consumed samples: 29626880 | consumed tokens: 60675850240 | elapsed time per iteration (s): 0.16 | learning rate: 6.574E-05 | global batch size: 256 | lm loss: 3.673919E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.318 | TFLOPs: 25.30 | +7: iteration 115740/ 173500 | consumed samples: 29629440 | consumed tokens: 60681093120 | elapsed time per iteration (s): 0.16 | learning rate: 6.572E-05 | global batch size: 256 | lm loss: 3.693246E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.215 | TFLOPs: 25.41 | +7: iteration 115750/ 173500 | consumed samples: 29632000 | consumed tokens: 60686336000 | elapsed time per iteration (s): 0.16 | learning rate: 6.571E-05 | global batch size: 256 | lm loss: 3.666882E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.206 | TFLOPs: 24.89 | +7: iteration 115760/ 173500 | consumed samples: 29634560 | consumed tokens: 60691578880 | elapsed time per iteration (s): 0.15 | learning rate: 6.569E-05 | global batch size: 256 | lm loss: 3.676081E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.228 | TFLOPs: 26.16 | +7: iteration 115770/ 173500 | consumed samples: 29637120 | consumed tokens: 60696821760 | elapsed time per iteration (s): 0.16 | learning rate: 6.568E-05 | global batch size: 256 | lm loss: 3.674086E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.855 | TFLOPs: 25.22 | +7: iteration 115780/ 173500 | consumed samples: 29639680 | consumed tokens: 60702064640 | elapsed time per iteration (s): 0.16 | learning rate: 6.567E-05 | global batch size: 256 | lm loss: 3.678576E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.824 | TFLOPs: 25.87 | +7: iteration 115790/ 173500 | consumed samples: 29642240 | consumed tokens: 60707307520 | elapsed time per iteration (s): 0.16 | learning rate: 6.565E-05 | global batch size: 256 | lm loss: 3.691895E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.443 | TFLOPs: 24.79 | +7: iteration 115800/ 173500 | consumed samples: 29644800 | consumed tokens: 60712550400 | elapsed time per iteration (s): 0.16 | learning rate: 6.564E-05 | global batch size: 256 | lm loss: 3.681056E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.299 | TFLOPs: 25.44 | +7: iteration 115810/ 173500 | consumed samples: 29647360 | consumed tokens: 60717793280 | elapsed time per iteration (s): 0.16 | learning rate: 6.562E-05 | global batch size: 256 | lm loss: 3.682754E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.196 | TFLOPs: 25.79 | +7: iteration 115820/ 173500 | consumed samples: 29649920 | consumed tokens: 60723036160 | elapsed time per iteration (s): 0.16 | learning rate: 6.561E-05 | global batch size: 256 | lm loss: 3.684893E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.467 | TFLOPs: 25.73 | +7: iteration 115830/ 173500 | consumed samples: 29652480 | consumed tokens: 60728279040 | elapsed time per iteration (s): 0.17 | learning rate: 6.559E-05 | global batch size: 256 | lm loss: 3.689689E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1486.654 | TFLOPs: 23.31 | +7: iteration 115840/ 173500 | consumed samples: 29655040 | consumed tokens: 60733521920 | elapsed time per iteration (s): 0.16 | learning rate: 6.558E-05 | global batch size: 256 | lm loss: 3.696409E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.047 | TFLOPs: 25.74 | +7: iteration 115850/ 173500 | consumed samples: 29657600 | consumed tokens: 60738764800 | elapsed time per iteration (s): 0.15 | learning rate: 6.556E-05 | global batch size: 256 | lm loss: 3.680747E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.303 | TFLOPs: 25.99 | +7: iteration 115860/ 173500 | consumed samples: 29660160 | consumed tokens: 60744007680 | elapsed time per iteration (s): 0.16 | learning rate: 6.555E-05 | global batch size: 256 | lm loss: 3.679582E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.261 | TFLOPs: 25.86 | +7: iteration 115870/ 173500 | consumed samples: 29662720 | consumed tokens: 60749250560 | elapsed time per iteration (s): 0.17 | learning rate: 6.554E-05 | global batch size: 256 | lm loss: 3.693744E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.007 | TFLOPs: 23.29 | +7: iteration 115880/ 173500 | consumed samples: 29665280 | consumed tokens: 60754493440 | elapsed time per iteration (s): 0.16 | learning rate: 6.552E-05 | global batch size: 256 | lm loss: 3.683924E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.488 | TFLOPs: 25.46 | +7: iteration 115890/ 173500 | consumed samples: 29667840 | consumed tokens: 60759736320 | elapsed time per iteration (s): 0.16 | learning rate: 6.551E-05 | global batch size: 256 | lm loss: 3.676089E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.820 | TFLOPs: 25.73 | +7: iteration 115900/ 173500 | consumed samples: 29670400 | consumed tokens: 60764979200 | elapsed time per iteration (s): 0.15 | learning rate: 6.549E-05 | global batch size: 256 | lm loss: 3.677247E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.008 | TFLOPs: 26.19 | +7: iteration 115910/ 173500 | consumed samples: 29672960 | consumed tokens: 60770222080 | elapsed time per iteration (s): 0.16 | learning rate: 6.548E-05 | global batch size: 256 | lm loss: 3.678185E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.949 | TFLOPs: 24.53 | +7: iteration 115920/ 173500 | consumed samples: 29675520 | consumed tokens: 60775464960 | elapsed time per iteration (s): 0.16 | learning rate: 6.546E-05 | global batch size: 256 | lm loss: 3.691320E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.990 | TFLOPs: 25.69 | +7: iteration 115930/ 173500 | consumed samples: 29678080 | consumed tokens: 60780707840 | elapsed time per iteration (s): 0.15 | learning rate: 6.545E-05 | global batch size: 256 | lm loss: 3.681772E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.322 | TFLOPs: 26.05 | +7: iteration 115940/ 173500 | consumed samples: 29680640 | consumed tokens: 60785950720 | elapsed time per iteration (s): 0.16 | learning rate: 6.544E-05 | global batch size: 256 | lm loss: 3.673073E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.592 | TFLOPs: 25.51 | +7: iteration 115950/ 173500 | consumed samples: 29683200 | consumed tokens: 60791193600 | elapsed time per iteration (s): 0.16 | learning rate: 6.542E-05 | global batch size: 256 | lm loss: 3.658064E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.180 | TFLOPs: 25.58 | +7: iteration 115960/ 173500 | consumed samples: 29685760 | consumed tokens: 60796436480 | elapsed time per iteration (s): 0.17 | learning rate: 6.541E-05 | global batch size: 256 | lm loss: 3.680338E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.556 | TFLOPs: 24.18 | +7: iteration 115970/ 173500 | consumed samples: 29688320 | consumed tokens: 60801679360 | elapsed time per iteration (s): 0.15 | learning rate: 6.539E-05 | global batch size: 256 | lm loss: 3.672381E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.675 | TFLOPs: 26.03 | +7: iteration 115980/ 173500 | consumed samples: 29690880 | consumed tokens: 60806922240 | elapsed time per iteration (s): 0.16 | learning rate: 6.538E-05 | global batch size: 256 | lm loss: 3.670956E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.798 | TFLOPs: 25.04 | +7: iteration 115990/ 173500 | consumed samples: 29693440 | consumed tokens: 60812165120 | elapsed time per iteration (s): 0.16 | learning rate: 6.536E-05 | global batch size: 256 | lm loss: 3.693161E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.886 | TFLOPs: 25.70 | +0: [2023-03-17 05:17:33,574] [INFO] [logging.py:68:log_dist] [Rank 0] step=116000, skipped=0, lr=[6.535024808618106e-05, 6.535024808618106e-05, 6.535024808618106e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 116000/ 173500 | consumed samples: 29696000 | consumed tokens: 60817408000 | elapsed time per iteration (s): 0.15 | learning rate: 6.535E-05 | global batch size: 256 | lm loss: 3.692642E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.594 | TFLOPs: 26.01 | +0: steps: 116000 loss: 3.6569 iter time (s): 0.157 samples/sec: 1630.850 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 116000 | lm loss value: 3.828074E+00 | lm loss PPL: 4.597391E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 116000 to checkpoints_44m91b100m +0: [2023-03-17 05:17:33,649] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step116000 is begin to save! +0: [2023-03-17 05:17:33,653] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:17:33,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:17:33,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:17:33,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:17:33,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:17:33,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:17:33,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:17:33,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:17:33,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:17:33,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:17:33,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:17:33,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:17:33,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:17:33,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:17:33,773] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:17:33,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:17:33,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:17:33,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:17:33,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:17:33,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:17:33,790] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step116000/mp_rank_00_model_states.pt +0: [2023-03-17 05:17:33,790] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:17:33,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:17:33,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:17:33,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +2: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,826] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,826] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +2: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +2: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +2: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +1: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +5: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +6: [2023-03-17 05:17:33,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:17:33,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +2: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +3: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +7: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +4: [2023-03-17 05:17:33,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:17:33,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step116000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:17:33,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step116000 is ready now! +0: successfully saved checkpoint at iteration 116000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.17 +7: iteration 116010/ 173500 | consumed samples: 29698560 | consumed tokens: 60822650880 | elapsed time per iteration (s): 0.18 | learning rate: 6.534E-05 | global batch size: 256 | lm loss: 3.674428E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.842 | TFLOPs: 22.08 | +7: iteration 116020/ 173500 | consumed samples: 29701120 | consumed tokens: 60827893760 | elapsed time per iteration (s): 0.15 | learning rate: 6.532E-05 | global batch size: 256 | lm loss: 3.672833E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.236 | TFLOPs: 26.15 | +7: iteration 116030/ 173500 | consumed samples: 29703680 | consumed tokens: 60833136640 | elapsed time per iteration (s): 0.16 | learning rate: 6.531E-05 | global batch size: 256 | lm loss: 3.684541E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.477 | TFLOPs: 24.49 | +7: iteration 116040/ 173500 | consumed samples: 29706240 | consumed tokens: 60838379520 | elapsed time per iteration (s): 0.16 | learning rate: 6.529E-05 | global batch size: 256 | lm loss: 3.677033E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.087 | TFLOPs: 25.47 | +7: iteration 116050/ 173500 | consumed samples: 29708800 | consumed tokens: 60843622400 | elapsed time per iteration (s): 0.15 | learning rate: 6.528E-05 | global batch size: 256 | lm loss: 3.679799E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.765 | TFLOPs: 26.23 | +7: iteration 116060/ 173500 | consumed samples: 29711360 | consumed tokens: 60848865280 | elapsed time per iteration (s): 0.16 | learning rate: 6.526E-05 | global batch size: 256 | lm loss: 3.669272E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.147 | TFLOPs: 25.83 | +7: iteration 116070/ 173500 | consumed samples: 29713920 | consumed tokens: 60854108160 | elapsed time per iteration (s): 0.15 | learning rate: 6.525E-05 | global batch size: 256 | lm loss: 3.693724E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.505 | TFLOPs: 26.24 | +7: iteration 116080/ 173500 | consumed samples: 29716480 | consumed tokens: 60859351040 | elapsed time per iteration (s): 0.15 | learning rate: 6.524E-05 | global batch size: 256 | lm loss: 3.682217E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.776 | TFLOPs: 25.98 | +7: iteration 116090/ 173500 | consumed samples: 29719040 | consumed tokens: 60864593920 | elapsed time per iteration (s): 0.15 | learning rate: 6.522E-05 | global batch size: 256 | lm loss: 3.681736E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.135 | TFLOPs: 26.22 | +7: iteration 116100/ 173500 | consumed samples: 29721600 | consumed tokens: 60869836800 | elapsed time per iteration (s): 0.16 | learning rate: 6.521E-05 | global batch size: 256 | lm loss: 3.692867E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.181 | TFLOPs: 25.20 | +7: iteration 116110/ 173500 | consumed samples: 29724160 | consumed tokens: 60875079680 | elapsed time per iteration (s): 0.15 | learning rate: 6.519E-05 | global batch size: 256 | lm loss: 3.676439E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.566 | TFLOPs: 25.96 | +7: iteration 116120/ 173500 | consumed samples: 29726720 | consumed tokens: 60880322560 | elapsed time per iteration (s): 0.16 | learning rate: 6.518E-05 | global batch size: 256 | lm loss: 3.689641E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.889 | TFLOPs: 25.81 | +7: iteration 116130/ 173500 | consumed samples: 29729280 | consumed tokens: 60885565440 | elapsed time per iteration (s): 0.16 | learning rate: 6.516E-05 | global batch size: 256 | lm loss: 3.682962E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.401 | TFLOPs: 25.69 | +7: iteration 116140/ 173500 | consumed samples: 29731840 | consumed tokens: 60890808320 | elapsed time per iteration (s): 0.16 | learning rate: 6.515E-05 | global batch size: 256 | lm loss: 3.672040E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.865 | TFLOPs: 25.11 | +7: iteration 116150/ 173500 | consumed samples: 29734400 | consumed tokens: 60896051200 | elapsed time per iteration (s): 0.16 | learning rate: 6.514E-05 | global batch size: 256 | lm loss: 3.679745E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.264 | TFLOPs: 25.44 | +7: iteration 116160/ 173500 | consumed samples: 29736960 | consumed tokens: 60901294080 | elapsed time per iteration (s): 0.15 | learning rate: 6.512E-05 | global batch size: 256 | lm loss: 3.684956E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.357 | TFLOPs: 26.23 | +7: iteration 116170/ 173500 | consumed samples: 29739520 | consumed tokens: 60906536960 | elapsed time per iteration (s): 0.16 | learning rate: 6.511E-05 | global batch size: 256 | lm loss: 3.692995E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.657 | TFLOPs: 25.86 | +7: iteration 116180/ 173500 | consumed samples: 29742080 | consumed tokens: 60911779840 | elapsed time per iteration (s): 0.16 | learning rate: 6.509E-05 | global batch size: 256 | lm loss: 3.665498E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.981 | TFLOPs: 25.37 | +7: iteration 116190/ 173500 | consumed samples: 29744640 | consumed tokens: 60917022720 | elapsed time per iteration (s): 0.15 | learning rate: 6.508E-05 | global batch size: 256 | lm loss: 3.690582E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.913 | TFLOPs: 26.08 | +7: iteration 116200/ 173500 | consumed samples: 29747200 | consumed tokens: 60922265600 | elapsed time per iteration (s): 0.15 | learning rate: 6.506E-05 | global batch size: 256 | lm loss: 3.681953E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.117 | TFLOPs: 26.13 | +7: iteration 116210/ 173500 | consumed samples: 29749760 | consumed tokens: 60927508480 | elapsed time per iteration (s): 0.15 | learning rate: 6.505E-05 | global batch size: 256 | lm loss: 3.665716E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.978 | TFLOPs: 26.16 | +7: iteration 116220/ 173500 | consumed samples: 29752320 | consumed tokens: 60932751360 | elapsed time per iteration (s): 0.16 | learning rate: 6.504E-05 | global batch size: 256 | lm loss: 3.669874E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.135 | TFLOPs: 25.67 | +7: iteration 116230/ 173500 | consumed samples: 29754880 | consumed tokens: 60937994240 | elapsed time per iteration (s): 0.16 | learning rate: 6.502E-05 | global batch size: 256 | lm loss: 3.668337E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.883 | TFLOPs: 25.56 | +7: iteration 116240/ 173500 | consumed samples: 29757440 | consumed tokens: 60943237120 | elapsed time per iteration (s): 0.17 | learning rate: 6.501E-05 | global batch size: 256 | lm loss: 3.672273E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.089 | TFLOPs: 23.10 | +7: iteration 116250/ 173500 | consumed samples: 29760000 | consumed tokens: 60948480000 | elapsed time per iteration (s): 0.15 | learning rate: 6.499E-05 | global batch size: 256 | lm loss: 3.672763E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.446 | TFLOPs: 26.10 | +7: iteration 116260/ 173500 | consumed samples: 29762560 | consumed tokens: 60953722880 | elapsed time per iteration (s): 0.16 | learning rate: 6.498E-05 | global batch size: 256 | lm loss: 3.676962E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.433 | TFLOPs: 25.19 | +7: iteration 116270/ 173500 | consumed samples: 29765120 | consumed tokens: 60958965760 | elapsed time per iteration (s): 0.16 | learning rate: 6.496E-05 | global batch size: 256 | lm loss: 3.679929E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.300 | TFLOPs: 25.77 | +7: iteration 116280/ 173500 | consumed samples: 29767680 | consumed tokens: 60964208640 | elapsed time per iteration (s): 0.16 | learning rate: 6.495E-05 | global batch size: 256 | lm loss: 3.685100E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.437 | TFLOPs: 25.76 | +7: iteration 116290/ 173500 | consumed samples: 29770240 | consumed tokens: 60969451520 | elapsed time per iteration (s): 0.16 | learning rate: 6.494E-05 | global batch size: 256 | lm loss: 3.692208E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.256 | TFLOPs: 25.66 | +7: iteration 116300/ 173500 | consumed samples: 29772800 | consumed tokens: 60974694400 | elapsed time per iteration (s): 0.16 | learning rate: 6.492E-05 | global batch size: 256 | lm loss: 3.674390E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.050 | TFLOPs: 25.33 | +7: iteration 116310/ 173500 | consumed samples: 29775360 | consumed tokens: 60979937280 | elapsed time per iteration (s): 0.15 | learning rate: 6.491E-05 | global batch size: 256 | lm loss: 3.684812E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.220 | TFLOPs: 26.11 | +7: iteration 116320/ 173500 | consumed samples: 29777920 | consumed tokens: 60985180160 | elapsed time per iteration (s): 0.16 | learning rate: 6.489E-05 | global batch size: 256 | lm loss: 3.668338E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.990 | TFLOPs: 25.58 | +7: iteration 116330/ 173500 | consumed samples: 29780480 | consumed tokens: 60990423040 | elapsed time per iteration (s): 0.16 | learning rate: 6.488E-05 | global batch size: 256 | lm loss: 3.684734E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.680 | TFLOPs: 25.51 | +7: iteration 116340/ 173500 | consumed samples: 29783040 | consumed tokens: 60995665920 | elapsed time per iteration (s): 0.16 | learning rate: 6.487E-05 | global batch size: 256 | lm loss: 3.690873E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.051 | TFLOPs: 24.65 | +7: iteration 116350/ 173500 | consumed samples: 29785600 | consumed tokens: 61000908800 | elapsed time per iteration (s): 0.16 | learning rate: 6.485E-05 | global batch size: 256 | lm loss: 3.688405E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.545 | TFLOPs: 25.70 | +7: iteration 116360/ 173500 | consumed samples: 29788160 | consumed tokens: 61006151680 | elapsed time per iteration (s): 0.16 | learning rate: 6.484E-05 | global batch size: 256 | lm loss: 3.678112E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.399 | TFLOPs: 25.35 | +7: iteration 116370/ 173500 | consumed samples: 29790720 | consumed tokens: 61011394560 | elapsed time per iteration (s): 0.16 | learning rate: 6.482E-05 | global batch size: 256 | lm loss: 3.682146E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.481 | TFLOPs: 25.44 | +7: iteration 116380/ 173500 | consumed samples: 29793280 | consumed tokens: 61016637440 | elapsed time per iteration (s): 0.16 | learning rate: 6.481E-05 | global batch size: 256 | lm loss: 3.676844E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.163 | TFLOPs: 25.30 | +7: iteration 116390/ 173500 | consumed samples: 29795840 | consumed tokens: 61021880320 | elapsed time per iteration (s): 0.15 | learning rate: 6.479E-05 | global batch size: 256 | lm loss: 3.692034E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.203 | TFLOPs: 25.97 | +7: iteration 116400/ 173500 | consumed samples: 29798400 | consumed tokens: 61027123200 | elapsed time per iteration (s): 0.17 | learning rate: 6.478E-05 | global batch size: 256 | lm loss: 3.681075E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.965 | TFLOPs: 23.46 | +7: iteration 116410/ 173500 | consumed samples: 29800960 | consumed tokens: 61032366080 | elapsed time per iteration (s): 0.15 | learning rate: 6.477E-05 | global batch size: 256 | lm loss: 3.674925E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.517 | TFLOPs: 26.14 | +7: iteration 116420/ 173500 | consumed samples: 29803520 | consumed tokens: 61037608960 | elapsed time per iteration (s): 0.16 | learning rate: 6.475E-05 | global batch size: 256 | lm loss: 3.684782E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.951 | TFLOPs: 25.44 | +7: iteration 116430/ 173500 | consumed samples: 29806080 | consumed tokens: 61042851840 | elapsed time per iteration (s): 0.16 | learning rate: 6.474E-05 | global batch size: 256 | lm loss: 3.689967E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.046 | TFLOPs: 25.85 | +7: iteration 116440/ 173500 | consumed samples: 29808640 | consumed tokens: 61048094720 | elapsed time per iteration (s): 0.16 | learning rate: 6.472E-05 | global batch size: 256 | lm loss: 3.670277E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.594 | TFLOPs: 25.29 | +7: iteration 116450/ 173500 | consumed samples: 29811200 | consumed tokens: 61053337600 | elapsed time per iteration (s): 0.15 | learning rate: 6.471E-05 | global batch size: 256 | lm loss: 3.680045E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.327 | TFLOPs: 26.15 | +7: iteration 116460/ 173500 | consumed samples: 29813760 | consumed tokens: 61058580480 | elapsed time per iteration (s): 0.15 | learning rate: 6.469E-05 | global batch size: 256 | lm loss: 3.686525E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.106 | TFLOPs: 26.14 | +7: iteration 116470/ 173500 | consumed samples: 29816320 | consumed tokens: 61063823360 | elapsed time per iteration (s): 0.15 | learning rate: 6.468E-05 | global batch size: 256 | lm loss: 3.687986E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.733 | TFLOPs: 26.11 | +7: iteration 116480/ 173500 | consumed samples: 29818880 | consumed tokens: 61069066240 | elapsed time per iteration (s): 0.15 | learning rate: 6.467E-05 | global batch size: 256 | lm loss: 3.684195E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.760 | TFLOPs: 26.12 | +7: iteration 116490/ 173500 | consumed samples: 29821440 | consumed tokens: 61074309120 | elapsed time per iteration (s): 0.16 | learning rate: 6.465E-05 | global batch size: 256 | lm loss: 3.680553E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.076 | TFLOPs: 25.86 | +7: iteration 116500/ 173500 | consumed samples: 29824000 | consumed tokens: 61079552000 | elapsed time per iteration (s): 0.15 | learning rate: 6.464E-05 | global batch size: 256 | lm loss: 3.680754E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.603 | TFLOPs: 26.12 | +7: iteration 116510/ 173500 | consumed samples: 29826560 | consumed tokens: 61084794880 | elapsed time per iteration (s): 0.15 | learning rate: 6.462E-05 | global batch size: 256 | lm loss: 3.670777E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.845 | TFLOPs: 26.14 | +7: iteration 116520/ 173500 | consumed samples: 29829120 | consumed tokens: 61090037760 | elapsed time per iteration (s): 0.15 | learning rate: 6.461E-05 | global batch size: 256 | lm loss: 3.683356E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.134 | TFLOPs: 26.14 | +7: iteration 116530/ 173500 | consumed samples: 29831680 | consumed tokens: 61095280640 | elapsed time per iteration (s): 0.16 | learning rate: 6.459E-05 | global batch size: 256 | lm loss: 3.686929E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.243 | TFLOPs: 25.55 | +7: iteration 116540/ 173500 | consumed samples: 29834240 | consumed tokens: 61100523520 | elapsed time per iteration (s): 0.15 | learning rate: 6.458E-05 | global batch size: 256 | lm loss: 3.677052E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.849 | TFLOPs: 26.11 | +7: iteration 116550/ 173500 | consumed samples: 29836800 | consumed tokens: 61105766400 | elapsed time per iteration (s): 0.15 | learning rate: 6.457E-05 | global batch size: 256 | lm loss: 3.674872E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.423 | TFLOPs: 26.09 | +7: iteration 116560/ 173500 | consumed samples: 29839360 | consumed tokens: 61111009280 | elapsed time per iteration (s): 0.16 | learning rate: 6.455E-05 | global batch size: 256 | lm loss: 3.676948E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.393 | TFLOPs: 25.77 | +7: iteration 116570/ 173500 | consumed samples: 29841920 | consumed tokens: 61116252160 | elapsed time per iteration (s): 0.15 | learning rate: 6.454E-05 | global batch size: 256 | lm loss: 3.679687E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.856 | TFLOPs: 26.09 | +7: iteration 116580/ 173500 | consumed samples: 29844480 | consumed tokens: 61121495040 | elapsed time per iteration (s): 0.16 | learning rate: 6.452E-05 | global batch size: 256 | lm loss: 3.674634E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.828 | TFLOPs: 25.45 | +7: iteration 116590/ 173500 | consumed samples: 29847040 | consumed tokens: 61126737920 | elapsed time per iteration (s): 0.16 | learning rate: 6.451E-05 | global batch size: 256 | lm loss: 3.677696E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.996 | TFLOPs: 25.70 | +7: iteration 116600/ 173500 | consumed samples: 29849600 | consumed tokens: 61131980800 | elapsed time per iteration (s): 0.16 | learning rate: 6.450E-05 | global batch size: 256 | lm loss: 3.683203E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.397 | TFLOPs: 25.58 | +7: iteration 116610/ 173500 | consumed samples: 29852160 | consumed tokens: 61137223680 | elapsed time per iteration (s): 0.16 | learning rate: 6.448E-05 | global batch size: 256 | lm loss: 3.680009E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.176 | TFLOPs: 25.74 | +7: iteration 116620/ 173500 | consumed samples: 29854720 | consumed tokens: 61142466560 | elapsed time per iteration (s): 0.16 | learning rate: 6.447E-05 | global batch size: 256 | lm loss: 3.699910E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.228 | TFLOPs: 24.47 | +7: iteration 116630/ 173500 | consumed samples: 29857280 | consumed tokens: 61147709440 | elapsed time per iteration (s): 0.16 | learning rate: 6.445E-05 | global batch size: 256 | lm loss: 3.681358E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.760 | TFLOPs: 25.84 | +7: iteration 116640/ 173500 | consumed samples: 29859840 | consumed tokens: 61152952320 | elapsed time per iteration (s): 0.16 | learning rate: 6.444E-05 | global batch size: 256 | lm loss: 3.671373E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.642 | TFLOPs: 25.60 | +7: iteration 116650/ 173500 | consumed samples: 29862400 | consumed tokens: 61158195200 | elapsed time per iteration (s): 0.15 | learning rate: 6.442E-05 | global batch size: 256 | lm loss: 3.682701E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.617 | TFLOPs: 26.07 | +7: iteration 116660/ 173500 | consumed samples: 29864960 | consumed tokens: 61163438080 | elapsed time per iteration (s): 0.16 | learning rate: 6.441E-05 | global batch size: 256 | lm loss: 3.698503E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.651 | TFLOPs: 24.91 | +7: iteration 116670/ 173500 | consumed samples: 29867520 | consumed tokens: 61168680960 | elapsed time per iteration (s): 0.15 | learning rate: 6.440E-05 | global batch size: 256 | lm loss: 3.675721E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.870 | TFLOPs: 26.11 | +7: iteration 116680/ 173500 | consumed samples: 29870080 | consumed tokens: 61173923840 | elapsed time per iteration (s): 0.15 | learning rate: 6.438E-05 | global batch size: 256 | lm loss: 3.690298E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.388 | TFLOPs: 26.05 | +7: iteration 116690/ 173500 | consumed samples: 29872640 | consumed tokens: 61179166720 | elapsed time per iteration (s): 0.15 | learning rate: 6.437E-05 | global batch size: 256 | lm loss: 3.677831E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.984 | TFLOPs: 26.22 | +7: iteration 116700/ 173500 | consumed samples: 29875200 | consumed tokens: 61184409600 | elapsed time per iteration (s): 0.16 | learning rate: 6.435E-05 | global batch size: 256 | lm loss: 3.675515E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.267 | TFLOPs: 25.82 | +7: iteration 116710/ 173500 | consumed samples: 29877760 | consumed tokens: 61189652480 | elapsed time per iteration (s): 0.16 | learning rate: 6.434E-05 | global batch size: 256 | lm loss: 3.677034E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.865 | TFLOPs: 25.12 | +7: iteration 116720/ 173500 | consumed samples: 29880320 | consumed tokens: 61194895360 | elapsed time per iteration (s): 0.16 | learning rate: 6.433E-05 | global batch size: 256 | lm loss: 3.677327E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.788 | TFLOPs: 25.83 | +7: iteration 116730/ 173500 | consumed samples: 29882880 | consumed tokens: 61200138240 | elapsed time per iteration (s): 0.15 | learning rate: 6.431E-05 | global batch size: 256 | lm loss: 3.671863E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.206 | TFLOPs: 26.22 | +7: iteration 116740/ 173500 | consumed samples: 29885440 | consumed tokens: 61205381120 | elapsed time per iteration (s): 0.15 | learning rate: 6.430E-05 | global batch size: 256 | lm loss: 3.676065E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.789 | TFLOPs: 26.20 | +7: iteration 116750/ 173500 | consumed samples: 29888000 | consumed tokens: 61210624000 | elapsed time per iteration (s): 0.16 | learning rate: 6.428E-05 | global batch size: 256 | lm loss: 3.681501E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.276 | TFLOPs: 25.49 | +7: iteration 116760/ 173500 | consumed samples: 29890560 | consumed tokens: 61215866880 | elapsed time per iteration (s): 0.16 | learning rate: 6.427E-05 | global batch size: 256 | lm loss: 3.675369E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.453 | TFLOPs: 25.59 | +7: iteration 116770/ 173500 | consumed samples: 29893120 | consumed tokens: 61221109760 | elapsed time per iteration (s): 0.16 | learning rate: 6.425E-05 | global batch size: 256 | lm loss: 3.682008E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.557 | TFLOPs: 25.26 | +7: iteration 116780/ 173500 | consumed samples: 29895680 | consumed tokens: 61226352640 | elapsed time per iteration (s): 0.16 | learning rate: 6.424E-05 | global batch size: 256 | lm loss: 3.685675E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.534 | TFLOPs: 24.80 | +7: iteration 116790/ 173500 | consumed samples: 29898240 | consumed tokens: 61231595520 | elapsed time per iteration (s): 0.16 | learning rate: 6.423E-05 | global batch size: 256 | lm loss: 3.664929E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.952 | TFLOPs: 25.31 | +7: iteration 116800/ 173500 | consumed samples: 29900800 | consumed tokens: 61236838400 | elapsed time per iteration (s): 0.16 | learning rate: 6.421E-05 | global batch size: 256 | lm loss: 3.671606E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.260 | TFLOPs: 25.44 | +7: iteration 116810/ 173500 | consumed samples: 29903360 | consumed tokens: 61242081280 | elapsed time per iteration (s): 0.15 | learning rate: 6.420E-05 | global batch size: 256 | lm loss: 3.686333E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.412 | TFLOPs: 26.18 | +7: iteration 116820/ 173500 | consumed samples: 29905920 | consumed tokens: 61247324160 | elapsed time per iteration (s): 0.16 | learning rate: 6.418E-05 | global batch size: 256 | lm loss: 3.688058E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.036 | TFLOPs: 25.74 | +7: iteration 116830/ 173500 | consumed samples: 29908480 | consumed tokens: 61252567040 | elapsed time per iteration (s): 0.15 | learning rate: 6.417E-05 | global batch size: 256 | lm loss: 3.675524E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.467 | TFLOPs: 26.23 | +7: iteration 116840/ 173500 | consumed samples: 29911040 | consumed tokens: 61257809920 | elapsed time per iteration (s): 0.16 | learning rate: 6.415E-05 | global batch size: 256 | lm loss: 3.664657E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.307 | TFLOPs: 24.74 | +7: iteration 116850/ 173500 | consumed samples: 29913600 | consumed tokens: 61263052800 | elapsed time per iteration (s): 0.15 | learning rate: 6.414E-05 | global batch size: 256 | lm loss: 3.689220E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.586 | TFLOPs: 26.00 | +7: iteration 116860/ 173500 | consumed samples: 29916160 | consumed tokens: 61268295680 | elapsed time per iteration (s): 0.16 | learning rate: 6.413E-05 | global batch size: 256 | lm loss: 3.676462E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.011 | TFLOPs: 25.67 | +7: iteration 116870/ 173500 | consumed samples: 29918720 | consumed tokens: 61273538560 | elapsed time per iteration (s): 0.16 | learning rate: 6.411E-05 | global batch size: 256 | lm loss: 3.669167E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.363 | TFLOPs: 25.71 | +7: iteration 116880/ 173500 | consumed samples: 29921280 | consumed tokens: 61278781440 | elapsed time per iteration (s): 0.15 | learning rate: 6.410E-05 | global batch size: 256 | lm loss: 3.693503E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.488 | TFLOPs: 25.93 | +7: iteration 116890/ 173500 | consumed samples: 29923840 | consumed tokens: 61284024320 | elapsed time per iteration (s): 0.16 | learning rate: 6.408E-05 | global batch size: 256 | lm loss: 3.678854E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.962 | TFLOPs: 25.86 | +7: iteration 116900/ 173500 | consumed samples: 29926400 | consumed tokens: 61289267200 | elapsed time per iteration (s): 0.16 | learning rate: 6.407E-05 | global batch size: 256 | lm loss: 3.682645E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.373 | TFLOPs: 25.40 | +7: iteration 116910/ 173500 | consumed samples: 29928960 | consumed tokens: 61294510080 | elapsed time per iteration (s): 0.15 | learning rate: 6.406E-05 | global batch size: 256 | lm loss: 3.671193E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.508 | TFLOPs: 25.99 | +7: iteration 116920/ 173500 | consumed samples: 29931520 | consumed tokens: 61299752960 | elapsed time per iteration (s): 0.15 | learning rate: 6.404E-05 | global batch size: 256 | lm loss: 3.673721E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.092 | TFLOPs: 26.18 | +7: iteration 116930/ 173500 | consumed samples: 29934080 | consumed tokens: 61304995840 | elapsed time per iteration (s): 0.15 | learning rate: 6.403E-05 | global batch size: 256 | lm loss: 3.688586E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.825 | TFLOPs: 26.00 | +7: iteration 116940/ 173500 | consumed samples: 29936640 | consumed tokens: 61310238720 | elapsed time per iteration (s): 0.15 | learning rate: 6.401E-05 | global batch size: 256 | lm loss: 3.669209E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.198 | TFLOPs: 25.94 | +7: iteration 116950/ 173500 | consumed samples: 29939200 | consumed tokens: 61315481600 | elapsed time per iteration (s): 0.16 | learning rate: 6.400E-05 | global batch size: 256 | lm loss: 3.689896E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.040 | TFLOPs: 25.45 | +7: iteration 116960/ 173500 | consumed samples: 29941760 | consumed tokens: 61320724480 | elapsed time per iteration (s): 0.16 | learning rate: 6.399E-05 | global batch size: 256 | lm loss: 3.673192E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.964 | TFLOPs: 25.62 | +7: iteration 116970/ 173500 | consumed samples: 29944320 | consumed tokens: 61325967360 | elapsed time per iteration (s): 0.16 | learning rate: 6.397E-05 | global batch size: 256 | lm loss: 3.683567E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.207 | TFLOPs: 25.35 | +7: iteration 116980/ 173500 | consumed samples: 29946880 | consumed tokens: 61331210240 | elapsed time per iteration (s): 0.15 | learning rate: 6.396E-05 | global batch size: 256 | lm loss: 3.688918E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.274 | TFLOPs: 26.21 | +7: iteration 116990/ 173500 | consumed samples: 29949440 | consumed tokens: 61336453120 | elapsed time per iteration (s): 0.16 | learning rate: 6.394E-05 | global batch size: 256 | lm loss: 3.675141E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.037 | TFLOPs: 25.27 | +7: iteration 117000/ 173500 | consumed samples: 29952000 | consumed tokens: 61341696000 | elapsed time per iteration (s): 0.16 | learning rate: 6.393E-05 | global batch size: 256 | lm loss: 3.688552E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.335 | TFLOPs: 25.32 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 117000 | lm loss value: 3.880635E+00 | lm loss PPL: 4.845498E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 117000 to checkpoints_44m91b100m +0: [2023-03-17 05:20:10,338] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step117000 is begin to save! +0: [2023-03-17 05:20:10,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:20:10,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:20:10,430] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:20:10,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:20:10,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:20:10,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:20:10,455] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:20:10,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:20:10,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:20:10,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:20:10,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:20:10,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:20:10,488] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:20:10,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:20:10,499] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:20:10,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:20:10,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:20:10,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:20:10,521] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:20:10,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:20:10,523] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step117000/mp_rank_00_model_states.pt +0: [2023-03-17 05:20:10,523] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:20:10,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,543] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:20:10,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +5: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +4: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +2: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +3: [2023-03-17 05:20:10,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:20:10,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +6: [2023-03-17 05:20:10,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +7: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:20:10,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +1: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:20:10,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step117000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:20:10,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step117000 is ready now! +0: successfully saved checkpoint at iteration 117000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 234.61 +7: iteration 117010/ 173500 | consumed samples: 29954560 | consumed tokens: 61346938880 | elapsed time per iteration (s): 0.19 | learning rate: 6.391E-05 | global batch size: 256 | lm loss: 3.670753E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1334.813 | TFLOPs: 20.93 | +7: iteration 117020/ 173500 | consumed samples: 29957120 | consumed tokens: 61352181760 | elapsed time per iteration (s): 0.16 | learning rate: 6.390E-05 | global batch size: 256 | lm loss: 3.682698E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.098 | TFLOPs: 25.75 | +7: iteration 117030/ 173500 | consumed samples: 29959680 | consumed tokens: 61357424640 | elapsed time per iteration (s): 0.16 | learning rate: 6.389E-05 | global batch size: 256 | lm loss: 3.673312E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.539 | TFLOPs: 24.43 | +7: iteration 117040/ 173500 | consumed samples: 29962240 | consumed tokens: 61362667520 | elapsed time per iteration (s): 0.15 | learning rate: 6.387E-05 | global batch size: 256 | lm loss: 3.673028E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.048 | TFLOPs: 26.07 | +7: iteration 117050/ 173500 | consumed samples: 29964800 | consumed tokens: 61367910400 | elapsed time per iteration (s): 0.16 | learning rate: 6.386E-05 | global batch size: 256 | lm loss: 3.685101E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.333 | TFLOPs: 25.61 | +7: iteration 117060/ 173500 | consumed samples: 29967360 | consumed tokens: 61373153280 | elapsed time per iteration (s): 0.16 | learning rate: 6.384E-05 | global batch size: 256 | lm loss: 3.691579E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.623 | TFLOPs: 25.35 | +7: iteration 117070/ 173500 | consumed samples: 29969920 | consumed tokens: 61378396160 | elapsed time per iteration (s): 0.16 | learning rate: 6.383E-05 | global batch size: 256 | lm loss: 3.694524E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.406 | TFLOPs: 25.62 | +7: iteration 117080/ 173500 | consumed samples: 29972480 | consumed tokens: 61383639040 | elapsed time per iteration (s): 0.16 | learning rate: 6.382E-05 | global batch size: 256 | lm loss: 3.676932E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.911 | TFLOPs: 25.17 | +7: iteration 117090/ 173500 | consumed samples: 29975040 | consumed tokens: 61388881920 | elapsed time per iteration (s): 0.16 | learning rate: 6.380E-05 | global batch size: 256 | lm loss: 3.684137E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.423 | TFLOPs: 25.66 | +7: iteration 117100/ 173500 | consumed samples: 29977600 | consumed tokens: 61394124800 | elapsed time per iteration (s): 0.16 | learning rate: 6.379E-05 | global batch size: 256 | lm loss: 3.671743E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.123 | TFLOPs: 25.50 | +7: iteration 117110/ 173500 | consumed samples: 29980160 | consumed tokens: 61399367680 | elapsed time per iteration (s): 0.16 | learning rate: 6.377E-05 | global batch size: 256 | lm loss: 3.674696E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.421 | TFLOPs: 25.55 | +7: iteration 117120/ 173500 | consumed samples: 29982720 | consumed tokens: 61404610560 | elapsed time per iteration (s): 0.15 | learning rate: 6.376E-05 | global batch size: 256 | lm loss: 3.689754E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.331 | TFLOPs: 26.16 | +7: iteration 117130/ 173500 | consumed samples: 29985280 | consumed tokens: 61409853440 | elapsed time per iteration (s): 0.16 | learning rate: 6.374E-05 | global batch size: 256 | lm loss: 3.689991E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.262 | TFLOPs: 24.47 | +7: iteration 117140/ 173500 | consumed samples: 29987840 | consumed tokens: 61415096320 | elapsed time per iteration (s): 0.16 | learning rate: 6.373E-05 | global batch size: 256 | lm loss: 3.681979E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.545 | TFLOPs: 25.46 | +7: iteration 117150/ 173500 | consumed samples: 29990400 | consumed tokens: 61420339200 | elapsed time per iteration (s): 0.15 | learning rate: 6.372E-05 | global batch size: 256 | lm loss: 3.670251E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.222 | TFLOPs: 26.08 | +7: iteration 117160/ 173500 | consumed samples: 29992960 | consumed tokens: 61425582080 | elapsed time per iteration (s): 0.16 | learning rate: 6.370E-05 | global batch size: 256 | lm loss: 3.680641E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.467 | TFLOPs: 25.15 | +7: iteration 117170/ 173500 | consumed samples: 29995520 | consumed tokens: 61430824960 | elapsed time per iteration (s): 0.16 | learning rate: 6.369E-05 | global batch size: 256 | lm loss: 3.667571E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.837 | TFLOPs: 25.48 | +7: iteration 117180/ 173500 | consumed samples: 29998080 | consumed tokens: 61436067840 | elapsed time per iteration (s): 0.16 | learning rate: 6.367E-05 | global batch size: 256 | lm loss: 3.667634E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.756 | TFLOPs: 25.73 | +7: iteration 117190/ 173500 | consumed samples: 30000640 | consumed tokens: 61441310720 | elapsed time per iteration (s): 0.16 | learning rate: 6.366E-05 | global batch size: 256 | lm loss: 3.682080E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.938 | TFLOPs: 24.97 | +7: iteration 117200/ 173500 | consumed samples: 30003200 | consumed tokens: 61446553600 | elapsed time per iteration (s): 0.16 | learning rate: 6.365E-05 | global batch size: 256 | lm loss: 3.676424E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.681 | TFLOPs: 25.76 | +7: iteration 117210/ 173500 | consumed samples: 30005760 | consumed tokens: 61451796480 | elapsed time per iteration (s): 0.17 | learning rate: 6.363E-05 | global batch size: 256 | lm loss: 3.677861E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.204 | TFLOPs: 24.12 | +7: iteration 117220/ 173500 | consumed samples: 30008320 | consumed tokens: 61457039360 | elapsed time per iteration (s): 0.15 | learning rate: 6.362E-05 | global batch size: 256 | lm loss: 3.674514E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.662 | TFLOPs: 25.98 | +7: iteration 117230/ 173500 | consumed samples: 30010880 | consumed tokens: 61462282240 | elapsed time per iteration (s): 0.16 | learning rate: 6.360E-05 | global batch size: 256 | lm loss: 3.673919E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.535 | TFLOPs: 25.35 | +7: iteration 117240/ 173500 | consumed samples: 30013440 | consumed tokens: 61467525120 | elapsed time per iteration (s): 0.16 | learning rate: 6.359E-05 | global batch size: 256 | lm loss: 3.687371E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.197 | TFLOPs: 24.94 | +7: iteration 117250/ 173500 | consumed samples: 30016000 | consumed tokens: 61472768000 | elapsed time per iteration (s): 0.16 | learning rate: 6.358E-05 | global batch size: 256 | lm loss: 3.689658E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.614 | TFLOPs: 24.90 | +7: iteration 117260/ 173500 | consumed samples: 30018560 | consumed tokens: 61478010880 | elapsed time per iteration (s): 0.16 | learning rate: 6.356E-05 | global batch size: 256 | lm loss: 3.667678E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.681 | TFLOPs: 24.44 | +7: iteration 117270/ 173500 | consumed samples: 30021120 | consumed tokens: 61483253760 | elapsed time per iteration (s): 0.16 | learning rate: 6.355E-05 | global batch size: 256 | lm loss: 3.685437E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.785 | TFLOPs: 24.67 | +7: iteration 117280/ 173500 | consumed samples: 30023680 | consumed tokens: 61488496640 | elapsed time per iteration (s): 0.16 | learning rate: 6.353E-05 | global batch size: 256 | lm loss: 3.679826E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.210 | TFLOPs: 24.58 | +7: iteration 117290/ 173500 | consumed samples: 30026240 | consumed tokens: 61493739520 | elapsed time per iteration (s): 0.17 | learning rate: 6.352E-05 | global batch size: 256 | lm loss: 3.674916E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.450 | TFLOPs: 23.89 | +7: iteration 117300/ 173500 | consumed samples: 30028800 | consumed tokens: 61498982400 | elapsed time per iteration (s): 0.16 | learning rate: 6.351E-05 | global batch size: 256 | lm loss: 3.675498E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.211 | TFLOPs: 25.88 | +7: iteration 117310/ 173500 | consumed samples: 30031360 | consumed tokens: 61504225280 | elapsed time per iteration (s): 0.16 | learning rate: 6.349E-05 | global batch size: 256 | lm loss: 3.680456E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.160 | TFLOPs: 25.85 | +7: iteration 117320/ 173500 | consumed samples: 30033920 | consumed tokens: 61509468160 | elapsed time per iteration (s): 0.16 | learning rate: 6.348E-05 | global batch size: 256 | lm loss: 3.680828E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.860 | TFLOPs: 25.01 | +7: iteration 117330/ 173500 | consumed samples: 30036480 | consumed tokens: 61514711040 | elapsed time per iteration (s): 0.16 | learning rate: 6.346E-05 | global batch size: 256 | lm loss: 3.682869E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.144 | TFLOPs: 25.69 | +7: iteration 117340/ 173500 | consumed samples: 30039040 | consumed tokens: 61519953920 | elapsed time per iteration (s): 0.16 | learning rate: 6.345E-05 | global batch size: 256 | lm loss: 3.665751E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.306 | TFLOPs: 25.33 | +7: iteration 117350/ 173500 | consumed samples: 30041600 | consumed tokens: 61525196800 | elapsed time per iteration (s): 0.16 | learning rate: 6.343E-05 | global batch size: 256 | lm loss: 3.685462E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.821 | TFLOPs: 25.37 | +7: iteration 117360/ 173500 | consumed samples: 30044160 | consumed tokens: 61530439680 | elapsed time per iteration (s): 0.15 | learning rate: 6.342E-05 | global batch size: 256 | lm loss: 3.691124E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.031 | TFLOPs: 26.10 | +7: iteration 117370/ 173500 | consumed samples: 30046720 | consumed tokens: 61535682560 | elapsed time per iteration (s): 0.16 | learning rate: 6.341E-05 | global batch size: 256 | lm loss: 3.679431E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.292 | TFLOPs: 25.44 | +7: iteration 117380/ 173500 | consumed samples: 30049280 | consumed tokens: 61540925440 | elapsed time per iteration (s): 0.15 | learning rate: 6.339E-05 | global batch size: 256 | lm loss: 3.674435E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.253 | TFLOPs: 26.08 | +7: iteration 117390/ 173500 | consumed samples: 30051840 | consumed tokens: 61546168320 | elapsed time per iteration (s): 0.16 | learning rate: 6.338E-05 | global batch size: 256 | lm loss: 3.684980E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.755 | TFLOPs: 25.61 | +7: iteration 117400/ 173500 | consumed samples: 30054400 | consumed tokens: 61551411200 | elapsed time per iteration (s): 0.15 | learning rate: 6.336E-05 | global batch size: 256 | lm loss: 3.676857E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.830 | TFLOPs: 26.11 | +7: iteration 117410/ 173500 | consumed samples: 30056960 | consumed tokens: 61556654080 | elapsed time per iteration (s): 0.16 | learning rate: 6.335E-05 | global batch size: 256 | lm loss: 3.689308E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.253 | TFLOPs: 25.50 | +7: iteration 117420/ 173500 | consumed samples: 30059520 | consumed tokens: 61561896960 | elapsed time per iteration (s): 0.16 | learning rate: 6.334E-05 | global batch size: 256 | lm loss: 3.666679E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.477 | TFLOPs: 25.79 | +7: iteration 117430/ 173500 | consumed samples: 30062080 | consumed tokens: 61567139840 | elapsed time per iteration (s): 0.16 | learning rate: 6.332E-05 | global batch size: 256 | lm loss: 3.689558E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.422 | TFLOPs: 24.88 | +7: iteration 117440/ 173500 | consumed samples: 30064640 | consumed tokens: 61572382720 | elapsed time per iteration (s): 0.16 | learning rate: 6.331E-05 | global batch size: 256 | lm loss: 3.682028E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.542 | TFLOPs: 24.74 | +7: iteration 117450/ 173500 | consumed samples: 30067200 | consumed tokens: 61577625600 | elapsed time per iteration (s): 0.16 | learning rate: 6.329E-05 | global batch size: 256 | lm loss: 3.671576E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.222 | TFLOPs: 25.64 | +7: iteration 117460/ 173500 | consumed samples: 30069760 | consumed tokens: 61582868480 | elapsed time per iteration (s): 0.15 | learning rate: 6.328E-05 | global batch size: 256 | lm loss: 3.672277E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.684 | TFLOPs: 25.90 | +7: iteration 117470/ 173500 | consumed samples: 30072320 | consumed tokens: 61588111360 | elapsed time per iteration (s): 0.16 | learning rate: 6.327E-05 | global batch size: 256 | lm loss: 3.682652E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.690 | TFLOPs: 25.46 | +7: iteration 117480/ 173500 | consumed samples: 30074880 | consumed tokens: 61593354240 | elapsed time per iteration (s): 0.15 | learning rate: 6.325E-05 | global batch size: 256 | lm loss: 3.661248E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.218 | TFLOPs: 26.10 | +7: iteration 117490/ 173500 | consumed samples: 30077440 | consumed tokens: 61598597120 | elapsed time per iteration (s): 0.16 | learning rate: 6.324E-05 | global batch size: 256 | lm loss: 3.686918E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.781 | TFLOPs: 25.07 | +7: iteration 117500/ 173500 | consumed samples: 30080000 | consumed tokens: 61603840000 | elapsed time per iteration (s): 0.15 | learning rate: 6.322E-05 | global batch size: 256 | lm loss: 3.673984E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.754 | TFLOPs: 26.08 | +7: iteration 117510/ 173500 | consumed samples: 30082560 | consumed tokens: 61609082880 | elapsed time per iteration (s): 0.15 | learning rate: 6.321E-05 | global batch size: 256 | lm loss: 3.693610E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.606 | TFLOPs: 26.03 | +7: iteration 117520/ 173500 | consumed samples: 30085120 | consumed tokens: 61614325760 | elapsed time per iteration (s): 0.16 | learning rate: 6.320E-05 | global batch size: 256 | lm loss: 3.676100E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.378 | TFLOPs: 25.62 | +7: iteration 117530/ 173500 | consumed samples: 30087680 | consumed tokens: 61619568640 | elapsed time per iteration (s): 0.16 | learning rate: 6.318E-05 | global batch size: 256 | lm loss: 3.665101E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.575 | TFLOPs: 24.93 | +7: iteration 117540/ 173500 | consumed samples: 30090240 | consumed tokens: 61624811520 | elapsed time per iteration (s): 0.15 | learning rate: 6.317E-05 | global batch size: 256 | lm loss: 3.680401E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.137 | TFLOPs: 26.10 | +7: iteration 117550/ 173500 | consumed samples: 30092800 | consumed tokens: 61630054400 | elapsed time per iteration (s): 0.17 | learning rate: 6.315E-05 | global batch size: 256 | lm loss: 3.682006E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.245 | TFLOPs: 23.79 | +7: iteration 117560/ 173500 | consumed samples: 30095360 | consumed tokens: 61635297280 | elapsed time per iteration (s): 0.15 | learning rate: 6.314E-05 | global batch size: 256 | lm loss: 3.679575E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.923 | TFLOPs: 26.11 | +7: iteration 117570/ 173500 | consumed samples: 30097920 | consumed tokens: 61640540160 | elapsed time per iteration (s): 0.15 | learning rate: 6.313E-05 | global batch size: 256 | lm loss: 3.686030E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.116 | TFLOPs: 26.05 | +7: iteration 117580/ 173500 | consumed samples: 30100480 | consumed tokens: 61645783040 | elapsed time per iteration (s): 0.15 | learning rate: 6.311E-05 | global batch size: 256 | lm loss: 3.680405E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.373 | TFLOPs: 26.10 | +7: iteration 117590/ 173500 | consumed samples: 30103040 | consumed tokens: 61651025920 | elapsed time per iteration (s): 0.16 | learning rate: 6.310E-05 | global batch size: 256 | lm loss: 3.682552E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.179 | TFLOPs: 25.85 | +7: iteration 117600/ 173500 | consumed samples: 30105600 | consumed tokens: 61656268800 | elapsed time per iteration (s): 0.16 | learning rate: 6.308E-05 | global batch size: 256 | lm loss: 3.664970E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.394 | TFLOPs: 25.79 | +7: iteration 117610/ 173500 | consumed samples: 30108160 | consumed tokens: 61661511680 | elapsed time per iteration (s): 0.15 | learning rate: 6.307E-05 | global batch size: 256 | lm loss: 3.670252E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.504 | TFLOPs: 26.10 | +7: iteration 117620/ 173500 | consumed samples: 30110720 | consumed tokens: 61666754560 | elapsed time per iteration (s): 0.16 | learning rate: 6.305E-05 | global batch size: 256 | lm loss: 3.688173E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.470 | TFLOPs: 25.27 | +7: iteration 117630/ 173500 | consumed samples: 30113280 | consumed tokens: 61671997440 | elapsed time per iteration (s): 0.15 | learning rate: 6.304E-05 | global batch size: 256 | lm loss: 3.682817E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.135 | TFLOPs: 25.94 | +7: iteration 117640/ 173500 | consumed samples: 30115840 | consumed tokens: 61677240320 | elapsed time per iteration (s): 0.16 | learning rate: 6.303E-05 | global batch size: 256 | lm loss: 3.683555E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.575 | TFLOPs: 25.81 | +7: iteration 117650/ 173500 | consumed samples: 30118400 | consumed tokens: 61682483200 | elapsed time per iteration (s): 0.16 | learning rate: 6.301E-05 | global batch size: 256 | lm loss: 3.688742E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.413 | TFLOPs: 25.32 | +7: iteration 117660/ 173500 | consumed samples: 30120960 | consumed tokens: 61687726080 | elapsed time per iteration (s): 0.16 | learning rate: 6.300E-05 | global batch size: 256 | lm loss: 3.683085E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.866 | TFLOPs: 25.73 | +7: iteration 117670/ 173500 | consumed samples: 30123520 | consumed tokens: 61692968960 | elapsed time per iteration (s): 0.16 | learning rate: 6.298E-05 | global batch size: 256 | lm loss: 3.666015E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.754 | TFLOPs: 25.62 | +7: iteration 117680/ 173500 | consumed samples: 30126080 | consumed tokens: 61698211840 | elapsed time per iteration (s): 0.15 | learning rate: 6.297E-05 | global batch size: 256 | lm loss: 3.676079E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.092 | TFLOPs: 25.92 | +7: iteration 117690/ 173500 | consumed samples: 30128640 | consumed tokens: 61703454720 | elapsed time per iteration (s): 0.15 | learning rate: 6.296E-05 | global batch size: 256 | lm loss: 3.675340E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.222 | TFLOPs: 25.94 | +7: iteration 117700/ 173500 | consumed samples: 30131200 | consumed tokens: 61708697600 | elapsed time per iteration (s): 0.16 | learning rate: 6.294E-05 | global batch size: 256 | lm loss: 3.679004E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.459 | TFLOPs: 25.88 | +7: iteration 117710/ 173500 | consumed samples: 30133760 | consumed tokens: 61713940480 | elapsed time per iteration (s): 0.16 | learning rate: 6.293E-05 | global batch size: 256 | lm loss: 3.683362E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.013 | TFLOPs: 25.72 | +7: iteration 117720/ 173500 | consumed samples: 30136320 | consumed tokens: 61719183360 | elapsed time per iteration (s): 0.16 | learning rate: 6.291E-05 | global batch size: 256 | lm loss: 3.680233E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.194 | TFLOPs: 25.88 | +7: iteration 117730/ 173500 | consumed samples: 30138880 | consumed tokens: 61724426240 | elapsed time per iteration (s): 0.16 | learning rate: 6.290E-05 | global batch size: 256 | lm loss: 3.684437E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.649 | TFLOPs: 25.54 | +7: iteration 117740/ 173500 | consumed samples: 30141440 | consumed tokens: 61729669120 | elapsed time per iteration (s): 0.15 | learning rate: 6.289E-05 | global batch size: 256 | lm loss: 3.674801E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.525 | TFLOPs: 25.92 | +7: iteration 117750/ 173500 | consumed samples: 30144000 | consumed tokens: 61734912000 | elapsed time per iteration (s): 0.16 | learning rate: 6.287E-05 | global batch size: 256 | lm loss: 3.665484E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.911 | TFLOPs: 25.81 | +7: iteration 117760/ 173500 | consumed samples: 30146560 | consumed tokens: 61740154880 | elapsed time per iteration (s): 0.16 | learning rate: 6.286E-05 | global batch size: 256 | lm loss: 3.681423E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.535 | TFLOPs: 25.27 | +7: iteration 117770/ 173500 | consumed samples: 30149120 | consumed tokens: 61745397760 | elapsed time per iteration (s): 0.16 | learning rate: 6.284E-05 | global batch size: 256 | lm loss: 3.663945E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.000 | TFLOPs: 25.63 | +7: iteration 117780/ 173500 | consumed samples: 30151680 | consumed tokens: 61750640640 | elapsed time per iteration (s): 0.15 | learning rate: 6.283E-05 | global batch size: 256 | lm loss: 3.676492E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.542 | TFLOPs: 26.07 | +7: iteration 117790/ 173500 | consumed samples: 30154240 | consumed tokens: 61755883520 | elapsed time per iteration (s): 0.16 | learning rate: 6.282E-05 | global batch size: 256 | lm loss: 3.673100E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.956 | TFLOPs: 25.53 | +7: iteration 117800/ 173500 | consumed samples: 30156800 | consumed tokens: 61761126400 | elapsed time per iteration (s): 0.16 | learning rate: 6.280E-05 | global batch size: 256 | lm loss: 3.677520E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.395 | TFLOPs: 25.63 | +7: iteration 117810/ 173500 | consumed samples: 30159360 | consumed tokens: 61766369280 | elapsed time per iteration (s): 0.15 | learning rate: 6.279E-05 | global batch size: 256 | lm loss: 3.675239E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.062 | TFLOPs: 26.00 | +7: iteration 117820/ 173500 | consumed samples: 30161920 | consumed tokens: 61771612160 | elapsed time per iteration (s): 0.15 | learning rate: 6.277E-05 | global batch size: 256 | lm loss: 3.679401E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.853 | TFLOPs: 25.95 | +7: iteration 117830/ 173500 | consumed samples: 30164480 | consumed tokens: 61776855040 | elapsed time per iteration (s): 0.15 | learning rate: 6.276E-05 | global batch size: 256 | lm loss: 3.672969E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.298 | TFLOPs: 25.99 | +7: iteration 117840/ 173500 | consumed samples: 30167040 | consumed tokens: 61782097920 | elapsed time per iteration (s): 0.16 | learning rate: 6.275E-05 | global batch size: 256 | lm loss: 3.682336E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.939 | TFLOPs: 25.75 | +7: iteration 117850/ 173500 | consumed samples: 30169600 | consumed tokens: 61787340800 | elapsed time per iteration (s): 0.16 | learning rate: 6.273E-05 | global batch size: 256 | lm loss: 3.680032E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.877 | TFLOPs: 25.54 | +7: iteration 117860/ 173500 | consumed samples: 30172160 | consumed tokens: 61792583680 | elapsed time per iteration (s): 0.16 | learning rate: 6.272E-05 | global batch size: 256 | lm loss: 3.682200E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.898 | TFLOPs: 25.75 | +7: iteration 117870/ 173500 | consumed samples: 30174720 | consumed tokens: 61797826560 | elapsed time per iteration (s): 0.16 | learning rate: 6.270E-05 | global batch size: 256 | lm loss: 3.676572E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.284 | TFLOPs: 24.91 | +7: iteration 117880/ 173500 | consumed samples: 30177280 | consumed tokens: 61803069440 | elapsed time per iteration (s): 0.16 | learning rate: 6.269E-05 | global batch size: 256 | lm loss: 3.678233E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.699 | TFLOPs: 25.82 | +7: iteration 117890/ 173500 | consumed samples: 30179840 | consumed tokens: 61808312320 | elapsed time per iteration (s): 0.16 | learning rate: 6.268E-05 | global batch size: 256 | lm loss: 3.676049E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.829 | TFLOPs: 25.89 | +7: iteration 117900/ 173500 | consumed samples: 30182400 | consumed tokens: 61813555200 | elapsed time per iteration (s): 0.16 | learning rate: 6.266E-05 | global batch size: 256 | lm loss: 3.690276E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.014 | TFLOPs: 25.74 | +7: iteration 117910/ 173500 | consumed samples: 30184960 | consumed tokens: 61818798080 | elapsed time per iteration (s): 0.16 | learning rate: 6.265E-05 | global batch size: 256 | lm loss: 3.676311E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.482 | TFLOPs: 25.90 | +7: iteration 117920/ 173500 | consumed samples: 30187520 | consumed tokens: 61824040960 | elapsed time per iteration (s): 0.16 | learning rate: 6.263E-05 | global batch size: 256 | lm loss: 3.675360E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.062 | TFLOPs: 25.59 | +7: iteration 117930/ 173500 | consumed samples: 30190080 | consumed tokens: 61829283840 | elapsed time per iteration (s): 0.15 | learning rate: 6.262E-05 | global batch size: 256 | lm loss: 3.674986E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.741 | TFLOPs: 25.92 | +7: iteration 117940/ 173500 | consumed samples: 30192640 | consumed tokens: 61834526720 | elapsed time per iteration (s): 0.15 | learning rate: 6.261E-05 | global batch size: 256 | lm loss: 3.682877E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.646 | TFLOPs: 26.17 | +7: iteration 117950/ 173500 | consumed samples: 30195200 | consumed tokens: 61839769600 | elapsed time per iteration (s): 0.16 | learning rate: 6.259E-05 | global batch size: 256 | lm loss: 3.683459E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.203 | TFLOPs: 25.74 | +7: iteration 117960/ 173500 | consumed samples: 30197760 | consumed tokens: 61845012480 | elapsed time per iteration (s): 0.15 | learning rate: 6.258E-05 | global batch size: 256 | lm loss: 3.687086E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.024 | TFLOPs: 26.17 | +7: iteration 117970/ 173500 | consumed samples: 30200320 | consumed tokens: 61850255360 | elapsed time per iteration (s): 0.15 | learning rate: 6.256E-05 | global batch size: 256 | lm loss: 3.681612E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.580 | TFLOPs: 26.21 | +7: iteration 117980/ 173500 | consumed samples: 30202880 | consumed tokens: 61855498240 | elapsed time per iteration (s): 0.15 | learning rate: 6.255E-05 | global batch size: 256 | lm loss: 3.680956E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.071 | TFLOPs: 26.05 | +7: iteration 117990/ 173500 | consumed samples: 30205440 | consumed tokens: 61860741120 | elapsed time per iteration (s): 0.15 | learning rate: 6.254E-05 | global batch size: 256 | lm loss: 3.679409E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.911 | TFLOPs: 26.19 | +0: [2023-03-17 05:22:47,621] [INFO] [logging.py:68:log_dist] [Rank 0] step=118000, skipped=0, lr=[6.252226684525562e-05, 6.252226684525562e-05, 6.252226684525562e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 118000/ 173500 | consumed samples: 30208000 | consumed tokens: 61865984000 | elapsed time per iteration (s): 0.15 | learning rate: 6.252E-05 | global batch size: 256 | lm loss: 3.667208E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.948 | TFLOPs: 26.17 | +0: steps: 118000 loss: 3.6493 iter time (s): 0.156 samples/sec: 1642.399 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 118000 | lm loss value: 3.842772E+00 | lm loss PPL: 4.665460E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 118000 to checkpoints_44m91b100m +0: [2023-03-17 05:22:47,696] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step118000 is begin to save! +0: [2023-03-17 05:22:47,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:22:47,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:22:47,771] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:22:47,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:22:47,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:22:47,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:22:47,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:22:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:22:47,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:22:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:22:47,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:22:47,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:22:47,813] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:22:47,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:22:47,821] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:22:47,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:22:47,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:22:47,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:22:47,837] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:22:47,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:22:47,838] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step118000/mp_rank_00_model_states.pt +0: [2023-03-17 05:22:47,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:22:47,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:22:47,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:22:47,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:22:47,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +2: [2023-03-17 05:22:47,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +1: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +5: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +7: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +4: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +6: [2023-03-17 05:22:47,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:22:47,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +3: [2023-03-17 05:22:47,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:22:47,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step118000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:22:47,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step118000 is ready now! +0: successfully saved checkpoint at iteration 118000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 189.98 +7: iteration 118010/ 173500 | consumed samples: 30210560 | consumed tokens: 61871226880 | elapsed time per iteration (s): 0.18 | learning rate: 6.251E-05 | global batch size: 256 | lm loss: 3.676381E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.891 | TFLOPs: 21.75 | +7: iteration 118020/ 173500 | consumed samples: 30213120 | consumed tokens: 61876469760 | elapsed time per iteration (s): 0.16 | learning rate: 6.249E-05 | global batch size: 256 | lm loss: 3.683890E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.752 | TFLOPs: 25.12 | +7: iteration 118030/ 173500 | consumed samples: 30215680 | consumed tokens: 61881712640 | elapsed time per iteration (s): 0.16 | learning rate: 6.248E-05 | global batch size: 256 | lm loss: 3.686321E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.660 | TFLOPs: 25.32 | +7: iteration 118040/ 173500 | consumed samples: 30218240 | consumed tokens: 61886955520 | elapsed time per iteration (s): 0.16 | learning rate: 6.247E-05 | global batch size: 256 | lm loss: 3.679706E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.709 | TFLOPs: 24.70 | +7: iteration 118050/ 173500 | consumed samples: 30220800 | consumed tokens: 61892198400 | elapsed time per iteration (s): 0.16 | learning rate: 6.245E-05 | global batch size: 256 | lm loss: 3.679826E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.115 | TFLOPs: 24.94 | +7: iteration 118060/ 173500 | consumed samples: 30223360 | consumed tokens: 61897441280 | elapsed time per iteration (s): 0.15 | learning rate: 6.244E-05 | global batch size: 256 | lm loss: 3.671852E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.910 | TFLOPs: 26.06 | +7: iteration 118070/ 173500 | consumed samples: 30225920 | consumed tokens: 61902684160 | elapsed time per iteration (s): 0.16 | learning rate: 6.242E-05 | global batch size: 256 | lm loss: 3.680285E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.247 | TFLOPs: 25.71 | +7: iteration 118080/ 173500 | consumed samples: 30228480 | consumed tokens: 61907927040 | elapsed time per iteration (s): 0.15 | learning rate: 6.241E-05 | global batch size: 256 | lm loss: 3.664182E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.696 | TFLOPs: 26.11 | +7: iteration 118090/ 173500 | consumed samples: 30231040 | consumed tokens: 61913169920 | elapsed time per iteration (s): 0.15 | learning rate: 6.240E-05 | global batch size: 256 | lm loss: 3.669682E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.023 | TFLOPs: 26.06 | +7: iteration 118100/ 173500 | consumed samples: 30233600 | consumed tokens: 61918412800 | elapsed time per iteration (s): 0.15 | learning rate: 6.238E-05 | global batch size: 256 | lm loss: 3.679594E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.031 | TFLOPs: 26.13 | +7: iteration 118110/ 173500 | consumed samples: 30236160 | consumed tokens: 61923655680 | elapsed time per iteration (s): 0.16 | learning rate: 6.237E-05 | global batch size: 256 | lm loss: 3.681933E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.130 | TFLOPs: 25.82 | +7: iteration 118120/ 173500 | consumed samples: 30238720 | consumed tokens: 61928898560 | elapsed time per iteration (s): 0.15 | learning rate: 6.235E-05 | global batch size: 256 | lm loss: 3.687905E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.123 | TFLOPs: 26.13 | +7: iteration 118130/ 173500 | consumed samples: 30241280 | consumed tokens: 61934141440 | elapsed time per iteration (s): 0.15 | learning rate: 6.234E-05 | global batch size: 256 | lm loss: 3.687378E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.860 | TFLOPs: 26.09 | +7: iteration 118140/ 173500 | consumed samples: 30243840 | consumed tokens: 61939384320 | elapsed time per iteration (s): 0.15 | learning rate: 6.233E-05 | global batch size: 256 | lm loss: 3.678747E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.815 | TFLOPs: 26.16 | +7: iteration 118150/ 173500 | consumed samples: 30246400 | consumed tokens: 61944627200 | elapsed time per iteration (s): 0.15 | learning rate: 6.231E-05 | global batch size: 256 | lm loss: 3.670409E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.327 | TFLOPs: 26.12 | +7: iteration 118160/ 173500 | consumed samples: 30248960 | consumed tokens: 61949870080 | elapsed time per iteration (s): 0.15 | learning rate: 6.230E-05 | global batch size: 256 | lm loss: 3.678079E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.609 | TFLOPs: 26.09 | +7: iteration 118170/ 173500 | consumed samples: 30251520 | consumed tokens: 61955112960 | elapsed time per iteration (s): 0.15 | learning rate: 6.228E-05 | global batch size: 256 | lm loss: 3.659835E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.494 | TFLOPs: 26.17 | +7: iteration 118180/ 173500 | consumed samples: 30254080 | consumed tokens: 61960355840 | elapsed time per iteration (s): 0.15 | learning rate: 6.227E-05 | global batch size: 256 | lm loss: 3.686829E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.171 | TFLOPs: 26.18 | +7: iteration 118190/ 173500 | consumed samples: 30256640 | consumed tokens: 61965598720 | elapsed time per iteration (s): 0.15 | learning rate: 6.226E-05 | global batch size: 256 | lm loss: 3.684399E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.649 | TFLOPs: 26.18 | +7: iteration 118200/ 173500 | consumed samples: 30259200 | consumed tokens: 61970841600 | elapsed time per iteration (s): 0.16 | learning rate: 6.224E-05 | global batch size: 256 | lm loss: 3.699246E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.940 | TFLOPs: 25.26 | +7: iteration 118210/ 173500 | consumed samples: 30261760 | consumed tokens: 61976084480 | elapsed time per iteration (s): 0.15 | learning rate: 6.223E-05 | global batch size: 256 | lm loss: 3.668691E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.376 | TFLOPs: 25.93 | +7: iteration 118220/ 173500 | consumed samples: 30264320 | consumed tokens: 61981327360 | elapsed time per iteration (s): 0.16 | learning rate: 6.221E-05 | global batch size: 256 | lm loss: 3.678581E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.172 | TFLOPs: 25.58 | +7: iteration 118230/ 173500 | consumed samples: 30266880 | consumed tokens: 61986570240 | elapsed time per iteration (s): 0.16 | learning rate: 6.220E-05 | global batch size: 256 | lm loss: 3.679919E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.302 | TFLOPs: 25.68 | +7: iteration 118240/ 173500 | consumed samples: 30269440 | consumed tokens: 61991813120 | elapsed time per iteration (s): 0.15 | learning rate: 6.219E-05 | global batch size: 256 | lm loss: 3.675359E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.887 | TFLOPs: 26.20 | +7: iteration 118250/ 173500 | consumed samples: 30272000 | consumed tokens: 61997056000 | elapsed time per iteration (s): 0.16 | learning rate: 6.217E-05 | global batch size: 256 | lm loss: 3.685759E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.998 | TFLOPs: 25.69 | +7: iteration 118260/ 173500 | consumed samples: 30274560 | consumed tokens: 62002298880 | elapsed time per iteration (s): 0.15 | learning rate: 6.216E-05 | global batch size: 256 | lm loss: 3.685991E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.633 | TFLOPs: 26.22 | +7: iteration 118270/ 173500 | consumed samples: 30277120 | consumed tokens: 62007541760 | elapsed time per iteration (s): 0.15 | learning rate: 6.215E-05 | global batch size: 256 | lm loss: 3.668611E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.070 | TFLOPs: 25.92 | +7: iteration 118280/ 173500 | consumed samples: 30279680 | consumed tokens: 62012784640 | elapsed time per iteration (s): 0.16 | learning rate: 6.213E-05 | global batch size: 256 | lm loss: 3.672451E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.109 | TFLOPs: 25.34 | +7: iteration 118290/ 173500 | consumed samples: 30282240 | consumed tokens: 62018027520 | elapsed time per iteration (s): 0.16 | learning rate: 6.212E-05 | global batch size: 256 | lm loss: 3.675523E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.966 | TFLOPs: 25.80 | +7: iteration 118300/ 173500 | consumed samples: 30284800 | consumed tokens: 62023270400 | elapsed time per iteration (s): 0.15 | learning rate: 6.210E-05 | global batch size: 256 | lm loss: 3.676831E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.372 | TFLOPs: 26.23 | +7: iteration 118310/ 173500 | consumed samples: 30287360 | consumed tokens: 62028513280 | elapsed time per iteration (s): 0.16 | learning rate: 6.209E-05 | global batch size: 256 | lm loss: 3.683815E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.453 | TFLOPs: 25.82 | +7: iteration 118320/ 173500 | consumed samples: 30289920 | consumed tokens: 62033756160 | elapsed time per iteration (s): 0.16 | learning rate: 6.208E-05 | global batch size: 256 | lm loss: 3.683155E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.811 | TFLOPs: 25.25 | +7: iteration 118330/ 173500 | consumed samples: 30292480 | consumed tokens: 62038999040 | elapsed time per iteration (s): 0.16 | learning rate: 6.206E-05 | global batch size: 256 | lm loss: 3.675685E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.664 | TFLOPs: 25.59 | +7: iteration 118340/ 173500 | consumed samples: 30295040 | consumed tokens: 62044241920 | elapsed time per iteration (s): 0.16 | learning rate: 6.205E-05 | global batch size: 256 | lm loss: 3.674706E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.726 | TFLOPs: 25.35 | +7: iteration 118350/ 173500 | consumed samples: 30297600 | consumed tokens: 62049484800 | elapsed time per iteration (s): 0.16 | learning rate: 6.203E-05 | global batch size: 256 | lm loss: 3.681816E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.966 | TFLOPs: 25.36 | +7: iteration 118360/ 173500 | consumed samples: 30300160 | consumed tokens: 62054727680 | elapsed time per iteration (s): 0.15 | learning rate: 6.202E-05 | global batch size: 256 | lm loss: 3.675103E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.717 | TFLOPs: 26.19 | +7: iteration 118370/ 173500 | consumed samples: 30302720 | consumed tokens: 62059970560 | elapsed time per iteration (s): 0.16 | learning rate: 6.201E-05 | global batch size: 256 | lm loss: 3.661724E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.619 | TFLOPs: 25.20 | +7: iteration 118380/ 173500 | consumed samples: 30305280 | consumed tokens: 62065213440 | elapsed time per iteration (s): 0.15 | learning rate: 6.199E-05 | global batch size: 256 | lm loss: 3.671992E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.771 | TFLOPs: 26.19 | +7: iteration 118390/ 173500 | consumed samples: 30307840 | consumed tokens: 62070456320 | elapsed time per iteration (s): 0.15 | learning rate: 6.198E-05 | global batch size: 256 | lm loss: 3.680226E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.665 | TFLOPs: 26.15 | +7: iteration 118400/ 173500 | consumed samples: 30310400 | consumed tokens: 62075699200 | elapsed time per iteration (s): 0.16 | learning rate: 6.196E-05 | global batch size: 256 | lm loss: 3.682646E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.068 | TFLOPs: 25.34 | +7: iteration 118410/ 173500 | consumed samples: 30312960 | consumed tokens: 62080942080 | elapsed time per iteration (s): 0.16 | learning rate: 6.195E-05 | global batch size: 256 | lm loss: 3.677157E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.090 | TFLOPs: 25.86 | +7: iteration 118420/ 173500 | consumed samples: 30315520 | consumed tokens: 62086184960 | elapsed time per iteration (s): 0.15 | learning rate: 6.194E-05 | global batch size: 256 | lm loss: 3.673909E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.144 | TFLOPs: 25.99 | +7: iteration 118430/ 173500 | consumed samples: 30318080 | consumed tokens: 62091427840 | elapsed time per iteration (s): 0.16 | learning rate: 6.192E-05 | global batch size: 256 | lm loss: 3.679605E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.781 | TFLOPs: 25.17 | +7: iteration 118440/ 173500 | consumed samples: 30320640 | consumed tokens: 62096670720 | elapsed time per iteration (s): 0.16 | learning rate: 6.191E-05 | global batch size: 256 | lm loss: 3.684747E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.621 | TFLOPs: 24.69 | +7: iteration 118450/ 173500 | consumed samples: 30323200 | consumed tokens: 62101913600 | elapsed time per iteration (s): 0.16 | learning rate: 6.189E-05 | global batch size: 256 | lm loss: 3.689482E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.656 | TFLOPs: 25.76 | +7: iteration 118460/ 173500 | consumed samples: 30325760 | consumed tokens: 62107156480 | elapsed time per iteration (s): 0.16 | learning rate: 6.188E-05 | global batch size: 256 | lm loss: 3.667590E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.602 | TFLOPs: 25.67 | +7: iteration 118470/ 173500 | consumed samples: 30328320 | consumed tokens: 62112399360 | elapsed time per iteration (s): 0.15 | learning rate: 6.187E-05 | global batch size: 256 | lm loss: 3.670733E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.042 | TFLOPs: 26.17 | +7: iteration 118480/ 173500 | consumed samples: 30330880 | consumed tokens: 62117642240 | elapsed time per iteration (s): 0.15 | learning rate: 6.185E-05 | global batch size: 256 | lm loss: 3.694626E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.016 | TFLOPs: 26.13 | +7: iteration 118490/ 173500 | consumed samples: 30333440 | consumed tokens: 62122885120 | elapsed time per iteration (s): 0.16 | learning rate: 6.184E-05 | global batch size: 256 | lm loss: 3.698741E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.228 | TFLOPs: 25.58 | +7: iteration 118500/ 173500 | consumed samples: 30336000 | consumed tokens: 62128128000 | elapsed time per iteration (s): 0.16 | learning rate: 6.183E-05 | global batch size: 256 | lm loss: 3.685044E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.456 | TFLOPs: 25.74 | +7: iteration 118510/ 173500 | consumed samples: 30338560 | consumed tokens: 62133370880 | elapsed time per iteration (s): 0.15 | learning rate: 6.181E-05 | global batch size: 256 | lm loss: 3.683076E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.097 | TFLOPs: 26.19 | +7: iteration 118520/ 173500 | consumed samples: 30341120 | consumed tokens: 62138613760 | elapsed time per iteration (s): 0.16 | learning rate: 6.180E-05 | global batch size: 256 | lm loss: 3.675619E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.171 | TFLOPs: 25.57 | +7: iteration 118530/ 173500 | consumed samples: 30343680 | consumed tokens: 62143856640 | elapsed time per iteration (s): 0.15 | learning rate: 6.178E-05 | global batch size: 256 | lm loss: 3.682547E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.640 | TFLOPs: 26.20 | +7: iteration 118540/ 173500 | consumed samples: 30346240 | consumed tokens: 62149099520 | elapsed time per iteration (s): 0.16 | learning rate: 6.177E-05 | global batch size: 256 | lm loss: 3.675761E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.829 | TFLOPs: 25.67 | +7: iteration 118550/ 173500 | consumed samples: 30348800 | consumed tokens: 62154342400 | elapsed time per iteration (s): 0.16 | learning rate: 6.176E-05 | global batch size: 256 | lm loss: 3.679286E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.865 | TFLOPs: 25.12 | +7: iteration 118560/ 173500 | consumed samples: 30351360 | consumed tokens: 62159585280 | elapsed time per iteration (s): 0.15 | learning rate: 6.174E-05 | global batch size: 256 | lm loss: 3.676029E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.853 | TFLOPs: 26.12 | +7: iteration 118570/ 173500 | consumed samples: 30353920 | consumed tokens: 62164828160 | elapsed time per iteration (s): 0.15 | learning rate: 6.173E-05 | global batch size: 256 | lm loss: 3.697488E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.465 | TFLOPs: 26.17 | +7: iteration 118580/ 173500 | consumed samples: 30356480 | consumed tokens: 62170071040 | elapsed time per iteration (s): 0.15 | learning rate: 6.171E-05 | global batch size: 256 | lm loss: 3.663686E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.316 | TFLOPs: 26.15 | +7: iteration 118590/ 173500 | consumed samples: 30359040 | consumed tokens: 62175313920 | elapsed time per iteration (s): 0.16 | learning rate: 6.170E-05 | global batch size: 256 | lm loss: 3.678126E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.292 | TFLOPs: 25.49 | +7: iteration 118600/ 173500 | consumed samples: 30361600 | consumed tokens: 62180556800 | elapsed time per iteration (s): 0.15 | learning rate: 6.169E-05 | global batch size: 256 | lm loss: 3.674552E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.828 | TFLOPs: 26.19 | +7: iteration 118610/ 173500 | consumed samples: 30364160 | consumed tokens: 62185799680 | elapsed time per iteration (s): 0.16 | learning rate: 6.167E-05 | global batch size: 256 | lm loss: 3.684436E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.109 | TFLOPs: 25.39 | +7: iteration 118620/ 173500 | consumed samples: 30366720 | consumed tokens: 62191042560 | elapsed time per iteration (s): 0.16 | learning rate: 6.166E-05 | global batch size: 256 | lm loss: 3.662544E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.138 | TFLOPs: 25.78 | +7: iteration 118630/ 173500 | consumed samples: 30369280 | consumed tokens: 62196285440 | elapsed time per iteration (s): 0.16 | learning rate: 6.164E-05 | global batch size: 256 | lm loss: 3.688738E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.378 | TFLOPs: 25.79 | +7: iteration 118640/ 173500 | consumed samples: 30371840 | consumed tokens: 62201528320 | elapsed time per iteration (s): 0.16 | learning rate: 6.163E-05 | global batch size: 256 | lm loss: 3.683199E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.719 | TFLOPs: 25.76 | +7: iteration 118650/ 173500 | consumed samples: 30374400 | consumed tokens: 62206771200 | elapsed time per iteration (s): 0.16 | learning rate: 6.162E-05 | global batch size: 256 | lm loss: 3.695984E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.958 | TFLOPs: 25.23 | +7: iteration 118660/ 173500 | consumed samples: 30376960 | consumed tokens: 62212014080 | elapsed time per iteration (s): 0.16 | learning rate: 6.160E-05 | global batch size: 256 | lm loss: 3.680334E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.318 | TFLOPs: 25.80 | +7: iteration 118670/ 173500 | consumed samples: 30379520 | consumed tokens: 62217256960 | elapsed time per iteration (s): 0.16 | learning rate: 6.159E-05 | global batch size: 256 | lm loss: 3.672365E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.401 | TFLOPs: 25.30 | +7: iteration 118680/ 173500 | consumed samples: 30382080 | consumed tokens: 62222499840 | elapsed time per iteration (s): 0.15 | learning rate: 6.158E-05 | global batch size: 256 | lm loss: 3.694540E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.430 | TFLOPs: 26.24 | +7: iteration 118690/ 173500 | consumed samples: 30384640 | consumed tokens: 62227742720 | elapsed time per iteration (s): 0.15 | learning rate: 6.156E-05 | global batch size: 256 | lm loss: 3.689807E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.186 | TFLOPs: 26.21 | +7: iteration 118700/ 173500 | consumed samples: 30387200 | consumed tokens: 62232985600 | elapsed time per iteration (s): 0.16 | learning rate: 6.155E-05 | global batch size: 256 | lm loss: 3.672286E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.357 | TFLOPs: 25.44 | +7: iteration 118710/ 173500 | consumed samples: 30389760 | consumed tokens: 62238228480 | elapsed time per iteration (s): 0.16 | learning rate: 6.153E-05 | global batch size: 256 | lm loss: 3.698051E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.133 | TFLOPs: 25.74 | +7: iteration 118720/ 173500 | consumed samples: 30392320 | consumed tokens: 62243471360 | elapsed time per iteration (s): 0.16 | learning rate: 6.152E-05 | global batch size: 256 | lm loss: 3.695358E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.734 | TFLOPs: 25.70 | +7: iteration 118730/ 173500 | consumed samples: 30394880 | consumed tokens: 62248714240 | elapsed time per iteration (s): 0.16 | learning rate: 6.151E-05 | global batch size: 256 | lm loss: 3.675412E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.096 | TFLOPs: 25.58 | +7: iteration 118740/ 173500 | consumed samples: 30397440 | consumed tokens: 62253957120 | elapsed time per iteration (s): 0.16 | learning rate: 6.149E-05 | global batch size: 256 | lm loss: 3.671450E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.272 | TFLOPs: 25.90 | +7: iteration 118750/ 173500 | consumed samples: 30400000 | consumed tokens: 62259200000 | elapsed time per iteration (s): 0.15 | learning rate: 6.148E-05 | global batch size: 256 | lm loss: 3.677142E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.964 | TFLOPs: 26.02 | +7: iteration 118760/ 173500 | consumed samples: 30402560 | consumed tokens: 62264442880 | elapsed time per iteration (s): 0.16 | learning rate: 6.146E-05 | global batch size: 256 | lm loss: 3.681409E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.365 | TFLOPs: 25.66 | +7: iteration 118770/ 173500 | consumed samples: 30405120 | consumed tokens: 62269685760 | elapsed time per iteration (s): 0.15 | learning rate: 6.145E-05 | global batch size: 256 | lm loss: 3.677824E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.050 | TFLOPs: 26.19 | +7: iteration 118780/ 173500 | consumed samples: 30407680 | consumed tokens: 62274928640 | elapsed time per iteration (s): 0.16 | learning rate: 6.144E-05 | global batch size: 256 | lm loss: 3.682018E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.063 | TFLOPs: 25.34 | +7: iteration 118790/ 173500 | consumed samples: 30410240 | consumed tokens: 62280171520 | elapsed time per iteration (s): 0.16 | learning rate: 6.142E-05 | global batch size: 256 | lm loss: 3.686491E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.809 | TFLOPs: 25.34 | +7: iteration 118800/ 173500 | consumed samples: 30412800 | consumed tokens: 62285414400 | elapsed time per iteration (s): 0.16 | learning rate: 6.141E-05 | global batch size: 256 | lm loss: 3.683819E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.056 | TFLOPs: 25.88 | +7: iteration 118810/ 173500 | consumed samples: 30415360 | consumed tokens: 62290657280 | elapsed time per iteration (s): 0.15 | learning rate: 6.139E-05 | global batch size: 256 | lm loss: 3.668811E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.583 | TFLOPs: 26.23 | +7: iteration 118820/ 173500 | consumed samples: 30417920 | consumed tokens: 62295900160 | elapsed time per iteration (s): 0.15 | learning rate: 6.138E-05 | global batch size: 256 | lm loss: 3.681174E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.748 | TFLOPs: 26.22 | +7: iteration 118830/ 173500 | consumed samples: 30420480 | consumed tokens: 62301143040 | elapsed time per iteration (s): 0.16 | learning rate: 6.137E-05 | global batch size: 256 | lm loss: 3.681609E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.180 | TFLOPs: 25.71 | +7: iteration 118840/ 173500 | consumed samples: 30423040 | consumed tokens: 62306385920 | elapsed time per iteration (s): 0.16 | learning rate: 6.135E-05 | global batch size: 256 | lm loss: 3.675384E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.725 | TFLOPs: 25.51 | +7: iteration 118850/ 173500 | consumed samples: 30425600 | consumed tokens: 62311628800 | elapsed time per iteration (s): 0.15 | learning rate: 6.134E-05 | global batch size: 256 | lm loss: 3.684570E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.424 | TFLOPs: 26.06 | +7: iteration 118860/ 173500 | consumed samples: 30428160 | consumed tokens: 62316871680 | elapsed time per iteration (s): 0.16 | learning rate: 6.133E-05 | global batch size: 256 | lm loss: 3.673592E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.551 | TFLOPs: 25.81 | +7: iteration 118870/ 173500 | consumed samples: 30430720 | consumed tokens: 62322114560 | elapsed time per iteration (s): 0.17 | learning rate: 6.131E-05 | global batch size: 256 | lm loss: 3.669069E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.660 | TFLOPs: 24.15 | +7: iteration 118880/ 173500 | consumed samples: 30433280 | consumed tokens: 62327357440 | elapsed time per iteration (s): 0.16 | learning rate: 6.130E-05 | global batch size: 256 | lm loss: 3.671296E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.756 | TFLOPs: 25.86 | +7: iteration 118890/ 173500 | consumed samples: 30435840 | consumed tokens: 62332600320 | elapsed time per iteration (s): 0.15 | learning rate: 6.128E-05 | global batch size: 256 | lm loss: 3.680512E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.333 | TFLOPs: 26.01 | +7: iteration 118900/ 173500 | consumed samples: 30438400 | consumed tokens: 62337843200 | elapsed time per iteration (s): 0.15 | learning rate: 6.127E-05 | global batch size: 256 | lm loss: 3.679713E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.660 | TFLOPs: 26.03 | +7: iteration 118910/ 173500 | consumed samples: 30440960 | consumed tokens: 62343086080 | elapsed time per iteration (s): 0.15 | learning rate: 6.126E-05 | global batch size: 256 | lm loss: 3.683165E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.947 | TFLOPs: 26.22 | +7: iteration 118920/ 173500 | consumed samples: 30443520 | consumed tokens: 62348328960 | elapsed time per iteration (s): 0.16 | learning rate: 6.124E-05 | global batch size: 256 | lm loss: 3.682810E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.802 | TFLOPs: 25.20 | +7: iteration 118930/ 173500 | consumed samples: 30446080 | consumed tokens: 62353571840 | elapsed time per iteration (s): 0.16 | learning rate: 6.123E-05 | global batch size: 256 | lm loss: 3.681606E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.545 | TFLOPs: 25.88 | +7: iteration 118940/ 173500 | consumed samples: 30448640 | consumed tokens: 62358814720 | elapsed time per iteration (s): 0.15 | learning rate: 6.121E-05 | global batch size: 256 | lm loss: 3.682439E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.644 | TFLOPs: 26.29 | +7: iteration 118950/ 173500 | consumed samples: 30451200 | consumed tokens: 62364057600 | elapsed time per iteration (s): 0.15 | learning rate: 6.120E-05 | global batch size: 256 | lm loss: 3.685854E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.690 | TFLOPs: 26.00 | +7: iteration 118960/ 173500 | consumed samples: 30453760 | consumed tokens: 62369300480 | elapsed time per iteration (s): 0.15 | learning rate: 6.119E-05 | global batch size: 256 | lm loss: 3.677348E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.269 | TFLOPs: 26.23 | +7: iteration 118970/ 173500 | consumed samples: 30456320 | consumed tokens: 62374543360 | elapsed time per iteration (s): 0.17 | learning rate: 6.117E-05 | global batch size: 256 | lm loss: 3.700826E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.773 | TFLOPs: 23.16 | +7: iteration 118980/ 173500 | consumed samples: 30458880 | consumed tokens: 62379786240 | elapsed time per iteration (s): 0.15 | learning rate: 6.116E-05 | global batch size: 256 | lm loss: 3.674790E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.453 | TFLOPs: 26.09 | +7: iteration 118990/ 173500 | consumed samples: 30461440 | consumed tokens: 62385029120 | elapsed time per iteration (s): 0.16 | learning rate: 6.115E-05 | global batch size: 256 | lm loss: 3.687746E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.860 | TFLOPs: 25.84 | +7: iteration 119000/ 173500 | consumed samples: 30464000 | consumed tokens: 62390272000 | elapsed time per iteration (s): 0.15 | learning rate: 6.113E-05 | global batch size: 256 | lm loss: 3.679069E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.522 | TFLOPs: 26.17 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 119000 | lm loss value: 3.850560E+00 | lm loss PPL: 4.701937E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 119000 to checkpoints_44m91b100m +0: [2023-03-17 05:25:23,893] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step119000 is begin to save! +0: [2023-03-17 05:25:23,898] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:25:23,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:25:23,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:25:23,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:25:23,973] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:25:23,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:25:23,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:25:23,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:25:23,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:25:23,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:25:23,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:25:24,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:25:24,006] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:25:24,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:25:24,014] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:25:24,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:25:24,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:25:24,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:25:24,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:25:24,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:25:24,031] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step119000/mp_rank_00_model_states.pt +0: [2023-03-17 05:25:24,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:25:24,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:25:24,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:25:24,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,068] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +5: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:25:24,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:25:24,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +3: [2023-03-17 05:25:24,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +6: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +4: [2023-03-17 05:25:24,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:25:24,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +7: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:25:24,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +2: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:25:24,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 05:25:24,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step119000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +1: [2023-03-17 05:25:24,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step119000 is ready now! +0: successfully saved checkpoint at iteration 119000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.54 +7: iteration 119010/ 173500 | consumed samples: 30466560 | consumed tokens: 62395514880 | elapsed time per iteration (s): 0.18 | learning rate: 6.112E-05 | global batch size: 256 | lm loss: 3.678122E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.108 | TFLOPs: 22.10 | +7: iteration 119020/ 173500 | consumed samples: 30469120 | consumed tokens: 62400757760 | elapsed time per iteration (s): 0.15 | learning rate: 6.110E-05 | global batch size: 256 | lm loss: 3.668742E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.954 | TFLOPs: 26.19 | +7: iteration 119030/ 173500 | consumed samples: 30471680 | consumed tokens: 62406000640 | elapsed time per iteration (s): 0.15 | learning rate: 6.109E-05 | global batch size: 256 | lm loss: 3.672278E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.519 | TFLOPs: 26.18 | +7: iteration 119040/ 173500 | consumed samples: 30474240 | consumed tokens: 62411243520 | elapsed time per iteration (s): 0.15 | learning rate: 6.108E-05 | global batch size: 256 | lm loss: 3.683257E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.326 | TFLOPs: 26.21 | +7: iteration 119050/ 173500 | consumed samples: 30476800 | consumed tokens: 62416486400 | elapsed time per iteration (s): 0.16 | learning rate: 6.106E-05 | global batch size: 256 | lm loss: 3.679285E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.962 | TFLOPs: 24.68 | +7: iteration 119060/ 173500 | consumed samples: 30479360 | consumed tokens: 62421729280 | elapsed time per iteration (s): 0.15 | learning rate: 6.105E-05 | global batch size: 256 | lm loss: 3.677133E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.599 | TFLOPs: 26.03 | +7: iteration 119070/ 173500 | consumed samples: 30481920 | consumed tokens: 62426972160 | elapsed time per iteration (s): 0.15 | learning rate: 6.104E-05 | global batch size: 256 | lm loss: 3.688430E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.490 | TFLOPs: 26.01 | +7: iteration 119080/ 173500 | consumed samples: 30484480 | consumed tokens: 62432215040 | elapsed time per iteration (s): 0.15 | learning rate: 6.102E-05 | global batch size: 256 | lm loss: 3.670058E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.439 | TFLOPs: 26.26 | +7: iteration 119090/ 173500 | consumed samples: 30487040 | consumed tokens: 62437457920 | elapsed time per iteration (s): 0.15 | learning rate: 6.101E-05 | global batch size: 256 | lm loss: 3.669269E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.470 | TFLOPs: 26.31 | +7: iteration 119100/ 173500 | consumed samples: 30489600 | consumed tokens: 62442700800 | elapsed time per iteration (s): 0.15 | learning rate: 6.099E-05 | global batch size: 256 | lm loss: 3.678925E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.923 | TFLOPs: 26.35 | +7: iteration 119110/ 173500 | consumed samples: 30492160 | consumed tokens: 62447943680 | elapsed time per iteration (s): 0.15 | learning rate: 6.098E-05 | global batch size: 256 | lm loss: 3.668163E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.823 | TFLOPs: 26.11 | +7: iteration 119120/ 173500 | consumed samples: 30494720 | consumed tokens: 62453186560 | elapsed time per iteration (s): 0.15 | learning rate: 6.097E-05 | global batch size: 256 | lm loss: 3.678747E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.634 | TFLOPs: 26.33 | +7: iteration 119130/ 173500 | consumed samples: 30497280 | consumed tokens: 62458429440 | elapsed time per iteration (s): 0.15 | learning rate: 6.095E-05 | global batch size: 256 | lm loss: 3.671732E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.667 | TFLOPs: 26.25 | +7: iteration 119140/ 173500 | consumed samples: 30499840 | consumed tokens: 62463672320 | elapsed time per iteration (s): 0.16 | learning rate: 6.094E-05 | global batch size: 256 | lm loss: 3.689649E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.065 | TFLOPs: 25.74 | +7: iteration 119150/ 173500 | consumed samples: 30502400 | consumed tokens: 62468915200 | elapsed time per iteration (s): 0.15 | learning rate: 6.092E-05 | global batch size: 256 | lm loss: 3.664767E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.076 | TFLOPs: 26.38 | +7: iteration 119160/ 173500 | consumed samples: 30504960 | consumed tokens: 62474158080 | elapsed time per iteration (s): 0.15 | learning rate: 6.091E-05 | global batch size: 256 | lm loss: 3.670809E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.445 | TFLOPs: 26.35 | +7: iteration 119170/ 173500 | consumed samples: 30507520 | consumed tokens: 62479400960 | elapsed time per iteration (s): 0.15 | learning rate: 6.090E-05 | global batch size: 256 | lm loss: 3.679551E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.008 | TFLOPs: 26.36 | +7: iteration 119180/ 173500 | consumed samples: 30510080 | consumed tokens: 62484643840 | elapsed time per iteration (s): 0.15 | learning rate: 6.088E-05 | global batch size: 256 | lm loss: 3.668214E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.810 | TFLOPs: 26.36 | +7: iteration 119190/ 173500 | consumed samples: 30512640 | consumed tokens: 62489886720 | elapsed time per iteration (s): 0.15 | learning rate: 6.087E-05 | global batch size: 256 | lm loss: 3.680173E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.868 | TFLOPs: 26.33 | +7: iteration 119200/ 173500 | consumed samples: 30515200 | consumed tokens: 62495129600 | elapsed time per iteration (s): 0.15 | learning rate: 6.086E-05 | global batch size: 256 | lm loss: 3.672655E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.989 | TFLOPs: 26.08 | +7: iteration 119210/ 173500 | consumed samples: 30517760 | consumed tokens: 62500372480 | elapsed time per iteration (s): 0.16 | learning rate: 6.084E-05 | global batch size: 256 | lm loss: 3.687265E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.695 | TFLOPs: 25.89 | +7: iteration 119220/ 173500 | consumed samples: 30520320 | consumed tokens: 62505615360 | elapsed time per iteration (s): 0.15 | learning rate: 6.083E-05 | global batch size: 256 | lm loss: 3.681374E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.516 | TFLOPs: 25.98 | +7: iteration 119230/ 173500 | consumed samples: 30522880 | consumed tokens: 62510858240 | elapsed time per iteration (s): 0.15 | learning rate: 6.081E-05 | global batch size: 256 | lm loss: 3.674927E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.280 | TFLOPs: 26.19 | +7: iteration 119240/ 173500 | consumed samples: 30525440 | consumed tokens: 62516101120 | elapsed time per iteration (s): 0.15 | learning rate: 6.080E-05 | global batch size: 256 | lm loss: 3.675835E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.512 | TFLOPs: 26.18 | +7: iteration 119250/ 173500 | consumed samples: 30528000 | consumed tokens: 62521344000 | elapsed time per iteration (s): 0.16 | learning rate: 6.079E-05 | global batch size: 256 | lm loss: 3.667373E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.753 | TFLOPs: 25.29 | +7: iteration 119260/ 173500 | consumed samples: 30530560 | consumed tokens: 62526586880 | elapsed time per iteration (s): 0.15 | learning rate: 6.077E-05 | global batch size: 256 | lm loss: 3.672962E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.492 | TFLOPs: 26.10 | +7: iteration 119270/ 173500 | consumed samples: 30533120 | consumed tokens: 62531829760 | elapsed time per iteration (s): 0.16 | learning rate: 6.076E-05 | global batch size: 256 | lm loss: 3.689183E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.797 | TFLOPs: 24.82 | +7: iteration 119280/ 173500 | consumed samples: 30535680 | consumed tokens: 62537072640 | elapsed time per iteration (s): 0.15 | learning rate: 6.075E-05 | global batch size: 256 | lm loss: 3.676731E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.935 | TFLOPs: 26.17 | +7: iteration 119290/ 173500 | consumed samples: 30538240 | consumed tokens: 62542315520 | elapsed time per iteration (s): 0.16 | learning rate: 6.073E-05 | global batch size: 256 | lm loss: 3.697272E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.447 | TFLOPs: 24.88 | +7: iteration 119300/ 173500 | consumed samples: 30540800 | consumed tokens: 62547558400 | elapsed time per iteration (s): 0.16 | learning rate: 6.072E-05 | global batch size: 256 | lm loss: 3.684447E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.341 | TFLOPs: 25.66 | +7: iteration 119310/ 173500 | consumed samples: 30543360 | consumed tokens: 62552801280 | elapsed time per iteration (s): 0.16 | learning rate: 6.070E-05 | global batch size: 256 | lm loss: 3.681062E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.997 | TFLOPs: 25.81 | +7: iteration 119320/ 173500 | consumed samples: 30545920 | consumed tokens: 62558044160 | elapsed time per iteration (s): 0.16 | learning rate: 6.069E-05 | global batch size: 256 | lm loss: 3.676985E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.099 | TFLOPs: 24.65 | +7: iteration 119330/ 173500 | consumed samples: 30548480 | consumed tokens: 62563287040 | elapsed time per iteration (s): 0.16 | learning rate: 6.068E-05 | global batch size: 256 | lm loss: 3.679957E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.364 | TFLOPs: 25.07 | +7: iteration 119340/ 173500 | consumed samples: 30551040 | consumed tokens: 62568529920 | elapsed time per iteration (s): 0.16 | learning rate: 6.066E-05 | global batch size: 256 | lm loss: 3.680198E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.229 | TFLOPs: 25.75 | +7: iteration 119350/ 173500 | consumed samples: 30553600 | consumed tokens: 62573772800 | elapsed time per iteration (s): 0.15 | learning rate: 6.065E-05 | global batch size: 256 | lm loss: 3.673404E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.949 | TFLOPs: 26.19 | +7: iteration 119360/ 173500 | consumed samples: 30556160 | consumed tokens: 62579015680 | elapsed time per iteration (s): 0.16 | learning rate: 6.064E-05 | global batch size: 256 | lm loss: 3.675259E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.756 | TFLOPs: 25.83 | +7: iteration 119370/ 173500 | consumed samples: 30558720 | consumed tokens: 62584258560 | elapsed time per iteration (s): 0.16 | learning rate: 6.062E-05 | global batch size: 256 | lm loss: 3.681755E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.398 | TFLOPs: 25.35 | +7: iteration 119380/ 173500 | consumed samples: 30561280 | consumed tokens: 62589501440 | elapsed time per iteration (s): 0.16 | learning rate: 6.061E-05 | global batch size: 256 | lm loss: 3.670361E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.847 | TFLOPs: 25.69 | +7: iteration 119390/ 173500 | consumed samples: 30563840 | consumed tokens: 62594744320 | elapsed time per iteration (s): 0.15 | learning rate: 6.059E-05 | global batch size: 256 | lm loss: 3.672076E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.790 | TFLOPs: 25.94 | +7: iteration 119400/ 173500 | consumed samples: 30566400 | consumed tokens: 62599987200 | elapsed time per iteration (s): 0.16 | learning rate: 6.058E-05 | global batch size: 256 | lm loss: 3.669174E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.064 | TFLOPs: 25.74 | +7: iteration 119410/ 173500 | consumed samples: 30568960 | consumed tokens: 62605230080 | elapsed time per iteration (s): 0.16 | learning rate: 6.057E-05 | global batch size: 256 | lm loss: 3.679649E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.696 | TFLOPs: 25.06 | +7: iteration 119420/ 173500 | consumed samples: 30571520 | consumed tokens: 62610472960 | elapsed time per iteration (s): 0.16 | learning rate: 6.055E-05 | global batch size: 256 | lm loss: 3.692660E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.064 | TFLOPs: 25.69 | +7: iteration 119430/ 173500 | consumed samples: 30574080 | consumed tokens: 62615715840 | elapsed time per iteration (s): 0.15 | learning rate: 6.054E-05 | global batch size: 256 | lm loss: 3.694753E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.042 | TFLOPs: 26.16 | +7: iteration 119440/ 173500 | consumed samples: 30576640 | consumed tokens: 62620958720 | elapsed time per iteration (s): 0.16 | learning rate: 6.053E-05 | global batch size: 256 | lm loss: 3.680152E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.665 | TFLOPs: 25.75 | +7: iteration 119450/ 173500 | consumed samples: 30579200 | consumed tokens: 62626201600 | elapsed time per iteration (s): 0.16 | learning rate: 6.051E-05 | global batch size: 256 | lm loss: 3.681293E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.374 | TFLOPs: 25.25 | +7: iteration 119460/ 173500 | consumed samples: 30581760 | consumed tokens: 62631444480 | elapsed time per iteration (s): 0.16 | learning rate: 6.050E-05 | global batch size: 256 | lm loss: 3.675030E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.275 | TFLOPs: 25.02 | +7: iteration 119470/ 173500 | consumed samples: 30584320 | consumed tokens: 62636687360 | elapsed time per iteration (s): 0.15 | learning rate: 6.048E-05 | global batch size: 256 | lm loss: 3.674323E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.633 | TFLOPs: 26.20 | +7: iteration 119480/ 173500 | consumed samples: 30586880 | consumed tokens: 62641930240 | elapsed time per iteration (s): 0.16 | learning rate: 6.047E-05 | global batch size: 256 | lm loss: 3.677178E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.033 | TFLOPs: 25.80 | +7: iteration 119490/ 173500 | consumed samples: 30589440 | consumed tokens: 62647173120 | elapsed time per iteration (s): 0.16 | learning rate: 6.046E-05 | global batch size: 256 | lm loss: 3.666015E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.249 | TFLOPs: 25.80 | +7: iteration 119500/ 173500 | consumed samples: 30592000 | consumed tokens: 62652416000 | elapsed time per iteration (s): 0.16 | learning rate: 6.044E-05 | global batch size: 256 | lm loss: 3.663825E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.191 | TFLOPs: 25.79 | +7: iteration 119510/ 173500 | consumed samples: 30594560 | consumed tokens: 62657658880 | elapsed time per iteration (s): 0.16 | learning rate: 6.043E-05 | global batch size: 256 | lm loss: 3.686445E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.929 | TFLOPs: 25.75 | +7: iteration 119520/ 173500 | consumed samples: 30597120 | consumed tokens: 62662901760 | elapsed time per iteration (s): 0.16 | learning rate: 6.042E-05 | global batch size: 256 | lm loss: 3.682998E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.371 | TFLOPs: 25.65 | +7: iteration 119530/ 173500 | consumed samples: 30599680 | consumed tokens: 62668144640 | elapsed time per iteration (s): 0.16 | learning rate: 6.040E-05 | global batch size: 256 | lm loss: 3.689117E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.008 | TFLOPs: 25.77 | +7: iteration 119540/ 173500 | consumed samples: 30602240 | consumed tokens: 62673387520 | elapsed time per iteration (s): 0.16 | learning rate: 6.039E-05 | global batch size: 256 | lm loss: 3.674201E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.517 | TFLOPs: 25.62 | +7: iteration 119550/ 173500 | consumed samples: 30604800 | consumed tokens: 62678630400 | elapsed time per iteration (s): 0.15 | learning rate: 6.037E-05 | global batch size: 256 | lm loss: 3.688726E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.149 | TFLOPs: 26.16 | +7: iteration 119560/ 173500 | consumed samples: 30607360 | consumed tokens: 62683873280 | elapsed time per iteration (s): 0.15 | learning rate: 6.036E-05 | global batch size: 256 | lm loss: 3.681293E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.395 | TFLOPs: 26.15 | +7: iteration 119570/ 173500 | consumed samples: 30609920 | consumed tokens: 62689116160 | elapsed time per iteration (s): 0.15 | learning rate: 6.035E-05 | global batch size: 256 | lm loss: 3.681333E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.397 | TFLOPs: 26.01 | +7: iteration 119580/ 173500 | consumed samples: 30612480 | consumed tokens: 62694359040 | elapsed time per iteration (s): 0.15 | learning rate: 6.033E-05 | global batch size: 256 | lm loss: 3.674458E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.068 | TFLOPs: 26.33 | +7: iteration 119590/ 173500 | consumed samples: 30615040 | consumed tokens: 62699601920 | elapsed time per iteration (s): 0.15 | learning rate: 6.032E-05 | global batch size: 256 | lm loss: 3.668050E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.690 | TFLOPs: 26.31 | +7: iteration 119600/ 173500 | consumed samples: 30617600 | consumed tokens: 62704844800 | elapsed time per iteration (s): 0.15 | learning rate: 6.031E-05 | global batch size: 256 | lm loss: 3.677444E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.374 | TFLOPs: 26.31 | +7: iteration 119610/ 173500 | consumed samples: 30620160 | consumed tokens: 62710087680 | elapsed time per iteration (s): 0.15 | learning rate: 6.029E-05 | global batch size: 256 | lm loss: 3.684375E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.796 | TFLOPs: 26.30 | +7: iteration 119620/ 173500 | consumed samples: 30622720 | consumed tokens: 62715330560 | elapsed time per iteration (s): 0.15 | learning rate: 6.028E-05 | global batch size: 256 | lm loss: 3.676752E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.660 | TFLOPs: 26.33 | +7: iteration 119630/ 173500 | consumed samples: 30625280 | consumed tokens: 62720573440 | elapsed time per iteration (s): 0.16 | learning rate: 6.026E-05 | global batch size: 256 | lm loss: 3.678555E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.172 | TFLOPs: 25.00 | +7: iteration 119640/ 173500 | consumed samples: 30627840 | consumed tokens: 62725816320 | elapsed time per iteration (s): 0.15 | learning rate: 6.025E-05 | global batch size: 256 | lm loss: 3.673183E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.488 | TFLOPs: 26.28 | +7: iteration 119650/ 173500 | consumed samples: 30630400 | consumed tokens: 62731059200 | elapsed time per iteration (s): 0.16 | learning rate: 6.024E-05 | global batch size: 256 | lm loss: 3.687935E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.604 | TFLOPs: 25.56 | +7: iteration 119660/ 173500 | consumed samples: 30632960 | consumed tokens: 62736302080 | elapsed time per iteration (s): 0.16 | learning rate: 6.022E-05 | global batch size: 256 | lm loss: 3.672007E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.172 | TFLOPs: 25.14 | +7: iteration 119670/ 173500 | consumed samples: 30635520 | consumed tokens: 62741544960 | elapsed time per iteration (s): 0.16 | learning rate: 6.021E-05 | global batch size: 256 | lm loss: 3.680744E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.403 | TFLOPs: 25.55 | +7: iteration 119680/ 173500 | consumed samples: 30638080 | consumed tokens: 62746787840 | elapsed time per iteration (s): 0.16 | learning rate: 6.020E-05 | global batch size: 256 | lm loss: 3.666864E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.226 | TFLOPs: 25.03 | +7: iteration 119690/ 173500 | consumed samples: 30640640 | consumed tokens: 62752030720 | elapsed time per iteration (s): 0.16 | learning rate: 6.018E-05 | global batch size: 256 | lm loss: 3.671481E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.034 | TFLOPs: 25.34 | +7: iteration 119700/ 173500 | consumed samples: 30643200 | consumed tokens: 62757273600 | elapsed time per iteration (s): 0.15 | learning rate: 6.017E-05 | global batch size: 256 | lm loss: 3.678805E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.161 | TFLOPs: 25.97 | +7: iteration 119710/ 173500 | consumed samples: 30645760 | consumed tokens: 62762516480 | elapsed time per iteration (s): 0.16 | learning rate: 6.015E-05 | global batch size: 256 | lm loss: 3.670730E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.955 | TFLOPs: 25.86 | +7: iteration 119720/ 173500 | consumed samples: 30648320 | consumed tokens: 62767759360 | elapsed time per iteration (s): 0.16 | learning rate: 6.014E-05 | global batch size: 256 | lm loss: 3.686002E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.344 | TFLOPs: 25.90 | +7: iteration 119730/ 173500 | consumed samples: 30650880 | consumed tokens: 62773002240 | elapsed time per iteration (s): 0.15 | learning rate: 6.013E-05 | global batch size: 256 | lm loss: 3.679789E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.224 | TFLOPs: 26.21 | +7: iteration 119740/ 173500 | consumed samples: 30653440 | consumed tokens: 62778245120 | elapsed time per iteration (s): 0.16 | learning rate: 6.011E-05 | global batch size: 256 | lm loss: 3.676816E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.836 | TFLOPs: 25.80 | +7: iteration 119750/ 173500 | consumed samples: 30656000 | consumed tokens: 62783488000 | elapsed time per iteration (s): 0.16 | learning rate: 6.010E-05 | global batch size: 256 | lm loss: 3.675045E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.817 | TFLOPs: 25.89 | +7: iteration 119760/ 173500 | consumed samples: 30658560 | consumed tokens: 62788730880 | elapsed time per iteration (s): 0.16 | learning rate: 6.009E-05 | global batch size: 256 | lm loss: 3.677947E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.402 | TFLOPs: 25.07 | +7: iteration 119770/ 173500 | consumed samples: 30661120 | consumed tokens: 62793973760 | elapsed time per iteration (s): 0.16 | learning rate: 6.007E-05 | global batch size: 256 | lm loss: 3.683749E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.256 | TFLOPs: 25.64 | +7: iteration 119780/ 173500 | consumed samples: 30663680 | consumed tokens: 62799216640 | elapsed time per iteration (s): 0.16 | learning rate: 6.006E-05 | global batch size: 256 | lm loss: 3.687709E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.779 | TFLOPs: 25.64 | +7: iteration 119790/ 173500 | consumed samples: 30666240 | consumed tokens: 62804459520 | elapsed time per iteration (s): 0.16 | learning rate: 6.004E-05 | global batch size: 256 | lm loss: 3.681567E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.065 | TFLOPs: 24.56 | +7: iteration 119800/ 173500 | consumed samples: 30668800 | consumed tokens: 62809702400 | elapsed time per iteration (s): 0.16 | learning rate: 6.003E-05 | global batch size: 256 | lm loss: 3.679005E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.877 | TFLOPs: 25.83 | +7: iteration 119810/ 173500 | consumed samples: 30671360 | consumed tokens: 62814945280 | elapsed time per iteration (s): 0.15 | learning rate: 6.002E-05 | global batch size: 256 | lm loss: 3.668664E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.904 | TFLOPs: 26.30 | +7: iteration 119820/ 173500 | consumed samples: 30673920 | consumed tokens: 62820188160 | elapsed time per iteration (s): 0.15 | learning rate: 6.000E-05 | global batch size: 256 | lm loss: 3.674751E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.066 | TFLOPs: 26.30 | +7: iteration 119830/ 173500 | consumed samples: 30676480 | consumed tokens: 62825431040 | elapsed time per iteration (s): 0.16 | learning rate: 5.999E-05 | global batch size: 256 | lm loss: 3.686689E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.835 | TFLOPs: 25.48 | +7: iteration 119840/ 173500 | consumed samples: 30679040 | consumed tokens: 62830673920 | elapsed time per iteration (s): 0.15 | learning rate: 5.998E-05 | global batch size: 256 | lm loss: 3.683404E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.891 | TFLOPs: 26.14 | +7: iteration 119850/ 173500 | consumed samples: 30681600 | consumed tokens: 62835916800 | elapsed time per iteration (s): 0.15 | learning rate: 5.996E-05 | global batch size: 256 | lm loss: 3.673067E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.491 | TFLOPs: 26.15 | +7: iteration 119860/ 173500 | consumed samples: 30684160 | consumed tokens: 62841159680 | elapsed time per iteration (s): 0.16 | learning rate: 5.995E-05 | global batch size: 256 | lm loss: 3.687123E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.865 | TFLOPs: 25.75 | +7: iteration 119870/ 173500 | consumed samples: 30686720 | consumed tokens: 62846402560 | elapsed time per iteration (s): 0.16 | learning rate: 5.994E-05 | global batch size: 256 | lm loss: 3.661345E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.469 | TFLOPs: 25.30 | +7: iteration 119880/ 173500 | consumed samples: 30689280 | consumed tokens: 62851645440 | elapsed time per iteration (s): 0.16 | learning rate: 5.992E-05 | global batch size: 256 | lm loss: 3.688050E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.368 | TFLOPs: 25.36 | +7: iteration 119890/ 173500 | consumed samples: 30691840 | consumed tokens: 62856888320 | elapsed time per iteration (s): 0.15 | learning rate: 5.991E-05 | global batch size: 256 | lm loss: 3.681994E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.762 | TFLOPs: 26.20 | +7: iteration 119900/ 173500 | consumed samples: 30694400 | consumed tokens: 62862131200 | elapsed time per iteration (s): 0.16 | learning rate: 5.989E-05 | global batch size: 256 | lm loss: 3.676310E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.569 | TFLOPs: 25.30 | +7: iteration 119910/ 173500 | consumed samples: 30696960 | consumed tokens: 62867374080 | elapsed time per iteration (s): 0.16 | learning rate: 5.988E-05 | global batch size: 256 | lm loss: 3.664148E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.269 | TFLOPs: 25.72 | +7: iteration 119920/ 173500 | consumed samples: 30699520 | consumed tokens: 62872616960 | elapsed time per iteration (s): 0.16 | learning rate: 5.987E-05 | global batch size: 256 | lm loss: 3.676838E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.096 | TFLOPs: 25.85 | +7: iteration 119930/ 173500 | consumed samples: 30702080 | consumed tokens: 62877859840 | elapsed time per iteration (s): 0.16 | learning rate: 5.985E-05 | global batch size: 256 | lm loss: 3.682883E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.218 | TFLOPs: 25.71 | +7: iteration 119940/ 173500 | consumed samples: 30704640 | consumed tokens: 62883102720 | elapsed time per iteration (s): 0.15 | learning rate: 5.984E-05 | global batch size: 256 | lm loss: 3.663908E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.029 | TFLOPs: 26.19 | +7: iteration 119950/ 173500 | consumed samples: 30707200 | consumed tokens: 62888345600 | elapsed time per iteration (s): 0.15 | learning rate: 5.983E-05 | global batch size: 256 | lm loss: 3.682146E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.244 | TFLOPs: 26.18 | +7: iteration 119960/ 173500 | consumed samples: 30709760 | consumed tokens: 62893588480 | elapsed time per iteration (s): 0.15 | learning rate: 5.981E-05 | global batch size: 256 | lm loss: 3.682552E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.480 | TFLOPs: 26.20 | +7: iteration 119970/ 173500 | consumed samples: 30712320 | consumed tokens: 62898831360 | elapsed time per iteration (s): 0.16 | learning rate: 5.980E-05 | global batch size: 256 | lm loss: 3.663652E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.767 | TFLOPs: 24.70 | +7: iteration 119980/ 173500 | consumed samples: 30714880 | consumed tokens: 62904074240 | elapsed time per iteration (s): 0.15 | learning rate: 5.979E-05 | global batch size: 256 | lm loss: 3.676553E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.323 | TFLOPs: 25.96 | +7: iteration 119990/ 173500 | consumed samples: 30717440 | consumed tokens: 62909317120 | elapsed time per iteration (s): 0.15 | learning rate: 5.977E-05 | global batch size: 256 | lm loss: 3.689806E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.204 | TFLOPs: 26.13 | +0: [2023-03-17 05:27:59,646] [INFO] [logging.py:68:log_dist] [Rank 0] step=120000, skipped=0, lr=[5.975780833100023e-05, 5.975780833100023e-05, 5.975780833100023e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 120000/ 173500 | consumed samples: 30720000 | consumed tokens: 62914560000 | elapsed time per iteration (s): 0.15 | learning rate: 5.976E-05 | global batch size: 256 | lm loss: 3.683067E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.074 | TFLOPs: 26.07 | +0: steps: 120000 loss: 3.7183 iter time (s): 0.155 samples/sec: 1652.571 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 120000 | lm loss value: 3.872859E+00 | lm loss PPL: 4.807964E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 120000 to checkpoints_44m91b100m +0: [2023-03-17 05:27:59,721] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step120000 is begin to save! +0: [2023-03-17 05:27:59,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:27:59,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:27:59,787] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:27:59,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:27:59,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:27:59,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:27:59,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:27:59,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:27:59,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:27:59,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:27:59,824] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:27:59,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:27:59,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:27:59,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:27:59,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:27:59,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:27:59,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:27:59,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:27:59,856] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:27:59,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:27:59,857] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step120000/mp_rank_00_model_states.pt +0: [2023-03-17 05:27:59,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:27:59,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:27:59,876] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:27:59,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +4: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:27:59,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +5: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +6: [2023-03-17 05:27:59,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +2: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +3: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:27:59,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:27:59,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +1: [2023-03-17 05:27:59,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:27:59,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:27:59,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +7: [2023-03-17 05:27:59,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:27:59,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step120000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:27:59,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step120000 is ready now! +0: successfully saved checkpoint at iteration 120000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.11 +7: iteration 120010/ 173500 | consumed samples: 30722560 | consumed tokens: 62919802880 | elapsed time per iteration (s): 0.18 | learning rate: 5.974E-05 | global batch size: 256 | lm loss: 3.660439E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.159 | TFLOPs: 22.05 | +7: iteration 120020/ 173500 | consumed samples: 30725120 | consumed tokens: 62925045760 | elapsed time per iteration (s): 0.15 | learning rate: 5.973E-05 | global batch size: 256 | lm loss: 3.692181E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.400 | TFLOPs: 26.12 | +7: iteration 120030/ 173500 | consumed samples: 30727680 | consumed tokens: 62930288640 | elapsed time per iteration (s): 0.16 | learning rate: 5.972E-05 | global batch size: 256 | lm loss: 3.682321E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.879 | TFLOPs: 25.50 | +7: iteration 120040/ 173500 | consumed samples: 30730240 | consumed tokens: 62935531520 | elapsed time per iteration (s): 0.16 | learning rate: 5.970E-05 | global batch size: 256 | lm loss: 3.675269E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.914 | TFLOPs: 25.47 | +7: iteration 120050/ 173500 | consumed samples: 30732800 | consumed tokens: 62940774400 | elapsed time per iteration (s): 0.16 | learning rate: 5.969E-05 | global batch size: 256 | lm loss: 3.674391E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.139 | TFLOPs: 25.30 | +7: iteration 120060/ 173500 | consumed samples: 30735360 | consumed tokens: 62946017280 | elapsed time per iteration (s): 0.15 | learning rate: 5.968E-05 | global batch size: 256 | lm loss: 3.672962E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.986 | TFLOPs: 26.16 | +7: iteration 120070/ 173500 | consumed samples: 30737920 | consumed tokens: 62951260160 | elapsed time per iteration (s): 0.15 | learning rate: 5.966E-05 | global batch size: 256 | lm loss: 3.680112E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.605 | TFLOPs: 26.20 | +7: iteration 120080/ 173500 | consumed samples: 30740480 | consumed tokens: 62956503040 | elapsed time per iteration (s): 0.15 | learning rate: 5.965E-05 | global batch size: 256 | lm loss: 3.687275E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.302 | TFLOPs: 26.16 | +7: iteration 120090/ 173500 | consumed samples: 30743040 | consumed tokens: 62961745920 | elapsed time per iteration (s): 0.15 | learning rate: 5.963E-05 | global batch size: 256 | lm loss: 3.685134E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.844 | TFLOPs: 26.19 | +7: iteration 120100/ 173500 | consumed samples: 30745600 | consumed tokens: 62966988800 | elapsed time per iteration (s): 0.15 | learning rate: 5.962E-05 | global batch size: 256 | lm loss: 3.684510E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.408 | TFLOPs: 26.18 | +7: iteration 120110/ 173500 | consumed samples: 30748160 | consumed tokens: 62972231680 | elapsed time per iteration (s): 0.15 | learning rate: 5.961E-05 | global batch size: 256 | lm loss: 3.673553E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.204 | TFLOPs: 26.05 | +7: iteration 120120/ 173500 | consumed samples: 30750720 | consumed tokens: 62977474560 | elapsed time per iteration (s): 0.15 | learning rate: 5.959E-05 | global batch size: 256 | lm loss: 3.680223E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.078 | TFLOPs: 26.05 | +7: iteration 120130/ 173500 | consumed samples: 30753280 | consumed tokens: 62982717440 | elapsed time per iteration (s): 0.15 | learning rate: 5.958E-05 | global batch size: 256 | lm loss: 3.681845E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.116 | TFLOPs: 25.97 | +7: iteration 120140/ 173500 | consumed samples: 30755840 | consumed tokens: 62987960320 | elapsed time per iteration (s): 0.15 | learning rate: 5.957E-05 | global batch size: 256 | lm loss: 3.677763E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.401 | TFLOPs: 26.20 | +7: iteration 120150/ 173500 | consumed samples: 30758400 | consumed tokens: 62993203200 | elapsed time per iteration (s): 0.16 | learning rate: 5.955E-05 | global batch size: 256 | lm loss: 3.684273E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.084 | TFLOPs: 25.09 | +7: iteration 120160/ 173500 | consumed samples: 30760960 | consumed tokens: 62998446080 | elapsed time per iteration (s): 0.16 | learning rate: 5.954E-05 | global batch size: 256 | lm loss: 3.676585E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.424 | TFLOPs: 25.69 | +7: iteration 120170/ 173500 | consumed samples: 30763520 | consumed tokens: 63003688960 | elapsed time per iteration (s): 0.16 | learning rate: 5.953E-05 | global batch size: 256 | lm loss: 3.681341E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.484 | TFLOPs: 25.44 | +7: iteration 120180/ 173500 | consumed samples: 30766080 | consumed tokens: 63008931840 | elapsed time per iteration (s): 0.15 | learning rate: 5.951E-05 | global batch size: 256 | lm loss: 3.662951E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.005 | TFLOPs: 26.14 | +7: iteration 120190/ 173500 | consumed samples: 30768640 | consumed tokens: 63014174720 | elapsed time per iteration (s): 0.16 | learning rate: 5.950E-05 | global batch size: 256 | lm loss: 3.684457E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.869 | TFLOPs: 25.61 | +7: iteration 120200/ 173500 | consumed samples: 30771200 | consumed tokens: 63019417600 | elapsed time per iteration (s): 0.17 | learning rate: 5.948E-05 | global batch size: 256 | lm loss: 3.683280E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.940 | TFLOPs: 23.93 | +7: iteration 120210/ 173500 | consumed samples: 30773760 | consumed tokens: 63024660480 | elapsed time per iteration (s): 0.16 | learning rate: 5.947E-05 | global batch size: 256 | lm loss: 3.682506E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.801 | TFLOPs: 24.98 | +7: iteration 120220/ 173500 | consumed samples: 30776320 | consumed tokens: 63029903360 | elapsed time per iteration (s): 0.15 | learning rate: 5.946E-05 | global batch size: 256 | lm loss: 3.684159E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.753 | TFLOPs: 26.22 | +7: iteration 120230/ 173500 | consumed samples: 30778880 | consumed tokens: 63035146240 | elapsed time per iteration (s): 0.16 | learning rate: 5.944E-05 | global batch size: 256 | lm loss: 3.676780E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.340 | TFLOPs: 24.67 | +7: iteration 120240/ 173500 | consumed samples: 30781440 | consumed tokens: 63040389120 | elapsed time per iteration (s): 0.16 | learning rate: 5.943E-05 | global batch size: 256 | lm loss: 3.680976E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.389 | TFLOPs: 25.41 | +7: iteration 120250/ 173500 | consumed samples: 30784000 | consumed tokens: 63045632000 | elapsed time per iteration (s): 0.17 | learning rate: 5.942E-05 | global batch size: 256 | lm loss: 3.680171E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.420 | TFLOPs: 23.91 | +7: iteration 120260/ 173500 | consumed samples: 30786560 | consumed tokens: 63050874880 | elapsed time per iteration (s): 0.15 | learning rate: 5.940E-05 | global batch size: 256 | lm loss: 3.675771E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.789 | TFLOPs: 26.12 | +7: iteration 120270/ 173500 | consumed samples: 30789120 | consumed tokens: 63056117760 | elapsed time per iteration (s): 0.16 | learning rate: 5.939E-05 | global batch size: 256 | lm loss: 3.679874E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.016 | TFLOPs: 25.77 | +7: iteration 120280/ 173500 | consumed samples: 30791680 | consumed tokens: 63061360640 | elapsed time per iteration (s): 0.16 | learning rate: 5.938E-05 | global batch size: 256 | lm loss: 3.664200E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.788 | TFLOPs: 25.10 | +7: iteration 120290/ 173500 | consumed samples: 30794240 | consumed tokens: 63066603520 | elapsed time per iteration (s): 0.16 | learning rate: 5.936E-05 | global batch size: 256 | lm loss: 3.672644E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.670 | TFLOPs: 25.57 | +7: iteration 120300/ 173500 | consumed samples: 30796800 | consumed tokens: 63071846400 | elapsed time per iteration (s): 0.15 | learning rate: 5.935E-05 | global batch size: 256 | lm loss: 3.683854E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.132 | TFLOPs: 25.97 | +7: iteration 120310/ 173500 | consumed samples: 30799360 | consumed tokens: 63077089280 | elapsed time per iteration (s): 0.16 | learning rate: 5.934E-05 | global batch size: 256 | lm loss: 3.673052E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.252 | TFLOPs: 25.00 | +7: iteration 120320/ 173500 | consumed samples: 30801920 | consumed tokens: 63082332160 | elapsed time per iteration (s): 0.16 | learning rate: 5.932E-05 | global batch size: 256 | lm loss: 3.686484E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.389 | TFLOPs: 25.62 | +7: iteration 120330/ 173500 | consumed samples: 30804480 | consumed tokens: 63087575040 | elapsed time per iteration (s): 0.16 | learning rate: 5.931E-05 | global batch size: 256 | lm loss: 3.672784E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.681 | TFLOPs: 25.43 | +7: iteration 120340/ 173500 | consumed samples: 30807040 | consumed tokens: 63092817920 | elapsed time per iteration (s): 0.16 | learning rate: 5.929E-05 | global batch size: 256 | lm loss: 3.691544E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.729 | TFLOPs: 25.72 | +7: iteration 120350/ 173500 | consumed samples: 30809600 | consumed tokens: 63098060800 | elapsed time per iteration (s): 0.16 | learning rate: 5.928E-05 | global batch size: 256 | lm loss: 3.678918E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.661 | TFLOPs: 25.78 | +7: iteration 120360/ 173500 | consumed samples: 30812160 | consumed tokens: 63103303680 | elapsed time per iteration (s): 0.15 | learning rate: 5.927E-05 | global batch size: 256 | lm loss: 3.670279E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.904 | TFLOPs: 26.17 | +7: iteration 120370/ 173500 | consumed samples: 30814720 | consumed tokens: 63108546560 | elapsed time per iteration (s): 0.15 | learning rate: 5.925E-05 | global batch size: 256 | lm loss: 3.677301E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.644 | TFLOPs: 26.17 | +7: iteration 120380/ 173500 | consumed samples: 30817280 | consumed tokens: 63113789440 | elapsed time per iteration (s): 0.15 | learning rate: 5.924E-05 | global batch size: 256 | lm loss: 3.667675E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.953 | TFLOPs: 26.16 | +7: iteration 120390/ 173500 | consumed samples: 30819840 | consumed tokens: 63119032320 | elapsed time per iteration (s): 0.16 | learning rate: 5.923E-05 | global batch size: 256 | lm loss: 3.678999E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.834 | TFLOPs: 25.69 | +7: iteration 120400/ 173500 | consumed samples: 30822400 | consumed tokens: 63124275200 | elapsed time per iteration (s): 0.15 | learning rate: 5.921E-05 | global batch size: 256 | lm loss: 3.684339E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.748 | TFLOPs: 25.93 | +7: iteration 120410/ 173500 | consumed samples: 30824960 | consumed tokens: 63129518080 | elapsed time per iteration (s): 0.16 | learning rate: 5.920E-05 | global batch size: 256 | lm loss: 3.668924E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.282 | TFLOPs: 25.69 | +7: iteration 120420/ 173500 | consumed samples: 30827520 | consumed tokens: 63134760960 | elapsed time per iteration (s): 0.15 | learning rate: 5.919E-05 | global batch size: 256 | lm loss: 3.682195E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.675 | TFLOPs: 26.20 | +7: iteration 120430/ 173500 | consumed samples: 30830080 | consumed tokens: 63140003840 | elapsed time per iteration (s): 0.16 | learning rate: 5.917E-05 | global batch size: 256 | lm loss: 3.667469E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.346 | TFLOPs: 25.77 | +7: iteration 120440/ 173500 | consumed samples: 30832640 | consumed tokens: 63145246720 | elapsed time per iteration (s): 0.16 | learning rate: 5.916E-05 | global batch size: 256 | lm loss: 3.674921E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.562 | TFLOPs: 25.78 | +7: iteration 120450/ 173500 | consumed samples: 30835200 | consumed tokens: 63150489600 | elapsed time per iteration (s): 0.15 | learning rate: 5.914E-05 | global batch size: 256 | lm loss: 3.681630E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.690 | TFLOPs: 26.15 | +7: iteration 120460/ 173500 | consumed samples: 30837760 | consumed tokens: 63155732480 | elapsed time per iteration (s): 0.15 | learning rate: 5.913E-05 | global batch size: 256 | lm loss: 3.682930E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.514 | TFLOPs: 26.18 | +7: iteration 120470/ 173500 | consumed samples: 30840320 | consumed tokens: 63160975360 | elapsed time per iteration (s): 0.15 | learning rate: 5.912E-05 | global batch size: 256 | lm loss: 3.667852E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.168 | TFLOPs: 26.16 | +7: iteration 120480/ 173500 | consumed samples: 30842880 | consumed tokens: 63166218240 | elapsed time per iteration (s): 0.16 | learning rate: 5.910E-05 | global batch size: 256 | lm loss: 3.682015E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.410 | TFLOPs: 25.62 | +7: iteration 120490/ 173500 | consumed samples: 30845440 | consumed tokens: 63171461120 | elapsed time per iteration (s): 0.15 | learning rate: 5.909E-05 | global batch size: 256 | lm loss: 3.690245E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.879 | TFLOPs: 26.17 | +7: iteration 120500/ 173500 | consumed samples: 30848000 | consumed tokens: 63176704000 | elapsed time per iteration (s): 0.16 | learning rate: 5.908E-05 | global batch size: 256 | lm loss: 3.688274E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.032 | TFLOPs: 25.85 | +7: iteration 120510/ 173500 | consumed samples: 30850560 | consumed tokens: 63181946880 | elapsed time per iteration (s): 0.16 | learning rate: 5.906E-05 | global batch size: 256 | lm loss: 3.697665E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.668 | TFLOPs: 25.53 | +7: iteration 120520/ 173500 | consumed samples: 30853120 | consumed tokens: 63187189760 | elapsed time per iteration (s): 0.15 | learning rate: 5.905E-05 | global batch size: 256 | lm loss: 3.680536E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.402 | TFLOPs: 26.01 | +7: iteration 120530/ 173500 | consumed samples: 30855680 | consumed tokens: 63192432640 | elapsed time per iteration (s): 0.15 | learning rate: 5.904E-05 | global batch size: 256 | lm loss: 3.690928E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.554 | TFLOPs: 25.96 | +7: iteration 120540/ 173500 | consumed samples: 30858240 | consumed tokens: 63197675520 | elapsed time per iteration (s): 0.15 | learning rate: 5.902E-05 | global batch size: 256 | lm loss: 3.682843E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.635 | TFLOPs: 26.22 | +7: iteration 120550/ 173500 | consumed samples: 30860800 | consumed tokens: 63202918400 | elapsed time per iteration (s): 0.15 | learning rate: 5.901E-05 | global batch size: 256 | lm loss: 3.678590E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.764 | TFLOPs: 26.20 | +7: iteration 120560/ 173500 | consumed samples: 30863360 | consumed tokens: 63208161280 | elapsed time per iteration (s): 0.15 | learning rate: 5.900E-05 | global batch size: 256 | lm loss: 3.670540E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.546 | TFLOPs: 26.20 | +7: iteration 120570/ 173500 | consumed samples: 30865920 | consumed tokens: 63213404160 | elapsed time per iteration (s): 0.15 | learning rate: 5.898E-05 | global batch size: 256 | lm loss: 3.687918E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.772 | TFLOPs: 26.20 | +7: iteration 120580/ 173500 | consumed samples: 30868480 | consumed tokens: 63218647040 | elapsed time per iteration (s): 0.15 | learning rate: 5.897E-05 | global batch size: 256 | lm loss: 3.677557E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.457 | TFLOPs: 26.18 | +7: iteration 120590/ 173500 | consumed samples: 30871040 | consumed tokens: 63223889920 | elapsed time per iteration (s): 0.16 | learning rate: 5.895E-05 | global batch size: 256 | lm loss: 3.677176E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.041 | TFLOPs: 25.78 | +7: iteration 120600/ 173500 | consumed samples: 30873600 | consumed tokens: 63229132800 | elapsed time per iteration (s): 0.15 | learning rate: 5.894E-05 | global batch size: 256 | lm loss: 3.683170E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.626 | TFLOPs: 26.15 | +7: iteration 120610/ 173500 | consumed samples: 30876160 | consumed tokens: 63234375680 | elapsed time per iteration (s): 0.15 | learning rate: 5.893E-05 | global batch size: 256 | lm loss: 3.668203E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.105 | TFLOPs: 26.07 | +7: iteration 120620/ 173500 | consumed samples: 30878720 | consumed tokens: 63239618560 | elapsed time per iteration (s): 0.16 | learning rate: 5.891E-05 | global batch size: 256 | lm loss: 3.677796E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.083 | TFLOPs: 25.41 | +7: iteration 120630/ 173500 | consumed samples: 30881280 | consumed tokens: 63244861440 | elapsed time per iteration (s): 0.16 | learning rate: 5.890E-05 | global batch size: 256 | lm loss: 3.676346E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.545 | TFLOPs: 25.73 | +7: iteration 120640/ 173500 | consumed samples: 30883840 | consumed tokens: 63250104320 | elapsed time per iteration (s): 0.16 | learning rate: 5.889E-05 | global batch size: 256 | lm loss: 3.665347E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.456 | TFLOPs: 25.73 | +7: iteration 120650/ 173500 | consumed samples: 30886400 | consumed tokens: 63255347200 | elapsed time per iteration (s): 0.15 | learning rate: 5.887E-05 | global batch size: 256 | lm loss: 3.690676E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.076 | TFLOPs: 26.14 | +7: iteration 120660/ 173500 | consumed samples: 30888960 | consumed tokens: 63260590080 | elapsed time per iteration (s): 0.16 | learning rate: 5.886E-05 | global batch size: 256 | lm loss: 3.666504E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.632 | TFLOPs: 25.85 | +7: iteration 120670/ 173500 | consumed samples: 30891520 | consumed tokens: 63265832960 | elapsed time per iteration (s): 0.16 | learning rate: 5.885E-05 | global batch size: 256 | lm loss: 3.687415E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.847 | TFLOPs: 25.56 | +7: iteration 120680/ 173500 | consumed samples: 30894080 | consumed tokens: 63271075840 | elapsed time per iteration (s): 0.16 | learning rate: 5.883E-05 | global batch size: 256 | lm loss: 3.659761E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.474 | TFLOPs: 25.55 | +7: iteration 120690/ 173500 | consumed samples: 30896640 | consumed tokens: 63276318720 | elapsed time per iteration (s): 0.16 | learning rate: 5.882E-05 | global batch size: 256 | lm loss: 3.678720E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.390 | TFLOPs: 25.77 | +7: iteration 120700/ 173500 | consumed samples: 30899200 | consumed tokens: 63281561600 | elapsed time per iteration (s): 0.15 | learning rate: 5.881E-05 | global batch size: 256 | lm loss: 3.681656E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.383 | TFLOPs: 26.13 | +7: iteration 120710/ 173500 | consumed samples: 30901760 | consumed tokens: 63286804480 | elapsed time per iteration (s): 0.15 | learning rate: 5.879E-05 | global batch size: 256 | lm loss: 3.694309E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.567 | TFLOPs: 26.14 | +7: iteration 120720/ 173500 | consumed samples: 30904320 | consumed tokens: 63292047360 | elapsed time per iteration (s): 0.15 | learning rate: 5.878E-05 | global batch size: 256 | lm loss: 3.679401E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.468 | TFLOPs: 26.17 | +7: iteration 120730/ 173500 | consumed samples: 30906880 | consumed tokens: 63297290240 | elapsed time per iteration (s): 0.15 | learning rate: 5.877E-05 | global batch size: 256 | lm loss: 3.671092E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.288 | TFLOPs: 26.16 | +7: iteration 120740/ 173500 | consumed samples: 30909440 | consumed tokens: 63302533120 | elapsed time per iteration (s): 0.15 | learning rate: 5.875E-05 | global batch size: 256 | lm loss: 3.690321E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.955 | TFLOPs: 26.17 | +7: iteration 120750/ 173500 | consumed samples: 30912000 | consumed tokens: 63307776000 | elapsed time per iteration (s): 0.15 | learning rate: 5.874E-05 | global batch size: 256 | lm loss: 3.677158E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.657 | TFLOPs: 26.12 | +7: iteration 120760/ 173500 | consumed samples: 30914560 | consumed tokens: 63313018880 | elapsed time per iteration (s): 0.15 | learning rate: 5.872E-05 | global batch size: 256 | lm loss: 3.688731E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.202 | TFLOPs: 26.11 | +7: iteration 120770/ 173500 | consumed samples: 30917120 | consumed tokens: 63318261760 | elapsed time per iteration (s): 0.15 | learning rate: 5.871E-05 | global batch size: 256 | lm loss: 3.663531E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.762 | TFLOPs: 26.14 | +7: iteration 120780/ 173500 | consumed samples: 30919680 | consumed tokens: 63323504640 | elapsed time per iteration (s): 0.18 | learning rate: 5.870E-05 | global batch size: 256 | lm loss: 3.671534E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.402 | TFLOPs: 22.13 | +7: iteration 120790/ 173500 | consumed samples: 30922240 | consumed tokens: 63328747520 | elapsed time per iteration (s): 0.17 | learning rate: 5.868E-05 | global batch size: 256 | lm loss: 3.684758E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.847 | TFLOPs: 23.58 | +7: iteration 120800/ 173500 | consumed samples: 30924800 | consumed tokens: 63333990400 | elapsed time per iteration (s): 0.15 | learning rate: 5.867E-05 | global batch size: 256 | lm loss: 3.672390E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.344 | TFLOPs: 26.02 | +7: iteration 120810/ 173500 | consumed samples: 30927360 | consumed tokens: 63339233280 | elapsed time per iteration (s): 0.15 | learning rate: 5.866E-05 | global batch size: 256 | lm loss: 3.675167E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.086 | TFLOPs: 26.27 | +7: iteration 120820/ 173500 | consumed samples: 30929920 | consumed tokens: 63344476160 | elapsed time per iteration (s): 0.15 | learning rate: 5.864E-05 | global batch size: 256 | lm loss: 3.679466E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.853 | TFLOPs: 26.28 | +7: iteration 120830/ 173500 | consumed samples: 30932480 | consumed tokens: 63349719040 | elapsed time per iteration (s): 0.16 | learning rate: 5.863E-05 | global batch size: 256 | lm loss: 3.682824E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.231 | TFLOPs: 25.85 | +7: iteration 120840/ 173500 | consumed samples: 30935040 | consumed tokens: 63354961920 | elapsed time per iteration (s): 0.15 | learning rate: 5.862E-05 | global batch size: 256 | lm loss: 3.667155E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.273 | TFLOPs: 26.27 | +7: iteration 120850/ 173500 | consumed samples: 30937600 | consumed tokens: 63360204800 | elapsed time per iteration (s): 0.15 | learning rate: 5.860E-05 | global batch size: 256 | lm loss: 3.682812E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.619 | TFLOPs: 26.20 | +7: iteration 120860/ 173500 | consumed samples: 30940160 | consumed tokens: 63365447680 | elapsed time per iteration (s): 0.15 | learning rate: 5.859E-05 | global batch size: 256 | lm loss: 3.669895E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.534 | TFLOPs: 26.29 | +7: iteration 120870/ 173500 | consumed samples: 30942720 | consumed tokens: 63370690560 | elapsed time per iteration (s): 0.16 | learning rate: 5.858E-05 | global batch size: 256 | lm loss: 3.679118E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.420 | TFLOPs: 24.99 | +7: iteration 120880/ 173500 | consumed samples: 30945280 | consumed tokens: 63375933440 | elapsed time per iteration (s): 0.15 | learning rate: 5.856E-05 | global batch size: 256 | lm loss: 3.691763E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.653 | TFLOPs: 26.28 | +7: iteration 120890/ 173500 | consumed samples: 30947840 | consumed tokens: 63381176320 | elapsed time per iteration (s): 0.15 | learning rate: 5.855E-05 | global batch size: 256 | lm loss: 3.679962E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.494 | TFLOPs: 25.92 | +7: iteration 120900/ 173500 | consumed samples: 30950400 | consumed tokens: 63386419200 | elapsed time per iteration (s): 0.16 | learning rate: 5.854E-05 | global batch size: 256 | lm loss: 3.679285E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.510 | TFLOPs: 25.08 | +7: iteration 120910/ 173500 | consumed samples: 30952960 | consumed tokens: 63391662080 | elapsed time per iteration (s): 0.15 | learning rate: 5.852E-05 | global batch size: 256 | lm loss: 3.674477E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.730 | TFLOPs: 26.26 | +7: iteration 120920/ 173500 | consumed samples: 30955520 | consumed tokens: 63396904960 | elapsed time per iteration (s): 0.15 | learning rate: 5.851E-05 | global batch size: 256 | lm loss: 3.691477E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.885 | TFLOPs: 26.27 | +7: iteration 120930/ 173500 | consumed samples: 30958080 | consumed tokens: 63402147840 | elapsed time per iteration (s): 0.15 | learning rate: 5.850E-05 | global batch size: 256 | lm loss: 3.678957E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.840 | TFLOPs: 26.05 | +7: iteration 120940/ 173500 | consumed samples: 30960640 | consumed tokens: 63407390720 | elapsed time per iteration (s): 0.15 | learning rate: 5.848E-05 | global batch size: 256 | lm loss: 3.673050E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.677 | TFLOPs: 26.29 | +7: iteration 120950/ 173500 | consumed samples: 30963200 | consumed tokens: 63412633600 | elapsed time per iteration (s): 0.15 | learning rate: 5.847E-05 | global batch size: 256 | lm loss: 3.674837E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.336 | TFLOPs: 25.93 | +7: iteration 120960/ 173500 | consumed samples: 30965760 | consumed tokens: 63417876480 | elapsed time per iteration (s): 0.16 | learning rate: 5.845E-05 | global batch size: 256 | lm loss: 3.678250E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.186 | TFLOPs: 25.68 | +7: iteration 120970/ 173500 | consumed samples: 30968320 | consumed tokens: 63423119360 | elapsed time per iteration (s): 0.15 | learning rate: 5.844E-05 | global batch size: 256 | lm loss: 3.667261E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.978 | TFLOPs: 25.91 | +7: iteration 120980/ 173500 | consumed samples: 30970880 | consumed tokens: 63428362240 | elapsed time per iteration (s): 0.16 | learning rate: 5.843E-05 | global batch size: 256 | lm loss: 3.684595E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.579 | TFLOPs: 24.38 | +7: iteration 120990/ 173500 | consumed samples: 30973440 | consumed tokens: 63433605120 | elapsed time per iteration (s): 0.15 | learning rate: 5.841E-05 | global batch size: 256 | lm loss: 3.683187E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.276 | TFLOPs: 26.27 | +7: iteration 121000/ 173500 | consumed samples: 30976000 | consumed tokens: 63438848000 | elapsed time per iteration (s): 0.16 | learning rate: 5.840E-05 | global batch size: 256 | lm loss: 3.678806E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.344 | TFLOPs: 24.96 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 121000 | lm loss value: 3.814854E+00 | lm loss PPL: 4.537012E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 121000 to checkpoints_44m91b100m +0: [2023-03-17 05:30:35,877] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step121000 is begin to save! +0: [2023-03-17 05:30:35,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:30:35,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:30:35,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:30:35,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:30:35,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:30:35,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:30:35,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:30:35,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:30:35,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:30:35,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:30:35,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:30:35,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:30:35,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:30:36,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:30:36,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:30:36,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:30:36,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:30:36,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:30:36,016] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:30:36,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:30:36,017] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step121000/mp_rank_00_model_states.pt +0: [2023-03-17 05:30:36,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:30:36,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:30:36,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:30:36,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +3: [2023-03-17 05:30:36,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +7: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +2: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +5: [2023-03-17 05:30:36,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:30:36,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:30:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 05:30:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +4: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +6: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +1: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:30:36,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step121000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:30:36,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step121000 is ready now! +0: successfully saved checkpoint at iteration 121000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.66 +7: iteration 121010/ 173500 | consumed samples: 30978560 | consumed tokens: 63444090880 | elapsed time per iteration (s): 0.18 | learning rate: 5.839E-05 | global batch size: 256 | lm loss: 3.668441E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1450.551 | TFLOPs: 22.75 | +7: iteration 121020/ 173500 | consumed samples: 30981120 | consumed tokens: 63449333760 | elapsed time per iteration (s): 0.17 | learning rate: 5.837E-05 | global batch size: 256 | lm loss: 3.675099E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.153 | TFLOPs: 24.01 | +7: iteration 121030/ 173500 | consumed samples: 30983680 | consumed tokens: 63454576640 | elapsed time per iteration (s): 0.16 | learning rate: 5.836E-05 | global batch size: 256 | lm loss: 3.666329E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.534 | TFLOPs: 25.48 | +7: iteration 121040/ 173500 | consumed samples: 30986240 | consumed tokens: 63459819520 | elapsed time per iteration (s): 0.16 | learning rate: 5.835E-05 | global batch size: 256 | lm loss: 3.681745E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.569 | TFLOPs: 25.89 | +7: iteration 121050/ 173500 | consumed samples: 30988800 | consumed tokens: 63465062400 | elapsed time per iteration (s): 0.16 | learning rate: 5.833E-05 | global batch size: 256 | lm loss: 3.668483E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.427 | TFLOPs: 25.54 | +7: iteration 121060/ 173500 | consumed samples: 30991360 | consumed tokens: 63470305280 | elapsed time per iteration (s): 0.15 | learning rate: 5.832E-05 | global batch size: 256 | lm loss: 3.672272E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.974 | TFLOPs: 25.95 | +7: iteration 121070/ 173500 | consumed samples: 30993920 | consumed tokens: 63475548160 | elapsed time per iteration (s): 0.15 | learning rate: 5.831E-05 | global batch size: 256 | lm loss: 3.674393E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.670 | TFLOPs: 26.07 | +7: iteration 121080/ 173500 | consumed samples: 30996480 | consumed tokens: 63480791040 | elapsed time per iteration (s): 0.15 | learning rate: 5.829E-05 | global batch size: 256 | lm loss: 3.680074E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.028 | TFLOPs: 26.25 | +7: iteration 121090/ 173500 | consumed samples: 30999040 | consumed tokens: 63486033920 | elapsed time per iteration (s): 0.16 | learning rate: 5.828E-05 | global batch size: 256 | lm loss: 3.675143E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.224 | TFLOPs: 25.03 | +7: iteration 121100/ 173500 | consumed samples: 31001600 | consumed tokens: 63491276800 | elapsed time per iteration (s): 0.15 | learning rate: 5.827E-05 | global batch size: 256 | lm loss: 3.679859E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.249 | TFLOPs: 26.01 | +7: iteration 121110/ 173500 | consumed samples: 31004160 | consumed tokens: 63496519680 | elapsed time per iteration (s): 0.15 | learning rate: 5.825E-05 | global batch size: 256 | lm loss: 3.674305E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.988 | TFLOPs: 26.21 | +7: iteration 121120/ 173500 | consumed samples: 31006720 | consumed tokens: 63501762560 | elapsed time per iteration (s): 0.15 | learning rate: 5.824E-05 | global batch size: 256 | lm loss: 3.670039E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.173 | TFLOPs: 26.30 | +7: iteration 121130/ 173500 | consumed samples: 31009280 | consumed tokens: 63507005440 | elapsed time per iteration (s): 0.16 | learning rate: 5.823E-05 | global batch size: 256 | lm loss: 3.681409E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.760 | TFLOPs: 25.57 | +7: iteration 121140/ 173500 | consumed samples: 31011840 | consumed tokens: 63512248320 | elapsed time per iteration (s): 0.16 | learning rate: 5.821E-05 | global batch size: 256 | lm loss: 3.673217E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.391 | TFLOPs: 25.84 | +7: iteration 121150/ 173500 | consumed samples: 31014400 | consumed tokens: 63517491200 | elapsed time per iteration (s): 0.15 | learning rate: 5.820E-05 | global batch size: 256 | lm loss: 3.668763E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.384 | TFLOPs: 26.34 | +7: iteration 121160/ 173500 | consumed samples: 31016960 | consumed tokens: 63522734080 | elapsed time per iteration (s): 0.17 | learning rate: 5.818E-05 | global batch size: 256 | lm loss: 3.684045E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1527.482 | TFLOPs: 23.95 | +7: iteration 121170/ 173500 | consumed samples: 31019520 | consumed tokens: 63527976960 | elapsed time per iteration (s): 0.16 | learning rate: 5.817E-05 | global batch size: 256 | lm loss: 3.677138E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.946 | TFLOPs: 25.56 | +7: iteration 121180/ 173500 | consumed samples: 31022080 | consumed tokens: 63533219840 | elapsed time per iteration (s): 0.15 | learning rate: 5.816E-05 | global batch size: 256 | lm loss: 3.681214E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.995 | TFLOPs: 26.35 | +7: iteration 121190/ 173500 | consumed samples: 31024640 | consumed tokens: 63538462720 | elapsed time per iteration (s): 0.16 | learning rate: 5.814E-05 | global batch size: 256 | lm loss: 3.680448E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.319 | TFLOPs: 24.70 | +7: iteration 121200/ 173500 | consumed samples: 31027200 | consumed tokens: 63543705600 | elapsed time per iteration (s): 0.16 | learning rate: 5.813E-05 | global batch size: 256 | lm loss: 3.676861E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.113 | TFLOPs: 25.66 | +7: iteration 121210/ 173500 | consumed samples: 31029760 | consumed tokens: 63548948480 | elapsed time per iteration (s): 0.16 | learning rate: 5.812E-05 | global batch size: 256 | lm loss: 3.682153E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.476 | TFLOPs: 25.79 | +7: iteration 121220/ 173500 | consumed samples: 31032320 | consumed tokens: 63554191360 | elapsed time per iteration (s): 0.16 | learning rate: 5.810E-05 | global batch size: 256 | lm loss: 3.667086E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.696 | TFLOPs: 25.62 | +7: iteration 121230/ 173500 | consumed samples: 31034880 | consumed tokens: 63559434240 | elapsed time per iteration (s): 0.16 | learning rate: 5.809E-05 | global batch size: 256 | lm loss: 3.667583E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.166 | TFLOPs: 25.85 | +7: iteration 121240/ 173500 | consumed samples: 31037440 | consumed tokens: 63564677120 | elapsed time per iteration (s): 0.16 | learning rate: 5.808E-05 | global batch size: 256 | lm loss: 3.675291E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.395 | TFLOPs: 25.46 | +7: iteration 121250/ 173500 | consumed samples: 31040000 | consumed tokens: 63569920000 | elapsed time per iteration (s): 0.15 | learning rate: 5.806E-05 | global batch size: 256 | lm loss: 3.689437E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.095 | TFLOPs: 26.21 | +7: iteration 121260/ 173500 | consumed samples: 31042560 | consumed tokens: 63575162880 | elapsed time per iteration (s): 0.15 | learning rate: 5.805E-05 | global batch size: 256 | lm loss: 3.680145E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.485 | TFLOPs: 26.32 | +7: iteration 121270/ 173500 | consumed samples: 31045120 | consumed tokens: 63580405760 | elapsed time per iteration (s): 0.15 | learning rate: 5.804E-05 | global batch size: 256 | lm loss: 3.669439E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.991 | TFLOPs: 26.32 | +7: iteration 121280/ 173500 | consumed samples: 31047680 | consumed tokens: 63585648640 | elapsed time per iteration (s): 0.16 | learning rate: 5.802E-05 | global batch size: 256 | lm loss: 3.681479E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.744 | TFLOPs: 25.89 | +7: iteration 121290/ 173500 | consumed samples: 31050240 | consumed tokens: 63590891520 | elapsed time per iteration (s): 0.15 | learning rate: 5.801E-05 | global batch size: 256 | lm loss: 3.685227E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.668 | TFLOPs: 26.09 | +7: iteration 121300/ 173500 | consumed samples: 31052800 | consumed tokens: 63596134400 | elapsed time per iteration (s): 0.15 | learning rate: 5.800E-05 | global batch size: 256 | lm loss: 3.670603E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.777 | TFLOPs: 26.36 | +7: iteration 121310/ 173500 | consumed samples: 31055360 | consumed tokens: 63601377280 | elapsed time per iteration (s): 0.16 | learning rate: 5.798E-05 | global batch size: 256 | lm loss: 3.670608E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.716 | TFLOPs: 25.84 | +7: iteration 121320/ 173500 | consumed samples: 31057920 | consumed tokens: 63606620160 | elapsed time per iteration (s): 0.15 | learning rate: 5.797E-05 | global batch size: 256 | lm loss: 3.675433E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.346 | TFLOPs: 26.27 | +7: iteration 121330/ 173500 | consumed samples: 31060480 | consumed tokens: 63611863040 | elapsed time per iteration (s): 0.16 | learning rate: 5.796E-05 | global batch size: 256 | lm loss: 3.670723E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.094 | TFLOPs: 25.81 | +7: iteration 121340/ 173500 | consumed samples: 31063040 | consumed tokens: 63617105920 | elapsed time per iteration (s): 0.15 | learning rate: 5.794E-05 | global batch size: 256 | lm loss: 3.678907E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.011 | TFLOPs: 26.24 | +7: iteration 121350/ 173500 | consumed samples: 31065600 | consumed tokens: 63622348800 | elapsed time per iteration (s): 0.16 | learning rate: 5.793E-05 | global batch size: 256 | lm loss: 3.685620E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.887 | TFLOPs: 25.55 | +7: iteration 121360/ 173500 | consumed samples: 31068160 | consumed tokens: 63627591680 | elapsed time per iteration (s): 0.15 | learning rate: 5.792E-05 | global batch size: 256 | lm loss: 3.669866E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.880 | TFLOPs: 25.91 | +7: iteration 121370/ 173500 | consumed samples: 31070720 | consumed tokens: 63632834560 | elapsed time per iteration (s): 0.15 | learning rate: 5.790E-05 | global batch size: 256 | lm loss: 3.682089E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.128 | TFLOPs: 26.27 | +7: iteration 121380/ 173500 | consumed samples: 31073280 | consumed tokens: 63638077440 | elapsed time per iteration (s): 0.15 | learning rate: 5.789E-05 | global batch size: 256 | lm loss: 3.692893E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.249 | TFLOPs: 26.27 | +7: iteration 121390/ 173500 | consumed samples: 31075840 | consumed tokens: 63643320320 | elapsed time per iteration (s): 0.15 | learning rate: 5.788E-05 | global batch size: 256 | lm loss: 3.691043E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.875 | TFLOPs: 26.27 | +7: iteration 121400/ 173500 | consumed samples: 31078400 | consumed tokens: 63648563200 | elapsed time per iteration (s): 0.15 | learning rate: 5.786E-05 | global batch size: 256 | lm loss: 3.671580E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.713 | TFLOPs: 26.25 | +7: iteration 121410/ 173500 | consumed samples: 31080960 | consumed tokens: 63653806080 | elapsed time per iteration (s): 0.15 | learning rate: 5.785E-05 | global batch size: 256 | lm loss: 3.674470E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.237 | TFLOPs: 26.24 | +7: iteration 121420/ 173500 | consumed samples: 31083520 | consumed tokens: 63659048960 | elapsed time per iteration (s): 0.15 | learning rate: 5.784E-05 | global batch size: 256 | lm loss: 3.684389E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.192 | TFLOPs: 26.22 | +7: iteration 121430/ 173500 | consumed samples: 31086080 | consumed tokens: 63664291840 | elapsed time per iteration (s): 0.15 | learning rate: 5.782E-05 | global batch size: 256 | lm loss: 3.674624E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.846 | TFLOPs: 26.22 | +7: iteration 121440/ 173500 | consumed samples: 31088640 | consumed tokens: 63669534720 | elapsed time per iteration (s): 0.15 | learning rate: 5.781E-05 | global batch size: 256 | lm loss: 3.681474E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.488 | TFLOPs: 26.23 | +7: iteration 121450/ 173500 | consumed samples: 31091200 | consumed tokens: 63674777600 | elapsed time per iteration (s): 0.15 | learning rate: 5.780E-05 | global batch size: 256 | lm loss: 3.685865E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.248 | TFLOPs: 26.26 | +7: iteration 121460/ 173500 | consumed samples: 31093760 | consumed tokens: 63680020480 | elapsed time per iteration (s): 0.15 | learning rate: 5.778E-05 | global batch size: 256 | lm loss: 3.672509E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.846 | TFLOPs: 26.27 | +7: iteration 121470/ 173500 | consumed samples: 31096320 | consumed tokens: 63685263360 | elapsed time per iteration (s): 0.15 | learning rate: 5.777E-05 | global batch size: 256 | lm loss: 3.666620E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.875 | TFLOPs: 26.25 | +7: iteration 121480/ 173500 | consumed samples: 31098880 | consumed tokens: 63690506240 | elapsed time per iteration (s): 0.15 | learning rate: 5.776E-05 | global batch size: 256 | lm loss: 3.670601E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.006 | TFLOPs: 26.25 | +7: iteration 121490/ 173500 | consumed samples: 31101440 | consumed tokens: 63695749120 | elapsed time per iteration (s): 0.15 | learning rate: 5.774E-05 | global batch size: 256 | lm loss: 3.681588E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.989 | TFLOPs: 26.17 | +7: iteration 121500/ 173500 | consumed samples: 31104000 | consumed tokens: 63700992000 | elapsed time per iteration (s): 0.15 | learning rate: 5.773E-05 | global batch size: 256 | lm loss: 3.673964E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.858 | TFLOPs: 26.17 | +7: iteration 121510/ 173500 | consumed samples: 31106560 | consumed tokens: 63706234880 | elapsed time per iteration (s): 0.15 | learning rate: 5.771E-05 | global batch size: 256 | lm loss: 3.676701E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.619 | TFLOPs: 25.92 | +7: iteration 121520/ 173500 | consumed samples: 31109120 | consumed tokens: 63711477760 | elapsed time per iteration (s): 0.16 | learning rate: 5.770E-05 | global batch size: 256 | lm loss: 3.685454E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.654 | TFLOPs: 25.70 | +7: iteration 121530/ 173500 | consumed samples: 31111680 | consumed tokens: 63716720640 | elapsed time per iteration (s): 0.16 | learning rate: 5.769E-05 | global batch size: 256 | lm loss: 3.677930E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.937 | TFLOPs: 25.66 | +7: iteration 121540/ 173500 | consumed samples: 31114240 | consumed tokens: 63721963520 | elapsed time per iteration (s): 0.15 | learning rate: 5.767E-05 | global batch size: 256 | lm loss: 3.690657E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.122 | TFLOPs: 26.18 | +7: iteration 121550/ 173500 | consumed samples: 31116800 | consumed tokens: 63727206400 | elapsed time per iteration (s): 0.15 | learning rate: 5.766E-05 | global batch size: 256 | lm loss: 3.673148E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.734 | TFLOPs: 26.19 | +7: iteration 121560/ 173500 | consumed samples: 31119360 | consumed tokens: 63732449280 | elapsed time per iteration (s): 0.15 | learning rate: 5.765E-05 | global batch size: 256 | lm loss: 3.667278E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.769 | TFLOPs: 26.20 | +7: iteration 121570/ 173500 | consumed samples: 31121920 | consumed tokens: 63737692160 | elapsed time per iteration (s): 0.15 | learning rate: 5.763E-05 | global batch size: 256 | lm loss: 3.689950E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.930 | TFLOPs: 26.22 | +7: iteration 121580/ 173500 | consumed samples: 31124480 | consumed tokens: 63742935040 | elapsed time per iteration (s): 0.15 | learning rate: 5.762E-05 | global batch size: 256 | lm loss: 3.679237E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.953 | TFLOPs: 26.20 | +7: iteration 121590/ 173500 | consumed samples: 31127040 | consumed tokens: 63748177920 | elapsed time per iteration (s): 0.15 | learning rate: 5.761E-05 | global batch size: 256 | lm loss: 3.679798E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.681 | TFLOPs: 26.22 | +7: iteration 121600/ 173500 | consumed samples: 31129600 | consumed tokens: 63753420800 | elapsed time per iteration (s): 0.16 | learning rate: 5.759E-05 | global batch size: 256 | lm loss: 3.664984E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.263 | TFLOPs: 25.61 | +7: iteration 121610/ 173500 | consumed samples: 31132160 | consumed tokens: 63758663680 | elapsed time per iteration (s): 0.16 | learning rate: 5.758E-05 | global batch size: 256 | lm loss: 3.682784E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.399 | TFLOPs: 25.46 | +7: iteration 121620/ 173500 | consumed samples: 31134720 | consumed tokens: 63763906560 | elapsed time per iteration (s): 0.16 | learning rate: 5.757E-05 | global batch size: 256 | lm loss: 3.672698E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.132 | TFLOPs: 25.16 | +7: iteration 121630/ 173500 | consumed samples: 31137280 | consumed tokens: 63769149440 | elapsed time per iteration (s): 0.15 | learning rate: 5.755E-05 | global batch size: 256 | lm loss: 3.682161E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.157 | TFLOPs: 26.22 | +7: iteration 121640/ 173500 | consumed samples: 31139840 | consumed tokens: 63774392320 | elapsed time per iteration (s): 0.15 | learning rate: 5.754E-05 | global batch size: 256 | lm loss: 3.681554E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.988 | TFLOPs: 26.06 | +7: iteration 121650/ 173500 | consumed samples: 31142400 | consumed tokens: 63779635200 | elapsed time per iteration (s): 0.15 | learning rate: 5.753E-05 | global batch size: 256 | lm loss: 3.674962E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.518 | TFLOPs: 26.24 | +7: iteration 121660/ 173500 | consumed samples: 31144960 | consumed tokens: 63784878080 | elapsed time per iteration (s): 0.15 | learning rate: 5.751E-05 | global batch size: 256 | lm loss: 3.684742E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.861 | TFLOPs: 26.22 | +7: iteration 121670/ 173500 | consumed samples: 31147520 | consumed tokens: 63790120960 | elapsed time per iteration (s): 0.15 | learning rate: 5.750E-05 | global batch size: 256 | lm loss: 3.679432E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.129 | TFLOPs: 26.21 | +7: iteration 121680/ 173500 | consumed samples: 31150080 | consumed tokens: 63795363840 | elapsed time per iteration (s): 0.16 | learning rate: 5.749E-05 | global batch size: 256 | lm loss: 3.665202E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.257 | TFLOPs: 25.46 | +7: iteration 121690/ 173500 | consumed samples: 31152640 | consumed tokens: 63800606720 | elapsed time per iteration (s): 0.16 | learning rate: 5.747E-05 | global batch size: 256 | lm loss: 3.670565E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.221 | TFLOPs: 25.55 | +7: iteration 121700/ 173500 | consumed samples: 31155200 | consumed tokens: 63805849600 | elapsed time per iteration (s): 0.15 | learning rate: 5.746E-05 | global batch size: 256 | lm loss: 3.679177E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.758 | TFLOPs: 26.22 | +7: iteration 121710/ 173500 | consumed samples: 31157760 | consumed tokens: 63811092480 | elapsed time per iteration (s): 0.16 | learning rate: 5.745E-05 | global batch size: 256 | lm loss: 3.676154E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.044 | TFLOPs: 25.61 | +7: iteration 121720/ 173500 | consumed samples: 31160320 | consumed tokens: 63816335360 | elapsed time per iteration (s): 0.15 | learning rate: 5.743E-05 | global batch size: 256 | lm loss: 3.675712E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.307 | TFLOPs: 26.26 | +7: iteration 121730/ 173500 | consumed samples: 31162880 | consumed tokens: 63821578240 | elapsed time per iteration (s): 0.15 | learning rate: 5.742E-05 | global batch size: 256 | lm loss: 3.678492E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.755 | TFLOPs: 26.25 | +7: iteration 121740/ 173500 | consumed samples: 31165440 | consumed tokens: 63826821120 | elapsed time per iteration (s): 0.15 | learning rate: 5.741E-05 | global batch size: 256 | lm loss: 3.682238E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.104 | TFLOPs: 26.22 | +7: iteration 121750/ 173500 | consumed samples: 31168000 | consumed tokens: 63832064000 | elapsed time per iteration (s): 0.15 | learning rate: 5.739E-05 | global batch size: 256 | lm loss: 3.688289E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.843 | TFLOPs: 26.23 | +7: iteration 121760/ 173500 | consumed samples: 31170560 | consumed tokens: 63837306880 | elapsed time per iteration (s): 0.15 | learning rate: 5.738E-05 | global batch size: 256 | lm loss: 3.677757E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.828 | TFLOPs: 26.23 | +7: iteration 121770/ 173500 | consumed samples: 31173120 | consumed tokens: 63842549760 | elapsed time per iteration (s): 0.15 | learning rate: 5.737E-05 | global batch size: 256 | lm loss: 3.699118E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.386 | TFLOPs: 26.23 | +7: iteration 121780/ 173500 | consumed samples: 31175680 | consumed tokens: 63847792640 | elapsed time per iteration (s): 0.15 | learning rate: 5.735E-05 | global batch size: 256 | lm loss: 3.679321E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.705 | TFLOPs: 26.25 | +7: iteration 121790/ 173500 | consumed samples: 31178240 | consumed tokens: 63853035520 | elapsed time per iteration (s): 0.15 | learning rate: 5.734E-05 | global batch size: 256 | lm loss: 3.670428E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.026 | TFLOPs: 26.24 | +7: iteration 121800/ 173500 | consumed samples: 31180800 | consumed tokens: 63858278400 | elapsed time per iteration (s): 0.15 | learning rate: 5.733E-05 | global batch size: 256 | lm loss: 3.686074E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.243 | TFLOPs: 26.19 | +7: iteration 121810/ 173500 | consumed samples: 31183360 | consumed tokens: 63863521280 | elapsed time per iteration (s): 0.15 | learning rate: 5.731E-05 | global batch size: 256 | lm loss: 3.674779E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.495 | TFLOPs: 26.23 | +7: iteration 121820/ 173500 | consumed samples: 31185920 | consumed tokens: 63868764160 | elapsed time per iteration (s): 0.15 | learning rate: 5.730E-05 | global batch size: 256 | lm loss: 3.669542E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.207 | TFLOPs: 26.26 | +7: iteration 121830/ 173500 | consumed samples: 31188480 | consumed tokens: 63874007040 | elapsed time per iteration (s): 0.15 | learning rate: 5.729E-05 | global batch size: 256 | lm loss: 3.676693E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.099 | TFLOPs: 26.27 | +7: iteration 121840/ 173500 | consumed samples: 31191040 | consumed tokens: 63879249920 | elapsed time per iteration (s): 0.15 | learning rate: 5.727E-05 | global batch size: 256 | lm loss: 3.682108E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.834 | TFLOPs: 26.25 | +7: iteration 121850/ 173500 | consumed samples: 31193600 | consumed tokens: 63884492800 | elapsed time per iteration (s): 0.15 | learning rate: 5.726E-05 | global batch size: 256 | lm loss: 3.684575E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.218 | TFLOPs: 26.24 | +7: iteration 121860/ 173500 | consumed samples: 31196160 | consumed tokens: 63889735680 | elapsed time per iteration (s): 0.16 | learning rate: 5.725E-05 | global batch size: 256 | lm loss: 3.672455E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.070 | TFLOPs: 24.80 | +7: iteration 121870/ 173500 | consumed samples: 31198720 | consumed tokens: 63894978560 | elapsed time per iteration (s): 0.16 | learning rate: 5.723E-05 | global batch size: 256 | lm loss: 3.667918E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.651 | TFLOPs: 25.76 | +7: iteration 121880/ 173500 | consumed samples: 31201280 | consumed tokens: 63900221440 | elapsed time per iteration (s): 0.16 | learning rate: 5.722E-05 | global batch size: 256 | lm loss: 3.676637E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.796 | TFLOPs: 25.51 | +7: iteration 121890/ 173500 | consumed samples: 31203840 | consumed tokens: 63905464320 | elapsed time per iteration (s): 0.16 | learning rate: 5.721E-05 | global batch size: 256 | lm loss: 3.672793E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.248 | TFLOPs: 25.33 | +7: iteration 121900/ 173500 | consumed samples: 31206400 | consumed tokens: 63910707200 | elapsed time per iteration (s): 0.15 | learning rate: 5.719E-05 | global batch size: 256 | lm loss: 3.675889E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.123 | TFLOPs: 26.11 | +7: iteration 121910/ 173500 | consumed samples: 31208960 | consumed tokens: 63915950080 | elapsed time per iteration (s): 0.15 | learning rate: 5.718E-05 | global batch size: 256 | lm loss: 3.674643E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.266 | TFLOPs: 26.13 | +7: iteration 121920/ 173500 | consumed samples: 31211520 | consumed tokens: 63921192960 | elapsed time per iteration (s): 0.15 | learning rate: 5.717E-05 | global batch size: 256 | lm loss: 3.677487E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.600 | TFLOPs: 26.12 | +7: iteration 121930/ 173500 | consumed samples: 31214080 | consumed tokens: 63926435840 | elapsed time per iteration (s): 0.15 | learning rate: 5.715E-05 | global batch size: 256 | lm loss: 3.673913E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.517 | TFLOPs: 26.15 | +7: iteration 121940/ 173500 | consumed samples: 31216640 | consumed tokens: 63931678720 | elapsed time per iteration (s): 0.16 | learning rate: 5.714E-05 | global batch size: 256 | lm loss: 3.681971E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.697 | TFLOPs: 25.87 | +7: iteration 121950/ 173500 | consumed samples: 31219200 | consumed tokens: 63936921600 | elapsed time per iteration (s): 0.15 | learning rate: 5.713E-05 | global batch size: 256 | lm loss: 3.678454E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.966 | TFLOPs: 26.13 | +7: iteration 121960/ 173500 | consumed samples: 31221760 | consumed tokens: 63942164480 | elapsed time per iteration (s): 0.16 | learning rate: 5.711E-05 | global batch size: 256 | lm loss: 3.671113E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.728 | TFLOPs: 25.40 | +7: iteration 121970/ 173500 | consumed samples: 31224320 | consumed tokens: 63947407360 | elapsed time per iteration (s): 0.15 | learning rate: 5.710E-05 | global batch size: 256 | lm loss: 3.675125E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.496 | TFLOPs: 26.09 | +7: iteration 121980/ 173500 | consumed samples: 31226880 | consumed tokens: 63952650240 | elapsed time per iteration (s): 0.15 | learning rate: 5.709E-05 | global batch size: 256 | lm loss: 3.661013E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.527 | TFLOPs: 26.10 | +7: iteration 121990/ 173500 | consumed samples: 31229440 | consumed tokens: 63957893120 | elapsed time per iteration (s): 0.16 | learning rate: 5.707E-05 | global batch size: 256 | lm loss: 3.667739E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.953 | TFLOPs: 25.01 | +0: [2023-03-17 05:33:10,859] [INFO] [logging.py:68:log_dist] [Rank 0] step=122000, skipped=0, lr=[5.706057124448849e-05, 5.706057124448849e-05, 5.706057124448849e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 122000/ 173500 | consumed samples: 31232000 | consumed tokens: 63963136000 | elapsed time per iteration (s): 0.15 | learning rate: 5.706E-05 | global batch size: 256 | lm loss: 3.679557E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.372 | TFLOPs: 26.15 | +0: steps: 122000 loss: 3.6935 iter time (s): 0.154 samples/sec: 1657.004 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 122000 | lm loss value: 3.861035E+00 | lm loss PPL: 4.751449E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 122000 to checkpoints_44m91b100m +0: [2023-03-17 05:33:10,932] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step122000 is begin to save! +0: [2023-03-17 05:33:10,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:33:10,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:33:10,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:33:11,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:33:11,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:33:11,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:33:11,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:33:11,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:33:11,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:33:11,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:33:11,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:33:11,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:33:11,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:33:11,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:33:11,052] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:33:11,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:33:11,060] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:33:11,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:33:11,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:33:11,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:33:11,069] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step122000/mp_rank_00_model_states.pt +0: [2023-03-17 05:33:11,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:33:11,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:33:11,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:33:11,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +5: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +1: [2023-03-17 05:33:11,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +4: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +2: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +6: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +3: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +7: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:33:11,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step122000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:33:11,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step122000 is ready now! +0: successfully saved checkpoint at iteration 122000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.46 +7: iteration 122010/ 173500 | consumed samples: 31234560 | consumed tokens: 63968378880 | elapsed time per iteration (s): 0.18 | learning rate: 5.705E-05 | global batch size: 256 | lm loss: 3.677999E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.181 | TFLOPs: 22.01 | +7: iteration 122020/ 173500 | consumed samples: 31237120 | consumed tokens: 63973621760 | elapsed time per iteration (s): 0.16 | learning rate: 5.703E-05 | global batch size: 256 | lm loss: 3.679330E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.538 | TFLOPs: 25.15 | +7: iteration 122030/ 173500 | consumed samples: 31239680 | consumed tokens: 63978864640 | elapsed time per iteration (s): 0.15 | learning rate: 5.702E-05 | global batch size: 256 | lm loss: 3.678152E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.710 | TFLOPs: 25.93 | +7: iteration 122040/ 173500 | consumed samples: 31242240 | consumed tokens: 63984107520 | elapsed time per iteration (s): 0.16 | learning rate: 5.701E-05 | global batch size: 256 | lm loss: 3.668933E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.323 | TFLOPs: 25.90 | +7: iteration 122050/ 173500 | consumed samples: 31244800 | consumed tokens: 63989350400 | elapsed time per iteration (s): 0.15 | learning rate: 5.699E-05 | global batch size: 256 | lm loss: 3.673560E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.077 | TFLOPs: 25.97 | +7: iteration 122060/ 173500 | consumed samples: 31247360 | consumed tokens: 63994593280 | elapsed time per iteration (s): 0.15 | learning rate: 5.698E-05 | global batch size: 256 | lm loss: 3.663109E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.245 | TFLOPs: 26.22 | +7: iteration 122070/ 173500 | consumed samples: 31249920 | consumed tokens: 63999836160 | elapsed time per iteration (s): 0.15 | learning rate: 5.697E-05 | global batch size: 256 | lm loss: 3.677651E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.934 | TFLOPs: 26.24 | +7: iteration 122080/ 173500 | consumed samples: 31252480 | consumed tokens: 64005079040 | elapsed time per iteration (s): 0.15 | learning rate: 5.695E-05 | global batch size: 256 | lm loss: 3.679871E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.001 | TFLOPs: 25.99 | +7: iteration 122090/ 173500 | consumed samples: 31255040 | consumed tokens: 64010321920 | elapsed time per iteration (s): 0.15 | learning rate: 5.694E-05 | global batch size: 256 | lm loss: 3.693214E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.752 | TFLOPs: 26.23 | +7: iteration 122100/ 173500 | consumed samples: 31257600 | consumed tokens: 64015564800 | elapsed time per iteration (s): 0.15 | learning rate: 5.693E-05 | global batch size: 256 | lm loss: 3.676973E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.986 | TFLOPs: 26.25 | +7: iteration 122110/ 173500 | consumed samples: 31260160 | consumed tokens: 64020807680 | elapsed time per iteration (s): 0.15 | learning rate: 5.691E-05 | global batch size: 256 | lm loss: 3.684182E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.956 | TFLOPs: 26.22 | +7: iteration 122120/ 173500 | consumed samples: 31262720 | consumed tokens: 64026050560 | elapsed time per iteration (s): 0.15 | learning rate: 5.690E-05 | global batch size: 256 | lm loss: 3.683268E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.136 | TFLOPs: 26.24 | +7: iteration 122130/ 173500 | consumed samples: 31265280 | consumed tokens: 64031293440 | elapsed time per iteration (s): 0.16 | learning rate: 5.689E-05 | global batch size: 256 | lm loss: 3.675229E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.727 | TFLOPs: 25.76 | +7: iteration 122140/ 173500 | consumed samples: 31267840 | consumed tokens: 64036536320 | elapsed time per iteration (s): 0.15 | learning rate: 5.687E-05 | global batch size: 256 | lm loss: 3.684245E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.733 | TFLOPs: 26.01 | +7: iteration 122150/ 173500 | consumed samples: 31270400 | consumed tokens: 64041779200 | elapsed time per iteration (s): 0.15 | learning rate: 5.686E-05 | global batch size: 256 | lm loss: 3.674962E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.990 | TFLOPs: 26.22 | +7: iteration 122160/ 173500 | consumed samples: 31272960 | consumed tokens: 64047022080 | elapsed time per iteration (s): 0.16 | learning rate: 5.685E-05 | global batch size: 256 | lm loss: 3.687591E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.488 | TFLOPs: 25.41 | +7: iteration 122170/ 173500 | consumed samples: 31275520 | consumed tokens: 64052264960 | elapsed time per iteration (s): 0.15 | learning rate: 5.683E-05 | global batch size: 256 | lm loss: 3.665268E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.160 | TFLOPs: 26.24 | +7: iteration 122180/ 173500 | consumed samples: 31278080 | consumed tokens: 64057507840 | elapsed time per iteration (s): 0.15 | learning rate: 5.682E-05 | global batch size: 256 | lm loss: 3.676129E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.953 | TFLOPs: 26.24 | +7: iteration 122190/ 173500 | consumed samples: 31280640 | consumed tokens: 64062750720 | elapsed time per iteration (s): 0.15 | learning rate: 5.681E-05 | global batch size: 256 | lm loss: 3.672984E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.238 | TFLOPs: 26.16 | +7: iteration 122200/ 173500 | consumed samples: 31283200 | consumed tokens: 64067993600 | elapsed time per iteration (s): 0.15 | learning rate: 5.679E-05 | global batch size: 256 | lm loss: 3.687317E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.220 | TFLOPs: 26.15 | +7: iteration 122210/ 173500 | consumed samples: 31285760 | consumed tokens: 64073236480 | elapsed time per iteration (s): 0.15 | learning rate: 5.678E-05 | global batch size: 256 | lm loss: 3.684930E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.196 | TFLOPs: 26.16 | +7: iteration 122220/ 173500 | consumed samples: 31288320 | consumed tokens: 64078479360 | elapsed time per iteration (s): 0.15 | learning rate: 5.677E-05 | global batch size: 256 | lm loss: 3.682452E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.160 | TFLOPs: 26.13 | +7: iteration 122230/ 173500 | consumed samples: 31290880 | consumed tokens: 64083722240 | elapsed time per iteration (s): 0.15 | learning rate: 5.675E-05 | global batch size: 256 | lm loss: 3.675346E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.483 | TFLOPs: 26.07 | +7: iteration 122240/ 173500 | consumed samples: 31293440 | consumed tokens: 64088965120 | elapsed time per iteration (s): 0.15 | learning rate: 5.674E-05 | global batch size: 256 | lm loss: 3.679506E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.659 | TFLOPs: 26.14 | +7: iteration 122250/ 173500 | consumed samples: 31296000 | consumed tokens: 64094208000 | elapsed time per iteration (s): 0.15 | learning rate: 5.673E-05 | global batch size: 256 | lm loss: 3.681507E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.720 | TFLOPs: 26.15 | +7: iteration 122260/ 173500 | consumed samples: 31298560 | consumed tokens: 64099450880 | elapsed time per iteration (s): 0.15 | learning rate: 5.672E-05 | global batch size: 256 | lm loss: 3.672216E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.363 | TFLOPs: 26.15 | +7: iteration 122270/ 173500 | consumed samples: 31301120 | consumed tokens: 64104693760 | elapsed time per iteration (s): 0.16 | learning rate: 5.670E-05 | global batch size: 256 | lm loss: 3.675519E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.137 | TFLOPs: 24.97 | +7: iteration 122280/ 173500 | consumed samples: 31303680 | consumed tokens: 64109936640 | elapsed time per iteration (s): 0.15 | learning rate: 5.669E-05 | global batch size: 256 | lm loss: 3.678958E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.595 | TFLOPs: 26.17 | +7: iteration 122290/ 173500 | consumed samples: 31306240 | consumed tokens: 64115179520 | elapsed time per iteration (s): 0.15 | learning rate: 5.668E-05 | global batch size: 256 | lm loss: 3.663400E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.014 | TFLOPs: 26.17 | +7: iteration 122300/ 173500 | consumed samples: 31308800 | consumed tokens: 64120422400 | elapsed time per iteration (s): 0.16 | learning rate: 5.666E-05 | global batch size: 256 | lm loss: 3.678895E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.044 | TFLOPs: 25.77 | +7: iteration 122310/ 173500 | consumed samples: 31311360 | consumed tokens: 64125665280 | elapsed time per iteration (s): 0.15 | learning rate: 5.665E-05 | global batch size: 256 | lm loss: 3.684896E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.688 | TFLOPs: 26.14 | +7: iteration 122320/ 173500 | consumed samples: 31313920 | consumed tokens: 64130908160 | elapsed time per iteration (s): 0.16 | learning rate: 5.664E-05 | global batch size: 256 | lm loss: 3.692593E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.839 | TFLOPs: 25.36 | +7: iteration 122330/ 173500 | consumed samples: 31316480 | consumed tokens: 64136151040 | elapsed time per iteration (s): 0.15 | learning rate: 5.662E-05 | global batch size: 256 | lm loss: 3.682552E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.412 | TFLOPs: 26.06 | +7: iteration 122340/ 173500 | consumed samples: 31319040 | consumed tokens: 64141393920 | elapsed time per iteration (s): 0.15 | learning rate: 5.661E-05 | global batch size: 256 | lm loss: 3.678902E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.522 | TFLOPs: 26.03 | +7: iteration 122350/ 173500 | consumed samples: 31321600 | consumed tokens: 64146636800 | elapsed time per iteration (s): 0.15 | learning rate: 5.660E-05 | global batch size: 256 | lm loss: 3.686418E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.440 | TFLOPs: 26.24 | +7: iteration 122360/ 173500 | consumed samples: 31324160 | consumed tokens: 64151879680 | elapsed time per iteration (s): 0.15 | learning rate: 5.658E-05 | global batch size: 256 | lm loss: 3.668834E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.254 | TFLOPs: 26.19 | +7: iteration 122370/ 173500 | consumed samples: 31326720 | consumed tokens: 64157122560 | elapsed time per iteration (s): 0.15 | learning rate: 5.657E-05 | global batch size: 256 | lm loss: 3.663123E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.706 | TFLOPs: 26.19 | +7: iteration 122380/ 173500 | consumed samples: 31329280 | consumed tokens: 64162365440 | elapsed time per iteration (s): 0.15 | learning rate: 5.656E-05 | global batch size: 256 | lm loss: 3.681855E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.432 | TFLOPs: 26.18 | +7: iteration 122390/ 173500 | consumed samples: 31331840 | consumed tokens: 64167608320 | elapsed time per iteration (s): 0.15 | learning rate: 5.654E-05 | global batch size: 256 | lm loss: 3.682798E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.228 | TFLOPs: 26.19 | +7: iteration 122400/ 173500 | consumed samples: 31334400 | consumed tokens: 64172851200 | elapsed time per iteration (s): 0.15 | learning rate: 5.653E-05 | global batch size: 256 | lm loss: 3.672373E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.149 | TFLOPs: 26.21 | +7: iteration 122410/ 173500 | consumed samples: 31336960 | consumed tokens: 64178094080 | elapsed time per iteration (s): 0.15 | learning rate: 5.652E-05 | global batch size: 256 | lm loss: 3.683762E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.351 | TFLOPs: 26.24 | +7: iteration 122420/ 173500 | consumed samples: 31339520 | consumed tokens: 64183336960 | elapsed time per iteration (s): 0.15 | learning rate: 5.650E-05 | global batch size: 256 | lm loss: 3.665952E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.179 | TFLOPs: 26.24 | +7: iteration 122430/ 173500 | consumed samples: 31342080 | consumed tokens: 64188579840 | elapsed time per iteration (s): 0.15 | learning rate: 5.649E-05 | global batch size: 256 | lm loss: 3.683741E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.682 | TFLOPs: 26.23 | +7: iteration 122440/ 173500 | consumed samples: 31344640 | consumed tokens: 64193822720 | elapsed time per iteration (s): 0.15 | learning rate: 5.648E-05 | global batch size: 256 | lm loss: 3.679039E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.158 | TFLOPs: 26.24 | +7: iteration 122450/ 173500 | consumed samples: 31347200 | consumed tokens: 64199065600 | elapsed time per iteration (s): 0.15 | learning rate: 5.646E-05 | global batch size: 256 | lm loss: 3.678347E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.844 | TFLOPs: 26.22 | +7: iteration 122460/ 173500 | consumed samples: 31349760 | consumed tokens: 64204308480 | elapsed time per iteration (s): 0.16 | learning rate: 5.645E-05 | global batch size: 256 | lm loss: 3.663725E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.609 | TFLOPs: 25.74 | +7: iteration 122470/ 173500 | consumed samples: 31352320 | consumed tokens: 64209551360 | elapsed time per iteration (s): 0.15 | learning rate: 5.644E-05 | global batch size: 256 | lm loss: 3.682863E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.168 | TFLOPs: 26.24 | +7: iteration 122480/ 173500 | consumed samples: 31354880 | consumed tokens: 64214794240 | elapsed time per iteration (s): 0.15 | learning rate: 5.642E-05 | global batch size: 256 | lm loss: 3.683560E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.240 | TFLOPs: 26.24 | +7: iteration 122490/ 173500 | consumed samples: 31357440 | consumed tokens: 64220037120 | elapsed time per iteration (s): 0.15 | learning rate: 5.641E-05 | global batch size: 256 | lm loss: 3.679979E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.328 | TFLOPs: 26.24 | +7: iteration 122500/ 173500 | consumed samples: 31360000 | consumed tokens: 64225280000 | elapsed time per iteration (s): 0.15 | learning rate: 5.640E-05 | global batch size: 256 | lm loss: 3.668530E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.130 | TFLOPs: 26.25 | +7: iteration 122510/ 173500 | consumed samples: 31362560 | consumed tokens: 64230522880 | elapsed time per iteration (s): 0.16 | learning rate: 5.638E-05 | global batch size: 256 | lm loss: 3.672760E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.622 | TFLOPs: 25.70 | +7: iteration 122520/ 173500 | consumed samples: 31365120 | consumed tokens: 64235765760 | elapsed time per iteration (s): 0.16 | learning rate: 5.637E-05 | global batch size: 256 | lm loss: 3.666840E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.445 | TFLOPs: 25.57 | +7: iteration 122530/ 173500 | consumed samples: 31367680 | consumed tokens: 64241008640 | elapsed time per iteration (s): 0.16 | learning rate: 5.636E-05 | global batch size: 256 | lm loss: 3.676612E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.504 | TFLOPs: 25.54 | +7: iteration 122540/ 173500 | consumed samples: 31370240 | consumed tokens: 64246251520 | elapsed time per iteration (s): 0.16 | learning rate: 5.634E-05 | global batch size: 256 | lm loss: 3.685424E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.902 | TFLOPs: 25.76 | +7: iteration 122550/ 173500 | consumed samples: 31372800 | consumed tokens: 64251494400 | elapsed time per iteration (s): 0.16 | learning rate: 5.633E-05 | global batch size: 256 | lm loss: 3.679008E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.716 | TFLOPs: 25.70 | +7: iteration 122560/ 173500 | consumed samples: 31375360 | consumed tokens: 64256737280 | elapsed time per iteration (s): 0.15 | learning rate: 5.632E-05 | global batch size: 256 | lm loss: 3.686086E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.982 | TFLOPs: 25.95 | +7: iteration 122570/ 173500 | consumed samples: 31377920 | consumed tokens: 64261980160 | elapsed time per iteration (s): 0.15 | learning rate: 5.630E-05 | global batch size: 256 | lm loss: 3.679998E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.759 | TFLOPs: 26.19 | +7: iteration 122580/ 173500 | consumed samples: 31380480 | consumed tokens: 64267223040 | elapsed time per iteration (s): 0.16 | learning rate: 5.629E-05 | global batch size: 256 | lm loss: 3.682289E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.378 | TFLOPs: 25.60 | +7: iteration 122590/ 173500 | consumed samples: 31383040 | consumed tokens: 64272465920 | elapsed time per iteration (s): 0.16 | learning rate: 5.628E-05 | global batch size: 256 | lm loss: 3.676567E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.182 | TFLOPs: 25.38 | +7: iteration 122600/ 173500 | consumed samples: 31385600 | consumed tokens: 64277708800 | elapsed time per iteration (s): 0.16 | learning rate: 5.627E-05 | global batch size: 256 | lm loss: 3.670962E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.614 | TFLOPs: 24.94 | +7: iteration 122610/ 173500 | consumed samples: 31388160 | consumed tokens: 64282951680 | elapsed time per iteration (s): 0.15 | learning rate: 5.625E-05 | global batch size: 256 | lm loss: 3.680231E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.550 | TFLOPs: 26.14 | +7: iteration 122620/ 173500 | consumed samples: 31390720 | consumed tokens: 64288194560 | elapsed time per iteration (s): 0.15 | learning rate: 5.624E-05 | global batch size: 256 | lm loss: 3.675423E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.872 | TFLOPs: 26.00 | +7: iteration 122630/ 173500 | consumed samples: 31393280 | consumed tokens: 64293437440 | elapsed time per iteration (s): 0.15 | learning rate: 5.623E-05 | global batch size: 256 | lm loss: 3.695036E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.883 | TFLOPs: 26.34 | +7: iteration 122640/ 173500 | consumed samples: 31395840 | consumed tokens: 64298680320 | elapsed time per iteration (s): 0.15 | learning rate: 5.621E-05 | global batch size: 256 | lm loss: 3.658438E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.841 | TFLOPs: 26.00 | +7: iteration 122650/ 173500 | consumed samples: 31398400 | consumed tokens: 64303923200 | elapsed time per iteration (s): 0.15 | learning rate: 5.620E-05 | global batch size: 256 | lm loss: 3.678025E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.700 | TFLOPs: 26.17 | +7: iteration 122660/ 173500 | consumed samples: 31400960 | consumed tokens: 64309166080 | elapsed time per iteration (s): 0.15 | learning rate: 5.619E-05 | global batch size: 256 | lm loss: 3.686093E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.785 | TFLOPs: 25.98 | +7: iteration 122670/ 173500 | consumed samples: 31403520 | consumed tokens: 64314408960 | elapsed time per iteration (s): 0.15 | learning rate: 5.617E-05 | global batch size: 256 | lm loss: 3.679105E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.410 | TFLOPs: 26.01 | +7: iteration 122680/ 173500 | consumed samples: 31406080 | consumed tokens: 64319651840 | elapsed time per iteration (s): 0.15 | learning rate: 5.616E-05 | global batch size: 256 | lm loss: 3.677894E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.243 | TFLOPs: 26.33 | +7: iteration 122690/ 173500 | consumed samples: 31408640 | consumed tokens: 64324894720 | elapsed time per iteration (s): 0.16 | learning rate: 5.615E-05 | global batch size: 256 | lm loss: 3.672497E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.511 | TFLOPs: 25.21 | +7: iteration 122700/ 173500 | consumed samples: 31411200 | consumed tokens: 64330137600 | elapsed time per iteration (s): 0.15 | learning rate: 5.613E-05 | global batch size: 256 | lm loss: 3.678650E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.156 | TFLOPs: 26.35 | +7: iteration 122710/ 173500 | consumed samples: 31413760 | consumed tokens: 64335380480 | elapsed time per iteration (s): 0.15 | learning rate: 5.612E-05 | global batch size: 256 | lm loss: 3.678299E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.127 | TFLOPs: 26.35 | +7: iteration 122720/ 173500 | consumed samples: 31416320 | consumed tokens: 64340623360 | elapsed time per iteration (s): 0.15 | learning rate: 5.611E-05 | global batch size: 256 | lm loss: 3.664410E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.252 | TFLOPs: 26.37 | +7: iteration 122730/ 173500 | consumed samples: 31418880 | consumed tokens: 64345866240 | elapsed time per iteration (s): 0.15 | learning rate: 5.609E-05 | global batch size: 256 | lm loss: 3.661728E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.903 | TFLOPs: 26.35 | +7: iteration 122740/ 173500 | consumed samples: 31421440 | consumed tokens: 64351109120 | elapsed time per iteration (s): 0.15 | learning rate: 5.608E-05 | global batch size: 256 | lm loss: 3.684703E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.062 | TFLOPs: 26.35 | +7: iteration 122750/ 173500 | consumed samples: 31424000 | consumed tokens: 64356352000 | elapsed time per iteration (s): 0.15 | learning rate: 5.607E-05 | global batch size: 256 | lm loss: 3.688079E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.096 | TFLOPs: 26.16 | +7: iteration 122760/ 173500 | consumed samples: 31426560 | consumed tokens: 64361594880 | elapsed time per iteration (s): 0.16 | learning rate: 5.605E-05 | global batch size: 256 | lm loss: 3.682891E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.217 | TFLOPs: 25.33 | +7: iteration 122770/ 173500 | consumed samples: 31429120 | consumed tokens: 64366837760 | elapsed time per iteration (s): 0.16 | learning rate: 5.604E-05 | global batch size: 256 | lm loss: 3.686002E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.956 | TFLOPs: 25.84 | +7: iteration 122780/ 173500 | consumed samples: 31431680 | consumed tokens: 64372080640 | elapsed time per iteration (s): 0.15 | learning rate: 5.603E-05 | global batch size: 256 | lm loss: 3.684515E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.019 | TFLOPs: 26.35 | +7: iteration 122790/ 173500 | consumed samples: 31434240 | consumed tokens: 64377323520 | elapsed time per iteration (s): 0.15 | learning rate: 5.601E-05 | global batch size: 256 | lm loss: 3.677822E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.578 | TFLOPs: 25.93 | +7: iteration 122800/ 173500 | consumed samples: 31436800 | consumed tokens: 64382566400 | elapsed time per iteration (s): 0.15 | learning rate: 5.600E-05 | global batch size: 256 | lm loss: 3.670693E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.790 | TFLOPs: 26.30 | +7: iteration 122810/ 173500 | consumed samples: 31439360 | consumed tokens: 64387809280 | elapsed time per iteration (s): 0.15 | learning rate: 5.599E-05 | global batch size: 256 | lm loss: 3.675938E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.259 | TFLOPs: 26.33 | +7: iteration 122820/ 173500 | consumed samples: 31441920 | consumed tokens: 64393052160 | elapsed time per iteration (s): 0.15 | learning rate: 5.597E-05 | global batch size: 256 | lm loss: 3.681721E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.384 | TFLOPs: 26.31 | +7: iteration 122830/ 173500 | consumed samples: 31444480 | consumed tokens: 64398295040 | elapsed time per iteration (s): 0.15 | learning rate: 5.596E-05 | global batch size: 256 | lm loss: 3.666802E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.695 | TFLOPs: 26.31 | +7: iteration 122840/ 173500 | consumed samples: 31447040 | consumed tokens: 64403537920 | elapsed time per iteration (s): 0.16 | learning rate: 5.595E-05 | global batch size: 256 | lm loss: 3.686515E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.016 | TFLOPs: 25.06 | +7: iteration 122850/ 173500 | consumed samples: 31449600 | consumed tokens: 64408780800 | elapsed time per iteration (s): 0.16 | learning rate: 5.594E-05 | global batch size: 256 | lm loss: 3.676641E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.543 | TFLOPs: 25.26 | +7: iteration 122860/ 173500 | consumed samples: 31452160 | consumed tokens: 64414023680 | elapsed time per iteration (s): 0.15 | learning rate: 5.592E-05 | global batch size: 256 | lm loss: 3.662204E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.510 | TFLOPs: 26.34 | +7: iteration 122870/ 173500 | consumed samples: 31454720 | consumed tokens: 64419266560 | elapsed time per iteration (s): 0.15 | learning rate: 5.591E-05 | global batch size: 256 | lm loss: 3.671453E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.111 | TFLOPs: 26.35 | +7: iteration 122880/ 173500 | consumed samples: 31457280 | consumed tokens: 64424509440 | elapsed time per iteration (s): 0.15 | learning rate: 5.590E-05 | global batch size: 256 | lm loss: 3.679167E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.493 | TFLOPs: 25.95 | +7: iteration 122890/ 173500 | consumed samples: 31459840 | consumed tokens: 64429752320 | elapsed time per iteration (s): 0.16 | learning rate: 5.588E-05 | global batch size: 256 | lm loss: 3.675232E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.885 | TFLOPs: 25.22 | +7: iteration 122900/ 173500 | consumed samples: 31462400 | consumed tokens: 64434995200 | elapsed time per iteration (s): 0.15 | learning rate: 5.587E-05 | global batch size: 256 | lm loss: 3.675400E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.276 | TFLOPs: 25.97 | +7: iteration 122910/ 173500 | consumed samples: 31464960 | consumed tokens: 64440238080 | elapsed time per iteration (s): 0.15 | learning rate: 5.586E-05 | global batch size: 256 | lm loss: 3.680836E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.512 | TFLOPs: 26.32 | +7: iteration 122920/ 173500 | consumed samples: 31467520 | consumed tokens: 64445480960 | elapsed time per iteration (s): 0.15 | learning rate: 5.584E-05 | global batch size: 256 | lm loss: 3.675300E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.328 | TFLOPs: 26.29 | +7: iteration 122930/ 173500 | consumed samples: 31470080 | consumed tokens: 64450723840 | elapsed time per iteration (s): 0.16 | learning rate: 5.583E-05 | global batch size: 256 | lm loss: 3.672123E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.785 | TFLOPs: 25.54 | +7: iteration 122940/ 173500 | consumed samples: 31472640 | consumed tokens: 64455966720 | elapsed time per iteration (s): 0.15 | learning rate: 5.582E-05 | global batch size: 256 | lm loss: 3.680842E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.525 | TFLOPs: 25.98 | +7: iteration 122950/ 173500 | consumed samples: 31475200 | consumed tokens: 64461209600 | elapsed time per iteration (s): 0.15 | learning rate: 5.580E-05 | global batch size: 256 | lm loss: 3.672833E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.641 | TFLOPs: 26.33 | +7: iteration 122960/ 173500 | consumed samples: 31477760 | consumed tokens: 64466452480 | elapsed time per iteration (s): 0.16 | learning rate: 5.579E-05 | global batch size: 256 | lm loss: 3.671219E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.163 | TFLOPs: 25.53 | +7: iteration 122970/ 173500 | consumed samples: 31480320 | consumed tokens: 64471695360 | elapsed time per iteration (s): 0.16 | learning rate: 5.578E-05 | global batch size: 256 | lm loss: 3.678689E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.570 | TFLOPs: 25.63 | +7: iteration 122980/ 173500 | consumed samples: 31482880 | consumed tokens: 64476938240 | elapsed time per iteration (s): 0.16 | learning rate: 5.576E-05 | global batch size: 256 | lm loss: 3.677643E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.060 | TFLOPs: 25.85 | +7: iteration 122990/ 173500 | consumed samples: 31485440 | consumed tokens: 64482181120 | elapsed time per iteration (s): 0.15 | learning rate: 5.575E-05 | global batch size: 256 | lm loss: 3.673430E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.885 | TFLOPs: 26.16 | +7: iteration 123000/ 173500 | consumed samples: 31488000 | consumed tokens: 64487424000 | elapsed time per iteration (s): 0.16 | learning rate: 5.574E-05 | global batch size: 256 | lm loss: 3.681591E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.355 | TFLOPs: 25.66 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 123000 | lm loss value: 3.859593E+00 | lm loss PPL: 4.744606E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 123000 to checkpoints_44m91b100m +0: [2023-03-17 05:35:45,658] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step123000 is begin to save! +0: [2023-03-17 05:35:45,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:35:45,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:35:45,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:35:45,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:35:45,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:35:45,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:35:45,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:35:45,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:35:45,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:35:45,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:35:45,764] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:35:45,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:35:45,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:35:45,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:35:45,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:35:45,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:35:45,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:35:45,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:35:45,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:35:45,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:35:45,798] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step123000/mp_rank_00_model_states.pt +0: [2023-03-17 05:35:45,798] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:35:45,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:35:45,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:35:45,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:35:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +2: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +5: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +6: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +4: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +7: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +1: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:35:45,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:35:45,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +3: [2023-03-17 05:35:45,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:35:45,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step123000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:35:45,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step123000 is ready now! +0: successfully saved checkpoint at iteration 123000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.74 +7: iteration 123010/ 173500 | consumed samples: 31490560 | consumed tokens: 64492666880 | elapsed time per iteration (s): 0.18 | learning rate: 5.573E-05 | global batch size: 256 | lm loss: 3.676225E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.740 | TFLOPs: 22.59 | +7: iteration 123020/ 173500 | consumed samples: 31493120 | consumed tokens: 64497909760 | elapsed time per iteration (s): 0.15 | learning rate: 5.571E-05 | global batch size: 256 | lm loss: 3.676102E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.976 | TFLOPs: 26.21 | +7: iteration 123030/ 173500 | consumed samples: 31495680 | consumed tokens: 64503152640 | elapsed time per iteration (s): 0.15 | learning rate: 5.570E-05 | global batch size: 256 | lm loss: 3.683731E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.557 | TFLOPs: 26.23 | +7: iteration 123040/ 173500 | consumed samples: 31498240 | consumed tokens: 64508395520 | elapsed time per iteration (s): 0.15 | learning rate: 5.569E-05 | global batch size: 256 | lm loss: 3.675874E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.761 | TFLOPs: 25.90 | +7: iteration 123050/ 173500 | consumed samples: 31500800 | consumed tokens: 64513638400 | elapsed time per iteration (s): 0.16 | learning rate: 5.567E-05 | global batch size: 256 | lm loss: 3.674708E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.715 | TFLOPs: 25.34 | +7: iteration 123060/ 173500 | consumed samples: 31503360 | consumed tokens: 64518881280 | elapsed time per iteration (s): 0.16 | learning rate: 5.566E-05 | global batch size: 256 | lm loss: 3.674041E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.683 | TFLOPs: 25.32 | +7: iteration 123070/ 173500 | consumed samples: 31505920 | consumed tokens: 64524124160 | elapsed time per iteration (s): 0.15 | learning rate: 5.565E-05 | global batch size: 256 | lm loss: 3.680975E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.595 | TFLOPs: 26.11 | +7: iteration 123080/ 173500 | consumed samples: 31508480 | consumed tokens: 64529367040 | elapsed time per iteration (s): 0.16 | learning rate: 5.563E-05 | global batch size: 256 | lm loss: 3.687738E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.711 | TFLOPs: 25.81 | +7: iteration 123090/ 173500 | consumed samples: 31511040 | consumed tokens: 64534609920 | elapsed time per iteration (s): 0.16 | learning rate: 5.562E-05 | global batch size: 256 | lm loss: 3.674918E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.155 | TFLOPs: 25.52 | +7: iteration 123100/ 173500 | consumed samples: 31513600 | consumed tokens: 64539852800 | elapsed time per iteration (s): 0.16 | learning rate: 5.561E-05 | global batch size: 256 | lm loss: 3.672952E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.051 | TFLOPs: 24.59 | +7: iteration 123110/ 173500 | consumed samples: 31516160 | consumed tokens: 64545095680 | elapsed time per iteration (s): 0.15 | learning rate: 5.559E-05 | global batch size: 256 | lm loss: 3.663752E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.545 | TFLOPs: 25.93 | +7: iteration 123120/ 173500 | consumed samples: 31518720 | consumed tokens: 64550338560 | elapsed time per iteration (s): 0.15 | learning rate: 5.558E-05 | global batch size: 256 | lm loss: 3.682257E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.184 | TFLOPs: 26.26 | +7: iteration 123130/ 173500 | consumed samples: 31521280 | consumed tokens: 64555581440 | elapsed time per iteration (s): 0.15 | learning rate: 5.557E-05 | global batch size: 256 | lm loss: 3.683707E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.694 | TFLOPs: 26.09 | +7: iteration 123140/ 173500 | consumed samples: 31523840 | consumed tokens: 64560824320 | elapsed time per iteration (s): 0.15 | learning rate: 5.555E-05 | global batch size: 256 | lm loss: 3.681145E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.176 | TFLOPs: 26.24 | +7: iteration 123150/ 173500 | consumed samples: 31526400 | consumed tokens: 64566067200 | elapsed time per iteration (s): 0.15 | learning rate: 5.554E-05 | global batch size: 256 | lm loss: 3.682193E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.646 | TFLOPs: 26.23 | +7: iteration 123160/ 173500 | consumed samples: 31528960 | consumed tokens: 64571310080 | elapsed time per iteration (s): 0.15 | learning rate: 5.553E-05 | global batch size: 256 | lm loss: 3.675234E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.270 | TFLOPs: 26.23 | +7: iteration 123170/ 173500 | consumed samples: 31531520 | consumed tokens: 64576552960 | elapsed time per iteration (s): 0.16 | learning rate: 5.552E-05 | global batch size: 256 | lm loss: 3.688553E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.312 | TFLOPs: 25.76 | +7: iteration 123180/ 173500 | consumed samples: 31534080 | consumed tokens: 64581795840 | elapsed time per iteration (s): 0.15 | learning rate: 5.550E-05 | global batch size: 256 | lm loss: 3.676767E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.686 | TFLOPs: 26.25 | +7: iteration 123190/ 173500 | consumed samples: 31536640 | consumed tokens: 64587038720 | elapsed time per iteration (s): 0.16 | learning rate: 5.549E-05 | global batch size: 256 | lm loss: 3.674553E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.043 | TFLOPs: 25.74 | +7: iteration 123200/ 173500 | consumed samples: 31539200 | consumed tokens: 64592281600 | elapsed time per iteration (s): 0.15 | learning rate: 5.548E-05 | global batch size: 256 | lm loss: 3.680494E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.447 | TFLOPs: 26.21 | +7: iteration 123210/ 173500 | consumed samples: 31541760 | consumed tokens: 64597524480 | elapsed time per iteration (s): 0.15 | learning rate: 5.546E-05 | global batch size: 256 | lm loss: 3.670100E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.728 | TFLOPs: 26.17 | +7: iteration 123220/ 173500 | consumed samples: 31544320 | consumed tokens: 64602767360 | elapsed time per iteration (s): 0.16 | learning rate: 5.545E-05 | global batch size: 256 | lm loss: 3.687173E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.890 | TFLOPs: 25.61 | +7: iteration 123230/ 173500 | consumed samples: 31546880 | consumed tokens: 64608010240 | elapsed time per iteration (s): 0.16 | learning rate: 5.544E-05 | global batch size: 256 | lm loss: 3.673745E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.166 | TFLOPs: 25.78 | +7: iteration 123240/ 173500 | consumed samples: 31549440 | consumed tokens: 64613253120 | elapsed time per iteration (s): 0.15 | learning rate: 5.542E-05 | global batch size: 256 | lm loss: 3.681715E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.067 | TFLOPs: 26.22 | +7: iteration 123250/ 173500 | consumed samples: 31552000 | consumed tokens: 64618496000 | elapsed time per iteration (s): 0.16 | learning rate: 5.541E-05 | global batch size: 256 | lm loss: 3.680923E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.805 | TFLOPs: 25.70 | +7: iteration 123260/ 173500 | consumed samples: 31554560 | consumed tokens: 64623738880 | elapsed time per iteration (s): 0.15 | learning rate: 5.540E-05 | global batch size: 256 | lm loss: 3.666634E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.336 | TFLOPs: 26.21 | +7: iteration 123270/ 173500 | consumed samples: 31557120 | consumed tokens: 64628981760 | elapsed time per iteration (s): 0.16 | learning rate: 5.538E-05 | global batch size: 256 | lm loss: 3.683375E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.586 | TFLOPs: 25.84 | +7: iteration 123280/ 173500 | consumed samples: 31559680 | consumed tokens: 64634224640 | elapsed time per iteration (s): 0.16 | learning rate: 5.537E-05 | global batch size: 256 | lm loss: 3.678047E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.450 | TFLOPs: 25.32 | +7: iteration 123290/ 173500 | consumed samples: 31562240 | consumed tokens: 64639467520 | elapsed time per iteration (s): 0.16 | learning rate: 5.536E-05 | global batch size: 256 | lm loss: 3.687284E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.520 | TFLOPs: 25.29 | +7: iteration 123300/ 173500 | consumed samples: 31564800 | consumed tokens: 64644710400 | elapsed time per iteration (s): 0.15 | learning rate: 5.535E-05 | global batch size: 256 | lm loss: 3.690045E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.744 | TFLOPs: 25.90 | +7: iteration 123310/ 173500 | consumed samples: 31567360 | consumed tokens: 64649953280 | elapsed time per iteration (s): 0.15 | learning rate: 5.533E-05 | global batch size: 256 | lm loss: 3.680501E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.105 | TFLOPs: 26.24 | +7: iteration 123320/ 173500 | consumed samples: 31569920 | consumed tokens: 64655196160 | elapsed time per iteration (s): 0.16 | learning rate: 5.532E-05 | global batch size: 256 | lm loss: 3.685919E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.337 | TFLOPs: 25.43 | +7: iteration 123330/ 173500 | consumed samples: 31572480 | consumed tokens: 64660439040 | elapsed time per iteration (s): 0.15 | learning rate: 5.531E-05 | global batch size: 256 | lm loss: 3.688523E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.033 | TFLOPs: 26.11 | +7: iteration 123340/ 173500 | consumed samples: 31575040 | consumed tokens: 64665681920 | elapsed time per iteration (s): 0.16 | learning rate: 5.529E-05 | global batch size: 256 | lm loss: 3.675423E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.895 | TFLOPs: 25.17 | +7: iteration 123350/ 173500 | consumed samples: 31577600 | consumed tokens: 64670924800 | elapsed time per iteration (s): 0.16 | learning rate: 5.528E-05 | global batch size: 256 | lm loss: 3.675420E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.684 | TFLOPs: 25.20 | +7: iteration 123360/ 173500 | consumed samples: 31580160 | consumed tokens: 64676167680 | elapsed time per iteration (s): 0.15 | learning rate: 5.527E-05 | global batch size: 256 | lm loss: 3.670556E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.513 | TFLOPs: 26.20 | +7: iteration 123370/ 173500 | consumed samples: 31582720 | consumed tokens: 64681410560 | elapsed time per iteration (s): 0.16 | learning rate: 5.525E-05 | global batch size: 256 | lm loss: 3.685393E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.595 | TFLOPs: 25.87 | +7: iteration 123380/ 173500 | consumed samples: 31585280 | consumed tokens: 64686653440 | elapsed time per iteration (s): 0.15 | learning rate: 5.524E-05 | global batch size: 256 | lm loss: 3.674702E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.026 | TFLOPs: 25.91 | +7: iteration 123390/ 173500 | consumed samples: 31587840 | consumed tokens: 64691896320 | elapsed time per iteration (s): 0.16 | learning rate: 5.523E-05 | global batch size: 256 | lm loss: 3.676004E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.540 | TFLOPs: 25.73 | +7: iteration 123400/ 173500 | consumed samples: 31590400 | consumed tokens: 64697139200 | elapsed time per iteration (s): 0.15 | learning rate: 5.521E-05 | global batch size: 256 | lm loss: 3.665416E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.738 | TFLOPs: 25.97 | +7: iteration 123410/ 173500 | consumed samples: 31592960 | consumed tokens: 64702382080 | elapsed time per iteration (s): 0.16 | learning rate: 5.520E-05 | global batch size: 256 | lm loss: 3.672009E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.273 | TFLOPs: 25.69 | +7: iteration 123420/ 173500 | consumed samples: 31595520 | consumed tokens: 64707624960 | elapsed time per iteration (s): 0.16 | learning rate: 5.519E-05 | global batch size: 256 | lm loss: 3.672395E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.413 | TFLOPs: 25.79 | +7: iteration 123430/ 173500 | consumed samples: 31598080 | consumed tokens: 64712867840 | elapsed time per iteration (s): 0.15 | learning rate: 5.518E-05 | global batch size: 256 | lm loss: 3.689980E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.732 | TFLOPs: 26.06 | +7: iteration 123440/ 173500 | consumed samples: 31600640 | consumed tokens: 64718110720 | elapsed time per iteration (s): 0.16 | learning rate: 5.516E-05 | global batch size: 256 | lm loss: 3.676600E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.536 | TFLOPs: 25.32 | +7: iteration 123450/ 173500 | consumed samples: 31603200 | consumed tokens: 64723353600 | elapsed time per iteration (s): 0.15 | learning rate: 5.515E-05 | global batch size: 256 | lm loss: 3.676791E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.930 | TFLOPs: 26.19 | +7: iteration 123460/ 173500 | consumed samples: 31605760 | consumed tokens: 64728596480 | elapsed time per iteration (s): 0.15 | learning rate: 5.514E-05 | global batch size: 256 | lm loss: 3.688816E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.515 | TFLOPs: 26.15 | +7: iteration 123470/ 173500 | consumed samples: 31608320 | consumed tokens: 64733839360 | elapsed time per iteration (s): 0.15 | learning rate: 5.512E-05 | global batch size: 256 | lm loss: 3.684161E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.870 | TFLOPs: 26.17 | +7: iteration 123480/ 173500 | consumed samples: 31610880 | consumed tokens: 64739082240 | elapsed time per iteration (s): 0.16 | learning rate: 5.511E-05 | global batch size: 256 | lm loss: 3.674549E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.064 | TFLOPs: 25.39 | +7: iteration 123490/ 173500 | consumed samples: 31613440 | consumed tokens: 64744325120 | elapsed time per iteration (s): 0.15 | learning rate: 5.510E-05 | global batch size: 256 | lm loss: 3.678065E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.342 | TFLOPs: 26.20 | +7: iteration 123500/ 173500 | consumed samples: 31616000 | consumed tokens: 64749568000 | elapsed time per iteration (s): 0.16 | learning rate: 5.508E-05 | global batch size: 256 | lm loss: 3.675833E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.862 | TFLOPs: 25.89 | +7: iteration 123510/ 173500 | consumed samples: 31618560 | consumed tokens: 64754810880 | elapsed time per iteration (s): 0.15 | learning rate: 5.507E-05 | global batch size: 256 | lm loss: 3.683994E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.680 | TFLOPs: 26.20 | +7: iteration 123520/ 173500 | consumed samples: 31621120 | consumed tokens: 64760053760 | elapsed time per iteration (s): 0.15 | learning rate: 5.506E-05 | global batch size: 256 | lm loss: 3.669441E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.583 | TFLOPs: 26.17 | +7: iteration 123530/ 173500 | consumed samples: 31623680 | consumed tokens: 64765296640 | elapsed time per iteration (s): 0.15 | learning rate: 5.504E-05 | global batch size: 256 | lm loss: 3.683371E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.945 | TFLOPs: 25.94 | +7: iteration 123540/ 173500 | consumed samples: 31626240 | consumed tokens: 64770539520 | elapsed time per iteration (s): 0.15 | learning rate: 5.503E-05 | global batch size: 256 | lm loss: 3.671752E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.813 | TFLOPs: 26.12 | +7: iteration 123550/ 173500 | consumed samples: 31628800 | consumed tokens: 64775782400 | elapsed time per iteration (s): 0.15 | learning rate: 5.502E-05 | global batch size: 256 | lm loss: 3.674918E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.749 | TFLOPs: 26.12 | +7: iteration 123560/ 173500 | consumed samples: 31631360 | consumed tokens: 64781025280 | elapsed time per iteration (s): 0.16 | learning rate: 5.501E-05 | global batch size: 256 | lm loss: 3.681452E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.771 | TFLOPs: 25.72 | +7: iteration 123570/ 173500 | consumed samples: 31633920 | consumed tokens: 64786268160 | elapsed time per iteration (s): 0.15 | learning rate: 5.499E-05 | global batch size: 256 | lm loss: 3.665413E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.958 | TFLOPs: 26.16 | +7: iteration 123580/ 173500 | consumed samples: 31636480 | consumed tokens: 64791511040 | elapsed time per iteration (s): 0.15 | learning rate: 5.498E-05 | global batch size: 256 | lm loss: 3.664305E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.470 | TFLOPs: 26.17 | +7: iteration 123590/ 173500 | consumed samples: 31639040 | consumed tokens: 64796753920 | elapsed time per iteration (s): 0.16 | learning rate: 5.497E-05 | global batch size: 256 | lm loss: 3.683036E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.703 | TFLOPs: 25.86 | +7: iteration 123600/ 173500 | consumed samples: 31641600 | consumed tokens: 64801996800 | elapsed time per iteration (s): 0.16 | learning rate: 5.495E-05 | global batch size: 256 | lm loss: 3.675747E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.240 | TFLOPs: 25.03 | +7: iteration 123610/ 173500 | consumed samples: 31644160 | consumed tokens: 64807239680 | elapsed time per iteration (s): 0.15 | learning rate: 5.494E-05 | global batch size: 256 | lm loss: 3.686745E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.134 | TFLOPs: 26.10 | +7: iteration 123620/ 173500 | consumed samples: 31646720 | consumed tokens: 64812482560 | elapsed time per iteration (s): 0.15 | learning rate: 5.493E-05 | global batch size: 256 | lm loss: 3.674012E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.661 | TFLOPs: 25.96 | +7: iteration 123630/ 173500 | consumed samples: 31649280 | consumed tokens: 64817725440 | elapsed time per iteration (s): 0.16 | learning rate: 5.491E-05 | global batch size: 256 | lm loss: 3.668739E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.502 | TFLOPs: 25.37 | +7: iteration 123640/ 173500 | consumed samples: 31651840 | consumed tokens: 64822968320 | elapsed time per iteration (s): 0.15 | learning rate: 5.490E-05 | global batch size: 256 | lm loss: 3.678121E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.655 | TFLOPs: 26.15 | +7: iteration 123650/ 173500 | consumed samples: 31654400 | consumed tokens: 64828211200 | elapsed time per iteration (s): 0.16 | learning rate: 5.489E-05 | global batch size: 256 | lm loss: 3.679046E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.887 | TFLOPs: 24.87 | +7: iteration 123660/ 173500 | consumed samples: 31656960 | consumed tokens: 64833454080 | elapsed time per iteration (s): 0.15 | learning rate: 5.488E-05 | global batch size: 256 | lm loss: 3.673079E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.375 | TFLOPs: 26.15 | +7: iteration 123670/ 173500 | consumed samples: 31659520 | consumed tokens: 64838696960 | elapsed time per iteration (s): 0.15 | learning rate: 5.486E-05 | global batch size: 256 | lm loss: 3.680852E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.536 | TFLOPs: 26.14 | +7: iteration 123680/ 173500 | consumed samples: 31662080 | consumed tokens: 64843939840 | elapsed time per iteration (s): 0.15 | learning rate: 5.485E-05 | global batch size: 256 | lm loss: 3.681080E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.512 | TFLOPs: 25.93 | +7: iteration 123690/ 173500 | consumed samples: 31664640 | consumed tokens: 64849182720 | elapsed time per iteration (s): 0.15 | learning rate: 5.484E-05 | global batch size: 256 | lm loss: 3.689680E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.101 | TFLOPs: 26.16 | +7: iteration 123700/ 173500 | consumed samples: 31667200 | consumed tokens: 64854425600 | elapsed time per iteration (s): 0.16 | learning rate: 5.482E-05 | global batch size: 256 | lm loss: 3.673961E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.474 | TFLOPs: 25.68 | +7: iteration 123710/ 173500 | consumed samples: 31669760 | consumed tokens: 64859668480 | elapsed time per iteration (s): 0.15 | learning rate: 5.481E-05 | global batch size: 256 | lm loss: 3.677699E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.839 | TFLOPs: 26.19 | +7: iteration 123720/ 173500 | consumed samples: 31672320 | consumed tokens: 64864911360 | elapsed time per iteration (s): 0.16 | learning rate: 5.480E-05 | global batch size: 256 | lm loss: 3.668493E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.797 | TFLOPs: 25.79 | +7: iteration 123730/ 173500 | consumed samples: 31674880 | consumed tokens: 64870154240 | elapsed time per iteration (s): 0.15 | learning rate: 5.478E-05 | global batch size: 256 | lm loss: 3.685352E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.674 | TFLOPs: 26.17 | +7: iteration 123740/ 173500 | consumed samples: 31677440 | consumed tokens: 64875397120 | elapsed time per iteration (s): 0.15 | learning rate: 5.477E-05 | global batch size: 256 | lm loss: 3.669696E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.059 | TFLOPs: 26.14 | +7: iteration 123750/ 173500 | consumed samples: 31680000 | consumed tokens: 64880640000 | elapsed time per iteration (s): 0.16 | learning rate: 5.476E-05 | global batch size: 256 | lm loss: 3.678027E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.440 | TFLOPs: 25.68 | +7: iteration 123760/ 173500 | consumed samples: 31682560 | consumed tokens: 64885882880 | elapsed time per iteration (s): 0.15 | learning rate: 5.475E-05 | global batch size: 256 | lm loss: 3.683159E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.914 | TFLOPs: 26.16 | +7: iteration 123770/ 173500 | consumed samples: 31685120 | consumed tokens: 64891125760 | elapsed time per iteration (s): 0.15 | learning rate: 5.473E-05 | global batch size: 256 | lm loss: 3.666064E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.022 | TFLOPs: 26.19 | +7: iteration 123780/ 173500 | consumed samples: 31687680 | consumed tokens: 64896368640 | elapsed time per iteration (s): 0.15 | learning rate: 5.472E-05 | global batch size: 256 | lm loss: 3.673388E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.384 | TFLOPs: 26.02 | +7: iteration 123790/ 173500 | consumed samples: 31690240 | consumed tokens: 64901611520 | elapsed time per iteration (s): 0.15 | learning rate: 5.471E-05 | global batch size: 256 | lm loss: 3.669346E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.501 | TFLOPs: 26.17 | +7: iteration 123800/ 173500 | consumed samples: 31692800 | consumed tokens: 64906854400 | elapsed time per iteration (s): 0.15 | learning rate: 5.469E-05 | global batch size: 256 | lm loss: 3.674221E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.801 | TFLOPs: 26.17 | +7: iteration 123810/ 173500 | consumed samples: 31695360 | consumed tokens: 64912097280 | elapsed time per iteration (s): 0.15 | learning rate: 5.468E-05 | global batch size: 256 | lm loss: 3.669541E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.589 | TFLOPs: 26.12 | +7: iteration 123820/ 173500 | consumed samples: 31697920 | consumed tokens: 64917340160 | elapsed time per iteration (s): 0.15 | learning rate: 5.467E-05 | global batch size: 256 | lm loss: 3.690894E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.076 | TFLOPs: 26.11 | +7: iteration 123830/ 173500 | consumed samples: 31700480 | consumed tokens: 64922583040 | elapsed time per iteration (s): 0.15 | learning rate: 5.465E-05 | global batch size: 256 | lm loss: 3.680527E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.771 | TFLOPs: 26.12 | +7: iteration 123840/ 173500 | consumed samples: 31703040 | consumed tokens: 64927825920 | elapsed time per iteration (s): 0.15 | learning rate: 5.464E-05 | global batch size: 256 | lm loss: 3.674764E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.596 | TFLOPs: 26.06 | +7: iteration 123850/ 173500 | consumed samples: 31705600 | consumed tokens: 64933068800 | elapsed time per iteration (s): 0.16 | learning rate: 5.463E-05 | global batch size: 256 | lm loss: 3.679874E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.664 | TFLOPs: 25.49 | +7: iteration 123860/ 173500 | consumed samples: 31708160 | consumed tokens: 64938311680 | elapsed time per iteration (s): 0.16 | learning rate: 5.462E-05 | global batch size: 256 | lm loss: 3.675646E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.114 | TFLOPs: 25.50 | +7: iteration 123870/ 173500 | consumed samples: 31710720 | consumed tokens: 64943554560 | elapsed time per iteration (s): 0.15 | learning rate: 5.460E-05 | global batch size: 256 | lm loss: 3.683352E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.922 | TFLOPs: 26.13 | +7: iteration 123880/ 173500 | consumed samples: 31713280 | consumed tokens: 64948797440 | elapsed time per iteration (s): 0.15 | learning rate: 5.459E-05 | global batch size: 256 | lm loss: 3.668420E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.447 | TFLOPs: 26.15 | +7: iteration 123890/ 173500 | consumed samples: 31715840 | consumed tokens: 64954040320 | elapsed time per iteration (s): 0.16 | learning rate: 5.458E-05 | global batch size: 256 | lm loss: 3.667306E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.938 | TFLOPs: 24.92 | +7: iteration 123900/ 173500 | consumed samples: 31718400 | consumed tokens: 64959283200 | elapsed time per iteration (s): 0.15 | learning rate: 5.456E-05 | global batch size: 256 | lm loss: 3.669471E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.796 | TFLOPs: 26.16 | +7: iteration 123910/ 173500 | consumed samples: 31720960 | consumed tokens: 64964526080 | elapsed time per iteration (s): 0.18 | learning rate: 5.455E-05 | global batch size: 256 | lm loss: 3.680028E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.689 | TFLOPs: 22.56 | +7: iteration 123920/ 173500 | consumed samples: 31723520 | consumed tokens: 64969768960 | elapsed time per iteration (s): 0.16 | learning rate: 5.454E-05 | global batch size: 256 | lm loss: 3.673275E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.652 | TFLOPs: 25.84 | +7: iteration 123930/ 173500 | consumed samples: 31726080 | consumed tokens: 64975011840 | elapsed time per iteration (s): 0.15 | learning rate: 5.452E-05 | global batch size: 256 | lm loss: 3.676731E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.315 | TFLOPs: 26.16 | +7: iteration 123940/ 173500 | consumed samples: 31728640 | consumed tokens: 64980254720 | elapsed time per iteration (s): 0.16 | learning rate: 5.451E-05 | global batch size: 256 | lm loss: 3.671148E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.388 | TFLOPs: 25.90 | +7: iteration 123950/ 173500 | consumed samples: 31731200 | consumed tokens: 64985497600 | elapsed time per iteration (s): 0.15 | learning rate: 5.450E-05 | global batch size: 256 | lm loss: 3.679618E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.687 | TFLOPs: 26.12 | +7: iteration 123960/ 173500 | consumed samples: 31733760 | consumed tokens: 64990740480 | elapsed time per iteration (s): 0.15 | learning rate: 5.449E-05 | global batch size: 256 | lm loss: 3.683731E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.312 | TFLOPs: 25.91 | +7: iteration 123970/ 173500 | consumed samples: 31736320 | consumed tokens: 64995983360 | elapsed time per iteration (s): 0.16 | learning rate: 5.447E-05 | global batch size: 256 | lm loss: 3.678847E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.597 | TFLOPs: 24.88 | +7: iteration 123980/ 173500 | consumed samples: 31738880 | consumed tokens: 65001226240 | elapsed time per iteration (s): 0.16 | learning rate: 5.446E-05 | global batch size: 256 | lm loss: 3.674791E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.797 | TFLOPs: 25.36 | +7: iteration 123990/ 173500 | consumed samples: 31741440 | consumed tokens: 65006469120 | elapsed time per iteration (s): 0.15 | learning rate: 5.445E-05 | global batch size: 256 | lm loss: 3.684376E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.694 | TFLOPs: 26.17 | +0: [2023-03-17 05:38:21,199] [INFO] [logging.py:68:log_dist] [Rank 0] step=124000, skipped=0, lr=[5.443416434803536e-05, 5.443416434803536e-05, 5.443416434803536e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 124000/ 173500 | consumed samples: 31744000 | consumed tokens: 65011712000 | elapsed time per iteration (s): 0.15 | learning rate: 5.443E-05 | global batch size: 256 | lm loss: 3.676262E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.691 | TFLOPs: 26.14 | +0: steps: 124000 loss: 3.6582 iter time (s): 0.154 samples/sec: 1662.173 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 124000 | lm loss value: 3.808945E+00 | lm loss PPL: 4.510282E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 124000 to checkpoints_44m91b100m +0: [2023-03-17 05:38:21,282] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step124000 is begin to save! +0: [2023-03-17 05:38:21,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:38:21,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:38:21,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:38:21,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:38:21,360] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:38:21,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:38:21,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:38:21,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:38:21,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:38:21,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:38:21,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:38:21,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:38:21,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:38:21,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:38:21,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:38:21,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:38:21,410] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:38:21,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:38:21,417] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:38:21,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:38:21,419] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step124000/mp_rank_00_model_states.pt +0: [2023-03-17 05:38:21,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:38:21,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:38:21,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:38:21,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +4: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: [2023-03-17 05:38:21,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:38:21,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +5: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +2: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +6: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +7: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +3: [2023-03-17 05:38:21,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:38:21,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:38:21,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step124000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:38:21,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +1: [2023-03-17 05:38:21,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step124000 is ready now! +0: successfully saved checkpoint at iteration 124000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.69 +7: iteration 124010/ 173500 | consumed samples: 31746560 | consumed tokens: 65016954880 | elapsed time per iteration (s): 0.18 | learning rate: 5.442E-05 | global batch size: 256 | lm loss: 3.671696E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1426.832 | TFLOPs: 22.38 | +7: iteration 124020/ 173500 | consumed samples: 31749120 | consumed tokens: 65022197760 | elapsed time per iteration (s): 0.15 | learning rate: 5.441E-05 | global batch size: 256 | lm loss: 3.671151E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.171 | TFLOPs: 26.19 | +7: iteration 124030/ 173500 | consumed samples: 31751680 | consumed tokens: 65027440640 | elapsed time per iteration (s): 0.16 | learning rate: 5.440E-05 | global batch size: 256 | lm loss: 3.664861E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.264 | TFLOPs: 25.50 | +7: iteration 124040/ 173500 | consumed samples: 31754240 | consumed tokens: 65032683520 | elapsed time per iteration (s): 0.16 | learning rate: 5.438E-05 | global batch size: 256 | lm loss: 3.672824E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.578 | TFLOPs: 25.56 | +7: iteration 124050/ 173500 | consumed samples: 31756800 | consumed tokens: 65037926400 | elapsed time per iteration (s): 0.16 | learning rate: 5.437E-05 | global batch size: 256 | lm loss: 3.665325E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.682 | TFLOPs: 25.04 | +7: iteration 124060/ 173500 | consumed samples: 31759360 | consumed tokens: 65043169280 | elapsed time per iteration (s): 0.16 | learning rate: 5.436E-05 | global batch size: 256 | lm loss: 3.669315E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.707 | TFLOPs: 25.06 | +7: iteration 124070/ 173500 | consumed samples: 31761920 | consumed tokens: 65048412160 | elapsed time per iteration (s): 0.16 | learning rate: 5.434E-05 | global batch size: 256 | lm loss: 3.667485E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.721 | TFLOPs: 25.79 | +7: iteration 124080/ 173500 | consumed samples: 31764480 | consumed tokens: 65053655040 | elapsed time per iteration (s): 0.16 | learning rate: 5.433E-05 | global batch size: 256 | lm loss: 3.690985E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.397 | TFLOPs: 25.46 | +7: iteration 124090/ 173500 | consumed samples: 31767040 | consumed tokens: 65058897920 | elapsed time per iteration (s): 0.15 | learning rate: 5.432E-05 | global batch size: 256 | lm loss: 3.682949E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.238 | TFLOPs: 26.13 | +7: iteration 124100/ 173500 | consumed samples: 31769600 | consumed tokens: 65064140800 | elapsed time per iteration (s): 0.16 | learning rate: 5.430E-05 | global batch size: 256 | lm loss: 3.674776E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.839 | TFLOPs: 25.32 | +7: iteration 124110/ 173500 | consumed samples: 31772160 | consumed tokens: 65069383680 | elapsed time per iteration (s): 0.17 | learning rate: 5.429E-05 | global batch size: 256 | lm loss: 3.678911E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1503.774 | TFLOPs: 23.58 | +7: iteration 124120/ 173500 | consumed samples: 31774720 | consumed tokens: 65074626560 | elapsed time per iteration (s): 0.15 | learning rate: 5.428E-05 | global batch size: 256 | lm loss: 3.660020E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.591 | TFLOPs: 26.18 | +7: iteration 124130/ 173500 | consumed samples: 31777280 | consumed tokens: 65079869440 | elapsed time per iteration (s): 0.15 | learning rate: 5.427E-05 | global batch size: 256 | lm loss: 3.671822E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.160 | TFLOPs: 26.16 | +7: iteration 124140/ 173500 | consumed samples: 31779840 | consumed tokens: 65085112320 | elapsed time per iteration (s): 0.15 | learning rate: 5.425E-05 | global batch size: 256 | lm loss: 3.672422E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.113 | TFLOPs: 26.18 | +7: iteration 124150/ 173500 | consumed samples: 31782400 | consumed tokens: 65090355200 | elapsed time per iteration (s): 0.16 | learning rate: 5.424E-05 | global batch size: 256 | lm loss: 3.666877E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.077 | TFLOPs: 25.55 | +7: iteration 124160/ 173500 | consumed samples: 31784960 | consumed tokens: 65095598080 | elapsed time per iteration (s): 0.16 | learning rate: 5.423E-05 | global batch size: 256 | lm loss: 3.682456E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.200 | TFLOPs: 25.58 | +7: iteration 124170/ 173500 | consumed samples: 31787520 | consumed tokens: 65100840960 | elapsed time per iteration (s): 0.16 | learning rate: 5.421E-05 | global batch size: 256 | lm loss: 3.666391E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.534 | TFLOPs: 24.88 | +7: iteration 124180/ 173500 | consumed samples: 31790080 | consumed tokens: 65106083840 | elapsed time per iteration (s): 0.16 | learning rate: 5.420E-05 | global batch size: 256 | lm loss: 3.670244E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.775 | TFLOPs: 25.65 | +7: iteration 124190/ 173500 | consumed samples: 31792640 | consumed tokens: 65111326720 | elapsed time per iteration (s): 0.16 | learning rate: 5.419E-05 | global batch size: 256 | lm loss: 3.672171E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.538 | TFLOPs: 25.60 | +7: iteration 124200/ 173500 | consumed samples: 31795200 | consumed tokens: 65116569600 | elapsed time per iteration (s): 0.16 | learning rate: 5.418E-05 | global batch size: 256 | lm loss: 3.692267E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.798 | TFLOPs: 25.70 | +7: iteration 124210/ 173500 | consumed samples: 31797760 | consumed tokens: 65121812480 | elapsed time per iteration (s): 0.16 | learning rate: 5.416E-05 | global batch size: 256 | lm loss: 3.674127E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.285 | TFLOPs: 24.83 | +7: iteration 124220/ 173500 | consumed samples: 31800320 | consumed tokens: 65127055360 | elapsed time per iteration (s): 0.17 | learning rate: 5.415E-05 | global batch size: 256 | lm loss: 3.675285E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.916 | TFLOPs: 23.35 | +7: iteration 124230/ 173500 | consumed samples: 31802880 | consumed tokens: 65132298240 | elapsed time per iteration (s): 0.16 | learning rate: 5.414E-05 | global batch size: 256 | lm loss: 3.674343E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.154 | TFLOPs: 24.94 | +7: iteration 124240/ 173500 | consumed samples: 31805440 | consumed tokens: 65137541120 | elapsed time per iteration (s): 0.18 | learning rate: 5.412E-05 | global batch size: 256 | lm loss: 3.679160E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.926 | TFLOPs: 21.95 | +7: iteration 124250/ 173500 | consumed samples: 31808000 | consumed tokens: 65142784000 | elapsed time per iteration (s): 0.17 | learning rate: 5.411E-05 | global batch size: 256 | lm loss: 3.679086E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.654 | TFLOPs: 23.86 | +7: iteration 124260/ 173500 | consumed samples: 31810560 | consumed tokens: 65148026880 | elapsed time per iteration (s): 0.17 | learning rate: 5.410E-05 | global batch size: 256 | lm loss: 3.675297E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.623 | TFLOPs: 23.71 | +7: iteration 124270/ 173500 | consumed samples: 31813120 | consumed tokens: 65153269760 | elapsed time per iteration (s): 0.16 | learning rate: 5.409E-05 | global batch size: 256 | lm loss: 3.660895E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.199 | TFLOPs: 24.80 | +7: iteration 124280/ 173500 | consumed samples: 31815680 | consumed tokens: 65158512640 | elapsed time per iteration (s): 0.17 | learning rate: 5.407E-05 | global batch size: 256 | lm loss: 3.697699E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.356 | TFLOPs: 23.81 | +7: iteration 124290/ 173500 | consumed samples: 31818240 | consumed tokens: 65163755520 | elapsed time per iteration (s): 0.17 | learning rate: 5.406E-05 | global batch size: 256 | lm loss: 3.684361E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.258 | TFLOPs: 23.32 | +7: iteration 124300/ 173500 | consumed samples: 31820800 | consumed tokens: 65168998400 | elapsed time per iteration (s): 0.17 | learning rate: 5.405E-05 | global batch size: 256 | lm loss: 3.678456E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.147 | TFLOPs: 23.78 | +7: iteration 124310/ 173500 | consumed samples: 31823360 | consumed tokens: 65174241280 | elapsed time per iteration (s): 0.16 | learning rate: 5.403E-05 | global batch size: 256 | lm loss: 3.672975E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.348 | TFLOPs: 25.33 | +7: iteration 124320/ 173500 | consumed samples: 31825920 | consumed tokens: 65179484160 | elapsed time per iteration (s): 0.18 | learning rate: 5.402E-05 | global batch size: 256 | lm loss: 3.669366E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.901 | TFLOPs: 21.78 | +7: iteration 124330/ 173500 | consumed samples: 31828480 | consumed tokens: 65184727040 | elapsed time per iteration (s): 0.16 | learning rate: 5.401E-05 | global batch size: 256 | lm loss: 3.666470E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.590 | TFLOPs: 24.79 | +7: iteration 124340/ 173500 | consumed samples: 31831040 | consumed tokens: 65189969920 | elapsed time per iteration (s): 0.17 | learning rate: 5.399E-05 | global batch size: 256 | lm loss: 3.678594E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.953 | TFLOPs: 23.85 | +7: iteration 124350/ 173500 | consumed samples: 31833600 | consumed tokens: 65195212800 | elapsed time per iteration (s): 0.17 | learning rate: 5.398E-05 | global batch size: 256 | lm loss: 3.677339E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.032 | TFLOPs: 24.25 | +7: iteration 124360/ 173500 | consumed samples: 31836160 | consumed tokens: 65200455680 | elapsed time per iteration (s): 0.16 | learning rate: 5.397E-05 | global batch size: 256 | lm loss: 3.677021E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.667 | TFLOPs: 24.58 | +7: iteration 124370/ 173500 | consumed samples: 31838720 | consumed tokens: 65205698560 | elapsed time per iteration (s): 0.16 | learning rate: 5.396E-05 | global batch size: 256 | lm loss: 3.675589E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.671 | TFLOPs: 25.24 | +7: iteration 124380/ 173500 | consumed samples: 31841280 | consumed tokens: 65210941440 | elapsed time per iteration (s): 0.17 | learning rate: 5.394E-05 | global batch size: 256 | lm loss: 3.671604E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.319 | TFLOPs: 23.11 | +7: iteration 124390/ 173500 | consumed samples: 31843840 | consumed tokens: 65216184320 | elapsed time per iteration (s): 0.16 | learning rate: 5.393E-05 | global batch size: 256 | lm loss: 3.670485E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.021 | TFLOPs: 24.54 | +7: iteration 124400/ 173500 | consumed samples: 31846400 | consumed tokens: 65221427200 | elapsed time per iteration (s): 0.16 | learning rate: 5.392E-05 | global batch size: 256 | lm loss: 3.674752E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.138 | TFLOPs: 24.76 | +7: iteration 124410/ 173500 | consumed samples: 31848960 | consumed tokens: 65226670080 | elapsed time per iteration (s): 0.17 | learning rate: 5.390E-05 | global batch size: 256 | lm loss: 3.678546E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.726 | TFLOPs: 24.01 | +7: iteration 124420/ 173500 | consumed samples: 31851520 | consumed tokens: 65231912960 | elapsed time per iteration (s): 0.16 | learning rate: 5.389E-05 | global batch size: 256 | lm loss: 3.681246E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.889 | TFLOPs: 24.95 | +7: iteration 124430/ 173500 | consumed samples: 31854080 | consumed tokens: 65237155840 | elapsed time per iteration (s): 0.17 | learning rate: 5.388E-05 | global batch size: 256 | lm loss: 3.678688E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.248 | TFLOPs: 23.81 | +7: iteration 124440/ 173500 | consumed samples: 31856640 | consumed tokens: 65242398720 | elapsed time per iteration (s): 0.16 | learning rate: 5.387E-05 | global batch size: 256 | lm loss: 3.687431E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.868 | TFLOPs: 24.68 | +7: iteration 124450/ 173500 | consumed samples: 31859200 | consumed tokens: 65247641600 | elapsed time per iteration (s): 0.16 | learning rate: 5.385E-05 | global batch size: 256 | lm loss: 3.658892E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.928 | TFLOPs: 25.61 | +7: iteration 124460/ 173500 | consumed samples: 31861760 | consumed tokens: 65252884480 | elapsed time per iteration (s): 0.17 | learning rate: 5.384E-05 | global batch size: 256 | lm loss: 3.668939E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.276 | TFLOPs: 24.00 | +7: iteration 124470/ 173500 | consumed samples: 31864320 | consumed tokens: 65258127360 | elapsed time per iteration (s): 0.16 | learning rate: 5.383E-05 | global batch size: 256 | lm loss: 3.675648E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.360 | TFLOPs: 24.85 | +7: iteration 124480/ 173500 | consumed samples: 31866880 | consumed tokens: 65263370240 | elapsed time per iteration (s): 0.16 | learning rate: 5.381E-05 | global batch size: 256 | lm loss: 3.688026E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.950 | TFLOPs: 25.26 | +7: iteration 124490/ 173500 | consumed samples: 31869440 | consumed tokens: 65268613120 | elapsed time per iteration (s): 0.16 | learning rate: 5.380E-05 | global batch size: 256 | lm loss: 3.667908E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.293 | TFLOPs: 25.87 | +7: iteration 124500/ 173500 | consumed samples: 31872000 | consumed tokens: 65273856000 | elapsed time per iteration (s): 0.17 | learning rate: 5.379E-05 | global batch size: 256 | lm loss: 3.669708E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.861 | TFLOPs: 23.99 | +7: iteration 124510/ 173500 | consumed samples: 31874560 | consumed tokens: 65279098880 | elapsed time per iteration (s): 0.16 | learning rate: 5.378E-05 | global batch size: 256 | lm loss: 3.680981E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.181 | TFLOPs: 24.84 | +7: iteration 124520/ 173500 | consumed samples: 31877120 | consumed tokens: 65284341760 | elapsed time per iteration (s): 0.16 | learning rate: 5.376E-05 | global batch size: 256 | lm loss: 3.670961E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.408 | TFLOPs: 25.13 | +7: iteration 124530/ 173500 | consumed samples: 31879680 | consumed tokens: 65289584640 | elapsed time per iteration (s): 0.16 | learning rate: 5.375E-05 | global batch size: 256 | lm loss: 3.677345E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.852 | TFLOPs: 24.53 | +7: iteration 124540/ 173500 | consumed samples: 31882240 | consumed tokens: 65294827520 | elapsed time per iteration (s): 0.16 | learning rate: 5.374E-05 | global batch size: 256 | lm loss: 3.691843E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.929 | TFLOPs: 24.76 | +7: iteration 124550/ 173500 | consumed samples: 31884800 | consumed tokens: 65300070400 | elapsed time per iteration (s): 0.17 | learning rate: 5.372E-05 | global batch size: 256 | lm loss: 3.686126E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.490 | TFLOPs: 23.45 | +7: iteration 124560/ 173500 | consumed samples: 31887360 | consumed tokens: 65305313280 | elapsed time per iteration (s): 0.16 | learning rate: 5.371E-05 | global batch size: 256 | lm loss: 3.676059E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.928 | TFLOPs: 24.49 | +7: iteration 124570/ 173500 | consumed samples: 31889920 | consumed tokens: 65310556160 | elapsed time per iteration (s): 0.17 | learning rate: 5.370E-05 | global batch size: 256 | lm loss: 3.680016E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.439 | TFLOPs: 24.22 | +7: iteration 124580/ 173500 | consumed samples: 31892480 | consumed tokens: 65315799040 | elapsed time per iteration (s): 0.17 | learning rate: 5.369E-05 | global batch size: 256 | lm loss: 3.669750E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.638 | TFLOPs: 24.27 | +7: iteration 124590/ 173500 | consumed samples: 31895040 | consumed tokens: 65321041920 | elapsed time per iteration (s): 0.16 | learning rate: 5.367E-05 | global batch size: 256 | lm loss: 3.669402E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.473 | TFLOPs: 24.80 | +7: iteration 124600/ 173500 | consumed samples: 31897600 | consumed tokens: 65326284800 | elapsed time per iteration (s): 0.16 | learning rate: 5.366E-05 | global batch size: 256 | lm loss: 3.662066E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.658 | TFLOPs: 25.15 | +7: iteration 124610/ 173500 | consumed samples: 31900160 | consumed tokens: 65331527680 | elapsed time per iteration (s): 0.16 | learning rate: 5.365E-05 | global batch size: 256 | lm loss: 3.671215E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.166 | TFLOPs: 24.42 | +7: iteration 124620/ 173500 | consumed samples: 31902720 | consumed tokens: 65336770560 | elapsed time per iteration (s): 0.17 | learning rate: 5.363E-05 | global batch size: 256 | lm loss: 3.669497E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1476.782 | TFLOPs: 23.16 | +7: iteration 124630/ 173500 | consumed samples: 31905280 | consumed tokens: 65342013440 | elapsed time per iteration (s): 0.17 | learning rate: 5.362E-05 | global batch size: 256 | lm loss: 3.668439E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.731 | TFLOPs: 24.19 | +7: iteration 124640/ 173500 | consumed samples: 31907840 | consumed tokens: 65347256320 | elapsed time per iteration (s): 0.18 | learning rate: 5.361E-05 | global batch size: 256 | lm loss: 3.689380E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1455.643 | TFLOPs: 22.83 | +7: iteration 124650/ 173500 | consumed samples: 31910400 | consumed tokens: 65352499200 | elapsed time per iteration (s): 0.16 | learning rate: 5.360E-05 | global batch size: 256 | lm loss: 3.679223E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.950 | TFLOPs: 24.43 | +7: iteration 124660/ 173500 | consumed samples: 31912960 | consumed tokens: 65357742080 | elapsed time per iteration (s): 0.17 | learning rate: 5.358E-05 | global batch size: 256 | lm loss: 3.676812E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.014 | TFLOPs: 23.88 | +7: iteration 124670/ 173500 | consumed samples: 31915520 | consumed tokens: 65362984960 | elapsed time per iteration (s): 0.16 | learning rate: 5.357E-05 | global batch size: 256 | lm loss: 3.685405E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.288 | TFLOPs: 25.07 | +7: iteration 124680/ 173500 | consumed samples: 31918080 | consumed tokens: 65368227840 | elapsed time per iteration (s): 0.19 | learning rate: 5.356E-05 | global batch size: 256 | lm loss: 3.673539E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1379.865 | TFLOPs: 21.64 | +7: iteration 124690/ 173500 | consumed samples: 31920640 | consumed tokens: 65373470720 | elapsed time per iteration (s): 0.17 | learning rate: 5.355E-05 | global batch size: 256 | lm loss: 3.680375E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.568 | TFLOPs: 24.19 | +7: iteration 124700/ 173500 | consumed samples: 31923200 | consumed tokens: 65378713600 | elapsed time per iteration (s): 0.16 | learning rate: 5.353E-05 | global batch size: 256 | lm loss: 3.655774E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.482 | TFLOPs: 24.36 | +7: iteration 124710/ 173500 | consumed samples: 31925760 | consumed tokens: 65383956480 | elapsed time per iteration (s): 0.16 | learning rate: 5.352E-05 | global batch size: 256 | lm loss: 3.677864E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.649 | TFLOPs: 25.01 | +7: iteration 124720/ 173500 | consumed samples: 31928320 | consumed tokens: 65389199360 | elapsed time per iteration (s): 0.16 | learning rate: 5.351E-05 | global batch size: 256 | lm loss: 3.668536E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.387 | TFLOPs: 24.88 | +7: iteration 124730/ 173500 | consumed samples: 31930880 | consumed tokens: 65394442240 | elapsed time per iteration (s): 0.16 | learning rate: 5.349E-05 | global batch size: 256 | lm loss: 3.678904E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.519 | TFLOPs: 24.44 | +7: iteration 124740/ 173500 | consumed samples: 31933440 | consumed tokens: 65399685120 | elapsed time per iteration (s): 0.16 | learning rate: 5.348E-05 | global batch size: 256 | lm loss: 3.669599E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.285 | TFLOPs: 24.59 | +7: iteration 124750/ 173500 | consumed samples: 31936000 | consumed tokens: 65404928000 | elapsed time per iteration (s): 0.17 | learning rate: 5.347E-05 | global batch size: 256 | lm loss: 3.679841E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.768 | TFLOPs: 24.15 | +7: iteration 124760/ 173500 | consumed samples: 31938560 | consumed tokens: 65410170880 | elapsed time per iteration (s): 0.16 | learning rate: 5.346E-05 | global batch size: 256 | lm loss: 3.685160E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.148 | TFLOPs: 25.28 | +7: iteration 124770/ 173500 | consumed samples: 31941120 | consumed tokens: 65415413760 | elapsed time per iteration (s): 0.16 | learning rate: 5.344E-05 | global batch size: 256 | lm loss: 3.675446E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.676 | TFLOPs: 24.91 | +7: iteration 124780/ 173500 | consumed samples: 31943680 | consumed tokens: 65420656640 | elapsed time per iteration (s): 0.16 | learning rate: 5.343E-05 | global batch size: 256 | lm loss: 3.682819E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.929 | TFLOPs: 24.79 | +7: iteration 124790/ 173500 | consumed samples: 31946240 | consumed tokens: 65425899520 | elapsed time per iteration (s): 0.17 | learning rate: 5.342E-05 | global batch size: 256 | lm loss: 3.672624E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.104 | TFLOPs: 23.43 | +7: iteration 124800/ 173500 | consumed samples: 31948800 | consumed tokens: 65431142400 | elapsed time per iteration (s): 0.17 | learning rate: 5.340E-05 | global batch size: 256 | lm loss: 3.680542E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.739 | TFLOPs: 24.24 | +7: iteration 124810/ 173500 | consumed samples: 31951360 | consumed tokens: 65436385280 | elapsed time per iteration (s): 0.18 | learning rate: 5.339E-05 | global batch size: 256 | lm loss: 3.686515E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.829 | TFLOPs: 22.20 | +7: iteration 124820/ 173500 | consumed samples: 31953920 | consumed tokens: 65441628160 | elapsed time per iteration (s): 0.17 | learning rate: 5.338E-05 | global batch size: 256 | lm loss: 3.657203E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.809 | TFLOPs: 24.10 | +7: iteration 124830/ 173500 | consumed samples: 31956480 | consumed tokens: 65446871040 | elapsed time per iteration (s): 0.16 | learning rate: 5.337E-05 | global batch size: 256 | lm loss: 3.673090E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.092 | TFLOPs: 24.67 | +7: iteration 124840/ 173500 | consumed samples: 31959040 | consumed tokens: 65452113920 | elapsed time per iteration (s): 0.17 | learning rate: 5.335E-05 | global batch size: 256 | lm loss: 3.669964E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.763 | TFLOPs: 23.35 | +7: iteration 124850/ 173500 | consumed samples: 31961600 | consumed tokens: 65457356800 | elapsed time per iteration (s): 0.16 | learning rate: 5.334E-05 | global batch size: 256 | lm loss: 3.668054E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.579 | TFLOPs: 24.98 | +7: iteration 124860/ 173500 | consumed samples: 31964160 | consumed tokens: 65462599680 | elapsed time per iteration (s): 0.16 | learning rate: 5.333E-05 | global batch size: 256 | lm loss: 3.672085E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.732 | TFLOPs: 25.15 | +7: iteration 124870/ 173500 | consumed samples: 31966720 | consumed tokens: 65467842560 | elapsed time per iteration (s): 0.16 | learning rate: 5.331E-05 | global batch size: 256 | lm loss: 3.679630E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.555 | TFLOPs: 24.61 | +7: iteration 124880/ 173500 | consumed samples: 31969280 | consumed tokens: 65473085440 | elapsed time per iteration (s): 0.16 | learning rate: 5.330E-05 | global batch size: 256 | lm loss: 3.671517E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.352 | TFLOPs: 24.56 | +7: iteration 124890/ 173500 | consumed samples: 31971840 | consumed tokens: 65478328320 | elapsed time per iteration (s): 0.17 | learning rate: 5.329E-05 | global batch size: 256 | lm loss: 3.687354E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.960 | TFLOPs: 23.60 | +7: iteration 124900/ 173500 | consumed samples: 31974400 | consumed tokens: 65483571200 | elapsed time per iteration (s): 0.17 | learning rate: 5.328E-05 | global batch size: 256 | lm loss: 3.679258E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.807 | TFLOPs: 24.20 | +7: iteration 124910/ 173500 | consumed samples: 31976960 | consumed tokens: 65488814080 | elapsed time per iteration (s): 0.16 | learning rate: 5.326E-05 | global batch size: 256 | lm loss: 3.691911E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.766 | TFLOPs: 24.66 | +7: iteration 124920/ 173500 | consumed samples: 31979520 | consumed tokens: 65494056960 | elapsed time per iteration (s): 0.16 | learning rate: 5.325E-05 | global batch size: 256 | lm loss: 3.679703E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.505 | TFLOPs: 24.69 | +7: iteration 124930/ 173500 | consumed samples: 31982080 | consumed tokens: 65499299840 | elapsed time per iteration (s): 0.16 | learning rate: 5.324E-05 | global batch size: 256 | lm loss: 3.659389E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.628 | TFLOPs: 24.58 | +7: iteration 124940/ 173500 | consumed samples: 31984640 | consumed tokens: 65504542720 | elapsed time per iteration (s): 0.16 | learning rate: 5.323E-05 | global batch size: 256 | lm loss: 3.673812E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.243 | TFLOPs: 24.44 | +7: iteration 124950/ 173500 | consumed samples: 31987200 | consumed tokens: 65509785600 | elapsed time per iteration (s): 0.17 | learning rate: 5.321E-05 | global batch size: 256 | lm loss: 3.670286E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.577 | TFLOPs: 24.18 | +7: iteration 124960/ 173500 | consumed samples: 31989760 | consumed tokens: 65515028480 | elapsed time per iteration (s): 0.17 | learning rate: 5.320E-05 | global batch size: 256 | lm loss: 3.681571E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.537 | TFLOPs: 23.59 | +7: iteration 124970/ 173500 | consumed samples: 31992320 | consumed tokens: 65520271360 | elapsed time per iteration (s): 0.16 | learning rate: 5.319E-05 | global batch size: 256 | lm loss: 3.681517E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.433 | TFLOPs: 24.89 | +7: iteration 124980/ 173500 | consumed samples: 31994880 | consumed tokens: 65525514240 | elapsed time per iteration (s): 0.16 | learning rate: 5.317E-05 | global batch size: 256 | lm loss: 3.672398E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.889 | TFLOPs: 24.46 | +7: iteration 124990/ 173500 | consumed samples: 31997440 | consumed tokens: 65530757120 | elapsed time per iteration (s): 0.17 | learning rate: 5.316E-05 | global batch size: 256 | lm loss: 3.672021E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1490.103 | TFLOPs: 23.37 | +7: iteration 125000/ 173500 | consumed samples: 32000000 | consumed tokens: 65536000000 | elapsed time per iteration (s): 0.16 | learning rate: 5.315E-05 | global batch size: 256 | lm loss: 3.680455E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.538 | TFLOPs: 24.90 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 125000 | lm loss value: 3.856799E+00 | lm loss PPL: 4.731365E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 125000 to checkpoints_44m91b100m +0: [2023-03-17 05:41:05,385] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step125000 is begin to save! +0: [2023-03-17 05:41:05,389] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:41:05,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:41:05,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:41:05,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:41:05,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:41:05,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:41:05,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:41:05,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:41:05,477] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:41:05,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:41:05,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:41:05,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:41:05,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:41:05,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:41:05,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:41:05,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:41:05,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:41:05,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:41:05,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:41:05,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:41:05,518] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step125000/mp_rank_00_model_states.pt +0: [2023-03-17 05:41:05,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:41:05,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:41:05,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:41:05,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +6: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +7: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 05:41:05,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +1: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +4: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 05:41:05,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +3: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:41:05,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +2: [2023-03-17 05:41:05,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:41:05,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step125000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:41:05,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step125000 is ready now! +0: successfully saved checkpoint at iteration 125000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.41 +7: iteration 125010/ 173500 | consumed samples: 32002560 | consumed tokens: 65541242880 | elapsed time per iteration (s): 0.18 | learning rate: 5.314E-05 | global batch size: 256 | lm loss: 3.685146E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.271 | TFLOPs: 22.07 | +7: iteration 125020/ 173500 | consumed samples: 32005120 | consumed tokens: 65546485760 | elapsed time per iteration (s): 0.16 | learning rate: 5.312E-05 | global batch size: 256 | lm loss: 3.661909E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.826 | TFLOPs: 24.68 | +7: iteration 125030/ 173500 | consumed samples: 32007680 | consumed tokens: 65551728640 | elapsed time per iteration (s): 0.17 | learning rate: 5.311E-05 | global batch size: 256 | lm loss: 3.678492E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.612 | TFLOPs: 24.02 | +7: iteration 125040/ 173500 | consumed samples: 32010240 | consumed tokens: 65556971520 | elapsed time per iteration (s): 0.17 | learning rate: 5.310E-05 | global batch size: 256 | lm loss: 3.670868E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.449 | TFLOPs: 24.10 | +7: iteration 125050/ 173500 | consumed samples: 32012800 | consumed tokens: 65562214400 | elapsed time per iteration (s): 0.16 | learning rate: 5.308E-05 | global batch size: 256 | lm loss: 3.662378E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.900 | TFLOPs: 24.81 | +7: iteration 125060/ 173500 | consumed samples: 32015360 | consumed tokens: 65567457280 | elapsed time per iteration (s): 0.17 | learning rate: 5.307E-05 | global batch size: 256 | lm loss: 3.674646E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.376 | TFLOPs: 24.27 | +7: iteration 125070/ 173500 | consumed samples: 32017920 | consumed tokens: 65572700160 | elapsed time per iteration (s): 0.16 | learning rate: 5.306E-05 | global batch size: 256 | lm loss: 3.678328E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.351 | TFLOPs: 25.19 | +7: iteration 125080/ 173500 | consumed samples: 32020480 | consumed tokens: 65577943040 | elapsed time per iteration (s): 0.16 | learning rate: 5.305E-05 | global batch size: 256 | lm loss: 3.678722E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.438 | TFLOPs: 24.52 | +7: iteration 125090/ 173500 | consumed samples: 32023040 | consumed tokens: 65583185920 | elapsed time per iteration (s): 0.17 | learning rate: 5.303E-05 | global batch size: 256 | lm loss: 3.680484E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.767 | TFLOPs: 24.27 | +7: iteration 125100/ 173500 | consumed samples: 32025600 | consumed tokens: 65588428800 | elapsed time per iteration (s): 0.16 | learning rate: 5.302E-05 | global batch size: 256 | lm loss: 3.666687E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.066 | TFLOPs: 24.56 | +7: iteration 125110/ 173500 | consumed samples: 32028160 | consumed tokens: 65593671680 | elapsed time per iteration (s): 0.16 | learning rate: 5.301E-05 | global batch size: 256 | lm loss: 3.680047E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.150 | TFLOPs: 25.09 | +7: iteration 125120/ 173500 | consumed samples: 32030720 | consumed tokens: 65598914560 | elapsed time per iteration (s): 0.16 | learning rate: 5.300E-05 | global batch size: 256 | lm loss: 3.672989E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.509 | TFLOPs: 24.35 | +7: iteration 125130/ 173500 | consumed samples: 32033280 | consumed tokens: 65604157440 | elapsed time per iteration (s): 0.17 | learning rate: 5.298E-05 | global batch size: 256 | lm loss: 3.681283E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.672 | TFLOPs: 24.32 | +7: iteration 125140/ 173500 | consumed samples: 32035840 | consumed tokens: 65609400320 | elapsed time per iteration (s): 0.16 | learning rate: 5.297E-05 | global batch size: 256 | lm loss: 3.678439E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.160 | TFLOPs: 24.89 | +7: iteration 125150/ 173500 | consumed samples: 32038400 | consumed tokens: 65614643200 | elapsed time per iteration (s): 0.17 | learning rate: 5.296E-05 | global batch size: 256 | lm loss: 3.676117E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.558 | TFLOPs: 24.03 | +7: iteration 125160/ 173500 | consumed samples: 32040960 | consumed tokens: 65619886080 | elapsed time per iteration (s): 0.16 | learning rate: 5.294E-05 | global batch size: 256 | lm loss: 3.670067E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.698 | TFLOPs: 24.98 | +7: iteration 125170/ 173500 | consumed samples: 32043520 | consumed tokens: 65625128960 | elapsed time per iteration (s): 0.17 | learning rate: 5.293E-05 | global batch size: 256 | lm loss: 3.678736E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.141 | TFLOPs: 23.87 | +7: iteration 125180/ 173500 | consumed samples: 32046080 | consumed tokens: 65630371840 | elapsed time per iteration (s): 0.16 | learning rate: 5.292E-05 | global batch size: 256 | lm loss: 3.675211E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.118 | TFLOPs: 25.49 | +7: iteration 125190/ 173500 | consumed samples: 32048640 | consumed tokens: 65635614720 | elapsed time per iteration (s): 0.16 | learning rate: 5.291E-05 | global batch size: 256 | lm loss: 3.680093E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.555 | TFLOPs: 24.82 | +7: iteration 125200/ 173500 | consumed samples: 32051200 | consumed tokens: 65640857600 | elapsed time per iteration (s): 0.17 | learning rate: 5.289E-05 | global batch size: 256 | lm loss: 3.681449E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.137 | TFLOPs: 24.03 | +7: iteration 125210/ 173500 | consumed samples: 32053760 | consumed tokens: 65646100480 | elapsed time per iteration (s): 0.16 | learning rate: 5.288E-05 | global batch size: 256 | lm loss: 3.664594E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.482 | TFLOPs: 24.94 | +7: iteration 125220/ 173500 | consumed samples: 32056320 | consumed tokens: 65651343360 | elapsed time per iteration (s): 0.16 | learning rate: 5.287E-05 | global batch size: 256 | lm loss: 3.660365E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.945 | TFLOPs: 25.25 | +7: iteration 125230/ 173500 | consumed samples: 32058880 | consumed tokens: 65656586240 | elapsed time per iteration (s): 0.17 | learning rate: 5.286E-05 | global batch size: 256 | lm loss: 3.680339E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.183 | TFLOPs: 24.26 | +7: iteration 125240/ 173500 | consumed samples: 32061440 | consumed tokens: 65661829120 | elapsed time per iteration (s): 0.16 | learning rate: 5.284E-05 | global batch size: 256 | lm loss: 3.676959E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.203 | TFLOPs: 25.17 | +7: iteration 125250/ 173500 | consumed samples: 32064000 | consumed tokens: 65667072000 | elapsed time per iteration (s): 0.17 | learning rate: 5.283E-05 | global batch size: 256 | lm loss: 3.673605E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.591 | TFLOPs: 24.13 | +7: iteration 125260/ 173500 | consumed samples: 32066560 | consumed tokens: 65672314880 | elapsed time per iteration (s): 0.16 | learning rate: 5.282E-05 | global batch size: 256 | lm loss: 3.669079E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.376 | TFLOPs: 24.38 | +7: iteration 125270/ 173500 | consumed samples: 32069120 | consumed tokens: 65677557760 | elapsed time per iteration (s): 0.17 | learning rate: 5.280E-05 | global batch size: 256 | lm loss: 3.684611E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.656 | TFLOPs: 24.22 | +7: iteration 125280/ 173500 | consumed samples: 32071680 | consumed tokens: 65682800640 | elapsed time per iteration (s): 0.16 | learning rate: 5.279E-05 | global batch size: 256 | lm loss: 3.662834E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.255 | TFLOPs: 24.69 | +7: iteration 125290/ 173500 | consumed samples: 32074240 | consumed tokens: 65688043520 | elapsed time per iteration (s): 0.16 | learning rate: 5.278E-05 | global batch size: 256 | lm loss: 3.659532E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.804 | TFLOPs: 24.60 | +7: iteration 125300/ 173500 | consumed samples: 32076800 | consumed tokens: 65693286400 | elapsed time per iteration (s): 0.18 | learning rate: 5.277E-05 | global batch size: 256 | lm loss: 3.674423E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.405 | TFLOPs: 22.48 | +7: iteration 125310/ 173500 | consumed samples: 32079360 | consumed tokens: 65698529280 | elapsed time per iteration (s): 0.17 | learning rate: 5.275E-05 | global batch size: 256 | lm loss: 3.662975E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.572 | TFLOPs: 23.55 | +7: iteration 125320/ 173500 | consumed samples: 32081920 | consumed tokens: 65703772160 | elapsed time per iteration (s): 0.16 | learning rate: 5.274E-05 | global batch size: 256 | lm loss: 3.682027E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.740 | TFLOPs: 24.81 | +7: iteration 125330/ 173500 | consumed samples: 32084480 | consumed tokens: 65709015040 | elapsed time per iteration (s): 0.16 | learning rate: 5.273E-05 | global batch size: 256 | lm loss: 3.673071E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.336 | TFLOPs: 25.46 | +7: iteration 125340/ 173500 | consumed samples: 32087040 | consumed tokens: 65714257920 | elapsed time per iteration (s): 0.17 | learning rate: 5.272E-05 | global batch size: 256 | lm loss: 3.671188E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.471 | TFLOPs: 23.12 | +7: iteration 125350/ 173500 | consumed samples: 32089600 | consumed tokens: 65719500800 | elapsed time per iteration (s): 0.17 | learning rate: 5.270E-05 | global batch size: 256 | lm loss: 3.677359E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.067 | TFLOPs: 23.76 | +7: iteration 125360/ 173500 | consumed samples: 32092160 | consumed tokens: 65724743680 | elapsed time per iteration (s): 0.17 | learning rate: 5.269E-05 | global batch size: 256 | lm loss: 3.667063E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.375 | TFLOPs: 23.67 | +7: iteration 125370/ 173500 | consumed samples: 32094720 | consumed tokens: 65729986560 | elapsed time per iteration (s): 0.17 | learning rate: 5.268E-05 | global batch size: 256 | lm loss: 3.675673E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1506.087 | TFLOPs: 23.62 | +7: iteration 125380/ 173500 | consumed samples: 32097280 | consumed tokens: 65735229440 | elapsed time per iteration (s): 0.16 | learning rate: 5.267E-05 | global batch size: 256 | lm loss: 3.654708E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.770 | TFLOPs: 25.39 | +7: iteration 125390/ 173500 | consumed samples: 32099840 | consumed tokens: 65740472320 | elapsed time per iteration (s): 0.17 | learning rate: 5.265E-05 | global batch size: 256 | lm loss: 3.671706E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.368 | TFLOPs: 24.20 | +7: iteration 125400/ 173500 | consumed samples: 32102400 | consumed tokens: 65745715200 | elapsed time per iteration (s): 0.17 | learning rate: 5.264E-05 | global batch size: 256 | lm loss: 3.687127E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1487.090 | TFLOPs: 23.32 | +7: iteration 125410/ 173500 | consumed samples: 32104960 | consumed tokens: 65750958080 | elapsed time per iteration (s): 0.16 | learning rate: 5.263E-05 | global batch size: 256 | lm loss: 3.674321E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.009 | TFLOPs: 25.11 | +7: iteration 125420/ 173500 | consumed samples: 32107520 | consumed tokens: 65756200960 | elapsed time per iteration (s): 0.17 | learning rate: 5.261E-05 | global batch size: 256 | lm loss: 3.678783E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.746 | TFLOPs: 23.17 | +7: iteration 125430/ 173500 | consumed samples: 32110080 | consumed tokens: 65761443840 | elapsed time per iteration (s): 0.17 | learning rate: 5.260E-05 | global batch size: 256 | lm loss: 3.666229E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.138 | TFLOPs: 23.84 | +7: iteration 125440/ 173500 | consumed samples: 32112640 | consumed tokens: 65766686720 | elapsed time per iteration (s): 0.16 | learning rate: 5.259E-05 | global batch size: 256 | lm loss: 3.669290E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.779 | TFLOPs: 24.93 | +7: iteration 125450/ 173500 | consumed samples: 32115200 | consumed tokens: 65771929600 | elapsed time per iteration (s): 0.16 | learning rate: 5.258E-05 | global batch size: 256 | lm loss: 3.680379E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.768 | TFLOPs: 24.34 | +7: iteration 125460/ 173500 | consumed samples: 32117760 | consumed tokens: 65777172480 | elapsed time per iteration (s): 0.16 | learning rate: 5.256E-05 | global batch size: 256 | lm loss: 3.664737E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.602 | TFLOPs: 24.76 | +7: iteration 125470/ 173500 | consumed samples: 32120320 | consumed tokens: 65782415360 | elapsed time per iteration (s): 0.16 | learning rate: 5.255E-05 | global batch size: 256 | lm loss: 3.686128E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.934 | TFLOPs: 25.00 | +7: iteration 125480/ 173500 | consumed samples: 32122880 | consumed tokens: 65787658240 | elapsed time per iteration (s): 0.17 | learning rate: 5.254E-05 | global batch size: 256 | lm loss: 3.675103E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.369 | TFLOPs: 24.20 | +7: iteration 125490/ 173500 | consumed samples: 32125440 | consumed tokens: 65792901120 | elapsed time per iteration (s): 0.16 | learning rate: 5.253E-05 | global batch size: 256 | lm loss: 3.670212E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.789 | TFLOPs: 24.34 | +7: iteration 125500/ 173500 | consumed samples: 32128000 | consumed tokens: 65798144000 | elapsed time per iteration (s): 0.16 | learning rate: 5.251E-05 | global batch size: 256 | lm loss: 3.665689E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.029 | TFLOPs: 25.05 | +7: iteration 125510/ 173500 | consumed samples: 32130560 | consumed tokens: 65803386880 | elapsed time per iteration (s): 0.16 | learning rate: 5.250E-05 | global batch size: 256 | lm loss: 3.668015E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.296 | TFLOPs: 24.97 | +7: iteration 125520/ 173500 | consumed samples: 32133120 | consumed tokens: 65808629760 | elapsed time per iteration (s): 0.16 | learning rate: 5.249E-05 | global batch size: 256 | lm loss: 3.668363E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.357 | TFLOPs: 25.66 | +7: iteration 125530/ 173500 | consumed samples: 32135680 | consumed tokens: 65813872640 | elapsed time per iteration (s): 0.16 | learning rate: 5.247E-05 | global batch size: 256 | lm loss: 3.681707E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.368 | TFLOPs: 24.77 | +7: iteration 125540/ 173500 | consumed samples: 32138240 | consumed tokens: 65819115520 | elapsed time per iteration (s): 0.18 | learning rate: 5.246E-05 | global batch size: 256 | lm loss: 3.680441E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.483 | TFLOPs: 22.56 | +7: iteration 125550/ 173500 | consumed samples: 32140800 | consumed tokens: 65824358400 | elapsed time per iteration (s): 0.16 | learning rate: 5.245E-05 | global batch size: 256 | lm loss: 3.670554E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.059 | TFLOPs: 25.38 | +7: iteration 125560/ 173500 | consumed samples: 32143360 | consumed tokens: 65829601280 | elapsed time per iteration (s): 0.16 | learning rate: 5.244E-05 | global batch size: 256 | lm loss: 3.681263E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.840 | TFLOPs: 25.00 | +7: iteration 125570/ 173500 | consumed samples: 32145920 | consumed tokens: 65834844160 | elapsed time per iteration (s): 0.16 | learning rate: 5.242E-05 | global batch size: 256 | lm loss: 3.659599E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.894 | TFLOPs: 25.61 | +7: iteration 125580/ 173500 | consumed samples: 32148480 | consumed tokens: 65840087040 | elapsed time per iteration (s): 0.16 | learning rate: 5.241E-05 | global batch size: 256 | lm loss: 3.685812E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.160 | TFLOPs: 24.81 | +7: iteration 125590/ 173500 | consumed samples: 32151040 | consumed tokens: 65845329920 | elapsed time per iteration (s): 0.16 | learning rate: 5.240E-05 | global batch size: 256 | lm loss: 3.668555E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.212 | TFLOPs: 25.19 | +7: iteration 125600/ 173500 | consumed samples: 32153600 | consumed tokens: 65850572800 | elapsed time per iteration (s): 0.16 | learning rate: 5.239E-05 | global batch size: 256 | lm loss: 3.662978E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.248 | TFLOPs: 25.21 | +7: iteration 125610/ 173500 | consumed samples: 32156160 | consumed tokens: 65855815680 | elapsed time per iteration (s): 0.16 | learning rate: 5.237E-05 | global batch size: 256 | lm loss: 3.673339E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.123 | TFLOPs: 24.44 | +7: iteration 125620/ 173500 | consumed samples: 32158720 | consumed tokens: 65861058560 | elapsed time per iteration (s): 0.16 | learning rate: 5.236E-05 | global batch size: 256 | lm loss: 3.676262E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.366 | TFLOPs: 24.86 | +7: iteration 125630/ 173500 | consumed samples: 32161280 | consumed tokens: 65866301440 | elapsed time per iteration (s): 0.16 | learning rate: 5.235E-05 | global batch size: 256 | lm loss: 3.685810E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.458 | TFLOPs: 25.48 | +7: iteration 125640/ 173500 | consumed samples: 32163840 | consumed tokens: 65871544320 | elapsed time per iteration (s): 0.16 | learning rate: 5.234E-05 | global batch size: 256 | lm loss: 3.671003E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.243 | TFLOPs: 24.97 | +7: iteration 125650/ 173500 | consumed samples: 32166400 | consumed tokens: 65876787200 | elapsed time per iteration (s): 0.17 | learning rate: 5.232E-05 | global batch size: 256 | lm loss: 3.671210E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.472 | TFLOPs: 24.27 | +7: iteration 125660/ 173500 | consumed samples: 32168960 | consumed tokens: 65882030080 | elapsed time per iteration (s): 0.16 | learning rate: 5.231E-05 | global batch size: 256 | lm loss: 3.684701E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.788 | TFLOPs: 25.21 | +7: iteration 125670/ 173500 | consumed samples: 32171520 | consumed tokens: 65887272960 | elapsed time per iteration (s): 0.15 | learning rate: 5.230E-05 | global batch size: 256 | lm loss: 3.676912E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.515 | TFLOPs: 25.96 | +7: iteration 125680/ 173500 | consumed samples: 32174080 | consumed tokens: 65892515840 | elapsed time per iteration (s): 0.16 | learning rate: 5.229E-05 | global batch size: 256 | lm loss: 3.662139E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.235 | TFLOPs: 25.64 | +7: iteration 125690/ 173500 | consumed samples: 32176640 | consumed tokens: 65897758720 | elapsed time per iteration (s): 0.16 | learning rate: 5.227E-05 | global batch size: 256 | lm loss: 3.674046E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.390 | TFLOPs: 25.13 | +7: iteration 125700/ 173500 | consumed samples: 32179200 | consumed tokens: 65903001600 | elapsed time per iteration (s): 0.16 | learning rate: 5.226E-05 | global batch size: 256 | lm loss: 3.677587E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.230 | TFLOPs: 25.68 | +7: iteration 125710/ 173500 | consumed samples: 32181760 | consumed tokens: 65908244480 | elapsed time per iteration (s): 0.15 | learning rate: 5.225E-05 | global batch size: 256 | lm loss: 3.673226E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.972 | TFLOPs: 26.00 | +7: iteration 125720/ 173500 | consumed samples: 32184320 | consumed tokens: 65913487360 | elapsed time per iteration (s): 0.15 | learning rate: 5.223E-05 | global batch size: 256 | lm loss: 3.684871E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.095 | TFLOPs: 26.36 | +7: iteration 125730/ 173500 | consumed samples: 32186880 | consumed tokens: 65918730240 | elapsed time per iteration (s): 0.15 | learning rate: 5.222E-05 | global batch size: 256 | lm loss: 3.671383E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.378 | TFLOPs: 25.93 | +7: iteration 125740/ 173500 | consumed samples: 32189440 | consumed tokens: 65923973120 | elapsed time per iteration (s): 0.16 | learning rate: 5.221E-05 | global batch size: 256 | lm loss: 3.673079E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.794 | TFLOPs: 24.56 | +7: iteration 125750/ 173500 | consumed samples: 32192000 | consumed tokens: 65929216000 | elapsed time per iteration (s): 0.16 | learning rate: 5.220E-05 | global batch size: 256 | lm loss: 3.676999E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.436 | TFLOPs: 24.50 | +7: iteration 125760/ 173500 | consumed samples: 32194560 | consumed tokens: 65934458880 | elapsed time per iteration (s): 0.17 | learning rate: 5.218E-05 | global batch size: 256 | lm loss: 3.664331E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.274 | TFLOPs: 23.98 | +7: iteration 125770/ 173500 | consumed samples: 32197120 | consumed tokens: 65939701760 | elapsed time per iteration (s): 0.16 | learning rate: 5.217E-05 | global batch size: 256 | lm loss: 3.680555E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.438 | TFLOPs: 24.60 | +7: iteration 125780/ 173500 | consumed samples: 32199680 | consumed tokens: 65944944640 | elapsed time per iteration (s): 0.16 | learning rate: 5.216E-05 | global batch size: 256 | lm loss: 3.669197E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.346 | TFLOPs: 25.58 | +7: iteration 125790/ 173500 | consumed samples: 32202240 | consumed tokens: 65950187520 | elapsed time per iteration (s): 0.16 | learning rate: 5.215E-05 | global batch size: 256 | lm loss: 3.683130E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.671 | TFLOPs: 24.54 | +7: iteration 125800/ 173500 | consumed samples: 32204800 | consumed tokens: 65955430400 | elapsed time per iteration (s): 0.16 | learning rate: 5.213E-05 | global batch size: 256 | lm loss: 3.685134E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.616 | TFLOPs: 24.60 | +7: iteration 125810/ 173500 | consumed samples: 32207360 | consumed tokens: 65960673280 | elapsed time per iteration (s): 0.16 | learning rate: 5.212E-05 | global batch size: 256 | lm loss: 3.683408E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.425 | TFLOPs: 25.84 | +7: iteration 125820/ 173500 | consumed samples: 32209920 | consumed tokens: 65965916160 | elapsed time per iteration (s): 0.16 | learning rate: 5.211E-05 | global batch size: 256 | lm loss: 3.687181E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.576 | TFLOPs: 25.68 | +7: iteration 125830/ 173500 | consumed samples: 32212480 | consumed tokens: 65971159040 | elapsed time per iteration (s): 0.17 | learning rate: 5.210E-05 | global batch size: 256 | lm loss: 3.679641E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.580 | TFLOPs: 23.77 | +7: iteration 125840/ 173500 | consumed samples: 32215040 | consumed tokens: 65976401920 | elapsed time per iteration (s): 0.17 | learning rate: 5.208E-05 | global batch size: 256 | lm loss: 3.685805E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.273 | TFLOPs: 23.43 | +7: iteration 125850/ 173500 | consumed samples: 32217600 | consumed tokens: 65981644800 | elapsed time per iteration (s): 0.17 | learning rate: 5.207E-05 | global batch size: 256 | lm loss: 3.665612E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.427 | TFLOPs: 23.70 | +7: iteration 125860/ 173500 | consumed samples: 32220160 | consumed tokens: 65986887680 | elapsed time per iteration (s): 0.16 | learning rate: 5.206E-05 | global batch size: 256 | lm loss: 3.694484E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.750 | TFLOPs: 25.17 | +7: iteration 125870/ 173500 | consumed samples: 32222720 | consumed tokens: 65992130560 | elapsed time per iteration (s): 0.16 | learning rate: 5.205E-05 | global batch size: 256 | lm loss: 3.682453E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.782 | TFLOPs: 24.70 | +7: iteration 125880/ 173500 | consumed samples: 32225280 | consumed tokens: 65997373440 | elapsed time per iteration (s): 0.16 | learning rate: 5.203E-05 | global batch size: 256 | lm loss: 3.668372E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.241 | TFLOPs: 25.06 | +7: iteration 125890/ 173500 | consumed samples: 32227840 | consumed tokens: 66002616320 | elapsed time per iteration (s): 0.16 | learning rate: 5.202E-05 | global batch size: 256 | lm loss: 3.686376E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.392 | TFLOPs: 24.96 | +7: iteration 125900/ 173500 | consumed samples: 32230400 | consumed tokens: 66007859200 | elapsed time per iteration (s): 0.17 | learning rate: 5.201E-05 | global batch size: 256 | lm loss: 3.669669E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.331 | TFLOPs: 24.12 | +7: iteration 125910/ 173500 | consumed samples: 32232960 | consumed tokens: 66013102080 | elapsed time per iteration (s): 0.16 | learning rate: 5.200E-05 | global batch size: 256 | lm loss: 3.672898E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.293 | TFLOPs: 25.85 | +7: iteration 125920/ 173500 | consumed samples: 32235520 | consumed tokens: 66018344960 | elapsed time per iteration (s): 0.17 | learning rate: 5.198E-05 | global batch size: 256 | lm loss: 3.667033E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.341 | TFLOPs: 24.03 | +7: iteration 125930/ 173500 | consumed samples: 32238080 | consumed tokens: 66023587840 | elapsed time per iteration (s): 0.16 | learning rate: 5.197E-05 | global batch size: 256 | lm loss: 3.686175E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.040 | TFLOPs: 24.76 | +7: iteration 125940/ 173500 | consumed samples: 32240640 | consumed tokens: 66028830720 | elapsed time per iteration (s): 0.16 | learning rate: 5.196E-05 | global batch size: 256 | lm loss: 3.674493E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.047 | TFLOPs: 25.81 | +7: iteration 125950/ 173500 | consumed samples: 32243200 | consumed tokens: 66034073600 | elapsed time per iteration (s): 0.17 | learning rate: 5.194E-05 | global batch size: 256 | lm loss: 3.679520E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.759 | TFLOPs: 23.50 | +7: iteration 125960/ 173500 | consumed samples: 32245760 | consumed tokens: 66039316480 | elapsed time per iteration (s): 0.17 | learning rate: 5.193E-05 | global batch size: 256 | lm loss: 3.691349E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.565 | TFLOPs: 23.77 | +7: iteration 125970/ 173500 | consumed samples: 32248320 | consumed tokens: 66044559360 | elapsed time per iteration (s): 0.16 | learning rate: 5.192E-05 | global batch size: 256 | lm loss: 3.669551E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.021 | TFLOPs: 25.30 | +7: iteration 125980/ 173500 | consumed samples: 32250880 | consumed tokens: 66049802240 | elapsed time per iteration (s): 0.16 | learning rate: 5.191E-05 | global batch size: 256 | lm loss: 3.669004E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.923 | TFLOPs: 25.22 | +7: iteration 125990/ 173500 | consumed samples: 32253440 | consumed tokens: 66055045120 | elapsed time per iteration (s): 0.16 | learning rate: 5.189E-05 | global batch size: 256 | lm loss: 3.671162E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.180 | TFLOPs: 25.14 | +0: [2023-03-17 05:43:48,489] [INFO] [logging.py:68:log_dist] [Rank 0] step=126000, skipped=0, lr=[5.188210163686188e-05, 5.188210163686188e-05, 5.188210163686188e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 126000/ 173500 | consumed samples: 32256000 | consumed tokens: 66060288000 | elapsed time per iteration (s): 0.17 | learning rate: 5.188E-05 | global batch size: 256 | lm loss: 3.672460E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1514.020 | TFLOPs: 23.74 | +0: steps: 126000 loss: 3.6912 iter time (s): 0.163 samples/sec: 1575.356 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 126000 | lm loss value: 3.849436E+00 | lm loss PPL: 4.696655E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 126000 to checkpoints_44m91b100m +0: [2023-03-17 05:43:48,576] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step126000 is begin to save! +0: [2023-03-17 05:43:48,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:43:48,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:43:48,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:43:48,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:43:48,656] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:43:48,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:43:48,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:43:48,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:43:48,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:43:48,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:43:48,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:43:48,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:43:48,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:43:48,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:43:48,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:43:48,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:43:48,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:43:48,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:43:48,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:43:48,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:43:48,715] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step126000/mp_rank_00_model_states.pt +0: [2023-03-17 05:43:48,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:43:48,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:43:48,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +6: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +5: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +4: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +2: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 05:43:48,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: [2023-03-17 05:43:48,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +1: [2023-03-17 05:43:48,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:43:48,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:43:48,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +3: [2023-03-17 05:43:48,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:43:48,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:43:48,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +7: [2023-03-17 05:43:48,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:43:48,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step126000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:43:48,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step126000 is ready now! +0: successfully saved checkpoint at iteration 126000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.93 +7: iteration 126010/ 173500 | consumed samples: 32258560 | consumed tokens: 66065530880 | elapsed time per iteration (s): 0.18 | learning rate: 5.187E-05 | global batch size: 256 | lm loss: 3.674609E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.174 | TFLOPs: 21.75 | +7: iteration 126020/ 173500 | consumed samples: 32261120 | consumed tokens: 66070773760 | elapsed time per iteration (s): 0.16 | learning rate: 5.186E-05 | global batch size: 256 | lm loss: 3.675691E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.777 | TFLOPs: 25.04 | +7: iteration 126030/ 173500 | consumed samples: 32263680 | consumed tokens: 66076016640 | elapsed time per iteration (s): 0.16 | learning rate: 5.184E-05 | global batch size: 256 | lm loss: 3.674224E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.409 | TFLOPs: 24.78 | +7: iteration 126040/ 173500 | consumed samples: 32266240 | consumed tokens: 66081259520 | elapsed time per iteration (s): 0.17 | learning rate: 5.183E-05 | global batch size: 256 | lm loss: 3.678322E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.693 | TFLOPs: 23.71 | +7: iteration 126050/ 173500 | consumed samples: 32268800 | consumed tokens: 66086502400 | elapsed time per iteration (s): 0.18 | learning rate: 5.182E-05 | global batch size: 256 | lm loss: 3.668143E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.809 | TFLOPs: 22.16 | +7: iteration 126060/ 173500 | consumed samples: 32271360 | consumed tokens: 66091745280 | elapsed time per iteration (s): 0.18 | learning rate: 5.181E-05 | global batch size: 256 | lm loss: 3.672959E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1421.759 | TFLOPs: 22.30 | +7: iteration 126070/ 173500 | consumed samples: 32273920 | consumed tokens: 66096988160 | elapsed time per iteration (s): 0.16 | learning rate: 5.179E-05 | global batch size: 256 | lm loss: 3.667119E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.809 | TFLOPs: 24.40 | +7: iteration 126080/ 173500 | consumed samples: 32276480 | consumed tokens: 66102231040 | elapsed time per iteration (s): 0.16 | learning rate: 5.178E-05 | global batch size: 256 | lm loss: 3.683739E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.433 | TFLOPs: 25.05 | +7: iteration 126090/ 173500 | consumed samples: 32279040 | consumed tokens: 66107473920 | elapsed time per iteration (s): 0.16 | learning rate: 5.177E-05 | global batch size: 256 | lm loss: 3.662270E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.273 | TFLOPs: 25.55 | +7: iteration 126100/ 173500 | consumed samples: 32281600 | consumed tokens: 66112716800 | elapsed time per iteration (s): 0.16 | learning rate: 5.176E-05 | global batch size: 256 | lm loss: 3.682554E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.410 | TFLOPs: 25.29 | +7: iteration 126110/ 173500 | consumed samples: 32284160 | consumed tokens: 66117959680 | elapsed time per iteration (s): 0.16 | learning rate: 5.174E-05 | global batch size: 256 | lm loss: 3.672919E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.839 | TFLOPs: 24.35 | +7: iteration 126120/ 173500 | consumed samples: 32286720 | consumed tokens: 66123202560 | elapsed time per iteration (s): 0.17 | learning rate: 5.173E-05 | global batch size: 256 | lm loss: 3.668909E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.278 | TFLOPs: 23.73 | +7: iteration 126130/ 173500 | consumed samples: 32289280 | consumed tokens: 66128445440 | elapsed time per iteration (s): 0.16 | learning rate: 5.172E-05 | global batch size: 256 | lm loss: 3.695432E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.329 | TFLOPs: 25.07 | +7: iteration 126140/ 173500 | consumed samples: 32291840 | consumed tokens: 66133688320 | elapsed time per iteration (s): 0.16 | learning rate: 5.171E-05 | global batch size: 256 | lm loss: 3.662457E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.550 | TFLOPs: 25.45 | +7: iteration 126150/ 173500 | consumed samples: 32294400 | consumed tokens: 66138931200 | elapsed time per iteration (s): 0.17 | learning rate: 5.169E-05 | global batch size: 256 | lm loss: 3.674766E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.388 | TFLOPs: 24.24 | +7: iteration 126160/ 173500 | consumed samples: 32296960 | consumed tokens: 66144174080 | elapsed time per iteration (s): 0.17 | learning rate: 5.168E-05 | global batch size: 256 | lm loss: 3.674342E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.528 | TFLOPs: 24.24 | +7: iteration 126170/ 173500 | consumed samples: 32299520 | consumed tokens: 66149416960 | elapsed time per iteration (s): 0.16 | learning rate: 5.167E-05 | global batch size: 256 | lm loss: 3.681684E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.405 | TFLOPs: 24.35 | +7: iteration 126180/ 173500 | consumed samples: 32302080 | consumed tokens: 66154659840 | elapsed time per iteration (s): 0.16 | learning rate: 5.166E-05 | global batch size: 256 | lm loss: 3.668811E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.641 | TFLOPs: 24.77 | +7: iteration 126190/ 173500 | consumed samples: 32304640 | consumed tokens: 66159902720 | elapsed time per iteration (s): 0.16 | learning rate: 5.164E-05 | global batch size: 256 | lm loss: 3.678822E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.704 | TFLOPs: 24.55 | +7: iteration 126200/ 173500 | consumed samples: 32307200 | consumed tokens: 66165145600 | elapsed time per iteration (s): 0.17 | learning rate: 5.163E-05 | global batch size: 256 | lm loss: 3.684317E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.138 | TFLOPs: 24.20 | +7: iteration 126210/ 173500 | consumed samples: 32309760 | consumed tokens: 66170388480 | elapsed time per iteration (s): 0.16 | learning rate: 5.162E-05 | global batch size: 256 | lm loss: 3.664049E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.613 | TFLOPs: 24.96 | +7: iteration 126220/ 173500 | consumed samples: 32312320 | consumed tokens: 66175631360 | elapsed time per iteration (s): 0.17 | learning rate: 5.161E-05 | global batch size: 256 | lm loss: 3.671878E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.054 | TFLOPs: 23.01 | +7: iteration 126230/ 173500 | consumed samples: 32314880 | consumed tokens: 66180874240 | elapsed time per iteration (s): 0.17 | learning rate: 5.159E-05 | global batch size: 256 | lm loss: 3.676185E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.331 | TFLOPs: 24.33 | +7: iteration 126240/ 173500 | consumed samples: 32317440 | consumed tokens: 66186117120 | elapsed time per iteration (s): 0.16 | learning rate: 5.158E-05 | global batch size: 256 | lm loss: 3.675609E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.618 | TFLOPs: 24.98 | +7: iteration 126250/ 173500 | consumed samples: 32320000 | consumed tokens: 66191360000 | elapsed time per iteration (s): 0.16 | learning rate: 5.157E-05 | global batch size: 256 | lm loss: 3.674260E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.422 | TFLOPs: 24.63 | +7: iteration 126260/ 173500 | consumed samples: 32322560 | consumed tokens: 66196602880 | elapsed time per iteration (s): 0.16 | learning rate: 5.156E-05 | global batch size: 256 | lm loss: 3.665373E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.273 | TFLOPs: 24.70 | +7: iteration 126270/ 173500 | consumed samples: 32325120 | consumed tokens: 66201845760 | elapsed time per iteration (s): 0.16 | learning rate: 5.154E-05 | global batch size: 256 | lm loss: 3.683745E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.288 | TFLOPs: 24.58 | +7: iteration 126280/ 173500 | consumed samples: 32327680 | consumed tokens: 66207088640 | elapsed time per iteration (s): 0.17 | learning rate: 5.153E-05 | global batch size: 256 | lm loss: 3.664966E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.374 | TFLOPs: 24.33 | +7: iteration 126290/ 173500 | consumed samples: 32330240 | consumed tokens: 66212331520 | elapsed time per iteration (s): 0.16 | learning rate: 5.152E-05 | global batch size: 256 | lm loss: 3.673717E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.492 | TFLOPs: 24.96 | +7: iteration 126300/ 173500 | consumed samples: 32332800 | consumed tokens: 66217574400 | elapsed time per iteration (s): 0.16 | learning rate: 5.151E-05 | global batch size: 256 | lm loss: 3.671382E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.615 | TFLOPs: 24.52 | +7: iteration 126310/ 173500 | consumed samples: 32335360 | consumed tokens: 66222817280 | elapsed time per iteration (s): 0.16 | learning rate: 5.149E-05 | global batch size: 256 | lm loss: 3.685149E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.390 | TFLOPs: 25.43 | +7: iteration 126320/ 173500 | consumed samples: 32337920 | consumed tokens: 66228060160 | elapsed time per iteration (s): 0.16 | learning rate: 5.148E-05 | global batch size: 256 | lm loss: 3.668712E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.758 | TFLOPs: 24.96 | +7: iteration 126330/ 173500 | consumed samples: 32340480 | consumed tokens: 66233303040 | elapsed time per iteration (s): 0.16 | learning rate: 5.147E-05 | global batch size: 256 | lm loss: 3.669188E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.141 | TFLOPs: 25.49 | +7: iteration 126340/ 173500 | consumed samples: 32343040 | consumed tokens: 66238545920 | elapsed time per iteration (s): 0.16 | learning rate: 5.146E-05 | global batch size: 256 | lm loss: 3.680822E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.355 | TFLOPs: 25.07 | +7: iteration 126350/ 173500 | consumed samples: 32345600 | consumed tokens: 66243788800 | elapsed time per iteration (s): 0.16 | learning rate: 5.144E-05 | global batch size: 256 | lm loss: 3.685564E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.396 | TFLOPs: 24.67 | +7: iteration 126360/ 173500 | consumed samples: 32348160 | consumed tokens: 66249031680 | elapsed time per iteration (s): 0.16 | learning rate: 5.143E-05 | global batch size: 256 | lm loss: 3.683680E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.254 | TFLOPs: 24.72 | +7: iteration 126370/ 173500 | consumed samples: 32350720 | consumed tokens: 66254274560 | elapsed time per iteration (s): 0.17 | learning rate: 5.142E-05 | global batch size: 256 | lm loss: 3.669756E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.021 | TFLOPs: 23.93 | +7: iteration 126380/ 173500 | consumed samples: 32353280 | consumed tokens: 66259517440 | elapsed time per iteration (s): 0.16 | learning rate: 5.141E-05 | global batch size: 256 | lm loss: 3.681838E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.676 | TFLOPs: 25.81 | +7: iteration 126390/ 173500 | consumed samples: 32355840 | consumed tokens: 66264760320 | elapsed time per iteration (s): 0.16 | learning rate: 5.139E-05 | global batch size: 256 | lm loss: 3.684111E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.361 | TFLOPs: 25.83 | +7: iteration 126400/ 173500 | consumed samples: 32358400 | consumed tokens: 66270003200 | elapsed time per iteration (s): 0.16 | learning rate: 5.138E-05 | global batch size: 256 | lm loss: 3.679026E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.729 | TFLOPs: 24.90 | +7: iteration 126410/ 173500 | consumed samples: 32360960 | consumed tokens: 66275246080 | elapsed time per iteration (s): 0.16 | learning rate: 5.137E-05 | global batch size: 256 | lm loss: 3.672875E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.836 | TFLOPs: 24.82 | +7: iteration 126420/ 173500 | consumed samples: 32363520 | consumed tokens: 66280488960 | elapsed time per iteration (s): 0.17 | learning rate: 5.136E-05 | global batch size: 256 | lm loss: 3.665648E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.283 | TFLOPs: 24.22 | +7: iteration 126430/ 173500 | consumed samples: 32366080 | consumed tokens: 66285731840 | elapsed time per iteration (s): 0.16 | learning rate: 5.134E-05 | global batch size: 256 | lm loss: 3.678637E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.647 | TFLOPs: 25.18 | +7: iteration 126440/ 173500 | consumed samples: 32368640 | consumed tokens: 66290974720 | elapsed time per iteration (s): 0.17 | learning rate: 5.133E-05 | global batch size: 256 | lm loss: 3.674656E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.268 | TFLOPs: 24.03 | +7: iteration 126450/ 173500 | consumed samples: 32371200 | consumed tokens: 66296217600 | elapsed time per iteration (s): 0.18 | learning rate: 5.132E-05 | global batch size: 256 | lm loss: 3.662217E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.944 | TFLOPs: 21.97 | +7: iteration 126460/ 173500 | consumed samples: 32373760 | consumed tokens: 66301460480 | elapsed time per iteration (s): 0.17 | learning rate: 5.131E-05 | global batch size: 256 | lm loss: 3.664463E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.983 | TFLOPs: 24.07 | +7: iteration 126470/ 173500 | consumed samples: 32376320 | consumed tokens: 66306703360 | elapsed time per iteration (s): 0.17 | learning rate: 5.129E-05 | global batch size: 256 | lm loss: 3.668480E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.361 | TFLOPs: 24.03 | +7: iteration 126480/ 173500 | consumed samples: 32378880 | consumed tokens: 66311946240 | elapsed time per iteration (s): 0.16 | learning rate: 5.128E-05 | global batch size: 256 | lm loss: 3.670879E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.219 | TFLOPs: 25.14 | +7: iteration 126490/ 173500 | consumed samples: 32381440 | consumed tokens: 66317189120 | elapsed time per iteration (s): 0.16 | learning rate: 5.127E-05 | global batch size: 256 | lm loss: 3.678592E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.367 | TFLOPs: 24.36 | +7: iteration 126500/ 173500 | consumed samples: 32384000 | consumed tokens: 66322432000 | elapsed time per iteration (s): 0.18 | learning rate: 5.126E-05 | global batch size: 256 | lm loss: 3.686437E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.176 | TFLOPs: 22.60 | +7: iteration 126510/ 173500 | consumed samples: 32386560 | consumed tokens: 66327674880 | elapsed time per iteration (s): 0.16 | learning rate: 5.124E-05 | global batch size: 256 | lm loss: 3.666891E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.784 | TFLOPs: 24.76 | +7: iteration 126520/ 173500 | consumed samples: 32389120 | consumed tokens: 66332917760 | elapsed time per iteration (s): 0.16 | learning rate: 5.123E-05 | global batch size: 256 | lm loss: 3.672540E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.698 | TFLOPs: 24.49 | +7: iteration 126530/ 173500 | consumed samples: 32391680 | consumed tokens: 66338160640 | elapsed time per iteration (s): 0.16 | learning rate: 5.122E-05 | global batch size: 256 | lm loss: 3.685696E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.220 | TFLOPs: 24.42 | +7: iteration 126540/ 173500 | consumed samples: 32394240 | consumed tokens: 66343403520 | elapsed time per iteration (s): 0.16 | learning rate: 5.121E-05 | global batch size: 256 | lm loss: 3.679145E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.954 | TFLOPs: 25.66 | +7: iteration 126550/ 173500 | consumed samples: 32396800 | consumed tokens: 66348646400 | elapsed time per iteration (s): 0.17 | learning rate: 5.119E-05 | global batch size: 256 | lm loss: 3.675241E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.895 | TFLOPs: 23.77 | +7: iteration 126560/ 173500 | consumed samples: 32399360 | consumed tokens: 66353889280 | elapsed time per iteration (s): 0.16 | learning rate: 5.118E-05 | global batch size: 256 | lm loss: 3.683359E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.143 | TFLOPs: 25.49 | +7: iteration 126570/ 173500 | consumed samples: 32401920 | consumed tokens: 66359132160 | elapsed time per iteration (s): 0.17 | learning rate: 5.117E-05 | global batch size: 256 | lm loss: 3.677356E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.401 | TFLOPs: 24.08 | +7: iteration 126580/ 173500 | consumed samples: 32404480 | consumed tokens: 66364375040 | elapsed time per iteration (s): 0.17 | learning rate: 5.116E-05 | global batch size: 256 | lm loss: 3.666619E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.874 | TFLOPs: 24.26 | +7: iteration 126590/ 173500 | consumed samples: 32407040 | consumed tokens: 66369617920 | elapsed time per iteration (s): 0.17 | learning rate: 5.114E-05 | global batch size: 256 | lm loss: 3.667859E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.964 | TFLOPs: 24.24 | +7: iteration 126600/ 173500 | consumed samples: 32409600 | consumed tokens: 66374860800 | elapsed time per iteration (s): 0.16 | learning rate: 5.113E-05 | global batch size: 256 | lm loss: 3.662096E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.659 | TFLOPs: 24.41 | +7: iteration 126610/ 173500 | consumed samples: 32412160 | consumed tokens: 66380103680 | elapsed time per iteration (s): 0.16 | learning rate: 5.112E-05 | global batch size: 256 | lm loss: 3.669863E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.769 | TFLOPs: 25.18 | +7: iteration 126620/ 173500 | consumed samples: 32414720 | consumed tokens: 66385346560 | elapsed time per iteration (s): 0.16 | learning rate: 5.111E-05 | global batch size: 256 | lm loss: 3.668609E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.758 | TFLOPs: 24.95 | +7: iteration 126630/ 173500 | consumed samples: 32417280 | consumed tokens: 66390589440 | elapsed time per iteration (s): 0.16 | learning rate: 5.109E-05 | global batch size: 256 | lm loss: 3.675956E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.178 | TFLOPs: 25.30 | +7: iteration 126640/ 173500 | consumed samples: 32419840 | consumed tokens: 66395832320 | elapsed time per iteration (s): 0.16 | learning rate: 5.108E-05 | global batch size: 256 | lm loss: 3.686105E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.986 | TFLOPs: 25.52 | +7: iteration 126650/ 173500 | consumed samples: 32422400 | consumed tokens: 66401075200 | elapsed time per iteration (s): 0.17 | learning rate: 5.107E-05 | global batch size: 256 | lm loss: 3.679379E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.491 | TFLOPs: 23.45 | +7: iteration 126660/ 173500 | consumed samples: 32424960 | consumed tokens: 66406318080 | elapsed time per iteration (s): 0.16 | learning rate: 5.106E-05 | global batch size: 256 | lm loss: 3.667430E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.044 | TFLOPs: 24.90 | +7: iteration 126670/ 173500 | consumed samples: 32427520 | consumed tokens: 66411560960 | elapsed time per iteration (s): 0.17 | learning rate: 5.104E-05 | global batch size: 256 | lm loss: 3.677322E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.246 | TFLOPs: 24.03 | +7: iteration 126680/ 173500 | consumed samples: 32430080 | consumed tokens: 66416803840 | elapsed time per iteration (s): 0.16 | learning rate: 5.103E-05 | global batch size: 256 | lm loss: 3.657727E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.641 | TFLOPs: 24.33 | +7: iteration 126690/ 173500 | consumed samples: 32432640 | consumed tokens: 66422046720 | elapsed time per iteration (s): 0.16 | learning rate: 5.102E-05 | global batch size: 256 | lm loss: 3.667757E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.193 | TFLOPs: 24.80 | +7: iteration 126700/ 173500 | consumed samples: 32435200 | consumed tokens: 66427289600 | elapsed time per iteration (s): 0.16 | learning rate: 5.101E-05 | global batch size: 256 | lm loss: 3.674963E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.553 | TFLOPs: 24.60 | +7: iteration 126710/ 173500 | consumed samples: 32437760 | consumed tokens: 66432532480 | elapsed time per iteration (s): 0.16 | learning rate: 5.099E-05 | global batch size: 256 | lm loss: 3.680357E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.604 | TFLOPs: 24.73 | +7: iteration 126720/ 173500 | consumed samples: 32440320 | consumed tokens: 66437775360 | elapsed time per iteration (s): 0.16 | learning rate: 5.098E-05 | global batch size: 256 | lm loss: 3.673418E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.907 | TFLOPs: 24.95 | +7: iteration 126730/ 173500 | consumed samples: 32442880 | consumed tokens: 66443018240 | elapsed time per iteration (s): 0.16 | learning rate: 5.097E-05 | global batch size: 256 | lm loss: 3.666757E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.651 | TFLOPs: 25.29 | +7: iteration 126740/ 173500 | consumed samples: 32445440 | consumed tokens: 66448261120 | elapsed time per iteration (s): 0.16 | learning rate: 5.096E-05 | global batch size: 256 | lm loss: 3.692200E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.903 | TFLOPs: 24.60 | +7: iteration 126750/ 173500 | consumed samples: 32448000 | consumed tokens: 66453504000 | elapsed time per iteration (s): 0.16 | learning rate: 5.094E-05 | global batch size: 256 | lm loss: 3.665219E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.130 | TFLOPs: 24.97 | +7: iteration 126760/ 173500 | consumed samples: 32450560 | consumed tokens: 66458746880 | elapsed time per iteration (s): 0.16 | learning rate: 5.093E-05 | global batch size: 256 | lm loss: 3.670623E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.378 | TFLOPs: 24.53 | +7: iteration 126770/ 173500 | consumed samples: 32453120 | consumed tokens: 66463989760 | elapsed time per iteration (s): 0.16 | learning rate: 5.092E-05 | global batch size: 256 | lm loss: 3.674207E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.917 | TFLOPs: 24.89 | +7: iteration 126780/ 173500 | consumed samples: 32455680 | consumed tokens: 66469232640 | elapsed time per iteration (s): 0.17 | learning rate: 5.091E-05 | global batch size: 256 | lm loss: 3.677649E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.241 | TFLOPs: 24.14 | +7: iteration 126790/ 173500 | consumed samples: 32458240 | consumed tokens: 66474475520 | elapsed time per iteration (s): 0.15 | learning rate: 5.090E-05 | global batch size: 256 | lm loss: 3.663182E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.990 | TFLOPs: 26.33 | +7: iteration 126800/ 173500 | consumed samples: 32460800 | consumed tokens: 66479718400 | elapsed time per iteration (s): 0.17 | learning rate: 5.088E-05 | global batch size: 256 | lm loss: 3.671255E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.962 | TFLOPs: 23.35 | +7: iteration 126810/ 173500 | consumed samples: 32463360 | consumed tokens: 66484961280 | elapsed time per iteration (s): 0.16 | learning rate: 5.087E-05 | global batch size: 256 | lm loss: 3.687476E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.665 | TFLOPs: 24.91 | +7: iteration 126820/ 173500 | consumed samples: 32465920 | consumed tokens: 66490204160 | elapsed time per iteration (s): 0.17 | learning rate: 5.086E-05 | global batch size: 256 | lm loss: 3.668205E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.778 | TFLOPs: 24.15 | +7: iteration 126830/ 173500 | consumed samples: 32468480 | consumed tokens: 66495447040 | elapsed time per iteration (s): 0.18 | learning rate: 5.085E-05 | global batch size: 256 | lm loss: 3.689893E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.989 | TFLOPs: 22.08 | +7: iteration 126840/ 173500 | consumed samples: 32471040 | consumed tokens: 66500689920 | elapsed time per iteration (s): 0.16 | learning rate: 5.083E-05 | global batch size: 256 | lm loss: 3.660413E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.055 | TFLOPs: 25.74 | +7: iteration 126850/ 173500 | consumed samples: 32473600 | consumed tokens: 66505932800 | elapsed time per iteration (s): 0.16 | learning rate: 5.082E-05 | global batch size: 256 | lm loss: 3.676944E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.168 | TFLOPs: 24.95 | +7: iteration 126860/ 173500 | consumed samples: 32476160 | consumed tokens: 66511175680 | elapsed time per iteration (s): 0.16 | learning rate: 5.081E-05 | global batch size: 256 | lm loss: 3.671935E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.484 | TFLOPs: 24.72 | +7: iteration 126870/ 173500 | consumed samples: 32478720 | consumed tokens: 66516418560 | elapsed time per iteration (s): 0.16 | learning rate: 5.080E-05 | global batch size: 256 | lm loss: 3.668020E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.187 | TFLOPs: 24.78 | +7: iteration 126880/ 173500 | consumed samples: 32481280 | consumed tokens: 66521661440 | elapsed time per iteration (s): 0.16 | learning rate: 5.078E-05 | global batch size: 256 | lm loss: 3.673340E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.878 | TFLOPs: 25.11 | +7: iteration 126890/ 173500 | consumed samples: 32483840 | consumed tokens: 66526904320 | elapsed time per iteration (s): 0.16 | learning rate: 5.077E-05 | global batch size: 256 | lm loss: 3.681957E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.645 | TFLOPs: 25.89 | +7: iteration 126900/ 173500 | consumed samples: 32486400 | consumed tokens: 66532147200 | elapsed time per iteration (s): 0.16 | learning rate: 5.076E-05 | global batch size: 256 | lm loss: 3.670059E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.999 | TFLOPs: 24.68 | +7: iteration 126910/ 173500 | consumed samples: 32488960 | consumed tokens: 66537390080 | elapsed time per iteration (s): 0.16 | learning rate: 5.075E-05 | global batch size: 256 | lm loss: 3.685491E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.400 | TFLOPs: 25.84 | +7: iteration 126920/ 173500 | consumed samples: 32491520 | consumed tokens: 66542632960 | elapsed time per iteration (s): 0.15 | learning rate: 5.073E-05 | global batch size: 256 | lm loss: 3.676434E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.422 | TFLOPs: 26.06 | +7: iteration 126930/ 173500 | consumed samples: 32494080 | consumed tokens: 66547875840 | elapsed time per iteration (s): 0.15 | learning rate: 5.072E-05 | global batch size: 256 | lm loss: 3.658028E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.005 | TFLOPs: 26.17 | +7: iteration 126940/ 173500 | consumed samples: 32496640 | consumed tokens: 66553118720 | elapsed time per iteration (s): 0.16 | learning rate: 5.071E-05 | global batch size: 256 | lm loss: 3.674588E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.813 | TFLOPs: 24.95 | +7: iteration 126950/ 173500 | consumed samples: 32499200 | consumed tokens: 66558361600 | elapsed time per iteration (s): 0.16 | learning rate: 5.070E-05 | global batch size: 256 | lm loss: 3.675589E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.363 | TFLOPs: 25.52 | +7: iteration 126960/ 173500 | consumed samples: 32501760 | consumed tokens: 66563604480 | elapsed time per iteration (s): 0.16 | learning rate: 5.068E-05 | global batch size: 256 | lm loss: 3.661701E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.115 | TFLOPs: 24.47 | +7: iteration 126970/ 173500 | consumed samples: 32504320 | consumed tokens: 66568847360 | elapsed time per iteration (s): 0.16 | learning rate: 5.067E-05 | global batch size: 256 | lm loss: 3.682434E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.624 | TFLOPs: 25.76 | +7: iteration 126980/ 173500 | consumed samples: 32506880 | consumed tokens: 66574090240 | elapsed time per iteration (s): 0.16 | learning rate: 5.066E-05 | global batch size: 256 | lm loss: 3.671803E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.875 | TFLOPs: 24.85 | +7: iteration 126990/ 173500 | consumed samples: 32509440 | consumed tokens: 66579333120 | elapsed time per iteration (s): 0.16 | learning rate: 5.065E-05 | global batch size: 256 | lm loss: 3.666071E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.231 | TFLOPs: 25.53 | +7: iteration 127000/ 173500 | consumed samples: 32512000 | consumed tokens: 66584576000 | elapsed time per iteration (s): 0.17 | learning rate: 5.064E-05 | global batch size: 256 | lm loss: 3.662415E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.491 | TFLOPs: 24.05 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 127000 | lm loss value: 3.848929E+00 | lm loss PPL: 4.694275E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 127000 to checkpoints_44m91b100m +0: [2023-03-17 05:46:31,854] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step127000 is begin to save! +0: [2023-03-17 05:46:31,858] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:46:31,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:46:31,926] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:46:31,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:46:31,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:46:31,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:46:31,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:46:31,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:46:31,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:46:31,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:46:31,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:46:31,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:46:31,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:46:31,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:46:31,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:46:31,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:46:31,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:46:31,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:46:31,992] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:46:31,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:46:31,993] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step127000/mp_rank_00_model_states.pt +0: [2023-03-17 05:46:31,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:46:31,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:46:32,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:46:32,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:46:32,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +4: [2023-03-17 05:46:32,033] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: [2023-03-17 05:46:32,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +5: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +1: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +7: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +6: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +2: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:46:32,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +3: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:46:32,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step127000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:46:32,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step127000 is ready now! +0: successfully saved checkpoint at iteration 127000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.63 +7: iteration 127010/ 173500 | consumed samples: 32514560 | consumed tokens: 66589818880 | elapsed time per iteration (s): 0.18 | learning rate: 5.062E-05 | global batch size: 256 | lm loss: 3.677763E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1423.502 | TFLOPs: 22.32 | +7: iteration 127020/ 173500 | consumed samples: 32517120 | consumed tokens: 66595061760 | elapsed time per iteration (s): 0.16 | learning rate: 5.061E-05 | global batch size: 256 | lm loss: 3.673283E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.413 | TFLOPs: 24.80 | +7: iteration 127030/ 173500 | consumed samples: 32519680 | consumed tokens: 66600304640 | elapsed time per iteration (s): 0.17 | learning rate: 5.060E-05 | global batch size: 256 | lm loss: 3.662241E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.334 | TFLOPs: 23.94 | +7: iteration 127040/ 173500 | consumed samples: 32522240 | consumed tokens: 66605547520 | elapsed time per iteration (s): 0.15 | learning rate: 5.059E-05 | global batch size: 256 | lm loss: 3.685516E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.374 | TFLOPs: 26.12 | +7: iteration 127050/ 173500 | consumed samples: 32524800 | consumed tokens: 66610790400 | elapsed time per iteration (s): 0.17 | learning rate: 5.057E-05 | global batch size: 256 | lm loss: 3.669194E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.570 | TFLOPs: 23.94 | +7: iteration 127060/ 173500 | consumed samples: 32527360 | consumed tokens: 66616033280 | elapsed time per iteration (s): 0.16 | learning rate: 5.056E-05 | global batch size: 256 | lm loss: 3.668505E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.523 | TFLOPs: 25.68 | +7: iteration 127070/ 173500 | consumed samples: 32529920 | consumed tokens: 66621276160 | elapsed time per iteration (s): 0.16 | learning rate: 5.055E-05 | global batch size: 256 | lm loss: 3.678218E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.592 | TFLOPs: 24.43 | +7: iteration 127080/ 173500 | consumed samples: 32532480 | consumed tokens: 66626519040 | elapsed time per iteration (s): 0.16 | learning rate: 5.054E-05 | global batch size: 256 | lm loss: 3.666521E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.174 | TFLOPs: 24.98 | +7: iteration 127090/ 173500 | consumed samples: 32535040 | consumed tokens: 66631761920 | elapsed time per iteration (s): 0.17 | learning rate: 5.052E-05 | global batch size: 256 | lm loss: 3.684155E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.258 | TFLOPs: 24.08 | +7: iteration 127100/ 173500 | consumed samples: 32537600 | consumed tokens: 66637004800 | elapsed time per iteration (s): 0.16 | learning rate: 5.051E-05 | global batch size: 256 | lm loss: 3.660840E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.234 | TFLOPs: 25.35 | +7: iteration 127110/ 173500 | consumed samples: 32540160 | consumed tokens: 66642247680 | elapsed time per iteration (s): 0.16 | learning rate: 5.050E-05 | global batch size: 256 | lm loss: 3.666525E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.981 | TFLOPs: 25.19 | +7: iteration 127120/ 173500 | consumed samples: 32542720 | consumed tokens: 66647490560 | elapsed time per iteration (s): 0.16 | learning rate: 5.049E-05 | global batch size: 256 | lm loss: 3.666578E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.775 | TFLOPs: 24.46 | +7: iteration 127130/ 173500 | consumed samples: 32545280 | consumed tokens: 66652733440 | elapsed time per iteration (s): 0.16 | learning rate: 5.047E-05 | global batch size: 256 | lm loss: 3.684090E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.818 | TFLOPs: 25.51 | +7: iteration 127140/ 173500 | consumed samples: 32547840 | consumed tokens: 66657976320 | elapsed time per iteration (s): 0.17 | learning rate: 5.046E-05 | global batch size: 256 | lm loss: 3.669357E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.102 | TFLOPs: 24.31 | +7: iteration 127150/ 173500 | consumed samples: 32550400 | consumed tokens: 66663219200 | elapsed time per iteration (s): 0.16 | learning rate: 5.045E-05 | global batch size: 256 | lm loss: 3.681448E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.383 | TFLOPs: 25.02 | +7: iteration 127160/ 173500 | consumed samples: 32552960 | consumed tokens: 66668462080 | elapsed time per iteration (s): 0.16 | learning rate: 5.044E-05 | global batch size: 256 | lm loss: 3.680157E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.227 | TFLOPs: 24.69 | +7: iteration 127170/ 173500 | consumed samples: 32555520 | consumed tokens: 66673704960 | elapsed time per iteration (s): 0.17 | learning rate: 5.042E-05 | global batch size: 256 | lm loss: 3.674376E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.843 | TFLOPs: 24.16 | +7: iteration 127180/ 173500 | consumed samples: 32558080 | consumed tokens: 66678947840 | elapsed time per iteration (s): 0.17 | learning rate: 5.041E-05 | global batch size: 256 | lm loss: 3.677321E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1471.654 | TFLOPs: 23.08 | +7: iteration 127190/ 173500 | consumed samples: 32560640 | consumed tokens: 66684190720 | elapsed time per iteration (s): 0.17 | learning rate: 5.040E-05 | global batch size: 256 | lm loss: 3.670525E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.995 | TFLOPs: 23.85 | +7: iteration 127200/ 173500 | consumed samples: 32563200 | consumed tokens: 66689433600 | elapsed time per iteration (s): 0.16 | learning rate: 5.039E-05 | global batch size: 256 | lm loss: 3.666270E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.015 | TFLOPs: 25.63 | +7: iteration 127210/ 173500 | consumed samples: 32565760 | consumed tokens: 66694676480 | elapsed time per iteration (s): 0.17 | learning rate: 5.038E-05 | global batch size: 256 | lm loss: 3.673413E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.667 | TFLOPs: 24.02 | +7: iteration 127220/ 173500 | consumed samples: 32568320 | consumed tokens: 66699919360 | elapsed time per iteration (s): 0.16 | learning rate: 5.036E-05 | global batch size: 256 | lm loss: 3.668793E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.909 | TFLOPs: 25.06 | +7: iteration 127230/ 173500 | consumed samples: 32570880 | consumed tokens: 66705162240 | elapsed time per iteration (s): 0.16 | learning rate: 5.035E-05 | global batch size: 256 | lm loss: 3.680494E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.883 | TFLOPs: 25.33 | +7: iteration 127240/ 173500 | consumed samples: 32573440 | consumed tokens: 66710405120 | elapsed time per iteration (s): 0.16 | learning rate: 5.034E-05 | global batch size: 256 | lm loss: 3.682182E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.812 | TFLOPs: 25.37 | +7: iteration 127250/ 173500 | consumed samples: 32576000 | consumed tokens: 66715648000 | elapsed time per iteration (s): 0.16 | learning rate: 5.033E-05 | global batch size: 256 | lm loss: 3.684399E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.270 | TFLOPs: 24.80 | +7: iteration 127260/ 173500 | consumed samples: 32578560 | consumed tokens: 66720890880 | elapsed time per iteration (s): 0.16 | learning rate: 5.031E-05 | global batch size: 256 | lm loss: 3.669671E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.641 | TFLOPs: 25.48 | +7: iteration 127270/ 173500 | consumed samples: 32581120 | consumed tokens: 66726133760 | elapsed time per iteration (s): 0.16 | learning rate: 5.030E-05 | global batch size: 256 | lm loss: 3.670443E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.110 | TFLOPs: 25.31 | +7: iteration 127280/ 173500 | consumed samples: 32583680 | consumed tokens: 66731376640 | elapsed time per iteration (s): 0.15 | learning rate: 5.029E-05 | global batch size: 256 | lm loss: 3.671788E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.991 | TFLOPs: 26.02 | +7: iteration 127290/ 173500 | consumed samples: 32586240 | consumed tokens: 66736619520 | elapsed time per iteration (s): 0.16 | learning rate: 5.028E-05 | global batch size: 256 | lm loss: 3.679220E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.357 | TFLOPs: 25.38 | +7: iteration 127300/ 173500 | consumed samples: 32588800 | consumed tokens: 66741862400 | elapsed time per iteration (s): 0.16 | learning rate: 5.026E-05 | global batch size: 256 | lm loss: 3.684687E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.948 | TFLOPs: 25.31 | +7: iteration 127310/ 173500 | consumed samples: 32591360 | consumed tokens: 66747105280 | elapsed time per iteration (s): 0.16 | learning rate: 5.025E-05 | global batch size: 256 | lm loss: 3.664313E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.550 | TFLOPs: 24.80 | +7: iteration 127320/ 173500 | consumed samples: 32593920 | consumed tokens: 66752348160 | elapsed time per iteration (s): 0.16 | learning rate: 5.024E-05 | global batch size: 256 | lm loss: 3.683683E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.757 | TFLOPs: 24.96 | +7: iteration 127330/ 173500 | consumed samples: 32596480 | consumed tokens: 66757591040 | elapsed time per iteration (s): 0.15 | learning rate: 5.023E-05 | global batch size: 256 | lm loss: 3.677199E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.573 | TFLOPs: 25.96 | +7: iteration 127340/ 173500 | consumed samples: 32599040 | consumed tokens: 66762833920 | elapsed time per iteration (s): 0.16 | learning rate: 5.022E-05 | global batch size: 256 | lm loss: 3.676012E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.291 | TFLOPs: 25.43 | +7: iteration 127350/ 173500 | consumed samples: 32601600 | consumed tokens: 66768076800 | elapsed time per iteration (s): 0.16 | learning rate: 5.020E-05 | global batch size: 256 | lm loss: 3.671507E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.374 | TFLOPs: 25.71 | +7: iteration 127360/ 173500 | consumed samples: 32604160 | consumed tokens: 66773319680 | elapsed time per iteration (s): 0.16 | learning rate: 5.019E-05 | global batch size: 256 | lm loss: 3.676500E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.775 | TFLOPs: 25.15 | +7: iteration 127370/ 173500 | consumed samples: 32606720 | consumed tokens: 66778562560 | elapsed time per iteration (s): 0.16 | learning rate: 5.018E-05 | global batch size: 256 | lm loss: 3.662674E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.390 | TFLOPs: 24.99 | +7: iteration 127380/ 173500 | consumed samples: 32609280 | consumed tokens: 66783805440 | elapsed time per iteration (s): 0.16 | learning rate: 5.017E-05 | global batch size: 256 | lm loss: 3.673929E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.387 | TFLOPs: 25.04 | +7: iteration 127390/ 173500 | consumed samples: 32611840 | consumed tokens: 66789048320 | elapsed time per iteration (s): 0.18 | learning rate: 5.015E-05 | global batch size: 256 | lm loss: 3.669025E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.756 | TFLOPs: 22.41 | +7: iteration 127400/ 173500 | consumed samples: 32614400 | consumed tokens: 66794291200 | elapsed time per iteration (s): 0.16 | learning rate: 5.014E-05 | global batch size: 256 | lm loss: 3.673890E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.316 | TFLOPs: 24.36 | +7: iteration 127410/ 173500 | consumed samples: 32616960 | consumed tokens: 66799534080 | elapsed time per iteration (s): 0.16 | learning rate: 5.013E-05 | global batch size: 256 | lm loss: 3.664904E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.560 | TFLOPs: 24.88 | +7: iteration 127420/ 173500 | consumed samples: 32619520 | consumed tokens: 66804776960 | elapsed time per iteration (s): 0.16 | learning rate: 5.012E-05 | global batch size: 256 | lm loss: 3.674202E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.845 | TFLOPs: 24.92 | +7: iteration 127430/ 173500 | consumed samples: 32622080 | consumed tokens: 66810019840 | elapsed time per iteration (s): 0.16 | learning rate: 5.010E-05 | global batch size: 256 | lm loss: 3.683628E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.488 | TFLOPs: 25.16 | +7: iteration 127440/ 173500 | consumed samples: 32624640 | consumed tokens: 66815262720 | elapsed time per iteration (s): 0.17 | learning rate: 5.009E-05 | global batch size: 256 | lm loss: 3.672984E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.817 | TFLOPs: 24.04 | +7: iteration 127450/ 173500 | consumed samples: 32627200 | consumed tokens: 66820505600 | elapsed time per iteration (s): 0.16 | learning rate: 5.008E-05 | global batch size: 256 | lm loss: 3.675596E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.796 | TFLOPs: 25.18 | +7: iteration 127460/ 173500 | consumed samples: 32629760 | consumed tokens: 66825748480 | elapsed time per iteration (s): 0.16 | learning rate: 5.007E-05 | global batch size: 256 | lm loss: 3.670680E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.930 | TFLOPs: 25.08 | +7: iteration 127470/ 173500 | consumed samples: 32632320 | consumed tokens: 66830991360 | elapsed time per iteration (s): 0.16 | learning rate: 5.006E-05 | global batch size: 256 | lm loss: 3.668840E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.866 | TFLOPs: 25.15 | +7: iteration 127480/ 173500 | consumed samples: 32634880 | consumed tokens: 66836234240 | elapsed time per iteration (s): 0.17 | learning rate: 5.004E-05 | global batch size: 256 | lm loss: 3.674485E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.509 | TFLOPs: 23.99 | +7: iteration 127490/ 173500 | consumed samples: 32637440 | consumed tokens: 66841477120 | elapsed time per iteration (s): 0.16 | learning rate: 5.003E-05 | global batch size: 256 | lm loss: 3.673687E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.905 | TFLOPs: 24.82 | +7: iteration 127500/ 173500 | consumed samples: 32640000 | consumed tokens: 66846720000 | elapsed time per iteration (s): 0.16 | learning rate: 5.002E-05 | global batch size: 256 | lm loss: 3.677925E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.025 | TFLOPs: 24.67 | +7: iteration 127510/ 173500 | consumed samples: 32642560 | consumed tokens: 66851962880 | elapsed time per iteration (s): 0.16 | learning rate: 5.001E-05 | global batch size: 256 | lm loss: 3.669540E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.215 | TFLOPs: 24.41 | +7: iteration 127520/ 173500 | consumed samples: 32645120 | consumed tokens: 66857205760 | elapsed time per iteration (s): 0.17 | learning rate: 4.999E-05 | global batch size: 256 | lm loss: 3.691753E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.481 | TFLOPs: 23.17 | +7: iteration 127530/ 173500 | consumed samples: 32647680 | consumed tokens: 66862448640 | elapsed time per iteration (s): 0.16 | learning rate: 4.998E-05 | global batch size: 256 | lm loss: 3.687363E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.473 | TFLOPs: 25.07 | +7: iteration 127540/ 173500 | consumed samples: 32650240 | consumed tokens: 66867691520 | elapsed time per iteration (s): 0.16 | learning rate: 4.997E-05 | global batch size: 256 | lm loss: 3.678983E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.684 | TFLOPs: 25.45 | +7: iteration 127550/ 173500 | consumed samples: 32652800 | consumed tokens: 66872934400 | elapsed time per iteration (s): 0.16 | learning rate: 4.996E-05 | global batch size: 256 | lm loss: 3.668261E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.182 | TFLOPs: 25.14 | +7: iteration 127560/ 173500 | consumed samples: 32655360 | consumed tokens: 66878177280 | elapsed time per iteration (s): 0.16 | learning rate: 4.995E-05 | global batch size: 256 | lm loss: 3.675460E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.383 | TFLOPs: 25.16 | +7: iteration 127570/ 173500 | consumed samples: 32657920 | consumed tokens: 66883420160 | elapsed time per iteration (s): 0.16 | learning rate: 4.993E-05 | global batch size: 256 | lm loss: 3.685312E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.727 | TFLOPs: 25.24 | +7: iteration 127580/ 173500 | consumed samples: 32660480 | consumed tokens: 66888663040 | elapsed time per iteration (s): 0.16 | learning rate: 4.992E-05 | global batch size: 256 | lm loss: 3.666813E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.455 | TFLOPs: 25.73 | +7: iteration 127590/ 173500 | consumed samples: 32663040 | consumed tokens: 66893905920 | elapsed time per iteration (s): 0.16 | learning rate: 4.991E-05 | global batch size: 256 | lm loss: 3.673156E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.629 | TFLOPs: 25.60 | +7: iteration 127600/ 173500 | consumed samples: 32665600 | consumed tokens: 66899148800 | elapsed time per iteration (s): 0.16 | learning rate: 4.990E-05 | global batch size: 256 | lm loss: 3.683070E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.451 | TFLOPs: 24.90 | +7: iteration 127610/ 173500 | consumed samples: 32668160 | consumed tokens: 66904391680 | elapsed time per iteration (s): 0.16 | learning rate: 4.988E-05 | global batch size: 256 | lm loss: 3.685970E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.960 | TFLOPs: 25.84 | +7: iteration 127620/ 173500 | consumed samples: 32670720 | consumed tokens: 66909634560 | elapsed time per iteration (s): 0.16 | learning rate: 4.987E-05 | global batch size: 256 | lm loss: 3.671663E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.835 | TFLOPs: 25.06 | +7: iteration 127630/ 173500 | consumed samples: 32673280 | consumed tokens: 66914877440 | elapsed time per iteration (s): 0.16 | learning rate: 4.986E-05 | global batch size: 256 | lm loss: 3.676574E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.108 | TFLOPs: 25.67 | +7: iteration 127640/ 173500 | consumed samples: 32675840 | consumed tokens: 66920120320 | elapsed time per iteration (s): 0.17 | learning rate: 4.985E-05 | global batch size: 256 | lm loss: 3.679908E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.448 | TFLOPs: 24.10 | +7: iteration 127650/ 173500 | consumed samples: 32678400 | consumed tokens: 66925363200 | elapsed time per iteration (s): 0.16 | learning rate: 4.984E-05 | global batch size: 256 | lm loss: 3.664135E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.445 | TFLOPs: 25.33 | +7: iteration 127660/ 173500 | consumed samples: 32680960 | consumed tokens: 66930606080 | elapsed time per iteration (s): 0.16 | learning rate: 4.982E-05 | global batch size: 256 | lm loss: 3.675892E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.122 | TFLOPs: 25.80 | +7: iteration 127670/ 173500 | consumed samples: 32683520 | consumed tokens: 66935848960 | elapsed time per iteration (s): 0.15 | learning rate: 4.981E-05 | global batch size: 256 | lm loss: 3.682754E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.702 | TFLOPs: 26.23 | +7: iteration 127680/ 173500 | consumed samples: 32686080 | consumed tokens: 66941091840 | elapsed time per iteration (s): 0.15 | learning rate: 4.980E-05 | global batch size: 256 | lm loss: 3.663950E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.177 | TFLOPs: 25.93 | +7: iteration 127690/ 173500 | consumed samples: 32688640 | consumed tokens: 66946334720 | elapsed time per iteration (s): 0.17 | learning rate: 4.979E-05 | global batch size: 256 | lm loss: 3.674516E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.762 | TFLOPs: 24.21 | +7: iteration 127700/ 173500 | consumed samples: 32691200 | consumed tokens: 66951577600 | elapsed time per iteration (s): 0.16 | learning rate: 4.977E-05 | global batch size: 256 | lm loss: 3.670143E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.584 | TFLOPs: 25.40 | +7: iteration 127710/ 173500 | consumed samples: 32693760 | consumed tokens: 66956820480 | elapsed time per iteration (s): 0.16 | learning rate: 4.976E-05 | global batch size: 256 | lm loss: 3.673158E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.837 | TFLOPs: 25.36 | +7: iteration 127720/ 173500 | consumed samples: 32696320 | consumed tokens: 66962063360 | elapsed time per iteration (s): 0.16 | learning rate: 4.975E-05 | global batch size: 256 | lm loss: 3.681087E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.760 | TFLOPs: 25.23 | +7: iteration 127730/ 173500 | consumed samples: 32698880 | consumed tokens: 66967306240 | elapsed time per iteration (s): 0.16 | learning rate: 4.974E-05 | global batch size: 256 | lm loss: 3.665198E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.766 | TFLOPs: 25.35 | +7: iteration 127740/ 173500 | consumed samples: 32701440 | consumed tokens: 66972549120 | elapsed time per iteration (s): 0.16 | learning rate: 4.972E-05 | global batch size: 256 | lm loss: 3.673874E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.461 | TFLOPs: 25.04 | +7: iteration 127750/ 173500 | consumed samples: 32704000 | consumed tokens: 66977792000 | elapsed time per iteration (s): 0.16 | learning rate: 4.971E-05 | global batch size: 256 | lm loss: 3.682703E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.231 | TFLOPs: 25.66 | +7: iteration 127760/ 173500 | consumed samples: 32706560 | consumed tokens: 66983034880 | elapsed time per iteration (s): 0.16 | learning rate: 4.970E-05 | global batch size: 256 | lm loss: 3.674693E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.090 | TFLOPs: 25.45 | +7: iteration 127770/ 173500 | consumed samples: 32709120 | consumed tokens: 66988277760 | elapsed time per iteration (s): 0.15 | learning rate: 4.969E-05 | global batch size: 256 | lm loss: 3.681981E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.713 | TFLOPs: 26.03 | +7: iteration 127780/ 173500 | consumed samples: 32711680 | consumed tokens: 66993520640 | elapsed time per iteration (s): 0.15 | learning rate: 4.968E-05 | global batch size: 256 | lm loss: 3.686186E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.338 | TFLOPs: 26.16 | +7: iteration 127790/ 173500 | consumed samples: 32714240 | consumed tokens: 66998763520 | elapsed time per iteration (s): 0.16 | learning rate: 4.966E-05 | global batch size: 256 | lm loss: 3.670285E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.429 | TFLOPs: 25.32 | +7: iteration 127800/ 173500 | consumed samples: 32716800 | consumed tokens: 67004006400 | elapsed time per iteration (s): 0.16 | learning rate: 4.965E-05 | global batch size: 256 | lm loss: 3.670065E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.991 | TFLOPs: 25.80 | +7: iteration 127810/ 173500 | consumed samples: 32719360 | consumed tokens: 67009249280 | elapsed time per iteration (s): 0.16 | learning rate: 4.964E-05 | global batch size: 256 | lm loss: 3.673214E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.112 | TFLOPs: 25.80 | +7: iteration 127820/ 173500 | consumed samples: 32721920 | consumed tokens: 67014492160 | elapsed time per iteration (s): 0.16 | learning rate: 4.963E-05 | global batch size: 256 | lm loss: 3.687033E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.728 | TFLOPs: 25.45 | +7: iteration 127830/ 173500 | consumed samples: 32724480 | consumed tokens: 67019735040 | elapsed time per iteration (s): 0.17 | learning rate: 4.962E-05 | global batch size: 256 | lm loss: 3.668260E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.861 | TFLOPs: 23.62 | +7: iteration 127840/ 173500 | consumed samples: 32727040 | consumed tokens: 67024977920 | elapsed time per iteration (s): 0.16 | learning rate: 4.960E-05 | global batch size: 256 | lm loss: 3.675555E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.059 | TFLOPs: 25.70 | +7: iteration 127850/ 173500 | consumed samples: 32729600 | consumed tokens: 67030220800 | elapsed time per iteration (s): 0.16 | learning rate: 4.959E-05 | global batch size: 256 | lm loss: 3.686029E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.071 | TFLOPs: 25.33 | +7: iteration 127860/ 173500 | consumed samples: 32732160 | consumed tokens: 67035463680 | elapsed time per iteration (s): 0.16 | learning rate: 4.958E-05 | global batch size: 256 | lm loss: 3.676145E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.872 | TFLOPs: 25.65 | +7: iteration 127870/ 173500 | consumed samples: 32734720 | consumed tokens: 67040706560 | elapsed time per iteration (s): 0.17 | learning rate: 4.957E-05 | global batch size: 256 | lm loss: 3.668435E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.158 | TFLOPs: 24.25 | +7: iteration 127880/ 173500 | consumed samples: 32737280 | consumed tokens: 67045949440 | elapsed time per iteration (s): 0.16 | learning rate: 4.955E-05 | global batch size: 256 | lm loss: 3.669535E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.078 | TFLOPs: 24.47 | +7: iteration 127890/ 173500 | consumed samples: 32739840 | consumed tokens: 67051192320 | elapsed time per iteration (s): 0.15 | learning rate: 4.954E-05 | global batch size: 256 | lm loss: 3.666667E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.216 | TFLOPs: 26.24 | +7: iteration 127900/ 173500 | consumed samples: 32742400 | consumed tokens: 67056435200 | elapsed time per iteration (s): 0.17 | learning rate: 4.953E-05 | global batch size: 256 | lm loss: 3.673191E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.124 | TFLOPs: 24.23 | +7: iteration 127910/ 173500 | consumed samples: 32744960 | consumed tokens: 67061678080 | elapsed time per iteration (s): 0.17 | learning rate: 4.952E-05 | global batch size: 256 | lm loss: 3.680759E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1468.232 | TFLOPs: 23.03 | +7: iteration 127920/ 173500 | consumed samples: 32747520 | consumed tokens: 67066920960 | elapsed time per iteration (s): 0.15 | learning rate: 4.951E-05 | global batch size: 256 | lm loss: 3.662580E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.529 | TFLOPs: 26.10 | +7: iteration 127930/ 173500 | consumed samples: 32750080 | consumed tokens: 67072163840 | elapsed time per iteration (s): 0.16 | learning rate: 4.949E-05 | global batch size: 256 | lm loss: 3.682676E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.826 | TFLOPs: 24.82 | +7: iteration 127940/ 173500 | consumed samples: 32752640 | consumed tokens: 67077406720 | elapsed time per iteration (s): 0.16 | learning rate: 4.948E-05 | global batch size: 256 | lm loss: 3.669075E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.973 | TFLOPs: 24.81 | +7: iteration 127950/ 173500 | consumed samples: 32755200 | consumed tokens: 67082649600 | elapsed time per iteration (s): 0.17 | learning rate: 4.947E-05 | global batch size: 256 | lm loss: 3.675225E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.542 | TFLOPs: 23.85 | +7: iteration 127960/ 173500 | consumed samples: 32757760 | consumed tokens: 67087892480 | elapsed time per iteration (s): 0.18 | learning rate: 4.946E-05 | global batch size: 256 | lm loss: 3.665570E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1446.046 | TFLOPs: 22.68 | +7: iteration 127970/ 173500 | consumed samples: 32760320 | consumed tokens: 67093135360 | elapsed time per iteration (s): 0.16 | learning rate: 4.944E-05 | global batch size: 256 | lm loss: 3.674202E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.907 | TFLOPs: 25.09 | +7: iteration 127980/ 173500 | consumed samples: 32762880 | consumed tokens: 67098378240 | elapsed time per iteration (s): 0.15 | learning rate: 4.943E-05 | global batch size: 256 | lm loss: 3.664434E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.453 | TFLOPs: 26.13 | +7: iteration 127990/ 173500 | consumed samples: 32765440 | consumed tokens: 67103621120 | elapsed time per iteration (s): 0.16 | learning rate: 4.942E-05 | global batch size: 256 | lm loss: 3.662347E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.023 | TFLOPs: 25.01 | +0: [2023-03-17 05:49:12,789] [INFO] [logging.py:68:log_dist] [Rank 0] step=128000, skipped=0, lr=[4.94077976375529e-05, 4.94077976375529e-05, 4.94077976375529e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 128000/ 173500 | consumed samples: 32768000 | consumed tokens: 67108864000 | elapsed time per iteration (s): 0.16 | learning rate: 4.941E-05 | global batch size: 256 | lm loss: 3.682552E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.192 | TFLOPs: 25.06 | +0: steps: 128000 loss: 3.6782 iter time (s): 0.161 samples/sec: 1589.922 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 128000 | lm loss value: 3.832808E+00 | lm loss PPL: 4.619209E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 128000 to checkpoints_44m91b100m +0: [2023-03-17 05:49:12,875] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step128000 is begin to save! +0: [2023-03-17 05:49:12,878] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:49:12,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:49:12,943] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:49:12,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:49:12,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:49:12,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:49:12,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:49:12,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:49:12,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:49:12,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:49:12,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:49:12,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:49:12,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:49:12,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:49:12,994] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:49:13,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:49:13,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:49:13,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:49:13,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:49:13,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:49:13,012] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step128000/mp_rank_00_model_states.pt +0: [2023-03-17 05:49:13,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:49:13,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:49:13,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:49:13,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +1: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +5: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +7: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +3: [2023-03-17 05:49:13,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:49:13,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +6: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:49:13,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +2: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:49:13,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +4: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:49:13,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step128000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:49:13,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step128000 is ready now! +0: successfully saved checkpoint at iteration 128000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.33 +7: iteration 128010/ 173500 | consumed samples: 32770560 | consumed tokens: 67114106880 | elapsed time per iteration (s): 0.18 | learning rate: 4.940E-05 | global batch size: 256 | lm loss: 3.681881E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.813 | TFLOPs: 21.95 | +7: iteration 128020/ 173500 | consumed samples: 32773120 | consumed tokens: 67119349760 | elapsed time per iteration (s): 0.16 | learning rate: 4.938E-05 | global batch size: 256 | lm loss: 3.676932E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.558 | TFLOPs: 25.63 | +7: iteration 128030/ 173500 | consumed samples: 32775680 | consumed tokens: 67124592640 | elapsed time per iteration (s): 0.16 | learning rate: 4.937E-05 | global batch size: 256 | lm loss: 3.685213E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.703 | TFLOPs: 25.13 | +7: iteration 128040/ 173500 | consumed samples: 32778240 | consumed tokens: 67129835520 | elapsed time per iteration (s): 0.17 | learning rate: 4.936E-05 | global batch size: 256 | lm loss: 3.675806E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.967 | TFLOPs: 24.15 | +7: iteration 128050/ 173500 | consumed samples: 32780800 | consumed tokens: 67135078400 | elapsed time per iteration (s): 0.16 | learning rate: 4.935E-05 | global batch size: 256 | lm loss: 3.674685E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.171 | TFLOPs: 24.77 | +7: iteration 128060/ 173500 | consumed samples: 32783360 | consumed tokens: 67140321280 | elapsed time per iteration (s): 0.16 | learning rate: 4.933E-05 | global batch size: 256 | lm loss: 3.677492E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.170 | TFLOPs: 25.58 | +7: iteration 128070/ 173500 | consumed samples: 32785920 | consumed tokens: 67145564160 | elapsed time per iteration (s): 0.16 | learning rate: 4.932E-05 | global batch size: 256 | lm loss: 3.663939E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.893 | TFLOPs: 24.71 | +7: iteration 128080/ 173500 | consumed samples: 32788480 | consumed tokens: 67150807040 | elapsed time per iteration (s): 0.16 | learning rate: 4.931E-05 | global batch size: 256 | lm loss: 3.667069E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.482 | TFLOPs: 24.63 | +7: iteration 128090/ 173500 | consumed samples: 32791040 | consumed tokens: 67156049920 | elapsed time per iteration (s): 0.16 | learning rate: 4.930E-05 | global batch size: 256 | lm loss: 3.675254E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.232 | TFLOPs: 25.49 | +7: iteration 128100/ 173500 | consumed samples: 32793600 | consumed tokens: 67161292800 | elapsed time per iteration (s): 0.15 | learning rate: 4.929E-05 | global batch size: 256 | lm loss: 3.667017E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.802 | TFLOPs: 25.94 | +7: iteration 128110/ 173500 | consumed samples: 32796160 | consumed tokens: 67166535680 | elapsed time per iteration (s): 0.16 | learning rate: 4.927E-05 | global batch size: 256 | lm loss: 3.676516E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.488 | TFLOPs: 24.94 | +7: iteration 128120/ 173500 | consumed samples: 32798720 | consumed tokens: 67171778560 | elapsed time per iteration (s): 0.16 | learning rate: 4.926E-05 | global batch size: 256 | lm loss: 3.673223E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.975 | TFLOPs: 25.86 | +7: iteration 128130/ 173500 | consumed samples: 32801280 | consumed tokens: 67177021440 | elapsed time per iteration (s): 0.16 | learning rate: 4.925E-05 | global batch size: 256 | lm loss: 3.675328E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.720 | TFLOPs: 25.02 | +7: iteration 128140/ 173500 | consumed samples: 32803840 | consumed tokens: 67182264320 | elapsed time per iteration (s): 0.16 | learning rate: 4.924E-05 | global batch size: 256 | lm loss: 3.678149E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.678 | TFLOPs: 25.43 | +7: iteration 128150/ 173500 | consumed samples: 32806400 | consumed tokens: 67187507200 | elapsed time per iteration (s): 0.15 | learning rate: 4.923E-05 | global batch size: 256 | lm loss: 3.674670E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.884 | TFLOPs: 26.28 | +7: iteration 128160/ 173500 | consumed samples: 32808960 | consumed tokens: 67192750080 | elapsed time per iteration (s): 0.16 | learning rate: 4.921E-05 | global batch size: 256 | lm loss: 3.683204E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.235 | TFLOPs: 25.25 | +7: iteration 128170/ 173500 | consumed samples: 32811520 | consumed tokens: 67197992960 | elapsed time per iteration (s): 0.16 | learning rate: 4.920E-05 | global batch size: 256 | lm loss: 3.668447E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.365 | TFLOPs: 24.97 | +7: iteration 128180/ 173500 | consumed samples: 32814080 | consumed tokens: 67203235840 | elapsed time per iteration (s): 0.16 | learning rate: 4.919E-05 | global batch size: 256 | lm loss: 3.668170E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.383 | TFLOPs: 25.68 | +7: iteration 128190/ 173500 | consumed samples: 32816640 | consumed tokens: 67208478720 | elapsed time per iteration (s): 0.16 | learning rate: 4.918E-05 | global batch size: 256 | lm loss: 3.662390E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.660 | TFLOPs: 25.51 | +7: iteration 128200/ 173500 | consumed samples: 32819200 | consumed tokens: 67213721600 | elapsed time per iteration (s): 0.16 | learning rate: 4.916E-05 | global batch size: 256 | lm loss: 3.681799E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.547 | TFLOPs: 25.35 | +7: iteration 128210/ 173500 | consumed samples: 32821760 | consumed tokens: 67218964480 | elapsed time per iteration (s): 0.15 | learning rate: 4.915E-05 | global batch size: 256 | lm loss: 3.670573E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.174 | TFLOPs: 25.93 | +7: iteration 128220/ 173500 | consumed samples: 32824320 | consumed tokens: 67224207360 | elapsed time per iteration (s): 0.17 | learning rate: 4.914E-05 | global batch size: 256 | lm loss: 3.672873E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.930 | TFLOPs: 23.02 | +7: iteration 128230/ 173500 | consumed samples: 32826880 | consumed tokens: 67229450240 | elapsed time per iteration (s): 0.16 | learning rate: 4.913E-05 | global batch size: 256 | lm loss: 3.668172E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.063 | TFLOPs: 25.74 | +7: iteration 128240/ 173500 | consumed samples: 32829440 | consumed tokens: 67234693120 | elapsed time per iteration (s): 0.15 | learning rate: 4.912E-05 | global batch size: 256 | lm loss: 3.669342E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.970 | TFLOPs: 26.25 | +7: iteration 128250/ 173500 | consumed samples: 32832000 | consumed tokens: 67239936000 | elapsed time per iteration (s): 0.16 | learning rate: 4.910E-05 | global batch size: 256 | lm loss: 3.672059E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.066 | TFLOPs: 25.06 | +7: iteration 128260/ 173500 | consumed samples: 32834560 | consumed tokens: 67245178880 | elapsed time per iteration (s): 0.17 | learning rate: 4.909E-05 | global batch size: 256 | lm loss: 3.672601E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1526.359 | TFLOPs: 23.94 | +7: iteration 128270/ 173500 | consumed samples: 32837120 | consumed tokens: 67250421760 | elapsed time per iteration (s): 0.15 | learning rate: 4.908E-05 | global batch size: 256 | lm loss: 3.675847E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.093 | TFLOPs: 26.21 | +7: iteration 128280/ 173500 | consumed samples: 32839680 | consumed tokens: 67255664640 | elapsed time per iteration (s): 0.15 | learning rate: 4.907E-05 | global batch size: 256 | lm loss: 3.676830E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.740 | TFLOPs: 25.97 | +7: iteration 128290/ 173500 | consumed samples: 32842240 | consumed tokens: 67260907520 | elapsed time per iteration (s): 0.16 | learning rate: 4.906E-05 | global batch size: 256 | lm loss: 3.661990E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.102 | TFLOPs: 25.52 | +7: iteration 128300/ 173500 | consumed samples: 32844800 | consumed tokens: 67266150400 | elapsed time per iteration (s): 0.16 | learning rate: 4.904E-05 | global batch size: 256 | lm loss: 3.666344E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.567 | TFLOPs: 25.12 | +7: iteration 128310/ 173500 | consumed samples: 32847360 | consumed tokens: 67271393280 | elapsed time per iteration (s): 0.16 | learning rate: 4.903E-05 | global batch size: 256 | lm loss: 3.687024E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.678 | TFLOPs: 25.46 | +7: iteration 128320/ 173500 | consumed samples: 32849920 | consumed tokens: 67276636160 | elapsed time per iteration (s): 0.16 | learning rate: 4.902E-05 | global batch size: 256 | lm loss: 3.679160E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.069 | TFLOPs: 25.33 | +7: iteration 128330/ 173500 | consumed samples: 32852480 | consumed tokens: 67281879040 | elapsed time per iteration (s): 0.16 | learning rate: 4.901E-05 | global batch size: 256 | lm loss: 3.671926E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.076 | TFLOPs: 25.80 | +7: iteration 128340/ 173500 | consumed samples: 32855040 | consumed tokens: 67287121920 | elapsed time per iteration (s): 0.16 | learning rate: 4.900E-05 | global batch size: 256 | lm loss: 3.673769E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.312 | TFLOPs: 25.90 | +7: iteration 128350/ 173500 | consumed samples: 32857600 | consumed tokens: 67292364800 | elapsed time per iteration (s): 0.16 | learning rate: 4.898E-05 | global batch size: 256 | lm loss: 3.685318E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.005 | TFLOPs: 25.23 | +7: iteration 128360/ 173500 | consumed samples: 32860160 | consumed tokens: 67297607680 | elapsed time per iteration (s): 0.16 | learning rate: 4.897E-05 | global batch size: 256 | lm loss: 3.663522E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.258 | TFLOPs: 24.48 | +7: iteration 128370/ 173500 | consumed samples: 32862720 | consumed tokens: 67302850560 | elapsed time per iteration (s): 0.16 | learning rate: 4.896E-05 | global batch size: 256 | lm loss: 3.666358E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.167 | TFLOPs: 25.05 | +7: iteration 128380/ 173500 | consumed samples: 32865280 | consumed tokens: 67308093440 | elapsed time per iteration (s): 0.16 | learning rate: 4.895E-05 | global batch size: 256 | lm loss: 3.673444E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.316 | TFLOPs: 24.50 | +7: iteration 128390/ 173500 | consumed samples: 32867840 | consumed tokens: 67313336320 | elapsed time per iteration (s): 0.16 | learning rate: 4.893E-05 | global batch size: 256 | lm loss: 3.676670E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.290 | TFLOPs: 24.86 | +7: iteration 128400/ 173500 | consumed samples: 32870400 | consumed tokens: 67318579200 | elapsed time per iteration (s): 0.17 | learning rate: 4.892E-05 | global batch size: 256 | lm loss: 3.687923E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.528 | TFLOPs: 23.44 | +7: iteration 128410/ 173500 | consumed samples: 32872960 | consumed tokens: 67323822080 | elapsed time per iteration (s): 0.16 | learning rate: 4.891E-05 | global batch size: 256 | lm loss: 3.671902E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.983 | TFLOPs: 24.61 | +7: iteration 128420/ 173500 | consumed samples: 32875520 | consumed tokens: 67329064960 | elapsed time per iteration (s): 0.17 | learning rate: 4.890E-05 | global batch size: 256 | lm loss: 3.672028E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.236 | TFLOPs: 24.06 | +7: iteration 128430/ 173500 | consumed samples: 32878080 | consumed tokens: 67334307840 | elapsed time per iteration (s): 0.16 | learning rate: 4.889E-05 | global batch size: 256 | lm loss: 3.664790E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.678 | TFLOPs: 25.84 | +7: iteration 128440/ 173500 | consumed samples: 32880640 | consumed tokens: 67339550720 | elapsed time per iteration (s): 0.16 | learning rate: 4.887E-05 | global batch size: 256 | lm loss: 3.664065E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.172 | TFLOPs: 25.63 | +7: iteration 128450/ 173500 | consumed samples: 32883200 | consumed tokens: 67344793600 | elapsed time per iteration (s): 0.16 | learning rate: 4.886E-05 | global batch size: 256 | lm loss: 3.655979E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.573 | TFLOPs: 25.57 | +7: iteration 128460/ 173500 | consumed samples: 32885760 | consumed tokens: 67350036480 | elapsed time per iteration (s): 0.17 | learning rate: 4.885E-05 | global batch size: 256 | lm loss: 3.667365E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.641 | TFLOPs: 24.07 | +7: iteration 128470/ 173500 | consumed samples: 32888320 | consumed tokens: 67355279360 | elapsed time per iteration (s): 0.16 | learning rate: 4.884E-05 | global batch size: 256 | lm loss: 3.688494E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.164 | TFLOPs: 24.95 | +7: iteration 128480/ 173500 | consumed samples: 32890880 | consumed tokens: 67360522240 | elapsed time per iteration (s): 0.16 | learning rate: 4.883E-05 | global batch size: 256 | lm loss: 3.667947E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.395 | TFLOPs: 25.18 | +7: iteration 128490/ 173500 | consumed samples: 32893440 | consumed tokens: 67365765120 | elapsed time per iteration (s): 0.16 | learning rate: 4.881E-05 | global batch size: 256 | lm loss: 3.676065E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.150 | TFLOPs: 24.87 | +7: iteration 128500/ 173500 | consumed samples: 32896000 | consumed tokens: 67371008000 | elapsed time per iteration (s): 0.16 | learning rate: 4.880E-05 | global batch size: 256 | lm loss: 3.663171E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.371 | TFLOPs: 25.68 | +7: iteration 128510/ 173500 | consumed samples: 32898560 | consumed tokens: 67376250880 | elapsed time per iteration (s): 0.16 | learning rate: 4.879E-05 | global batch size: 256 | lm loss: 3.689763E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.505 | TFLOPs: 25.32 | +7: iteration 128520/ 173500 | consumed samples: 32901120 | consumed tokens: 67381493760 | elapsed time per iteration (s): 0.16 | learning rate: 4.878E-05 | global batch size: 256 | lm loss: 3.664619E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.036 | TFLOPs: 25.12 | +7: iteration 128530/ 173500 | consumed samples: 32903680 | consumed tokens: 67386736640 | elapsed time per iteration (s): 0.16 | learning rate: 4.877E-05 | global batch size: 256 | lm loss: 3.663837E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.057 | TFLOPs: 25.03 | +7: iteration 128540/ 173500 | consumed samples: 32906240 | consumed tokens: 67391979520 | elapsed time per iteration (s): 0.16 | learning rate: 4.875E-05 | global batch size: 256 | lm loss: 3.679195E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.217 | TFLOPs: 25.44 | +7: iteration 128550/ 173500 | consumed samples: 32908800 | consumed tokens: 67397222400 | elapsed time per iteration (s): 0.16 | learning rate: 4.874E-05 | global batch size: 256 | lm loss: 3.669220E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.387 | TFLOPs: 25.32 | +7: iteration 128560/ 173500 | consumed samples: 32911360 | consumed tokens: 67402465280 | elapsed time per iteration (s): 0.16 | learning rate: 4.873E-05 | global batch size: 256 | lm loss: 3.677419E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.670 | TFLOPs: 24.54 | +7: iteration 128570/ 173500 | consumed samples: 32913920 | consumed tokens: 67407708160 | elapsed time per iteration (s): 0.16 | learning rate: 4.872E-05 | global batch size: 256 | lm loss: 3.677591E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.677 | TFLOPs: 24.43 | +7: iteration 128580/ 173500 | consumed samples: 32916480 | consumed tokens: 67412951040 | elapsed time per iteration (s): 0.16 | learning rate: 4.871E-05 | global batch size: 256 | lm loss: 3.676545E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.240 | TFLOPs: 25.71 | +7: iteration 128590/ 173500 | consumed samples: 32919040 | consumed tokens: 67418193920 | elapsed time per iteration (s): 0.16 | learning rate: 4.869E-05 | global batch size: 256 | lm loss: 3.665709E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.929 | TFLOPs: 24.42 | +7: iteration 128600/ 173500 | consumed samples: 32921600 | consumed tokens: 67423436800 | elapsed time per iteration (s): 0.16 | learning rate: 4.868E-05 | global batch size: 256 | lm loss: 3.683319E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.273 | TFLOPs: 24.77 | +7: iteration 128610/ 173500 | consumed samples: 32924160 | consumed tokens: 67428679680 | elapsed time per iteration (s): 0.16 | learning rate: 4.867E-05 | global batch size: 256 | lm loss: 3.677357E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.721 | TFLOPs: 24.82 | +7: iteration 128620/ 173500 | consumed samples: 32926720 | consumed tokens: 67433922560 | elapsed time per iteration (s): 0.17 | learning rate: 4.866E-05 | global batch size: 256 | lm loss: 3.669715E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.765 | TFLOPs: 24.04 | +7: iteration 128630/ 173500 | consumed samples: 32929280 | consumed tokens: 67439165440 | elapsed time per iteration (s): 0.15 | learning rate: 4.865E-05 | global batch size: 256 | lm loss: 3.678350E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.174 | TFLOPs: 26.24 | +7: iteration 128640/ 173500 | consumed samples: 32931840 | consumed tokens: 67444408320 | elapsed time per iteration (s): 0.16 | learning rate: 4.863E-05 | global batch size: 256 | lm loss: 3.662668E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.179 | TFLOPs: 25.36 | +7: iteration 128650/ 173500 | consumed samples: 32934400 | consumed tokens: 67449651200 | elapsed time per iteration (s): 0.16 | learning rate: 4.862E-05 | global batch size: 256 | lm loss: 3.672073E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.250 | TFLOPs: 25.17 | +7: iteration 128660/ 173500 | consumed samples: 32936960 | consumed tokens: 67454894080 | elapsed time per iteration (s): 0.16 | learning rate: 4.861E-05 | global batch size: 256 | lm loss: 3.669213E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.027 | TFLOPs: 24.70 | +7: iteration 128670/ 173500 | consumed samples: 32939520 | consumed tokens: 67460136960 | elapsed time per iteration (s): 0.16 | learning rate: 4.860E-05 | global batch size: 256 | lm loss: 3.656723E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.060 | TFLOPs: 25.86 | +7: iteration 128680/ 173500 | consumed samples: 32942080 | consumed tokens: 67465379840 | elapsed time per iteration (s): 0.15 | learning rate: 4.858E-05 | global batch size: 256 | lm loss: 3.673111E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.756 | TFLOPs: 25.94 | +7: iteration 128690/ 173500 | consumed samples: 32944640 | consumed tokens: 67470622720 | elapsed time per iteration (s): 0.16 | learning rate: 4.857E-05 | global batch size: 256 | lm loss: 3.660827E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.873 | TFLOPs: 25.51 | +7: iteration 128700/ 173500 | consumed samples: 32947200 | consumed tokens: 67475865600 | elapsed time per iteration (s): 0.15 | learning rate: 4.856E-05 | global batch size: 256 | lm loss: 3.673122E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.630 | TFLOPs: 26.31 | +7: iteration 128710/ 173500 | consumed samples: 32949760 | consumed tokens: 67481108480 | elapsed time per iteration (s): 0.16 | learning rate: 4.855E-05 | global batch size: 256 | lm loss: 3.658213E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.537 | TFLOPs: 25.52 | +7: iteration 128720/ 173500 | consumed samples: 32952320 | consumed tokens: 67486351360 | elapsed time per iteration (s): 0.16 | learning rate: 4.854E-05 | global batch size: 256 | lm loss: 3.683451E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.962 | TFLOPs: 25.22 | +7: iteration 128730/ 173500 | consumed samples: 32954880 | consumed tokens: 67491594240 | elapsed time per iteration (s): 0.16 | learning rate: 4.852E-05 | global batch size: 256 | lm loss: 3.680793E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.303 | TFLOPs: 25.18 | +7: iteration 128740/ 173500 | consumed samples: 32957440 | consumed tokens: 67496837120 | elapsed time per iteration (s): 0.16 | learning rate: 4.851E-05 | global batch size: 256 | lm loss: 3.672798E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.357 | TFLOPs: 24.56 | +7: iteration 128750/ 173500 | consumed samples: 32960000 | consumed tokens: 67502080000 | elapsed time per iteration (s): 0.16 | learning rate: 4.850E-05 | global batch size: 256 | lm loss: 3.685469E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.896 | TFLOPs: 25.29 | +7: iteration 128760/ 173500 | consumed samples: 32962560 | consumed tokens: 67507322880 | elapsed time per iteration (s): 0.16 | learning rate: 4.849E-05 | global batch size: 256 | lm loss: 3.674253E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.938 | TFLOPs: 25.36 | +7: iteration 128770/ 173500 | consumed samples: 32965120 | consumed tokens: 67512565760 | elapsed time per iteration (s): 0.16 | learning rate: 4.848E-05 | global batch size: 256 | lm loss: 3.685373E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.785 | TFLOPs: 25.18 | +7: iteration 128780/ 173500 | consumed samples: 32967680 | consumed tokens: 67517808640 | elapsed time per iteration (s): 0.16 | learning rate: 4.846E-05 | global batch size: 256 | lm loss: 3.676156E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.346 | TFLOPs: 25.11 | +7: iteration 128790/ 173500 | consumed samples: 32970240 | consumed tokens: 67523051520 | elapsed time per iteration (s): 0.16 | learning rate: 4.845E-05 | global batch size: 256 | lm loss: 3.671304E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.951 | TFLOPs: 24.84 | +7: iteration 128800/ 173500 | consumed samples: 32972800 | consumed tokens: 67528294400 | elapsed time per iteration (s): 0.16 | learning rate: 4.844E-05 | global batch size: 256 | lm loss: 3.678457E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.540 | TFLOPs: 25.88 | +7: iteration 128810/ 173500 | consumed samples: 32975360 | consumed tokens: 67533537280 | elapsed time per iteration (s): 0.16 | learning rate: 4.843E-05 | global batch size: 256 | lm loss: 3.665591E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.436 | TFLOPs: 25.58 | +7: iteration 128820/ 173500 | consumed samples: 32977920 | consumed tokens: 67538780160 | elapsed time per iteration (s): 0.16 | learning rate: 4.842E-05 | global batch size: 256 | lm loss: 3.677574E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.189 | TFLOPs: 25.28 | +7: iteration 128830/ 173500 | consumed samples: 32980480 | consumed tokens: 67544023040 | elapsed time per iteration (s): 0.16 | learning rate: 4.840E-05 | global batch size: 256 | lm loss: 3.683218E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.188 | TFLOPs: 25.74 | +7: iteration 128840/ 173500 | consumed samples: 32983040 | consumed tokens: 67549265920 | elapsed time per iteration (s): 0.16 | learning rate: 4.839E-05 | global batch size: 256 | lm loss: 3.664628E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.366 | TFLOPs: 24.63 | +7: iteration 128850/ 173500 | consumed samples: 32985600 | consumed tokens: 67554508800 | elapsed time per iteration (s): 0.16 | learning rate: 4.838E-05 | global batch size: 256 | lm loss: 3.673894E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.485 | TFLOPs: 25.74 | +7: iteration 128860/ 173500 | consumed samples: 32988160 | consumed tokens: 67559751680 | elapsed time per iteration (s): 0.16 | learning rate: 4.837E-05 | global batch size: 256 | lm loss: 3.670326E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.445 | TFLOPs: 25.00 | +7: iteration 128870/ 173500 | consumed samples: 32990720 | consumed tokens: 67564994560 | elapsed time per iteration (s): 0.16 | learning rate: 4.836E-05 | global batch size: 256 | lm loss: 3.672454E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.118 | TFLOPs: 24.58 | +7: iteration 128880/ 173500 | consumed samples: 32993280 | consumed tokens: 67570237440 | elapsed time per iteration (s): 0.17 | learning rate: 4.834E-05 | global batch size: 256 | lm loss: 3.663128E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.495 | TFLOPs: 23.97 | +7: iteration 128890/ 173500 | consumed samples: 32995840 | consumed tokens: 67575480320 | elapsed time per iteration (s): 0.16 | learning rate: 4.833E-05 | global batch size: 256 | lm loss: 3.679251E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.444 | TFLOPs: 25.80 | +7: iteration 128900/ 173500 | consumed samples: 32998400 | consumed tokens: 67580723200 | elapsed time per iteration (s): 0.16 | learning rate: 4.832E-05 | global batch size: 256 | lm loss: 3.656049E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.227 | TFLOPs: 24.70 | +7: iteration 128910/ 173500 | consumed samples: 33000960 | consumed tokens: 67585966080 | elapsed time per iteration (s): 0.16 | learning rate: 4.831E-05 | global batch size: 256 | lm loss: 3.677334E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.982 | TFLOPs: 24.42 | +7: iteration 128920/ 173500 | consumed samples: 33003520 | consumed tokens: 67591208960 | elapsed time per iteration (s): 0.16 | learning rate: 4.830E-05 | global batch size: 256 | lm loss: 3.672609E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.952 | TFLOPs: 25.01 | +7: iteration 128930/ 173500 | consumed samples: 33006080 | consumed tokens: 67596451840 | elapsed time per iteration (s): 0.16 | learning rate: 4.828E-05 | global batch size: 256 | lm loss: 3.676968E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.710 | TFLOPs: 25.02 | +7: iteration 128940/ 173500 | consumed samples: 33008640 | consumed tokens: 67601694720 | elapsed time per iteration (s): 0.16 | learning rate: 4.827E-05 | global batch size: 256 | lm loss: 3.670634E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.615 | TFLOPs: 25.57 | +7: iteration 128950/ 173500 | consumed samples: 33011200 | consumed tokens: 67606937600 | elapsed time per iteration (s): 0.16 | learning rate: 4.826E-05 | global batch size: 256 | lm loss: 3.678244E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.636 | TFLOPs: 25.62 | +7: iteration 128960/ 173500 | consumed samples: 33013760 | consumed tokens: 67612180480 | elapsed time per iteration (s): 0.16 | learning rate: 4.825E-05 | global batch size: 256 | lm loss: 3.684411E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.174 | TFLOPs: 24.72 | +7: iteration 128970/ 173500 | consumed samples: 33016320 | consumed tokens: 67617423360 | elapsed time per iteration (s): 0.15 | learning rate: 4.824E-05 | global batch size: 256 | lm loss: 3.674010E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.872 | TFLOPs: 26.22 | +7: iteration 128980/ 173500 | consumed samples: 33018880 | consumed tokens: 67622666240 | elapsed time per iteration (s): 0.16 | learning rate: 4.822E-05 | global batch size: 256 | lm loss: 3.671828E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.201 | TFLOPs: 25.52 | +7: iteration 128990/ 173500 | consumed samples: 33021440 | consumed tokens: 67627909120 | elapsed time per iteration (s): 0.15 | learning rate: 4.821E-05 | global batch size: 256 | lm loss: 3.672310E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.930 | TFLOPs: 26.28 | +7: iteration 129000/ 173500 | consumed samples: 33024000 | consumed tokens: 67633152000 | elapsed time per iteration (s): 0.17 | learning rate: 4.820E-05 | global batch size: 256 | lm loss: 3.672183E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.137 | TFLOPs: 24.07 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 129000 | lm loss value: 3.849362E+00 | lm loss PPL: 4.696308E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 129000 to checkpoints_44m91b100m +0: [2023-03-17 05:51:52,609] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step129000 is begin to save! +0: [2023-03-17 05:51:52,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:51:52,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:51:52,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:51:52,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:51:52,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:51:52,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:51:52,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:51:52,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:51:52,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:51:52,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:51:52,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:51:52,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:51:52,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:51:52,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:51:52,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:51:52,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:51:52,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:51:52,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:51:52,739] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:51:52,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:51:52,740] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step129000/mp_rank_00_model_states.pt +0: [2023-03-17 05:51:52,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:51:52,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:51:52,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:51:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:51:52,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:51:52,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +4: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +6: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +5: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +3: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:51:52,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 05:51:52,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:51:52,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +1: [2023-03-17 05:51:52,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:51:52,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +7: [2023-03-17 05:51:52,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step129000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:51:52,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step129000 is ready now! +0: successfully saved checkpoint at iteration 129000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.49 +7: iteration 129010/ 173500 | consumed samples: 33026560 | consumed tokens: 67638394880 | elapsed time per iteration (s): 0.18 | learning rate: 4.819E-05 | global batch size: 256 | lm loss: 3.649943E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1392.177 | TFLOPs: 21.83 | +7: iteration 129020/ 173500 | consumed samples: 33029120 | consumed tokens: 67643637760 | elapsed time per iteration (s): 0.16 | learning rate: 4.818E-05 | global batch size: 256 | lm loss: 3.679018E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.467 | TFLOPs: 25.37 | +7: iteration 129030/ 173500 | consumed samples: 33031680 | consumed tokens: 67648880640 | elapsed time per iteration (s): 0.16 | learning rate: 4.816E-05 | global batch size: 256 | lm loss: 3.653922E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.084 | TFLOPs: 25.14 | +7: iteration 129040/ 173500 | consumed samples: 33034240 | consumed tokens: 67654123520 | elapsed time per iteration (s): 0.16 | learning rate: 4.815E-05 | global batch size: 256 | lm loss: 3.684167E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.084 | TFLOPs: 25.41 | +7: iteration 129050/ 173500 | consumed samples: 33036800 | consumed tokens: 67659366400 | elapsed time per iteration (s): 0.16 | learning rate: 4.814E-05 | global batch size: 256 | lm loss: 3.661780E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.396 | TFLOPs: 24.83 | +7: iteration 129060/ 173500 | consumed samples: 33039360 | consumed tokens: 67664609280 | elapsed time per iteration (s): 0.16 | learning rate: 4.813E-05 | global batch size: 256 | lm loss: 3.668976E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.829 | TFLOPs: 24.79 | +7: iteration 129070/ 173500 | consumed samples: 33041920 | consumed tokens: 67669852160 | elapsed time per iteration (s): 0.16 | learning rate: 4.812E-05 | global batch size: 256 | lm loss: 3.662982E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.977 | TFLOPs: 25.39 | +7: iteration 129080/ 173500 | consumed samples: 33044480 | consumed tokens: 67675095040 | elapsed time per iteration (s): 0.16 | learning rate: 4.811E-05 | global batch size: 256 | lm loss: 3.678207E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.900 | TFLOPs: 25.22 | +7: iteration 129090/ 173500 | consumed samples: 33047040 | consumed tokens: 67680337920 | elapsed time per iteration (s): 0.16 | learning rate: 4.809E-05 | global batch size: 256 | lm loss: 3.671435E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.980 | TFLOPs: 25.04 | +7: iteration 129100/ 173500 | consumed samples: 33049600 | consumed tokens: 67685580800 | elapsed time per iteration (s): 0.16 | learning rate: 4.808E-05 | global batch size: 256 | lm loss: 3.685485E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.798 | TFLOPs: 25.47 | +7: iteration 129110/ 173500 | consumed samples: 33052160 | consumed tokens: 67690823680 | elapsed time per iteration (s): 0.16 | learning rate: 4.807E-05 | global batch size: 256 | lm loss: 3.673651E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.541 | TFLOPs: 25.13 | +7: iteration 129120/ 173500 | consumed samples: 33054720 | consumed tokens: 67696066560 | elapsed time per iteration (s): 0.16 | learning rate: 4.806E-05 | global batch size: 256 | lm loss: 3.665121E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.620 | TFLOPs: 25.70 | +7: iteration 129130/ 173500 | consumed samples: 33057280 | consumed tokens: 67701309440 | elapsed time per iteration (s): 0.16 | learning rate: 4.805E-05 | global batch size: 256 | lm loss: 3.670549E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.234 | TFLOPs: 25.80 | +7: iteration 129140/ 173500 | consumed samples: 33059840 | consumed tokens: 67706552320 | elapsed time per iteration (s): 0.16 | learning rate: 4.803E-05 | global batch size: 256 | lm loss: 3.667893E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.523 | TFLOPs: 24.76 | +7: iteration 129150/ 173500 | consumed samples: 33062400 | consumed tokens: 67711795200 | elapsed time per iteration (s): 0.15 | learning rate: 4.802E-05 | global batch size: 256 | lm loss: 3.672125E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.667 | TFLOPs: 26.07 | +7: iteration 129160/ 173500 | consumed samples: 33064960 | consumed tokens: 67717038080 | elapsed time per iteration (s): 0.16 | learning rate: 4.801E-05 | global batch size: 256 | lm loss: 3.665356E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.461 | TFLOPs: 25.59 | +7: iteration 129170/ 173500 | consumed samples: 33067520 | consumed tokens: 67722280960 | elapsed time per iteration (s): 0.16 | learning rate: 4.800E-05 | global batch size: 256 | lm loss: 3.658949E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.924 | TFLOPs: 25.15 | +7: iteration 129180/ 173500 | consumed samples: 33070080 | consumed tokens: 67727523840 | elapsed time per iteration (s): 0.16 | learning rate: 4.799E-05 | global batch size: 256 | lm loss: 3.684338E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.891 | TFLOPs: 25.72 | +7: iteration 129190/ 173500 | consumed samples: 33072640 | consumed tokens: 67732766720 | elapsed time per iteration (s): 0.16 | learning rate: 4.797E-05 | global batch size: 256 | lm loss: 3.670991E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.553 | TFLOPs: 25.40 | +7: iteration 129200/ 173500 | consumed samples: 33075200 | consumed tokens: 67738009600 | elapsed time per iteration (s): 0.16 | learning rate: 4.796E-05 | global batch size: 256 | lm loss: 3.672255E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.679 | TFLOPs: 25.82 | +7: iteration 129210/ 173500 | consumed samples: 33077760 | consumed tokens: 67743252480 | elapsed time per iteration (s): 0.16 | learning rate: 4.795E-05 | global batch size: 256 | lm loss: 3.680239E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.249 | TFLOPs: 25.35 | +7: iteration 129220/ 173500 | consumed samples: 33080320 | consumed tokens: 67748495360 | elapsed time per iteration (s): 0.16 | learning rate: 4.794E-05 | global batch size: 256 | lm loss: 3.680479E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.336 | TFLOPs: 25.61 | +7: iteration 129230/ 173500 | consumed samples: 33082880 | consumed tokens: 67753738240 | elapsed time per iteration (s): 0.16 | learning rate: 4.793E-05 | global batch size: 256 | lm loss: 3.675740E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.777 | TFLOPs: 24.76 | +7: iteration 129240/ 173500 | consumed samples: 33085440 | consumed tokens: 67758981120 | elapsed time per iteration (s): 0.16 | learning rate: 4.791E-05 | global batch size: 256 | lm loss: 3.676388E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.093 | TFLOPs: 25.06 | +7: iteration 129250/ 173500 | consumed samples: 33088000 | consumed tokens: 67764224000 | elapsed time per iteration (s): 0.16 | learning rate: 4.790E-05 | global batch size: 256 | lm loss: 3.666249E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.272 | TFLOPs: 24.56 | +7: iteration 129260/ 173500 | consumed samples: 33090560 | consumed tokens: 67769466880 | elapsed time per iteration (s): 0.16 | learning rate: 4.789E-05 | global batch size: 256 | lm loss: 3.677905E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.436 | TFLOPs: 25.04 | +7: iteration 129270/ 173500 | consumed samples: 33093120 | consumed tokens: 67774709760 | elapsed time per iteration (s): 0.16 | learning rate: 4.788E-05 | global batch size: 256 | lm loss: 3.669682E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.547 | TFLOPs: 25.74 | +7: iteration 129280/ 173500 | consumed samples: 33095680 | consumed tokens: 67779952640 | elapsed time per iteration (s): 0.16 | learning rate: 4.787E-05 | global batch size: 256 | lm loss: 3.671021E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.528 | TFLOPs: 24.68 | +7: iteration 129290/ 173500 | consumed samples: 33098240 | consumed tokens: 67785195520 | elapsed time per iteration (s): 0.15 | learning rate: 4.785E-05 | global batch size: 256 | lm loss: 3.662611E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.247 | TFLOPs: 26.15 | +7: iteration 129300/ 173500 | consumed samples: 33100800 | consumed tokens: 67790438400 | elapsed time per iteration (s): 0.16 | learning rate: 4.784E-05 | global batch size: 256 | lm loss: 3.658584E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.883 | TFLOPs: 24.87 | +7: iteration 129310/ 173500 | consumed samples: 33103360 | consumed tokens: 67795681280 | elapsed time per iteration (s): 0.16 | learning rate: 4.783E-05 | global batch size: 256 | lm loss: 3.666506E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.730 | TFLOPs: 24.81 | +7: iteration 129320/ 173500 | consumed samples: 33105920 | consumed tokens: 67800924160 | elapsed time per iteration (s): 0.16 | learning rate: 4.782E-05 | global batch size: 256 | lm loss: 3.674805E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.380 | TFLOPs: 25.30 | +7: iteration 129330/ 173500 | consumed samples: 33108480 | consumed tokens: 67806167040 | elapsed time per iteration (s): 0.16 | learning rate: 4.781E-05 | global batch size: 256 | lm loss: 3.675622E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.509 | TFLOPs: 25.23 | +7: iteration 129340/ 173500 | consumed samples: 33111040 | consumed tokens: 67811409920 | elapsed time per iteration (s): 0.16 | learning rate: 4.780E-05 | global batch size: 256 | lm loss: 3.665246E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.782 | TFLOPs: 25.56 | +7: iteration 129350/ 173500 | consumed samples: 33113600 | consumed tokens: 67816652800 | elapsed time per iteration (s): 0.16 | learning rate: 4.778E-05 | global batch size: 256 | lm loss: 3.677370E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.738 | TFLOPs: 25.07 | +7: iteration 129360/ 173500 | consumed samples: 33116160 | consumed tokens: 67821895680 | elapsed time per iteration (s): 0.16 | learning rate: 4.777E-05 | global batch size: 256 | lm loss: 3.656565E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.252 | TFLOPs: 25.46 | +7: iteration 129370/ 173500 | consumed samples: 33118720 | consumed tokens: 67827138560 | elapsed time per iteration (s): 0.16 | learning rate: 4.776E-05 | global batch size: 256 | lm loss: 3.684235E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.143 | TFLOPs: 25.17 | +7: iteration 129380/ 173500 | consumed samples: 33121280 | consumed tokens: 67832381440 | elapsed time per iteration (s): 0.15 | learning rate: 4.775E-05 | global batch size: 256 | lm loss: 3.676157E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.836 | TFLOPs: 26.00 | +7: iteration 129390/ 173500 | consumed samples: 33123840 | consumed tokens: 67837624320 | elapsed time per iteration (s): 0.16 | learning rate: 4.774E-05 | global batch size: 256 | lm loss: 3.666109E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.371 | TFLOPs: 25.18 | +7: iteration 129400/ 173500 | consumed samples: 33126400 | consumed tokens: 67842867200 | elapsed time per iteration (s): 0.16 | learning rate: 4.772E-05 | global batch size: 256 | lm loss: 3.675146E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.951 | TFLOPs: 24.50 | +7: iteration 129410/ 173500 | consumed samples: 33128960 | consumed tokens: 67848110080 | elapsed time per iteration (s): 0.16 | learning rate: 4.771E-05 | global batch size: 256 | lm loss: 3.663580E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.078 | TFLOPs: 25.61 | +7: iteration 129420/ 173500 | consumed samples: 33131520 | consumed tokens: 67853352960 | elapsed time per iteration (s): 0.16 | learning rate: 4.770E-05 | global batch size: 256 | lm loss: 3.677202E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.485 | TFLOPs: 24.91 | +7: iteration 129430/ 173500 | consumed samples: 33134080 | consumed tokens: 67858595840 | elapsed time per iteration (s): 0.15 | learning rate: 4.769E-05 | global batch size: 256 | lm loss: 3.670147E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.442 | TFLOPs: 26.06 | +7: iteration 129440/ 173500 | consumed samples: 33136640 | consumed tokens: 67863838720 | elapsed time per iteration (s): 0.16 | learning rate: 4.768E-05 | global batch size: 256 | lm loss: 3.684984E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.982 | TFLOPs: 25.00 | +7: iteration 129450/ 173500 | consumed samples: 33139200 | consumed tokens: 67869081600 | elapsed time per iteration (s): 0.16 | learning rate: 4.766E-05 | global batch size: 256 | lm loss: 3.677303E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.704 | TFLOPs: 24.54 | +7: iteration 129460/ 173500 | consumed samples: 33141760 | consumed tokens: 67874324480 | elapsed time per iteration (s): 0.16 | learning rate: 4.765E-05 | global batch size: 256 | lm loss: 3.670004E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.774 | TFLOPs: 25.53 | +7: iteration 129470/ 173500 | consumed samples: 33144320 | consumed tokens: 67879567360 | elapsed time per iteration (s): 0.16 | learning rate: 4.764E-05 | global batch size: 256 | lm loss: 3.686429E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.163 | TFLOPs: 25.20 | +7: iteration 129480/ 173500 | consumed samples: 33146880 | consumed tokens: 67884810240 | elapsed time per iteration (s): 0.16 | learning rate: 4.763E-05 | global batch size: 256 | lm loss: 3.672184E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.620 | TFLOPs: 24.93 | +7: iteration 129490/ 173500 | consumed samples: 33149440 | consumed tokens: 67890053120 | elapsed time per iteration (s): 0.15 | learning rate: 4.762E-05 | global batch size: 256 | lm loss: 3.674043E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.608 | TFLOPs: 26.26 | +7: iteration 129500/ 173500 | consumed samples: 33152000 | consumed tokens: 67895296000 | elapsed time per iteration (s): 0.15 | learning rate: 4.761E-05 | global batch size: 256 | lm loss: 3.679813E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.182 | TFLOPs: 26.02 | +7: iteration 129510/ 173500 | consumed samples: 33154560 | consumed tokens: 67900538880 | elapsed time per iteration (s): 0.16 | learning rate: 4.759E-05 | global batch size: 256 | lm loss: 3.662493E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.411 | TFLOPs: 25.49 | +7: iteration 129520/ 173500 | consumed samples: 33157120 | consumed tokens: 67905781760 | elapsed time per iteration (s): 0.16 | learning rate: 4.758E-05 | global batch size: 256 | lm loss: 3.658372E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.352 | TFLOPs: 25.43 | +7: iteration 129530/ 173500 | consumed samples: 33159680 | consumed tokens: 67911024640 | elapsed time per iteration (s): 0.16 | learning rate: 4.757E-05 | global batch size: 256 | lm loss: 3.672929E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.217 | TFLOPs: 25.85 | +7: iteration 129540/ 173500 | consumed samples: 33162240 | consumed tokens: 67916267520 | elapsed time per iteration (s): 0.16 | learning rate: 4.756E-05 | global batch size: 256 | lm loss: 3.686168E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.517 | TFLOPs: 24.72 | +7: iteration 129550/ 173500 | consumed samples: 33164800 | consumed tokens: 67921510400 | elapsed time per iteration (s): 0.16 | learning rate: 4.755E-05 | global batch size: 256 | lm loss: 3.674180E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.907 | TFLOPs: 25.40 | +7: iteration 129560/ 173500 | consumed samples: 33167360 | consumed tokens: 67926753280 | elapsed time per iteration (s): 0.16 | learning rate: 4.753E-05 | global batch size: 256 | lm loss: 3.671127E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.923 | TFLOPs: 25.40 | +7: iteration 129570/ 173500 | consumed samples: 33169920 | consumed tokens: 67931996160 | elapsed time per iteration (s): 0.16 | learning rate: 4.752E-05 | global batch size: 256 | lm loss: 3.671621E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.367 | TFLOPs: 25.18 | +7: iteration 129580/ 173500 | consumed samples: 33172480 | consumed tokens: 67937239040 | elapsed time per iteration (s): 0.15 | learning rate: 4.751E-05 | global batch size: 256 | lm loss: 3.679935E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.410 | TFLOPs: 26.24 | +7: iteration 129590/ 173500 | consumed samples: 33175040 | consumed tokens: 67942481920 | elapsed time per iteration (s): 0.16 | learning rate: 4.750E-05 | global batch size: 256 | lm loss: 3.673264E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.686 | TFLOPs: 25.67 | +7: iteration 129600/ 173500 | consumed samples: 33177600 | consumed tokens: 67947724800 | elapsed time per iteration (s): 0.15 | learning rate: 4.749E-05 | global batch size: 256 | lm loss: 3.672160E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.282 | TFLOPs: 26.24 | +7: iteration 129610/ 173500 | consumed samples: 33180160 | consumed tokens: 67952967680 | elapsed time per iteration (s): 0.16 | learning rate: 4.747E-05 | global batch size: 256 | lm loss: 3.682430E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.877 | TFLOPs: 25.15 | +7: iteration 129620/ 173500 | consumed samples: 33182720 | consumed tokens: 67958210560 | elapsed time per iteration (s): 0.16 | learning rate: 4.746E-05 | global batch size: 256 | lm loss: 3.675141E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.165 | TFLOPs: 24.70 | +7: iteration 129630/ 173500 | consumed samples: 33185280 | consumed tokens: 67963453440 | elapsed time per iteration (s): 0.16 | learning rate: 4.745E-05 | global batch size: 256 | lm loss: 3.666672E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.928 | TFLOPs: 25.17 | +7: iteration 129640/ 173500 | consumed samples: 33187840 | consumed tokens: 67968696320 | elapsed time per iteration (s): 0.16 | learning rate: 4.744E-05 | global batch size: 256 | lm loss: 3.676258E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.290 | TFLOPs: 25.41 | +7: iteration 129650/ 173500 | consumed samples: 33190400 | consumed tokens: 67973939200 | elapsed time per iteration (s): 0.16 | learning rate: 4.743E-05 | global batch size: 256 | lm loss: 3.677387E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.195 | TFLOPs: 25.39 | +7: iteration 129660/ 173500 | consumed samples: 33192960 | consumed tokens: 67979182080 | elapsed time per iteration (s): 0.16 | learning rate: 4.742E-05 | global batch size: 256 | lm loss: 3.678844E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.273 | TFLOPs: 24.47 | +7: iteration 129670/ 173500 | consumed samples: 33195520 | consumed tokens: 67984424960 | elapsed time per iteration (s): 0.16 | learning rate: 4.740E-05 | global batch size: 256 | lm loss: 3.676033E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.380 | TFLOPs: 25.60 | +7: iteration 129680/ 173500 | consumed samples: 33198080 | consumed tokens: 67989667840 | elapsed time per iteration (s): 0.16 | learning rate: 4.739E-05 | global batch size: 256 | lm loss: 3.673454E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.071 | TFLOPs: 25.88 | +7: iteration 129690/ 173500 | consumed samples: 33200640 | consumed tokens: 67994910720 | elapsed time per iteration (s): 0.16 | learning rate: 4.738E-05 | global batch size: 256 | lm loss: 3.672430E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.752 | TFLOPs: 25.51 | +7: iteration 129700/ 173500 | consumed samples: 33203200 | consumed tokens: 68000153600 | elapsed time per iteration (s): 0.16 | learning rate: 4.737E-05 | global batch size: 256 | lm loss: 3.663213E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.478 | TFLOPs: 25.02 | +7: iteration 129710/ 173500 | consumed samples: 33205760 | consumed tokens: 68005396480 | elapsed time per iteration (s): 0.16 | learning rate: 4.736E-05 | global batch size: 256 | lm loss: 3.661583E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.705 | TFLOPs: 25.67 | +7: iteration 129720/ 173500 | consumed samples: 33208320 | consumed tokens: 68010639360 | elapsed time per iteration (s): 0.16 | learning rate: 4.734E-05 | global batch size: 256 | lm loss: 3.646886E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.409 | TFLOPs: 25.90 | +7: iteration 129730/ 173500 | consumed samples: 33210880 | consumed tokens: 68015882240 | elapsed time per iteration (s): 0.15 | learning rate: 4.733E-05 | global batch size: 256 | lm loss: 3.691901E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.337 | TFLOPs: 26.01 | +7: iteration 129740/ 173500 | consumed samples: 33213440 | consumed tokens: 68021125120 | elapsed time per iteration (s): 0.16 | learning rate: 4.732E-05 | global batch size: 256 | lm loss: 3.673542E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.219 | TFLOPs: 25.58 | +7: iteration 129750/ 173500 | consumed samples: 33216000 | consumed tokens: 68026368000 | elapsed time per iteration (s): 0.16 | learning rate: 4.731E-05 | global batch size: 256 | lm loss: 3.657191E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.446 | TFLOPs: 25.51 | +7: iteration 129760/ 173500 | consumed samples: 33218560 | consumed tokens: 68031610880 | elapsed time per iteration (s): 0.16 | learning rate: 4.730E-05 | global batch size: 256 | lm loss: 3.667483E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.338 | TFLOPs: 25.51 | +7: iteration 129770/ 173500 | consumed samples: 33221120 | consumed tokens: 68036853760 | elapsed time per iteration (s): 0.16 | learning rate: 4.729E-05 | global batch size: 256 | lm loss: 3.672961E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.490 | TFLOPs: 25.46 | +7: iteration 129780/ 173500 | consumed samples: 33223680 | consumed tokens: 68042096640 | elapsed time per iteration (s): 0.17 | learning rate: 4.727E-05 | global batch size: 256 | lm loss: 3.668176E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.625 | TFLOPs: 24.19 | +7: iteration 129790/ 173500 | consumed samples: 33226240 | consumed tokens: 68047339520 | elapsed time per iteration (s): 0.16 | learning rate: 4.726E-05 | global batch size: 256 | lm loss: 3.658550E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.062 | TFLOPs: 25.14 | +7: iteration 129800/ 173500 | consumed samples: 33228800 | consumed tokens: 68052582400 | elapsed time per iteration (s): 0.16 | learning rate: 4.725E-05 | global batch size: 256 | lm loss: 3.682516E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.459 | TFLOPs: 25.51 | +7: iteration 129810/ 173500 | consumed samples: 33231360 | consumed tokens: 68057825280 | elapsed time per iteration (s): 0.16 | learning rate: 4.724E-05 | global batch size: 256 | lm loss: 3.670191E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.641 | TFLOPs: 25.54 | +7: iteration 129820/ 173500 | consumed samples: 33233920 | consumed tokens: 68063068160 | elapsed time per iteration (s): 0.17 | learning rate: 4.723E-05 | global batch size: 256 | lm loss: 3.672204E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.886 | TFLOPs: 24.26 | +7: iteration 129830/ 173500 | consumed samples: 33236480 | consumed tokens: 68068311040 | elapsed time per iteration (s): 0.16 | learning rate: 4.721E-05 | global batch size: 256 | lm loss: 3.666602E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.528 | TFLOPs: 25.85 | +7: iteration 129840/ 173500 | consumed samples: 33239040 | consumed tokens: 68073553920 | elapsed time per iteration (s): 0.15 | learning rate: 4.720E-05 | global batch size: 256 | lm loss: 3.660337E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.599 | TFLOPs: 26.25 | +7: iteration 129850/ 173500 | consumed samples: 33241600 | consumed tokens: 68078796800 | elapsed time per iteration (s): 0.16 | learning rate: 4.719E-05 | global batch size: 256 | lm loss: 3.674354E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.426 | TFLOPs: 25.46 | +7: iteration 129860/ 173500 | consumed samples: 33244160 | consumed tokens: 68084039680 | elapsed time per iteration (s): 0.16 | learning rate: 4.718E-05 | global batch size: 256 | lm loss: 3.674501E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.125 | TFLOPs: 25.44 | +7: iteration 129870/ 173500 | consumed samples: 33246720 | consumed tokens: 68089282560 | elapsed time per iteration (s): 0.16 | learning rate: 4.717E-05 | global batch size: 256 | lm loss: 3.677447E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.188 | TFLOPs: 25.74 | +7: iteration 129880/ 173500 | consumed samples: 33249280 | consumed tokens: 68094525440 | elapsed time per iteration (s): 0.16 | learning rate: 4.716E-05 | global batch size: 256 | lm loss: 3.666297E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.092 | TFLOPs: 25.39 | +7: iteration 129890/ 173500 | consumed samples: 33251840 | consumed tokens: 68099768320 | elapsed time per iteration (s): 0.16 | learning rate: 4.714E-05 | global batch size: 256 | lm loss: 3.669316E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.816 | TFLOPs: 25.50 | +7: iteration 129900/ 173500 | consumed samples: 33254400 | consumed tokens: 68105011200 | elapsed time per iteration (s): 0.16 | learning rate: 4.713E-05 | global batch size: 256 | lm loss: 3.676579E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.308 | TFLOPs: 25.83 | +7: iteration 129910/ 173500 | consumed samples: 33256960 | consumed tokens: 68110254080 | elapsed time per iteration (s): 0.16 | learning rate: 4.712E-05 | global batch size: 256 | lm loss: 3.674423E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.378 | TFLOPs: 24.97 | +7: iteration 129920/ 173500 | consumed samples: 33259520 | consumed tokens: 68115496960 | elapsed time per iteration (s): 0.15 | learning rate: 4.711E-05 | global batch size: 256 | lm loss: 3.672569E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.588 | TFLOPs: 26.00 | +7: iteration 129930/ 173500 | consumed samples: 33262080 | consumed tokens: 68120739840 | elapsed time per iteration (s): 0.16 | learning rate: 4.710E-05 | global batch size: 256 | lm loss: 3.662257E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.649 | TFLOPs: 25.70 | +7: iteration 129940/ 173500 | consumed samples: 33264640 | consumed tokens: 68125982720 | elapsed time per iteration (s): 0.16 | learning rate: 4.709E-05 | global batch size: 256 | lm loss: 3.676443E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.420 | TFLOPs: 25.30 | +7: iteration 129950/ 173500 | consumed samples: 33267200 | consumed tokens: 68131225600 | elapsed time per iteration (s): 0.15 | learning rate: 4.707E-05 | global batch size: 256 | lm loss: 3.665749E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.975 | TFLOPs: 26.17 | +7: iteration 129960/ 173500 | consumed samples: 33269760 | consumed tokens: 68136468480 | elapsed time per iteration (s): 0.15 | learning rate: 4.706E-05 | global batch size: 256 | lm loss: 3.671183E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.115 | TFLOPs: 26.11 | +7: iteration 129970/ 173500 | consumed samples: 33272320 | consumed tokens: 68141711360 | elapsed time per iteration (s): 0.16 | learning rate: 4.705E-05 | global batch size: 256 | lm loss: 3.675318E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.570 | TFLOPs: 25.49 | +7: iteration 129980/ 173500 | consumed samples: 33274880 | consumed tokens: 68146954240 | elapsed time per iteration (s): 0.15 | learning rate: 4.704E-05 | global batch size: 256 | lm loss: 3.677466E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.799 | TFLOPs: 26.23 | +7: iteration 129990/ 173500 | consumed samples: 33277440 | consumed tokens: 68152197120 | elapsed time per iteration (s): 0.16 | learning rate: 4.703E-05 | global batch size: 256 | lm loss: 3.669943E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.227 | TFLOPs: 25.14 | +0: [2023-03-17 05:54:30,913] [INFO] [logging.py:68:log_dist] [Rank 0] step=130000, skipped=0, lr=[4.7014562839599005e-05, 4.7014562839599005e-05, 4.7014562839599005e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 130000/ 173500 | consumed samples: 33280000 | consumed tokens: 68157440000 | elapsed time per iteration (s): 0.15 | learning rate: 4.701E-05 | global batch size: 256 | lm loss: 3.683596E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.284 | TFLOPs: 26.15 | +0: steps: 130000 loss: 3.6730 iter time (s): 0.158 samples/sec: 1621.151 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 130000 | lm loss value: 3.802984E+00 | lm loss PPL: 4.483477E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 130000 to checkpoints_44m91b100m +0: [2023-03-17 05:54:30,987] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step130000 is begin to save! +0: [2023-03-17 05:54:30,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:54:31,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:54:31,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:54:31,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:54:31,060] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:54:31,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:54:31,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:54:31,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:54:31,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:54:31,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:54:31,083] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:54:31,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:54:31,091] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:54:31,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:54:31,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:54:31,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:54:31,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:54:31,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:54:31,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:54:31,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:54:31,116] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step130000/mp_rank_00_model_states.pt +0: [2023-03-17 05:54:31,116] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:54:31,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:54:31,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:54:31,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,151] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +1: [2023-03-17 05:54:31,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:54:31,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +3: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:54:31,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +6: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +5: [2023-03-17 05:54:31,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +7: [2023-03-17 05:54:31,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:54:31,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +4: [2023-03-17 05:54:31,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:54:31,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:54:31,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +2: [2023-03-17 05:54:31,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:54:31,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step130000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:54:31,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step130000 is ready now! +0: successfully saved checkpoint at iteration 130000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 177.88 +7: iteration 130010/ 173500 | consumed samples: 33282560 | consumed tokens: 68162682880 | elapsed time per iteration (s): 0.19 | learning rate: 4.700E-05 | global batch size: 256 | lm loss: 3.670864E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.404 | TFLOPs: 21.57 | +7: iteration 130020/ 173500 | consumed samples: 33285120 | consumed tokens: 68167925760 | elapsed time per iteration (s): 0.16 | learning rate: 4.699E-05 | global batch size: 256 | lm loss: 3.672520E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.671 | TFLOPs: 25.87 | +7: iteration 130030/ 173500 | consumed samples: 33287680 | consumed tokens: 68173168640 | elapsed time per iteration (s): 0.16 | learning rate: 4.698E-05 | global batch size: 256 | lm loss: 3.675841E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.603 | TFLOPs: 24.90 | +7: iteration 130040/ 173500 | consumed samples: 33290240 | consumed tokens: 68178411520 | elapsed time per iteration (s): 0.16 | learning rate: 4.697E-05 | global batch size: 256 | lm loss: 3.681670E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.315 | TFLOPs: 25.71 | +7: iteration 130050/ 173500 | consumed samples: 33292800 | consumed tokens: 68183654400 | elapsed time per iteration (s): 0.16 | learning rate: 4.696E-05 | global batch size: 256 | lm loss: 3.689433E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.601 | TFLOPs: 25.34 | +7: iteration 130060/ 173500 | consumed samples: 33295360 | consumed tokens: 68188897280 | elapsed time per iteration (s): 0.15 | learning rate: 4.694E-05 | global batch size: 256 | lm loss: 3.672380E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.375 | TFLOPs: 26.07 | +7: iteration 130070/ 173500 | consumed samples: 33297920 | consumed tokens: 68194140160 | elapsed time per iteration (s): 0.16 | learning rate: 4.693E-05 | global batch size: 256 | lm loss: 3.691424E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.861 | TFLOPs: 25.43 | +7: iteration 130080/ 173500 | consumed samples: 33300480 | consumed tokens: 68199383040 | elapsed time per iteration (s): 0.16 | learning rate: 4.692E-05 | global batch size: 256 | lm loss: 3.666357E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.964 | TFLOPs: 25.75 | +7: iteration 130090/ 173500 | consumed samples: 33303040 | consumed tokens: 68204625920 | elapsed time per iteration (s): 0.16 | learning rate: 4.691E-05 | global batch size: 256 | lm loss: 3.665682E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.294 | TFLOPs: 25.66 | +7: iteration 130100/ 173500 | consumed samples: 33305600 | consumed tokens: 68209868800 | elapsed time per iteration (s): 0.16 | learning rate: 4.690E-05 | global batch size: 256 | lm loss: 3.670904E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.265 | TFLOPs: 25.13 | +7: iteration 130110/ 173500 | consumed samples: 33308160 | consumed tokens: 68215111680 | elapsed time per iteration (s): 0.16 | learning rate: 4.689E-05 | global batch size: 256 | lm loss: 3.658107E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.416 | TFLOPs: 25.44 | +7: iteration 130120/ 173500 | consumed samples: 33310720 | consumed tokens: 68220354560 | elapsed time per iteration (s): 0.16 | learning rate: 4.687E-05 | global batch size: 256 | lm loss: 3.676049E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.284 | TFLOPs: 24.52 | +7: iteration 130130/ 173500 | consumed samples: 33313280 | consumed tokens: 68225597440 | elapsed time per iteration (s): 0.16 | learning rate: 4.686E-05 | global batch size: 256 | lm loss: 3.666830E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.584 | TFLOPs: 25.87 | +7: iteration 130140/ 173500 | consumed samples: 33315840 | consumed tokens: 68230840320 | elapsed time per iteration (s): 0.16 | learning rate: 4.685E-05 | global batch size: 256 | lm loss: 3.665693E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.437 | TFLOPs: 25.08 | +7: iteration 130150/ 173500 | consumed samples: 33318400 | consumed tokens: 68236083200 | elapsed time per iteration (s): 0.15 | learning rate: 4.684E-05 | global batch size: 256 | lm loss: 3.668002E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.388 | TFLOPs: 26.18 | +7: iteration 130160/ 173500 | consumed samples: 33320960 | consumed tokens: 68241326080 | elapsed time per iteration (s): 0.15 | learning rate: 4.683E-05 | global batch size: 256 | lm loss: 3.682487E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.144 | TFLOPs: 26.19 | +7: iteration 130170/ 173500 | consumed samples: 33323520 | consumed tokens: 68246568960 | elapsed time per iteration (s): 0.16 | learning rate: 4.681E-05 | global batch size: 256 | lm loss: 3.671492E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.266 | TFLOPs: 25.44 | +7: iteration 130180/ 173500 | consumed samples: 33326080 | consumed tokens: 68251811840 | elapsed time per iteration (s): 0.16 | learning rate: 4.680E-05 | global batch size: 256 | lm loss: 3.673417E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.341 | TFLOPs: 25.40 | +7: iteration 130190/ 173500 | consumed samples: 33328640 | consumed tokens: 68257054720 | elapsed time per iteration (s): 0.15 | learning rate: 4.679E-05 | global batch size: 256 | lm loss: 3.654109E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.774 | TFLOPs: 25.92 | +7: iteration 130200/ 173500 | consumed samples: 33331200 | consumed tokens: 68262297600 | elapsed time per iteration (s): 0.16 | learning rate: 4.678E-05 | global batch size: 256 | lm loss: 3.687609E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.243 | TFLOPs: 25.38 | +7: iteration 130210/ 173500 | consumed samples: 33333760 | consumed tokens: 68267540480 | elapsed time per iteration (s): 0.16 | learning rate: 4.677E-05 | global batch size: 256 | lm loss: 3.663750E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.334 | TFLOPs: 25.21 | +7: iteration 130220/ 173500 | consumed samples: 33336320 | consumed tokens: 68272783360 | elapsed time per iteration (s): 0.16 | learning rate: 4.676E-05 | global batch size: 256 | lm loss: 3.664838E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.824 | TFLOPs: 25.45 | +7: iteration 130230/ 173500 | consumed samples: 33338880 | consumed tokens: 68278026240 | elapsed time per iteration (s): 0.15 | learning rate: 4.674E-05 | global batch size: 256 | lm loss: 3.681466E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.592 | TFLOPs: 25.93 | +7: iteration 130240/ 173500 | consumed samples: 33341440 | consumed tokens: 68283269120 | elapsed time per iteration (s): 0.16 | learning rate: 4.673E-05 | global batch size: 256 | lm loss: 3.658868E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.890 | TFLOPs: 25.75 | +7: iteration 130250/ 173500 | consumed samples: 33344000 | consumed tokens: 68288512000 | elapsed time per iteration (s): 0.15 | learning rate: 4.672E-05 | global batch size: 256 | lm loss: 3.682100E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.402 | TFLOPs: 26.26 | +7: iteration 130260/ 173500 | consumed samples: 33346560 | consumed tokens: 68293754880 | elapsed time per iteration (s): 0.16 | learning rate: 4.671E-05 | global batch size: 256 | lm loss: 3.677862E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.137 | TFLOPs: 25.41 | +7: iteration 130270/ 173500 | consumed samples: 33349120 | consumed tokens: 68298997760 | elapsed time per iteration (s): 0.16 | learning rate: 4.670E-05 | global batch size: 256 | lm loss: 3.679420E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.900 | TFLOPs: 25.00 | +7: iteration 130280/ 173500 | consumed samples: 33351680 | consumed tokens: 68304240640 | elapsed time per iteration (s): 0.16 | learning rate: 4.669E-05 | global batch size: 256 | lm loss: 3.670655E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.050 | TFLOPs: 25.61 | +7: iteration 130290/ 173500 | consumed samples: 33354240 | consumed tokens: 68309483520 | elapsed time per iteration (s): 0.15 | learning rate: 4.667E-05 | global batch size: 256 | lm loss: 3.669725E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.953 | TFLOPs: 26.30 | +7: iteration 130300/ 173500 | consumed samples: 33356800 | consumed tokens: 68314726400 | elapsed time per iteration (s): 0.16 | learning rate: 4.666E-05 | global batch size: 256 | lm loss: 3.681352E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.922 | TFLOPs: 25.87 | +7: iteration 130310/ 173500 | consumed samples: 33359360 | consumed tokens: 68319969280 | elapsed time per iteration (s): 0.16 | learning rate: 4.665E-05 | global batch size: 256 | lm loss: 3.668869E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.660 | TFLOPs: 25.89 | +7: iteration 130320/ 173500 | consumed samples: 33361920 | consumed tokens: 68325212160 | elapsed time per iteration (s): 0.15 | learning rate: 4.664E-05 | global batch size: 256 | lm loss: 3.672557E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.227 | TFLOPs: 26.22 | +7: iteration 130330/ 173500 | consumed samples: 33364480 | consumed tokens: 68330455040 | elapsed time per iteration (s): 0.15 | learning rate: 4.663E-05 | global batch size: 256 | lm loss: 3.664290E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.474 | TFLOPs: 26.02 | +7: iteration 130340/ 173500 | consumed samples: 33367040 | consumed tokens: 68335697920 | elapsed time per iteration (s): 0.16 | learning rate: 4.662E-05 | global batch size: 256 | lm loss: 3.674773E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.028 | TFLOPs: 25.56 | +7: iteration 130350/ 173500 | consumed samples: 33369600 | consumed tokens: 68340940800 | elapsed time per iteration (s): 0.16 | learning rate: 4.660E-05 | global batch size: 256 | lm loss: 3.660874E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.947 | TFLOPs: 25.44 | +7: iteration 130360/ 173500 | consumed samples: 33372160 | consumed tokens: 68346183680 | elapsed time per iteration (s): 0.16 | learning rate: 4.659E-05 | global batch size: 256 | lm loss: 3.676173E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.048 | TFLOPs: 25.59 | +7: iteration 130370/ 173500 | consumed samples: 33374720 | consumed tokens: 68351426560 | elapsed time per iteration (s): 0.15 | learning rate: 4.658E-05 | global batch size: 256 | lm loss: 3.668503E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.937 | TFLOPs: 26.08 | +7: iteration 130380/ 173500 | consumed samples: 33377280 | consumed tokens: 68356669440 | elapsed time per iteration (s): 0.16 | learning rate: 4.657E-05 | global batch size: 256 | lm loss: 3.684821E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.848 | TFLOPs: 25.62 | +7: iteration 130390/ 173500 | consumed samples: 33379840 | consumed tokens: 68361912320 | elapsed time per iteration (s): 0.15 | learning rate: 4.656E-05 | global batch size: 256 | lm loss: 3.690933E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.592 | TFLOPs: 25.92 | +7: iteration 130400/ 173500 | consumed samples: 33382400 | consumed tokens: 68367155200 | elapsed time per iteration (s): 0.16 | learning rate: 4.655E-05 | global batch size: 256 | lm loss: 3.665143E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.095 | TFLOPs: 25.28 | +7: iteration 130410/ 173500 | consumed samples: 33384960 | consumed tokens: 68372398080 | elapsed time per iteration (s): 0.15 | learning rate: 4.653E-05 | global batch size: 256 | lm loss: 3.674990E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.072 | TFLOPs: 26.29 | +7: iteration 130420/ 173500 | consumed samples: 33387520 | consumed tokens: 68377640960 | elapsed time per iteration (s): 0.16 | learning rate: 4.652E-05 | global batch size: 256 | lm loss: 3.661099E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.533 | TFLOPs: 25.73 | +7: iteration 130430/ 173500 | consumed samples: 33390080 | consumed tokens: 68382883840 | elapsed time per iteration (s): 0.15 | learning rate: 4.651E-05 | global batch size: 256 | lm loss: 3.677285E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.259 | TFLOPs: 26.24 | +7: iteration 130440/ 173500 | consumed samples: 33392640 | consumed tokens: 68388126720 | elapsed time per iteration (s): 0.15 | learning rate: 4.650E-05 | global batch size: 256 | lm loss: 3.675463E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.775 | TFLOPs: 26.26 | +7: iteration 130450/ 173500 | consumed samples: 33395200 | consumed tokens: 68393369600 | elapsed time per iteration (s): 0.15 | learning rate: 4.649E-05 | global batch size: 256 | lm loss: 3.661766E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.482 | TFLOPs: 26.18 | +7: iteration 130460/ 173500 | consumed samples: 33397760 | consumed tokens: 68398612480 | elapsed time per iteration (s): 0.16 | learning rate: 4.648E-05 | global batch size: 256 | lm loss: 3.673425E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.236 | TFLOPs: 25.38 | +7: iteration 130470/ 173500 | consumed samples: 33400320 | consumed tokens: 68403855360 | elapsed time per iteration (s): 0.16 | learning rate: 4.646E-05 | global batch size: 256 | lm loss: 3.659096E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.189 | TFLOPs: 25.06 | +7: iteration 130480/ 173500 | consumed samples: 33402880 | consumed tokens: 68409098240 | elapsed time per iteration (s): 0.16 | learning rate: 4.645E-05 | global batch size: 256 | lm loss: 3.675792E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.924 | TFLOPs: 25.69 | +7: iteration 130490/ 173500 | consumed samples: 33405440 | consumed tokens: 68414341120 | elapsed time per iteration (s): 0.16 | learning rate: 4.644E-05 | global batch size: 256 | lm loss: 3.678562E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.087 | TFLOPs: 25.47 | +7: iteration 130500/ 173500 | consumed samples: 33408000 | consumed tokens: 68419584000 | elapsed time per iteration (s): 0.16 | learning rate: 4.643E-05 | global batch size: 256 | lm loss: 3.671613E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.047 | TFLOPs: 25.08 | +7: iteration 130510/ 173500 | consumed samples: 33410560 | consumed tokens: 68424826880 | elapsed time per iteration (s): 0.15 | learning rate: 4.642E-05 | global batch size: 256 | lm loss: 3.663037E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.242 | TFLOPs: 25.97 | +7: iteration 130520/ 173500 | consumed samples: 33413120 | consumed tokens: 68430069760 | elapsed time per iteration (s): 0.16 | learning rate: 4.641E-05 | global batch size: 256 | lm loss: 3.673666E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.787 | TFLOPs: 24.34 | +7: iteration 130530/ 173500 | consumed samples: 33415680 | consumed tokens: 68435312640 | elapsed time per iteration (s): 0.16 | learning rate: 4.639E-05 | global batch size: 256 | lm loss: 3.674061E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.400 | TFLOPs: 24.57 | +7: iteration 130540/ 173500 | consumed samples: 33418240 | consumed tokens: 68440555520 | elapsed time per iteration (s): 0.16 | learning rate: 4.638E-05 | global batch size: 256 | lm loss: 3.677899E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.748 | TFLOPs: 25.81 | +7: iteration 130550/ 173500 | consumed samples: 33420800 | consumed tokens: 68445798400 | elapsed time per iteration (s): 0.15 | learning rate: 4.637E-05 | global batch size: 256 | lm loss: 3.676437E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.048 | TFLOPs: 26.10 | +7: iteration 130560/ 173500 | consumed samples: 33423360 | consumed tokens: 68451041280 | elapsed time per iteration (s): 0.15 | learning rate: 4.636E-05 | global batch size: 256 | lm loss: 3.681836E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.822 | TFLOPs: 26.08 | +7: iteration 130570/ 173500 | consumed samples: 33425920 | consumed tokens: 68456284160 | elapsed time per iteration (s): 0.15 | learning rate: 4.635E-05 | global batch size: 256 | lm loss: 3.662803E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.757 | TFLOPs: 26.06 | +7: iteration 130580/ 173500 | consumed samples: 33428480 | consumed tokens: 68461527040 | elapsed time per iteration (s): 0.16 | learning rate: 4.634E-05 | global batch size: 256 | lm loss: 3.671031E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.598 | TFLOPs: 25.84 | +7: iteration 130590/ 173500 | consumed samples: 33431040 | consumed tokens: 68466769920 | elapsed time per iteration (s): 0.15 | learning rate: 4.632E-05 | global batch size: 256 | lm loss: 3.661973E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.664 | TFLOPs: 25.95 | +7: iteration 130600/ 173500 | consumed samples: 33433600 | consumed tokens: 68472012800 | elapsed time per iteration (s): 0.16 | learning rate: 4.631E-05 | global batch size: 256 | lm loss: 3.674239E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.094 | TFLOPs: 25.83 | +7: iteration 130610/ 173500 | consumed samples: 33436160 | consumed tokens: 68477255680 | elapsed time per iteration (s): 0.16 | learning rate: 4.630E-05 | global batch size: 256 | lm loss: 3.677689E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.185 | TFLOPs: 25.14 | +7: iteration 130620/ 173500 | consumed samples: 33438720 | consumed tokens: 68482498560 | elapsed time per iteration (s): 0.16 | learning rate: 4.629E-05 | global batch size: 256 | lm loss: 3.683603E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.815 | TFLOPs: 25.47 | +7: iteration 130630/ 173500 | consumed samples: 33441280 | consumed tokens: 68487741440 | elapsed time per iteration (s): 0.16 | learning rate: 4.628E-05 | global batch size: 256 | lm loss: 3.679696E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.734 | TFLOPs: 25.84 | +7: iteration 130640/ 173500 | consumed samples: 33443840 | consumed tokens: 68492984320 | elapsed time per iteration (s): 0.15 | learning rate: 4.627E-05 | global batch size: 256 | lm loss: 3.678308E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.964 | TFLOPs: 25.94 | +7: iteration 130650/ 173500 | consumed samples: 33446400 | consumed tokens: 68498227200 | elapsed time per iteration (s): 0.15 | learning rate: 4.625E-05 | global batch size: 256 | lm loss: 3.675963E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.271 | TFLOPs: 26.32 | +7: iteration 130660/ 173500 | consumed samples: 33448960 | consumed tokens: 68503470080 | elapsed time per iteration (s): 0.16 | learning rate: 4.624E-05 | global batch size: 256 | lm loss: 3.672657E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.687 | TFLOPs: 25.82 | +7: iteration 130670/ 173500 | consumed samples: 33451520 | consumed tokens: 68508712960 | elapsed time per iteration (s): 0.16 | learning rate: 4.623E-05 | global batch size: 256 | lm loss: 3.667370E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.535 | TFLOPs: 25.45 | +7: iteration 130680/ 173500 | consumed samples: 33454080 | consumed tokens: 68513955840 | elapsed time per iteration (s): 0.15 | learning rate: 4.622E-05 | global batch size: 256 | lm loss: 3.677849E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.893 | TFLOPs: 26.14 | +7: iteration 130690/ 173500 | consumed samples: 33456640 | consumed tokens: 68519198720 | elapsed time per iteration (s): 0.15 | learning rate: 4.621E-05 | global batch size: 256 | lm loss: 3.669109E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.661 | TFLOPs: 25.92 | +7: iteration 130700/ 173500 | consumed samples: 33459200 | consumed tokens: 68524441600 | elapsed time per iteration (s): 0.16 | learning rate: 4.620E-05 | global batch size: 256 | lm loss: 3.680936E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.606 | TFLOPs: 25.81 | +7: iteration 130710/ 173500 | consumed samples: 33461760 | consumed tokens: 68529684480 | elapsed time per iteration (s): 0.16 | learning rate: 4.619E-05 | global batch size: 256 | lm loss: 3.682135E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.124 | TFLOPs: 25.58 | +7: iteration 130720/ 173500 | consumed samples: 33464320 | consumed tokens: 68534927360 | elapsed time per iteration (s): 0.15 | learning rate: 4.617E-05 | global batch size: 256 | lm loss: 3.674278E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.954 | TFLOPs: 25.91 | +7: iteration 130730/ 173500 | consumed samples: 33466880 | consumed tokens: 68540170240 | elapsed time per iteration (s): 0.16 | learning rate: 4.616E-05 | global batch size: 256 | lm loss: 3.672209E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.082 | TFLOPs: 25.83 | +7: iteration 130740/ 173500 | consumed samples: 33469440 | consumed tokens: 68545413120 | elapsed time per iteration (s): 0.17 | learning rate: 4.615E-05 | global batch size: 256 | lm loss: 3.672451E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.960 | TFLOPs: 24.07 | +7: iteration 130750/ 173500 | consumed samples: 33472000 | consumed tokens: 68550656000 | elapsed time per iteration (s): 0.15 | learning rate: 4.614E-05 | global batch size: 256 | lm loss: 3.666683E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.750 | TFLOPs: 26.14 | +7: iteration 130760/ 173500 | consumed samples: 33474560 | consumed tokens: 68555898880 | elapsed time per iteration (s): 0.16 | learning rate: 4.613E-05 | global batch size: 256 | lm loss: 3.676821E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.835 | TFLOPs: 25.80 | +7: iteration 130770/ 173500 | consumed samples: 33477120 | consumed tokens: 68561141760 | elapsed time per iteration (s): 0.15 | learning rate: 4.612E-05 | global batch size: 256 | lm loss: 3.657927E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.386 | TFLOPs: 26.15 | +7: iteration 130780/ 173500 | consumed samples: 33479680 | consumed tokens: 68566384640 | elapsed time per iteration (s): 0.16 | learning rate: 4.610E-05 | global batch size: 256 | lm loss: 3.675853E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.823 | TFLOPs: 24.38 | +7: iteration 130790/ 173500 | consumed samples: 33482240 | consumed tokens: 68571627520 | elapsed time per iteration (s): 0.15 | learning rate: 4.609E-05 | global batch size: 256 | lm loss: 3.665290E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.014 | TFLOPs: 26.19 | +7: iteration 130800/ 173500 | consumed samples: 33484800 | consumed tokens: 68576870400 | elapsed time per iteration (s): 0.16 | learning rate: 4.608E-05 | global batch size: 256 | lm loss: 3.663507E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.365 | TFLOPs: 25.58 | +7: iteration 130810/ 173500 | consumed samples: 33487360 | consumed tokens: 68582113280 | elapsed time per iteration (s): 0.15 | learning rate: 4.607E-05 | global batch size: 256 | lm loss: 3.680159E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.608 | TFLOPs: 26.15 | +7: iteration 130820/ 173500 | consumed samples: 33489920 | consumed tokens: 68587356160 | elapsed time per iteration (s): 0.15 | learning rate: 4.606E-05 | global batch size: 256 | lm loss: 3.674439E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.117 | TFLOPs: 25.97 | +7: iteration 130830/ 173500 | consumed samples: 33492480 | consumed tokens: 68592599040 | elapsed time per iteration (s): 0.16 | learning rate: 4.605E-05 | global batch size: 256 | lm loss: 3.668946E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.468 | TFLOPs: 25.71 | +7: iteration 130840/ 173500 | consumed samples: 33495040 | consumed tokens: 68597841920 | elapsed time per iteration (s): 0.16 | learning rate: 4.603E-05 | global batch size: 256 | lm loss: 3.677026E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.015 | TFLOPs: 25.06 | +7: iteration 130850/ 173500 | consumed samples: 33497600 | consumed tokens: 68603084800 | elapsed time per iteration (s): 0.15 | learning rate: 4.602E-05 | global batch size: 256 | lm loss: 3.658649E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.916 | TFLOPs: 26.11 | +7: iteration 130860/ 173500 | consumed samples: 33500160 | consumed tokens: 68608327680 | elapsed time per iteration (s): 0.15 | learning rate: 4.601E-05 | global batch size: 256 | lm loss: 3.671481E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.691 | TFLOPs: 26.12 | +7: iteration 130870/ 173500 | consumed samples: 33502720 | consumed tokens: 68613570560 | elapsed time per iteration (s): 0.15 | learning rate: 4.600E-05 | global batch size: 256 | lm loss: 3.678345E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.475 | TFLOPs: 26.12 | +7: iteration 130880/ 173500 | consumed samples: 33505280 | consumed tokens: 68618813440 | elapsed time per iteration (s): 0.15 | learning rate: 4.599E-05 | global batch size: 256 | lm loss: 3.654285E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.581 | TFLOPs: 26.09 | +7: iteration 130890/ 173500 | consumed samples: 33507840 | consumed tokens: 68624056320 | elapsed time per iteration (s): 0.15 | learning rate: 4.598E-05 | global batch size: 256 | lm loss: 3.674730E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.645 | TFLOPs: 26.12 | +7: iteration 130900/ 173500 | consumed samples: 33510400 | consumed tokens: 68629299200 | elapsed time per iteration (s): 0.15 | learning rate: 4.596E-05 | global batch size: 256 | lm loss: 3.671132E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.112 | TFLOPs: 26.08 | +7: iteration 130910/ 173500 | consumed samples: 33512960 | consumed tokens: 68634542080 | elapsed time per iteration (s): 0.15 | learning rate: 4.595E-05 | global batch size: 256 | lm loss: 3.682190E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.850 | TFLOPs: 26.11 | +7: iteration 130920/ 173500 | consumed samples: 33515520 | consumed tokens: 68639784960 | elapsed time per iteration (s): 0.16 | learning rate: 4.594E-05 | global batch size: 256 | lm loss: 3.675201E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.653 | TFLOPs: 25.45 | +7: iteration 130930/ 173500 | consumed samples: 33518080 | consumed tokens: 68645027840 | elapsed time per iteration (s): 0.16 | learning rate: 4.593E-05 | global batch size: 256 | lm loss: 3.668366E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.216 | TFLOPs: 25.08 | +7: iteration 130940/ 173500 | consumed samples: 33520640 | consumed tokens: 68650270720 | elapsed time per iteration (s): 0.15 | learning rate: 4.592E-05 | global batch size: 256 | lm loss: 3.669838E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.935 | TFLOPs: 26.17 | +7: iteration 130950/ 173500 | consumed samples: 33523200 | consumed tokens: 68655513600 | elapsed time per iteration (s): 0.15 | learning rate: 4.591E-05 | global batch size: 256 | lm loss: 3.672947E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.361 | TFLOPs: 26.16 | +7: iteration 130960/ 173500 | consumed samples: 33525760 | consumed tokens: 68660756480 | elapsed time per iteration (s): 0.16 | learning rate: 4.590E-05 | global batch size: 256 | lm loss: 3.662296E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.899 | TFLOPs: 25.83 | +7: iteration 130970/ 173500 | consumed samples: 33528320 | consumed tokens: 68665999360 | elapsed time per iteration (s): 0.15 | learning rate: 4.588E-05 | global batch size: 256 | lm loss: 3.675924E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.770 | TFLOPs: 26.15 | +7: iteration 130980/ 173500 | consumed samples: 33530880 | consumed tokens: 68671242240 | elapsed time per iteration (s): 0.16 | learning rate: 4.587E-05 | global batch size: 256 | lm loss: 3.683798E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.526 | TFLOPs: 25.74 | +7: iteration 130990/ 173500 | consumed samples: 33533440 | consumed tokens: 68676485120 | elapsed time per iteration (s): 0.16 | learning rate: 4.586E-05 | global batch size: 256 | lm loss: 3.667300E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.869 | TFLOPs: 25.59 | +7: iteration 131000/ 173500 | consumed samples: 33536000 | consumed tokens: 68681728000 | elapsed time per iteration (s): 0.15 | learning rate: 4.585E-05 | global batch size: 256 | lm loss: 3.659295E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.253 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 131000 | lm loss value: 3.841293E+00 | lm loss PPL: 4.658566E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 131000 to checkpoints_44m91b100m +0: [2023-03-17 05:57:07,491] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step131000 is begin to save! +0: [2023-03-17 05:57:07,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:57:07,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:57:07,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:57:07,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:57:07,563] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:57:07,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:57:07,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:57:07,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:57:07,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:57:07,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:57:07,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:57:07,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:57:07,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:57:07,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:57:07,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:57:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:57:07,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:57:07,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:57:07,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:57:07,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:57:07,621] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step131000/mp_rank_00_model_states.pt +0: [2023-03-17 05:57:07,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:57:07,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:57:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:57:07,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,655] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +1: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +6: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +2: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,660] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +2: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,660] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +4: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +3: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:57:07,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +7: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:57:07,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +5: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:57:07,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step131000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:57:07,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step131000 is ready now! +0: successfully saved checkpoint at iteration 131000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 173.88 +7: iteration 131010/ 173500 | consumed samples: 33538560 | consumed tokens: 68686970880 | elapsed time per iteration (s): 0.18 | learning rate: 4.584E-05 | global batch size: 256 | lm loss: 3.668085E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.685 | TFLOPs: 22.45 | +7: iteration 131020/ 173500 | consumed samples: 33541120 | consumed tokens: 68692213760 | elapsed time per iteration (s): 0.16 | learning rate: 4.583E-05 | global batch size: 256 | lm loss: 3.670887E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.091 | TFLOPs: 24.83 | +7: iteration 131030/ 173500 | consumed samples: 33543680 | consumed tokens: 68697456640 | elapsed time per iteration (s): 0.16 | learning rate: 4.581E-05 | global batch size: 256 | lm loss: 3.664450E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.398 | TFLOPs: 25.84 | +7: iteration 131040/ 173500 | consumed samples: 33546240 | consumed tokens: 68702699520 | elapsed time per iteration (s): 0.16 | learning rate: 4.580E-05 | global batch size: 256 | lm loss: 3.664590E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.707 | TFLOPs: 25.87 | +7: iteration 131050/ 173500 | consumed samples: 33548800 | consumed tokens: 68707942400 | elapsed time per iteration (s): 0.16 | learning rate: 4.579E-05 | global batch size: 256 | lm loss: 3.668414E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.589 | TFLOPs: 25.21 | +7: iteration 131060/ 173500 | consumed samples: 33551360 | consumed tokens: 68713185280 | elapsed time per iteration (s): 0.15 | learning rate: 4.578E-05 | global batch size: 256 | lm loss: 3.673886E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.115 | TFLOPs: 25.96 | +7: iteration 131070/ 173500 | consumed samples: 33553920 | consumed tokens: 68718428160 | elapsed time per iteration (s): 0.15 | learning rate: 4.577E-05 | global batch size: 256 | lm loss: 3.668629E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.848 | TFLOPs: 26.09 | +7: iteration 131080/ 173500 | consumed samples: 33556480 | consumed tokens: 68723671040 | elapsed time per iteration (s): 0.15 | learning rate: 4.576E-05 | global batch size: 256 | lm loss: 3.660918E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.193 | TFLOPs: 26.02 | +7: iteration 131090/ 173500 | consumed samples: 33559040 | consumed tokens: 68728913920 | elapsed time per iteration (s): 0.15 | learning rate: 4.575E-05 | global batch size: 256 | lm loss: 3.661362E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.444 | TFLOPs: 26.07 | +7: iteration 131100/ 173500 | consumed samples: 33561600 | consumed tokens: 68734156800 | elapsed time per iteration (s): 0.16 | learning rate: 4.573E-05 | global batch size: 256 | lm loss: 3.659521E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.986 | TFLOPs: 25.67 | +7: iteration 131110/ 173500 | consumed samples: 33564160 | consumed tokens: 68739399680 | elapsed time per iteration (s): 0.15 | learning rate: 4.572E-05 | global batch size: 256 | lm loss: 3.675797E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.251 | TFLOPs: 26.07 | +7: iteration 131120/ 173500 | consumed samples: 33566720 | consumed tokens: 68744642560 | elapsed time per iteration (s): 0.16 | learning rate: 4.571E-05 | global batch size: 256 | lm loss: 3.668962E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.805 | TFLOPs: 25.72 | +7: iteration 131130/ 173500 | consumed samples: 33569280 | consumed tokens: 68749885440 | elapsed time per iteration (s): 0.16 | learning rate: 4.570E-05 | global batch size: 256 | lm loss: 3.678501E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.389 | TFLOPs: 25.47 | +7: iteration 131140/ 173500 | consumed samples: 33571840 | consumed tokens: 68755128320 | elapsed time per iteration (s): 0.16 | learning rate: 4.569E-05 | global batch size: 256 | lm loss: 3.683485E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.030 | TFLOPs: 25.69 | +7: iteration 131150/ 173500 | consumed samples: 33574400 | consumed tokens: 68760371200 | elapsed time per iteration (s): 0.16 | learning rate: 4.568E-05 | global batch size: 256 | lm loss: 3.663528E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.151 | TFLOPs: 25.80 | +7: iteration 131160/ 173500 | consumed samples: 33576960 | consumed tokens: 68765614080 | elapsed time per iteration (s): 0.16 | learning rate: 4.566E-05 | global batch size: 256 | lm loss: 3.661743E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.881 | TFLOPs: 25.80 | +7: iteration 131170/ 173500 | consumed samples: 33579520 | consumed tokens: 68770856960 | elapsed time per iteration (s): 0.16 | learning rate: 4.565E-05 | global batch size: 256 | lm loss: 3.685462E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.743 | TFLOPs: 25.48 | +7: iteration 131180/ 173500 | consumed samples: 33582080 | consumed tokens: 68776099840 | elapsed time per iteration (s): 0.18 | learning rate: 4.564E-05 | global batch size: 256 | lm loss: 3.678630E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.117 | TFLOPs: 22.40 | +7: iteration 131190/ 173500 | consumed samples: 33584640 | consumed tokens: 68781342720 | elapsed time per iteration (s): 0.15 | learning rate: 4.563E-05 | global batch size: 256 | lm loss: 3.663391E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.775 | TFLOPs: 26.08 | +7: iteration 131200/ 173500 | consumed samples: 33587200 | consumed tokens: 68786585600 | elapsed time per iteration (s): 0.16 | learning rate: 4.562E-05 | global batch size: 256 | lm loss: 3.675586E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.903 | TFLOPs: 25.83 | +7: iteration 131210/ 173500 | consumed samples: 33589760 | consumed tokens: 68791828480 | elapsed time per iteration (s): 0.15 | learning rate: 4.561E-05 | global batch size: 256 | lm loss: 3.662977E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.565 | TFLOPs: 26.07 | +7: iteration 131220/ 173500 | consumed samples: 33592320 | consumed tokens: 68797071360 | elapsed time per iteration (s): 0.16 | learning rate: 4.560E-05 | global batch size: 256 | lm loss: 3.681297E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.402 | TFLOPs: 25.80 | +7: iteration 131230/ 173500 | consumed samples: 33594880 | consumed tokens: 68802314240 | elapsed time per iteration (s): 0.15 | learning rate: 4.558E-05 | global batch size: 256 | lm loss: 3.679177E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.131 | TFLOPs: 26.32 | +7: iteration 131240/ 173500 | consumed samples: 33597440 | consumed tokens: 68807557120 | elapsed time per iteration (s): 0.15 | learning rate: 4.557E-05 | global batch size: 256 | lm loss: 3.658563E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.468 | TFLOPs: 26.32 | +7: iteration 131250/ 173500 | consumed samples: 33600000 | consumed tokens: 68812800000 | elapsed time per iteration (s): 0.15 | learning rate: 4.556E-05 | global batch size: 256 | lm loss: 3.678245E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.287 | TFLOPs: 26.32 | +7: iteration 131260/ 173500 | consumed samples: 33602560 | consumed tokens: 68818042880 | elapsed time per iteration (s): 0.16 | learning rate: 4.555E-05 | global batch size: 256 | lm loss: 3.677763E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.316 | TFLOPs: 25.87 | +7: iteration 131270/ 173500 | consumed samples: 33605120 | consumed tokens: 68823285760 | elapsed time per iteration (s): 0.15 | learning rate: 4.554E-05 | global batch size: 256 | lm loss: 3.685322E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.562 | TFLOPs: 26.32 | +7: iteration 131280/ 173500 | consumed samples: 33607680 | consumed tokens: 68828528640 | elapsed time per iteration (s): 0.15 | learning rate: 4.553E-05 | global batch size: 256 | lm loss: 3.660903E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.239 | TFLOPs: 26.29 | +7: iteration 131290/ 173500 | consumed samples: 33610240 | consumed tokens: 68833771520 | elapsed time per iteration (s): 0.16 | learning rate: 4.552E-05 | global batch size: 256 | lm loss: 3.671020E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.230 | TFLOPs: 25.27 | +7: iteration 131300/ 173500 | consumed samples: 33612800 | consumed tokens: 68839014400 | elapsed time per iteration (s): 0.15 | learning rate: 4.550E-05 | global batch size: 256 | lm loss: 3.673853E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.511 | TFLOPs: 26.21 | +7: iteration 131310/ 173500 | consumed samples: 33615360 | consumed tokens: 68844257280 | elapsed time per iteration (s): 0.16 | learning rate: 4.549E-05 | global batch size: 256 | lm loss: 3.664334E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.505 | TFLOPs: 25.82 | +7: iteration 131320/ 173500 | consumed samples: 33617920 | consumed tokens: 68849500160 | elapsed time per iteration (s): 0.15 | learning rate: 4.548E-05 | global batch size: 256 | lm loss: 3.664653E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.186 | TFLOPs: 26.26 | +7: iteration 131330/ 173500 | consumed samples: 33620480 | consumed tokens: 68854743040 | elapsed time per iteration (s): 0.15 | learning rate: 4.547E-05 | global batch size: 256 | lm loss: 3.689669E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.598 | TFLOPs: 26.23 | +7: iteration 131340/ 173500 | consumed samples: 33623040 | consumed tokens: 68859985920 | elapsed time per iteration (s): 0.16 | learning rate: 4.546E-05 | global batch size: 256 | lm loss: 3.672923E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.488 | TFLOPs: 25.81 | +7: iteration 131350/ 173500 | consumed samples: 33625600 | consumed tokens: 68865228800 | elapsed time per iteration (s): 0.15 | learning rate: 4.545E-05 | global batch size: 256 | lm loss: 3.669355E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.663 | TFLOPs: 26.20 | +7: iteration 131360/ 173500 | consumed samples: 33628160 | consumed tokens: 68870471680 | elapsed time per iteration (s): 0.15 | learning rate: 4.544E-05 | global batch size: 256 | lm loss: 3.667928E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.315 | TFLOPs: 26.24 | +7: iteration 131370/ 173500 | consumed samples: 33630720 | consumed tokens: 68875714560 | elapsed time per iteration (s): 0.15 | learning rate: 4.542E-05 | global batch size: 256 | lm loss: 3.679721E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.463 | TFLOPs: 26.20 | +7: iteration 131380/ 173500 | consumed samples: 33633280 | consumed tokens: 68880957440 | elapsed time per iteration (s): 0.15 | learning rate: 4.541E-05 | global batch size: 256 | lm loss: 3.678744E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.510 | TFLOPs: 26.18 | +7: iteration 131390/ 173500 | consumed samples: 33635840 | consumed tokens: 68886200320 | elapsed time per iteration (s): 0.15 | learning rate: 4.540E-05 | global batch size: 256 | lm loss: 3.671056E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.324 | TFLOPs: 26.27 | +7: iteration 131400/ 173500 | consumed samples: 33638400 | consumed tokens: 68891443200 | elapsed time per iteration (s): 0.15 | learning rate: 4.539E-05 | global batch size: 256 | lm loss: 3.667271E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.066 | TFLOPs: 26.25 | +7: iteration 131410/ 173500 | consumed samples: 33640960 | consumed tokens: 68896686080 | elapsed time per iteration (s): 0.15 | learning rate: 4.538E-05 | global batch size: 256 | lm loss: 3.667537E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.446 | TFLOPs: 26.23 | +7: iteration 131420/ 173500 | consumed samples: 33643520 | consumed tokens: 68901928960 | elapsed time per iteration (s): 0.15 | learning rate: 4.537E-05 | global batch size: 256 | lm loss: 3.662304E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.886 | TFLOPs: 26.25 | +7: iteration 131430/ 173500 | consumed samples: 33646080 | consumed tokens: 68907171840 | elapsed time per iteration (s): 0.15 | learning rate: 4.535E-05 | global batch size: 256 | lm loss: 3.675943E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.104 | TFLOPs: 26.27 | +7: iteration 131440/ 173500 | consumed samples: 33648640 | consumed tokens: 68912414720 | elapsed time per iteration (s): 0.16 | learning rate: 4.534E-05 | global batch size: 256 | lm loss: 3.680443E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.320 | TFLOPs: 25.65 | +7: iteration 131450/ 173500 | consumed samples: 33651200 | consumed tokens: 68917657600 | elapsed time per iteration (s): 0.15 | learning rate: 4.533E-05 | global batch size: 256 | lm loss: 3.663874E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.674 | TFLOPs: 25.92 | +7: iteration 131460/ 173500 | consumed samples: 33653760 | consumed tokens: 68922900480 | elapsed time per iteration (s): 0.16 | learning rate: 4.532E-05 | global batch size: 256 | lm loss: 3.670745E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.075 | TFLOPs: 25.89 | +7: iteration 131470/ 173500 | consumed samples: 33656320 | consumed tokens: 68928143360 | elapsed time per iteration (s): 0.15 | learning rate: 4.531E-05 | global batch size: 256 | lm loss: 3.686614E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.422 | TFLOPs: 26.01 | +7: iteration 131480/ 173500 | consumed samples: 33658880 | consumed tokens: 68933386240 | elapsed time per iteration (s): 0.15 | learning rate: 4.530E-05 | global batch size: 256 | lm loss: 3.670005E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.946 | TFLOPs: 26.24 | +7: iteration 131490/ 173500 | consumed samples: 33661440 | consumed tokens: 68938629120 | elapsed time per iteration (s): 0.15 | learning rate: 4.529E-05 | global batch size: 256 | lm loss: 3.663196E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.979 | TFLOPs: 26.22 | +7: iteration 131500/ 173500 | consumed samples: 33664000 | consumed tokens: 68943872000 | elapsed time per iteration (s): 0.15 | learning rate: 4.527E-05 | global batch size: 256 | lm loss: 3.667176E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.105 | TFLOPs: 25.96 | +7: iteration 131510/ 173500 | consumed samples: 33666560 | consumed tokens: 68949114880 | elapsed time per iteration (s): 0.15 | learning rate: 4.526E-05 | global batch size: 256 | lm loss: 3.681180E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.500 | TFLOPs: 26.24 | +7: iteration 131520/ 173500 | consumed samples: 33669120 | consumed tokens: 68954357760 | elapsed time per iteration (s): 0.15 | learning rate: 4.525E-05 | global batch size: 256 | lm loss: 3.662612E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.130 | TFLOPs: 26.16 | +7: iteration 131530/ 173500 | consumed samples: 33671680 | consumed tokens: 68959600640 | elapsed time per iteration (s): 0.15 | learning rate: 4.524E-05 | global batch size: 256 | lm loss: 3.668679E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.359 | TFLOPs: 25.94 | +7: iteration 131540/ 173500 | consumed samples: 33674240 | consumed tokens: 68964843520 | elapsed time per iteration (s): 0.15 | learning rate: 4.523E-05 | global batch size: 256 | lm loss: 3.661399E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.523 | TFLOPs: 26.17 | +7: iteration 131550/ 173500 | consumed samples: 33676800 | consumed tokens: 68970086400 | elapsed time per iteration (s): 0.15 | learning rate: 4.522E-05 | global batch size: 256 | lm loss: 3.664460E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.404 | TFLOPs: 26.35 | +7: iteration 131560/ 173500 | consumed samples: 33679360 | consumed tokens: 68975329280 | elapsed time per iteration (s): 0.15 | learning rate: 4.521E-05 | global batch size: 256 | lm loss: 3.677674E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.952 | TFLOPs: 26.25 | +7: iteration 131570/ 173500 | consumed samples: 33681920 | consumed tokens: 68980572160 | elapsed time per iteration (s): 0.16 | learning rate: 4.519E-05 | global batch size: 256 | lm loss: 3.662180E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.966 | TFLOPs: 25.86 | +7: iteration 131580/ 173500 | consumed samples: 33684480 | consumed tokens: 68985815040 | elapsed time per iteration (s): 0.15 | learning rate: 4.518E-05 | global batch size: 256 | lm loss: 3.669725E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.031 | TFLOPs: 26.21 | +7: iteration 131590/ 173500 | consumed samples: 33687040 | consumed tokens: 68991057920 | elapsed time per iteration (s): 0.15 | learning rate: 4.517E-05 | global batch size: 256 | lm loss: 3.665529E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.959 | TFLOPs: 26.20 | +7: iteration 131600/ 173500 | consumed samples: 33689600 | consumed tokens: 68996300800 | elapsed time per iteration (s): 0.15 | learning rate: 4.516E-05 | global batch size: 256 | lm loss: 3.671423E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.081 | TFLOPs: 26.21 | +7: iteration 131610/ 173500 | consumed samples: 33692160 | consumed tokens: 69001543680 | elapsed time per iteration (s): 0.15 | learning rate: 4.515E-05 | global batch size: 256 | lm loss: 3.666184E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.296 | TFLOPs: 26.21 | +7: iteration 131620/ 173500 | consumed samples: 33694720 | consumed tokens: 69006786560 | elapsed time per iteration (s): 0.15 | learning rate: 4.514E-05 | global batch size: 256 | lm loss: 3.675120E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.958 | TFLOPs: 26.19 | +7: iteration 131630/ 173500 | consumed samples: 33697280 | consumed tokens: 69012029440 | elapsed time per iteration (s): 0.15 | learning rate: 4.513E-05 | global batch size: 256 | lm loss: 3.684146E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.557 | TFLOPs: 26.18 | +7: iteration 131640/ 173500 | consumed samples: 33699840 | consumed tokens: 69017272320 | elapsed time per iteration (s): 0.16 | learning rate: 4.511E-05 | global batch size: 256 | lm loss: 3.678292E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.638 | TFLOPs: 25.60 | +7: iteration 131650/ 173500 | consumed samples: 33702400 | consumed tokens: 69022515200 | elapsed time per iteration (s): 0.16 | learning rate: 4.510E-05 | global batch size: 256 | lm loss: 3.688128E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.097 | TFLOPs: 25.83 | +7: iteration 131660/ 173500 | consumed samples: 33704960 | consumed tokens: 69027758080 | elapsed time per iteration (s): 0.15 | learning rate: 4.509E-05 | global batch size: 256 | lm loss: 3.668534E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.831 | TFLOPs: 26.19 | +7: iteration 131670/ 173500 | consumed samples: 33707520 | consumed tokens: 69033000960 | elapsed time per iteration (s): 0.15 | learning rate: 4.508E-05 | global batch size: 256 | lm loss: 3.680400E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.451 | TFLOPs: 26.20 | +7: iteration 131680/ 173500 | consumed samples: 33710080 | consumed tokens: 69038243840 | elapsed time per iteration (s): 0.15 | learning rate: 4.507E-05 | global batch size: 256 | lm loss: 3.663182E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.183 | TFLOPs: 26.18 | +7: iteration 131690/ 173500 | consumed samples: 33712640 | consumed tokens: 69043486720 | elapsed time per iteration (s): 0.15 | learning rate: 4.506E-05 | global batch size: 256 | lm loss: 3.672976E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.480 | TFLOPs: 25.93 | +7: iteration 131700/ 173500 | consumed samples: 33715200 | consumed tokens: 69048729600 | elapsed time per iteration (s): 0.15 | learning rate: 4.505E-05 | global batch size: 256 | lm loss: 3.670462E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.926 | TFLOPs: 26.03 | +7: iteration 131710/ 173500 | consumed samples: 33717760 | consumed tokens: 69053972480 | elapsed time per iteration (s): 0.15 | learning rate: 4.504E-05 | global batch size: 256 | lm loss: 3.681228E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.323 | TFLOPs: 26.16 | +7: iteration 131720/ 173500 | consumed samples: 33720320 | consumed tokens: 69059215360 | elapsed time per iteration (s): 0.16 | learning rate: 4.502E-05 | global batch size: 256 | lm loss: 3.659435E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.381 | TFLOPs: 25.87 | +7: iteration 131730/ 173500 | consumed samples: 33722880 | consumed tokens: 69064458240 | elapsed time per iteration (s): 0.15 | learning rate: 4.501E-05 | global batch size: 256 | lm loss: 3.667323E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.804 | TFLOPs: 26.17 | +7: iteration 131740/ 173500 | consumed samples: 33725440 | consumed tokens: 69069701120 | elapsed time per iteration (s): 0.15 | learning rate: 4.500E-05 | global batch size: 256 | lm loss: 3.683937E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.448 | TFLOPs: 26.17 | +7: iteration 131750/ 173500 | consumed samples: 33728000 | consumed tokens: 69074944000 | elapsed time per iteration (s): 0.15 | learning rate: 4.499E-05 | global batch size: 256 | lm loss: 3.674921E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.293 | TFLOPs: 26.12 | +7: iteration 131760/ 173500 | consumed samples: 33730560 | consumed tokens: 69080186880 | elapsed time per iteration (s): 0.15 | learning rate: 4.498E-05 | global batch size: 256 | lm loss: 3.679066E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.066 | TFLOPs: 26.08 | +7: iteration 131770/ 173500 | consumed samples: 33733120 | consumed tokens: 69085429760 | elapsed time per iteration (s): 0.15 | learning rate: 4.497E-05 | global batch size: 256 | lm loss: 3.675924E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.255 | TFLOPs: 25.96 | +7: iteration 131780/ 173500 | consumed samples: 33735680 | consumed tokens: 69090672640 | elapsed time per iteration (s): 0.15 | learning rate: 4.496E-05 | global batch size: 256 | lm loss: 3.672914E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.737 | TFLOPs: 25.98 | +7: iteration 131790/ 173500 | consumed samples: 33738240 | consumed tokens: 69095915520 | elapsed time per iteration (s): 0.16 | learning rate: 4.494E-05 | global batch size: 256 | lm loss: 3.663280E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.750 | TFLOPs: 25.83 | +7: iteration 131800/ 173500 | consumed samples: 33740800 | consumed tokens: 69101158400 | elapsed time per iteration (s): 0.15 | learning rate: 4.493E-05 | global batch size: 256 | lm loss: 3.668850E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.436 | TFLOPs: 25.98 | +7: iteration 131810/ 173500 | consumed samples: 33743360 | consumed tokens: 69106401280 | elapsed time per iteration (s): 0.15 | learning rate: 4.492E-05 | global batch size: 256 | lm loss: 3.661555E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.189 | TFLOPs: 26.18 | +7: iteration 131820/ 173500 | consumed samples: 33745920 | consumed tokens: 69111644160 | elapsed time per iteration (s): 0.15 | learning rate: 4.491E-05 | global batch size: 256 | lm loss: 3.669933E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.816 | TFLOPs: 25.90 | +7: iteration 131830/ 173500 | consumed samples: 33748480 | consumed tokens: 69116887040 | elapsed time per iteration (s): 0.15 | learning rate: 4.490E-05 | global batch size: 256 | lm loss: 3.662181E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.489 | TFLOPs: 26.17 | +7: iteration 131840/ 173500 | consumed samples: 33751040 | consumed tokens: 69122129920 | elapsed time per iteration (s): 0.16 | learning rate: 4.489E-05 | global batch size: 256 | lm loss: 3.655073E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.538 | TFLOPs: 25.56 | +7: iteration 131850/ 173500 | consumed samples: 33753600 | consumed tokens: 69127372800 | elapsed time per iteration (s): 0.15 | learning rate: 4.488E-05 | global batch size: 256 | lm loss: 3.679391E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.473 | TFLOPs: 26.18 | +7: iteration 131860/ 173500 | consumed samples: 33756160 | consumed tokens: 69132615680 | elapsed time per iteration (s): 0.15 | learning rate: 4.486E-05 | global batch size: 256 | lm loss: 3.667073E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.818 | TFLOPs: 26.20 | +7: iteration 131870/ 173500 | consumed samples: 33758720 | consumed tokens: 69137858560 | elapsed time per iteration (s): 0.15 | learning rate: 4.485E-05 | global batch size: 256 | lm loss: 3.680566E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.022 | TFLOPs: 26.11 | +7: iteration 131880/ 173500 | consumed samples: 33761280 | consumed tokens: 69143101440 | elapsed time per iteration (s): 0.15 | learning rate: 4.484E-05 | global batch size: 256 | lm loss: 3.673774E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.917 | TFLOPs: 26.09 | +7: iteration 131890/ 173500 | consumed samples: 33763840 | consumed tokens: 69148344320 | elapsed time per iteration (s): 0.16 | learning rate: 4.483E-05 | global batch size: 256 | lm loss: 3.664090E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.854 | TFLOPs: 25.48 | +7: iteration 131900/ 173500 | consumed samples: 33766400 | consumed tokens: 69153587200 | elapsed time per iteration (s): 0.15 | learning rate: 4.482E-05 | global batch size: 256 | lm loss: 3.669440E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.266 | TFLOPs: 25.99 | +7: iteration 131910/ 173500 | consumed samples: 33768960 | consumed tokens: 69158830080 | elapsed time per iteration (s): 0.15 | learning rate: 4.481E-05 | global batch size: 256 | lm loss: 3.677870E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.366 | TFLOPs: 26.02 | +7: iteration 131920/ 173500 | consumed samples: 33771520 | consumed tokens: 69164072960 | elapsed time per iteration (s): 0.16 | learning rate: 4.480E-05 | global batch size: 256 | lm loss: 3.663814E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.892 | TFLOPs: 25.51 | +7: iteration 131930/ 173500 | consumed samples: 33774080 | consumed tokens: 69169315840 | elapsed time per iteration (s): 0.15 | learning rate: 4.478E-05 | global batch size: 256 | lm loss: 3.668910E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.463 | TFLOPs: 26.10 | +7: iteration 131940/ 173500 | consumed samples: 33776640 | consumed tokens: 69174558720 | elapsed time per iteration (s): 0.16 | learning rate: 4.477E-05 | global batch size: 256 | lm loss: 3.657999E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.669 | TFLOPs: 25.48 | +7: iteration 131950/ 173500 | consumed samples: 33779200 | consumed tokens: 69179801600 | elapsed time per iteration (s): 0.15 | learning rate: 4.476E-05 | global batch size: 256 | lm loss: 3.662516E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.053 | TFLOPs: 26.13 | +7: iteration 131960/ 173500 | consumed samples: 33781760 | consumed tokens: 69185044480 | elapsed time per iteration (s): 0.15 | learning rate: 4.475E-05 | global batch size: 256 | lm loss: 3.688278E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.431 | TFLOPs: 26.10 | +7: iteration 131970/ 173500 | consumed samples: 33784320 | consumed tokens: 69190287360 | elapsed time per iteration (s): 0.16 | learning rate: 4.474E-05 | global batch size: 256 | lm loss: 3.671268E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.748 | TFLOPs: 25.68 | +7: iteration 131980/ 173500 | consumed samples: 33786880 | consumed tokens: 69195530240 | elapsed time per iteration (s): 0.15 | learning rate: 4.473E-05 | global batch size: 256 | lm loss: 3.677117E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.183 | TFLOPs: 26.13 | +7: iteration 131990/ 173500 | consumed samples: 33789440 | consumed tokens: 69200773120 | elapsed time per iteration (s): 0.15 | learning rate: 4.472E-05 | global batch size: 256 | lm loss: 3.672976E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.938 | TFLOPs: 26.03 | +0: [2023-03-17 05:59:42,359] [INFO] [logging.py:68:log_dist] [Rank 0] step=132000, skipped=0, lr=[4.4705599266134565e-05, 4.4705599266134565e-05, 4.4705599266134565e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 132000/ 173500 | consumed samples: 33792000 | consumed tokens: 69206016000 | elapsed time per iteration (s): 0.15 | learning rate: 4.471E-05 | global batch size: 256 | lm loss: 3.663508E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.027 | TFLOPs: 26.16 | +0: steps: 132000 loss: 3.6763 iter time (s): 0.155 samples/sec: 1656.325 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 132000 | lm loss value: 3.844030E+00 | lm loss PPL: 4.671335E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 132000 to checkpoints_44m91b100m +0: [2023-03-17 05:59:42,435] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step132000 is begin to save! +0: [2023-03-17 05:59:42,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_01-model_00-model_states.pt... +0: [2023-03-17 05:59:42,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_01-model_00-model_states.pt. +0: [2023-03-17 05:59:42,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_03-model_00-model_states.pt... +0: [2023-03-17 05:59:42,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_03-model_00-model_states.pt. +0: [2023-03-17 05:59:42,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_04-model_00-model_states.pt... +0: [2023-03-17 05:59:42,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_04-model_00-model_states.pt. +0: [2023-03-17 05:59:42,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_05-model_00-model_states.pt... +0: [2023-03-17 05:59:42,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_05-model_00-model_states.pt. +0: [2023-03-17 05:59:42,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_06-model_00-model_states.pt... +0: [2023-03-17 05:59:42,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_06-model_00-model_states.pt. +0: [2023-03-17 05:59:42,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_07-model_00-model_states.pt... +0: [2023-03-17 05:59:42,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_07-model_00-model_states.pt. +0: [2023-03-17 05:59:42,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_08-model_00-model_states.pt... +0: [2023-03-17 05:59:42,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_08-model_00-model_states.pt. +0: [2023-03-17 05:59:42,555] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_09-model_00-model_states.pt... +0: [2023-03-17 05:59:42,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_09-model_00-model_states.pt. +0: [2023-03-17 05:59:42,563] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_10-model_00-model_states.pt... +0: [2023-03-17 05:59:42,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_10-model_00-model_states.pt. +0: [2023-03-17 05:59:42,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/layer_12-model_00-model_states.pt... +0: [2023-03-17 05:59:42,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/layer_12-model_00-model_states.pt. +0: [2023-03-17 05:59:42,573] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step132000/mp_rank_00_model_states.pt +0: [2023-03-17 05:59:42,573] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/mp_rank_00_model_states.pt... +0: [2023-03-17 05:59:42,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/mp_rank_00_model_states.pt. +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +6: [2023-03-17 05:59:42,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 05:59:42,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +6: [2023-03-17 05:59:42,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +2: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +1: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +5: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +7: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 05:59:42,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +3: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 05:59:42,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +4: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 05:59:42,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step132000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 05:59:42,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step132000 is ready now! +0: successfully saved checkpoint at iteration 132000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.90 +7: iteration 132010/ 173500 | consumed samples: 33794560 | consumed tokens: 69211258880 | elapsed time per iteration (s): 0.18 | learning rate: 4.469E-05 | global batch size: 256 | lm loss: 3.671690E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.037 | TFLOPs: 22.46 | +7: iteration 132020/ 173500 | consumed samples: 33797120 | consumed tokens: 69216501760 | elapsed time per iteration (s): 0.15 | learning rate: 4.468E-05 | global batch size: 256 | lm loss: 3.674839E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.846 | TFLOPs: 26.17 | +7: iteration 132030/ 173500 | consumed samples: 33799680 | consumed tokens: 69221744640 | elapsed time per iteration (s): 0.15 | learning rate: 4.467E-05 | global batch size: 256 | lm loss: 3.667689E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.135 | TFLOPs: 26.16 | +7: iteration 132040/ 173500 | consumed samples: 33802240 | consumed tokens: 69226987520 | elapsed time per iteration (s): 0.15 | learning rate: 4.466E-05 | global batch size: 256 | lm loss: 3.663112E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.516 | TFLOPs: 26.17 | +7: iteration 132050/ 173500 | consumed samples: 33804800 | consumed tokens: 69232230400 | elapsed time per iteration (s): 0.15 | learning rate: 4.465E-05 | global batch size: 256 | lm loss: 3.674384E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.463 | TFLOPs: 26.17 | +7: iteration 132060/ 173500 | consumed samples: 33807360 | consumed tokens: 69237473280 | elapsed time per iteration (s): 0.16 | learning rate: 4.464E-05 | global batch size: 256 | lm loss: 3.671841E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.144 | TFLOPs: 25.44 | +7: iteration 132070/ 173500 | consumed samples: 33809920 | consumed tokens: 69242716160 | elapsed time per iteration (s): 0.15 | learning rate: 4.463E-05 | global batch size: 256 | lm loss: 3.681935E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.943 | TFLOPs: 26.14 | +7: iteration 132080/ 173500 | consumed samples: 33812480 | consumed tokens: 69247959040 | elapsed time per iteration (s): 0.15 | learning rate: 4.462E-05 | global batch size: 256 | lm loss: 3.670577E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.252 | TFLOPs: 26.16 | +7: iteration 132090/ 173500 | consumed samples: 33815040 | consumed tokens: 69253201920 | elapsed time per iteration (s): 0.15 | learning rate: 4.460E-05 | global batch size: 256 | lm loss: 3.680320E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.867 | TFLOPs: 26.12 | +7: iteration 132100/ 173500 | consumed samples: 33817600 | consumed tokens: 69258444800 | elapsed time per iteration (s): 0.15 | learning rate: 4.459E-05 | global batch size: 256 | lm loss: 3.670938E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.977 | TFLOPs: 26.19 | +7: iteration 132110/ 173500 | consumed samples: 33820160 | consumed tokens: 69263687680 | elapsed time per iteration (s): 0.15 | learning rate: 4.458E-05 | global batch size: 256 | lm loss: 3.669877E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.847 | TFLOPs: 26.16 | +7: iteration 132120/ 173500 | consumed samples: 33822720 | consumed tokens: 69268930560 | elapsed time per iteration (s): 0.15 | learning rate: 4.457E-05 | global batch size: 256 | lm loss: 3.669030E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.304 | TFLOPs: 26.08 | +7: iteration 132130/ 173500 | consumed samples: 33825280 | consumed tokens: 69274173440 | elapsed time per iteration (s): 0.15 | learning rate: 4.456E-05 | global batch size: 256 | lm loss: 3.660387E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.771 | TFLOPs: 26.19 | +7: iteration 132140/ 173500 | consumed samples: 33827840 | consumed tokens: 69279416320 | elapsed time per iteration (s): 0.16 | learning rate: 4.455E-05 | global batch size: 256 | lm loss: 3.676262E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.793 | TFLOPs: 25.87 | +7: iteration 132150/ 173500 | consumed samples: 33830400 | consumed tokens: 69284659200 | elapsed time per iteration (s): 0.15 | learning rate: 4.454E-05 | global batch size: 256 | lm loss: 3.673603E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.222 | TFLOPs: 26.19 | +7: iteration 132160/ 173500 | consumed samples: 33832960 | consumed tokens: 69289902080 | elapsed time per iteration (s): 0.15 | learning rate: 4.452E-05 | global batch size: 256 | lm loss: 3.677691E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.557 | TFLOPs: 26.18 | +7: iteration 132170/ 173500 | consumed samples: 33835520 | consumed tokens: 69295144960 | elapsed time per iteration (s): 0.15 | learning rate: 4.451E-05 | global batch size: 256 | lm loss: 3.671295E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.829 | TFLOPs: 26.19 | +7: iteration 132180/ 173500 | consumed samples: 33838080 | consumed tokens: 69300387840 | elapsed time per iteration (s): 0.15 | learning rate: 4.450E-05 | global batch size: 256 | lm loss: 3.661266E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.296 | TFLOPs: 26.23 | +7: iteration 132190/ 173500 | consumed samples: 33840640 | consumed tokens: 69305630720 | elapsed time per iteration (s): 0.15 | learning rate: 4.449E-05 | global batch size: 256 | lm loss: 3.686788E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.413 | TFLOPs: 26.21 | +7: iteration 132200/ 173500 | consumed samples: 33843200 | consumed tokens: 69310873600 | elapsed time per iteration (s): 0.16 | learning rate: 4.448E-05 | global batch size: 256 | lm loss: 3.678951E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.334 | TFLOPs: 25.80 | +7: iteration 132210/ 173500 | consumed samples: 33845760 | consumed tokens: 69316116480 | elapsed time per iteration (s): 0.15 | learning rate: 4.447E-05 | global batch size: 256 | lm loss: 3.661207E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.784 | TFLOPs: 26.14 | +7: iteration 132220/ 173500 | consumed samples: 33848320 | consumed tokens: 69321359360 | elapsed time per iteration (s): 0.15 | learning rate: 4.446E-05 | global batch size: 256 | lm loss: 3.658083E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.955 | TFLOPs: 26.14 | +7: iteration 132230/ 173500 | consumed samples: 33850880 | consumed tokens: 69326602240 | elapsed time per iteration (s): 0.15 | learning rate: 4.445E-05 | global batch size: 256 | lm loss: 3.671871E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.448 | TFLOPs: 26.15 | +7: iteration 132240/ 173500 | consumed samples: 33853440 | consumed tokens: 69331845120 | elapsed time per iteration (s): 0.16 | learning rate: 4.443E-05 | global batch size: 256 | lm loss: 3.683114E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.676 | TFLOPs: 25.56 | +7: iteration 132250/ 173500 | consumed samples: 33856000 | consumed tokens: 69337088000 | elapsed time per iteration (s): 0.15 | learning rate: 4.442E-05 | global batch size: 256 | lm loss: 3.680190E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.312 | TFLOPs: 26.16 | +7: iteration 132260/ 173500 | consumed samples: 33858560 | consumed tokens: 69342330880 | elapsed time per iteration (s): 0.16 | learning rate: 4.441E-05 | global batch size: 256 | lm loss: 3.666059E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.786 | TFLOPs: 25.45 | +7: iteration 132270/ 173500 | consumed samples: 33861120 | consumed tokens: 69347573760 | elapsed time per iteration (s): 0.16 | learning rate: 4.440E-05 | global batch size: 256 | lm loss: 3.674903E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.405 | TFLOPs: 25.85 | +7: iteration 132280/ 173500 | consumed samples: 33863680 | consumed tokens: 69352816640 | elapsed time per iteration (s): 0.15 | learning rate: 4.439E-05 | global batch size: 256 | lm loss: 3.677163E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.668 | TFLOPs: 26.14 | +7: iteration 132290/ 173500 | consumed samples: 33866240 | consumed tokens: 69358059520 | elapsed time per iteration (s): 0.16 | learning rate: 4.438E-05 | global batch size: 256 | lm loss: 3.659201E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.707 | TFLOPs: 25.35 | +7: iteration 132300/ 173500 | consumed samples: 33868800 | consumed tokens: 69363302400 | elapsed time per iteration (s): 0.15 | learning rate: 4.437E-05 | global batch size: 256 | lm loss: 3.676122E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.369 | TFLOPs: 26.16 | +7: iteration 132310/ 173500 | consumed samples: 33871360 | consumed tokens: 69368545280 | elapsed time per iteration (s): 0.15 | learning rate: 4.436E-05 | global batch size: 256 | lm loss: 3.661357E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.009 | TFLOPs: 26.17 | +7: iteration 132320/ 173500 | consumed samples: 33873920 | consumed tokens: 69373788160 | elapsed time per iteration (s): 0.15 | learning rate: 4.434E-05 | global batch size: 256 | lm loss: 3.670494E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.764 | TFLOPs: 26.09 | +7: iteration 132330/ 173500 | consumed samples: 33876480 | consumed tokens: 69379031040 | elapsed time per iteration (s): 0.15 | learning rate: 4.433E-05 | global batch size: 256 | lm loss: 3.662291E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.414 | TFLOPs: 26.13 | +7: iteration 132340/ 173500 | consumed samples: 33879040 | consumed tokens: 69384273920 | elapsed time per iteration (s): 0.15 | learning rate: 4.432E-05 | global batch size: 256 | lm loss: 3.663964E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.930 | TFLOPs: 26.17 | +7: iteration 132350/ 173500 | consumed samples: 33881600 | consumed tokens: 69389516800 | elapsed time per iteration (s): 0.16 | learning rate: 4.431E-05 | global batch size: 256 | lm loss: 3.670932E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.330 | TFLOPs: 25.41 | +7: iteration 132360/ 173500 | consumed samples: 33884160 | consumed tokens: 69394759680 | elapsed time per iteration (s): 0.15 | learning rate: 4.430E-05 | global batch size: 256 | lm loss: 3.682326E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.456 | TFLOPs: 26.15 | +7: iteration 132370/ 173500 | consumed samples: 33886720 | consumed tokens: 69400002560 | elapsed time per iteration (s): 0.15 | learning rate: 4.429E-05 | global batch size: 256 | lm loss: 3.675572E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.867 | TFLOPs: 26.14 | +7: iteration 132380/ 173500 | consumed samples: 33889280 | consumed tokens: 69405245440 | elapsed time per iteration (s): 0.15 | learning rate: 4.428E-05 | global batch size: 256 | lm loss: 3.671531E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.298 | TFLOPs: 26.15 | +7: iteration 132390/ 173500 | consumed samples: 33891840 | consumed tokens: 69410488320 | elapsed time per iteration (s): 0.15 | learning rate: 4.427E-05 | global batch size: 256 | lm loss: 3.671066E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.238 | TFLOPs: 26.16 | +7: iteration 132400/ 173500 | consumed samples: 33894400 | consumed tokens: 69415731200 | elapsed time per iteration (s): 0.15 | learning rate: 4.425E-05 | global batch size: 256 | lm loss: 3.667881E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.124 | TFLOPs: 26.16 | +7: iteration 132410/ 173500 | consumed samples: 33896960 | consumed tokens: 69420974080 | elapsed time per iteration (s): 0.15 | learning rate: 4.424E-05 | global batch size: 256 | lm loss: 3.654855E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.576 | TFLOPs: 26.14 | +7: iteration 132420/ 173500 | consumed samples: 33899520 | consumed tokens: 69426216960 | elapsed time per iteration (s): 0.15 | learning rate: 4.423E-05 | global batch size: 256 | lm loss: 3.687844E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.132 | TFLOPs: 26.08 | +7: iteration 132430/ 173500 | consumed samples: 33902080 | consumed tokens: 69431459840 | elapsed time per iteration (s): 0.15 | learning rate: 4.422E-05 | global batch size: 256 | lm loss: 3.660101E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.130 | TFLOPs: 26.08 | +7: iteration 132440/ 173500 | consumed samples: 33904640 | consumed tokens: 69436702720 | elapsed time per iteration (s): 0.15 | learning rate: 4.421E-05 | global batch size: 256 | lm loss: 3.664457E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.285 | TFLOPs: 26.10 | +7: iteration 132450/ 173500 | consumed samples: 33907200 | consumed tokens: 69441945600 | elapsed time per iteration (s): 0.16 | learning rate: 4.420E-05 | global batch size: 256 | lm loss: 3.671861E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.905 | TFLOPs: 25.69 | +7: iteration 132460/ 173500 | consumed samples: 33909760 | consumed tokens: 69447188480 | elapsed time per iteration (s): 0.15 | learning rate: 4.419E-05 | global batch size: 256 | lm loss: 3.664363E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.458 | TFLOPs: 26.07 | +7: iteration 132470/ 173500 | consumed samples: 33912320 | consumed tokens: 69452431360 | elapsed time per iteration (s): 0.15 | learning rate: 4.418E-05 | global batch size: 256 | lm loss: 3.660224E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.875 | TFLOPs: 26.02 | +7: iteration 132480/ 173500 | consumed samples: 33914880 | consumed tokens: 69457674240 | elapsed time per iteration (s): 0.15 | learning rate: 4.416E-05 | global batch size: 256 | lm loss: 3.683486E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.282 | TFLOPs: 26.08 | +7: iteration 132490/ 173500 | consumed samples: 33917440 | consumed tokens: 69462917120 | elapsed time per iteration (s): 0.15 | learning rate: 4.415E-05 | global batch size: 256 | lm loss: 3.670120E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.580 | TFLOPs: 26.10 | +7: iteration 132500/ 173500 | consumed samples: 33920000 | consumed tokens: 69468160000 | elapsed time per iteration (s): 0.15 | learning rate: 4.414E-05 | global batch size: 256 | lm loss: 3.681193E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.790 | TFLOPs: 26.11 | +7: iteration 132510/ 173500 | consumed samples: 33922560 | consumed tokens: 69473402880 | elapsed time per iteration (s): 0.15 | learning rate: 4.413E-05 | global batch size: 256 | lm loss: 3.685208E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.347 | TFLOPs: 26.07 | +7: iteration 132520/ 173500 | consumed samples: 33925120 | consumed tokens: 69478645760 | elapsed time per iteration (s): 0.15 | learning rate: 4.412E-05 | global batch size: 256 | lm loss: 3.659292E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.562 | TFLOPs: 26.09 | +7: iteration 132530/ 173500 | consumed samples: 33927680 | consumed tokens: 69483888640 | elapsed time per iteration (s): 0.16 | learning rate: 4.411E-05 | global batch size: 256 | lm loss: 3.669690E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.638 | TFLOPs: 25.53 | +7: iteration 132540/ 173500 | consumed samples: 33930240 | consumed tokens: 69489131520 | elapsed time per iteration (s): 0.16 | learning rate: 4.410E-05 | global batch size: 256 | lm loss: 3.656953E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.447 | TFLOPs: 25.69 | +7: iteration 132550/ 173500 | consumed samples: 33932800 | consumed tokens: 69494374400 | elapsed time per iteration (s): 0.15 | learning rate: 4.409E-05 | global batch size: 256 | lm loss: 3.676548E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.727 | TFLOPs: 26.04 | +7: iteration 132560/ 173500 | consumed samples: 33935360 | consumed tokens: 69499617280 | elapsed time per iteration (s): 0.15 | learning rate: 4.407E-05 | global batch size: 256 | lm loss: 3.657460E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.273 | TFLOPs: 26.07 | +7: iteration 132570/ 173500 | consumed samples: 33937920 | consumed tokens: 69504860160 | elapsed time per iteration (s): 0.16 | learning rate: 4.406E-05 | global batch size: 256 | lm loss: 3.662592E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.373 | TFLOPs: 25.69 | +7: iteration 132580/ 173500 | consumed samples: 33940480 | consumed tokens: 69510103040 | elapsed time per iteration (s): 0.16 | learning rate: 4.405E-05 | global batch size: 256 | lm loss: 3.674150E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.471 | TFLOPs: 25.71 | +7: iteration 132590/ 173500 | consumed samples: 33943040 | consumed tokens: 69515345920 | elapsed time per iteration (s): 0.15 | learning rate: 4.404E-05 | global batch size: 256 | lm loss: 3.677354E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.943 | TFLOPs: 26.06 | +7: iteration 132600/ 173500 | consumed samples: 33945600 | consumed tokens: 69520588800 | elapsed time per iteration (s): 0.15 | learning rate: 4.403E-05 | global batch size: 256 | lm loss: 3.668527E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.622 | TFLOPs: 26.09 | +7: iteration 132610/ 173500 | consumed samples: 33948160 | consumed tokens: 69525831680 | elapsed time per iteration (s): 0.15 | learning rate: 4.402E-05 | global batch size: 256 | lm loss: 3.659200E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.748 | TFLOPs: 26.09 | +7: iteration 132620/ 173500 | consumed samples: 33950720 | consumed tokens: 69531074560 | elapsed time per iteration (s): 0.15 | learning rate: 4.401E-05 | global batch size: 256 | lm loss: 3.659173E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.896 | TFLOPs: 26.08 | +7: iteration 132630/ 173500 | consumed samples: 33953280 | consumed tokens: 69536317440 | elapsed time per iteration (s): 0.16 | learning rate: 4.400E-05 | global batch size: 256 | lm loss: 3.667617E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.427 | TFLOPs: 25.32 | +7: iteration 132640/ 173500 | consumed samples: 33955840 | consumed tokens: 69541560320 | elapsed time per iteration (s): 0.16 | learning rate: 4.399E-05 | global batch size: 256 | lm loss: 3.668652E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.033 | TFLOPs: 25.85 | +7: iteration 132650/ 173500 | consumed samples: 33958400 | consumed tokens: 69546803200 | elapsed time per iteration (s): 0.16 | learning rate: 4.397E-05 | global batch size: 256 | lm loss: 3.665088E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.468 | TFLOPs: 25.35 | +7: iteration 132660/ 173500 | consumed samples: 33960960 | consumed tokens: 69552046080 | elapsed time per iteration (s): 0.15 | learning rate: 4.396E-05 | global batch size: 256 | lm loss: 3.680035E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.524 | TFLOPs: 26.04 | +7: iteration 132670/ 173500 | consumed samples: 33963520 | consumed tokens: 69557288960 | elapsed time per iteration (s): 0.15 | learning rate: 4.395E-05 | global batch size: 256 | lm loss: 3.663719E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.756 | TFLOPs: 26.11 | +7: iteration 132680/ 173500 | consumed samples: 33966080 | consumed tokens: 69562531840 | elapsed time per iteration (s): 0.16 | learning rate: 4.394E-05 | global batch size: 256 | lm loss: 3.672773E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.010 | TFLOPs: 25.64 | +7: iteration 132690/ 173500 | consumed samples: 33968640 | consumed tokens: 69567774720 | elapsed time per iteration (s): 0.15 | learning rate: 4.393E-05 | global batch size: 256 | lm loss: 3.678348E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.767 | TFLOPs: 26.11 | +7: iteration 132700/ 173500 | consumed samples: 33971200 | consumed tokens: 69573017600 | elapsed time per iteration (s): 0.15 | learning rate: 4.392E-05 | global batch size: 256 | lm loss: 3.662047E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.303 | TFLOPs: 26.12 | +7: iteration 132710/ 173500 | consumed samples: 33973760 | consumed tokens: 69578260480 | elapsed time per iteration (s): 0.15 | learning rate: 4.391E-05 | global batch size: 256 | lm loss: 3.676649E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.863 | TFLOPs: 26.12 | +7: iteration 132720/ 173500 | consumed samples: 33976320 | consumed tokens: 69583503360 | elapsed time per iteration (s): 0.16 | learning rate: 4.390E-05 | global batch size: 256 | lm loss: 3.676697E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.475 | TFLOPs: 25.41 | +7: iteration 132730/ 173500 | consumed samples: 33978880 | consumed tokens: 69588746240 | elapsed time per iteration (s): 0.15 | learning rate: 4.388E-05 | global batch size: 256 | lm loss: 3.676392E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.714 | TFLOPs: 26.14 | +7: iteration 132740/ 173500 | consumed samples: 33981440 | consumed tokens: 69593989120 | elapsed time per iteration (s): 0.15 | learning rate: 4.387E-05 | global batch size: 256 | lm loss: 3.674901E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.532 | TFLOPs: 26.15 | +7: iteration 132750/ 173500 | consumed samples: 33984000 | consumed tokens: 69599232000 | elapsed time per iteration (s): 0.15 | learning rate: 4.386E-05 | global batch size: 256 | lm loss: 3.673888E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.107 | TFLOPs: 26.18 | +7: iteration 132760/ 173500 | consumed samples: 33986560 | consumed tokens: 69604474880 | elapsed time per iteration (s): 0.15 | learning rate: 4.385E-05 | global batch size: 256 | lm loss: 3.672099E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.754 | TFLOPs: 26.19 | +7: iteration 132770/ 173500 | consumed samples: 33989120 | consumed tokens: 69609717760 | elapsed time per iteration (s): 0.15 | learning rate: 4.384E-05 | global batch size: 256 | lm loss: 3.677640E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.992 | TFLOPs: 26.17 | +7: iteration 132780/ 173500 | consumed samples: 33991680 | consumed tokens: 69614960640 | elapsed time per iteration (s): 0.15 | learning rate: 4.383E-05 | global batch size: 256 | lm loss: 3.680891E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.375 | TFLOPs: 26.18 | +7: iteration 132790/ 173500 | consumed samples: 33994240 | consumed tokens: 69620203520 | elapsed time per iteration (s): 0.15 | learning rate: 4.382E-05 | global batch size: 256 | lm loss: 3.670359E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.119 | TFLOPs: 26.18 | +7: iteration 132800/ 173500 | consumed samples: 33996800 | consumed tokens: 69625446400 | elapsed time per iteration (s): 0.15 | learning rate: 4.381E-05 | global batch size: 256 | lm loss: 3.667336E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.298 | TFLOPs: 26.12 | +7: iteration 132810/ 173500 | consumed samples: 33999360 | consumed tokens: 69630689280 | elapsed time per iteration (s): 0.15 | learning rate: 4.380E-05 | global batch size: 256 | lm loss: 3.674071E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.734 | TFLOPs: 26.14 | +7: iteration 132820/ 173500 | consumed samples: 34001920 | consumed tokens: 69635932160 | elapsed time per iteration (s): 0.16 | learning rate: 4.378E-05 | global batch size: 256 | lm loss: 3.668777E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.248 | TFLOPs: 25.90 | +7: iteration 132830/ 173500 | consumed samples: 34004480 | consumed tokens: 69641175040 | elapsed time per iteration (s): 0.15 | learning rate: 4.377E-05 | global batch size: 256 | lm loss: 3.661450E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.450 | TFLOPs: 26.12 | +7: iteration 132840/ 173500 | consumed samples: 34007040 | consumed tokens: 69646417920 | elapsed time per iteration (s): 0.15 | learning rate: 4.376E-05 | global batch size: 256 | lm loss: 3.664638E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.540 | TFLOPs: 26.14 | +7: iteration 132850/ 173500 | consumed samples: 34009600 | consumed tokens: 69651660800 | elapsed time per iteration (s): 0.15 | learning rate: 4.375E-05 | global batch size: 256 | lm loss: 3.664495E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.768 | TFLOPs: 26.11 | +7: iteration 132860/ 173500 | consumed samples: 34012160 | consumed tokens: 69656903680 | elapsed time per iteration (s): 0.16 | learning rate: 4.374E-05 | global batch size: 256 | lm loss: 3.672309E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.683 | TFLOPs: 25.84 | +7: iteration 132870/ 173500 | consumed samples: 34014720 | consumed tokens: 69662146560 | elapsed time per iteration (s): 0.15 | learning rate: 4.373E-05 | global batch size: 256 | lm loss: 3.662165E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.836 | TFLOPs: 26.12 | +7: iteration 132880/ 173500 | consumed samples: 34017280 | consumed tokens: 69667389440 | elapsed time per iteration (s): 0.16 | learning rate: 4.372E-05 | global batch size: 256 | lm loss: 3.682411E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.099 | TFLOPs: 25.83 | +7: iteration 132890/ 173500 | consumed samples: 34019840 | consumed tokens: 69672632320 | elapsed time per iteration (s): 0.15 | learning rate: 4.371E-05 | global batch size: 256 | lm loss: 3.665181E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.451 | TFLOPs: 26.17 | +7: iteration 132900/ 173500 | consumed samples: 34022400 | consumed tokens: 69677875200 | elapsed time per iteration (s): 0.16 | learning rate: 4.369E-05 | global batch size: 256 | lm loss: 3.664393E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.932 | TFLOPs: 25.55 | +7: iteration 132910/ 173500 | consumed samples: 34024960 | consumed tokens: 69683118080 | elapsed time per iteration (s): 0.15 | learning rate: 4.368E-05 | global batch size: 256 | lm loss: 3.675475E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.073 | TFLOPs: 26.11 | +7: iteration 132920/ 173500 | consumed samples: 34027520 | consumed tokens: 69688360960 | elapsed time per iteration (s): 0.15 | learning rate: 4.367E-05 | global batch size: 256 | lm loss: 3.668871E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.098 | TFLOPs: 26.11 | +7: iteration 132930/ 173500 | consumed samples: 34030080 | consumed tokens: 69693603840 | elapsed time per iteration (s): 0.16 | learning rate: 4.366E-05 | global batch size: 256 | lm loss: 3.671595E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.938 | TFLOPs: 24.95 | +7: iteration 132940/ 173500 | consumed samples: 34032640 | consumed tokens: 69698846720 | elapsed time per iteration (s): 0.15 | learning rate: 4.365E-05 | global batch size: 256 | lm loss: 3.671292E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.489 | TFLOPs: 26.12 | +7: iteration 132950/ 173500 | consumed samples: 34035200 | consumed tokens: 69704089600 | elapsed time per iteration (s): 0.16 | learning rate: 4.364E-05 | global batch size: 256 | lm loss: 3.672368E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.441 | TFLOPs: 25.69 | +7: iteration 132960/ 173500 | consumed samples: 34037760 | consumed tokens: 69709332480 | elapsed time per iteration (s): 0.15 | learning rate: 4.363E-05 | global batch size: 256 | lm loss: 3.671037E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.545 | TFLOPs: 26.09 | +7: iteration 132970/ 173500 | consumed samples: 34040320 | consumed tokens: 69714575360 | elapsed time per iteration (s): 0.15 | learning rate: 4.362E-05 | global batch size: 256 | lm loss: 3.668688E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.825 | TFLOPs: 26.01 | +7: iteration 132980/ 173500 | consumed samples: 34042880 | consumed tokens: 69719818240 | elapsed time per iteration (s): 0.15 | learning rate: 4.361E-05 | global batch size: 256 | lm loss: 3.684861E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.489 | TFLOPs: 26.09 | +7: iteration 132990/ 173500 | consumed samples: 34045440 | consumed tokens: 69725061120 | elapsed time per iteration (s): 0.16 | learning rate: 4.359E-05 | global batch size: 256 | lm loss: 3.668781E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.421 | TFLOPs: 25.49 | +7: iteration 133000/ 173500 | consumed samples: 34048000 | consumed tokens: 69730304000 | elapsed time per iteration (s): 0.16 | learning rate: 4.358E-05 | global batch size: 256 | lm loss: 3.673212E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.352 | TFLOPs: 25.69 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 133000 | lm loss value: 3.803735E+00 | lm loss PPL: 4.486848E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 133000 to checkpoints_44m91b100m +0: [2023-03-17 06:02:17,146] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step133000 is begin to save! +0: [2023-03-17 06:02:17,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:02:17,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:02:17,214] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:02:17,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:02:17,223] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:02:17,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:02:17,232] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:02:17,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:02:17,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:02:17,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:02:17,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:02:17,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:02:17,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:02:17,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:02:17,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:02:17,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:02:17,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:02:17,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:02:17,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:02:17,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:02:17,282] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step133000/mp_rank_00_model_states.pt +0: [2023-03-17 06:02:17,282] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:02:17,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:02:17,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:02:17,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +4: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 06:02:17,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +6: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +2: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +3: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +5: [2023-03-17 06:02:17,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +1: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:02:17,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +7: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:02:17,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step133000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:02:17,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step133000 is ready now! +0: successfully saved checkpoint at iteration 133000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.69 +7: iteration 133010/ 173500 | consumed samples: 34050560 | consumed tokens: 69735546880 | elapsed time per iteration (s): 0.18 | learning rate: 4.357E-05 | global batch size: 256 | lm loss: 3.680018E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1447.643 | TFLOPs: 22.70 | +7: iteration 133020/ 173500 | consumed samples: 34053120 | consumed tokens: 69740789760 | elapsed time per iteration (s): 0.17 | learning rate: 4.356E-05 | global batch size: 256 | lm loss: 3.668691E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1485.388 | TFLOPs: 23.29 | +7: iteration 133030/ 173500 | consumed samples: 34055680 | consumed tokens: 69746032640 | elapsed time per iteration (s): 0.15 | learning rate: 4.355E-05 | global batch size: 256 | lm loss: 3.676554E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.388 | TFLOPs: 26.09 | +7: iteration 133040/ 173500 | consumed samples: 34058240 | consumed tokens: 69751275520 | elapsed time per iteration (s): 0.16 | learning rate: 4.354E-05 | global batch size: 256 | lm loss: 3.670857E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.644 | TFLOPs: 24.35 | +7: iteration 133050/ 173500 | consumed samples: 34060800 | consumed tokens: 69756518400 | elapsed time per iteration (s): 0.16 | learning rate: 4.353E-05 | global batch size: 256 | lm loss: 3.679962E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.897 | TFLOPs: 25.59 | +7: iteration 133060/ 173500 | consumed samples: 34063360 | consumed tokens: 69761761280 | elapsed time per iteration (s): 0.16 | learning rate: 4.352E-05 | global batch size: 256 | lm loss: 3.675315E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.507 | TFLOPs: 25.54 | +7: iteration 133070/ 173500 | consumed samples: 34065920 | consumed tokens: 69767004160 | elapsed time per iteration (s): 0.16 | learning rate: 4.351E-05 | global batch size: 256 | lm loss: 3.670411E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.892 | TFLOPs: 25.89 | +7: iteration 133080/ 173500 | consumed samples: 34068480 | consumed tokens: 69772247040 | elapsed time per iteration (s): 0.15 | learning rate: 4.349E-05 | global batch size: 256 | lm loss: 3.663525E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.481 | TFLOPs: 26.10 | +7: iteration 133090/ 173500 | consumed samples: 34071040 | consumed tokens: 69777489920 | elapsed time per iteration (s): 0.15 | learning rate: 4.348E-05 | global batch size: 256 | lm loss: 3.667300E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.691 | TFLOPs: 26.08 | +7: iteration 133100/ 173500 | consumed samples: 34073600 | consumed tokens: 69782732800 | elapsed time per iteration (s): 0.15 | learning rate: 4.347E-05 | global batch size: 256 | lm loss: 3.674517E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.135 | TFLOPs: 26.10 | +7: iteration 133110/ 173500 | consumed samples: 34076160 | consumed tokens: 69787975680 | elapsed time per iteration (s): 0.16 | learning rate: 4.346E-05 | global batch size: 256 | lm loss: 3.662804E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.643 | TFLOPs: 25.68 | +7: iteration 133120/ 173500 | consumed samples: 34078720 | consumed tokens: 69793218560 | elapsed time per iteration (s): 0.16 | learning rate: 4.345E-05 | global batch size: 256 | lm loss: 3.666050E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.879 | TFLOPs: 24.87 | +7: iteration 133130/ 173500 | consumed samples: 34081280 | consumed tokens: 69798461440 | elapsed time per iteration (s): 0.15 | learning rate: 4.344E-05 | global batch size: 256 | lm loss: 3.675918E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.386 | TFLOPs: 26.02 | +7: iteration 133140/ 173500 | consumed samples: 34083840 | consumed tokens: 69803704320 | elapsed time per iteration (s): 0.16 | learning rate: 4.343E-05 | global batch size: 256 | lm loss: 3.666750E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.510 | TFLOPs: 25.35 | +7: iteration 133150/ 173500 | consumed samples: 34086400 | consumed tokens: 69808947200 | elapsed time per iteration (s): 0.15 | learning rate: 4.342E-05 | global batch size: 256 | lm loss: 3.657337E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.903 | TFLOPs: 26.00 | +7: iteration 133160/ 173500 | consumed samples: 34088960 | consumed tokens: 69814190080 | elapsed time per iteration (s): 0.17 | learning rate: 4.341E-05 | global batch size: 256 | lm loss: 3.675832E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.587 | TFLOPs: 23.02 | +7: iteration 133170/ 173500 | consumed samples: 34091520 | consumed tokens: 69819432960 | elapsed time per iteration (s): 0.17 | learning rate: 4.340E-05 | global batch size: 256 | lm loss: 3.684630E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.030 | TFLOPs: 24.12 | +7: iteration 133180/ 173500 | consumed samples: 34094080 | consumed tokens: 69824675840 | elapsed time per iteration (s): 0.16 | learning rate: 4.338E-05 | global batch size: 256 | lm loss: 3.666649E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.743 | TFLOPs: 25.79 | +7: iteration 133190/ 173500 | consumed samples: 34096640 | consumed tokens: 69829918720 | elapsed time per iteration (s): 0.15 | learning rate: 4.337E-05 | global batch size: 256 | lm loss: 3.681997E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.286 | TFLOPs: 26.15 | +7: iteration 133200/ 173500 | consumed samples: 34099200 | consumed tokens: 69835161600 | elapsed time per iteration (s): 0.15 | learning rate: 4.336E-05 | global batch size: 256 | lm loss: 3.673480E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.384 | TFLOPs: 26.05 | +7: iteration 133210/ 173500 | consumed samples: 34101760 | consumed tokens: 69840404480 | elapsed time per iteration (s): 0.16 | learning rate: 4.335E-05 | global batch size: 256 | lm loss: 3.673365E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.335 | TFLOPs: 24.60 | +7: iteration 133220/ 173500 | consumed samples: 34104320 | consumed tokens: 69845647360 | elapsed time per iteration (s): 0.15 | learning rate: 4.334E-05 | global batch size: 256 | lm loss: 3.669817E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.608 | TFLOPs: 26.15 | +7: iteration 133230/ 173500 | consumed samples: 34106880 | consumed tokens: 69850890240 | elapsed time per iteration (s): 0.15 | learning rate: 4.333E-05 | global batch size: 256 | lm loss: 3.683643E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.506 | TFLOPs: 25.96 | +7: iteration 133240/ 173500 | consumed samples: 34109440 | consumed tokens: 69856133120 | elapsed time per iteration (s): 0.15 | learning rate: 4.332E-05 | global batch size: 256 | lm loss: 3.693234E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.461 | TFLOPs: 26.04 | +7: iteration 133250/ 173500 | consumed samples: 34112000 | consumed tokens: 69861376000 | elapsed time per iteration (s): 0.15 | learning rate: 4.331E-05 | global batch size: 256 | lm loss: 3.666976E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.524 | TFLOPs: 26.23 | +7: iteration 133260/ 173500 | consumed samples: 34114560 | consumed tokens: 69866618880 | elapsed time per iteration (s): 0.16 | learning rate: 4.330E-05 | global batch size: 256 | lm loss: 3.680742E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.872 | TFLOPs: 25.67 | +7: iteration 133270/ 173500 | consumed samples: 34117120 | consumed tokens: 69871861760 | elapsed time per iteration (s): 0.15 | learning rate: 4.328E-05 | global batch size: 256 | lm loss: 3.664986E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.579 | TFLOPs: 26.21 | +7: iteration 133280/ 173500 | consumed samples: 34119680 | consumed tokens: 69877104640 | elapsed time per iteration (s): 0.15 | learning rate: 4.327E-05 | global batch size: 256 | lm loss: 3.659681E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.971 | TFLOPs: 26.24 | +7: iteration 133290/ 173500 | consumed samples: 34122240 | consumed tokens: 69882347520 | elapsed time per iteration (s): 0.15 | learning rate: 4.326E-05 | global batch size: 256 | lm loss: 3.683450E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.899 | TFLOPs: 26.22 | +7: iteration 133300/ 173500 | consumed samples: 34124800 | consumed tokens: 69887590400 | elapsed time per iteration (s): 0.16 | learning rate: 4.325E-05 | global batch size: 256 | lm loss: 3.681905E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.678 | TFLOPs: 25.48 | +7: iteration 133310/ 173500 | consumed samples: 34127360 | consumed tokens: 69892833280 | elapsed time per iteration (s): 0.16 | learning rate: 4.324E-05 | global batch size: 256 | lm loss: 3.656109E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.588 | TFLOPs: 25.59 | +7: iteration 133320/ 173500 | consumed samples: 34129920 | consumed tokens: 69898076160 | elapsed time per iteration (s): 0.15 | learning rate: 4.323E-05 | global batch size: 256 | lm loss: 3.663156E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.608 | TFLOPs: 26.04 | +7: iteration 133330/ 173500 | consumed samples: 34132480 | consumed tokens: 69903319040 | elapsed time per iteration (s): 0.15 | learning rate: 4.322E-05 | global batch size: 256 | lm loss: 3.681501E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.297 | TFLOPs: 26.24 | +7: iteration 133340/ 173500 | consumed samples: 34135040 | consumed tokens: 69908561920 | elapsed time per iteration (s): 0.15 | learning rate: 4.321E-05 | global batch size: 256 | lm loss: 3.669822E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.757 | TFLOPs: 26.23 | +7: iteration 133350/ 173500 | consumed samples: 34137600 | consumed tokens: 69913804800 | elapsed time per iteration (s): 0.15 | learning rate: 4.320E-05 | global batch size: 256 | lm loss: 3.664857E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.129 | TFLOPs: 26.22 | +7: iteration 133360/ 173500 | consumed samples: 34140160 | consumed tokens: 69919047680 | elapsed time per iteration (s): 0.15 | learning rate: 4.319E-05 | global batch size: 256 | lm loss: 3.683716E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.216 | TFLOPs: 26.18 | +7: iteration 133370/ 173500 | consumed samples: 34142720 | consumed tokens: 69924290560 | elapsed time per iteration (s): 0.15 | learning rate: 4.317E-05 | global batch size: 256 | lm loss: 3.665712E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.499 | TFLOPs: 26.17 | +7: iteration 133380/ 173500 | consumed samples: 34145280 | consumed tokens: 69929533440 | elapsed time per iteration (s): 0.15 | learning rate: 4.316E-05 | global batch size: 256 | lm loss: 3.685222E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.549 | TFLOPs: 26.21 | +7: iteration 133390/ 173500 | consumed samples: 34147840 | consumed tokens: 69934776320 | elapsed time per iteration (s): 0.15 | learning rate: 4.315E-05 | global batch size: 256 | lm loss: 3.662894E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.688 | TFLOPs: 26.22 | +7: iteration 133400/ 173500 | consumed samples: 34150400 | consumed tokens: 69940019200 | elapsed time per iteration (s): 0.17 | learning rate: 4.314E-05 | global batch size: 256 | lm loss: 3.686237E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.045 | TFLOPs: 24.28 | +7: iteration 133410/ 173500 | consumed samples: 34152960 | consumed tokens: 69945262080 | elapsed time per iteration (s): 0.15 | learning rate: 4.313E-05 | global batch size: 256 | lm loss: 3.669402E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.604 | TFLOPs: 26.21 | +7: iteration 133420/ 173500 | consumed samples: 34155520 | consumed tokens: 69950504960 | elapsed time per iteration (s): 0.15 | learning rate: 4.312E-05 | global batch size: 256 | lm loss: 3.670735E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.736 | TFLOPs: 26.19 | +7: iteration 133430/ 173500 | consumed samples: 34158080 | consumed tokens: 69955747840 | elapsed time per iteration (s): 0.16 | learning rate: 4.311E-05 | global batch size: 256 | lm loss: 3.668132E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.597 | TFLOPs: 25.10 | +7: iteration 133440/ 173500 | consumed samples: 34160640 | consumed tokens: 69960990720 | elapsed time per iteration (s): 0.16 | learning rate: 4.310E-05 | global batch size: 256 | lm loss: 3.667256E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.060 | TFLOPs: 25.17 | +7: iteration 133450/ 173500 | consumed samples: 34163200 | consumed tokens: 69966233600 | elapsed time per iteration (s): 0.15 | learning rate: 4.309E-05 | global batch size: 256 | lm loss: 3.669352E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.750 | TFLOPs: 25.95 | +7: iteration 133460/ 173500 | consumed samples: 34165760 | consumed tokens: 69971476480 | elapsed time per iteration (s): 0.16 | learning rate: 4.308E-05 | global batch size: 256 | lm loss: 3.682662E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.888 | TFLOPs: 25.76 | +7: iteration 133470/ 173500 | consumed samples: 34168320 | consumed tokens: 69976719360 | elapsed time per iteration (s): 0.15 | learning rate: 4.306E-05 | global batch size: 256 | lm loss: 3.684212E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.162 | TFLOPs: 26.27 | +7: iteration 133480/ 173500 | consumed samples: 34170880 | consumed tokens: 69981962240 | elapsed time per iteration (s): 0.16 | learning rate: 4.305E-05 | global batch size: 256 | lm loss: 3.676484E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.107 | TFLOPs: 25.39 | +7: iteration 133490/ 173500 | consumed samples: 34173440 | consumed tokens: 69987205120 | elapsed time per iteration (s): 0.15 | learning rate: 4.304E-05 | global batch size: 256 | lm loss: 3.658364E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.492 | TFLOPs: 26.32 | +7: iteration 133500/ 173500 | consumed samples: 34176000 | consumed tokens: 69992448000 | elapsed time per iteration (s): 0.15 | learning rate: 4.303E-05 | global batch size: 256 | lm loss: 3.675973E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.517 | TFLOPs: 25.99 | +7: iteration 133510/ 173500 | consumed samples: 34178560 | consumed tokens: 69997690880 | elapsed time per iteration (s): 0.16 | learning rate: 4.302E-05 | global batch size: 256 | lm loss: 3.677281E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.839 | TFLOPs: 25.87 | +7: iteration 133520/ 173500 | consumed samples: 34181120 | consumed tokens: 70002933760 | elapsed time per iteration (s): 0.15 | learning rate: 4.301E-05 | global batch size: 256 | lm loss: 3.685194E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.110 | TFLOPs: 25.97 | +7: iteration 133530/ 173500 | consumed samples: 34183680 | consumed tokens: 70008176640 | elapsed time per iteration (s): 0.15 | learning rate: 4.300E-05 | global batch size: 256 | lm loss: 3.673721E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.315 | TFLOPs: 26.32 | +7: iteration 133540/ 173500 | consumed samples: 34186240 | consumed tokens: 70013419520 | elapsed time per iteration (s): 0.15 | learning rate: 4.299E-05 | global batch size: 256 | lm loss: 3.671563E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.364 | TFLOPs: 26.34 | +7: iteration 133550/ 173500 | consumed samples: 34188800 | consumed tokens: 70018662400 | elapsed time per iteration (s): 0.15 | learning rate: 4.298E-05 | global batch size: 256 | lm loss: 3.681846E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.575 | TFLOPs: 25.93 | +7: iteration 133560/ 173500 | consumed samples: 34191360 | consumed tokens: 70023905280 | elapsed time per iteration (s): 0.16 | learning rate: 4.297E-05 | global batch size: 256 | lm loss: 3.679109E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.206 | TFLOPs: 25.71 | +7: iteration 133570/ 173500 | consumed samples: 34193920 | consumed tokens: 70029148160 | elapsed time per iteration (s): 0.16 | learning rate: 4.295E-05 | global batch size: 256 | lm loss: 3.661349E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.298 | TFLOPs: 25.47 | +7: iteration 133580/ 173500 | consumed samples: 34196480 | consumed tokens: 70034391040 | elapsed time per iteration (s): 0.16 | learning rate: 4.294E-05 | global batch size: 256 | lm loss: 3.663429E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.962 | TFLOPs: 25.66 | +7: iteration 133590/ 173500 | consumed samples: 34199040 | consumed tokens: 70039633920 | elapsed time per iteration (s): 0.15 | learning rate: 4.293E-05 | global batch size: 256 | lm loss: 3.671199E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.998 | TFLOPs: 25.95 | +7: iteration 133600/ 173500 | consumed samples: 34201600 | consumed tokens: 70044876800 | elapsed time per iteration (s): 0.16 | learning rate: 4.292E-05 | global batch size: 256 | lm loss: 3.671716E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.233 | TFLOPs: 25.61 | +7: iteration 133610/ 173500 | consumed samples: 34204160 | consumed tokens: 70050119680 | elapsed time per iteration (s): 0.16 | learning rate: 4.291E-05 | global batch size: 256 | lm loss: 3.670883E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.472 | TFLOPs: 25.71 | +7: iteration 133620/ 173500 | consumed samples: 34206720 | consumed tokens: 70055362560 | elapsed time per iteration (s): 0.15 | learning rate: 4.290E-05 | global batch size: 256 | lm loss: 3.674804E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.153 | TFLOPs: 26.30 | +7: iteration 133630/ 173500 | consumed samples: 34209280 | consumed tokens: 70060605440 | elapsed time per iteration (s): 0.15 | learning rate: 4.289E-05 | global batch size: 256 | lm loss: 3.677847E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.644 | TFLOPs: 26.29 | +7: iteration 133640/ 173500 | consumed samples: 34211840 | consumed tokens: 70065848320 | elapsed time per iteration (s): 0.15 | learning rate: 4.288E-05 | global batch size: 256 | lm loss: 3.667617E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.953 | TFLOPs: 25.97 | +7: iteration 133650/ 173500 | consumed samples: 34214400 | consumed tokens: 70071091200 | elapsed time per iteration (s): 0.16 | learning rate: 4.287E-05 | global batch size: 256 | lm loss: 3.682467E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.808 | TFLOPs: 25.28 | +7: iteration 133660/ 173500 | consumed samples: 34216960 | consumed tokens: 70076334080 | elapsed time per iteration (s): 0.15 | learning rate: 4.286E-05 | global batch size: 256 | lm loss: 3.679927E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.440 | TFLOPs: 26.34 | +7: iteration 133670/ 173500 | consumed samples: 34219520 | consumed tokens: 70081576960 | elapsed time per iteration (s): 0.16 | learning rate: 4.284E-05 | global batch size: 256 | lm loss: 3.672595E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.467 | TFLOPs: 25.85 | +7: iteration 133680/ 173500 | consumed samples: 34222080 | consumed tokens: 70086819840 | elapsed time per iteration (s): 0.16 | learning rate: 4.283E-05 | global batch size: 256 | lm loss: 3.665463E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.298 | TFLOPs: 25.39 | +7: iteration 133690/ 173500 | consumed samples: 34224640 | consumed tokens: 70092062720 | elapsed time per iteration (s): 0.16 | learning rate: 4.282E-05 | global batch size: 256 | lm loss: 3.674673E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.183 | TFLOPs: 25.57 | +7: iteration 133700/ 173500 | consumed samples: 34227200 | consumed tokens: 70097305600 | elapsed time per iteration (s): 0.16 | learning rate: 4.281E-05 | global batch size: 256 | lm loss: 3.678161E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.972 | TFLOPs: 24.97 | +7: iteration 133710/ 173500 | consumed samples: 34229760 | consumed tokens: 70102548480 | elapsed time per iteration (s): 0.15 | learning rate: 4.280E-05 | global batch size: 256 | lm loss: 3.661424E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.516 | TFLOPs: 26.01 | +7: iteration 133720/ 173500 | consumed samples: 34232320 | consumed tokens: 70107791360 | elapsed time per iteration (s): 0.15 | learning rate: 4.279E-05 | global batch size: 256 | lm loss: 3.680138E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.538 | TFLOPs: 25.93 | +7: iteration 133730/ 173500 | consumed samples: 34234880 | consumed tokens: 70113034240 | elapsed time per iteration (s): 0.15 | learning rate: 4.278E-05 | global batch size: 256 | lm loss: 3.685008E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.717 | TFLOPs: 26.12 | +7: iteration 133740/ 173500 | consumed samples: 34237440 | consumed tokens: 70118277120 | elapsed time per iteration (s): 0.16 | learning rate: 4.277E-05 | global batch size: 256 | lm loss: 3.666772E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.739 | TFLOPs: 25.53 | +7: iteration 133750/ 173500 | consumed samples: 34240000 | consumed tokens: 70123520000 | elapsed time per iteration (s): 0.16 | learning rate: 4.276E-05 | global batch size: 256 | lm loss: 3.671417E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.559 | TFLOPs: 25.85 | +7: iteration 133760/ 173500 | consumed samples: 34242560 | consumed tokens: 70128762880 | elapsed time per iteration (s): 0.15 | learning rate: 4.275E-05 | global batch size: 256 | lm loss: 3.654941E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.704 | TFLOPs: 26.11 | +7: iteration 133770/ 173500 | consumed samples: 34245120 | consumed tokens: 70134005760 | elapsed time per iteration (s): 0.15 | learning rate: 4.273E-05 | global batch size: 256 | lm loss: 3.673579E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.077 | TFLOPs: 26.02 | +7: iteration 133780/ 173500 | consumed samples: 34247680 | consumed tokens: 70139248640 | elapsed time per iteration (s): 0.16 | learning rate: 4.272E-05 | global batch size: 256 | lm loss: 3.669021E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.415 | TFLOPs: 25.85 | +7: iteration 133790/ 173500 | consumed samples: 34250240 | consumed tokens: 70144491520 | elapsed time per iteration (s): 0.15 | learning rate: 4.271E-05 | global batch size: 256 | lm loss: 3.672327E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.031 | TFLOPs: 26.22 | +7: iteration 133800/ 173500 | consumed samples: 34252800 | consumed tokens: 70149734400 | elapsed time per iteration (s): 0.15 | learning rate: 4.270E-05 | global batch size: 256 | lm loss: 3.658721E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.427 | TFLOPs: 26.21 | +7: iteration 133810/ 173500 | consumed samples: 34255360 | consumed tokens: 70154977280 | elapsed time per iteration (s): 0.15 | learning rate: 4.269E-05 | global batch size: 256 | lm loss: 3.672472E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.988 | TFLOPs: 26.17 | +7: iteration 133820/ 173500 | consumed samples: 34257920 | consumed tokens: 70160220160 | elapsed time per iteration (s): 0.15 | learning rate: 4.268E-05 | global batch size: 256 | lm loss: 3.666774E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.978 | TFLOPs: 26.10 | +7: iteration 133830/ 173500 | consumed samples: 34260480 | consumed tokens: 70165463040 | elapsed time per iteration (s): 0.16 | learning rate: 4.267E-05 | global batch size: 256 | lm loss: 3.667083E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.805 | TFLOPs: 25.21 | +7: iteration 133840/ 173500 | consumed samples: 34263040 | consumed tokens: 70170705920 | elapsed time per iteration (s): 0.16 | learning rate: 4.266E-05 | global batch size: 256 | lm loss: 3.671462E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.835 | TFLOPs: 25.59 | +7: iteration 133850/ 173500 | consumed samples: 34265600 | consumed tokens: 70175948800 | elapsed time per iteration (s): 0.20 | learning rate: 4.265E-05 | global batch size: 256 | lm loss: 3.656876E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1308.749 | TFLOPs: 20.52 | +7: iteration 133860/ 173500 | consumed samples: 34268160 | consumed tokens: 70181191680 | elapsed time per iteration (s): 0.16 | learning rate: 4.264E-05 | global batch size: 256 | lm loss: 3.656811E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.490 | TFLOPs: 24.82 | +7: iteration 133870/ 173500 | consumed samples: 34270720 | consumed tokens: 70186434560 | elapsed time per iteration (s): 0.16 | learning rate: 4.263E-05 | global batch size: 256 | lm loss: 3.659483E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.210 | TFLOPs: 25.69 | +7: iteration 133880/ 173500 | consumed samples: 34273280 | consumed tokens: 70191677440 | elapsed time per iteration (s): 0.15 | learning rate: 4.261E-05 | global batch size: 256 | lm loss: 3.677736E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.766 | TFLOPs: 26.15 | +7: iteration 133890/ 173500 | consumed samples: 34275840 | consumed tokens: 70196920320 | elapsed time per iteration (s): 0.15 | learning rate: 4.260E-05 | global batch size: 256 | lm loss: 3.664879E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.116 | TFLOPs: 26.08 | +7: iteration 133900/ 173500 | consumed samples: 34278400 | consumed tokens: 70202163200 | elapsed time per iteration (s): 0.15 | learning rate: 4.259E-05 | global batch size: 256 | lm loss: 3.661426E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.522 | TFLOPs: 25.92 | +7: iteration 133910/ 173500 | consumed samples: 34280960 | consumed tokens: 70207406080 | elapsed time per iteration (s): 0.16 | learning rate: 4.258E-05 | global batch size: 256 | lm loss: 3.651334E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.748 | TFLOPs: 25.62 | +7: iteration 133920/ 173500 | consumed samples: 34283520 | consumed tokens: 70212648960 | elapsed time per iteration (s): 0.16 | learning rate: 4.257E-05 | global batch size: 256 | lm loss: 3.676194E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.516 | TFLOPs: 25.87 | +7: iteration 133930/ 173500 | consumed samples: 34286080 | consumed tokens: 70217891840 | elapsed time per iteration (s): 0.15 | learning rate: 4.256E-05 | global batch size: 256 | lm loss: 3.666162E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.428 | TFLOPs: 25.93 | +7: iteration 133940/ 173500 | consumed samples: 34288640 | consumed tokens: 70223134720 | elapsed time per iteration (s): 0.16 | learning rate: 4.255E-05 | global batch size: 256 | lm loss: 3.658432E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.478 | TFLOPs: 25.81 | +7: iteration 133950/ 173500 | consumed samples: 34291200 | consumed tokens: 70228377600 | elapsed time per iteration (s): 0.16 | learning rate: 4.254E-05 | global batch size: 256 | lm loss: 3.675898E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.199 | TFLOPs: 24.92 | +7: iteration 133960/ 173500 | consumed samples: 34293760 | consumed tokens: 70233620480 | elapsed time per iteration (s): 0.16 | learning rate: 4.253E-05 | global batch size: 256 | lm loss: 3.674089E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.187 | TFLOPs: 25.08 | +7: iteration 133970/ 173500 | consumed samples: 34296320 | consumed tokens: 70238863360 | elapsed time per iteration (s): 0.15 | learning rate: 4.252E-05 | global batch size: 256 | lm loss: 3.677861E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.549 | TFLOPs: 25.92 | +7: iteration 133980/ 173500 | consumed samples: 34298880 | consumed tokens: 70244106240 | elapsed time per iteration (s): 0.16 | learning rate: 4.251E-05 | global batch size: 256 | lm loss: 3.684787E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.943 | TFLOPs: 25.69 | +7: iteration 133990/ 173500 | consumed samples: 34301440 | consumed tokens: 70249349120 | elapsed time per iteration (s): 0.16 | learning rate: 4.249E-05 | global batch size: 256 | lm loss: 3.673694E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.413 | TFLOPs: 25.84 | +0: [2023-03-17 06:04:53,775] [INFO] [logging.py:68:log_dist] [Rank 0] step=134000, skipped=0, lr=[4.248399618979796e-05, 4.248399618979796e-05, 4.248399618979796e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 134000/ 173500 | consumed samples: 34304000 | consumed tokens: 70254592000 | elapsed time per iteration (s): 0.16 | learning rate: 4.248E-05 | global batch size: 256 | lm loss: 3.678680E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.942 | TFLOPs: 24.89 | +0: steps: 134000 loss: 3.6670 iter time (s): 0.155 samples/sec: 1656.213 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 134000 | lm loss value: 3.837359E+00 | lm loss PPL: 4.640275E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 134000 to checkpoints_44m91b100m +0: [2023-03-17 06:04:53,848] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step134000 is begin to save! +0: [2023-03-17 06:04:53,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:04:53,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:04:53,915] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:04:53,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:04:53,925] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:04:53,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:04:53,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:04:53,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:04:53,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:04:53,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:04:53,950] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:04:53,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:04:53,958] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:04:53,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:04:53,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:04:53,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:04:53,974] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:04:53,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:04:53,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:04:53,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:04:53,983] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step134000/mp_rank_00_model_states.pt +0: [2023-03-17 06:04:53,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:04:53,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:04:54,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:04:54,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +7: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +4: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +3: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +1: [2023-03-17 06:04:54,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:04:54,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:04:54,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +2: [2023-03-17 06:04:54,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:04:54,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:04:54,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 06:04:54,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +6: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:04:54,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step134000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:04:54,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step134000 is ready now! +0: successfully saved checkpoint at iteration 134000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.58 +7: iteration 134010/ 173500 | consumed samples: 34306560 | consumed tokens: 70259834880 | elapsed time per iteration (s): 0.18 | learning rate: 4.247E-05 | global batch size: 256 | lm loss: 3.658607E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.372 | TFLOPs: 22.20 | +7: iteration 134020/ 173500 | consumed samples: 34309120 | consumed tokens: 70265077760 | elapsed time per iteration (s): 0.16 | learning rate: 4.246E-05 | global batch size: 256 | lm loss: 3.664939E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.423 | TFLOPs: 24.75 | +7: iteration 134030/ 173500 | consumed samples: 34311680 | consumed tokens: 70270320640 | elapsed time per iteration (s): 0.15 | learning rate: 4.245E-05 | global batch size: 256 | lm loss: 3.664273E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.008 | TFLOPs: 26.14 | +7: iteration 134040/ 173500 | consumed samples: 34314240 | consumed tokens: 70275563520 | elapsed time per iteration (s): 0.16 | learning rate: 4.244E-05 | global batch size: 256 | lm loss: 3.665714E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.716 | TFLOPs: 25.48 | +7: iteration 134050/ 173500 | consumed samples: 34316800 | consumed tokens: 70280806400 | elapsed time per iteration (s): 0.16 | learning rate: 4.243E-05 | global batch size: 256 | lm loss: 3.683564E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.759 | TFLOPs: 25.37 | +7: iteration 134060/ 173500 | consumed samples: 34319360 | consumed tokens: 70286049280 | elapsed time per iteration (s): 0.15 | learning rate: 4.242E-05 | global batch size: 256 | lm loss: 3.673763E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.883 | TFLOPs: 26.06 | +7: iteration 134070/ 173500 | consumed samples: 34321920 | consumed tokens: 70291292160 | elapsed time per iteration (s): 0.16 | learning rate: 4.241E-05 | global batch size: 256 | lm loss: 3.667840E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.716 | TFLOPs: 24.74 | +7: iteration 134080/ 173500 | consumed samples: 34324480 | consumed tokens: 70296535040 | elapsed time per iteration (s): 0.16 | learning rate: 4.240E-05 | global batch size: 256 | lm loss: 3.669234E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.635 | TFLOPs: 25.40 | +7: iteration 134090/ 173500 | consumed samples: 34327040 | consumed tokens: 70301777920 | elapsed time per iteration (s): 0.15 | learning rate: 4.239E-05 | global batch size: 256 | lm loss: 3.667836E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.758 | TFLOPs: 26.15 | +7: iteration 134100/ 173500 | consumed samples: 34329600 | consumed tokens: 70307020800 | elapsed time per iteration (s): 0.16 | learning rate: 4.238E-05 | global batch size: 256 | lm loss: 3.677764E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.077 | TFLOPs: 25.81 | +7: iteration 134110/ 173500 | consumed samples: 34332160 | consumed tokens: 70312263680 | elapsed time per iteration (s): 0.15 | learning rate: 4.236E-05 | global batch size: 256 | lm loss: 3.670383E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.252 | TFLOPs: 26.19 | +7: iteration 134120/ 173500 | consumed samples: 34334720 | consumed tokens: 70317506560 | elapsed time per iteration (s): 0.16 | learning rate: 4.235E-05 | global batch size: 256 | lm loss: 3.670074E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.605 | TFLOPs: 25.59 | +7: iteration 134130/ 173500 | consumed samples: 34337280 | consumed tokens: 70322749440 | elapsed time per iteration (s): 0.16 | learning rate: 4.234E-05 | global batch size: 256 | lm loss: 3.677184E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.026 | TFLOPs: 25.53 | +7: iteration 134140/ 173500 | consumed samples: 34339840 | consumed tokens: 70327992320 | elapsed time per iteration (s): 0.16 | learning rate: 4.233E-05 | global batch size: 256 | lm loss: 3.679341E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.918 | TFLOPs: 25.81 | +7: iteration 134150/ 173500 | consumed samples: 34342400 | consumed tokens: 70333235200 | elapsed time per iteration (s): 0.16 | learning rate: 4.232E-05 | global batch size: 256 | lm loss: 3.669336E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.699 | TFLOPs: 25.76 | +7: iteration 134160/ 173500 | consumed samples: 34344960 | consumed tokens: 70338478080 | elapsed time per iteration (s): 0.15 | learning rate: 4.231E-05 | global batch size: 256 | lm loss: 3.678876E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.223 | TFLOPs: 26.21 | +7: iteration 134170/ 173500 | consumed samples: 34347520 | consumed tokens: 70343720960 | elapsed time per iteration (s): 0.16 | learning rate: 4.230E-05 | global batch size: 256 | lm loss: 3.670398E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.947 | TFLOPs: 24.46 | +7: iteration 134180/ 173500 | consumed samples: 34350080 | consumed tokens: 70348963840 | elapsed time per iteration (s): 0.16 | learning rate: 4.229E-05 | global batch size: 256 | lm loss: 3.667459E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.235 | TFLOPs: 25.22 | +7: iteration 134190/ 173500 | consumed samples: 34352640 | consumed tokens: 70354206720 | elapsed time per iteration (s): 0.16 | learning rate: 4.228E-05 | global batch size: 256 | lm loss: 3.663889E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.928 | TFLOPs: 25.47 | +7: iteration 134200/ 173500 | consumed samples: 34355200 | consumed tokens: 70359449600 | elapsed time per iteration (s): 0.16 | learning rate: 4.227E-05 | global batch size: 256 | lm loss: 3.674776E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.928 | TFLOPs: 25.62 | +7: iteration 134210/ 173500 | consumed samples: 34357760 | consumed tokens: 70364692480 | elapsed time per iteration (s): 0.16 | learning rate: 4.226E-05 | global batch size: 256 | lm loss: 3.665884E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.174 | TFLOPs: 25.44 | +7: iteration 134220/ 173500 | consumed samples: 34360320 | consumed tokens: 70369935360 | elapsed time per iteration (s): 0.16 | learning rate: 4.225E-05 | global batch size: 256 | lm loss: 3.671478E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.800 | TFLOPs: 25.67 | +7: iteration 134230/ 173500 | consumed samples: 34362880 | consumed tokens: 70375178240 | elapsed time per iteration (s): 0.16 | learning rate: 4.223E-05 | global batch size: 256 | lm loss: 3.660273E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.754 | TFLOPs: 25.48 | +7: iteration 134240/ 173500 | consumed samples: 34365440 | consumed tokens: 70380421120 | elapsed time per iteration (s): 0.16 | learning rate: 4.222E-05 | global batch size: 256 | lm loss: 3.671915E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.672 | TFLOPs: 25.21 | +7: iteration 134250/ 173500 | consumed samples: 34368000 | consumed tokens: 70385664000 | elapsed time per iteration (s): 0.16 | learning rate: 4.221E-05 | global batch size: 256 | lm loss: 3.680252E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.631 | TFLOPs: 25.79 | +7: iteration 134260/ 173500 | consumed samples: 34370560 | consumed tokens: 70390906880 | elapsed time per iteration (s): 0.16 | learning rate: 4.220E-05 | global batch size: 256 | lm loss: 3.668514E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.358 | TFLOPs: 25.57 | +7: iteration 134270/ 173500 | consumed samples: 34373120 | consumed tokens: 70396149760 | elapsed time per iteration (s): 0.16 | learning rate: 4.219E-05 | global batch size: 256 | lm loss: 3.665609E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.343 | TFLOPs: 25.60 | +7: iteration 134280/ 173500 | consumed samples: 34375680 | consumed tokens: 70401392640 | elapsed time per iteration (s): 0.16 | learning rate: 4.218E-05 | global batch size: 256 | lm loss: 3.670457E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.431 | TFLOPs: 25.41 | +7: iteration 134290/ 173500 | consumed samples: 34378240 | consumed tokens: 70406635520 | elapsed time per iteration (s): 0.16 | learning rate: 4.217E-05 | global batch size: 256 | lm loss: 3.665715E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.268 | TFLOPs: 25.74 | +7: iteration 134300/ 173500 | consumed samples: 34380800 | consumed tokens: 70411878400 | elapsed time per iteration (s): 0.16 | learning rate: 4.216E-05 | global batch size: 256 | lm loss: 3.675899E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.493 | TFLOPs: 25.05 | +7: iteration 134310/ 173500 | consumed samples: 34383360 | consumed tokens: 70417121280 | elapsed time per iteration (s): 0.16 | learning rate: 4.215E-05 | global batch size: 256 | lm loss: 3.662149E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.236 | TFLOPs: 25.86 | +7: iteration 134320/ 173500 | consumed samples: 34385920 | consumed tokens: 70422364160 | elapsed time per iteration (s): 0.16 | learning rate: 4.214E-05 | global batch size: 256 | lm loss: 3.671577E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.555 | TFLOPs: 25.57 | +7: iteration 134330/ 173500 | consumed samples: 34388480 | consumed tokens: 70427607040 | elapsed time per iteration (s): 0.15 | learning rate: 4.213E-05 | global batch size: 256 | lm loss: 3.681172E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.024 | TFLOPs: 26.14 | +7: iteration 134340/ 173500 | consumed samples: 34391040 | consumed tokens: 70432849920 | elapsed time per iteration (s): 0.16 | learning rate: 4.212E-05 | global batch size: 256 | lm loss: 3.671571E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.677 | TFLOPs: 25.24 | +7: iteration 134350/ 173500 | consumed samples: 34393600 | consumed tokens: 70438092800 | elapsed time per iteration (s): 0.16 | learning rate: 4.210E-05 | global batch size: 256 | lm loss: 3.670875E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.033 | TFLOPs: 24.84 | +7: iteration 134360/ 173500 | consumed samples: 34396160 | consumed tokens: 70443335680 | elapsed time per iteration (s): 0.15 | learning rate: 4.209E-05 | global batch size: 256 | lm loss: 3.675793E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.118 | TFLOPs: 26.21 | +7: iteration 134370/ 173500 | consumed samples: 34398720 | consumed tokens: 70448578560 | elapsed time per iteration (s): 0.15 | learning rate: 4.208E-05 | global batch size: 256 | lm loss: 3.667042E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.237 | TFLOPs: 26.19 | +7: iteration 134380/ 173500 | consumed samples: 34401280 | consumed tokens: 70453821440 | elapsed time per iteration (s): 0.16 | learning rate: 4.207E-05 | global batch size: 256 | lm loss: 3.671747E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.847 | TFLOPs: 25.78 | +7: iteration 134390/ 173500 | consumed samples: 34403840 | consumed tokens: 70459064320 | elapsed time per iteration (s): 0.16 | learning rate: 4.206E-05 | global batch size: 256 | lm loss: 3.671275E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.047 | TFLOPs: 25.30 | +7: iteration 134400/ 173500 | consumed samples: 34406400 | consumed tokens: 70464307200 | elapsed time per iteration (s): 0.16 | learning rate: 4.205E-05 | global batch size: 256 | lm loss: 3.678767E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.757 | TFLOPs: 25.79 | +7: iteration 134410/ 173500 | consumed samples: 34408960 | consumed tokens: 70469550080 | elapsed time per iteration (s): 0.15 | learning rate: 4.204E-05 | global batch size: 256 | lm loss: 3.671084E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.472 | TFLOPs: 26.20 | +7: iteration 134420/ 173500 | consumed samples: 34411520 | consumed tokens: 70474792960 | elapsed time per iteration (s): 0.16 | learning rate: 4.203E-05 | global batch size: 256 | lm loss: 3.662518E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.262 | TFLOPs: 25.58 | +7: iteration 134430/ 173500 | consumed samples: 34414080 | consumed tokens: 70480035840 | elapsed time per iteration (s): 0.16 | learning rate: 4.202E-05 | global batch size: 256 | lm loss: 3.666748E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.188 | TFLOPs: 25.74 | +7: iteration 134440/ 173500 | consumed samples: 34416640 | consumed tokens: 70485278720 | elapsed time per iteration (s): 0.16 | learning rate: 4.201E-05 | global batch size: 256 | lm loss: 3.675533E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.392 | TFLOPs: 25.77 | +7: iteration 134450/ 173500 | consumed samples: 34419200 | consumed tokens: 70490521600 | elapsed time per iteration (s): 0.15 | learning rate: 4.200E-05 | global batch size: 256 | lm loss: 3.652081E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.880 | TFLOPs: 26.13 | +7: iteration 134460/ 173500 | consumed samples: 34421760 | consumed tokens: 70495764480 | elapsed time per iteration (s): 0.15 | learning rate: 4.199E-05 | global batch size: 256 | lm loss: 3.673441E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.063 | TFLOPs: 26.18 | +7: iteration 134470/ 173500 | consumed samples: 34424320 | consumed tokens: 70501007360 | elapsed time per iteration (s): 0.16 | learning rate: 4.197E-05 | global batch size: 256 | lm loss: 3.676503E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.048 | TFLOPs: 24.87 | +7: iteration 134480/ 173500 | consumed samples: 34426880 | consumed tokens: 70506250240 | elapsed time per iteration (s): 0.15 | learning rate: 4.196E-05 | global batch size: 256 | lm loss: 3.670647E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.571 | TFLOPs: 26.20 | +7: iteration 134490/ 173500 | consumed samples: 34429440 | consumed tokens: 70511493120 | elapsed time per iteration (s): 0.15 | learning rate: 4.195E-05 | global batch size: 256 | lm loss: 3.671503E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.545 | TFLOPs: 26.20 | +7: iteration 134500/ 173500 | consumed samples: 34432000 | consumed tokens: 70516736000 | elapsed time per iteration (s): 0.15 | learning rate: 4.194E-05 | global batch size: 256 | lm loss: 3.686488E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.273 | TFLOPs: 26.13 | +7: iteration 134510/ 173500 | consumed samples: 34434560 | consumed tokens: 70521978880 | elapsed time per iteration (s): 0.15 | learning rate: 4.193E-05 | global batch size: 256 | lm loss: 3.665597E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.445 | TFLOPs: 26.12 | +7: iteration 134520/ 173500 | consumed samples: 34437120 | consumed tokens: 70527221760 | elapsed time per iteration (s): 0.16 | learning rate: 4.192E-05 | global batch size: 256 | lm loss: 3.670315E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.218 | TFLOPs: 24.61 | +7: iteration 134530/ 173500 | consumed samples: 34439680 | consumed tokens: 70532464640 | elapsed time per iteration (s): 0.15 | learning rate: 4.191E-05 | global batch size: 256 | lm loss: 3.665425E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.025 | TFLOPs: 26.11 | +7: iteration 134540/ 173500 | consumed samples: 34442240 | consumed tokens: 70537707520 | elapsed time per iteration (s): 0.15 | learning rate: 4.190E-05 | global batch size: 256 | lm loss: 3.665059E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.857 | TFLOPs: 26.11 | +7: iteration 134550/ 173500 | consumed samples: 34444800 | consumed tokens: 70542950400 | elapsed time per iteration (s): 0.16 | learning rate: 4.189E-05 | global batch size: 256 | lm loss: 3.670675E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.077 | TFLOPs: 25.86 | +7: iteration 134560/ 173500 | consumed samples: 34447360 | consumed tokens: 70548193280 | elapsed time per iteration (s): 0.15 | learning rate: 4.188E-05 | global batch size: 256 | lm loss: 3.685293E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.625 | TFLOPs: 25.92 | +7: iteration 134570/ 173500 | consumed samples: 34449920 | consumed tokens: 70553436160 | elapsed time per iteration (s): 0.15 | learning rate: 4.187E-05 | global batch size: 256 | lm loss: 3.666903E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.922 | TFLOPs: 25.91 | +7: iteration 134580/ 173500 | consumed samples: 34452480 | consumed tokens: 70558679040 | elapsed time per iteration (s): 0.16 | learning rate: 4.186E-05 | global batch size: 256 | lm loss: 3.675998E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.568 | TFLOPs: 25.89 | +7: iteration 134590/ 173500 | consumed samples: 34455040 | consumed tokens: 70563921920 | elapsed time per iteration (s): 0.15 | learning rate: 4.185E-05 | global batch size: 256 | lm loss: 3.675238E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.801 | TFLOPs: 26.14 | +7: iteration 134600/ 173500 | consumed samples: 34457600 | consumed tokens: 70569164800 | elapsed time per iteration (s): 0.15 | learning rate: 4.183E-05 | global batch size: 256 | lm loss: 3.650673E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.352 | TFLOPs: 26.13 | +7: iteration 134610/ 173500 | consumed samples: 34460160 | consumed tokens: 70574407680 | elapsed time per iteration (s): 0.16 | learning rate: 4.182E-05 | global batch size: 256 | lm loss: 3.664727E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.554 | TFLOPs: 24.58 | +7: iteration 134620/ 173500 | consumed samples: 34462720 | consumed tokens: 70579650560 | elapsed time per iteration (s): 0.15 | learning rate: 4.181E-05 | global batch size: 256 | lm loss: 3.669589E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.128 | TFLOPs: 26.13 | +7: iteration 134630/ 173500 | consumed samples: 34465280 | consumed tokens: 70584893440 | elapsed time per iteration (s): 0.15 | learning rate: 4.180E-05 | global batch size: 256 | lm loss: 3.672661E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.787 | TFLOPs: 26.16 | +7: iteration 134640/ 173500 | consumed samples: 34467840 | consumed tokens: 70590136320 | elapsed time per iteration (s): 0.16 | learning rate: 4.179E-05 | global batch size: 256 | lm loss: 3.658763E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.001 | TFLOPs: 25.70 | +7: iteration 134650/ 173500 | consumed samples: 34470400 | consumed tokens: 70595379200 | elapsed time per iteration (s): 0.15 | learning rate: 4.178E-05 | global batch size: 256 | lm loss: 3.659963E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.579 | TFLOPs: 25.98 | +7: iteration 134660/ 173500 | consumed samples: 34472960 | consumed tokens: 70600622080 | elapsed time per iteration (s): 0.15 | learning rate: 4.177E-05 | global batch size: 256 | lm loss: 3.674430E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.183 | TFLOPs: 26.15 | +7: iteration 134670/ 173500 | consumed samples: 34475520 | consumed tokens: 70605864960 | elapsed time per iteration (s): 0.15 | learning rate: 4.176E-05 | global batch size: 256 | lm loss: 3.671016E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.240 | TFLOPs: 25.99 | +7: iteration 134680/ 173500 | consumed samples: 34478080 | consumed tokens: 70611107840 | elapsed time per iteration (s): 0.15 | learning rate: 4.175E-05 | global batch size: 256 | lm loss: 3.676506E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.618 | TFLOPs: 26.12 | +7: iteration 134690/ 173500 | consumed samples: 34480640 | consumed tokens: 70616350720 | elapsed time per iteration (s): 0.15 | learning rate: 4.174E-05 | global batch size: 256 | lm loss: 3.664449E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.115 | TFLOPs: 26.13 | +7: iteration 134700/ 173500 | consumed samples: 34483200 | consumed tokens: 70621593600 | elapsed time per iteration (s): 0.15 | learning rate: 4.173E-05 | global batch size: 256 | lm loss: 3.667199E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.821 | TFLOPs: 26.12 | +7: iteration 134710/ 173500 | consumed samples: 34485760 | consumed tokens: 70626836480 | elapsed time per iteration (s): 0.15 | learning rate: 4.172E-05 | global batch size: 256 | lm loss: 3.680828E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.462 | TFLOPs: 26.10 | +7: iteration 134720/ 173500 | consumed samples: 34488320 | consumed tokens: 70632079360 | elapsed time per iteration (s): 0.15 | learning rate: 4.171E-05 | global batch size: 256 | lm loss: 3.667320E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.038 | TFLOPs: 26.13 | +7: iteration 134730/ 173500 | consumed samples: 34490880 | consumed tokens: 70637322240 | elapsed time per iteration (s): 0.15 | learning rate: 4.170E-05 | global batch size: 256 | lm loss: 3.673450E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.888 | TFLOPs: 26.13 | +7: iteration 134740/ 173500 | consumed samples: 34493440 | consumed tokens: 70642565120 | elapsed time per iteration (s): 0.16 | learning rate: 4.168E-05 | global batch size: 256 | lm loss: 3.670137E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.185 | TFLOPs: 25.82 | +7: iteration 134750/ 173500 | consumed samples: 34496000 | consumed tokens: 70647808000 | elapsed time per iteration (s): 0.16 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 3.675605E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.622 | TFLOPs: 25.73 | +7: iteration 134760/ 173500 | consumed samples: 34498560 | consumed tokens: 70653050880 | elapsed time per iteration (s): 0.16 | learning rate: 4.166E-05 | global batch size: 256 | lm loss: 3.667554E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.124 | TFLOPs: 25.50 | +7: iteration 134770/ 173500 | consumed samples: 34501120 | consumed tokens: 70658293760 | elapsed time per iteration (s): 0.15 | learning rate: 4.165E-05 | global batch size: 256 | lm loss: 3.680093E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.632 | TFLOPs: 26.14 | +7: iteration 134780/ 173500 | consumed samples: 34503680 | consumed tokens: 70663536640 | elapsed time per iteration (s): 0.15 | learning rate: 4.164E-05 | global batch size: 256 | lm loss: 3.668605E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.730 | TFLOPs: 26.20 | +7: iteration 134790/ 173500 | consumed samples: 34506240 | consumed tokens: 70668779520 | elapsed time per iteration (s): 0.16 | learning rate: 4.163E-05 | global batch size: 256 | lm loss: 3.663253E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.431 | TFLOPs: 25.84 | +7: iteration 134800/ 173500 | consumed samples: 34508800 | consumed tokens: 70674022400 | elapsed time per iteration (s): 0.15 | learning rate: 4.162E-05 | global batch size: 256 | lm loss: 3.679036E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.518 | TFLOPs: 26.20 | +7: iteration 134810/ 173500 | consumed samples: 34511360 | consumed tokens: 70679265280 | elapsed time per iteration (s): 0.15 | learning rate: 4.161E-05 | global batch size: 256 | lm loss: 3.673901E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.400 | TFLOPs: 26.20 | +7: iteration 134820/ 173500 | consumed samples: 34513920 | consumed tokens: 70684508160 | elapsed time per iteration (s): 0.16 | learning rate: 4.160E-05 | global batch size: 256 | lm loss: 3.673126E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.894 | TFLOPs: 25.51 | +7: iteration 134830/ 173500 | consumed samples: 34516480 | consumed tokens: 70689751040 | elapsed time per iteration (s): 0.15 | learning rate: 4.159E-05 | global batch size: 256 | lm loss: 3.664482E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.340 | TFLOPs: 26.21 | +7: iteration 134840/ 173500 | consumed samples: 34519040 | consumed tokens: 70694993920 | elapsed time per iteration (s): 0.16 | learning rate: 4.158E-05 | global batch size: 256 | lm loss: 3.665847E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.107 | TFLOPs: 24.47 | +7: iteration 134850/ 173500 | consumed samples: 34521600 | consumed tokens: 70700236800 | elapsed time per iteration (s): 0.16 | learning rate: 4.157E-05 | global batch size: 256 | lm loss: 3.679958E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.528 | TFLOPs: 25.60 | +7: iteration 134860/ 173500 | consumed samples: 34524160 | consumed tokens: 70705479680 | elapsed time per iteration (s): 0.15 | learning rate: 4.156E-05 | global batch size: 256 | lm loss: 3.667942E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.965 | TFLOPs: 26.17 | +7: iteration 134870/ 173500 | consumed samples: 34526720 | consumed tokens: 70710722560 | elapsed time per iteration (s): 0.15 | learning rate: 4.155E-05 | global batch size: 256 | lm loss: 3.662247E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.014 | TFLOPs: 26.17 | +7: iteration 134880/ 173500 | consumed samples: 34529280 | consumed tokens: 70715965440 | elapsed time per iteration (s): 0.16 | learning rate: 4.153E-05 | global batch size: 256 | lm loss: 3.675504E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.449 | TFLOPs: 25.77 | +7: iteration 134890/ 173500 | consumed samples: 34531840 | consumed tokens: 70721208320 | elapsed time per iteration (s): 0.16 | learning rate: 4.152E-05 | global batch size: 256 | lm loss: 3.667179E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.171 | TFLOPs: 24.51 | +7: iteration 134900/ 173500 | consumed samples: 34534400 | consumed tokens: 70726451200 | elapsed time per iteration (s): 0.16 | learning rate: 4.151E-05 | global batch size: 256 | lm loss: 3.668655E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.233 | TFLOPs: 25.13 | +7: iteration 134910/ 173500 | consumed samples: 34536960 | consumed tokens: 70731694080 | elapsed time per iteration (s): 0.16 | learning rate: 4.150E-05 | global batch size: 256 | lm loss: 3.677355E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.438 | TFLOPs: 25.85 | +7: iteration 134920/ 173500 | consumed samples: 34539520 | consumed tokens: 70736936960 | elapsed time per iteration (s): 0.16 | learning rate: 4.149E-05 | global batch size: 256 | lm loss: 3.677541E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.770 | TFLOPs: 25.76 | +7: iteration 134930/ 173500 | consumed samples: 34542080 | consumed tokens: 70742179840 | elapsed time per iteration (s): 0.15 | learning rate: 4.148E-05 | global batch size: 256 | lm loss: 3.683643E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.500 | TFLOPs: 26.15 | +7: iteration 134940/ 173500 | consumed samples: 34544640 | consumed tokens: 70747422720 | elapsed time per iteration (s): 0.16 | learning rate: 4.147E-05 | global batch size: 256 | lm loss: 3.675285E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.361 | TFLOPs: 25.60 | +7: iteration 134950/ 173500 | consumed samples: 34547200 | consumed tokens: 70752665600 | elapsed time per iteration (s): 0.16 | learning rate: 4.146E-05 | global batch size: 256 | lm loss: 3.673745E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.394 | TFLOPs: 25.80 | +7: iteration 134960/ 173500 | consumed samples: 34549760 | consumed tokens: 70757908480 | elapsed time per iteration (s): 0.15 | learning rate: 4.145E-05 | global batch size: 256 | lm loss: 3.668528E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.261 | TFLOPs: 26.05 | +7: iteration 134970/ 173500 | consumed samples: 34552320 | consumed tokens: 70763151360 | elapsed time per iteration (s): 0.16 | learning rate: 4.144E-05 | global batch size: 256 | lm loss: 3.691824E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.700 | TFLOPs: 25.78 | +7: iteration 134980/ 173500 | consumed samples: 34554880 | consumed tokens: 70768394240 | elapsed time per iteration (s): 0.15 | learning rate: 4.143E-05 | global batch size: 256 | lm loss: 3.674726E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.739 | TFLOPs: 26.15 | +7: iteration 134990/ 173500 | consumed samples: 34557440 | consumed tokens: 70773637120 | elapsed time per iteration (s): 0.16 | learning rate: 4.142E-05 | global batch size: 256 | lm loss: 3.668451E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.370 | TFLOPs: 25.65 | +7: iteration 135000/ 173500 | consumed samples: 34560000 | consumed tokens: 70778880000 | elapsed time per iteration (s): 0.15 | learning rate: 4.141E-05 | global batch size: 256 | lm loss: 3.663952E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.840 | TFLOPs: 26.16 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 135000 | lm loss value: 3.855682E+00 | lm loss PPL: 4.726086E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 135000 to checkpoints_44m91b100m +0: [2023-03-17 06:07:30,054] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step135000 is begin to save! +0: [2023-03-17 06:07:30,058] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:07:30,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:07:30,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:07:30,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:07:30,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:07:30,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:07:30,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:07:30,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:07:30,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:07:30,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:07:30,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:07:30,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:07:30,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:07:30,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:07:30,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:07:30,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:07:30,179] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:07:30,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:07:30,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:07:30,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:07:30,189] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step135000/mp_rank_00_model_states.pt +0: [2023-03-17 06:07:30,189] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:07:30,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:07:30,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:07:30,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +6: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +5: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +7: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +2: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +4: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:07:30,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:07:30,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +3: [2023-03-17 06:07:30,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:07:30,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:07:30,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +1: [2023-03-17 06:07:30,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:07:30,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step135000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:07:30,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step135000 is ready now! +0: successfully saved checkpoint at iteration 135000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.62 +7: iteration 135010/ 173500 | consumed samples: 34562560 | consumed tokens: 70784122880 | elapsed time per iteration (s): 0.18 | learning rate: 4.140E-05 | global batch size: 256 | lm loss: 3.668626E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1452.753 | TFLOPs: 22.78 | +7: iteration 135020/ 173500 | consumed samples: 34565120 | consumed tokens: 70789365760 | elapsed time per iteration (s): 0.16 | learning rate: 4.139E-05 | global batch size: 256 | lm loss: 3.679913E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.219 | TFLOPs: 25.85 | +7: iteration 135030/ 173500 | consumed samples: 34567680 | consumed tokens: 70794608640 | elapsed time per iteration (s): 0.16 | learning rate: 4.137E-05 | global batch size: 256 | lm loss: 3.673774E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.031 | TFLOPs: 25.80 | +7: iteration 135040/ 173500 | consumed samples: 34570240 | consumed tokens: 70799851520 | elapsed time per iteration (s): 0.15 | learning rate: 4.136E-05 | global batch size: 256 | lm loss: 3.661383E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.860 | TFLOPs: 26.11 | +7: iteration 135050/ 173500 | consumed samples: 34572800 | consumed tokens: 70805094400 | elapsed time per iteration (s): 0.15 | learning rate: 4.135E-05 | global batch size: 256 | lm loss: 3.682148E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.890 | TFLOPs: 26.20 | +7: iteration 135060/ 173500 | consumed samples: 34575360 | consumed tokens: 70810337280 | elapsed time per iteration (s): 0.15 | learning rate: 4.134E-05 | global batch size: 256 | lm loss: 3.670845E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.318 | TFLOPs: 26.19 | +7: iteration 135070/ 173500 | consumed samples: 34577920 | consumed tokens: 70815580160 | elapsed time per iteration (s): 0.15 | learning rate: 4.133E-05 | global batch size: 256 | lm loss: 3.667618E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.789 | TFLOPs: 26.17 | +7: iteration 135080/ 173500 | consumed samples: 34580480 | consumed tokens: 70820823040 | elapsed time per iteration (s): 0.15 | learning rate: 4.132E-05 | global batch size: 256 | lm loss: 3.682353E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.341 | TFLOPs: 26.18 | +7: iteration 135090/ 173500 | consumed samples: 34583040 | consumed tokens: 70826065920 | elapsed time per iteration (s): 0.15 | learning rate: 4.131E-05 | global batch size: 256 | lm loss: 3.658093E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.398 | TFLOPs: 26.18 | +7: iteration 135100/ 173500 | consumed samples: 34585600 | consumed tokens: 70831308800 | elapsed time per iteration (s): 0.15 | learning rate: 4.130E-05 | global batch size: 256 | lm loss: 3.658778E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.349 | TFLOPs: 26.18 | +7: iteration 135110/ 173500 | consumed samples: 34588160 | consumed tokens: 70836551680 | elapsed time per iteration (s): 0.16 | learning rate: 4.129E-05 | global batch size: 256 | lm loss: 3.662867E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.272 | TFLOPs: 25.83 | +7: iteration 135120/ 173500 | consumed samples: 34590720 | consumed tokens: 70841794560 | elapsed time per iteration (s): 0.16 | learning rate: 4.128E-05 | global batch size: 256 | lm loss: 3.675484E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.957 | TFLOPs: 25.78 | +7: iteration 135130/ 173500 | consumed samples: 34593280 | consumed tokens: 70847037440 | elapsed time per iteration (s): 0.15 | learning rate: 4.127E-05 | global batch size: 256 | lm loss: 3.665570E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.250 | TFLOPs: 26.24 | +7: iteration 135140/ 173500 | consumed samples: 34595840 | consumed tokens: 70852280320 | elapsed time per iteration (s): 0.16 | learning rate: 4.126E-05 | global batch size: 256 | lm loss: 3.693215E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.729 | TFLOPs: 25.21 | +7: iteration 135150/ 173500 | consumed samples: 34598400 | consumed tokens: 70857523200 | elapsed time per iteration (s): 0.15 | learning rate: 4.125E-05 | global batch size: 256 | lm loss: 3.675676E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.537 | TFLOPs: 26.25 | +7: iteration 135160/ 173500 | consumed samples: 34600960 | consumed tokens: 70862766080 | elapsed time per iteration (s): 0.15 | learning rate: 4.124E-05 | global batch size: 256 | lm loss: 3.670396E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.177 | TFLOPs: 26.26 | +7: iteration 135170/ 173500 | consumed samples: 34603520 | consumed tokens: 70868008960 | elapsed time per iteration (s): 0.15 | learning rate: 4.123E-05 | global batch size: 256 | lm loss: 3.674622E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.755 | TFLOPs: 26.25 | +7: iteration 135180/ 173500 | consumed samples: 34606080 | consumed tokens: 70873251840 | elapsed time per iteration (s): 0.16 | learning rate: 4.122E-05 | global batch size: 256 | lm loss: 3.670863E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.176 | TFLOPs: 25.60 | +7: iteration 135190/ 173500 | consumed samples: 34608640 | consumed tokens: 70878494720 | elapsed time per iteration (s): 0.15 | learning rate: 4.120E-05 | global batch size: 256 | lm loss: 3.669610E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.121 | TFLOPs: 26.24 | +7: iteration 135200/ 173500 | consumed samples: 34611200 | consumed tokens: 70883737600 | elapsed time per iteration (s): 0.15 | learning rate: 4.119E-05 | global batch size: 256 | lm loss: 3.662186E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.345 | TFLOPs: 25.98 | +7: iteration 135210/ 173500 | consumed samples: 34613760 | consumed tokens: 70888980480 | elapsed time per iteration (s): 0.16 | learning rate: 4.118E-05 | global batch size: 256 | lm loss: 3.676683E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.860 | TFLOPs: 25.22 | +7: iteration 135220/ 173500 | consumed samples: 34616320 | consumed tokens: 70894223360 | elapsed time per iteration (s): 0.15 | learning rate: 4.117E-05 | global batch size: 256 | lm loss: 3.673391E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.623 | TFLOPs: 26.22 | +7: iteration 135230/ 173500 | consumed samples: 34618880 | consumed tokens: 70899466240 | elapsed time per iteration (s): 0.15 | learning rate: 4.116E-05 | global batch size: 256 | lm loss: 3.673284E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.437 | TFLOPs: 26.24 | +7: iteration 135240/ 173500 | consumed samples: 34621440 | consumed tokens: 70904709120 | elapsed time per iteration (s): 0.15 | learning rate: 4.115E-05 | global batch size: 256 | lm loss: 3.683817E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.299 | TFLOPs: 26.02 | +7: iteration 135250/ 173500 | consumed samples: 34624000 | consumed tokens: 70909952000 | elapsed time per iteration (s): 0.16 | learning rate: 4.114E-05 | global batch size: 256 | lm loss: 3.660843E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.412 | TFLOPs: 25.37 | +7: iteration 135260/ 173500 | consumed samples: 34626560 | consumed tokens: 70915194880 | elapsed time per iteration (s): 0.15 | learning rate: 4.113E-05 | global batch size: 256 | lm loss: 3.665894E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.943 | TFLOPs: 26.24 | +7: iteration 135270/ 173500 | consumed samples: 34629120 | consumed tokens: 70920437760 | elapsed time per iteration (s): 0.15 | learning rate: 4.112E-05 | global batch size: 256 | lm loss: 3.664345E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.880 | TFLOPs: 26.22 | +7: iteration 135280/ 173500 | consumed samples: 34631680 | consumed tokens: 70925680640 | elapsed time per iteration (s): 0.16 | learning rate: 4.111E-05 | global batch size: 256 | lm loss: 3.673748E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.064 | TFLOPs: 25.75 | +7: iteration 135290/ 173500 | consumed samples: 34634240 | consumed tokens: 70930923520 | elapsed time per iteration (s): 0.16 | learning rate: 4.110E-05 | global batch size: 256 | lm loss: 3.669024E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.961 | TFLOPs: 25.39 | +7: iteration 135300/ 173500 | consumed samples: 34636800 | consumed tokens: 70936166400 | elapsed time per iteration (s): 0.16 | learning rate: 4.109E-05 | global batch size: 256 | lm loss: 3.675628E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.278 | TFLOPs: 25.68 | +7: iteration 135310/ 173500 | consumed samples: 34639360 | consumed tokens: 70941409280 | elapsed time per iteration (s): 0.15 | learning rate: 4.108E-05 | global batch size: 256 | lm loss: 3.686283E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.082 | TFLOPs: 26.13 | +7: iteration 135320/ 173500 | consumed samples: 34641920 | consumed tokens: 70946652160 | elapsed time per iteration (s): 0.16 | learning rate: 4.107E-05 | global batch size: 256 | lm loss: 3.672984E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.512 | TFLOPs: 24.94 | +7: iteration 135330/ 173500 | consumed samples: 34644480 | consumed tokens: 70951895040 | elapsed time per iteration (s): 0.15 | learning rate: 4.106E-05 | global batch size: 256 | lm loss: 3.660128E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.965 | TFLOPs: 26.19 | +7: iteration 135340/ 173500 | consumed samples: 34647040 | consumed tokens: 70957137920 | elapsed time per iteration (s): 0.16 | learning rate: 4.105E-05 | global batch size: 256 | lm loss: 3.669767E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.273 | TFLOPs: 25.39 | +7: iteration 135350/ 173500 | consumed samples: 34649600 | consumed tokens: 70962380800 | elapsed time per iteration (s): 0.16 | learning rate: 4.104E-05 | global batch size: 256 | lm loss: 3.672301E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.918 | TFLOPs: 25.50 | +7: iteration 135360/ 173500 | consumed samples: 34652160 | consumed tokens: 70967623680 | elapsed time per iteration (s): 0.15 | learning rate: 4.102E-05 | global batch size: 256 | lm loss: 3.668199E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.636 | TFLOPs: 25.95 | +7: iteration 135370/ 173500 | consumed samples: 34654720 | consumed tokens: 70972866560 | elapsed time per iteration (s): 0.15 | learning rate: 4.101E-05 | global batch size: 256 | lm loss: 3.666564E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.843 | TFLOPs: 26.17 | +7: iteration 135380/ 173500 | consumed samples: 34657280 | consumed tokens: 70978109440 | elapsed time per iteration (s): 0.16 | learning rate: 4.100E-05 | global batch size: 256 | lm loss: 3.658881E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.375 | TFLOPs: 25.77 | +7: iteration 135390/ 173500 | consumed samples: 34659840 | consumed tokens: 70983352320 | elapsed time per iteration (s): 0.16 | learning rate: 4.099E-05 | global batch size: 256 | lm loss: 3.668569E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.586 | TFLOPs: 24.79 | +7: iteration 135400/ 173500 | consumed samples: 34662400 | consumed tokens: 70988595200 | elapsed time per iteration (s): 0.15 | learning rate: 4.098E-05 | global batch size: 256 | lm loss: 3.667348E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.796 | TFLOPs: 26.20 | +7: iteration 135410/ 173500 | consumed samples: 34664960 | consumed tokens: 70993838080 | elapsed time per iteration (s): 0.15 | learning rate: 4.097E-05 | global batch size: 256 | lm loss: 3.667236E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.701 | TFLOPs: 26.15 | +7: iteration 135420/ 173500 | consumed samples: 34667520 | consumed tokens: 70999080960 | elapsed time per iteration (s): 0.15 | learning rate: 4.096E-05 | global batch size: 256 | lm loss: 3.673682E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.421 | TFLOPs: 26.20 | +7: iteration 135430/ 173500 | consumed samples: 34670080 | consumed tokens: 71004323840 | elapsed time per iteration (s): 0.15 | learning rate: 4.095E-05 | global batch size: 256 | lm loss: 3.663269E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.451 | TFLOPs: 26.21 | +7: iteration 135440/ 173500 | consumed samples: 34672640 | consumed tokens: 71009566720 | elapsed time per iteration (s): 0.15 | learning rate: 4.094E-05 | global batch size: 256 | lm loss: 3.672553E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.856 | TFLOPs: 26.19 | +7: iteration 135450/ 173500 | consumed samples: 34675200 | consumed tokens: 71014809600 | elapsed time per iteration (s): 0.16 | learning rate: 4.093E-05 | global batch size: 256 | lm loss: 3.672802E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.944 | TFLOPs: 25.08 | +7: iteration 135460/ 173500 | consumed samples: 34677760 | consumed tokens: 71020052480 | elapsed time per iteration (s): 0.15 | learning rate: 4.092E-05 | global batch size: 256 | lm loss: 3.661380E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.772 | TFLOPs: 26.20 | +7: iteration 135470/ 173500 | consumed samples: 34680320 | consumed tokens: 71025295360 | elapsed time per iteration (s): 0.15 | learning rate: 4.091E-05 | global batch size: 256 | lm loss: 3.669132E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.900 | TFLOPs: 25.98 | +7: iteration 135480/ 173500 | consumed samples: 34682880 | consumed tokens: 71030538240 | elapsed time per iteration (s): 0.15 | learning rate: 4.090E-05 | global batch size: 256 | lm loss: 3.679001E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.602 | TFLOPs: 26.18 | +7: iteration 135490/ 173500 | consumed samples: 34685440 | consumed tokens: 71035781120 | elapsed time per iteration (s): 0.15 | learning rate: 4.089E-05 | global batch size: 256 | lm loss: 3.659052E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.079 | TFLOPs: 26.16 | +7: iteration 135500/ 173500 | consumed samples: 34688000 | consumed tokens: 71041024000 | elapsed time per iteration (s): 0.15 | learning rate: 4.088E-05 | global batch size: 256 | lm loss: 3.673990E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.178 | TFLOPs: 26.16 | +7: iteration 135510/ 173500 | consumed samples: 34690560 | consumed tokens: 71046266880 | elapsed time per iteration (s): 0.15 | learning rate: 4.087E-05 | global batch size: 256 | lm loss: 3.675076E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.996 | TFLOPs: 26.21 | +7: iteration 135520/ 173500 | consumed samples: 34693120 | consumed tokens: 71051509760 | elapsed time per iteration (s): 0.15 | learning rate: 4.086E-05 | global batch size: 256 | lm loss: 3.673961E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.110 | TFLOPs: 26.21 | +7: iteration 135530/ 173500 | consumed samples: 34695680 | consumed tokens: 71056752640 | elapsed time per iteration (s): 0.15 | learning rate: 4.085E-05 | global batch size: 256 | lm loss: 3.672873E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.046 | TFLOPs: 26.21 | +7: iteration 135540/ 173500 | consumed samples: 34698240 | consumed tokens: 71061995520 | elapsed time per iteration (s): 0.15 | learning rate: 4.083E-05 | global batch size: 256 | lm loss: 3.664678E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.204 | TFLOPs: 26.24 | +7: iteration 135550/ 173500 | consumed samples: 34700800 | consumed tokens: 71067238400 | elapsed time per iteration (s): 0.15 | learning rate: 4.082E-05 | global batch size: 256 | lm loss: 3.674032E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.460 | TFLOPs: 26.20 | +7: iteration 135560/ 173500 | consumed samples: 34703360 | consumed tokens: 71072481280 | elapsed time per iteration (s): 0.16 | learning rate: 4.081E-05 | global batch size: 256 | lm loss: 3.668214E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.255 | TFLOPs: 25.69 | +7: iteration 135570/ 173500 | consumed samples: 34705920 | consumed tokens: 71077724160 | elapsed time per iteration (s): 0.15 | learning rate: 4.080E-05 | global batch size: 256 | lm loss: 3.677625E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.921 | TFLOPs: 26.17 | +7: iteration 135580/ 173500 | consumed samples: 34708480 | consumed tokens: 71082967040 | elapsed time per iteration (s): 0.15 | learning rate: 4.079E-05 | global batch size: 256 | lm loss: 3.674491E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.851 | TFLOPs: 26.00 | +7: iteration 135590/ 173500 | consumed samples: 34711040 | consumed tokens: 71088209920 | elapsed time per iteration (s): 0.15 | learning rate: 4.078E-05 | global batch size: 256 | lm loss: 3.670826E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.510 | TFLOPs: 26.17 | +7: iteration 135600/ 173500 | consumed samples: 34713600 | consumed tokens: 71093452800 | elapsed time per iteration (s): 0.16 | learning rate: 4.077E-05 | global batch size: 256 | lm loss: 3.663078E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.776 | TFLOPs: 25.42 | +7: iteration 135610/ 173500 | consumed samples: 34716160 | consumed tokens: 71098695680 | elapsed time per iteration (s): 0.16 | learning rate: 4.076E-05 | global batch size: 256 | lm loss: 3.675246E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.107 | TFLOPs: 25.66 | +7: iteration 135620/ 173500 | consumed samples: 34718720 | consumed tokens: 71103938560 | elapsed time per iteration (s): 0.15 | learning rate: 4.075E-05 | global batch size: 256 | lm loss: 3.672231E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.924 | TFLOPs: 26.14 | +7: iteration 135630/ 173500 | consumed samples: 34721280 | consumed tokens: 71109181440 | elapsed time per iteration (s): 0.15 | learning rate: 4.074E-05 | global batch size: 256 | lm loss: 3.670282E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.458 | TFLOPs: 26.17 | +7: iteration 135640/ 173500 | consumed samples: 34723840 | consumed tokens: 71114424320 | elapsed time per iteration (s): 0.15 | learning rate: 4.073E-05 | global batch size: 256 | lm loss: 3.667898E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.359 | TFLOPs: 26.10 | +7: iteration 135650/ 173500 | consumed samples: 34726400 | consumed tokens: 71119667200 | elapsed time per iteration (s): 0.16 | learning rate: 4.072E-05 | global batch size: 256 | lm loss: 3.672103E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.495 | TFLOPs: 25.79 | +7: iteration 135660/ 173500 | consumed samples: 34728960 | consumed tokens: 71124910080 | elapsed time per iteration (s): 0.15 | learning rate: 4.071E-05 | global batch size: 256 | lm loss: 3.670936E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.794 | TFLOPs: 26.20 | +7: iteration 135670/ 173500 | consumed samples: 34731520 | consumed tokens: 71130152960 | elapsed time per iteration (s): 0.15 | learning rate: 4.070E-05 | global batch size: 256 | lm loss: 3.667123E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.067 | TFLOPs: 26.18 | +7: iteration 135680/ 173500 | consumed samples: 34734080 | consumed tokens: 71135395840 | elapsed time per iteration (s): 0.15 | learning rate: 4.069E-05 | global batch size: 256 | lm loss: 3.671422E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.247 | TFLOPs: 26.18 | +7: iteration 135690/ 173500 | consumed samples: 34736640 | consumed tokens: 71140638720 | elapsed time per iteration (s): 0.15 | learning rate: 4.068E-05 | global batch size: 256 | lm loss: 3.665878E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.546 | TFLOPs: 26.15 | +7: iteration 135700/ 173500 | consumed samples: 34739200 | consumed tokens: 71145881600 | elapsed time per iteration (s): 0.15 | learning rate: 4.067E-05 | global batch size: 256 | lm loss: 3.674948E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.575 | TFLOPs: 26.15 | +7: iteration 135710/ 173500 | consumed samples: 34741760 | consumed tokens: 71151124480 | elapsed time per iteration (s): 0.15 | learning rate: 4.066E-05 | global batch size: 256 | lm loss: 3.668684E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.443 | TFLOPs: 25.93 | +7: iteration 135720/ 173500 | consumed samples: 34744320 | consumed tokens: 71156367360 | elapsed time per iteration (s): 0.15 | learning rate: 4.065E-05 | global batch size: 256 | lm loss: 3.665090E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.222 | TFLOPs: 26.18 | +7: iteration 135730/ 173500 | consumed samples: 34746880 | consumed tokens: 71161610240 | elapsed time per iteration (s): 0.15 | learning rate: 4.064E-05 | global batch size: 256 | lm loss: 3.679412E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.154 | TFLOPs: 26.18 | +7: iteration 135740/ 173500 | consumed samples: 34749440 | consumed tokens: 71166853120 | elapsed time per iteration (s): 0.15 | learning rate: 4.062E-05 | global batch size: 256 | lm loss: 3.660713E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.506 | TFLOPs: 26.20 | +7: iteration 135750/ 173500 | consumed samples: 34752000 | consumed tokens: 71172096000 | elapsed time per iteration (s): 0.15 | learning rate: 4.061E-05 | global batch size: 256 | lm loss: 3.680133E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.007 | TFLOPs: 26.19 | +7: iteration 135760/ 173500 | consumed samples: 34754560 | consumed tokens: 71177338880 | elapsed time per iteration (s): 0.15 | learning rate: 4.060E-05 | global batch size: 256 | lm loss: 3.671359E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.754 | TFLOPs: 26.19 | +7: iteration 135770/ 173500 | consumed samples: 34757120 | consumed tokens: 71182581760 | elapsed time per iteration (s): 0.15 | learning rate: 4.059E-05 | global batch size: 256 | lm loss: 3.658020E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.577 | TFLOPs: 26.18 | +7: iteration 135780/ 173500 | consumed samples: 34759680 | consumed tokens: 71187824640 | elapsed time per iteration (s): 0.15 | learning rate: 4.058E-05 | global batch size: 256 | lm loss: 3.673592E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.813 | TFLOPs: 25.95 | +7: iteration 135790/ 173500 | consumed samples: 34762240 | consumed tokens: 71193067520 | elapsed time per iteration (s): 0.15 | learning rate: 4.057E-05 | global batch size: 256 | lm loss: 3.661243E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.451 | TFLOPs: 26.15 | +7: iteration 135800/ 173500 | consumed samples: 34764800 | consumed tokens: 71198310400 | elapsed time per iteration (s): 0.15 | learning rate: 4.056E-05 | global batch size: 256 | lm loss: 3.670650E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.379 | TFLOPs: 26.20 | +7: iteration 135810/ 173500 | consumed samples: 34767360 | consumed tokens: 71203553280 | elapsed time per iteration (s): 0.15 | learning rate: 4.055E-05 | global batch size: 256 | lm loss: 3.670636E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.716 | TFLOPs: 26.12 | +7: iteration 135820/ 173500 | consumed samples: 34769920 | consumed tokens: 71208796160 | elapsed time per iteration (s): 0.15 | learning rate: 4.054E-05 | global batch size: 256 | lm loss: 3.676886E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.429 | TFLOPs: 26.23 | +7: iteration 135830/ 173500 | consumed samples: 34772480 | consumed tokens: 71214039040 | elapsed time per iteration (s): 0.15 | learning rate: 4.053E-05 | global batch size: 256 | lm loss: 3.670786E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.035 | TFLOPs: 26.17 | +7: iteration 135840/ 173500 | consumed samples: 34775040 | consumed tokens: 71219281920 | elapsed time per iteration (s): 0.16 | learning rate: 4.052E-05 | global batch size: 256 | lm loss: 3.674721E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.772 | TFLOPs: 25.89 | +7: iteration 135850/ 173500 | consumed samples: 34777600 | consumed tokens: 71224524800 | elapsed time per iteration (s): 0.16 | learning rate: 4.051E-05 | global batch size: 256 | lm loss: 3.686463E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.236 | TFLOPs: 25.79 | +7: iteration 135860/ 173500 | consumed samples: 34780160 | consumed tokens: 71229767680 | elapsed time per iteration (s): 0.15 | learning rate: 4.050E-05 | global batch size: 256 | lm loss: 3.652980E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.571 | TFLOPs: 26.21 | +7: iteration 135870/ 173500 | consumed samples: 34782720 | consumed tokens: 71235010560 | elapsed time per iteration (s): 0.15 | learning rate: 4.049E-05 | global batch size: 256 | lm loss: 3.662872E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.609 | TFLOPs: 26.18 | +7: iteration 135880/ 173500 | consumed samples: 34785280 | consumed tokens: 71240253440 | elapsed time per iteration (s): 0.15 | learning rate: 4.048E-05 | global batch size: 256 | lm loss: 3.681318E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.122 | TFLOPs: 26.18 | +7: iteration 135890/ 173500 | consumed samples: 34787840 | consumed tokens: 71245496320 | elapsed time per iteration (s): 0.15 | learning rate: 4.047E-05 | global batch size: 256 | lm loss: 3.665898E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.483 | TFLOPs: 26.17 | +7: iteration 135900/ 173500 | consumed samples: 34790400 | consumed tokens: 71250739200 | elapsed time per iteration (s): 0.15 | learning rate: 4.046E-05 | global batch size: 256 | lm loss: 3.666548E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.816 | TFLOPs: 26.20 | +7: iteration 135910/ 173500 | consumed samples: 34792960 | consumed tokens: 71255982080 | elapsed time per iteration (s): 0.15 | learning rate: 4.045E-05 | global batch size: 256 | lm loss: 3.672215E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.918 | TFLOPs: 26.24 | +7: iteration 135920/ 173500 | consumed samples: 34795520 | consumed tokens: 71261224960 | elapsed time per iteration (s): 0.15 | learning rate: 4.044E-05 | global batch size: 256 | lm loss: 3.676896E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.434 | TFLOPs: 26.24 | +7: iteration 135930/ 173500 | consumed samples: 34798080 | consumed tokens: 71266467840 | elapsed time per iteration (s): 0.15 | learning rate: 4.043E-05 | global batch size: 256 | lm loss: 3.673942E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.972 | TFLOPs: 26.22 | +7: iteration 135940/ 173500 | consumed samples: 34800640 | consumed tokens: 71271710720 | elapsed time per iteration (s): 0.15 | learning rate: 4.042E-05 | global batch size: 256 | lm loss: 3.666069E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.444 | TFLOPs: 26.24 | +7: iteration 135950/ 173500 | consumed samples: 34803200 | consumed tokens: 71276953600 | elapsed time per iteration (s): 0.15 | learning rate: 4.040E-05 | global batch size: 256 | lm loss: 3.677215E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.528 | TFLOPs: 26.20 | +7: iteration 135960/ 173500 | consumed samples: 34805760 | consumed tokens: 71282196480 | elapsed time per iteration (s): 0.15 | learning rate: 4.039E-05 | global batch size: 256 | lm loss: 3.668943E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.804 | TFLOPs: 26.22 | +7: iteration 135970/ 173500 | consumed samples: 34808320 | consumed tokens: 71287439360 | elapsed time per iteration (s): 0.15 | learning rate: 4.038E-05 | global batch size: 256 | lm loss: 3.667022E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.059 | TFLOPs: 26.14 | +7: iteration 135980/ 173500 | consumed samples: 34810880 | consumed tokens: 71292682240 | elapsed time per iteration (s): 0.15 | learning rate: 4.037E-05 | global batch size: 256 | lm loss: 3.666143E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.921 | TFLOPs: 26.24 | +7: iteration 135990/ 173500 | consumed samples: 34813440 | consumed tokens: 71297925120 | elapsed time per iteration (s): 0.15 | learning rate: 4.036E-05 | global batch size: 256 | lm loss: 3.661975E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.348 | TFLOPs: 26.27 | +0: [2023-03-17 06:10:04,515] [INFO] [logging.py:68:log_dist] [Rank 0] step=136000, skipped=0, lr=[4.035272599944626e-05, 4.035272599944626e-05, 4.035272599944626e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 136000/ 173500 | consumed samples: 34816000 | consumed tokens: 71303168000 | elapsed time per iteration (s): 0.15 | learning rate: 4.035E-05 | global batch size: 256 | lm loss: 3.672911E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.768 | TFLOPs: 26.30 | +0: steps: 136000 loss: 3.6543 iter time (s): 0.154 samples/sec: 1660.730 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 136000 | lm loss value: 3.839772E+00 | lm loss PPL: 4.651487E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 136000 to checkpoints_44m91b100m +0: [2023-03-17 06:10:04,588] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step136000 is begin to save! +0: [2023-03-17 06:10:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:10:04,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:10:04,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:10:04,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:10:04,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:10:04,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:10:04,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:10:04,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:10:04,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:10:04,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:10:04,685] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:10:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:10:04,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:10:04,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:10:04,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:10:04,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:10:04,708] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:10:04,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:10:04,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:10:04,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:10:04,718] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step136000/mp_rank_00_model_states.pt +0: [2023-03-17 06:10:04,718] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:10:04,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:10:04,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:10:04,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +5: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +7: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:10:04,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +4: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:10:04,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +3: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +2: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +6: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +1: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:10:04,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step136000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:10:04,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step136000 is ready now! +0: successfully saved checkpoint at iteration 136000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 174.56 +7: iteration 136010/ 173500 | consumed samples: 34818560 | consumed tokens: 71308410880 | elapsed time per iteration (s): 0.18 | learning rate: 4.034E-05 | global batch size: 256 | lm loss: 3.675827E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.547 | TFLOPs: 22.72 | +7: iteration 136020/ 173500 | consumed samples: 34821120 | consumed tokens: 71313653760 | elapsed time per iteration (s): 0.15 | learning rate: 4.033E-05 | global batch size: 256 | lm loss: 3.674142E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.928 | TFLOPs: 26.31 | +7: iteration 136030/ 173500 | consumed samples: 34823680 | consumed tokens: 71318896640 | elapsed time per iteration (s): 0.15 | learning rate: 4.032E-05 | global batch size: 256 | lm loss: 3.661651E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.819 | TFLOPs: 26.30 | +7: iteration 136040/ 173500 | consumed samples: 34826240 | consumed tokens: 71324139520 | elapsed time per iteration (s): 0.15 | learning rate: 4.031E-05 | global batch size: 256 | lm loss: 3.674332E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.568 | TFLOPs: 26.29 | +7: iteration 136050/ 173500 | consumed samples: 34828800 | consumed tokens: 71329382400 | elapsed time per iteration (s): 0.16 | learning rate: 4.030E-05 | global batch size: 256 | lm loss: 3.670879E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.365 | TFLOPs: 25.80 | +7: iteration 136060/ 173500 | consumed samples: 34831360 | consumed tokens: 71334625280 | elapsed time per iteration (s): 0.15 | learning rate: 4.029E-05 | global batch size: 256 | lm loss: 3.673312E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.661 | TFLOPs: 26.29 | +7: iteration 136070/ 173500 | consumed samples: 34833920 | consumed tokens: 71339868160 | elapsed time per iteration (s): 0.15 | learning rate: 4.028E-05 | global batch size: 256 | lm loss: 3.674965E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.655 | TFLOPs: 26.31 | +7: iteration 136080/ 173500 | consumed samples: 34836480 | consumed tokens: 71345111040 | elapsed time per iteration (s): 0.15 | learning rate: 4.027E-05 | global batch size: 256 | lm loss: 3.688771E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.355 | TFLOPs: 26.31 | +7: iteration 136090/ 173500 | consumed samples: 34839040 | consumed tokens: 71350353920 | elapsed time per iteration (s): 0.16 | learning rate: 4.026E-05 | global batch size: 256 | lm loss: 3.668492E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.758 | TFLOPs: 25.76 | +7: iteration 136100/ 173500 | consumed samples: 34841600 | consumed tokens: 71355596800 | elapsed time per iteration (s): 0.15 | learning rate: 4.025E-05 | global batch size: 256 | lm loss: 3.672095E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.587 | TFLOPs: 25.92 | +7: iteration 136110/ 173500 | consumed samples: 34844160 | consumed tokens: 71360839680 | elapsed time per iteration (s): 0.15 | learning rate: 4.024E-05 | global batch size: 256 | lm loss: 3.679507E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.622 | TFLOPs: 25.98 | +7: iteration 136120/ 173500 | consumed samples: 34846720 | consumed tokens: 71366082560 | elapsed time per iteration (s): 0.15 | learning rate: 4.023E-05 | global batch size: 256 | lm loss: 3.681158E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.750 | TFLOPs: 25.97 | +7: iteration 136130/ 173500 | consumed samples: 34849280 | consumed tokens: 71371325440 | elapsed time per iteration (s): 0.15 | learning rate: 4.022E-05 | global batch size: 256 | lm loss: 3.667950E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.046 | TFLOPs: 26.25 | +7: iteration 136140/ 173500 | consumed samples: 34851840 | consumed tokens: 71376568320 | elapsed time per iteration (s): 0.15 | learning rate: 4.021E-05 | global batch size: 256 | lm loss: 3.678297E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.653 | TFLOPs: 26.26 | +7: iteration 136150/ 173500 | consumed samples: 34854400 | consumed tokens: 71381811200 | elapsed time per iteration (s): 0.15 | learning rate: 4.020E-05 | global batch size: 256 | lm loss: 3.678252E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.789 | TFLOPs: 26.26 | +7: iteration 136160/ 173500 | consumed samples: 34856960 | consumed tokens: 71387054080 | elapsed time per iteration (s): 0.16 | learning rate: 4.019E-05 | global batch size: 256 | lm loss: 3.663999E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.778 | TFLOPs: 25.64 | +7: iteration 136170/ 173500 | consumed samples: 34859520 | consumed tokens: 71392296960 | elapsed time per iteration (s): 0.15 | learning rate: 4.018E-05 | global batch size: 256 | lm loss: 3.667306E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.171 | TFLOPs: 26.27 | +7: iteration 136180/ 173500 | consumed samples: 34862080 | consumed tokens: 71397539840 | elapsed time per iteration (s): 0.15 | learning rate: 4.017E-05 | global batch size: 256 | lm loss: 3.682715E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.833 | TFLOPs: 26.25 | +7: iteration 136190/ 173500 | consumed samples: 34864640 | consumed tokens: 71402782720 | elapsed time per iteration (s): 0.15 | learning rate: 4.016E-05 | global batch size: 256 | lm loss: 3.680882E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.118 | TFLOPs: 26.25 | +7: iteration 136200/ 173500 | consumed samples: 34867200 | consumed tokens: 71408025600 | elapsed time per iteration (s): 0.16 | learning rate: 4.014E-05 | global batch size: 256 | lm loss: 3.664546E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.028 | TFLOPs: 25.89 | +7: iteration 136210/ 173500 | consumed samples: 34869760 | consumed tokens: 71413268480 | elapsed time per iteration (s): 0.16 | learning rate: 4.013E-05 | global batch size: 256 | lm loss: 3.658810E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.114 | TFLOPs: 25.31 | +7: iteration 136220/ 173500 | consumed samples: 34872320 | consumed tokens: 71418511360 | elapsed time per iteration (s): 0.16 | learning rate: 4.012E-05 | global batch size: 256 | lm loss: 3.683385E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.443 | TFLOPs: 25.74 | +7: iteration 136230/ 173500 | consumed samples: 34874880 | consumed tokens: 71423754240 | elapsed time per iteration (s): 0.15 | learning rate: 4.011E-05 | global batch size: 256 | lm loss: 3.670510E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.063 | TFLOPs: 26.25 | +7: iteration 136240/ 173500 | consumed samples: 34877440 | consumed tokens: 71428997120 | elapsed time per iteration (s): 0.16 | learning rate: 4.010E-05 | global batch size: 256 | lm loss: 3.662838E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.258 | TFLOPs: 25.39 | +7: iteration 136250/ 173500 | consumed samples: 34880000 | consumed tokens: 71434240000 | elapsed time per iteration (s): 0.15 | learning rate: 4.009E-05 | global batch size: 256 | lm loss: 3.659790E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.169 | TFLOPs: 26.26 | +7: iteration 136260/ 173500 | consumed samples: 34882560 | consumed tokens: 71439482880 | elapsed time per iteration (s): 0.15 | learning rate: 4.008E-05 | global batch size: 256 | lm loss: 3.675419E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.299 | TFLOPs: 26.23 | +7: iteration 136270/ 173500 | consumed samples: 34885120 | consumed tokens: 71444725760 | elapsed time per iteration (s): 0.15 | learning rate: 4.007E-05 | global batch size: 256 | lm loss: 3.664248E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.250 | TFLOPs: 26.21 | +7: iteration 136280/ 173500 | consumed samples: 34887680 | consumed tokens: 71449968640 | elapsed time per iteration (s): 0.15 | learning rate: 4.006E-05 | global batch size: 256 | lm loss: 3.682843E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.517 | TFLOPs: 26.18 | +7: iteration 136290/ 173500 | consumed samples: 34890240 | consumed tokens: 71455211520 | elapsed time per iteration (s): 0.15 | learning rate: 4.005E-05 | global batch size: 256 | lm loss: 3.680740E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.529 | TFLOPs: 26.21 | +7: iteration 136300/ 173500 | consumed samples: 34892800 | consumed tokens: 71460454400 | elapsed time per iteration (s): 0.15 | learning rate: 4.004E-05 | global batch size: 256 | lm loss: 3.660604E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.345 | TFLOPs: 26.20 | +7: iteration 136310/ 173500 | consumed samples: 34895360 | consumed tokens: 71465697280 | elapsed time per iteration (s): 0.15 | learning rate: 4.003E-05 | global batch size: 256 | lm loss: 3.665283E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.068 | TFLOPs: 26.16 | +7: iteration 136320/ 173500 | consumed samples: 34897920 | consumed tokens: 71470940160 | elapsed time per iteration (s): 0.15 | learning rate: 4.002E-05 | global batch size: 256 | lm loss: 3.671777E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.907 | TFLOPs: 26.20 | +7: iteration 136330/ 173500 | consumed samples: 34900480 | consumed tokens: 71476183040 | elapsed time per iteration (s): 0.15 | learning rate: 4.001E-05 | global batch size: 256 | lm loss: 3.667292E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.256 | TFLOPs: 26.23 | +7: iteration 136340/ 173500 | consumed samples: 34903040 | consumed tokens: 71481425920 | elapsed time per iteration (s): 0.15 | learning rate: 4.000E-05 | global batch size: 256 | lm loss: 3.674539E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.085 | TFLOPs: 26.24 | +7: iteration 136350/ 173500 | consumed samples: 34905600 | consumed tokens: 71486668800 | elapsed time per iteration (s): 0.15 | learning rate: 3.999E-05 | global batch size: 256 | lm loss: 3.666285E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.967 | TFLOPs: 26.20 | +7: iteration 136360/ 173500 | consumed samples: 34908160 | consumed tokens: 71491911680 | elapsed time per iteration (s): 0.15 | learning rate: 3.998E-05 | global batch size: 256 | lm loss: 3.657147E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.578 | TFLOPs: 26.21 | +7: iteration 136370/ 173500 | consumed samples: 34910720 | consumed tokens: 71497154560 | elapsed time per iteration (s): 0.15 | learning rate: 3.997E-05 | global batch size: 256 | lm loss: 3.668601E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.111 | TFLOPs: 26.16 | +7: iteration 136380/ 173500 | consumed samples: 34913280 | consumed tokens: 71502397440 | elapsed time per iteration (s): 0.15 | learning rate: 3.996E-05 | global batch size: 256 | lm loss: 3.667438E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.787 | TFLOPs: 26.05 | +7: iteration 136390/ 173500 | consumed samples: 34915840 | consumed tokens: 71507640320 | elapsed time per iteration (s): 0.15 | learning rate: 3.995E-05 | global batch size: 256 | lm loss: 3.671335E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.319 | TFLOPs: 26.13 | +7: iteration 136400/ 173500 | consumed samples: 34918400 | consumed tokens: 71512883200 | elapsed time per iteration (s): 0.15 | learning rate: 3.994E-05 | global batch size: 256 | lm loss: 3.674387E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.438 | TFLOPs: 26.24 | +7: iteration 136410/ 173500 | consumed samples: 34920960 | consumed tokens: 71518126080 | elapsed time per iteration (s): 0.15 | learning rate: 3.993E-05 | global batch size: 256 | lm loss: 3.680228E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.155 | TFLOPs: 26.25 | +7: iteration 136420/ 173500 | consumed samples: 34923520 | consumed tokens: 71523368960 | elapsed time per iteration (s): 0.15 | learning rate: 3.992E-05 | global batch size: 256 | lm loss: 3.674646E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.350 | TFLOPs: 26.26 | +7: iteration 136430/ 173500 | consumed samples: 34926080 | consumed tokens: 71528611840 | elapsed time per iteration (s): 0.15 | learning rate: 3.991E-05 | global batch size: 256 | lm loss: 3.664674E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.735 | TFLOPs: 26.23 | +7: iteration 136440/ 173500 | consumed samples: 34928640 | consumed tokens: 71533854720 | elapsed time per iteration (s): 0.15 | learning rate: 3.990E-05 | global batch size: 256 | lm loss: 3.678847E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.195 | TFLOPs: 26.24 | +7: iteration 136450/ 173500 | consumed samples: 34931200 | consumed tokens: 71539097600 | elapsed time per iteration (s): 0.15 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 3.655299E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.844 | TFLOPs: 26.25 | +7: iteration 136460/ 173500 | consumed samples: 34933760 | consumed tokens: 71544340480 | elapsed time per iteration (s): 0.15 | learning rate: 3.988E-05 | global batch size: 256 | lm loss: 3.675339E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.276 | TFLOPs: 26.24 | +7: iteration 136470/ 173500 | consumed samples: 34936320 | consumed tokens: 71549583360 | elapsed time per iteration (s): 0.15 | learning rate: 3.987E-05 | global batch size: 256 | lm loss: 3.673049E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.516 | TFLOPs: 26.24 | +7: iteration 136480/ 173500 | consumed samples: 34938880 | consumed tokens: 71554826240 | elapsed time per iteration (s): 0.15 | learning rate: 3.985E-05 | global batch size: 256 | lm loss: 3.676692E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.835 | TFLOPs: 26.23 | +7: iteration 136490/ 173500 | consumed samples: 34941440 | consumed tokens: 71560069120 | elapsed time per iteration (s): 0.15 | learning rate: 3.984E-05 | global batch size: 256 | lm loss: 3.668431E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.382 | TFLOPs: 26.23 | +7: iteration 136500/ 173500 | consumed samples: 34944000 | consumed tokens: 71565312000 | elapsed time per iteration (s): 0.15 | learning rate: 3.983E-05 | global batch size: 256 | lm loss: 3.675808E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.528 | TFLOPs: 26.20 | +7: iteration 136510/ 173500 | consumed samples: 34946560 | consumed tokens: 71570554880 | elapsed time per iteration (s): 0.15 | learning rate: 3.982E-05 | global batch size: 256 | lm loss: 3.680694E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.045 | TFLOPs: 26.00 | +7: iteration 136520/ 173500 | consumed samples: 34949120 | consumed tokens: 71575797760 | elapsed time per iteration (s): 0.15 | learning rate: 3.981E-05 | global batch size: 256 | lm loss: 3.674302E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.801 | TFLOPs: 26.23 | +7: iteration 136530/ 173500 | consumed samples: 34951680 | consumed tokens: 71581040640 | elapsed time per iteration (s): 0.15 | learning rate: 3.980E-05 | global batch size: 256 | lm loss: 3.675514E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.332 | TFLOPs: 26.21 | +7: iteration 136540/ 173500 | consumed samples: 34954240 | consumed tokens: 71586283520 | elapsed time per iteration (s): 0.15 | learning rate: 3.979E-05 | global batch size: 256 | lm loss: 3.663076E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.832 | TFLOPs: 26.20 | +7: iteration 136550/ 173500 | consumed samples: 34956800 | consumed tokens: 71591526400 | elapsed time per iteration (s): 0.15 | learning rate: 3.978E-05 | global batch size: 256 | lm loss: 3.665619E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.686 | TFLOPs: 26.20 | +7: iteration 136560/ 173500 | consumed samples: 34959360 | consumed tokens: 71596769280 | elapsed time per iteration (s): 0.15 | learning rate: 3.977E-05 | global batch size: 256 | lm loss: 3.655607E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.942 | TFLOPs: 26.20 | +7: iteration 136570/ 173500 | consumed samples: 34961920 | consumed tokens: 71602012160 | elapsed time per iteration (s): 0.15 | learning rate: 3.976E-05 | global batch size: 256 | lm loss: 3.674044E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.403 | TFLOPs: 26.20 | +7: iteration 136580/ 173500 | consumed samples: 34964480 | consumed tokens: 71607255040 | elapsed time per iteration (s): 0.15 | learning rate: 3.975E-05 | global batch size: 256 | lm loss: 3.670860E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.763 | TFLOPs: 26.20 | +7: iteration 136590/ 173500 | consumed samples: 34967040 | consumed tokens: 71612497920 | elapsed time per iteration (s): 0.16 | learning rate: 3.974E-05 | global batch size: 256 | lm loss: 3.669804E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.191 | TFLOPs: 25.83 | +7: iteration 136600/ 173500 | consumed samples: 34969600 | consumed tokens: 71617740800 | elapsed time per iteration (s): 0.15 | learning rate: 3.973E-05 | global batch size: 256 | lm loss: 3.667922E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.099 | TFLOPs: 26.21 | +7: iteration 136610/ 173500 | consumed samples: 34972160 | consumed tokens: 71622983680 | elapsed time per iteration (s): 0.15 | learning rate: 3.972E-05 | global batch size: 256 | lm loss: 3.672458E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.214 | TFLOPs: 26.04 | +7: iteration 136620/ 173500 | consumed samples: 34974720 | consumed tokens: 71628226560 | elapsed time per iteration (s): 0.15 | learning rate: 3.971E-05 | global batch size: 256 | lm loss: 3.671611E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.067 | TFLOPs: 26.21 | +7: iteration 136630/ 173500 | consumed samples: 34977280 | consumed tokens: 71633469440 | elapsed time per iteration (s): 0.15 | learning rate: 3.970E-05 | global batch size: 256 | lm loss: 3.666836E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.479 | TFLOPs: 26.24 | +7: iteration 136640/ 173500 | consumed samples: 34979840 | consumed tokens: 71638712320 | elapsed time per iteration (s): 0.15 | learning rate: 3.969E-05 | global batch size: 256 | lm loss: 3.673589E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.081 | TFLOPs: 26.22 | +7: iteration 136650/ 173500 | consumed samples: 34982400 | consumed tokens: 71643955200 | elapsed time per iteration (s): 0.15 | learning rate: 3.968E-05 | global batch size: 256 | lm loss: 3.661155E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.903 | TFLOPs: 26.27 | +7: iteration 136660/ 173500 | consumed samples: 34984960 | consumed tokens: 71649198080 | elapsed time per iteration (s): 0.16 | learning rate: 3.967E-05 | global batch size: 256 | lm loss: 3.679944E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.587 | TFLOPs: 25.81 | +7: iteration 136670/ 173500 | consumed samples: 34987520 | consumed tokens: 71654440960 | elapsed time per iteration (s): 0.16 | learning rate: 3.966E-05 | global batch size: 256 | lm loss: 3.668619E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.537 | TFLOPs: 25.74 | +7: iteration 136680/ 173500 | consumed samples: 34990080 | consumed tokens: 71659683840 | elapsed time per iteration (s): 0.16 | learning rate: 3.965E-05 | global batch size: 256 | lm loss: 3.671522E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.564 | TFLOPs: 25.60 | +7: iteration 136690/ 173500 | consumed samples: 34992640 | consumed tokens: 71664926720 | elapsed time per iteration (s): 0.16 | learning rate: 3.964E-05 | global batch size: 256 | lm loss: 3.658335E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.466 | TFLOPs: 25.63 | +7: iteration 136700/ 173500 | consumed samples: 34995200 | consumed tokens: 71670169600 | elapsed time per iteration (s): 0.15 | learning rate: 3.963E-05 | global batch size: 256 | lm loss: 3.655074E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.135 | TFLOPs: 26.02 | +7: iteration 136710/ 173500 | consumed samples: 34997760 | consumed tokens: 71675412480 | elapsed time per iteration (s): 0.16 | learning rate: 3.962E-05 | global batch size: 256 | lm loss: 3.665031E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.624 | TFLOPs: 25.89 | +7: iteration 136720/ 173500 | consumed samples: 35000320 | consumed tokens: 71680655360 | elapsed time per iteration (s): 0.16 | learning rate: 3.961E-05 | global batch size: 256 | lm loss: 3.675743E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.480 | TFLOPs: 25.87 | +7: iteration 136730/ 173500 | consumed samples: 35002880 | consumed tokens: 71685898240 | elapsed time per iteration (s): 0.15 | learning rate: 3.960E-05 | global batch size: 256 | lm loss: 3.671931E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.430 | TFLOPs: 26.23 | +7: iteration 136740/ 173500 | consumed samples: 35005440 | consumed tokens: 71691141120 | elapsed time per iteration (s): 0.15 | learning rate: 3.959E-05 | global batch size: 256 | lm loss: 3.671446E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.570 | TFLOPs: 26.23 | +7: iteration 136750/ 173500 | consumed samples: 35008000 | consumed tokens: 71696384000 | elapsed time per iteration (s): 0.15 | learning rate: 3.958E-05 | global batch size: 256 | lm loss: 3.668000E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.755 | TFLOPs: 26.23 | +7: iteration 136760/ 173500 | consumed samples: 35010560 | consumed tokens: 71701626880 | elapsed time per iteration (s): 0.16 | learning rate: 3.957E-05 | global batch size: 256 | lm loss: 3.677503E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.702 | TFLOPs: 24.87 | +7: iteration 136770/ 173500 | consumed samples: 35013120 | consumed tokens: 71706869760 | elapsed time per iteration (s): 0.15 | learning rate: 3.956E-05 | global batch size: 256 | lm loss: 3.669859E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.299 | TFLOPs: 26.27 | +7: iteration 136780/ 173500 | consumed samples: 35015680 | consumed tokens: 71712112640 | elapsed time per iteration (s): 0.16 | learning rate: 3.955E-05 | global batch size: 256 | lm loss: 3.660954E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.714 | TFLOPs: 24.87 | +7: iteration 136790/ 173500 | consumed samples: 35018240 | consumed tokens: 71717355520 | elapsed time per iteration (s): 0.15 | learning rate: 3.954E-05 | global batch size: 256 | lm loss: 3.666170E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.900 | TFLOPs: 26.38 | +7: iteration 136800/ 173500 | consumed samples: 35020800 | consumed tokens: 71722598400 | elapsed time per iteration (s): 0.15 | learning rate: 3.953E-05 | global batch size: 256 | lm loss: 3.675673E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.524 | TFLOPs: 26.39 | +7: iteration 136810/ 173500 | consumed samples: 35023360 | consumed tokens: 71727841280 | elapsed time per iteration (s): 0.15 | learning rate: 3.952E-05 | global batch size: 256 | lm loss: 3.666372E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.223 | TFLOPs: 26.37 | +7: iteration 136820/ 173500 | consumed samples: 35025920 | consumed tokens: 71733084160 | elapsed time per iteration (s): 0.15 | learning rate: 3.951E-05 | global batch size: 256 | lm loss: 3.668657E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.885 | TFLOPs: 26.36 | +7: iteration 136830/ 173500 | consumed samples: 35028480 | consumed tokens: 71738327040 | elapsed time per iteration (s): 0.15 | learning rate: 3.950E-05 | global batch size: 256 | lm loss: 3.673594E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.507 | TFLOPs: 26.37 | +7: iteration 136840/ 173500 | consumed samples: 35031040 | consumed tokens: 71743569920 | elapsed time per iteration (s): 0.16 | learning rate: 3.949E-05 | global batch size: 256 | lm loss: 3.677160E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.699 | TFLOPs: 25.53 | +7: iteration 136850/ 173500 | consumed samples: 35033600 | consumed tokens: 71748812800 | elapsed time per iteration (s): 0.15 | learning rate: 3.947E-05 | global batch size: 256 | lm loss: 3.673233E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.286 | TFLOPs: 26.38 | +7: iteration 136860/ 173500 | consumed samples: 35036160 | consumed tokens: 71754055680 | elapsed time per iteration (s): 0.15 | learning rate: 3.946E-05 | global batch size: 256 | lm loss: 3.686190E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.850 | TFLOPs: 26.39 | +7: iteration 136870/ 173500 | consumed samples: 35038720 | consumed tokens: 71759298560 | elapsed time per iteration (s): 0.15 | learning rate: 3.945E-05 | global batch size: 256 | lm loss: 3.667600E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.471 | TFLOPs: 26.37 | +7: iteration 136880/ 173500 | consumed samples: 35041280 | consumed tokens: 71764541440 | elapsed time per iteration (s): 0.15 | learning rate: 3.944E-05 | global batch size: 256 | lm loss: 3.657695E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.705 | TFLOPs: 26.36 | +7: iteration 136890/ 173500 | consumed samples: 35043840 | consumed tokens: 71769784320 | elapsed time per iteration (s): 0.15 | learning rate: 3.943E-05 | global batch size: 256 | lm loss: 3.672137E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.294 | TFLOPs: 26.35 | +7: iteration 136900/ 173500 | consumed samples: 35046400 | consumed tokens: 71775027200 | elapsed time per iteration (s): 0.15 | learning rate: 3.942E-05 | global batch size: 256 | lm loss: 3.664051E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.725 | TFLOPs: 26.25 | +7: iteration 136910/ 173500 | consumed samples: 35048960 | consumed tokens: 71780270080 | elapsed time per iteration (s): 0.16 | learning rate: 3.941E-05 | global batch size: 256 | lm loss: 3.672382E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.400 | TFLOPs: 25.88 | +7: iteration 136920/ 173500 | consumed samples: 35051520 | consumed tokens: 71785512960 | elapsed time per iteration (s): 0.15 | learning rate: 3.940E-05 | global batch size: 256 | lm loss: 3.671025E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.770 | TFLOPs: 26.23 | +7: iteration 136930/ 173500 | consumed samples: 35054080 | consumed tokens: 71790755840 | elapsed time per iteration (s): 0.15 | learning rate: 3.939E-05 | global batch size: 256 | lm loss: 3.669715E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.577 | TFLOPs: 26.29 | +7: iteration 136940/ 173500 | consumed samples: 35056640 | consumed tokens: 71795998720 | elapsed time per iteration (s): 0.15 | learning rate: 3.938E-05 | global batch size: 256 | lm loss: 3.670049E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.325 | TFLOPs: 26.19 | +7: iteration 136950/ 173500 | consumed samples: 35059200 | consumed tokens: 71801241600 | elapsed time per iteration (s): 0.15 | learning rate: 3.937E-05 | global batch size: 256 | lm loss: 3.678895E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.047 | TFLOPs: 26.19 | +7: iteration 136960/ 173500 | consumed samples: 35061760 | consumed tokens: 71806484480 | elapsed time per iteration (s): 0.15 | learning rate: 3.936E-05 | global batch size: 256 | lm loss: 3.671745E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.160 | TFLOPs: 26.19 | +7: iteration 136970/ 173500 | consumed samples: 35064320 | consumed tokens: 71811727360 | elapsed time per iteration (s): 0.16 | learning rate: 3.935E-05 | global batch size: 256 | lm loss: 3.668026E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.029 | TFLOPs: 25.66 | +7: iteration 136980/ 173500 | consumed samples: 35066880 | consumed tokens: 71816970240 | elapsed time per iteration (s): 0.15 | learning rate: 3.934E-05 | global batch size: 256 | lm loss: 3.664339E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.194 | TFLOPs: 26.19 | +7: iteration 136990/ 173500 | consumed samples: 35069440 | consumed tokens: 71822213120 | elapsed time per iteration (s): 0.15 | learning rate: 3.933E-05 | global batch size: 256 | lm loss: 3.665353E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.974 | TFLOPs: 26.19 | +7: iteration 137000/ 173500 | consumed samples: 35072000 | consumed tokens: 71827456000 | elapsed time per iteration (s): 0.15 | learning rate: 3.932E-05 | global batch size: 256 | lm loss: 3.676206E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.042 | TFLOPs: 25.91 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 137000 | lm loss value: 3.838572E+00 | lm loss PPL: 4.645908E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 137000 to checkpoints_44m91b100m +0: [2023-03-17 06:12:38,594] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step137000 is begin to save! +0: [2023-03-17 06:12:38,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:12:38,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:12:38,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:12:38,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:12:38,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:12:38,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:12:38,685] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:12:38,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:12:38,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:12:38,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:12:38,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:12:38,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:12:38,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:12:38,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:12:38,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:12:38,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:12:38,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:12:38,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:12:38,733] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:12:38,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:12:38,734] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step137000/mp_rank_00_model_states.pt +0: [2023-03-17 06:12:38,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:12:38,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:12:38,753] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:12:38,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:12:38,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +2: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +5: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +7: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +4: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +1: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +3: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:12:38,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:12:38,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +6: [2023-03-17 06:12:38,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:12:38,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step137000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:12:38,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step137000 is ready now! +0: successfully saved checkpoint at iteration 137000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 189.73 +7: iteration 137010/ 173500 | consumed samples: 35074560 | consumed tokens: 71832698880 | elapsed time per iteration (s): 0.18 | learning rate: 3.931E-05 | global batch size: 256 | lm loss: 3.674362E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1447.158 | TFLOPs: 22.70 | +7: iteration 137020/ 173500 | consumed samples: 35077120 | consumed tokens: 71837941760 | elapsed time per iteration (s): 0.15 | learning rate: 3.930E-05 | global batch size: 256 | lm loss: 3.672110E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.056 | TFLOPs: 26.17 | +7: iteration 137030/ 173500 | consumed samples: 35079680 | consumed tokens: 71843184640 | elapsed time per iteration (s): 0.15 | learning rate: 3.929E-05 | global batch size: 256 | lm loss: 3.666879E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.362 | TFLOPs: 26.18 | +7: iteration 137040/ 173500 | consumed samples: 35082240 | consumed tokens: 71848427520 | elapsed time per iteration (s): 0.15 | learning rate: 3.928E-05 | global batch size: 256 | lm loss: 3.666102E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.135 | TFLOPs: 26.22 | +7: iteration 137050/ 173500 | consumed samples: 35084800 | consumed tokens: 71853670400 | elapsed time per iteration (s): 0.15 | learning rate: 3.927E-05 | global batch size: 256 | lm loss: 3.659704E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.104 | TFLOPs: 26.24 | +7: iteration 137060/ 173500 | consumed samples: 35087360 | consumed tokens: 71858913280 | elapsed time per iteration (s): 0.15 | learning rate: 3.926E-05 | global batch size: 256 | lm loss: 3.668442E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.550 | TFLOPs: 26.23 | +7: iteration 137070/ 173500 | consumed samples: 35089920 | consumed tokens: 71864156160 | elapsed time per iteration (s): 0.15 | learning rate: 3.925E-05 | global batch size: 256 | lm loss: 3.665253E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.083 | TFLOPs: 26.33 | +7: iteration 137080/ 173500 | consumed samples: 35092480 | consumed tokens: 71869399040 | elapsed time per iteration (s): 0.15 | learning rate: 3.924E-05 | global batch size: 256 | lm loss: 3.665097E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.200 | TFLOPs: 26.33 | +7: iteration 137090/ 173500 | consumed samples: 35095040 | consumed tokens: 71874641920 | elapsed time per iteration (s): 0.15 | learning rate: 3.923E-05 | global batch size: 256 | lm loss: 3.659681E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.008 | TFLOPs: 26.00 | +7: iteration 137100/ 173500 | consumed samples: 35097600 | consumed tokens: 71879884800 | elapsed time per iteration (s): 0.15 | learning rate: 3.922E-05 | global batch size: 256 | lm loss: 3.671422E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.777 | TFLOPs: 26.37 | +7: iteration 137110/ 173500 | consumed samples: 35100160 | consumed tokens: 71885127680 | elapsed time per iteration (s): 0.15 | learning rate: 3.921E-05 | global batch size: 256 | lm loss: 3.669281E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.267 | TFLOPs: 26.35 | +7: iteration 137120/ 173500 | consumed samples: 35102720 | consumed tokens: 71890370560 | elapsed time per iteration (s): 0.15 | learning rate: 3.920E-05 | global batch size: 256 | lm loss: 3.672334E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.007 | TFLOPs: 26.36 | +7: iteration 137130/ 173500 | consumed samples: 35105280 | consumed tokens: 71895613440 | elapsed time per iteration (s): 0.16 | learning rate: 3.919E-05 | global batch size: 256 | lm loss: 3.667638E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.509 | TFLOPs: 25.15 | +7: iteration 137140/ 173500 | consumed samples: 35107840 | consumed tokens: 71900856320 | elapsed time per iteration (s): 0.15 | learning rate: 3.918E-05 | global batch size: 256 | lm loss: 3.668491E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.936 | TFLOPs: 26.33 | +7: iteration 137150/ 173500 | consumed samples: 35110400 | consumed tokens: 71906099200 | elapsed time per iteration (s): 0.15 | learning rate: 3.917E-05 | global batch size: 256 | lm loss: 3.657559E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.421 | TFLOPs: 26.35 | +7: iteration 137160/ 173500 | consumed samples: 35112960 | consumed tokens: 71911342080 | elapsed time per iteration (s): 0.15 | learning rate: 3.916E-05 | global batch size: 256 | lm loss: 3.673182E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.847 | TFLOPs: 26.33 | +7: iteration 137170/ 173500 | consumed samples: 35115520 | consumed tokens: 71916584960 | elapsed time per iteration (s): 0.15 | learning rate: 3.915E-05 | global batch size: 256 | lm loss: 3.672364E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.830 | TFLOPs: 26.33 | +7: iteration 137180/ 173500 | consumed samples: 35118080 | consumed tokens: 71921827840 | elapsed time per iteration (s): 0.15 | learning rate: 3.914E-05 | global batch size: 256 | lm loss: 3.661428E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.633 | TFLOPs: 26.34 | +7: iteration 137190/ 173500 | consumed samples: 35120640 | consumed tokens: 71927070720 | elapsed time per iteration (s): 0.15 | learning rate: 3.913E-05 | global batch size: 256 | lm loss: 3.667514E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.093 | TFLOPs: 26.29 | +7: iteration 137200/ 173500 | consumed samples: 35123200 | consumed tokens: 71932313600 | elapsed time per iteration (s): 0.15 | learning rate: 3.912E-05 | global batch size: 256 | lm loss: 3.665773E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.906 | TFLOPs: 26.20 | +7: iteration 137210/ 173500 | consumed samples: 35125760 | consumed tokens: 71937556480 | elapsed time per iteration (s): 0.15 | learning rate: 3.911E-05 | global batch size: 256 | lm loss: 3.682428E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.415 | TFLOPs: 25.95 | +7: iteration 137220/ 173500 | consumed samples: 35128320 | consumed tokens: 71942799360 | elapsed time per iteration (s): 0.16 | learning rate: 3.910E-05 | global batch size: 256 | lm loss: 3.668293E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.674 | TFLOPs: 25.89 | +7: iteration 137230/ 173500 | consumed samples: 35130880 | consumed tokens: 71948042240 | elapsed time per iteration (s): 0.15 | learning rate: 3.909E-05 | global batch size: 256 | lm loss: 3.664267E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.196 | TFLOPs: 26.19 | +7: iteration 137240/ 173500 | consumed samples: 35133440 | consumed tokens: 71953285120 | elapsed time per iteration (s): 0.15 | learning rate: 3.908E-05 | global batch size: 256 | lm loss: 3.670428E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.227 | TFLOPs: 26.16 | +7: iteration 137250/ 173500 | consumed samples: 35136000 | consumed tokens: 71958528000 | elapsed time per iteration (s): 0.15 | learning rate: 3.907E-05 | global batch size: 256 | lm loss: 3.679505E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.860 | TFLOPs: 26.19 | +7: iteration 137260/ 173500 | consumed samples: 35138560 | consumed tokens: 71963770880 | elapsed time per iteration (s): 0.15 | learning rate: 3.906E-05 | global batch size: 256 | lm loss: 3.662993E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.178 | TFLOPs: 26.18 | +7: iteration 137270/ 173500 | consumed samples: 35141120 | consumed tokens: 71969013760 | elapsed time per iteration (s): 0.15 | learning rate: 3.905E-05 | global batch size: 256 | lm loss: 3.670333E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.929 | TFLOPs: 26.19 | +7: iteration 137280/ 173500 | consumed samples: 35143680 | consumed tokens: 71974256640 | elapsed time per iteration (s): 0.15 | learning rate: 3.904E-05 | global batch size: 256 | lm loss: 3.674505E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.869 | TFLOPs: 26.19 | +7: iteration 137290/ 173500 | consumed samples: 35146240 | consumed tokens: 71979499520 | elapsed time per iteration (s): 0.15 | learning rate: 3.903E-05 | global batch size: 256 | lm loss: 3.676776E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.750 | TFLOPs: 26.19 | +7: iteration 137300/ 173500 | consumed samples: 35148800 | consumed tokens: 71984742400 | elapsed time per iteration (s): 0.15 | learning rate: 3.902E-05 | global batch size: 256 | lm loss: 3.675214E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.146 | TFLOPs: 26.16 | +7: iteration 137310/ 173500 | consumed samples: 35151360 | consumed tokens: 71989985280 | elapsed time per iteration (s): 0.15 | learning rate: 3.901E-05 | global batch size: 256 | lm loss: 3.667052E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.748 | TFLOPs: 26.19 | +7: iteration 137320/ 173500 | consumed samples: 35153920 | consumed tokens: 71995228160 | elapsed time per iteration (s): 0.15 | learning rate: 3.900E-05 | global batch size: 256 | lm loss: 3.672733E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.142 | TFLOPs: 26.14 | +7: iteration 137330/ 173500 | consumed samples: 35156480 | consumed tokens: 72000471040 | elapsed time per iteration (s): 0.15 | learning rate: 3.899E-05 | global batch size: 256 | lm loss: 3.667871E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.188 | TFLOPs: 26.16 | +7: iteration 137340/ 173500 | consumed samples: 35159040 | consumed tokens: 72005713920 | elapsed time per iteration (s): 0.15 | learning rate: 3.898E-05 | global batch size: 256 | lm loss: 3.667274E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.740 | TFLOPs: 26.14 | +7: iteration 137350/ 173500 | consumed samples: 35161600 | consumed tokens: 72010956800 | elapsed time per iteration (s): 0.15 | learning rate: 3.897E-05 | global batch size: 256 | lm loss: 3.677498E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.967 | TFLOPs: 26.13 | +7: iteration 137360/ 173500 | consumed samples: 35164160 | consumed tokens: 72016199680 | elapsed time per iteration (s): 0.15 | learning rate: 3.896E-05 | global batch size: 256 | lm loss: 3.677141E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.075 | TFLOPs: 26.14 | +7: iteration 137370/ 173500 | consumed samples: 35166720 | consumed tokens: 72021442560 | elapsed time per iteration (s): 0.15 | learning rate: 3.895E-05 | global batch size: 256 | lm loss: 3.669753E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.415 | TFLOPs: 26.16 | +7: iteration 137380/ 173500 | consumed samples: 35169280 | consumed tokens: 72026685440 | elapsed time per iteration (s): 0.15 | learning rate: 3.894E-05 | global batch size: 256 | lm loss: 3.675714E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.947 | TFLOPs: 26.16 | +7: iteration 137390/ 173500 | consumed samples: 35171840 | consumed tokens: 72031928320 | elapsed time per iteration (s): 0.15 | learning rate: 3.893E-05 | global batch size: 256 | lm loss: 3.671536E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.527 | TFLOPs: 26.15 | +7: iteration 137400/ 173500 | consumed samples: 35174400 | consumed tokens: 72037171200 | elapsed time per iteration (s): 0.15 | learning rate: 3.892E-05 | global batch size: 256 | lm loss: 3.673210E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.595 | TFLOPs: 26.17 | +7: iteration 137410/ 173500 | consumed samples: 35176960 | consumed tokens: 72042414080 | elapsed time per iteration (s): 0.15 | learning rate: 3.891E-05 | global batch size: 256 | lm loss: 3.674500E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.635 | TFLOPs: 26.12 | +7: iteration 137420/ 173500 | consumed samples: 35179520 | consumed tokens: 72047656960 | elapsed time per iteration (s): 0.15 | learning rate: 3.890E-05 | global batch size: 256 | lm loss: 3.670453E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.989 | TFLOPs: 26.17 | +7: iteration 137430/ 173500 | consumed samples: 35182080 | consumed tokens: 72052899840 | elapsed time per iteration (s): 0.15 | learning rate: 3.889E-05 | global batch size: 256 | lm loss: 3.662966E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.962 | TFLOPs: 26.14 | +7: iteration 137440/ 173500 | consumed samples: 35184640 | consumed tokens: 72058142720 | elapsed time per iteration (s): 0.15 | learning rate: 3.888E-05 | global batch size: 256 | lm loss: 3.675603E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.150 | TFLOPs: 26.16 | +7: iteration 137450/ 173500 | consumed samples: 35187200 | consumed tokens: 72063385600 | elapsed time per iteration (s): 0.15 | learning rate: 3.887E-05 | global batch size: 256 | lm loss: 3.659600E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.749 | TFLOPs: 26.14 | +7: iteration 137460/ 173500 | consumed samples: 35189760 | consumed tokens: 72068628480 | elapsed time per iteration (s): 0.15 | learning rate: 3.886E-05 | global batch size: 256 | lm loss: 3.674117E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.459 | TFLOPs: 26.32 | +7: iteration 137470/ 173500 | consumed samples: 35192320 | consumed tokens: 72073871360 | elapsed time per iteration (s): 0.15 | learning rate: 3.885E-05 | global batch size: 256 | lm loss: 3.677390E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.243 | TFLOPs: 26.33 | +7: iteration 137480/ 173500 | consumed samples: 35194880 | consumed tokens: 72079114240 | elapsed time per iteration (s): 0.15 | learning rate: 3.884E-05 | global batch size: 256 | lm loss: 3.684682E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.087 | TFLOPs: 26.13 | +7: iteration 137490/ 173500 | consumed samples: 35197440 | consumed tokens: 72084357120 | elapsed time per iteration (s): 0.15 | learning rate: 3.883E-05 | global batch size: 256 | lm loss: 3.673537E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.516 | TFLOPs: 26.10 | +7: iteration 137500/ 173500 | consumed samples: 35200000 | consumed tokens: 72089600000 | elapsed time per iteration (s): 0.15 | learning rate: 3.882E-05 | global batch size: 256 | lm loss: 3.678458E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.188 | TFLOPs: 26.11 | +7: iteration 137510/ 173500 | consumed samples: 35202560 | consumed tokens: 72094842880 | elapsed time per iteration (s): 0.15 | learning rate: 3.881E-05 | global batch size: 256 | lm loss: 3.665675E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.146 | TFLOPs: 26.08 | +7: iteration 137520/ 173500 | consumed samples: 35205120 | consumed tokens: 72100085760 | elapsed time per iteration (s): 0.15 | learning rate: 3.880E-05 | global batch size: 256 | lm loss: 3.678944E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.235 | TFLOPs: 26.10 | +7: iteration 137530/ 173500 | consumed samples: 35207680 | consumed tokens: 72105328640 | elapsed time per iteration (s): 0.16 | learning rate: 3.879E-05 | global batch size: 256 | lm loss: 3.680086E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.002 | TFLOPs: 25.50 | +7: iteration 137540/ 173500 | consumed samples: 35210240 | consumed tokens: 72110571520 | elapsed time per iteration (s): 0.15 | learning rate: 3.878E-05 | global batch size: 256 | lm loss: 3.671252E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.715 | TFLOPs: 26.33 | +7: iteration 137550/ 173500 | consumed samples: 35212800 | consumed tokens: 72115814400 | elapsed time per iteration (s): 0.15 | learning rate: 3.876E-05 | global batch size: 256 | lm loss: 3.674020E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.817 | TFLOPs: 26.33 | +7: iteration 137560/ 173500 | consumed samples: 35215360 | consumed tokens: 72121057280 | elapsed time per iteration (s): 0.15 | learning rate: 3.875E-05 | global batch size: 256 | lm loss: 3.671515E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.222 | TFLOPs: 26.33 | +7: iteration 137570/ 173500 | consumed samples: 35217920 | consumed tokens: 72126300160 | elapsed time per iteration (s): 0.15 | learning rate: 3.874E-05 | global batch size: 256 | lm loss: 3.681134E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.781 | TFLOPs: 26.33 | +7: iteration 137580/ 173500 | consumed samples: 35220480 | consumed tokens: 72131543040 | elapsed time per iteration (s): 0.15 | learning rate: 3.873E-05 | global batch size: 256 | lm loss: 3.668449E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.533 | TFLOPs: 26.34 | +7: iteration 137590/ 173500 | consumed samples: 35223040 | consumed tokens: 72136785920 | elapsed time per iteration (s): 0.15 | learning rate: 3.872E-05 | global batch size: 256 | lm loss: 3.662060E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.371 | TFLOPs: 26.31 | +7: iteration 137600/ 173500 | consumed samples: 35225600 | consumed tokens: 72142028800 | elapsed time per iteration (s): 0.15 | learning rate: 3.871E-05 | global batch size: 256 | lm loss: 3.661872E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.218 | TFLOPs: 26.27 | +7: iteration 137610/ 173500 | consumed samples: 35228160 | consumed tokens: 72147271680 | elapsed time per iteration (s): 0.16 | learning rate: 3.870E-05 | global batch size: 256 | lm loss: 3.675677E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.170 | TFLOPs: 25.13 | +7: iteration 137620/ 173500 | consumed samples: 35230720 | consumed tokens: 72152514560 | elapsed time per iteration (s): 0.15 | learning rate: 3.869E-05 | global batch size: 256 | lm loss: 3.673364E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.216 | TFLOPs: 26.19 | +7: iteration 137630/ 173500 | consumed samples: 35233280 | consumed tokens: 72157757440 | elapsed time per iteration (s): 0.15 | learning rate: 3.868E-05 | global batch size: 256 | lm loss: 3.665054E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.259 | TFLOPs: 26.21 | +7: iteration 137640/ 173500 | consumed samples: 35235840 | consumed tokens: 72163000320 | elapsed time per iteration (s): 0.15 | learning rate: 3.867E-05 | global batch size: 256 | lm loss: 3.675657E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.987 | TFLOPs: 26.19 | +7: iteration 137650/ 173500 | consumed samples: 35238400 | consumed tokens: 72168243200 | elapsed time per iteration (s): 0.15 | learning rate: 3.866E-05 | global batch size: 256 | lm loss: 3.674419E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.969 | TFLOPs: 26.25 | +7: iteration 137660/ 173500 | consumed samples: 35240960 | consumed tokens: 72173486080 | elapsed time per iteration (s): 0.15 | learning rate: 3.865E-05 | global batch size: 256 | lm loss: 3.659568E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.875 | TFLOPs: 26.33 | +7: iteration 137670/ 173500 | consumed samples: 35243520 | consumed tokens: 72178728960 | elapsed time per iteration (s): 0.15 | learning rate: 3.864E-05 | global batch size: 256 | lm loss: 3.677814E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.388 | TFLOPs: 26.32 | +7: iteration 137680/ 173500 | consumed samples: 35246080 | consumed tokens: 72183971840 | elapsed time per iteration (s): 0.15 | learning rate: 3.863E-05 | global batch size: 256 | lm loss: 3.674688E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.945 | TFLOPs: 26.31 | +7: iteration 137690/ 173500 | consumed samples: 35248640 | consumed tokens: 72189214720 | elapsed time per iteration (s): 0.15 | learning rate: 3.862E-05 | global batch size: 256 | lm loss: 3.673908E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.602 | TFLOPs: 26.31 | +7: iteration 137700/ 173500 | consumed samples: 35251200 | consumed tokens: 72194457600 | elapsed time per iteration (s): 0.15 | learning rate: 3.861E-05 | global batch size: 256 | lm loss: 3.676444E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.072 | TFLOPs: 26.25 | +7: iteration 137710/ 173500 | consumed samples: 35253760 | consumed tokens: 72199700480 | elapsed time per iteration (s): 0.15 | learning rate: 3.860E-05 | global batch size: 256 | lm loss: 3.675488E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.749 | TFLOPs: 26.28 | +7: iteration 137720/ 173500 | consumed samples: 35256320 | consumed tokens: 72204943360 | elapsed time per iteration (s): 0.15 | learning rate: 3.859E-05 | global batch size: 256 | lm loss: 3.663774E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.441 | TFLOPs: 26.28 | +7: iteration 137730/ 173500 | consumed samples: 35258880 | consumed tokens: 72210186240 | elapsed time per iteration (s): 0.15 | learning rate: 3.858E-05 | global batch size: 256 | lm loss: 3.678357E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.273 | TFLOPs: 26.27 | +7: iteration 137740/ 173500 | consumed samples: 35261440 | consumed tokens: 72215429120 | elapsed time per iteration (s): 0.15 | learning rate: 3.857E-05 | global batch size: 256 | lm loss: 3.671146E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.773 | TFLOPs: 26.31 | +7: iteration 137750/ 173500 | consumed samples: 35264000 | consumed tokens: 72220672000 | elapsed time per iteration (s): 0.15 | learning rate: 3.856E-05 | global batch size: 256 | lm loss: 3.659453E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.613 | TFLOPs: 26.31 | +7: iteration 137760/ 173500 | consumed samples: 35266560 | consumed tokens: 72225914880 | elapsed time per iteration (s): 0.15 | learning rate: 3.855E-05 | global batch size: 256 | lm loss: 3.686808E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.960 | TFLOPs: 26.25 | +7: iteration 137770/ 173500 | consumed samples: 35269120 | consumed tokens: 72231157760 | elapsed time per iteration (s): 0.15 | learning rate: 3.854E-05 | global batch size: 256 | lm loss: 3.673111E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.271 | TFLOPs: 26.26 | +7: iteration 137780/ 173500 | consumed samples: 35271680 | consumed tokens: 72236400640 | elapsed time per iteration (s): 0.15 | learning rate: 3.853E-05 | global batch size: 256 | lm loss: 3.669261E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.832 | TFLOPs: 26.27 | +7: iteration 137790/ 173500 | consumed samples: 35274240 | consumed tokens: 72241643520 | elapsed time per iteration (s): 0.15 | learning rate: 3.852E-05 | global batch size: 256 | lm loss: 3.673314E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.211 | TFLOPs: 26.04 | +7: iteration 137800/ 173500 | consumed samples: 35276800 | consumed tokens: 72246886400 | elapsed time per iteration (s): 0.15 | learning rate: 3.851E-05 | global batch size: 256 | lm loss: 3.670336E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.467 | TFLOPs: 26.12 | +7: iteration 137810/ 173500 | consumed samples: 35279360 | consumed tokens: 72252129280 | elapsed time per iteration (s): 0.15 | learning rate: 3.850E-05 | global batch size: 256 | lm loss: 3.676370E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.597 | TFLOPs: 26.32 | +7: iteration 137820/ 173500 | consumed samples: 35281920 | consumed tokens: 72257372160 | elapsed time per iteration (s): 0.16 | learning rate: 3.849E-05 | global batch size: 256 | lm loss: 3.662133E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.886 | TFLOPs: 25.87 | +7: iteration 137830/ 173500 | consumed samples: 35284480 | consumed tokens: 72262615040 | elapsed time per iteration (s): 0.15 | learning rate: 3.848E-05 | global batch size: 256 | lm loss: 3.657095E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.939 | TFLOPs: 26.30 | +7: iteration 137840/ 173500 | consumed samples: 35287040 | consumed tokens: 72267857920 | elapsed time per iteration (s): 0.15 | learning rate: 3.847E-05 | global batch size: 256 | lm loss: 3.667348E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.327 | TFLOPs: 26.32 | +7: iteration 137850/ 173500 | consumed samples: 35289600 | consumed tokens: 72273100800 | elapsed time per iteration (s): 0.15 | learning rate: 3.846E-05 | global batch size: 256 | lm loss: 3.663748E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.871 | TFLOPs: 26.34 | +7: iteration 137860/ 173500 | consumed samples: 35292160 | consumed tokens: 72278343680 | elapsed time per iteration (s): 0.15 | learning rate: 3.845E-05 | global batch size: 256 | lm loss: 3.671379E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.881 | TFLOPs: 26.34 | +7: iteration 137870/ 173500 | consumed samples: 35294720 | consumed tokens: 72283586560 | elapsed time per iteration (s): 0.15 | learning rate: 3.844E-05 | global batch size: 256 | lm loss: 3.673064E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.935 | TFLOPs: 26.30 | +7: iteration 137880/ 173500 | consumed samples: 35297280 | consumed tokens: 72288829440 | elapsed time per iteration (s): 0.15 | learning rate: 3.843E-05 | global batch size: 256 | lm loss: 3.670126E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.971 | TFLOPs: 26.36 | +7: iteration 137890/ 173500 | consumed samples: 35299840 | consumed tokens: 72294072320 | elapsed time per iteration (s): 0.15 | learning rate: 3.842E-05 | global batch size: 256 | lm loss: 3.684219E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.108 | TFLOPs: 26.35 | +7: iteration 137900/ 173500 | consumed samples: 35302400 | consumed tokens: 72299315200 | elapsed time per iteration (s): 0.15 | learning rate: 3.841E-05 | global batch size: 256 | lm loss: 3.677357E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.864 | TFLOPs: 26.34 | +7: iteration 137910/ 173500 | consumed samples: 35304960 | consumed tokens: 72304558080 | elapsed time per iteration (s): 0.15 | learning rate: 3.840E-05 | global batch size: 256 | lm loss: 3.663832E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.139 | TFLOPs: 26.07 | +7: iteration 137920/ 173500 | consumed samples: 35307520 | consumed tokens: 72309800960 | elapsed time per iteration (s): 0.15 | learning rate: 3.839E-05 | global batch size: 256 | lm loss: 3.669556E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.511 | TFLOPs: 25.96 | +7: iteration 137930/ 173500 | consumed samples: 35310080 | consumed tokens: 72315043840 | elapsed time per iteration (s): 0.15 | learning rate: 3.838E-05 | global batch size: 256 | lm loss: 3.678813E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.725 | TFLOPs: 26.33 | +7: iteration 137940/ 173500 | consumed samples: 35312640 | consumed tokens: 72320286720 | elapsed time per iteration (s): 0.15 | learning rate: 3.837E-05 | global batch size: 256 | lm loss: 3.679763E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.682 | TFLOPs: 26.31 | +7: iteration 137950/ 173500 | consumed samples: 35315200 | consumed tokens: 72325529600 | elapsed time per iteration (s): 0.15 | learning rate: 3.836E-05 | global batch size: 256 | lm loss: 3.666693E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.295 | TFLOPs: 26.29 | +7: iteration 137960/ 173500 | consumed samples: 35317760 | consumed tokens: 72330772480 | elapsed time per iteration (s): 0.15 | learning rate: 3.835E-05 | global batch size: 256 | lm loss: 3.667123E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.575 | TFLOPs: 26.20 | +7: iteration 137970/ 173500 | consumed samples: 35320320 | consumed tokens: 72336015360 | elapsed time per iteration (s): 0.15 | learning rate: 3.834E-05 | global batch size: 256 | lm loss: 3.665920E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.022 | TFLOPs: 26.10 | +7: iteration 137980/ 173500 | consumed samples: 35322880 | consumed tokens: 72341258240 | elapsed time per iteration (s): 0.15 | learning rate: 3.833E-05 | global batch size: 256 | lm loss: 3.660194E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.442 | TFLOPs: 26.23 | +7: iteration 137990/ 173500 | consumed samples: 35325440 | consumed tokens: 72346501120 | elapsed time per iteration (s): 0.15 | learning rate: 3.832E-05 | global batch size: 256 | lm loss: 3.664896E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.561 | TFLOPs: 26.23 | +0: [2023-03-17 06:15:12,062] [INFO] [logging.py:68:log_dist] [Rank 0] step=138000, skipped=0, lr=[3.831464022325417e-05, 3.831464022325417e-05, 3.831464022325417e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 138000/ 173500 | consumed samples: 35328000 | consumed tokens: 72351744000 | elapsed time per iteration (s): 0.15 | learning rate: 3.831E-05 | global batch size: 256 | lm loss: 3.665657E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.491 | TFLOPs: 26.06 | +0: steps: 138000 loss: 3.6454 iter time (s): 0.153 samples/sec: 1676.755 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 138000 | lm loss value: 3.803340E+00 | lm loss PPL: 4.485072E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 138000 to checkpoints_44m91b100m +0: [2023-03-17 06:15:12,134] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step138000 is begin to save! +0: [2023-03-17 06:15:12,137] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:15:12,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:15:12,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:15:12,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:15:12,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:15:12,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:15:12,224] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:15:12,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:15:12,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:15:12,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:15:12,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:15:12,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:15:12,249] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:15:12,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:15:12,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:15:12,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:15:12,265] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:15:12,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:15:12,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:15:12,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:15:12,274] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step138000/mp_rank_00_model_states.pt +0: [2023-03-17 06:15:12,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:15:12,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:15:12,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:15:12,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:15:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +6: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +5: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +1: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +7: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +2: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +4: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:15:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:15:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: [2023-03-17 06:15:12,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:15:12,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:15:12,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step138000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +3: [2023-03-17 06:15:12,342] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step138000 is ready now! +0: successfully saved checkpoint at iteration 138000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 213.27 +7: iteration 138010/ 173500 | consumed samples: 35330560 | consumed tokens: 72356986880 | elapsed time per iteration (s): 0.19 | learning rate: 3.830E-05 | global batch size: 256 | lm loss: 3.663199E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.076 | TFLOPs: 21.27 | +7: iteration 138020/ 173500 | consumed samples: 35333120 | consumed tokens: 72362229760 | elapsed time per iteration (s): 0.15 | learning rate: 3.829E-05 | global batch size: 256 | lm loss: 3.669181E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.786 | TFLOPs: 26.17 | +7: iteration 138030/ 173500 | consumed samples: 35335680 | consumed tokens: 72367472640 | elapsed time per iteration (s): 0.15 | learning rate: 3.828E-05 | global batch size: 256 | lm loss: 3.671786E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.149 | TFLOPs: 26.11 | +7: iteration 138040/ 173500 | consumed samples: 35338240 | consumed tokens: 72372715520 | elapsed time per iteration (s): 0.15 | learning rate: 3.827E-05 | global batch size: 256 | lm loss: 3.658488E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.149 | TFLOPs: 26.29 | +7: iteration 138050/ 173500 | consumed samples: 35340800 | consumed tokens: 72377958400 | elapsed time per iteration (s): 0.15 | learning rate: 3.826E-05 | global batch size: 256 | lm loss: 3.673431E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.582 | TFLOPs: 26.31 | +7: iteration 138060/ 173500 | consumed samples: 35343360 | consumed tokens: 72383201280 | elapsed time per iteration (s): 0.15 | learning rate: 3.825E-05 | global batch size: 256 | lm loss: 3.658201E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.703 | TFLOPs: 26.33 | +7: iteration 138070/ 173500 | consumed samples: 35345920 | consumed tokens: 72388444160 | elapsed time per iteration (s): 0.15 | learning rate: 3.825E-05 | global batch size: 256 | lm loss: 3.681617E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.752 | TFLOPs: 26.33 | +7: iteration 138080/ 173500 | consumed samples: 35348480 | consumed tokens: 72393687040 | elapsed time per iteration (s): 0.15 | learning rate: 3.824E-05 | global batch size: 256 | lm loss: 3.673303E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.564 | TFLOPs: 26.04 | +7: iteration 138090/ 173500 | consumed samples: 35351040 | consumed tokens: 72398929920 | elapsed time per iteration (s): 0.15 | learning rate: 3.823E-05 | global batch size: 256 | lm loss: 3.669812E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.042 | TFLOPs: 26.28 | +7: iteration 138100/ 173500 | consumed samples: 35353600 | consumed tokens: 72404172800 | elapsed time per iteration (s): 0.15 | learning rate: 3.822E-05 | global batch size: 256 | lm loss: 3.678514E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.192 | TFLOPs: 26.30 | +7: iteration 138110/ 173500 | consumed samples: 35356160 | consumed tokens: 72409415680 | elapsed time per iteration (s): 0.15 | learning rate: 3.821E-05 | global batch size: 256 | lm loss: 3.677243E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.585 | TFLOPs: 26.15 | +7: iteration 138120/ 173500 | consumed samples: 35358720 | consumed tokens: 72414658560 | elapsed time per iteration (s): 0.15 | learning rate: 3.820E-05 | global batch size: 256 | lm loss: 3.677662E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.917 | TFLOPs: 26.06 | +7: iteration 138130/ 173500 | consumed samples: 35361280 | consumed tokens: 72419901440 | elapsed time per iteration (s): 0.15 | learning rate: 3.819E-05 | global batch size: 256 | lm loss: 3.670968E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.458 | TFLOPs: 26.15 | +7: iteration 138140/ 173500 | consumed samples: 35363840 | consumed tokens: 72425144320 | elapsed time per iteration (s): 0.15 | learning rate: 3.818E-05 | global batch size: 256 | lm loss: 3.663465E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.826 | TFLOPs: 26.16 | +7: iteration 138150/ 173500 | consumed samples: 35366400 | consumed tokens: 72430387200 | elapsed time per iteration (s): 0.15 | learning rate: 3.817E-05 | global batch size: 256 | lm loss: 3.658458E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.873 | TFLOPs: 26.13 | +7: iteration 138160/ 173500 | consumed samples: 35368960 | consumed tokens: 72435630080 | elapsed time per iteration (s): 0.15 | learning rate: 3.816E-05 | global batch size: 256 | lm loss: 3.654779E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.961 | TFLOPs: 26.10 | +7: iteration 138170/ 173500 | consumed samples: 35371520 | consumed tokens: 72440872960 | elapsed time per iteration (s): 0.15 | learning rate: 3.815E-05 | global batch size: 256 | lm loss: 3.674144E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.215 | TFLOPs: 26.11 | +7: iteration 138180/ 173500 | consumed samples: 35374080 | consumed tokens: 72446115840 | elapsed time per iteration (s): 0.15 | learning rate: 3.814E-05 | global batch size: 256 | lm loss: 3.651020E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.068 | TFLOPs: 26.11 | +7: iteration 138190/ 173500 | consumed samples: 35376640 | consumed tokens: 72451358720 | elapsed time per iteration (s): 0.15 | learning rate: 3.813E-05 | global batch size: 256 | lm loss: 3.668416E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.963 | TFLOPs: 26.11 | +7: iteration 138200/ 173500 | consumed samples: 35379200 | consumed tokens: 72456601600 | elapsed time per iteration (s): 0.16 | learning rate: 3.812E-05 | global batch size: 256 | lm loss: 3.672812E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.126 | TFLOPs: 25.78 | +7: iteration 138210/ 173500 | consumed samples: 35381760 | consumed tokens: 72461844480 | elapsed time per iteration (s): 0.15 | learning rate: 3.811E-05 | global batch size: 256 | lm loss: 3.674209E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.180 | TFLOPs: 26.22 | +7: iteration 138220/ 173500 | consumed samples: 35384320 | consumed tokens: 72467087360 | elapsed time per iteration (s): 0.15 | learning rate: 3.810E-05 | global batch size: 256 | lm loss: 3.679161E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.190 | TFLOPs: 26.26 | +7: iteration 138230/ 173500 | consumed samples: 35386880 | consumed tokens: 72472330240 | elapsed time per iteration (s): 0.15 | learning rate: 3.809E-05 | global batch size: 256 | lm loss: 3.673600E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.879 | TFLOPs: 26.16 | +7: iteration 138240/ 173500 | consumed samples: 35389440 | consumed tokens: 72477573120 | elapsed time per iteration (s): 0.15 | learning rate: 3.808E-05 | global batch size: 256 | lm loss: 3.657751E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.893 | TFLOPs: 26.20 | +7: iteration 138250/ 173500 | consumed samples: 35392000 | consumed tokens: 72482816000 | elapsed time per iteration (s): 0.15 | learning rate: 3.807E-05 | global batch size: 256 | lm loss: 3.666015E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.090 | TFLOPs: 26.25 | +7: iteration 138260/ 173500 | consumed samples: 35394560 | consumed tokens: 72488058880 | elapsed time per iteration (s): 0.15 | learning rate: 3.806E-05 | global batch size: 256 | lm loss: 3.677688E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.031 | TFLOPs: 26.21 | +7: iteration 138270/ 173500 | consumed samples: 35397120 | consumed tokens: 72493301760 | elapsed time per iteration (s): 0.15 | learning rate: 3.805E-05 | global batch size: 256 | lm loss: 3.655259E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.719 | TFLOPs: 26.19 | +7: iteration 138280/ 173500 | consumed samples: 35399680 | consumed tokens: 72498544640 | elapsed time per iteration (s): 0.15 | learning rate: 3.804E-05 | global batch size: 256 | lm loss: 3.680178E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.098 | TFLOPs: 26.25 | +7: iteration 138290/ 173500 | consumed samples: 35402240 | consumed tokens: 72503787520 | elapsed time per iteration (s): 0.15 | learning rate: 3.803E-05 | global batch size: 256 | lm loss: 3.668662E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.481 | TFLOPs: 26.21 | +7: iteration 138300/ 173500 | consumed samples: 35404800 | consumed tokens: 72509030400 | elapsed time per iteration (s): 0.15 | learning rate: 3.802E-05 | global batch size: 256 | lm loss: 3.670786E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.658 | TFLOPs: 26.17 | +7: iteration 138310/ 173500 | consumed samples: 35407360 | consumed tokens: 72514273280 | elapsed time per iteration (s): 0.15 | learning rate: 3.801E-05 | global batch size: 256 | lm loss: 3.695825E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.910 | TFLOPs: 26.14 | +7: iteration 138320/ 173500 | consumed samples: 35409920 | consumed tokens: 72519516160 | elapsed time per iteration (s): 0.15 | learning rate: 3.800E-05 | global batch size: 256 | lm loss: 3.669685E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.480 | TFLOPs: 26.15 | +7: iteration 138330/ 173500 | consumed samples: 35412480 | consumed tokens: 72524759040 | elapsed time per iteration (s): 0.15 | learning rate: 3.799E-05 | global batch size: 256 | lm loss: 3.668716E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.389 | TFLOPs: 26.23 | +7: iteration 138340/ 173500 | consumed samples: 35415040 | consumed tokens: 72530001920 | elapsed time per iteration (s): 0.15 | learning rate: 3.798E-05 | global batch size: 256 | lm loss: 3.659348E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.630 | TFLOPs: 26.25 | +7: iteration 138350/ 173500 | consumed samples: 35417600 | consumed tokens: 72535244800 | elapsed time per iteration (s): 0.15 | learning rate: 3.797E-05 | global batch size: 256 | lm loss: 3.664178E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.032 | TFLOPs: 26.21 | +7: iteration 138360/ 173500 | consumed samples: 35420160 | consumed tokens: 72540487680 | elapsed time per iteration (s): 0.15 | learning rate: 3.796E-05 | global batch size: 256 | lm loss: 3.651907E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.131 | TFLOPs: 26.21 | +7: iteration 138370/ 173500 | consumed samples: 35422720 | consumed tokens: 72545730560 | elapsed time per iteration (s): 0.15 | learning rate: 3.795E-05 | global batch size: 256 | lm loss: 3.683154E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.836 | TFLOPs: 26.25 | +7: iteration 138380/ 173500 | consumed samples: 35425280 | consumed tokens: 72550973440 | elapsed time per iteration (s): 0.15 | learning rate: 3.794E-05 | global batch size: 256 | lm loss: 3.673732E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.570 | TFLOPs: 26.17 | +7: iteration 138390/ 173500 | consumed samples: 35427840 | consumed tokens: 72556216320 | elapsed time per iteration (s): 0.15 | learning rate: 3.793E-05 | global batch size: 256 | lm loss: 3.673436E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.455 | TFLOPs: 26.20 | +7: iteration 138400/ 173500 | consumed samples: 35430400 | consumed tokens: 72561459200 | elapsed time per iteration (s): 0.15 | learning rate: 3.792E-05 | global batch size: 256 | lm loss: 3.670433E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.715 | TFLOPs: 26.26 | +7: iteration 138410/ 173500 | consumed samples: 35432960 | consumed tokens: 72566702080 | elapsed time per iteration (s): 0.15 | learning rate: 3.791E-05 | global batch size: 256 | lm loss: 3.663968E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.040 | TFLOPs: 26.22 | +7: iteration 138420/ 173500 | consumed samples: 35435520 | consumed tokens: 72571944960 | elapsed time per iteration (s): 0.15 | learning rate: 3.790E-05 | global batch size: 256 | lm loss: 3.657848E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.217 | TFLOPs: 26.24 | +7: iteration 138430/ 173500 | consumed samples: 35438080 | consumed tokens: 72577187840 | elapsed time per iteration (s): 0.15 | learning rate: 3.789E-05 | global batch size: 256 | lm loss: 3.669430E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.464 | TFLOPs: 26.06 | +7: iteration 138440/ 173500 | consumed samples: 35440640 | consumed tokens: 72582430720 | elapsed time per iteration (s): 0.15 | learning rate: 3.788E-05 | global batch size: 256 | lm loss: 3.671874E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.485 | TFLOPs: 26.20 | +7: iteration 138450/ 173500 | consumed samples: 35443200 | consumed tokens: 72587673600 | elapsed time per iteration (s): 0.15 | learning rate: 3.787E-05 | global batch size: 256 | lm loss: 3.673026E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.089 | TFLOPs: 26.19 | +7: iteration 138460/ 173500 | consumed samples: 35445760 | consumed tokens: 72592916480 | elapsed time per iteration (s): 0.15 | learning rate: 3.786E-05 | global batch size: 256 | lm loss: 3.669155E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.387 | TFLOPs: 26.12 | +7: iteration 138470/ 173500 | consumed samples: 35448320 | consumed tokens: 72598159360 | elapsed time per iteration (s): 0.15 | learning rate: 3.785E-05 | global batch size: 256 | lm loss: 3.688818E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.577 | TFLOPs: 26.09 | +7: iteration 138480/ 173500 | consumed samples: 35450880 | consumed tokens: 72603402240 | elapsed time per iteration (s): 0.15 | learning rate: 3.784E-05 | global batch size: 256 | lm loss: 3.647321E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.621 | TFLOPs: 26.07 | +7: iteration 138490/ 173500 | consumed samples: 35453440 | consumed tokens: 72608645120 | elapsed time per iteration (s): 0.15 | learning rate: 3.783E-05 | global batch size: 256 | lm loss: 3.659177E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.140 | TFLOPs: 26.10 | +7: iteration 138500/ 173500 | consumed samples: 35456000 | consumed tokens: 72613888000 | elapsed time per iteration (s): 0.15 | learning rate: 3.782E-05 | global batch size: 256 | lm loss: 3.674507E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.192 | TFLOPs: 26.08 | +7: iteration 138510/ 173500 | consumed samples: 35458560 | consumed tokens: 72619130880 | elapsed time per iteration (s): 0.15 | learning rate: 3.781E-05 | global batch size: 256 | lm loss: 3.671428E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.064 | TFLOPs: 26.14 | +7: iteration 138520/ 173500 | consumed samples: 35461120 | consumed tokens: 72624373760 | elapsed time per iteration (s): 0.15 | learning rate: 3.780E-05 | global batch size: 256 | lm loss: 3.663358E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.136 | TFLOPs: 26.19 | +7: iteration 138530/ 173500 | consumed samples: 35463680 | consumed tokens: 72629616640 | elapsed time per iteration (s): 0.15 | learning rate: 3.779E-05 | global batch size: 256 | lm loss: 3.665771E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.648 | TFLOPs: 26.18 | +7: iteration 138540/ 173500 | consumed samples: 35466240 | consumed tokens: 72634859520 | elapsed time per iteration (s): 0.15 | learning rate: 3.778E-05 | global batch size: 256 | lm loss: 3.684527E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.284 | TFLOPs: 26.21 | +7: iteration 138550/ 173500 | consumed samples: 35468800 | consumed tokens: 72640102400 | elapsed time per iteration (s): 0.15 | learning rate: 3.777E-05 | global batch size: 256 | lm loss: 3.663294E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.260 | TFLOPs: 26.19 | +7: iteration 138560/ 173500 | consumed samples: 35471360 | consumed tokens: 72645345280 | elapsed time per iteration (s): 0.15 | learning rate: 3.776E-05 | global batch size: 256 | lm loss: 3.666436E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.532 | TFLOPs: 26.03 | +7: iteration 138570/ 173500 | consumed samples: 35473920 | consumed tokens: 72650588160 | elapsed time per iteration (s): 0.15 | learning rate: 3.775E-05 | global batch size: 256 | lm loss: 3.659148E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.860 | TFLOPs: 26.23 | +7: iteration 138580/ 173500 | consumed samples: 35476480 | consumed tokens: 72655831040 | elapsed time per iteration (s): 0.15 | learning rate: 3.774E-05 | global batch size: 256 | lm loss: 3.651948E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.540 | TFLOPs: 26.23 | +7: iteration 138590/ 173500 | consumed samples: 35479040 | consumed tokens: 72661073920 | elapsed time per iteration (s): 0.15 | learning rate: 3.773E-05 | global batch size: 256 | lm loss: 3.655750E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.205 | TFLOPs: 26.18 | +7: iteration 138600/ 173500 | consumed samples: 35481600 | consumed tokens: 72666316800 | elapsed time per iteration (s): 0.15 | learning rate: 3.772E-05 | global batch size: 256 | lm loss: 3.656606E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.791 | TFLOPs: 26.19 | +7: iteration 138610/ 173500 | consumed samples: 35484160 | consumed tokens: 72671559680 | elapsed time per iteration (s): 0.15 | learning rate: 3.771E-05 | global batch size: 256 | lm loss: 3.658947E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.830 | TFLOPs: 26.20 | +7: iteration 138620/ 173500 | consumed samples: 35486720 | consumed tokens: 72676802560 | elapsed time per iteration (s): 0.15 | learning rate: 3.770E-05 | global batch size: 256 | lm loss: 3.675354E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.432 | TFLOPs: 26.13 | +7: iteration 138630/ 173500 | consumed samples: 35489280 | consumed tokens: 72682045440 | elapsed time per iteration (s): 0.15 | learning rate: 3.769E-05 | global batch size: 256 | lm loss: 3.667122E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.184 | TFLOPs: 26.10 | +7: iteration 138640/ 173500 | consumed samples: 35491840 | consumed tokens: 72687288320 | elapsed time per iteration (s): 0.15 | learning rate: 3.768E-05 | global batch size: 256 | lm loss: 3.672673E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.452 | TFLOPs: 26.09 | +7: iteration 138650/ 173500 | consumed samples: 35494400 | consumed tokens: 72692531200 | elapsed time per iteration (s): 0.15 | learning rate: 3.767E-05 | global batch size: 256 | lm loss: 3.676409E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.690 | TFLOPs: 26.12 | +7: iteration 138660/ 173500 | consumed samples: 35496960 | consumed tokens: 72697774080 | elapsed time per iteration (s): 0.15 | learning rate: 3.766E-05 | global batch size: 256 | lm loss: 3.677114E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.674 | TFLOPs: 26.11 | +7: iteration 138670/ 173500 | consumed samples: 35499520 | consumed tokens: 72703016960 | elapsed time per iteration (s): 0.15 | learning rate: 3.765E-05 | global batch size: 256 | lm loss: 3.652286E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.705 | TFLOPs: 26.11 | +7: iteration 138680/ 173500 | consumed samples: 35502080 | consumed tokens: 72708259840 | elapsed time per iteration (s): 0.15 | learning rate: 3.764E-05 | global batch size: 256 | lm loss: 3.666452E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.143 | TFLOPs: 26.16 | +7: iteration 138690/ 173500 | consumed samples: 35504640 | consumed tokens: 72713502720 | elapsed time per iteration (s): 0.15 | learning rate: 3.763E-05 | global batch size: 256 | lm loss: 3.666380E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.967 | TFLOPs: 26.20 | +7: iteration 138700/ 173500 | consumed samples: 35507200 | consumed tokens: 72718745600 | elapsed time per iteration (s): 0.15 | learning rate: 3.762E-05 | global batch size: 256 | lm loss: 3.663256E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.460 | TFLOPs: 26.21 | +7: iteration 138710/ 173500 | consumed samples: 35509760 | consumed tokens: 72723988480 | elapsed time per iteration (s): 0.15 | learning rate: 3.761E-05 | global batch size: 256 | lm loss: 3.667159E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.931 | TFLOPs: 26.17 | +7: iteration 138720/ 173500 | consumed samples: 35512320 | consumed tokens: 72729231360 | elapsed time per iteration (s): 0.15 | learning rate: 3.760E-05 | global batch size: 256 | lm loss: 3.677923E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.496 | TFLOPs: 26.18 | +7: iteration 138730/ 173500 | consumed samples: 35514880 | consumed tokens: 72734474240 | elapsed time per iteration (s): 0.15 | learning rate: 3.759E-05 | global batch size: 256 | lm loss: 3.674212E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.977 | TFLOPs: 26.21 | +7: iteration 138740/ 173500 | consumed samples: 35517440 | consumed tokens: 72739717120 | elapsed time per iteration (s): 0.15 | learning rate: 3.758E-05 | global batch size: 256 | lm loss: 3.675452E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.099 | TFLOPs: 26.22 | +7: iteration 138750/ 173500 | consumed samples: 35520000 | consumed tokens: 72744960000 | elapsed time per iteration (s): 0.15 | learning rate: 3.757E-05 | global batch size: 256 | lm loss: 3.673291E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.462 | TFLOPs: 26.20 | +7: iteration 138760/ 173500 | consumed samples: 35522560 | consumed tokens: 72750202880 | elapsed time per iteration (s): 0.15 | learning rate: 3.757E-05 | global batch size: 256 | lm loss: 3.668410E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.690 | TFLOPs: 26.31 | +7: iteration 138770/ 173500 | consumed samples: 35525120 | consumed tokens: 72755445760 | elapsed time per iteration (s): 0.16 | learning rate: 3.756E-05 | global batch size: 256 | lm loss: 3.657120E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.587 | TFLOPs: 25.54 | +7: iteration 138780/ 173500 | consumed samples: 35527680 | consumed tokens: 72760688640 | elapsed time per iteration (s): 0.17 | learning rate: 3.755E-05 | global batch size: 256 | lm loss: 3.682263E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.603 | TFLOPs: 24.11 | +7: iteration 138790/ 173500 | consumed samples: 35530240 | consumed tokens: 72765931520 | elapsed time per iteration (s): 0.16 | learning rate: 3.754E-05 | global batch size: 256 | lm loss: 3.671446E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.805 | TFLOPs: 25.70 | +7: iteration 138800/ 173500 | consumed samples: 35532800 | consumed tokens: 72771174400 | elapsed time per iteration (s): 0.15 | learning rate: 3.753E-05 | global batch size: 256 | lm loss: 3.673785E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.911 | TFLOPs: 26.27 | +7: iteration 138810/ 173500 | consumed samples: 35535360 | consumed tokens: 72776417280 | elapsed time per iteration (s): 0.17 | learning rate: 3.752E-05 | global batch size: 256 | lm loss: 3.668428E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.590 | TFLOPs: 23.99 | +7: iteration 138820/ 173500 | consumed samples: 35537920 | consumed tokens: 72781660160 | elapsed time per iteration (s): 0.15 | learning rate: 3.751E-05 | global batch size: 256 | lm loss: 3.660077E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.703 | TFLOPs: 26.28 | +7: iteration 138830/ 173500 | consumed samples: 35540480 | consumed tokens: 72786903040 | elapsed time per iteration (s): 0.16 | learning rate: 3.750E-05 | global batch size: 256 | lm loss: 3.668900E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.992 | TFLOPs: 25.22 | +7: iteration 138840/ 173500 | consumed samples: 35543040 | consumed tokens: 72792145920 | elapsed time per iteration (s): 0.16 | learning rate: 3.749E-05 | global batch size: 256 | lm loss: 3.669549E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.575 | TFLOPs: 25.41 | +7: iteration 138850/ 173500 | consumed samples: 35545600 | consumed tokens: 72797388800 | elapsed time per iteration (s): 0.16 | learning rate: 3.748E-05 | global batch size: 256 | lm loss: 3.662909E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.827 | TFLOPs: 25.84 | +7: iteration 138860/ 173500 | consumed samples: 35548160 | consumed tokens: 72802631680 | elapsed time per iteration (s): 0.16 | learning rate: 3.747E-05 | global batch size: 256 | lm loss: 3.674658E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.080 | TFLOPs: 24.97 | +7: iteration 138870/ 173500 | consumed samples: 35550720 | consumed tokens: 72807874560 | elapsed time per iteration (s): 0.16 | learning rate: 3.746E-05 | global batch size: 256 | lm loss: 3.673008E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.228 | TFLOPs: 25.42 | +7: iteration 138880/ 173500 | consumed samples: 35553280 | consumed tokens: 72813117440 | elapsed time per iteration (s): 0.16 | learning rate: 3.745E-05 | global batch size: 256 | lm loss: 3.666191E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.417 | TFLOPs: 25.87 | +7: iteration 138890/ 173500 | consumed samples: 35555840 | consumed tokens: 72818360320 | elapsed time per iteration (s): 0.15 | learning rate: 3.744E-05 | global batch size: 256 | lm loss: 3.667011E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.877 | TFLOPs: 26.28 | +7: iteration 138900/ 173500 | consumed samples: 35558400 | consumed tokens: 72823603200 | elapsed time per iteration (s): 0.15 | learning rate: 3.743E-05 | global batch size: 256 | lm loss: 3.671149E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.489 | TFLOPs: 25.95 | +7: iteration 138910/ 173500 | consumed samples: 35560960 | consumed tokens: 72828846080 | elapsed time per iteration (s): 0.15 | learning rate: 3.742E-05 | global batch size: 256 | lm loss: 3.670100E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.244 | TFLOPs: 26.27 | +7: iteration 138920/ 173500 | consumed samples: 35563520 | consumed tokens: 72834088960 | elapsed time per iteration (s): 0.16 | learning rate: 3.741E-05 | global batch size: 256 | lm loss: 3.659652E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.637 | TFLOPs: 25.38 | +7: iteration 138930/ 173500 | consumed samples: 35566080 | consumed tokens: 72839331840 | elapsed time per iteration (s): 0.16 | learning rate: 3.740E-05 | global batch size: 256 | lm loss: 3.662387E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.486 | TFLOPs: 25.85 | +7: iteration 138940/ 173500 | consumed samples: 35568640 | consumed tokens: 72844574720 | elapsed time per iteration (s): 0.15 | learning rate: 3.739E-05 | global batch size: 256 | lm loss: 3.670338E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.726 | TFLOPs: 26.28 | +7: iteration 138950/ 173500 | consumed samples: 35571200 | consumed tokens: 72849817600 | elapsed time per iteration (s): 0.15 | learning rate: 3.738E-05 | global batch size: 256 | lm loss: 3.661754E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.667 | TFLOPs: 26.28 | +7: iteration 138960/ 173500 | consumed samples: 35573760 | consumed tokens: 72855060480 | elapsed time per iteration (s): 0.15 | learning rate: 3.737E-05 | global batch size: 256 | lm loss: 3.659128E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.710 | TFLOPs: 26.25 | +7: iteration 138970/ 173500 | consumed samples: 35576320 | consumed tokens: 72860303360 | elapsed time per iteration (s): 0.15 | learning rate: 3.736E-05 | global batch size: 256 | lm loss: 3.673402E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.765 | TFLOPs: 26.28 | +7: iteration 138980/ 173500 | consumed samples: 35578880 | consumed tokens: 72865546240 | elapsed time per iteration (s): 0.16 | learning rate: 3.735E-05 | global batch size: 256 | lm loss: 3.672809E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.347 | TFLOPs: 25.76 | +7: iteration 138990/ 173500 | consumed samples: 35581440 | consumed tokens: 72870789120 | elapsed time per iteration (s): 0.15 | learning rate: 3.734E-05 | global batch size: 256 | lm loss: 3.667373E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.774 | TFLOPs: 26.26 | +7: iteration 139000/ 173500 | consumed samples: 35584000 | consumed tokens: 72876032000 | elapsed time per iteration (s): 0.16 | learning rate: 3.733E-05 | global batch size: 256 | lm loss: 3.669873E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.578 | TFLOPs: 25.84 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 139000 | lm loss value: 3.846253E+00 | lm loss PPL: 4.681729E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 139000 to checkpoints_44m91b100m +0: [2023-03-17 06:17:46,530] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step139000 is begin to save! +0: [2023-03-17 06:17:46,533] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:17:46,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:17:46,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:17:46,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:17:46,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:17:46,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:17:46,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:17:46,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:17:46,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:17:46,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:17:46,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:17:46,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:17:46,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:17:46,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:17:46,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:17:46,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:17:46,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:17:46,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:17:46,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:17:46,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:17:46,664] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step139000/mp_rank_00_model_states.pt +0: [2023-03-17 06:17:46,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:17:46,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:17:46,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:17:46,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 06:17:46,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +5: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +6: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +2: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +7: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +3: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +4: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +1: [2023-03-17 06:17:46,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step139000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:17:46,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step139000 is ready now! +0: successfully saved checkpoint at iteration 139000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.95 +7: iteration 139010/ 173500 | consumed samples: 35586560 | consumed tokens: 72881274880 | elapsed time per iteration (s): 0.18 | learning rate: 3.732E-05 | global batch size: 256 | lm loss: 3.674031E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.324 | TFLOPs: 22.56 | +7: iteration 139020/ 173500 | consumed samples: 35589120 | consumed tokens: 72886517760 | elapsed time per iteration (s): 0.15 | learning rate: 3.731E-05 | global batch size: 256 | lm loss: 3.655902E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.164 | TFLOPs: 26.16 | +7: iteration 139030/ 173500 | consumed samples: 35591680 | consumed tokens: 72891760640 | elapsed time per iteration (s): 0.16 | learning rate: 3.730E-05 | global batch size: 256 | lm loss: 3.668332E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.995 | TFLOPs: 25.69 | +7: iteration 139040/ 173500 | consumed samples: 35594240 | consumed tokens: 72897003520 | elapsed time per iteration (s): 0.15 | learning rate: 3.729E-05 | global batch size: 256 | lm loss: 3.677216E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.311 | TFLOPs: 26.29 | +7: iteration 139050/ 173500 | consumed samples: 35596800 | consumed tokens: 72902246400 | elapsed time per iteration (s): 0.16 | learning rate: 3.728E-05 | global batch size: 256 | lm loss: 3.669674E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.301 | TFLOPs: 25.80 | +7: iteration 139060/ 173500 | consumed samples: 35599360 | consumed tokens: 72907489280 | elapsed time per iteration (s): 0.15 | learning rate: 3.727E-05 | global batch size: 256 | lm loss: 3.678134E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.601 | TFLOPs: 26.28 | +7: iteration 139070/ 173500 | consumed samples: 35601920 | consumed tokens: 72912732160 | elapsed time per iteration (s): 0.15 | learning rate: 3.726E-05 | global batch size: 256 | lm loss: 3.659739E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.428 | TFLOPs: 25.98 | +7: iteration 139080/ 173500 | consumed samples: 35604480 | consumed tokens: 72917975040 | elapsed time per iteration (s): 0.16 | learning rate: 3.725E-05 | global batch size: 256 | lm loss: 3.683060E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.691 | TFLOPs: 25.40 | +7: iteration 139090/ 173500 | consumed samples: 35607040 | consumed tokens: 72923217920 | elapsed time per iteration (s): 0.15 | learning rate: 3.724E-05 | global batch size: 256 | lm loss: 3.668604E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.506 | TFLOPs: 25.96 | +7: iteration 139100/ 173500 | consumed samples: 35609600 | consumed tokens: 72928460800 | elapsed time per iteration (s): 0.16 | learning rate: 3.723E-05 | global batch size: 256 | lm loss: 3.662552E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.963 | TFLOPs: 25.42 | +7: iteration 139110/ 173500 | consumed samples: 35612160 | consumed tokens: 72933703680 | elapsed time per iteration (s): 0.15 | learning rate: 3.722E-05 | global batch size: 256 | lm loss: 3.669622E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.687 | TFLOPs: 25.97 | +7: iteration 139120/ 173500 | consumed samples: 35614720 | consumed tokens: 72938946560 | elapsed time per iteration (s): 0.16 | learning rate: 3.722E-05 | global batch size: 256 | lm loss: 3.668713E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.976 | TFLOPs: 25.22 | +7: iteration 139130/ 173500 | consumed samples: 35617280 | consumed tokens: 72944189440 | elapsed time per iteration (s): 0.15 | learning rate: 3.721E-05 | global batch size: 256 | lm loss: 3.674796E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.031 | TFLOPs: 25.91 | +7: iteration 139140/ 173500 | consumed samples: 35619840 | consumed tokens: 72949432320 | elapsed time per iteration (s): 0.16 | learning rate: 3.720E-05 | global batch size: 256 | lm loss: 3.674187E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.956 | TFLOPs: 25.84 | +7: iteration 139150/ 173500 | consumed samples: 35622400 | consumed tokens: 72954675200 | elapsed time per iteration (s): 0.15 | learning rate: 3.719E-05 | global batch size: 256 | lm loss: 3.669790E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.036 | TFLOPs: 25.96 | +7: iteration 139160/ 173500 | consumed samples: 35624960 | consumed tokens: 72959918080 | elapsed time per iteration (s): 0.15 | learning rate: 3.718E-05 | global batch size: 256 | lm loss: 3.679126E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.338 | TFLOPs: 26.13 | +7: iteration 139170/ 173500 | consumed samples: 35627520 | consumed tokens: 72965160960 | elapsed time per iteration (s): 0.16 | learning rate: 3.717E-05 | global batch size: 256 | lm loss: 3.665479E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.256 | TFLOPs: 25.75 | +7: iteration 139180/ 173500 | consumed samples: 35630080 | consumed tokens: 72970403840 | elapsed time per iteration (s): 0.16 | learning rate: 3.716E-05 | global batch size: 256 | lm loss: 3.672923E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.962 | TFLOPs: 25.56 | +7: iteration 139190/ 173500 | consumed samples: 35632640 | consumed tokens: 72975646720 | elapsed time per iteration (s): 0.15 | learning rate: 3.715E-05 | global batch size: 256 | lm loss: 3.672518E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.709 | TFLOPs: 26.12 | +7: iteration 139200/ 173500 | consumed samples: 35635200 | consumed tokens: 72980889600 | elapsed time per iteration (s): 0.16 | learning rate: 3.714E-05 | global batch size: 256 | lm loss: 3.671065E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.738 | TFLOPs: 25.89 | +7: iteration 139210/ 173500 | consumed samples: 35637760 | consumed tokens: 72986132480 | elapsed time per iteration (s): 0.15 | learning rate: 3.713E-05 | global batch size: 256 | lm loss: 3.674528E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.268 | TFLOPs: 26.19 | +7: iteration 139220/ 173500 | consumed samples: 35640320 | consumed tokens: 72991375360 | elapsed time per iteration (s): 0.16 | learning rate: 3.712E-05 | global batch size: 256 | lm loss: 3.660602E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.570 | TFLOPs: 25.29 | +7: iteration 139230/ 173500 | consumed samples: 35642880 | consumed tokens: 72996618240 | elapsed time per iteration (s): 0.15 | learning rate: 3.711E-05 | global batch size: 256 | lm loss: 3.665048E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.682 | TFLOPs: 26.18 | +7: iteration 139240/ 173500 | consumed samples: 35645440 | consumed tokens: 73001861120 | elapsed time per iteration (s): 0.16 | learning rate: 3.710E-05 | global batch size: 256 | lm loss: 3.678083E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.426 | TFLOPs: 25.76 | +7: iteration 139250/ 173500 | consumed samples: 35648000 | consumed tokens: 73007104000 | elapsed time per iteration (s): 0.16 | learning rate: 3.709E-05 | global batch size: 256 | lm loss: 3.644334E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.669 | TFLOPs: 24.96 | +7: iteration 139260/ 173500 | consumed samples: 35650560 | consumed tokens: 73012346880 | elapsed time per iteration (s): 0.15 | learning rate: 3.708E-05 | global batch size: 256 | lm loss: 3.671513E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.972 | TFLOPs: 26.19 | +7: iteration 139270/ 173500 | consumed samples: 35653120 | consumed tokens: 73017589760 | elapsed time per iteration (s): 0.16 | learning rate: 3.707E-05 | global batch size: 256 | lm loss: 3.683768E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.894 | TFLOPs: 25.83 | +7: iteration 139280/ 173500 | consumed samples: 35655680 | consumed tokens: 73022832640 | elapsed time per iteration (s): 0.16 | learning rate: 3.706E-05 | global batch size: 256 | lm loss: 3.676978E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.636 | TFLOPs: 25.64 | +7: iteration 139290/ 173500 | consumed samples: 35658240 | consumed tokens: 73028075520 | elapsed time per iteration (s): 0.16 | learning rate: 3.705E-05 | global batch size: 256 | lm loss: 3.682578E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.802 | TFLOPs: 25.62 | +7: iteration 139300/ 173500 | consumed samples: 35660800 | consumed tokens: 73033318400 | elapsed time per iteration (s): 0.16 | learning rate: 3.704E-05 | global batch size: 256 | lm loss: 3.670415E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.936 | TFLOPs: 25.59 | +7: iteration 139310/ 173500 | consumed samples: 35663360 | consumed tokens: 73038561280 | elapsed time per iteration (s): 0.16 | learning rate: 3.703E-05 | global batch size: 256 | lm loss: 3.661871E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.014 | TFLOPs: 25.66 | +7: iteration 139320/ 173500 | consumed samples: 35665920 | consumed tokens: 73043804160 | elapsed time per iteration (s): 0.15 | learning rate: 3.702E-05 | global batch size: 256 | lm loss: 3.680757E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.490 | TFLOPs: 25.93 | +7: iteration 139330/ 173500 | consumed samples: 35668480 | consumed tokens: 73049047040 | elapsed time per iteration (s): 0.15 | learning rate: 3.701E-05 | global batch size: 256 | lm loss: 3.664130E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.436 | TFLOPs: 26.04 | +7: iteration 139340/ 173500 | consumed samples: 35671040 | consumed tokens: 73054289920 | elapsed time per iteration (s): 0.15 | learning rate: 3.700E-05 | global batch size: 256 | lm loss: 3.683651E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.792 | TFLOPs: 26.26 | +7: iteration 139350/ 173500 | consumed samples: 35673600 | consumed tokens: 73059532800 | elapsed time per iteration (s): 0.15 | learning rate: 3.699E-05 | global batch size: 256 | lm loss: 3.642725E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.471 | TFLOPs: 26.24 | +7: iteration 139360/ 173500 | consumed samples: 35676160 | consumed tokens: 73064775680 | elapsed time per iteration (s): 0.15 | learning rate: 3.698E-05 | global batch size: 256 | lm loss: 3.660135E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.606 | TFLOPs: 26.25 | +7: iteration 139370/ 173500 | consumed samples: 35678720 | consumed tokens: 73070018560 | elapsed time per iteration (s): 0.15 | learning rate: 3.697E-05 | global batch size: 256 | lm loss: 3.663123E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.066 | TFLOPs: 26.24 | +7: iteration 139380/ 173500 | consumed samples: 35681280 | consumed tokens: 73075261440 | elapsed time per iteration (s): 0.15 | learning rate: 3.696E-05 | global batch size: 256 | lm loss: 3.657660E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.249 | TFLOPs: 26.24 | +7: iteration 139390/ 173500 | consumed samples: 35683840 | consumed tokens: 73080504320 | elapsed time per iteration (s): 0.15 | learning rate: 3.695E-05 | global batch size: 256 | lm loss: 3.669586E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.916 | TFLOPs: 26.25 | +7: iteration 139400/ 173500 | consumed samples: 35686400 | consumed tokens: 73085747200 | elapsed time per iteration (s): 0.15 | learning rate: 3.694E-05 | global batch size: 256 | lm loss: 3.659703E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.008 | TFLOPs: 26.06 | +7: iteration 139410/ 173500 | consumed samples: 35688960 | consumed tokens: 73090990080 | elapsed time per iteration (s): 0.17 | learning rate: 3.694E-05 | global batch size: 256 | lm loss: 3.666368E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1498.875 | TFLOPs: 23.51 | +7: iteration 139420/ 173500 | consumed samples: 35691520 | consumed tokens: 73096232960 | elapsed time per iteration (s): 0.15 | learning rate: 3.693E-05 | global batch size: 256 | lm loss: 3.654544E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.732 | TFLOPs: 26.28 | +7: iteration 139430/ 173500 | consumed samples: 35694080 | consumed tokens: 73101475840 | elapsed time per iteration (s): 0.15 | learning rate: 3.692E-05 | global batch size: 256 | lm loss: 3.672316E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.289 | TFLOPs: 26.04 | +7: iteration 139440/ 173500 | consumed samples: 35696640 | consumed tokens: 73106718720 | elapsed time per iteration (s): 0.16 | learning rate: 3.691E-05 | global batch size: 256 | lm loss: 3.660947E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.466 | TFLOPs: 25.30 | +7: iteration 139450/ 173500 | consumed samples: 35699200 | consumed tokens: 73111961600 | elapsed time per iteration (s): 0.15 | learning rate: 3.690E-05 | global batch size: 256 | lm loss: 3.670904E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.262 | TFLOPs: 26.07 | +7: iteration 139460/ 173500 | consumed samples: 35701760 | consumed tokens: 73117204480 | elapsed time per iteration (s): 0.16 | learning rate: 3.689E-05 | global batch size: 256 | lm loss: 3.674771E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.702 | TFLOPs: 25.13 | +7: iteration 139470/ 173500 | consumed samples: 35704320 | consumed tokens: 73122447360 | elapsed time per iteration (s): 0.16 | learning rate: 3.688E-05 | global batch size: 256 | lm loss: 3.652224E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.530 | TFLOPs: 25.90 | +7: iteration 139480/ 173500 | consumed samples: 35706880 | consumed tokens: 73127690240 | elapsed time per iteration (s): 0.15 | learning rate: 3.687E-05 | global batch size: 256 | lm loss: 3.670930E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.516 | TFLOPs: 26.03 | +7: iteration 139490/ 173500 | consumed samples: 35709440 | consumed tokens: 73132933120 | elapsed time per iteration (s): 0.15 | learning rate: 3.686E-05 | global batch size: 256 | lm loss: 3.677466E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.869 | TFLOPs: 26.39 | +7: iteration 139500/ 173500 | consumed samples: 35712000 | consumed tokens: 73138176000 | elapsed time per iteration (s): 0.15 | learning rate: 3.685E-05 | global batch size: 256 | lm loss: 3.667083E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.343 | TFLOPs: 26.30 | +7: iteration 139510/ 173500 | consumed samples: 35714560 | consumed tokens: 73143418880 | elapsed time per iteration (s): 0.16 | learning rate: 3.684E-05 | global batch size: 256 | lm loss: 3.676869E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.127 | TFLOPs: 25.77 | +7: iteration 139520/ 173500 | consumed samples: 35717120 | consumed tokens: 73148661760 | elapsed time per iteration (s): 0.16 | learning rate: 3.683E-05 | global batch size: 256 | lm loss: 3.681414E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.818 | TFLOPs: 25.69 | +7: iteration 139530/ 173500 | consumed samples: 35719680 | consumed tokens: 73153904640 | elapsed time per iteration (s): 0.15 | learning rate: 3.682E-05 | global batch size: 256 | lm loss: 3.671549E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.176 | TFLOPs: 25.97 | +7: iteration 139540/ 173500 | consumed samples: 35722240 | consumed tokens: 73159147520 | elapsed time per iteration (s): 0.15 | learning rate: 3.681E-05 | global batch size: 256 | lm loss: 3.654690E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.262 | TFLOPs: 26.08 | +7: iteration 139550/ 173500 | consumed samples: 35724800 | consumed tokens: 73164390400 | elapsed time per iteration (s): 0.15 | learning rate: 3.680E-05 | global batch size: 256 | lm loss: 3.657910E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.922 | TFLOPs: 25.97 | +7: iteration 139560/ 173500 | consumed samples: 35727360 | consumed tokens: 73169633280 | elapsed time per iteration (s): 0.16 | learning rate: 3.679E-05 | global batch size: 256 | lm loss: 3.676545E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.359 | TFLOPs: 25.87 | +7: iteration 139570/ 173500 | consumed samples: 35729920 | consumed tokens: 73174876160 | elapsed time per iteration (s): 0.16 | learning rate: 3.678E-05 | global batch size: 256 | lm loss: 3.672989E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.362 | TFLOPs: 25.57 | +7: iteration 139580/ 173500 | consumed samples: 35732480 | consumed tokens: 73180119040 | elapsed time per iteration (s): 0.16 | learning rate: 3.677E-05 | global batch size: 256 | lm loss: 3.658727E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.436 | TFLOPs: 25.52 | +7: iteration 139590/ 173500 | consumed samples: 35735040 | consumed tokens: 73185361920 | elapsed time per iteration (s): 0.15 | learning rate: 3.676E-05 | global batch size: 256 | lm loss: 3.677711E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.860 | TFLOPs: 26.03 | +7: iteration 139600/ 173500 | consumed samples: 35737600 | consumed tokens: 73190604800 | elapsed time per iteration (s): 0.15 | learning rate: 3.675E-05 | global batch size: 256 | lm loss: 3.677083E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.333 | TFLOPs: 26.02 | +7: iteration 139610/ 173500 | consumed samples: 35740160 | consumed tokens: 73195847680 | elapsed time per iteration (s): 0.15 | learning rate: 3.674E-05 | global batch size: 256 | lm loss: 3.671681E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.391 | TFLOPs: 26.32 | +7: iteration 139620/ 173500 | consumed samples: 35742720 | consumed tokens: 73201090560 | elapsed time per iteration (s): 0.15 | learning rate: 3.673E-05 | global batch size: 256 | lm loss: 3.680635E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.857 | TFLOPs: 26.20 | +7: iteration 139630/ 173500 | consumed samples: 35745280 | consumed tokens: 73206333440 | elapsed time per iteration (s): 0.15 | learning rate: 3.672E-05 | global batch size: 256 | lm loss: 3.659940E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.357 | TFLOPs: 26.20 | +7: iteration 139640/ 173500 | consumed samples: 35747840 | consumed tokens: 73211576320 | elapsed time per iteration (s): 0.15 | learning rate: 3.671E-05 | global batch size: 256 | lm loss: 3.659246E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.944 | TFLOPs: 26.24 | +7: iteration 139650/ 173500 | consumed samples: 35750400 | consumed tokens: 73216819200 | elapsed time per iteration (s): 0.16 | learning rate: 3.671E-05 | global batch size: 256 | lm loss: 3.659528E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.461 | TFLOPs: 25.90 | +7: iteration 139660/ 173500 | consumed samples: 35752960 | consumed tokens: 73222062080 | elapsed time per iteration (s): 0.15 | learning rate: 3.670E-05 | global batch size: 256 | lm loss: 3.658588E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.492 | TFLOPs: 26.06 | +7: iteration 139670/ 173500 | consumed samples: 35755520 | consumed tokens: 73227304960 | elapsed time per iteration (s): 0.15 | learning rate: 3.669E-05 | global batch size: 256 | lm loss: 3.660289E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.186 | TFLOPs: 26.29 | +7: iteration 139680/ 173500 | consumed samples: 35758080 | consumed tokens: 73232547840 | elapsed time per iteration (s): 0.15 | learning rate: 3.668E-05 | global batch size: 256 | lm loss: 3.662033E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.825 | TFLOPs: 26.27 | +7: iteration 139690/ 173500 | consumed samples: 35760640 | consumed tokens: 73237790720 | elapsed time per iteration (s): 0.16 | learning rate: 3.667E-05 | global batch size: 256 | lm loss: 3.663119E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.520 | TFLOPs: 24.79 | +7: iteration 139700/ 173500 | consumed samples: 35763200 | consumed tokens: 73243033600 | elapsed time per iteration (s): 0.16 | learning rate: 3.666E-05 | global batch size: 256 | lm loss: 3.658488E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.438 | TFLOPs: 25.43 | +7: iteration 139710/ 173500 | consumed samples: 35765760 | consumed tokens: 73248276480 | elapsed time per iteration (s): 0.15 | learning rate: 3.665E-05 | global batch size: 256 | lm loss: 3.677171E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.160 | TFLOPs: 26.29 | +7: iteration 139720/ 173500 | consumed samples: 35768320 | consumed tokens: 73253519360 | elapsed time per iteration (s): 0.15 | learning rate: 3.664E-05 | global batch size: 256 | lm loss: 3.661108E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.152 | TFLOPs: 26.24 | +7: iteration 139730/ 173500 | consumed samples: 35770880 | consumed tokens: 73258762240 | elapsed time per iteration (s): 0.16 | learning rate: 3.663E-05 | global batch size: 256 | lm loss: 3.671554E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.803 | TFLOPs: 25.62 | +7: iteration 139740/ 173500 | consumed samples: 35773440 | consumed tokens: 73264005120 | elapsed time per iteration (s): 0.15 | learning rate: 3.662E-05 | global batch size: 256 | lm loss: 3.667556E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.630 | TFLOPs: 26.12 | +7: iteration 139750/ 173500 | consumed samples: 35776000 | consumed tokens: 73269248000 | elapsed time per iteration (s): 0.15 | learning rate: 3.661E-05 | global batch size: 256 | lm loss: 3.656757E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.041 | TFLOPs: 26.06 | +7: iteration 139760/ 173500 | consumed samples: 35778560 | consumed tokens: 73274490880 | elapsed time per iteration (s): 0.15 | learning rate: 3.660E-05 | global batch size: 256 | lm loss: 3.662417E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.340 | TFLOPs: 26.24 | +7: iteration 139770/ 173500 | consumed samples: 35781120 | consumed tokens: 73279733760 | elapsed time per iteration (s): 0.16 | learning rate: 3.659E-05 | global batch size: 256 | lm loss: 3.656712E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.795 | TFLOPs: 25.59 | +7: iteration 139780/ 173500 | consumed samples: 35783680 | consumed tokens: 73284976640 | elapsed time per iteration (s): 0.16 | learning rate: 3.658E-05 | global batch size: 256 | lm loss: 3.663129E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.601 | TFLOPs: 25.40 | +7: iteration 139790/ 173500 | consumed samples: 35786240 | consumed tokens: 73290219520 | elapsed time per iteration (s): 0.16 | learning rate: 3.657E-05 | global batch size: 256 | lm loss: 3.665617E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.592 | TFLOPs: 25.02 | +7: iteration 139800/ 173500 | consumed samples: 35788800 | consumed tokens: 73295462400 | elapsed time per iteration (s): 0.16 | learning rate: 3.656E-05 | global batch size: 256 | lm loss: 3.672129E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.999 | TFLOPs: 25.80 | +7: iteration 139810/ 173500 | consumed samples: 35791360 | consumed tokens: 73300705280 | elapsed time per iteration (s): 0.15 | learning rate: 3.655E-05 | global batch size: 256 | lm loss: 3.658585E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.211 | TFLOPs: 26.27 | +7: iteration 139820/ 173500 | consumed samples: 35793920 | consumed tokens: 73305948160 | elapsed time per iteration (s): 0.16 | learning rate: 3.654E-05 | global batch size: 256 | lm loss: 3.666474E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.428 | TFLOPs: 25.68 | +7: iteration 139830/ 173500 | consumed samples: 35796480 | consumed tokens: 73311191040 | elapsed time per iteration (s): 0.15 | learning rate: 3.653E-05 | global batch size: 256 | lm loss: 3.658893E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.249 | TFLOPs: 26.23 | +7: iteration 139840/ 173500 | consumed samples: 35799040 | consumed tokens: 73316433920 | elapsed time per iteration (s): 0.16 | learning rate: 3.652E-05 | global batch size: 256 | lm loss: 3.653281E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.955 | TFLOPs: 25.51 | +7: iteration 139850/ 173500 | consumed samples: 35801600 | consumed tokens: 73321676800 | elapsed time per iteration (s): 0.15 | learning rate: 3.651E-05 | global batch size: 256 | lm loss: 3.672026E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.647 | TFLOPs: 25.90 | +7: iteration 139860/ 173500 | consumed samples: 35804160 | consumed tokens: 73326919680 | elapsed time per iteration (s): 0.15 | learning rate: 3.651E-05 | global batch size: 256 | lm loss: 3.665955E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.289 | TFLOPs: 26.23 | +7: iteration 139870/ 173500 | consumed samples: 35806720 | consumed tokens: 73332162560 | elapsed time per iteration (s): 0.15 | learning rate: 3.650E-05 | global batch size: 256 | lm loss: 3.669159E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.956 | TFLOPs: 26.24 | +7: iteration 139880/ 173500 | consumed samples: 35809280 | consumed tokens: 73337405440 | elapsed time per iteration (s): 0.16 | learning rate: 3.649E-05 | global batch size: 256 | lm loss: 3.666848E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.586 | TFLOPs: 25.38 | +7: iteration 139890/ 173500 | consumed samples: 35811840 | consumed tokens: 73342648320 | elapsed time per iteration (s): 0.16 | learning rate: 3.648E-05 | global batch size: 256 | lm loss: 3.666838E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.074 | TFLOPs: 24.86 | +7: iteration 139900/ 173500 | consumed samples: 35814400 | consumed tokens: 73347891200 | elapsed time per iteration (s): 0.15 | learning rate: 3.647E-05 | global batch size: 256 | lm loss: 3.653030E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.888 | TFLOPs: 26.22 | +7: iteration 139910/ 173500 | consumed samples: 35816960 | consumed tokens: 73353134080 | elapsed time per iteration (s): 0.15 | learning rate: 3.646E-05 | global batch size: 256 | lm loss: 3.674046E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.204 | TFLOPs: 26.00 | +7: iteration 139920/ 173500 | consumed samples: 35819520 | consumed tokens: 73358376960 | elapsed time per iteration (s): 0.16 | learning rate: 3.645E-05 | global batch size: 256 | lm loss: 3.678214E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.905 | TFLOPs: 25.64 | +7: iteration 139930/ 173500 | consumed samples: 35822080 | consumed tokens: 73363619840 | elapsed time per iteration (s): 0.15 | learning rate: 3.644E-05 | global batch size: 256 | lm loss: 3.676992E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.916 | TFLOPs: 26.24 | +7: iteration 139940/ 173500 | consumed samples: 35824640 | consumed tokens: 73368862720 | elapsed time per iteration (s): 0.15 | learning rate: 3.643E-05 | global batch size: 256 | lm loss: 3.670176E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.273 | TFLOPs: 26.01 | +7: iteration 139950/ 173500 | consumed samples: 35827200 | consumed tokens: 73374105600 | elapsed time per iteration (s): 0.16 | learning rate: 3.642E-05 | global batch size: 256 | lm loss: 3.663338E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.584 | TFLOPs: 25.32 | +7: iteration 139960/ 173500 | consumed samples: 35829760 | consumed tokens: 73379348480 | elapsed time per iteration (s): 0.16 | learning rate: 3.641E-05 | global batch size: 256 | lm loss: 3.676966E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.109 | TFLOPs: 25.58 | +7: iteration 139970/ 173500 | consumed samples: 35832320 | consumed tokens: 73384591360 | elapsed time per iteration (s): 0.15 | learning rate: 3.640E-05 | global batch size: 256 | lm loss: 3.674509E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.466 | TFLOPs: 26.23 | +7: iteration 139980/ 173500 | consumed samples: 35834880 | consumed tokens: 73389834240 | elapsed time per iteration (s): 0.16 | learning rate: 3.639E-05 | global batch size: 256 | lm loss: 3.669694E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.726 | TFLOPs: 25.23 | +7: iteration 139990/ 173500 | consumed samples: 35837440 | consumed tokens: 73395077120 | elapsed time per iteration (s): 0.15 | learning rate: 3.638E-05 | global batch size: 256 | lm loss: 3.663485E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.343 | TFLOPs: 26.24 | +0: [2023-03-17 06:20:21,988] [INFO] [logging.py:68:log_dist] [Rank 0] step=140000, skipped=0, lr=[3.63724657135183e-05, 3.63724657135183e-05, 3.63724657135183e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 140000/ 173500 | consumed samples: 35840000 | consumed tokens: 73400320000 | elapsed time per iteration (s): 0.15 | learning rate: 3.637E-05 | global batch size: 256 | lm loss: 3.660668E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.874 | TFLOPs: 26.06 | +0: steps: 140000 loss: 3.6503 iter time (s): 0.154 samples/sec: 1663.944 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 140000 | lm loss value: 3.856175E+00 | lm loss PPL: 4.728416E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 140000 to checkpoints_44m91b100m +0: [2023-03-17 06:20:22,061] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step140000 is begin to save! +0: [2023-03-17 06:20:22,064] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:20:22,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:20:22,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:20:22,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:20:22,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:20:22,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:20:22,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:20:22,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:20:22,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:20:22,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:20:22,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:20:22,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:20:22,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:20:22,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:20:22,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:20:22,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:20:22,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:20:22,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:20:22,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:20:22,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:20:22,198] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step140000/mp_rank_00_model_states.pt +0: [2023-03-17 06:20:22,198] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:20:22,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:20:22,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:20:22,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +3: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +6: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +2: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 06:20:22,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 06:20:22,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 06:20:22,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +7: [2023-03-17 06:20:22,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +5: [2023-03-17 06:20:22,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +4: [2023-03-17 06:20:22,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:20:22,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:20:22,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +1: [2023-03-17 06:20:22,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:20:22,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step140000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:20:22,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step140000 is ready now! +0: successfully saved checkpoint at iteration 140000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 199.64 +7: iteration 140010/ 173500 | consumed samples: 35842560 | consumed tokens: 73405562880 | elapsed time per iteration (s): 0.18 | learning rate: 3.636E-05 | global batch size: 256 | lm loss: 3.668154E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.237 | TFLOPs: 22.12 | +7: iteration 140020/ 173500 | consumed samples: 35845120 | consumed tokens: 73410805760 | elapsed time per iteration (s): 0.15 | learning rate: 3.635E-05 | global batch size: 256 | lm loss: 3.677357E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.745 | TFLOPs: 25.92 | +7: iteration 140030/ 173500 | consumed samples: 35847680 | consumed tokens: 73416048640 | elapsed time per iteration (s): 0.15 | learning rate: 3.634E-05 | global batch size: 256 | lm loss: 3.662374E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.730 | TFLOPs: 26.37 | +7: iteration 140040/ 173500 | consumed samples: 35850240 | consumed tokens: 73421291520 | elapsed time per iteration (s): 0.15 | learning rate: 3.633E-05 | global batch size: 256 | lm loss: 3.670955E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.508 | TFLOPs: 26.04 | +7: iteration 140050/ 173500 | consumed samples: 35852800 | consumed tokens: 73426534400 | elapsed time per iteration (s): 0.16 | learning rate: 3.633E-05 | global batch size: 256 | lm loss: 3.659641E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.553 | TFLOPs: 25.67 | +7: iteration 140060/ 173500 | consumed samples: 35855360 | consumed tokens: 73431777280 | elapsed time per iteration (s): 0.16 | learning rate: 3.632E-05 | global batch size: 256 | lm loss: 3.675047E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.192 | TFLOPs: 25.79 | +7: iteration 140070/ 173500 | consumed samples: 35857920 | consumed tokens: 73437020160 | elapsed time per iteration (s): 0.16 | learning rate: 3.631E-05 | global batch size: 256 | lm loss: 3.662035E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.935 | TFLOPs: 25.20 | +7: iteration 140080/ 173500 | consumed samples: 35860480 | consumed tokens: 73442263040 | elapsed time per iteration (s): 0.17 | learning rate: 3.630E-05 | global batch size: 256 | lm loss: 3.662509E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1508.034 | TFLOPs: 23.65 | +7: iteration 140090/ 173500 | consumed samples: 35863040 | consumed tokens: 73447505920 | elapsed time per iteration (s): 0.16 | learning rate: 3.629E-05 | global batch size: 256 | lm loss: 3.656255E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.506 | TFLOPs: 25.84 | +7: iteration 140100/ 173500 | consumed samples: 35865600 | consumed tokens: 73452748800 | elapsed time per iteration (s): 0.15 | learning rate: 3.628E-05 | global batch size: 256 | lm loss: 3.668793E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.235 | TFLOPs: 26.21 | +7: iteration 140110/ 173500 | consumed samples: 35868160 | consumed tokens: 73457991680 | elapsed time per iteration (s): 0.15 | learning rate: 3.627E-05 | global batch size: 256 | lm loss: 3.665510E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.571 | TFLOPs: 25.95 | +7: iteration 140120/ 173500 | consumed samples: 35870720 | consumed tokens: 73463234560 | elapsed time per iteration (s): 0.16 | learning rate: 3.626E-05 | global batch size: 256 | lm loss: 3.665607E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.860 | TFLOPs: 25.23 | +7: iteration 140130/ 173500 | consumed samples: 35873280 | consumed tokens: 73468477440 | elapsed time per iteration (s): 0.16 | learning rate: 3.625E-05 | global batch size: 256 | lm loss: 3.656489E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.115 | TFLOPs: 25.08 | +7: iteration 140140/ 173500 | consumed samples: 35875840 | consumed tokens: 73473720320 | elapsed time per iteration (s): 0.16 | learning rate: 3.624E-05 | global batch size: 256 | lm loss: 3.665404E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.818 | TFLOPs: 25.62 | +7: iteration 140150/ 173500 | consumed samples: 35878400 | consumed tokens: 73478963200 | elapsed time per iteration (s): 0.15 | learning rate: 3.623E-05 | global batch size: 256 | lm loss: 3.652603E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.783 | TFLOPs: 26.22 | +7: iteration 140160/ 173500 | consumed samples: 35880960 | consumed tokens: 73484206080 | elapsed time per iteration (s): 0.15 | learning rate: 3.622E-05 | global batch size: 256 | lm loss: 3.666384E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.515 | TFLOPs: 26.20 | +7: iteration 140170/ 173500 | consumed samples: 35883520 | consumed tokens: 73489448960 | elapsed time per iteration (s): 0.15 | learning rate: 3.621E-05 | global batch size: 256 | lm loss: 3.678188E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.829 | TFLOPs: 26.20 | +7: iteration 140180/ 173500 | consumed samples: 35886080 | consumed tokens: 73494691840 | elapsed time per iteration (s): 0.15 | learning rate: 3.620E-05 | global batch size: 256 | lm loss: 3.687692E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.657 | TFLOPs: 26.17 | +7: iteration 140190/ 173500 | consumed samples: 35888640 | consumed tokens: 73499934720 | elapsed time per iteration (s): 0.15 | learning rate: 3.619E-05 | global batch size: 256 | lm loss: 3.673315E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.440 | TFLOPs: 26.24 | +7: iteration 140200/ 173500 | consumed samples: 35891200 | consumed tokens: 73505177600 | elapsed time per iteration (s): 0.15 | learning rate: 3.618E-05 | global batch size: 256 | lm loss: 3.665878E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.213 | TFLOPs: 26.27 | +7: iteration 140210/ 173500 | consumed samples: 35893760 | consumed tokens: 73510420480 | elapsed time per iteration (s): 0.15 | learning rate: 3.617E-05 | global batch size: 256 | lm loss: 3.651690E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.252 | TFLOPs: 26.26 | +7: iteration 140220/ 173500 | consumed samples: 35896320 | consumed tokens: 73515663360 | elapsed time per iteration (s): 0.15 | learning rate: 3.616E-05 | global batch size: 256 | lm loss: 3.657271E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.364 | TFLOPs: 26.24 | +7: iteration 140230/ 173500 | consumed samples: 35898880 | consumed tokens: 73520906240 | elapsed time per iteration (s): 0.15 | learning rate: 3.616E-05 | global batch size: 256 | lm loss: 3.668114E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.746 | TFLOPs: 26.25 | +7: iteration 140240/ 173500 | consumed samples: 35901440 | consumed tokens: 73526149120 | elapsed time per iteration (s): 0.15 | learning rate: 3.615E-05 | global batch size: 256 | lm loss: 3.654320E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.434 | TFLOPs: 26.26 | +7: iteration 140250/ 173500 | consumed samples: 35904000 | consumed tokens: 73531392000 | elapsed time per iteration (s): 0.16 | learning rate: 3.614E-05 | global batch size: 256 | lm loss: 3.666518E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.307 | TFLOPs: 25.77 | +7: iteration 140260/ 173500 | consumed samples: 35906560 | consumed tokens: 73536634880 | elapsed time per iteration (s): 0.15 | learning rate: 3.613E-05 | global batch size: 256 | lm loss: 3.671616E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.987 | TFLOPs: 26.27 | +7: iteration 140270/ 173500 | consumed samples: 35909120 | consumed tokens: 73541877760 | elapsed time per iteration (s): 0.15 | learning rate: 3.612E-05 | global batch size: 256 | lm loss: 3.668438E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.173 | TFLOPs: 26.29 | +7: iteration 140280/ 173500 | consumed samples: 35911680 | consumed tokens: 73547120640 | elapsed time per iteration (s): 0.15 | learning rate: 3.611E-05 | global batch size: 256 | lm loss: 3.664104E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.425 | TFLOPs: 26.29 | +7: iteration 140290/ 173500 | consumed samples: 35914240 | consumed tokens: 73552363520 | elapsed time per iteration (s): 0.15 | learning rate: 3.610E-05 | global batch size: 256 | lm loss: 3.676064E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.574 | TFLOPs: 26.28 | +7: iteration 140300/ 173500 | consumed samples: 35916800 | consumed tokens: 73557606400 | elapsed time per iteration (s): 0.15 | learning rate: 3.609E-05 | global batch size: 256 | lm loss: 3.676100E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.779 | TFLOPs: 26.26 | +7: iteration 140310/ 173500 | consumed samples: 35919360 | consumed tokens: 73562849280 | elapsed time per iteration (s): 0.15 | learning rate: 3.608E-05 | global batch size: 256 | lm loss: 3.664220E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.914 | TFLOPs: 26.27 | +7: iteration 140320/ 173500 | consumed samples: 35921920 | consumed tokens: 73568092160 | elapsed time per iteration (s): 0.15 | learning rate: 3.607E-05 | global batch size: 256 | lm loss: 3.669110E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.757 | TFLOPs: 26.28 | +7: iteration 140330/ 173500 | consumed samples: 35924480 | consumed tokens: 73573335040 | elapsed time per iteration (s): 0.15 | learning rate: 3.606E-05 | global batch size: 256 | lm loss: 3.671545E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.761 | TFLOPs: 26.26 | +7: iteration 140340/ 173500 | consumed samples: 35927040 | consumed tokens: 73578577920 | elapsed time per iteration (s): 0.15 | learning rate: 3.605E-05 | global batch size: 256 | lm loss: 3.665172E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.874 | TFLOPs: 26.28 | +7: iteration 140350/ 173500 | consumed samples: 35929600 | consumed tokens: 73583820800 | elapsed time per iteration (s): 0.15 | learning rate: 3.604E-05 | global batch size: 256 | lm loss: 3.670934E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.054 | TFLOPs: 26.21 | +7: iteration 140360/ 173500 | consumed samples: 35932160 | consumed tokens: 73589063680 | elapsed time per iteration (s): 0.15 | learning rate: 3.603E-05 | global batch size: 256 | lm loss: 3.657874E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.385 | TFLOPs: 26.18 | +7: iteration 140370/ 173500 | consumed samples: 35934720 | consumed tokens: 73594306560 | elapsed time per iteration (s): 0.15 | learning rate: 3.602E-05 | global batch size: 256 | lm loss: 3.658654E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.004 | TFLOPs: 26.14 | +7: iteration 140380/ 173500 | consumed samples: 35937280 | consumed tokens: 73599549440 | elapsed time per iteration (s): 0.16 | learning rate: 3.601E-05 | global batch size: 256 | lm loss: 3.662292E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.586 | TFLOPs: 25.79 | +7: iteration 140390/ 173500 | consumed samples: 35939840 | consumed tokens: 73604792320 | elapsed time per iteration (s): 0.15 | learning rate: 3.601E-05 | global batch size: 256 | lm loss: 3.667104E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.729 | TFLOPs: 26.04 | +7: iteration 140400/ 173500 | consumed samples: 35942400 | consumed tokens: 73610035200 | elapsed time per iteration (s): 0.15 | learning rate: 3.600E-05 | global batch size: 256 | lm loss: 3.665195E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.324 | TFLOPs: 26.10 | +7: iteration 140410/ 173500 | consumed samples: 35944960 | consumed tokens: 73615278080 | elapsed time per iteration (s): 0.15 | learning rate: 3.599E-05 | global batch size: 256 | lm loss: 3.664788E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.162 | TFLOPs: 26.13 | +7: iteration 140420/ 173500 | consumed samples: 35947520 | consumed tokens: 73620520960 | elapsed time per iteration (s): 0.15 | learning rate: 3.598E-05 | global batch size: 256 | lm loss: 3.666957E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.399 | TFLOPs: 26.10 | +7: iteration 140430/ 173500 | consumed samples: 35950080 | consumed tokens: 73625763840 | elapsed time per iteration (s): 0.16 | learning rate: 3.597E-05 | global batch size: 256 | lm loss: 3.666053E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.832 | TFLOPs: 25.87 | +7: iteration 140440/ 173500 | consumed samples: 35952640 | consumed tokens: 73631006720 | elapsed time per iteration (s): 0.15 | learning rate: 3.596E-05 | global batch size: 256 | lm loss: 3.679929E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.696 | TFLOPs: 26.19 | +7: iteration 140450/ 173500 | consumed samples: 35955200 | consumed tokens: 73636249600 | elapsed time per iteration (s): 0.15 | learning rate: 3.595E-05 | global batch size: 256 | lm loss: 3.671681E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.596 | TFLOPs: 26.18 | +7: iteration 140460/ 173500 | consumed samples: 35957760 | consumed tokens: 73641492480 | elapsed time per iteration (s): 0.15 | learning rate: 3.594E-05 | global batch size: 256 | lm loss: 3.666589E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.690 | TFLOPs: 26.18 | +7: iteration 140470/ 173500 | consumed samples: 35960320 | consumed tokens: 73646735360 | elapsed time per iteration (s): 0.15 | learning rate: 3.593E-05 | global batch size: 256 | lm loss: 3.680932E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.217 | TFLOPs: 26.07 | +7: iteration 140480/ 173500 | consumed samples: 35962880 | consumed tokens: 73651978240 | elapsed time per iteration (s): 0.15 | learning rate: 3.592E-05 | global batch size: 256 | lm loss: 3.679993E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.462 | TFLOPs: 26.28 | +7: iteration 140490/ 173500 | consumed samples: 35965440 | consumed tokens: 73657221120 | elapsed time per iteration (s): 0.15 | learning rate: 3.591E-05 | global batch size: 256 | lm loss: 3.651265E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.031 | TFLOPs: 26.25 | +7: iteration 140500/ 173500 | consumed samples: 35968000 | consumed tokens: 73662464000 | elapsed time per iteration (s): 0.15 | learning rate: 3.590E-05 | global batch size: 256 | lm loss: 3.679782E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.321 | TFLOPs: 26.21 | +7: iteration 140510/ 173500 | consumed samples: 35970560 | consumed tokens: 73667706880 | elapsed time per iteration (s): 0.15 | learning rate: 3.589E-05 | global batch size: 256 | lm loss: 3.677516E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.124 | TFLOPs: 26.16 | +7: iteration 140520/ 173500 | consumed samples: 35973120 | consumed tokens: 73672949760 | elapsed time per iteration (s): 0.15 | learning rate: 3.588E-05 | global batch size: 256 | lm loss: 3.656263E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.762 | TFLOPs: 26.15 | +7: iteration 140530/ 173500 | consumed samples: 35975680 | consumed tokens: 73678192640 | elapsed time per iteration (s): 0.16 | learning rate: 3.587E-05 | global batch size: 256 | lm loss: 3.667123E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.610 | TFLOPs: 25.51 | +7: iteration 140540/ 173500 | consumed samples: 35978240 | consumed tokens: 73683435520 | elapsed time per iteration (s): 0.15 | learning rate: 3.586E-05 | global batch size: 256 | lm loss: 3.678353E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.234 | TFLOPs: 26.15 | +7: iteration 140550/ 173500 | consumed samples: 35980800 | consumed tokens: 73688678400 | elapsed time per iteration (s): 0.15 | learning rate: 3.586E-05 | global batch size: 256 | lm loss: 3.673902E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.397 | TFLOPs: 26.18 | +7: iteration 140560/ 173500 | consumed samples: 35983360 | consumed tokens: 73693921280 | elapsed time per iteration (s): 0.15 | learning rate: 3.585E-05 | global batch size: 256 | lm loss: 3.653838E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.333 | TFLOPs: 26.27 | +7: iteration 140570/ 173500 | consumed samples: 35985920 | consumed tokens: 73699164160 | elapsed time per iteration (s): 0.15 | learning rate: 3.584E-05 | global batch size: 256 | lm loss: 3.660258E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.105 | TFLOPs: 26.29 | +7: iteration 140580/ 173500 | consumed samples: 35988480 | consumed tokens: 73704407040 | elapsed time per iteration (s): 0.15 | learning rate: 3.583E-05 | global batch size: 256 | lm loss: 3.669661E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.216 | TFLOPs: 26.26 | +7: iteration 140590/ 173500 | consumed samples: 35991040 | consumed tokens: 73709649920 | elapsed time per iteration (s): 0.15 | learning rate: 3.582E-05 | global batch size: 256 | lm loss: 3.676267E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.617 | TFLOPs: 26.25 | +7: iteration 140600/ 173500 | consumed samples: 35993600 | consumed tokens: 73714892800 | elapsed time per iteration (s): 0.15 | learning rate: 3.581E-05 | global batch size: 256 | lm loss: 3.676896E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.499 | TFLOPs: 26.28 | +7: iteration 140610/ 173500 | consumed samples: 35996160 | consumed tokens: 73720135680 | elapsed time per iteration (s): 0.15 | learning rate: 3.580E-05 | global batch size: 256 | lm loss: 3.657558E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.360 | TFLOPs: 26.27 | +7: iteration 140620/ 173500 | consumed samples: 35998720 | consumed tokens: 73725378560 | elapsed time per iteration (s): 0.16 | learning rate: 3.579E-05 | global batch size: 256 | lm loss: 3.665869E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.807 | TFLOPs: 25.61 | +7: iteration 140630/ 173500 | consumed samples: 36001280 | consumed tokens: 73730621440 | elapsed time per iteration (s): 0.15 | learning rate: 3.578E-05 | global batch size: 256 | lm loss: 3.672691E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.290 | TFLOPs: 26.27 | +7: iteration 140640/ 173500 | consumed samples: 36003840 | consumed tokens: 73735864320 | elapsed time per iteration (s): 0.15 | learning rate: 3.577E-05 | global batch size: 256 | lm loss: 3.675307E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.113 | TFLOPs: 26.24 | +7: iteration 140650/ 173500 | consumed samples: 36006400 | consumed tokens: 73741107200 | elapsed time per iteration (s): 0.15 | learning rate: 3.576E-05 | global batch size: 256 | lm loss: 3.674257E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.024 | TFLOPs: 26.27 | +7: iteration 140660/ 173500 | consumed samples: 36008960 | consumed tokens: 73746350080 | elapsed time per iteration (s): 0.15 | learning rate: 3.575E-05 | global batch size: 256 | lm loss: 3.671738E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.704 | TFLOPs: 26.25 | +7: iteration 140670/ 173500 | consumed samples: 36011520 | consumed tokens: 73751592960 | elapsed time per iteration (s): 0.15 | learning rate: 3.574E-05 | global batch size: 256 | lm loss: 3.680195E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.772 | TFLOPs: 26.23 | +7: iteration 140680/ 173500 | consumed samples: 36014080 | consumed tokens: 73756835840 | elapsed time per iteration (s): 0.15 | learning rate: 3.573E-05 | global batch size: 256 | lm loss: 3.674755E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.204 | TFLOPs: 25.91 | +7: iteration 140690/ 173500 | consumed samples: 36016640 | consumed tokens: 73762078720 | elapsed time per iteration (s): 0.16 | learning rate: 3.573E-05 | global batch size: 256 | lm loss: 3.670181E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.239 | TFLOPs: 25.75 | +7: iteration 140700/ 173500 | consumed samples: 36019200 | consumed tokens: 73767321600 | elapsed time per iteration (s): 0.15 | learning rate: 3.572E-05 | global batch size: 256 | lm loss: 3.671447E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.201 | TFLOPs: 26.24 | +7: iteration 140710/ 173500 | consumed samples: 36021760 | consumed tokens: 73772564480 | elapsed time per iteration (s): 0.15 | learning rate: 3.571E-05 | global batch size: 256 | lm loss: 3.666428E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.136 | TFLOPs: 26.16 | +7: iteration 140720/ 173500 | consumed samples: 36024320 | consumed tokens: 73777807360 | elapsed time per iteration (s): 0.17 | learning rate: 3.570E-05 | global batch size: 256 | lm loss: 3.659259E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.760 | TFLOPs: 24.15 | +7: iteration 140730/ 173500 | consumed samples: 36026880 | consumed tokens: 73783050240 | elapsed time per iteration (s): 0.15 | learning rate: 3.569E-05 | global batch size: 256 | lm loss: 3.670839E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.060 | TFLOPs: 26.14 | +7: iteration 140740/ 173500 | consumed samples: 36029440 | consumed tokens: 73788293120 | elapsed time per iteration (s): 0.15 | learning rate: 3.568E-05 | global batch size: 256 | lm loss: 3.664236E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.849 | TFLOPs: 26.19 | +7: iteration 140750/ 173500 | consumed samples: 36032000 | consumed tokens: 73793536000 | elapsed time per iteration (s): 0.16 | learning rate: 3.567E-05 | global batch size: 256 | lm loss: 3.673356E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.967 | TFLOPs: 25.73 | +7: iteration 140760/ 173500 | consumed samples: 36034560 | consumed tokens: 73798778880 | elapsed time per iteration (s): 0.15 | learning rate: 3.566E-05 | global batch size: 256 | lm loss: 3.673290E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.288 | TFLOPs: 26.12 | +7: iteration 140770/ 173500 | consumed samples: 36037120 | consumed tokens: 73804021760 | elapsed time per iteration (s): 0.16 | learning rate: 3.565E-05 | global batch size: 256 | lm loss: 3.668096E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.255 | TFLOPs: 25.68 | +7: iteration 140780/ 173500 | consumed samples: 36039680 | consumed tokens: 73809264640 | elapsed time per iteration (s): 0.15 | learning rate: 3.564E-05 | global batch size: 256 | lm loss: 3.654223E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.760 | TFLOPs: 26.11 | +7: iteration 140790/ 173500 | consumed samples: 36042240 | consumed tokens: 73814507520 | elapsed time per iteration (s): 0.15 | learning rate: 3.563E-05 | global batch size: 256 | lm loss: 3.671279E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.098 | TFLOPs: 26.14 | +7: iteration 140800/ 173500 | consumed samples: 36044800 | consumed tokens: 73819750400 | elapsed time per iteration (s): 0.15 | learning rate: 3.562E-05 | global batch size: 256 | lm loss: 3.671400E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.173 | TFLOPs: 26.15 | +7: iteration 140810/ 173500 | consumed samples: 36047360 | consumed tokens: 73824993280 | elapsed time per iteration (s): 0.15 | learning rate: 3.561E-05 | global batch size: 256 | lm loss: 3.666285E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.958 | TFLOPs: 26.16 | +7: iteration 140820/ 173500 | consumed samples: 36049920 | consumed tokens: 73830236160 | elapsed time per iteration (s): 0.15 | learning rate: 3.560E-05 | global batch size: 256 | lm loss: 3.662888E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.026 | TFLOPs: 26.14 | +7: iteration 140830/ 173500 | consumed samples: 36052480 | consumed tokens: 73835479040 | elapsed time per iteration (s): 0.16 | learning rate: 3.560E-05 | global batch size: 256 | lm loss: 3.673123E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.375 | TFLOPs: 25.58 | +7: iteration 140840/ 173500 | consumed samples: 36055040 | consumed tokens: 73840721920 | elapsed time per iteration (s): 0.15 | learning rate: 3.559E-05 | global batch size: 256 | lm loss: 3.654665E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.743 | TFLOPs: 26.12 | +7: iteration 140850/ 173500 | consumed samples: 36057600 | consumed tokens: 73845964800 | elapsed time per iteration (s): 0.15 | learning rate: 3.558E-05 | global batch size: 256 | lm loss: 3.667781E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.306 | TFLOPs: 26.19 | +7: iteration 140860/ 173500 | consumed samples: 36060160 | consumed tokens: 73851207680 | elapsed time per iteration (s): 0.16 | learning rate: 3.557E-05 | global batch size: 256 | lm loss: 3.670431E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.767 | TFLOPs: 25.73 | +7: iteration 140870/ 173500 | consumed samples: 36062720 | consumed tokens: 73856450560 | elapsed time per iteration (s): 0.15 | learning rate: 3.556E-05 | global batch size: 256 | lm loss: 3.666619E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.435 | TFLOPs: 26.13 | +7: iteration 140880/ 173500 | consumed samples: 36065280 | consumed tokens: 73861693440 | elapsed time per iteration (s): 0.16 | learning rate: 3.555E-05 | global batch size: 256 | lm loss: 3.661404E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.055 | TFLOPs: 25.69 | +7: iteration 140890/ 173500 | consumed samples: 36067840 | consumed tokens: 73866936320 | elapsed time per iteration (s): 0.15 | learning rate: 3.554E-05 | global batch size: 256 | lm loss: 3.690942E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.295 | TFLOPs: 26.13 | +7: iteration 140900/ 173500 | consumed samples: 36070400 | consumed tokens: 73872179200 | elapsed time per iteration (s): 0.16 | learning rate: 3.553E-05 | global batch size: 256 | lm loss: 3.668934E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.065 | TFLOPs: 25.30 | +7: iteration 140910/ 173500 | consumed samples: 36072960 | consumed tokens: 73877422080 | elapsed time per iteration (s): 0.16 | learning rate: 3.552E-05 | global batch size: 256 | lm loss: 3.675022E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.127 | TFLOPs: 25.89 | +7: iteration 140920/ 173500 | consumed samples: 36075520 | consumed tokens: 73882664960 | elapsed time per iteration (s): 0.16 | learning rate: 3.551E-05 | global batch size: 256 | lm loss: 3.652057E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.817 | TFLOPs: 25.73 | +7: iteration 140930/ 173500 | consumed samples: 36078080 | consumed tokens: 73887907840 | elapsed time per iteration (s): 0.15 | learning rate: 3.550E-05 | global batch size: 256 | lm loss: 3.670899E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.311 | TFLOPs: 26.10 | +7: iteration 140940/ 173500 | consumed samples: 36080640 | consumed tokens: 73893150720 | elapsed time per iteration (s): 0.16 | learning rate: 3.549E-05 | global batch size: 256 | lm loss: 3.666734E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.926 | TFLOPs: 25.55 | +7: iteration 140950/ 173500 | consumed samples: 36083200 | consumed tokens: 73898393600 | elapsed time per iteration (s): 0.16 | learning rate: 3.548E-05 | global batch size: 256 | lm loss: 3.664632E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.883 | TFLOPs: 25.72 | +7: iteration 140960/ 173500 | consumed samples: 36085760 | consumed tokens: 73903636480 | elapsed time per iteration (s): 0.16 | learning rate: 3.548E-05 | global batch size: 256 | lm loss: 3.674546E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.099 | TFLOPs: 25.71 | +7: iteration 140970/ 173500 | consumed samples: 36088320 | consumed tokens: 73908879360 | elapsed time per iteration (s): 0.16 | learning rate: 3.547E-05 | global batch size: 256 | lm loss: 3.674994E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.489 | TFLOPs: 25.44 | +7: iteration 140980/ 173500 | consumed samples: 36090880 | consumed tokens: 73914122240 | elapsed time per iteration (s): 0.15 | learning rate: 3.546E-05 | global batch size: 256 | lm loss: 3.671979E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.398 | TFLOPs: 26.15 | +7: iteration 140990/ 173500 | consumed samples: 36093440 | consumed tokens: 73919365120 | elapsed time per iteration (s): 0.15 | learning rate: 3.545E-05 | global batch size: 256 | lm loss: 3.667821E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.635 | TFLOPs: 26.14 | +7: iteration 141000/ 173500 | consumed samples: 36096000 | consumed tokens: 73924608000 | elapsed time per iteration (s): 0.16 | learning rate: 3.544E-05 | global batch size: 256 | lm loss: 3.678996E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.942 | TFLOPs: 25.44 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 141000 | lm loss value: 3.813592E+00 | lm loss PPL: 4.531290E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 141000 to checkpoints_44m91b100m +0: [2023-03-17 06:22:56,816] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step141000 is begin to save! +0: [2023-03-17 06:22:56,820] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:22:56,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:22:56,888] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:22:56,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:22:56,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:22:56,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:22:56,905] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:22:56,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:22:56,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:22:56,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:22:56,922] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:22:56,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:22:56,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:22:56,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:22:56,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:22:56,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:22:56,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:22:56,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:22:56,954] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:22:56,955] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:22:56,956] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step141000/mp_rank_00_model_states.pt +0: [2023-03-17 06:22:56,956] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:22:56,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:22:56,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:22:56,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:56,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,992] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:56,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,996] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +2: [2023-03-17 06:22:56,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:22:56,997] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +3: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +1: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +6: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:56,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:56,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +5: [2023-03-17 06:22:56,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:22:56,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:22:56,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +4: [2023-03-17 06:22:56,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:22:56,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:22:56,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +7: [2023-03-17 06:22:57,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:22:57,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step141000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:22:57,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step141000 is ready now! +0: successfully saved checkpoint at iteration 141000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.16 +7: iteration 141010/ 173500 | consumed samples: 36098560 | consumed tokens: 73929850880 | elapsed time per iteration (s): 0.18 | learning rate: 3.543E-05 | global batch size: 256 | lm loss: 3.669616E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1421.692 | TFLOPs: 22.30 | +7: iteration 141020/ 173500 | consumed samples: 36101120 | consumed tokens: 73935093760 | elapsed time per iteration (s): 0.16 | learning rate: 3.542E-05 | global batch size: 256 | lm loss: 3.674270E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.364 | TFLOPs: 25.44 | +7: iteration 141030/ 173500 | consumed samples: 36103680 | consumed tokens: 73940336640 | elapsed time per iteration (s): 0.15 | learning rate: 3.541E-05 | global batch size: 256 | lm loss: 3.662229E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.387 | TFLOPs: 26.12 | +7: iteration 141040/ 173500 | consumed samples: 36106240 | consumed tokens: 73945579520 | elapsed time per iteration (s): 0.16 | learning rate: 3.540E-05 | global batch size: 256 | lm loss: 3.673305E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.171 | TFLOPs: 24.53 | +7: iteration 141050/ 173500 | consumed samples: 36108800 | consumed tokens: 73950822400 | elapsed time per iteration (s): 0.16 | learning rate: 3.539E-05 | global batch size: 256 | lm loss: 3.653255E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.583 | TFLOPs: 25.10 | +7: iteration 141060/ 173500 | consumed samples: 36111360 | consumed tokens: 73956065280 | elapsed time per iteration (s): 0.16 | learning rate: 3.538E-05 | global batch size: 256 | lm loss: 3.665904E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.066 | TFLOPs: 25.27 | +7: iteration 141070/ 173500 | consumed samples: 36113920 | consumed tokens: 73961308160 | elapsed time per iteration (s): 0.16 | learning rate: 3.537E-05 | global batch size: 256 | lm loss: 3.659113E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.993 | TFLOPs: 25.58 | +7: iteration 141080/ 173500 | consumed samples: 36116480 | consumed tokens: 73966551040 | elapsed time per iteration (s): 0.16 | learning rate: 3.536E-05 | global batch size: 256 | lm loss: 3.671775E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.694 | TFLOPs: 25.82 | +7: iteration 141090/ 173500 | consumed samples: 36119040 | consumed tokens: 73971793920 | elapsed time per iteration (s): 0.16 | learning rate: 3.536E-05 | global batch size: 256 | lm loss: 3.667027E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.959 | TFLOPs: 25.59 | +7: iteration 141100/ 173500 | consumed samples: 36121600 | consumed tokens: 73977036800 | elapsed time per iteration (s): 0.15 | learning rate: 3.535E-05 | global batch size: 256 | lm loss: 3.661560E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.791 | TFLOPs: 26.14 | +7: iteration 141110/ 173500 | consumed samples: 36124160 | consumed tokens: 73982279680 | elapsed time per iteration (s): 0.16 | learning rate: 3.534E-05 | global batch size: 256 | lm loss: 3.664826E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.305 | TFLOPs: 25.79 | +7: iteration 141120/ 173500 | consumed samples: 36126720 | consumed tokens: 73987522560 | elapsed time per iteration (s): 0.15 | learning rate: 3.533E-05 | global batch size: 256 | lm loss: 3.665724E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.743 | TFLOPs: 26.09 | +7: iteration 141130/ 173500 | consumed samples: 36129280 | consumed tokens: 73992765440 | elapsed time per iteration (s): 0.15 | learning rate: 3.532E-05 | global batch size: 256 | lm loss: 3.663013E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.096 | TFLOPs: 26.11 | +7: iteration 141140/ 173500 | consumed samples: 36131840 | consumed tokens: 73998008320 | elapsed time per iteration (s): 0.16 | learning rate: 3.531E-05 | global batch size: 256 | lm loss: 3.676879E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.893 | TFLOPs: 24.89 | +7: iteration 141150/ 173500 | consumed samples: 36134400 | consumed tokens: 74003251200 | elapsed time per iteration (s): 0.15 | learning rate: 3.530E-05 | global batch size: 256 | lm loss: 3.681978E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.477 | TFLOPs: 26.09 | +7: iteration 141160/ 173500 | consumed samples: 36136960 | consumed tokens: 74008494080 | elapsed time per iteration (s): 0.16 | learning rate: 3.529E-05 | global batch size: 256 | lm loss: 3.658899E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.672 | TFLOPs: 25.73 | +7: iteration 141170/ 173500 | consumed samples: 36139520 | consumed tokens: 74013736960 | elapsed time per iteration (s): 0.16 | learning rate: 3.528E-05 | global batch size: 256 | lm loss: 3.665773E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.689 | TFLOPs: 25.56 | +7: iteration 141180/ 173500 | consumed samples: 36142080 | consumed tokens: 74018979840 | elapsed time per iteration (s): 0.16 | learning rate: 3.527E-05 | global batch size: 256 | lm loss: 3.668774E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.314 | TFLOPs: 25.21 | +7: iteration 141190/ 173500 | consumed samples: 36144640 | consumed tokens: 74024222720 | elapsed time per iteration (s): 0.16 | learning rate: 3.526E-05 | global batch size: 256 | lm loss: 3.681630E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.407 | TFLOPs: 25.16 | +7: iteration 141200/ 173500 | consumed samples: 36147200 | consumed tokens: 74029465600 | elapsed time per iteration (s): 0.16 | learning rate: 3.525E-05 | global batch size: 256 | lm loss: 3.677074E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.757 | TFLOPs: 25.29 | +7: iteration 141210/ 173500 | consumed samples: 36149760 | consumed tokens: 74034708480 | elapsed time per iteration (s): 0.16 | learning rate: 3.525E-05 | global batch size: 256 | lm loss: 3.665356E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.737 | TFLOPs: 25.75 | +7: iteration 141220/ 173500 | consumed samples: 36152320 | consumed tokens: 74039951360 | elapsed time per iteration (s): 0.16 | learning rate: 3.524E-05 | global batch size: 256 | lm loss: 3.663536E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.278 | TFLOPs: 25.61 | +7: iteration 141230/ 173500 | consumed samples: 36154880 | consumed tokens: 74045194240 | elapsed time per iteration (s): 0.15 | learning rate: 3.523E-05 | global batch size: 256 | lm loss: 3.667619E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.994 | TFLOPs: 26.13 | +7: iteration 141240/ 173500 | consumed samples: 36157440 | consumed tokens: 74050437120 | elapsed time per iteration (s): 0.16 | learning rate: 3.522E-05 | global batch size: 256 | lm loss: 3.670922E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.521 | TFLOPs: 25.43 | +7: iteration 141250/ 173500 | consumed samples: 36160000 | consumed tokens: 74055680000 | elapsed time per iteration (s): 0.16 | learning rate: 3.521E-05 | global batch size: 256 | lm loss: 3.682336E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.073 | TFLOPs: 25.86 | +7: iteration 141260/ 173500 | consumed samples: 36162560 | consumed tokens: 74060922880 | elapsed time per iteration (s): 0.16 | learning rate: 3.520E-05 | global batch size: 256 | lm loss: 3.652048E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.215 | TFLOPs: 25.61 | +7: iteration 141270/ 173500 | consumed samples: 36165120 | consumed tokens: 74066165760 | elapsed time per iteration (s): 0.16 | learning rate: 3.519E-05 | global batch size: 256 | lm loss: 3.664222E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.763 | TFLOPs: 25.50 | +7: iteration 141280/ 173500 | consumed samples: 36167680 | consumed tokens: 74071408640 | elapsed time per iteration (s): 0.16 | learning rate: 3.518E-05 | global batch size: 256 | lm loss: 3.683215E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.046 | TFLOPs: 25.61 | +7: iteration 141290/ 173500 | consumed samples: 36170240 | consumed tokens: 74076651520 | elapsed time per iteration (s): 0.16 | learning rate: 3.517E-05 | global batch size: 256 | lm loss: 3.671606E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.782 | TFLOPs: 25.64 | +7: iteration 141300/ 173500 | consumed samples: 36172800 | consumed tokens: 74081894400 | elapsed time per iteration (s): 0.16 | learning rate: 3.516E-05 | global batch size: 256 | lm loss: 3.662737E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.985 | TFLOPs: 25.70 | +7: iteration 141310/ 173500 | consumed samples: 36175360 | consumed tokens: 74087137280 | elapsed time per iteration (s): 0.15 | learning rate: 3.515E-05 | global batch size: 256 | lm loss: 3.681427E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.231 | TFLOPs: 25.96 | +7: iteration 141320/ 173500 | consumed samples: 36177920 | consumed tokens: 74092380160 | elapsed time per iteration (s): 0.16 | learning rate: 3.514E-05 | global batch size: 256 | lm loss: 3.663179E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.117 | TFLOPs: 25.41 | +7: iteration 141330/ 173500 | consumed samples: 36180480 | consumed tokens: 74097623040 | elapsed time per iteration (s): 0.16 | learning rate: 3.514E-05 | global batch size: 256 | lm loss: 3.668934E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.908 | TFLOPs: 25.37 | +7: iteration 141340/ 173500 | consumed samples: 36183040 | consumed tokens: 74102865920 | elapsed time per iteration (s): 0.16 | learning rate: 3.513E-05 | global batch size: 256 | lm loss: 3.658920E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.318 | TFLOPs: 25.60 | +7: iteration 141350/ 173500 | consumed samples: 36185600 | consumed tokens: 74108108800 | elapsed time per iteration (s): 0.15 | learning rate: 3.512E-05 | global batch size: 256 | lm loss: 3.669639E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.520 | TFLOPs: 26.12 | +7: iteration 141360/ 173500 | consumed samples: 36188160 | consumed tokens: 74113351680 | elapsed time per iteration (s): 0.16 | learning rate: 3.511E-05 | global batch size: 256 | lm loss: 3.668982E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.352 | TFLOPs: 25.63 | +7: iteration 141370/ 173500 | consumed samples: 36190720 | consumed tokens: 74118594560 | elapsed time per iteration (s): 0.15 | learning rate: 3.510E-05 | global batch size: 256 | lm loss: 3.666222E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.391 | TFLOPs: 26.16 | +7: iteration 141380/ 173500 | consumed samples: 36193280 | consumed tokens: 74123837440 | elapsed time per iteration (s): 0.15 | learning rate: 3.509E-05 | global batch size: 256 | lm loss: 3.675290E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.334 | TFLOPs: 26.15 | +7: iteration 141390/ 173500 | consumed samples: 36195840 | consumed tokens: 74129080320 | elapsed time per iteration (s): 0.16 | learning rate: 3.508E-05 | global batch size: 256 | lm loss: 3.667719E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.526 | TFLOPs: 25.90 | +7: iteration 141400/ 173500 | consumed samples: 36198400 | consumed tokens: 74134323200 | elapsed time per iteration (s): 0.15 | learning rate: 3.507E-05 | global batch size: 256 | lm loss: 3.668873E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.120 | TFLOPs: 26.08 | +7: iteration 141410/ 173500 | consumed samples: 36200960 | consumed tokens: 74139566080 | elapsed time per iteration (s): 0.16 | learning rate: 3.506E-05 | global batch size: 256 | lm loss: 3.660881E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.097 | TFLOPs: 25.12 | +7: iteration 141420/ 173500 | consumed samples: 36203520 | consumed tokens: 74144808960 | elapsed time per iteration (s): 0.16 | learning rate: 3.505E-05 | global batch size: 256 | lm loss: 3.657127E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.695 | TFLOPs: 25.57 | +7: iteration 141430/ 173500 | consumed samples: 36206080 | consumed tokens: 74150051840 | elapsed time per iteration (s): 0.15 | learning rate: 3.504E-05 | global batch size: 256 | lm loss: 3.667115E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.195 | TFLOPs: 26.07 | +7: iteration 141440/ 173500 | consumed samples: 36208640 | consumed tokens: 74155294720 | elapsed time per iteration (s): 0.16 | learning rate: 3.503E-05 | global batch size: 256 | lm loss: 3.665802E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.792 | TFLOPs: 25.26 | +7: iteration 141450/ 173500 | consumed samples: 36211200 | consumed tokens: 74160537600 | elapsed time per iteration (s): 0.16 | learning rate: 3.503E-05 | global batch size: 256 | lm loss: 3.668871E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.010 | TFLOPs: 25.84 | +7: iteration 141460/ 173500 | consumed samples: 36213760 | consumed tokens: 74165780480 | elapsed time per iteration (s): 0.15 | learning rate: 3.502E-05 | global batch size: 256 | lm loss: 3.686067E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.932 | TFLOPs: 26.06 | +7: iteration 141470/ 173500 | consumed samples: 36216320 | consumed tokens: 74171023360 | elapsed time per iteration (s): 0.15 | learning rate: 3.501E-05 | global batch size: 256 | lm loss: 3.667443E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.082 | TFLOPs: 26.03 | +7: iteration 141480/ 173500 | consumed samples: 36218880 | consumed tokens: 74176266240 | elapsed time per iteration (s): 0.16 | learning rate: 3.500E-05 | global batch size: 256 | lm loss: 3.676570E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.313 | TFLOPs: 25.69 | +7: iteration 141490/ 173500 | consumed samples: 36221440 | consumed tokens: 74181509120 | elapsed time per iteration (s): 0.16 | learning rate: 3.499E-05 | global batch size: 256 | lm loss: 3.681275E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.613 | TFLOPs: 25.74 | +7: iteration 141500/ 173500 | consumed samples: 36224000 | consumed tokens: 74186752000 | elapsed time per iteration (s): 0.16 | learning rate: 3.498E-05 | global batch size: 256 | lm loss: 3.671046E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.747 | TFLOPs: 24.85 | +7: iteration 141510/ 173500 | consumed samples: 36226560 | consumed tokens: 74191994880 | elapsed time per iteration (s): 0.15 | learning rate: 3.497E-05 | global batch size: 256 | lm loss: 3.678575E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.067 | TFLOPs: 26.00 | +7: iteration 141520/ 173500 | consumed samples: 36229120 | consumed tokens: 74197237760 | elapsed time per iteration (s): 0.16 | learning rate: 3.496E-05 | global batch size: 256 | lm loss: 3.668370E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.480 | TFLOPs: 25.60 | +7: iteration 141530/ 173500 | consumed samples: 36231680 | consumed tokens: 74202480640 | elapsed time per iteration (s): 0.16 | learning rate: 3.495E-05 | global batch size: 256 | lm loss: 3.673770E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.354 | TFLOPs: 24.99 | +7: iteration 141540/ 173500 | consumed samples: 36234240 | consumed tokens: 74207723520 | elapsed time per iteration (s): 0.16 | learning rate: 3.494E-05 | global batch size: 256 | lm loss: 3.667421E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.321 | TFLOPs: 25.79 | +7: iteration 141550/ 173500 | consumed samples: 36236800 | consumed tokens: 74212966400 | elapsed time per iteration (s): 0.15 | learning rate: 3.493E-05 | global batch size: 256 | lm loss: 3.650195E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.707 | TFLOPs: 26.08 | +7: iteration 141560/ 173500 | consumed samples: 36239360 | consumed tokens: 74218209280 | elapsed time per iteration (s): 0.15 | learning rate: 3.493E-05 | global batch size: 256 | lm loss: 3.656078E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.183 | TFLOPs: 26.05 | +7: iteration 141570/ 173500 | consumed samples: 36241920 | consumed tokens: 74223452160 | elapsed time per iteration (s): 0.15 | learning rate: 3.492E-05 | global batch size: 256 | lm loss: 3.657762E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.403 | TFLOPs: 26.10 | +7: iteration 141580/ 173500 | consumed samples: 36244480 | consumed tokens: 74228695040 | elapsed time per iteration (s): 0.16 | learning rate: 3.491E-05 | global batch size: 256 | lm loss: 3.670379E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.781 | TFLOPs: 25.70 | +7: iteration 141590/ 173500 | consumed samples: 36247040 | consumed tokens: 74233937920 | elapsed time per iteration (s): 0.16 | learning rate: 3.490E-05 | global batch size: 256 | lm loss: 3.654834E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.122 | TFLOPs: 25.66 | +7: iteration 141600/ 173500 | consumed samples: 36249600 | consumed tokens: 74239180800 | elapsed time per iteration (s): 0.16 | learning rate: 3.489E-05 | global batch size: 256 | lm loss: 3.669646E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.964 | TFLOPs: 25.69 | +7: iteration 141610/ 173500 | consumed samples: 36252160 | consumed tokens: 74244423680 | elapsed time per iteration (s): 0.16 | learning rate: 3.488E-05 | global batch size: 256 | lm loss: 3.674185E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.772 | TFLOPs: 25.83 | +7: iteration 141620/ 173500 | consumed samples: 36254720 | consumed tokens: 74249666560 | elapsed time per iteration (s): 0.16 | learning rate: 3.487E-05 | global batch size: 256 | lm loss: 3.663204E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.193 | TFLOPs: 25.66 | +7: iteration 141630/ 173500 | consumed samples: 36257280 | consumed tokens: 74254909440 | elapsed time per iteration (s): 0.16 | learning rate: 3.486E-05 | global batch size: 256 | lm loss: 3.672106E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.736 | TFLOPs: 25.86 | +7: iteration 141640/ 173500 | consumed samples: 36259840 | consumed tokens: 74260152320 | elapsed time per iteration (s): 0.16 | learning rate: 3.485E-05 | global batch size: 256 | lm loss: 3.668496E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.792 | TFLOPs: 25.87 | +7: iteration 141650/ 173500 | consumed samples: 36262400 | consumed tokens: 74265395200 | elapsed time per iteration (s): 0.16 | learning rate: 3.484E-05 | global batch size: 256 | lm loss: 3.675280E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.211 | TFLOPs: 25.72 | +7: iteration 141660/ 173500 | consumed samples: 36264960 | consumed tokens: 74270638080 | elapsed time per iteration (s): 0.16 | learning rate: 3.484E-05 | global batch size: 256 | lm loss: 3.672394E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.052 | TFLOPs: 25.53 | +7: iteration 141670/ 173500 | consumed samples: 36267520 | consumed tokens: 74275880960 | elapsed time per iteration (s): 0.16 | learning rate: 3.483E-05 | global batch size: 256 | lm loss: 3.670145E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.169 | TFLOPs: 25.19 | +7: iteration 141680/ 173500 | consumed samples: 36270080 | consumed tokens: 74281123840 | elapsed time per iteration (s): 0.16 | learning rate: 3.482E-05 | global batch size: 256 | lm loss: 3.663748E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.045 | TFLOPs: 25.69 | +7: iteration 141690/ 173500 | consumed samples: 36272640 | consumed tokens: 74286366720 | elapsed time per iteration (s): 0.16 | learning rate: 3.481E-05 | global batch size: 256 | lm loss: 3.670910E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.388 | TFLOPs: 25.22 | +7: iteration 141700/ 173500 | consumed samples: 36275200 | consumed tokens: 74291609600 | elapsed time per iteration (s): 0.16 | learning rate: 3.480E-05 | global batch size: 256 | lm loss: 3.671220E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.098 | TFLOPs: 25.49 | +7: iteration 141710/ 173500 | consumed samples: 36277760 | consumed tokens: 74296852480 | elapsed time per iteration (s): 0.16 | learning rate: 3.479E-05 | global batch size: 256 | lm loss: 3.670658E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.628 | TFLOPs: 25.34 | +7: iteration 141720/ 173500 | consumed samples: 36280320 | consumed tokens: 74302095360 | elapsed time per iteration (s): 0.16 | learning rate: 3.478E-05 | global batch size: 256 | lm loss: 3.664073E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.465 | TFLOPs: 25.84 | +7: iteration 141730/ 173500 | consumed samples: 36282880 | consumed tokens: 74307338240 | elapsed time per iteration (s): 0.16 | learning rate: 3.477E-05 | global batch size: 256 | lm loss: 3.650500E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.227 | TFLOPs: 25.85 | +7: iteration 141740/ 173500 | consumed samples: 36285440 | consumed tokens: 74312581120 | elapsed time per iteration (s): 0.19 | learning rate: 3.476E-05 | global batch size: 256 | lm loss: 3.664258E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1348.590 | TFLOPs: 21.15 | +7: iteration 141750/ 173500 | consumed samples: 36288000 | consumed tokens: 74317824000 | elapsed time per iteration (s): 0.15 | learning rate: 3.475E-05 | global batch size: 256 | lm loss: 3.654942E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.342 | TFLOPs: 25.94 | +7: iteration 141760/ 173500 | consumed samples: 36290560 | consumed tokens: 74323066880 | elapsed time per iteration (s): 0.15 | learning rate: 3.474E-05 | global batch size: 256 | lm loss: 3.672824E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.846 | TFLOPs: 25.92 | +7: iteration 141770/ 173500 | consumed samples: 36293120 | consumed tokens: 74328309760 | elapsed time per iteration (s): 0.16 | learning rate: 3.474E-05 | global batch size: 256 | lm loss: 3.662210E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.421 | TFLOPs: 25.46 | +7: iteration 141780/ 173500 | consumed samples: 36295680 | consumed tokens: 74333552640 | elapsed time per iteration (s): 0.16 | learning rate: 3.473E-05 | global batch size: 256 | lm loss: 3.647152E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.397 | TFLOPs: 25.74 | +7: iteration 141790/ 173500 | consumed samples: 36298240 | consumed tokens: 74338795520 | elapsed time per iteration (s): 0.16 | learning rate: 3.472E-05 | global batch size: 256 | lm loss: 3.673097E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.988 | TFLOPs: 25.28 | +7: iteration 141800/ 173500 | consumed samples: 36300800 | consumed tokens: 74344038400 | elapsed time per iteration (s): 0.16 | learning rate: 3.471E-05 | global batch size: 256 | lm loss: 3.667758E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.301 | TFLOPs: 25.47 | +7: iteration 141810/ 173500 | consumed samples: 36303360 | consumed tokens: 74349281280 | elapsed time per iteration (s): 0.16 | learning rate: 3.470E-05 | global batch size: 256 | lm loss: 3.664270E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.205 | TFLOPs: 25.28 | +7: iteration 141820/ 173500 | consumed samples: 36305920 | consumed tokens: 74354524160 | elapsed time per iteration (s): 0.17 | learning rate: 3.469E-05 | global batch size: 256 | lm loss: 3.660880E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.761 | TFLOPs: 23.60 | +7: iteration 141830/ 173500 | consumed samples: 36308480 | consumed tokens: 74359767040 | elapsed time per iteration (s): 0.17 | learning rate: 3.468E-05 | global batch size: 256 | lm loss: 3.667994E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.637 | TFLOPs: 23.36 | +7: iteration 141840/ 173500 | consumed samples: 36311040 | consumed tokens: 74365009920 | elapsed time per iteration (s): 0.15 | learning rate: 3.467E-05 | global batch size: 256 | lm loss: 3.667621E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.372 | TFLOPs: 26.09 | +7: iteration 141850/ 173500 | consumed samples: 36313600 | consumed tokens: 74370252800 | elapsed time per iteration (s): 0.16 | learning rate: 3.466E-05 | global batch size: 256 | lm loss: 3.670113E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.785 | TFLOPs: 25.87 | +7: iteration 141860/ 173500 | consumed samples: 36316160 | consumed tokens: 74375495680 | elapsed time per iteration (s): 0.16 | learning rate: 3.465E-05 | global batch size: 256 | lm loss: 3.667131E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.601 | TFLOPs: 25.76 | +7: iteration 141870/ 173500 | consumed samples: 36318720 | consumed tokens: 74380738560 | elapsed time per iteration (s): 0.16 | learning rate: 3.465E-05 | global batch size: 256 | lm loss: 3.662804E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.410 | TFLOPs: 25.79 | +7: iteration 141880/ 173500 | consumed samples: 36321280 | consumed tokens: 74385981440 | elapsed time per iteration (s): 0.16 | learning rate: 3.464E-05 | global batch size: 256 | lm loss: 3.659545E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.602 | TFLOPs: 25.48 | +7: iteration 141890/ 173500 | consumed samples: 36323840 | consumed tokens: 74391224320 | elapsed time per iteration (s): 0.16 | learning rate: 3.463E-05 | global batch size: 256 | lm loss: 3.678497E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.453 | TFLOPs: 25.88 | +7: iteration 141900/ 173500 | consumed samples: 36326400 | consumed tokens: 74396467200 | elapsed time per iteration (s): 0.16 | learning rate: 3.462E-05 | global batch size: 256 | lm loss: 3.672631E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.313 | TFLOPs: 25.66 | +7: iteration 141910/ 173500 | consumed samples: 36328960 | consumed tokens: 74401710080 | elapsed time per iteration (s): 0.16 | learning rate: 3.461E-05 | global batch size: 256 | lm loss: 3.662078E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.296 | TFLOPs: 25.76 | +7: iteration 141920/ 173500 | consumed samples: 36331520 | consumed tokens: 74406952960 | elapsed time per iteration (s): 0.15 | learning rate: 3.460E-05 | global batch size: 256 | lm loss: 3.666262E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.084 | TFLOPs: 26.11 | +7: iteration 141930/ 173500 | consumed samples: 36334080 | consumed tokens: 74412195840 | elapsed time per iteration (s): 0.16 | learning rate: 3.459E-05 | global batch size: 256 | lm loss: 3.659591E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.049 | TFLOPs: 25.22 | +7: iteration 141940/ 173500 | consumed samples: 36336640 | consumed tokens: 74417438720 | elapsed time per iteration (s): 0.16 | learning rate: 3.458E-05 | global batch size: 256 | lm loss: 3.674443E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.288 | TFLOPs: 25.50 | +7: iteration 141950/ 173500 | consumed samples: 36339200 | consumed tokens: 74422681600 | elapsed time per iteration (s): 0.15 | learning rate: 3.457E-05 | global batch size: 256 | lm loss: 3.671995E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.040 | TFLOPs: 26.10 | +7: iteration 141960/ 173500 | consumed samples: 36341760 | consumed tokens: 74427924480 | elapsed time per iteration (s): 0.15 | learning rate: 3.456E-05 | global batch size: 256 | lm loss: 3.653273E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.830 | TFLOPs: 26.08 | +7: iteration 141970/ 173500 | consumed samples: 36344320 | consumed tokens: 74433167360 | elapsed time per iteration (s): 0.15 | learning rate: 3.456E-05 | global batch size: 256 | lm loss: 3.662672E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.544 | TFLOPs: 26.10 | +7: iteration 141980/ 173500 | consumed samples: 36346880 | consumed tokens: 74438410240 | elapsed time per iteration (s): 0.16 | learning rate: 3.455E-05 | global batch size: 256 | lm loss: 3.654116E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.653 | TFLOPs: 25.57 | +7: iteration 141990/ 173500 | consumed samples: 36349440 | consumed tokens: 74443653120 | elapsed time per iteration (s): 0.15 | learning rate: 3.454E-05 | global batch size: 256 | lm loss: 3.671367E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.993 | TFLOPs: 26.08 | +0: [2023-03-17 06:25:33,960] [INFO] [logging.py:68:log_dist] [Rank 0] step=142000, skipped=0, lr=[3.452880099827123e-05, 3.452880099827123e-05, 3.452880099827123e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 142000/ 173500 | consumed samples: 36352000 | consumed tokens: 74448896000 | elapsed time per iteration (s): 0.15 | learning rate: 3.453E-05 | global batch size: 256 | lm loss: 3.661979E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.539 | TFLOPs: 25.93 | +0: steps: 142000 loss: 3.6250 iter time (s): 0.155 samples/sec: 1653.277 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 142000 | lm loss value: 3.814332E+00 | lm loss PPL: 4.534644E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 142000 to checkpoints_44m91b100m +0: [2023-03-17 06:25:34,034] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step142000 is begin to save! +0: [2023-03-17 06:25:34,037] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:25:34,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:25:34,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:25:34,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:25:34,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:25:34,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:25:34,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:25:34,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:25:34,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:25:34,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:25:34,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:25:34,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:25:34,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:25:34,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:25:34,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:25:34,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:25:34,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:25:34,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:25:34,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:25:34,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:25:34,166] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step142000/mp_rank_00_model_states.pt +0: [2023-03-17 06:25:34,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:25:34,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:25:34,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,202] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:25:34,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 06:25:34,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +6: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +2: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +1: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +5: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +3: [2023-03-17 06:25:34,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +4: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:25:34,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +7: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:25:34,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step142000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:25:34,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step142000 is ready now! +0: successfully saved checkpoint at iteration 142000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.84 +7: iteration 142010/ 173500 | consumed samples: 36354560 | consumed tokens: 74454138880 | elapsed time per iteration (s): 0.18 | learning rate: 3.452E-05 | global batch size: 256 | lm loss: 3.672678E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.949 | TFLOPs: 22.57 | +7: iteration 142020/ 173500 | consumed samples: 36357120 | consumed tokens: 74459381760 | elapsed time per iteration (s): 0.16 | learning rate: 3.451E-05 | global batch size: 256 | lm loss: 3.678930E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.668 | TFLOPs: 25.81 | +7: iteration 142030/ 173500 | consumed samples: 36359680 | consumed tokens: 74464624640 | elapsed time per iteration (s): 0.16 | learning rate: 3.450E-05 | global batch size: 256 | lm loss: 3.664200E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.029 | TFLOPs: 25.59 | +7: iteration 142040/ 173500 | consumed samples: 36362240 | consumed tokens: 74469867520 | elapsed time per iteration (s): 0.16 | learning rate: 3.449E-05 | global batch size: 256 | lm loss: 3.678671E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.701 | TFLOPs: 25.17 | +7: iteration 142050/ 173500 | consumed samples: 36364800 | consumed tokens: 74475110400 | elapsed time per iteration (s): 0.16 | learning rate: 3.448E-05 | global batch size: 256 | lm loss: 3.660093E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.230 | TFLOPs: 24.95 | +7: iteration 142060/ 173500 | consumed samples: 36367360 | consumed tokens: 74480353280 | elapsed time per iteration (s): 0.16 | learning rate: 3.448E-05 | global batch size: 256 | lm loss: 3.661334E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.020 | TFLOPs: 25.77 | +7: iteration 142070/ 173500 | consumed samples: 36369920 | consumed tokens: 74485596160 | elapsed time per iteration (s): 0.16 | learning rate: 3.447E-05 | global batch size: 256 | lm loss: 3.669453E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.254 | TFLOPs: 24.61 | +7: iteration 142080/ 173500 | consumed samples: 36372480 | consumed tokens: 74490839040 | elapsed time per iteration (s): 0.15 | learning rate: 3.446E-05 | global batch size: 256 | lm loss: 3.665322E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.860 | TFLOPs: 26.30 | +7: iteration 142090/ 173500 | consumed samples: 36375040 | consumed tokens: 74496081920 | elapsed time per iteration (s): 0.17 | learning rate: 3.445E-05 | global batch size: 256 | lm loss: 3.666771E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.404 | TFLOPs: 24.22 | +7: iteration 142100/ 173500 | consumed samples: 36377600 | consumed tokens: 74501324800 | elapsed time per iteration (s): 0.16 | learning rate: 3.444E-05 | global batch size: 256 | lm loss: 3.658852E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1552.990 | TFLOPs: 24.35 | +7: iteration 142110/ 173500 | consumed samples: 36380160 | consumed tokens: 74506567680 | elapsed time per iteration (s): 0.16 | learning rate: 3.443E-05 | global batch size: 256 | lm loss: 3.654632E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.918 | TFLOPs: 25.04 | +7: iteration 142120/ 173500 | consumed samples: 36382720 | consumed tokens: 74511810560 | elapsed time per iteration (s): 0.15 | learning rate: 3.442E-05 | global batch size: 256 | lm loss: 3.671587E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.729 | TFLOPs: 26.28 | +7: iteration 142130/ 173500 | consumed samples: 36385280 | consumed tokens: 74517053440 | elapsed time per iteration (s): 0.15 | learning rate: 3.441E-05 | global batch size: 256 | lm loss: 3.667719E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.696 | TFLOPs: 26.28 | +7: iteration 142140/ 173500 | consumed samples: 36387840 | consumed tokens: 74522296320 | elapsed time per iteration (s): 0.16 | learning rate: 3.440E-05 | global batch size: 256 | lm loss: 3.670174E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.333 | TFLOPs: 25.41 | +7: iteration 142150/ 173500 | consumed samples: 36390400 | consumed tokens: 74527539200 | elapsed time per iteration (s): 0.16 | learning rate: 3.439E-05 | global batch size: 256 | lm loss: 3.661423E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.185 | TFLOPs: 25.80 | +7: iteration 142160/ 173500 | consumed samples: 36392960 | consumed tokens: 74532782080 | elapsed time per iteration (s): 0.15 | learning rate: 3.439E-05 | global batch size: 256 | lm loss: 3.675707E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.694 | TFLOPs: 26.29 | +7: iteration 142170/ 173500 | consumed samples: 36395520 | consumed tokens: 74538024960 | elapsed time per iteration (s): 0.16 | learning rate: 3.438E-05 | global batch size: 256 | lm loss: 3.658478E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.050 | TFLOPs: 25.22 | +7: iteration 142180/ 173500 | consumed samples: 36398080 | consumed tokens: 74543267840 | elapsed time per iteration (s): 0.15 | learning rate: 3.437E-05 | global batch size: 256 | lm loss: 3.664773E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.131 | TFLOPs: 26.30 | +7: iteration 142190/ 173500 | consumed samples: 36400640 | consumed tokens: 74548510720 | elapsed time per iteration (s): 0.16 | learning rate: 3.436E-05 | global batch size: 256 | lm loss: 3.669188E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.099 | TFLOPs: 25.36 | +7: iteration 142200/ 173500 | consumed samples: 36403200 | consumed tokens: 74553753600 | elapsed time per iteration (s): 0.16 | learning rate: 3.435E-05 | global batch size: 256 | lm loss: 3.671680E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.004 | TFLOPs: 25.23 | +7: iteration 142210/ 173500 | consumed samples: 36405760 | consumed tokens: 74558996480 | elapsed time per iteration (s): 0.16 | learning rate: 3.434E-05 | global batch size: 256 | lm loss: 3.656773E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.160 | TFLOPs: 25.13 | +7: iteration 142220/ 173500 | consumed samples: 36408320 | consumed tokens: 74564239360 | elapsed time per iteration (s): 0.15 | learning rate: 3.433E-05 | global batch size: 256 | lm loss: 3.674383E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.786 | TFLOPs: 26.23 | +7: iteration 142230/ 173500 | consumed samples: 36410880 | consumed tokens: 74569482240 | elapsed time per iteration (s): 0.15 | learning rate: 3.432E-05 | global batch size: 256 | lm loss: 3.678544E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.470 | TFLOPs: 25.99 | +7: iteration 142240/ 173500 | consumed samples: 36413440 | consumed tokens: 74574725120 | elapsed time per iteration (s): 0.16 | learning rate: 3.431E-05 | global batch size: 256 | lm loss: 3.666412E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.647 | TFLOPs: 25.60 | +7: iteration 142250/ 173500 | consumed samples: 36416000 | consumed tokens: 74579968000 | elapsed time per iteration (s): 0.16 | learning rate: 3.431E-05 | global batch size: 256 | lm loss: 3.654140E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.846 | TFLOPs: 25.51 | +7: iteration 142260/ 173500 | consumed samples: 36418560 | consumed tokens: 74585210880 | elapsed time per iteration (s): 0.16 | learning rate: 3.430E-05 | global batch size: 256 | lm loss: 3.665302E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.851 | TFLOPs: 25.47 | +7: iteration 142270/ 173500 | consumed samples: 36421120 | consumed tokens: 74590453760 | elapsed time per iteration (s): 0.15 | learning rate: 3.429E-05 | global batch size: 256 | lm loss: 3.665562E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.412 | TFLOPs: 26.12 | +7: iteration 142280/ 173500 | consumed samples: 36423680 | consumed tokens: 74595696640 | elapsed time per iteration (s): 0.15 | learning rate: 3.428E-05 | global batch size: 256 | lm loss: 3.676273E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.205 | TFLOPs: 26.18 | +7: iteration 142290/ 173500 | consumed samples: 36426240 | consumed tokens: 74600939520 | elapsed time per iteration (s): 0.15 | learning rate: 3.427E-05 | global batch size: 256 | lm loss: 3.666759E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.642 | TFLOPs: 26.01 | +7: iteration 142300/ 173500 | consumed samples: 36428800 | consumed tokens: 74606182400 | elapsed time per iteration (s): 0.15 | learning rate: 3.426E-05 | global batch size: 256 | lm loss: 3.666446E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.756 | TFLOPs: 26.00 | +7: iteration 142310/ 173500 | consumed samples: 36431360 | consumed tokens: 74611425280 | elapsed time per iteration (s): 0.16 | learning rate: 3.425E-05 | global batch size: 256 | lm loss: 3.650008E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.918 | TFLOPs: 25.89 | +7: iteration 142320/ 173500 | consumed samples: 36433920 | consumed tokens: 74616668160 | elapsed time per iteration (s): 0.15 | learning rate: 3.424E-05 | global batch size: 256 | lm loss: 3.668086E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.058 | TFLOPs: 26.03 | +7: iteration 142330/ 173500 | consumed samples: 36436480 | consumed tokens: 74621911040 | elapsed time per iteration (s): 0.16 | learning rate: 3.423E-05 | global batch size: 256 | lm loss: 3.665543E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.773 | TFLOPs: 25.68 | +7: iteration 142340/ 173500 | consumed samples: 36439040 | consumed tokens: 74627153920 | elapsed time per iteration (s): 0.15 | learning rate: 3.423E-05 | global batch size: 256 | lm loss: 3.659707E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.692 | TFLOPs: 26.22 | +7: iteration 142350/ 173500 | consumed samples: 36441600 | consumed tokens: 74632396800 | elapsed time per iteration (s): 0.15 | learning rate: 3.422E-05 | global batch size: 256 | lm loss: 3.662667E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.469 | TFLOPs: 26.24 | +7: iteration 142360/ 173500 | consumed samples: 36444160 | consumed tokens: 74637639680 | elapsed time per iteration (s): 0.15 | learning rate: 3.421E-05 | global batch size: 256 | lm loss: 3.674649E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.464 | TFLOPs: 26.23 | +7: iteration 142370/ 173500 | consumed samples: 36446720 | consumed tokens: 74642882560 | elapsed time per iteration (s): 0.15 | learning rate: 3.420E-05 | global batch size: 256 | lm loss: 3.668092E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.890 | TFLOPs: 26.11 | +7: iteration 142380/ 173500 | consumed samples: 36449280 | consumed tokens: 74648125440 | elapsed time per iteration (s): 0.19 | learning rate: 3.419E-05 | global batch size: 256 | lm loss: 3.652591E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.654 | TFLOPs: 21.70 | +7: iteration 142390/ 173500 | consumed samples: 36451840 | consumed tokens: 74653368320 | elapsed time per iteration (s): 0.15 | learning rate: 3.418E-05 | global batch size: 256 | lm loss: 3.678616E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.723 | TFLOPs: 26.06 | +7: iteration 142400/ 173500 | consumed samples: 36454400 | consumed tokens: 74658611200 | elapsed time per iteration (s): 0.16 | learning rate: 3.417E-05 | global batch size: 256 | lm loss: 3.661531E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.515 | TFLOPs: 25.48 | +7: iteration 142410/ 173500 | consumed samples: 36456960 | consumed tokens: 74663854080 | elapsed time per iteration (s): 0.15 | learning rate: 3.416E-05 | global batch size: 256 | lm loss: 3.665717E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.506 | TFLOPs: 26.21 | +7: iteration 142420/ 173500 | consumed samples: 36459520 | consumed tokens: 74669096960 | elapsed time per iteration (s): 0.16 | learning rate: 3.415E-05 | global batch size: 256 | lm loss: 3.668419E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.801 | TFLOPs: 25.48 | +7: iteration 142430/ 173500 | consumed samples: 36462080 | consumed tokens: 74674339840 | elapsed time per iteration (s): 0.15 | learning rate: 3.415E-05 | global batch size: 256 | lm loss: 3.667003E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.339 | TFLOPs: 26.05 | +7: iteration 142440/ 173500 | consumed samples: 36464640 | consumed tokens: 74679582720 | elapsed time per iteration (s): 0.16 | learning rate: 3.414E-05 | global batch size: 256 | lm loss: 3.666406E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.366 | TFLOPs: 25.88 | +7: iteration 142450/ 173500 | consumed samples: 36467200 | consumed tokens: 74684825600 | elapsed time per iteration (s): 0.15 | learning rate: 3.413E-05 | global batch size: 256 | lm loss: 3.671425E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.314 | TFLOPs: 26.01 | +7: iteration 142460/ 173500 | consumed samples: 36469760 | consumed tokens: 74690068480 | elapsed time per iteration (s): 0.16 | learning rate: 3.412E-05 | global batch size: 256 | lm loss: 3.657518E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.150 | TFLOPs: 25.52 | +7: iteration 142470/ 173500 | consumed samples: 36472320 | consumed tokens: 74695311360 | elapsed time per iteration (s): 0.16 | learning rate: 3.411E-05 | global batch size: 256 | lm loss: 3.681737E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.335 | TFLOPs: 25.58 | +7: iteration 142480/ 173500 | consumed samples: 36474880 | consumed tokens: 74700554240 | elapsed time per iteration (s): 0.16 | learning rate: 3.410E-05 | global batch size: 256 | lm loss: 3.657109E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.441 | TFLOPs: 25.85 | +7: iteration 142490/ 173500 | consumed samples: 36477440 | consumed tokens: 74705797120 | elapsed time per iteration (s): 0.15 | learning rate: 3.409E-05 | global batch size: 256 | lm loss: 3.685109E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.218 | TFLOPs: 26.19 | +7: iteration 142500/ 173500 | consumed samples: 36480000 | consumed tokens: 74711040000 | elapsed time per iteration (s): 0.16 | learning rate: 3.408E-05 | global batch size: 256 | lm loss: 3.664680E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.063 | TFLOPs: 25.83 | +7: iteration 142510/ 173500 | consumed samples: 36482560 | consumed tokens: 74716282880 | elapsed time per iteration (s): 0.16 | learning rate: 3.407E-05 | global batch size: 256 | lm loss: 3.668658E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.655 | TFLOPs: 25.01 | +7: iteration 142520/ 173500 | consumed samples: 36485120 | consumed tokens: 74721525760 | elapsed time per iteration (s): 0.16 | learning rate: 3.407E-05 | global batch size: 256 | lm loss: 3.671101E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.223 | TFLOPs: 25.42 | +7: iteration 142530/ 173500 | consumed samples: 36487680 | consumed tokens: 74726768640 | elapsed time per iteration (s): 0.16 | learning rate: 3.406E-05 | global batch size: 256 | lm loss: 3.680652E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.062 | TFLOPs: 25.52 | +7: iteration 142540/ 173500 | consumed samples: 36490240 | consumed tokens: 74732011520 | elapsed time per iteration (s): 0.16 | learning rate: 3.405E-05 | global batch size: 256 | lm loss: 3.673653E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.766 | TFLOPs: 25.31 | +7: iteration 142550/ 173500 | consumed samples: 36492800 | consumed tokens: 74737254400 | elapsed time per iteration (s): 0.15 | learning rate: 3.404E-05 | global batch size: 256 | lm loss: 3.658568E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.003 | TFLOPs: 26.00 | +7: iteration 142560/ 173500 | consumed samples: 36495360 | consumed tokens: 74742497280 | elapsed time per iteration (s): 0.16 | learning rate: 3.403E-05 | global batch size: 256 | lm loss: 3.671483E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.077 | TFLOPs: 25.81 | +7: iteration 142570/ 173500 | consumed samples: 36497920 | consumed tokens: 74747740160 | elapsed time per iteration (s): 0.21 | learning rate: 3.402E-05 | global batch size: 256 | lm loss: 3.672742E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1230.575 | TFLOPs: 19.30 | +7: iteration 142580/ 173500 | consumed samples: 36500480 | consumed tokens: 74752983040 | elapsed time per iteration (s): 0.16 | learning rate: 3.401E-05 | global batch size: 256 | lm loss: 3.670810E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.845 | TFLOPs: 24.78 | +7: iteration 142590/ 173500 | consumed samples: 36503040 | consumed tokens: 74758225920 | elapsed time per iteration (s): 0.16 | learning rate: 3.400E-05 | global batch size: 256 | lm loss: 3.672179E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.698 | TFLOPs: 25.26 | +7: iteration 142600/ 173500 | consumed samples: 36505600 | consumed tokens: 74763468800 | elapsed time per iteration (s): 0.16 | learning rate: 3.400E-05 | global batch size: 256 | lm loss: 3.662868E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.256 | TFLOPs: 25.85 | +7: iteration 142610/ 173500 | consumed samples: 36508160 | consumed tokens: 74768711680 | elapsed time per iteration (s): 0.16 | learning rate: 3.399E-05 | global batch size: 256 | lm loss: 3.664910E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.587 | TFLOPs: 25.87 | +7: iteration 142620/ 173500 | consumed samples: 36510720 | consumed tokens: 74773954560 | elapsed time per iteration (s): 0.15 | learning rate: 3.398E-05 | global batch size: 256 | lm loss: 3.671496E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.525 | TFLOPs: 26.09 | +7: iteration 142630/ 173500 | consumed samples: 36513280 | consumed tokens: 74779197440 | elapsed time per iteration (s): 0.16 | learning rate: 3.397E-05 | global batch size: 256 | lm loss: 3.674015E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.868 | TFLOPs: 24.84 | +7: iteration 142640/ 173500 | consumed samples: 36515840 | consumed tokens: 74784440320 | elapsed time per iteration (s): 0.15 | learning rate: 3.396E-05 | global batch size: 256 | lm loss: 3.680222E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.164 | TFLOPs: 25.96 | +7: iteration 142650/ 173500 | consumed samples: 36518400 | consumed tokens: 74789683200 | elapsed time per iteration (s): 0.16 | learning rate: 3.395E-05 | global batch size: 256 | lm loss: 3.670991E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.780 | TFLOPs: 25.32 | +7: iteration 142660/ 173500 | consumed samples: 36520960 | consumed tokens: 74794926080 | elapsed time per iteration (s): 0.15 | learning rate: 3.394E-05 | global batch size: 256 | lm loss: 3.652067E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.207 | TFLOPs: 26.33 | +7: iteration 142670/ 173500 | consumed samples: 36523520 | consumed tokens: 74800168960 | elapsed time per iteration (s): 0.15 | learning rate: 3.393E-05 | global batch size: 256 | lm loss: 3.673998E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.450 | TFLOPs: 26.34 | +7: iteration 142680/ 173500 | consumed samples: 36526080 | consumed tokens: 74805411840 | elapsed time per iteration (s): 0.15 | learning rate: 3.392E-05 | global batch size: 256 | lm loss: 3.666345E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.493 | TFLOPs: 26.32 | +7: iteration 142690/ 173500 | consumed samples: 36528640 | consumed tokens: 74810654720 | elapsed time per iteration (s): 0.16 | learning rate: 3.392E-05 | global batch size: 256 | lm loss: 3.675187E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.280 | TFLOPs: 25.57 | +7: iteration 142700/ 173500 | consumed samples: 36531200 | consumed tokens: 74815897600 | elapsed time per iteration (s): 0.18 | learning rate: 3.391E-05 | global batch size: 256 | lm loss: 3.689114E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.249 | TFLOPs: 22.05 | +7: iteration 142710/ 173500 | consumed samples: 36533760 | consumed tokens: 74821140480 | elapsed time per iteration (s): 0.16 | learning rate: 3.390E-05 | global batch size: 256 | lm loss: 3.664190E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.673 | TFLOPs: 25.40 | +7: iteration 142720/ 173500 | consumed samples: 36536320 | consumed tokens: 74826383360 | elapsed time per iteration (s): 0.17 | learning rate: 3.389E-05 | global batch size: 256 | lm loss: 3.668664E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.622 | TFLOPs: 24.19 | +7: iteration 142730/ 173500 | consumed samples: 36538880 | consumed tokens: 74831626240 | elapsed time per iteration (s): 0.16 | learning rate: 3.388E-05 | global batch size: 256 | lm loss: 3.665166E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.835 | TFLOPs: 25.83 | +7: iteration 142740/ 173500 | consumed samples: 36541440 | consumed tokens: 74836869120 | elapsed time per iteration (s): 0.15 | learning rate: 3.387E-05 | global batch size: 256 | lm loss: 3.669046E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.836 | TFLOPs: 26.23 | +7: iteration 142750/ 173500 | consumed samples: 36544000 | consumed tokens: 74842112000 | elapsed time per iteration (s): 0.16 | learning rate: 3.386E-05 | global batch size: 256 | lm loss: 3.660560E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.374 | TFLOPs: 25.44 | +7: iteration 142760/ 173500 | consumed samples: 36546560 | consumed tokens: 74847354880 | elapsed time per iteration (s): 0.16 | learning rate: 3.385E-05 | global batch size: 256 | lm loss: 3.677423E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.029 | TFLOPs: 25.74 | +7: iteration 142770/ 173500 | consumed samples: 36549120 | consumed tokens: 74852597760 | elapsed time per iteration (s): 0.19 | learning rate: 3.385E-05 | global batch size: 256 | lm loss: 3.670288E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1337.854 | TFLOPs: 20.98 | +7: iteration 142780/ 173500 | consumed samples: 36551680 | consumed tokens: 74857840640 | elapsed time per iteration (s): 0.15 | learning rate: 3.384E-05 | global batch size: 256 | lm loss: 3.655257E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.626 | TFLOPs: 25.96 | +7: iteration 142790/ 173500 | consumed samples: 36554240 | consumed tokens: 74863083520 | elapsed time per iteration (s): 0.16 | learning rate: 3.383E-05 | global batch size: 256 | lm loss: 3.661441E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.821 | TFLOPs: 25.48 | +7: iteration 142800/ 173500 | consumed samples: 36556800 | consumed tokens: 74868326400 | elapsed time per iteration (s): 0.16 | learning rate: 3.382E-05 | global batch size: 256 | lm loss: 3.674045E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.974 | TFLOPs: 25.80 | +7: iteration 142810/ 173500 | consumed samples: 36559360 | consumed tokens: 74873569280 | elapsed time per iteration (s): 0.16 | learning rate: 3.381E-05 | global batch size: 256 | lm loss: 3.646627E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.945 | TFLOPs: 24.65 | +7: iteration 142820/ 173500 | consumed samples: 36561920 | consumed tokens: 74878812160 | elapsed time per iteration (s): 0.16 | learning rate: 3.380E-05 | global batch size: 256 | lm loss: 3.676429E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.626 | TFLOPs: 24.88 | +7: iteration 142830/ 173500 | consumed samples: 36564480 | consumed tokens: 74884055040 | elapsed time per iteration (s): 0.15 | learning rate: 3.379E-05 | global batch size: 256 | lm loss: 3.665643E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.705 | TFLOPs: 26.25 | +7: iteration 142840/ 173500 | consumed samples: 36567040 | consumed tokens: 74889297920 | elapsed time per iteration (s): 0.16 | learning rate: 3.378E-05 | global batch size: 256 | lm loss: 3.663409E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.261 | TFLOPs: 25.47 | +7: iteration 142850/ 173500 | consumed samples: 36569600 | consumed tokens: 74894540800 | elapsed time per iteration (s): 0.17 | learning rate: 3.378E-05 | global batch size: 256 | lm loss: 3.668149E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.359 | TFLOPs: 24.08 | +7: iteration 142860/ 173500 | consumed samples: 36572160 | consumed tokens: 74899783680 | elapsed time per iteration (s): 0.17 | learning rate: 3.377E-05 | global batch size: 256 | lm loss: 3.666983E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.236 | TFLOPs: 23.21 | +7: iteration 142870/ 173500 | consumed samples: 36574720 | consumed tokens: 74905026560 | elapsed time per iteration (s): 0.16 | learning rate: 3.376E-05 | global batch size: 256 | lm loss: 3.666658E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.096 | TFLOPs: 24.87 | +7: iteration 142880/ 173500 | consumed samples: 36577280 | consumed tokens: 74910269440 | elapsed time per iteration (s): 0.16 | learning rate: 3.375E-05 | global batch size: 256 | lm loss: 3.675054E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.764 | TFLOPs: 25.61 | +7: iteration 142890/ 173500 | consumed samples: 36579840 | consumed tokens: 74915512320 | elapsed time per iteration (s): 0.16 | learning rate: 3.374E-05 | global batch size: 256 | lm loss: 3.676928E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.161 | TFLOPs: 25.78 | +7: iteration 142900/ 173500 | consumed samples: 36582400 | consumed tokens: 74920755200 | elapsed time per iteration (s): 0.16 | learning rate: 3.373E-05 | global batch size: 256 | lm loss: 3.667114E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.693 | TFLOPs: 25.62 | +7: iteration 142910/ 173500 | consumed samples: 36584960 | consumed tokens: 74925998080 | elapsed time per iteration (s): 0.15 | learning rate: 3.372E-05 | global batch size: 256 | lm loss: 3.644236E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.895 | TFLOPs: 26.02 | +7: iteration 142920/ 173500 | consumed samples: 36587520 | consumed tokens: 74931240960 | elapsed time per iteration (s): 0.15 | learning rate: 3.371E-05 | global batch size: 256 | lm loss: 3.671235E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.947 | TFLOPs: 25.94 | +7: iteration 142930/ 173500 | consumed samples: 36590080 | consumed tokens: 74936483840 | elapsed time per iteration (s): 0.16 | learning rate: 3.371E-05 | global batch size: 256 | lm loss: 3.662820E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.042 | TFLOPs: 25.48 | +7: iteration 142940/ 173500 | consumed samples: 36592640 | consumed tokens: 74941726720 | elapsed time per iteration (s): 0.16 | learning rate: 3.370E-05 | global batch size: 256 | lm loss: 3.674176E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.405 | TFLOPs: 25.77 | +7: iteration 142950/ 173500 | consumed samples: 36595200 | consumed tokens: 74946969600 | elapsed time per iteration (s): 0.15 | learning rate: 3.369E-05 | global batch size: 256 | lm loss: 3.654115E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.283 | TFLOPs: 26.12 | +7: iteration 142960/ 173500 | consumed samples: 36597760 | consumed tokens: 74952212480 | elapsed time per iteration (s): 0.18 | learning rate: 3.368E-05 | global batch size: 256 | lm loss: 3.662000E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.848 | TFLOPs: 21.80 | +7: iteration 142970/ 173500 | consumed samples: 36600320 | consumed tokens: 74957455360 | elapsed time per iteration (s): 0.15 | learning rate: 3.367E-05 | global batch size: 256 | lm loss: 3.669463E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.062 | TFLOPs: 25.91 | +7: iteration 142980/ 173500 | consumed samples: 36602880 | consumed tokens: 74962698240 | elapsed time per iteration (s): 0.16 | learning rate: 3.366E-05 | global batch size: 256 | lm loss: 3.668536E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.775 | TFLOPs: 25.68 | +7: iteration 142990/ 173500 | consumed samples: 36605440 | consumed tokens: 74967941120 | elapsed time per iteration (s): 0.15 | learning rate: 3.365E-05 | global batch size: 256 | lm loss: 3.666634E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.857 | TFLOPs: 25.92 | +7: iteration 143000/ 173500 | consumed samples: 36608000 | consumed tokens: 74973184000 | elapsed time per iteration (s): 0.16 | learning rate: 3.364E-05 | global batch size: 256 | lm loss: 3.672147E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.141 | TFLOPs: 25.86 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 143000 | lm loss value: 3.831216E+00 | lm loss PPL: 4.611857E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 143000 to checkpoints_44m91b100m +0: [2023-03-17 06:28:12,599] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step143000 is begin to save! +0: [2023-03-17 06:28:12,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:28:12,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:28:12,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:28:12,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:28:12,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:28:12,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:28:12,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:28:12,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:28:12,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:28:12,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:28:12,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:28:12,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:28:12,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:28:12,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:28:12,723] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:28:12,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:28:12,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:28:12,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:28:12,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:28:12,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:28:12,742] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step143000/mp_rank_00_model_states.pt +0: [2023-03-17 06:28:12,742] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:28:12,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:28:12,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:28:12,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 06:28:12,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +1: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +5: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +6: [2023-03-17 06:28:12,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:28:12,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +3: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:28:12,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 06:28:12,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +7: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +4: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:28:12,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +2: [2023-03-17 06:28:12,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:28:12,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step143000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:28:12,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step143000 is ready now! +0: successfully saved checkpoint at iteration 143000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 190.47 +7: iteration 143010/ 173500 | consumed samples: 36610560 | consumed tokens: 74978426880 | elapsed time per iteration (s): 0.18 | learning rate: 3.364E-05 | global batch size: 256 | lm loss: 3.674176E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.387 | TFLOPs: 22.59 | +7: iteration 143020/ 173500 | consumed samples: 36613120 | consumed tokens: 74983669760 | elapsed time per iteration (s): 0.18 | learning rate: 3.363E-05 | global batch size: 256 | lm loss: 3.665586E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.047 | TFLOPs: 21.74 | +7: iteration 143030/ 173500 | consumed samples: 36615680 | consumed tokens: 74988912640 | elapsed time per iteration (s): 0.16 | learning rate: 3.362E-05 | global batch size: 256 | lm loss: 3.668273E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.972 | TFLOPs: 25.83 | +7: iteration 143040/ 173500 | consumed samples: 36618240 | consumed tokens: 74994155520 | elapsed time per iteration (s): 0.15 | learning rate: 3.361E-05 | global batch size: 256 | lm loss: 3.665723E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.629 | TFLOPs: 26.14 | +7: iteration 143050/ 173500 | consumed samples: 36620800 | consumed tokens: 74999398400 | elapsed time per iteration (s): 0.16 | learning rate: 3.360E-05 | global batch size: 256 | lm loss: 3.648203E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.847 | TFLOPs: 25.76 | +7: iteration 143060/ 173500 | consumed samples: 36623360 | consumed tokens: 75004641280 | elapsed time per iteration (s): 0.16 | learning rate: 3.359E-05 | global batch size: 256 | lm loss: 3.680719E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.064 | TFLOPs: 25.88 | +7: iteration 143070/ 173500 | consumed samples: 36625920 | consumed tokens: 75009884160 | elapsed time per iteration (s): 0.15 | learning rate: 3.358E-05 | global batch size: 256 | lm loss: 3.665723E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.375 | TFLOPs: 26.09 | +7: iteration 143080/ 173500 | consumed samples: 36628480 | consumed tokens: 75015127040 | elapsed time per iteration (s): 0.16 | learning rate: 3.358E-05 | global batch size: 256 | lm loss: 3.666609E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.655 | TFLOPs: 25.57 | +7: iteration 143090/ 173500 | consumed samples: 36631040 | consumed tokens: 75020369920 | elapsed time per iteration (s): 0.16 | learning rate: 3.357E-05 | global batch size: 256 | lm loss: 3.661032E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.753 | TFLOPs: 25.81 | +7: iteration 143100/ 173500 | consumed samples: 36633600 | consumed tokens: 75025612800 | elapsed time per iteration (s): 0.16 | learning rate: 3.356E-05 | global batch size: 256 | lm loss: 3.671764E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.875 | TFLOPs: 25.40 | +7: iteration 143110/ 173500 | consumed samples: 36636160 | consumed tokens: 75030855680 | elapsed time per iteration (s): 0.15 | learning rate: 3.355E-05 | global batch size: 256 | lm loss: 3.663829E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.044 | TFLOPs: 26.05 | +7: iteration 143120/ 173500 | consumed samples: 36638720 | consumed tokens: 75036098560 | elapsed time per iteration (s): 0.15 | learning rate: 3.354E-05 | global batch size: 256 | lm loss: 3.661534E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.773 | TFLOPs: 26.06 | +7: iteration 143130/ 173500 | consumed samples: 36641280 | consumed tokens: 75041341440 | elapsed time per iteration (s): 0.16 | learning rate: 3.353E-05 | global batch size: 256 | lm loss: 3.671704E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.210 | TFLOPs: 25.77 | +7: iteration 143140/ 173500 | consumed samples: 36643840 | consumed tokens: 75046584320 | elapsed time per iteration (s): 0.15 | learning rate: 3.352E-05 | global batch size: 256 | lm loss: 3.667828E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.357 | TFLOPs: 26.10 | +7: iteration 143150/ 173500 | consumed samples: 36646400 | consumed tokens: 75051827200 | elapsed time per iteration (s): 0.19 | learning rate: 3.351E-05 | global batch size: 256 | lm loss: 3.676525E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1358.056 | TFLOPs: 21.30 | +7: iteration 143160/ 173500 | consumed samples: 36648960 | consumed tokens: 75057070080 | elapsed time per iteration (s): 0.15 | learning rate: 3.351E-05 | global batch size: 256 | lm loss: 3.670797E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.243 | TFLOPs: 26.02 | +7: iteration 143170/ 173500 | consumed samples: 36651520 | consumed tokens: 75062312960 | elapsed time per iteration (s): 0.16 | learning rate: 3.350E-05 | global batch size: 256 | lm loss: 3.659169E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.512 | TFLOPs: 25.85 | +7: iteration 143180/ 173500 | consumed samples: 36654080 | consumed tokens: 75067555840 | elapsed time per iteration (s): 0.15 | learning rate: 3.349E-05 | global batch size: 256 | lm loss: 3.682021E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.522 | TFLOPs: 26.15 | +7: iteration 143190/ 173500 | consumed samples: 36656640 | consumed tokens: 75072798720 | elapsed time per iteration (s): 0.15 | learning rate: 3.348E-05 | global batch size: 256 | lm loss: 3.649551E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.964 | TFLOPs: 26.14 | +7: iteration 143200/ 173500 | consumed samples: 36659200 | consumed tokens: 75078041600 | elapsed time per iteration (s): 0.15 | learning rate: 3.347E-05 | global batch size: 256 | lm loss: 3.661104E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.680 | TFLOPs: 25.97 | +7: iteration 143210/ 173500 | consumed samples: 36661760 | consumed tokens: 75083284480 | elapsed time per iteration (s): 0.16 | learning rate: 3.346E-05 | global batch size: 256 | lm loss: 3.674236E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.810 | TFLOPs: 25.81 | +7: iteration 143220/ 173500 | consumed samples: 36664320 | consumed tokens: 75088527360 | elapsed time per iteration (s): 0.16 | learning rate: 3.345E-05 | global batch size: 256 | lm loss: 3.675094E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.343 | TFLOPs: 25.82 | +7: iteration 143230/ 173500 | consumed samples: 36666880 | consumed tokens: 75093770240 | elapsed time per iteration (s): 0.15 | learning rate: 3.344E-05 | global batch size: 256 | lm loss: 3.677774E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.174 | TFLOPs: 26.02 | +7: iteration 143240/ 173500 | consumed samples: 36669440 | consumed tokens: 75099013120 | elapsed time per iteration (s): 0.16 | learning rate: 3.344E-05 | global batch size: 256 | lm loss: 3.665942E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.639 | TFLOPs: 25.53 | +7: iteration 143250/ 173500 | consumed samples: 36672000 | consumed tokens: 75104256000 | elapsed time per iteration (s): 0.16 | learning rate: 3.343E-05 | global batch size: 256 | lm loss: 3.649028E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.963 | TFLOPs: 25.44 | +7: iteration 143260/ 173500 | consumed samples: 36674560 | consumed tokens: 75109498880 | elapsed time per iteration (s): 0.15 | learning rate: 3.342E-05 | global batch size: 256 | lm loss: 3.662115E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.089 | TFLOPs: 26.13 | +7: iteration 143270/ 173500 | consumed samples: 36677120 | consumed tokens: 75114741760 | elapsed time per iteration (s): 0.15 | learning rate: 3.341E-05 | global batch size: 256 | lm loss: 3.671746E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.790 | TFLOPs: 26.34 | +7: iteration 143280/ 173500 | consumed samples: 36679680 | consumed tokens: 75119984640 | elapsed time per iteration (s): 0.18 | learning rate: 3.340E-05 | global batch size: 256 | lm loss: 3.677741E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1434.262 | TFLOPs: 22.49 | +7: iteration 143290/ 173500 | consumed samples: 36682240 | consumed tokens: 75125227520 | elapsed time per iteration (s): 0.15 | learning rate: 3.339E-05 | global batch size: 256 | lm loss: 3.663001E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.815 | TFLOPs: 26.11 | +7: iteration 143300/ 173500 | consumed samples: 36684800 | consumed tokens: 75130470400 | elapsed time per iteration (s): 0.15 | learning rate: 3.338E-05 | global batch size: 256 | lm loss: 3.670256E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.398 | TFLOPs: 26.20 | +7: iteration 143310/ 173500 | consumed samples: 36687360 | consumed tokens: 75135713280 | elapsed time per iteration (s): 0.15 | learning rate: 3.338E-05 | global batch size: 256 | lm loss: 3.681705E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.040 | TFLOPs: 26.27 | +7: iteration 143320/ 173500 | consumed samples: 36689920 | consumed tokens: 75140956160 | elapsed time per iteration (s): 0.15 | learning rate: 3.337E-05 | global batch size: 256 | lm loss: 3.667104E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.263 | TFLOPs: 26.26 | +7: iteration 143330/ 173500 | consumed samples: 36692480 | consumed tokens: 75146199040 | elapsed time per iteration (s): 0.15 | learning rate: 3.336E-05 | global batch size: 256 | lm loss: 3.668540E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.170 | TFLOPs: 26.24 | +7: iteration 143340/ 173500 | consumed samples: 36695040 | consumed tokens: 75151441920 | elapsed time per iteration (s): 0.15 | learning rate: 3.335E-05 | global batch size: 256 | lm loss: 3.659407E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.122 | TFLOPs: 26.24 | +7: iteration 143350/ 173500 | consumed samples: 36697600 | consumed tokens: 75156684800 | elapsed time per iteration (s): 0.15 | learning rate: 3.334E-05 | global batch size: 256 | lm loss: 3.668390E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.446 | TFLOPs: 26.32 | +7: iteration 143360/ 173500 | consumed samples: 36700160 | consumed tokens: 75161927680 | elapsed time per iteration (s): 0.15 | learning rate: 3.333E-05 | global batch size: 256 | lm loss: 3.660546E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.713 | TFLOPs: 26.15 | +7: iteration 143370/ 173500 | consumed samples: 36702720 | consumed tokens: 75167170560 | elapsed time per iteration (s): 0.15 | learning rate: 3.332E-05 | global batch size: 256 | lm loss: 3.668109E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.762 | TFLOPs: 26.17 | +7: iteration 143380/ 173500 | consumed samples: 36705280 | consumed tokens: 75172413440 | elapsed time per iteration (s): 0.15 | learning rate: 3.332E-05 | global batch size: 256 | lm loss: 3.666989E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.525 | TFLOPs: 26.17 | +7: iteration 143390/ 173500 | consumed samples: 36707840 | consumed tokens: 75177656320 | elapsed time per iteration (s): 0.15 | learning rate: 3.331E-05 | global batch size: 256 | lm loss: 3.676830E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.663 | TFLOPs: 26.18 | +7: iteration 143400/ 173500 | consumed samples: 36710400 | consumed tokens: 75182899200 | elapsed time per iteration (s): 0.16 | learning rate: 3.330E-05 | global batch size: 256 | lm loss: 3.659406E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.261 | TFLOPs: 25.52 | +7: iteration 143410/ 173500 | consumed samples: 36712960 | consumed tokens: 75188142080 | elapsed time per iteration (s): 0.15 | learning rate: 3.329E-05 | global batch size: 256 | lm loss: 3.651678E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.312 | TFLOPs: 26.19 | +7: iteration 143420/ 173500 | consumed samples: 36715520 | consumed tokens: 75193384960 | elapsed time per iteration (s): 0.16 | learning rate: 3.328E-05 | global batch size: 256 | lm loss: 3.663839E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.374 | TFLOPs: 25.55 | +7: iteration 143430/ 173500 | consumed samples: 36718080 | consumed tokens: 75198627840 | elapsed time per iteration (s): 0.15 | learning rate: 3.327E-05 | global batch size: 256 | lm loss: 3.667942E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.062 | TFLOPs: 26.21 | +7: iteration 143440/ 173500 | consumed samples: 36720640 | consumed tokens: 75203870720 | elapsed time per iteration (s): 0.15 | learning rate: 3.326E-05 | global batch size: 256 | lm loss: 3.669581E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.141 | TFLOPs: 26.22 | +7: iteration 143450/ 173500 | consumed samples: 36723200 | consumed tokens: 75209113600 | elapsed time per iteration (s): 0.16 | learning rate: 3.326E-05 | global batch size: 256 | lm loss: 3.660425E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.435 | TFLOPs: 25.77 | +7: iteration 143460/ 173500 | consumed samples: 36725760 | consumed tokens: 75214356480 | elapsed time per iteration (s): 0.16 | learning rate: 3.325E-05 | global batch size: 256 | lm loss: 3.669671E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.937 | TFLOPs: 25.44 | +7: iteration 143470/ 173500 | consumed samples: 36728320 | consumed tokens: 75219599360 | elapsed time per iteration (s): 0.16 | learning rate: 3.324E-05 | global batch size: 256 | lm loss: 3.658562E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.415 | TFLOPs: 25.41 | +7: iteration 143480/ 173500 | consumed samples: 36730880 | consumed tokens: 75224842240 | elapsed time per iteration (s): 0.16 | learning rate: 3.323E-05 | global batch size: 256 | lm loss: 3.650787E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.256 | TFLOPs: 25.88 | +7: iteration 143490/ 173500 | consumed samples: 36733440 | consumed tokens: 75230085120 | elapsed time per iteration (s): 0.16 | learning rate: 3.322E-05 | global batch size: 256 | lm loss: 3.671852E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.970 | TFLOPs: 25.53 | +7: iteration 143500/ 173500 | consumed samples: 36736000 | consumed tokens: 75235328000 | elapsed time per iteration (s): 0.16 | learning rate: 3.321E-05 | global batch size: 256 | lm loss: 3.664300E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.269 | TFLOPs: 25.66 | +7: iteration 143510/ 173500 | consumed samples: 36738560 | consumed tokens: 75240570880 | elapsed time per iteration (s): 0.15 | learning rate: 3.320E-05 | global batch size: 256 | lm loss: 3.670070E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.981 | TFLOPs: 26.22 | +7: iteration 143520/ 173500 | consumed samples: 36741120 | consumed tokens: 75245813760 | elapsed time per iteration (s): 0.16 | learning rate: 3.320E-05 | global batch size: 256 | lm loss: 3.663811E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.297 | TFLOPs: 25.82 | +7: iteration 143530/ 173500 | consumed samples: 36743680 | consumed tokens: 75251056640 | elapsed time per iteration (s): 0.19 | learning rate: 3.319E-05 | global batch size: 256 | lm loss: 3.670788E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.519 | TFLOPs: 21.68 | +7: iteration 143540/ 173500 | consumed samples: 36746240 | consumed tokens: 75256299520 | elapsed time per iteration (s): 0.15 | learning rate: 3.318E-05 | global batch size: 256 | lm loss: 3.675540E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.768 | TFLOPs: 25.94 | +7: iteration 143550/ 173500 | consumed samples: 36748800 | consumed tokens: 75261542400 | elapsed time per iteration (s): 0.16 | learning rate: 3.317E-05 | global batch size: 256 | lm loss: 3.668873E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.669 | TFLOPs: 25.45 | +7: iteration 143560/ 173500 | consumed samples: 36751360 | consumed tokens: 75266785280 | elapsed time per iteration (s): 0.16 | learning rate: 3.316E-05 | global batch size: 256 | lm loss: 3.657542E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.271 | TFLOPs: 25.74 | +7: iteration 143570/ 173500 | consumed samples: 36753920 | consumed tokens: 75272028160 | elapsed time per iteration (s): 0.16 | learning rate: 3.315E-05 | global batch size: 256 | lm loss: 3.685163E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.300 | TFLOPs: 25.52 | +7: iteration 143580/ 173500 | consumed samples: 36756480 | consumed tokens: 75277271040 | elapsed time per iteration (s): 0.15 | learning rate: 3.314E-05 | global batch size: 256 | lm loss: 3.668206E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.129 | TFLOPs: 26.03 | +7: iteration 143590/ 173500 | consumed samples: 36759040 | consumed tokens: 75282513920 | elapsed time per iteration (s): 0.16 | learning rate: 3.314E-05 | global batch size: 256 | lm loss: 3.670462E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.311 | TFLOPs: 25.85 | +7: iteration 143600/ 173500 | consumed samples: 36761600 | consumed tokens: 75287756800 | elapsed time per iteration (s): 0.15 | learning rate: 3.313E-05 | global batch size: 256 | lm loss: 3.673859E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.918 | TFLOPs: 25.98 | +7: iteration 143610/ 173500 | consumed samples: 36764160 | consumed tokens: 75292999680 | elapsed time per iteration (s): 0.16 | learning rate: 3.312E-05 | global batch size: 256 | lm loss: 3.657697E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.276 | TFLOPs: 25.83 | +7: iteration 143620/ 173500 | consumed samples: 36766720 | consumed tokens: 75298242560 | elapsed time per iteration (s): 0.16 | learning rate: 3.311E-05 | global batch size: 256 | lm loss: 3.663137E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.528 | TFLOPs: 25.88 | +7: iteration 143630/ 173500 | consumed samples: 36769280 | consumed tokens: 75303485440 | elapsed time per iteration (s): 0.15 | learning rate: 3.310E-05 | global batch size: 256 | lm loss: 3.668498E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.783 | TFLOPs: 26.11 | +7: iteration 143640/ 173500 | consumed samples: 36771840 | consumed tokens: 75308728320 | elapsed time per iteration (s): 0.15 | learning rate: 3.309E-05 | global batch size: 256 | lm loss: 3.673025E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.609 | TFLOPs: 26.00 | +7: iteration 143650/ 173500 | consumed samples: 36774400 | consumed tokens: 75313971200 | elapsed time per iteration (s): 0.16 | learning rate: 3.308E-05 | global batch size: 256 | lm loss: 3.665878E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.059 | TFLOPs: 24.43 | +7: iteration 143660/ 173500 | consumed samples: 36776960 | consumed tokens: 75319214080 | elapsed time per iteration (s): 0.18 | learning rate: 3.308E-05 | global batch size: 256 | lm loss: 3.649329E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.206 | TFLOPs: 22.55 | +7: iteration 143670/ 173500 | consumed samples: 36779520 | consumed tokens: 75324456960 | elapsed time per iteration (s): 0.16 | learning rate: 3.307E-05 | global batch size: 256 | lm loss: 3.680326E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.199 | TFLOPs: 24.78 | +7: iteration 143680/ 173500 | consumed samples: 36782080 | consumed tokens: 75329699840 | elapsed time per iteration (s): 0.15 | learning rate: 3.306E-05 | global batch size: 256 | lm loss: 3.667249E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.533 | TFLOPs: 26.17 | +7: iteration 143690/ 173500 | consumed samples: 36784640 | consumed tokens: 75334942720 | elapsed time per iteration (s): 0.15 | learning rate: 3.305E-05 | global batch size: 256 | lm loss: 3.666058E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.543 | TFLOPs: 26.14 | +7: iteration 143700/ 173500 | consumed samples: 36787200 | consumed tokens: 75340185600 | elapsed time per iteration (s): 0.16 | learning rate: 3.304E-05 | global batch size: 256 | lm loss: 3.666348E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.085 | TFLOPs: 24.86 | +7: iteration 143710/ 173500 | consumed samples: 36789760 | consumed tokens: 75345428480 | elapsed time per iteration (s): 0.16 | learning rate: 3.303E-05 | global batch size: 256 | lm loss: 3.671606E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.274 | TFLOPs: 25.72 | +7: iteration 143720/ 173500 | consumed samples: 36792320 | consumed tokens: 75350671360 | elapsed time per iteration (s): 0.16 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 3.662387E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.948 | TFLOPs: 24.64 | +7: iteration 143730/ 173500 | consumed samples: 36794880 | consumed tokens: 75355914240 | elapsed time per iteration (s): 0.15 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 3.665759E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.321 | TFLOPs: 26.09 | +7: iteration 143740/ 173500 | consumed samples: 36797440 | consumed tokens: 75361157120 | elapsed time per iteration (s): 0.16 | learning rate: 3.301E-05 | global batch size: 256 | lm loss: 3.662188E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.274 | TFLOPs: 25.68 | +7: iteration 143750/ 173500 | consumed samples: 36800000 | consumed tokens: 75366400000 | elapsed time per iteration (s): 0.15 | learning rate: 3.300E-05 | global batch size: 256 | lm loss: 3.655056E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.868 | TFLOPs: 26.05 | +7: iteration 143760/ 173500 | consumed samples: 36802560 | consumed tokens: 75371642880 | elapsed time per iteration (s): 0.16 | learning rate: 3.299E-05 | global batch size: 256 | lm loss: 3.665958E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.379 | TFLOPs: 25.65 | +7: iteration 143770/ 173500 | consumed samples: 36805120 | consumed tokens: 75376885760 | elapsed time per iteration (s): 0.16 | learning rate: 3.298E-05 | global batch size: 256 | lm loss: 3.656641E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.445 | TFLOPs: 25.04 | +7: iteration 143780/ 173500 | consumed samples: 36807680 | consumed tokens: 75382128640 | elapsed time per iteration (s): 0.16 | learning rate: 3.297E-05 | global batch size: 256 | lm loss: 3.666758E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.744 | TFLOPs: 24.55 | +7: iteration 143790/ 173500 | consumed samples: 36810240 | consumed tokens: 75387371520 | elapsed time per iteration (s): 0.18 | learning rate: 3.296E-05 | global batch size: 256 | lm loss: 3.655056E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1427.438 | TFLOPs: 22.39 | +7: iteration 143800/ 173500 | consumed samples: 36812800 | consumed tokens: 75392614400 | elapsed time per iteration (s): 0.16 | learning rate: 3.296E-05 | global batch size: 256 | lm loss: 3.655202E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.907 | TFLOPs: 24.40 | +7: iteration 143810/ 173500 | consumed samples: 36815360 | consumed tokens: 75397857280 | elapsed time per iteration (s): 0.16 | learning rate: 3.295E-05 | global batch size: 256 | lm loss: 3.670953E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.278 | TFLOPs: 24.91 | +7: iteration 143820/ 173500 | consumed samples: 36817920 | consumed tokens: 75403100160 | elapsed time per iteration (s): 0.16 | learning rate: 3.294E-05 | global batch size: 256 | lm loss: 3.669149E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.402 | TFLOPs: 25.33 | +7: iteration 143830/ 173500 | consumed samples: 36820480 | consumed tokens: 75408343040 | elapsed time per iteration (s): 0.16 | learning rate: 3.293E-05 | global batch size: 256 | lm loss: 3.657247E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.531 | TFLOPs: 25.81 | +7: iteration 143840/ 173500 | consumed samples: 36823040 | consumed tokens: 75413585920 | elapsed time per iteration (s): 0.17 | learning rate: 3.292E-05 | global batch size: 256 | lm loss: 3.663863E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.011 | TFLOPs: 23.88 | +7: iteration 143850/ 173500 | consumed samples: 36825600 | consumed tokens: 75418828800 | elapsed time per iteration (s): 0.16 | learning rate: 3.291E-05 | global batch size: 256 | lm loss: 3.662489E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.456 | TFLOPs: 24.75 | +7: iteration 143860/ 173500 | consumed samples: 36828160 | consumed tokens: 75424071680 | elapsed time per iteration (s): 0.16 | learning rate: 3.290E-05 | global batch size: 256 | lm loss: 3.675112E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.089 | TFLOPs: 24.98 | +7: iteration 143870/ 173500 | consumed samples: 36830720 | consumed tokens: 75429314560 | elapsed time per iteration (s): 0.16 | learning rate: 3.290E-05 | global batch size: 256 | lm loss: 3.665926E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.847 | TFLOPs: 25.45 | +7: iteration 143880/ 173500 | consumed samples: 36833280 | consumed tokens: 75434557440 | elapsed time per iteration (s): 0.16 | learning rate: 3.289E-05 | global batch size: 256 | lm loss: 3.648570E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.154 | TFLOPs: 25.00 | +7: iteration 143890/ 173500 | consumed samples: 36835840 | consumed tokens: 75439800320 | elapsed time per iteration (s): 0.16 | learning rate: 3.288E-05 | global batch size: 256 | lm loss: 3.672147E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.787 | TFLOPs: 25.72 | +7: iteration 143900/ 173500 | consumed samples: 36838400 | consumed tokens: 75445043200 | elapsed time per iteration (s): 0.17 | learning rate: 3.287E-05 | global batch size: 256 | lm loss: 3.670033E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1480.293 | TFLOPs: 23.21 | +7: iteration 143910/ 173500 | consumed samples: 36840960 | consumed tokens: 75450286080 | elapsed time per iteration (s): 0.16 | learning rate: 3.286E-05 | global batch size: 256 | lm loss: 3.683298E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.362 | TFLOPs: 24.38 | +7: iteration 143920/ 173500 | consumed samples: 36843520 | consumed tokens: 75455528960 | elapsed time per iteration (s): 0.19 | learning rate: 3.285E-05 | global batch size: 256 | lm loss: 3.668372E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.866 | TFLOPs: 21.50 | +7: iteration 143930/ 173500 | consumed samples: 36846080 | consumed tokens: 75460771840 | elapsed time per iteration (s): 0.16 | learning rate: 3.285E-05 | global batch size: 256 | lm loss: 3.670478E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.395 | TFLOPs: 25.52 | +7: iteration 143940/ 173500 | consumed samples: 36848640 | consumed tokens: 75466014720 | elapsed time per iteration (s): 0.16 | learning rate: 3.284E-05 | global batch size: 256 | lm loss: 3.659283E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.381 | TFLOPs: 25.35 | +7: iteration 143950/ 173500 | consumed samples: 36851200 | consumed tokens: 75471257600 | elapsed time per iteration (s): 0.16 | learning rate: 3.283E-05 | global batch size: 256 | lm loss: 3.670459E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.501 | TFLOPs: 25.87 | +7: iteration 143960/ 173500 | consumed samples: 36853760 | consumed tokens: 75476500480 | elapsed time per iteration (s): 0.15 | learning rate: 3.282E-05 | global batch size: 256 | lm loss: 3.678542E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.039 | TFLOPs: 25.99 | +7: iteration 143970/ 173500 | consumed samples: 36856320 | consumed tokens: 75481743360 | elapsed time per iteration (s): 0.15 | learning rate: 3.281E-05 | global batch size: 256 | lm loss: 3.679389E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.152 | TFLOPs: 26.00 | +7: iteration 143980/ 173500 | consumed samples: 36858880 | consumed tokens: 75486986240 | elapsed time per iteration (s): 0.16 | learning rate: 3.280E-05 | global batch size: 256 | lm loss: 3.672261E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.976 | TFLOPs: 25.75 | +7: iteration 143990/ 173500 | consumed samples: 36861440 | consumed tokens: 75492229120 | elapsed time per iteration (s): 0.15 | learning rate: 3.279E-05 | global batch size: 256 | lm loss: 3.673495E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.479 | TFLOPs: 26.29 | +0: [2023-03-17 06:30:50,968] [INFO] [logging.py:68:log_dist] [Rank 0] step=144000, skipped=0, lr=[3.278611280458685e-05, 3.278611280458685e-05, 3.278611280458685e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 144000/ 173500 | consumed samples: 36864000 | consumed tokens: 75497472000 | elapsed time per iteration (s): 0.16 | learning rate: 3.279E-05 | global batch size: 256 | lm loss: 3.667027E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.379 | TFLOPs: 25.57 | +0: steps: 144000 loss: 3.6841 iter time (s): 0.157 samples/sec: 1626.558 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 144000 | lm loss value: 3.885406E+00 | lm loss PPL: 4.868671E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 144000 to checkpoints_44m91b100m +0: [2023-03-17 06:30:51,041] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step144000 is begin to save! +0: [2023-03-17 06:30:51,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:30:51,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:30:51,103] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:30:51,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:30:51,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:30:51,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:30:51,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:30:51,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:30:51,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:30:51,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:30:51,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:30:51,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:30:51,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:30:51,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:30:51,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:30:51,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:30:51,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:30:51,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:30:51,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:30:51,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:30:51,173] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step144000/mp_rank_00_model_states.pt +0: [2023-03-17 06:30:51,173] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:30:51,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:30:51,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:30:51,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:30:51,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:30:51,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +6: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +4: [2023-03-17 06:30:51,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +7: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +2: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +5: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +3: [2023-03-17 06:30:51,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:30:51,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step144000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:30:51,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step144000 is ready now! +0: successfully saved checkpoint at iteration 144000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 176.62 +7: iteration 144010/ 173500 | consumed samples: 36866560 | consumed tokens: 75502714880 | elapsed time per iteration (s): 0.18 | learning rate: 3.278E-05 | global batch size: 256 | lm loss: 3.665423E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1424.888 | TFLOPs: 22.35 | +7: iteration 144020/ 173500 | consumed samples: 36869120 | consumed tokens: 75507957760 | elapsed time per iteration (s): 0.16 | learning rate: 3.277E-05 | global batch size: 256 | lm loss: 3.664277E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.331 | TFLOPs: 25.76 | +7: iteration 144030/ 173500 | consumed samples: 36871680 | consumed tokens: 75513200640 | elapsed time per iteration (s): 0.16 | learning rate: 3.276E-05 | global batch size: 256 | lm loss: 3.666520E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.668 | TFLOPs: 25.24 | +7: iteration 144040/ 173500 | consumed samples: 36874240 | consumed tokens: 75518443520 | elapsed time per iteration (s): 0.15 | learning rate: 3.275E-05 | global batch size: 256 | lm loss: 3.675646E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.872 | TFLOPs: 26.16 | +7: iteration 144050/ 173500 | consumed samples: 36876800 | consumed tokens: 75523686400 | elapsed time per iteration (s): 0.18 | learning rate: 3.274E-05 | global batch size: 256 | lm loss: 3.664525E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.801 | TFLOPs: 22.22 | +7: iteration 144060/ 173500 | consumed samples: 36879360 | consumed tokens: 75528929280 | elapsed time per iteration (s): 0.16 | learning rate: 3.274E-05 | global batch size: 256 | lm loss: 3.670292E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.988 | TFLOPs: 25.23 | +7: iteration 144070/ 173500 | consumed samples: 36881920 | consumed tokens: 75534172160 | elapsed time per iteration (s): 0.16 | learning rate: 3.273E-05 | global batch size: 256 | lm loss: 3.665634E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.361 | TFLOPs: 25.44 | +7: iteration 144080/ 173500 | consumed samples: 36884480 | consumed tokens: 75539415040 | elapsed time per iteration (s): 0.16 | learning rate: 3.272E-05 | global batch size: 256 | lm loss: 3.671236E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.913 | TFLOPs: 25.89 | +7: iteration 144090/ 173500 | consumed samples: 36887040 | consumed tokens: 75544657920 | elapsed time per iteration (s): 0.16 | learning rate: 3.271E-05 | global batch size: 256 | lm loss: 3.659384E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.890 | TFLOPs: 25.42 | +7: iteration 144100/ 173500 | consumed samples: 36889600 | consumed tokens: 75549900800 | elapsed time per iteration (s): 0.16 | learning rate: 3.270E-05 | global batch size: 256 | lm loss: 3.673094E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.805 | TFLOPs: 25.53 | +7: iteration 144110/ 173500 | consumed samples: 36892160 | consumed tokens: 75555143680 | elapsed time per iteration (s): 0.16 | learning rate: 3.269E-05 | global batch size: 256 | lm loss: 3.665548E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.407 | TFLOPs: 25.47 | +7: iteration 144120/ 173500 | consumed samples: 36894720 | consumed tokens: 75560386560 | elapsed time per iteration (s): 0.16 | learning rate: 3.268E-05 | global batch size: 256 | lm loss: 3.662901E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.239 | TFLOPs: 25.66 | +7: iteration 144130/ 173500 | consumed samples: 36897280 | consumed tokens: 75565629440 | elapsed time per iteration (s): 0.16 | learning rate: 3.268E-05 | global batch size: 256 | lm loss: 3.679616E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.880 | TFLOPs: 25.48 | +7: iteration 144140/ 173500 | consumed samples: 36899840 | consumed tokens: 75570872320 | elapsed time per iteration (s): 0.16 | learning rate: 3.267E-05 | global batch size: 256 | lm loss: 3.667885E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.855 | TFLOPs: 25.64 | +7: iteration 144150/ 173500 | consumed samples: 36902400 | consumed tokens: 75576115200 | elapsed time per iteration (s): 0.16 | learning rate: 3.266E-05 | global batch size: 256 | lm loss: 3.653612E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.120 | TFLOPs: 25.89 | +7: iteration 144160/ 173500 | consumed samples: 36904960 | consumed tokens: 75581358080 | elapsed time per iteration (s): 0.16 | learning rate: 3.265E-05 | global batch size: 256 | lm loss: 3.675606E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.899 | TFLOPs: 25.59 | +7: iteration 144170/ 173500 | consumed samples: 36907520 | consumed tokens: 75586600960 | elapsed time per iteration (s): 0.18 | learning rate: 3.264E-05 | global batch size: 256 | lm loss: 3.675497E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1446.764 | TFLOPs: 22.69 | +7: iteration 144180/ 173500 | consumed samples: 36910080 | consumed tokens: 75591843840 | elapsed time per iteration (s): 0.16 | learning rate: 3.263E-05 | global batch size: 256 | lm loss: 3.672152E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.728 | TFLOPs: 25.78 | +7: iteration 144190/ 173500 | consumed samples: 36912640 | consumed tokens: 75597086720 | elapsed time per iteration (s): 0.16 | learning rate: 3.263E-05 | global batch size: 256 | lm loss: 3.663648E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.256 | TFLOPs: 25.30 | +7: iteration 144200/ 173500 | consumed samples: 36915200 | consumed tokens: 75602329600 | elapsed time per iteration (s): 0.16 | learning rate: 3.262E-05 | global batch size: 256 | lm loss: 3.657666E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.395 | TFLOPs: 24.66 | +7: iteration 144210/ 173500 | consumed samples: 36917760 | consumed tokens: 75607572480 | elapsed time per iteration (s): 0.16 | learning rate: 3.261E-05 | global batch size: 256 | lm loss: 3.661935E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.760 | TFLOPs: 25.86 | +7: iteration 144220/ 173500 | consumed samples: 36920320 | consumed tokens: 75612815360 | elapsed time per iteration (s): 0.16 | learning rate: 3.260E-05 | global batch size: 256 | lm loss: 3.671358E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.664 | TFLOPs: 25.06 | +7: iteration 144230/ 173500 | consumed samples: 36922880 | consumed tokens: 75618058240 | elapsed time per iteration (s): 0.15 | learning rate: 3.259E-05 | global batch size: 256 | lm loss: 3.670774E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.792 | TFLOPs: 26.05 | +7: iteration 144240/ 173500 | consumed samples: 36925440 | consumed tokens: 75623301120 | elapsed time per iteration (s): 0.15 | learning rate: 3.258E-05 | global batch size: 256 | lm loss: 3.671572E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.815 | TFLOPs: 26.31 | +7: iteration 144250/ 173500 | consumed samples: 36928000 | consumed tokens: 75628544000 | elapsed time per iteration (s): 0.16 | learning rate: 3.258E-05 | global batch size: 256 | lm loss: 3.661840E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.577 | TFLOPs: 25.41 | +7: iteration 144260/ 173500 | consumed samples: 36930560 | consumed tokens: 75633786880 | elapsed time per iteration (s): 0.15 | learning rate: 3.257E-05 | global batch size: 256 | lm loss: 3.664879E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.989 | TFLOPs: 26.08 | +7: iteration 144270/ 173500 | consumed samples: 36933120 | consumed tokens: 75639029760 | elapsed time per iteration (s): 0.15 | learning rate: 3.256E-05 | global batch size: 256 | lm loss: 3.660050E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.058 | TFLOPs: 26.07 | +7: iteration 144280/ 173500 | consumed samples: 36935680 | consumed tokens: 75644272640 | elapsed time per iteration (s): 0.15 | learning rate: 3.255E-05 | global batch size: 256 | lm loss: 3.677477E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.392 | TFLOPs: 26.10 | +7: iteration 144290/ 173500 | consumed samples: 36938240 | consumed tokens: 75649515520 | elapsed time per iteration (s): 0.15 | learning rate: 3.254E-05 | global batch size: 256 | lm loss: 3.658598E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.760 | TFLOPs: 26.00 | +7: iteration 144300/ 173500 | consumed samples: 36940800 | consumed tokens: 75654758400 | elapsed time per iteration (s): 0.18 | learning rate: 3.253E-05 | global batch size: 256 | lm loss: 3.679696E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.797 | TFLOPs: 22.19 | +7: iteration 144310/ 173500 | consumed samples: 36943360 | consumed tokens: 75660001280 | elapsed time per iteration (s): 0.15 | learning rate: 3.253E-05 | global batch size: 256 | lm loss: 3.668620E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.675 | TFLOPs: 26.15 | +7: iteration 144320/ 173500 | consumed samples: 36945920 | consumed tokens: 75665244160 | elapsed time per iteration (s): 0.15 | learning rate: 3.252E-05 | global batch size: 256 | lm loss: 3.656483E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.019 | TFLOPs: 26.16 | +7: iteration 144330/ 173500 | consumed samples: 36948480 | consumed tokens: 75670487040 | elapsed time per iteration (s): 0.15 | learning rate: 3.251E-05 | global batch size: 256 | lm loss: 3.654628E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.639 | TFLOPs: 26.17 | +7: iteration 144340/ 173500 | consumed samples: 36951040 | consumed tokens: 75675729920 | elapsed time per iteration (s): 0.15 | learning rate: 3.250E-05 | global batch size: 256 | lm loss: 3.681968E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.174 | TFLOPs: 26.18 | +7: iteration 144350/ 173500 | consumed samples: 36953600 | consumed tokens: 75680972800 | elapsed time per iteration (s): 0.15 | learning rate: 3.249E-05 | global batch size: 256 | lm loss: 3.667859E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.941 | TFLOPs: 26.20 | +7: iteration 144360/ 173500 | consumed samples: 36956160 | consumed tokens: 75686215680 | elapsed time per iteration (s): 0.15 | learning rate: 3.248E-05 | global batch size: 256 | lm loss: 3.676914E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.792 | TFLOPs: 26.17 | +7: iteration 144370/ 173500 | consumed samples: 36958720 | consumed tokens: 75691458560 | elapsed time per iteration (s): 0.15 | learning rate: 3.247E-05 | global batch size: 256 | lm loss: 3.659541E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.052 | TFLOPs: 26.16 | +7: iteration 144380/ 173500 | consumed samples: 36961280 | consumed tokens: 75696701440 | elapsed time per iteration (s): 0.15 | learning rate: 3.247E-05 | global batch size: 256 | lm loss: 3.666262E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.002 | TFLOPs: 26.11 | +7: iteration 144390/ 173500 | consumed samples: 36963840 | consumed tokens: 75701944320 | elapsed time per iteration (s): 0.15 | learning rate: 3.246E-05 | global batch size: 256 | lm loss: 3.669648E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.216 | TFLOPs: 26.19 | +7: iteration 144400/ 173500 | consumed samples: 36966400 | consumed tokens: 75707187200 | elapsed time per iteration (s): 0.15 | learning rate: 3.245E-05 | global batch size: 256 | lm loss: 3.674842E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.232 | TFLOPs: 26.19 | +7: iteration 144410/ 173500 | consumed samples: 36968960 | consumed tokens: 75712430080 | elapsed time per iteration (s): 0.15 | learning rate: 3.244E-05 | global batch size: 256 | lm loss: 3.655560E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.852 | TFLOPs: 26.20 | +7: iteration 144420/ 173500 | consumed samples: 36971520 | consumed tokens: 75717672960 | elapsed time per iteration (s): 0.15 | learning rate: 3.243E-05 | global batch size: 256 | lm loss: 3.677154E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.126 | TFLOPs: 26.24 | +7: iteration 144430/ 173500 | consumed samples: 36974080 | consumed tokens: 75722915840 | elapsed time per iteration (s): 0.18 | learning rate: 3.242E-05 | global batch size: 256 | lm loss: 3.680825E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1438.808 | TFLOPs: 22.56 | +7: iteration 144440/ 173500 | consumed samples: 36976640 | consumed tokens: 75728158720 | elapsed time per iteration (s): 0.15 | learning rate: 3.242E-05 | global batch size: 256 | lm loss: 3.663287E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.976 | TFLOPs: 26.24 | +7: iteration 144450/ 173500 | consumed samples: 36979200 | consumed tokens: 75733401600 | elapsed time per iteration (s): 0.15 | learning rate: 3.241E-05 | global batch size: 256 | lm loss: 3.662556E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.553 | TFLOPs: 26.21 | +7: iteration 144460/ 173500 | consumed samples: 36981760 | consumed tokens: 75738644480 | elapsed time per iteration (s): 0.15 | learning rate: 3.240E-05 | global batch size: 256 | lm loss: 3.667508E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.628 | TFLOPs: 26.23 | +7: iteration 144470/ 173500 | consumed samples: 36984320 | consumed tokens: 75743887360 | elapsed time per iteration (s): 0.15 | learning rate: 3.239E-05 | global batch size: 256 | lm loss: 3.668671E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.961 | TFLOPs: 26.22 | +7: iteration 144480/ 173500 | consumed samples: 36986880 | consumed tokens: 75749130240 | elapsed time per iteration (s): 0.15 | learning rate: 3.238E-05 | global batch size: 256 | lm loss: 3.655154E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.034 | TFLOPs: 26.21 | +7: iteration 144490/ 173500 | consumed samples: 36989440 | consumed tokens: 75754373120 | elapsed time per iteration (s): 0.18 | learning rate: 3.237E-05 | global batch size: 256 | lm loss: 3.653646E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.865 | TFLOPs: 22.16 | +7: iteration 144500/ 173500 | consumed samples: 36992000 | consumed tokens: 75759616000 | elapsed time per iteration (s): 0.15 | learning rate: 3.237E-05 | global batch size: 256 | lm loss: 3.674459E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.756 | TFLOPs: 26.20 | +7: iteration 144510/ 173500 | consumed samples: 36994560 | consumed tokens: 75764858880 | elapsed time per iteration (s): 0.16 | learning rate: 3.236E-05 | global batch size: 256 | lm loss: 3.681486E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.255 | TFLOPs: 25.68 | +7: iteration 144520/ 173500 | consumed samples: 36997120 | consumed tokens: 75770101760 | elapsed time per iteration (s): 0.15 | learning rate: 3.235E-05 | global batch size: 256 | lm loss: 3.679382E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.049 | TFLOPs: 26.21 | +7: iteration 144530/ 173500 | consumed samples: 36999680 | consumed tokens: 75775344640 | elapsed time per iteration (s): 0.15 | learning rate: 3.234E-05 | global batch size: 256 | lm loss: 3.670757E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.534 | TFLOPs: 26.20 | +7: iteration 144540/ 173500 | consumed samples: 37002240 | consumed tokens: 75780587520 | elapsed time per iteration (s): 0.15 | learning rate: 3.233E-05 | global batch size: 256 | lm loss: 3.665748E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.110 | TFLOPs: 26.22 | +7: iteration 144550/ 173500 | consumed samples: 37004800 | consumed tokens: 75785830400 | elapsed time per iteration (s): 0.15 | learning rate: 3.232E-05 | global batch size: 256 | lm loss: 3.668317E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.157 | TFLOPs: 26.19 | +7: iteration 144560/ 173500 | consumed samples: 37007360 | consumed tokens: 75791073280 | elapsed time per iteration (s): 0.18 | learning rate: 3.232E-05 | global batch size: 256 | lm loss: 3.667976E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.273 | TFLOPs: 22.45 | +7: iteration 144570/ 173500 | consumed samples: 37009920 | consumed tokens: 75796316160 | elapsed time per iteration (s): 0.15 | learning rate: 3.231E-05 | global batch size: 256 | lm loss: 3.662895E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.322 | TFLOPs: 26.26 | +7: iteration 144580/ 173500 | consumed samples: 37012480 | consumed tokens: 75801559040 | elapsed time per iteration (s): 0.15 | learning rate: 3.230E-05 | global batch size: 256 | lm loss: 3.668697E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.052 | TFLOPs: 26.27 | +7: iteration 144590/ 173500 | consumed samples: 37015040 | consumed tokens: 75806801920 | elapsed time per iteration (s): 0.15 | learning rate: 3.229E-05 | global batch size: 256 | lm loss: 3.653810E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.140 | TFLOPs: 26.24 | +7: iteration 144600/ 173500 | consumed samples: 37017600 | consumed tokens: 75812044800 | elapsed time per iteration (s): 0.15 | learning rate: 3.228E-05 | global batch size: 256 | lm loss: 3.683940E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.299 | TFLOPs: 25.96 | +7: iteration 144610/ 173500 | consumed samples: 37020160 | consumed tokens: 75817287680 | elapsed time per iteration (s): 0.15 | learning rate: 3.228E-05 | global batch size: 256 | lm loss: 3.672166E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.581 | TFLOPs: 26.20 | +7: iteration 144620/ 173500 | consumed samples: 37022720 | consumed tokens: 75822530560 | elapsed time per iteration (s): 0.15 | learning rate: 3.227E-05 | global batch size: 256 | lm loss: 3.669551E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.052 | TFLOPs: 26.16 | +7: iteration 144630/ 173500 | consumed samples: 37025280 | consumed tokens: 75827773440 | elapsed time per iteration (s): 0.15 | learning rate: 3.226E-05 | global batch size: 256 | lm loss: 3.660719E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.944 | TFLOPs: 26.14 | +7: iteration 144640/ 173500 | consumed samples: 37027840 | consumed tokens: 75833016320 | elapsed time per iteration (s): 0.15 | learning rate: 3.225E-05 | global batch size: 256 | lm loss: 3.668369E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.556 | TFLOPs: 26.14 | +7: iteration 144650/ 173500 | consumed samples: 37030400 | consumed tokens: 75838259200 | elapsed time per iteration (s): 0.15 | learning rate: 3.224E-05 | global batch size: 256 | lm loss: 3.660106E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.953 | TFLOPs: 26.19 | +7: iteration 144660/ 173500 | consumed samples: 37032960 | consumed tokens: 75843502080 | elapsed time per iteration (s): 0.15 | learning rate: 3.223E-05 | global batch size: 256 | lm loss: 3.662907E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.935 | TFLOPs: 26.27 | +7: iteration 144670/ 173500 | consumed samples: 37035520 | consumed tokens: 75848744960 | elapsed time per iteration (s): 0.15 | learning rate: 3.223E-05 | global batch size: 256 | lm loss: 3.666628E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.043 | TFLOPs: 26.28 | +7: iteration 144680/ 173500 | consumed samples: 37038080 | consumed tokens: 75853987840 | elapsed time per iteration (s): 0.15 | learning rate: 3.222E-05 | global batch size: 256 | lm loss: 3.662575E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.872 | TFLOPs: 26.28 | +7: iteration 144690/ 173500 | consumed samples: 37040640 | consumed tokens: 75859230720 | elapsed time per iteration (s): 0.18 | learning rate: 3.221E-05 | global batch size: 256 | lm loss: 3.663657E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1449.766 | TFLOPs: 22.74 | +7: iteration 144700/ 173500 | consumed samples: 37043200 | consumed tokens: 75864473600 | elapsed time per iteration (s): 0.15 | learning rate: 3.220E-05 | global batch size: 256 | lm loss: 3.662210E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1683.152 | TFLOPs: 26.40 | +7: iteration 144710/ 173500 | consumed samples: 37045760 | consumed tokens: 75869716480 | elapsed time per iteration (s): 0.15 | learning rate: 3.219E-05 | global batch size: 256 | lm loss: 3.662151E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.969 | TFLOPs: 26.35 | +7: iteration 144720/ 173500 | consumed samples: 37048320 | consumed tokens: 75874959360 | elapsed time per iteration (s): 0.15 | learning rate: 3.218E-05 | global batch size: 256 | lm loss: 3.664522E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.841 | TFLOPs: 26.39 | +7: iteration 144730/ 173500 | consumed samples: 37050880 | consumed tokens: 75880202240 | elapsed time per iteration (s): 0.15 | learning rate: 3.218E-05 | global batch size: 256 | lm loss: 3.667170E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.663 | TFLOPs: 26.39 | +7: iteration 144740/ 173500 | consumed samples: 37053440 | consumed tokens: 75885445120 | elapsed time per iteration (s): 0.15 | learning rate: 3.217E-05 | global batch size: 256 | lm loss: 3.673404E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.284 | TFLOPs: 26.38 | +7: iteration 144750/ 173500 | consumed samples: 37056000 | consumed tokens: 75890688000 | elapsed time per iteration (s): 0.15 | learning rate: 3.216E-05 | global batch size: 256 | lm loss: 3.671695E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.747 | TFLOPs: 26.39 | +7: iteration 144760/ 173500 | consumed samples: 37058560 | consumed tokens: 75895930880 | elapsed time per iteration (s): 0.15 | learning rate: 3.215E-05 | global batch size: 256 | lm loss: 3.662240E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.326 | TFLOPs: 26.37 | +7: iteration 144770/ 173500 | consumed samples: 37061120 | consumed tokens: 75901173760 | elapsed time per iteration (s): 0.15 | learning rate: 3.214E-05 | global batch size: 256 | lm loss: 3.671592E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.833 | TFLOPs: 26.38 | +7: iteration 144780/ 173500 | consumed samples: 37063680 | consumed tokens: 75906416640 | elapsed time per iteration (s): 0.15 | learning rate: 3.213E-05 | global batch size: 256 | lm loss: 3.668778E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.995 | TFLOPs: 26.39 | +7: iteration 144790/ 173500 | consumed samples: 37066240 | consumed tokens: 75911659520 | elapsed time per iteration (s): 0.15 | learning rate: 3.213E-05 | global batch size: 256 | lm loss: 3.676433E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.406 | TFLOPs: 26.35 | +7: iteration 144800/ 173500 | consumed samples: 37068800 | consumed tokens: 75916902400 | elapsed time per iteration (s): 0.15 | learning rate: 3.212E-05 | global batch size: 256 | lm loss: 3.670809E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.753 | TFLOPs: 26.39 | +7: iteration 144810/ 173500 | consumed samples: 37071360 | consumed tokens: 75922145280 | elapsed time per iteration (s): 0.18 | learning rate: 3.211E-05 | global batch size: 256 | lm loss: 3.674075E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.508 | TFLOPs: 22.64 | +7: iteration 144820/ 173500 | consumed samples: 37073920 | consumed tokens: 75927388160 | elapsed time per iteration (s): 0.15 | learning rate: 3.210E-05 | global batch size: 256 | lm loss: 3.650893E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.449 | TFLOPs: 26.37 | +7: iteration 144830/ 173500 | consumed samples: 37076480 | consumed tokens: 75932631040 | elapsed time per iteration (s): 0.16 | learning rate: 3.209E-05 | global batch size: 256 | lm loss: 3.668902E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.285 | TFLOPs: 24.41 | +7: iteration 144840/ 173500 | consumed samples: 37079040 | consumed tokens: 75937873920 | elapsed time per iteration (s): 0.16 | learning rate: 3.208E-05 | global batch size: 256 | lm loss: 3.684811E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.720 | TFLOPs: 25.04 | +7: iteration 144850/ 173500 | consumed samples: 37081600 | consumed tokens: 75943116800 | elapsed time per iteration (s): 0.16 | learning rate: 3.208E-05 | global batch size: 256 | lm loss: 3.660728E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.131 | TFLOPs: 25.88 | +7: iteration 144860/ 173500 | consumed samples: 37084160 | consumed tokens: 75948359680 | elapsed time per iteration (s): 0.15 | learning rate: 3.207E-05 | global batch size: 256 | lm loss: 3.661142E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.280 | TFLOPs: 25.93 | +7: iteration 144870/ 173500 | consumed samples: 37086720 | consumed tokens: 75953602560 | elapsed time per iteration (s): 0.16 | learning rate: 3.206E-05 | global batch size: 256 | lm loss: 3.677137E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.425 | TFLOPs: 25.62 | +7: iteration 144880/ 173500 | consumed samples: 37089280 | consumed tokens: 75958845440 | elapsed time per iteration (s): 0.18 | learning rate: 3.205E-05 | global batch size: 256 | lm loss: 3.654818E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.065 | TFLOPs: 22.08 | +7: iteration 144890/ 173500 | consumed samples: 37091840 | consumed tokens: 75964088320 | elapsed time per iteration (s): 0.15 | learning rate: 3.204E-05 | global batch size: 256 | lm loss: 3.669403E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.490 | TFLOPs: 26.20 | +7: iteration 144900/ 173500 | consumed samples: 37094400 | consumed tokens: 75969331200 | elapsed time per iteration (s): 0.16 | learning rate: 3.204E-05 | global batch size: 256 | lm loss: 3.682696E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.916 | TFLOPs: 25.45 | +7: iteration 144910/ 173500 | consumed samples: 37096960 | consumed tokens: 75974574080 | elapsed time per iteration (s): 0.16 | learning rate: 3.203E-05 | global batch size: 256 | lm loss: 3.669209E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.108 | TFLOPs: 25.77 | +7: iteration 144920/ 173500 | consumed samples: 37099520 | consumed tokens: 75979816960 | elapsed time per iteration (s): 0.16 | learning rate: 3.202E-05 | global batch size: 256 | lm loss: 3.668960E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.477 | TFLOPs: 25.41 | +7: iteration 144930/ 173500 | consumed samples: 37102080 | consumed tokens: 75985059840 | elapsed time per iteration (s): 0.16 | learning rate: 3.201E-05 | global batch size: 256 | lm loss: 3.659353E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.910 | TFLOPs: 25.59 | +7: iteration 144940/ 173500 | consumed samples: 37104640 | consumed tokens: 75990302720 | elapsed time per iteration (s): 0.18 | learning rate: 3.200E-05 | global batch size: 256 | lm loss: 3.672574E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1426.843 | TFLOPs: 22.38 | +7: iteration 144950/ 173500 | consumed samples: 37107200 | consumed tokens: 75995545600 | elapsed time per iteration (s): 0.16 | learning rate: 3.199E-05 | global batch size: 256 | lm loss: 3.661785E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.743 | TFLOPs: 25.40 | +7: iteration 144960/ 173500 | consumed samples: 37109760 | consumed tokens: 76000788480 | elapsed time per iteration (s): 0.15 | learning rate: 3.199E-05 | global batch size: 256 | lm loss: 3.676310E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.726 | TFLOPs: 25.97 | +7: iteration 144970/ 173500 | consumed samples: 37112320 | consumed tokens: 76006031360 | elapsed time per iteration (s): 0.15 | learning rate: 3.198E-05 | global batch size: 256 | lm loss: 3.666070E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.552 | TFLOPs: 26.20 | +7: iteration 144980/ 173500 | consumed samples: 37114880 | consumed tokens: 76011274240 | elapsed time per iteration (s): 0.15 | learning rate: 3.197E-05 | global batch size: 256 | lm loss: 3.657990E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.081 | TFLOPs: 25.96 | +7: iteration 144990/ 173500 | consumed samples: 37117440 | consumed tokens: 76016517120 | elapsed time per iteration (s): 0.16 | learning rate: 3.196E-05 | global batch size: 256 | lm loss: 3.665051E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.743 | TFLOPs: 24.93 | +7: iteration 145000/ 173500 | consumed samples: 37120000 | consumed tokens: 76021760000 | elapsed time per iteration (s): 0.16 | learning rate: 3.195E-05 | global batch size: 256 | lm loss: 3.662492E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.634 | TFLOPs: 25.27 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 145000 | lm loss value: 3.850029E+00 | lm loss PPL: 4.699444E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 145000 to checkpoints_44m91b100m +0: [2023-03-17 06:33:28,505] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step145000 is begin to save! +0: [2023-03-17 06:33:28,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:33:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:33:28,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:33:28,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:33:28,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:33:28,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:33:28,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:33:28,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:33:28,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:33:28,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:33:28,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:33:28,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:33:28,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:33:28,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:33:28,645] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:33:28,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:33:28,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:33:28,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:33:28,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:33:28,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:33:28,663] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step145000/mp_rank_00_model_states.pt +0: [2023-03-17 06:33:28,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:33:28,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:33:28,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +4: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +1: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +6: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +5: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +2: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +3: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:33:28,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:33:28,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step145000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:33:28,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step145000 is ready now! +0: successfully saved checkpoint at iteration 145000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 207.18 +7: iteration 145010/ 173500 | consumed samples: 37122560 | consumed tokens: 76027002880 | elapsed time per iteration (s): 0.18 | learning rate: 3.195E-05 | global batch size: 256 | lm loss: 3.673556E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.226 | TFLOPs: 22.45 | +7: iteration 145020/ 173500 | consumed samples: 37125120 | consumed tokens: 76032245760 | elapsed time per iteration (s): 0.16 | learning rate: 3.194E-05 | global batch size: 256 | lm loss: 3.658964E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.836 | TFLOPs: 24.73 | +7: iteration 145030/ 173500 | consumed samples: 37127680 | consumed tokens: 76037488640 | elapsed time per iteration (s): 0.15 | learning rate: 3.193E-05 | global batch size: 256 | lm loss: 3.669237E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.028 | TFLOPs: 25.97 | +7: iteration 145040/ 173500 | consumed samples: 37130240 | consumed tokens: 76042731520 | elapsed time per iteration (s): 0.15 | learning rate: 3.192E-05 | global batch size: 256 | lm loss: 3.655573E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.433 | TFLOPs: 26.17 | +7: iteration 145050/ 173500 | consumed samples: 37132800 | consumed tokens: 76047974400 | elapsed time per iteration (s): 0.16 | learning rate: 3.191E-05 | global batch size: 256 | lm loss: 3.658926E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.391 | TFLOPs: 25.36 | +7: iteration 145060/ 173500 | consumed samples: 37135360 | consumed tokens: 76053217280 | elapsed time per iteration (s): 0.15 | learning rate: 3.190E-05 | global batch size: 256 | lm loss: 3.670770E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.747 | TFLOPs: 26.15 | +7: iteration 145070/ 173500 | consumed samples: 37137920 | consumed tokens: 76058460160 | elapsed time per iteration (s): 0.19 | learning rate: 3.190E-05 | global batch size: 256 | lm loss: 3.673399E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.737 | TFLOPs: 21.61 | +7: iteration 145080/ 173500 | consumed samples: 37140480 | consumed tokens: 76063703040 | elapsed time per iteration (s): 0.15 | learning rate: 3.189E-05 | global batch size: 256 | lm loss: 3.677203E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.552 | TFLOPs: 26.12 | +7: iteration 145090/ 173500 | consumed samples: 37143040 | consumed tokens: 76068945920 | elapsed time per iteration (s): 0.16 | learning rate: 3.188E-05 | global batch size: 256 | lm loss: 3.671560E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.662 | TFLOPs: 25.65 | +7: iteration 145100/ 173500 | consumed samples: 37145600 | consumed tokens: 76074188800 | elapsed time per iteration (s): 0.15 | learning rate: 3.187E-05 | global batch size: 256 | lm loss: 3.667674E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.107 | TFLOPs: 26.16 | +7: iteration 145110/ 173500 | consumed samples: 37148160 | consumed tokens: 76079431680 | elapsed time per iteration (s): 0.15 | learning rate: 3.186E-05 | global batch size: 256 | lm loss: 3.678817E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.361 | TFLOPs: 26.15 | +7: iteration 145120/ 173500 | consumed samples: 37150720 | consumed tokens: 76084674560 | elapsed time per iteration (s): 0.16 | learning rate: 3.186E-05 | global batch size: 256 | lm loss: 3.668158E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.767 | TFLOPs: 25.86 | +7: iteration 145130/ 173500 | consumed samples: 37153280 | consumed tokens: 76089917440 | elapsed time per iteration (s): 0.18 | learning rate: 3.185E-05 | global batch size: 256 | lm loss: 3.671398E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.005 | TFLOPs: 22.47 | +7: iteration 145140/ 173500 | consumed samples: 37155840 | consumed tokens: 76095160320 | elapsed time per iteration (s): 0.15 | learning rate: 3.184E-05 | global batch size: 256 | lm loss: 3.677689E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.970 | TFLOPs: 26.06 | +7: iteration 145150/ 173500 | consumed samples: 37158400 | consumed tokens: 76100403200 | elapsed time per iteration (s): 0.15 | learning rate: 3.183E-05 | global batch size: 256 | lm loss: 3.667907E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.869 | TFLOPs: 25.95 | +7: iteration 145160/ 173500 | consumed samples: 37160960 | consumed tokens: 76105646080 | elapsed time per iteration (s): 0.15 | learning rate: 3.182E-05 | global batch size: 256 | lm loss: 3.673680E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.231 | TFLOPs: 26.04 | +7: iteration 145170/ 173500 | consumed samples: 37163520 | consumed tokens: 76110888960 | elapsed time per iteration (s): 0.15 | learning rate: 3.181E-05 | global batch size: 256 | lm loss: 3.668674E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.778 | TFLOPs: 26.08 | +7: iteration 145180/ 173500 | consumed samples: 37166080 | consumed tokens: 76116131840 | elapsed time per iteration (s): 0.15 | learning rate: 3.181E-05 | global batch size: 256 | lm loss: 3.664972E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.446 | TFLOPs: 26.01 | +7: iteration 145190/ 173500 | consumed samples: 37168640 | consumed tokens: 76121374720 | elapsed time per iteration (s): 0.18 | learning rate: 3.180E-05 | global batch size: 256 | lm loss: 3.669089E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.454 | TFLOPs: 22.10 | +7: iteration 145200/ 173500 | consumed samples: 37171200 | consumed tokens: 76126617600 | elapsed time per iteration (s): 0.18 | learning rate: 3.179E-05 | global batch size: 256 | lm loss: 3.658948E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1425.878 | TFLOPs: 22.36 | +7: iteration 145210/ 173500 | consumed samples: 37173760 | consumed tokens: 76131860480 | elapsed time per iteration (s): 0.15 | learning rate: 3.178E-05 | global batch size: 256 | lm loss: 3.656106E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.978 | TFLOPs: 25.94 | +7: iteration 145220/ 173500 | consumed samples: 37176320 | consumed tokens: 76137103360 | elapsed time per iteration (s): 0.15 | learning rate: 3.177E-05 | global batch size: 256 | lm loss: 3.671976E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.765 | TFLOPs: 26.17 | +7: iteration 145230/ 173500 | consumed samples: 37178880 | consumed tokens: 76142346240 | elapsed time per iteration (s): 0.16 | learning rate: 3.177E-05 | global batch size: 256 | lm loss: 3.668344E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.516 | TFLOPs: 25.29 | +7: iteration 145240/ 173500 | consumed samples: 37181440 | consumed tokens: 76147589120 | elapsed time per iteration (s): 0.16 | learning rate: 3.176E-05 | global batch size: 256 | lm loss: 3.643117E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.233 | TFLOPs: 25.68 | +7: iteration 145250/ 173500 | consumed samples: 37184000 | consumed tokens: 76152832000 | elapsed time per iteration (s): 0.15 | learning rate: 3.175E-05 | global batch size: 256 | lm loss: 3.673854E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.327 | TFLOPs: 26.07 | +7: iteration 145260/ 173500 | consumed samples: 37186560 | consumed tokens: 76158074880 | elapsed time per iteration (s): 0.15 | learning rate: 3.174E-05 | global batch size: 256 | lm loss: 3.661541E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.497 | TFLOPs: 26.17 | +7: iteration 145270/ 173500 | consumed samples: 37189120 | consumed tokens: 76163317760 | elapsed time per iteration (s): 0.15 | learning rate: 3.173E-05 | global batch size: 256 | lm loss: 3.668901E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.699 | TFLOPs: 26.20 | +7: iteration 145280/ 173500 | consumed samples: 37191680 | consumed tokens: 76168560640 | elapsed time per iteration (s): 0.15 | learning rate: 3.172E-05 | global batch size: 256 | lm loss: 3.668698E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.784 | TFLOPs: 26.19 | +7: iteration 145290/ 173500 | consumed samples: 37194240 | consumed tokens: 76173803520 | elapsed time per iteration (s): 0.16 | learning rate: 3.172E-05 | global batch size: 256 | lm loss: 3.663477E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.387 | TFLOPs: 25.57 | +7: iteration 145300/ 173500 | consumed samples: 37196800 | consumed tokens: 76179046400 | elapsed time per iteration (s): 0.15 | learning rate: 3.171E-05 | global batch size: 256 | lm loss: 3.658836E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.031 | TFLOPs: 26.16 | +7: iteration 145310/ 173500 | consumed samples: 37199360 | consumed tokens: 76184289280 | elapsed time per iteration (s): 0.15 | learning rate: 3.170E-05 | global batch size: 256 | lm loss: 3.660696E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.566 | TFLOPs: 26.18 | +7: iteration 145320/ 173500 | consumed samples: 37201920 | consumed tokens: 76189532160 | elapsed time per iteration (s): 0.18 | learning rate: 3.169E-05 | global batch size: 256 | lm loss: 3.663077E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1427.607 | TFLOPs: 22.39 | +7: iteration 145330/ 173500 | consumed samples: 37204480 | consumed tokens: 76194775040 | elapsed time per iteration (s): 0.16 | learning rate: 3.168E-05 | global batch size: 256 | lm loss: 3.668168E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.305 | TFLOPs: 25.88 | +7: iteration 145340/ 173500 | consumed samples: 37207040 | consumed tokens: 76200017920 | elapsed time per iteration (s): 0.16 | learning rate: 3.168E-05 | global batch size: 256 | lm loss: 3.658020E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.562 | TFLOPs: 25.12 | +7: iteration 145350/ 173500 | consumed samples: 37209600 | consumed tokens: 76205260800 | elapsed time per iteration (s): 0.16 | learning rate: 3.167E-05 | global batch size: 256 | lm loss: 3.657748E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.852 | TFLOPs: 25.76 | +7: iteration 145360/ 173500 | consumed samples: 37212160 | consumed tokens: 76210503680 | elapsed time per iteration (s): 0.15 | learning rate: 3.166E-05 | global batch size: 256 | lm loss: 3.658206E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.935 | TFLOPs: 26.11 | +7: iteration 145370/ 173500 | consumed samples: 37214720 | consumed tokens: 76215746560 | elapsed time per iteration (s): 0.16 | learning rate: 3.165E-05 | global batch size: 256 | lm loss: 3.660699E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.856 | TFLOPs: 25.64 | +7: iteration 145380/ 173500 | consumed samples: 37217280 | consumed tokens: 76220989440 | elapsed time per iteration (s): 0.15 | learning rate: 3.164E-05 | global batch size: 256 | lm loss: 3.668704E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.284 | TFLOPs: 26.04 | +7: iteration 145390/ 173500 | consumed samples: 37219840 | consumed tokens: 76226232320 | elapsed time per iteration (s): 0.15 | learning rate: 3.164E-05 | global batch size: 256 | lm loss: 3.652937E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.874 | TFLOPs: 26.17 | +7: iteration 145400/ 173500 | consumed samples: 37222400 | consumed tokens: 76231475200 | elapsed time per iteration (s): 0.16 | learning rate: 3.163E-05 | global batch size: 256 | lm loss: 3.650091E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.359 | TFLOPs: 25.68 | +7: iteration 145410/ 173500 | consumed samples: 37224960 | consumed tokens: 76236718080 | elapsed time per iteration (s): 0.15 | learning rate: 3.162E-05 | global batch size: 256 | lm loss: 3.649769E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.201 | TFLOPs: 25.99 | +7: iteration 145420/ 173500 | consumed samples: 37227520 | consumed tokens: 76241960960 | elapsed time per iteration (s): 0.16 | learning rate: 3.161E-05 | global batch size: 256 | lm loss: 3.663815E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.164 | TFLOPs: 25.36 | +7: iteration 145430/ 173500 | consumed samples: 37230080 | consumed tokens: 76247203840 | elapsed time per iteration (s): 0.15 | learning rate: 3.160E-05 | global batch size: 256 | lm loss: 3.673035E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.598 | TFLOPs: 26.20 | +7: iteration 145440/ 173500 | consumed samples: 37232640 | consumed tokens: 76252446720 | elapsed time per iteration (s): 0.15 | learning rate: 3.160E-05 | global batch size: 256 | lm loss: 3.677103E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.793 | TFLOPs: 26.19 | +7: iteration 145450/ 173500 | consumed samples: 37235200 | consumed tokens: 76257689600 | elapsed time per iteration (s): 0.15 | learning rate: 3.159E-05 | global batch size: 256 | lm loss: 3.662326E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.484 | TFLOPs: 26.21 | +7: iteration 145460/ 173500 | consumed samples: 37237760 | consumed tokens: 76262932480 | elapsed time per iteration (s): 0.15 | learning rate: 3.158E-05 | global batch size: 256 | lm loss: 3.664511E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.414 | TFLOPs: 26.20 | +7: iteration 145470/ 173500 | consumed samples: 37240320 | consumed tokens: 76268175360 | elapsed time per iteration (s): 0.15 | learning rate: 3.157E-05 | global batch size: 256 | lm loss: 3.658096E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.723 | TFLOPs: 26.23 | +7: iteration 145480/ 173500 | consumed samples: 37242880 | consumed tokens: 76273418240 | elapsed time per iteration (s): 0.16 | learning rate: 3.156E-05 | global batch size: 256 | lm loss: 3.661764E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.581 | TFLOPs: 25.76 | +7: iteration 145490/ 173500 | consumed samples: 37245440 | consumed tokens: 76278661120 | elapsed time per iteration (s): 0.15 | learning rate: 3.155E-05 | global batch size: 256 | lm loss: 3.664010E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.749 | TFLOPs: 26.19 | +7: iteration 145500/ 173500 | consumed samples: 37248000 | consumed tokens: 76283904000 | elapsed time per iteration (s): 0.16 | learning rate: 3.155E-05 | global batch size: 256 | lm loss: 3.661861E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.277 | TFLOPs: 25.33 | +7: iteration 145510/ 173500 | consumed samples: 37250560 | consumed tokens: 76289146880 | elapsed time per iteration (s): 0.16 | learning rate: 3.154E-05 | global batch size: 256 | lm loss: 3.673471E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.063 | TFLOPs: 25.31 | +7: iteration 145520/ 173500 | consumed samples: 37253120 | consumed tokens: 76294389760 | elapsed time per iteration (s): 0.16 | learning rate: 3.153E-05 | global batch size: 256 | lm loss: 3.668935E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.393 | TFLOPs: 24.50 | +7: iteration 145530/ 173500 | consumed samples: 37255680 | consumed tokens: 76299632640 | elapsed time per iteration (s): 0.16 | learning rate: 3.152E-05 | global batch size: 256 | lm loss: 3.677222E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.997 | TFLOPs: 25.37 | +7: iteration 145540/ 173500 | consumed samples: 37258240 | consumed tokens: 76304875520 | elapsed time per iteration (s): 0.16 | learning rate: 3.151E-05 | global batch size: 256 | lm loss: 3.665632E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.555 | TFLOPs: 25.15 | +7: iteration 145550/ 173500 | consumed samples: 37260800 | consumed tokens: 76310118400 | elapsed time per iteration (s): 0.16 | learning rate: 3.151E-05 | global batch size: 256 | lm loss: 3.663907E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.357 | TFLOPs: 24.56 | +7: iteration 145560/ 173500 | consumed samples: 37263360 | consumed tokens: 76315361280 | elapsed time per iteration (s): 0.16 | learning rate: 3.150E-05 | global batch size: 256 | lm loss: 3.662298E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.254 | TFLOPs: 25.88 | +7: iteration 145570/ 173500 | consumed samples: 37265920 | consumed tokens: 76320604160 | elapsed time per iteration (s): 0.15 | learning rate: 3.149E-05 | global batch size: 256 | lm loss: 3.678330E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.463 | TFLOPs: 25.98 | +7: iteration 145580/ 173500 | consumed samples: 37268480 | consumed tokens: 76325847040 | elapsed time per iteration (s): 0.18 | learning rate: 3.148E-05 | global batch size: 256 | lm loss: 3.661447E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.348 | TFLOPs: 21.85 | +7: iteration 145590/ 173500 | consumed samples: 37271040 | consumed tokens: 76331089920 | elapsed time per iteration (s): 0.16 | learning rate: 3.147E-05 | global batch size: 256 | lm loss: 3.658105E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.979 | TFLOPs: 25.39 | +7: iteration 145600/ 173500 | consumed samples: 37273600 | consumed tokens: 76336332800 | elapsed time per iteration (s): 0.16 | learning rate: 3.147E-05 | global batch size: 256 | lm loss: 3.677060E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.128 | TFLOPs: 25.77 | +7: iteration 145610/ 173500 | consumed samples: 37276160 | consumed tokens: 76341575680 | elapsed time per iteration (s): 0.16 | learning rate: 3.146E-05 | global batch size: 256 | lm loss: 3.659527E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.508 | TFLOPs: 25.74 | +7: iteration 145620/ 173500 | consumed samples: 37278720 | consumed tokens: 76346818560 | elapsed time per iteration (s): 0.16 | learning rate: 3.145E-05 | global batch size: 256 | lm loss: 3.664349E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.027 | TFLOPs: 25.33 | +7: iteration 145630/ 173500 | consumed samples: 37281280 | consumed tokens: 76352061440 | elapsed time per iteration (s): 0.15 | learning rate: 3.144E-05 | global batch size: 256 | lm loss: 3.663193E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.945 | TFLOPs: 26.28 | +7: iteration 145640/ 173500 | consumed samples: 37283840 | consumed tokens: 76357304320 | elapsed time per iteration (s): 0.15 | learning rate: 3.143E-05 | global batch size: 256 | lm loss: 3.663628E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.072 | TFLOPs: 26.25 | +7: iteration 145650/ 173500 | consumed samples: 37286400 | consumed tokens: 76362547200 | elapsed time per iteration (s): 0.17 | learning rate: 3.143E-05 | global batch size: 256 | lm loss: 3.666042E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1466.788 | TFLOPs: 23.00 | +7: iteration 145660/ 173500 | consumed samples: 37288960 | consumed tokens: 76367790080 | elapsed time per iteration (s): 0.16 | learning rate: 3.142E-05 | global batch size: 256 | lm loss: 3.678862E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.646 | TFLOPs: 25.81 | +7: iteration 145670/ 173500 | consumed samples: 37291520 | consumed tokens: 76373032960 | elapsed time per iteration (s): 0.16 | learning rate: 3.141E-05 | global batch size: 256 | lm loss: 3.672251E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.636 | TFLOPs: 25.70 | +7: iteration 145680/ 173500 | consumed samples: 37294080 | consumed tokens: 76378275840 | elapsed time per iteration (s): 0.16 | learning rate: 3.140E-05 | global batch size: 256 | lm loss: 3.654698E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.247 | TFLOPs: 25.69 | +7: iteration 145690/ 173500 | consumed samples: 37296640 | consumed tokens: 76383518720 | elapsed time per iteration (s): 0.16 | learning rate: 3.139E-05 | global batch size: 256 | lm loss: 3.649554E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.898 | TFLOPs: 25.67 | +7: iteration 145700/ 173500 | consumed samples: 37299200 | consumed tokens: 76388761600 | elapsed time per iteration (s): 0.16 | learning rate: 3.139E-05 | global batch size: 256 | lm loss: 3.669494E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.696 | TFLOPs: 25.87 | +7: iteration 145710/ 173500 | consumed samples: 37301760 | consumed tokens: 76394004480 | elapsed time per iteration (s): 0.21 | learning rate: 3.138E-05 | global batch size: 256 | lm loss: 3.669568E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1203.137 | TFLOPs: 18.87 | +7: iteration 145720/ 173500 | consumed samples: 37304320 | consumed tokens: 76399247360 | elapsed time per iteration (s): 0.15 | learning rate: 3.137E-05 | global batch size: 256 | lm loss: 3.670215E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.873 | TFLOPs: 26.38 | +7: iteration 145730/ 173500 | consumed samples: 37306880 | consumed tokens: 76404490240 | elapsed time per iteration (s): 0.16 | learning rate: 3.136E-05 | global batch size: 256 | lm loss: 3.664727E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.927 | TFLOPs: 25.51 | +7: iteration 145740/ 173500 | consumed samples: 37309440 | consumed tokens: 76409733120 | elapsed time per iteration (s): 0.15 | learning rate: 3.135E-05 | global batch size: 256 | lm loss: 3.658416E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.441 | TFLOPs: 26.35 | +7: iteration 145750/ 173500 | consumed samples: 37312000 | consumed tokens: 76414976000 | elapsed time per iteration (s): 0.15 | learning rate: 3.135E-05 | global batch size: 256 | lm loss: 3.663477E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.819 | TFLOPs: 26.38 | +7: iteration 145760/ 173500 | consumed samples: 37314560 | consumed tokens: 76420218880 | elapsed time per iteration (s): 0.15 | learning rate: 3.134E-05 | global batch size: 256 | lm loss: 3.658692E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.897 | TFLOPs: 26.00 | +7: iteration 145770/ 173500 | consumed samples: 37317120 | consumed tokens: 76425461760 | elapsed time per iteration (s): 0.16 | learning rate: 3.133E-05 | global batch size: 256 | lm loss: 3.656597E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.098 | TFLOPs: 25.56 | +7: iteration 145780/ 173500 | consumed samples: 37319680 | consumed tokens: 76430704640 | elapsed time per iteration (s): 0.16 | learning rate: 3.132E-05 | global batch size: 256 | lm loss: 3.665733E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.349 | TFLOPs: 25.35 | +7: iteration 145790/ 173500 | consumed samples: 37322240 | consumed tokens: 76435947520 | elapsed time per iteration (s): 0.16 | learning rate: 3.131E-05 | global batch size: 256 | lm loss: 3.661341E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.092 | TFLOPs: 25.41 | +7: iteration 145800/ 173500 | consumed samples: 37324800 | consumed tokens: 76441190400 | elapsed time per iteration (s): 0.15 | learning rate: 3.131E-05 | global batch size: 256 | lm loss: 3.672489E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.462 | TFLOPs: 25.93 | +7: iteration 145810/ 173500 | consumed samples: 37327360 | consumed tokens: 76446433280 | elapsed time per iteration (s): 0.15 | learning rate: 3.130E-05 | global batch size: 256 | lm loss: 3.676473E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.858 | TFLOPs: 25.92 | +7: iteration 145820/ 173500 | consumed samples: 37329920 | consumed tokens: 76451676160 | elapsed time per iteration (s): 0.16 | learning rate: 3.129E-05 | global batch size: 256 | lm loss: 3.662378E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.094 | TFLOPs: 25.58 | +7: iteration 145830/ 173500 | consumed samples: 37332480 | consumed tokens: 76456919040 | elapsed time per iteration (s): 0.15 | learning rate: 3.128E-05 | global batch size: 256 | lm loss: 3.657005E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.191 | TFLOPs: 26.35 | +7: iteration 145840/ 173500 | consumed samples: 37335040 | consumed tokens: 76462161920 | elapsed time per iteration (s): 0.18 | learning rate: 3.127E-05 | global batch size: 256 | lm loss: 3.672671E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.437 | TFLOPs: 22.18 | +7: iteration 145850/ 173500 | consumed samples: 37337600 | consumed tokens: 76467404800 | elapsed time per iteration (s): 0.16 | learning rate: 3.127E-05 | global batch size: 256 | lm loss: 3.656771E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.240 | TFLOPs: 25.80 | +7: iteration 145860/ 173500 | consumed samples: 37340160 | consumed tokens: 76472647680 | elapsed time per iteration (s): 0.15 | learning rate: 3.126E-05 | global batch size: 256 | lm loss: 3.650898E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.598 | TFLOPs: 26.34 | +7: iteration 145870/ 173500 | consumed samples: 37342720 | consumed tokens: 76477890560 | elapsed time per iteration (s): 0.16 | learning rate: 3.125E-05 | global batch size: 256 | lm loss: 3.656799E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.589 | TFLOPs: 25.68 | +7: iteration 145880/ 173500 | consumed samples: 37345280 | consumed tokens: 76483133440 | elapsed time per iteration (s): 0.16 | learning rate: 3.124E-05 | global batch size: 256 | lm loss: 3.658698E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.141 | TFLOPs: 25.24 | +7: iteration 145890/ 173500 | consumed samples: 37347840 | consumed tokens: 76488376320 | elapsed time per iteration (s): 0.16 | learning rate: 3.123E-05 | global batch size: 256 | lm loss: 3.656636E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.101 | TFLOPs: 25.31 | +7: iteration 145900/ 173500 | consumed samples: 37350400 | consumed tokens: 76493619200 | elapsed time per iteration (s): 0.16 | learning rate: 3.123E-05 | global batch size: 256 | lm loss: 3.666341E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.529 | TFLOPs: 25.15 | +7: iteration 145910/ 173500 | consumed samples: 37352960 | consumed tokens: 76498862080 | elapsed time per iteration (s): 0.16 | learning rate: 3.122E-05 | global batch size: 256 | lm loss: 3.654685E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.338 | TFLOPs: 25.22 | +7: iteration 145920/ 173500 | consumed samples: 37355520 | consumed tokens: 76504104960 | elapsed time per iteration (s): 0.15 | learning rate: 3.121E-05 | global batch size: 256 | lm loss: 3.662114E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.813 | TFLOPs: 26.08 | +7: iteration 145930/ 173500 | consumed samples: 37358080 | consumed tokens: 76509347840 | elapsed time per iteration (s): 0.15 | learning rate: 3.120E-05 | global batch size: 256 | lm loss: 3.659426E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.813 | TFLOPs: 26.08 | +7: iteration 145940/ 173500 | consumed samples: 37360640 | consumed tokens: 76514590720 | elapsed time per iteration (s): 0.15 | learning rate: 3.119E-05 | global batch size: 256 | lm loss: 3.646157E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.680 | TFLOPs: 26.11 | +7: iteration 145950/ 173500 | consumed samples: 37363200 | consumed tokens: 76519833600 | elapsed time per iteration (s): 0.16 | learning rate: 3.119E-05 | global batch size: 256 | lm loss: 3.664862E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.627 | TFLOPs: 25.16 | +7: iteration 145960/ 173500 | consumed samples: 37365760 | consumed tokens: 76525076480 | elapsed time per iteration (s): 0.16 | learning rate: 3.118E-05 | global batch size: 256 | lm loss: 3.671007E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.866 | TFLOPs: 25.33 | +7: iteration 145970/ 173500 | consumed samples: 37368320 | consumed tokens: 76530319360 | elapsed time per iteration (s): 0.16 | learning rate: 3.117E-05 | global batch size: 256 | lm loss: 3.666044E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.595 | TFLOPs: 24.55 | +7: iteration 145980/ 173500 | consumed samples: 37370880 | consumed tokens: 76535562240 | elapsed time per iteration (s): 0.16 | learning rate: 3.116E-05 | global batch size: 256 | lm loss: 3.649214E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.244 | TFLOPs: 25.72 | +7: iteration 145990/ 173500 | consumed samples: 37373440 | consumed tokens: 76540805120 | elapsed time per iteration (s): 0.15 | learning rate: 3.115E-05 | global batch size: 256 | lm loss: 3.671227E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.936 | TFLOPs: 26.19 | +0: [2023-03-17 06:36:06,961] [INFO] [logging.py:68:log_dist] [Rank 0] step=146000, skipped=0, lr=[3.1146732758228304e-05, 3.1146732758228304e-05, 3.1146732758228304e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 146000/ 173500 | consumed samples: 37376000 | consumed tokens: 76546048000 | elapsed time per iteration (s): 0.16 | learning rate: 3.115E-05 | global batch size: 256 | lm loss: 3.655445E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.475 | TFLOPs: 25.90 | +0: steps: 146000 loss: 3.6257 iter time (s): 0.157 samples/sec: 1631.755 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 146000 | lm loss value: 3.877541E+00 | lm loss PPL: 4.830530E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 146000 to checkpoints_44m91b100m +0: [2023-03-17 06:36:07,035] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step146000 is begin to save! +0: [2023-03-17 06:36:07,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:36:07,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:36:07,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:36:07,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:36:07,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:36:07,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:36:07,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:36:07,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:36:07,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:36:07,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:36:07,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:36:07,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:36:07,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:36:07,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:36:07,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:36:07,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:36:07,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:36:07,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:36:07,173] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:36:07,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:36:07,175] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step146000/mp_rank_00_model_states.pt +0: [2023-03-17 06:36:07,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:36:07,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:36:07,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:36:07,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:36:07,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +2: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +3: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +6: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +4: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:36:07,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:36:07,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +1: [2023-03-17 06:36:07,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:36:07,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:36:07,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +5: [2023-03-17 06:36:07,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:36:07,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:36:07,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +7: [2023-03-17 06:36:07,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:36:07,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step146000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:36:07,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step146000 is ready now! +0: successfully saved checkpoint at iteration 146000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.85 +7: iteration 146010/ 173500 | consumed samples: 37378560 | consumed tokens: 76551290880 | elapsed time per iteration (s): 0.18 | learning rate: 3.114E-05 | global batch size: 256 | lm loss: 3.664486E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.853 | TFLOPs: 22.47 | +7: iteration 146020/ 173500 | consumed samples: 37381120 | consumed tokens: 76556533760 | elapsed time per iteration (s): 0.16 | learning rate: 3.113E-05 | global batch size: 256 | lm loss: 3.657932E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.970 | TFLOPs: 25.23 | +7: iteration 146030/ 173500 | consumed samples: 37383680 | consumed tokens: 76561776640 | elapsed time per iteration (s): 0.15 | learning rate: 3.112E-05 | global batch size: 256 | lm loss: 3.669991E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.023 | TFLOPs: 26.22 | +7: iteration 146040/ 173500 | consumed samples: 37386240 | consumed tokens: 76567019520 | elapsed time per iteration (s): 0.16 | learning rate: 3.112E-05 | global batch size: 256 | lm loss: 3.683430E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.296 | TFLOPs: 25.52 | +7: iteration 146050/ 173500 | consumed samples: 37388800 | consumed tokens: 76572262400 | elapsed time per iteration (s): 0.15 | learning rate: 3.111E-05 | global batch size: 256 | lm loss: 3.669535E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.321 | TFLOPs: 26.18 | +7: iteration 146060/ 173500 | consumed samples: 37391360 | consumed tokens: 76577505280 | elapsed time per iteration (s): 0.16 | learning rate: 3.110E-05 | global batch size: 256 | lm loss: 3.663134E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.074 | TFLOPs: 25.50 | +7: iteration 146070/ 173500 | consumed samples: 37393920 | consumed tokens: 76582748160 | elapsed time per iteration (s): 0.15 | learning rate: 3.109E-05 | global batch size: 256 | lm loss: 3.670350E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.143 | TFLOPs: 26.21 | +7: iteration 146080/ 173500 | consumed samples: 37396480 | consumed tokens: 76587991040 | elapsed time per iteration (s): 0.15 | learning rate: 3.108E-05 | global batch size: 256 | lm loss: 3.673534E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.509 | TFLOPs: 26.20 | +7: iteration 146090/ 173500 | consumed samples: 37399040 | consumed tokens: 76593233920 | elapsed time per iteration (s): 0.24 | learning rate: 3.108E-05 | global batch size: 256 | lm loss: 3.661217E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1087.398 | TFLOPs: 17.05 | +7: iteration 146100/ 173500 | consumed samples: 37401600 | consumed tokens: 76598476800 | elapsed time per iteration (s): 0.16 | learning rate: 3.107E-05 | global batch size: 256 | lm loss: 3.675251E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.754 | TFLOPs: 25.73 | +7: iteration 146110/ 173500 | consumed samples: 37404160 | consumed tokens: 76603719680 | elapsed time per iteration (s): 0.16 | learning rate: 3.106E-05 | global batch size: 256 | lm loss: 3.671591E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.862 | TFLOPs: 25.76 | +7: iteration 146120/ 173500 | consumed samples: 37406720 | consumed tokens: 76608962560 | elapsed time per iteration (s): 0.16 | learning rate: 3.105E-05 | global batch size: 256 | lm loss: 3.663145E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.929 | TFLOPs: 25.72 | +7: iteration 146130/ 173500 | consumed samples: 37409280 | consumed tokens: 76614205440 | elapsed time per iteration (s): 0.16 | learning rate: 3.104E-05 | global batch size: 256 | lm loss: 3.668042E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.413 | TFLOPs: 24.93 | +7: iteration 146140/ 173500 | consumed samples: 37411840 | consumed tokens: 76619448320 | elapsed time per iteration (s): 0.15 | learning rate: 3.104E-05 | global batch size: 256 | lm loss: 3.671899E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.007 | TFLOPs: 26.16 | +7: iteration 146150/ 173500 | consumed samples: 37414400 | consumed tokens: 76624691200 | elapsed time per iteration (s): 0.19 | learning rate: 3.103E-05 | global batch size: 256 | lm loss: 3.663438E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.376 | TFLOPs: 21.55 | +7: iteration 146160/ 173500 | consumed samples: 37416960 | consumed tokens: 76629934080 | elapsed time per iteration (s): 0.15 | learning rate: 3.102E-05 | global batch size: 256 | lm loss: 3.658113E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.432 | TFLOPs: 26.12 | +7: iteration 146170/ 173500 | consumed samples: 37419520 | consumed tokens: 76635176960 | elapsed time per iteration (s): 0.16 | learning rate: 3.101E-05 | global batch size: 256 | lm loss: 3.662985E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.383 | TFLOPs: 25.04 | +7: iteration 146180/ 173500 | consumed samples: 37422080 | consumed tokens: 76640419840 | elapsed time per iteration (s): 0.16 | learning rate: 3.100E-05 | global batch size: 256 | lm loss: 3.676358E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.747 | TFLOPs: 25.78 | +7: iteration 146190/ 173500 | consumed samples: 37424640 | consumed tokens: 76645662720 | elapsed time per iteration (s): 0.16 | learning rate: 3.100E-05 | global batch size: 256 | lm loss: 3.659013E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.335 | TFLOPs: 25.50 | +7: iteration 146200/ 173500 | consumed samples: 37427200 | consumed tokens: 76650905600 | elapsed time per iteration (s): 0.16 | learning rate: 3.099E-05 | global batch size: 256 | lm loss: 3.664184E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.847 | TFLOPs: 25.81 | +7: iteration 146210/ 173500 | consumed samples: 37429760 | consumed tokens: 76656148480 | elapsed time per iteration (s): 0.16 | learning rate: 3.098E-05 | global batch size: 256 | lm loss: 3.667842E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.614 | TFLOPs: 24.88 | +7: iteration 146220/ 173500 | consumed samples: 37432320 | consumed tokens: 76661391360 | elapsed time per iteration (s): 0.18 | learning rate: 3.097E-05 | global batch size: 256 | lm loss: 3.669419E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1426.350 | TFLOPs: 22.37 | +7: iteration 146230/ 173500 | consumed samples: 37434880 | consumed tokens: 76666634240 | elapsed time per iteration (s): 0.16 | learning rate: 3.096E-05 | global batch size: 256 | lm loss: 3.671592E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.963 | TFLOPs: 25.66 | +7: iteration 146240/ 173500 | consumed samples: 37437440 | consumed tokens: 76671877120 | elapsed time per iteration (s): 0.16 | learning rate: 3.096E-05 | global batch size: 256 | lm loss: 3.685519E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.365 | TFLOPs: 25.11 | +7: iteration 146250/ 173500 | consumed samples: 37440000 | consumed tokens: 76677120000 | elapsed time per iteration (s): 0.15 | learning rate: 3.095E-05 | global batch size: 256 | lm loss: 3.667133E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.361 | TFLOPs: 26.16 | +7: iteration 146260/ 173500 | consumed samples: 37442560 | consumed tokens: 76682362880 | elapsed time per iteration (s): 0.16 | learning rate: 3.094E-05 | global batch size: 256 | lm loss: 3.665085E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.885 | TFLOPs: 25.80 | +7: iteration 146270/ 173500 | consumed samples: 37445120 | consumed tokens: 76687605760 | elapsed time per iteration (s): 0.16 | learning rate: 3.093E-05 | global batch size: 256 | lm loss: 3.671962E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.806 | TFLOPs: 25.73 | +7: iteration 146280/ 173500 | consumed samples: 37447680 | consumed tokens: 76692848640 | elapsed time per iteration (s): 0.16 | learning rate: 3.093E-05 | global batch size: 256 | lm loss: 3.664087E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.999 | TFLOPs: 25.39 | +7: iteration 146290/ 173500 | consumed samples: 37450240 | consumed tokens: 76698091520 | elapsed time per iteration (s): 0.15 | learning rate: 3.092E-05 | global batch size: 256 | lm loss: 3.671146E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.302 | TFLOPs: 26.01 | +7: iteration 146300/ 173500 | consumed samples: 37452800 | consumed tokens: 76703334400 | elapsed time per iteration (s): 0.15 | learning rate: 3.091E-05 | global batch size: 256 | lm loss: 3.659903E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.528 | TFLOPs: 26.18 | +7: iteration 146310/ 173500 | consumed samples: 37455360 | consumed tokens: 76708577280 | elapsed time per iteration (s): 0.16 | learning rate: 3.090E-05 | global batch size: 256 | lm loss: 3.676969E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.760 | TFLOPs: 24.48 | +7: iteration 146320/ 173500 | consumed samples: 37457920 | consumed tokens: 76713820160 | elapsed time per iteration (s): 0.16 | learning rate: 3.089E-05 | global batch size: 256 | lm loss: 3.672966E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.936 | TFLOPs: 25.73 | +7: iteration 146330/ 173500 | consumed samples: 37460480 | consumed tokens: 76719063040 | elapsed time per iteration (s): 0.16 | learning rate: 3.089E-05 | global batch size: 256 | lm loss: 3.678722E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.368 | TFLOPs: 25.47 | +7: iteration 146340/ 173500 | consumed samples: 37463040 | consumed tokens: 76724305920 | elapsed time per iteration (s): 0.16 | learning rate: 3.088E-05 | global batch size: 256 | lm loss: 3.661114E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.570 | TFLOPs: 25.70 | +7: iteration 146350/ 173500 | consumed samples: 37465600 | consumed tokens: 76729548800 | elapsed time per iteration (s): 0.16 | learning rate: 3.087E-05 | global batch size: 256 | lm loss: 3.666600E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.538 | TFLOPs: 25.68 | +7: iteration 146360/ 173500 | consumed samples: 37468160 | consumed tokens: 76734791680 | elapsed time per iteration (s): 0.16 | learning rate: 3.086E-05 | global batch size: 256 | lm loss: 3.663779E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.279 | TFLOPs: 25.66 | +7: iteration 146370/ 173500 | consumed samples: 37470720 | consumed tokens: 76740034560 | elapsed time per iteration (s): 0.16 | learning rate: 3.085E-05 | global batch size: 256 | lm loss: 3.666533E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.501 | TFLOPs: 25.76 | +7: iteration 146380/ 173500 | consumed samples: 37473280 | consumed tokens: 76745277440 | elapsed time per iteration (s): 0.16 | learning rate: 3.085E-05 | global batch size: 256 | lm loss: 3.672626E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.553 | TFLOPs: 25.40 | +7: iteration 146390/ 173500 | consumed samples: 37475840 | consumed tokens: 76750520320 | elapsed time per iteration (s): 0.16 | learning rate: 3.084E-05 | global batch size: 256 | lm loss: 3.672421E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.475 | TFLOPs: 24.53 | +7: iteration 146400/ 173500 | consumed samples: 37478400 | consumed tokens: 76755763200 | elapsed time per iteration (s): 0.16 | learning rate: 3.083E-05 | global batch size: 256 | lm loss: 3.668234E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.970 | TFLOPs: 25.64 | +7: iteration 146410/ 173500 | consumed samples: 37480960 | consumed tokens: 76761006080 | elapsed time per iteration (s): 0.16 | learning rate: 3.082E-05 | global batch size: 256 | lm loss: 3.663794E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.317 | TFLOPs: 24.86 | +7: iteration 146420/ 173500 | consumed samples: 37483520 | consumed tokens: 76766248960 | elapsed time per iteration (s): 0.16 | learning rate: 3.082E-05 | global batch size: 256 | lm loss: 3.668746E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.561 | TFLOPs: 25.35 | +7: iteration 146430/ 173500 | consumed samples: 37486080 | consumed tokens: 76771491840 | elapsed time per iteration (s): 0.16 | learning rate: 3.081E-05 | global batch size: 256 | lm loss: 3.659261E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.471 | TFLOPs: 25.70 | +7: iteration 146440/ 173500 | consumed samples: 37488640 | consumed tokens: 76776734720 | elapsed time per iteration (s): 0.16 | learning rate: 3.080E-05 | global batch size: 256 | lm loss: 3.654920E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.498 | TFLOPs: 25.77 | +7: iteration 146450/ 173500 | consumed samples: 37491200 | consumed tokens: 76781977600 | elapsed time per iteration (s): 0.16 | learning rate: 3.079E-05 | global batch size: 256 | lm loss: 3.659313E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.409 | TFLOPs: 25.66 | +7: iteration 146460/ 173500 | consumed samples: 37493760 | consumed tokens: 76787220480 | elapsed time per iteration (s): 0.16 | learning rate: 3.078E-05 | global batch size: 256 | lm loss: 3.673924E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.057 | TFLOPs: 25.63 | +7: iteration 146470/ 173500 | consumed samples: 37496320 | consumed tokens: 76792463360 | elapsed time per iteration (s): 0.16 | learning rate: 3.078E-05 | global batch size: 256 | lm loss: 3.659197E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.465 | TFLOPs: 25.54 | +7: iteration 146480/ 173500 | consumed samples: 37498880 | consumed tokens: 76797706240 | elapsed time per iteration (s): 0.16 | learning rate: 3.077E-05 | global batch size: 256 | lm loss: 3.659417E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.547 | TFLOPs: 25.88 | +7: iteration 146490/ 173500 | consumed samples: 37501440 | consumed tokens: 76802949120 | elapsed time per iteration (s): 0.16 | learning rate: 3.076E-05 | global batch size: 256 | lm loss: 3.674754E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.779 | TFLOPs: 25.79 | +7: iteration 146500/ 173500 | consumed samples: 37504000 | consumed tokens: 76808192000 | elapsed time per iteration (s): 0.16 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 3.674332E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.369 | TFLOPs: 25.80 | +7: iteration 146510/ 173500 | consumed samples: 37506560 | consumed tokens: 76813434880 | elapsed time per iteration (s): 0.15 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 3.666872E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.786 | TFLOPs: 26.16 | +7: iteration 146520/ 173500 | consumed samples: 37509120 | consumed tokens: 76818677760 | elapsed time per iteration (s): 0.16 | learning rate: 3.074E-05 | global batch size: 256 | lm loss: 3.680816E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.269 | TFLOPs: 25.79 | +7: iteration 146530/ 173500 | consumed samples: 37511680 | consumed tokens: 76823920640 | elapsed time per iteration (s): 0.15 | learning rate: 3.073E-05 | global batch size: 256 | lm loss: 3.669700E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.024 | TFLOPs: 26.13 | +7: iteration 146540/ 173500 | consumed samples: 37514240 | consumed tokens: 76829163520 | elapsed time per iteration (s): 0.15 | learning rate: 3.072E-05 | global batch size: 256 | lm loss: 3.662455E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.908 | TFLOPs: 26.11 | +7: iteration 146550/ 173500 | consumed samples: 37516800 | consumed tokens: 76834406400 | elapsed time per iteration (s): 0.16 | learning rate: 3.071E-05 | global batch size: 256 | lm loss: 3.665147E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.672 | TFLOPs: 25.59 | +7: iteration 146560/ 173500 | consumed samples: 37519360 | consumed tokens: 76839649280 | elapsed time per iteration (s): 0.15 | learning rate: 3.071E-05 | global batch size: 256 | lm loss: 3.663278E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.885 | TFLOPs: 26.14 | +7: iteration 146570/ 173500 | consumed samples: 37521920 | consumed tokens: 76844892160 | elapsed time per iteration (s): 0.15 | learning rate: 3.070E-05 | global batch size: 256 | lm loss: 3.658026E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.860 | TFLOPs: 26.17 | +7: iteration 146580/ 173500 | consumed samples: 37524480 | consumed tokens: 76850135040 | elapsed time per iteration (s): 0.15 | learning rate: 3.069E-05 | global batch size: 256 | lm loss: 3.675236E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.952 | TFLOPs: 26.16 | +7: iteration 146590/ 173500 | consumed samples: 37527040 | consumed tokens: 76855377920 | elapsed time per iteration (s): 0.16 | learning rate: 3.068E-05 | global batch size: 256 | lm loss: 3.677906E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.480 | TFLOPs: 25.77 | +7: iteration 146600/ 173500 | consumed samples: 37529600 | consumed tokens: 76860620800 | elapsed time per iteration (s): 0.16 | learning rate: 3.068E-05 | global batch size: 256 | lm loss: 3.657652E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.074 | TFLOPs: 24.48 | +7: iteration 146610/ 173500 | consumed samples: 37532160 | consumed tokens: 76865863680 | elapsed time per iteration (s): 0.16 | learning rate: 3.067E-05 | global batch size: 256 | lm loss: 3.671603E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.413 | TFLOPs: 25.76 | +7: iteration 146620/ 173500 | consumed samples: 37534720 | consumed tokens: 76871106560 | elapsed time per iteration (s): 0.15 | learning rate: 3.066E-05 | global batch size: 256 | lm loss: 3.665362E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.080 | TFLOPs: 26.13 | +7: iteration 146630/ 173500 | consumed samples: 37537280 | consumed tokens: 76876349440 | elapsed time per iteration (s): 0.16 | learning rate: 3.065E-05 | global batch size: 256 | lm loss: 3.667049E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.297 | TFLOPs: 25.57 | +7: iteration 146640/ 173500 | consumed samples: 37539840 | consumed tokens: 76881592320 | elapsed time per iteration (s): 0.16 | learning rate: 3.064E-05 | global batch size: 256 | lm loss: 3.664339E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.003 | TFLOPs: 24.62 | +7: iteration 146650/ 173500 | consumed samples: 37542400 | consumed tokens: 76886835200 | elapsed time per iteration (s): 0.16 | learning rate: 3.064E-05 | global batch size: 256 | lm loss: 3.680919E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.352 | TFLOPs: 25.41 | +7: iteration 146660/ 173500 | consumed samples: 37544960 | consumed tokens: 76892078080 | elapsed time per iteration (s): 0.16 | learning rate: 3.063E-05 | global batch size: 256 | lm loss: 3.660102E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.586 | TFLOPs: 25.26 | +7: iteration 146670/ 173500 | consumed samples: 37547520 | consumed tokens: 76897320960 | elapsed time per iteration (s): 0.16 | learning rate: 3.062E-05 | global batch size: 256 | lm loss: 3.661529E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.135 | TFLOPs: 25.60 | +7: iteration 146680/ 173500 | consumed samples: 37550080 | consumed tokens: 76902563840 | elapsed time per iteration (s): 0.16 | learning rate: 3.061E-05 | global batch size: 256 | lm loss: 3.677348E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.990 | TFLOPs: 25.67 | +7: iteration 146690/ 173500 | consumed samples: 37552640 | consumed tokens: 76907806720 | elapsed time per iteration (s): 0.16 | learning rate: 3.061E-05 | global batch size: 256 | lm loss: 3.662992E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.476 | TFLOPs: 24.99 | +7: iteration 146700/ 173500 | consumed samples: 37555200 | consumed tokens: 76913049600 | elapsed time per iteration (s): 0.16 | learning rate: 3.060E-05 | global batch size: 256 | lm loss: 3.661823E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.177 | TFLOPs: 25.63 | +7: iteration 146710/ 173500 | consumed samples: 37557760 | consumed tokens: 76918292480 | elapsed time per iteration (s): 0.16 | learning rate: 3.059E-05 | global batch size: 256 | lm loss: 3.682980E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.381 | TFLOPs: 25.36 | +7: iteration 146720/ 173500 | consumed samples: 37560320 | consumed tokens: 76923535360 | elapsed time per iteration (s): 0.15 | learning rate: 3.058E-05 | global batch size: 256 | lm loss: 3.666832E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.317 | TFLOPs: 25.99 | +7: iteration 146730/ 173500 | consumed samples: 37562880 | consumed tokens: 76928778240 | elapsed time per iteration (s): 0.15 | learning rate: 3.057E-05 | global batch size: 256 | lm loss: 3.665366E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.527 | TFLOPs: 26.15 | +7: iteration 146740/ 173500 | consumed samples: 37565440 | consumed tokens: 76934021120 | elapsed time per iteration (s): 0.15 | learning rate: 3.057E-05 | global batch size: 256 | lm loss: 3.671241E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.228 | TFLOPs: 26.26 | +7: iteration 146750/ 173500 | consumed samples: 37568000 | consumed tokens: 76939264000 | elapsed time per iteration (s): 0.16 | learning rate: 3.056E-05 | global batch size: 256 | lm loss: 3.653787E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.535 | TFLOPs: 24.69 | +7: iteration 146760/ 173500 | consumed samples: 37570560 | consumed tokens: 76944506880 | elapsed time per iteration (s): 0.15 | learning rate: 3.055E-05 | global batch size: 256 | lm loss: 3.673422E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.568 | TFLOPs: 26.04 | +7: iteration 146770/ 173500 | consumed samples: 37573120 | consumed tokens: 76949749760 | elapsed time per iteration (s): 0.15 | learning rate: 3.054E-05 | global batch size: 256 | lm loss: 3.663235E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.211 | TFLOPs: 26.24 | +7: iteration 146780/ 173500 | consumed samples: 37575680 | consumed tokens: 76954992640 | elapsed time per iteration (s): 0.16 | learning rate: 3.054E-05 | global batch size: 256 | lm loss: 3.672044E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.220 | TFLOPs: 25.00 | +7: iteration 146790/ 173500 | consumed samples: 37578240 | consumed tokens: 76960235520 | elapsed time per iteration (s): 0.16 | learning rate: 3.053E-05 | global batch size: 256 | lm loss: 3.651901E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.004 | TFLOPs: 24.90 | +7: iteration 146800/ 173500 | consumed samples: 37580800 | consumed tokens: 76965478400 | elapsed time per iteration (s): 0.16 | learning rate: 3.052E-05 | global batch size: 256 | lm loss: 3.657151E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.355 | TFLOPs: 25.85 | +7: iteration 146810/ 173500 | consumed samples: 37583360 | consumed tokens: 76970721280 | elapsed time per iteration (s): 0.16 | learning rate: 3.051E-05 | global batch size: 256 | lm loss: 3.645353E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.100 | TFLOPs: 25.30 | +7: iteration 146820/ 173500 | consumed samples: 37585920 | consumed tokens: 76975964160 | elapsed time per iteration (s): 0.15 | learning rate: 3.050E-05 | global batch size: 256 | lm loss: 3.664998E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.404 | TFLOPs: 26.18 | +7: iteration 146830/ 173500 | consumed samples: 37588480 | consumed tokens: 76981207040 | elapsed time per iteration (s): 0.16 | learning rate: 3.050E-05 | global batch size: 256 | lm loss: 3.658435E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.036 | TFLOPs: 25.19 | +7: iteration 146840/ 173500 | consumed samples: 37591040 | consumed tokens: 76986449920 | elapsed time per iteration (s): 0.15 | learning rate: 3.049E-05 | global batch size: 256 | lm loss: 3.668435E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.274 | TFLOPs: 25.91 | +7: iteration 146850/ 173500 | consumed samples: 37593600 | consumed tokens: 76991692800 | elapsed time per iteration (s): 0.15 | learning rate: 3.048E-05 | global batch size: 256 | lm loss: 3.677795E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.802 | TFLOPs: 26.17 | +7: iteration 146860/ 173500 | consumed samples: 37596160 | consumed tokens: 76996935680 | elapsed time per iteration (s): 0.16 | learning rate: 3.047E-05 | global batch size: 256 | lm loss: 3.655125E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.912 | TFLOPs: 25.36 | +7: iteration 146870/ 173500 | consumed samples: 37598720 | consumed tokens: 77002178560 | elapsed time per iteration (s): 0.15 | learning rate: 3.047E-05 | global batch size: 256 | lm loss: 3.671687E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.801 | TFLOPs: 26.16 | +7: iteration 146880/ 173500 | consumed samples: 37601280 | consumed tokens: 77007421440 | elapsed time per iteration (s): 0.15 | learning rate: 3.046E-05 | global batch size: 256 | lm loss: 3.666949E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.000 | TFLOPs: 26.11 | +7: iteration 146890/ 173500 | consumed samples: 37603840 | consumed tokens: 77012664320 | elapsed time per iteration (s): 0.15 | learning rate: 3.045E-05 | global batch size: 256 | lm loss: 3.669841E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.097 | TFLOPs: 26.14 | +7: iteration 146900/ 173500 | consumed samples: 37606400 | consumed tokens: 77017907200 | elapsed time per iteration (s): 0.15 | learning rate: 3.044E-05 | global batch size: 256 | lm loss: 3.662308E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.633 | TFLOPs: 26.15 | +7: iteration 146910/ 173500 | consumed samples: 37608960 | consumed tokens: 77023150080 | elapsed time per iteration (s): 0.16 | learning rate: 3.044E-05 | global batch size: 256 | lm loss: 3.661458E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.421 | TFLOPs: 25.88 | +7: iteration 146920/ 173500 | consumed samples: 37611520 | consumed tokens: 77028392960 | elapsed time per iteration (s): 0.15 | learning rate: 3.043E-05 | global batch size: 256 | lm loss: 3.668073E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.880 | TFLOPs: 26.14 | +7: iteration 146930/ 173500 | consumed samples: 37614080 | consumed tokens: 77033635840 | elapsed time per iteration (s): 0.15 | learning rate: 3.042E-05 | global batch size: 256 | lm loss: 3.663807E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.434 | TFLOPs: 26.15 | +7: iteration 146940/ 173500 | consumed samples: 37616640 | consumed tokens: 77038878720 | elapsed time per iteration (s): 0.16 | learning rate: 3.041E-05 | global batch size: 256 | lm loss: 3.668171E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.281 | TFLOPs: 25.77 | +7: iteration 146950/ 173500 | consumed samples: 37619200 | consumed tokens: 77044121600 | elapsed time per iteration (s): 0.15 | learning rate: 3.040E-05 | global batch size: 256 | lm loss: 3.665105E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.966 | TFLOPs: 26.13 | +7: iteration 146960/ 173500 | consumed samples: 37621760 | consumed tokens: 77049364480 | elapsed time per iteration (s): 0.15 | learning rate: 3.040E-05 | global batch size: 256 | lm loss: 3.663046E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.050 | TFLOPs: 26.10 | +7: iteration 146970/ 173500 | consumed samples: 37624320 | consumed tokens: 77054607360 | elapsed time per iteration (s): 0.16 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 3.673509E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.989 | TFLOPs: 25.66 | +7: iteration 146980/ 173500 | consumed samples: 37626880 | consumed tokens: 77059850240 | elapsed time per iteration (s): 0.15 | learning rate: 3.038E-05 | global batch size: 256 | lm loss: 3.663960E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.218 | TFLOPs: 26.11 | +7: iteration 146990/ 173500 | consumed samples: 37629440 | consumed tokens: 77065093120 | elapsed time per iteration (s): 0.15 | learning rate: 3.037E-05 | global batch size: 256 | lm loss: 3.670647E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.712 | TFLOPs: 26.12 | +7: iteration 147000/ 173500 | consumed samples: 37632000 | consumed tokens: 77070336000 | elapsed time per iteration (s): 0.15 | learning rate: 3.037E-05 | global batch size: 256 | lm loss: 3.665252E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.506 | TFLOPs: 26.12 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 147000 | lm loss value: 3.835962E+00 | lm loss PPL: 4.633797E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 147000 to checkpoints_44m91b100m +0: [2023-03-17 06:38:44,839] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step147000 is begin to save! +0: [2023-03-17 06:38:44,842] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:38:44,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:38:44,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:38:44,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:38:44,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:38:44,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:38:44,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:38:44,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:38:44,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:38:44,944] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:38:44,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:38:44,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:38:44,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:38:44,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:38:44,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:38:44,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:38:44,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:38:44,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:38:44,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:38:44,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:38:44,977] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step147000/mp_rank_00_model_states.pt +0: [2023-03-17 06:38:44,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:38:44,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:38:44,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:38:45,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,020] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,020] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +2: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:38:45,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +6: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +4: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +5: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:38:45,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +3: [2023-03-17 06:38:45,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 06:38:45,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +1: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:38:45,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +7: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:38:45,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step147000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:38:45,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step147000 is ready now! +0: successfully saved checkpoint at iteration 147000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 198.20 +7: iteration 147010/ 173500 | consumed samples: 37634560 | consumed tokens: 77075578880 | elapsed time per iteration (s): 0.18 | learning rate: 3.036E-05 | global batch size: 256 | lm loss: 3.668072E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.511 | TFLOPs: 22.48 | +7: iteration 147020/ 173500 | consumed samples: 37637120 | consumed tokens: 77080821760 | elapsed time per iteration (s): 0.15 | learning rate: 3.035E-05 | global batch size: 256 | lm loss: 3.655306E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.360 | TFLOPs: 26.09 | +7: iteration 147030/ 173500 | consumed samples: 37639680 | consumed tokens: 77086064640 | elapsed time per iteration (s): 0.16 | learning rate: 3.034E-05 | global batch size: 256 | lm loss: 3.660624E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.517 | TFLOPs: 25.46 | +7: iteration 147040/ 173500 | consumed samples: 37642240 | consumed tokens: 77091307520 | elapsed time per iteration (s): 0.16 | learning rate: 3.034E-05 | global batch size: 256 | lm loss: 3.676435E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.362 | TFLOPs: 25.71 | +7: iteration 147050/ 173500 | consumed samples: 37644800 | consumed tokens: 77096550400 | elapsed time per iteration (s): 0.16 | learning rate: 3.033E-05 | global batch size: 256 | lm loss: 3.670027E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.981 | TFLOPs: 24.73 | +7: iteration 147060/ 173500 | consumed samples: 37647360 | consumed tokens: 77101793280 | elapsed time per iteration (s): 0.16 | learning rate: 3.032E-05 | global batch size: 256 | lm loss: 3.660803E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.072 | TFLOPs: 25.42 | +7: iteration 147070/ 173500 | consumed samples: 37649920 | consumed tokens: 77107036160 | elapsed time per iteration (s): 0.16 | learning rate: 3.031E-05 | global batch size: 256 | lm loss: 3.681736E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.839 | TFLOPs: 25.73 | +7: iteration 147080/ 173500 | consumed samples: 37652480 | consumed tokens: 77112279040 | elapsed time per iteration (s): 0.16 | learning rate: 3.031E-05 | global batch size: 256 | lm loss: 3.657421E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.479 | TFLOPs: 25.46 | +7: iteration 147090/ 173500 | consumed samples: 37655040 | consumed tokens: 77117521920 | elapsed time per iteration (s): 0.16 | learning rate: 3.030E-05 | global batch size: 256 | lm loss: 3.659733E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.273 | TFLOPs: 25.80 | +7: iteration 147100/ 173500 | consumed samples: 37657600 | consumed tokens: 77122764800 | elapsed time per iteration (s): 0.16 | learning rate: 3.029E-05 | global batch size: 256 | lm loss: 3.650756E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.926 | TFLOPs: 25.66 | +7: iteration 147110/ 173500 | consumed samples: 37660160 | consumed tokens: 77128007680 | elapsed time per iteration (s): 0.16 | learning rate: 3.028E-05 | global batch size: 256 | lm loss: 3.680021E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.435 | TFLOPs: 25.71 | +7: iteration 147120/ 173500 | consumed samples: 37662720 | consumed tokens: 77133250560 | elapsed time per iteration (s): 0.16 | learning rate: 3.027E-05 | global batch size: 256 | lm loss: 3.662358E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.351 | TFLOPs: 25.35 | +7: iteration 147130/ 173500 | consumed samples: 37665280 | consumed tokens: 77138493440 | elapsed time per iteration (s): 0.16 | learning rate: 3.027E-05 | global batch size: 256 | lm loss: 3.657198E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.434 | TFLOPs: 25.69 | +7: iteration 147140/ 173500 | consumed samples: 37667840 | consumed tokens: 77143736320 | elapsed time per iteration (s): 0.16 | learning rate: 3.026E-05 | global batch size: 256 | lm loss: 3.672047E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.760 | TFLOPs: 24.88 | +7: iteration 147150/ 173500 | consumed samples: 37670400 | consumed tokens: 77148979200 | elapsed time per iteration (s): 0.16 | learning rate: 3.025E-05 | global batch size: 256 | lm loss: 3.659743E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.390 | TFLOPs: 25.69 | +7: iteration 147160/ 173500 | consumed samples: 37672960 | consumed tokens: 77154222080 | elapsed time per iteration (s): 0.16 | learning rate: 3.024E-05 | global batch size: 256 | lm loss: 3.661108E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.603 | TFLOPs: 25.62 | +7: iteration 147170/ 173500 | consumed samples: 37675520 | consumed tokens: 77159464960 | elapsed time per iteration (s): 0.16 | learning rate: 3.024E-05 | global batch size: 256 | lm loss: 3.653321E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.849 | TFLOPs: 25.70 | +7: iteration 147180/ 173500 | consumed samples: 37678080 | consumed tokens: 77164707840 | elapsed time per iteration (s): 0.18 | learning rate: 3.023E-05 | global batch size: 256 | lm loss: 3.667230E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.786 | TFLOPs: 21.80 | +7: iteration 147190/ 173500 | consumed samples: 37680640 | consumed tokens: 77169950720 | elapsed time per iteration (s): 0.16 | learning rate: 3.022E-05 | global batch size: 256 | lm loss: 3.681192E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.969 | TFLOPs: 25.44 | +7: iteration 147200/ 173500 | consumed samples: 37683200 | consumed tokens: 77175193600 | elapsed time per iteration (s): 0.17 | learning rate: 3.021E-05 | global batch size: 256 | lm loss: 3.654181E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.008 | TFLOPs: 23.84 | +7: iteration 147210/ 173500 | consumed samples: 37685760 | consumed tokens: 77180436480 | elapsed time per iteration (s): 0.16 | learning rate: 3.021E-05 | global batch size: 256 | lm loss: 3.687252E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.989 | TFLOPs: 25.25 | +7: iteration 147220/ 173500 | consumed samples: 37688320 | consumed tokens: 77185679360 | elapsed time per iteration (s): 0.16 | learning rate: 3.020E-05 | global batch size: 256 | lm loss: 3.670829E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.169 | TFLOPs: 24.50 | +7: iteration 147230/ 173500 | consumed samples: 37690880 | consumed tokens: 77190922240 | elapsed time per iteration (s): 0.16 | learning rate: 3.019E-05 | global batch size: 256 | lm loss: 3.671456E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.723 | TFLOPs: 25.68 | +7: iteration 147240/ 173500 | consumed samples: 37693440 | consumed tokens: 77196165120 | elapsed time per iteration (s): 0.15 | learning rate: 3.018E-05 | global batch size: 256 | lm loss: 3.661205E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.632 | TFLOPs: 26.15 | +7: iteration 147250/ 173500 | consumed samples: 37696000 | consumed tokens: 77201408000 | elapsed time per iteration (s): 0.16 | learning rate: 3.018E-05 | global batch size: 256 | lm loss: 3.668071E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.325 | TFLOPs: 25.50 | +7: iteration 147260/ 173500 | consumed samples: 37698560 | consumed tokens: 77206650880 | elapsed time per iteration (s): 0.16 | learning rate: 3.017E-05 | global batch size: 256 | lm loss: 3.672964E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.303 | TFLOPs: 25.74 | +7: iteration 147270/ 173500 | consumed samples: 37701120 | consumed tokens: 77211893760 | elapsed time per iteration (s): 0.16 | learning rate: 3.016E-05 | global batch size: 256 | lm loss: 3.653052E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.437 | TFLOPs: 25.79 | +7: iteration 147280/ 173500 | consumed samples: 37703680 | consumed tokens: 77217136640 | elapsed time per iteration (s): 0.17 | learning rate: 3.015E-05 | global batch size: 256 | lm loss: 3.662849E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.516 | TFLOPs: 24.25 | +7: iteration 147290/ 173500 | consumed samples: 37706240 | consumed tokens: 77222379520 | elapsed time per iteration (s): 0.16 | learning rate: 3.015E-05 | global batch size: 256 | lm loss: 3.684542E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.707 | TFLOPs: 25.68 | +7: iteration 147300/ 173500 | consumed samples: 37708800 | consumed tokens: 77227622400 | elapsed time per iteration (s): 0.16 | learning rate: 3.014E-05 | global batch size: 256 | lm loss: 3.673780E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.863 | TFLOPs: 25.42 | +7: iteration 147310/ 173500 | consumed samples: 37711360 | consumed tokens: 77232865280 | elapsed time per iteration (s): 0.16 | learning rate: 3.013E-05 | global batch size: 256 | lm loss: 3.676018E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.236 | TFLOPs: 24.44 | +7: iteration 147320/ 173500 | consumed samples: 37713920 | consumed tokens: 77238108160 | elapsed time per iteration (s): 0.16 | learning rate: 3.012E-05 | global batch size: 256 | lm loss: 3.658746E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.546 | TFLOPs: 25.77 | +7: iteration 147330/ 173500 | consumed samples: 37716480 | consumed tokens: 77243351040 | elapsed time per iteration (s): 0.16 | learning rate: 3.011E-05 | global batch size: 256 | lm loss: 3.668148E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.412 | TFLOPs: 25.43 | +7: iteration 147340/ 173500 | consumed samples: 37719040 | consumed tokens: 77248593920 | elapsed time per iteration (s): 0.17 | learning rate: 3.011E-05 | global batch size: 256 | lm loss: 3.672988E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.053 | TFLOPs: 24.25 | +7: iteration 147350/ 173500 | consumed samples: 37721600 | consumed tokens: 77253836800 | elapsed time per iteration (s): 0.16 | learning rate: 3.010E-05 | global batch size: 256 | lm loss: 3.663905E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.689 | TFLOPs: 24.60 | +7: iteration 147360/ 173500 | consumed samples: 37724160 | consumed tokens: 77259079680 | elapsed time per iteration (s): 0.16 | learning rate: 3.009E-05 | global batch size: 256 | lm loss: 3.659314E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.468 | TFLOPs: 25.21 | +7: iteration 147370/ 173500 | consumed samples: 37726720 | consumed tokens: 77264322560 | elapsed time per iteration (s): 0.16 | learning rate: 3.008E-05 | global batch size: 256 | lm loss: 3.674564E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.868 | TFLOPs: 25.70 | +7: iteration 147380/ 173500 | consumed samples: 37729280 | consumed tokens: 77269565440 | elapsed time per iteration (s): 0.15 | learning rate: 3.008E-05 | global batch size: 256 | lm loss: 3.664000E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.697 | TFLOPs: 26.11 | +7: iteration 147390/ 173500 | consumed samples: 37731840 | consumed tokens: 77274808320 | elapsed time per iteration (s): 0.17 | learning rate: 3.007E-05 | global batch size: 256 | lm loss: 3.661380E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.717 | TFLOPs: 24.07 | +7: iteration 147400/ 173500 | consumed samples: 37734400 | consumed tokens: 77280051200 | elapsed time per iteration (s): 0.16 | learning rate: 3.006E-05 | global batch size: 256 | lm loss: 3.673602E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.168 | TFLOPs: 24.61 | +7: iteration 147410/ 173500 | consumed samples: 37736960 | consumed tokens: 77285294080 | elapsed time per iteration (s): 0.16 | learning rate: 3.005E-05 | global batch size: 256 | lm loss: 3.674911E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.618 | TFLOPs: 25.68 | +7: iteration 147420/ 173500 | consumed samples: 37739520 | consumed tokens: 77290536960 | elapsed time per iteration (s): 0.16 | learning rate: 3.005E-05 | global batch size: 256 | lm loss: 3.668335E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.667 | TFLOPs: 25.68 | +7: iteration 147430/ 173500 | consumed samples: 37742080 | consumed tokens: 77295779840 | elapsed time per iteration (s): 0.16 | learning rate: 3.004E-05 | global batch size: 256 | lm loss: 3.665393E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.078 | TFLOPs: 25.83 | +7: iteration 147440/ 173500 | consumed samples: 37744640 | consumed tokens: 77301022720 | elapsed time per iteration (s): 0.17 | learning rate: 3.003E-05 | global batch size: 256 | lm loss: 3.665278E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.702 | TFLOPs: 24.18 | +7: iteration 147450/ 173500 | consumed samples: 37747200 | consumed tokens: 77306265600 | elapsed time per iteration (s): 0.16 | learning rate: 3.002E-05 | global batch size: 256 | lm loss: 3.663103E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.279 | TFLOPs: 25.03 | +7: iteration 147460/ 173500 | consumed samples: 37749760 | consumed tokens: 77311508480 | elapsed time per iteration (s): 0.16 | learning rate: 3.002E-05 | global batch size: 256 | lm loss: 3.668610E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.121 | TFLOPs: 24.89 | +7: iteration 147470/ 173500 | consumed samples: 37752320 | consumed tokens: 77316751360 | elapsed time per iteration (s): 0.16 | learning rate: 3.001E-05 | global batch size: 256 | lm loss: 3.669728E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.520 | TFLOPs: 25.08 | +7: iteration 147480/ 173500 | consumed samples: 37754880 | consumed tokens: 77321994240 | elapsed time per iteration (s): 0.17 | learning rate: 3.000E-05 | global batch size: 256 | lm loss: 3.667445E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.411 | TFLOPs: 23.73 | +7: iteration 147490/ 173500 | consumed samples: 37757440 | consumed tokens: 77327237120 | elapsed time per iteration (s): 0.16 | learning rate: 2.999E-05 | global batch size: 256 | lm loss: 3.677003E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.005 | TFLOPs: 25.66 | +7: iteration 147500/ 173500 | consumed samples: 37760000 | consumed tokens: 77332480000 | elapsed time per iteration (s): 0.17 | learning rate: 2.999E-05 | global batch size: 256 | lm loss: 3.671303E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.190 | TFLOPs: 24.23 | +7: iteration 147510/ 173500 | consumed samples: 37762560 | consumed tokens: 77337722880 | elapsed time per iteration (s): 0.16 | learning rate: 2.998E-05 | global batch size: 256 | lm loss: 3.660142E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.168 | TFLOPs: 24.40 | +7: iteration 147520/ 173500 | consumed samples: 37765120 | consumed tokens: 77342965760 | elapsed time per iteration (s): 0.16 | learning rate: 2.997E-05 | global batch size: 256 | lm loss: 3.669155E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.174 | TFLOPs: 25.28 | +7: iteration 147530/ 173500 | consumed samples: 37767680 | consumed tokens: 77348208640 | elapsed time per iteration (s): 0.16 | learning rate: 2.996E-05 | global batch size: 256 | lm loss: 3.666436E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.953 | TFLOPs: 25.25 | +7: iteration 147540/ 173500 | consumed samples: 37770240 | consumed tokens: 77353451520 | elapsed time per iteration (s): 0.16 | learning rate: 2.996E-05 | global batch size: 256 | lm loss: 3.668679E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.626 | TFLOPs: 25.87 | +7: iteration 147550/ 173500 | consumed samples: 37772800 | consumed tokens: 77358694400 | elapsed time per iteration (s): 0.15 | learning rate: 2.995E-05 | global batch size: 256 | lm loss: 3.666682E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.242 | TFLOPs: 26.13 | +7: iteration 147560/ 173500 | consumed samples: 37775360 | consumed tokens: 77363937280 | elapsed time per iteration (s): 0.16 | learning rate: 2.994E-05 | global batch size: 256 | lm loss: 3.661591E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.464 | TFLOPs: 25.01 | +7: iteration 147570/ 173500 | consumed samples: 37777920 | consumed tokens: 77369180160 | elapsed time per iteration (s): 0.17 | learning rate: 2.993E-05 | global batch size: 256 | lm loss: 3.659543E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.595 | TFLOPs: 24.14 | +7: iteration 147580/ 173500 | consumed samples: 37780480 | consumed tokens: 77374423040 | elapsed time per iteration (s): 0.15 | learning rate: 2.993E-05 | global batch size: 256 | lm loss: 3.660577E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.348 | TFLOPs: 26.20 | +7: iteration 147590/ 173500 | consumed samples: 37783040 | consumed tokens: 77379665920 | elapsed time per iteration (s): 0.15 | learning rate: 2.992E-05 | global batch size: 256 | lm loss: 3.671534E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.563 | TFLOPs: 26.04 | +7: iteration 147600/ 173500 | consumed samples: 37785600 | consumed tokens: 77384908800 | elapsed time per iteration (s): 0.16 | learning rate: 2.991E-05 | global batch size: 256 | lm loss: 3.664474E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.321 | TFLOPs: 25.30 | +7: iteration 147610/ 173500 | consumed samples: 37788160 | consumed tokens: 77390151680 | elapsed time per iteration (s): 0.16 | learning rate: 2.990E-05 | global batch size: 256 | lm loss: 3.684624E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.053 | TFLOPs: 25.12 | +7: iteration 147620/ 173500 | consumed samples: 37790720 | consumed tokens: 77395394560 | elapsed time per iteration (s): 0.17 | learning rate: 2.990E-05 | global batch size: 256 | lm loss: 3.675711E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.823 | TFLOPs: 24.05 | +7: iteration 147630/ 173500 | consumed samples: 37793280 | consumed tokens: 77400637440 | elapsed time per iteration (s): 0.16 | learning rate: 2.989E-05 | global batch size: 256 | lm loss: 3.657788E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.884 | TFLOPs: 24.43 | +7: iteration 147640/ 173500 | consumed samples: 37795840 | consumed tokens: 77405880320 | elapsed time per iteration (s): 0.17 | learning rate: 2.988E-05 | global batch size: 256 | lm loss: 3.667997E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.675 | TFLOPs: 24.07 | +7: iteration 147650/ 173500 | consumed samples: 37798400 | consumed tokens: 77411123200 | elapsed time per iteration (s): 0.16 | learning rate: 2.987E-05 | global batch size: 256 | lm loss: 3.667608E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.908 | TFLOPs: 24.89 | +7: iteration 147660/ 173500 | consumed samples: 37800960 | consumed tokens: 77416366080 | elapsed time per iteration (s): 0.15 | learning rate: 2.987E-05 | global batch size: 256 | lm loss: 3.676504E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.101 | TFLOPs: 26.08 | +7: iteration 147670/ 173500 | consumed samples: 37803520 | consumed tokens: 77421608960 | elapsed time per iteration (s): 0.16 | learning rate: 2.986E-05 | global batch size: 256 | lm loss: 3.673706E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.851 | TFLOPs: 25.50 | +7: iteration 147680/ 173500 | consumed samples: 37806080 | consumed tokens: 77426851840 | elapsed time per iteration (s): 0.16 | learning rate: 2.985E-05 | global batch size: 256 | lm loss: 3.666481E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.000 | TFLOPs: 25.42 | +7: iteration 147690/ 173500 | consumed samples: 37808640 | consumed tokens: 77432094720 | elapsed time per iteration (s): 0.17 | learning rate: 2.984E-05 | global batch size: 256 | lm loss: 3.688480E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1484.867 | TFLOPs: 23.29 | +7: iteration 147700/ 173500 | consumed samples: 37811200 | consumed tokens: 77437337600 | elapsed time per iteration (s): 0.16 | learning rate: 2.984E-05 | global batch size: 256 | lm loss: 3.667796E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.910 | TFLOPs: 25.15 | +7: iteration 147710/ 173500 | consumed samples: 37813760 | consumed tokens: 77442580480 | elapsed time per iteration (s): 0.15 | learning rate: 2.983E-05 | global batch size: 256 | lm loss: 3.665053E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.471 | TFLOPs: 26.18 | +7: iteration 147720/ 173500 | consumed samples: 37816320 | consumed tokens: 77447823360 | elapsed time per iteration (s): 0.15 | learning rate: 2.982E-05 | global batch size: 256 | lm loss: 3.671763E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.182 | TFLOPs: 26.22 | +7: iteration 147730/ 173500 | consumed samples: 37818880 | consumed tokens: 77453066240 | elapsed time per iteration (s): 0.16 | learning rate: 2.981E-05 | global batch size: 256 | lm loss: 3.686109E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.021 | TFLOPs: 25.23 | +7: iteration 147740/ 173500 | consumed samples: 37821440 | consumed tokens: 77458309120 | elapsed time per iteration (s): 0.16 | learning rate: 2.981E-05 | global batch size: 256 | lm loss: 3.661287E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.257 | TFLOPs: 25.83 | +7: iteration 147750/ 173500 | consumed samples: 37824000 | consumed tokens: 77463552000 | elapsed time per iteration (s): 0.16 | learning rate: 2.980E-05 | global batch size: 256 | lm loss: 3.658372E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.405 | TFLOPs: 24.36 | +7: iteration 147760/ 173500 | consumed samples: 37826560 | consumed tokens: 77468794880 | elapsed time per iteration (s): 0.16 | learning rate: 2.979E-05 | global batch size: 256 | lm loss: 3.659800E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.457 | TFLOPs: 25.57 | +7: iteration 147770/ 173500 | consumed samples: 37829120 | consumed tokens: 77474037760 | elapsed time per iteration (s): 0.16 | learning rate: 2.978E-05 | global batch size: 256 | lm loss: 3.663453E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.821 | TFLOPs: 25.40 | +7: iteration 147780/ 173500 | consumed samples: 37831680 | consumed tokens: 77479280640 | elapsed time per iteration (s): 0.16 | learning rate: 2.978E-05 | global batch size: 256 | lm loss: 3.682799E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.274 | TFLOPs: 24.56 | +7: iteration 147790/ 173500 | consumed samples: 37834240 | consumed tokens: 77484523520 | elapsed time per iteration (s): 0.16 | learning rate: 2.977E-05 | global batch size: 256 | lm loss: 3.653794E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.230 | TFLOPs: 25.77 | +7: iteration 147800/ 173500 | consumed samples: 37836800 | consumed tokens: 77489766400 | elapsed time per iteration (s): 0.16 | learning rate: 2.976E-05 | global batch size: 256 | lm loss: 3.666789E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.473 | TFLOPs: 25.63 | +7: iteration 147810/ 173500 | consumed samples: 37839360 | consumed tokens: 77495009280 | elapsed time per iteration (s): 0.16 | learning rate: 2.975E-05 | global batch size: 256 | lm loss: 3.661978E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.719 | TFLOPs: 24.59 | +7: iteration 147820/ 173500 | consumed samples: 37841920 | consumed tokens: 77500252160 | elapsed time per iteration (s): 0.16 | learning rate: 2.975E-05 | global batch size: 256 | lm loss: 3.672604E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.962 | TFLOPs: 24.90 | +7: iteration 147830/ 173500 | consumed samples: 37844480 | consumed tokens: 77505495040 | elapsed time per iteration (s): 0.16 | learning rate: 2.974E-05 | global batch size: 256 | lm loss: 3.658171E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.921 | TFLOPs: 25.26 | +7: iteration 147840/ 173500 | consumed samples: 37847040 | consumed tokens: 77510737920 | elapsed time per iteration (s): 0.16 | learning rate: 2.973E-05 | global batch size: 256 | lm loss: 3.657964E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.690 | TFLOPs: 25.84 | +7: iteration 147850/ 173500 | consumed samples: 37849600 | consumed tokens: 77515980800 | elapsed time per iteration (s): 0.16 | learning rate: 2.972E-05 | global batch size: 256 | lm loss: 3.651434E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.791 | TFLOPs: 25.59 | +7: iteration 147860/ 173500 | consumed samples: 37852160 | consumed tokens: 77521223680 | elapsed time per iteration (s): 0.16 | learning rate: 2.972E-05 | global batch size: 256 | lm loss: 3.669508E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.471 | TFLOPs: 25.41 | +7: iteration 147870/ 173500 | consumed samples: 37854720 | consumed tokens: 77526466560 | elapsed time per iteration (s): 0.16 | learning rate: 2.971E-05 | global batch size: 256 | lm loss: 3.673705E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.124 | TFLOPs: 25.20 | +7: iteration 147880/ 173500 | consumed samples: 37857280 | consumed tokens: 77531709440 | elapsed time per iteration (s): 0.16 | learning rate: 2.970E-05 | global batch size: 256 | lm loss: 3.656147E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.137 | TFLOPs: 25.00 | +7: iteration 147890/ 173500 | consumed samples: 37859840 | consumed tokens: 77536952320 | elapsed time per iteration (s): 0.16 | learning rate: 2.969E-05 | global batch size: 256 | lm loss: 3.660150E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.487 | TFLOPs: 25.77 | +7: iteration 147900/ 173500 | consumed samples: 37862400 | consumed tokens: 77542195200 | elapsed time per iteration (s): 0.16 | learning rate: 2.969E-05 | global batch size: 256 | lm loss: 3.661433E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.184 | TFLOPs: 24.61 | +7: iteration 147910/ 173500 | consumed samples: 37864960 | consumed tokens: 77547438080 | elapsed time per iteration (s): 0.16 | learning rate: 2.968E-05 | global batch size: 256 | lm loss: 3.676275E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.799 | TFLOPs: 25.20 | +7: iteration 147920/ 173500 | consumed samples: 37867520 | consumed tokens: 77552680960 | elapsed time per iteration (s): 0.16 | learning rate: 2.967E-05 | global batch size: 256 | lm loss: 3.675553E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.439 | TFLOPs: 25.29 | +7: iteration 147930/ 173500 | consumed samples: 37870080 | consumed tokens: 77557923840 | elapsed time per iteration (s): 0.16 | learning rate: 2.966E-05 | global batch size: 256 | lm loss: 3.669944E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.099 | TFLOPs: 25.44 | +7: iteration 147940/ 173500 | consumed samples: 37872640 | consumed tokens: 77563166720 | elapsed time per iteration (s): 0.16 | learning rate: 2.966E-05 | global batch size: 256 | lm loss: 3.663264E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.837 | TFLOPs: 24.54 | +7: iteration 147950/ 173500 | consumed samples: 37875200 | consumed tokens: 77568409600 | elapsed time per iteration (s): 0.16 | learning rate: 2.965E-05 | global batch size: 256 | lm loss: 3.667922E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.377 | TFLOPs: 24.63 | +7: iteration 147960/ 173500 | consumed samples: 37877760 | consumed tokens: 77573652480 | elapsed time per iteration (s): 0.16 | learning rate: 2.964E-05 | global batch size: 256 | lm loss: 3.649400E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.162 | TFLOPs: 24.44 | +7: iteration 147970/ 173500 | consumed samples: 37880320 | consumed tokens: 77578895360 | elapsed time per iteration (s): 0.16 | learning rate: 2.964E-05 | global batch size: 256 | lm loss: 3.667231E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.454 | TFLOPs: 24.39 | +7: iteration 147980/ 173500 | consumed samples: 37882880 | consumed tokens: 77584138240 | elapsed time per iteration (s): 0.16 | learning rate: 2.963E-05 | global batch size: 256 | lm loss: 3.672120E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.811 | TFLOPs: 24.70 | +7: iteration 147990/ 173500 | consumed samples: 37885440 | consumed tokens: 77589381120 | elapsed time per iteration (s): 0.15 | learning rate: 2.962E-05 | global batch size: 256 | lm loss: 3.661123E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.289 | TFLOPs: 25.97 | +0: [2023-03-17 06:41:24,641] [INFO] [logging.py:68:log_dist] [Rank 0] step=148000, skipped=0, lr=[2.9612854264054498e-05, 2.9612854264054498e-05, 2.9612854264054498e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 148000/ 173500 | consumed samples: 37888000 | consumed tokens: 77594624000 | elapsed time per iteration (s): 0.17 | learning rate: 2.961E-05 | global batch size: 256 | lm loss: 3.673998E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.789 | TFLOPs: 24.19 | +0: steps: 148000 loss: 3.6770 iter time (s): 0.158 samples/sec: 1623.040 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 148000 | lm loss value: 3.822907E+00 | lm loss PPL: 4.573698E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 148000 to checkpoints_44m91b100m +0: [2023-03-17 06:41:24,727] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step148000 is begin to save! +0: [2023-03-17 06:41:24,731] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:41:24,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:41:24,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:41:24,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:41:24,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:41:24,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:41:24,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:41:24,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:41:24,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:41:24,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:41:24,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:41:24,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:41:24,838] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:41:24,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:41:24,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:41:24,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:41:24,854] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:41:24,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:41:24,862] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:41:24,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:41:24,863] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step148000/mp_rank_00_model_states.pt +0: [2023-03-17 06:41:24,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:41:24,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:41:24,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:41:24,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +6: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +2: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +5: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +1: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +4: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: [2023-03-17 06:41:24,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:41:24,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:41:24,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +7: [2023-03-17 06:41:24,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:41:24,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step148000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:41:24,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step148000 is ready now! +0: successfully saved checkpoint at iteration 148000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 183.97 +7: iteration 148010/ 173500 | consumed samples: 37890560 | consumed tokens: 77599866880 | elapsed time per iteration (s): 0.18 | learning rate: 2.961E-05 | global batch size: 256 | lm loss: 3.660646E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.020 | TFLOPs: 21.77 | +7: iteration 148020/ 173500 | consumed samples: 37893120 | consumed tokens: 77605109760 | elapsed time per iteration (s): 0.16 | learning rate: 2.960E-05 | global batch size: 256 | lm loss: 3.679669E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.091 | TFLOPs: 25.33 | +7: iteration 148030/ 173500 | consumed samples: 37895680 | consumed tokens: 77610352640 | elapsed time per iteration (s): 0.16 | learning rate: 2.959E-05 | global batch size: 256 | lm loss: 3.662307E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.493 | TFLOPs: 25.30 | +7: iteration 148040/ 173500 | consumed samples: 37898240 | consumed tokens: 77615595520 | elapsed time per iteration (s): 0.17 | learning rate: 2.958E-05 | global batch size: 256 | lm loss: 3.675821E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.518 | TFLOPs: 24.32 | +7: iteration 148050/ 173500 | consumed samples: 37900800 | consumed tokens: 77620838400 | elapsed time per iteration (s): 0.16 | learning rate: 2.958E-05 | global batch size: 256 | lm loss: 3.670840E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.061 | TFLOPs: 25.31 | +7: iteration 148060/ 173500 | consumed samples: 37903360 | consumed tokens: 77626081280 | elapsed time per iteration (s): 0.17 | learning rate: 2.957E-05 | global batch size: 256 | lm loss: 3.665423E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.083 | TFLOPs: 23.43 | +7: iteration 148070/ 173500 | consumed samples: 37905920 | consumed tokens: 77631324160 | elapsed time per iteration (s): 0.15 | learning rate: 2.956E-05 | global batch size: 256 | lm loss: 3.661740E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.909 | TFLOPs: 25.97 | +7: iteration 148080/ 173500 | consumed samples: 37908480 | consumed tokens: 77636567040 | elapsed time per iteration (s): 0.16 | learning rate: 2.955E-05 | global batch size: 256 | lm loss: 3.671558E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.453 | TFLOPs: 25.04 | +7: iteration 148090/ 173500 | consumed samples: 37911040 | consumed tokens: 77641809920 | elapsed time per iteration (s): 0.16 | learning rate: 2.955E-05 | global batch size: 256 | lm loss: 3.677187E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.851 | TFLOPs: 24.92 | +7: iteration 148100/ 173500 | consumed samples: 37913600 | consumed tokens: 77647052800 | elapsed time per iteration (s): 0.16 | learning rate: 2.954E-05 | global batch size: 256 | lm loss: 3.663524E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.416 | TFLOPs: 25.79 | +7: iteration 148110/ 173500 | consumed samples: 37916160 | consumed tokens: 77652295680 | elapsed time per iteration (s): 0.15 | learning rate: 2.953E-05 | global batch size: 256 | lm loss: 3.671616E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.479 | TFLOPs: 26.23 | +7: iteration 148120/ 173500 | consumed samples: 37918720 | consumed tokens: 77657538560 | elapsed time per iteration (s): 0.16 | learning rate: 2.952E-05 | global batch size: 256 | lm loss: 3.669563E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.909 | TFLOPs: 25.72 | +7: iteration 148130/ 173500 | consumed samples: 37921280 | consumed tokens: 77662781440 | elapsed time per iteration (s): 0.16 | learning rate: 2.952E-05 | global batch size: 256 | lm loss: 3.661331E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.147 | TFLOPs: 25.28 | +7: iteration 148140/ 173500 | consumed samples: 37923840 | consumed tokens: 77668024320 | elapsed time per iteration (s): 0.16 | learning rate: 2.951E-05 | global batch size: 256 | lm loss: 3.656396E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.503 | TFLOPs: 25.52 | +7: iteration 148150/ 173500 | consumed samples: 37926400 | consumed tokens: 77673267200 | elapsed time per iteration (s): 0.16 | learning rate: 2.950E-05 | global batch size: 256 | lm loss: 3.665988E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.155 | TFLOPs: 25.42 | +7: iteration 148160/ 173500 | consumed samples: 37928960 | consumed tokens: 77678510080 | elapsed time per iteration (s): 0.16 | learning rate: 2.949E-05 | global batch size: 256 | lm loss: 3.654528E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.948 | TFLOPs: 25.08 | +7: iteration 148170/ 173500 | consumed samples: 37931520 | consumed tokens: 77683752960 | elapsed time per iteration (s): 0.16 | learning rate: 2.949E-05 | global batch size: 256 | lm loss: 3.663468E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.899 | TFLOPs: 25.47 | +7: iteration 148180/ 173500 | consumed samples: 37934080 | consumed tokens: 77688995840 | elapsed time per iteration (s): 0.16 | learning rate: 2.948E-05 | global batch size: 256 | lm loss: 3.668223E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.183 | TFLOPs: 24.77 | +7: iteration 148190/ 173500 | consumed samples: 37936640 | consumed tokens: 77694238720 | elapsed time per iteration (s): 0.16 | learning rate: 2.947E-05 | global batch size: 256 | lm loss: 3.675206E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.116 | TFLOPs: 25.16 | +7: iteration 148200/ 173500 | consumed samples: 37939200 | consumed tokens: 77699481600 | elapsed time per iteration (s): 0.16 | learning rate: 2.947E-05 | global batch size: 256 | lm loss: 3.666881E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.767 | TFLOPs: 25.62 | +7: iteration 148210/ 173500 | consumed samples: 37941760 | consumed tokens: 77704724480 | elapsed time per iteration (s): 0.16 | learning rate: 2.946E-05 | global batch size: 256 | lm loss: 3.659454E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.997 | TFLOPs: 25.39 | +7: iteration 148220/ 173500 | consumed samples: 37944320 | consumed tokens: 77709967360 | elapsed time per iteration (s): 0.16 | learning rate: 2.945E-05 | global batch size: 256 | lm loss: 3.649317E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.807 | TFLOPs: 24.59 | +7: iteration 148230/ 173500 | consumed samples: 37946880 | consumed tokens: 77715210240 | elapsed time per iteration (s): 0.16 | learning rate: 2.944E-05 | global batch size: 256 | lm loss: 3.662648E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.570 | TFLOPs: 24.65 | +7: iteration 148240/ 173500 | consumed samples: 37949440 | consumed tokens: 77720453120 | elapsed time per iteration (s): 0.15 | learning rate: 2.944E-05 | global batch size: 256 | lm loss: 3.663396E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.412 | TFLOPs: 26.23 | +7: iteration 148250/ 173500 | consumed samples: 37952000 | consumed tokens: 77725696000 | elapsed time per iteration (s): 0.16 | learning rate: 2.943E-05 | global batch size: 256 | lm loss: 3.659638E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.233 | TFLOPs: 25.53 | +7: iteration 148260/ 173500 | consumed samples: 37954560 | consumed tokens: 77730938880 | elapsed time per iteration (s): 0.16 | learning rate: 2.942E-05 | global batch size: 256 | lm loss: 3.659945E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.383 | TFLOPs: 25.90 | +7: iteration 148270/ 173500 | consumed samples: 37957120 | consumed tokens: 77736181760 | elapsed time per iteration (s): 0.16 | learning rate: 2.941E-05 | global batch size: 256 | lm loss: 3.660299E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.351 | TFLOPs: 25.72 | +7: iteration 148280/ 173500 | consumed samples: 37959680 | consumed tokens: 77741424640 | elapsed time per iteration (s): 0.16 | learning rate: 2.941E-05 | global batch size: 256 | lm loss: 3.675271E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.246 | TFLOPs: 24.95 | +7: iteration 148290/ 173500 | consumed samples: 37962240 | consumed tokens: 77746667520 | elapsed time per iteration (s): 0.16 | learning rate: 2.940E-05 | global batch size: 256 | lm loss: 3.664389E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.676 | TFLOPs: 25.59 | +7: iteration 148300/ 173500 | consumed samples: 37964800 | consumed tokens: 77751910400 | elapsed time per iteration (s): 0.16 | learning rate: 2.939E-05 | global batch size: 256 | lm loss: 3.663614E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.803 | TFLOPs: 25.54 | +7: iteration 148310/ 173500 | consumed samples: 37967360 | consumed tokens: 77757153280 | elapsed time per iteration (s): 0.16 | learning rate: 2.938E-05 | global batch size: 256 | lm loss: 3.666303E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.464 | TFLOPs: 25.65 | +7: iteration 148320/ 173500 | consumed samples: 37969920 | consumed tokens: 77762396160 | elapsed time per iteration (s): 0.16 | learning rate: 2.938E-05 | global batch size: 256 | lm loss: 3.666558E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.286 | TFLOPs: 25.00 | +7: iteration 148330/ 173500 | consumed samples: 37972480 | consumed tokens: 77767639040 | elapsed time per iteration (s): 0.16 | learning rate: 2.937E-05 | global batch size: 256 | lm loss: 3.644691E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.230 | TFLOPs: 25.90 | +7: iteration 148340/ 173500 | consumed samples: 37975040 | consumed tokens: 77772881920 | elapsed time per iteration (s): 0.16 | learning rate: 2.936E-05 | global batch size: 256 | lm loss: 3.669136E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.658 | TFLOPs: 25.24 | +7: iteration 148350/ 173500 | consumed samples: 37977600 | consumed tokens: 77778124800 | elapsed time per iteration (s): 0.16 | learning rate: 2.936E-05 | global batch size: 256 | lm loss: 3.659063E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.949 | TFLOPs: 25.88 | +7: iteration 148360/ 173500 | consumed samples: 37980160 | consumed tokens: 77783367680 | elapsed time per iteration (s): 0.16 | learning rate: 2.935E-05 | global batch size: 256 | lm loss: 3.661354E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.784 | TFLOPs: 25.10 | +7: iteration 148370/ 173500 | consumed samples: 37982720 | consumed tokens: 77788610560 | elapsed time per iteration (s): 0.15 | learning rate: 2.934E-05 | global batch size: 256 | lm loss: 3.661107E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.995 | TFLOPs: 26.28 | +7: iteration 148380/ 173500 | consumed samples: 37985280 | consumed tokens: 77793853440 | elapsed time per iteration (s): 0.16 | learning rate: 2.933E-05 | global batch size: 256 | lm loss: 3.671465E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.355 | TFLOPs: 25.13 | +7: iteration 148390/ 173500 | consumed samples: 37987840 | consumed tokens: 77799096320 | elapsed time per iteration (s): 0.17 | learning rate: 2.933E-05 | global batch size: 256 | lm loss: 3.675739E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.967 | TFLOPs: 23.82 | +7: iteration 148400/ 173500 | consumed samples: 37990400 | consumed tokens: 77804339200 | elapsed time per iteration (s): 0.15 | learning rate: 2.932E-05 | global batch size: 256 | lm loss: 3.669768E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.659 | TFLOPs: 26.06 | +7: iteration 148410/ 173500 | consumed samples: 37992960 | consumed tokens: 77809582080 | elapsed time per iteration (s): 0.16 | learning rate: 2.931E-05 | global batch size: 256 | lm loss: 3.659887E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.412 | TFLOPs: 25.24 | +7: iteration 148420/ 173500 | consumed samples: 37995520 | consumed tokens: 77814824960 | elapsed time per iteration (s): 0.16 | learning rate: 2.930E-05 | global batch size: 256 | lm loss: 3.665143E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.697 | TFLOPs: 25.76 | +7: iteration 148430/ 173500 | consumed samples: 37998080 | consumed tokens: 77820067840 | elapsed time per iteration (s): 0.17 | learning rate: 2.930E-05 | global batch size: 256 | lm loss: 3.662684E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.677 | TFLOPs: 23.46 | +7: iteration 148440/ 173500 | consumed samples: 38000640 | consumed tokens: 77825310720 | elapsed time per iteration (s): 0.17 | learning rate: 2.929E-05 | global batch size: 256 | lm loss: 3.659829E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.217 | TFLOPs: 24.06 | +7: iteration 148450/ 173500 | consumed samples: 38003200 | consumed tokens: 77830553600 | elapsed time per iteration (s): 0.16 | learning rate: 2.928E-05 | global batch size: 256 | lm loss: 3.661987E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.401 | TFLOPs: 25.74 | +7: iteration 148460/ 173500 | consumed samples: 38005760 | consumed tokens: 77835796480 | elapsed time per iteration (s): 0.16 | learning rate: 2.928E-05 | global batch size: 256 | lm loss: 3.665866E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.753 | TFLOPs: 25.62 | +7: iteration 148470/ 173500 | consumed samples: 38008320 | consumed tokens: 77841039360 | elapsed time per iteration (s): 0.15 | learning rate: 2.927E-05 | global batch size: 256 | lm loss: 3.662645E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.425 | TFLOPs: 26.24 | +7: iteration 148480/ 173500 | consumed samples: 38010880 | consumed tokens: 77846282240 | elapsed time per iteration (s): 0.16 | learning rate: 2.926E-05 | global batch size: 256 | lm loss: 3.662149E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.689 | TFLOPs: 25.60 | +7: iteration 148490/ 173500 | consumed samples: 38013440 | consumed tokens: 77851525120 | elapsed time per iteration (s): 0.15 | learning rate: 2.925E-05 | global batch size: 256 | lm loss: 3.656705E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.063 | TFLOPs: 26.22 | +7: iteration 148500/ 173500 | consumed samples: 38016000 | consumed tokens: 77856768000 | elapsed time per iteration (s): 0.16 | learning rate: 2.925E-05 | global batch size: 256 | lm loss: 3.665665E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.543 | TFLOPs: 24.66 | +7: iteration 148510/ 173500 | consumed samples: 38018560 | consumed tokens: 77862010880 | elapsed time per iteration (s): 0.16 | learning rate: 2.924E-05 | global batch size: 256 | lm loss: 3.655966E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.983 | TFLOPs: 24.68 | +7: iteration 148520/ 173500 | consumed samples: 38021120 | consumed tokens: 77867253760 | elapsed time per iteration (s): 0.16 | learning rate: 2.923E-05 | global batch size: 256 | lm loss: 3.660036E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.018 | TFLOPs: 24.73 | +7: iteration 148530/ 173500 | consumed samples: 38023680 | consumed tokens: 77872496640 | elapsed time per iteration (s): 0.17 | learning rate: 2.922E-05 | global batch size: 256 | lm loss: 3.659721E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.221 | TFLOPs: 24.33 | +7: iteration 148540/ 173500 | consumed samples: 38026240 | consumed tokens: 77877739520 | elapsed time per iteration (s): 0.16 | learning rate: 2.922E-05 | global batch size: 256 | lm loss: 3.669897E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.438 | TFLOPs: 25.57 | +7: iteration 148550/ 173500 | consumed samples: 38028800 | consumed tokens: 77882982400 | elapsed time per iteration (s): 0.16 | learning rate: 2.921E-05 | global batch size: 256 | lm loss: 3.666370E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.985 | TFLOPs: 24.97 | +7: iteration 148560/ 173500 | consumed samples: 38031360 | consumed tokens: 77888225280 | elapsed time per iteration (s): 0.16 | learning rate: 2.920E-05 | global batch size: 256 | lm loss: 3.662936E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.468 | TFLOPs: 25.46 | +7: iteration 148570/ 173500 | consumed samples: 38033920 | consumed tokens: 77893468160 | elapsed time per iteration (s): 0.16 | learning rate: 2.920E-05 | global batch size: 256 | lm loss: 3.667008E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.227 | TFLOPs: 25.10 | +7: iteration 148580/ 173500 | consumed samples: 38036480 | consumed tokens: 77898711040 | elapsed time per iteration (s): 0.16 | learning rate: 2.919E-05 | global batch size: 256 | lm loss: 3.669470E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.715 | TFLOPs: 25.06 | +7: iteration 148590/ 173500 | consumed samples: 38039040 | consumed tokens: 77903953920 | elapsed time per iteration (s): 0.16 | learning rate: 2.918E-05 | global batch size: 256 | lm loss: 3.682456E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.986 | TFLOPs: 25.45 | +7: iteration 148600/ 173500 | consumed samples: 38041600 | consumed tokens: 77909196800 | elapsed time per iteration (s): 0.17 | learning rate: 2.917E-05 | global batch size: 256 | lm loss: 3.670561E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.941 | TFLOPs: 23.98 | +7: iteration 148610/ 173500 | consumed samples: 38044160 | consumed tokens: 77914439680 | elapsed time per iteration (s): 0.15 | learning rate: 2.917E-05 | global batch size: 256 | lm loss: 3.670311E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.547 | TFLOPs: 25.95 | +7: iteration 148620/ 173500 | consumed samples: 38046720 | consumed tokens: 77919682560 | elapsed time per iteration (s): 0.15 | learning rate: 2.916E-05 | global batch size: 256 | lm loss: 3.667954E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.534 | TFLOPs: 26.10 | +7: iteration 148630/ 173500 | consumed samples: 38049280 | consumed tokens: 77924925440 | elapsed time per iteration (s): 0.16 | learning rate: 2.915E-05 | global batch size: 256 | lm loss: 3.652456E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.207 | TFLOPs: 25.61 | +7: iteration 148640/ 173500 | consumed samples: 38051840 | consumed tokens: 77930168320 | elapsed time per iteration (s): 0.16 | learning rate: 2.914E-05 | global batch size: 256 | lm loss: 3.666568E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.403 | TFLOPs: 25.51 | +7: iteration 148650/ 173500 | consumed samples: 38054400 | consumed tokens: 77935411200 | elapsed time per iteration (s): 0.15 | learning rate: 2.914E-05 | global batch size: 256 | lm loss: 3.660098E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.257 | TFLOPs: 26.23 | +7: iteration 148660/ 173500 | consumed samples: 38056960 | consumed tokens: 77940654080 | elapsed time per iteration (s): 0.16 | learning rate: 2.913E-05 | global batch size: 256 | lm loss: 3.654053E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.313 | TFLOPs: 25.52 | +7: iteration 148670/ 173500 | consumed samples: 38059520 | consumed tokens: 77945896960 | elapsed time per iteration (s): 0.16 | learning rate: 2.912E-05 | global batch size: 256 | lm loss: 3.665836E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.517 | TFLOPs: 25.52 | +7: iteration 148680/ 173500 | consumed samples: 38062080 | consumed tokens: 77951139840 | elapsed time per iteration (s): 0.16 | learning rate: 2.912E-05 | global batch size: 256 | lm loss: 3.659568E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.371 | TFLOPs: 25.85 | +7: iteration 148690/ 173500 | consumed samples: 38064640 | consumed tokens: 77956382720 | elapsed time per iteration (s): 0.16 | learning rate: 2.911E-05 | global batch size: 256 | lm loss: 3.681420E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.745 | TFLOPs: 25.75 | +7: iteration 148700/ 173500 | consumed samples: 38067200 | consumed tokens: 77961625600 | elapsed time per iteration (s): 0.16 | learning rate: 2.910E-05 | global batch size: 256 | lm loss: 3.661187E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.020 | TFLOPs: 25.26 | +7: iteration 148710/ 173500 | consumed samples: 38069760 | consumed tokens: 77966868480 | elapsed time per iteration (s): 0.17 | learning rate: 2.909E-05 | global batch size: 256 | lm loss: 3.657011E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1518.546 | TFLOPs: 23.81 | +7: iteration 148720/ 173500 | consumed samples: 38072320 | consumed tokens: 77972111360 | elapsed time per iteration (s): 0.16 | learning rate: 2.909E-05 | global batch size: 256 | lm loss: 3.665462E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.782 | TFLOPs: 25.46 | +7: iteration 148730/ 173500 | consumed samples: 38074880 | consumed tokens: 77977354240 | elapsed time per iteration (s): 0.16 | learning rate: 2.908E-05 | global batch size: 256 | lm loss: 3.665419E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.277 | TFLOPs: 24.91 | +7: iteration 148740/ 173500 | consumed samples: 38077440 | consumed tokens: 77982597120 | elapsed time per iteration (s): 0.15 | learning rate: 2.907E-05 | global batch size: 256 | lm loss: 3.679495E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.393 | TFLOPs: 26.05 | +7: iteration 148750/ 173500 | consumed samples: 38080000 | consumed tokens: 77987840000 | elapsed time per iteration (s): 0.16 | learning rate: 2.907E-05 | global batch size: 256 | lm loss: 3.664777E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.543 | TFLOPs: 25.76 | +7: iteration 148760/ 173500 | consumed samples: 38082560 | consumed tokens: 77993082880 | elapsed time per iteration (s): 0.17 | learning rate: 2.906E-05 | global batch size: 256 | lm loss: 3.661950E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.755 | TFLOPs: 23.93 | +7: iteration 148770/ 173500 | consumed samples: 38085120 | consumed tokens: 77998325760 | elapsed time per iteration (s): 0.16 | learning rate: 2.905E-05 | global batch size: 256 | lm loss: 3.661117E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.342 | TFLOPs: 24.60 | +7: iteration 148780/ 173500 | consumed samples: 38087680 | consumed tokens: 78003568640 | elapsed time per iteration (s): 0.15 | learning rate: 2.904E-05 | global batch size: 256 | lm loss: 3.652641E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.517 | TFLOPs: 26.10 | +7: iteration 148790/ 173500 | consumed samples: 38090240 | consumed tokens: 78008811520 | elapsed time per iteration (s): 0.16 | learning rate: 2.904E-05 | global batch size: 256 | lm loss: 3.652942E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.660 | TFLOPs: 25.54 | +7: iteration 148800/ 173500 | consumed samples: 38092800 | consumed tokens: 78014054400 | elapsed time per iteration (s): 0.17 | learning rate: 2.903E-05 | global batch size: 256 | lm loss: 3.664237E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1502.520 | TFLOPs: 23.56 | +7: iteration 148810/ 173500 | consumed samples: 38095360 | consumed tokens: 78019297280 | elapsed time per iteration (s): 0.15 | learning rate: 2.902E-05 | global batch size: 256 | lm loss: 3.661086E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.063 | TFLOPs: 26.27 | +7: iteration 148820/ 173500 | consumed samples: 38097920 | consumed tokens: 78024540160 | elapsed time per iteration (s): 0.16 | learning rate: 2.901E-05 | global batch size: 256 | lm loss: 3.650971E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.870 | TFLOPs: 24.92 | +7: iteration 148830/ 173500 | consumed samples: 38100480 | consumed tokens: 78029783040 | elapsed time per iteration (s): 0.16 | learning rate: 2.901E-05 | global batch size: 256 | lm loss: 3.672096E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.416 | TFLOPs: 25.24 | +7: iteration 148840/ 173500 | consumed samples: 38103040 | consumed tokens: 78035025920 | elapsed time per iteration (s): 0.17 | learning rate: 2.900E-05 | global batch size: 256 | lm loss: 3.653590E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.400 | TFLOPs: 24.17 | +7: iteration 148850/ 173500 | consumed samples: 38105600 | consumed tokens: 78040268800 | elapsed time per iteration (s): 0.16 | learning rate: 2.899E-05 | global batch size: 256 | lm loss: 3.669898E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.220 | TFLOPs: 25.80 | +7: iteration 148860/ 173500 | consumed samples: 38108160 | consumed tokens: 78045511680 | elapsed time per iteration (s): 0.16 | learning rate: 2.899E-05 | global batch size: 256 | lm loss: 3.667532E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.736 | TFLOPs: 25.61 | +7: iteration 148870/ 173500 | consumed samples: 38110720 | consumed tokens: 78050754560 | elapsed time per iteration (s): 0.16 | learning rate: 2.898E-05 | global batch size: 256 | lm loss: 3.665939E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.821 | TFLOPs: 25.21 | +7: iteration 148880/ 173500 | consumed samples: 38113280 | consumed tokens: 78055997440 | elapsed time per iteration (s): 0.17 | learning rate: 2.897E-05 | global batch size: 256 | lm loss: 3.679438E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.534 | TFLOPs: 23.88 | +7: iteration 148890/ 173500 | consumed samples: 38115840 | consumed tokens: 78061240320 | elapsed time per iteration (s): 0.17 | learning rate: 2.896E-05 | global batch size: 256 | lm loss: 3.663294E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1489.831 | TFLOPs: 23.36 | +7: iteration 148900/ 173500 | consumed samples: 38118400 | consumed tokens: 78066483200 | elapsed time per iteration (s): 0.16 | learning rate: 2.896E-05 | global batch size: 256 | lm loss: 3.666049E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.750 | TFLOPs: 24.92 | +7: iteration 148910/ 173500 | consumed samples: 38120960 | consumed tokens: 78071726080 | elapsed time per iteration (s): 0.16 | learning rate: 2.895E-05 | global batch size: 256 | lm loss: 3.657499E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.155 | TFLOPs: 25.38 | +7: iteration 148920/ 173500 | consumed samples: 38123520 | consumed tokens: 78076968960 | elapsed time per iteration (s): 0.16 | learning rate: 2.894E-05 | global batch size: 256 | lm loss: 3.653559E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.075 | TFLOPs: 24.73 | +7: iteration 148930/ 173500 | consumed samples: 38126080 | consumed tokens: 78082211840 | elapsed time per iteration (s): 0.16 | learning rate: 2.894E-05 | global batch size: 256 | lm loss: 3.667583E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.260 | TFLOPs: 25.33 | +7: iteration 148940/ 173500 | consumed samples: 38128640 | consumed tokens: 78087454720 | elapsed time per iteration (s): 0.16 | learning rate: 2.893E-05 | global batch size: 256 | lm loss: 3.675556E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.647 | TFLOPs: 25.38 | +7: iteration 148950/ 173500 | consumed samples: 38131200 | consumed tokens: 78092697600 | elapsed time per iteration (s): 0.17 | learning rate: 2.892E-05 | global batch size: 256 | lm loss: 3.658708E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.285 | TFLOPs: 23.87 | +7: iteration 148960/ 173500 | consumed samples: 38133760 | consumed tokens: 78097940480 | elapsed time per iteration (s): 0.16 | learning rate: 2.891E-05 | global batch size: 256 | lm loss: 3.660480E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.672 | TFLOPs: 24.82 | +7: iteration 148970/ 173500 | consumed samples: 38136320 | consumed tokens: 78103183360 | elapsed time per iteration (s): 0.16 | learning rate: 2.891E-05 | global batch size: 256 | lm loss: 3.664727E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.639 | TFLOPs: 24.40 | +7: iteration 148980/ 173500 | consumed samples: 38138880 | consumed tokens: 78108426240 | elapsed time per iteration (s): 0.16 | learning rate: 2.890E-05 | global batch size: 256 | lm loss: 3.670261E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.255 | TFLOPs: 24.39 | +7: iteration 148990/ 173500 | consumed samples: 38141440 | consumed tokens: 78113669120 | elapsed time per iteration (s): 0.16 | learning rate: 2.889E-05 | global batch size: 256 | lm loss: 3.658727E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.728 | TFLOPs: 24.82 | +7: iteration 149000/ 173500 | consumed samples: 38144000 | consumed tokens: 78118912000 | elapsed time per iteration (s): 0.17 | learning rate: 2.889E-05 | global batch size: 256 | lm loss: 3.652867E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.496 | TFLOPs: 23.83 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 149000 | lm loss value: 3.823423E+00 | lm loss PPL: 4.576059E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 149000 to checkpoints_44m91b100m +0: [2023-03-17 06:44:04,515] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step149000 is begin to save! +0: [2023-03-17 06:44:04,519] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:44:04,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:44:04,586] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:44:04,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:44:04,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:44:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:44:04,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:44:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:44:04,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:44:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:44:04,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:44:04,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:44:04,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:44:04,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:44:04,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:44:04,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:44:04,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:44:04,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:44:04,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:44:04,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:44:04,655] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step149000/mp_rank_00_model_states.pt +0: [2023-03-17 06:44:04,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:44:04,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:44:04,674] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:44:04,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +4: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +2: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:44:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:44:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +7: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:44:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 06:44:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +6: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +5: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:44:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +3: [2023-03-17 06:44:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:44:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:44:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +1: [2023-03-17 06:44:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:44:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step149000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:44:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step149000 is ready now! +0: successfully saved checkpoint at iteration 149000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.35 +7: iteration 149010/ 173500 | consumed samples: 38146560 | consumed tokens: 78124154880 | elapsed time per iteration (s): 0.19 | learning rate: 2.888E-05 | global batch size: 256 | lm loss: 3.663769E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1336.000 | TFLOPs: 20.95 | +7: iteration 149020/ 173500 | consumed samples: 38149120 | consumed tokens: 78129397760 | elapsed time per iteration (s): 0.16 | learning rate: 2.887E-05 | global batch size: 256 | lm loss: 3.664043E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.894 | TFLOPs: 24.59 | +7: iteration 149030/ 173500 | consumed samples: 38151680 | consumed tokens: 78134640640 | elapsed time per iteration (s): 0.17 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 3.672604E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.514 | TFLOPs: 24.27 | +7: iteration 149040/ 173500 | consumed samples: 38154240 | consumed tokens: 78139883520 | elapsed time per iteration (s): 0.16 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 3.665481E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.174 | TFLOPs: 25.03 | +7: iteration 149050/ 173500 | consumed samples: 38156800 | consumed tokens: 78145126400 | elapsed time per iteration (s): 0.16 | learning rate: 2.885E-05 | global batch size: 256 | lm loss: 3.664894E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.789 | TFLOPs: 25.09 | +7: iteration 149060/ 173500 | consumed samples: 38159360 | consumed tokens: 78150369280 | elapsed time per iteration (s): 0.16 | learning rate: 2.884E-05 | global batch size: 256 | lm loss: 3.648406E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.312 | TFLOPs: 24.83 | +7: iteration 149070/ 173500 | consumed samples: 38161920 | consumed tokens: 78155612160 | elapsed time per iteration (s): 0.17 | learning rate: 2.884E-05 | global batch size: 256 | lm loss: 3.676093E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1504.080 | TFLOPs: 23.59 | +7: iteration 149080/ 173500 | consumed samples: 38164480 | consumed tokens: 78160855040 | elapsed time per iteration (s): 0.18 | learning rate: 2.883E-05 | global batch size: 256 | lm loss: 3.655541E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1446.022 | TFLOPs: 22.68 | +7: iteration 149090/ 173500 | consumed samples: 38167040 | consumed tokens: 78166097920 | elapsed time per iteration (s): 0.16 | learning rate: 2.882E-05 | global batch size: 256 | lm loss: 3.664610E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1564.636 | TFLOPs: 24.54 | +7: iteration 149100/ 173500 | consumed samples: 38169600 | consumed tokens: 78171340800 | elapsed time per iteration (s): 0.16 | learning rate: 2.881E-05 | global batch size: 256 | lm loss: 3.660741E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.243 | TFLOPs: 24.81 | +7: iteration 149110/ 173500 | consumed samples: 38172160 | consumed tokens: 78176583680 | elapsed time per iteration (s): 0.16 | learning rate: 2.881E-05 | global batch size: 256 | lm loss: 3.664603E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.281 | TFLOPs: 24.61 | +7: iteration 149120/ 173500 | consumed samples: 38174720 | consumed tokens: 78181826560 | elapsed time per iteration (s): 0.16 | learning rate: 2.880E-05 | global batch size: 256 | lm loss: 3.678363E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.346 | TFLOPs: 24.60 | +7: iteration 149130/ 173500 | consumed samples: 38177280 | consumed tokens: 78187069440 | elapsed time per iteration (s): 0.18 | learning rate: 2.879E-05 | global batch size: 256 | lm loss: 3.666444E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.547 | TFLOPs: 22.64 | +7: iteration 149140/ 173500 | consumed samples: 38179840 | consumed tokens: 78192312320 | elapsed time per iteration (s): 0.17 | learning rate: 2.879E-05 | global batch size: 256 | lm loss: 3.667206E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.996 | TFLOPs: 23.18 | +7: iteration 149150/ 173500 | consumed samples: 38182400 | consumed tokens: 78197555200 | elapsed time per iteration (s): 0.17 | learning rate: 2.878E-05 | global batch size: 256 | lm loss: 3.661544E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1465.554 | TFLOPs: 22.98 | +7: iteration 149160/ 173500 | consumed samples: 38184960 | consumed tokens: 78202798080 | elapsed time per iteration (s): 0.17 | learning rate: 2.877E-05 | global batch size: 256 | lm loss: 3.663258E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1515.443 | TFLOPs: 23.77 | +7: iteration 149170/ 173500 | consumed samples: 38187520 | consumed tokens: 78208040960 | elapsed time per iteration (s): 0.16 | learning rate: 2.877E-05 | global batch size: 256 | lm loss: 3.663155E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.133 | TFLOPs: 24.50 | +7: iteration 149180/ 173500 | consumed samples: 38190080 | consumed tokens: 78213283840 | elapsed time per iteration (s): 0.16 | learning rate: 2.876E-05 | global batch size: 256 | lm loss: 3.670075E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.469 | TFLOPs: 24.60 | +7: iteration 149190/ 173500 | consumed samples: 38192640 | consumed tokens: 78218526720 | elapsed time per iteration (s): 0.17 | learning rate: 2.875E-05 | global batch size: 256 | lm loss: 3.674096E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1471.911 | TFLOPs: 23.08 | +7: iteration 149200/ 173500 | consumed samples: 38195200 | consumed tokens: 78223769600 | elapsed time per iteration (s): 0.16 | learning rate: 2.874E-05 | global batch size: 256 | lm loss: 3.664907E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.988 | TFLOPs: 24.78 | +7: iteration 149210/ 173500 | consumed samples: 38197760 | consumed tokens: 78229012480 | elapsed time per iteration (s): 0.16 | learning rate: 2.874E-05 | global batch size: 256 | lm loss: 3.677065E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.543 | TFLOPs: 25.21 | +7: iteration 149220/ 173500 | consumed samples: 38200320 | consumed tokens: 78234255360 | elapsed time per iteration (s): 0.16 | learning rate: 2.873E-05 | global batch size: 256 | lm loss: 3.668213E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.124 | TFLOPs: 24.61 | +7: iteration 149230/ 173500 | consumed samples: 38202880 | consumed tokens: 78239498240 | elapsed time per iteration (s): 0.16 | learning rate: 2.872E-05 | global batch size: 256 | lm loss: 3.666625E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.215 | TFLOPs: 24.88 | +7: iteration 149240/ 173500 | consumed samples: 38205440 | consumed tokens: 78244741120 | elapsed time per iteration (s): 0.17 | learning rate: 2.872E-05 | global batch size: 256 | lm loss: 3.661351E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.485 | TFLOPs: 24.19 | +7: iteration 149250/ 173500 | consumed samples: 38208000 | consumed tokens: 78249984000 | elapsed time per iteration (s): 0.16 | learning rate: 2.871E-05 | global batch size: 256 | lm loss: 3.658171E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.752 | TFLOPs: 24.85 | +7: iteration 149260/ 173500 | consumed samples: 38210560 | consumed tokens: 78255226880 | elapsed time per iteration (s): 0.16 | learning rate: 2.870E-05 | global batch size: 256 | lm loss: 3.656500E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.550 | TFLOPs: 24.66 | +7: iteration 149270/ 173500 | consumed samples: 38213120 | consumed tokens: 78260469760 | elapsed time per iteration (s): 0.16 | learning rate: 2.869E-05 | global batch size: 256 | lm loss: 3.664239E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.616 | TFLOPs: 24.57 | +7: iteration 149280/ 173500 | consumed samples: 38215680 | consumed tokens: 78265712640 | elapsed time per iteration (s): 0.16 | learning rate: 2.869E-05 | global batch size: 256 | lm loss: 3.674055E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.932 | TFLOPs: 24.39 | +7: iteration 149290/ 173500 | consumed samples: 38218240 | consumed tokens: 78270955520 | elapsed time per iteration (s): 0.18 | learning rate: 2.868E-05 | global batch size: 256 | lm loss: 3.662610E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.575 | TFLOPs: 22.12 | +7: iteration 149300/ 173500 | consumed samples: 38220800 | consumed tokens: 78276198400 | elapsed time per iteration (s): 0.17 | learning rate: 2.867E-05 | global batch size: 256 | lm loss: 3.662827E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1523.722 | TFLOPs: 23.90 | +7: iteration 149310/ 173500 | consumed samples: 38223360 | consumed tokens: 78281441280 | elapsed time per iteration (s): 0.16 | learning rate: 2.867E-05 | global batch size: 256 | lm loss: 3.660907E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.407 | TFLOPs: 24.50 | +7: iteration 149320/ 173500 | consumed samples: 38225920 | consumed tokens: 78286684160 | elapsed time per iteration (s): 0.16 | learning rate: 2.866E-05 | global batch size: 256 | lm loss: 3.670389E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.904 | TFLOPs: 25.23 | +7: iteration 149330/ 173500 | consumed samples: 38228480 | consumed tokens: 78291927040 | elapsed time per iteration (s): 0.16 | learning rate: 2.865E-05 | global batch size: 256 | lm loss: 3.659551E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.491 | TFLOPs: 24.86 | +7: iteration 149340/ 173500 | consumed samples: 38231040 | consumed tokens: 78297169920 | elapsed time per iteration (s): 0.17 | learning rate: 2.865E-05 | global batch size: 256 | lm loss: 3.658157E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.808 | TFLOPs: 23.68 | +7: iteration 149350/ 173500 | consumed samples: 38233600 | consumed tokens: 78302412800 | elapsed time per iteration (s): 0.16 | learning rate: 2.864E-05 | global batch size: 256 | lm loss: 3.648400E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.344 | TFLOPs: 24.69 | +7: iteration 149360/ 173500 | consumed samples: 38236160 | consumed tokens: 78307655680 | elapsed time per iteration (s): 0.16 | learning rate: 2.863E-05 | global batch size: 256 | lm loss: 3.660418E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.046 | TFLOPs: 25.22 | +7: iteration 149370/ 173500 | consumed samples: 38238720 | consumed tokens: 78312898560 | elapsed time per iteration (s): 0.16 | learning rate: 2.862E-05 | global batch size: 256 | lm loss: 3.660176E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.626 | TFLOPs: 25.12 | +7: iteration 149380/ 173500 | consumed samples: 38241280 | consumed tokens: 78318141440 | elapsed time per iteration (s): 0.16 | learning rate: 2.862E-05 | global batch size: 256 | lm loss: 3.673773E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.825 | TFLOPs: 25.40 | +7: iteration 149390/ 173500 | consumed samples: 38243840 | consumed tokens: 78323384320 | elapsed time per iteration (s): 0.17 | learning rate: 2.861E-05 | global batch size: 256 | lm loss: 3.676610E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.535 | TFLOPs: 24.21 | +7: iteration 149400/ 173500 | consumed samples: 38246400 | consumed tokens: 78328627200 | elapsed time per iteration (s): 0.16 | learning rate: 2.860E-05 | global batch size: 256 | lm loss: 3.663092E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.333 | TFLOPs: 24.64 | +7: iteration 149410/ 173500 | consumed samples: 38248960 | consumed tokens: 78333870080 | elapsed time per iteration (s): 0.16 | learning rate: 2.860E-05 | global batch size: 256 | lm loss: 3.674195E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.747 | TFLOPs: 24.73 | +7: iteration 149420/ 173500 | consumed samples: 38251520 | consumed tokens: 78339112960 | elapsed time per iteration (s): 0.17 | learning rate: 2.859E-05 | global batch size: 256 | lm loss: 3.672905E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.594 | TFLOPs: 23.83 | +7: iteration 149430/ 173500 | consumed samples: 38254080 | consumed tokens: 78344355840 | elapsed time per iteration (s): 0.16 | learning rate: 2.858E-05 | global batch size: 256 | lm loss: 3.671365E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.711 | TFLOPs: 24.59 | +7: iteration 149440/ 173500 | consumed samples: 38256640 | consumed tokens: 78349598720 | elapsed time per iteration (s): 0.16 | learning rate: 2.857E-05 | global batch size: 256 | lm loss: 3.663109E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.251 | TFLOPs: 25.43 | +7: iteration 149450/ 173500 | consumed samples: 38259200 | consumed tokens: 78354841600 | elapsed time per iteration (s): 0.16 | learning rate: 2.857E-05 | global batch size: 256 | lm loss: 3.663157E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.400 | TFLOPs: 24.67 | +7: iteration 149460/ 173500 | consumed samples: 38261760 | consumed tokens: 78360084480 | elapsed time per iteration (s): 0.17 | learning rate: 2.856E-05 | global batch size: 256 | lm loss: 3.657302E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.067 | TFLOPs: 24.01 | +7: iteration 149470/ 173500 | consumed samples: 38264320 | consumed tokens: 78365327360 | elapsed time per iteration (s): 0.16 | learning rate: 2.855E-05 | global batch size: 256 | lm loss: 3.651357E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.468 | TFLOPs: 24.50 | +7: iteration 149480/ 173500 | consumed samples: 38266880 | consumed tokens: 78370570240 | elapsed time per iteration (s): 0.16 | learning rate: 2.855E-05 | global batch size: 256 | lm loss: 3.669214E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.803 | TFLOPs: 24.45 | +7: iteration 149490/ 173500 | consumed samples: 38269440 | consumed tokens: 78375813120 | elapsed time per iteration (s): 0.16 | learning rate: 2.854E-05 | global batch size: 256 | lm loss: 3.658750E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.589 | TFLOPs: 25.23 | +7: iteration 149500/ 173500 | consumed samples: 38272000 | consumed tokens: 78381056000 | elapsed time per iteration (s): 0.17 | learning rate: 2.853E-05 | global batch size: 256 | lm loss: 3.662086E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.139 | TFLOPs: 24.18 | +7: iteration 149510/ 173500 | consumed samples: 38274560 | consumed tokens: 78386298880 | elapsed time per iteration (s): 0.17 | learning rate: 2.853E-05 | global batch size: 256 | lm loss: 3.664740E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.149 | TFLOPs: 23.43 | +7: iteration 149520/ 173500 | consumed samples: 38277120 | consumed tokens: 78391541760 | elapsed time per iteration (s): 0.17 | learning rate: 2.852E-05 | global batch size: 256 | lm loss: 3.673383E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.756 | TFLOPs: 24.02 | +7: iteration 149530/ 173500 | consumed samples: 38279680 | consumed tokens: 78396784640 | elapsed time per iteration (s): 0.16 | learning rate: 2.851E-05 | global batch size: 256 | lm loss: 3.677062E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.166 | TFLOPs: 25.03 | +7: iteration 149540/ 173500 | consumed samples: 38282240 | consumed tokens: 78402027520 | elapsed time per iteration (s): 0.16 | learning rate: 2.850E-05 | global batch size: 256 | lm loss: 3.664227E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.845 | TFLOPs: 24.70 | +7: iteration 149550/ 173500 | consumed samples: 38284800 | consumed tokens: 78407270400 | elapsed time per iteration (s): 0.17 | learning rate: 2.850E-05 | global batch size: 256 | lm loss: 3.642927E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.353 | TFLOPs: 23.44 | +7: iteration 149560/ 173500 | consumed samples: 38287360 | consumed tokens: 78412513280 | elapsed time per iteration (s): 0.17 | learning rate: 2.849E-05 | global batch size: 256 | lm loss: 3.663103E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1514.754 | TFLOPs: 23.76 | +7: iteration 149570/ 173500 | consumed samples: 38289920 | consumed tokens: 78417756160 | elapsed time per iteration (s): 0.17 | learning rate: 2.848E-05 | global batch size: 256 | lm loss: 3.661346E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.396 | TFLOPs: 23.83 | +7: iteration 149580/ 173500 | consumed samples: 38292480 | consumed tokens: 78422999040 | elapsed time per iteration (s): 0.16 | learning rate: 2.848E-05 | global batch size: 256 | lm loss: 3.659510E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.438 | TFLOPs: 25.35 | +7: iteration 149590/ 173500 | consumed samples: 38295040 | consumed tokens: 78428241920 | elapsed time per iteration (s): 0.17 | learning rate: 2.847E-05 | global batch size: 256 | lm loss: 3.668471E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.870 | TFLOPs: 24.09 | +7: iteration 149600/ 173500 | consumed samples: 38297600 | consumed tokens: 78433484800 | elapsed time per iteration (s): 0.16 | learning rate: 2.846E-05 | global batch size: 256 | lm loss: 3.671573E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.718 | TFLOPs: 24.73 | +7: iteration 149610/ 173500 | consumed samples: 38300160 | consumed tokens: 78438727680 | elapsed time per iteration (s): 0.16 | learning rate: 2.846E-05 | global batch size: 256 | lm loss: 3.658996E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.258 | TFLOPs: 24.94 | +7: iteration 149620/ 173500 | consumed samples: 38302720 | consumed tokens: 78443970560 | elapsed time per iteration (s): 0.16 | learning rate: 2.845E-05 | global batch size: 256 | lm loss: 3.667073E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.710 | TFLOPs: 25.35 | +7: iteration 149630/ 173500 | consumed samples: 38305280 | consumed tokens: 78449213440 | elapsed time per iteration (s): 0.17 | learning rate: 2.844E-05 | global batch size: 256 | lm loss: 3.673336E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.328 | TFLOPs: 24.16 | +7: iteration 149640/ 173500 | consumed samples: 38307840 | consumed tokens: 78454456320 | elapsed time per iteration (s): 0.16 | learning rate: 2.844E-05 | global batch size: 256 | lm loss: 3.659160E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.259 | TFLOPs: 25.21 | +7: iteration 149650/ 173500 | consumed samples: 38310400 | consumed tokens: 78459699200 | elapsed time per iteration (s): 0.16 | learning rate: 2.843E-05 | global batch size: 256 | lm loss: 3.673260E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.770 | TFLOPs: 24.41 | +7: iteration 149660/ 173500 | consumed samples: 38312960 | consumed tokens: 78464942080 | elapsed time per iteration (s): 0.16 | learning rate: 2.842E-05 | global batch size: 256 | lm loss: 3.657693E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.477 | TFLOPs: 24.68 | +7: iteration 149670/ 173500 | consumed samples: 38315520 | consumed tokens: 78470184960 | elapsed time per iteration (s): 0.16 | learning rate: 2.841E-05 | global batch size: 256 | lm loss: 3.678985E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.070 | TFLOPs: 25.23 | +7: iteration 149680/ 173500 | consumed samples: 38318080 | consumed tokens: 78475427840 | elapsed time per iteration (s): 0.16 | learning rate: 2.841E-05 | global batch size: 256 | lm loss: 3.662271E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.447 | TFLOPs: 25.52 | +7: iteration 149690/ 173500 | consumed samples: 38320640 | consumed tokens: 78480670720 | elapsed time per iteration (s): 0.17 | learning rate: 2.840E-05 | global batch size: 256 | lm loss: 3.664642E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.464 | TFLOPs: 23.91 | +7: iteration 149700/ 173500 | consumed samples: 38323200 | consumed tokens: 78485913600 | elapsed time per iteration (s): 0.16 | learning rate: 2.839E-05 | global batch size: 256 | lm loss: 3.669945E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.910 | TFLOPs: 24.70 | +7: iteration 149710/ 173500 | consumed samples: 38325760 | consumed tokens: 78491156480 | elapsed time per iteration (s): 0.16 | learning rate: 2.839E-05 | global batch size: 256 | lm loss: 3.662891E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.538 | TFLOPs: 25.04 | +7: iteration 149720/ 173500 | consumed samples: 38328320 | consumed tokens: 78496399360 | elapsed time per iteration (s): 0.16 | learning rate: 2.838E-05 | global batch size: 256 | lm loss: 3.676943E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.588 | TFLOPs: 25.07 | +7: iteration 149730/ 173500 | consumed samples: 38330880 | consumed tokens: 78501642240 | elapsed time per iteration (s): 0.17 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 3.667425E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1467.765 | TFLOPs: 23.02 | +7: iteration 149740/ 173500 | consumed samples: 38333440 | consumed tokens: 78506885120 | elapsed time per iteration (s): 0.16 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 3.655173E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.527 | TFLOPs: 25.81 | +7: iteration 149750/ 173500 | consumed samples: 38336000 | consumed tokens: 78512128000 | elapsed time per iteration (s): 0.18 | learning rate: 2.836E-05 | global batch size: 256 | lm loss: 3.660061E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.666 | TFLOPs: 22.64 | +7: iteration 149760/ 173500 | consumed samples: 38338560 | consumed tokens: 78517370880 | elapsed time per iteration (s): 0.16 | learning rate: 2.835E-05 | global batch size: 256 | lm loss: 3.673222E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.523 | TFLOPs: 24.57 | +7: iteration 149770/ 173500 | consumed samples: 38341120 | consumed tokens: 78522613760 | elapsed time per iteration (s): 0.16 | learning rate: 2.835E-05 | global batch size: 256 | lm loss: 3.654589E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.181 | TFLOPs: 25.14 | +7: iteration 149780/ 173500 | consumed samples: 38343680 | consumed tokens: 78527856640 | elapsed time per iteration (s): 0.16 | learning rate: 2.834E-05 | global batch size: 256 | lm loss: 3.666314E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.319 | TFLOPs: 25.22 | +7: iteration 149790/ 173500 | consumed samples: 38346240 | consumed tokens: 78533099520 | elapsed time per iteration (s): 0.16 | learning rate: 2.833E-05 | global batch size: 256 | lm loss: 3.651416E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.909 | TFLOPs: 25.40 | +7: iteration 149800/ 173500 | consumed samples: 38348800 | consumed tokens: 78538342400 | elapsed time per iteration (s): 0.15 | learning rate: 2.832E-05 | global batch size: 256 | lm loss: 3.663301E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.262 | TFLOPs: 26.23 | +7: iteration 149810/ 173500 | consumed samples: 38351360 | consumed tokens: 78543585280 | elapsed time per iteration (s): 0.16 | learning rate: 2.832E-05 | global batch size: 256 | lm loss: 3.670428E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.981 | TFLOPs: 24.90 | +7: iteration 149820/ 173500 | consumed samples: 38353920 | consumed tokens: 78548828160 | elapsed time per iteration (s): 0.15 | learning rate: 2.831E-05 | global batch size: 256 | lm loss: 3.669443E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.306 | TFLOPs: 26.07 | +7: iteration 149830/ 173500 | consumed samples: 38356480 | consumed tokens: 78554071040 | elapsed time per iteration (s): 0.15 | learning rate: 2.830E-05 | global batch size: 256 | lm loss: 3.663760E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.882 | TFLOPs: 26.00 | +7: iteration 149840/ 173500 | consumed samples: 38359040 | consumed tokens: 78559313920 | elapsed time per iteration (s): 0.15 | learning rate: 2.830E-05 | global batch size: 256 | lm loss: 3.661885E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.891 | TFLOPs: 26.02 | +7: iteration 149850/ 173500 | consumed samples: 38361600 | consumed tokens: 78564556800 | elapsed time per iteration (s): 0.15 | learning rate: 2.829E-05 | global batch size: 256 | lm loss: 3.655560E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.456 | TFLOPs: 26.35 | +7: iteration 149860/ 173500 | consumed samples: 38364160 | consumed tokens: 78569799680 | elapsed time per iteration (s): 0.16 | learning rate: 2.828E-05 | global batch size: 256 | lm loss: 3.666074E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.367 | TFLOPs: 24.88 | +7: iteration 149870/ 173500 | consumed samples: 38366720 | consumed tokens: 78575042560 | elapsed time per iteration (s): 0.17 | learning rate: 2.828E-05 | global batch size: 256 | lm loss: 3.656122E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1495.953 | TFLOPs: 23.46 | +7: iteration 149880/ 173500 | consumed samples: 38369280 | consumed tokens: 78580285440 | elapsed time per iteration (s): 0.16 | learning rate: 2.827E-05 | global batch size: 256 | lm loss: 3.660696E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.677 | TFLOPs: 24.68 | +7: iteration 149890/ 173500 | consumed samples: 38371840 | consumed tokens: 78585528320 | elapsed time per iteration (s): 0.17 | learning rate: 2.826E-05 | global batch size: 256 | lm loss: 3.664765E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1516.970 | TFLOPs: 23.79 | +7: iteration 149900/ 173500 | consumed samples: 38374400 | consumed tokens: 78590771200 | elapsed time per iteration (s): 0.16 | learning rate: 2.826E-05 | global batch size: 256 | lm loss: 3.655006E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.417 | TFLOPs: 25.46 | +7: iteration 149910/ 173500 | consumed samples: 38376960 | consumed tokens: 78596014080 | elapsed time per iteration (s): 0.17 | learning rate: 2.825E-05 | global batch size: 256 | lm loss: 3.668063E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1537.755 | TFLOPs: 24.12 | +7: iteration 149920/ 173500 | consumed samples: 38379520 | consumed tokens: 78601256960 | elapsed time per iteration (s): 0.16 | learning rate: 2.824E-05 | global batch size: 256 | lm loss: 3.671647E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.572 | TFLOPs: 25.02 | +7: iteration 149930/ 173500 | consumed samples: 38382080 | consumed tokens: 78606499840 | elapsed time per iteration (s): 0.17 | learning rate: 2.823E-05 | global batch size: 256 | lm loss: 3.657007E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1550.101 | TFLOPs: 24.31 | +7: iteration 149940/ 173500 | consumed samples: 38384640 | consumed tokens: 78611742720 | elapsed time per iteration (s): 0.17 | learning rate: 2.823E-05 | global batch size: 256 | lm loss: 3.680714E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.253 | TFLOPs: 24.17 | +7: iteration 149950/ 173500 | consumed samples: 38387200 | consumed tokens: 78616985600 | elapsed time per iteration (s): 0.16 | learning rate: 2.822E-05 | global batch size: 256 | lm loss: 3.676887E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.758 | TFLOPs: 24.68 | +7: iteration 149960/ 173500 | consumed samples: 38389760 | consumed tokens: 78622228480 | elapsed time per iteration (s): 0.16 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 3.669341E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.488 | TFLOPs: 25.21 | +7: iteration 149970/ 173500 | consumed samples: 38392320 | consumed tokens: 78627471360 | elapsed time per iteration (s): 0.16 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 3.665764E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.716 | TFLOPs: 25.31 | +7: iteration 149980/ 173500 | consumed samples: 38394880 | consumed tokens: 78632714240 | elapsed time per iteration (s): 0.16 | learning rate: 2.820E-05 | global batch size: 256 | lm loss: 3.660756E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.905 | TFLOPs: 25.01 | +7: iteration 149990/ 173500 | consumed samples: 38397440 | consumed tokens: 78637957120 | elapsed time per iteration (s): 0.17 | learning rate: 2.819E-05 | global batch size: 256 | lm loss: 3.670807E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1493.242 | TFLOPs: 23.42 | +0: [2023-03-17 06:46:48,482] [INFO] [logging.py:68:log_dist] [Rank 0] step=150000, skipped=0, lr=[2.8186529571359086e-05, 2.8186529571359086e-05, 2.8186529571359086e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 150000/ 173500 | consumed samples: 38400000 | consumed tokens: 78643200000 | elapsed time per iteration (s): 0.17 | learning rate: 2.819E-05 | global batch size: 256 | lm loss: 3.658022E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1524.607 | TFLOPs: 23.91 | +0: steps: 150000 loss: 3.6708 iter time (s): 0.161 samples/sec: 1592.037 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 150000 | lm loss value: 3.831722E+00 | lm loss PPL: 4.614195E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 150000 to checkpoints_44m91b100m +0: [2023-03-17 06:46:48,556] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step150000 is begin to save! +0: [2023-03-17 06:46:48,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:46:48,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:46:48,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:46:48,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:46:48,634] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:46:48,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:46:48,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:46:48,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:46:48,650] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:46:48,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:46:48,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:46:48,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:46:48,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:46:48,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:46:48,675] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:46:48,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:46:48,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:46:48,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:46:48,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:46:48,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:46:48,693] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step150000/mp_rank_00_model_states.pt +0: [2023-03-17 06:46:48,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:46:48,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:46:48,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:46:48,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +2: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +6: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +1: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +5: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +3: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +7: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:46:48,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step150000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:46:48,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step150000 is ready now! +0: successfully saved checkpoint at iteration 150000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 188.74 +7: iteration 150010/ 173500 | consumed samples: 38402560 | consumed tokens: 78648442880 | elapsed time per iteration (s): 0.19 | learning rate: 2.818E-05 | global batch size: 256 | lm loss: 3.663018E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.047 | TFLOPs: 21.45 | +7: iteration 150020/ 173500 | consumed samples: 38405120 | consumed tokens: 78653685760 | elapsed time per iteration (s): 0.16 | learning rate: 2.817E-05 | global batch size: 256 | lm loss: 3.656688E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.984 | TFLOPs: 24.62 | +7: iteration 150030/ 173500 | consumed samples: 38407680 | consumed tokens: 78658928640 | elapsed time per iteration (s): 0.16 | learning rate: 2.817E-05 | global batch size: 256 | lm loss: 3.667252E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.632 | TFLOPs: 24.66 | +7: iteration 150040/ 173500 | consumed samples: 38410240 | consumed tokens: 78664171520 | elapsed time per iteration (s): 0.16 | learning rate: 2.816E-05 | global batch size: 256 | lm loss: 3.670919E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.120 | TFLOPs: 24.67 | +7: iteration 150050/ 173500 | consumed samples: 38412800 | consumed tokens: 78669414400 | elapsed time per iteration (s): 0.16 | learning rate: 2.815E-05 | global batch size: 256 | lm loss: 3.654494E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.162 | TFLOPs: 25.06 | +7: iteration 150060/ 173500 | consumed samples: 38415360 | consumed tokens: 78674657280 | elapsed time per iteration (s): 0.16 | learning rate: 2.815E-05 | global batch size: 256 | lm loss: 3.662925E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.535 | TFLOPs: 24.96 | +7: iteration 150070/ 173500 | consumed samples: 38417920 | consumed tokens: 78679900160 | elapsed time per iteration (s): 0.16 | learning rate: 2.814E-05 | global batch size: 256 | lm loss: 3.660485E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.748 | TFLOPs: 25.01 | +7: iteration 150080/ 173500 | consumed samples: 38420480 | consumed tokens: 78685143040 | elapsed time per iteration (s): 0.16 | learning rate: 2.813E-05 | global batch size: 256 | lm loss: 3.661314E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.718 | TFLOPs: 25.07 | +7: iteration 150090/ 173500 | consumed samples: 38423040 | consumed tokens: 78690385920 | elapsed time per iteration (s): 0.17 | learning rate: 2.812E-05 | global batch size: 256 | lm loss: 3.659832E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1519.737 | TFLOPs: 23.83 | +7: iteration 150100/ 173500 | consumed samples: 38425600 | consumed tokens: 78695628800 | elapsed time per iteration (s): 0.17 | learning rate: 2.812E-05 | global batch size: 256 | lm loss: 3.654544E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.490 | TFLOPs: 24.13 | +7: iteration 150110/ 173500 | consumed samples: 38428160 | consumed tokens: 78700871680 | elapsed time per iteration (s): 0.16 | learning rate: 2.811E-05 | global batch size: 256 | lm loss: 3.665407E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.662 | TFLOPs: 24.44 | +7: iteration 150120/ 173500 | consumed samples: 38430720 | consumed tokens: 78706114560 | elapsed time per iteration (s): 0.16 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 3.674949E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.524 | TFLOPs: 25.59 | +7: iteration 150130/ 173500 | consumed samples: 38433280 | consumed tokens: 78711357440 | elapsed time per iteration (s): 0.16 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 3.675639E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.559 | TFLOPs: 24.88 | +7: iteration 150140/ 173500 | consumed samples: 38435840 | consumed tokens: 78716600320 | elapsed time per iteration (s): 0.16 | learning rate: 2.809E-05 | global batch size: 256 | lm loss: 3.666119E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.942 | TFLOPs: 25.06 | +7: iteration 150150/ 173500 | consumed samples: 38438400 | consumed tokens: 78721843200 | elapsed time per iteration (s): 0.17 | learning rate: 2.808E-05 | global batch size: 256 | lm loss: 3.673609E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1477.758 | TFLOPs: 23.17 | +7: iteration 150160/ 173500 | consumed samples: 38440960 | consumed tokens: 78727086080 | elapsed time per iteration (s): 0.16 | learning rate: 2.808E-05 | global batch size: 256 | lm loss: 3.669982E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.285 | TFLOPs: 25.79 | +7: iteration 150170/ 173500 | consumed samples: 38443520 | consumed tokens: 78732328960 | elapsed time per iteration (s): 0.16 | learning rate: 2.807E-05 | global batch size: 256 | lm loss: 3.655017E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.300 | TFLOPs: 24.77 | +7: iteration 150180/ 173500 | consumed samples: 38446080 | consumed tokens: 78737571840 | elapsed time per iteration (s): 0.17 | learning rate: 2.806E-05 | global batch size: 256 | lm loss: 3.666088E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.280 | TFLOPs: 24.00 | +7: iteration 150190/ 173500 | consumed samples: 38448640 | consumed tokens: 78742814720 | elapsed time per iteration (s): 0.16 | learning rate: 2.806E-05 | global batch size: 256 | lm loss: 3.665308E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.712 | TFLOPs: 24.93 | +7: iteration 150200/ 173500 | consumed samples: 38451200 | consumed tokens: 78748057600 | elapsed time per iteration (s): 0.16 | learning rate: 2.805E-05 | global batch size: 256 | lm loss: 3.663165E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.583 | TFLOPs: 25.37 | +7: iteration 150210/ 173500 | consumed samples: 38453760 | consumed tokens: 78753300480 | elapsed time per iteration (s): 0.17 | learning rate: 2.804E-05 | global batch size: 256 | lm loss: 3.669233E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1548.167 | TFLOPs: 24.28 | +7: iteration 150220/ 173500 | consumed samples: 38456320 | consumed tokens: 78758543360 | elapsed time per iteration (s): 0.16 | learning rate: 2.804E-05 | global batch size: 256 | lm loss: 3.668794E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.530 | TFLOPs: 25.32 | +7: iteration 150230/ 173500 | consumed samples: 38458880 | consumed tokens: 78763786240 | elapsed time per iteration (s): 0.16 | learning rate: 2.803E-05 | global batch size: 256 | lm loss: 3.665912E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.950 | TFLOPs: 24.89 | +7: iteration 150240/ 173500 | consumed samples: 38461440 | consumed tokens: 78769029120 | elapsed time per iteration (s): 0.17 | learning rate: 2.802E-05 | global batch size: 256 | lm loss: 3.660383E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.903 | TFLOPs: 24.15 | +7: iteration 150250/ 173500 | consumed samples: 38464000 | consumed tokens: 78774272000 | elapsed time per iteration (s): 0.16 | learning rate: 2.802E-05 | global batch size: 256 | lm loss: 3.690987E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.283 | TFLOPs: 24.39 | +7: iteration 150260/ 173500 | consumed samples: 38466560 | consumed tokens: 78779514880 | elapsed time per iteration (s): 0.16 | learning rate: 2.801E-05 | global batch size: 256 | lm loss: 3.671769E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.005 | TFLOPs: 25.48 | +7: iteration 150270/ 173500 | consumed samples: 38469120 | consumed tokens: 78784757760 | elapsed time per iteration (s): 0.16 | learning rate: 2.800E-05 | global batch size: 256 | lm loss: 3.655774E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.887 | TFLOPs: 24.78 | +7: iteration 150280/ 173500 | consumed samples: 38471680 | consumed tokens: 78790000640 | elapsed time per iteration (s): 0.17 | learning rate: 2.800E-05 | global batch size: 256 | lm loss: 3.659787E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1544.483 | TFLOPs: 24.22 | +7: iteration 150290/ 173500 | consumed samples: 38474240 | consumed tokens: 78795243520 | elapsed time per iteration (s): 0.16 | learning rate: 2.799E-05 | global batch size: 256 | lm loss: 3.672704E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.052 | TFLOPs: 25.78 | +7: iteration 150300/ 173500 | consumed samples: 38476800 | consumed tokens: 78800486400 | elapsed time per iteration (s): 0.16 | learning rate: 2.798E-05 | global batch size: 256 | lm loss: 3.660241E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.068 | TFLOPs: 25.56 | +7: iteration 150310/ 173500 | consumed samples: 38479360 | consumed tokens: 78805729280 | elapsed time per iteration (s): 0.16 | learning rate: 2.798E-05 | global batch size: 256 | lm loss: 3.676125E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.647 | TFLOPs: 24.37 | +7: iteration 150320/ 173500 | consumed samples: 38481920 | consumed tokens: 78810972160 | elapsed time per iteration (s): 0.16 | learning rate: 2.797E-05 | global batch size: 256 | lm loss: 3.655061E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.279 | TFLOPs: 24.92 | +7: iteration 150330/ 173500 | consumed samples: 38484480 | consumed tokens: 78816215040 | elapsed time per iteration (s): 0.16 | learning rate: 2.796E-05 | global batch size: 256 | lm loss: 3.666388E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.227 | TFLOPs: 24.94 | +7: iteration 150340/ 173500 | consumed samples: 38487040 | consumed tokens: 78821457920 | elapsed time per iteration (s): 0.16 | learning rate: 2.795E-05 | global batch size: 256 | lm loss: 3.671875E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.536 | TFLOPs: 25.77 | +7: iteration 150350/ 173500 | consumed samples: 38489600 | consumed tokens: 78826700800 | elapsed time per iteration (s): 0.16 | learning rate: 2.795E-05 | global batch size: 256 | lm loss: 3.657784E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.483 | TFLOPs: 24.91 | +7: iteration 150360/ 173500 | consumed samples: 38492160 | consumed tokens: 78831943680 | elapsed time per iteration (s): 0.16 | learning rate: 2.794E-05 | global batch size: 256 | lm loss: 3.663142E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.761 | TFLOPs: 25.21 | +7: iteration 150370/ 173500 | consumed samples: 38494720 | consumed tokens: 78837186560 | elapsed time per iteration (s): 0.16 | learning rate: 2.793E-05 | global batch size: 256 | lm loss: 3.659168E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.560 | TFLOPs: 25.12 | +7: iteration 150380/ 173500 | consumed samples: 38497280 | consumed tokens: 78842429440 | elapsed time per iteration (s): 0.17 | learning rate: 2.793E-05 | global batch size: 256 | lm loss: 3.662362E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1522.231 | TFLOPs: 23.87 | +7: iteration 150390/ 173500 | consumed samples: 38499840 | consumed tokens: 78847672320 | elapsed time per iteration (s): 0.16 | learning rate: 2.792E-05 | global batch size: 256 | lm loss: 3.660572E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.867 | TFLOPs: 24.79 | +7: iteration 150400/ 173500 | consumed samples: 38502400 | consumed tokens: 78852915200 | elapsed time per iteration (s): 0.16 | learning rate: 2.791E-05 | global batch size: 256 | lm loss: 3.661320E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.087 | TFLOPs: 25.39 | +7: iteration 150410/ 173500 | consumed samples: 38504960 | consumed tokens: 78858158080 | elapsed time per iteration (s): 0.16 | learning rate: 2.791E-05 | global batch size: 256 | lm loss: 3.670663E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1570.036 | TFLOPs: 24.62 | +7: iteration 150420/ 173500 | consumed samples: 38507520 | consumed tokens: 78863400960 | elapsed time per iteration (s): 0.16 | learning rate: 2.790E-05 | global batch size: 256 | lm loss: 3.670943E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.881 | TFLOPs: 25.26 | +7: iteration 150430/ 173500 | consumed samples: 38510080 | consumed tokens: 78868643840 | elapsed time per iteration (s): 0.16 | learning rate: 2.789E-05 | global batch size: 256 | lm loss: 3.664834E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.199 | TFLOPs: 25.33 | +7: iteration 150440/ 173500 | consumed samples: 38512640 | consumed tokens: 78873886720 | elapsed time per iteration (s): 0.16 | learning rate: 2.789E-05 | global batch size: 256 | lm loss: 3.662585E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.784 | TFLOPs: 24.51 | +7: iteration 150450/ 173500 | consumed samples: 38515200 | consumed tokens: 78879129600 | elapsed time per iteration (s): 0.16 | learning rate: 2.788E-05 | global batch size: 256 | lm loss: 3.656610E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.368 | TFLOPs: 25.49 | +7: iteration 150460/ 173500 | consumed samples: 38517760 | consumed tokens: 78884372480 | elapsed time per iteration (s): 0.16 | learning rate: 2.787E-05 | global batch size: 256 | lm loss: 3.663727E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.473 | TFLOPs: 25.12 | +7: iteration 150470/ 173500 | consumed samples: 38520320 | consumed tokens: 78889615360 | elapsed time per iteration (s): 0.16 | learning rate: 2.787E-05 | global batch size: 256 | lm loss: 3.654354E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.595 | TFLOPs: 24.82 | +7: iteration 150480/ 173500 | consumed samples: 38522880 | consumed tokens: 78894858240 | elapsed time per iteration (s): 0.16 | learning rate: 2.786E-05 | global batch size: 256 | lm loss: 3.671300E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.348 | TFLOPs: 25.38 | +7: iteration 150490/ 173500 | consumed samples: 38525440 | consumed tokens: 78900101120 | elapsed time per iteration (s): 0.16 | learning rate: 2.785E-05 | global batch size: 256 | lm loss: 3.657612E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.966 | TFLOPs: 24.48 | +7: iteration 150500/ 173500 | consumed samples: 38528000 | consumed tokens: 78905344000 | elapsed time per iteration (s): 0.16 | learning rate: 2.785E-05 | global batch size: 256 | lm loss: 3.668867E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.543 | TFLOPs: 25.62 | +7: iteration 150510/ 173500 | consumed samples: 38530560 | consumed tokens: 78910586880 | elapsed time per iteration (s): 0.15 | learning rate: 2.784E-05 | global batch size: 256 | lm loss: 3.676974E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.863 | TFLOPs: 25.95 | +7: iteration 150520/ 173500 | consumed samples: 38533120 | consumed tokens: 78915829760 | elapsed time per iteration (s): 0.16 | learning rate: 2.783E-05 | global batch size: 256 | lm loss: 3.668621E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.446 | TFLOPs: 25.10 | +7: iteration 150530/ 173500 | consumed samples: 38535680 | consumed tokens: 78921072640 | elapsed time per iteration (s): 0.16 | learning rate: 2.783E-05 | global batch size: 256 | lm loss: 3.661970E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.927 | TFLOPs: 24.97 | +7: iteration 150540/ 173500 | consumed samples: 38538240 | consumed tokens: 78926315520 | elapsed time per iteration (s): 0.15 | learning rate: 2.782E-05 | global batch size: 256 | lm loss: 3.663788E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.267 | TFLOPs: 26.07 | +7: iteration 150550/ 173500 | consumed samples: 38540800 | consumed tokens: 78931558400 | elapsed time per iteration (s): 0.16 | learning rate: 2.781E-05 | global batch size: 256 | lm loss: 3.671749E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1581.292 | TFLOPs: 24.80 | +7: iteration 150560/ 173500 | consumed samples: 38543360 | consumed tokens: 78936801280 | elapsed time per iteration (s): 0.16 | learning rate: 2.781E-05 | global batch size: 256 | lm loss: 3.685022E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.692 | TFLOPs: 25.53 | +7: iteration 150570/ 173500 | consumed samples: 38545920 | consumed tokens: 78942044160 | elapsed time per iteration (s): 0.16 | learning rate: 2.780E-05 | global batch size: 256 | lm loss: 3.667624E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.384 | TFLOPs: 25.87 | +7: iteration 150580/ 173500 | consumed samples: 38548480 | consumed tokens: 78947287040 | elapsed time per iteration (s): 0.16 | learning rate: 2.779E-05 | global batch size: 256 | lm loss: 3.671306E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.561 | TFLOPs: 25.13 | +7: iteration 150590/ 173500 | consumed samples: 38551040 | consumed tokens: 78952529920 | elapsed time per iteration (s): 0.16 | learning rate: 2.779E-05 | global batch size: 256 | lm loss: 3.670060E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.643 | TFLOPs: 25.37 | +7: iteration 150600/ 173500 | consumed samples: 38553600 | consumed tokens: 78957772800 | elapsed time per iteration (s): 0.16 | learning rate: 2.778E-05 | global batch size: 256 | lm loss: 3.667334E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.340 | TFLOPs: 25.32 | +7: iteration 150610/ 173500 | consumed samples: 38556160 | consumed tokens: 78963015680 | elapsed time per iteration (s): 0.16 | learning rate: 2.777E-05 | global batch size: 256 | lm loss: 3.670334E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.170 | TFLOPs: 25.25 | +7: iteration 150620/ 173500 | consumed samples: 38558720 | consumed tokens: 78968258560 | elapsed time per iteration (s): 0.16 | learning rate: 2.777E-05 | global batch size: 256 | lm loss: 3.662233E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.697 | TFLOPs: 25.07 | +7: iteration 150630/ 173500 | consumed samples: 38561280 | consumed tokens: 78973501440 | elapsed time per iteration (s): 0.16 | learning rate: 2.776E-05 | global batch size: 256 | lm loss: 3.658771E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.163 | TFLOPs: 25.22 | +7: iteration 150640/ 173500 | consumed samples: 38563840 | consumed tokens: 78978744320 | elapsed time per iteration (s): 0.16 | learning rate: 2.775E-05 | global batch size: 256 | lm loss: 3.653574E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.493 | TFLOPs: 25.21 | +7: iteration 150650/ 173500 | consumed samples: 38566400 | consumed tokens: 78983987200 | elapsed time per iteration (s): 0.16 | learning rate: 2.775E-05 | global batch size: 256 | lm loss: 3.668681E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.158 | TFLOPs: 24.84 | +7: iteration 150660/ 173500 | consumed samples: 38568960 | consumed tokens: 78989230080 | elapsed time per iteration (s): 0.16 | learning rate: 2.774E-05 | global batch size: 256 | lm loss: 3.665445E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.489 | TFLOPs: 25.13 | +7: iteration 150670/ 173500 | consumed samples: 38571520 | consumed tokens: 78994472960 | elapsed time per iteration (s): 0.16 | learning rate: 2.773E-05 | global batch size: 256 | lm loss: 3.663048E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.460 | TFLOPs: 24.99 | +7: iteration 150680/ 173500 | consumed samples: 38574080 | consumed tokens: 78999715840 | elapsed time per iteration (s): 0.16 | learning rate: 2.773E-05 | global batch size: 256 | lm loss: 3.662368E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.949 | TFLOPs: 25.50 | +7: iteration 150690/ 173500 | consumed samples: 38576640 | consumed tokens: 79004958720 | elapsed time per iteration (s): 0.16 | learning rate: 2.772E-05 | global batch size: 256 | lm loss: 3.674182E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.240 | TFLOPs: 24.92 | +7: iteration 150700/ 173500 | consumed samples: 38579200 | consumed tokens: 79010201600 | elapsed time per iteration (s): 0.16 | learning rate: 2.771E-05 | global batch size: 256 | lm loss: 3.672889E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.626 | TFLOPs: 25.85 | +7: iteration 150710/ 173500 | consumed samples: 38581760 | consumed tokens: 79015444480 | elapsed time per iteration (s): 0.16 | learning rate: 2.771E-05 | global batch size: 256 | lm loss: 3.655198E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.849 | TFLOPs: 25.04 | +7: iteration 150720/ 173500 | consumed samples: 38584320 | consumed tokens: 79020687360 | elapsed time per iteration (s): 0.16 | learning rate: 2.770E-05 | global batch size: 256 | lm loss: 3.669368E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.970 | TFLOPs: 25.45 | +7: iteration 150730/ 173500 | consumed samples: 38586880 | consumed tokens: 79025930240 | elapsed time per iteration (s): 0.16 | learning rate: 2.769E-05 | global batch size: 256 | lm loss: 3.666584E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.413 | TFLOPs: 25.38 | +7: iteration 150740/ 173500 | consumed samples: 38589440 | consumed tokens: 79031173120 | elapsed time per iteration (s): 0.16 | learning rate: 2.769E-05 | global batch size: 256 | lm loss: 3.649774E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.191 | TFLOPs: 25.85 | +7: iteration 150750/ 173500 | consumed samples: 38592000 | consumed tokens: 79036416000 | elapsed time per iteration (s): 0.16 | learning rate: 2.768E-05 | global batch size: 256 | lm loss: 3.656526E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.327 | TFLOPs: 25.52 | +7: iteration 150760/ 173500 | consumed samples: 38594560 | consumed tokens: 79041658880 | elapsed time per iteration (s): 0.16 | learning rate: 2.767E-05 | global batch size: 256 | lm loss: 3.666125E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.100 | TFLOPs: 25.42 | +7: iteration 150770/ 173500 | consumed samples: 38597120 | consumed tokens: 79046901760 | elapsed time per iteration (s): 0.17 | learning rate: 2.767E-05 | global batch size: 256 | lm loss: 3.665686E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.805 | TFLOPs: 24.04 | +7: iteration 150780/ 173500 | consumed samples: 38599680 | consumed tokens: 79052144640 | elapsed time per iteration (s): 0.16 | learning rate: 2.766E-05 | global batch size: 256 | lm loss: 3.645487E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.924 | TFLOPs: 24.40 | +7: iteration 150790/ 173500 | consumed samples: 38602240 | consumed tokens: 79057387520 | elapsed time per iteration (s): 0.16 | learning rate: 2.765E-05 | global batch size: 256 | lm loss: 3.666001E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.612 | TFLOPs: 25.13 | +7: iteration 150800/ 173500 | consumed samples: 38604800 | consumed tokens: 79062630400 | elapsed time per iteration (s): 0.16 | learning rate: 2.765E-05 | global batch size: 256 | lm loss: 3.673278E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.278 | TFLOPs: 25.06 | +7: iteration 150810/ 173500 | consumed samples: 38607360 | consumed tokens: 79067873280 | elapsed time per iteration (s): 0.16 | learning rate: 2.764E-05 | global batch size: 256 | lm loss: 3.668178E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.836 | TFLOPs: 25.31 | +7: iteration 150820/ 173500 | consumed samples: 38609920 | consumed tokens: 79073116160 | elapsed time per iteration (s): 0.15 | learning rate: 2.763E-05 | global batch size: 256 | lm loss: 3.653521E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.913 | TFLOPs: 26.06 | +7: iteration 150830/ 173500 | consumed samples: 38612480 | consumed tokens: 79078359040 | elapsed time per iteration (s): 0.16 | learning rate: 2.763E-05 | global batch size: 256 | lm loss: 3.663009E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.135 | TFLOPs: 25.72 | +7: iteration 150840/ 173500 | consumed samples: 38615040 | consumed tokens: 79083601920 | elapsed time per iteration (s): 0.16 | learning rate: 2.762E-05 | global batch size: 256 | lm loss: 3.659709E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.892 | TFLOPs: 24.65 | +7: iteration 150850/ 173500 | consumed samples: 38617600 | consumed tokens: 79088844800 | elapsed time per iteration (s): 0.16 | learning rate: 2.761E-05 | global batch size: 256 | lm loss: 3.675418E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.737 | TFLOPs: 25.84 | +7: iteration 150860/ 173500 | consumed samples: 38620160 | consumed tokens: 79094087680 | elapsed time per iteration (s): 0.16 | learning rate: 2.761E-05 | global batch size: 256 | lm loss: 3.671067E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.756 | TFLOPs: 24.92 | +7: iteration 150870/ 173500 | consumed samples: 38622720 | consumed tokens: 79099330560 | elapsed time per iteration (s): 0.16 | learning rate: 2.760E-05 | global batch size: 256 | lm loss: 3.680921E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.809 | TFLOPs: 25.51 | +7: iteration 150880/ 173500 | consumed samples: 38625280 | consumed tokens: 79104573440 | elapsed time per iteration (s): 0.16 | learning rate: 2.759E-05 | global batch size: 256 | lm loss: 3.682888E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.096 | TFLOPs: 25.89 | +7: iteration 150890/ 173500 | consumed samples: 38627840 | consumed tokens: 79109816320 | elapsed time per iteration (s): 0.16 | learning rate: 2.759E-05 | global batch size: 256 | lm loss: 3.671141E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.278 | TFLOPs: 25.54 | +7: iteration 150900/ 173500 | consumed samples: 38630400 | consumed tokens: 79115059200 | elapsed time per iteration (s): 0.15 | learning rate: 2.758E-05 | global batch size: 256 | lm loss: 3.662710E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.438 | TFLOPs: 26.12 | +7: iteration 150910/ 173500 | consumed samples: 38632960 | consumed tokens: 79120302080 | elapsed time per iteration (s): 0.16 | learning rate: 2.757E-05 | global batch size: 256 | lm loss: 3.669141E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.397 | TFLOPs: 25.49 | +7: iteration 150920/ 173500 | consumed samples: 38635520 | consumed tokens: 79125544960 | elapsed time per iteration (s): 0.16 | learning rate: 2.757E-05 | global batch size: 256 | lm loss: 3.671754E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.242 | TFLOPs: 24.88 | +7: iteration 150930/ 173500 | consumed samples: 38638080 | consumed tokens: 79130787840 | elapsed time per iteration (s): 0.16 | learning rate: 2.756E-05 | global batch size: 256 | lm loss: 3.668989E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.076 | TFLOPs: 25.12 | +7: iteration 150940/ 173500 | consumed samples: 38640640 | consumed tokens: 79136030720 | elapsed time per iteration (s): 0.16 | learning rate: 2.755E-05 | global batch size: 256 | lm loss: 3.653203E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.229 | TFLOPs: 25.47 | +7: iteration 150950/ 173500 | consumed samples: 38643200 | consumed tokens: 79141273600 | elapsed time per iteration (s): 0.15 | learning rate: 2.755E-05 | global batch size: 256 | lm loss: 3.662399E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.237 | TFLOPs: 25.94 | +7: iteration 150960/ 173500 | consumed samples: 38645760 | consumed tokens: 79146516480 | elapsed time per iteration (s): 0.16 | learning rate: 2.754E-05 | global batch size: 256 | lm loss: 3.652353E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.796 | TFLOPs: 25.83 | +7: iteration 150970/ 173500 | consumed samples: 38648320 | consumed tokens: 79151759360 | elapsed time per iteration (s): 0.16 | learning rate: 2.753E-05 | global batch size: 256 | lm loss: 3.661101E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.829 | TFLOPs: 24.41 | +7: iteration 150980/ 173500 | consumed samples: 38650880 | consumed tokens: 79157002240 | elapsed time per iteration (s): 0.16 | learning rate: 2.753E-05 | global batch size: 256 | lm loss: 3.668726E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.974 | TFLOPs: 25.25 | +7: iteration 150990/ 173500 | consumed samples: 38653440 | consumed tokens: 79162245120 | elapsed time per iteration (s): 0.17 | learning rate: 2.752E-05 | global batch size: 256 | lm loss: 3.666703E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1542.941 | TFLOPs: 24.20 | +7: iteration 151000/ 173500 | consumed samples: 38656000 | consumed tokens: 79167488000 | elapsed time per iteration (s): 0.16 | learning rate: 2.751E-05 | global batch size: 256 | lm loss: 3.654474E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.771 | TFLOPs: 25.61 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 151000 | lm loss value: 3.843715E+00 | lm loss PPL: 4.669862E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 151000 to checkpoints_44m91b100m +0: [2023-03-17 06:49:28,901] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step151000 is begin to save! +0: [2023-03-17 06:49:28,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:49:28,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:49:28,967] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:49:28,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:49:28,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:49:28,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:49:28,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:49:28,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:49:28,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:49:29,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:49:29,003] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:49:29,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:49:29,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:49:29,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:49:29,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:49:29,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:49:29,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:49:29,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:49:29,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:49:29,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:49:29,037] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step151000/mp_rank_00_model_states.pt +0: [2023-03-17 06:49:29,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:49:29,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:49:29,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:49:29,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +1: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:49:29,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +1: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:49:29,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:49:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: [2023-03-17 06:49:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +7: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +5: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +4: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +6: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +3: [2023-03-17 06:49:29,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:49:29,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +2: [2023-03-17 06:49:29,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:49:29,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step151000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:49:29,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151000 is ready now! +0: successfully saved checkpoint at iteration 151000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.12 +7: iteration 151010/ 173500 | consumed samples: 38658560 | consumed tokens: 79172730880 | elapsed time per iteration (s): 0.18 | learning rate: 2.751E-05 | global batch size: 256 | lm loss: 3.666084E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1443.618 | TFLOPs: 22.64 | +7: iteration 151020/ 173500 | consumed samples: 38661120 | consumed tokens: 79177973760 | elapsed time per iteration (s): 0.16 | learning rate: 2.750E-05 | global batch size: 256 | lm loss: 3.673938E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1551.786 | TFLOPs: 24.34 | +7: iteration 151030/ 173500 | consumed samples: 38663680 | consumed tokens: 79183216640 | elapsed time per iteration (s): 0.15 | learning rate: 2.749E-05 | global batch size: 256 | lm loss: 3.668815E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.329 | TFLOPs: 26.23 | +7: iteration 151040/ 173500 | consumed samples: 38666240 | consumed tokens: 79188459520 | elapsed time per iteration (s): 0.16 | learning rate: 2.749E-05 | global batch size: 256 | lm loss: 3.661910E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.804 | TFLOPs: 25.83 | +7: iteration 151050/ 173500 | consumed samples: 38668800 | consumed tokens: 79193702400 | elapsed time per iteration (s): 0.16 | learning rate: 2.748E-05 | global batch size: 256 | lm loss: 3.661531E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.044 | TFLOPs: 25.03 | +7: iteration 151060/ 173500 | consumed samples: 38671360 | consumed tokens: 79198945280 | elapsed time per iteration (s): 0.16 | learning rate: 2.747E-05 | global batch size: 256 | lm loss: 3.658897E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.835 | TFLOPs: 25.40 | +7: iteration 151070/ 173500 | consumed samples: 38673920 | consumed tokens: 79204188160 | elapsed time per iteration (s): 0.16 | learning rate: 2.747E-05 | global batch size: 256 | lm loss: 3.669350E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.540 | TFLOPs: 25.85 | +7: iteration 151080/ 173500 | consumed samples: 38676480 | consumed tokens: 79209431040 | elapsed time per iteration (s): 0.16 | learning rate: 2.746E-05 | global batch size: 256 | lm loss: 3.655193E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.631 | TFLOPs: 25.27 | +7: iteration 151090/ 173500 | consumed samples: 38679040 | consumed tokens: 79214673920 | elapsed time per iteration (s): 0.16 | learning rate: 2.746E-05 | global batch size: 256 | lm loss: 3.663689E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.106 | TFLOPs: 25.60 | +7: iteration 151100/ 173500 | consumed samples: 38681600 | consumed tokens: 79219916800 | elapsed time per iteration (s): 0.16 | learning rate: 2.745E-05 | global batch size: 256 | lm loss: 3.656882E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.615 | TFLOPs: 25.43 | +7: iteration 151110/ 173500 | consumed samples: 38684160 | consumed tokens: 79225159680 | elapsed time per iteration (s): 0.16 | learning rate: 2.744E-05 | global batch size: 256 | lm loss: 3.652887E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.185 | TFLOPs: 25.41 | +7: iteration 151120/ 173500 | consumed samples: 38686720 | consumed tokens: 79230402560 | elapsed time per iteration (s): 0.16 | learning rate: 2.744E-05 | global batch size: 256 | lm loss: 3.671946E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.059 | TFLOPs: 25.83 | +7: iteration 151130/ 173500 | consumed samples: 38689280 | consumed tokens: 79235645440 | elapsed time per iteration (s): 0.16 | learning rate: 2.743E-05 | global batch size: 256 | lm loss: 3.670599E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.739 | TFLOPs: 24.74 | +7: iteration 151140/ 173500 | consumed samples: 38691840 | consumed tokens: 79240888320 | elapsed time per iteration (s): 0.16 | learning rate: 2.742E-05 | global batch size: 256 | lm loss: 3.657701E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.998 | TFLOPs: 25.09 | +7: iteration 151150/ 173500 | consumed samples: 38694400 | consumed tokens: 79246131200 | elapsed time per iteration (s): 0.16 | learning rate: 2.742E-05 | global batch size: 256 | lm loss: 3.679253E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.796 | TFLOPs: 25.39 | +7: iteration 151160/ 173500 | consumed samples: 38696960 | consumed tokens: 79251374080 | elapsed time per iteration (s): 0.16 | learning rate: 2.741E-05 | global batch size: 256 | lm loss: 3.665400E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.158 | TFLOPs: 25.44 | +7: iteration 151170/ 173500 | consumed samples: 38699520 | consumed tokens: 79256616960 | elapsed time per iteration (s): 0.16 | learning rate: 2.740E-05 | global batch size: 256 | lm loss: 3.663634E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.771 | TFLOPs: 25.43 | +7: iteration 151180/ 173500 | consumed samples: 38702080 | consumed tokens: 79261859840 | elapsed time per iteration (s): 0.17 | learning rate: 2.740E-05 | global batch size: 256 | lm loss: 3.665585E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.956 | TFLOPs: 23.81 | +7: iteration 151190/ 173500 | consumed samples: 38704640 | consumed tokens: 79267102720 | elapsed time per iteration (s): 0.16 | learning rate: 2.739E-05 | global batch size: 256 | lm loss: 3.665909E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.848 | TFLOPs: 25.26 | +7: iteration 151200/ 173500 | consumed samples: 38707200 | consumed tokens: 79272345600 | elapsed time per iteration (s): 0.16 | learning rate: 2.738E-05 | global batch size: 256 | lm loss: 3.649131E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.643 | TFLOPs: 25.09 | +7: iteration 151210/ 173500 | consumed samples: 38709760 | consumed tokens: 79277588480 | elapsed time per iteration (s): 0.16 | learning rate: 2.738E-05 | global batch size: 256 | lm loss: 3.662233E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.511 | TFLOPs: 25.04 | +7: iteration 151220/ 173500 | consumed samples: 38712320 | consumed tokens: 79282831360 | elapsed time per iteration (s): 0.16 | learning rate: 2.737E-05 | global batch size: 256 | lm loss: 3.664999E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.118 | TFLOPs: 25.36 | +7: iteration 151230/ 173500 | consumed samples: 38714880 | consumed tokens: 79288074240 | elapsed time per iteration (s): 0.16 | learning rate: 2.736E-05 | global batch size: 256 | lm loss: 3.657906E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.870 | TFLOPs: 25.65 | +7: iteration 151240/ 173500 | consumed samples: 38717440 | consumed tokens: 79293317120 | elapsed time per iteration (s): 0.16 | learning rate: 2.736E-05 | global batch size: 256 | lm loss: 3.680695E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.115 | TFLOPs: 25.89 | +7: iteration 151250/ 173500 | consumed samples: 38720000 | consumed tokens: 79298560000 | elapsed time per iteration (s): 0.15 | learning rate: 2.735E-05 | global batch size: 256 | lm loss: 3.650412E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.051 | TFLOPs: 26.08 | +7: iteration 151260/ 173500 | consumed samples: 38722560 | consumed tokens: 79303802880 | elapsed time per iteration (s): 0.16 | learning rate: 2.734E-05 | global batch size: 256 | lm loss: 3.671430E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.648 | TFLOPs: 25.56 | +7: iteration 151270/ 173500 | consumed samples: 38725120 | consumed tokens: 79309045760 | elapsed time per iteration (s): 0.16 | learning rate: 2.734E-05 | global batch size: 256 | lm loss: 3.672952E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.320 | TFLOPs: 25.66 | +7: iteration 151280/ 173500 | consumed samples: 38727680 | consumed tokens: 79314288640 | elapsed time per iteration (s): 0.16 | learning rate: 2.733E-05 | global batch size: 256 | lm loss: 3.668367E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.407 | TFLOPs: 25.19 | +7: iteration 151290/ 173500 | consumed samples: 38730240 | consumed tokens: 79319531520 | elapsed time per iteration (s): 0.16 | learning rate: 2.732E-05 | global batch size: 256 | lm loss: 3.669875E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.431 | TFLOPs: 25.15 | +7: iteration 151300/ 173500 | consumed samples: 38732800 | consumed tokens: 79324774400 | elapsed time per iteration (s): 0.16 | learning rate: 2.732E-05 | global batch size: 256 | lm loss: 3.656948E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.701 | TFLOPs: 25.51 | +7: iteration 151310/ 173500 | consumed samples: 38735360 | consumed tokens: 79330017280 | elapsed time per iteration (s): 0.16 | learning rate: 2.731E-05 | global batch size: 256 | lm loss: 3.654583E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.565 | TFLOPs: 25.60 | +7: iteration 151320/ 173500 | consumed samples: 38737920 | consumed tokens: 79335260160 | elapsed time per iteration (s): 0.15 | learning rate: 2.731E-05 | global batch size: 256 | lm loss: 3.658986E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.862 | TFLOPs: 26.08 | +7: iteration 151330/ 173500 | consumed samples: 38740480 | consumed tokens: 79340503040 | elapsed time per iteration (s): 0.16 | learning rate: 2.730E-05 | global batch size: 256 | lm loss: 3.666612E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.335 | TFLOPs: 25.66 | +7: iteration 151340/ 173500 | consumed samples: 38743040 | consumed tokens: 79345745920 | elapsed time per iteration (s): 0.16 | learning rate: 2.729E-05 | global batch size: 256 | lm loss: 3.663184E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.784 | TFLOPs: 25.81 | +7: iteration 151350/ 173500 | consumed samples: 38745600 | consumed tokens: 79350988800 | elapsed time per iteration (s): 0.16 | learning rate: 2.729E-05 | global batch size: 256 | lm loss: 3.652393E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.328 | TFLOPs: 25.18 | +7: iteration 151360/ 173500 | consumed samples: 38748160 | consumed tokens: 79356231680 | elapsed time per iteration (s): 0.16 | learning rate: 2.728E-05 | global batch size: 256 | lm loss: 3.665844E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.973 | TFLOPs: 25.88 | +7: iteration 151370/ 173500 | consumed samples: 38750720 | consumed tokens: 79361474560 | elapsed time per iteration (s): 0.15 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 3.668095E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.899 | TFLOPs: 26.17 | +7: iteration 151380/ 173500 | consumed samples: 38753280 | consumed tokens: 79366717440 | elapsed time per iteration (s): 0.16 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 3.663969E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.815 | TFLOPs: 25.75 | +7: iteration 151390/ 173500 | consumed samples: 38755840 | consumed tokens: 79371960320 | elapsed time per iteration (s): 0.16 | learning rate: 2.726E-05 | global batch size: 256 | lm loss: 3.660555E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.362 | TFLOPs: 24.82 | +7: iteration 151400/ 173500 | consumed samples: 38758400 | consumed tokens: 79377203200 | elapsed time per iteration (s): 0.16 | learning rate: 2.725E-05 | global batch size: 256 | lm loss: 3.672215E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.612 | TFLOPs: 24.41 | +7: iteration 151410/ 173500 | consumed samples: 38760960 | consumed tokens: 79382446080 | elapsed time per iteration (s): 0.16 | learning rate: 2.725E-05 | global batch size: 256 | lm loss: 3.662450E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.115 | TFLOPs: 24.91 | +7: iteration 151420/ 173500 | consumed samples: 38763520 | consumed tokens: 79387688960 | elapsed time per iteration (s): 0.16 | learning rate: 2.724E-05 | global batch size: 256 | lm loss: 3.645654E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.098 | TFLOPs: 24.37 | +7: iteration 151430/ 173500 | consumed samples: 38766080 | consumed tokens: 79392931840 | elapsed time per iteration (s): 0.16 | learning rate: 2.723E-05 | global batch size: 256 | lm loss: 3.663257E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.765 | TFLOPs: 25.31 | +7: iteration 151440/ 173500 | consumed samples: 38768640 | consumed tokens: 79398174720 | elapsed time per iteration (s): 0.16 | learning rate: 2.723E-05 | global batch size: 256 | lm loss: 3.666398E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.481 | TFLOPs: 24.75 | +7: iteration 151450/ 173500 | consumed samples: 38771200 | consumed tokens: 79403417600 | elapsed time per iteration (s): 0.15 | learning rate: 2.722E-05 | global batch size: 256 | lm loss: 3.660183E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.573 | TFLOPs: 26.18 | +7: iteration 151460/ 173500 | consumed samples: 38773760 | consumed tokens: 79408660480 | elapsed time per iteration (s): 0.16 | learning rate: 2.721E-05 | global batch size: 256 | lm loss: 3.652707E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.269 | TFLOPs: 25.50 | +7: iteration 151470/ 173500 | consumed samples: 38776320 | consumed tokens: 79413903360 | elapsed time per iteration (s): 0.16 | learning rate: 2.721E-05 | global batch size: 256 | lm loss: 3.657679E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.027 | TFLOPs: 24.92 | +7: iteration 151480/ 173500 | consumed samples: 38778880 | consumed tokens: 79419146240 | elapsed time per iteration (s): 0.16 | learning rate: 2.720E-05 | global batch size: 256 | lm loss: 3.658366E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.273 | TFLOPs: 24.88 | +7: iteration 151490/ 173500 | consumed samples: 38781440 | consumed tokens: 79424389120 | elapsed time per iteration (s): 0.16 | learning rate: 2.719E-05 | global batch size: 256 | lm loss: 3.668246E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.840 | TFLOPs: 25.54 | +7: iteration 151500/ 173500 | consumed samples: 38784000 | consumed tokens: 79429632000 | elapsed time per iteration (s): 0.16 | learning rate: 2.719E-05 | global batch size: 256 | lm loss: 3.664969E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.300 | TFLOPs: 24.72 | +7: iteration 151510/ 173500 | consumed samples: 38786560 | consumed tokens: 79434874880 | elapsed time per iteration (s): 0.17 | learning rate: 2.718E-05 | global batch size: 256 | lm loss: 3.670858E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.365 | TFLOPs: 24.25 | +7: iteration 151520/ 173500 | consumed samples: 38789120 | consumed tokens: 79440117760 | elapsed time per iteration (s): 0.16 | learning rate: 2.718E-05 | global batch size: 256 | lm loss: 3.653759E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.946 | TFLOPs: 24.97 | +7: iteration 151530/ 173500 | consumed samples: 38791680 | consumed tokens: 79445360640 | elapsed time per iteration (s): 0.16 | learning rate: 2.717E-05 | global batch size: 256 | lm loss: 3.666295E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.777 | TFLOPs: 25.62 | +7: iteration 151540/ 173500 | consumed samples: 38794240 | consumed tokens: 79450603520 | elapsed time per iteration (s): 0.17 | learning rate: 2.716E-05 | global batch size: 256 | lm loss: 3.654992E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1529.490 | TFLOPs: 23.99 | +7: iteration 151550/ 173500 | consumed samples: 38796800 | consumed tokens: 79455846400 | elapsed time per iteration (s): 0.16 | learning rate: 2.716E-05 | global batch size: 256 | lm loss: 3.668562E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.420 | TFLOPs: 25.47 | +7: iteration 151560/ 173500 | consumed samples: 38799360 | consumed tokens: 79461089280 | elapsed time per iteration (s): 0.16 | learning rate: 2.715E-05 | global batch size: 256 | lm loss: 3.661251E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.514 | TFLOPs: 24.57 | +7: iteration 151570/ 173500 | consumed samples: 38801920 | consumed tokens: 79466332160 | elapsed time per iteration (s): 0.16 | learning rate: 2.714E-05 | global batch size: 256 | lm loss: 3.663410E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.910 | TFLOPs: 25.47 | +7: iteration 151580/ 173500 | consumed samples: 38804480 | consumed tokens: 79471575040 | elapsed time per iteration (s): 0.15 | learning rate: 2.714E-05 | global batch size: 256 | lm loss: 3.672565E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.122 | TFLOPs: 26.22 | +7: iteration 151590/ 173500 | consumed samples: 38807040 | consumed tokens: 79476817920 | elapsed time per iteration (s): 0.16 | learning rate: 2.713E-05 | global batch size: 256 | lm loss: 3.677769E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.717 | TFLOPs: 25.10 | +7: iteration 151600/ 173500 | consumed samples: 38809600 | consumed tokens: 79482060800 | elapsed time per iteration (s): 0.16 | learning rate: 2.712E-05 | global batch size: 256 | lm loss: 3.671070E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.124 | TFLOPs: 24.72 | +7: iteration 151610/ 173500 | consumed samples: 38812160 | consumed tokens: 79487303680 | elapsed time per iteration (s): 0.16 | learning rate: 2.712E-05 | global batch size: 256 | lm loss: 3.667750E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.704 | TFLOPs: 25.20 | +7: iteration 151620/ 173500 | consumed samples: 38814720 | consumed tokens: 79492546560 | elapsed time per iteration (s): 0.15 | learning rate: 2.711E-05 | global batch size: 256 | lm loss: 3.656189E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.969 | TFLOPs: 25.94 | +7: iteration 151630/ 173500 | consumed samples: 38817280 | consumed tokens: 79497789440 | elapsed time per iteration (s): 0.16 | learning rate: 2.710E-05 | global batch size: 256 | lm loss: 3.669065E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.270 | TFLOPs: 25.86 | +7: iteration 151640/ 173500 | consumed samples: 38819840 | consumed tokens: 79503032320 | elapsed time per iteration (s): 0.16 | learning rate: 2.710E-05 | global batch size: 256 | lm loss: 3.658654E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.718 | TFLOPs: 25.13 | +7: iteration 151650/ 173500 | consumed samples: 38822400 | consumed tokens: 79508275200 | elapsed time per iteration (s): 0.16 | learning rate: 2.709E-05 | global batch size: 256 | lm loss: 3.673938E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.914 | TFLOPs: 25.48 | +7: iteration 151660/ 173500 | consumed samples: 38824960 | consumed tokens: 79513518080 | elapsed time per iteration (s): 0.15 | learning rate: 2.709E-05 | global batch size: 256 | lm loss: 3.666833E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.990 | TFLOPs: 26.21 | +7: iteration 151670/ 173500 | consumed samples: 38827520 | consumed tokens: 79518760960 | elapsed time per iteration (s): 0.15 | learning rate: 2.708E-05 | global batch size: 256 | lm loss: 3.666755E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.882 | TFLOPs: 26.16 | +7: iteration 151680/ 173500 | consumed samples: 38830080 | consumed tokens: 79524003840 | elapsed time per iteration (s): 0.16 | learning rate: 2.707E-05 | global batch size: 256 | lm loss: 3.653124E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.818 | TFLOPs: 25.75 | +7: iteration 151690/ 173500 | consumed samples: 38832640 | consumed tokens: 79529246720 | elapsed time per iteration (s): 0.15 | learning rate: 2.707E-05 | global batch size: 256 | lm loss: 3.655766E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.309 | TFLOPs: 26.24 | +7: iteration 151700/ 173500 | consumed samples: 38835200 | consumed tokens: 79534489600 | elapsed time per iteration (s): 0.17 | learning rate: 2.706E-05 | global batch size: 256 | lm loss: 3.663744E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.885 | TFLOPs: 24.13 | +7: iteration 151710/ 173500 | consumed samples: 38837760 | consumed tokens: 79539732480 | elapsed time per iteration (s): 0.17 | learning rate: 2.705E-05 | global batch size: 256 | lm loss: 3.670776E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1488.699 | TFLOPs: 23.35 | +7: iteration 151720/ 173500 | consumed samples: 38840320 | consumed tokens: 79544975360 | elapsed time per iteration (s): 0.16 | learning rate: 2.705E-05 | global batch size: 256 | lm loss: 3.659025E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.998 | TFLOPs: 25.14 | +7: iteration 151730/ 173500 | consumed samples: 38842880 | consumed tokens: 79550218240 | elapsed time per iteration (s): 0.17 | learning rate: 2.704E-05 | global batch size: 256 | lm loss: 3.678341E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1517.543 | TFLOPs: 23.80 | +7: iteration 151740/ 173500 | consumed samples: 38845440 | consumed tokens: 79555461120 | elapsed time per iteration (s): 0.16 | learning rate: 2.703E-05 | global batch size: 256 | lm loss: 3.661486E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1573.984 | TFLOPs: 24.68 | +7: iteration 151750/ 173500 | consumed samples: 38848000 | consumed tokens: 79560704000 | elapsed time per iteration (s): 0.16 | learning rate: 2.703E-05 | global batch size: 256 | lm loss: 3.670308E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.702 | TFLOPs: 25.71 | +7: iteration 151760/ 173500 | consumed samples: 38850560 | consumed tokens: 79565946880 | elapsed time per iteration (s): 0.16 | learning rate: 2.702E-05 | global batch size: 256 | lm loss: 3.654107E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.800 | TFLOPs: 25.75 | +7: iteration 151770/ 173500 | consumed samples: 38853120 | consumed tokens: 79571189760 | elapsed time per iteration (s): 0.16 | learning rate: 2.702E-05 | global batch size: 256 | lm loss: 3.671051E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.798 | TFLOPs: 25.72 | +7: iteration 151780/ 173500 | consumed samples: 38855680 | consumed tokens: 79576432640 | elapsed time per iteration (s): 0.15 | learning rate: 2.701E-05 | global batch size: 256 | lm loss: 3.664408E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.919 | TFLOPs: 26.19 | +7: iteration 151790/ 173500 | consumed samples: 38858240 | consumed tokens: 79581675520 | elapsed time per iteration (s): 0.16 | learning rate: 2.700E-05 | global batch size: 256 | lm loss: 3.659919E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.012 | TFLOPs: 24.40 | +7: iteration 151800/ 173500 | consumed samples: 38860800 | consumed tokens: 79586918400 | elapsed time per iteration (s): 0.16 | learning rate: 2.700E-05 | global batch size: 256 | lm loss: 3.670854E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.042 | TFLOPs: 25.86 | +7: iteration 151810/ 173500 | consumed samples: 38863360 | consumed tokens: 79592161280 | elapsed time per iteration (s): 0.15 | learning rate: 2.699E-05 | global batch size: 256 | lm loss: 3.668196E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.368 | TFLOPs: 26.23 | +7: iteration 151820/ 173500 | consumed samples: 38865920 | consumed tokens: 79597404160 | elapsed time per iteration (s): 0.16 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 3.663086E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.668 | TFLOPs: 25.01 | +7: iteration 151830/ 173500 | consumed samples: 38868480 | consumed tokens: 79602647040 | elapsed time per iteration (s): 0.16 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 3.667602E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.157 | TFLOPs: 25.11 | +7: iteration 151840/ 173500 | consumed samples: 38871040 | consumed tokens: 79607889920 | elapsed time per iteration (s): 0.15 | learning rate: 2.697E-05 | global batch size: 256 | lm loss: 3.670164E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.606 | TFLOPs: 26.21 | +7: iteration 151850/ 173500 | consumed samples: 38873600 | consumed tokens: 79613132800 | elapsed time per iteration (s): 0.16 | learning rate: 2.696E-05 | global batch size: 256 | lm loss: 3.664373E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.887 | TFLOPs: 25.44 | +7: iteration 151860/ 173500 | consumed samples: 38876160 | consumed tokens: 79618375680 | elapsed time per iteration (s): 0.15 | learning rate: 2.696E-05 | global batch size: 256 | lm loss: 3.661319E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.285 | TFLOPs: 26.24 | +7: iteration 151870/ 173500 | consumed samples: 38878720 | consumed tokens: 79623618560 | elapsed time per iteration (s): 0.15 | learning rate: 2.695E-05 | global batch size: 256 | lm loss: 3.669502E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.620 | TFLOPs: 26.25 | +7: iteration 151880/ 173500 | consumed samples: 38881280 | consumed tokens: 79628861440 | elapsed time per iteration (s): 0.16 | learning rate: 2.695E-05 | global batch size: 256 | lm loss: 3.669415E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.125 | TFLOPs: 25.27 | +7: iteration 151890/ 173500 | consumed samples: 38883840 | consumed tokens: 79634104320 | elapsed time per iteration (s): 0.16 | learning rate: 2.694E-05 | global batch size: 256 | lm loss: 3.666406E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.546 | TFLOPs: 25.63 | +7: iteration 151900/ 173500 | consumed samples: 38886400 | consumed tokens: 79639347200 | elapsed time per iteration (s): 0.16 | learning rate: 2.693E-05 | global batch size: 256 | lm loss: 3.667207E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.397 | TFLOPs: 25.54 | +7: iteration 151910/ 173500 | consumed samples: 38888960 | consumed tokens: 79644590080 | elapsed time per iteration (s): 0.16 | learning rate: 2.693E-05 | global batch size: 256 | lm loss: 3.678551E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.509 | TFLOPs: 25.88 | +7: iteration 151920/ 173500 | consumed samples: 38891520 | consumed tokens: 79649832960 | elapsed time per iteration (s): 0.16 | learning rate: 2.692E-05 | global batch size: 256 | lm loss: 3.652495E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.456 | TFLOPs: 24.50 | +7: iteration 151930/ 173500 | consumed samples: 38894080 | consumed tokens: 79655075840 | elapsed time per iteration (s): 0.15 | learning rate: 2.691E-05 | global batch size: 256 | lm loss: 3.644410E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.911 | TFLOPs: 26.25 | +7: iteration 151940/ 173500 | consumed samples: 38896640 | consumed tokens: 79660318720 | elapsed time per iteration (s): 0.16 | learning rate: 2.691E-05 | global batch size: 256 | lm loss: 3.657707E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.086 | TFLOPs: 25.08 | +7: iteration 151950/ 173500 | consumed samples: 38899200 | consumed tokens: 79665561600 | elapsed time per iteration (s): 0.15 | learning rate: 2.690E-05 | global batch size: 256 | lm loss: 3.656464E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.493 | TFLOPs: 25.92 | +7: iteration 151960/ 173500 | consumed samples: 38901760 | consumed tokens: 79670804480 | elapsed time per iteration (s): 0.15 | learning rate: 2.689E-05 | global batch size: 256 | lm loss: 3.665586E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.702 | TFLOPs: 25.90 | +7: iteration 151970/ 173500 | consumed samples: 38904320 | consumed tokens: 79676047360 | elapsed time per iteration (s): 0.15 | learning rate: 2.689E-05 | global batch size: 256 | lm loss: 3.657725E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.945 | TFLOPs: 25.91 | +7: iteration 151980/ 173500 | consumed samples: 38906880 | consumed tokens: 79681290240 | elapsed time per iteration (s): 0.16 | learning rate: 2.688E-05 | global batch size: 256 | lm loss: 3.672604E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.978 | TFLOPs: 25.47 | +7: iteration 151990/ 173500 | consumed samples: 38909440 | consumed tokens: 79686533120 | elapsed time per iteration (s): 0.16 | learning rate: 2.688E-05 | global batch size: 256 | lm loss: 3.663961E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.632 | TFLOPs: 25.87 | +0: [2023-03-17 06:52:07,368] [INFO] [logging.py:68:log_dist] [Rank 0] step=152000, skipped=0, lr=[2.6869667028068037e-05, 2.6869667028068037e-05, 2.6869667028068037e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 152000/ 173500 | consumed samples: 38912000 | consumed tokens: 79691776000 | elapsed time per iteration (s): 0.16 | learning rate: 2.687E-05 | global batch size: 256 | lm loss: 3.669006E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.543 | TFLOPs: 25.08 | +0: steps: 152000 loss: 3.6761 iter time (s): 0.158 samples/sec: 1616.985 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 152000 | lm loss value: 3.848666E+00 | lm loss PPL: 4.693040E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 152000 to checkpoints_44m91b100m +0: [2023-03-17 06:52:07,441] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step152000 is begin to save! +0: [2023-03-17 06:52:07,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:52:07,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:52:07,506] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:52:07,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:52:07,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:52:07,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:52:07,523] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:52:07,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:52:07,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:52:07,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:52:07,539] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:52:07,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:52:07,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:52:07,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:52:07,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:52:07,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:52:07,564] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:52:07,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:52:07,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:52:07,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:52:07,573] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step152000/mp_rank_00_model_states.pt +0: [2023-03-17 06:52:07,573] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:52:07,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:52:07,592] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:52:07,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 06:52:07,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +3: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +5: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +1: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +2: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +6: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:52:07,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:52:07,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +4: [2023-03-17 06:52:07,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:52:07,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step152000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:52:07,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step152000 is ready now! +0: successfully saved checkpoint at iteration 152000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.19 +7: iteration 152010/ 173500 | consumed samples: 38914560 | consumed tokens: 79697018880 | elapsed time per iteration (s): 0.19 | learning rate: 2.686E-05 | global batch size: 256 | lm loss: 3.662743E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1381.279 | TFLOPs: 21.66 | +7: iteration 152020/ 173500 | consumed samples: 38917120 | consumed tokens: 79702261760 | elapsed time per iteration (s): 0.16 | learning rate: 2.686E-05 | global batch size: 256 | lm loss: 3.657984E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.302 | TFLOPs: 25.32 | +7: iteration 152030/ 173500 | consumed samples: 38919680 | consumed tokens: 79707504640 | elapsed time per iteration (s): 0.16 | learning rate: 2.685E-05 | global batch size: 256 | lm loss: 3.662763E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.400 | TFLOPs: 24.69 | +7: iteration 152040/ 173500 | consumed samples: 38922240 | consumed tokens: 79712747520 | elapsed time per iteration (s): 0.16 | learning rate: 2.684E-05 | global batch size: 256 | lm loss: 3.660832E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.522 | TFLOPs: 25.07 | +7: iteration 152050/ 173500 | consumed samples: 38924800 | consumed tokens: 79717990400 | elapsed time per iteration (s): 0.16 | learning rate: 2.684E-05 | global batch size: 256 | lm loss: 3.678056E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.072 | TFLOPs: 24.64 | +7: iteration 152060/ 173500 | consumed samples: 38927360 | consumed tokens: 79723233280 | elapsed time per iteration (s): 0.15 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.679019E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.533 | TFLOPs: 25.92 | +7: iteration 152070/ 173500 | consumed samples: 38929920 | consumed tokens: 79728476160 | elapsed time per iteration (s): 0.16 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.667339E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.658 | TFLOPs: 25.40 | +7: iteration 152080/ 173500 | consumed samples: 38932480 | consumed tokens: 79733719040 | elapsed time per iteration (s): 0.16 | learning rate: 2.682E-05 | global batch size: 256 | lm loss: 3.676223E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.561 | TFLOPs: 25.59 | +7: iteration 152090/ 173500 | consumed samples: 38935040 | consumed tokens: 79738961920 | elapsed time per iteration (s): 0.16 | learning rate: 2.681E-05 | global batch size: 256 | lm loss: 3.662017E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.285 | TFLOPs: 24.77 | +7: iteration 152100/ 173500 | consumed samples: 38937600 | consumed tokens: 79744204800 | elapsed time per iteration (s): 0.17 | learning rate: 2.681E-05 | global batch size: 256 | lm loss: 3.664468E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.381 | TFLOPs: 24.24 | +7: iteration 152110/ 173500 | consumed samples: 38940160 | consumed tokens: 79749447680 | elapsed time per iteration (s): 0.16 | learning rate: 2.680E-05 | global batch size: 256 | lm loss: 3.674028E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.309 | TFLOPs: 24.81 | +7: iteration 152120/ 173500 | consumed samples: 38942720 | consumed tokens: 79754690560 | elapsed time per iteration (s): 0.16 | learning rate: 2.679E-05 | global batch size: 256 | lm loss: 3.664046E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.485 | TFLOPs: 25.63 | +7: iteration 152130/ 173500 | consumed samples: 38945280 | consumed tokens: 79759933440 | elapsed time per iteration (s): 0.16 | learning rate: 2.679E-05 | global batch size: 256 | lm loss: 3.660049E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.384 | TFLOPs: 25.36 | +7: iteration 152140/ 173500 | consumed samples: 38947840 | consumed tokens: 79765176320 | elapsed time per iteration (s): 0.16 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 3.667229E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.408 | TFLOPs: 25.68 | +7: iteration 152150/ 173500 | consumed samples: 38950400 | consumed tokens: 79770419200 | elapsed time per iteration (s): 0.16 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 3.653382E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.887 | TFLOPs: 25.29 | +7: iteration 152160/ 173500 | consumed samples: 38952960 | consumed tokens: 79775662080 | elapsed time per iteration (s): 0.16 | learning rate: 2.677E-05 | global batch size: 256 | lm loss: 3.655664E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1568.498 | TFLOPs: 24.60 | +7: iteration 152170/ 173500 | consumed samples: 38955520 | consumed tokens: 79780904960 | elapsed time per iteration (s): 0.16 | learning rate: 2.676E-05 | global batch size: 256 | lm loss: 3.673232E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.452 | TFLOPs: 24.72 | +7: iteration 152180/ 173500 | consumed samples: 38958080 | consumed tokens: 79786147840 | elapsed time per iteration (s): 0.16 | learning rate: 2.676E-05 | global batch size: 256 | lm loss: 3.656947E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.159 | TFLOPs: 25.88 | +7: iteration 152190/ 173500 | consumed samples: 38960640 | consumed tokens: 79791390720 | elapsed time per iteration (s): 0.16 | learning rate: 2.675E-05 | global batch size: 256 | lm loss: 3.668891E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.991 | TFLOPs: 24.84 | +7: iteration 152200/ 173500 | consumed samples: 38963200 | consumed tokens: 79796633600 | elapsed time per iteration (s): 0.16 | learning rate: 2.674E-05 | global batch size: 256 | lm loss: 3.661638E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.220 | TFLOPs: 25.63 | +7: iteration 152210/ 173500 | consumed samples: 38965760 | consumed tokens: 79801876480 | elapsed time per iteration (s): 0.16 | learning rate: 2.674E-05 | global batch size: 256 | lm loss: 3.655959E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.185 | TFLOPs: 25.31 | +7: iteration 152220/ 173500 | consumed samples: 38968320 | consumed tokens: 79807119360 | elapsed time per iteration (s): 0.15 | learning rate: 2.673E-05 | global batch size: 256 | lm loss: 3.654120E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.500 | TFLOPs: 26.17 | +7: iteration 152230/ 173500 | consumed samples: 38970880 | consumed tokens: 79812362240 | elapsed time per iteration (s): 0.16 | learning rate: 2.673E-05 | global batch size: 256 | lm loss: 3.683168E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.308 | TFLOPs: 25.47 | +7: iteration 152240/ 173500 | consumed samples: 38973440 | consumed tokens: 79817605120 | elapsed time per iteration (s): 0.16 | learning rate: 2.672E-05 | global batch size: 256 | lm loss: 3.667945E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.080 | TFLOPs: 25.12 | +7: iteration 152250/ 173500 | consumed samples: 38976000 | consumed tokens: 79822848000 | elapsed time per iteration (s): 0.16 | learning rate: 2.671E-05 | global batch size: 256 | lm loss: 3.655642E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.632 | TFLOPs: 24.77 | +7: iteration 152260/ 173500 | consumed samples: 38978560 | consumed tokens: 79828090880 | elapsed time per iteration (s): 0.17 | learning rate: 2.671E-05 | global batch size: 256 | lm loss: 3.659441E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.636 | TFLOPs: 24.02 | +7: iteration 152270/ 173500 | consumed samples: 38981120 | consumed tokens: 79833333760 | elapsed time per iteration (s): 0.16 | learning rate: 2.670E-05 | global batch size: 256 | lm loss: 3.664082E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.793 | TFLOPs: 25.31 | +7: iteration 152280/ 173500 | consumed samples: 38983680 | consumed tokens: 79838576640 | elapsed time per iteration (s): 0.16 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 3.658455E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.434 | TFLOPs: 25.11 | +7: iteration 152290/ 173500 | consumed samples: 38986240 | consumed tokens: 79843819520 | elapsed time per iteration (s): 0.16 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 3.677924E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.300 | TFLOPs: 25.88 | +7: iteration 152300/ 173500 | consumed samples: 38988800 | consumed tokens: 79849062400 | elapsed time per iteration (s): 0.16 | learning rate: 2.668E-05 | global batch size: 256 | lm loss: 3.672197E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.880 | TFLOPs: 24.42 | +7: iteration 152310/ 173500 | consumed samples: 38991360 | consumed tokens: 79854305280 | elapsed time per iteration (s): 0.16 | learning rate: 2.668E-05 | global batch size: 256 | lm loss: 3.673995E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.621 | TFLOPs: 24.44 | +7: iteration 152320/ 173500 | consumed samples: 38993920 | consumed tokens: 79859548160 | elapsed time per iteration (s): 0.16 | learning rate: 2.667E-05 | global batch size: 256 | lm loss: 3.672889E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.220 | TFLOPs: 24.64 | +7: iteration 152330/ 173500 | consumed samples: 38996480 | consumed tokens: 79864791040 | elapsed time per iteration (s): 0.16 | learning rate: 2.666E-05 | global batch size: 256 | lm loss: 3.659289E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.982 | TFLOPs: 25.08 | +7: iteration 152340/ 173500 | consumed samples: 38999040 | consumed tokens: 79870033920 | elapsed time per iteration (s): 0.16 | learning rate: 2.666E-05 | global batch size: 256 | lm loss: 3.653010E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.864 | TFLOPs: 25.17 | +7: iteration 152350/ 173500 | consumed samples: 39001600 | consumed tokens: 79875276800 | elapsed time per iteration (s): 0.16 | learning rate: 2.665E-05 | global batch size: 256 | lm loss: 3.668013E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.552 | TFLOPs: 24.72 | +7: iteration 152360/ 173500 | consumed samples: 39004160 | consumed tokens: 79880519680 | elapsed time per iteration (s): 0.16 | learning rate: 2.664E-05 | global batch size: 256 | lm loss: 3.662652E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.915 | TFLOPs: 25.28 | +7: iteration 152370/ 173500 | consumed samples: 39006720 | consumed tokens: 79885762560 | elapsed time per iteration (s): 0.16 | learning rate: 2.664E-05 | global batch size: 256 | lm loss: 3.663968E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.563 | TFLOPs: 25.51 | +7: iteration 152380/ 173500 | consumed samples: 39009280 | consumed tokens: 79891005440 | elapsed time per iteration (s): 0.17 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 3.666676E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1528.071 | TFLOPs: 23.96 | +7: iteration 152390/ 173500 | consumed samples: 39011840 | consumed tokens: 79896248320 | elapsed time per iteration (s): 0.16 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 3.662110E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.428 | TFLOPs: 25.76 | +7: iteration 152400/ 173500 | consumed samples: 39014400 | consumed tokens: 79901491200 | elapsed time per iteration (s): 0.16 | learning rate: 2.662E-05 | global batch size: 256 | lm loss: 3.661686E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.582 | TFLOPs: 24.96 | +7: iteration 152410/ 173500 | consumed samples: 39016960 | consumed tokens: 79906734080 | elapsed time per iteration (s): 0.16 | learning rate: 2.661E-05 | global batch size: 256 | lm loss: 3.656212E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.786 | TFLOPs: 25.25 | +7: iteration 152420/ 173500 | consumed samples: 39019520 | consumed tokens: 79911976960 | elapsed time per iteration (s): 0.16 | learning rate: 2.661E-05 | global batch size: 256 | lm loss: 3.655957E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.141 | TFLOPs: 25.45 | +7: iteration 152430/ 173500 | consumed samples: 39022080 | consumed tokens: 79917219840 | elapsed time per iteration (s): 0.16 | learning rate: 2.660E-05 | global batch size: 256 | lm loss: 3.657516E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.507 | TFLOPs: 25.73 | +7: iteration 152440/ 173500 | consumed samples: 39024640 | consumed tokens: 79922462720 | elapsed time per iteration (s): 0.16 | learning rate: 2.659E-05 | global batch size: 256 | lm loss: 3.654960E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.989 | TFLOPs: 25.53 | +7: iteration 152450/ 173500 | consumed samples: 39027200 | consumed tokens: 79927705600 | elapsed time per iteration (s): 0.16 | learning rate: 2.659E-05 | global batch size: 256 | lm loss: 3.668185E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.534 | TFLOPs: 25.10 | +7: iteration 152460/ 173500 | consumed samples: 39029760 | consumed tokens: 79932948480 | elapsed time per iteration (s): 0.17 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 3.674820E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.687 | TFLOPs: 23.74 | +7: iteration 152470/ 173500 | consumed samples: 39032320 | consumed tokens: 79938191360 | elapsed time per iteration (s): 0.16 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 3.664371E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.883 | TFLOPs: 24.89 | +7: iteration 152480/ 173500 | consumed samples: 39034880 | consumed tokens: 79943434240 | elapsed time per iteration (s): 0.17 | learning rate: 2.657E-05 | global batch size: 256 | lm loss: 3.666995E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.565 | TFLOPs: 24.10 | +7: iteration 152490/ 173500 | consumed samples: 39037440 | consumed tokens: 79948677120 | elapsed time per iteration (s): 0.16 | learning rate: 2.656E-05 | global batch size: 256 | lm loss: 3.658552E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.519 | TFLOPs: 25.77 | +7: iteration 152500/ 173500 | consumed samples: 39040000 | consumed tokens: 79953920000 | elapsed time per iteration (s): 0.16 | learning rate: 2.656E-05 | global batch size: 256 | lm loss: 3.662762E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.608 | TFLOPs: 24.57 | +7: iteration 152510/ 173500 | consumed samples: 39042560 | consumed tokens: 79959162880 | elapsed time per iteration (s): 0.16 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 3.667621E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.498 | TFLOPs: 24.61 | +7: iteration 152520/ 173500 | consumed samples: 39045120 | consumed tokens: 79964405760 | elapsed time per iteration (s): 0.16 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 3.656739E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.800 | TFLOPs: 25.20 | +7: iteration 152530/ 173500 | consumed samples: 39047680 | consumed tokens: 79969648640 | elapsed time per iteration (s): 0.17 | learning rate: 2.654E-05 | global batch size: 256 | lm loss: 3.646404E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1494.612 | TFLOPs: 23.44 | +7: iteration 152540/ 173500 | consumed samples: 39050240 | consumed tokens: 79974891520 | elapsed time per iteration (s): 0.17 | learning rate: 2.653E-05 | global batch size: 256 | lm loss: 3.669918E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1545.485 | TFLOPs: 24.24 | +7: iteration 152550/ 173500 | consumed samples: 39052800 | consumed tokens: 79980134400 | elapsed time per iteration (s): 0.16 | learning rate: 2.653E-05 | global batch size: 256 | lm loss: 3.666892E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.452 | TFLOPs: 25.43 | +7: iteration 152560/ 173500 | consumed samples: 39055360 | consumed tokens: 79985377280 | elapsed time per iteration (s): 0.15 | learning rate: 2.652E-05 | global batch size: 256 | lm loss: 3.684371E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.656 | TFLOPs: 26.26 | +7: iteration 152570/ 173500 | consumed samples: 39057920 | consumed tokens: 79990620160 | elapsed time per iteration (s): 0.16 | learning rate: 2.651E-05 | global batch size: 256 | lm loss: 3.652757E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.529 | TFLOPs: 25.68 | +7: iteration 152580/ 173500 | consumed samples: 39060480 | consumed tokens: 79995863040 | elapsed time per iteration (s): 0.16 | learning rate: 2.651E-05 | global batch size: 256 | lm loss: 3.659016E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.538 | TFLOPs: 24.83 | +7: iteration 152590/ 173500 | consumed samples: 39063040 | consumed tokens: 80001105920 | elapsed time per iteration (s): 0.16 | learning rate: 2.650E-05 | global batch size: 256 | lm loss: 3.652652E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.939 | TFLOPs: 25.01 | +7: iteration 152600/ 173500 | consumed samples: 39065600 | consumed tokens: 80006348800 | elapsed time per iteration (s): 0.15 | learning rate: 2.650E-05 | global batch size: 256 | lm loss: 3.673478E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.205 | TFLOPs: 26.19 | +7: iteration 152610/ 173500 | consumed samples: 39068160 | consumed tokens: 80011591680 | elapsed time per iteration (s): 0.17 | learning rate: 2.649E-05 | global batch size: 256 | lm loss: 3.654351E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.385 | TFLOPs: 24.30 | +7: iteration 152620/ 173500 | consumed samples: 39070720 | consumed tokens: 80016834560 | elapsed time per iteration (s): 0.16 | learning rate: 2.648E-05 | global batch size: 256 | lm loss: 3.666349E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.439 | TFLOPs: 24.47 | +7: iteration 152630/ 173500 | consumed samples: 39073280 | consumed tokens: 80022077440 | elapsed time per iteration (s): 0.15 | learning rate: 2.648E-05 | global batch size: 256 | lm loss: 3.665744E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.007 | TFLOPs: 26.16 | +7: iteration 152640/ 173500 | consumed samples: 39075840 | consumed tokens: 80027320320 | elapsed time per iteration (s): 0.16 | learning rate: 2.647E-05 | global batch size: 256 | lm loss: 3.656018E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.551 | TFLOPs: 25.40 | +7: iteration 152650/ 173500 | consumed samples: 39078400 | consumed tokens: 80032563200 | elapsed time per iteration (s): 0.16 | learning rate: 2.647E-05 | global batch size: 256 | lm loss: 3.667884E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.109 | TFLOPs: 25.19 | +7: iteration 152660/ 173500 | consumed samples: 39080960 | consumed tokens: 80037806080 | elapsed time per iteration (s): 0.17 | learning rate: 2.646E-05 | global batch size: 256 | lm loss: 3.666680E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.708 | TFLOPs: 24.10 | +7: iteration 152670/ 173500 | consumed samples: 39083520 | consumed tokens: 80043048960 | elapsed time per iteration (s): 0.17 | learning rate: 2.645E-05 | global batch size: 256 | lm loss: 3.669527E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1546.949 | TFLOPs: 24.26 | +7: iteration 152680/ 173500 | consumed samples: 39086080 | consumed tokens: 80048291840 | elapsed time per iteration (s): 0.17 | learning rate: 2.645E-05 | global batch size: 256 | lm loss: 3.666261E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.448 | TFLOPs: 24.10 | +7: iteration 152690/ 173500 | consumed samples: 39088640 | consumed tokens: 80053534720 | elapsed time per iteration (s): 0.16 | learning rate: 2.644E-05 | global batch size: 256 | lm loss: 3.670319E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.221 | TFLOPs: 24.75 | +7: iteration 152700/ 173500 | consumed samples: 39091200 | consumed tokens: 80058777600 | elapsed time per iteration (s): 0.16 | learning rate: 2.643E-05 | global batch size: 256 | lm loss: 3.657214E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.366 | TFLOPs: 24.39 | +7: iteration 152710/ 173500 | consumed samples: 39093760 | consumed tokens: 80064020480 | elapsed time per iteration (s): 0.16 | learning rate: 2.643E-05 | global batch size: 256 | lm loss: 3.653453E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.451 | TFLOPs: 25.43 | +7: iteration 152720/ 173500 | consumed samples: 39096320 | consumed tokens: 80069263360 | elapsed time per iteration (s): 0.16 | learning rate: 2.642E-05 | global batch size: 256 | lm loss: 3.669963E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.332 | TFLOPs: 24.92 | +7: iteration 152730/ 173500 | consumed samples: 39098880 | consumed tokens: 80074506240 | elapsed time per iteration (s): 0.16 | learning rate: 2.642E-05 | global batch size: 256 | lm loss: 3.664671E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.797 | TFLOPs: 24.95 | +7: iteration 152740/ 173500 | consumed samples: 39101440 | consumed tokens: 80079749120 | elapsed time per iteration (s): 0.16 | learning rate: 2.641E-05 | global batch size: 256 | lm loss: 3.668325E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.677 | TFLOPs: 25.26 | +7: iteration 152750/ 173500 | consumed samples: 39104000 | consumed tokens: 80084992000 | elapsed time per iteration (s): 0.16 | learning rate: 2.640E-05 | global batch size: 256 | lm loss: 3.669749E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.543 | TFLOPs: 25.24 | +7: iteration 152760/ 173500 | consumed samples: 39106560 | consumed tokens: 80090234880 | elapsed time per iteration (s): 0.16 | learning rate: 2.640E-05 | global batch size: 256 | lm loss: 3.670007E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.563 | TFLOPs: 25.16 | +7: iteration 152770/ 173500 | consumed samples: 39109120 | consumed tokens: 80095477760 | elapsed time per iteration (s): 0.16 | learning rate: 2.639E-05 | global batch size: 256 | lm loss: 3.660579E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.897 | TFLOPs: 25.31 | +7: iteration 152780/ 173500 | consumed samples: 39111680 | consumed tokens: 80100720640 | elapsed time per iteration (s): 0.16 | learning rate: 2.639E-05 | global batch size: 256 | lm loss: 3.665714E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.679 | TFLOPs: 24.71 | +7: iteration 152790/ 173500 | consumed samples: 39114240 | consumed tokens: 80105963520 | elapsed time per iteration (s): 0.16 | learning rate: 2.638E-05 | global batch size: 256 | lm loss: 3.646676E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.750 | TFLOPs: 25.76 | +7: iteration 152800/ 173500 | consumed samples: 39116800 | consumed tokens: 80111206400 | elapsed time per iteration (s): 0.16 | learning rate: 2.637E-05 | global batch size: 256 | lm loss: 3.654479E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.782 | TFLOPs: 25.42 | +7: iteration 152810/ 173500 | consumed samples: 39119360 | consumed tokens: 80116449280 | elapsed time per iteration (s): 0.16 | learning rate: 2.637E-05 | global batch size: 256 | lm loss: 3.666406E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.074 | TFLOPs: 24.65 | +7: iteration 152820/ 173500 | consumed samples: 39121920 | consumed tokens: 80121692160 | elapsed time per iteration (s): 0.15 | learning rate: 2.636E-05 | global batch size: 256 | lm loss: 3.663263E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.490 | TFLOPs: 25.93 | +7: iteration 152830/ 173500 | consumed samples: 39124480 | consumed tokens: 80126935040 | elapsed time per iteration (s): 0.16 | learning rate: 2.636E-05 | global batch size: 256 | lm loss: 3.655563E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1558.304 | TFLOPs: 24.44 | +7: iteration 152840/ 173500 | consumed samples: 39127040 | consumed tokens: 80132177920 | elapsed time per iteration (s): 0.16 | learning rate: 2.635E-05 | global batch size: 256 | lm loss: 3.658188E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.454 | TFLOPs: 24.38 | +7: iteration 152850/ 173500 | consumed samples: 39129600 | consumed tokens: 80137420800 | elapsed time per iteration (s): 0.16 | learning rate: 2.634E-05 | global batch size: 256 | lm loss: 3.662266E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.160 | TFLOPs: 24.89 | +7: iteration 152860/ 173500 | consumed samples: 39132160 | consumed tokens: 80142663680 | elapsed time per iteration (s): 0.15 | learning rate: 2.634E-05 | global batch size: 256 | lm loss: 3.670157E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.440 | TFLOPs: 26.18 | +7: iteration 152870/ 173500 | consumed samples: 39134720 | consumed tokens: 80147906560 | elapsed time per iteration (s): 0.16 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 3.655305E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.409 | TFLOPs: 25.21 | +7: iteration 152880/ 173500 | consumed samples: 39137280 | consumed tokens: 80153149440 | elapsed time per iteration (s): 0.18 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 3.667064E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.669 | TFLOPs: 22.22 | +7: iteration 152890/ 173500 | consumed samples: 39139840 | consumed tokens: 80158392320 | elapsed time per iteration (s): 0.16 | learning rate: 2.632E-05 | global batch size: 256 | lm loss: 3.656884E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.517 | TFLOPs: 25.65 | +7: iteration 152900/ 173500 | consumed samples: 39142400 | consumed tokens: 80163635200 | elapsed time per iteration (s): 0.17 | learning rate: 2.631E-05 | global batch size: 256 | lm loss: 3.666500E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1531.146 | TFLOPs: 24.01 | +7: iteration 152910/ 173500 | consumed samples: 39144960 | consumed tokens: 80168878080 | elapsed time per iteration (s): 0.16 | learning rate: 2.631E-05 | global batch size: 256 | lm loss: 3.656846E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.707 | TFLOPs: 25.46 | +7: iteration 152920/ 173500 | consumed samples: 39147520 | consumed tokens: 80174120960 | elapsed time per iteration (s): 0.17 | learning rate: 2.630E-05 | global batch size: 256 | lm loss: 3.661167E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1533.834 | TFLOPs: 24.05 | +7: iteration 152930/ 173500 | consumed samples: 39150080 | consumed tokens: 80179363840 | elapsed time per iteration (s): 0.15 | learning rate: 2.630E-05 | global batch size: 256 | lm loss: 3.658591E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.581 | TFLOPs: 26.31 | +7: iteration 152940/ 173500 | consumed samples: 39152640 | consumed tokens: 80184606720 | elapsed time per iteration (s): 0.15 | learning rate: 2.629E-05 | global batch size: 256 | lm loss: 3.671871E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.610 | TFLOPs: 26.26 | +7: iteration 152950/ 173500 | consumed samples: 39155200 | consumed tokens: 80189849600 | elapsed time per iteration (s): 0.16 | learning rate: 2.628E-05 | global batch size: 256 | lm loss: 3.649828E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.969 | TFLOPs: 25.81 | +7: iteration 152960/ 173500 | consumed samples: 39157760 | consumed tokens: 80195092480 | elapsed time per iteration (s): 0.16 | learning rate: 2.628E-05 | global batch size: 256 | lm loss: 3.655186E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.153 | TFLOPs: 24.69 | +7: iteration 152970/ 173500 | consumed samples: 39160320 | consumed tokens: 80200335360 | elapsed time per iteration (s): 0.16 | learning rate: 2.627E-05 | global batch size: 256 | lm loss: 3.660349E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.186 | TFLOPs: 25.80 | +7: iteration 152980/ 173500 | consumed samples: 39162880 | consumed tokens: 80205578240 | elapsed time per iteration (s): 0.16 | learning rate: 2.626E-05 | global batch size: 256 | lm loss: 3.668487E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.965 | TFLOPs: 24.76 | +7: iteration 152990/ 173500 | consumed samples: 39165440 | consumed tokens: 80210821120 | elapsed time per iteration (s): 0.16 | learning rate: 2.626E-05 | global batch size: 256 | lm loss: 3.673888E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.395 | TFLOPs: 25.82 | +7: iteration 153000/ 173500 | consumed samples: 39168000 | consumed tokens: 80216064000 | elapsed time per iteration (s): 0.16 | learning rate: 2.625E-05 | global batch size: 256 | lm loss: 3.680415E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.962 | TFLOPs: 25.23 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 153000 | lm loss value: 3.857465E+00 | lm loss PPL: 4.734519E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 153000 to checkpoints_44m91b100m +0: [2023-03-17 06:54:47,986] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step153000 is begin to save! +0: [2023-03-17 06:54:47,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:54:48,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:54:48,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:54:48,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:54:48,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:54:48,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:54:48,070] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:54:48,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:54:48,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:54:48,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:54:48,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:54:48,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:54:48,094] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:54:48,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:54:48,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:54:48,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:54:48,110] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:54:48,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:54:48,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:54:48,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:54:48,120] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step153000/mp_rank_00_model_states.pt +0: [2023-03-17 06:54:48,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:54:48,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:54:48,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:54:48,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,157] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,157] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +6: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:54:48,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +7: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +5: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +1: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +3: [2023-03-17 06:54:48,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:54:48,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:54:48,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +4: [2023-03-17 06:54:48,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:54:48,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:54:48,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:54:48,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step153000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 06:54:48,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step153000 is ready now! +0: successfully saved checkpoint at iteration 153000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 195.66 +7: iteration 153010/ 173500 | consumed samples: 39170560 | consumed tokens: 80221306880 | elapsed time per iteration (s): 0.18 | learning rate: 2.625E-05 | global batch size: 256 | lm loss: 3.661304E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.328 | TFLOPs: 22.21 | +7: iteration 153020/ 173500 | consumed samples: 39173120 | consumed tokens: 80226549760 | elapsed time per iteration (s): 0.16 | learning rate: 2.624E-05 | global batch size: 256 | lm loss: 3.658834E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.035 | TFLOPs: 25.58 | +7: iteration 153030/ 173500 | consumed samples: 39175680 | consumed tokens: 80231792640 | elapsed time per iteration (s): 0.16 | learning rate: 2.623E-05 | global batch size: 256 | lm loss: 3.660660E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.339 | TFLOPs: 25.76 | +7: iteration 153040/ 173500 | consumed samples: 39178240 | consumed tokens: 80237035520 | elapsed time per iteration (s): 0.16 | learning rate: 2.623E-05 | global batch size: 256 | lm loss: 3.668402E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.952 | TFLOPs: 25.14 | +7: iteration 153050/ 173500 | consumed samples: 39180800 | consumed tokens: 80242278400 | elapsed time per iteration (s): 0.17 | learning rate: 2.622E-05 | global batch size: 256 | lm loss: 3.660552E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1539.640 | TFLOPs: 24.15 | +7: iteration 153060/ 173500 | consumed samples: 39183360 | consumed tokens: 80247521280 | elapsed time per iteration (s): 0.16 | learning rate: 2.622E-05 | global batch size: 256 | lm loss: 3.663891E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.289 | TFLOPs: 25.88 | +7: iteration 153070/ 173500 | consumed samples: 39185920 | consumed tokens: 80252764160 | elapsed time per iteration (s): 0.15 | learning rate: 2.621E-05 | global batch size: 256 | lm loss: 3.665244E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.169 | TFLOPs: 26.13 | +7: iteration 153080/ 173500 | consumed samples: 39188480 | consumed tokens: 80258007040 | elapsed time per iteration (s): 0.17 | learning rate: 2.620E-05 | global batch size: 256 | lm loss: 3.660713E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1474.081 | TFLOPs: 23.12 | +7: iteration 153090/ 173500 | consumed samples: 39191040 | consumed tokens: 80263249920 | elapsed time per iteration (s): 0.16 | learning rate: 2.620E-05 | global batch size: 256 | lm loss: 3.661871E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.771 | TFLOPs: 25.86 | +7: iteration 153100/ 173500 | consumed samples: 39193600 | consumed tokens: 80268492800 | elapsed time per iteration (s): 0.16 | learning rate: 2.619E-05 | global batch size: 256 | lm loss: 3.682045E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.826 | TFLOPs: 25.20 | +7: iteration 153110/ 173500 | consumed samples: 39196160 | consumed tokens: 80273735680 | elapsed time per iteration (s): 0.15 | learning rate: 2.619E-05 | global batch size: 256 | lm loss: 3.671024E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.063 | TFLOPs: 26.14 | +7: iteration 153120/ 173500 | consumed samples: 39198720 | consumed tokens: 80278978560 | elapsed time per iteration (s): 0.15 | learning rate: 2.618E-05 | global batch size: 256 | lm loss: 3.663682E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.918 | TFLOPs: 26.14 | +7: iteration 153130/ 173500 | consumed samples: 39201280 | consumed tokens: 80284221440 | elapsed time per iteration (s): 0.16 | learning rate: 2.617E-05 | global batch size: 256 | lm loss: 3.663989E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.064 | TFLOPs: 25.38 | +7: iteration 153140/ 173500 | consumed samples: 39203840 | consumed tokens: 80289464320 | elapsed time per iteration (s): 0.16 | learning rate: 2.617E-05 | global batch size: 256 | lm loss: 3.672563E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.193 | TFLOPs: 24.97 | +7: iteration 153150/ 173500 | consumed samples: 39206400 | consumed tokens: 80294707200 | elapsed time per iteration (s): 0.16 | learning rate: 2.616E-05 | global batch size: 256 | lm loss: 3.671096E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.289 | TFLOPs: 25.80 | +7: iteration 153160/ 173500 | consumed samples: 39208960 | consumed tokens: 80299950080 | elapsed time per iteration (s): 0.16 | learning rate: 2.616E-05 | global batch size: 256 | lm loss: 3.663712E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.842 | TFLOPs: 25.32 | +7: iteration 153170/ 173500 | consumed samples: 39211520 | consumed tokens: 80305192960 | elapsed time per iteration (s): 0.16 | learning rate: 2.615E-05 | global batch size: 256 | lm loss: 3.656952E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.083 | TFLOPs: 25.36 | +7: iteration 153180/ 173500 | consumed samples: 39214080 | consumed tokens: 80310435840 | elapsed time per iteration (s): 0.16 | learning rate: 2.614E-05 | global batch size: 256 | lm loss: 3.653542E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.179 | TFLOPs: 24.47 | +7: iteration 153190/ 173500 | consumed samples: 39216640 | consumed tokens: 80315678720 | elapsed time per iteration (s): 0.15 | learning rate: 2.614E-05 | global batch size: 256 | lm loss: 3.664177E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.452 | TFLOPs: 26.15 | +7: iteration 153200/ 173500 | consumed samples: 39219200 | consumed tokens: 80320921600 | elapsed time per iteration (s): 0.16 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 3.665368E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.620 | TFLOPs: 25.27 | +7: iteration 153210/ 173500 | consumed samples: 39221760 | consumed tokens: 80326164480 | elapsed time per iteration (s): 0.16 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 3.656016E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.014 | TFLOPs: 24.87 | +7: iteration 153220/ 173500 | consumed samples: 39224320 | consumed tokens: 80331407360 | elapsed time per iteration (s): 0.15 | learning rate: 2.612E-05 | global batch size: 256 | lm loss: 3.654000E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.256 | TFLOPs: 26.08 | +7: iteration 153230/ 173500 | consumed samples: 39226880 | consumed tokens: 80336650240 | elapsed time per iteration (s): 0.16 | learning rate: 2.611E-05 | global batch size: 256 | lm loss: 3.662193E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.903 | TFLOPs: 25.61 | +7: iteration 153240/ 173500 | consumed samples: 39229440 | consumed tokens: 80341893120 | elapsed time per iteration (s): 0.16 | learning rate: 2.611E-05 | global batch size: 256 | lm loss: 3.660545E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.281 | TFLOPs: 25.66 | +7: iteration 153250/ 173500 | consumed samples: 39232000 | consumed tokens: 80347136000 | elapsed time per iteration (s): 0.15 | learning rate: 2.610E-05 | global batch size: 256 | lm loss: 3.655724E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.601 | TFLOPs: 26.18 | +7: iteration 153260/ 173500 | consumed samples: 39234560 | consumed tokens: 80352378880 | elapsed time per iteration (s): 0.16 | learning rate: 2.610E-05 | global batch size: 256 | lm loss: 3.665727E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.836 | TFLOPs: 25.64 | +7: iteration 153270/ 173500 | consumed samples: 39237120 | consumed tokens: 80357621760 | elapsed time per iteration (s): 0.15 | learning rate: 2.609E-05 | global batch size: 256 | lm loss: 3.665839E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.293 | TFLOPs: 26.08 | +7: iteration 153280/ 173500 | consumed samples: 39239680 | consumed tokens: 80362864640 | elapsed time per iteration (s): 0.16 | learning rate: 2.609E-05 | global batch size: 256 | lm loss: 3.667143E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.904 | TFLOPs: 25.12 | +7: iteration 153290/ 173500 | consumed samples: 39242240 | consumed tokens: 80368107520 | elapsed time per iteration (s): 0.15 | learning rate: 2.608E-05 | global batch size: 256 | lm loss: 3.663847E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.134 | TFLOPs: 26.08 | +7: iteration 153300/ 173500 | consumed samples: 39244800 | consumed tokens: 80373350400 | elapsed time per iteration (s): 0.15 | learning rate: 2.607E-05 | global batch size: 256 | lm loss: 3.657015E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.586 | TFLOPs: 26.10 | +7: iteration 153310/ 173500 | consumed samples: 39247360 | consumed tokens: 80378593280 | elapsed time per iteration (s): 0.16 | learning rate: 2.607E-05 | global batch size: 256 | lm loss: 3.666853E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.327 | TFLOPs: 24.49 | +7: iteration 153320/ 173500 | consumed samples: 39249920 | consumed tokens: 80383836160 | elapsed time per iteration (s): 0.17 | learning rate: 2.606E-05 | global batch size: 256 | lm loss: 3.671325E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1541.711 | TFLOPs: 24.18 | +7: iteration 153330/ 173500 | consumed samples: 39252480 | consumed tokens: 80389079040 | elapsed time per iteration (s): 0.16 | learning rate: 2.606E-05 | global batch size: 256 | lm loss: 3.664511E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.789 | TFLOPs: 25.81 | +7: iteration 153340/ 173500 | consumed samples: 39255040 | consumed tokens: 80394321920 | elapsed time per iteration (s): 0.15 | learning rate: 2.605E-05 | global batch size: 256 | lm loss: 3.688382E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.601 | TFLOPs: 25.98 | +7: iteration 153350/ 173500 | consumed samples: 39257600 | consumed tokens: 80399564800 | elapsed time per iteration (s): 0.16 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 3.666585E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.814 | TFLOPs: 25.78 | +7: iteration 153360/ 173500 | consumed samples: 39260160 | consumed tokens: 80404807680 | elapsed time per iteration (s): 0.16 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 3.674646E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.827 | TFLOPs: 25.10 | +7: iteration 153370/ 173500 | consumed samples: 39262720 | consumed tokens: 80410050560 | elapsed time per iteration (s): 0.16 | learning rate: 2.603E-05 | global batch size: 256 | lm loss: 3.659650E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.034 | TFLOPs: 25.85 | +7: iteration 153380/ 173500 | consumed samples: 39265280 | consumed tokens: 80415293440 | elapsed time per iteration (s): 0.16 | learning rate: 2.603E-05 | global batch size: 256 | lm loss: 3.662136E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.238 | TFLOPs: 25.86 | +7: iteration 153390/ 173500 | consumed samples: 39267840 | consumed tokens: 80420536320 | elapsed time per iteration (s): 0.16 | learning rate: 2.602E-05 | global batch size: 256 | lm loss: 3.665636E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.746 | TFLOPs: 25.75 | +7: iteration 153400/ 173500 | consumed samples: 39270400 | consumed tokens: 80425779200 | elapsed time per iteration (s): 0.16 | learning rate: 2.601E-05 | global batch size: 256 | lm loss: 3.655671E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.266 | TFLOPs: 25.71 | +7: iteration 153410/ 173500 | consumed samples: 39272960 | consumed tokens: 80431022080 | elapsed time per iteration (s): 0.16 | learning rate: 2.601E-05 | global batch size: 256 | lm loss: 3.673074E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.494 | TFLOPs: 25.65 | +7: iteration 153420/ 173500 | consumed samples: 39275520 | consumed tokens: 80436264960 | elapsed time per iteration (s): 0.16 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 3.663940E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.589 | TFLOPs: 25.52 | +7: iteration 153430/ 173500 | consumed samples: 39278080 | consumed tokens: 80441507840 | elapsed time per iteration (s): 0.15 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 3.660741E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.352 | TFLOPs: 26.04 | +7: iteration 153440/ 173500 | consumed samples: 39280640 | consumed tokens: 80446750720 | elapsed time per iteration (s): 0.16 | learning rate: 2.599E-05 | global batch size: 256 | lm loss: 3.656314E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.464 | TFLOPs: 25.63 | +7: iteration 153450/ 173500 | consumed samples: 39283200 | consumed tokens: 80451993600 | elapsed time per iteration (s): 0.16 | learning rate: 2.598E-05 | global batch size: 256 | lm loss: 3.678529E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.631 | TFLOPs: 25.68 | +7: iteration 153460/ 173500 | consumed samples: 39285760 | consumed tokens: 80457236480 | elapsed time per iteration (s): 0.16 | learning rate: 2.598E-05 | global batch size: 256 | lm loss: 3.668342E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.040 | TFLOPs: 25.19 | +7: iteration 153470/ 173500 | consumed samples: 39288320 | consumed tokens: 80462479360 | elapsed time per iteration (s): 0.16 | learning rate: 2.597E-05 | global batch size: 256 | lm loss: 3.669029E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.217 | TFLOPs: 25.46 | +7: iteration 153480/ 173500 | consumed samples: 39290880 | consumed tokens: 80467722240 | elapsed time per iteration (s): 0.15 | learning rate: 2.597E-05 | global batch size: 256 | lm loss: 3.652129E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.168 | TFLOPs: 26.18 | +7: iteration 153490/ 173500 | consumed samples: 39293440 | consumed tokens: 80472965120 | elapsed time per iteration (s): 0.15 | learning rate: 2.596E-05 | global batch size: 256 | lm loss: 3.661899E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.071 | TFLOPs: 26.18 | +7: iteration 153500/ 173500 | consumed samples: 39296000 | consumed tokens: 80478208000 | elapsed time per iteration (s): 0.15 | learning rate: 2.595E-05 | global batch size: 256 | lm loss: 3.650816E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.227 | TFLOPs: 26.18 | +7: iteration 153510/ 173500 | consumed samples: 39298560 | consumed tokens: 80483450880 | elapsed time per iteration (s): 0.16 | learning rate: 2.595E-05 | global batch size: 256 | lm loss: 3.662797E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.334 | TFLOPs: 25.41 | +7: iteration 153520/ 173500 | consumed samples: 39301120 | consumed tokens: 80488693760 | elapsed time per iteration (s): 0.16 | learning rate: 2.594E-05 | global batch size: 256 | lm loss: 3.672397E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.926 | TFLOPs: 25.23 | +7: iteration 153530/ 173500 | consumed samples: 39303680 | consumed tokens: 80493936640 | elapsed time per iteration (s): 0.16 | learning rate: 2.594E-05 | global batch size: 256 | lm loss: 3.671860E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.078 | TFLOPs: 25.83 | +7: iteration 153540/ 173500 | consumed samples: 39306240 | consumed tokens: 80499179520 | elapsed time per iteration (s): 0.15 | learning rate: 2.593E-05 | global batch size: 256 | lm loss: 3.679211E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.487 | TFLOPs: 26.23 | +7: iteration 153550/ 173500 | consumed samples: 39308800 | consumed tokens: 80504422400 | elapsed time per iteration (s): 0.15 | learning rate: 2.593E-05 | global batch size: 256 | lm loss: 3.644930E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.584 | TFLOPs: 26.23 | +7: iteration 153560/ 173500 | consumed samples: 39311360 | consumed tokens: 80509665280 | elapsed time per iteration (s): 0.16 | learning rate: 2.592E-05 | global batch size: 256 | lm loss: 3.656162E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.299 | TFLOPs: 25.85 | +7: iteration 153570/ 173500 | consumed samples: 39313920 | consumed tokens: 80514908160 | elapsed time per iteration (s): 0.16 | learning rate: 2.591E-05 | global batch size: 256 | lm loss: 3.659007E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.057 | TFLOPs: 25.80 | +7: iteration 153580/ 173500 | consumed samples: 39316480 | consumed tokens: 80520151040 | elapsed time per iteration (s): 0.16 | learning rate: 2.591E-05 | global batch size: 256 | lm loss: 3.665230E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.559 | TFLOPs: 25.82 | +7: iteration 153590/ 173500 | consumed samples: 39319040 | consumed tokens: 80525393920 | elapsed time per iteration (s): 0.16 | learning rate: 2.590E-05 | global batch size: 256 | lm loss: 3.656084E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.655 | TFLOPs: 25.23 | +7: iteration 153600/ 173500 | consumed samples: 39321600 | consumed tokens: 80530636800 | elapsed time per iteration (s): 0.16 | learning rate: 2.590E-05 | global batch size: 256 | lm loss: 3.666255E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.877 | TFLOPs: 25.14 | +7: iteration 153610/ 173500 | consumed samples: 39324160 | consumed tokens: 80535879680 | elapsed time per iteration (s): 0.16 | learning rate: 2.589E-05 | global batch size: 256 | lm loss: 3.665803E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.052 | TFLOPs: 25.78 | +7: iteration 153620/ 173500 | consumed samples: 39326720 | consumed tokens: 80541122560 | elapsed time per iteration (s): 0.16 | learning rate: 2.588E-05 | global batch size: 256 | lm loss: 3.664767E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.848 | TFLOPs: 25.58 | +7: iteration 153630/ 173500 | consumed samples: 39329280 | consumed tokens: 80546365440 | elapsed time per iteration (s): 0.15 | learning rate: 2.588E-05 | global batch size: 256 | lm loss: 3.664184E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.631 | TFLOPs: 26.14 | +7: iteration 153640/ 173500 | consumed samples: 39331840 | consumed tokens: 80551608320 | elapsed time per iteration (s): 0.15 | learning rate: 2.587E-05 | global batch size: 256 | lm loss: 3.659358E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.249 | TFLOPs: 26.15 | +7: iteration 153650/ 173500 | consumed samples: 39334400 | consumed tokens: 80556851200 | elapsed time per iteration (s): 0.16 | learning rate: 2.587E-05 | global batch size: 256 | lm loss: 3.645915E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.629 | TFLOPs: 25.32 | +7: iteration 153660/ 173500 | consumed samples: 39336960 | consumed tokens: 80562094080 | elapsed time per iteration (s): 0.16 | learning rate: 2.586E-05 | global batch size: 256 | lm loss: 3.663414E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.322 | TFLOPs: 25.77 | +7: iteration 153670/ 173500 | consumed samples: 39339520 | consumed tokens: 80567336960 | elapsed time per iteration (s): 0.16 | learning rate: 2.586E-05 | global batch size: 256 | lm loss: 3.653421E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.282 | TFLOPs: 24.75 | +7: iteration 153680/ 173500 | consumed samples: 39342080 | consumed tokens: 80572579840 | elapsed time per iteration (s): 0.16 | learning rate: 2.585E-05 | global batch size: 256 | lm loss: 3.658942E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.370 | TFLOPs: 25.80 | +7: iteration 153690/ 173500 | consumed samples: 39344640 | consumed tokens: 80577822720 | elapsed time per iteration (s): 0.16 | learning rate: 2.584E-05 | global batch size: 256 | lm loss: 3.671753E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.533 | TFLOPs: 25.29 | +7: iteration 153700/ 173500 | consumed samples: 39347200 | consumed tokens: 80583065600 | elapsed time per iteration (s): 0.16 | learning rate: 2.584E-05 | global batch size: 256 | lm loss: 3.666713E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.755 | TFLOPs: 25.70 | +7: iteration 153710/ 173500 | consumed samples: 39349760 | consumed tokens: 80588308480 | elapsed time per iteration (s): 0.15 | learning rate: 2.583E-05 | global batch size: 256 | lm loss: 3.672125E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.433 | TFLOPs: 26.21 | +7: iteration 153720/ 173500 | consumed samples: 39352320 | consumed tokens: 80593551360 | elapsed time per iteration (s): 0.15 | learning rate: 2.583E-05 | global batch size: 256 | lm loss: 3.671709E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.829 | TFLOPs: 26.22 | +7: iteration 153730/ 173500 | consumed samples: 39354880 | consumed tokens: 80598794240 | elapsed time per iteration (s): 0.15 | learning rate: 2.582E-05 | global batch size: 256 | lm loss: 3.644559E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.595 | TFLOPs: 26.21 | +7: iteration 153740/ 173500 | consumed samples: 39357440 | consumed tokens: 80604037120 | elapsed time per iteration (s): 0.16 | learning rate: 2.581E-05 | global batch size: 256 | lm loss: 3.662114E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.803 | TFLOPs: 25.79 | +7: iteration 153750/ 173500 | consumed samples: 39360000 | consumed tokens: 80609280000 | elapsed time per iteration (s): 0.16 | learning rate: 2.581E-05 | global batch size: 256 | lm loss: 3.657151E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.993 | TFLOPs: 25.28 | +7: iteration 153760/ 173500 | consumed samples: 39362560 | consumed tokens: 80614522880 | elapsed time per iteration (s): 0.16 | learning rate: 2.580E-05 | global batch size: 256 | lm loss: 3.665324E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.652 | TFLOPs: 25.37 | +7: iteration 153770/ 173500 | consumed samples: 39365120 | consumed tokens: 80619765760 | elapsed time per iteration (s): 0.16 | learning rate: 2.580E-05 | global batch size: 256 | lm loss: 3.664863E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.092 | TFLOPs: 25.72 | +7: iteration 153780/ 173500 | consumed samples: 39367680 | consumed tokens: 80625008640 | elapsed time per iteration (s): 0.15 | learning rate: 2.579E-05 | global batch size: 256 | lm loss: 3.680984E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.439 | TFLOPs: 25.96 | +7: iteration 153790/ 173500 | consumed samples: 39370240 | consumed tokens: 80630251520 | elapsed time per iteration (s): 0.16 | learning rate: 2.579E-05 | global batch size: 256 | lm loss: 3.667928E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.250 | TFLOPs: 25.41 | +7: iteration 153800/ 173500 | consumed samples: 39372800 | consumed tokens: 80635494400 | elapsed time per iteration (s): 0.16 | learning rate: 2.578E-05 | global batch size: 256 | lm loss: 3.662181E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.328 | TFLOPs: 25.35 | +7: iteration 153810/ 173500 | consumed samples: 39375360 | consumed tokens: 80640737280 | elapsed time per iteration (s): 0.16 | learning rate: 2.577E-05 | global batch size: 256 | lm loss: 3.664082E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.001 | TFLOPs: 25.78 | +7: iteration 153820/ 173500 | consumed samples: 39377920 | consumed tokens: 80645980160 | elapsed time per iteration (s): 0.15 | learning rate: 2.577E-05 | global batch size: 256 | lm loss: 3.667525E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.008 | TFLOPs: 26.06 | +7: iteration 153830/ 173500 | consumed samples: 39380480 | consumed tokens: 80651223040 | elapsed time per iteration (s): 0.16 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 3.660459E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.239 | TFLOPs: 25.05 | +7: iteration 153840/ 173500 | consumed samples: 39383040 | consumed tokens: 80656465920 | elapsed time per iteration (s): 0.16 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 3.675497E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.794 | TFLOPs: 25.34 | +7: iteration 153850/ 173500 | consumed samples: 39385600 | consumed tokens: 80661708800 | elapsed time per iteration (s): 0.16 | learning rate: 2.575E-05 | global batch size: 256 | lm loss: 3.668864E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.838 | TFLOPs: 25.48 | +7: iteration 153860/ 173500 | consumed samples: 39388160 | consumed tokens: 80666951680 | elapsed time per iteration (s): 0.16 | learning rate: 2.574E-05 | global batch size: 256 | lm loss: 3.652713E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.562 | TFLOPs: 25.48 | +7: iteration 153870/ 173500 | consumed samples: 39390720 | consumed tokens: 80672194560 | elapsed time per iteration (s): 0.16 | learning rate: 2.574E-05 | global batch size: 256 | lm loss: 3.661642E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.986 | TFLOPs: 25.36 | +7: iteration 153880/ 173500 | consumed samples: 39393280 | consumed tokens: 80677437440 | elapsed time per iteration (s): 0.16 | learning rate: 2.573E-05 | global batch size: 256 | lm loss: 3.675585E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.351 | TFLOPs: 25.62 | +7: iteration 153890/ 173500 | consumed samples: 39395840 | consumed tokens: 80682680320 | elapsed time per iteration (s): 0.16 | learning rate: 2.573E-05 | global batch size: 256 | lm loss: 3.657607E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.818 | TFLOPs: 25.29 | +7: iteration 153900/ 173500 | consumed samples: 39398400 | consumed tokens: 80687923200 | elapsed time per iteration (s): 0.16 | learning rate: 2.572E-05 | global batch size: 256 | lm loss: 3.658547E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.663 | TFLOPs: 24.91 | +7: iteration 153910/ 173500 | consumed samples: 39400960 | consumed tokens: 80693166080 | elapsed time per iteration (s): 0.16 | learning rate: 2.572E-05 | global batch size: 256 | lm loss: 3.655978E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.942 | TFLOPs: 25.88 | +7: iteration 153920/ 173500 | consumed samples: 39403520 | consumed tokens: 80698408960 | elapsed time per iteration (s): 0.16 | learning rate: 2.571E-05 | global batch size: 256 | lm loss: 3.665983E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.276 | TFLOPs: 25.79 | +7: iteration 153930/ 173500 | consumed samples: 39406080 | consumed tokens: 80703651840 | elapsed time per iteration (s): 0.16 | learning rate: 2.570E-05 | global batch size: 256 | lm loss: 3.671438E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.616 | TFLOPs: 25.40 | +7: iteration 153940/ 173500 | consumed samples: 39408640 | consumed tokens: 80708894720 | elapsed time per iteration (s): 0.16 | learning rate: 2.570E-05 | global batch size: 256 | lm loss: 3.669173E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.307 | TFLOPs: 25.82 | +7: iteration 153950/ 173500 | consumed samples: 39411200 | consumed tokens: 80714137600 | elapsed time per iteration (s): 0.16 | learning rate: 2.569E-05 | global batch size: 256 | lm loss: 3.662440E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.728 | TFLOPs: 25.82 | +7: iteration 153960/ 173500 | consumed samples: 39413760 | consumed tokens: 80719380480 | elapsed time per iteration (s): 0.15 | learning rate: 2.569E-05 | global batch size: 256 | lm loss: 3.665620E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.135 | TFLOPs: 26.21 | +7: iteration 153970/ 173500 | consumed samples: 39416320 | consumed tokens: 80724623360 | elapsed time per iteration (s): 0.15 | learning rate: 2.568E-05 | global batch size: 256 | lm loss: 3.675452E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.439 | TFLOPs: 26.18 | +7: iteration 153980/ 173500 | consumed samples: 39418880 | consumed tokens: 80729866240 | elapsed time per iteration (s): 0.16 | learning rate: 2.568E-05 | global batch size: 256 | lm loss: 3.670895E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.625 | TFLOPs: 25.10 | +7: iteration 153990/ 173500 | consumed samples: 39421440 | consumed tokens: 80735109120 | elapsed time per iteration (s): 0.16 | learning rate: 2.567E-05 | global batch size: 256 | lm loss: 3.658284E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.277 | TFLOPs: 25.27 | +0: [2023-03-17 06:57:25,133] [INFO] [logging.py:68:log_dist] [Rank 0] step=154000, skipped=0, lr=[2.5664028527469924e-05, 2.5664028527469924e-05, 2.5664028527469924e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 154000/ 173500 | consumed samples: 39424000 | consumed tokens: 80740352000 | elapsed time per iteration (s): 0.16 | learning rate: 2.566E-05 | global batch size: 256 | lm loss: 3.669370E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.710 | TFLOPs: 25.35 | +0: steps: 154000 loss: 3.6673 iter time (s): 0.158 samples/sec: 1622.990 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 154000 | lm loss value: 3.807232E+00 | lm loss PPL: 4.502565E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 154000 to checkpoints_44m91b100m +0: [2023-03-17 06:57:25,208] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step154000 is begin to save! +0: [2023-03-17 06:57:25,211] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_01-model_00-model_states.pt... +0: [2023-03-17 06:57:25,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_01-model_00-model_states.pt. +0: [2023-03-17 06:57:25,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_03-model_00-model_states.pt... +0: [2023-03-17 06:57:25,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_03-model_00-model_states.pt. +0: [2023-03-17 06:57:25,281] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_04-model_00-model_states.pt... +0: [2023-03-17 06:57:25,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_04-model_00-model_states.pt. +0: [2023-03-17 06:57:25,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_05-model_00-model_states.pt... +0: [2023-03-17 06:57:25,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_05-model_00-model_states.pt. +0: [2023-03-17 06:57:25,298] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_06-model_00-model_states.pt... +0: [2023-03-17 06:57:25,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_06-model_00-model_states.pt. +0: [2023-03-17 06:57:25,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_07-model_00-model_states.pt... +0: [2023-03-17 06:57:25,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_07-model_00-model_states.pt. +0: [2023-03-17 06:57:25,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_08-model_00-model_states.pt... +0: [2023-03-17 06:57:25,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_08-model_00-model_states.pt. +0: [2023-03-17 06:57:25,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_09-model_00-model_states.pt... +0: [2023-03-17 06:57:25,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_09-model_00-model_states.pt. +0: [2023-03-17 06:57:25,330] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_10-model_00-model_states.pt... +0: [2023-03-17 06:57:25,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_10-model_00-model_states.pt. +0: [2023-03-17 06:57:25,338] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/layer_12-model_00-model_states.pt... +0: [2023-03-17 06:57:25,338] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/layer_12-model_00-model_states.pt. +0: [2023-03-17 06:57:25,339] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step154000/mp_rank_00_model_states.pt +0: [2023-03-17 06:57:25,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/mp_rank_00_model_states.pt... +0: [2023-03-17 06:57:25,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/mp_rank_00_model_states.pt. +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +5: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 06:57:25,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 06:57:25,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 06:57:25,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,372] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,372] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,373] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,374] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 06:57:25,374] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +5: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +2: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 06:57:25,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +6: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +7: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +3: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 06:57:25,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +4: [2023-03-17 06:57:25,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +1: [2023-03-17 06:57:25,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 06:57:25,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step154000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 06:57:25,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step154000 is ready now! +0: successfully saved checkpoint at iteration 154000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 178.33 +7: iteration 154010/ 173500 | consumed samples: 39426560 | consumed tokens: 80745594880 | elapsed time per iteration (s): 0.18 | learning rate: 2.566E-05 | global batch size: 256 | lm loss: 3.671952E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1441.704 | TFLOPs: 22.61 | +7: iteration 154020/ 173500 | consumed samples: 39429120 | consumed tokens: 80750837760 | elapsed time per iteration (s): 0.16 | learning rate: 2.565E-05 | global batch size: 256 | lm loss: 3.674496E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.396 | TFLOPs: 25.57 | +7: iteration 154030/ 173500 | consumed samples: 39431680 | consumed tokens: 80756080640 | elapsed time per iteration (s): 0.16 | learning rate: 2.565E-05 | global batch size: 256 | lm loss: 3.661826E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.239 | TFLOPs: 25.00 | +7: iteration 154040/ 173500 | consumed samples: 39434240 | consumed tokens: 80761323520 | elapsed time per iteration (s): 0.16 | learning rate: 2.564E-05 | global batch size: 256 | lm loss: 3.660687E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.827 | TFLOPs: 25.20 | +7: iteration 154050/ 173500 | consumed samples: 39436800 | consumed tokens: 80766566400 | elapsed time per iteration (s): 0.16 | learning rate: 2.564E-05 | global batch size: 256 | lm loss: 3.674657E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.450 | TFLOPs: 25.27 | +7: iteration 154060/ 173500 | consumed samples: 39439360 | consumed tokens: 80771809280 | elapsed time per iteration (s): 0.17 | learning rate: 2.563E-05 | global batch size: 256 | lm loss: 3.664777E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.997 | TFLOPs: 24.04 | +7: iteration 154070/ 173500 | consumed samples: 39441920 | consumed tokens: 80777052160 | elapsed time per iteration (s): 0.16 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 3.672914E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.707 | TFLOPs: 25.53 | +7: iteration 154080/ 173500 | consumed samples: 39444480 | consumed tokens: 80782295040 | elapsed time per iteration (s): 0.16 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 3.661030E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.945 | TFLOPs: 25.62 | +7: iteration 154090/ 173500 | consumed samples: 39447040 | consumed tokens: 80787537920 | elapsed time per iteration (s): 0.16 | learning rate: 2.561E-05 | global batch size: 256 | lm loss: 3.670562E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.668 | TFLOPs: 25.54 | +7: iteration 154100/ 173500 | consumed samples: 39449600 | consumed tokens: 80792780800 | elapsed time per iteration (s): 0.16 | learning rate: 2.561E-05 | global batch size: 256 | lm loss: 3.671229E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.860 | TFLOPs: 25.33 | +7: iteration 154110/ 173500 | consumed samples: 39452160 | consumed tokens: 80798023680 | elapsed time per iteration (s): 0.16 | learning rate: 2.560E-05 | global batch size: 256 | lm loss: 3.672344E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.783 | TFLOPs: 25.36 | +7: iteration 154120/ 173500 | consumed samples: 39454720 | consumed tokens: 80803266560 | elapsed time per iteration (s): 0.15 | learning rate: 2.560E-05 | global batch size: 256 | lm loss: 3.674352E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.760 | TFLOPs: 26.15 | +7: iteration 154130/ 173500 | consumed samples: 39457280 | consumed tokens: 80808509440 | elapsed time per iteration (s): 0.15 | learning rate: 2.559E-05 | global batch size: 256 | lm loss: 3.649450E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.762 | TFLOPs: 26.15 | +7: iteration 154140/ 173500 | consumed samples: 39459840 | consumed tokens: 80813752320 | elapsed time per iteration (s): 0.16 | learning rate: 2.558E-05 | global batch size: 256 | lm loss: 3.650230E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.115 | TFLOPs: 25.28 | +7: iteration 154150/ 173500 | consumed samples: 39462400 | consumed tokens: 80818995200 | elapsed time per iteration (s): 0.16 | learning rate: 2.558E-05 | global batch size: 256 | lm loss: 3.654477E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.278 | TFLOPs: 24.74 | +7: iteration 154160/ 173500 | consumed samples: 39464960 | consumed tokens: 80824238080 | elapsed time per iteration (s): 0.15 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 3.669242E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.464 | TFLOPs: 26.13 | +7: iteration 154170/ 173500 | consumed samples: 39467520 | consumed tokens: 80829480960 | elapsed time per iteration (s): 0.16 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 3.662318E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.170 | TFLOPs: 25.25 | +7: iteration 154180/ 173500 | consumed samples: 39470080 | consumed tokens: 80834723840 | elapsed time per iteration (s): 0.16 | learning rate: 2.556E-05 | global batch size: 256 | lm loss: 3.666597E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.035 | TFLOPs: 25.48 | +7: iteration 154190/ 173500 | consumed samples: 39472640 | consumed tokens: 80839966720 | elapsed time per iteration (s): 0.17 | learning rate: 2.556E-05 | global batch size: 256 | lm loss: 3.668153E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1502.951 | TFLOPs: 23.57 | +7: iteration 154200/ 173500 | consumed samples: 39475200 | consumed tokens: 80845209600 | elapsed time per iteration (s): 0.16 | learning rate: 2.555E-05 | global batch size: 256 | lm loss: 3.662105E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.163 | TFLOPs: 25.67 | +7: iteration 154210/ 173500 | consumed samples: 39477760 | consumed tokens: 80850452480 | elapsed time per iteration (s): 0.16 | learning rate: 2.554E-05 | global batch size: 256 | lm loss: 3.664260E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.214 | TFLOPs: 25.74 | +7: iteration 154220/ 173500 | consumed samples: 39480320 | consumed tokens: 80855695360 | elapsed time per iteration (s): 0.16 | learning rate: 2.554E-05 | global batch size: 256 | lm loss: 3.654171E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.347 | TFLOPs: 25.51 | +7: iteration 154230/ 173500 | consumed samples: 39482880 | consumed tokens: 80860938240 | elapsed time per iteration (s): 0.16 | learning rate: 2.553E-05 | global batch size: 256 | lm loss: 3.660897E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.524 | TFLOPs: 25.52 | +7: iteration 154240/ 173500 | consumed samples: 39485440 | consumed tokens: 80866181120 | elapsed time per iteration (s): 0.16 | learning rate: 2.553E-05 | global batch size: 256 | lm loss: 3.667345E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.379 | TFLOPs: 25.80 | +7: iteration 154250/ 173500 | consumed samples: 39488000 | consumed tokens: 80871424000 | elapsed time per iteration (s): 0.16 | learning rate: 2.552E-05 | global batch size: 256 | lm loss: 3.668398E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.501 | TFLOPs: 24.96 | +7: iteration 154260/ 173500 | consumed samples: 39490560 | consumed tokens: 80876666880 | elapsed time per iteration (s): 0.16 | learning rate: 2.552E-05 | global batch size: 256 | lm loss: 3.659461E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.846 | TFLOPs: 24.95 | +7: iteration 154270/ 173500 | consumed samples: 39493120 | consumed tokens: 80881909760 | elapsed time per iteration (s): 0.16 | learning rate: 2.551E-05 | global batch size: 256 | lm loss: 3.658947E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.340 | TFLOPs: 25.10 | +7: iteration 154280/ 173500 | consumed samples: 39495680 | consumed tokens: 80887152640 | elapsed time per iteration (s): 0.15 | learning rate: 2.550E-05 | global batch size: 256 | lm loss: 3.655991E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.449 | TFLOPs: 26.21 | +7: iteration 154290/ 173500 | consumed samples: 39498240 | consumed tokens: 80892395520 | elapsed time per iteration (s): 0.15 | learning rate: 2.550E-05 | global batch size: 256 | lm loss: 3.662675E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.340 | TFLOPs: 26.23 | +7: iteration 154300/ 173500 | consumed samples: 39500800 | consumed tokens: 80897638400 | elapsed time per iteration (s): 0.17 | learning rate: 2.549E-05 | global batch size: 256 | lm loss: 3.652364E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1549.092 | TFLOPs: 24.29 | +7: iteration 154310/ 173500 | consumed samples: 39503360 | consumed tokens: 80902881280 | elapsed time per iteration (s): 0.15 | learning rate: 2.549E-05 | global batch size: 256 | lm loss: 3.654735E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.081 | TFLOPs: 26.19 | +7: iteration 154320/ 173500 | consumed samples: 39505920 | consumed tokens: 80908124160 | elapsed time per iteration (s): 0.16 | learning rate: 2.548E-05 | global batch size: 256 | lm loss: 3.668353E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.303 | TFLOPs: 25.08 | +7: iteration 154330/ 173500 | consumed samples: 39508480 | consumed tokens: 80913367040 | elapsed time per iteration (s): 0.16 | learning rate: 2.548E-05 | global batch size: 256 | lm loss: 3.667128E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.247 | TFLOPs: 25.17 | +7: iteration 154340/ 173500 | consumed samples: 39511040 | consumed tokens: 80918609920 | elapsed time per iteration (s): 0.15 | learning rate: 2.547E-05 | global batch size: 256 | lm loss: 3.666744E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.184 | TFLOPs: 26.00 | +7: iteration 154350/ 173500 | consumed samples: 39513600 | consumed tokens: 80923852800 | elapsed time per iteration (s): 0.16 | learning rate: 2.546E-05 | global batch size: 256 | lm loss: 3.659858E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.678 | TFLOPs: 25.75 | +7: iteration 154360/ 173500 | consumed samples: 39516160 | consumed tokens: 80929095680 | elapsed time per iteration (s): 0.15 | learning rate: 2.546E-05 | global batch size: 256 | lm loss: 3.649382E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.452 | TFLOPs: 26.17 | +7: iteration 154370/ 173500 | consumed samples: 39518720 | consumed tokens: 80934338560 | elapsed time per iteration (s): 0.16 | learning rate: 2.545E-05 | global batch size: 256 | lm loss: 3.676305E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.459 | TFLOPs: 25.44 | +7: iteration 154380/ 173500 | consumed samples: 39521280 | consumed tokens: 80939581440 | elapsed time per iteration (s): 0.16 | learning rate: 2.545E-05 | global batch size: 256 | lm loss: 3.664612E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.486 | TFLOPs: 25.76 | +7: iteration 154390/ 173500 | consumed samples: 39523840 | consumed tokens: 80944824320 | elapsed time per iteration (s): 0.15 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 3.654809E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.723 | TFLOPs: 25.93 | +7: iteration 154400/ 173500 | consumed samples: 39526400 | consumed tokens: 80950067200 | elapsed time per iteration (s): 0.16 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 3.659016E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.801 | TFLOPs: 25.23 | +7: iteration 154410/ 173500 | consumed samples: 39528960 | consumed tokens: 80955310080 | elapsed time per iteration (s): 0.16 | learning rate: 2.543E-05 | global batch size: 256 | lm loss: 3.658051E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.724 | TFLOPs: 25.59 | +7: iteration 154420/ 173500 | consumed samples: 39531520 | consumed tokens: 80960552960 | elapsed time per iteration (s): 0.15 | learning rate: 2.543E-05 | global batch size: 256 | lm loss: 3.664352E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.133 | TFLOPs: 25.94 | +7: iteration 154430/ 173500 | consumed samples: 39534080 | consumed tokens: 80965795840 | elapsed time per iteration (s): 0.16 | learning rate: 2.542E-05 | global batch size: 256 | lm loss: 3.653978E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.617 | TFLOPs: 25.12 | +7: iteration 154440/ 173500 | consumed samples: 39536640 | consumed tokens: 80971038720 | elapsed time per iteration (s): 0.16 | learning rate: 2.541E-05 | global batch size: 256 | lm loss: 3.647792E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.566 | TFLOPs: 25.41 | +7: iteration 154450/ 173500 | consumed samples: 39539200 | consumed tokens: 80976281600 | elapsed time per iteration (s): 0.16 | learning rate: 2.541E-05 | global batch size: 256 | lm loss: 3.666434E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.690 | TFLOPs: 25.75 | +7: iteration 154460/ 173500 | consumed samples: 39541760 | consumed tokens: 80981524480 | elapsed time per iteration (s): 0.17 | learning rate: 2.540E-05 | global batch size: 256 | lm loss: 3.668138E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.044 | TFLOPs: 24.12 | +7: iteration 154470/ 173500 | consumed samples: 39544320 | consumed tokens: 80986767360 | elapsed time per iteration (s): 0.15 | learning rate: 2.540E-05 | global batch size: 256 | lm loss: 3.671542E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.395 | TFLOPs: 25.99 | +7: iteration 154480/ 173500 | consumed samples: 39546880 | consumed tokens: 80992010240 | elapsed time per iteration (s): 0.15 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 3.648336E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.430 | TFLOPs: 25.99 | +7: iteration 154490/ 173500 | consumed samples: 39549440 | consumed tokens: 80997253120 | elapsed time per iteration (s): 0.16 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 3.655242E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.824 | TFLOPs: 25.67 | +7: iteration 154500/ 173500 | consumed samples: 39552000 | consumed tokens: 81002496000 | elapsed time per iteration (s): 0.16 | learning rate: 2.538E-05 | global batch size: 256 | lm loss: 3.660876E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.718 | TFLOPs: 25.81 | +7: iteration 154510/ 173500 | consumed samples: 39554560 | consumed tokens: 81007738880 | elapsed time per iteration (s): 0.16 | learning rate: 2.537E-05 | global batch size: 256 | lm loss: 3.649470E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.781 | TFLOPs: 25.68 | +7: iteration 154520/ 173500 | consumed samples: 39557120 | consumed tokens: 81012981760 | elapsed time per iteration (s): 0.16 | learning rate: 2.537E-05 | global batch size: 256 | lm loss: 3.648040E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.600 | TFLOPs: 25.48 | +7: iteration 154530/ 173500 | consumed samples: 39559680 | consumed tokens: 81018224640 | elapsed time per iteration (s): 0.16 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 3.659533E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.518 | TFLOPs: 25.34 | +7: iteration 154540/ 173500 | consumed samples: 39562240 | consumed tokens: 81023467520 | elapsed time per iteration (s): 0.16 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 3.682409E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.984 | TFLOPs: 25.53 | +7: iteration 154550/ 173500 | consumed samples: 39564800 | consumed tokens: 81028710400 | elapsed time per iteration (s): 0.16 | learning rate: 2.535E-05 | global batch size: 256 | lm loss: 3.655030E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.204 | TFLOPs: 25.24 | +7: iteration 154560/ 173500 | consumed samples: 39567360 | consumed tokens: 81033953280 | elapsed time per iteration (s): 0.16 | learning rate: 2.535E-05 | global batch size: 256 | lm loss: 3.669110E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.063 | TFLOPs: 25.66 | +7: iteration 154570/ 173500 | consumed samples: 39569920 | consumed tokens: 81039196160 | elapsed time per iteration (s): 0.16 | learning rate: 2.534E-05 | global batch size: 256 | lm loss: 3.658556E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.990 | TFLOPs: 25.66 | +7: iteration 154580/ 173500 | consumed samples: 39572480 | consumed tokens: 81044439040 | elapsed time per iteration (s): 0.15 | learning rate: 2.534E-05 | global batch size: 256 | lm loss: 3.664013E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.930 | TFLOPs: 26.06 | +7: iteration 154590/ 173500 | consumed samples: 39575040 | consumed tokens: 81049681920 | elapsed time per iteration (s): 0.16 | learning rate: 2.533E-05 | global batch size: 256 | lm loss: 3.671393E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.065 | TFLOPs: 24.51 | +7: iteration 154600/ 173500 | consumed samples: 39577600 | consumed tokens: 81054924800 | elapsed time per iteration (s): 0.15 | learning rate: 2.532E-05 | global batch size: 256 | lm loss: 3.641095E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.193 | TFLOPs: 26.11 | +7: iteration 154610/ 173500 | consumed samples: 39580160 | consumed tokens: 81060167680 | elapsed time per iteration (s): 0.16 | learning rate: 2.532E-05 | global batch size: 256 | lm loss: 3.666304E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.686 | TFLOPs: 25.40 | +7: iteration 154620/ 173500 | consumed samples: 39582720 | consumed tokens: 81065410560 | elapsed time per iteration (s): 0.16 | learning rate: 2.531E-05 | global batch size: 256 | lm loss: 3.673602E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.925 | TFLOPs: 25.75 | +7: iteration 154630/ 173500 | consumed samples: 39585280 | consumed tokens: 81070653440 | elapsed time per iteration (s): 0.16 | learning rate: 2.531E-05 | global batch size: 256 | lm loss: 3.672808E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.490 | TFLOPs: 25.49 | +7: iteration 154640/ 173500 | consumed samples: 39587840 | consumed tokens: 81075896320 | elapsed time per iteration (s): 0.16 | learning rate: 2.530E-05 | global batch size: 256 | lm loss: 3.653650E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.516 | TFLOPs: 25.18 | +7: iteration 154650/ 173500 | consumed samples: 39590400 | consumed tokens: 81081139200 | elapsed time per iteration (s): 0.15 | learning rate: 2.530E-05 | global batch size: 256 | lm loss: 3.665803E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.562 | TFLOPs: 26.17 | +7: iteration 154660/ 173500 | consumed samples: 39592960 | consumed tokens: 81086382080 | elapsed time per iteration (s): 0.16 | learning rate: 2.529E-05 | global batch size: 256 | lm loss: 3.663986E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.917 | TFLOPs: 25.53 | +7: iteration 154670/ 173500 | consumed samples: 39595520 | consumed tokens: 81091624960 | elapsed time per iteration (s): 0.15 | learning rate: 2.529E-05 | global batch size: 256 | lm loss: 3.671210E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.517 | TFLOPs: 26.17 | +7: iteration 154680/ 173500 | consumed samples: 39598080 | consumed tokens: 81096867840 | elapsed time per iteration (s): 0.16 | learning rate: 2.528E-05 | global batch size: 256 | lm loss: 3.677031E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.301 | TFLOPs: 25.85 | +7: iteration 154690/ 173500 | consumed samples: 39600640 | consumed tokens: 81102110720 | elapsed time per iteration (s): 0.15 | learning rate: 2.527E-05 | global batch size: 256 | lm loss: 3.672810E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.947 | TFLOPs: 26.17 | +7: iteration 154700/ 173500 | consumed samples: 39603200 | consumed tokens: 81107353600 | elapsed time per iteration (s): 0.15 | learning rate: 2.527E-05 | global batch size: 256 | lm loss: 3.663932E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.824 | TFLOPs: 26.17 | +7: iteration 154710/ 173500 | consumed samples: 39605760 | consumed tokens: 81112596480 | elapsed time per iteration (s): 0.16 | learning rate: 2.526E-05 | global batch size: 256 | lm loss: 3.669851E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.567 | TFLOPs: 25.49 | +7: iteration 154720/ 173500 | consumed samples: 39608320 | consumed tokens: 81117839360 | elapsed time per iteration (s): 0.16 | learning rate: 2.526E-05 | global batch size: 256 | lm loss: 3.675264E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.366 | TFLOPs: 25.77 | +7: iteration 154730/ 173500 | consumed samples: 39610880 | consumed tokens: 81123082240 | elapsed time per iteration (s): 0.16 | learning rate: 2.525E-05 | global batch size: 256 | lm loss: 3.671980E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.175 | TFLOPs: 25.64 | +7: iteration 154740/ 173500 | consumed samples: 39613440 | consumed tokens: 81128325120 | elapsed time per iteration (s): 0.16 | learning rate: 2.525E-05 | global batch size: 256 | lm loss: 3.661570E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.379 | TFLOPs: 25.77 | +7: iteration 154750/ 173500 | consumed samples: 39616000 | consumed tokens: 81133568000 | elapsed time per iteration (s): 0.15 | learning rate: 2.524E-05 | global batch size: 256 | lm loss: 3.673025E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.390 | TFLOPs: 25.91 | +7: iteration 154760/ 173500 | consumed samples: 39618560 | consumed tokens: 81138810880 | elapsed time per iteration (s): 0.15 | learning rate: 2.524E-05 | global batch size: 256 | lm loss: 3.657804E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.358 | TFLOPs: 26.20 | +7: iteration 154770/ 173500 | consumed samples: 39621120 | consumed tokens: 81144053760 | elapsed time per iteration (s): 0.16 | learning rate: 2.523E-05 | global batch size: 256 | lm loss: 3.675644E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.955 | TFLOPs: 25.70 | +7: iteration 154780/ 173500 | consumed samples: 39623680 | consumed tokens: 81149296640 | elapsed time per iteration (s): 0.16 | learning rate: 2.522E-05 | global batch size: 256 | lm loss: 3.668501E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.116 | TFLOPs: 25.63 | +7: iteration 154790/ 173500 | consumed samples: 39626240 | consumed tokens: 81154539520 | elapsed time per iteration (s): 0.16 | learning rate: 2.522E-05 | global batch size: 256 | lm loss: 3.670474E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.535 | TFLOPs: 25.21 | +7: iteration 154800/ 173500 | consumed samples: 39628800 | consumed tokens: 81159782400 | elapsed time per iteration (s): 0.15 | learning rate: 2.521E-05 | global batch size: 256 | lm loss: 3.658551E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.057 | TFLOPs: 26.13 | +7: iteration 154810/ 173500 | consumed samples: 39631360 | consumed tokens: 81165025280 | elapsed time per iteration (s): 0.16 | learning rate: 2.521E-05 | global batch size: 256 | lm loss: 3.653389E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.781 | TFLOPs: 25.20 | +7: iteration 154820/ 173500 | consumed samples: 39633920 | consumed tokens: 81170268160 | elapsed time per iteration (s): 0.16 | learning rate: 2.520E-05 | global batch size: 256 | lm loss: 3.662125E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.075 | TFLOPs: 25.38 | +7: iteration 154830/ 173500 | consumed samples: 39636480 | consumed tokens: 81175511040 | elapsed time per iteration (s): 0.16 | learning rate: 2.520E-05 | global batch size: 256 | lm loss: 3.661380E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.568 | TFLOPs: 24.47 | +7: iteration 154840/ 173500 | consumed samples: 39639040 | consumed tokens: 81180753920 | elapsed time per iteration (s): 0.16 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 3.661895E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.506 | TFLOPs: 25.12 | +7: iteration 154850/ 173500 | consumed samples: 39641600 | consumed tokens: 81185996800 | elapsed time per iteration (s): 0.16 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 3.687734E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.905 | TFLOPs: 25.66 | +7: iteration 154860/ 173500 | consumed samples: 39644160 | consumed tokens: 81191239680 | elapsed time per iteration (s): 0.16 | learning rate: 2.518E-05 | global batch size: 256 | lm loss: 3.672213E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.226 | TFLOPs: 25.82 | +7: iteration 154870/ 173500 | consumed samples: 39646720 | consumed tokens: 81196482560 | elapsed time per iteration (s): 0.16 | learning rate: 2.517E-05 | global batch size: 256 | lm loss: 3.659111E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.970 | TFLOPs: 25.55 | +7: iteration 154880/ 173500 | consumed samples: 39649280 | consumed tokens: 81201725440 | elapsed time per iteration (s): 0.16 | learning rate: 2.517E-05 | global batch size: 256 | lm loss: 3.679134E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.015 | TFLOPs: 24.68 | +7: iteration 154890/ 173500 | consumed samples: 39651840 | consumed tokens: 81206968320 | elapsed time per iteration (s): 0.16 | learning rate: 2.516E-05 | global batch size: 256 | lm loss: 3.662235E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.598 | TFLOPs: 25.71 | +7: iteration 154900/ 173500 | consumed samples: 39654400 | consumed tokens: 81212211200 | elapsed time per iteration (s): 0.16 | learning rate: 2.516E-05 | global batch size: 256 | lm loss: 3.661998E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.855 | TFLOPs: 25.76 | +7: iteration 154910/ 173500 | consumed samples: 39656960 | consumed tokens: 81217454080 | elapsed time per iteration (s): 0.15 | learning rate: 2.515E-05 | global batch size: 256 | lm loss: 3.664815E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.477 | TFLOPs: 26.20 | +7: iteration 154920/ 173500 | consumed samples: 39659520 | consumed tokens: 81222696960 | elapsed time per iteration (s): 0.15 | learning rate: 2.515E-05 | global batch size: 256 | lm loss: 3.665068E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.017 | TFLOPs: 26.21 | +7: iteration 154930/ 173500 | consumed samples: 39662080 | consumed tokens: 81227939840 | elapsed time per iteration (s): 0.16 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 3.669073E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.483 | TFLOPs: 25.59 | +7: iteration 154940/ 173500 | consumed samples: 39664640 | consumed tokens: 81233182720 | elapsed time per iteration (s): 0.16 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 3.652552E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.824 | TFLOPs: 25.59 | +7: iteration 154950/ 173500 | consumed samples: 39667200 | consumed tokens: 81238425600 | elapsed time per iteration (s): 0.15 | learning rate: 2.513E-05 | global batch size: 256 | lm loss: 3.655995E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.272 | TFLOPs: 26.10 | +7: iteration 154960/ 173500 | consumed samples: 39669760 | consumed tokens: 81243668480 | elapsed time per iteration (s): 0.15 | learning rate: 2.513E-05 | global batch size: 256 | lm loss: 3.671513E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.330 | TFLOPs: 26.10 | +7: iteration 154970/ 173500 | consumed samples: 39672320 | consumed tokens: 81248911360 | elapsed time per iteration (s): 0.15 | learning rate: 2.512E-05 | global batch size: 256 | lm loss: 3.672205E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.033 | TFLOPs: 26.10 | +7: iteration 154980/ 173500 | consumed samples: 39674880 | consumed tokens: 81254154240 | elapsed time per iteration (s): 0.16 | learning rate: 2.511E-05 | global batch size: 256 | lm loss: 3.673740E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.904 | TFLOPs: 25.80 | +7: iteration 154990/ 173500 | consumed samples: 39677440 | consumed tokens: 81259397120 | elapsed time per iteration (s): 0.15 | learning rate: 2.511E-05 | global batch size: 256 | lm loss: 3.667162E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.029 | TFLOPs: 26.14 | +7: iteration 155000/ 173500 | consumed samples: 39680000 | consumed tokens: 81264640000 | elapsed time per iteration (s): 0.16 | learning rate: 2.510E-05 | global batch size: 256 | lm loss: 3.660507E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.617 | TFLOPs: 25.74 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 155000 | lm loss value: 3.889265E+00 | lm loss PPL: 4.887493E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 155000 to checkpoints_44m91b100m +0: [2023-03-17 07:00:02,504] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step155000 is begin to save! +0: [2023-03-17 07:00:02,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:00:02,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:00:02,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:00:02,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:00:02,583] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:00:02,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:00:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:00:02,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:00:02,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:00:02,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:00:02,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:00:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:00:02,616] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:00:02,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:00:02,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:00:02,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:00:02,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:00:02,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:00:02,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:00:02,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:00:02,641] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step155000/mp_rank_00_model_states.pt +0: [2023-03-17 07:00:02,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:00:02,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:00:02,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:00:02,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:00:02,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:00:02,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:00:02,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +7: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +6: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +5: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +4: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:00:02,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +2: [2023-03-17 07:00:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +1: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +3: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:00:02,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step155000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:00:02,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step155000 is ready now! +0: successfully saved checkpoint at iteration 155000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.33 +7: iteration 155010/ 173500 | consumed samples: 39682560 | consumed tokens: 81269882880 | elapsed time per iteration (s): 0.18 | learning rate: 2.510E-05 | global batch size: 256 | lm loss: 3.646191E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1451.522 | TFLOPs: 22.76 | +7: iteration 155020/ 173500 | consumed samples: 39685120 | consumed tokens: 81275125760 | elapsed time per iteration (s): 0.16 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 3.648032E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.147 | TFLOPs: 25.20 | +7: iteration 155030/ 173500 | consumed samples: 39687680 | consumed tokens: 81280368640 | elapsed time per iteration (s): 0.15 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 3.652003E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.183 | TFLOPs: 26.05 | +7: iteration 155040/ 173500 | consumed samples: 39690240 | consumed tokens: 81285611520 | elapsed time per iteration (s): 0.16 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 3.666267E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.709 | TFLOPs: 25.39 | +7: iteration 155050/ 173500 | consumed samples: 39692800 | consumed tokens: 81290854400 | elapsed time per iteration (s): 0.16 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 3.658334E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.992 | TFLOPs: 25.11 | +7: iteration 155060/ 173500 | consumed samples: 39695360 | consumed tokens: 81296097280 | elapsed time per iteration (s): 0.15 | learning rate: 2.507E-05 | global batch size: 256 | lm loss: 3.659760E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.114 | TFLOPs: 26.05 | +7: iteration 155070/ 173500 | consumed samples: 39697920 | consumed tokens: 81301340160 | elapsed time per iteration (s): 0.16 | learning rate: 2.507E-05 | global batch size: 256 | lm loss: 3.667680E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.105 | TFLOPs: 25.75 | +7: iteration 155080/ 173500 | consumed samples: 39700480 | consumed tokens: 81306583040 | elapsed time per iteration (s): 0.15 | learning rate: 2.506E-05 | global batch size: 256 | lm loss: 3.654900E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.554 | TFLOPs: 26.12 | +7: iteration 155090/ 173500 | consumed samples: 39703040 | consumed tokens: 81311825920 | elapsed time per iteration (s): 0.15 | learning rate: 2.505E-05 | global batch size: 256 | lm loss: 3.655719E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.742 | TFLOPs: 25.98 | +7: iteration 155100/ 173500 | consumed samples: 39705600 | consumed tokens: 81317068800 | elapsed time per iteration (s): 0.15 | learning rate: 2.505E-05 | global batch size: 256 | lm loss: 3.665491E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.146 | TFLOPs: 26.13 | +7: iteration 155110/ 173500 | consumed samples: 39708160 | consumed tokens: 81322311680 | elapsed time per iteration (s): 0.15 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.664962E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.846 | TFLOPs: 26.11 | +7: iteration 155120/ 173500 | consumed samples: 39710720 | consumed tokens: 81327554560 | elapsed time per iteration (s): 0.16 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.655603E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.979 | TFLOPs: 25.26 | +7: iteration 155130/ 173500 | consumed samples: 39713280 | consumed tokens: 81332797440 | elapsed time per iteration (s): 0.15 | learning rate: 2.503E-05 | global batch size: 256 | lm loss: 3.661020E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.576 | TFLOPs: 26.10 | +7: iteration 155140/ 173500 | consumed samples: 39715840 | consumed tokens: 81338040320 | elapsed time per iteration (s): 0.15 | learning rate: 2.503E-05 | global batch size: 256 | lm loss: 3.667698E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.444 | TFLOPs: 26.13 | +7: iteration 155150/ 173500 | consumed samples: 39718400 | consumed tokens: 81343283200 | elapsed time per iteration (s): 0.16 | learning rate: 2.502E-05 | global batch size: 256 | lm loss: 3.660247E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.931 | TFLOPs: 25.37 | +7: iteration 155160/ 173500 | consumed samples: 39720960 | consumed tokens: 81348526080 | elapsed time per iteration (s): 0.16 | learning rate: 2.502E-05 | global batch size: 256 | lm loss: 3.664487E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.154 | TFLOPs: 25.49 | +7: iteration 155170/ 173500 | consumed samples: 39723520 | consumed tokens: 81353768960 | elapsed time per iteration (s): 0.16 | learning rate: 2.501E-05 | global batch size: 256 | lm loss: 3.677757E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.404 | TFLOPs: 25.82 | +7: iteration 155180/ 173500 | consumed samples: 39726080 | consumed tokens: 81359011840 | elapsed time per iteration (s): 0.16 | learning rate: 2.501E-05 | global batch size: 256 | lm loss: 3.653440E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.181 | TFLOPs: 25.53 | +7: iteration 155190/ 173500 | consumed samples: 39728640 | consumed tokens: 81364254720 | elapsed time per iteration (s): 0.16 | learning rate: 2.500E-05 | global batch size: 256 | lm loss: 3.660180E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.130 | TFLOPs: 24.98 | +7: iteration 155200/ 173500 | consumed samples: 39731200 | consumed tokens: 81369497600 | elapsed time per iteration (s): 0.17 | learning rate: 2.499E-05 | global batch size: 256 | lm loss: 3.667871E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1532.384 | TFLOPs: 24.03 | +7: iteration 155210/ 173500 | consumed samples: 39733760 | consumed tokens: 81374740480 | elapsed time per iteration (s): 0.16 | learning rate: 2.499E-05 | global batch size: 256 | lm loss: 3.669216E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.118 | TFLOPs: 25.77 | +7: iteration 155220/ 173500 | consumed samples: 39736320 | consumed tokens: 81379983360 | elapsed time per iteration (s): 0.16 | learning rate: 2.498E-05 | global batch size: 256 | lm loss: 3.668970E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.251 | TFLOPs: 25.08 | +7: iteration 155230/ 173500 | consumed samples: 39738880 | consumed tokens: 81385226240 | elapsed time per iteration (s): 0.16 | learning rate: 2.498E-05 | global batch size: 256 | lm loss: 3.676137E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.943 | TFLOPs: 24.95 | +7: iteration 155240/ 173500 | consumed samples: 39741440 | consumed tokens: 81390469120 | elapsed time per iteration (s): 0.16 | learning rate: 2.497E-05 | global batch size: 256 | lm loss: 3.663038E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.461 | TFLOPs: 25.41 | +7: iteration 155250/ 173500 | consumed samples: 39744000 | consumed tokens: 81395712000 | elapsed time per iteration (s): 0.16 | learning rate: 2.497E-05 | global batch size: 256 | lm loss: 3.674548E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.638 | TFLOPs: 25.54 | +7: iteration 155260/ 173500 | consumed samples: 39746560 | consumed tokens: 81400954880 | elapsed time per iteration (s): 0.16 | learning rate: 2.496E-05 | global batch size: 256 | lm loss: 3.661403E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.646 | TFLOPs: 25.56 | +7: iteration 155270/ 173500 | consumed samples: 39749120 | consumed tokens: 81406197760 | elapsed time per iteration (s): 0.16 | learning rate: 2.496E-05 | global batch size: 256 | lm loss: 3.662540E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.395 | TFLOPs: 24.78 | +7: iteration 155280/ 173500 | consumed samples: 39751680 | consumed tokens: 81411440640 | elapsed time per iteration (s): 0.16 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 3.655920E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.651 | TFLOPs: 25.57 | +7: iteration 155290/ 173500 | consumed samples: 39754240 | consumed tokens: 81416683520 | elapsed time per iteration (s): 0.16 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 3.663650E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.994 | TFLOPs: 25.63 | +7: iteration 155300/ 173500 | consumed samples: 39756800 | consumed tokens: 81421926400 | elapsed time per iteration (s): 0.15 | learning rate: 2.494E-05 | global batch size: 256 | lm loss: 3.672883E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.371 | TFLOPs: 26.04 | +7: iteration 155310/ 173500 | consumed samples: 39759360 | consumed tokens: 81427169280 | elapsed time per iteration (s): 0.16 | learning rate: 2.494E-05 | global batch size: 256 | lm loss: 3.671091E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.002 | TFLOPs: 25.67 | +7: iteration 155320/ 173500 | consumed samples: 39761920 | consumed tokens: 81432412160 | elapsed time per iteration (s): 0.16 | learning rate: 2.493E-05 | global batch size: 256 | lm loss: 3.663334E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.519 | TFLOPs: 25.62 | +7: iteration 155330/ 173500 | consumed samples: 39764480 | consumed tokens: 81437655040 | elapsed time per iteration (s): 0.16 | learning rate: 2.492E-05 | global batch size: 256 | lm loss: 3.655331E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.565 | TFLOPs: 25.34 | +7: iteration 155340/ 173500 | consumed samples: 39767040 | consumed tokens: 81442897920 | elapsed time per iteration (s): 0.16 | learning rate: 2.492E-05 | global batch size: 256 | lm loss: 3.659192E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.787 | TFLOPs: 25.54 | +7: iteration 155350/ 173500 | consumed samples: 39769600 | consumed tokens: 81448140800 | elapsed time per iteration (s): 0.16 | learning rate: 2.491E-05 | global batch size: 256 | lm loss: 3.684483E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.618 | TFLOPs: 25.67 | +7: iteration 155360/ 173500 | consumed samples: 39772160 | consumed tokens: 81453383680 | elapsed time per iteration (s): 0.16 | learning rate: 2.491E-05 | global batch size: 256 | lm loss: 3.660924E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.050 | TFLOPs: 25.41 | +7: iteration 155370/ 173500 | consumed samples: 39774720 | consumed tokens: 81458626560 | elapsed time per iteration (s): 0.16 | learning rate: 2.490E-05 | global batch size: 256 | lm loss: 3.675249E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.527 | TFLOPs: 25.26 | +7: iteration 155380/ 173500 | consumed samples: 39777280 | consumed tokens: 81463869440 | elapsed time per iteration (s): 0.16 | learning rate: 2.490E-05 | global batch size: 256 | lm loss: 3.659693E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.462 | TFLOPs: 25.32 | +7: iteration 155390/ 173500 | consumed samples: 39779840 | consumed tokens: 81469112320 | elapsed time per iteration (s): 0.16 | learning rate: 2.489E-05 | global batch size: 256 | lm loss: 3.678370E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.540 | TFLOPs: 25.40 | +7: iteration 155400/ 173500 | consumed samples: 39782400 | consumed tokens: 81474355200 | elapsed time per iteration (s): 0.16 | learning rate: 2.489E-05 | global batch size: 256 | lm loss: 3.668822E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.299 | TFLOPs: 25.68 | +7: iteration 155410/ 173500 | consumed samples: 39784960 | consumed tokens: 81479598080 | elapsed time per iteration (s): 0.16 | learning rate: 2.488E-05 | global batch size: 256 | lm loss: 3.666381E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.859 | TFLOPs: 25.78 | +7: iteration 155420/ 173500 | consumed samples: 39787520 | consumed tokens: 81484840960 | elapsed time per iteration (s): 0.15 | learning rate: 2.488E-05 | global batch size: 256 | lm loss: 3.662009E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.348 | TFLOPs: 26.13 | +7: iteration 155430/ 173500 | consumed samples: 39790080 | consumed tokens: 81490083840 | elapsed time per iteration (s): 0.15 | learning rate: 2.487E-05 | global batch size: 256 | lm loss: 3.665638E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.226 | TFLOPs: 26.15 | +7: iteration 155440/ 173500 | consumed samples: 39792640 | consumed tokens: 81495326720 | elapsed time per iteration (s): 0.15 | learning rate: 2.487E-05 | global batch size: 256 | lm loss: 3.651160E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.343 | TFLOPs: 26.13 | +7: iteration 155450/ 173500 | consumed samples: 39795200 | consumed tokens: 81500569600 | elapsed time per iteration (s): 0.15 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.648172E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.840 | TFLOPs: 26.14 | +7: iteration 155460/ 173500 | consumed samples: 39797760 | consumed tokens: 81505812480 | elapsed time per iteration (s): 0.15 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.680853E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.951 | TFLOPs: 26.11 | +7: iteration 155470/ 173500 | consumed samples: 39800320 | consumed tokens: 81511055360 | elapsed time per iteration (s): 0.15 | learning rate: 2.485E-05 | global batch size: 256 | lm loss: 3.663254E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.221 | TFLOPs: 26.02 | +7: iteration 155480/ 173500 | consumed samples: 39802880 | consumed tokens: 81516298240 | elapsed time per iteration (s): 0.16 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 3.652249E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.749 | TFLOPs: 25.83 | +7: iteration 155490/ 173500 | consumed samples: 39805440 | consumed tokens: 81521541120 | elapsed time per iteration (s): 0.16 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 3.662516E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.039 | TFLOPs: 25.48 | +7: iteration 155500/ 173500 | consumed samples: 39808000 | consumed tokens: 81526784000 | elapsed time per iteration (s): 0.15 | learning rate: 2.483E-05 | global batch size: 256 | lm loss: 3.660208E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.426 | TFLOPs: 26.07 | +7: iteration 155510/ 173500 | consumed samples: 39810560 | consumed tokens: 81532026880 | elapsed time per iteration (s): 0.16 | learning rate: 2.483E-05 | global batch size: 256 | lm loss: 3.661333E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.931 | TFLOPs: 25.48 | +7: iteration 155520/ 173500 | consumed samples: 39813120 | consumed tokens: 81537269760 | elapsed time per iteration (s): 0.16 | learning rate: 2.482E-05 | global batch size: 256 | lm loss: 3.654151E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.831 | TFLOPs: 25.76 | +7: iteration 155530/ 173500 | consumed samples: 39815680 | consumed tokens: 81542512640 | elapsed time per iteration (s): 0.15 | learning rate: 2.482E-05 | global batch size: 256 | lm loss: 3.666837E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.165 | TFLOPs: 25.93 | +7: iteration 155540/ 173500 | consumed samples: 39818240 | consumed tokens: 81547755520 | elapsed time per iteration (s): 0.15 | learning rate: 2.481E-05 | global batch size: 256 | lm loss: 3.672176E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.406 | TFLOPs: 26.16 | +7: iteration 155550/ 173500 | consumed samples: 39820800 | consumed tokens: 81552998400 | elapsed time per iteration (s): 0.15 | learning rate: 2.481E-05 | global batch size: 256 | lm loss: 3.664223E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.288 | TFLOPs: 26.18 | +7: iteration 155560/ 173500 | consumed samples: 39823360 | consumed tokens: 81558241280 | elapsed time per iteration (s): 0.15 | learning rate: 2.480E-05 | global batch size: 256 | lm loss: 3.670929E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.056 | TFLOPs: 26.11 | +7: iteration 155570/ 173500 | consumed samples: 39825920 | consumed tokens: 81563484160 | elapsed time per iteration (s): 0.15 | learning rate: 2.480E-05 | global batch size: 256 | lm loss: 3.664050E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.041 | TFLOPs: 26.16 | +7: iteration 155580/ 173500 | consumed samples: 39828480 | consumed tokens: 81568727040 | elapsed time per iteration (s): 0.16 | learning rate: 2.479E-05 | global batch size: 256 | lm loss: 3.658343E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.268 | TFLOPs: 25.74 | +7: iteration 155590/ 173500 | consumed samples: 39831040 | consumed tokens: 81573969920 | elapsed time per iteration (s): 0.15 | learning rate: 2.479E-05 | global batch size: 256 | lm loss: 3.670266E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.856 | TFLOPs: 26.17 | +7: iteration 155600/ 173500 | consumed samples: 39833600 | consumed tokens: 81579212800 | elapsed time per iteration (s): 0.16 | learning rate: 2.478E-05 | global batch size: 256 | lm loss: 3.661418E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.428 | TFLOPs: 25.51 | +7: iteration 155610/ 173500 | consumed samples: 39836160 | consumed tokens: 81584455680 | elapsed time per iteration (s): 0.15 | learning rate: 2.478E-05 | global batch size: 256 | lm loss: 3.661865E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.545 | TFLOPs: 26.18 | +7: iteration 155620/ 173500 | consumed samples: 39838720 | consumed tokens: 81589698560 | elapsed time per iteration (s): 0.15 | learning rate: 2.477E-05 | global batch size: 256 | lm loss: 3.660313E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.508 | TFLOPs: 26.18 | +7: iteration 155630/ 173500 | consumed samples: 39841280 | consumed tokens: 81594941440 | elapsed time per iteration (s): 0.15 | learning rate: 2.476E-05 | global batch size: 256 | lm loss: 3.677797E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.505 | TFLOPs: 26.17 | +7: iteration 155640/ 173500 | consumed samples: 39843840 | consumed tokens: 81600184320 | elapsed time per iteration (s): 0.15 | learning rate: 2.476E-05 | global batch size: 256 | lm loss: 3.665496E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.497 | TFLOPs: 26.17 | +7: iteration 155650/ 173500 | consumed samples: 39846400 | consumed tokens: 81605427200 | elapsed time per iteration (s): 0.15 | learning rate: 2.475E-05 | global batch size: 256 | lm loss: 3.661026E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.735 | TFLOPs: 26.14 | +7: iteration 155660/ 173500 | consumed samples: 39848960 | consumed tokens: 81610670080 | elapsed time per iteration (s): 0.15 | learning rate: 2.475E-05 | global batch size: 256 | lm loss: 3.665112E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.738 | TFLOPs: 26.14 | +7: iteration 155670/ 173500 | consumed samples: 39851520 | consumed tokens: 81615912960 | elapsed time per iteration (s): 0.15 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 3.675328E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.812 | TFLOPs: 26.14 | +7: iteration 155680/ 173500 | consumed samples: 39854080 | consumed tokens: 81621155840 | elapsed time per iteration (s): 0.16 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 3.664559E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.444 | TFLOPs: 25.88 | +7: iteration 155690/ 173500 | consumed samples: 39856640 | consumed tokens: 81626398720 | elapsed time per iteration (s): 0.15 | learning rate: 2.473E-05 | global batch size: 256 | lm loss: 3.668742E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.336 | TFLOPs: 26.09 | +7: iteration 155700/ 173500 | consumed samples: 39859200 | consumed tokens: 81631641600 | elapsed time per iteration (s): 0.15 | learning rate: 2.473E-05 | global batch size: 256 | lm loss: 3.667459E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.229 | TFLOPs: 26.08 | +7: iteration 155710/ 173500 | consumed samples: 39861760 | consumed tokens: 81636884480 | elapsed time per iteration (s): 0.15 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 3.674358E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.175 | TFLOPs: 26.15 | +7: iteration 155720/ 173500 | consumed samples: 39864320 | consumed tokens: 81642127360 | elapsed time per iteration (s): 0.15 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 3.662734E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.130 | TFLOPs: 26.19 | +7: iteration 155730/ 173500 | consumed samples: 39866880 | consumed tokens: 81647370240 | elapsed time per iteration (s): 0.15 | learning rate: 2.471E-05 | global batch size: 256 | lm loss: 3.655249E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.232 | TFLOPs: 26.16 | +7: iteration 155740/ 173500 | consumed samples: 39869440 | consumed tokens: 81652613120 | elapsed time per iteration (s): 0.16 | learning rate: 2.471E-05 | global batch size: 256 | lm loss: 3.667284E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.767 | TFLOPs: 25.14 | +7: iteration 155750/ 173500 | consumed samples: 39872000 | consumed tokens: 81657856000 | elapsed time per iteration (s): 0.15 | learning rate: 2.470E-05 | global batch size: 256 | lm loss: 3.670066E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.882 | TFLOPs: 26.19 | +7: iteration 155760/ 173500 | consumed samples: 39874560 | consumed tokens: 81663098880 | elapsed time per iteration (s): 0.15 | learning rate: 2.470E-05 | global batch size: 256 | lm loss: 3.670443E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.770 | TFLOPs: 26.20 | +7: iteration 155770/ 173500 | consumed samples: 39877120 | consumed tokens: 81668341760 | elapsed time per iteration (s): 0.15 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 3.667301E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.858 | TFLOPs: 26.19 | +7: iteration 155780/ 173500 | consumed samples: 39879680 | consumed tokens: 81673584640 | elapsed time per iteration (s): 0.15 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 3.664651E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.231 | TFLOPs: 26.18 | +7: iteration 155790/ 173500 | consumed samples: 39882240 | consumed tokens: 81678827520 | elapsed time per iteration (s): 0.15 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 3.658862E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.020 | TFLOPs: 26.17 | +7: iteration 155800/ 173500 | consumed samples: 39884800 | consumed tokens: 81684070400 | elapsed time per iteration (s): 0.15 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 3.667655E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.031 | TFLOPs: 26.16 | +7: iteration 155810/ 173500 | consumed samples: 39887360 | consumed tokens: 81689313280 | elapsed time per iteration (s): 0.15 | learning rate: 2.467E-05 | global batch size: 256 | lm loss: 3.671168E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.186 | TFLOPs: 26.19 | +7: iteration 155820/ 173500 | consumed samples: 39889920 | consumed tokens: 81694556160 | elapsed time per iteration (s): 0.16 | learning rate: 2.466E-05 | global batch size: 256 | lm loss: 3.651645E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.672 | TFLOPs: 25.67 | +7: iteration 155830/ 173500 | consumed samples: 39892480 | consumed tokens: 81699799040 | elapsed time per iteration (s): 0.16 | learning rate: 2.466E-05 | global batch size: 256 | lm loss: 3.659602E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.613 | TFLOPs: 25.76 | +7: iteration 155840/ 173500 | consumed samples: 39895040 | consumed tokens: 81705041920 | elapsed time per iteration (s): 0.15 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 3.663259E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.136 | TFLOPs: 26.22 | +7: iteration 155850/ 173500 | consumed samples: 39897600 | consumed tokens: 81710284800 | elapsed time per iteration (s): 0.16 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 3.665789E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.095 | TFLOPs: 25.25 | +7: iteration 155860/ 173500 | consumed samples: 39900160 | consumed tokens: 81715527680 | elapsed time per iteration (s): 0.16 | learning rate: 2.464E-05 | global batch size: 256 | lm loss: 3.657015E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.612 | TFLOPs: 25.54 | +7: iteration 155870/ 173500 | consumed samples: 39902720 | consumed tokens: 81720770560 | elapsed time per iteration (s): 0.15 | learning rate: 2.464E-05 | global batch size: 256 | lm loss: 3.680312E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.057 | TFLOPs: 26.21 | +7: iteration 155880/ 173500 | consumed samples: 39905280 | consumed tokens: 81726013440 | elapsed time per iteration (s): 0.16 | learning rate: 2.463E-05 | global batch size: 256 | lm loss: 3.649370E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.548 | TFLOPs: 25.81 | +7: iteration 155890/ 173500 | consumed samples: 39907840 | consumed tokens: 81731256320 | elapsed time per iteration (s): 0.15 | learning rate: 2.463E-05 | global batch size: 256 | lm loss: 3.661987E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.310 | TFLOPs: 26.18 | +7: iteration 155900/ 173500 | consumed samples: 39910400 | consumed tokens: 81736499200 | elapsed time per iteration (s): 0.15 | learning rate: 2.462E-05 | global batch size: 256 | lm loss: 3.655495E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.209 | TFLOPs: 26.21 | +7: iteration 155910/ 173500 | consumed samples: 39912960 | consumed tokens: 81741742080 | elapsed time per iteration (s): 0.15 | learning rate: 2.462E-05 | global batch size: 256 | lm loss: 3.662428E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.644 | TFLOPs: 26.22 | +7: iteration 155920/ 173500 | consumed samples: 39915520 | consumed tokens: 81746984960 | elapsed time per iteration (s): 0.15 | learning rate: 2.461E-05 | global batch size: 256 | lm loss: 3.661865E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.501 | TFLOPs: 26.23 | +7: iteration 155930/ 173500 | consumed samples: 39918080 | consumed tokens: 81752227840 | elapsed time per iteration (s): 0.15 | learning rate: 2.461E-05 | global batch size: 256 | lm loss: 3.664481E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.212 | TFLOPs: 26.22 | +7: iteration 155940/ 173500 | consumed samples: 39920640 | consumed tokens: 81757470720 | elapsed time per iteration (s): 0.15 | learning rate: 2.460E-05 | global batch size: 256 | lm loss: 3.654937E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.580 | TFLOPs: 26.18 | +7: iteration 155950/ 173500 | consumed samples: 39923200 | consumed tokens: 81762713600 | elapsed time per iteration (s): 0.16 | learning rate: 2.460E-05 | global batch size: 256 | lm loss: 3.653357E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.748 | TFLOPs: 25.84 | +7: iteration 155960/ 173500 | consumed samples: 39925760 | consumed tokens: 81767956480 | elapsed time per iteration (s): 0.16 | learning rate: 2.459E-05 | global batch size: 256 | lm loss: 3.652811E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.360 | TFLOPs: 25.83 | +7: iteration 155970/ 173500 | consumed samples: 39928320 | consumed tokens: 81773199360 | elapsed time per iteration (s): 0.15 | learning rate: 2.459E-05 | global batch size: 256 | lm loss: 3.675233E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.124 | TFLOPs: 26.22 | +7: iteration 155980/ 173500 | consumed samples: 39930880 | consumed tokens: 81778442240 | elapsed time per iteration (s): 0.16 | learning rate: 2.458E-05 | global batch size: 256 | lm loss: 3.663522E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.547 | TFLOPs: 25.54 | +7: iteration 155990/ 173500 | consumed samples: 39933440 | consumed tokens: 81783685120 | elapsed time per iteration (s): 0.16 | learning rate: 2.458E-05 | global batch size: 256 | lm loss: 3.660884E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.629 | TFLOPs: 25.81 | +0: [2023-03-17 07:02:38,193] [INFO] [logging.py:68:log_dist] [Rank 0] step=156000, skipped=0, lr=[2.4571227150894576e-05, 2.4571227150894576e-05, 2.4571227150894576e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 156000/ 173500 | consumed samples: 39936000 | consumed tokens: 81788928000 | elapsed time per iteration (s): 0.15 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 3.675211E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.287 | TFLOPs: 26.21 | +0: steps: 156000 loss: 3.6600 iter time (s): 0.155 samples/sec: 1647.186 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 156000 | lm loss value: 3.848755E+00 | lm loss PPL: 4.693460E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 156000 to checkpoints_44m91b100m +0: [2023-03-17 07:02:38,267] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step156000 is begin to save! +0: [2023-03-17 07:02:38,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:02:38,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:02:38,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:02:38,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:02:38,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:02:38,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:02:38,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:02:38,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:02:38,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:02:38,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:02:38,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:02:38,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:02:38,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:02:38,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:02:38,382] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:02:38,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:02:38,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:02:38,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:02:38,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:02:38,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:02:38,399] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step156000/mp_rank_00_model_states.pt +0: [2023-03-17 07:02:38,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:02:38,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:02:38,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:02:38,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:02:38,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:02:38,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +3: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +7: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +6: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +2: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +5: [2023-03-17 07:02:38,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:02:38,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +1: [2023-03-17 07:02:38,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:02:38,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:02:38,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +4: [2023-03-17 07:02:38,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:02:38,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step156000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:02:38,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step156000 is ready now! +0: successfully saved checkpoint at iteration 156000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 180.32 +7: iteration 156010/ 173500 | consumed samples: 39938560 | consumed tokens: 81794170880 | elapsed time per iteration (s): 0.18 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 3.661723E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1432.967 | TFLOPs: 22.47 | +7: iteration 156020/ 173500 | consumed samples: 39941120 | consumed tokens: 81799413760 | elapsed time per iteration (s): 0.21 | learning rate: 2.456E-05 | global batch size: 256 | lm loss: 3.657212E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1199.787 | TFLOPs: 18.82 | +7: iteration 156030/ 173500 | consumed samples: 39943680 | consumed tokens: 81804656640 | elapsed time per iteration (s): 0.16 | learning rate: 2.456E-05 | global batch size: 256 | lm loss: 3.655399E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.798 | TFLOPs: 24.71 | +7: iteration 156040/ 173500 | consumed samples: 39946240 | consumed tokens: 81809899520 | elapsed time per iteration (s): 0.16 | learning rate: 2.455E-05 | global batch size: 256 | lm loss: 3.663159E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.411 | TFLOPs: 25.47 | +7: iteration 156050/ 173500 | consumed samples: 39948800 | consumed tokens: 81815142400 | elapsed time per iteration (s): 0.15 | learning rate: 2.455E-05 | global batch size: 256 | lm loss: 3.647821E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.849 | TFLOPs: 26.22 | +7: iteration 156060/ 173500 | consumed samples: 39951360 | consumed tokens: 81820385280 | elapsed time per iteration (s): 0.15 | learning rate: 2.454E-05 | global batch size: 256 | lm loss: 3.646000E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.197 | TFLOPs: 26.22 | +7: iteration 156070/ 173500 | consumed samples: 39953920 | consumed tokens: 81825628160 | elapsed time per iteration (s): 0.15 | learning rate: 2.454E-05 | global batch size: 256 | lm loss: 3.674879E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.264 | TFLOPs: 26.23 | +7: iteration 156080/ 173500 | consumed samples: 39956480 | consumed tokens: 81830871040 | elapsed time per iteration (s): 0.15 | learning rate: 2.453E-05 | global batch size: 256 | lm loss: 3.653748E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.234 | TFLOPs: 26.21 | +7: iteration 156090/ 173500 | consumed samples: 39959040 | consumed tokens: 81836113920 | elapsed time per iteration (s): 0.16 | learning rate: 2.452E-05 | global batch size: 256 | lm loss: 3.682802E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.099 | TFLOPs: 25.77 | +7: iteration 156100/ 173500 | consumed samples: 39961600 | consumed tokens: 81841356800 | elapsed time per iteration (s): 0.16 | learning rate: 2.452E-05 | global batch size: 256 | lm loss: 3.667939E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.590 | TFLOPs: 25.37 | +7: iteration 156110/ 173500 | consumed samples: 39964160 | consumed tokens: 81846599680 | elapsed time per iteration (s): 0.16 | learning rate: 2.451E-05 | global batch size: 256 | lm loss: 3.661392E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.439 | TFLOPs: 25.62 | +7: iteration 156120/ 173500 | consumed samples: 39966720 | consumed tokens: 81851842560 | elapsed time per iteration (s): 0.16 | learning rate: 2.451E-05 | global batch size: 256 | lm loss: 3.668106E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.355 | TFLOPs: 25.63 | +7: iteration 156130/ 173500 | consumed samples: 39969280 | consumed tokens: 81857085440 | elapsed time per iteration (s): 0.16 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 3.688743E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.562 | TFLOPs: 25.40 | +7: iteration 156140/ 173500 | consumed samples: 39971840 | consumed tokens: 81862328320 | elapsed time per iteration (s): 0.15 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 3.659103E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.155 | TFLOPs: 26.21 | +7: iteration 156150/ 173500 | consumed samples: 39974400 | consumed tokens: 81867571200 | elapsed time per iteration (s): 0.16 | learning rate: 2.449E-05 | global batch size: 256 | lm loss: 3.659295E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.394 | TFLOPs: 25.33 | +7: iteration 156160/ 173500 | consumed samples: 39976960 | consumed tokens: 81872814080 | elapsed time per iteration (s): 0.15 | learning rate: 2.449E-05 | global batch size: 256 | lm loss: 3.663198E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.033 | TFLOPs: 26.22 | +7: iteration 156170/ 173500 | consumed samples: 39979520 | consumed tokens: 81878056960 | elapsed time per iteration (s): 0.16 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 3.664953E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.440 | TFLOPs: 25.87 | +7: iteration 156180/ 173500 | consumed samples: 39982080 | consumed tokens: 81883299840 | elapsed time per iteration (s): 0.15 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 3.661404E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.662 | TFLOPs: 26.23 | +7: iteration 156190/ 173500 | consumed samples: 39984640 | consumed tokens: 81888542720 | elapsed time per iteration (s): 0.15 | learning rate: 2.447E-05 | global batch size: 256 | lm loss: 3.662787E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.681 | TFLOPs: 26.23 | +7: iteration 156200/ 173500 | consumed samples: 39987200 | consumed tokens: 81893785600 | elapsed time per iteration (s): 0.15 | learning rate: 2.447E-05 | global batch size: 256 | lm loss: 3.660878E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.179 | TFLOPs: 26.22 | +7: iteration 156210/ 173500 | consumed samples: 39989760 | consumed tokens: 81899028480 | elapsed time per iteration (s): 0.16 | learning rate: 2.446E-05 | global batch size: 256 | lm loss: 3.658535E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.575 | TFLOPs: 25.90 | +7: iteration 156220/ 173500 | consumed samples: 39992320 | consumed tokens: 81904271360 | elapsed time per iteration (s): 0.15 | learning rate: 2.446E-05 | global batch size: 256 | lm loss: 3.657625E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.622 | TFLOPs: 26.23 | +7: iteration 156230/ 173500 | consumed samples: 39994880 | consumed tokens: 81909514240 | elapsed time per iteration (s): 0.15 | learning rate: 2.445E-05 | global batch size: 256 | lm loss: 3.646701E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.567 | TFLOPs: 26.01 | +7: iteration 156240/ 173500 | consumed samples: 39997440 | consumed tokens: 81914757120 | elapsed time per iteration (s): 0.16 | learning rate: 2.445E-05 | global batch size: 256 | lm loss: 3.669322E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.444 | TFLOPs: 25.84 | +7: iteration 156250/ 173500 | consumed samples: 40000000 | consumed tokens: 81920000000 | elapsed time per iteration (s): 0.16 | learning rate: 2.444E-05 | global batch size: 256 | lm loss: 3.653099E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.258 | TFLOPs: 25.85 | +7: iteration 156260/ 173500 | consumed samples: 40002560 | consumed tokens: 81925242880 | elapsed time per iteration (s): 0.16 | learning rate: 2.444E-05 | global batch size: 256 | lm loss: 3.664505E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.905 | TFLOPs: 25.53 | +7: iteration 156270/ 173500 | consumed samples: 40005120 | consumed tokens: 81930485760 | elapsed time per iteration (s): 0.15 | learning rate: 2.443E-05 | global batch size: 256 | lm loss: 3.664240E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.349 | TFLOPs: 26.24 | +7: iteration 156280/ 173500 | consumed samples: 40007680 | consumed tokens: 81935728640 | elapsed time per iteration (s): 0.16 | learning rate: 2.443E-05 | global batch size: 256 | lm loss: 3.671337E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.693 | TFLOPs: 25.82 | +7: iteration 156290/ 173500 | consumed samples: 40010240 | consumed tokens: 81940971520 | elapsed time per iteration (s): 0.15 | learning rate: 2.442E-05 | global batch size: 256 | lm loss: 3.660910E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.530 | TFLOPs: 26.28 | +7: iteration 156300/ 173500 | consumed samples: 40012800 | consumed tokens: 81946214400 | elapsed time per iteration (s): 0.15 | learning rate: 2.442E-05 | global batch size: 256 | lm loss: 3.662331E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.369 | TFLOPs: 25.93 | +7: iteration 156310/ 173500 | consumed samples: 40015360 | consumed tokens: 81951457280 | elapsed time per iteration (s): 0.16 | learning rate: 2.441E-05 | global batch size: 256 | lm loss: 3.663468E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.504 | TFLOPs: 25.88 | +7: iteration 156320/ 173500 | consumed samples: 40017920 | consumed tokens: 81956700160 | elapsed time per iteration (s): 0.15 | learning rate: 2.441E-05 | global batch size: 256 | lm loss: 3.675985E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.603 | TFLOPs: 25.92 | +7: iteration 156330/ 173500 | consumed samples: 40020480 | consumed tokens: 81961943040 | elapsed time per iteration (s): 0.16 | learning rate: 2.440E-05 | global batch size: 256 | lm loss: 3.666844E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.495 | TFLOPs: 25.44 | +7: iteration 156340/ 173500 | consumed samples: 40023040 | consumed tokens: 81967185920 | elapsed time per iteration (s): 0.17 | learning rate: 2.440E-05 | global batch size: 256 | lm loss: 3.659964E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1473.445 | TFLOPs: 23.11 | +7: iteration 156350/ 173500 | consumed samples: 40025600 | consumed tokens: 81972428800 | elapsed time per iteration (s): 0.15 | learning rate: 2.439E-05 | global batch size: 256 | lm loss: 3.681893E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.790 | TFLOPs: 26.20 | +7: iteration 156360/ 173500 | consumed samples: 40028160 | consumed tokens: 81977671680 | elapsed time per iteration (s): 0.16 | learning rate: 2.439E-05 | global batch size: 256 | lm loss: 3.672514E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.699 | TFLOPs: 25.40 | +7: iteration 156370/ 173500 | consumed samples: 40030720 | consumed tokens: 81982914560 | elapsed time per iteration (s): 0.15 | learning rate: 2.438E-05 | global batch size: 256 | lm loss: 3.658281E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.463 | TFLOPs: 26.24 | +7: iteration 156380/ 173500 | consumed samples: 40033280 | consumed tokens: 81988157440 | elapsed time per iteration (s): 0.15 | learning rate: 2.438E-05 | global batch size: 256 | lm loss: 3.669122E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.253 | TFLOPs: 26.27 | +7: iteration 156390/ 173500 | consumed samples: 40035840 | consumed tokens: 81993400320 | elapsed time per iteration (s): 0.16 | learning rate: 2.437E-05 | global batch size: 256 | lm loss: 3.662820E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.793 | TFLOPs: 25.17 | +7: iteration 156400/ 173500 | consumed samples: 40038400 | consumed tokens: 81998643200 | elapsed time per iteration (s): 0.16 | learning rate: 2.437E-05 | global batch size: 256 | lm loss: 3.655222E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.184 | TFLOPs: 25.41 | +7: iteration 156410/ 173500 | consumed samples: 40040960 | consumed tokens: 82003886080 | elapsed time per iteration (s): 0.15 | learning rate: 2.436E-05 | global batch size: 256 | lm loss: 3.657401E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.242 | TFLOPs: 26.19 | +7: iteration 156420/ 173500 | consumed samples: 40043520 | consumed tokens: 82009128960 | elapsed time per iteration (s): 0.16 | learning rate: 2.436E-05 | global batch size: 256 | lm loss: 3.676123E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.270 | TFLOPs: 25.72 | +7: iteration 156430/ 173500 | consumed samples: 40046080 | consumed tokens: 82014371840 | elapsed time per iteration (s): 0.15 | learning rate: 2.435E-05 | global batch size: 256 | lm loss: 3.656716E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.385 | TFLOPs: 26.07 | +7: iteration 156440/ 173500 | consumed samples: 40048640 | consumed tokens: 82019614720 | elapsed time per iteration (s): 0.15 | learning rate: 2.435E-05 | global batch size: 256 | lm loss: 3.652724E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.936 | TFLOPs: 26.08 | +7: iteration 156450/ 173500 | consumed samples: 40051200 | consumed tokens: 82024857600 | elapsed time per iteration (s): 0.15 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 3.660326E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.721 | TFLOPs: 25.92 | +7: iteration 156460/ 173500 | consumed samples: 40053760 | consumed tokens: 82030100480 | elapsed time per iteration (s): 0.15 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 3.674004E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.874 | TFLOPs: 26.13 | +7: iteration 156470/ 173500 | consumed samples: 40056320 | consumed tokens: 82035343360 | elapsed time per iteration (s): 0.16 | learning rate: 2.433E-05 | global batch size: 256 | lm loss: 3.671078E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.107 | TFLOPs: 25.08 | +7: iteration 156480/ 173500 | consumed samples: 40058880 | consumed tokens: 82040586240 | elapsed time per iteration (s): 0.15 | learning rate: 2.433E-05 | global batch size: 256 | lm loss: 3.655640E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.938 | TFLOPs: 26.14 | +7: iteration 156490/ 173500 | consumed samples: 40061440 | consumed tokens: 82045829120 | elapsed time per iteration (s): 0.15 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 3.662999E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.802 | TFLOPs: 26.16 | +7: iteration 156500/ 173500 | consumed samples: 40064000 | consumed tokens: 82051072000 | elapsed time per iteration (s): 0.16 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 3.655449E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.436 | TFLOPs: 25.82 | +7: iteration 156510/ 173500 | consumed samples: 40066560 | consumed tokens: 82056314880 | elapsed time per iteration (s): 0.15 | learning rate: 2.431E-05 | global batch size: 256 | lm loss: 3.669936E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.352 | TFLOPs: 26.12 | +7: iteration 156520/ 173500 | consumed samples: 40069120 | consumed tokens: 82061557760 | elapsed time per iteration (s): 0.15 | learning rate: 2.431E-05 | global batch size: 256 | lm loss: 3.663108E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.356 | TFLOPs: 26.05 | +7: iteration 156530/ 173500 | consumed samples: 40071680 | consumed tokens: 82066800640 | elapsed time per iteration (s): 0.15 | learning rate: 2.430E-05 | global batch size: 256 | lm loss: 3.669292E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.061 | TFLOPs: 26.16 | +7: iteration 156540/ 173500 | consumed samples: 40074240 | consumed tokens: 82072043520 | elapsed time per iteration (s): 0.15 | learning rate: 2.430E-05 | global batch size: 256 | lm loss: 3.670773E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.899 | TFLOPs: 26.17 | +7: iteration 156550/ 173500 | consumed samples: 40076800 | consumed tokens: 82077286400 | elapsed time per iteration (s): 0.16 | learning rate: 2.429E-05 | global batch size: 256 | lm loss: 3.676842E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.390 | TFLOPs: 25.35 | +7: iteration 156560/ 173500 | consumed samples: 40079360 | consumed tokens: 82082529280 | elapsed time per iteration (s): 0.15 | learning rate: 2.429E-05 | global batch size: 256 | lm loss: 3.659457E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.525 | TFLOPs: 26.14 | +7: iteration 156570/ 173500 | consumed samples: 40081920 | consumed tokens: 82087772160 | elapsed time per iteration (s): 0.16 | learning rate: 2.428E-05 | global batch size: 256 | lm loss: 3.670161E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.493 | TFLOPs: 25.52 | +7: iteration 156580/ 173500 | consumed samples: 40084480 | consumed tokens: 82093015040 | elapsed time per iteration (s): 0.15 | learning rate: 2.428E-05 | global batch size: 256 | lm loss: 3.670285E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.272 | TFLOPs: 26.05 | +7: iteration 156590/ 173500 | consumed samples: 40087040 | consumed tokens: 82098257920 | elapsed time per iteration (s): 0.15 | learning rate: 2.427E-05 | global batch size: 256 | lm loss: 3.675597E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.106 | TFLOPs: 25.96 | +7: iteration 156600/ 173500 | consumed samples: 40089600 | consumed tokens: 82103500800 | elapsed time per iteration (s): 0.15 | learning rate: 2.427E-05 | global batch size: 256 | lm loss: 3.648433E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.679 | TFLOPs: 26.15 | +7: iteration 156610/ 173500 | consumed samples: 40092160 | consumed tokens: 82108743680 | elapsed time per iteration (s): 0.16 | learning rate: 2.426E-05 | global batch size: 256 | lm loss: 3.676916E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.175 | TFLOPs: 25.27 | +7: iteration 156620/ 173500 | consumed samples: 40094720 | consumed tokens: 82113986560 | elapsed time per iteration (s): 0.15 | learning rate: 2.426E-05 | global batch size: 256 | lm loss: 3.663235E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.977 | TFLOPs: 26.14 | +7: iteration 156630/ 173500 | consumed samples: 40097280 | consumed tokens: 82119229440 | elapsed time per iteration (s): 0.16 | learning rate: 2.425E-05 | global batch size: 256 | lm loss: 3.661540E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.470 | TFLOPs: 25.81 | +7: iteration 156640/ 173500 | consumed samples: 40099840 | consumed tokens: 82124472320 | elapsed time per iteration (s): 0.16 | learning rate: 2.425E-05 | global batch size: 256 | lm loss: 3.658220E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.830 | TFLOPs: 25.58 | +7: iteration 156650/ 173500 | consumed samples: 40102400 | consumed tokens: 82129715200 | elapsed time per iteration (s): 0.15 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 3.653667E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.291 | TFLOPs: 26.18 | +7: iteration 156660/ 173500 | consumed samples: 40104960 | consumed tokens: 82134958080 | elapsed time per iteration (s): 0.16 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 3.669879E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.207 | TFLOPs: 25.14 | +7: iteration 156670/ 173500 | consumed samples: 40107520 | consumed tokens: 82140200960 | elapsed time per iteration (s): 0.16 | learning rate: 2.423E-05 | global batch size: 256 | lm loss: 3.661540E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.234 | TFLOPs: 25.66 | +7: iteration 156680/ 173500 | consumed samples: 40110080 | consumed tokens: 82145443840 | elapsed time per iteration (s): 0.15 | learning rate: 2.423E-05 | global batch size: 256 | lm loss: 3.662212E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.370 | TFLOPs: 26.20 | +7: iteration 156690/ 173500 | consumed samples: 40112640 | consumed tokens: 82150686720 | elapsed time per iteration (s): 0.15 | learning rate: 2.422E-05 | global batch size: 256 | lm loss: 3.651160E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.157 | TFLOPs: 26.19 | +7: iteration 156700/ 173500 | consumed samples: 40115200 | consumed tokens: 82155929600 | elapsed time per iteration (s): 0.16 | learning rate: 2.422E-05 | global batch size: 256 | lm loss: 3.661724E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.140 | TFLOPs: 25.45 | +7: iteration 156710/ 173500 | consumed samples: 40117760 | consumed tokens: 82161172480 | elapsed time per iteration (s): 0.15 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 3.667177E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.676 | TFLOPs: 26.20 | +7: iteration 156720/ 173500 | consumed samples: 40120320 | consumed tokens: 82166415360 | elapsed time per iteration (s): 0.15 | learning rate: 2.421E-05 | global batch size: 256 | lm loss: 3.670965E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.969 | TFLOPs: 26.20 | +7: iteration 156730/ 173500 | consumed samples: 40122880 | consumed tokens: 82171658240 | elapsed time per iteration (s): 0.16 | learning rate: 2.420E-05 | global batch size: 256 | lm loss: 3.673739E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.471 | TFLOPs: 25.55 | +7: iteration 156740/ 173500 | consumed samples: 40125440 | consumed tokens: 82176901120 | elapsed time per iteration (s): 0.15 | learning rate: 2.420E-05 | global batch size: 256 | lm loss: 3.672591E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.999 | TFLOPs: 26.19 | +7: iteration 156750/ 173500 | consumed samples: 40128000 | consumed tokens: 82182144000 | elapsed time per iteration (s): 0.16 | learning rate: 2.419E-05 | global batch size: 256 | lm loss: 3.653939E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.979 | TFLOPs: 25.84 | +7: iteration 156760/ 173500 | consumed samples: 40130560 | consumed tokens: 82187386880 | elapsed time per iteration (s): 0.15 | learning rate: 2.419E-05 | global batch size: 256 | lm loss: 3.680331E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.131 | TFLOPs: 26.19 | +7: iteration 156770/ 173500 | consumed samples: 40133120 | consumed tokens: 82192629760 | elapsed time per iteration (s): 0.16 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 3.653610E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.769 | TFLOPs: 25.79 | +7: iteration 156780/ 173500 | consumed samples: 40135680 | consumed tokens: 82197872640 | elapsed time per iteration (s): 0.16 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 3.658604E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.275 | TFLOPs: 25.80 | +7: iteration 156790/ 173500 | consumed samples: 40138240 | consumed tokens: 82203115520 | elapsed time per iteration (s): 0.15 | learning rate: 2.417E-05 | global batch size: 256 | lm loss: 3.678947E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.349 | TFLOPs: 26.18 | +7: iteration 156800/ 173500 | consumed samples: 40140800 | consumed tokens: 82208358400 | elapsed time per iteration (s): 0.17 | learning rate: 2.417E-05 | global batch size: 256 | lm loss: 3.652952E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1501.031 | TFLOPs: 23.54 | +7: iteration 156810/ 173500 | consumed samples: 40143360 | consumed tokens: 82213601280 | elapsed time per iteration (s): 0.15 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.662371E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.036 | TFLOPs: 26.17 | +7: iteration 156820/ 173500 | consumed samples: 40145920 | consumed tokens: 82218844160 | elapsed time per iteration (s): 0.15 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.652403E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.905 | TFLOPs: 26.02 | +7: iteration 156830/ 173500 | consumed samples: 40148480 | consumed tokens: 82224087040 | elapsed time per iteration (s): 0.15 | learning rate: 2.415E-05 | global batch size: 256 | lm loss: 3.659961E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.522 | TFLOPs: 26.18 | +7: iteration 156840/ 173500 | consumed samples: 40151040 | consumed tokens: 82229329920 | elapsed time per iteration (s): 0.15 | learning rate: 2.415E-05 | global batch size: 256 | lm loss: 3.668220E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.940 | TFLOPs: 26.19 | +7: iteration 156850/ 173500 | consumed samples: 40153600 | consumed tokens: 82234572800 | elapsed time per iteration (s): 0.15 | learning rate: 2.414E-05 | global batch size: 256 | lm loss: 3.676804E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.611 | TFLOPs: 26.20 | +7: iteration 156860/ 173500 | consumed samples: 40156160 | consumed tokens: 82239815680 | elapsed time per iteration (s): 0.16 | learning rate: 2.414E-05 | global batch size: 256 | lm loss: 3.671299E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.870 | TFLOPs: 25.73 | +7: iteration 156870/ 173500 | consumed samples: 40158720 | consumed tokens: 82245058560 | elapsed time per iteration (s): 0.15 | learning rate: 2.413E-05 | global batch size: 256 | lm loss: 3.657530E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.898 | TFLOPs: 26.19 | +7: iteration 156880/ 173500 | consumed samples: 40161280 | consumed tokens: 82250301440 | elapsed time per iteration (s): 0.15 | learning rate: 2.413E-05 | global batch size: 256 | lm loss: 3.662783E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.630 | TFLOPs: 26.18 | +7: iteration 156890/ 173500 | consumed samples: 40163840 | consumed tokens: 82255544320 | elapsed time per iteration (s): 0.16 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 3.668633E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.175 | TFLOPs: 25.13 | +7: iteration 156900/ 173500 | consumed samples: 40166400 | consumed tokens: 82260787200 | elapsed time per iteration (s): 0.15 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 3.667728E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.995 | TFLOPs: 26.21 | +7: iteration 156910/ 173500 | consumed samples: 40168960 | consumed tokens: 82266030080 | elapsed time per iteration (s): 0.15 | learning rate: 2.411E-05 | global batch size: 256 | lm loss: 3.660636E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.320 | TFLOPs: 25.91 | +7: iteration 156920/ 173500 | consumed samples: 40171520 | consumed tokens: 82271272960 | elapsed time per iteration (s): 0.16 | learning rate: 2.411E-05 | global batch size: 256 | lm loss: 3.650296E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.425 | TFLOPs: 25.74 | +7: iteration 156930/ 173500 | consumed samples: 40174080 | consumed tokens: 82276515840 | elapsed time per iteration (s): 0.15 | learning rate: 2.410E-05 | global batch size: 256 | lm loss: 3.650537E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.962 | TFLOPs: 26.24 | +7: iteration 156940/ 173500 | consumed samples: 40176640 | consumed tokens: 82281758720 | elapsed time per iteration (s): 0.15 | learning rate: 2.410E-05 | global batch size: 256 | lm loss: 3.658079E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.230 | TFLOPs: 26.24 | +7: iteration 156950/ 173500 | consumed samples: 40179200 | consumed tokens: 82287001600 | elapsed time per iteration (s): 0.16 | learning rate: 2.409E-05 | global batch size: 256 | lm loss: 3.654198E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.899 | TFLOPs: 25.89 | +7: iteration 156960/ 173500 | consumed samples: 40181760 | consumed tokens: 82292244480 | elapsed time per iteration (s): 0.16 | learning rate: 2.409E-05 | global batch size: 256 | lm loss: 3.658292E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.806 | TFLOPs: 25.81 | +7: iteration 156970/ 173500 | consumed samples: 40184320 | consumed tokens: 82297487360 | elapsed time per iteration (s): 0.15 | learning rate: 2.408E-05 | global batch size: 256 | lm loss: 3.671935E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.513 | TFLOPs: 26.23 | +7: iteration 156980/ 173500 | consumed samples: 40186880 | consumed tokens: 82302730240 | elapsed time per iteration (s): 0.16 | learning rate: 2.408E-05 | global batch size: 256 | lm loss: 3.671727E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.376 | TFLOPs: 25.49 | +7: iteration 156990/ 173500 | consumed samples: 40189440 | consumed tokens: 82307973120 | elapsed time per iteration (s): 0.16 | learning rate: 2.407E-05 | global batch size: 256 | lm loss: 3.663807E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.129 | TFLOPs: 25.49 | +7: iteration 157000/ 173500 | consumed samples: 40192000 | consumed tokens: 82313216000 | elapsed time per iteration (s): 0.16 | learning rate: 2.407E-05 | global batch size: 256 | lm loss: 3.648519E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.300 | TFLOPs: 25.46 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 157000 | lm loss value: 3.821790E+00 | lm loss PPL: 4.568592E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 157000 to checkpoints_44m91b100m +0: [2023-03-17 07:05:14,509] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step157000 is begin to save! +0: [2023-03-17 07:05:14,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:05:14,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:05:14,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:05:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:05:14,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:05:14,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:05:14,605] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:05:14,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:05:14,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:05:14,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:05:14,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:05:14,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:05:14,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:05:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:05:14,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:05:14,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:05:14,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:05:14,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:05:14,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:05:14,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:05:14,655] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step157000/mp_rank_00_model_states.pt +0: [2023-03-17 07:05:14,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:05:14,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:05:14,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:05:14,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +7: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +1: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +2: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +4: [2023-03-17 07:05:14,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:05:14,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +6: [2023-03-17 07:05:14,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:05:14,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +3: [2023-03-17 07:05:14,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:05:14,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:05:14,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +5: [2023-03-17 07:05:14,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:05:14,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:05:14,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step157000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: [2023-03-17 07:05:14,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step157000 is ready now! +0: successfully saved checkpoint at iteration 157000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 214.08 +7: iteration 157010/ 173500 | consumed samples: 40194560 | consumed tokens: 82318458880 | elapsed time per iteration (s): 0.19 | learning rate: 2.406E-05 | global batch size: 256 | lm loss: 3.658964E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1340.584 | TFLOPs: 21.02 | +7: iteration 157020/ 173500 | consumed samples: 40197120 | consumed tokens: 82323701760 | elapsed time per iteration (s): 0.16 | learning rate: 2.406E-05 | global batch size: 256 | lm loss: 3.648941E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.072 | TFLOPs: 25.88 | +7: iteration 157030/ 173500 | consumed samples: 40199680 | consumed tokens: 82328944640 | elapsed time per iteration (s): 0.16 | learning rate: 2.405E-05 | global batch size: 256 | lm loss: 3.663332E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.201 | TFLOPs: 25.75 | +7: iteration 157040/ 173500 | consumed samples: 40202240 | consumed tokens: 82334187520 | elapsed time per iteration (s): 0.15 | learning rate: 2.405E-05 | global batch size: 256 | lm loss: 3.662933E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.409 | TFLOPs: 26.20 | +7: iteration 157050/ 173500 | consumed samples: 40204800 | consumed tokens: 82339430400 | elapsed time per iteration (s): 0.16 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 3.664839E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.494 | TFLOPs: 25.88 | +7: iteration 157060/ 173500 | consumed samples: 40207360 | consumed tokens: 82344673280 | elapsed time per iteration (s): 0.16 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 3.672752E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.063 | TFLOPs: 25.85 | +7: iteration 157070/ 173500 | consumed samples: 40209920 | consumed tokens: 82349916160 | elapsed time per iteration (s): 0.15 | learning rate: 2.403E-05 | global batch size: 256 | lm loss: 3.656295E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.888 | TFLOPs: 26.20 | +7: iteration 157080/ 173500 | consumed samples: 40212480 | consumed tokens: 82355159040 | elapsed time per iteration (s): 0.15 | learning rate: 2.403E-05 | global batch size: 256 | lm loss: 3.670718E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.988 | TFLOPs: 26.24 | +7: iteration 157090/ 173500 | consumed samples: 40215040 | consumed tokens: 82360401920 | elapsed time per iteration (s): 0.15 | learning rate: 2.402E-05 | global batch size: 256 | lm loss: 3.656018E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.768 | TFLOPs: 26.26 | +7: iteration 157100/ 173500 | consumed samples: 40217600 | consumed tokens: 82365644800 | elapsed time per iteration (s): 0.16 | learning rate: 2.402E-05 | global batch size: 256 | lm loss: 3.663796E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.080 | TFLOPs: 25.88 | +7: iteration 157110/ 173500 | consumed samples: 40220160 | consumed tokens: 82370887680 | elapsed time per iteration (s): 0.16 | learning rate: 2.401E-05 | global batch size: 256 | lm loss: 3.656529E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.097 | TFLOPs: 25.36 | +7: iteration 157120/ 173500 | consumed samples: 40222720 | consumed tokens: 82376130560 | elapsed time per iteration (s): 0.15 | learning rate: 2.401E-05 | global batch size: 256 | lm loss: 3.678856E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.607 | TFLOPs: 26.25 | +7: iteration 157130/ 173500 | consumed samples: 40225280 | consumed tokens: 82381373440 | elapsed time per iteration (s): 0.16 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 3.665550E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.557 | TFLOPs: 24.50 | +7: iteration 157140/ 173500 | consumed samples: 40227840 | consumed tokens: 82386616320 | elapsed time per iteration (s): 0.16 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 3.673933E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.395 | TFLOPs: 25.46 | +7: iteration 157150/ 173500 | consumed samples: 40230400 | consumed tokens: 82391859200 | elapsed time per iteration (s): 0.16 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 3.648165E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1557.072 | TFLOPs: 24.42 | +7: iteration 157160/ 173500 | consumed samples: 40232960 | consumed tokens: 82397102080 | elapsed time per iteration (s): 0.15 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 3.648580E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.924 | TFLOPs: 26.20 | +7: iteration 157170/ 173500 | consumed samples: 40235520 | consumed tokens: 82402344960 | elapsed time per iteration (s): 0.15 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 3.655099E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.577 | TFLOPs: 26.21 | +7: iteration 157180/ 173500 | consumed samples: 40238080 | consumed tokens: 82407587840 | elapsed time per iteration (s): 0.15 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 3.658640E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.850 | TFLOPs: 26.23 | +7: iteration 157190/ 173500 | consumed samples: 40240640 | consumed tokens: 82412830720 | elapsed time per iteration (s): 0.16 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 3.667906E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.911 | TFLOPs: 25.89 | +7: iteration 157200/ 173500 | consumed samples: 40243200 | consumed tokens: 82418073600 | elapsed time per iteration (s): 0.16 | learning rate: 2.397E-05 | global batch size: 256 | lm loss: 3.650094E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.141 | TFLOPs: 25.64 | +7: iteration 157210/ 173500 | consumed samples: 40245760 | consumed tokens: 82423316480 | elapsed time per iteration (s): 0.15 | learning rate: 2.397E-05 | global batch size: 256 | lm loss: 3.658865E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.359 | TFLOPs: 26.20 | +7: iteration 157220/ 173500 | consumed samples: 40248320 | consumed tokens: 82428559360 | elapsed time per iteration (s): 0.16 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 3.657081E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.073 | TFLOPs: 25.83 | +7: iteration 157230/ 173500 | consumed samples: 40250880 | consumed tokens: 82433802240 | elapsed time per iteration (s): 0.15 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 3.661752E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.362 | TFLOPs: 26.20 | +7: iteration 157240/ 173500 | consumed samples: 40253440 | consumed tokens: 82439045120 | elapsed time per iteration (s): 0.15 | learning rate: 2.395E-05 | global batch size: 256 | lm loss: 3.670315E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.234 | TFLOPs: 26.24 | +7: iteration 157250/ 173500 | consumed samples: 40256000 | consumed tokens: 82444288000 | elapsed time per iteration (s): 0.15 | learning rate: 2.395E-05 | global batch size: 256 | lm loss: 3.672498E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.602 | TFLOPs: 26.20 | +7: iteration 157260/ 173500 | consumed samples: 40258560 | consumed tokens: 82449530880 | elapsed time per iteration (s): 0.15 | learning rate: 2.394E-05 | global batch size: 256 | lm loss: 3.654774E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.811 | TFLOPs: 26.22 | +7: iteration 157270/ 173500 | consumed samples: 40261120 | consumed tokens: 82454773760 | elapsed time per iteration (s): 0.16 | learning rate: 2.394E-05 | global batch size: 256 | lm loss: 3.655971E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.068 | TFLOPs: 25.56 | +7: iteration 157280/ 173500 | consumed samples: 40263680 | consumed tokens: 82460016640 | elapsed time per iteration (s): 0.15 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 3.648761E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.450 | TFLOPs: 26.20 | +7: iteration 157290/ 173500 | consumed samples: 40266240 | consumed tokens: 82465259520 | elapsed time per iteration (s): 0.15 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 3.660827E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.420 | TFLOPs: 26.20 | +7: iteration 157300/ 173500 | consumed samples: 40268800 | consumed tokens: 82470502400 | elapsed time per iteration (s): 0.16 | learning rate: 2.392E-05 | global batch size: 256 | lm loss: 3.659822E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.764 | TFLOPs: 25.57 | +7: iteration 157310/ 173500 | consumed samples: 40271360 | consumed tokens: 82475745280 | elapsed time per iteration (s): 0.15 | learning rate: 2.392E-05 | global batch size: 256 | lm loss: 3.662070E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.317 | TFLOPs: 26.18 | +7: iteration 157320/ 173500 | consumed samples: 40273920 | consumed tokens: 82480988160 | elapsed time per iteration (s): 0.16 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 3.649156E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.883 | TFLOPs: 25.84 | +7: iteration 157330/ 173500 | consumed samples: 40276480 | consumed tokens: 82486231040 | elapsed time per iteration (s): 0.15 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 3.654172E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.100 | TFLOPs: 26.21 | +7: iteration 157340/ 173500 | consumed samples: 40279040 | consumed tokens: 82491473920 | elapsed time per iteration (s): 0.15 | learning rate: 2.390E-05 | global batch size: 256 | lm loss: 3.672869E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.619 | TFLOPs: 26.12 | +7: iteration 157350/ 173500 | consumed samples: 40281600 | consumed tokens: 82496716800 | elapsed time per iteration (s): 0.15 | learning rate: 2.390E-05 | global batch size: 256 | lm loss: 3.670440E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.352 | TFLOPs: 26.21 | +7: iteration 157360/ 173500 | consumed samples: 40284160 | consumed tokens: 82501959680 | elapsed time per iteration (s): 0.15 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 3.663889E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.034 | TFLOPs: 25.91 | +7: iteration 157370/ 173500 | consumed samples: 40286720 | consumed tokens: 82507202560 | elapsed time per iteration (s): 0.16 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 3.670122E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.242 | TFLOPs: 25.86 | +7: iteration 157380/ 173500 | consumed samples: 40289280 | consumed tokens: 82512445440 | elapsed time per iteration (s): 0.16 | learning rate: 2.388E-05 | global batch size: 256 | lm loss: 3.661603E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.352 | TFLOPs: 25.62 | +7: iteration 157390/ 173500 | consumed samples: 40291840 | consumed tokens: 82517688320 | elapsed time per iteration (s): 0.16 | learning rate: 2.388E-05 | global batch size: 256 | lm loss: 3.661486E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.357 | TFLOPs: 25.87 | +7: iteration 157400/ 173500 | consumed samples: 40294400 | consumed tokens: 82522931200 | elapsed time per iteration (s): 0.16 | learning rate: 2.387E-05 | global batch size: 256 | lm loss: 3.652605E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.677 | TFLOPs: 25.78 | +7: iteration 157410/ 173500 | consumed samples: 40296960 | consumed tokens: 82528174080 | elapsed time per iteration (s): 0.16 | learning rate: 2.387E-05 | global batch size: 256 | lm loss: 3.670136E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.121 | TFLOPs: 25.74 | +7: iteration 157420/ 173500 | consumed samples: 40299520 | consumed tokens: 82533416960 | elapsed time per iteration (s): 0.16 | learning rate: 2.386E-05 | global batch size: 256 | lm loss: 3.652161E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.147 | TFLOPs: 24.94 | +7: iteration 157430/ 173500 | consumed samples: 40302080 | consumed tokens: 82538659840 | elapsed time per iteration (s): 0.15 | learning rate: 2.386E-05 | global batch size: 256 | lm loss: 3.661388E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.275 | TFLOPs: 26.05 | +7: iteration 157440/ 173500 | consumed samples: 40304640 | consumed tokens: 82543902720 | elapsed time per iteration (s): 0.16 | learning rate: 2.386E-05 | global batch size: 256 | lm loss: 3.670530E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.499 | TFLOPs: 25.76 | +7: iteration 157450/ 173500 | consumed samples: 40307200 | consumed tokens: 82549145600 | elapsed time per iteration (s): 0.15 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 3.659444E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.900 | TFLOPs: 26.16 | +7: iteration 157460/ 173500 | consumed samples: 40309760 | consumed tokens: 82554388480 | elapsed time per iteration (s): 0.16 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 3.653562E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.990 | TFLOPs: 25.53 | +7: iteration 157470/ 173500 | consumed samples: 40312320 | consumed tokens: 82559631360 | elapsed time per iteration (s): 0.16 | learning rate: 2.384E-05 | global batch size: 256 | lm loss: 3.670822E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.597 | TFLOPs: 25.87 | +7: iteration 157480/ 173500 | consumed samples: 40314880 | consumed tokens: 82564874240 | elapsed time per iteration (s): 0.15 | learning rate: 2.384E-05 | global batch size: 256 | lm loss: 3.666763E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.080 | TFLOPs: 26.21 | +7: iteration 157490/ 173500 | consumed samples: 40317440 | consumed tokens: 82570117120 | elapsed time per iteration (s): 0.15 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 3.651528E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.722 | TFLOPs: 26.11 | +7: iteration 157500/ 173500 | consumed samples: 40320000 | consumed tokens: 82575360000 | elapsed time per iteration (s): 0.15 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 3.658950E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.425 | TFLOPs: 26.18 | +7: iteration 157510/ 173500 | consumed samples: 40322560 | consumed tokens: 82580602880 | elapsed time per iteration (s): 0.16 | learning rate: 2.382E-05 | global batch size: 256 | lm loss: 3.674559E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.670 | TFLOPs: 25.31 | +7: iteration 157520/ 173500 | consumed samples: 40325120 | consumed tokens: 82585845760 | elapsed time per iteration (s): 0.15 | learning rate: 2.382E-05 | global batch size: 256 | lm loss: 3.672303E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.893 | TFLOPs: 26.24 | +7: iteration 157530/ 173500 | consumed samples: 40327680 | consumed tokens: 82591088640 | elapsed time per iteration (s): 0.15 | learning rate: 2.381E-05 | global batch size: 256 | lm loss: 3.657068E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.150 | TFLOPs: 26.02 | +7: iteration 157540/ 173500 | consumed samples: 40330240 | consumed tokens: 82596331520 | elapsed time per iteration (s): 0.16 | learning rate: 2.381E-05 | global batch size: 256 | lm loss: 3.656268E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.372 | TFLOPs: 25.47 | +7: iteration 157550/ 173500 | consumed samples: 40332800 | consumed tokens: 82601574400 | elapsed time per iteration (s): 0.16 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 3.671519E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.247 | TFLOPs: 25.85 | +7: iteration 157560/ 173500 | consumed samples: 40335360 | consumed tokens: 82606817280 | elapsed time per iteration (s): 0.15 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 3.663578E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.154 | TFLOPs: 25.91 | +7: iteration 157570/ 173500 | consumed samples: 40337920 | consumed tokens: 82612060160 | elapsed time per iteration (s): 0.15 | learning rate: 2.379E-05 | global batch size: 256 | lm loss: 3.671670E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.983 | TFLOPs: 26.22 | +7: iteration 157580/ 173500 | consumed samples: 40340480 | consumed tokens: 82617303040 | elapsed time per iteration (s): 0.15 | learning rate: 2.379E-05 | global batch size: 256 | lm loss: 3.660590E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.603 | TFLOPs: 25.92 | +7: iteration 157590/ 173500 | consumed samples: 40343040 | consumed tokens: 82622545920 | elapsed time per iteration (s): 0.15 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 3.647910E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.129 | TFLOPs: 26.08 | +7: iteration 157600/ 173500 | consumed samples: 40345600 | consumed tokens: 82627788800 | elapsed time per iteration (s): 0.15 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 3.668359E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.718 | TFLOPs: 26.00 | +7: iteration 157610/ 173500 | consumed samples: 40348160 | consumed tokens: 82633031680 | elapsed time per iteration (s): 0.15 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 3.657450E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.741 | TFLOPs: 26.20 | +7: iteration 157620/ 173500 | consumed samples: 40350720 | consumed tokens: 82638274560 | elapsed time per iteration (s): 0.15 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 3.662706E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.867 | TFLOPs: 26.20 | +7: iteration 157630/ 173500 | consumed samples: 40353280 | consumed tokens: 82643517440 | elapsed time per iteration (s): 0.15 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 3.673523E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.269 | TFLOPs: 26.21 | +7: iteration 157640/ 173500 | consumed samples: 40355840 | consumed tokens: 82648760320 | elapsed time per iteration (s): 0.15 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 3.664851E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.075 | TFLOPs: 26.25 | +7: iteration 157650/ 173500 | consumed samples: 40358400 | consumed tokens: 82654003200 | elapsed time per iteration (s): 0.15 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 3.667939E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.545 | TFLOPs: 26.25 | +7: iteration 157660/ 173500 | consumed samples: 40360960 | consumed tokens: 82659246080 | elapsed time per iteration (s): 0.16 | learning rate: 2.375E-05 | global batch size: 256 | lm loss: 3.658144E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.871 | TFLOPs: 25.72 | +7: iteration 157670/ 173500 | consumed samples: 40363520 | consumed tokens: 82664488960 | elapsed time per iteration (s): 0.16 | learning rate: 2.375E-05 | global batch size: 256 | lm loss: 3.656700E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.797 | TFLOPs: 25.23 | +7: iteration 157680/ 173500 | consumed samples: 40366080 | consumed tokens: 82669731840 | elapsed time per iteration (s): 0.15 | learning rate: 2.374E-05 | global batch size: 256 | lm loss: 3.649518E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.610 | TFLOPs: 26.26 | +7: iteration 157690/ 173500 | consumed samples: 40368640 | consumed tokens: 82674974720 | elapsed time per iteration (s): 0.16 | learning rate: 2.374E-05 | global batch size: 256 | lm loss: 3.672235E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.018 | TFLOPs: 25.89 | +7: iteration 157700/ 173500 | consumed samples: 40371200 | consumed tokens: 82680217600 | elapsed time per iteration (s): 0.15 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 3.668844E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.080 | TFLOPs: 26.25 | +7: iteration 157710/ 173500 | consumed samples: 40373760 | consumed tokens: 82685460480 | elapsed time per iteration (s): 0.16 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 3.663568E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.924 | TFLOPs: 25.72 | +7: iteration 157720/ 173500 | consumed samples: 40376320 | consumed tokens: 82690703360 | elapsed time per iteration (s): 0.15 | learning rate: 2.372E-05 | global batch size: 256 | lm loss: 3.655627E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.948 | TFLOPs: 26.25 | +7: iteration 157730/ 173500 | consumed samples: 40378880 | consumed tokens: 82695946240 | elapsed time per iteration (s): 0.15 | learning rate: 2.372E-05 | global batch size: 256 | lm loss: 3.674662E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.426 | TFLOPs: 25.99 | +7: iteration 157740/ 173500 | consumed samples: 40381440 | consumed tokens: 82701189120 | elapsed time per iteration (s): 0.15 | learning rate: 2.371E-05 | global batch size: 256 | lm loss: 3.661715E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.234 | TFLOPs: 26.21 | +7: iteration 157750/ 173500 | consumed samples: 40384000 | consumed tokens: 82706432000 | elapsed time per iteration (s): 0.15 | learning rate: 2.371E-05 | global batch size: 256 | lm loss: 3.664319E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.113 | TFLOPs: 26.18 | +7: iteration 157760/ 173500 | consumed samples: 40386560 | consumed tokens: 82711674880 | elapsed time per iteration (s): 0.16 | learning rate: 2.370E-05 | global batch size: 256 | lm loss: 3.659334E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.657 | TFLOPs: 25.67 | +7: iteration 157770/ 173500 | consumed samples: 40389120 | consumed tokens: 82716917760 | elapsed time per iteration (s): 0.16 | learning rate: 2.370E-05 | global batch size: 256 | lm loss: 3.644677E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.644 | TFLOPs: 25.78 | +7: iteration 157780/ 173500 | consumed samples: 40391680 | consumed tokens: 82722160640 | elapsed time per iteration (s): 0.16 | learning rate: 2.369E-05 | global batch size: 256 | lm loss: 3.660512E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.078 | TFLOPs: 25.06 | +7: iteration 157790/ 173500 | consumed samples: 40394240 | consumed tokens: 82727403520 | elapsed time per iteration (s): 0.16 | learning rate: 2.369E-05 | global batch size: 256 | lm loss: 3.666613E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.424 | TFLOPs: 25.21 | +7: iteration 157800/ 173500 | consumed samples: 40396800 | consumed tokens: 82732646400 | elapsed time per iteration (s): 0.16 | learning rate: 2.369E-05 | global batch size: 256 | lm loss: 3.651255E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.453 | TFLOPs: 25.32 | +7: iteration 157810/ 173500 | consumed samples: 40399360 | consumed tokens: 82737889280 | elapsed time per iteration (s): 0.16 | learning rate: 2.368E-05 | global batch size: 256 | lm loss: 3.662459E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.794 | TFLOPs: 25.17 | +7: iteration 157820/ 173500 | consumed samples: 40401920 | consumed tokens: 82743132160 | elapsed time per iteration (s): 0.16 | learning rate: 2.368E-05 | global batch size: 256 | lm loss: 3.675418E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.631 | TFLOPs: 25.54 | +7: iteration 157830/ 173500 | consumed samples: 40404480 | consumed tokens: 82748375040 | elapsed time per iteration (s): 0.16 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.676817E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.910 | TFLOPs: 25.56 | +7: iteration 157840/ 173500 | consumed samples: 40407040 | consumed tokens: 82753617920 | elapsed time per iteration (s): 0.15 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.659827E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.265 | TFLOPs: 26.04 | +7: iteration 157850/ 173500 | consumed samples: 40409600 | consumed tokens: 82758860800 | elapsed time per iteration (s): 0.15 | learning rate: 2.366E-05 | global batch size: 256 | lm loss: 3.659251E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.725 | TFLOPs: 26.09 | +7: iteration 157860/ 173500 | consumed samples: 40412160 | consumed tokens: 82764103680 | elapsed time per iteration (s): 0.16 | learning rate: 2.366E-05 | global batch size: 256 | lm loss: 3.662233E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.289 | TFLOPs: 25.88 | +7: iteration 157870/ 173500 | consumed samples: 40414720 | consumed tokens: 82769346560 | elapsed time per iteration (s): 0.16 | learning rate: 2.365E-05 | global batch size: 256 | lm loss: 3.684064E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.456 | TFLOPs: 25.90 | +7: iteration 157880/ 173500 | consumed samples: 40417280 | consumed tokens: 82774589440 | elapsed time per iteration (s): 0.15 | learning rate: 2.365E-05 | global batch size: 256 | lm loss: 3.663898E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.997 | TFLOPs: 26.19 | +7: iteration 157890/ 173500 | consumed samples: 40419840 | consumed tokens: 82779832320 | elapsed time per iteration (s): 0.15 | learning rate: 2.364E-05 | global batch size: 256 | lm loss: 3.673199E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.946 | TFLOPs: 25.97 | +7: iteration 157900/ 173500 | consumed samples: 40422400 | consumed tokens: 82785075200 | elapsed time per iteration (s): 0.15 | learning rate: 2.364E-05 | global batch size: 256 | lm loss: 3.672082E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.900 | TFLOPs: 26.11 | +7: iteration 157910/ 173500 | consumed samples: 40424960 | consumed tokens: 82790318080 | elapsed time per iteration (s): 0.15 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 3.660369E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.227 | TFLOPs: 26.04 | +7: iteration 157920/ 173500 | consumed samples: 40427520 | consumed tokens: 82795560960 | elapsed time per iteration (s): 0.15 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 3.665565E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.550 | TFLOPs: 26.06 | +7: iteration 157930/ 173500 | consumed samples: 40430080 | consumed tokens: 82800803840 | elapsed time per iteration (s): 0.16 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 3.662813E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.009 | TFLOPs: 25.42 | +7: iteration 157940/ 173500 | consumed samples: 40432640 | consumed tokens: 82806046720 | elapsed time per iteration (s): 0.16 | learning rate: 2.362E-05 | global batch size: 256 | lm loss: 3.660225E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.214 | TFLOPs: 25.71 | +7: iteration 157950/ 173500 | consumed samples: 40435200 | consumed tokens: 82811289600 | elapsed time per iteration (s): 0.16 | learning rate: 2.362E-05 | global batch size: 256 | lm loss: 3.679381E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.706 | TFLOPs: 25.12 | +7: iteration 157960/ 173500 | consumed samples: 40437760 | consumed tokens: 82816532480 | elapsed time per iteration (s): 0.16 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 3.651406E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.054 | TFLOPs: 24.69 | +7: iteration 157970/ 173500 | consumed samples: 40440320 | consumed tokens: 82821775360 | elapsed time per iteration (s): 0.15 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 3.658729E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.485 | TFLOPs: 25.99 | +7: iteration 157980/ 173500 | consumed samples: 40442880 | consumed tokens: 82827018240 | elapsed time per iteration (s): 0.15 | learning rate: 2.360E-05 | global batch size: 256 | lm loss: 3.662790E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.686 | TFLOPs: 25.97 | +7: iteration 157990/ 173500 | consumed samples: 40445440 | consumed tokens: 82832261120 | elapsed time per iteration (s): 0.16 | learning rate: 2.360E-05 | global batch size: 256 | lm loss: 3.650378E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.747 | TFLOPs: 25.12 | +0: [2023-03-17 07:07:50,053] [INFO] [logging.py:68:log_dist] [Rank 0] step=158000, skipped=0, lr=[2.3592725009494674e-05, 2.3592725009494674e-05, 2.3592725009494674e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 158000/ 173500 | consumed samples: 40448000 | consumed tokens: 82837504000 | elapsed time per iteration (s): 0.15 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 3.667888E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.523 | TFLOPs: 26.06 | +0: steps: 158000 loss: 3.6542 iter time (s): 0.155 samples/sec: 1654.663 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 158000 | lm loss value: 3.839189E+00 | lm loss PPL: 4.648774E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 158000 to checkpoints_44m91b100m +0: [2023-03-17 07:07:50,127] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step158000 is begin to save! +0: [2023-03-17 07:07:50,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:07:50,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:07:50,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:07:50,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:07:50,208] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:07:50,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:07:50,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:07:50,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:07:50,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:07:50,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:07:50,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:07:50,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:07:50,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:07:50,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:07:50,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:07:50,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:07:50,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:07:50,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:07:50,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:07:50,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:07:50,268] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step158000/mp_rank_00_model_states.pt +0: [2023-03-17 07:07:50,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:07:50,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:07:50,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:07:50,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,302] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,302] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,304] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +2: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +4: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +6: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +7: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +5: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +3: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step158000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +1: [2023-03-17 07:07:50,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step158000 is ready now! +0: successfully saved checkpoint at iteration 158000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.88 +7: iteration 158010/ 173500 | consumed samples: 40450560 | consumed tokens: 82842746880 | elapsed time per iteration (s): 0.18 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 3.671251E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1428.022 | TFLOPs: 22.39 | +7: iteration 158020/ 173500 | consumed samples: 40453120 | consumed tokens: 82847989760 | elapsed time per iteration (s): 0.15 | learning rate: 2.358E-05 | global batch size: 256 | lm loss: 3.657592E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.510 | TFLOPs: 26.18 | +7: iteration 158030/ 173500 | consumed samples: 40455680 | consumed tokens: 82853232640 | elapsed time per iteration (s): 0.15 | learning rate: 2.358E-05 | global batch size: 256 | lm loss: 3.651073E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.208 | TFLOPs: 26.19 | +7: iteration 158040/ 173500 | consumed samples: 40458240 | consumed tokens: 82858475520 | elapsed time per iteration (s): 0.16 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.665017E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.515 | TFLOPs: 25.52 | +7: iteration 158050/ 173500 | consumed samples: 40460800 | consumed tokens: 82863718400 | elapsed time per iteration (s): 0.15 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.659260E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.378 | TFLOPs: 25.93 | +7: iteration 158060/ 173500 | consumed samples: 40463360 | consumed tokens: 82868961280 | elapsed time per iteration (s): 0.15 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.667016E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.743 | TFLOPs: 26.06 | +7: iteration 158070/ 173500 | consumed samples: 40465920 | consumed tokens: 82874204160 | elapsed time per iteration (s): 0.16 | learning rate: 2.356E-05 | global batch size: 256 | lm loss: 3.667694E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.728 | TFLOPs: 25.56 | +7: iteration 158080/ 173500 | consumed samples: 40468480 | consumed tokens: 82879447040 | elapsed time per iteration (s): 0.15 | learning rate: 2.356E-05 | global batch size: 256 | lm loss: 3.659001E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.483 | TFLOPs: 26.09 | +7: iteration 158090/ 173500 | consumed samples: 40471040 | consumed tokens: 82884689920 | elapsed time per iteration (s): 0.15 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 3.667432E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.570 | TFLOPs: 26.06 | +7: iteration 158100/ 173500 | consumed samples: 40473600 | consumed tokens: 82889932800 | elapsed time per iteration (s): 0.15 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 3.659077E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.231 | TFLOPs: 25.93 | +7: iteration 158110/ 173500 | consumed samples: 40476160 | consumed tokens: 82895175680 | elapsed time per iteration (s): 0.16 | learning rate: 2.354E-05 | global batch size: 256 | lm loss: 3.666313E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.933 | TFLOPs: 25.56 | +7: iteration 158120/ 173500 | consumed samples: 40478720 | consumed tokens: 82900418560 | elapsed time per iteration (s): 0.16 | learning rate: 2.354E-05 | global batch size: 256 | lm loss: 3.666197E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.915 | TFLOPs: 25.37 | +7: iteration 158130/ 173500 | consumed samples: 40481280 | consumed tokens: 82905661440 | elapsed time per iteration (s): 0.16 | learning rate: 2.353E-05 | global batch size: 256 | lm loss: 3.658170E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.546 | TFLOPs: 25.41 | +7: iteration 158140/ 173500 | consumed samples: 40483840 | consumed tokens: 82910904320 | elapsed time per iteration (s): 0.15 | learning rate: 2.353E-05 | global batch size: 256 | lm loss: 3.661031E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.194 | TFLOPs: 26.18 | +7: iteration 158150/ 173500 | consumed samples: 40486400 | consumed tokens: 82916147200 | elapsed time per iteration (s): 0.16 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 3.679525E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.957 | TFLOPs: 25.45 | +7: iteration 158160/ 173500 | consumed samples: 40488960 | consumed tokens: 82921390080 | elapsed time per iteration (s): 0.16 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 3.659422E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.553 | TFLOPs: 25.87 | +7: iteration 158170/ 173500 | consumed samples: 40491520 | consumed tokens: 82926632960 | elapsed time per iteration (s): 0.15 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.652270E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.414 | TFLOPs: 26.15 | +7: iteration 158180/ 173500 | consumed samples: 40494080 | consumed tokens: 82931875840 | elapsed time per iteration (s): 0.15 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.662167E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.704 | TFLOPs: 26.11 | +7: iteration 158190/ 173500 | consumed samples: 40496640 | consumed tokens: 82937118720 | elapsed time per iteration (s): 0.15 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.666026E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.895 | TFLOPs: 26.09 | +7: iteration 158200/ 173500 | consumed samples: 40499200 | consumed tokens: 82942361600 | elapsed time per iteration (s): 0.15 | learning rate: 2.350E-05 | global batch size: 256 | lm loss: 3.666695E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.096 | TFLOPs: 25.99 | +7: iteration 158210/ 173500 | consumed samples: 40501760 | consumed tokens: 82947604480 | elapsed time per iteration (s): 0.16 | learning rate: 2.350E-05 | global batch size: 256 | lm loss: 3.651266E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.890 | TFLOPs: 25.36 | +7: iteration 158220/ 173500 | consumed samples: 40504320 | consumed tokens: 82952847360 | elapsed time per iteration (s): 0.16 | learning rate: 2.349E-05 | global batch size: 256 | lm loss: 3.657150E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.517 | TFLOPs: 25.52 | +7: iteration 158230/ 173500 | consumed samples: 40506880 | consumed tokens: 82958090240 | elapsed time per iteration (s): 0.15 | learning rate: 2.349E-05 | global batch size: 256 | lm loss: 3.673617E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.880 | TFLOPs: 25.97 | +7: iteration 158240/ 173500 | consumed samples: 40509440 | consumed tokens: 82963333120 | elapsed time per iteration (s): 0.15 | learning rate: 2.348E-05 | global batch size: 256 | lm loss: 3.666939E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.246 | TFLOPs: 26.01 | +7: iteration 158250/ 173500 | consumed samples: 40512000 | consumed tokens: 82968576000 | elapsed time per iteration (s): 0.15 | learning rate: 2.348E-05 | global batch size: 256 | lm loss: 3.665582E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.350 | TFLOPs: 25.98 | +7: iteration 158260/ 173500 | consumed samples: 40514560 | consumed tokens: 82973818880 | elapsed time per iteration (s): 0.15 | learning rate: 2.347E-05 | global batch size: 256 | lm loss: 3.663202E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.022 | TFLOPs: 26.03 | +7: iteration 158270/ 173500 | consumed samples: 40517120 | consumed tokens: 82979061760 | elapsed time per iteration (s): 0.15 | learning rate: 2.347E-05 | global batch size: 256 | lm loss: 3.656464E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.923 | TFLOPs: 26.03 | +7: iteration 158280/ 173500 | consumed samples: 40519680 | consumed tokens: 82984304640 | elapsed time per iteration (s): 0.15 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.674569E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.645 | TFLOPs: 26.07 | +7: iteration 158290/ 173500 | consumed samples: 40522240 | consumed tokens: 82989547520 | elapsed time per iteration (s): 0.15 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.658393E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.697 | TFLOPs: 26.06 | +7: iteration 158300/ 173500 | consumed samples: 40524800 | consumed tokens: 82994790400 | elapsed time per iteration (s): 0.15 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.661136E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.445 | TFLOPs: 26.06 | +7: iteration 158310/ 173500 | consumed samples: 40527360 | consumed tokens: 83000033280 | elapsed time per iteration (s): 0.16 | learning rate: 2.345E-05 | global batch size: 256 | lm loss: 3.656554E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.820 | TFLOPs: 25.54 | +7: iteration 158320/ 173500 | consumed samples: 40529920 | consumed tokens: 83005276160 | elapsed time per iteration (s): 0.15 | learning rate: 2.345E-05 | global batch size: 256 | lm loss: 3.654353E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.716 | TFLOPs: 26.11 | +7: iteration 158330/ 173500 | consumed samples: 40532480 | consumed tokens: 83010519040 | elapsed time per iteration (s): 0.16 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 3.669501E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.817 | TFLOPs: 24.89 | +7: iteration 158340/ 173500 | consumed samples: 40535040 | consumed tokens: 83015761920 | elapsed time per iteration (s): 0.16 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 3.651779E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.892 | TFLOPs: 25.72 | +7: iteration 158350/ 173500 | consumed samples: 40537600 | consumed tokens: 83021004800 | elapsed time per iteration (s): 0.16 | learning rate: 2.343E-05 | global batch size: 256 | lm loss: 3.674884E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.268 | TFLOPs: 25.63 | +7: iteration 158360/ 173500 | consumed samples: 40540160 | consumed tokens: 83026247680 | elapsed time per iteration (s): 0.15 | learning rate: 2.343E-05 | global batch size: 256 | lm loss: 3.659267E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.818 | TFLOPs: 26.20 | +7: iteration 158370/ 173500 | consumed samples: 40542720 | consumed tokens: 83031490560 | elapsed time per iteration (s): 0.15 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 3.657421E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.716 | TFLOPs: 26.22 | +7: iteration 158380/ 173500 | consumed samples: 40545280 | consumed tokens: 83036733440 | elapsed time per iteration (s): 0.15 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 3.664745E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.150 | TFLOPs: 26.10 | +7: iteration 158390/ 173500 | consumed samples: 40547840 | consumed tokens: 83041976320 | elapsed time per iteration (s): 0.16 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 3.650430E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.882 | TFLOPs: 25.69 | +7: iteration 158400/ 173500 | consumed samples: 40550400 | consumed tokens: 83047219200 | elapsed time per iteration (s): 0.16 | learning rate: 2.341E-05 | global batch size: 256 | lm loss: 3.670044E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.693 | TFLOPs: 25.13 | +7: iteration 158410/ 173500 | consumed samples: 40552960 | consumed tokens: 83052462080 | elapsed time per iteration (s): 0.15 | learning rate: 2.341E-05 | global batch size: 256 | lm loss: 3.666546E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.167 | TFLOPs: 26.10 | +7: iteration 158420/ 173500 | consumed samples: 40555520 | consumed tokens: 83057704960 | elapsed time per iteration (s): 0.15 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 3.670995E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.951 | TFLOPs: 26.11 | +7: iteration 158430/ 173500 | consumed samples: 40558080 | consumed tokens: 83062947840 | elapsed time per iteration (s): 0.15 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 3.662428E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.589 | TFLOPs: 26.07 | +7: iteration 158440/ 173500 | consumed samples: 40560640 | consumed tokens: 83068190720 | elapsed time per iteration (s): 0.15 | learning rate: 2.339E-05 | global batch size: 256 | lm loss: 3.662415E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.432 | TFLOPs: 26.04 | +7: iteration 158450/ 173500 | consumed samples: 40563200 | consumed tokens: 83073433600 | elapsed time per iteration (s): 0.15 | learning rate: 2.339E-05 | global batch size: 256 | lm loss: 3.660865E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.537 | TFLOPs: 26.03 | +7: iteration 158460/ 173500 | consumed samples: 40565760 | consumed tokens: 83078676480 | elapsed time per iteration (s): 0.15 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 3.648809E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.846 | TFLOPs: 26.05 | +7: iteration 158470/ 173500 | consumed samples: 40568320 | consumed tokens: 83083919360 | elapsed time per iteration (s): 0.15 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 3.663840E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.736 | TFLOPs: 26.06 | +7: iteration 158480/ 173500 | consumed samples: 40570880 | consumed tokens: 83089162240 | elapsed time per iteration (s): 0.16 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 3.669382E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.960 | TFLOPs: 25.67 | +7: iteration 158490/ 173500 | consumed samples: 40573440 | consumed tokens: 83094405120 | elapsed time per iteration (s): 0.15 | learning rate: 2.337E-05 | global batch size: 256 | lm loss: 3.657953E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.255 | TFLOPs: 26.02 | +7: iteration 158500/ 173500 | consumed samples: 40576000 | consumed tokens: 83099648000 | elapsed time per iteration (s): 0.15 | learning rate: 2.337E-05 | global batch size: 256 | lm loss: 3.658374E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.985 | TFLOPs: 26.00 | +7: iteration 158510/ 173500 | consumed samples: 40578560 | consumed tokens: 83104890880 | elapsed time per iteration (s): 0.15 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.664548E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.988 | TFLOPs: 26.13 | +7: iteration 158520/ 173500 | consumed samples: 40581120 | consumed tokens: 83110133760 | elapsed time per iteration (s): 0.16 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.656433E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.848 | TFLOPs: 25.75 | +7: iteration 158530/ 173500 | consumed samples: 40583680 | consumed tokens: 83115376640 | elapsed time per iteration (s): 0.16 | learning rate: 2.335E-05 | global batch size: 256 | lm loss: 3.655822E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.905 | TFLOPs: 25.42 | +7: iteration 158540/ 173500 | consumed samples: 40586240 | consumed tokens: 83120619520 | elapsed time per iteration (s): 0.16 | learning rate: 2.335E-05 | global batch size: 256 | lm loss: 3.665283E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.973 | TFLOPs: 25.75 | +7: iteration 158550/ 173500 | consumed samples: 40588800 | consumed tokens: 83125862400 | elapsed time per iteration (s): 0.15 | learning rate: 2.334E-05 | global batch size: 256 | lm loss: 3.670219E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.676 | TFLOPs: 26.04 | +7: iteration 158560/ 173500 | consumed samples: 40591360 | consumed tokens: 83131105280 | elapsed time per iteration (s): 0.16 | learning rate: 2.334E-05 | global batch size: 256 | lm loss: 3.671530E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.856 | TFLOPs: 25.72 | +7: iteration 158570/ 173500 | consumed samples: 40593920 | consumed tokens: 83136348160 | elapsed time per iteration (s): 0.15 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 3.659124E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.923 | TFLOPs: 26.05 | +7: iteration 158580/ 173500 | consumed samples: 40596480 | consumed tokens: 83141591040 | elapsed time per iteration (s): 0.15 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 3.663800E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.446 | TFLOPs: 26.06 | +7: iteration 158590/ 173500 | consumed samples: 40599040 | consumed tokens: 83146833920 | elapsed time per iteration (s): 0.15 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 3.667775E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.973 | TFLOPs: 26.05 | +7: iteration 158600/ 173500 | consumed samples: 40601600 | consumed tokens: 83152076800 | elapsed time per iteration (s): 0.16 | learning rate: 2.332E-05 | global batch size: 256 | lm loss: 3.672324E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.736 | TFLOPs: 25.70 | +7: iteration 158610/ 173500 | consumed samples: 40604160 | consumed tokens: 83157319680 | elapsed time per iteration (s): 0.16 | learning rate: 2.332E-05 | global batch size: 256 | lm loss: 3.665275E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.183 | TFLOPs: 25.17 | +7: iteration 158620/ 173500 | consumed samples: 40606720 | consumed tokens: 83162562560 | elapsed time per iteration (s): 0.16 | learning rate: 2.331E-05 | global batch size: 256 | lm loss: 3.671330E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.003 | TFLOPs: 25.72 | +7: iteration 158630/ 173500 | consumed samples: 40609280 | consumed tokens: 83167805440 | elapsed time per iteration (s): 0.15 | learning rate: 2.331E-05 | global batch size: 256 | lm loss: 3.663729E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.301 | TFLOPs: 26.08 | +7: iteration 158640/ 173500 | consumed samples: 40611840 | consumed tokens: 83173048320 | elapsed time per iteration (s): 0.16 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 3.654045E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.312 | TFLOPs: 25.41 | +7: iteration 158650/ 173500 | consumed samples: 40614400 | consumed tokens: 83178291200 | elapsed time per iteration (s): 0.16 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 3.657841E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.463 | TFLOPs: 25.59 | +7: iteration 158660/ 173500 | consumed samples: 40616960 | consumed tokens: 83183534080 | elapsed time per iteration (s): 0.16 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 3.648605E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.749 | TFLOPs: 25.54 | +7: iteration 158670/ 173500 | consumed samples: 40619520 | consumed tokens: 83188776960 | elapsed time per iteration (s): 0.15 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 3.659006E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.994 | TFLOPs: 26.05 | +7: iteration 158680/ 173500 | consumed samples: 40622080 | consumed tokens: 83194019840 | elapsed time per iteration (s): 0.15 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 3.679794E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.762 | TFLOPs: 25.98 | +7: iteration 158690/ 173500 | consumed samples: 40624640 | consumed tokens: 83199262720 | elapsed time per iteration (s): 0.16 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 3.666328E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1593.543 | TFLOPs: 24.99 | +7: iteration 158700/ 173500 | consumed samples: 40627200 | consumed tokens: 83204505600 | elapsed time per iteration (s): 0.15 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 3.668393E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.583 | TFLOPs: 26.18 | +7: iteration 158710/ 173500 | consumed samples: 40629760 | consumed tokens: 83209748480 | elapsed time per iteration (s): 0.15 | learning rate: 2.327E-05 | global batch size: 256 | lm loss: 3.671106E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.714 | TFLOPs: 26.25 | +7: iteration 158720/ 173500 | consumed samples: 40632320 | consumed tokens: 83214991360 | elapsed time per iteration (s): 0.15 | learning rate: 2.327E-05 | global batch size: 256 | lm loss: 3.658084E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.646 | TFLOPs: 26.03 | +7: iteration 158730/ 173500 | consumed samples: 40634880 | consumed tokens: 83220234240 | elapsed time per iteration (s): 0.15 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.660473E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.087 | TFLOPs: 26.05 | +7: iteration 158740/ 173500 | consumed samples: 40637440 | consumed tokens: 83225477120 | elapsed time per iteration (s): 0.15 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.663313E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.460 | TFLOPs: 26.01 | +7: iteration 158750/ 173500 | consumed samples: 40640000 | consumed tokens: 83230720000 | elapsed time per iteration (s): 0.20 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.666624E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1307.032 | TFLOPs: 20.50 | +7: iteration 158760/ 173500 | consumed samples: 40642560 | consumed tokens: 83235962880 | elapsed time per iteration (s): 0.16 | learning rate: 2.325E-05 | global batch size: 256 | lm loss: 3.661854E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.133 | TFLOPs: 25.41 | +7: iteration 158770/ 173500 | consumed samples: 40645120 | consumed tokens: 83241205760 | elapsed time per iteration (s): 0.15 | learning rate: 2.325E-05 | global batch size: 256 | lm loss: 3.664137E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.560 | TFLOPs: 26.29 | +7: iteration 158780/ 173500 | consumed samples: 40647680 | consumed tokens: 83246448640 | elapsed time per iteration (s): 0.16 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 3.664019E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.864 | TFLOPs: 25.33 | +7: iteration 158790/ 173500 | consumed samples: 40650240 | consumed tokens: 83251691520 | elapsed time per iteration (s): 0.16 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 3.647288E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.292 | TFLOPs: 25.33 | +7: iteration 158800/ 173500 | consumed samples: 40652800 | consumed tokens: 83256934400 | elapsed time per iteration (s): 0.16 | learning rate: 2.323E-05 | global batch size: 256 | lm loss: 3.658651E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.911 | TFLOPs: 25.03 | +7: iteration 158810/ 173500 | consumed samples: 40655360 | consumed tokens: 83262177280 | elapsed time per iteration (s): 0.16 | learning rate: 2.323E-05 | global batch size: 256 | lm loss: 3.672211E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.670 | TFLOPs: 24.74 | +7: iteration 158820/ 173500 | consumed samples: 40657920 | consumed tokens: 83267420160 | elapsed time per iteration (s): 0.15 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 3.660219E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.711 | TFLOPs: 26.01 | +7: iteration 158830/ 173500 | consumed samples: 40660480 | consumed tokens: 83272663040 | elapsed time per iteration (s): 0.16 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 3.649846E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.346 | TFLOPs: 25.29 | +7: iteration 158840/ 173500 | consumed samples: 40663040 | consumed tokens: 83277905920 | elapsed time per iteration (s): 0.15 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 3.650172E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.966 | TFLOPs: 26.30 | +7: iteration 158850/ 173500 | consumed samples: 40665600 | consumed tokens: 83283148800 | elapsed time per iteration (s): 0.16 | learning rate: 2.321E-05 | global batch size: 256 | lm loss: 3.647087E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1553.864 | TFLOPs: 24.37 | +7: iteration 158860/ 173500 | consumed samples: 40668160 | consumed tokens: 83288391680 | elapsed time per iteration (s): 0.16 | learning rate: 2.321E-05 | global batch size: 256 | lm loss: 3.671657E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.757 | TFLOPs: 25.15 | +7: iteration 158870/ 173500 | consumed samples: 40670720 | consumed tokens: 83293634560 | elapsed time per iteration (s): 0.16 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 3.670714E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.412 | TFLOPs: 25.87 | +7: iteration 158880/ 173500 | consumed samples: 40673280 | consumed tokens: 83298877440 | elapsed time per iteration (s): 0.16 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 3.657639E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.340 | TFLOPs: 25.60 | +7: iteration 158890/ 173500 | consumed samples: 40675840 | consumed tokens: 83304120320 | elapsed time per iteration (s): 0.16 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 3.661436E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.352 | TFLOPs: 25.29 | +7: iteration 158900/ 173500 | consumed samples: 40678400 | consumed tokens: 83309363200 | elapsed time per iteration (s): 0.16 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 3.659380E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.933 | TFLOPs: 25.84 | +7: iteration 158910/ 173500 | consumed samples: 40680960 | consumed tokens: 83314606080 | elapsed time per iteration (s): 0.16 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 3.660450E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.814 | TFLOPs: 25.51 | +7: iteration 158920/ 173500 | consumed samples: 40683520 | consumed tokens: 83319848960 | elapsed time per iteration (s): 0.16 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 3.661602E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.176 | TFLOPs: 25.89 | +7: iteration 158930/ 173500 | consumed samples: 40686080 | consumed tokens: 83325091840 | elapsed time per iteration (s): 0.15 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 3.671749E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.623 | TFLOPs: 25.93 | +7: iteration 158940/ 173500 | consumed samples: 40688640 | consumed tokens: 83330334720 | elapsed time per iteration (s): 0.16 | learning rate: 2.317E-05 | global batch size: 256 | lm loss: 3.664576E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.565 | TFLOPs: 24.98 | +7: iteration 158950/ 173500 | consumed samples: 40691200 | consumed tokens: 83335577600 | elapsed time per iteration (s): 0.16 | learning rate: 2.317E-05 | global batch size: 256 | lm loss: 3.677294E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.579 | TFLOPs: 25.79 | +7: iteration 158960/ 173500 | consumed samples: 40693760 | consumed tokens: 83340820480 | elapsed time per iteration (s): 0.16 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.669803E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.778 | TFLOPs: 25.81 | +7: iteration 158970/ 173500 | consumed samples: 40696320 | consumed tokens: 83346063360 | elapsed time per iteration (s): 0.16 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.658153E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.209 | TFLOPs: 25.82 | +7: iteration 158980/ 173500 | consumed samples: 40698880 | consumed tokens: 83351306240 | elapsed time per iteration (s): 0.15 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.649157E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.857 | TFLOPs: 26.02 | +7: iteration 158990/ 173500 | consumed samples: 40701440 | consumed tokens: 83356549120 | elapsed time per iteration (s): 0.15 | learning rate: 2.315E-05 | global batch size: 256 | lm loss: 3.660414E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.537 | TFLOPs: 26.18 | +7: iteration 159000/ 173500 | consumed samples: 40704000 | consumed tokens: 83361792000 | elapsed time per iteration (s): 0.15 | learning rate: 2.315E-05 | global batch size: 256 | lm loss: 3.655111E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.507 | TFLOPs: 25.98 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 159000 | lm loss value: 3.863374E+00 | lm loss PPL: 4.762575E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 159000 to checkpoints_44m91b100m +0: [2023-03-17 07:10:26,438] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step159000 is begin to save! +0: [2023-03-17 07:10:26,442] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:10:26,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:10:26,510] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:10:26,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:10:26,520] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:10:26,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:10:26,528] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:10:26,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:10:26,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:10:26,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:10:26,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:10:26,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:10:26,553] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:10:26,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:10:26,561] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:10:26,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:10:26,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:10:26,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:10:26,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:10:26,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:10:26,579] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step159000/mp_rank_00_model_states.pt +0: [2023-03-17 07:10:26,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:10:26,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:10:26,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +1: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +3: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +2: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:10:26,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +4: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +6: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:10:26,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +5: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 07:10:26,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +7: [2023-03-17 07:10:26,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:10:26,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step159000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:10:26,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step159000 is ready now! +0: successfully saved checkpoint at iteration 159000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.14 +7: iteration 159010/ 173500 | consumed samples: 40706560 | consumed tokens: 83367034880 | elapsed time per iteration (s): 0.18 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 3.651473E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1448.410 | TFLOPs: 22.71 | +7: iteration 159020/ 173500 | consumed samples: 40709120 | consumed tokens: 83372277760 | elapsed time per iteration (s): 0.16 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 3.659790E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.954 | TFLOPs: 25.42 | +7: iteration 159030/ 173500 | consumed samples: 40711680 | consumed tokens: 83377520640 | elapsed time per iteration (s): 0.16 | learning rate: 2.313E-05 | global batch size: 256 | lm loss: 3.663765E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.464 | TFLOPs: 25.55 | +7: iteration 159040/ 173500 | consumed samples: 40714240 | consumed tokens: 83382763520 | elapsed time per iteration (s): 0.16 | learning rate: 2.313E-05 | global batch size: 256 | lm loss: 3.657901E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.103 | TFLOPs: 25.09 | +7: iteration 159050/ 173500 | consumed samples: 40716800 | consumed tokens: 83388006400 | elapsed time per iteration (s): 0.16 | learning rate: 2.313E-05 | global batch size: 256 | lm loss: 3.666288E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.431 | TFLOPs: 25.79 | +7: iteration 159060/ 173500 | consumed samples: 40719360 | consumed tokens: 83393249280 | elapsed time per iteration (s): 0.16 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 3.665419E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.678 | TFLOPs: 25.73 | +7: iteration 159070/ 173500 | consumed samples: 40721920 | consumed tokens: 83398492160 | elapsed time per iteration (s): 0.15 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 3.671554E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.321 | TFLOPs: 26.04 | +7: iteration 159080/ 173500 | consumed samples: 40724480 | consumed tokens: 83403735040 | elapsed time per iteration (s): 0.16 | learning rate: 2.311E-05 | global batch size: 256 | lm loss: 3.666404E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.622 | TFLOPs: 25.85 | +7: iteration 159090/ 173500 | consumed samples: 40727040 | consumed tokens: 83408977920 | elapsed time per iteration (s): 0.16 | learning rate: 2.311E-05 | global batch size: 256 | lm loss: 3.667949E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.031 | TFLOPs: 25.36 | +7: iteration 159100/ 173500 | consumed samples: 40729600 | consumed tokens: 83414220800 | elapsed time per iteration (s): 0.16 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 3.665249E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.694 | TFLOPs: 25.46 | +7: iteration 159110/ 173500 | consumed samples: 40732160 | consumed tokens: 83419463680 | elapsed time per iteration (s): 0.16 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 3.672492E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.369 | TFLOPs: 25.33 | +7: iteration 159120/ 173500 | consumed samples: 40734720 | consumed tokens: 83424706560 | elapsed time per iteration (s): 0.15 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 3.671952E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.546 | TFLOPs: 26.03 | +7: iteration 159130/ 173500 | consumed samples: 40737280 | consumed tokens: 83429949440 | elapsed time per iteration (s): 0.16 | learning rate: 2.309E-05 | global batch size: 256 | lm loss: 3.651320E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.857 | TFLOPs: 25.03 | +7: iteration 159140/ 173500 | consumed samples: 40739840 | consumed tokens: 83435192320 | elapsed time per iteration (s): 0.15 | learning rate: 2.309E-05 | global batch size: 256 | lm loss: 3.651866E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.730 | TFLOPs: 26.09 | +7: iteration 159150/ 173500 | consumed samples: 40742400 | consumed tokens: 83440435200 | elapsed time per iteration (s): 0.15 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 3.663863E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.552 | TFLOPs: 26.09 | +7: iteration 159160/ 173500 | consumed samples: 40744960 | consumed tokens: 83445678080 | elapsed time per iteration (s): 0.16 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 3.653611E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.568 | TFLOPs: 25.65 | +7: iteration 159170/ 173500 | consumed samples: 40747520 | consumed tokens: 83450920960 | elapsed time per iteration (s): 0.15 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 3.668754E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.846 | TFLOPs: 26.22 | +7: iteration 159180/ 173500 | consumed samples: 40750080 | consumed tokens: 83456163840 | elapsed time per iteration (s): 0.16 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 3.658852E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.878 | TFLOPs: 25.78 | +7: iteration 159190/ 173500 | consumed samples: 40752640 | consumed tokens: 83461406720 | elapsed time per iteration (s): 0.15 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 3.664573E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.250 | TFLOPs: 26.16 | +7: iteration 159200/ 173500 | consumed samples: 40755200 | consumed tokens: 83466649600 | elapsed time per iteration (s): 0.16 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 3.665770E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.175 | TFLOPs: 24.92 | +7: iteration 159210/ 173500 | consumed samples: 40757760 | consumed tokens: 83471892480 | elapsed time per iteration (s): 0.15 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 3.655915E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.819 | TFLOPs: 26.19 | +7: iteration 159220/ 173500 | consumed samples: 40760320 | consumed tokens: 83477135360 | elapsed time per iteration (s): 0.16 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 3.668393E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.886 | TFLOPs: 25.59 | +7: iteration 159230/ 173500 | consumed samples: 40762880 | consumed tokens: 83482378240 | elapsed time per iteration (s): 0.16 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 3.670330E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.702 | TFLOPs: 25.62 | +7: iteration 159240/ 173500 | consumed samples: 40765440 | consumed tokens: 83487621120 | elapsed time per iteration (s): 0.16 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 3.661818E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.615 | TFLOPs: 25.51 | +7: iteration 159250/ 173500 | consumed samples: 40768000 | consumed tokens: 83492864000 | elapsed time per iteration (s): 0.15 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 3.679713E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.307 | TFLOPs: 26.12 | +7: iteration 159260/ 173500 | consumed samples: 40770560 | consumed tokens: 83498106880 | elapsed time per iteration (s): 0.16 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 3.667300E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.238 | TFLOPs: 25.77 | +7: iteration 159270/ 173500 | consumed samples: 40773120 | consumed tokens: 83503349760 | elapsed time per iteration (s): 0.15 | learning rate: 2.303E-05 | global batch size: 256 | lm loss: 3.656032E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.892 | TFLOPs: 26.17 | +7: iteration 159280/ 173500 | consumed samples: 40775680 | consumed tokens: 83508592640 | elapsed time per iteration (s): 0.15 | learning rate: 2.303E-05 | global batch size: 256 | lm loss: 3.660526E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.082 | TFLOPs: 26.02 | +7: iteration 159290/ 173500 | consumed samples: 40778240 | consumed tokens: 83513835520 | elapsed time per iteration (s): 0.16 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 3.665610E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.068 | TFLOPs: 25.69 | +7: iteration 159300/ 173500 | consumed samples: 40780800 | consumed tokens: 83519078400 | elapsed time per iteration (s): 0.16 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 3.663340E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.190 | TFLOPs: 25.71 | +7: iteration 159310/ 173500 | consumed samples: 40783360 | consumed tokens: 83524321280 | elapsed time per iteration (s): 0.15 | learning rate: 2.301E-05 | global batch size: 256 | lm loss: 3.678745E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.839 | TFLOPs: 26.16 | +7: iteration 159320/ 173500 | consumed samples: 40785920 | consumed tokens: 83529564160 | elapsed time per iteration (s): 0.16 | learning rate: 2.301E-05 | global batch size: 256 | lm loss: 3.670409E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.878 | TFLOPs: 25.73 | +7: iteration 159330/ 173500 | consumed samples: 40788480 | consumed tokens: 83534807040 | elapsed time per iteration (s): 0.15 | learning rate: 2.301E-05 | global batch size: 256 | lm loss: 3.655820E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.584 | TFLOPs: 26.15 | +7: iteration 159340/ 173500 | consumed samples: 40791040 | consumed tokens: 83540049920 | elapsed time per iteration (s): 0.15 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 3.653676E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.974 | TFLOPs: 26.13 | +7: iteration 159350/ 173500 | consumed samples: 40793600 | consumed tokens: 83545292800 | elapsed time per iteration (s): 0.15 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 3.660184E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.341 | TFLOPs: 26.21 | +7: iteration 159360/ 173500 | consumed samples: 40796160 | consumed tokens: 83550535680 | elapsed time per iteration (s): 0.15 | learning rate: 2.299E-05 | global batch size: 256 | lm loss: 3.661539E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.485 | TFLOPs: 26.01 | +7: iteration 159370/ 173500 | consumed samples: 40798720 | consumed tokens: 83555778560 | elapsed time per iteration (s): 0.16 | learning rate: 2.299E-05 | global batch size: 256 | lm loss: 3.659952E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.326 | TFLOPs: 25.66 | +7: iteration 159380/ 173500 | consumed samples: 40801280 | consumed tokens: 83561021440 | elapsed time per iteration (s): 0.15 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 3.653141E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.533 | TFLOPs: 26.21 | +7: iteration 159390/ 173500 | consumed samples: 40803840 | consumed tokens: 83566264320 | elapsed time per iteration (s): 0.16 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 3.657269E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.024 | TFLOPs: 24.68 | +7: iteration 159400/ 173500 | consumed samples: 40806400 | consumed tokens: 83571507200 | elapsed time per iteration (s): 0.15 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 3.656910E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.960 | TFLOPs: 26.14 | +7: iteration 159410/ 173500 | consumed samples: 40808960 | consumed tokens: 83576750080 | elapsed time per iteration (s): 0.15 | learning rate: 2.297E-05 | global batch size: 256 | lm loss: 3.666298E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.687 | TFLOPs: 26.18 | +7: iteration 159420/ 173500 | consumed samples: 40811520 | consumed tokens: 83581992960 | elapsed time per iteration (s): 0.15 | learning rate: 2.297E-05 | global batch size: 256 | lm loss: 3.670185E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.390 | TFLOPs: 26.16 | +7: iteration 159430/ 173500 | consumed samples: 40814080 | consumed tokens: 83587235840 | elapsed time per iteration (s): 0.15 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 3.678592E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.665 | TFLOPs: 26.20 | +7: iteration 159440/ 173500 | consumed samples: 40816640 | consumed tokens: 83592478720 | elapsed time per iteration (s): 0.15 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 3.658269E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.695 | TFLOPs: 26.20 | +7: iteration 159450/ 173500 | consumed samples: 40819200 | consumed tokens: 83597721600 | elapsed time per iteration (s): 0.15 | learning rate: 2.296E-05 | global batch size: 256 | lm loss: 3.652625E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.578 | TFLOPs: 26.21 | +7: iteration 159460/ 173500 | consumed samples: 40821760 | consumed tokens: 83602964480 | elapsed time per iteration (s): 0.15 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 3.670253E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.985 | TFLOPs: 26.24 | +7: iteration 159470/ 173500 | consumed samples: 40824320 | consumed tokens: 83608207360 | elapsed time per iteration (s): 0.15 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 3.651616E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.338 | TFLOPs: 26.23 | +7: iteration 159480/ 173500 | consumed samples: 40826880 | consumed tokens: 83613450240 | elapsed time per iteration (s): 0.15 | learning rate: 2.294E-05 | global batch size: 256 | lm loss: 3.660651E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.068 | TFLOPs: 26.24 | +7: iteration 159490/ 173500 | consumed samples: 40829440 | consumed tokens: 83618693120 | elapsed time per iteration (s): 0.16 | learning rate: 2.294E-05 | global batch size: 256 | lm loss: 3.664496E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.959 | TFLOPs: 25.55 | +7: iteration 159500/ 173500 | consumed samples: 40832000 | consumed tokens: 83623936000 | elapsed time per iteration (s): 0.15 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 3.670542E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.413 | TFLOPs: 26.02 | +7: iteration 159510/ 173500 | consumed samples: 40834560 | consumed tokens: 83629178880 | elapsed time per iteration (s): 0.15 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 3.679142E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.794 | TFLOPs: 26.25 | +7: iteration 159520/ 173500 | consumed samples: 40837120 | consumed tokens: 83634421760 | elapsed time per iteration (s): 0.15 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 3.660863E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.398 | TFLOPs: 26.24 | +7: iteration 159530/ 173500 | consumed samples: 40839680 | consumed tokens: 83639664640 | elapsed time per iteration (s): 0.15 | learning rate: 2.292E-05 | global batch size: 256 | lm loss: 3.651877E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.278 | TFLOPs: 26.26 | +7: iteration 159540/ 173500 | consumed samples: 40842240 | consumed tokens: 83644907520 | elapsed time per iteration (s): 0.16 | learning rate: 2.292E-05 | global batch size: 256 | lm loss: 3.659027E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.033 | TFLOPs: 25.70 | +7: iteration 159550/ 173500 | consumed samples: 40844800 | consumed tokens: 83650150400 | elapsed time per iteration (s): 0.15 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.663173E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.403 | TFLOPs: 26.27 | +7: iteration 159560/ 173500 | consumed samples: 40847360 | consumed tokens: 83655393280 | elapsed time per iteration (s): 0.15 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.652202E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.744 | TFLOPs: 26.26 | +7: iteration 159570/ 173500 | consumed samples: 40849920 | consumed tokens: 83660636160 | elapsed time per iteration (s): 0.15 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.670155E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.221 | TFLOPs: 26.26 | +7: iteration 159580/ 173500 | consumed samples: 40852480 | consumed tokens: 83665879040 | elapsed time per iteration (s): 0.15 | learning rate: 2.290E-05 | global batch size: 256 | lm loss: 3.650837E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.965 | TFLOPs: 26.25 | +7: iteration 159590/ 173500 | consumed samples: 40855040 | consumed tokens: 83671121920 | elapsed time per iteration (s): 0.15 | learning rate: 2.290E-05 | global batch size: 256 | lm loss: 3.670127E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.834 | TFLOPs: 26.27 | +7: iteration 159600/ 173500 | consumed samples: 40857600 | consumed tokens: 83676364800 | elapsed time per iteration (s): 0.15 | learning rate: 2.289E-05 | global batch size: 256 | lm loss: 3.665446E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.872 | TFLOPs: 26.25 | +7: iteration 159610/ 173500 | consumed samples: 40860160 | consumed tokens: 83681607680 | elapsed time per iteration (s): 0.15 | learning rate: 2.289E-05 | global batch size: 256 | lm loss: 3.677637E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.367 | TFLOPs: 26.24 | +7: iteration 159620/ 173500 | consumed samples: 40862720 | consumed tokens: 83686850560 | elapsed time per iteration (s): 0.15 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 3.659258E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.775 | TFLOPs: 26.25 | +7: iteration 159630/ 173500 | consumed samples: 40865280 | consumed tokens: 83692093440 | elapsed time per iteration (s): 0.15 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 3.663672E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.330 | TFLOPs: 26.26 | +7: iteration 159640/ 173500 | consumed samples: 40867840 | consumed tokens: 83697336320 | elapsed time per iteration (s): 0.15 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 3.667758E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.633 | TFLOPs: 26.23 | +7: iteration 159650/ 173500 | consumed samples: 40870400 | consumed tokens: 83702579200 | elapsed time per iteration (s): 0.15 | learning rate: 2.287E-05 | global batch size: 256 | lm loss: 3.661234E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.515 | TFLOPs: 26.24 | +7: iteration 159660/ 173500 | consumed samples: 40872960 | consumed tokens: 83707822080 | elapsed time per iteration (s): 0.15 | learning rate: 2.287E-05 | global batch size: 256 | lm loss: 3.672720E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.016 | TFLOPs: 26.24 | +7: iteration 159670/ 173500 | consumed samples: 40875520 | consumed tokens: 83713064960 | elapsed time per iteration (s): 0.16 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 3.645847E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.865 | TFLOPs: 25.80 | +7: iteration 159680/ 173500 | consumed samples: 40878080 | consumed tokens: 83718307840 | elapsed time per iteration (s): 0.16 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 3.671764E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.370 | TFLOPs: 25.74 | +7: iteration 159690/ 173500 | consumed samples: 40880640 | consumed tokens: 83723550720 | elapsed time per iteration (s): 0.16 | learning rate: 2.286E-05 | global batch size: 256 | lm loss: 3.660859E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.535 | TFLOPs: 25.84 | +7: iteration 159700/ 173500 | consumed samples: 40883200 | consumed tokens: 83728793600 | elapsed time per iteration (s): 0.15 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 3.658170E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.445 | TFLOPs: 26.15 | +7: iteration 159710/ 173500 | consumed samples: 40885760 | consumed tokens: 83734036480 | elapsed time per iteration (s): 0.15 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 3.675740E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.670 | TFLOPs: 25.93 | +7: iteration 159720/ 173500 | consumed samples: 40888320 | consumed tokens: 83739279360 | elapsed time per iteration (s): 0.15 | learning rate: 2.284E-05 | global batch size: 256 | lm loss: 3.662796E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.023 | TFLOPs: 26.19 | +7: iteration 159730/ 173500 | consumed samples: 40890880 | consumed tokens: 83744522240 | elapsed time per iteration (s): 0.16 | learning rate: 2.284E-05 | global batch size: 256 | lm loss: 3.650157E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.985 | TFLOPs: 25.73 | +7: iteration 159740/ 173500 | consumed samples: 40893440 | consumed tokens: 83749765120 | elapsed time per iteration (s): 0.15 | learning rate: 2.284E-05 | global batch size: 256 | lm loss: 3.654674E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.791 | TFLOPs: 26.22 | +7: iteration 159750/ 173500 | consumed samples: 40896000 | consumed tokens: 83755008000 | elapsed time per iteration (s): 0.16 | learning rate: 2.283E-05 | global batch size: 256 | lm loss: 3.650082E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.835 | TFLOPs: 25.62 | +7: iteration 159760/ 173500 | consumed samples: 40898560 | consumed tokens: 83760250880 | elapsed time per iteration (s): 0.15 | learning rate: 2.283E-05 | global batch size: 256 | lm loss: 3.655402E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.650 | TFLOPs: 26.15 | +7: iteration 159770/ 173500 | consumed samples: 40901120 | consumed tokens: 83765493760 | elapsed time per iteration (s): 0.15 | learning rate: 2.282E-05 | global batch size: 256 | lm loss: 3.671429E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.631 | TFLOPs: 26.22 | +7: iteration 159780/ 173500 | consumed samples: 40903680 | consumed tokens: 83770736640 | elapsed time per iteration (s): 0.16 | learning rate: 2.282E-05 | global batch size: 256 | lm loss: 3.660382E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.143 | TFLOPs: 25.74 | +7: iteration 159790/ 173500 | consumed samples: 40906240 | consumed tokens: 83775979520 | elapsed time per iteration (s): 0.16 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.667555E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.023 | TFLOPs: 25.75 | +7: iteration 159800/ 173500 | consumed samples: 40908800 | consumed tokens: 83781222400 | elapsed time per iteration (s): 0.15 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.659538E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.907 | TFLOPs: 26.22 | +7: iteration 159810/ 173500 | consumed samples: 40911360 | consumed tokens: 83786465280 | elapsed time per iteration (s): 0.16 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.668877E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.969 | TFLOPs: 25.81 | +7: iteration 159820/ 173500 | consumed samples: 40913920 | consumed tokens: 83791708160 | elapsed time per iteration (s): 0.15 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 3.653736E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.706 | TFLOPs: 26.22 | +7: iteration 159830/ 173500 | consumed samples: 40916480 | consumed tokens: 83796951040 | elapsed time per iteration (s): 0.16 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 3.659691E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.958 | TFLOPs: 25.72 | +7: iteration 159840/ 173500 | consumed samples: 40919040 | consumed tokens: 83802193920 | elapsed time per iteration (s): 0.15 | learning rate: 2.279E-05 | global batch size: 256 | lm loss: 3.651899E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.598 | TFLOPs: 26.15 | +7: iteration 159850/ 173500 | consumed samples: 40921600 | consumed tokens: 83807436800 | elapsed time per iteration (s): 0.16 | learning rate: 2.279E-05 | global batch size: 256 | lm loss: 3.669753E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.209 | TFLOPs: 25.13 | +7: iteration 159860/ 173500 | consumed samples: 40924160 | consumed tokens: 83812679680 | elapsed time per iteration (s): 0.15 | learning rate: 2.279E-05 | global batch size: 256 | lm loss: 3.674201E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.468 | TFLOPs: 26.12 | +7: iteration 159870/ 173500 | consumed samples: 40926720 | consumed tokens: 83817922560 | elapsed time per iteration (s): 0.15 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.661876E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.930 | TFLOPs: 26.24 | +7: iteration 159880/ 173500 | consumed samples: 40929280 | consumed tokens: 83823165440 | elapsed time per iteration (s): 0.15 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.655806E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.750 | TFLOPs: 26.22 | +7: iteration 159890/ 173500 | consumed samples: 40931840 | consumed tokens: 83828408320 | elapsed time per iteration (s): 0.16 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 3.667506E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.191 | TFLOPs: 25.28 | +7: iteration 159900/ 173500 | consumed samples: 40934400 | consumed tokens: 83833651200 | elapsed time per iteration (s): 0.16 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 3.656446E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.275 | TFLOPs: 25.82 | +7: iteration 159910/ 173500 | consumed samples: 40936960 | consumed tokens: 83838894080 | elapsed time per iteration (s): 0.16 | learning rate: 2.277E-05 | global batch size: 256 | lm loss: 3.652569E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.366 | TFLOPs: 25.68 | +7: iteration 159920/ 173500 | consumed samples: 40939520 | consumed tokens: 83844136960 | elapsed time per iteration (s): 0.16 | learning rate: 2.276E-05 | global batch size: 256 | lm loss: 3.672820E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.932 | TFLOPs: 25.23 | +7: iteration 159930/ 173500 | consumed samples: 40942080 | consumed tokens: 83849379840 | elapsed time per iteration (s): 0.16 | learning rate: 2.276E-05 | global batch size: 256 | lm loss: 3.661813E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.050 | TFLOPs: 25.66 | +7: iteration 159940/ 173500 | consumed samples: 40944640 | consumed tokens: 83854622720 | elapsed time per iteration (s): 0.16 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 3.667650E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.593 | TFLOPs: 25.52 | +7: iteration 159950/ 173500 | consumed samples: 40947200 | consumed tokens: 83859865600 | elapsed time per iteration (s): 0.15 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 3.668921E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.543 | TFLOPs: 26.10 | +7: iteration 159960/ 173500 | consumed samples: 40949760 | consumed tokens: 83865108480 | elapsed time per iteration (s): 0.16 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 3.658496E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.664 | TFLOPs: 25.64 | +7: iteration 159970/ 173500 | consumed samples: 40952320 | consumed tokens: 83870351360 | elapsed time per iteration (s): 0.16 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 3.664415E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.480 | TFLOPs: 25.30 | +7: iteration 159980/ 173500 | consumed samples: 40954880 | consumed tokens: 83875594240 | elapsed time per iteration (s): 0.16 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 3.671487E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.428 | TFLOPs: 25.29 | +7: iteration 159990/ 173500 | consumed samples: 40957440 | consumed tokens: 83880837120 | elapsed time per iteration (s): 0.16 | learning rate: 2.273E-05 | global batch size: 256 | lm loss: 3.662270E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.125 | TFLOPs: 25.77 | +0: [2023-03-17 07:13:01,650] [INFO] [logging.py:68:log_dist] [Rank 0] step=160000, skipped=0, lr=[2.2729831288017337e-05, 2.2729831288017337e-05, 2.2729831288017337e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 160000/ 173500 | consumed samples: 40960000 | consumed tokens: 83886080000 | elapsed time per iteration (s): 0.15 | learning rate: 2.273E-05 | global batch size: 256 | lm loss: 3.664175E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.837 | TFLOPs: 26.01 | +0: steps: 160000 loss: 3.6788 iter time (s): 0.155 samples/sec: 1655.387 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 160000 | lm loss value: 3.839591E+00 | lm loss PPL: 4.650645E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 160000 to checkpoints_44m91b100m +0: [2023-03-17 07:13:01,724] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step160000 is begin to save! +0: [2023-03-17 07:13:01,728] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:13:01,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:13:01,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:13:01,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:13:01,803] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:13:01,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:13:01,811] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:13:01,819] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:13:01,819] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:13:01,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:13:01,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:13:01,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:13:01,836] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:13:01,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:13:01,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:13:01,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:13:01,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:13:01,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:13:01,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:13:01,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:13:01,861] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step160000/mp_rank_00_model_states.pt +0: [2023-03-17 07:13:01,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:13:01,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:13:01,880] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:13:01,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +6: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +3: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +1: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +5: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:13:01,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 07:13:01,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +2: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +7: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:13:01,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +4: [2023-03-17 07:13:01,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:13:01,904] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step160000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:13:01,904] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step160000 is ready now! +0: successfully saved checkpoint at iteration 160000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.20 +7: iteration 160010/ 173500 | consumed samples: 40962560 | consumed tokens: 83891322880 | elapsed time per iteration (s): 0.18 | learning rate: 2.273E-05 | global batch size: 256 | lm loss: 3.673967E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.584 | TFLOPs: 22.17 | +7: iteration 160020/ 173500 | consumed samples: 40965120 | consumed tokens: 83896565760 | elapsed time per iteration (s): 0.16 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 3.668463E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.350 | TFLOPs: 24.83 | +7: iteration 160030/ 173500 | consumed samples: 40967680 | consumed tokens: 83901808640 | elapsed time per iteration (s): 0.15 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 3.654475E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.088 | TFLOPs: 26.24 | +7: iteration 160040/ 173500 | consumed samples: 40970240 | consumed tokens: 83907051520 | elapsed time per iteration (s): 0.16 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 3.684445E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.881 | TFLOPs: 25.26 | +7: iteration 160050/ 173500 | consumed samples: 40972800 | consumed tokens: 83912294400 | elapsed time per iteration (s): 0.16 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 3.660220E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1566.506 | TFLOPs: 24.57 | +7: iteration 160060/ 173500 | consumed samples: 40975360 | consumed tokens: 83917537280 | elapsed time per iteration (s): 0.16 | learning rate: 2.271E-05 | global batch size: 256 | lm loss: 3.659889E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.927 | TFLOPs: 25.29 | +7: iteration 160070/ 173500 | consumed samples: 40977920 | consumed tokens: 83922780160 | elapsed time per iteration (s): 0.16 | learning rate: 2.270E-05 | global batch size: 256 | lm loss: 3.646751E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.168 | TFLOPs: 25.13 | +7: iteration 160080/ 173500 | consumed samples: 40980480 | consumed tokens: 83928023040 | elapsed time per iteration (s): 0.15 | learning rate: 2.270E-05 | global batch size: 256 | lm loss: 3.662293E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.359 | TFLOPs: 26.21 | +7: iteration 160090/ 173500 | consumed samples: 40983040 | consumed tokens: 83933265920 | elapsed time per iteration (s): 0.16 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 3.671840E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.073 | TFLOPs: 25.74 | +7: iteration 160100/ 173500 | consumed samples: 40985600 | consumed tokens: 83938508800 | elapsed time per iteration (s): 0.15 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 3.669313E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.114 | TFLOPs: 26.18 | +7: iteration 160110/ 173500 | consumed samples: 40988160 | consumed tokens: 83943751680 | elapsed time per iteration (s): 0.15 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 3.647667E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.236 | TFLOPs: 26.18 | +7: iteration 160120/ 173500 | consumed samples: 40990720 | consumed tokens: 83948994560 | elapsed time per iteration (s): 0.16 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 3.663918E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.586 | TFLOPs: 25.84 | +7: iteration 160130/ 173500 | consumed samples: 40993280 | consumed tokens: 83954237440 | elapsed time per iteration (s): 0.16 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 3.661113E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.255 | TFLOPs: 24.72 | +7: iteration 160140/ 173500 | consumed samples: 40995840 | consumed tokens: 83959480320 | elapsed time per iteration (s): 0.16 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 3.659663E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.550 | TFLOPs: 25.41 | +7: iteration 160150/ 173500 | consumed samples: 40998400 | consumed tokens: 83964723200 | elapsed time per iteration (s): 0.15 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 3.657852E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.874 | TFLOPs: 26.16 | +7: iteration 160160/ 173500 | consumed samples: 41000960 | consumed tokens: 83969966080 | elapsed time per iteration (s): 0.17 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 3.661008E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.824 | TFLOPs: 23.10 | +7: iteration 160170/ 173500 | consumed samples: 41003520 | consumed tokens: 83975208960 | elapsed time per iteration (s): 0.16 | learning rate: 2.266E-05 | global batch size: 256 | lm loss: 3.663443E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.612 | TFLOPs: 25.24 | +7: iteration 160180/ 173500 | consumed samples: 41006080 | consumed tokens: 83980451840 | elapsed time per iteration (s): 0.15 | learning rate: 2.266E-05 | global batch size: 256 | lm loss: 3.669176E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.344 | TFLOPs: 26.23 | +7: iteration 160190/ 173500 | consumed samples: 41008640 | consumed tokens: 83985694720 | elapsed time per iteration (s): 0.15 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.671511E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.903 | TFLOPs: 26.22 | +7: iteration 160200/ 173500 | consumed samples: 41011200 | consumed tokens: 83990937600 | elapsed time per iteration (s): 0.16 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.663416E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.526 | TFLOPs: 25.76 | +7: iteration 160210/ 173500 | consumed samples: 41013760 | consumed tokens: 83996180480 | elapsed time per iteration (s): 0.15 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.660739E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.958 | TFLOPs: 25.95 | +7: iteration 160220/ 173500 | consumed samples: 41016320 | consumed tokens: 84001423360 | elapsed time per iteration (s): 0.15 | learning rate: 2.264E-05 | global batch size: 256 | lm loss: 3.660277E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.278 | TFLOPs: 25.97 | +7: iteration 160230/ 173500 | consumed samples: 41018880 | consumed tokens: 84006666240 | elapsed time per iteration (s): 0.16 | learning rate: 2.264E-05 | global batch size: 256 | lm loss: 3.655621E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.053 | TFLOPs: 25.67 | +7: iteration 160240/ 173500 | consumed samples: 41021440 | consumed tokens: 84011909120 | elapsed time per iteration (s): 0.16 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 3.669336E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.855 | TFLOPs: 25.76 | +7: iteration 160250/ 173500 | consumed samples: 41024000 | consumed tokens: 84017152000 | elapsed time per iteration (s): 0.16 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 3.661985E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.741 | TFLOPs: 25.46 | +7: iteration 160260/ 173500 | consumed samples: 41026560 | consumed tokens: 84022394880 | elapsed time per iteration (s): 0.15 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 3.660139E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.351 | TFLOPs: 26.15 | +7: iteration 160270/ 173500 | consumed samples: 41029120 | consumed tokens: 84027637760 | elapsed time per iteration (s): 0.15 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 3.661934E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.244 | TFLOPs: 26.15 | +7: iteration 160280/ 173500 | consumed samples: 41031680 | consumed tokens: 84032880640 | elapsed time per iteration (s): 0.15 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 3.668663E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.249 | TFLOPs: 26.05 | +7: iteration 160290/ 173500 | consumed samples: 41034240 | consumed tokens: 84038123520 | elapsed time per iteration (s): 0.15 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 3.674313E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.672 | TFLOPs: 26.09 | +7: iteration 160300/ 173500 | consumed samples: 41036800 | consumed tokens: 84043366400 | elapsed time per iteration (s): 0.15 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 3.655025E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.303 | TFLOPs: 25.96 | +7: iteration 160310/ 173500 | consumed samples: 41039360 | consumed tokens: 84048609280 | elapsed time per iteration (s): 0.15 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 3.661922E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.597 | TFLOPs: 25.98 | +7: iteration 160320/ 173500 | consumed samples: 41041920 | consumed tokens: 84053852160 | elapsed time per iteration (s): 0.15 | learning rate: 2.260E-05 | global batch size: 256 | lm loss: 3.665532E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.597 | TFLOPs: 26.18 | +7: iteration 160330/ 173500 | consumed samples: 41044480 | consumed tokens: 84059095040 | elapsed time per iteration (s): 0.16 | learning rate: 2.260E-05 | global batch size: 256 | lm loss: 3.679560E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.064 | TFLOPs: 25.80 | +7: iteration 160340/ 173500 | consumed samples: 41047040 | consumed tokens: 84064337920 | elapsed time per iteration (s): 0.15 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.660013E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.313 | TFLOPs: 26.15 | +7: iteration 160350/ 173500 | consumed samples: 41049600 | consumed tokens: 84069580800 | elapsed time per iteration (s): 0.15 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.666477E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.009 | TFLOPs: 26.17 | +7: iteration 160360/ 173500 | consumed samples: 41052160 | consumed tokens: 84074823680 | elapsed time per iteration (s): 0.16 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.663133E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.740 | TFLOPs: 25.79 | +7: iteration 160370/ 173500 | consumed samples: 41054720 | consumed tokens: 84080066560 | elapsed time per iteration (s): 0.15 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.667134E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.989 | TFLOPs: 26.21 | +7: iteration 160380/ 173500 | consumed samples: 41057280 | consumed tokens: 84085309440 | elapsed time per iteration (s): 0.15 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.662787E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.374 | TFLOPs: 26.21 | +7: iteration 160390/ 173500 | consumed samples: 41059840 | consumed tokens: 84090552320 | elapsed time per iteration (s): 0.16 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.657638E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.294 | TFLOPs: 25.72 | +7: iteration 160400/ 173500 | consumed samples: 41062400 | consumed tokens: 84095795200 | elapsed time per iteration (s): 0.15 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 3.657451E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.311 | TFLOPs: 26.23 | +7: iteration 160410/ 173500 | consumed samples: 41064960 | consumed tokens: 84101038080 | elapsed time per iteration (s): 0.15 | learning rate: 2.257E-05 | global batch size: 256 | lm loss: 3.664582E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.307 | TFLOPs: 26.16 | +7: iteration 160420/ 173500 | consumed samples: 41067520 | consumed tokens: 84106280960 | elapsed time per iteration (s): 0.16 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.665573E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.340 | TFLOPs: 25.30 | +7: iteration 160430/ 173500 | consumed samples: 41070080 | consumed tokens: 84111523840 | elapsed time per iteration (s): 0.15 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.658256E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.444 | TFLOPs: 26.10 | +7: iteration 160440/ 173500 | consumed samples: 41072640 | consumed tokens: 84116766720 | elapsed time per iteration (s): 0.16 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.660375E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.510 | TFLOPs: 25.70 | +7: iteration 160450/ 173500 | consumed samples: 41075200 | consumed tokens: 84122009600 | elapsed time per iteration (s): 0.15 | learning rate: 2.255E-05 | global batch size: 256 | lm loss: 3.653401E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.502 | TFLOPs: 26.20 | +7: iteration 160460/ 173500 | consumed samples: 41077760 | consumed tokens: 84127252480 | elapsed time per iteration (s): 0.16 | learning rate: 2.255E-05 | global batch size: 256 | lm loss: 3.668301E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.888 | TFLOPs: 25.83 | +7: iteration 160470/ 173500 | consumed samples: 41080320 | consumed tokens: 84132495360 | elapsed time per iteration (s): 0.15 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 3.659323E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.844 | TFLOPs: 26.22 | +7: iteration 160480/ 173500 | consumed samples: 41082880 | consumed tokens: 84137738240 | elapsed time per iteration (s): 0.15 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 3.656428E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.590 | TFLOPs: 26.21 | +7: iteration 160490/ 173500 | consumed samples: 41085440 | consumed tokens: 84142981120 | elapsed time per iteration (s): 0.15 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 3.654982E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.124 | TFLOPs: 26.24 | +7: iteration 160500/ 173500 | consumed samples: 41088000 | consumed tokens: 84148224000 | elapsed time per iteration (s): 0.15 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 3.669604E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.517 | TFLOPs: 26.24 | +7: iteration 160510/ 173500 | consumed samples: 41090560 | consumed tokens: 84153466880 | elapsed time per iteration (s): 0.15 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 3.666602E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.427 | TFLOPs: 26.21 | +7: iteration 160520/ 173500 | consumed samples: 41093120 | consumed tokens: 84158709760 | elapsed time per iteration (s): 0.15 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 3.670394E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.345 | TFLOPs: 26.23 | +7: iteration 160530/ 173500 | consumed samples: 41095680 | consumed tokens: 84163952640 | elapsed time per iteration (s): 0.15 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 3.657664E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.366 | TFLOPs: 26.23 | +7: iteration 160540/ 173500 | consumed samples: 41098240 | consumed tokens: 84169195520 | elapsed time per iteration (s): 0.15 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 3.664536E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.950 | TFLOPs: 26.24 | +7: iteration 160550/ 173500 | consumed samples: 41100800 | consumed tokens: 84174438400 | elapsed time per iteration (s): 0.16 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 3.656584E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.472 | TFLOPs: 25.76 | +7: iteration 160560/ 173500 | consumed samples: 41103360 | consumed tokens: 84179681280 | elapsed time per iteration (s): 0.16 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 3.665951E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.780 | TFLOPs: 24.90 | +7: iteration 160570/ 173500 | consumed samples: 41105920 | consumed tokens: 84184924160 | elapsed time per iteration (s): 0.15 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 3.650094E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.163 | TFLOPs: 26.16 | +7: iteration 160580/ 173500 | consumed samples: 41108480 | consumed tokens: 84190167040 | elapsed time per iteration (s): 0.16 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 3.650557E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.532 | TFLOPs: 25.49 | +7: iteration 160590/ 173500 | consumed samples: 41111040 | consumed tokens: 84195409920 | elapsed time per iteration (s): 0.15 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 3.673302E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.451 | TFLOPs: 26.18 | +7: iteration 160600/ 173500 | consumed samples: 41113600 | consumed tokens: 84200652800 | elapsed time per iteration (s): 0.15 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.672629E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.515 | TFLOPs: 26.23 | +7: iteration 160610/ 173500 | consumed samples: 41116160 | consumed tokens: 84205895680 | elapsed time per iteration (s): 0.15 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.661268E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.876 | TFLOPs: 26.27 | +7: iteration 160620/ 173500 | consumed samples: 41118720 | consumed tokens: 84211138560 | elapsed time per iteration (s): 0.15 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.672584E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.432 | TFLOPs: 26.06 | +7: iteration 160630/ 173500 | consumed samples: 41121280 | consumed tokens: 84216381440 | elapsed time per iteration (s): 0.15 | learning rate: 2.248E-05 | global batch size: 256 | lm loss: 3.661227E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.756 | TFLOPs: 26.25 | +7: iteration 160640/ 173500 | consumed samples: 41123840 | consumed tokens: 84221624320 | elapsed time per iteration (s): 0.15 | learning rate: 2.248E-05 | global batch size: 256 | lm loss: 3.664336E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.433 | TFLOPs: 26.02 | +7: iteration 160650/ 173500 | consumed samples: 41126400 | consumed tokens: 84226867200 | elapsed time per iteration (s): 0.15 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 3.665622E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.454 | TFLOPs: 26.24 | +7: iteration 160660/ 173500 | consumed samples: 41128960 | consumed tokens: 84232110080 | elapsed time per iteration (s): 0.15 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 3.662803E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.310 | TFLOPs: 26.26 | +7: iteration 160670/ 173500 | consumed samples: 41131520 | consumed tokens: 84237352960 | elapsed time per iteration (s): 0.15 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 3.669439E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.915 | TFLOPs: 25.92 | +7: iteration 160680/ 173500 | consumed samples: 41134080 | consumed tokens: 84242595840 | elapsed time per iteration (s): 0.15 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 3.670225E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.733 | TFLOPs: 26.25 | +7: iteration 160690/ 173500 | consumed samples: 41136640 | consumed tokens: 84247838720 | elapsed time per iteration (s): 0.16 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 3.656730E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.749 | TFLOPs: 24.76 | +7: iteration 160700/ 173500 | consumed samples: 41139200 | consumed tokens: 84253081600 | elapsed time per iteration (s): 0.15 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 3.674313E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.017 | TFLOPs: 26.21 | +7: iteration 160710/ 173500 | consumed samples: 41141760 | consumed tokens: 84258324480 | elapsed time per iteration (s): 0.15 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 3.655389E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.276 | TFLOPs: 26.18 | +7: iteration 160720/ 173500 | consumed samples: 41144320 | consumed tokens: 84263567360 | elapsed time per iteration (s): 0.16 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 3.660316E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.200 | TFLOPs: 25.60 | +7: iteration 160730/ 173500 | consumed samples: 41146880 | consumed tokens: 84268810240 | elapsed time per iteration (s): 0.15 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 3.654691E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.274 | TFLOPs: 26.19 | +7: iteration 160740/ 173500 | consumed samples: 41149440 | consumed tokens: 84274053120 | elapsed time per iteration (s): 0.15 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 3.654112E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.854 | TFLOPs: 26.14 | +7: iteration 160750/ 173500 | consumed samples: 41152000 | consumed tokens: 84279296000 | elapsed time per iteration (s): 0.16 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 3.658847E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.728 | TFLOPs: 25.84 | +7: iteration 160760/ 173500 | consumed samples: 41154560 | consumed tokens: 84284538880 | elapsed time per iteration (s): 0.16 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 3.675537E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.286 | TFLOPs: 25.25 | +7: iteration 160770/ 173500 | consumed samples: 41157120 | consumed tokens: 84289781760 | elapsed time per iteration (s): 0.15 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 3.655489E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.610 | TFLOPs: 26.12 | +7: iteration 160780/ 173500 | consumed samples: 41159680 | consumed tokens: 84295024640 | elapsed time per iteration (s): 0.15 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 3.663996E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.260 | TFLOPs: 26.18 | +7: iteration 160790/ 173500 | consumed samples: 41162240 | consumed tokens: 84300267520 | elapsed time per iteration (s): 0.15 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 3.656909E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.500 | TFLOPs: 26.17 | +7: iteration 160800/ 173500 | consumed samples: 41164800 | consumed tokens: 84305510400 | elapsed time per iteration (s): 0.15 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 3.668591E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.255 | TFLOPs: 26.13 | +7: iteration 160810/ 173500 | consumed samples: 41167360 | consumed tokens: 84310753280 | elapsed time per iteration (s): 0.16 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 3.654419E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.647 | TFLOPs: 25.62 | +7: iteration 160820/ 173500 | consumed samples: 41169920 | consumed tokens: 84315996160 | elapsed time per iteration (s): 0.16 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 3.667986E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.902 | TFLOPs: 25.69 | +7: iteration 160830/ 173500 | consumed samples: 41172480 | consumed tokens: 84321239040 | elapsed time per iteration (s): 0.15 | learning rate: 2.241E-05 | global batch size: 256 | lm loss: 3.658993E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.707 | TFLOPs: 26.23 | +7: iteration 160840/ 173500 | consumed samples: 41175040 | consumed tokens: 84326481920 | elapsed time per iteration (s): 0.15 | learning rate: 2.240E-05 | global batch size: 256 | lm loss: 3.656130E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.451 | TFLOPs: 26.21 | +7: iteration 160850/ 173500 | consumed samples: 41177600 | consumed tokens: 84331724800 | elapsed time per iteration (s): 0.16 | learning rate: 2.240E-05 | global batch size: 256 | lm loss: 3.661425E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.330 | TFLOPs: 25.33 | +7: iteration 160860/ 173500 | consumed samples: 41180160 | consumed tokens: 84336967680 | elapsed time per iteration (s): 0.15 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 3.673216E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.146 | TFLOPs: 26.13 | +7: iteration 160870/ 173500 | consumed samples: 41182720 | consumed tokens: 84342210560 | elapsed time per iteration (s): 0.16 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 3.649618E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.504 | TFLOPs: 25.73 | +7: iteration 160880/ 173500 | consumed samples: 41185280 | consumed tokens: 84347453440 | elapsed time per iteration (s): 0.15 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 3.674275E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.028 | TFLOPs: 25.91 | +7: iteration 160890/ 173500 | consumed samples: 41187840 | consumed tokens: 84352696320 | elapsed time per iteration (s): 0.15 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 3.647121E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.646 | TFLOPs: 26.18 | +7: iteration 160900/ 173500 | consumed samples: 41190400 | consumed tokens: 84357939200 | elapsed time per iteration (s): 0.15 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 3.668659E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.977 | TFLOPs: 26.21 | +7: iteration 160910/ 173500 | consumed samples: 41192960 | consumed tokens: 84363182080 | elapsed time per iteration (s): 0.16 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 3.658475E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.169 | TFLOPs: 25.82 | +7: iteration 160920/ 173500 | consumed samples: 41195520 | consumed tokens: 84368424960 | elapsed time per iteration (s): 0.16 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 3.673878E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.951 | TFLOPs: 25.50 | +7: iteration 160930/ 173500 | consumed samples: 41198080 | consumed tokens: 84373667840 | elapsed time per iteration (s): 0.16 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 3.650146E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.863 | TFLOPs: 25.25 | +7: iteration 160940/ 173500 | consumed samples: 41200640 | consumed tokens: 84378910720 | elapsed time per iteration (s): 0.15 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 3.655466E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.882 | TFLOPs: 26.20 | +7: iteration 160950/ 173500 | consumed samples: 41203200 | consumed tokens: 84384153600 | elapsed time per iteration (s): 0.16 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 3.658256E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.461 | TFLOPs: 25.57 | +7: iteration 160960/ 173500 | consumed samples: 41205760 | consumed tokens: 84389396480 | elapsed time per iteration (s): 0.15 | learning rate: 2.236E-05 | global batch size: 256 | lm loss: 3.659674E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.060 | TFLOPs: 26.16 | +7: iteration 160970/ 173500 | consumed samples: 41208320 | consumed tokens: 84394639360 | elapsed time per iteration (s): 0.16 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 3.667979E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.175 | TFLOPs: 25.77 | +7: iteration 160980/ 173500 | consumed samples: 41210880 | consumed tokens: 84399882240 | elapsed time per iteration (s): 0.15 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 3.669754E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.027 | TFLOPs: 26.14 | +7: iteration 160990/ 173500 | consumed samples: 41213440 | consumed tokens: 84405125120 | elapsed time per iteration (s): 0.15 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 3.664968E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.188 | TFLOPs: 26.13 | +7: iteration 161000/ 173500 | consumed samples: 41216000 | consumed tokens: 84410368000 | elapsed time per iteration (s): 0.15 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 3.651786E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.832 | TFLOPs: 26.09 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 161000 | lm loss value: 3.826349E+00 | lm loss PPL: 4.589465E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 161000 to checkpoints_44m91b100m +0: [2023-03-17 07:15:37,177] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step161000 is begin to save! +0: [2023-03-17 07:15:37,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:15:37,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:15:37,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:15:37,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:15:37,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:15:37,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:15:37,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:15:37,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:15:37,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:15:37,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:15:37,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:15:37,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:15:37,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:15:37,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:15:37,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:15:37,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:15:37,304] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:15:37,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:15:37,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:15:37,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:15:37,314] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step161000/mp_rank_00_model_states.pt +0: [2023-03-17 07:15:37,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:15:37,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:15:37,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:15:37,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +7: [2023-03-17 07:15:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:15:37,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +6: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +2: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +1: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +5: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:15:37,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:15:37,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +4: [2023-03-17 07:15:37,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:15:37,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step161000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:15:37,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step161000 is ready now! +0: successfully saved checkpoint at iteration 161000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 184.55 +7: iteration 161010/ 173500 | consumed samples: 41218560 | consumed tokens: 84415610880 | elapsed time per iteration (s): 0.18 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 3.657787E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.485 | TFLOPs: 22.62 | +7: iteration 161020/ 173500 | consumed samples: 41221120 | consumed tokens: 84420853760 | elapsed time per iteration (s): 0.19 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 3.663852E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1366.592 | TFLOPs: 21.43 | +7: iteration 161030/ 173500 | consumed samples: 41223680 | consumed tokens: 84426096640 | elapsed time per iteration (s): 0.18 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 3.664013E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1433.883 | TFLOPs: 22.49 | +7: iteration 161040/ 173500 | consumed samples: 41226240 | consumed tokens: 84431339520 | elapsed time per iteration (s): 0.16 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 3.656014E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.396 | TFLOPs: 25.65 | +7: iteration 161050/ 173500 | consumed samples: 41228800 | consumed tokens: 84436582400 | elapsed time per iteration (s): 0.16 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 3.658859E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.151 | TFLOPs: 25.78 | +7: iteration 161060/ 173500 | consumed samples: 41231360 | consumed tokens: 84441825280 | elapsed time per iteration (s): 0.16 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 3.670488E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.886 | TFLOPs: 25.12 | +7: iteration 161070/ 173500 | consumed samples: 41233920 | consumed tokens: 84447068160 | elapsed time per iteration (s): 0.16 | learning rate: 2.232E-05 | global batch size: 256 | lm loss: 3.658405E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.799 | TFLOPs: 25.07 | +7: iteration 161080/ 173500 | consumed samples: 41236480 | consumed tokens: 84452311040 | elapsed time per iteration (s): 0.16 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 3.648840E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.782 | TFLOPs: 24.38 | +7: iteration 161090/ 173500 | consumed samples: 41239040 | consumed tokens: 84457553920 | elapsed time per iteration (s): 0.16 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 3.653690E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.972 | TFLOPs: 25.37 | +7: iteration 161100/ 173500 | consumed samples: 41241600 | consumed tokens: 84462796800 | elapsed time per iteration (s): 0.16 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 3.655640E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.513 | TFLOPs: 25.30 | +7: iteration 161110/ 173500 | consumed samples: 41244160 | consumed tokens: 84468039680 | elapsed time per iteration (s): 0.15 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 3.666066E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.575 | TFLOPs: 25.95 | +7: iteration 161120/ 173500 | consumed samples: 41246720 | consumed tokens: 84473282560 | elapsed time per iteration (s): 0.16 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 3.662101E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.494 | TFLOPs: 25.13 | +7: iteration 161130/ 173500 | consumed samples: 41249280 | consumed tokens: 84478525440 | elapsed time per iteration (s): 0.16 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 3.664861E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.572 | TFLOPs: 25.13 | +7: iteration 161140/ 173500 | consumed samples: 41251840 | consumed tokens: 84483768320 | elapsed time per iteration (s): 0.16 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 3.667633E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.236 | TFLOPs: 25.33 | +7: iteration 161150/ 173500 | consumed samples: 41254400 | consumed tokens: 84489011200 | elapsed time per iteration (s): 0.16 | learning rate: 2.229E-05 | global batch size: 256 | lm loss: 3.664235E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1582.852 | TFLOPs: 24.82 | +7: iteration 161160/ 173500 | consumed samples: 41256960 | consumed tokens: 84494254080 | elapsed time per iteration (s): 0.16 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.664705E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.607 | TFLOPs: 25.38 | +7: iteration 161170/ 173500 | consumed samples: 41259520 | consumed tokens: 84499496960 | elapsed time per iteration (s): 0.16 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.664774E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.112 | TFLOPs: 25.74 | +7: iteration 161180/ 173500 | consumed samples: 41262080 | consumed tokens: 84504739840 | elapsed time per iteration (s): 0.16 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.659884E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.565 | TFLOPs: 25.24 | +7: iteration 161190/ 173500 | consumed samples: 41264640 | consumed tokens: 84509982720 | elapsed time per iteration (s): 0.16 | learning rate: 2.227E-05 | global batch size: 256 | lm loss: 3.660474E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.543 | TFLOPs: 25.71 | +7: iteration 161200/ 173500 | consumed samples: 41267200 | consumed tokens: 84515225600 | elapsed time per iteration (s): 0.16 | learning rate: 2.227E-05 | global batch size: 256 | lm loss: 3.655875E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.059 | TFLOPs: 25.50 | +7: iteration 161210/ 173500 | consumed samples: 41269760 | consumed tokens: 84520468480 | elapsed time per iteration (s): 0.16 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.680195E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.989 | TFLOPs: 25.69 | +7: iteration 161220/ 173500 | consumed samples: 41272320 | consumed tokens: 84525711360 | elapsed time per iteration (s): 0.16 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.649472E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.687 | TFLOPs: 25.64 | +7: iteration 161230/ 173500 | consumed samples: 41274880 | consumed tokens: 84530954240 | elapsed time per iteration (s): 0.17 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.667292E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1511.398 | TFLOPs: 23.70 | +7: iteration 161240/ 173500 | consumed samples: 41277440 | consumed tokens: 84536197120 | elapsed time per iteration (s): 0.16 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.657846E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.501 | TFLOPs: 24.72 | +7: iteration 161250/ 173500 | consumed samples: 41280000 | consumed tokens: 84541440000 | elapsed time per iteration (s): 0.16 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.662154E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.141 | TFLOPs: 25.61 | +7: iteration 161260/ 173500 | consumed samples: 41282560 | consumed tokens: 84546682880 | elapsed time per iteration (s): 0.17 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.654462E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.625 | TFLOPs: 24.21 | +7: iteration 161270/ 173500 | consumed samples: 41285120 | consumed tokens: 84551925760 | elapsed time per iteration (s): 0.16 | learning rate: 2.224E-05 | global batch size: 256 | lm loss: 3.644987E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.193 | TFLOPs: 25.03 | +7: iteration 161280/ 173500 | consumed samples: 41287680 | consumed tokens: 84557168640 | elapsed time per iteration (s): 0.16 | learning rate: 2.224E-05 | global batch size: 256 | lm loss: 3.666525E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.011 | TFLOPs: 25.19 | +7: iteration 161290/ 173500 | consumed samples: 41290240 | consumed tokens: 84562411520 | elapsed time per iteration (s): 0.15 | learning rate: 2.224E-05 | global batch size: 256 | lm loss: 3.663715E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.451 | TFLOPs: 26.07 | +7: iteration 161300/ 173500 | consumed samples: 41292800 | consumed tokens: 84567654400 | elapsed time per iteration (s): 0.16 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 3.673596E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.995 | TFLOPs: 24.86 | +7: iteration 161310/ 173500 | consumed samples: 41295360 | consumed tokens: 84572897280 | elapsed time per iteration (s): 0.16 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 3.665102E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.095 | TFLOPs: 25.09 | +7: iteration 161320/ 173500 | consumed samples: 41297920 | consumed tokens: 84578140160 | elapsed time per iteration (s): 0.16 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 3.668531E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.710 | TFLOPs: 25.20 | +7: iteration 161330/ 173500 | consumed samples: 41300480 | consumed tokens: 84583383040 | elapsed time per iteration (s): 0.16 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 3.652974E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1575.512 | TFLOPs: 24.71 | +7: iteration 161340/ 173500 | consumed samples: 41303040 | consumed tokens: 84588625920 | elapsed time per iteration (s): 0.16 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 3.668036E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.585 | TFLOPs: 24.79 | +7: iteration 161350/ 173500 | consumed samples: 41305600 | consumed tokens: 84593868800 | elapsed time per iteration (s): 0.16 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 3.671059E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.788 | TFLOPs: 25.32 | +7: iteration 161360/ 173500 | consumed samples: 41308160 | consumed tokens: 84599111680 | elapsed time per iteration (s): 0.16 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 3.664080E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.464 | TFLOPs: 25.80 | +7: iteration 161370/ 173500 | consumed samples: 41310720 | consumed tokens: 84604354560 | elapsed time per iteration (s): 0.16 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 3.655799E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.905 | TFLOPs: 25.44 | +7: iteration 161380/ 173500 | consumed samples: 41313280 | consumed tokens: 84609597440 | elapsed time per iteration (s): 0.16 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 3.651870E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.703 | TFLOPs: 25.57 | +7: iteration 161390/ 173500 | consumed samples: 41315840 | consumed tokens: 84614840320 | elapsed time per iteration (s): 0.16 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 3.659934E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.271 | TFLOPs: 24.92 | +7: iteration 161400/ 173500 | consumed samples: 41318400 | consumed tokens: 84620083200 | elapsed time per iteration (s): 0.16 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 3.664678E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.725 | TFLOPs: 25.67 | +7: iteration 161410/ 173500 | consumed samples: 41320960 | consumed tokens: 84625326080 | elapsed time per iteration (s): 0.16 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 3.665847E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.731 | TFLOPs: 25.57 | +7: iteration 161420/ 173500 | consumed samples: 41323520 | consumed tokens: 84630568960 | elapsed time per iteration (s): 0.16 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 3.660741E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.150 | TFLOPs: 25.05 | +7: iteration 161430/ 173500 | consumed samples: 41326080 | consumed tokens: 84635811840 | elapsed time per iteration (s): 0.16 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 3.670520E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.766 | TFLOPs: 24.79 | +7: iteration 161440/ 173500 | consumed samples: 41328640 | consumed tokens: 84641054720 | elapsed time per iteration (s): 0.16 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 3.670227E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.855 | TFLOPs: 25.47 | +7: iteration 161450/ 173500 | consumed samples: 41331200 | consumed tokens: 84646297600 | elapsed time per iteration (s): 0.16 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 3.663026E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.009 | TFLOPs: 25.75 | +7: iteration 161460/ 173500 | consumed samples: 41333760 | consumed tokens: 84651540480 | elapsed time per iteration (s): 0.17 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 3.656813E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1520.309 | TFLOPs: 23.84 | +7: iteration 161470/ 173500 | consumed samples: 41336320 | consumed tokens: 84656783360 | elapsed time per iteration (s): 0.16 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 3.663436E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.023 | TFLOPs: 25.59 | +7: iteration 161480/ 173500 | consumed samples: 41338880 | consumed tokens: 84662026240 | elapsed time per iteration (s): 0.16 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 3.648943E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.142 | TFLOPs: 25.80 | +7: iteration 161490/ 173500 | consumed samples: 41341440 | consumed tokens: 84667269120 | elapsed time per iteration (s): 0.16 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 3.658839E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.382 | TFLOPs: 25.63 | +7: iteration 161500/ 173500 | consumed samples: 41344000 | consumed tokens: 84672512000 | elapsed time per iteration (s): 0.16 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 3.661590E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.507 | TFLOPs: 25.79 | +7: iteration 161510/ 173500 | consumed samples: 41346560 | consumed tokens: 84677754880 | elapsed time per iteration (s): 0.16 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 3.660032E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.817 | TFLOPs: 24.95 | +7: iteration 161520/ 173500 | consumed samples: 41349120 | consumed tokens: 84682997760 | elapsed time per iteration (s): 0.16 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 3.655727E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.614 | TFLOPs: 25.62 | +7: iteration 161530/ 173500 | consumed samples: 41351680 | consumed tokens: 84688240640 | elapsed time per iteration (s): 0.15 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 3.659168E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.868 | TFLOPs: 26.11 | +7: iteration 161540/ 173500 | consumed samples: 41354240 | consumed tokens: 84693483520 | elapsed time per iteration (s): 0.15 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.650470E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.993 | TFLOPs: 26.11 | +7: iteration 161550/ 173500 | consumed samples: 41356800 | consumed tokens: 84698726400 | elapsed time per iteration (s): 0.16 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.661601E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.151 | TFLOPs: 25.80 | +7: iteration 161560/ 173500 | consumed samples: 41359360 | consumed tokens: 84703969280 | elapsed time per iteration (s): 0.16 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.663498E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.437 | TFLOPs: 25.74 | +7: iteration 161570/ 173500 | consumed samples: 41361920 | consumed tokens: 84709212160 | elapsed time per iteration (s): 0.16 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 3.672329E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.180 | TFLOPs: 25.82 | +7: iteration 161580/ 173500 | consumed samples: 41364480 | consumed tokens: 84714455040 | elapsed time per iteration (s): 0.16 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 3.662073E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.703 | TFLOPs: 25.54 | +7: iteration 161590/ 173500 | consumed samples: 41367040 | consumed tokens: 84719697920 | elapsed time per iteration (s): 0.15 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 3.672408E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.276 | TFLOPs: 26.12 | +7: iteration 161600/ 173500 | consumed samples: 41369600 | consumed tokens: 84724940800 | elapsed time per iteration (s): 0.16 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 3.664244E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.633 | TFLOPs: 25.78 | +7: iteration 161610/ 173500 | consumed samples: 41372160 | consumed tokens: 84730183680 | elapsed time per iteration (s): 0.16 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 3.662422E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.037 | TFLOPs: 25.20 | +7: iteration 161620/ 173500 | consumed samples: 41374720 | consumed tokens: 84735426560 | elapsed time per iteration (s): 0.16 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 3.659900E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1563.653 | TFLOPs: 24.52 | +7: iteration 161630/ 173500 | consumed samples: 41377280 | consumed tokens: 84740669440 | elapsed time per iteration (s): 0.16 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 3.667094E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.379 | TFLOPs: 25.85 | +7: iteration 161640/ 173500 | consumed samples: 41379840 | consumed tokens: 84745912320 | elapsed time per iteration (s): 0.16 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 3.657551E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.302 | TFLOPs: 24.78 | +7: iteration 161650/ 173500 | consumed samples: 41382400 | consumed tokens: 84751155200 | elapsed time per iteration (s): 0.16 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 3.659819E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.703 | TFLOPs: 25.76 | +7: iteration 161660/ 173500 | consumed samples: 41384960 | consumed tokens: 84756398080 | elapsed time per iteration (s): 0.15 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 3.659291E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.435 | TFLOPs: 25.93 | +7: iteration 161670/ 173500 | consumed samples: 41387520 | consumed tokens: 84761640960 | elapsed time per iteration (s): 0.16 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 3.652184E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.026 | TFLOPs: 25.45 | +7: iteration 161680/ 173500 | consumed samples: 41390080 | consumed tokens: 84766883840 | elapsed time per iteration (s): 0.16 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 3.657870E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.102 | TFLOPs: 24.65 | +7: iteration 161690/ 173500 | consumed samples: 41392640 | consumed tokens: 84772126720 | elapsed time per iteration (s): 0.16 | learning rate: 2.209E-05 | global batch size: 256 | lm loss: 3.658981E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.850 | TFLOPs: 25.42 | +7: iteration 161700/ 173500 | consumed samples: 41395200 | consumed tokens: 84777369600 | elapsed time per iteration (s): 0.15 | learning rate: 2.209E-05 | global batch size: 256 | lm loss: 3.661278E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.610 | TFLOPs: 26.31 | +7: iteration 161710/ 173500 | consumed samples: 41397760 | consumed tokens: 84782612480 | elapsed time per iteration (s): 0.17 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.655537E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.513 | TFLOPs: 23.92 | +7: iteration 161720/ 173500 | consumed samples: 41400320 | consumed tokens: 84787855360 | elapsed time per iteration (s): 0.15 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.661726E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.264 | TFLOPs: 26.37 | +7: iteration 161730/ 173500 | consumed samples: 41402880 | consumed tokens: 84793098240 | elapsed time per iteration (s): 0.16 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.656655E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.807 | TFLOPs: 25.81 | +7: iteration 161740/ 173500 | consumed samples: 41405440 | consumed tokens: 84798341120 | elapsed time per iteration (s): 0.16 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.672854E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.546 | TFLOPs: 24.96 | +7: iteration 161750/ 173500 | consumed samples: 41408000 | consumed tokens: 84803584000 | elapsed time per iteration (s): 0.15 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.665126E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.409 | TFLOPs: 26.04 | +7: iteration 161760/ 173500 | consumed samples: 41410560 | consumed tokens: 84808826880 | elapsed time per iteration (s): 0.15 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.661468E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.041 | TFLOPs: 26.14 | +7: iteration 161770/ 173500 | consumed samples: 41413120 | consumed tokens: 84814069760 | elapsed time per iteration (s): 0.15 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 3.668465E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.583 | TFLOPs: 26.32 | +7: iteration 161780/ 173500 | consumed samples: 41415680 | consumed tokens: 84819312640 | elapsed time per iteration (s): 0.16 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 3.663683E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.186 | TFLOPs: 25.89 | +7: iteration 161790/ 173500 | consumed samples: 41418240 | consumed tokens: 84824555520 | elapsed time per iteration (s): 0.16 | learning rate: 2.206E-05 | global batch size: 256 | lm loss: 3.663219E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.135 | TFLOPs: 25.33 | +7: iteration 161800/ 173500 | consumed samples: 41420800 | consumed tokens: 84829798400 | elapsed time per iteration (s): 0.15 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.662207E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.569 | TFLOPs: 26.29 | +7: iteration 161810/ 173500 | consumed samples: 41423360 | consumed tokens: 84835041280 | elapsed time per iteration (s): 0.16 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.659568E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.222 | TFLOPs: 25.77 | +7: iteration 161820/ 173500 | consumed samples: 41425920 | consumed tokens: 84840284160 | elapsed time per iteration (s): 0.15 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.662917E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.902 | TFLOPs: 25.98 | +7: iteration 161830/ 173500 | consumed samples: 41428480 | consumed tokens: 84845527040 | elapsed time per iteration (s): 0.15 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 3.656036E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.934 | TFLOPs: 25.98 | +7: iteration 161840/ 173500 | consumed samples: 41431040 | consumed tokens: 84850769920 | elapsed time per iteration (s): 0.16 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 3.665086E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.615 | TFLOPs: 25.84 | +7: iteration 161850/ 173500 | consumed samples: 41433600 | consumed tokens: 84856012800 | elapsed time per iteration (s): 0.16 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 3.664326E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.364 | TFLOPs: 25.47 | +7: iteration 161860/ 173500 | consumed samples: 41436160 | consumed tokens: 84861255680 | elapsed time per iteration (s): 0.15 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 3.664697E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.809 | TFLOPs: 26.31 | +7: iteration 161870/ 173500 | consumed samples: 41438720 | consumed tokens: 84866498560 | elapsed time per iteration (s): 0.16 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 3.659672E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.865 | TFLOPs: 25.04 | +7: iteration 161880/ 173500 | consumed samples: 41441280 | consumed tokens: 84871741440 | elapsed time per iteration (s): 0.17 | learning rate: 2.203E-05 | global batch size: 256 | lm loss: 3.663097E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1469.303 | TFLOPs: 23.04 | +7: iteration 161890/ 173500 | consumed samples: 41443840 | consumed tokens: 84876984320 | elapsed time per iteration (s): 0.16 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.661641E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.708 | TFLOPs: 25.32 | +7: iteration 161900/ 173500 | consumed samples: 41446400 | consumed tokens: 84882227200 | elapsed time per iteration (s): 0.16 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.645700E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.993 | TFLOPs: 25.78 | +7: iteration 161910/ 173500 | consumed samples: 41448960 | consumed tokens: 84887470080 | elapsed time per iteration (s): 0.16 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 3.670671E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.775 | TFLOPs: 25.86 | +7: iteration 161920/ 173500 | consumed samples: 41451520 | consumed tokens: 84892712960 | elapsed time per iteration (s): 0.15 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 3.654198E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.554 | TFLOPs: 26.17 | +7: iteration 161930/ 173500 | consumed samples: 41454080 | consumed tokens: 84897955840 | elapsed time per iteration (s): 0.15 | learning rate: 2.201E-05 | global batch size: 256 | lm loss: 3.668842E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.794 | TFLOPs: 26.26 | +7: iteration 161940/ 173500 | consumed samples: 41456640 | consumed tokens: 84903198720 | elapsed time per iteration (s): 0.15 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.671335E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.154 | TFLOPs: 26.11 | +7: iteration 161950/ 173500 | consumed samples: 41459200 | consumed tokens: 84908441600 | elapsed time per iteration (s): 0.16 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.659171E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.177 | TFLOPs: 25.68 | +7: iteration 161960/ 173500 | consumed samples: 41461760 | consumed tokens: 84913684480 | elapsed time per iteration (s): 0.16 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.651388E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.575 | TFLOPs: 25.71 | +7: iteration 161970/ 173500 | consumed samples: 41464320 | consumed tokens: 84918927360 | elapsed time per iteration (s): 0.16 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.655121E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.280 | TFLOPs: 25.54 | +7: iteration 161980/ 173500 | consumed samples: 41466880 | consumed tokens: 84924170240 | elapsed time per iteration (s): 0.15 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.650054E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.982 | TFLOPs: 26.13 | +7: iteration 161990/ 173500 | consumed samples: 41469440 | consumed tokens: 84929413120 | elapsed time per iteration (s): 0.15 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.663849E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.506 | TFLOPs: 26.01 | +0: [2023-03-17 07:18:15,602] [INFO] [logging.py:68:log_dist] [Rank 0] step=162000, skipped=0, lr=[2.1983700493183342e-05, 2.1983700493183342e-05, 2.1983700493183342e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 162000/ 173500 | consumed samples: 41472000 | consumed tokens: 84934656000 | elapsed time per iteration (s): 0.15 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 3.662852E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.414 | TFLOPs: 25.98 | +0: steps: 162000 loss: 3.6641 iter time (s): 0.156 samples/sec: 1643.462 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 162000 | lm loss value: 3.833974E+00 | lm loss PPL: 4.624597E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 162000 to checkpoints_44m91b100m +0: [2023-03-17 07:18:15,674] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step162000 is begin to save! +0: [2023-03-17 07:18:15,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:18:15,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:18:15,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:18:15,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:18:15,748] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:18:15,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:18:15,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:18:15,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:18:15,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:18:15,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:18:15,773] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:18:15,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:18:15,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:18:15,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:18:15,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:18:15,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:18:15,797] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:18:15,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:18:15,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:18:15,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:18:15,807] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step162000/mp_rank_00_model_states.pt +0: [2023-03-17 07:18:15,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:18:15,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:18:15,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:18:15,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +5: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +3: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +7: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:18:15,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:18:15,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +4: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +6: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +1: [2023-03-17 07:18:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +2: [2023-03-17 07:18:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:18:15,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step162000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:18:15,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step162000 is ready now! +0: successfully saved checkpoint at iteration 162000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 179.80 +7: iteration 162010/ 173500 | consumed samples: 41474560 | consumed tokens: 84939898880 | elapsed time per iteration (s): 0.18 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 3.653192E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1445.943 | TFLOPs: 22.68 | +7: iteration 162020/ 173500 | consumed samples: 41477120 | consumed tokens: 84945141760 | elapsed time per iteration (s): 0.15 | learning rate: 2.198E-05 | global batch size: 256 | lm loss: 3.677467E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.082 | TFLOPs: 26.36 | +7: iteration 162030/ 173500 | consumed samples: 41479680 | consumed tokens: 84950384640 | elapsed time per iteration (s): 0.15 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.661416E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.154 | TFLOPs: 26.33 | +7: iteration 162040/ 173500 | consumed samples: 41482240 | consumed tokens: 84955627520 | elapsed time per iteration (s): 0.16 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.663892E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.767 | TFLOPs: 25.15 | +7: iteration 162050/ 173500 | consumed samples: 41484800 | consumed tokens: 84960870400 | elapsed time per iteration (s): 0.16 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.656713E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.336 | TFLOPs: 25.68 | +7: iteration 162060/ 173500 | consumed samples: 41487360 | consumed tokens: 84966113280 | elapsed time per iteration (s): 0.16 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 3.674737E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.645 | TFLOPs: 25.45 | +7: iteration 162070/ 173500 | consumed samples: 41489920 | consumed tokens: 84971356160 | elapsed time per iteration (s): 0.16 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 3.657123E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1572.027 | TFLOPs: 24.65 | +7: iteration 162080/ 173500 | consumed samples: 41492480 | consumed tokens: 84976599040 | elapsed time per iteration (s): 0.16 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 3.663358E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.889 | TFLOPs: 25.64 | +7: iteration 162090/ 173500 | consumed samples: 41495040 | consumed tokens: 84981841920 | elapsed time per iteration (s): 0.15 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 3.656321E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.322 | TFLOPs: 26.10 | +7: iteration 162100/ 173500 | consumed samples: 41497600 | consumed tokens: 84987084800 | elapsed time per iteration (s): 0.16 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 3.652841E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.383 | TFLOPs: 25.63 | +7: iteration 162110/ 173500 | consumed samples: 41500160 | consumed tokens: 84992327680 | elapsed time per iteration (s): 0.16 | learning rate: 2.195E-05 | global batch size: 256 | lm loss: 3.653140E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.811 | TFLOPs: 25.28 | +7: iteration 162120/ 173500 | consumed samples: 41502720 | consumed tokens: 84997570560 | elapsed time per iteration (s): 0.16 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.662392E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.734 | TFLOPs: 25.64 | +7: iteration 162130/ 173500 | consumed samples: 41505280 | consumed tokens: 85002813440 | elapsed time per iteration (s): 0.16 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.658084E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.991 | TFLOPs: 25.53 | +7: iteration 162140/ 173500 | consumed samples: 41507840 | consumed tokens: 85008056320 | elapsed time per iteration (s): 0.15 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.658920E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.820 | TFLOPs: 26.12 | +7: iteration 162150/ 173500 | consumed samples: 41510400 | consumed tokens: 85013299200 | elapsed time per iteration (s): 0.16 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 3.661943E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.604 | TFLOPs: 24.93 | +7: iteration 162160/ 173500 | consumed samples: 41512960 | consumed tokens: 85018542080 | elapsed time per iteration (s): 0.15 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 3.645348E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.822 | TFLOPs: 26.17 | +7: iteration 162170/ 173500 | consumed samples: 41515520 | consumed tokens: 85023784960 | elapsed time per iteration (s): 0.15 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 3.653946E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.406 | TFLOPs: 26.12 | +7: iteration 162180/ 173500 | consumed samples: 41518080 | consumed tokens: 85029027840 | elapsed time per iteration (s): 0.15 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 3.661674E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.765 | TFLOPs: 26.12 | +7: iteration 162190/ 173500 | consumed samples: 41520640 | consumed tokens: 85034270720 | elapsed time per iteration (s): 0.15 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 3.664372E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.485 | TFLOPs: 26.13 | +7: iteration 162200/ 173500 | consumed samples: 41523200 | consumed tokens: 85039513600 | elapsed time per iteration (s): 0.16 | learning rate: 2.192E-05 | global batch size: 256 | lm loss: 3.662488E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.553 | TFLOPs: 25.87 | +7: iteration 162210/ 173500 | consumed samples: 41525760 | consumed tokens: 85044756480 | elapsed time per iteration (s): 0.17 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.654722E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.547 | TFLOPs: 24.10 | +7: iteration 162220/ 173500 | consumed samples: 41528320 | consumed tokens: 85049999360 | elapsed time per iteration (s): 0.17 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.651288E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.103 | TFLOPs: 23.92 | +7: iteration 162230/ 173500 | consumed samples: 41530880 | consumed tokens: 85055242240 | elapsed time per iteration (s): 0.16 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.652332E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.954 | TFLOPs: 25.50 | +7: iteration 162240/ 173500 | consumed samples: 41533440 | consumed tokens: 85060485120 | elapsed time per iteration (s): 0.16 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 3.665940E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.694 | TFLOPs: 25.01 | +7: iteration 162250/ 173500 | consumed samples: 41536000 | consumed tokens: 85065728000 | elapsed time per iteration (s): 0.15 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 3.661183E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.622 | TFLOPs: 26.22 | +7: iteration 162260/ 173500 | consumed samples: 41538560 | consumed tokens: 85070970880 | elapsed time per iteration (s): 0.15 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 3.667616E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.589 | TFLOPs: 26.00 | +7: iteration 162270/ 173500 | consumed samples: 41541120 | consumed tokens: 85076213760 | elapsed time per iteration (s): 0.16 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.669846E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.245 | TFLOPs: 25.00 | +7: iteration 162280/ 173500 | consumed samples: 41543680 | consumed tokens: 85081456640 | elapsed time per iteration (s): 0.15 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.673540E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.490 | TFLOPs: 26.06 | +7: iteration 162290/ 173500 | consumed samples: 41546240 | consumed tokens: 85086699520 | elapsed time per iteration (s): 0.15 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.662145E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.914 | TFLOPs: 26.36 | +7: iteration 162300/ 173500 | consumed samples: 41548800 | consumed tokens: 85091942400 | elapsed time per iteration (s): 0.15 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 3.653130E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.602 | TFLOPs: 26.32 | +7: iteration 162310/ 173500 | consumed samples: 41551360 | consumed tokens: 85097185280 | elapsed time per iteration (s): 0.15 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 3.653970E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.869 | TFLOPs: 26.34 | +7: iteration 162320/ 173500 | consumed samples: 41553920 | consumed tokens: 85102428160 | elapsed time per iteration (s): 0.16 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 3.667805E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.001 | TFLOPs: 25.77 | +7: iteration 162330/ 173500 | consumed samples: 41556480 | consumed tokens: 85107671040 | elapsed time per iteration (s): 0.16 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 3.655277E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.890 | TFLOPs: 25.87 | +7: iteration 162340/ 173500 | consumed samples: 41559040 | consumed tokens: 85112913920 | elapsed time per iteration (s): 0.15 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 3.651567E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.837 | TFLOPs: 25.92 | +7: iteration 162350/ 173500 | consumed samples: 41561600 | consumed tokens: 85118156800 | elapsed time per iteration (s): 0.15 | learning rate: 2.187E-05 | global batch size: 256 | lm loss: 3.661865E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.397 | TFLOPs: 25.91 | +7: iteration 162360/ 173500 | consumed samples: 41564160 | consumed tokens: 85123399680 | elapsed time per iteration (s): 0.15 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.668666E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.918 | TFLOPs: 26.28 | +7: iteration 162370/ 173500 | consumed samples: 41566720 | consumed tokens: 85128642560 | elapsed time per iteration (s): 0.15 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.649723E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.468 | TFLOPs: 25.91 | +7: iteration 162380/ 173500 | consumed samples: 41569280 | consumed tokens: 85133885440 | elapsed time per iteration (s): 0.15 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.682118E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.360 | TFLOPs: 26.27 | +7: iteration 162390/ 173500 | consumed samples: 41571840 | consumed tokens: 85139128320 | elapsed time per iteration (s): 0.15 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 3.649796E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.353 | TFLOPs: 26.23 | +7: iteration 162400/ 173500 | consumed samples: 41574400 | consumed tokens: 85144371200 | elapsed time per iteration (s): 0.16 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 3.667200E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.960 | TFLOPs: 25.78 | +7: iteration 162410/ 173500 | consumed samples: 41576960 | consumed tokens: 85149614080 | elapsed time per iteration (s): 0.15 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 3.662012E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.273 | TFLOPs: 26.18 | +7: iteration 162420/ 173500 | consumed samples: 41579520 | consumed tokens: 85154856960 | elapsed time per iteration (s): 0.15 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 3.655389E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.661 | TFLOPs: 26.29 | +7: iteration 162430/ 173500 | consumed samples: 41582080 | consumed tokens: 85160099840 | elapsed time per iteration (s): 0.16 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 3.654763E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.829 | TFLOPs: 25.67 | +7: iteration 162440/ 173500 | consumed samples: 41584640 | consumed tokens: 85165342720 | elapsed time per iteration (s): 0.15 | learning rate: 2.184E-05 | global batch size: 256 | lm loss: 3.670751E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.632 | TFLOPs: 25.93 | +7: iteration 162450/ 173500 | consumed samples: 41587200 | consumed tokens: 85170585600 | elapsed time per iteration (s): 0.16 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.668578E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.044 | TFLOPs: 25.85 | +7: iteration 162460/ 173500 | consumed samples: 41589760 | consumed tokens: 85175828480 | elapsed time per iteration (s): 0.17 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.664983E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1493.343 | TFLOPs: 23.42 | +7: iteration 162470/ 173500 | consumed samples: 41592320 | consumed tokens: 85181071360 | elapsed time per iteration (s): 0.18 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.651500E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1454.519 | TFLOPs: 22.81 | +7: iteration 162480/ 173500 | consumed samples: 41594880 | consumed tokens: 85186314240 | elapsed time per iteration (s): 0.16 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 3.647721E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.453 | TFLOPs: 25.52 | +7: iteration 162490/ 173500 | consumed samples: 41597440 | consumed tokens: 85191557120 | elapsed time per iteration (s): 0.15 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 3.657830E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.155 | TFLOPs: 26.27 | +7: iteration 162500/ 173500 | consumed samples: 41600000 | consumed tokens: 85196800000 | elapsed time per iteration (s): 0.15 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 3.672238E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.616 | TFLOPs: 26.26 | +7: iteration 162510/ 173500 | consumed samples: 41602560 | consumed tokens: 85202042880 | elapsed time per iteration (s): 0.15 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 3.660057E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.395 | TFLOPs: 26.20 | +7: iteration 162520/ 173500 | consumed samples: 41605120 | consumed tokens: 85207285760 | elapsed time per iteration (s): 0.15 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 3.666798E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.589 | TFLOPs: 26.12 | +7: iteration 162530/ 173500 | consumed samples: 41607680 | consumed tokens: 85212528640 | elapsed time per iteration (s): 0.15 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 3.668245E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.898 | TFLOPs: 26.06 | +7: iteration 162540/ 173500 | consumed samples: 41610240 | consumed tokens: 85217771520 | elapsed time per iteration (s): 0.15 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 3.657897E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.109 | TFLOPs: 26.16 | +7: iteration 162550/ 173500 | consumed samples: 41612800 | consumed tokens: 85223014400 | elapsed time per iteration (s): 0.15 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 3.668179E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.792 | TFLOPs: 26.17 | +7: iteration 162560/ 173500 | consumed samples: 41615360 | consumed tokens: 85228257280 | elapsed time per iteration (s): 0.16 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 3.665989E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.302 | TFLOPs: 25.50 | +7: iteration 162570/ 173500 | consumed samples: 41617920 | consumed tokens: 85233500160 | elapsed time per iteration (s): 0.16 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.652032E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.430 | TFLOPs: 25.88 | +7: iteration 162580/ 173500 | consumed samples: 41620480 | consumed tokens: 85238743040 | elapsed time per iteration (s): 0.15 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.666885E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.878 | TFLOPs: 26.13 | +7: iteration 162590/ 173500 | consumed samples: 41623040 | consumed tokens: 85243985920 | elapsed time per iteration (s): 0.16 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.650650E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.435 | TFLOPs: 25.88 | +7: iteration 162600/ 173500 | consumed samples: 41625600 | consumed tokens: 85249228800 | elapsed time per iteration (s): 0.15 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 3.654984E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.673 | TFLOPs: 26.23 | +7: iteration 162610/ 173500 | consumed samples: 41628160 | consumed tokens: 85254471680 | elapsed time per iteration (s): 0.15 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 3.641721E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.073 | TFLOPs: 26.27 | +7: iteration 162620/ 173500 | consumed samples: 41630720 | consumed tokens: 85259714560 | elapsed time per iteration (s): 0.15 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 3.648532E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.103 | TFLOPs: 26.27 | +7: iteration 162630/ 173500 | consumed samples: 41633280 | consumed tokens: 85264957440 | elapsed time per iteration (s): 0.15 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 3.659724E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.839 | TFLOPs: 26.28 | +7: iteration 162640/ 173500 | consumed samples: 41635840 | consumed tokens: 85270200320 | elapsed time per iteration (s): 0.15 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 3.654535E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.916 | TFLOPs: 26.25 | +7: iteration 162650/ 173500 | consumed samples: 41638400 | consumed tokens: 85275443200 | elapsed time per iteration (s): 0.16 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 3.664637E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.245 | TFLOPs: 25.82 | +7: iteration 162660/ 173500 | consumed samples: 41640960 | consumed tokens: 85280686080 | elapsed time per iteration (s): 0.15 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.672190E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.136 | TFLOPs: 26.27 | +7: iteration 162670/ 173500 | consumed samples: 41643520 | consumed tokens: 85285928960 | elapsed time per iteration (s): 0.15 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.663555E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.428 | TFLOPs: 25.93 | +7: iteration 162680/ 173500 | consumed samples: 41646080 | consumed tokens: 85291171840 | elapsed time per iteration (s): 0.15 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.678825E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.037 | TFLOPs: 26.22 | +7: iteration 162690/ 173500 | consumed samples: 41648640 | consumed tokens: 85296414720 | elapsed time per iteration (s): 0.16 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 3.656261E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.273 | TFLOPs: 25.85 | +7: iteration 162700/ 173500 | consumed samples: 41651200 | consumed tokens: 85301657600 | elapsed time per iteration (s): 0.15 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 3.666604E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.340 | TFLOPs: 26.27 | +7: iteration 162710/ 173500 | consumed samples: 41653760 | consumed tokens: 85306900480 | elapsed time per iteration (s): 0.15 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 3.661733E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.666 | TFLOPs: 26.28 | +7: iteration 162720/ 173500 | consumed samples: 41656320 | consumed tokens: 85312143360 | elapsed time per iteration (s): 0.15 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 3.665089E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.994 | TFLOPs: 26.25 | +7: iteration 162730/ 173500 | consumed samples: 41658880 | consumed tokens: 85317386240 | elapsed time per iteration (s): 0.15 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 3.661893E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.994 | TFLOPs: 26.27 | +7: iteration 162740/ 173500 | consumed samples: 41661440 | consumed tokens: 85322629120 | elapsed time per iteration (s): 0.15 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 3.649511E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.419 | TFLOPs: 26.21 | +7: iteration 162750/ 173500 | consumed samples: 41664000 | consumed tokens: 85327872000 | elapsed time per iteration (s): 0.15 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.664759E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.340 | TFLOPs: 26.20 | +7: iteration 162760/ 173500 | consumed samples: 41666560 | consumed tokens: 85333114880 | elapsed time per iteration (s): 0.15 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.668319E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.333 | TFLOPs: 26.24 | +7: iteration 162770/ 173500 | consumed samples: 41669120 | consumed tokens: 85338357760 | elapsed time per iteration (s): 0.15 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.669246E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.708 | TFLOPs: 26.22 | +7: iteration 162780/ 173500 | consumed samples: 41671680 | consumed tokens: 85343600640 | elapsed time per iteration (s): 0.15 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 3.650185E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.322 | TFLOPs: 26.21 | +7: iteration 162790/ 173500 | consumed samples: 41674240 | consumed tokens: 85348843520 | elapsed time per iteration (s): 0.15 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 3.668457E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.853 | TFLOPs: 26.25 | +7: iteration 162800/ 173500 | consumed samples: 41676800 | consumed tokens: 85354086400 | elapsed time per iteration (s): 0.16 | learning rate: 2.172E-05 | global batch size: 256 | lm loss: 3.664169E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.565 | TFLOPs: 25.45 | +7: iteration 162810/ 173500 | consumed samples: 41679360 | consumed tokens: 85359329280 | elapsed time per iteration (s): 0.15 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.660442E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.892 | TFLOPs: 26.24 | +7: iteration 162820/ 173500 | consumed samples: 41681920 | consumed tokens: 85364572160 | elapsed time per iteration (s): 0.15 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.672939E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.072 | TFLOPs: 26.19 | +7: iteration 162830/ 173500 | consumed samples: 41684480 | consumed tokens: 85369815040 | elapsed time per iteration (s): 0.15 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.662068E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.468 | TFLOPs: 26.21 | +7: iteration 162840/ 173500 | consumed samples: 41687040 | consumed tokens: 85375057920 | elapsed time per iteration (s): 0.16 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.659589E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.105 | TFLOPs: 25.85 | +7: iteration 162850/ 173500 | consumed samples: 41689600 | consumed tokens: 85380300800 | elapsed time per iteration (s): 0.15 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.661322E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.533 | TFLOPs: 26.25 | +7: iteration 162860/ 173500 | consumed samples: 41692160 | consumed tokens: 85385543680 | elapsed time per iteration (s): 0.15 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.663567E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.665 | TFLOPs: 26.12 | +7: iteration 162870/ 173500 | consumed samples: 41694720 | consumed tokens: 85390786560 | elapsed time per iteration (s): 0.16 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.657344E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.511 | TFLOPs: 24.93 | +7: iteration 162880/ 173500 | consumed samples: 41697280 | consumed tokens: 85396029440 | elapsed time per iteration (s): 0.16 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 3.664486E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.710 | TFLOPs: 25.51 | +7: iteration 162890/ 173500 | consumed samples: 41699840 | consumed tokens: 85401272320 | elapsed time per iteration (s): 0.15 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 3.664377E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.891 | TFLOPs: 25.95 | +7: iteration 162900/ 173500 | consumed samples: 41702400 | consumed tokens: 85406515200 | elapsed time per iteration (s): 0.15 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 3.662151E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.278 | TFLOPs: 26.34 | +7: iteration 162910/ 173500 | consumed samples: 41704960 | consumed tokens: 85411758080 | elapsed time per iteration (s): 0.16 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.656453E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.089 | TFLOPs: 25.88 | +7: iteration 162920/ 173500 | consumed samples: 41707520 | consumed tokens: 85417000960 | elapsed time per iteration (s): 0.15 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.669109E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.750 | TFLOPs: 26.15 | +7: iteration 162930/ 173500 | consumed samples: 41710080 | consumed tokens: 85422243840 | elapsed time per iteration (s): 0.16 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.653564E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.393 | TFLOPs: 25.87 | +7: iteration 162940/ 173500 | consumed samples: 41712640 | consumed tokens: 85427486720 | elapsed time per iteration (s): 0.15 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 3.671525E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.571 | TFLOPs: 26.15 | +7: iteration 162950/ 173500 | consumed samples: 41715200 | consumed tokens: 85432729600 | elapsed time per iteration (s): 0.15 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 3.663839E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.052 | TFLOPs: 26.17 | +7: iteration 162960/ 173500 | consumed samples: 41717760 | consumed tokens: 85437972480 | elapsed time per iteration (s): 0.15 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 3.657521E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.845 | TFLOPs: 26.17 | +7: iteration 162970/ 173500 | consumed samples: 41720320 | consumed tokens: 85443215360 | elapsed time per iteration (s): 0.15 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 3.662437E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.562 | TFLOPs: 26.17 | +7: iteration 162980/ 173500 | consumed samples: 41722880 | consumed tokens: 85448458240 | elapsed time per iteration (s): 0.16 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 3.661801E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.955 | TFLOPs: 25.45 | +7: iteration 162990/ 173500 | consumed samples: 41725440 | consumed tokens: 85453701120 | elapsed time per iteration (s): 0.16 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 3.662981E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.782 | TFLOPs: 25.59 | +7: iteration 163000/ 173500 | consumed samples: 41728000 | consumed tokens: 85458944000 | elapsed time per iteration (s): 0.15 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.664745E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.565 | TFLOPs: 26.03 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 163000 | lm loss value: 3.823864E+00 | lm loss PPL: 4.578075E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 163000 to checkpoints_44m91b100m +0: [2023-03-17 07:20:51,195] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step163000 is begin to save! +0: [2023-03-17 07:20:51,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:20:51,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:20:51,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:20:51,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:20:51,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:20:51,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:20:51,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:20:51,302] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:20:51,302] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:20:51,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:20:51,310] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:20:51,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:20:51,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:20:51,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:20:51,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:20:51,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:20:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:20:51,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:20:51,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:20:51,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:20:51,344] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step163000/mp_rank_00_model_states.pt +0: [2023-03-17 07:20:51,344] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:20:51,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:20:51,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:20:51,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-17 07:20:51,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,375] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,377] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +4: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +6: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +7: [2023-03-17 07:20:51,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +2: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +5: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +3: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +1: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:20:51,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step163000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:20:51,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step163000 is ready now! +0: successfully saved checkpoint at iteration 163000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 194.84 +7: iteration 163010/ 173500 | consumed samples: 41730560 | consumed tokens: 85464186880 | elapsed time per iteration (s): 0.22 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.664059E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.777 | TFLOPs: 18.08 | +7: iteration 163020/ 173500 | consumed samples: 41733120 | consumed tokens: 85469429760 | elapsed time per iteration (s): 0.16 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.671221E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.039 | TFLOPs: 25.03 | +7: iteration 163030/ 173500 | consumed samples: 41735680 | consumed tokens: 85474672640 | elapsed time per iteration (s): 0.15 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.659093E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.707 | TFLOPs: 26.36 | +7: iteration 163040/ 173500 | consumed samples: 41738240 | consumed tokens: 85479915520 | elapsed time per iteration (s): 0.15 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 3.658096E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.747 | TFLOPs: 26.39 | +7: iteration 163050/ 173500 | consumed samples: 41740800 | consumed tokens: 85485158400 | elapsed time per iteration (s): 0.15 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 3.661460E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.952 | TFLOPs: 26.35 | +7: iteration 163060/ 173500 | consumed samples: 41743360 | consumed tokens: 85490401280 | elapsed time per iteration (s): 0.15 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 3.665531E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.137 | TFLOPs: 26.35 | +7: iteration 163070/ 173500 | consumed samples: 41745920 | consumed tokens: 85495644160 | elapsed time per iteration (s): 0.15 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.660083E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.576 | TFLOPs: 26.36 | +7: iteration 163080/ 173500 | consumed samples: 41748480 | consumed tokens: 85500887040 | elapsed time per iteration (s): 0.15 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.662162E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.997 | TFLOPs: 26.35 | +7: iteration 163090/ 173500 | consumed samples: 41751040 | consumed tokens: 85506129920 | elapsed time per iteration (s): 0.15 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.676769E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.881 | TFLOPs: 26.38 | +7: iteration 163100/ 173500 | consumed samples: 41753600 | consumed tokens: 85511372800 | elapsed time per iteration (s): 0.15 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 3.656992E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1682.505 | TFLOPs: 26.39 | +7: iteration 163110/ 173500 | consumed samples: 41756160 | consumed tokens: 85516615680 | elapsed time per iteration (s): 0.15 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 3.650754E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.683 | TFLOPs: 26.33 | +7: iteration 163120/ 173500 | consumed samples: 41758720 | consumed tokens: 85521858560 | elapsed time per iteration (s): 0.15 | learning rate: 2.162E-05 | global batch size: 256 | lm loss: 3.671724E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.753 | TFLOPs: 26.34 | +7: iteration 163130/ 173500 | consumed samples: 41761280 | consumed tokens: 85527101440 | elapsed time per iteration (s): 0.15 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.660718E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.543 | TFLOPs: 26.26 | +7: iteration 163140/ 173500 | consumed samples: 41763840 | consumed tokens: 85532344320 | elapsed time per iteration (s): 0.16 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.647910E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1569.220 | TFLOPs: 24.61 | +7: iteration 163150/ 173500 | consumed samples: 41766400 | consumed tokens: 85537587200 | elapsed time per iteration (s): 0.16 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.671981E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1567.130 | TFLOPs: 24.58 | +7: iteration 163160/ 173500 | consumed samples: 41768960 | consumed tokens: 85542830080 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.661059E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.918 | TFLOPs: 26.36 | +7: iteration 163170/ 173500 | consumed samples: 41771520 | consumed tokens: 85548072960 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.656066E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.913 | TFLOPs: 26.35 | +7: iteration 163180/ 173500 | consumed samples: 41774080 | consumed tokens: 85553315840 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.654942E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.623 | TFLOPs: 26.29 | +7: iteration 163190/ 173500 | consumed samples: 41776640 | consumed tokens: 85558558720 | elapsed time per iteration (s): 0.15 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.649909E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.423 | TFLOPs: 26.32 | +7: iteration 163200/ 173500 | consumed samples: 41779200 | consumed tokens: 85563801600 | elapsed time per iteration (s): 0.16 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 3.657353E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.210 | TFLOPs: 25.83 | +7: iteration 163210/ 173500 | consumed samples: 41781760 | consumed tokens: 85569044480 | elapsed time per iteration (s): 0.16 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 3.681086E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.732 | TFLOPs: 25.06 | +7: iteration 163220/ 173500 | consumed samples: 41784320 | consumed tokens: 85574287360 | elapsed time per iteration (s): 0.15 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 3.652847E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.342 | TFLOPs: 25.94 | +7: iteration 163230/ 173500 | consumed samples: 41786880 | consumed tokens: 85579530240 | elapsed time per iteration (s): 0.15 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 3.652647E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.826 | TFLOPs: 26.06 | +7: iteration 163240/ 173500 | consumed samples: 41789440 | consumed tokens: 85584773120 | elapsed time per iteration (s): 0.15 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 3.668819E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.100 | TFLOPs: 26.32 | +7: iteration 163250/ 173500 | consumed samples: 41792000 | consumed tokens: 85590016000 | elapsed time per iteration (s): 0.15 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 3.656435E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.286 | TFLOPs: 26.27 | +7: iteration 163260/ 173500 | consumed samples: 41794560 | consumed tokens: 85595258880 | elapsed time per iteration (s): 0.15 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.667783E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.734 | TFLOPs: 26.28 | +7: iteration 163270/ 173500 | consumed samples: 41797120 | consumed tokens: 85600501760 | elapsed time per iteration (s): 0.15 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.654648E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.456 | TFLOPs: 26.29 | +7: iteration 163280/ 173500 | consumed samples: 41799680 | consumed tokens: 85605744640 | elapsed time per iteration (s): 0.15 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.649117E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.717 | TFLOPs: 26.26 | +7: iteration 163290/ 173500 | consumed samples: 41802240 | consumed tokens: 85610987520 | elapsed time per iteration (s): 0.15 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.667836E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.803 | TFLOPs: 26.25 | +7: iteration 163300/ 173500 | consumed samples: 41804800 | consumed tokens: 85616230400 | elapsed time per iteration (s): 0.15 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.658743E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.672 | TFLOPs: 26.26 | +7: iteration 163310/ 173500 | consumed samples: 41807360 | consumed tokens: 85621473280 | elapsed time per iteration (s): 0.16 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.666281E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.832 | TFLOPs: 25.09 | +7: iteration 163320/ 173500 | consumed samples: 41809920 | consumed tokens: 85626716160 | elapsed time per iteration (s): 0.15 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.658504E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.496 | TFLOPs: 26.35 | +7: iteration 163330/ 173500 | consumed samples: 41812480 | consumed tokens: 85631959040 | elapsed time per iteration (s): 0.16 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 3.659484E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.521 | TFLOPs: 25.70 | +7: iteration 163340/ 173500 | consumed samples: 41815040 | consumed tokens: 85637201920 | elapsed time per iteration (s): 0.15 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 3.656372E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.453 | TFLOPs: 26.20 | +7: iteration 163350/ 173500 | consumed samples: 41817600 | consumed tokens: 85642444800 | elapsed time per iteration (s): 0.15 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 3.648410E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.906 | TFLOPs: 26.22 | +7: iteration 163360/ 173500 | consumed samples: 41820160 | consumed tokens: 85647687680 | elapsed time per iteration (s): 0.16 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.659085E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.823 | TFLOPs: 25.67 | +7: iteration 163370/ 173500 | consumed samples: 41822720 | consumed tokens: 85652930560 | elapsed time per iteration (s): 0.15 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.665326E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.110 | TFLOPs: 26.21 | +7: iteration 163380/ 173500 | consumed samples: 41825280 | consumed tokens: 85658173440 | elapsed time per iteration (s): 0.15 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.673900E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.967 | TFLOPs: 26.19 | +7: iteration 163390/ 173500 | consumed samples: 41827840 | consumed tokens: 85663416320 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.668084E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.258 | TFLOPs: 26.18 | +7: iteration 163400/ 173500 | consumed samples: 41830400 | consumed tokens: 85668659200 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.657653E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.676 | TFLOPs: 26.20 | +7: iteration 163410/ 173500 | consumed samples: 41832960 | consumed tokens: 85673902080 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.665144E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.362 | TFLOPs: 25.94 | +7: iteration 163420/ 173500 | consumed samples: 41835520 | consumed tokens: 85679144960 | elapsed time per iteration (s): 0.15 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.649073E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.299 | TFLOPs: 26.32 | +7: iteration 163430/ 173500 | consumed samples: 41838080 | consumed tokens: 85684387840 | elapsed time per iteration (s): 0.15 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 3.658769E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.398 | TFLOPs: 26.29 | +7: iteration 163440/ 173500 | consumed samples: 41840640 | consumed tokens: 85689630720 | elapsed time per iteration (s): 0.15 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 3.653038E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.843 | TFLOPs: 26.23 | +7: iteration 163450/ 173500 | consumed samples: 41843200 | consumed tokens: 85694873600 | elapsed time per iteration (s): 0.15 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 3.670765E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.703 | TFLOPs: 26.22 | +7: iteration 163460/ 173500 | consumed samples: 41845760 | consumed tokens: 85700116480 | elapsed time per iteration (s): 0.15 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 3.665871E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.093 | TFLOPs: 26.21 | +7: iteration 163470/ 173500 | consumed samples: 41848320 | consumed tokens: 85705359360 | elapsed time per iteration (s): 0.15 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 3.653141E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.364 | TFLOPs: 26.24 | +7: iteration 163480/ 173500 | consumed samples: 41850880 | consumed tokens: 85710602240 | elapsed time per iteration (s): 0.15 | learning rate: 2.151E-05 | global batch size: 256 | lm loss: 3.660511E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.456 | TFLOPs: 26.29 | +7: iteration 163490/ 173500 | consumed samples: 41853440 | consumed tokens: 85715845120 | elapsed time per iteration (s): 0.15 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.658001E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.596 | TFLOPs: 25.92 | +7: iteration 163500/ 173500 | consumed samples: 41856000 | consumed tokens: 85721088000 | elapsed time per iteration (s): 0.15 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.646152E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.099 | TFLOPs: 26.25 | +7: iteration 163510/ 173500 | consumed samples: 41858560 | consumed tokens: 85726330880 | elapsed time per iteration (s): 0.15 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.661091E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.802 | TFLOPs: 26.16 | +7: iteration 163520/ 173500 | consumed samples: 41861120 | consumed tokens: 85731573760 | elapsed time per iteration (s): 0.16 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.670171E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.315 | TFLOPs: 25.39 | +7: iteration 163530/ 173500 | consumed samples: 41863680 | consumed tokens: 85736816640 | elapsed time per iteration (s): 0.16 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 3.662713E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.388 | TFLOPs: 25.76 | +7: iteration 163540/ 173500 | consumed samples: 41866240 | consumed tokens: 85742059520 | elapsed time per iteration (s): 0.15 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 3.657528E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.271 | TFLOPs: 26.23 | +7: iteration 163550/ 173500 | consumed samples: 41868800 | consumed tokens: 85747302400 | elapsed time per iteration (s): 0.16 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 3.659597E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1577.842 | TFLOPs: 24.74 | +7: iteration 163560/ 173500 | consumed samples: 41871360 | consumed tokens: 85752545280 | elapsed time per iteration (s): 0.16 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 3.662702E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.899 | TFLOPs: 25.81 | +7: iteration 163570/ 173500 | consumed samples: 41873920 | consumed tokens: 85757788160 | elapsed time per iteration (s): 0.15 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 3.681168E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.861 | TFLOPs: 26.22 | +7: iteration 163580/ 173500 | consumed samples: 41876480 | consumed tokens: 85763031040 | elapsed time per iteration (s): 0.15 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 3.661810E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.418 | TFLOPs: 26.26 | +7: iteration 163590/ 173500 | consumed samples: 41879040 | consumed tokens: 85768273920 | elapsed time per iteration (s): 0.15 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.667759E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.134 | TFLOPs: 26.27 | +7: iteration 163600/ 173500 | consumed samples: 41881600 | consumed tokens: 85773516800 | elapsed time per iteration (s): 0.16 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.667237E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.754 | TFLOPs: 25.37 | +7: iteration 163610/ 173500 | consumed samples: 41884160 | consumed tokens: 85778759680 | elapsed time per iteration (s): 0.15 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.656358E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.112 | TFLOPs: 26.27 | +7: iteration 163620/ 173500 | consumed samples: 41886720 | consumed tokens: 85784002560 | elapsed time per iteration (s): 0.16 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.661565E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.489 | TFLOPs: 25.87 | +7: iteration 163630/ 173500 | consumed samples: 41889280 | consumed tokens: 85789245440 | elapsed time per iteration (s): 0.15 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.651902E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.230 | TFLOPs: 26.02 | +7: iteration 163640/ 173500 | consumed samples: 41891840 | consumed tokens: 85794488320 | elapsed time per iteration (s): 0.16 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.647671E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.322 | TFLOPs: 25.85 | +7: iteration 163650/ 173500 | consumed samples: 41894400 | consumed tokens: 85799731200 | elapsed time per iteration (s): 0.15 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.667482E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.713 | TFLOPs: 26.26 | +7: iteration 163660/ 173500 | consumed samples: 41896960 | consumed tokens: 85804974080 | elapsed time per iteration (s): 0.15 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 3.665793E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.188 | TFLOPs: 26.29 | +7: iteration 163670/ 173500 | consumed samples: 41899520 | consumed tokens: 85810216960 | elapsed time per iteration (s): 0.15 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 3.666259E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.125 | TFLOPs: 26.35 | +7: iteration 163680/ 173500 | consumed samples: 41902080 | consumed tokens: 85815459840 | elapsed time per iteration (s): 0.15 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 3.662734E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.953 | TFLOPs: 26.33 | +7: iteration 163690/ 173500 | consumed samples: 41904640 | consumed tokens: 85820702720 | elapsed time per iteration (s): 0.15 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 3.667699E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.672 | TFLOPs: 26.34 | +7: iteration 163700/ 173500 | consumed samples: 41907200 | consumed tokens: 85825945600 | elapsed time per iteration (s): 0.15 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 3.662299E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.349 | TFLOPs: 26.26 | +7: iteration 163710/ 173500 | consumed samples: 41909760 | consumed tokens: 85831188480 | elapsed time per iteration (s): 0.16 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 3.669269E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.595 | TFLOPs: 25.04 | +7: iteration 163720/ 173500 | consumed samples: 41912320 | consumed tokens: 85836431360 | elapsed time per iteration (s): 0.15 | learning rate: 2.144E-05 | global batch size: 256 | lm loss: 3.663789E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.482 | TFLOPs: 26.17 | +7: iteration 163730/ 173500 | consumed samples: 41914880 | consumed tokens: 85841674240 | elapsed time per iteration (s): 0.17 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.655672E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.687 | TFLOPs: 24.10 | +7: iteration 163740/ 173500 | consumed samples: 41917440 | consumed tokens: 85846917120 | elapsed time per iteration (s): 0.15 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.665951E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.159 | TFLOPs: 26.13 | +7: iteration 163750/ 173500 | consumed samples: 41920000 | consumed tokens: 85852160000 | elapsed time per iteration (s): 0.15 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.667231E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.720 | TFLOPs: 26.09 | +7: iteration 163760/ 173500 | consumed samples: 41922560 | consumed tokens: 85857402880 | elapsed time per iteration (s): 0.16 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.666029E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.970 | TFLOPs: 24.89 | +7: iteration 163770/ 173500 | consumed samples: 41925120 | consumed tokens: 85862645760 | elapsed time per iteration (s): 0.16 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.656331E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.697 | TFLOPs: 25.62 | +7: iteration 163780/ 173500 | consumed samples: 41927680 | consumed tokens: 85867888640 | elapsed time per iteration (s): 0.16 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.634496E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.085 | TFLOPs: 24.95 | +7: iteration 163790/ 173500 | consumed samples: 41930240 | consumed tokens: 85873131520 | elapsed time per iteration (s): 0.15 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.679140E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.591 | TFLOPs: 26.09 | +7: iteration 163800/ 173500 | consumed samples: 41932800 | consumed tokens: 85878374400 | elapsed time per iteration (s): 0.15 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.652834E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.598 | TFLOPs: 26.18 | +7: iteration 163810/ 173500 | consumed samples: 41935360 | consumed tokens: 85883617280 | elapsed time per iteration (s): 0.15 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.652237E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.110 | TFLOPs: 26.14 | +7: iteration 163820/ 173500 | consumed samples: 41937920 | consumed tokens: 85888860160 | elapsed time per iteration (s): 0.15 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.659618E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.786 | TFLOPs: 26.14 | +7: iteration 163830/ 173500 | consumed samples: 41940480 | consumed tokens: 85894103040 | elapsed time per iteration (s): 0.15 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.658795E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.312 | TFLOPs: 26.16 | +7: iteration 163840/ 173500 | consumed samples: 41943040 | consumed tokens: 85899345920 | elapsed time per iteration (s): 0.15 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.658528E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.914 | TFLOPs: 26.16 | +7: iteration 163850/ 173500 | consumed samples: 41945600 | consumed tokens: 85904588800 | elapsed time per iteration (s): 0.15 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.654623E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.927 | TFLOPs: 25.92 | +7: iteration 163860/ 173500 | consumed samples: 41948160 | consumed tokens: 85909831680 | elapsed time per iteration (s): 0.15 | learning rate: 2.140E-05 | global batch size: 256 | lm loss: 3.657981E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.909 | TFLOPs: 26.17 | +7: iteration 163870/ 173500 | consumed samples: 41950720 | consumed tokens: 85915074560 | elapsed time per iteration (s): 0.15 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 3.668559E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.554 | TFLOPs: 26.18 | +7: iteration 163880/ 173500 | consumed samples: 41953280 | consumed tokens: 85920317440 | elapsed time per iteration (s): 0.15 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 3.649511E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.246 | TFLOPs: 26.18 | +7: iteration 163890/ 173500 | consumed samples: 41955840 | consumed tokens: 85925560320 | elapsed time per iteration (s): 0.15 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 3.666501E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.911 | TFLOPs: 26.20 | +7: iteration 163900/ 173500 | consumed samples: 41958400 | consumed tokens: 85930803200 | elapsed time per iteration (s): 0.15 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.654119E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.211 | TFLOPs: 26.33 | +7: iteration 163910/ 173500 | consumed samples: 41960960 | consumed tokens: 85936046080 | elapsed time per iteration (s): 0.15 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.660665E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.935 | TFLOPs: 26.36 | +7: iteration 163920/ 173500 | consumed samples: 41963520 | consumed tokens: 85941288960 | elapsed time per iteration (s): 0.16 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.657338E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.511 | TFLOPs: 25.90 | +7: iteration 163930/ 173500 | consumed samples: 41966080 | consumed tokens: 85946531840 | elapsed time per iteration (s): 0.15 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.650255E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.654 | TFLOPs: 26.33 | +7: iteration 163940/ 173500 | consumed samples: 41968640 | consumed tokens: 85951774720 | elapsed time per iteration (s): 0.15 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.669940E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.756 | TFLOPs: 26.33 | +7: iteration 163950/ 173500 | consumed samples: 41971200 | consumed tokens: 85957017600 | elapsed time per iteration (s): 0.15 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.666362E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.693 | TFLOPs: 26.34 | +7: iteration 163960/ 173500 | consumed samples: 41973760 | consumed tokens: 85962260480 | elapsed time per iteration (s): 0.15 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.655947E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.678 | TFLOPs: 26.36 | +7: iteration 163970/ 173500 | consumed samples: 41976320 | consumed tokens: 85967503360 | elapsed time per iteration (s): 0.15 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.655994E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.860 | TFLOPs: 26.34 | +7: iteration 163980/ 173500 | consumed samples: 41978880 | consumed tokens: 85972746240 | elapsed time per iteration (s): 0.15 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.654009E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.163 | TFLOPs: 26.32 | +7: iteration 163990/ 173500 | consumed samples: 41981440 | consumed tokens: 85977989120 | elapsed time per iteration (s): 0.15 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.658943E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.955 | TFLOPs: 26.27 | +0: [2023-03-17 07:23:26,055] [INFO] [logging.py:68:log_dist] [Rank 0] step=164000, skipped=0, lr=[2.1355330909017464e-05, 2.1355330909017464e-05, 2.1355330909017464e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 164000/ 173500 | consumed samples: 41984000 | consumed tokens: 85983232000 | elapsed time per iteration (s): 0.15 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.648326E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.331 | TFLOPs: 26.27 | +0: steps: 164000 loss: 3.6211 iter time (s): 0.154 samples/sec: 1661.511 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 164000 | lm loss value: 3.792676E+00 | lm loss PPL: 4.437501E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 164000 to checkpoints_44m91b100m +0: [2023-03-17 07:23:26,131] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step164000 is begin to save! +0: [2023-03-17 07:23:26,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:23:26,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:23:26,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:23:26,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:23:26,213] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:23:26,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:23:26,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:23:26,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:23:26,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:23:26,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:23:26,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:23:26,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:23:26,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:23:26,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:23:26,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:23:26,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:23:26,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:23:26,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:23:26,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:23:26,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:23:26,272] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step164000/mp_rank_00_model_states.pt +0: [2023-03-17 07:23:26,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:23:26,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:23:26,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:23:26,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,304] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,306] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,306] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,307] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,307] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +5: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +2: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +1: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +7: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +6: [2023-03-17 07:23:26,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:23:26,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +4: [2023-03-17 07:23:26,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:23:26,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:23:26,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +3: [2023-03-17 07:23:26,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step164000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:23:26,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step164000 is ready now! +0: successfully saved checkpoint at iteration 164000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.54 +7: iteration 164010/ 173500 | consumed samples: 41986560 | consumed tokens: 85988474880 | elapsed time per iteration (s): 0.18 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 3.671315E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1431.065 | TFLOPs: 22.44 | +7: iteration 164020/ 173500 | consumed samples: 41989120 | consumed tokens: 85993717760 | elapsed time per iteration (s): 0.16 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 3.654208E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1561.237 | TFLOPs: 24.48 | +7: iteration 164030/ 173500 | consumed samples: 41991680 | consumed tokens: 85998960640 | elapsed time per iteration (s): 0.15 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 3.663371E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.791 | TFLOPs: 26.22 | +7: iteration 164040/ 173500 | consumed samples: 41994240 | consumed tokens: 86004203520 | elapsed time per iteration (s): 0.15 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.663436E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.817 | TFLOPs: 26.19 | +7: iteration 164050/ 173500 | consumed samples: 41996800 | consumed tokens: 86009446400 | elapsed time per iteration (s): 0.16 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.660863E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.429 | TFLOPs: 25.80 | +7: iteration 164060/ 173500 | consumed samples: 41999360 | consumed tokens: 86014689280 | elapsed time per iteration (s): 0.15 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.672851E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.238 | TFLOPs: 26.19 | +7: iteration 164070/ 173500 | consumed samples: 42001920 | consumed tokens: 86019932160 | elapsed time per iteration (s): 0.16 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.672842E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.498 | TFLOPs: 25.68 | +7: iteration 164080/ 173500 | consumed samples: 42004480 | consumed tokens: 86025175040 | elapsed time per iteration (s): 0.15 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 3.671449E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.600 | TFLOPs: 26.21 | +7: iteration 164090/ 173500 | consumed samples: 42007040 | consumed tokens: 86030417920 | elapsed time per iteration (s): 0.15 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 3.658887E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.361 | TFLOPs: 26.21 | +7: iteration 164100/ 173500 | consumed samples: 42009600 | consumed tokens: 86035660800 | elapsed time per iteration (s): 0.16 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 3.664026E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.700 | TFLOPs: 24.90 | +7: iteration 164110/ 173500 | consumed samples: 42012160 | consumed tokens: 86040903680 | elapsed time per iteration (s): 0.18 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.652938E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1422.220 | TFLOPs: 22.30 | +7: iteration 164120/ 173500 | consumed samples: 42014720 | consumed tokens: 86046146560 | elapsed time per iteration (s): 0.16 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.655289E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.582 | TFLOPs: 25.12 | +7: iteration 164130/ 173500 | consumed samples: 42017280 | consumed tokens: 86051389440 | elapsed time per iteration (s): 0.15 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.659583E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.211 | TFLOPs: 26.19 | +7: iteration 164140/ 173500 | consumed samples: 42019840 | consumed tokens: 86056632320 | elapsed time per iteration (s): 0.15 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.659374E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.609 | TFLOPs: 26.18 | +7: iteration 164150/ 173500 | consumed samples: 42022400 | consumed tokens: 86061875200 | elapsed time per iteration (s): 0.15 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 3.667017E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.951 | TFLOPs: 26.20 | +7: iteration 164160/ 173500 | consumed samples: 42024960 | consumed tokens: 86067118080 | elapsed time per iteration (s): 0.16 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 3.678600E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.023 | TFLOPs: 24.90 | +7: iteration 164170/ 173500 | consumed samples: 42027520 | consumed tokens: 86072360960 | elapsed time per iteration (s): 0.16 | learning rate: 2.131E-05 | global batch size: 256 | lm loss: 3.663405E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.340 | TFLOPs: 25.76 | +7: iteration 164180/ 173500 | consumed samples: 42030080 | consumed tokens: 86077603840 | elapsed time per iteration (s): 0.15 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.662967E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.882 | TFLOPs: 26.19 | +7: iteration 164190/ 173500 | consumed samples: 42032640 | consumed tokens: 86082846720 | elapsed time per iteration (s): 0.15 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.657206E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.092 | TFLOPs: 26.19 | +7: iteration 164200/ 173500 | consumed samples: 42035200 | consumed tokens: 86088089600 | elapsed time per iteration (s): 0.15 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.663611E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.253 | TFLOPs: 26.18 | +7: iteration 164210/ 173500 | consumed samples: 42037760 | consumed tokens: 86093332480 | elapsed time per iteration (s): 0.16 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.669286E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.319 | TFLOPs: 25.80 | +7: iteration 164220/ 173500 | consumed samples: 42040320 | consumed tokens: 86098575360 | elapsed time per iteration (s): 0.15 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.661398E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.938 | TFLOPs: 26.16 | +7: iteration 164230/ 173500 | consumed samples: 42042880 | consumed tokens: 86103818240 | elapsed time per iteration (s): 0.16 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.664709E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.506 | TFLOPs: 25.81 | +7: iteration 164240/ 173500 | consumed samples: 42045440 | consumed tokens: 86109061120 | elapsed time per iteration (s): 0.15 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.658542E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.063 | TFLOPs: 26.10 | +7: iteration 164250/ 173500 | consumed samples: 42048000 | consumed tokens: 86114304000 | elapsed time per iteration (s): 0.15 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.672234E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.870 | TFLOPs: 26.17 | +7: iteration 164260/ 173500 | consumed samples: 42050560 | consumed tokens: 86119546880 | elapsed time per iteration (s): 0.16 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.661186E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.317 | TFLOPs: 25.82 | +7: iteration 164270/ 173500 | consumed samples: 42053120 | consumed tokens: 86124789760 | elapsed time per iteration (s): 0.15 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.673557E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.280 | TFLOPs: 25.96 | +7: iteration 164280/ 173500 | consumed samples: 42055680 | consumed tokens: 86130032640 | elapsed time per iteration (s): 0.16 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.662365E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.839 | TFLOPs: 24.65 | +7: iteration 164290/ 173500 | consumed samples: 42058240 | consumed tokens: 86135275520 | elapsed time per iteration (s): 0.16 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.655187E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.816 | TFLOPs: 25.84 | +7: iteration 164300/ 173500 | consumed samples: 42060800 | consumed tokens: 86140518400 | elapsed time per iteration (s): 0.16 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.662323E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.470 | TFLOPs: 25.35 | +7: iteration 164310/ 173500 | consumed samples: 42063360 | consumed tokens: 86145761280 | elapsed time per iteration (s): 0.16 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.664312E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.354 | TFLOPs: 25.54 | +7: iteration 164320/ 173500 | consumed samples: 42065920 | consumed tokens: 86151004160 | elapsed time per iteration (s): 0.16 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.660498E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.622 | TFLOPs: 25.79 | +7: iteration 164330/ 173500 | consumed samples: 42068480 | consumed tokens: 86156247040 | elapsed time per iteration (s): 0.15 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 3.654692E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.875 | TFLOPs: 26.02 | +7: iteration 164340/ 173500 | consumed samples: 42071040 | consumed tokens: 86161489920 | elapsed time per iteration (s): 0.16 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 3.666821E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.897 | TFLOPs: 25.01 | +7: iteration 164350/ 173500 | consumed samples: 42073600 | consumed tokens: 86166732800 | elapsed time per iteration (s): 0.15 | learning rate: 2.126E-05 | global batch size: 256 | lm loss: 3.669603E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.663 | TFLOPs: 26.01 | +7: iteration 164360/ 173500 | consumed samples: 42076160 | consumed tokens: 86171975680 | elapsed time per iteration (s): 0.15 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.662688E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.697 | TFLOPs: 26.22 | +7: iteration 164370/ 173500 | consumed samples: 42078720 | consumed tokens: 86177218560 | elapsed time per iteration (s): 0.15 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.643305E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.903 | TFLOPs: 25.95 | +7: iteration 164380/ 173500 | consumed samples: 42081280 | consumed tokens: 86182461440 | elapsed time per iteration (s): 0.15 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.658707E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.208 | TFLOPs: 26.19 | +7: iteration 164390/ 173500 | consumed samples: 42083840 | consumed tokens: 86187704320 | elapsed time per iteration (s): 0.15 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.660848E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.510 | TFLOPs: 26.20 | +7: iteration 164400/ 173500 | consumed samples: 42086400 | consumed tokens: 86192947200 | elapsed time per iteration (s): 0.15 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.664282E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.876 | TFLOPs: 26.22 | +7: iteration 164410/ 173500 | consumed samples: 42088960 | consumed tokens: 86198190080 | elapsed time per iteration (s): 0.15 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.643613E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.990 | TFLOPs: 26.21 | +7: iteration 164420/ 173500 | consumed samples: 42091520 | consumed tokens: 86203432960 | elapsed time per iteration (s): 0.16 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.659309E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.272 | TFLOPs: 25.61 | +7: iteration 164430/ 173500 | consumed samples: 42094080 | consumed tokens: 86208675840 | elapsed time per iteration (s): 0.15 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.659649E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.758 | TFLOPs: 26.26 | +7: iteration 164440/ 173500 | consumed samples: 42096640 | consumed tokens: 86213918720 | elapsed time per iteration (s): 0.15 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.658281E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.845 | TFLOPs: 26.25 | +7: iteration 164450/ 173500 | consumed samples: 42099200 | consumed tokens: 86219161600 | elapsed time per iteration (s): 0.16 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.661815E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.471 | TFLOPs: 25.63 | +7: iteration 164460/ 173500 | consumed samples: 42101760 | consumed tokens: 86224404480 | elapsed time per iteration (s): 0.15 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.679360E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.701 | TFLOPs: 26.23 | +7: iteration 164470/ 173500 | consumed samples: 42104320 | consumed tokens: 86229647360 | elapsed time per iteration (s): 0.15 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.670837E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.100 | TFLOPs: 26.21 | +7: iteration 164480/ 173500 | consumed samples: 42106880 | consumed tokens: 86234890240 | elapsed time per iteration (s): 0.15 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.675944E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.233 | TFLOPs: 26.21 | +7: iteration 164490/ 173500 | consumed samples: 42109440 | consumed tokens: 86240133120 | elapsed time per iteration (s): 0.15 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.657872E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.293 | TFLOPs: 26.21 | +7: iteration 164500/ 173500 | consumed samples: 42112000 | consumed tokens: 86245376000 | elapsed time per iteration (s): 0.15 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.661460E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.640 | TFLOPs: 25.95 | +7: iteration 164510/ 173500 | consumed samples: 42114560 | consumed tokens: 86250618880 | elapsed time per iteration (s): 0.16 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.663816E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.096 | TFLOPs: 25.86 | +7: iteration 164520/ 173500 | consumed samples: 42117120 | consumed tokens: 86255861760 | elapsed time per iteration (s): 0.15 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.664297E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.131 | TFLOPs: 26.18 | +7: iteration 164530/ 173500 | consumed samples: 42119680 | consumed tokens: 86261104640 | elapsed time per iteration (s): 0.15 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.661608E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.476 | TFLOPs: 26.18 | +7: iteration 164540/ 173500 | consumed samples: 42122240 | consumed tokens: 86266347520 | elapsed time per iteration (s): 0.15 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.652654E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.711 | TFLOPs: 26.17 | +7: iteration 164550/ 173500 | consumed samples: 42124800 | consumed tokens: 86271590400 | elapsed time per iteration (s): 0.16 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.647321E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.951 | TFLOPs: 25.03 | +7: iteration 164560/ 173500 | consumed samples: 42127360 | consumed tokens: 86276833280 | elapsed time per iteration (s): 0.15 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.656454E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.379 | TFLOPs: 26.24 | +7: iteration 164570/ 173500 | consumed samples: 42129920 | consumed tokens: 86282076160 | elapsed time per iteration (s): 0.15 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.667738E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.222 | TFLOPs: 26.22 | +7: iteration 164580/ 173500 | consumed samples: 42132480 | consumed tokens: 86287319040 | elapsed time per iteration (s): 0.15 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.667310E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.701 | TFLOPs: 26.26 | +7: iteration 164590/ 173500 | consumed samples: 42135040 | consumed tokens: 86292561920 | elapsed time per iteration (s): 0.16 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.672959E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.099 | TFLOPs: 25.75 | +7: iteration 164600/ 173500 | consumed samples: 42137600 | consumed tokens: 86297804800 | elapsed time per iteration (s): 0.15 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.668960E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.066 | TFLOPs: 26.14 | +7: iteration 164610/ 173500 | consumed samples: 42140160 | consumed tokens: 86303047680 | elapsed time per iteration (s): 0.15 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.658989E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.606 | TFLOPs: 26.15 | +7: iteration 164620/ 173500 | consumed samples: 42142720 | consumed tokens: 86308290560 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.667810E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.220 | TFLOPs: 26.16 | +7: iteration 164630/ 173500 | consumed samples: 42145280 | consumed tokens: 86313533440 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.657987E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.276 | TFLOPs: 26.13 | +7: iteration 164640/ 173500 | consumed samples: 42147840 | consumed tokens: 86318776320 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.678696E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.364 | TFLOPs: 26.15 | +7: iteration 164650/ 173500 | consumed samples: 42150400 | consumed tokens: 86324019200 | elapsed time per iteration (s): 0.15 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.667649E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.451 | TFLOPs: 26.12 | +7: iteration 164660/ 173500 | consumed samples: 42152960 | consumed tokens: 86329262080 | elapsed time per iteration (s): 0.15 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.665429E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.003 | TFLOPs: 26.16 | +7: iteration 164670/ 173500 | consumed samples: 42155520 | consumed tokens: 86334504960 | elapsed time per iteration (s): 0.15 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.664133E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.957 | TFLOPs: 26.16 | +7: iteration 164680/ 173500 | consumed samples: 42158080 | consumed tokens: 86339747840 | elapsed time per iteration (s): 0.15 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.660252E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.172 | TFLOPs: 26.19 | +7: iteration 164690/ 173500 | consumed samples: 42160640 | consumed tokens: 86344990720 | elapsed time per iteration (s): 0.15 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.658906E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.625 | TFLOPs: 26.18 | +7: iteration 164700/ 173500 | consumed samples: 42163200 | consumed tokens: 86350233600 | elapsed time per iteration (s): 0.15 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.661890E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.286 | TFLOPs: 26.18 | +7: iteration 164710/ 173500 | consumed samples: 42165760 | consumed tokens: 86355476480 | elapsed time per iteration (s): 0.15 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.665403E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.734 | TFLOPs: 26.17 | +7: iteration 164720/ 173500 | consumed samples: 42168320 | consumed tokens: 86360719360 | elapsed time per iteration (s): 0.15 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.654024E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.381 | TFLOPs: 26.16 | +7: iteration 164730/ 173500 | consumed samples: 42170880 | consumed tokens: 86365962240 | elapsed time per iteration (s): 0.15 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.676487E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.467 | TFLOPs: 26.18 | +7: iteration 164740/ 173500 | consumed samples: 42173440 | consumed tokens: 86371205120 | elapsed time per iteration (s): 0.15 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.653408E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.507 | TFLOPs: 26.18 | +7: iteration 164750/ 173500 | consumed samples: 42176000 | consumed tokens: 86376448000 | elapsed time per iteration (s): 0.15 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.659220E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.575 | TFLOPs: 26.17 | +7: iteration 164760/ 173500 | consumed samples: 42178560 | consumed tokens: 86381690880 | elapsed time per iteration (s): 0.15 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.677812E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.341 | TFLOPs: 26.20 | +7: iteration 164770/ 173500 | consumed samples: 42181120 | consumed tokens: 86386933760 | elapsed time per iteration (s): 0.15 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.667002E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.208 | TFLOPs: 26.18 | +7: iteration 164780/ 173500 | consumed samples: 42183680 | consumed tokens: 86392176640 | elapsed time per iteration (s): 0.15 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.661947E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.487 | TFLOPs: 26.20 | +7: iteration 164790/ 173500 | consumed samples: 42186240 | consumed tokens: 86397419520 | elapsed time per iteration (s): 0.15 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.652121E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.699 | TFLOPs: 26.19 | +7: iteration 164800/ 173500 | consumed samples: 42188800 | consumed tokens: 86402662400 | elapsed time per iteration (s): 0.16 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.667431E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.352 | TFLOPs: 25.60 | +7: iteration 164810/ 173500 | consumed samples: 42191360 | consumed tokens: 86407905280 | elapsed time per iteration (s): 0.16 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.667990E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.130 | TFLOPs: 25.55 | +7: iteration 164820/ 173500 | consumed samples: 42193920 | consumed tokens: 86413148160 | elapsed time per iteration (s): 0.15 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.662178E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.749 | TFLOPs: 26.19 | +7: iteration 164830/ 173500 | consumed samples: 42196480 | consumed tokens: 86418391040 | elapsed time per iteration (s): 0.16 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.664215E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.395 | TFLOPs: 25.63 | +7: iteration 164840/ 173500 | consumed samples: 42199040 | consumed tokens: 86423633920 | elapsed time per iteration (s): 0.15 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.666872E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.721 | TFLOPs: 25.97 | +7: iteration 164850/ 173500 | consumed samples: 42201600 | consumed tokens: 86428876800 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.656791E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.252 | TFLOPs: 26.18 | +7: iteration 164860/ 173500 | consumed samples: 42204160 | consumed tokens: 86434119680 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.659230E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.069 | TFLOPs: 26.19 | +7: iteration 164870/ 173500 | consumed samples: 42206720 | consumed tokens: 86439362560 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.650243E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.117 | TFLOPs: 26.19 | +7: iteration 164880/ 173500 | consumed samples: 42209280 | consumed tokens: 86444605440 | elapsed time per iteration (s): 0.15 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.655334E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.152 | TFLOPs: 26.21 | +7: iteration 164890/ 173500 | consumed samples: 42211840 | consumed tokens: 86449848320 | elapsed time per iteration (s): 0.15 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.653009E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.964 | TFLOPs: 26.19 | +7: iteration 164900/ 173500 | consumed samples: 42214400 | consumed tokens: 86455091200 | elapsed time per iteration (s): 0.15 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.653304E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.863 | TFLOPs: 26.19 | +7: iteration 164910/ 173500 | consumed samples: 42216960 | consumed tokens: 86460334080 | elapsed time per iteration (s): 0.15 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.673641E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.909 | TFLOPs: 26.17 | +7: iteration 164920/ 173500 | consumed samples: 42219520 | consumed tokens: 86465576960 | elapsed time per iteration (s): 0.15 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.651714E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.450 | TFLOPs: 26.17 | +7: iteration 164930/ 173500 | consumed samples: 42222080 | consumed tokens: 86470819840 | elapsed time per iteration (s): 0.16 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.656921E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.616 | TFLOPs: 25.67 | +7: iteration 164940/ 173500 | consumed samples: 42224640 | consumed tokens: 86476062720 | elapsed time per iteration (s): 0.15 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.666481E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.551 | TFLOPs: 26.10 | +7: iteration 164950/ 173500 | consumed samples: 42227200 | consumed tokens: 86481305600 | elapsed time per iteration (s): 0.15 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.673407E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.674 | TFLOPs: 26.15 | +7: iteration 164960/ 173500 | consumed samples: 42229760 | consumed tokens: 86486548480 | elapsed time per iteration (s): 0.16 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.666721E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.293 | TFLOPs: 25.30 | +7: iteration 164970/ 173500 | consumed samples: 42232320 | consumed tokens: 86491791360 | elapsed time per iteration (s): 0.15 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.660103E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.289 | TFLOPs: 26.15 | +7: iteration 164980/ 173500 | consumed samples: 42234880 | consumed tokens: 86497034240 | elapsed time per iteration (s): 0.16 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.672682E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.078 | TFLOPs: 25.80 | +7: iteration 164990/ 173500 | consumed samples: 42237440 | consumed tokens: 86502277120 | elapsed time per iteration (s): 0.15 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.659609E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.153 | TFLOPs: 26.10 | +7: iteration 165000/ 173500 | consumed samples: 42240000 | consumed tokens: 86507520000 | elapsed time per iteration (s): 0.16 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.655975E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.707 | TFLOPs: 25.65 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 165000 | lm loss value: 3.847353E+00 | lm loss PPL: 4.686881E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 165000 to checkpoints_44m91b100m +0: [2023-03-17 07:26:01,239] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step165000 is begin to save! +0: [2023-03-17 07:26:01,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:26:01,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:26:01,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:26:01,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:26:01,316] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:26:01,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:26:01,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:26:01,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:26:01,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:26:01,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:26:01,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:26:01,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:26:01,349] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:26:01,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:26:01,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:26:01,365] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:26:01,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:26:01,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:26:01,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:26:01,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:26:01,374] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step165000/mp_rank_00_model_states.pt +0: [2023-03-17 07:26:01,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:26:01,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,395] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:26:01,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:26:01,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +6: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +5: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +2: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +4: [2023-03-17 07:26:01,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +7: [2023-03-17 07:26:01,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:26:01,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:26:01,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +1: [2023-03-17 07:26:01,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:26:01,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step165000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:26:01,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step165000 is ready now! +0: successfully saved checkpoint at iteration 165000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 186.52 +7: iteration 165010/ 173500 | consumed samples: 42242560 | consumed tokens: 86512762880 | elapsed time per iteration (s): 0.18 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.654958E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1440.250 | TFLOPs: 22.59 | +7: iteration 165020/ 173500 | consumed samples: 42245120 | consumed tokens: 86518005760 | elapsed time per iteration (s): 0.16 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.657970E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.512 | TFLOPs: 25.76 | +7: iteration 165030/ 173500 | consumed samples: 42247680 | consumed tokens: 86523248640 | elapsed time per iteration (s): 0.16 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.661866E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.647 | TFLOPs: 25.68 | +7: iteration 165040/ 173500 | consumed samples: 42250240 | consumed tokens: 86528491520 | elapsed time per iteration (s): 0.15 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.659772E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.043 | TFLOPs: 26.19 | +7: iteration 165050/ 173500 | consumed samples: 42252800 | consumed tokens: 86533734400 | elapsed time per iteration (s): 0.15 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.665535E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.374 | TFLOPs: 26.20 | +7: iteration 165060/ 173500 | consumed samples: 42255360 | consumed tokens: 86538977280 | elapsed time per iteration (s): 0.15 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.664353E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.597 | TFLOPs: 25.98 | +7: iteration 165070/ 173500 | consumed samples: 42257920 | consumed tokens: 86544220160 | elapsed time per iteration (s): 0.15 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.679633E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.746 | TFLOPs: 26.17 | +7: iteration 165080/ 173500 | consumed samples: 42260480 | consumed tokens: 86549463040 | elapsed time per iteration (s): 0.15 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.661303E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.685 | TFLOPs: 25.98 | +7: iteration 165090/ 173500 | consumed samples: 42263040 | consumed tokens: 86554705920 | elapsed time per iteration (s): 0.15 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.662399E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.619 | TFLOPs: 26.18 | +7: iteration 165100/ 173500 | consumed samples: 42265600 | consumed tokens: 86559948800 | elapsed time per iteration (s): 0.15 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.666561E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.295 | TFLOPs: 26.19 | +7: iteration 165110/ 173500 | consumed samples: 42268160 | consumed tokens: 86565191680 | elapsed time per iteration (s): 0.15 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.661477E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.344 | TFLOPs: 26.18 | +7: iteration 165120/ 173500 | consumed samples: 42270720 | consumed tokens: 86570434560 | elapsed time per iteration (s): 0.16 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.657093E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.972 | TFLOPs: 25.81 | +7: iteration 165130/ 173500 | consumed samples: 42273280 | consumed tokens: 86575677440 | elapsed time per iteration (s): 0.15 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.669645E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.323 | TFLOPs: 25.99 | +7: iteration 165140/ 173500 | consumed samples: 42275840 | consumed tokens: 86580920320 | elapsed time per iteration (s): 0.16 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.663023E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.663 | TFLOPs: 24.51 | +7: iteration 165150/ 173500 | consumed samples: 42278400 | consumed tokens: 86586163200 | elapsed time per iteration (s): 0.16 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.668594E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.298 | TFLOPs: 25.87 | +7: iteration 165160/ 173500 | consumed samples: 42280960 | consumed tokens: 86591406080 | elapsed time per iteration (s): 0.16 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.650631E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.802 | TFLOPs: 25.54 | +7: iteration 165170/ 173500 | consumed samples: 42283520 | consumed tokens: 86596648960 | elapsed time per iteration (s): 0.15 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.668358E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.705 | TFLOPs: 26.17 | +7: iteration 165180/ 173500 | consumed samples: 42286080 | consumed tokens: 86601891840 | elapsed time per iteration (s): 0.15 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.651379E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.402 | TFLOPs: 26.18 | +7: iteration 165190/ 173500 | consumed samples: 42288640 | consumed tokens: 86607134720 | elapsed time per iteration (s): 0.16 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.662334E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.176 | TFLOPs: 25.80 | +7: iteration 165200/ 173500 | consumed samples: 42291200 | consumed tokens: 86612377600 | elapsed time per iteration (s): 0.16 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.659727E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.842 | TFLOPs: 25.81 | +7: iteration 165210/ 173500 | consumed samples: 42293760 | consumed tokens: 86617620480 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.657893E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.919 | TFLOPs: 26.19 | +7: iteration 165220/ 173500 | consumed samples: 42296320 | consumed tokens: 86622863360 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.656233E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.317 | TFLOPs: 26.13 | +7: iteration 165230/ 173500 | consumed samples: 42298880 | consumed tokens: 86628106240 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.663315E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.965 | TFLOPs: 26.14 | +7: iteration 165240/ 173500 | consumed samples: 42301440 | consumed tokens: 86633349120 | elapsed time per iteration (s): 0.15 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.651009E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.401 | TFLOPs: 26.16 | +7: iteration 165250/ 173500 | consumed samples: 42304000 | consumed tokens: 86638592000 | elapsed time per iteration (s): 0.15 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.666696E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.934 | TFLOPs: 26.13 | +7: iteration 165260/ 173500 | consumed samples: 42306560 | consumed tokens: 86643834880 | elapsed time per iteration (s): 0.15 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.658835E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.099 | TFLOPs: 26.11 | +7: iteration 165270/ 173500 | consumed samples: 42309120 | consumed tokens: 86649077760 | elapsed time per iteration (s): 0.15 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.661508E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.742 | TFLOPs: 25.93 | +7: iteration 165280/ 173500 | consumed samples: 42311680 | consumed tokens: 86654320640 | elapsed time per iteration (s): 0.15 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.656560E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.948 | TFLOPs: 26.16 | +7: iteration 165290/ 173500 | consumed samples: 42314240 | consumed tokens: 86659563520 | elapsed time per iteration (s): 0.15 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.669310E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.650 | TFLOPs: 26.15 | +7: iteration 165300/ 173500 | consumed samples: 42316800 | consumed tokens: 86664806400 | elapsed time per iteration (s): 0.15 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.670459E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.740 | TFLOPs: 26.17 | +7: iteration 165310/ 173500 | consumed samples: 42319360 | consumed tokens: 86670049280 | elapsed time per iteration (s): 0.15 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.659487E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.945 | TFLOPs: 25.92 | +7: iteration 165320/ 173500 | consumed samples: 42321920 | consumed tokens: 86675292160 | elapsed time per iteration (s): 0.16 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.666516E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.818 | TFLOPs: 25.72 | +7: iteration 165330/ 173500 | consumed samples: 42324480 | consumed tokens: 86680535040 | elapsed time per iteration (s): 0.15 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.667846E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.469 | TFLOPs: 26.18 | +7: iteration 165340/ 173500 | consumed samples: 42327040 | consumed tokens: 86685777920 | elapsed time per iteration (s): 0.15 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.665466E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.226 | TFLOPs: 26.18 | +7: iteration 165350/ 173500 | consumed samples: 42329600 | consumed tokens: 86691020800 | elapsed time per iteration (s): 0.15 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.661862E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.098 | TFLOPs: 26.19 | +7: iteration 165360/ 173500 | consumed samples: 42332160 | consumed tokens: 86696263680 | elapsed time per iteration (s): 0.15 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.653604E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.834 | TFLOPs: 26.17 | +7: iteration 165370/ 173500 | consumed samples: 42334720 | consumed tokens: 86701506560 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.674158E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.987 | TFLOPs: 26.19 | +7: iteration 165380/ 173500 | consumed samples: 42337280 | consumed tokens: 86706749440 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.652808E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.903 | TFLOPs: 26.19 | +7: iteration 165390/ 173500 | consumed samples: 42339840 | consumed tokens: 86711992320 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.663506E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.155 | TFLOPs: 26.19 | +7: iteration 165400/ 173500 | consumed samples: 42342400 | consumed tokens: 86717235200 | elapsed time per iteration (s): 0.15 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.656144E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.005 | TFLOPs: 26.17 | +7: iteration 165410/ 173500 | consumed samples: 42344960 | consumed tokens: 86722478080 | elapsed time per iteration (s): 0.15 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.646476E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.454 | TFLOPs: 26.20 | +7: iteration 165420/ 173500 | consumed samples: 42347520 | consumed tokens: 86727720960 | elapsed time per iteration (s): 0.16 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.655274E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.419 | TFLOPs: 25.16 | +7: iteration 165430/ 173500 | consumed samples: 42350080 | consumed tokens: 86732963840 | elapsed time per iteration (s): 0.15 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.668110E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.937 | TFLOPs: 26.22 | +7: iteration 165440/ 173500 | consumed samples: 42352640 | consumed tokens: 86738206720 | elapsed time per iteration (s): 0.15 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.660189E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.948 | TFLOPs: 26.20 | +7: iteration 165450/ 173500 | consumed samples: 42355200 | consumed tokens: 86743449600 | elapsed time per iteration (s): 0.15 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.662379E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.231 | TFLOPs: 26.18 | +7: iteration 165460/ 173500 | consumed samples: 42357760 | consumed tokens: 86748692480 | elapsed time per iteration (s): 0.16 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.647846E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.005 | TFLOPs: 25.81 | +7: iteration 165470/ 173500 | consumed samples: 42360320 | consumed tokens: 86753935360 | elapsed time per iteration (s): 0.15 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.675652E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.614 | TFLOPs: 26.20 | +7: iteration 165480/ 173500 | consumed samples: 42362880 | consumed tokens: 86759178240 | elapsed time per iteration (s): 0.15 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.665892E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.002 | TFLOPs: 26.21 | +7: iteration 165490/ 173500 | consumed samples: 42365440 | consumed tokens: 86764421120 | elapsed time per iteration (s): 0.15 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.665585E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.023 | TFLOPs: 26.19 | +7: iteration 165500/ 173500 | consumed samples: 42368000 | consumed tokens: 86769664000 | elapsed time per iteration (s): 0.16 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.651608E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.196 | TFLOPs: 25.38 | +7: iteration 165510/ 173500 | consumed samples: 42370560 | consumed tokens: 86774906880 | elapsed time per iteration (s): 0.15 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.649162E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.757 | TFLOPs: 26.00 | +7: iteration 165520/ 173500 | consumed samples: 42373120 | consumed tokens: 86780149760 | elapsed time per iteration (s): 0.16 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.666679E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.238 | TFLOPs: 25.68 | +7: iteration 165530/ 173500 | consumed samples: 42375680 | consumed tokens: 86785392640 | elapsed time per iteration (s): 0.15 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.679414E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.136 | TFLOPs: 26.22 | +7: iteration 165540/ 173500 | consumed samples: 42378240 | consumed tokens: 86790635520 | elapsed time per iteration (s): 0.15 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.659927E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.747 | TFLOPs: 26.20 | +7: iteration 165550/ 173500 | consumed samples: 42380800 | consumed tokens: 86795878400 | elapsed time per iteration (s): 0.16 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.658696E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.377 | TFLOPs: 25.66 | +7: iteration 165560/ 173500 | consumed samples: 42383360 | consumed tokens: 86801121280 | elapsed time per iteration (s): 0.15 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.661579E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.627 | TFLOPs: 26.12 | +7: iteration 165570/ 173500 | consumed samples: 42385920 | consumed tokens: 86806364160 | elapsed time per iteration (s): 0.15 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.659834E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.784 | TFLOPs: 25.92 | +7: iteration 165580/ 173500 | consumed samples: 42388480 | consumed tokens: 86811607040 | elapsed time per iteration (s): 0.16 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.672318E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.381 | TFLOPs: 25.41 | +7: iteration 165590/ 173500 | consumed samples: 42391040 | consumed tokens: 86816849920 | elapsed time per iteration (s): 0.15 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.656419E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.193 | TFLOPs: 26.15 | +7: iteration 165600/ 173500 | consumed samples: 42393600 | consumed tokens: 86822092800 | elapsed time per iteration (s): 0.16 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.670948E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.951 | TFLOPs: 25.72 | +7: iteration 165610/ 173500 | consumed samples: 42396160 | consumed tokens: 86827335680 | elapsed time per iteration (s): 0.16 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.664075E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.776 | TFLOPs: 25.70 | +7: iteration 165620/ 173500 | consumed samples: 42398720 | consumed tokens: 86832578560 | elapsed time per iteration (s): 0.16 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.645705E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.591 | TFLOPs: 25.82 | +7: iteration 165630/ 173500 | consumed samples: 42401280 | consumed tokens: 86837821440 | elapsed time per iteration (s): 0.16 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.674215E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.321 | TFLOPs: 25.46 | +7: iteration 165640/ 173500 | consumed samples: 42403840 | consumed tokens: 86843064320 | elapsed time per iteration (s): 0.16 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.676569E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.790 | TFLOPs: 25.68 | +7: iteration 165650/ 173500 | consumed samples: 42406400 | consumed tokens: 86848307200 | elapsed time per iteration (s): 0.16 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.653915E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.203 | TFLOPs: 24.50 | +7: iteration 165660/ 173500 | consumed samples: 42408960 | consumed tokens: 86853550080 | elapsed time per iteration (s): 0.16 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.657951E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.158 | TFLOPs: 25.75 | +7: iteration 165670/ 173500 | consumed samples: 42411520 | consumed tokens: 86858792960 | elapsed time per iteration (s): 0.16 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.657053E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.596 | TFLOPs: 25.74 | +7: iteration 165680/ 173500 | consumed samples: 42414080 | consumed tokens: 86864035840 | elapsed time per iteration (s): 0.16 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.665882E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.293 | TFLOPs: 25.60 | +7: iteration 165690/ 173500 | consumed samples: 42416640 | consumed tokens: 86869278720 | elapsed time per iteration (s): 0.15 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.670851E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.792 | TFLOPs: 25.94 | +7: iteration 165700/ 173500 | consumed samples: 42419200 | consumed tokens: 86874521600 | elapsed time per iteration (s): 0.16 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.659067E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.319 | TFLOPs: 25.16 | +7: iteration 165710/ 173500 | consumed samples: 42421760 | consumed tokens: 86879764480 | elapsed time per iteration (s): 0.16 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.653513E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.582 | TFLOPs: 25.04 | +7: iteration 165720/ 173500 | consumed samples: 42424320 | consumed tokens: 86885007360 | elapsed time per iteration (s): 0.16 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.657405E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.109 | TFLOPs: 25.34 | +7: iteration 165730/ 173500 | consumed samples: 42426880 | consumed tokens: 86890250240 | elapsed time per iteration (s): 0.16 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.672527E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.809 | TFLOPs: 24.95 | +7: iteration 165740/ 173500 | consumed samples: 42429440 | consumed tokens: 86895493120 | elapsed time per iteration (s): 0.16 | learning rate: 2.091E-05 | global batch size: 256 | lm loss: 3.662083E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.177 | TFLOPs: 24.88 | +7: iteration 165750/ 173500 | consumed samples: 42432000 | consumed tokens: 86900736000 | elapsed time per iteration (s): 0.16 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.668218E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.031 | TFLOPs: 25.31 | +7: iteration 165760/ 173500 | consumed samples: 42434560 | consumed tokens: 86905978880 | elapsed time per iteration (s): 0.16 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.658244E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.928 | TFLOPs: 25.70 | +7: iteration 165770/ 173500 | consumed samples: 42437120 | consumed tokens: 86911221760 | elapsed time per iteration (s): 0.16 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.653706E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1601.001 | TFLOPs: 25.11 | +7: iteration 165780/ 173500 | consumed samples: 42439680 | consumed tokens: 86916464640 | elapsed time per iteration (s): 0.15 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.659666E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.019 | TFLOPs: 26.11 | +7: iteration 165790/ 173500 | consumed samples: 42442240 | consumed tokens: 86921707520 | elapsed time per iteration (s): 0.16 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.660312E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.892 | TFLOPs: 25.07 | +7: iteration 165800/ 173500 | consumed samples: 42444800 | consumed tokens: 86926950400 | elapsed time per iteration (s): 0.15 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.678500E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.713 | TFLOPs: 25.92 | +7: iteration 165810/ 173500 | consumed samples: 42447360 | consumed tokens: 86932193280 | elapsed time per iteration (s): 0.15 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.638826E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.928 | TFLOPs: 26.17 | +7: iteration 165820/ 173500 | consumed samples: 42449920 | consumed tokens: 86937436160 | elapsed time per iteration (s): 0.15 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.647594E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.963 | TFLOPs: 26.17 | +7: iteration 165830/ 173500 | consumed samples: 42452480 | consumed tokens: 86942679040 | elapsed time per iteration (s): 0.15 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.664234E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.217 | TFLOPs: 25.97 | +7: iteration 165840/ 173500 | consumed samples: 42455040 | consumed tokens: 86947921920 | elapsed time per iteration (s): 0.16 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.668798E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.592 | TFLOPs: 25.23 | +7: iteration 165850/ 173500 | consumed samples: 42457600 | consumed tokens: 86953164800 | elapsed time per iteration (s): 0.15 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.655365E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.843 | TFLOPs: 26.17 | +7: iteration 165860/ 173500 | consumed samples: 42460160 | consumed tokens: 86958407680 | elapsed time per iteration (s): 0.15 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.654843E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.091 | TFLOPs: 26.14 | +7: iteration 165870/ 173500 | consumed samples: 42462720 | consumed tokens: 86963650560 | elapsed time per iteration (s): 0.16 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.663720E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.709 | TFLOPs: 25.86 | +7: iteration 165880/ 173500 | consumed samples: 42465280 | consumed tokens: 86968893440 | elapsed time per iteration (s): 0.15 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.654133E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.839 | TFLOPs: 26.14 | +7: iteration 165890/ 173500 | consumed samples: 42467840 | consumed tokens: 86974136320 | elapsed time per iteration (s): 0.16 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.652677E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.699 | TFLOPs: 24.95 | +7: iteration 165900/ 173500 | consumed samples: 42470400 | consumed tokens: 86979379200 | elapsed time per iteration (s): 0.16 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.650606E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.440 | TFLOPs: 25.65 | +7: iteration 165910/ 173500 | consumed samples: 42472960 | consumed tokens: 86984622080 | elapsed time per iteration (s): 0.16 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.658060E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.725 | TFLOPs: 24.76 | +7: iteration 165920/ 173500 | consumed samples: 42475520 | consumed tokens: 86989864960 | elapsed time per iteration (s): 0.16 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.668781E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.394 | TFLOPs: 25.73 | +7: iteration 165930/ 173500 | consumed samples: 42478080 | consumed tokens: 86995107840 | elapsed time per iteration (s): 0.16 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.661238E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1562.092 | TFLOPs: 24.50 | +7: iteration 165940/ 173500 | consumed samples: 42480640 | consumed tokens: 87000350720 | elapsed time per iteration (s): 0.16 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.680075E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.336 | TFLOPs: 25.76 | +7: iteration 165950/ 173500 | consumed samples: 42483200 | consumed tokens: 87005593600 | elapsed time per iteration (s): 0.16 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.663880E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.214 | TFLOPs: 24.91 | +7: iteration 165960/ 173500 | consumed samples: 42485760 | consumed tokens: 87010836480 | elapsed time per iteration (s): 0.15 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.671941E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.925 | TFLOPs: 26.16 | +7: iteration 165970/ 173500 | consumed samples: 42488320 | consumed tokens: 87016079360 | elapsed time per iteration (s): 0.15 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.656076E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.251 | TFLOPs: 26.12 | +7: iteration 165980/ 173500 | consumed samples: 42490880 | consumed tokens: 87021322240 | elapsed time per iteration (s): 0.15 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.659109E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.431 | TFLOPs: 26.09 | +7: iteration 165990/ 173500 | consumed samples: 42493440 | consumed tokens: 87026565120 | elapsed time per iteration (s): 0.16 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.676543E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.765 | TFLOPs: 25.83 | +0: [2023-03-17 07:28:36,923] [INFO] [logging.py:68:log_dist] [Rank 0] step=166000, skipped=0, lr=[2.0845563261196566e-05, 2.0845563261196566e-05, 2.0845563261196566e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 166000/ 173500 | consumed samples: 42496000 | consumed tokens: 87031808000 | elapsed time per iteration (s): 0.15 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.656333E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.223 | TFLOPs: 26.04 | +0: steps: 166000 loss: 3.6955 iter time (s): 0.154 samples/sec: 1659.371 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 166000 | lm loss value: 3.873607E+00 | lm loss PPL: 4.811563E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 166000 to checkpoints_44m91b100m +0: [2023-03-17 07:28:36,999] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step166000 is begin to save! +0: [2023-03-17 07:28:37,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:28:37,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:28:37,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:28:37,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:28:37,081] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:28:37,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:28:37,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:28:37,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:28:37,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:28:37,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:28:37,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:28:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:28:37,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:28:37,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:28:37,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:28:37,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:28:37,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:28:37,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:28:37,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:28:37,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:28:37,141] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step166000/mp_rank_00_model_states.pt +0: [2023-03-17 07:28:37,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:28:37,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:28:37,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,174] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,179] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,179] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,180] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +3: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +4: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +1: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +4: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +5: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +7: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 07:28:37,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step166000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +2: [2023-03-17 07:28:37,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step166000 is ready now! +0: successfully saved checkpoint at iteration 166000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 187.90 +7: iteration 166010/ 173500 | consumed samples: 42498560 | consumed tokens: 87037050880 | elapsed time per iteration (s): 0.18 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.650919E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.317 | TFLOPs: 22.15 | +7: iteration 166020/ 173500 | consumed samples: 42501120 | consumed tokens: 87042293760 | elapsed time per iteration (s): 0.16 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.651050E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.775 | TFLOPs: 25.86 | +7: iteration 166030/ 173500 | consumed samples: 42503680 | consumed tokens: 87047536640 | elapsed time per iteration (s): 0.16 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.636120E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.450 | TFLOPs: 25.62 | +7: iteration 166040/ 173500 | consumed samples: 42506240 | consumed tokens: 87052779520 | elapsed time per iteration (s): 0.16 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.659876E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1597.696 | TFLOPs: 25.06 | +7: iteration 166050/ 173500 | consumed samples: 42508800 | consumed tokens: 87058022400 | elapsed time per iteration (s): 0.15 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.669503E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.851 | TFLOPs: 26.09 | +7: iteration 166060/ 173500 | consumed samples: 42511360 | consumed tokens: 87063265280 | elapsed time per iteration (s): 0.16 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.670856E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.667 | TFLOPs: 25.57 | +7: iteration 166070/ 173500 | consumed samples: 42513920 | consumed tokens: 87068508160 | elapsed time per iteration (s): 0.16 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.659358E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.614 | TFLOPs: 25.20 | +7: iteration 166080/ 173500 | consumed samples: 42516480 | consumed tokens: 87073751040 | elapsed time per iteration (s): 0.15 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.665969E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.333 | TFLOPs: 26.16 | +7: iteration 166090/ 173500 | consumed samples: 42519040 | consumed tokens: 87078993920 | elapsed time per iteration (s): 0.16 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.661697E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.469 | TFLOPs: 25.57 | +7: iteration 166100/ 173500 | consumed samples: 42521600 | consumed tokens: 87084236800 | elapsed time per iteration (s): 0.15 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.653735E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.955 | TFLOPs: 26.19 | +7: iteration 166110/ 173500 | consumed samples: 42524160 | consumed tokens: 87089479680 | elapsed time per iteration (s): 0.16 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.655799E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.199 | TFLOPs: 25.83 | +7: iteration 166120/ 173500 | consumed samples: 42526720 | consumed tokens: 87094722560 | elapsed time per iteration (s): 0.16 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.648246E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.740 | TFLOPs: 24.88 | +7: iteration 166130/ 173500 | consumed samples: 42529280 | consumed tokens: 87099965440 | elapsed time per iteration (s): 0.16 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.673748E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.927 | TFLOPs: 25.31 | +7: iteration 166140/ 173500 | consumed samples: 42531840 | consumed tokens: 87105208320 | elapsed time per iteration (s): 0.16 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.653828E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.318 | TFLOPs: 25.63 | +7: iteration 166150/ 173500 | consumed samples: 42534400 | consumed tokens: 87110451200 | elapsed time per iteration (s): 0.16 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.654984E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.351 | TFLOPs: 25.49 | +7: iteration 166160/ 173500 | consumed samples: 42536960 | consumed tokens: 87115694080 | elapsed time per iteration (s): 0.16 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.667758E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.323 | TFLOPs: 25.24 | +7: iteration 166170/ 173500 | consumed samples: 42539520 | consumed tokens: 87120936960 | elapsed time per iteration (s): 0.16 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.662849E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.151 | TFLOPs: 25.83 | +7: iteration 166180/ 173500 | consumed samples: 42542080 | consumed tokens: 87126179840 | elapsed time per iteration (s): 0.15 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.655408E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.903 | TFLOPs: 25.92 | +7: iteration 166190/ 173500 | consumed samples: 42544640 | consumed tokens: 87131422720 | elapsed time per iteration (s): 0.16 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.669955E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.259 | TFLOPs: 25.38 | +7: iteration 166200/ 173500 | consumed samples: 42547200 | consumed tokens: 87136665600 | elapsed time per iteration (s): 0.16 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.658656E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.842 | TFLOPs: 25.45 | +7: iteration 166210/ 173500 | consumed samples: 42549760 | consumed tokens: 87141908480 | elapsed time per iteration (s): 0.15 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.641659E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.356 | TFLOPs: 26.23 | +7: iteration 166220/ 173500 | consumed samples: 42552320 | consumed tokens: 87147151360 | elapsed time per iteration (s): 0.15 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.663999E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.643 | TFLOPs: 25.95 | +7: iteration 166230/ 173500 | consumed samples: 42554880 | consumed tokens: 87152394240 | elapsed time per iteration (s): 0.15 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.661332E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.541 | TFLOPs: 26.14 | +7: iteration 166240/ 173500 | consumed samples: 42557440 | consumed tokens: 87157637120 | elapsed time per iteration (s): 0.17 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.657136E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.531 | TFLOPs: 24.08 | +7: iteration 166250/ 173500 | consumed samples: 42560000 | consumed tokens: 87162880000 | elapsed time per iteration (s): 0.16 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.659132E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.210 | TFLOPs: 25.75 | +7: iteration 166260/ 173500 | consumed samples: 42562560 | consumed tokens: 87168122880 | elapsed time per iteration (s): 0.15 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.659475E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.373 | TFLOPs: 26.24 | +7: iteration 166270/ 173500 | consumed samples: 42565120 | consumed tokens: 87173365760 | elapsed time per iteration (s): 0.17 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.669560E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1540.863 | TFLOPs: 24.16 | +7: iteration 166280/ 173500 | consumed samples: 42567680 | consumed tokens: 87178608640 | elapsed time per iteration (s): 0.16 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.659963E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1587.423 | TFLOPs: 24.89 | +7: iteration 166290/ 173500 | consumed samples: 42570240 | consumed tokens: 87183851520 | elapsed time per iteration (s): 0.15 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.649464E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.423 | TFLOPs: 26.20 | +7: iteration 166300/ 173500 | consumed samples: 42572800 | consumed tokens: 87189094400 | elapsed time per iteration (s): 0.15 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.659708E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.707 | TFLOPs: 26.23 | +7: iteration 166310/ 173500 | consumed samples: 42575360 | consumed tokens: 87194337280 | elapsed time per iteration (s): 0.16 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.658506E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.165 | TFLOPs: 25.78 | +7: iteration 166320/ 173500 | consumed samples: 42577920 | consumed tokens: 87199580160 | elapsed time per iteration (s): 0.15 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.667522E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.800 | TFLOPs: 26.20 | +7: iteration 166330/ 173500 | consumed samples: 42580480 | consumed tokens: 87204823040 | elapsed time per iteration (s): 0.16 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.671391E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.340 | TFLOPs: 25.74 | +7: iteration 166340/ 173500 | consumed samples: 42583040 | consumed tokens: 87210065920 | elapsed time per iteration (s): 0.16 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.661017E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.893 | TFLOPs: 25.09 | +7: iteration 166350/ 173500 | consumed samples: 42585600 | consumed tokens: 87215308800 | elapsed time per iteration (s): 0.16 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.656433E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.180 | TFLOPs: 25.44 | +7: iteration 166360/ 173500 | consumed samples: 42588160 | consumed tokens: 87220551680 | elapsed time per iteration (s): 0.16 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.661372E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.484 | TFLOPs: 24.91 | +7: iteration 166370/ 173500 | consumed samples: 42590720 | consumed tokens: 87225794560 | elapsed time per iteration (s): 0.16 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.672125E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.654 | TFLOPs: 25.67 | +7: iteration 166380/ 173500 | consumed samples: 42593280 | consumed tokens: 87231037440 | elapsed time per iteration (s): 0.15 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.669368E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.029 | TFLOPs: 26.16 | +7: iteration 166390/ 173500 | consumed samples: 42595840 | consumed tokens: 87236280320 | elapsed time per iteration (s): 0.16 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.665133E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.432 | TFLOPs: 25.21 | +7: iteration 166400/ 173500 | consumed samples: 42598400 | consumed tokens: 87241523200 | elapsed time per iteration (s): 0.16 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.674614E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.417 | TFLOPs: 25.73 | +7: iteration 166410/ 173500 | consumed samples: 42600960 | consumed tokens: 87246766080 | elapsed time per iteration (s): 0.16 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.660710E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.257 | TFLOPs: 24.39 | +7: iteration 166420/ 173500 | consumed samples: 42603520 | consumed tokens: 87252008960 | elapsed time per iteration (s): 0.16 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.662000E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.277 | TFLOPs: 25.61 | +7: iteration 166430/ 173500 | consumed samples: 42606080 | consumed tokens: 87257251840 | elapsed time per iteration (s): 0.16 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.666730E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.688 | TFLOPs: 25.64 | +7: iteration 166440/ 173500 | consumed samples: 42608640 | consumed tokens: 87262494720 | elapsed time per iteration (s): 0.16 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.658744E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1589.471 | TFLOPs: 24.93 | +7: iteration 166450/ 173500 | consumed samples: 42611200 | consumed tokens: 87267737600 | elapsed time per iteration (s): 0.15 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.664260E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.617 | TFLOPs: 26.03 | +7: iteration 166460/ 173500 | consumed samples: 42613760 | consumed tokens: 87272980480 | elapsed time per iteration (s): 0.16 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.649892E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.936 | TFLOPs: 24.42 | +7: iteration 166470/ 173500 | consumed samples: 42616320 | consumed tokens: 87278223360 | elapsed time per iteration (s): 0.15 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.660909E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.140 | TFLOPs: 26.11 | +7: iteration 166480/ 173500 | consumed samples: 42618880 | consumed tokens: 87283466240 | elapsed time per iteration (s): 0.15 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.654338E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.077 | TFLOPs: 26.13 | +7: iteration 166490/ 173500 | consumed samples: 42621440 | consumed tokens: 87288709120 | elapsed time per iteration (s): 0.16 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.665062E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.566 | TFLOPs: 25.70 | +7: iteration 166500/ 173500 | consumed samples: 42624000 | consumed tokens: 87293952000 | elapsed time per iteration (s): 0.15 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.668366E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.131 | TFLOPs: 26.16 | +7: iteration 166510/ 173500 | consumed samples: 42626560 | consumed tokens: 87299194880 | elapsed time per iteration (s): 0.15 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.648235E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.817 | TFLOPs: 26.11 | +7: iteration 166520/ 173500 | consumed samples: 42629120 | consumed tokens: 87304437760 | elapsed time per iteration (s): 0.16 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.660933E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.738 | TFLOPs: 25.57 | +7: iteration 166530/ 173500 | consumed samples: 42631680 | consumed tokens: 87309680640 | elapsed time per iteration (s): 0.15 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.668283E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.621 | TFLOPs: 25.95 | +7: iteration 166540/ 173500 | consumed samples: 42634240 | consumed tokens: 87314923520 | elapsed time per iteration (s): 0.16 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.667031E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.813 | TFLOPs: 25.70 | +7: iteration 166550/ 173500 | consumed samples: 42636800 | consumed tokens: 87320166400 | elapsed time per iteration (s): 0.16 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.659147E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.488 | TFLOPs: 25.81 | +7: iteration 166560/ 173500 | consumed samples: 42639360 | consumed tokens: 87325409280 | elapsed time per iteration (s): 0.15 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.644042E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.110 | TFLOPs: 26.16 | +7: iteration 166570/ 173500 | consumed samples: 42641920 | consumed tokens: 87330652160 | elapsed time per iteration (s): 0.15 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.662683E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.240 | TFLOPs: 26.15 | +7: iteration 166580/ 173500 | consumed samples: 42644480 | consumed tokens: 87335895040 | elapsed time per iteration (s): 0.15 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.666690E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.341 | TFLOPs: 26.02 | +7: iteration 166590/ 173500 | consumed samples: 42647040 | consumed tokens: 87341137920 | elapsed time per iteration (s): 0.16 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.660925E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.333 | TFLOPs: 25.36 | +7: iteration 166600/ 173500 | consumed samples: 42649600 | consumed tokens: 87346380800 | elapsed time per iteration (s): 0.15 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.665255E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.998 | TFLOPs: 26.06 | +7: iteration 166610/ 173500 | consumed samples: 42652160 | consumed tokens: 87351623680 | elapsed time per iteration (s): 0.16 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.658773E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1602.695 | TFLOPs: 25.13 | +7: iteration 166620/ 173500 | consumed samples: 42654720 | consumed tokens: 87356866560 | elapsed time per iteration (s): 0.16 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.659825E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.950 | TFLOPs: 25.70 | +7: iteration 166630/ 173500 | consumed samples: 42657280 | consumed tokens: 87362109440 | elapsed time per iteration (s): 0.16 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.658918E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.091 | TFLOPs: 25.72 | +7: iteration 166640/ 173500 | consumed samples: 42659840 | consumed tokens: 87367352320 | elapsed time per iteration (s): 0.16 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.661818E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.912 | TFLOPs: 25.70 | +7: iteration 166650/ 173500 | consumed samples: 42662400 | consumed tokens: 87372595200 | elapsed time per iteration (s): 0.16 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.675647E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.654 | TFLOPs: 25.89 | +7: iteration 166660/ 173500 | consumed samples: 42664960 | consumed tokens: 87377838080 | elapsed time per iteration (s): 0.16 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.669707E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.667 | TFLOPs: 25.75 | +7: iteration 166670/ 173500 | consumed samples: 42667520 | consumed tokens: 87383080960 | elapsed time per iteration (s): 0.16 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.666262E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.833 | TFLOPs: 25.64 | +7: iteration 166680/ 173500 | consumed samples: 42670080 | consumed tokens: 87388323840 | elapsed time per iteration (s): 0.16 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.675386E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.224 | TFLOPs: 25.64 | +7: iteration 166690/ 173500 | consumed samples: 42672640 | consumed tokens: 87393566720 | elapsed time per iteration (s): 0.15 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.680062E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.055 | TFLOPs: 26.11 | +7: iteration 166700/ 173500 | consumed samples: 42675200 | consumed tokens: 87398809600 | elapsed time per iteration (s): 0.16 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.666486E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.085 | TFLOPs: 25.22 | +7: iteration 166710/ 173500 | consumed samples: 42677760 | consumed tokens: 87404052480 | elapsed time per iteration (s): 0.16 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.657389E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.916 | TFLOPs: 25.11 | +7: iteration 166720/ 173500 | consumed samples: 42680320 | consumed tokens: 87409295360 | elapsed time per iteration (s): 0.16 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.670525E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.597 | TFLOPs: 25.74 | +7: iteration 166730/ 173500 | consumed samples: 42682880 | consumed tokens: 87414538240 | elapsed time per iteration (s): 0.16 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.660960E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.083 | TFLOPs: 25.14 | +7: iteration 166740/ 173500 | consumed samples: 42685440 | consumed tokens: 87419781120 | elapsed time per iteration (s): 0.16 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.668663E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.087 | TFLOPs: 25.69 | +7: iteration 166750/ 173500 | consumed samples: 42688000 | consumed tokens: 87425024000 | elapsed time per iteration (s): 0.16 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.669265E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.624 | TFLOPs: 25.18 | +7: iteration 166760/ 173500 | consumed samples: 42690560 | consumed tokens: 87430266880 | elapsed time per iteration (s): 0.16 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.680159E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.693 | TFLOPs: 25.67 | +7: iteration 166770/ 173500 | consumed samples: 42693120 | consumed tokens: 87435509760 | elapsed time per iteration (s): 0.15 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.656165E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.815 | TFLOPs: 26.14 | +7: iteration 166780/ 173500 | consumed samples: 42695680 | consumed tokens: 87440752640 | elapsed time per iteration (s): 0.16 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.658693E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.556 | TFLOPs: 25.21 | +7: iteration 166790/ 173500 | consumed samples: 42698240 | consumed tokens: 87445995520 | elapsed time per iteration (s): 0.17 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.664965E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1535.031 | TFLOPs: 24.07 | +7: iteration 166800/ 173500 | consumed samples: 42700800 | consumed tokens: 87451238400 | elapsed time per iteration (s): 0.17 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.657713E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1509.464 | TFLOPs: 23.67 | +7: iteration 166810/ 173500 | consumed samples: 42703360 | consumed tokens: 87456481280 | elapsed time per iteration (s): 0.16 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.664234E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.505 | TFLOPs: 24.46 | +7: iteration 166820/ 173500 | consumed samples: 42705920 | consumed tokens: 87461724160 | elapsed time per iteration (s): 0.16 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.651039E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.242 | TFLOPs: 25.68 | +7: iteration 166830/ 173500 | consumed samples: 42708480 | consumed tokens: 87466967040 | elapsed time per iteration (s): 0.15 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.649846E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.192 | TFLOPs: 26.15 | +7: iteration 166840/ 173500 | consumed samples: 42711040 | consumed tokens: 87472209920 | elapsed time per iteration (s): 0.16 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.658002E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.261 | TFLOPs: 25.49 | +7: iteration 166850/ 173500 | consumed samples: 42713600 | consumed tokens: 87477452800 | elapsed time per iteration (s): 0.16 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.659256E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.835 | TFLOPs: 25.29 | +7: iteration 166860/ 173500 | consumed samples: 42716160 | consumed tokens: 87482695680 | elapsed time per iteration (s): 0.15 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.661009E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.404 | TFLOPs: 26.20 | +7: iteration 166870/ 173500 | consumed samples: 42718720 | consumed tokens: 87487938560 | elapsed time per iteration (s): 0.16 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.667245E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.067 | TFLOPs: 25.39 | +7: iteration 166880/ 173500 | consumed samples: 42721280 | consumed tokens: 87493181440 | elapsed time per iteration (s): 0.16 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.662705E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.985 | TFLOPs: 25.80 | +7: iteration 166890/ 173500 | consumed samples: 42723840 | consumed tokens: 87498424320 | elapsed time per iteration (s): 0.16 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.660810E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.061 | TFLOPs: 25.78 | +7: iteration 166900/ 173500 | consumed samples: 42726400 | consumed tokens: 87503667200 | elapsed time per iteration (s): 0.16 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.659926E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.237 | TFLOPs: 25.66 | +7: iteration 166910/ 173500 | consumed samples: 42728960 | consumed tokens: 87508910080 | elapsed time per iteration (s): 0.16 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.661475E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.774 | TFLOPs: 25.61 | +7: iteration 166920/ 173500 | consumed samples: 42731520 | consumed tokens: 87514152960 | elapsed time per iteration (s): 0.16 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.656896E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.895 | TFLOPs: 25.80 | +7: iteration 166930/ 173500 | consumed samples: 42734080 | consumed tokens: 87519395840 | elapsed time per iteration (s): 0.15 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.658143E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.025 | TFLOPs: 26.21 | +7: iteration 166940/ 173500 | consumed samples: 42736640 | consumed tokens: 87524638720 | elapsed time per iteration (s): 0.15 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.660346E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.170 | TFLOPs: 26.21 | +7: iteration 166950/ 173500 | consumed samples: 42739200 | consumed tokens: 87529881600 | elapsed time per iteration (s): 0.16 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.654779E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.547 | TFLOPs: 25.34 | +7: iteration 166960/ 173500 | consumed samples: 42741760 | consumed tokens: 87535124480 | elapsed time per iteration (s): 0.16 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.672293E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.737 | TFLOPs: 24.70 | +7: iteration 166970/ 173500 | consumed samples: 42744320 | consumed tokens: 87540367360 | elapsed time per iteration (s): 0.16 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.665693E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.807 | TFLOPs: 25.26 | +7: iteration 166980/ 173500 | consumed samples: 42746880 | consumed tokens: 87545610240 | elapsed time per iteration (s): 0.16 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.656442E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1605.839 | TFLOPs: 25.18 | +7: iteration 166990/ 173500 | consumed samples: 42749440 | consumed tokens: 87550853120 | elapsed time per iteration (s): 0.16 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.666077E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.603 | TFLOPs: 25.73 | +7: iteration 167000/ 173500 | consumed samples: 42752000 | consumed tokens: 87556096000 | elapsed time per iteration (s): 0.15 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.666972E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.920 | TFLOPs: 26.16 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 167000 | lm loss value: 3.822304E+00 | lm loss PPL: 4.570939E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 167000 to checkpoints_44m91b100m +0: [2023-03-17 07:31:14,249] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step167000 is begin to save! +0: [2023-03-17 07:31:14,254] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:31:14,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:31:14,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:31:14,336] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:31:14,336] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:31:14,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:31:14,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:31:14,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:31:14,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:31:14,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:31:14,361] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:31:14,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:31:14,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:31:14,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:31:14,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:31:14,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:31:14,386] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:31:14,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:31:14,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:31:14,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:31:14,396] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step167000/mp_rank_00_model_states.pt +0: [2023-03-17 07:31:14,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:31:14,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:31:14,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +5: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +7: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +6: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +3: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +2: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +4: [2023-03-17 07:31:14,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:31:14,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +1: [2023-03-17 07:31:14,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:31:14,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step167000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:31:14,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step167000 is ready now! +0: successfully saved checkpoint at iteration 167000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 191.84 +7: iteration 167010/ 173500 | consumed samples: 42754560 | consumed tokens: 87561338880 | elapsed time per iteration (s): 0.18 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.653277E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1442.694 | TFLOPs: 22.63 | +7: iteration 167020/ 173500 | consumed samples: 42757120 | consumed tokens: 87566581760 | elapsed time per iteration (s): 0.16 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.654144E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.226 | TFLOPs: 25.75 | +7: iteration 167030/ 173500 | consumed samples: 42759680 | consumed tokens: 87571824640 | elapsed time per iteration (s): 0.16 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.660840E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.614 | TFLOPs: 25.10 | +7: iteration 167040/ 173500 | consumed samples: 42762240 | consumed tokens: 87577067520 | elapsed time per iteration (s): 0.16 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.673078E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1618.338 | TFLOPs: 25.38 | +7: iteration 167050/ 173500 | consumed samples: 42764800 | consumed tokens: 87582310400 | elapsed time per iteration (s): 0.16 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.661611E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.071 | TFLOPs: 25.88 | +7: iteration 167060/ 173500 | consumed samples: 42767360 | consumed tokens: 87587553280 | elapsed time per iteration (s): 0.16 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.659610E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.293 | TFLOPs: 25.10 | +7: iteration 167070/ 173500 | consumed samples: 42769920 | consumed tokens: 87592796160 | elapsed time per iteration (s): 0.16 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.663969E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.601 | TFLOPs: 25.23 | +7: iteration 167080/ 173500 | consumed samples: 42772480 | consumed tokens: 87598039040 | elapsed time per iteration (s): 0.16 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.653219E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.365 | TFLOPs: 25.65 | +7: iteration 167090/ 173500 | consumed samples: 42775040 | consumed tokens: 87603281920 | elapsed time per iteration (s): 0.16 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.659417E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.846 | TFLOPs: 25.25 | +7: iteration 167100/ 173500 | consumed samples: 42777600 | consumed tokens: 87608524800 | elapsed time per iteration (s): 0.16 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.655788E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.083 | TFLOPs: 25.88 | +7: iteration 167110/ 173500 | consumed samples: 42780160 | consumed tokens: 87613767680 | elapsed time per iteration (s): 0.15 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.657291E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.695 | TFLOPs: 26.12 | +7: iteration 167120/ 173500 | consumed samples: 42782720 | consumed tokens: 87619010560 | elapsed time per iteration (s): 0.16 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.669321E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.367 | TFLOPs: 25.73 | +7: iteration 167130/ 173500 | consumed samples: 42785280 | consumed tokens: 87624253440 | elapsed time per iteration (s): 0.15 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.656593E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.589 | TFLOPs: 26.12 | +7: iteration 167140/ 173500 | consumed samples: 42787840 | consumed tokens: 87629496320 | elapsed time per iteration (s): 0.16 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.664937E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.596 | TFLOPs: 25.52 | +7: iteration 167150/ 173500 | consumed samples: 42790400 | consumed tokens: 87634739200 | elapsed time per iteration (s): 0.16 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.658368E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.270 | TFLOPs: 24.72 | +7: iteration 167160/ 173500 | consumed samples: 42792960 | consumed tokens: 87639982080 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.661662E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.483 | TFLOPs: 25.46 | +7: iteration 167170/ 173500 | consumed samples: 42795520 | consumed tokens: 87645224960 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.650400E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.638 | TFLOPs: 25.59 | +7: iteration 167180/ 173500 | consumed samples: 42798080 | consumed tokens: 87650467840 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.672260E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.460 | TFLOPs: 25.82 | +7: iteration 167190/ 173500 | consumed samples: 42800640 | consumed tokens: 87655710720 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.663414E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1595.361 | TFLOPs: 25.02 | +7: iteration 167200/ 173500 | consumed samples: 42803200 | consumed tokens: 87660953600 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.665946E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.875 | TFLOPs: 25.53 | +7: iteration 167210/ 173500 | consumed samples: 42805760 | consumed tokens: 87666196480 | elapsed time per iteration (s): 0.16 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.651324E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.039 | TFLOPs: 25.81 | +7: iteration 167220/ 173500 | consumed samples: 42808320 | consumed tokens: 87671439360 | elapsed time per iteration (s): 0.16 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.670397E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.282 | TFLOPs: 25.43 | +7: iteration 167230/ 173500 | consumed samples: 42810880 | consumed tokens: 87676682240 | elapsed time per iteration (s): 0.16 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.664873E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.551 | TFLOPs: 25.60 | +7: iteration 167240/ 173500 | consumed samples: 42813440 | consumed tokens: 87681925120 | elapsed time per iteration (s): 0.16 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.654852E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.854 | TFLOPs: 25.58 | +7: iteration 167250/ 173500 | consumed samples: 42816000 | consumed tokens: 87687168000 | elapsed time per iteration (s): 0.16 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.657203E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.878 | TFLOPs: 25.48 | +7: iteration 167260/ 173500 | consumed samples: 42818560 | consumed tokens: 87692410880 | elapsed time per iteration (s): 0.15 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.673986E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.194 | TFLOPs: 26.16 | +7: iteration 167270/ 173500 | consumed samples: 42821120 | consumed tokens: 87697653760 | elapsed time per iteration (s): 0.15 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.658673E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.649 | TFLOPs: 26.17 | +7: iteration 167280/ 173500 | consumed samples: 42823680 | consumed tokens: 87702896640 | elapsed time per iteration (s): 0.15 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.665400E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.518 | TFLOPs: 26.17 | +7: iteration 167290/ 173500 | consumed samples: 42826240 | consumed tokens: 87708139520 | elapsed time per iteration (s): 0.16 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.647334E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1600.049 | TFLOPs: 25.09 | +7: iteration 167300/ 173500 | consumed samples: 42828800 | consumed tokens: 87713382400 | elapsed time per iteration (s): 0.15 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.650777E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.605 | TFLOPs: 26.17 | +7: iteration 167310/ 173500 | consumed samples: 42831360 | consumed tokens: 87718625280 | elapsed time per iteration (s): 0.15 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.650133E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.325 | TFLOPs: 26.13 | +7: iteration 167320/ 173500 | consumed samples: 42833920 | consumed tokens: 87723868160 | elapsed time per iteration (s): 0.15 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.665721E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.108 | TFLOPs: 26.14 | +7: iteration 167330/ 173500 | consumed samples: 42836480 | consumed tokens: 87729111040 | elapsed time per iteration (s): 0.16 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.663460E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.868 | TFLOPs: 25.43 | +7: iteration 167340/ 173500 | consumed samples: 42839040 | consumed tokens: 87734353920 | elapsed time per iteration (s): 0.16 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.656202E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.550 | TFLOPs: 25.70 | +7: iteration 167350/ 173500 | consumed samples: 42841600 | consumed tokens: 87739596800 | elapsed time per iteration (s): 0.15 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.655691E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.730 | TFLOPs: 26.12 | +7: iteration 167360/ 173500 | consumed samples: 42844160 | consumed tokens: 87744839680 | elapsed time per iteration (s): 0.16 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.665924E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1579.017 | TFLOPs: 24.76 | +7: iteration 167370/ 173500 | consumed samples: 42846720 | consumed tokens: 87750082560 | elapsed time per iteration (s): 0.15 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.655207E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.512 | TFLOPs: 26.14 | +7: iteration 167380/ 173500 | consumed samples: 42849280 | consumed tokens: 87755325440 | elapsed time per iteration (s): 0.15 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.660656E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.888 | TFLOPs: 26.17 | +7: iteration 167390/ 173500 | consumed samples: 42851840 | consumed tokens: 87760568320 | elapsed time per iteration (s): 0.15 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.669981E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.826 | TFLOPs: 26.12 | +7: iteration 167400/ 173500 | consumed samples: 42854400 | consumed tokens: 87765811200 | elapsed time per iteration (s): 0.16 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.676703E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.538 | TFLOPs: 25.45 | +7: iteration 167410/ 173500 | consumed samples: 42856960 | consumed tokens: 87771054080 | elapsed time per iteration (s): 0.17 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.667677E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1534.639 | TFLOPs: 24.07 | +7: iteration 167420/ 173500 | consumed samples: 42859520 | consumed tokens: 87776296960 | elapsed time per iteration (s): 0.16 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.652336E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1592.429 | TFLOPs: 24.97 | +7: iteration 167430/ 173500 | consumed samples: 42862080 | consumed tokens: 87781539840 | elapsed time per iteration (s): 0.17 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.650234E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1472.124 | TFLOPs: 23.09 | +7: iteration 167440/ 173500 | consumed samples: 42864640 | consumed tokens: 87786782720 | elapsed time per iteration (s): 0.16 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.665282E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.854 | TFLOPs: 25.36 | +7: iteration 167450/ 173500 | consumed samples: 42867200 | consumed tokens: 87792025600 | elapsed time per iteration (s): 0.15 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.662759E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.803 | TFLOPs: 26.08 | +7: iteration 167460/ 173500 | consumed samples: 42869760 | consumed tokens: 87797268480 | elapsed time per iteration (s): 0.16 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.657360E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.182 | TFLOPs: 25.28 | +7: iteration 167470/ 173500 | consumed samples: 42872320 | consumed tokens: 87802511360 | elapsed time per iteration (s): 0.15 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.673959E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.998 | TFLOPs: 26.16 | +7: iteration 167480/ 173500 | consumed samples: 42874880 | consumed tokens: 87807754240 | elapsed time per iteration (s): 0.16 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.656207E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.655 | TFLOPs: 25.15 | +7: iteration 167490/ 173500 | consumed samples: 42877440 | consumed tokens: 87812997120 | elapsed time per iteration (s): 0.15 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.652537E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.184 | TFLOPs: 26.16 | +7: iteration 167500/ 173500 | consumed samples: 42880000 | consumed tokens: 87818240000 | elapsed time per iteration (s): 0.16 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.669305E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1559.173 | TFLOPs: 24.45 | +7: iteration 167510/ 173500 | consumed samples: 42882560 | consumed tokens: 87823482880 | elapsed time per iteration (s): 0.15 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.656897E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.840 | TFLOPs: 26.11 | +7: iteration 167520/ 173500 | consumed samples: 42885120 | consumed tokens: 87828725760 | elapsed time per iteration (s): 0.15 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.657462E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.022 | TFLOPs: 26.16 | +7: iteration 167530/ 173500 | consumed samples: 42887680 | consumed tokens: 87833968640 | elapsed time per iteration (s): 0.16 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.656289E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.860 | TFLOPs: 25.26 | +7: iteration 167540/ 173500 | consumed samples: 42890240 | consumed tokens: 87839211520 | elapsed time per iteration (s): 0.16 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.672272E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.412 | TFLOPs: 25.16 | +7: iteration 167550/ 173500 | consumed samples: 42892800 | consumed tokens: 87844454400 | elapsed time per iteration (s): 0.15 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.654134E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.929 | TFLOPs: 26.03 | +7: iteration 167560/ 173500 | consumed samples: 42895360 | consumed tokens: 87849697280 | elapsed time per iteration (s): 0.16 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.659183E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.150 | TFLOPs: 25.55 | +7: iteration 167570/ 173500 | consumed samples: 42897920 | consumed tokens: 87854940160 | elapsed time per iteration (s): 0.16 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.665746E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.105 | TFLOPs: 25.69 | +7: iteration 167580/ 173500 | consumed samples: 42900480 | consumed tokens: 87860183040 | elapsed time per iteration (s): 0.16 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.648704E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.309 | TFLOPs: 25.69 | +7: iteration 167590/ 173500 | consumed samples: 42903040 | consumed tokens: 87865425920 | elapsed time per iteration (s): 0.16 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.653693E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.438 | TFLOPs: 25.74 | +7: iteration 167600/ 173500 | consumed samples: 42905600 | consumed tokens: 87870668800 | elapsed time per iteration (s): 0.15 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.652739E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.846 | TFLOPs: 26.16 | +7: iteration 167610/ 173500 | consumed samples: 42908160 | consumed tokens: 87875911680 | elapsed time per iteration (s): 0.16 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.664454E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.600 | TFLOPs: 25.71 | +7: iteration 167620/ 173500 | consumed samples: 42910720 | consumed tokens: 87881154560 | elapsed time per iteration (s): 0.16 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.660685E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.753 | TFLOPs: 25.54 | +7: iteration 167630/ 173500 | consumed samples: 42913280 | consumed tokens: 87886397440 | elapsed time per iteration (s): 0.17 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.681414E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1530.936 | TFLOPs: 24.01 | +7: iteration 167640/ 173500 | consumed samples: 42915840 | consumed tokens: 87891640320 | elapsed time per iteration (s): 0.15 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.658305E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.959 | TFLOPs: 26.13 | +7: iteration 167650/ 173500 | consumed samples: 42918400 | consumed tokens: 87896883200 | elapsed time per iteration (s): 0.16 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.665136E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.102 | TFLOPs: 25.67 | +7: iteration 167660/ 173500 | consumed samples: 42920960 | consumed tokens: 87902126080 | elapsed time per iteration (s): 0.16 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.661305E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1554.456 | TFLOPs: 24.38 | +7: iteration 167670/ 173500 | consumed samples: 42923520 | consumed tokens: 87907368960 | elapsed time per iteration (s): 0.15 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.648848E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.038 | TFLOPs: 26.24 | +7: iteration 167680/ 173500 | consumed samples: 42926080 | consumed tokens: 87912611840 | elapsed time per iteration (s): 0.16 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.656258E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.146 | TFLOPs: 25.88 | +7: iteration 167690/ 173500 | consumed samples: 42928640 | consumed tokens: 87917854720 | elapsed time per iteration (s): 0.17 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.659766E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1521.909 | TFLOPs: 23.87 | +7: iteration 167700/ 173500 | consumed samples: 42931200 | consumed tokens: 87923097600 | elapsed time per iteration (s): 0.16 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.663930E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.362 | TFLOPs: 25.35 | +7: iteration 167710/ 173500 | consumed samples: 42933760 | consumed tokens: 87928340480 | elapsed time per iteration (s): 0.15 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.663707E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.255 | TFLOPs: 26.18 | +7: iteration 167720/ 173500 | consumed samples: 42936320 | consumed tokens: 87933583360 | elapsed time per iteration (s): 0.15 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.657290E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.067 | TFLOPs: 26.16 | +7: iteration 167730/ 173500 | consumed samples: 42938880 | consumed tokens: 87938826240 | elapsed time per iteration (s): 0.15 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.667680E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.067 | TFLOPs: 25.92 | +7: iteration 167740/ 173500 | consumed samples: 42941440 | consumed tokens: 87944069120 | elapsed time per iteration (s): 0.16 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.682142E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.555 | TFLOPs: 25.73 | +7: iteration 167750/ 173500 | consumed samples: 42944000 | consumed tokens: 87949312000 | elapsed time per iteration (s): 0.16 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.664987E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.181 | TFLOPs: 25.82 | +7: iteration 167760/ 173500 | consumed samples: 42946560 | consumed tokens: 87954554880 | elapsed time per iteration (s): 0.16 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.663551E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.276 | TFLOPs: 25.72 | +7: iteration 167770/ 173500 | consumed samples: 42949120 | consumed tokens: 87959797760 | elapsed time per iteration (s): 0.16 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.648549E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.265 | TFLOPs: 25.75 | +7: iteration 167780/ 173500 | consumed samples: 42951680 | consumed tokens: 87965040640 | elapsed time per iteration (s): 0.16 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.663438E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.564 | TFLOPs: 24.47 | +7: iteration 167790/ 173500 | consumed samples: 42954240 | consumed tokens: 87970283520 | elapsed time per iteration (s): 0.15 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.675950E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.423 | TFLOPs: 25.99 | +7: iteration 167800/ 173500 | consumed samples: 42956800 | consumed tokens: 87975526400 | elapsed time per iteration (s): 0.15 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.663359E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.923 | TFLOPs: 26.19 | +7: iteration 167810/ 173500 | consumed samples: 42959360 | consumed tokens: 87980769280 | elapsed time per iteration (s): 0.15 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.669869E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.749 | TFLOPs: 26.17 | +7: iteration 167820/ 173500 | consumed samples: 42961920 | consumed tokens: 87986012160 | elapsed time per iteration (s): 0.16 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.651012E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.018 | TFLOPs: 25.52 | +7: iteration 167830/ 173500 | consumed samples: 42964480 | consumed tokens: 87991255040 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.668108E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.160 | TFLOPs: 26.10 | +7: iteration 167840/ 173500 | consumed samples: 42967040 | consumed tokens: 87996497920 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.664336E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.667 | TFLOPs: 26.11 | +7: iteration 167850/ 173500 | consumed samples: 42969600 | consumed tokens: 88001740800 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.660969E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.128 | TFLOPs: 26.18 | +7: iteration 167860/ 173500 | consumed samples: 42972160 | consumed tokens: 88006983680 | elapsed time per iteration (s): 0.16 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.667346E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.103 | TFLOPs: 25.33 | +7: iteration 167870/ 173500 | consumed samples: 42974720 | consumed tokens: 88012226560 | elapsed time per iteration (s): 0.15 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.654530E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.317 | TFLOPs: 25.96 | +7: iteration 167880/ 173500 | consumed samples: 42977280 | consumed tokens: 88017469440 | elapsed time per iteration (s): 0.16 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.673953E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.356 | TFLOPs: 25.87 | +7: iteration 167890/ 173500 | consumed samples: 42979840 | consumed tokens: 88022712320 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.667716E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.378 | TFLOPs: 26.27 | +7: iteration 167900/ 173500 | consumed samples: 42982400 | consumed tokens: 88027955200 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.673712E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.224 | TFLOPs: 26.27 | +7: iteration 167910/ 173500 | consumed samples: 42984960 | consumed tokens: 88033198080 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.656815E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.642 | TFLOPs: 26.25 | +7: iteration 167920/ 173500 | consumed samples: 42987520 | consumed tokens: 88038440960 | elapsed time per iteration (s): 0.15 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.660461E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.828 | TFLOPs: 25.95 | +7: iteration 167930/ 173500 | consumed samples: 42990080 | consumed tokens: 88043683840 | elapsed time per iteration (s): 0.16 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.667504E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.088 | TFLOPs: 25.88 | +7: iteration 167940/ 173500 | consumed samples: 42992640 | consumed tokens: 88048926720 | elapsed time per iteration (s): 0.16 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.639668E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.428 | TFLOPs: 25.77 | +7: iteration 167950/ 173500 | consumed samples: 42995200 | consumed tokens: 88054169600 | elapsed time per iteration (s): 0.16 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.672575E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.028 | TFLOPs: 25.77 | +7: iteration 167960/ 173500 | consumed samples: 42997760 | consumed tokens: 88059412480 | elapsed time per iteration (s): 0.16 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.670698E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.619 | TFLOPs: 25.04 | +7: iteration 167970/ 173500 | consumed samples: 43000320 | consumed tokens: 88064655360 | elapsed time per iteration (s): 0.16 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.662649E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.284 | TFLOPs: 25.50 | +7: iteration 167980/ 173500 | consumed samples: 43002880 | consumed tokens: 88069898240 | elapsed time per iteration (s): 0.16 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.658412E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.988 | TFLOPs: 25.75 | +7: iteration 167990/ 173500 | consumed samples: 43005440 | consumed tokens: 88075141120 | elapsed time per iteration (s): 0.15 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.657602E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.511 | TFLOPs: 26.28 | +0: [2023-03-17 07:33:51,103] [INFO] [logging.py:68:log_dist] [Rank 0] step=168000, skipped=0, lr=[2.0455079592202583e-05, 2.0455079592202583e-05, 2.0455079592202583e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 168000/ 173500 | consumed samples: 43008000 | consumed tokens: 88080384000 | elapsed time per iteration (s): 0.15 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.656424E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.406 | TFLOPs: 26.21 | +0: steps: 168000 loss: 3.6788 iter time (s): 0.156 samples/sec: 1641.951 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 168000 | lm loss value: 3.850142E+00 | lm loss PPL: 4.699971E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 168000 to checkpoints_44m91b100m +0: [2023-03-17 07:33:51,178] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step168000 is begin to save! +0: [2023-03-17 07:33:51,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:33:51,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:33:51,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:33:51,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:33:51,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:33:51,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:33:51,265] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:33:51,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:33:51,273] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:33:51,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:33:51,281] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:33:51,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:33:51,289] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:33:51,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:33:51,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:33:51,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:33:51,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:33:51,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:33:51,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:33:51,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:33:51,315] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step168000/mp_rank_00_model_states.pt +0: [2023-03-17 07:33:51,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:33:51,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:33:51,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,347] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,347] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +2: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:33:51,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +7: [2023-03-17 07:33:51,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +6: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +3: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +4: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +5: [2023-03-17 07:33:51,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:33:51,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step168000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:33:51,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step168000 is ready now! +0: successfully saved checkpoint at iteration 168000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 182.56 +7: iteration 168010/ 173500 | consumed samples: 43010560 | consumed tokens: 88085626880 | elapsed time per iteration (s): 0.18 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.665029E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.267 | TFLOPs: 22.24 | +7: iteration 168020/ 173500 | consumed samples: 43013120 | consumed tokens: 88090869760 | elapsed time per iteration (s): 0.16 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.664365E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.854 | TFLOPs: 25.17 | +7: iteration 168030/ 173500 | consumed samples: 43015680 | consumed tokens: 88096112640 | elapsed time per iteration (s): 0.16 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.664761E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.092 | TFLOPs: 25.33 | +7: iteration 168040/ 173500 | consumed samples: 43018240 | consumed tokens: 88101355520 | elapsed time per iteration (s): 0.16 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.652650E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.041 | TFLOPs: 25.42 | +7: iteration 168050/ 173500 | consumed samples: 43020800 | consumed tokens: 88106598400 | elapsed time per iteration (s): 0.15 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.655447E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.581 | TFLOPs: 26.00 | +7: iteration 168060/ 173500 | consumed samples: 43023360 | consumed tokens: 88111841280 | elapsed time per iteration (s): 0.16 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.641442E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.879 | TFLOPs: 24.96 | +7: iteration 168070/ 173500 | consumed samples: 43025920 | consumed tokens: 88117084160 | elapsed time per iteration (s): 0.16 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.667791E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.726 | TFLOPs: 25.53 | +7: iteration 168080/ 173500 | consumed samples: 43028480 | consumed tokens: 88122327040 | elapsed time per iteration (s): 0.15 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.656892E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.666 | TFLOPs: 26.20 | +7: iteration 168090/ 173500 | consumed samples: 43031040 | consumed tokens: 88127569920 | elapsed time per iteration (s): 0.15 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.654405E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.622 | TFLOPs: 25.90 | +7: iteration 168100/ 173500 | consumed samples: 43033600 | consumed tokens: 88132812800 | elapsed time per iteration (s): 0.16 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.661488E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.505 | TFLOPs: 25.54 | +7: iteration 168110/ 173500 | consumed samples: 43036160 | consumed tokens: 88138055680 | elapsed time per iteration (s): 0.15 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.657883E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.503 | TFLOPs: 26.31 | +7: iteration 168120/ 173500 | consumed samples: 43038720 | consumed tokens: 88143298560 | elapsed time per iteration (s): 0.16 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.663740E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1590.161 | TFLOPs: 24.94 | +7: iteration 168130/ 173500 | consumed samples: 43041280 | consumed tokens: 88148541440 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.660617E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.290 | TFLOPs: 25.60 | +7: iteration 168140/ 173500 | consumed samples: 43043840 | consumed tokens: 88153784320 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.658889E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.174 | TFLOPs: 25.69 | +7: iteration 168150/ 173500 | consumed samples: 43046400 | consumed tokens: 88159027200 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.648305E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.160 | TFLOPs: 25.56 | +7: iteration 168160/ 173500 | consumed samples: 43048960 | consumed tokens: 88164270080 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.657419E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.195 | TFLOPs: 25.42 | +7: iteration 168170/ 173500 | consumed samples: 43051520 | consumed tokens: 88169512960 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.679772E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.854 | TFLOPs: 25.72 | +7: iteration 168180/ 173500 | consumed samples: 43054080 | consumed tokens: 88174755840 | elapsed time per iteration (s): 0.16 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.661034E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.621 | TFLOPs: 25.84 | +7: iteration 168190/ 173500 | consumed samples: 43056640 | consumed tokens: 88179998720 | elapsed time per iteration (s): 0.16 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.655990E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.374 | TFLOPs: 25.55 | +7: iteration 168200/ 173500 | consumed samples: 43059200 | consumed tokens: 88185241600 | elapsed time per iteration (s): 0.17 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.665005E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1536.866 | TFLOPs: 24.10 | +7: iteration 168210/ 173500 | consumed samples: 43061760 | consumed tokens: 88190484480 | elapsed time per iteration (s): 0.16 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.653763E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.278 | TFLOPs: 25.90 | +7: iteration 168220/ 173500 | consumed samples: 43064320 | consumed tokens: 88195727360 | elapsed time per iteration (s): 0.16 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.667941E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.205 | TFLOPs: 25.90 | +7: iteration 168230/ 173500 | consumed samples: 43066880 | consumed tokens: 88200970240 | elapsed time per iteration (s): 0.16 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.655946E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.520 | TFLOPs: 25.60 | +7: iteration 168240/ 173500 | consumed samples: 43069440 | consumed tokens: 88206213120 | elapsed time per iteration (s): 0.16 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.647326E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.910 | TFLOPs: 25.47 | +7: iteration 168250/ 173500 | consumed samples: 43072000 | consumed tokens: 88211456000 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.665320E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.192 | TFLOPs: 25.03 | +7: iteration 168260/ 173500 | consumed samples: 43074560 | consumed tokens: 88216698880 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.663312E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.269 | TFLOPs: 25.41 | +7: iteration 168270/ 173500 | consumed samples: 43077120 | consumed tokens: 88221941760 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.654914E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.819 | TFLOPs: 25.23 | +7: iteration 168280/ 173500 | consumed samples: 43079680 | consumed tokens: 88227184640 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.671137E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.847 | TFLOPs: 25.87 | +7: iteration 168290/ 173500 | consumed samples: 43082240 | consumed tokens: 88232427520 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.658514E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1584.260 | TFLOPs: 24.85 | +7: iteration 168300/ 173500 | consumed samples: 43084800 | consumed tokens: 88237670400 | elapsed time per iteration (s): 0.15 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.655523E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.749 | TFLOPs: 26.01 | +7: iteration 168310/ 173500 | consumed samples: 43087360 | consumed tokens: 88242913280 | elapsed time per iteration (s): 0.16 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.676449E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.396 | TFLOPs: 25.65 | +7: iteration 168320/ 173500 | consumed samples: 43089920 | consumed tokens: 88248156160 | elapsed time per iteration (s): 0.15 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.655471E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.316 | TFLOPs: 26.21 | +7: iteration 168330/ 173500 | consumed samples: 43092480 | consumed tokens: 88253399040 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.654903E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.843 | TFLOPs: 25.07 | +7: iteration 168340/ 173500 | consumed samples: 43095040 | consumed tokens: 88258641920 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.656133E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.637 | TFLOPs: 25.34 | +7: iteration 168350/ 173500 | consumed samples: 43097600 | consumed tokens: 88263884800 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.651931E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.494 | TFLOPs: 25.57 | +7: iteration 168360/ 173500 | consumed samples: 43100160 | consumed tokens: 88269127680 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.662714E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.894 | TFLOPs: 25.67 | +7: iteration 168370/ 173500 | consumed samples: 43102720 | consumed tokens: 88274370560 | elapsed time per iteration (s): 0.16 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.657400E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.608 | TFLOPs: 25.40 | +7: iteration 168380/ 173500 | consumed samples: 43105280 | consumed tokens: 88279613440 | elapsed time per iteration (s): 0.15 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.666203E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.366 | TFLOPs: 26.34 | +7: iteration 168390/ 173500 | consumed samples: 43107840 | consumed tokens: 88284856320 | elapsed time per iteration (s): 0.15 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.660257E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.130 | TFLOPs: 26.36 | +7: iteration 168400/ 173500 | consumed samples: 43110400 | consumed tokens: 88290099200 | elapsed time per iteration (s): 0.16 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.656025E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.753 | TFLOPs: 25.54 | +7: iteration 168410/ 173500 | consumed samples: 43112960 | consumed tokens: 88295342080 | elapsed time per iteration (s): 0.16 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.673666E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.518 | TFLOPs: 25.85 | +7: iteration 168420/ 173500 | consumed samples: 43115520 | consumed tokens: 88300584960 | elapsed time per iteration (s): 0.16 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.651405E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1594.169 | TFLOPs: 25.00 | +7: iteration 168430/ 173500 | consumed samples: 43118080 | consumed tokens: 88305827840 | elapsed time per iteration (s): 0.15 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.646439E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.431 | TFLOPs: 25.98 | +7: iteration 168440/ 173500 | consumed samples: 43120640 | consumed tokens: 88311070720 | elapsed time per iteration (s): 0.16 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.679037E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.347 | TFLOPs: 25.80 | +7: iteration 168450/ 173500 | consumed samples: 43123200 | consumed tokens: 88316313600 | elapsed time per iteration (s): 0.16 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.676554E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.572 | TFLOPs: 25.74 | +7: iteration 168460/ 173500 | consumed samples: 43125760 | consumed tokens: 88321556480 | elapsed time per iteration (s): 0.16 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.659712E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.639 | TFLOPs: 25.85 | +7: iteration 168470/ 173500 | consumed samples: 43128320 | consumed tokens: 88326799360 | elapsed time per iteration (s): 0.16 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.654899E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.540 | TFLOPs: 25.88 | +7: iteration 168480/ 173500 | consumed samples: 43130880 | consumed tokens: 88332042240 | elapsed time per iteration (s): 0.15 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.672528E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.110 | TFLOPs: 26.35 | +7: iteration 168490/ 173500 | consumed samples: 43133440 | consumed tokens: 88337285120 | elapsed time per iteration (s): 0.16 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.661066E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.882 | TFLOPs: 25.47 | +7: iteration 168500/ 173500 | consumed samples: 43136000 | consumed tokens: 88342528000 | elapsed time per iteration (s): 0.16 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.658165E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.725 | TFLOPs: 25.73 | +7: iteration 168510/ 173500 | consumed samples: 43138560 | consumed tokens: 88347770880 | elapsed time per iteration (s): 0.16 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.655188E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.663 | TFLOPs: 25.82 | +7: iteration 168520/ 173500 | consumed samples: 43141120 | consumed tokens: 88353013760 | elapsed time per iteration (s): 0.16 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.663295E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.858 | TFLOPs: 25.51 | +7: iteration 168530/ 173500 | consumed samples: 43143680 | consumed tokens: 88358256640 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.655165E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.364 | TFLOPs: 26.18 | +7: iteration 168540/ 173500 | consumed samples: 43146240 | consumed tokens: 88363499520 | elapsed time per iteration (s): 0.16 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.668431E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.328 | TFLOPs: 25.60 | +7: iteration 168550/ 173500 | consumed samples: 43148800 | consumed tokens: 88368742400 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.661549E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.684 | TFLOPs: 26.14 | +7: iteration 168560/ 173500 | consumed samples: 43151360 | consumed tokens: 88373985280 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.669714E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.158 | TFLOPs: 26.18 | +7: iteration 168570/ 173500 | consumed samples: 43153920 | consumed tokens: 88379228160 | elapsed time per iteration (s): 0.15 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.672627E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.177 | TFLOPs: 26.19 | +7: iteration 168580/ 173500 | consumed samples: 43156480 | consumed tokens: 88384471040 | elapsed time per iteration (s): 0.16 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.657278E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.603 | TFLOPs: 25.74 | +7: iteration 168590/ 173500 | consumed samples: 43159040 | consumed tokens: 88389713920 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.664335E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.848 | TFLOPs: 26.25 | +7: iteration 168600/ 173500 | consumed samples: 43161600 | consumed tokens: 88394956800 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.661396E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.410 | TFLOPs: 26.24 | +7: iteration 168610/ 173500 | consumed samples: 43164160 | consumed tokens: 88400199680 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.652390E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.762 | TFLOPs: 26.23 | +7: iteration 168620/ 173500 | consumed samples: 43166720 | consumed tokens: 88405442560 | elapsed time per iteration (s): 0.15 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.655816E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.450 | TFLOPs: 26.24 | +7: iteration 168630/ 173500 | consumed samples: 43169280 | consumed tokens: 88410685440 | elapsed time per iteration (s): 0.16 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.657602E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.493 | TFLOPs: 25.73 | +7: iteration 168640/ 173500 | consumed samples: 43171840 | consumed tokens: 88415928320 | elapsed time per iteration (s): 0.16 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.657144E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.192 | TFLOPs: 25.28 | +7: iteration 168650/ 173500 | consumed samples: 43174400 | consumed tokens: 88421171200 | elapsed time per iteration (s): 0.17 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.658252E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1538.112 | TFLOPs: 24.12 | +7: iteration 168660/ 173500 | consumed samples: 43176960 | consumed tokens: 88426414080 | elapsed time per iteration (s): 0.16 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.668227E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.678 | TFLOPs: 25.09 | +7: iteration 168670/ 173500 | consumed samples: 43179520 | consumed tokens: 88431656960 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.656854E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.667 | TFLOPs: 26.15 | +7: iteration 168680/ 173500 | consumed samples: 43182080 | consumed tokens: 88436899840 | elapsed time per iteration (s): 0.16 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.641242E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.303 | TFLOPs: 25.71 | +7: iteration 168690/ 173500 | consumed samples: 43184640 | consumed tokens: 88442142720 | elapsed time per iteration (s): 0.16 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.656888E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.831 | TFLOPs: 25.81 | +7: iteration 168700/ 173500 | consumed samples: 43187200 | consumed tokens: 88447385600 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.663139E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.111 | TFLOPs: 26.18 | +7: iteration 168710/ 173500 | consumed samples: 43189760 | consumed tokens: 88452628480 | elapsed time per iteration (s): 0.15 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.658511E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.185 | TFLOPs: 26.18 | +7: iteration 168720/ 173500 | consumed samples: 43192320 | consumed tokens: 88457871360 | elapsed time per iteration (s): 0.16 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.654514E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1555.433 | TFLOPs: 24.39 | +7: iteration 168730/ 173500 | consumed samples: 43194880 | consumed tokens: 88463114240 | elapsed time per iteration (s): 0.15 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.651359E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.580 | TFLOPs: 25.95 | +7: iteration 168740/ 173500 | consumed samples: 43197440 | consumed tokens: 88468357120 | elapsed time per iteration (s): 0.15 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.667979E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.012 | TFLOPs: 26.17 | +7: iteration 168750/ 173500 | consumed samples: 43200000 | consumed tokens: 88473600000 | elapsed time per iteration (s): 0.15 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.658273E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.825 | TFLOPs: 26.12 | +7: iteration 168760/ 173500 | consumed samples: 43202560 | consumed tokens: 88478842880 | elapsed time per iteration (s): 0.16 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.665986E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.733 | TFLOPs: 25.79 | +7: iteration 168770/ 173500 | consumed samples: 43205120 | consumed tokens: 88484085760 | elapsed time per iteration (s): 0.16 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.655459E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.007 | TFLOPs: 25.63 | +7: iteration 168780/ 173500 | consumed samples: 43207680 | consumed tokens: 88489328640 | elapsed time per iteration (s): 0.16 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.660720E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.130 | TFLOPs: 25.52 | +7: iteration 168790/ 173500 | consumed samples: 43210240 | consumed tokens: 88494571520 | elapsed time per iteration (s): 0.16 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.658043E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.676 | TFLOPs: 25.59 | +7: iteration 168800/ 173500 | consumed samples: 43212800 | consumed tokens: 88499814400 | elapsed time per iteration (s): 0.15 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.662694E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.626 | TFLOPs: 26.09 | +7: iteration 168810/ 173500 | consumed samples: 43215360 | consumed tokens: 88505057280 | elapsed time per iteration (s): 0.16 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.665892E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.904 | TFLOPs: 25.67 | +7: iteration 168820/ 173500 | consumed samples: 43217920 | consumed tokens: 88510300160 | elapsed time per iteration (s): 0.16 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.671670E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.376 | TFLOPs: 25.63 | +7: iteration 168830/ 173500 | consumed samples: 43220480 | consumed tokens: 88515543040 | elapsed time per iteration (s): 0.18 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.671330E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1421.481 | TFLOPs: 22.29 | +7: iteration 168840/ 173500 | consumed samples: 43223040 | consumed tokens: 88520785920 | elapsed time per iteration (s): 0.17 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.672771E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1505.606 | TFLOPs: 23.61 | +7: iteration 168850/ 173500 | consumed samples: 43225600 | consumed tokens: 88526028800 | elapsed time per iteration (s): 0.16 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.659532E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.896 | TFLOPs: 25.50 | +7: iteration 168860/ 173500 | consumed samples: 43228160 | consumed tokens: 88531271680 | elapsed time per iteration (s): 0.16 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.656570E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.014 | TFLOPs: 25.75 | +7: iteration 168870/ 173500 | consumed samples: 43230720 | consumed tokens: 88536514560 | elapsed time per iteration (s): 0.16 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.662336E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.435 | TFLOPs: 25.71 | +7: iteration 168880/ 173500 | consumed samples: 43233280 | consumed tokens: 88541757440 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.655241E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.194 | TFLOPs: 26.26 | +7: iteration 168890/ 173500 | consumed samples: 43235840 | consumed tokens: 88547000320 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.668207E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.890 | TFLOPs: 26.25 | +7: iteration 168900/ 173500 | consumed samples: 43238400 | consumed tokens: 88552243200 | elapsed time per iteration (s): 0.16 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.662555E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.682 | TFLOPs: 25.79 | +7: iteration 168910/ 173500 | consumed samples: 43240960 | consumed tokens: 88557486080 | elapsed time per iteration (s): 0.16 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.660233E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1571.537 | TFLOPs: 24.65 | +7: iteration 168920/ 173500 | consumed samples: 43243520 | consumed tokens: 88562728960 | elapsed time per iteration (s): 0.15 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.660688E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.725 | TFLOPs: 26.28 | +7: iteration 168930/ 173500 | consumed samples: 43246080 | consumed tokens: 88567971840 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.659458E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.664 | TFLOPs: 26.29 | +7: iteration 168940/ 173500 | consumed samples: 43248640 | consumed tokens: 88573214720 | elapsed time per iteration (s): 0.16 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.667808E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.253 | TFLOPs: 25.30 | +7: iteration 168950/ 173500 | consumed samples: 43251200 | consumed tokens: 88578457600 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.665562E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.073 | TFLOPs: 26.25 | +7: iteration 168960/ 173500 | consumed samples: 43253760 | consumed tokens: 88583700480 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.645526E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.372 | TFLOPs: 26.21 | +7: iteration 168970/ 173500 | consumed samples: 43256320 | consumed tokens: 88588943360 | elapsed time per iteration (s): 0.16 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.655177E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.463 | TFLOPs: 25.63 | +7: iteration 168980/ 173500 | consumed samples: 43258880 | consumed tokens: 88594186240 | elapsed time per iteration (s): 0.15 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.655236E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.525 | TFLOPs: 26.01 | +7: iteration 168990/ 173500 | consumed samples: 43261440 | consumed tokens: 88599429120 | elapsed time per iteration (s): 0.16 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.671971E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.918 | TFLOPs: 25.50 | +7: iteration 169000/ 173500 | consumed samples: 43264000 | consumed tokens: 88604672000 | elapsed time per iteration (s): 0.16 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.663813E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.503 | TFLOPs: 25.65 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 169000 | lm loss value: 3.797693E+00 | lm loss PPL: 4.459817E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 169000 to checkpoints_44m91b100m +0: [2023-03-17 07:36:28,042] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step169000 is begin to save! +0: [2023-03-17 07:36:28,047] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:36:28,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:36:28,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:36:28,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:36:28,124] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:36:28,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:36:28,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:36:28,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:36:28,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:36:28,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:36:28,149] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:36:28,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:36:28,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:36:28,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:36:28,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:36:28,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:36:28,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:36:28,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:36:28,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:36:28,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:36:28,184] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step169000/mp_rank_00_model_states.pt +0: [2023-03-17 07:36:28,184] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:36:28,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:36:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:36:28,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,222] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,222] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:36:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:36:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +3: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:36:28,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +4: [2023-03-17 07:36:28,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:36:28,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:36:28,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +2: [2023-03-17 07:36:28,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:36:28,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:36:28,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +7: [2023-03-17 07:36:28,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:36:28,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:36:28,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +5: [2023-03-17 07:36:28,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:36:28,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +1: [2023-03-17 07:36:28,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step169000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:36:28,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step169000 is ready now! +0: successfully saved checkpoint at iteration 169000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 259.63 +7: iteration 169010/ 173500 | consumed samples: 43266560 | consumed tokens: 88609914880 | elapsed time per iteration (s): 0.19 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.666766E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.068 | TFLOPs: 21.27 | +7: iteration 169020/ 173500 | consumed samples: 43269120 | consumed tokens: 88615157760 | elapsed time per iteration (s): 0.16 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.662207E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.268 | TFLOPs: 25.71 | +7: iteration 169030/ 173500 | consumed samples: 43271680 | consumed tokens: 88620400640 | elapsed time per iteration (s): 0.16 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.659571E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.279 | TFLOPs: 25.90 | +7: iteration 169040/ 173500 | consumed samples: 43274240 | consumed tokens: 88625643520 | elapsed time per iteration (s): 0.16 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.667363E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.537 | TFLOPs: 25.73 | +7: iteration 169050/ 173500 | consumed samples: 43276800 | consumed tokens: 88630886400 | elapsed time per iteration (s): 0.16 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.654053E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.805 | TFLOPs: 25.89 | +7: iteration 169060/ 173500 | consumed samples: 43279360 | consumed tokens: 88636129280 | elapsed time per iteration (s): 0.15 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.662044E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.218 | TFLOPs: 26.26 | +7: iteration 169070/ 173500 | consumed samples: 43281920 | consumed tokens: 88641372160 | elapsed time per iteration (s): 0.15 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.671248E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.702 | TFLOPs: 26.25 | +7: iteration 169080/ 173500 | consumed samples: 43284480 | consumed tokens: 88646615040 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.665889E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.932 | TFLOPs: 26.31 | +7: iteration 169090/ 173500 | consumed samples: 43287040 | consumed tokens: 88651857920 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.654645E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.040 | TFLOPs: 26.02 | +7: iteration 169100/ 173500 | consumed samples: 43289600 | consumed tokens: 88657100800 | elapsed time per iteration (s): 0.17 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.664088E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1547.563 | TFLOPs: 24.27 | +7: iteration 169110/ 173500 | consumed samples: 43292160 | consumed tokens: 88662343680 | elapsed time per iteration (s): 0.16 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.655053E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.794 | TFLOPs: 25.73 | +7: iteration 169120/ 173500 | consumed samples: 43294720 | consumed tokens: 88667586560 | elapsed time per iteration (s): 0.16 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.667305E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.187 | TFLOPs: 25.74 | +7: iteration 169130/ 173500 | consumed samples: 43297280 | consumed tokens: 88672829440 | elapsed time per iteration (s): 0.15 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.672215E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.622 | TFLOPs: 26.12 | +7: iteration 169140/ 173500 | consumed samples: 43299840 | consumed tokens: 88678072320 | elapsed time per iteration (s): 0.16 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.652491E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.608 | TFLOPs: 25.81 | +7: iteration 169150/ 173500 | consumed samples: 43302400 | consumed tokens: 88683315200 | elapsed time per iteration (s): 0.17 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.662790E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1543.310 | TFLOPs: 24.20 | +7: iteration 169160/ 173500 | consumed samples: 43304960 | consumed tokens: 88688558080 | elapsed time per iteration (s): 0.16 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.658931E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.162 | TFLOPs: 25.74 | +7: iteration 169170/ 173500 | consumed samples: 43307520 | consumed tokens: 88693800960 | elapsed time per iteration (s): 0.15 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.675461E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.441 | TFLOPs: 26.15 | +7: iteration 169180/ 173500 | consumed samples: 43310080 | consumed tokens: 88699043840 | elapsed time per iteration (s): 0.16 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.671208E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.008 | TFLOPs: 25.84 | +7: iteration 169190/ 173500 | consumed samples: 43312640 | consumed tokens: 88704286720 | elapsed time per iteration (s): 0.16 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.658638E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.017 | TFLOPs: 25.81 | +7: iteration 169200/ 173500 | consumed samples: 43315200 | consumed tokens: 88709529600 | elapsed time per iteration (s): 0.16 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.660135E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.522 | TFLOPs: 25.88 | +7: iteration 169210/ 173500 | consumed samples: 43317760 | consumed tokens: 88714772480 | elapsed time per iteration (s): 0.15 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.652870E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.948 | TFLOPs: 25.91 | +7: iteration 169220/ 173500 | consumed samples: 43320320 | consumed tokens: 88720015360 | elapsed time per iteration (s): 0.16 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.660454E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.278 | TFLOPs: 25.83 | +7: iteration 169230/ 173500 | consumed samples: 43322880 | consumed tokens: 88725258240 | elapsed time per iteration (s): 0.15 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.664849E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.248 | TFLOPs: 26.27 | +7: iteration 169240/ 173500 | consumed samples: 43325440 | consumed tokens: 88730501120 | elapsed time per iteration (s): 0.15 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.661788E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.335 | TFLOPs: 26.26 | +7: iteration 169250/ 173500 | consumed samples: 43328000 | consumed tokens: 88735744000 | elapsed time per iteration (s): 0.15 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.662944E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.938 | TFLOPs: 26.27 | +7: iteration 169260/ 173500 | consumed samples: 43330560 | consumed tokens: 88740986880 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.661101E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.477 | TFLOPs: 25.70 | +7: iteration 169270/ 173500 | consumed samples: 43333120 | consumed tokens: 88746229760 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.651821E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.798 | TFLOPs: 25.43 | +7: iteration 169280/ 173500 | consumed samples: 43335680 | consumed tokens: 88751472640 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.664709E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.076 | TFLOPs: 25.74 | +7: iteration 169290/ 173500 | consumed samples: 43338240 | consumed tokens: 88756715520 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.658467E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.382 | TFLOPs: 25.66 | +7: iteration 169300/ 173500 | consumed samples: 43340800 | consumed tokens: 88761958400 | elapsed time per iteration (s): 0.16 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.674249E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.575 | TFLOPs: 25.73 | +7: iteration 169310/ 173500 | consumed samples: 43343360 | consumed tokens: 88767201280 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.673113E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.613 | TFLOPs: 25.65 | +7: iteration 169320/ 173500 | consumed samples: 43345920 | consumed tokens: 88772444160 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.671156E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.817 | TFLOPs: 25.23 | +7: iteration 169330/ 173500 | consumed samples: 43348480 | consumed tokens: 88777687040 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.661207E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.146 | TFLOPs: 25.39 | +7: iteration 169340/ 173500 | consumed samples: 43351040 | consumed tokens: 88782929920 | elapsed time per iteration (s): 0.15 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.657597E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.308 | TFLOPs: 26.13 | +7: iteration 169350/ 173500 | consumed samples: 43353600 | consumed tokens: 88788172800 | elapsed time per iteration (s): 0.15 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.642455E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.124 | TFLOPs: 26.18 | +7: iteration 169360/ 173500 | consumed samples: 43356160 | consumed tokens: 88793415680 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.661397E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.746 | TFLOPs: 25.81 | +7: iteration 169370/ 173500 | consumed samples: 43358720 | consumed tokens: 88798658560 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.655877E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.959 | TFLOPs: 25.42 | +7: iteration 169380/ 173500 | consumed samples: 43361280 | consumed tokens: 88803901440 | elapsed time per iteration (s): 0.16 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.659103E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.890 | TFLOPs: 25.55 | +7: iteration 169390/ 173500 | consumed samples: 43363840 | consumed tokens: 88809144320 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.679346E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.438 | TFLOPs: 26.18 | +7: iteration 169400/ 173500 | consumed samples: 43366400 | consumed tokens: 88814387200 | elapsed time per iteration (s): 0.16 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.648247E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.599 | TFLOPs: 25.56 | +7: iteration 169410/ 173500 | consumed samples: 43368960 | consumed tokens: 88819630080 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.659633E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.308 | TFLOPs: 26.12 | +7: iteration 169420/ 173500 | consumed samples: 43371520 | consumed tokens: 88824872960 | elapsed time per iteration (s): 0.16 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.672276E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.868 | TFLOPs: 25.73 | +7: iteration 169430/ 173500 | consumed samples: 43374080 | consumed tokens: 88830115840 | elapsed time per iteration (s): 0.16 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.647610E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.737 | TFLOPs: 25.81 | +7: iteration 169440/ 173500 | consumed samples: 43376640 | consumed tokens: 88835358720 | elapsed time per iteration (s): 0.15 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.652219E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.990 | TFLOPs: 26.17 | +7: iteration 169450/ 173500 | consumed samples: 43379200 | consumed tokens: 88840601600 | elapsed time per iteration (s): 0.16 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.657467E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.126 | TFLOPs: 25.50 | +7: iteration 169460/ 173500 | consumed samples: 43381760 | consumed tokens: 88845844480 | elapsed time per iteration (s): 0.16 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.670827E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.976 | TFLOPs: 25.36 | +7: iteration 169470/ 173500 | consumed samples: 43384320 | consumed tokens: 88851087360 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.651070E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.597 | TFLOPs: 25.24 | +7: iteration 169480/ 173500 | consumed samples: 43386880 | consumed tokens: 88856330240 | elapsed time per iteration (s): 0.15 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.647728E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.889 | TFLOPs: 26.14 | +7: iteration 169490/ 173500 | consumed samples: 43389440 | consumed tokens: 88861573120 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.651907E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.379 | TFLOPs: 25.49 | +7: iteration 169500/ 173500 | consumed samples: 43392000 | consumed tokens: 88866816000 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.659188E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.343 | TFLOPs: 24.72 | +7: iteration 169510/ 173500 | consumed samples: 43394560 | consumed tokens: 88872058880 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.652908E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.443 | TFLOPs: 25.69 | +7: iteration 169520/ 173500 | consumed samples: 43397120 | consumed tokens: 88877301760 | elapsed time per iteration (s): 0.15 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.671572E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.816 | TFLOPs: 25.90 | +7: iteration 169530/ 173500 | consumed samples: 43399680 | consumed tokens: 88882544640 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.662080E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.712 | TFLOPs: 25.86 | +7: iteration 169540/ 173500 | consumed samples: 43402240 | consumed tokens: 88887787520 | elapsed time per iteration (s): 0.16 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.657405E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.048 | TFLOPs: 25.72 | +7: iteration 169550/ 173500 | consumed samples: 43404800 | consumed tokens: 88893030400 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.662754E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.203 | TFLOPs: 26.00 | +7: iteration 169560/ 173500 | consumed samples: 43407360 | consumed tokens: 88898273280 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.654160E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.307 | TFLOPs: 26.27 | +7: iteration 169570/ 173500 | consumed samples: 43409920 | consumed tokens: 88903516160 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.656274E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.682 | TFLOPs: 26.09 | +7: iteration 169580/ 173500 | consumed samples: 43412480 | consumed tokens: 88908759040 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.665795E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.140 | TFLOPs: 26.13 | +7: iteration 169590/ 173500 | consumed samples: 43415040 | consumed tokens: 88914001920 | elapsed time per iteration (s): 0.16 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.657663E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1638.903 | TFLOPs: 25.70 | +7: iteration 169600/ 173500 | consumed samples: 43417600 | consumed tokens: 88919244800 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.674466E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.586 | TFLOPs: 26.29 | +7: iteration 169610/ 173500 | consumed samples: 43420160 | consumed tokens: 88924487680 | elapsed time per iteration (s): 0.16 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.658918E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.421 | TFLOPs: 25.73 | +7: iteration 169620/ 173500 | consumed samples: 43422720 | consumed tokens: 88929730560 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.653430E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.496 | TFLOPs: 26.31 | +7: iteration 169630/ 173500 | consumed samples: 43425280 | consumed tokens: 88934973440 | elapsed time per iteration (s): 0.15 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.661102E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.067 | TFLOPs: 26.18 | +7: iteration 169640/ 173500 | consumed samples: 43427840 | consumed tokens: 88940216320 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.669519E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1621.981 | TFLOPs: 25.44 | +7: iteration 169650/ 173500 | consumed samples: 43430400 | consumed tokens: 88945459200 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.658460E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.943 | TFLOPs: 25.88 | +7: iteration 169660/ 173500 | consumed samples: 43432960 | consumed tokens: 88950702080 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.659056E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1556.054 | TFLOPs: 24.40 | +7: iteration 169670/ 173500 | consumed samples: 43435520 | consumed tokens: 88955944960 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.661629E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.086 | TFLOPs: 25.81 | +7: iteration 169680/ 173500 | consumed samples: 43438080 | consumed tokens: 88961187840 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.658329E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1628.688 | TFLOPs: 25.54 | +7: iteration 169690/ 173500 | consumed samples: 43440640 | consumed tokens: 88966430720 | elapsed time per iteration (s): 0.15 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.646532E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.794 | TFLOPs: 26.19 | +7: iteration 169700/ 173500 | consumed samples: 43443200 | consumed tokens: 88971673600 | elapsed time per iteration (s): 0.15 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.663837E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.771 | TFLOPs: 25.92 | +7: iteration 169710/ 173500 | consumed samples: 43445760 | consumed tokens: 88976916480 | elapsed time per iteration (s): 0.16 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.664616E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.535 | TFLOPs: 25.79 | +7: iteration 169720/ 173500 | consumed samples: 43448320 | consumed tokens: 88982159360 | elapsed time per iteration (s): 0.15 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.653212E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.725 | TFLOPs: 26.08 | +7: iteration 169730/ 173500 | consumed samples: 43450880 | consumed tokens: 88987402240 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.662598E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.429 | TFLOPs: 25.62 | +7: iteration 169740/ 173500 | consumed samples: 43453440 | consumed tokens: 88992645120 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.666499E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.024 | TFLOPs: 25.19 | +7: iteration 169750/ 173500 | consumed samples: 43456000 | consumed tokens: 88997888000 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.668341E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.500 | TFLOPs: 25.84 | +7: iteration 169760/ 173500 | consumed samples: 43458560 | consumed tokens: 89003130880 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.664457E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1596.967 | TFLOPs: 25.04 | +7: iteration 169770/ 173500 | consumed samples: 43461120 | consumed tokens: 89008373760 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.646277E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.766 | TFLOPs: 25.87 | +7: iteration 169780/ 173500 | consumed samples: 43463680 | consumed tokens: 89013616640 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.647083E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.270 | TFLOPs: 25.52 | +7: iteration 169790/ 173500 | consumed samples: 43466240 | consumed tokens: 89018859520 | elapsed time per iteration (s): 0.17 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.656594E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1525.428 | TFLOPs: 23.92 | +7: iteration 169800/ 173500 | consumed samples: 43468800 | consumed tokens: 89024102400 | elapsed time per iteration (s): 0.16 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.658676E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.685 | TFLOPs: 25.51 | +7: iteration 169810/ 173500 | consumed samples: 43471360 | consumed tokens: 89029345280 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.650803E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.589 | TFLOPs: 25.57 | +7: iteration 169820/ 173500 | consumed samples: 43473920 | consumed tokens: 89034588160 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.676683E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.254 | TFLOPs: 25.90 | +7: iteration 169830/ 173500 | consumed samples: 43476480 | consumed tokens: 89039831040 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.659131E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1578.095 | TFLOPs: 24.75 | +7: iteration 169840/ 173500 | consumed samples: 43479040 | consumed tokens: 89045073920 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.661454E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.168 | TFLOPs: 24.72 | +7: iteration 169850/ 173500 | consumed samples: 43481600 | consumed tokens: 89050316800 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.661127E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1576.323 | TFLOPs: 24.72 | +7: iteration 169860/ 173500 | consumed samples: 43484160 | consumed tokens: 89055559680 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.658972E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.456 | TFLOPs: 26.01 | +7: iteration 169870/ 173500 | consumed samples: 43486720 | consumed tokens: 89060802560 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.669071E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.971 | TFLOPs: 26.11 | +7: iteration 169880/ 173500 | consumed samples: 43489280 | consumed tokens: 89066045440 | elapsed time per iteration (s): 0.15 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.661599E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.066 | TFLOPs: 26.10 | +7: iteration 169890/ 173500 | consumed samples: 43491840 | consumed tokens: 89071288320 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.658160E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.328 | TFLOPs: 25.35 | +7: iteration 169900/ 173500 | consumed samples: 43494400 | consumed tokens: 89076531200 | elapsed time per iteration (s): 0.16 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.664708E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.825 | TFLOPs: 25.73 | +7: iteration 169910/ 173500 | consumed samples: 43496960 | consumed tokens: 89081774080 | elapsed time per iteration (s): 0.16 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.664307E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.928 | TFLOPs: 25.78 | +7: iteration 169920/ 173500 | consumed samples: 43499520 | consumed tokens: 89087016960 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.678921E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.156 | TFLOPs: 26.08 | +7: iteration 169930/ 173500 | consumed samples: 43502080 | consumed tokens: 89092259840 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.667283E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.103 | TFLOPs: 26.13 | +7: iteration 169940/ 173500 | consumed samples: 43504640 | consumed tokens: 89097502720 | elapsed time per iteration (s): 0.16 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.671822E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.651 | TFLOPs: 25.65 | +7: iteration 169950/ 173500 | consumed samples: 43507200 | consumed tokens: 89102745600 | elapsed time per iteration (s): 0.16 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.657978E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.267 | TFLOPs: 25.41 | +7: iteration 169960/ 173500 | consumed samples: 43509760 | consumed tokens: 89107988480 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.660275E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.043 | TFLOPs: 26.24 | +7: iteration 169970/ 173500 | consumed samples: 43512320 | consumed tokens: 89113231360 | elapsed time per iteration (s): 0.16 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.651473E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.207 | TFLOPs: 25.83 | +7: iteration 169980/ 173500 | consumed samples: 43514880 | consumed tokens: 89118474240 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.655927E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.577 | TFLOPs: 26.28 | +7: iteration 169990/ 173500 | consumed samples: 43517440 | consumed tokens: 89123717120 | elapsed time per iteration (s): 0.15 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.663070E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.206 | TFLOPs: 26.18 | +0: [2023-03-17 07:39:04,423] [INFO] [logging.py:68:log_dist] [Rank 0] step=170000, skipped=0, lr=[2.0184402348785326e-05, 2.0184402348785326e-05, 2.0184402348785326e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 170000/ 173500 | consumed samples: 43520000 | consumed tokens: 89128960000 | elapsed time per iteration (s): 0.16 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.657948E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.216 | TFLOPs: 25.41 | +0: steps: 170000 loss: 3.6923 iter time (s): 0.155 samples/sec: 1646.907 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 170000 | lm loss value: 3.821922E+00 | lm loss PPL: 4.569195E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 170000 to checkpoints_44m91b100m +0: [2023-03-17 07:39:04,511] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step170000 is begin to save! +0: [2023-03-17 07:39:04,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:39:04,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:39:04,594] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:39:04,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:39:04,605] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:39:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:39:04,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:39:04,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:39:04,622] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:39:04,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:39:04,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:39:04,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:39:04,638] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:39:04,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:39:04,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:39:04,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:39:04,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:39:04,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:39:04,663] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:39:04,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:39:04,664] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step170000/mp_rank_00_model_states.pt +0: [2023-03-17 07:39:04,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:39:04,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:39:04,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:39:04,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:39:04,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +4: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +6: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +2: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +3: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +1: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:39:04,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:39:04,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +7: [2023-03-17 07:39:04,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:39:04,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:39:04,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +5: [2023-03-17 07:39:04,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:39:04,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step170000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:39:04,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step170000 is ready now! +0: successfully saved checkpoint at iteration 170000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 199.03 +7: iteration 170010/ 173500 | consumed samples: 43522560 | consumed tokens: 89134202880 | elapsed time per iteration (s): 0.18 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.662043E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.235 | TFLOPs: 22.21 | +7: iteration 170020/ 173500 | consumed samples: 43525120 | consumed tokens: 89139445760 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.647572E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.976 | TFLOPs: 26.27 | +7: iteration 170030/ 173500 | consumed samples: 43527680 | consumed tokens: 89144688640 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.662830E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.665 | TFLOPs: 26.28 | +7: iteration 170040/ 173500 | consumed samples: 43530240 | consumed tokens: 89149931520 | elapsed time per iteration (s): 0.16 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.661706E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.225 | TFLOPs: 25.86 | +7: iteration 170050/ 173500 | consumed samples: 43532800 | consumed tokens: 89155174400 | elapsed time per iteration (s): 0.16 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.661051E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1612.154 | TFLOPs: 25.28 | +7: iteration 170060/ 173500 | consumed samples: 43535360 | consumed tokens: 89160417280 | elapsed time per iteration (s): 0.16 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.664590E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.034 | TFLOPs: 25.59 | +7: iteration 170070/ 173500 | consumed samples: 43537920 | consumed tokens: 89165660160 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.669458E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.704 | TFLOPs: 26.14 | +7: iteration 170080/ 173500 | consumed samples: 43540480 | consumed tokens: 89170903040 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.655759E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.377 | TFLOPs: 26.09 | +7: iteration 170090/ 173500 | consumed samples: 43543040 | consumed tokens: 89176145920 | elapsed time per iteration (s): 0.15 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.654772E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.665 | TFLOPs: 26.12 | +7: iteration 170100/ 173500 | consumed samples: 43545600 | consumed tokens: 89181388800 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.662736E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.342 | TFLOPs: 26.15 | +7: iteration 170110/ 173500 | consumed samples: 43548160 | consumed tokens: 89186631680 | elapsed time per iteration (s): 0.16 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.661868E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.723 | TFLOPs: 25.75 | +7: iteration 170120/ 173500 | consumed samples: 43550720 | consumed tokens: 89191874560 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.659569E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.998 | TFLOPs: 26.16 | +7: iteration 170130/ 173500 | consumed samples: 43553280 | consumed tokens: 89197117440 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.655408E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.380 | TFLOPs: 25.93 | +7: iteration 170140/ 173500 | consumed samples: 43555840 | consumed tokens: 89202360320 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.663367E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.225 | TFLOPs: 26.13 | +7: iteration 170150/ 173500 | consumed samples: 43558400 | consumed tokens: 89207603200 | elapsed time per iteration (s): 0.16 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.673583E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.635 | TFLOPs: 25.64 | +7: iteration 170160/ 173500 | consumed samples: 43560960 | consumed tokens: 89212846080 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.659624E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.439 | TFLOPs: 26.18 | +7: iteration 170170/ 173500 | consumed samples: 43563520 | consumed tokens: 89218088960 | elapsed time per iteration (s): 0.15 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.655274E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.576 | TFLOPs: 25.98 | +7: iteration 170180/ 173500 | consumed samples: 43566080 | consumed tokens: 89223331840 | elapsed time per iteration (s): 0.16 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.666541E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.622 | TFLOPs: 25.73 | +7: iteration 170190/ 173500 | consumed samples: 43568640 | consumed tokens: 89228574720 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.667026E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.979 | TFLOPs: 25.34 | +7: iteration 170200/ 173500 | consumed samples: 43571200 | consumed tokens: 89233817600 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.649224E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.505 | TFLOPs: 25.34 | +7: iteration 170210/ 173500 | consumed samples: 43573760 | consumed tokens: 89239060480 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.657051E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.268 | TFLOPs: 25.90 | +7: iteration 170220/ 173500 | consumed samples: 43576320 | consumed tokens: 89244303360 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.670565E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.255 | TFLOPs: 26.27 | +7: iteration 170230/ 173500 | consumed samples: 43578880 | consumed tokens: 89249546240 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.670924E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.078 | TFLOPs: 25.31 | +7: iteration 170240/ 173500 | consumed samples: 43581440 | consumed tokens: 89254789120 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.646311E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.269 | TFLOPs: 25.06 | +7: iteration 170250/ 173500 | consumed samples: 43584000 | consumed tokens: 89260032000 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.659801E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.218 | TFLOPs: 26.24 | +7: iteration 170260/ 173500 | consumed samples: 43586560 | consumed tokens: 89265274880 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.667786E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.608 | TFLOPs: 26.25 | +7: iteration 170270/ 173500 | consumed samples: 43589120 | consumed tokens: 89270517760 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.669938E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1598.132 | TFLOPs: 25.06 | +7: iteration 170280/ 173500 | consumed samples: 43591680 | consumed tokens: 89275760640 | elapsed time per iteration (s): 0.15 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.653176E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.495 | TFLOPs: 26.29 | +7: iteration 170290/ 173500 | consumed samples: 43594240 | consumed tokens: 89281003520 | elapsed time per iteration (s): 0.16 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.654896E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.870 | TFLOPs: 25.83 | +7: iteration 170300/ 173500 | consumed samples: 43596800 | consumed tokens: 89286246400 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.650526E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.628 | TFLOPs: 25.81 | +7: iteration 170310/ 173500 | consumed samples: 43599360 | consumed tokens: 89291489280 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.660276E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.950 | TFLOPs: 25.88 | +7: iteration 170320/ 173500 | consumed samples: 43601920 | consumed tokens: 89296732160 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.667670E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.649 | TFLOPs: 26.25 | +7: iteration 170330/ 173500 | consumed samples: 43604480 | consumed tokens: 89301975040 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.658207E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.030 | TFLOPs: 25.39 | +7: iteration 170340/ 173500 | consumed samples: 43607040 | consumed tokens: 89307217920 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.669092E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.132 | TFLOPs: 25.77 | +7: iteration 170350/ 173500 | consumed samples: 43609600 | consumed tokens: 89312460800 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.658958E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.708 | TFLOPs: 26.23 | +7: iteration 170360/ 173500 | consumed samples: 43612160 | consumed tokens: 89317703680 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.664374E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.646 | TFLOPs: 25.16 | +7: iteration 170370/ 173500 | consumed samples: 43614720 | consumed tokens: 89322946560 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.660264E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.052 | TFLOPs: 26.24 | +7: iteration 170380/ 173500 | consumed samples: 43617280 | consumed tokens: 89328189440 | elapsed time per iteration (s): 0.16 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.665096E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.890 | TFLOPs: 25.80 | +7: iteration 170390/ 173500 | consumed samples: 43619840 | consumed tokens: 89333432320 | elapsed time per iteration (s): 0.15 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.659639E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.402 | TFLOPs: 25.96 | +7: iteration 170400/ 173500 | consumed samples: 43622400 | consumed tokens: 89338675200 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.659098E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.521 | TFLOPs: 25.81 | +7: iteration 170410/ 173500 | consumed samples: 43624960 | consumed tokens: 89343918080 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.669177E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.643 | TFLOPs: 26.17 | +7: iteration 170420/ 173500 | consumed samples: 43627520 | consumed tokens: 89349160960 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.662261E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.101 | TFLOPs: 25.47 | +7: iteration 170430/ 173500 | consumed samples: 43630080 | consumed tokens: 89354403840 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.659065E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.300 | TFLOPs: 26.10 | +7: iteration 170440/ 173500 | consumed samples: 43632640 | consumed tokens: 89359646720 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.661327E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.901 | TFLOPs: 25.23 | +7: iteration 170450/ 173500 | consumed samples: 43635200 | consumed tokens: 89364889600 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.663541E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.668 | TFLOPs: 25.53 | +7: iteration 170460/ 173500 | consumed samples: 43637760 | consumed tokens: 89370132480 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.676559E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.186 | TFLOPs: 25.88 | +7: iteration 170470/ 173500 | consumed samples: 43640320 | consumed tokens: 89375375360 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.661147E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.981 | TFLOPs: 25.95 | +7: iteration 170480/ 173500 | consumed samples: 43642880 | consumed tokens: 89380618240 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.668209E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.252 | TFLOPs: 25.22 | +7: iteration 170490/ 173500 | consumed samples: 43645440 | consumed tokens: 89385861120 | elapsed time per iteration (s): 0.16 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.644906E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1614.268 | TFLOPs: 25.32 | +7: iteration 170500/ 173500 | consumed samples: 43648000 | consumed tokens: 89391104000 | elapsed time per iteration (s): 0.15 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.658914E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.412 | TFLOPs: 25.99 | +7: iteration 170510/ 173500 | consumed samples: 43650560 | consumed tokens: 89396346880 | elapsed time per iteration (s): 0.16 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.669951E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.826 | TFLOPs: 25.76 | +7: iteration 170520/ 173500 | consumed samples: 43653120 | consumed tokens: 89401589760 | elapsed time per iteration (s): 0.20 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.655471E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1258.990 | TFLOPs: 19.74 | +7: iteration 170530/ 173500 | consumed samples: 43655680 | consumed tokens: 89406832640 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.649197E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.022 | TFLOPs: 26.24 | +7: iteration 170540/ 173500 | consumed samples: 43658240 | consumed tokens: 89412075520 | elapsed time per iteration (s): 0.16 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.669281E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.458 | TFLOPs: 25.30 | +7: iteration 170550/ 173500 | consumed samples: 43660800 | consumed tokens: 89417318400 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.661940E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.779 | TFLOPs: 26.25 | +7: iteration 170560/ 173500 | consumed samples: 43663360 | consumed tokens: 89422561280 | elapsed time per iteration (s): 0.16 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.638100E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1599.305 | TFLOPs: 25.08 | +7: iteration 170570/ 173500 | consumed samples: 43665920 | consumed tokens: 89427804160 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.656109E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.961 | TFLOPs: 25.91 | +7: iteration 170580/ 173500 | consumed samples: 43668480 | consumed tokens: 89433047040 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.651874E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.271 | TFLOPs: 26.08 | +7: iteration 170590/ 173500 | consumed samples: 43671040 | consumed tokens: 89438289920 | elapsed time per iteration (s): 0.16 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.674390E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.841 | TFLOPs: 25.48 | +7: iteration 170600/ 173500 | consumed samples: 43673600 | consumed tokens: 89443532800 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.649500E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.437 | TFLOPs: 26.15 | +7: iteration 170610/ 173500 | consumed samples: 43676160 | consumed tokens: 89448775680 | elapsed time per iteration (s): 0.15 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.659591E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.146 | TFLOPs: 25.94 | +7: iteration 170620/ 173500 | consumed samples: 43678720 | consumed tokens: 89454018560 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.652354E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.849 | TFLOPs: 25.61 | +7: iteration 170630/ 173500 | consumed samples: 43681280 | consumed tokens: 89459261440 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.660248E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.837 | TFLOPs: 25.83 | +7: iteration 170640/ 173500 | consumed samples: 43683840 | consumed tokens: 89464504320 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.645430E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.928 | TFLOPs: 25.98 | +7: iteration 170650/ 173500 | consumed samples: 43686400 | consumed tokens: 89469747200 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.651469E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.062 | TFLOPs: 25.74 | +7: iteration 170660/ 173500 | consumed samples: 43688960 | consumed tokens: 89474990080 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.654176E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.027 | TFLOPs: 25.45 | +7: iteration 170670/ 173500 | consumed samples: 43691520 | consumed tokens: 89480232960 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.659693E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1580.595 | TFLOPs: 24.79 | +7: iteration 170680/ 173500 | consumed samples: 43694080 | consumed tokens: 89485475840 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.671790E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.601 | TFLOPs: 26.04 | +7: iteration 170690/ 173500 | consumed samples: 43696640 | consumed tokens: 89490718720 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.657103E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.733 | TFLOPs: 26.08 | +7: iteration 170700/ 173500 | consumed samples: 43699200 | consumed tokens: 89495961600 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.661759E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1591.165 | TFLOPs: 24.95 | +7: iteration 170710/ 173500 | consumed samples: 43701760 | consumed tokens: 89501204480 | elapsed time per iteration (s): 0.15 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.669590E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.818 | TFLOPs: 26.12 | +7: iteration 170720/ 173500 | consumed samples: 43704320 | consumed tokens: 89506447360 | elapsed time per iteration (s): 0.16 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.658395E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.578 | TFLOPs: 24.83 | +7: iteration 170730/ 173500 | consumed samples: 43706880 | consumed tokens: 89511690240 | elapsed time per iteration (s): 0.17 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.666614E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1513.813 | TFLOPs: 23.74 | +7: iteration 170740/ 173500 | consumed samples: 43709440 | consumed tokens: 89516933120 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.662039E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.624 | TFLOPs: 26.15 | +7: iteration 170750/ 173500 | consumed samples: 43712000 | consumed tokens: 89522176000 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.680175E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.201 | TFLOPs: 26.16 | +7: iteration 170760/ 173500 | consumed samples: 43714560 | consumed tokens: 89527418880 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.661221E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.198 | TFLOPs: 26.16 | +7: iteration 170770/ 173500 | consumed samples: 43717120 | consumed tokens: 89532661760 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.657951E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.468 | TFLOPs: 26.15 | +7: iteration 170780/ 173500 | consumed samples: 43719680 | consumed tokens: 89537904640 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.669131E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.547 | TFLOPs: 25.65 | +7: iteration 170790/ 173500 | consumed samples: 43722240 | consumed tokens: 89543147520 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.665559E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.679 | TFLOPs: 25.76 | +7: iteration 170800/ 173500 | consumed samples: 43724800 | consumed tokens: 89548390400 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.670694E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.581 | TFLOPs: 25.46 | +7: iteration 170810/ 173500 | consumed samples: 43727360 | consumed tokens: 89553633280 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.679476E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.054 | TFLOPs: 26.19 | +7: iteration 170820/ 173500 | consumed samples: 43729920 | consumed tokens: 89558876160 | elapsed time per iteration (s): 0.15 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.668409E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.216 | TFLOPs: 26.18 | +7: iteration 170830/ 173500 | consumed samples: 43732480 | consumed tokens: 89564119040 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.658781E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.232 | TFLOPs: 25.77 | +7: iteration 170840/ 173500 | consumed samples: 43735040 | consumed tokens: 89569361920 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.656273E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.765 | TFLOPs: 25.81 | +7: iteration 170850/ 173500 | consumed samples: 43737600 | consumed tokens: 89574604800 | elapsed time per iteration (s): 0.16 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.660934E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1616.408 | TFLOPs: 25.35 | +7: iteration 170860/ 173500 | consumed samples: 43740160 | consumed tokens: 89579847680 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.652157E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.645 | TFLOPs: 26.17 | +7: iteration 170870/ 173500 | consumed samples: 43742720 | consumed tokens: 89585090560 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.658290E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.207 | TFLOPs: 26.18 | +7: iteration 170880/ 173500 | consumed samples: 43745280 | consumed tokens: 89590333440 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.668888E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.148 | TFLOPs: 26.16 | +7: iteration 170890/ 173500 | consumed samples: 43747840 | consumed tokens: 89595576320 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.661197E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1585.050 | TFLOPs: 24.86 | +7: iteration 170900/ 173500 | consumed samples: 43750400 | consumed tokens: 89600819200 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.663141E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.836 | TFLOPs: 25.78 | +7: iteration 170910/ 173500 | consumed samples: 43752960 | consumed tokens: 89606062080 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.664456E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.690 | TFLOPs: 25.86 | +7: iteration 170920/ 173500 | consumed samples: 43755520 | consumed tokens: 89611304960 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.660826E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.005 | TFLOPs: 25.81 | +7: iteration 170930/ 173500 | consumed samples: 43758080 | consumed tokens: 89616547840 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.675568E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1608.729 | TFLOPs: 25.23 | +7: iteration 170940/ 173500 | consumed samples: 43760640 | consumed tokens: 89621790720 | elapsed time per iteration (s): 0.16 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.664507E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.463 | TFLOPs: 25.62 | +7: iteration 170950/ 173500 | consumed samples: 43763200 | consumed tokens: 89627033600 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.669748E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.402 | TFLOPs: 25.91 | +7: iteration 170960/ 173500 | consumed samples: 43765760 | consumed tokens: 89632276480 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.668889E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.394 | TFLOPs: 26.35 | +7: iteration 170970/ 173500 | consumed samples: 43768320 | consumed tokens: 89637519360 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.659393E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.456 | TFLOPs: 25.99 | +7: iteration 170980/ 173500 | consumed samples: 43770880 | consumed tokens: 89642762240 | elapsed time per iteration (s): 0.15 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.664074E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.093 | TFLOPs: 26.19 | +7: iteration 170990/ 173500 | consumed samples: 43773440 | consumed tokens: 89648005120 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.659776E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1610.467 | TFLOPs: 25.26 | +7: iteration 171000/ 173500 | consumed samples: 43776000 | consumed tokens: 89653248000 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.665466E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.235 | TFLOPs: 25.77 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 171000 | lm loss value: 3.836635E+00 | lm loss PPL: 4.636916E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 171000 to checkpoints_44m91b100m +0: [2023-03-17 07:41:40,972] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step171000 is begin to save! +0: [2023-03-17 07:41:40,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:41:41,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:41:41,046] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:41:41,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:41:41,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:41:41,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:41:41,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:41:41,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:41:41,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:41:41,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:41:41,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:41:41,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:41:41,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:41:41,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:41:41,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:41:41,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:41:41,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:41:41,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:41:41,112] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:41:41,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:41:41,114] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step171000/mp_rank_00_model_states.pt +0: [2023-03-17 07:41:41,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:41:41,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:41:41,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:41:41,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +5: [2023-03-17 07:41:41,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +5: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:41:41,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +5: [2023-03-17 07:41:41,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +5: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:41:41,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 07:41:41,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +4: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +2: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +7: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:41:41,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:41:41,170] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +1: [2023-03-17 07:41:41,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:41:41,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:41:41,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step171000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:41:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +6: [2023-03-17 07:41:41,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step171000 is ready now! +0: successfully saved checkpoint at iteration 171000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 222.50 +7: iteration 171010/ 173500 | consumed samples: 43778560 | consumed tokens: 89658490880 | elapsed time per iteration (s): 0.19 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.664687E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.179 | TFLOPs: 21.41 | +7: iteration 171020/ 173500 | consumed samples: 43781120 | consumed tokens: 89663733760 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.657459E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.765 | TFLOPs: 25.87 | +7: iteration 171030/ 173500 | consumed samples: 43783680 | consumed tokens: 89668976640 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.653463E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.719 | TFLOPs: 25.89 | +7: iteration 171040/ 173500 | consumed samples: 43786240 | consumed tokens: 89674219520 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.657016E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.957 | TFLOPs: 26.08 | +7: iteration 171050/ 173500 | consumed samples: 43788800 | consumed tokens: 89679462400 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.666380E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.663 | TFLOPs: 26.28 | +7: iteration 171060/ 173500 | consumed samples: 43791360 | consumed tokens: 89684705280 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.642700E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.309 | TFLOPs: 26.26 | +7: iteration 171070/ 173500 | consumed samples: 43793920 | consumed tokens: 89689948160 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.668485E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.612 | TFLOPs: 25.78 | +7: iteration 171080/ 173500 | consumed samples: 43796480 | consumed tokens: 89695191040 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.657848E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.998 | TFLOPs: 26.24 | +7: iteration 171090/ 173500 | consumed samples: 43799040 | consumed tokens: 89700433920 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.661820E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.555 | TFLOPs: 25.95 | +7: iteration 171100/ 173500 | consumed samples: 43801600 | consumed tokens: 89705676800 | elapsed time per iteration (s): 0.16 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.672944E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.816 | TFLOPs: 25.72 | +7: iteration 171110/ 173500 | consumed samples: 43804160 | consumed tokens: 89710919680 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.657940E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.099 | TFLOPs: 26.08 | +7: iteration 171120/ 173500 | consumed samples: 43806720 | consumed tokens: 89716162560 | elapsed time per iteration (s): 0.15 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.667257E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.141 | TFLOPs: 26.05 | +7: iteration 171130/ 173500 | consumed samples: 43809280 | consumed tokens: 89721405440 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.660627E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.891 | TFLOPs: 26.06 | +7: iteration 171140/ 173500 | consumed samples: 43811840 | consumed tokens: 89726648320 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.652689E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.268 | TFLOPs: 26.08 | +7: iteration 171150/ 173500 | consumed samples: 43814400 | consumed tokens: 89731891200 | elapsed time per iteration (s): 0.16 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.660343E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.078 | TFLOPs: 25.56 | +7: iteration 171160/ 173500 | consumed samples: 43816960 | consumed tokens: 89737134080 | elapsed time per iteration (s): 0.16 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.657078E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.178 | TFLOPs: 25.20 | +7: iteration 171170/ 173500 | consumed samples: 43819520 | consumed tokens: 89742376960 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.668990E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.007 | TFLOPs: 26.10 | +7: iteration 171180/ 173500 | consumed samples: 43822080 | consumed tokens: 89747619840 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.672771E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.217 | TFLOPs: 26.15 | +7: iteration 171190/ 173500 | consumed samples: 43824640 | consumed tokens: 89752862720 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.647825E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1670.103 | TFLOPs: 26.19 | +7: iteration 171200/ 173500 | consumed samples: 43827200 | consumed tokens: 89758105600 | elapsed time per iteration (s): 0.16 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.669398E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.237 | TFLOPs: 25.72 | +7: iteration 171210/ 173500 | consumed samples: 43829760 | consumed tokens: 89763348480 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.664695E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.259 | TFLOPs: 26.21 | +7: iteration 171220/ 173500 | consumed samples: 43832320 | consumed tokens: 89768591360 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.661306E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.124 | TFLOPs: 26.21 | +7: iteration 171230/ 173500 | consumed samples: 43834880 | consumed tokens: 89773834240 | elapsed time per iteration (s): 0.16 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.656276E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1627.453 | TFLOPs: 25.52 | +7: iteration 171240/ 173500 | consumed samples: 43837440 | consumed tokens: 89779077120 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.656258E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.465 | TFLOPs: 26.18 | +7: iteration 171250/ 173500 | consumed samples: 43840000 | consumed tokens: 89784320000 | elapsed time per iteration (s): 0.15 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.655861E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.985 | TFLOPs: 26.13 | +7: iteration 171260/ 173500 | consumed samples: 43842560 | consumed tokens: 89789562880 | elapsed time per iteration (s): 0.16 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.656276E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.502 | TFLOPs: 25.76 | +7: iteration 171270/ 173500 | consumed samples: 43845120 | consumed tokens: 89794805760 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.656094E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.205 | TFLOPs: 25.93 | +7: iteration 171280/ 173500 | consumed samples: 43847680 | consumed tokens: 89800048640 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.667335E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.567 | TFLOPs: 25.74 | +7: iteration 171290/ 173500 | consumed samples: 43850240 | consumed tokens: 89805291520 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.673219E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.462 | TFLOPs: 26.18 | +7: iteration 171300/ 173500 | consumed samples: 43852800 | consumed tokens: 89810534400 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.660005E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1560.975 | TFLOPs: 24.48 | +7: iteration 171310/ 173500 | consumed samples: 43855360 | consumed tokens: 89815777280 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.658745E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.479 | TFLOPs: 26.17 | +7: iteration 171320/ 173500 | consumed samples: 43857920 | consumed tokens: 89821020160 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.663077E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.951 | TFLOPs: 26.17 | +7: iteration 171330/ 173500 | consumed samples: 43860480 | consumed tokens: 89826263040 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.673924E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.987 | TFLOPs: 25.56 | +7: iteration 171340/ 173500 | consumed samples: 43863040 | consumed tokens: 89831505920 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.663545E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.705 | TFLOPs: 25.68 | +7: iteration 171350/ 173500 | consumed samples: 43865600 | consumed tokens: 89836748800 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.665037E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1586.928 | TFLOPs: 24.89 | +7: iteration 171360/ 173500 | consumed samples: 43868160 | consumed tokens: 89841991680 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.657256E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.814 | TFLOPs: 25.61 | +7: iteration 171370/ 173500 | consumed samples: 43870720 | consumed tokens: 89847234560 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.650357E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.814 | TFLOPs: 26.25 | +7: iteration 171380/ 173500 | consumed samples: 43873280 | consumed tokens: 89852477440 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.659315E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1619.779 | TFLOPs: 25.40 | +7: iteration 171390/ 173500 | consumed samples: 43875840 | consumed tokens: 89857720320 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.677230E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.073 | TFLOPs: 26.30 | +7: iteration 171400/ 173500 | consumed samples: 43878400 | consumed tokens: 89862963200 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.667209E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1676.240 | TFLOPs: 26.29 | +7: iteration 171410/ 173500 | consumed samples: 43880960 | consumed tokens: 89868206080 | elapsed time per iteration (s): 0.15 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.678610E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.544 | TFLOPs: 26.28 | +7: iteration 171420/ 173500 | consumed samples: 43883520 | consumed tokens: 89873448960 | elapsed time per iteration (s): 0.16 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.669504E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1631.738 | TFLOPs: 25.59 | +7: iteration 171430/ 173500 | consumed samples: 43886080 | consumed tokens: 89878691840 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.655131E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.917 | TFLOPs: 26.24 | +7: iteration 171440/ 173500 | consumed samples: 43888640 | consumed tokens: 89883934720 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.659824E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1656.159 | TFLOPs: 25.97 | +7: iteration 171450/ 173500 | consumed samples: 43891200 | consumed tokens: 89889177600 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.661944E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.854 | TFLOPs: 25.92 | +7: iteration 171460/ 173500 | consumed samples: 43893760 | consumed tokens: 89894420480 | elapsed time per iteration (s): 0.16 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.660325E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1620.160 | TFLOPs: 25.41 | +7: iteration 171470/ 173500 | consumed samples: 43896320 | consumed tokens: 89899663360 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.656293E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.912 | TFLOPs: 26.06 | +7: iteration 171480/ 173500 | consumed samples: 43898880 | consumed tokens: 89904906240 | elapsed time per iteration (s): 0.16 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.657072E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1583.514 | TFLOPs: 24.83 | +7: iteration 171490/ 173500 | consumed samples: 43901440 | consumed tokens: 89910149120 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.660339E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.521 | TFLOPs: 26.09 | +7: iteration 171500/ 173500 | consumed samples: 43904000 | consumed tokens: 89915392000 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.675116E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.676 | TFLOPs: 26.07 | +7: iteration 171510/ 173500 | consumed samples: 43906560 | consumed tokens: 89920634880 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.666845E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.879 | TFLOPs: 26.09 | +7: iteration 171520/ 173500 | consumed samples: 43909120 | consumed tokens: 89925877760 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.675028E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.268 | TFLOPs: 25.93 | +7: iteration 171530/ 173500 | consumed samples: 43911680 | consumed tokens: 89931120640 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.660783E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.814 | TFLOPs: 25.92 | +7: iteration 171540/ 173500 | consumed samples: 43914240 | consumed tokens: 89936363520 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.675753E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.963 | TFLOPs: 26.19 | +7: iteration 171550/ 173500 | consumed samples: 43916800 | consumed tokens: 89941606400 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.662649E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.425 | TFLOPs: 26.18 | +7: iteration 171560/ 173500 | consumed samples: 43919360 | consumed tokens: 89946849280 | elapsed time per iteration (s): 0.15 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.651970E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.204 | TFLOPs: 26.26 | +7: iteration 171570/ 173500 | consumed samples: 43921920 | consumed tokens: 89952092160 | elapsed time per iteration (s): 0.16 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.667717E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.121 | TFLOPs: 25.89 | +7: iteration 171580/ 173500 | consumed samples: 43924480 | consumed tokens: 89957335040 | elapsed time per iteration (s): 0.16 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.676337E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.144 | TFLOPs: 25.60 | +7: iteration 171590/ 173500 | consumed samples: 43927040 | consumed tokens: 89962577920 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.671124E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.305 | TFLOPs: 25.33 | +7: iteration 171600/ 173500 | consumed samples: 43929600 | consumed tokens: 89967820800 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.668990E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1642.090 | TFLOPs: 25.75 | +7: iteration 171610/ 173500 | consumed samples: 43932160 | consumed tokens: 89973063680 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.661808E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.538 | TFLOPs: 24.91 | +7: iteration 171620/ 173500 | consumed samples: 43934720 | consumed tokens: 89978306560 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.656696E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1613.606 | TFLOPs: 25.31 | +7: iteration 171630/ 173500 | consumed samples: 43937280 | consumed tokens: 89983549440 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.669102E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.513 | TFLOPs: 25.37 | +7: iteration 171640/ 173500 | consumed samples: 43939840 | consumed tokens: 89988792320 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.644005E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1565.824 | TFLOPs: 24.56 | +7: iteration 171650/ 173500 | consumed samples: 43942400 | consumed tokens: 89994035200 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.660362E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.381 | TFLOPs: 26.15 | +7: iteration 171660/ 173500 | consumed samples: 43944960 | consumed tokens: 89999278080 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.649001E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1640.673 | TFLOPs: 25.73 | +7: iteration 171670/ 173500 | consumed samples: 43947520 | consumed tokens: 90004520960 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.665903E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.942 | TFLOPs: 26.17 | +7: iteration 171680/ 173500 | consumed samples: 43950080 | consumed tokens: 90009763840 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.664847E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1611.328 | TFLOPs: 25.27 | +7: iteration 171690/ 173500 | consumed samples: 43952640 | consumed tokens: 90015006720 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.670996E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.283 | TFLOPs: 26.12 | +7: iteration 171700/ 173500 | consumed samples: 43955200 | consumed tokens: 90020249600 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.651603E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1623.920 | TFLOPs: 25.47 | +7: iteration 171710/ 173500 | consumed samples: 43957760 | consumed tokens: 90025492480 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.662236E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.492 | TFLOPs: 26.17 | +7: iteration 171720/ 173500 | consumed samples: 43960320 | consumed tokens: 90030735360 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.665892E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.668 | TFLOPs: 25.45 | +7: iteration 171730/ 173500 | consumed samples: 43962880 | consumed tokens: 90035978240 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.651759E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1603.244 | TFLOPs: 25.14 | +7: iteration 171740/ 173500 | consumed samples: 43965440 | consumed tokens: 90041221120 | elapsed time per iteration (s): 0.18 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.671483E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1456.326 | TFLOPs: 22.84 | +7: iteration 171750/ 173500 | consumed samples: 43968000 | consumed tokens: 90046464000 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.651822E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1626.881 | TFLOPs: 25.51 | +7: iteration 171760/ 173500 | consumed samples: 43970560 | consumed tokens: 90051706880 | elapsed time per iteration (s): 0.15 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.658424E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.100 | TFLOPs: 26.14 | +7: iteration 171770/ 173500 | consumed samples: 43973120 | consumed tokens: 90056949760 | elapsed time per iteration (s): 0.16 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.646327E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.165 | TFLOPs: 25.88 | +7: iteration 171780/ 173500 | consumed samples: 43975680 | consumed tokens: 90062192640 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.654571E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.625 | TFLOPs: 26.18 | +7: iteration 171790/ 173500 | consumed samples: 43978240 | consumed tokens: 90067435520 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.668573E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.519 | TFLOPs: 26.17 | +7: iteration 171800/ 173500 | consumed samples: 43980800 | consumed tokens: 90072678400 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.669033E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.120 | TFLOPs: 26.07 | +7: iteration 171810/ 173500 | consumed samples: 43983360 | consumed tokens: 90077921280 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.656158E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.797 | TFLOPs: 26.11 | +7: iteration 171820/ 173500 | consumed samples: 43985920 | consumed tokens: 90083164160 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.670968E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.420 | TFLOPs: 26.12 | +7: iteration 171830/ 173500 | consumed samples: 43988480 | consumed tokens: 90088407040 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.671118E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.706 | TFLOPs: 26.11 | +7: iteration 171840/ 173500 | consumed samples: 43991040 | consumed tokens: 90093649920 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.671511E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.016 | TFLOPs: 26.11 | +7: iteration 171850/ 173500 | consumed samples: 43993600 | consumed tokens: 90098892800 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.671671E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.271 | TFLOPs: 26.12 | +7: iteration 171860/ 173500 | consumed samples: 43996160 | consumed tokens: 90104135680 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.659930E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.092 | TFLOPs: 26.11 | +7: iteration 171870/ 173500 | consumed samples: 43998720 | consumed tokens: 90109378560 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.669893E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.435 | TFLOPs: 26.12 | +7: iteration 171880/ 173500 | consumed samples: 44001280 | consumed tokens: 90114621440 | elapsed time per iteration (s): 0.16 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.666921E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.463 | TFLOPs: 25.79 | +7: iteration 171890/ 173500 | consumed samples: 44003840 | consumed tokens: 90119864320 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.656620E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.973 | TFLOPs: 26.16 | +7: iteration 171900/ 173500 | consumed samples: 44006400 | consumed tokens: 90125107200 | elapsed time per iteration (s): 0.16 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.668821E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.744 | TFLOPs: 25.72 | +7: iteration 171910/ 173500 | consumed samples: 44008960 | consumed tokens: 90130350080 | elapsed time per iteration (s): 0.16 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.661483E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.480 | TFLOPs: 25.24 | +7: iteration 171920/ 173500 | consumed samples: 44011520 | consumed tokens: 90135592960 | elapsed time per iteration (s): 0.16 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.667170E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1617.954 | TFLOPs: 25.37 | +7: iteration 171930/ 173500 | consumed samples: 44014080 | consumed tokens: 90140835840 | elapsed time per iteration (s): 0.16 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.650430E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.997 | TFLOPs: 25.81 | +7: iteration 171940/ 173500 | consumed samples: 44016640 | consumed tokens: 90146078720 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.658530E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.537 | TFLOPs: 26.25 | +7: iteration 171950/ 173500 | consumed samples: 44019200 | consumed tokens: 90151321600 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.667804E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1669.584 | TFLOPs: 26.18 | +7: iteration 171960/ 173500 | consumed samples: 44021760 | consumed tokens: 90156564480 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.658549E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.562 | TFLOPs: 25.93 | +7: iteration 171970/ 173500 | consumed samples: 44024320 | consumed tokens: 90161807360 | elapsed time per iteration (s): 0.15 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.660941E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.400 | TFLOPs: 26.13 | +7: iteration 171980/ 173500 | consumed samples: 44026880 | consumed tokens: 90167050240 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.659240E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.055 | TFLOPs: 26.14 | +7: iteration 171990/ 173500 | consumed samples: 44029440 | consumed tokens: 90172293120 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.660149E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1646.592 | TFLOPs: 25.82 | +0: [2023-03-17 07:44:16,752] [INFO] [logging.py:68:log_dist] [Rank 0] step=172000, skipped=0, lr=[2.0033893682955986e-05, 2.0033893682955986e-05, 2.0033893682955986e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 172000/ 173500 | consumed samples: 44032000 | consumed tokens: 90177536000 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.661243E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.835 | TFLOPs: 25.34 | +0: steps: 172000 loss: 3.6474 iter time (s): 0.155 samples/sec: 1652.131 +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 172000 | lm loss value: 3.804868E+00 | lm loss PPL: 4.491934E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 172000 to checkpoints_44m91b100m +0: [2023-03-17 07:44:16,828] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step172000 is begin to save! +0: [2023-03-17 07:44:16,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:44:16,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:44:16,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:44:16,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:44:16,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:44:16,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:44:16,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:44:16,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:44:16,928] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:44:16,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:44:16,936] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:44:16,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:44:16,944] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:44:16,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:44:16,952] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:44:16,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:44:16,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:44:16,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:44:16,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:44:16,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:44:16,969] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step172000/mp_rank_00_model_states.pt +0: [2023-03-17 07:44:16,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:44:16,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:44:16,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:44:17,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:44:17,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:44:17,001] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,001] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:44:17,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:44:17,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +4: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +3: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +6: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +5: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +7: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:44:17,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:44:17,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +1: [2023-03-17 07:44:17,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:44:17,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 07:44:17,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +2: [2023-03-17 07:44:17,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:44:17,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step172000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:44:17,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step172000 is ready now! +0: successfully saved checkpoint at iteration 172000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 201.35 +7: iteration 172010/ 173500 | consumed samples: 44034560 | consumed tokens: 90182778880 | elapsed time per iteration (s): 0.19 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.668572E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1328.699 | TFLOPs: 20.84 | +7: iteration 172020/ 173500 | consumed samples: 44037120 | consumed tokens: 90188021760 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.662391E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.001 | TFLOPs: 26.21 | +7: iteration 172030/ 173500 | consumed samples: 44039680 | consumed tokens: 90193264640 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.667732E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.518 | TFLOPs: 25.81 | +7: iteration 172040/ 173500 | consumed samples: 44042240 | consumed tokens: 90198507520 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.669028E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.061 | TFLOPs: 26.22 | +7: iteration 172050/ 173500 | consumed samples: 44044800 | consumed tokens: 90203750400 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.653227E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1672.189 | TFLOPs: 26.22 | +7: iteration 172060/ 173500 | consumed samples: 44047360 | consumed tokens: 90208993280 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.669837E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.131 | TFLOPs: 25.80 | +7: iteration 172070/ 173500 | consumed samples: 44049920 | consumed tokens: 90214236160 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.651262E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.283 | TFLOPs: 26.24 | +7: iteration 172080/ 173500 | consumed samples: 44052480 | consumed tokens: 90219479040 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.656614E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.435 | TFLOPs: 26.34 | +7: iteration 172090/ 173500 | consumed samples: 44055040 | consumed tokens: 90224721920 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.669742E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1609.710 | TFLOPs: 25.24 | +7: iteration 172100/ 173500 | consumed samples: 44057600 | consumed tokens: 90229964800 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.664743E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.909 | TFLOPs: 25.91 | +7: iteration 172110/ 173500 | consumed samples: 44060160 | consumed tokens: 90235207680 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.662009E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.042 | TFLOPs: 25.67 | +7: iteration 172120/ 173500 | consumed samples: 44062720 | consumed tokens: 90240450560 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.669205E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.055 | TFLOPs: 26.25 | +7: iteration 172130/ 173500 | consumed samples: 44065280 | consumed tokens: 90245693440 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.659355E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.863 | TFLOPs: 26.34 | +7: iteration 172140/ 173500 | consumed samples: 44067840 | consumed tokens: 90250936320 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.651344E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1668.540 | TFLOPs: 26.17 | +7: iteration 172150/ 173500 | consumed samples: 44070400 | consumed tokens: 90256179200 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.667517E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.827 | TFLOPs: 26.36 | +7: iteration 172160/ 173500 | consumed samples: 44072960 | consumed tokens: 90261422080 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.649709E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.975 | TFLOPs: 26.35 | +7: iteration 172170/ 173500 | consumed samples: 44075520 | consumed tokens: 90266664960 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.663424E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.501 | TFLOPs: 25.63 | +7: iteration 172180/ 173500 | consumed samples: 44078080 | consumed tokens: 90271907840 | elapsed time per iteration (s): 0.16 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.666943E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1615.122 | TFLOPs: 25.33 | +7: iteration 172190/ 173500 | consumed samples: 44080640 | consumed tokens: 90277150720 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.661909E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.344 | TFLOPs: 26.27 | +7: iteration 172200/ 173500 | consumed samples: 44083200 | consumed tokens: 90282393600 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.667120E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.852 | TFLOPs: 26.09 | +7: iteration 172210/ 173500 | consumed samples: 44085760 | consumed tokens: 90287636480 | elapsed time per iteration (s): 0.15 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.653134E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.777 | TFLOPs: 25.95 | +7: iteration 172220/ 173500 | consumed samples: 44088320 | consumed tokens: 90292879360 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.650729E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.536 | TFLOPs: 25.95 | +7: iteration 172230/ 173500 | consumed samples: 44090880 | consumed tokens: 90298122240 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.653158E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.433 | TFLOPs: 26.32 | +7: iteration 172240/ 173500 | consumed samples: 44093440 | consumed tokens: 90303365120 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.647259E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.560 | TFLOPs: 25.92 | +7: iteration 172250/ 173500 | consumed samples: 44096000 | consumed tokens: 90308608000 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.655581E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1649.629 | TFLOPs: 25.87 | +7: iteration 172260/ 173500 | consumed samples: 44098560 | consumed tokens: 90313850880 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.643001E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.817 | TFLOPs: 26.28 | +7: iteration 172270/ 173500 | consumed samples: 44101120 | consumed tokens: 90319093760 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.659789E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1675.730 | TFLOPs: 26.28 | +7: iteration 172280/ 173500 | consumed samples: 44103680 | consumed tokens: 90324336640 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.647551E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1635.197 | TFLOPs: 25.64 | +7: iteration 172290/ 173500 | consumed samples: 44106240 | consumed tokens: 90329579520 | elapsed time per iteration (s): 0.17 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.670077E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1475.163 | TFLOPs: 23.13 | +7: iteration 172300/ 173500 | consumed samples: 44108800 | consumed tokens: 90334822400 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.654731E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.301 | TFLOPs: 26.24 | +7: iteration 172310/ 173500 | consumed samples: 44111360 | consumed tokens: 90340065280 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.664233E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.551 | TFLOPs: 24.91 | +7: iteration 172320/ 173500 | consumed samples: 44113920 | consumed tokens: 90345308160 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.671395E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1671.798 | TFLOPs: 26.22 | +7: iteration 172330/ 173500 | consumed samples: 44116480 | consumed tokens: 90350551040 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.650385E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1634.994 | TFLOPs: 25.64 | +7: iteration 172340/ 173500 | consumed samples: 44119040 | consumed tokens: 90355793920 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.641048E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.442 | TFLOPs: 26.09 | +7: iteration 172350/ 173500 | consumed samples: 44121600 | consumed tokens: 90361036800 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.671063E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.111 | TFLOPs: 26.08 | +7: iteration 172360/ 173500 | consumed samples: 44124160 | consumed tokens: 90366279680 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.666650E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1607.105 | TFLOPs: 25.20 | +7: iteration 172370/ 173500 | consumed samples: 44126720 | consumed tokens: 90371522560 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.647062E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.488 | TFLOPs: 25.99 | +7: iteration 172380/ 173500 | consumed samples: 44129280 | consumed tokens: 90376765440 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.666941E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.025 | TFLOPs: 26.05 | +7: iteration 172390/ 173500 | consumed samples: 44131840 | consumed tokens: 90382008320 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.665400E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1647.597 | TFLOPs: 25.84 | +7: iteration 172400/ 173500 | consumed samples: 44134400 | consumed tokens: 90387251200 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.674760E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.022 | TFLOPs: 26.10 | +7: iteration 172410/ 173500 | consumed samples: 44136960 | consumed tokens: 90392494080 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.652231E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.130 | TFLOPs: 26.07 | +7: iteration 172420/ 173500 | consumed samples: 44139520 | consumed tokens: 90397736960 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.654234E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1639.726 | TFLOPs: 25.72 | +7: iteration 172430/ 173500 | consumed samples: 44142080 | consumed tokens: 90402979840 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.650830E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1650.126 | TFLOPs: 25.88 | +7: iteration 172440/ 173500 | consumed samples: 44144640 | consumed tokens: 90408222720 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.665403E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.044 | TFLOPs: 26.08 | +7: iteration 172450/ 173500 | consumed samples: 44147200 | consumed tokens: 90413465600 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.664331E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.309 | TFLOPs: 25.66 | +7: iteration 172460/ 173500 | consumed samples: 44149760 | consumed tokens: 90418708480 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.658618E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.942 | TFLOPs: 26.08 | +7: iteration 172470/ 173500 | consumed samples: 44152320 | consumed tokens: 90423951360 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.666365E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.882 | TFLOPs: 26.11 | +7: iteration 172480/ 173500 | consumed samples: 44154880 | consumed tokens: 90429194240 | elapsed time per iteration (s): 0.16 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.663189E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.914 | TFLOPs: 25.62 | +7: iteration 172490/ 173500 | consumed samples: 44157440 | consumed tokens: 90434437120 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.664746E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1655.684 | TFLOPs: 25.97 | +7: iteration 172500/ 173500 | consumed samples: 44160000 | consumed tokens: 90439680000 | elapsed time per iteration (s): 0.15 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.676845E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.637 | TFLOPs: 26.07 | +7: iteration 172510/ 173500 | consumed samples: 44162560 | consumed tokens: 90444922880 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.647383E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1632.925 | TFLOPs: 25.61 | +7: iteration 172520/ 173500 | consumed samples: 44165120 | consumed tokens: 90450165760 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.674851E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.664 | TFLOPs: 26.03 | +7: iteration 172530/ 173500 | consumed samples: 44167680 | consumed tokens: 90455408640 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.656152E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.839 | TFLOPs: 26.12 | +7: iteration 172540/ 173500 | consumed samples: 44170240 | consumed tokens: 90460651520 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.645370E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.326 | TFLOPs: 26.10 | +7: iteration 172550/ 173500 | consumed samples: 44172800 | consumed tokens: 90465894400 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.667454E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.417 | TFLOPs: 26.12 | +7: iteration 172560/ 173500 | consumed samples: 44175360 | consumed tokens: 90471137280 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.664960E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.741 | TFLOPs: 26.12 | +7: iteration 172570/ 173500 | consumed samples: 44177920 | consumed tokens: 90476380160 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.671325E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1625.066 | TFLOPs: 25.49 | +7: iteration 172580/ 173500 | consumed samples: 44180480 | consumed tokens: 90481623040 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.671550E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.688 | TFLOPs: 26.08 | +7: iteration 172590/ 173500 | consumed samples: 44183040 | consumed tokens: 90486865920 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.659285E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.413 | TFLOPs: 26.07 | +7: iteration 172600/ 173500 | consumed samples: 44185600 | consumed tokens: 90492108800 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.649364E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.680 | TFLOPs: 26.00 | +7: iteration 172610/ 173500 | consumed samples: 44188160 | consumed tokens: 90497351680 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.662974E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.011 | TFLOPs: 26.03 | +7: iteration 172620/ 173500 | consumed samples: 44190720 | consumed tokens: 90502594560 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.656975E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.756 | TFLOPs: 26.08 | +7: iteration 172630/ 173500 | consumed samples: 44193280 | consumed tokens: 90507837440 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.664530E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.254 | TFLOPs: 26.04 | +7: iteration 172640/ 173500 | consumed samples: 44195840 | consumed tokens: 90513080320 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.640089E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.770 | TFLOPs: 26.09 | +7: iteration 172650/ 173500 | consumed samples: 44198400 | consumed tokens: 90518323200 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.659066E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.952 | TFLOPs: 26.09 | +7: iteration 172660/ 173500 | consumed samples: 44200960 | consumed tokens: 90523566080 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.654396E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.335 | TFLOPs: 26.07 | +7: iteration 172670/ 173500 | consumed samples: 44203520 | consumed tokens: 90528808960 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.658992E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.145 | TFLOPs: 26.07 | +7: iteration 172680/ 173500 | consumed samples: 44206080 | consumed tokens: 90534051840 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.656178E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.812 | TFLOPs: 26.08 | +7: iteration 172690/ 173500 | consumed samples: 44208640 | consumed tokens: 90539294720 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.682330E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.218 | TFLOPs: 25.44 | +7: iteration 172700/ 173500 | consumed samples: 44211200 | consumed tokens: 90544537600 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.650806E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.799 | TFLOPs: 26.09 | +7: iteration 172710/ 173500 | consumed samples: 44213760 | consumed tokens: 90549780480 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.661390E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1630.615 | TFLOPs: 25.57 | +7: iteration 172720/ 173500 | consumed samples: 44216320 | consumed tokens: 90555023360 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.641594E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.039 | TFLOPs: 26.10 | +7: iteration 172730/ 173500 | consumed samples: 44218880 | consumed tokens: 90560266240 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.669539E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1574.175 | TFLOPs: 24.69 | +7: iteration 172740/ 173500 | consumed samples: 44221440 | consumed tokens: 90565509120 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.648971E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.671 | TFLOPs: 26.07 | +7: iteration 172750/ 173500 | consumed samples: 44224000 | consumed tokens: 90570752000 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.663444E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.531 | TFLOPs: 26.06 | +7: iteration 172760/ 173500 | consumed samples: 44226560 | consumed tokens: 90575994880 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.667662E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.334 | TFLOPs: 26.07 | +7: iteration 172770/ 173500 | consumed samples: 44229120 | consumed tokens: 90581237760 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.669387E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1653.420 | TFLOPs: 25.93 | +7: iteration 172780/ 173500 | consumed samples: 44231680 | consumed tokens: 90586480640 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.662232E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1652.688 | TFLOPs: 25.92 | +7: iteration 172790/ 173500 | consumed samples: 44234240 | consumed tokens: 90591723520 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.648875E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.174 | TFLOPs: 26.05 | +7: iteration 172800/ 173500 | consumed samples: 44236800 | consumed tokens: 90596966400 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.652589E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.921 | TFLOPs: 26.06 | +7: iteration 172810/ 173500 | consumed samples: 44239360 | consumed tokens: 90602209280 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.668760E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.958 | TFLOPs: 26.06 | +7: iteration 172820/ 173500 | consumed samples: 44241920 | consumed tokens: 90607452160 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.663983E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1660.691 | TFLOPs: 26.04 | +7: iteration 172830/ 173500 | consumed samples: 44244480 | consumed tokens: 90612695040 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.649601E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.236 | TFLOPs: 25.99 | +7: iteration 172840/ 173500 | consumed samples: 44247040 | consumed tokens: 90617937920 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.651383E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.920 | TFLOPs: 26.02 | +7: iteration 172850/ 173500 | consumed samples: 44249600 | consumed tokens: 90623180800 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.660255E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1622.349 | TFLOPs: 25.44 | +7: iteration 172860/ 173500 | consumed samples: 44252160 | consumed tokens: 90628423680 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.668409E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.910 | TFLOPs: 26.06 | +7: iteration 172870/ 173500 | consumed samples: 44254720 | consumed tokens: 90633666560 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.660431E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.196 | TFLOPs: 26.08 | +7: iteration 172880/ 173500 | consumed samples: 44257280 | consumed tokens: 90638909440 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.645950E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1637.340 | TFLOPs: 25.68 | +7: iteration 172890/ 173500 | consumed samples: 44259840 | consumed tokens: 90644152320 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.661964E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1661.286 | TFLOPs: 26.05 | +7: iteration 172900/ 173500 | consumed samples: 44262400 | consumed tokens: 90649395200 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.654019E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.019 | TFLOPs: 26.08 | +7: iteration 172910/ 173500 | consumed samples: 44264960 | consumed tokens: 90654638080 | elapsed time per iteration (s): 0.15 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.654869E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.051 | TFLOPs: 26.08 | +7: iteration 172920/ 173500 | consumed samples: 44267520 | consumed tokens: 90659880960 | elapsed time per iteration (s): 0.16 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.650823E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1624.826 | TFLOPs: 25.48 | +7: iteration 172930/ 173500 | consumed samples: 44270080 | consumed tokens: 90665123840 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.660559E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.474 | TFLOPs: 26.09 | +7: iteration 172940/ 173500 | consumed samples: 44272640 | consumed tokens: 90670366720 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663668E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1629.143 | TFLOPs: 25.55 | +7: iteration 172950/ 173500 | consumed samples: 44275200 | consumed tokens: 90675609600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.644390E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1636.303 | TFLOPs: 25.66 | +7: iteration 172960/ 173500 | consumed samples: 44277760 | consumed tokens: 90680852480 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663411E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.119 | TFLOPs: 26.11 | +7: iteration 172970/ 173500 | consumed samples: 44280320 | consumed tokens: 90686095360 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.655202E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.330 | TFLOPs: 26.10 | +7: iteration 172980/ 173500 | consumed samples: 44282880 | consumed tokens: 90691338240 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.662942E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.272 | TFLOPs: 26.10 | +7: iteration 172990/ 173500 | consumed samples: 44285440 | consumed tokens: 90696581120 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.660370E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1641.561 | TFLOPs: 25.74 | +7: iteration 173000/ 173500 | consumed samples: 44288000 | consumed tokens: 90701824000 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.661639E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1658.916 | TFLOPs: 26.02 | +7: ------------------------------------------------------------------------------------------------- +7: validation loss at iteration 173000 | lm loss value: 3.871380E+00 | lm loss PPL: 4.800859E+01 | +7: ------------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 173000 to checkpoints_44m91b100m +0: [2023-03-17 07:46:52,127] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step173000 is begin to save! +0: [2023-03-17 07:46:52,132] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:46:52,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:46:52,191] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:46:52,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:46:52,201] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:46:52,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:46:52,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:46:52,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:46:52,217] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:46:52,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:46:52,225] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:46:52,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:46:52,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:46:52,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:46:52,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:46:52,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:46:52,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:46:52,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:46:52,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:46:52,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:46:52,259] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step173000/mp_rank_00_model_states.pt +0: [2023-03-17 07:46:52,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/mp_rank_00_model_states.pt... +0: [2023-03-17 07:46:52,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/mp_rank_00_model_states.pt. +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:46:52,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:46:52,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:46:52,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,292] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,294] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,298] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:46:52,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +4: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +3: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +7: [2023-03-17 07:46:52,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +5: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:46:52,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +6: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:46:52,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +1: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:46:52,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +2: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:46:52,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:46:52,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173000 is ready now! +0: successfully saved checkpoint at iteration 173000 to checkpoints_44m91b100m +7: time (ms) | save-checkpoint: 197.45 +7: iteration 173010/ 173500 | consumed samples: 44290560 | consumed tokens: 90707066880 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.673196E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.791 | TFLOPs: 22.28 | +7: iteration 173020/ 173500 | consumed samples: 44293120 | consumed tokens: 90712309760 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665860E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.159 | TFLOPs: 25.89 | +7: iteration 173030/ 173500 | consumed samples: 44295680 | consumed tokens: 90717552640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.649943E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1643.782 | TFLOPs: 25.78 | +7: iteration 173040/ 173500 | consumed samples: 44298240 | consumed tokens: 90722795520 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663669E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.326 | TFLOPs: 26.02 | +7: iteration 173050/ 173500 | consumed samples: 44300800 | consumed tokens: 90728038400 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.681619E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1606.605 | TFLOPs: 25.20 | +7: iteration 173060/ 173500 | consumed samples: 44303360 | consumed tokens: 90733281280 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.670313E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.015 | TFLOPs: 26.06 | +7: iteration 173070/ 173500 | consumed samples: 44305920 | consumed tokens: 90738524160 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.661940E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1654.008 | TFLOPs: 25.94 | +7: iteration 173080/ 173500 | consumed samples: 44308480 | consumed tokens: 90743767040 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.659613E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1663.179 | TFLOPs: 26.08 | +7: iteration 173090/ 173500 | consumed samples: 44311040 | consumed tokens: 90749009920 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.661376E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1633.192 | TFLOPs: 25.61 | +7: iteration 173100/ 173500 | consumed samples: 44313600 | consumed tokens: 90754252800 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.658010E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1666.124 | TFLOPs: 26.13 | +7: iteration 173110/ 173500 | consumed samples: 44316160 | consumed tokens: 90759495680 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.654339E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.264 | TFLOPs: 26.12 | +7: iteration 173120/ 173500 | consumed samples: 44318720 | consumed tokens: 90764738560 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.664176E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.181 | TFLOPs: 26.11 | +7: iteration 173130/ 173500 | consumed samples: 44321280 | consumed tokens: 90769981440 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.656810E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.974 | TFLOPs: 26.11 | +7: iteration 173140/ 173500 | consumed samples: 44323840 | consumed tokens: 90775224320 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.659670E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.009 | TFLOPs: 26.10 | +7: iteration 173150/ 173500 | consumed samples: 44326400 | consumed tokens: 90780467200 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665228E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.029 | TFLOPs: 26.06 | +7: iteration 173160/ 173500 | consumed samples: 44328960 | consumed tokens: 90785710080 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665326E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.377 | TFLOPs: 26.07 | +7: iteration 173170/ 173500 | consumed samples: 44331520 | consumed tokens: 90790952960 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.655489E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.861 | TFLOPs: 26.08 | +7: iteration 173180/ 173500 | consumed samples: 44334080 | consumed tokens: 90796195840 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.646515E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1662.805 | TFLOPs: 26.08 | +7: iteration 173190/ 173500 | consumed samples: 44336640 | consumed tokens: 90801438720 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.647720E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1665.003 | TFLOPs: 26.11 | +7: iteration 173200/ 173500 | consumed samples: 44339200 | consumed tokens: 90806681600 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.655060E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.077 | TFLOPs: 25.85 | +7: iteration 173210/ 173500 | consumed samples: 44341760 | consumed tokens: 90811924480 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663324E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1659.462 | TFLOPs: 26.02 | +7: iteration 173220/ 173500 | consumed samples: 44344320 | consumed tokens: 90817167360 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.666891E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1664.523 | TFLOPs: 26.10 | +7: iteration 173230/ 173500 | consumed samples: 44346880 | consumed tokens: 90822410240 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.660157E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1588.084 | TFLOPs: 24.91 | +7: iteration 173240/ 173500 | consumed samples: 44349440 | consumed tokens: 90827653120 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.656379E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.616 | TFLOPs: 26.36 | +7: iteration 173250/ 173500 | consumed samples: 44352000 | consumed tokens: 90832896000 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.656527E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1681.072 | TFLOPs: 26.36 | +7: iteration 173260/ 173500 | consumed samples: 44354560 | consumed tokens: 90838138880 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.662444E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.833 | TFLOPs: 26.34 | +7: iteration 173270/ 173500 | consumed samples: 44357120 | consumed tokens: 90843381760 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665113E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.040 | TFLOPs: 26.33 | +7: iteration 173280/ 173500 | consumed samples: 44359680 | consumed tokens: 90848624640 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.652808E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1644.362 | TFLOPs: 25.79 | +7: iteration 173290/ 173500 | consumed samples: 44362240 | consumed tokens: 90853867520 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.648482E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.290 | TFLOPs: 26.35 | +7: iteration 173300/ 173500 | consumed samples: 44364800 | consumed tokens: 90859110400 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.646376E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.664 | TFLOPs: 26.34 | +7: iteration 173310/ 173500 | consumed samples: 44367360 | consumed tokens: 90864353280 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.662030E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1645.107 | TFLOPs: 25.80 | +7: iteration 173320/ 173500 | consumed samples: 44369920 | consumed tokens: 90869596160 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.657200E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.213 | TFLOPs: 26.35 | +7: iteration 173330/ 173500 | consumed samples: 44372480 | consumed tokens: 90874839040 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663614E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.864 | TFLOPs: 26.36 | +7: iteration 173340/ 173500 | consumed samples: 44375040 | consumed tokens: 90880081920 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665649E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.226 | TFLOPs: 26.35 | +7: iteration 173350/ 173500 | consumed samples: 44377600 | consumed tokens: 90885324800 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.660918E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.071 | TFLOPs: 26.35 | +7: iteration 173360/ 173500 | consumed samples: 44380160 | consumed tokens: 90890567680 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.658480E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.009 | TFLOPs: 26.33 | +7: iteration 173370/ 173500 | consumed samples: 44382720 | consumed tokens: 90895810560 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.654260E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.632 | TFLOPs: 26.34 | +7: iteration 173380/ 173500 | consumed samples: 44385280 | consumed tokens: 90901053440 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.656451E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.738 | TFLOPs: 26.33 | +7: iteration 173390/ 173500 | consumed samples: 44387840 | consumed tokens: 90906296320 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.652847E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1680.789 | TFLOPs: 26.36 | +7: iteration 173400/ 173500 | consumed samples: 44390400 | consumed tokens: 90911539200 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.650640E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1679.881 | TFLOPs: 26.34 | +7: iteration 173410/ 173500 | consumed samples: 44392960 | consumed tokens: 90916782080 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.649874E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1648.621 | TFLOPs: 25.85 | +7: iteration 173420/ 173500 | consumed samples: 44395520 | consumed tokens: 90922024960 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.675017E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1651.472 | TFLOPs: 25.90 | +7: iteration 173430/ 173500 | consumed samples: 44398080 | consumed tokens: 90927267840 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.662194E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1667.109 | TFLOPs: 26.14 | +7: iteration 173440/ 173500 | consumed samples: 44400640 | consumed tokens: 90932510720 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.663636E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1657.538 | TFLOPs: 25.99 | +7: iteration 173450/ 173500 | consumed samples: 44403200 | consumed tokens: 90937753600 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.661227E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1673.180 | TFLOPs: 26.24 | +7: iteration 173460/ 173500 | consumed samples: 44405760 | consumed tokens: 90942996480 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.665894E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.124 | TFLOPs: 26.32 | +7: iteration 173470/ 173500 | consumed samples: 44408320 | consumed tokens: 90948239360 | elapsed time per iteration (s): 0.16 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.641941E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1604.854 | TFLOPs: 25.17 | +7: iteration 173480/ 173500 | consumed samples: 44410880 | consumed tokens: 90953482240 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.658032E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1678.650 | TFLOPs: 26.33 | +7: iteration 173490/ 173500 | consumed samples: 44413440 | consumed tokens: 90958725120 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.657109E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1674.474 | TFLOPs: 26.26 | +7: iteration 173500/ 173500 | consumed samples: 44416000 | consumed tokens: 90963968000 | elapsed time per iteration (s): 0.15 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.658786E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1677.381 | TFLOPs: 26.31 | +0: [after training is done] datetime: 2023-03-17 07:48:09 +0: saving checkpoint at iteration 173500 to checkpoints_44m91b100m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.828378E+00 | lm loss PPL: 4.598789E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-17 07:48:09,364] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step173500 is begin to save! +0: [2023-03-17 07:48:09,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 07:48:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 07:48:09,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 07:48:09,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 07:48:09,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 07:48:09,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 07:48:09,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 07:48:09,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 07:48:09,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 07:48:09,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 07:48:09,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 07:48:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 07:48:09,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 07:48:09,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 07:48:09,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 07:48:09,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 07:48:09,486] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 07:48:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 07:48:09,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 07:48:09,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 07:48:09,496] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt +0: [2023-03-17 07:48:09,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 07:48:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +7: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-17 07:48:09,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-17 07:48:09,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,533] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-17 07:48:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 07:48:09,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-17 07:48:09,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 07:48:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 07:48:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: successfully saved checkpoint at iteration 173500 to checkpoints_44m91b100m +END 3327057: Fri 17 Mar 2023 07:48:33 AM EET diff --git a/44m91b100m/3328579.err b/44m91b100m/3328579.err new file mode 100644 index 0000000000000000000000000000000000000000..0ee589464d0a0f82221d1f37b3b79f29e19d1bab --- /dev/null +++ b/44m91b100m/3328579.err @@ -0,0 +1,301 @@ +1: 2023-03-17 09:45:56.015307: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015316: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015320: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015315: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015309: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:56.015317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015781: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015785: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015791: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015785: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015793: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 09:45:56.015824: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 09:45:57.642865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642866: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642876: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642866: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.642868: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:45:57.643592: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643591: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643595: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643596: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643597: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643597: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643598: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:45:57.643602: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.700864: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700874: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700878: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.700880: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:45:57.701430: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701429: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701436: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701439: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701439: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701440: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701443: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 09:45:57.701446: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 09:46:02.002716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002772: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002776: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002726: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002772: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 09:46:02.002730: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.002780: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023816: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023819: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023826: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023826: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023830: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023834: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023835: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023836: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023858: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023860: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 09:46:02.023870: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 09:46:02.023874: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004835: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004849: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004850: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004848: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004852: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004852: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004868: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004869: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004869: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 09:46:02.004871: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004872: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004871: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 09:46:02.004882: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/44m91b100m/3328579.out b/44m91b100m/3328579.out new file mode 100644 index 0000000000000000000000000000000000000000..780e06331879928c15bc97f57092d339a8de2f2c --- /dev/null +++ b/44m91b100m/3328579.out @@ -0,0 +1,1367 @@ +Model parameters: d_model 512 ffw_size 2048 kv_size 64 n_heads 8 n_layers 8 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 8 --hidden-size 512 --num-attention-heads 8 --kv-channels 64 --ffn-hidden-size 2048 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 64 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-44m91b100mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_44m91b100mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_44m91b100m --load checkpoints_44m91b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3328579.json --zero-stage 0 +START 3328579: Fri 17 Mar 2023 09:45:34 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 46.0c 79.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +1: Launching on nid007319 (1/2), master nid007318 port 9999, GPUs 8, CUDA: True +0: Launching on nid007318 (0/2), master nid007318 port 9999, GPUs 8, CUDA: True +0: using world size: 16, data-parallel-size: 16, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 16 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3328579.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2048 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 64 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 512 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-44m91b100mval +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_44m91b100m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 8 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 8 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_44m91b100m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_44m91b100mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 16 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 09:46:19,840] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +1: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.096 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 25.302 seconds +0: time to initialize megatron (seconds): -25.909 +0: [after megatron is initialized] datetime: 2023-03-17 09:46:46 +0: building GPT model ... +0: [2023-03-17 09:46:46,223] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 09:46:46,223] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 09:46:46,224] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.83 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15} +0: [2023-03-17 09:46:46,692] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=15 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: undo +0: 12: MixedFusedLayerNorm +0: 13: EmbeddingPipe +0: 14: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 09:46:46,901] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 09:46:46,901] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:46,902] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.84 GB, percent = 6.1% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 09:46:46,903] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 09:46:57,099] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 09:46:57,100] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 09:46:57,100] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 09:46:57,102] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 09:46:57,102] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 09:46:57,215] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 09:46:57,216] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,216] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.52 GB, percent = 6.3% +0: ninja: no work to do. +0: Time to load utils op: 0.14719820022583008 seconds +0: Time to load utils op: 0.2047121524810791 seconds +0: Time to load utils op: 0.204634428024292 secondsTime to load utils op: 0.20467090606689453 seconds +0: +0: Time to load utils op: 0.20447039604187012 seconds +0: Time to load utils op: 0.2046222686767578 seconds +0: Time to load utils op: 0.20471978187561035 seconds +0: Time to load utils op: 0.0007240772247314453 seconds +1: Time to load utils op: 0.21128535270690918 seconds +1: Time to load utils op: 0.21129918098449707 secondsTime to load utils op: 0.21130156517028809 secondsTime to load utils op: 0.21129941940307617 seconds +1: +1: +1: Time to load utils op: 0.2113037109375 secondsTime to load utils op: 0.2113049030303955 secondsTime to load utils op: 0.21130585670471191 seconds +1: +1: +1: Time to load utils op: 0.21131181716918945 seconds +0: Time to load utils op: 0.10267210006713867 seconds +0: Time to load utils op: 0.00036334991455078125 seconds +0: Time to load utils op: 0.00041961669921875 seconds +0: Time to load utils op: 0.0003981590270996094 seconds +0: Time to load utils op: 0.0004010200500488281 seconds +0: Time to load utils op: 0.0004069805145263672 seconds +0: Time to load utils op: 0.0003941059112548828 seconds +1: Time to load utils op: 0.0008432865142822266 seconds +1: Time to load utils op: 0.0009980201721191406 seconds +1: Time to load utils op: 0.0010347366333007812 seconds +1: Time to load utils op: 0.0011627674102783203 seconds +1: Time to load utils op: 0.0012624263763427734 secondsTime to load utils op: 0.0011997222900390625 seconds +1: +1: Time to load utils op: 0.0012540817260742188 seconds +1: Time to load utils op: 0.0012218952178955078 seconds +0: [2023-03-17 09:46:57,440] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 09:46:57,440] [INFO] [utils.py:828:see_memory_usage] MA 0.1 GB Max_MA 0.1 GB CA 0.11 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,440] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:57,552] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 09:46:57,553] [INFO] [utils.py:828:see_memory_usage] MA 0.25 GB Max_MA 0.25 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,553] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.68 GB, percent = 6.3% +0: [2023-03-17 09:46:57,653] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 09:46:57,653] [INFO] [utils.py:828:see_memory_usage] MA 0.25 GB Max_MA 0.25 GB CA 0.33 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,654] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.68 GB, percent = 6.3% +0: [2023-03-17 09:46:57,754] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 09:46:57,755] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,755] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:57,855] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 09:46:57,855] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,855] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:57,956] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 09:46:57,957] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:57,957] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:58,055] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 09:46:58,055] [INFO] [utils.py:828:see_memory_usage] MA 0.3 GB Max_MA 0.3 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:58,056] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:58,159] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 09:46:58,160] [INFO] [utils.py:828:see_memory_usage] MA 0.33 GB Max_MA 0.33 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:58,160] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:58,258] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 09:46:58,259] [INFO] [utils.py:828:see_memory_usage] MA 0.33 GB Max_MA 0.33 GB CA 0.38 GB Max_CA 0 GB +0: [2023-03-17 09:46:58,259] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.67 GB, percent = 6.3% +0: [2023-03-17 09:46:58,259] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 09:46:58,259] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 09:46:58,259] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 09:46:58,259] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 09:46:58,260] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 09:46:58,261] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] train_batch_size ............. 64 +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] world_size ................... 16 +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 09:46:58,262] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 09:46:58,262] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 64, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0003993511199951172 seconds +0: [2023-03-17 09:46:58,262] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-17 09:46:58,339] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=15 [0, 15) STAGE_PARAMS=52024320 (52.024M) TOTAL_PARAMS=52024320 (52.024M) UNIQUE_PARAMS=52024320 (52.024M) +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-17 09:46:58,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-17 09:46:58,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-17 09:46:58,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-17 09:46:58,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-17 09:46:58,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-17 09:46:58,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-17 09:46:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-17 09:46:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-17 09:46:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-17 09:46:58,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-17 09:46:58,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-17 09:46:58,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-17 09:46:58,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-17 09:46:58,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-17 09:46:58,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-17 09:46:58,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-17 09:46:58,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-17 09:46:58,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-17 09:46:58,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-17 09:46:58,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-17 09:46:58,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-17 09:46:58,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-17 09:46:58,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: > overriding learning rate value to 0.0002 +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-17 09:46:58,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-17 09:46:58,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 09:46:58,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,925] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +1: [2023-03-17 09:46:58,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,934] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +1: [2023-03-17 09:46:58,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,937] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +0: [2023-03-17 09:46:58,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,944] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +0: [2023-03-17 09:46:58,944] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +1: [2023-03-17 09:46:58,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,945] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +1: [2023-03-17 09:46:58,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,954] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +1: [2023-03-17 09:46:58,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,955] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +0: [2023-03-17 09:46:58,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,955] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +1: [2023-03-17 09:46:58,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,956] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +0: [2023-03-17 09:46:58,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +0: [2023-03-17 09:46:58,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +0: [2023-03-17 09:46:58,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +1: [2023-03-17 09:46:58,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,960] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +0: [2023-03-17 09:46:58,973] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +0: [2023-03-17 09:46:58,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 09:46:58,975] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +1: [2023-03-17 09:46:58,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 09:46:58,980] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +1: [2023-03-17 09:46:58,987] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +1: [2023-03-17 09:46:58,989] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +1: [2023-03-17 09:46:59,008] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +0: [2023-03-17 09:46:59,014] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +1: [2023-03-17 09:46:59,019] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +1: [2023-03-17 09:46:59,027] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +1: [2023-03-17 09:46:59,060] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +0: [2023-03-17 09:46:59,066] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +0: [2023-03-17 09:46:59,094] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +0: [2023-03-17 09:46:59,095] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +1: [2023-03-17 09:46:59,097] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +1: [2023-03-17 09:46:59,192] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +0: [2023-03-17 09:46:59,197] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +0: [2023-03-17 09:46:59,309] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +0: [2023-03-17 09:46:59,323] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +0: successfully loaded checkpoint from checkpoints_44m91b100m at iteration 0 +1: time (ms) | load-checkpoint: 1023.01 +0: estimated model parameters: 0.05202432 +0: estimated model parameters without embeddings: 0.025220096 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 09:46:59 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 6400 +0: test: 6400 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.005924 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.010 seconds +0: total number of samples: 48805 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.034780 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_6400ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.009 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 09:47:11 +0: done with setup ... +1: time (ms) | model-and-optimizer-setup: 13532.53 | train/valid/test-data-iterators-setup: 11601.30 +0: training ... +0: [after training is done] datetime: 2023-03-17 09:47:11 +1: ----------------------------------------------------------------------------------------------------------------- +1: validation loss at the end of training for val data | lm loss value: 3.836200E+00 | lm loss PPL: 4.634900E+01 | +1: ----------------------------------------------------------------------------------------------------------------- +END 3328579: Fri 17 Mar 2023 09:47:28 AM EET diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7690301c57db80579e9b6f3c1c9c390c85f06015 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a23eeddc2bb9637d5e27b18c221885509188a2e746e38654ba43c38084074a +size 9759063 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fc090dcdf5a445d12a60d52a8746be6280911b4 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9ac27a3829c8100532934ad283ef9892d07eb2f7ce3f6977eb5d72e2a966cc +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbe1c561e3863ac1f5f21393a02992e596219458 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b26d320100470f0d104acc5e8744223b3f412d10977ff8d4b7c173465b9fd2aa +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f04259edb83d8a1efb3c18ad0e05dfa6201c6eb7 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4899981cabb11a95a98fdc9f16f14bb88cb72342abc6bdf20f215a5125600ae7 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1cb0f73077dac51d880f39af69a3c4ab59a3473 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adda105d2e058199634a6aed8a8608eeadd1717a3f452a24642e7168dd333ee5 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3a2c10bccf7a791dcbb86284e11b2e40fb018e5 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:817f55084dee88cd90196a8aca81f578a2ebc6c010b421d8422172074686cdc5 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c56cf0ac62be23e00646c5918fcddc51c3b530f7 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e9c0f0345d6376fb0ab586d7f5568d0adf240547be4ede09925ed736e651725 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13c7761cdeb60f53e507f932d4570f88ff4d5e24 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74ee17ea83f770f92d24be87a3f7e72a114d212f0f86950c859f937436836fee +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2da82a14616fa3ad19818477fa2b3e2e44c1a918 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2747969388b1b0f585e23a882546671ae433ebacdabaf9733921161f85cd96 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ead94705eba81dc7a47d534d24211389d491af8 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2801f151aac7ec73e03eb860fb4e2b7799b295720204d819774086331b6b412 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5ca00e7f002a4f8395ef0f4b81fba319af29846 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b6519a9d69f82a3f1dff48dc8df488e216fbb6c1b13099ebb3d8b77bfb06f4f +size 9759202 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bedd55c87a29fec99e0c3b2e10332ec6b647109 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dae0ee38ee06711896fb1799bac9a089e6c9edb4f22149e68f63b7e22266196 +size 9759063 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e7e3f07bc5ad56040db375de6aedf43192cacdf --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75f320b65b4a01cd6da7a8031d7ace636f10091f54dd30559eb77a7ae031ca3b +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e88786a5896d9b1f4b836c2399b2fefdc3bc0c9 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:befe70364a3059657d1597d0818a39de8843795b9718e8d3bbf6d00f4f29fe47 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aff36808d62c22c7199b3b9d9e486211f5a3c53 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e1771f46ae87c95f6b65a28edcacb081816c2df352e2773e62bf67eed417cc +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a9285c957e20c7811b89ed5daee8a5230f86bca --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e56018de7ef3b5c06cd13d7939154b9a8cf5780f30fae2077458fc7d526079e +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2db29dd803442908389884ba9fca5588e2e4cca5 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27860ea589acd56a3e22915b44ccb7e0b723ac5abea1cd96645c49bed6d04e9e +size 9759202 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..868bfc62c281aac107c0ddbc16c8fd0c4a6905b0 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983cfc182f6db5f89431deb1ce8119728d476a58f6a04e30a9215265561b6f0b +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..982cb811629dcbb2636ed95a218dd333bbdb6284 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60063e49a1be84b66e605473fa427bea7357b466a0c7ae74e07c0c4221968548 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca7f08f2418e97d158b8f4ee13d8efb3ef3c5409 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4defdb05c540656e567e759e5f217302fdf55dac28327961a281624a52fdec9d +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2d3fd9e32a8ea7334d7f1160eae99c06ab6c091 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dacf6b157b694d48d3e8abe41a62cbec6e355fbfb11bdac39bbcd2d6d975859 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6f503fc079017443b3fb82f39a98fb6c684995c --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1abf63c7e50478e31669df6dbc472d0018b21f1bdbbad52e722de219907b1950 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84fe030be5a2f675583a5c531c84662e3bb8e045 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ca3de8351e21e255e4721666189b6456794fe4284f3acb9a1f873413e286f9 +size 9758999 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..998a27bc1e67e65f938e4b77098a72b40fdca8dc --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12f213a6edb71df903728c440513ef61c013d4c4f6445d0e8c499a57f2ef827d +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c267adb28f262f6979cc4fbaf59f7072ad850b4c --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:402d6066247c164c6e14c736103534306f2a3fd476b0b01b432700f66f213abc +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d1e59bce99093071adb8c5de76d67d19e4c5813 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e2afa21b02043f100211028544f969009bc3c26ae8ba5388d599e2ac558a7cf +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3033b6bfed6ef284bf3253c1e8ec3197b6b3d5be --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34475d05eed31e9fb1ce9403d10c4f015f89ef06d261d9343dd2be62278c4be4 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e936b1fd5fff08136350ca347034f338adc3fa77 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64bea8e99629eb4a63d54be02ef7a635be96d37e0aebbd7c1a25d947c0ef78e +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8cff714ea08f81ce0fc1ed079cb25cd1fc926f --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6de4decefb12986cf86aa8bcfc5f4ed42cf32f36ca5a3d849455aa2b1bbe769f +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b66624eae4774ee833ad6b352e19ddc41e7f867b --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b10a0061a3a7a1d314f677c324d8f84986473893aad5ddd5b5c87cda56c67dac +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1ec6f011020f0fab138f95b6d0b4a2f7f9b57ef --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d20ba977a408d7d4a474e4a87c652cfbac48453f1eeb21c9f293af65efa39d8 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6b28a9e10a11df308fdbab3a48c6cc7479c7904 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd12d0e3e8e8713c4e0399f4ff6acc13b4bab52bc19b23cc5b335522d63a5f2c +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b93c48ec055ec2136d41b199bc80e72a7a58001 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0a99d1838e78614c327cbd12cdd67ba83d0dec9d4990aa998c12303c4ad616 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc220d37590ed45f16c7e14d20244722e7cb6094 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dda8c4ae4acbe918670fc893f6aa424614b84f705ad5a2e5d76c6b8714f754 +size 9759127 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..829b0c06e2e821bc3b502e9db2a984e744aea232 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e122c940f8fd61f38a68d93774b60b319ad42da2d8af11a6f54edc096e1b170 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec45b73a26e25a93698355f1c18e22ba7bbbf717 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcfa3905936c01d7adc38064b6b96aec6ff04e5ca0134abc0667ea467c6838e3 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7c97843edf1e182a50314aeb0424201e298f32d --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed8206e43183e14f68f382aeeda98785e2c57b5968c975b331a39345048fb8f +size 9759202 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79d65d0a7a62f228774d30e3f3af2a24d2abf60d --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858a672ddb8c33a46d3900a2298e7f48c6bb2d74f3264bf8f66be84bbcf60699 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ef78c08a6952552cf73e190b609eac09fedd872 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1089267b7eb5444749cf5a9da60f36c47409110470995104d2bd42da243e4fa4 +size 9759202 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21afbc9cee3aa57021bd50d91cf7c615fd3f10b4 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b932efe120f45b1b5cab169398afbe1b59dfb4fda9312d80bf197cf044dc334 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c70d6580eb3a185c5e4b4cf88546341b2098f2fd --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c33f968bdf8bb7212bd0c022bb2cdaad535330c6a758475d0d02603d5f5dca +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6a3d8d95fb9e372872b07375e30ba0c4aa9d052 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c177bdf26281cb85a0884db6b662d48349d490440c2a920b9485785e3259295 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f176d3481fd2e12f4bd5bcbc55a81e1fb439e754 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b203db00caa9a838a82a9794d9021e5beac8fcb59189cce314e8d96cc3d3e5 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b71d0c2406037fbff4d7213df67bd107eef580ed --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d513ada4463050794bd47d3192069b7fef6f46a21007f0f3c445ce8b4a93a14 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77397593230fe431085a7e134413da452332e482 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39029219ccb81d8b07e9d570b7516fee3fa2deb78275951ad029d1545bcfe324 +size 9759191 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..809c2856eb13e3eaadf8f15993d1b5e6d2fc19dc --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47be2d92de1370c10a24bddfa89bfcca2bd50aaa69d6fd436499525e5254ebea +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e96358e86586ef8bed933251c3f848c98db2bc7 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5e9085954ca06de1b0c7d3f6d6c3d7c630e0d2b2af4b2ba6cfe2d74ea70ce68 +size 9759266 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6ea23fa66194f092108f09a4fd4c7bd1f0b7d2d --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a11bbd88c5ad8d35edbf14739cd2dbcb6b3f6e8cca34f5ae308b33a093504238 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b3f271cf90efc348f4da64dd0004c5c73ff7267 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff4b3d1b2a72640217d726e005dc8d9bfa1a168b570f3ed6cafec88c852a72a0 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b985894e2eacb3962871e6a83e9a7e98c8fdc08 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c7e8f43403c73d7375a9954e6fac97193f29ac3f84acae31346a8b5a7ff8b3 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fab5e479291dfc4794d1e90fb1bab5cd56ee62c --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6317cb8b41b3f53c34a729159f5a3bbb9ceadbafd1c55957d3d0dbafb507ff2c +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85e50f8cdc6afb723ca7265e20a9f290e5f81f6e --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562f3a6839fbaa17a00ba95a554f00579f4530bb86d615743eea9bdc2a81ff78 +size 9759202 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbfebd9c142df462f6aadfe06e40c6535c18c7dc --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be0cc8b01392558aedd1527dc567317363db8f061b573ff5c62c83d98c7557c +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7449de519ea8618c9879168b38f0dfccfc70f31 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e7640c578b57cbac47df6df24f19800171d29c219167f150aa80397b9a7fdb +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..847e1c7d9b627034e97a8b3bf8c3f76c28b474ee --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:282dbc4cf3efc6484c96b3df846fa1dd18c93015e3dd00e4c04b62fcf1fb74b5 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6df3b413fe44f207875b2ac4d145abd291cfbbdd --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179e3d0243b7233d23fb933a3d9e07514c9887148b51cb52547faa549b8a6f34 +size 9758999 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03f10be5464ec91435e94cb215b9b846dde6a4d7 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7146de6b852e5c1156cc7d78d4929a075c75bcfd201d487046143396a58be52 +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11a652c754f3b68cf22aae1f354dc3bf6eb0aa99 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffde73f7921901d0b8b819524fcd0e98c69eed695f938484ca0f2ae913c245ce +size 9759074 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..64ac04e9ef4eb48d0f51314b427abef09d1b0bc9 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8adc09555ea5528381384568e4431995473dcec147668e7f859773a9546c5637 +size 9759138 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f9b7f8704e5497a2299b08bdd2e058d3e34e8c8 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:996e3c53d6255541166d83b40ebecc24fbfadd4ee54549d4f37137b11e8f8615 +size 9759010 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..550472a4e95cf3a63ff092535f95e45bf9dc91d7 --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1f21f651ebbaa81fdbf8a38f03469813c4e5c019a8b654ad27961934c2a879 +size 9758999 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f56b2ff4f88150d3bb8735813fca8f5ad45522b --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ff1d5b23e77c01685e48a0ef5184f71c02e75abb40b3e88edc54578c562130 +size 9759127 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c312045d369c6fc6fc015737bd55ac98d2820dea --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cff44112e287c95d20838fe371987152f060baf826ee6045123bb121e43f8b +size 9759127 diff --git a/44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cf63504cc49bb32b8364254d494e344d0869cff --- /dev/null +++ b/44m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2356cce18f90e9d397c2f2b6a82b6ceb105371426fd410f8165c5d2fd9c0ff6 +size 9759127 diff --git a/44m91b100m/global_step173500/layer_01-model_00-model_states.pt b/44m91b100m/global_step173500/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29ac62b1cf2df5859aa0657e4f8c82160b541ca7 --- /dev/null +++ b/44m91b100m/global_step173500/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7721598a33dc93b5e89747753cf3455f9810fe66cbd4c05709d15f4ee6ff093 +size 53609731 diff --git a/44m91b100m/global_step173500/layer_03-model_00-model_states.pt b/44m91b100m/global_step173500/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bc21aede3ff952b81ca9ff95f5b81fba7fc12ff --- /dev/null +++ b/44m91b100m/global_step173500/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda03628455ad4ccbcd4822e544733a40603eb49bc86f176cde4f8a4bd71f219 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_04-model_00-model_states.pt b/44m91b100m/global_step173500/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fb16ee268258b4df2764d4e531274a6981d0a69 --- /dev/null +++ b/44m91b100m/global_step173500/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebe5ddd1cb8ed89e9e7a3aa5a03493405106b3fb53fe6d473cb4eff4c012364 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_05-model_00-model_states.pt b/44m91b100m/global_step173500/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1ef83a16be161d5f757cfae508e00aad7f9c171 --- /dev/null +++ b/44m91b100m/global_step173500/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c23bc622f95614cf03ab16cef9d1bb74d0f2c981648c6a10e403ae4cb193a5f +size 6309123 diff --git a/44m91b100m/global_step173500/layer_06-model_00-model_states.pt b/44m91b100m/global_step173500/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..262bf7c413225e4560f317358179810eb5c967aa --- /dev/null +++ b/44m91b100m/global_step173500/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6402cccce6f11d15487b3a5fbd04144f31ea48aa50439f48e4d5e547851bd5 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_07-model_00-model_states.pt b/44m91b100m/global_step173500/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79b4116c63142fd9d64177521c66799d43d117cb --- /dev/null +++ b/44m91b100m/global_step173500/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c69610d4e356905f069030446ab648f8b28a64a66575390f0a0d486fce2c45a4 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_08-model_00-model_states.pt b/44m91b100m/global_step173500/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7129ee67a455fb8e677dfcce2b30dfa1cad8c018 --- /dev/null +++ b/44m91b100m/global_step173500/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97e86f9683cdf818a5b741802aed5ccc0ccb3a570b8398627f8afe0e9076ac5 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_09-model_00-model_states.pt b/44m91b100m/global_step173500/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06f75099df3b4c7a447d1eb9948927f7a79e6f1b --- /dev/null +++ b/44m91b100m/global_step173500/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb151779f8782e37be564f024c42d0387289d1ed6128ef13a60a754709960d57 +size 6309123 diff --git a/44m91b100m/global_step173500/layer_10-model_00-model_states.pt b/44m91b100m/global_step173500/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92f2e16ce51cdde23261730f0b60924fa2568968 --- /dev/null +++ b/44m91b100m/global_step173500/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f6f7ff66da463532d252e41919ac432a88d1e6d8b080c29b05ba5c6a7a8aeb +size 6309123 diff --git a/44m91b100m/global_step173500/layer_12-model_00-model_states.pt b/44m91b100m/global_step173500/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b39594bd772af10bc1f4af0af5acaed026d8a50 --- /dev/null +++ b/44m91b100m/global_step173500/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c0d1642254b84e116e7ee006bf23c731455834c7165299b960c12394f40dae0 +size 3267 diff --git a/44m91b100m/global_step173500/mp_rank_00_model_states.pt b/44m91b100m/global_step173500/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78434f6aed620a3adfc2e032e7ab5e905a6b38ec --- /dev/null +++ b/44m91b100m/global_step173500/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc88edc8ea92994041868516ceddc2703495434de1544a81ae675f57bf8d01d7 +size 30131 diff --git a/44m91b100m/sbatch_44m91b100m.sh b/44m91b100m/sbatch_44m91b100m.sh new file mode 100644 index 0000000000000000000000000000000000000000..71c48eadf76d3bd14fb6b0569188aad6361feb97 --- /dev/null +++ b/44m91b100m/sbatch_44m91b100m.sh @@ -0,0 +1,171 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=44m91b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_44M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +# TRAIN_SAMPLES=9_703_701 +# Tokens: 90964260000 +# -> Samples: 44416143 +TRAIN_SAMPLES=44_416_143 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 444_161 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/44m91b100m/sbatch_44m91b100mval.sh b/44m91b100m/sbatch_44m91b100mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..83f2a3d8a704344b8fbc3fbe8a9e655a4561f22f --- /dev/null +++ b/44m91b100m/sbatch_44m91b100mval.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p small-g +#SBATCH -t 12:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=44m91b100mval +VARIANT_CKPT=44m91b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_44M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/44m91b100m/tensorboard_44m91b100m/events.out.tfevents.1679004828.nid006236.118767.0 b/44m91b100m/tensorboard_44m91b100m/events.out.tfevents.1679004828.nid006236.118767.0 new file mode 100644 index 0000000000000000000000000000000000000000..3f390cce887fb05a31b03bd236f58b2688e48e9b --- /dev/null +++ b/44m91b100m/tensorboard_44m91b100m/events.out.tfevents.1679004828.nid006236.118767.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04e0ac2b4d56db8e6c6dc5e95a32a77b43828acef6fabe1de4fbf23213981c90 +size 311124377 diff --git a/44m91b100m/tensorboard_44m91b100mval/events.out.tfevents.1679039180.nid007319.8773.0 b/44m91b100m/tensorboard_44m91b100mval/events.out.tfevents.1679039180.nid007319.8773.0 new file mode 100644 index 0000000000000000000000000000000000000000..b67c3f3847203a7567e65adacdf700951bb7ec20 --- /dev/null +++ b/44m91b100m/tensorboard_44m91b100mval/events.out.tfevents.1679039180.nid007319.8773.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b145551e333151c7c6366693631cad3a4e70d8961e8dd0a71c7fabe4360b191 +size 980 diff --git a/619m22b1b5/eval.txt b/619m22b1b5/eval.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6de1f0971cf43f1503cfaf062fe77e432727b86 --- /dev/null +++ b/619m22b1b5/eval.txt @@ -0,0 +1 @@ +2.973545E+00 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..895ba254c7e99b32af608abdc3fa6ee0765dc591 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:545b9d52d3a2d9112476c4239edfd383b0689c2d04db0dc1b4942fc6cd8754d3 +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c7e1c6ba084c6ea52452228356404021d1f4672 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde29077779d23ed13fc18fb90cd3d1c6d861fd648b0bd2fa37ac00e85bb7eb7 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42186fe5c41bf1774d719ce90e046127e1db7637 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:765585b6467cd5e0f97ff898a345f79721b6be7b14db319e822bf54eaa52b002 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eed616c651a88d4a6e0465201fb91d3c3a027cb7 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37336f4dee7b97db5d6082c26276432b19649a97cdac174a3bb9a4917edeb83e +size 116013602 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce7de4ce20d9bd400bec64d05f963d249db3473d --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f801e02fcc18b7bfa7e21f0cc157e87b6b17f8c4f8d90a80b6c46d590592d3e +size 116013858 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2705589a833443171466a4b280eb4c1b505da1ae --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2babb7118df46756f256b183e237666cc17f388c9fea3c156a1583f670e7225a +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4b406309b77c17231350916f24a67156d7a89a2 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8464897a366d532b5215cf5d46a6b8f42684e09e50346c460af39e457bb3c6 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..360c2c9de617a214ef572a6dd73c7a8eb613b29a --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b52910561dde90b1049a2cc0d6b24d78f7ecd3631ed6f1f8d2d3302a1f32f77 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c263ad097743ce389a76ca228f3190dc7797622 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a22513b4b9ab8ed956323c16049f0e8410459b0f339d1b1e851024f6b6f95d +size 116013602 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2afefa740785beb310b1b8c97bfda083314d3f3c --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db89e477b8aa21907be5fe00f99e55866feff18148f8fd598bc944a0d3dca4f4 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..098cccec1ae70fc4dd9e7c04f4cae5e43c54de57 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2181cbacbdd6687b1cb8c07654ea16ee10e3ad445b52636c84652c2fb0aa7ad7 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de445ae8020480d4e4844ae2215c748a96634d04 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cea35b4023be075fb7e07137afe1a6c9b73f8f656dae530fb682d05a8e2213ba +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4823e7b87e526c6f7f60976cc1c89f5cf53ec421 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e4bab851eb44c7b76b116a8e0c833652585a0ce0bdcd019b0aeee1dca50b98 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bacc246c9cefd4b8d5d4153bcd207988db0df4f4 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92006d62787dc8d36f5505a403350809a3a3370fb19630dd2b64cce5f0f5395a +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24a7b5ca9f9f0098782e99acaa0513c203516231 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d49bf1a07883ec473ac138db82b162806ea24e674f9967024f64a83bb297269 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..054eb2d148e1946d940062debcf2be8a266b0e02 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b1dc5e7d22bc213d51541ba5a1090e64a7c07511c8821d927c3bab48d3f0bc +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..421f0164c140a080fcd70401e08df0a2c3fa1a2b --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0c4e81ba80a599089b5ad3d279ee86b4a2f3f72e3654a00ce3ac59dec717b62 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c724b3f240c81c58fbc7cf90d73b52f2dd575951 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac020db5fe5c7f60fd8f5eb9b17336ed9af532c3fece1cdc456c05d6ccaaa177 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18fb26587de40328886d7e382927b4c7670e24f1 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b74b5477a0cc79f57248ba3eae2967bf5fde41813b10ca19c9267c442d3c3d +size 116013858 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e902433b25513cefb555a6aaa85f5e634f1aefd --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d33d8a7f999c64df8f63a58b5915c676888ad0122dadc45e58ddc71efdae4b +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a38e17f3b83ffa66333c71feeb19e7ef8ca38975 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d34398312fe93465cfca8db6452d8cc35bda0b8c54a868cdda6ed69d097a8f0 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..735d196adf534c2b0a46d34032b98bc75dec4aa0 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de00080332ce62e357820e9fede73705fc7b3ea7ad1925d4d705e2a09afdbb7 +size 116013602 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a12cf24c4bf916a37fda38151017a67e105ba94 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b0fa7d93eb45b7da2112dae335bd5f7eec75daebbc051c74861ca86dfc4e7cd +size 116013655 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..263e66ca0e5eee14d3c35417a083c5820020208e --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4ebbfb418567be920478708447fb099196796f67ce499affa3428a0dd05731 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1c244dfe015515d8285f3eee4d774c86986b71c --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd6c088e8d846789a6a714271af3de9752415089a2d517b361b5bda58b5216d +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe18457217650ad5451dc464c52d82181d61f986 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b237529cb55f9154322603e762114c631a08c84f3bf222b0a58e508fb1d434c +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05e2113090358f675f917f3085dd34eba0d8b293 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999541224f480dd24c65eb062b00b2462b19e41bfa03983ec280fcc01650cb4f +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d208448f79f38f1e7f8fbafd307dafd38d772b7c --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c729e3db16a7b11134563753ebdc0cb114ec45e53e40faff28e9e9a2606333 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eaad4ccee3d697304a34785c86f218a1551bbf7 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa23755abea74b09ff18c7ccc492b22b0a67ed3eeabbb3ab4a08d8fae3ae2990 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b27923a795dae2162c9ad18105c11f924b45e75 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ed4c68b4f171b469e4f5ce0201c5860e93af25660f9e011fc0be6bbc15aa8f +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..876a4e1f5a494156e4213929489c675e1ba71f5f --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce750b0bf9f975a0a106e8db8ea1b9c1a8e7da439cae74a27e3d73b3d9e7b396 +size 116013858 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6aaf0fa2b9bd693f6d140b08840df952b0849273 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b7931468cb672bf0c1fece875bcbcd83233645b22697a346ac8465710840a8b +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4cbe147437ab1ce93756d1acb6c096ca5e69779 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17416426c14c0d1b5d819165f2092ccae46e4fdea3bc82c41e1b7eae5070f2e3 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..813687e0fb356044fc7183c8f7d69b7aaccec00e --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a61e075852a5fc2954d1540d719e2ce37f1c47a6568b86d3a37ce49c9673e1 +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7210455f0578d445d470cb4bce019fc658e45263 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8efdb4c09c7ab4fead10940ee7d569a482fd7a5780626987f64c1926d99dc70b +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..019663477771135c7c2e0eaf9bd51fae830f7464 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec004cb1343af99104b7c745a7599d9642cc503a2f8cd443a0e74f01417a5875 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ac12c106f6abb2628b873664cda713de74bb5bc --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723c2c74752943b89841cb9563c7dfe929494ebe5e1322e2e0a102ad534501c0 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0bd6ff3c1f44b8abd51308fd00c8329feef6055 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508c9a746a76c5231aaa6ff80a9bf0ea28c9c9f82be9c9b9c2051b5b9414402d +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40e23115bb6a9d5274bb716b8f96bc8afc165d4f --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f86125a6cda4d8bdd8131cc6497747c9aceb0cff9d07f80727843ec304c8a50 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..875f2c8b5f04784df893df0344b14f4a85508d59 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4404ba045dd79981270e2a8c150b110792d8c951eb3c86e4b943b907e70f7a8 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cb69bc8b5c4d47b174756106ceb5596adf2ff72 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d86c98f6f8e46f5dd7fcd0c9e8cdb1b129f974eb67fb42e9d23f745dfa0f71 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22f2eddc74d50fd70c053b947c85e938f7b1b78e --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef99f9a19887edfeb22f87c7c67149d14d54d1c0913127a63ed0e4aaec9ca788 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8df4eefb138f98521c9f9f007f39453dd3f38f4 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb65d2e1d25d119e5c82cd4ae14e76ec025fef3156773b0001535d9b7407169 +size 116013922 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..968dee3e5d6386f65bd00fdbf52d92a434a456f2 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b93888e1bde72907105116cd23bff6ee8c9d58a0fa4419a1407fca4131da29d +size 116013538 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31a84b1934a53d1cdef6d2a3dd9f68214adf3f46 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07bb395a116e83f38ef92544d4df8287bbcf2214d9a059399199c1d15bd78a5f +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edebce1157008121d2bf3b91b65d9af588b6bf97 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:269056e7b6158e8c5b373b254621df580fc638fb6245b81878cf2933512caaf6 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb313eeeaf8e03ea414c0627327eef8d8ff3ecae --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d1449e0d176f362318fcaa5b79a7a301275f0ebb77c50139f8299fc59e3fb8 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6bd87925e803f84308868e658960073c14119d7 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0daaf92cf9bd1dfe0234b85267f81b98deb3d25278fd3c025406bbce881f5300 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6be474455891d5e6cdec4e654cd73ee086efca91 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:385d5fad327c59e56a095f3ebd682e1cd30ae4cecd708e2fac74267f36f9a575 +size 116013922 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b2707c706b691c655b22d9541142c47227bbbe4 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34db9e5430d0e9a986bcd548b5e368df5cf723d2efa64716f5ced09baaecc7e7 +size 116013602 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01fbffc2d68660459dc9462a592f2b00dfda7513 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac1f29cf649623387da4f3dd07451adcdaff08886eb91857dba960fa624bd1a +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..785c7927d171bbba25e10f1025560b0b249b4996 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b6b3370ca8852e902a28ae707ad50d043ed86d426721c814e0a77696100d78 +size 116013602 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba851cb4f13d066a02eea22b8de0b41eaa649ef7 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68faaa44411dbc8329588a3bd000de186593bd9382178fc55c893b742a308326 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f171c126c9243dbc288b1dc198654ce7e55b7cec --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7979c6238b85045ef4b1faa8d0867e3fe777b1838250c1fd08ff9de566fdac7e +size 116013858 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a9fd8cb436458505562faa7548ff06fc259a20f --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6147be0c250365ae792c5f6dadfede59b881d7e215e6bf276cb8e3b1c92a95a6 +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddd91b1db743cb35539492d68c3372193c423af6 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f1fb45467820b92895b3e595fca262a24cf386b338e2cc431295e764aa1ce8 +size 116013655 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..061b0659a2c29d7df424e2af4c46265ef34bb5a8 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2fba126ac95e23f9f881d8d5f7fa507d36af586576e97a5e834ef4e55083b3 +size 116013730 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c21c6345928278f75ea295e1a82af4c937bf7b2d --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d2fdf3a19a61c3737bb4d23cf5b616b5a26db1836fa8cb39bc14cda12521ef +size 116013794 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e728ac5e7d80d4d6f37ddc2b18cec72980fd3166 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35374e7b8448cd696b0d4b0be5cdf82973dfccce78976a927f3e2c4467076b47 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38b7acbf54402d62f93f65b2c7340fb31cc584c1 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8926b467544d00c0f4537990ccb38380d7a1b0c6c80bd55d9778327a7cdb63 +size 116013666 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9209a2f7522bfba261e5c28b553ac3870584f98 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0aa3886b03e7a812b55cc33dbb55200534cd129143ab01eec9ddbb29e818ac +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..468f2c5bce40eef1386a192dd93e7d15944fb1a5 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aedd24831c9417faeb64effb9b9c52584e7500a37908c4d0b3acc6aa4f0cbc5 +size 116013719 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c879409135385912813c3776e0b341489446925b --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:165fd6f472e2725b908919f84a24668db627039027cedade4073bb22d4dce19b +size 116013655 diff --git a/619m22b1b5/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m22b1b5/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf29ca878a895b868f2736641cd006feb0c5dbc8 --- /dev/null +++ b/619m22b1b5/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fee4429b02f8513799e1bb31c926e0aa38304708981a93086bc5fa39e390ef3 +size 116013591 diff --git a/619m22b1b5/global_step41007/layer_01-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d31217773a966c26961d418b70d0d512751f19d7 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e86c63c60ae6d6f0e6c0238a6add603b86f0f831dca766fa6d4f586139f2ccf8 +size 160826627 diff --git a/619m22b1b5/global_step41007/layer_03-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7c89c315937a2801be1fc26952281a9e356c922 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9166542faa94faac29c5b6555dfeb88ac3ba0bfd553c2b7b4c19b5a91aca926 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_04-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d72ee248a4098e5dbfa8dd1f884da12b589f603 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb338f98c1d27b30a6592fc9c5d979dba0d299d8837a18b3ba1c4b482ca28de2 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_05-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c39c82c3c1ab9b6e6dcc8fca0a031554a0455148 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0cb6bf3220051f4cde81acc4ee82d9357d780a44d63662b9863b39b034bad8f +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_06-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6ee981f7fc031a128adfb9388a17c7c0977813d --- /dev/null +++ b/619m22b1b5/global_step41007/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04e3c80ab768b0e415dc9f4e893619b80cef9307b55a63762c14f9c9deba2655 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_07-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..615e72336288f341ef2e434a15e77d40288474a6 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a556faf4dde457a4ecb29ae133647d23c492e3b47ff571e5fc97186469ad966 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_08-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34969c60ad4a8f80e658abe3f8486bacfc0f95b4 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a21c2a5b721c9d5dee1408535bbd4c2e4f14876e07faed27b7d3b0d698fde719 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_09-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0740bcc8e502a74965fd9887f64d49179e9313e --- /dev/null +++ b/619m22b1b5/global_step41007/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b5c31943c568c550323f3d4ca96d5460657d7a2e99ac10e33fb608b1a0d7e5 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_10-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4583c542cdb788cb4ed8c4c6b4b436b65d991243 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad9f33ab2946856b8213047d54004a6a00f479bd1e4e4bf76cf74804b01ed369 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_11-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6946b107bde9d5babfd235dfe96e125c4d8a8251 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18a4874e5de161da1e95d873ccd97cab959d936e0a30d01b3a1deeb8f52a0fa +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_12-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb73ddcaf214eb813189026306c40a3ad34610c1 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e77630465f7faa4576f9ca396223432331ca2856d34128525c4c29c3f43c73c +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_13-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c015c878e5a8e26f7793c104306614bb4680fea --- /dev/null +++ b/619m22b1b5/global_step41007/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caa1777200acc711815193792de9bc7578365553015b2b32262d6f43e516b199 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_14-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b99d9a3a78303764235c0d8d3ddc3edcd18e827b --- /dev/null +++ b/619m22b1b5/global_step41007/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a025394c24dea240b51b3172d0fc6d32bbae484afb93388a095f375381713a5a +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_15-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d84ae52d166cb09f1210ab8b8c7d4f1b170608c4 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835a01428c4c80e5fce1dab2d4caa8be1dfe17abff67962764ac42c5893cac56 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_16-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..921c8ce7cdef627ef9aab4122c25905f0c66c41d --- /dev/null +++ b/619m22b1b5/global_step41007/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae202b892efbbc01aa60993b3e1ad7f09306bf0dfd5916077fd6cbdde02bc493 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_17-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48eebcec12e16bb6d91d8ec47ff11a4d54a4a844 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4e7ebafa252574033d25ac975e0d40e4eaf351bacd875fcd7f016132c76653 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_18-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b064b03748aba488b3efb8459968f7ded3a3ed62 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5549e943c000593d696aed04c34f058dfd4b4db4e2819e1f187cd50ab0eb3b +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_19-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8d175fb05a85f57fd981fc037e060eb512a00b --- /dev/null +++ b/619m22b1b5/global_step41007/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccad97078050318ebf8570698cf96bc845850600ef066e6e3a892b53d465483c +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_20-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa04d9f8d29337b6892c219348ccc701e0f8b642 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e859a4bee593d138fe26867e982ab5258a630539482893ef2a6da176954900 +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_21-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cea8d735ce0536535a9b8cca50bf09f8591b4c03 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95ae0fa2e1305a84d3bb70016c395e657029390b4fd7eff383d06c0f6b8ebbc +size 56667395 diff --git a/619m22b1b5/global_step41007/layer_23-model_00-model_states.pt b/619m22b1b5/global_step41007/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..edf0a76ca708cedba11484fdf1b58ae1254d3f69 --- /dev/null +++ b/619m22b1b5/global_step41007/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0df419cc1a6c2ce177f53b63a20b8d473853a4a69f7e53c8bc19bcc0199496 +size 7363 diff --git a/619m22b1b5/global_step41007/mp_rank_00_model_states.pt b/619m22b1b5/global_step41007/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4addeed43662bcca8d433600e237270a445bc8b --- /dev/null +++ b/619m22b1b5/global_step41007/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f654c3e362bcd61cc14e4a833f1377eed108990ed7e97315eebd910d78de36 +size 38515 diff --git a/619m22b400m/3319357.err b/619m22b400m/3319357.err new file mode 100644 index 0000000000000000000000000000000000000000..54fa216a57e263b5fd9b07dc40632716b840a35f --- /dev/null +++ b/619m22b400m/3319357.err @@ -0,0 +1,1113 @@ +2: 2023-03-16 09:03:27.788692: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788703: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788704: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788706: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788699: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788710: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:03:27.788691: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813426: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813438: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813437: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813442: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813427: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813429: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813440: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:27.813435: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832602: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832602: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832615: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832627: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832616: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832635: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:03:27.832639: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908014: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908017: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908023: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908031: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908028: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908037: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:03:27.908033: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908263: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908277: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908281: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908268: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908288: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:03:27.908294: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062786: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062791: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062803: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062791: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062805: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:03:28.062811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063245: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063241: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063257: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063254: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:03:28.063253: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065654: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065646: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065659: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065643: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065655: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:03:28.065655: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:03:29.472526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472525: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472530: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472563: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:29.472950: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472953: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472956: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472955: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472957: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472960: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472960: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:03:29.472967: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.473584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473589: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473597: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473592: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473599: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473604: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.473600: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:29.474005: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474004: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474011: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474012: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474014: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474015: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474018: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:03:29.474019: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.526784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526790: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526787: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526793: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.526801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:29.527188: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527192: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527193: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527195: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527202: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527202: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:03:29.527207: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590444: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590441: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590449: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590449: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590452: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:29.590879: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590885: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590887: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590888: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590893: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590892: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590896: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:03:29.590899: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592338: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592343: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:29.592575: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592579: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592582: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592583: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592589: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592590: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592594: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:29.592598: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.698740: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698742: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.698753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:29.699205: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699207: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699210: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699212: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699215: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699219: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699222: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:03:29.699229: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.742807: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742823: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.742823: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:29.743228: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743234: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743235: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743238: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743237: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743236: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743238: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:03:29.743255: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.780643: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.780665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:29.781237: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781240: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781244: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781246: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781249: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781252: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781256: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:03:29.781263: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:03:32.829311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829322: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829322: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.829327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831726: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831728: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831729: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831734: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831734: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831735: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831742: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831742: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831745: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831749: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831750: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831753: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:03:32.831765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:03:32.831779: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.866486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866489: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866494: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866503: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.866510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868299: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868301: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868304: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:03:32.868317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868318: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868321: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868322: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868323: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868325: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:03:32.868324: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.884513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884518: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884521: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.884528: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886546: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886548: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886550: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886552: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886554: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886555: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886556: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886564: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886564: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886566: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886568: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886568: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886570: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886573: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:03:32.886572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:03:32.886589: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.891602: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891616: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891618: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891621: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.891624: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893387: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893387: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893392: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893393: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893400: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893403: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:03:32.893408: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893410: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893412: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893411: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:03:32.893415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.985754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985765: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.985772: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986299: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986299: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.986307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987809: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987810: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987816: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987825: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987826: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987826: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987829: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987829: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987833: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987835: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:03:32.987862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:03:32.987875: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988677: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988679: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988686: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988685: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988694: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988696: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988699: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988699: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988721: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:03:32.988731: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:03:32.988737: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.075731: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075737: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075739: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.075747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078068: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:03:33.078082: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078085: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078087: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078087: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078089: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078094: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:03:33.078093: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.169159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169173: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169174: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169181: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.169182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171070: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171076: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171079: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171082: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171088: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171087: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171093: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:03:33.171121: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:03:33.171134: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +7: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +6: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +2: Building extension module utils... +2: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +0: Loading extension module utils... +7: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils... +5: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/619m22b400m/3319357.out b/619m22b400m/3319357.out new file mode 100644 index 0000000000000000000000000000000000000000..25e4b0e976823e76dc12f75a92799e129eb984ec --- /dev/null +++ b/619m22b400m/3319357.out @@ -0,0 +1,6681 @@ +Model parameters: d_model 1536 ffw_size 6144 kv_size 128 n_heads 12 n_layers 19 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 19 --hidden-size 1536 --num-attention-heads 12 --kv-channels 128 --ffn-hidden-size 6144 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-619m22b400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --no-load-optim --reset-progress --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --tensorboard-dir tensorboard_619m22b400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_619m22b400m --load checkpoints_619m22b400m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319357.json --zero-stage 0 +START 3319357: Thu 16 Mar 2023 09:03:07 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 42.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 46.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 47.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 46.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 48.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 47.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 40.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 47.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 45.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 40.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 45.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 47.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +6: Launching on nid007043 (6/8), master nid007037 port 9999, GPUs 8, CUDA: True +3: Launching on nid007040 (3/8), master nid007037 port 9999, GPUs 8, CUDA: True +4: Launching on nid007041 (4/8), master nid007037 port 9999, GPUs 8, CUDA: True +1: Launching on nid007038 (1/8), master nid007037 port 9999, GPUs 8, CUDA: True +2: Launching on nid007039 (2/8), master nid007037 port 9999, GPUs 8, CUDA: True +7: Launching on nid007044 (7/8), master nid007037 port 9999, GPUs 8, CUDA: True +5: Launching on nid007042 (5/8), master nid007037 port 9999, GPUs 8, CUDA: True +0: Launching on nid007037 (0/8), master nid007037 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3319357.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 6144 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1536 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-619m22b400mval +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_619m22b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 12 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 19 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_619m22b400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_619m22b400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 09:03:56,717] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.111 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 102 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 33.656 seconds +0: time to initialize megatron (seconds): 1.299 +0: [after megatron is initialized] datetime: 2023-03-16 09:04:33 +0: building GPT model ... +0: [2023-03-16 09:04:33,431] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 09:04:33,432] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 09:04:33,432] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.23 GB, percent = 6.8% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 09:04:35,415] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=26 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: ParallelTransformerLayerPipe +0: 22: undo +0: 23: MixedFusedLayerNorm +0: 24: EmbeddingPipe +0: 25: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 09:04:35,637] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 09:04:35,638] [INFO] [utils.py:828:see_memory_usage] MA 1.16 GB Max_MA 1.16 GB CA 1.2 GB Max_CA 1 GB +0: [2023-03-16 09:04:35,638] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.27 GB, percent = 6.8% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 09:04:35,640] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 09:04:48,764] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 09:04:48,764] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 09:04:48,764] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 09:04:48,771] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 09:04:48,771] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 09:04:48,888] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 09:04:48,889] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.17 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-16 09:04:48,889] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.93 GB, percent = 6.9% +2: ninja: no work to do. +2: Time to load utils op: 0.2366793155670166 seconds +2: Time to load utils op: 0.0005621910095214844 seconds +0: Time to load utils op: 0.3115568161010742 seconds +7: Time to load utils op: 0.4113466739654541 seconds +0: Time to load utils op: 0.3032252788543701 seconds +0: Time to load utils op: 0.30309343338012695 seconds +0: Time to load utils op: 0.303727388381958 seconds +0: Time to load utils op: 0.3041698932647705 seconds +0: Time to load utils op: 0.3035259246826172 seconds +0: Time to load utils op: 0.3040006160736084 seconds +0: Time to load utils op: 0.3044121265411377 seconds +2: Time to load utils op: 0.3031466007232666 seconds +2: Time to load utils op: 0.30323123931884766 seconds +2: Time to load utils op: 0.3029811382293701 seconds +2: Time to load utils op: 0.30344462394714355 seconds +1: Time to load utils op: 0.31221580505371094 secondsTime to load utils op: 0.3114008903503418 seconds +1: +1: Time to load utils op: 0.3114156723022461 seconds +1: Time to load utils op: 0.31142401695251465 secondsTime to load utils op: 0.31139063835144043 seconds +1: +1: Time to load utils op: 0.31200194358825684 seconds +1: Time to load utils op: 0.3121507167816162 seconds +1: Time to load utils op: 0.31171512603759766 seconds +2: Time to load utils op: 0.2018423080444336 seconds +2: Time to load utils op: 0.20230865478515625 seconds +2: Time to load utils op: 0.20221805572509766 seconds +2: Time to load utils op: 0.00047016143798828125 seconds +7: Time to load utils op: 0.20380687713623047 seconds +2: Time to load utils op: 0.00036835670471191406 seconds +7: Time to load utils op: 0.20381450653076172 seconds +2: Time to load utils op: 0.0004062652587890625 seconds +7: Time to load utils op: 0.20397686958312988 seconds +2: Time to load utils op: 0.00030303001403808594 seconds +7: Time to load utils op: 0.20459437370300293 seconds +7: Time to load utils op: 0.20403146743774414 seconds +7: Time to load utils op: 0.20433592796325684 secondsTime to load utils op: 0.2047278881072998 seconds +7: +3: Time to load utils op: 0.2109065055847168 secondsTime to load utils op: 0.21106767654418945 seconds +3: +3: Time to load utils op: 0.21063566207885742 seconds +3: Time to load utils op: 0.21061182022094727 seconds +3: Time to load utils op: 0.2105555534362793 secondsTime to load utils op: 0.21123528480529785 seconds +3: +3: Time to load utils op: 0.21059608459472656 seconds +3: Time to load utils op: 0.2110891342163086 seconds +7: Time to load utils op: 0.0004794597625732422 seconds +6: Time to load utils op: 0.2107388973236084 seconds +6: Time to load utils op: 0.21089601516723633 secondsTime to load utils op: 0.21111011505126953 seconds +6: +6: Time to load utils op: 0.21131300926208496 seconds +6: Time to load utils op: 0.21134543418884277 secondsTime to load utils op: 0.2108011245727539 seconds +6: +6: Time to load utils op: 0.2112877368927002 seconds +6: Time to load utils op: 0.2113208770751953 seconds +4: Time to load utils op: 0.21021103858947754 secondsTime to load utils op: 0.21019649505615234 seconds +4: +4: Time to load utils op: 0.21021199226379395 seconds +4: Time to load utils op: 0.21027302742004395 seconds +2: Time to load utils op: 0.00033402442932128906 seconds +4: Time to load utils op: 0.21024441719055176 seconds +4: Time to load utils op: 0.21027874946594238 secondsTime to load utils op: 0.21029353141784668 seconds +4: +4: Time to load utils op: 0.2103586196899414 seconds +2: Time to load utils op: 0.000362396240234375 seconds +2: Time to load utils op: 0.0003514289855957031 seconds +5: Time to load utils op: 0.211958646774292 seconds +5: Time to load utils op: 0.21196651458740234 seconds +5: Time to load utils op: 0.21199512481689453 secondsTime to load utils op: 0.21199440956115723 seconds +5: +5: Time to load utils op: 0.21196293830871582 seconds +5: Time to load utils op: 0.21199989318847656 seconds +5: Time to load utils op: 0.21196961402893066 secondsTime to load utils op: 0.21199917793273926 seconds +5: +7: Time to load utils op: 0.0003986358642578125 seconds +7: Time to load utils op: 0.0003554821014404297 seconds +7: Time to load utils op: 0.0003867149353027344 seconds +7: Time to load utils op: 0.00033211708068847656 seconds +7: Time to load utils op: 0.00033020973205566406 seconds +7: Time to load utils op: 0.00039315223693847656 seconds +7: Time to load utils op: 0.0003502368927001953 seconds +0: Time to load utils op: 0.0005693435668945312 seconds +0: Time to load utils op: 0.0006127357482910156 secondsTime to load utils op: 0.0006124973297119141 seconds +0: +0: Time to load utils op: 0.0005815029144287109 seconds +0: Time to load utils op: 0.0005383491516113281 seconds +0: Time to load utils op: 0.0005955696105957031 secondsTime to load utils op: 0.0006101131439208984 seconds +0: +1: Time to load utils op: 0.0009176731109619141 seconds +1: Time to load utils op: 0.0009248256683349609 seconds +1: Time to load utils op: 0.0013878345489501953 seconds +1: Time to load utils op: 0.00136566162109375 seconds +1: Time to load utils op: 0.0013468265533447266 seconds +1: Time to load utils op: 0.0013730525970458984 seconds +1: Time to load utils op: 0.001428365707397461 seconds +1: Time to load utils op: 0.0014498233795166016 seconds +6: Time to load utils op: 0.0010716915130615234 seconds +6: Time to load utils op: 0.0011744499206542969 seconds +6: Time to load utils op: 0.0013566017150878906 seconds +6: Time to load utils op: 0.00128936767578125 seconds +6: Time to load utils op: 0.0013413429260253906 seconds +6: Time to load utils op: 0.0012483596801757812 secondsTime to load utils op: 0.0012645721435546875 seconds +6: +6: Time to load utils op: 0.0013675689697265625 seconds +3: Time to load utils op: 0.0008172988891601562 seconds +3: Time to load utils op: 0.000911712646484375 seconds +3: Time to load utils op: 0.0013532638549804688 seconds +3: Time to load utils op: 0.0012738704681396484 seconds +3: Time to load utils op: 0.0012869834899902344 secondsTime to load utils op: 0.0012538433074951172 seconds +3: +3: Time to load utils op: 0.0012729167938232422 seconds +3: Time to load utils op: 0.0013620853424072266 seconds +4: Time to load utils op: 0.0007991790771484375 seconds +4: Time to load utils op: 0.00069427490234375 seconds +4: Time to load utils op: 0.0009794235229492188 seconds +4: Time to load utils op: 0.0010564327239990234 seconds +4: Time to load utils op: 0.0011987686157226562 seconds +4: Time to load utils op: 0.001129150390625 seconds +4: Time to load utils op: 0.0011403560638427734 seconds +4: Time to load utils op: 0.0012023448944091797 seconds +5: Time to load utils op: 0.0009114742279052734 seconds +5: Time to load utils op: 0.0009500980377197266 seconds +5: Time to load utils op: 0.0011227130889892578 secondsTime to load utils op: 0.0011608600616455078 seconds +5: +5: Time to load utils op: 0.0010950565338134766 secondsTime to load utils op: 0.001125335693359375 seconds +5: +5: Time to load utils op: 0.00115203857421875 seconds +5: Time to load utils op: 0.001176595687866211 seconds +0: [2023-03-16 09:04:49,327] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 09:04:49,327] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.15 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-16 09:04:49,327] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.07 GB, percent = 7.0% +0: [2023-03-16 09:04:49,444] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 09:04:49,445] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-16 09:04:49,445] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.09 GB, percent = 7.0% +0: [2023-03-16 09:04:49,549] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 09:04:49,550] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-16 09:04:49,550] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.09 GB, percent = 7.0% +0: [2023-03-16 09:04:49,654] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 09:04:49,655] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:49,655] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:49,758] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 09:04:49,758] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:49,758] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:49,862] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 09:04:49,863] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:49,863] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:49,964] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 09:04:49,964] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:49,965] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:50,070] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 09:04:50,071] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:50,071] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:50,173] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 09:04:50,174] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-16 09:04:50,174] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 35.08 GB, percent = 7.0% +0: [2023-03-16 09:04:50,174] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 09:04:50,174] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 09:04:50,174] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 09:04:50,174] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 09:04:50,175] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 09:04:50,176] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 09:04:50,177] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 09:04:50,177] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.000392913818359375 seconds +0: [2023-03-16 09:04:50,177] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 09:04:50,235] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=26 [0, 26) STAGE_PARAMS=618714624 (618.715M) TOTAL_PARAMS=618714624 (618.715M) UNIQUE_PARAMS=618714624 (618.715M) +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:51,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:51,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:51,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:51,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:51,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:51,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:51,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:51,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:51,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:51,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:51,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:51,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:51,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:51,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:51,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:51,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:51,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:51,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,586] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:51,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:51,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:51,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:51,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:51,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:51,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:51,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:52,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:52,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:52,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:52,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:52,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:52,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:52,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:52,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:52,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:52,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:52,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:52,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:52,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:52,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:52,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:52,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:52,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:52,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:52,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:52,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:52,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:52,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:52,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:52,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:52,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:52,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:52,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:52,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:52,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:52,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:52,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +5: [2023-03-16 09:04:52,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +3: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +1: [2023-03-16 09:04:52,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +7: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +2: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +0: [2023-03-16 09:04:52,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +4: [2023-03-16 09:04:52,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt... +6: [2023-03-16 09:04:52,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +6: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +5: [2023-03-16 09:04:52,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +4: [2023-03-16 09:04:52,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +2: [2023-03-16 09:04:52,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +7: [2023-03-16 09:04:52,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +3: [2023-03-16 09:04:52,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +1: [2023-03-16 09:04:52,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_13-model_00-model_states.pt. +0: [2023-03-16 09:04:52,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:52,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:52,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:52,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:52,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:52,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:52,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:52,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:52,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:52,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:52,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:52,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:52,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:52,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:52,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:52,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:52,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:52,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:52,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:52,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:52,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:52,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:52,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:53,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:53,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:53,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:53,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:53,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:53,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:53,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:53,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:53,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +6: [2023-03-16 09:04:53,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +1: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +4: [2023-03-16 09:04:53,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +7: [2023-03-16 09:04:53,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +3: [2023-03-16 09:04:53,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +2: [2023-03-16 09:04:53,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +0: [2023-03-16 09:04:53,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt... +5: [2023-03-16 09:04:53,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +5: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +3: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +0: [2023-03-16 09:04:53,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +6: [2023-03-16 09:04:53,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +7: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +4: [2023-03-16 09:04:53,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +1: [2023-03-16 09:04:53,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_15-model_00-model_states.pt. +2: [2023-03-16 09:04:53,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +3: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +4: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +1: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +7: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +0: [2023-03-16 09:04:53,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +2: [2023-03-16 09:04:53,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt... +6: [2023-03-16 09:04:53,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +3: [2023-03-16 09:04:53,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +2: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +5: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +6: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +1: [2023-03-16 09:04:53,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +0: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +4: [2023-03-16 09:04:53,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_16-model_00-model_states.pt. +7: [2023-03-16 09:04:53,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +1: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +3: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +7: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +2: [2023-03-16 09:04:53,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +4: [2023-03-16 09:04:53,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +6: [2023-03-16 09:04:53,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +0: [2023-03-16 09:04:53,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt... +5: [2023-03-16 09:04:53,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +1: [2023-03-16 09:04:53,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +5: [2023-03-16 09:04:53,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +3: [2023-03-16 09:04:53,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +2: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +7: [2023-03-16 09:04:53,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +4: [2023-03-16 09:04:53,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +6: [2023-03-16 09:04:53,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_17-model_00-model_states.pt. +0: [2023-03-16 09:04:53,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +5: [2023-03-16 09:04:53,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +2: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +0: [2023-03-16 09:04:53,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +3: [2023-03-16 09:04:53,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +7: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +4: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +7: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +3: [2023-03-16 09:04:53,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt... +1: [2023-03-16 09:04:53,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +5: [2023-03-16 09:04:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +2: [2023-03-16 09:04:53,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +1: [2023-03-16 09:04:53,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +6: [2023-03-16 09:04:53,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +0: [2023-03-16 09:04:53,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_18-model_00-model_states.pt. +4: [2023-03-16 09:04:53,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +7: [2023-03-16 09:04:53,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +5: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +2: [2023-03-16 09:04:53,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +6: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +4: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +1: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +0: [2023-03-16 09:04:53,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt... +3: [2023-03-16 09:04:53,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +7: [2023-03-16 09:04:53,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +3: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +5: [2023-03-16 09:04:53,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:53,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:53,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +1: [2023-03-16 09:04:53,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +2: [2023-03-16 09:04:53,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +4: [2023-03-16 09:04:53,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:53,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +6: [2023-03-16 09:04:53,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:53,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:53,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:53,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_19-model_00-model_states.pt. +0: [2023-03-16 09:04:53,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:53,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:53,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +2: [2023-03-16 09:04:54,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +6: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +7: [2023-03-16 09:04:54,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +3: [2023-03-16 09:04:54,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +4: [2023-03-16 09:04:54,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +0: [2023-03-16 09:04:54,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +1: [2023-03-16 09:04:54,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt... +5: [2023-03-16 09:04:54,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +5: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +4: [2023-03-16 09:04:54,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +1: [2023-03-16 09:04:54,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +6: [2023-03-16 09:04:54,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +7: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +2: [2023-03-16 09:04:54,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +3: [2023-03-16 09:04:54,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_20-model_00-model_states.pt. +0: [2023-03-16 09:04:54,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +4: [2023-03-16 09:04:54,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +7: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +6: [2023-03-16 09:04:54,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +3: [2023-03-16 09:04:54,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +2: [2023-03-16 09:04:54,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +0: [2023-03-16 09:04:54,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +5: [2023-03-16 09:04:54,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt... +1: [2023-03-16 09:04:54,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +1: [2023-03-16 09:04:54,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +5: [2023-03-16 09:04:54,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +5: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +5: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +1: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +7: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +7: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +7: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +3: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +0: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +2: [2023-03-16 09:04:54,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +4: [2023-03-16 09:04:54,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +4: [2023-03-16 09:04:54,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +6: [2023-03-16 09:04:54,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +6: [2023-03-16 09:04:54,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_21-model_00-model_states.pt. +6: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +3: [2023-03-16 09:04:54,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +0: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +3: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:54,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:54,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: > overriding learning rate value to 0.0002 +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +4: [2023-03-16 09:04:54,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:54,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +2: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt... +2: [2023-03-16 09:04:54,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/layer_23-model_00-model_states.pt. +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:54,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:54,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:54,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,642] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +1: [2023-03-16 09:04:54,649] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +4: [2023-03-16 09:04:54,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,660] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +4: [2023-03-16 09:04:54,664] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +6: [2023-03-16 09:04:54,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,670] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +6: [2023-03-16 09:04:54,673] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +4: [2023-03-16 09:04:54,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,689] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +4: [2023-03-16 09:04:54,693] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +2: [2023-03-16 09:04:54,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,696] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +2: [2023-03-16 09:04:54,700] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +0: [2023-03-16 09:04:54,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,701] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +0: [2023-03-16 09:04:54,704] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +5: [2023-03-16 09:04:54,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,711] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +5: [2023-03-16 09:04:54,715] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +6: [2023-03-16 09:04:54,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,716] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +6: [2023-03-16 09:04:54,720] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +3: [2023-03-16 09:04:54,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,721] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +3: [2023-03-16 09:04:54,725] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +7: [2023-03-16 09:04:54,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,729] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +7: [2023-03-16 09:04:54,733] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +1: [2023-03-16 09:04:54,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,731] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +0: [2023-03-16 09:04:54,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,734] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +1: [2023-03-16 09:04:54,734] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +0: [2023-03-16 09:04:54,739] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +5: [2023-03-16 09:04:54,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,742] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +0: [2023-03-16 09:04:54,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,745] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +5: [2023-03-16 09:04:54,745] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +0: [2023-03-16 09:04:54,749] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +4: [2023-03-16 09:04:54,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,750] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +0: [2023-03-16 09:04:54,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +4: [2023-03-16 09:04:54,754] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +0: [2023-03-16 09:04:54,755] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +3: [2023-03-16 09:04:54,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,755] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +3: [2023-03-16 09:04:54,759] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +6: [2023-03-16 09:04:54,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,766] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +6: [2023-03-16 09:04:54,770] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +2: [2023-03-16 09:04:54,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,776] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +3: [2023-03-16 09:04:54,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +3: [2023-03-16 09:04:54,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,779] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +7: [2023-03-16 09:04:54,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,781] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +2: [2023-03-16 09:04:54,782] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +3: [2023-03-16 09:04:54,782] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +5: [2023-03-16 09:04:54,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,783] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +3: [2023-03-16 09:04:54,784] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +0: [2023-03-16 09:04:54,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,784] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +6: [2023-03-16 09:04:54,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,785] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +7: [2023-03-16 09:04:54,786] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +5: [2023-03-16 09:04:54,786] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +0: [2023-03-16 09:04:54,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,788] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +0: [2023-03-16 09:04:54,789] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +6: [2023-03-16 09:04:54,790] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +0: [2023-03-16 09:04:54,792] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +2: [2023-03-16 09:04:54,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,793] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +2: [2023-03-16 09:04:54,798] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +4: [2023-03-16 09:04:54,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,798] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +4: [2023-03-16 09:04:54,802] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +1: [2023-03-16 09:04:54,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,804] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +1: [2023-03-16 09:04:54,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,805] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +4: [2023-03-16 09:04:54,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,806] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +4: [2023-03-16 09:04:54,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,807] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +1: [2023-03-16 09:04:54,809] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +1: [2023-03-16 09:04:54,810] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +4: [2023-03-16 09:04:54,810] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +4: [2023-03-16 09:04:54,812] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +5: [2023-03-16 09:04:54,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,813] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +3: [2023-03-16 09:04:54,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,817] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +5: [2023-03-16 09:04:54,814] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +5: [2023-03-16 09:04:54,817] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +0: [2023-03-16 09:04:54,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,818] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +5: [2023-03-16 09:04:54,818] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +3: [2023-03-16 09:04:54,821] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +0: [2023-03-16 09:04:54,823] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +7: [2023-03-16 09:04:54,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,827] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +4: [2023-03-16 09:04:54,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,828] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +7: [2023-03-16 09:04:54,831] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +4: [2023-03-16 09:04:54,833] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +6: [2023-03-16 09:04:54,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,835] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +2: [2023-03-16 09:04:54,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:54,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,836] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +0: [2023-03-16 09:04:54,836] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +6: [2023-03-16 09:04:54,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,837] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +3: [2023-03-16 09:04:54,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,839] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +2: [2023-03-16 09:04:54,840] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +6: [2023-03-16 09:04:54,840] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +0: [2023-03-16 09:04:54,840] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +3: [2023-03-16 09:04:54,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,841] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +6: [2023-03-16 09:04:54,841] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +6: [2023-03-16 09:04:54,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,842] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +2: [2023-03-16 09:04:54,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,843] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +3: [2023-03-16 09:04:54,844] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +4: [2023-03-16 09:04:54,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:54,844] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +2: [2023-03-16 09:04:54,844] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +3: [2023-03-16 09:04:54,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:54,844] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +6: [2023-03-16 09:04:54,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:54,846] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +3: [2023-03-16 09:04:54,846] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +6: [2023-03-16 09:04:54,846] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +2: [2023-03-16 09:04:54,848] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +4: [2023-03-16 09:04:54,848] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +2: [2023-03-16 09:04:54,848] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +3: [2023-03-16 09:04:54,849] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +6: [2023-03-16 09:04:54,851] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +7: [2023-03-16 09:04:54,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,854] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +7: [2023-03-16 09:04:54,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,858] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +7: [2023-03-16 09:04:54,858] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +1: [2023-03-16 09:04:54,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,860] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +7: [2023-03-16 09:04:54,862] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +1: [2023-03-16 09:04:54,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,863] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +1: [2023-03-16 09:04:54,865] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +5: [2023-03-16 09:04:54,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,865] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +1: [2023-03-16 09:04:54,867] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +5: [2023-03-16 09:04:54,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +1: [2023-03-16 09:04:54,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +5: [2023-03-16 09:04:54,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:54,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +5: [2023-03-16 09:04:54,870] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +7: [2023-03-16 09:04:54,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,872] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +1: [2023-03-16 09:04:54,872] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +5: [2023-03-16 09:04:54,872] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +5: [2023-03-16 09:04:54,874] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +7: [2023-03-16 09:04:54,876] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +2: [2023-03-16 09:04:54,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,879] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +2: [2023-03-16 09:04:54,883] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +7: [2023-03-16 09:04:54,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,897] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +7: [2023-03-16 09:04:54,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:54,900] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +7: [2023-03-16 09:04:54,901] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +1: [2023-03-16 09:04:54,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:54,904] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +7: [2023-03-16 09:04:54,905] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +1: [2023-03-16 09:04:54,909] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +2: [2023-03-16 09:04:54,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m22b400m/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:54,916] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +2: [2023-03-16 09:04:54,921] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +0: successfully loaded checkpoint from checkpoints_619m22b400m at iteration 0 +7: time (ms) | load-checkpoint: 4700.06 +0: estimated model parameters: 0.618714624 +0: estimated model parameters without embeddings: 0.538301952 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 09:04:55 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.007502 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.009 seconds +0: total number of samples: 48805 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.049496 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.010 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 09:05:08 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 21866.11 | train/valid/test-data-iterators-setup: 12460.25 +0: [after training is done] datetime: 2023-03-16 09:05:08 +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.441785E+00 | lm loss PPL: 3.124266E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3319357: Thu 16 Mar 2023 09:05:40 AM EET diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f419f8e809976572b04af774fe3aa75285810ded --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7bf890f1ecb9a7b73f694effce19bf55c89243a319b2882996873900b09fed3 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35c6fa5a72955b9023d9c96c01391dad48e48ebc --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d2415f089401f0fa321e629f1685529c70fa37041e03d582c3b0d3e490fa63a +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed01f3e5264e35c2c283526603cee30ef6f6e80a --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:009d829c2fe76966f3fac12ea425ee7f5ff3e3085beecf214f88d395844298bb +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcc175b164c23bdf4e328057b94a57a6eb0201cf --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e9ed6493fc9e8e16ab80bbfbf39f2c7cc93a3f2824f5e839a11323c8419ece +size 116013602 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9808a191f2d8c433d0ec7670bac142c542288740 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76d0940e8f05971ebd8338dee5b2608b2923955f595146f8e50d141f9316d115 +size 116013858 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9829210e1112388c76b2bcbc688f2751fefeb10 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfabf1acbca12b5a259b0cf8a92b34f84b4a81b5d72fc2a59bd9f974a7f5d1ec +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..157f47b0aae4447eb90c29a0dbd7a212c277d30a --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be1c6ebd75506cb42af6375e8d750e92a84ef4247f438d94cb3926382276047 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a7ef85d0ae6cc90195a9049e9c6b6bf3f0b399a --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8211cbaa8ec3f5d1f0092b6e45dfb490deb86cf82e1a59c80836aabeb009a34b +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3169626a6d8e70cc692ded3e0d4840ec968ab14a --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd88bf43510f75f1f24bbe2492b39d77e6f98250b2a72293359320c5ca7b923c +size 116013602 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c691d2d503b1168e7d3b5304b5936e4a4fb7ee2c --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45435fc38f0041e9c6b93eb67f3657560f2231527b0a8da3c9f3a979db9bdad9 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c67b2e64a378bb8e98ac2ce4eabed49ed3c563a1 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b081c9007bd325a45102af307a657807fa40c9381704c5698e14878c938863a +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42753c8c98683efbc99cbb36ab0401353b028674 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dd068ff0be3848c5fdd8f33fd7a95dcd621fbcce69c65f46fe1d83b2c9ba387 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..053011712d069e960cf2c163bb0645649a34aaf6 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2a5aae60dcf55697f4252a59e1ca378b118783805bd2d326f8554a1fc4b547 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2131eebb1f6b3927ef94d5d96530c0b2237efb84 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b646bf9d942a2127b3e18edf58c2f45e5aa1124857a5a9aeefbf3db565ebce +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31ea714abe4a8b3decbeda0b32a80be88b030512 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c547df14f1c52d39fc4c619c091f16697b4f33da87f876b1a4600aa70edeae01 +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9640c8513788d92ba993b3c9e7fea735a49c313 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19eaf30f46cef02088fe1a3417efe3c2ba1854197f200f30df90020df28e8a95 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4f4b327d380e4718af183b32eee9fb9611e8437 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:526304c7b4a5e01f3a4def8c42420158746957612f3fced61c575474e3facf74 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b854e91ba676d01500d73c18575806391f0a8322 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53c5467a49d4a4c3b4eaded330f5e8ff212858bece3a3621b085803c17676d5 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52e6a90bfb114d138bf91d79be7a6f84fa8abc44 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a12a591a0baae25e0ae079a176d1824a5c5fb15928c7dac91a2f16f27dc9aef2 +size 116013858 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0076ba8eb58219d75fbdd0bcac7bb8da8fd22e18 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc4d9428164b28e9d51dc3edb5d9b4b3ed21b82e8e2f4d5ca6e4b3024644661b +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82e4b41286bb12783079e7987795ec850eb075e6 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07e14b480d1458b062e7cd43c98b9aea6e45a58d2c204726df10e1ef6147a9c +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e780103cc6d93ecc89438b0d657057e45c7dc38 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b3b0c114231301f29951bf02dcfe31665c988624f6024661b7bfa99698bff2 +size 116013602 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eaf0282d889bd00dcdccdac2b238b7a21a16ab2 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec141520ca649796b123d9b98bae36383739778877fb97c2cc0b9e8d24531621 +size 116013655 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ae5bd5aabbed85506afad663733aa02c1e76b6e --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a2c3e325c8d265b60092512499bf2b5ff0cf34af22fe4a9fb46c5eb165dd8a +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8e8520c66c319cea2c8a8f48d0d38240f5a1a76 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ed2f58aa99ca0a04282e6e635c45c11e2d353350c0a64a06e5de36fa769c05 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eea2e7213cf810cfe30f01b00f052db26d80d89 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e7729d46285f86915f1255fa47cbfc837953e2b29c9aa220f28d9a5472fdab4 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8565deac1ad425508de2d2cafe6b77ab1875aa50 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52dc2f7bffd8b72263b23598018cb9621b46e350cb72004ee68206a82ecb9c1c +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..538d71f3da4c22cfc022086dfd07483e88b929ce --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ec67a08d955ba80d0c1a4ab4e77d7761148713c34c12172b9e02f6e3e111d90 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8791794fd2e68ec6487baa48f74d6e10bfc77a2 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5265d430b461822d5aebe247e6c5bab83a38e991463365e40ad4136a58be35a4 +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb9f10625e453ddf70e9478cedd1ff68d1731c16 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa83de930274d9e5808f9f95c88621121d46ec64015e0631ce845deca0d62edc +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3d5f0e6ebb981046691a9fd73041c87167c01e2 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7c77c01f44279a084128998aa8eca74cac3e03cf80b349b195c10feac0a47a +size 116013858 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10780a092758b131a383083967334f764fc780db --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f81124f050c26e2e0f3c578ed06a838e1d7ec88c6a27f9c17618387d9b3b5d +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37c4c655f06f928982f0292476f2a2acdf972176 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bacf4f10188118892497f365f0730150586989fd07f4d91c5169ba81a7a90b3d +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f04365970b058136bb4570529d64baf10d6d725 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d395a13f30b39e9ef4dd830a9a915ab5b57255be2262f84b2438b47328881792 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1c56ba7bd820aca1e11796bcb1f9b43316bfbbc --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c31fe828d13f7c36d9f1815799649a09f2868a404fcbb2fb7eeba57888ee294 +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d3ec3ea7c09c1e04e3a0c89bb0a7d913443c399 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:128746dfcbfe254d18c0c75390ad79216c16600ef0745e86c98ef614d7c7010d +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48e9717e540d308c4263cfb096243aeb07e414e6 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4854dd1a4e4f34164aafd03352e43f3968b0746a7bdf493a8dac82047a6a984 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8ed0eb22194aa8fabb7e857ea0e5d957b750184 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dddebb5b4c930041c58e28ea0da08e7b182757749b97560dc169d25183f391f +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef90db3b2dad18d3ae131bf37fdbbe777f81a48e --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297602031fa15e180fb9ad87adff5997d2f5cf6b2729f077af089f8cdf3960fa +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfe6865f876eccac97d0f0f758229169df63db55 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3598d7d6471f5f983308c2282916fe1b20d6317acc8a7a233d2d3f8711e983dc +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bef2c80007682ca227dc54572102a00c787be245 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:620dcdff92ba194fe6982d3882426c34e9ba154237017f62bdf6e44957908802 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16ed73eef804d4633a63055b03cf011b50a0bcb5 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f459e1d4ce4061ce5733adae84fb2ef129b88610967b38fcad7c280a9d39bc6 +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..509a540a9232f07c3f5a9d4cd190890915b709b9 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eefd13f6a31977ebaa590b633a7ae4f7dec20dac9808bf008ed790162bb1b3a +size 116013922 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89b66d3ac6a4904f20ec77f7424ca8e499b645dd --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554683b31b6ef2a12c776ae8a3b7ba52d9118ba9b52e5895814136e2775935be +size 116013538 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9547e7da16ca712b7cdea255ca1c3b657d76683 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5241055470cf77673336411d0a3de76af4169860f4e9d370a9d457b65e507c14 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63be6222ab9241f47e533f72b1882ed00320c65f --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e256dfd3f2c9bfdd99f24759d32f7855e23cb859e79482410d1be4f25602607 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd6dcb7249a36d95b583db6301d59bdb61bd0ff1 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4de457bcc244a4499630e65f8338ce2dc77d9b0aea512ca294e32c8e228a9d6 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd52633c8b6dd848c065ad3fba2549426d542f1b --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:300e5f69e812a963a8adca4320b8c8a0016f95235e5a95a80b70d4301f235493 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..449a75091c2395ebb61e236a690e90cfefdf2839 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc3db4407ce3263e0294b200d08805b08c5aea218ef4e11410b9fe791a4cf4f +size 116013922 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e27d5620fce16740a499ceb19fe8b66c70cfd542 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55951e57bd1f87c0d224a3114a5b0c52745e4b0fb0474ea883b5490e32b0fc08 +size 116013602 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fe17c0767f01e915549f6c868b9a925775bb6f8 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:150ccac85a89a50c30616f85fc3de20a4808a277f7843c1c1a8c1b02c66e2115 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..185070133248463f6ed32283083ac8811a9d60bb --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6864e3a8c837ee7c8ab29e6b4aedaf9fc3ec8747fb6cd99c45d11f044c129729 +size 116013602 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e2c4e934a653f724cf93dd493c690874e76bf8e --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c5eae6a8c420ca4e75eb260ebc84a7f73bf438d6d4599d97a2e52585575559 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca57d64bba14dae1f6eee628b540511413a2d9fd --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05ee3c9ee9576c3be9d07ab2fef6a1eb49c498cdeb0ecdacfb389ea9785d69b +size 116013858 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c44dde0eee96cbcba156dc5772d2232128f972a9 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffd7ef9366b4f60f65ae5a8e982ec3106f47e7431bf487d1331a39f498b7f42e +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6912894414848c5a5aa300f9347a4d84d3f88d2a --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb1a002170143042f7238774e0476345e449eb694002f20e1d07e904d9fde42a +size 116013655 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2154315ae1cca4e02dd99cb459e51c8049111bea --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c83da05faba00e4be4ea4581cbfe47ccb99a81399d79376ded771fc93778a1 +size 116013730 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15a6b59605ab896e0c95d36f1099ef285fe2fce1 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdef7d34a3e193a681b92889c4dec3d5077735016967d98cc20e5a52546cdbb4 +size 116013794 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e68626dac6864eb162a2de76cabadf9249a0ec5 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9132553c225e895f55f29d6ab38b3eb7abb8e66457356c63ccf8d95d9d61005d +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e15d4a9190e4894291a78e156042f99e3da8abd --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1d064fbb28dba31fd1654ea5d4093ad744b9bd9112146cac3b308732a86b91b +size 116013666 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4eefca1044ddf1f84d1e94b942a4b5b8c3f8683 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e320de4a46dff2d6ff875a5d9623e18b174116297edd4f2c99dd625ba8402fc9 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a98620d1535fe679627d01558beb01f61d5afcf --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ce3547c1e7b695dce42cf5c6e3d69111b40bb2aa3416409d62b34a98e22509 +size 116013719 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac9db18e089c933aa5acea74e1b5e2ffa41c90a0 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5283554b2e3271097e2349af72acac5936cc8af7325271b80cb7d7804e03adbc +size 116013655 diff --git a/619m22b400m/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m22b400m/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cca6372206449628c0f1798d61f5075b6501b15 --- /dev/null +++ b/619m22b400m/global_step41007/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e788ddfccb4d5c7f7bc8fb0a339289c250c3af12da9a81289d674e56fb78b5 +size 116013591 diff --git a/619m22b400m/global_step41007/layer_01-model_00-model_states.pt b/619m22b400m/global_step41007/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a538e72618020bad07577562c35fde2630b15d51 --- /dev/null +++ b/619m22b400m/global_step41007/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a63a7594f754b9999196ff6608c46b7daf2b80327005a941bab858740b2d3d +size 160826627 diff --git a/619m22b400m/global_step41007/layer_03-model_00-model_states.pt b/619m22b400m/global_step41007/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc764535c079983b68f230b652afe0e9a6a0db16 --- /dev/null +++ b/619m22b400m/global_step41007/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b6d2e23b27dd1e10447ce03a6dc602b7b60205e613b0eca71671f9619a1144 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_04-model_00-model_states.pt b/619m22b400m/global_step41007/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cf8020f1e79eba5d122fd6c782adc71d2fbc879 --- /dev/null +++ b/619m22b400m/global_step41007/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e081d575724f98b66e9f1e73e8f1867e6307290b623310abd0e058703cbae544 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_05-model_00-model_states.pt b/619m22b400m/global_step41007/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ece0cfc7a9aef01e716b388cb24d915b89ad2f55 --- /dev/null +++ b/619m22b400m/global_step41007/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46de3e536f4951baa61e219ee3c70c8623652b70b5bff65b7b3374a038fdd939 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_06-model_00-model_states.pt b/619m22b400m/global_step41007/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7addb3ce4a4f60045d41fc39a8dcf616fa0606ac --- /dev/null +++ b/619m22b400m/global_step41007/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cac03ad3c369bb7cd525336312878dfb9da09d824f8057cfa95603ffaa56c4ac +size 56667395 diff --git a/619m22b400m/global_step41007/layer_07-model_00-model_states.pt b/619m22b400m/global_step41007/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3361d4febe6391824cfe06b3ce304f796be40f0b --- /dev/null +++ b/619m22b400m/global_step41007/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ca0ae66f04ffcad8fb96be504cd945c4e32c33218eca5be04c5b8c7fcaef25 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_08-model_00-model_states.pt b/619m22b400m/global_step41007/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab51f81eafcee13d298dcfa2bedcd064e03bcd75 --- /dev/null +++ b/619m22b400m/global_step41007/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a664f50db5af09dcf8bb61f5b22326c88d0b05735ef09b24d64f061f52dbbfb +size 56667395 diff --git a/619m22b400m/global_step41007/layer_09-model_00-model_states.pt b/619m22b400m/global_step41007/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4c8fe9a5cc97964249701847352cbbce4f2dcb1 --- /dev/null +++ b/619m22b400m/global_step41007/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262bdbffa65444797749566894dc0aa6dce8b56c8f6ff0ce2e7271eb8f17f294 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_10-model_00-model_states.pt b/619m22b400m/global_step41007/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d5968d007a289552f19add6ed39baa5f49448fe --- /dev/null +++ b/619m22b400m/global_step41007/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1414b8919e6534bb48fc6538ffa2103e175c636927c272ccb1a207d5045325f +size 56667395 diff --git a/619m22b400m/global_step41007/layer_11-model_00-model_states.pt b/619m22b400m/global_step41007/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b25e6029a7cbd18a10d429304901978533ecb26 --- /dev/null +++ b/619m22b400m/global_step41007/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc086a0e61cee4ee0e576d245fc64116c7d3d8928420dc28f767a96793970e6a +size 56667395 diff --git a/619m22b400m/global_step41007/layer_12-model_00-model_states.pt b/619m22b400m/global_step41007/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4b1ca593dc717fdf570f009d13d3c3b3732567 --- /dev/null +++ b/619m22b400m/global_step41007/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbff94e26a7bf08ca7ae9aac6c0491f24584fc38e446330bf61a16ad3b60c183 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_13-model_00-model_states.pt b/619m22b400m/global_step41007/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8384fcc220e7bc454417861c066edcb9526f29de --- /dev/null +++ b/619m22b400m/global_step41007/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a6f473c5a34a2fab15de5d6d4fe837848859670f912bccf87294260ca90513 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_14-model_00-model_states.pt b/619m22b400m/global_step41007/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e154d4fc805ec87ecaf4253498af8188eb58954 --- /dev/null +++ b/619m22b400m/global_step41007/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313672a584fac77c66d2e887a0eed99a0c565c70b66ca68d3d61e8c33748cd6c +size 56667395 diff --git a/619m22b400m/global_step41007/layer_15-model_00-model_states.pt b/619m22b400m/global_step41007/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7129a84729a8487fba3ca6420fdf0eb8c17e899d --- /dev/null +++ b/619m22b400m/global_step41007/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa183eed549484b13678789afedd430b1df7fba32be8d5e7d71cde9fa99f9b0 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_16-model_00-model_states.pt b/619m22b400m/global_step41007/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5705de3e0fa8c78b1dedc18f01a5aa2a9176ddc5 --- /dev/null +++ b/619m22b400m/global_step41007/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92350a19f50d6e1402f8af174aac658bcdc7aa5370d9661d6c36452582127701 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_17-model_00-model_states.pt b/619m22b400m/global_step41007/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..261782eb903d3f9c166b0f67f7e200364e96a82a --- /dev/null +++ b/619m22b400m/global_step41007/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53bef97e4b93003718d05dec21ef7ed400ace0077ef39862dd0119f5f8c76438 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_18-model_00-model_states.pt b/619m22b400m/global_step41007/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6aad87767f27056ccd7dea692a82a92dedff1555 --- /dev/null +++ b/619m22b400m/global_step41007/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285f9bc8e52615b6c0996a48c50cca97977f6cd8aaac9da8aec53f0097cf0608 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_19-model_00-model_states.pt b/619m22b400m/global_step41007/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a98443b50993bff22983d59e39236290665bd4ac --- /dev/null +++ b/619m22b400m/global_step41007/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec77ece9e83afd519e64f844c2f5c876fc2ed094e8d2f881dc4a1d9d99361a66 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_20-model_00-model_states.pt b/619m22b400m/global_step41007/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7777a89caee8ecd37ce142452aa87e7b28b9fef7 --- /dev/null +++ b/619m22b400m/global_step41007/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32733c724ff3b54c3f39f4c5b20e49a0989ce8ac7b1e5de0c69e3be325af4dbf +size 56667395 diff --git a/619m22b400m/global_step41007/layer_21-model_00-model_states.pt b/619m22b400m/global_step41007/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c85c540e694e5394bd02ee180ce0bb00d2baf09 --- /dev/null +++ b/619m22b400m/global_step41007/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c19e337720d10b7415e4f5598a7e3d6e42cfe901d6015214253fd36cdc6a9a8 +size 56667395 diff --git a/619m22b400m/global_step41007/layer_23-model_00-model_states.pt b/619m22b400m/global_step41007/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48adda5d9e4649f52d967c720de3f814d69fa5c4 --- /dev/null +++ b/619m22b400m/global_step41007/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d223d1edf59f460c5686f9ee2ba2819e90c346a17a2a23cdfef79b0e8ce76c90 +size 7363 diff --git a/619m22b400m/global_step41007/mp_rank_00_model_states.pt b/619m22b400m/global_step41007/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0c12474e13589b5f8af03f830d38d66c9b51fbe --- /dev/null +++ b/619m22b400m/global_step41007/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9a76228fbe246bd5f3b512abccbe01a3767387d81140c704aeb4e0cd7b8825 +size 38515 diff --git a/619m22b400m/sbatch_619m22b1b5.sh b/619m22b400m/sbatch_619m22b1b5.sh new file mode 100644 index 0000000000000000000000000000000000000000..5982c8f1484fe1ac27f4360563b11ea0934ba01d --- /dev/null +++ b/619m22b400m/sbatch_619m22b1b5.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=619m22b1b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_632M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=5000 + +# Tokens: 21500000000 +# -> Samples: 10498047 +TRAIN_SAMPLES=10_498_047 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 104_980 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/619m22b400m/sbatch_619m22b1b5val.sh b/619m22b400m/sbatch_619m22b1b5val.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ea555b93cb9f4566f9f7ae8fe65bc3fb2527491 --- /dev/null +++ b/619m22b400m/sbatch_619m22b1b5val.sh @@ -0,0 +1,167 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=619m22b1b5val +VARIANT_CKPT=619m22b1b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_4B8_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_632M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 21500000000 +# -> Samples: 10498047 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --no-load-optim \ + --reset-progress \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678909981.nid007044.14181.0 b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678909981.nid007044.14181.0 new file mode 100644 index 0000000000000000000000000000000000000000..6f720cd0e523994159c7516d9565ff74dcd2b3e9 --- /dev/null +++ b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678909981.nid007044.14181.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d26ffafcfa640983cbbcc0bf8474a8018aba4dadd327fdaf053dbbeb095a1fd +size 73304929 diff --git a/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941146.nid006946.54347.0 b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941146.nid006946.54347.0 new file mode 100644 index 0000000000000000000000000000000000000000..7c0007047a4c73f167657efa95f1d822cdfeea48 --- /dev/null +++ b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941146.nid006946.54347.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ee3d864aa6bb2dd77e09a8575446958b21191f23285b9d4b7cb7e389c17f94 +size 21471 diff --git a/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941279.nid006946.59861.0 b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941279.nid006946.59861.0 new file mode 100644 index 0000000000000000000000000000000000000000..c7f93d27c75f8831d1932356a5ac6fd674f00803 --- /dev/null +++ b/619m22b400m/tensorboard_619m22b400m/events.out.tfevents.1678941279.nid006946.59861.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf08224a84591853f1f74ff57e6deeb23ab1a1c32cc7e8063338ce44d03a19f9 +size 21471 diff --git a/619m2b7400m/3318428.err b/619m2b7400m/3318428.err new file mode 100644 index 0000000000000000000000000000000000000000..0765e3dff042a10556ebe918c4f6f063f88324de --- /dev/null +++ b/619m2b7400m/3318428.err @@ -0,0 +1,1119 @@ +6: 2023-03-15 22:17:30.877515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877531: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877533: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877530: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877538: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877534: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 22:17:30.877540: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877858: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877861: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877869: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877865: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877874: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877875: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 22:17:30.877870: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877959: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877964: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877974: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877973: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877983: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877963: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 22:17:30.877978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878128: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878150: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878152: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878153: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 22:17:30.878148: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: 2023-03-15 22:17:30.878197: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878219: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878201: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878209: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878200: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 22:17:30.878211: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878500: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878502: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878513: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878517: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878522: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878547: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878543: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878551: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: 2023-03-15 22:17:30.878524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878553: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878555: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 22:17:30.878521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878558: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878557: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:30.878550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878878: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878873: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878883: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878880: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878880: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 22:17:30.878882: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 22:17:42.077062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077120: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077259: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077152: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:17:42.077690: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 22:17:42.077163: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 2023-03-15 22:17:42.077372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077844: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.077457: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:17:42.077442: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-15 22:17:42.077482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 22:17:42.077365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077444: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 22:17:42.077865: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077896: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 22:17:42.077902: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.077482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077516: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 22:17:42.077917: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:17:42.077720: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 22:17:42.077733: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 22:17:42.077926: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.077501: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077553: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:17:42.077352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 22:17:42.077935: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077409: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:17:42.077956: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.077711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077586: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077357: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077421: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:17:42.077392: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077938: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:17:42.077513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077501: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 22:17:42.077471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:17:42.077413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:17:42.077758: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 22:17:42.077758: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 22:17:42.077783: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 22:17:42.077798: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.077528: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 22:17:42.077822: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077449: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 22:17:42.077493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:17:42.077430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:17:42.077544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077962: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.077516: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 22:17:42.077502: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:17:42.077445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-15 22:17:42.077558: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.077629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:17:42.078067: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.077512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:17:42.077466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077529: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-15 22:17:42.078253: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078081: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.078299: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:17:42.078281: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078102: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.077529: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-15 22:17:42.078330: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.077477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077515: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-15 22:17:42.078296: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.078313: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078106: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.078343: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 22:17:42.078363: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 22:17:42.078371: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077989: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.078328: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.078331: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078136: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078144: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078148: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.077540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-15 22:17:42.078386: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.077455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 22:17:42.077565: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 2023-03-15 22:17:42.078336: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 22:17:42.078342: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:17:42.078157: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:17:42.078403: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 22:17:42.078407: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:17:42.077998: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 22:17:42.078011: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 22:17:42.078019: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 22:17:42.078027: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.077536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 22:17:42.078309: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 22:17:42.078031: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:17:42.078316: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078329: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078334: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078355: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078362: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078366: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078375: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078345: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078359: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078353: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078374: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078372: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 22:17:42.078396: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 22:17:42.078392: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 22:18:05.749265: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749434: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.749582: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749461: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.749303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749601: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749299: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749481: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.749600: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749781: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-15 22:18:05.749498: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.749630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 22:18:05.749326: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.749618: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.749504: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.749625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.749638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749632: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.749654: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749634: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.749656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-15 22:18:05.749645: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749835: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.749663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749838: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.749669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.749841: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768113: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768123: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768121: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768126: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768131: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768133: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768133: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768137: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 22:18:05.768165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 22:18:05.768184: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769267: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769269: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769272: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769271: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769276: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769277: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769285: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769285: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769288: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769291: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769291: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769616: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-15 22:18:05.769294: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 22:18:05.769291: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769348: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-15 22:18:05.769616: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 22:18:05.769361: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769618: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769621: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769624: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769626: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 22:18:05.769632: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769635: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769635: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769639: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769639: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769641: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769643: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 22:18:05.769643: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770504: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 22:18:05.770619: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.770641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: 2023-03-15 22:18:05.770507: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.770647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.770660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 22:18:05.770664: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 22:18:05.770679: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770513: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 22:18:05.770834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 22:18:05.770520: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770525: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770528: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770530: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770529: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770531: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 22:18:05.770532: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.770836: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771049: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771067: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771081: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.771084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772378: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772389: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772397: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772405: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772419: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.772423: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772604: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772598: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772601: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772608: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 22:18:05.772620: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772620: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772625: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772625: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772626: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772628: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 22:18:05.772629: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772925: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772929: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772940: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772944: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772936: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772945: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772950: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772951: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772957: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772960: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 22:18:05.772971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 22:18:05.772986: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774626: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774632: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774628: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774628: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774643: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774640: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 22:18:05.774648: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774652: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774655: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774657: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774658: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774660: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 22:18:05.774660: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770326: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770330: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770344: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770345: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770349: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770352: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770352: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770353: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770376: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 22:18:05.770385: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 22:18:05.770391: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +2: +2: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +3: +3: +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: +6: +6: +6: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: +7: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +5: Building extension module utils... +5: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +5: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils...Loading extension module utils... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: +0: Loading extension module utils...Loading extension module utils... +0: Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils... +5: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +3: +3: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +3: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/619m2b7400m/3318428.out b/619m2b7400m/3318428.out new file mode 100644 index 0000000000000000000000000000000000000000..f4d63ee9d8b1e819f2da2ca67c2892b11f5bff6d --- /dev/null +++ b/619m2b7400m/3318428.out @@ -0,0 +1,3225 @@ +Model parameters: d_model 1536 ffw_size 6144 kv_size 128 n_heads 12 n_layers 19 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 19 --hidden-size 1536 --num-attention-heads 12 --kv-channels 128 --ffn-hidden-size 6144 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1_308_594 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-619m2b7400m --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1_308_594 --lr-warmup-samples 13_086 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_619m2b7400m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_619m2b7400m --load checkpoints_619m2b7400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3318428.json --zero-stage 0 +START 3318428: Wed 15 Mar 2023 10:16:28 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 42.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 37.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 35.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 36.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 45.0c 101.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 44.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 47.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 46.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 50.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +0: Launching on nid007231 (0/8), master nid007231 port 9999, GPUs 8, CUDA: True +4: Launching on nid007235 (4/8), master nid007231 port 9999, GPUs 8, CUDA: True +1: Launching on nid007232 (1/8), master nid007231 port 9999, GPUs 8, CUDA: True +7: Launching on nid007238 (7/8), master nid007231 port 9999, GPUs 8, CUDA: True +5: Launching on nid007236 (5/8), master nid007231 port 9999, GPUs 8, CUDA: True +2: Launching on nid007233 (2/8), master nid007231 port 9999, GPUs 8, CUDA: True +3: Launching on nid007234 (3/8), master nid007231 port 9999, GPUs 8, CUDA: True +6: Launching on nid007237 (6/8), master nid007231 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3318428.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 6144 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1536 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-619m2b7400m +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_619m2b7400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1308594 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 13086 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 12 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 19 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_619m2b7400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_619m2b7400m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1308594 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-15 22:19:12,676] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.106 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 22.373 seconds +0: time to initialize megatron (seconds): -38.012 +0: [after megatron is initialized] datetime: 2023-03-15 22:19:37 +0: building GPT model ... +0: [2023-03-15 22:19:38,124] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-15 22:19:38,125] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-15 22:19:38,125] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.64 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-15 22:19:40,101] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=26 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: ParallelTransformerLayerPipe +0: 22: undo +0: 23: MixedFusedLayerNorm +0: 24: EmbeddingPipe +0: 25: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-15 22:19:40,390] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-15 22:19:40,390] [INFO] [utils.py:828:see_memory_usage] MA 1.16 GB Max_MA 1.16 GB CA 1.2 GB Max_CA 1 GB +0: [2023-03-15 22:19:40,391] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.68 GB, percent = 6.1% +0: setting training iterations to 5111 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-15 22:19:40,393] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-15 22:19:52,894] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-15 22:19:52,895] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-15 22:19:52,895] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-15 22:19:52,902] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-15 22:19:52,902] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-15 22:19:53,023] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-15 22:19:53,024] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.17 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-15 22:19:53,024] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +5: ninja: no work to do. +5: Time to load utils op: 0.31894588470458984 seconds +1: Time to load utils op: 0.28648948669433594 seconds +1: Time to load utils op: 0.2870185375213623 secondsTime to load utils op: 0.2866537570953369 seconds +1: +1: Time to load utils op: 0.287355899810791 seconds +1: Time to load utils op: 0.2871885299682617 seconds +1: Time to load utils op: 0.28630924224853516 secondsTime to load utils op: 0.286257266998291 secondsTime to load utils op: 0.28690290451049805 seconds +1: +1: +0: Time to load utils op: 0.2191452980041504 seconds +0: Time to load utils op: 0.3058795928955078 seconds +0: Time to load utils op: 0.30588436126708984 seconds +0: Time to load utils op: 0.3063852787017822 seconds +0: Time to load utils op: 0.30639028549194336 seconds +0: Time to load utils op: 0.3065338134765625 seconds +0: Time to load utils op: 0.3062143325805664 seconds +0: Time to load utils op: 0.30653858184814453 seconds +5: Time to load utils op: 0.30345845222473145 seconds +5: Time to load utils op: 0.3032045364379883 seconds +5: Time to load utils op: 0.3039402961730957 secondsTime to load utils op: 0.3039405345916748 seconds +5: +5: Time to load utils op: 0.30382871627807617 seconds +5: Time to load utils op: 0.3030509948730469 seconds +5: Time to load utils op: 0.3036632537841797 seconds +3: Time to load utils op: 0.3137068748474121 seconds +3: Time to load utils op: 0.31372857093811035 seconds +3: Time to load utils op: 0.3137357234954834 seconds +3: Time to load utils op: 0.31371569633483887 seconds +3: Time to load utils op: 0.313720703125 seconds +3: Time to load utils op: 0.31372928619384766 secondsTime to load utils op: 0.3137338161468506 seconds +3: Time to load utils op: 0.31372976303100586 seconds +3: +2: Time to load utils op: 0.3459460735321045 secondsTime to load utils op: 0.3459656238555908 seconds +2: +4: Time to load utils op: 0.3447906970977783 secondsTime to load utils op: 0.3447909355163574 seconds +4: +4: Time to load utils op: 0.3447990417480469 seconds +2: Time to load utils op: 0.34596705436706543 secondsTime to load utils op: 0.3459808826446533 secondsTime to load utils op: 0.3459737300872803 seconds +2: +2: +7: Time to load utils op: 0.3418564796447754 seconds +7: Time to load utils op: 0.34188151359558105 seconds +2: Time to load utils op: 0.3460235595703125 secondsTime to load utils op: 0.3459916114807129 seconds +4: Time to load utils op: 0.3448030948638916 seconds +2: +4: Time to load utils op: 0.3448629379272461 seconds +4: Time to load utils op: 0.3448481559753418 secondsTime to load utils op: 0.34485697746276855 seconds +4: +4: Time to load utils op: 0.3448824882507324 seconds +2: Time to load utils op: 0.34604573249816895 seconds +7: Time to load utils op: 0.3419947624206543 seconds +7: Time to load utils op: 0.3418440818786621 secondsTime to load utils op: 0.34183454513549805 seconds +7: Time to load utils op: 0.34183740615844727 seconds +7: +7: Time to load utils op: 0.341843843460083 seconds +7: Time to load utils op: 0.34185171127319336 seconds +6: Time to load utils op: 0.3428976535797119 secondsTime to load utils op: 0.34288883209228516 seconds +6: +6: Time to load utils op: 0.34297609329223633 seconds +6: Time to load utils op: 0.3429396152496338 seconds +6: Time to load utils op: 0.34298157691955566 seconds +6: Time to load utils op: 0.3429534435272217 seconds +6: Time to load utils op: 0.34296631813049316 seconds +6: Time to load utils op: 0.3430204391479492 seconds +0: [2023-03-15 22:19:53,353] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-15 22:19:53,354] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.15 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-15 22:19:53,354] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +4: Time to load utils op: 0.001238107681274414 seconds +4: Time to load utils op: 0.001270294189453125 seconds +4: Time to load utils op: 0.0013523101806640625 seconds +4: Time to load utils op: 0.0014324188232421875 seconds +4: Time to load utils op: 0.0015566349029541016 seconds +4: Time to load utils op: 0.0015501976013183594 secondsTime to load utils op: 0.0015168190002441406 seconds +4: +4: Time to load utils op: 0.0016052722930908203 seconds +0: Time to load utils op: 0.0005803108215332031 secondsTime to load utils op: 0.0006058216094970703 seconds +0: +0: Time to load utils op: 0.00066375732421875 seconds +0: Time to load utils op: 0.0005991458892822266 seconds +0: Time to load utils op: 0.0005896091461181641 seconds +0: Time to load utils op: 0.0006499290466308594 seconds +0: Time to load utils op: 0.0006477832794189453 seconds +1: Time to load utils op: 0.0009489059448242188 seconds +1: Time to load utils op: 0.0011053085327148438 seconds +1: Time to load utils op: 0.0013394355773925781 seconds +1: Time to load utils op: 0.0012488365173339844 seconds +1: Time to load utils op: 0.0010755062103271484 seconds +1: Time to load utils op: 0.0012555122375488281 secondsTime to load utils op: 0.0011801719665527344 seconds +1: +1: Time to load utils op: 0.0011947154998779297 seconds +7: Time to load utils op: 0.0007071495056152344 seconds +7: Time to load utils op: 0.0011699199676513672 seconds +7: Time to load utils op: 0.0011115074157714844 secondsTime to load utils op: 0.001119375228881836 seconds +7: +7: Time to load utils op: 0.0013077259063720703 seconds +7: Time to load utils op: 0.001146078109741211 seconds +7: Time to load utils op: 0.0012621879577636719 seconds +7: Time to load utils op: 0.0011610984802246094 seconds +5: Time to load utils op: 0.0004761219024658203 seconds +5: Time to load utils op: 0.0004544258117675781 secondsTime to load utils op: 0.000446319580078125 seconds +5: +5: Time to load utils op: 0.00043129920959472656 seconds +5: Time to load utils op: 0.00042366981506347656 secondsTime to load utils op: 0.00044226646423339844 seconds +5: +5: Time to load utils op: 0.0005285739898681641 seconds +2: Time to load utils op: 0.0010235309600830078 seconds +5: Time to load utils op: 0.00046253204345703125 seconds +2: Time to load utils op: 0.0012576580047607422 seconds +3: Time to load utils op: 0.0010597705841064453 secondsTime to load utils op: 0.0010943412780761719 seconds +3: +2: Time to load utils op: 0.0013630390167236328 seconds +2: Time to load utils op: 0.001401662826538086 seconds +2: Time to load utils op: 0.0013713836669921875 seconds +2: Time to load utils op: 0.0013654232025146484 seconds +6: Time to load utils op: 0.0009462833404541016 seconds +2: Time to load utils op: 0.0013895034790039062 seconds +6: Time to load utils op: 0.0010881423950195312 seconds +2: Time to load utils op: 0.001405477523803711 seconds +3: Time to load utils op: 0.0011239051818847656 seconds +3: Time to load utils op: 0.0012881755828857422 seconds +3: Time to load utils op: 0.0012977123260498047 secondsTime to load utils op: 0.001262664794921875 seconds +3: +3: Time to load utils op: 0.0012125968933105469 seconds +3: Time to load utils op: 0.0012710094451904297 seconds +6: Time to load utils op: 0.001234292984008789 seconds +6: Time to load utils op: 0.0012819766998291016 seconds +6: Time to load utils op: 0.0012426376342773438 seconds +6: Time to load utils op: 0.0012450218200683594 seconds +6: Time to load utils op: 0.0012526512145996094 seconds +6: Time to load utils op: 0.0013344287872314453 seconds +0: [2023-03-15 22:19:53,572] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-15 22:19:53,572] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-15 22:19:53,573] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:53,681] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-15 22:19:53,682] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-15 22:19:53,682] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:53,790] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-15 22:19:53,791] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:53,791] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:53,898] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-15 22:19:53,898] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:53,898] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:54,007] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-15 22:19:54,008] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:54,008] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:54,114] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-15 22:19:54,114] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:54,114] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:54,226] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-15 22:19:54,227] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:54,227] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:54,333] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-15 22:19:54,334] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 22:19:54,334] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 22:19:54,334] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-15 22:19:54,334] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-15 22:19:54,334] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-15 22:19:54,335] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-15 22:19:54,335] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-15 22:19:54,335] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-15 22:19:54,335] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-15 22:19:54,335] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-15 22:19:54,335] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-15 22:19:54,336] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-15 22:19:54,337] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-15 22:19:54,337] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.000415802001953125 seconds +0: [2023-03-15 22:19:54,338] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-15 22:19:54,348] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=26 [0, 26) STAGE_PARAMS=618714624 (618.715M) TOTAL_PARAMS=618714624 (618.715M) UNIQUE_PARAMS=618714624 (618.715M) +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_619m2b7400m +0: will not load any checkpoints and will start from random +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,353] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 22:19:54,354] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_619m2b7400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 7.82 +0: estimated model parameters: 0.618714624 +0: estimated model parameters without embeddings: 0.538301952 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-15 22:19:55 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1308594 +0: validation: 1536 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.008909 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1308594ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1308594ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1308594ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.088 seconds +0: total number of samples: 1365704 +0: total number of epochs: 7 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.051339 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_1536ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_1536ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_1536ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.077 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-15 22:20:07 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 17008.35 | train/valid/test-data-iterators-setup: 12233.47 +0: [000-000] 0.6187B / 0.5383B +0: [before the start of training step] datetime: 2023-03-15 22:20:07 +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 6409.56689453125 | max allocated: 34826.62890625 | reserved: 38832.0 | max reserved: 38832.0 +7: iteration 10/ 5111 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 2.53 | learning rate: 3.913E-05 | global batch size: 256 | lm loss: 9.898353E+00 | grad norm: 3.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 101.234 | TFLOPs: 13.91 | +7: iteration 20/ 5111 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.77 | learning rate: 7.825E-05 | global batch size: 256 | lm loss: 8.636529E+00 | grad norm: 1.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.615 | TFLOPs: 45.41 | +7: iteration 30/ 5111 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.78 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 7.817020E+00 | grad norm: 1.842 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.803 | TFLOPs: 45.03 | +7: iteration 40/ 5111 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.77 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 7.417764E+00 | grad norm: 1.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.056 | TFLOPs: 45.47 | +7: iteration 50/ 5111 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.78 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 7.252057E+00 | grad norm: 1.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.801 | TFLOPs: 45.03 | +7: iteration 60/ 5111 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.76 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 7.039539E+00 | grad norm: 1.013 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.492 | TFLOPs: 46.22 | +7: iteration 70/ 5111 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.78 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.866595E+00 | grad norm: 0.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.258 | TFLOPs: 45.23 | +7: iteration 80/ 5111 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.77 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.710536E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.816 | TFLOPs: 45.44 | +7: iteration 90/ 5111 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.78 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.606774E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.560 | TFLOPs: 45.13 | +7: iteration 100/ 5111 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.76 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.525609E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.678 | TFLOPs: 46.38 | +7: iteration 110/ 5111 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.78 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.490987E+00 | grad norm: 0.850 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.231 | TFLOPs: 44.95 | +7: iteration 120/ 5111 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.76 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.439439E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.923 | TFLOPs: 46.14 | +7: iteration 130/ 5111 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.77 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.373621E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.561 | TFLOPs: 45.41 | +7: iteration 140/ 5111 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.78 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 6.330902E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.216 | TFLOPs: 44.95 | +7: iteration 150/ 5111 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.78 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 6.305273E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.281 | TFLOPs: 45.23 | +7: iteration 160/ 5111 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.77 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 6.253965E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.141 | TFLOPs: 45.62 | +7: iteration 170/ 5111 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.77 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 6.218412E+00 | grad norm: 0.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.574 | TFLOPs: 45.82 | +7: iteration 180/ 5111 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.80 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 6.199778E+00 | grad norm: 0.685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 321.909 | TFLOPs: 44.22 | +7: iteration 190/ 5111 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.77 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 6.170295E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.960 | TFLOPs: 45.87 | +7: iteration 200/ 5111 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.78 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 6.140448E+00 | grad norm: 0.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.128 | TFLOPs: 45.35 | +7: iteration 210/ 5111 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.78 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 6.101767E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.069 | TFLOPs: 44.93 | +7: iteration 220/ 5111 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.78 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 6.078740E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 326.962 | TFLOPs: 44.91 | +7: iteration 230/ 5111 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.78 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 6.035664E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.205 | TFLOPs: 45.08 | +7: iteration 240/ 5111 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.76 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 6.014500E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.422 | TFLOPs: 46.07 | +7: iteration 250/ 5111 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.76 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 5.981506E+00 | grad norm: 0.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.874 | TFLOPs: 46.00 | +7: iteration 260/ 5111 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.76 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 5.964225E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.623 | TFLOPs: 46.10 | +7: iteration 270/ 5111 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.78 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 5.937399E+00 | grad norm: 0.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.042 | TFLOPs: 45.06 | +7: iteration 280/ 5111 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.76 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 5.907223E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.737 | TFLOPs: 46.53 | +7: iteration 290/ 5111 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.76 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 5.851162E+00 | grad norm: 1.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.761 | TFLOPs: 45.98 | +7: iteration 300/ 5111 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.77 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 5.871957E+00 | grad norm: 1.073 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.358 | TFLOPs: 45.51 | +7: iteration 310/ 5111 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.77 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 5.830150E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.620 | TFLOPs: 45.69 | +7: iteration 320/ 5111 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.76 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 5.749750E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.765 | TFLOPs: 45.98 | +7: iteration 330/ 5111 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.78 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 5.749352E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.103 | TFLOPs: 45.34 | +7: iteration 340/ 5111 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.77 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 5.714845E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.929 | TFLOPs: 45.73 | +7: iteration 350/ 5111 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.77 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 5.681992E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.062 | TFLOPs: 45.89 | +7: iteration 360/ 5111 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.76 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 5.670543E+00 | grad norm: 0.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.976 | TFLOPs: 46.01 | +7: iteration 370/ 5111 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.78 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 5.631403E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.177 | TFLOPs: 45.35 | +7: iteration 380/ 5111 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.77 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 5.579153E+00 | grad norm: 0.818 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.506 | TFLOPs: 45.67 | +7: iteration 390/ 5111 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.78 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 5.567757E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.499 | TFLOPs: 45.26 | +7: iteration 400/ 5111 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.76 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 5.529657E+00 | grad norm: 0.679 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.759 | TFLOPs: 46.12 | +7: iteration 410/ 5111 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.77 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 5.486267E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.420 | TFLOPs: 45.80 | +7: iteration 420/ 5111 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.75 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 5.447980E+00 | grad norm: 0.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.901 | TFLOPs: 46.69 | +7: iteration 430/ 5111 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.78 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 5.445386E+00 | grad norm: 0.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.278 | TFLOPs: 44.95 | +7: iteration 440/ 5111 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.76 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 5.393705E+00 | grad norm: 0.766 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.435 | TFLOPs: 46.21 | +7: iteration 450/ 5111 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.77 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 5.380716E+00 | grad norm: 1.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.463 | TFLOPs: 45.94 | +7: iteration 460/ 5111 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.76 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 5.335891E+00 | grad norm: 0.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.179 | TFLOPs: 46.31 | +7: iteration 470/ 5111 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.77 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 5.297987E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.894 | TFLOPs: 45.86 | +7: iteration 480/ 5111 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.77 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 5.292832E+00 | grad norm: 0.879 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.708 | TFLOPs: 45.70 | +7: iteration 490/ 5111 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.76 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 5.253120E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.446 | TFLOPs: 46.49 | +7: iteration 500/ 5111 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.76 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 5.202845E+00 | grad norm: 0.713 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.188 | TFLOPs: 46.18 | +7: iteration 510/ 5111 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.77 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 5.198151E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.395 | TFLOPs: 45.93 | +7: iteration 520/ 5111 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.76 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 5.141507E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.318 | TFLOPs: 46.47 | +7: iteration 530/ 5111 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.78 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 5.117651E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.042 | TFLOPs: 45.33 | +7: iteration 540/ 5111 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.78 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 5.097948E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.547 | TFLOPs: 45.13 | +7: iteration 550/ 5111 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.77 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 5.058422E+00 | grad norm: 0.869 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.128 | TFLOPs: 45.76 | +7: iteration 560/ 5111 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.77 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 5.028521E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.485 | TFLOPs: 45.81 | +7: iteration 570/ 5111 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.75 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.995846E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.480 | TFLOPs: 46.63 | +7: iteration 580/ 5111 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.77 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.977039E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.780 | TFLOPs: 45.57 | +7: iteration 590/ 5111 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.76 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.936448E+00 | grad norm: 0.639 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.496 | TFLOPs: 46.22 | +7: iteration 600/ 5111 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.77 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.904394E+00 | grad norm: 0.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.708 | TFLOPs: 45.84 | +7: iteration 610/ 5111 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.77 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.861602E+00 | grad norm: 0.987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.676 | TFLOPs: 45.42 | +7: iteration 620/ 5111 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.77 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.876650E+00 | grad norm: 0.683 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.072 | TFLOPs: 45.61 | +7: iteration 630/ 5111 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.77 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.807820E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.820 | TFLOPs: 45.58 | +7: iteration 640/ 5111 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.77 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.774892E+00 | grad norm: 0.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.497 | TFLOPs: 45.40 | +7: iteration 650/ 5111 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.77 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.729450E+00 | grad norm: 0.750 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.276 | TFLOPs: 45.64 | +7: iteration 660/ 5111 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.76 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.702357E+00 | grad norm: 0.829 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.320 | TFLOPs: 46.33 | +7: iteration 670/ 5111 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.77 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.706216E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.397 | TFLOPs: 45.66 | +7: iteration 680/ 5111 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.77 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.652816E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.486 | TFLOPs: 45.94 | +7: iteration 690/ 5111 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.76 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.619194E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.770 | TFLOPs: 46.40 | +7: iteration 700/ 5111 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.76 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.596252E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.041 | TFLOPs: 46.16 | +7: iteration 710/ 5111 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.75 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.573396E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.545 | TFLOPs: 46.64 | +7: iteration 720/ 5111 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.76 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.531342E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.118 | TFLOPs: 46.17 | +7: iteration 730/ 5111 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.78 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.522043E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.398 | TFLOPs: 45.25 | +7: iteration 740/ 5111 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.77 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.514378E+00 | grad norm: 0.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.090 | TFLOPs: 45.75 | +7: iteration 750/ 5111 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.77 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.494963E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.799 | TFLOPs: 45.85 | +7: iteration 760/ 5111 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.76 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.479237E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.879 | TFLOPs: 46.00 | +7: iteration 770/ 5111 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.77 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.433130E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.292 | TFLOPs: 45.64 | +7: iteration 780/ 5111 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.76 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.436021E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.175 | TFLOPs: 46.04 | +7: iteration 790/ 5111 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.76 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.424761E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.974 | TFLOPs: 46.15 | +7: iteration 800/ 5111 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.76 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.426160E+00 | grad norm: 0.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.232 | TFLOPs: 46.46 | +7: iteration 810/ 5111 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.78 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.403563E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.593 | TFLOPs: 45.27 | +7: iteration 820/ 5111 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.77 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.374015E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.371 | TFLOPs: 45.65 | +7: iteration 830/ 5111 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.77 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.357983E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.348 | TFLOPs: 45.79 | +7: iteration 840/ 5111 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.77 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.333335E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.792 | TFLOPs: 45.85 | +7: iteration 850/ 5111 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.76 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.342595E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.805 | TFLOPs: 45.99 | +7: iteration 860/ 5111 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.80 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.314025E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 319.978 | TFLOPs: 43.95 | +7: iteration 870/ 5111 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.76 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 4.300742E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.760 | TFLOPs: 46.12 | +7: iteration 880/ 5111 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.77 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 4.295804E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.677 | TFLOPs: 45.83 | +7: iteration 890/ 5111 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.77 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 4.285990E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.376 | TFLOPs: 45.79 | +7: iteration 900/ 5111 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.78 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 4.277449E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.304 | TFLOPs: 44.96 | +7: iteration 910/ 5111 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.78 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 4.242863E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 326.621 | TFLOPs: 44.86 | +7: iteration 920/ 5111 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.76 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 4.263306E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.536 | TFLOPs: 46.09 | +7: iteration 930/ 5111 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.76 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 4.237844E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.458 | TFLOPs: 46.35 | +7: iteration 940/ 5111 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.78 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 4.221281E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.665 | TFLOPs: 45.28 | +7: iteration 950/ 5111 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.76 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 4.222636E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.286 | TFLOPs: 46.19 | +7: iteration 960/ 5111 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.76 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 4.206547E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.500 | TFLOPs: 46.50 | +7: iteration 970/ 5111 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.78 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 4.198036E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.367 | TFLOPs: 45.24 | +7: iteration 980/ 5111 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.76 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 4.198810E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.305 | TFLOPs: 46.19 | +7: iteration 990/ 5111 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.76 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 4.179040E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.634 | TFLOPs: 46.51 | +7: iteration 1000/ 5111 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.77 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 4.162016E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.173 | TFLOPs: 45.49 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 4.173533E+00 | lm loss PPL: 6.494452E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_619m2b7400m +0: [2023-03-15 22:33:14,219] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-15 22:33:14,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:33:14,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:33:14,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:33:14,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:33:14,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:33:14,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:33:14,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:33:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:33:14,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:33:14,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:33:14,692] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:33:14,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:33:14,750] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:33:14,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:33:14,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:33:14,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:33:14,862] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:33:14,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:33:14,917] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:33:14,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:33:14,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:33:15,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:33:15,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:33:15,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:33:15,083] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:33:15,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:33:15,138] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:33:15,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:33:15,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:33:15,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:33:15,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:33:15,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:33:15,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:33:15,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:33:15,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:33:15,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:33:15,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:33:15,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:33:15,475] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_21-model_00-model_states.pt... +0: [2023-03-15 22:33:15,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_21-model_00-model_states.pt. +0: [2023-03-15 22:33:15,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/layer_23-model_00-model_states.pt... +0: [2023-03-15 22:33:15,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/layer_23-model_00-model_states.pt. +0: [2023-03-15 22:33:15,535] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-15 22:33:15,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:33:15,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:33:15,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:33:15,558] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:33:15,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,717] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,766] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,766] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:33:15,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,859] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,859] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,859] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,861] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,863] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:33:15,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:33:15,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:33:15,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:33:15,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:33:15,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:33:15,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:33:15,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:33:15,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:33:15,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:33:15,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:33:15,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:33:15,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:33:15,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:33:15,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:33:15,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:33:15,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:33:15,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:33:15,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:33:15,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:33:15,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:33:15,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:33:15,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:33:15,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_619m2b7400m +7: time (ms) | save-checkpoint: 1704.03 +7: iteration 1010/ 5111 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.98 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 4.167627E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 261.626 | TFLOPs: 35.94 | +7: iteration 1020/ 5111 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.77 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 4.148227E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.194 | TFLOPs: 45.90 | +7: iteration 1030/ 5111 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.77 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 4.149324E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.557 | TFLOPs: 45.54 | +7: iteration 1040/ 5111 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.76 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 4.130143E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.764 | TFLOPs: 46.53 | +7: iteration 1050/ 5111 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.77 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 4.128861E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.155 | TFLOPs: 45.76 | +7: iteration 1060/ 5111 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.75 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 4.107196E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.195 | TFLOPs: 46.59 | +7: iteration 1070/ 5111 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.77 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 4.096702E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.738 | TFLOPs: 45.70 | +7: iteration 1080/ 5111 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.77 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 4.100112E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.029 | TFLOPs: 45.88 | +7: iteration 1090/ 5111 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.77 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 4.100780E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.300 | TFLOPs: 45.78 | +7: iteration 1100/ 5111 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.79 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 4.073489E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 324.653 | TFLOPs: 44.59 | +7: iteration 1110/ 5111 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.78 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 4.074197E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 326.283 | TFLOPs: 44.82 | +7: iteration 1120/ 5111 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.76 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 4.076532E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.531 | TFLOPs: 46.09 | +7: iteration 1130/ 5111 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.77 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 4.060798E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.371 | TFLOPs: 45.38 | +7: iteration 1140/ 5111 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.77 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 4.053545E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.803 | TFLOPs: 45.58 | +7: iteration 1150/ 5111 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.76 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 4.058975E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.836 | TFLOPs: 46.40 | +7: iteration 1160/ 5111 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.76 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 4.048050E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.710 | TFLOPs: 45.98 | +7: iteration 1170/ 5111 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.77 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 4.040485E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.350 | TFLOPs: 45.93 | +7: iteration 1180/ 5111 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.77 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 4.145197E+00 | grad norm: 1.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.788 | TFLOPs: 45.44 | +7: iteration 1190/ 5111 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.76 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 4.232846E+00 | grad norm: 0.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.174 | TFLOPs: 46.04 | +7: iteration 1200/ 5111 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.76 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 4.204120E+00 | grad norm: 0.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.693 | TFLOPs: 46.25 | +7: iteration 1210/ 5111 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.76 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 4.119761E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.921 | TFLOPs: 46.14 | +7: iteration 1220/ 5111 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.76 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 4.075689E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.067 | TFLOPs: 46.57 | +7: iteration 1230/ 5111 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.77 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 4.037822E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.386 | TFLOPs: 45.93 | +7: iteration 1240/ 5111 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.76 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 4.022081E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.163 | TFLOPs: 46.45 | +7: iteration 1250/ 5111 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.77 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 4.023577E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.998 | TFLOPs: 45.47 | +7: iteration 1260/ 5111 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.76 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 4.009978E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.660 | TFLOPs: 45.97 | +7: iteration 1270/ 5111 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.75 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.991200E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.521 | TFLOPs: 46.64 | +7: iteration 1280/ 5111 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.76 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.985806E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.681 | TFLOPs: 46.38 | +7: iteration 1290/ 5111 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.76 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.984729E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.991 | TFLOPs: 46.29 | +7: iteration 1300/ 5111 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.76 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.967724E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.369 | TFLOPs: 46.34 | +7: iteration 1310/ 5111 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.77 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.942764E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.924 | TFLOPs: 45.59 | +7: iteration 1320/ 5111 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.76 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.968929E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.913 | TFLOPs: 46.28 | +7: iteration 1330/ 5111 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.75 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.950906E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.675 | TFLOPs: 46.66 | +7: iteration 1340/ 5111 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.76 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.952418E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.898 | TFLOPs: 46.41 | +7: iteration 1350/ 5111 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.76 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.936245E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.267 | TFLOPs: 46.05 | +7: iteration 1360/ 5111 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.77 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.950780E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.659 | TFLOPs: 45.83 | +7: iteration 1370/ 5111 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.76 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.938494E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.417 | TFLOPs: 46.07 | +7: iteration 1380/ 5111 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.76 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.935985E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.897 | TFLOPs: 46.28 | +7: iteration 1390/ 5111 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.78 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.926466E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.853 | TFLOPs: 45.03 | +7: iteration 1400/ 5111 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.77 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.933525E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.233 | TFLOPs: 45.91 | +7: iteration 1410/ 5111 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.76 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.906555E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.130 | TFLOPs: 46.03 | +7: iteration 1420/ 5111 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.77 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.932838E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.231 | TFLOPs: 45.91 | +7: iteration 1430/ 5111 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.77 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.915711E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.356 | TFLOPs: 45.65 | +7: iteration 1440/ 5111 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.76 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.901960E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.169 | TFLOPs: 46.04 | +7: iteration 1450/ 5111 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.78 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.911873E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.731 | TFLOPs: 45.02 | +7: iteration 1460/ 5111 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.76 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.897029E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.286 | TFLOPs: 46.05 | +7: iteration 1470/ 5111 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.75 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.888365E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.289 | TFLOPs: 46.60 | +7: iteration 1480/ 5111 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.77 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.880289E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.907 | TFLOPs: 45.59 | +7: iteration 1490/ 5111 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.76 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.876402E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.465 | TFLOPs: 46.35 | +7: iteration 1500/ 5111 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.76 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.871740E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.108 | TFLOPs: 46.17 | +7: iteration 1510/ 5111 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.76 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.866252E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.281 | TFLOPs: 46.19 | +7: iteration 1520/ 5111 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.77 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.879506E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.076 | TFLOPs: 45.89 | +7: iteration 1530/ 5111 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.78 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.854447E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.811 | TFLOPs: 45.30 | +7: iteration 1540/ 5111 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.78 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.864382E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.158 | TFLOPs: 45.21 | +7: iteration 1550/ 5111 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.76 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.864902E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.496 | TFLOPs: 46.50 | +7: iteration 1560/ 5111 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.76 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.857690E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.339 | TFLOPs: 46.34 | +7: iteration 1570/ 5111 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.76 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.840911E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.339 | TFLOPs: 46.34 | +7: iteration 1580/ 5111 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.77 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.846575E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.879 | TFLOPs: 45.72 | +7: iteration 1590/ 5111 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.77 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.840908E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.323 | TFLOPs: 45.37 | +7: iteration 1600/ 5111 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.80 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.837464E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 321.348 | TFLOPs: 44.14 | +7: iteration 1610/ 5111 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.77 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.823576E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.616 | TFLOPs: 45.96 | +7: iteration 1620/ 5111 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.77 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.826715E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.716 | TFLOPs: 45.56 | +7: iteration 1630/ 5111 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.76 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.827564E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.309 | TFLOPs: 46.33 | +7: iteration 1640/ 5111 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.77 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.823902E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 330.800 | TFLOPs: 45.44 | +7: iteration 1650/ 5111 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.77 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.820181E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.479 | TFLOPs: 45.94 | +7: iteration 1660/ 5111 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.75 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.825396E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.601 | TFLOPs: 46.78 | +7: iteration 1670/ 5111 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.76 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.815469E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.073 | TFLOPs: 46.44 | +7: iteration 1680/ 5111 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.76 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.811795E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.339 | TFLOPs: 46.06 | +7: iteration 1690/ 5111 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.76 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.812999E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.059 | TFLOPs: 46.57 | +7: iteration 1700/ 5111 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.76 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.819713E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.177 | TFLOPs: 46.45 | +7: iteration 1710/ 5111 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.76 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.789754E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.206 | TFLOPs: 46.32 | +7: iteration 1720/ 5111 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.76 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.794873E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.108 | TFLOPs: 46.44 | +7: iteration 1730/ 5111 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.77 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.799890E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.411 | TFLOPs: 45.52 | +7: iteration 1740/ 5111 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.77 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.789658E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.558 | TFLOPs: 45.68 | +7: iteration 1750/ 5111 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.76 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.791833E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.160 | TFLOPs: 46.17 | +7: iteration 1760/ 5111 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.76 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.774987E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.716 | TFLOPs: 46.39 | +7: iteration 1770/ 5111 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.78 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.777577E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.421 | TFLOPs: 45.11 | +7: iteration 1780/ 5111 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.77 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.782666E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.243 | TFLOPs: 45.64 | +7: iteration 1790/ 5111 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.77 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.779440E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.739 | TFLOPs: 45.84 | +7: iteration 1800/ 5111 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.76 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.776797E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.036 | TFLOPs: 46.29 | +7: iteration 1810/ 5111 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.76 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.768841E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.652 | TFLOPs: 46.38 | +7: iteration 1820/ 5111 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.76 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.759610E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.528 | TFLOPs: 46.36 | +7: iteration 1830/ 5111 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.76 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.756139E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.562 | TFLOPs: 46.23 | +7: iteration 1840/ 5111 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.76 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.755911E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.358 | TFLOPs: 46.06 | +7: iteration 1850/ 5111 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.76 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.752025E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.523 | TFLOPs: 46.09 | +7: iteration 1860/ 5111 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.76 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.758817E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.953 | TFLOPs: 46.28 | +7: iteration 1870/ 5111 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.77 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.750959E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.072 | TFLOPs: 45.75 | +7: iteration 1880/ 5111 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.76 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.740925E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.294 | TFLOPs: 46.19 | +7: iteration 1890/ 5111 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.77 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.758832E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.026 | TFLOPs: 45.47 | +7: iteration 1900/ 5111 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.75 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.756228E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.322 | TFLOPs: 46.61 | +7: iteration 1910/ 5111 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.77 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.763410E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.692 | TFLOPs: 45.84 | +7: iteration 1920/ 5111 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.77 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.752573E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.736 | TFLOPs: 45.84 | +7: iteration 1930/ 5111 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.76 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.755503E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.702 | TFLOPs: 46.25 | +7: iteration 1940/ 5111 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.76 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.747758E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.988 | TFLOPs: 46.29 | +7: iteration 1950/ 5111 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.76 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.718278E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.485 | TFLOPs: 46.08 | +7: iteration 1960/ 5111 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.76 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.740414E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.520 | TFLOPs: 46.22 | +7: iteration 1970/ 5111 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.76 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.724832E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.697 | TFLOPs: 46.39 | +7: iteration 1980/ 5111 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.78 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.716042E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 328.604 | TFLOPs: 45.14 | +7: iteration 1990/ 5111 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.77 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.722739E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.086 | TFLOPs: 45.48 | +0: [2023-03-15 22:46:01,181] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00014178345342766413, 0.00014178345342766413, 0.00014178345342766413], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 5111 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.77 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.713476E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.136 | TFLOPs: 45.76 | +0: steps: 2000 loss: 3.7119 iter time (s): 0.774 samples/sec: 330.923 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 3.702347E+00 | lm loss PPL: 4.054235E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_619m2b7400m +0: [2023-03-15 22:46:01,446] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-15 22:46:01,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:46:01,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:46:01,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:46:01,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:46:01,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:46:01,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:46:01,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:46:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:46:01,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:46:01,863] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:46:01,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:46:01,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:46:01,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:46:01,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:46:01,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:46:02,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:46:02,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:46:02,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:46:02,091] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:46:02,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:46:02,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:46:02,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:46:02,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:46:02,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:46:02,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:46:02,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:46:02,321] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:46:02,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:46:02,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:46:02,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:46:02,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:46:02,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:46:02,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:46:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:46:02,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:46:02,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:46:02,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:46:02,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:46:02,793] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_21-model_00-model_states.pt... +0: [2023-03-15 22:46:02,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_21-model_00-model_states.pt. +0: [2023-03-15 22:46:02,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/layer_23-model_00-model_states.pt... +0: [2023-03-15 22:46:02,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/layer_23-model_00-model_states.pt. +0: [2023-03-15 22:46:02,856] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-15 22:46:02,856] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:46:02,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:46:02,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:46:03,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,024] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,024] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,024] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,088] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,088] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,088] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,161] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,161] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:46:03,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:46:03,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:46:03,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:46:03,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:46:03,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:46:03,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:46:03,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:46:03,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:46:03,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:46:03,200] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:46:03,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:46:03,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:46:03,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:46:03,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:46:03,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:46:03,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:46:03,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:46:03,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:46:03,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:46:03,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:46:03,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:46:03,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:46:03,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:46:03,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_619m2b7400m +7: time (ms) | save-checkpoint: 1784.20 +7: iteration 2010/ 5111 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.97 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.723517E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 264.435 | TFLOPs: 36.32 | +7: iteration 2020/ 5111 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.76 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.712139E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.373 | TFLOPs: 46.34 | +7: iteration 2030/ 5111 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.75 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.714959E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.359 | TFLOPs: 46.75 | +7: iteration 2040/ 5111 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.76 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.695593E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.102 | TFLOPs: 46.17 | +7: iteration 2050/ 5111 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.75 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.695288E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.711 | TFLOPs: 46.66 | +7: iteration 2060/ 5111 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.76 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.710129E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.744 | TFLOPs: 46.53 | +7: iteration 2070/ 5111 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.76 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.691918E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.753 | TFLOPs: 46.26 | +7: iteration 2080/ 5111 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.77 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.698141E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.252 | TFLOPs: 45.91 | +7: iteration 2090/ 5111 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.76 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.693457E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.705 | TFLOPs: 46.52 | +7: iteration 2100/ 5111 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.76 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.690554E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.956 | TFLOPs: 46.42 | +7: iteration 2110/ 5111 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.76 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.675154E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.784 | TFLOPs: 46.53 | +7: iteration 2120/ 5111 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.76 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.685118E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.717 | TFLOPs: 45.98 | +7: iteration 2130/ 5111 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.76 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.695407E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.098 | TFLOPs: 46.44 | +7: iteration 2140/ 5111 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.76 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.679923E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.850 | TFLOPs: 46.41 | +7: iteration 2150/ 5111 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.77 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.676600E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.036 | TFLOPs: 45.75 | +7: iteration 2160/ 5111 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.76 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.669759E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.971 | TFLOPs: 46.29 | +7: iteration 2170/ 5111 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.75 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.681193E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.242 | TFLOPs: 46.73 | +7: iteration 2180/ 5111 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.76 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.671584E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.192 | TFLOPs: 46.32 | +7: iteration 2190/ 5111 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.76 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.681287E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.903 | TFLOPs: 46.28 | +7: iteration 2200/ 5111 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.76 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.649101E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.396 | TFLOPs: 46.48 | +7: iteration 2210/ 5111 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.76 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.661359E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.881 | TFLOPs: 46.41 | +7: iteration 2220/ 5111 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.77 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.655462E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.242 | TFLOPs: 45.50 | +7: iteration 2230/ 5111 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.76 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.649213E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.379 | TFLOPs: 46.34 | +7: iteration 2240/ 5111 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.76 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.664364E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.034 | TFLOPs: 46.29 | +7: iteration 2250/ 5111 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.75 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.649953E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.363 | TFLOPs: 46.61 | +7: iteration 2260/ 5111 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.76 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.665808E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.817 | TFLOPs: 46.26 | +7: iteration 2270/ 5111 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.76 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.657291E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.924 | TFLOPs: 46.55 | +7: iteration 2280/ 5111 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.77 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.650835E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.023 | TFLOPs: 45.74 | +7: iteration 2290/ 5111 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.76 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.663692E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.851 | TFLOPs: 46.54 | +7: iteration 2300/ 5111 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.77 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.653220E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.760 | TFLOPs: 45.71 | +7: iteration 2310/ 5111 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.76 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.640451E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.103 | TFLOPs: 46.44 | +7: iteration 2320/ 5111 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.76 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.648277E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.201 | TFLOPs: 46.04 | +7: iteration 2330/ 5111 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.75 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.639063E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.250 | TFLOPs: 46.60 | +7: iteration 2340/ 5111 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.76 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.637827E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.231 | TFLOPs: 46.05 | +7: iteration 2350/ 5111 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.76 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.642163E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.558 | TFLOPs: 46.37 | +7: iteration 2360/ 5111 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.78 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.637463E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 329.807 | TFLOPs: 45.30 | +7: iteration 2370/ 5111 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.77 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.620320E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.064 | TFLOPs: 45.89 | +7: iteration 2380/ 5111 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.76 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.616133E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.041 | TFLOPs: 46.16 | +7: iteration 2390/ 5111 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.77 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.627217E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.020 | TFLOPs: 45.88 | +7: iteration 2400/ 5111 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.75 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.621268E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.444 | TFLOPs: 46.76 | +7: iteration 2410/ 5111 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.75 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.630759E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.486 | TFLOPs: 46.77 | +7: iteration 2420/ 5111 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.76 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.624362E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.520 | TFLOPs: 46.22 | +7: iteration 2430/ 5111 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.76 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.632523E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.040 | TFLOPs: 46.43 | +7: iteration 2440/ 5111 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.76 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.617118E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.945 | TFLOPs: 46.01 | +7: iteration 2450/ 5111 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.76 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.611507E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.840 | TFLOPs: 45.99 | +7: iteration 2460/ 5111 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.76 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.615461E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.584 | TFLOPs: 46.51 | +7: iteration 2470/ 5111 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.76 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.612769E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.987 | TFLOPs: 46.56 | +7: iteration 2480/ 5111 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.77 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.601806E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.527 | TFLOPs: 45.68 | +7: iteration 2490/ 5111 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.76 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.615773E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.974 | TFLOPs: 46.29 | +7: iteration 2500/ 5111 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.75 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.608227E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.143 | TFLOPs: 46.72 | +7: iteration 2510/ 5111 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.76 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.607396E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.087 | TFLOPs: 46.44 | +7: iteration 2520/ 5111 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.76 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.606179E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.177 | TFLOPs: 46.45 | +7: iteration 2530/ 5111 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.77 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.609529E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.808 | TFLOPs: 45.85 | +7: iteration 2540/ 5111 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.76 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.599187E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.661 | TFLOPs: 46.52 | +7: iteration 2550/ 5111 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.75 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.615699E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.269 | TFLOPs: 46.74 | +7: iteration 2560/ 5111 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.76 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.604496E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.472 | TFLOPs: 46.22 | +7: iteration 2570/ 5111 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.77 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.594081E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.857 | TFLOPs: 45.58 | +7: iteration 2580/ 5111 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.75 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.585435E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.469 | TFLOPs: 46.63 | +7: iteration 2590/ 5111 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.76 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.593255E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.231 | TFLOPs: 46.18 | +7: iteration 2600/ 5111 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.76 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.578218E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.177 | TFLOPs: 46.04 | +7: iteration 2610/ 5111 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.76 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.594020E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.363 | TFLOPs: 46.20 | +7: iteration 2620/ 5111 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.75 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.598975E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.100 | TFLOPs: 46.58 | +7: iteration 2630/ 5111 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.76 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.579321E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.175 | TFLOPs: 46.18 | +7: iteration 2640/ 5111 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.75 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.596231E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.746 | TFLOPs: 46.80 | +7: iteration 2650/ 5111 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.75 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.573516E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.446 | TFLOPs: 46.63 | +7: iteration 2660/ 5111 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.76 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.585148E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.559 | TFLOPs: 46.37 | +7: iteration 2670/ 5111 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.75 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.586572E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.858 | TFLOPs: 46.68 | +7: iteration 2680/ 5111 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.76 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.584430E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.430 | TFLOPs: 46.49 | +7: iteration 2690/ 5111 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.76 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.569501E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.597 | TFLOPs: 46.37 | +7: iteration 2700/ 5111 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.76 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.567652E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.980 | TFLOPs: 46.29 | +7: iteration 2710/ 5111 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.76 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.576491E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.096 | TFLOPs: 46.17 | +7: iteration 2720/ 5111 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.76 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.572104E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.965 | TFLOPs: 46.28 | +7: iteration 2730/ 5111 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.76 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.573635E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.510 | TFLOPs: 46.50 | +7: iteration 2740/ 5111 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.76 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.573677E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.228 | TFLOPs: 46.05 | +7: iteration 2750/ 5111 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.75 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.570007E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.214 | TFLOPs: 46.59 | +7: iteration 2760/ 5111 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.76 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.578943E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.778 | TFLOPs: 46.40 | +7: iteration 2770/ 5111 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.76 | learning rate: 9.949E-05 | global batch size: 256 | lm loss: 3.570494E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.750 | TFLOPs: 46.53 | +7: iteration 2780/ 5111 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.76 | learning rate: 9.893E-05 | global batch size: 256 | lm loss: 3.559249E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.616 | TFLOPs: 46.10 | +7: iteration 2790/ 5111 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.75 | learning rate: 9.838E-05 | global batch size: 256 | lm loss: 3.558356E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.259 | TFLOPs: 46.60 | +7: iteration 2800/ 5111 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.76 | learning rate: 9.782E-05 | global batch size: 256 | lm loss: 3.540414E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.647 | TFLOPs: 46.38 | +7: iteration 2810/ 5111 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.76 | learning rate: 9.727E-05 | global batch size: 256 | lm loss: 3.560577E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.408 | TFLOPs: 46.21 | +7: iteration 2820/ 5111 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.76 | learning rate: 9.672E-05 | global batch size: 256 | lm loss: 3.548517E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.990 | TFLOPs: 46.15 | +7: iteration 2830/ 5111 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.77 | learning rate: 9.617E-05 | global batch size: 256 | lm loss: 3.555171E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 332.158 | TFLOPs: 45.62 | +7: iteration 2840/ 5111 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.75 | learning rate: 9.561E-05 | global batch size: 256 | lm loss: 3.550037E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.007 | TFLOPs: 46.70 | +7: iteration 2850/ 5111 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.75 | learning rate: 9.506E-05 | global batch size: 256 | lm loss: 3.560454E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 341.111 | TFLOPs: 46.85 | +7: iteration 2860/ 5111 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.75 | learning rate: 9.451E-05 | global batch size: 256 | lm loss: 3.542068E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.828 | TFLOPs: 46.68 | +7: iteration 2870/ 5111 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.75 | learning rate: 9.396E-05 | global batch size: 256 | lm loss: 3.542127E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.378 | TFLOPs: 46.62 | +7: iteration 2880/ 5111 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.76 | learning rate: 9.341E-05 | global batch size: 256 | lm loss: 3.557994E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.842 | TFLOPs: 46.41 | +7: iteration 2890/ 5111 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.75 | learning rate: 9.286E-05 | global batch size: 256 | lm loss: 3.544698E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.418 | TFLOPs: 46.62 | +7: iteration 2900/ 5111 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.76 | learning rate: 9.232E-05 | global batch size: 256 | lm loss: 3.544549E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.651 | TFLOPs: 46.24 | +7: iteration 2910/ 5111 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.76 | learning rate: 9.177E-05 | global batch size: 256 | lm loss: 3.541290E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.664 | TFLOPs: 46.38 | +7: iteration 2920/ 5111 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.76 | learning rate: 9.122E-05 | global batch size: 256 | lm loss: 3.542454E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.674 | TFLOPs: 46.24 | +7: iteration 2930/ 5111 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.76 | learning rate: 9.068E-05 | global batch size: 256 | lm loss: 3.551359E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.264 | TFLOPs: 46.46 | +7: iteration 2940/ 5111 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.76 | learning rate: 9.013E-05 | global batch size: 256 | lm loss: 3.545039E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.957 | TFLOPs: 46.28 | +7: iteration 2950/ 5111 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.76 | learning rate: 8.959E-05 | global batch size: 256 | lm loss: 3.542348E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.004 | TFLOPs: 46.57 | +7: iteration 2960/ 5111 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.77 | learning rate: 8.904E-05 | global batch size: 256 | lm loss: 3.549818E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.623 | TFLOPs: 45.96 | +7: iteration 2970/ 5111 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.76 | learning rate: 8.850E-05 | global batch size: 256 | lm loss: 3.546348E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.642 | TFLOPs: 46.52 | +7: iteration 2980/ 5111 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.76 | learning rate: 8.796E-05 | global batch size: 256 | lm loss: 3.516154E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.144 | TFLOPs: 46.31 | +7: iteration 2990/ 5111 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.75 | learning rate: 8.742E-05 | global batch size: 256 | lm loss: 3.538433E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.091 | TFLOPs: 46.58 | +7: iteration 3000/ 5111 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.76 | learning rate: 8.687E-05 | global batch size: 256 | lm loss: 3.538366E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.141 | TFLOPs: 46.03 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 3.552472E+00 | lm loss PPL: 3.489947E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_619m2b7400m +0: [2023-03-15 22:58:42,706] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-15 22:58:42,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:58:42,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:58:42,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:58:42,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:58:42,932] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:58:42,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:58:42,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:58:43,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:58:43,045] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:58:43,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:58:43,102] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:58:43,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:58:43,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:58:43,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:58:43,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:58:43,266] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:58:43,266] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:58:43,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:58:43,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:58:43,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:58:43,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:58:43,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:58:43,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_13-model_00-model_states.pt... +0: [2023-03-15 22:58:43,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_13-model_00-model_states.pt. +0: [2023-03-15 22:58:43,489] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:58:43,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:58:43,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_15-model_00-model_states.pt... +0: [2023-03-15 22:58:43,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_15-model_00-model_states.pt. +0: [2023-03-15 22:58:43,608] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_16-model_00-model_states.pt... +0: [2023-03-15 22:58:43,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_16-model_00-model_states.pt. +0: [2023-03-15 22:58:43,664] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_17-model_00-model_states.pt... +0: [2023-03-15 22:58:43,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_17-model_00-model_states.pt. +0: [2023-03-15 22:58:43,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_18-model_00-model_states.pt... +0: [2023-03-15 22:58:43,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_18-model_00-model_states.pt. +0: [2023-03-15 22:58:43,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_19-model_00-model_states.pt... +0: [2023-03-15 22:58:43,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_19-model_00-model_states.pt. +0: [2023-03-15 22:58:43,834] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_20-model_00-model_states.pt... +0: [2023-03-15 22:58:43,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_20-model_00-model_states.pt. +0: [2023-03-15 22:58:43,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_21-model_00-model_states.pt... +0: [2023-03-15 22:58:43,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_21-model_00-model_states.pt. +0: [2023-03-15 22:58:43,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/layer_23-model_00-model_states.pt... +0: [2023-03-15 22:58:43,950] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/layer_23-model_00-model_states.pt. +0: [2023-03-15 22:58:43,951] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-15 22:58:43,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:58:43,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:58:43,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:58:44,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,095] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:58:44,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:58:44,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,166] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,166] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:58:44,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:58:44,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:58:44,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:58:44,241] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:58:44,241] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:58:44,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:58:44,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:58:44,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:58:44,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:58:44,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:58:44,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:58:44,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:58:44,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:58:44,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,256] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,256] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:58:44,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:58:44,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,266] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,266] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:58:44,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:58:44,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:58:44,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:58:44,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:58:44,292] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:58:44,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_619m2b7400m +7: time (ms) | save-checkpoint: 1591.64 +7: iteration 3010/ 5111 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.94 | learning rate: 8.634E-05 | global batch size: 256 | lm loss: 3.532901E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 273.714 | TFLOPs: 37.60 | +7: iteration 3020/ 5111 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.75 | learning rate: 8.580E-05 | global batch size: 256 | lm loss: 3.530751E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.551 | TFLOPs: 46.78 | +7: iteration 3030/ 5111 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.76 | learning rate: 8.526E-05 | global batch size: 256 | lm loss: 3.535332E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.018 | TFLOPs: 46.57 | +7: iteration 3040/ 5111 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.75 | learning rate: 8.472E-05 | global batch size: 256 | lm loss: 3.529633E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.464 | TFLOPs: 46.77 | +7: iteration 3050/ 5111 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.75 | learning rate: 8.419E-05 | global batch size: 256 | lm loss: 3.514489E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.576 | TFLOPs: 46.78 | +7: iteration 3060/ 5111 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.75 | learning rate: 8.365E-05 | global batch size: 256 | lm loss: 3.534478E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.606 | TFLOPs: 46.65 | +7: iteration 3070/ 5111 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.75 | learning rate: 8.312E-05 | global batch size: 256 | lm loss: 3.533691E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.201 | TFLOPs: 46.59 | +7: iteration 3080/ 5111 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.75 | learning rate: 8.259E-05 | global batch size: 256 | lm loss: 3.520982E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.268 | TFLOPs: 46.60 | +7: iteration 3090/ 5111 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.75 | learning rate: 8.205E-05 | global batch size: 256 | lm loss: 3.513338E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.459 | TFLOPs: 46.76 | +7: iteration 3100/ 5111 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.76 | learning rate: 8.152E-05 | global batch size: 256 | lm loss: 3.502686E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.914 | TFLOPs: 46.28 | +7: iteration 3110/ 5111 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.75 | learning rate: 8.099E-05 | global batch size: 256 | lm loss: 3.519572E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.805 | TFLOPs: 46.81 | +7: iteration 3120/ 5111 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.75 | learning rate: 8.047E-05 | global batch size: 256 | lm loss: 3.504793E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.749 | TFLOPs: 46.67 | +7: iteration 3130/ 5111 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.75 | learning rate: 7.994E-05 | global batch size: 256 | lm loss: 3.518889E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.400 | TFLOPs: 46.62 | +7: iteration 3140/ 5111 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.75 | learning rate: 7.941E-05 | global batch size: 256 | lm loss: 3.506567E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.710 | TFLOPs: 46.80 | +7: iteration 3150/ 5111 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.76 | learning rate: 7.889E-05 | global batch size: 256 | lm loss: 3.508437E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.858 | TFLOPs: 46.41 | +7: iteration 3160/ 5111 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.75 | learning rate: 7.836E-05 | global batch size: 256 | lm loss: 3.510517E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.529 | TFLOPs: 46.77 | +7: iteration 3170/ 5111 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.75 | learning rate: 7.784E-05 | global batch size: 256 | lm loss: 3.512896E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.500 | TFLOPs: 46.77 | +7: iteration 3180/ 5111 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.76 | learning rate: 7.732E-05 | global batch size: 256 | lm loss: 3.507780E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.112 | TFLOPs: 46.44 | +7: iteration 3190/ 5111 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.75 | learning rate: 7.680E-05 | global batch size: 256 | lm loss: 3.518101E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.547 | TFLOPs: 46.78 | +7: iteration 3200/ 5111 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.76 | learning rate: 7.628E-05 | global batch size: 256 | lm loss: 3.495649E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.666 | TFLOPs: 46.38 | +7: iteration 3210/ 5111 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.75 | learning rate: 7.576E-05 | global batch size: 256 | lm loss: 3.506812E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.449 | TFLOPs: 46.63 | +7: iteration 3220/ 5111 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.75 | learning rate: 7.525E-05 | global batch size: 256 | lm loss: 3.506293E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.539 | TFLOPs: 46.78 | +7: iteration 3230/ 5111 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.76 | learning rate: 7.473E-05 | global batch size: 256 | lm loss: 3.497913E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.778 | TFLOPs: 46.12 | +7: iteration 3240/ 5111 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.75 | learning rate: 7.422E-05 | global batch size: 256 | lm loss: 3.507108E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.976 | TFLOPs: 46.70 | +7: iteration 3250/ 5111 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.75 | learning rate: 7.371E-05 | global batch size: 256 | lm loss: 3.520344E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.699 | TFLOPs: 46.80 | +7: iteration 3260/ 5111 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.75 | learning rate: 7.320E-05 | global batch size: 256 | lm loss: 3.494258E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.743 | TFLOPs: 46.80 | +7: iteration 3270/ 5111 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.75 | learning rate: 7.269E-05 | global batch size: 256 | lm loss: 3.493983E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.734 | TFLOPs: 46.80 | +7: iteration 3280/ 5111 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.75 | learning rate: 7.218E-05 | global batch size: 256 | lm loss: 3.485897E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.803 | TFLOPs: 46.81 | +7: iteration 3290/ 5111 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.75 | learning rate: 7.167E-05 | global batch size: 256 | lm loss: 3.512032E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.906 | TFLOPs: 46.83 | +7: iteration 3300/ 5111 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.75 | learning rate: 7.117E-05 | global batch size: 256 | lm loss: 3.490211E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.658 | TFLOPs: 46.79 | +7: iteration 3310/ 5111 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.75 | learning rate: 7.067E-05 | global batch size: 256 | lm loss: 3.490128E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.806 | TFLOPs: 46.81 | +7: iteration 3320/ 5111 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.75 | learning rate: 7.016E-05 | global batch size: 256 | lm loss: 3.499766E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.454 | TFLOPs: 46.63 | +7: iteration 3330/ 5111 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.76 | learning rate: 6.966E-05 | global batch size: 256 | lm loss: 3.495934E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.779 | TFLOPs: 46.53 | +7: iteration 3340/ 5111 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.75 | learning rate: 6.917E-05 | global batch size: 256 | lm loss: 3.495854E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.265 | TFLOPs: 46.60 | +7: iteration 3350/ 5111 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.75 | learning rate: 6.867E-05 | global batch size: 256 | lm loss: 3.493907E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.549 | TFLOPs: 46.64 | +7: iteration 3360/ 5111 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.76 | learning rate: 6.817E-05 | global batch size: 256 | lm loss: 3.489870E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.934 | TFLOPs: 46.28 | +7: iteration 3370/ 5111 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.76 | learning rate: 6.768E-05 | global batch size: 256 | lm loss: 3.489214E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.472 | TFLOPs: 46.49 | +7: iteration 3380/ 5111 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.75 | learning rate: 6.719E-05 | global batch size: 256 | lm loss: 3.496498E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.365 | TFLOPs: 46.75 | +7: iteration 3390/ 5111 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.75 | learning rate: 6.670E-05 | global batch size: 256 | lm loss: 3.503966E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.552 | TFLOPs: 46.78 | +7: iteration 3400/ 5111 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.75 | learning rate: 6.621E-05 | global batch size: 256 | lm loss: 3.485176E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.375 | TFLOPs: 46.75 | +7: iteration 3410/ 5111 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.76 | learning rate: 6.572E-05 | global batch size: 256 | lm loss: 3.490519E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.885 | TFLOPs: 46.27 | +7: iteration 3420/ 5111 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.77 | learning rate: 6.523E-05 | global batch size: 256 | lm loss: 3.470393E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 334.415 | TFLOPs: 45.93 | +7: iteration 3430/ 5111 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.75 | learning rate: 6.475E-05 | global batch size: 256 | lm loss: 3.484240E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.418 | TFLOPs: 46.76 | +7: iteration 3440/ 5111 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.76 | learning rate: 6.427E-05 | global batch size: 256 | lm loss: 3.483091E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.014 | TFLOPs: 46.43 | +7: iteration 3450/ 5111 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.75 | learning rate: 6.379E-05 | global batch size: 256 | lm loss: 3.478574E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.748 | TFLOPs: 46.80 | +7: iteration 3460/ 5111 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.75 | learning rate: 6.331E-05 | global batch size: 256 | lm loss: 3.485632E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.723 | TFLOPs: 46.80 | +7: iteration 3470/ 5111 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.75 | learning rate: 6.283E-05 | global batch size: 256 | lm loss: 3.472285E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.724 | TFLOPs: 46.80 | +7: iteration 3480/ 5111 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.75 | learning rate: 6.236E-05 | global batch size: 256 | lm loss: 3.465345E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.678 | TFLOPs: 46.79 | +7: iteration 3490/ 5111 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.75 | learning rate: 6.188E-05 | global batch size: 256 | lm loss: 3.480159E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.549 | TFLOPs: 46.78 | +7: iteration 3500/ 5111 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.75 | learning rate: 6.141E-05 | global batch size: 256 | lm loss: 3.467778E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.800 | TFLOPs: 46.67 | +7: iteration 3510/ 5111 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.76 | learning rate: 6.094E-05 | global batch size: 256 | lm loss: 3.484643E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.183 | TFLOPs: 46.45 | +7: iteration 3520/ 5111 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.75 | learning rate: 6.048E-05 | global batch size: 256 | lm loss: 3.477713E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.801 | TFLOPs: 46.81 | +7: iteration 3530/ 5111 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.75 | learning rate: 6.001E-05 | global batch size: 256 | lm loss: 3.481686E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.677 | TFLOPs: 46.79 | +7: iteration 3540/ 5111 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.75 | learning rate: 5.955E-05 | global batch size: 256 | lm loss: 3.463044E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.776 | TFLOPs: 46.81 | +7: iteration 3550/ 5111 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.75 | learning rate: 5.909E-05 | global batch size: 256 | lm loss: 3.443800E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.742 | TFLOPs: 46.80 | +7: iteration 3560/ 5111 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.75 | learning rate: 5.863E-05 | global batch size: 256 | lm loss: 3.469484E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.805 | TFLOPs: 46.67 | +7: iteration 3570/ 5111 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.76 | learning rate: 5.817E-05 | global batch size: 256 | lm loss: 3.475903E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.379 | TFLOPs: 46.34 | +7: iteration 3580/ 5111 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.76 | learning rate: 5.771E-05 | global batch size: 256 | lm loss: 3.479758E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.088 | TFLOPs: 46.44 | +7: iteration 3590/ 5111 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.75 | learning rate: 5.726E-05 | global batch size: 256 | lm loss: 3.471310E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.391 | TFLOPs: 46.76 | +7: iteration 3600/ 5111 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.75 | learning rate: 5.681E-05 | global batch size: 256 | lm loss: 3.459449E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.388 | TFLOPs: 46.76 | +7: iteration 3610/ 5111 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.75 | learning rate: 5.636E-05 | global batch size: 256 | lm loss: 3.453968E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.378 | TFLOPs: 46.62 | +7: iteration 3620/ 5111 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.75 | learning rate: 5.591E-05 | global batch size: 256 | lm loss: 3.457276E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.647 | TFLOPs: 46.79 | +7: iteration 3630/ 5111 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.75 | learning rate: 5.546E-05 | global batch size: 256 | lm loss: 3.452727E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.324 | TFLOPs: 46.61 | +7: iteration 3640/ 5111 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.76 | learning rate: 5.502E-05 | global batch size: 256 | lm loss: 3.457585E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.078 | TFLOPs: 46.16 | +7: iteration 3650/ 5111 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.75 | learning rate: 5.458E-05 | global batch size: 256 | lm loss: 3.448724E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.691 | TFLOPs: 46.80 | +7: iteration 3660/ 5111 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.76 | learning rate: 5.414E-05 | global batch size: 256 | lm loss: 3.452320E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.192 | TFLOPs: 46.18 | +7: iteration 3670/ 5111 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.76 | learning rate: 5.370E-05 | global batch size: 256 | lm loss: 3.448679E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.237 | TFLOPs: 46.32 | +7: iteration 3680/ 5111 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.76 | learning rate: 5.327E-05 | global batch size: 256 | lm loss: 3.463524E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.504 | TFLOPs: 46.50 | +7: iteration 3690/ 5111 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.75 | learning rate: 5.284E-05 | global batch size: 256 | lm loss: 3.456110E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.566 | TFLOPs: 46.78 | +7: iteration 3700/ 5111 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.76 | learning rate: 5.241E-05 | global batch size: 256 | lm loss: 3.465012E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.962 | TFLOPs: 46.56 | +7: iteration 3710/ 5111 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.75 | learning rate: 5.198E-05 | global batch size: 256 | lm loss: 3.456348E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.612 | TFLOPs: 46.79 | +7: iteration 3720/ 5111 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.75 | learning rate: 5.155E-05 | global batch size: 256 | lm loss: 3.439854E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.176 | TFLOPs: 46.59 | +7: iteration 3730/ 5111 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.75 | learning rate: 5.113E-05 | global batch size: 256 | lm loss: 3.440852E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.736 | TFLOPs: 46.80 | +7: iteration 3740/ 5111 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.75 | learning rate: 5.071E-05 | global batch size: 256 | lm loss: 3.454353E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.688 | TFLOPs: 46.80 | +7: iteration 3750/ 5111 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.75 | learning rate: 5.029E-05 | global batch size: 256 | lm loss: 3.447857E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.818 | TFLOPs: 46.81 | +7: iteration 3760/ 5111 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.76 | learning rate: 4.987E-05 | global batch size: 256 | lm loss: 3.445887E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.554 | TFLOPs: 46.23 | +7: iteration 3770/ 5111 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.75 | learning rate: 4.946E-05 | global batch size: 256 | lm loss: 3.452026E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.777 | TFLOPs: 46.67 | +7: iteration 3780/ 5111 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.75 | learning rate: 4.904E-05 | global batch size: 256 | lm loss: 3.449682E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.746 | TFLOPs: 46.80 | +7: iteration 3790/ 5111 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.76 | learning rate: 4.863E-05 | global batch size: 256 | lm loss: 3.449499E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.767 | TFLOPs: 46.40 | +7: iteration 3800/ 5111 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.76 | learning rate: 4.823E-05 | global batch size: 256 | lm loss: 3.437001E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.630 | TFLOPs: 46.51 | +7: iteration 3810/ 5111 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.75 | learning rate: 4.782E-05 | global batch size: 256 | lm loss: 3.440584E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.308 | TFLOPs: 46.74 | +7: iteration 3820/ 5111 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.75 | learning rate: 4.742E-05 | global batch size: 256 | lm loss: 3.451787E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.605 | TFLOPs: 46.78 | +7: iteration 3830/ 5111 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.75 | learning rate: 4.702E-05 | global batch size: 256 | lm loss: 3.466103E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.542 | TFLOPs: 46.78 | +7: iteration 3840/ 5111 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.75 | learning rate: 4.662E-05 | global batch size: 256 | lm loss: 3.455488E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.370 | TFLOPs: 46.75 | +7: iteration 3850/ 5111 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.75 | learning rate: 4.622E-05 | global batch size: 256 | lm loss: 3.441095E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.627 | TFLOPs: 46.79 | +7: iteration 3860/ 5111 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.75 | learning rate: 4.583E-05 | global batch size: 256 | lm loss: 3.441809E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.614 | TFLOPs: 46.79 | +7: iteration 3870/ 5111 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.75 | learning rate: 4.544E-05 | global batch size: 256 | lm loss: 3.437350E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.606 | TFLOPs: 46.78 | +7: iteration 3880/ 5111 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.75 | learning rate: 4.505E-05 | global batch size: 256 | lm loss: 3.449867E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.635 | TFLOPs: 46.79 | +7: iteration 3890/ 5111 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.75 | learning rate: 4.467E-05 | global batch size: 256 | lm loss: 3.430088E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.563 | TFLOPs: 46.78 | +7: iteration 3900/ 5111 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.75 | learning rate: 4.428E-05 | global batch size: 256 | lm loss: 3.443103E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.523 | TFLOPs: 46.77 | +7: iteration 3910/ 5111 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.75 | learning rate: 4.390E-05 | global batch size: 256 | lm loss: 3.433995E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.705 | TFLOPs: 46.80 | +7: iteration 3920/ 5111 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.75 | learning rate: 4.353E-05 | global batch size: 256 | lm loss: 3.434226E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.561 | TFLOPs: 46.78 | +7: iteration 3930/ 5111 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.75 | learning rate: 4.315E-05 | global batch size: 256 | lm loss: 3.436399E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.634 | TFLOPs: 46.79 | +7: iteration 3940/ 5111 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.75 | learning rate: 4.278E-05 | global batch size: 256 | lm loss: 3.420828E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.934 | TFLOPs: 46.83 | +7: iteration 3950/ 5111 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.75 | learning rate: 4.241E-05 | global batch size: 256 | lm loss: 3.446030E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.371 | TFLOPs: 46.75 | +7: iteration 3960/ 5111 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.75 | learning rate: 4.204E-05 | global batch size: 256 | lm loss: 3.439821E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.558 | TFLOPs: 46.78 | +7: iteration 3970/ 5111 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.75 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 3.437910E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.575 | TFLOPs: 46.78 | +7: iteration 3980/ 5111 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.75 | learning rate: 4.131E-05 | global batch size: 256 | lm loss: 3.441649E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.468 | TFLOPs: 46.77 | +7: iteration 3990/ 5111 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.75 | learning rate: 4.095E-05 | global batch size: 256 | lm loss: 3.434761E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.593 | TFLOPs: 46.78 | +0: [2023-03-15 23:11:17,871] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[4.0595726162397195e-05, 4.0595726162397195e-05, 4.0595726162397195e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 5111 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.75 | learning rate: 4.060E-05 | global batch size: 256 | lm loss: 3.432732E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.535 | TFLOPs: 46.78 | +0: steps: 4000 loss: 3.4525 iter time (s): 0.754 samples/sec: 339.537 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 3.492442E+00 | lm loss PPL: 3.286610E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_619m2b7400m +0: [2023-03-15 23:11:18,136] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-15 23:11:18,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:11:18,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:11:18,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:11:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:11:18,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:11:18,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:11:18,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:11:18,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:11:18,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:11:18,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:11:18,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:11:18,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:11:18,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:11:18,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:11:18,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:11:18,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:11:18,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:11:18,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:11:18,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:11:18,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:11:18,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:11:18,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:11:18,857] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:11:18,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:11:18,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:11:18,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:11:18,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:11:19,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:11:19,025] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:11:19,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:11:19,080] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:11:19,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:11:19,135] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:11:19,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:11:19,190] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:11:19,246] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:11:19,246] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:11:19,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:11:19,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:11:19,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:11:19,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:11:19,361] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:11:19,362] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-15 23:11:19,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:11:19,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:11:19,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:11:19,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,515] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,515] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,518] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,518] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:11:19,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 23:11:19,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:11:19,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:11:19,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 23:11:19,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:11:19,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:11:19,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 23:11:19,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:11:19,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:11:19,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 23:11:19,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:11:19,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 23:11:19,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:11:19,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:11:19,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 23:11:19,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:11:19,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:11:19,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 23:11:19,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:11:19,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:11:19,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 23:11:19,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:11:19,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:11:19,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_619m2b7400m +7: time (ms) | save-checkpoint: 1612.59 +7: iteration 4010/ 5111 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.96 | learning rate: 4.024E-05 | global batch size: 256 | lm loss: 3.416137E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 266.585 | TFLOPs: 36.62 | +7: iteration 4020/ 5111 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.75 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 3.413684E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.851 | TFLOPs: 46.82 | +7: iteration 4030/ 5111 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.75 | learning rate: 3.954E-05 | global batch size: 256 | lm loss: 3.424090E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.423 | TFLOPs: 46.62 | +7: iteration 4040/ 5111 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.75 | learning rate: 3.919E-05 | global batch size: 256 | lm loss: 3.434966E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.657 | TFLOPs: 46.79 | +7: iteration 4050/ 5111 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.75 | learning rate: 3.885E-05 | global batch size: 256 | lm loss: 3.439545E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.697 | TFLOPs: 46.66 | +7: iteration 4060/ 5111 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.75 | learning rate: 3.851E-05 | global batch size: 256 | lm loss: 3.434303E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.310 | TFLOPs: 46.61 | +7: iteration 4070/ 5111 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.75 | learning rate: 3.817E-05 | global batch size: 256 | lm loss: 3.418552E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.811 | TFLOPs: 46.81 | +7: iteration 4080/ 5111 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.75 | learning rate: 3.784E-05 | global batch size: 256 | lm loss: 3.415513E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.736 | TFLOPs: 46.80 | +7: iteration 4090/ 5111 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.75 | learning rate: 3.750E-05 | global batch size: 256 | lm loss: 3.435669E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.718 | TFLOPs: 46.80 | +7: iteration 4100/ 5111 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.75 | learning rate: 3.717E-05 | global batch size: 256 | lm loss: 3.421369E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.512 | TFLOPs: 46.77 | +7: iteration 4110/ 5111 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.75 | learning rate: 3.685E-05 | global batch size: 256 | lm loss: 3.408772E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.631 | TFLOPs: 46.79 | +7: iteration 4120/ 5111 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.75 | learning rate: 3.652E-05 | global batch size: 256 | lm loss: 3.417802E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.720 | TFLOPs: 46.80 | +7: iteration 4130/ 5111 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.75 | learning rate: 3.620E-05 | global batch size: 256 | lm loss: 3.425528E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.570 | TFLOPs: 46.78 | +7: iteration 4140/ 5111 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.75 | learning rate: 3.588E-05 | global batch size: 256 | lm loss: 3.419074E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.553 | TFLOPs: 46.78 | +7: iteration 4150/ 5111 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.75 | learning rate: 3.557E-05 | global batch size: 256 | lm loss: 3.425745E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.519 | TFLOPs: 46.64 | +7: iteration 4160/ 5111 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.75 | learning rate: 3.526E-05 | global batch size: 256 | lm loss: 3.413997E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.277 | TFLOPs: 46.60 | +7: iteration 4170/ 5111 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.75 | learning rate: 3.495E-05 | global batch size: 256 | lm loss: 3.428862E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.867 | TFLOPs: 46.82 | +7: iteration 4180/ 5111 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.75 | learning rate: 3.464E-05 | global batch size: 256 | lm loss: 3.422573E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.887 | TFLOPs: 46.82 | +7: iteration 4190/ 5111 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.75 | learning rate: 3.434E-05 | global batch size: 256 | lm loss: 3.413564E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.735 | TFLOPs: 46.80 | +7: iteration 4200/ 5111 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.75 | learning rate: 3.403E-05 | global batch size: 256 | lm loss: 3.431851E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.873 | TFLOPs: 46.82 | +7: iteration 4210/ 5111 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.75 | learning rate: 3.374E-05 | global batch size: 256 | lm loss: 3.403998E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.535 | TFLOPs: 46.78 | +7: iteration 4220/ 5111 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.75 | learning rate: 3.344E-05 | global batch size: 256 | lm loss: 3.416856E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.723 | TFLOPs: 46.80 | +7: iteration 4230/ 5111 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.75 | learning rate: 3.315E-05 | global batch size: 256 | lm loss: 3.414219E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.609 | TFLOPs: 46.79 | +7: iteration 4240/ 5111 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.75 | learning rate: 3.286E-05 | global batch size: 256 | lm loss: 3.412880E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.764 | TFLOPs: 46.81 | +7: iteration 4250/ 5111 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.75 | learning rate: 3.257E-05 | global batch size: 256 | lm loss: 3.428153E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.856 | TFLOPs: 46.82 | +7: iteration 4260/ 5111 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.76 | learning rate: 3.229E-05 | global batch size: 256 | lm loss: 3.422430E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.468 | TFLOPs: 46.35 | +7: iteration 4270/ 5111 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.75 | learning rate: 3.201E-05 | global batch size: 256 | lm loss: 3.407513E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.715 | TFLOPs: 46.80 | +7: iteration 4280/ 5111 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.75 | learning rate: 3.173E-05 | global batch size: 256 | lm loss: 3.426492E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.485 | TFLOPs: 46.77 | +7: iteration 4290/ 5111 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.75 | learning rate: 3.146E-05 | global batch size: 256 | lm loss: 3.419142E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.342 | TFLOPs: 46.75 | +7: iteration 4300/ 5111 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.75 | learning rate: 3.119E-05 | global batch size: 256 | lm loss: 3.419181E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.447 | TFLOPs: 46.76 | +7: iteration 4310/ 5111 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.75 | learning rate: 3.092E-05 | global batch size: 256 | lm loss: 3.412307E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.497 | TFLOPs: 46.63 | +7: iteration 4320/ 5111 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.75 | learning rate: 3.065E-05 | global batch size: 256 | lm loss: 3.404604E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.739 | TFLOPs: 46.80 | +7: iteration 4330/ 5111 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.75 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 3.413062E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.669 | TFLOPs: 46.79 | +7: iteration 4340/ 5111 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.75 | learning rate: 3.013E-05 | global batch size: 256 | lm loss: 3.396986E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.882 | TFLOPs: 46.82 | +7: iteration 4350/ 5111 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.76 | learning rate: 2.988E-05 | global batch size: 256 | lm loss: 3.422100E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.423 | TFLOPs: 46.49 | +7: iteration 4360/ 5111 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.77 | learning rate: 2.962E-05 | global batch size: 256 | lm loss: 3.396862E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.143 | TFLOPs: 45.76 | +7: iteration 4370/ 5111 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.75 | learning rate: 2.937E-05 | global batch size: 256 | lm loss: 3.405564E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.026 | TFLOPs: 46.71 | +7: iteration 4380/ 5111 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.75 | learning rate: 2.913E-05 | global batch size: 256 | lm loss: 3.398115E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.947 | TFLOPs: 46.83 | +7: iteration 4390/ 5111 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.75 | learning rate: 2.888E-05 | global batch size: 256 | lm loss: 3.420832E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.946 | TFLOPs: 46.83 | +7: iteration 4400/ 5111 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.75 | learning rate: 2.864E-05 | global batch size: 256 | lm loss: 3.408413E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 341.099 | TFLOPs: 46.85 | +7: iteration 4410/ 5111 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.75 | learning rate: 2.840E-05 | global batch size: 256 | lm loss: 3.402903E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 341.040 | TFLOPs: 46.84 | +7: iteration 4420/ 5111 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.75 | learning rate: 2.817E-05 | global batch size: 256 | lm loss: 3.404995E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.822 | TFLOPs: 46.81 | +7: iteration 4430/ 5111 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.75 | learning rate: 2.794E-05 | global batch size: 256 | lm loss: 3.411222E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.113 | TFLOPs: 46.58 | +7: iteration 4440/ 5111 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.75 | learning rate: 2.771E-05 | global batch size: 256 | lm loss: 3.405320E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.862 | TFLOPs: 46.82 | +7: iteration 4450/ 5111 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.75 | learning rate: 2.749E-05 | global batch size: 256 | lm loss: 3.416983E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.795 | TFLOPs: 46.81 | +7: iteration 4460/ 5111 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.75 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 3.416997E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.705 | TFLOPs: 46.80 | +7: iteration 4470/ 5111 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.75 | learning rate: 2.705E-05 | global batch size: 256 | lm loss: 3.420380E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.926 | TFLOPs: 46.83 | +7: iteration 4480/ 5111 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.75 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.399457E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.916 | TFLOPs: 46.83 | +7: iteration 4490/ 5111 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.75 | learning rate: 2.662E-05 | global batch size: 256 | lm loss: 3.414359E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.829 | TFLOPs: 46.82 | +7: iteration 4500/ 5111 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.75 | learning rate: 2.641E-05 | global batch size: 256 | lm loss: 3.386167E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.410 | TFLOPs: 46.62 | +7: iteration 4510/ 5111 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.75 | learning rate: 2.621E-05 | global batch size: 256 | lm loss: 3.404083E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 341.007 | TFLOPs: 46.84 | +7: iteration 4520/ 5111 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.75 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 3.404515E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.922 | TFLOPs: 46.83 | +7: iteration 4530/ 5111 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.75 | learning rate: 2.580E-05 | global batch size: 256 | lm loss: 3.406207E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.012 | TFLOPs: 46.70 | +7: iteration 4540/ 5111 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.75 | learning rate: 2.561E-05 | global batch size: 256 | lm loss: 3.410766E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.559 | TFLOPs: 46.64 | +7: iteration 4550/ 5111 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.75 | learning rate: 2.542E-05 | global batch size: 256 | lm loss: 3.397030E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.799 | TFLOPs: 46.81 | +7: iteration 4560/ 5111 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.75 | learning rate: 2.523E-05 | global batch size: 256 | lm loss: 3.389738E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.670 | TFLOPs: 46.66 | +7: iteration 4570/ 5111 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.75 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.406122E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.707 | TFLOPs: 46.80 | +7: iteration 4580/ 5111 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.75 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.388462E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.719 | TFLOPs: 46.80 | +7: iteration 4590/ 5111 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.75 | learning rate: 2.468E-05 | global batch size: 256 | lm loss: 3.386525E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.884 | TFLOPs: 46.82 | +7: iteration 4600/ 5111 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.75 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 3.378867E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.925 | TFLOPs: 46.83 | +7: iteration 4610/ 5111 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.75 | learning rate: 2.433E-05 | global batch size: 256 | lm loss: 3.369725E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.641 | TFLOPs: 46.79 | +7: iteration 4620/ 5111 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.75 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.379598E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.618 | TFLOPs: 46.79 | +7: iteration 4630/ 5111 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.75 | learning rate: 2.399E-05 | global batch size: 256 | lm loss: 3.356216E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.673 | TFLOPs: 46.79 | +7: iteration 4640/ 5111 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.75 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 3.387407E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.919 | TFLOPs: 46.83 | +7: iteration 4650/ 5111 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.75 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.388644E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.899 | TFLOPs: 46.83 | +7: iteration 4660/ 5111 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.75 | learning rate: 2.352E-05 | global batch size: 256 | lm loss: 3.386916E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.612 | TFLOPs: 46.79 | +7: iteration 4670/ 5111 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.75 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.369680E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.662 | TFLOPs: 46.79 | +7: iteration 4680/ 5111 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.75 | learning rate: 2.321E-05 | global batch size: 256 | lm loss: 3.363216E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.752 | TFLOPs: 46.81 | +7: iteration 4690/ 5111 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.75 | learning rate: 2.307E-05 | global batch size: 256 | lm loss: 3.375264E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 339.500 | TFLOPs: 46.63 | +7: iteration 4700/ 5111 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.75 | learning rate: 2.292E-05 | global batch size: 256 | lm loss: 3.368787E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.722 | TFLOPs: 46.80 | +7: iteration 4710/ 5111 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.75 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.386152E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.525 | TFLOPs: 46.77 | +7: iteration 4720/ 5111 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.75 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.368199E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.614 | TFLOPs: 46.79 | +7: iteration 4730/ 5111 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.75 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 3.376327E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.430 | TFLOPs: 46.76 | +7: iteration 4740/ 5111 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.75 | learning rate: 2.239E-05 | global batch size: 256 | lm loss: 3.378979E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.376 | TFLOPs: 46.75 | +7: iteration 4750/ 5111 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.75 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.376451E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.711 | TFLOPs: 46.80 | +7: iteration 4760/ 5111 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.75 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.385368E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.704 | TFLOPs: 46.80 | +7: iteration 4770/ 5111 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.75 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.371242E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.605 | TFLOPs: 46.78 | +7: iteration 4780/ 5111 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.75 | learning rate: 2.190E-05 | global batch size: 256 | lm loss: 3.375348E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.748 | TFLOPs: 46.80 | +7: iteration 4790/ 5111 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.75 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.381540E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.757 | TFLOPs: 46.81 | +7: iteration 4800/ 5111 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.75 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.383057E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.606 | TFLOPs: 46.78 | +7: iteration 4810/ 5111 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.75 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.370196E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.343 | TFLOPs: 46.75 | +7: iteration 4820/ 5111 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.78 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.390055E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 327.096 | TFLOPs: 44.93 | +7: iteration 4830/ 5111 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.76 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.371221E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.223 | TFLOPs: 46.46 | +7: iteration 4840/ 5111 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.77 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.375312E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 331.752 | TFLOPs: 45.57 | +7: iteration 4850/ 5111 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.75 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.371566E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.727 | TFLOPs: 46.80 | +7: iteration 4860/ 5111 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.75 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.374778E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.690 | TFLOPs: 46.80 | +7: iteration 4870/ 5111 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.75 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.378878E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.627 | TFLOPs: 46.79 | +7: iteration 4880/ 5111 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.75 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.374505E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 341.006 | TFLOPs: 46.84 | +7: iteration 4890/ 5111 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.75 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.380755E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.256 | TFLOPs: 46.74 | +7: iteration 4900/ 5111 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.76 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.381356E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 337.748 | TFLOPs: 46.39 | +7: iteration 4910/ 5111 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.76 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.354634E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 335.254 | TFLOPs: 46.05 | +7: iteration 4920/ 5111 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.77 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.366709E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.567 | TFLOPs: 45.82 | +7: iteration 4930/ 5111 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.75 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.366899E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.650 | TFLOPs: 46.79 | +7: iteration 4940/ 5111 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.75 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.372399E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.372 | TFLOPs: 46.75 | +7: iteration 4950/ 5111 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.75 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.371947E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.441 | TFLOPs: 46.76 | +7: iteration 4960/ 5111 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.75 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.389143E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.485 | TFLOPs: 46.77 | +7: iteration 4970/ 5111 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.75 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.384359E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.747 | TFLOPs: 46.80 | +7: iteration 4980/ 5111 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.75 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.369466E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.316 | TFLOPs: 46.75 | +7: iteration 4990/ 5111 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.75 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.371009E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.092 | TFLOPs: 46.71 | +7: iteration 5000/ 5111 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.75 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.376430E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.496 | TFLOPs: 46.77 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 3.458002E+00 | lm loss PPL: 3.175347E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_619m2b7400m +0: [2023-03-15 23:23:53,157] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-15 23:23:53,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:23:53,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:23:53,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:23:53,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:23:53,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:23:53,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:23:53,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:23:53,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:23:53,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:23:53,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:23:53,686] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:23:53,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:23:53,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:23:53,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:23:53,804] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:23:53,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:23:53,863] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:23:53,920] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:23:53,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:23:53,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:23:53,978] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:23:54,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:23:54,038] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:23:54,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:23:54,095] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:23:54,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:23:54,155] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:23:54,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:23:54,212] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:23:54,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:23:54,269] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:23:54,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:23:54,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:23:54,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:23:54,385] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:23:54,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:23:54,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:23:54,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:23:54,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:23:54,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:23:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:23:54,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:23:54,562] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-15 23:23:54,562] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:23:54,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:54,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:54,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 23:23:54,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:54,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:54,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,817] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,814] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,814] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,814] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 23:23:54,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:54,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:54,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 23:23:54,883] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:54,883] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 23:23:54,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:54,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:54,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 23:23:54,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:54,892] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:54,892] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,895] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,895] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 23:23:54,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:54,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:54,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 23:23:54,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:54,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:54,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 23:23:54,910] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:54,910] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:54,910] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_619m2b7400m +7: time (ms) | save-checkpoint: 1790.74 +7: iteration 5010/ 5111 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.95 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.379623E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 268.900 | TFLOPs: 36.94 | +7: iteration 5020/ 5111 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.76 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.371708E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.652 | TFLOPs: 46.52 | +7: iteration 5030/ 5111 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.76 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.375631E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 336.565 | TFLOPs: 46.23 | +7: iteration 5040/ 5111 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.93 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.379527E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 276.602 | TFLOPs: 37.99 | +7: iteration 5050/ 5111 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.76 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.356009E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.020 | TFLOPs: 46.43 | +7: iteration 5060/ 5111 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.76 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.362998E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.571 | TFLOPs: 46.51 | +7: iteration 5070/ 5111 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.93 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.385173E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 275.076 | TFLOPs: 37.78 | +7: iteration 5080/ 5111 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.78 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.370117E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 326.293 | TFLOPs: 44.82 | +7: iteration 5090/ 5111 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.75 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.379132E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 340.152 | TFLOPs: 46.72 | +7: iteration 5100/ 5111 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.76 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.364680E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 338.356 | TFLOPs: 46.48 | +7: iteration 5110/ 5111 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.77 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.378394E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 333.156 | TFLOPs: 45.76 | +0: [after training is done] datetime: 2023-03-15 23:25:22 +0: saving checkpoint at iteration 5111 to checkpoints_619m2b7400m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.467663E+00 | lm loss PPL: 3.206171E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-15 23:25:22,987] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5111 is begin to save! +0: [2023-03-15 23:25:23,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:25:23,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:25:23,317] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:25:23,373] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:25:23,373] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:25:23,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:25:23,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:25:23,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:25:23,489] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:25:23,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:25:23,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:25:23,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:25:23,626] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:25:23,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:25:23,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:25:23,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:25:23,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:25:23,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:25:23,795] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:25:23,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:25:23,854] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:25:23,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:25:23,911] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:25:23,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:25:23,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:25:24,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:25:24,026] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:25:24,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:25:24,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:25:24,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:25:24,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:25:24,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:25:24,199] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:25:24,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:25:24,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:25:24,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:25:24,313] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:25:24,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:25:24,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:25:24,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:25:24,426] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:25:24,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:25:24,431] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt +0: [2023-03-15 23:25:24,431] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:25:24,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:25:24,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:25:24,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:25:24,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +6: [2023-03-15 23:25:24,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:25:24,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:25:24,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: [2023-03-15 23:25:24,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:25:24,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,768] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +5: [2023-03-15 23:25:24,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:25:24,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:25:24,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +1: [2023-03-15 23:25:24,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:25:24,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:25:24,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +4: [2023-03-15 23:25:24,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:25:24,821] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:25:24,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +7: [2023-03-15 23:25:24,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:25:24,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:25:24,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +2: [2023-03-15 23:25:24,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:25:24,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:25:24,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +3: [2023-03-15 23:25:24,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:25:24,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:25:24,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5111 is ready now! +0: successfully saved checkpoint at iteration 5111 to checkpoints_619m2b7400m +END 3318428: Wed 15 Mar 2023 11:25:43 PM EET diff --git a/619m2b7400m/3318673.err b/619m2b7400m/3318673.err new file mode 100644 index 0000000000000000000000000000000000000000..d1d554bb6d8b6b26ea9977deb4ea4444694b5478 --- /dev/null +++ b/619m2b7400m/3318673.err @@ -0,0 +1,1124 @@ +7: 2023-03-15 23:28:18.875245: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875269: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875279: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875408: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875424: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875413: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: 2023-03-15 23:28:18.875283: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875264: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875427: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875437: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 23:28:18.875295: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875455: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875463: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 23:28:18.875457: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875825: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875837: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875843: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875924: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875928: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875943: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-15 23:28:18.875857: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875875: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875950: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875929: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875879: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875961: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 23:28:18.875869: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875967: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 23:28:18.875988: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876498: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876498: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876518: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876532: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876539: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876548: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 23:28:18.876554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876661: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876651: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876689: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876715: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:18.876726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877097: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877108: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877110: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877128: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877135: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 23:28:18.877131: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877275: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877303: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877316: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877307: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 23:28:18.877329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 23:28:32.707609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707683: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.707714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 23:28:32.708627: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708568: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708169: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 23:28:32.708649: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:32.708661: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:32.708679: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708580: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:32.708688: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 23:28:32.708701: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 23:28:32.708709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:28:32.708738: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708193: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708600: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708197: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:28:32.708616: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708630: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708635: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708639: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 23:28:32.708645: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:32.708609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708564: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.709183: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.709198: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.709214: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708731: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708675: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.708758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-15 23:28:32.708663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.709233: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:32.709236: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:32.709247: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 23:28:32.709257: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:28:32.709273: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709420: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709437: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709447: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709468: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709473: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709481: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709494: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 23:28:32.709503: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.712612: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.712982: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.712639: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.712667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.712660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.712671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.713014: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.713017: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.713022: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.712700: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.712835: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-15 23:28:32.712664: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.712679: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:28:32.713033: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.713052: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.713060: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 23:28:32.713081: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.712870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.712890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.713277: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.712920: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.713303: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.713301: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.712972: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.712971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.712948: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.712982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:28:32.713329: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.713342: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.713354: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.713358: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 23:28:32.713362: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:32.715395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:32.715430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 2023-03-15 23:28:32.715477: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:32.715452: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:32.715465: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.715507: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.715495: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.715534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.715480: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.715547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.715515: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.715577: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.715501: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.715599: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.716127: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:32.716143: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.715564: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.716155: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:32.716163: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:28:32.716170: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:32.716171: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:28:32.716180: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.715595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-15 23:28:32.716181: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:28:32.716074: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716093: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716106: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716113: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716125: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716136: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716136: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 23:28:32.716154: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 23:29:00.319298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319341: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319359: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319381: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319385: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319395: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.319588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320002: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320057: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320070: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320106: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320114: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.320132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320600: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320640: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.320723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321813: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321843: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321859: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321882: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321900: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.321908: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.322021: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322865: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322871: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322872: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322875: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322881: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322873: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322873: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 23:29:00.322889: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322890: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322892: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322894: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 23:29:00.322898: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.323538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323579: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323626: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323644: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323648: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.323669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324054: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324121: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324138: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324161: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.324166: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324689: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-15 23:29:00.324203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.324708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324833: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324853: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324861: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-15 23:29:00.324769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324902: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.324921: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330894: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330900: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330900: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330904: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330910: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330921: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330921: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330922: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330924: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 23:29:00.330979: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 23:29:00.330992: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331561: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331567: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331567: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331590: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331576: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331579: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 23:29:00.331594: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331594: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331809: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-15 23:29:00.331596: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331598: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331600: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331601: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 23:29:00.331602: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331825: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.332164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331829: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331831: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.332066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331848: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331850: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.332005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331851: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331852: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 23:29:00.331853: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-15 23:29:00.331854: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331903: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 23:29:00.331921: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332066: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332171: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-15 23:29:00.332069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332173: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-15 23:29:00.332072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332180: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332182: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332068: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.332178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332192: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-15 23:29:00.332181: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 23:29:00.332023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332194: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.332195: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332078: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-15 23:29:00.332027: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.332204: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 23:29:00.332083: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332083: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332031: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 23:29:00.332035: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.332206: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332088: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 23:29:00.332241: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-15 23:29:00.332091: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332094: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 23:29:00.332095: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 23:29:00.332257: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331439: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331448: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331454: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331461: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331450: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331456: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 23:29:00.331483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331484: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 23:29:00.331488: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +1: Building extension module utils... +1: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +1: Loading extension module utils... +3: Loading extension module utils... +0: Loading extension module utils... +7: Loading extension module utils... +5: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils...Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: +1: Loading extension module utils... +1: Loading extension module utils...Loading extension module utils... +1: +1: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +1: +1: +1: +3: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +3: +3: +3: +3: +3: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +7: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +7: +7: Loading extension module utils... +7: Loading extension module utils... +7: +7: +7: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +4: +4: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/619m2b7400m/3318673.out b/619m2b7400m/3318673.out new file mode 100644 index 0000000000000000000000000000000000000000..84e42aaef58d1c077f94082a691f45e957b012bc --- /dev/null +++ b/619m2b7400m/3318673.out @@ -0,0 +1,6704 @@ +Model parameters: d_model 1536 ffw_size 6144 kv_size 128 n_heads 12 n_layers 19 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 19 --hidden-size 1536 --num-attention-heads 12 --kv-channels 128 --ffn-hidden-size 6144 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-619m2b7400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --no-load-optim --reset-progress --override-lr-scheduler --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --eval-only true --tensorboard-dir tensorboard_619m2b7400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_619m2b7400m --load checkpoints_619m2b7400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3318673.json --zero-stage 0 +START 3318673: Wed 15 Mar 2023 11:27:47 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 49.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 55.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 47.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 46.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 46.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 39.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 45.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 37.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 44.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 37.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 51.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 49.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 53.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 52.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 50.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 42.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 45.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +2: Launching on nid007233 (2/8), master nid007231 port 9999, GPUs 8, CUDA: True +0: Launching on nid007231 (0/8), master nid007231 port 9999, GPUs 8, CUDA: True +6: Launching on nid007237 (6/8), master nid007231 port 9999, GPUs 8, CUDA: True +1: Launching on nid007232 (1/8), master nid007231 port 9999, GPUs 8, CUDA: True +4: Launching on nid007235 (4/8), master nid007231 port 9999, GPUs 8, CUDA: True +5: Launching on nid007236 (5/8), master nid007231 port 9999, GPUs 8, CUDA: True +7: Launching on nid007238 (7/8), master nid007231 port 9999, GPUs 8, CUDA: True +3: Launching on nid007234 (3/8), master nid007231 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3318673.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... True +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 6144 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 1536 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-619m2b7400mval +0: kv_channels ..................................... 128 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_619m2b7400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 12 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 19 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_619m2b7400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_619m2b7400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +7: > setting tensorboard ... +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-15 23:30:09,191] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.092 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 26.364 seconds +0: time to initialize megatron (seconds): -1.683 +0: [after megatron is initialized] datetime: 2023-03-15 23:30:38 +0: building GPT model ... +0: [2023-03-15 23:30:38,450] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-15 23:30:38,450] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-15 23:30:38,450] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.63 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-15 23:30:40,456] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=26 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: ParallelTransformerLayerPipe +0: 14: ParallelTransformerLayerPipe +0: 15: ParallelTransformerLayerPipe +0: 16: ParallelTransformerLayerPipe +0: 17: ParallelTransformerLayerPipe +0: 18: ParallelTransformerLayerPipe +0: 19: ParallelTransformerLayerPipe +0: 20: ParallelTransformerLayerPipe +0: 21: ParallelTransformerLayerPipe +0: 22: undo +0: 23: MixedFusedLayerNorm +0: 24: EmbeddingPipe +0: 25: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-15 23:30:40,709] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-15 23:30:40,710] [INFO] [utils.py:828:see_memory_usage] MA 1.16 GB Max_MA 1.16 GB CA 1.2 GB Max_CA 1 GB +0: [2023-03-15 23:30:40,710] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.67 GB, percent = 6.1% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-15 23:30:40,712] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-15 23:30:54,176] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-15 23:30:54,177] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-15 23:30:54,177] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-15 23:30:54,184] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-15 23:30:54,184] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-15 23:30:54,302] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-15 23:30:54,303] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.17 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-15 23:30:54,303] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +1: ninja: no work to do. +1: Time to load utils op: 0.18811631202697754 seconds +0: Time to load utils op: 0.10921311378479004 seconds +7: Time to load utils op: 0.20951247215270996 seconds +3: Time to load utils op: 0.21080851554870605 seconds +5: Time to load utils op: 0.20934796333312988 seconds +0: [2023-03-15 23:30:54,521] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-15 23:30:54,521] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.15 GB CA 1.22 GB Max_CA 1 GB +0: [2023-03-15 23:30:54,522] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +0: ninja: no work to do. +0: Time to load utils op: 0.1326920986175537 seconds +7: Time to load utils op: 0.0005359649658203125 seconds +1: Time to load utils op: 0.0005338191986083984 seconds +3: Time to load utils op: 0.0004482269287109375 seconds +5: Time to load utils op: 0.0004875659942626953 seconds +0: Time to load utils op: 0.0005524158477783203 seconds +0: Time to load utils op: 0.20317840576171875 seconds +0: Time to load utils op: 0.20326757431030273 seconds +0: Time to load utils op: 0.2037975788116455 seconds +0: Time to load utils op: 0.2034618854522705 secondsTime to load utils op: 0.20369672775268555 seconds +0: +0: Time to load utils op: 0.2027454376220703 seconds +1: Time to load utils op: 0.20322823524475098 seconds +1: Time to load utils op: 0.20230913162231445 seconds +1: Time to load utils op: 0.2022390365600586 seconds +1: Time to load utils op: 0.20264792442321777 secondsTime to load utils op: 0.20334792137145996 seconds +1: +1: Time to load utils op: 0.20272302627563477 seconds +1: Time to load utils op: 0.2030785083770752 seconds +3: Time to load utils op: 0.20257902145385742 seconds +3: Time to load utils op: 0.20269227027893066 seconds +3: Time to load utils op: 0.20270323753356934 secondsTime to load utils op: 0.20357680320739746 seconds +3: +3: Time to load utils op: 0.20273256301879883 secondsTime to load utils op: 0.20311665534973145 secondsTime to load utils op: 0.20290493965148926 seconds +3: +3: +5: Time to load utils op: 0.20271515846252441 seconds +5: Time to load utils op: 0.20229601860046387 seconds +5: Time to load utils op: 0.20267724990844727 seconds +5: Time to load utils op: 0.20210886001586914 seconds +7: Time to load utils op: 0.2034142017364502 seconds +7: Time to load utils op: 0.20332050323486328 seconds +7: Time to load utils op: 0.2034752368927002 secondsTime to load utils op: 0.2034320831298828 seconds +7: +7: Time to load utils op: 0.20354413986206055 seconds +7: Time to load utils op: 0.20354771614074707 seconds +7: Time to load utils op: 0.2030167579650879 seconds +5: Time to load utils op: 0.2021474838256836 seconds +5: Time to load utils op: 0.2020092010498047 seconds +2: Time to load utils op: 0.213118314743042 secondsTime to load utils op: 0.21306991577148438 seconds +2: +2: Time to load utils op: 0.21218180656433105 seconds +2: Time to load utils op: 0.2123546600341797 seconds +2: Time to load utils op: 0.2129838466644287 secondsTime to load utils op: 0.21240901947021484 secondsTime to load utils op: 0.21317839622497559 seconds +2: Time to load utils op: 0.2122962474822998 seconds +2: +2: +5: Time to load utils op: 0.20248913764953613 seconds +0: Time to load utils op: 0.00042128562927246094 seconds +0: Time to load utils op: 0.0003857612609863281 seconds +0: Time to load utils op: 0.00038051605224609375 seconds +1: Time to load utils op: 0.0003209114074707031 seconds +0: Time to load utils op: 0.000400543212890625 seconds +0: Time to load utils op: 0.00039124488830566406 seconds +0: Time to load utils op: 0.00039696693420410156 seconds +1: Time to load utils op: 0.0003325939178466797 seconds +1: Time to load utils op: 0.0003509521484375 seconds +1: Time to load utils op: 0.00035834312438964844 seconds +1: Time to load utils op: 0.0003414154052734375 seconds +1: Time to load utils op: 0.00034499168395996094 seconds +1: Time to load utils op: 0.0003452301025390625 seconds +3: Time to load utils op: 0.0003216266632080078 seconds +3: Time to load utils op: 0.0003616809844970703 seconds +3: Time to load utils op: 0.00034427642822265625 seconds +3: Time to load utils op: 0.0003714561462402344 seconds +3: Time to load utils op: 0.00036978721618652344 seconds +3: Time to load utils op: 0.0003495216369628906 seconds +3: Time to load utils op: 0.0003409385681152344 seconds +5: Time to load utils op: 0.00041866302490234375 seconds +5: Time to load utils op: 0.0003902912139892578 seconds +7: Time to load utils op: 0.0003380775451660156 seconds +5: Time to load utils op: 0.0003948211669921875 seconds +7: Time to load utils op: 0.00038051605224609375 seconds +5: Time to load utils op: 0.0003962516784667969 seconds +7: Time to load utils op: 0.0003306865692138672 seconds +7: Time to load utils op: 0.00034356117248535156 seconds +7: Time to load utils op: 0.0003352165222167969 seconds +7: Time to load utils op: 0.000324249267578125 seconds +7: Time to load utils op: 0.00035762786865234375 seconds +5: Time to load utils op: 0.00044798851013183594 seconds +5: Time to load utils op: 0.0003685951232910156 seconds +0: [2023-03-15 23:30:54,688] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-15 23:30:54,688] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-15 23:30:54,689] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.49 GB, percent = 6.3% +5: Time to load utils op: 0.0003464221954345703 seconds +4: Time to load utils op: 0.23276448249816895 seconds +4: Time to load utils op: 0.23276424407958984 seconds +4: Time to load utils op: 0.23276996612548828 seconds +4: Time to load utils op: 0.23274898529052734 seconds +4: Time to load utils op: 0.2327897548675537 seconds +4: Time to load utils op: 0.2328171730041504 seconds +4: Time to load utils op: 0.2328348159790039 secondsTime to load utils op: 0.23277616500854492 seconds +4: +6: Time to load utils op: 0.2323293685913086 seconds +6: Time to load utils op: 0.23234128952026367 seconds +6: Time to load utils op: 0.23231840133666992 secondsTime to load utils op: 0.2323439121246338 secondsTime to load utils op: 0.23235321044921875 seconds +6: +6: +6: Time to load utils op: 0.23237967491149902 seconds +6: Time to load utils op: 0.2323760986328125 seconds +6: Time to load utils op: 0.23238778114318848 seconds +2: Time to load utils op: 0.0007610321044921875 seconds +2: Time to load utils op: 0.0008673667907714844 seconds +2: Time to load utils op: 0.0009624958038330078 seconds +2: Time to load utils op: 0.001012563705444336 seconds +2: Time to load utils op: 0.0010972023010253906 seconds +2: Time to load utils op: 0.0010623931884765625 seconds +2: Time to load utils op: 0.001009225845336914 seconds +2: Time to load utils op: 0.0011353492736816406 seconds +6: Time to load utils op: 0.0010094642639160156 seconds +6: Time to load utils op: 0.000995635986328125 seconds +4: Time to load utils op: 0.0009770393371582031 seconds +4: Time to load utils op: 0.0009236335754394531 seconds +6: Time to load utils op: 0.0011255741119384766 secondsTime to load utils op: 0.0012040138244628906 seconds +6: +4: Time to load utils op: 0.001102447509765625 seconds +4: Time to load utils op: 0.0010173320770263672 seconds +6: Time to load utils op: 0.0012688636779785156 seconds +6: Time to load utils op: 0.0012271404266357422 seconds +6: Time to load utils op: 0.001123666763305664 seconds +6: Time to load utils op: 0.0012054443359375 seconds +4: Time to load utils op: 0.0013022422790527344 seconds +4: Time to load utils op: 0.0012652873992919922 seconds +4: Time to load utils op: 0.00128173828125 seconds +4: Time to load utils op: 0.0013434886932373047 seconds +0: [2023-03-15 23:30:54,806] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-15 23:30:54,807] [INFO] [utils.py:828:see_memory_usage] MA 2.45 GB Max_MA 2.45 GB CA 3.13 GB Max_CA 3 GB +0: [2023-03-15 23:30:54,807] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:54,915] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-15 23:30:54,915] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:54,915] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,020] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-15 23:30:55,020] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:55,020] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,127] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-15 23:30:55,127] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:55,128] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,231] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-15 23:30:55,231] [INFO] [utils.py:828:see_memory_usage] MA 3.49 GB Max_MA 3.49 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:55,231] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,340] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-15 23:30:55,340] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:55,341] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,444] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-15 23:30:55,444] [INFO] [utils.py:828:see_memory_usage] MA 3.57 GB Max_MA 3.57 GB CA 4.68 GB Max_CA 5 GB +0: [2023-03-15 23:30:55,445] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.51 GB, percent = 6.3% +0: [2023-03-15 23:30:55,445] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-15 23:30:55,445] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-15 23:30:55,445] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-15 23:30:55,445] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-15 23:30:55,445] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-15 23:30:55,446] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-15 23:30:55,447] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-15 23:30:55,448] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-15 23:30:55,448] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-15 23:30:55,448] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-15 23:30:55,448] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00042891502380371094 seconds +0: [2023-03-15 23:30:55,449] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-15 23:30:55,468] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=26 [0, 26) STAGE_PARAMS=618714624 (618.715M) TOTAL_PARAMS=618714624 (618.715M) UNIQUE_PARAMS=618714624 (618.715M) +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:55,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +1: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +6: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +3: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +4: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt... +5: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +1: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/mp_rank_00_model_states.pt. +3: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +5: [2023-03-15 23:30:55,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +6: [2023-03-15 23:30:55,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +4: [2023-03-15 23:30:55,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +1: [2023-03-15 23:30:55,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +7: [2023-03-15 23:30:55,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +3: [2023-03-15 23:30:55,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt... +2: [2023-03-15 23:30:55,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +4: [2023-03-15 23:30:55,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:55,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +7: [2023-03-15 23:30:55,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +2: [2023-03-15 23:30:55,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +5: [2023-03-15 23:30:55,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +1: [2023-03-15 23:30:55,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +6: [2023-03-15 23:30:55,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_01-model_00-model_states.pt. +3: [2023-03-15 23:30:55,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +5: [2023-03-15 23:30:55,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +4: [2023-03-15 23:30:55,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:55,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +2: [2023-03-15 23:30:55,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +6: [2023-03-15 23:30:55,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +4: [2023-03-15 23:30:55,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +3: [2023-03-15 23:30:55,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +2: [2023-03-15 23:30:55,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:55,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +7: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +7: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +1: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt... +6: [2023-03-15 23:30:55,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +5: [2023-03-15 23:30:55,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +1: [2023-03-15 23:30:55,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_03-model_00-model_states.pt. +3: [2023-03-15 23:30:55,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:55,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +4: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +6: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:55,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +2: [2023-03-15 23:30:55,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +2: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:55,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +3: [2023-03-15 23:30:55,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +7: [2023-03-15 23:30:55,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:55,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:55,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:55,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:55,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:55,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:55,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:55,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +5: [2023-03-15 23:30:55,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +4: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:55,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +7: [2023-03-15 23:30:56,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +6: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +1: [2023-03-15 23:30:56,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt... +5: [2023-03-15 23:30:56,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +3: [2023-03-15 23:30:56,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_04-model_00-model_states.pt. +1: [2023-03-15 23:30:56,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,107] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +6: [2023-03-15 23:30:56,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +7: [2023-03-15 23:30:56,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:56,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +5: [2023-03-15 23:30:56,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,121] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +1: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +4: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt... +2: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +2: [2023-03-15 23:30:56,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +4: [2023-03-15 23:30:56,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +3: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +7: [2023-03-15 23:30:56,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +6: [2023-03-15 23:30:56,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:56,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +1: [2023-03-15 23:30:56,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_05-model_00-model_states.pt. +5: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:56,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +7: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +4: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +6: [2023-03-15 23:30:56,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +5: [2023-03-15 23:30:56,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +3: [2023-03-15 23:30:56,217] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +1: [2023-03-15 23:30:56,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt... +2: [2023-03-15 23:30:56,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +4: [2023-03-15 23:30:56,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +7: [2023-03-15 23:30:56,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +1: [2023-03-15 23:30:56,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +2: [2023-03-15 23:30:56,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +5: [2023-03-15 23:30:56,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +3: [2023-03-15 23:30:56,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +6: [2023-03-15 23:30:56,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:56,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:56,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +6: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +2: [2023-03-15 23:30:56,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +7: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +1: [2023-03-15 23:30:56,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +4: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +3: [2023-03-15 23:30:56,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt... +5: [2023-03-15 23:30:56,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +4: [2023-03-15 23:30:56,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:56,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +7: [2023-03-15 23:30:56,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +6: [2023-03-15 23:30:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +3: [2023-03-15 23:30:56,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +5: [2023-03-15 23:30:56,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +2: [2023-03-15 23:30:56,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_07-model_00-model_states.pt. +1: [2023-03-15 23:30:56,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +5: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:56,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +2: [2023-03-15 23:30:56,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +6: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +3: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +4: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt... +7: [2023-03-15 23:30:56,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +4: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +3: [2023-03-15 23:30:56,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +7: [2023-03-15 23:30:56,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +2: [2023-03-15 23:30:56,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:56,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +6: [2023-03-15 23:30:56,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +1: [2023-03-15 23:30:56,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_08-model_00-model_states.pt. +5: [2023-03-15 23:30:56,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +4: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +7: [2023-03-15 23:30:56,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +2: [2023-03-15 23:30:56,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:56,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +5: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +6: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +1: [2023-03-15 23:30:56,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt... +3: [2023-03-15 23:30:56,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +6: [2023-03-15 23:30:56,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +7: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +4: [2023-03-15 23:30:56,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +2: [2023-03-15 23:30:56,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +3: [2023-03-15 23:30:56,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +1: [2023-03-15 23:30:56,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +5: [2023-03-15 23:30:56,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:56,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +5: [2023-03-15 23:30:56,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +4: [2023-03-15 23:30:56,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +2: [2023-03-15 23:30:56,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +1: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +6: [2023-03-15 23:30:56,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +3: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:56,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt... +7: [2023-03-15 23:30:56,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +4: [2023-03-15 23:30:56,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +1: [2023-03-15 23:30:56,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +7: [2023-03-15 23:30:56,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +2: [2023-03-15 23:30:56,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +3: [2023-03-15 23:30:56,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +6: [2023-03-15 23:30:56,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +5: [2023-03-15 23:30:56,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:56,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +5: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +1: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,760] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +7: [2023-03-15 23:30:56,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +2: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +4: [2023-03-15 23:30:56,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +4: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +2: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +7: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +3: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +3: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt... +6: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +1: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:56,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +5: [2023-03-15 23:30:56,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_11-model_00-model_states.pt. +6: [2023-03-15 23:30:56,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +7: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:56,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +2: [2023-03-15 23:30:56,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +1: [2023-03-15 23:30:56,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +4: [2023-03-15 23:30:56,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:56,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +3: [2023-03-15 23:30:56,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +7: [2023-03-15 23:30:56,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:56,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +4: [2023-03-15 23:30:56,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:56,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:56,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +1: [2023-03-15 23:30:56,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +6: [2023-03-15 23:30:56,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:56,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:56,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +2: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +3: [2023-03-15 23:30:56,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:56,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +6: [2023-03-15 23:30:56,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt... +5: [2023-03-15 23:30:56,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:56,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:56,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +5: [2023-03-15 23:30:56,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:56,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:56,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +3: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +5: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:57,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +0: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +1: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +7: [2023-03-15 23:30:57,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +4: [2023-03-15 23:30:57,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +4: [2023-03-15 23:30:57,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +7: [2023-03-15 23:30:57,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +6: [2023-03-15 23:30:57,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt... +2: [2023-03-15 23:30:57,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +6: [2023-03-15 23:30:57,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +3: [2023-03-15 23:30:57,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +5: [2023-03-15 23:30:57,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +2: [2023-03-15 23:30:57,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +0: [2023-03-15 23:30:57,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_13-model_00-model_states.pt. +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +1: [2023-03-15 23:30:57,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +2: [2023-03-15 23:30:57,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +3: [2023-03-15 23:30:57,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +4: [2023-03-15 23:30:57,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +1: [2023-03-15 23:30:57,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +5: [2023-03-15 23:30:57,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +7: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt... +6: [2023-03-15 23:30:57,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +4: [2023-03-15 23:30:57,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +2: [2023-03-15 23:30:57,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +6: [2023-03-15 23:30:57,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +3: [2023-03-15 23:30:57,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +5: [2023-03-15 23:30:57,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,229] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_14-model_00-model_states.pt. +7: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +6: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +3: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +5: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +4: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +7: [2023-03-15 23:30:57,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +1: [2023-03-15 23:30:57,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +0: [2023-03-15 23:30:57,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,254] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +1: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +4: [2023-03-15 23:30:57,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +7: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +6: [2023-03-15 23:30:57,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +2: [2023-03-15 23:30:57,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +3: [2023-03-15 23:30:57,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt... +2: [2023-03-15 23:30:57,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +0: [2023-03-15 23:30:57,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_15-model_00-model_states.pt. +5: [2023-03-15 23:30:57,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +5: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +3: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +6: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +2: [2023-03-15 23:30:57,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +4: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +1: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,428] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +7: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt... +0: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +1: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +4: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +3: [2023-03-15 23:30:57,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +6: [2023-03-15 23:30:57,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +7: [2023-03-15 23:30:57,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +2: [2023-03-15 23:30:57,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +0: [2023-03-15 23:30:57,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_16-model_00-model_states.pt. +5: [2023-03-15 23:30:57,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +6: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +3: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +2: [2023-03-15 23:30:57,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +5: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +1: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +0: [2023-03-15 23:30:57,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +4: [2023-03-15 23:30:57,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt... +7: [2023-03-15 23:30:57,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +1: [2023-03-15 23:30:57,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +5: [2023-03-15 23:30:57,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +4: [2023-03-15 23:30:57,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +6: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +7: [2023-03-15 23:30:57,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +2: [2023-03-15 23:30:57,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +3: [2023-03-15 23:30:57,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_17-model_00-model_states.pt. +0: [2023-03-15 23:30:57,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +4: [2023-03-15 23:30:57,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +2: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +6: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +3: [2023-03-15 23:30:57,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +5: [2023-03-15 23:30:57,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +0: [2023-03-15 23:30:57,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +1: [2023-03-15 23:30:57,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt... +7: [2023-03-15 23:30:57,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +1: [2023-03-15 23:30:57,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +7: [2023-03-15 23:30:57,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +2: [2023-03-15 23:30:57,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +3: [2023-03-15 23:30:57,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +4: [2023-03-15 23:30:57,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +6: [2023-03-15 23:30:57,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +5: [2023-03-15 23:30:57,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_18-model_00-model_states.pt. +0: [2023-03-15 23:30:57,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +5: [2023-03-15 23:30:57,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +7: [2023-03-15 23:30:57,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +1: [2023-03-15 23:30:57,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +1: [2023-03-15 23:30:57,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +6: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +4: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +3: [2023-03-15 23:30:57,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt... +0: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +4: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +2: [2023-03-15 23:30:57,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +6: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +0: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +5: [2023-03-15 23:30:57,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +7: [2023-03-15 23:30:57,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,788] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_19-model_00-model_states.pt. +3: [2023-03-15 23:30:57,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +0: [2023-03-15 23:30:57,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +5: [2023-03-15 23:30:57,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +7: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +4: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +3: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +6: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +2: [2023-03-15 23:30:57,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt... +1: [2023-03-15 23:30:57,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +1: [2023-03-15 23:30:57,867] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +4: [2023-03-15 23:30:57,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +0: [2023-03-15 23:30:57,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +7: [2023-03-15 23:30:57,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +2: [2023-03-15 23:30:57,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +6: [2023-03-15 23:30:57,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +3: [2023-03-15 23:30:57,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_20-model_00-model_states.pt. +5: [2023-03-15 23:30:57,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: > overriding learning rate value to 0.0002 +4: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1 +0: > overriding decay style value to cosine +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +1: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +3: [2023-03-15 23:30:57,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +0: [2023-03-15 23:30:57,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +5: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +7: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +2: [2023-03-15 23:30:57,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +4: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt... +6: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +6: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:57,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:57,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +4: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +4: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +4: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +1: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +7: [2023-03-15 23:30:57,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:57,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +3: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +0: [2023-03-15 23:30:57,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:57,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:57,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +2: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:57,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +3: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +3: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:58,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:58,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +2: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +7: [2023-03-15 23:30:58,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:58,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +0: [2023-03-15 23:30:58,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +0: [2023-03-15 23:30:58,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:58,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_21-model_00-model_states.pt. +5: [2023-03-15 23:30:58,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +7: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:58,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:58,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:58,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:58,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt... +5: [2023-03-15 23:30:58,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/layer_23-model_00-model_states.pt. +5: [2023-03-15 23:30:58,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:58,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,207] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +1: [2023-03-15 23:30:58,212] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +6: [2023-03-15 23:30:58,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,212] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +0: [2023-03-15 23:30:58,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,214] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +6: [2023-03-15 23:30:58,216] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +3: [2023-03-15 23:30:58,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,218] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +0: [2023-03-15 23:30:58,219] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +2: [2023-03-15 23:30:58,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,221] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +3: [2023-03-15 23:30:58,222] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +2: [2023-03-15 23:30:58,225] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +5: [2023-03-15 23:30:58,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,243] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +5: [2023-03-15 23:30:58,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,246] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +5: [2023-03-15 23:30:58,247] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +5: [2023-03-15 23:30:58,250] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +3: [2023-03-15 23:30:58,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,250] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +3: [2023-03-15 23:30:58,255] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +0: [2023-03-15 23:30:58,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,257] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +3: [2023-03-15 23:30:58,258] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +2: [2023-03-15 23:30:58,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,259] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +0: [2023-03-15 23:30:58,262] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +3: [2023-03-15 23:30:58,262] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +2: [2023-03-15 23:30:58,263] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +5: [2023-03-15 23:30:58,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,266] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +7: [2023-03-15 23:30:58,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,270] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +7: [2023-03-15 23:30:58,266] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +7: [2023-03-15 23:30:58,270] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +2: [2023-03-15 23:30:58,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,272] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +1: [2023-03-15 23:30:58,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,272] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +3: [2023-03-15 23:30:58,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,273] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +2: [2023-03-15 23:30:58,275] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +1: [2023-03-15 23:30:58,277] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +3: [2023-03-15 23:30:58,277] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +7: [2023-03-15 23:30:58,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,280] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +5: [2023-03-15 23:30:58,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,283] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +6: [2023-03-15 23:30:58,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,284] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +7: [2023-03-15 23:30:58,284] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +1: [2023-03-15 23:30:58,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,285] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +5: [2023-03-15 23:30:58,288] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +6: [2023-03-15 23:30:58,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,288] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +6: [2023-03-15 23:30:58,288] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +5: [2023-03-15 23:30:58,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,289] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +1: [2023-03-15 23:30:58,289] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +6: [2023-03-15 23:30:58,292] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +5: [2023-03-15 23:30:58,293] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +0: [2023-03-15 23:30:58,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,299] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +0: [2023-03-15 23:30:58,303] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +7: [2023-03-15 23:30:58,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,305] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +0: [2023-03-15 23:30:58,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,308] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +6: [2023-03-15 23:30:58,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,309] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +7: [2023-03-15 23:30:58,309] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +0: [2023-03-15 23:30:58,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,309] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +4: [2023-03-15 23:30:58,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,310] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +6: [2023-03-15 23:30:58,314] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +0: [2023-03-15 23:30:58,314] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +4: [2023-03-15 23:30:58,314] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +0: [2023-03-15 23:30:58,316] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +0: [2023-03-15 23:30:58,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,319] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +7: [2023-03-15 23:30:58,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,321] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +1: [2023-03-15 23:30:58,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,323] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +0: [2023-03-15 23:30:58,323] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +4: [2023-03-15 23:30:58,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,324] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +4: [2023-03-15 23:30:58,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +7: [2023-03-15 23:30:58,326] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +2: [2023-03-15 23:30:58,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,327] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +1: [2023-03-15 23:30:58,327] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +0: [2023-03-15 23:30:58,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:58,328] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +4: [2023-03-15 23:30:58,328] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +1: [2023-03-15 23:30:58,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,328] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +1: [2023-03-15 23:30:58,328] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +4: [2023-03-15 23:30:58,329] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +2: [2023-03-15 23:30:58,331] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +0: [2023-03-15 23:30:58,332] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +1: [2023-03-15 23:30:58,332] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +5: [2023-03-15 23:30:58,333] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +4: [2023-03-15 23:30:58,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,334] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +2: [2023-03-15 23:30:58,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,334] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +4: [2023-03-15 23:30:58,337] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +2: [2023-03-15 23:30:58,338] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +7: [2023-03-15 23:30:58,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,340] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +3: [2023-03-15 23:30:58,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,341] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +3: [2023-03-15 23:30:58,341] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +7: [2023-03-15 23:30:58,344] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +3: [2023-03-15 23:30:58,345] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +3: [2023-03-15 23:30:58,345] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +6: [2023-03-15 23:30:58,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,347] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +7: [2023-03-15 23:30:58,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,348] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +3: [2023-03-15 23:30:58,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,350] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +4: [2023-03-15 23:30:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,350] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +1: [2023-03-15 23:30:58,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,350] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +6: [2023-03-15 23:30:58,351] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +7: [2023-03-15 23:30:58,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,352] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +7: [2023-03-15 23:30:58,352] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +6: [2023-03-15 23:30:58,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,353] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +3: [2023-03-15 23:30:58,353] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +4: [2023-03-15 23:30:58,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,354] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +1: [2023-03-15 23:30:58,354] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +4: [2023-03-15 23:30:58,354] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +2: [2023-03-15 23:30:58,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,355] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +7: [2023-03-15 23:30:58,356] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +6: [2023-03-15 23:30:58,357] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +4: [2023-03-15 23:30:58,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,357] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +4: [2023-03-15 23:30:58,358] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +2: [2023-03-15 23:30:58,360] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +4: [2023-03-15 23:30:58,361] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +6: [2023-03-15 23:30:58,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,362] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +0: [2023-03-15 23:30:58,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:58,363] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +0: [2023-03-15 23:30:58,363] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +6: [2023-03-15 23:30:58,367] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +4: [2023-03-15 23:30:58,367] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +1: [2023-03-15 23:30:58,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,368] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +0: [2023-03-15 23:30:58,368] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +1: [2023-03-15 23:30:58,372] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +5: [2023-03-15 23:30:58,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,382] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +5: [2023-03-15 23:30:58,386] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +6: [2023-03-15 23:30:58,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:58,413] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +7: [2023-03-15 23:30:58,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:58,416] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +6: [2023-03-15 23:30:58,417] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +3: [2023-03-15 23:30:58,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:58,417] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +7: [2023-03-15 23:30:58,420] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +3: [2023-03-15 23:30:58,421] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +1: [2023-03-15 23:30:58,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:58,430] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +1: [2023-03-15 23:30:58,434] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +5: [2023-03-15 23:30:58,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:58,452] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +5: [2023-03-15 23:30:58,456] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +2: [2023-03-15 23:30:58,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,464] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +2: [2023-03-15 23:30:58,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:58,465] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +2: [2023-03-15 23:30:58,468] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +2: [2023-03-15 23:30:58,469] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +0: successfully loaded checkpoint from checkpoints_619m2b7400m at iteration 0 +7: time (ms) | load-checkpoint: 3011.75 +0: estimated model parameters: 0.618714624 +0: estimated model parameters without embeddings: 0.538301952 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-15 23:30:59 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.044595 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > WARNING: could not find index map files, building the indices on rank 0 ... +0: > only one epoch required, setting separate_last_epoch to False +0: > elasped time to build and save doc-idx mapping (seconds): 0.100536 +0: using: +0: number of documents: 835726 +0: number of epochs: 1 +0: sequence length: 2048 +0: total number of samples: 195100 +0: > elasped time to build and save sample-idx mapping (seconds): 0.030426 +0: > building shuffle index with split [0, 195100) and [195100, 195100) ... +0: > elasped time to build and save shuffle-idx mapping (seconds): 0.005628 +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.065 seconds +0: total number of samples: 195101 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.042588 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.075 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-15 23:31:13 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 20956.94 | train/valid/test-data-iterators-setup: 13824.13 +0: [after training is done] datetime: 2023-03-15 23:31:13 +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.449187E+00 | lm loss PPL: 3.147480E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3318673: Wed 15 Mar 2023 11:31:52 PM EET diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98806de66c21c2cda79ceb3d75db13b586a08983 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352abcbf1c23f798e99a54aa940360c10e9a485f2134628f79d0c3f0397a2ece +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a50047d80173fe6ed6b202ff9a0f15529b7b3f5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25960b24c6e89610841840559a95cfc7c220b6d6c416fe642cab06f361ae5e7e +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f988184b3b1472cdcd7a2e2a883c80075d721dd7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78169f873f6226fa7ce9407cdbb0ceb1652d8205ca5e5b7ecdc031c90b56e4d5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a0a8e531f7bb9de14f4693b8fa918b218fb5fce --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:046610d9d3ddc90baa083f0d5f73a936a3913b79c6360e8151f5eb66d95eb53a +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2d191be1d8ae690976d935fd38d313bb3cc7bdd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46f74e014ce76b10b4966d0f8645c1455d91b81550327a60db16f82d4feb439 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e3fb24653237905d4f5df37d7db2d529d87d1d4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2354abb06950219d7a60c8d45e4ce4c6b1d8579895b640e85a10551cbf3b0bb +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..731280e02b43ec70d815967305534a548b70f437 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fe51178caa8868f1c205add7b5c7fff6311885453671e3f879c55245cab485 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9595f9d36dabf94c266fd2b65b0039ca712dd8c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e962a0c594ca58bf7e59bdb5ad42db711694e6b16d03d23c12193eec82a18f9 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e78b8f905e0ac15050bb27a8e12f1a9be9bed608 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1461b9854aebf649dd367c3fda02d63f4e74c651dd4b6a420c238c9b7abbbc4b +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..766d4686374e3d9c0a5cfd09a551761a35fe60bc --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08582088d051f7f9f317d2c75781a18cf843327f805d8ba5e2484444bb96cbc2 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33c9c493313cfead88d293e1f28d91f02c26547e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4cc69967b35489171a966a4397411e3067f4cc67ffd7db3c807b3fa7eb0e4f3 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b89f5b90cddeab61043f4389911dc4a85807346 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55137103c24ab29a3601f4323f2c1029bbb89ebc348d50493872e3dcbba9ff22 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25275bb9fd84003262fcfaae1100411e38121113 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8889c7fab8e50470fb54da99c65c77e952b1e85ea64e001c09334f94d98d9de4 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d5d6b69d9ae78263c61f3a4b0b89672eb0119c5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aeb4568345af57d855fb5ebdc0794c91303161b13d1eac979939fa8d18f4d63 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0921a3ea2903cf6af6fc86a3f40622977add90ac --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bacb5ae949c9246d8ecdda078e8452ad7699485eaaf48e4a3dbd3d706cdbef81 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a7e524c05141b4ac5b10438a1d497aec71d1164 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0caa133a0c1fdc83b0bfd49a623bea83cad79581dae384768a3ee491597d9c60 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffa6690fd4e187d21c932ca2bb94bbf05a43ce5d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48c5bb544d0f638ebb1fc284a92a7491e68245cb4eaf852dcd00cc48283de3c3 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e6b3415496e74ed4db1b28ef2c4825887793452 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34cc15efdbb7f7c968b686fdc65149fac79c80cd3acd25f6bc1cf87ff073406e +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03e7143f1f21a18273ea366bd81287fd11e34e1d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c93ab991b2fcc776775664d9f222d396a505ef7a589e634e5e1ed2bf32c440e4 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f9bde697e163e129d40d77ceb6450b4c111328f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f26599b125a1057124998a5e021ea2fa6fea3037109ba6da5d72f3834de4950 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..785ebd818ab214a5b6caf69b2fbcf157d28e77a0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921436c7e691226a1768f85881b088c182f80128d376c8950459490411c36e9e +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01aac979129f903f2e5a7778698e49a3d4a0c858 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:471b82727af21151a328914b98aed9cf2ecc720cca263ae1f54e5d2a63f51faf +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb773972e323a4ab7af7d012c84a7c9de1eadfa3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3150ccccf1eddd7fbda7c7f42f751afeec752346f7efe0866de68a2f99b78f9a +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9136f3574724f1f43cbb3a0bed61b3f8a389285a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3437261ce7cf7f5d70c41081444fae31f823d86bf29cc9cac59d2dd28b631024 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaf0ff3fb5f364e3bacdbd35f058cf9661a7083b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2127292cfb4845e3c5850c0e05b3dd21e4f17b42a632e87a2f1533fb4292782c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6d755b04f75216450ee886b2c6f0ef8df385b1c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f33fe220fdec68664211a43442b59692223261cfa5ebb81cfbe1917113105f +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a069b265e5fbefaf3d612d571157b6e9ed921d7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb241d944ccc90294338dc22c6d973b58d4b154b817b8dd3f674e99c2b4632c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5112d20da2342de585fc18b293d36ecf1904a9c0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:646bc0cba39a6bd9a21421d3dad52d53e77de8995a3809a556f8e1e054efb4cb +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2abffc105b07a56eadce1df28b335305f2ed2187 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:587c6cac8b5d8e348e851fa49bca9d9840c981bc5630a7d7470865d35951a1f5 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81ca85ad0efb0c2aa1c4c6ffadaced44879af2fd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0089d05c51704aac6a5506e57355cdfc75b1368a21be11222cae662cacbbf44e +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a86e6d7c056384a95c50593d6066e1650b69e96f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6c5f7bd8eaf1adae7d1540cbfe1fe2f03115180b076e66ddf96463fea96a10 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d644623cc693a77a3591eea8be9f8cdd770265a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b18d84139cf50594da4077b9f9a31b948ad7fad07db3ea3db34243437fa5084 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75f1cc54893172b6a4cef15eb33c70f2e14f5a54 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b03a2d47bed5520eeee8ec55ba4cd6362851bbf8aeac89c2ecf282412e05d35 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe62f90552d886151b5f4145f90e60304c98f8ba --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22e9d2c40e77ecbf94f49f7cf44f7f990ff2833f2c6f6b5d58b16417d1703f3 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bc74abdd0f8d5857ba797da800b78de681f41f4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516ffb47fcc800b8f8ada68362d44a73fca626945aa2fdb5062bd01fef03af96 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10fd86cd770abdae0b5fc244a4367f9974e2c261 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6134a5cc303dea35dd21983c142489b72e1d9b01285c96266d0a172d67b4f9 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66055daf8adc1b18af5932eb68f3eb38f34ad4ac --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dd9c15408535cb079da1d681cbc55ed59158bf2183a1a058fa6a34c2269b65 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..193f2ef7f3cc85481368cd752d1097cbd24d19a8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cca0a42059fd2b78b61af89c24051389ddaf23a9385edcb5ef7db692883353d +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b45d50c53a9d6ea15626616e39dea0a8d1f8c34f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:781fbe3099aee2918ca95c3c0d281f2062c0ab871f281681d14a865d25d74d42 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6901454f5d953fecdc2d189655e75551acee978 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91016f8d741131681ab7dad22907a39ca50c0e9c1734230e57c75f5fbb8d4f1c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1376460c3c7e8a20b4add68aa7f7a4e3c053741c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9abb1bc615cd66155a246538a826787cfc247756b78699f1cc50d307e0033c44 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4deb4284002004c8ebd12d3bb652cbdfcad34522 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47d9b3928746d788f889b86aa827c0a399e6ae30731f110fba37c03d9c822b8c +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dffb62b0ff611944eb57b0b143328fa39c527a2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0ef53eec3811d7395b849b0872e061bf328b303d41841b5b076b073df4489f7 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b505df0f8ad7f6891583e40c4b54a2533a66f9e1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e07ea65987d601980e3c3388a5066b78a487ea767fb365ad6d64edb1f390086 +size 116013538 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a83602f6e2829837529c9535301499c6b22e40a7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4dc375c395c5ffd5ce99ababbb6980c88d5e2b4413e97a8030e8dcb28b3344c +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70cba64b0f6c86e16b2226b3402dcd44ee4f36e8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b507ffbda7c051158eda8110cbeac6a7d5b7eefe0b7c8a347bcea056127ce22d +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..616690944b21b6da9c848eeea39b06592f5bf9db --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b43ece2e0c79ac4741216bfc425abb7a7eb24f4a7a2b33631d0c3a7df332e2 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ace3a28a36dacc93cb56499675da776d611df48c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:375777f404941eb5629438fbde49d410a89585a900ab3c61dba5ef4efcca3222 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2016d916dacf1a4af84e9a4dc7ed86d4a462625b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4e4fb8ca74024b6ff1da70ab27650655e1cf6ff4df4bec12f073342ac5ba11 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2038050162430321578f1bddb0596ec5bf633b94 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baacae95bb8db72011151ba8dda40141ebc45307d25bea1bd188b877153d650f +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6988b29cb5dec5f6999ab5ac7326f8304539859 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e79dcff56e85f77344b1ac1a5a912f70468ea9bfea7b72a0d92639238beb9de +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ab3cda1ccdebfece960c21d33edb9811ed5632d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f476b2480bdf083e2d0db65696e83b1a1adc381c31edfc41bcd805de4f8c4e5 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80c6cca938062a7e5494ac33d0f964d4b12fa8e8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:934f09c68700f46d32aa2764f11f449b96fd816938db299f3a976fbbfc7ad35a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cd31d196fe27626c6c9ed0dcf1c83666ba78e79 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024c81a024194e0acd4062b5e129a2d5ca073ad64a30bb98cbefa52a1776e0fb +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88d3cb4e5bab8da65ae3a2cb49c288740de31dd4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:461827fd4337b62a1b84e15d7332e97e6c9fa43fbf443012c013e0289bd72ccc +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f53d52601d277c127ece62ffc615514568cc856 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562c029b8003d7d478e28de8a9627aac62821f847e28a7372f2a4f26ccda434c +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1ef097390d86fcdb78e9f156bf8e73cd25f65c9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f005b69873824da8f91814090e1d980b2e14cb4866688fc624194fa3573dc2df +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1306523019bc66d51a88217eaae623e163c4d52 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92a71cc262c0693345763b1ac58d0003b8cb9e0cee50cac8207a8fc2d6cbf892 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66df67b1fd374bceb76dd7162575e1c3bf2ad48f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71ee758b327f96e52405b3f9543eaf7b5a29273be82288c784bb1107e8575dc6 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0894009bc9db2b8e8b50bd733d25d4e150c1165 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6045836aa1371e8998cf1cd9bd91d22bb2fb1fca18d19ad45c5d11f907ee6e61 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb37811ed1a6bddbc8ebe931c11a9126fb05ce71 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459286f6ad3a96897ed5dd1b48701f10ca2f1dfcaa9541acedba5b34edda832c +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..045c4aaca34efb9685594c5b3996e5d3c125d112 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2523c178188c5e0c0b3e83fea6995097156a0e47c82e499951c1309cde2cdd5 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0751531ebeff7f3e6cf67999123fea857de36b6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96acd0838a45f9ea1d2ef16fd2b4c29be82854c2de8c05b33fe7dd372b989a0b +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..febbf0f4ba419ce3bd34cd4fc2360605a630a2a0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:071c7851231218809190afeaf3092f42c668a1d706186e1187529573d320d451 +size 116013591 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_01-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24c6c76f5c40ab35b6ef459ee60d1deacbd11fc5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c953d54d435e8e2d3b52492bdcc2e41e435e048d0daea575dc032260dd4f1b +size 160826627 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_03-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..927c0020e578ce9f3b732ed59b8b6b40313188c9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4925e89adb51addccb508556ac5278a1914aa9091860f8e2a9fe6b9ec21a4d70 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_04-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69541cc9ec1775ff52279c6cbeee52ef883f7c97 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafad222ec65173cc1d2dcb2dd1de723685f2fcdcc65d4f97bac1d4998c9626f +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_05-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..613cf24f9071f8b6c2870aced7d24dc2557d65b3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c95afa24d608051af4c618815ed703d9480e2a92768380387fb88cfa9293aba +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_06-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae9191f88df70f1c4d79e6ab6edbd98b5cb66148 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00f47484dada28e397ec4f1d69ca100f7f4f1f9f4fbcf98551c2485716e169c +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_07-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7523b8c5d56137cbc1f021c10ec03894bce4a549 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9a66f4c59b4ae9017721c8ae57f285d9222e08f6baed1622274a65d58a30c5 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_08-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a957cc54179c89e725b0f4eea036a5004a6990d3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0872229514c276ee02f27f2eeca08ea754b39b6b26d0fded696a0df8863db224 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_09-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fe04a26e6033ff8c8d4ea090ae14dd6778a1432 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb05706b32c8136b9a1f55363a3736a462bc451c057b708968db18e122e394ca +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_10-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..548df0913b914e2cf2d04d4fa040754489b83b67 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39c1a7af7397142b932c6ea0adce0fb2269f8be19a7171e82f1df637a92d86f3 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_11-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0f72d08da7141baeb5e4bd8bb13220a439f0dcb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd1008165cc74ebdb482d3b8b8ec8367c7a6127580301418a52ee6653937a91 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_12-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0028560c3e38643a1e2db7c669517f6cf75093c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d193c74bb0d1dd5c42e94db82807b38b7edcc3764195b5f09167f74345d5172 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_13-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7d5b8f65701d33ff42221330fbe6a0a030f0752 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017fd173e9d70ff327dcb7d92f74bd719edcf6b2a0e88375c29884d113185089 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_14-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e9459c2ccc40d0667e22840d7908a4c16a514e0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6319261cabca695cc20d899e409c4e30f49d8f0cf4f12f4fc6cfb6deaf074ff7 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_15-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7ea25444a8fc7b435f80dd83ccbde9f590eeaea --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82f01168efea39e682e33f08fcc9b49a0d90b00b828798e6552ed0858e0a103 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_16-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34fa4472eb362a2409ad7953e7e72ef56f92b2d8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8292c13c734eff66f3dcec3f36bd2b670edf613945b7d673b3778a737748186 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_17-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efef8464dd1336a1b0e57f5b9ca9a83069323b7a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad496c9239f2e779e2c931a7d61bf704b312f5e08300927d5950b22319655046 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_18-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01e91d656d90d0cccd124409ed2b4d70a0b98f9b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01aee4f74a8fe1f104dfe6fd24a54cb700f1c0e94f131dbbaa4b075472ef62dc +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_19-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3c352382b7ee70359093dc7c1a87fcb44beba4a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ddb8433937c1fef6e404ea6c82df3ba0924c22a41bcd508715a83913e8c125 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_20-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0997ee9d9ca853340d7927654c7f5b9c66526718 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171c690f7e9bb275155b80e8d8440e477f3fbc6f7dd3ab97fa3abafdf2f9615a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_21-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ea712801eb05af1b8315030763ac18ccc5df08c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c533bf0ed69dbe24ab9d2de7c3add58b26ea424783dff8094646f6ffd469ca34 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_23-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7cd36a89c4f4a9616a1c6dbcdc95c865a11ef8e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb29f2e7141752a7ab842533aee9abebb43937524ae45c7222461b0ce60fea02 +size 7363 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1d3ed04c5563e041a882cdeb01ecb1365c52007 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step1000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa99e9c78968966e8b10761ecb06264c17f01a0f2c4b1a267757e41574245c1a +size 38451 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29b34f51ae74fe21cbc266170b5b20be424d5db8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa0f35261acd61eeb1452c6efeecf03e9c129b1d0750896829c571fd8072e6c +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83e70bc9404c1653b62d0aeaaf2b5ecfb734ed9c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b84039f5cbdb2e1700d62dc73ccf41a7815bcfaabd870862bb086dd71e6362e +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d39764efa56c6123df732d4ddccc963ff30fcc71 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0e213025925d7c65fcaa6e9a9d58b26ff1f62ca02b4ed63966c024acc07f0ed +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b17918296f26048ad2be4d45f0a2bf465c3c7f4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2250bc2cc01131a786464b88b80c7d96ba834c153c88cd586704c4182ad5c1f7 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af2982e69319bf051b125c249741ac834d142195 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c6c39975c76ab4a2e78d6db3549be4030cc070e72ef0e7b246e3055ef39a3aa +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76e235e159cde56c3473094655f2e09f0ff16fa1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940718a985e5497f4485bd0a00a17986d73c18e19a2570171b6434a12e77ad56 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd759ac602496b9cf9ef4c2e5acf8ffc2ae249b2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ed10b621633deb5755f5e40aac241f31154b68fe01452d817f1e5ebc7b8bc5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..376922d0ab1622e59b712038261dc89e4ebca52b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fdc86a5239ce14e25e6228098b6babdfd8595a95640503e0661c000f908937d +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18ef04abd17c674ee007340c62d43f2817d158c7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b7206b680d76ca7f29442fb7600b9e9ee8299de5731605fdaeb71fd2426a77c +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68605d7e0d25c5603b542fc907a85de4ce8e2982 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc92d73d0d643a9a9c3acac65916343cb849b8fa5c8f0b905193389ac05c21e3 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d12b5496ce499f8843f1239f363b6b88a21d5f82 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad64c91775d1aa163891f12e531160be564759f7e71cd4791e90c8e919d124d2 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c44444e40c7950a451c01f1ccff1175bf48a3e5f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b125041260a6c71a25af1000650642b606a63f114c45cfdf9cad9403f9588f +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eec268518bd25f9f521d7f4b967ab18a034dbb65 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c83dc9cf7470261ad47af4284e5b7f2b2ba4c84fc43bfb2af754a93b087ecb1 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e92e0cd8c462fbc14d08b29c429dcc727df3d94e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85812d3aa07116545d8d988639e1ae0e772e69497d46f5f218c4f5aa5efc860a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7628d7664e09500fef1eb98f5d27c678e7bf9009 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7428ad3a990ee75083986b0a925560fbf821b9f400572dc5b612b9b5b8fe8fc9 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a87165e0295df0ecee7ead1f7d2488f14cdc1d76 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303ef175cacfea7a64031a1627027168e39c87aa1d43df05796039da1e61542d +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dae8c3063aa2c738f073281c1405d0c48d07ff8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435ae5be3d617fa1e5fdbcfb04b0317b548054e3aacc2f8eae2911ec36833b96 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ad17c4117eddcd8a0920c7a42427ffc2865802a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc15eac89f590ca23e233031c4bf0d682c38dfa266cb23c517397023a44432b4 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7232acfd8a5a5c92ca7619537df44c13370fa5e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41c0ca09f93e37e7b389aeb3368c16c37d39bd530d834c4da7df356bf75e4aa2 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00069779787c9fa1d229faa0ea19b63e7c9c1ba4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c99715bee72949ed9578321181f8a3856b21f2ae61b0f6abe1769427598c376 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e98aa1e58fe75f662f5a9cc65594f8439fa3ae5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6f33dc4969711e1e2a2772f68aa0d914e1c26e572338db900f0002db50fed0 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65ca61dc1d3f2d6209e6a2d1964f1a5eaf8cadaa --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e43486e058088739499dbd026b10607f54393fa875a2bef80c301014d44cad0 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c977ee996d8cf7efedb5e20431b7cebc1f971523 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3411ae38a332f78fd10cebef00cf2c72071b91a734e47eaae306188074ba0d1 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dabd5ff5cfa2c19b3b478f5c9783cb33e54310c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f93a07bb7fedcbad9666cb2edabdd8badb96090dfaaf9c3177c3b92d2fa98bcc +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b3c25651cf0d7aae07759aa8135949493314444 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75dc67ad12f09b54fe6b70771f037bb819782de99f1e50f38b238de062823c11 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02f572d878e0a33d0b827e0eee0bed6e6855eb70 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f68ee389661fa6b64f14aaa780cb7d0f38024477607ac1855d5eb77dd192ac +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e535487521c0158c9f79535d7ef982494c59980 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e17f2d6955c0f4e6c6f5581a30e9a5c444d421f490d37b043e4599e32d4a63 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0084a825d217375e59ab1346f770536331eff185 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b5357c1a8569593bbb1f04ca91c6704cc32e8ab4a04f4b72c23d5b9c1b2aec3 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce34553c73e97762380572465a9bd833a97fa60d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8138f912d6aba9fc814dc1e9363177e42aa79e5d1bf865faacf45ab70c1ad80f +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a99c9fd2b39526d4271ac70846cd82af7df8dc4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70a0c5dfef77400e9b3a652ab8165ade0aba7adc4558486eef787d2cf447589f +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4d5ead305848afd0d0ff3a7884a4c7f4389725c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d36b88997b97f2a6e9e76a1fe95036eaecf5c313732519093bc2abe2a24b52cd +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d46149eea7028f0a436b3eb224dea0f9e84fcd9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9699983215ddc6e533c6537b9f7c4659fb8d19ba91460d36eb9abbf2bffe6d95 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc24ac99bd4b04222537dd8506a1fb63d24ddfce --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d856e9c0083faca1782dea559b7647bf4da1bb3fdef145ab83accf5c3e71a322 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19cc6c713bf5f96921f11ec33585e1eb490d939d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3684ce2c62fd8052b887316045eafb14fbee175538228748d5cbfcdf70868360 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..791cfe0aadda8391cd7f20323981c157e89907a1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc61631487961a02fdfadb8621896d163c21f33ff1f7de526fdd73129173e3a3 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f1262a696e2af6120766ff8fcf6b1a053061041 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c6e1a91f440718ddd42e776c6defc9d81a2486efafece52471cf39dc82d59a0 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3251f7533d2bf041916352656ed66cc8db3bc346 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53aee95d3125d27a91ed5b50287a3df17f7306e5a00c2ced7906719675b81591 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d38c47ae6bae11f041ac3e48bb786d1b9c9801e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1beaec7991235e2239c2f03199b0a580708cb562dbf0b91e2b7a8e86dff3d5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b5fc891f0a9d7ec90d432bd9ae7bac2b1ca0c0d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e34cfb6b6ef824391e537745feacf4f7df64a5a371b6ad07e1650b5cc1515e3 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19d4ceb831dab3c0dc305289ce1e0bccc96680ac --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd304a25fcab819a25a6a611cd65989435199ceacaa98ad2966a5f531369b036 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..191fc291e910405b5ab1a581f156a3d883bb3f7b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f389a6bfa85607c68d316a32049f16bd12a44a6ccc2a9340e10618d2dc6d3461 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5615063d3e979ff4f77dbb4dffcb665d33900060 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c360d527d9bb3ec238c571383a5e2b631ef74976ec7a2e8f06f4e7d77b9274f +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cd93c26be58dbe638bd2a9aa24e9ba36be8f8dd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:045be38b697484943d31fe1a779227d9b9a7b240f53350d1a4e3e6ff42ba26c4 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da18c5dea7b0f7895c914c20e3936bbf352f17bf --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d943f8fca1d624f0fdc2df18f3cc724bdbd69f5093dbac303e9b294d191e46c0 +size 116013538 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..552e2102a55484f056066b19f5003451a4659cb9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94332832f745e73aa754501d70055d20c4a8e67e59aa33e50f045ebab8b3e178 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e29eb48fa0fbd5c12e46552c593937e74329184e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f65dab3c235c8b3583e3345fc6f487c5aeff5f4a1d7c1ca018cdab59d54b8c9 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1d3b219bcbba2f218581960dc8fb70a1525489 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85eb46cf83074a2e022dddc60cf2b7f2b65f525690557192e83d13ee24d4e13f +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4f6012904f15ae68499befceef3b4b3d6557693 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecaf48111740ee4c2e3fe623f411571c87efbc447147d419a1eb9b4d1114b077 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..383de9db01bddb313a7b01215662d23c4c698765 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f9c77ebbee19610dada3dc6fc182b0e63a3cc4ee27c13212ac8b63d5021f714 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9b088b3511cf767c0351ca20df209a0b7bbc7a6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d102fa0004cbe4543c5170912b7a3f829bb0a01dd57377403630a109aaa593d5 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..887bb3957db8b10e12424a050f39d2b8e3582d4a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770bfd664abad159a9b9abe20e1405cb45a1f823ca0334d502ada72f1d169353 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce89ca8d81107f935b8f1eecebb377f67d3ffd7e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1438ec91c4b98aef9df702ef66ff3bb83e47bc80e1f35b29e41570985f013385 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fe5207def43d62bf872e9088bc085dde08a7274 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0530ff910089e9c643051981e81a0d7fe5b95aeede66b97f470fca4b37279d31 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..101f954bc6d396f9bc2d55af624930cc1dc4dead --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52579865f46cb5d7036685df194043bd398631fb9e8b582f797880507d6f9251 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd03624155d7816161cffcd5c6717a5a9354aae --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a377190bb94439fcd8dd1541addbf6fe24eebe088363028ff60e8d03b2025d7 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..428a382fa7e536919d87f35152e67d2c1fcf83e4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e586647bb93a7bc9187c065f8d10f7d877f015c093bc0df9d54547ca0e75f067 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cff895bc4628a58ed6c32c2b6063ce2682e965cb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cadf114841e8912e1bec6e2fd42582a9d8c482345f284c2a87e9a4e3a17bfc6b +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1937c1deacac494c8b5e7d254562c4e5154c8569 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595a7dfe5e30bac1874752130fc1c71457b6dc54bd17527e02fdfe483ccd924a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f32cd29e1955fc2613975152178ccc15e14da1b2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76e1e3858dfe18aaafeac6eabcdc2f51bd50e07faae5822499c49be57881d655 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..84994f6d9c1db512e2422b0bd8042f4782c34906 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93bab803f0af5f3e89c799b4959acf851200eb397e750e344a399e435e2f32d3 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db6dab9a54172f03781d34bbcf24bcd085340f64 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e4d30db0288c64dff98343f46c419dcc9cceef5ae3871d2a4cf506998a7ed4d +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec157ea177c6f95d237c43282f66bab1fa455a7e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:894c7b0cb184892528ac0f259c69ac6cd3ac5c760bf509c0273ab9f47848d1f7 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa23dafcf310d3f482f0bbc21fa1568e100dc251 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f7e1c59a7ad9712ee167e4379df2e060c40fa557e05875168d34de6b7a10e4 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da8645ebcef4d124d5795ad42db1a084b6b30d68 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79a439246b89ceb08278081ec1fd250393bead2b595e21565eec51030e9d612c +size 116013591 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_01-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6a0a2577450a4eaa5f948721f2bd71e49fdc329 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7d462678b52bb830f1b582e25bd9ac8952f2ada7fd139e3303e17980c78836b +size 160826627 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_03-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4136b8f2268a897bc3a239c66ef1b0c72b389ddd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9188e91e27d6ca2d4bf97320cb2aecebba06c18473e94436dcbb85581b126a35 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_04-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..145c864677d1a7095a36ac6ce797cb6e900285b2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882890b36c287224cfa0787a868d8299a4fa3f96baf091dfad0c4069d9b210d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_05-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..312b971eed019678c4f6c355110fa57ba46c56bd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e28b915c7e820c2803e9ab7a2bb07673a5b6fc78c4de73eaaf72ab59e4dc0f +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_06-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b78cdff4a2d52c531e2acfa5925d9a603b8637b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ea497bee4ba300f5d09d002860c761b9d6e8d514d67611f5739f088e053954 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_07-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1e6652319b54b6b45a552b6a4ea5665764c5904 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b776cc3ef9fb0aa022a097a5322b9089628e8758fdd21175ed507efb5c8262 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_08-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e53181a38322434d972476805bda02cfdd9bf1c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e0e0b2107231f581e420303a6100ff7a7c85652c4d74d3fed1d14b99ca4fe70 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_09-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c1cb38bb8501a733a81e86b9726287dc9f260e1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ee102941bdcc1c6b180a37857b481339414702b64d1a8eb7dafd26dc4766bb +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_10-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5478e32f6e44dab29a44dcb0346ddb80939fd343 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c00a8c36de91a81153c47891d1338427b0c836b5407a578c382c66ec06e89229 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_11-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50637af695cd29541dfb683c9335873c46874c33 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d09de8c6a76342e8a6cd24fc3a0c550091c2a264a7ac2036c59e826959c93da +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_12-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..972cb53731d9a4e84b5331f79af25ebd4b3063ef --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c9b926b7bf4f6e0a5d1297661a62f58cb26b5fad6327443455be599c5d7a39 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_13-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15206804f3a57362b131019a8ffb1bc0b206a18f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d13f9a0c0ab313555a2618c5a710f3e97ae1f163fada8e646dc8917c1d925e76 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_14-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d56a9e59114a37e7e53bba0974d9ee681bbe85e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405fe61052be86054d68b5fe0ed3cf0fd38ec2b93e1e21370cd026f678df8b5f +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_15-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a3d4869578387abdbd1aa4ce66961dc6428b27f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650e09e5c4ede6eb8237aec851d16e23256574727382740fbbda8dbaf592dc6a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_16-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfa08d7624c93c0bd1296f915d6f5185a958525f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d85388d832048256656a6567b4b0044a28c1f51c19c37ee7bee0e19e82721ec +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_17-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a776929b12863a9e0b4a0c27747edca8f23cc21 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8027a073093db8f1d2d05d1da7551e33504d8286113d2e4f0e794df53b5057b +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_18-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15dc7c59889102a1ebb9cd8e0dfd5a211fddc91e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b2af7cb5b9e007f1da50b0b5a16bfb22d6b79e3ee9828fc7d607be390d2d88e +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_19-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58b212dace4b9e6b444015f8c1c1c2abf63e3d48 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb9bf4d92c72ed0b4604534f5bf2f436e3b579e42de9723aa56213ef82353abb +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_20-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0018504d6160e68f7cff0e2c8ca2912879e40b68 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faf4b18c6351d9c5a889ff5b585f47404adf3250d54fa0945ace34af71ed2d92 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_21-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32d035ce6110c4611a996fec275b08260ad03862 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0dec334c40237c0a865ee765642fad872243c046a2bd48aae37943a9c571790 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_23-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..718aef265734e45455c4ed6d485a37367c1eae65 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2653d166056d4ff2769d989eebff31e86e10e65ee3e97588632690f78a42b570 +size 7363 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9f4d3b3ef93557be9cb079633f3213f12561006 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step2000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf8d8c090053077f148643c376e0ce1203ec68dd48848d378ad91f3012e9cfe +size 38451 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d7e583ff12841f8d524dfe3a88e2d3f00e92f3d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727fa9d05d971bb4f4f3921658a7ebafeb4e01aad3b9bf1e0427b5fb15913c92 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e9bcc3ef50cee5c8f252f15121fd9a16dbddd13 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303aa212f26167ca1bb629ba4b229b88ca82a6d392f67ab67ca6d5549b6fc9f6 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bc14e2d198af2d49143b226513629a6fd8ab463 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9338cef2eed5a5f943246d164713ff0e227412bbc4c8d72f750fbeb3023882c2 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..474da7c3ad0260d68ea6a9f194761fe6fb40de13 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e83d25f87cc307f72a952c9463be75f6781a2c88e5b954fa298778eaff20b05 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1dd5661e811089b7304a4324e68d212e80c31d1e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b2e218b6b516fe175bca7c03d077551ffce5dee2bc6faf937816ecb75486f7a +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfbf036cb97d8e6d1406deda71a1f93d7b02a81d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fecc7262c97abf45a259bb6f8f0eb6b575cf42646cb2dd005b167cef1f9dff05 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39309c90ecac617549c4cf579132179922864eb2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb09eaaebb68e7cd39974293fc1457474f6e66e7bcbb09493bba5e9b153686c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be2fae4cc628c1d86b0cb3734b5e2a61a0f3d65a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa137d433a6bbfc67b076d9a750b68ad1f6708e3c37db9d6906c85535301c242 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62b211229aaace24ef6cf6e6954dcb375643543f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a904d663a250227f0cb3ff60c23488a656621e832eae6e301070547bdca69389 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af3d9961a1d9f377064dc640058d3e578809e9c1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fece6a50e7269c14fe2b2b8d3e023a5b67b7f056fed5a0d4969e25a5368075e2 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6aaf3cb1b916e30edc804179e17ad807fdacd0fe --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d42cf072157a1c6ea82c588831be2908a52358c11fa0ac55eaf3d48740eeefd8 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe169d75a4da45566faab23e5e3198a0ad7c3e3f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fefda579351c2c42000a9219cc333762b6a99118258b2f78c62a7d6a4293062b +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cef0073a2212a53f1807c16d202fb9e7f35a7f0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae106e4df754803a30cc89e5ef1f87f42b58717efd71d4ceffb08ecaaacdfc3 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..622d5bebcdb419de84eec6f2f41db5a1a97529be --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4302b264822ffefbf65f1b4948893e54bf29f9c0a1c974f3e4f1a7ccba62841 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48706d2bfed17b3456ad67a4da6a4b24dcb482c2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18dcbe637a97c6b498cc683ec445d5d196314f812710629121e27f51509293fe +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e594c40a1a132649eabc540115484bd46d7622dd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dfd30bf5da7769d6908c64ff60dee080f8b3e2ea6ec59419c268ddbb800dfe0 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..504c0212546aa25a349afdcd4bdfdfc5c5680a84 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fdd9241476cbcdd7db47a89d1792a615c875f17b940f5527b4cbfbb262eafe1 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf66b4f6d98bffec5736865dad04c40369d04a86 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c1faa8117b4182d3e61852a7dc2dace898af1fcaae746486ff836106f1cef7b +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ed741771d57a839dd3774c975bccdc44a2e1c2b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94b094080616ee5e43cb0a32af449b9b2ef8a1b39ca548a19905583ba223e79 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..793d75d52f16dd58eb695b1c534c6deb808aac5c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf95f6bb4c8469cb6afa9575ed89f1f7f28bb76b6d402eef76c59b2691c15489 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7b0cc50d8b06e53d46cb265d1ad7ad2d07ec4e9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f88ae141a7e38cc71796eedb7f5137e19ed51ac145e87edf9cca68b4fe5351 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9752918f470c5c2d1b2fbcea53f21d4bbf8ad894 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c1156dda2eebb6af2eaa641214acadd1e8d9666510d7a5d00f94bfffaae7c28 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..645991bc5e8314e2d857b6de197e20d45630231e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b4f0fe9ed06d95775733a3fa6760b3693f9deede4bb71f11410b746f6118db +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c781a0649d7344619b6ff6ecab11439fab5c0f7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c094946a3d8694dbcde8b26be4698cdb33de3c2fdf0051e03bf5b144733f631 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2df025d71042a6c832437d17b7c2f730f6c995c6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b698164844ba2b60c75fc271e3e2befd1a7a574b8082cfb04a755183341c2a43 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2e2ff2aeb3d8fdca81e81a6e806f75eaf11c12a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58cf8da35d222861657a70232feebb0177e6af0945fd7d1adb1155508e3035e4 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd268eec30a2eb62dc881dff81f02477c1d0ba5f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ede973633c4750809e21db264c098b21baa67b8ed21a7584c019c4cb2e2cb18a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c408746066ddaa4d689cd3a093d8afca00f662b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc9da4094f1a2600bd1ce944a849a9546ebb31bab968f53737b2759e53264f09 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f73a55ab1a3568356368aa6de6dd9a3326510ae6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ae5833d667e1915b0926d4d62de93e300d5e22b5a6c47e78482329dbd418b3 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfb5b5feb34d50f37348d02bc4359ea1a84c1191 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e67137d01a3f0eab93dfb3e33ce3a206973d0633aa35a5b592898da75a7ac53 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8276ffa25649fcc78dc46543e9024d5a4925c26 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1be5bcec5d2c3a0cfd69b00ccccdcb442be4226d98dbac24a3578fd56fd13fe +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25aa75983e7649023746e8ce24fbfa9663f54941 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68a28a4a7631ca31f7175bd155134d4568b0ff18c33ad796a69f4a9df6da835 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac7133942f51ab1f173b19d4330f870aaf6da95b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:185faa0e1eb4740ca25870097f0bf703e95df5a8d0ce7cc60252b71fe082653b +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f39a20d48eab8ca3158f95f9a28c2e5c63714c82 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aba7ddc698c3f9d7841fd7e5f66e0166f5c3863ce06d71a010e8db6668ecccb5 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d623f49d4bf5b147102039c0d6c506b645b57aeb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2f650cf27132b1117d70a875576e2872206c6c095e1110c6a7ab8afcb219c8 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eb6c08168a9d70510ed017b128996caca06d770 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737221405368da3ccaac81c327c0bee80261214d0c61bb9d064bfe4fbc2b2d64 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0afed0a8f1f2ce7dd2157903c582fd1d7029c6a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42c4fd9a8c756c56c1f29c3a9ccd2f294ee9050df46de32723fff2b6f1d9ec65 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3931fb306e09e230b7066761282f31b143b28211 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:900d1618217416f7a3e39f7af6506db6f617fdfaa789a9e84ab81fcaf672fa79 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c7c7401c7541cd5a97774cbdbbaa28861b42060 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d23fa213c409d0be5351a701c756b48221684ccd3c846d48f21c8876a0972c7 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f162edadd67028e8aa022ebddde66f434ba015e4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5249479cf8f43dbdf240066d82104edd04cd9d8ed6c026bd023f6ddcaf5a3b +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab5750d91131ec3c02c3583cdba04be69f0f962d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb75f4ac75017833231fd6c662f1d4602c21b1288f338c0905b46ab931760649 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bf61d95e88e7f40327f85ddb14bd7047d0081e3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ad05f21842c5704ef258232d52b6e225a7d1d6dc87f289fa5fed6ecd98f063 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1150d04f95409768757bb16c09ab271fa5ad60f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2bb48e79557e9a5e347445548fcfce3567ffd08de8ead6f6fd951faecf431c0 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36bbc7fad1b0758b13b54b48ea1863acd0323b27 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cbfb4521e49ce7c8eb4089707bfb0375374f5a4b622248ead3a1ae8668d9fc +size 116013538 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f78dc1171d1f28090b0369d6f80798a4b833004 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86a7b4c5dc4c88164241f2b1ccb9212bd3098237ef942bbeb2a1f02ccb78b209 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc9df0767832d085bdcab1e418b7071b31c75e11 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a665f0801d3518fd32e8b820d07a27dd6135111a6caed82db52f1c803edd4354 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd48c1e38bf728579b6df0f6f87557481d3e9a20 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63ebed25e11f470df28aca633780432270f62e3da30d450fc84937f0b8d081f +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50101693f43305009d20547659b24e068b708fbb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97b34bac084d4c64e807c0092f8e33f8c7e998eb7135778897304224734b2360 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a028c09fd51cb82226f8c12e111f3eda7da4d956 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182368ee26d7d9fed6270b84c9a130b4670d642b5112aaecf4ed0c3d7ebe54c4 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..efa7ae7b8421241a568a93785f11a38b5dc3aafe --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82dbc8338267451ac2015388c7ed96b8567763507a5055bab86a6b9c3cfdf144 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8517e74cccac00d52c83a3256eb1cf033fa185bc --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4075e0e2c36d2ee97bdbc54988bf9c9b72397fd4dc968f733656c437856bfcde +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..292d23b9f0f292dfa728ae6ffaa0cf91ae8fbda4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e5b1260ee710f09bae14d3c5e93c2d68456444c291ad0f2f51916c3ceddc17 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcc9cfa06a3e4b0799a3a4eefb456210621810a4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9384ed9abd48672bf2896ecf175ed707db8dec5e2157e5f345dfe65a98a5f206 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da1631bed318fe5cd5d3409f87f3a6f433a57c6e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d74abc4cb4468df5d9111dc80e3b62c826dd00ddc3a4491640e544288af7b8f +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2520b645e24eab3eeb30f9f79306f623c76d33f3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dfcbf8112c0410b3ec81b4dd63e9046f84521545c44fc9108fdb6f5ced5d4bf +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41218241d86e862a5e2070d7e10b1e9ef5181a04 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de59ccd87aa3a5f12d85a07dcdae3d5a8db757964a92b8aaf4340be67f41fc3 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3b96fa33446e0210c08634c0f85676f54e8045c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b7bb0718d58fec6e2cdcbd7a2bd10d9f889b12067ad9365a30a543e6141218 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f47a903b3fe044a793e0996d58ce29f84b73aed2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90ef5bd1c4927e1cfcee3e4516d4a2872aa4a693ab3729d36fd63b59d3fe127a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ee2f1266821ca875d9ce389236a55cf7405cfd6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03cc6ebc965884fbf6bca34650f8b40cb7e6366ea15ca2de67f64b370520223d +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28e733762ea610e3cfd85b62e818ee718c34a3d0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efa91aa19821be04159041d86773450b9a33810e7a29604e6a5e0d049c8f5b70 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20f701cf1aa6a34da022cc749a62a6dbbf976e85 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1149ee555fd052d8b50a3188c176a8bc81c11c63da7312e69b9668525488db19 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b1a0f9cc4346a86067119637c9f3ea9111c51b0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf3a1cf98c269af959f74760a54df647ea15022b31d82e677508553e645d181 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6169d76c665379c9745d16daed72f69a1e57828c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded00a4d86e3697ac8158738076cf469d69df1386247671cb8ed715be9d707e3 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20330aa588c664810ceeb8d086b275d028fab61a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87ac61aae4e464a51fa0ded0bc7948ac0088e586ec7251f287ea5380bbfe49d4 +size 116013591 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_01-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..81ff29fd8e8b598c467ec22139f0454a3f19611c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538c9e542cf9b0b957aeadcbadd6e95c90aec089b328da3d76c9a3157f97fddb +size 160826627 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_03-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25e7977567cb4fe9d89f57c369091fb5ca93f593 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82bf73fcb459b87b1c8a52208a9441b6b0bb53b1f52a90ad128dad9ef201f3ec +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_04-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b125751ff307b6c3130c503ab611f2f17856fee --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2955c0795a6a37d80fd510f7dff5708d6cd8c9f1391259b209ac6f8a2dec9254 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_05-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6088fe2c8c50982906da2ba937acb5447e513c1e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:431f2a8af31a4063b2101459ed2827be94cf662f4a36c85ee517a582d4b6ebc1 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_06-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..42f59fa475b3d897c312f58262354af2de444080 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33d2f6a5843b2e3e79b58769db2a87ea6e601dc796ae4256e7cb1490c6ead1cd +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_07-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4636770051359092b690c4b08d2e297d192d1097 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b51308adc809d276321813f3b9f864cef41c72777cf4b2c42c70707b8484854 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_08-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04945680c78d8303704f313330644749c6956def --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fee6100d6c9d38baf001469e62b9e93364ee486ced98fc15660d8e0421145db +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_09-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..649a51b4640769b0ad5dc16b2b90a24a2184bcaa --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3824ddd6e2a564269f24d47d960ae3025c34d623a48028967f01f56ace5a96a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_10-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94a8afcf6c733466402140883dbeb91fb0a8b06c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f30e3cd0448e4d791771053d7f59e4d81c5466ef171457009e4a7c0fa9322d1 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_11-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e63c20f0c41e86f5aaf0227261fc7ba28af1724d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508b455649fad75d5af622289de3eb187fd5b0f9244fbeda118f9bc1009a9174 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_12-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..012bc1667e903ed9181c0889b8c54cdd91f897e4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:059e9986021c10347ce9ed582e39241bbfb14a83bae9764d6d0b2da5c900c252 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_13-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2071c99fadc0957e459b656dc8cb752c55891b87 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79676ff0095ccfe3794f72ce4b2c374d6b9242d5dad0044e88182ef097453787 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_14-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e700d1116a1cb2fc0d65107e5f735501d268d0d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54cb13948a291e50ec9c1dd7364503c3a3d5c0a9db205f2ee1e72a126e776d3 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_15-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e432667f1da3a4b3b1f518dc4cfca9365a11b33 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b243147cafb11b079cbb7d96f1de2d62222eee189f48dbb3d0babf703cb8115a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_16-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b71307b3417d1f0802281196b2450643d816e6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baef67816cecaade15d9f6be2d2114619e87cb38896e99e7bd4c6907c228a723 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_17-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0d2c86c78db12831a3ae092abce00bf9ec260b6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c75357679c2075c416dd0ea951b56fdbb354877ec3375bbd23732612a14a5f6 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_18-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9a5a9e43fcc0c61dad3be5a3d7bf9f9a03134a6 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a909f42a7f13ecb953196aa41634be5a27ed521f766a01d24fa480d0408b10 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_19-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b57718553e8015f63d9357fe9a2988d325f2b628 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed5b6b3563f5d434390e1dab73ade47e64a6248b6bd5dd055105e14b29b166bd +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_20-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7efaedc2152daaa7cf48815b86a4e27099a9c4ca --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56b8167f2106ad467d55cc4d3c0fe850e887e435123bcbb56dec5948a7c329a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_21-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c6ee92237cdfd377d81a687a9e4547a2083c8d1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab06113d2fe5ac7f2c8fa360c45ee3909852d662f7980d820dbac878581fb9b5 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_23-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fbc5892d607106b9f18a871b7f23bbc6624a664 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad28d1f19513aede472088f92c39255b1682e4ec238a0c3b7eb311472e8a5d2 +size 7363 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4be665518df292f2fa21897465ba91a0517e809 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step3000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04243e70478aaf3b0d60c3bbed473f07b5e527e89afa79c2b655e5acc9548e3d +size 38451 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b607de05338d2f8f8e65ffaa0b75da9b7947f173 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d7f75632c3f73f2fcd91d0b250fb762094e45fe0f26efe287ce76a78401e5e +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21e5d0b3095deb182a2990d76460331bdebe71d0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdbabd60adf6177e148f5d11363325b09d37337673341a9f6142cf5e8d0555a +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1cc4b53a179541c2bdab84a27319ea0014897d4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea73c04528193abad56075fff006275f8a7d95a19aa7dc78ce58eed624b4923 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f07121fa049828b21c2cb48d92d293e3bb5e185d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f552a5340283c84bd50d27187fbcd3402bc52c7719b009d2bbef4fee9d80e195 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9fe198cef754865c7709eada9baff87a70cf039 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba2f4a3037cb563854302af0d4022314fc156adcd82abc36eb3b846e06753bd +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da755f4c2914b8d4468770986e5709d0845ee5df --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a2cf8af732130a883fcc8ebd680cae08b237c324fb329d881ef3c90cf7b11a +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cfc8f3ec4e7794e7888a7bc708072772b90d273 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8380cdbda0a4ae89cd74c470abb196d14645bf4d1ece867430f36955099a2be8 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4de919485ed8b5b578acf4d651b55b9a39525626 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a3a1df47e07d1c0844015b96e0b40888673d358c893c401e292af73861ad18 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1ca7ee21b5b59b693b256401bd4bd4ff6843448 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c9af607fd8d5167d152b1be8367d544afd7341bcfcb5b1a69f2417e33f307eb +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60a233f9a59f17f0470fa82531653c9aa370f4aa --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f2f318c39d35725ee93eb928abb141b46ff8de8652d168121f4697cf2823b9 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbf6ee00ec507f3871a07fb27707e94923c5f695 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8634288c60b957366efee95d5d95fc2c2f9c772debdc87c27651b145f107fe +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0523648f0bb6906fd77f40af6b28224b31cc0859 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f917beecb92f18b13c49a90bd297daa36deb3caf8c0c84320804e6ff8599082 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2cdf28c9e977aea79b67115f98c98b3343370a4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0db7e0a2337d33b2aa934fcb9b93b15e996006e592a9bfcdaea6542ad9832a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1b6f7a9baf6d9a75552b65fb20899012bc130e0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5691a66feb02e702e9ee946b3df8eb08893930c8e9120c2a63b7b71b354e82 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b05253e03a4217a10aea80e2ca4bc712115cc26a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0485804540469ee10403f32d077b32f1bbc3486b7f99e35d665714b439d6a76b +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad44c17324e7a4e5fa880ddb179d0fb307301793 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f209d1e9a0d822a99105862e6f8f274873aff7dde5fd9a40cc15b20fdec66a98 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ea83671bb0ec7b36d865e4b2690fbbbb20fa503 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:073db85b420e7baa78068726658ee1733930d823b18228f82d8aeca79f401ec2 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a95e8724096e4fe23dd2bd9aa56b6d2ab17d8d59 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5e37e11508bc068ec127de8a0a0e60c6a8d0325dabbbb6eced5c1b0e66ec3b +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd10d8dcf9a12b1429106f9395c1cf71a226140 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b70e54a4cf76027715fc10a4e648ea9bf939136853d7a3b0e898d39ab181528 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1b1d325eb862324c817162ceb67c5b7671cc484 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d93a61c81dc0a6436818850e777aeddcf55703ff7fdb514fb1f5249867b400d +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5e32de9fb68790f0b1dc117596314ecd4dc20e8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30fa12a0338eede0ad55f5687da2678664a59d1547fdbef964d5975f04ad4e47 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75a368e6f63f039e44686aac4be42bf5214b054f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f4a46122f8a2c234cdc81558f4a3ef9bcd3c77597835ffb1cb4865f2a9d389 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ea85cba5278006dc2a0bfdf507a44d20ebf22c2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de5a4766536acaac0f2a470228cf61b03f372d05ac5162dbdfedb3128a6e5b90 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e30452d18c77361739374973e41d9ccb36848270 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba0bb62210f34c60ecff4e92dee49d9d63371b14a053072a679ddbc48dda704 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3237fbab3f91529ded6087b6ec9ee5166f36d51c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b46e59a3856f3cb8689e1f226b8916dff28d1a38a00054574df9c17559a853c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac84203767ebb39cdfaac5c438343bc68e7ea5f1 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb5c02eae33c158a066576feb2fa9e24b2e59b8b6aa007cda6795e6ef633ec5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07ec8ac2282fceaa8008ba921289b5c189999008 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2338402ab06c31eac8f3a0906416a53476b787aa32f4576ef274a3ae3a621df +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecbb908b6ff878e1e8bebe6905cd0c518948321c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb677258abe894965dd232c069e31097a73a141735216d80c7f6eaa3fdcb5256 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03889674b9c99446cc0c3317efbda98db4f1d01f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3786dd2a5086ec2a42343b58f98c3e0653f98fe23651c301531a00d7ef960701 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7962fa7dd0b18c21d26bcf9289cad8ca382fdeab --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8044946dfcd7110a2341ef91498b8556d9b8c550ed7502284adc1b914363c56 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..faadaac5f96c6efbda5d24ececa1b50869a37344 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfa44778bfc7210fd9c33410a56012990ca31c8e649956aeeb8ed2065e2eabe +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52f5ed7103d55427d9b35af009e0e3d418dfa9b5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e725cc0cd3b6d80d5a38217d8fe7549aa001f12492b6504d9aa40512ce54fc25 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c270006e7f45e51ac6207bc84e1a345f000215c8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9b8266e3e9b2bb6aab5b4408ee1fd5e509f9a1ed9422202232a6a4200cae133 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dba70f44e0ceb9bcf59c9d8d1a617414879b4b02 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561f2455c5f5420831dfc66a42cab1372b93bc72e76aca591bf2cfa8a7f45e6b +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4144d20eb2f7821d24519fb3b88950ced4936b7f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bbcaa1c6cd319ab67799184bd3af75bfbfa134318a0561a62d9321bebc307bf +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..403741fea3d2a65df421bff3392efe7e0ae248c4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e70c4a8058f01fc1cf8c0545061b352af05db769b91f724710b4ecab863cb99f +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8f6ac064df783a4fb963ffe27be9e95f1bb6a37 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26e6728987aa82e3ed3535565d61e922fde7e14648bf545ced45c4cb6256086 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..380cd44237d52e8867cb31993786626a7636407d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59659b01c0d18ac7564eab9182babc95c4306e31543b46c233defcbad6fb9d6b +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6990614b6b6746a1405b2124652ce7a4c20f545 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de7bec61d4b9df725bffaa1fe32d1ca0600aa6821c5c67b5198be9ef13021b4 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a6e8eb4fe4180ddb732ce515d7ddc31df2265bc --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30e2fc104b8897a1cb7e882b63a78ef6a3fefbdfaa2aa6bbc5048c8ea075c9e +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bd631b60b9df499291334e3b9186a48fe6f0337 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6916d82c06082999262bc1b5489f04d3ff63f94f4df507a5e4721fcb115406 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15d2d89dcfebed7751380ee6c41f8c663f532fed --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ca042951254f9c2026287db9ce50b8436bb5e6412aab72addd44714c7ca91ed +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f018d38eca9740d15d8fba517187c0ecb9159965 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae234721b43681c242c65040f0bec805b6bd739e06740d689413b44c1ae5c3e +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7652c2299c4c26817c2447ac040b027d4f29f1cf --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578cf6c0c5970d869c05c66efdb74407f7c09bd889b9ff612c91c34f1664d740 +size 116013538 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ed22d728a41fc7f5c5ee68d0bcd56cf2f146fd0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86965ebf6de9ab7ce50ea34e823fd0e73e394f3243307a8f46ab2f215f51a6a2 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b367934af0f291fea132dd25d747dca142bb360 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c13629c04693741517596d2f64766fd37a753d54455ac1c62925949e8c75bfff +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f39486694eb56a481b2b09afa9d9eb27eecd7c2a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7062d21e3412e67e68e68ed624447803dc5fe0d3c3b859593965711dedf97d5 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ca1804b99908cc189c05969a65d6afd907461b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b3069d981d20bddd0e91aed8e05a5f9a5c817764e2006a68abc2be34981756 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01e78137541b6698dbada699477f8a7b5dec5b32 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06ad9c313239e14a26236785e0ef81782ae2e295a63544e6f32e6282720401c +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32d3ff1cb1a44082361b2c94563f6fec2c150618 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ba21f629376f25298e856360b75b9f34e9d0953fe963d3cdf627be4aa424f6f +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c70a0ca2544e9a5d17a7d22b2c1717fa00fc4af --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fdc4c6397ad6bf7d20e2ad9d8a530fc810b416eb7bbc21acdd1b64427cc18c0 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23fe6dedd13e250cf5f32bcf570eb31ac2d532ca --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d068589601df2c0593c179f92eb5bcd016155c4e80dd055c7abd17685181852c +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86317edf19a7165252670ed395264d23305935d9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a249426f66c7ad145a16c9d7f4b84af831b0fcbc2ab6d745785f045500770fc8 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a2da46d0c682fba72d8828894811ce01d102da --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b234ec60d6e5b86dc57f32a62a05bbf15947d30612583227ec6b68d263b3e64a +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3f8b04ab5ebd6b63786ac4fe4272c0c56d72dfc --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:245333831b7f2c453c85300ef238f41ba1f189a7bb7ae29feda06c24865e28bb +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c3f2ca8e8397385a96ed3015947a5ba29c36e21 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e8239b35c31bccbca850a8f60d1420135e905bfeb3dffcdd89090c43b4aafa +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2595b49c99b691746ad646c46256d52cbc26b456 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efc3b57412691908eb4bb0888a92f5d449aa2effec35443631a2c1403eabcf24 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c95de20f65f7861118907b8ceddbd903cf3b949f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c08db03c05b0905ca621fd239794c2a51b2fc5912adc788c4d0b9feb2c8d03b +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1daef49df6e5e5cbd40d58a4bad1d1c990484b8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d769cbeb0273b68aa901a895e021ed521a9566457c2fa84bfea1155bc6ef517d +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfe50878ea49a130b94a7037bcc6fde06bd2e292 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976c950e8c8521d02476c58567da523ccb23fd05b8f8614e034a3be33c2d6da6 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba6d8d414ee64f0ec9078b7818c4a0a4b1b7dabb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f2ca3c4aa6cf7ca21a700acf904dace8e17a093f87da37c175de8fdf5079597 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0530c40f3e3f041039a79c123776d71abf19198a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6e155a9922bd0c7fb273eb55fddf4ff2a54d2fd18f195c5fa2a4151d7df394 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15b5e3d95e16e0806e6c6f1e2c7b74b8df435b97 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba81e2bb901100a17f39a991da0654c55035406270e97f4af6a8cbeaca15b08d +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..961524da5085c97e9f2497223c59b037dffa376d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5226b7dffdada4c7b122cf0da4f04e8d7064f1cbee17c2835d95c0cecef96db1 +size 116013591 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_01-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..073e220e54f76d14552f534f12c8b503847e46ca --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d858b8f123be5be93149924129b221c342c4c6d589b583490d748d1fe5556031 +size 160826627 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_03-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38af123fd3d06f834303d60445e78e2ad4923264 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a2d1d3df3ef5cf00cc1d163098bd4f882d8d9c6248763994da435f5ac1c2d7d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_04-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51cd0531fb0a1ee8110d5a2ab8728895602972ae --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca77f261ff0e6abcdaeda4a4d5ae9545d47adc282b291e40b4439733a648ad3 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_05-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bfc27860b7d87a4ca3504a1b3aa8e4006fd1943 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77775939106cace468e2f586884a9d81ea4d576363ce6af95652c3dd4857f5cf +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_06-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b8cdbbd09d1ceb13d78209c66757b450065b295 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86de32415edb82b0e3fe3e525732d45b02236a8defa9094fd7a88154001d3573 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_07-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56ccda3a76624222a39786c6021a1e4defed3716 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eef746f08867f2d3a98660412620446ac1c12cc2be00b83624ff55f6f4c0c22 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_08-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fed0ba89eb06dcd34665b1c1e365ed278701ab5e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:112fe4ac43bdd0321759b7fbf6d276c56f84f63375d4689800bc9f5fcd9af34e +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_09-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d859be4a62d1a1767406c129a364ff46b02e0f9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35691fe13ebeed61414b146f4ceb0e256e98cc9e82595f6994cd36cc200686fc +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_10-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5816e6ace231432cf3faf70566bc09bf2f53a943 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73aaa427d2ccfb13d9bc7352ffb9850f2400c40d85b88b6dae25ba030822fc0d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_11-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d542d1968488e935f2f190d199648280f7d8dd2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb9254a7fc648451755f92dfce329208eecd535537ddd0cfeed4204420d45b3 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_12-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8167af6688f297207a5419b24793cb2a338174c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a89d8074a13c9c800753dba3fc18cc19c9e832b9a48cb7fe9fc4bad450a919 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_13-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9126211c62d86b66ab7d5899d2b55a0f2bdfb2f5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76663e40e70d9da2c02be985ee256ac3bc87574cd37db97a0d45b4725820b929 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_14-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..614f59ebde66bb8a1f2b6350722dcc6234d827fc --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1df17cd955ac15e1a9b5ab267b19cdd9a32b1b0227e768ad39f936b44f75f006 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_15-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5728162c781ad3aa195efbd1f94a2c8fd27fb99 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5bbb8aea58977f3ceca81fad75a4bfce2fc4a9584f97c15aa1c4cecf1ef0a8 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_16-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d09c5fb11291c5406dbec568b080d8d6a23da0c9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f05dc829121723e77fc659bc8821d9dfc7ce069ebbdb97c34db5c375a45ab67 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_17-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc70cc982dc784aa166db91b1d980a4f7dcffb07 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:974a52ba5279aba12d6b22ef4427d88c0c97a39a2ac28585be1fcede61bfaa0e +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_18-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c9cb2d345273ed7bd84c3286ef66e283f68509a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc83f37fd5441980413fb1d9838d07b1fa1bf93a40151b6e840ec371eb7ff50 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_19-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db951280959ab18cbd620707ccce8f159576e918 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ab019f44134374002097848511441c52be418f2bb0ce7c8e11fd2d8407304e +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_20-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0eef0928c04aac405ee89c8db6cf0ebe87554bd --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:609e59fe46f996ca2e5b0e40809917541cf8529fbd6dcc8916b2d9c62449b80d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_21-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0ee8a92619a5de204af9ecb9edb11595de999db --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639eea937fd3925c455d9104c73fb263e4478312bc4d1f3e246b75e94380afcd +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_23-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..155ba166dc655bad6abc28012a751c1715bac2d0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eaa0d36a05a1017ef4b5af33050aa1921ea771dfe34132631ccf810b59f34d1 +size 7363 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f95433ea3c2919fd0dd1d09bc9c537b29c79e295 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step4000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c939e413738ed335fc1582441dfae0570cc8b96fc68aa4d5b89910a4f3edc557 +size 38451 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f59fea5e46c08a150b68a0c22ec827df9635cf5c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57dde257d5022f6879cf05c9e59498a112ba103b8df392c38c7a2b4581d45e3d +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aada3166359dc4332a3702b29c52890f5f4078d0 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:031b9504c0c65194b4507602d860a7598562817912762b442f10df4657992ea4 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e74c211719d523f23f5ec22b92e432fbaf487229 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d90ac247dfc486966b42860b7c86ec6eaf3825706f3d076614e0498b7cbd3ab +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27893dbc7e825eb5f14ee1114f899e16b29aa191 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743338c75c9340e0d99e36752a5d6edb851174d202461bbc984f436a70210b6b +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5b250040d3e86a67e37f523be830cb3b9551f4b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a282c84150c451801925b8badd67fa035f19fa96fb5caf904ba8842b0c13969 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c29cc18f1aaecc890a4ebea7b1159f03f3219a49 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d6de0fa31c46e35d31928abee7c8457000ad4618a9e74d5fc7922fc3bcee18 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0473a0e97a6d3128dfb3dba1e408856af93df43 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4171b2e9dcc515d50d5b471f2a702057caa7c85385ff596ab9c3b0dbf01d64a9 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd60bfa4429baa785713154032003fc9f667096a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86faca9e1633802de38408fb1d853b1c02bc036d8bba2a58a660cecbaffcf4ff +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8af9c0f29c5debe5e1fd1f64e77385427dcd220d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c188fc474a5b15da75372d95ced0863220e704fdc0603362ed51cfd4c4116fb2 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de4e1cd7206bad731a07fb0a3f5483e4cb11ce5d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09f43a4e9772149c0fdeafce76f5ef7c0e3820a7d9d38245da28f14edf450d2e +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b8400fa632811d43035be0bf5d0cb09de02ec4d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df98edc4ccd80c0f96fc818d05182f7539e303d18d892d8c34db7a83895d4d4 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3f31afa9b172cc7e135ba4a1c3f3bb22cc4c7ac --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed2493722274b1f97ef8098daa2e644e2544d50e942fe6a6530d2cffdab42317 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0640eefce8cc055d22a02a3eac549bb092d21070 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815059a4289910c1db74faf8d6d0d8b2a7fe0b60e72caa0c6b4ac2342ba5df37 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5f2ce208fcff6b4fc817be1973318cdfb3de306 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1efb9055d60d51300f94eb6afe288f424a342742ea45ac8f0670f3a0063c60cd +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f140549a880347af7933b736b4586d53aae880a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd4f5f0586536f28d50e7a1191a567642cb093079b2275d1dfbc6743fe142fc6 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9201cb38aeea73c8b2b1c162a7b2d737273a1c3 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6433a7736548b716992e5dc986769eb7d6daf44f59af48516b785b810715919 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..92032e947deb8a113fa2c6bb2e864c181cd61ced --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c218d7341dab0f75927e8addd63c4d22c5a7c73f27e8b70d74446484f8d5e23c +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38c15561a77f13363892c2ba4e8b5cc65c67f068 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed94e5332cd69cdf629d5d110cdebb756a1242ef73097e466d0d1fca871e6a5d +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..873274b34f6458773858e6b80a7591cb5518a181 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6bf2f2f31f02941d5adcc3008abdeb0c9cc19ff118cfaf0e35a0f72e0696f9 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35a4c72ca064dcb56e0648c41464aa90e121d1ab --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc975ab42a95eae655a91b00544b15f42cb61a63c191ef851ad612830f0d65b +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c002ec9f4a8bf54f1dbdfa6e8df673a868f6d62 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3de9fc085bf1a53dc0377d80964a48276032f45541a52303d234f584866204c +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6e7e7f3b613d4015e3812366684fdab3a9cc4b9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d326967a3fc457c0f77ce9822267567c6caac21ce6990c9a39ddf9fcb6b509b5 +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..536c911c57c5d69e4363a44771e5fa1c9bf0e0a8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68ed1b01ab491d85c7c7f5071f9509c03318d82a0c4e182ff51e1bdc8956bfd6 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a0a44dfcd01faf4b83302cde993350ea8717c80 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e937f176dc8e53497ba2583858c33c274d94746f4f328167e4670a9e3727519a +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f64c0809c52011e7ee52888d4acaee702dc4e47 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebeef9e833d622e9d731eac0ac587ba88d237329b9956851d5e2b3bf56613ac +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77f38e7e4b90977e2445e770ba2f13483899ec96 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae6fafc3ef7c713211bf944fc3b649866769aca861d731c7d75689c07fcef8a +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b940596ac36d622dac9823c7b32ff4a97667807 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738a44f4fff7d2222b898c6d750a7ec0eef3cc8d3cf163ebff86f63c3c3f9cd5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e81243f5680b082e1e2eb064e5afd382b18ea51f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0872d1785f588082cdcda08fb5277c762f04361bc2c043d92546adb531377565 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1675cfbf6be6261ebf0e94f1d66cac2df3193608 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d84de2ca4a27317b1b5b445585e7cfe0401f4f1474154fd4090f38ef760f721 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..979865bd1ca04cd35076ea25eaa4422a325b3612 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a7c917e18b17af88cc7064fb0d144ffc90d1b44295569fbfef2071ba5b5e37c +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74f01831d4b778dbd9bfbe35c58f6a1a7a14a3cb --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd3d0a58969ea6a09f06765bda3f326363720fd2a1305575062b5f1fc6ac4466 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e104303d950f1ff6ca147e81b143425e2eae6c64 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd26911fbc86fbb04ed533aa8942e87fd99c87d7d3a76dda4e920814c203ebbb +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d1320836e1bfb17a4398911ff479a88ca0fd2ef --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50e10fca92e7cd6a30cecb5f8e1d5fffa873b2f3d2b602729dca14c6caf553e1 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc6629f20df6415056fdc520977520da4f4eb843 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7b3005bceb8bed8c62a808271a841737ab1be2d101708922ffda4dafb1183e7 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..832a93c92a013c7a789376f78a03875f797f7a34 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069806c8dcdaf9071406afe777599a482348f64df251c10ec34330120a729a12 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b547b405bae56f309dc9dafc6d53cb6f26949f50 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14cd7828602cd40e757f6c2ffb25cd91668d88f72f8885aa25cf2bd980c4d59f +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e16b0475fa2a87e711898e9df1eb25cb8978ab6a --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33df5c089744c0b12f2ec0bfdd24b407b8564958ca33186b00ec277c1eab0a66 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb71bf4da775b45f19d1988505cea648828db44d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0959778c949df142657d9f9c94ee3c5d5320977bf1e0a9486cba2fbd4b771bcf +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f09c6234c74a0d4cf626f70898e7ec71b51d3a0b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7584bfd4147d96e8ec45fd1b3df22d93f099dd3c7d6a7d6ac9266718156da878 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4341580e80510acbd41c27e545b478fa078be8f4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0ccc7f0ed97525ec97c1fac8e8080c8348863f04b8eaeea509a44fe44189252 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70f109123c9a344b95c7b5c1d298bc4ca8b3a16b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dde1c2557bf70511dc5c78cae3583f4b4d9e211a5da6f266a25d32a6f0adf14 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e94c91d14d2f820d4db7520ed248ca84637577f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6a19936972fc16512dcc36b4329ed2b44362396022865af252b1a353ad3596 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f71310cbd6602059818ec10e67c799bd1f4d948 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510c27cb1911769a0b60a41e2b04237ad801d16aa394bdfbb2eb6cfea2c5689c +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20920f8e8b8ea33ca0fed6052c684ae0c7d3e9e8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55f4b42d756b5756bd3edb84f30798ab7ba58ccbc15ab8b967076e5b56bb529c +size 116013538 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8a5122064ae993f64cb7185208fc9eabecd3442 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cb71aea2828a27f25f8099d97c4b574ed80848f57927dffd36d6af899b41529 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9bdce678053038099542ab55137ab517a17cd9b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9577fb096e577987b349373d84c2a05b1ef37d39f42e258fba966f5d42aff136 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbda0ef11ffb3fa754a8db8addf8c14665c24d75 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42b5ddaa1f1e608afd34262ca24ddd6117543b69ad6ac9b23276da56a349e8e3 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6aeb1d12a6401950358075b6c1fe8ce6e5c39ce7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd59f706d897f4387c9e5f05df940f72a26c374eb5c3215d8c279b1176bc11e2 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b798df6efeb73687d9d6c75bfddfb4ee3a1d7a87 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da1a82b1273c4dcffee1bc1930f1a5a0eaf82d45d90d4d3afc5681f701043b0 +size 116013922 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9376b28f16f2fe354c29a28e61c6a547971cae68 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e1c87d950faebc7e3d50ab9c4a13ce74a3a5b396afae90d402ee4ff4f9dd7c +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34ab2d36ba53489ea4e9d30f2a889ed6ad40f8b4 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1bf8c125d4b11f35d6fd185c50e9dec8486a1f2ded3f17b7c666a1aac9aeb5 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..581b7dbbefc9da999a900b748373f970835f9a5e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1bfb66c040197b725650fd577d749b807ed9bd1795ba4983b340f3339d8901e +size 116013602 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36891311aa911615f00208244708e374ae64289c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44ac8acb2537fa45f8ec95b3ebd0fa0b4f9a88c2bab15f458d57f46aa11284f +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30f49c1496c8f37a6715182e96909021fdd7dd3e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ac438c59fd62225c52f9c1e9268878aba68d883102e0dff6bfc5c9ee5b5fe7 +size 116013858 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..964f9fe12023554481db77296791489f1ad8259d --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c61f533eb61868e52fd0e538d08158e46e85a25540ac448ff175433ff0b43d62 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef68ffadbda2ae3ea8d62625072148294278685b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c10d3ad41ca2303e6db0dfa8b08782ddbaaef160b8e1a38a9fd05a88b8875ac9 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fecdab7875b0c9af7c591d92036ac6ec6b65109 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abee43467dabd12b0d9f4c1857f90d8639612f1f12099110a97ea7dbe63dbe36 +size 116013730 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d2cd5aa3321447ac1ef21b10d83cd84825175ce --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28b944034ec01ff3b88e2351019a22ce88653bf76152d9e61e59ae575e3fe9e7 +size 116013794 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49b73e09354382b4375b2d6fb3f75c5574107dd7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:749c12482c683a5bf85ff4d168237642051beea8f1fd5f1743b60235f3b71d90 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb65c4b14f6621dc72b77fe62f0e25e513093da9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3c3fd937112b1e9683111fadccbd5df03f40af4ae08c145d44c3a61635c7cd6 +size 116013666 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..292545fee56d1be63e1ab97d59bafd3dc01c53b5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9158b9d9740b1e3933de4d288a92d0488ea893d53a6b435bfc9904f0f5f6a5c1 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c011b62219cc4309bded94e05527943a02cebec2 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30a15e05c267fa30a3cfcfc96c2302b8ce0bb73e33f934278ea0011f5aca2454 +size 116013719 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..856fa003f1068631ac4880f0266abbb62c1eb3e9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b55622ef63ab79798813f2ed0064ab599dda37b09df3a33fd8023fdf7aa710 +size 116013655 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..960c4fde8e6946fde7190640546fdb52d6043a03 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c35f74a066383928f19bbc487b5a6faac78a43b8f5229df38e98787cc7ed89 +size 116013591 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_01-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36f42fef9dab78bdc42da48aa447d230ef1ee46e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4397612fc5b12593565dbf2565e26258bcd94be0764a1aee742a8ca202ad6152 +size 160826627 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_03-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5acd71684e631a53dcd1e364d406c0e23be28e75 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b21412ac3fd84c6d0d3fc35d5dab1e2fa8a9151ca25f42b732f7a9b50033d87a +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_04-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1830ddf4d39812e9449262cd5f0ff7d93106a786 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7078eb627a665b27ea4d201f8b4a415e61f6963784af36cf92bbbd455fb42670 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_05-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ca3327cc6c0801d21a0f504689ac58f9d74dc14 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed78947fd20d44e7856f7ba5d1d10936ebc2aae36b898580b1746a39c5656689 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_06-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d62bdd260b5aab32a5edb0db7134ab3665ffae5f --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49d2f645156ce8f49032b82b1bbd3c5ee3b6c5c0863bdaf747dfe7c24efa50d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_07-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4288d680baba5e7180d96b819be149d145057281 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b802a528edf840910d5cf5dae42ed32d995e8147f23e7c85aeed19a137e04750 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_08-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9852288e078cd500287cd66e41fff2aa37bf1d6e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2520aa4ff1f4ca5778cc29ead6dd085219b964c5d4d3d8f2b638d9ffe4c61201 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_09-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c37cbb7cf04d4df2ed43e2415fb2312011ec10f9 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c93bccca2e72bff4417f01f4fcbee343bc0da0d1b03371840b97ca6e3cf25a9 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_10-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89ff23096b003a121dd603820e6172c3db47e09b --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a15c3599ab200f73cc42f1e17d2ac4ba1110442ada23b063cbaa4364e10b033b +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_11-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0385b28847929bf9411ae4aef5eb52a09508b14e --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d761b4459e7a65d25ca417d47151d41a67f450bbce38b6396e54762d24e5e6 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_12-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a16c801144044c5dc090419597aa9bf989a8e8a8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1543f83f3fb0c6f5caeb635f706645606fc8445678a2f2e4c83c56137bd2b4 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_13-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb16400b8576346cc0c34dbd0f94c5ac37a22aab --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc05f2276c7147835de406435bb28baed90a7d8bdafe03ccc9c12a7b9f0f5e14 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_14-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d48db3d1146c1ac8af35240759be3298db7a8422 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f05f55a748d0c80aa61c0f643bc79edf783a55dc356368246d80703b8f1f467 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_15-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d1c8a3a6e46eebdce8a7dae787b517803f69519 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8161714aced1f219cad828a9aa9fc6481abbd8345dc92ac13c317910e22259d3 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_16-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6295d0ec82dc316e7af560073dff216b89495b7 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec1c5ee84040149d61f9356faf745208b9e6d66e417b74b9410116506b026e4c +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_17-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66a5e61f8191634619001d3bf6b303d20ce93911 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5e6494d5ca237eb33be770b73d5d98771e9496d614532d0e40c1d9c00e0df9d +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_18-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a09819d3ae401ba90499206f8f57733291e6daf --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:808dc0cea4d4d21ee71a2a5bfd0255576203075d1616c552986111bb6907dee4 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_19-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39afc19e07a3f490572ac6c3eba0e580a68fc6aa --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3699571d23f629c4c3867d7e3a9016dd8515086175550828edba13a5ae1a833 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_20-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..26c28be2bf85f11c4b00e11127463a96d9b999ec --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2a1e229d728cfa706afad99fb9eaf8ef5ee6087c6aa0ef9a227bf744029ee1b +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_21-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28539fd12e33c9499dc9eca21bc3d6ba62c6940c --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a299a84d5ae1dbf2ce98483b1ad3cd21fd842ac7f0194d720cadd1100314b85 +size 56667395 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_23-model_00-model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..290c713858512d9170e7af734401272e5630e420 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d83ab1388f9ba0160b1333a3bad77ea5b73913390e061617a08ff4d878ab93 +size 7363 diff --git a/619m2b7400m/checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0d5fc70363dc5d7246c1391489d00874c045eb8 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/global_step5000/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca9377a974deacaceaee3182d978cfa0e0e4d14d0b47ec80e658e61f2fbaf3f +size 38515 diff --git a/619m2b7400m/checkpoints_619m2b7400m/latest b/619m2b7400m/checkpoints_619m2b7400m/latest new file mode 100644 index 0000000000000000000000000000000000000000..d84d59d95b20df554651e267ad17833cffa10573 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/latest @@ -0,0 +1 @@ +global_step5111 \ No newline at end of file diff --git a/619m2b7400m/checkpoints_619m2b7400m/zero_to_fp32.py b/619m2b7400m/checkpoints_619m2b7400m/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..e5249853c89179a3d5ad212ead8c464d6c979df5 --- /dev/null +++ b/619m2b7400m/checkpoints_619m2b7400m/zero_to_fp32.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python + +# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, + OPTIMIZER_STATE_DICT, + SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, + ZERO_STAGE, + PARTITION_COUNT, + PARAM_SHAPES, + BUFFER_NAMES) + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage == 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_optim_files(checkpoint_dir): + # XXX: need to test that this simple glob rule works for multi-node setup too + optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, + "*_optim_states.pt")), + key=natural_keys) + + if len(optim_files) == 0: + raise FileNotFoundError( + f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") + + return optim_files + + +def parse_model_state(file): + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = { + k: v.float() + for k, + v in state_dict["module"].items() if k in buffer_names + } + param_shapes = state_dict[PARAM_SHAPES] + + ds_version = state_dict.get(DS_VERSION, None) + + return buffers, param_shapes, ds_version + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dicts.append(torch.load(f, map_location=device)) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage == 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage == 2: + fp32_flat_groups = [ + state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] + for i in range(len(state_dicts)) + ] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], + 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print( + f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_file = get_model_state_file(ds_checkpoint_dir, zero_stage) + buffers, param_shapes, ds_version = parse_model_state(model_file) + print(f'Parsing checkpoint created by deepspeed=={ds_version}') + + if zero_stage == 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, + param_shapes, + fp32_flat_groups, + buffers) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, + param_shapes, + fp32_flat_groups, + buffers) + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, + param_shapes, + fp32_flat_groups, + buffers): + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print( + f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum([ + full_single_fp32_vector.numel() + for full_single_fp32_vector in merged_single_partition_of_fp32_groups + ]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum( + [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + state_dict = OrderedDict() + + # buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print( + f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} " + ) + state_dict[name] = full_single_fp32_vector.narrow( + 0, + offset, + unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError( + f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print( + f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" + ) + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, + param_shapes, + fp32_flat_groups, + buffers): + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + avail_numel = fp32_flat_groups[0].numel() * world_size + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + state_dict = OrderedDict() + + # buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, + offset, + partitioned_numel) + for i in range(world_size)), + 0).narrow(0, + 0, + unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError( + f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print( + f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" + ) + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help= + "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)" + ) + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file) diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1239eb639966e208d087d89a1f37cd43ee9cab7 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9cf655a1db7401d2e3c6049abec5bd0bc98c62d538986f7efe5640447f043da +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71bd23ea75cda13ca8780a24b11a1177c0092c55 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce0bfc4c0129c9da43ff8e1317e76b794b1e78bce05d77c8e29e78ceba37fe7 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..716e1e1a2be3a3dd1d5a678e5814878d06e98edb --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e0bed5da0c3f3fcd8f0f0510c26acf901e50ac586e46ce5dd0fede459eec93 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49af1b43d57e5acd8833f7efdab94f7644fac2d4 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e526324892dad41b92205f0111171877a655bb74827b5674f91aff4592ba11 +size 116013602 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0575fdfa7f847467d146919cb6f0cdd49cba84d9 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d72e6a58cfc9585134a2e35f038250875abe863461b424964b273a308b13ea8 +size 116013858 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d033226cb3b336d581616910cf219f3f93846d94 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:495f4ea7c95d125f9e5b4db8af745a85d872db6530d03f8ebf5f47d9ef23eeb2 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6c15c086e44b7efe650ae3f29325e0253f44433 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:600ee24dfba75e5e19012658edd35095e23643fa830a78f81b34170e845fbc71 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..107fa7ac45ba26673f3a6eb9a15fffa7ed5505e2 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be217d187ff2114e83183019cbdb2d1134c495fab54da9abdbe189f8e5701d2e +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce5085bb4b0aae5ca63b2e21ff93c68e7d5ae2e1 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d81f0b8c57e3d4611053cd05623331901e97a0860494851ccbb53e3939c48f6a +size 116013602 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..216fad3bf52610bbbf6185d10f69407febd23448 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:284f79e00746a9a4f09f8272e4530f0bdfe4976117052439efec3a9c3b5864ae +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cceaade10a1c8b71187ee9487a71ae4f54469a6 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afecc734e8d82ce1eae9a6cd217fdd0a25d21cebb68c2bf1951c9e9648bbb356 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc87bd9311db79b1b3b6e7dee4dc4824bdde75f --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e613bea6e15c1f638c2d161e8849d7040612b84cc1da75763de1fe78cbfbe14a +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5532c9d18b93df693d06988d2f46b6668fd8ead --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069be265e1deb20e35e43ebd3d13f42a54407f1105302f40be90d3509d22fca4 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4c10eb52f73b9c1e3b8aa5afaf5fa13a2f82d29 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e97811a1814dab1acfa5d249f0f4486256f557e01e3b7b6523e8f47f4375325 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7da2f98a6713ecef25c796d5b10fc2344aab587c --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507431e0379980c49caa92f42b5e9e8cf734f4ba573218087178f3adb6c8f640 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..781127117f8988065593f29ad2e9d0332da06b5b --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d92560d8c59bac9022fa520552a3e5a16cc533d8364aab936c612bd065d5f6d +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfec755247759419ec19dee35be241f4bb25dee7 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058e795e5344124047d6cef1ed78e726bf4f18d6d89edc8bcf16389ab8450fe0 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c9e44f1f68c8e45f372b4952d3258675fc857a2 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a616bd0b06c7df29f2d74740209689f8045b3ec94de875cf2a40c53ed4328df +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9a23284fb7e2045f4f28b8d025a2aa9b9f6c8da --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5bf8e99c60ac9bcb1a237829230a0020726054f24759032edde2491df3fda8b +size 116013858 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03abd96dc631867bc186e94011ca624b0a2de109 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb01870e62f19d0e474805979348964b9aa3cc1feabb9464684e97ec5cddd053 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c6daffa84816276a5549bd23eeff778db7fdbcc --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c10a1054ace8d715d18fb2375c365d236616e1e41007d0449f81770d1f15bac +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..920ca236d0e0e2e4a40e62586a152ecf631f4ab5 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8267abec205a77300c24b2ebb363fdde212b52ab083fba8c02e921f9f1e17dc +size 116013602 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30e32d3973b9be69bec0b0af852fff07bd759896 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8723fa68e845557839e7b24a0ef4af428f2e74f633e3e1561b49ec337fbb6923 +size 116013655 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a38de479bae94468c4ac45344a7fa6418e315d1a --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faf159c0ee098102239675ec656504a8545db12c4affbb3d30c58ec28b3deec2 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7aa9435cb3bc1bfcd3f7e6f21491c64589a12bdb --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793531fdb8a197e521164a5f0d585f733b9a3d968b2e570b58b3aa21fa63611e +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82fb7c61fdb8a9eef56113b156bc4db1335fbd18 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70109c012ed4ecd5153c94907d51ae38a09d88b1dbd2ef48717f96d20db4bc06 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e95cd2752ac3e83b42ff5755b7d769c2174ea0f9 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0026bdbaed41a3b72b9b8ec19c179dec8dbe2ab8b33eda98d5d01bc26df6399c +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c47104e86f28e5634d78fe3b3f6da813c412842c --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:272fa612dfeeecf46924372eb601c00fba07a15fa7d7be2b4e6e3c5823157e6a +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51c18322322d0083c5c7ae0dd49ae14d866e7dc8 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cece6c3ec152a70cda6b3356681a0fceb0aa78210cd64a6109721460292fb8e +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d09dd777f08674e8493001a3077415d99978acf8 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71c4c15676fbe67705c33a143dc0ad026340873e6e2fce64277272aeae5164d +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0dcb838036777f72403f12754062b74d89c8a1c5 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:978f4d427a79bdd297b92e39d76b899e6160655be90ae90d9b0b5eea9bf3ba74 +size 116013858 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..894a89469ae80022d852d3a13b7a77aec3fb7e7c --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68497f67f8e36dce0adfc66879a2a8340b0959bafbb4f73d66ac310ee279d3cc +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7deb189578b699b09cb9f9a653063941361b94e7 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1603909ae04ac121d7b2b31055eb6d337ea857a606849196b069fe14ae1f72f +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..813e785331c894bcf4d97d40a46a56b3f7a810c2 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:462dfc01ed6ba2d4772687a7fbb1c77d15cd41e5e23dd44028daa94624416a87 +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15402eb8bf1777ca10a7d180128f0fffb2b59dad --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c80c839f86390e113f92f4a9d72d7e45629497aae00396b38579ee850dc73abc +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fd9bf2b6c8b4b06c504c31daa1ca605a0056b7f --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db4d7de0899cdca73d147723f48ac50ed6542ec617f7a7e112149b0ab4598369 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1019e3af414a4a3a47b6e635d8bad0ec5c0714b --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56be22fe20abfe6f22f45f065de906ac7c773163ce3a8f3dbd306d8da623253a +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..66782434b57d980a55ddd7474925ca139a1a39c4 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2923e95022af0c7af7848476334a5978c7f91634b6edcf06d896b186b703b1c +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e3ee53bc3a02eba54ae750ff627a7932951a74b --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ebbc44ad76100d4fe0bcf89943516cb85b989c1bd534659d9bbbde05960708b +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb8c42c22dda2aebf851423142309544d21469ba --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19cca1c5b953a540c1fe29d94b5ed0c1677c4b00a4428358243fd9215df21a4 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e54eb5406bb55b3ec26ef8cc29da6a0e05c1b226 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155aaf86c37b2c8ef4d0f4bcd52792c8314b14ece5bb0f5779b7776b92e7dd76 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea32b0b2b86b536622d748b3573fb5ffbdd6a5a0 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe3d5f0b62796d61dc2fd36fa337b8927d4cf12394c13da33400e19951ec730 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebd4b3d7a45eb81f5cc1f86c43a2b9a174f3e82c --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe02b08947b84d00d6c7797e3cbef590bad8de7f1dd7728d461db7d002cc345f +size 116013922 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c02ffd2a76808eca5f15ef272cb44925cc23e9da --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46245f6f08ba0899e974960c2ce215bcd8afce093bd7b53a739e4af4de376755 +size 116013538 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..47482cedf2fb0e14cfc636ea5b1b60a35ce5699e --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fdf829f4594bcc6f784c9318a22fd813a6cc709dcd7ead256a551cc6d797d4 +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d905d45268f22eb1c2eff4acf05f04fffb68c4c1 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc200e9e2838ea5e03ee487aacc7c0283e6b64e7f7d6214ea8f6ff54ec76567 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..587bb2288dc24350ee90a74f1a18f642053f9b2e --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f375495f03dee7e462f714be20b7b7566e8515fec44a2f306ee10cd596f842f8 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c123bf254c21145b5fffe8c48a9a4ca9039a5260 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:828907fda797b7010055c596b29dae0d6ca8275e4b393afdb5fde2aecc28680f +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61d01da8d335526e2fec7edb02bb44bca992d9d9 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a869678ab85f6e3238d8639dfdd72c354a99aa18742d7d4b2844155e5f364334 +size 116013922 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..becccc7eece11749660a4c3a7520906589decf74 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647444a8a18ec96e08a3a285d1940b03045aa48b623de66cd90f5144e306a57e +size 116013602 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf89d7f87957d95306557c42dc9b3e495ab01deb --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3742048e121fee8238ea7f8659faedf00d0a6de3aee51f9fc15c3ed8cba053fa +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfa12848fa4c1e5df2227bc0822266df8d401636 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c669dd013fddf8c7b451b4020e1322d6a4b1b0fe68cff154626690c57abc17 +size 116013602 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2b6c6cb0e3ca9923adb2078c257a95d679dae07 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97cde616411b1a707e56cb9e8aa6d10d57bc240c0a439bdfd9a8f19ed6c4541 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6baf2ed47ea9e96dcc4dc09032efcfb8aee0359 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80673f86fb127983ec6c2c2af48591908f7303a96c5eadb350cf237245ff586 +size 116013858 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33856a2a18fafcc5fc7594522e609372cd340f32 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8e6a9e4ba2bef0486c335d18a65498e6a135d21b8406b54c1a0b5fab37e166c +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..042c3dd30d6fce8e9e84e19b9567d0bea4762818 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bca5e230a6b54f0f5e156c6486b44a5046c49acb685bdf8e02538be9b96ae5f4 +size 116013655 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e665dc13f5369f380d708f34ab393b0aa36926c --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0228d56e04bf174f2217d33e86a22aa5d189d943daadbdb261fce90c968fcfd0 +size 116013730 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cff4280a5db2409d8a0d2f0d6cc24098dcd58607 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05dfa131d54522c5eed8c5dbd2fe0f853f7e362686f81c6a67afdde8fc3c5753 +size 116013794 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76815b52fff3d4f353bf4b1c065d963bce44b7f9 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b2affbb7ea8205038205db120592402d2f32620b7eecb771474e207764878b6 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53bc5047658a3d7e50ba26c11a016a83f39c0319 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b618d065fec59a18bc26a9bd8ecaf2d7635855072e396702ad8a10a776ed2d40 +size 116013666 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6707e688b09d918024f31e28c865adacbee942e --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2739cfeb9cc02ade3adc29e65e956b7c39267b5525dfe823fd724868ace55791 +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ae0616cb7cb4776eb91df65a5067f1cbcde1ef2 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52ccf25aa781dfde6e8432c8f1cfaca42086e139ed63a0182d3e8f2fb35189b8 +size 116013719 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71580d0aa9ac433c6aa146ff5e7e7f0f372fb875 --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f098fab37eaa444f37868cacdc9d7c2dab63ba72a870ae8ca7e6b4cda93fd766 +size 116013655 diff --git a/619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bce31135c0df47cff767f83bbb96a5dd1d83e1d --- /dev/null +++ b/619m2b7400m/global_step5111/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1133a4542517ec2c38490b82f7a15421c943a9298a435abba8a649ecb37f98b3 +size 116013591 diff --git a/619m2b7400m/global_step5111/layer_01-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22b1b3d5e4e7322698c99fca19cb5131b2576291 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7027ffd037ad64e8c4768974981c9939b38a8b8711717abf675a1d076c7c8623 +size 160826627 diff --git a/619m2b7400m/global_step5111/layer_03-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79821768a91568b8bbc5a05d074f71ba150e5e8d --- /dev/null +++ b/619m2b7400m/global_step5111/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f62d7145bb18da4ab0ecf4be95233a5152307cfbe2f297cc958cc5081432e61 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_04-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fd9c0a0e3fd23a76b36e5d3c639c28d63bf69e9 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925b99bfa5e83d70c1ac4d12c60349edc8b0de1966faac2424675c40d9627d43 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_05-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d893d0e489d274a726e97b2e090a19428aec87e6 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5cbbef626312c1608c4403b03fa08f2902b2b4a1026a560a4ef77972d111f6 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_06-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b288e630f1ab889d801189a30a680a71407a4445 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:699fc3e13cc975d2ad1817b4382f1837482f36801b545e4beb2de7ca91fc1c05 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_07-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7437ac67f7f6b4233b34868a74e3b601d3f4d46a --- /dev/null +++ b/619m2b7400m/global_step5111/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121051a730b92b9bdde5b2a8aed9da5bdae2428ce695448a53ee2a4dea9a3746 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_08-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6113c9f25b8917237b80d359a9ab50a18e33ce0 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30355e0309a5f07b8b5c45b18b976a3a12cffd21b8f8ac9bfae55f51b37d730f +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_09-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d679c3a36ba74154bb00612b1fde869a91918fa --- /dev/null +++ b/619m2b7400m/global_step5111/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bca53e171c2a368c42993831f751a1191459dd81f225da92bb765dc7be1c571 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_10-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee757b351306de79e73ed4111419297a51f34f22 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afef0016789e4607e083cf4be97d681d66b099b86e096ab70e32bca4acd888c +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_11-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc9db2ba854fc2be17a9749ba3c4ef1a4622f6b7 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51331cd15325d28db0d815ea893ebb9803bee24df8dea552052fa676384e13a0 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_12-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..871239147d861f70427bec9fad3da051a7be6128 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa43165f6e3c0401642b9dee148801455bd7a1f48822ee17283f61150d8ef70 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_13-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e64e79b06e497c676853f528209555b377851c8 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8ff372783973786ddc4ce0df25bc4837fbea551383c6d429308f5120af535b +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_14-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87d7f9ea3e3cd43b6db1ecf78748657a2211355c --- /dev/null +++ b/619m2b7400m/global_step5111/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c58ef6f6d704079175389a1f16a44ee49bf672e49ae8516ea46f5530f53f85b +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_15-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e6dd15f976429c2c33e2c35318f3b682fe67c46 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87f265d2da83540d53514615d0e8e11c9e66033279eb3a0398ce175a3a7f1b57 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_16-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b8832b5bd8723a1406aeebfb13b93c9faa060b4 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3ee6155b9280329aa590187c64d2d31bb04197b4478d3ce0da60afbe3c5638 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_17-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be1de30f7c9070fa56a61120c0e7bb4ee4dbe6d5 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29810c833cd87bd1308e9a17e2485e4da85871aac1c49c8dae7f0e4b6b6125d +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_18-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91afd79818d16409ec217eade9a3a0bd5c8e86da --- /dev/null +++ b/619m2b7400m/global_step5111/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37e0ebddd85b14f0aed8024ea32e81ddbad23a8622e186c61abd575ba4da4fa1 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_19-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a301e17e8bbef7a73569deba552e4109a36045c1 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a036343bb633af0c1b2e8b384a0ea4e18fb905095fb0bfcad6dd2a6fae83ae +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_20-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da88bc7e3ab87661d3f80648fae1088d3213690e --- /dev/null +++ b/619m2b7400m/global_step5111/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa30c700b388758f9d40c2222f013c0f580437876b3d716de24e0bc87ba538c7 +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_21-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..662317a3d0d3c21c674ee77aaefd5c06540fffa1 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ca26cf11730cf75981ea077825df3e8d5ab7fb4077e2071e4261a3ee1ec2ba +size 56667395 diff --git a/619m2b7400m/global_step5111/layer_23-model_00-model_states.pt b/619m2b7400m/global_step5111/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c155383aec129da3ff009c00147de2ad6e500f8 --- /dev/null +++ b/619m2b7400m/global_step5111/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8b5efe34e2126144af3902e06384c7166625fdc69ca6518fc3fa2842bb9b3e +size 7363 diff --git a/619m2b7400m/global_step5111/mp_rank_00_model_states.pt b/619m2b7400m/global_step5111/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e515a49c6c6ac7bbdc60d2ebe3b3ecc3f1b6c0d --- /dev/null +++ b/619m2b7400m/global_step5111/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0d2ee9c799939f3770671c045e00b280080dca604ba81446be6bf5b9dbed654 +size 38515 diff --git a/619m2b7400m/sbatch_619m2b7400m.sh b/619m2b7400m/sbatch_619m2b7400m.sh new file mode 100644 index 0000000000000000000000000000000000000000..e0a1566d95d452e8f7cbf14324e76e2a45474dc9 --- /dev/null +++ b/619m2b7400m/sbatch_619m2b7400m.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=619m2b7400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +# DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_632M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 2680000000 +# -> Samples: 1_308_594 +TRAIN_SAMPLES=1_308_594 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 13_086 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/619m2b7400m/sbatch_619m2b7400mval.sh b/619m2b7400m/sbatch_619m2b7400mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..5391550903f39fcfc394c84db7e3c8787242240f --- /dev/null +++ b/619m2b7400m/sbatch_619m2b7400mval.sh @@ -0,0 +1,170 @@ +#!/bin/bash +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=619m2b7400mval +VARIANT_CKPT=619m2b7400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT +# Start from scratch +#rm -rf "$CHECKPOINT_PATH" "$TENSORBOARD_PATH" + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +# DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_2B7_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_632M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 2680000000 +# -> Samples: 1_308_594 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-load-optim \ + --reset-progress \ + --override-lr-scheduler \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --eval-only true \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678910396.nid005725.121259.0 b/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678910396.nid005725.121259.0 new file mode 100644 index 0000000000000000000000000000000000000000..ed83ba82b62b492fda5786b8286e008cf9733a45 --- /dev/null +++ b/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678910396.nid005725.121259.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcaf0ca8ae0c72dd37f084df427564e853d419a67310490928e7295932d8874 +size 706368 diff --git a/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678911552.nid007238.95948.0 b/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678911552.nid007238.95948.0 new file mode 100644 index 0000000000000000000000000000000000000000..f96e756825b61e34e3725c2d62edc791e5f03f88 --- /dev/null +++ b/619m2b7400m/tensorboard_619m2b7400m/events.out.tfevents.1678911552.nid007238.95948.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beebbe9a96682ee14819a6578f8d6f0772b1bdf8dab273051f1cf385404f3d61 +size 9099912 diff --git a/619m2b7400m/tensorboard_619m2b7400mval/events.out.tfevents.1678915809.nid007238.110373.0 b/619m2b7400m/tensorboard_619m2b7400mval/events.out.tfevents.1678915809.nid007238.110373.0 new file mode 100644 index 0000000000000000000000000000000000000000..d5f0f762027c329ab032e34811afb07619670fd5 --- /dev/null +++ b/619m2b7400m/tensorboard_619m2b7400mval/events.out.tfevents.1678915809.nid007238.110373.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c74c5bfc2fddc752bb577d298ba17e4ff624e5d3b829c7e32aa9b29800e9a7b4 +size 980 diff --git a/7m100m100m/eval.txt b/7m100m100m/eval.txt new file mode 100644 index 0000000000000000000000000000000000000000..6de6de9b06cc5bc40a79eb08455ee803929520d7 --- /dev/null +++ b/7m100m100m/eval.txt @@ -0,0 +1 @@ +8.107364E+00 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f5f0e351b4cccedf9d7114fcc08028730ac3337 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91e39ac7720cd88efa0f2792784d0995bd76bf801e9f5118f60ca32812df6a16 +size 5328663 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..17c00a1083f7a4860e053ef0e7303827fbfb1238 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50524214c32ac1591a7ca179ae352d2d24c1cbc25af9c438a1932355bac9dac8 +size 5328738 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecb0886d879390da4b964ed68300ca1be56f8a8e --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf4eff0fae01461ece233f6f5c827d7276de91a0e7c1742594ebef134752c5e0 +size 5328674 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a6b5adc1d5b0974dabe02d7a27086f4b671b77c --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb9e2a0f09e02c0b2bdee18476c2f6fbf29143a906d2eae4d11f5632ca613492 +size 5328674 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec860a3d6f5db0a37abb55c26691515ff98e03fb --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d2024359a28d77bbb230eab81bbdebaa49a6d636e2b5fcb7ad390187b3a0323 +size 5328738 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c54a259015d7e5586d3e88e4158422b20c9fe01 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ab9e28b46a1192a9329246816b6c8364a2f14f293ece347090682deb5443da +size 5328674 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4943fe7a0fc5b4f4c6870cc8a82b64e54026fbb --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:750f2efa4fd61f98ba33966e7f8af78607c1dc1d3f8a822084ef365fe233e1cc +size 5328930 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27122daab8d4825c5bcdfad630b194cfe823ff9e --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8be589a26588c6c0f4263986b837fc702f03677ef616f7b143e4aa7f618ed0 +size 5328663 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73de7ac2670a200007e07ea72e3d69c783d6fe9a --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1063c6a152da13d10d1c80b807901eea5c8690a26a39c756300c587e9140ef2f +size 5328663 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76b02e45f5a94676dc3fadd343d32838debcb86d --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c7f93a598f10961626a76180aaa0709b39e99c78ff1b760462c14bf16acbc9d +size 5328727 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..909df08ffa19b97f90380410b3bc15bb48efb820 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531a2f6237ab987fc4a2f1f80b17ed0007e471a90d07f94bedbad693df99b9ea +size 5328599 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a93cbf71483ac5becb9d1db4c11edbac17a1d55f --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8aac831f3f14d90d2cb5a292460d3d604063d1e91f925dae4613988c06e9336 +size 5328727 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a98ce9cd5b2466687e783baa73de179f93c7b252 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09be9059c96563f1820729b24a4d4a7d1efba89f95ecdf534895f1efb6ba941b +size 5328663 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..667465472d2b6de734ab31f68d9eb3e0acfe913c --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce505b1046906abd28d585d2a4eeaa49afff654c2598a88f4f03fc553cca9e4a +size 5328727 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe88f0be68ba7014a56a536eba05392147ee385b --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a62508ac67ee77f5f540f0ed683de121dee608599c6ad120885a8a4ee5fa60 +size 5328663 diff --git a/7m100m100m/global_step190/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/7m100m100m/global_step190/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9727c24420977141cb72eb92bf363f6cc8c13fd7 --- /dev/null +++ b/7m100m100m/global_step190/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7a781edd720d3c8509dd73d88f9b337f1bcea9c4b376df11254cc9650225a57 +size 5328663 diff --git a/7m100m100m/global_step190/layer_01-model_00-model_states.pt b/7m100m100m/global_step190/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41615f38d5c4266431bc191f39c97b12c2b8da4a --- /dev/null +++ b/7m100m100m/global_step190/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dd9fd24e83426db607da9de56ffbec877aa9beb4280371e02dc307ba51c2894 +size 13403395 diff --git a/7m100m100m/global_step190/layer_03-model_00-model_states.pt b/7m100m100m/global_step190/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28409c4c3a311dd5857e1153a21c7eb1fc7cfca0 --- /dev/null +++ b/7m100m100m/global_step190/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f360d0adbad0e5638df2813825909d6984834cebe9df58fb8392b54cf94aba7 +size 269251 diff --git a/7m100m100m/global_step190/layer_04-model_00-model_states.pt b/7m100m100m/global_step190/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97cc183afaa34769535a1680ee3c7d99d873b4be --- /dev/null +++ b/7m100m100m/global_step190/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e1d89c75ba47e3b24518e467dfd201455e5209ee6e7642b6c91d212f443f3e8 +size 269251 diff --git a/7m100m100m/global_step190/layer_05-model_00-model_states.pt b/7m100m100m/global_step190/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b00675f5399fbd885fa105f2d7059eb25a78a37 --- /dev/null +++ b/7m100m100m/global_step190/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e42c451c9a9ca671b3032df95728c86de351808a4133292212189a5dba87089c +size 269251 diff --git a/7m100m100m/global_step190/layer_07-model_00-model_states.pt b/7m100m100m/global_step190/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4026d1fdf023ac33223bd4ce9959289e4cd58d51 --- /dev/null +++ b/7m100m100m/global_step190/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7ca97d524a068d70c503b217faea25271ed1bfa94e8e2fd94c7e27050c9016 +size 1731 diff --git a/7m100m100m/global_step190/mp_rank_00_model_states.pt b/7m100m100m/global_step190/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6ffe4015c3d00ed8fb14878ea8e0bc1ed1e5a55 --- /dev/null +++ b/7m100m100m/global_step190/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337ac14e9140d70e974d88f9f29417e17d68da0290367b6dabda485b89aa63cb +size 26291 diff --git a/7m100m100m/tensorboard_7m100m100m/events.out.tfevents.1677508396.nid006608.48416.0 b/7m100m100m/tensorboard_7m100m100m/events.out.tfevents.1677508396.nid006608.48416.0 new file mode 100644 index 0000000000000000000000000000000000000000..7f67b901fc6423073871f77bb5179586551f3589 --- /dev/null +++ b/7m100m100m/tensorboard_7m100m100m/events.out.tfevents.1677508396.nid006608.48416.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de67a4693027cfe521c2df6c0ac915ed4558ae781912f55c9249033993605c41 +size 355309 diff --git a/7m100m100m/tensorboard_7m100m100mval/events.out.tfevents.1677509121.nid007242.99140.0 b/7m100m100m/tensorboard_7m100m100mval/events.out.tfevents.1677509121.nid007242.99140.0 new file mode 100644 index 0000000000000000000000000000000000000000..05f195a5b2892e05e6b57c3955a4fd8fb5f3a9ab --- /dev/null +++ b/7m100m100m/tensorboard_7m100m100mval/events.out.tfevents.1677509121.nid007242.99140.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c46d4d32c47a5117fc4d1d06d82c3715413ec290561a9b82c44e0f07bd7f1c10 +size 980 diff --git a/83m14b100mdedup/3326822.err b/83m14b100mdedup/3326822.err new file mode 100644 index 0000000000000000000000000000000000000000..cf38e734347814b07ce19fe4f01a487c36424f6b --- /dev/null +++ b/83m14b100mdedup/3326822.err @@ -0,0 +1,1111 @@ +0: 2023-03-16 23:32:26.013259: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013279: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013288: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013285: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013296: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:26.013330: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013712: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013719: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013720: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013725: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013728: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013737: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 23:32:26.013722: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014327: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014325: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014334: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014340: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014335: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 23:32:26.014332: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016338: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016345: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016344: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016339: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016351: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016357: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 23:32:26.016358: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022773: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022779: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022771: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 23:32:26.022773: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030324: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030331: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030338: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030334: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030334: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030326: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 23:32:26.030328: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079234: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079256: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079260: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079248: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079249: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079265: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: 2023-03-16 23:32:26.080052: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080060: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080067: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 23:32:26.079262: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080052: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080057: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080055: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080061: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 23:32:26.080058: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 23:32:28.448670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448681: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448696: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448702: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448672: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.448699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:28.449524: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449529: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449533: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449535: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449555: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449559: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449562: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:28.449571: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449290: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449292: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449294: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449296: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449489: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449491: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449289: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:28.449496: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449497: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449499: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449503: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449502: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 23:32:28.449510: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.486653: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.486973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.486663: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.486983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 23:32:28.486666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.486981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 23:32:28.487273: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.487276: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.486985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:28.487281: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.487282: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.487283: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.487288: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 23:32:28.487290: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 23:32:28.487295: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.486991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.486986: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.486981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.487013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:28.487090: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.487376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:28.487379: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 23:32:28.487385: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.487385: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.487388: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:28.487389: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.487392: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 23:32:28.487411: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487491: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487492: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487499: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487502: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487504: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487507: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487509: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 23:32:28.487509: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.491682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491684: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491690: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.491692: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:28.492092: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492098: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492101: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492101: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492104: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492106: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492130: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 23:32:28.492131: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492427: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492423: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492439: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492435: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492435: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:28.492832: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492832: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492837: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492839: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492840: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492843: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492845: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 23:32:28.492852: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.507508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507532: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507551: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507559: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.507566: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:28.508367: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508383: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508395: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508403: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508412: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508410: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 23:32:28.508426: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 23:32:41.094613: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094642: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094548: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094661: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094575: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094677: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094568: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094696: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.094921: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.094617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095300: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095333: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095343: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 23:32:41.095379: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095391: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 23:32:41.095798: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.095415: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 23:32:41.095835: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095869: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095889: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.095907: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096152: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096185: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096200: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096216: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.096647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.096289: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: 2023-03-16 23:32:41.096648: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096650: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096651: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 23:32:41.096695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096663: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 23:32:41.096667: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096668: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096671: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096670: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096674: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096677: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 23:32:41.096678: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096771: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.096795: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097731: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097733: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097732: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097889: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 23:32:41.097735: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097737: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097741: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097750: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097751: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097751: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097754: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097754: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097756: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 23:32:41.097810: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 23:32:41.097824: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097922: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097942: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097965: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.097984: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.098159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 23:32:41.098360: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 23:32:41.098084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 23:32:41.098364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098156: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 23:32:41.098368: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 23:32:41.098171: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.098214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.098988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099000: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099004: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.098999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.098999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.098999: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099003: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099014: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 23:32:41.099023: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099025: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099026: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099028: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 23:32:41.099030: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099821: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099828: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099829: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099837: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099837: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 23:32:41.099843: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099845: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099846: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099846: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099849: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 23:32:41.099854: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 23:32:41.100641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100640: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100645: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 23:32:41.100644: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100651: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 23:32:41.100754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100659: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 23:32:41.100660: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100660: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 23:32:41.100665: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 23:32:41.100665: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 23:32:41.100667: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 23:32:41.100665: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 23:32:41.100756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 23:32:41.100728: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100771: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100772: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100772: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100777: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100777: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100779: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100778: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 23:32:41.100833: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 23:32:41.100846: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097465: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097471: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097473: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097475: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 23:32:41.097481: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097483: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097490: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097489: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097489: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097493: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 23:32:41.097494: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098370: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 23:32:41.098379: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098380: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098381: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098383: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098383: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098387: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098389: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 23:32:41.098389: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +6: Successfully preprocessed all matching files. +6: Successfully preprocessed all matching files. +6: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +2: +2: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +4: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +1: Loading extension module utils... +5: Loading extension module utils... +4: Loading extension module utils... +7: Loading extension module utils... +5: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +5: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +1: Loading extension module utils... +6: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +1: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +3: Loading extension module utils... +6: Loading extension module utils... +1: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +3: +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +2: +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +2: Loading extension module utils... +3: +3: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/83m14b100mdedup/3326822.out b/83m14b100mdedup/3326822.out new file mode 100644 index 0000000000000000000000000000000000000000..0ca13d64b7dec648826063c33ddb1087855e0216 --- /dev/null +++ b/83m14b100mdedup/3326822.out @@ -0,0 +1,9408 @@ +Model parameters: d_model 640 ffw_size 2560 kv_size 64 n_heads 10 n_layers 10 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 10 --hidden-size 640 --num-attention-heads 10 --kv-channels 64 --ffn-hidden-size 2560 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 5_517_578 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-83m14b100mdedup --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_517_578 --lr-warmup-samples 55_176 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_83m14b100mdedup --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_83m14b100mdedup --load checkpoints_83m14b100mdedup --train-weighted-split-paths-path train100mdedup.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3326822.json --zero-stage 0 +START 3326822: Thu 16 Mar 2023 11:32:03 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 41.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 39.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 39.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 40.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 43.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 37.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 37.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +7: Launching on nid006140 (7/8), master nid006133 port 9999, GPUs 8, CUDA: True +1: Launching on nid006134 (1/8), master nid006133 port 9999, GPUs 8, CUDA: True +4: Launching on nid006137 (4/8), master nid006133 port 9999, GPUs 8, CUDA: True +2: Launching on nid006135 (2/8), master nid006133 port 9999, GPUs 8, CUDA: True +3: Launching on nid006136 (3/8), master nid006133 port 9999, GPUs 8, CUDA: True +0: Launching on nid006133 (0/8), master nid006133 port 9999, GPUs 8, CUDA: True +6: Launching on nid006139 (6/8), master nid006133 port 9999, GPUs 8, CUDA: True +5: Launching on nid006138 (5/8), master nid006133 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... False +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3326822.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2560 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 640 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-83m14b100mdedup +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_83m14b100mdedup +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... None +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 5517578 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 55176 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 10 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_83m14b100mdedup +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_83m14b100mdedup +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 5517578 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_dedup_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 23:34:06,081] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.098 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 102 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_cuda.o layer_norm_hip_kernel.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 24.421 seconds +0: time to initialize megatron (seconds): -22.583 +0: [after megatron is initialized] datetime: 2023-03-16 23:34:33 +0: building GPT model ... +0: [2023-03-16 23:34:33,555] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 23:34:33,556] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 23:34:33,556] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.57 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 23:34:35,550] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=17 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: undo +0: 14: MixedFusedLayerNorm +0: 15: EmbeddingPipe +0: 16: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 23:34:35,918] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 23:34:35,919] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 23:34:35,919] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.59 GB, percent = 6.1% +0: setting training iterations to 21553 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 23:34:35,920] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 23:34:49,150] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 23:34:49,151] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 23:34:49,151] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 23:34:49,154] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 23:34:49,154] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 23:34:49,311] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 23:34:49,311] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 23:34:49,311] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.27 GB, percent = 6.2% +0: ninja: no work to do. +0: Time to load utils op: 0.1389157772064209 seconds +0: Time to load utils op: 0.24521636962890625 seconds +0: Time to load utils op: 0.2451949119567871 seconds +0: Time to load utils op: 0.2423079013824463 seconds +0: Time to load utils op: 0.24199676513671875 seconds +0: Time to load utils op: 0.24335241317749023 seconds +0: Time to load utils op: 0.24205255508422852 seconds +0: Time to load utils op: 0.24503397941589355 seconds +5: Time to load utils op: 0.2346949577331543 seconds +5: Time to load utils op: 0.23470497131347656 seconds +5: Time to load utils op: 0.23473644256591797 seconds +5: Time to load utils op: 0.23474502563476562 seconds +5: Time to load utils op: 0.2347729206085205 seconds +5: Time to load utils op: 0.234771728515625 seconds +5: Time to load utils op: 0.23477816581726074 secondsTime to load utils op: 0.23478341102600098 seconds +5: +2: Time to load utils op: 0.24148344993591309 seconds +2: Time to load utils op: 0.24149560928344727 seconds +2: Time to load utils op: 0.24150681495666504 seconds +2: Time to load utils op: 0.24148201942443848 seconds +2: Time to load utils op: 0.2415297031402588 secondsTime to load utils op: 0.2415180206298828 seconds +2: +2: Time to load utils op: 0.24153518676757812 secondsTime to load utils op: 0.24150729179382324 seconds +2: +7: Time to load utils op: 0.23450994491577148 seconds +7: Time to load utils op: 0.23595690727233887 seconds +7: Time to load utils op: 0.2356276512145996 seconds +7: Time to load utils op: 0.2357017993927002 seconds +7: Time to load utils op: 0.2356569766998291 seconds +7: Time to load utils op: 0.23593544960021973 secondsTime to load utils op: 0.23545598983764648 seconds +7: +7: Time to load utils op: 0.23461031913757324 seconds +3: Time to load utils op: 0.24033427238464355 secondsTime to load utils op: 0.24032831192016602 seconds +3: +3: Time to load utils op: 0.24036812782287598 seconds +3: Time to load utils op: 0.24034762382507324 seconds +3: Time to load utils op: 0.24034953117370605 seconds +3: Time to load utils op: 0.24037623405456543 seconds +3: Time to load utils op: 0.2404015064239502 seconds +3: Time to load utils op: 0.24039149284362793 seconds +1: Time to load utils op: 0.24444270133972168 seconds +1: Time to load utils op: 0.24446511268615723 seconds +1: Time to load utils op: 0.24448752403259277 seconds +1: Time to load utils op: 0.24448800086975098 seconds +1: Time to load utils op: 0.2445056438446045 seconds +1: Time to load utils op: 0.24451231956481934 secondsTime to load utils op: 0.24450111389160156 seconds +1: +1: Time to load utils op: 0.24451923370361328 seconds +6: Time to load utils op: 0.2429063320159912 seconds +6: Time to load utils op: 0.24297499656677246 seconds +6: Time to load utils op: 0.24225354194641113 seconds +6: Time to load utils op: 0.2417442798614502 seconds +6: Time to load utils op: 0.2424614429473877 seconds +6: Time to load utils op: 0.24268865585327148 seconds +6: Time to load utils op: 0.24178171157836914 secondsTime to load utils op: 0.24261116981506348 seconds +6: +4: Time to load utils op: 0.24143576622009277 seconds +4: Time to load utils op: 0.24144530296325684 seconds +4: Time to load utils op: 0.24144911766052246 seconds +4: Time to load utils op: 0.24145984649658203 seconds +4: Time to load utils op: 0.24149179458618164 seconds +4: Time to load utils op: 0.24149322509765625 seconds +4: Time to load utils op: 0.24149370193481445 seconds +4: Time to load utils op: 0.241502046585083 seconds +0: Time to load utils op: 0.0005505084991455078 seconds +0: Time to load utils op: 0.0005385875701904297 secondsTime to load utils op: 0.0003993511199951172 seconds +0: +0: Time to load utils op: 0.0003859996795654297 seconds +0: Time to load utils op: 0.000553131103515625 seconds +0: Time to load utils op: 0.0004954338073730469 seconds +0: Time to load utils op: 0.0004985332489013672 seconds +6: Time to load utils op: 0.0008761882781982422 seconds +6: Time to load utils op: 0.0010128021240234375 seconds +6: Time to load utils op: 0.0009658336639404297 seconds +6: Time to load utils op: 0.0012040138244628906 seconds +6: Time to load utils op: 0.0012845993041992188 seconds +6: Time to load utils op: 0.0012972354888916016 seconds +6: Time to load utils op: 0.0011572837829589844 seconds +6: Time to load utils op: 0.0012629032135009766 seconds +1: Time to load utils op: 0.0007221698760986328 seconds +1: Time to load utils op: 0.0007894039154052734 seconds +4: Time to load utils op: 0.001252889633178711 seconds +1: Time to load utils op: 0.0009930133819580078 seconds +1: Time to load utils op: 0.0010111331939697266 seconds +1: Time to load utils op: 0.0010824203491210938 seconds +5: Time to load utils op: 0.0006184577941894531 seconds +1: Time to load utils op: 0.0011050701141357422 secondsTime to load utils op: 0.0011322498321533203 seconds +1: +1: Time to load utils op: 0.0011603832244873047 seconds +4: Time to load utils op: 0.0014483928680419922 seconds +4: Time to load utils op: 0.0014853477478027344 seconds +4: Time to load utils op: 0.0014753341674804688 seconds +4: Time to load utils op: 0.0015003681182861328 seconds +4: Time to load utils op: 0.00144195556640625 seconds +4: Time to load utils op: 0.0015072822570800781 seconds +4: Time to load utils op: 0.0014994144439697266 seconds +5: Time to load utils op: 0.0008511543273925781 secondsTime to load utils op: 0.0008630752563476562 seconds +5: +5: Time to load utils op: 0.0008285045623779297 seconds +5: Time to load utils op: 0.0008349418640136719 seconds +2: Time to load utils op: 0.0009398460388183594 seconds +5: Time to load utils op: 0.0010030269622802734 seconds +5: Time to load utils op: 0.0009596347808837891 seconds +5: Time to load utils op: 0.0010426044464111328 seconds +2: Time to load utils op: 0.0010998249053955078 seconds +3: Time to load utils op: 0.0006604194641113281 seconds +3: Time to load utils op: 0.00042939186096191406 seconds +3: Time to load utils op: 0.0006098747253417969 seconds +2: Time to load utils op: 0.0013551712036132812 seconds +3: Time to load utils op: 0.0008637905120849609 seconds +3: Time to load utils op: 0.00096893310546875 seconds +3: Time to load utils op: 0.0006957054138183594 seconds +2: Time to load utils op: 0.0013761520385742188 seconds +2: Time to load utils op: 0.0014331340789794922 seconds +7: Time to load utils op: 0.0008690357208251953 seconds +3: Time to load utils op: 0.0006806850433349609 seconds +2: Time to load utils op: 0.0013966560363769531 seconds +2: Time to load utils op: 0.0014023780822753906 seconds +3: Time to load utils op: 0.0009396076202392578 seconds +2: Time to load utils op: 0.0014755725860595703 seconds +7: Time to load utils op: 0.0009074211120605469 seconds +7: Time to load utils op: 0.0012202262878417969 seconds +7: Time to load utils op: 0.0018057823181152344 seconds +7: Time to load utils op: 0.0018014907836914062 seconds +7: Time to load utils op: 0.0015742778778076172 seconds +7: Time to load utils op: 0.0016913414001464844 seconds +7: Time to load utils op: 0.0019440650939941406 seconds +0: [2023-03-16 23:34:49,603] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 23:34:49,604] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 23:34:49,604] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.41 GB, percent = 6.2% +0: [2023-03-16 23:34:49,742] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 23:34:49,742] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 23:34:49,742] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:49,858] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 23:34:49,858] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 23:34:49,859] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:49,973] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 23:34:49,973] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:49,973] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,085] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 23:34:50,086] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:50,086] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,199] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 23:34:50,200] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:50,200] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,312] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 23:34:50,312] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:50,312] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,429] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 23:34:50,429] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:50,430] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,539] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 23:34:50,540] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 23:34:50,540] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.42 GB, percent = 6.2% +0: [2023-03-16 23:34:50,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 23:34:50,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 23:34:50,540] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 23:34:50,540] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 23:34:50,541] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 23:34:50,542] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 23:34:50,543] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 23:34:50,543] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00043487548828125 seconds +0: [2023-03-16 23:34:50,544] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 23:34:50,596] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=82741760 (82.742M) TOTAL_PARAMS=82741760 (82.742M) UNIQUE_PARAMS=82741760 (82.742M) +0: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_83m14b100mdedup +0: will not load any checkpoints and will start from random +6: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,602] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-16 23:34:50,603] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,604] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-16 23:34:50,604] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m14b100mdedup/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 7.26 +0: estimated model parameters: 0.08274176 +0: estimated model parameters without embeddings: 0.04923648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 23:34:51 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 5517578 +0: validation: 5632 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.032390 seconds +0: number of documents: 409500 +0: > dataset split: +0: train: +0: document indices in [0, 409500) total of 409500 documents +0: > WARNING: could not find index map files, building the indices on rank 0 ... +0: > last epoch number of samples (13468) is smaller than 95.0% of number of samples per epoch (48281), setting separate_last_epoch to True +0: > elasped time to build and save doc-idx mapping (seconds): 2.910930 +0: using: +0: number of documents: 409500 +0: number of epochs: 115 +0: sequence length: 2048 +0: total number of samples: 5552392 +0: > elasped time to build and save sample-idx mapping (seconds): 0.210359 +0: > building shuffle index with split [0, 5504110) and [5504110, 5552392) ... +0: > elasped time to build and save shuffle-idx mapping (seconds): 0.143702 +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_dedup_100M_text_document_train_indexmap_5517578ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_dedup_100M_text_document_train_indexmap_5517578ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_dedup_100M_text_document_train_indexmap_5517578ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.021 seconds +0: total number of samples: 5552393 +0: total number of epochs: 115 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.053650 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_5632ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_5632ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_5632ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.079 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 23:35:08 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 17973.07 | train/valid/test-data-iterators-setup: 17040.04 +0: [000-000] 0.0827B / 0.0492B +0: [before the start of training step] datetime: 2023-03-16 23:35:08 +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 2227.70263671875 | max allocated: 14316.5146484375 | reserved: 15154.0 | max reserved: 15154.0 +7: iteration 10/ 21553 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.10 | learning rate: 9.279E-06 | global batch size: 256 | lm loss: 1.073584E+01 | grad norm: 4.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 233.073 | TFLOPs: 4.45 | +7: iteration 20/ 21553 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.19 | learning rate: 1.856E-05 | global batch size: 256 | lm loss: 9.991614E+00 | grad norm: 2.011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1321.621 | TFLOPs: 25.25 | +7: iteration 30/ 21553 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.20 | learning rate: 2.784E-05 | global batch size: 256 | lm loss: 9.653863E+00 | grad norm: 1.873 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1309.795 | TFLOPs: 25.03 | +7: iteration 40/ 21553 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.21 | learning rate: 3.712E-05 | global batch size: 256 | lm loss: 9.302061E+00 | grad norm: 1.744 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1244.056 | TFLOPs: 23.77 | +7: iteration 50/ 21553 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.20 | learning rate: 4.640E-05 | global batch size: 256 | lm loss: 8.913966E+00 | grad norm: 1.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1275.640 | TFLOPs: 24.37 | +7: iteration 60/ 21553 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.21 | learning rate: 5.568E-05 | global batch size: 256 | lm loss: 8.506603E+00 | grad norm: 1.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1216.686 | TFLOPs: 23.25 | +7: iteration 70/ 21553 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.20 | learning rate: 6.496E-05 | global batch size: 256 | lm loss: 8.113055E+00 | grad norm: 1.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1283.476 | TFLOPs: 24.52 | +7: iteration 80/ 21553 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.20 | learning rate: 7.424E-05 | global batch size: 256 | lm loss: 7.728979E+00 | grad norm: 1.053 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1277.007 | TFLOPs: 24.40 | +7: iteration 90/ 21553 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.19 | learning rate: 8.351E-05 | global batch size: 256 | lm loss: 7.382808E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.449 | TFLOPs: 25.73 | +7: iteration 100/ 21553 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.19 | learning rate: 9.279E-05 | global batch size: 256 | lm loss: 7.149070E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1320.256 | TFLOPs: 25.23 | +7: iteration 110/ 21553 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.20 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 6.973957E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1311.604 | TFLOPs: 25.06 | +7: iteration 120/ 21553 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.20 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 6.866285E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1253.059 | TFLOPs: 23.94 | +7: iteration 130/ 21553 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.21 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 6.758479E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1206.604 | TFLOPs: 23.05 | +7: iteration 140/ 21553 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.20 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 6.643990E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1265.299 | TFLOPs: 24.18 | +7: iteration 150/ 21553 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.21 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 6.558148E+00 | grad norm: 0.614 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1240.217 | TFLOPs: 23.70 | +7: iteration 160/ 21553 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.19 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 6.497793E+00 | grad norm: 0.717 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.949 | TFLOPs: 25.49 | +7: iteration 170/ 21553 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.19 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 6.429901E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.121 | TFLOPs: 26.03 | +7: iteration 180/ 21553 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.19 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 6.384204E+00 | grad norm: 0.727 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1338.945 | TFLOPs: 25.58 | +7: iteration 190/ 21553 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.22 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 6.361044E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.887 | TFLOPs: 22.07 | +7: iteration 200/ 21553 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.20 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 6.310837E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1282.768 | TFLOPs: 24.51 | +7: iteration 210/ 21553 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.20 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 6.282251E+00 | grad norm: 0.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1265.004 | TFLOPs: 24.17 | +7: iteration 220/ 21553 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.243100E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1317.903 | TFLOPs: 25.18 | +7: iteration 230/ 21553 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.209847E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1326.796 | TFLOPs: 25.35 | +7: iteration 240/ 21553 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.180290E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1182.440 | TFLOPs: 22.59 | +7: iteration 250/ 21553 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.158507E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1166.272 | TFLOPs: 22.28 | +7: iteration 260/ 21553 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.136811E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1324.374 | TFLOPs: 25.30 | +7: iteration 270/ 21553 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.119715E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1325.544 | TFLOPs: 25.33 | +7: iteration 280/ 21553 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.092347E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1277.216 | TFLOPs: 24.40 | +7: iteration 290/ 21553 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.066650E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1294.274 | TFLOPs: 24.73 | +7: iteration 300/ 21553 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.066039E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1248.264 | TFLOPs: 23.85 | +7: iteration 310/ 21553 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.045524E+00 | grad norm: 0.711 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1262.857 | TFLOPs: 24.13 | +7: iteration 320/ 21553 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.032784E+00 | grad norm: 0.648 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1189.420 | TFLOPs: 22.73 | +7: iteration 330/ 21553 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.007415E+00 | grad norm: 0.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1273.557 | TFLOPs: 24.33 | +7: iteration 340/ 21553 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.981931E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1269.094 | TFLOPs: 24.25 | +7: iteration 350/ 21553 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.964911E+00 | grad norm: 0.760 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1195.921 | TFLOPs: 22.85 | +7: iteration 360/ 21553 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.974000E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1340.305 | TFLOPs: 25.61 | +7: iteration 370/ 21553 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.937421E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1160.790 | TFLOPs: 22.18 | +7: iteration 380/ 21553 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.891064E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1261.283 | TFLOPs: 24.10 | +7: iteration 390/ 21553 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.892653E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1278.305 | TFLOPs: 24.42 | +7: iteration 400/ 21553 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.866937E+00 | grad norm: 0.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1319.345 | TFLOPs: 25.21 | +7: iteration 410/ 21553 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.20 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.837734E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1274.653 | TFLOPs: 24.35 | +7: iteration 420/ 21553 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.812018E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1209.357 | TFLOPs: 23.11 | +7: iteration 430/ 21553 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.809445E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1229.909 | TFLOPs: 23.50 | +7: iteration 440/ 21553 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.21 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.777190E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1210.953 | TFLOPs: 23.14 | +7: iteration 450/ 21553 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.21 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.762331E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1238.498 | TFLOPs: 23.66 | +7: iteration 460/ 21553 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.727265E+00 | grad norm: 0.662 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.453 | TFLOPs: 25.48 | +7: iteration 470/ 21553 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.707065E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.112 | TFLOPs: 25.72 | +7: iteration 480/ 21553 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.708226E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.837 | TFLOPs: 26.31 | +7: iteration 490/ 21553 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.688678E+00 | grad norm: 0.955 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.351 | TFLOPs: 26.14 | +7: iteration 500/ 21553 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.20 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.663126E+00 | grad norm: 0.962 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1305.400 | TFLOPs: 24.94 | +7: iteration 510/ 21553 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.22 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.648516E+00 | grad norm: 0.674 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1184.857 | TFLOPs: 22.64 | +7: iteration 520/ 21553 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.20 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.624725E+00 | grad norm: 0.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1309.882 | TFLOPs: 25.03 | +7: iteration 530/ 21553 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.20 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.608421E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1294.567 | TFLOPs: 24.73 | +7: iteration 540/ 21553 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.20 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.590068E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1250.152 | TFLOPs: 23.89 | +7: iteration 550/ 21553 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.571334E+00 | grad norm: 0.802 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1342.879 | TFLOPs: 25.66 | +7: iteration 560/ 21553 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.562922E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1331.192 | TFLOPs: 25.43 | +7: iteration 570/ 21553 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.21 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.533746E+00 | grad norm: 0.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1215.504 | TFLOPs: 23.22 | +7: iteration 580/ 21553 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.20 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.522211E+00 | grad norm: 0.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1293.875 | TFLOPs: 24.72 | +7: iteration 590/ 21553 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.509986E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1357.809 | TFLOPs: 25.94 | +7: iteration 600/ 21553 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.19 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.484922E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.649 | TFLOPs: 25.48 | +7: iteration 610/ 21553 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.20 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.469765E+00 | grad norm: 0.753 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1272.910 | TFLOPs: 24.32 | +7: iteration 620/ 21553 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.444895E+00 | grad norm: 0.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1160.751 | TFLOPs: 22.18 | +7: iteration 630/ 21553 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.19 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.430409E+00 | grad norm: 0.833 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1323.592 | TFLOPs: 25.29 | +7: iteration 640/ 21553 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.19 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.423294E+00 | grad norm: 0.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1349.960 | TFLOPs: 25.79 | +7: iteration 650/ 21553 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.19 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.399799E+00 | grad norm: 0.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1327.817 | TFLOPs: 25.37 | +7: iteration 660/ 21553 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.19 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.382041E+00 | grad norm: 0.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1317.646 | TFLOPs: 25.18 | +7: iteration 670/ 21553 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.20 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.366107E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1291.685 | TFLOPs: 24.68 | +7: iteration 680/ 21553 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.19 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.349443E+00 | grad norm: 0.886 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1335.754 | TFLOPs: 25.52 | +7: iteration 690/ 21553 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.20 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.336282E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1312.002 | TFLOPs: 25.07 | +7: iteration 700/ 21553 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.18 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.326564E+00 | grad norm: 1.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.653 | TFLOPs: 27.07 | +7: iteration 710/ 21553 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.20 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.312223E+00 | grad norm: 0.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1294.008 | TFLOPs: 24.72 | +7: iteration 720/ 21553 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.20 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 5.292926E+00 | grad norm: 0.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1298.358 | TFLOPs: 24.81 | +7: iteration 730/ 21553 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.19 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.272779E+00 | grad norm: 1.099 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1332.188 | TFLOPs: 25.45 | +7: iteration 740/ 21553 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.19 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.272428E+00 | grad norm: 0.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1373.002 | TFLOPs: 26.23 | +7: iteration 750/ 21553 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.19 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.254498E+00 | grad norm: 1.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1338.657 | TFLOPs: 25.58 | +7: iteration 760/ 21553 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.20 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.246263E+00 | grad norm: 1.039 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1285.595 | TFLOPs: 24.56 | +7: iteration 770/ 21553 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.19 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.234694E+00 | grad norm: 0.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1319.735 | TFLOPs: 25.22 | +7: iteration 780/ 21553 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.20 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.220882E+00 | grad norm: 0.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1264.345 | TFLOPs: 24.16 | +7: iteration 790/ 21553 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.20 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.205399E+00 | grad norm: 0.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1285.727 | TFLOPs: 24.57 | +7: iteration 800/ 21553 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.20 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.178880E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1273.347 | TFLOPs: 24.33 | +7: iteration 810/ 21553 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.19 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 5.175276E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1330.450 | TFLOPs: 25.42 | +7: iteration 820/ 21553 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.18 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.173709E+00 | grad norm: 0.775 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.153 | TFLOPs: 26.52 | +7: iteration 830/ 21553 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.157261E+00 | grad norm: 0.867 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1321.275 | TFLOPs: 25.24 | +7: iteration 840/ 21553 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.144273E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1321.735 | TFLOPs: 25.25 | +7: iteration 850/ 21553 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.135125E+00 | grad norm: 0.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.560 | TFLOPs: 26.28 | +7: iteration 860/ 21553 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.126603E+00 | grad norm: 0.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1320.947 | TFLOPs: 25.24 | +7: iteration 870/ 21553 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.18 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.115991E+00 | grad norm: 0.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.803 | TFLOPs: 26.52 | +7: iteration 880/ 21553 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.108521E+00 | grad norm: 0.964 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1347.535 | TFLOPs: 25.75 | +7: iteration 890/ 21553 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.19 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 5.101833E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.082 | TFLOPs: 26.16 | +7: iteration 900/ 21553 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.19 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.073948E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1340.952 | TFLOPs: 25.62 | +7: iteration 910/ 21553 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.19 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.073545E+00 | grad norm: 0.695 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.721 | TFLOPs: 26.00 | +7: iteration 920/ 21553 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.18 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.056391E+00 | grad norm: 0.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.035 | TFLOPs: 26.46 | +7: iteration 930/ 21553 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.19 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.046578E+00 | grad norm: 0.804 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1336.480 | TFLOPs: 25.54 | +7: iteration 940/ 21553 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.19 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.039131E+00 | grad norm: 0.914 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1332.687 | TFLOPs: 25.46 | +7: iteration 950/ 21553 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.20 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.037307E+00 | grad norm: 1.070 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1257.440 | TFLOPs: 24.03 | +7: iteration 960/ 21553 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.19 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 5.022233E+00 | grad norm: 0.877 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.489 | TFLOPs: 26.17 | +7: iteration 970/ 21553 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.19 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 5.010467E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.331 | TFLOPs: 26.14 | +7: iteration 980/ 21553 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.20 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 5.007591E+00 | grad norm: 0.724 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1263.810 | TFLOPs: 24.15 | +7: iteration 990/ 21553 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.19 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.988042E+00 | grad norm: 0.721 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.523 | TFLOPs: 26.20 | +7: iteration 1000/ 21553 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.20 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.979719E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1304.531 | TFLOPs: 24.92 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 4.988493E+00 | lm loss PPL: 1.467152E+02 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:38:35,246] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-16 23:38:35,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:38:35,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:38:35,456] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:38:35,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:38:35,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:38:35,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:38:35,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:38:35,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:38:35,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:38:35,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:38:35,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:38:35,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:38:35,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:38:35,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:38:35,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:38:35,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:38:35,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:38:35,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:38:35,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:38:35,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:38:35,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:38:35,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:38:35,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:38:35,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:38:35,570] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-16 23:38:35,570] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:38:35,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:38:35,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:38:35,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:38:35,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-16 23:38:35,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:38:35,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:38:35,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:38:35,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,646] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 23:38:35,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-16 23:38:35,646] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-16 23:38:35,653] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-16 23:38:35,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-16 23:38:35,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 415.64 +7: iteration 1010/ 21553 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.971087E+00 | grad norm: 0.872 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1094.296 | TFLOPs: 20.91 | +7: iteration 1020/ 21553 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.19 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.965026E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.112 | TFLOPs: 25.72 | +7: iteration 1030/ 21553 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.19 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.964022E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1380.375 | TFLOPs: 26.37 | +7: iteration 1040/ 21553 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.18 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.946037E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.196 | TFLOPs: 27.04 | +7: iteration 1050/ 21553 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.19 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.946555E+00 | grad norm: 1.005 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1332.971 | TFLOPs: 25.47 | +7: iteration 1060/ 21553 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.18 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.935370E+00 | grad norm: 0.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.625 | TFLOPs: 26.49 | +7: iteration 1070/ 21553 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.19 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.915777E+00 | grad norm: 0.875 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1357.837 | TFLOPs: 25.94 | +7: iteration 1080/ 21553 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.18 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.922406E+00 | grad norm: 0.853 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.077 | TFLOPs: 26.65 | +7: iteration 1090/ 21553 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.19 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.910095E+00 | grad norm: 0.731 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1344.971 | TFLOPs: 25.70 | +7: iteration 1100/ 21553 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.19 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.912708E+00 | grad norm: 0.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1350.237 | TFLOPs: 25.80 | +7: iteration 1110/ 21553 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.19 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.892095E+00 | grad norm: 0.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1328.558 | TFLOPs: 25.38 | +7: iteration 1120/ 21553 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.19 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.882834E+00 | grad norm: 0.728 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.878 | TFLOPs: 25.91 | +7: iteration 1130/ 21553 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.18 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.875998E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.539 | TFLOPs: 26.51 | +7: iteration 1140/ 21553 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.19 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.860169E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.733 | TFLOPs: 26.44 | +7: iteration 1150/ 21553 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.19 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.850673E+00 | grad norm: 0.855 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1324.244 | TFLOPs: 25.30 | +7: iteration 1160/ 21553 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.19 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.851175E+00 | grad norm: 1.051 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1324.458 | TFLOPs: 25.31 | +7: iteration 1170/ 21553 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.19 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.850901E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1316.819 | TFLOPs: 25.16 | +7: iteration 1180/ 21553 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.19 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.845541E+00 | grad norm: 0.883 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1372.755 | TFLOPs: 26.23 | +7: iteration 1190/ 21553 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.18 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.824683E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.909 | TFLOPs: 26.71 | +7: iteration 1200/ 21553 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.20 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.806812E+00 | grad norm: 0.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1301.630 | TFLOPs: 24.87 | +7: iteration 1210/ 21553 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.19 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.810196E+00 | grad norm: 0.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1351.534 | TFLOPs: 25.82 | +7: iteration 1220/ 21553 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.19 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.805723E+00 | grad norm: 0.732 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1341.350 | TFLOPs: 25.63 | +7: iteration 1230/ 21553 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.19 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.787849E+00 | grad norm: 0.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1345.048 | TFLOPs: 25.70 | +7: iteration 1240/ 21553 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.19 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.781859E+00 | grad norm: 0.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1351.269 | TFLOPs: 25.82 | +7: iteration 1250/ 21553 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.19 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.780040E+00 | grad norm: 0.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1379.996 | TFLOPs: 26.37 | +7: iteration 1260/ 21553 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.774542E+00 | grad norm: 0.677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.672 | TFLOPs: 26.17 | +7: iteration 1270/ 21553 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.752124E+00 | grad norm: 0.856 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.680 | TFLOPs: 26.19 | +7: iteration 1280/ 21553 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.757315E+00 | grad norm: 0.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1358.564 | TFLOPs: 25.96 | +7: iteration 1290/ 21553 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.737913E+00 | grad norm: 0.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1338.844 | TFLOPs: 25.58 | +7: iteration 1300/ 21553 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.19 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.731004E+00 | grad norm: 0.765 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1373.590 | TFLOPs: 26.24 | +7: iteration 1310/ 21553 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.21 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.736915E+00 | grad norm: 0.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1197.663 | TFLOPs: 22.88 | +7: iteration 1320/ 21553 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.19 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.732215E+00 | grad norm: 0.902 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.827 | TFLOPs: 25.90 | +7: iteration 1330/ 21553 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.19 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.724858E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1345.232 | TFLOPs: 25.70 | +7: iteration 1340/ 21553 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.19 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.710284E+00 | grad norm: 0.776 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1363.959 | TFLOPs: 26.06 | +7: iteration 1350/ 21553 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.19 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.701207E+00 | grad norm: 0.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1340.375 | TFLOPs: 25.61 | +7: iteration 1360/ 21553 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.18 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.697201E+00 | grad norm: 0.835 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.969 | TFLOPs: 26.67 | +7: iteration 1370/ 21553 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.18 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.696406E+00 | grad norm: 0.839 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.487 | TFLOPs: 26.47 | +7: iteration 1380/ 21553 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.19 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.683631E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1343.634 | TFLOPs: 25.67 | +7: iteration 1390/ 21553 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.19 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.675492E+00 | grad norm: 0.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1350.397 | TFLOPs: 25.80 | +7: iteration 1400/ 21553 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.18 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.666925E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.792 | TFLOPs: 26.97 | +7: iteration 1410/ 21553 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.19 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.669301E+00 | grad norm: 0.920 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.042 | TFLOPs: 26.08 | +7: iteration 1420/ 21553 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.19 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.666803E+00 | grad norm: 0.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1338.682 | TFLOPs: 25.58 | +7: iteration 1430/ 21553 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.19 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.660406E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.208 | TFLOPs: 26.31 | +7: iteration 1440/ 21553 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.18 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.656536E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.028 | TFLOPs: 26.63 | +7: iteration 1450/ 21553 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.19 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.638615E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.491 | TFLOPs: 26.41 | +7: iteration 1460/ 21553 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.19 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.648136E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1320.991 | TFLOPs: 25.24 | +7: iteration 1470/ 21553 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.19 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.636689E+00 | grad norm: 0.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.521 | TFLOPs: 26.20 | +7: iteration 1480/ 21553 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.19 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.642252E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1372.440 | TFLOPs: 26.22 | +7: iteration 1490/ 21553 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.18 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.629053E+00 | grad norm: 0.702 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.998 | TFLOPs: 26.58 | +7: iteration 1500/ 21553 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.19 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.611509E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.738 | TFLOPs: 26.34 | +7: iteration 1510/ 21553 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.19 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.618935E+00 | grad norm: 0.708 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1313.513 | TFLOPs: 25.10 | +7: iteration 1520/ 21553 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.18 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.602971E+00 | grad norm: 0.618 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.435 | TFLOPs: 26.97 | +7: iteration 1530/ 21553 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.19 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.606402E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.746 | TFLOPs: 26.19 | +7: iteration 1540/ 21553 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.19 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.603698E+00 | grad norm: 0.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.400 | TFLOPs: 25.99 | +7: iteration 1550/ 21553 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.19 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.591967E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.637 | TFLOPs: 26.34 | +7: iteration 1560/ 21553 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.19 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.590930E+00 | grad norm: 0.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.740 | TFLOPs: 26.44 | +7: iteration 1570/ 21553 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.18 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.582811E+00 | grad norm: 0.734 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.844 | TFLOPs: 26.44 | +7: iteration 1580/ 21553 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.19 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.581321E+00 | grad norm: 0.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1337.391 | TFLOPs: 25.55 | +7: iteration 1590/ 21553 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.19 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.581391E+00 | grad norm: 0.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.414 | TFLOPs: 26.15 | +7: iteration 1600/ 21553 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.19 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.571597E+00 | grad norm: 0.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.033 | TFLOPs: 25.89 | +7: iteration 1610/ 21553 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.18 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.574175E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.590 | TFLOPs: 26.99 | +7: iteration 1620/ 21553 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.19 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.567448E+00 | grad norm: 0.743 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1354.547 | TFLOPs: 25.88 | +7: iteration 1630/ 21553 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.19 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.567069E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.289 | TFLOPs: 26.31 | +7: iteration 1640/ 21553 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.19 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.553430E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.861 | TFLOPs: 25.73 | +7: iteration 1650/ 21553 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.19 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.552691E+00 | grad norm: 0.669 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.500 | TFLOPs: 26.30 | +7: iteration 1660/ 21553 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.19 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.547833E+00 | grad norm: 0.733 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1366.806 | TFLOPs: 26.11 | +7: iteration 1670/ 21553 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.18 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.542608E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.933 | TFLOPs: 26.65 | +7: iteration 1680/ 21553 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.19 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.547117E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.330 | TFLOPs: 26.18 | +7: iteration 1690/ 21553 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.18 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.537572E+00 | grad norm: 0.720 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.974 | TFLOPs: 26.63 | +7: iteration 1700/ 21553 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.19 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.526381E+00 | grad norm: 0.643 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1372.956 | TFLOPs: 26.23 | +7: iteration 1710/ 21553 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.18 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.542380E+00 | grad norm: 0.762 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.934 | TFLOPs: 26.44 | +7: iteration 1720/ 21553 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.19 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.525897E+00 | grad norm: 0.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1339.937 | TFLOPs: 25.60 | +7: iteration 1730/ 21553 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.18 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.524710E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.185 | TFLOPs: 26.66 | +7: iteration 1740/ 21553 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.18 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.515484E+00 | grad norm: 0.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.587 | TFLOPs: 26.51 | +7: iteration 1750/ 21553 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.18 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.515206E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.451 | TFLOPs: 26.66 | +7: iteration 1760/ 21553 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.18 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.511349E+00 | grad norm: 0.933 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.845 | TFLOPs: 26.50 | +7: iteration 1770/ 21553 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.19 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.516087E+00 | grad norm: 0.785 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.952 | TFLOPs: 26.16 | +7: iteration 1780/ 21553 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.18 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.508992E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.395 | TFLOPs: 26.81 | +7: iteration 1790/ 21553 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.19 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.501821E+00 | grad norm: 0.897 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1381.861 | TFLOPs: 26.40 | +7: iteration 1800/ 21553 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.19 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.494095E+00 | grad norm: 0.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1349.733 | TFLOPs: 25.79 | +7: iteration 1810/ 21553 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.21 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.498513E+00 | grad norm: 0.774 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1231.824 | TFLOPs: 23.54 | +7: iteration 1820/ 21553 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.18 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.489198E+00 | grad norm: 0.594 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.153 | TFLOPs: 27.04 | +7: iteration 1830/ 21553 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.19 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.484724E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1329.947 | TFLOPs: 25.41 | +7: iteration 1840/ 21553 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.18 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.484981E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.107 | TFLOPs: 27.06 | +7: iteration 1850/ 21553 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.19 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.482726E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.317 | TFLOPs: 26.16 | +7: iteration 1860/ 21553 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.19 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.479527E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.386 | TFLOPs: 26.43 | +7: iteration 1870/ 21553 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.19 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.467596E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.743 | TFLOPs: 26.09 | +7: iteration 1880/ 21553 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.19 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.478012E+00 | grad norm: 0.704 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1366.432 | TFLOPs: 26.11 | +7: iteration 1890/ 21553 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.19 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.473419E+00 | grad norm: 0.646 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.422 | TFLOPs: 26.26 | +7: iteration 1900/ 21553 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.18 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.468744E+00 | grad norm: 0.700 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.007 | TFLOPs: 27.04 | +7: iteration 1910/ 21553 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.19 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.464024E+00 | grad norm: 0.652 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.780 | TFLOPs: 25.85 | +7: iteration 1920/ 21553 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.18 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.458398E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.547 | TFLOPs: 26.95 | +7: iteration 1930/ 21553 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.19 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.460975E+00 | grad norm: 0.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1354.895 | TFLOPs: 25.89 | +7: iteration 1940/ 21553 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.18 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.453363E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.675 | TFLOPs: 26.51 | +7: iteration 1950/ 21553 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.18 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.458375E+00 | grad norm: 0.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.660 | TFLOPs: 26.63 | +7: iteration 1960/ 21553 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.18 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.449066E+00 | grad norm: 0.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.590 | TFLOPs: 26.66 | +7: iteration 1970/ 21553 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.19 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.439232E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.383 | TFLOPs: 26.30 | +7: iteration 1980/ 21553 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.18 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.438773E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.741 | TFLOPs: 26.51 | +7: iteration 1990/ 21553 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.18 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.435063E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.521 | TFLOPs: 26.91 | +0: [2023-03-16 23:41:43,030] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00019691153209285103, 0.00019691153209285103, 0.00019691153209285103], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 21553 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.18 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.437208E+00 | grad norm: 0.681 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.282 | TFLOPs: 26.96 | +0: steps: 2000 loss: 4.4474 iter time (s): 0.196 samples/sec: 1308.157 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 4.297775E+00 | lm loss PPL: 7.353598E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:41:43,118] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-16 23:41:43,122] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:41:43,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:41:43,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:41:43,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:41:43,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:41:43,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:41:43,218] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:41:43,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:41:43,229] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:41:43,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:41:43,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:41:43,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:41:43,251] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:41:43,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:41:43,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:41:43,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:41:43,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:41:43,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:41:43,285] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:41:43,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:41:43,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:41:43,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:41:43,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:41:43,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:41:43,309] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-16 23:41:43,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:41:43,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:41:43,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:41:43,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,348] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,348] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,349] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,350] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,351] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,351] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,352] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,355] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,355] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-16 23:41:43,356] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-16 23:41:43,356] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-16 23:41:43,357] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-16 23:41:43,357] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-16 23:41:43,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:41:43,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:41:43,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-16 23:41:43,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:41:43,358] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:41:43,358] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-16 23:41:43,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:41:43,359] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 23:41:43,359] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 244.48 +7: iteration 2010/ 21553 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.441891E+00 | grad norm: 0.653 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1171.660 | TFLOPs: 22.39 | +7: iteration 2020/ 21553 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.18 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.437283E+00 | grad norm: 0.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.607 | TFLOPs: 26.45 | +7: iteration 2030/ 21553 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.19 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.438496E+00 | grad norm: 0.637 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.060 | TFLOPs: 26.25 | +7: iteration 2040/ 21553 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.18 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.430029E+00 | grad norm: 0.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.526 | TFLOPs: 26.70 | +7: iteration 2050/ 21553 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.18 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.420475E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1392.413 | TFLOPs: 26.60 | +7: iteration 2060/ 21553 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.18 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.423199E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.748 | TFLOPs: 26.82 | +7: iteration 2070/ 21553 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.21 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.419922E+00 | grad norm: 0.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1233.290 | TFLOPs: 23.56 | +7: iteration 2080/ 21553 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.19 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.415836E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1341.022 | TFLOPs: 25.62 | +7: iteration 2090/ 21553 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.18 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.413730E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.584 | TFLOPs: 26.59 | +7: iteration 2100/ 21553 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.19 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.408709E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.564 | TFLOPs: 26.28 | +7: iteration 2110/ 21553 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.19 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.410990E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1357.234 | TFLOPs: 25.93 | +7: iteration 2120/ 21553 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.19 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.404310E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1359.882 | TFLOPs: 25.98 | +7: iteration 2130/ 21553 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.18 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.403423E+00 | grad norm: 0.710 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.656 | TFLOPs: 26.99 | +7: iteration 2140/ 21553 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.19 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.398629E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.160 | TFLOPs: 26.08 | +7: iteration 2150/ 21553 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.19 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.400727E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.593 | TFLOPs: 25.92 | +7: iteration 2160/ 21553 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.18 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.397945E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.402 | TFLOPs: 27.00 | +7: iteration 2170/ 21553 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.18 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.390277E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.962 | TFLOPs: 27.02 | +7: iteration 2180/ 21553 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.18 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.393130E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.449 | TFLOPs: 26.97 | +7: iteration 2190/ 21553 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.18 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.385589E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.714 | TFLOPs: 26.72 | +7: iteration 2200/ 21553 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.18 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.380470E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.119 | TFLOPs: 27.02 | +7: iteration 2210/ 21553 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.19 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.386985E+00 | grad norm: 0.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1345.691 | TFLOPs: 25.71 | +7: iteration 2220/ 21553 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.19 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.382993E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1357.131 | TFLOPs: 25.93 | +7: iteration 2230/ 21553 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.19 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.377343E+00 | grad norm: 0.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.574 | TFLOPs: 26.28 | +7: iteration 2240/ 21553 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.19 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.373732E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.251 | TFLOPs: 25.91 | +7: iteration 2250/ 21553 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.19 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.368527E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.869 | TFLOPs: 26.42 | +7: iteration 2260/ 21553 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.18 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.374615E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.182 | TFLOPs: 26.73 | +7: iteration 2270/ 21553 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.18 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.376308E+00 | grad norm: 0.808 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.886 | TFLOPs: 26.54 | +7: iteration 2280/ 21553 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.18 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.352565E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.934 | TFLOPs: 27.05 | +7: iteration 2290/ 21553 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.19 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.367782E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1349.434 | TFLOPs: 25.78 | +7: iteration 2300/ 21553 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.19 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.354036E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.132 | TFLOPs: 26.03 | +7: iteration 2310/ 21553 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.18 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.357003E+00 | grad norm: 0.597 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.457 | TFLOPs: 27.03 | +7: iteration 2320/ 21553 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.19 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.356146E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.494 | TFLOPs: 26.43 | +7: iteration 2330/ 21553 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.18 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.356844E+00 | grad norm: 0.799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.195 | TFLOPs: 26.56 | +7: iteration 2340/ 21553 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.18 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.343206E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.330 | TFLOPs: 26.98 | +7: iteration 2350/ 21553 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.18 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.346402E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.727 | TFLOPs: 26.53 | +7: iteration 2360/ 21553 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.18 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.349184E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.731 | TFLOPs: 26.69 | +7: iteration 2370/ 21553 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.18 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.345761E+00 | grad norm: 0.580 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.752 | TFLOPs: 27.03 | +7: iteration 2380/ 21553 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.19 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.340083E+00 | grad norm: 0.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.654 | TFLOPs: 26.21 | +7: iteration 2390/ 21553 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.18 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.343600E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.945 | TFLOPs: 26.48 | +7: iteration 2400/ 21553 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.19 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.336867E+00 | grad norm: 0.745 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1365.977 | TFLOPs: 26.10 | +7: iteration 2410/ 21553 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.18 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.337428E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.838 | TFLOPs: 26.92 | +7: iteration 2420/ 21553 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.19 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.328009E+00 | grad norm: 0.656 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1363.968 | TFLOPs: 26.06 | +7: iteration 2430/ 21553 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.19 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.330214E+00 | grad norm: 0.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1369.427 | TFLOPs: 26.16 | +7: iteration 2440/ 21553 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.18 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.329007E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.805 | TFLOPs: 26.52 | +7: iteration 2450/ 21553 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.18 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.330043E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.853 | TFLOPs: 26.46 | +7: iteration 2460/ 21553 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.18 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.313327E+00 | grad norm: 0.696 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.017 | TFLOPs: 26.96 | +7: iteration 2470/ 21553 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.18 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.321852E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.730 | TFLOPs: 26.95 | +7: iteration 2480/ 21553 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.19 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.328387E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.151 | TFLOPs: 26.27 | +7: iteration 2490/ 21553 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.18 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.319683E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.974 | TFLOPs: 26.96 | +7: iteration 2500/ 21553 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.18 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.321442E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.873 | TFLOPs: 26.98 | +7: iteration 2510/ 21553 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.18 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.319791E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.230 | TFLOPs: 26.98 | +7: iteration 2520/ 21553 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.18 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.310916E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.940 | TFLOPs: 26.98 | +7: iteration 2530/ 21553 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.18 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.317565E+00 | grad norm: 0.605 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.985 | TFLOPs: 26.98 | +7: iteration 2540/ 21553 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.18 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.320458E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.850 | TFLOPs: 26.44 | +7: iteration 2550/ 21553 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.18 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.309078E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.401 | TFLOPs: 26.47 | +7: iteration 2560/ 21553 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.18 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.306112E+00 | grad norm: 0.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.848 | TFLOPs: 26.96 | +7: iteration 2570/ 21553 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.18 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.303923E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.073 | TFLOPs: 26.81 | +7: iteration 2580/ 21553 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.18 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.306987E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.060 | TFLOPs: 26.92 | +7: iteration 2590/ 21553 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.19 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.302839E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1327.207 | TFLOPs: 25.36 | +7: iteration 2600/ 21553 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.19 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.305293E+00 | grad norm: 0.644 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.063 | TFLOPs: 26.20 | +7: iteration 2610/ 21553 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.19 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.296361E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.026 | TFLOPs: 26.31 | +7: iteration 2620/ 21553 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.18 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.298802E+00 | grad norm: 0.794 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.954 | TFLOPs: 27.00 | +7: iteration 2630/ 21553 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.18 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.293317E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.713 | TFLOPs: 26.46 | +7: iteration 2640/ 21553 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.18 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.289262E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.389 | TFLOPs: 27.00 | +7: iteration 2650/ 21553 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.19 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.297715E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1376.108 | TFLOPs: 26.29 | +7: iteration 2660/ 21553 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.18 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.287108E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.433 | TFLOPs: 26.99 | +7: iteration 2670/ 21553 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.18 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.291749E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.987 | TFLOPs: 27.00 | +7: iteration 2680/ 21553 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.18 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.293131E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.288 | TFLOPs: 26.72 | +7: iteration 2690/ 21553 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.18 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.285867E+00 | grad norm: 0.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.623 | TFLOPs: 26.63 | +7: iteration 2700/ 21553 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.18 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.280908E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.820 | TFLOPs: 26.96 | +7: iteration 2710/ 21553 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.18 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.287123E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.513 | TFLOPs: 26.97 | +7: iteration 2720/ 21553 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.20 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.287958E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1306.641 | TFLOPs: 24.97 | +7: iteration 2730/ 21553 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.18 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.273272E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.199 | TFLOPs: 27.02 | +7: iteration 2740/ 21553 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.18 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.272800E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.231 | TFLOPs: 27.02 | +7: iteration 2750/ 21553 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.18 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.278469E+00 | grad norm: 0.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.509 | TFLOPs: 27.01 | +7: iteration 2760/ 21553 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.18 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.270573E+00 | grad norm: 0.577 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.098 | TFLOPs: 27.02 | +7: iteration 2770/ 21553 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.18 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.270861E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.282 | TFLOPs: 26.68 | +7: iteration 2780/ 21553 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.18 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.266187E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.954 | TFLOPs: 26.98 | +7: iteration 2790/ 21553 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.19 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.264288E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1357.425 | TFLOPs: 25.94 | +7: iteration 2800/ 21553 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.18 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.273402E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.194 | TFLOPs: 26.92 | +7: iteration 2810/ 21553 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.18 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.260828E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.661 | TFLOPs: 26.51 | +7: iteration 2820/ 21553 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.18 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.257616E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.389 | TFLOPs: 26.93 | +7: iteration 2830/ 21553 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.18 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.263500E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.073 | TFLOPs: 26.50 | +7: iteration 2840/ 21553 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.18 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.264570E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.078 | TFLOPs: 27.00 | +7: iteration 2850/ 21553 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.18 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.254689E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.646 | TFLOPs: 26.76 | +7: iteration 2860/ 21553 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.18 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.261912E+00 | grad norm: 0.613 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.038 | TFLOPs: 26.71 | +7: iteration 2870/ 21553 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.18 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.252793E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.873 | TFLOPs: 26.48 | +7: iteration 2880/ 21553 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.18 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.248394E+00 | grad norm: 0.593 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.471 | TFLOPs: 26.70 | +7: iteration 2890/ 21553 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.18 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.252703E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.441 | TFLOPs: 26.47 | +7: iteration 2900/ 21553 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.18 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.249340E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.190 | TFLOPs: 26.96 | +7: iteration 2910/ 21553 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.18 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.248116E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.910 | TFLOPs: 26.98 | +7: iteration 2920/ 21553 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.18 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.241092E+00 | grad norm: 0.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.995 | TFLOPs: 26.73 | +7: iteration 2930/ 21553 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.18 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.246082E+00 | grad norm: 0.612 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.346 | TFLOPs: 26.51 | +7: iteration 2940/ 21553 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.18 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.246945E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.191 | TFLOPs: 26.96 | +7: iteration 2950/ 21553 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.18 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.247636E+00 | grad norm: 0.665 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1402.875 | TFLOPs: 26.80 | +7: iteration 2960/ 21553 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.18 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.243487E+00 | grad norm: 0.676 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.586 | TFLOPs: 26.91 | +7: iteration 2970/ 21553 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.18 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.245314E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.467 | TFLOPs: 26.99 | +7: iteration 2980/ 21553 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.18 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.236769E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.384 | TFLOPs: 26.99 | +7: iteration 2990/ 21553 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.18 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.232218E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.236 | TFLOPs: 26.98 | +7: iteration 3000/ 21553 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.18 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.237989E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.703 | TFLOPs: 26.97 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 4.126336E+00 | lm loss PPL: 6.195050E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:44:47,569] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-16 23:44:47,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:44:47,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:44:47,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:44:47,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:44:47,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:44:47,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:44:47,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:44:47,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:44:47,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:44:47,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:44:47,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:44:47,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:44:47,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:44:47,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:44:47,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:44:47,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:44:47,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:44:47,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:44:47,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:44:47,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:44:47,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:44:47,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:44:47,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:44:47,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:44:47,759] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-16 23:44:47,759] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:44:47,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:44:47,778] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:44:47,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,800] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,800] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,803] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,805] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,805] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-16 23:44:47,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:44:47,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:44:47,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 241.39 +7: iteration 3010/ 21553 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.21 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.227590E+00 | grad norm: 0.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1202.053 | TFLOPs: 22.97 | +7: iteration 3020/ 21553 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.18 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.229546E+00 | grad norm: 0.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.410 | TFLOPs: 26.74 | +7: iteration 3030/ 21553 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.18 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.227672E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.125 | TFLOPs: 27.08 | +7: iteration 3040/ 21553 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.18 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.226134E+00 | grad norm: 0.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.305 | TFLOPs: 26.85 | +7: iteration 3050/ 21553 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.18 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.227320E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.205 | TFLOPs: 27.08 | +7: iteration 3060/ 21553 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.18 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.218733E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.708 | TFLOPs: 27.09 | +7: iteration 3070/ 21553 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.18 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.229634E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.027 | TFLOPs: 27.06 | +7: iteration 3080/ 21553 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.18 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.217741E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.380 | TFLOPs: 26.55 | +7: iteration 3090/ 21553 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.18 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.226797E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.252 | TFLOPs: 27.08 | +7: iteration 3100/ 21553 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.18 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.228262E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.933 | TFLOPs: 26.82 | +7: iteration 3110/ 21553 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.18 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.219043E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.677 | TFLOPs: 27.05 | +7: iteration 3120/ 21553 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.18 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.219038E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.612 | TFLOPs: 27.07 | +7: iteration 3130/ 21553 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.18 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.217791E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.422 | TFLOPs: 27.02 | +7: iteration 3140/ 21553 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.18 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.217616E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.229 | TFLOPs: 27.02 | +7: iteration 3150/ 21553 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.19 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.218243E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1361.754 | TFLOPs: 26.02 | +7: iteration 3160/ 21553 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.18 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.219657E+00 | grad norm: 0.627 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.196 | TFLOPs: 27.02 | +7: iteration 3170/ 21553 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.18 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.203500E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.049 | TFLOPs: 26.83 | +7: iteration 3180/ 21553 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.18 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.204434E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.151 | TFLOPs: 26.47 | +7: iteration 3190/ 21553 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.18 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.201901E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.427 | TFLOPs: 27.01 | +7: iteration 3200/ 21553 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.18 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.208718E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.738 | TFLOPs: 26.99 | +7: iteration 3210/ 21553 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.18 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.202372E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.906 | TFLOPs: 27.01 | +7: iteration 3220/ 21553 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.19 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.199363E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.667 | TFLOPs: 25.48 | +7: iteration 3230/ 21553 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.18 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.201097E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.373 | TFLOPs: 27.00 | +7: iteration 3240/ 21553 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.18 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.199615E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.719 | TFLOPs: 27.03 | +7: iteration 3250/ 21553 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.18 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.204938E+00 | grad norm: 0.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.439 | TFLOPs: 27.02 | +7: iteration 3260/ 21553 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.18 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.204086E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.337 | TFLOPs: 27.00 | +7: iteration 3270/ 21553 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.18 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.199723E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.083 | TFLOPs: 27.04 | +7: iteration 3280/ 21553 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.18 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.187920E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.397 | TFLOPs: 27.04 | +7: iteration 3290/ 21553 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.18 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.193433E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.381 | TFLOPs: 27.04 | +7: iteration 3300/ 21553 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.18 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.181998E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.985 | TFLOPs: 26.71 | +7: iteration 3310/ 21553 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.18 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.197295E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.259 | TFLOPs: 27.02 | +7: iteration 3320/ 21553 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.18 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.190192E+00 | grad norm: 0.590 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.283 | TFLOPs: 26.98 | +7: iteration 3330/ 21553 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.18 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.185132E+00 | grad norm: 0.620 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.167 | TFLOPs: 26.69 | +7: iteration 3340/ 21553 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.18 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.183635E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.686 | TFLOPs: 26.95 | +7: iteration 3350/ 21553 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.18 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.185477E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.917 | TFLOPs: 26.65 | +7: iteration 3360/ 21553 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.19 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.176020E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.335 | TFLOPs: 26.33 | +7: iteration 3370/ 21553 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.19 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.181207E+00 | grad norm: 0.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.303 | TFLOPs: 26.32 | +7: iteration 3380/ 21553 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.19 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.184956E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.787 | TFLOPs: 26.21 | +7: iteration 3390/ 21553 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.18 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.192490E+00 | grad norm: 0.604 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.855 | TFLOPs: 26.59 | +7: iteration 3400/ 21553 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.19 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.184830E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1361.246 | TFLOPs: 26.01 | +7: iteration 3410/ 21553 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.18 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.179413E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.951 | TFLOPs: 26.96 | +7: iteration 3420/ 21553 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.18 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.169230E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.316 | TFLOPs: 26.98 | +7: iteration 3430/ 21553 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.18 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.173833E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.413 | TFLOPs: 26.97 | +7: iteration 3440/ 21553 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.18 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.183514E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.985 | TFLOPs: 26.98 | +7: iteration 3450/ 21553 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.18 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.171655E+00 | grad norm: 0.615 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.303 | TFLOPs: 26.98 | +7: iteration 3460/ 21553 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.18 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.174057E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.951 | TFLOPs: 26.96 | +7: iteration 3470/ 21553 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.18 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.167528E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.070 | TFLOPs: 26.96 | +7: iteration 3480/ 21553 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.18 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.164120E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.027 | TFLOPs: 26.94 | +7: iteration 3490/ 21553 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.18 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.166887E+00 | grad norm: 0.579 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.944 | TFLOPs: 26.86 | +7: iteration 3500/ 21553 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.18 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.167406E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.143 | TFLOPs: 26.87 | +7: iteration 3510/ 21553 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.18 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.171104E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.539 | TFLOPs: 26.85 | +7: iteration 3520/ 21553 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.18 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.168483E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.482 | TFLOPs: 26.85 | +7: iteration 3530/ 21553 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.18 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.168866E+00 | grad norm: 0.661 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.784 | TFLOPs: 26.86 | +7: iteration 3540/ 21553 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.20 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.163160E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1262.655 | TFLOPs: 24.12 | +7: iteration 3550/ 21553 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.18 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.160950E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.126 | TFLOPs: 26.90 | +7: iteration 3560/ 21553 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.18 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 4.165687E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.379 | TFLOPs: 26.62 | +7: iteration 3570/ 21553 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.18 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.168067E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.549 | TFLOPs: 26.91 | +7: iteration 3580/ 21553 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.18 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.154307E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.813 | TFLOPs: 26.94 | +7: iteration 3590/ 21553 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.18 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.155849E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.425 | TFLOPs: 26.91 | +7: iteration 3600/ 21553 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.18 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.159106E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.174 | TFLOPs: 26.71 | +7: iteration 3610/ 21553 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.18 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 4.158405E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.785 | TFLOPs: 26.88 | +7: iteration 3620/ 21553 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.18 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.154534E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.780 | TFLOPs: 26.86 | +7: iteration 3630/ 21553 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.19 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.158395E+00 | grad norm: 0.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.825 | TFLOPs: 25.85 | +7: iteration 3640/ 21553 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.18 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 4.148129E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.143 | TFLOPs: 26.98 | +7: iteration 3650/ 21553 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.18 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.152771E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.466 | TFLOPs: 26.95 | +7: iteration 3660/ 21553 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.18 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.145613E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.439 | TFLOPs: 26.99 | +7: iteration 3670/ 21553 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.18 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 4.147853E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.244 | TFLOPs: 26.96 | +7: iteration 3680/ 21553 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.18 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 4.147783E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.500 | TFLOPs: 26.97 | +7: iteration 3690/ 21553 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.18 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 4.139599E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.080 | TFLOPs: 26.96 | +7: iteration 3700/ 21553 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.18 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 4.147760E+00 | grad norm: 0.563 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.532 | TFLOPs: 26.99 | +7: iteration 3710/ 21553 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.19 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 4.142347E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.445 | TFLOPs: 26.34 | +7: iteration 3720/ 21553 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.19 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 4.146368E+00 | grad norm: 0.628 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1362.281 | TFLOPs: 26.03 | +7: iteration 3730/ 21553 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.18 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 4.146245E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.202 | TFLOPs: 26.70 | +7: iteration 3740/ 21553 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.18 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 4.141906E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.750 | TFLOPs: 27.01 | +7: iteration 3750/ 21553 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.18 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 4.135209E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.364 | TFLOPs: 27.00 | +7: iteration 3760/ 21553 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.18 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 4.135703E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.514 | TFLOPs: 27.01 | +7: iteration 3770/ 21553 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.18 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 4.137571E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.481 | TFLOPs: 26.49 | +7: iteration 3780/ 21553 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.19 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 4.140828E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.699 | TFLOPs: 25.90 | +7: iteration 3790/ 21553 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.18 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 4.141749E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.699 | TFLOPs: 27.01 | +7: iteration 3800/ 21553 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.18 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 4.133888E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.182 | TFLOPs: 26.98 | +7: iteration 3810/ 21553 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.18 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 4.136845E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.496 | TFLOPs: 27.01 | +7: iteration 3820/ 21553 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.18 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 4.132127E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.308 | TFLOPs: 27.02 | +7: iteration 3830/ 21553 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.18 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 4.134805E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.618 | TFLOPs: 27.03 | +7: iteration 3840/ 21553 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.18 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 4.126903E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.931 | TFLOPs: 27.00 | +7: iteration 3850/ 21553 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.20 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 4.135069E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1273.103 | TFLOPs: 24.32 | +7: iteration 3860/ 21553 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.18 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 4.127682E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.624 | TFLOPs: 27.09 | +7: iteration 3870/ 21553 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.18 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 4.122889E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.235 | TFLOPs: 27.10 | +7: iteration 3880/ 21553 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.18 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 4.125462E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.276 | TFLOPs: 27.08 | +7: iteration 3890/ 21553 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.18 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 4.131146E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1419.500 | TFLOPs: 27.12 | +7: iteration 3900/ 21553 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.18 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 4.126286E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.176 | TFLOPs: 26.96 | +7: iteration 3910/ 21553 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.18 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 4.125890E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.174 | TFLOPs: 26.96 | +7: iteration 3920/ 21553 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.18 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 4.124276E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.771 | TFLOPs: 26.97 | +7: iteration 3930/ 21553 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.19 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 4.123851E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.754 | TFLOPs: 25.90 | +7: iteration 3940/ 21553 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.18 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 4.118924E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.787 | TFLOPs: 26.69 | +7: iteration 3950/ 21553 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.18 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 4.111319E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.508 | TFLOPs: 27.03 | +7: iteration 3960/ 21553 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.18 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 4.112189E+00 | grad norm: 0.621 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.151 | TFLOPs: 26.69 | +7: iteration 3970/ 21553 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.18 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 4.115085E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.985 | TFLOPs: 26.92 | +7: iteration 3980/ 21553 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.18 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 4.116924E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.058 | TFLOPs: 26.90 | +7: iteration 3990/ 21553 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.18 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 4.116478E+00 | grad norm: 0.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.170 | TFLOPs: 26.94 | +0: [2023-03-16 23:47:50,550] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.00018638652011758862, 0.00018638652011758862, 0.00018638652011758862], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 21553 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.18 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 4.109765E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.942 | TFLOPs: 26.96 | +0: steps: 4000 loss: 4.0746 iter time (s): 0.182 samples/sec: 1409.284 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 4.063688E+00 | lm loss PPL: 5.818853E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:47:50,639] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-16 23:47:50,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:47:50,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:47:50,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:47:50,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:47:50,726] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:47:50,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:47:50,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:47:50,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:47:50,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:47:50,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:47:50,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:47:50,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:47:50,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:47:50,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:47:50,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:47:50,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:47:50,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:47:50,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:47:50,805] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:47:50,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:47:50,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:47:50,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:47:50,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:47:50,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:47:50,829] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-16 23:47:50,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:47:50,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:47:50,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:47:50,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:47:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:47:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-16 23:47:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:47:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 23:47:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-16 23:47:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:47:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:47:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-16 23:47:50,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:47:50,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:47:50,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-16 23:47:50,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:47:50,881] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:47:50,881] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 246.52 +7: iteration 4010/ 21553 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.21 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 4.122270E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1195.308 | TFLOPs: 22.84 | +7: iteration 4020/ 21553 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.18 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 4.107952E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.325 | TFLOPs: 26.91 | +7: iteration 4030/ 21553 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.18 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 4.107865E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.014 | TFLOPs: 26.90 | +7: iteration 4040/ 21553 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.18 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 4.111381E+00 | grad norm: 0.575 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.701 | TFLOPs: 26.92 | +7: iteration 4050/ 21553 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.18 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 4.110009E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1384.445 | TFLOPs: 26.45 | +7: iteration 4060/ 21553 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.18 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 4.111044E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.152 | TFLOPs: 26.96 | +7: iteration 4070/ 21553 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.18 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 4.108169E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.262 | TFLOPs: 26.96 | +7: iteration 4080/ 21553 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.18 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 4.106376E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.049 | TFLOPs: 26.94 | +7: iteration 4090/ 21553 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.18 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 4.101377E+00 | grad norm: 0.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.443 | TFLOPs: 26.97 | +7: iteration 4100/ 21553 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.18 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 4.105708E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.851 | TFLOPs: 26.96 | +7: iteration 4110/ 21553 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.18 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 4.099969E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.762 | TFLOPs: 26.95 | +7: iteration 4120/ 21553 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.18 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 4.099728E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.230 | TFLOPs: 26.94 | +7: iteration 4130/ 21553 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.18 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 4.100113E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.396 | TFLOPs: 26.97 | +7: iteration 4140/ 21553 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.18 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 4.091880E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.702 | TFLOPs: 26.97 | +7: iteration 4150/ 21553 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.18 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 4.105266E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.622 | TFLOPs: 26.97 | +7: iteration 4160/ 21553 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.18 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 4.095549E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.307 | TFLOPs: 26.98 | +7: iteration 4170/ 21553 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.18 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 4.096465E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.261 | TFLOPs: 26.85 | +7: iteration 4180/ 21553 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.18 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 4.093505E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.187 | TFLOPs: 26.94 | +7: iteration 4190/ 21553 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.18 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 4.092995E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.605 | TFLOPs: 26.93 | +7: iteration 4200/ 21553 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.18 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 4.101080E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.448 | TFLOPs: 26.89 | +7: iteration 4210/ 21553 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.18 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 4.100312E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.344 | TFLOPs: 26.95 | +7: iteration 4220/ 21553 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.18 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 4.088758E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.783 | TFLOPs: 26.55 | +7: iteration 4230/ 21553 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.18 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 4.092545E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.875 | TFLOPs: 26.71 | +7: iteration 4240/ 21553 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.19 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 4.093592E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1381.112 | TFLOPs: 26.39 | +7: iteration 4250/ 21553 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.19 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 4.086315E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.949 | TFLOPs: 26.42 | +7: iteration 4260/ 21553 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.18 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 4.088330E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.577 | TFLOPs: 26.91 | +7: iteration 4270/ 21553 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.18 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 4.088430E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.293 | TFLOPs: 26.91 | +7: iteration 4280/ 21553 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.18 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 4.084534E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.130 | TFLOPs: 26.94 | +7: iteration 4290/ 21553 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.18 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 4.081382E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.799 | TFLOPs: 26.90 | +7: iteration 4300/ 21553 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.18 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 4.081735E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.828 | TFLOPs: 26.92 | +7: iteration 4310/ 21553 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.18 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 4.079536E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.290 | TFLOPs: 26.93 | +7: iteration 4320/ 21553 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.18 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 4.085256E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.562 | TFLOPs: 26.89 | +7: iteration 4330/ 21553 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.18 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 4.076880E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.115 | TFLOPs: 26.90 | +7: iteration 4340/ 21553 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.18 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 4.082703E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.498 | TFLOPs: 26.95 | +7: iteration 4350/ 21553 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.18 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 4.075549E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.447 | TFLOPs: 26.95 | +7: iteration 4360/ 21553 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.18 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 4.088005E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.281 | TFLOPs: 26.93 | +7: iteration 4370/ 21553 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.18 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 4.081167E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.142 | TFLOPs: 26.90 | +7: iteration 4380/ 21553 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.18 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 4.082598E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.821 | TFLOPs: 26.84 | +7: iteration 4390/ 21553 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.18 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 4.075223E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.390 | TFLOPs: 26.87 | +7: iteration 4400/ 21553 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.18 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 4.083041E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.903 | TFLOPs: 26.88 | +7: iteration 4410/ 21553 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.18 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 4.073055E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.377 | TFLOPs: 26.87 | +7: iteration 4420/ 21553 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.18 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 4.071705E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.303 | TFLOPs: 26.87 | +7: iteration 4430/ 21553 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.20 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 4.075788E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1264.805 | TFLOPs: 24.17 | +7: iteration 4440/ 21553 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.18 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 4.076022E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.642 | TFLOPs: 26.88 | +7: iteration 4450/ 21553 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.18 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 4.075268E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.664 | TFLOPs: 26.88 | +7: iteration 4460/ 21553 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.18 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 4.071542E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.301 | TFLOPs: 26.47 | +7: iteration 4470/ 21553 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.18 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 4.069972E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.382 | TFLOPs: 26.66 | +7: iteration 4480/ 21553 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.19 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 4.062879E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1380.960 | TFLOPs: 26.39 | +7: iteration 4490/ 21553 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.18 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 4.076851E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.314 | TFLOPs: 26.97 | +7: iteration 4500/ 21553 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.18 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 4.078402E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.667 | TFLOPs: 26.93 | +7: iteration 4510/ 21553 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.18 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 4.062875E+00 | grad norm: 0.606 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.267 | TFLOPs: 26.93 | +7: iteration 4520/ 21553 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.18 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 4.061502E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.725 | TFLOPs: 26.93 | +7: iteration 4530/ 21553 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.18 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 4.064632E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.617 | TFLOPs: 26.95 | +7: iteration 4540/ 21553 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.18 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 4.063182E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.071 | TFLOPs: 26.96 | +7: iteration 4550/ 21553 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.18 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 4.052083E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.070 | TFLOPs: 26.96 | +7: iteration 4560/ 21553 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.18 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 4.062194E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.609 | TFLOPs: 26.95 | +7: iteration 4570/ 21553 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.18 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 4.069695E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.584 | TFLOPs: 26.97 | +7: iteration 4580/ 21553 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.18 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 4.058488E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.036 | TFLOPs: 26.94 | +7: iteration 4590/ 21553 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.18 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 4.059241E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.856 | TFLOPs: 26.94 | +7: iteration 4600/ 21553 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.18 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 4.061289E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.178 | TFLOPs: 26.96 | +7: iteration 4610/ 21553 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.18 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 4.059543E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.425 | TFLOPs: 26.95 | +7: iteration 4620/ 21553 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.18 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 4.062352E+00 | grad norm: 0.584 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.339 | TFLOPs: 26.95 | +7: iteration 4630/ 21553 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.18 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 4.058358E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.108 | TFLOPs: 26.96 | +7: iteration 4640/ 21553 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.18 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 4.053112E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.631 | TFLOPs: 26.97 | +7: iteration 4650/ 21553 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.18 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 4.048458E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.064 | TFLOPs: 26.98 | +7: iteration 4660/ 21553 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.18 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 4.057312E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.013 | TFLOPs: 26.98 | +7: iteration 4670/ 21553 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.18 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 4.048968E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.869 | TFLOPs: 27.01 | +7: iteration 4680/ 21553 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.18 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 4.048359E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.045 | TFLOPs: 27.02 | +7: iteration 4690/ 21553 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.18 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 4.053822E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.213 | TFLOPs: 26.96 | +7: iteration 4700/ 21553 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.18 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 4.055724E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.749 | TFLOPs: 26.99 | +7: iteration 4710/ 21553 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.18 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 4.058515E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.514 | TFLOPs: 26.99 | +7: iteration 4720/ 21553 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.18 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 4.047925E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.881 | TFLOPs: 27.00 | +7: iteration 4730/ 21553 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.18 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 4.047628E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.124 | TFLOPs: 26.98 | +7: iteration 4740/ 21553 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.18 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 4.050303E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.836 | TFLOPs: 26.94 | +7: iteration 4750/ 21553 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.18 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 4.049182E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.392 | TFLOPs: 26.95 | +7: iteration 4760/ 21553 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.18 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 4.051161E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.933 | TFLOPs: 26.94 | +7: iteration 4770/ 21553 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.18 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 4.045005E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.484 | TFLOPs: 26.74 | +7: iteration 4780/ 21553 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.19 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 4.044420E+00 | grad norm: 0.546 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1343.104 | TFLOPs: 25.66 | +7: iteration 4790/ 21553 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.19 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 4.043773E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1363.064 | TFLOPs: 26.04 | +7: iteration 4800/ 21553 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.18 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 4.038311E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.961 | TFLOPs: 26.58 | +7: iteration 4810/ 21553 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.18 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 4.047957E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.633 | TFLOPs: 26.67 | +7: iteration 4820/ 21553 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.18 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 4.036110E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.042 | TFLOPs: 26.62 | +7: iteration 4830/ 21553 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.18 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 4.040846E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.908 | TFLOPs: 26.69 | +7: iteration 4840/ 21553 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.18 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 4.041230E+00 | grad norm: 0.645 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.990 | TFLOPs: 26.69 | +7: iteration 4850/ 21553 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.18 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 4.042736E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.488 | TFLOPs: 26.66 | +7: iteration 4860/ 21553 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.18 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 4.028824E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.248 | TFLOPs: 26.70 | +7: iteration 4870/ 21553 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.19 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 4.040389E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1383.048 | TFLOPs: 26.43 | +7: iteration 4880/ 21553 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.18 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 4.033074E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.288 | TFLOPs: 26.47 | +7: iteration 4890/ 21553 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.18 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 4.037292E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.204 | TFLOPs: 26.52 | +7: iteration 4900/ 21553 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.18 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 4.039602E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1385.776 | TFLOPs: 26.48 | +7: iteration 4910/ 21553 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.18 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 4.034843E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.350 | TFLOPs: 26.51 | +7: iteration 4920/ 21553 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.18 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 4.037445E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.649 | TFLOPs: 26.74 | +7: iteration 4930/ 21553 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.18 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 4.032487E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.029 | TFLOPs: 26.73 | +7: iteration 4940/ 21553 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.19 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 4.029553E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1371.358 | TFLOPs: 26.20 | +7: iteration 4950/ 21553 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.18 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 4.031058E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.182 | TFLOPs: 26.98 | +7: iteration 4960/ 21553 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.18 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 4.027805E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.568 | TFLOPs: 26.49 | +7: iteration 4970/ 21553 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.18 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 4.029955E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.942 | TFLOPs: 26.65 | +7: iteration 4980/ 21553 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.18 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 4.032702E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.621 | TFLOPs: 26.97 | +7: iteration 4990/ 21553 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.18 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 4.033833E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.247 | TFLOPs: 26.96 | +7: iteration 5000/ 21553 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.18 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 4.031496E+00 | grad norm: 0.587 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.426 | TFLOPs: 26.95 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 3.969001E+00 | lm loss PPL: 5.293162E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:50:53,543] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-16 23:50:53,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:50:53,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:50:53,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:50:53,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:50:53,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:50:53,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:50:53,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:50:53,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:50:53,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:50:53,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:50:53,667] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:50:53,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:50:53,678] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:50:53,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:50:53,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:50:53,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:50:53,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:50:53,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:50:53,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:50:53,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:50:53,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:50:53,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:50:53,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:50:53,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:50:53,737] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-16 23:50:53,737] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:50:53,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:50:53,757] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,756] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:50:53,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,778] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,778] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,781] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,784] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,785] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-16 23:50:53,785] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +0: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-16 23:50:53,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,787] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-16 23:50:53,787] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 248.36 +7: iteration 5010/ 21553 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.22 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 4.028423E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1186.837 | TFLOPs: 22.68 | +7: iteration 5020/ 21553 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.19 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 4.025131E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1336.023 | TFLOPs: 25.53 | +7: iteration 5030/ 21553 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.19 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 4.029721E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1341.329 | TFLOPs: 25.63 | +7: iteration 5040/ 21553 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.18 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 4.025825E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.963 | TFLOPs: 26.94 | +7: iteration 5050/ 21553 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.18 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 4.017821E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.437 | TFLOPs: 26.97 | +7: iteration 5060/ 21553 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.18 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 4.027163E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.228 | TFLOPs: 26.72 | +7: iteration 5070/ 21553 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.18 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 4.023523E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.771 | TFLOPs: 26.73 | +7: iteration 5080/ 21553 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.19 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 4.026198E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1381.036 | TFLOPs: 26.39 | +7: iteration 5090/ 21553 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.18 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 4.025022E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.100 | TFLOPs: 27.04 | +7: iteration 5100/ 21553 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.18 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 4.018504E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.421 | TFLOPs: 27.06 | +7: iteration 5110/ 21553 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.18 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 4.015395E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.866 | TFLOPs: 27.07 | +7: iteration 5120/ 21553 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.19 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 4.017639E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.553 | TFLOPs: 25.73 | +7: iteration 5130/ 21553 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.19 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 4.017160E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.244 | TFLOPs: 26.41 | +7: iteration 5140/ 21553 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.18 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 4.019996E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.019 | TFLOPs: 26.84 | +7: iteration 5150/ 21553 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.18 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 4.009134E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.461 | TFLOPs: 26.53 | +7: iteration 5160/ 21553 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.18 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 4.009969E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.308 | TFLOPs: 26.58 | +7: iteration 5170/ 21553 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.18 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 4.014471E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.055 | TFLOPs: 27.07 | +7: iteration 5180/ 21553 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.18 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 4.008334E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.747 | TFLOPs: 26.59 | +7: iteration 5190/ 21553 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.18 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 4.014187E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.771 | TFLOPs: 26.82 | +7: iteration 5200/ 21553 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.18 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 4.012628E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.223 | TFLOPs: 27.10 | +7: iteration 5210/ 21553 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.18 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 4.015244E+00 | grad norm: 0.617 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.535 | TFLOPs: 27.08 | +7: iteration 5220/ 21553 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.18 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 4.014193E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.785 | TFLOPs: 27.11 | +7: iteration 5230/ 21553 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.18 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 4.008553E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.419 | TFLOPs: 27.08 | +7: iteration 5240/ 21553 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.18 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 4.007209E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.762 | TFLOPs: 27.09 | +7: iteration 5250/ 21553 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.18 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 4.009098E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.878 | TFLOPs: 27.07 | +7: iteration 5260/ 21553 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.18 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 4.011852E+00 | grad norm: 0.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.153 | TFLOPs: 27.10 | +7: iteration 5270/ 21553 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.18 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 4.005059E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.453 | TFLOPs: 27.08 | +7: iteration 5280/ 21553 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.18 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 4.007853E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.328 | TFLOPs: 27.10 | +7: iteration 5290/ 21553 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.18 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 4.012265E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1419.252 | TFLOPs: 27.12 | +7: iteration 5300/ 21553 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.18 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 4.005261E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.271 | TFLOPs: 27.10 | +7: iteration 5310/ 21553 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.18 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 4.005804E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.973 | TFLOPs: 27.07 | +7: iteration 5320/ 21553 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.18 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.998082E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.106 | TFLOPs: 27.09 | +7: iteration 5330/ 21553 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.19 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 4.002461E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.247 | TFLOPs: 25.99 | +7: iteration 5340/ 21553 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.21 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 4.004174E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1242.445 | TFLOPs: 23.74 | +7: iteration 5350/ 21553 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.18 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 4.006510E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.394 | TFLOPs: 27.14 | +7: iteration 5360/ 21553 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.18 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 4.004003E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1420.103 | TFLOPs: 27.13 | +7: iteration 5370/ 21553 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.18 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 4.006152E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.974 | TFLOPs: 27.09 | +7: iteration 5380/ 21553 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.18 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.998737E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.330 | TFLOPs: 27.08 | +7: iteration 5390/ 21553 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.18 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 4.005159E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.293 | TFLOPs: 26.58 | +7: iteration 5400/ 21553 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.18 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.997897E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.170 | TFLOPs: 27.06 | +7: iteration 5410/ 21553 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.18 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 4.002584E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.479 | TFLOPs: 27.08 | +7: iteration 5420/ 21553 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.18 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 4.001212E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.307 | TFLOPs: 27.10 | +7: iteration 5430/ 21553 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.18 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 4.004513E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.893 | TFLOPs: 27.07 | +7: iteration 5440/ 21553 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.18 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 4.001895E+00 | grad norm: 0.558 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.600 | TFLOPs: 27.03 | +7: iteration 5450/ 21553 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.18 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.990342E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.125 | TFLOPs: 27.04 | +7: iteration 5460/ 21553 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.18 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.997926E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.568 | TFLOPs: 27.05 | +7: iteration 5470/ 21553 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.18 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.987795E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.183 | TFLOPs: 27.06 | +7: iteration 5480/ 21553 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.18 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.990511E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.475 | TFLOPs: 27.06 | +7: iteration 5490/ 21553 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.18 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.992474E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.675 | TFLOPs: 27.09 | +7: iteration 5500/ 21553 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.18 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.993037E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.394 | TFLOPs: 27.10 | +7: iteration 5510/ 21553 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.18 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.993439E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.888 | TFLOPs: 27.05 | +7: iteration 5520/ 21553 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.18 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.993844E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.867 | TFLOPs: 27.01 | +7: iteration 5530/ 21553 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.19 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.992250E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1380.562 | TFLOPs: 26.38 | +7: iteration 5540/ 21553 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.18 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.989926E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.614 | TFLOPs: 27.03 | +7: iteration 5550/ 21553 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.18 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.994470E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.272 | TFLOPs: 27.06 | +7: iteration 5560/ 21553 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.18 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.988889E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.275 | TFLOPs: 27.08 | +7: iteration 5570/ 21553 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.18 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.997121E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.934 | TFLOPs: 27.05 | +7: iteration 5580/ 21553 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.18 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.986156E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.462 | TFLOPs: 27.04 | +7: iteration 5590/ 21553 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.18 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.992126E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.652 | TFLOPs: 27.07 | +7: iteration 5600/ 21553 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.18 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.991200E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.305 | TFLOPs: 27.00 | +7: iteration 5610/ 21553 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.18 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.986009E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.601 | TFLOPs: 27.03 | +7: iteration 5620/ 21553 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.18 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.985532E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.277 | TFLOPs: 26.49 | +7: iteration 5630/ 21553 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.18 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.987070E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.137 | TFLOPs: 27.02 | +7: iteration 5640/ 21553 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.18 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.986401E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.031 | TFLOPs: 27.00 | +7: iteration 5650/ 21553 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.18 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.985584E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.515 | TFLOPs: 27.03 | +7: iteration 5660/ 21553 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.18 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.983520E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.804 | TFLOPs: 27.07 | +7: iteration 5670/ 21553 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.18 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.974086E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.551 | TFLOPs: 27.08 | +7: iteration 5680/ 21553 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.18 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.984357E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.173 | TFLOPs: 27.08 | +7: iteration 5690/ 21553 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.18 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.981213E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.263 | TFLOPs: 27.10 | +7: iteration 5700/ 21553 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.19 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.979602E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.512 | TFLOPs: 26.41 | +7: iteration 5710/ 21553 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.18 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.986927E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.345 | TFLOPs: 27.04 | +7: iteration 5720/ 21553 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.18 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.983255E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.757 | TFLOPs: 27.01 | +7: iteration 5730/ 21553 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.18 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.989817E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.187 | TFLOPs: 27.00 | +7: iteration 5740/ 21553 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.18 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.984650E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.827 | TFLOPs: 27.05 | +7: iteration 5750/ 21553 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.18 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.980473E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.664 | TFLOPs: 27.03 | +7: iteration 5760/ 21553 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.18 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.981001E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.032 | TFLOPs: 27.06 | +7: iteration 5770/ 21553 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.18 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.981358E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.065 | TFLOPs: 27.06 | +7: iteration 5780/ 21553 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.18 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.978754E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.506 | TFLOPs: 27.06 | +7: iteration 5790/ 21553 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.18 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.977598E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.694 | TFLOPs: 26.70 | +7: iteration 5800/ 21553 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.18 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.970518E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.142 | TFLOPs: 27.10 | +7: iteration 5810/ 21553 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.18 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.973563E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.152 | TFLOPs: 26.64 | +7: iteration 5820/ 21553 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.18 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.969698E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.899 | TFLOPs: 27.05 | +7: iteration 5830/ 21553 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.18 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.972570E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.408 | TFLOPs: 26.53 | +7: iteration 5840/ 21553 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.18 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.976832E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.878 | TFLOPs: 26.99 | +7: iteration 5850/ 21553 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.18 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.977742E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.198 | TFLOPs: 27.04 | +7: iteration 5860/ 21553 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.18 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.980232E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.646 | TFLOPs: 27.01 | +7: iteration 5870/ 21553 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.18 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.965577E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.688 | TFLOPs: 27.05 | +7: iteration 5880/ 21553 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.18 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.964757E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.112 | TFLOPs: 26.58 | +7: iteration 5890/ 21553 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.18 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.973952E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.832 | TFLOPs: 27.01 | +7: iteration 5900/ 21553 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.18 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.977568E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.966 | TFLOPs: 27.05 | +7: iteration 5910/ 21553 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.18 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.973429E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.809 | TFLOPs: 27.05 | +7: iteration 5920/ 21553 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.18 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.972532E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.115 | TFLOPs: 27.04 | +7: iteration 5930/ 21553 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.18 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.971873E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.227 | TFLOPs: 27.00 | +7: iteration 5940/ 21553 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.18 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.966836E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.728 | TFLOPs: 27.03 | +7: iteration 5950/ 21553 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.18 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.973277E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.642 | TFLOPs: 27.03 | +7: iteration 5960/ 21553 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.18 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.976271E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.293 | TFLOPs: 27.04 | +7: iteration 5970/ 21553 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.18 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.968323E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.975 | TFLOPs: 27.04 | +7: iteration 5980/ 21553 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.18 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.972195E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.489 | TFLOPs: 27.04 | +7: iteration 5990/ 21553 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.18 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.964158E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.018 | TFLOPs: 27.04 | +0: [2023-03-16 23:53:55,820] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.00016928570742907802, 0.00016928570742907802, 0.00016928570742907802], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 21553 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.18 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.967544E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.015 | TFLOPs: 26.54 | +0: steps: 6000 loss: 3.9810 iter time (s): 0.181 samples/sec: 1417.968 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 6000 | lm loss value: 3.974031E+00 | lm loss PPL: 5.319857E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 6000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:53:55,907] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +0: [2023-03-16 23:53:55,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:53:55,983] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:53:55,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:53:55,996] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:53:55,996] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:53:56,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:53:56,007] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:53:56,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:53:56,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:53:56,030] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:53:56,030] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:53:56,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:53:56,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:53:56,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:53:56,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:53:56,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:53:56,064] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:53:56,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:53:56,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:53:56,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:53:56,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:53:56,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:53:56,099] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:53:56,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:53:56,101] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step6000/mp_rank_00_model_states.pt +0: [2023-03-16 23:53:56,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:53:56,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:53:56,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:53:56,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 23:53:56,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-16 23:53:56,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-16 23:53:56,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:53:56,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:53:56,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: successfully saved checkpoint at iteration 6000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 246.13 +7: iteration 6010/ 21553 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.21 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.958425E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1198.461 | TFLOPs: 22.90 | +7: iteration 6020/ 21553 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.18 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.957337E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.017 | TFLOPs: 27.04 | +7: iteration 6030/ 21553 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.18 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.954497E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.720 | TFLOPs: 27.03 | +7: iteration 6040/ 21553 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.18 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.958625E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.770 | TFLOPs: 27.03 | +7: iteration 6050/ 21553 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.18 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.961900E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.551 | TFLOPs: 27.03 | +7: iteration 6060/ 21553 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.18 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.965736E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.184 | TFLOPs: 27.02 | +7: iteration 6070/ 21553 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.18 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.955048E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.047 | TFLOPs: 27.02 | +7: iteration 6080/ 21553 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.19 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.963227E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1354.591 | TFLOPs: 25.88 | +7: iteration 6090/ 21553 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.18 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.958986E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.344 | TFLOPs: 27.00 | +7: iteration 6100/ 21553 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.18 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.962622E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.115 | TFLOPs: 27.04 | +7: iteration 6110/ 21553 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.18 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.958931E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.204 | TFLOPs: 27.02 | +7: iteration 6120/ 21553 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.18 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.957957E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.492 | TFLOPs: 27.01 | +7: iteration 6130/ 21553 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.18 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.960773E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.519 | TFLOPs: 26.72 | +7: iteration 6140/ 21553 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.18 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.953809E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.759 | TFLOPs: 26.86 | +7: iteration 6150/ 21553 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.18 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.953321E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.647 | TFLOPs: 27.01 | +7: iteration 6160/ 21553 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.18 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.955872E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.597 | TFLOPs: 27.03 | +7: iteration 6170/ 21553 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.18 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.956256E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.289 | TFLOPs: 27.02 | +7: iteration 6180/ 21553 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.18 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.959060E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.246 | TFLOPs: 26.68 | +7: iteration 6190/ 21553 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.18 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.955332E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.394 | TFLOPs: 27.00 | +7: iteration 6200/ 21553 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.18 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.955652E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.526 | TFLOPs: 27.03 | +7: iteration 6210/ 21553 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.18 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.954129E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.194 | TFLOPs: 27.00 | +7: iteration 6220/ 21553 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.18 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.955416E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.802 | TFLOPs: 27.01 | +7: iteration 6230/ 21553 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.18 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.961676E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.544 | TFLOPs: 27.01 | +7: iteration 6240/ 21553 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.18 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.957551E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.164 | TFLOPs: 26.96 | +7: iteration 6250/ 21553 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.18 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.950417E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.944 | TFLOPs: 26.98 | +7: iteration 6260/ 21553 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.18 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.943818E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.212 | TFLOPs: 26.98 | +7: iteration 6270/ 21553 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.18 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.950351E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.406 | TFLOPs: 26.99 | +7: iteration 6280/ 21553 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.18 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.953633E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.365 | TFLOPs: 27.00 | +7: iteration 6290/ 21553 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.19 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.949500E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1380.265 | TFLOPs: 26.37 | +7: iteration 6300/ 21553 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.19 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.951943E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1372.272 | TFLOPs: 26.22 | +7: iteration 6310/ 21553 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.20 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.956557E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1271.520 | TFLOPs: 24.29 | +7: iteration 6320/ 21553 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.18 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.947420E+00 | grad norm: 0.538 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.172 | TFLOPs: 27.10 | +7: iteration 6330/ 21553 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.18 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.948378E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.375 | TFLOPs: 27.08 | +7: iteration 6340/ 21553 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.18 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.942791E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.573 | TFLOPs: 27.08 | +7: iteration 6350/ 21553 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.18 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.944474E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.481 | TFLOPs: 27.03 | +7: iteration 6360/ 21553 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.18 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.940184E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.045 | TFLOPs: 26.64 | +7: iteration 6370/ 21553 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.18 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.948548E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.214 | TFLOPs: 27.00 | +7: iteration 6380/ 21553 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.18 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.943503E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.142 | TFLOPs: 27.00 | +7: iteration 6390/ 21553 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.18 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.935133E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.777 | TFLOPs: 26.99 | +7: iteration 6400/ 21553 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.18 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.940512E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.528 | TFLOPs: 26.99 | +7: iteration 6410/ 21553 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.18 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.946171E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.633 | TFLOPs: 26.99 | +7: iteration 6420/ 21553 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.18 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.938020E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.859 | TFLOPs: 26.99 | +7: iteration 6430/ 21553 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.18 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.944734E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.642 | TFLOPs: 26.53 | +7: iteration 6440/ 21553 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.18 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.943114E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.674 | TFLOPs: 26.57 | +7: iteration 6450/ 21553 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.18 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.945704E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1392.280 | TFLOPs: 26.60 | +7: iteration 6460/ 21553 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.18 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.942008E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.839 | TFLOPs: 26.98 | +7: iteration 6470/ 21553 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.18 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.935964E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.143 | TFLOPs: 27.04 | +7: iteration 6480/ 21553 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.18 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.939361E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.545 | TFLOPs: 27.05 | +7: iteration 6490/ 21553 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.18 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.952465E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.641 | TFLOPs: 27.03 | +7: iteration 6500/ 21553 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.18 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.935848E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.300 | TFLOPs: 27.04 | +7: iteration 6510/ 21553 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.18 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.932524E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.198 | TFLOPs: 27.02 | +7: iteration 6520/ 21553 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.18 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.945626E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.724 | TFLOPs: 27.05 | +7: iteration 6530/ 21553 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.18 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.935637E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.007 | TFLOPs: 27.05 | +7: iteration 6540/ 21553 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.18 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.938197E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.237 | TFLOPs: 27.04 | +7: iteration 6550/ 21553 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.18 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.934430E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.512 | TFLOPs: 27.05 | +7: iteration 6560/ 21553 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.18 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.936769E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.061 | TFLOPs: 27.07 | +7: iteration 6570/ 21553 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.18 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.942253E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.073 | TFLOPs: 27.09 | +7: iteration 6580/ 21553 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.18 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.933813E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.502 | TFLOPs: 27.08 | +7: iteration 6590/ 21553 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.18 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.940855E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.965 | TFLOPs: 27.07 | +7: iteration 6600/ 21553 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.18 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.935257E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.107 | TFLOPs: 27.09 | +7: iteration 6610/ 21553 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.18 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.932365E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.161 | TFLOPs: 27.10 | +7: iteration 6620/ 21553 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.18 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.931081E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.583 | TFLOPs: 27.08 | +7: iteration 6630/ 21553 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.18 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.936831E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.322 | TFLOPs: 27.02 | +7: iteration 6640/ 21553 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.18 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.936266E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.726 | TFLOPs: 27.03 | +7: iteration 6650/ 21553 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.18 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.924513E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.758 | TFLOPs: 27.09 | +7: iteration 6660/ 21553 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.18 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.929240E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.266 | TFLOPs: 27.08 | +7: iteration 6670/ 21553 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.18 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.934403E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.080 | TFLOPs: 27.06 | +7: iteration 6680/ 21553 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.18 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.925714E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.242 | TFLOPs: 27.04 | +7: iteration 6690/ 21553 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.18 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.928349E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.186 | TFLOPs: 27.02 | +7: iteration 6700/ 21553 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.18 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.933576E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.861 | TFLOPs: 27.01 | +7: iteration 6710/ 21553 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.18 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.926194E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.232 | TFLOPs: 27.00 | +7: iteration 6720/ 21553 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.18 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.936193E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.240 | TFLOPs: 27.04 | +7: iteration 6730/ 21553 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.18 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.927428E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.412 | TFLOPs: 27.04 | +7: iteration 6740/ 21553 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.18 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.925841E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.513 | TFLOPs: 27.05 | +7: iteration 6750/ 21553 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.18 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.934401E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.415 | TFLOPs: 27.04 | +7: iteration 6760/ 21553 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.18 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.916542E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.399 | TFLOPs: 27.02 | +7: iteration 6770/ 21553 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.18 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.928948E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.905 | TFLOPs: 27.01 | +7: iteration 6780/ 21553 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.18 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.933952E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.180 | TFLOPs: 27.04 | +7: iteration 6790/ 21553 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.18 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.932309E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.945 | TFLOPs: 27.05 | +7: iteration 6800/ 21553 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.18 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.918304E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.956 | TFLOPs: 27.07 | +7: iteration 6810/ 21553 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.18 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.931443E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.666 | TFLOPs: 27.07 | +7: iteration 6820/ 21553 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.18 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.925771E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.982 | TFLOPs: 27.05 | +7: iteration 6830/ 21553 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.18 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.920012E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.896 | TFLOPs: 27.05 | +7: iteration 6840/ 21553 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.18 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.918979E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.458 | TFLOPs: 27.06 | +7: iteration 6850/ 21553 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.18 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.927531E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.071 | TFLOPs: 27.06 | +7: iteration 6860/ 21553 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.18 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.920779E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.175 | TFLOPs: 27.04 | +7: iteration 6870/ 21553 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.18 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.921400E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.760 | TFLOPs: 27.07 | +7: iteration 6880/ 21553 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.18 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.914940E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.311 | TFLOPs: 27.08 | +7: iteration 6890/ 21553 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.18 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.919310E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.756 | TFLOPs: 27.09 | +7: iteration 6900/ 21553 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.18 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.915882E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.007 | TFLOPs: 27.04 | +7: iteration 6910/ 21553 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.18 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.917134E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.645 | TFLOPs: 27.07 | +7: iteration 6920/ 21553 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.19 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.912145E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.222 | TFLOPs: 26.26 | +7: iteration 6930/ 21553 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.18 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.916664E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.863 | TFLOPs: 26.96 | +7: iteration 6940/ 21553 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.18 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.921609E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.649 | TFLOPs: 26.97 | +7: iteration 6950/ 21553 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.18 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.923027E+00 | grad norm: 0.525 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.379 | TFLOPs: 26.97 | +7: iteration 6960/ 21553 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.18 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.920995E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.120 | TFLOPs: 26.98 | +7: iteration 6970/ 21553 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.18 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.920377E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.189 | TFLOPs: 27.00 | +7: iteration 6980/ 21553 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.18 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.912654E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.412 | TFLOPs: 27.01 | +7: iteration 6990/ 21553 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.18 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.918069E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.498 | TFLOPs: 27.01 | +7: iteration 7000/ 21553 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.18 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.920945E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.249 | TFLOPs: 27.02 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 7000 | lm loss value: 3.956458E+00 | lm loss PPL: 5.227186E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 7000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:56:57,779] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! +0: [2023-03-16 23:56:57,782] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:56:57,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:56:57,854] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:56:57,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:56:57,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:56:57,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:56:57,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:56:57,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:56:57,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:56:57,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:56:57,901] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:56:57,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:56:57,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:56:57,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:56:57,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:56:57,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:56:57,935] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:56:57,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:56:57,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:56:57,957] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:56:57,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:56:57,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:56:57,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:56:57,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:56:57,969] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step7000/mp_rank_00_model_states.pt +0: [2023-03-16 23:56:57,969] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:56:57,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:56:57,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:56:57,988] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:56:58,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,009] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,009] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,015] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,015] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-16 23:56:58,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-16 23:56:58,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:56:58,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:56:58,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: successfully saved checkpoint at iteration 7000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 240.89 +7: iteration 7010/ 21553 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.21 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.916760E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1204.871 | TFLOPs: 23.02 | +7: iteration 7020/ 21553 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.18 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.908915E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.398 | TFLOPs: 26.66 | +7: iteration 7030/ 21553 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.18 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.913593E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.523 | TFLOPs: 27.05 | +7: iteration 7040/ 21553 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.18 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.921173E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.768 | TFLOPs: 27.03 | +7: iteration 7050/ 21553 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.18 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.917997E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.386 | TFLOPs: 27.02 | +7: iteration 7060/ 21553 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.18 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.914005E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.725 | TFLOPs: 27.03 | +7: iteration 7070/ 21553 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.18 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.909998E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.774 | TFLOPs: 27.05 | +7: iteration 7080/ 21553 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.18 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.918777E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.535 | TFLOPs: 27.03 | +7: iteration 7090/ 21553 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.18 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.908474E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.536 | TFLOPs: 27.05 | +7: iteration 7100/ 21553 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.18 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.913537E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.987 | TFLOPs: 27.05 | +7: iteration 7110/ 21553 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.18 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.912737E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.536 | TFLOPs: 27.05 | +7: iteration 7120/ 21553 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.18 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.911752E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.322 | TFLOPs: 27.00 | +7: iteration 7130/ 21553 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.18 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.915241E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.326 | TFLOPs: 26.97 | +7: iteration 7140/ 21553 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.18 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.912630E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.892 | TFLOPs: 26.96 | +7: iteration 7150/ 21553 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.18 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.914033E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.217 | TFLOPs: 26.98 | +7: iteration 7160/ 21553 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.18 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.906757E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.512 | TFLOPs: 26.93 | +7: iteration 7170/ 21553 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.18 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.910019E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1401.242 | TFLOPs: 26.77 | +7: iteration 7180/ 21553 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.18 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.907725E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.612 | TFLOPs: 26.99 | +7: iteration 7190/ 21553 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.18 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.902246E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.791 | TFLOPs: 27.03 | +7: iteration 7200/ 21553 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.18 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.910643E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.596 | TFLOPs: 26.99 | +7: iteration 7210/ 21553 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.18 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.904124E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.873 | TFLOPs: 26.99 | +7: iteration 7220/ 21553 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.18 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.906495E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.528 | TFLOPs: 26.99 | +7: iteration 7230/ 21553 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.18 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.910912E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.378 | TFLOPs: 27.02 | +7: iteration 7240/ 21553 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.18 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.900928E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.224 | TFLOPs: 26.68 | +7: iteration 7250/ 21553 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.18 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.903354E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.235 | TFLOPs: 27.04 | +7: iteration 7260/ 21553 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.18 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.902697E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.734 | TFLOPs: 27.05 | +7: iteration 7270/ 21553 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.18 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.901523E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.034 | TFLOPs: 27.02 | +7: iteration 7280/ 21553 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.18 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.903734E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.486 | TFLOPs: 27.04 | +7: iteration 7290/ 21553 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.18 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.897108E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.632 | TFLOPs: 27.01 | +7: iteration 7300/ 21553 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.18 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.908303E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.709 | TFLOPs: 27.03 | +7: iteration 7310/ 21553 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.18 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.898891E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.385 | TFLOPs: 27.02 | +7: iteration 7320/ 21553 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.18 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.903290E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.098 | TFLOPs: 27.02 | +7: iteration 7330/ 21553 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.18 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.898312E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.977 | TFLOPs: 27.04 | +7: iteration 7340/ 21553 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.18 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.900099E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.811 | TFLOPs: 27.01 | +7: iteration 7350/ 21553 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.18 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.896066E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.650 | TFLOPs: 27.03 | +7: iteration 7360/ 21553 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.18 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.895094E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.542 | TFLOPs: 27.05 | +7: iteration 7370/ 21553 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.18 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.896999E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.916 | TFLOPs: 27.03 | +7: iteration 7380/ 21553 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.18 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.900013E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.179 | TFLOPs: 27.02 | +7: iteration 7390/ 21553 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.18 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.903727E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.234 | TFLOPs: 27.02 | +7: iteration 7400/ 21553 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.18 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.895612E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.515 | TFLOPs: 27.01 | +7: iteration 7410/ 21553 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.18 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.886684E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.116 | TFLOPs: 27.02 | +7: iteration 7420/ 21553 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.18 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.893108E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.473 | TFLOPs: 27.01 | +7: iteration 7430/ 21553 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.18 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.901801E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.452 | TFLOPs: 27.03 | +7: iteration 7440/ 21553 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.18 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.885587E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.075 | TFLOPs: 27.02 | +7: iteration 7450/ 21553 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.18 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.894413E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.995 | TFLOPs: 27.02 | +7: iteration 7460/ 21553 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.18 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.895763E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.614 | TFLOPs: 27.03 | +7: iteration 7470/ 21553 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.18 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.897890E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.333 | TFLOPs: 27.02 | +7: iteration 7480/ 21553 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.18 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.897898E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.841 | TFLOPs: 27.01 | +7: iteration 7490/ 21553 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.18 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.888017E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.200 | TFLOPs: 27.04 | +7: iteration 7500/ 21553 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.18 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.896127E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.166 | TFLOPs: 27.02 | +7: iteration 7510/ 21553 | consumed samples: 1922560 | consumed tokens: 3937402880 | elapsed time per iteration (s): 0.18 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.893907E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.089 | TFLOPs: 27.00 | +7: iteration 7520/ 21553 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 0.18 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.887220E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.193 | TFLOPs: 27.02 | +7: iteration 7530/ 21553 | consumed samples: 1927680 | consumed tokens: 3947888640 | elapsed time per iteration (s): 0.18 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.893143E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.082 | TFLOPs: 27.04 | +7: iteration 7540/ 21553 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 0.18 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.894749E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.859 | TFLOPs: 26.69 | +7: iteration 7550/ 21553 | consumed samples: 1932800 | consumed tokens: 3958374400 | elapsed time per iteration (s): 0.18 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.894080E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.303 | TFLOPs: 26.96 | +7: iteration 7560/ 21553 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 0.18 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.895225E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.043 | TFLOPs: 27.00 | +7: iteration 7570/ 21553 | consumed samples: 1937920 | consumed tokens: 3968860160 | elapsed time per iteration (s): 0.18 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.885835E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.389 | TFLOPs: 26.99 | +7: iteration 7580/ 21553 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 0.18 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.892377E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.288 | TFLOPs: 27.00 | +7: iteration 7590/ 21553 | consumed samples: 1943040 | consumed tokens: 3979345920 | elapsed time per iteration (s): 0.18 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.889734E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.876 | TFLOPs: 27.03 | +7: iteration 7600/ 21553 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 0.18 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.899288E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.885 | TFLOPs: 27.05 | +7: iteration 7610/ 21553 | consumed samples: 1948160 | consumed tokens: 3989831680 | elapsed time per iteration (s): 0.18 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.889670E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.078 | TFLOPs: 27.02 | +7: iteration 7620/ 21553 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 0.18 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.887090E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.266 | TFLOPs: 27.06 | +7: iteration 7630/ 21553 | consumed samples: 1953280 | consumed tokens: 4000317440 | elapsed time per iteration (s): 0.18 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.881828E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.848 | TFLOPs: 27.05 | +7: iteration 7640/ 21553 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 0.18 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.890230E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.012 | TFLOPs: 27.07 | +7: iteration 7650/ 21553 | consumed samples: 1958400 | consumed tokens: 4010803200 | elapsed time per iteration (s): 0.18 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.893852E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.097 | TFLOPs: 27.06 | +7: iteration 7660/ 21553 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 0.18 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.887984E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.149 | TFLOPs: 26.94 | +7: iteration 7670/ 21553 | consumed samples: 1963520 | consumed tokens: 4021288960 | elapsed time per iteration (s): 0.18 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.883536E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.262 | TFLOPs: 27.02 | +7: iteration 7680/ 21553 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 0.18 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.885180E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.610 | TFLOPs: 27.03 | +7: iteration 7690/ 21553 | consumed samples: 1968640 | consumed tokens: 4031774720 | elapsed time per iteration (s): 0.18 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.890164E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.306 | TFLOPs: 27.02 | +7: iteration 7700/ 21553 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 0.18 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.885568E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.189 | TFLOPs: 27.04 | +7: iteration 7710/ 21553 | consumed samples: 1973760 | consumed tokens: 4042260480 | elapsed time per iteration (s): 0.18 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.890108E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.815 | TFLOPs: 27.01 | +7: iteration 7720/ 21553 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 0.18 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.896943E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.060 | TFLOPs: 27.06 | +7: iteration 7730/ 21553 | consumed samples: 1978880 | consumed tokens: 4052746240 | elapsed time per iteration (s): 0.18 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.883512E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.879 | TFLOPs: 27.01 | +7: iteration 7740/ 21553 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 0.18 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.888726E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.183 | TFLOPs: 27.00 | +7: iteration 7750/ 21553 | consumed samples: 1984000 | consumed tokens: 4063232000 | elapsed time per iteration (s): 0.18 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.887097E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.893 | TFLOPs: 26.98 | +7: iteration 7760/ 21553 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 0.18 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.881523E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.723 | TFLOPs: 26.97 | +7: iteration 7770/ 21553 | consumed samples: 1989120 | consumed tokens: 4073717760 | elapsed time per iteration (s): 0.18 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.882156E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.929 | TFLOPs: 27.00 | +7: iteration 7780/ 21553 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 0.18 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.886412E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.480 | TFLOPs: 27.01 | +7: iteration 7790/ 21553 | consumed samples: 1994240 | consumed tokens: 4084203520 | elapsed time per iteration (s): 0.18 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.890922E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.145 | TFLOPs: 26.98 | +7: iteration 7800/ 21553 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 0.18 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.880706E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.784 | TFLOPs: 26.97 | +7: iteration 7810/ 21553 | consumed samples: 1999360 | consumed tokens: 4094689280 | elapsed time per iteration (s): 0.18 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.888074E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.543 | TFLOPs: 26.99 | +7: iteration 7820/ 21553 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 0.18 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.878861E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.019 | TFLOPs: 27.02 | +7: iteration 7830/ 21553 | consumed samples: 2004480 | consumed tokens: 4105175040 | elapsed time per iteration (s): 0.18 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.874623E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.694 | TFLOPs: 27.03 | +7: iteration 7840/ 21553 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 0.18 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.885081E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.009 | TFLOPs: 27.02 | +7: iteration 7850/ 21553 | consumed samples: 2009600 | consumed tokens: 4115660800 | elapsed time per iteration (s): 0.18 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.880328E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.248 | TFLOPs: 27.04 | +7: iteration 7860/ 21553 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 0.18 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.878834E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.413 | TFLOPs: 26.99 | +7: iteration 7870/ 21553 | consumed samples: 2014720 | consumed tokens: 4126146560 | elapsed time per iteration (s): 0.18 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.875404E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.726 | TFLOPs: 26.93 | +7: iteration 7880/ 21553 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 0.18 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.875896E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.890 | TFLOPs: 26.94 | +7: iteration 7890/ 21553 | consumed samples: 2019840 | consumed tokens: 4136632320 | elapsed time per iteration (s): 0.18 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.884688E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.616 | TFLOPs: 26.93 | +7: iteration 7900/ 21553 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 0.18 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.875768E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.838 | TFLOPs: 26.96 | +7: iteration 7910/ 21553 | consumed samples: 2024960 | consumed tokens: 4147118080 | elapsed time per iteration (s): 0.18 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.879087E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.972 | TFLOPs: 26.98 | +7: iteration 7920/ 21553 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 0.18 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.880862E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.823 | TFLOPs: 26.99 | +7: iteration 7930/ 21553 | consumed samples: 2030080 | consumed tokens: 4157603840 | elapsed time per iteration (s): 0.18 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.878517E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.200 | TFLOPs: 27.00 | +7: iteration 7940/ 21553 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 0.18 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.880633E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.925 | TFLOPs: 26.98 | +7: iteration 7950/ 21553 | consumed samples: 2035200 | consumed tokens: 4168089600 | elapsed time per iteration (s): 0.18 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.876533E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.217 | TFLOPs: 26.96 | +7: iteration 7960/ 21553 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 0.18 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.881749E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.936 | TFLOPs: 27.00 | +7: iteration 7970/ 21553 | consumed samples: 2040320 | consumed tokens: 4178575360 | elapsed time per iteration (s): 0.18 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.875314E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.671 | TFLOPs: 26.97 | +7: iteration 7980/ 21553 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 0.18 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.869088E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.518 | TFLOPs: 26.97 | +7: iteration 7990/ 21553 | consumed samples: 2045440 | consumed tokens: 4189061120 | elapsed time per iteration (s): 0.18 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.875262E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.268 | TFLOPs: 26.96 | +0: [2023-03-16 23:59:59,204] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00014708123251590496, 0.00014708123251590496, 0.00014708123251590496], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 8000/ 21553 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 0.18 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.871131E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.515 | TFLOPs: 26.97 | +0: steps: 8000 loss: 3.8501 iter time (s): 0.180 samples/sec: 1423.905 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 8000 | lm loss value: 3.956227E+00 | lm loss PPL: 5.225978E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 8000 to checkpoints_83m14b100mdedup +0: [2023-03-16 23:59:59,292] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is begin to save! +0: [2023-03-16 23:59:59,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_01-model_00-model_states.pt... +0: [2023-03-16 23:59:59,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_01-model_00-model_states.pt. +0: [2023-03-16 23:59:59,368] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_03-model_00-model_states.pt... +0: [2023-03-16 23:59:59,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_03-model_00-model_states.pt. +0: [2023-03-16 23:59:59,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_04-model_00-model_states.pt... +0: [2023-03-16 23:59:59,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_04-model_00-model_states.pt. +0: [2023-03-16 23:59:59,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_05-model_00-model_states.pt... +0: [2023-03-16 23:59:59,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_05-model_00-model_states.pt. +0: [2023-03-16 23:59:59,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_06-model_00-model_states.pt... +0: [2023-03-16 23:59:59,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_06-model_00-model_states.pt. +0: [2023-03-16 23:59:59,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_07-model_00-model_states.pt... +0: [2023-03-16 23:59:59,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_07-model_00-model_states.pt. +0: [2023-03-16 23:59:59,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_08-model_00-model_states.pt... +0: [2023-03-16 23:59:59,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_08-model_00-model_states.pt. +0: [2023-03-16 23:59:59,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_09-model_00-model_states.pt... +0: [2023-03-16 23:59:59,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_09-model_00-model_states.pt. +0: [2023-03-16 23:59:59,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_10-model_00-model_states.pt... +0: [2023-03-16 23:59:59,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_10-model_00-model_states.pt. +0: [2023-03-16 23:59:59,458] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_11-model_00-model_states.pt... +0: [2023-03-16 23:59:59,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_11-model_00-model_states.pt. +0: [2023-03-16 23:59:59,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_12-model_00-model_states.pt... +0: [2023-03-16 23:59:59,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_12-model_00-model_states.pt. +0: [2023-03-16 23:59:59,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/layer_14-model_00-model_states.pt... +0: [2023-03-16 23:59:59,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/layer_14-model_00-model_states.pt. +0: [2023-03-16 23:59:59,482] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step8000/mp_rank_00_model_states.pt +0: [2023-03-16 23:59:59,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/mp_rank_00_model_states.pt... +0: [2023-03-16 23:59:59,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/mp_rank_00_model_states.pt. +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +3: [2023-03-16 23:59:59,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-16 23:59:59,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,523] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,523] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,524] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,524] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,529] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,529] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +6: [2023-03-16 23:59:59,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-16 23:59:59,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-16 23:59:59,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-16 23:59:59,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 23:59:59,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: successfully saved checkpoint at iteration 8000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 241.90 +7: iteration 8010/ 21553 | consumed samples: 2050560 | consumed tokens: 4199546880 | elapsed time per iteration (s): 0.21 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.865408E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1198.645 | TFLOPs: 22.90 | +7: iteration 8020/ 21553 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 0.18 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.874432E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.362 | TFLOPs: 27.02 | +7: iteration 8030/ 21553 | consumed samples: 2055680 | consumed tokens: 4210032640 | elapsed time per iteration (s): 0.18 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.876367E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.279 | TFLOPs: 27.00 | +7: iteration 8040/ 21553 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 0.18 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.877640E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.381 | TFLOPs: 27.00 | +7: iteration 8050/ 21553 | consumed samples: 2060800 | consumed tokens: 4220518400 | elapsed time per iteration (s): 0.18 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.874703E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.879 | TFLOPs: 26.98 | +7: iteration 8060/ 21553 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 0.18 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.878427E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.603 | TFLOPs: 27.01 | +7: iteration 8070/ 21553 | consumed samples: 2065920 | consumed tokens: 4231004160 | elapsed time per iteration (s): 0.18 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.872745E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.380 | TFLOPs: 27.02 | +7: iteration 8080/ 21553 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 0.18 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.870894E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.174 | TFLOPs: 26.98 | +7: iteration 8090/ 21553 | consumed samples: 2071040 | consumed tokens: 4241489920 | elapsed time per iteration (s): 0.18 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.870299E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.081 | TFLOPs: 26.98 | +7: iteration 8100/ 21553 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 0.18 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.873534E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.796 | TFLOPs: 26.96 | +7: iteration 8110/ 21553 | consumed samples: 2076160 | consumed tokens: 4251975680 | elapsed time per iteration (s): 0.18 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.869105E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.897 | TFLOPs: 26.94 | +7: iteration 8120/ 21553 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 0.18 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.871013E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.187 | TFLOPs: 26.94 | +7: iteration 8130/ 21553 | consumed samples: 2081280 | consumed tokens: 4262461440 | elapsed time per iteration (s): 0.18 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.869889E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.630 | TFLOPs: 26.99 | +7: iteration 8140/ 21553 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 0.18 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.876064E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.788 | TFLOPs: 26.99 | +7: iteration 8150/ 21553 | consumed samples: 2086400 | consumed tokens: 4272947200 | elapsed time per iteration (s): 0.18 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.872637E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.121 | TFLOPs: 27.00 | +7: iteration 8160/ 21553 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 0.18 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.874320E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.769 | TFLOPs: 27.01 | +7: iteration 8170/ 21553 | consumed samples: 2091520 | consumed tokens: 4283432960 | elapsed time per iteration (s): 0.18 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.869281E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.581 | TFLOPs: 26.76 | +7: iteration 8180/ 21553 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 0.18 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.864671E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.819 | TFLOPs: 26.96 | +7: iteration 8190/ 21553 | consumed samples: 2096640 | consumed tokens: 4293918720 | elapsed time per iteration (s): 0.18 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.873112E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.417 | TFLOPs: 26.93 | +7: iteration 8200/ 21553 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 0.18 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.863743E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.912 | TFLOPs: 26.98 | +7: iteration 8210/ 21553 | consumed samples: 2101760 | consumed tokens: 4304404480 | elapsed time per iteration (s): 0.18 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.868346E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.787 | TFLOPs: 26.99 | +7: iteration 8220/ 21553 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 0.18 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.867227E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.242 | TFLOPs: 26.98 | +7: iteration 8230/ 21553 | consumed samples: 2106880 | consumed tokens: 4314890240 | elapsed time per iteration (s): 0.18 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.865267E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.877 | TFLOPs: 26.99 | +7: iteration 8240/ 21553 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 0.18 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.872705E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.666 | TFLOPs: 27.01 | +7: iteration 8250/ 21553 | consumed samples: 2112000 | consumed tokens: 4325376000 | elapsed time per iteration (s): 0.18 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.867369E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.463 | TFLOPs: 26.99 | +7: iteration 8260/ 21553 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 0.18 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.863477E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.311 | TFLOPs: 26.95 | +7: iteration 8270/ 21553 | consumed samples: 2117120 | consumed tokens: 4335861760 | elapsed time per iteration (s): 0.18 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.860138E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.005 | TFLOPs: 26.98 | +7: iteration 8280/ 21553 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 0.18 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.866445E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.866 | TFLOPs: 26.99 | +7: iteration 8290/ 21553 | consumed samples: 2122240 | consumed tokens: 4346347520 | elapsed time per iteration (s): 0.18 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.862414E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.626 | TFLOPs: 26.99 | +7: iteration 8300/ 21553 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 0.18 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.865364E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.778 | TFLOPs: 26.94 | +7: iteration 8310/ 21553 | consumed samples: 2127360 | consumed tokens: 4356833280 | elapsed time per iteration (s): 0.18 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.871065E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.685 | TFLOPs: 26.97 | +7: iteration 8320/ 21553 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 0.18 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.870499E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.580 | TFLOPs: 26.97 | +7: iteration 8330/ 21553 | consumed samples: 2132480 | consumed tokens: 4367319040 | elapsed time per iteration (s): 0.18 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.866486E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.969 | TFLOPs: 26.96 | +7: iteration 8340/ 21553 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 0.18 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.867033E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.542 | TFLOPs: 26.74 | +7: iteration 8350/ 21553 | consumed samples: 2137600 | consumed tokens: 4377804800 | elapsed time per iteration (s): 0.18 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.861761E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.218 | TFLOPs: 26.98 | +7: iteration 8360/ 21553 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 0.18 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.860591E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.833 | TFLOPs: 26.99 | +7: iteration 8370/ 21553 | consumed samples: 2142720 | consumed tokens: 4388290560 | elapsed time per iteration (s): 0.18 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.856469E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.724 | TFLOPs: 27.01 | +7: iteration 8380/ 21553 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 0.18 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.856451E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.326 | TFLOPs: 27.02 | +7: iteration 8390/ 21553 | consumed samples: 2147840 | consumed tokens: 4398776320 | elapsed time per iteration (s): 0.18 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.861927E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.335 | TFLOPs: 27.02 | +7: iteration 8400/ 21553 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 0.18 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.865413E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.725 | TFLOPs: 27.03 | +7: iteration 8410/ 21553 | consumed samples: 2152960 | consumed tokens: 4409262080 | elapsed time per iteration (s): 0.18 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.864585E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.844 | TFLOPs: 27.03 | +7: iteration 8420/ 21553 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 0.18 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.863827E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.399 | TFLOPs: 27.02 | +7: iteration 8430/ 21553 | consumed samples: 2158080 | consumed tokens: 4419747840 | elapsed time per iteration (s): 0.18 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.857031E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.826 | TFLOPs: 27.01 | +7: iteration 8440/ 21553 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 0.18 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.861952E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.396 | TFLOPs: 27.02 | +7: iteration 8450/ 21553 | consumed samples: 2163200 | consumed tokens: 4430233600 | elapsed time per iteration (s): 0.18 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.866209E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.586 | TFLOPs: 27.03 | +7: iteration 8460/ 21553 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 0.18 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.864991E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.483 | TFLOPs: 27.01 | +7: iteration 8470/ 21553 | consumed samples: 2168320 | consumed tokens: 4440719360 | elapsed time per iteration (s): 0.18 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.854642E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.894 | TFLOPs: 27.00 | +7: iteration 8480/ 21553 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 0.18 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.866743E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.642 | TFLOPs: 26.99 | +7: iteration 8490/ 21553 | consumed samples: 2173440 | consumed tokens: 4451205120 | elapsed time per iteration (s): 0.18 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.860030E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.962 | TFLOPs: 26.98 | +7: iteration 8500/ 21553 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 0.18 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.848616E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.755 | TFLOPs: 26.97 | +7: iteration 8510/ 21553 | consumed samples: 2178560 | consumed tokens: 4461690880 | elapsed time per iteration (s): 0.18 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.855866E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.497 | TFLOPs: 26.66 | +7: iteration 8520/ 21553 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 0.18 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.854435E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.840 | TFLOPs: 26.98 | +7: iteration 8530/ 21553 | consumed samples: 2183680 | consumed tokens: 4472176640 | elapsed time per iteration (s): 0.18 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.855418E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.682 | TFLOPs: 26.97 | +7: iteration 8540/ 21553 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 0.18 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.865953E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.544 | TFLOPs: 26.99 | +7: iteration 8550/ 21553 | consumed samples: 2188800 | consumed tokens: 4482662400 | elapsed time per iteration (s): 0.18 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.849851E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.246 | TFLOPs: 26.98 | +7: iteration 8560/ 21553 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 0.18 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.854525E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.701 | TFLOPs: 26.99 | +7: iteration 8570/ 21553 | consumed samples: 2193920 | consumed tokens: 4493148160 | elapsed time per iteration (s): 0.18 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.852289E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.123 | TFLOPs: 27.02 | +7: iteration 8580/ 21553 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 0.18 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.861813E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.166 | TFLOPs: 27.02 | +7: iteration 8590/ 21553 | consumed samples: 2199040 | consumed tokens: 4503633920 | elapsed time per iteration (s): 0.18 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.855980E+00 | grad norm: 0.555 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.943 | TFLOPs: 27.02 | +7: iteration 8600/ 21553 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 0.18 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.849332E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.555 | TFLOPs: 27.01 | +7: iteration 8610/ 21553 | consumed samples: 2204160 | consumed tokens: 4514119680 | elapsed time per iteration (s): 0.18 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.860989E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.680 | TFLOPs: 27.03 | +7: iteration 8620/ 21553 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 0.18 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.857620E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.336 | TFLOPs: 27.04 | +7: iteration 8630/ 21553 | consumed samples: 2209280 | consumed tokens: 4524605440 | elapsed time per iteration (s): 0.18 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.852576E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.145 | TFLOPs: 27.02 | +7: iteration 8640/ 21553 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 0.18 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.859350E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.458 | TFLOPs: 27.03 | +7: iteration 8650/ 21553 | consumed samples: 2214400 | consumed tokens: 4535091200 | elapsed time per iteration (s): 0.18 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.855058E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.088 | TFLOPs: 27.04 | +7: iteration 8660/ 21553 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 0.18 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.856284E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.292 | TFLOPs: 27.02 | +7: iteration 8670/ 21553 | consumed samples: 2219520 | consumed tokens: 4545576960 | elapsed time per iteration (s): 0.18 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.855261E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.296 | TFLOPs: 27.02 | +7: iteration 8680/ 21553 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 0.18 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.852552E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.723 | TFLOPs: 27.03 | +7: iteration 8690/ 21553 | consumed samples: 2224640 | consumed tokens: 4556062720 | elapsed time per iteration (s): 0.18 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.855023E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.620 | TFLOPs: 27.03 | +7: iteration 8700/ 21553 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 0.18 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.849686E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.108 | TFLOPs: 26.98 | +7: iteration 8710/ 21553 | consumed samples: 2229760 | consumed tokens: 4566548480 | elapsed time per iteration (s): 0.18 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.848901E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.052 | TFLOPs: 26.98 | +7: iteration 8720/ 21553 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 0.18 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.851548E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.483 | TFLOPs: 26.97 | +7: iteration 8730/ 21553 | consumed samples: 2234880 | consumed tokens: 4577034240 | elapsed time per iteration (s): 0.18 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.852925E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.439 | TFLOPs: 26.99 | +7: iteration 8740/ 21553 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 0.18 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.843987E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.306 | TFLOPs: 26.98 | +7: iteration 8750/ 21553 | consumed samples: 2240000 | consumed tokens: 4587520000 | elapsed time per iteration (s): 0.18 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.850390E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.504 | TFLOPs: 27.01 | +7: iteration 8760/ 21553 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 0.18 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.846030E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.342 | TFLOPs: 27.02 | +7: iteration 8770/ 21553 | consumed samples: 2245120 | consumed tokens: 4598005760 | elapsed time per iteration (s): 0.18 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.848554E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.707 | TFLOPs: 27.03 | +7: iteration 8780/ 21553 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 0.18 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.852575E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.666 | TFLOPs: 27.03 | +7: iteration 8790/ 21553 | consumed samples: 2250240 | consumed tokens: 4608491520 | elapsed time per iteration (s): 0.18 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.847691E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.236 | TFLOPs: 27.02 | +7: iteration 8800/ 21553 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 0.18 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.849113E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.804 | TFLOPs: 27.01 | +7: iteration 8810/ 21553 | consumed samples: 2255360 | consumed tokens: 4618977280 | elapsed time per iteration (s): 0.18 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.843451E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.342 | TFLOPs: 27.02 | +7: iteration 8820/ 21553 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 0.18 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.850134E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.517 | TFLOPs: 27.03 | +7: iteration 8830/ 21553 | consumed samples: 2260480 | consumed tokens: 4629463040 | elapsed time per iteration (s): 0.18 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.849067E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.054 | TFLOPs: 27.02 | +7: iteration 8840/ 21553 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 0.18 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.849540E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.575 | TFLOPs: 27.03 | +7: iteration 8850/ 21553 | consumed samples: 2265600 | consumed tokens: 4639948800 | elapsed time per iteration (s): 0.18 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.842328E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.812 | TFLOPs: 27.01 | +7: iteration 8860/ 21553 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 0.18 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.847327E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.710 | TFLOPs: 27.01 | +7: iteration 8870/ 21553 | consumed samples: 2270720 | consumed tokens: 4650434560 | elapsed time per iteration (s): 0.18 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.843762E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.575 | TFLOPs: 27.01 | +7: iteration 8880/ 21553 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 0.18 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.849500E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.597 | TFLOPs: 27.03 | +7: iteration 8890/ 21553 | consumed samples: 2275840 | consumed tokens: 4660920320 | elapsed time per iteration (s): 0.18 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.845926E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.056 | TFLOPs: 27.04 | +7: iteration 8900/ 21553 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 0.18 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.842416E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1401.161 | TFLOPs: 26.77 | +7: iteration 8910/ 21553 | consumed samples: 2280960 | consumed tokens: 4671406080 | elapsed time per iteration (s): 0.18 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.835725E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.426 | TFLOPs: 27.02 | +7: iteration 8920/ 21553 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 0.18 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.845685E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.114 | TFLOPs: 27.02 | +7: iteration 8930/ 21553 | consumed samples: 2286080 | consumed tokens: 4681891840 | elapsed time per iteration (s): 0.18 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.846761E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.329 | TFLOPs: 27.04 | +7: iteration 8940/ 21553 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 0.18 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.838097E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.677 | TFLOPs: 27.03 | +7: iteration 8950/ 21553 | consumed samples: 2291200 | consumed tokens: 4692377600 | elapsed time per iteration (s): 0.18 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.850356E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.155 | TFLOPs: 27.04 | +7: iteration 8960/ 21553 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 0.18 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.848509E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.539 | TFLOPs: 27.05 | +7: iteration 8970/ 21553 | consumed samples: 2296320 | consumed tokens: 4702863360 | elapsed time per iteration (s): 0.18 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.848844E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.610 | TFLOPs: 27.05 | +7: iteration 8980/ 21553 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 0.18 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.849059E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.829 | TFLOPs: 27.03 | +7: iteration 8990/ 21553 | consumed samples: 2301440 | consumed tokens: 4713349120 | elapsed time per iteration (s): 0.18 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.835326E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.756 | TFLOPs: 27.05 | +7: iteration 9000/ 21553 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 0.18 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.847236E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.137 | TFLOPs: 27.04 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 9000 | lm loss value: 3.911940E+00 | lm loss PPL: 4.999584E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 9000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:03:00,806] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step9000 is begin to save! +0: [2023-03-17 00:03:00,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:03:00,882] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:03:00,882] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:03:00,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:03:00,894] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:03:00,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:03:00,905] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:03:00,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:03:00,916] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:03:00,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:03:00,927] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:03:00,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:03:00,938] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:03:00,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:03:00,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:03:00,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:03:00,960] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:03:00,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:03:00,972] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:03:00,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:03:00,983] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:03:00,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:03:00,994] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:03:00,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:03:00,995] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step9000/mp_rank_00_model_states.pt +0: [2023-03-17 00:03:00,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:03:00,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:03:01,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:03:01,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,034] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,041] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:03:01,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,043] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,043] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-17 00:03:01,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:03:01,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +0: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:03:01,045] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:03:01,045] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: successfully saved checkpoint at iteration 9000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 244.08 +7: iteration 9010/ 21553 | consumed samples: 2306560 | consumed tokens: 4723834880 | elapsed time per iteration (s): 0.21 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.832664E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1212.505 | TFLOPs: 23.17 | +7: iteration 9020/ 21553 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 0.18 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.834706E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.156 | TFLOPs: 27.10 | +7: iteration 9030/ 21553 | consumed samples: 2311680 | consumed tokens: 4734320640 | elapsed time per iteration (s): 0.18 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.842598E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.008 | TFLOPs: 27.09 | +7: iteration 9040/ 21553 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 0.18 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.838020E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.119 | TFLOPs: 27.08 | +7: iteration 9050/ 21553 | consumed samples: 2316800 | consumed tokens: 4744806400 | elapsed time per iteration (s): 0.18 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.840412E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.174 | TFLOPs: 27.10 | +7: iteration 9060/ 21553 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 0.18 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.837488E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.047 | TFLOPs: 27.09 | +7: iteration 9070/ 21553 | consumed samples: 2321920 | consumed tokens: 4755292160 | elapsed time per iteration (s): 0.18 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.835757E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.208 | TFLOPs: 27.08 | +7: iteration 9080/ 21553 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 0.18 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.840637E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.902 | TFLOPs: 27.07 | +7: iteration 9090/ 21553 | consumed samples: 2327040 | consumed tokens: 4765777920 | elapsed time per iteration (s): 0.18 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.844939E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.755 | TFLOPs: 27.05 | +7: iteration 9100/ 21553 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 0.18 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.830150E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.463 | TFLOPs: 26.99 | +7: iteration 9110/ 21553 | consumed samples: 2332160 | consumed tokens: 4776263680 | elapsed time per iteration (s): 0.18 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.844085E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.660 | TFLOPs: 27.01 | +7: iteration 9120/ 21553 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 0.18 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.834247E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.594 | TFLOPs: 27.03 | +7: iteration 9130/ 21553 | consumed samples: 2337280 | consumed tokens: 4786749440 | elapsed time per iteration (s): 0.18 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.835044E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.124 | TFLOPs: 27.00 | +7: iteration 9140/ 21553 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 0.18 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.833643E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.685 | TFLOPs: 27.03 | +7: iteration 9150/ 21553 | consumed samples: 2342400 | consumed tokens: 4797235200 | elapsed time per iteration (s): 0.18 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.831524E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.363 | TFLOPs: 27.04 | +7: iteration 9160/ 21553 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 0.20 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.833517E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1269.792 | TFLOPs: 24.26 | +7: iteration 9170/ 21553 | consumed samples: 2347520 | consumed tokens: 4807720960 | elapsed time per iteration (s): 0.18 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.842569E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.366 | TFLOPs: 27.06 | +7: iteration 9180/ 21553 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 0.18 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.831392E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.776 | TFLOPs: 27.01 | +7: iteration 9190/ 21553 | consumed samples: 2352640 | consumed tokens: 4818206720 | elapsed time per iteration (s): 0.18 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.837300E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.511 | TFLOPs: 27.05 | +7: iteration 9200/ 21553 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 0.18 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.829830E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.284 | TFLOPs: 27.06 | +7: iteration 9210/ 21553 | consumed samples: 2357760 | consumed tokens: 4828692480 | elapsed time per iteration (s): 0.18 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.834437E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.364 | TFLOPs: 27.00 | +7: iteration 9220/ 21553 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 0.18 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.830891E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.639 | TFLOPs: 27.01 | +7: iteration 9230/ 21553 | consumed samples: 2362880 | consumed tokens: 4839178240 | elapsed time per iteration (s): 0.18 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.834440E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.294 | TFLOPs: 27.00 | +7: iteration 9240/ 21553 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 0.18 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.834685E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.993 | TFLOPs: 27.02 | +7: iteration 9250/ 21553 | consumed samples: 2368000 | consumed tokens: 4849664000 | elapsed time per iteration (s): 0.18 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.831294E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.969 | TFLOPs: 27.00 | +7: iteration 9260/ 21553 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 0.18 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.831149E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.020 | TFLOPs: 27.02 | +7: iteration 9270/ 21553 | consumed samples: 2373120 | consumed tokens: 4860149760 | elapsed time per iteration (s): 0.18 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.836967E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.919 | TFLOPs: 27.01 | +7: iteration 9280/ 21553 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 0.18 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.833618E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.986 | TFLOPs: 27.00 | +7: iteration 9290/ 21553 | consumed samples: 2378240 | consumed tokens: 4870635520 | elapsed time per iteration (s): 0.18 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.831361E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.565 | TFLOPs: 27.01 | +7: iteration 9300/ 21553 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 0.18 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.835251E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.179 | TFLOPs: 27.00 | +7: iteration 9310/ 21553 | consumed samples: 2383360 | consumed tokens: 4881121280 | elapsed time per iteration (s): 0.18 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.822535E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.278 | TFLOPs: 27.00 | +7: iteration 9320/ 21553 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 0.18 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.829258E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.202 | TFLOPs: 26.98 | +7: iteration 9330/ 21553 | consumed samples: 2388480 | consumed tokens: 4891607040 | elapsed time per iteration (s): 0.18 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.835351E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.559 | TFLOPs: 26.99 | +7: iteration 9340/ 21553 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 0.18 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.829917E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.738 | TFLOPs: 26.99 | +7: iteration 9350/ 21553 | consumed samples: 2393600 | consumed tokens: 4902092800 | elapsed time per iteration (s): 0.18 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.830701E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.853 | TFLOPs: 26.96 | +7: iteration 9360/ 21553 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 0.18 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.822427E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.233 | TFLOPs: 26.98 | +7: iteration 9370/ 21553 | consumed samples: 2398720 | consumed tokens: 4912578560 | elapsed time per iteration (s): 0.18 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.829994E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.467 | TFLOPs: 27.01 | +7: iteration 9380/ 21553 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 0.18 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.825418E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.860 | TFLOPs: 26.99 | +7: iteration 9390/ 21553 | consumed samples: 2403840 | consumed tokens: 4923064320 | elapsed time per iteration (s): 0.18 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.828693E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.155 | TFLOPs: 26.98 | +7: iteration 9400/ 21553 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 0.18 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.822712E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.638 | TFLOPs: 26.99 | +7: iteration 9410/ 21553 | consumed samples: 2408960 | consumed tokens: 4933550080 | elapsed time per iteration (s): 0.18 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.832921E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.420 | TFLOPs: 26.99 | +7: iteration 9420/ 21553 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 0.18 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.834659E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.020 | TFLOPs: 27.02 | +7: iteration 9430/ 21553 | consumed samples: 2414080 | consumed tokens: 4944035840 | elapsed time per iteration (s): 0.18 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.835534E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.347 | TFLOPs: 27.00 | +7: iteration 9440/ 21553 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 0.18 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.839718E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.308 | TFLOPs: 27.00 | +7: iteration 9450/ 21553 | consumed samples: 2419200 | consumed tokens: 4954521600 | elapsed time per iteration (s): 0.18 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.834323E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.686 | TFLOPs: 26.99 | +7: iteration 9460/ 21553 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 0.18 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.821355E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.978 | TFLOPs: 27.00 | +7: iteration 9470/ 21553 | consumed samples: 2424320 | consumed tokens: 4965007360 | elapsed time per iteration (s): 0.18 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.831082E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.264 | TFLOPs: 26.98 | +7: iteration 9480/ 21553 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 0.18 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.830397E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.670 | TFLOPs: 27.01 | +7: iteration 9490/ 21553 | consumed samples: 2429440 | consumed tokens: 4975493120 | elapsed time per iteration (s): 0.18 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.827047E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.412 | TFLOPs: 27.01 | +7: iteration 9500/ 21553 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 0.18 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.827867E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.144 | TFLOPs: 27.00 | +7: iteration 9510/ 21553 | consumed samples: 2434560 | consumed tokens: 4985978880 | elapsed time per iteration (s): 0.18 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.821171E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.628 | TFLOPs: 26.99 | +7: iteration 9520/ 21553 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 0.18 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.829413E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.729 | TFLOPs: 26.99 | +7: iteration 9530/ 21553 | consumed samples: 2439680 | consumed tokens: 4996464640 | elapsed time per iteration (s): 0.18 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.822871E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.751 | TFLOPs: 27.01 | +7: iteration 9540/ 21553 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 0.18 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.824207E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.702 | TFLOPs: 26.99 | +7: iteration 9550/ 21553 | consumed samples: 2444800 | consumed tokens: 5006950400 | elapsed time per iteration (s): 0.18 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.825214E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.614 | TFLOPs: 27.01 | +7: iteration 9560/ 21553 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 0.18 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.822968E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.519 | TFLOPs: 26.97 | +7: iteration 9570/ 21553 | consumed samples: 2449920 | consumed tokens: 5017436160 | elapsed time per iteration (s): 0.18 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.825642E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.394 | TFLOPs: 26.99 | +7: iteration 9580/ 21553 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 0.18 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.831968E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.884 | TFLOPs: 26.98 | +7: iteration 9590/ 21553 | consumed samples: 2455040 | consumed tokens: 5027921920 | elapsed time per iteration (s): 0.18 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.829220E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.096 | TFLOPs: 26.98 | +7: iteration 9600/ 21553 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 0.18 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.827338E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.495 | TFLOPs: 26.97 | +7: iteration 9610/ 21553 | consumed samples: 2460160 | consumed tokens: 5038407680 | elapsed time per iteration (s): 0.18 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.824955E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.812 | TFLOPs: 27.01 | +7: iteration 9620/ 21553 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 0.18 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.822071E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.319 | TFLOPs: 27.06 | +7: iteration 9630/ 21553 | consumed samples: 2465280 | consumed tokens: 5048893440 | elapsed time per iteration (s): 0.18 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.817826E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.243 | TFLOPs: 27.10 | +7: iteration 9640/ 21553 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 0.18 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.823986E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.130 | TFLOPs: 27.06 | +7: iteration 9650/ 21553 | consumed samples: 2470400 | consumed tokens: 5059379200 | elapsed time per iteration (s): 0.18 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.821406E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.098 | TFLOPs: 27.06 | +7: iteration 9660/ 21553 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 0.18 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.824346E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.552 | TFLOPs: 27.05 | +7: iteration 9670/ 21553 | consumed samples: 2475520 | consumed tokens: 5069864960 | elapsed time per iteration (s): 0.18 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.821485E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.686 | TFLOPs: 27.05 | +7: iteration 9680/ 21553 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 0.18 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.824321E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.932 | TFLOPs: 27.05 | +7: iteration 9690/ 21553 | consumed samples: 2480640 | consumed tokens: 5080350720 | elapsed time per iteration (s): 0.18 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.818449E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.882 | TFLOPs: 27.07 | +7: iteration 9700/ 21553 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 0.18 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.830566E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.357 | TFLOPs: 27.08 | +7: iteration 9710/ 21553 | consumed samples: 2485760 | consumed tokens: 5090836480 | elapsed time per iteration (s): 0.18 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.827902E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.722 | TFLOPs: 27.07 | +7: iteration 9720/ 21553 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 0.18 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.823536E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.350 | TFLOPs: 27.06 | +7: iteration 9730/ 21553 | consumed samples: 2490880 | consumed tokens: 5101322240 | elapsed time per iteration (s): 0.18 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.818879E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.821 | TFLOPs: 26.99 | +7: iteration 9740/ 21553 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 0.18 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.819161E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.351 | TFLOPs: 27.00 | +7: iteration 9750/ 21553 | consumed samples: 2496000 | consumed tokens: 5111808000 | elapsed time per iteration (s): 0.18 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.823794E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.376 | TFLOPs: 27.00 | +7: iteration 9760/ 21553 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 0.18 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.817013E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.256 | TFLOPs: 26.98 | +7: iteration 9770/ 21553 | consumed samples: 2501120 | consumed tokens: 5122293760 | elapsed time per iteration (s): 0.18 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.818649E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.369 | TFLOPs: 26.99 | +7: iteration 9780/ 21553 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 0.18 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.818102E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.476 | TFLOPs: 27.04 | +7: iteration 9790/ 21553 | consumed samples: 2506240 | consumed tokens: 5132779520 | elapsed time per iteration (s): 0.18 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.817049E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.143 | TFLOPs: 27.00 | +7: iteration 9800/ 21553 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 0.18 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.816624E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.649 | TFLOPs: 26.99 | +7: iteration 9810/ 21553 | consumed samples: 2511360 | consumed tokens: 5143265280 | elapsed time per iteration (s): 0.18 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.821672E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.236 | TFLOPs: 26.98 | +7: iteration 9820/ 21553 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 0.18 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.820866E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.453 | TFLOPs: 26.99 | +7: iteration 9830/ 21553 | consumed samples: 2516480 | consumed tokens: 5153751040 | elapsed time per iteration (s): 0.18 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.817392E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.468 | TFLOPs: 26.99 | +7: iteration 9840/ 21553 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 0.18 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.824363E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.713 | TFLOPs: 27.05 | +7: iteration 9850/ 21553 | consumed samples: 2521600 | consumed tokens: 5164236800 | elapsed time per iteration (s): 0.18 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.816103E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.717 | TFLOPs: 27.05 | +7: iteration 9860/ 21553 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 0.18 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.828194E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.354 | TFLOPs: 27.06 | +7: iteration 9870/ 21553 | consumed samples: 2526720 | consumed tokens: 5174722560 | elapsed time per iteration (s): 0.18 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.817870E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.380 | TFLOPs: 27.06 | +7: iteration 9880/ 21553 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 0.18 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.814933E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.657 | TFLOPs: 27.01 | +7: iteration 9890/ 21553 | consumed samples: 2531840 | consumed tokens: 5185208320 | elapsed time per iteration (s): 0.18 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.808755E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.304 | TFLOPs: 26.96 | +7: iteration 9900/ 21553 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 0.18 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.820992E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.836 | TFLOPs: 27.05 | +7: iteration 9910/ 21553 | consumed samples: 2536960 | consumed tokens: 5195694080 | elapsed time per iteration (s): 0.18 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.815655E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.395 | TFLOPs: 27.06 | +7: iteration 9920/ 21553 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 0.18 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.811988E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.537 | TFLOPs: 27.06 | +7: iteration 9930/ 21553 | consumed samples: 2542080 | consumed tokens: 5206179840 | elapsed time per iteration (s): 0.18 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.815637E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.342 | TFLOPs: 27.02 | +7: iteration 9940/ 21553 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 0.18 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.813412E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.565 | TFLOPs: 26.97 | +7: iteration 9950/ 21553 | consumed samples: 2547200 | consumed tokens: 5216665600 | elapsed time per iteration (s): 0.18 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.818687E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.233 | TFLOPs: 27.00 | +7: iteration 9960/ 21553 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 0.18 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.821052E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.490 | TFLOPs: 26.99 | +7: iteration 9970/ 21553 | consumed samples: 2552320 | consumed tokens: 5227151360 | elapsed time per iteration (s): 0.18 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.818291E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.612 | TFLOPs: 26.97 | +7: iteration 9980/ 21553 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 0.18 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.801509E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.842 | TFLOPs: 26.98 | +7: iteration 9990/ 21553 | consumed samples: 2557440 | consumed tokens: 5237637120 | elapsed time per iteration (s): 0.18 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.812561E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.432 | TFLOPs: 26.95 | +0: [2023-03-17 00:06:02,291] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00012168458711439383, 0.00012168458711439383, 0.00012168458711439383], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 10000/ 21553 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 0.18 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.816633E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.277 | TFLOPs: 26.95 | +0: steps: 10000 loss: 3.8082 iter time (s): 0.179 samples/sec: 1426.433 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 10000 | lm loss value: 3.866740E+00 | lm loss PPL: 4.778637E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 10000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:06:02,380] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! +0: [2023-03-17 00:06:02,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:06:02,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:06:02,455] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:06:02,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:06:02,467] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:06:02,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:06:02,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:06:02,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:06:02,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:06:02,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:06:02,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:06:02,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:06:02,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:06:02,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:06:02,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:06:02,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:06:02,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:06:02,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:06:02,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:06:02,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:06:02,559] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:06:02,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:06:02,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:06:02,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:06:02,572] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step10000/mp_rank_00_model_states.pt +0: [2023-03-17 00:06:02,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:06:02,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:06:02,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:06:02,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:06:02,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:06:02,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: successfully saved checkpoint at iteration 10000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 244.90 +7: iteration 10010/ 21553 | consumed samples: 2562560 | consumed tokens: 5248122880 | elapsed time per iteration (s): 0.21 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.809309E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1195.139 | TFLOPs: 22.83 | +7: iteration 10020/ 21553 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 0.18 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.809313E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.412 | TFLOPs: 26.97 | +7: iteration 10030/ 21553 | consumed samples: 2567680 | consumed tokens: 5258608640 | elapsed time per iteration (s): 0.18 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.816051E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.552 | TFLOPs: 26.91 | +7: iteration 10040/ 21553 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 0.18 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.810766E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.379 | TFLOPs: 26.97 | +7: iteration 10050/ 21553 | consumed samples: 2572800 | consumed tokens: 5269094400 | elapsed time per iteration (s): 0.18 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.818050E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.298 | TFLOPs: 26.98 | +7: iteration 10060/ 21553 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 0.18 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.807855E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.269 | TFLOPs: 26.98 | +7: iteration 10070/ 21553 | consumed samples: 2577920 | consumed tokens: 5279580160 | elapsed time per iteration (s): 0.18 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.811974E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.472 | TFLOPs: 26.95 | +7: iteration 10080/ 21553 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 0.18 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.809799E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.189 | TFLOPs: 26.92 | +7: iteration 10090/ 21553 | consumed samples: 2583040 | consumed tokens: 5290065920 | elapsed time per iteration (s): 0.19 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.818488E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.877 | TFLOPs: 26.15 | +7: iteration 10100/ 21553 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 0.18 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.808443E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.551 | TFLOPs: 26.99 | +7: iteration 10110/ 21553 | consumed samples: 2588160 | consumed tokens: 5300551680 | elapsed time per iteration (s): 0.18 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.810194E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.909 | TFLOPs: 26.96 | +7: iteration 10120/ 21553 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 0.18 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.807877E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.683 | TFLOPs: 26.97 | +7: iteration 10130/ 21553 | consumed samples: 2593280 | consumed tokens: 5311037440 | elapsed time per iteration (s): 0.18 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.808747E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.266 | TFLOPs: 26.96 | +7: iteration 10140/ 21553 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 0.18 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.808132E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.816 | TFLOPs: 26.92 | +7: iteration 10150/ 21553 | consumed samples: 2598400 | consumed tokens: 5321523200 | elapsed time per iteration (s): 0.18 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.816773E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.340 | TFLOPs: 26.93 | +7: iteration 10160/ 21553 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 0.18 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.801268E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.336 | TFLOPs: 26.93 | +7: iteration 10170/ 21553 | consumed samples: 2603520 | consumed tokens: 5332008960 | elapsed time per iteration (s): 0.18 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.812636E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.719 | TFLOPs: 26.92 | +7: iteration 10180/ 21553 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 0.18 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.806555E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.584 | TFLOPs: 26.68 | +7: iteration 10190/ 21553 | consumed samples: 2608640 | consumed tokens: 5342494720 | elapsed time per iteration (s): 0.18 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.806904E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.917 | TFLOPs: 26.94 | +7: iteration 10200/ 21553 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 0.18 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.806507E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.397 | TFLOPs: 26.95 | +7: iteration 10210/ 21553 | consumed samples: 2613760 | consumed tokens: 5352980480 | elapsed time per iteration (s): 0.20 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.805508E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1266.124 | TFLOPs: 24.19 | +7: iteration 10220/ 21553 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 0.18 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.809327E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.007 | TFLOPs: 26.86 | +7: iteration 10230/ 21553 | consumed samples: 2618880 | consumed tokens: 5363466240 | elapsed time per iteration (s): 0.18 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.810188E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.796 | TFLOPs: 26.84 | +7: iteration 10240/ 21553 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 0.18 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.804802E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.433 | TFLOPs: 26.81 | +7: iteration 10250/ 21553 | consumed samples: 2624000 | consumed tokens: 5373952000 | elapsed time per iteration (s): 0.18 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.801353E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.326 | TFLOPs: 26.83 | +7: iteration 10260/ 21553 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 0.18 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.811743E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.100 | TFLOPs: 26.81 | +7: iteration 10270/ 21553 | consumed samples: 2629120 | consumed tokens: 5384437760 | elapsed time per iteration (s): 0.18 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.808400E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.500 | TFLOPs: 26.85 | +7: iteration 10280/ 21553 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 0.18 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.803724E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.089 | TFLOPs: 26.87 | +7: iteration 10290/ 21553 | consumed samples: 2634240 | consumed tokens: 5394923520 | elapsed time per iteration (s): 0.18 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.799515E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.345 | TFLOPs: 26.87 | +7: iteration 10300/ 21553 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 0.18 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.802465E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.979 | TFLOPs: 26.84 | +7: iteration 10310/ 21553 | consumed samples: 2639360 | consumed tokens: 5405409280 | elapsed time per iteration (s): 0.18 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.802475E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.724 | TFLOPs: 26.86 | +7: iteration 10320/ 21553 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 0.18 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.809153E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.565 | TFLOPs: 26.86 | +7: iteration 10330/ 21553 | consumed samples: 2644480 | consumed tokens: 5415895040 | elapsed time per iteration (s): 0.18 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.798709E+00 | grad norm: 0.505 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.580 | TFLOPs: 26.86 | +7: iteration 10340/ 21553 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 0.18 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.805416E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.102 | TFLOPs: 26.87 | +7: iteration 10350/ 21553 | consumed samples: 2649600 | consumed tokens: 5426380800 | elapsed time per iteration (s): 0.18 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.801456E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.596 | TFLOPs: 26.86 | +7: iteration 10360/ 21553 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 0.18 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.813440E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.055 | TFLOPs: 26.94 | +7: iteration 10370/ 21553 | consumed samples: 2654720 | consumed tokens: 5436866560 | elapsed time per iteration (s): 0.18 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.803633E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.064 | TFLOPs: 26.92 | +7: iteration 10380/ 21553 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 0.18 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.805581E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.612 | TFLOPs: 26.93 | +7: iteration 10390/ 21553 | consumed samples: 2659840 | consumed tokens: 5447352320 | elapsed time per iteration (s): 0.18 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.800896E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.133 | TFLOPs: 26.92 | +7: iteration 10400/ 21553 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 0.18 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.801085E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.172 | TFLOPs: 26.96 | +7: iteration 10410/ 21553 | consumed samples: 2664960 | consumed tokens: 5457838080 | elapsed time per iteration (s): 0.18 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.797641E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.811 | TFLOPs: 26.97 | +7: iteration 10420/ 21553 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 0.18 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.807015E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.387 | TFLOPs: 26.99 | +7: iteration 10430/ 21553 | consumed samples: 2670080 | consumed tokens: 5468323840 | elapsed time per iteration (s): 0.18 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.795259E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.477 | TFLOPs: 26.99 | +7: iteration 10440/ 21553 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 0.18 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.800215E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.199 | TFLOPs: 26.98 | +7: iteration 10450/ 21553 | consumed samples: 2675200 | consumed tokens: 5478809600 | elapsed time per iteration (s): 0.18 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.802251E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.155 | TFLOPs: 27.00 | +7: iteration 10460/ 21553 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 0.18 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.797132E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.706 | TFLOPs: 26.97 | +7: iteration 10470/ 21553 | consumed samples: 2680320 | consumed tokens: 5489295360 | elapsed time per iteration (s): 0.18 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.800391E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.957 | TFLOPs: 26.98 | +7: iteration 10480/ 21553 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 0.18 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.804532E+00 | grad norm: 0.508 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.866 | TFLOPs: 26.98 | +7: iteration 10490/ 21553 | consumed samples: 2685440 | consumed tokens: 5499781120 | elapsed time per iteration (s): 0.18 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.797264E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.443 | TFLOPs: 26.99 | +7: iteration 10500/ 21553 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 0.18 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.789823E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.858 | TFLOPs: 26.96 | +7: iteration 10510/ 21553 | consumed samples: 2690560 | consumed tokens: 5510266880 | elapsed time per iteration (s): 0.18 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.793665E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.501 | TFLOPs: 26.97 | +7: iteration 10520/ 21553 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 0.18 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.801100E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.374 | TFLOPs: 26.97 | +7: iteration 10530/ 21553 | consumed samples: 2695680 | consumed tokens: 5520752640 | elapsed time per iteration (s): 0.18 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.803490E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.036 | TFLOPs: 26.98 | +7: iteration 10540/ 21553 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 0.18 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.796761E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.858 | TFLOPs: 26.98 | +7: iteration 10550/ 21553 | consumed samples: 2700800 | consumed tokens: 5531238400 | elapsed time per iteration (s): 0.18 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.801986E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.311 | TFLOPs: 26.98 | +7: iteration 10560/ 21553 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 0.18 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.797210E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.955 | TFLOPs: 26.98 | +7: iteration 10570/ 21553 | consumed samples: 2705920 | consumed tokens: 5541724160 | elapsed time per iteration (s): 0.18 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.798990E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.663 | TFLOPs: 26.99 | +7: iteration 10580/ 21553 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 0.18 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.801059E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.993 | TFLOPs: 26.98 | +7: iteration 10590/ 21553 | consumed samples: 2711040 | consumed tokens: 5552209920 | elapsed time per iteration (s): 0.18 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.795784E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.614 | TFLOPs: 26.99 | +7: iteration 10600/ 21553 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 0.18 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.795592E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.556 | TFLOPs: 26.97 | +7: iteration 10610/ 21553 | consumed samples: 2716160 | consumed tokens: 5562695680 | elapsed time per iteration (s): 0.18 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.798831E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.745 | TFLOPs: 27.09 | +7: iteration 10620/ 21553 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 0.18 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.801500E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.185 | TFLOPs: 27.08 | +7: iteration 10630/ 21553 | consumed samples: 2721280 | consumed tokens: 5573181440 | elapsed time per iteration (s): 0.18 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.790297E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.980 | TFLOPs: 27.07 | +7: iteration 10640/ 21553 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 0.18 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.797595E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.278 | TFLOPs: 27.08 | +7: iteration 10650/ 21553 | consumed samples: 2726400 | consumed tokens: 5583667200 | elapsed time per iteration (s): 0.18 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.802939E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.702 | TFLOPs: 27.09 | +7: iteration 10660/ 21553 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 0.18 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.797335E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.525 | TFLOPs: 27.08 | +7: iteration 10670/ 21553 | consumed samples: 2731520 | consumed tokens: 5594152960 | elapsed time per iteration (s): 0.18 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.795805E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.869 | TFLOPs: 27.07 | +7: iteration 10680/ 21553 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 0.18 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.799494E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.876 | TFLOPs: 26.84 | +7: iteration 10690/ 21553 | consumed samples: 2736640 | consumed tokens: 5604638720 | elapsed time per iteration (s): 0.18 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.799895E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.181 | TFLOPs: 27.08 | +7: iteration 10700/ 21553 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 0.18 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.790184E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.362 | TFLOPs: 27.08 | +7: iteration 10710/ 21553 | consumed samples: 2741760 | consumed tokens: 5615124480 | elapsed time per iteration (s): 0.18 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.792361E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.189 | TFLOPs: 27.08 | +7: iteration 10720/ 21553 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 0.18 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.793798E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.406 | TFLOPs: 27.08 | +7: iteration 10730/ 21553 | consumed samples: 2746880 | consumed tokens: 5625610240 | elapsed time per iteration (s): 0.18 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.800057E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.101 | TFLOPs: 26.75 | +7: iteration 10740/ 21553 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 0.18 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.788149E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.037 | TFLOPs: 27.07 | +7: iteration 10750/ 21553 | consumed samples: 2752000 | consumed tokens: 5636096000 | elapsed time per iteration (s): 0.18 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.796811E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.742 | TFLOPs: 27.07 | +7: iteration 10760/ 21553 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 0.18 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.790654E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.381 | TFLOPs: 27.08 | +7: iteration 10770/ 21553 | consumed samples: 2757120 | consumed tokens: 5646581760 | elapsed time per iteration (s): 0.18 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.793827E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.333 | TFLOPs: 27.08 | +7: iteration 10780/ 21553 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 0.18 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.796652E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.714 | TFLOPs: 27.09 | +7: iteration 10790/ 21553 | consumed samples: 2762240 | consumed tokens: 5657067520 | elapsed time per iteration (s): 0.18 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.790504E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.912 | TFLOPs: 27.09 | +7: iteration 10800/ 21553 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 0.18 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.796503E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.170 | TFLOPs: 27.10 | +7: iteration 10810/ 21553 | consumed samples: 2767360 | consumed tokens: 5667553280 | elapsed time per iteration (s): 0.18 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.795068E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.253 | TFLOPs: 27.06 | +7: iteration 10820/ 21553 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 0.18 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.791533E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.136 | TFLOPs: 27.04 | +7: iteration 10830/ 21553 | consumed samples: 2772480 | consumed tokens: 5678039040 | elapsed time per iteration (s): 0.18 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.794081E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.632 | TFLOPs: 27.03 | +7: iteration 10840/ 21553 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 0.18 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.794014E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.745 | TFLOPs: 27.03 | +7: iteration 10850/ 21553 | consumed samples: 2777600 | consumed tokens: 5688524800 | elapsed time per iteration (s): 0.18 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.801013E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.047 | TFLOPs: 27.07 | +7: iteration 10860/ 21553 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 0.18 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.790952E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.852 | TFLOPs: 26.71 | +7: iteration 10870/ 21553 | consumed samples: 2782720 | consumed tokens: 5699010560 | elapsed time per iteration (s): 0.18 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.790842E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.623 | TFLOPs: 26.72 | +7: iteration 10880/ 21553 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 0.19 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.792907E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1346.925 | TFLOPs: 25.73 | +7: iteration 10890/ 21553 | consumed samples: 2787840 | consumed tokens: 5709496320 | elapsed time per iteration (s): 0.18 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.784101E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.443 | TFLOPs: 27.06 | +7: iteration 10900/ 21553 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 0.18 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.796276E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.109 | TFLOPs: 27.06 | +7: iteration 10910/ 21553 | consumed samples: 2792960 | consumed tokens: 5719982080 | elapsed time per iteration (s): 0.18 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.786841E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.089 | TFLOPs: 27.06 | +7: iteration 10920/ 21553 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 0.18 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.801781E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.332 | TFLOPs: 27.04 | +7: iteration 10930/ 21553 | consumed samples: 2798080 | consumed tokens: 5730467840 | elapsed time per iteration (s): 0.18 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.789534E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.436 | TFLOPs: 27.06 | +7: iteration 10940/ 21553 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 0.18 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.789060E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.082 | TFLOPs: 27.08 | +7: iteration 10950/ 21553 | consumed samples: 2803200 | consumed tokens: 5740953600 | elapsed time per iteration (s): 0.18 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.799690E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.404 | TFLOPs: 27.06 | +7: iteration 10960/ 21553 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 0.18 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.789447E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.617 | TFLOPs: 27.09 | +7: iteration 10970/ 21553 | consumed samples: 2808320 | consumed tokens: 5751439360 | elapsed time per iteration (s): 0.18 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.794097E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.001 | TFLOPs: 27.07 | +7: iteration 10980/ 21553 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 0.18 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.794600E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.521 | TFLOPs: 27.06 | +7: iteration 10990/ 21553 | consumed samples: 2813440 | consumed tokens: 5761925120 | elapsed time per iteration (s): 0.18 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.792668E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.947 | TFLOPs: 27.07 | +7: iteration 11000/ 21553 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 0.18 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.781402E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.403 | TFLOPs: 27.06 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 11000 | lm loss value: 3.955650E+00 | lm loss PPL: 5.222965E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 11000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:09:04,380] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step11000 is begin to save! +0: [2023-03-17 00:09:04,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:09:04,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:09:04,458] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:09:04,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:09:04,470] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:09:04,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:09:04,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:09:04,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:09:04,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:09:04,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:09:04,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:09:04,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:09:04,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:09:04,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:09:04,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:09:04,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:09:04,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:09:04,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:09:04,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:09:04,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:09:04,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:09:04,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:09:04,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:09:04,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:09:04,572] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step11000/mp_rank_00_model_states.pt +0: [2023-03-17 00:09:04,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:09:04,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:09:04,591] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:09:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-17 00:09:04,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-17 00:09:04,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:09:04,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:09:04,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: successfully saved checkpoint at iteration 11000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 244.48 +7: iteration 11010/ 21553 | consumed samples: 2818560 | consumed tokens: 5772410880 | elapsed time per iteration (s): 0.21 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.793175E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1211.879 | TFLOPs: 23.15 | +7: iteration 11020/ 21553 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 0.18 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.790779E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.398 | TFLOPs: 27.08 | +7: iteration 11030/ 21553 | consumed samples: 2823680 | consumed tokens: 5782896640 | elapsed time per iteration (s): 0.18 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.786737E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.172 | TFLOPs: 27.08 | +7: iteration 11040/ 21553 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 0.18 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.791468E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.276 | TFLOPs: 27.08 | +7: iteration 11050/ 21553 | consumed samples: 2828800 | consumed tokens: 5793382400 | elapsed time per iteration (s): 0.18 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.787654E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.680 | TFLOPs: 27.09 | +7: iteration 11060/ 21553 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 0.18 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.786224E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.616 | TFLOPs: 27.09 | +7: iteration 11070/ 21553 | consumed samples: 2833920 | consumed tokens: 5803868160 | elapsed time per iteration (s): 0.18 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.791344E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.385 | TFLOPs: 27.08 | +7: iteration 11080/ 21553 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 0.18 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.786113E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.821 | TFLOPs: 27.09 | +7: iteration 11090/ 21553 | consumed samples: 2839040 | consumed tokens: 5814353920 | elapsed time per iteration (s): 0.18 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.785999E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.254 | TFLOPs: 27.08 | +7: iteration 11100/ 21553 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 0.18 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.786551E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.974 | TFLOPs: 27.09 | +7: iteration 11110/ 21553 | consumed samples: 2844160 | consumed tokens: 5824839680 | elapsed time per iteration (s): 0.18 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.782141E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.759 | TFLOPs: 27.05 | +7: iteration 11120/ 21553 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 0.18 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.781773E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.482 | TFLOPs: 27.04 | +7: iteration 11130/ 21553 | consumed samples: 2849280 | consumed tokens: 5835325440 | elapsed time per iteration (s): 0.18 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.788828E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.950 | TFLOPs: 27.05 | +7: iteration 11140/ 21553 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 0.18 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.783025E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.061 | TFLOPs: 27.06 | +7: iteration 11150/ 21553 | consumed samples: 2854400 | consumed tokens: 5845811200 | elapsed time per iteration (s): 0.18 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.795464E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.441 | TFLOPs: 27.02 | +7: iteration 11160/ 21553 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 0.18 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.792350E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.090 | TFLOPs: 27.06 | +7: iteration 11170/ 21553 | consumed samples: 2859520 | consumed tokens: 5856296960 | elapsed time per iteration (s): 0.18 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.788996E+00 | grad norm: 0.489 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.334 | TFLOPs: 27.04 | +7: iteration 11180/ 21553 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 0.18 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.786270E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.905 | TFLOPs: 26.94 | +7: iteration 11190/ 21553 | consumed samples: 2864640 | consumed tokens: 5866782720 | elapsed time per iteration (s): 0.18 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.778955E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.911 | TFLOPs: 26.77 | +7: iteration 11200/ 21553 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 0.18 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.784835E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.074 | TFLOPs: 26.98 | +7: iteration 11210/ 21553 | consumed samples: 2869760 | consumed tokens: 5877268480 | elapsed time per iteration (s): 0.18 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.783806E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.907 | TFLOPs: 27.05 | +7: iteration 11220/ 21553 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 0.18 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.781672E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.617 | TFLOPs: 27.07 | +7: iteration 11230/ 21553 | consumed samples: 2874880 | consumed tokens: 5887754240 | elapsed time per iteration (s): 0.18 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.780808E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.940 | TFLOPs: 27.07 | +7: iteration 11240/ 21553 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 0.18 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.784714E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.412 | TFLOPs: 27.06 | +7: iteration 11250/ 21553 | consumed samples: 2880000 | consumed tokens: 5898240000 | elapsed time per iteration (s): 0.18 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.778915E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.349 | TFLOPs: 27.06 | +7: iteration 11260/ 21553 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 0.18 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.780835E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.117 | TFLOPs: 27.08 | +7: iteration 11270/ 21553 | consumed samples: 2885120 | consumed tokens: 5908725760 | elapsed time per iteration (s): 0.18 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.788000E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.853 | TFLOPs: 27.09 | +7: iteration 11280/ 21553 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 0.18 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.795211E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.001 | TFLOPs: 27.09 | +7: iteration 11290/ 21553 | consumed samples: 2890240 | consumed tokens: 5919211520 | elapsed time per iteration (s): 0.18 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.793130E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.276 | TFLOPs: 27.08 | +7: iteration 11300/ 21553 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 0.18 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.786359E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.040 | TFLOPs: 27.07 | +7: iteration 11310/ 21553 | consumed samples: 2895360 | consumed tokens: 5929697280 | elapsed time per iteration (s): 0.18 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.776749E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.290 | TFLOPs: 27.08 | +7: iteration 11320/ 21553 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 0.18 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.789487E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.537 | TFLOPs: 27.08 | +7: iteration 11330/ 21553 | consumed samples: 2900480 | consumed tokens: 5940183040 | elapsed time per iteration (s): 0.18 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.782156E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.306 | TFLOPs: 27.06 | +7: iteration 11340/ 21553 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 0.18 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.782838E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.662 | TFLOPs: 27.09 | +7: iteration 11350/ 21553 | consumed samples: 2905600 | consumed tokens: 5950668800 | elapsed time per iteration (s): 0.18 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.783542E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.518 | TFLOPs: 27.08 | +7: iteration 11360/ 21553 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 0.18 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.778414E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.051 | TFLOPs: 27.09 | +7: iteration 11370/ 21553 | consumed samples: 2910720 | consumed tokens: 5961154560 | elapsed time per iteration (s): 0.18 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.782366E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.720 | TFLOPs: 27.05 | +7: iteration 11380/ 21553 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 0.18 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.772408E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.132 | TFLOPs: 27.04 | +7: iteration 11390/ 21553 | consumed samples: 2915840 | consumed tokens: 5971640320 | elapsed time per iteration (s): 0.18 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.775911E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.269 | TFLOPs: 27.00 | +7: iteration 11400/ 21553 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 0.18 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.780445E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.441 | TFLOPs: 26.97 | +7: iteration 11410/ 21553 | consumed samples: 2920960 | consumed tokens: 5982126080 | elapsed time per iteration (s): 0.18 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.779651E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.316 | TFLOPs: 26.98 | +7: iteration 11420/ 21553 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 0.18 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.776957E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.132 | TFLOPs: 27.00 | +7: iteration 11430/ 21553 | consumed samples: 2926080 | consumed tokens: 5992611840 | elapsed time per iteration (s): 0.18 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.783307E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.972 | TFLOPs: 27.00 | +7: iteration 11440/ 21553 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 0.18 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.783102E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.670 | TFLOPs: 26.97 | +7: iteration 11450/ 21553 | consumed samples: 2931200 | consumed tokens: 6003097600 | elapsed time per iteration (s): 0.18 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.786831E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.511 | TFLOPs: 26.97 | +7: iteration 11460/ 21553 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 0.18 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.782790E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.002 | TFLOPs: 27.02 | +7: iteration 11470/ 21553 | consumed samples: 2936320 | consumed tokens: 6013583360 | elapsed time per iteration (s): 0.18 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.775687E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.867 | TFLOPs: 26.99 | +7: iteration 11480/ 21553 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 0.18 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.776766E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.380 | TFLOPs: 27.02 | +7: iteration 11490/ 21553 | consumed samples: 2941440 | consumed tokens: 6024069120 | elapsed time per iteration (s): 0.18 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.781802E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.472 | TFLOPs: 26.99 | +7: iteration 11500/ 21553 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 0.18 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.774763E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.338 | TFLOPs: 27.00 | +7: iteration 11510/ 21553 | consumed samples: 2946560 | consumed tokens: 6034554880 | elapsed time per iteration (s): 0.18 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.773805E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.002 | TFLOPs: 26.96 | +7: iteration 11520/ 21553 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 0.18 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.776279E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.107 | TFLOPs: 27.00 | +7: iteration 11530/ 21553 | consumed samples: 2951680 | consumed tokens: 6045040640 | elapsed time per iteration (s): 0.18 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.780398E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.101 | TFLOPs: 26.98 | +7: iteration 11540/ 21553 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 0.18 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.767422E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.396 | TFLOPs: 26.97 | +7: iteration 11550/ 21553 | consumed samples: 2956800 | consumed tokens: 6055526400 | elapsed time per iteration (s): 0.18 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.780616E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.507 | TFLOPs: 26.95 | +7: iteration 11560/ 21553 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 0.18 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.783796E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.722 | TFLOPs: 26.95 | +7: iteration 11570/ 21553 | consumed samples: 2961920 | consumed tokens: 6066012160 | elapsed time per iteration (s): 0.18 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.775534E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.559 | TFLOPs: 26.97 | +7: iteration 11580/ 21553 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 0.18 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.776677E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.811 | TFLOPs: 26.99 | +7: iteration 11590/ 21553 | consumed samples: 2967040 | consumed tokens: 6076497920 | elapsed time per iteration (s): 0.18 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.771881E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.765 | TFLOPs: 26.97 | +7: iteration 11600/ 21553 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 0.18 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.782050E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.586 | TFLOPs: 26.97 | +7: iteration 11610/ 21553 | consumed samples: 2972160 | consumed tokens: 6086983680 | elapsed time per iteration (s): 0.18 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.776188E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.024 | TFLOPs: 26.96 | +7: iteration 11620/ 21553 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 0.18 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.776317E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.094 | TFLOPs: 26.83 | +7: iteration 11630/ 21553 | consumed samples: 2977280 | consumed tokens: 6097469440 | elapsed time per iteration (s): 0.18 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.774940E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.433 | TFLOPs: 26.97 | +7: iteration 11640/ 21553 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 0.18 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.769976E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.503 | TFLOPs: 27.01 | +7: iteration 11650/ 21553 | consumed samples: 2982400 | consumed tokens: 6107955200 | elapsed time per iteration (s): 0.18 | learning rate: 9.987E-05 | global batch size: 256 | lm loss: 3.776675E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.261 | TFLOPs: 27.04 | +7: iteration 11660/ 21553 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 0.20 | learning rate: 9.974E-05 | global batch size: 256 | lm loss: 3.778609E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1272.576 | TFLOPs: 24.31 | +7: iteration 11670/ 21553 | consumed samples: 2987520 | consumed tokens: 6118440960 | elapsed time per iteration (s): 0.18 | learning rate: 9.961E-05 | global batch size: 256 | lm loss: 3.776316E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.121 | TFLOPs: 27.04 | +7: iteration 11680/ 21553 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 0.18 | learning rate: 9.948E-05 | global batch size: 256 | lm loss: 3.769907E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.242 | TFLOPs: 27.08 | +7: iteration 11690/ 21553 | consumed samples: 2992640 | consumed tokens: 6128926720 | elapsed time per iteration (s): 0.18 | learning rate: 9.935E-05 | global batch size: 256 | lm loss: 3.768511E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.971 | TFLOPs: 27.07 | +7: iteration 11700/ 21553 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 0.18 | learning rate: 9.922E-05 | global batch size: 256 | lm loss: 3.778252E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.307 | TFLOPs: 27.06 | +7: iteration 11710/ 21553 | consumed samples: 2997760 | consumed tokens: 6139412480 | elapsed time per iteration (s): 0.18 | learning rate: 9.909E-05 | global batch size: 256 | lm loss: 3.779317E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.209 | TFLOPs: 27.06 | +7: iteration 11720/ 21553 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 0.18 | learning rate: 9.895E-05 | global batch size: 256 | lm loss: 3.775869E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.939 | TFLOPs: 27.05 | +7: iteration 11730/ 21553 | consumed samples: 3002880 | consumed tokens: 6149898240 | elapsed time per iteration (s): 0.18 | learning rate: 9.882E-05 | global batch size: 256 | lm loss: 3.771475E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.420 | TFLOPs: 27.04 | +7: iteration 11740/ 21553 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 0.18 | learning rate: 9.869E-05 | global batch size: 256 | lm loss: 3.774810E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.558 | TFLOPs: 27.07 | +7: iteration 11750/ 21553 | consumed samples: 3008000 | consumed tokens: 6160384000 | elapsed time per iteration (s): 0.18 | learning rate: 9.856E-05 | global batch size: 256 | lm loss: 3.770030E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.003 | TFLOPs: 27.05 | +7: iteration 11760/ 21553 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 0.18 | learning rate: 9.843E-05 | global batch size: 256 | lm loss: 3.769513E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.429 | TFLOPs: 27.06 | +7: iteration 11770/ 21553 | consumed samples: 3013120 | consumed tokens: 6170869760 | elapsed time per iteration (s): 0.18 | learning rate: 9.830E-05 | global batch size: 256 | lm loss: 3.769353E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.930 | TFLOPs: 27.05 | +7: iteration 11780/ 21553 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 0.18 | learning rate: 9.817E-05 | global batch size: 256 | lm loss: 3.781200E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.001 | TFLOPs: 27.05 | +7: iteration 11790/ 21553 | consumed samples: 3018240 | consumed tokens: 6181355520 | elapsed time per iteration (s): 0.18 | learning rate: 9.803E-05 | global batch size: 256 | lm loss: 3.771711E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.591 | TFLOPs: 27.05 | +7: iteration 11800/ 21553 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 0.18 | learning rate: 9.790E-05 | global batch size: 256 | lm loss: 3.776310E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.719 | TFLOPs: 27.03 | +7: iteration 11810/ 21553 | consumed samples: 3023360 | consumed tokens: 6191841280 | elapsed time per iteration (s): 0.18 | learning rate: 9.777E-05 | global batch size: 256 | lm loss: 3.774354E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.689 | TFLOPs: 26.95 | +7: iteration 11820/ 21553 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 0.18 | learning rate: 9.764E-05 | global batch size: 256 | lm loss: 3.775637E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.062 | TFLOPs: 26.98 | +7: iteration 11830/ 21553 | consumed samples: 3028480 | consumed tokens: 6202327040 | elapsed time per iteration (s): 0.18 | learning rate: 9.751E-05 | global batch size: 256 | lm loss: 3.771926E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.867 | TFLOPs: 26.98 | +7: iteration 11840/ 21553 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 0.18 | learning rate: 9.738E-05 | global batch size: 256 | lm loss: 3.777422E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.931 | TFLOPs: 27.00 | +7: iteration 11850/ 21553 | consumed samples: 3033600 | consumed tokens: 6212812800 | elapsed time per iteration (s): 0.18 | learning rate: 9.725E-05 | global batch size: 256 | lm loss: 3.765897E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.650 | TFLOPs: 26.95 | +7: iteration 11860/ 21553 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 0.18 | learning rate: 9.712E-05 | global batch size: 256 | lm loss: 3.770462E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.365 | TFLOPs: 26.97 | +7: iteration 11870/ 21553 | consumed samples: 3038720 | consumed tokens: 6223298560 | elapsed time per iteration (s): 0.18 | learning rate: 9.698E-05 | global batch size: 256 | lm loss: 3.776869E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.592 | TFLOPs: 26.97 | +7: iteration 11880/ 21553 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 0.18 | learning rate: 9.685E-05 | global batch size: 256 | lm loss: 3.776714E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.783 | TFLOPs: 26.97 | +7: iteration 11890/ 21553 | consumed samples: 3043840 | consumed tokens: 6233784320 | elapsed time per iteration (s): 0.18 | learning rate: 9.672E-05 | global batch size: 256 | lm loss: 3.762279E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.430 | TFLOPs: 26.97 | +7: iteration 11900/ 21553 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 0.18 | learning rate: 9.659E-05 | global batch size: 256 | lm loss: 3.775779E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.291 | TFLOPs: 26.96 | +7: iteration 11910/ 21553 | consumed samples: 3048960 | consumed tokens: 6244270080 | elapsed time per iteration (s): 0.18 | learning rate: 9.646E-05 | global batch size: 256 | lm loss: 3.768218E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.993 | TFLOPs: 26.96 | +7: iteration 11920/ 21553 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 0.18 | learning rate: 9.633E-05 | global batch size: 256 | lm loss: 3.775647E+00 | grad norm: 0.487 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.776 | TFLOPs: 26.95 | +7: iteration 11930/ 21553 | consumed samples: 3054080 | consumed tokens: 6254755840 | elapsed time per iteration (s): 0.18 | learning rate: 9.620E-05 | global batch size: 256 | lm loss: 3.774092E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.534 | TFLOPs: 26.97 | +7: iteration 11940/ 21553 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 0.18 | learning rate: 9.607E-05 | global batch size: 256 | lm loss: 3.775278E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.445 | TFLOPs: 26.95 | +7: iteration 11950/ 21553 | consumed samples: 3059200 | consumed tokens: 6265241600 | elapsed time per iteration (s): 0.18 | learning rate: 9.594E-05 | global batch size: 256 | lm loss: 3.770489E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.712 | TFLOPs: 26.95 | +7: iteration 11960/ 21553 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 0.18 | learning rate: 9.581E-05 | global batch size: 256 | lm loss: 3.759756E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.323 | TFLOPs: 26.97 | +7: iteration 11970/ 21553 | consumed samples: 3064320 | consumed tokens: 6275727360 | elapsed time per iteration (s): 0.18 | learning rate: 9.567E-05 | global batch size: 256 | lm loss: 3.769043E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.637 | TFLOPs: 26.97 | +7: iteration 11980/ 21553 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 0.18 | learning rate: 9.554E-05 | global batch size: 256 | lm loss: 3.766108E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.540 | TFLOPs: 26.97 | +7: iteration 11990/ 21553 | consumed samples: 3069440 | consumed tokens: 6286213120 | elapsed time per iteration (s): 0.18 | learning rate: 9.541E-05 | global batch size: 256 | lm loss: 3.765721E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.581 | TFLOPs: 26.97 | +0: [2023-03-17 00:12:05,864] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[9.528206376265585e-05, 9.528206376265585e-05, 9.528206376265585e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 12000/ 21553 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 0.18 | learning rate: 9.528E-05 | global batch size: 256 | lm loss: 3.761563E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.963 | TFLOPs: 26.96 | +0: steps: 12000 loss: 3.7758 iter time (s): 0.180 samples/sec: 1423.119 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 12000 | lm loss value: 3.880245E+00 | lm loss PPL: 4.843608E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 12000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:12:05,953] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is begin to save! +0: [2023-03-17 00:12:05,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:12:06,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:12:06,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:12:06,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:12:06,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:12:06,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:12:06,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:12:06,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:12:06,062] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:12:06,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:12:06,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:12:06,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:12:06,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:12:06,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:12:06,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:12:06,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:12:06,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:12:06,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:12:06,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:12:06,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:12:06,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:12:06,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:12:06,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:12:06,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:12:06,142] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step12000/mp_rank_00_model_states.pt +0: [2023-03-17 00:12:06,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:12:06,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:12:06,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:12:06,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,181] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,181] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:12:06,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:12:06,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,185] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:12:06,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:12:06,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +7: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:12:06,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +3: [2023-03-17 00:12:06,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-17 00:12:06,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-17 00:12:06,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:12:06,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:12:06,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: successfully saved checkpoint at iteration 12000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 240.62 +7: iteration 12010/ 21553 | consumed samples: 3074560 | consumed tokens: 6296698880 | elapsed time per iteration (s): 0.21 | learning rate: 9.515E-05 | global batch size: 256 | lm loss: 3.770847E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1198.605 | TFLOPs: 22.90 | +7: iteration 12020/ 21553 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 0.18 | learning rate: 9.502E-05 | global batch size: 256 | lm loss: 3.765963E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.950 | TFLOPs: 26.96 | +7: iteration 12030/ 21553 | consumed samples: 3079680 | consumed tokens: 6307184640 | elapsed time per iteration (s): 0.18 | learning rate: 9.489E-05 | global batch size: 256 | lm loss: 3.764995E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.946 | TFLOPs: 26.50 | +7: iteration 12040/ 21553 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 0.18 | learning rate: 9.476E-05 | global batch size: 256 | lm loss: 3.775603E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.901 | TFLOPs: 26.96 | +7: iteration 12050/ 21553 | consumed samples: 3084800 | consumed tokens: 6317670400 | elapsed time per iteration (s): 0.18 | learning rate: 9.463E-05 | global batch size: 256 | lm loss: 3.776439E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.147 | TFLOPs: 26.98 | +7: iteration 12060/ 21553 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 0.18 | learning rate: 9.450E-05 | global batch size: 256 | lm loss: 3.773139E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.211 | TFLOPs: 26.98 | +7: iteration 12070/ 21553 | consumed samples: 3089920 | consumed tokens: 6328156160 | elapsed time per iteration (s): 0.18 | learning rate: 9.437E-05 | global batch size: 256 | lm loss: 3.772548E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.801 | TFLOPs: 26.99 | +7: iteration 12080/ 21553 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 0.18 | learning rate: 9.424E-05 | global batch size: 256 | lm loss: 3.768679E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.311 | TFLOPs: 26.98 | +7: iteration 12090/ 21553 | consumed samples: 3095040 | consumed tokens: 6338641920 | elapsed time per iteration (s): 0.18 | learning rate: 9.411E-05 | global batch size: 256 | lm loss: 3.765540E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.889 | TFLOPs: 27.00 | +7: iteration 12100/ 21553 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 0.18 | learning rate: 9.398E-05 | global batch size: 256 | lm loss: 3.761350E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.696 | TFLOPs: 26.97 | +7: iteration 12110/ 21553 | consumed samples: 3100160 | consumed tokens: 6349127680 | elapsed time per iteration (s): 0.18 | learning rate: 9.385E-05 | global batch size: 256 | lm loss: 3.770160E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.100 | TFLOPs: 27.00 | +7: iteration 12120/ 21553 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 0.18 | learning rate: 9.372E-05 | global batch size: 256 | lm loss: 3.768969E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.738 | TFLOPs: 26.84 | +7: iteration 12130/ 21553 | consumed samples: 3105280 | consumed tokens: 6359613440 | elapsed time per iteration (s): 0.18 | learning rate: 9.359E-05 | global batch size: 256 | lm loss: 3.763423E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.701 | TFLOPs: 26.84 | +7: iteration 12140/ 21553 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 0.18 | learning rate: 9.346E-05 | global batch size: 256 | lm loss: 3.768459E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.215 | TFLOPs: 26.94 | +7: iteration 12150/ 21553 | consumed samples: 3110400 | consumed tokens: 6370099200 | elapsed time per iteration (s): 0.18 | learning rate: 9.332E-05 | global batch size: 256 | lm loss: 3.758546E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.742 | TFLOPs: 26.97 | +7: iteration 12160/ 21553 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 0.18 | learning rate: 9.319E-05 | global batch size: 256 | lm loss: 3.763307E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.372 | TFLOPs: 26.99 | +7: iteration 12170/ 21553 | consumed samples: 3115520 | consumed tokens: 6380584960 | elapsed time per iteration (s): 0.18 | learning rate: 9.306E-05 | global batch size: 256 | lm loss: 3.761748E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.991 | TFLOPs: 26.94 | +7: iteration 12180/ 21553 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 0.19 | learning rate: 9.293E-05 | global batch size: 256 | lm loss: 3.765649E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.752 | TFLOPs: 26.34 | +7: iteration 12190/ 21553 | consumed samples: 3120640 | consumed tokens: 6391070720 | elapsed time per iteration (s): 0.18 | learning rate: 9.280E-05 | global batch size: 256 | lm loss: 3.759826E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.570 | TFLOPs: 26.97 | +7: iteration 12200/ 21553 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 0.18 | learning rate: 9.267E-05 | global batch size: 256 | lm loss: 3.758160E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.520 | TFLOPs: 26.99 | +7: iteration 12210/ 21553 | consumed samples: 3125760 | consumed tokens: 6401556480 | elapsed time per iteration (s): 0.18 | learning rate: 9.254E-05 | global batch size: 256 | lm loss: 3.768502E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.569 | TFLOPs: 26.97 | +7: iteration 12220/ 21553 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 0.18 | learning rate: 9.241E-05 | global batch size: 256 | lm loss: 3.766282E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.516 | TFLOPs: 26.97 | +7: iteration 12230/ 21553 | consumed samples: 3130880 | consumed tokens: 6412042240 | elapsed time per iteration (s): 0.18 | learning rate: 9.228E-05 | global batch size: 256 | lm loss: 3.761673E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.621 | TFLOPs: 26.99 | +7: iteration 12240/ 21553 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 0.18 | learning rate: 9.215E-05 | global batch size: 256 | lm loss: 3.768581E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.159 | TFLOPs: 26.96 | +7: iteration 12250/ 21553 | consumed samples: 3136000 | consumed tokens: 6422528000 | elapsed time per iteration (s): 0.18 | learning rate: 9.202E-05 | global batch size: 256 | lm loss: 3.764905E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.067 | TFLOPs: 26.96 | +7: iteration 12260/ 21553 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 0.18 | learning rate: 9.189E-05 | global batch size: 256 | lm loss: 3.755706E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.377 | TFLOPs: 26.93 | +7: iteration 12270/ 21553 | consumed samples: 3141120 | consumed tokens: 6433013760 | elapsed time per iteration (s): 0.18 | learning rate: 9.177E-05 | global batch size: 256 | lm loss: 3.758179E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.514 | TFLOPs: 26.91 | +7: iteration 12280/ 21553 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 0.18 | learning rate: 9.164E-05 | global batch size: 256 | lm loss: 3.764426E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.894 | TFLOPs: 26.92 | +7: iteration 12290/ 21553 | consumed samples: 3146240 | consumed tokens: 6443499520 | elapsed time per iteration (s): 0.18 | learning rate: 9.151E-05 | global batch size: 256 | lm loss: 3.765379E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.340 | TFLOPs: 26.93 | +7: iteration 12300/ 21553 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 0.18 | learning rate: 9.138E-05 | global batch size: 256 | lm loss: 3.769396E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.992 | TFLOPs: 26.92 | +7: iteration 12310/ 21553 | consumed samples: 3151360 | consumed tokens: 6453985280 | elapsed time per iteration (s): 0.18 | learning rate: 9.125E-05 | global batch size: 256 | lm loss: 3.769665E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.736 | TFLOPs: 26.95 | +7: iteration 12320/ 21553 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 0.18 | learning rate: 9.112E-05 | global batch size: 256 | lm loss: 3.761810E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.343 | TFLOPs: 26.87 | +7: iteration 12330/ 21553 | consumed samples: 3156480 | consumed tokens: 6464471040 | elapsed time per iteration (s): 0.18 | learning rate: 9.099E-05 | global batch size: 256 | lm loss: 3.761044E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.977 | TFLOPs: 26.77 | +7: iteration 12340/ 21553 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 0.18 | learning rate: 9.086E-05 | global batch size: 256 | lm loss: 3.759835E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.258 | TFLOPs: 26.87 | +7: iteration 12350/ 21553 | consumed samples: 3161600 | consumed tokens: 6474956800 | elapsed time per iteration (s): 0.18 | learning rate: 9.073E-05 | global batch size: 256 | lm loss: 3.758255E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.275 | TFLOPs: 26.87 | +7: iteration 12360/ 21553 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 0.18 | learning rate: 9.060E-05 | global batch size: 256 | lm loss: 3.753431E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.936 | TFLOPs: 26.86 | +7: iteration 12370/ 21553 | consumed samples: 3166720 | consumed tokens: 6485442560 | elapsed time per iteration (s): 0.18 | learning rate: 9.047E-05 | global batch size: 256 | lm loss: 3.757684E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.345 | TFLOPs: 26.89 | +7: iteration 12380/ 21553 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 0.18 | learning rate: 9.034E-05 | global batch size: 256 | lm loss: 3.763609E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.330 | TFLOPs: 26.87 | +7: iteration 12390/ 21553 | consumed samples: 3171840 | consumed tokens: 6495928320 | elapsed time per iteration (s): 0.18 | learning rate: 9.021E-05 | global batch size: 256 | lm loss: 3.763707E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.548 | TFLOPs: 26.87 | +7: iteration 12400/ 21553 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 0.18 | learning rate: 9.008E-05 | global batch size: 256 | lm loss: 3.762417E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.464 | TFLOPs: 26.89 | +7: iteration 12410/ 21553 | consumed samples: 3176960 | consumed tokens: 6506414080 | elapsed time per iteration (s): 0.18 | learning rate: 8.995E-05 | global batch size: 256 | lm loss: 3.759846E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.956 | TFLOPs: 26.86 | +7: iteration 12420/ 21553 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 0.18 | learning rate: 8.982E-05 | global batch size: 256 | lm loss: 3.758572E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.935 | TFLOPs: 26.90 | +7: iteration 12430/ 21553 | consumed samples: 3182080 | consumed tokens: 6516899840 | elapsed time per iteration (s): 0.18 | learning rate: 8.969E-05 | global batch size: 256 | lm loss: 3.756110E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.998 | TFLOPs: 26.96 | +7: iteration 12440/ 21553 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 0.18 | learning rate: 8.957E-05 | global batch size: 256 | lm loss: 3.757973E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.409 | TFLOPs: 26.91 | +7: iteration 12450/ 21553 | consumed samples: 3187200 | consumed tokens: 6527385600 | elapsed time per iteration (s): 0.18 | learning rate: 8.944E-05 | global batch size: 256 | lm loss: 3.760692E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.335 | TFLOPs: 26.95 | +7: iteration 12460/ 21553 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 0.18 | learning rate: 8.931E-05 | global batch size: 256 | lm loss: 3.760471E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.508 | TFLOPs: 26.95 | +7: iteration 12470/ 21553 | consumed samples: 3192320 | consumed tokens: 6537871360 | elapsed time per iteration (s): 0.18 | learning rate: 8.918E-05 | global batch size: 256 | lm loss: 3.754817E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.829 | TFLOPs: 26.94 | +7: iteration 12480/ 21553 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 0.18 | learning rate: 8.905E-05 | global batch size: 256 | lm loss: 3.772380E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.548 | TFLOPs: 26.95 | +7: iteration 12490/ 21553 | consumed samples: 3197440 | consumed tokens: 6548357120 | elapsed time per iteration (s): 0.18 | learning rate: 8.892E-05 | global batch size: 256 | lm loss: 3.759403E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.211 | TFLOPs: 26.96 | +7: iteration 12500/ 21553 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 0.18 | learning rate: 8.879E-05 | global batch size: 256 | lm loss: 3.760552E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.195 | TFLOPs: 26.98 | +7: iteration 12510/ 21553 | consumed samples: 3202560 | consumed tokens: 6558842880 | elapsed time per iteration (s): 0.18 | learning rate: 8.866E-05 | global batch size: 256 | lm loss: 3.764136E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.496 | TFLOPs: 26.97 | +7: iteration 12520/ 21553 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 0.18 | learning rate: 8.853E-05 | global batch size: 256 | lm loss: 3.765339E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.271 | TFLOPs: 26.95 | +7: iteration 12530/ 21553 | consumed samples: 3207680 | consumed tokens: 6569328640 | elapsed time per iteration (s): 0.18 | learning rate: 8.841E-05 | global batch size: 256 | lm loss: 3.762272E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.818 | TFLOPs: 26.96 | +7: iteration 12540/ 21553 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 0.18 | learning rate: 8.828E-05 | global batch size: 256 | lm loss: 3.766639E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.656 | TFLOPs: 26.97 | +7: iteration 12550/ 21553 | consumed samples: 3212800 | consumed tokens: 6579814400 | elapsed time per iteration (s): 0.18 | learning rate: 8.815E-05 | global batch size: 256 | lm loss: 3.757442E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.582 | TFLOPs: 26.97 | +7: iteration 12560/ 21553 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 0.18 | learning rate: 8.802E-05 | global batch size: 256 | lm loss: 3.756319E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.609 | TFLOPs: 26.95 | +7: iteration 12570/ 21553 | consumed samples: 3217920 | consumed tokens: 6590300160 | elapsed time per iteration (s): 0.18 | learning rate: 8.789E-05 | global batch size: 256 | lm loss: 3.759988E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.333 | TFLOPs: 26.95 | +7: iteration 12580/ 21553 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 0.18 | learning rate: 8.776E-05 | global batch size: 256 | lm loss: 3.757596E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.371 | TFLOPs: 26.95 | +7: iteration 12590/ 21553 | consumed samples: 3223040 | consumed tokens: 6600785920 | elapsed time per iteration (s): 0.18 | learning rate: 8.763E-05 | global batch size: 256 | lm loss: 3.760876E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.289 | TFLOPs: 26.95 | +7: iteration 12600/ 21553 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 0.18 | learning rate: 8.751E-05 | global batch size: 256 | lm loss: 3.767749E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.420 | TFLOPs: 26.95 | +7: iteration 12610/ 21553 | consumed samples: 3228160 | consumed tokens: 6611271680 | elapsed time per iteration (s): 0.18 | learning rate: 8.738E-05 | global batch size: 256 | lm loss: 3.754338E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.610 | TFLOPs: 26.97 | +7: iteration 12620/ 21553 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 0.18 | learning rate: 8.725E-05 | global batch size: 256 | lm loss: 3.757784E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.311 | TFLOPs: 26.97 | +7: iteration 12630/ 21553 | consumed samples: 3233280 | consumed tokens: 6621757440 | elapsed time per iteration (s): 0.18 | learning rate: 8.712E-05 | global batch size: 256 | lm loss: 3.750748E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.553 | TFLOPs: 26.97 | +7: iteration 12640/ 21553 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 0.18 | learning rate: 8.699E-05 | global batch size: 256 | lm loss: 3.762851E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.451 | TFLOPs: 26.97 | +7: iteration 12650/ 21553 | consumed samples: 3238400 | consumed tokens: 6632243200 | elapsed time per iteration (s): 0.18 | learning rate: 8.687E-05 | global batch size: 256 | lm loss: 3.750264E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.482 | TFLOPs: 27.01 | +7: iteration 12660/ 21553 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 0.18 | learning rate: 8.674E-05 | global batch size: 256 | lm loss: 3.752197E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.924 | TFLOPs: 27.00 | +7: iteration 12670/ 21553 | consumed samples: 3243520 | consumed tokens: 6642728960 | elapsed time per iteration (s): 0.18 | learning rate: 8.661E-05 | global batch size: 256 | lm loss: 3.758654E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.100 | TFLOPs: 27.00 | +7: iteration 12680/ 21553 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 0.18 | learning rate: 8.648E-05 | global batch size: 256 | lm loss: 3.755272E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.218 | TFLOPs: 26.96 | +7: iteration 12690/ 21553 | consumed samples: 3248640 | consumed tokens: 6653214720 | elapsed time per iteration (s): 0.18 | learning rate: 8.635E-05 | global batch size: 256 | lm loss: 3.757548E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.060 | TFLOPs: 27.06 | +7: iteration 12700/ 21553 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 0.18 | learning rate: 8.623E-05 | global batch size: 256 | lm loss: 3.762486E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.213 | TFLOPs: 27.08 | +7: iteration 12710/ 21553 | consumed samples: 3253760 | consumed tokens: 6663700480 | elapsed time per iteration (s): 0.18 | learning rate: 8.610E-05 | global batch size: 256 | lm loss: 3.754197E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.231 | TFLOPs: 27.04 | +7: iteration 12720/ 21553 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 0.18 | learning rate: 8.597E-05 | global batch size: 256 | lm loss: 3.760113E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.036 | TFLOPs: 27.06 | +7: iteration 12730/ 21553 | consumed samples: 3258880 | consumed tokens: 6674186240 | elapsed time per iteration (s): 0.18 | learning rate: 8.584E-05 | global batch size: 256 | lm loss: 3.748360E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.496 | TFLOPs: 27.03 | +7: iteration 12740/ 21553 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 0.19 | learning rate: 8.571E-05 | global batch size: 256 | lm loss: 3.753308E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1373.995 | TFLOPs: 26.25 | +7: iteration 12750/ 21553 | consumed samples: 3264000 | consumed tokens: 6684672000 | elapsed time per iteration (s): 0.18 | learning rate: 8.559E-05 | global batch size: 256 | lm loss: 3.756425E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.201 | TFLOPs: 26.85 | +7: iteration 12760/ 21553 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 0.19 | learning rate: 8.546E-05 | global batch size: 256 | lm loss: 3.756204E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.432 | TFLOPs: 25.99 | +7: iteration 12770/ 21553 | consumed samples: 3269120 | consumed tokens: 6695157760 | elapsed time per iteration (s): 0.18 | learning rate: 8.533E-05 | global batch size: 256 | lm loss: 3.756391E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.715 | TFLOPs: 27.07 | +7: iteration 12780/ 21553 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 0.18 | learning rate: 8.520E-05 | global batch size: 256 | lm loss: 3.752188E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.721 | TFLOPs: 27.07 | +7: iteration 12790/ 21553 | consumed samples: 3274240 | consumed tokens: 6705643520 | elapsed time per iteration (s): 0.18 | learning rate: 8.508E-05 | global batch size: 256 | lm loss: 3.757315E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.413 | TFLOPs: 27.04 | +7: iteration 12800/ 21553 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 0.18 | learning rate: 8.495E-05 | global batch size: 256 | lm loss: 3.755616E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.737 | TFLOPs: 27.09 | +7: iteration 12810/ 21553 | consumed samples: 3279360 | consumed tokens: 6716129280 | elapsed time per iteration (s): 0.18 | learning rate: 8.482E-05 | global batch size: 256 | lm loss: 3.760154E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.395 | TFLOPs: 27.08 | +7: iteration 12820/ 21553 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 0.18 | learning rate: 8.470E-05 | global batch size: 256 | lm loss: 3.749829E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.005 | TFLOPs: 27.05 | +7: iteration 12830/ 21553 | consumed samples: 3284480 | consumed tokens: 6726615040 | elapsed time per iteration (s): 0.18 | learning rate: 8.457E-05 | global batch size: 256 | lm loss: 3.752335E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.318 | TFLOPs: 26.89 | +7: iteration 12840/ 21553 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 0.18 | learning rate: 8.444E-05 | global batch size: 256 | lm loss: 3.746975E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.252 | TFLOPs: 27.08 | +7: iteration 12850/ 21553 | consumed samples: 3289600 | consumed tokens: 6737100800 | elapsed time per iteration (s): 0.18 | learning rate: 8.431E-05 | global batch size: 256 | lm loss: 3.751182E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.966 | TFLOPs: 27.07 | +7: iteration 12860/ 21553 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 0.18 | learning rate: 8.419E-05 | global batch size: 256 | lm loss: 3.755165E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.993 | TFLOPs: 27.09 | +7: iteration 12870/ 21553 | consumed samples: 3294720 | consumed tokens: 6747586560 | elapsed time per iteration (s): 0.18 | learning rate: 8.406E-05 | global batch size: 256 | lm loss: 3.752470E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.466 | TFLOPs: 27.03 | +7: iteration 12880/ 21553 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 0.18 | learning rate: 8.393E-05 | global batch size: 256 | lm loss: 3.756438E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.433 | TFLOPs: 27.04 | +7: iteration 12890/ 21553 | consumed samples: 3299840 | consumed tokens: 6758072320 | elapsed time per iteration (s): 0.18 | learning rate: 8.381E-05 | global batch size: 256 | lm loss: 3.748013E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.276 | TFLOPs: 27.08 | +7: iteration 12900/ 21553 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 0.18 | learning rate: 8.368E-05 | global batch size: 256 | lm loss: 3.753014E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.968 | TFLOPs: 27.09 | +7: iteration 12910/ 21553 | consumed samples: 3304960 | consumed tokens: 6768558080 | elapsed time per iteration (s): 0.18 | learning rate: 8.355E-05 | global batch size: 256 | lm loss: 3.757096E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.397 | TFLOPs: 27.06 | +7: iteration 12920/ 21553 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 0.18 | learning rate: 8.343E-05 | global batch size: 256 | lm loss: 3.754327E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.895 | TFLOPs: 27.07 | +7: iteration 12930/ 21553 | consumed samples: 3310080 | consumed tokens: 6779043840 | elapsed time per iteration (s): 0.18 | learning rate: 8.330E-05 | global batch size: 256 | lm loss: 3.757573E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.735 | TFLOPs: 26.76 | +7: iteration 12940/ 21553 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 0.18 | learning rate: 8.317E-05 | global batch size: 256 | lm loss: 3.753298E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.611 | TFLOPs: 26.99 | +7: iteration 12950/ 21553 | consumed samples: 3315200 | consumed tokens: 6789529600 | elapsed time per iteration (s): 0.18 | learning rate: 8.305E-05 | global batch size: 256 | lm loss: 3.749504E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.598 | TFLOPs: 26.99 | +7: iteration 12960/ 21553 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 0.18 | learning rate: 8.292E-05 | global batch size: 256 | lm loss: 3.757581E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.695 | TFLOPs: 26.99 | +7: iteration 12970/ 21553 | consumed samples: 3320320 | consumed tokens: 6800015360 | elapsed time per iteration (s): 0.18 | learning rate: 8.279E-05 | global batch size: 256 | lm loss: 3.745730E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.287 | TFLOPs: 26.98 | +7: iteration 12980/ 21553 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 0.18 | learning rate: 8.267E-05 | global batch size: 256 | lm loss: 3.750079E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.372 | TFLOPs: 27.00 | +7: iteration 12990/ 21553 | consumed samples: 3325440 | consumed tokens: 6810501120 | elapsed time per iteration (s): 0.18 | learning rate: 8.254E-05 | global batch size: 256 | lm loss: 3.755561E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.898 | TFLOPs: 27.00 | +7: iteration 13000/ 21553 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 0.18 | learning rate: 8.242E-05 | global batch size: 256 | lm loss: 3.745842E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.638 | TFLOPs: 26.99 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 13000 | lm loss value: 3.929599E+00 | lm loss PPL: 5.088658E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 13000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:15:07,825] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step13000 is begin to save! +0: [2023-03-17 00:15:07,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:15:07,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:15:07,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:15:07,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:15:07,912] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:15:07,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:15:07,923] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:15:07,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:15:07,934] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:15:07,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:15:07,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:15:07,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:15:07,957] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:15:07,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:15:07,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:15:07,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:15:07,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:15:07,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:15:07,990] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:15:08,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:15:08,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:15:08,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:15:08,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:15:08,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:15:08,014] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step13000/mp_rank_00_model_states.pt +0: [2023-03-17 00:15:08,014] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:15:08,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:15:08,034] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:15:08,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:15:08,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-17 00:15:08,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-17 00:15:08,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: successfully saved checkpoint at iteration 13000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 242.41 +7: iteration 13010/ 21553 | consumed samples: 3330560 | consumed tokens: 6820986880 | elapsed time per iteration (s): 0.21 | learning rate: 8.229E-05 | global batch size: 256 | lm loss: 3.746012E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1211.133 | TFLOPs: 23.14 | +7: iteration 13020/ 21553 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 0.18 | learning rate: 8.216E-05 | global batch size: 256 | lm loss: 3.748772E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.333 | TFLOPs: 27.04 | +7: iteration 13030/ 21553 | consumed samples: 3335680 | consumed tokens: 6831472640 | elapsed time per iteration (s): 0.18 | learning rate: 8.204E-05 | global batch size: 256 | lm loss: 3.746436E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.438 | TFLOPs: 27.06 | +7: iteration 13040/ 21553 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 0.18 | learning rate: 8.191E-05 | global batch size: 256 | lm loss: 3.755832E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.651 | TFLOPs: 27.07 | +7: iteration 13050/ 21553 | consumed samples: 3340800 | consumed tokens: 6841958400 | elapsed time per iteration (s): 0.18 | learning rate: 8.179E-05 | global batch size: 256 | lm loss: 3.743473E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.365 | TFLOPs: 27.06 | +7: iteration 13060/ 21553 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 0.18 | learning rate: 8.166E-05 | global batch size: 256 | lm loss: 3.755780E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.683 | TFLOPs: 27.05 | +7: iteration 13070/ 21553 | consumed samples: 3345920 | consumed tokens: 6852444160 | elapsed time per iteration (s): 0.18 | learning rate: 8.153E-05 | global batch size: 256 | lm loss: 3.746959E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.662 | TFLOPs: 27.05 | +7: iteration 13080/ 21553 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 0.18 | learning rate: 8.141E-05 | global batch size: 256 | lm loss: 3.747878E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.950 | TFLOPs: 27.05 | +7: iteration 13090/ 21553 | consumed samples: 3351040 | consumed tokens: 6862929920 | elapsed time per iteration (s): 0.18 | learning rate: 8.128E-05 | global batch size: 256 | lm loss: 3.754067E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.725 | TFLOPs: 27.07 | +7: iteration 13100/ 21553 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 0.18 | learning rate: 8.116E-05 | global batch size: 256 | lm loss: 3.739912E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.457 | TFLOPs: 27.08 | +7: iteration 13110/ 21553 | consumed samples: 3356160 | consumed tokens: 6873415680 | elapsed time per iteration (s): 0.18 | learning rate: 8.103E-05 | global batch size: 256 | lm loss: 3.752090E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.942 | TFLOPs: 27.03 | +7: iteration 13120/ 21553 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 0.18 | learning rate: 8.091E-05 | global batch size: 256 | lm loss: 3.754869E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.257 | TFLOPs: 27.06 | +7: iteration 13130/ 21553 | consumed samples: 3361280 | consumed tokens: 6883901440 | elapsed time per iteration (s): 0.18 | learning rate: 8.078E-05 | global batch size: 256 | lm loss: 3.744891E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.442 | TFLOPs: 27.04 | +7: iteration 13140/ 21553 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 0.18 | learning rate: 8.066E-05 | global batch size: 256 | lm loss: 3.746982E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.003 | TFLOPs: 27.04 | +7: iteration 13150/ 21553 | consumed samples: 3366400 | consumed tokens: 6894387200 | elapsed time per iteration (s): 0.18 | learning rate: 8.053E-05 | global batch size: 256 | lm loss: 3.748169E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.812 | TFLOPs: 27.07 | +7: iteration 13160/ 21553 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 0.18 | learning rate: 8.041E-05 | global batch size: 256 | lm loss: 3.743281E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.279 | TFLOPs: 27.04 | +7: iteration 13170/ 21553 | consumed samples: 3371520 | consumed tokens: 6904872960 | elapsed time per iteration (s): 0.18 | learning rate: 8.028E-05 | global batch size: 256 | lm loss: 3.751314E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.669 | TFLOPs: 26.99 | +7: iteration 13180/ 21553 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 0.18 | learning rate: 8.016E-05 | global batch size: 256 | lm loss: 3.747768E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.909 | TFLOPs: 26.59 | +7: iteration 13190/ 21553 | consumed samples: 3376640 | consumed tokens: 6915358720 | elapsed time per iteration (s): 0.18 | learning rate: 8.003E-05 | global batch size: 256 | lm loss: 3.754241E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.051 | TFLOPs: 27.04 | +7: iteration 13200/ 21553 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 0.18 | learning rate: 7.991E-05 | global batch size: 256 | lm loss: 3.747683E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.638 | TFLOPs: 27.05 | +7: iteration 13210/ 21553 | consumed samples: 3381760 | consumed tokens: 6925844480 | elapsed time per iteration (s): 0.18 | learning rate: 7.978E-05 | global batch size: 256 | lm loss: 3.744665E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.652 | TFLOPs: 27.03 | +7: iteration 13220/ 21553 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 0.18 | learning rate: 7.966E-05 | global batch size: 256 | lm loss: 3.747427E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.887 | TFLOPs: 27.03 | +7: iteration 13230/ 21553 | consumed samples: 3386880 | consumed tokens: 6936330240 | elapsed time per iteration (s): 0.18 | learning rate: 7.953E-05 | global batch size: 256 | lm loss: 3.745374E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.122 | TFLOPs: 27.02 | +7: iteration 13240/ 21553 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 0.18 | learning rate: 7.941E-05 | global batch size: 256 | lm loss: 3.741954E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.604 | TFLOPs: 27.05 | +7: iteration 13250/ 21553 | consumed samples: 3392000 | consumed tokens: 6946816000 | elapsed time per iteration (s): 0.18 | learning rate: 7.928E-05 | global batch size: 256 | lm loss: 3.748577E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.198 | TFLOPs: 27.08 | +7: iteration 13260/ 21553 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 0.18 | learning rate: 7.916E-05 | global batch size: 256 | lm loss: 3.747943E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.282 | TFLOPs: 27.08 | +7: iteration 13270/ 21553 | consumed samples: 3397120 | consumed tokens: 6957301760 | elapsed time per iteration (s): 0.18 | learning rate: 7.903E-05 | global batch size: 256 | lm loss: 3.746659E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.363 | TFLOPs: 27.08 | +7: iteration 13280/ 21553 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 0.18 | learning rate: 7.891E-05 | global batch size: 256 | lm loss: 3.743701E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.315 | TFLOPs: 27.08 | +7: iteration 13290/ 21553 | consumed samples: 3402240 | consumed tokens: 6967787520 | elapsed time per iteration (s): 0.18 | learning rate: 7.878E-05 | global batch size: 256 | lm loss: 3.744572E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.188 | TFLOPs: 27.06 | +7: iteration 13300/ 21553 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 0.18 | learning rate: 7.866E-05 | global batch size: 256 | lm loss: 3.744802E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.705 | TFLOPs: 27.09 | +7: iteration 13310/ 21553 | consumed samples: 3407360 | consumed tokens: 6978273280 | elapsed time per iteration (s): 0.18 | learning rate: 7.854E-05 | global batch size: 256 | lm loss: 3.751623E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.864 | TFLOPs: 27.07 | +7: iteration 13320/ 21553 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 0.18 | learning rate: 7.841E-05 | global batch size: 256 | lm loss: 3.747886E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.662 | TFLOPs: 27.09 | +7: iteration 13330/ 21553 | consumed samples: 3412480 | consumed tokens: 6988759040 | elapsed time per iteration (s): 0.18 | learning rate: 7.829E-05 | global batch size: 256 | lm loss: 3.746093E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.966 | TFLOPs: 27.09 | +7: iteration 13340/ 21553 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 0.18 | learning rate: 7.816E-05 | global batch size: 256 | lm loss: 3.746397E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.794 | TFLOPs: 27.09 | +7: iteration 13350/ 21553 | consumed samples: 3417600 | consumed tokens: 6999244800 | elapsed time per iteration (s): 0.18 | learning rate: 7.804E-05 | global batch size: 256 | lm loss: 3.746978E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.782 | TFLOPs: 27.09 | +7: iteration 13360/ 21553 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 0.18 | learning rate: 7.792E-05 | global batch size: 256 | lm loss: 3.748150E+00 | grad norm: 0.469 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.533 | TFLOPs: 27.10 | +7: iteration 13370/ 21553 | consumed samples: 3422720 | consumed tokens: 7009730560 | elapsed time per iteration (s): 0.18 | learning rate: 7.779E-05 | global batch size: 256 | lm loss: 3.745292E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.213 | TFLOPs: 27.06 | +7: iteration 13380/ 21553 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 0.18 | learning rate: 7.767E-05 | global batch size: 256 | lm loss: 3.737696E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.344 | TFLOPs: 27.08 | +7: iteration 13390/ 21553 | consumed samples: 3427840 | consumed tokens: 7020216320 | elapsed time per iteration (s): 0.18 | learning rate: 7.754E-05 | global batch size: 256 | lm loss: 3.744656E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.852 | TFLOPs: 27.09 | +7: iteration 13400/ 21553 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 0.18 | learning rate: 7.742E-05 | global batch size: 256 | lm loss: 3.747202E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.112 | TFLOPs: 27.08 | +7: iteration 13410/ 21553 | consumed samples: 3432960 | consumed tokens: 7030702080 | elapsed time per iteration (s): 0.18 | learning rate: 7.730E-05 | global batch size: 256 | lm loss: 3.748899E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.145 | TFLOPs: 27.06 | +7: iteration 13420/ 21553 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 0.18 | learning rate: 7.717E-05 | global batch size: 256 | lm loss: 3.749274E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.514 | TFLOPs: 26.93 | +7: iteration 13430/ 21553 | consumed samples: 3438080 | consumed tokens: 7041187840 | elapsed time per iteration (s): 0.18 | learning rate: 7.705E-05 | global batch size: 256 | lm loss: 3.745875E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.974 | TFLOPs: 26.98 | +7: iteration 13440/ 21553 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 0.18 | learning rate: 7.693E-05 | global batch size: 256 | lm loss: 3.743495E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.571 | TFLOPs: 27.05 | +7: iteration 13450/ 21553 | consumed samples: 3443200 | consumed tokens: 7051673600 | elapsed time per iteration (s): 0.18 | learning rate: 7.680E-05 | global batch size: 256 | lm loss: 3.739624E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.827 | TFLOPs: 26.97 | +7: iteration 13460/ 21553 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 0.18 | learning rate: 7.668E-05 | global batch size: 256 | lm loss: 3.742994E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.984 | TFLOPs: 27.05 | +7: iteration 13470/ 21553 | consumed samples: 3448320 | consumed tokens: 7062159360 | elapsed time per iteration (s): 0.18 | learning rate: 7.656E-05 | global batch size: 256 | lm loss: 3.739444E+00 | grad norm: 0.495 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.963 | TFLOPs: 27.03 | +7: iteration 13480/ 21553 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 0.18 | learning rate: 7.644E-05 | global batch size: 256 | lm loss: 3.734958E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.455 | TFLOPs: 27.03 | +7: iteration 13490/ 21553 | consumed samples: 3453440 | consumed tokens: 7072645120 | elapsed time per iteration (s): 0.18 | learning rate: 7.631E-05 | global batch size: 256 | lm loss: 3.740568E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.785 | TFLOPs: 27.05 | +7: iteration 13500/ 21553 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 0.18 | learning rate: 7.619E-05 | global batch size: 256 | lm loss: 3.749968E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.918 | TFLOPs: 27.05 | +7: iteration 13510/ 21553 | consumed samples: 3458560 | consumed tokens: 7083130880 | elapsed time per iteration (s): 0.18 | learning rate: 7.607E-05 | global batch size: 256 | lm loss: 3.738253E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.433 | TFLOPs: 27.04 | +7: iteration 13520/ 21553 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 0.18 | learning rate: 7.594E-05 | global batch size: 256 | lm loss: 3.744713E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.716 | TFLOPs: 27.03 | +7: iteration 13530/ 21553 | consumed samples: 3463680 | consumed tokens: 7093616640 | elapsed time per iteration (s): 0.18 | learning rate: 7.582E-05 | global batch size: 256 | lm loss: 3.749196E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.183 | TFLOPs: 27.04 | +7: iteration 13540/ 21553 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 0.18 | learning rate: 7.570E-05 | global batch size: 256 | lm loss: 3.739270E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.953 | TFLOPs: 27.03 | +7: iteration 13550/ 21553 | consumed samples: 3468800 | consumed tokens: 7104102400 | elapsed time per iteration (s): 0.18 | learning rate: 7.558E-05 | global batch size: 256 | lm loss: 3.728800E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.447 | TFLOPs: 27.02 | +7: iteration 13560/ 21553 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 0.18 | learning rate: 7.545E-05 | global batch size: 256 | lm loss: 3.745338E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.637 | TFLOPs: 27.05 | +7: iteration 13570/ 21553 | consumed samples: 3473920 | consumed tokens: 7114588160 | elapsed time per iteration (s): 0.18 | learning rate: 7.533E-05 | global batch size: 256 | lm loss: 3.742161E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.992 | TFLOPs: 27.05 | +7: iteration 13580/ 21553 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 0.18 | learning rate: 7.521E-05 | global batch size: 256 | lm loss: 3.738711E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.362 | TFLOPs: 27.04 | +7: iteration 13590/ 21553 | consumed samples: 3479040 | consumed tokens: 7125073920 | elapsed time per iteration (s): 0.18 | learning rate: 7.509E-05 | global batch size: 256 | lm loss: 3.744593E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.576 | TFLOPs: 27.05 | +7: iteration 13600/ 21553 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 0.18 | learning rate: 7.497E-05 | global batch size: 256 | lm loss: 3.741431E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.183 | TFLOPs: 27.04 | +7: iteration 13610/ 21553 | consumed samples: 3484160 | consumed tokens: 7135559680 | elapsed time per iteration (s): 0.18 | learning rate: 7.484E-05 | global batch size: 256 | lm loss: 3.737359E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.035 | TFLOPs: 27.00 | +7: iteration 13620/ 21553 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 0.18 | learning rate: 7.472E-05 | global batch size: 256 | lm loss: 3.741756E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.718 | TFLOPs: 27.03 | +7: iteration 13630/ 21553 | consumed samples: 3489280 | consumed tokens: 7146045440 | elapsed time per iteration (s): 0.18 | learning rate: 7.460E-05 | global batch size: 256 | lm loss: 3.742531E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.329 | TFLOPs: 27.04 | +7: iteration 13640/ 21553 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 0.18 | learning rate: 7.448E-05 | global batch size: 256 | lm loss: 3.739111E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.330 | TFLOPs: 27.06 | +7: iteration 13650/ 21553 | consumed samples: 3494400 | consumed tokens: 7156531200 | elapsed time per iteration (s): 0.18 | learning rate: 7.436E-05 | global batch size: 256 | lm loss: 3.734393E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.240 | TFLOPs: 27.06 | +7: iteration 13660/ 21553 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 0.18 | learning rate: 7.423E-05 | global batch size: 256 | lm loss: 3.738979E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.381 | TFLOPs: 27.02 | +7: iteration 13670/ 21553 | consumed samples: 3499520 | consumed tokens: 7167016960 | elapsed time per iteration (s): 0.18 | learning rate: 7.411E-05 | global batch size: 256 | lm loss: 3.737482E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.947 | TFLOPs: 27.03 | +7: iteration 13680/ 21553 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 0.18 | learning rate: 7.399E-05 | global batch size: 256 | lm loss: 3.736683E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.817 | TFLOPs: 27.03 | +7: iteration 13690/ 21553 | consumed samples: 3504640 | consumed tokens: 7177502720 | elapsed time per iteration (s): 0.18 | learning rate: 7.387E-05 | global batch size: 256 | lm loss: 3.741576E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.913 | TFLOPs: 27.01 | +7: iteration 13700/ 21553 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 0.18 | learning rate: 7.375E-05 | global batch size: 256 | lm loss: 3.739792E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.257 | TFLOPs: 27.02 | +7: iteration 13710/ 21553 | consumed samples: 3509760 | consumed tokens: 7187988480 | elapsed time per iteration (s): 0.18 | learning rate: 7.363E-05 | global batch size: 256 | lm loss: 3.743460E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.585 | TFLOPs: 26.99 | +7: iteration 13720/ 21553 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 0.18 | learning rate: 7.351E-05 | global batch size: 256 | lm loss: 3.734141E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.197 | TFLOPs: 27.00 | +7: iteration 13730/ 21553 | consumed samples: 3514880 | consumed tokens: 7198474240 | elapsed time per iteration (s): 0.18 | learning rate: 7.339E-05 | global batch size: 256 | lm loss: 3.742052E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.079 | TFLOPs: 27.02 | +7: iteration 13740/ 21553 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 0.18 | learning rate: 7.326E-05 | global batch size: 256 | lm loss: 3.734595E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.322 | TFLOPs: 27.02 | +7: iteration 13750/ 21553 | consumed samples: 3520000 | consumed tokens: 7208960000 | elapsed time per iteration (s): 0.18 | learning rate: 7.314E-05 | global batch size: 256 | lm loss: 3.738504E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.992 | TFLOPs: 27.00 | +7: iteration 13760/ 21553 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 0.18 | learning rate: 7.302E-05 | global batch size: 256 | lm loss: 3.737737E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.980 | TFLOPs: 27.04 | +7: iteration 13770/ 21553 | consumed samples: 3525120 | consumed tokens: 7219445760 | elapsed time per iteration (s): 0.18 | learning rate: 7.290E-05 | global batch size: 256 | lm loss: 3.738781E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.291 | TFLOPs: 27.04 | +7: iteration 13780/ 21553 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 0.18 | learning rate: 7.278E-05 | global batch size: 256 | lm loss: 3.734082E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.135 | TFLOPs: 27.04 | +7: iteration 13790/ 21553 | consumed samples: 3530240 | consumed tokens: 7229931520 | elapsed time per iteration (s): 0.18 | learning rate: 7.266E-05 | global batch size: 256 | lm loss: 3.738936E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.626 | TFLOPs: 27.05 | +7: iteration 13800/ 21553 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 0.18 | learning rate: 7.254E-05 | global batch size: 256 | lm loss: 3.738063E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.712 | TFLOPs: 27.05 | +7: iteration 13810/ 21553 | consumed samples: 3535360 | consumed tokens: 7240417280 | elapsed time per iteration (s): 0.18 | learning rate: 7.242E-05 | global batch size: 256 | lm loss: 3.741245E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.660 | TFLOPs: 27.07 | +7: iteration 13820/ 21553 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 0.18 | learning rate: 7.230E-05 | global batch size: 256 | lm loss: 3.744098E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.699 | TFLOPs: 27.03 | +7: iteration 13830/ 21553 | consumed samples: 3540480 | consumed tokens: 7250903040 | elapsed time per iteration (s): 0.18 | learning rate: 7.218E-05 | global batch size: 256 | lm loss: 3.747808E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.865 | TFLOPs: 27.07 | +7: iteration 13840/ 21553 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 0.18 | learning rate: 7.206E-05 | global batch size: 256 | lm loss: 3.739523E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.949 | TFLOPs: 27.03 | +7: iteration 13850/ 21553 | consumed samples: 3545600 | consumed tokens: 7261388800 | elapsed time per iteration (s): 0.18 | learning rate: 7.194E-05 | global batch size: 256 | lm loss: 3.736797E+00 | grad norm: 0.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.726 | TFLOPs: 26.99 | +7: iteration 13860/ 21553 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 0.18 | learning rate: 7.182E-05 | global batch size: 256 | lm loss: 3.734580E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.856 | TFLOPs: 26.99 | +7: iteration 13870/ 21553 | consumed samples: 3550720 | consumed tokens: 7271874560 | elapsed time per iteration (s): 0.18 | learning rate: 7.170E-05 | global batch size: 256 | lm loss: 3.732168E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.838 | TFLOPs: 27.01 | +7: iteration 13880/ 21553 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 0.18 | learning rate: 7.158E-05 | global batch size: 256 | lm loss: 3.738575E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.244 | TFLOPs: 27.00 | +7: iteration 13890/ 21553 | consumed samples: 3555840 | consumed tokens: 7282360320 | elapsed time per iteration (s): 0.18 | learning rate: 7.146E-05 | global batch size: 256 | lm loss: 3.739870E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.491 | TFLOPs: 27.01 | +7: iteration 13900/ 21553 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 0.18 | learning rate: 7.134E-05 | global batch size: 256 | lm loss: 3.736049E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.853 | TFLOPs: 26.99 | +7: iteration 13910/ 21553 | consumed samples: 3560960 | consumed tokens: 7292846080 | elapsed time per iteration (s): 0.18 | learning rate: 7.122E-05 | global batch size: 256 | lm loss: 3.739278E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.034 | TFLOPs: 26.98 | +7: iteration 13920/ 21553 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 0.18 | learning rate: 7.110E-05 | global batch size: 256 | lm loss: 3.740556E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.985 | TFLOPs: 27.00 | +7: iteration 13930/ 21553 | consumed samples: 3566080 | consumed tokens: 7303331840 | elapsed time per iteration (s): 0.18 | learning rate: 7.098E-05 | global batch size: 256 | lm loss: 3.735896E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.642 | TFLOPs: 26.99 | +7: iteration 13940/ 21553 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 0.18 | learning rate: 7.086E-05 | global batch size: 256 | lm loss: 3.736842E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.181 | TFLOPs: 26.98 | +7: iteration 13950/ 21553 | consumed samples: 3571200 | consumed tokens: 7313817600 | elapsed time per iteration (s): 0.18 | learning rate: 7.074E-05 | global batch size: 256 | lm loss: 3.739101E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.051 | TFLOPs: 27.00 | +7: iteration 13960/ 21553 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 0.18 | learning rate: 7.062E-05 | global batch size: 256 | lm loss: 3.740583E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.695 | TFLOPs: 27.01 | +7: iteration 13970/ 21553 | consumed samples: 3576320 | consumed tokens: 7324303360 | elapsed time per iteration (s): 0.18 | learning rate: 7.050E-05 | global batch size: 256 | lm loss: 3.732656E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.748 | TFLOPs: 27.03 | +7: iteration 13980/ 21553 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 0.18 | learning rate: 7.038E-05 | global batch size: 256 | lm loss: 3.734490E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.696 | TFLOPs: 26.99 | +7: iteration 13990/ 21553 | consumed samples: 3581440 | consumed tokens: 7334789120 | elapsed time per iteration (s): 0.18 | learning rate: 7.027E-05 | global batch size: 256 | lm loss: 3.737610E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.651 | TFLOPs: 27.01 | +0: [2023-03-17 00:18:08,996] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[7.014654688611906e-05, 7.014654688611906e-05, 7.014654688611906e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 14000/ 21553 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 0.18 | learning rate: 7.015E-05 | global batch size: 256 | lm loss: 3.732229E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.687 | TFLOPs: 27.01 | +0: steps: 14000 loss: 3.7128 iter time (s): 0.180 samples/sec: 1424.883 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 14000 | lm loss value: 3.878067E+00 | lm loss PPL: 4.833070E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 14000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:18:09,084] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is begin to save! +0: [2023-03-17 00:18:09,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:18:09,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:18:09,160] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:18:09,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:18:09,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:18:09,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:18:09,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:18:09,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:18:09,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:18:09,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:18:09,208] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:18:09,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:18:09,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:18:09,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:18:09,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:18:09,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:18:09,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:18:09,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:18:09,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:18:09,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:18:09,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:18:09,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:18:09,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:18:09,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:18:09,277] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step14000/mp_rank_00_model_states.pt +0: [2023-03-17 00:18:09,277] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:18:09,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:18:09,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:18:09,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-17 00:18:09,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:18:09,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-17 00:18:09,328] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:18:09,328] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-17 00:18:09,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:18:09,330] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:18:09,330] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: successfully saved checkpoint at iteration 14000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 250.05 +7: iteration 14010/ 21553 | consumed samples: 3586560 | consumed tokens: 7345274880 | elapsed time per iteration (s): 0.21 | learning rate: 7.003E-05 | global batch size: 256 | lm loss: 3.736747E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1196.237 | TFLOPs: 22.86 | +7: iteration 14020/ 21553 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 0.18 | learning rate: 6.991E-05 | global batch size: 256 | lm loss: 3.737833E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.163 | TFLOPs: 27.04 | +7: iteration 14030/ 21553 | consumed samples: 3591680 | consumed tokens: 7355760640 | elapsed time per iteration (s): 0.18 | learning rate: 6.979E-05 | global batch size: 256 | lm loss: 3.733771E+00 | grad norm: 0.480 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.894 | TFLOPs: 27.03 | +7: iteration 14040/ 21553 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 0.18 | learning rate: 6.967E-05 | global batch size: 256 | lm loss: 3.736082E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.259 | TFLOPs: 27.04 | +7: iteration 14050/ 21553 | consumed samples: 3596800 | consumed tokens: 7366246400 | elapsed time per iteration (s): 0.18 | learning rate: 6.955E-05 | global batch size: 256 | lm loss: 3.738498E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.073 | TFLOPs: 27.02 | +7: iteration 14060/ 21553 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 0.18 | learning rate: 6.944E-05 | global batch size: 256 | lm loss: 3.736515E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.994 | TFLOPs: 27.05 | +7: iteration 14070/ 21553 | consumed samples: 3601920 | consumed tokens: 7376732160 | elapsed time per iteration (s): 0.18 | learning rate: 6.932E-05 | global batch size: 256 | lm loss: 3.737748E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.326 | TFLOPs: 27.04 | +7: iteration 14080/ 21553 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 0.18 | learning rate: 6.920E-05 | global batch size: 256 | lm loss: 3.737151E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.361 | TFLOPs: 27.04 | +7: iteration 14090/ 21553 | consumed samples: 3607040 | consumed tokens: 7387217920 | elapsed time per iteration (s): 0.18 | learning rate: 6.908E-05 | global batch size: 256 | lm loss: 3.727998E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.178 | TFLOPs: 27.04 | +7: iteration 14100/ 21553 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 0.18 | learning rate: 6.896E-05 | global batch size: 256 | lm loss: 3.728045E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.258 | TFLOPs: 27.04 | +7: iteration 14110/ 21553 | consumed samples: 3612160 | consumed tokens: 7397703680 | elapsed time per iteration (s): 0.18 | learning rate: 6.884E-05 | global batch size: 256 | lm loss: 3.734591E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.623 | TFLOPs: 27.05 | +7: iteration 14120/ 21553 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 0.18 | learning rate: 6.873E-05 | global batch size: 256 | lm loss: 3.727240E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.800 | TFLOPs: 27.05 | +7: iteration 14130/ 21553 | consumed samples: 3617280 | consumed tokens: 7408189440 | elapsed time per iteration (s): 0.18 | learning rate: 6.861E-05 | global batch size: 256 | lm loss: 3.731980E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.932 | TFLOPs: 27.07 | +7: iteration 14140/ 21553 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 0.18 | learning rate: 6.849E-05 | global batch size: 256 | lm loss: 3.739294E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.357 | TFLOPs: 27.04 | +7: iteration 14150/ 21553 | consumed samples: 3622400 | consumed tokens: 7418675200 | elapsed time per iteration (s): 0.18 | learning rate: 6.837E-05 | global batch size: 256 | lm loss: 3.729831E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.280 | TFLOPs: 27.06 | +7: iteration 14160/ 21553 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 0.18 | learning rate: 6.826E-05 | global batch size: 256 | lm loss: 3.736556E+00 | grad norm: 0.509 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.768 | TFLOPs: 27.03 | +7: iteration 14170/ 21553 | consumed samples: 3627520 | consumed tokens: 7429160960 | elapsed time per iteration (s): 0.18 | learning rate: 6.814E-05 | global batch size: 256 | lm loss: 3.734802E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.233 | TFLOPs: 27.04 | +7: iteration 14180/ 21553 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 0.18 | learning rate: 6.802E-05 | global batch size: 256 | lm loss: 3.738513E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.853 | TFLOPs: 26.90 | +7: iteration 14190/ 21553 | consumed samples: 3632640 | consumed tokens: 7439646720 | elapsed time per iteration (s): 0.18 | learning rate: 6.791E-05 | global batch size: 256 | lm loss: 3.730571E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.484 | TFLOPs: 27.06 | +7: iteration 14200/ 21553 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 0.18 | learning rate: 6.779E-05 | global batch size: 256 | lm loss: 3.733553E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.073 | TFLOPs: 27.06 | +7: iteration 14210/ 21553 | consumed samples: 3637760 | consumed tokens: 7450132480 | elapsed time per iteration (s): 0.18 | learning rate: 6.767E-05 | global batch size: 256 | lm loss: 3.730916E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.990 | TFLOPs: 27.07 | +7: iteration 14220/ 21553 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 0.18 | learning rate: 6.755E-05 | global batch size: 256 | lm loss: 3.737244E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.861 | TFLOPs: 27.05 | +7: iteration 14230/ 21553 | consumed samples: 3642880 | consumed tokens: 7460618240 | elapsed time per iteration (s): 0.18 | learning rate: 6.744E-05 | global batch size: 256 | lm loss: 3.732253E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.724 | TFLOPs: 27.05 | +7: iteration 14240/ 21553 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 0.18 | learning rate: 6.732E-05 | global batch size: 256 | lm loss: 3.734262E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.815 | TFLOPs: 27.03 | +7: iteration 14250/ 21553 | consumed samples: 3648000 | consumed tokens: 7471104000 | elapsed time per iteration (s): 0.18 | learning rate: 6.720E-05 | global batch size: 256 | lm loss: 3.733065E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.763 | TFLOPs: 27.05 | +7: iteration 14260/ 21553 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 0.18 | learning rate: 6.709E-05 | global batch size: 256 | lm loss: 3.727631E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.001 | TFLOPs: 27.00 | +7: iteration 14270/ 21553 | consumed samples: 3653120 | consumed tokens: 7481589760 | elapsed time per iteration (s): 0.18 | learning rate: 6.697E-05 | global batch size: 256 | lm loss: 3.731855E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.733 | TFLOPs: 26.99 | +7: iteration 14280/ 21553 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 0.18 | learning rate: 6.685E-05 | global batch size: 256 | lm loss: 3.730053E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.413 | TFLOPs: 27.02 | +7: iteration 14290/ 21553 | consumed samples: 3658240 | consumed tokens: 7492075520 | elapsed time per iteration (s): 0.18 | learning rate: 6.674E-05 | global batch size: 256 | lm loss: 3.728869E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.989 | TFLOPs: 27.02 | +7: iteration 14300/ 21553 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 0.18 | learning rate: 6.662E-05 | global batch size: 256 | lm loss: 3.734925E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.223 | TFLOPs: 27.02 | +7: iteration 14310/ 21553 | consumed samples: 3663360 | consumed tokens: 7502561280 | elapsed time per iteration (s): 0.18 | learning rate: 6.651E-05 | global batch size: 256 | lm loss: 3.737187E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.104 | TFLOPs: 27.00 | +7: iteration 14320/ 21553 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 0.18 | learning rate: 6.639E-05 | global batch size: 256 | lm loss: 3.727664E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.957 | TFLOPs: 27.00 | +7: iteration 14330/ 21553 | consumed samples: 3668480 | consumed tokens: 7513047040 | elapsed time per iteration (s): 0.18 | learning rate: 6.627E-05 | global batch size: 256 | lm loss: 3.737962E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.412 | TFLOPs: 26.95 | +7: iteration 14340/ 21553 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 0.18 | learning rate: 6.616E-05 | global batch size: 256 | lm loss: 3.729431E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.922 | TFLOPs: 26.82 | +7: iteration 14350/ 21553 | consumed samples: 3673600 | consumed tokens: 7523532800 | elapsed time per iteration (s): 0.18 | learning rate: 6.604E-05 | global batch size: 256 | lm loss: 3.730159E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.017 | TFLOPs: 27.05 | +7: iteration 14360/ 21553 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 0.18 | learning rate: 6.593E-05 | global batch size: 256 | lm loss: 3.726107E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.624 | TFLOPs: 27.03 | +7: iteration 14370/ 21553 | consumed samples: 3678720 | consumed tokens: 7534018560 | elapsed time per iteration (s): 0.18 | learning rate: 6.581E-05 | global batch size: 256 | lm loss: 3.734931E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.254 | TFLOPs: 27.04 | +7: iteration 14380/ 21553 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 0.18 | learning rate: 6.570E-05 | global batch size: 256 | lm loss: 3.726258E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.783 | TFLOPs: 27.03 | +7: iteration 14390/ 21553 | consumed samples: 3683840 | consumed tokens: 7544504320 | elapsed time per iteration (s): 0.18 | learning rate: 6.558E-05 | global batch size: 256 | lm loss: 3.725901E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.026 | TFLOPs: 27.02 | +7: iteration 14400/ 21553 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 0.18 | learning rate: 6.547E-05 | global batch size: 256 | lm loss: 3.728224E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.732 | TFLOPs: 27.01 | +7: iteration 14410/ 21553 | consumed samples: 3688960 | consumed tokens: 7554990080 | elapsed time per iteration (s): 0.18 | learning rate: 6.535E-05 | global batch size: 256 | lm loss: 3.728660E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.866 | TFLOPs: 26.98 | +7: iteration 14420/ 21553 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 0.18 | learning rate: 6.524E-05 | global batch size: 256 | lm loss: 3.725967E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.887 | TFLOPs: 27.01 | +7: iteration 14430/ 21553 | consumed samples: 3694080 | consumed tokens: 7565475840 | elapsed time per iteration (s): 0.18 | learning rate: 6.512E-05 | global batch size: 256 | lm loss: 3.729314E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.385 | TFLOPs: 27.02 | +7: iteration 14440/ 21553 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 0.18 | learning rate: 6.501E-05 | global batch size: 256 | lm loss: 3.732967E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.155 | TFLOPs: 27.02 | +7: iteration 14450/ 21553 | consumed samples: 3699200 | consumed tokens: 7575961600 | elapsed time per iteration (s): 0.18 | learning rate: 6.489E-05 | global batch size: 256 | lm loss: 3.724398E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.203 | TFLOPs: 27.00 | +7: iteration 14460/ 21553 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 0.18 | learning rate: 6.478E-05 | global batch size: 256 | lm loss: 3.731999E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.789 | TFLOPs: 27.03 | +7: iteration 14470/ 21553 | consumed samples: 3704320 | consumed tokens: 7586447360 | elapsed time per iteration (s): 0.18 | learning rate: 6.466E-05 | global batch size: 256 | lm loss: 3.729901E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.152 | TFLOPs: 26.98 | +7: iteration 14480/ 21553 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 0.18 | learning rate: 6.455E-05 | global batch size: 256 | lm loss: 3.719842E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.648 | TFLOPs: 27.01 | +7: iteration 14490/ 21553 | consumed samples: 3709440 | consumed tokens: 7596933120 | elapsed time per iteration (s): 0.18 | learning rate: 6.443E-05 | global batch size: 256 | lm loss: 3.732669E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.540 | TFLOPs: 26.97 | +7: iteration 14500/ 21553 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 0.18 | learning rate: 6.432E-05 | global batch size: 256 | lm loss: 3.735263E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.156 | TFLOPs: 26.98 | +7: iteration 14510/ 21553 | consumed samples: 3714560 | consumed tokens: 7607418880 | elapsed time per iteration (s): 0.18 | learning rate: 6.421E-05 | global batch size: 256 | lm loss: 3.722626E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.887 | TFLOPs: 26.98 | +7: iteration 14520/ 21553 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 0.18 | learning rate: 6.409E-05 | global batch size: 256 | lm loss: 3.725641E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.100 | TFLOPs: 26.98 | +7: iteration 14530/ 21553 | consumed samples: 3719680 | consumed tokens: 7617904640 | elapsed time per iteration (s): 0.18 | learning rate: 6.398E-05 | global batch size: 256 | lm loss: 3.725890E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.734 | TFLOPs: 26.97 | +7: iteration 14540/ 21553 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 0.18 | learning rate: 6.386E-05 | global batch size: 256 | lm loss: 3.728075E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.568 | TFLOPs: 26.99 | +7: iteration 14550/ 21553 | consumed samples: 3724800 | consumed tokens: 7628390400 | elapsed time per iteration (s): 0.18 | learning rate: 6.375E-05 | global batch size: 256 | lm loss: 3.725412E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.228 | TFLOPs: 27.00 | +7: iteration 14560/ 21553 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 0.18 | learning rate: 6.364E-05 | global batch size: 256 | lm loss: 3.726612E+00 | grad norm: 0.459 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.189 | TFLOPs: 27.00 | +7: iteration 14570/ 21553 | consumed samples: 3729920 | consumed tokens: 7638876160 | elapsed time per iteration (s): 0.18 | learning rate: 6.352E-05 | global batch size: 256 | lm loss: 3.727578E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.810 | TFLOPs: 26.99 | +7: iteration 14580/ 21553 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 0.18 | learning rate: 6.341E-05 | global batch size: 256 | lm loss: 3.726369E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.117 | TFLOPs: 27.00 | +7: iteration 14590/ 21553 | consumed samples: 3735040 | consumed tokens: 7649361920 | elapsed time per iteration (s): 0.18 | learning rate: 6.330E-05 | global batch size: 256 | lm loss: 3.723900E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.859 | TFLOPs: 26.99 | +7: iteration 14600/ 21553 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 0.18 | learning rate: 6.318E-05 | global batch size: 256 | lm loss: 3.725121E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.272 | TFLOPs: 26.96 | +7: iteration 14610/ 21553 | consumed samples: 3740160 | consumed tokens: 7659847680 | elapsed time per iteration (s): 0.18 | learning rate: 6.307E-05 | global batch size: 256 | lm loss: 3.730658E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.404 | TFLOPs: 27.01 | +7: iteration 14620/ 21553 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 0.18 | learning rate: 6.296E-05 | global batch size: 256 | lm loss: 3.727178E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.008 | TFLOPs: 27.02 | +7: iteration 14630/ 21553 | consumed samples: 3745280 | consumed tokens: 7670333440 | elapsed time per iteration (s): 0.18 | learning rate: 6.284E-05 | global batch size: 256 | lm loss: 3.728466E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.106 | TFLOPs: 27.00 | +7: iteration 14640/ 21553 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 0.18 | learning rate: 6.273E-05 | global batch size: 256 | lm loss: 3.732351E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.164 | TFLOPs: 27.02 | +7: iteration 14650/ 21553 | consumed samples: 3750400 | consumed tokens: 7680819200 | elapsed time per iteration (s): 0.18 | learning rate: 6.262E-05 | global batch size: 256 | lm loss: 3.730153E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.307 | TFLOPs: 27.00 | +7: iteration 14660/ 21553 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 0.18 | learning rate: 6.251E-05 | global batch size: 256 | lm loss: 3.724799E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.597 | TFLOPs: 27.01 | +7: iteration 14670/ 21553 | consumed samples: 3755520 | consumed tokens: 7691304960 | elapsed time per iteration (s): 0.18 | learning rate: 6.239E-05 | global batch size: 256 | lm loss: 3.727349E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.606 | TFLOPs: 27.01 | +7: iteration 14680/ 21553 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 0.18 | learning rate: 6.228E-05 | global batch size: 256 | lm loss: 3.719462E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.048 | TFLOPs: 27.02 | +7: iteration 14690/ 21553 | consumed samples: 3760640 | consumed tokens: 7701790720 | elapsed time per iteration (s): 0.18 | learning rate: 6.217E-05 | global batch size: 256 | lm loss: 3.726454E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.785 | TFLOPs: 26.99 | +7: iteration 14700/ 21553 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 0.18 | learning rate: 6.206E-05 | global batch size: 256 | lm loss: 3.717667E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.427 | TFLOPs: 27.04 | +7: iteration 14710/ 21553 | consumed samples: 3765760 | consumed tokens: 7712276480 | elapsed time per iteration (s): 0.18 | learning rate: 6.194E-05 | global batch size: 256 | lm loss: 3.723857E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.745 | TFLOPs: 27.03 | +7: iteration 14720/ 21553 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 0.18 | learning rate: 6.183E-05 | global batch size: 256 | lm loss: 3.728623E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.091 | TFLOPs: 27.02 | +7: iteration 14730/ 21553 | consumed samples: 3770880 | consumed tokens: 7722762240 | elapsed time per iteration (s): 0.18 | learning rate: 6.172E-05 | global batch size: 256 | lm loss: 3.730311E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.289 | TFLOPs: 27.00 | +7: iteration 14740/ 21553 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 0.18 | learning rate: 6.161E-05 | global batch size: 256 | lm loss: 3.727093E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.925 | TFLOPs: 27.01 | +7: iteration 14750/ 21553 | consumed samples: 3776000 | consumed tokens: 7733248000 | elapsed time per iteration (s): 0.18 | learning rate: 6.150E-05 | global batch size: 256 | lm loss: 3.729481E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.376 | TFLOPs: 26.99 | +7: iteration 14760/ 21553 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 0.18 | learning rate: 6.139E-05 | global batch size: 256 | lm loss: 3.720874E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.594 | TFLOPs: 27.05 | +7: iteration 14770/ 21553 | consumed samples: 3781120 | consumed tokens: 7743733760 | elapsed time per iteration (s): 0.18 | learning rate: 6.127E-05 | global batch size: 256 | lm loss: 3.720665E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.905 | TFLOPs: 27.00 | +7: iteration 14780/ 21553 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 0.18 | learning rate: 6.116E-05 | global batch size: 256 | lm loss: 3.727397E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.269 | TFLOPs: 27.04 | +7: iteration 14790/ 21553 | consumed samples: 3786240 | consumed tokens: 7754219520 | elapsed time per iteration (s): 0.18 | learning rate: 6.105E-05 | global batch size: 256 | lm loss: 3.723468E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.190 | TFLOPs: 27.02 | +7: iteration 14800/ 21553 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 0.18 | learning rate: 6.094E-05 | global batch size: 256 | lm loss: 3.729050E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.069 | TFLOPs: 27.04 | +7: iteration 14810/ 21553 | consumed samples: 3791360 | consumed tokens: 7764705280 | elapsed time per iteration (s): 0.18 | learning rate: 6.083E-05 | global batch size: 256 | lm loss: 3.719802E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.299 | TFLOPs: 27.00 | +7: iteration 14820/ 21553 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 0.18 | learning rate: 6.072E-05 | global batch size: 256 | lm loss: 3.732477E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.575 | TFLOPs: 27.03 | +7: iteration 14830/ 21553 | consumed samples: 3796480 | consumed tokens: 7775191040 | elapsed time per iteration (s): 0.18 | learning rate: 6.061E-05 | global batch size: 256 | lm loss: 3.725742E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.418 | TFLOPs: 27.04 | +7: iteration 14840/ 21553 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 0.18 | learning rate: 6.050E-05 | global batch size: 256 | lm loss: 3.718962E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.646 | TFLOPs: 27.01 | +7: iteration 14850/ 21553 | consumed samples: 3801600 | consumed tokens: 7785676800 | elapsed time per iteration (s): 0.18 | learning rate: 6.039E-05 | global batch size: 256 | lm loss: 3.729071E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.806 | TFLOPs: 27.03 | +7: iteration 14860/ 21553 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 0.18 | learning rate: 6.028E-05 | global batch size: 256 | lm loss: 3.715299E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.201 | TFLOPs: 27.02 | +7: iteration 14870/ 21553 | consumed samples: 3806720 | consumed tokens: 7796162560 | elapsed time per iteration (s): 0.18 | learning rate: 6.016E-05 | global batch size: 256 | lm loss: 3.721154E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.695 | TFLOPs: 27.01 | +7: iteration 14880/ 21553 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 0.18 | learning rate: 6.005E-05 | global batch size: 256 | lm loss: 3.728336E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.847 | TFLOPs: 27.01 | +7: iteration 14890/ 21553 | consumed samples: 3811840 | consumed tokens: 7806648320 | elapsed time per iteration (s): 0.18 | learning rate: 5.994E-05 | global batch size: 256 | lm loss: 3.725212E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.185 | TFLOPs: 27.00 | +7: iteration 14900/ 21553 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 0.18 | learning rate: 5.983E-05 | global batch size: 256 | lm loss: 3.722099E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.383 | TFLOPs: 27.02 | +7: iteration 14910/ 21553 | consumed samples: 3816960 | consumed tokens: 7817134080 | elapsed time per iteration (s): 0.18 | learning rate: 5.972E-05 | global batch size: 256 | lm loss: 3.718298E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.237 | TFLOPs: 27.04 | +7: iteration 14920/ 21553 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 0.18 | learning rate: 5.961E-05 | global batch size: 256 | lm loss: 3.719828E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.216 | TFLOPs: 27.02 | +7: iteration 14930/ 21553 | consumed samples: 3822080 | consumed tokens: 7827619840 | elapsed time per iteration (s): 0.18 | learning rate: 5.950E-05 | global batch size: 256 | lm loss: 3.718053E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.539 | TFLOPs: 27.05 | +7: iteration 14940/ 21553 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 0.18 | learning rate: 5.940E-05 | global batch size: 256 | lm loss: 3.726313E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.248 | TFLOPs: 27.04 | +7: iteration 14950/ 21553 | consumed samples: 3827200 | consumed tokens: 7838105600 | elapsed time per iteration (s): 0.18 | learning rate: 5.929E-05 | global batch size: 256 | lm loss: 3.728185E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.484 | TFLOPs: 27.03 | +7: iteration 14960/ 21553 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 0.18 | learning rate: 5.918E-05 | global batch size: 256 | lm loss: 3.725694E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.075 | TFLOPs: 27.02 | +7: iteration 14970/ 21553 | consumed samples: 3832320 | consumed tokens: 7848591360 | elapsed time per iteration (s): 0.18 | learning rate: 5.907E-05 | global batch size: 256 | lm loss: 3.724049E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.491 | TFLOPs: 27.03 | +7: iteration 14980/ 21553 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 0.18 | learning rate: 5.896E-05 | global batch size: 256 | lm loss: 3.725526E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.746 | TFLOPs: 27.03 | +7: iteration 14990/ 21553 | consumed samples: 3837440 | consumed tokens: 7859077120 | elapsed time per iteration (s): 0.18 | learning rate: 5.885E-05 | global batch size: 256 | lm loss: 3.722934E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.846 | TFLOPs: 27.03 | +7: iteration 15000/ 21553 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 0.18 | learning rate: 5.874E-05 | global batch size: 256 | lm loss: 3.723771E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.699 | TFLOPs: 26.99 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 15000 | lm loss value: 3.902675E+00 | lm loss PPL: 4.953480E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 15000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:21:10,444] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step15000 is begin to save! +0: [2023-03-17 00:21:10,448] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:21:10,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:21:10,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:21:10,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:21:10,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:21:10,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:21:10,546] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:21:10,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:21:10,557] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:21:10,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:21:10,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:21:10,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:21:10,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:21:10,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:21:10,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:21:10,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:21:10,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:21:10,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:21:10,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:21:10,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:21:10,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:21:10,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:21:10,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:21:10,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:21:10,637] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step15000/mp_rank_00_model_states.pt +0: [2023-03-17 00:21:10,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:21:10,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:21:10,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:21:10,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:21:10,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:10,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-17 00:21:10,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:21:10,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:21:10,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-17 00:21:10,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:21:10,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:21:10,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-17 00:21:10,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:21:10,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:21:10,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-17 00:21:10,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:21:10,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:21:10,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-17 00:21:10,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:21:10,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:21:10,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: successfully saved checkpoint at iteration 15000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 245.22 +7: iteration 15010/ 21553 | consumed samples: 3842560 | consumed tokens: 7869562880 | elapsed time per iteration (s): 0.21 | learning rate: 5.863E-05 | global batch size: 256 | lm loss: 3.722417E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1209.797 | TFLOPs: 23.11 | +7: iteration 15020/ 21553 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 0.18 | learning rate: 5.852E-05 | global batch size: 256 | lm loss: 3.722146E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.043 | TFLOPs: 27.02 | +7: iteration 15030/ 21553 | consumed samples: 3847680 | consumed tokens: 7880048640 | elapsed time per iteration (s): 0.18 | learning rate: 5.841E-05 | global batch size: 256 | lm loss: 3.727606E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.286 | TFLOPs: 27.02 | +7: iteration 15040/ 21553 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 0.18 | learning rate: 5.830E-05 | global batch size: 256 | lm loss: 3.718211E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.299 | TFLOPs: 27.02 | +7: iteration 15050/ 21553 | consumed samples: 3852800 | consumed tokens: 7890534400 | elapsed time per iteration (s): 0.18 | learning rate: 5.820E-05 | global batch size: 256 | lm loss: 3.722050E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.292 | TFLOPs: 27.02 | +7: iteration 15060/ 21553 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 0.18 | learning rate: 5.809E-05 | global batch size: 256 | lm loss: 3.730484E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.836 | TFLOPs: 27.01 | +7: iteration 15070/ 21553 | consumed samples: 3857920 | consumed tokens: 7901020160 | elapsed time per iteration (s): 0.18 | learning rate: 5.798E-05 | global batch size: 256 | lm loss: 3.712538E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.368 | TFLOPs: 27.00 | +7: iteration 15080/ 21553 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 0.18 | learning rate: 5.787E-05 | global batch size: 256 | lm loss: 3.725491E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.387 | TFLOPs: 27.02 | +7: iteration 15090/ 21553 | consumed samples: 3863040 | consumed tokens: 7911505920 | elapsed time per iteration (s): 0.18 | learning rate: 5.776E-05 | global batch size: 256 | lm loss: 3.727639E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.686 | TFLOPs: 27.01 | +7: iteration 15100/ 21553 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 0.18 | learning rate: 5.766E-05 | global batch size: 256 | lm loss: 3.709209E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.413 | TFLOPs: 27.01 | +7: iteration 15110/ 21553 | consumed samples: 3868160 | consumed tokens: 7921991680 | elapsed time per iteration (s): 0.18 | learning rate: 5.755E-05 | global batch size: 256 | lm loss: 3.723699E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.160 | TFLOPs: 27.02 | +7: iteration 15120/ 21553 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 0.18 | learning rate: 5.744E-05 | global batch size: 256 | lm loss: 3.723224E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.316 | TFLOPs: 27.04 | +7: iteration 15130/ 21553 | consumed samples: 3873280 | consumed tokens: 7932477440 | elapsed time per iteration (s): 0.18 | learning rate: 5.733E-05 | global batch size: 256 | lm loss: 3.722418E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.395 | TFLOPs: 27.02 | +7: iteration 15140/ 21553 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 0.18 | learning rate: 5.723E-05 | global batch size: 256 | lm loss: 3.723925E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.748 | TFLOPs: 26.99 | +7: iteration 15150/ 21553 | consumed samples: 3878400 | consumed tokens: 7942963200 | elapsed time per iteration (s): 0.18 | learning rate: 5.712E-05 | global batch size: 256 | lm loss: 3.715318E+00 | grad norm: 0.511 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.133 | TFLOPs: 26.98 | +7: iteration 15160/ 21553 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 0.18 | learning rate: 5.701E-05 | global batch size: 256 | lm loss: 3.719263E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.671 | TFLOPs: 26.99 | +7: iteration 15170/ 21553 | consumed samples: 3883520 | consumed tokens: 7953448960 | elapsed time per iteration (s): 0.18 | learning rate: 5.690E-05 | global batch size: 256 | lm loss: 3.721910E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.840 | TFLOPs: 26.82 | +7: iteration 15180/ 21553 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 0.18 | learning rate: 5.680E-05 | global batch size: 256 | lm loss: 3.719553E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.086 | TFLOPs: 26.98 | +7: iteration 15190/ 21553 | consumed samples: 3888640 | consumed tokens: 7963934720 | elapsed time per iteration (s): 0.18 | learning rate: 5.669E-05 | global batch size: 256 | lm loss: 3.721678E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1400.443 | TFLOPs: 26.76 | +7: iteration 15200/ 21553 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 0.18 | learning rate: 5.658E-05 | global batch size: 256 | lm loss: 3.719833E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.803 | TFLOPs: 26.97 | +7: iteration 15210/ 21553 | consumed samples: 3893760 | consumed tokens: 7974420480 | elapsed time per iteration (s): 0.18 | learning rate: 5.648E-05 | global batch size: 256 | lm loss: 3.720170E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.782 | TFLOPs: 26.97 | +7: iteration 15220/ 21553 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 0.18 | learning rate: 5.637E-05 | global batch size: 256 | lm loss: 3.715648E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.266 | TFLOPs: 26.98 | +7: iteration 15230/ 21553 | consumed samples: 3898880 | consumed tokens: 7984906240 | elapsed time per iteration (s): 0.18 | learning rate: 5.626E-05 | global batch size: 256 | lm loss: 3.723201E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.450 | TFLOPs: 26.99 | +7: iteration 15240/ 21553 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 0.18 | learning rate: 5.616E-05 | global batch size: 256 | lm loss: 3.724339E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.014 | TFLOPs: 27.00 | +7: iteration 15250/ 21553 | consumed samples: 3904000 | consumed tokens: 7995392000 | elapsed time per iteration (s): 0.18 | learning rate: 5.605E-05 | global batch size: 256 | lm loss: 3.719672E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.953 | TFLOPs: 26.98 | +7: iteration 15260/ 21553 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 0.20 | learning rate: 5.595E-05 | global batch size: 256 | lm loss: 3.723463E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1298.483 | TFLOPs: 24.81 | +7: iteration 15270/ 21553 | consumed samples: 3909120 | consumed tokens: 8005877760 | elapsed time per iteration (s): 0.18 | learning rate: 5.584E-05 | global batch size: 256 | lm loss: 3.722166E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.838 | TFLOPs: 26.98 | +7: iteration 15280/ 21553 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 0.18 | learning rate: 5.573E-05 | global batch size: 256 | lm loss: 3.721074E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.562 | TFLOPs: 26.99 | +7: iteration 15290/ 21553 | consumed samples: 3914240 | consumed tokens: 8016363520 | elapsed time per iteration (s): 0.18 | learning rate: 5.563E-05 | global batch size: 256 | lm loss: 3.719865E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.862 | TFLOPs: 26.98 | +7: iteration 15300/ 21553 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 0.18 | learning rate: 5.552E-05 | global batch size: 256 | lm loss: 3.718918E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.401 | TFLOPs: 26.99 | +7: iteration 15310/ 21553 | consumed samples: 3919360 | consumed tokens: 8026849280 | elapsed time per iteration (s): 0.18 | learning rate: 5.542E-05 | global batch size: 256 | lm loss: 3.719225E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.722 | TFLOPs: 26.97 | +7: iteration 15320/ 21553 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 0.18 | learning rate: 5.531E-05 | global batch size: 256 | lm loss: 3.715458E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.175 | TFLOPs: 26.98 | +7: iteration 15330/ 21553 | consumed samples: 3924480 | consumed tokens: 8037335040 | elapsed time per iteration (s): 0.18 | learning rate: 5.521E-05 | global batch size: 256 | lm loss: 3.714796E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.177 | TFLOPs: 26.96 | +7: iteration 15340/ 21553 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 0.18 | learning rate: 5.510E-05 | global batch size: 256 | lm loss: 3.723755E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.037 | TFLOPs: 26.98 | +7: iteration 15350/ 21553 | consumed samples: 3929600 | consumed tokens: 8047820800 | elapsed time per iteration (s): 0.18 | learning rate: 5.500E-05 | global batch size: 256 | lm loss: 3.717440E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.772 | TFLOPs: 26.99 | +7: iteration 15360/ 21553 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 0.18 | learning rate: 5.489E-05 | global batch size: 256 | lm loss: 3.716414E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.223 | TFLOPs: 26.98 | +7: iteration 15370/ 21553 | consumed samples: 3934720 | consumed tokens: 8058306560 | elapsed time per iteration (s): 0.18 | learning rate: 5.479E-05 | global batch size: 256 | lm loss: 3.713741E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.041 | TFLOPs: 27.00 | +7: iteration 15380/ 21553 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 0.18 | learning rate: 5.468E-05 | global batch size: 256 | lm loss: 3.720034E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.312 | TFLOPs: 26.97 | +7: iteration 15390/ 21553 | consumed samples: 3939840 | consumed tokens: 8068792320 | elapsed time per iteration (s): 0.18 | learning rate: 5.458E-05 | global batch size: 256 | lm loss: 3.716669E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.756 | TFLOPs: 26.97 | +7: iteration 15400/ 21553 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 0.18 | learning rate: 5.447E-05 | global batch size: 256 | lm loss: 3.720178E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.275 | TFLOPs: 26.96 | +7: iteration 15410/ 21553 | consumed samples: 3944960 | consumed tokens: 8079278080 | elapsed time per iteration (s): 0.18 | learning rate: 5.437E-05 | global batch size: 256 | lm loss: 3.716845E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.454 | TFLOPs: 26.99 | +7: iteration 15420/ 21553 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 0.18 | learning rate: 5.427E-05 | global batch size: 256 | lm loss: 3.718208E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.598 | TFLOPs: 26.97 | +7: iteration 15430/ 21553 | consumed samples: 3950080 | consumed tokens: 8089763840 | elapsed time per iteration (s): 0.18 | learning rate: 5.416E-05 | global batch size: 256 | lm loss: 3.714956E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.469 | TFLOPs: 26.97 | +7: iteration 15440/ 21553 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 0.18 | learning rate: 5.406E-05 | global batch size: 256 | lm loss: 3.716808E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.022 | TFLOPs: 26.96 | +7: iteration 15450/ 21553 | consumed samples: 3955200 | consumed tokens: 8100249600 | elapsed time per iteration (s): 0.18 | learning rate: 5.395E-05 | global batch size: 256 | lm loss: 3.714552E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.798 | TFLOPs: 26.97 | +7: iteration 15460/ 21553 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 0.18 | learning rate: 5.385E-05 | global batch size: 256 | lm loss: 3.712436E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.283 | TFLOPs: 26.96 | +7: iteration 15470/ 21553 | consumed samples: 3960320 | consumed tokens: 8110735360 | elapsed time per iteration (s): 0.18 | learning rate: 5.375E-05 | global batch size: 256 | lm loss: 3.720816E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.412 | TFLOPs: 26.99 | +7: iteration 15480/ 21553 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 0.18 | learning rate: 5.364E-05 | global batch size: 256 | lm loss: 3.712512E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.795 | TFLOPs: 26.97 | +7: iteration 15490/ 21553 | consumed samples: 3965440 | consumed tokens: 8121221120 | elapsed time per iteration (s): 0.18 | learning rate: 5.354E-05 | global batch size: 256 | lm loss: 3.714303E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.371 | TFLOPs: 26.99 | +7: iteration 15500/ 21553 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 0.18 | learning rate: 5.344E-05 | global batch size: 256 | lm loss: 3.719369E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.165 | TFLOPs: 26.98 | +7: iteration 15510/ 21553 | consumed samples: 3970560 | consumed tokens: 8131706880 | elapsed time per iteration (s): 0.18 | learning rate: 5.333E-05 | global batch size: 256 | lm loss: 3.719438E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.203 | TFLOPs: 26.66 | +7: iteration 15520/ 21553 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 0.19 | learning rate: 5.323E-05 | global batch size: 256 | lm loss: 3.720457E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.333 | TFLOPs: 25.84 | +7: iteration 15530/ 21553 | consumed samples: 3975680 | consumed tokens: 8142192640 | elapsed time per iteration (s): 0.19 | learning rate: 5.313E-05 | global batch size: 256 | lm loss: 3.713943E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1358.938 | TFLOPs: 25.96 | +7: iteration 15540/ 21553 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 0.18 | learning rate: 5.303E-05 | global batch size: 256 | lm loss: 3.711141E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.756 | TFLOPs: 26.99 | +7: iteration 15550/ 21553 | consumed samples: 3980800 | consumed tokens: 8152678400 | elapsed time per iteration (s): 0.18 | learning rate: 5.292E-05 | global batch size: 256 | lm loss: 3.720507E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.423 | TFLOPs: 26.97 | +7: iteration 15560/ 21553 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 0.18 | learning rate: 5.282E-05 | global batch size: 256 | lm loss: 3.715538E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.322 | TFLOPs: 27.00 | +7: iteration 15570/ 21553 | consumed samples: 3985920 | consumed tokens: 8163164160 | elapsed time per iteration (s): 0.18 | learning rate: 5.272E-05 | global batch size: 256 | lm loss: 3.711504E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.968 | TFLOPs: 27.00 | +7: iteration 15580/ 21553 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 0.18 | learning rate: 5.262E-05 | global batch size: 256 | lm loss: 3.710542E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.555 | TFLOPs: 26.97 | +7: iteration 15590/ 21553 | consumed samples: 3991040 | consumed tokens: 8173649920 | elapsed time per iteration (s): 0.18 | learning rate: 5.251E-05 | global batch size: 256 | lm loss: 3.711276E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.425 | TFLOPs: 26.99 | +7: iteration 15600/ 21553 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 0.18 | learning rate: 5.241E-05 | global batch size: 256 | lm loss: 3.723642E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.007 | TFLOPs: 27.00 | +7: iteration 15610/ 21553 | consumed samples: 3996160 | consumed tokens: 8184135680 | elapsed time per iteration (s): 0.18 | learning rate: 5.231E-05 | global batch size: 256 | lm loss: 3.713774E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.018 | TFLOPs: 26.98 | +7: iteration 15620/ 21553 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 0.18 | learning rate: 5.221E-05 | global batch size: 256 | lm loss: 3.710072E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.362 | TFLOPs: 26.99 | +7: iteration 15630/ 21553 | consumed samples: 4001280 | consumed tokens: 8194621440 | elapsed time per iteration (s): 0.18 | learning rate: 5.211E-05 | global batch size: 256 | lm loss: 3.708371E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.470 | TFLOPs: 26.99 | +7: iteration 15640/ 21553 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 0.18 | learning rate: 5.201E-05 | global batch size: 256 | lm loss: 3.719919E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.375 | TFLOPs: 26.99 | +7: iteration 15650/ 21553 | consumed samples: 4006400 | consumed tokens: 8205107200 | elapsed time per iteration (s): 0.18 | learning rate: 5.191E-05 | global batch size: 256 | lm loss: 3.720935E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.505 | TFLOPs: 26.82 | +7: iteration 15660/ 21553 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 0.18 | learning rate: 5.180E-05 | global batch size: 256 | lm loss: 3.714740E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.579 | TFLOPs: 27.01 | +7: iteration 15670/ 21553 | consumed samples: 4011520 | consumed tokens: 8215592960 | elapsed time per iteration (s): 0.18 | learning rate: 5.170E-05 | global batch size: 256 | lm loss: 3.717144E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.744 | TFLOPs: 26.99 | +7: iteration 15680/ 21553 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 0.18 | learning rate: 5.160E-05 | global batch size: 256 | lm loss: 3.707979E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.412 | TFLOPs: 26.99 | +7: iteration 15690/ 21553 | consumed samples: 4016640 | consumed tokens: 8226078720 | elapsed time per iteration (s): 0.18 | learning rate: 5.150E-05 | global batch size: 256 | lm loss: 3.715917E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.707 | TFLOPs: 27.03 | +7: iteration 15700/ 21553 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 0.18 | learning rate: 5.140E-05 | global batch size: 256 | lm loss: 3.714518E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.320 | TFLOPs: 27.00 | +7: iteration 15710/ 21553 | consumed samples: 4021760 | consumed tokens: 8236564480 | elapsed time per iteration (s): 0.18 | learning rate: 5.130E-05 | global batch size: 256 | lm loss: 3.710095E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.506 | TFLOPs: 27.01 | +7: iteration 15720/ 21553 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 0.18 | learning rate: 5.120E-05 | global batch size: 256 | lm loss: 3.713337E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.199 | TFLOPs: 27.02 | +7: iteration 15730/ 21553 | consumed samples: 4026880 | consumed tokens: 8247050240 | elapsed time per iteration (s): 0.18 | learning rate: 5.110E-05 | global batch size: 256 | lm loss: 3.706388E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.415 | TFLOPs: 27.08 | +7: iteration 15740/ 21553 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 0.18 | learning rate: 5.100E-05 | global batch size: 256 | lm loss: 3.714480E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.756 | TFLOPs: 27.07 | +7: iteration 15750/ 21553 | consumed samples: 4032000 | consumed tokens: 8257536000 | elapsed time per iteration (s): 0.18 | learning rate: 5.090E-05 | global batch size: 256 | lm loss: 3.708474E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.366 | TFLOPs: 27.10 | +7: iteration 15760/ 21553 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 0.18 | learning rate: 5.080E-05 | global batch size: 256 | lm loss: 3.721663E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.084 | TFLOPs: 27.08 | +7: iteration 15770/ 21553 | consumed samples: 4037120 | consumed tokens: 8268021760 | elapsed time per iteration (s): 0.18 | learning rate: 5.070E-05 | global batch size: 256 | lm loss: 3.716021E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.263 | TFLOPs: 27.08 | +7: iteration 15780/ 21553 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 0.18 | learning rate: 5.060E-05 | global batch size: 256 | lm loss: 3.714001E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.847 | TFLOPs: 27.07 | +7: iteration 15790/ 21553 | consumed samples: 4042240 | consumed tokens: 8278507520 | elapsed time per iteration (s): 0.18 | learning rate: 5.050E-05 | global batch size: 256 | lm loss: 3.713826E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.976 | TFLOPs: 27.07 | +7: iteration 15800/ 21553 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 0.18 | learning rate: 5.040E-05 | global batch size: 256 | lm loss: 3.714162E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.808 | TFLOPs: 27.09 | +7: iteration 15810/ 21553 | consumed samples: 4047360 | consumed tokens: 8288993280 | elapsed time per iteration (s): 0.18 | learning rate: 5.030E-05 | global batch size: 256 | lm loss: 3.713261E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.175 | TFLOPs: 27.08 | +7: iteration 15820/ 21553 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 0.18 | learning rate: 5.020E-05 | global batch size: 256 | lm loss: 3.706615E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.874 | TFLOPs: 27.05 | +7: iteration 15830/ 21553 | consumed samples: 4052480 | consumed tokens: 8299479040 | elapsed time per iteration (s): 0.18 | learning rate: 5.010E-05 | global batch size: 256 | lm loss: 3.706107E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.261 | TFLOPs: 26.96 | +7: iteration 15840/ 21553 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 0.18 | learning rate: 5.001E-05 | global batch size: 256 | lm loss: 3.710398E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.353 | TFLOPs: 26.98 | +7: iteration 15850/ 21553 | consumed samples: 4057600 | consumed tokens: 8309964800 | elapsed time per iteration (s): 0.18 | learning rate: 4.991E-05 | global batch size: 256 | lm loss: 3.715504E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.699 | TFLOPs: 26.97 | +7: iteration 15860/ 21553 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 0.18 | learning rate: 4.981E-05 | global batch size: 256 | lm loss: 3.714946E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.887 | TFLOPs: 26.96 | +7: iteration 15870/ 21553 | consumed samples: 4062720 | consumed tokens: 8320450560 | elapsed time per iteration (s): 0.18 | learning rate: 4.971E-05 | global batch size: 256 | lm loss: 3.704436E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.467 | TFLOPs: 26.99 | +7: iteration 15880/ 21553 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 0.18 | learning rate: 4.961E-05 | global batch size: 256 | lm loss: 3.710614E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.422 | TFLOPs: 26.99 | +7: iteration 15890/ 21553 | consumed samples: 4067840 | consumed tokens: 8330936320 | elapsed time per iteration (s): 0.18 | learning rate: 4.951E-05 | global batch size: 256 | lm loss: 3.709940E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.377 | TFLOPs: 26.99 | +7: iteration 15900/ 21553 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 0.18 | learning rate: 4.942E-05 | global batch size: 256 | lm loss: 3.711693E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.694 | TFLOPs: 26.99 | +7: iteration 15910/ 21553 | consumed samples: 4072960 | consumed tokens: 8341422080 | elapsed time per iteration (s): 0.18 | learning rate: 4.932E-05 | global batch size: 256 | lm loss: 3.713289E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.886 | TFLOPs: 27.00 | +7: iteration 15920/ 21553 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 0.18 | learning rate: 4.922E-05 | global batch size: 256 | lm loss: 3.710630E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.327 | TFLOPs: 26.98 | +7: iteration 15930/ 21553 | consumed samples: 4078080 | consumed tokens: 8351907840 | elapsed time per iteration (s): 0.18 | learning rate: 4.912E-05 | global batch size: 256 | lm loss: 3.704134E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.428 | TFLOPs: 26.99 | +7: iteration 15940/ 21553 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 0.18 | learning rate: 4.902E-05 | global batch size: 256 | lm loss: 3.711809E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.677 | TFLOPs: 26.95 | +7: iteration 15950/ 21553 | consumed samples: 4083200 | consumed tokens: 8362393600 | elapsed time per iteration (s): 0.18 | learning rate: 4.893E-05 | global batch size: 256 | lm loss: 3.710375E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.595 | TFLOPs: 26.97 | +7: iteration 15960/ 21553 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 0.18 | learning rate: 4.883E-05 | global batch size: 256 | lm loss: 3.703560E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.055 | TFLOPs: 26.98 | +7: iteration 15970/ 21553 | consumed samples: 4088320 | consumed tokens: 8372879360 | elapsed time per iteration (s): 0.20 | learning rate: 4.873E-05 | global batch size: 256 | lm loss: 3.716567E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1268.551 | TFLOPs: 24.24 | +7: iteration 15980/ 21553 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 0.18 | learning rate: 4.864E-05 | global batch size: 256 | lm loss: 3.711503E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.386 | TFLOPs: 26.99 | +7: iteration 15990/ 21553 | consumed samples: 4093440 | consumed tokens: 8383365120 | elapsed time per iteration (s): 0.18 | learning rate: 4.854E-05 | global batch size: 256 | lm loss: 3.713552E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.536 | TFLOPs: 26.97 | +0: [2023-03-17 00:24:12,435] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[4.8441849544340955e-05, 4.8441849544340955e-05, 4.8441849544340955e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 16000/ 21553 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 0.18 | learning rate: 4.844E-05 | global batch size: 256 | lm loss: 3.708321E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.994 | TFLOPs: 26.98 | +0: steps: 16000 loss: 3.6967 iter time (s): 0.180 samples/sec: 1423.780 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 16000 | lm loss value: 3.919941E+00 | lm loss PPL: 5.039746E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 16000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:24:12,525] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is begin to save! +0: [2023-03-17 00:24:12,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:24:12,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:24:12,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:24:12,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:24:12,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:24:12,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:24:12,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:24:12,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:24:12,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:24:12,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:24:12,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:24:12,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:24:12,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:24:12,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:24:12,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:24:12,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:24:12,680] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:24:12,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:24:12,691] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:24:12,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:24:12,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:24:12,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:24:12,713] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:24:12,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:24:12,715] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step16000/mp_rank_00_model_states.pt +0: [2023-03-17 00:24:12,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:24:12,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:24:12,734] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:24:12,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:24:12,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:24:12,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:24:12,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: successfully saved checkpoint at iteration 16000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 241.58 +7: iteration 16010/ 21553 | consumed samples: 4098560 | consumed tokens: 8393850880 | elapsed time per iteration (s): 0.22 | learning rate: 4.835E-05 | global batch size: 256 | lm loss: 3.713260E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1179.285 | TFLOPs: 22.53 | +7: iteration 16020/ 21553 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 0.18 | learning rate: 4.825E-05 | global batch size: 256 | lm loss: 3.707091E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.045 | TFLOPs: 26.98 | +7: iteration 16030/ 21553 | consumed samples: 4103680 | consumed tokens: 8404336640 | elapsed time per iteration (s): 0.18 | learning rate: 4.815E-05 | global batch size: 256 | lm loss: 3.709287E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.754 | TFLOPs: 26.97 | +7: iteration 16040/ 21553 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 0.18 | learning rate: 4.806E-05 | global batch size: 256 | lm loss: 3.716616E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.228 | TFLOPs: 26.98 | +7: iteration 16050/ 21553 | consumed samples: 4108800 | consumed tokens: 8414822400 | elapsed time per iteration (s): 0.18 | learning rate: 4.796E-05 | global batch size: 256 | lm loss: 3.713591E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.153 | TFLOPs: 26.83 | +7: iteration 16060/ 21553 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 0.18 | learning rate: 4.786E-05 | global batch size: 256 | lm loss: 3.713340E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.228 | TFLOPs: 26.98 | +7: iteration 16070/ 21553 | consumed samples: 4113920 | consumed tokens: 8425308160 | elapsed time per iteration (s): 0.18 | learning rate: 4.777E-05 | global batch size: 256 | lm loss: 3.705528E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.682 | TFLOPs: 26.97 | +7: iteration 16080/ 21553 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 0.18 | learning rate: 4.767E-05 | global batch size: 256 | lm loss: 3.708070E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.882 | TFLOPs: 26.98 | +7: iteration 16090/ 21553 | consumed samples: 4119040 | consumed tokens: 8435793920 | elapsed time per iteration (s): 0.18 | learning rate: 4.758E-05 | global batch size: 256 | lm loss: 3.710217E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.643 | TFLOPs: 26.97 | +7: iteration 16100/ 21553 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 0.18 | learning rate: 4.748E-05 | global batch size: 256 | lm loss: 3.713990E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.004 | TFLOPs: 26.96 | +7: iteration 16110/ 21553 | consumed samples: 4124160 | consumed tokens: 8446279680 | elapsed time per iteration (s): 0.18 | learning rate: 4.739E-05 | global batch size: 256 | lm loss: 3.708075E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.509 | TFLOPs: 27.01 | +7: iteration 16120/ 21553 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 0.18 | learning rate: 4.729E-05 | global batch size: 256 | lm loss: 3.712469E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.517 | TFLOPs: 26.99 | +7: iteration 16130/ 21553 | consumed samples: 4129280 | consumed tokens: 8456765440 | elapsed time per iteration (s): 0.18 | learning rate: 4.720E-05 | global batch size: 256 | lm loss: 3.707064E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.449 | TFLOPs: 27.01 | +7: iteration 16140/ 21553 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 0.18 | learning rate: 4.710E-05 | global batch size: 256 | lm loss: 3.710961E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.448 | TFLOPs: 27.01 | +7: iteration 16150/ 21553 | consumed samples: 4134400 | consumed tokens: 8467251200 | elapsed time per iteration (s): 0.18 | learning rate: 4.701E-05 | global batch size: 256 | lm loss: 3.706761E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.441 | TFLOPs: 26.99 | +7: iteration 16160/ 21553 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 0.18 | learning rate: 4.691E-05 | global batch size: 256 | lm loss: 3.713922E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.016 | TFLOPs: 27.00 | +7: iteration 16170/ 21553 | consumed samples: 4139520 | consumed tokens: 8477736960 | elapsed time per iteration (s): 0.18 | learning rate: 4.682E-05 | global batch size: 256 | lm loss: 3.709235E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1392.622 | TFLOPs: 26.61 | +7: iteration 16180/ 21553 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 0.18 | learning rate: 4.672E-05 | global batch size: 256 | lm loss: 3.704702E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.034 | TFLOPs: 27.02 | +7: iteration 16190/ 21553 | consumed samples: 4144640 | consumed tokens: 8488222720 | elapsed time per iteration (s): 0.18 | learning rate: 4.663E-05 | global batch size: 256 | lm loss: 3.712204E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.487 | TFLOPs: 27.01 | +7: iteration 16200/ 21553 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 0.18 | learning rate: 4.654E-05 | global batch size: 256 | lm loss: 3.708237E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.567 | TFLOPs: 27.03 | +7: iteration 16210/ 21553 | consumed samples: 4149760 | consumed tokens: 8498708480 | elapsed time per iteration (s): 0.18 | learning rate: 4.644E-05 | global batch size: 256 | lm loss: 3.705746E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.533 | TFLOPs: 27.03 | +7: iteration 16220/ 21553 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 0.18 | learning rate: 4.635E-05 | global batch size: 256 | lm loss: 3.713820E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.469 | TFLOPs: 27.03 | +7: iteration 16230/ 21553 | consumed samples: 4154880 | consumed tokens: 8509194240 | elapsed time per iteration (s): 0.18 | learning rate: 4.625E-05 | global batch size: 256 | lm loss: 3.705436E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.960 | TFLOPs: 26.98 | +7: iteration 16240/ 21553 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 0.18 | learning rate: 4.616E-05 | global batch size: 256 | lm loss: 3.707448E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.232 | TFLOPs: 26.96 | +7: iteration 16250/ 21553 | consumed samples: 4160000 | consumed tokens: 8519680000 | elapsed time per iteration (s): 0.18 | learning rate: 4.607E-05 | global batch size: 256 | lm loss: 3.710663E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.452 | TFLOPs: 26.93 | +7: iteration 16260/ 21553 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 0.18 | learning rate: 4.597E-05 | global batch size: 256 | lm loss: 3.711694E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.205 | TFLOPs: 26.92 | +7: iteration 16270/ 21553 | consumed samples: 4165120 | consumed tokens: 8530165760 | elapsed time per iteration (s): 0.18 | learning rate: 4.588E-05 | global batch size: 256 | lm loss: 3.710476E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.276 | TFLOPs: 26.93 | +7: iteration 16280/ 21553 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 0.18 | learning rate: 4.579E-05 | global batch size: 256 | lm loss: 3.702649E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.001 | TFLOPs: 26.94 | +7: iteration 16290/ 21553 | consumed samples: 4170240 | consumed tokens: 8540651520 | elapsed time per iteration (s): 0.18 | learning rate: 4.570E-05 | global batch size: 256 | lm loss: 3.711060E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.878 | TFLOPs: 26.96 | +7: iteration 16300/ 21553 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 0.18 | learning rate: 4.560E-05 | global batch size: 256 | lm loss: 3.712331E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.272 | TFLOPs: 27.00 | +7: iteration 16310/ 21553 | consumed samples: 4175360 | consumed tokens: 8551137280 | elapsed time per iteration (s): 0.18 | learning rate: 4.551E-05 | global batch size: 256 | lm loss: 3.703906E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.068 | TFLOPs: 27.04 | +7: iteration 16320/ 21553 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 0.18 | learning rate: 4.542E-05 | global batch size: 256 | lm loss: 3.705640E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.334 | TFLOPs: 27.04 | +7: iteration 16330/ 21553 | consumed samples: 4180480 | consumed tokens: 8561623040 | elapsed time per iteration (s): 0.18 | learning rate: 4.533E-05 | global batch size: 256 | lm loss: 3.705096E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.615 | TFLOPs: 27.03 | +7: iteration 16340/ 21553 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 0.18 | learning rate: 4.523E-05 | global batch size: 256 | lm loss: 3.700635E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.688 | TFLOPs: 27.03 | +7: iteration 16350/ 21553 | consumed samples: 4185600 | consumed tokens: 8572108800 | elapsed time per iteration (s): 0.18 | learning rate: 4.514E-05 | global batch size: 256 | lm loss: 3.702954E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.126 | TFLOPs: 27.04 | +7: iteration 16360/ 21553 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 0.18 | learning rate: 4.505E-05 | global batch size: 256 | lm loss: 3.707514E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.847 | TFLOPs: 27.01 | +7: iteration 16370/ 21553 | consumed samples: 4190720 | consumed tokens: 8582594560 | elapsed time per iteration (s): 0.18 | learning rate: 4.496E-05 | global batch size: 256 | lm loss: 3.708944E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.929 | TFLOPs: 26.94 | +7: iteration 16380/ 21553 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 0.18 | learning rate: 4.487E-05 | global batch size: 256 | lm loss: 3.702195E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.988 | TFLOPs: 26.96 | +7: iteration 16390/ 21553 | consumed samples: 4195840 | consumed tokens: 8593080320 | elapsed time per iteration (s): 0.18 | learning rate: 4.478E-05 | global batch size: 256 | lm loss: 3.706273E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.456 | TFLOPs: 26.95 | +7: iteration 16400/ 21553 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 0.18 | learning rate: 4.468E-05 | global batch size: 256 | lm loss: 3.700109E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.376 | TFLOPs: 26.95 | +7: iteration 16410/ 21553 | consumed samples: 4200960 | consumed tokens: 8603566080 | elapsed time per iteration (s): 0.18 | learning rate: 4.459E-05 | global batch size: 256 | lm loss: 3.703613E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.435 | TFLOPs: 26.97 | +7: iteration 16420/ 21553 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 0.18 | learning rate: 4.450E-05 | global batch size: 256 | lm loss: 3.710178E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.821 | TFLOPs: 26.94 | +7: iteration 16430/ 21553 | consumed samples: 4206080 | consumed tokens: 8614051840 | elapsed time per iteration (s): 0.18 | learning rate: 4.441E-05 | global batch size: 256 | lm loss: 3.710854E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.203 | TFLOPs: 26.96 | +7: iteration 16440/ 21553 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 0.18 | learning rate: 4.432E-05 | global batch size: 256 | lm loss: 3.707204E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.177 | TFLOPs: 26.94 | +7: iteration 16450/ 21553 | consumed samples: 4211200 | consumed tokens: 8624537600 | elapsed time per iteration (s): 0.18 | learning rate: 4.423E-05 | global batch size: 256 | lm loss: 3.706808E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.610 | TFLOPs: 26.95 | +7: iteration 16460/ 21553 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 0.18 | learning rate: 4.414E-05 | global batch size: 256 | lm loss: 3.708212E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.933 | TFLOPs: 26.96 | +7: iteration 16470/ 21553 | consumed samples: 4216320 | consumed tokens: 8635023360 | elapsed time per iteration (s): 0.18 | learning rate: 4.405E-05 | global batch size: 256 | lm loss: 3.707282E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.743 | TFLOPs: 26.95 | +7: iteration 16480/ 21553 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 0.18 | learning rate: 4.396E-05 | global batch size: 256 | lm loss: 3.699701E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.482 | TFLOPs: 26.95 | +7: iteration 16490/ 21553 | consumed samples: 4221440 | consumed tokens: 8645509120 | elapsed time per iteration (s): 0.18 | learning rate: 4.387E-05 | global batch size: 256 | lm loss: 3.706764E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.506 | TFLOPs: 26.95 | +7: iteration 16500/ 21553 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 0.18 | learning rate: 4.378E-05 | global batch size: 256 | lm loss: 3.709620E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.967 | TFLOPs: 26.94 | +7: iteration 16510/ 21553 | consumed samples: 4226560 | consumed tokens: 8655994880 | elapsed time per iteration (s): 0.18 | learning rate: 4.369E-05 | global batch size: 256 | lm loss: 3.709808E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.069 | TFLOPs: 26.92 | +7: iteration 16520/ 21553 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 0.18 | learning rate: 4.360E-05 | global batch size: 256 | lm loss: 3.701224E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.351 | TFLOPs: 26.95 | +7: iteration 16530/ 21553 | consumed samples: 4231680 | consumed tokens: 8666480640 | elapsed time per iteration (s): 0.18 | learning rate: 4.351E-05 | global batch size: 256 | lm loss: 3.705726E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.967 | TFLOPs: 26.94 | +7: iteration 16540/ 21553 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 0.18 | learning rate: 4.342E-05 | global batch size: 256 | lm loss: 3.711804E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.090 | TFLOPs: 26.90 | +7: iteration 16550/ 21553 | consumed samples: 4236800 | consumed tokens: 8676966400 | elapsed time per iteration (s): 0.18 | learning rate: 4.333E-05 | global batch size: 256 | lm loss: 3.708512E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.216 | TFLOPs: 26.91 | +7: iteration 16560/ 21553 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 0.18 | learning rate: 4.324E-05 | global batch size: 256 | lm loss: 3.704005E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.743 | TFLOPs: 26.94 | +7: iteration 16570/ 21553 | consumed samples: 4241920 | consumed tokens: 8687452160 | elapsed time per iteration (s): 0.18 | learning rate: 4.315E-05 | global batch size: 256 | lm loss: 3.698209E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.858 | TFLOPs: 26.92 | +7: iteration 16580/ 21553 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 0.18 | learning rate: 4.307E-05 | global batch size: 256 | lm loss: 3.705379E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.145 | TFLOPs: 26.90 | +7: iteration 16590/ 21553 | consumed samples: 4247040 | consumed tokens: 8697937920 | elapsed time per iteration (s): 0.18 | learning rate: 4.298E-05 | global batch size: 256 | lm loss: 3.709362E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.947 | TFLOPs: 26.88 | +7: iteration 16600/ 21553 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 0.18 | learning rate: 4.289E-05 | global batch size: 256 | lm loss: 3.700195E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.932 | TFLOPs: 26.90 | +7: iteration 16610/ 21553 | consumed samples: 4252160 | consumed tokens: 8708423680 | elapsed time per iteration (s): 0.18 | learning rate: 4.280E-05 | global batch size: 256 | lm loss: 3.702035E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.023 | TFLOPs: 26.92 | +7: iteration 16620/ 21553 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 0.18 | learning rate: 4.271E-05 | global batch size: 256 | lm loss: 3.707517E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.591 | TFLOPs: 26.91 | +7: iteration 16630/ 21553 | consumed samples: 4257280 | consumed tokens: 8718909440 | elapsed time per iteration (s): 0.18 | learning rate: 4.263E-05 | global batch size: 256 | lm loss: 3.706277E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.448 | TFLOPs: 26.89 | +7: iteration 16640/ 21553 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 0.18 | learning rate: 4.254E-05 | global batch size: 256 | lm loss: 3.705975E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.934 | TFLOPs: 26.96 | +7: iteration 16650/ 21553 | consumed samples: 4262400 | consumed tokens: 8729395200 | elapsed time per iteration (s): 0.18 | learning rate: 4.245E-05 | global batch size: 256 | lm loss: 3.704818E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.417 | TFLOPs: 26.95 | +7: iteration 16660/ 21553 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 0.18 | learning rate: 4.236E-05 | global batch size: 256 | lm loss: 3.706199E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.141 | TFLOPs: 26.94 | +7: iteration 16670/ 21553 | consumed samples: 4267520 | consumed tokens: 8739880960 | elapsed time per iteration (s): 0.18 | learning rate: 4.227E-05 | global batch size: 256 | lm loss: 3.702392E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.387 | TFLOPs: 26.93 | +7: iteration 16680/ 21553 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 0.18 | learning rate: 4.219E-05 | global batch size: 256 | lm loss: 3.703952E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.665 | TFLOPs: 26.91 | +7: iteration 16690/ 21553 | consumed samples: 4272640 | consumed tokens: 8750366720 | elapsed time per iteration (s): 0.18 | learning rate: 4.210E-05 | global batch size: 256 | lm loss: 3.703317E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.280 | TFLOPs: 26.91 | +7: iteration 16700/ 21553 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 0.18 | learning rate: 4.201E-05 | global batch size: 256 | lm loss: 3.711277E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.340 | TFLOPs: 26.91 | +7: iteration 16710/ 21553 | consumed samples: 4277760 | consumed tokens: 8760852480 | elapsed time per iteration (s): 0.18 | learning rate: 4.193E-05 | global batch size: 256 | lm loss: 3.705295E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.624 | TFLOPs: 26.93 | +7: iteration 16720/ 21553 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 0.18 | learning rate: 4.184E-05 | global batch size: 256 | lm loss: 3.704867E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.264 | TFLOPs: 26.93 | +7: iteration 16730/ 21553 | consumed samples: 4282880 | consumed tokens: 8771338240 | elapsed time per iteration (s): 0.18 | learning rate: 4.175E-05 | global batch size: 256 | lm loss: 3.703209E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.939 | TFLOPs: 26.92 | +7: iteration 16740/ 21553 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 0.18 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 3.699810E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.644 | TFLOPs: 26.93 | +7: iteration 16750/ 21553 | consumed samples: 4288000 | consumed tokens: 8781824000 | elapsed time per iteration (s): 0.18 | learning rate: 4.158E-05 | global batch size: 256 | lm loss: 3.697924E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.728 | TFLOPs: 26.92 | +7: iteration 16760/ 21553 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 0.18 | learning rate: 4.150E-05 | global batch size: 256 | lm loss: 3.710148E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.543 | TFLOPs: 26.93 | +7: iteration 16770/ 21553 | consumed samples: 4293120 | consumed tokens: 8792309760 | elapsed time per iteration (s): 0.18 | learning rate: 4.141E-05 | global batch size: 256 | lm loss: 3.699907E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.582 | TFLOPs: 26.91 | +7: iteration 16780/ 21553 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 0.18 | learning rate: 4.132E-05 | global batch size: 256 | lm loss: 3.701302E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.768 | TFLOPs: 26.92 | +7: iteration 16790/ 21553 | consumed samples: 4298240 | consumed tokens: 8802795520 | elapsed time per iteration (s): 0.18 | learning rate: 4.124E-05 | global batch size: 256 | lm loss: 3.701003E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.912 | TFLOPs: 26.92 | +7: iteration 16800/ 21553 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 0.18 | learning rate: 4.115E-05 | global batch size: 256 | lm loss: 3.709395E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.378 | TFLOPs: 26.91 | +7: iteration 16810/ 21553 | consumed samples: 4303360 | consumed tokens: 8813281280 | elapsed time per iteration (s): 0.18 | learning rate: 4.107E-05 | global batch size: 256 | lm loss: 3.700768E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.442 | TFLOPs: 26.91 | +7: iteration 16820/ 21553 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 0.18 | learning rate: 4.098E-05 | global batch size: 256 | lm loss: 3.710843E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.615 | TFLOPs: 26.93 | +7: iteration 16830/ 21553 | consumed samples: 4308480 | consumed tokens: 8823767040 | elapsed time per iteration (s): 0.18 | learning rate: 4.090E-05 | global batch size: 256 | lm loss: 3.702784E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.528 | TFLOPs: 26.93 | +7: iteration 16840/ 21553 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 0.18 | learning rate: 4.081E-05 | global batch size: 256 | lm loss: 3.694381E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.590 | TFLOPs: 26.95 | +7: iteration 16850/ 21553 | consumed samples: 4313600 | consumed tokens: 8834252800 | elapsed time per iteration (s): 0.18 | learning rate: 4.073E-05 | global batch size: 256 | lm loss: 3.704846E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.733 | TFLOPs: 26.95 | +7: iteration 16860/ 21553 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 0.18 | learning rate: 4.064E-05 | global batch size: 256 | lm loss: 3.708408E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.637 | TFLOPs: 26.95 | +7: iteration 16870/ 21553 | consumed samples: 4318720 | consumed tokens: 8844738560 | elapsed time per iteration (s): 0.18 | learning rate: 4.056E-05 | global batch size: 256 | lm loss: 3.704475E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.583 | TFLOPs: 26.93 | +7: iteration 16880/ 21553 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 0.18 | learning rate: 4.047E-05 | global batch size: 256 | lm loss: 3.703954E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.290 | TFLOPs: 26.93 | +7: iteration 16890/ 21553 | consumed samples: 4323840 | consumed tokens: 8855224320 | elapsed time per iteration (s): 0.18 | learning rate: 4.039E-05 | global batch size: 256 | lm loss: 3.697556E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.408 | TFLOPs: 26.93 | +7: iteration 16900/ 21553 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 0.18 | learning rate: 4.031E-05 | global batch size: 256 | lm loss: 3.695028E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.879 | TFLOPs: 26.94 | +7: iteration 16910/ 21553 | consumed samples: 4328960 | consumed tokens: 8865710080 | elapsed time per iteration (s): 0.18 | learning rate: 4.022E-05 | global batch size: 256 | lm loss: 3.705835E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.635 | TFLOPs: 26.89 | +7: iteration 16920/ 21553 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 0.18 | learning rate: 4.014E-05 | global batch size: 256 | lm loss: 3.699692E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.717 | TFLOPs: 26.92 | +7: iteration 16930/ 21553 | consumed samples: 4334080 | consumed tokens: 8876195840 | elapsed time per iteration (s): 0.18 | learning rate: 4.006E-05 | global batch size: 256 | lm loss: 3.699397E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.928 | TFLOPs: 26.90 | +7: iteration 16940/ 21553 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 0.18 | learning rate: 3.997E-05 | global batch size: 256 | lm loss: 3.699474E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.055 | TFLOPs: 26.92 | +7: iteration 16950/ 21553 | consumed samples: 4339200 | consumed tokens: 8886681600 | elapsed time per iteration (s): 0.18 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 3.692107E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.533 | TFLOPs: 26.85 | +7: iteration 16960/ 21553 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 0.18 | learning rate: 3.981E-05 | global batch size: 256 | lm loss: 3.701950E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.517 | TFLOPs: 26.89 | +7: iteration 16970/ 21553 | consumed samples: 4344320 | consumed tokens: 8897167360 | elapsed time per iteration (s): 0.18 | learning rate: 3.972E-05 | global batch size: 256 | lm loss: 3.692556E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.099 | TFLOPs: 26.87 | +7: iteration 16980/ 21553 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 0.18 | learning rate: 3.964E-05 | global batch size: 256 | lm loss: 3.698426E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.722 | TFLOPs: 26.90 | +7: iteration 16990/ 21553 | consumed samples: 4349440 | consumed tokens: 8907653120 | elapsed time per iteration (s): 0.18 | learning rate: 3.956E-05 | global batch size: 256 | lm loss: 3.700262E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.566 | TFLOPs: 26.91 | +7: iteration 17000/ 21553 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 0.18 | learning rate: 3.948E-05 | global batch size: 256 | lm loss: 3.703474E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.878 | TFLOPs: 26.90 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 17000 | lm loss value: 3.855834E+00 | lm loss PPL: 4.726801E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 17000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:27:14,404] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step17000 is begin to save! +0: [2023-03-17 00:27:14,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:27:14,481] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:27:14,481] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:27:14,492] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:27:14,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:27:14,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:27:14,504] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:27:14,515] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:27:14,515] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:27:14,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:27:14,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:27:14,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:27:14,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:27:14,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:27:14,548] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:27:14,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:27:14,560] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:27:14,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:27:14,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:27:14,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:27:14,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:27:14,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:27:14,593] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:27:14,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:27:14,595] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step17000/mp_rank_00_model_states.pt +0: [2023-03-17 00:27:14,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:27:14,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:27:14,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:27:14,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:14,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:27:14,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-17 00:27:14,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +3: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:27:14,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:27:14,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: successfully saved checkpoint at iteration 17000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 243.13 +7: iteration 17010/ 21553 | consumed samples: 4354560 | consumed tokens: 8918138880 | elapsed time per iteration (s): 0.21 | learning rate: 3.939E-05 | global batch size: 256 | lm loss: 3.697347E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1207.030 | TFLOPs: 23.06 | +7: iteration 17020/ 21553 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 0.18 | learning rate: 3.931E-05 | global batch size: 256 | lm loss: 3.699414E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.577 | TFLOPs: 26.86 | +7: iteration 17030/ 21553 | consumed samples: 4359680 | consumed tokens: 8928624640 | elapsed time per iteration (s): 0.18 | learning rate: 3.923E-05 | global batch size: 256 | lm loss: 3.701545E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.022 | TFLOPs: 26.88 | +7: iteration 17040/ 21553 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 0.18 | learning rate: 3.915E-05 | global batch size: 256 | lm loss: 3.704559E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.934 | TFLOPs: 26.92 | +7: iteration 17050/ 21553 | consumed samples: 4364800 | consumed tokens: 8939110400 | elapsed time per iteration (s): 0.18 | learning rate: 3.907E-05 | global batch size: 256 | lm loss: 3.709502E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.342 | TFLOPs: 26.93 | +7: iteration 17060/ 21553 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 0.18 | learning rate: 3.898E-05 | global batch size: 256 | lm loss: 3.696725E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.010 | TFLOPs: 26.92 | +7: iteration 17070/ 21553 | consumed samples: 4369920 | consumed tokens: 8949596160 | elapsed time per iteration (s): 0.18 | learning rate: 3.890E-05 | global batch size: 256 | lm loss: 3.698256E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.941 | TFLOPs: 26.90 | +7: iteration 17080/ 21553 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 0.18 | learning rate: 3.882E-05 | global batch size: 256 | lm loss: 3.695158E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.549 | TFLOPs: 26.95 | +7: iteration 17090/ 21553 | consumed samples: 4375040 | consumed tokens: 8960081920 | elapsed time per iteration (s): 0.18 | learning rate: 3.874E-05 | global batch size: 256 | lm loss: 3.689629E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.686 | TFLOPs: 26.90 | +7: iteration 17100/ 21553 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 0.18 | learning rate: 3.866E-05 | global batch size: 256 | lm loss: 3.701142E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.788 | TFLOPs: 26.92 | +7: iteration 17110/ 21553 | consumed samples: 4380160 | consumed tokens: 8970567680 | elapsed time per iteration (s): 0.18 | learning rate: 3.858E-05 | global batch size: 256 | lm loss: 3.703180E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.162 | TFLOPs: 26.96 | +7: iteration 17120/ 21553 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 0.18 | learning rate: 3.850E-05 | global batch size: 256 | lm loss: 3.697149E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.060 | TFLOPs: 27.04 | +7: iteration 17130/ 21553 | consumed samples: 4385280 | consumed tokens: 8981053440 | elapsed time per iteration (s): 0.18 | learning rate: 3.842E-05 | global batch size: 256 | lm loss: 3.700761E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.803 | TFLOPs: 27.03 | +7: iteration 17140/ 21553 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 0.18 | learning rate: 3.834E-05 | global batch size: 256 | lm loss: 3.698957E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.076 | TFLOPs: 27.02 | +7: iteration 17150/ 21553 | consumed samples: 4390400 | consumed tokens: 8991539200 | elapsed time per iteration (s): 0.18 | learning rate: 3.826E-05 | global batch size: 256 | lm loss: 3.696148E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.876 | TFLOPs: 26.69 | +7: iteration 17160/ 21553 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 0.18 | learning rate: 3.818E-05 | global batch size: 256 | lm loss: 3.702420E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.927 | TFLOPs: 27.03 | +7: iteration 17170/ 21553 | consumed samples: 4395520 | consumed tokens: 9002024960 | elapsed time per iteration (s): 0.19 | learning rate: 3.810E-05 | global batch size: 256 | lm loss: 3.698808E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1333.930 | TFLOPs: 25.49 | +7: iteration 17180/ 21553 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 0.18 | learning rate: 3.802E-05 | global batch size: 256 | lm loss: 3.699260E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.165 | TFLOPs: 27.04 | +7: iteration 17190/ 21553 | consumed samples: 4400640 | consumed tokens: 9012510720 | elapsed time per iteration (s): 0.18 | learning rate: 3.794E-05 | global batch size: 256 | lm loss: 3.699085E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.883 | TFLOPs: 27.03 | +7: iteration 17200/ 21553 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 0.18 | learning rate: 3.786E-05 | global batch size: 256 | lm loss: 3.699957E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.037 | TFLOPs: 27.02 | +7: iteration 17210/ 21553 | consumed samples: 4405760 | consumed tokens: 9022996480 | elapsed time per iteration (s): 0.18 | learning rate: 3.778E-05 | global batch size: 256 | lm loss: 3.703960E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.872 | TFLOPs: 27.03 | +7: iteration 17220/ 21553 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 0.18 | learning rate: 3.770E-05 | global batch size: 256 | lm loss: 3.696236E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.609 | TFLOPs: 27.03 | +7: iteration 17230/ 21553 | consumed samples: 4410880 | consumed tokens: 9033482240 | elapsed time per iteration (s): 0.18 | learning rate: 3.762E-05 | global batch size: 256 | lm loss: 3.693290E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.688 | TFLOPs: 26.99 | +7: iteration 17240/ 21553 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 0.18 | learning rate: 3.754E-05 | global batch size: 256 | lm loss: 3.700934E+00 | grad norm: 0.472 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.328 | TFLOPs: 27.02 | +7: iteration 17250/ 21553 | consumed samples: 4416000 | consumed tokens: 9043968000 | elapsed time per iteration (s): 0.18 | learning rate: 3.747E-05 | global batch size: 256 | lm loss: 3.699153E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.619 | TFLOPs: 27.01 | +7: iteration 17260/ 21553 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 0.18 | learning rate: 3.739E-05 | global batch size: 256 | lm loss: 3.700277E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.733 | TFLOPs: 26.99 | +7: iteration 17270/ 21553 | consumed samples: 4421120 | consumed tokens: 9054453760 | elapsed time per iteration (s): 0.18 | learning rate: 3.731E-05 | global batch size: 256 | lm loss: 3.696206E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.979 | TFLOPs: 27.02 | +7: iteration 17280/ 21553 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 0.18 | learning rate: 3.723E-05 | global batch size: 256 | lm loss: 3.704501E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.100 | TFLOPs: 27.02 | +7: iteration 17290/ 21553 | consumed samples: 4426240 | consumed tokens: 9064939520 | elapsed time per iteration (s): 0.18 | learning rate: 3.715E-05 | global batch size: 256 | lm loss: 3.695274E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.885 | TFLOPs: 27.03 | +7: iteration 17300/ 21553 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 0.18 | learning rate: 3.708E-05 | global batch size: 256 | lm loss: 3.707809E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.035 | TFLOPs: 27.04 | +7: iteration 17310/ 21553 | consumed samples: 4431360 | consumed tokens: 9075425280 | elapsed time per iteration (s): 0.18 | learning rate: 3.700E-05 | global batch size: 256 | lm loss: 3.697166E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.515 | TFLOPs: 27.03 | +7: iteration 17320/ 21553 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 0.18 | learning rate: 3.692E-05 | global batch size: 256 | lm loss: 3.696743E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.743 | TFLOPs: 27.03 | +7: iteration 17330/ 21553 | consumed samples: 4436480 | consumed tokens: 9085911040 | elapsed time per iteration (s): 0.18 | learning rate: 3.684E-05 | global batch size: 256 | lm loss: 3.702271E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.869 | TFLOPs: 27.01 | +7: iteration 17340/ 21553 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 0.18 | learning rate: 3.677E-05 | global batch size: 256 | lm loss: 3.693086E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.098 | TFLOPs: 26.98 | +7: iteration 17350/ 21553 | consumed samples: 4441600 | consumed tokens: 9096396800 | elapsed time per iteration (s): 0.18 | learning rate: 3.669E-05 | global batch size: 256 | lm loss: 3.703590E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.566 | TFLOPs: 26.99 | +7: iteration 17360/ 21553 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 0.18 | learning rate: 3.661E-05 | global batch size: 256 | lm loss: 3.693996E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.226 | TFLOPs: 27.06 | +7: iteration 17370/ 21553 | consumed samples: 4446720 | consumed tokens: 9106882560 | elapsed time per iteration (s): 0.18 | learning rate: 3.654E-05 | global batch size: 256 | lm loss: 3.694950E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.162 | TFLOPs: 27.04 | +7: iteration 17380/ 21553 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 0.18 | learning rate: 3.646E-05 | global batch size: 256 | lm loss: 3.696674E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.110 | TFLOPs: 27.06 | +7: iteration 17390/ 21553 | consumed samples: 4451840 | consumed tokens: 9117368320 | elapsed time per iteration (s): 0.18 | learning rate: 3.638E-05 | global batch size: 256 | lm loss: 3.695993E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.636 | TFLOPs: 27.03 | +7: iteration 17400/ 21553 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 0.18 | learning rate: 3.631E-05 | global batch size: 256 | lm loss: 3.698882E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.954 | TFLOPs: 27.02 | +7: iteration 17410/ 21553 | consumed samples: 4456960 | consumed tokens: 9127854080 | elapsed time per iteration (s): 0.18 | learning rate: 3.623E-05 | global batch size: 256 | lm loss: 3.701050E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.982 | TFLOPs: 27.00 | +7: iteration 17420/ 21553 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 0.18 | learning rate: 3.616E-05 | global batch size: 256 | lm loss: 3.698447E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.926 | TFLOPs: 27.00 | +7: iteration 17430/ 21553 | consumed samples: 4462080 | consumed tokens: 9138339840 | elapsed time per iteration (s): 0.18 | learning rate: 3.608E-05 | global batch size: 256 | lm loss: 3.691404E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.460 | TFLOPs: 27.01 | +7: iteration 17440/ 21553 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 0.18 | learning rate: 3.600E-05 | global batch size: 256 | lm loss: 3.695267E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.841 | TFLOPs: 27.01 | +7: iteration 17450/ 21553 | consumed samples: 4467200 | consumed tokens: 9148825600 | elapsed time per iteration (s): 0.18 | learning rate: 3.593E-05 | global batch size: 256 | lm loss: 3.701945E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.531 | TFLOPs: 27.01 | +7: iteration 17460/ 21553 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 0.18 | learning rate: 3.585E-05 | global batch size: 256 | lm loss: 3.692265E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.322 | TFLOPs: 27.00 | +7: iteration 17470/ 21553 | consumed samples: 4472320 | consumed tokens: 9159311360 | elapsed time per iteration (s): 0.18 | learning rate: 3.578E-05 | global batch size: 256 | lm loss: 3.706973E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.204 | TFLOPs: 26.98 | +7: iteration 17480/ 21553 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 0.18 | learning rate: 3.570E-05 | global batch size: 256 | lm loss: 3.699091E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.927 | TFLOPs: 27.02 | +7: iteration 17490/ 21553 | consumed samples: 4477440 | consumed tokens: 9169797120 | elapsed time per iteration (s): 0.18 | learning rate: 3.563E-05 | global batch size: 256 | lm loss: 3.695067E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.905 | TFLOPs: 27.00 | +7: iteration 17500/ 21553 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 0.18 | learning rate: 3.555E-05 | global batch size: 256 | lm loss: 3.694061E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.953 | TFLOPs: 27.02 | +7: iteration 17510/ 21553 | consumed samples: 4482560 | consumed tokens: 9180282880 | elapsed time per iteration (s): 0.18 | learning rate: 3.548E-05 | global batch size: 256 | lm loss: 3.701260E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.950 | TFLOPs: 27.05 | +7: iteration 17520/ 21553 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 0.18 | learning rate: 3.541E-05 | global batch size: 256 | lm loss: 3.701275E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.890 | TFLOPs: 27.05 | +7: iteration 17530/ 21553 | consumed samples: 4487680 | consumed tokens: 9190768640 | elapsed time per iteration (s): 0.18 | learning rate: 3.533E-05 | global batch size: 256 | lm loss: 3.694729E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.617 | TFLOPs: 27.03 | +7: iteration 17540/ 21553 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 0.18 | learning rate: 3.526E-05 | global batch size: 256 | lm loss: 3.686985E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.616 | TFLOPs: 26.99 | +7: iteration 17550/ 21553 | consumed samples: 4492800 | consumed tokens: 9201254400 | elapsed time per iteration (s): 0.18 | learning rate: 3.518E-05 | global batch size: 256 | lm loss: 3.695632E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.713 | TFLOPs: 26.51 | +7: iteration 17560/ 21553 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 0.18 | learning rate: 3.511E-05 | global batch size: 256 | lm loss: 3.692397E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.468 | TFLOPs: 26.99 | +7: iteration 17570/ 21553 | consumed samples: 4497920 | consumed tokens: 9211740160 | elapsed time per iteration (s): 0.18 | learning rate: 3.504E-05 | global batch size: 256 | lm loss: 3.694079E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.095 | TFLOPs: 26.96 | +7: iteration 17580/ 21553 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 0.18 | learning rate: 3.496E-05 | global batch size: 256 | lm loss: 3.699931E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.601 | TFLOPs: 26.93 | +7: iteration 17590/ 21553 | consumed samples: 4503040 | consumed tokens: 9222225920 | elapsed time per iteration (s): 0.18 | learning rate: 3.489E-05 | global batch size: 256 | lm loss: 3.688445E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.745 | TFLOPs: 26.94 | +7: iteration 17600/ 21553 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 0.18 | learning rate: 3.482E-05 | global batch size: 256 | lm loss: 3.694097E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.572 | TFLOPs: 26.97 | +7: iteration 17610/ 21553 | consumed samples: 4508160 | consumed tokens: 9232711680 | elapsed time per iteration (s): 0.18 | learning rate: 3.475E-05 | global batch size: 256 | lm loss: 3.692953E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.210 | TFLOPs: 26.89 | +7: iteration 17620/ 21553 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 0.18 | learning rate: 3.467E-05 | global batch size: 256 | lm loss: 3.691482E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.585 | TFLOPs: 26.99 | +7: iteration 17630/ 21553 | consumed samples: 4513280 | consumed tokens: 9243197440 | elapsed time per iteration (s): 0.18 | learning rate: 3.460E-05 | global batch size: 256 | lm loss: 3.693537E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.030 | TFLOPs: 26.96 | +7: iteration 17640/ 21553 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 0.18 | learning rate: 3.453E-05 | global batch size: 256 | lm loss: 3.696863E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.795 | TFLOPs: 26.96 | +7: iteration 17650/ 21553 | consumed samples: 4518400 | consumed tokens: 9253683200 | elapsed time per iteration (s): 0.18 | learning rate: 3.446E-05 | global batch size: 256 | lm loss: 3.696080E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.860 | TFLOPs: 26.98 | +7: iteration 17660/ 21553 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 0.18 | learning rate: 3.438E-05 | global batch size: 256 | lm loss: 3.692346E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.210 | TFLOPs: 26.98 | +7: iteration 17670/ 21553 | consumed samples: 4523520 | consumed tokens: 9264168960 | elapsed time per iteration (s): 0.18 | learning rate: 3.431E-05 | global batch size: 256 | lm loss: 3.694277E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.608 | TFLOPs: 26.97 | +7: iteration 17680/ 21553 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 0.18 | learning rate: 3.424E-05 | global batch size: 256 | lm loss: 3.690408E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.691 | TFLOPs: 26.93 | +7: iteration 17690/ 21553 | consumed samples: 4528640 | consumed tokens: 9274654720 | elapsed time per iteration (s): 0.18 | learning rate: 3.417E-05 | global batch size: 256 | lm loss: 3.701072E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.584 | TFLOPs: 26.97 | +7: iteration 17700/ 21553 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 0.18 | learning rate: 3.410E-05 | global batch size: 256 | lm loss: 3.699666E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.760 | TFLOPs: 26.97 | +7: iteration 17710/ 21553 | consumed samples: 4533760 | consumed tokens: 9285140480 | elapsed time per iteration (s): 0.18 | learning rate: 3.403E-05 | global batch size: 256 | lm loss: 3.698611E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.310 | TFLOPs: 26.98 | +7: iteration 17720/ 21553 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 0.18 | learning rate: 3.396E-05 | global batch size: 256 | lm loss: 3.695124E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.433 | TFLOPs: 26.97 | +7: iteration 17730/ 21553 | consumed samples: 4538880 | consumed tokens: 9295626240 | elapsed time per iteration (s): 0.18 | learning rate: 3.388E-05 | global batch size: 256 | lm loss: 3.695029E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.440 | TFLOPs: 26.99 | +7: iteration 17740/ 21553 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 0.18 | learning rate: 3.381E-05 | global batch size: 256 | lm loss: 3.695405E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.701 | TFLOPs: 26.97 | +7: iteration 17750/ 21553 | consumed samples: 4544000 | consumed tokens: 9306112000 | elapsed time per iteration (s): 0.18 | learning rate: 3.374E-05 | global batch size: 256 | lm loss: 3.698536E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.724 | TFLOPs: 26.97 | +7: iteration 17760/ 21553 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 0.18 | learning rate: 3.367E-05 | global batch size: 256 | lm loss: 3.691432E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.989 | TFLOPs: 27.00 | +7: iteration 17770/ 21553 | consumed samples: 4549120 | consumed tokens: 9316597760 | elapsed time per iteration (s): 0.18 | learning rate: 3.360E-05 | global batch size: 256 | lm loss: 3.687578E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.072 | TFLOPs: 27.00 | +7: iteration 17780/ 21553 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 0.18 | learning rate: 3.353E-05 | global batch size: 256 | lm loss: 3.699412E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.160 | TFLOPs: 26.96 | +7: iteration 17790/ 21553 | consumed samples: 4554240 | consumed tokens: 9327083520 | elapsed time per iteration (s): 0.18 | learning rate: 3.346E-05 | global batch size: 256 | lm loss: 3.694812E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.023 | TFLOPs: 27.00 | +7: iteration 17800/ 21553 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 0.18 | learning rate: 3.339E-05 | global batch size: 256 | lm loss: 3.703780E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.635 | TFLOPs: 26.99 | +7: iteration 17810/ 21553 | consumed samples: 4559360 | consumed tokens: 9337569280 | elapsed time per iteration (s): 0.18 | learning rate: 3.332E-05 | global batch size: 256 | lm loss: 3.690166E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.238 | TFLOPs: 26.98 | +7: iteration 17820/ 21553 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 0.18 | learning rate: 3.326E-05 | global batch size: 256 | lm loss: 3.692622E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.957 | TFLOPs: 26.98 | +7: iteration 17830/ 21553 | consumed samples: 4564480 | consumed tokens: 9348055040 | elapsed time per iteration (s): 0.18 | learning rate: 3.319E-05 | global batch size: 256 | lm loss: 3.694262E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.667 | TFLOPs: 26.97 | +7: iteration 17840/ 21553 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 0.18 | learning rate: 3.312E-05 | global batch size: 256 | lm loss: 3.694606E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.099 | TFLOPs: 26.98 | +7: iteration 17850/ 21553 | consumed samples: 4569600 | consumed tokens: 9358540800 | elapsed time per iteration (s): 0.18 | learning rate: 3.305E-05 | global batch size: 256 | lm loss: 3.699737E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.512 | TFLOPs: 26.97 | +7: iteration 17860/ 21553 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 0.18 | learning rate: 3.298E-05 | global batch size: 256 | lm loss: 3.695012E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.849 | TFLOPs: 26.86 | +7: iteration 17870/ 21553 | consumed samples: 4574720 | consumed tokens: 9369026560 | elapsed time per iteration (s): 0.18 | learning rate: 3.291E-05 | global batch size: 256 | lm loss: 3.688636E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.572 | TFLOPs: 26.86 | +7: iteration 17880/ 21553 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 0.18 | learning rate: 3.284E-05 | global batch size: 256 | lm loss: 3.694270E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.056 | TFLOPs: 26.86 | +7: iteration 17890/ 21553 | consumed samples: 4579840 | consumed tokens: 9379512320 | elapsed time per iteration (s): 0.18 | learning rate: 3.277E-05 | global batch size: 256 | lm loss: 3.699654E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.600 | TFLOPs: 26.84 | +7: iteration 17900/ 21553 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 0.18 | learning rate: 3.271E-05 | global batch size: 256 | lm loss: 3.690250E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.722 | TFLOPs: 26.86 | +7: iteration 17910/ 21553 | consumed samples: 4584960 | consumed tokens: 9389998080 | elapsed time per iteration (s): 0.18 | learning rate: 3.264E-05 | global batch size: 256 | lm loss: 3.697803E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.840 | TFLOPs: 26.82 | +7: iteration 17920/ 21553 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 0.18 | learning rate: 3.257E-05 | global batch size: 256 | lm loss: 3.692446E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.953 | TFLOPs: 26.92 | +7: iteration 17930/ 21553 | consumed samples: 4590080 | consumed tokens: 9400483840 | elapsed time per iteration (s): 0.18 | learning rate: 3.250E-05 | global batch size: 256 | lm loss: 3.690607E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.490 | TFLOPs: 26.97 | +7: iteration 17940/ 21553 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 0.18 | learning rate: 3.244E-05 | global batch size: 256 | lm loss: 3.695084E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.045 | TFLOPs: 26.90 | +7: iteration 17950/ 21553 | consumed samples: 4595200 | consumed tokens: 9410969600 | elapsed time per iteration (s): 0.18 | learning rate: 3.237E-05 | global batch size: 256 | lm loss: 3.694181E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.876 | TFLOPs: 27.01 | +7: iteration 17960/ 21553 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 0.18 | learning rate: 3.230E-05 | global batch size: 256 | lm loss: 3.689284E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.313 | TFLOPs: 26.98 | +7: iteration 17970/ 21553 | consumed samples: 4600320 | consumed tokens: 9421455360 | elapsed time per iteration (s): 0.18 | learning rate: 3.224E-05 | global batch size: 256 | lm loss: 3.697485E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.833 | TFLOPs: 26.99 | +7: iteration 17980/ 21553 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 0.18 | learning rate: 3.217E-05 | global batch size: 256 | lm loss: 3.690757E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.281 | TFLOPs: 27.00 | +7: iteration 17990/ 21553 | consumed samples: 4605440 | consumed tokens: 9431941120 | elapsed time per iteration (s): 0.18 | learning rate: 3.210E-05 | global batch size: 256 | lm loss: 3.691676E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.176 | TFLOPs: 26.89 | +0: [2023-03-17 00:30:16,112] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[3.2036439682204886e-05, 3.2036439682204886e-05, 3.2036439682204886e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 18000/ 21553 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 0.18 | learning rate: 3.204E-05 | global batch size: 256 | lm loss: 3.690342E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.615 | TFLOPs: 27.03 | +0: steps: 18000 loss: 3.6838 iter time (s): 0.180 samples/sec: 1422.600 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 18000 | lm loss value: 3.857691E+00 | lm loss PPL: 4.735587E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 18000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:30:16,200] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is begin to save! +0: [2023-03-17 00:30:16,203] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:30:16,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:30:16,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:30:16,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:30:16,288] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:30:16,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:30:16,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:30:16,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:30:16,311] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:30:16,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:30:16,322] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:30:16,333] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:30:16,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:30:16,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:30:16,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:30:16,359] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:30:16,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:30:16,370] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:30:16,370] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:30:16,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:30:16,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:30:16,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:30:16,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:30:16,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:30:16,394] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step18000/mp_rank_00_model_states.pt +0: [2023-03-17 00:30:16,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:30:16,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:30:16,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:30:16,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,434] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,434] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-17 00:30:16,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-17 00:30:16,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:30:16,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:30:16,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-17 00:30:16,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:30:16,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:30:16,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: successfully saved checkpoint at iteration 18000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 246.63 +7: iteration 18010/ 21553 | consumed samples: 4610560 | consumed tokens: 9442426880 | elapsed time per iteration (s): 0.21 | learning rate: 3.197E-05 | global batch size: 256 | lm loss: 3.692180E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1195.942 | TFLOPs: 22.85 | +7: iteration 18020/ 21553 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 0.18 | learning rate: 3.190E-05 | global batch size: 256 | lm loss: 3.686173E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.360 | TFLOPs: 26.97 | +7: iteration 18030/ 21553 | consumed samples: 4615680 | consumed tokens: 9452912640 | elapsed time per iteration (s): 0.18 | learning rate: 3.184E-05 | global batch size: 256 | lm loss: 3.691025E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.549 | TFLOPs: 26.91 | +7: iteration 18040/ 21553 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 0.18 | learning rate: 3.177E-05 | global batch size: 256 | lm loss: 3.694814E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.394 | TFLOPs: 26.97 | +7: iteration 18050/ 21553 | consumed samples: 4620800 | consumed tokens: 9463398400 | elapsed time per iteration (s): 0.18 | learning rate: 3.171E-05 | global batch size: 256 | lm loss: 3.683973E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.295 | TFLOPs: 26.96 | +7: iteration 18060/ 21553 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 0.18 | learning rate: 3.164E-05 | global batch size: 256 | lm loss: 3.688436E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.536 | TFLOPs: 26.97 | +7: iteration 18070/ 21553 | consumed samples: 4625920 | consumed tokens: 9473884160 | elapsed time per iteration (s): 0.18 | learning rate: 3.158E-05 | global batch size: 256 | lm loss: 3.691504E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.356 | TFLOPs: 26.98 | +7: iteration 18080/ 21553 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 0.18 | learning rate: 3.151E-05 | global batch size: 256 | lm loss: 3.694754E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.273 | TFLOPs: 26.96 | +7: iteration 18090/ 21553 | consumed samples: 4631040 | consumed tokens: 9484369920 | elapsed time per iteration (s): 0.18 | learning rate: 3.145E-05 | global batch size: 256 | lm loss: 3.693188E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.149 | TFLOPs: 26.98 | +7: iteration 18100/ 21553 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 0.18 | learning rate: 3.138E-05 | global batch size: 256 | lm loss: 3.692927E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.267 | TFLOPs: 26.96 | +7: iteration 18110/ 21553 | consumed samples: 4636160 | consumed tokens: 9494855680 | elapsed time per iteration (s): 0.18 | learning rate: 3.132E-05 | global batch size: 256 | lm loss: 3.688839E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.916 | TFLOPs: 26.98 | +7: iteration 18120/ 21553 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 0.18 | learning rate: 3.125E-05 | global batch size: 256 | lm loss: 3.695985E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.302 | TFLOPs: 26.96 | +7: iteration 18130/ 21553 | consumed samples: 4641280 | consumed tokens: 9505341440 | elapsed time per iteration (s): 0.18 | learning rate: 3.119E-05 | global batch size: 256 | lm loss: 3.687878E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.582 | TFLOPs: 26.97 | +7: iteration 18140/ 21553 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 0.18 | learning rate: 3.113E-05 | global batch size: 256 | lm loss: 3.697112E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.593 | TFLOPs: 26.95 | +7: iteration 18150/ 21553 | consumed samples: 4646400 | consumed tokens: 9515827200 | elapsed time per iteration (s): 0.18 | learning rate: 3.106E-05 | global batch size: 256 | lm loss: 3.684959E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.875 | TFLOPs: 26.94 | +7: iteration 18160/ 21553 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 0.18 | learning rate: 3.100E-05 | global batch size: 256 | lm loss: 3.690926E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.285 | TFLOPs: 26.95 | +7: iteration 18170/ 21553 | consumed samples: 4651520 | consumed tokens: 9526312960 | elapsed time per iteration (s): 0.18 | learning rate: 3.094E-05 | global batch size: 256 | lm loss: 3.689342E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.573 | TFLOPs: 26.84 | +7: iteration 18180/ 21553 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 0.18 | learning rate: 3.087E-05 | global batch size: 256 | lm loss: 3.690976E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.163 | TFLOPs: 26.71 | +7: iteration 18190/ 21553 | consumed samples: 4656640 | consumed tokens: 9536798720 | elapsed time per iteration (s): 0.18 | learning rate: 3.081E-05 | global batch size: 256 | lm loss: 3.691034E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.181 | TFLOPs: 26.92 | +7: iteration 18200/ 21553 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 0.18 | learning rate: 3.075E-05 | global batch size: 256 | lm loss: 3.691521E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.061 | TFLOPs: 26.98 | +7: iteration 18210/ 21553 | consumed samples: 4661760 | consumed tokens: 9547284480 | elapsed time per iteration (s): 0.18 | learning rate: 3.068E-05 | global batch size: 256 | lm loss: 3.692466E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.675 | TFLOPs: 26.95 | +7: iteration 18220/ 21553 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 0.18 | learning rate: 3.062E-05 | global batch size: 256 | lm loss: 3.695328E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.040 | TFLOPs: 26.96 | +7: iteration 18230/ 21553 | consumed samples: 4666880 | consumed tokens: 9557770240 | elapsed time per iteration (s): 0.18 | learning rate: 3.056E-05 | global batch size: 256 | lm loss: 3.687386E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.970 | TFLOPs: 26.92 | +7: iteration 18240/ 21553 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 0.18 | learning rate: 3.050E-05 | global batch size: 256 | lm loss: 3.691028E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.675 | TFLOPs: 26.93 | +7: iteration 18250/ 21553 | consumed samples: 4672000 | consumed tokens: 9568256000 | elapsed time per iteration (s): 0.18 | learning rate: 3.043E-05 | global batch size: 256 | lm loss: 3.686626E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.223 | TFLOPs: 26.83 | +7: iteration 18260/ 21553 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 0.18 | learning rate: 3.037E-05 | global batch size: 256 | lm loss: 3.684856E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.050 | TFLOPs: 26.85 | +7: iteration 18270/ 21553 | consumed samples: 4677120 | consumed tokens: 9578741760 | elapsed time per iteration (s): 0.18 | learning rate: 3.031E-05 | global batch size: 256 | lm loss: 3.683980E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.300 | TFLOPs: 26.83 | +7: iteration 18280/ 21553 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 0.18 | learning rate: 3.025E-05 | global batch size: 256 | lm loss: 3.692246E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.231 | TFLOPs: 26.91 | +7: iteration 18290/ 21553 | consumed samples: 4682240 | consumed tokens: 9589227520 | elapsed time per iteration (s): 0.18 | learning rate: 3.019E-05 | global batch size: 256 | lm loss: 3.690671E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.949 | TFLOPs: 26.96 | +7: iteration 18300/ 21553 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 0.18 | learning rate: 3.013E-05 | global batch size: 256 | lm loss: 3.694314E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.016 | TFLOPs: 26.96 | +7: iteration 18310/ 21553 | consumed samples: 4687360 | consumed tokens: 9599713280 | elapsed time per iteration (s): 0.18 | learning rate: 3.007E-05 | global batch size: 256 | lm loss: 3.686477E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.122 | TFLOPs: 26.94 | +7: iteration 18320/ 21553 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 0.18 | learning rate: 3.001E-05 | global batch size: 256 | lm loss: 3.688906E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.153 | TFLOPs: 26.92 | +7: iteration 18330/ 21553 | consumed samples: 4692480 | consumed tokens: 9610199040 | elapsed time per iteration (s): 0.18 | learning rate: 2.994E-05 | global batch size: 256 | lm loss: 3.691858E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.081 | TFLOPs: 26.94 | +7: iteration 18340/ 21553 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 0.18 | learning rate: 2.988E-05 | global batch size: 256 | lm loss: 3.691037E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.326 | TFLOPs: 26.97 | +7: iteration 18350/ 21553 | consumed samples: 4697600 | consumed tokens: 9620684800 | elapsed time per iteration (s): 0.18 | learning rate: 2.982E-05 | global batch size: 256 | lm loss: 3.693136E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.755 | TFLOPs: 26.94 | +7: iteration 18360/ 21553 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 0.18 | learning rate: 2.976E-05 | global batch size: 256 | lm loss: 3.699185E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.223 | TFLOPs: 26.96 | +7: iteration 18370/ 21553 | consumed samples: 4702720 | consumed tokens: 9631170560 | elapsed time per iteration (s): 0.18 | learning rate: 2.970E-05 | global batch size: 256 | lm loss: 3.688871E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.036 | TFLOPs: 26.96 | +7: iteration 18380/ 21553 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 0.18 | learning rate: 2.964E-05 | global batch size: 256 | lm loss: 3.690562E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.634 | TFLOPs: 26.93 | +7: iteration 18390/ 21553 | consumed samples: 4707840 | consumed tokens: 9641656320 | elapsed time per iteration (s): 0.18 | learning rate: 2.958E-05 | global batch size: 256 | lm loss: 3.692692E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.693 | TFLOPs: 26.90 | +7: iteration 18400/ 21553 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 0.18 | learning rate: 2.953E-05 | global batch size: 256 | lm loss: 3.689818E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.581 | TFLOPs: 26.89 | +7: iteration 18410/ 21553 | consumed samples: 4712960 | consumed tokens: 9652142080 | elapsed time per iteration (s): 0.18 | learning rate: 2.947E-05 | global batch size: 256 | lm loss: 3.686787E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.939 | TFLOPs: 26.96 | +7: iteration 18420/ 21553 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 0.18 | learning rate: 2.941E-05 | global batch size: 256 | lm loss: 3.691415E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.269 | TFLOPs: 26.95 | +7: iteration 18430/ 21553 | consumed samples: 4718080 | consumed tokens: 9662627840 | elapsed time per iteration (s): 0.18 | learning rate: 2.935E-05 | global batch size: 256 | lm loss: 3.698604E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.631 | TFLOPs: 26.95 | +7: iteration 18440/ 21553 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 0.18 | learning rate: 2.929E-05 | global batch size: 256 | lm loss: 3.689842E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.036 | TFLOPs: 26.94 | +7: iteration 18450/ 21553 | consumed samples: 4723200 | consumed tokens: 9673113600 | elapsed time per iteration (s): 0.18 | learning rate: 2.923E-05 | global batch size: 256 | lm loss: 3.687035E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.389 | TFLOPs: 26.93 | +7: iteration 18460/ 21553 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 0.18 | learning rate: 2.917E-05 | global batch size: 256 | lm loss: 3.697093E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.868 | TFLOPs: 26.92 | +7: iteration 18470/ 21553 | consumed samples: 4728320 | consumed tokens: 9683599360 | elapsed time per iteration (s): 0.18 | learning rate: 2.911E-05 | global batch size: 256 | lm loss: 3.686337E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.383 | TFLOPs: 26.93 | +7: iteration 18480/ 21553 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 0.18 | learning rate: 2.906E-05 | global batch size: 256 | lm loss: 3.685994E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.570 | TFLOPs: 26.97 | +7: iteration 18490/ 21553 | consumed samples: 4733440 | consumed tokens: 9694085120 | elapsed time per iteration (s): 0.18 | learning rate: 2.900E-05 | global batch size: 256 | lm loss: 3.689182E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.278 | TFLOPs: 26.95 | +7: iteration 18500/ 21553 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 0.18 | learning rate: 2.894E-05 | global batch size: 256 | lm loss: 3.687688E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.213 | TFLOPs: 26.96 | +7: iteration 18510/ 21553 | consumed samples: 4738560 | consumed tokens: 9704570880 | elapsed time per iteration (s): 0.18 | learning rate: 2.888E-05 | global batch size: 256 | lm loss: 3.696791E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.017 | TFLOPs: 26.96 | +7: iteration 18520/ 21553 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 0.18 | learning rate: 2.883E-05 | global batch size: 256 | lm loss: 3.682244E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.096 | TFLOPs: 26.96 | +7: iteration 18530/ 21553 | consumed samples: 4743680 | consumed tokens: 9715056640 | elapsed time per iteration (s): 0.18 | learning rate: 2.877E-05 | global batch size: 256 | lm loss: 3.691260E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.260 | TFLOPs: 26.96 | +7: iteration 18540/ 21553 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 0.18 | learning rate: 2.871E-05 | global batch size: 256 | lm loss: 3.694242E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.809 | TFLOPs: 26.96 | +7: iteration 18550/ 21553 | consumed samples: 4748800 | consumed tokens: 9725542400 | elapsed time per iteration (s): 0.18 | learning rate: 2.865E-05 | global batch size: 256 | lm loss: 3.689209E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.904 | TFLOPs: 26.96 | +7: iteration 18560/ 21553 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 0.18 | learning rate: 2.860E-05 | global batch size: 256 | lm loss: 3.688284E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.355 | TFLOPs: 26.95 | +7: iteration 18570/ 21553 | consumed samples: 4753920 | consumed tokens: 9736028160 | elapsed time per iteration (s): 0.18 | learning rate: 2.854E-05 | global batch size: 256 | lm loss: 3.688219E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.789 | TFLOPs: 26.94 | +7: iteration 18580/ 21553 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 0.18 | learning rate: 2.849E-05 | global batch size: 256 | lm loss: 3.690240E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.836 | TFLOPs: 26.94 | +7: iteration 18590/ 21553 | consumed samples: 4759040 | consumed tokens: 9746513920 | elapsed time per iteration (s): 0.18 | learning rate: 2.843E-05 | global batch size: 256 | lm loss: 3.695762E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.363 | TFLOPs: 26.95 | +7: iteration 18600/ 21553 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 0.18 | learning rate: 2.837E-05 | global batch size: 256 | lm loss: 3.694225E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.629 | TFLOPs: 26.95 | +7: iteration 18610/ 21553 | consumed samples: 4764160 | consumed tokens: 9756999680 | elapsed time per iteration (s): 0.18 | learning rate: 2.832E-05 | global batch size: 256 | lm loss: 3.695528E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.249 | TFLOPs: 26.96 | +7: iteration 18620/ 21553 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 0.18 | learning rate: 2.826E-05 | global batch size: 256 | lm loss: 3.689094E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.605 | TFLOPs: 26.95 | +7: iteration 18630/ 21553 | consumed samples: 4769280 | consumed tokens: 9767485440 | elapsed time per iteration (s): 0.18 | learning rate: 2.821E-05 | global batch size: 256 | lm loss: 3.690759E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.558 | TFLOPs: 26.91 | +7: iteration 18640/ 21553 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 0.18 | learning rate: 2.815E-05 | global batch size: 256 | lm loss: 3.685834E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.090 | TFLOPs: 26.88 | +7: iteration 18650/ 21553 | consumed samples: 4774400 | consumed tokens: 9777971200 | elapsed time per iteration (s): 0.18 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 3.691308E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.719 | TFLOPs: 26.92 | +7: iteration 18660/ 21553 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 0.18 | learning rate: 2.804E-05 | global batch size: 256 | lm loss: 3.691240E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.401 | TFLOPs: 26.97 | +7: iteration 18670/ 21553 | consumed samples: 4779520 | consumed tokens: 9788456960 | elapsed time per iteration (s): 0.18 | learning rate: 2.799E-05 | global batch size: 256 | lm loss: 3.691167E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.054 | TFLOPs: 26.90 | +7: iteration 18680/ 21553 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 0.18 | learning rate: 2.793E-05 | global batch size: 256 | lm loss: 3.692339E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.687 | TFLOPs: 26.93 | +7: iteration 18690/ 21553 | consumed samples: 4784640 | consumed tokens: 9798942720 | elapsed time per iteration (s): 0.18 | learning rate: 2.788E-05 | global batch size: 256 | lm loss: 3.686731E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.581 | TFLOPs: 26.93 | +7: iteration 18700/ 21553 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 0.18 | learning rate: 2.782E-05 | global batch size: 256 | lm loss: 3.687527E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.485 | TFLOPs: 26.95 | +7: iteration 18710/ 21553 | consumed samples: 4789760 | consumed tokens: 9809428480 | elapsed time per iteration (s): 0.19 | learning rate: 2.777E-05 | global batch size: 256 | lm loss: 3.691760E+00 | grad norm: 0.468 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1316.567 | TFLOPs: 25.15 | +7: iteration 18720/ 21553 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 0.18 | learning rate: 2.772E-05 | global batch size: 256 | lm loss: 3.683803E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.429 | TFLOPs: 26.87 | +7: iteration 18730/ 21553 | consumed samples: 4794880 | consumed tokens: 9819914240 | elapsed time per iteration (s): 0.18 | learning rate: 2.766E-05 | global batch size: 256 | lm loss: 3.691950E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1406.991 | TFLOPs: 26.88 | +7: iteration 18740/ 21553 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 0.18 | learning rate: 2.761E-05 | global batch size: 256 | lm loss: 3.694304E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.735 | TFLOPs: 26.92 | +7: iteration 18750/ 21553 | consumed samples: 4800000 | consumed tokens: 9830400000 | elapsed time per iteration (s): 0.18 | learning rate: 2.756E-05 | global batch size: 256 | lm loss: 3.689820E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.126 | TFLOPs: 26.94 | +7: iteration 18760/ 21553 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 0.18 | learning rate: 2.750E-05 | global batch size: 256 | lm loss: 3.689980E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.340 | TFLOPs: 26.95 | +7: iteration 18770/ 21553 | consumed samples: 4805120 | consumed tokens: 9840885760 | elapsed time per iteration (s): 0.18 | learning rate: 2.745E-05 | global batch size: 256 | lm loss: 3.683767E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.925 | TFLOPs: 26.96 | +7: iteration 18780/ 21553 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 0.18 | learning rate: 2.740E-05 | global batch size: 256 | lm loss: 3.692834E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.709 | TFLOPs: 26.97 | +7: iteration 18790/ 21553 | consumed samples: 4810240 | consumed tokens: 9851371520 | elapsed time per iteration (s): 0.18 | learning rate: 2.735E-05 | global batch size: 256 | lm loss: 3.687759E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.592 | TFLOPs: 26.97 | +7: iteration 18800/ 21553 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 0.18 | learning rate: 2.729E-05 | global batch size: 256 | lm loss: 3.681096E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.024 | TFLOPs: 26.98 | +7: iteration 18810/ 21553 | consumed samples: 4815360 | consumed tokens: 9861857280 | elapsed time per iteration (s): 0.18 | learning rate: 2.724E-05 | global batch size: 256 | lm loss: 3.686583E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.013 | TFLOPs: 26.98 | +7: iteration 18820/ 21553 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 0.18 | learning rate: 2.719E-05 | global batch size: 256 | lm loss: 3.688353E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.386 | TFLOPs: 26.97 | +7: iteration 18830/ 21553 | consumed samples: 4820480 | consumed tokens: 9872343040 | elapsed time per iteration (s): 0.18 | learning rate: 2.714E-05 | global batch size: 256 | lm loss: 3.686667E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.336 | TFLOPs: 26.97 | +7: iteration 18840/ 21553 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 0.18 | learning rate: 2.709E-05 | global batch size: 256 | lm loss: 3.688454E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.150 | TFLOPs: 26.96 | +7: iteration 18850/ 21553 | consumed samples: 4825600 | consumed tokens: 9882828800 | elapsed time per iteration (s): 0.18 | learning rate: 2.703E-05 | global batch size: 256 | lm loss: 3.687153E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.993 | TFLOPs: 26.98 | +7: iteration 18860/ 21553 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 0.19 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 3.692200E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1382.110 | TFLOPs: 26.41 | +7: iteration 18870/ 21553 | consumed samples: 4830720 | consumed tokens: 9893314560 | elapsed time per iteration (s): 0.18 | learning rate: 2.693E-05 | global batch size: 256 | lm loss: 3.683386E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.703 | TFLOPs: 26.97 | +7: iteration 18880/ 21553 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 0.18 | learning rate: 2.688E-05 | global batch size: 256 | lm loss: 3.685257E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.364 | TFLOPs: 26.81 | +7: iteration 18890/ 21553 | consumed samples: 4835840 | consumed tokens: 9903800320 | elapsed time per iteration (s): 0.18 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.682883E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.372 | TFLOPs: 27.00 | +7: iteration 18900/ 21553 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 0.18 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 3.694709E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.306 | TFLOPs: 26.98 | +7: iteration 18910/ 21553 | consumed samples: 4840960 | consumed tokens: 9914286080 | elapsed time per iteration (s): 0.18 | learning rate: 2.673E-05 | global batch size: 256 | lm loss: 3.686418E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.049 | TFLOPs: 27.00 | +7: iteration 18920/ 21553 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 0.18 | learning rate: 2.668E-05 | global batch size: 256 | lm loss: 3.688239E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.635 | TFLOPs: 27.05 | +7: iteration 18930/ 21553 | consumed samples: 4846080 | consumed tokens: 9924771840 | elapsed time per iteration (s): 0.18 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 3.681501E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.747 | TFLOPs: 27.01 | +7: iteration 18940/ 21553 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 0.18 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 3.688660E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.587 | TFLOPs: 26.95 | +7: iteration 18950/ 21553 | consumed samples: 4851200 | consumed tokens: 9935257600 | elapsed time per iteration (s): 0.18 | learning rate: 2.653E-05 | global batch size: 256 | lm loss: 3.684182E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.781 | TFLOPs: 27.05 | +7: iteration 18960/ 21553 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 0.18 | learning rate: 2.648E-05 | global batch size: 256 | lm loss: 3.692876E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.031 | TFLOPs: 27.04 | +7: iteration 18970/ 21553 | consumed samples: 4856320 | consumed tokens: 9945743360 | elapsed time per iteration (s): 0.18 | learning rate: 2.643E-05 | global batch size: 256 | lm loss: 3.686662E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.112 | TFLOPs: 27.02 | +7: iteration 18980/ 21553 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 0.18 | learning rate: 2.638E-05 | global batch size: 256 | lm loss: 3.687589E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.297 | TFLOPs: 26.98 | +7: iteration 18990/ 21553 | consumed samples: 4861440 | consumed tokens: 9956229120 | elapsed time per iteration (s): 0.18 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 3.685091E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1401.040 | TFLOPs: 26.77 | +7: iteration 19000/ 21553 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 0.18 | learning rate: 2.628E-05 | global batch size: 256 | lm loss: 3.684387E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.976 | TFLOPs: 27.00 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 19000 | lm loss value: 3.854015E+00 | lm loss PPL: 4.718212E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 19000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:33:18,210] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step19000 is begin to save! +0: [2023-03-17 00:33:18,215] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:33:18,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:33:18,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:33:18,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:33:18,299] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:33:18,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:33:18,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:33:18,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:33:18,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:33:18,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:33:18,335] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:33:18,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:33:18,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:33:18,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:33:18,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:33:18,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:33:18,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:33:18,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:33:18,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:33:18,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:33:18,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:33:18,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:33:18,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:33:18,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:33:18,405] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step19000/mp_rank_00_model_states.pt +0: [2023-03-17 00:33:18,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:33:18,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:33:18,423] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:33:18,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-17 00:33:18,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-17 00:33:18,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:33:18,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:33:18,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: successfully saved checkpoint at iteration 19000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 247.74 +7: iteration 19010/ 21553 | consumed samples: 4866560 | consumed tokens: 9966714880 | elapsed time per iteration (s): 0.21 | learning rate: 2.624E-05 | global batch size: 256 | lm loss: 3.684965E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1206.512 | TFLOPs: 23.05 | +7: iteration 19020/ 21553 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 0.18 | learning rate: 2.619E-05 | global batch size: 256 | lm loss: 3.685121E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.977 | TFLOPs: 27.00 | +7: iteration 19030/ 21553 | consumed samples: 4871680 | consumed tokens: 9977200640 | elapsed time per iteration (s): 0.18 | learning rate: 2.614E-05 | global batch size: 256 | lm loss: 3.683004E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.179 | TFLOPs: 26.98 | +7: iteration 19040/ 21553 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 0.18 | learning rate: 2.609E-05 | global batch size: 256 | lm loss: 3.683720E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.664 | TFLOPs: 26.99 | +7: iteration 19050/ 21553 | consumed samples: 4876800 | consumed tokens: 9987686400 | elapsed time per iteration (s): 0.18 | learning rate: 2.604E-05 | global batch size: 256 | lm loss: 3.687371E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.443 | TFLOPs: 26.99 | +7: iteration 19060/ 21553 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 0.18 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 3.689798E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.328 | TFLOPs: 26.98 | +7: iteration 19070/ 21553 | consumed samples: 4881920 | consumed tokens: 9998172160 | elapsed time per iteration (s): 0.18 | learning rate: 2.595E-05 | global batch size: 256 | lm loss: 3.686565E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.731 | TFLOPs: 26.99 | +7: iteration 19080/ 21553 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 0.18 | learning rate: 2.590E-05 | global batch size: 256 | lm loss: 3.685442E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.494 | TFLOPs: 27.01 | +7: iteration 19090/ 21553 | consumed samples: 4887040 | consumed tokens: 10008657920 | elapsed time per iteration (s): 0.18 | learning rate: 2.585E-05 | global batch size: 256 | lm loss: 3.686687E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.157 | TFLOPs: 27.02 | +7: iteration 19100/ 21553 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 0.18 | learning rate: 2.581E-05 | global batch size: 256 | lm loss: 3.683064E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.512 | TFLOPs: 27.03 | +7: iteration 19110/ 21553 | consumed samples: 4892160 | consumed tokens: 10019143680 | elapsed time per iteration (s): 0.18 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 3.687078E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.553 | TFLOPs: 27.05 | +7: iteration 19120/ 21553 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 0.18 | learning rate: 2.571E-05 | global batch size: 256 | lm loss: 3.680103E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.598 | TFLOPs: 27.03 | +7: iteration 19130/ 21553 | consumed samples: 4897280 | consumed tokens: 10029629440 | elapsed time per iteration (s): 0.18 | learning rate: 2.567E-05 | global batch size: 256 | lm loss: 3.683212E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.796 | TFLOPs: 27.03 | +7: iteration 19140/ 21553 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 0.18 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 3.683141E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.427 | TFLOPs: 27.01 | +7: iteration 19150/ 21553 | consumed samples: 4902400 | consumed tokens: 10040115200 | elapsed time per iteration (s): 0.18 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 3.679442E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.655 | TFLOPs: 27.05 | +7: iteration 19160/ 21553 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 0.18 | learning rate: 2.553E-05 | global batch size: 256 | lm loss: 3.688088E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.018 | TFLOPs: 27.02 | +7: iteration 19170/ 21553 | consumed samples: 4907520 | consumed tokens: 10050600960 | elapsed time per iteration (s): 0.18 | learning rate: 2.548E-05 | global batch size: 256 | lm loss: 3.689042E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.519 | TFLOPs: 26.57 | +7: iteration 19180/ 21553 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 0.18 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 3.684602E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.708 | TFLOPs: 26.95 | +7: iteration 19190/ 21553 | consumed samples: 4912640 | consumed tokens: 10061086720 | elapsed time per iteration (s): 0.18 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 3.681740E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.193 | TFLOPs: 27.02 | +7: iteration 19200/ 21553 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 0.18 | learning rate: 2.535E-05 | global batch size: 256 | lm loss: 3.685355E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.882 | TFLOPs: 27.05 | +7: iteration 19210/ 21553 | consumed samples: 4917760 | consumed tokens: 10071572480 | elapsed time per iteration (s): 0.18 | learning rate: 2.530E-05 | global batch size: 256 | lm loss: 3.679235E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.592 | TFLOPs: 27.03 | +7: iteration 19220/ 21553 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 0.18 | learning rate: 2.526E-05 | global batch size: 256 | lm loss: 3.687258E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.765 | TFLOPs: 27.03 | +7: iteration 19230/ 21553 | consumed samples: 4922880 | consumed tokens: 10082058240 | elapsed time per iteration (s): 0.18 | learning rate: 2.521E-05 | global batch size: 256 | lm loss: 3.690195E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.345 | TFLOPs: 27.04 | +7: iteration 19240/ 21553 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 0.18 | learning rate: 2.517E-05 | global batch size: 256 | lm loss: 3.685788E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.150 | TFLOPs: 27.02 | +7: iteration 19250/ 21553 | consumed samples: 4928000 | consumed tokens: 10092544000 | elapsed time per iteration (s): 0.18 | learning rate: 2.512E-05 | global batch size: 256 | lm loss: 3.685502E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.123 | TFLOPs: 27.00 | +7: iteration 19260/ 21553 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 0.18 | learning rate: 2.508E-05 | global batch size: 256 | lm loss: 3.687280E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.241 | TFLOPs: 27.02 | +7: iteration 19270/ 21553 | consumed samples: 4933120 | consumed tokens: 10103029760 | elapsed time per iteration (s): 0.18 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.682541E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.534 | TFLOPs: 27.01 | +7: iteration 19280/ 21553 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 0.18 | learning rate: 2.499E-05 | global batch size: 256 | lm loss: 3.677246E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.288 | TFLOPs: 27.02 | +7: iteration 19290/ 21553 | consumed samples: 4938240 | consumed tokens: 10113515520 | elapsed time per iteration (s): 0.18 | learning rate: 2.495E-05 | global batch size: 256 | lm loss: 3.683583E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.981 | TFLOPs: 27.00 | +7: iteration 19300/ 21553 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 0.20 | learning rate: 2.491E-05 | global batch size: 256 | lm loss: 3.683519E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1305.243 | TFLOPs: 24.94 | +7: iteration 19310/ 21553 | consumed samples: 4943360 | consumed tokens: 10124001280 | elapsed time per iteration (s): 0.18 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.684668E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.082 | TFLOPs: 27.02 | +7: iteration 19320/ 21553 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 0.18 | learning rate: 2.482E-05 | global batch size: 256 | lm loss: 3.683717E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.170 | TFLOPs: 26.91 | +7: iteration 19330/ 21553 | consumed samples: 4948480 | consumed tokens: 10134487040 | elapsed time per iteration (s): 0.18 | learning rate: 2.478E-05 | global batch size: 256 | lm loss: 3.677838E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.823 | TFLOPs: 26.92 | +7: iteration 19340/ 21553 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 0.18 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 3.686749E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.454 | TFLOPs: 26.91 | +7: iteration 19350/ 21553 | consumed samples: 4953600 | consumed tokens: 10144972800 | elapsed time per iteration (s): 0.18 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 3.687416E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.884 | TFLOPs: 26.92 | +7: iteration 19360/ 21553 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 0.18 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 3.683123E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.138 | TFLOPs: 26.94 | +7: iteration 19370/ 21553 | consumed samples: 4958720 | consumed tokens: 10155458560 | elapsed time per iteration (s): 0.18 | learning rate: 2.461E-05 | global batch size: 256 | lm loss: 3.687038E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.094 | TFLOPs: 26.94 | +7: iteration 19380/ 21553 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 0.18 | learning rate: 2.457E-05 | global batch size: 256 | lm loss: 3.686539E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.373 | TFLOPs: 26.93 | +7: iteration 19390/ 21553 | consumed samples: 4963840 | consumed tokens: 10165944320 | elapsed time per iteration (s): 0.18 | learning rate: 2.453E-05 | global batch size: 256 | lm loss: 3.683322E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.485 | TFLOPs: 26.93 | +7: iteration 19400/ 21553 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 0.18 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 3.677509E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.325 | TFLOPs: 26.95 | +7: iteration 19410/ 21553 | consumed samples: 4968960 | consumed tokens: 10176430080 | elapsed time per iteration (s): 0.18 | learning rate: 2.444E-05 | global batch size: 256 | lm loss: 3.683870E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.855 | TFLOPs: 26.92 | +7: iteration 19420/ 21553 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 0.18 | learning rate: 2.440E-05 | global batch size: 256 | lm loss: 3.687006E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.137 | TFLOPs: 26.94 | +7: iteration 19430/ 21553 | consumed samples: 4974080 | consumed tokens: 10186915840 | elapsed time per iteration (s): 0.18 | learning rate: 2.436E-05 | global batch size: 256 | lm loss: 3.685219E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1409.940 | TFLOPs: 26.94 | +7: iteration 19440/ 21553 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 0.18 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 3.686372E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.759 | TFLOPs: 26.95 | +7: iteration 19450/ 21553 | consumed samples: 4979200 | consumed tokens: 10197401600 | elapsed time per iteration (s): 0.18 | learning rate: 2.428E-05 | global batch size: 256 | lm loss: 3.684611E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.020 | TFLOPs: 26.96 | +7: iteration 19460/ 21553 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 0.18 | learning rate: 2.424E-05 | global batch size: 256 | lm loss: 3.684599E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.464 | TFLOPs: 26.66 | +7: iteration 19470/ 21553 | consumed samples: 4984320 | consumed tokens: 10207887360 | elapsed time per iteration (s): 0.18 | learning rate: 2.420E-05 | global batch size: 256 | lm loss: 3.679284E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.354 | TFLOPs: 27.02 | +7: iteration 19480/ 21553 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 0.18 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.676344E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.683 | TFLOPs: 26.51 | +7: iteration 19490/ 21553 | consumed samples: 4989440 | consumed tokens: 10218373120 | elapsed time per iteration (s): 0.18 | learning rate: 2.412E-05 | global batch size: 256 | lm loss: 3.683556E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.132 | TFLOPs: 27.04 | +7: iteration 19500/ 21553 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 0.18 | learning rate: 2.408E-05 | global batch size: 256 | lm loss: 3.677776E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.740 | TFLOPs: 27.03 | +7: iteration 19510/ 21553 | consumed samples: 4994560 | consumed tokens: 10228858880 | elapsed time per iteration (s): 0.18 | learning rate: 2.404E-05 | global batch size: 256 | lm loss: 3.686493E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.630 | TFLOPs: 27.01 | +7: iteration 19520/ 21553 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 0.18 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 3.687656E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.421 | TFLOPs: 27.04 | +7: iteration 19530/ 21553 | consumed samples: 4999680 | consumed tokens: 10239344640 | elapsed time per iteration (s): 0.18 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 3.683351E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.089 | TFLOPs: 27.04 | +7: iteration 19540/ 21553 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 0.18 | learning rate: 2.392E-05 | global batch size: 256 | lm loss: 3.685258E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.522 | TFLOPs: 27.05 | +7: iteration 19550/ 21553 | consumed samples: 5004800 | consumed tokens: 10249830400 | elapsed time per iteration (s): 0.18 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 3.681650E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.387 | TFLOPs: 27.04 | +7: iteration 19560/ 21553 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 0.18 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 3.682230E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.570 | TFLOPs: 27.05 | +7: iteration 19570/ 21553 | consumed samples: 5009920 | consumed tokens: 10260316160 | elapsed time per iteration (s): 0.18 | learning rate: 2.381E-05 | global batch size: 256 | lm loss: 3.681116E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.614 | TFLOPs: 27.03 | +7: iteration 19580/ 21553 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 0.18 | learning rate: 2.377E-05 | global batch size: 256 | lm loss: 3.686638E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.909 | TFLOPs: 27.03 | +7: iteration 19590/ 21553 | consumed samples: 5015040 | consumed tokens: 10270801920 | elapsed time per iteration (s): 0.18 | learning rate: 2.373E-05 | global batch size: 256 | lm loss: 3.682470E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.944 | TFLOPs: 27.03 | +7: iteration 19600/ 21553 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 0.18 | learning rate: 2.370E-05 | global batch size: 256 | lm loss: 3.675108E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.644 | TFLOPs: 27.03 | +7: iteration 19610/ 21553 | consumed samples: 5020160 | consumed tokens: 10281287680 | elapsed time per iteration (s): 0.18 | learning rate: 2.366E-05 | global batch size: 256 | lm loss: 3.682670E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.377 | TFLOPs: 27.06 | +7: iteration 19620/ 21553 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 0.18 | learning rate: 2.362E-05 | global batch size: 256 | lm loss: 3.689546E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.808 | TFLOPs: 27.03 | +7: iteration 19630/ 21553 | consumed samples: 5025280 | consumed tokens: 10291773440 | elapsed time per iteration (s): 0.18 | learning rate: 2.358E-05 | global batch size: 256 | lm loss: 3.688093E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.448 | TFLOPs: 27.06 | +7: iteration 19640/ 21553 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 0.18 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 3.687234E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.037 | TFLOPs: 27.06 | +7: iteration 19650/ 21553 | consumed samples: 5030400 | consumed tokens: 10302259200 | elapsed time per iteration (s): 0.18 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.677538E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.584 | TFLOPs: 27.10 | +7: iteration 19660/ 21553 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 0.18 | learning rate: 2.347E-05 | global batch size: 256 | lm loss: 3.685072E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.659 | TFLOPs: 27.05 | +7: iteration 19670/ 21553 | consumed samples: 5035520 | consumed tokens: 10312744960 | elapsed time per iteration (s): 0.18 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 3.679656E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1418.186 | TFLOPs: 27.10 | +7: iteration 19680/ 21553 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 0.18 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 3.679263E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.771 | TFLOPs: 27.09 | +7: iteration 19690/ 21553 | consumed samples: 5040640 | consumed tokens: 10323230720 | elapsed time per iteration (s): 0.18 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.681082E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.807 | TFLOPs: 27.09 | +7: iteration 19700/ 21553 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 0.18 | learning rate: 2.333E-05 | global batch size: 256 | lm loss: 3.679950E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.838 | TFLOPs: 27.07 | +7: iteration 19710/ 21553 | consumed samples: 5045760 | consumed tokens: 10333716480 | elapsed time per iteration (s): 0.18 | learning rate: 2.329E-05 | global batch size: 256 | lm loss: 3.680191E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.848 | TFLOPs: 27.07 | +7: iteration 19720/ 21553 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 0.18 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.680532E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.534 | TFLOPs: 27.08 | +7: iteration 19730/ 21553 | consumed samples: 5050880 | consumed tokens: 10344202240 | elapsed time per iteration (s): 0.18 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 3.681596E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.426 | TFLOPs: 27.08 | +7: iteration 19740/ 21553 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 0.18 | learning rate: 2.319E-05 | global batch size: 256 | lm loss: 3.692286E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.594 | TFLOPs: 27.07 | +7: iteration 19750/ 21553 | consumed samples: 5056000 | consumed tokens: 10354688000 | elapsed time per iteration (s): 0.18 | learning rate: 2.315E-05 | global batch size: 256 | lm loss: 3.676937E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.734 | TFLOPs: 27.09 | +7: iteration 19760/ 21553 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 0.18 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 3.676377E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.043 | TFLOPs: 27.06 | +7: iteration 19770/ 21553 | consumed samples: 5061120 | consumed tokens: 10365173760 | elapsed time per iteration (s): 0.18 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 3.687208E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.285 | TFLOPs: 27.06 | +7: iteration 19780/ 21553 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 0.18 | learning rate: 2.305E-05 | global batch size: 256 | lm loss: 3.687092E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.364 | TFLOPs: 27.08 | +7: iteration 19790/ 21553 | consumed samples: 5066240 | consumed tokens: 10375659520 | elapsed time per iteration (s): 0.18 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 3.677879E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.061 | TFLOPs: 27.07 | +7: iteration 19800/ 21553 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 0.18 | learning rate: 2.298E-05 | global batch size: 256 | lm loss: 3.684404E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.109 | TFLOPs: 27.06 | +7: iteration 19810/ 21553 | consumed samples: 5071360 | consumed tokens: 10386145280 | elapsed time per iteration (s): 0.18 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 3.682417E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.907 | TFLOPs: 27.07 | +7: iteration 19820/ 21553 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 0.18 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.677687E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.010 | TFLOPs: 27.05 | +7: iteration 19830/ 21553 | consumed samples: 5076480 | consumed tokens: 10396631040 | elapsed time per iteration (s): 0.18 | learning rate: 2.288E-05 | global batch size: 256 | lm loss: 3.681319E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.765 | TFLOPs: 27.05 | +7: iteration 19840/ 21553 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 0.18 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 3.684177E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.730 | TFLOPs: 27.05 | +7: iteration 19850/ 21553 | consumed samples: 5081600 | consumed tokens: 10407116800 | elapsed time per iteration (s): 0.18 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.686215E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.323 | TFLOPs: 27.04 | +7: iteration 19860/ 21553 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 0.18 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.679634E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.906 | TFLOPs: 27.03 | +7: iteration 19870/ 21553 | consumed samples: 5086720 | consumed tokens: 10417602560 | elapsed time per iteration (s): 0.18 | learning rate: 2.275E-05 | global batch size: 256 | lm loss: 3.684547E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.233 | TFLOPs: 27.06 | +7: iteration 19880/ 21553 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 0.18 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 3.693918E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.069 | TFLOPs: 27.04 | +7: iteration 19890/ 21553 | consumed samples: 5091840 | consumed tokens: 10428088320 | elapsed time per iteration (s): 0.18 | learning rate: 2.268E-05 | global batch size: 256 | lm loss: 3.684354E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.627 | TFLOPs: 27.05 | +7: iteration 19900/ 21553 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 0.18 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.684724E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.256 | TFLOPs: 27.04 | +7: iteration 19910/ 21553 | consumed samples: 5096960 | consumed tokens: 10438574080 | elapsed time per iteration (s): 0.18 | learning rate: 2.262E-05 | global batch size: 256 | lm loss: 3.676400E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.019 | TFLOPs: 27.05 | +7: iteration 19920/ 21553 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 0.18 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.682255E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1389.986 | TFLOPs: 26.56 | +7: iteration 19930/ 21553 | consumed samples: 5102080 | consumed tokens: 10449059840 | elapsed time per iteration (s): 0.18 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.684227E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.719 | TFLOPs: 27.03 | +7: iteration 19940/ 21553 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 0.18 | learning rate: 2.253E-05 | global batch size: 256 | lm loss: 3.677890E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.997 | TFLOPs: 27.05 | +7: iteration 19950/ 21553 | consumed samples: 5107200 | consumed tokens: 10459545600 | elapsed time per iteration (s): 0.18 | learning rate: 2.250E-05 | global batch size: 256 | lm loss: 3.681591E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.381 | TFLOPs: 27.04 | +7: iteration 19960/ 21553 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 0.18 | learning rate: 2.246E-05 | global batch size: 256 | lm loss: 3.693224E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.673 | TFLOPs: 27.05 | +7: iteration 19970/ 21553 | consumed samples: 5112320 | consumed tokens: 10470031360 | elapsed time per iteration (s): 0.18 | learning rate: 2.243E-05 | global batch size: 256 | lm loss: 3.680302E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.199 | TFLOPs: 27.04 | +7: iteration 19980/ 21553 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 0.18 | learning rate: 2.240E-05 | global batch size: 256 | lm loss: 3.688376E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.850 | TFLOPs: 27.03 | +7: iteration 19990/ 21553 | consumed samples: 5117440 | consumed tokens: 10480517120 | elapsed time per iteration (s): 0.18 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 3.684850E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.257 | TFLOPs: 27.02 | +0: [2023-03-17 00:36:19,733] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[2.234259142486404e-05, 2.234259142486404e-05, 2.234259142486404e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 20000/ 21553 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 0.18 | learning rate: 2.234E-05 | global batch size: 256 | lm loss: 3.682275E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.451 | TFLOPs: 27.03 | +0: steps: 20000 loss: 3.6949 iter time (s): 0.180 samples/sec: 1423.395 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 20000 | lm loss value: 3.879252E+00 | lm loss PPL: 4.838799E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 20000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:36:19,822] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! +0: [2023-03-17 00:36:19,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:36:19,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:36:19,896] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:36:19,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:36:19,908] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:36:19,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:36:19,919] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:36:19,930] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:36:19,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:36:19,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:36:19,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:36:19,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:36:19,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:36:19,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:36:19,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:36:19,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:36:19,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:36:19,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:36:19,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:36:19,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:36:19,997] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:36:20,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:36:20,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:36:20,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:36:20,010] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step20000/mp_rank_00_model_states.pt +0: [2023-03-17 00:36:20,010] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:36:20,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:36:20,029] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:36:20,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,047] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,047] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,047] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,051] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,051] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,055] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-17 00:36:20,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:36:20,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: successfully saved checkpoint at iteration 20000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 238.03 +7: iteration 20010/ 21553 | consumed samples: 5122560 | consumed tokens: 10491002880 | elapsed time per iteration (s): 0.21 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 3.680535E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1202.195 | TFLOPs: 22.97 | +7: iteration 20020/ 21553 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 0.18 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.678138E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.009 | TFLOPs: 27.04 | +7: iteration 20030/ 21553 | consumed samples: 5127680 | consumed tokens: 10501488640 | elapsed time per iteration (s): 0.18 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.681607E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.107 | TFLOPs: 27.02 | +7: iteration 20040/ 21553 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 0.18 | learning rate: 2.222E-05 | global batch size: 256 | lm loss: 3.676760E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.640 | TFLOPs: 27.03 | +7: iteration 20050/ 21553 | consumed samples: 5132800 | consumed tokens: 10511974400 | elapsed time per iteration (s): 0.18 | learning rate: 2.219E-05 | global batch size: 256 | lm loss: 3.689143E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.391 | TFLOPs: 27.02 | +7: iteration 20060/ 21553 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 0.18 | learning rate: 2.217E-05 | global batch size: 256 | lm loss: 3.679877E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.772 | TFLOPs: 27.03 | +7: iteration 20070/ 21553 | consumed samples: 5137920 | consumed tokens: 10522460160 | elapsed time per iteration (s): 0.18 | learning rate: 2.214E-05 | global batch size: 256 | lm loss: 3.682888E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.460 | TFLOPs: 27.03 | +7: iteration 20080/ 21553 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 0.18 | learning rate: 2.211E-05 | global batch size: 256 | lm loss: 3.681796E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.865 | TFLOPs: 27.03 | +7: iteration 20090/ 21553 | consumed samples: 5143040 | consumed tokens: 10532945920 | elapsed time per iteration (s): 0.18 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.680094E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.578 | TFLOPs: 27.03 | +7: iteration 20100/ 21553 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 0.18 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.676299E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.161 | TFLOPs: 27.04 | +7: iteration 20110/ 21553 | consumed samples: 5148160 | consumed tokens: 10543431680 | elapsed time per iteration (s): 0.18 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.675672E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1388.836 | TFLOPs: 26.54 | +7: iteration 20120/ 21553 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 0.18 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.677117E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1395.330 | TFLOPs: 26.66 | +7: iteration 20130/ 21553 | consumed samples: 5153280 | consumed tokens: 10553917440 | elapsed time per iteration (s): 0.18 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.676393E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.324 | TFLOPs: 27.02 | +7: iteration 20140/ 21553 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 0.18 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.679449E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.527 | TFLOPs: 26.57 | +7: iteration 20150/ 21553 | consumed samples: 5158400 | consumed tokens: 10564403200 | elapsed time per iteration (s): 0.18 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.686061E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1387.005 | TFLOPs: 26.50 | +7: iteration 20160/ 21553 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 0.18 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.686258E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.284 | TFLOPs: 27.04 | +7: iteration 20170/ 21553 | consumed samples: 5163520 | consumed tokens: 10574888960 | elapsed time per iteration (s): 0.18 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.678296E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1404.678 | TFLOPs: 26.84 | +7: iteration 20180/ 21553 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 0.18 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.676973E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.642 | TFLOPs: 27.03 | +7: iteration 20190/ 21553 | consumed samples: 5168640 | consumed tokens: 10585374720 | elapsed time per iteration (s): 0.18 | learning rate: 2.181E-05 | global batch size: 256 | lm loss: 3.674577E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.125 | TFLOPs: 27.00 | +7: iteration 20200/ 21553 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 0.18 | learning rate: 2.178E-05 | global batch size: 256 | lm loss: 3.671096E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.422 | TFLOPs: 26.99 | +7: iteration 20210/ 21553 | consumed samples: 5173760 | consumed tokens: 10595860480 | elapsed time per iteration (s): 0.18 | learning rate: 2.175E-05 | global batch size: 256 | lm loss: 3.681863E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.687 | TFLOPs: 26.99 | +7: iteration 20220/ 21553 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 0.18 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.680770E+00 | grad norm: 0.448 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.591 | TFLOPs: 26.97 | +7: iteration 20230/ 21553 | consumed samples: 5178880 | consumed tokens: 10606346240 | elapsed time per iteration (s): 0.18 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.676871E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.065 | TFLOPs: 27.00 | +7: iteration 20240/ 21553 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 0.18 | learning rate: 2.168E-05 | global batch size: 256 | lm loss: 3.673069E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.416 | TFLOPs: 27.01 | +7: iteration 20250/ 21553 | consumed samples: 5184000 | consumed tokens: 10616832000 | elapsed time per iteration (s): 0.18 | learning rate: 2.165E-05 | global batch size: 256 | lm loss: 3.682153E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.564 | TFLOPs: 27.01 | +7: iteration 20260/ 21553 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 0.20 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.687093E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1292.825 | TFLOPs: 24.70 | +7: iteration 20270/ 21553 | consumed samples: 5189120 | consumed tokens: 10627317760 | elapsed time per iteration (s): 0.19 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.685769E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1317.278 | TFLOPs: 25.17 | +7: iteration 20280/ 21553 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 0.23 | learning rate: 2.158E-05 | global batch size: 256 | lm loss: 3.675163E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.006 | TFLOPs: 21.32 | +7: iteration 20290/ 21553 | consumed samples: 5194240 | consumed tokens: 10637803520 | elapsed time per iteration (s): 0.21 | learning rate: 2.155E-05 | global batch size: 256 | lm loss: 3.682824E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1230.158 | TFLOPs: 23.50 | +7: iteration 20300/ 21553 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 0.18 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.672925E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.271 | TFLOPs: 26.89 | +7: iteration 20310/ 21553 | consumed samples: 5199360 | consumed tokens: 10648289280 | elapsed time per iteration (s): 0.18 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.679848E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.771 | TFLOPs: 27.01 | +7: iteration 20320/ 21553 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 0.18 | learning rate: 2.148E-05 | global batch size: 256 | lm loss: 3.684940E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.411 | TFLOPs: 27.01 | +7: iteration 20330/ 21553 | consumed samples: 5204480 | consumed tokens: 10658775040 | elapsed time per iteration (s): 0.18 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.679048E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.899 | TFLOPs: 27.03 | +7: iteration 20340/ 21553 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 0.18 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.672326E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.431 | TFLOPs: 27.04 | +7: iteration 20350/ 21553 | consumed samples: 5209600 | consumed tokens: 10669260800 | elapsed time per iteration (s): 0.18 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.678210E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1401.995 | TFLOPs: 26.79 | +7: iteration 20360/ 21553 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 0.18 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.683841E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1408.025 | TFLOPs: 26.90 | +7: iteration 20370/ 21553 | consumed samples: 5214720 | consumed tokens: 10679746560 | elapsed time per iteration (s): 0.18 | learning rate: 2.136E-05 | global batch size: 256 | lm loss: 3.679012E+00 | grad norm: 0.402 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.510 | TFLOPs: 27.05 | +7: iteration 20380/ 21553 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 0.18 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.677824E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.343 | TFLOPs: 27.04 | +7: iteration 20390/ 21553 | consumed samples: 5219840 | consumed tokens: 10690232320 | elapsed time per iteration (s): 0.19 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.676518E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.562 | TFLOPs: 26.34 | +7: iteration 20400/ 21553 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 0.18 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.683084E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.209 | TFLOPs: 26.94 | +7: iteration 20410/ 21553 | consumed samples: 5224960 | consumed tokens: 10700718080 | elapsed time per iteration (s): 0.18 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.678505E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.259 | TFLOPs: 27.04 | +7: iteration 20420/ 21553 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 0.18 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.681899E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.430 | TFLOPs: 27.04 | +7: iteration 20430/ 21553 | consumed samples: 5230080 | consumed tokens: 10711203840 | elapsed time per iteration (s): 0.18 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.682846E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.567 | TFLOPs: 27.03 | +7: iteration 20440/ 21553 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 0.18 | learning rate: 2.121E-05 | global batch size: 256 | lm loss: 3.676139E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.663 | TFLOPs: 27.05 | +7: iteration 20450/ 21553 | consumed samples: 5235200 | consumed tokens: 10721689600 | elapsed time per iteration (s): 0.18 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.677988E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.915 | TFLOPs: 27.03 | +7: iteration 20460/ 21553 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 0.18 | learning rate: 2.116E-05 | global batch size: 256 | lm loss: 3.679084E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1407.096 | TFLOPs: 26.88 | +7: iteration 20470/ 21553 | consumed samples: 5240320 | consumed tokens: 10732175360 | elapsed time per iteration (s): 0.18 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.672937E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.130 | TFLOPs: 27.08 | +7: iteration 20480/ 21553 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 0.18 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.680064E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.440 | TFLOPs: 27.06 | +7: iteration 20490/ 21553 | consumed samples: 5245440 | consumed tokens: 10742661120 | elapsed time per iteration (s): 0.18 | learning rate: 2.110E-05 | global batch size: 256 | lm loss: 3.679238E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.441 | TFLOPs: 27.06 | +7: iteration 20500/ 21553 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 0.18 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.685828E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.096 | TFLOPs: 27.08 | +7: iteration 20510/ 21553 | consumed samples: 5250560 | consumed tokens: 10753146880 | elapsed time per iteration (s): 0.18 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.674315E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.743 | TFLOPs: 27.07 | +7: iteration 20520/ 21553 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 0.18 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.680912E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.093 | TFLOPs: 27.04 | +7: iteration 20530/ 21553 | consumed samples: 5255680 | consumed tokens: 10763632640 | elapsed time per iteration (s): 0.18 | learning rate: 2.102E-05 | global batch size: 256 | lm loss: 3.672766E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.080 | TFLOPs: 27.02 | +7: iteration 20540/ 21553 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 0.18 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.676889E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.434 | TFLOPs: 27.02 | +7: iteration 20550/ 21553 | consumed samples: 5260800 | consumed tokens: 10774118400 | elapsed time per iteration (s): 0.18 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.680056E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.666 | TFLOPs: 27.03 | +7: iteration 20560/ 21553 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 0.18 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.681703E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.509 | TFLOPs: 27.03 | +7: iteration 20570/ 21553 | consumed samples: 5265920 | consumed tokens: 10784604160 | elapsed time per iteration (s): 0.18 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.681540E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.448 | TFLOPs: 27.06 | +7: iteration 20580/ 21553 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 0.18 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.677842E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.580 | TFLOPs: 27.05 | +7: iteration 20590/ 21553 | consumed samples: 5271040 | consumed tokens: 10795089920 | elapsed time per iteration (s): 0.18 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.676725E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.093 | TFLOPs: 27.06 | +7: iteration 20600/ 21553 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 0.18 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.677384E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.631 | TFLOPs: 27.07 | +7: iteration 20610/ 21553 | consumed samples: 5276160 | consumed tokens: 10805575680 | elapsed time per iteration (s): 0.18 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.675908E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.248 | TFLOPs: 27.06 | +7: iteration 20620/ 21553 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 0.18 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.678164E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.670 | TFLOPs: 27.03 | +7: iteration 20630/ 21553 | consumed samples: 5281280 | consumed tokens: 10816061440 | elapsed time per iteration (s): 0.18 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.669957E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.670 | TFLOPs: 27.03 | +7: iteration 20640/ 21553 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 0.18 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.677188E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.468 | TFLOPs: 27.06 | +7: iteration 20650/ 21553 | consumed samples: 5286400 | consumed tokens: 10826547200 | elapsed time per iteration (s): 0.18 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.676611E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1417.302 | TFLOPs: 27.08 | +7: iteration 20660/ 21553 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 0.18 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.676353E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.060 | TFLOPs: 27.06 | +7: iteration 20670/ 21553 | consumed samples: 5291520 | consumed tokens: 10837032960 | elapsed time per iteration (s): 0.18 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.676741E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.596 | TFLOPs: 27.03 | +7: iteration 20680/ 21553 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 0.18 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.674804E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.764 | TFLOPs: 27.01 | +7: iteration 20690/ 21553 | consumed samples: 5296640 | consumed tokens: 10847518720 | elapsed time per iteration (s): 0.18 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.676899E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.962 | TFLOPs: 26.96 | +7: iteration 20700/ 21553 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 0.18 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.669834E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.542 | TFLOPs: 27.03 | +7: iteration 20710/ 21553 | consumed samples: 5301760 | consumed tokens: 10858004480 | elapsed time per iteration (s): 0.18 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.676924E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.331 | TFLOPs: 27.00 | +7: iteration 20720/ 21553 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 0.18 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.677539E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.912 | TFLOPs: 27.03 | +7: iteration 20730/ 21553 | consumed samples: 5306880 | consumed tokens: 10868490240 | elapsed time per iteration (s): 0.18 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.679890E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.256 | TFLOPs: 27.06 | +7: iteration 20740/ 21553 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 0.18 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.679455E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1416.184 | TFLOPs: 27.06 | +7: iteration 20750/ 21553 | consumed samples: 5312000 | consumed tokens: 10878976000 | elapsed time per iteration (s): 0.18 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.674055E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.355 | TFLOPs: 27.04 | +7: iteration 20760/ 21553 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 0.18 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.683023E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.696 | TFLOPs: 27.05 | +7: iteration 20770/ 21553 | consumed samples: 5317120 | consumed tokens: 10889461760 | elapsed time per iteration (s): 0.18 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.678621E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.167 | TFLOPs: 27.04 | +7: iteration 20780/ 21553 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 0.19 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.674558E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.681 | TFLOPs: 25.84 | +7: iteration 20790/ 21553 | consumed samples: 5322240 | consumed tokens: 10899947520 | elapsed time per iteration (s): 0.20 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.676872E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1257.082 | TFLOPs: 24.02 | +7: iteration 20800/ 21553 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 0.18 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.683440E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.214 | TFLOPs: 27.00 | +7: iteration 20810/ 21553 | consumed samples: 5327360 | consumed tokens: 10910433280 | elapsed time per iteration (s): 0.18 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.677513E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.673 | TFLOPs: 27.05 | +7: iteration 20820/ 21553 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 0.18 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.669380E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.489 | TFLOPs: 27.04 | +7: iteration 20830/ 21553 | consumed samples: 5332480 | consumed tokens: 10920919040 | elapsed time per iteration (s): 0.18 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.673724E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.577 | TFLOPs: 27.01 | +7: iteration 20840/ 21553 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 0.18 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.673654E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.236 | TFLOPs: 27.02 | +7: iteration 20850/ 21553 | consumed samples: 5337600 | consumed tokens: 10931404800 | elapsed time per iteration (s): 0.18 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.679478E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.045 | TFLOPs: 27.00 | +7: iteration 20860/ 21553 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 0.18 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.677661E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.513 | TFLOPs: 27.03 | +7: iteration 20870/ 21553 | consumed samples: 5342720 | consumed tokens: 10941890560 | elapsed time per iteration (s): 0.19 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.684190E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1368.887 | TFLOPs: 26.15 | +7: iteration 20880/ 21553 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 0.18 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.668877E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1403.427 | TFLOPs: 26.81 | +7: iteration 20890/ 21553 | consumed samples: 5347840 | consumed tokens: 10952376320 | elapsed time per iteration (s): 0.18 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.678942E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.316 | TFLOPs: 26.98 | +7: iteration 20900/ 21553 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 0.18 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.677905E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.505 | TFLOPs: 26.97 | +7: iteration 20910/ 21553 | consumed samples: 5352960 | consumed tokens: 10962862080 | elapsed time per iteration (s): 0.18 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.680018E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.062 | TFLOPs: 26.94 | +7: iteration 20920/ 21553 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 0.18 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.677669E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.078 | TFLOPs: 26.94 | +7: iteration 20930/ 21553 | consumed samples: 5358080 | consumed tokens: 10973347840 | elapsed time per iteration (s): 0.18 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.679086E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1410.198 | TFLOPs: 26.94 | +7: iteration 20940/ 21553 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 0.18 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.677193E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1391.752 | TFLOPs: 26.59 | +7: iteration 20950/ 21553 | consumed samples: 5363200 | consumed tokens: 10983833600 | elapsed time per iteration (s): 0.19 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.677847E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1322.427 | TFLOPs: 25.27 | +7: iteration 20960/ 21553 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 0.18 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.681932E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1411.903 | TFLOPs: 26.98 | +7: iteration 20970/ 21553 | consumed samples: 5368320 | consumed tokens: 10994319360 | elapsed time per iteration (s): 0.18 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.671819E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1386.752 | TFLOPs: 26.50 | +7: iteration 20980/ 21553 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 0.18 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.682245E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.682 | TFLOPs: 27.03 | +7: iteration 20990/ 21553 | consumed samples: 5373440 | consumed tokens: 11004805120 | elapsed time per iteration (s): 0.18 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.681741E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.551 | TFLOPs: 27.03 | +7: iteration 21000/ 21553 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 0.19 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.676091E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1374.860 | TFLOPs: 26.27 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 21000 | lm loss value: 3.871819E+00 | lm loss PPL: 4.802965E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 21000 to checkpoints_83m14b100mdedup +0: [2023-03-17 00:39:23,029] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21000 is begin to save! +0: [2023-03-17 00:39:23,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:39:23,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:39:23,115] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:39:23,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:39:23,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:39:23,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:39:23,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:39:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:39:23,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:39:23,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:39:23,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:39:23,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:39:23,173] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:39:23,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:39:23,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:39:23,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:39:23,196] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:39:23,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:39:23,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:39:23,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:39:23,219] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:39:23,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:39:23,230] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:39:23,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:39:23,231] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step21000/mp_rank_00_model_states.pt +0: [2023-03-17 00:39:23,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/mp_rank_00_model_states.pt... +0: [2023-03-17 00:39:23,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/mp_rank_00_model_states.pt. +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:39:23,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,273] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,273] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,274] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,274] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,275] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,276] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,290] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-17 00:39:23,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-17 00:39:23,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: successfully saved checkpoint at iteration 21000 to checkpoints_83m14b100mdedup +7: time (ms) | save-checkpoint: 269.20 +7: iteration 21010/ 21553 | consumed samples: 5378560 | consumed tokens: 11015290880 | elapsed time per iteration (s): 0.21 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.674298E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1196.184 | TFLOPs: 22.85 | +7: iteration 21020/ 21553 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 0.18 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.679388E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1398.390 | TFLOPs: 26.72 | +7: iteration 21030/ 21553 | consumed samples: 5383680 | consumed tokens: 11025776640 | elapsed time per iteration (s): 0.18 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.676027E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.800 | TFLOPs: 26.69 | +7: iteration 21040/ 21553 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 0.18 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.677192E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.269 | TFLOPs: 27.02 | +7: iteration 21050/ 21553 | consumed samples: 5388800 | consumed tokens: 11036262400 | elapsed time per iteration (s): 0.18 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.673494E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1415.548 | TFLOPs: 27.05 | +7: iteration 21060/ 21553 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 0.23 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.671571E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.678 | TFLOPs: 20.95 | +7: iteration 21070/ 21553 | consumed samples: 5393920 | consumed tokens: 11046748160 | elapsed time per iteration (s): 0.19 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.676414E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1332.018 | TFLOPs: 25.45 | +7: iteration 21080/ 21553 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 0.19 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.675810E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1339.358 | TFLOPs: 25.59 | +7: iteration 21090/ 21553 | consumed samples: 5399040 | consumed tokens: 11057233920 | elapsed time per iteration (s): 0.19 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.673280E+00 | grad norm: 0.492 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.710 | TFLOPs: 25.92 | +7: iteration 21100/ 21553 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 0.22 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.674415E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1181.152 | TFLOPs: 22.57 | +7: iteration 21110/ 21553 | consumed samples: 5404160 | consumed tokens: 11067719680 | elapsed time per iteration (s): 0.23 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.673068E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.916 | TFLOPs: 21.03 | +7: iteration 21120/ 21553 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 0.19 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.669504E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1361.354 | TFLOPs: 26.01 | +7: iteration 21130/ 21553 | consumed samples: 5409280 | consumed tokens: 11078205440 | elapsed time per iteration (s): 0.19 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.676785E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1352.696 | TFLOPs: 25.85 | +7: iteration 21140/ 21553 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 0.19 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.677871E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1316.549 | TFLOPs: 25.15 | +7: iteration 21150/ 21553 | consumed samples: 5414400 | consumed tokens: 11088691200 | elapsed time per iteration (s): 0.19 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.679013E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1320.718 | TFLOPs: 25.23 | +7: iteration 21160/ 21553 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 0.20 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.673920E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1271.886 | TFLOPs: 24.30 | +7: iteration 21170/ 21553 | consumed samples: 5419520 | consumed tokens: 11099176960 | elapsed time per iteration (s): 0.19 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.678240E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.099 | TFLOPs: 25.91 | +7: iteration 21180/ 21553 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 0.19 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.676792E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.144 | TFLOPs: 26.27 | +7: iteration 21190/ 21553 | consumed samples: 5424640 | consumed tokens: 11109662720 | elapsed time per iteration (s): 0.19 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.682025E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1345.701 | TFLOPs: 25.71 | +7: iteration 21200/ 21553 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 0.19 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.674913E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1361.802 | TFLOPs: 26.02 | +7: iteration 21210/ 21553 | consumed samples: 5429760 | consumed tokens: 11120148480 | elapsed time per iteration (s): 0.18 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.677039E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1405.001 | TFLOPs: 26.84 | +7: iteration 21220/ 21553 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 0.19 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.680239E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1363.098 | TFLOPs: 26.04 | +7: iteration 21230/ 21553 | consumed samples: 5434880 | consumed tokens: 11130634240 | elapsed time per iteration (s): 0.20 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.678323E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1253.849 | TFLOPs: 23.96 | +7: iteration 21240/ 21553 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 0.19 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.681029E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1336.896 | TFLOPs: 25.54 | +7: iteration 21250/ 21553 | consumed samples: 5440000 | consumed tokens: 11141120000 | elapsed time per iteration (s): 0.19 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.676190E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1366.845 | TFLOPs: 26.12 | +7: iteration 21260/ 21553 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 0.20 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.674212E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1299.144 | TFLOPs: 24.82 | +7: iteration 21270/ 21553 | consumed samples: 5445120 | consumed tokens: 11151605760 | elapsed time per iteration (s): 0.21 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.680202E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1196.675 | TFLOPs: 22.86 | +7: iteration 21280/ 21553 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 0.19 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.678052E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1378.263 | TFLOPs: 26.33 | +7: iteration 21290/ 21553 | consumed samples: 5450240 | consumed tokens: 11162091520 | elapsed time per iteration (s): 0.18 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.670190E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1390.607 | TFLOPs: 26.57 | +7: iteration 21300/ 21553 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 0.18 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.679718E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.219 | TFLOPs: 27.00 | +7: iteration 21310/ 21553 | consumed samples: 5455360 | consumed tokens: 11172577280 | elapsed time per iteration (s): 0.19 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.671295E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1355.861 | TFLOPs: 25.91 | +7: iteration 21320/ 21553 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 0.19 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.674413E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1343.511 | TFLOPs: 25.67 | +7: iteration 21330/ 21553 | consumed samples: 5460480 | consumed tokens: 11183063040 | elapsed time per iteration (s): 0.20 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.679390E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1294.991 | TFLOPs: 24.74 | +7: iteration 21340/ 21553 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 0.18 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.673612E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.862 | TFLOPs: 27.03 | +7: iteration 21350/ 21553 | consumed samples: 5465600 | consumed tokens: 11193548800 | elapsed time per iteration (s): 0.19 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.683560E+00 | grad norm: 0.425 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1375.170 | TFLOPs: 26.27 | +7: iteration 21360/ 21553 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 0.18 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.677698E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1399.377 | TFLOPs: 26.74 | +7: iteration 21370/ 21553 | consumed samples: 5470720 | consumed tokens: 11204034560 | elapsed time per iteration (s): 0.19 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.668384E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.423 | TFLOPs: 25.99 | +7: iteration 21380/ 21553 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 0.18 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.678185E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1396.461 | TFLOPs: 26.68 | +7: iteration 21390/ 21553 | consumed samples: 5475840 | consumed tokens: 11214520320 | elapsed time per iteration (s): 0.19 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.675655E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1348.926 | TFLOPs: 25.77 | +7: iteration 21400/ 21553 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 0.19 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.673174E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1379.057 | TFLOPs: 26.35 | +7: iteration 21410/ 21553 | consumed samples: 5480960 | consumed tokens: 11225006080 | elapsed time per iteration (s): 0.18 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.683361E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.560 | TFLOPs: 27.03 | +7: iteration 21420/ 21553 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 0.19 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.676114E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1360.217 | TFLOPs: 25.99 | +7: iteration 21430/ 21553 | consumed samples: 5486080 | consumed tokens: 11235491840 | elapsed time per iteration (s): 0.19 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.672925E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1353.751 | TFLOPs: 25.87 | +7: iteration 21440/ 21553 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 0.18 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.675472E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1393.560 | TFLOPs: 26.63 | +7: iteration 21450/ 21553 | consumed samples: 5491200 | consumed tokens: 11245977600 | elapsed time per iteration (s): 0.19 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.679948E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1356.570 | TFLOPs: 25.92 | +7: iteration 21460/ 21553 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 0.19 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.680127E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1370.639 | TFLOPs: 26.19 | +7: iteration 21470/ 21553 | consumed samples: 5496320 | consumed tokens: 11256463360 | elapsed time per iteration (s): 0.19 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.675575E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1326.750 | TFLOPs: 25.35 | +7: iteration 21480/ 21553 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 0.19 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.678731E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1377.325 | TFLOPs: 26.32 | +7: iteration 21490/ 21553 | consumed samples: 5501440 | consumed tokens: 11266949120 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.683201E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1413.555 | TFLOPs: 27.01 | +7: iteration 21500/ 21553 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.678302E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1394.676 | TFLOPs: 26.65 | +7: iteration 21510/ 21553 | consumed samples: 5506560 | consumed tokens: 11277434880 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.670263E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1414.633 | TFLOPs: 27.03 | +7: iteration 21520/ 21553 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.671155E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1412.041 | TFLOPs: 26.98 | +7: iteration 21530/ 21553 | consumed samples: 5511680 | consumed tokens: 11287920640 | elapsed time per iteration (s): 0.19 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.669337E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1335.763 | TFLOPs: 25.52 | +7: iteration 21540/ 21553 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.670827E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1397.179 | TFLOPs: 26.70 | +7: iteration 21550/ 21553 | consumed samples: 5516800 | consumed tokens: 11298406400 | elapsed time per iteration (s): 0.18 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.673766E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1392.936 | TFLOPs: 26.61 | +0: [after training is done] datetime: 2023-03-17 00:41:08 +0: saving checkpoint at iteration 21553 to checkpoints_83m14b100mdedup +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.887809E+00 | lm loss PPL: 4.880383E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-17 00:41:08,450] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21553 is begin to save! +0: [2023-03-17 00:41:08,456] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:41:08,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:41:08,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:41:08,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:41:08,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:41:08,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:41:08,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:41:08,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:41:08,601] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:41:08,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:41:08,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:41:08,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:41:08,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:41:08,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:41:08,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:41:08,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:41:08,647] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:41:08,658] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:41:08,658] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:41:08,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:41:08,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:41:08,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:41:08,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:41:08,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:41:08,683] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt +0: [2023-03-17 00:41:08,683] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:41:08,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:41:08,702] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:41:08,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,721] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:41:08,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +1: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,728] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +4: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:41:08,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-17 00:41:08,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +7: [2023-03-17 00:41:08,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:41:08,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +6: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +2: [2023-03-17 00:41:08,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-17 00:41:08,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:41:08,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-17 00:41:08,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +3: [2023-03-17 00:41:08,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21553 is ready now! +0: successfully saved checkpoint at iteration 21553 to checkpoints_83m14b100mdedup +END 3326822: Fri 17 Mar 2023 12:41:17 AM EET diff --git a/83m14b100mdedup/3327361.err b/83m14b100mdedup/3327361.err new file mode 100644 index 0000000000000000000000000000000000000000..7ba01a542ef090f816b171c754c649597ad82e50 --- /dev/null +++ b/83m14b100mdedup/3327361.err @@ -0,0 +1,1121 @@ +1: 2023-03-17 00:52:34.411851: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411850: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411860: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411847: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411847: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-17 00:52:34.411899: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412725: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412753: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412756: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412767: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412771: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412777: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-17 00:52:34.412788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413384: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:34.413412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413787: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413800: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413819: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413807: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413832: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413809: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-17 00:52:34.413835: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427390: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427399: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427400: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427398: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427429: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427450: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-17 00:52:34.427454: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428571: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428582: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428583: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428578: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428590: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-17 00:52:34.428590: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429547: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429553: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429562: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429567: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-17 00:52:34.429563: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489122: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489125: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489125: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489115: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-17 00:52:34.489109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-17 00:52:36.049957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049965: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.049971: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:36.050363: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050364: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050372: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050369: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050371: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050375: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-17 00:52:36.050376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064045: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064058: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064052: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064053: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:36.064233: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064238: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064238: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064239: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064241: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064244: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064243: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-17 00:52:36.064249: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.083760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083768: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083768: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.083774: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:36.084168: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084171: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084177: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084179: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084180: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084180: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084184: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-17 00:52:36.084185: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.144806: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.144815: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:36.145323: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145321: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145327: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145329: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145330: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145332: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145333: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-17 00:52:36.145336: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158512: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158516: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158504: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158515: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:36.158702: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158710: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158709: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158713: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158711: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158714: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:36.158716: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193432: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193436: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193436: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193441: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193444: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193442: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:36.193828: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193830: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193833: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193838: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193837: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193839: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193844: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-17 00:52:36.193847: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194102: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194120: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194117: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 2023-03-17 00:52:36.194186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194176: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 00:52:36.194122: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 00:52:36.194671: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194673: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194189: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194194: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 2023-03-17 00:52:36.194679: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194681: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194686: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194686: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-17 00:52:36.194685: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:36.194692: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194184: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:36.194534: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194533: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194539: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194542: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194546: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194545: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194550: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-17 00:52:36.194553: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-17 00:52:40.903875: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903978: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903880: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903988: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903880: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903882: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903991: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903887: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903989: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-17 00:52:40.903890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.903995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904196: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904205: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.904222: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904583: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904597: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.904693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-17 00:52:40.904598: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:52:40.904697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:52:40.904707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:52:40.904705: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.904603: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-17 00:52:40.904707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.904707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.904705: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.904711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905796: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905799: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905799: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-17 00:52:40.905801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905803: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-17 00:52:40.905804: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905813: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:52:40.905815: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-17 00:52:40.905813: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:52:40.905819: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:52:40.905818: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905820: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-17 00:52:40.905821: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-17 00:52:40.905827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-17 00:52:40.905842: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-17 00:52:40.906064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906021: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906027: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.906069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 00:52:40.906024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906028: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906030: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.906074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-17 00:52:40.906033: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906034: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-17 00:52:40.906037: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906038: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-17 00:52:40.906039: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.906077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.906077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.906077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.906081: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.906089: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906536: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-17 00:52:40.906537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 00:52:40.906750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906552: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:52:40.906553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:52:40.906556: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:52:40.906557: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-17 00:52:40.906559: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-17 00:52:40.906560: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906574: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 00:52:40.906751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-17 00:52:40.906588: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-17 00:52:40.906875: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906761: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906765: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906766: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906772: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906773: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906775: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-17 00:52:40.906775: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-17 00:52:40.906784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-17 00:52:40.906798: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906874: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906883: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906882: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906894: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906896: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906900: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906901: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-17 00:52:40.906922: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-17 00:52:40.906936: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.908148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908092: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908089: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908156: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908171: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.908172: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.908174: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.908101: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: 2023-03-17 00:52:40.908178: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.908178: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908205: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: 2023-03-17 00:52:40.908109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-17 00:52:40.908225: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-17 00:52:40.908229: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.909973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909978: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909982: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909984: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909986: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909989: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.909991: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.909987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.909999: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.910002: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.910002: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.910007: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.910046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.910050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-17 00:52:40.910063: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-17 00:52:40.910065: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.913955: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913952: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.913968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915890: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915903: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915902: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915901: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915918: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915917: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915921: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915923: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915923: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915928: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-17 00:52:40.915937: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-17 00:52:40.915948: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +7: Successfully preprocessed all matching files. +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +7: Building extension module utils... +7: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +6: +6: +6: +7: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +3: Loading extension module utils... +7: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: +7: Loading extension module utils... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +0: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +0: No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: +1: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +6: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils... +6: +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/83m14b100mdedup/3327361.out b/83m14b100mdedup/3327361.out new file mode 100644 index 0000000000000000000000000000000000000000..0ed28a5d93a9c13a196a8056013f5bda2daff522 --- /dev/null +++ b/83m14b100mdedup/3327361.out @@ -0,0 +1,4384 @@ +Model parameters: d_model 640 ffw_size 2560 kv_size 64 n_heads 10 n_layers 10 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 10 --hidden-size 640 --num-attention-heads 10 --kv-channels 64 --ffn-hidden-size 2560 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-83m14b100mdedupval --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --no-load-optim --reset-progress --override-lr-scheduler --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --eval-only true --tensorboard-dir tensorboard_83m14b100mdedupval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_83m14b100mdedup --load checkpoints_83m14b100mdedup --train-weighted-split-paths-path train20b.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3327361.json --zero-stage 0 +START 3327361: Fri 17 Mar 2023 12:52:02 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 51.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 49.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 47.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 48.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 49.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 49.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 49.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 45.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 46.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 54.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 50.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +1: Launching on nid006862 (1/8), master nid006861 port 9999, GPUs 8, CUDA: True +5: Launching on nid006866 (5/8), master nid006861 port 9999, GPUs 8, CUDA: True +3: Launching on nid006864 (3/8), master nid006861 port 9999, GPUs 8, CUDA: True +0: Launching on nid006861 (0/8), master nid006861 port 9999, GPUs 8, CUDA: True +2: Launching on nid006863 (2/8), master nid006861 port 9999, GPUs 8, CUDA: True +4: Launching on nid006865 (4/8), master nid006861 port 9999, GPUs 8, CUDA: True +6: Launching on nid006867 (6/8), master nid006861 port 9999, GPUs 8, CUDA: True +7: Launching on nid006868 (7/8), master nid006861 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3327361.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... True +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2560 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 640 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-83m14b100mdedupval +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_83m14b100mdedup +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 10 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_83m14b100mdedup +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_83m14b100mdedupval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-17 00:52:59,570] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.088 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 26.280 seconds +0: time to initialize megatron (seconds): -23.195 +0: [after megatron is initialized] datetime: 2023-03-17 00:53:28 +0: building GPT model ... +0: [2023-03-17 00:53:28,940] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-17 00:53:28,940] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-17 00:53:28,940] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.2 GB, percent = 6.2% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-17 00:53:30,920] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=17 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: undo +0: 14: MixedFusedLayerNorm +0: 15: EmbeddingPipe +0: 16: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-17 00:53:31,164] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-17 00:53:31,164] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-17 00:53:31,165] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.22 GB, percent = 6.2% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-17 00:53:31,166] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-17 00:53:44,191] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-17 00:53:44,192] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-17 00:53:44,192] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-17 00:53:44,197] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-17 00:53:44,198] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-17 00:53:44,318] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-17 00:53:44,319] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-17 00:53:44,319] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.9 GB, percent = 6.3% +7: ninja: no work to do. +7: Time to load utils op: 0.17003989219665527 seconds +0: Time to load utils op: 0.11098408699035645 seconds +0: Time to load utils op: 0.1021428108215332 seconds +0: Time to load utils op: 0.1025242805480957 seconds +0: Time to load utils op: 0.10187363624572754 seconds +0: Time to load utils op: 0.10278487205505371 seconds +0: Time to load utils op: 0.10286307334899902 seconds +0: Time to load utils op: 0.1029815673828125 seconds +0: Time to load utils op: 0.10280060768127441 seconds +1: Time to load utils op: 0.11074042320251465 seconds +1: Time to load utils op: 0.11241841316223145 seconds +1: Time to load utils op: 0.11158370971679688 seconds +1: Time to load utils op: 0.11121273040771484 seconds +1: Time to load utils op: 0.11141490936279297 seconds +1: Time to load utils op: 0.11175012588500977 seconds +1: Time to load utils op: 0.11229109764099121 seconds +1: Time to load utils op: 0.11096501350402832 seconds +7: Time to load utils op: 0.10322451591491699 seconds +7: Time to load utils op: 0.10287070274353027 seconds +7: Time to load utils op: 0.10262107849121094 seconds +7: Time to load utils op: 0.10290980339050293 seconds +7: Time to load utils op: 0.10310912132263184 seconds +7: Time to load utils op: 0.10262203216552734 seconds +7: Time to load utils op: 0.10303878784179688 seconds +2: Time to load utils op: 0.11176633834838867 seconds +2: Time to load utils op: 0.11174631118774414 seconds +2: Time to load utils op: 0.11177396774291992 seconds +2: Time to load utils op: 0.1117551326751709 seconds +2: Time to load utils op: 0.11179375648498535 secondsTime to load utils op: 0.11179494857788086 seconds +2: +2: Time to load utils op: 0.11177659034729004 secondsTime to load utils op: 0.11177873611450195 seconds +2: +3: Time to load utils op: 0.11187481880187988 secondsTime to load utils op: 0.11188602447509766 seconds +3: +3: Time to load utils op: 0.11191654205322266 secondsTime to load utils op: 0.11191606521606445 seconds +3: Time to load utils op: 0.11188983917236328 seconds +3: Time to load utils op: 0.1118922233581543 secondsTime to load utils op: 0.1118929386138916 seconds +3: +3: Time to load utils op: 0.11189150810241699 seconds +3: +4: Time to load utils op: 0.11246562004089355 secondsTime to load utils op: 0.11242842674255371 seconds +4: +4: Time to load utils op: 0.11245465278625488 secondsTime to load utils op: 0.11244845390319824 seconds +4: +4: Time to load utils op: 0.1124563217163086 secondsTime to load utils op: 0.11245155334472656 seconds +4: +4: Time to load utils op: 0.11245512962341309 seconds +4: Time to load utils op: 0.11249303817749023 seconds +5: Time to load utils op: 0.1123056411743164 seconds +5: Time to load utils op: 0.1123194694519043 seconds +5: Time to load utils op: 0.11231803894042969 seconds +5: Time to load utils op: 0.11236929893493652 seconds +5: Time to load utils op: 0.11234903335571289 seconds +5: Time to load utils op: 0.1123666763305664 seconds +5: Time to load utils op: 0.11239838600158691 secondsTime to load utils op: 0.11239957809448242 seconds +5: +6: Time to load utils op: 0.11046838760375977 seconds +6: Time to load utils op: 0.11046862602233887 seconds +6: Time to load utils op: 0.11049175262451172 secondsTime to load utils op: 0.11051487922668457 secondsTime to load utils op: 0.11051797866821289 seconds +6: +6: +6: Time to load utils op: 0.11052942276000977 secondsTime to load utils op: 0.11052298545837402 seconds +6: +6: Time to load utils op: 0.11050605773925781 seconds +7: Time to load utils op: 0.0006136894226074219 seconds +7: Time to load utils op: 0.00046825408935546875 seconds +7: Time to load utils op: 0.0006456375122070312 secondsTime to load utils op: 0.0006434917449951172 seconds +7: +7: Time to load utils op: 0.0005464553833007812 seconds +7: Time to load utils op: 0.0007021427154541016 seconds +7: Time to load utils op: 0.0005280971527099609 seconds +7: Time to load utils op: 0.0005381107330322266 seconds +1: Time to load utils op: 0.0008208751678466797 seconds +0: Time to load utils op: 0.0005915164947509766 seconds +0: Time to load utils op: 0.0004956722259521484 seconds +0: Time to load utils op: 0.0005235671997070312 seconds +0: Time to load utils op: 0.0005156993865966797 seconds +0: Time to load utils op: 0.0004634857177734375 seconds +0: Time to load utils op: 0.0005102157592773438 seconds +1: Time to load utils op: 0.0014314651489257812 seconds +0: Time to load utils op: 0.00048422813415527344 seconds +1: Time to load utils op: 0.0011997222900390625 seconds +2: Time to load utils op: 0.0008199214935302734 seconds +1: Time to load utils op: 0.0013492107391357422 seconds +1: Time to load utils op: 0.0014286041259765625 seconds +1: Time to load utils op: 0.0014123916625976562 seconds +1: Time to load utils op: 0.0014412403106689453 seconds +2: Time to load utils op: 0.0009157657623291016 seconds +1: Time to load utils op: 0.001463174819946289 seconds +2: Time to load utils op: 0.0010433197021484375 seconds +2: Time to load utils op: 0.0010983943939208984 seconds +2: Time to load utils op: 0.0010530948638916016 seconds +2: Time to load utils op: 0.0010805130004882812 seconds +2: Time to load utils op: 0.00107574462890625 seconds +2: Time to load utils op: 0.001123666763305664 seconds +4: Time to load utils op: 0.0010666847229003906 seconds +4: Time to load utils op: 0.0012903213500976562 seconds +3: Time to load utils op: 0.0009443759918212891 seconds +3: Time to load utils op: 0.0009598731994628906 seconds +4: Time to load utils op: 0.0013959407806396484 secondsTime to load utils op: 0.0013480186462402344 seconds +4: +4: Time to load utils op: 0.0013859272003173828 secondsTime to load utils op: 0.0014157295227050781 seconds +4: +4: Time to load utils op: 0.0014023780822753906 seconds +4: Time to load utils op: 0.001447916030883789 seconds +3: Time to load utils op: 0.0013742446899414062 seconds +3: Time to load utils op: 0.0014023780822753906 seconds +3: Time to load utils op: 0.0013761520385742188 seconds +3: Time to load utils op: 0.0013909339904785156 seconds +3: Time to load utils op: 0.001390695571899414 seconds +3: Time to load utils op: 0.001444101333618164 seconds +6: Time to load utils op: 0.0010132789611816406 seconds +6: Time to load utils op: 0.0010335445404052734 seconds +6: Time to load utils op: 0.0010192394256591797 seconds +5: Time to load utils op: 0.0010123252868652344 seconds +5: Time to load utils op: 0.0009250640869140625 seconds +6: Time to load utils op: 0.001123189926147461 seconds +6: Time to load utils op: 0.0012390613555908203 seconds +5: Time to load utils op: 0.0011646747589111328 seconds +5: Time to load utils op: 0.0011749267578125 seconds +5: Time to load utils op: 0.0012009143829345703 seconds +6: Time to load utils op: 0.0012378692626953125 seconds +6: Time to load utils op: 0.001146078109741211 seconds +5: Time to load utils op: 0.0012018680572509766 seconds +5: Time to load utils op: 0.001222848892211914 seconds +5: Time to load utils op: 0.0012805461883544922 seconds +6: Time to load utils op: 0.0012691020965576172 seconds +0: [2023-03-17 00:53:44,565] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-17 00:53:44,565] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-17 00:53:44,566] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:44,685] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-17 00:53:44,686] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-17 00:53:44,686] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:44,790] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-17 00:53:44,790] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-17 00:53:44,790] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:44,894] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-17 00:53:44,895] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:44,895] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:44,997] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-17 00:53:44,998] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:44,998] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:45,101] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-17 00:53:45,102] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:45,102] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:45,203] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-17 00:53:45,204] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:45,204] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:45,311] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-17 00:53:45,312] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:45,312] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:45,414] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-17 00:53:45,414] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-17 00:53:45,414] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.05 GB, percent = 6.4% +0: [2023-03-17 00:53:45,414] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-17 00:53:45,415] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-17 00:53:45,415] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-17 00:53:45,415] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-17 00:53:45,415] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-17 00:53:45,415] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-17 00:53:45,415] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-17 00:53:45,415] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-17 00:53:45,415] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-17 00:53:45,416] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-17 00:53:45,417] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-17 00:53:45,417] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004291534423828125 seconds +0: [2023-03-17 00:53:45,418] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,468] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=82741760 (82.742M) TOTAL_PARAMS=82741760 (82.742M) UNIQUE_PARAMS=82741760 (82.742M) +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +5: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +0: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +0: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +6: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt... +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +2: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +3: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt. +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +2: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +3: [2023-03-17 00:53:45,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +0: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +6: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +5: [2023-03-17 00:53:45,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +7: [2023-03-17 00:53:45,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +1: [2023-03-17 00:53:45,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt... +4: [2023-03-17 00:53:45,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +5: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +2: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,762] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +7: [2023-03-17 00:53:45,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +3: [2023-03-17 00:53:45,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +4: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +0: [2023-03-17 00:53:45,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt. +1: [2023-03-17 00:53:45,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +2: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +3: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +1: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +4: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +6: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +5: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt... +7: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +5: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +6: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +1: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +7: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +0: [2023-03-17 00:53:45,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +4: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +2: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt. +3: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +2: [2023-03-17 00:53:45,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +4: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +3: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +5: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +6: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +1: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt... +7: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +7: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +5: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +3: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +4: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +0: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +6: [2023-03-17 00:53:45,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:45,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +1: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:45,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:45,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,923] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt. +2: [2023-03-17 00:53:45,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:45,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +5: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +7: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +0: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +4: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +3: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:45,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +1: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +3: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:46,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt... +2: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +2: [2023-03-17 00:53:46,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +7: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +5: [2023-03-17 00:53:46,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +6: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +1: [2023-03-17 00:53:46,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt. +0: [2023-03-17 00:53:46,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +6: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +5: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +7: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +4: [2023-03-17 00:53:46,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +0: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt... +3: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +6: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +7: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +2: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +3: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +4: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,099] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +0: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +1: [2023-03-17 00:53:46,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt. +5: [2023-03-17 00:53:46,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +7: [2023-03-17 00:53:46,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +1: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +3: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +4: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +6: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt... +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +4: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +6: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +7: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +3: [2023-03-17 00:53:46,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +0: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +5: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt. +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +2: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +3: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +4: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +1: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +6: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +7: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt... +5: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +7: [2023-03-17 00:53:46,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +2: [2023-03-17 00:53:46,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +5: [2023-03-17 00:53:46,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +6: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +4: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +3: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +0: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt. +1: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +7: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +4: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +1: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +2: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +0: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +6: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt... +3: [2023-03-17 00:53:46,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +4: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +7: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +1: [2023-03-17 00:53:46,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +5: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +2: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +3: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +6: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt. +0: [2023-03-17 00:53:46,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +2: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +4: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +6: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +5: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +3: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt... +7: [2023-03-17 00:53:46,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +4: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +2: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +3: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +6: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +1: [2023-03-17 00:53:46,444] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +0: [2023-03-17 00:53:46,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt. +5: [2023-03-17 00:53:46,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +4: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +3: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +1: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +7: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt... +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +7: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +4: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +6: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +3: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +0: [2023-03-17 00:53:46,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +1: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt. +5: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +4: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +2: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +1: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +6: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +7: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt... +3: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +2: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +4: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +6: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +7: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +3: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +7: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +2: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +4: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +6: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +5: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +6: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +5: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +5: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +0: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt. +1: [2023-03-17 00:53:46,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +1: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +1: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: > overriding learning rate value to 0.0002 +0: > overriding minimum learning rate value to 2e-05 +0: > overriding warmup iterations value to 0 +0: > overriding total number of iterations value to 1[2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: +0: > overriding decay style value to cosine +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt... +0: [2023-03-17 00:53:46,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt. +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-17 00:53:46,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-17 00:53:46,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-17 00:53:46,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-17 00:53:46,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,717] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +0: [2023-03-17 00:53:46,719] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +2: [2023-03-17 00:53:46,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,720] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +2: [2023-03-17 00:53:46,721] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +3: [2023-03-17 00:53:46,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,729] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +3: [2023-03-17 00:53:46,730] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +5: [2023-03-17 00:53:46,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,731] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +5: [2023-03-17 00:53:46,733] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +2: [2023-03-17 00:53:46,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,739] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +2: [2023-03-17 00:53:46,740] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +1: [2023-03-17 00:53:46,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,740] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +1: [2023-03-17 00:53:46,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,742] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +1: [2023-03-17 00:53:46,742] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +0: [2023-03-17 00:53:46,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,743] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +0: [2023-03-17 00:53:46,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,743] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +2: [2023-03-17 00:53:46,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,743] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +2: [2023-03-17 00:53:46,743] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +3: [2023-03-17 00:53:46,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,745] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +0: [2023-03-17 00:53:46,745] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +0: [2023-03-17 00:53:46,745] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +3: [2023-03-17 00:53:46,745] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +2: [2023-03-17 00:53:46,745] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +6: [2023-03-17 00:53:46,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,746] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +6: [2023-03-17 00:53:46,746] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +3: [2023-03-17 00:53:46,746] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +3: [2023-03-17 00:53:46,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,748] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +6: [2023-03-17 00:53:46,748] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +4: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +5: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +5: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +4: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +0: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +4: [2023-03-17 00:53:46,749] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +2: [2023-03-17 00:53:46,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +2: [2023-03-17 00:53:46,750] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +3: [2023-03-17 00:53:46,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +3: [2023-03-17 00:53:46,750] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +4: [2023-03-17 00:53:46,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,750] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +4: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +5: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +0: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +4: [2023-03-17 00:53:46,750] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +5: [2023-03-17 00:53:46,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,751] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +5: [2023-03-17 00:53:46,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +2: [2023-03-17 00:53:46,751] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +4: [2023-03-17 00:53:46,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +6: [2023-03-17 00:53:46,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +6: [2023-03-17 00:53:46,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +3: [2023-03-17 00:53:46,751] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +6: [2023-03-17 00:53:46,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,752] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +4: [2023-03-17 00:53:46,752] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +5: [2023-03-17 00:53:46,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,752] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +5: [2023-03-17 00:53:46,752] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +4: [2023-03-17 00:53:46,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +6: [2023-03-17 00:53:46,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +6: [2023-03-17 00:53:46,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +6: [2023-03-17 00:53:46,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +5: [2023-03-17 00:53:46,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +5: [2023-03-17 00:53:46,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,754] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +1: [2023-03-17 00:53:46,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,754] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +7: [2023-03-17 00:53:46,754] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +4: [2023-03-17 00:53:46,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,754] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +7: [2023-03-17 00:53:46,754] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +6: [2023-03-17 00:53:46,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,755] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +5: [2023-03-17 00:53:46,755] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +0: [2023-03-17 00:53:46,755] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +1: [2023-03-17 00:53:46,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,755] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +4: [2023-03-17 00:53:46,755] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +1: [2023-03-17 00:53:46,755] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +1: [2023-03-17 00:53:46,755] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +7: [2023-03-17 00:53:46,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,756] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +7: [2023-03-17 00:53:46,756] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +7: [2023-03-17 00:53:46,756] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +1: [2023-03-17 00:53:46,756] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +6: [2023-03-17 00:53:46,756] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +0: [2023-03-17 00:53:46,756] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +1: [2023-03-17 00:53:46,757] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +7: [2023-03-17 00:53:46,757] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +1: [2023-03-17 00:53:46,757] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +7: [2023-03-17 00:53:46,757] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +7: [2023-03-17 00:53:46,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,759] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +7: [2023-03-17 00:53:46,761] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +3: [2023-03-17 00:53:46,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,769] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +3: [2023-03-17 00:53:46,771] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +1: [2023-03-17 00:53:46,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,774] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +3: [2023-03-17 00:53:46,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,775] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +0: [2023-03-17 00:53:46,775] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +0: [2023-03-17 00:53:46,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-17 00:53:46,776] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +1: [2023-03-17 00:53:46,776] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +0: [2023-03-17 00:53:46,776] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +0: [2023-03-17 00:53:46,777] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +3: [2023-03-17 00:53:46,777] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +4: [2023-03-17 00:53:46,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +2: [2023-03-17 00:53:46,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +2: [2023-03-17 00:53:46,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +1: [2023-03-17 00:53:46,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +6: [2023-03-17 00:53:46,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,778] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +4: [2023-03-17 00:53:46,779] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +1: [2023-03-17 00:53:46,780] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +2: [2023-03-17 00:53:46,780] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +2: [2023-03-17 00:53:46,780] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +5: [2023-03-17 00:53:46,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,780] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +6: [2023-03-17 00:53:46,780] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +2: [2023-03-17 00:53:46,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-17 00:53:46,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-17 00:53:46,780] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +3: [2023-03-17 00:53:46,781] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +5: [2023-03-17 00:53:46,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,781] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +5: [2023-03-17 00:53:46,781] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +3: [2023-03-17 00:53:46,782] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +2: [2023-03-17 00:53:46,782] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +5: [2023-03-17 00:53:46,783] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +7: [2023-03-17 00:53:46,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,784] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +6: [2023-03-17 00:53:46,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,784] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +6: [2023-03-17 00:53:46,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-17 00:53:46,784] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +4: [2023-03-17 00:53:46,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,785] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +4: [2023-03-17 00:53:46,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-17 00:53:46,785] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +7: [2023-03-17 00:53:46,785] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +7: [2023-03-17 00:53:46,785] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +6: [2023-03-17 00:53:46,785] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +6: [2023-03-17 00:53:46,785] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +4: [2023-03-17 00:53:46,786] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +7: [2023-03-17 00:53:46,786] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +4: [2023-03-17 00:53:46,786] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +1: [2023-03-17 00:53:46,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-17 00:53:46,799] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +1: [2023-03-17 00:53:46,800] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +5: [2023-03-17 00:53:46,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-17 00:53:46,840] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +5: [2023-03-17 00:53:46,842] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +7: [2023-03-17 00:53:46,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-17 00:53:46,917] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +7: [2023-03-17 00:53:46,918] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +0: successfully loaded checkpoint from checkpoints_83m14b100mdedup at iteration 0 +7: time (ms) | load-checkpoint: 1450.41 +0: estimated model parameters: 0.08274176 +0: estimated model parameters without embeddings: 0.04923648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-17 00:53:47 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.030376 seconds +0: number of documents: 41786294 +0: > dataset split: +0: train: +0: document indices in [0, 41786294) total of 41786294 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.115 seconds +0: total number of samples: 9767463 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.032951 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.010 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-17 00:54:02 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 18455.03 | train/valid/test-data-iterators-setup: 14425.95 +0: [after training is done] datetime: 2023-03-17 00:54:02 +0: [2023-03-17 00:54:03,017] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-17 00:54:03,017] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-17 00:54:03,017] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-17 00:54:03,017] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-17 00:54:03,017] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.872943E+00 | lm loss PPL: 4.808367E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3327361: Fri 17 Mar 2023 12:54:22 AM EET diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b97414bc08c318b55e581d0cc1af58e075e50d61 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50082e4588bf525c251cf8249c21f3970c86d2c36039c538242dcd621ad3cf9d +size 15518743 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..98b817143c811608b47e4862091be5daccecfe6d --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e124d6a7f389a244d17160a140ff6c1a4b9f6992d7129bbb67853d5d5d2b89b5 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86004b480c2bfbe586cc2cda64a16b6eb626f80e --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b53a86be1c78915e56884624e32ff51b6318bbf0c54fa402cee3708fcf70bc +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..363c6b18b4aca04bdba907af41524ba633e73f8f --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f123fdcc6b02329c239d36142d1c1a844ee4ccf6b8d0e4a86c9a000b4af328f +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d224816a66ef3cb9f1069bf73cdece2a74c79db --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b55bdcb9f7758c79cef8d799ee56f0d61a3e349b701dd69dc92e1082d5e9f612 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7af07b9179ea8924cdfc635b2886ae8a307d1e5 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0affa5152b0d45f9e1036e10a4ecec76e8768bab013caf4944dc888fc920018f +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0d4deedb7cf435019da9b6a053fff989d456314 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:412da3fefeda56742b3a4d12ced4bd821c4072d8547003ceb672a867eb1c8448 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d8ef4ce64e5d11cf76a087914d6d98fff2a3b63 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bfd9c5cde26c6dab2a70957471fc558ef380f8e8e4a0426584525abd726d3e7 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4714740e2103d1f372c07fa9901c326422f13292 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a7a2057da2e4acef59787ae3d2e351a267416ec1921d5e0c4ef597bfebbaa37 +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2888dce904bef4e35e285d24f88e881eb82a2018 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8235912ba949ebe439d8398caf75bfd77f6802f0a230f8783066317d03cd039 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..00c1eb933d5ff7d940debbacd24f8bf5664342ba --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ee536837d8d48e187bae917dc5a195d4b7dfc6ce09ee98902168f3cb1c1f737 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b214cadb70f8dcb0cdbabfa7096e67666a28278 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c484f87cd4dea2e5ffa7d5802a38dc4378cc4dff38ab05e80f9289f90f97380 +size 15518615 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53834546370bc4d2be3316ccb2d48b705bd410bf --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d100b5677f84eab6a5db3a7cd5ce614680dff83c09e7c7c36c20b36bb11a1206 +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75b0be4db0ec4ffeb7e6548bf56229ec7d5a8f94 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58df526e997253639b4c81ea14bb7930ddcdc811f42c5f8fb739358ad860c836 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7c0afa2960122257d96c4d449dd9805fd5eb33c --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a11a83982bd1d2d122e1b0db6080170dfd8c2f30cab04e130450e111a277bb +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83ba48fb540f70cf3b2ecab0837c45747d2e0994 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774e796bf2a92d46e9651722e2126985bb6c8e6ab510580aa797af4fbce0617a +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04af18f6c2eb6792fb8fceb750d855e8f8e0b6be --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84fae4e4cf156517126632960f3e3d71d9e5e6894ed3e86533b41aeffc7bc75f +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..510b6c554d0e5d9b4b55e7008a82cedc8090a850 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0240906ba876fe27bd2c712e2e6d8c07ea96d854882ff4588f7d0d18a4610e4 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..38daf938dffcb316c0a1067d0fa18baad9484c76 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b55c241b36a6579398ff9280456a9c91be85f9f88c4a8fa1769c65fe46a1e376 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0047986927730cbe1cf852e7ff4a6210c18a5ed --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7b245d048c98ee2908cc1ef2209dbbb825321567c7bfa92a390afaf665f2d4 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..176ed937b5982c37c8c622f81d238b6700f0a4eb --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1b68275b691120e65eb0adcf5e6ea956acb3c2f53315ef7349a706b1629bda0 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd2ffd939ea89bed2de7b226d1974228f5e23553 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08954ccb6e8bbbcec261f735adca0c8f5239f5ead3709d00d38e9ec0338bb671 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f28d8957b76d185f08ac1caa059104ed0e1c7f0c --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bdc58022590747001b99dbfa31fd5e02d235e71fd149eaf414e8a8ffc7aa030 +size 15518743 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc5c26f9cb60ebb4d7cb9709859dd9d9a8f88d08 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a777c07ad05b1adb47ce10810b1280097eb4e689a040b603cf769aed4163be8c +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9c98a936447fcde03e2213d6d61965ef1d686e4 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5684d826355e27b313049caf77f8b728ede416dbb1f63103b85f158e572f948 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fc662571a4b216c5d538dec128bcc40d2c4fd9c --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e9cafea567bb92565943db6b1aaa23137f8ef5ce7e11aaf288705ff46e4c16 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa88f5a2c94012546003c23e273c17bb586e548c --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d604ad96c4c905a8d602119ecfa00aa0f582a5578b72fc40f3bdde312dcd45 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a5a47f750e4365b657c637d3ac0a6fe07eb8af9 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03840c55aa39a5079923b42842d9b8dccd197e42a23c5618214b862d0ca8a820 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..682de8db2a673a61a214b62e3c0a559c767ea811 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d22148c1ab1d22f3d17c7e571041e24b6c8be29e0dac6b450629918d182a2dde +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5d4dc6f15c47ec97d590f6559ca78118847f412 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13408c033608d27980faced6a0ef67ff77805ddc2bb5b3ee717376a9f92c6e7a +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..badc3f5fccd0ae2f911e56806fe2ca869c2e650c --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbb11f15107cf4e1c24e3f9da3cc60b5536a4582ec318360c21e7099ece7b28 +size 15518818 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bee8890b9a53af5c2d07f94b8d15dec73775f02 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc83b1fe63306d24d899a1b7f12ddbb23fb7ac2cb27561541433d67af7fa9c86 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25587ce28bd0671248e6fe13a1024b0c0cc478ec --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f7a425440a166ef89d75b20d5c188402636d16a0120f2e98a3b620b677c595 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c048467a36ed5486607090213d0d2be8406b70b --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f348e3f4226786c1610d3c9a46c8791ee739e2aff749a3ed2a2f5071f255d9b +size 15518743 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1df2240cf355109a2ee93d47333a24f3d981090d --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f406f6e1afc9e2f4a44488e2c65ce992d325e0b9fe997d73dc00ba93b0a56d66 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e949ecb3697a8aef22c2f48224908f6b4036b48 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e516f10a62a87c9e52fc06b8699fb398fdca28b12406452ae56b3a2382a6bbc9 +size 15518818 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffded0e6ad4f17455b6bbdde3a93dbda7cc3f914 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8381bd19dbc68f87f8e9a0cc35666bb4715bfcf8a0aa611b9bda7c23ff8b21d +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97f791fa4ae38e7b0f0fb4b5dd16b7431b93ed48 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa048e22b0f34791f945ea4ced27baa17bf1a92ac61eb5fff7965bd087723f43 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..025b48dd10ed6905904ce0c7a70635714ca7fc9f --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06701b8df9f242c60aba35f27fd6395a61e3b009b029ca00613bcd225f2ca074 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1f8f92c02534428b0e1a30f92a0d57d0fca80e1 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec7013e4486fbee1b8d58624a2816c576a1e74115cd7117b6d8ccfdaa4cd3f3 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..beb20a0549edda578dc8b14dc761d582e7cb6131 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb1f4bd92b518af18ce893e0a8087047eb4828d121f390058e0cad03cca93ef +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b06f7a7525baa2dd3649774625a782b7a05884ad --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c102087f3b106ace34e8f65b362429b07e1c22c63cc2e815e7580b1185ad5fc9 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c231516233942a0415d6651a1ee19d90905ab99 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12113a378bbfd23af5b37a8eabe441233820e3f0f2bb30f856edefa5f0bc3381 +size 15518562 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..446781e1acadfbb87982484d6c3c09b0a3825cd9 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f2e8262c3bf246eea55ac9c413435f6da1be92d52fffbef220123b17423681 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d0fd3d81d50f496825cd87d83896127c0975574 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178840324a1af62d3802cf7ea0ea61c4fa414098b62bca5cf6b815c1c00a8bba +size 15518615 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31b7c2b1bc178c8b7c9ae73ea0fd2763f519084d --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16c1196421ce4f94643f130a4d1d3c4504ee752945527f072a3127dbaebb3dba +size 15518818 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd62a93881d8264678d65a9241117e515b5f8c37 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f14944e4a5ff65fac714ccdfc33bfc7951c20717564647749f41b649b92cda +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..897e7231626d1d32319740ed32052522dadb9744 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f4959d1fd790b43d68cf285534ce31b3bbf2954fad012fb6fcbd612e40549dd +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10cf1e8a7d33380ef014314f9f175179c70bfc84 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b291bddcaa3fd56081084ccc885768605e5f90f4d4b9f59547875621c16206 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a53a67ba0d4606723ca2fb6dfcd1099329f439c0 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cebbc456a5abef3c49e6af43fe261bdff62e5f69ce251c8a1aead2cd3ea2ba55 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b0dace60621a68c7af9bc9ef2e99df69ad59b84 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f016f169b5b448cab0ddc320f8dae7b1fe319b447b072043c6e1c402f4a22ae4 +size 15518626 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e7a981d4d782fdea3c014280d4c7a2e2a321ebf --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f8e26c13dd28e6e5debcfab318a85b9000881f1e732ddcdb2d05e83edf2c5e +size 15518818 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be0bab1985390fc0a2c61a711e24f8cb81390b78 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fb2e8ea41281bd804f3b25aaaa804b0112b39f4dc49e395026d12d19a02480c +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5927acc97a1785c1d584fee0a5fb45b7080bbc2a --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535674571f50ee99351968cf92ef9235af4a3b80882d20f1fac3070669f133e2 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e51bc2eb225356d6af16ccb3909c80cd4698ead5 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe620b185b6650f697dbfb1f49d4192f30b9dca301489f330cd56c3db1aba20 +size 15518754 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f76fb25f074ab17b9b8473172ab6401d5a7cad25 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a83f8bc30832c16d59c5c2d886bf7f416684e9b50a51fcc6a812ea9a6d51084 +size 15518679 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c96883784913b4cf4c7e663bf708c5fa59534487 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a09e3a3f0f6174959072aea66f74bf2a62a9d3b126e3ba9639aa0834cf2430 +size 15518818 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04031233420d373c9bfe2c6a3f8ba18d623d5bc3 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e96a3f855b33c9cf1ecff515c0d36b62e8599d751e94273a3f25575d8865bc +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0aebe3877f993261acc358ea3d2aeb0a5f5e846 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:769ae17a92ed54c11e803999a9919a47480be5adff1fffb7848e5042825c1614 +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cae24ed17bf25c089516b7e12b8b1d9a600e3c9a --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a88787350f909f3709aee79a9dcf6014eff330481f8822754c661583602b6fc +size 15518690 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61f7fb53bfac345388009deb6b5004d59f3227ec --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5451fdb227f610f49a9c7823e0b100dcd5c6742d1823228bc06a1ecce273a2c2 +size 15518743 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9f2d929f5a91a9bc82935b974b836264f68ad13 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4765c0c6253902778db101e86f7b45d818ec6b972cf3fef62b5794646160f689 +size 15518679 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a59ff210459ba8ee4edfb6b6f320a98eeb88992 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b837e606ab626ec5daf06437133e053be71125a40194200d34cd89b0c69f93 +size 15518679 diff --git a/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..108c74b8fe539e595df64f76015e0d45820644c5 --- /dev/null +++ b/83m14b100mdedup/global_step21553/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4617be8a9219b5016b8c9ac29171d2ff32132c666f5cbc718b65900f816afac6 +size 15518743 diff --git a/83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..242e67616581e29ad67544aa941b944f340c1dcd --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20d44417373f6ef1097b57fde33caa3e70cda0b178bd2170fb69911cd035011a +size 67011843 diff --git a/83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4e9fafe80069a0d72d813fb9a7eecf9a98467ce --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96816db357762c8b7cc750cb505dd4f62cb8a0d6393d4ea037ced3e73bb89d2 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f55d2f2931b90c9df68dc5333020b486ae306efb --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ac82cf9b3b44b2deb0528b04fab851530716156750335a7407b55e9a9f50c58 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1689b4770ec21b6375ee1fb5ca0fabf76be34581 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1214720059edcc991e0cefaed3d8d572b58be35f294eae49ff773fdfe04c323d +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c80ff931d28e073801879e194225deb68999e1b --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:522dc906474a0db06d2ffc535ac8569fc9304e65c428860ef229f78e579e70e6 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e74016e53f0cf387da477b78fe20adfa3290b09 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19953a9e0df29a929b1be243e79b299c1c547636ea251cca53ad13d7c9a9f01 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc4805a7bfb01bc58a764d02f46124e8bccefb57 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3867cc8ad8381a24b80a85abd50a2d15d79199b35b7a7e29aa9855c182be587 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cfd6c0789848ad86426412e8ffaca715ce5b4b3 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb5823cd66314c55a6cf4009c664bd6c01f591c39ca98304f3ed13b4b1a93735 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60dc3998bc56579828612c8664c97790c8618e48 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9168c38e8f16c15dc80f92b0e3360a2e62d399802876c6ea9a0c20301367a90b +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb2060906a1d410bff5a2eadbfee253321828e92 --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8cf708b5713848e9b14bf28da43e16c37aa74e8ec7f42ad8a048324b9c236f +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9118f6d243c1e72d768f29c47b3683b9bc1bca6a --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf600aea6d0a66017179d1457cee4ed08e9e68cb545ab4728368847b789deb5 +size 9851395 diff --git a/83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt b/83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65789a158902b5015c9d2466485ca3f5b55b85ee --- /dev/null +++ b/83m14b100mdedup/global_step21553/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e610504f937c3ff889749f4916dc3a17a8915ea4d8b5e91ccde636b0293d741 +size 3779 diff --git a/83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt b/83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0918847746d81e93d18fc55e74f19ce02d3451cc --- /dev/null +++ b/83m14b100mdedup/global_step21553/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4846eafeaedb66e039554f06ae20e6b21c1dd7c41b3a2fc9f51c59909fb471 +size 31667 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..215caf699ed0550a165b8400ad40cd7e7b8a6112 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fa23194850f1736d5f542904368c45b4e30c9cba19c59cebe07fd6f24e3bf46 +size 62061079 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..524bc47ae4e27ae7390a53a2b7288d8a3f9f2028 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14de3a07aadeec992727ef89f13ecb49c851113874656e2cd5946306bb9a14a6 +size 62061218 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..732c2f0dfbcb8a6d086b45c6af8b193597d3f0b5 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11dafb2f1d5971a5b9140f9fa47d4c2eb7a2bf1167ec334aa7c52fa079d9f001 +size 62061282 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..71d4862e056c8b59cda783a28b1ce0a947c92b6f --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84dfb8887be9145094cf42c5d290a0ded0f873290cc4eb4431b2b17cc269fadf +size 62061154 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4702fe682ac6bf9ca1d5ea2e80ae096f2bb9684a --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da4b87d60e8cc563a566e264986251a6ad11d6b1a5be2166cf3a1f0efd842b0 +size 62061218 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..12b1db3053f61cf2b8d93ccc824c16759044e282 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65bdd91b84fe061e1ff79810c2ca5ed3fbe24b7300fb682fedbe7a369b13e1ec +size 62061282 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7351f696b05ce8fa58794d3e57972d136411bcb1 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:393461f05c7ac8f3fbc9658e26e9dde2e67920c5c1326fb11ab3b67f9c9df069 +size 62061154 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c47b3491f38f7c040a95132b85ed51b75a8b2890 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4212d1a955a728d729c25bdb9200a7a450d9158fc513af6e6fff5f810b2c9640 +size 62061015 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b94ca16ed888b77b3f445f6f40313ccc53cfa99 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78ace877f88bdd99e8d572190ecf726f489a9ae16db89a460f66231a4374fe7 +size 62061015 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fb82a8caa3d662cb47db8be0aff0ae476f239bb --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eef3772bcb669d07234f26a34def7192367f57464f809d72628a990744474661 +size 62061079 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..834c7b5c416925fa86c32c356701326ab5d6746d --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fa964b5b8ccd22526306af0252cf21e645ccd2721d3de526ce5550258b5846a +size 62061079 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f12b59cfcc8f0320d1e1bffea5577c1b5c5830c --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:991e4940b7a1d13ae9adfb172e8c164ed353730a4ddacf0e044b96174d9d4239 +size 62061015 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b027488909c2aeb66e97fb08fcfa0a87aab9863 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f4f9984a330a5ef9e6d56c15ead245a62c4472b9ff4d0c45aae7365c3fec7c +size 62061015 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cda2c0e6e3d7e143c1a8a327c97f0ed32c24e12d --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab079f6363e6a909f4c87812a2fc9c0dd82641839888c8a9ebec389946791379 +size 62061079 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..246147f6b2f598eca5a765d8c03430cfe179426c --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc76ca24ae8a3d7ebf8e50a4f89a2c8d770fbca314a7e9f5cee55d9c1f71d867 +size 62061015 diff --git a/83m1b51b5/global_step2891/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/83m1b51b5/global_step2891/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08edf23b52574d5bd706e2d8234f9b749af7aeb1 --- /dev/null +++ b/83m1b51b5/global_step2891/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02026cf8945738aba862cc16d05144ba98a6f2806c8233120aa8c00845cb3d17 +size 62061207 diff --git a/83m1b51b5/global_step2891/layer_01-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d99fb2b8968f6a2f1c3e83b856c7765248c0da06 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12149a691da0e88ba805a4dfeb1e45c02e36a53769616877949ad0a6184dcc71 +size 67011843 diff --git a/83m1b51b5/global_step2891/layer_03-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..700f2fbcf5b4ef554c6df779be1bd45000c505dc --- /dev/null +++ b/83m1b51b5/global_step2891/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f93025d774976a9c6b890d7a8bd6e11128530b3fe6bca700c2bb5d1defc55273 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_04-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bb352ed27392c30ae819519b0860f4cb3a7daf7 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d66d505304c87052df9f4ec0f4142dcd4b61007ca709cb017f71b073c35a6db +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_05-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..56d96e6b6d6a18ff7b3292335891b04ff2c3b2f3 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955a98fdb7fe49c10922ffaa0ca010dabc20903d17cab611cc629736adaffb4b +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_06-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4546b516bce769b8e8f5b29a1a6c708d1ceaf226 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7fcc91994c18d7a4ab7967dcc0f4585fb36b51416d7896869f1ebf0a7e232f1 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_07-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..536d14eb223816ee77a23eef366efbd1d316c042 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cff96d5efad2a18ddada4af609f5193a68e7a970acba4fc3c0a9722fc119fc2e +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_08-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32aeb5fa5e0d2c6b0e981821ad2ccdb244b99f41 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02220706041fb26bd371385f14fddfb1c4e4a3ea615af78651486cafe9771319 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_09-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dce779f7aa975c3fd73541998194331c9422bc2d --- /dev/null +++ b/83m1b51b5/global_step2891/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885de8ded3ca1f094596132fa5a47bd07241b4c3dbf0a921f7d01975979765f0 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_10-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d95bda8cb5101831ebfa479118559801f64411be --- /dev/null +++ b/83m1b51b5/global_step2891/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bcb6bf60357e430006d48660a8650c40f5f9f1284323660c0e065492027c047 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_11-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0af0a958afd3f2f637b36f1b127d15a2013f433f --- /dev/null +++ b/83m1b51b5/global_step2891/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d014810e17c31f6a858a60f72f7f11e7837a9c340c57d70117474f05bff448f +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_12-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a04f788aa89616d341d0dd559b0a03508051e73 --- /dev/null +++ b/83m1b51b5/global_step2891/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9497c021c6a8f4849b496afdb3a46373525d5f61762462bcb914c072707ef5 +size 9851395 diff --git a/83m1b51b5/global_step2891/layer_14-model_00-model_states.pt b/83m1b51b5/global_step2891/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..100986e20fde0fd665eb2ca0c18725672d753f7d --- /dev/null +++ b/83m1b51b5/global_step2891/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ec3abf8c4016b73319309ce1d721acd268559cb9d1d7ffafcaf6b617bb4dae8 +size 3779 diff --git a/83m1b51b5/global_step2891/mp_rank_00_model_states.pt b/83m1b51b5/global_step2891/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9891029bbea2c1947d144106542a2fa65674047 --- /dev/null +++ b/83m1b51b5/global_step2891/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864f14461c26a812632313d3083bd8e79506017722e53d5e5bfc8cb7782d88e7 +size 31603 diff --git a/83m1b51b5/sbatch_83m1b51b5.sh b/83m1b51b5/sbatch_83m1b51b5.sh new file mode 100644 index 0000000000000000000000000000000000000000..ec5947cb2029ef2e5e6378304d95c3f2a2cf11b7 --- /dev/null +++ b/83m1b51b5/sbatch_83m1b51b5.sh @@ -0,0 +1,168 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m1b51b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=16 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 1516071000 +# -> Samples: 740269 +TRAIN_SAMPLES=740_269 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 7403 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m1b51b5/sbatch_83m1b51b5val.sh b/83m1b51b5/sbatch_83m1b51b5val.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa268c96355dfa21f7aca03336fe1317adcf4f5f --- /dev/null +++ b/83m1b51b5/sbatch_83m1b51b5val.sh @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p small-g +#SBATCH -t 12:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m1b51b5val +VARIANT_CKPT=83m1b51b5 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train1b5.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-load-optim \ + --reset-progress \ + --override-lr-scheduler \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --eval-only true \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677499642.nid006687.9741.0 b/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677499642.nid006687.9741.0 new file mode 100644 index 0000000000000000000000000000000000000000..de8a16dbc10200603f6499c0ba662d7f53f46ee2 --- /dev/null +++ b/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677499642.nid006687.9741.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7004c7325aebc5fd3f0a7079cb8faf34d9663fbdad10aa101eacb970966fd051 +size 40 diff --git a/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677500137.nid005252.129714.0 b/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677500137.nid005252.129714.0 new file mode 100644 index 0000000000000000000000000000000000000000..49759d54b6b45f5633650221666d9ceb295da4a6 --- /dev/null +++ b/83m1b51b5/tensorboard_83m1b51b5/events.out.tfevents.1677500137.nid005252.129714.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce937eed5efae6ecc72e42c5dab4066f0286d2e55b827e7593e8b0f230c13e0 +size 5151990 diff --git a/83m1b51b5/tensorboard_83m1b51b5val/events.out.tfevents.1677509741.nid007495.101562.0 b/83m1b51b5/tensorboard_83m1b51b5val/events.out.tfevents.1677509741.nid007495.101562.0 new file mode 100644 index 0000000000000000000000000000000000000000..82490c9bfb875514b83ac4fdfd0b6932eee7ad21 --- /dev/null +++ b/83m1b51b5/tensorboard_83m1b51b5val/events.out.tfevents.1677509741.nid007495.101562.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44356a69d50e06faf078bf77afd27800bc10fc82be8487c18da724cc2fd35c88 +size 980 diff --git a/83m20b400m/3318389.err b/83m20b400m/3318389.err new file mode 100644 index 0000000000000000000000000000000000000000..8fca112901ed89b6040bd4049ecc8530e41d77af --- /dev/null +++ b/83m20b400m/3318389.err @@ -0,0 +1,1124 @@ +6: 2023-03-15 21:55:58.274940: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274945: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274951: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274961: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274958: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274965: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274967: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-15 21:55:58.274969: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.309996: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.309999: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.309999: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.310013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.310007: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.310003: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.310008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-15 21:55:58.310006: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348254: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348242: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348254: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348248: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348256: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348263: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348241: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:58.348250: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382723: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382728: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382734: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382743: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382740: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-15 21:55:58.382753: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: 2023-03-15 21:55:58.403353: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403356: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403359: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403367: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403363: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-15 21:55:58.403393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403363: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-15 21:55:58.403365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: 2023-03-15 21:55:58.403393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407033: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407040: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407047: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407038: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407038: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407049: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407051: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-15 21:55:58.407048: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479759: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479761: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479777: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-15 21:55:58.479776: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-15 21:55:59.975693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975710: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975710: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.975715: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:55:59.976124: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976127: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976129: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976131: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976134: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976135: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976135: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:55:59.976140: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.001622: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001620: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001631: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001642: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001645: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.001636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:00.002095: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002099: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002098: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002102: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002102: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002105: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002106: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-15 21:56:00.002110: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.023973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023977: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023990: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023994: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.023987: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:00.024588: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024594: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024597: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024599: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024601: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024603: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024604: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-15 21:56:00.024606: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049358: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049341: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049354: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049590: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049353: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049379: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-15 21:56:00.049737: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049741: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049742: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049746: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049748: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:00.049597: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049599: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049601: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049603: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049753: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049756: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-15 21:56:00.049757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049607: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049610: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-15 21:56:00.049611: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196532: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196527: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196519: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:00.196941: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196945: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196949: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196952: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196954: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196956: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196954: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-15 21:56:00.196959: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209359: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:00.209769: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209773: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209779: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209777: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209782: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209782: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209784: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-15 21:56:00.209789: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236114: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236130: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:00.236522: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236524: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236527: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236527: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236530: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236532: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236537: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-15 21:56:00.236539: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-15 21:56:03.184390: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184399: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184405: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184409: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.184419: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186100: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186095: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186108: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186110: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186114: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186115: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186115: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-15 21:56:03.186163: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-15 21:56:03.186164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.207617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207623: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207633: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207632: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.207634: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208222: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208222: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208228: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208227: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208225: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.208230: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209805: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209812: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209812: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209812: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209819: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209829: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209829: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209832: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209834: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209834: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209840: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-15 21:56:03.209848: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-15 21:56:03.209855: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.250217: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250223: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250225: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250231: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250233: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250238: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250238: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.250243: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210114: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210121: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210130: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210124: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210134: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210133: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210134: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210141: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-15 21:56:03.210147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-15 21:56:03.210161: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-15 21:56:03.252415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252416: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252417: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252418: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252419: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252419: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252421: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-15 21:56:03.252422: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.324242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324249: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324252: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324252: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324258: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324261: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324267: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.324269: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325818: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325822: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325822: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325834: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325835: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325836: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325838: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325838: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325839: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325868: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325869: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-15 21:56:03.325881: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-15 21:56:03.325882: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.435393: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.435412: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437015: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-15 21:56:03.437028: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437032: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437033: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437034: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437037: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437036: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437039: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-15 21:56:03.437040: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.483850: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483859: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483859: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483860: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.483870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485786: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485790: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485790: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485799: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485799: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485799: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485806: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485807: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485808: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485815: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-15 21:56:03.485843: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-15 21:56:03.485856: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.567351: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567357: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.567372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569226: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569228: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569228: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569230: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569231: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569233: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569241: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569243: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569243: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569246: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569247: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569248: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569270: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569272: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-15 21:56:03.569283: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-15 21:56:03.569285: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_upper_triang_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +4: Successfully preprocessed all matching files. +5: Successfully preprocessed all matching files. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +1: Building extension module utils... +1: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +1: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +1: Building extension module utils... +1: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +6: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils...Loading extension module utils... +6: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils...Loading extension module utils... +2: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: Loading extension module utils...Loading extension module utils... +5: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils...Loading extension module utils... +3: +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/83m20b400m/3318389.out b/83m20b400m/3318389.out new file mode 100644 index 0000000000000000000000000000000000000000..44ece52a853811c96a4ce4258518a0e0c852631f --- /dev/null +++ b/83m20b400m/3318389.out @@ -0,0 +1,15716 @@ +Model parameters: d_model 640 ffw_size 2560 kv_size 64 n_heads 10 n_layers 10 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 10 --hidden-size 640 --num-attention-heads 10 --kv-channels 64 --ffn-hidden-size 2560 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 9_703_701 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-83m20b400m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 9_703_701 --lr-warmup-samples 97_037 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_83m20b400m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_83m20b400m --load checkpoints_83m20b400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3318389.json --zero-stage 0 +START 3318389: Wed 15 Mar 2023 09:55:38 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 40.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 48.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 31.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 81.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 45.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 46.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 43.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 48.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 44.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 39.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +2: Launching on nid005526 (2/8), master nid005524 port 9999, GPUs 8, CUDA: True +3: Launching on nid005527 (3/8), master nid005524 port 9999, GPUs 8, CUDA: True +5: Launching on nid005529 (5/8), master nid005524 port 9999, GPUs 8, CUDA: True +0: Launching on nid005524 (0/8), master nid005524 port 9999, GPUs 8, CUDA: True +1: Launching on nid005525 (1/8), master nid005524 port 9999, GPUs 8, CUDA: True +6: Launching on nid005530 (6/8), master nid005524 port 9999, GPUs 8, CUDA: True +7: Launching on nid005531 (7/8), master nid005524 port 9999, GPUs 8, CUDA: True +4: Launching on nid005528 (4/8), master nid005524 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3318389.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2560 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 640 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-83m20b400m +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_83m20b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 9703701 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 97037 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 10 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_83m20b400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_83m20b400m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 9703701 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +7: > setting tensorboard ... +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-15 21:56:17,330] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.085 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 87 +0: ninja: no work to do. +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 63 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so +0: >>> done with compiling and loading fused kernels. Compilation time: 18.858 seconds +0: time to initialize megatron (seconds): -8.720 +0: [after megatron is initialized] datetime: 2023-03-15 21:56:39 +0: building GPT model ... +0: [2023-03-15 21:56:39,420] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-15 21:56:39,421] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-15 21:56:39,421] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.14 GB, percent = 6.6% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-15 21:56:41,408] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=17 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: undo +0: 14: MixedFusedLayerNorm +0: 15: EmbeddingPipe +0: 16: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-15 21:56:41,595] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-15 21:56:41,596] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-15 21:56:41,596] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.16 GB, percent = 6.6% +0: setting training iterations to 37905 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-15 21:56:41,597] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-15 21:56:54,742] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-15 21:56:54,743] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-15 21:56:54,743] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-15 21:56:54,745] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-15 21:56:54,745] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-15 21:56:54,864] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-15 21:56:54,864] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-15 21:56:54,865] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.84 GB, percent = 6.7% +1: ninja: no work to do. +1: Time to load utils op: 0.11241340637207031 secondsTime to load utils op: 0.11224007606506348 seconds +1: Time to load utils op: 0.11227107048034668 secondsTime to load utils op: 0.1123189926147461 seconds +1: +1: +1: Time to load utils op: 0.24545931816101074 seconds +1: Time to load utils op: 0.0005512237548828125 seconds +1: Time to load utils op: 0.0005097389221191406 seconds +1: Time to load utils op: 0.0005645751953125 seconds +1: Time to load utils op: 0.0005598068237304688 seconds +1: Time to load utils op: 0.0006654262542724609 seconds +1: ninja: no work to do. +1: Time to load utils op: 0.16608595848083496 seconds +0: Time to load utils op: 0.31093454360961914 seconds +1: Time to load utils op: 0.00060272216796875 seconds +0: Time to load utils op: 0.3031630516052246 seconds +0: Time to load utils op: 0.3021988868713379 seconds +0: Time to load utils op: 0.3024311065673828 seconds +0: Time to load utils op: 0.30272984504699707 seconds +0: Time to load utils op: 0.30255818367004395 seconds +0: Time to load utils op: 0.3028082847595215 seconds +0: Time to load utils op: 0.30298829078674316 seconds +1: Time to load utils op: 0.20364665985107422 seconds +1: Time to load utils op: 0.2037525177001953 seconds +4: Time to load utils op: 0.21210241317749023 seconds +4: Time to load utils op: 0.21103167533874512 seconds +4: Time to load utils op: 0.211381196975708 seconds +4: Time to load utils op: 0.21208667755126953 secondsTime to load utils op: 0.2116687297821045 seconds +4: Time to load utils op: 0.21205449104309082 seconds +4: Time to load utils op: 0.21092963218688965 seconds +4: +4: Time to load utils op: 0.21191835403442383 seconds +6: Time to load utils op: 0.2086949348449707 seconds +6: Time to load utils op: 0.20875310897827148 seconds +6: Time to load utils op: 0.20873093605041504 seconds +6: Time to load utils op: 0.20812726020812988 secondsTime to load utils op: 0.2081623077392578 seconds +6: +6: Time to load utils op: 0.2086033821105957 seconds +6: Time to load utils op: 0.20817923545837402 seconds +5: Time to load utils op: 0.2128005027770996 seconds +5: Time to load utils op: 0.2134561538696289 seconds +5: Time to load utils op: 0.2137298583984375 seconds +5: Time to load utils op: 0.21285223960876465 seconds +5: Time to load utils op: 0.21373581886291504 seconds +5: Time to load utils op: 0.21303987503051758 secondsTime to load utils op: 0.21350312232971191 seconds +5: +5: Time to load utils op: 0.2131352424621582 seconds +2: Time to load utils op: 0.21236824989318848 seconds +2: Time to load utils op: 0.21242928504943848 seconds +2: Time to load utils op: 0.21240830421447754 seconds +2: Time to load utils op: 0.21241116523742676 seconds +2: Time to load utils op: 0.2124195098876953 seconds +2: Time to load utils op: 0.2124347686767578 secondsTime to load utils op: 0.21242332458496094 secondsTime to load utils op: 0.21242523193359375 seconds +2: +2: +3: Time to load utils op: 0.21283340454101562 seconds +3: Time to load utils op: 0.21283507347106934 secondsTime to load utils op: 0.2128307819366455 seconds +3: Time to load utils op: 0.21287035942077637 secondsTime to load utils op: 0.21283626556396484 seconds +3: +3: Time to load utils op: 0.21283984184265137 seconds +3: Time to load utils op: 0.21283388137817383 secondsTime to load utils op: 0.21284174919128418 seconds +3: +3: +7: Time to load utils op: 0.21094083786010742 secondsTime to load utils op: 0.21099615097045898 seconds +7: +7: Time to load utils op: 0.21099519729614258 seconds +7: Time to load utils op: 0.21100234985351562 secondsTime to load utils op: 0.21100068092346191 seconds +7: +7: Time to load utils op: 0.21101880073547363 secondsTime to load utils op: 0.210984468460083 seconds +7: Time to load utils op: 0.2109842300415039 seconds +7: +1: Time to load utils op: 0.0003695487976074219 seconds +1: Time to load utils op: 0.0004627704620361328 seconds +0: Time to load utils op: 0.0006041526794433594 seconds +0: Time to load utils op: 0.0005724430084228516 seconds +0: Time to load utils op: 0.0006196498870849609 seconds +0: Time to load utils op: 0.0005888938903808594 seconds +0: Time to load utils op: 0.0004425048828125 seconds +0: Time to load utils op: 0.0005853176116943359 seconds +0: Time to load utils op: 0.0004818439483642578 seconds +4: Time to load utils op: 0.0006144046783447266 seconds +4: Time to load utils op: 0.0008330345153808594 seconds +4: Time to load utils op: 0.0009374618530273438 seconds +4: Time to load utils op: 0.001007080078125 seconds +4: Time to load utils op: 0.0010867118835449219 seconds +4: Time to load utils op: 0.0011906623840332031 seconds +4: Time to load utils op: 0.0012485980987548828 seconds +4: Time to load utils op: 0.001081705093383789 seconds +6: Time to load utils op: 0.0004868507385253906 seconds +6: Time to load utils op: 0.0004112720489501953 seconds +2: Time to load utils op: 0.0007560253143310547 seconds +6: Time to load utils op: 0.0004100799560546875 secondsTime to load utils op: 0.0004138946533203125 seconds +6: +6: Time to load utils op: 0.0005319118499755859 seconds +6: Time to load utils op: 0.0005166530609130859 seconds +2: Time to load utils op: 0.0007920265197753906 secondsTime to load utils op: 0.0007932186126708984 seconds +2: +6: Time to load utils op: 0.0005402565002441406 seconds +2: Time to load utils op: 0.0008614063262939453 seconds +2: Time to load utils op: 0.0010609626770019531 seconds +2: Time to load utils op: 0.0010857582092285156 seconds +2: Time to load utils op: 0.00102996826171875 seconds +5: Time to load utils op: 0.0008373260498046875 seconds +2: Time to load utils op: 0.0011022090911865234 seconds +6: Time to load utils op: 0.5042719841003418 seconds +5: Time to load utils op: 0.0008449554443359375 seconds +5: Time to load utils op: 0.0009083747863769531 seconds +5: Time to load utils op: 0.001165628433227539 seconds +5: Time to load utils op: 0.0011630058288574219 seconds +5: Time to load utils op: 0.0011496543884277344 secondsTime to load utils op: 0.0009500980377197266 seconds +5: +5: Time to load utils op: 0.0012178421020507812 seconds +3: Time to load utils op: 0.0009694099426269531 seconds +3: Time to load utils op: 0.0010373592376708984 seconds +3: Time to load utils op: 0.0011439323425292969 seconds +3: Time to load utils op: 0.0012476444244384766 secondsTime to load utils op: 0.0012524127960205078 seconds +3: +3: Time to load utils op: 0.001354217529296875 seconds +3: Time to load utils op: 0.0013208389282226562 seconds +3: Time to load utils op: 0.001333475112915039 seconds +7: Time to load utils op: 0.0010528564453125 seconds +7: Time to load utils op: 0.0012009143829345703 seconds +7: Time to load utils op: 0.001317739486694336 seconds +7: Time to load utils op: 0.0013318061828613281 seconds +7: Time to load utils op: 0.001176595687866211 seconds +7: Time to load utils op: 0.0013408660888671875 seconds +7: Time to load utils op: 0.0013077259063720703 seconds +7: Time to load utils op: 0.0013852119445800781 seconds +6: Time to load utils op: 0.0004572868347167969 seconds +0: [2023-03-15 21:56:55,299] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-15 21:56:55,300] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-15 21:56:55,301] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,428] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-15 21:56:55,429] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-15 21:56:55,430] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,536] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-15 21:56:55,537] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-15 21:56:55,538] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,644] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-15 21:56:55,645] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:55,645] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,745] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-15 21:56:55,746] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:55,746] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,848] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-15 21:56:55,849] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:55,849] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:55,949] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-15 21:56:55,949] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:55,950] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:56,054] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-15 21:56:56,055] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:56,055] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:56,156] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-15 21:56:56,156] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-15 21:56:56,156] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 33.98 GB, percent = 6.8% +0: [2023-03-15 21:56:56,157] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-15 21:56:56,157] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-15 21:56:56,157] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-15 21:56:56,157] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-15 21:56:56,157] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-15 21:56:56,157] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-15 21:56:56,157] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-15 21:56:56,157] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-15 21:56:56,158] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-15 21:56:56,159] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-15 21:56:56,159] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004215240478515625 seconds +0: [2023-03-15 21:56:56,160] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-15 21:56:56,210] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=82741760 (82.742M) TOTAL_PARAMS=82741760 (82.742M) UNIQUE_PARAMS=82741760 (82.742M) +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: WARNING: could not find the metadata file checkpoints_83m20b400m +0: will not load any checkpoints and will start from random +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +0: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +4: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +6: [2023-03-15 21:56:56,218] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +5: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +2: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +3: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +1: [2023-03-15 21:56:56,219] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_83m20b400m/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. +7: time (ms) | load-checkpoint: 9.26 +0: estimated model parameters: 0.08274176 +0: estimated model parameters without embeddings: 0.04923648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-15 21:56:56 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 9703701 +0: validation: 9728 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.007387 seconds +0: number of documents: 835726 +0: > dataset split: +0: train: +0: document indices in [0, 835726) total of 835726 documents +0: > WARNING: could not find index map files, building the indices on rank 0 ... +0: > last epoch number of samples (143775) is smaller than 95.0% of number of samples per epoch (195100), setting separate_last_epoch to True +0: > elasped time to build and save doc-idx mapping (seconds): 2.488707 +0: using: +0: number of documents: 835726 +0: number of epochs: 50 +0: sequence length: 2048 +0: total number of samples: 9755027 +0: > elasped time to build and save sample-idx mapping (seconds): 0.221381 +0: > building shuffle index with split [0, 9559926) and [9559926, 9755027) ... +0: > elasped time to build and save shuffle-idx mapping (seconds): 0.261740 +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_9703701ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_9703701ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_9703701ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.015 seconds +0: total number of samples: 9755028 +0: total number of epochs: 50 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.045141 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_9728ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_9728ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_9728ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.078 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-15 21:57:12 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 17096.93 | train/valid/test-data-iterators-setup: 16100.83 +0: [000-000] 0.0827B / 0.0492B +0: [before the start of training step] datetime: 2023-03-15 21:57:12 +0: [2023-03-15 21:57:13,188] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-15 21:57:13,188] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-15 21:57:13,188] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-15 21:57:13,188] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-15 21:57:13,188] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 2227.14013671875 | max allocated: 4719.5146484375 | reserved: 5998.0 | max reserved: 5998.0 +7: iteration 10/ 37905 | consumed samples: 2560 | consumed tokens: 5242880 | elapsed time per iteration (s): 1.12 | learning rate: 5.276E-06 | global batch size: 256 | lm loss: 1.083670E+01 | grad norm: 6.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 229.534 | TFLOPs: 5.85 | +7: iteration 20/ 37905 | consumed samples: 5120 | consumed tokens: 10485760 | elapsed time per iteration (s): 0.25 | learning rate: 1.055E-05 | global batch size: 256 | lm loss: 1.017269E+01 | grad norm: 2.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1039.725 | TFLOPs: 26.49 | +7: iteration 30/ 37905 | consumed samples: 7680 | consumed tokens: 15728640 | elapsed time per iteration (s): 0.24 | learning rate: 1.583E-05 | global batch size: 256 | lm loss: 9.807629E+00 | grad norm: 1.959 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1069.225 | TFLOPs: 27.24 | +7: iteration 40/ 37905 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 0.24 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 9.580672E+00 | grad norm: 1.860 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1048.197 | TFLOPs: 26.70 | +7: iteration 50/ 37905 | consumed samples: 12800 | consumed tokens: 26214400 | elapsed time per iteration (s): 0.24 | learning rate: 2.638E-05 | global batch size: 256 | lm loss: 9.319089E+00 | grad norm: 1.671 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1066.495 | TFLOPs: 27.17 | +7: iteration 60/ 37905 | consumed samples: 15360 | consumed tokens: 31457280 | elapsed time per iteration (s): 0.24 | learning rate: 3.166E-05 | global batch size: 256 | lm loss: 9.032787E+00 | grad norm: 1.709 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1087.092 | TFLOPs: 27.69 | +7: iteration 70/ 37905 | consumed samples: 17920 | consumed tokens: 36700160 | elapsed time per iteration (s): 0.24 | learning rate: 3.693E-05 | global batch size: 256 | lm loss: 8.742216E+00 | grad norm: 1.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1074.177 | TFLOPs: 27.36 | +7: iteration 80/ 37905 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 0.23 | learning rate: 4.221E-05 | global batch size: 256 | lm loss: 8.456736E+00 | grad norm: 1.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.756 | TFLOPs: 28.25 | +7: iteration 90/ 37905 | consumed samples: 23040 | consumed tokens: 47185920 | elapsed time per iteration (s): 0.24 | learning rate: 4.749E-05 | global batch size: 256 | lm loss: 8.165895E+00 | grad norm: 1.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1074.631 | TFLOPs: 27.38 | +7: iteration 100/ 37905 | consumed samples: 25600 | consumed tokens: 52428800 | elapsed time per iteration (s): 0.24 | learning rate: 5.276E-05 | global batch size: 256 | lm loss: 7.886219E+00 | grad norm: 1.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1049.638 | TFLOPs: 26.74 | +7: iteration 110/ 37905 | consumed samples: 28160 | consumed tokens: 57671680 | elapsed time per iteration (s): 0.23 | learning rate: 5.804E-05 | global batch size: 256 | lm loss: 7.636875E+00 | grad norm: 1.031 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.804 | TFLOPs: 28.17 | +7: iteration 120/ 37905 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 0.23 | learning rate: 6.332E-05 | global batch size: 256 | lm loss: 7.423003E+00 | grad norm: 0.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.213 | TFLOPs: 28.28 | +7: iteration 130/ 37905 | consumed samples: 33280 | consumed tokens: 68157440 | elapsed time per iteration (s): 0.23 | learning rate: 6.859E-05 | global batch size: 256 | lm loss: 7.244942E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.196 | TFLOPs: 28.03 | +7: iteration 140/ 37905 | consumed samples: 35840 | consumed tokens: 73400320 | elapsed time per iteration (s): 0.24 | learning rate: 7.387E-05 | global batch size: 256 | lm loss: 7.137174E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1087.983 | TFLOPs: 27.72 | +7: iteration 150/ 37905 | consumed samples: 38400 | consumed tokens: 78643200 | elapsed time per iteration (s): 0.23 | learning rate: 7.915E-05 | global batch size: 256 | lm loss: 7.042148E+00 | grad norm: 0.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.341 | TFLOPs: 28.49 | +7: iteration 160/ 37905 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 0.24 | learning rate: 8.442E-05 | global batch size: 256 | lm loss: 6.940608E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1052.787 | TFLOPs: 26.82 | +7: iteration 170/ 37905 | consumed samples: 43520 | consumed tokens: 89128960 | elapsed time per iteration (s): 0.23 | learning rate: 8.970E-05 | global batch size: 256 | lm loss: 6.862914E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.592 | TFLOPs: 28.65 | +7: iteration 180/ 37905 | consumed samples: 46080 | consumed tokens: 94371840 | elapsed time per iteration (s): 0.23 | learning rate: 9.497E-05 | global batch size: 256 | lm loss: 6.785056E+00 | grad norm: 0.574 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1102.002 | TFLOPs: 28.07 | +7: iteration 190/ 37905 | consumed samples: 48640 | consumed tokens: 99614720 | elapsed time per iteration (s): 0.24 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 6.704742E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1064.459 | TFLOPs: 27.12 | +7: iteration 200/ 37905 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 0.24 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 6.669518E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1073.999 | TFLOPs: 27.36 | +7: iteration 210/ 37905 | consumed samples: 53760 | consumed tokens: 110100480 | elapsed time per iteration (s): 0.24 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 6.618465E+00 | grad norm: 0.692 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1046.345 | TFLOPs: 26.66 | +7: iteration 220/ 37905 | consumed samples: 56320 | consumed tokens: 115343360 | elapsed time per iteration (s): 0.23 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 6.546831E+00 | grad norm: 0.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.740 | TFLOPs: 28.04 | +7: iteration 230/ 37905 | consumed samples: 58880 | consumed tokens: 120586240 | elapsed time per iteration (s): 0.23 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 6.516463E+00 | grad norm: 1.100 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.280 | TFLOPs: 28.44 | +7: iteration 240/ 37905 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 0.23 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 6.492381E+00 | grad norm: 0.470 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.551 | TFLOPs: 28.29 | +7: iteration 250/ 37905 | consumed samples: 64000 | consumed tokens: 131072000 | elapsed time per iteration (s): 0.24 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 6.454444E+00 | grad norm: 0.686 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1074.963 | TFLOPs: 27.38 | +7: iteration 260/ 37905 | consumed samples: 66560 | consumed tokens: 136314880 | elapsed time per iteration (s): 0.23 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 6.415411E+00 | grad norm: 0.654 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.815 | TFLOPs: 28.50 | +7: iteration 270/ 37905 | consumed samples: 69120 | consumed tokens: 141557760 | elapsed time per iteration (s): 0.23 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 6.394322E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.292 | TFLOPs: 28.77 | +7: iteration 280/ 37905 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 0.23 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 6.366209E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.815 | TFLOPs: 28.30 | +7: iteration 290/ 37905 | consumed samples: 74240 | consumed tokens: 152043520 | elapsed time per iteration (s): 0.22 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 6.324459E+00 | grad norm: 0.533 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.492 | TFLOPs: 29.08 | +7: iteration 300/ 37905 | consumed samples: 76800 | consumed tokens: 157286400 | elapsed time per iteration (s): 0.24 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 6.297155E+00 | grad norm: 0.738 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1072.071 | TFLOPs: 27.31 | +7: iteration 310/ 37905 | consumed samples: 79360 | consumed tokens: 162529280 | elapsed time per iteration (s): 0.23 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 6.276912E+00 | grad norm: 0.813 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.788 | TFLOPs: 28.96 | +7: iteration 320/ 37905 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 0.23 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 6.273842E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1092.577 | TFLOPs: 27.83 | +7: iteration 330/ 37905 | consumed samples: 84480 | consumed tokens: 173015040 | elapsed time per iteration (s): 0.23 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 6.237569E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1094.638 | TFLOPs: 27.89 | +7: iteration 340/ 37905 | consumed samples: 87040 | consumed tokens: 178257920 | elapsed time per iteration (s): 0.22 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 6.207568E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.162 | TFLOPs: 29.10 | +7: iteration 350/ 37905 | consumed samples: 89600 | consumed tokens: 183500800 | elapsed time per iteration (s): 0.23 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 6.181374E+00 | grad norm: 0.500 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.185 | TFLOPs: 28.89 | +7: iteration 360/ 37905 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 0.23 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 6.158475E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.923 | TFLOPs: 28.43 | +7: iteration 370/ 37905 | consumed samples: 94720 | consumed tokens: 193986560 | elapsed time per iteration (s): 0.23 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 6.141027E+00 | grad norm: 1.066 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.624 | TFLOPs: 28.52 | +7: iteration 380/ 37905 | consumed samples: 97280 | consumed tokens: 199229440 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.124451E+00 | grad norm: 0.777 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.519 | TFLOPs: 28.34 | +7: iteration 390/ 37905 | consumed samples: 99840 | consumed tokens: 204472320 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.108446E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.201 | TFLOPs: 28.44 | +7: iteration 400/ 37905 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.075313E+00 | grad norm: 0.758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.151 | TFLOPs: 28.13 | +7: iteration 410/ 37905 | consumed samples: 104960 | consumed tokens: 214958080 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.052757E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.822 | TFLOPs: 28.15 | +7: iteration 420/ 37905 | consumed samples: 107520 | consumed tokens: 220200960 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.021589E+00 | grad norm: 0.944 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1093.901 | TFLOPs: 27.87 | +7: iteration 430/ 37905 | consumed samples: 110080 | consumed tokens: 225443840 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 6.015988E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1084.220 | TFLOPs: 27.62 | +7: iteration 440/ 37905 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.981266E+00 | grad norm: 1.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.008 | TFLOPs: 28.35 | +7: iteration 450/ 37905 | consumed samples: 115200 | consumed tokens: 235929600 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.950665E+00 | grad norm: 0.978 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.720 | TFLOPs: 28.22 | +7: iteration 460/ 37905 | consumed samples: 117760 | consumed tokens: 241172480 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.937405E+00 | grad norm: 0.688 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.221 | TFLOPs: 28.31 | +7: iteration 470/ 37905 | consumed samples: 120320 | consumed tokens: 246415360 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.895083E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.950 | TFLOPs: 28.84 | +7: iteration 480/ 37905 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.883459E+00 | grad norm: 0.571 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.902 | TFLOPs: 28.76 | +7: iteration 490/ 37905 | consumed samples: 125440 | consumed tokens: 256901120 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.869415E+00 | grad norm: 0.862 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.114 | TFLOPs: 28.54 | +7: iteration 500/ 37905 | consumed samples: 128000 | consumed tokens: 262144000 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.847033E+00 | grad norm: 0.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.649 | TFLOPs: 28.80 | +7: iteration 510/ 37905 | consumed samples: 130560 | consumed tokens: 267386880 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.830613E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1098.680 | TFLOPs: 27.99 | +7: iteration 520/ 37905 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.794568E+00 | grad norm: 1.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1072.695 | TFLOPs: 27.33 | +7: iteration 530/ 37905 | consumed samples: 135680 | consumed tokens: 277872640 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.783559E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1086.342 | TFLOPs: 27.67 | +7: iteration 540/ 37905 | consumed samples: 138240 | consumed tokens: 283115520 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.741978E+00 | grad norm: 1.098 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1049.708 | TFLOPs: 26.74 | +7: iteration 550/ 37905 | consumed samples: 140800 | consumed tokens: 288358400 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.735179E+00 | grad norm: 0.865 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.245 | TFLOPs: 29.10 | +7: iteration 560/ 37905 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.715572E+00 | grad norm: 1.091 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.608 | TFLOPs: 28.27 | +7: iteration 570/ 37905 | consumed samples: 145920 | consumed tokens: 298844160 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.694568E+00 | grad norm: 0.783 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1158.080 | TFLOPs: 29.50 | +7: iteration 580/ 37905 | consumed samples: 148480 | consumed tokens: 304087040 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.692747E+00 | grad norm: 0.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.195 | TFLOPs: 28.64 | +7: iteration 590/ 37905 | consumed samples: 151040 | consumed tokens: 309329920 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.676124E+00 | grad norm: 0.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.668 | TFLOPs: 28.55 | +7: iteration 600/ 37905 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.641177E+00 | grad norm: 0.763 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1091.713 | TFLOPs: 27.81 | +7: iteration 610/ 37905 | consumed samples: 156160 | consumed tokens: 319815680 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.611765E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.372 | TFLOPs: 28.77 | +7: iteration 620/ 37905 | consumed samples: 158720 | consumed tokens: 325058560 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.592076E+00 | grad norm: 0.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.458 | TFLOPs: 28.98 | +7: iteration 630/ 37905 | consumed samples: 161280 | consumed tokens: 330301440 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.581807E+00 | grad norm: 0.779 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.712 | TFLOPs: 28.63 | +7: iteration 640/ 37905 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.558252E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.221 | TFLOPs: 28.61 | +7: iteration 650/ 37905 | consumed samples: 166400 | consumed tokens: 340787200 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.550424E+00 | grad norm: 0.899 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.749 | TFLOPs: 28.22 | +7: iteration 660/ 37905 | consumed samples: 168960 | consumed tokens: 346030080 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.514875E+00 | grad norm: 0.625 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.470 | TFLOPs: 28.82 | +7: iteration 670/ 37905 | consumed samples: 171520 | consumed tokens: 351272960 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.512341E+00 | grad norm: 1.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.904 | TFLOPs: 29.17 | +7: iteration 680/ 37905 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.503383E+00 | grad norm: 0.939 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.250 | TFLOPs: 29.20 | +7: iteration 690/ 37905 | consumed samples: 176640 | consumed tokens: 361758720 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.495254E+00 | grad norm: 0.725 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1074.410 | TFLOPs: 27.37 | +7: iteration 700/ 37905 | consumed samples: 179200 | consumed tokens: 367001600 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.455674E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.388 | TFLOPs: 28.90 | +7: iteration 710/ 37905 | consumed samples: 181760 | consumed tokens: 372244480 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.450623E+00 | grad norm: 0.755 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.373 | TFLOPs: 28.49 | +7: iteration 720/ 37905 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.432937E+00 | grad norm: 0.980 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1095.413 | TFLOPs: 27.91 | +7: iteration 730/ 37905 | consumed samples: 186880 | consumed tokens: 382730240 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.415879E+00 | grad norm: 0.834 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.655 | TFLOPs: 28.63 | +7: iteration 740/ 37905 | consumed samples: 189440 | consumed tokens: 387973120 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.376398E+00 | grad norm: 0.948 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.614 | TFLOPs: 28.04 | +7: iteration 750/ 37905 | consumed samples: 192000 | consumed tokens: 393216000 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.385703E+00 | grad norm: 0.684 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.976 | TFLOPs: 28.68 | +7: iteration 760/ 37905 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 0.24 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.365536E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1088.550 | TFLOPs: 27.73 | +7: iteration 770/ 37905 | consumed samples: 197120 | consumed tokens: 403701760 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-04 | global batch size: 256 | lm loss: 5.343258E+00 | grad norm: 0.650 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.667 | TFLOPs: 28.88 | +7: iteration 780/ 37905 | consumed samples: 199680 | consumed tokens: 408944640 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.315247E+00 | grad norm: 0.778 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1101.075 | TFLOPs: 28.05 | +7: iteration 790/ 37905 | consumed samples: 202240 | consumed tokens: 414187520 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.308436E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1103.961 | TFLOPs: 28.12 | +7: iteration 800/ 37905 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.296307E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.488 | TFLOPs: 28.16 | +7: iteration 810/ 37905 | consumed samples: 207360 | consumed tokens: 424673280 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.283355E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.225 | TFLOPs: 28.56 | +7: iteration 820/ 37905 | consumed samples: 209920 | consumed tokens: 429916160 | elapsed time per iteration (s): 0.24 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.261154E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1075.493 | TFLOPs: 27.40 | +7: iteration 830/ 37905 | consumed samples: 212480 | consumed tokens: 435159040 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.248159E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.633 | TFLOPs: 28.14 | +7: iteration 840/ 37905 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.235538E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.273 | TFLOPs: 28.59 | +7: iteration 850/ 37905 | consumed samples: 217600 | consumed tokens: 445644800 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.225422E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.626 | TFLOPs: 28.27 | +7: iteration 860/ 37905 | consumed samples: 220160 | consumed tokens: 450887680 | elapsed time per iteration (s): 0.24 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.224110E+00 | grad norm: 0.876 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1086.453 | TFLOPs: 27.68 | +7: iteration 870/ 37905 | consumed samples: 222720 | consumed tokens: 456130560 | elapsed time per iteration (s): 0.22 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.228340E+00 | grad norm: 1.060 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.320 | TFLOPs: 29.02 | +7: iteration 880/ 37905 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.216127E+00 | grad norm: 0.658 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.150 | TFLOPs: 28.18 | +7: iteration 890/ 37905 | consumed samples: 227840 | consumed tokens: 466616320 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.180418E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.583 | TFLOPs: 28.47 | +7: iteration 900/ 37905 | consumed samples: 230400 | consumed tokens: 471859200 | elapsed time per iteration (s): 0.22 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.167649E+00 | grad norm: 0.601 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.535 | TFLOPs: 29.26 | +7: iteration 910/ 37905 | consumed samples: 232960 | consumed tokens: 477102080 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.143647E+00 | grad norm: 0.622 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1102.746 | TFLOPs: 28.09 | +7: iteration 920/ 37905 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.135431E+00 | grad norm: 0.687 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.145 | TFLOPs: 28.28 | +7: iteration 930/ 37905 | consumed samples: 238080 | consumed tokens: 487587840 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.120846E+00 | grad norm: 0.746 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.327 | TFLOPs: 28.90 | +7: iteration 940/ 37905 | consumed samples: 240640 | consumed tokens: 492830720 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.123615E+00 | grad norm: 0.925 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.003 | TFLOPs: 28.79 | +7: iteration 950/ 37905 | consumed samples: 243200 | consumed tokens: 498073600 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.114190E+00 | grad norm: 0.664 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.711 | TFLOPs: 28.60 | +7: iteration 960/ 37905 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.094154E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.057 | TFLOPs: 28.30 | +7: iteration 970/ 37905 | consumed samples: 248320 | consumed tokens: 508559360 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.075079E+00 | grad norm: 1.064 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1093.263 | TFLOPs: 27.85 | +7: iteration 980/ 37905 | consumed samples: 250880 | consumed tokens: 513802240 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.071307E+00 | grad norm: 0.807 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.708 | TFLOPs: 28.75 | +7: iteration 990/ 37905 | consumed samples: 253440 | consumed tokens: 519045120 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.050685E+00 | grad norm: 0.651 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.118 | TFLOPs: 28.15 | +7: iteration 1000/ 37905 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.027954E+00 | grad norm: 0.481 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.213 | TFLOPs: 28.84 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 1000 | lm loss value: 4.931033E+00 | lm loss PPL: 1.385226E+02 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 1000 to checkpoints_83m20b400m +0: [2023-03-15 22:01:13,283] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! +0: [2023-03-15 22:01:13,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:01:13,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:01:13,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:01:13,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:01:13,428] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:01:13,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:01:13,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:01:13,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:01:13,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:01:13,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:01:13,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:01:13,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:01:13,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:01:13,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:01:13,484] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:01:13,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:01:13,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:01:13,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:01:13,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:01:13,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:01:13,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:01:13,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:01:13,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:01:13,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:01:13,530] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step1000/mp_rank_00_model_states.pt +0: [2023-03-15 22:01:13,530] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:01:13,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:01:13,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:01:13,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:01:13,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +2: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +1: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +5: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:01:13,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +4: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:01:13,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +0: successfully saved checkpoint at iteration 1000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 299.82 +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +6: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:01:13,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:01:13,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! +7: iteration 1010/ 37905 | consumed samples: 258560 | consumed tokens: 529530880 | elapsed time per iteration (s): 0.26 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.030133E+00 | grad norm: 0.548 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 983.520 | TFLOPs: 25.06 | +7: iteration 1020/ 37905 | consumed samples: 261120 | consumed tokens: 534773760 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.013237E+00 | grad norm: 0.843 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.699 | TFLOPs: 28.37 | +7: iteration 1030/ 37905 | consumed samples: 263680 | consumed tokens: 540016640 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 5.010295E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.894 | TFLOPs: 28.55 | +7: iteration 1040/ 37905 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 0.23 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.990858E+00 | grad norm: 0.719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.890 | TFLOPs: 28.96 | +7: iteration 1050/ 37905 | consumed samples: 268800 | consumed tokens: 550502400 | elapsed time per iteration (s): 0.22 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.967235E+00 | grad norm: 0.730 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.426 | TFLOPs: 29.03 | +7: iteration 1060/ 37905 | consumed samples: 271360 | consumed tokens: 555745280 | elapsed time per iteration (s): 0.22 | learning rate: 1.999E-04 | global batch size: 256 | lm loss: 4.948669E+00 | grad norm: 0.753 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.126 | TFLOPs: 28.99 | +7: iteration 1070/ 37905 | consumed samples: 273920 | consumed tokens: 560988160 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.958039E+00 | grad norm: 0.729 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.017 | TFLOPs: 28.63 | +7: iteration 1080/ 37905 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.922709E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.418 | TFLOPs: 28.95 | +7: iteration 1090/ 37905 | consumed samples: 279040 | consumed tokens: 571473920 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.925859E+00 | grad norm: 0.757 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.887 | TFLOPs: 29.40 | +7: iteration 1100/ 37905 | consumed samples: 281600 | consumed tokens: 576716800 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.921746E+00 | grad norm: 0.588 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.137 | TFLOPs: 28.94 | +7: iteration 1110/ 37905 | consumed samples: 284160 | consumed tokens: 581959680 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.902637E+00 | grad norm: 0.585 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.987 | TFLOPs: 28.74 | +7: iteration 1120/ 37905 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.887336E+00 | grad norm: 0.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.850 | TFLOPs: 28.60 | +7: iteration 1130/ 37905 | consumed samples: 289280 | consumed tokens: 592445440 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.879362E+00 | grad norm: 0.761 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.215 | TFLOPs: 29.25 | +7: iteration 1140/ 37905 | consumed samples: 291840 | consumed tokens: 597688320 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.882837E+00 | grad norm: 0.828 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.314 | TFLOPs: 28.79 | +7: iteration 1150/ 37905 | consumed samples: 294400 | consumed tokens: 602931200 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.869481E+00 | grad norm: 0.726 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.942 | TFLOPs: 28.68 | +7: iteration 1160/ 37905 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.849191E+00 | grad norm: 0.795 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.909 | TFLOPs: 29.32 | +7: iteration 1170/ 37905 | consumed samples: 299520 | consumed tokens: 613416960 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.837354E+00 | grad norm: 0.638 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.298 | TFLOPs: 29.15 | +7: iteration 1180/ 37905 | consumed samples: 302080 | consumed tokens: 618659840 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.847112E+00 | grad norm: 0.751 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.436 | TFLOPs: 29.03 | +7: iteration 1190/ 37905 | consumed samples: 304640 | consumed tokens: 623902720 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.820013E+00 | grad norm: 0.703 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.862 | TFLOPs: 29.06 | +7: iteration 1200/ 37905 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.822704E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.101 | TFLOPs: 29.12 | +7: iteration 1210/ 37905 | consumed samples: 309760 | consumed tokens: 634388480 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.805293E+00 | grad norm: 0.562 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.063 | TFLOPs: 28.92 | +7: iteration 1220/ 37905 | consumed samples: 312320 | consumed tokens: 639631360 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.793204E+00 | grad norm: 0.946 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.903 | TFLOPs: 29.27 | +7: iteration 1230/ 37905 | consumed samples: 314880 | consumed tokens: 644874240 | elapsed time per iteration (s): 0.22 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.807935E+00 | grad norm: 0.811 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.848 | TFLOPs: 29.29 | +7: iteration 1240/ 37905 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.791220E+00 | grad norm: 0.690 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.404 | TFLOPs: 28.59 | +7: iteration 1250/ 37905 | consumed samples: 320000 | consumed tokens: 655360000 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.776680E+00 | grad norm: 0.680 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.096 | TFLOPs: 28.69 | +7: iteration 1260/ 37905 | consumed samples: 322560 | consumed tokens: 660602880 | elapsed time per iteration (s): 0.23 | learning rate: 1.998E-04 | global batch size: 256 | lm loss: 4.769843E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.530 | TFLOPs: 28.95 | +7: iteration 1270/ 37905 | consumed samples: 325120 | consumed tokens: 665845760 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.772070E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.105 | TFLOPs: 28.97 | +7: iteration 1280/ 37905 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.746735E+00 | grad norm: 0.790 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.352 | TFLOPs: 28.77 | +7: iteration 1290/ 37905 | consumed samples: 330240 | consumed tokens: 676331520 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.750860E+00 | grad norm: 0.741 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.979 | TFLOPs: 29.45 | +7: iteration 1300/ 37905 | consumed samples: 332800 | consumed tokens: 681574400 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.740641E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.931 | TFLOPs: 28.99 | +7: iteration 1310/ 37905 | consumed samples: 335360 | consumed tokens: 686817280 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.731256E+00 | grad norm: 0.531 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.149 | TFLOPs: 28.99 | +7: iteration 1320/ 37905 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.733314E+00 | grad norm: 0.773 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.330 | TFLOPs: 28.59 | +7: iteration 1330/ 37905 | consumed samples: 340480 | consumed tokens: 697303040 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.736125E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.822 | TFLOPs: 29.06 | +7: iteration 1340/ 37905 | consumed samples: 343040 | consumed tokens: 702545920 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.716637E+00 | grad norm: 0.707 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.931 | TFLOPs: 29.42 | +7: iteration 1350/ 37905 | consumed samples: 345600 | consumed tokens: 707788800 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.712139E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.605 | TFLOPs: 28.75 | +7: iteration 1360/ 37905 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.701163E+00 | grad norm: 0.616 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1101.151 | TFLOPs: 28.05 | +7: iteration 1370/ 37905 | consumed samples: 350720 | consumed tokens: 718274560 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.692302E+00 | grad norm: 0.599 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.545 | TFLOPs: 29.39 | +7: iteration 1380/ 37905 | consumed samples: 353280 | consumed tokens: 723517440 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.702615E+00 | grad norm: 0.578 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.448 | TFLOPs: 29.03 | +7: iteration 1390/ 37905 | consumed samples: 355840 | consumed tokens: 728760320 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.684325E+00 | grad norm: 0.598 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.124 | TFLOPs: 29.10 | +7: iteration 1400/ 37905 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.682600E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.220 | TFLOPs: 29.40 | +7: iteration 1410/ 37905 | consumed samples: 360960 | consumed tokens: 739246080 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.666287E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.211 | TFLOPs: 29.07 | +7: iteration 1420/ 37905 | consumed samples: 363520 | consumed tokens: 744488960 | elapsed time per iteration (s): 0.22 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.664254E+00 | grad norm: 0.603 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.003 | TFLOPs: 29.45 | +7: iteration 1430/ 37905 | consumed samples: 366080 | consumed tokens: 749731840 | elapsed time per iteration (s): 0.23 | learning rate: 1.997E-04 | global batch size: 256 | lm loss: 4.663700E+00 | grad norm: 0.663 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.944 | TFLOPs: 28.61 | +7: iteration 1440/ 37905 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.682574E+00 | grad norm: 0.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.373 | TFLOPs: 29.13 | +7: iteration 1450/ 37905 | consumed samples: 371200 | consumed tokens: 760217600 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.655141E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.570 | TFLOPs: 28.93 | +7: iteration 1460/ 37905 | consumed samples: 373760 | consumed tokens: 765460480 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.655849E+00 | grad norm: 0.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.323 | TFLOPs: 29.10 | +7: iteration 1470/ 37905 | consumed samples: 376320 | consumed tokens: 770703360 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.636879E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.992 | TFLOPs: 28.61 | +7: iteration 1480/ 37905 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.643461E+00 | grad norm: 0.647 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.218 | TFLOPs: 29.25 | +7: iteration 1490/ 37905 | consumed samples: 381440 | consumed tokens: 781189120 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.635341E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.794 | TFLOPs: 28.35 | +7: iteration 1500/ 37905 | consumed samples: 384000 | consumed tokens: 786432000 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.624498E+00 | grad norm: 0.534 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.392 | TFLOPs: 28.75 | +7: iteration 1510/ 37905 | consumed samples: 386560 | consumed tokens: 791674880 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.614385E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.195 | TFLOPs: 28.13 | +7: iteration 1520/ 37905 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.623706E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.317 | TFLOPs: 28.21 | +7: iteration 1530/ 37905 | consumed samples: 391680 | consumed tokens: 802160640 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.624720E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.843 | TFLOPs: 28.76 | +7: iteration 1540/ 37905 | consumed samples: 394240 | consumed tokens: 807403520 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.614523E+00 | grad norm: 0.565 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.424 | TFLOPs: 29.28 | +7: iteration 1550/ 37905 | consumed samples: 396800 | consumed tokens: 812646400 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.622631E+00 | grad norm: 0.771 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.880 | TFLOPs: 29.29 | +7: iteration 1560/ 37905 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 0.22 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.609277E+00 | grad norm: 0.515 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.514 | TFLOPs: 29.23 | +7: iteration 1570/ 37905 | consumed samples: 401920 | consumed tokens: 823132160 | elapsed time per iteration (s): 0.23 | learning rate: 1.996E-04 | global batch size: 256 | lm loss: 4.592915E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.274 | TFLOPs: 28.90 | +7: iteration 1580/ 37905 | consumed samples: 404480 | consumed tokens: 828375040 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.594596E+00 | grad norm: 0.630 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.798 | TFLOPs: 29.47 | +7: iteration 1590/ 37905 | consumed samples: 407040 | consumed tokens: 833617920 | elapsed time per iteration (s): 0.23 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.584727E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.393 | TFLOPs: 28.95 | +7: iteration 1600/ 37905 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.597144E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.579 | TFLOPs: 29.11 | +7: iteration 1610/ 37905 | consumed samples: 412160 | consumed tokens: 844103680 | elapsed time per iteration (s): 0.23 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.584182E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.200 | TFLOPs: 28.31 | +7: iteration 1620/ 37905 | consumed samples: 414720 | consumed tokens: 849346560 | elapsed time per iteration (s): 0.23 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.591137E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.148 | TFLOPs: 28.92 | +7: iteration 1630/ 37905 | consumed samples: 417280 | consumed tokens: 854589440 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.574801E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.650 | TFLOPs: 29.13 | +7: iteration 1640/ 37905 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.574819E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.816 | TFLOPs: 29.34 | +7: iteration 1650/ 37905 | consumed samples: 422400 | consumed tokens: 865075200 | elapsed time per iteration (s): 0.23 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.555800E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.709 | TFLOPs: 28.96 | +7: iteration 1660/ 37905 | consumed samples: 424960 | consumed tokens: 870318080 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.558211E+00 | grad norm: 0.737 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.780 | TFLOPs: 29.32 | +7: iteration 1670/ 37905 | consumed samples: 427520 | consumed tokens: 875560960 | elapsed time per iteration (s): 0.23 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.548636E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.145 | TFLOPs: 28.54 | +7: iteration 1680/ 37905 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.565816E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.743 | TFLOPs: 29.04 | +7: iteration 1690/ 37905 | consumed samples: 432640 | consumed tokens: 886046720 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.546888E+00 | grad norm: 0.510 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.699 | TFLOPs: 29.08 | +7: iteration 1700/ 37905 | consumed samples: 435200 | consumed tokens: 891289600 | elapsed time per iteration (s): 0.22 | learning rate: 1.995E-04 | global batch size: 256 | lm loss: 4.557291E+00 | grad norm: 0.635 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.015 | TFLOPs: 29.19 | +7: iteration 1710/ 37905 | consumed samples: 437760 | consumed tokens: 896532480 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.537750E+00 | grad norm: 0.660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.744 | TFLOPs: 28.83 | +7: iteration 1720/ 37905 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 0.22 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.540603E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.593 | TFLOPs: 29.26 | +7: iteration 1730/ 37905 | consumed samples: 442880 | consumed tokens: 907018240 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.535907E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.232 | TFLOPs: 28.92 | +7: iteration 1740/ 37905 | consumed samples: 445440 | consumed tokens: 912261120 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.548719E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.905 | TFLOPs: 28.05 | +7: iteration 1750/ 37905 | consumed samples: 448000 | consumed tokens: 917504000 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.526238E+00 | grad norm: 0.715 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.400 | TFLOPs: 28.67 | +7: iteration 1760/ 37905 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.541762E+00 | grad norm: 0.607 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.153 | TFLOPs: 28.46 | +7: iteration 1770/ 37905 | consumed samples: 453120 | consumed tokens: 927989760 | elapsed time per iteration (s): 0.22 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.540829E+00 | grad norm: 0.564 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.956 | TFLOPs: 29.09 | +7: iteration 1780/ 37905 | consumed samples: 455680 | consumed tokens: 933232640 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.508240E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.616 | TFLOPs: 28.60 | +7: iteration 1790/ 37905 | consumed samples: 458240 | consumed tokens: 938475520 | elapsed time per iteration (s): 0.22 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.495466E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.257 | TFLOPs: 29.18 | +7: iteration 1800/ 37905 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 0.23 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.499306E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.147 | TFLOPs: 28.84 | +7: iteration 1810/ 37905 | consumed samples: 463360 | consumed tokens: 948961280 | elapsed time per iteration (s): 0.22 | learning rate: 1.994E-04 | global batch size: 256 | lm loss: 4.501743E+00 | grad norm: 0.619 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.146 | TFLOPs: 29.27 | +7: iteration 1820/ 37905 | consumed samples: 465920 | consumed tokens: 954204160 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.510410E+00 | grad norm: 0.800 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.615 | TFLOPs: 29.39 | +7: iteration 1830/ 37905 | consumed samples: 468480 | consumed tokens: 959447040 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.493966E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.570 | TFLOPs: 29.11 | +7: iteration 1840/ 37905 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.501603E+00 | grad norm: 0.600 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.202 | TFLOPs: 29.15 | +7: iteration 1850/ 37905 | consumed samples: 473600 | consumed tokens: 969932800 | elapsed time per iteration (s): 0.23 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.494566E+00 | grad norm: 0.568 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.941 | TFLOPs: 28.56 | +7: iteration 1860/ 37905 | consumed samples: 476160 | consumed tokens: 975175680 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.497011E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.290 | TFLOPs: 29.41 | +7: iteration 1870/ 37905 | consumed samples: 478720 | consumed tokens: 980418560 | elapsed time per iteration (s): 0.23 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.499256E+00 | grad norm: 0.535 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.962 | TFLOPs: 28.89 | +7: iteration 1880/ 37905 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.496732E+00 | grad norm: 0.633 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.107 | TFLOPs: 29.27 | +7: iteration 1890/ 37905 | consumed samples: 483840 | consumed tokens: 990904320 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.471655E+00 | grad norm: 0.667 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.873 | TFLOPs: 29.04 | +7: iteration 1900/ 37905 | consumed samples: 486400 | consumed tokens: 996147200 | elapsed time per iteration (s): 0.23 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.480313E+00 | grad norm: 0.516 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.476 | TFLOPs: 28.80 | +7: iteration 1910/ 37905 | consumed samples: 488960 | consumed tokens: 1001390080 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.474668E+00 | grad norm: 0.673 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.666 | TFLOPs: 29.03 | +7: iteration 1920/ 37905 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 0.22 | learning rate: 1.993E-04 | global batch size: 256 | lm loss: 4.478395E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.697 | TFLOPs: 29.14 | +7: iteration 1930/ 37905 | consumed samples: 494080 | consumed tokens: 1011875840 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.482597E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.675 | TFLOPs: 29.14 | +7: iteration 1940/ 37905 | consumed samples: 496640 | consumed tokens: 1017118720 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.480304E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.274 | TFLOPs: 29.10 | +7: iteration 1950/ 37905 | consumed samples: 499200 | consumed tokens: 1022361600 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.466273E+00 | grad norm: 0.672 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.952 | TFLOPs: 29.04 | +7: iteration 1960/ 37905 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 0.23 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.471249E+00 | grad norm: 0.586 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.403 | TFLOPs: 28.47 | +7: iteration 1970/ 37905 | consumed samples: 504320 | consumed tokens: 1032847360 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.446387E+00 | grad norm: 0.547 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.207 | TFLOPs: 29.07 | +7: iteration 1980/ 37905 | consumed samples: 506880 | consumed tokens: 1038090240 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.472006E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.332 | TFLOPs: 29.43 | +7: iteration 1990/ 37905 | consumed samples: 509440 | consumed tokens: 1043333120 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.458263E+00 | grad norm: 0.583 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.174 | TFLOPs: 29.05 | +0: [2023-03-15 22:04:58,800] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00019917259290531361, 0.00019917259290531361, 0.00019917259290531361], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 2000/ 37905 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 0.22 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.452668E+00 | grad norm: 0.632 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.464 | TFLOPs: 29.00 | +0: steps: 2000 loss: 4.4714 iter time (s): 0.231 samples/sec: 1108.064 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 2000 | lm loss value: 4.326741E+00 | lm loss PPL: 7.569717E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 2000 to checkpoints_83m20b400m +0: [2023-03-15 22:04:58,888] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! +0: [2023-03-15 22:04:58,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:04:58,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:04:58,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:04:58,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:04:58,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:04:58,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:04:58,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:04:58,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:04:58,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:04:59,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:04:59,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:04:59,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:04:59,021] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:04:59,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:04:59,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:04:59,043] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:04:59,043] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:04:59,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:04:59,054] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:04:59,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:04:59,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:04:59,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:04:59,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:04:59,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:04:59,078] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step2000/mp_rank_00_model_states.pt +0: [2023-03-15 22:04:59,078] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:04:59,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:04:59,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:04:59,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:04:59,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:04:59,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:04:59,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +4: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +2: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +1: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +3: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:04:59,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +6: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +7: [2023-03-15 22:04:59,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:04:59,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:04:59,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-15 22:04:59,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:04:59,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-15 22:04:59,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +5: [2023-03-15 22:04:59,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,157] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: [2023-03-15 22:04:59,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:04:59,163] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:04:59,163] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! +0: successfully saved checkpoint at iteration 2000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 278.75 +7: iteration 2010/ 37905 | consumed samples: 514560 | consumed tokens: 1053818880 | elapsed time per iteration (s): 0.26 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.450465E+00 | grad norm: 0.581 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 983.095 | TFLOPs: 25.04 | +7: iteration 2020/ 37905 | consumed samples: 517120 | consumed tokens: 1059061760 | elapsed time per iteration (s): 0.23 | learning rate: 1.992E-04 | global batch size: 256 | lm loss: 4.437001E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.548 | TFLOPs: 28.62 | +7: iteration 2030/ 37905 | consumed samples: 519680 | consumed tokens: 1064304640 | elapsed time per iteration (s): 0.23 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.446391E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.795 | TFLOPs: 28.81 | +7: iteration 2040/ 37905 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 0.22 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.452684E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.976 | TFLOPs: 29.12 | +7: iteration 2050/ 37905 | consumed samples: 524800 | consumed tokens: 1074790400 | elapsed time per iteration (s): 0.22 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.449673E+00 | grad norm: 0.815 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.266 | TFLOPs: 29.46 | +7: iteration 2060/ 37905 | consumed samples: 527360 | consumed tokens: 1080033280 | elapsed time per iteration (s): 0.22 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.453643E+00 | grad norm: 0.596 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.191 | TFLOPs: 29.15 | +7: iteration 2070/ 37905 | consumed samples: 529920 | consumed tokens: 1085276160 | elapsed time per iteration (s): 0.23 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.453635E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.807 | TFLOPs: 28.63 | +7: iteration 2080/ 37905 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 0.23 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.440795E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.162 | TFLOPs: 28.15 | +7: iteration 2090/ 37905 | consumed samples: 535040 | consumed tokens: 1095761920 | elapsed time per iteration (s): 0.22 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.438361E+00 | grad norm: 0.502 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.388 | TFLOPs: 29.43 | +7: iteration 2100/ 37905 | consumed samples: 537600 | consumed tokens: 1101004800 | elapsed time per iteration (s): 0.23 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.418536E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.866 | TFLOPs: 28.86 | +7: iteration 2110/ 37905 | consumed samples: 540160 | consumed tokens: 1106247680 | elapsed time per iteration (s): 0.22 | learning rate: 1.991E-04 | global batch size: 256 | lm loss: 4.405057E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.981 | TFLOPs: 29.09 | +7: iteration 2120/ 37905 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 0.23 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.428956E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.613 | TFLOPs: 28.73 | +7: iteration 2130/ 37905 | consumed samples: 545280 | consumed tokens: 1116733440 | elapsed time per iteration (s): 0.23 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.418066E+00 | grad norm: 0.659 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.236 | TFLOPs: 28.36 | +7: iteration 2140/ 37905 | consumed samples: 547840 | consumed tokens: 1121976320 | elapsed time per iteration (s): 0.22 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.432791E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.007 | TFLOPs: 29.12 | +7: iteration 2150/ 37905 | consumed samples: 550400 | consumed tokens: 1127219200 | elapsed time per iteration (s): 0.23 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.430492E+00 | grad norm: 0.569 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.834 | TFLOPs: 28.88 | +7: iteration 2160/ 37905 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 0.22 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.419345E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.698 | TFLOPs: 29.06 | +7: iteration 2170/ 37905 | consumed samples: 555520 | consumed tokens: 1137704960 | elapsed time per iteration (s): 0.23 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.416404E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.351 | TFLOPs: 28.95 | +7: iteration 2180/ 37905 | consumed samples: 558080 | consumed tokens: 1142947840 | elapsed time per iteration (s): 0.23 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.414737E+00 | grad norm: 0.611 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.510 | TFLOPs: 28.44 | +7: iteration 2190/ 37905 | consumed samples: 560640 | consumed tokens: 1148190720 | elapsed time per iteration (s): 0.22 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.402277E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.379 | TFLOPs: 29.00 | +7: iteration 2200/ 37905 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 0.22 | learning rate: 1.990E-04 | global batch size: 256 | lm loss: 4.410326E+00 | grad norm: 0.437 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.472 | TFLOPs: 29.16 | +7: iteration 2210/ 37905 | consumed samples: 565760 | consumed tokens: 1158676480 | elapsed time per iteration (s): 0.22 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.402198E+00 | grad norm: 0.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.353 | TFLOPs: 29.03 | +7: iteration 2220/ 37905 | consumed samples: 568320 | consumed tokens: 1163919360 | elapsed time per iteration (s): 0.22 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.406293E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.860 | TFLOPs: 29.19 | +7: iteration 2230/ 37905 | consumed samples: 570880 | consumed tokens: 1169162240 | elapsed time per iteration (s): 0.23 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.398306E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.111 | TFLOPs: 28.82 | +7: iteration 2240/ 37905 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 0.23 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.408975E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.111 | TFLOPs: 28.51 | +7: iteration 2250/ 37905 | consumed samples: 576000 | consumed tokens: 1179648000 | elapsed time per iteration (s): 0.22 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.391634E+00 | grad norm: 0.573 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.532 | TFLOPs: 29.34 | +7: iteration 2260/ 37905 | consumed samples: 578560 | consumed tokens: 1184890880 | elapsed time per iteration (s): 0.22 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.393900E+00 | grad norm: 0.551 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.281 | TFLOPs: 29.10 | +7: iteration 2270/ 37905 | consumed samples: 581120 | consumed tokens: 1190133760 | elapsed time per iteration (s): 0.23 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.400688E+00 | grad norm: 0.820 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.930 | TFLOPs: 28.38 | +7: iteration 2280/ 37905 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 0.23 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.401357E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.828 | TFLOPs: 28.66 | +7: iteration 2290/ 37905 | consumed samples: 586240 | consumed tokens: 1200619520 | elapsed time per iteration (s): 0.23 | learning rate: 1.989E-04 | global batch size: 256 | lm loss: 4.410697E+00 | grad norm: 0.678 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.834 | TFLOPs: 28.91 | +7: iteration 2300/ 37905 | consumed samples: 588800 | consumed tokens: 1205862400 | elapsed time per iteration (s): 0.22 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.382485E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.978 | TFLOPs: 29.35 | +7: iteration 2310/ 37905 | consumed samples: 591360 | consumed tokens: 1211105280 | elapsed time per iteration (s): 0.22 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.379309E+00 | grad norm: 0.512 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.491 | TFLOPs: 29.05 | +7: iteration 2320/ 37905 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 0.22 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.380706E+00 | grad norm: 0.542 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.091 | TFLOPs: 29.43 | +7: iteration 2330/ 37905 | consumed samples: 596480 | consumed tokens: 1221591040 | elapsed time per iteration (s): 0.23 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.369021E+00 | grad norm: 0.722 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.678 | TFLOPs: 28.88 | +7: iteration 2340/ 37905 | consumed samples: 599040 | consumed tokens: 1226833920 | elapsed time per iteration (s): 0.22 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.373204E+00 | grad norm: 0.576 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.630 | TFLOPs: 29.21 | +7: iteration 2350/ 37905 | consumed samples: 601600 | consumed tokens: 1232076800 | elapsed time per iteration (s): 0.23 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.378789E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.273 | TFLOPs: 28.87 | +7: iteration 2360/ 37905 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 0.22 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.380368E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.057 | TFLOPs: 29.37 | +7: iteration 2370/ 37905 | consumed samples: 606720 | consumed tokens: 1242562560 | elapsed time per iteration (s): 0.23 | learning rate: 1.988E-04 | global batch size: 256 | lm loss: 4.368618E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.141 | TFLOPs: 28.87 | +7: iteration 2380/ 37905 | consumed samples: 609280 | consumed tokens: 1247805440 | elapsed time per iteration (s): 0.23 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.377790E+00 | grad norm: 0.747 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.702 | TFLOPs: 28.73 | +7: iteration 2390/ 37905 | consumed samples: 611840 | consumed tokens: 1253048320 | elapsed time per iteration (s): 0.23 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.369838E+00 | grad norm: 0.560 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.189 | TFLOPs: 28.79 | +7: iteration 2400/ 37905 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 0.22 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.373450E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.321 | TFLOPs: 29.43 | +7: iteration 2410/ 37905 | consumed samples: 616960 | consumed tokens: 1263534080 | elapsed time per iteration (s): 0.22 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.367044E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.558 | TFLOPs: 29.41 | +7: iteration 2420/ 37905 | consumed samples: 619520 | consumed tokens: 1268776960 | elapsed time per iteration (s): 0.23 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.363867E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.171 | TFLOPs: 28.97 | +7: iteration 2430/ 37905 | consumed samples: 622080 | consumed tokens: 1274019840 | elapsed time per iteration (s): 0.23 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.351334E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.506 | TFLOPs: 28.80 | +7: iteration 2440/ 37905 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 0.22 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.359116E+00 | grad norm: 0.735 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.048 | TFLOPs: 29.43 | +7: iteration 2450/ 37905 | consumed samples: 627200 | consumed tokens: 1284505600 | elapsed time per iteration (s): 0.22 | learning rate: 1.987E-04 | global batch size: 256 | lm loss: 4.354925E+00 | grad norm: 0.540 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.021 | TFLOPs: 29.37 | +7: iteration 2460/ 37905 | consumed samples: 629760 | consumed tokens: 1289748480 | elapsed time per iteration (s): 0.22 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.357458E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.552 | TFLOPs: 29.00 | +7: iteration 2470/ 37905 | consumed samples: 632320 | consumed tokens: 1294991360 | elapsed time per iteration (s): 0.23 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.355169E+00 | grad norm: 0.493 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.786 | TFLOPs: 28.91 | +7: iteration 2480/ 37905 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 0.22 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.341691E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.529 | TFLOPs: 29.08 | +7: iteration 2490/ 37905 | consumed samples: 637440 | consumed tokens: 1305477120 | elapsed time per iteration (s): 0.23 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.352808E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.222 | TFLOPs: 28.97 | +7: iteration 2500/ 37905 | consumed samples: 640000 | consumed tokens: 1310720000 | elapsed time per iteration (s): 0.22 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.346865E+00 | grad norm: 0.567 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.209 | TFLOPs: 29.33 | +7: iteration 2510/ 37905 | consumed samples: 642560 | consumed tokens: 1315962880 | elapsed time per iteration (s): 0.22 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.341171E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.775 | TFLOPs: 29.14 | +7: iteration 2520/ 37905 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 0.22 | learning rate: 1.986E-04 | global batch size: 256 | lm loss: 4.353244E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.172 | TFLOPs: 29.38 | +7: iteration 2530/ 37905 | consumed samples: 647680 | consumed tokens: 1326448640 | elapsed time per iteration (s): 0.23 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.348444E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.586 | TFLOPs: 28.62 | +7: iteration 2540/ 37905 | consumed samples: 650240 | consumed tokens: 1331691520 | elapsed time per iteration (s): 0.22 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.343108E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.960 | TFLOPs: 29.24 | +7: iteration 2550/ 37905 | consumed samples: 652800 | consumed tokens: 1336934400 | elapsed time per iteration (s): 0.22 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.334798E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.904 | TFLOPs: 29.17 | +7: iteration 2560/ 37905 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 0.22 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.330108E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.690 | TFLOPs: 29.14 | +7: iteration 2570/ 37905 | consumed samples: 657920 | consumed tokens: 1347420160 | elapsed time per iteration (s): 0.22 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.328258E+00 | grad norm: 0.432 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.429 | TFLOPs: 29.28 | +7: iteration 2580/ 37905 | consumed samples: 660480 | consumed tokens: 1352663040 | elapsed time per iteration (s): 0.23 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.334600E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.271 | TFLOPs: 28.56 | +7: iteration 2590/ 37905 | consumed samples: 663040 | consumed tokens: 1357905920 | elapsed time per iteration (s): 0.23 | learning rate: 1.985E-04 | global batch size: 256 | lm loss: 4.328511E+00 | grad norm: 0.550 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.308 | TFLOPs: 28.46 | +7: iteration 2600/ 37905 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.319664E+00 | grad norm: 0.501 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.887 | TFLOPs: 29.50 | +7: iteration 2610/ 37905 | consumed samples: 668160 | consumed tokens: 1368391680 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.317437E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.378 | TFLOPs: 29.31 | +7: iteration 2620/ 37905 | consumed samples: 670720 | consumed tokens: 1373634560 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.331112E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.967 | TFLOPs: 29.22 | +7: iteration 2630/ 37905 | consumed samples: 673280 | consumed tokens: 1378877440 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.340913E+00 | grad norm: 0.537 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.977 | TFLOPs: 29.22 | +7: iteration 2640/ 37905 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.325655E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.103 | TFLOPs: 29.04 | +7: iteration 2650/ 37905 | consumed samples: 678400 | consumed tokens: 1389363200 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.321187E+00 | grad norm: 0.544 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.520 | TFLOPs: 29.05 | +7: iteration 2660/ 37905 | consumed samples: 680960 | consumed tokens: 1394606080 | elapsed time per iteration (s): 0.22 | learning rate: 1.984E-04 | global batch size: 256 | lm loss: 4.317188E+00 | grad norm: 0.467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.490 | TFLOPs: 29.00 | +7: iteration 2670/ 37905 | consumed samples: 683520 | consumed tokens: 1399848960 | elapsed time per iteration (s): 0.22 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.322479E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.542 | TFLOPs: 29.18 | +7: iteration 2680/ 37905 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 0.22 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.311148E+00 | grad norm: 0.642 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.088 | TFLOPs: 29.20 | +7: iteration 2690/ 37905 | consumed samples: 688640 | consumed tokens: 1410334720 | elapsed time per iteration (s): 0.22 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.318692E+00 | grad norm: 0.477 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.319 | TFLOPs: 29.15 | +7: iteration 2700/ 37905 | consumed samples: 691200 | consumed tokens: 1415577600 | elapsed time per iteration (s): 0.23 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.311824E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.927 | TFLOPs: 28.91 | +7: iteration 2710/ 37905 | consumed samples: 693760 | consumed tokens: 1420820480 | elapsed time per iteration (s): 0.22 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.320660E+00 | grad norm: 0.503 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.428 | TFLOPs: 29.05 | +7: iteration 2720/ 37905 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 0.23 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.306575E+00 | grad norm: 0.554 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.974 | TFLOPs: 28.89 | +7: iteration 2730/ 37905 | consumed samples: 698880 | consumed tokens: 1431306240 | elapsed time per iteration (s): 0.23 | learning rate: 1.983E-04 | global batch size: 256 | lm loss: 4.303236E+00 | grad norm: 0.566 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.771 | TFLOPs: 28.86 | +7: iteration 2740/ 37905 | consumed samples: 701440 | consumed tokens: 1436549120 | elapsed time per iteration (s): 0.23 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.324173E+00 | grad norm: 0.767 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.746 | TFLOPs: 28.70 | +7: iteration 2750/ 37905 | consumed samples: 704000 | consumed tokens: 1441792000 | elapsed time per iteration (s): 0.22 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.314097E+00 | grad norm: 0.609 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.752 | TFLOPs: 29.21 | +7: iteration 2760/ 37905 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 0.22 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.318128E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.282 | TFLOPs: 29.15 | +7: iteration 2770/ 37905 | consumed samples: 709120 | consumed tokens: 1452277760 | elapsed time per iteration (s): 0.23 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.293235E+00 | grad norm: 0.494 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.291 | TFLOPs: 28.90 | +7: iteration 2780/ 37905 | consumed samples: 711680 | consumed tokens: 1457520640 | elapsed time per iteration (s): 0.22 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.297020E+00 | grad norm: 0.461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.122 | TFLOPs: 29.07 | +7: iteration 2790/ 37905 | consumed samples: 714240 | consumed tokens: 1462763520 | elapsed time per iteration (s): 0.22 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.299581E+00 | grad norm: 0.572 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.192 | TFLOPs: 29.20 | +7: iteration 2800/ 37905 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 0.22 | learning rate: 1.982E-04 | global batch size: 256 | lm loss: 4.296344E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.081 | TFLOPs: 29.37 | +7: iteration 2810/ 37905 | consumed samples: 719360 | consumed tokens: 1473249280 | elapsed time per iteration (s): 0.22 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.295065E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.299 | TFLOPs: 29.23 | +7: iteration 2820/ 37905 | consumed samples: 721920 | consumed tokens: 1478492160 | elapsed time per iteration (s): 0.22 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.297188E+00 | grad norm: 0.479 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.603 | TFLOPs: 29.31 | +7: iteration 2830/ 37905 | consumed samples: 724480 | consumed tokens: 1483735040 | elapsed time per iteration (s): 0.22 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.294061E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.470 | TFLOPs: 29.46 | +7: iteration 2840/ 37905 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 0.23 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.296048E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.585 | TFLOPs: 28.45 | +7: iteration 2850/ 37905 | consumed samples: 729600 | consumed tokens: 1494220800 | elapsed time per iteration (s): 0.22 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.296646E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.563 | TFLOPs: 29.13 | +7: iteration 2860/ 37905 | consumed samples: 732160 | consumed tokens: 1499463680 | elapsed time per iteration (s): 0.23 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.301092E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.093 | TFLOPs: 28.94 | +7: iteration 2870/ 37905 | consumed samples: 734720 | consumed tokens: 1504706560 | elapsed time per iteration (s): 0.22 | learning rate: 1.981E-04 | global batch size: 256 | lm loss: 4.294002E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.072 | TFLOPs: 29.20 | +7: iteration 2880/ 37905 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 0.22 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.287393E+00 | grad norm: 0.519 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.977 | TFLOPs: 29.02 | +7: iteration 2890/ 37905 | consumed samples: 739840 | consumed tokens: 1515192320 | elapsed time per iteration (s): 0.23 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.290034E+00 | grad norm: 0.543 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.263 | TFLOPs: 28.49 | +7: iteration 2900/ 37905 | consumed samples: 742400 | consumed tokens: 1520435200 | elapsed time per iteration (s): 0.22 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.289718E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.312 | TFLOPs: 29.10 | +7: iteration 2910/ 37905 | consumed samples: 744960 | consumed tokens: 1525678080 | elapsed time per iteration (s): 0.22 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.276403E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.159 | TFLOPs: 29.35 | +7: iteration 2920/ 37905 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 0.23 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.287043E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.841 | TFLOPs: 28.66 | +7: iteration 2930/ 37905 | consumed samples: 750080 | consumed tokens: 1536163840 | elapsed time per iteration (s): 0.23 | learning rate: 1.980E-04 | global batch size: 256 | lm loss: 4.265014E+00 | grad norm: 0.471 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.441 | TFLOPs: 28.72 | +7: iteration 2940/ 37905 | consumed samples: 752640 | consumed tokens: 1541406720 | elapsed time per iteration (s): 0.24 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.271587E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1064.877 | TFLOPs: 27.13 | +7: iteration 2950/ 37905 | consumed samples: 755200 | consumed tokens: 1546649600 | elapsed time per iteration (s): 0.22 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.258932E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.434 | TFLOPs: 29.36 | +7: iteration 2960/ 37905 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 0.22 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.270282E+00 | grad norm: 0.691 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.460 | TFLOPs: 29.03 | +7: iteration 2970/ 37905 | consumed samples: 760320 | consumed tokens: 1557135360 | elapsed time per iteration (s): 0.23 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.268787E+00 | grad norm: 0.465 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.890 | TFLOPs: 28.91 | +7: iteration 2980/ 37905 | consumed samples: 762880 | consumed tokens: 1562378240 | elapsed time per iteration (s): 0.22 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.275019E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.735 | TFLOPs: 29.44 | +7: iteration 2990/ 37905 | consumed samples: 765440 | consumed tokens: 1567621120 | elapsed time per iteration (s): 0.22 | learning rate: 1.979E-04 | global batch size: 256 | lm loss: 4.275319E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.668 | TFLOPs: 29.47 | +7: iteration 3000/ 37905 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 0.22 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.284168E+00 | grad norm: 0.496 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.704 | TFLOPs: 29.44 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 3000 | lm loss value: 4.159828E+00 | lm loss PPL: 6.406048E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 3000 to checkpoints_83m20b400m +0: [2023-03-15 22:08:44,050] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! +0: [2023-03-15 22:08:44,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:08:44,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:08:44,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:08:44,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:08:44,137] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:08:44,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:08:44,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:08:44,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:08:44,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:08:44,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:08:44,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:08:44,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:08:44,183] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:08:44,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:08:44,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:08:44,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:08:44,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:08:44,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:08:44,216] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:08:44,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:08:44,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:08:44,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:08:44,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:08:44,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:08:44,240] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step3000/mp_rank_00_model_states.pt +0: [2023-03-15 22:08:44,240] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:08:44,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:08:44,258] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:08:44,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:08:44,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:08:44,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,284] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,284] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,285] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,285] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +5: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +4: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +3: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +1: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:08:44,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:08:44,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +6: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:08:44,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:08:44,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-15 22:08:44,288] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-15 22:08:44,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +7: [2023-03-15 22:08:44,288] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! +0: successfully saved checkpoint at iteration 3000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 240.50 +7: iteration 3010/ 37905 | consumed samples: 770560 | consumed tokens: 1578106880 | elapsed time per iteration (s): 0.27 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.268792E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 958.834 | TFLOPs: 24.43 | +7: iteration 3020/ 37905 | consumed samples: 773120 | consumed tokens: 1583349760 | elapsed time per iteration (s): 0.22 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.257661E+00 | grad norm: 0.522 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.471 | TFLOPs: 29.44 | +7: iteration 3030/ 37905 | consumed samples: 775680 | consumed tokens: 1588592640 | elapsed time per iteration (s): 0.22 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.257388E+00 | grad norm: 0.517 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.205 | TFLOPs: 29.43 | +7: iteration 3040/ 37905 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 0.23 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.254527E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.264 | TFLOPs: 28.41 | +7: iteration 3050/ 37905 | consumed samples: 780800 | consumed tokens: 1599078400 | elapsed time per iteration (s): 0.23 | learning rate: 1.978E-04 | global batch size: 256 | lm loss: 4.257375E+00 | grad norm: 0.556 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.727 | TFLOPs: 28.73 | +7: iteration 3060/ 37905 | consumed samples: 783360 | consumed tokens: 1604321280 | elapsed time per iteration (s): 0.22 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.257104E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.760 | TFLOPs: 29.42 | +7: iteration 3070/ 37905 | consumed samples: 785920 | consumed tokens: 1609564160 | elapsed time per iteration (s): 0.24 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.264785E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1083.203 | TFLOPs: 27.59 | +7: iteration 3080/ 37905 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 0.23 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.265687E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.637 | TFLOPs: 28.70 | +7: iteration 3090/ 37905 | consumed samples: 791040 | consumed tokens: 1620049920 | elapsed time per iteration (s): 0.23 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.256187E+00 | grad norm: 0.450 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.029 | TFLOPs: 28.84 | +7: iteration 3100/ 37905 | consumed samples: 793600 | consumed tokens: 1625292800 | elapsed time per iteration (s): 0.22 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.267358E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.232 | TFLOPs: 29.02 | +7: iteration 3110/ 37905 | consumed samples: 796160 | consumed tokens: 1630535680 | elapsed time per iteration (s): 0.23 | learning rate: 1.977E-04 | global batch size: 256 | lm loss: 4.243521E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.624 | TFLOPs: 28.85 | +7: iteration 3120/ 37905 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 0.22 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.246337E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.943 | TFLOPs: 29.45 | +7: iteration 3130/ 37905 | consumed samples: 801280 | consumed tokens: 1641021440 | elapsed time per iteration (s): 0.22 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.245243E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.128 | TFLOPs: 29.45 | +7: iteration 3140/ 37905 | consumed samples: 803840 | consumed tokens: 1646264320 | elapsed time per iteration (s): 0.22 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.238573E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.194 | TFLOPs: 29.15 | +7: iteration 3150/ 37905 | consumed samples: 806400 | consumed tokens: 1651507200 | elapsed time per iteration (s): 0.23 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.236618E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.140 | TFLOPs: 28.71 | +7: iteration 3160/ 37905 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 0.22 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.238921E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.040 | TFLOPs: 29.25 | +7: iteration 3170/ 37905 | consumed samples: 811520 | consumed tokens: 1661992960 | elapsed time per iteration (s): 0.23 | learning rate: 1.976E-04 | global batch size: 256 | lm loss: 4.256159E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.250 | TFLOPs: 28.67 | +7: iteration 3180/ 37905 | consumed samples: 814080 | consumed tokens: 1667235840 | elapsed time per iteration (s): 0.22 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.240393E+00 | grad norm: 0.636 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.058 | TFLOPs: 29.45 | +7: iteration 3190/ 37905 | consumed samples: 816640 | consumed tokens: 1672478720 | elapsed time per iteration (s): 0.23 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.247688E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.918 | TFLOPs: 28.94 | +7: iteration 3200/ 37905 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 0.22 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.228506E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.142 | TFLOPs: 29.33 | +7: iteration 3210/ 37905 | consumed samples: 821760 | consumed tokens: 1682964480 | elapsed time per iteration (s): 0.22 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.232521E+00 | grad norm: 0.490 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.039 | TFLOPs: 29.25 | +7: iteration 3220/ 37905 | consumed samples: 824320 | consumed tokens: 1688207360 | elapsed time per iteration (s): 0.22 | learning rate: 1.975E-04 | global batch size: 256 | lm loss: 4.231715E+00 | grad norm: 0.475 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.307 | TFLOPs: 29.00 | +7: iteration 3230/ 37905 | consumed samples: 826880 | consumed tokens: 1693450240 | elapsed time per iteration (s): 0.23 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.242461E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.994 | TFLOPs: 28.66 | +7: iteration 3240/ 37905 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 0.22 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.239491E+00 | grad norm: 0.539 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.047 | TFLOPs: 29.02 | +7: iteration 3250/ 37905 | consumed samples: 832000 | consumed tokens: 1703936000 | elapsed time per iteration (s): 0.22 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.234603E+00 | grad norm: 0.529 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.102 | TFLOPs: 29.12 | +7: iteration 3260/ 37905 | consumed samples: 834560 | consumed tokens: 1709178880 | elapsed time per iteration (s): 0.22 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.235329E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.818 | TFLOPs: 29.29 | +7: iteration 3270/ 37905 | consumed samples: 837120 | consumed tokens: 1714421760 | elapsed time per iteration (s): 0.22 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.239950E+00 | grad norm: 0.417 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.082 | TFLOPs: 29.12 | +7: iteration 3280/ 37905 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 0.23 | learning rate: 1.974E-04 | global batch size: 256 | lm loss: 4.234569E+00 | grad norm: 0.528 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.664 | TFLOPs: 28.55 | +7: iteration 3290/ 37905 | consumed samples: 842240 | consumed tokens: 1724907520 | elapsed time per iteration (s): 0.23 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.231893E+00 | grad norm: 0.527 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.524 | TFLOPs: 28.55 | +7: iteration 3300/ 37905 | consumed samples: 844800 | consumed tokens: 1730150400 | elapsed time per iteration (s): 0.22 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.227441E+00 | grad norm: 0.624 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.548 | TFLOPs: 29.39 | +7: iteration 3310/ 37905 | consumed samples: 847360 | consumed tokens: 1735393280 | elapsed time per iteration (s): 0.22 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.224672E+00 | grad norm: 0.557 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.076 | TFLOPs: 29.04 | +7: iteration 3320/ 37905 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 0.22 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.228351E+00 | grad norm: 0.559 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.216 | TFLOPs: 29.43 | +7: iteration 3330/ 37905 | consumed samples: 852480 | consumed tokens: 1745879040 | elapsed time per iteration (s): 0.23 | learning rate: 1.973E-04 | global batch size: 256 | lm loss: 4.234261E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.670 | TFLOPs: 28.22 | +7: iteration 3340/ 37905 | consumed samples: 855040 | consumed tokens: 1751121920 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.214080E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.354 | TFLOPs: 29.05 | +7: iteration 3350/ 37905 | consumed samples: 857600 | consumed tokens: 1756364800 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.213986E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.382 | TFLOPs: 29.15 | +7: iteration 3360/ 37905 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.223432E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.082 | TFLOPs: 29.32 | +7: iteration 3370/ 37905 | consumed samples: 862720 | consumed tokens: 1766850560 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.216847E+00 | grad norm: 0.498 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.316 | TFLOPs: 29.43 | +7: iteration 3380/ 37905 | consumed samples: 865280 | consumed tokens: 1772093440 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.214106E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.675 | TFLOPs: 29.42 | +7: iteration 3390/ 37905 | consumed samples: 867840 | consumed tokens: 1777336320 | elapsed time per iteration (s): 0.22 | learning rate: 1.972E-04 | global batch size: 256 | lm loss: 4.214223E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.447 | TFLOPs: 29.41 | +7: iteration 3400/ 37905 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 0.22 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.208819E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.732 | TFLOPs: 29.42 | +7: iteration 3410/ 37905 | consumed samples: 872960 | consumed tokens: 1787822080 | elapsed time per iteration (s): 0.22 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.217411E+00 | grad norm: 0.513 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.223 | TFLOPs: 29.12 | +7: iteration 3420/ 37905 | consumed samples: 875520 | consumed tokens: 1793064960 | elapsed time per iteration (s): 0.23 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.223217E+00 | grad norm: 0.592 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.005 | TFLOPs: 28.53 | +7: iteration 3430/ 37905 | consumed samples: 878080 | consumed tokens: 1798307840 | elapsed time per iteration (s): 0.22 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.214928E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.692 | TFLOPs: 29.16 | +7: iteration 3440/ 37905 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 0.22 | learning rate: 1.971E-04 | global batch size: 256 | lm loss: 4.196350E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.939 | TFLOPs: 29.37 | +7: iteration 3450/ 37905 | consumed samples: 883200 | consumed tokens: 1808793600 | elapsed time per iteration (s): 0.23 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.220768E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.404 | TFLOPs: 28.98 | +7: iteration 3460/ 37905 | consumed samples: 885760 | consumed tokens: 1814036480 | elapsed time per iteration (s): 0.22 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.222431E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.461 | TFLOPs: 29.33 | +7: iteration 3470/ 37905 | consumed samples: 888320 | consumed tokens: 1819279360 | elapsed time per iteration (s): 0.22 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.213908E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.135 | TFLOPs: 29.38 | +7: iteration 3480/ 37905 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 0.22 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.192555E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.199 | TFLOPs: 29.12 | +7: iteration 3490/ 37905 | consumed samples: 893440 | consumed tokens: 1829765120 | elapsed time per iteration (s): 0.22 | learning rate: 1.970E-04 | global batch size: 256 | lm loss: 4.211906E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.279 | TFLOPs: 29.13 | +7: iteration 3500/ 37905 | consumed samples: 896000 | consumed tokens: 1835008000 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.205934E+00 | grad norm: 0.549 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.503 | TFLOPs: 29.05 | +7: iteration 3510/ 37905 | consumed samples: 898560 | consumed tokens: 1840250880 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.204291E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.533 | TFLOPs: 29.39 | +7: iteration 3520/ 37905 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.220183E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.381 | TFLOPs: 29.38 | +7: iteration 3530/ 37905 | consumed samples: 903680 | consumed tokens: 1850736640 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.201289E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.560 | TFLOPs: 29.39 | +7: iteration 3540/ 37905 | consumed samples: 906240 | consumed tokens: 1855979520 | elapsed time per iteration (s): 0.22 | learning rate: 1.969E-04 | global batch size: 256 | lm loss: 4.200562E+00 | grad norm: 0.623 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.071 | TFLOPs: 29.02 | +7: iteration 3550/ 37905 | consumed samples: 908800 | consumed tokens: 1861222400 | elapsed time per iteration (s): 0.22 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.198825E+00 | grad norm: 0.626 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.319 | TFLOPs: 29.41 | +7: iteration 3560/ 37905 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 0.22 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.186451E+00 | grad norm: 0.553 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.973 | TFLOPs: 29.40 | +7: iteration 3570/ 37905 | consumed samples: 913920 | consumed tokens: 1871708160 | elapsed time per iteration (s): 0.22 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.193014E+00 | grad norm: 0.530 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.946 | TFLOPs: 29.22 | +7: iteration 3580/ 37905 | consumed samples: 916480 | consumed tokens: 1876951040 | elapsed time per iteration (s): 0.22 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.202007E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.342 | TFLOPs: 29.41 | +7: iteration 3590/ 37905 | consumed samples: 919040 | consumed tokens: 1882193920 | elapsed time per iteration (s): 0.22 | learning rate: 1.968E-04 | global batch size: 256 | lm loss: 4.189970E+00 | grad norm: 0.422 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.769 | TFLOPs: 29.42 | +7: iteration 3600/ 37905 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 0.22 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.199353E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.567 | TFLOPs: 29.21 | +7: iteration 3610/ 37905 | consumed samples: 924160 | consumed tokens: 1892679680 | elapsed time per iteration (s): 0.22 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.193163E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.088 | TFLOPs: 29.40 | +7: iteration 3620/ 37905 | consumed samples: 926720 | consumed tokens: 1897922560 | elapsed time per iteration (s): 0.23 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.187204E+00 | grad norm: 0.595 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.421 | TFLOPs: 28.95 | +7: iteration 3630/ 37905 | consumed samples: 929280 | consumed tokens: 1903165440 | elapsed time per iteration (s): 0.22 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.193474E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.063 | TFLOPs: 29.30 | +7: iteration 3640/ 37905 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 0.22 | learning rate: 1.967E-04 | global batch size: 256 | lm loss: 4.195608E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.939 | TFLOPs: 29.42 | +7: iteration 3650/ 37905 | consumed samples: 934400 | consumed tokens: 1913651200 | elapsed time per iteration (s): 0.23 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.178280E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.543 | TFLOPs: 28.95 | +7: iteration 3660/ 37905 | consumed samples: 936960 | consumed tokens: 1918894080 | elapsed time per iteration (s): 0.23 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.180426E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.920 | TFLOPs: 28.25 | +7: iteration 3670/ 37905 | consumed samples: 939520 | consumed tokens: 1924136960 | elapsed time per iteration (s): 0.22 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.184148E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.908 | TFLOPs: 29.42 | +7: iteration 3680/ 37905 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 0.24 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.190357E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1083.925 | TFLOPs: 27.61 | +7: iteration 3690/ 37905 | consumed samples: 944640 | consumed tokens: 1934622720 | elapsed time per iteration (s): 0.22 | learning rate: 1.966E-04 | global batch size: 256 | lm loss: 4.178551E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.751 | TFLOPs: 29.11 | +7: iteration 3700/ 37905 | consumed samples: 947200 | consumed tokens: 1939865600 | elapsed time per iteration (s): 0.22 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.193258E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.812 | TFLOPs: 29.04 | +7: iteration 3710/ 37905 | consumed samples: 949760 | consumed tokens: 1945108480 | elapsed time per iteration (s): 0.22 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.193431E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.448 | TFLOPs: 29.44 | +7: iteration 3720/ 37905 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 0.22 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.180586E+00 | grad norm: 0.608 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.243 | TFLOPs: 29.10 | +7: iteration 3730/ 37905 | consumed samples: 954880 | consumed tokens: 1955594240 | elapsed time per iteration (s): 0.22 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.171391E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.953 | TFLOPs: 29.42 | +7: iteration 3740/ 37905 | consumed samples: 957440 | consumed tokens: 1960837120 | elapsed time per iteration (s): 0.22 | learning rate: 1.965E-04 | global batch size: 256 | lm loss: 4.184170E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.153 | TFLOPs: 29.40 | +7: iteration 3750/ 37905 | consumed samples: 960000 | consumed tokens: 1966080000 | elapsed time per iteration (s): 0.22 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.163470E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.170 | TFLOPs: 29.05 | +7: iteration 3760/ 37905 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 0.22 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.185949E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.579 | TFLOPs: 29.44 | +7: iteration 3770/ 37905 | consumed samples: 965120 | consumed tokens: 1976565760 | elapsed time per iteration (s): 0.22 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.176775E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.251 | TFLOPs: 29.25 | +7: iteration 3780/ 37905 | consumed samples: 967680 | consumed tokens: 1981808640 | elapsed time per iteration (s): 0.23 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.184824E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.090 | TFLOPs: 28.94 | +7: iteration 3790/ 37905 | consumed samples: 970240 | consumed tokens: 1987051520 | elapsed time per iteration (s): 0.22 | learning rate: 1.964E-04 | global batch size: 256 | lm loss: 4.176545E+00 | grad norm: 0.454 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.627 | TFLOPs: 29.44 | +7: iteration 3800/ 37905 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 0.22 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.168100E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.179 | TFLOPs: 29.20 | +7: iteration 3810/ 37905 | consumed samples: 975360 | consumed tokens: 1997537280 | elapsed time per iteration (s): 0.22 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.170380E+00 | grad norm: 0.561 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.252 | TFLOPs: 29.46 | +7: iteration 3820/ 37905 | consumed samples: 977920 | consumed tokens: 2002780160 | elapsed time per iteration (s): 0.22 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.177531E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.277 | TFLOPs: 29.46 | +7: iteration 3830/ 37905 | consumed samples: 980480 | consumed tokens: 2008023040 | elapsed time per iteration (s): 0.22 | learning rate: 1.963E-04 | global batch size: 256 | lm loss: 4.177326E+00 | grad norm: 0.536 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.358 | TFLOPs: 29.36 | +7: iteration 3840/ 37905 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 0.22 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.182430E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.601 | TFLOPs: 29.36 | +7: iteration 3850/ 37905 | consumed samples: 985600 | consumed tokens: 2018508800 | elapsed time per iteration (s): 0.22 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.175226E+00 | grad norm: 0.407 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.713 | TFLOPs: 29.34 | +7: iteration 3860/ 37905 | consumed samples: 988160 | consumed tokens: 2023751680 | elapsed time per iteration (s): 0.23 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.165273E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.968 | TFLOPs: 28.53 | +7: iteration 3870/ 37905 | consumed samples: 990720 | consumed tokens: 2028994560 | elapsed time per iteration (s): 0.22 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.153728E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.634 | TFLOPs: 29.13 | +7: iteration 3880/ 37905 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 0.22 | learning rate: 1.962E-04 | global batch size: 256 | lm loss: 4.165813E+00 | grad norm: 0.521 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.847 | TFLOPs: 29.09 | +7: iteration 3890/ 37905 | consumed samples: 995840 | consumed tokens: 2039480320 | elapsed time per iteration (s): 0.22 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.157262E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.518 | TFLOPs: 29.36 | +7: iteration 3900/ 37905 | consumed samples: 998400 | consumed tokens: 2044723200 | elapsed time per iteration (s): 0.22 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.182277E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.256 | TFLOPs: 29.20 | +7: iteration 3910/ 37905 | consumed samples: 1000960 | consumed tokens: 2049966080 | elapsed time per iteration (s): 0.22 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.169258E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.925 | TFLOPs: 29.35 | +7: iteration 3920/ 37905 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 0.22 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.172372E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.885 | TFLOPs: 29.34 | +7: iteration 3930/ 37905 | consumed samples: 1006080 | consumed tokens: 2060451840 | elapsed time per iteration (s): 0.22 | learning rate: 1.961E-04 | global batch size: 256 | lm loss: 4.173264E+00 | grad norm: 0.446 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.720 | TFLOPs: 29.37 | +7: iteration 3940/ 37905 | consumed samples: 1008640 | consumed tokens: 2065694720 | elapsed time per iteration (s): 0.22 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.161182E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.570 | TFLOPs: 29.36 | +7: iteration 3950/ 37905 | consumed samples: 1011200 | consumed tokens: 2070937600 | elapsed time per iteration (s): 0.22 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.166042E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.710 | TFLOPs: 29.34 | +7: iteration 3960/ 37905 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 0.22 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.152959E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.108 | TFLOPs: 29.22 | +7: iteration 3970/ 37905 | consumed samples: 1016320 | consumed tokens: 2081423360 | elapsed time per iteration (s): 0.22 | learning rate: 1.960E-04 | global batch size: 256 | lm loss: 4.151085E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.532 | TFLOPs: 29.34 | +7: iteration 3980/ 37905 | consumed samples: 1018880 | consumed tokens: 2086666240 | elapsed time per iteration (s): 0.22 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.148476E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.150 | TFLOPs: 29.35 | +7: iteration 3990/ 37905 | consumed samples: 1021440 | consumed tokens: 2091909120 | elapsed time per iteration (s): 0.22 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.167647E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.624 | TFLOPs: 29.13 | +0: [2023-03-15 22:12:28,270] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[0.0001958964133300005, 0.0001958964133300005, 0.0001958964133300005], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 4000/ 37905 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 0.22 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.161606E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.557 | TFLOPs: 29.36 | +0: steps: 4000 loss: 4.1573 iter time (s): 0.223 samples/sec: 1149.067 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 4000 | lm loss value: 4.073997E+00 | lm loss PPL: 5.879151E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 4000 to checkpoints_83m20b400m +0: [2023-03-15 22:12:28,359] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! +0: [2023-03-15 22:12:28,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:12:28,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:12:28,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:12:28,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:12:28,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:12:28,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:12:28,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:12:28,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:12:28,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:12:28,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:12:28,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:12:28,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:12:28,490] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:12:28,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:12:28,501] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:12:28,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:12:28,512] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:12:28,523] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:12:28,523] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:12:28,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:12:28,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:12:28,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:12:28,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:12:28,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:12:28,549] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step4000/mp_rank_00_model_states.pt +0: [2023-03-15 22:12:28,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:12:28,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:12:28,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:12:28,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:12:28,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:12:28,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:12:28,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +2: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +1: [2023-03-15 22:12:28,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +7: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +6: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +5: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +4: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +3: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:12:28,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:12:28,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: [2023-03-15 22:12:28,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! +0: successfully saved checkpoint at iteration 4000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.58 +7: iteration 4010/ 37905 | consumed samples: 1026560 | consumed tokens: 2102394880 | elapsed time per iteration (s): 0.25 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.164610E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1008.107 | TFLOPs: 25.68 | +7: iteration 4020/ 37905 | consumed samples: 1029120 | consumed tokens: 2107637760 | elapsed time per iteration (s): 0.22 | learning rate: 1.959E-04 | global batch size: 256 | lm loss: 4.154263E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.316 | TFLOPs: 29.43 | +7: iteration 4030/ 37905 | consumed samples: 1031680 | consumed tokens: 2112880640 | elapsed time per iteration (s): 0.22 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.146037E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.592 | TFLOPs: 29.16 | +7: iteration 4040/ 37905 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 0.22 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.145509E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.223 | TFLOPs: 29.40 | +7: iteration 4050/ 37905 | consumed samples: 1036800 | consumed tokens: 2123366400 | elapsed time per iteration (s): 0.22 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.144290E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.625 | TFLOPs: 29.39 | +7: iteration 4060/ 37905 | consumed samples: 1039360 | consumed tokens: 2128609280 | elapsed time per iteration (s): 0.22 | learning rate: 1.958E-04 | global batch size: 256 | lm loss: 4.148421E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.787 | TFLOPs: 29.39 | +7: iteration 4070/ 37905 | consumed samples: 1041920 | consumed tokens: 2133852160 | elapsed time per iteration (s): 0.22 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.153977E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.130 | TFLOPs: 29.12 | +7: iteration 4080/ 37905 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 0.22 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.132663E+00 | grad norm: 0.485 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.411 | TFLOPs: 29.41 | +7: iteration 4090/ 37905 | consumed samples: 1047040 | consumed tokens: 2144337920 | elapsed time per iteration (s): 0.22 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.149767E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.260 | TFLOPs: 29.40 | +7: iteration 4100/ 37905 | consumed samples: 1049600 | consumed tokens: 2149580800 | elapsed time per iteration (s): 0.22 | learning rate: 1.957E-04 | global batch size: 256 | lm loss: 4.158287E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.006 | TFLOPs: 29.40 | +7: iteration 4110/ 37905 | consumed samples: 1052160 | consumed tokens: 2154823680 | elapsed time per iteration (s): 0.22 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.152287E+00 | grad norm: 0.476 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.298 | TFLOPs: 29.41 | +7: iteration 4120/ 37905 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 0.22 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.150379E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.703 | TFLOPs: 29.34 | +7: iteration 4130/ 37905 | consumed samples: 1057280 | consumed tokens: 2165309440 | elapsed time per iteration (s): 0.22 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.134649E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.969 | TFLOPs: 29.32 | +7: iteration 4140/ 37905 | consumed samples: 1059840 | consumed tokens: 2170552320 | elapsed time per iteration (s): 0.22 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.132087E+00 | grad norm: 0.433 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.768 | TFLOPs: 29.11 | +7: iteration 4150/ 37905 | consumed samples: 1062400 | consumed tokens: 2175795200 | elapsed time per iteration (s): 0.22 | learning rate: 1.956E-04 | global batch size: 256 | lm loss: 4.141156E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.092 | TFLOPs: 29.32 | +7: iteration 4160/ 37905 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 0.22 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.136386E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.895 | TFLOPs: 29.34 | +7: iteration 4170/ 37905 | consumed samples: 1067520 | consumed tokens: 2186280960 | elapsed time per iteration (s): 0.22 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.136476E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.615 | TFLOPs: 29.31 | +7: iteration 4180/ 37905 | consumed samples: 1070080 | consumed tokens: 2191523840 | elapsed time per iteration (s): 0.22 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.155896E+00 | grad norm: 0.520 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.734 | TFLOPs: 29.37 | +7: iteration 4190/ 37905 | consumed samples: 1072640 | consumed tokens: 2196766720 | elapsed time per iteration (s): 0.22 | learning rate: 1.955E-04 | global batch size: 256 | lm loss: 4.129590E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.269 | TFLOPs: 29.30 | +7: iteration 4200/ 37905 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 0.23 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.149476E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.546 | TFLOPs: 28.90 | +7: iteration 4210/ 37905 | consumed samples: 1077760 | consumed tokens: 2207252480 | elapsed time per iteration (s): 0.22 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.139719E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.647 | TFLOPs: 29.41 | +7: iteration 4220/ 37905 | consumed samples: 1080320 | consumed tokens: 2212495360 | elapsed time per iteration (s): 0.23 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.142287E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.238 | TFLOPs: 28.69 | +7: iteration 4230/ 37905 | consumed samples: 1082880 | consumed tokens: 2217738240 | elapsed time per iteration (s): 0.22 | learning rate: 1.954E-04 | global batch size: 256 | lm loss: 4.142463E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.312 | TFLOPs: 29.36 | +7: iteration 4240/ 37905 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 0.22 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.130286E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.562 | TFLOPs: 29.34 | +7: iteration 4250/ 37905 | consumed samples: 1088000 | consumed tokens: 2228224000 | elapsed time per iteration (s): 0.22 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.139140E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.839 | TFLOPs: 29.32 | +7: iteration 4260/ 37905 | consumed samples: 1090560 | consumed tokens: 2233466880 | elapsed time per iteration (s): 0.22 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.127898E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.015 | TFLOPs: 29.30 | +7: iteration 4270/ 37905 | consumed samples: 1093120 | consumed tokens: 2238709760 | elapsed time per iteration (s): 0.22 | learning rate: 1.953E-04 | global batch size: 256 | lm loss: 4.134974E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.626 | TFLOPs: 29.31 | +7: iteration 4280/ 37905 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 0.22 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.138823E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.545 | TFLOPs: 29.06 | +7: iteration 4290/ 37905 | consumed samples: 1098240 | consumed tokens: 2249195520 | elapsed time per iteration (s): 0.22 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.123545E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.811 | TFLOPs: 29.34 | +7: iteration 4300/ 37905 | consumed samples: 1100800 | consumed tokens: 2254438400 | elapsed time per iteration (s): 0.22 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.131660E+00 | grad norm: 0.518 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.624 | TFLOPs: 29.36 | +7: iteration 4310/ 37905 | consumed samples: 1103360 | consumed tokens: 2259681280 | elapsed time per iteration (s): 0.22 | learning rate: 1.952E-04 | global batch size: 256 | lm loss: 4.129547E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.612 | TFLOPs: 29.36 | +7: iteration 4320/ 37905 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 0.22 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.127180E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.528 | TFLOPs: 29.39 | +7: iteration 4330/ 37905 | consumed samples: 1108480 | consumed tokens: 2270167040 | elapsed time per iteration (s): 0.22 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.128294E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.691 | TFLOPs: 29.34 | +7: iteration 4340/ 37905 | consumed samples: 1111040 | consumed tokens: 2275409920 | elapsed time per iteration (s): 0.22 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.131260E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.311 | TFLOPs: 29.38 | +7: iteration 4350/ 37905 | consumed samples: 1113600 | consumed tokens: 2280652800 | elapsed time per iteration (s): 0.22 | learning rate: 1.951E-04 | global batch size: 256 | lm loss: 4.119745E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.738 | TFLOPs: 29.39 | +7: iteration 4360/ 37905 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 0.22 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.115033E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.129 | TFLOPs: 29.40 | +7: iteration 4370/ 37905 | consumed samples: 1118720 | consumed tokens: 2291138560 | elapsed time per iteration (s): 0.22 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.121249E+00 | grad norm: 0.488 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.551 | TFLOPs: 29.41 | +7: iteration 4380/ 37905 | consumed samples: 1121280 | consumed tokens: 2296381440 | elapsed time per iteration (s): 0.22 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.136316E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.982 | TFLOPs: 29.40 | +7: iteration 4390/ 37905 | consumed samples: 1123840 | consumed tokens: 2301624320 | elapsed time per iteration (s): 0.22 | learning rate: 1.950E-04 | global batch size: 256 | lm loss: 4.125609E+00 | grad norm: 0.545 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.508 | TFLOPs: 29.36 | +7: iteration 4400/ 37905 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 0.22 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.126319E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.479 | TFLOPs: 29.39 | +7: iteration 4410/ 37905 | consumed samples: 1128960 | consumed tokens: 2312110080 | elapsed time per iteration (s): 0.22 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.119042E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.178 | TFLOPs: 29.38 | +7: iteration 4420/ 37905 | consumed samples: 1131520 | consumed tokens: 2317352960 | elapsed time per iteration (s): 0.22 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.105751E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.509 | TFLOPs: 29.39 | +7: iteration 4430/ 37905 | consumed samples: 1134080 | consumed tokens: 2322595840 | elapsed time per iteration (s): 0.22 | learning rate: 1.949E-04 | global batch size: 256 | lm loss: 4.127887E+00 | grad norm: 0.483 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.440 | TFLOPs: 29.13 | +7: iteration 4440/ 37905 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 0.22 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.119102E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.217 | TFLOPs: 29.38 | +7: iteration 4450/ 37905 | consumed samples: 1139200 | consumed tokens: 2333081600 | elapsed time per iteration (s): 0.22 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.120357E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.403 | TFLOPs: 29.38 | +7: iteration 4460/ 37905 | consumed samples: 1141760 | consumed tokens: 2338324480 | elapsed time per iteration (s): 0.22 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.126852E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.915 | TFLOPs: 29.42 | +7: iteration 4470/ 37905 | consumed samples: 1144320 | consumed tokens: 2343567360 | elapsed time per iteration (s): 0.22 | learning rate: 1.948E-04 | global batch size: 256 | lm loss: 4.121542E+00 | grad norm: 0.514 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.257 | TFLOPs: 29.46 | +7: iteration 4480/ 37905 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 0.22 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.109412E+00 | grad norm: 0.463 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.180 | TFLOPs: 29.43 | +7: iteration 4490/ 37905 | consumed samples: 1149440 | consumed tokens: 2354053120 | elapsed time per iteration (s): 0.22 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.105916E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.241 | TFLOPs: 29.43 | +7: iteration 4500/ 37905 | consumed samples: 1152000 | consumed tokens: 2359296000 | elapsed time per iteration (s): 0.22 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.098887E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.181 | TFLOPs: 29.43 | +7: iteration 4510/ 37905 | consumed samples: 1154560 | consumed tokens: 2364538880 | elapsed time per iteration (s): 0.22 | learning rate: 1.947E-04 | global batch size: 256 | lm loss: 4.115977E+00 | grad norm: 0.380 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.726 | TFLOPs: 29.42 | +7: iteration 4520/ 37905 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 0.22 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.099632E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.122 | TFLOPs: 29.40 | +7: iteration 4530/ 37905 | consumed samples: 1159680 | consumed tokens: 2375024640 | elapsed time per iteration (s): 0.22 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.104693E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.272 | TFLOPs: 29.43 | +7: iteration 4540/ 37905 | consumed samples: 1162240 | consumed tokens: 2380267520 | elapsed time per iteration (s): 0.22 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.123256E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.466 | TFLOPs: 29.44 | +7: iteration 4550/ 37905 | consumed samples: 1164800 | consumed tokens: 2385510400 | elapsed time per iteration (s): 0.23 | learning rate: 1.946E-04 | global batch size: 256 | lm loss: 4.128489E+00 | grad norm: 0.404 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.671 | TFLOPs: 28.93 | +7: iteration 4560/ 37905 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 0.22 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.096590E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.480 | TFLOPs: 29.41 | +7: iteration 4570/ 37905 | consumed samples: 1169920 | consumed tokens: 2395996160 | elapsed time per iteration (s): 0.22 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.114713E+00 | grad norm: 0.451 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.754 | TFLOPs: 29.42 | +7: iteration 4580/ 37905 | consumed samples: 1172480 | consumed tokens: 2401239040 | elapsed time per iteration (s): 0.22 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.111770E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.319 | TFLOPs: 29.43 | +7: iteration 4590/ 37905 | consumed samples: 1175040 | consumed tokens: 2406481920 | elapsed time per iteration (s): 0.22 | learning rate: 1.945E-04 | global batch size: 256 | lm loss: 4.117823E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.368 | TFLOPs: 29.43 | +7: iteration 4600/ 37905 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 0.22 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.108492E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.234 | TFLOPs: 29.43 | +7: iteration 4610/ 37905 | consumed samples: 1180160 | consumed tokens: 2416967680 | elapsed time per iteration (s): 0.22 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.103517E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.008 | TFLOPs: 29.42 | +7: iteration 4620/ 37905 | consumed samples: 1182720 | consumed tokens: 2422210560 | elapsed time per iteration (s): 0.22 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.094334E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.060 | TFLOPs: 29.43 | +7: iteration 4630/ 37905 | consumed samples: 1185280 | consumed tokens: 2427453440 | elapsed time per iteration (s): 0.23 | learning rate: 1.944E-04 | global batch size: 256 | lm loss: 4.106321E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.372 | TFLOPs: 28.97 | +7: iteration 4640/ 37905 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 0.22 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.101641E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.550 | TFLOPs: 29.41 | +7: iteration 4650/ 37905 | consumed samples: 1190400 | consumed tokens: 2437939200 | elapsed time per iteration (s): 0.22 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.105620E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.344 | TFLOPs: 29.41 | +7: iteration 4660/ 37905 | consumed samples: 1192960 | consumed tokens: 2443182080 | elapsed time per iteration (s): 0.22 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.089219E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.014 | TFLOPs: 29.22 | +7: iteration 4670/ 37905 | consumed samples: 1195520 | consumed tokens: 2448424960 | elapsed time per iteration (s): 0.22 | learning rate: 1.943E-04 | global batch size: 256 | lm loss: 4.098534E+00 | grad norm: 0.429 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.940 | TFLOPs: 29.42 | +7: iteration 4680/ 37905 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 0.22 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.115641E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.746 | TFLOPs: 29.42 | +7: iteration 4690/ 37905 | consumed samples: 1200640 | consumed tokens: 2458910720 | elapsed time per iteration (s): 0.22 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.101051E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.539 | TFLOPs: 29.03 | +7: iteration 4700/ 37905 | consumed samples: 1203200 | consumed tokens: 2464153600 | elapsed time per iteration (s): 0.22 | learning rate: 1.942E-04 | global batch size: 256 | lm loss: 4.101617E+00 | grad norm: 0.439 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.121 | TFLOPs: 29.40 | +7: iteration 4710/ 37905 | consumed samples: 1205760 | consumed tokens: 2469396480 | elapsed time per iteration (s): 0.22 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.104177E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.057 | TFLOPs: 29.40 | +7: iteration 4720/ 37905 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 0.22 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.095860E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.121 | TFLOPs: 29.43 | +7: iteration 4730/ 37905 | consumed samples: 1210880 | consumed tokens: 2479882240 | elapsed time per iteration (s): 0.22 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.091052E+00 | grad norm: 0.441 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.288 | TFLOPs: 29.41 | +7: iteration 4740/ 37905 | consumed samples: 1213440 | consumed tokens: 2485125120 | elapsed time per iteration (s): 0.22 | learning rate: 1.941E-04 | global batch size: 256 | lm loss: 4.091042E+00 | grad norm: 0.526 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.247 | TFLOPs: 29.40 | +7: iteration 4750/ 37905 | consumed samples: 1216000 | consumed tokens: 2490368000 | elapsed time per iteration (s): 0.22 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.112924E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.148 | TFLOPs: 29.43 | +7: iteration 4760/ 37905 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 0.22 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.093068E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.280 | TFLOPs: 29.43 | +7: iteration 4770/ 37905 | consumed samples: 1221120 | consumed tokens: 2500853760 | elapsed time per iteration (s): 0.22 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.087476E+00 | grad norm: 0.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.620 | TFLOPs: 29.44 | +7: iteration 4780/ 37905 | consumed samples: 1223680 | consumed tokens: 2506096640 | elapsed time per iteration (s): 0.22 | learning rate: 1.940E-04 | global batch size: 256 | lm loss: 4.093838E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.073 | TFLOPs: 29.40 | +7: iteration 4790/ 37905 | consumed samples: 1226240 | consumed tokens: 2511339520 | elapsed time per iteration (s): 0.22 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.094292E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.097 | TFLOPs: 29.35 | +7: iteration 4800/ 37905 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 0.22 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.097847E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.968 | TFLOPs: 29.35 | +7: iteration 4810/ 37905 | consumed samples: 1231360 | consumed tokens: 2521825280 | elapsed time per iteration (s): 0.22 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.087079E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.200 | TFLOPs: 29.35 | +7: iteration 4820/ 37905 | consumed samples: 1233920 | consumed tokens: 2527068160 | elapsed time per iteration (s): 0.22 | learning rate: 1.939E-04 | global batch size: 256 | lm loss: 4.091005E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.139 | TFLOPs: 29.35 | +7: iteration 4830/ 37905 | consumed samples: 1236480 | consumed tokens: 2532311040 | elapsed time per iteration (s): 0.22 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.091104E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.091 | TFLOPs: 29.38 | +7: iteration 4840/ 37905 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 0.22 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.091142E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.623 | TFLOPs: 29.41 | +7: iteration 4850/ 37905 | consumed samples: 1241600 | consumed tokens: 2542796800 | elapsed time per iteration (s): 0.22 | learning rate: 1.938E-04 | global batch size: 256 | lm loss: 4.087761E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.287 | TFLOPs: 29.46 | +7: iteration 4860/ 37905 | consumed samples: 1244160 | consumed tokens: 2548039680 | elapsed time per iteration (s): 0.23 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.090508E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.052 | TFLOPs: 28.56 | +7: iteration 4870/ 37905 | consumed samples: 1246720 | consumed tokens: 2553282560 | elapsed time per iteration (s): 0.23 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.088979E+00 | grad norm: 0.455 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.002 | TFLOPs: 28.56 | +7: iteration 4880/ 37905 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 0.22 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.078479E+00 | grad norm: 0.435 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.671 | TFLOPs: 29.47 | +7: iteration 4890/ 37905 | consumed samples: 1251840 | consumed tokens: 2563768320 | elapsed time per iteration (s): 0.22 | learning rate: 1.937E-04 | global batch size: 256 | lm loss: 4.091096E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.237 | TFLOPs: 29.46 | +7: iteration 4900/ 37905 | consumed samples: 1254400 | consumed tokens: 2569011200 | elapsed time per iteration (s): 0.22 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.083669E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.646 | TFLOPs: 29.47 | +7: iteration 4910/ 37905 | consumed samples: 1256960 | consumed tokens: 2574254080 | elapsed time per iteration (s): 0.22 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.076157E+00 | grad norm: 0.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.071 | TFLOPs: 29.43 | +7: iteration 4920/ 37905 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 0.22 | learning rate: 1.936E-04 | global batch size: 256 | lm loss: 4.087543E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.041 | TFLOPs: 29.35 | +7: iteration 4930/ 37905 | consumed samples: 1262080 | consumed tokens: 2584739840 | elapsed time per iteration (s): 0.22 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.092101E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.595 | TFLOPs: 29.39 | +7: iteration 4940/ 37905 | consumed samples: 1264640 | consumed tokens: 2589982720 | elapsed time per iteration (s): 0.22 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.082740E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.614 | TFLOPs: 29.39 | +7: iteration 4950/ 37905 | consumed samples: 1267200 | consumed tokens: 2595225600 | elapsed time per iteration (s): 0.22 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.089139E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.599 | TFLOPs: 29.36 | +7: iteration 4960/ 37905 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 0.22 | learning rate: 1.935E-04 | global batch size: 256 | lm loss: 4.083956E+00 | grad norm: 0.447 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.567 | TFLOPs: 29.36 | +7: iteration 4970/ 37905 | consumed samples: 1272320 | consumed tokens: 2605711360 | elapsed time per iteration (s): 0.22 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.073463E+00 | grad norm: 0.442 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.799 | TFLOPs: 29.24 | +7: iteration 4980/ 37905 | consumed samples: 1274880 | consumed tokens: 2610954240 | elapsed time per iteration (s): 0.22 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.079023E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.950 | TFLOPs: 29.37 | +7: iteration 4990/ 37905 | consumed samples: 1277440 | consumed tokens: 2616197120 | elapsed time per iteration (s): 0.23 | learning rate: 1.934E-04 | global batch size: 256 | lm loss: 4.093890E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.423 | TFLOPs: 28.67 | +7: iteration 5000/ 37905 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 0.23 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.078710E+00 | grad norm: 0.408 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.521 | TFLOPs: 28.93 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 5000 | lm loss value: 3.989103E+00 | lm loss PPL: 5.400643E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 5000 to checkpoints_83m20b400m +0: [2023-03-15 22:16:11,086] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! +0: [2023-03-15 22:16:11,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:16:11,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:16:11,162] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:16:11,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:16:11,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:16:11,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:16:11,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:16:11,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:16:11,197] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:16:11,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:16:11,209] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:16:11,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:16:11,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:16:11,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:16:11,231] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:16:11,241] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:16:11,242] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:16:11,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:16:11,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:16:11,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:16:11,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:16:11,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:16:11,275] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:16:11,276] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:16:11,276] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step5000/mp_rank_00_model_states.pt +0: [2023-03-15 22:16:11,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:16:11,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:16:11,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:16:11,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,316] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +1: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +4: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +6: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:16:11,323] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +5: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +2: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-15 22:16:11,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +7: [2023-03-15 22:16:11,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +3: [2023-03-15 22:16:11,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:16:11,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:16:11,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! +0: successfully saved checkpoint at iteration 5000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 243.57 +7: iteration 5010/ 37905 | consumed samples: 1282560 | consumed tokens: 2626682880 | elapsed time per iteration (s): 0.25 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.076684E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1013.359 | TFLOPs: 25.82 | +7: iteration 5020/ 37905 | consumed samples: 1285120 | consumed tokens: 2631925760 | elapsed time per iteration (s): 0.23 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.072186E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.244 | TFLOPs: 28.84 | +7: iteration 5030/ 37905 | consumed samples: 1287680 | consumed tokens: 2637168640 | elapsed time per iteration (s): 0.22 | learning rate: 1.933E-04 | global batch size: 256 | lm loss: 4.082055E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.234 | TFLOPs: 29.28 | +7: iteration 5040/ 37905 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 0.22 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.071145E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.367 | TFLOPs: 29.28 | +7: iteration 5050/ 37905 | consumed samples: 1292800 | consumed tokens: 2647654400 | elapsed time per iteration (s): 0.22 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.080047E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.765 | TFLOPs: 29.26 | +7: iteration 5060/ 37905 | consumed samples: 1295360 | consumed tokens: 2652897280 | elapsed time per iteration (s): 0.23 | learning rate: 1.932E-04 | global batch size: 256 | lm loss: 4.069718E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.713 | TFLOPs: 28.93 | +7: iteration 5070/ 37905 | consumed samples: 1297920 | consumed tokens: 2658140160 | elapsed time per iteration (s): 0.22 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.072466E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.171 | TFLOPs: 29.35 | +7: iteration 5080/ 37905 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 0.22 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.086956E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.747 | TFLOPs: 29.34 | +7: iteration 5090/ 37905 | consumed samples: 1303040 | consumed tokens: 2668625920 | elapsed time per iteration (s): 0.22 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.087429E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.284 | TFLOPs: 29.35 | +7: iteration 5100/ 37905 | consumed samples: 1305600 | consumed tokens: 2673868800 | elapsed time per iteration (s): 0.22 | learning rate: 1.931E-04 | global batch size: 256 | lm loss: 4.077721E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.075 | TFLOPs: 29.37 | +7: iteration 5110/ 37905 | consumed samples: 1308160 | consumed tokens: 2679111680 | elapsed time per iteration (s): 0.22 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.081743E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.385 | TFLOPs: 29.36 | +7: iteration 5120/ 37905 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 0.22 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.072313E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.040 | TFLOPs: 29.37 | +7: iteration 5130/ 37905 | consumed samples: 1313280 | consumed tokens: 2689597440 | elapsed time per iteration (s): 0.23 | learning rate: 1.930E-04 | global batch size: 256 | lm loss: 4.087690E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.669 | TFLOPs: 28.98 | +7: iteration 5140/ 37905 | consumed samples: 1315840 | consumed tokens: 2694840320 | elapsed time per iteration (s): 0.22 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.081796E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.141 | TFLOPs: 28.99 | +7: iteration 5150/ 37905 | consumed samples: 1318400 | consumed tokens: 2700083200 | elapsed time per iteration (s): 0.22 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.062381E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.757 | TFLOPs: 29.34 | +7: iteration 5160/ 37905 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 0.22 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.079032E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.737 | TFLOPs: 29.34 | +7: iteration 5170/ 37905 | consumed samples: 1323520 | consumed tokens: 2710568960 | elapsed time per iteration (s): 0.23 | learning rate: 1.929E-04 | global batch size: 256 | lm loss: 4.074348E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.229 | TFLOPs: 28.69 | +7: iteration 5180/ 37905 | consumed samples: 1326080 | consumed tokens: 2715811840 | elapsed time per iteration (s): 0.22 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.078922E+00 | grad norm: 0.524 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.653 | TFLOPs: 29.03 | +7: iteration 5190/ 37905 | consumed samples: 1328640 | consumed tokens: 2721054720 | elapsed time per iteration (s): 0.22 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.072696E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.490 | TFLOPs: 29.39 | +7: iteration 5200/ 37905 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 0.22 | learning rate: 1.928E-04 | global batch size: 256 | lm loss: 4.062747E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.238 | TFLOPs: 29.35 | +7: iteration 5210/ 37905 | consumed samples: 1333760 | consumed tokens: 2731540480 | elapsed time per iteration (s): 0.22 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.064561E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.199 | TFLOPs: 29.35 | +7: iteration 5220/ 37905 | consumed samples: 1336320 | consumed tokens: 2736783360 | elapsed time per iteration (s): 0.22 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.069471E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.678 | TFLOPs: 29.21 | +7: iteration 5230/ 37905 | consumed samples: 1338880 | consumed tokens: 2742026240 | elapsed time per iteration (s): 0.22 | learning rate: 1.927E-04 | global batch size: 256 | lm loss: 4.059191E+00 | grad norm: 0.421 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.734 | TFLOPs: 29.34 | +7: iteration 5240/ 37905 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 0.22 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.072396E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.816 | TFLOPs: 29.34 | +7: iteration 5250/ 37905 | consumed samples: 1344000 | consumed tokens: 2752512000 | elapsed time per iteration (s): 0.22 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.079474E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.818 | TFLOPs: 29.34 | +7: iteration 5260/ 37905 | consumed samples: 1346560 | consumed tokens: 2757754880 | elapsed time per iteration (s): 0.22 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.072964E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.764 | TFLOPs: 29.32 | +7: iteration 5270/ 37905 | consumed samples: 1349120 | consumed tokens: 2762997760 | elapsed time per iteration (s): 0.22 | learning rate: 1.926E-04 | global batch size: 256 | lm loss: 4.071770E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.965 | TFLOPs: 29.32 | +7: iteration 5280/ 37905 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 0.22 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.061233E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.743 | TFLOPs: 29.37 | +7: iteration 5290/ 37905 | consumed samples: 1354240 | consumed tokens: 2773483520 | elapsed time per iteration (s): 0.22 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.061683E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.233 | TFLOPs: 29.35 | +7: iteration 5300/ 37905 | consumed samples: 1356800 | consumed tokens: 2778726400 | elapsed time per iteration (s): 0.22 | learning rate: 1.925E-04 | global batch size: 256 | lm loss: 4.058170E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.912 | TFLOPs: 29.32 | +7: iteration 5310/ 37905 | consumed samples: 1359360 | consumed tokens: 2783969280 | elapsed time per iteration (s): 0.22 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.066984E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.737 | TFLOPs: 29.32 | +7: iteration 5320/ 37905 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 0.23 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.054108E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.130 | TFLOPs: 28.97 | +7: iteration 5330/ 37905 | consumed samples: 1364480 | consumed tokens: 2794455040 | elapsed time per iteration (s): 0.22 | learning rate: 1.924E-04 | global batch size: 256 | lm loss: 4.062208E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.807 | TFLOPs: 29.32 | +7: iteration 5340/ 37905 | consumed samples: 1367040 | consumed tokens: 2799697920 | elapsed time per iteration (s): 0.22 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.058406E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.029 | TFLOPs: 29.32 | +7: iteration 5350/ 37905 | consumed samples: 1369600 | consumed tokens: 2804940800 | elapsed time per iteration (s): 0.22 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.050340E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.621 | TFLOPs: 29.31 | +7: iteration 5360/ 37905 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 0.22 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.063607E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.431 | TFLOPs: 29.31 | +7: iteration 5370/ 37905 | consumed samples: 1374720 | consumed tokens: 2815426560 | elapsed time per iteration (s): 0.22 | learning rate: 1.923E-04 | global batch size: 256 | lm loss: 4.044280E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.793 | TFLOPs: 29.34 | +7: iteration 5380/ 37905 | consumed samples: 1377280 | consumed tokens: 2820669440 | elapsed time per iteration (s): 0.22 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.064012E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.616 | TFLOPs: 29.29 | +7: iteration 5390/ 37905 | consumed samples: 1379840 | consumed tokens: 2825912320 | elapsed time per iteration (s): 0.22 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.058051E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.097 | TFLOPs: 29.10 | +7: iteration 5400/ 37905 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 0.22 | learning rate: 1.922E-04 | global batch size: 256 | lm loss: 4.058493E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.124 | TFLOPs: 29.27 | +7: iteration 5410/ 37905 | consumed samples: 1384960 | consumed tokens: 2836398080 | elapsed time per iteration (s): 0.22 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.050909E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.456 | TFLOPs: 29.26 | +7: iteration 5420/ 37905 | consumed samples: 1387520 | consumed tokens: 2841640960 | elapsed time per iteration (s): 0.22 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.053685E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.657 | TFLOPs: 29.34 | +7: iteration 5430/ 37905 | consumed samples: 1390080 | consumed tokens: 2846883840 | elapsed time per iteration (s): 0.22 | learning rate: 1.921E-04 | global batch size: 256 | lm loss: 4.046612E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.519 | TFLOPs: 29.28 | +7: iteration 5440/ 37905 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 0.22 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.052257E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.766 | TFLOPs: 29.34 | +7: iteration 5450/ 37905 | consumed samples: 1395200 | consumed tokens: 2857369600 | elapsed time per iteration (s): 0.22 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.047233E+00 | grad norm: 0.444 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.426 | TFLOPs: 29.33 | +7: iteration 5460/ 37905 | consumed samples: 1397760 | consumed tokens: 2862612480 | elapsed time per iteration (s): 0.23 | learning rate: 1.920E-04 | global batch size: 256 | lm loss: 4.054819E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.739 | TFLOPs: 28.45 | +7: iteration 5470/ 37905 | consumed samples: 1400320 | consumed tokens: 2867855360 | elapsed time per iteration (s): 0.22 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.064571E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.673 | TFLOPs: 29.34 | +7: iteration 5480/ 37905 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 0.22 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.053850E+00 | grad norm: 0.426 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.558 | TFLOPs: 29.31 | +7: iteration 5490/ 37905 | consumed samples: 1405440 | consumed tokens: 2878341120 | elapsed time per iteration (s): 0.22 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.051921E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.993 | TFLOPs: 29.37 | +7: iteration 5500/ 37905 | consumed samples: 1408000 | consumed tokens: 2883584000 | elapsed time per iteration (s): 0.22 | learning rate: 1.919E-04 | global batch size: 256 | lm loss: 4.058445E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.167 | TFLOPs: 29.35 | +7: iteration 5510/ 37905 | consumed samples: 1410560 | consumed tokens: 2888826880 | elapsed time per iteration (s): 0.22 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.058044E+00 | grad norm: 0.420 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.821 | TFLOPs: 29.34 | +7: iteration 5520/ 37905 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 0.22 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.042353E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.126 | TFLOPs: 29.35 | +7: iteration 5530/ 37905 | consumed samples: 1415680 | consumed tokens: 2899312640 | elapsed time per iteration (s): 0.22 | learning rate: 1.918E-04 | global batch size: 256 | lm loss: 4.042913E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.842 | TFLOPs: 29.34 | +7: iteration 5540/ 37905 | consumed samples: 1418240 | consumed tokens: 2904555520 | elapsed time per iteration (s): 0.22 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.053475E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.145 | TFLOPs: 29.35 | +7: iteration 5550/ 37905 | consumed samples: 1420800 | consumed tokens: 2909798400 | elapsed time per iteration (s): 0.22 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.045594E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.162 | TFLOPs: 29.35 | +7: iteration 5560/ 37905 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 0.22 | learning rate: 1.917E-04 | global batch size: 256 | lm loss: 4.047651E+00 | grad norm: 0.453 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.642 | TFLOPs: 29.34 | +7: iteration 5570/ 37905 | consumed samples: 1425920 | consumed tokens: 2920284160 | elapsed time per iteration (s): 0.22 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.042588E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.973 | TFLOPs: 29.35 | +7: iteration 5580/ 37905 | consumed samples: 1428480 | consumed tokens: 2925527040 | elapsed time per iteration (s): 0.22 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.037538E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.601 | TFLOPs: 29.36 | +7: iteration 5590/ 37905 | consumed samples: 1431040 | consumed tokens: 2930769920 | elapsed time per iteration (s): 0.22 | learning rate: 1.916E-04 | global batch size: 256 | lm loss: 4.056142E+00 | grad norm: 0.499 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.523 | TFLOPs: 29.41 | +7: iteration 5600/ 37905 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 0.22 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.041758E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.856 | TFLOPs: 29.34 | +7: iteration 5610/ 37905 | consumed samples: 1436160 | consumed tokens: 2941255680 | elapsed time per iteration (s): 0.22 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.033621E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.136 | TFLOPs: 29.12 | +7: iteration 5620/ 37905 | consumed samples: 1438720 | consumed tokens: 2946498560 | elapsed time per iteration (s): 0.22 | learning rate: 1.915E-04 | global batch size: 256 | lm loss: 4.033212E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.459 | TFLOPs: 29.31 | +7: iteration 5630/ 37905 | consumed samples: 1441280 | consumed tokens: 2951741440 | elapsed time per iteration (s): 0.22 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.050502E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.223 | TFLOPs: 29.33 | +7: iteration 5640/ 37905 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 0.22 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.040886E+00 | grad norm: 0.438 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.874 | TFLOPs: 29.34 | +7: iteration 5650/ 37905 | consumed samples: 1446400 | consumed tokens: 2962227200 | elapsed time per iteration (s): 0.22 | learning rate: 1.914E-04 | global batch size: 256 | lm loss: 4.062682E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.635 | TFLOPs: 29.34 | +7: iteration 5660/ 37905 | consumed samples: 1448960 | consumed tokens: 2967470080 | elapsed time per iteration (s): 0.22 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.032494E+00 | grad norm: 0.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.639 | TFLOPs: 29.34 | +7: iteration 5670/ 37905 | consumed samples: 1451520 | consumed tokens: 2972712960 | elapsed time per iteration (s): 0.22 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.035645E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.784 | TFLOPs: 29.32 | +7: iteration 5680/ 37905 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 0.22 | learning rate: 1.913E-04 | global batch size: 256 | lm loss: 4.030121E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.742 | TFLOPs: 29.32 | +7: iteration 5690/ 37905 | consumed samples: 1456640 | consumed tokens: 2983198720 | elapsed time per iteration (s): 0.22 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.037351E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.524 | TFLOPs: 29.31 | +7: iteration 5700/ 37905 | consumed samples: 1459200 | consumed tokens: 2988441600 | elapsed time per iteration (s): 0.22 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.033003E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.214 | TFLOPs: 29.30 | +7: iteration 5710/ 37905 | consumed samples: 1461760 | consumed tokens: 2993684480 | elapsed time per iteration (s): 0.22 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.047117E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.561 | TFLOPs: 29.31 | +7: iteration 5720/ 37905 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 0.22 | learning rate: 1.912E-04 | global batch size: 256 | lm loss: 4.037786E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.484 | TFLOPs: 29.33 | +7: iteration 5730/ 37905 | consumed samples: 1466880 | consumed tokens: 3004170240 | elapsed time per iteration (s): 0.22 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.043974E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.030 | TFLOPs: 29.32 | +7: iteration 5740/ 37905 | consumed samples: 1469440 | consumed tokens: 3009413120 | elapsed time per iteration (s): 0.22 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.029827E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.740 | TFLOPs: 29.34 | +7: iteration 5750/ 37905 | consumed samples: 1472000 | consumed tokens: 3014656000 | elapsed time per iteration (s): 0.22 | learning rate: 1.911E-04 | global batch size: 256 | lm loss: 4.036561E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.108 | TFLOPs: 29.32 | +7: iteration 5760/ 37905 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 0.22 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.050324E+00 | grad norm: 0.416 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.772 | TFLOPs: 29.34 | +7: iteration 5770/ 37905 | consumed samples: 1477120 | consumed tokens: 3025141760 | elapsed time per iteration (s): 0.22 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.049946E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.582 | TFLOPs: 29.34 | +7: iteration 5780/ 37905 | consumed samples: 1479680 | consumed tokens: 3030384640 | elapsed time per iteration (s): 0.23 | learning rate: 1.910E-04 | global batch size: 256 | lm loss: 4.039154E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.967 | TFLOPs: 28.96 | +7: iteration 5790/ 37905 | consumed samples: 1482240 | consumed tokens: 3035627520 | elapsed time per iteration (s): 0.22 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.034772E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.424 | TFLOPs: 29.33 | +7: iteration 5800/ 37905 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 0.22 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.022884E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.015 | TFLOPs: 29.35 | +7: iteration 5810/ 37905 | consumed samples: 1487360 | consumed tokens: 3046113280 | elapsed time per iteration (s): 0.24 | learning rate: 1.909E-04 | global batch size: 256 | lm loss: 4.031591E+00 | grad norm: 0.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1083.075 | TFLOPs: 27.59 | +7: iteration 5820/ 37905 | consumed samples: 1489920 | consumed tokens: 3051356160 | elapsed time per iteration (s): 0.22 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.044043E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.412 | TFLOPs: 29.33 | +7: iteration 5830/ 37905 | consumed samples: 1492480 | consumed tokens: 3056599040 | elapsed time per iteration (s): 0.22 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.040681E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.253 | TFLOPs: 29.38 | +7: iteration 5840/ 37905 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 0.22 | learning rate: 1.908E-04 | global batch size: 256 | lm loss: 4.032288E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.295 | TFLOPs: 29.35 | +7: iteration 5850/ 37905 | consumed samples: 1497600 | consumed tokens: 3067084800 | elapsed time per iteration (s): 0.23 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.011431E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.595 | TFLOPs: 28.47 | +7: iteration 5860/ 37905 | consumed samples: 1500160 | consumed tokens: 3072327680 | elapsed time per iteration (s): 0.22 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.034185E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.205 | TFLOPs: 29.35 | +7: iteration 5870/ 37905 | consumed samples: 1502720 | consumed tokens: 3077570560 | elapsed time per iteration (s): 0.22 | learning rate: 1.907E-04 | global batch size: 256 | lm loss: 4.028950E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.637 | TFLOPs: 29.34 | +7: iteration 5880/ 37905 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 0.22 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.040334E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.243 | TFLOPs: 29.33 | +7: iteration 5890/ 37905 | consumed samples: 1507840 | consumed tokens: 3088056320 | elapsed time per iteration (s): 0.22 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.017110E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.157 | TFLOPs: 29.30 | +7: iteration 5900/ 37905 | consumed samples: 1510400 | consumed tokens: 3093299200 | elapsed time per iteration (s): 0.22 | learning rate: 1.906E-04 | global batch size: 256 | lm loss: 4.024017E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.143 | TFLOPs: 29.30 | +7: iteration 5910/ 37905 | consumed samples: 1512960 | consumed tokens: 3098542080 | elapsed time per iteration (s): 0.22 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.035585E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.767 | TFLOPs: 29.29 | +7: iteration 5920/ 37905 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 0.22 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.017375E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.669 | TFLOPs: 29.31 | +7: iteration 5930/ 37905 | consumed samples: 1518080 | consumed tokens: 3109027840 | elapsed time per iteration (s): 0.22 | learning rate: 1.905E-04 | global batch size: 256 | lm loss: 4.025904E+00 | grad norm: 0.389 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.853 | TFLOPs: 29.32 | +7: iteration 5940/ 37905 | consumed samples: 1520640 | consumed tokens: 3114270720 | elapsed time per iteration (s): 0.22 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.021608E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.379 | TFLOPs: 29.31 | +7: iteration 5950/ 37905 | consumed samples: 1523200 | consumed tokens: 3119513600 | elapsed time per iteration (s): 0.23 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.024990E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.119 | TFLOPs: 28.15 | +7: iteration 5960/ 37905 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 0.22 | learning rate: 1.904E-04 | global batch size: 256 | lm loss: 4.040877E+00 | grad norm: 0.458 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.419 | TFLOPs: 29.28 | +7: iteration 5970/ 37905 | consumed samples: 1528320 | consumed tokens: 3129999360 | elapsed time per iteration (s): 0.22 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.022007E+00 | grad norm: 0.466 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.534 | TFLOPs: 29.26 | +7: iteration 5980/ 37905 | consumed samples: 1530880 | consumed tokens: 3135242240 | elapsed time per iteration (s): 0.22 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.031785E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.638 | TFLOPs: 29.29 | +7: iteration 5990/ 37905 | consumed samples: 1533440 | consumed tokens: 3140485120 | elapsed time per iteration (s): 0.23 | learning rate: 1.903E-04 | global batch size: 256 | lm loss: 4.037115E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.484 | TFLOPs: 28.65 | +0: [2023-03-15 22:19:54,391] [INFO] [logging.py:68:log_dist] [Rank 0] step=6000, skipped=0, lr=[0.00019021778213052494, 0.00019021778213052494, 0.00019021778213052494], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 6000/ 37905 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 0.22 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.025628E+00 | grad norm: 0.423 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.248 | TFLOPs: 29.30 | +0: steps: 6000 loss: 4.0120 iter time (s): 0.221 samples/sec: 1158.574 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 6000 | lm loss value: 3.971578E+00 | lm loss PPL: 5.306823E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 6000 to checkpoints_83m20b400m +0: [2023-03-15 22:19:54,480] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step6000 is begin to save! +0: [2023-03-15 22:19:54,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:19:54,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:19:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:19:54,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:19:54,568] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:19:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:19:54,579] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:19:54,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:19:54,590] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:19:54,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:19:54,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:19:54,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:19:54,613] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:19:54,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:19:54,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:19:54,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:19:54,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:19:54,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:19:54,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:19:54,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:19:54,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:19:54,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:19:54,668] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:19:54,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:19:54,670] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step6000/mp_rank_00_model_states.pt +0: [2023-03-15 22:19:54,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:19:54,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:19:54,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:54,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,711] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +2: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +1: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +1: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +5: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +3: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +7: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-15 22:19:54,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step6000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +0: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +4: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +6: [2023-03-15 22:19:54,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step6000 is ready now! +0: successfully saved checkpoint at iteration 6000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 238.64 +7: iteration 6010/ 37905 | consumed samples: 1538560 | consumed tokens: 3150970880 | elapsed time per iteration (s): 0.26 | learning rate: 1.902E-04 | global batch size: 256 | lm loss: 4.030090E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 974.481 | TFLOPs: 24.83 | +7: iteration 6020/ 37905 | consumed samples: 1541120 | consumed tokens: 3156213760 | elapsed time per iteration (s): 0.23 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.030534E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.475 | TFLOPs: 28.72 | +7: iteration 6030/ 37905 | consumed samples: 1543680 | consumed tokens: 3161456640 | elapsed time per iteration (s): 0.22 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.016777E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.420 | TFLOPs: 29.10 | +7: iteration 6040/ 37905 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 0.22 | learning rate: 1.901E-04 | global batch size: 256 | lm loss: 4.018451E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.440 | TFLOPs: 29.05 | +7: iteration 6050/ 37905 | consumed samples: 1548800 | consumed tokens: 3171942400 | elapsed time per iteration (s): 0.22 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.030311E+00 | grad norm: 0.486 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.262 | TFLOPs: 29.33 | +7: iteration 6060/ 37905 | consumed samples: 1551360 | consumed tokens: 3177185280 | elapsed time per iteration (s): 0.22 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.021988E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.305 | TFLOPs: 29.36 | +7: iteration 6070/ 37905 | consumed samples: 1553920 | consumed tokens: 3182428160 | elapsed time per iteration (s): 0.22 | learning rate: 1.900E-04 | global batch size: 256 | lm loss: 4.016147E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.029 | TFLOPs: 29.35 | +7: iteration 6080/ 37905 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 0.22 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.024448E+00 | grad norm: 0.541 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.525 | TFLOPs: 29.36 | +7: iteration 6090/ 37905 | consumed samples: 1559040 | consumed tokens: 3192913920 | elapsed time per iteration (s): 0.22 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.023840E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.512 | TFLOPs: 29.36 | +7: iteration 6100/ 37905 | consumed samples: 1561600 | consumed tokens: 3198156800 | elapsed time per iteration (s): 0.22 | learning rate: 1.899E-04 | global batch size: 256 | lm loss: 4.020523E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.003 | TFLOPs: 29.35 | +7: iteration 6110/ 37905 | consumed samples: 1564160 | consumed tokens: 3203399680 | elapsed time per iteration (s): 0.22 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.019008E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.167 | TFLOPs: 29.33 | +7: iteration 6120/ 37905 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 0.22 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.016804E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.265 | TFLOPs: 29.35 | +7: iteration 6130/ 37905 | consumed samples: 1569280 | consumed tokens: 3213885440 | elapsed time per iteration (s): 0.22 | learning rate: 1.898E-04 | global batch size: 256 | lm loss: 4.011164E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.561 | TFLOPs: 29.36 | +7: iteration 6140/ 37905 | consumed samples: 1571840 | consumed tokens: 3219128320 | elapsed time per iteration (s): 0.22 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.012510E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.094 | TFLOPs: 29.04 | +7: iteration 6150/ 37905 | consumed samples: 1574400 | consumed tokens: 3224371200 | elapsed time per iteration (s): 0.22 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.003749E+00 | grad norm: 0.374 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.291 | TFLOPs: 29.25 | +7: iteration 6160/ 37905 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 0.22 | learning rate: 1.897E-04 | global batch size: 256 | lm loss: 4.023817E+00 | grad norm: 0.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.936 | TFLOPs: 29.32 | +7: iteration 6170/ 37905 | consumed samples: 1579520 | consumed tokens: 3234856960 | elapsed time per iteration (s): 0.22 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.006060E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.214 | TFLOPs: 29.30 | +7: iteration 6180/ 37905 | consumed samples: 1582080 | consumed tokens: 3240099840 | elapsed time per iteration (s): 0.22 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.001370E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.131 | TFLOPs: 29.33 | +7: iteration 6190/ 37905 | consumed samples: 1584640 | consumed tokens: 3245342720 | elapsed time per iteration (s): 0.22 | learning rate: 1.896E-04 | global batch size: 256 | lm loss: 4.021568E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.415 | TFLOPs: 29.36 | +7: iteration 6200/ 37905 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 0.22 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.011597E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.182 | TFLOPs: 29.30 | +7: iteration 6210/ 37905 | consumed samples: 1589760 | consumed tokens: 3255828480 | elapsed time per iteration (s): 0.22 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.003299E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.068 | TFLOPs: 29.30 | +7: iteration 6220/ 37905 | consumed samples: 1592320 | consumed tokens: 3261071360 | elapsed time per iteration (s): 0.22 | learning rate: 1.895E-04 | global batch size: 256 | lm loss: 4.019881E+00 | grad norm: 0.409 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.002 | TFLOPs: 29.37 | +7: iteration 6230/ 37905 | consumed samples: 1594880 | consumed tokens: 3266314240 | elapsed time per iteration (s): 0.22 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 4.013582E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.188 | TFLOPs: 29.40 | +7: iteration 6240/ 37905 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 0.22 | learning rate: 1.894E-04 | global batch size: 256 | lm loss: 3.993706E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.267 | TFLOPs: 29.38 | +7: iteration 6250/ 37905 | consumed samples: 1600000 | consumed tokens: 3276800000 | elapsed time per iteration (s): 0.22 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 4.024754E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.552 | TFLOPs: 29.41 | +7: iteration 6260/ 37905 | consumed samples: 1602560 | consumed tokens: 3282042880 | elapsed time per iteration (s): 0.22 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 4.016394E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.155 | TFLOPs: 29.40 | +7: iteration 6270/ 37905 | consumed samples: 1605120 | consumed tokens: 3287285760 | elapsed time per iteration (s): 0.22 | learning rate: 1.893E-04 | global batch size: 256 | lm loss: 3.997614E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.525 | TFLOPs: 29.41 | +7: iteration 6280/ 37905 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 0.22 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.011109E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.483 | TFLOPs: 29.41 | +7: iteration 6290/ 37905 | consumed samples: 1610240 | consumed tokens: 3297771520 | elapsed time per iteration (s): 0.22 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 4.003141E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.993 | TFLOPs: 29.40 | +7: iteration 6300/ 37905 | consumed samples: 1612800 | consumed tokens: 3303014400 | elapsed time per iteration (s): 0.22 | learning rate: 1.892E-04 | global batch size: 256 | lm loss: 3.989075E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.154 | TFLOPs: 29.40 | +7: iteration 6310/ 37905 | consumed samples: 1615360 | consumed tokens: 3308257280 | elapsed time per iteration (s): 0.22 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.004478E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.668 | TFLOPs: 29.44 | +7: iteration 6320/ 37905 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 0.22 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.003695E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1158.663 | TFLOPs: 29.52 | +7: iteration 6330/ 37905 | consumed samples: 1620480 | consumed tokens: 3318743040 | elapsed time per iteration (s): 0.22 | learning rate: 1.891E-04 | global batch size: 256 | lm loss: 4.002296E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.754 | TFLOPs: 29.49 | +7: iteration 6340/ 37905 | consumed samples: 1623040 | consumed tokens: 3323985920 | elapsed time per iteration (s): 0.22 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 4.012409E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.242 | TFLOPs: 29.46 | +7: iteration 6350/ 37905 | consumed samples: 1625600 | consumed tokens: 3329228800 | elapsed time per iteration (s): 0.22 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 3.998210E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.388 | TFLOPs: 29.46 | +7: iteration 6360/ 37905 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 0.22 | learning rate: 1.890E-04 | global batch size: 256 | lm loss: 4.004960E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.404 | TFLOPs: 29.46 | +7: iteration 6370/ 37905 | consumed samples: 1630720 | consumed tokens: 3339714560 | elapsed time per iteration (s): 0.22 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.002388E+00 | grad norm: 0.497 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.821 | TFLOPs: 29.39 | +7: iteration 6380/ 37905 | consumed samples: 1633280 | consumed tokens: 3344957440 | elapsed time per iteration (s): 0.22 | learning rate: 1.889E-04 | global batch size: 256 | lm loss: 4.004763E+00 | grad norm: 0.445 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.852 | TFLOPs: 29.42 | +7: iteration 6390/ 37905 | consumed samples: 1635840 | consumed tokens: 3350200320 | elapsed time per iteration (s): 0.22 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 4.006319E+00 | grad norm: 0.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.262 | TFLOPs: 29.38 | +7: iteration 6400/ 37905 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 0.22 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.995461E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.475 | TFLOPs: 29.36 | +7: iteration 6410/ 37905 | consumed samples: 1640960 | consumed tokens: 3360686080 | elapsed time per iteration (s): 0.22 | learning rate: 1.888E-04 | global batch size: 256 | lm loss: 3.976340E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.639 | TFLOPs: 29.36 | +7: iteration 6420/ 37905 | consumed samples: 1643520 | consumed tokens: 3365928960 | elapsed time per iteration (s): 0.22 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.001280E+00 | grad norm: 0.434 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.385 | TFLOPs: 29.38 | +7: iteration 6430/ 37905 | consumed samples: 1646080 | consumed tokens: 3371171840 | elapsed time per iteration (s): 0.22 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.002915E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.193 | TFLOPs: 29.35 | +7: iteration 6440/ 37905 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 0.22 | learning rate: 1.887E-04 | global batch size: 256 | lm loss: 4.001878E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.183 | TFLOPs: 29.38 | +7: iteration 6450/ 37905 | consumed samples: 1651200 | consumed tokens: 3381657600 | elapsed time per iteration (s): 0.22 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.993693E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.629 | TFLOPs: 29.16 | +7: iteration 6460/ 37905 | consumed samples: 1653760 | consumed tokens: 3386900480 | elapsed time per iteration (s): 0.22 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 4.002316E+00 | grad norm: 0.401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.461 | TFLOPs: 29.36 | +7: iteration 6470/ 37905 | consumed samples: 1656320 | consumed tokens: 3392143360 | elapsed time per iteration (s): 0.22 | learning rate: 1.886E-04 | global batch size: 256 | lm loss: 3.991282E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.511 | TFLOPs: 29.36 | +7: iteration 6480/ 37905 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 0.23 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 4.008193E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.244 | TFLOPs: 28.92 | +7: iteration 6490/ 37905 | consumed samples: 1661440 | consumed tokens: 3402629120 | elapsed time per iteration (s): 0.22 | learning rate: 1.885E-04 | global batch size: 256 | lm loss: 4.012024E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.772 | TFLOPs: 29.37 | +7: iteration 6500/ 37905 | consumed samples: 1664000 | consumed tokens: 3407872000 | elapsed time per iteration (s): 0.22 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.981017E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.180 | TFLOPs: 29.35 | +7: iteration 6510/ 37905 | consumed samples: 1666560 | consumed tokens: 3413114880 | elapsed time per iteration (s): 0.22 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 3.997509E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.860 | TFLOPs: 29.37 | +7: iteration 6520/ 37905 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 0.22 | learning rate: 1.884E-04 | global batch size: 256 | lm loss: 4.008387E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.863 | TFLOPs: 29.39 | +7: iteration 6530/ 37905 | consumed samples: 1671680 | consumed tokens: 3423600640 | elapsed time per iteration (s): 0.22 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.997316E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.777 | TFLOPs: 29.19 | +7: iteration 6540/ 37905 | consumed samples: 1674240 | consumed tokens: 3428843520 | elapsed time per iteration (s): 0.22 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.980608E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.170 | TFLOPs: 29.43 | +7: iteration 6550/ 37905 | consumed samples: 1676800 | consumed tokens: 3434086400 | elapsed time per iteration (s): 0.22 | learning rate: 1.883E-04 | global batch size: 256 | lm loss: 3.992169E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.174 | TFLOPs: 29.48 | +7: iteration 6560/ 37905 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 0.22 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 4.005666E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.705 | TFLOPs: 29.44 | +7: iteration 6570/ 37905 | consumed samples: 1681920 | consumed tokens: 3444572160 | elapsed time per iteration (s): 0.22 | learning rate: 1.882E-04 | global batch size: 256 | lm loss: 3.991632E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.905 | TFLOPs: 29.42 | +7: iteration 6580/ 37905 | consumed samples: 1684480 | consumed tokens: 3449815040 | elapsed time per iteration (s): 0.22 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.985491E+00 | grad norm: 0.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.443 | TFLOPs: 29.41 | +7: iteration 6590/ 37905 | consumed samples: 1687040 | consumed tokens: 3455057920 | elapsed time per iteration (s): 0.22 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.979922E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.032 | TFLOPs: 29.40 | +7: iteration 6600/ 37905 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 0.22 | learning rate: 1.881E-04 | global batch size: 256 | lm loss: 3.991814E+00 | grad norm: 0.367 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.475 | TFLOPs: 29.46 | +7: iteration 6610/ 37905 | consumed samples: 1692160 | consumed tokens: 3465543680 | elapsed time per iteration (s): 0.22 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.990429E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.888 | TFLOPs: 29.47 | +7: iteration 6620/ 37905 | consumed samples: 1694720 | consumed tokens: 3470786560 | elapsed time per iteration (s): 0.22 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.987111E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.900 | TFLOPs: 29.47 | +7: iteration 6630/ 37905 | consumed samples: 1697280 | consumed tokens: 3476029440 | elapsed time per iteration (s): 0.22 | learning rate: 1.880E-04 | global batch size: 256 | lm loss: 3.984464E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.505 | TFLOPs: 29.39 | +7: iteration 6640/ 37905 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 0.22 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.970701E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.095 | TFLOPs: 29.40 | +7: iteration 6650/ 37905 | consumed samples: 1702400 | consumed tokens: 3486515200 | elapsed time per iteration (s): 0.22 | learning rate: 1.879E-04 | global batch size: 256 | lm loss: 3.985374E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.904 | TFLOPs: 29.32 | +7: iteration 6660/ 37905 | consumed samples: 1704960 | consumed tokens: 3491758080 | elapsed time per iteration (s): 0.22 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.995074E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.970 | TFLOPs: 29.42 | +7: iteration 6670/ 37905 | consumed samples: 1707520 | consumed tokens: 3497000960 | elapsed time per iteration (s): 0.22 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.981999E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.998 | TFLOPs: 29.45 | +7: iteration 6680/ 37905 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 0.22 | learning rate: 1.878E-04 | global batch size: 256 | lm loss: 3.986607E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.158 | TFLOPs: 29.40 | +7: iteration 6690/ 37905 | consumed samples: 1712640 | consumed tokens: 3507486720 | elapsed time per iteration (s): 0.22 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.976208E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.905 | TFLOPs: 29.45 | +7: iteration 6700/ 37905 | consumed samples: 1715200 | consumed tokens: 3512729600 | elapsed time per iteration (s): 0.22 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.985742E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.242 | TFLOPs: 29.43 | +7: iteration 6710/ 37905 | consumed samples: 1717760 | consumed tokens: 3517972480 | elapsed time per iteration (s): 0.22 | learning rate: 1.877E-04 | global batch size: 256 | lm loss: 3.987041E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.300 | TFLOPs: 29.43 | +7: iteration 6720/ 37905 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 0.22 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.986936E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.222 | TFLOPs: 29.43 | +7: iteration 6730/ 37905 | consumed samples: 1722880 | consumed tokens: 3528458240 | elapsed time per iteration (s): 0.22 | learning rate: 1.876E-04 | global batch size: 256 | lm loss: 3.984333E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.367 | TFLOPs: 29.41 | +7: iteration 6740/ 37905 | consumed samples: 1725440 | consumed tokens: 3533701120 | elapsed time per iteration (s): 0.22 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.977771E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.773 | TFLOPs: 29.47 | +7: iteration 6750/ 37905 | consumed samples: 1728000 | consumed tokens: 3538944000 | elapsed time per iteration (s): 0.22 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.987113E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.431 | TFLOPs: 29.43 | +7: iteration 6760/ 37905 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 0.22 | learning rate: 1.875E-04 | global batch size: 256 | lm loss: 3.983573E+00 | grad norm: 0.411 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.581 | TFLOPs: 29.44 | +7: iteration 6770/ 37905 | consumed samples: 1733120 | consumed tokens: 3549429760 | elapsed time per iteration (s): 0.22 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.989676E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.188 | TFLOPs: 29.43 | +7: iteration 6780/ 37905 | consumed samples: 1735680 | consumed tokens: 3554672640 | elapsed time per iteration (s): 0.22 | learning rate: 1.874E-04 | global batch size: 256 | lm loss: 3.985643E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.859 | TFLOPs: 29.42 | +7: iteration 6790/ 37905 | consumed samples: 1738240 | consumed tokens: 3559915520 | elapsed time per iteration (s): 0.22 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.968238E+00 | grad norm: 0.377 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.608 | TFLOPs: 29.41 | +7: iteration 6800/ 37905 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 0.22 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.974717E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.419 | TFLOPs: 29.46 | +7: iteration 6810/ 37905 | consumed samples: 1743360 | consumed tokens: 3570401280 | elapsed time per iteration (s): 0.22 | learning rate: 1.873E-04 | global batch size: 256 | lm loss: 3.995856E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.546 | TFLOPs: 29.46 | +7: iteration 6820/ 37905 | consumed samples: 1745920 | consumed tokens: 3575644160 | elapsed time per iteration (s): 0.22 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.977052E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.707 | TFLOPs: 29.44 | +7: iteration 6830/ 37905 | consumed samples: 1748480 | consumed tokens: 3580887040 | elapsed time per iteration (s): 0.22 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.985343E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.612 | TFLOPs: 29.46 | +7: iteration 6840/ 37905 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 0.22 | learning rate: 1.872E-04 | global batch size: 256 | lm loss: 3.989384E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.321 | TFLOPs: 29.20 | +7: iteration 6850/ 37905 | consumed samples: 1753600 | consumed tokens: 3591372800 | elapsed time per iteration (s): 0.22 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.991323E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.369 | TFLOPs: 29.46 | +7: iteration 6860/ 37905 | consumed samples: 1756160 | consumed tokens: 3596615680 | elapsed time per iteration (s): 0.22 | learning rate: 1.871E-04 | global batch size: 256 | lm loss: 3.986744E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.884 | TFLOPs: 29.47 | +7: iteration 6870/ 37905 | consumed samples: 1758720 | consumed tokens: 3601858560 | elapsed time per iteration (s): 0.22 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.961678E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.589 | TFLOPs: 29.46 | +7: iteration 6880/ 37905 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 0.22 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.981791E+00 | grad norm: 0.392 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.023 | TFLOPs: 29.45 | +7: iteration 6890/ 37905 | consumed samples: 1763840 | consumed tokens: 3612344320 | elapsed time per iteration (s): 0.22 | learning rate: 1.870E-04 | global batch size: 256 | lm loss: 3.973657E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.990 | TFLOPs: 29.40 | +7: iteration 6900/ 37905 | consumed samples: 1766400 | consumed tokens: 3617587200 | elapsed time per iteration (s): 0.22 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.975949E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.545 | TFLOPs: 29.46 | +7: iteration 6910/ 37905 | consumed samples: 1768960 | consumed tokens: 3622830080 | elapsed time per iteration (s): 0.22 | learning rate: 1.869E-04 | global batch size: 256 | lm loss: 3.983625E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.065 | TFLOPs: 29.48 | +7: iteration 6920/ 37905 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 0.22 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.973721E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.715 | TFLOPs: 29.47 | +7: iteration 6930/ 37905 | consumed samples: 1774080 | consumed tokens: 3633315840 | elapsed time per iteration (s): 0.22 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.976918E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.327 | TFLOPs: 29.41 | +7: iteration 6940/ 37905 | consumed samples: 1776640 | consumed tokens: 3638558720 | elapsed time per iteration (s): 0.22 | learning rate: 1.868E-04 | global batch size: 256 | lm loss: 3.980058E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.936 | TFLOPs: 29.42 | +7: iteration 6950/ 37905 | consumed samples: 1779200 | consumed tokens: 3643801600 | elapsed time per iteration (s): 0.22 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.978248E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.196 | TFLOPs: 29.43 | +7: iteration 6960/ 37905 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 0.22 | learning rate: 1.867E-04 | global batch size: 256 | lm loss: 3.960218E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.682 | TFLOPs: 29.44 | +7: iteration 6970/ 37905 | consumed samples: 1784320 | consumed tokens: 3654287360 | elapsed time per iteration (s): 0.22 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.976690E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.050 | TFLOPs: 29.40 | +7: iteration 6980/ 37905 | consumed samples: 1786880 | consumed tokens: 3659530240 | elapsed time per iteration (s): 0.22 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.974502E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.765 | TFLOPs: 29.39 | +7: iteration 6990/ 37905 | consumed samples: 1789440 | consumed tokens: 3664773120 | elapsed time per iteration (s): 0.22 | learning rate: 1.866E-04 | global batch size: 256 | lm loss: 3.967024E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.767 | TFLOPs: 29.39 | +7: iteration 7000/ 37905 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 0.22 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.968086E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.630 | TFLOPs: 29.39 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 7000 | lm loss value: 3.932854E+00 | lm loss PPL: 5.105247E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 7000 to checkpoints_83m20b400m +0: [2023-03-15 22:23:36,880] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step7000 is begin to save! +0: [2023-03-15 22:23:36,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:23:36,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:23:36,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:23:36,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:23:36,966] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:23:36,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:23:36,977] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:23:36,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:23:36,989] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:23:36,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:23:37,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:23:37,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:23:37,011] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:23:37,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:23:37,022] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:23:37,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:23:37,033] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:23:37,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:23:37,044] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:23:37,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:23:37,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:23:37,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:23:37,066] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:23:37,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:23:37,068] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step7000/mp_rank_00_model_states.pt +0: [2023-03-15 22:23:37,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:23:37,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:23:37,087] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:23:37,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:23:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +7: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +1: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +3: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +4: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:23:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +2: [2023-03-15 22:23:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +5: [2023-03-15 22:23:37,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:23:37,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +6: [2023-03-15 22:23:37,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:23:37,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step7000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:23:37,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step7000 is ready now! +0: successfully saved checkpoint at iteration 7000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.54 +7: iteration 7010/ 37905 | consumed samples: 1794560 | consumed tokens: 3675258880 | elapsed time per iteration (s): 0.25 | learning rate: 1.865E-04 | global batch size: 256 | lm loss: 3.978253E+00 | grad norm: 0.384 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1015.040 | TFLOPs: 25.86 | +7: iteration 7020/ 37905 | consumed samples: 1797120 | consumed tokens: 3680501760 | elapsed time per iteration (s): 0.22 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.971407E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.329 | TFLOPs: 29.38 | +7: iteration 7030/ 37905 | consumed samples: 1799680 | consumed tokens: 3685744640 | elapsed time per iteration (s): 0.22 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.957278E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.511 | TFLOPs: 29.39 | +7: iteration 7040/ 37905 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 0.22 | learning rate: 1.864E-04 | global batch size: 256 | lm loss: 3.981360E+00 | grad norm: 0.388 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.231 | TFLOPs: 29.43 | +7: iteration 7050/ 37905 | consumed samples: 1804800 | consumed tokens: 3696230400 | elapsed time per iteration (s): 0.22 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.977623E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.016 | TFLOPs: 29.45 | +7: iteration 7060/ 37905 | consumed samples: 1807360 | consumed tokens: 3701473280 | elapsed time per iteration (s): 0.22 | learning rate: 1.863E-04 | global batch size: 256 | lm loss: 3.974726E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.395 | TFLOPs: 29.43 | +7: iteration 7070/ 37905 | consumed samples: 1809920 | consumed tokens: 3706716160 | elapsed time per iteration (s): 0.22 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.975454E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.132 | TFLOPs: 29.45 | +7: iteration 7080/ 37905 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 0.22 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.972984E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.703 | TFLOPs: 29.44 | +7: iteration 7090/ 37905 | consumed samples: 1815040 | consumed tokens: 3717201920 | elapsed time per iteration (s): 0.22 | learning rate: 1.862E-04 | global batch size: 256 | lm loss: 3.979592E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.008 | TFLOPs: 29.40 | +7: iteration 7100/ 37905 | consumed samples: 1817600 | consumed tokens: 3722444800 | elapsed time per iteration (s): 0.22 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.975300E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.845 | TFLOPs: 29.47 | +7: iteration 7110/ 37905 | consumed samples: 1820160 | consumed tokens: 3727687680 | elapsed time per iteration (s): 0.22 | learning rate: 1.861E-04 | global batch size: 256 | lm loss: 3.975802E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.611 | TFLOPs: 29.46 | +7: iteration 7120/ 37905 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 0.22 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.963222E+00 | grad norm: 0.419 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.629 | TFLOPs: 29.47 | +7: iteration 7130/ 37905 | consumed samples: 1825280 | consumed tokens: 3738173440 | elapsed time per iteration (s): 0.22 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.967509E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.218 | TFLOPs: 29.40 | +7: iteration 7140/ 37905 | consumed samples: 1827840 | consumed tokens: 3743416320 | elapsed time per iteration (s): 0.22 | learning rate: 1.860E-04 | global batch size: 256 | lm loss: 3.980874E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.306 | TFLOPs: 29.10 | +7: iteration 7150/ 37905 | consumed samples: 1830400 | consumed tokens: 3748659200 | elapsed time per iteration (s): 0.22 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.974882E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.519 | TFLOPs: 29.39 | +7: iteration 7160/ 37905 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 0.22 | learning rate: 1.859E-04 | global batch size: 256 | lm loss: 3.958682E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.932 | TFLOPs: 29.42 | +7: iteration 7170/ 37905 | consumed samples: 1835520 | consumed tokens: 3759144960 | elapsed time per iteration (s): 0.22 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.964063E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.884 | TFLOPs: 29.42 | +7: iteration 7180/ 37905 | consumed samples: 1838080 | consumed tokens: 3764387840 | elapsed time per iteration (s): 0.22 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.968808E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.650 | TFLOPs: 29.44 | +7: iteration 7190/ 37905 | consumed samples: 1840640 | consumed tokens: 3769630720 | elapsed time per iteration (s): 0.22 | learning rate: 1.858E-04 | global batch size: 256 | lm loss: 3.976342E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.159 | TFLOPs: 29.43 | +7: iteration 7200/ 37905 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 0.22 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.961423E+00 | grad norm: 0.397 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.100 | TFLOPs: 29.43 | +7: iteration 7210/ 37905 | consumed samples: 1845760 | consumed tokens: 3780116480 | elapsed time per iteration (s): 0.22 | learning rate: 1.857E-04 | global batch size: 256 | lm loss: 3.977833E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.702 | TFLOPs: 29.44 | +7: iteration 7220/ 37905 | consumed samples: 1848320 | consumed tokens: 3785359360 | elapsed time per iteration (s): 0.22 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.954823E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.844 | TFLOPs: 29.42 | +7: iteration 7230/ 37905 | consumed samples: 1850880 | consumed tokens: 3790602240 | elapsed time per iteration (s): 0.22 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.959881E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.115 | TFLOPs: 29.40 | +7: iteration 7240/ 37905 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 0.22 | learning rate: 1.856E-04 | global batch size: 256 | lm loss: 3.960798E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.403 | TFLOPs: 29.38 | +7: iteration 7250/ 37905 | consumed samples: 1856000 | consumed tokens: 3801088000 | elapsed time per iteration (s): 0.22 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.956441E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.741 | TFLOPs: 29.42 | +7: iteration 7260/ 37905 | consumed samples: 1858560 | consumed tokens: 3806330880 | elapsed time per iteration (s): 0.22 | learning rate: 1.855E-04 | global batch size: 256 | lm loss: 3.971120E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.242 | TFLOPs: 29.40 | +7: iteration 7270/ 37905 | consumed samples: 1861120 | consumed tokens: 3811573760 | elapsed time per iteration (s): 0.22 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.968940E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.649 | TFLOPs: 29.36 | +7: iteration 7280/ 37905 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 0.22 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.958179E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.047 | TFLOPs: 29.35 | +7: iteration 7290/ 37905 | consumed samples: 1866240 | consumed tokens: 3822059520 | elapsed time per iteration (s): 0.22 | learning rate: 1.854E-04 | global batch size: 256 | lm loss: 3.970564E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.409 | TFLOPs: 29.36 | +7: iteration 7300/ 37905 | consumed samples: 1868800 | consumed tokens: 3827302400 | elapsed time per iteration (s): 0.22 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.960233E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.500 | TFLOPs: 29.41 | +7: iteration 7310/ 37905 | consumed samples: 1871360 | consumed tokens: 3832545280 | elapsed time per iteration (s): 0.22 | learning rate: 1.853E-04 | global batch size: 256 | lm loss: 3.956105E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.803 | TFLOPs: 29.39 | +7: iteration 7320/ 37905 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 0.22 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.961852E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.266 | TFLOPs: 29.41 | +7: iteration 7330/ 37905 | consumed samples: 1876480 | consumed tokens: 3843031040 | elapsed time per iteration (s): 0.22 | learning rate: 1.852E-04 | global batch size: 256 | lm loss: 3.963490E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.163 | TFLOPs: 29.38 | +7: iteration 7340/ 37905 | consumed samples: 1879040 | consumed tokens: 3848273920 | elapsed time per iteration (s): 0.22 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.962526E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.028 | TFLOPs: 29.40 | +7: iteration 7350/ 37905 | consumed samples: 1881600 | consumed tokens: 3853516800 | elapsed time per iteration (s): 0.22 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.964816E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.863 | TFLOPs: 29.42 | +7: iteration 7360/ 37905 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 0.22 | learning rate: 1.851E-04 | global batch size: 256 | lm loss: 3.973324E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.881 | TFLOPs: 29.40 | +7: iteration 7370/ 37905 | consumed samples: 1886720 | consumed tokens: 3864002560 | elapsed time per iteration (s): 0.22 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.964311E+00 | grad norm: 0.478 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.160 | TFLOPs: 29.40 | +7: iteration 7380/ 37905 | consumed samples: 1889280 | consumed tokens: 3869245440 | elapsed time per iteration (s): 0.22 | learning rate: 1.850E-04 | global batch size: 256 | lm loss: 3.950101E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.918 | TFLOPs: 29.40 | +7: iteration 7390/ 37905 | consumed samples: 1891840 | consumed tokens: 3874488320 | elapsed time per iteration (s): 0.22 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.971969E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.704 | TFLOPs: 29.39 | +7: iteration 7400/ 37905 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 0.22 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.966046E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.706 | TFLOPs: 29.34 | +7: iteration 7410/ 37905 | consumed samples: 1896960 | consumed tokens: 3884974080 | elapsed time per iteration (s): 0.22 | learning rate: 1.849E-04 | global batch size: 256 | lm loss: 3.957032E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.488 | TFLOPs: 29.36 | +7: iteration 7420/ 37905 | consumed samples: 1899520 | consumed tokens: 3890216960 | elapsed time per iteration (s): 0.22 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.960736E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.282 | TFLOPs: 29.38 | +7: iteration 7430/ 37905 | consumed samples: 1902080 | consumed tokens: 3895459840 | elapsed time per iteration (s): 0.22 | learning rate: 1.848E-04 | global batch size: 256 | lm loss: 3.948732E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.394 | TFLOPs: 29.36 | +7: iteration 7440/ 37905 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 0.22 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.965556E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.687 | TFLOPs: 29.34 | +7: iteration 7450/ 37905 | consumed samples: 1907200 | consumed tokens: 3905945600 | elapsed time per iteration (s): 0.22 | learning rate: 1.847E-04 | global batch size: 256 | lm loss: 3.964428E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.273 | TFLOPs: 29.33 | +7: iteration 7460/ 37905 | consumed samples: 1909760 | consumed tokens: 3911188480 | elapsed time per iteration (s): 0.22 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.944125E+00 | grad norm: 0.398 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.173 | TFLOPs: 29.35 | +7: iteration 7470/ 37905 | consumed samples: 1912320 | consumed tokens: 3916431360 | elapsed time per iteration (s): 0.22 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.962789E+00 | grad norm: 0.400 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.861 | TFLOPs: 29.37 | +7: iteration 7480/ 37905 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 0.22 | learning rate: 1.846E-04 | global batch size: 256 | lm loss: 3.951910E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.172 | TFLOPs: 29.35 | +7: iteration 7490/ 37905 | consumed samples: 1917440 | consumed tokens: 3926917120 | elapsed time per iteration (s): 0.22 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.949238E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.252 | TFLOPs: 29.35 | +7: iteration 7500/ 37905 | consumed samples: 1920000 | consumed tokens: 3932160000 | elapsed time per iteration (s): 0.22 | learning rate: 1.845E-04 | global batch size: 256 | lm loss: 3.962664E+00 | grad norm: 0.372 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.185 | TFLOPs: 29.38 | +7: iteration 7510/ 37905 | consumed samples: 1922560 | consumed tokens: 3937402880 | elapsed time per iteration (s): 0.22 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.952116E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.387 | TFLOPs: 29.38 | +7: iteration 7520/ 37905 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 0.22 | learning rate: 1.844E-04 | global batch size: 256 | lm loss: 3.948388E+00 | grad norm: 0.443 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.789 | TFLOPs: 29.37 | +7: iteration 7530/ 37905 | consumed samples: 1927680 | consumed tokens: 3947888640 | elapsed time per iteration (s): 0.22 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.970195E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.770 | TFLOPs: 29.34 | +7: iteration 7540/ 37905 | consumed samples: 1930240 | consumed tokens: 3953131520 | elapsed time per iteration (s): 0.22 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.962228E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.353 | TFLOPs: 29.36 | +7: iteration 7550/ 37905 | consumed samples: 1932800 | consumed tokens: 3958374400 | elapsed time per iteration (s): 0.22 | learning rate: 1.843E-04 | global batch size: 256 | lm loss: 3.944371E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.347 | TFLOPs: 29.36 | +7: iteration 7560/ 37905 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 0.22 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.959119E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.362 | TFLOPs: 29.38 | +7: iteration 7570/ 37905 | consumed samples: 1937920 | consumed tokens: 3968860160 | elapsed time per iteration (s): 0.22 | learning rate: 1.842E-04 | global batch size: 256 | lm loss: 3.958300E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.309 | TFLOPs: 29.38 | +7: iteration 7580/ 37905 | consumed samples: 1940480 | consumed tokens: 3974103040 | elapsed time per iteration (s): 0.22 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.937038E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.891 | TFLOPs: 29.40 | +7: iteration 7590/ 37905 | consumed samples: 1943040 | consumed tokens: 3979345920 | elapsed time per iteration (s): 0.22 | learning rate: 1.841E-04 | global batch size: 256 | lm loss: 3.959766E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.548 | TFLOPs: 29.36 | +7: iteration 7600/ 37905 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 0.22 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.950743E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.916 | TFLOPs: 29.35 | +7: iteration 7610/ 37905 | consumed samples: 1948160 | consumed tokens: 3989831680 | elapsed time per iteration (s): 0.22 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.954843E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.878 | TFLOPs: 29.32 | +7: iteration 7620/ 37905 | consumed samples: 1950720 | consumed tokens: 3995074560 | elapsed time per iteration (s): 0.22 | learning rate: 1.840E-04 | global batch size: 256 | lm loss: 3.956060E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.282 | TFLOPs: 29.15 | +7: iteration 7630/ 37905 | consumed samples: 1953280 | consumed tokens: 4000317440 | elapsed time per iteration (s): 0.22 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.950747E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.320 | TFLOPs: 29.36 | +7: iteration 7640/ 37905 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 0.22 | learning rate: 1.839E-04 | global batch size: 256 | lm loss: 3.946824E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.321 | TFLOPs: 29.36 | +7: iteration 7650/ 37905 | consumed samples: 1958400 | consumed tokens: 4010803200 | elapsed time per iteration (s): 0.22 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.944444E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.705 | TFLOPs: 29.37 | +7: iteration 7660/ 37905 | consumed samples: 1960960 | consumed tokens: 4016046080 | elapsed time per iteration (s): 0.22 | learning rate: 1.838E-04 | global batch size: 256 | lm loss: 3.963372E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.392 | TFLOPs: 29.38 | +7: iteration 7670/ 37905 | consumed samples: 1963520 | consumed tokens: 4021288960 | elapsed time per iteration (s): 0.22 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.957116E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.820 | TFLOPs: 29.16 | +7: iteration 7680/ 37905 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 0.22 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.949525E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.146 | TFLOPs: 29.38 | +7: iteration 7690/ 37905 | consumed samples: 1968640 | consumed tokens: 4031774720 | elapsed time per iteration (s): 0.22 | learning rate: 1.837E-04 | global batch size: 256 | lm loss: 3.949032E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.411 | TFLOPs: 29.36 | +7: iteration 7700/ 37905 | consumed samples: 1971200 | consumed tokens: 4037017600 | elapsed time per iteration (s): 0.22 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.953453E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.499 | TFLOPs: 29.39 | +7: iteration 7710/ 37905 | consumed samples: 1973760 | consumed tokens: 4042260480 | elapsed time per iteration (s): 0.22 | learning rate: 1.836E-04 | global batch size: 256 | lm loss: 3.936668E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.563 | TFLOPs: 29.36 | +7: iteration 7720/ 37905 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 0.22 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.960037E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.325 | TFLOPs: 29.13 | +7: iteration 7730/ 37905 | consumed samples: 1978880 | consumed tokens: 4052746240 | elapsed time per iteration (s): 0.22 | learning rate: 1.835E-04 | global batch size: 256 | lm loss: 3.946020E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.711 | TFLOPs: 29.42 | +7: iteration 7740/ 37905 | consumed samples: 1981440 | consumed tokens: 4057989120 | elapsed time per iteration (s): 0.22 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.952182E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.745 | TFLOPs: 29.44 | +7: iteration 7750/ 37905 | consumed samples: 1984000 | consumed tokens: 4063232000 | elapsed time per iteration (s): 0.22 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.945160E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.769 | TFLOPs: 29.04 | +7: iteration 7760/ 37905 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 0.22 | learning rate: 1.834E-04 | global batch size: 256 | lm loss: 3.946105E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.582 | TFLOPs: 29.39 | +7: iteration 7770/ 37905 | consumed samples: 1989120 | consumed tokens: 4073717760 | elapsed time per iteration (s): 0.22 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.948602E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.770 | TFLOPs: 29.37 | +7: iteration 7780/ 37905 | consumed samples: 1991680 | consumed tokens: 4078960640 | elapsed time per iteration (s): 0.22 | learning rate: 1.833E-04 | global batch size: 256 | lm loss: 3.956142E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.051 | TFLOPs: 29.40 | +7: iteration 7790/ 37905 | consumed samples: 1994240 | consumed tokens: 4084203520 | elapsed time per iteration (s): 0.22 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.938160E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.940 | TFLOPs: 29.40 | +7: iteration 7800/ 37905 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 0.22 | learning rate: 1.832E-04 | global batch size: 256 | lm loss: 3.956903E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.991 | TFLOPs: 29.35 | +7: iteration 7810/ 37905 | consumed samples: 1999360 | consumed tokens: 4094689280 | elapsed time per iteration (s): 0.22 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.944165E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.003 | TFLOPs: 29.37 | +7: iteration 7820/ 37905 | consumed samples: 2001920 | consumed tokens: 4099932160 | elapsed time per iteration (s): 0.22 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.946870E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.511 | TFLOPs: 29.36 | +7: iteration 7830/ 37905 | consumed samples: 2004480 | consumed tokens: 4105175040 | elapsed time per iteration (s): 0.22 | learning rate: 1.831E-04 | global batch size: 256 | lm loss: 3.944749E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.476 | TFLOPs: 29.36 | +7: iteration 7840/ 37905 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 0.22 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.946400E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.966 | TFLOPs: 29.35 | +7: iteration 7850/ 37905 | consumed samples: 2009600 | consumed tokens: 4115660800 | elapsed time per iteration (s): 0.22 | learning rate: 1.830E-04 | global batch size: 256 | lm loss: 3.957299E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.064 | TFLOPs: 29.35 | +7: iteration 7860/ 37905 | consumed samples: 2012160 | consumed tokens: 4120903680 | elapsed time per iteration (s): 0.22 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.935104E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.671 | TFLOPs: 29.36 | +7: iteration 7870/ 37905 | consumed samples: 2014720 | consumed tokens: 4126146560 | elapsed time per iteration (s): 0.22 | learning rate: 1.829E-04 | global batch size: 256 | lm loss: 3.937918E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.768 | TFLOPs: 29.34 | +7: iteration 7880/ 37905 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 0.22 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.939874E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.162 | TFLOPs: 29.35 | +7: iteration 7890/ 37905 | consumed samples: 2019840 | consumed tokens: 4136632320 | elapsed time per iteration (s): 0.22 | learning rate: 1.828E-04 | global batch size: 256 | lm loss: 3.948573E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.517 | TFLOPs: 29.36 | +7: iteration 7900/ 37905 | consumed samples: 2022400 | consumed tokens: 4141875200 | elapsed time per iteration (s): 0.22 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.945476E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.726 | TFLOPs: 29.37 | +7: iteration 7910/ 37905 | consumed samples: 2024960 | consumed tokens: 4147118080 | elapsed time per iteration (s): 0.22 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.931063E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.925 | TFLOPs: 29.35 | +7: iteration 7920/ 37905 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 0.22 | learning rate: 1.827E-04 | global batch size: 256 | lm loss: 3.945326E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.666 | TFLOPs: 29.34 | +7: iteration 7930/ 37905 | consumed samples: 2030080 | consumed tokens: 4157603840 | elapsed time per iteration (s): 0.22 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.934711E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.784 | TFLOPs: 29.37 | +7: iteration 7940/ 37905 | consumed samples: 2032640 | consumed tokens: 4162846720 | elapsed time per iteration (s): 0.22 | learning rate: 1.826E-04 | global batch size: 256 | lm loss: 3.936568E+00 | grad norm: 0.390 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.372 | TFLOPs: 29.33 | +7: iteration 7950/ 37905 | consumed samples: 2035200 | consumed tokens: 4168089600 | elapsed time per iteration (s): 0.22 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.928820E+00 | grad norm: 0.504 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.145 | TFLOPs: 29.27 | +7: iteration 7960/ 37905 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 0.22 | learning rate: 1.825E-04 | global batch size: 256 | lm loss: 3.942479E+00 | grad norm: 0.366 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.220 | TFLOPs: 29.28 | +7: iteration 7970/ 37905 | consumed samples: 2040320 | consumed tokens: 4178575360 | elapsed time per iteration (s): 0.22 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.950653E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.133 | TFLOPs: 29.27 | +7: iteration 7980/ 37905 | consumed samples: 2042880 | consumed tokens: 4183818240 | elapsed time per iteration (s): 0.22 | learning rate: 1.824E-04 | global batch size: 256 | lm loss: 3.944622E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.787 | TFLOPs: 29.27 | +7: iteration 7990/ 37905 | consumed samples: 2045440 | consumed tokens: 4189061120 | elapsed time per iteration (s): 0.22 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.934067E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.245 | TFLOPs: 29.28 | +0: [2023-03-15 22:27:19,238] [INFO] [logging.py:68:log_dist] [Rank 0] step=8000, skipped=0, lr=[0.00018229552592396328, 0.00018229552592396328, 0.00018229552592396328], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 8000/ 37905 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 0.23 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.954539E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.805 | TFLOPs: 28.96 | +0: steps: 8000 loss: 3.9570 iter time (s): 0.220 samples/sec: 1161.285 +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 8000 | lm loss value: 3.910519E+00 | lm loss PPL: 4.992486E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 8000 to checkpoints_83m20b400m +0: [2023-03-15 22:27:19,328] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step8000 is begin to save! +0: [2023-03-15 22:27:19,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:27:19,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:27:19,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:27:19,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:27:19,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:27:19,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:27:19,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:27:19,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:27:19,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:27:19,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:27:19,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:27:19,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:27:19,458] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:27:19,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:27:19,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:27:19,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:27:19,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:27:19,490] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:27:19,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:27:19,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:27:19,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:27:19,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:27:19,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:27:19,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:27:19,514] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step8000/mp_rank_00_model_states.pt +0: [2023-03-15 22:27:19,514] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:27:19,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:27:19,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:27:19,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +7: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +3: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +6: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +2: [2023-03-15 22:27:19,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:27:19,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:27:19,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +4: [2023-03-15 22:27:19,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:27:19,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:27:19,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +5: [2023-03-15 22:27:19,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:27:19,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:27:19,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +1: [2023-03-15 22:27:19,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:27:19,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step8000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:27:19,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step8000 is ready now! +0: successfully saved checkpoint at iteration 8000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 239.00 +7: iteration 8010/ 37905 | consumed samples: 2050560 | consumed tokens: 4199546880 | elapsed time per iteration (s): 0.25 | learning rate: 1.823E-04 | global batch size: 256 | lm loss: 3.928272E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1004.875 | TFLOPs: 25.60 | +7: iteration 8020/ 37905 | consumed samples: 2053120 | consumed tokens: 4204789760 | elapsed time per iteration (s): 0.22 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.948079E+00 | grad norm: 0.415 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.933 | TFLOPs: 29.32 | +7: iteration 8030/ 37905 | consumed samples: 2055680 | consumed tokens: 4210032640 | elapsed time per iteration (s): 0.22 | learning rate: 1.822E-04 | global batch size: 256 | lm loss: 3.950105E+00 | grad norm: 0.403 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.181 | TFLOPs: 29.30 | +7: iteration 8040/ 37905 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 0.22 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.915077E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.189 | TFLOPs: 29.33 | +7: iteration 8050/ 37905 | consumed samples: 2060800 | consumed tokens: 4220518400 | elapsed time per iteration (s): 0.22 | learning rate: 1.821E-04 | global batch size: 256 | lm loss: 3.938581E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.145 | TFLOPs: 29.30 | +7: iteration 8060/ 37905 | consumed samples: 2063360 | consumed tokens: 4225761280 | elapsed time per iteration (s): 0.22 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.934867E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.643 | TFLOPs: 29.29 | +7: iteration 8070/ 37905 | consumed samples: 2065920 | consumed tokens: 4231004160 | elapsed time per iteration (s): 0.22 | learning rate: 1.820E-04 | global batch size: 256 | lm loss: 3.942956E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.877 | TFLOPs: 29.29 | +7: iteration 8080/ 37905 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 0.22 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.930626E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.669 | TFLOPs: 29.29 | +7: iteration 8090/ 37905 | consumed samples: 2071040 | consumed tokens: 4241489920 | elapsed time per iteration (s): 0.22 | learning rate: 1.819E-04 | global batch size: 256 | lm loss: 3.950672E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.880 | TFLOPs: 29.32 | +7: iteration 8100/ 37905 | consumed samples: 2073600 | consumed tokens: 4246732800 | elapsed time per iteration (s): 0.22 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.955646E+00 | grad norm: 0.473 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.701 | TFLOPs: 29.31 | +7: iteration 8110/ 37905 | consumed samples: 2076160 | consumed tokens: 4251975680 | elapsed time per iteration (s): 0.22 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.940989E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.602 | TFLOPs: 29.34 | +7: iteration 8120/ 37905 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 0.22 | learning rate: 1.818E-04 | global batch size: 256 | lm loss: 3.939577E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.469 | TFLOPs: 29.33 | +7: iteration 8130/ 37905 | consumed samples: 2081280 | consumed tokens: 4262461440 | elapsed time per iteration (s): 0.22 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.932113E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.461 | TFLOPs: 29.36 | +7: iteration 8140/ 37905 | consumed samples: 2083840 | consumed tokens: 4267704320 | elapsed time per iteration (s): 0.22 | learning rate: 1.817E-04 | global batch size: 256 | lm loss: 3.934580E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.344 | TFLOPs: 29.02 | +7: iteration 8150/ 37905 | consumed samples: 2086400 | consumed tokens: 4272947200 | elapsed time per iteration (s): 0.22 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.948415E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.324 | TFLOPs: 29.38 | +7: iteration 8160/ 37905 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 0.22 | learning rate: 1.816E-04 | global batch size: 256 | lm loss: 3.924197E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.004 | TFLOPs: 29.40 | +7: iteration 8170/ 37905 | consumed samples: 2091520 | consumed tokens: 4283432960 | elapsed time per iteration (s): 0.22 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.929733E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.265 | TFLOPs: 29.41 | +7: iteration 8180/ 37905 | consumed samples: 2094080 | consumed tokens: 4288675840 | elapsed time per iteration (s): 0.22 | learning rate: 1.815E-04 | global batch size: 256 | lm loss: 3.939088E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.815 | TFLOPs: 29.39 | +7: iteration 8190/ 37905 | consumed samples: 2096640 | consumed tokens: 4293918720 | elapsed time per iteration (s): 0.22 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.936359E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.319 | TFLOPs: 29.38 | +7: iteration 8200/ 37905 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 0.22 | learning rate: 1.814E-04 | global batch size: 256 | lm loss: 3.934626E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.964 | TFLOPs: 29.42 | +7: iteration 8210/ 37905 | consumed samples: 2101760 | consumed tokens: 4304404480 | elapsed time per iteration (s): 0.22 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.921715E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.023 | TFLOPs: 29.40 | +7: iteration 8220/ 37905 | consumed samples: 2104320 | consumed tokens: 4309647360 | elapsed time per iteration (s): 0.22 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.941222E+00 | grad norm: 0.361 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.434 | TFLOPs: 29.36 | +7: iteration 8230/ 37905 | consumed samples: 2106880 | consumed tokens: 4314890240 | elapsed time per iteration (s): 0.22 | learning rate: 1.813E-04 | global batch size: 256 | lm loss: 3.942149E+00 | grad norm: 0.360 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.357 | TFLOPs: 29.36 | +7: iteration 8240/ 37905 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 0.22 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.949332E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.414 | TFLOPs: 29.38 | +7: iteration 8250/ 37905 | consumed samples: 2112000 | consumed tokens: 4325376000 | elapsed time per iteration (s): 0.22 | learning rate: 1.812E-04 | global batch size: 256 | lm loss: 3.938968E+00 | grad norm: 0.381 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.653 | TFLOPs: 29.26 | +7: iteration 8260/ 37905 | consumed samples: 2114560 | consumed tokens: 4330618880 | elapsed time per iteration (s): 0.22 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.923321E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.133 | TFLOPs: 29.38 | +7: iteration 8270/ 37905 | consumed samples: 2117120 | consumed tokens: 4335861760 | elapsed time per iteration (s): 0.22 | learning rate: 1.811E-04 | global batch size: 256 | lm loss: 3.927434E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.291 | TFLOPs: 29.41 | +7: iteration 8280/ 37905 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 0.22 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.936526E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.421 | TFLOPs: 29.38 | +7: iteration 8290/ 37905 | consumed samples: 2122240 | consumed tokens: 4346347520 | elapsed time per iteration (s): 0.22 | learning rate: 1.810E-04 | global batch size: 256 | lm loss: 3.929858E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.439 | TFLOPs: 29.36 | +7: iteration 8300/ 37905 | consumed samples: 2124800 | consumed tokens: 4351590400 | elapsed time per iteration (s): 0.22 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.927436E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.379 | TFLOPs: 29.41 | +7: iteration 8310/ 37905 | consumed samples: 2127360 | consumed tokens: 4356833280 | elapsed time per iteration (s): 0.22 | learning rate: 1.809E-04 | global batch size: 256 | lm loss: 3.928614E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.905 | TFLOPs: 29.40 | +7: iteration 8320/ 37905 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 0.22 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.921541E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.106 | TFLOPs: 29.43 | +7: iteration 8330/ 37905 | consumed samples: 2132480 | consumed tokens: 4367319040 | elapsed time per iteration (s): 0.22 | learning rate: 1.808E-04 | global batch size: 256 | lm loss: 3.935262E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.045 | TFLOPs: 29.42 | +7: iteration 8340/ 37905 | consumed samples: 2135040 | consumed tokens: 4372561920 | elapsed time per iteration (s): 0.22 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.930840E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.718 | TFLOPs: 29.44 | +7: iteration 8350/ 37905 | consumed samples: 2137600 | consumed tokens: 4377804800 | elapsed time per iteration (s): 0.22 | learning rate: 1.807E-04 | global batch size: 256 | lm loss: 3.928543E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.069 | TFLOPs: 29.45 | +7: iteration 8360/ 37905 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 0.22 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.909772E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.598 | TFLOPs: 29.44 | +7: iteration 8370/ 37905 | consumed samples: 2142720 | consumed tokens: 4388290560 | elapsed time per iteration (s): 0.22 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.934565E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.642 | TFLOPs: 29.44 | +7: iteration 8380/ 37905 | consumed samples: 2145280 | consumed tokens: 4393533440 | elapsed time per iteration (s): 0.22 | learning rate: 1.806E-04 | global batch size: 256 | lm loss: 3.937955E+00 | grad norm: 0.440 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.382 | TFLOPs: 29.23 | +7: iteration 8390/ 37905 | consumed samples: 2147840 | consumed tokens: 4398776320 | elapsed time per iteration (s): 0.22 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.924519E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.387 | TFLOPs: 29.46 | +7: iteration 8400/ 37905 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 0.22 | learning rate: 1.805E-04 | global batch size: 256 | lm loss: 3.929369E+00 | grad norm: 0.351 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.520 | TFLOPs: 29.46 | +7: iteration 8410/ 37905 | consumed samples: 2152960 | consumed tokens: 4409262080 | elapsed time per iteration (s): 0.22 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.910216E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.622 | TFLOPs: 29.47 | +7: iteration 8420/ 37905 | consumed samples: 2155520 | consumed tokens: 4414504960 | elapsed time per iteration (s): 0.22 | learning rate: 1.804E-04 | global batch size: 256 | lm loss: 3.925543E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.532 | TFLOPs: 29.16 | +7: iteration 8430/ 37905 | consumed samples: 2158080 | consumed tokens: 4419747840 | elapsed time per iteration (s): 0.23 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.920329E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.479 | TFLOPs: 28.82 | +7: iteration 8440/ 37905 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 0.22 | learning rate: 1.803E-04 | global batch size: 256 | lm loss: 3.910067E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.121 | TFLOPs: 29.45 | +7: iteration 8450/ 37905 | consumed samples: 2163200 | consumed tokens: 4430233600 | elapsed time per iteration (s): 0.22 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.936020E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.050 | TFLOPs: 29.43 | +7: iteration 8460/ 37905 | consumed samples: 2165760 | consumed tokens: 4435476480 | elapsed time per iteration (s): 0.22 | learning rate: 1.802E-04 | global batch size: 256 | lm loss: 3.915897E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.403 | TFLOPs: 29.43 | +7: iteration 8470/ 37905 | consumed samples: 2168320 | consumed tokens: 4440719360 | elapsed time per iteration (s): 0.22 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.943418E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.816 | TFLOPs: 29.42 | +7: iteration 8480/ 37905 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 0.22 | learning rate: 1.801E-04 | global batch size: 256 | lm loss: 3.916274E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.062 | TFLOPs: 29.43 | +7: iteration 8490/ 37905 | consumed samples: 2173440 | consumed tokens: 4451205120 | elapsed time per iteration (s): 0.22 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.929562E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.302 | TFLOPs: 29.43 | +7: iteration 8500/ 37905 | consumed samples: 2176000 | consumed tokens: 4456448000 | elapsed time per iteration (s): 0.22 | learning rate: 1.800E-04 | global batch size: 256 | lm loss: 3.919778E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.796 | TFLOPs: 29.44 | +7: iteration 8510/ 37905 | consumed samples: 2178560 | consumed tokens: 4461690880 | elapsed time per iteration (s): 0.22 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.926474E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.991 | TFLOPs: 29.42 | +7: iteration 8520/ 37905 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 0.22 | learning rate: 1.799E-04 | global batch size: 256 | lm loss: 3.920732E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.252 | TFLOPs: 29.43 | +7: iteration 8530/ 37905 | consumed samples: 2183680 | consumed tokens: 4472176640 | elapsed time per iteration (s): 0.22 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.905692E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.528 | TFLOPs: 29.44 | +7: iteration 8540/ 37905 | consumed samples: 2186240 | consumed tokens: 4477419520 | elapsed time per iteration (s): 0.22 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.918704E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.088 | TFLOPs: 29.40 | +7: iteration 8550/ 37905 | consumed samples: 2188800 | consumed tokens: 4482662400 | elapsed time per iteration (s): 0.22 | learning rate: 1.798E-04 | global batch size: 256 | lm loss: 3.910493E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.698 | TFLOPs: 29.39 | +7: iteration 8560/ 37905 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 0.22 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.924805E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.473 | TFLOPs: 29.38 | +7: iteration 8570/ 37905 | consumed samples: 2193920 | consumed tokens: 4493148160 | elapsed time per iteration (s): 0.22 | learning rate: 1.797E-04 | global batch size: 256 | lm loss: 3.916848E+00 | grad norm: 0.375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.374 | TFLOPs: 29.46 | +7: iteration 8580/ 37905 | consumed samples: 2196480 | consumed tokens: 4498391040 | elapsed time per iteration (s): 0.22 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.908025E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.375 | TFLOPs: 29.46 | +7: iteration 8590/ 37905 | consumed samples: 2199040 | consumed tokens: 4503633920 | elapsed time per iteration (s): 0.22 | learning rate: 1.796E-04 | global batch size: 256 | lm loss: 3.922288E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.950 | TFLOPs: 29.47 | +7: iteration 8600/ 37905 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 0.22 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.920121E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.340 | TFLOPs: 29.43 | +7: iteration 8610/ 37905 | consumed samples: 2204160 | consumed tokens: 4514119680 | elapsed time per iteration (s): 0.22 | learning rate: 1.795E-04 | global batch size: 256 | lm loss: 3.904320E+00 | grad norm: 0.329 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.167 | TFLOPs: 29.35 | +7: iteration 8620/ 37905 | consumed samples: 2206720 | consumed tokens: 4519362560 | elapsed time per iteration (s): 0.22 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.912189E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.756 | TFLOPs: 29.39 | +7: iteration 8630/ 37905 | consumed samples: 2209280 | consumed tokens: 4524605440 | elapsed time per iteration (s): 0.22 | learning rate: 1.794E-04 | global batch size: 256 | lm loss: 3.910220E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.517 | TFLOPs: 29.21 | +7: iteration 8640/ 37905 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 0.22 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.919032E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.407 | TFLOPs: 29.43 | +7: iteration 8650/ 37905 | consumed samples: 2214400 | consumed tokens: 4535091200 | elapsed time per iteration (s): 0.22 | learning rate: 1.793E-04 | global batch size: 256 | lm loss: 3.905318E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.817 | TFLOPs: 29.44 | +7: iteration 8660/ 37905 | consumed samples: 2216960 | consumed tokens: 4540334080 | elapsed time per iteration (s): 0.22 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.917636E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.267 | TFLOPs: 29.43 | +7: iteration 8670/ 37905 | consumed samples: 2219520 | consumed tokens: 4545576960 | elapsed time per iteration (s): 0.22 | learning rate: 1.792E-04 | global batch size: 256 | lm loss: 3.933204E+00 | grad norm: 0.373 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.051 | TFLOPs: 29.45 | +7: iteration 8680/ 37905 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 0.22 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.922909E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.227 | TFLOPs: 29.43 | +7: iteration 8690/ 37905 | consumed samples: 2224640 | consumed tokens: 4556062720 | elapsed time per iteration (s): 0.22 | learning rate: 1.791E-04 | global batch size: 256 | lm loss: 3.903242E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.829 | TFLOPs: 29.44 | +7: iteration 8700/ 37905 | consumed samples: 2227200 | consumed tokens: 4561305600 | elapsed time per iteration (s): 0.22 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.916688E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.287 | TFLOPs: 29.41 | +7: iteration 8710/ 37905 | consumed samples: 2229760 | consumed tokens: 4566548480 | elapsed time per iteration (s): 0.22 | learning rate: 1.790E-04 | global batch size: 256 | lm loss: 3.913987E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.451 | TFLOPs: 29.41 | +7: iteration 8720/ 37905 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 0.22 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.906614E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.179 | TFLOPs: 29.40 | +7: iteration 8730/ 37905 | consumed samples: 2234880 | consumed tokens: 4577034240 | elapsed time per iteration (s): 0.23 | learning rate: 1.789E-04 | global batch size: 256 | lm loss: 3.911819E+00 | grad norm: 0.383 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.097 | TFLOPs: 28.94 | +7: iteration 8740/ 37905 | consumed samples: 2237440 | consumed tokens: 4582277120 | elapsed time per iteration (s): 0.22 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.923080E+00 | grad norm: 0.357 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.211 | TFLOPs: 29.35 | +7: iteration 8750/ 37905 | consumed samples: 2240000 | consumed tokens: 4587520000 | elapsed time per iteration (s): 0.22 | learning rate: 1.788E-04 | global batch size: 256 | lm loss: 3.914418E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.987 | TFLOPs: 29.40 | +7: iteration 8760/ 37905 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 0.22 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.904372E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.714 | TFLOPs: 29.37 | +7: iteration 8770/ 37905 | consumed samples: 2245120 | consumed tokens: 4598005760 | elapsed time per iteration (s): 0.22 | learning rate: 1.787E-04 | global batch size: 256 | lm loss: 3.915708E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.316 | TFLOPs: 29.38 | +7: iteration 8780/ 37905 | consumed samples: 2247680 | consumed tokens: 4603248640 | elapsed time per iteration (s): 0.22 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.902945E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.571 | TFLOPs: 29.41 | +7: iteration 8790/ 37905 | consumed samples: 2250240 | consumed tokens: 4608491520 | elapsed time per iteration (s): 0.22 | learning rate: 1.786E-04 | global batch size: 256 | lm loss: 3.899205E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.975 | TFLOPs: 29.40 | +7: iteration 8800/ 37905 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 0.22 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.910602E+00 | grad norm: 0.340 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.663 | TFLOPs: 29.42 | +7: iteration 8810/ 37905 | consumed samples: 2255360 | consumed tokens: 4618977280 | elapsed time per iteration (s): 0.22 | learning rate: 1.785E-04 | global batch size: 256 | lm loss: 3.916121E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.393 | TFLOPs: 29.41 | +7: iteration 8820/ 37905 | consumed samples: 2257920 | consumed tokens: 4624220160 | elapsed time per iteration (s): 0.22 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.897799E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.914 | TFLOPs: 29.42 | +7: iteration 8830/ 37905 | consumed samples: 2260480 | consumed tokens: 4629463040 | elapsed time per iteration (s): 0.22 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.925098E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.004 | TFLOPs: 29.42 | +7: iteration 8840/ 37905 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 0.22 | learning rate: 1.784E-04 | global batch size: 256 | lm loss: 3.916856E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.579 | TFLOPs: 29.41 | +7: iteration 8850/ 37905 | consumed samples: 2265600 | consumed tokens: 4639948800 | elapsed time per iteration (s): 0.22 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.904343E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.046 | TFLOPs: 29.40 | +7: iteration 8860/ 37905 | consumed samples: 2268160 | consumed tokens: 4645191680 | elapsed time per iteration (s): 0.22 | learning rate: 1.783E-04 | global batch size: 256 | lm loss: 3.912359E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.849 | TFLOPs: 29.39 | +7: iteration 8870/ 37905 | consumed samples: 2270720 | consumed tokens: 4650434560 | elapsed time per iteration (s): 0.22 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.907470E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.437 | TFLOPs: 29.38 | +7: iteration 8880/ 37905 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 0.22 | learning rate: 1.782E-04 | global batch size: 256 | lm loss: 3.919999E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.159 | TFLOPs: 29.40 | +7: iteration 8890/ 37905 | consumed samples: 2275840 | consumed tokens: 4660920320 | elapsed time per iteration (s): 0.22 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.916137E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.575 | TFLOPs: 29.39 | +7: iteration 8900/ 37905 | consumed samples: 2278400 | consumed tokens: 4666163200 | elapsed time per iteration (s): 0.22 | learning rate: 1.781E-04 | global batch size: 256 | lm loss: 3.900328E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.551 | TFLOPs: 29.39 | +7: iteration 8910/ 37905 | consumed samples: 2280960 | consumed tokens: 4671406080 | elapsed time per iteration (s): 0.22 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.917109E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.666 | TFLOPs: 29.42 | +7: iteration 8920/ 37905 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 0.22 | learning rate: 1.780E-04 | global batch size: 256 | lm loss: 3.908854E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.515 | TFLOPs: 29.41 | +7: iteration 8930/ 37905 | consumed samples: 2286080 | consumed tokens: 4681891840 | elapsed time per iteration (s): 0.22 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.927416E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.939 | TFLOPs: 29.40 | +7: iteration 8940/ 37905 | consumed samples: 2288640 | consumed tokens: 4687134720 | elapsed time per iteration (s): 0.22 | learning rate: 1.779E-04 | global batch size: 256 | lm loss: 3.909389E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.167 | TFLOPs: 29.40 | +7: iteration 8950/ 37905 | consumed samples: 2291200 | consumed tokens: 4692377600 | elapsed time per iteration (s): 0.22 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.918010E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.085 | TFLOPs: 29.40 | +7: iteration 8960/ 37905 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 0.22 | learning rate: 1.778E-04 | global batch size: 256 | lm loss: 3.915067E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.471 | TFLOPs: 29.41 | +7: iteration 8970/ 37905 | consumed samples: 2296320 | consumed tokens: 4702863360 | elapsed time per iteration (s): 0.22 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.911806E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.217 | TFLOPs: 29.38 | +7: iteration 8980/ 37905 | consumed samples: 2298880 | consumed tokens: 4708106240 | elapsed time per iteration (s): 0.22 | learning rate: 1.777E-04 | global batch size: 256 | lm loss: 3.889901E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.398 | TFLOPs: 29.38 | +7: iteration 8990/ 37905 | consumed samples: 2301440 | consumed tokens: 4713349120 | elapsed time per iteration (s): 0.22 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.901064E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.049 | TFLOPs: 29.40 | +7: iteration 9000/ 37905 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 0.22 | learning rate: 1.776E-04 | global batch size: 256 | lm loss: 3.907154E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.903 | TFLOPs: 29.40 | +7: ----------------------------------------------------------------------------------------------- +7: validation loss at iteration 9000 | lm loss value: 3.833609E+00 | lm loss PPL: 4.622908E+01 | +7: ----------------------------------------------------------------------------------------------- +0: saving checkpoint at iteration 9000 to checkpoints_83m20b400m +0: [2023-03-15 22:31:01,639] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step9000 is begin to save! +0: [2023-03-15 22:31:01,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:31:01,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:31:01,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:31:01,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:31:01,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:31:01,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:31:01,736] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:31:01,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:31:01,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:31:01,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:31:01,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:31:01,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:31:01,769] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:31:01,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:31:01,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:31:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:31:01,791] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:31:01,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:31:01,802] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:31:01,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:31:01,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:31:01,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:31:01,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:31:01,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:31:01,826] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step9000/mp_rank_00_model_states.pt +0: [2023-03-15 22:31:01,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:31:01,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:31:01,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:31:01,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +1: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +4: [2023-03-15 22:31:01,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +6: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +7: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +3: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +5: [2023-03-15 22:31:01,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:31:01,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +2: [2023-03-15 22:31:01,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:31:01,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step9000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:31:01,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step9000 is ready now! +0: successfully saved checkpoint at iteration 9000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 240.72 +7: iteration 9010/ 37905 | consumed samples: 2306560 | consumed tokens: 4723834880 | elapsed time per iteration (s): 0.25 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.910043E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1013.577 | TFLOPs: 25.82 | +7: iteration 9020/ 37905 | consumed samples: 2309120 | consumed tokens: 4729077760 | elapsed time per iteration (s): 0.22 | learning rate: 1.775E-04 | global batch size: 256 | lm loss: 3.900800E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.554 | TFLOPs: 29.36 | +7: iteration 9030/ 37905 | consumed samples: 2311680 | consumed tokens: 4734320640 | elapsed time per iteration (s): 0.22 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.914738E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.156 | TFLOPs: 29.35 | +7: iteration 9040/ 37905 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 0.22 | learning rate: 1.774E-04 | global batch size: 256 | lm loss: 3.912895E+00 | grad norm: 0.412 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.882 | TFLOPs: 29.34 | +7: iteration 9050/ 37905 | consumed samples: 2316800 | consumed tokens: 4744806400 | elapsed time per iteration (s): 0.22 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.922366E+00 | grad norm: 0.379 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.166 | TFLOPs: 29.33 | +7: iteration 9060/ 37905 | consumed samples: 2319360 | consumed tokens: 4750049280 | elapsed time per iteration (s): 0.22 | learning rate: 1.773E-04 | global batch size: 256 | lm loss: 3.912145E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.947 | TFLOPs: 29.30 | +7: iteration 9070/ 37905 | consumed samples: 2321920 | consumed tokens: 4755292160 | elapsed time per iteration (s): 0.22 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.908870E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.675 | TFLOPs: 29.26 | +7: iteration 9080/ 37905 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 0.22 | learning rate: 1.772E-04 | global batch size: 256 | lm loss: 3.922305E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.684 | TFLOPs: 29.26 | +7: iteration 9090/ 37905 | consumed samples: 2327040 | consumed tokens: 4765777920 | elapsed time per iteration (s): 0.22 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.911425E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.840 | TFLOPs: 29.24 | +7: iteration 9100/ 37905 | consumed samples: 2329600 | consumed tokens: 4771020800 | elapsed time per iteration (s): 0.22 | learning rate: 1.771E-04 | global batch size: 256 | lm loss: 3.915283E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.051 | TFLOPs: 29.25 | +7: iteration 9110/ 37905 | consumed samples: 2332160 | consumed tokens: 4776263680 | elapsed time per iteration (s): 0.22 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.902308E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.727 | TFLOPs: 29.24 | +7: iteration 9120/ 37905 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 0.22 | learning rate: 1.770E-04 | global batch size: 256 | lm loss: 3.889579E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.203 | TFLOPs: 29.25 | +7: iteration 9130/ 37905 | consumed samples: 2337280 | consumed tokens: 4786749440 | elapsed time per iteration (s): 0.22 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.917053E+00 | grad norm: 0.387 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.047 | TFLOPs: 29.25 | +7: iteration 9140/ 37905 | consumed samples: 2339840 | consumed tokens: 4791992320 | elapsed time per iteration (s): 0.22 | learning rate: 1.769E-04 | global batch size: 256 | lm loss: 3.900134E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.766 | TFLOPs: 29.09 | +7: iteration 9150/ 37905 | consumed samples: 2342400 | consumed tokens: 4797235200 | elapsed time per iteration (s): 0.23 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.905108E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.534 | TFLOPs: 28.67 | +7: iteration 9160/ 37905 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 0.22 | learning rate: 1.768E-04 | global batch size: 256 | lm loss: 3.897224E+00 | grad norm: 0.418 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.875 | TFLOPs: 29.11 | +7: iteration 9170/ 37905 | consumed samples: 2347520 | consumed tokens: 4807720960 | elapsed time per iteration (s): 0.22 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.895522E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.483 | TFLOPs: 29.31 | +7: iteration 9180/ 37905 | consumed samples: 2350080 | consumed tokens: 4812963840 | elapsed time per iteration (s): 0.22 | learning rate: 1.767E-04 | global batch size: 256 | lm loss: 3.911087E+00 | grad norm: 0.368 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.478 | TFLOPs: 29.31 | +7: iteration 9190/ 37905 | consumed samples: 2352640 | consumed tokens: 4818206720 | elapsed time per iteration (s): 0.22 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.898667E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.535 | TFLOPs: 29.31 | +7: iteration 9200/ 37905 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 0.22 | learning rate: 1.766E-04 | global batch size: 256 | lm loss: 3.901452E+00 | grad norm: 0.350 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.088 | TFLOPs: 29.32 | +7: iteration 9210/ 37905 | consumed samples: 2357760 | consumed tokens: 4828692480 | elapsed time per iteration (s): 0.22 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.904316E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.790 | TFLOPs: 29.32 | +7: iteration 9220/ 37905 | consumed samples: 2360320 | consumed tokens: 4833935360 | elapsed time per iteration (s): 0.22 | learning rate: 1.765E-04 | global batch size: 256 | lm loss: 3.898122E+00 | grad norm: 0.369 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.298 | TFLOPs: 29.33 | +7: iteration 9230/ 37905 | consumed samples: 2362880 | consumed tokens: 4839178240 | elapsed time per iteration (s): 0.22 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.899151E+00 | grad norm: 0.365 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.962 | TFLOPs: 29.32 | +7: iteration 9240/ 37905 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 0.22 | learning rate: 1.764E-04 | global batch size: 256 | lm loss: 3.901660E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.214 | TFLOPs: 29.28 | +7: iteration 9250/ 37905 | consumed samples: 2368000 | consumed tokens: 4849664000 | elapsed time per iteration (s): 0.22 | learning rate: 1.763E-04 | global batch size: 256 | lm loss: 3.901031E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.561 | TFLOPs: 29.29 | +7: iteration 9260/ 37905 | consumed samples: 2370560 | consumed tokens: 4854906880 | elapsed time per iteration (s): 0.22 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.904393E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.155 | TFLOPs: 29.25 | +7: iteration 9270/ 37905 | consumed samples: 2373120 | consumed tokens: 4860149760 | elapsed time per iteration (s): 0.22 | learning rate: 1.762E-04 | global batch size: 256 | lm loss: 3.898542E+00 | grad norm: 0.345 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.345 | TFLOPs: 29.28 | +7: iteration 9280/ 37905 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 0.22 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.918515E+00 | grad norm: 0.353 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.258 | TFLOPs: 29.28 | +7: iteration 9290/ 37905 | consumed samples: 2378240 | consumed tokens: 4870635520 | elapsed time per iteration (s): 0.22 | learning rate: 1.761E-04 | global batch size: 256 | lm loss: 3.883461E+00 | grad norm: 0.414 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.340 | TFLOPs: 29.28 | +7: iteration 9300/ 37905 | consumed samples: 2380800 | consumed tokens: 4875878400 | elapsed time per iteration (s): 0.22 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.900817E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.011 | TFLOPs: 29.27 | +7: iteration 9310/ 37905 | consumed samples: 2383360 | consumed tokens: 4881121280 | elapsed time per iteration (s): 0.22 | learning rate: 1.760E-04 | global batch size: 256 | lm loss: 3.907362E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.198 | TFLOPs: 29.25 | +7: iteration 9320/ 37905 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 0.22 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.888508E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.176 | TFLOPs: 29.22 | +7: iteration 9330/ 37905 | consumed samples: 2388480 | consumed tokens: 4891607040 | elapsed time per iteration (s): 0.22 | learning rate: 1.759E-04 | global batch size: 256 | lm loss: 3.893817E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.614 | TFLOPs: 29.29 | +7: iteration 9340/ 37905 | consumed samples: 2391040 | consumed tokens: 4896849920 | elapsed time per iteration (s): 0.22 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.897737E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.211 | TFLOPs: 29.28 | +7: iteration 9350/ 37905 | consumed samples: 2393600 | consumed tokens: 4902092800 | elapsed time per iteration (s): 0.22 | learning rate: 1.758E-04 | global batch size: 256 | lm loss: 3.913283E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.917 | TFLOPs: 29.37 | +7: iteration 9360/ 37905 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 0.22 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.898339E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.756 | TFLOPs: 29.32 | +7: iteration 9370/ 37905 | consumed samples: 2398720 | consumed tokens: 4912578560 | elapsed time per iteration (s): 0.22 | learning rate: 1.757E-04 | global batch size: 256 | lm loss: 3.892804E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.939 | TFLOPs: 29.35 | +7: iteration 9380/ 37905 | consumed samples: 2401280 | consumed tokens: 4917821440 | elapsed time per iteration (s): 0.22 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.899727E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.262 | TFLOPs: 29.33 | +7: iteration 9390/ 37905 | consumed samples: 2403840 | consumed tokens: 4923064320 | elapsed time per iteration (s): 0.22 | learning rate: 1.756E-04 | global batch size: 256 | lm loss: 3.893318E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.235 | TFLOPs: 29.30 | +7: iteration 9400/ 37905 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 0.22 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.900908E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.794 | TFLOPs: 29.34 | +7: iteration 9410/ 37905 | consumed samples: 2408960 | consumed tokens: 4933550080 | elapsed time per iteration (s): 0.22 | learning rate: 1.755E-04 | global batch size: 256 | lm loss: 3.884056E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.096 | TFLOPs: 29.35 | +7: iteration 9420/ 37905 | consumed samples: 2411520 | consumed tokens: 4938792960 | elapsed time per iteration (s): 0.22 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.892896E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.120 | TFLOPs: 29.35 | +7: iteration 9430/ 37905 | consumed samples: 2414080 | consumed tokens: 4944035840 | elapsed time per iteration (s): 0.22 | learning rate: 1.754E-04 | global batch size: 256 | lm loss: 3.902490E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.620 | TFLOPs: 29.39 | +7: iteration 9440/ 37905 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 0.22 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.901550E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.088 | TFLOPs: 29.38 | +7: iteration 9450/ 37905 | consumed samples: 2419200 | consumed tokens: 4954521600 | elapsed time per iteration (s): 0.22 | learning rate: 1.753E-04 | global batch size: 256 | lm loss: 3.877711E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.651 | TFLOPs: 29.41 | +7: iteration 9460/ 37905 | consumed samples: 2421760 | consumed tokens: 4959764480 | elapsed time per iteration (s): 0.22 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.902407E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.598 | TFLOPs: 29.39 | +7: iteration 9470/ 37905 | consumed samples: 2424320 | consumed tokens: 4965007360 | elapsed time per iteration (s): 0.22 | learning rate: 1.752E-04 | global batch size: 256 | lm loss: 3.891822E+00 | grad norm: 0.336 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.964 | TFLOPs: 29.40 | +7: iteration 9480/ 37905 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 0.22 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.890520E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.606 | TFLOPs: 29.39 | +7: iteration 9490/ 37905 | consumed samples: 2429440 | consumed tokens: 4975493120 | elapsed time per iteration (s): 0.22 | learning rate: 1.751E-04 | global batch size: 256 | lm loss: 3.882401E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.009 | TFLOPs: 29.40 | +7: iteration 9500/ 37905 | consumed samples: 2432000 | consumed tokens: 4980736000 | elapsed time per iteration (s): 0.22 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.892778E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.393 | TFLOPs: 29.41 | +7: iteration 9510/ 37905 | consumed samples: 2434560 | consumed tokens: 4985978880 | elapsed time per iteration (s): 0.22 | learning rate: 1.750E-04 | global batch size: 256 | lm loss: 3.882555E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.013 | TFLOPs: 29.42 | +7: iteration 9520/ 37905 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 0.22 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.891242E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.021 | TFLOPs: 29.42 | +7: iteration 9530/ 37905 | consumed samples: 2439680 | consumed tokens: 4996464640 | elapsed time per iteration (s): 0.22 | learning rate: 1.749E-04 | global batch size: 256 | lm loss: 3.898649E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.562 | TFLOPs: 29.39 | +7: iteration 9540/ 37905 | consumed samples: 2442240 | consumed tokens: 5001707520 | elapsed time per iteration (s): 0.22 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.878686E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.587 | TFLOPs: 29.39 | +7: iteration 9550/ 37905 | consumed samples: 2444800 | consumed tokens: 5006950400 | elapsed time per iteration (s): 0.22 | learning rate: 1.748E-04 | global batch size: 256 | lm loss: 3.892652E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.690 | TFLOPs: 29.39 | +7: iteration 9560/ 37905 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 0.22 | learning rate: 1.747E-04 | global batch size: 256 | lm loss: 3.895226E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.299 | TFLOPs: 29.38 | +7: iteration 9570/ 37905 | consumed samples: 2449920 | consumed tokens: 5017436160 | elapsed time per iteration (s): 0.22 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.887086E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.777 | TFLOPs: 29.37 | +7: iteration 9580/ 37905 | consumed samples: 2452480 | consumed tokens: 5022679040 | elapsed time per iteration (s): 0.22 | learning rate: 1.746E-04 | global batch size: 256 | lm loss: 3.902578E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.444 | TFLOPs: 29.33 | +7: iteration 9590/ 37905 | consumed samples: 2455040 | consumed tokens: 5027921920 | elapsed time per iteration (s): 0.22 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.906654E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.624 | TFLOPs: 29.34 | +7: iteration 9600/ 37905 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 0.22 | learning rate: 1.745E-04 | global batch size: 256 | lm loss: 3.900772E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.912 | TFLOPs: 29.35 | +7: iteration 9610/ 37905 | consumed samples: 2460160 | consumed tokens: 5038407680 | elapsed time per iteration (s): 0.22 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.903433E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.321 | TFLOPs: 29.36 | +7: iteration 9620/ 37905 | consumed samples: 2462720 | consumed tokens: 5043650560 | elapsed time per iteration (s): 0.22 | learning rate: 1.744E-04 | global batch size: 256 | lm loss: 3.887991E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.112 | TFLOPs: 29.25 | +7: iteration 9630/ 37905 | consumed samples: 2465280 | consumed tokens: 5048893440 | elapsed time per iteration (s): 0.22 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.889953E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.267 | TFLOPs: 29.28 | +7: iteration 9640/ 37905 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 0.22 | learning rate: 1.743E-04 | global batch size: 256 | lm loss: 3.887745E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.137 | TFLOPs: 29.25 | +7: iteration 9650/ 37905 | consumed samples: 2470400 | consumed tokens: 5059379200 | elapsed time per iteration (s): 0.22 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.879786E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.039 | TFLOPs: 29.25 | +7: iteration 9660/ 37905 | consumed samples: 2472960 | consumed tokens: 5064622080 | elapsed time per iteration (s): 0.22 | learning rate: 1.742E-04 | global batch size: 256 | lm loss: 3.881911E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.403 | TFLOPs: 29.26 | +7: iteration 9670/ 37905 | consumed samples: 2475520 | consumed tokens: 5069864960 | elapsed time per iteration (s): 0.22 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.891155E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.399 | TFLOPs: 29.26 | +7: iteration 9680/ 37905 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 0.22 | learning rate: 1.741E-04 | global batch size: 256 | lm loss: 3.888193E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.660 | TFLOPs: 29.26 | +7: iteration 9690/ 37905 | consumed samples: 2480640 | consumed tokens: 5080350720 | elapsed time per iteration (s): 0.22 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.876662E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.318 | TFLOPs: 29.30 | +7: iteration 9700/ 37905 | consumed samples: 2483200 | consumed tokens: 5085593600 | elapsed time per iteration (s): 0.22 | learning rate: 1.740E-04 | global batch size: 256 | lm loss: 3.889033E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.475 | TFLOPs: 29.36 | +7: iteration 9710/ 37905 | consumed samples: 2485760 | consumed tokens: 5090836480 | elapsed time per iteration (s): 0.22 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.877715E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.831 | TFLOPs: 29.24 | +7: iteration 9720/ 37905 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 0.22 | learning rate: 1.739E-04 | global batch size: 256 | lm loss: 3.884967E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.815 | TFLOPs: 29.37 | +7: iteration 9730/ 37905 | consumed samples: 2490880 | consumed tokens: 5101322240 | elapsed time per iteration (s): 0.22 | learning rate: 1.738E-04 | global batch size: 256 | lm loss: 3.900180E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.837 | TFLOPs: 29.34 | +7: iteration 9740/ 37905 | consumed samples: 2493440 | consumed tokens: 5106565120 | elapsed time per iteration (s): 0.22 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.880407E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.150 | TFLOPs: 29.30 | +7: iteration 9750/ 37905 | consumed samples: 2496000 | consumed tokens: 5111808000 | elapsed time per iteration (s): 0.22 | learning rate: 1.737E-04 | global batch size: 256 | lm loss: 3.885706E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.471 | TFLOPs: 29.26 | +7: iteration 9760/ 37905 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 0.22 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.884092E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.607 | TFLOPs: 29.01 | +7: iteration 9770/ 37905 | consumed samples: 2501120 | consumed tokens: 5122293760 | elapsed time per iteration (s): 0.22 | learning rate: 1.736E-04 | global batch size: 256 | lm loss: 3.887835E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.465 | TFLOPs: 29.31 | +7: iteration 9780/ 37905 | consumed samples: 2503680 | consumed tokens: 5127536640 | elapsed time per iteration (s): 0.22 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.896596E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.361 | TFLOPs: 29.33 | +7: iteration 9790/ 37905 | consumed samples: 2506240 | consumed tokens: 5132779520 | elapsed time per iteration (s): 0.22 | learning rate: 1.735E-04 | global batch size: 256 | lm loss: 3.889203E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.498 | TFLOPs: 29.33 | +7: iteration 9800/ 37905 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 0.23 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.889757E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1114.335 | TFLOPs: 28.39 | +7: iteration 9810/ 37905 | consumed samples: 2511360 | consumed tokens: 5143265280 | elapsed time per iteration (s): 0.22 | learning rate: 1.734E-04 | global batch size: 256 | lm loss: 3.876662E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.447 | TFLOPs: 29.31 | +7: iteration 9820/ 37905 | consumed samples: 2513920 | consumed tokens: 5148508160 | elapsed time per iteration (s): 0.22 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.886369E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.473 | TFLOPs: 29.28 | +7: iteration 9830/ 37905 | consumed samples: 2516480 | consumed tokens: 5153751040 | elapsed time per iteration (s): 0.22 | learning rate: 1.733E-04 | global batch size: 256 | lm loss: 3.889106E+00 | grad norm: 0.326 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.623 | TFLOPs: 29.31 | +7: iteration 9840/ 37905 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 0.22 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.890047E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.471 | TFLOPs: 29.36 | +7: iteration 9850/ 37905 | consumed samples: 2521600 | consumed tokens: 5164236800 | elapsed time per iteration (s): 0.22 | learning rate: 1.732E-04 | global batch size: 256 | lm loss: 3.894642E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.692 | TFLOPs: 29.31 | +7: iteration 9860/ 37905 | consumed samples: 2524160 | consumed tokens: 5169479680 | elapsed time per iteration (s): 0.22 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.889165E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.189 | TFLOPs: 29.30 | +7: iteration 9870/ 37905 | consumed samples: 2526720 | consumed tokens: 5174722560 | elapsed time per iteration (s): 0.22 | learning rate: 1.731E-04 | global batch size: 256 | lm loss: 3.876802E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.153 | TFLOPs: 29.07 | +7: iteration 9880/ 37905 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 0.22 | learning rate: 1.730E-04 | global batch size: 256 | lm loss: 3.895081E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.112 | TFLOPs: 29.27 | +7: iteration 9890/ 37905 | consumed samples: 2531840 | consumed tokens: 5185208320 | elapsed time per iteration (s): 0.22 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.887755E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.023 | TFLOPs: 29.30 | +7: iteration 9900/ 37905 | consumed samples: 2534400 | consumed tokens: 5190451200 | elapsed time per iteration (s): 0.22 | learning rate: 1.729E-04 | global batch size: 256 | lm loss: 3.895060E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.435 | TFLOPs: 29.31 | +7: iteration 9910/ 37905 | consumed samples: 2536960 | consumed tokens: 5195694080 | elapsed time per iteration (s): 0.22 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.887934E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.767 | TFLOPs: 29.32 | +7: iteration 9920/ 37905 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 0.22 | learning rate: 1.728E-04 | global batch size: 256 | lm loss: 3.894518E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.585 | TFLOPs: 29.31 | +7: iteration 9930/ 37905 | consumed samples: 2542080 | consumed tokens: 5206179840 | elapsed time per iteration (s): 0.22 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.890848E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.249 | TFLOPs: 29.30 | +7: iteration 9940/ 37905 | consumed samples: 2544640 | consumed tokens: 5211422720 | elapsed time per iteration (s): 0.22 | learning rate: 1.727E-04 | global batch size: 256 | lm loss: 3.887161E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.701 | TFLOPs: 29.34 | +7: iteration 9950/ 37905 | consumed samples: 2547200 | consumed tokens: 5216665600 | elapsed time per iteration (s): 0.22 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.895498E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.698 | TFLOPs: 29.34 | +7: iteration 9960/ 37905 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 0.22 | learning rate: 1.726E-04 | global batch size: 256 | lm loss: 3.885143E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.692 | TFLOPs: 29.34 | +7: iteration 9970/ 37905 | consumed samples: 2552320 | consumed tokens: 5227151360 | elapsed time per iteration (s): 0.22 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.893588E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.538 | TFLOPs: 29.34 | +7: iteration 9980/ 37905 | consumed samples: 2554880 | consumed tokens: 5232394240 | elapsed time per iteration (s): 0.22 | learning rate: 1.725E-04 | global batch size: 256 | lm loss: 3.891413E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.788 | TFLOPs: 29.34 | +7: iteration 9990/ 37905 | consumed samples: 2557440 | consumed tokens: 5237637120 | elapsed time per iteration (s): 0.22 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.886600E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.591 | TFLOPs: 29.34 | +0: [2023-03-15 22:34:44,514] [INFO] [logging.py:68:log_dist] [Rank 0] step=10000, skipped=0, lr=[0.00017235122366283088, 0.00017235122366283088, 0.00017235122366283088], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 10000/ 37905 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 0.22 | learning rate: 1.724E-04 | global batch size: 256 | lm loss: 3.873353E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.205 | TFLOPs: 29.35 | +0: steps: 10000 loss: 3.8520 iter time (s): 0.221 samples/sec: 1158.881 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 10000 | lm loss value: 3.779798E+00 | lm loss PPL: 4.380717E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 10000 to checkpoints_83m20b400m +0: [2023-03-15 22:34:44,601] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step10000 is begin to save! +0: [2023-03-15 22:34:44,604] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:34:44,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:34:44,676] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:34:44,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:34:44,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:34:44,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:34:44,699] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:34:44,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:34:44,710] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:34:44,721] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:34:44,721] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:34:44,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:34:44,732] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:34:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:34:44,744] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:34:44,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:34:44,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:34:44,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:34:44,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:34:44,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:34:44,777] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:34:44,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:34:44,788] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:34:44,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:34:44,790] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step10000/mp_rank_00_model_states.pt +0: [2023-03-15 22:34:44,790] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:34:44,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,809] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,829] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,829] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,830] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,830] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,831] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:34:44,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +3: [2023-03-15 22:34:44,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-15 22:34:44,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +2: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +1: [2023-03-15 22:34:44,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +4: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:34:44,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:34:44,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +7: [2023-03-15 22:34:44,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +7: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +5: [2023-03-15 22:34:44,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:34:44,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +6: [2023-03-15 22:34:44,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:34:44,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step10000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:34:44,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step10000 is ready now! +0: successfully saved checkpoint at iteration 10000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 243.85 +7: iteration 10010/ 37905 | consumed samples: 2562560 | consumed tokens: 5248122880 | elapsed time per iteration (s): 0.25 | learning rate: 1.723E-04 | global batch size: 256 | lm loss: 3.876786E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1006.125 | TFLOPs: 25.63 | +7: iteration 10020/ 37905 | consumed samples: 2565120 | consumed tokens: 5253365760 | elapsed time per iteration (s): 0.22 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.877598E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.052 | TFLOPs: 29.32 | +7: iteration 10030/ 37905 | consumed samples: 2567680 | consumed tokens: 5258608640 | elapsed time per iteration (s): 0.22 | learning rate: 1.722E-04 | global batch size: 256 | lm loss: 3.887808E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.507 | TFLOPs: 29.31 | +7: iteration 10040/ 37905 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 0.22 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.880196E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.300 | TFLOPs: 29.33 | +7: iteration 10050/ 37905 | consumed samples: 2572800 | consumed tokens: 5269094400 | elapsed time per iteration (s): 0.23 | learning rate: 1.721E-04 | global batch size: 256 | lm loss: 3.886744E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.005 | TFLOPs: 28.66 | +7: iteration 10060/ 37905 | consumed samples: 2575360 | consumed tokens: 5274337280 | elapsed time per iteration (s): 0.22 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.873527E+00 | grad norm: 0.346 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.157 | TFLOPs: 29.30 | +7: iteration 10070/ 37905 | consumed samples: 2577920 | consumed tokens: 5279580160 | elapsed time per iteration (s): 0.22 | learning rate: 1.720E-04 | global batch size: 256 | lm loss: 3.898214E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.227 | TFLOPs: 29.33 | +7: iteration 10080/ 37905 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 0.22 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.880749E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.651 | TFLOPs: 29.31 | +7: iteration 10090/ 37905 | consumed samples: 2583040 | consumed tokens: 5290065920 | elapsed time per iteration (s): 0.22 | learning rate: 1.719E-04 | global batch size: 256 | lm loss: 3.871943E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.790 | TFLOPs: 29.32 | +7: iteration 10100/ 37905 | consumed samples: 2585600 | consumed tokens: 5295308800 | elapsed time per iteration (s): 0.22 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.895148E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.386 | TFLOPs: 29.31 | +7: iteration 10110/ 37905 | consumed samples: 2588160 | consumed tokens: 5300551680 | elapsed time per iteration (s): 0.22 | learning rate: 1.718E-04 | global batch size: 256 | lm loss: 3.879435E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.690 | TFLOPs: 29.31 | +7: iteration 10120/ 37905 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 0.22 | learning rate: 1.717E-04 | global batch size: 256 | lm loss: 3.875307E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.761 | TFLOPs: 29.29 | +7: iteration 10130/ 37905 | consumed samples: 2593280 | consumed tokens: 5311037440 | elapsed time per iteration (s): 0.22 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.872182E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.335 | TFLOPs: 29.28 | +7: iteration 10140/ 37905 | consumed samples: 2595840 | consumed tokens: 5316280320 | elapsed time per iteration (s): 0.23 | learning rate: 1.716E-04 | global batch size: 256 | lm loss: 3.870569E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.619 | TFLOPs: 28.93 | +7: iteration 10150/ 37905 | consumed samples: 2598400 | consumed tokens: 5321523200 | elapsed time per iteration (s): 0.22 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.886274E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.440 | TFLOPs: 29.28 | +7: iteration 10160/ 37905 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 0.22 | learning rate: 1.715E-04 | global batch size: 256 | lm loss: 3.870461E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.995 | TFLOPs: 29.32 | +7: iteration 10170/ 37905 | consumed samples: 2603520 | consumed tokens: 5332008960 | elapsed time per iteration (s): 0.22 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.884782E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.733 | TFLOPs: 29.32 | +7: iteration 10180/ 37905 | consumed samples: 2606080 | consumed tokens: 5337251840 | elapsed time per iteration (s): 0.22 | learning rate: 1.714E-04 | global batch size: 256 | lm loss: 3.881171E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.609 | TFLOPs: 29.31 | +7: iteration 10190/ 37905 | consumed samples: 2608640 | consumed tokens: 5342494720 | elapsed time per iteration (s): 0.22 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.882586E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.294 | TFLOPs: 29.30 | +7: iteration 10200/ 37905 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 0.22 | learning rate: 1.713E-04 | global batch size: 256 | lm loss: 3.870918E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.245 | TFLOPs: 29.33 | +7: iteration 10210/ 37905 | consumed samples: 2613760 | consumed tokens: 5352980480 | elapsed time per iteration (s): 0.22 | learning rate: 1.712E-04 | global batch size: 256 | lm loss: 3.877795E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.212 | TFLOPs: 29.30 | +7: iteration 10220/ 37905 | consumed samples: 2616320 | consumed tokens: 5358223360 | elapsed time per iteration (s): 0.22 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.877298E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.696 | TFLOPs: 29.31 | +7: iteration 10230/ 37905 | consumed samples: 2618880 | consumed tokens: 5363466240 | elapsed time per iteration (s): 0.22 | learning rate: 1.711E-04 | global batch size: 256 | lm loss: 3.867500E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.601 | TFLOPs: 29.34 | +7: iteration 10240/ 37905 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 0.22 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.872293E+00 | grad norm: 0.339 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.358 | TFLOPs: 29.31 | +7: iteration 10250/ 37905 | consumed samples: 2624000 | consumed tokens: 5373952000 | elapsed time per iteration (s): 0.22 | learning rate: 1.710E-04 | global batch size: 256 | lm loss: 3.869346E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.799 | TFLOPs: 29.32 | +7: iteration 10260/ 37905 | consumed samples: 2626560 | consumed tokens: 5379194880 | elapsed time per iteration (s): 0.22 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.881589E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.142 | TFLOPs: 29.33 | +7: iteration 10270/ 37905 | consumed samples: 2629120 | consumed tokens: 5384437760 | elapsed time per iteration (s): 0.22 | learning rate: 1.709E-04 | global batch size: 256 | lm loss: 3.875429E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.220 | TFLOPs: 29.30 | +7: iteration 10280/ 37905 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 0.22 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.877428E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.038 | TFLOPs: 29.27 | +7: iteration 10290/ 37905 | consumed samples: 2634240 | consumed tokens: 5394923520 | elapsed time per iteration (s): 0.22 | learning rate: 1.708E-04 | global batch size: 256 | lm loss: 3.873517E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.072 | TFLOPs: 29.27 | +7: iteration 10300/ 37905 | consumed samples: 2636800 | consumed tokens: 5400166400 | elapsed time per iteration (s): 0.22 | learning rate: 1.707E-04 | global batch size: 256 | lm loss: 3.871131E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.993 | TFLOPs: 29.27 | +7: iteration 10310/ 37905 | consumed samples: 2639360 | consumed tokens: 5405409280 | elapsed time per iteration (s): 0.22 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.866780E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.607 | TFLOPs: 29.26 | +7: iteration 10320/ 37905 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 0.22 | learning rate: 1.706E-04 | global batch size: 256 | lm loss: 3.883580E+00 | grad norm: 0.382 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.956 | TFLOPs: 29.27 | +7: iteration 10330/ 37905 | consumed samples: 2644480 | consumed tokens: 5415895040 | elapsed time per iteration (s): 0.22 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.876097E+00 | grad norm: 0.395 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.350 | TFLOPs: 29.28 | +7: iteration 10340/ 37905 | consumed samples: 2647040 | consumed tokens: 5421137920 | elapsed time per iteration (s): 0.22 | learning rate: 1.705E-04 | global batch size: 256 | lm loss: 3.867094E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.102 | TFLOPs: 29.27 | +7: iteration 10350/ 37905 | consumed samples: 2649600 | consumed tokens: 5426380800 | elapsed time per iteration (s): 0.22 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.876281E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.798 | TFLOPs: 29.29 | +7: iteration 10360/ 37905 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 0.22 | learning rate: 1.704E-04 | global batch size: 256 | lm loss: 3.875891E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.245 | TFLOPs: 29.35 | +7: iteration 10370/ 37905 | consumed samples: 2654720 | consumed tokens: 5436866560 | elapsed time per iteration (s): 0.22 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.865957E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.500 | TFLOPs: 29.36 | +7: iteration 10380/ 37905 | consumed samples: 2657280 | consumed tokens: 5442109440 | elapsed time per iteration (s): 0.22 | learning rate: 1.703E-04 | global batch size: 256 | lm loss: 3.881699E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.922 | TFLOPs: 29.35 | +7: iteration 10390/ 37905 | consumed samples: 2659840 | consumed tokens: 5447352320 | elapsed time per iteration (s): 0.22 | learning rate: 1.702E-04 | global batch size: 256 | lm loss: 3.869343E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.760 | TFLOPs: 29.34 | +7: iteration 10400/ 37905 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 0.22 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.883429E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.176 | TFLOPs: 29.35 | +7: iteration 10410/ 37905 | consumed samples: 2664960 | consumed tokens: 5457838080 | elapsed time per iteration (s): 0.22 | learning rate: 1.701E-04 | global batch size: 256 | lm loss: 3.872725E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.946 | TFLOPs: 29.37 | +7: iteration 10420/ 37905 | consumed samples: 2667520 | consumed tokens: 5463080960 | elapsed time per iteration (s): 0.22 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.888285E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.843 | TFLOPs: 29.37 | +7: iteration 10430/ 37905 | consumed samples: 2670080 | consumed tokens: 5468323840 | elapsed time per iteration (s): 0.22 | learning rate: 1.700E-04 | global batch size: 256 | lm loss: 3.857618E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.637 | TFLOPs: 29.36 | +7: iteration 10440/ 37905 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 0.22 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.873967E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.676 | TFLOPs: 29.34 | +7: iteration 10450/ 37905 | consumed samples: 2675200 | consumed tokens: 5478809600 | elapsed time per iteration (s): 0.22 | learning rate: 1.699E-04 | global batch size: 256 | lm loss: 3.878436E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.475 | TFLOPs: 29.36 | +7: iteration 10460/ 37905 | consumed samples: 2677760 | consumed tokens: 5484052480 | elapsed time per iteration (s): 0.22 | learning rate: 1.698E-04 | global batch size: 256 | lm loss: 3.881542E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.500 | TFLOPs: 29.36 | +7: iteration 10470/ 37905 | consumed samples: 2680320 | consumed tokens: 5489295360 | elapsed time per iteration (s): 0.22 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.884517E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.128 | TFLOPs: 29.35 | +7: iteration 10480/ 37905 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 0.22 | learning rate: 1.697E-04 | global batch size: 256 | lm loss: 3.865559E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.325 | TFLOPs: 29.33 | +7: iteration 10490/ 37905 | consumed samples: 2685440 | consumed tokens: 5499781120 | elapsed time per iteration (s): 0.22 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.869542E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.273 | TFLOPs: 29.33 | +7: iteration 10500/ 37905 | consumed samples: 2688000 | consumed tokens: 5505024000 | elapsed time per iteration (s): 0.22 | learning rate: 1.696E-04 | global batch size: 256 | lm loss: 3.886011E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.529 | TFLOPs: 29.34 | +7: iteration 10510/ 37905 | consumed samples: 2690560 | consumed tokens: 5510266880 | elapsed time per iteration (s): 0.22 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.882427E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.146 | TFLOPs: 29.35 | +7: iteration 10520/ 37905 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 0.22 | learning rate: 1.695E-04 | global batch size: 256 | lm loss: 3.870546E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.165 | TFLOPs: 29.35 | +7: iteration 10530/ 37905 | consumed samples: 2695680 | consumed tokens: 5520752640 | elapsed time per iteration (s): 0.22 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.861032E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.301 | TFLOPs: 29.33 | +7: iteration 10540/ 37905 | consumed samples: 2698240 | consumed tokens: 5525995520 | elapsed time per iteration (s): 0.22 | learning rate: 1.694E-04 | global batch size: 256 | lm loss: 3.868734E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.520 | TFLOPs: 29.34 | +7: iteration 10550/ 37905 | consumed samples: 2700800 | consumed tokens: 5531238400 | elapsed time per iteration (s): 0.22 | learning rate: 1.693E-04 | global batch size: 256 | lm loss: 3.872486E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.431 | TFLOPs: 29.33 | +7: iteration 10560/ 37905 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 0.22 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.859793E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.173 | TFLOPs: 29.35 | +7: iteration 10570/ 37905 | consumed samples: 2705920 | consumed tokens: 5541724160 | elapsed time per iteration (s): 0.22 | learning rate: 1.692E-04 | global batch size: 256 | lm loss: 3.863113E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.749 | TFLOPs: 29.34 | +7: iteration 10580/ 37905 | consumed samples: 2708480 | consumed tokens: 5546967040 | elapsed time per iteration (s): 0.22 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.871655E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.653 | TFLOPs: 29.34 | +7: iteration 10590/ 37905 | consumed samples: 2711040 | consumed tokens: 5552209920 | elapsed time per iteration (s): 0.22 | learning rate: 1.691E-04 | global batch size: 256 | lm loss: 3.873815E+00 | grad norm: 0.343 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.129 | TFLOPs: 29.35 | +7: iteration 10600/ 37905 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 0.22 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.870513E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.750 | TFLOPs: 29.34 | +7: iteration 10610/ 37905 | consumed samples: 2716160 | consumed tokens: 5562695680 | elapsed time per iteration (s): 0.22 | learning rate: 1.690E-04 | global batch size: 256 | lm loss: 3.863783E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.362 | TFLOPs: 29.31 | +7: iteration 10620/ 37905 | consumed samples: 2718720 | consumed tokens: 5567938560 | elapsed time per iteration (s): 0.22 | learning rate: 1.689E-04 | global batch size: 256 | lm loss: 3.868406E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.534 | TFLOPs: 29.28 | +7: iteration 10630/ 37905 | consumed samples: 2721280 | consumed tokens: 5573181440 | elapsed time per iteration (s): 0.22 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.870504E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.602 | TFLOPs: 29.31 | +7: iteration 10640/ 37905 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 0.22 | learning rate: 1.688E-04 | global batch size: 256 | lm loss: 3.865230E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.015 | TFLOPs: 29.30 | +7: iteration 10650/ 37905 | consumed samples: 2726400 | consumed tokens: 5583667200 | elapsed time per iteration (s): 0.22 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.854441E+00 | grad norm: 0.349 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.758 | TFLOPs: 29.32 | +7: iteration 10660/ 37905 | consumed samples: 2728960 | consumed tokens: 5588910080 | elapsed time per iteration (s): 0.22 | learning rate: 1.687E-04 | global batch size: 256 | lm loss: 3.858937E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.987 | TFLOPs: 29.30 | +7: iteration 10670/ 37905 | consumed samples: 2731520 | consumed tokens: 5594152960 | elapsed time per iteration (s): 0.22 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.862033E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.053 | TFLOPs: 29.30 | +7: iteration 10680/ 37905 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 0.22 | learning rate: 1.686E-04 | global batch size: 256 | lm loss: 3.870299E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.040 | TFLOPs: 29.25 | +7: iteration 10690/ 37905 | consumed samples: 2736640 | consumed tokens: 5604638720 | elapsed time per iteration (s): 0.22 | learning rate: 1.685E-04 | global batch size: 256 | lm loss: 3.877335E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.578 | TFLOPs: 29.34 | +7: iteration 10700/ 37905 | consumed samples: 2739200 | consumed tokens: 5609881600 | elapsed time per iteration (s): 0.22 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.870177E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.048 | TFLOPs: 29.35 | +7: iteration 10710/ 37905 | consumed samples: 2741760 | consumed tokens: 5615124480 | elapsed time per iteration (s): 0.22 | learning rate: 1.684E-04 | global batch size: 256 | lm loss: 3.876244E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.350 | TFLOPs: 29.31 | +7: iteration 10720/ 37905 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 0.22 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.859219E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.207 | TFLOPs: 29.23 | +7: iteration 10730/ 37905 | consumed samples: 2746880 | consumed tokens: 5625610240 | elapsed time per iteration (s): 0.22 | learning rate: 1.683E-04 | global batch size: 256 | lm loss: 3.865597E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.239 | TFLOPs: 29.23 | +7: iteration 10740/ 37905 | consumed samples: 2749440 | consumed tokens: 5630853120 | elapsed time per iteration (s): 0.22 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.866858E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.042 | TFLOPs: 29.22 | +7: iteration 10750/ 37905 | consumed samples: 2752000 | consumed tokens: 5636096000 | elapsed time per iteration (s): 0.22 | learning rate: 1.682E-04 | global batch size: 256 | lm loss: 3.873675E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.145 | TFLOPs: 28.99 | +7: iteration 10760/ 37905 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 0.22 | learning rate: 1.681E-04 | global batch size: 256 | lm loss: 3.862739E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.582 | TFLOPs: 29.21 | +7: iteration 10770/ 37905 | consumed samples: 2757120 | consumed tokens: 5646581760 | elapsed time per iteration (s): 0.22 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.872883E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.324 | TFLOPs: 29.28 | +7: iteration 10780/ 37905 | consumed samples: 2759680 | consumed tokens: 5651824640 | elapsed time per iteration (s): 0.22 | learning rate: 1.680E-04 | global batch size: 256 | lm loss: 3.865074E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.444 | TFLOPs: 29.36 | +7: iteration 10790/ 37905 | consumed samples: 2762240 | consumed tokens: 5657067520 | elapsed time per iteration (s): 0.22 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.871054E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.670 | TFLOPs: 29.36 | +7: iteration 10800/ 37905 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 0.22 | learning rate: 1.679E-04 | global batch size: 256 | lm loss: 3.857003E+00 | grad norm: 0.358 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.023 | TFLOPs: 29.37 | +7: iteration 10810/ 37905 | consumed samples: 2767360 | consumed tokens: 5667553280 | elapsed time per iteration (s): 0.22 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.863982E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.658 | TFLOPs: 29.39 | +7: iteration 10820/ 37905 | consumed samples: 2769920 | consumed tokens: 5672796160 | elapsed time per iteration (s): 0.22 | learning rate: 1.678E-04 | global batch size: 256 | lm loss: 3.865383E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.884 | TFLOPs: 29.40 | +7: iteration 10830/ 37905 | consumed samples: 2772480 | consumed tokens: 5678039040 | elapsed time per iteration (s): 0.22 | learning rate: 1.677E-04 | global batch size: 256 | lm loss: 3.871013E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.246 | TFLOPs: 29.35 | +7: iteration 10840/ 37905 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 0.22 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.858629E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.462 | TFLOPs: 29.41 | +7: iteration 10850/ 37905 | consumed samples: 2777600 | consumed tokens: 5688524800 | elapsed time per iteration (s): 0.22 | learning rate: 1.676E-04 | global batch size: 256 | lm loss: 3.855872E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.340 | TFLOPs: 29.41 | +7: iteration 10860/ 37905 | consumed samples: 2780160 | consumed tokens: 5693767680 | elapsed time per iteration (s): 0.22 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.868889E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.202 | TFLOPs: 29.38 | +7: iteration 10870/ 37905 | consumed samples: 2782720 | consumed tokens: 5699010560 | elapsed time per iteration (s): 0.22 | learning rate: 1.675E-04 | global batch size: 256 | lm loss: 3.857948E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.971 | TFLOPs: 29.40 | +7: iteration 10880/ 37905 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 0.22 | learning rate: 1.674E-04 | global batch size: 256 | lm loss: 3.853617E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.468 | TFLOPs: 29.41 | +7: iteration 10890/ 37905 | consumed samples: 2787840 | consumed tokens: 5709496320 | elapsed time per iteration (s): 0.22 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.858038E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.156 | TFLOPs: 29.38 | +7: iteration 10900/ 37905 | consumed samples: 2790400 | consumed tokens: 5714739200 | elapsed time per iteration (s): 0.22 | learning rate: 1.673E-04 | global batch size: 256 | lm loss: 3.859146E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.499 | TFLOPs: 29.41 | +7: iteration 10910/ 37905 | consumed samples: 2792960 | consumed tokens: 5719982080 | elapsed time per iteration (s): 0.22 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.859219E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.654 | TFLOPs: 29.41 | +7: iteration 10920/ 37905 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 0.22 | learning rate: 1.672E-04 | global batch size: 256 | lm loss: 3.859707E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.413 | TFLOPs: 29.41 | +7: iteration 10930/ 37905 | consumed samples: 2798080 | consumed tokens: 5730467840 | elapsed time per iteration (s): 0.22 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.856504E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.916 | TFLOPs: 29.42 | +7: iteration 10940/ 37905 | consumed samples: 2800640 | consumed tokens: 5735710720 | elapsed time per iteration (s): 0.22 | learning rate: 1.671E-04 | global batch size: 256 | lm loss: 3.852799E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.890 | TFLOPs: 29.42 | +7: iteration 10950/ 37905 | consumed samples: 2803200 | consumed tokens: 5740953600 | elapsed time per iteration (s): 0.22 | learning rate: 1.670E-04 | global batch size: 256 | lm loss: 3.851611E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.954 | TFLOPs: 29.42 | +7: iteration 10960/ 37905 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 0.22 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.871011E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.780 | TFLOPs: 29.42 | +7: iteration 10970/ 37905 | consumed samples: 2808320 | consumed tokens: 5751439360 | elapsed time per iteration (s): 0.22 | learning rate: 1.669E-04 | global batch size: 256 | lm loss: 3.863811E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.594 | TFLOPs: 29.41 | +7: iteration 10980/ 37905 | consumed samples: 2810880 | consumed tokens: 5756682240 | elapsed time per iteration (s): 0.22 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.849948E+00 | grad norm: 0.324 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.533 | TFLOPs: 29.41 | +7: iteration 10990/ 37905 | consumed samples: 2813440 | consumed tokens: 5761925120 | elapsed time per iteration (s): 0.22 | learning rate: 1.668E-04 | global batch size: 256 | lm loss: 3.859346E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.115 | TFLOPs: 29.43 | +7: iteration 11000/ 37905 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 0.22 | learning rate: 1.667E-04 | global batch size: 256 | lm loss: 3.864420E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.235 | TFLOPs: 29.40 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 11000 | lm loss value: 3.853655E+00 | lm loss PPL: 4.716515E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 11000 to checkpoints_83m20b400m +0: [2023-03-15 22:38:27,340] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step11000 is begin to save! +0: [2023-03-15 22:38:27,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:38:27,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:38:27,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:38:27,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:38:27,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:38:27,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:38:27,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:38:27,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:38:27,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:38:27,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:38:27,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:38:27,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:38:27,471] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:38:27,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:38:27,483] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:38:27,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:38:27,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:38:27,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:38:27,505] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:38:27,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:38:27,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:38:27,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:38:27,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:38:27,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:38:27,529] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step11000/mp_rank_00_model_states.pt +0: [2023-03-15 22:38:27,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:38:27,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:38:27,547] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:38:27,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +1: [2023-03-15 22:38:27,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +5: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +4: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +7: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +3: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +2: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:38:27,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:38:27,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +6: [2023-03-15 22:38:27,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:38:27,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step11000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:38:27,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step11000 is ready now! +0: successfully saved checkpoint at iteration 11000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 240.22 +7: iteration 11010/ 37905 | consumed samples: 2818560 | consumed tokens: 5772410880 | elapsed time per iteration (s): 0.25 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.860648E+00 | grad norm: 0.427 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1016.580 | TFLOPs: 25.90 | +7: iteration 11020/ 37905 | consumed samples: 2821120 | consumed tokens: 5777653760 | elapsed time per iteration (s): 0.22 | learning rate: 1.666E-04 | global batch size: 256 | lm loss: 3.858609E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.628 | TFLOPs: 29.41 | +7: iteration 11030/ 37905 | consumed samples: 2823680 | consumed tokens: 5782896640 | elapsed time per iteration (s): 0.22 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.858435E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.198 | TFLOPs: 29.43 | +7: iteration 11040/ 37905 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 0.22 | learning rate: 1.665E-04 | global batch size: 256 | lm loss: 3.866629E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.624 | TFLOPs: 29.44 | +7: iteration 11050/ 37905 | consumed samples: 2828800 | consumed tokens: 5793382400 | elapsed time per iteration (s): 0.22 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.849988E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.050 | TFLOPs: 29.43 | +7: iteration 11060/ 37905 | consumed samples: 2831360 | consumed tokens: 5798625280 | elapsed time per iteration (s): 0.22 | learning rate: 1.664E-04 | global batch size: 256 | lm loss: 3.852749E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.800 | TFLOPs: 29.42 | +7: iteration 11070/ 37905 | consumed samples: 2833920 | consumed tokens: 5803868160 | elapsed time per iteration (s): 0.22 | learning rate: 1.663E-04 | global batch size: 256 | lm loss: 3.865215E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.071 | TFLOPs: 29.43 | +7: iteration 11080/ 37905 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 0.22 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.864890E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.872 | TFLOPs: 29.42 | +7: iteration 11090/ 37905 | consumed samples: 2839040 | consumed tokens: 5814353920 | elapsed time per iteration (s): 0.22 | learning rate: 1.662E-04 | global batch size: 256 | lm loss: 3.851180E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.573 | TFLOPs: 29.36 | +7: iteration 11100/ 37905 | consumed samples: 2841600 | consumed tokens: 5819596800 | elapsed time per iteration (s): 0.22 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.853472E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.323 | TFLOPs: 29.33 | +7: iteration 11110/ 37905 | consumed samples: 2844160 | consumed tokens: 5824839680 | elapsed time per iteration (s): 0.22 | learning rate: 1.661E-04 | global batch size: 256 | lm loss: 3.854374E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.713 | TFLOPs: 29.34 | +7: iteration 11120/ 37905 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 0.22 | learning rate: 1.660E-04 | global batch size: 256 | lm loss: 3.860694E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.727 | TFLOPs: 29.34 | +7: iteration 11130/ 37905 | consumed samples: 2849280 | consumed tokens: 5835325440 | elapsed time per iteration (s): 0.22 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.862798E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.918 | TFLOPs: 29.35 | +7: iteration 11140/ 37905 | consumed samples: 2851840 | consumed tokens: 5840568320 | elapsed time per iteration (s): 0.22 | learning rate: 1.659E-04 | global batch size: 256 | lm loss: 3.858835E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.346 | TFLOPs: 29.31 | +7: iteration 11150/ 37905 | consumed samples: 2854400 | consumed tokens: 5845811200 | elapsed time per iteration (s): 0.22 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.856614E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.392 | TFLOPs: 29.05 | +7: iteration 11160/ 37905 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 0.22 | learning rate: 1.658E-04 | global batch size: 256 | lm loss: 3.843836E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.178 | TFLOPs: 29.38 | +7: iteration 11170/ 37905 | consumed samples: 2859520 | consumed tokens: 5856296960 | elapsed time per iteration (s): 0.22 | learning rate: 1.657E-04 | global batch size: 256 | lm loss: 3.856564E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.790 | TFLOPs: 29.39 | +7: iteration 11180/ 37905 | consumed samples: 2862080 | consumed tokens: 5861539840 | elapsed time per iteration (s): 0.22 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.839827E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.288 | TFLOPs: 29.41 | +7: iteration 11190/ 37905 | consumed samples: 2864640 | consumed tokens: 5866782720 | elapsed time per iteration (s): 0.22 | learning rate: 1.656E-04 | global batch size: 256 | lm loss: 3.851189E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.471 | TFLOPs: 29.41 | +7: iteration 11200/ 37905 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 0.22 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.857003E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.660 | TFLOPs: 29.36 | +7: iteration 11210/ 37905 | consumed samples: 2869760 | consumed tokens: 5877268480 | elapsed time per iteration (s): 0.22 | learning rate: 1.655E-04 | global batch size: 256 | lm loss: 3.859151E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.151 | TFLOPs: 29.33 | +7: iteration 11220/ 37905 | consumed samples: 2872320 | consumed tokens: 5882511360 | elapsed time per iteration (s): 0.22 | learning rate: 1.654E-04 | global batch size: 256 | lm loss: 3.861681E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.633 | TFLOPs: 29.36 | +7: iteration 11230/ 37905 | consumed samples: 2874880 | consumed tokens: 5887754240 | elapsed time per iteration (s): 0.22 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.853182E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.813 | TFLOPs: 29.37 | +7: iteration 11240/ 37905 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 0.22 | learning rate: 1.653E-04 | global batch size: 256 | lm loss: 3.843502E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.855 | TFLOPs: 29.37 | +7: iteration 11250/ 37905 | consumed samples: 2880000 | consumed tokens: 5898240000 | elapsed time per iteration (s): 0.22 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.856066E+00 | grad norm: 0.364 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.941 | TFLOPs: 29.37 | +7: iteration 11260/ 37905 | consumed samples: 2882560 | consumed tokens: 5903482880 | elapsed time per iteration (s): 0.22 | learning rate: 1.652E-04 | global batch size: 256 | lm loss: 3.854451E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.629 | TFLOPs: 29.36 | +7: iteration 11270/ 37905 | consumed samples: 2885120 | consumed tokens: 5908725760 | elapsed time per iteration (s): 0.22 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.847880E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.570 | TFLOPs: 29.36 | +7: iteration 11280/ 37905 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 0.22 | learning rate: 1.651E-04 | global batch size: 256 | lm loss: 3.856507E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.954 | TFLOPs: 29.35 | +7: iteration 11290/ 37905 | consumed samples: 2890240 | consumed tokens: 5919211520 | elapsed time per iteration (s): 0.22 | learning rate: 1.650E-04 | global batch size: 256 | lm loss: 3.846735E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.677 | TFLOPs: 29.34 | +7: iteration 11300/ 37905 | consumed samples: 2892800 | consumed tokens: 5924454400 | elapsed time per iteration (s): 0.22 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.858133E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.575 | TFLOPs: 29.36 | +7: iteration 11310/ 37905 | consumed samples: 2895360 | consumed tokens: 5929697280 | elapsed time per iteration (s): 0.22 | learning rate: 1.649E-04 | global batch size: 256 | lm loss: 3.874879E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.794 | TFLOPs: 29.37 | +7: iteration 11320/ 37905 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 0.22 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.852257E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.308 | TFLOPs: 29.38 | +7: iteration 11330/ 37905 | consumed samples: 2900480 | consumed tokens: 5940183040 | elapsed time per iteration (s): 0.22 | learning rate: 1.648E-04 | global batch size: 256 | lm loss: 3.868066E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.120 | TFLOPs: 29.38 | +7: iteration 11340/ 37905 | consumed samples: 2903040 | consumed tokens: 5945425920 | elapsed time per iteration (s): 0.22 | learning rate: 1.647E-04 | global batch size: 256 | lm loss: 3.856354E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.236 | TFLOPs: 29.35 | +7: iteration 11350/ 37905 | consumed samples: 2905600 | consumed tokens: 5950668800 | elapsed time per iteration (s): 0.22 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.854058E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.719 | TFLOPs: 29.37 | +7: iteration 11360/ 37905 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 0.22 | learning rate: 1.646E-04 | global batch size: 256 | lm loss: 3.842923E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.023 | TFLOPs: 29.37 | +7: iteration 11370/ 37905 | consumed samples: 2910720 | consumed tokens: 5961154560 | elapsed time per iteration (s): 0.22 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.846355E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.649 | TFLOPs: 29.36 | +7: iteration 11380/ 37905 | consumed samples: 2913280 | consumed tokens: 5966397440 | elapsed time per iteration (s): 0.22 | learning rate: 1.645E-04 | global batch size: 256 | lm loss: 3.858726E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.120 | TFLOPs: 29.35 | +7: iteration 11390/ 37905 | consumed samples: 2915840 | consumed tokens: 5971640320 | elapsed time per iteration (s): 0.22 | learning rate: 1.644E-04 | global batch size: 256 | lm loss: 3.861744E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.645 | TFLOPs: 29.34 | +7: iteration 11400/ 37905 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 0.22 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.844511E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.834 | TFLOPs: 29.34 | +7: iteration 11410/ 37905 | consumed samples: 2920960 | consumed tokens: 5982126080 | elapsed time per iteration (s): 0.22 | learning rate: 1.643E-04 | global batch size: 256 | lm loss: 3.847469E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.625 | TFLOPs: 29.34 | +7: iteration 11420/ 37905 | consumed samples: 2923520 | consumed tokens: 5987368960 | elapsed time per iteration (s): 0.22 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.858547E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.531 | TFLOPs: 29.34 | +7: iteration 11430/ 37905 | consumed samples: 2926080 | consumed tokens: 5992611840 | elapsed time per iteration (s): 0.22 | learning rate: 1.642E-04 | global batch size: 256 | lm loss: 3.853288E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.138 | TFLOPs: 29.33 | +7: iteration 11440/ 37905 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 0.22 | learning rate: 1.641E-04 | global batch size: 256 | lm loss: 3.842562E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.420 | TFLOPs: 29.33 | +7: iteration 11450/ 37905 | consumed samples: 2931200 | consumed tokens: 6003097600 | elapsed time per iteration (s): 0.22 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.860384E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.926 | TFLOPs: 29.32 | +7: iteration 11460/ 37905 | consumed samples: 2933760 | consumed tokens: 6008340480 | elapsed time per iteration (s): 0.22 | learning rate: 1.640E-04 | global batch size: 256 | lm loss: 3.831258E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.872 | TFLOPs: 29.34 | +7: iteration 11470/ 37905 | consumed samples: 2936320 | consumed tokens: 6013583360 | elapsed time per iteration (s): 0.22 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.848775E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.226 | TFLOPs: 29.33 | +7: iteration 11480/ 37905 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 0.22 | learning rate: 1.639E-04 | global batch size: 256 | lm loss: 3.844581E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.376 | TFLOPs: 29.33 | +7: iteration 11490/ 37905 | consumed samples: 2941440 | consumed tokens: 6024069120 | elapsed time per iteration (s): 0.22 | learning rate: 1.638E-04 | global batch size: 256 | lm loss: 3.837996E+00 | grad norm: 0.359 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.901 | TFLOPs: 29.29 | +7: iteration 11500/ 37905 | consumed samples: 2944000 | consumed tokens: 6029312000 | elapsed time per iteration (s): 0.22 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.848082E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.862 | TFLOPs: 29.34 | +7: iteration 11510/ 37905 | consumed samples: 2946560 | consumed tokens: 6034554880 | elapsed time per iteration (s): 0.22 | learning rate: 1.637E-04 | global batch size: 256 | lm loss: 3.834283E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.386 | TFLOPs: 29.38 | +7: iteration 11520/ 37905 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 0.22 | learning rate: 1.636E-04 | global batch size: 256 | lm loss: 3.853677E+00 | grad norm: 0.363 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.948 | TFLOPs: 29.37 | +7: iteration 11530/ 37905 | consumed samples: 2951680 | consumed tokens: 6045040640 | elapsed time per iteration (s): 0.22 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.851240E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.737 | TFLOPs: 29.37 | +7: iteration 11540/ 37905 | consumed samples: 2954240 | consumed tokens: 6050283520 | elapsed time per iteration (s): 0.22 | learning rate: 1.635E-04 | global batch size: 256 | lm loss: 3.845963E+00 | grad norm: 0.308 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.422 | TFLOPs: 29.36 | +7: iteration 11550/ 37905 | consumed samples: 2956800 | consumed tokens: 6055526400 | elapsed time per iteration (s): 0.22 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.848003E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.901 | TFLOPs: 29.37 | +7: iteration 11560/ 37905 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 0.22 | learning rate: 1.634E-04 | global batch size: 256 | lm loss: 3.851190E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.972 | TFLOPs: 29.37 | +7: iteration 11570/ 37905 | consumed samples: 2961920 | consumed tokens: 6066012160 | elapsed time per iteration (s): 0.22 | learning rate: 1.633E-04 | global batch size: 256 | lm loss: 3.847393E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.664 | TFLOPs: 29.42 | +7: iteration 11580/ 37905 | consumed samples: 2964480 | consumed tokens: 6071255040 | elapsed time per iteration (s): 0.22 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.853205E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.482 | TFLOPs: 29.36 | +7: iteration 11590/ 37905 | consumed samples: 2967040 | consumed tokens: 6076497920 | elapsed time per iteration (s): 0.22 | learning rate: 1.632E-04 | global batch size: 256 | lm loss: 3.844371E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.096 | TFLOPs: 29.38 | +7: iteration 11600/ 37905 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 0.22 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.841822E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.388 | TFLOPs: 29.31 | +7: iteration 11610/ 37905 | consumed samples: 2972160 | consumed tokens: 6086983680 | elapsed time per iteration (s): 0.22 | learning rate: 1.631E-04 | global batch size: 256 | lm loss: 3.849044E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.144 | TFLOPs: 29.30 | +7: iteration 11620/ 37905 | consumed samples: 2974720 | consumed tokens: 6092226560 | elapsed time per iteration (s): 0.22 | learning rate: 1.630E-04 | global batch size: 256 | lm loss: 3.832402E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.631 | TFLOPs: 29.31 | +7: iteration 11630/ 37905 | consumed samples: 2977280 | consumed tokens: 6097469440 | elapsed time per iteration (s): 0.22 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.848307E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.769 | TFLOPs: 29.32 | +7: iteration 11640/ 37905 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 0.22 | learning rate: 1.629E-04 | global batch size: 256 | lm loss: 3.850518E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.349 | TFLOPs: 29.31 | +7: iteration 11650/ 37905 | consumed samples: 2982400 | consumed tokens: 6107955200 | elapsed time per iteration (s): 0.22 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.845733E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.367 | TFLOPs: 29.31 | +7: iteration 11660/ 37905 | consumed samples: 2984960 | consumed tokens: 6113198080 | elapsed time per iteration (s): 0.22 | learning rate: 1.628E-04 | global batch size: 256 | lm loss: 3.842509E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.829 | TFLOPs: 29.32 | +7: iteration 11670/ 37905 | consumed samples: 2987520 | consumed tokens: 6118440960 | elapsed time per iteration (s): 0.22 | learning rate: 1.627E-04 | global batch size: 256 | lm loss: 3.850317E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.624 | TFLOPs: 29.31 | +7: iteration 11680/ 37905 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 0.22 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.835002E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.434 | TFLOPs: 29.31 | +7: iteration 11690/ 37905 | consumed samples: 2992640 | consumed tokens: 6128926720 | elapsed time per iteration (s): 0.22 | learning rate: 1.626E-04 | global batch size: 256 | lm loss: 3.831602E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.511 | TFLOPs: 29.33 | +7: iteration 11700/ 37905 | consumed samples: 2995200 | consumed tokens: 6134169600 | elapsed time per iteration (s): 0.22 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.847253E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.509 | TFLOPs: 29.33 | +7: iteration 11710/ 37905 | consumed samples: 2997760 | consumed tokens: 6139412480 | elapsed time per iteration (s): 0.22 | learning rate: 1.625E-04 | global batch size: 256 | lm loss: 3.844650E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.582 | TFLOPs: 29.36 | +7: iteration 11720/ 37905 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 0.22 | learning rate: 1.624E-04 | global batch size: 256 | lm loss: 3.843266E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.575 | TFLOPs: 29.34 | +7: iteration 11730/ 37905 | consumed samples: 3002880 | consumed tokens: 6149898240 | elapsed time per iteration (s): 0.22 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.841824E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.490 | TFLOPs: 29.26 | +7: iteration 11740/ 37905 | consumed samples: 3005440 | consumed tokens: 6155141120 | elapsed time per iteration (s): 0.22 | learning rate: 1.623E-04 | global batch size: 256 | lm loss: 3.834587E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.324 | TFLOPs: 29.28 | +7: iteration 11750/ 37905 | consumed samples: 3008000 | consumed tokens: 6160384000 | elapsed time per iteration (s): 0.22 | learning rate: 1.622E-04 | global batch size: 256 | lm loss: 3.831205E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.047 | TFLOPs: 29.27 | +7: iteration 11760/ 37905 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 0.22 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.860328E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.376 | TFLOPs: 29.28 | +7: iteration 11770/ 37905 | consumed samples: 3013120 | consumed tokens: 6170869760 | elapsed time per iteration (s): 0.22 | learning rate: 1.621E-04 | global batch size: 256 | lm loss: 3.846608E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.663 | TFLOPs: 29.26 | +7: iteration 11780/ 37905 | consumed samples: 3015680 | consumed tokens: 6176112640 | elapsed time per iteration (s): 0.22 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.836028E+00 | grad norm: 0.348 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.201 | TFLOPs: 29.28 | +7: iteration 11790/ 37905 | consumed samples: 3018240 | consumed tokens: 6181355520 | elapsed time per iteration (s): 0.22 | learning rate: 1.620E-04 | global batch size: 256 | lm loss: 3.819156E+00 | grad norm: 0.362 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.245 | TFLOPs: 29.28 | +7: iteration 11800/ 37905 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 0.22 | learning rate: 1.619E-04 | global batch size: 256 | lm loss: 3.848473E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.460 | TFLOPs: 29.28 | +7: iteration 11810/ 37905 | consumed samples: 3023360 | consumed tokens: 6191841280 | elapsed time per iteration (s): 0.22 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.858543E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.549 | TFLOPs: 29.28 | +7: iteration 11820/ 37905 | consumed samples: 3025920 | consumed tokens: 6197084160 | elapsed time per iteration (s): 0.22 | learning rate: 1.618E-04 | global batch size: 256 | lm loss: 3.846700E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.171 | TFLOPs: 29.28 | +7: iteration 11830/ 37905 | consumed samples: 3028480 | consumed tokens: 6202327040 | elapsed time per iteration (s): 0.23 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.856210E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.277 | TFLOPs: 28.90 | +7: iteration 11840/ 37905 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 0.23 | learning rate: 1.617E-04 | global batch size: 256 | lm loss: 3.850760E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.829 | TFLOPs: 28.60 | +7: iteration 11850/ 37905 | consumed samples: 3033600 | consumed tokens: 6212812800 | elapsed time per iteration (s): 0.22 | learning rate: 1.616E-04 | global batch size: 256 | lm loss: 3.848832E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.252 | TFLOPs: 29.28 | +7: iteration 11860/ 37905 | consumed samples: 3036160 | consumed tokens: 6218055680 | elapsed time per iteration (s): 0.22 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.846046E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.763 | TFLOPs: 29.29 | +7: iteration 11870/ 37905 | consumed samples: 3038720 | consumed tokens: 6223298560 | elapsed time per iteration (s): 0.22 | learning rate: 1.615E-04 | global batch size: 256 | lm loss: 3.834038E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.322 | TFLOPs: 29.28 | +7: iteration 11880/ 37905 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 0.22 | learning rate: 1.614E-04 | global batch size: 256 | lm loss: 3.825058E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.741 | TFLOPs: 29.24 | +7: iteration 11890/ 37905 | consumed samples: 3043840 | consumed tokens: 6233784320 | elapsed time per iteration (s): 0.22 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.840878E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.200 | TFLOPs: 29.25 | +7: iteration 11900/ 37905 | consumed samples: 3046400 | consumed tokens: 6239027200 | elapsed time per iteration (s): 0.22 | learning rate: 1.613E-04 | global batch size: 256 | lm loss: 3.845150E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.395 | TFLOPs: 29.26 | +7: iteration 11910/ 37905 | consumed samples: 3048960 | consumed tokens: 6244270080 | elapsed time per iteration (s): 0.22 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.843560E+00 | grad norm: 0.341 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.134 | TFLOPs: 29.27 | +7: iteration 11920/ 37905 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 0.22 | learning rate: 1.612E-04 | global batch size: 256 | lm loss: 3.833983E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.507 | TFLOPs: 29.26 | +7: iteration 11930/ 37905 | consumed samples: 3054080 | consumed tokens: 6254755840 | elapsed time per iteration (s): 0.23 | learning rate: 1.611E-04 | global batch size: 256 | lm loss: 3.832536E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.973 | TFLOPs: 28.58 | +7: iteration 11940/ 37905 | consumed samples: 3056640 | consumed tokens: 6259998720 | elapsed time per iteration (s): 0.22 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.838324E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.370 | TFLOPs: 29.28 | +7: iteration 11950/ 37905 | consumed samples: 3059200 | consumed tokens: 6265241600 | elapsed time per iteration (s): 0.22 | learning rate: 1.610E-04 | global batch size: 256 | lm loss: 3.831932E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.562 | TFLOPs: 29.26 | +7: iteration 11960/ 37905 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 0.22 | learning rate: 1.609E-04 | global batch size: 256 | lm loss: 3.842880E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.985 | TFLOPs: 29.30 | +7: iteration 11970/ 37905 | consumed samples: 3064320 | consumed tokens: 6275727360 | elapsed time per iteration (s): 0.22 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.837443E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.257 | TFLOPs: 29.30 | +7: iteration 11980/ 37905 | consumed samples: 3066880 | consumed tokens: 6280970240 | elapsed time per iteration (s): 0.22 | learning rate: 1.608E-04 | global batch size: 256 | lm loss: 3.843784E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.208 | TFLOPs: 29.30 | +7: iteration 11990/ 37905 | consumed samples: 3069440 | consumed tokens: 6286213120 | elapsed time per iteration (s): 0.22 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.841912E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.917 | TFLOPs: 29.22 | +0: [2023-03-15 22:42:10,065] [INFO] [logging.py:68:log_dist] [Rank 0] step=12000, skipped=0, lr=[0.00016066300925507844, 0.00016066300925507844, 0.00016066300925507844], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 12000/ 37905 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 0.22 | learning rate: 1.607E-04 | global batch size: 256 | lm loss: 3.849116E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.130 | TFLOPs: 29.22 | +0: steps: 12000 loss: 3.8496 iter time (s): 0.221 samples/sec: 1159.060 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 12000 | lm loss value: 3.742370E+00 | lm loss PPL: 4.219788E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 12000 to checkpoints_83m20b400m +0: [2023-03-15 22:42:10,153] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step12000 is begin to save! +0: [2023-03-15 22:42:10,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:42:10,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:42:10,227] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:42:10,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:42:10,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:42:10,250] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:42:10,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:42:10,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:42:10,261] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:42:10,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:42:10,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:42:10,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:42:10,283] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:42:10,294] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:42:10,295] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:42:10,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:42:10,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:42:10,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:42:10,317] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:42:10,328] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:42:10,328] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:42:10,339] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:42:10,339] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:42:10,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:42:10,340] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step12000/mp_rank_00_model_states.pt +0: [2023-03-15 22:42:10,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:42:10,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:42:10,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,378] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,378] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,379] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +5: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +5: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:42:10,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +4: [2023-03-15 22:42:10,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +2: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +7: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +3: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +1: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:42:10,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step12000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:42:10,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step12000 is ready now! +0: successfully saved checkpoint at iteration 12000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 239.51 +7: iteration 12010/ 37905 | consumed samples: 3074560 | consumed tokens: 6296698880 | elapsed time per iteration (s): 0.25 | learning rate: 1.606E-04 | global batch size: 256 | lm loss: 3.846304E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1004.369 | TFLOPs: 25.59 | +7: iteration 12020/ 37905 | consumed samples: 3077120 | consumed tokens: 6301941760 | elapsed time per iteration (s): 0.22 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.830468E+00 | grad norm: 0.334 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.653 | TFLOPs: 29.29 | +7: iteration 12030/ 37905 | consumed samples: 3079680 | consumed tokens: 6307184640 | elapsed time per iteration (s): 0.22 | learning rate: 1.605E-04 | global batch size: 256 | lm loss: 3.849477E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.452 | TFLOPs: 29.28 | +7: iteration 12040/ 37905 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 0.22 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.834663E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.005 | TFLOPs: 29.27 | +7: iteration 12050/ 37905 | consumed samples: 3084800 | consumed tokens: 6317670400 | elapsed time per iteration (s): 0.22 | learning rate: 1.604E-04 | global batch size: 256 | lm loss: 3.830462E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.720 | TFLOPs: 29.29 | +7: iteration 12060/ 37905 | consumed samples: 3087360 | consumed tokens: 6322913280 | elapsed time per iteration (s): 0.22 | learning rate: 1.603E-04 | global batch size: 256 | lm loss: 3.834809E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.118 | TFLOPs: 29.27 | +7: iteration 12070/ 37905 | consumed samples: 3089920 | consumed tokens: 6328156160 | elapsed time per iteration (s): 0.22 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.835849E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.209 | TFLOPs: 29.25 | +7: iteration 12080/ 37905 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 0.22 | learning rate: 1.602E-04 | global batch size: 256 | lm loss: 3.843092E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.069 | TFLOPs: 29.25 | +7: iteration 12090/ 37905 | consumed samples: 3095040 | consumed tokens: 6338641920 | elapsed time per iteration (s): 0.23 | learning rate: 1.601E-04 | global batch size: 256 | lm loss: 3.824983E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.912 | TFLOPs: 28.58 | +7: iteration 12100/ 37905 | consumed samples: 3097600 | consumed tokens: 6343884800 | elapsed time per iteration (s): 0.22 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.845895E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.694 | TFLOPs: 29.26 | +7: iteration 12110/ 37905 | consumed samples: 3100160 | consumed tokens: 6349127680 | elapsed time per iteration (s): 0.22 | learning rate: 1.600E-04 | global batch size: 256 | lm loss: 3.849931E+00 | grad norm: 0.333 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.281 | TFLOPs: 29.30 | +7: iteration 12120/ 37905 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 0.22 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.834315E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.397 | TFLOPs: 29.28 | +7: iteration 12130/ 37905 | consumed samples: 3105280 | consumed tokens: 6359613440 | elapsed time per iteration (s): 0.23 | learning rate: 1.599E-04 | global batch size: 256 | lm loss: 3.827652E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.809 | TFLOPs: 28.83 | +7: iteration 12140/ 37905 | consumed samples: 3107840 | consumed tokens: 6364856320 | elapsed time per iteration (s): 0.22 | learning rate: 1.598E-04 | global batch size: 256 | lm loss: 3.839205E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.452 | TFLOPs: 29.28 | +7: iteration 12150/ 37905 | consumed samples: 3110400 | consumed tokens: 6370099200 | elapsed time per iteration (s): 0.22 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.837745E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.022 | TFLOPs: 29.30 | +7: iteration 12160/ 37905 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 0.22 | learning rate: 1.597E-04 | global batch size: 256 | lm loss: 3.843945E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.061 | TFLOPs: 29.30 | +7: iteration 12170/ 37905 | consumed samples: 3115520 | consumed tokens: 6380584960 | elapsed time per iteration (s): 0.22 | learning rate: 1.596E-04 | global batch size: 256 | lm loss: 3.843622E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.557 | TFLOPs: 29.31 | +7: iteration 12180/ 37905 | consumed samples: 3118080 | consumed tokens: 6385827840 | elapsed time per iteration (s): 0.22 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.836156E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.566 | TFLOPs: 29.29 | +7: iteration 12190/ 37905 | consumed samples: 3120640 | consumed tokens: 6391070720 | elapsed time per iteration (s): 0.22 | learning rate: 1.595E-04 | global batch size: 256 | lm loss: 3.836285E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.458 | TFLOPs: 29.28 | +7: iteration 12200/ 37905 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 0.22 | learning rate: 1.594E-04 | global batch size: 256 | lm loss: 3.846764E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.254 | TFLOPs: 29.30 | +7: iteration 12210/ 37905 | consumed samples: 3125760 | consumed tokens: 6401556480 | elapsed time per iteration (s): 0.22 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.846293E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.272 | TFLOPs: 29.30 | +7: iteration 12220/ 37905 | consumed samples: 3128320 | consumed tokens: 6406799360 | elapsed time per iteration (s): 0.22 | learning rate: 1.593E-04 | global batch size: 256 | lm loss: 3.854848E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.016 | TFLOPs: 29.30 | +7: iteration 12230/ 37905 | consumed samples: 3130880 | consumed tokens: 6412042240 | elapsed time per iteration (s): 0.22 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.830516E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.472 | TFLOPs: 29.31 | +7: iteration 12240/ 37905 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 0.22 | learning rate: 1.592E-04 | global batch size: 256 | lm loss: 3.840620E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.831 | TFLOPs: 29.29 | +7: iteration 12250/ 37905 | consumed samples: 3136000 | consumed tokens: 6422528000 | elapsed time per iteration (s): 0.22 | learning rate: 1.591E-04 | global batch size: 256 | lm loss: 3.837104E+00 | grad norm: 0.322 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.113 | TFLOPs: 29.30 | +7: iteration 12260/ 37905 | consumed samples: 3138560 | consumed tokens: 6427770880 | elapsed time per iteration (s): 0.22 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.837437E+00 | grad norm: 0.399 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.595 | TFLOPs: 29.29 | +7: iteration 12270/ 37905 | consumed samples: 3141120 | consumed tokens: 6433013760 | elapsed time per iteration (s): 0.22 | learning rate: 1.590E-04 | global batch size: 256 | lm loss: 3.819981E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.401 | TFLOPs: 29.28 | +7: iteration 12280/ 37905 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 0.22 | learning rate: 1.589E-04 | global batch size: 256 | lm loss: 3.828738E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.730 | TFLOPs: 29.32 | +7: iteration 12290/ 37905 | consumed samples: 3146240 | consumed tokens: 6443499520 | elapsed time per iteration (s): 0.22 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.831839E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.482 | TFLOPs: 29.31 | +7: iteration 12300/ 37905 | consumed samples: 3148800 | consumed tokens: 6448742400 | elapsed time per iteration (s): 0.22 | learning rate: 1.588E-04 | global batch size: 256 | lm loss: 3.828467E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.398 | TFLOPs: 29.31 | +7: iteration 12310/ 37905 | consumed samples: 3151360 | consumed tokens: 6453985280 | elapsed time per iteration (s): 0.22 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.833757E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.823 | TFLOPs: 29.29 | +7: iteration 12320/ 37905 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 0.22 | learning rate: 1.587E-04 | global batch size: 256 | lm loss: 3.835118E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.226 | TFLOPs: 29.30 | +7: iteration 12330/ 37905 | consumed samples: 3156480 | consumed tokens: 6464471040 | elapsed time per iteration (s): 0.22 | learning rate: 1.586E-04 | global batch size: 256 | lm loss: 3.835366E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.167 | TFLOPs: 29.30 | +7: iteration 12340/ 37905 | consumed samples: 3159040 | consumed tokens: 6469713920 | elapsed time per iteration (s): 0.22 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.828844E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.430 | TFLOPs: 29.28 | +7: iteration 12350/ 37905 | consumed samples: 3161600 | consumed tokens: 6474956800 | elapsed time per iteration (s): 0.22 | learning rate: 1.585E-04 | global batch size: 256 | lm loss: 3.818654E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.999 | TFLOPs: 29.27 | +7: iteration 12360/ 37905 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 0.22 | learning rate: 1.584E-04 | global batch size: 256 | lm loss: 3.838916E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.867 | TFLOPs: 29.29 | +7: iteration 12370/ 37905 | consumed samples: 3166720 | consumed tokens: 6485442560 | elapsed time per iteration (s): 0.22 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.830318E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.338 | TFLOPs: 29.28 | +7: iteration 12380/ 37905 | consumed samples: 3169280 | consumed tokens: 6490685440 | elapsed time per iteration (s): 0.22 | learning rate: 1.583E-04 | global batch size: 256 | lm loss: 3.833949E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.677 | TFLOPs: 29.29 | +7: iteration 12390/ 37905 | consumed samples: 3171840 | consumed tokens: 6495928320 | elapsed time per iteration (s): 0.22 | learning rate: 1.582E-04 | global batch size: 256 | lm loss: 3.835167E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.902 | TFLOPs: 29.29 | +7: iteration 12400/ 37905 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 0.22 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.828794E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.620 | TFLOPs: 29.29 | +7: iteration 12410/ 37905 | consumed samples: 3176960 | consumed tokens: 6506414080 | elapsed time per iteration (s): 0.22 | learning rate: 1.581E-04 | global batch size: 256 | lm loss: 3.840408E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.582 | TFLOPs: 29.29 | +7: iteration 12420/ 37905 | consumed samples: 3179520 | consumed tokens: 6511656960 | elapsed time per iteration (s): 0.22 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.830436E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.833 | TFLOPs: 29.29 | +7: iteration 12430/ 37905 | consumed samples: 3182080 | consumed tokens: 6516899840 | elapsed time per iteration (s): 0.22 | learning rate: 1.580E-04 | global batch size: 256 | lm loss: 3.822787E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.953 | TFLOPs: 29.30 | +7: iteration 12440/ 37905 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 0.22 | learning rate: 1.579E-04 | global batch size: 256 | lm loss: 3.841209E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.526 | TFLOPs: 29.28 | +7: iteration 12450/ 37905 | consumed samples: 3187200 | consumed tokens: 6527385600 | elapsed time per iteration (s): 0.22 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.833339E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.577 | TFLOPs: 29.29 | +7: iteration 12460/ 37905 | consumed samples: 3189760 | consumed tokens: 6532628480 | elapsed time per iteration (s): 0.22 | learning rate: 1.578E-04 | global batch size: 256 | lm loss: 3.816756E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.986 | TFLOPs: 29.27 | +7: iteration 12470/ 37905 | consumed samples: 3192320 | consumed tokens: 6537871360 | elapsed time per iteration (s): 0.22 | learning rate: 1.577E-04 | global batch size: 256 | lm loss: 3.832833E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.430 | TFLOPs: 29.26 | +7: iteration 12480/ 37905 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 0.22 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.847038E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.919 | TFLOPs: 29.27 | +7: iteration 12490/ 37905 | consumed samples: 3197440 | consumed tokens: 6548357120 | elapsed time per iteration (s): 0.22 | learning rate: 1.576E-04 | global batch size: 256 | lm loss: 3.836406E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.598 | TFLOPs: 29.29 | +7: iteration 12500/ 37905 | consumed samples: 3200000 | consumed tokens: 6553600000 | elapsed time per iteration (s): 0.22 | learning rate: 1.575E-04 | global batch size: 256 | lm loss: 3.836481E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.729 | TFLOPs: 29.29 | +7: iteration 12510/ 37905 | consumed samples: 3202560 | consumed tokens: 6558842880 | elapsed time per iteration (s): 0.22 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.830530E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.402 | TFLOPs: 29.31 | +7: iteration 12520/ 37905 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 0.22 | learning rate: 1.574E-04 | global batch size: 256 | lm loss: 3.837339E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.889 | TFLOPs: 29.32 | +7: iteration 12530/ 37905 | consumed samples: 3207680 | consumed tokens: 6569328640 | elapsed time per iteration (s): 0.22 | learning rate: 1.573E-04 | global batch size: 256 | lm loss: 3.823070E+00 | grad norm: 0.317 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.964 | TFLOPs: 29.32 | +7: iteration 12540/ 37905 | consumed samples: 3210240 | consumed tokens: 6574571520 | elapsed time per iteration (s): 0.22 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.823558E+00 | grad norm: 0.325 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.621 | TFLOPs: 29.31 | +7: iteration 12550/ 37905 | consumed samples: 3212800 | consumed tokens: 6579814400 | elapsed time per iteration (s): 0.22 | learning rate: 1.572E-04 | global batch size: 256 | lm loss: 3.835756E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.139 | TFLOPs: 29.30 | +7: iteration 12560/ 37905 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 0.22 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.829011E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.367 | TFLOPs: 29.31 | +7: iteration 12570/ 37905 | consumed samples: 3217920 | consumed tokens: 6590300160 | elapsed time per iteration (s): 0.22 | learning rate: 1.571E-04 | global batch size: 256 | lm loss: 3.834098E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.982 | TFLOPs: 29.32 | +7: iteration 12580/ 37905 | consumed samples: 3220480 | consumed tokens: 6595543040 | elapsed time per iteration (s): 0.22 | learning rate: 1.570E-04 | global batch size: 256 | lm loss: 3.825576E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.606 | TFLOPs: 29.31 | +7: iteration 12590/ 37905 | consumed samples: 3223040 | consumed tokens: 6600785920 | elapsed time per iteration (s): 0.22 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.843414E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.290 | TFLOPs: 29.30 | +7: iteration 12600/ 37905 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 0.22 | learning rate: 1.569E-04 | global batch size: 256 | lm loss: 3.824933E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.750 | TFLOPs: 29.32 | +7: iteration 12610/ 37905 | consumed samples: 3228160 | consumed tokens: 6611271680 | elapsed time per iteration (s): 0.22 | learning rate: 1.568E-04 | global batch size: 256 | lm loss: 3.838449E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.734 | TFLOPs: 29.32 | +7: iteration 12620/ 37905 | consumed samples: 3230720 | consumed tokens: 6616514560 | elapsed time per iteration (s): 0.22 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.828287E+00 | grad norm: 0.354 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.387 | TFLOPs: 29.33 | +7: iteration 12630/ 37905 | consumed samples: 3233280 | consumed tokens: 6621757440 | elapsed time per iteration (s): 0.22 | learning rate: 1.567E-04 | global batch size: 256 | lm loss: 3.834403E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.874 | TFLOPs: 29.34 | +7: iteration 12640/ 37905 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 0.22 | learning rate: 1.566E-04 | global batch size: 256 | lm loss: 3.834840E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.317 | TFLOPs: 29.36 | +7: iteration 12650/ 37905 | consumed samples: 3238400 | consumed tokens: 6632243200 | elapsed time per iteration (s): 0.22 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.808564E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.802 | TFLOPs: 29.29 | +7: iteration 12660/ 37905 | consumed samples: 3240960 | consumed tokens: 6637486080 | elapsed time per iteration (s): 0.22 | learning rate: 1.565E-04 | global batch size: 256 | lm loss: 3.813364E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.385 | TFLOPs: 29.28 | +7: iteration 12670/ 37905 | consumed samples: 3243520 | consumed tokens: 6642728960 | elapsed time per iteration (s): 0.22 | learning rate: 1.564E-04 | global batch size: 256 | lm loss: 3.844222E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.999 | TFLOPs: 29.27 | +7: iteration 12680/ 37905 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 0.22 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.838945E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.601 | TFLOPs: 29.26 | +7: iteration 12690/ 37905 | consumed samples: 3248640 | consumed tokens: 6653214720 | elapsed time per iteration (s): 0.22 | learning rate: 1.563E-04 | global batch size: 256 | lm loss: 3.841557E+00 | grad norm: 0.320 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.121 | TFLOPs: 29.30 | +7: iteration 12700/ 37905 | consumed samples: 3251200 | consumed tokens: 6658457600 | elapsed time per iteration (s): 0.22 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.844205E+00 | grad norm: 0.315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.024 | TFLOPs: 29.32 | +7: iteration 12710/ 37905 | consumed samples: 3253760 | consumed tokens: 6663700480 | elapsed time per iteration (s): 0.22 | learning rate: 1.562E-04 | global batch size: 256 | lm loss: 3.834893E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.069 | TFLOPs: 29.30 | +7: iteration 12720/ 37905 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 0.22 | learning rate: 1.561E-04 | global batch size: 256 | lm loss: 3.834245E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.682 | TFLOPs: 29.31 | +7: iteration 12730/ 37905 | consumed samples: 3258880 | consumed tokens: 6674186240 | elapsed time per iteration (s): 0.22 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.829627E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.014 | TFLOPs: 29.30 | +7: iteration 12740/ 37905 | consumed samples: 3261440 | consumed tokens: 6679429120 | elapsed time per iteration (s): 0.22 | learning rate: 1.560E-04 | global batch size: 256 | lm loss: 3.820959E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.835 | TFLOPs: 29.32 | +7: iteration 12750/ 37905 | consumed samples: 3264000 | consumed tokens: 6684672000 | elapsed time per iteration (s): 0.22 | learning rate: 1.559E-04 | global batch size: 256 | lm loss: 3.824037E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.852 | TFLOPs: 29.29 | +7: iteration 12760/ 37905 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 0.22 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.832294E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.934 | TFLOPs: 29.29 | +7: iteration 12770/ 37905 | consumed samples: 3269120 | consumed tokens: 6695157760 | elapsed time per iteration (s): 0.22 | learning rate: 1.558E-04 | global batch size: 256 | lm loss: 3.820152E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.932 | TFLOPs: 29.29 | +7: iteration 12780/ 37905 | consumed samples: 3271680 | consumed tokens: 6700400640 | elapsed time per iteration (s): 0.22 | learning rate: 1.557E-04 | global batch size: 256 | lm loss: 3.813527E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.776 | TFLOPs: 29.29 | +7: iteration 12790/ 37905 | consumed samples: 3274240 | consumed tokens: 6705643520 | elapsed time per iteration (s): 0.22 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.826274E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.272 | TFLOPs: 29.30 | +7: iteration 12800/ 37905 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 0.22 | learning rate: 1.556E-04 | global batch size: 256 | lm loss: 3.832341E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.445 | TFLOPs: 29.31 | +7: iteration 12810/ 37905 | consumed samples: 3279360 | consumed tokens: 6716129280 | elapsed time per iteration (s): 0.22 | learning rate: 1.555E-04 | global batch size: 256 | lm loss: 3.827287E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.376 | TFLOPs: 29.31 | +7: iteration 12820/ 37905 | consumed samples: 3281920 | consumed tokens: 6721372160 | elapsed time per iteration (s): 0.22 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.818577E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.572 | TFLOPs: 29.31 | +7: iteration 12830/ 37905 | consumed samples: 3284480 | consumed tokens: 6726615040 | elapsed time per iteration (s): 0.22 | learning rate: 1.554E-04 | global batch size: 256 | lm loss: 3.818363E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.165 | TFLOPs: 29.30 | +7: iteration 12840/ 37905 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 0.22 | learning rate: 1.553E-04 | global batch size: 256 | lm loss: 3.819487E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.108 | TFLOPs: 29.15 | +7: iteration 12850/ 37905 | consumed samples: 3289600 | consumed tokens: 6737100800 | elapsed time per iteration (s): 0.22 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.823594E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.383 | TFLOPs: 29.31 | +7: iteration 12860/ 37905 | consumed samples: 3292160 | consumed tokens: 6742343680 | elapsed time per iteration (s): 0.22 | learning rate: 1.552E-04 | global batch size: 256 | lm loss: 3.838925E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.611 | TFLOPs: 29.31 | +7: iteration 12870/ 37905 | consumed samples: 3294720 | consumed tokens: 6747586560 | elapsed time per iteration (s): 0.22 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.838098E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.168 | TFLOPs: 29.30 | +7: iteration 12880/ 37905 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 0.22 | learning rate: 1.551E-04 | global batch size: 256 | lm loss: 3.826623E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.422 | TFLOPs: 29.28 | +7: iteration 12890/ 37905 | consumed samples: 3299840 | consumed tokens: 6758072320 | elapsed time per iteration (s): 0.22 | learning rate: 1.550E-04 | global batch size: 256 | lm loss: 3.828145E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.748 | TFLOPs: 29.32 | +7: iteration 12900/ 37905 | consumed samples: 3302400 | consumed tokens: 6763315200 | elapsed time per iteration (s): 0.22 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.832366E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.163 | TFLOPs: 29.30 | +7: iteration 12910/ 37905 | consumed samples: 3304960 | consumed tokens: 6768558080 | elapsed time per iteration (s): 0.22 | learning rate: 1.549E-04 | global batch size: 256 | lm loss: 3.817400E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.975 | TFLOPs: 29.30 | +7: iteration 12920/ 37905 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 0.22 | learning rate: 1.548E-04 | global batch size: 256 | lm loss: 3.823928E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.159 | TFLOPs: 29.35 | +7: iteration 12930/ 37905 | consumed samples: 3310080 | consumed tokens: 6779043840 | elapsed time per iteration (s): 0.22 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.829736E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.750 | TFLOPs: 29.29 | +7: iteration 12940/ 37905 | consumed samples: 3312640 | consumed tokens: 6784286720 | elapsed time per iteration (s): 0.22 | learning rate: 1.547E-04 | global batch size: 256 | lm loss: 3.815177E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.406 | TFLOPs: 29.31 | +7: iteration 12950/ 37905 | consumed samples: 3315200 | consumed tokens: 6789529600 | elapsed time per iteration (s): 0.22 | learning rate: 1.546E-04 | global batch size: 256 | lm loss: 3.821453E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.012 | TFLOPs: 29.27 | +7: iteration 12960/ 37905 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 0.22 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.830432E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.579 | TFLOPs: 29.29 | +7: iteration 12970/ 37905 | consumed samples: 3320320 | consumed tokens: 6800015360 | elapsed time per iteration (s): 0.22 | learning rate: 1.545E-04 | global batch size: 256 | lm loss: 3.830711E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.745 | TFLOPs: 29.29 | +7: iteration 12980/ 37905 | consumed samples: 3322880 | consumed tokens: 6805258240 | elapsed time per iteration (s): 0.22 | learning rate: 1.544E-04 | global batch size: 256 | lm loss: 3.828314E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.623 | TFLOPs: 29.31 | +7: iteration 12990/ 37905 | consumed samples: 3325440 | consumed tokens: 6810501120 | elapsed time per iteration (s): 0.22 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.814749E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.914 | TFLOPs: 29.29 | +7: iteration 13000/ 37905 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 0.22 | learning rate: 1.543E-04 | global batch size: 256 | lm loss: 3.815903E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.216 | TFLOPs: 29.30 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 13000 | lm loss value: 3.826323E+00 | lm loss PPL: 4.589347E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 13000 to checkpoints_83m20b400m +0: [2023-03-15 22:45:53,171] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step13000 is begin to save! +0: [2023-03-15 22:45:53,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:45:53,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:45:53,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:45:53,257] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:45:53,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:45:53,268] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:45:53,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:45:53,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:45:53,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:45:53,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:45:53,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:45:53,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:45:53,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:45:53,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:45:53,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:45:53,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:45:53,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:45:53,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:45:53,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:45:53,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:45:53,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:45:53,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:45:53,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:45:53,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:45:53,358] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step13000/mp_rank_00_model_states.pt +0: [2023-03-15 22:45:53,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:45:53,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:45:53,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-15 22:45:53,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:45:53,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +5: [2023-03-15 22:45:53,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +2: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +0: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +7: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +3: [2023-03-15 22:45:53,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +6: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:45:53,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +4: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:45:53,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step13000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:45:53,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step13000 is ready now! +0: successfully saved checkpoint at iteration 13000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.28 +7: iteration 13010/ 37905 | consumed samples: 3330560 | consumed tokens: 6820986880 | elapsed time per iteration (s): 0.25 | learning rate: 1.542E-04 | global batch size: 256 | lm loss: 3.816357E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1012.814 | TFLOPs: 25.80 | +7: iteration 13020/ 37905 | consumed samples: 3333120 | consumed tokens: 6826229760 | elapsed time per iteration (s): 0.22 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.822569E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.537 | TFLOPs: 29.34 | +7: iteration 13030/ 37905 | consumed samples: 3335680 | consumed tokens: 6831472640 | elapsed time per iteration (s): 0.22 | learning rate: 1.541E-04 | global batch size: 256 | lm loss: 3.826575E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.227 | TFLOPs: 29.33 | +7: iteration 13040/ 37905 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 0.22 | learning rate: 1.540E-04 | global batch size: 256 | lm loss: 3.822154E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.880 | TFLOPs: 29.29 | +7: iteration 13050/ 37905 | consumed samples: 3340800 | consumed tokens: 6841958400 | elapsed time per iteration (s): 0.22 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.839993E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.070 | TFLOPs: 29.32 | +7: iteration 13060/ 37905 | consumed samples: 3343360 | consumed tokens: 6847201280 | elapsed time per iteration (s): 0.22 | learning rate: 1.539E-04 | global batch size: 256 | lm loss: 3.809498E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.005 | TFLOPs: 29.32 | +7: iteration 13070/ 37905 | consumed samples: 3345920 | consumed tokens: 6852444160 | elapsed time per iteration (s): 0.22 | learning rate: 1.538E-04 | global batch size: 256 | lm loss: 3.824938E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.484 | TFLOPs: 29.31 | +7: iteration 13080/ 37905 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 0.22 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.819973E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.150 | TFLOPs: 29.33 | +7: iteration 13090/ 37905 | consumed samples: 3351040 | consumed tokens: 6862929920 | elapsed time per iteration (s): 0.22 | learning rate: 1.537E-04 | global batch size: 256 | lm loss: 3.822823E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.858 | TFLOPs: 29.32 | +7: iteration 13100/ 37905 | consumed samples: 3353600 | consumed tokens: 6868172800 | elapsed time per iteration (s): 0.22 | learning rate: 1.536E-04 | global batch size: 256 | lm loss: 3.829796E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.340 | TFLOPs: 29.31 | +7: iteration 13110/ 37905 | consumed samples: 3356160 | consumed tokens: 6873415680 | elapsed time per iteration (s): 0.22 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.820296E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.331 | TFLOPs: 29.28 | +7: iteration 13120/ 37905 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 0.22 | learning rate: 1.535E-04 | global batch size: 256 | lm loss: 3.817698E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.925 | TFLOPs: 29.29 | +7: iteration 13130/ 37905 | consumed samples: 3361280 | consumed tokens: 6883901440 | elapsed time per iteration (s): 0.22 | learning rate: 1.534E-04 | global batch size: 256 | lm loss: 3.841159E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.363 | TFLOPs: 29.25 | +7: iteration 13140/ 37905 | consumed samples: 3363840 | consumed tokens: 6889144320 | elapsed time per iteration (s): 0.23 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.814248E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.486 | TFLOPs: 28.98 | +7: iteration 13150/ 37905 | consumed samples: 3366400 | consumed tokens: 6894387200 | elapsed time per iteration (s): 0.22 | learning rate: 1.533E-04 | global batch size: 256 | lm loss: 3.832072E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.877 | TFLOPs: 29.29 | +7: iteration 13160/ 37905 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 0.22 | learning rate: 1.532E-04 | global batch size: 256 | lm loss: 3.809719E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.026 | TFLOPs: 29.30 | +7: iteration 13170/ 37905 | consumed samples: 3371520 | consumed tokens: 6904872960 | elapsed time per iteration (s): 0.22 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.835183E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.926 | TFLOPs: 29.29 | +7: iteration 13180/ 37905 | consumed samples: 3374080 | consumed tokens: 6910115840 | elapsed time per iteration (s): 0.22 | learning rate: 1.531E-04 | global batch size: 256 | lm loss: 3.816771E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.996 | TFLOPs: 29.30 | +7: iteration 13190/ 37905 | consumed samples: 3376640 | consumed tokens: 6915358720 | elapsed time per iteration (s): 0.22 | learning rate: 1.530E-04 | global batch size: 256 | lm loss: 3.819902E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.038 | TFLOPs: 29.30 | +7: iteration 13200/ 37905 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 0.22 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.825879E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.314 | TFLOPs: 29.30 | +7: iteration 13210/ 37905 | consumed samples: 3381760 | consumed tokens: 6925844480 | elapsed time per iteration (s): 0.22 | learning rate: 1.529E-04 | global batch size: 256 | lm loss: 3.813065E+00 | grad norm: 0.337 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.271 | TFLOPs: 29.28 | +7: iteration 13220/ 37905 | consumed samples: 3384320 | consumed tokens: 6931087360 | elapsed time per iteration (s): 0.22 | learning rate: 1.528E-04 | global batch size: 256 | lm loss: 3.830269E+00 | grad norm: 0.344 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.798 | TFLOPs: 29.29 | +7: iteration 13230/ 37905 | consumed samples: 3386880 | consumed tokens: 6936330240 | elapsed time per iteration (s): 0.22 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.816006E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.825 | TFLOPs: 29.29 | +7: iteration 13240/ 37905 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 0.22 | learning rate: 1.527E-04 | global batch size: 256 | lm loss: 3.820848E+00 | grad norm: 0.327 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.911 | TFLOPs: 29.29 | +7: iteration 13250/ 37905 | consumed samples: 3392000 | consumed tokens: 6946816000 | elapsed time per iteration (s): 0.22 | learning rate: 1.526E-04 | global batch size: 256 | lm loss: 3.810749E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.482 | TFLOPs: 29.05 | +7: iteration 13260/ 37905 | consumed samples: 3394560 | consumed tokens: 6952058880 | elapsed time per iteration (s): 0.22 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.818744E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.494 | TFLOPs: 29.28 | +7: iteration 13270/ 37905 | consumed samples: 3397120 | consumed tokens: 6957301760 | elapsed time per iteration (s): 0.22 | learning rate: 1.525E-04 | global batch size: 256 | lm loss: 3.815718E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.462 | TFLOPs: 29.26 | +7: iteration 13280/ 37905 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 0.22 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.812378E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.830 | TFLOPs: 29.29 | +7: iteration 13290/ 37905 | consumed samples: 3402240 | consumed tokens: 6967787520 | elapsed time per iteration (s): 0.22 | learning rate: 1.524E-04 | global batch size: 256 | lm loss: 3.817964E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.411 | TFLOPs: 29.28 | +7: iteration 13300/ 37905 | consumed samples: 3404800 | consumed tokens: 6973030400 | elapsed time per iteration (s): 0.22 | learning rate: 1.523E-04 | global batch size: 256 | lm loss: 3.814168E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.145 | TFLOPs: 29.27 | +7: iteration 13310/ 37905 | consumed samples: 3407360 | consumed tokens: 6978273280 | elapsed time per iteration (s): 0.22 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.808838E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.741 | TFLOPs: 29.29 | +7: iteration 13320/ 37905 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 0.22 | learning rate: 1.522E-04 | global batch size: 256 | lm loss: 3.825405E+00 | grad norm: 0.355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.681 | TFLOPs: 29.26 | +7: iteration 13330/ 37905 | consumed samples: 3412480 | consumed tokens: 6988759040 | elapsed time per iteration (s): 0.22 | learning rate: 1.521E-04 | global batch size: 256 | lm loss: 3.819175E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.667 | TFLOPs: 29.29 | +7: iteration 13340/ 37905 | consumed samples: 3415040 | consumed tokens: 6994001920 | elapsed time per iteration (s): 0.22 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.815199E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.080 | TFLOPs: 29.27 | +7: iteration 13350/ 37905 | consumed samples: 3417600 | consumed tokens: 6999244800 | elapsed time per iteration (s): 0.22 | learning rate: 1.520E-04 | global batch size: 256 | lm loss: 3.819793E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.845 | TFLOPs: 29.29 | +7: iteration 13360/ 37905 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 0.22 | learning rate: 1.519E-04 | global batch size: 256 | lm loss: 3.815451E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.321 | TFLOPs: 29.28 | +7: iteration 13370/ 37905 | consumed samples: 3422720 | consumed tokens: 7009730560 | elapsed time per iteration (s): 0.22 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.808271E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.905 | TFLOPs: 29.29 | +7: iteration 13380/ 37905 | consumed samples: 3425280 | consumed tokens: 7014973440 | elapsed time per iteration (s): 0.22 | learning rate: 1.518E-04 | global batch size: 256 | lm loss: 3.823120E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.342 | TFLOPs: 29.28 | +7: iteration 13390/ 37905 | consumed samples: 3427840 | consumed tokens: 7020216320 | elapsed time per iteration (s): 0.22 | learning rate: 1.517E-04 | global batch size: 256 | lm loss: 3.812940E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.024 | TFLOPs: 29.27 | +7: iteration 13400/ 37905 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 0.22 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.817005E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.718 | TFLOPs: 29.29 | +7: iteration 13410/ 37905 | consumed samples: 3432960 | consumed tokens: 7030702080 | elapsed time per iteration (s): 0.22 | learning rate: 1.516E-04 | global batch size: 256 | lm loss: 3.807891E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.361 | TFLOPs: 29.28 | +7: iteration 13420/ 37905 | consumed samples: 3435520 | consumed tokens: 7035944960 | elapsed time per iteration (s): 0.22 | learning rate: 1.515E-04 | global batch size: 256 | lm loss: 3.818869E+00 | grad norm: 0.376 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.866 | TFLOPs: 29.29 | +7: iteration 13430/ 37905 | consumed samples: 3438080 | consumed tokens: 7041187840 | elapsed time per iteration (s): 0.22 | learning rate: 1.514E-04 | global batch size: 256 | lm loss: 3.810377E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.201 | TFLOPs: 29.28 | +7: iteration 13440/ 37905 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 0.22 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.814988E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.415 | TFLOPs: 29.28 | +7: iteration 13450/ 37905 | consumed samples: 3443200 | consumed tokens: 7051673600 | elapsed time per iteration (s): 0.22 | learning rate: 1.513E-04 | global batch size: 256 | lm loss: 3.808404E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.592 | TFLOPs: 29.29 | +7: iteration 13460/ 37905 | consumed samples: 3445760 | consumed tokens: 7056916480 | elapsed time per iteration (s): 0.22 | learning rate: 1.512E-04 | global batch size: 256 | lm loss: 3.806623E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.691 | TFLOPs: 29.29 | +7: iteration 13470/ 37905 | consumed samples: 3448320 | consumed tokens: 7062159360 | elapsed time per iteration (s): 0.22 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.821487E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.113 | TFLOPs: 29.27 | +7: iteration 13480/ 37905 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 0.22 | learning rate: 1.511E-04 | global batch size: 256 | lm loss: 3.811795E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.131 | TFLOPs: 29.25 | +7: iteration 13490/ 37905 | consumed samples: 3453440 | consumed tokens: 7072645120 | elapsed time per iteration (s): 0.22 | learning rate: 1.510E-04 | global batch size: 256 | lm loss: 3.805419E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.283 | TFLOPs: 29.28 | +7: iteration 13500/ 37905 | consumed samples: 3456000 | consumed tokens: 7077888000 | elapsed time per iteration (s): 0.22 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.829868E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.244 | TFLOPs: 29.28 | +7: iteration 13510/ 37905 | consumed samples: 3458560 | consumed tokens: 7083130880 | elapsed time per iteration (s): 0.22 | learning rate: 1.509E-04 | global batch size: 256 | lm loss: 3.814214E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.388 | TFLOPs: 29.28 | +7: iteration 13520/ 37905 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 0.22 | learning rate: 1.508E-04 | global batch size: 256 | lm loss: 3.810737E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.377 | TFLOPs: 29.05 | +7: iteration 13530/ 37905 | consumed samples: 3463680 | consumed tokens: 7093616640 | elapsed time per iteration (s): 0.22 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.817816E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.453 | TFLOPs: 29.28 | +7: iteration 13540/ 37905 | consumed samples: 3466240 | consumed tokens: 7098859520 | elapsed time per iteration (s): 0.22 | learning rate: 1.507E-04 | global batch size: 256 | lm loss: 3.832420E+00 | grad norm: 0.342 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.598 | TFLOPs: 29.29 | +7: iteration 13550/ 37905 | consumed samples: 3468800 | consumed tokens: 7104102400 | elapsed time per iteration (s): 0.22 | learning rate: 1.506E-04 | global batch size: 256 | lm loss: 3.821730E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.896 | TFLOPs: 29.27 | +7: iteration 13560/ 37905 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 0.22 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.804948E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.151 | TFLOPs: 29.27 | +7: iteration 13570/ 37905 | consumed samples: 3473920 | consumed tokens: 7114588160 | elapsed time per iteration (s): 0.22 | learning rate: 1.505E-04 | global batch size: 256 | lm loss: 3.818781E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.056 | TFLOPs: 29.27 | +7: iteration 13580/ 37905 | consumed samples: 3476480 | consumed tokens: 7119831040 | elapsed time per iteration (s): 0.22 | learning rate: 1.504E-04 | global batch size: 256 | lm loss: 3.811520E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.194 | TFLOPs: 29.02 | +7: iteration 13590/ 37905 | consumed samples: 3479040 | consumed tokens: 7125073920 | elapsed time per iteration (s): 0.22 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.824004E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.628 | TFLOPs: 29.31 | +7: iteration 13600/ 37905 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 0.22 | learning rate: 1.503E-04 | global batch size: 256 | lm loss: 3.817946E+00 | grad norm: 0.318 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.656 | TFLOPs: 29.31 | +7: iteration 13610/ 37905 | consumed samples: 3484160 | consumed tokens: 7135559680 | elapsed time per iteration (s): 0.22 | learning rate: 1.502E-04 | global batch size: 256 | lm loss: 3.812980E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.771 | TFLOPs: 29.29 | +7: iteration 13620/ 37905 | consumed samples: 3486720 | consumed tokens: 7140802560 | elapsed time per iteration (s): 0.22 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.813132E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.206 | TFLOPs: 29.30 | +7: iteration 13630/ 37905 | consumed samples: 3489280 | consumed tokens: 7146045440 | elapsed time per iteration (s): 0.22 | learning rate: 1.501E-04 | global batch size: 256 | lm loss: 3.809024E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.073 | TFLOPs: 29.30 | +7: iteration 13640/ 37905 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 0.22 | learning rate: 1.500E-04 | global batch size: 256 | lm loss: 3.818538E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.480 | TFLOPs: 29.10 | +7: iteration 13650/ 37905 | consumed samples: 3494400 | consumed tokens: 7156531200 | elapsed time per iteration (s): 0.22 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.811097E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.700 | TFLOPs: 29.11 | +7: iteration 13660/ 37905 | consumed samples: 3496960 | consumed tokens: 7161774080 | elapsed time per iteration (s): 0.22 | learning rate: 1.499E-04 | global batch size: 256 | lm loss: 3.820239E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.767 | TFLOPs: 29.04 | +7: iteration 13670/ 37905 | consumed samples: 3499520 | consumed tokens: 7167016960 | elapsed time per iteration (s): 0.22 | learning rate: 1.498E-04 | global batch size: 256 | lm loss: 3.821748E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.866 | TFLOPs: 29.29 | +7: iteration 13680/ 37905 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 0.22 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.825620E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.082 | TFLOPs: 29.30 | +7: iteration 13690/ 37905 | consumed samples: 3504640 | consumed tokens: 7177502720 | elapsed time per iteration (s): 0.23 | learning rate: 1.497E-04 | global batch size: 256 | lm loss: 3.807897E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.659 | TFLOPs: 28.93 | +7: iteration 13700/ 37905 | consumed samples: 3507200 | consumed tokens: 7182745600 | elapsed time per iteration (s): 0.22 | learning rate: 1.496E-04 | global batch size: 256 | lm loss: 3.810378E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.286 | TFLOPs: 29.30 | +7: iteration 13710/ 37905 | consumed samples: 3509760 | consumed tokens: 7187988480 | elapsed time per iteration (s): 0.22 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.815324E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.362 | TFLOPs: 29.31 | +7: iteration 13720/ 37905 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 0.22 | learning rate: 1.495E-04 | global batch size: 256 | lm loss: 3.811600E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.290 | TFLOPs: 29.30 | +7: iteration 13730/ 37905 | consumed samples: 3514880 | consumed tokens: 7198474240 | elapsed time per iteration (s): 0.22 | learning rate: 1.494E-04 | global batch size: 256 | lm loss: 3.803481E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.209 | TFLOPs: 29.28 | +7: iteration 13740/ 37905 | consumed samples: 3517440 | consumed tokens: 7203717120 | elapsed time per iteration (s): 0.22 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.810381E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.849 | TFLOPs: 29.27 | +7: iteration 13750/ 37905 | consumed samples: 3520000 | consumed tokens: 7208960000 | elapsed time per iteration (s): 0.22 | learning rate: 1.493E-04 | global batch size: 256 | lm loss: 3.816104E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.149 | TFLOPs: 29.30 | +7: iteration 13760/ 37905 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 0.22 | learning rate: 1.492E-04 | global batch size: 256 | lm loss: 3.804711E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.893 | TFLOPs: 29.32 | +7: iteration 13770/ 37905 | consumed samples: 3525120 | consumed tokens: 7219445760 | elapsed time per iteration (s): 0.22 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.812173E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.388 | TFLOPs: 29.31 | +7: iteration 13780/ 37905 | consumed samples: 3527680 | consumed tokens: 7224688640 | elapsed time per iteration (s): 0.22 | learning rate: 1.491E-04 | global batch size: 256 | lm loss: 3.802590E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.323 | TFLOPs: 29.38 | +7: iteration 13790/ 37905 | consumed samples: 3530240 | consumed tokens: 7229931520 | elapsed time per iteration (s): 0.22 | learning rate: 1.490E-04 | global batch size: 256 | lm loss: 3.820498E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.328 | TFLOPs: 29.38 | +7: iteration 13800/ 37905 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 0.22 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.817500E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.270 | TFLOPs: 29.30 | +7: iteration 13810/ 37905 | consumed samples: 3535360 | consumed tokens: 7240417280 | elapsed time per iteration (s): 0.22 | learning rate: 1.489E-04 | global batch size: 256 | lm loss: 3.819270E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.395 | TFLOPs: 29.33 | +7: iteration 13820/ 37905 | consumed samples: 3537920 | consumed tokens: 7245660160 | elapsed time per iteration (s): 0.22 | learning rate: 1.488E-04 | global batch size: 256 | lm loss: 3.801550E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.458 | TFLOPs: 29.33 | +7: iteration 13830/ 37905 | consumed samples: 3540480 | consumed tokens: 7250903040 | elapsed time per iteration (s): 0.22 | learning rate: 1.487E-04 | global batch size: 256 | lm loss: 3.804523E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.483 | TFLOPs: 29.31 | +7: iteration 13840/ 37905 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 0.22 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.818201E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.241 | TFLOPs: 29.33 | +7: iteration 13850/ 37905 | consumed samples: 3545600 | consumed tokens: 7261388800 | elapsed time per iteration (s): 0.22 | learning rate: 1.486E-04 | global batch size: 256 | lm loss: 3.795044E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.054 | TFLOPs: 29.35 | +7: iteration 13860/ 37905 | consumed samples: 3548160 | consumed tokens: 7266631680 | elapsed time per iteration (s): 0.22 | learning rate: 1.485E-04 | global batch size: 256 | lm loss: 3.806577E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.683 | TFLOPs: 29.34 | +7: iteration 13870/ 37905 | consumed samples: 3550720 | consumed tokens: 7271874560 | elapsed time per iteration (s): 0.22 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.813122E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.084 | TFLOPs: 29.32 | +7: iteration 13880/ 37905 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 0.22 | learning rate: 1.484E-04 | global batch size: 256 | lm loss: 3.810638E+00 | grad norm: 0.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.029 | TFLOPs: 29.37 | +7: iteration 13890/ 37905 | consumed samples: 3555840 | consumed tokens: 7282360320 | elapsed time per iteration (s): 0.22 | learning rate: 1.483E-04 | global batch size: 256 | lm loss: 3.802962E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.775 | TFLOPs: 29.37 | +7: iteration 13900/ 37905 | consumed samples: 3558400 | consumed tokens: 7287603200 | elapsed time per iteration (s): 0.22 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.814197E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.999 | TFLOPs: 29.37 | +7: iteration 13910/ 37905 | consumed samples: 3560960 | consumed tokens: 7292846080 | elapsed time per iteration (s): 0.22 | learning rate: 1.482E-04 | global batch size: 256 | lm loss: 3.810378E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.136 | TFLOPs: 29.38 | +7: iteration 13920/ 37905 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 0.22 | learning rate: 1.481E-04 | global batch size: 256 | lm loss: 3.810513E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.057 | TFLOPs: 29.37 | +7: iteration 13930/ 37905 | consumed samples: 3566080 | consumed tokens: 7303331840 | elapsed time per iteration (s): 0.22 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.818654E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.869 | TFLOPs: 29.37 | +7: iteration 13940/ 37905 | consumed samples: 3568640 | consumed tokens: 7308574720 | elapsed time per iteration (s): 0.22 | learning rate: 1.480E-04 | global batch size: 256 | lm loss: 3.809532E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.529 | TFLOPs: 29.36 | +7: iteration 13950/ 37905 | consumed samples: 3571200 | consumed tokens: 7313817600 | elapsed time per iteration (s): 0.22 | learning rate: 1.479E-04 | global batch size: 256 | lm loss: 3.819598E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.587 | TFLOPs: 29.36 | +7: iteration 13960/ 37905 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 0.22 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.816378E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.592 | TFLOPs: 29.36 | +7: iteration 13970/ 37905 | consumed samples: 3576320 | consumed tokens: 7324303360 | elapsed time per iteration (s): 0.22 | learning rate: 1.478E-04 | global batch size: 256 | lm loss: 3.798637E+00 | grad norm: 0.330 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.378 | TFLOPs: 29.36 | +7: iteration 13980/ 37905 | consumed samples: 3578880 | consumed tokens: 7329546240 | elapsed time per iteration (s): 0.22 | learning rate: 1.477E-04 | global batch size: 256 | lm loss: 3.807574E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.209 | TFLOPs: 29.38 | +7: iteration 13990/ 37905 | consumed samples: 3581440 | consumed tokens: 7334789120 | elapsed time per iteration (s): 0.22 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.804515E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.857 | TFLOPs: 29.37 | +0: [2023-03-15 22:49:36,100] [INFO] [logging.py:68:log_dist] [Rank 0] step=14000, skipped=0, lr=[0.00014755779238876917, 0.00014755779238876917, 0.00014755779238876917], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 14000/ 37905 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 0.22 | learning rate: 1.476E-04 | global batch size: 256 | lm loss: 3.813103E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.563 | TFLOPs: 29.36 | +0: steps: 14000 loss: 3.7939 iter time (s): 0.221 samples/sec: 1157.715 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 14000 | lm loss value: 3.758934E+00 | lm loss PPL: 4.290269E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 14000 to checkpoints_83m20b400m +0: [2023-03-15 22:49:36,188] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step14000 is begin to save! +0: [2023-03-15 22:49:36,192] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:49:36,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:49:36,263] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:49:36,274] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:49:36,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:49:36,285] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:49:36,286] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:49:36,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:49:36,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:49:36,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:49:36,308] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:49:36,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:49:36,319] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:49:36,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:49:36,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:49:36,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:49:36,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:49:36,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:49:36,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:49:36,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:49:36,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:49:36,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:49:36,375] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:49:36,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:49:36,377] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step14000/mp_rank_00_model_states.pt +0: [2023-03-15 22:49:36,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:49:36,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:49:36,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:49:36,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:49:36,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:49:36,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +4: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +3: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +2: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +5: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +5: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +7: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:49:36,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +7: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +1: [2023-03-15 22:49:36,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:49:36,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +6: [2023-03-15 22:49:36,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:49:36,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step14000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:49:36,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step14000 is ready now! +0: successfully saved checkpoint at iteration 14000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 240.32 +7: iteration 14010/ 37905 | consumed samples: 3586560 | consumed tokens: 7345274880 | elapsed time per iteration (s): 0.25 | learning rate: 1.475E-04 | global batch size: 256 | lm loss: 3.816951E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1007.152 | TFLOPs: 25.66 | +7: iteration 14020/ 37905 | consumed samples: 3589120 | consumed tokens: 7350517760 | elapsed time per iteration (s): 0.22 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.803443E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.372 | TFLOPs: 29.36 | +7: iteration 14030/ 37905 | consumed samples: 3591680 | consumed tokens: 7355760640 | elapsed time per iteration (s): 0.22 | learning rate: 1.474E-04 | global batch size: 256 | lm loss: 3.816834E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.348 | TFLOPs: 29.38 | +7: iteration 14040/ 37905 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 0.22 | learning rate: 1.473E-04 | global batch size: 256 | lm loss: 3.804811E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.707 | TFLOPs: 29.34 | +7: iteration 14050/ 37905 | consumed samples: 3596800 | consumed tokens: 7366246400 | elapsed time per iteration (s): 0.22 | learning rate: 1.472E-04 | global batch size: 256 | lm loss: 3.812244E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.342 | TFLOPs: 29.36 | +7: iteration 14060/ 37905 | consumed samples: 3599360 | consumed tokens: 7371489280 | elapsed time per iteration (s): 0.22 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.793371E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.550 | TFLOPs: 29.39 | +7: iteration 14070/ 37905 | consumed samples: 3601920 | consumed tokens: 7376732160 | elapsed time per iteration (s): 0.23 | learning rate: 1.471E-04 | global batch size: 256 | lm loss: 3.806550E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1091.577 | TFLOPs: 27.81 | +7: iteration 14080/ 37905 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 0.22 | learning rate: 1.470E-04 | global batch size: 256 | lm loss: 3.808352E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.334 | TFLOPs: 29.38 | +7: iteration 14090/ 37905 | consumed samples: 3607040 | consumed tokens: 7387217920 | elapsed time per iteration (s): 0.22 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.814885E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.619 | TFLOPs: 29.24 | +7: iteration 14100/ 37905 | consumed samples: 3609600 | consumed tokens: 7392460800 | elapsed time per iteration (s): 0.22 | learning rate: 1.469E-04 | global batch size: 256 | lm loss: 3.816980E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.677 | TFLOPs: 29.36 | +7: iteration 14110/ 37905 | consumed samples: 3612160 | consumed tokens: 7397703680 | elapsed time per iteration (s): 0.22 | learning rate: 1.468E-04 | global batch size: 256 | lm loss: 3.801219E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.924 | TFLOPs: 29.37 | +7: iteration 14120/ 37905 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 0.22 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.818779E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.487 | TFLOPs: 29.36 | +7: iteration 14130/ 37905 | consumed samples: 3617280 | consumed tokens: 7408189440 | elapsed time per iteration (s): 0.22 | learning rate: 1.467E-04 | global batch size: 256 | lm loss: 3.819795E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.738 | TFLOPs: 29.37 | +7: iteration 14140/ 37905 | consumed samples: 3619840 | consumed tokens: 7413432320 | elapsed time per iteration (s): 0.22 | learning rate: 1.466E-04 | global batch size: 256 | lm loss: 3.804263E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.512 | TFLOPs: 29.36 | +7: iteration 14150/ 37905 | consumed samples: 3622400 | consumed tokens: 7418675200 | elapsed time per iteration (s): 0.22 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.794327E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.476 | TFLOPs: 29.36 | +7: iteration 14160/ 37905 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 0.22 | learning rate: 1.465E-04 | global batch size: 256 | lm loss: 3.801254E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.489 | TFLOPs: 29.33 | +7: iteration 14170/ 37905 | consumed samples: 3627520 | consumed tokens: 7429160960 | elapsed time per iteration (s): 0.22 | learning rate: 1.464E-04 | global batch size: 256 | lm loss: 3.808579E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.459 | TFLOPs: 29.36 | +7: iteration 14180/ 37905 | consumed samples: 3630080 | consumed tokens: 7434403840 | elapsed time per iteration (s): 0.22 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.803150E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.213 | TFLOPs: 29.35 | +7: iteration 14190/ 37905 | consumed samples: 3632640 | consumed tokens: 7439646720 | elapsed time per iteration (s): 0.22 | learning rate: 1.463E-04 | global batch size: 256 | lm loss: 3.818787E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.064 | TFLOPs: 29.35 | +7: iteration 14200/ 37905 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 0.22 | learning rate: 1.462E-04 | global batch size: 256 | lm loss: 3.802161E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.187 | TFLOPs: 29.35 | +7: iteration 14210/ 37905 | consumed samples: 3637760 | consumed tokens: 7450132480 | elapsed time per iteration (s): 0.22 | learning rate: 1.461E-04 | global batch size: 256 | lm loss: 3.816994E+00 | grad norm: 0.328 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.445 | TFLOPs: 29.36 | +7: iteration 14220/ 37905 | consumed samples: 3640320 | consumed tokens: 7455375360 | elapsed time per iteration (s): 0.22 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.801462E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.408 | TFLOPs: 29.36 | +7: iteration 14230/ 37905 | consumed samples: 3642880 | consumed tokens: 7460618240 | elapsed time per iteration (s): 0.22 | learning rate: 1.460E-04 | global batch size: 256 | lm loss: 3.802524E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.651 | TFLOPs: 29.36 | +7: iteration 14240/ 37905 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 0.22 | learning rate: 1.459E-04 | global batch size: 256 | lm loss: 3.805732E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.670 | TFLOPs: 29.34 | +7: iteration 14250/ 37905 | consumed samples: 3648000 | consumed tokens: 7471104000 | elapsed time per iteration (s): 0.22 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.811133E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.691 | TFLOPs: 29.31 | +7: iteration 14260/ 37905 | consumed samples: 3650560 | consumed tokens: 7476346880 | elapsed time per iteration (s): 0.22 | learning rate: 1.458E-04 | global batch size: 256 | lm loss: 3.819503E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.615 | TFLOPs: 29.34 | +7: iteration 14270/ 37905 | consumed samples: 3653120 | consumed tokens: 7481589760 | elapsed time per iteration (s): 0.22 | learning rate: 1.457E-04 | global batch size: 256 | lm loss: 3.801347E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.728 | TFLOPs: 29.31 | +7: iteration 14280/ 37905 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 0.22 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.792484E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.399 | TFLOPs: 29.36 | +7: iteration 14290/ 37905 | consumed samples: 3658240 | consumed tokens: 7492075520 | elapsed time per iteration (s): 0.22 | learning rate: 1.456E-04 | global batch size: 256 | lm loss: 3.805724E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.272 | TFLOPs: 29.33 | +7: iteration 14300/ 37905 | consumed samples: 3660800 | consumed tokens: 7497318400 | elapsed time per iteration (s): 0.22 | learning rate: 1.455E-04 | global batch size: 256 | lm loss: 3.803414E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.643 | TFLOPs: 29.34 | +7: iteration 14310/ 37905 | consumed samples: 3663360 | consumed tokens: 7502561280 | elapsed time per iteration (s): 0.22 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.813886E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.278 | TFLOPs: 29.35 | +7: iteration 14320/ 37905 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 0.22 | learning rate: 1.454E-04 | global batch size: 256 | lm loss: 3.811966E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.385 | TFLOPs: 29.36 | +7: iteration 14330/ 37905 | consumed samples: 3668480 | consumed tokens: 7513047040 | elapsed time per iteration (s): 0.22 | learning rate: 1.453E-04 | global batch size: 256 | lm loss: 3.807279E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.196 | TFLOPs: 29.35 | +7: iteration 14340/ 37905 | consumed samples: 3671040 | consumed tokens: 7518289920 | elapsed time per iteration (s): 0.22 | learning rate: 1.452E-04 | global batch size: 256 | lm loss: 3.804742E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.716 | TFLOPs: 29.37 | +7: iteration 14350/ 37905 | consumed samples: 3673600 | consumed tokens: 7523532800 | elapsed time per iteration (s): 0.22 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.792787E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.141 | TFLOPs: 29.35 | +7: iteration 14360/ 37905 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 0.22 | learning rate: 1.451E-04 | global batch size: 256 | lm loss: 3.804770E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.668 | TFLOPs: 29.34 | +7: iteration 14370/ 37905 | consumed samples: 3678720 | consumed tokens: 7534018560 | elapsed time per iteration (s): 0.22 | learning rate: 1.450E-04 | global batch size: 256 | lm loss: 3.796692E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.908 | TFLOPs: 29.35 | +7: iteration 14380/ 37905 | consumed samples: 3681280 | consumed tokens: 7539261440 | elapsed time per iteration (s): 0.22 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.798428E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.732 | TFLOPs: 29.34 | +7: iteration 14390/ 37905 | consumed samples: 3683840 | consumed tokens: 7544504320 | elapsed time per iteration (s): 0.22 | learning rate: 1.449E-04 | global batch size: 256 | lm loss: 3.806572E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.007 | TFLOPs: 29.35 | +7: iteration 14400/ 37905 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 0.22 | learning rate: 1.448E-04 | global batch size: 256 | lm loss: 3.807956E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.186 | TFLOPs: 29.35 | +7: iteration 14410/ 37905 | consumed samples: 3688960 | consumed tokens: 7554990080 | elapsed time per iteration (s): 0.22 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.797980E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.130 | TFLOPs: 29.35 | +7: iteration 14420/ 37905 | consumed samples: 3691520 | consumed tokens: 7560232960 | elapsed time per iteration (s): 0.22 | learning rate: 1.447E-04 | global batch size: 256 | lm loss: 3.795404E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.449 | TFLOPs: 29.33 | +7: iteration 14430/ 37905 | consumed samples: 3694080 | consumed tokens: 7565475840 | elapsed time per iteration (s): 0.22 | learning rate: 1.446E-04 | global batch size: 256 | lm loss: 3.797684E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.581 | TFLOPs: 29.36 | +7: iteration 14440/ 37905 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 0.22 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.792223E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.340 | TFLOPs: 29.36 | +7: iteration 14450/ 37905 | consumed samples: 3699200 | consumed tokens: 7575961600 | elapsed time per iteration (s): 0.22 | learning rate: 1.445E-04 | global batch size: 256 | lm loss: 3.814190E+00 | grad norm: 0.319 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.926 | TFLOPs: 29.37 | +7: iteration 14460/ 37905 | consumed samples: 3701760 | consumed tokens: 7581204480 | elapsed time per iteration (s): 0.22 | learning rate: 1.444E-04 | global batch size: 256 | lm loss: 3.797669E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.994 | TFLOPs: 29.35 | +7: iteration 14470/ 37905 | consumed samples: 3704320 | consumed tokens: 7586447360 | elapsed time per iteration (s): 0.22 | learning rate: 1.443E-04 | global batch size: 256 | lm loss: 3.803026E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.701 | TFLOPs: 29.34 | +7: iteration 14480/ 37905 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 0.22 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.799580E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.384 | TFLOPs: 29.36 | +7: iteration 14490/ 37905 | consumed samples: 3709440 | consumed tokens: 7596933120 | elapsed time per iteration (s): 0.22 | learning rate: 1.442E-04 | global batch size: 256 | lm loss: 3.806973E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.980 | TFLOPs: 29.35 | +7: iteration 14500/ 37905 | consumed samples: 3712000 | consumed tokens: 7602176000 | elapsed time per iteration (s): 0.22 | learning rate: 1.441E-04 | global batch size: 256 | lm loss: 3.798893E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.773 | TFLOPs: 29.34 | +7: iteration 14510/ 37905 | consumed samples: 3714560 | consumed tokens: 7607418880 | elapsed time per iteration (s): 0.22 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.817432E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.764 | TFLOPs: 29.21 | +7: iteration 14520/ 37905 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 0.22 | learning rate: 1.440E-04 | global batch size: 256 | lm loss: 3.806907E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.872 | TFLOPs: 29.37 | +7: iteration 14530/ 37905 | consumed samples: 3719680 | consumed tokens: 7617904640 | elapsed time per iteration (s): 0.22 | learning rate: 1.439E-04 | global batch size: 256 | lm loss: 3.797248E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.506 | TFLOPs: 29.03 | +7: iteration 14540/ 37905 | consumed samples: 3722240 | consumed tokens: 7623147520 | elapsed time per iteration (s): 0.22 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.783710E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.090 | TFLOPs: 29.35 | +7: iteration 14550/ 37905 | consumed samples: 3724800 | consumed tokens: 7628390400 | elapsed time per iteration (s): 0.22 | learning rate: 1.438E-04 | global batch size: 256 | lm loss: 3.814676E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.455 | TFLOPs: 29.36 | +7: iteration 14560/ 37905 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 0.22 | learning rate: 1.437E-04 | global batch size: 256 | lm loss: 3.796042E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.458 | TFLOPs: 29.33 | +7: iteration 14570/ 37905 | consumed samples: 3729920 | consumed tokens: 7638876160 | elapsed time per iteration (s): 0.22 | learning rate: 1.436E-04 | global batch size: 256 | lm loss: 3.793372E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.447 | TFLOPs: 29.33 | +7: iteration 14580/ 37905 | consumed samples: 3732480 | consumed tokens: 7644119040 | elapsed time per iteration (s): 0.22 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.806075E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.389 | TFLOPs: 29.36 | +7: iteration 14590/ 37905 | consumed samples: 3735040 | consumed tokens: 7649361920 | elapsed time per iteration (s): 0.22 | learning rate: 1.435E-04 | global batch size: 256 | lm loss: 3.807318E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.992 | TFLOPs: 29.35 | +7: iteration 14600/ 37905 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 0.22 | learning rate: 1.434E-04 | global batch size: 256 | lm loss: 3.797289E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.183 | TFLOPs: 29.35 | +7: iteration 14610/ 37905 | consumed samples: 3740160 | consumed tokens: 7659847680 | elapsed time per iteration (s): 0.22 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.790305E+00 | grad norm: 0.311 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.258 | TFLOPs: 29.35 | +7: iteration 14620/ 37905 | consumed samples: 3742720 | consumed tokens: 7665090560 | elapsed time per iteration (s): 0.22 | learning rate: 1.433E-04 | global batch size: 256 | lm loss: 3.800152E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.707 | TFLOPs: 29.34 | +7: iteration 14630/ 37905 | consumed samples: 3745280 | consumed tokens: 7670333440 | elapsed time per iteration (s): 0.22 | learning rate: 1.432E-04 | global batch size: 256 | lm loss: 3.796212E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.210 | TFLOPs: 29.35 | +7: iteration 14640/ 37905 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 0.22 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.808614E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.414 | TFLOPs: 29.36 | +7: iteration 14650/ 37905 | consumed samples: 3750400 | consumed tokens: 7680819200 | elapsed time per iteration (s): 0.22 | learning rate: 1.431E-04 | global batch size: 256 | lm loss: 3.797242E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.657 | TFLOPs: 29.31 | +7: iteration 14660/ 37905 | consumed samples: 3752960 | consumed tokens: 7686062080 | elapsed time per iteration (s): 0.22 | learning rate: 1.430E-04 | global batch size: 256 | lm loss: 3.790982E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.190 | TFLOPs: 29.35 | +7: iteration 14670/ 37905 | consumed samples: 3755520 | consumed tokens: 7691304960 | elapsed time per iteration (s): 0.22 | learning rate: 1.429E-04 | global batch size: 256 | lm loss: 3.808016E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.585 | TFLOPs: 29.34 | +7: iteration 14680/ 37905 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 0.22 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.798439E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.317 | TFLOPs: 29.36 | +7: iteration 14690/ 37905 | consumed samples: 3760640 | consumed tokens: 7701790720 | elapsed time per iteration (s): 0.22 | learning rate: 1.428E-04 | global batch size: 256 | lm loss: 3.789106E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.087 | TFLOPs: 29.35 | +7: iteration 14700/ 37905 | consumed samples: 3763200 | consumed tokens: 7707033600 | elapsed time per iteration (s): 0.22 | learning rate: 1.427E-04 | global batch size: 256 | lm loss: 3.796051E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.048 | TFLOPs: 29.35 | +7: iteration 14710/ 37905 | consumed samples: 3765760 | consumed tokens: 7712276480 | elapsed time per iteration (s): 0.22 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.794913E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.855 | TFLOPs: 29.34 | +7: iteration 14720/ 37905 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 0.22 | learning rate: 1.426E-04 | global batch size: 256 | lm loss: 3.800524E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.102 | TFLOPs: 29.32 | +7: iteration 14730/ 37905 | consumed samples: 3770880 | consumed tokens: 7722762240 | elapsed time per iteration (s): 0.22 | learning rate: 1.425E-04 | global batch size: 256 | lm loss: 3.793653E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.699 | TFLOPs: 29.37 | +7: iteration 14740/ 37905 | consumed samples: 3773440 | consumed tokens: 7728005120 | elapsed time per iteration (s): 0.22 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.798070E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.237 | TFLOPs: 29.35 | +7: iteration 14750/ 37905 | consumed samples: 3776000 | consumed tokens: 7733248000 | elapsed time per iteration (s): 0.22 | learning rate: 1.424E-04 | global batch size: 256 | lm loss: 3.797550E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.450 | TFLOPs: 29.36 | +7: iteration 14760/ 37905 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 0.22 | learning rate: 1.423E-04 | global batch size: 256 | lm loss: 3.794510E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.813 | TFLOPs: 29.37 | +7: iteration 14770/ 37905 | consumed samples: 3781120 | consumed tokens: 7743733760 | elapsed time per iteration (s): 0.22 | learning rate: 1.422E-04 | global batch size: 256 | lm loss: 3.789385E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.495 | TFLOPs: 29.36 | +7: iteration 14780/ 37905 | consumed samples: 3783680 | consumed tokens: 7748976640 | elapsed time per iteration (s): 0.22 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.792505E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.032 | TFLOPs: 29.35 | +7: iteration 14790/ 37905 | consumed samples: 3786240 | consumed tokens: 7754219520 | elapsed time per iteration (s): 0.22 | learning rate: 1.421E-04 | global batch size: 256 | lm loss: 3.801845E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.351 | TFLOPs: 29.36 | +7: iteration 14800/ 37905 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 0.22 | learning rate: 1.420E-04 | global batch size: 256 | lm loss: 3.805255E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.428 | TFLOPs: 29.33 | +7: iteration 14810/ 37905 | consumed samples: 3791360 | consumed tokens: 7764705280 | elapsed time per iteration (s): 0.22 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.793272E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.820 | TFLOPs: 29.34 | +7: iteration 14820/ 37905 | consumed samples: 3793920 | consumed tokens: 7769948160 | elapsed time per iteration (s): 0.22 | learning rate: 1.419E-04 | global batch size: 256 | lm loss: 3.798725E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.880 | TFLOPs: 29.37 | +7: iteration 14830/ 37905 | consumed samples: 3796480 | consumed tokens: 7775191040 | elapsed time per iteration (s): 0.22 | learning rate: 1.418E-04 | global batch size: 256 | lm loss: 3.809336E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.544 | TFLOPs: 29.34 | +7: iteration 14840/ 37905 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 0.22 | learning rate: 1.417E-04 | global batch size: 256 | lm loss: 3.807148E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.881 | TFLOPs: 29.34 | +7: iteration 14850/ 37905 | consumed samples: 3801600 | consumed tokens: 7785676800 | elapsed time per iteration (s): 0.22 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.784459E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.300 | TFLOPs: 29.33 | +7: iteration 14860/ 37905 | consumed samples: 3804160 | consumed tokens: 7790919680 | elapsed time per iteration (s): 0.22 | learning rate: 1.416E-04 | global batch size: 256 | lm loss: 3.783915E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.779 | TFLOPs: 29.34 | +7: iteration 14870/ 37905 | consumed samples: 3806720 | consumed tokens: 7796162560 | elapsed time per iteration (s): 0.22 | learning rate: 1.415E-04 | global batch size: 256 | lm loss: 3.800405E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.322 | TFLOPs: 29.33 | +7: iteration 14880/ 37905 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 0.22 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.783306E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.990 | TFLOPs: 29.35 | +7: iteration 14890/ 37905 | consumed samples: 3811840 | consumed tokens: 7806648320 | elapsed time per iteration (s): 0.22 | learning rate: 1.414E-04 | global batch size: 256 | lm loss: 3.784072E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.598 | TFLOPs: 29.34 | +7: iteration 14900/ 37905 | consumed samples: 3814400 | consumed tokens: 7811891200 | elapsed time per iteration (s): 0.22 | learning rate: 1.413E-04 | global batch size: 256 | lm loss: 3.783630E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.839 | TFLOPs: 29.34 | +7: iteration 14910/ 37905 | consumed samples: 3816960 | consumed tokens: 7817134080 | elapsed time per iteration (s): 0.22 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.800105E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.306 | TFLOPs: 29.36 | +7: iteration 14920/ 37905 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 0.22 | learning rate: 1.412E-04 | global batch size: 256 | lm loss: 3.795575E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.732 | TFLOPs: 29.34 | +7: iteration 14930/ 37905 | consumed samples: 3822080 | consumed tokens: 7827619840 | elapsed time per iteration (s): 0.22 | learning rate: 1.411E-04 | global batch size: 256 | lm loss: 3.808539E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.671 | TFLOPs: 29.34 | +7: iteration 14940/ 37905 | consumed samples: 3824640 | consumed tokens: 7832862720 | elapsed time per iteration (s): 0.22 | learning rate: 1.410E-04 | global batch size: 256 | lm loss: 3.797580E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.303 | TFLOPs: 29.33 | +7: iteration 14950/ 37905 | consumed samples: 3827200 | consumed tokens: 7838105600 | elapsed time per iteration (s): 0.22 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.781183E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.530 | TFLOPs: 29.34 | +7: iteration 14960/ 37905 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 0.22 | learning rate: 1.409E-04 | global batch size: 256 | lm loss: 3.806457E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.958 | TFLOPs: 29.37 | +7: iteration 14970/ 37905 | consumed samples: 3832320 | consumed tokens: 7848591360 | elapsed time per iteration (s): 0.22 | learning rate: 1.408E-04 | global batch size: 256 | lm loss: 3.817322E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.656 | TFLOPs: 29.36 | +7: iteration 14980/ 37905 | consumed samples: 3834880 | consumed tokens: 7853834240 | elapsed time per iteration (s): 0.22 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.794737E+00 | grad norm: 0.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.082 | TFLOPs: 29.37 | +7: iteration 14990/ 37905 | consumed samples: 3837440 | consumed tokens: 7859077120 | elapsed time per iteration (s): 0.22 | learning rate: 1.407E-04 | global batch size: 256 | lm loss: 3.782508E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.175 | TFLOPs: 29.15 | +7: iteration 15000/ 37905 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 0.22 | learning rate: 1.406E-04 | global batch size: 256 | lm loss: 3.798744E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.058 | TFLOPs: 29.15 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 15000 | lm loss value: 3.752554E+00 | lm loss PPL: 4.262982E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 15000 to checkpoints_83m20b400m +0: [2023-03-15 22:53:18,891] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step15000 is begin to save! +0: [2023-03-15 22:53:18,893] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:53:18,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:53:18,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:53:18,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:53:18,976] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:53:18,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:53:18,987] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:53:18,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:53:18,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:53:19,009] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:53:19,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:53:19,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:53:19,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:53:19,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:53:19,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:53:19,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:53:19,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:53:19,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:53:19,053] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:53:19,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:53:19,064] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:53:19,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:53:19,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:53:19,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:53:19,077] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step15000/mp_rank_00_model_states.pt +0: [2023-03-15 22:53:19,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:53:19,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:53:19,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +6: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +7: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +1: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:53:19,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +2: [2023-03-15 22:53:19,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:53:19,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 22:53:19,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +4: [2023-03-15 22:53:19,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:53:19,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 22:53:19,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +3: [2023-03-15 22:53:19,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:53:19,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step15000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:53:19,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step15000 is ready now! +0: successfully saved checkpoint at iteration 15000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 242.41 +7: iteration 15010/ 37905 | consumed samples: 3842560 | consumed tokens: 7869562880 | elapsed time per iteration (s): 0.26 | learning rate: 1.405E-04 | global batch size: 256 | lm loss: 3.796848E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 987.284 | TFLOPs: 25.15 | +7: iteration 15020/ 37905 | consumed samples: 3845120 | consumed tokens: 7874805760 | elapsed time per iteration (s): 0.22 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.813690E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.531 | TFLOPs: 29.39 | +7: iteration 15030/ 37905 | consumed samples: 3847680 | consumed tokens: 7880048640 | elapsed time per iteration (s): 0.22 | learning rate: 1.404E-04 | global batch size: 256 | lm loss: 3.794176E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.343 | TFLOPs: 29.38 | +7: iteration 15040/ 37905 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 0.22 | learning rate: 1.403E-04 | global batch size: 256 | lm loss: 3.790592E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.663 | TFLOPs: 29.39 | +7: iteration 15050/ 37905 | consumed samples: 3852800 | consumed tokens: 7890534400 | elapsed time per iteration (s): 0.22 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.801766E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.611 | TFLOPs: 29.36 | +7: iteration 15060/ 37905 | consumed samples: 3855360 | consumed tokens: 7895777280 | elapsed time per iteration (s): 0.22 | learning rate: 1.402E-04 | global batch size: 256 | lm loss: 3.785183E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.789 | TFLOPs: 29.37 | +7: iteration 15070/ 37905 | consumed samples: 3857920 | consumed tokens: 7901020160 | elapsed time per iteration (s): 0.22 | learning rate: 1.401E-04 | global batch size: 256 | lm loss: 3.785234E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.994 | TFLOPs: 29.37 | +7: iteration 15080/ 37905 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 0.22 | learning rate: 1.400E-04 | global batch size: 256 | lm loss: 3.787091E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.936 | TFLOPs: 29.37 | +7: iteration 15090/ 37905 | consumed samples: 3863040 | consumed tokens: 7911505920 | elapsed time per iteration (s): 0.22 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.804255E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.865 | TFLOPs: 29.11 | +7: iteration 15100/ 37905 | consumed samples: 3865600 | consumed tokens: 7916748800 | elapsed time per iteration (s): 0.22 | learning rate: 1.399E-04 | global batch size: 256 | lm loss: 3.777030E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.221 | TFLOPs: 29.38 | +7: iteration 15110/ 37905 | consumed samples: 3868160 | consumed tokens: 7921991680 | elapsed time per iteration (s): 0.22 | learning rate: 1.398E-04 | global batch size: 256 | lm loss: 3.788933E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.010 | TFLOPs: 29.37 | +7: iteration 15120/ 37905 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 0.22 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.805789E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.799 | TFLOPs: 29.24 | +7: iteration 15130/ 37905 | consumed samples: 3873280 | consumed tokens: 7932477440 | elapsed time per iteration (s): 0.22 | learning rate: 1.397E-04 | global batch size: 256 | lm loss: 3.795205E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.836 | TFLOPs: 29.34 | +7: iteration 15140/ 37905 | consumed samples: 3875840 | consumed tokens: 7937720320 | elapsed time per iteration (s): 0.22 | learning rate: 1.396E-04 | global batch size: 256 | lm loss: 3.808067E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.379 | TFLOPs: 29.33 | +7: iteration 15150/ 37905 | consumed samples: 3878400 | consumed tokens: 7942963200 | elapsed time per iteration (s): 0.22 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.801085E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.066 | TFLOPs: 29.32 | +7: iteration 15160/ 37905 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 0.22 | learning rate: 1.395E-04 | global batch size: 256 | lm loss: 3.804480E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.974 | TFLOPs: 29.35 | +7: iteration 15170/ 37905 | consumed samples: 3883520 | consumed tokens: 7953448960 | elapsed time per iteration (s): 0.22 | learning rate: 1.394E-04 | global batch size: 256 | lm loss: 3.790499E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.602 | TFLOPs: 29.36 | +7: iteration 15180/ 37905 | consumed samples: 3886080 | consumed tokens: 7958691840 | elapsed time per iteration (s): 0.22 | learning rate: 1.393E-04 | global batch size: 256 | lm loss: 3.793975E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.878 | TFLOPs: 29.34 | +7: iteration 15190/ 37905 | consumed samples: 3888640 | consumed tokens: 7963934720 | elapsed time per iteration (s): 0.22 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.787074E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.696 | TFLOPs: 29.34 | +7: iteration 15200/ 37905 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 0.22 | learning rate: 1.392E-04 | global batch size: 256 | lm loss: 3.790356E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.996 | TFLOPs: 29.32 | +7: iteration 15210/ 37905 | consumed samples: 3893760 | consumed tokens: 7974420480 | elapsed time per iteration (s): 0.23 | learning rate: 1.391E-04 | global batch size: 256 | lm loss: 3.788598E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.812 | TFLOPs: 28.45 | +7: iteration 15220/ 37905 | consumed samples: 3896320 | consumed tokens: 7979663360 | elapsed time per iteration (s): 0.22 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.791835E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.810 | TFLOPs: 29.24 | +7: iteration 15230/ 37905 | consumed samples: 3898880 | consumed tokens: 7984906240 | elapsed time per iteration (s): 0.22 | learning rate: 1.390E-04 | global batch size: 256 | lm loss: 3.787466E+00 | grad norm: 0.347 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.296 | TFLOPs: 29.35 | +7: iteration 15240/ 37905 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 0.22 | learning rate: 1.389E-04 | global batch size: 256 | lm loss: 3.790796E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.474 | TFLOPs: 29.36 | +7: iteration 15250/ 37905 | consumed samples: 3904000 | consumed tokens: 7995392000 | elapsed time per iteration (s): 0.22 | learning rate: 1.388E-04 | global batch size: 256 | lm loss: 3.787322E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.498 | TFLOPs: 29.36 | +7: iteration 15260/ 37905 | consumed samples: 3906560 | consumed tokens: 8000634880 | elapsed time per iteration (s): 0.22 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.802950E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.879 | TFLOPs: 29.37 | +7: iteration 15270/ 37905 | consumed samples: 3909120 | consumed tokens: 8005877760 | elapsed time per iteration (s): 0.22 | learning rate: 1.387E-04 | global batch size: 256 | lm loss: 3.779512E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.696 | TFLOPs: 29.34 | +7: iteration 15280/ 37905 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 0.22 | learning rate: 1.386E-04 | global batch size: 256 | lm loss: 3.789980E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.155 | TFLOPs: 29.35 | +7: iteration 15290/ 37905 | consumed samples: 3914240 | consumed tokens: 8016363520 | elapsed time per iteration (s): 0.22 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.802200E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.521 | TFLOPs: 29.36 | +7: iteration 15300/ 37905 | consumed samples: 3916800 | consumed tokens: 8021606400 | elapsed time per iteration (s): 0.22 | learning rate: 1.385E-04 | global batch size: 256 | lm loss: 3.793053E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.788 | TFLOPs: 29.39 | +7: iteration 15310/ 37905 | consumed samples: 3919360 | consumed tokens: 8026849280 | elapsed time per iteration (s): 0.22 | learning rate: 1.384E-04 | global batch size: 256 | lm loss: 3.794118E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.224 | TFLOPs: 29.38 | +7: iteration 15320/ 37905 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 0.22 | learning rate: 1.383E-04 | global batch size: 256 | lm loss: 3.781819E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.713 | TFLOPs: 29.39 | +7: iteration 15330/ 37905 | consumed samples: 3924480 | consumed tokens: 8037335040 | elapsed time per iteration (s): 0.22 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.805401E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.778 | TFLOPs: 29.39 | +7: iteration 15340/ 37905 | consumed samples: 3927040 | consumed tokens: 8042577920 | elapsed time per iteration (s): 0.22 | learning rate: 1.382E-04 | global batch size: 256 | lm loss: 3.798201E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.468 | TFLOPs: 29.36 | +7: iteration 15350/ 37905 | consumed samples: 3929600 | consumed tokens: 8047820800 | elapsed time per iteration (s): 0.22 | learning rate: 1.381E-04 | global batch size: 256 | lm loss: 3.796177E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.676 | TFLOPs: 29.36 | +7: iteration 15360/ 37905 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 0.22 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.807703E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.690 | TFLOPs: 29.34 | +7: iteration 15370/ 37905 | consumed samples: 3934720 | consumed tokens: 8058306560 | elapsed time per iteration (s): 0.22 | learning rate: 1.380E-04 | global batch size: 256 | lm loss: 3.790692E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.849 | TFLOPs: 29.27 | +7: iteration 15380/ 37905 | consumed samples: 3937280 | consumed tokens: 8063549440 | elapsed time per iteration (s): 0.22 | learning rate: 1.379E-04 | global batch size: 256 | lm loss: 3.796442E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.585 | TFLOPs: 29.34 | +7: iteration 15390/ 37905 | consumed samples: 3939840 | consumed tokens: 8068792320 | elapsed time per iteration (s): 0.22 | learning rate: 1.378E-04 | global batch size: 256 | lm loss: 3.799306E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.633 | TFLOPs: 29.34 | +7: iteration 15400/ 37905 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 0.22 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.792094E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.279 | TFLOPs: 29.33 | +7: iteration 15410/ 37905 | consumed samples: 3944960 | consumed tokens: 8079278080 | elapsed time per iteration (s): 0.22 | learning rate: 1.377E-04 | global batch size: 256 | lm loss: 3.791337E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.542 | TFLOPs: 29.34 | +7: iteration 15420/ 37905 | consumed samples: 3947520 | consumed tokens: 8084520960 | elapsed time per iteration (s): 0.22 | learning rate: 1.376E-04 | global batch size: 256 | lm loss: 3.783814E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.445 | TFLOPs: 29.36 | +7: iteration 15430/ 37905 | consumed samples: 3950080 | consumed tokens: 8089763840 | elapsed time per iteration (s): 0.22 | learning rate: 1.375E-04 | global batch size: 256 | lm loss: 3.789204E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.817 | TFLOPs: 29.34 | +7: iteration 15440/ 37905 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 0.22 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.796031E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.741 | TFLOPs: 29.34 | +7: iteration 15450/ 37905 | consumed samples: 3955200 | consumed tokens: 8100249600 | elapsed time per iteration (s): 0.22 | learning rate: 1.374E-04 | global batch size: 256 | lm loss: 3.786535E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.248 | TFLOPs: 29.35 | +7: iteration 15460/ 37905 | consumed samples: 3957760 | consumed tokens: 8105492480 | elapsed time per iteration (s): 0.22 | learning rate: 1.373E-04 | global batch size: 256 | lm loss: 3.798185E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.708 | TFLOPs: 29.34 | +7: iteration 15470/ 37905 | consumed samples: 3960320 | consumed tokens: 8110735360 | elapsed time per iteration (s): 0.22 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.791631E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.952 | TFLOPs: 29.35 | +7: iteration 15480/ 37905 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 0.22 | learning rate: 1.372E-04 | global batch size: 256 | lm loss: 3.786319E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.947 | TFLOPs: 29.35 | +7: iteration 15490/ 37905 | consumed samples: 3965440 | consumed tokens: 8121221120 | elapsed time per iteration (s): 0.22 | learning rate: 1.371E-04 | global batch size: 256 | lm loss: 3.781996E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.482 | TFLOPs: 29.33 | +7: iteration 15500/ 37905 | consumed samples: 3968000 | consumed tokens: 8126464000 | elapsed time per iteration (s): 0.22 | learning rate: 1.370E-04 | global batch size: 256 | lm loss: 3.805777E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.271 | TFLOPs: 29.33 | +7: iteration 15510/ 37905 | consumed samples: 3970560 | consumed tokens: 8131706880 | elapsed time per iteration (s): 0.22 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.791284E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.685 | TFLOPs: 29.34 | +7: iteration 15520/ 37905 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 0.22 | learning rate: 1.369E-04 | global batch size: 256 | lm loss: 3.801959E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.416 | TFLOPs: 29.36 | +7: iteration 15530/ 37905 | consumed samples: 3975680 | consumed tokens: 8142192640 | elapsed time per iteration (s): 0.22 | learning rate: 1.368E-04 | global batch size: 256 | lm loss: 3.787836E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.977 | TFLOPs: 29.32 | +7: iteration 15540/ 37905 | consumed samples: 3978240 | consumed tokens: 8147435520 | elapsed time per iteration (s): 0.22 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.795240E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.158 | TFLOPs: 29.35 | +7: iteration 15550/ 37905 | consumed samples: 3980800 | consumed tokens: 8152678400 | elapsed time per iteration (s): 0.22 | learning rate: 1.367E-04 | global batch size: 256 | lm loss: 3.806316E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.235 | TFLOPs: 29.35 | +7: iteration 15560/ 37905 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 0.22 | learning rate: 1.366E-04 | global batch size: 256 | lm loss: 3.802259E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.012 | TFLOPs: 29.35 | +7: iteration 15570/ 37905 | consumed samples: 3985920 | consumed tokens: 8163164160 | elapsed time per iteration (s): 0.22 | learning rate: 1.365E-04 | global batch size: 256 | lm loss: 3.793139E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.160 | TFLOPs: 29.35 | +7: iteration 15580/ 37905 | consumed samples: 3988480 | consumed tokens: 8168407040 | elapsed time per iteration (s): 0.22 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.796804E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.213 | TFLOPs: 29.35 | +7: iteration 15590/ 37905 | consumed samples: 3991040 | consumed tokens: 8173649920 | elapsed time per iteration (s): 0.22 | learning rate: 1.364E-04 | global batch size: 256 | lm loss: 3.796972E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.098 | TFLOPs: 29.35 | +7: iteration 15600/ 37905 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 0.22 | learning rate: 1.363E-04 | global batch size: 256 | lm loss: 3.783507E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.183 | TFLOPs: 29.33 | +7: iteration 15610/ 37905 | consumed samples: 3996160 | consumed tokens: 8184135680 | elapsed time per iteration (s): 0.22 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.784159E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.420 | TFLOPs: 29.33 | +7: iteration 15620/ 37905 | consumed samples: 3998720 | consumed tokens: 8189378560 | elapsed time per iteration (s): 0.22 | learning rate: 1.362E-04 | global batch size: 256 | lm loss: 3.785689E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.580 | TFLOPs: 29.34 | +7: iteration 15630/ 37905 | consumed samples: 4001280 | consumed tokens: 8194621440 | elapsed time per iteration (s): 0.22 | learning rate: 1.361E-04 | global batch size: 256 | lm loss: 3.790646E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.403 | TFLOPs: 29.36 | +7: iteration 15640/ 37905 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 0.22 | learning rate: 1.360E-04 | global batch size: 256 | lm loss: 3.803001E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.910 | TFLOPs: 29.35 | +7: iteration 15650/ 37905 | consumed samples: 4006400 | consumed tokens: 8205107200 | elapsed time per iteration (s): 0.22 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.775646E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.728 | TFLOPs: 29.34 | +7: iteration 15660/ 37905 | consumed samples: 4008960 | consumed tokens: 8210350080 | elapsed time per iteration (s): 0.22 | learning rate: 1.359E-04 | global batch size: 256 | lm loss: 3.799802E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.305 | TFLOPs: 29.33 | +7: iteration 15670/ 37905 | consumed samples: 4011520 | consumed tokens: 8215592960 | elapsed time per iteration (s): 0.22 | learning rate: 1.358E-04 | global batch size: 256 | lm loss: 3.785966E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.223 | TFLOPs: 29.35 | +7: iteration 15680/ 37905 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 0.22 | learning rate: 1.357E-04 | global batch size: 256 | lm loss: 3.793410E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.414 | TFLOPs: 29.36 | +7: iteration 15690/ 37905 | consumed samples: 4016640 | consumed tokens: 8226078720 | elapsed time per iteration (s): 0.22 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.787241E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.802 | TFLOPs: 29.37 | +7: iteration 15700/ 37905 | consumed samples: 4019200 | consumed tokens: 8231321600 | elapsed time per iteration (s): 0.22 | learning rate: 1.356E-04 | global batch size: 256 | lm loss: 3.793439E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.408 | TFLOPs: 29.38 | +7: iteration 15710/ 37905 | consumed samples: 4021760 | consumed tokens: 8236564480 | elapsed time per iteration (s): 0.22 | learning rate: 1.355E-04 | global batch size: 256 | lm loss: 3.797554E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.479 | TFLOPs: 29.33 | +7: iteration 15720/ 37905 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 0.22 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.799918E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.063 | TFLOPs: 29.37 | +7: iteration 15730/ 37905 | consumed samples: 4026880 | consumed tokens: 8247050240 | elapsed time per iteration (s): 0.22 | learning rate: 1.354E-04 | global batch size: 256 | lm loss: 3.789527E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.689 | TFLOPs: 29.36 | +7: iteration 15740/ 37905 | consumed samples: 4029440 | consumed tokens: 8252293120 | elapsed time per iteration (s): 0.22 | learning rate: 1.353E-04 | global batch size: 256 | lm loss: 3.784542E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.831 | TFLOPs: 29.37 | +7: iteration 15750/ 37905 | consumed samples: 4032000 | consumed tokens: 8257536000 | elapsed time per iteration (s): 0.22 | learning rate: 1.352E-04 | global batch size: 256 | lm loss: 3.779148E+00 | grad norm: 0.309 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.580 | TFLOPs: 29.39 | +7: iteration 15760/ 37905 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 0.22 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.791972E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.368 | TFLOPs: 29.36 | +7: iteration 15770/ 37905 | consumed samples: 4037120 | consumed tokens: 8268021760 | elapsed time per iteration (s): 0.22 | learning rate: 1.351E-04 | global batch size: 256 | lm loss: 3.774101E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.026 | TFLOPs: 29.27 | +7: iteration 15780/ 37905 | consumed samples: 4039680 | consumed tokens: 8273264640 | elapsed time per iteration (s): 0.22 | learning rate: 1.350E-04 | global batch size: 256 | lm loss: 3.796771E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.829 | TFLOPs: 29.32 | +7: iteration 15790/ 37905 | consumed samples: 4042240 | consumed tokens: 8278507520 | elapsed time per iteration (s): 0.22 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.774183E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.860 | TFLOPs: 29.37 | +7: iteration 15800/ 37905 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 0.22 | learning rate: 1.349E-04 | global batch size: 256 | lm loss: 3.787436E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.664 | TFLOPs: 29.36 | +7: iteration 15810/ 37905 | consumed samples: 4047360 | consumed tokens: 8288993280 | elapsed time per iteration (s): 0.22 | learning rate: 1.348E-04 | global batch size: 256 | lm loss: 3.785728E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.926 | TFLOPs: 29.37 | +7: iteration 15820/ 37905 | consumed samples: 4049920 | consumed tokens: 8294236160 | elapsed time per iteration (s): 0.22 | learning rate: 1.347E-04 | global batch size: 256 | lm loss: 3.779655E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.436 | TFLOPs: 29.36 | +7: iteration 15830/ 37905 | consumed samples: 4052480 | consumed tokens: 8299479040 | elapsed time per iteration (s): 0.22 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.796851E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.202 | TFLOPs: 29.38 | +7: iteration 15840/ 37905 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 0.22 | learning rate: 1.346E-04 | global batch size: 256 | lm loss: 3.782305E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.296 | TFLOPs: 29.35 | +7: iteration 15850/ 37905 | consumed samples: 4057600 | consumed tokens: 8309964800 | elapsed time per iteration (s): 0.22 | learning rate: 1.345E-04 | global batch size: 256 | lm loss: 3.800352E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.857 | TFLOPs: 29.37 | +7: iteration 15860/ 37905 | consumed samples: 4060160 | consumed tokens: 8315207680 | elapsed time per iteration (s): 0.22 | learning rate: 1.344E-04 | global batch size: 256 | lm loss: 3.784109E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.549 | TFLOPs: 29.39 | +7: iteration 15870/ 37905 | consumed samples: 4062720 | consumed tokens: 8320450560 | elapsed time per iteration (s): 0.22 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.799628E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.587 | TFLOPs: 29.39 | +7: iteration 15880/ 37905 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 0.22 | learning rate: 1.343E-04 | global batch size: 256 | lm loss: 3.790246E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.108 | TFLOPs: 29.38 | +7: iteration 15890/ 37905 | consumed samples: 4067840 | consumed tokens: 8330936320 | elapsed time per iteration (s): 0.22 | learning rate: 1.342E-04 | global batch size: 256 | lm loss: 3.789235E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.007 | TFLOPs: 29.09 | +7: iteration 15900/ 37905 | consumed samples: 4070400 | consumed tokens: 8336179200 | elapsed time per iteration (s): 0.22 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.796582E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.899 | TFLOPs: 29.37 | +7: iteration 15910/ 37905 | consumed samples: 4072960 | consumed tokens: 8341422080 | elapsed time per iteration (s): 0.22 | learning rate: 1.341E-04 | global batch size: 256 | lm loss: 3.778515E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.983 | TFLOPs: 29.37 | +7: iteration 15920/ 37905 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 0.22 | learning rate: 1.340E-04 | global batch size: 256 | lm loss: 3.793768E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.233 | TFLOPs: 29.38 | +7: iteration 15930/ 37905 | consumed samples: 4078080 | consumed tokens: 8351907840 | elapsed time per iteration (s): 0.22 | learning rate: 1.339E-04 | global batch size: 256 | lm loss: 3.778423E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.190 | TFLOPs: 29.38 | +7: iteration 15940/ 37905 | consumed samples: 4080640 | consumed tokens: 8357150720 | elapsed time per iteration (s): 0.22 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.776031E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.593 | TFLOPs: 29.36 | +7: iteration 15950/ 37905 | consumed samples: 4083200 | consumed tokens: 8362393600 | elapsed time per iteration (s): 0.22 | learning rate: 1.338E-04 | global batch size: 256 | lm loss: 3.789654E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.009 | TFLOPs: 29.40 | +7: iteration 15960/ 37905 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 0.22 | learning rate: 1.337E-04 | global batch size: 256 | lm loss: 3.776251E+00 | grad norm: 0.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.482 | TFLOPs: 29.39 | +7: iteration 15970/ 37905 | consumed samples: 4088320 | consumed tokens: 8372879360 | elapsed time per iteration (s): 0.22 | learning rate: 1.336E-04 | global batch size: 256 | lm loss: 3.801303E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.806 | TFLOPs: 29.37 | +7: iteration 15980/ 37905 | consumed samples: 4090880 | consumed tokens: 8378122240 | elapsed time per iteration (s): 0.22 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.791745E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.993 | TFLOPs: 29.37 | +7: iteration 15990/ 37905 | consumed samples: 4093440 | consumed tokens: 8383365120 | elapsed time per iteration (s): 0.22 | learning rate: 1.335E-04 | global batch size: 256 | lm loss: 3.784761E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.343 | TFLOPs: 29.38 | +0: [2023-03-15 22:57:01,476] [INFO] [logging.py:68:log_dist] [Rank 0] step=16000, skipped=0, lr=[0.00013340211513853558, 0.00013340211513853558, 0.00013340211513853558], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 16000/ 37905 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 0.22 | learning rate: 1.334E-04 | global batch size: 256 | lm loss: 3.769939E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.004 | TFLOPs: 29.37 | +0: steps: 16000 loss: 3.7763 iter time (s): 0.221 samples/sec: 1158.485 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 16000 | lm loss value: 3.780492E+00 | lm loss PPL: 4.383761E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 16000 to checkpoints_83m20b400m +0: [2023-03-15 22:57:01,564] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step16000 is begin to save! +0: [2023-03-15 22:57:01,566] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_01-model_00-model_states.pt... +0: [2023-03-15 22:57:01,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_01-model_00-model_states.pt. +0: [2023-03-15 22:57:01,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_03-model_00-model_states.pt... +0: [2023-03-15 22:57:01,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_03-model_00-model_states.pt. +0: [2023-03-15 22:57:01,649] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_04-model_00-model_states.pt... +0: [2023-03-15 22:57:01,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_04-model_00-model_states.pt. +0: [2023-03-15 22:57:01,660] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_05-model_00-model_states.pt... +0: [2023-03-15 22:57:01,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_05-model_00-model_states.pt. +0: [2023-03-15 22:57:01,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_06-model_00-model_states.pt... +0: [2023-03-15 22:57:01,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_06-model_00-model_states.pt. +0: [2023-03-15 22:57:01,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_07-model_00-model_states.pt... +0: [2023-03-15 22:57:01,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_07-model_00-model_states.pt. +0: [2023-03-15 22:57:01,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_08-model_00-model_states.pt... +0: [2023-03-15 22:57:01,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_08-model_00-model_states.pt. +0: [2023-03-15 22:57:01,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_09-model_00-model_states.pt... +0: [2023-03-15 22:57:01,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_09-model_00-model_states.pt. +0: [2023-03-15 22:57:01,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_10-model_00-model_states.pt... +0: [2023-03-15 22:57:01,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_10-model_00-model_states.pt. +0: [2023-03-15 22:57:01,727] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_11-model_00-model_states.pt... +0: [2023-03-15 22:57:01,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_11-model_00-model_states.pt. +0: [2023-03-15 22:57:01,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_12-model_00-model_states.pt... +0: [2023-03-15 22:57:01,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_12-model_00-model_states.pt. +0: [2023-03-15 22:57:01,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/layer_14-model_00-model_states.pt... +0: [2023-03-15 22:57:01,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/layer_14-model_00-model_states.pt. +0: [2023-03-15 22:57:01,751] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step16000/mp_rank_00_model_states.pt +0: [2023-03-15 22:57:01,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/mp_rank_00_model_states.pt... +0: [2023-03-15 22:57:01,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/mp_rank_00_model_states.pt. +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +2: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 22:57:01,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +0: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,789] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,789] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,790] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,792] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,792] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,793] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,793] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,797] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +5: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,798] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +6: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +7: [2023-03-15 22:57:01,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +3: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +1: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +6: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +4: [2023-03-15 22:57:01,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 22:57:01,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +2: [2023-03-15 22:57:01,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 22:57:01,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step16000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 22:57:01,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step16000 is ready now! +0: successfully saved checkpoint at iteration 16000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.67 +7: iteration 16010/ 37905 | consumed samples: 4098560 | consumed tokens: 8393850880 | elapsed time per iteration (s): 0.25 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.783394E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1008.504 | TFLOPs: 25.69 | +7: iteration 16020/ 37905 | consumed samples: 4101120 | consumed tokens: 8399093760 | elapsed time per iteration (s): 0.22 | learning rate: 1.333E-04 | global batch size: 256 | lm loss: 3.774315E+00 | grad norm: 0.302 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.169 | TFLOPs: 29.43 | +7: iteration 16030/ 37905 | consumed samples: 4103680 | consumed tokens: 8404336640 | elapsed time per iteration (s): 0.22 | learning rate: 1.332E-04 | global batch size: 256 | lm loss: 3.788422E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.396 | TFLOPs: 29.41 | +7: iteration 16040/ 37905 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 0.22 | learning rate: 1.331E-04 | global batch size: 256 | lm loss: 3.784787E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.904 | TFLOPs: 29.40 | +7: iteration 16050/ 37905 | consumed samples: 4108800 | consumed tokens: 8414822400 | elapsed time per iteration (s): 0.22 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.776559E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.596 | TFLOPs: 29.41 | +7: iteration 16060/ 37905 | consumed samples: 4111360 | consumed tokens: 8420065280 | elapsed time per iteration (s): 0.22 | learning rate: 1.330E-04 | global batch size: 256 | lm loss: 3.781393E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.825 | TFLOPs: 29.42 | +7: iteration 16070/ 37905 | consumed samples: 4113920 | consumed tokens: 8425308160 | elapsed time per iteration (s): 0.22 | learning rate: 1.329E-04 | global batch size: 256 | lm loss: 3.797782E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.610 | TFLOPs: 29.41 | +7: iteration 16080/ 37905 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 0.22 | learning rate: 1.328E-04 | global batch size: 256 | lm loss: 3.789953E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.074 | TFLOPs: 29.40 | +7: iteration 16090/ 37905 | consumed samples: 4119040 | consumed tokens: 8435793920 | elapsed time per iteration (s): 0.22 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.777053E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.972 | TFLOPs: 29.40 | +7: iteration 16100/ 37905 | consumed samples: 4121600 | consumed tokens: 8441036800 | elapsed time per iteration (s): 0.22 | learning rate: 1.327E-04 | global batch size: 256 | lm loss: 3.790815E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.356 | TFLOPs: 29.41 | +7: iteration 16110/ 37905 | consumed samples: 4124160 | consumed tokens: 8446279680 | elapsed time per iteration (s): 0.22 | learning rate: 1.326E-04 | global batch size: 256 | lm loss: 3.804035E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.620 | TFLOPs: 29.41 | +7: iteration 16120/ 37905 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 0.22 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.778021E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.038 | TFLOPs: 29.40 | +7: iteration 16130/ 37905 | consumed samples: 4129280 | consumed tokens: 8456765440 | elapsed time per iteration (s): 0.22 | learning rate: 1.325E-04 | global batch size: 256 | lm loss: 3.791961E+00 | grad norm: 0.306 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.017 | TFLOPs: 29.40 | +7: iteration 16140/ 37905 | consumed samples: 4131840 | consumed tokens: 8462008320 | elapsed time per iteration (s): 0.22 | learning rate: 1.324E-04 | global batch size: 256 | lm loss: 3.784908E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.822 | TFLOPs: 29.04 | +7: iteration 16150/ 37905 | consumed samples: 4134400 | consumed tokens: 8467251200 | elapsed time per iteration (s): 0.22 | learning rate: 1.323E-04 | global batch size: 256 | lm loss: 3.791058E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.164 | TFLOPs: 29.40 | +7: iteration 16160/ 37905 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 0.22 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.776632E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.323 | TFLOPs: 29.41 | +7: iteration 16170/ 37905 | consumed samples: 4139520 | consumed tokens: 8477736960 | elapsed time per iteration (s): 0.22 | learning rate: 1.322E-04 | global batch size: 256 | lm loss: 3.776363E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.628 | TFLOPs: 29.41 | +7: iteration 16180/ 37905 | consumed samples: 4142080 | consumed tokens: 8482979840 | elapsed time per iteration (s): 0.22 | learning rate: 1.321E-04 | global batch size: 256 | lm loss: 3.795102E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.705 | TFLOPs: 29.42 | +7: iteration 16190/ 37905 | consumed samples: 4144640 | consumed tokens: 8488222720 | elapsed time per iteration (s): 0.22 | learning rate: 1.320E-04 | global batch size: 256 | lm loss: 3.756539E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.330 | TFLOPs: 29.41 | +7: iteration 16200/ 37905 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 0.22 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.780055E+00 | grad norm: 0.282 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.543 | TFLOPs: 29.41 | +7: iteration 16210/ 37905 | consumed samples: 4149760 | consumed tokens: 8498708480 | elapsed time per iteration (s): 0.22 | learning rate: 1.319E-04 | global batch size: 256 | lm loss: 3.784196E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.337 | TFLOPs: 29.41 | +7: iteration 16220/ 37905 | consumed samples: 4152320 | consumed tokens: 8503951360 | elapsed time per iteration (s): 0.22 | learning rate: 1.318E-04 | global batch size: 256 | lm loss: 3.775409E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.522 | TFLOPs: 29.44 | +7: iteration 16230/ 37905 | consumed samples: 4154880 | consumed tokens: 8509194240 | elapsed time per iteration (s): 0.22 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.779236E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.002 | TFLOPs: 29.42 | +7: iteration 16240/ 37905 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 0.22 | learning rate: 1.317E-04 | global batch size: 256 | lm loss: 3.786321E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.740 | TFLOPs: 29.37 | +7: iteration 16250/ 37905 | consumed samples: 4160000 | consumed tokens: 8519680000 | elapsed time per iteration (s): 0.22 | learning rate: 1.316E-04 | global batch size: 256 | lm loss: 3.777710E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.352 | TFLOPs: 29.36 | +7: iteration 16260/ 37905 | consumed samples: 4162560 | consumed tokens: 8524922880 | elapsed time per iteration (s): 0.22 | learning rate: 1.315E-04 | global batch size: 256 | lm loss: 3.788506E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.819 | TFLOPs: 29.34 | +7: iteration 16270/ 37905 | consumed samples: 4165120 | consumed tokens: 8530165760 | elapsed time per iteration (s): 0.22 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.783621E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.005 | TFLOPs: 29.32 | +7: iteration 16280/ 37905 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 0.22 | learning rate: 1.314E-04 | global batch size: 256 | lm loss: 3.785750E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.207 | TFLOPs: 29.33 | +7: iteration 16290/ 37905 | consumed samples: 4170240 | consumed tokens: 8540651520 | elapsed time per iteration (s): 0.22 | learning rate: 1.313E-04 | global batch size: 256 | lm loss: 3.779622E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.916 | TFLOPs: 29.35 | +7: iteration 16300/ 37905 | consumed samples: 4172800 | consumed tokens: 8545894400 | elapsed time per iteration (s): 0.22 | learning rate: 1.312E-04 | global batch size: 256 | lm loss: 3.784054E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.987 | TFLOPs: 29.32 | +7: iteration 16310/ 37905 | consumed samples: 4175360 | consumed tokens: 8551137280 | elapsed time per iteration (s): 0.22 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.783939E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.072 | TFLOPs: 29.32 | +7: iteration 16320/ 37905 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 0.22 | learning rate: 1.311E-04 | global batch size: 256 | lm loss: 3.777270E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.920 | TFLOPs: 29.32 | +7: iteration 16330/ 37905 | consumed samples: 4180480 | consumed tokens: 8561623040 | elapsed time per iteration (s): 0.22 | learning rate: 1.310E-04 | global batch size: 256 | lm loss: 3.785275E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.358 | TFLOPs: 29.33 | +7: iteration 16340/ 37905 | consumed samples: 4183040 | consumed tokens: 8566865920 | elapsed time per iteration (s): 0.22 | learning rate: 1.309E-04 | global batch size: 256 | lm loss: 3.778478E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.457 | TFLOPs: 29.33 | +7: iteration 16350/ 37905 | consumed samples: 4185600 | consumed tokens: 8572108800 | elapsed time per iteration (s): 0.22 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.784300E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.830 | TFLOPs: 29.34 | +7: iteration 16360/ 37905 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 0.22 | learning rate: 1.308E-04 | global batch size: 256 | lm loss: 3.778289E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.949 | TFLOPs: 29.32 | +7: iteration 16370/ 37905 | consumed samples: 4190720 | consumed tokens: 8582594560 | elapsed time per iteration (s): 0.22 | learning rate: 1.307E-04 | global batch size: 256 | lm loss: 3.776661E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.043 | TFLOPs: 29.32 | +7: iteration 16380/ 37905 | consumed samples: 4193280 | consumed tokens: 8587837440 | elapsed time per iteration (s): 0.22 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.788344E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.506 | TFLOPs: 29.31 | +7: iteration 16390/ 37905 | consumed samples: 4195840 | consumed tokens: 8593080320 | elapsed time per iteration (s): 0.22 | learning rate: 1.306E-04 | global batch size: 256 | lm loss: 3.784544E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.542 | TFLOPs: 29.34 | +7: iteration 16400/ 37905 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 0.22 | learning rate: 1.305E-04 | global batch size: 256 | lm loss: 3.769749E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.705 | TFLOPs: 29.31 | +7: iteration 16410/ 37905 | consumed samples: 4200960 | consumed tokens: 8603566080 | elapsed time per iteration (s): 0.22 | learning rate: 1.304E-04 | global batch size: 256 | lm loss: 3.762800E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.500 | TFLOPs: 29.33 | +7: iteration 16420/ 37905 | consumed samples: 4203520 | consumed tokens: 8608808960 | elapsed time per iteration (s): 0.22 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.783530E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.789 | TFLOPs: 29.32 | +7: iteration 16430/ 37905 | consumed samples: 4206080 | consumed tokens: 8614051840 | elapsed time per iteration (s): 0.22 | learning rate: 1.303E-04 | global batch size: 256 | lm loss: 3.768000E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.670 | TFLOPs: 29.34 | +7: iteration 16440/ 37905 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 0.22 | learning rate: 1.302E-04 | global batch size: 256 | lm loss: 3.784295E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.694 | TFLOPs: 29.34 | +7: iteration 16450/ 37905 | consumed samples: 4211200 | consumed tokens: 8624537600 | elapsed time per iteration (s): 0.22 | learning rate: 1.301E-04 | global batch size: 256 | lm loss: 3.774039E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.304 | TFLOPs: 29.33 | +7: iteration 16460/ 37905 | consumed samples: 4213760 | consumed tokens: 8629780480 | elapsed time per iteration (s): 0.22 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.783464E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.412 | TFLOPs: 29.33 | +7: iteration 16470/ 37905 | consumed samples: 4216320 | consumed tokens: 8635023360 | elapsed time per iteration (s): 0.22 | learning rate: 1.300E-04 | global batch size: 256 | lm loss: 3.784419E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.385 | TFLOPs: 29.33 | +7: iteration 16480/ 37905 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 0.22 | learning rate: 1.299E-04 | global batch size: 256 | lm loss: 3.772279E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.161 | TFLOPs: 29.33 | +7: iteration 16490/ 37905 | consumed samples: 4221440 | consumed tokens: 8645509120 | elapsed time per iteration (s): 0.22 | learning rate: 1.298E-04 | global batch size: 256 | lm loss: 3.782991E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.533 | TFLOPs: 29.34 | +7: iteration 16500/ 37905 | consumed samples: 4224000 | consumed tokens: 8650752000 | elapsed time per iteration (s): 0.22 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.776289E+00 | grad norm: 0.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.894 | TFLOPs: 29.34 | +7: iteration 16510/ 37905 | consumed samples: 4226560 | consumed tokens: 8655994880 | elapsed time per iteration (s): 0.22 | learning rate: 1.297E-04 | global batch size: 256 | lm loss: 3.779478E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.018 | TFLOPs: 29.35 | +7: iteration 16520/ 37905 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 0.22 | learning rate: 1.296E-04 | global batch size: 256 | lm loss: 3.787235E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.832 | TFLOPs: 29.34 | +7: iteration 16530/ 37905 | consumed samples: 4231680 | consumed tokens: 8666480640 | elapsed time per iteration (s): 0.22 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.774480E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.329 | TFLOPs: 29.36 | +7: iteration 16540/ 37905 | consumed samples: 4234240 | consumed tokens: 8671723520 | elapsed time per iteration (s): 0.22 | learning rate: 1.295E-04 | global batch size: 256 | lm loss: 3.776360E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.692 | TFLOPs: 29.34 | +7: iteration 16550/ 37905 | consumed samples: 4236800 | consumed tokens: 8676966400 | elapsed time per iteration (s): 0.22 | learning rate: 1.294E-04 | global batch size: 256 | lm loss: 3.776307E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.815 | TFLOPs: 29.34 | +7: iteration 16560/ 37905 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 0.22 | learning rate: 1.293E-04 | global batch size: 256 | lm loss: 3.780645E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.392 | TFLOPs: 29.33 | +7: iteration 16570/ 37905 | consumed samples: 4241920 | consumed tokens: 8687452160 | elapsed time per iteration (s): 0.22 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.765237E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.859 | TFLOPs: 29.34 | +7: iteration 16580/ 37905 | consumed samples: 4244480 | consumed tokens: 8692695040 | elapsed time per iteration (s): 0.22 | learning rate: 1.292E-04 | global batch size: 256 | lm loss: 3.762534E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.155 | TFLOPs: 29.30 | +7: iteration 16590/ 37905 | consumed samples: 4247040 | consumed tokens: 8697937920 | elapsed time per iteration (s): 0.22 | learning rate: 1.291E-04 | global batch size: 256 | lm loss: 3.779771E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.687 | TFLOPs: 29.34 | +7: iteration 16600/ 37905 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 0.22 | learning rate: 1.290E-04 | global batch size: 256 | lm loss: 3.769814E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.868 | TFLOPs: 29.32 | +7: iteration 16610/ 37905 | consumed samples: 4252160 | consumed tokens: 8708423680 | elapsed time per iteration (s): 0.22 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.777544E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.101 | TFLOPs: 29.35 | +7: iteration 16620/ 37905 | consumed samples: 4254720 | consumed tokens: 8713666560 | elapsed time per iteration (s): 0.22 | learning rate: 1.289E-04 | global batch size: 256 | lm loss: 3.772697E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.025 | TFLOPs: 29.32 | +7: iteration 16630/ 37905 | consumed samples: 4257280 | consumed tokens: 8718909440 | elapsed time per iteration (s): 0.22 | learning rate: 1.288E-04 | global batch size: 256 | lm loss: 3.770895E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.844 | TFLOPs: 29.34 | +7: iteration 16640/ 37905 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 0.22 | learning rate: 1.287E-04 | global batch size: 256 | lm loss: 3.771433E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.045 | TFLOPs: 29.35 | +7: iteration 16650/ 37905 | consumed samples: 4262400 | consumed tokens: 8729395200 | elapsed time per iteration (s): 0.22 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.772813E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.616 | TFLOPs: 29.31 | +7: iteration 16660/ 37905 | consumed samples: 4264960 | consumed tokens: 8734638080 | elapsed time per iteration (s): 0.22 | learning rate: 1.286E-04 | global batch size: 256 | lm loss: 3.791671E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.462 | TFLOPs: 29.36 | +7: iteration 16670/ 37905 | consumed samples: 4267520 | consumed tokens: 8739880960 | elapsed time per iteration (s): 0.22 | learning rate: 1.285E-04 | global batch size: 256 | lm loss: 3.763490E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.674 | TFLOPs: 29.34 | +7: iteration 16680/ 37905 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 0.22 | learning rate: 1.284E-04 | global batch size: 256 | lm loss: 3.772664E+00 | grad norm: 0.312 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.922 | TFLOPs: 29.32 | +7: iteration 16690/ 37905 | consumed samples: 4272640 | consumed tokens: 8750366720 | elapsed time per iteration (s): 0.22 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.764923E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.491 | TFLOPs: 29.33 | +7: iteration 16700/ 37905 | consumed samples: 4275200 | consumed tokens: 8755609600 | elapsed time per iteration (s): 0.22 | learning rate: 1.283E-04 | global batch size: 256 | lm loss: 3.774788E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.337 | TFLOPs: 29.33 | +7: iteration 16710/ 37905 | consumed samples: 4277760 | consumed tokens: 8760852480 | elapsed time per iteration (s): 0.22 | learning rate: 1.282E-04 | global batch size: 256 | lm loss: 3.793974E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.337 | TFLOPs: 29.36 | +7: iteration 16720/ 37905 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 0.22 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.769642E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.547 | TFLOPs: 29.34 | +7: iteration 16730/ 37905 | consumed samples: 4282880 | consumed tokens: 8771338240 | elapsed time per iteration (s): 0.22 | learning rate: 1.281E-04 | global batch size: 256 | lm loss: 3.791775E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.741 | TFLOPs: 29.29 | +7: iteration 16740/ 37905 | consumed samples: 4285440 | consumed tokens: 8776581120 | elapsed time per iteration (s): 0.22 | learning rate: 1.280E-04 | global batch size: 256 | lm loss: 3.789747E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.794 | TFLOPs: 29.29 | +7: iteration 16750/ 37905 | consumed samples: 4288000 | consumed tokens: 8781824000 | elapsed time per iteration (s): 0.22 | learning rate: 1.279E-04 | global batch size: 256 | lm loss: 3.775823E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.909 | TFLOPs: 29.32 | +7: iteration 16760/ 37905 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 0.22 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.771897E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.050 | TFLOPs: 29.32 | +7: iteration 16770/ 37905 | consumed samples: 4293120 | consumed tokens: 8792309760 | elapsed time per iteration (s): 0.22 | learning rate: 1.278E-04 | global batch size: 256 | lm loss: 3.759517E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.378 | TFLOPs: 29.31 | +7: iteration 16780/ 37905 | consumed samples: 4295680 | consumed tokens: 8797552640 | elapsed time per iteration (s): 0.22 | learning rate: 1.277E-04 | global batch size: 256 | lm loss: 3.778706E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.591 | TFLOPs: 29.34 | +7: iteration 16790/ 37905 | consumed samples: 4298240 | consumed tokens: 8802795520 | elapsed time per iteration (s): 0.22 | learning rate: 1.276E-04 | global batch size: 256 | lm loss: 3.777580E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.457 | TFLOPs: 29.33 | +7: iteration 16800/ 37905 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 0.22 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.770765E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.533 | TFLOPs: 29.34 | +7: iteration 16810/ 37905 | consumed samples: 4303360 | consumed tokens: 8813281280 | elapsed time per iteration (s): 0.22 | learning rate: 1.275E-04 | global batch size: 256 | lm loss: 3.762301E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.632 | TFLOPs: 29.34 | +7: iteration 16820/ 37905 | consumed samples: 4305920 | consumed tokens: 8818524160 | elapsed time per iteration (s): 0.22 | learning rate: 1.274E-04 | global batch size: 256 | lm loss: 3.780636E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.705 | TFLOPs: 29.34 | +7: iteration 16830/ 37905 | consumed samples: 4308480 | consumed tokens: 8823767040 | elapsed time per iteration (s): 0.22 | learning rate: 1.273E-04 | global batch size: 256 | lm loss: 3.778187E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.490 | TFLOPs: 29.33 | +7: iteration 16840/ 37905 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 0.22 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.766491E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.153 | TFLOPs: 29.33 | +7: iteration 16850/ 37905 | consumed samples: 4313600 | consumed tokens: 8834252800 | elapsed time per iteration (s): 0.22 | learning rate: 1.272E-04 | global batch size: 256 | lm loss: 3.783278E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.227 | TFLOPs: 29.33 | +7: iteration 16860/ 37905 | consumed samples: 4316160 | consumed tokens: 8839495680 | elapsed time per iteration (s): 0.22 | learning rate: 1.271E-04 | global batch size: 256 | lm loss: 3.780457E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.680 | TFLOPs: 29.31 | +7: iteration 16870/ 37905 | consumed samples: 4318720 | consumed tokens: 8844738560 | elapsed time per iteration (s): 0.22 | learning rate: 1.270E-04 | global batch size: 256 | lm loss: 3.781445E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.001 | TFLOPs: 29.32 | +7: iteration 16880/ 37905 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 0.22 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.776386E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.748 | TFLOPs: 29.34 | +7: iteration 16890/ 37905 | consumed samples: 4323840 | consumed tokens: 8855224320 | elapsed time per iteration (s): 0.22 | learning rate: 1.269E-04 | global batch size: 256 | lm loss: 3.771579E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.159 | TFLOPs: 29.33 | +7: iteration 16900/ 37905 | consumed samples: 4326400 | consumed tokens: 8860467200 | elapsed time per iteration (s): 0.22 | learning rate: 1.268E-04 | global batch size: 256 | lm loss: 3.782846E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.925 | TFLOPs: 29.32 | +7: iteration 16910/ 37905 | consumed samples: 4328960 | consumed tokens: 8865710080 | elapsed time per iteration (s): 0.22 | learning rate: 1.267E-04 | global batch size: 256 | lm loss: 3.772466E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.973 | TFLOPs: 29.32 | +7: iteration 16920/ 37905 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 0.22 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.772389E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.499 | TFLOPs: 29.31 | +7: iteration 16930/ 37905 | consumed samples: 4334080 | consumed tokens: 8876195840 | elapsed time per iteration (s): 0.22 | learning rate: 1.266E-04 | global batch size: 256 | lm loss: 3.772762E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.461 | TFLOPs: 29.31 | +7: iteration 16940/ 37905 | consumed samples: 4336640 | consumed tokens: 8881438720 | elapsed time per iteration (s): 0.22 | learning rate: 1.265E-04 | global batch size: 256 | lm loss: 3.768386E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.121 | TFLOPs: 29.30 | +7: iteration 16950/ 37905 | consumed samples: 4339200 | consumed tokens: 8886681600 | elapsed time per iteration (s): 0.22 | learning rate: 1.264E-04 | global batch size: 256 | lm loss: 3.766605E+00 | grad norm: 0.301 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.455 | TFLOPs: 29.33 | +7: iteration 16960/ 37905 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 0.22 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.764227E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.364 | TFLOPs: 29.25 | +7: iteration 16970/ 37905 | consumed samples: 4344320 | consumed tokens: 8897167360 | elapsed time per iteration (s): 0.22 | learning rate: 1.263E-04 | global batch size: 256 | lm loss: 3.779761E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.568 | TFLOPs: 29.36 | +7: iteration 16980/ 37905 | consumed samples: 4346880 | consumed tokens: 8902410240 | elapsed time per iteration (s): 0.22 | learning rate: 1.262E-04 | global batch size: 256 | lm loss: 3.781016E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.987 | TFLOPs: 29.37 | +7: iteration 16990/ 37905 | consumed samples: 4349440 | consumed tokens: 8907653120 | elapsed time per iteration (s): 0.22 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.772689E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.873 | TFLOPs: 29.17 | +7: iteration 17000/ 37905 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 0.24 | learning rate: 1.261E-04 | global batch size: 256 | lm loss: 3.776598E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1061.765 | TFLOPs: 27.05 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 17000 | lm loss value: 3.719566E+00 | lm loss PPL: 4.124649E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 17000 to checkpoints_83m20b400m +0: [2023-03-15 23:00:44,309] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step17000 is begin to save! +0: [2023-03-15 23:00:44,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:00:44,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:00:44,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:00:44,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:00:44,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:00:44,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:00:44,432] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:00:44,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:00:44,443] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:00:44,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:00:44,454] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:00:44,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:00:44,465] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:00:44,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:00:44,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:00:44,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:00:44,487] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:00:44,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:00:44,498] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:00:44,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:00:44,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:00:44,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:00:44,679] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:00:44,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:00:44,682] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step17000/mp_rank_00_model_states.pt +0: [2023-03-15 23:00:44,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:00:44,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:00:44,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +5: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +1: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +3: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +6: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +2: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:00:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +7: [2023-03-15 23:00:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +2: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: [2023-03-15 23:00:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +4: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:00:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,707] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:00:44,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: [2023-03-15 23:00:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +7: time (ms) | save-checkpoint: 463.07 +0: [2023-03-15 23:00:44,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:00:44,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step17000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:00:44,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step17000 is ready now! +0: successfully saved checkpoint at iteration 17000 to checkpoints_83m20b400m +7: iteration 17010/ 37905 | consumed samples: 4354560 | consumed tokens: 8918138880 | elapsed time per iteration (s): 0.28 | learning rate: 1.260E-04 | global batch size: 256 | lm loss: 3.787906E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 925.287 | TFLOPs: 23.57 | +7: iteration 17020/ 37905 | consumed samples: 4357120 | consumed tokens: 8923381760 | elapsed time per iteration (s): 0.24 | learning rate: 1.259E-04 | global batch size: 256 | lm loss: 3.768674E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1064.299 | TFLOPs: 27.11 | +7: iteration 17030/ 37905 | consumed samples: 4359680 | consumed tokens: 8928624640 | elapsed time per iteration (s): 0.23 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.763168E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.250 | TFLOPs: 28.67 | +7: iteration 17040/ 37905 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 0.23 | learning rate: 1.258E-04 | global batch size: 256 | lm loss: 3.772137E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1095.544 | TFLOPs: 27.91 | +7: iteration 17050/ 37905 | consumed samples: 4364800 | consumed tokens: 8939110400 | elapsed time per iteration (s): 0.22 | learning rate: 1.257E-04 | global batch size: 256 | lm loss: 3.768380E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.847 | TFLOPs: 29.39 | +7: iteration 17060/ 37905 | consumed samples: 4367360 | consumed tokens: 8944353280 | elapsed time per iteration (s): 0.22 | learning rate: 1.256E-04 | global batch size: 256 | lm loss: 3.771507E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.630 | TFLOPs: 29.11 | +7: iteration 17070/ 37905 | consumed samples: 4369920 | consumed tokens: 8949596160 | elapsed time per iteration (s): 0.23 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.784178E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.918 | TFLOPs: 28.73 | +7: iteration 17080/ 37905 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 0.23 | learning rate: 1.255E-04 | global batch size: 256 | lm loss: 3.765434E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.885 | TFLOPs: 28.48 | +7: iteration 17090/ 37905 | consumed samples: 4375040 | consumed tokens: 8960081920 | elapsed time per iteration (s): 0.22 | learning rate: 1.254E-04 | global batch size: 256 | lm loss: 3.780433E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.302 | TFLOPs: 29.00 | +7: iteration 17100/ 37905 | consumed samples: 4377600 | consumed tokens: 8965324800 | elapsed time per iteration (s): 0.22 | learning rate: 1.253E-04 | global batch size: 256 | lm loss: 3.784801E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.877 | TFLOPs: 29.22 | +7: iteration 17110/ 37905 | consumed samples: 4380160 | consumed tokens: 8970567680 | elapsed time per iteration (s): 0.22 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.771439E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.609 | TFLOPs: 29.06 | +7: iteration 17120/ 37905 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 0.23 | learning rate: 1.252E-04 | global batch size: 256 | lm loss: 3.771237E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.304 | TFLOPs: 28.92 | +7: iteration 17130/ 37905 | consumed samples: 4385280 | consumed tokens: 8981053440 | elapsed time per iteration (s): 0.24 | learning rate: 1.251E-04 | global batch size: 256 | lm loss: 3.769154E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1078.964 | TFLOPs: 27.49 | +7: iteration 17140/ 37905 | consumed samples: 4387840 | consumed tokens: 8986296320 | elapsed time per iteration (s): 0.23 | learning rate: 1.250E-04 | global batch size: 256 | lm loss: 3.780125E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.558 | TFLOPs: 28.98 | +7: iteration 17150/ 37905 | consumed samples: 4390400 | consumed tokens: 8991539200 | elapsed time per iteration (s): 0.22 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.775251E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.986 | TFLOPs: 29.22 | +7: iteration 17160/ 37905 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 0.23 | learning rate: 1.249E-04 | global batch size: 256 | lm loss: 3.769779E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.688 | TFLOPs: 28.47 | +7: iteration 17170/ 37905 | consumed samples: 4395520 | consumed tokens: 9002024960 | elapsed time per iteration (s): 0.22 | learning rate: 1.248E-04 | global batch size: 256 | lm loss: 3.773706E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.723 | TFLOPs: 29.14 | +7: iteration 17180/ 37905 | consumed samples: 4398080 | consumed tokens: 9007267840 | elapsed time per iteration (s): 0.23 | learning rate: 1.247E-04 | global batch size: 256 | lm loss: 3.756127E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.089 | TFLOPs: 28.81 | +7: iteration 17190/ 37905 | consumed samples: 4400640 | consumed tokens: 9012510720 | elapsed time per iteration (s): 0.22 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.775979E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.065 | TFLOPs: 29.30 | +7: iteration 17200/ 37905 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 0.23 | learning rate: 1.246E-04 | global batch size: 256 | lm loss: 3.786675E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.930 | TFLOPs: 28.79 | +7: iteration 17210/ 37905 | consumed samples: 4405760 | consumed tokens: 9022996480 | elapsed time per iteration (s): 0.22 | learning rate: 1.245E-04 | global batch size: 256 | lm loss: 3.775363E+00 | grad norm: 0.293 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.329 | TFLOPs: 29.15 | +7: iteration 17220/ 37905 | consumed samples: 4408320 | consumed tokens: 9028239360 | elapsed time per iteration (s): 0.22 | learning rate: 1.244E-04 | global batch size: 256 | lm loss: 3.784481E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.764 | TFLOPs: 29.19 | +7: iteration 17230/ 37905 | consumed samples: 4410880 | consumed tokens: 9033482240 | elapsed time per iteration (s): 0.22 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.764858E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.792 | TFLOPs: 29.39 | +7: iteration 17240/ 37905 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 0.23 | learning rate: 1.243E-04 | global batch size: 256 | lm loss: 3.769724E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.204 | TFLOPs: 28.28 | +7: iteration 17250/ 37905 | consumed samples: 4416000 | consumed tokens: 9043968000 | elapsed time per iteration (s): 0.23 | learning rate: 1.242E-04 | global batch size: 256 | lm loss: 3.762782E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.632 | TFLOPs: 28.42 | +7: iteration 17260/ 37905 | consumed samples: 4418560 | consumed tokens: 9049210880 | elapsed time per iteration (s): 0.28 | learning rate: 1.241E-04 | global batch size: 256 | lm loss: 3.781968E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 928.530 | TFLOPs: 23.65 | +7: iteration 17270/ 37905 | consumed samples: 4421120 | consumed tokens: 9054453760 | elapsed time per iteration (s): 0.28 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.769518E+00 | grad norm: 0.303 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 901.316 | TFLOPs: 22.96 | +7: iteration 17280/ 37905 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 0.24 | learning rate: 1.240E-04 | global batch size: 256 | lm loss: 3.779796E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1071.210 | TFLOPs: 27.29 | +7: iteration 17290/ 37905 | consumed samples: 4426240 | consumed tokens: 9064939520 | elapsed time per iteration (s): 0.22 | learning rate: 1.239E-04 | global batch size: 256 | lm loss: 3.764715E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.041 | TFLOPs: 29.40 | +7: iteration 17300/ 37905 | consumed samples: 4428800 | consumed tokens: 9070182400 | elapsed time per iteration (s): 0.23 | learning rate: 1.238E-04 | global batch size: 256 | lm loss: 3.773483E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.331 | TFLOPs: 28.92 | +7: iteration 17310/ 37905 | consumed samples: 4431360 | consumed tokens: 9075425280 | elapsed time per iteration (s): 0.22 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.749960E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.554 | TFLOPs: 29.36 | +7: iteration 17320/ 37905 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 0.24 | learning rate: 1.237E-04 | global batch size: 256 | lm loss: 3.765978E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1087.809 | TFLOPs: 27.71 | +7: iteration 17330/ 37905 | consumed samples: 4436480 | consumed tokens: 9085911040 | elapsed time per iteration (s): 0.26 | learning rate: 1.236E-04 | global batch size: 256 | lm loss: 3.779155E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 985.013 | TFLOPs: 25.09 | +7: iteration 17340/ 37905 | consumed samples: 4439040 | consumed tokens: 9091153920 | elapsed time per iteration (s): 0.23 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.762817E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.503 | TFLOPs: 28.52 | +7: iteration 17350/ 37905 | consumed samples: 4441600 | consumed tokens: 9096396800 | elapsed time per iteration (s): 0.33 | learning rate: 1.235E-04 | global batch size: 256 | lm loss: 3.776454E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 777.886 | TFLOPs: 19.82 | +7: iteration 17360/ 37905 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 0.22 | learning rate: 1.234E-04 | global batch size: 256 | lm loss: 3.778949E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.287 | TFLOPs: 29.00 | +7: iteration 17370/ 37905 | consumed samples: 4446720 | consumed tokens: 9106882560 | elapsed time per iteration (s): 0.23 | learning rate: 1.233E-04 | global batch size: 256 | lm loss: 3.765626E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.659 | TFLOPs: 28.70 | +7: iteration 17380/ 37905 | consumed samples: 4449280 | consumed tokens: 9112125440 | elapsed time per iteration (s): 0.24 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.772114E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1062.262 | TFLOPs: 27.06 | +7: iteration 17390/ 37905 | consumed samples: 4451840 | consumed tokens: 9117368320 | elapsed time per iteration (s): 0.30 | learning rate: 1.232E-04 | global batch size: 256 | lm loss: 3.781119E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 857.605 | TFLOPs: 21.85 | +7: iteration 17400/ 37905 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 0.23 | learning rate: 1.231E-04 | global batch size: 256 | lm loss: 3.784386E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.458 | TFLOPs: 28.88 | +7: iteration 17410/ 37905 | consumed samples: 4456960 | consumed tokens: 9127854080 | elapsed time per iteration (s): 0.23 | learning rate: 1.230E-04 | global batch size: 256 | lm loss: 3.758797E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.596 | TFLOPs: 28.75 | +7: iteration 17420/ 37905 | consumed samples: 4459520 | consumed tokens: 9133096960 | elapsed time per iteration (s): 0.24 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.766825E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1089.238 | TFLOPs: 27.75 | +7: iteration 17430/ 37905 | consumed samples: 4462080 | consumed tokens: 9138339840 | elapsed time per iteration (s): 0.23 | learning rate: 1.229E-04 | global batch size: 256 | lm loss: 3.765899E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.717 | TFLOPs: 28.91 | +7: iteration 17440/ 37905 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 0.23 | learning rate: 1.228E-04 | global batch size: 256 | lm loss: 3.779485E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.022 | TFLOPs: 28.58 | +7: iteration 17450/ 37905 | consumed samples: 4467200 | consumed tokens: 9148825600 | elapsed time per iteration (s): 0.23 | learning rate: 1.227E-04 | global batch size: 256 | lm loss: 3.777048E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.054 | TFLOPs: 28.18 | +7: iteration 17460/ 37905 | consumed samples: 4469760 | consumed tokens: 9154068480 | elapsed time per iteration (s): 0.23 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.776509E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.949 | TFLOPs: 28.79 | +7: iteration 17470/ 37905 | consumed samples: 4472320 | consumed tokens: 9159311360 | elapsed time per iteration (s): 0.22 | learning rate: 1.226E-04 | global batch size: 256 | lm loss: 3.778865E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.202 | TFLOPs: 29.02 | +7: iteration 17480/ 37905 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 0.22 | learning rate: 1.225E-04 | global batch size: 256 | lm loss: 3.781675E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.983 | TFLOPs: 29.12 | +7: iteration 17490/ 37905 | consumed samples: 4477440 | consumed tokens: 9169797120 | elapsed time per iteration (s): 0.22 | learning rate: 1.224E-04 | global batch size: 256 | lm loss: 3.761768E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.058 | TFLOPs: 29.02 | +7: iteration 17500/ 37905 | consumed samples: 4480000 | consumed tokens: 9175040000 | elapsed time per iteration (s): 0.22 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.774889E+00 | grad norm: 0.295 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.273 | TFLOPs: 29.35 | +7: iteration 17510/ 37905 | consumed samples: 4482560 | consumed tokens: 9180282880 | elapsed time per iteration (s): 0.23 | learning rate: 1.223E-04 | global batch size: 256 | lm loss: 3.773074E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.723 | TFLOPs: 28.91 | +7: iteration 17520/ 37905 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 0.23 | learning rate: 1.222E-04 | global batch size: 256 | lm loss: 3.780256E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.061 | TFLOPs: 28.81 | +7: iteration 17530/ 37905 | consumed samples: 4487680 | consumed tokens: 9190768640 | elapsed time per iteration (s): 0.22 | learning rate: 1.221E-04 | global batch size: 256 | lm loss: 3.773085E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.860 | TFLOPs: 29.37 | +7: iteration 17540/ 37905 | consumed samples: 4490240 | consumed tokens: 9196011520 | elapsed time per iteration (s): 0.23 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.766047E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.154 | TFLOPs: 28.49 | +7: iteration 17550/ 37905 | consumed samples: 4492800 | consumed tokens: 9201254400 | elapsed time per iteration (s): 0.23 | learning rate: 1.220E-04 | global batch size: 256 | lm loss: 3.767713E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.862 | TFLOPs: 28.50 | +7: iteration 17560/ 37905 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 0.23 | learning rate: 1.219E-04 | global batch size: 256 | lm loss: 3.766973E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.546 | TFLOPs: 28.55 | +7: iteration 17570/ 37905 | consumed samples: 4497920 | consumed tokens: 9211740160 | elapsed time per iteration (s): 0.22 | learning rate: 1.218E-04 | global batch size: 256 | lm loss: 3.755187E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.564 | TFLOPs: 29.18 | +7: iteration 17580/ 37905 | consumed samples: 4500480 | consumed tokens: 9216983040 | elapsed time per iteration (s): 0.22 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.770407E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.649 | TFLOPs: 29.11 | +7: iteration 17590/ 37905 | consumed samples: 4503040 | consumed tokens: 9222225920 | elapsed time per iteration (s): 0.22 | learning rate: 1.217E-04 | global batch size: 256 | lm loss: 3.766722E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.839 | TFLOPs: 29.09 | +7: iteration 17600/ 37905 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 0.22 | learning rate: 1.216E-04 | global batch size: 256 | lm loss: 3.763847E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.412 | TFLOPs: 29.13 | +7: iteration 17610/ 37905 | consumed samples: 4508160 | consumed tokens: 9232711680 | elapsed time per iteration (s): 0.23 | learning rate: 1.215E-04 | global batch size: 256 | lm loss: 3.761896E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.554 | TFLOPs: 28.88 | +7: iteration 17620/ 37905 | consumed samples: 4510720 | consumed tokens: 9237954560 | elapsed time per iteration (s): 0.23 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.775016E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.186 | TFLOPs: 28.84 | +7: iteration 17630/ 37905 | consumed samples: 4513280 | consumed tokens: 9243197440 | elapsed time per iteration (s): 0.22 | learning rate: 1.214E-04 | global batch size: 256 | lm loss: 3.771635E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.238 | TFLOPs: 29.02 | +7: iteration 17640/ 37905 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 0.23 | learning rate: 1.213E-04 | global batch size: 256 | lm loss: 3.775840E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.134 | TFLOPs: 28.66 | +7: iteration 17650/ 37905 | consumed samples: 4518400 | consumed tokens: 9253683200 | elapsed time per iteration (s): 0.23 | learning rate: 1.212E-04 | global batch size: 256 | lm loss: 3.772370E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.279 | TFLOPs: 28.84 | +7: iteration 17660/ 37905 | consumed samples: 4520960 | consumed tokens: 9258926080 | elapsed time per iteration (s): 0.23 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.751948E+00 | grad norm: 0.296 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.602 | TFLOPs: 28.88 | +7: iteration 17670/ 37905 | consumed samples: 4523520 | consumed tokens: 9264168960 | elapsed time per iteration (s): 0.22 | learning rate: 1.211E-04 | global batch size: 256 | lm loss: 3.775309E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.611 | TFLOPs: 29.36 | +7: iteration 17680/ 37905 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 0.22 | learning rate: 1.210E-04 | global batch size: 256 | lm loss: 3.765756E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.019 | TFLOPs: 29.40 | +7: iteration 17690/ 37905 | consumed samples: 4528640 | consumed tokens: 9274654720 | elapsed time per iteration (s): 0.23 | learning rate: 1.209E-04 | global batch size: 256 | lm loss: 3.768683E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.916 | TFLOPs: 28.20 | +7: iteration 17700/ 37905 | consumed samples: 4531200 | consumed tokens: 9279897600 | elapsed time per iteration (s): 0.22 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.770181E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.522 | TFLOPs: 29.00 | +7: iteration 17710/ 37905 | consumed samples: 4533760 | consumed tokens: 9285140480 | elapsed time per iteration (s): 0.23 | learning rate: 1.208E-04 | global batch size: 256 | lm loss: 3.766444E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.251 | TFLOPs: 28.72 | +7: iteration 17720/ 37905 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 0.23 | learning rate: 1.207E-04 | global batch size: 256 | lm loss: 3.765236E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.908 | TFLOPs: 28.48 | +7: iteration 17730/ 37905 | consumed samples: 4538880 | consumed tokens: 9295626240 | elapsed time per iteration (s): 0.23 | learning rate: 1.206E-04 | global batch size: 256 | lm loss: 3.762123E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.030 | TFLOPs: 28.86 | +7: iteration 17740/ 37905 | consumed samples: 4541440 | consumed tokens: 9300869120 | elapsed time per iteration (s): 0.22 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.769584E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.313 | TFLOPs: 29.00 | +7: iteration 17750/ 37905 | consumed samples: 4544000 | consumed tokens: 9306112000 | elapsed time per iteration (s): 0.22 | learning rate: 1.205E-04 | global batch size: 256 | lm loss: 3.757525E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.579 | TFLOPs: 29.21 | +7: iteration 17760/ 37905 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 0.23 | learning rate: 1.204E-04 | global batch size: 256 | lm loss: 3.762259E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.816 | TFLOPs: 28.96 | +7: iteration 17770/ 37905 | consumed samples: 4549120 | consumed tokens: 9316597760 | elapsed time per iteration (s): 0.23 | learning rate: 1.203E-04 | global batch size: 256 | lm loss: 3.756992E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.966 | TFLOPs: 28.58 | +7: iteration 17780/ 37905 | consumed samples: 4551680 | consumed tokens: 9321840640 | elapsed time per iteration (s): 0.23 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.776912E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.319 | TFLOPs: 28.36 | +7: iteration 17790/ 37905 | consumed samples: 4554240 | consumed tokens: 9327083520 | elapsed time per iteration (s): 0.22 | learning rate: 1.202E-04 | global batch size: 256 | lm loss: 3.769552E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.783 | TFLOPs: 29.09 | +7: iteration 17800/ 37905 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 0.22 | learning rate: 1.201E-04 | global batch size: 256 | lm loss: 3.744867E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.766 | TFLOPs: 29.16 | +7: iteration 17810/ 37905 | consumed samples: 4559360 | consumed tokens: 9337569280 | elapsed time per iteration (s): 0.23 | learning rate: 1.200E-04 | global batch size: 256 | lm loss: 3.763924E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.790 | TFLOPs: 28.48 | +7: iteration 17820/ 37905 | consumed samples: 4561920 | consumed tokens: 9342812160 | elapsed time per iteration (s): 0.23 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.765744E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.983 | TFLOPs: 28.91 | +7: iteration 17830/ 37905 | consumed samples: 4564480 | consumed tokens: 9348055040 | elapsed time per iteration (s): 0.23 | learning rate: 1.199E-04 | global batch size: 256 | lm loss: 3.755225E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.556 | TFLOPs: 28.52 | +7: iteration 17840/ 37905 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 0.23 | learning rate: 1.198E-04 | global batch size: 256 | lm loss: 3.764878E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.823 | TFLOPs: 28.88 | +7: iteration 17850/ 37905 | consumed samples: 4569600 | consumed tokens: 9358540800 | elapsed time per iteration (s): 0.23 | learning rate: 1.197E-04 | global batch size: 256 | lm loss: 3.760810E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.014 | TFLOPs: 28.15 | +7: iteration 17860/ 37905 | consumed samples: 4572160 | consumed tokens: 9363783680 | elapsed time per iteration (s): 0.22 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.764571E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.101 | TFLOPs: 29.12 | +7: iteration 17870/ 37905 | consumed samples: 4574720 | consumed tokens: 9369026560 | elapsed time per iteration (s): 0.22 | learning rate: 1.196E-04 | global batch size: 256 | lm loss: 3.765892E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.958 | TFLOPs: 29.24 | +7: iteration 17880/ 37905 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 0.22 | learning rate: 1.195E-04 | global batch size: 256 | lm loss: 3.768586E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.038 | TFLOPs: 29.09 | +7: iteration 17890/ 37905 | consumed samples: 4579840 | consumed tokens: 9379512320 | elapsed time per iteration (s): 0.23 | learning rate: 1.194E-04 | global batch size: 256 | lm loss: 3.761084E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.281 | TFLOPs: 28.56 | +7: iteration 17900/ 37905 | consumed samples: 4582400 | consumed tokens: 9384755200 | elapsed time per iteration (s): 0.22 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.775751E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.701 | TFLOPs: 29.39 | +7: iteration 17910/ 37905 | consumed samples: 4584960 | consumed tokens: 9389998080 | elapsed time per iteration (s): 0.22 | learning rate: 1.193E-04 | global batch size: 256 | lm loss: 3.753921E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.211 | TFLOPs: 29.35 | +7: iteration 17920/ 37905 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 0.23 | learning rate: 1.192E-04 | global batch size: 256 | lm loss: 3.767324E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.165 | TFLOPs: 28.66 | +7: iteration 17930/ 37905 | consumed samples: 4590080 | consumed tokens: 9400483840 | elapsed time per iteration (s): 0.24 | learning rate: 1.191E-04 | global batch size: 256 | lm loss: 3.767619E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1079.954 | TFLOPs: 27.51 | +7: iteration 17940/ 37905 | consumed samples: 4592640 | consumed tokens: 9405726720 | elapsed time per iteration (s): 0.23 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.762901E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.695 | TFLOPs: 28.68 | +7: iteration 17950/ 37905 | consumed samples: 4595200 | consumed tokens: 9410969600 | elapsed time per iteration (s): 0.23 | learning rate: 1.190E-04 | global batch size: 256 | lm loss: 3.762288E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.636 | TFLOPs: 28.85 | +7: iteration 17960/ 37905 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 0.23 | learning rate: 1.189E-04 | global batch size: 256 | lm loss: 3.772145E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.354 | TFLOPs: 28.31 | +7: iteration 17970/ 37905 | consumed samples: 4600320 | consumed tokens: 9421455360 | elapsed time per iteration (s): 0.22 | learning rate: 1.188E-04 | global batch size: 256 | lm loss: 3.768910E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.987 | TFLOPs: 29.07 | +7: iteration 17980/ 37905 | consumed samples: 4602880 | consumed tokens: 9426698240 | elapsed time per iteration (s): 0.23 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.762991E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.029 | TFLOPs: 28.43 | +7: iteration 17990/ 37905 | consumed samples: 4605440 | consumed tokens: 9431941120 | elapsed time per iteration (s): 0.23 | learning rate: 1.187E-04 | global batch size: 256 | lm loss: 3.764003E+00 | grad norm: 0.290 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.449 | TFLOPs: 28.82 | +0: [2023-03-15 23:04:34,634] [INFO] [logging.py:68:log_dist] [Rank 0] step=18000, skipped=0, lr=[0.00011859190008696891, 0.00011859190008696891, 0.00011859190008696891], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 18000/ 37905 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 0.22 | learning rate: 1.186E-04 | global batch size: 256 | lm loss: 3.761556E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.355 | TFLOPs: 29.08 | +0: steps: 18000 loss: 3.7579 iter time (s): 0.225 samples/sec: 1140.179 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 18000 | lm loss value: 3.728833E+00 | lm loss PPL: 4.163050E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 18000 to checkpoints_83m20b400m +0: [2023-03-15 23:04:34,783] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step18000 is begin to save! +0: [2023-03-15 23:04:34,789] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:04:34,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:04:34,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:04:34,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:04:34,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:04:34,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:04:34,971] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:04:34,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:04:34,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:04:34,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:04:34,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:04:35,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:04:35,005] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:04:35,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:04:35,016] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:04:35,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:04:35,027] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:04:35,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:04:35,039] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:04:35,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:04:35,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:04:35,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:04:35,061] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:04:35,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:04:35,063] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step18000/mp_rank_00_model_states.pt +0: [2023-03-15 23:04:35,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:04:35,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:04:35,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:04:35,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +2: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +5: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +6: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +3: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +3: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +7: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:04:35,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:04:35,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step18000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +1: [2023-03-15 23:04:35,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step18000 is ready now! +0: successfully saved checkpoint at iteration 18000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 396.72 +7: iteration 18010/ 37905 | consumed samples: 4610560 | consumed tokens: 9442426880 | elapsed time per iteration (s): 0.27 | learning rate: 1.185E-04 | global batch size: 256 | lm loss: 3.765107E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 942.138 | TFLOPs: 24.00 | +7: iteration 18020/ 37905 | consumed samples: 4613120 | consumed tokens: 9447669760 | elapsed time per iteration (s): 0.23 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.765244E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.256 | TFLOPs: 28.90 | +7: iteration 18030/ 37905 | consumed samples: 4615680 | consumed tokens: 9452912640 | elapsed time per iteration (s): 0.22 | learning rate: 1.184E-04 | global batch size: 256 | lm loss: 3.765025E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.535 | TFLOPs: 29.16 | +7: iteration 18040/ 37905 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 0.22 | learning rate: 1.183E-04 | global batch size: 256 | lm loss: 3.770796E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.227 | TFLOPs: 29.02 | +7: iteration 18050/ 37905 | consumed samples: 4620800 | consumed tokens: 9463398400 | elapsed time per iteration (s): 0.23 | learning rate: 1.182E-04 | global batch size: 256 | lm loss: 3.768923E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.756 | TFLOPs: 28.81 | +7: iteration 18060/ 37905 | consumed samples: 4623360 | consumed tokens: 9468641280 | elapsed time per iteration (s): 0.22 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.768971E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.628 | TFLOPs: 29.03 | +7: iteration 18070/ 37905 | consumed samples: 4625920 | consumed tokens: 9473884160 | elapsed time per iteration (s): 0.22 | learning rate: 1.181E-04 | global batch size: 256 | lm loss: 3.760654E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.088 | TFLOPs: 28.99 | +7: iteration 18080/ 37905 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 0.23 | learning rate: 1.180E-04 | global batch size: 256 | lm loss: 3.764680E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.157 | TFLOPs: 28.23 | +7: iteration 18090/ 37905 | consumed samples: 4631040 | consumed tokens: 9484369920 | elapsed time per iteration (s): 0.22 | learning rate: 1.179E-04 | global batch size: 256 | lm loss: 3.760880E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.747 | TFLOPs: 29.24 | +7: iteration 18100/ 37905 | consumed samples: 4633600 | consumed tokens: 9489612800 | elapsed time per iteration (s): 0.23 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.767575E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.112 | TFLOPs: 28.46 | +7: iteration 18110/ 37905 | consumed samples: 4636160 | consumed tokens: 9494855680 | elapsed time per iteration (s): 0.23 | learning rate: 1.178E-04 | global batch size: 256 | lm loss: 3.758040E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.403 | TFLOPs: 28.57 | +7: iteration 18120/ 37905 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 0.23 | learning rate: 1.177E-04 | global batch size: 256 | lm loss: 3.760586E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1101.487 | TFLOPs: 28.06 | +7: iteration 18130/ 37905 | consumed samples: 4641280 | consumed tokens: 9505341440 | elapsed time per iteration (s): 0.23 | learning rate: 1.176E-04 | global batch size: 256 | lm loss: 3.749055E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.816 | TFLOPs: 28.83 | +7: iteration 18140/ 37905 | consumed samples: 4643840 | consumed tokens: 9510584320 | elapsed time per iteration (s): 0.23 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.760965E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.914 | TFLOPs: 28.66 | +7: iteration 18150/ 37905 | consumed samples: 4646400 | consumed tokens: 9515827200 | elapsed time per iteration (s): 0.23 | learning rate: 1.175E-04 | global batch size: 256 | lm loss: 3.771667E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.732 | TFLOPs: 28.47 | +7: iteration 18160/ 37905 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 0.23 | learning rate: 1.174E-04 | global batch size: 256 | lm loss: 3.771758E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.669 | TFLOPs: 28.55 | +7: iteration 18170/ 37905 | consumed samples: 4651520 | consumed tokens: 9526312960 | elapsed time per iteration (s): 0.23 | learning rate: 1.173E-04 | global batch size: 256 | lm loss: 3.758328E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.235 | TFLOPs: 28.84 | +7: iteration 18180/ 37905 | consumed samples: 4654080 | consumed tokens: 9531555840 | elapsed time per iteration (s): 0.23 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.759385E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.744 | TFLOPs: 28.98 | +7: iteration 18190/ 37905 | consumed samples: 4656640 | consumed tokens: 9536798720 | elapsed time per iteration (s): 0.23 | learning rate: 1.172E-04 | global batch size: 256 | lm loss: 3.772729E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.278 | TFLOPs: 28.84 | +7: iteration 18200/ 37905 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 0.22 | learning rate: 1.171E-04 | global batch size: 256 | lm loss: 3.768037E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.331 | TFLOPs: 29.36 | +7: iteration 18210/ 37905 | consumed samples: 4661760 | consumed tokens: 9547284480 | elapsed time per iteration (s): 0.23 | learning rate: 1.170E-04 | global batch size: 256 | lm loss: 3.772486E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.837 | TFLOPs: 28.48 | +7: iteration 18220/ 37905 | consumed samples: 4664320 | consumed tokens: 9552527360 | elapsed time per iteration (s): 0.23 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.775843E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1093.349 | TFLOPs: 27.85 | +7: iteration 18230/ 37905 | consumed samples: 4666880 | consumed tokens: 9557770240 | elapsed time per iteration (s): 0.23 | learning rate: 1.169E-04 | global batch size: 256 | lm loss: 3.757338E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.037 | TFLOPs: 28.89 | +7: iteration 18240/ 37905 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 0.23 | learning rate: 1.168E-04 | global batch size: 256 | lm loss: 3.775457E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.700 | TFLOPs: 28.32 | +7: iteration 18250/ 37905 | consumed samples: 4672000 | consumed tokens: 9568256000 | elapsed time per iteration (s): 0.23 | learning rate: 1.167E-04 | global batch size: 256 | lm loss: 3.751171E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.919 | TFLOPs: 28.68 | +7: iteration 18260/ 37905 | consumed samples: 4674560 | consumed tokens: 9573498880 | elapsed time per iteration (s): 0.23 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.768625E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.902 | TFLOPs: 28.73 | +7: iteration 18270/ 37905 | consumed samples: 4677120 | consumed tokens: 9578741760 | elapsed time per iteration (s): 0.23 | learning rate: 1.166E-04 | global batch size: 256 | lm loss: 3.770406E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.103 | TFLOPs: 28.51 | +7: iteration 18280/ 37905 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 0.23 | learning rate: 1.165E-04 | global batch size: 256 | lm loss: 3.754250E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.374 | TFLOPs: 28.62 | +7: iteration 18290/ 37905 | consumed samples: 4682240 | consumed tokens: 9589227520 | elapsed time per iteration (s): 0.23 | learning rate: 1.164E-04 | global batch size: 256 | lm loss: 3.763776E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.106 | TFLOPs: 28.53 | +7: iteration 18300/ 37905 | consumed samples: 4684800 | consumed tokens: 9594470400 | elapsed time per iteration (s): 0.23 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.758464E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.036 | TFLOPs: 28.69 | +7: iteration 18310/ 37905 | consumed samples: 4687360 | consumed tokens: 9599713280 | elapsed time per iteration (s): 0.24 | learning rate: 1.163E-04 | global batch size: 256 | lm loss: 3.772592E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1081.724 | TFLOPs: 27.56 | +7: iteration 18320/ 37905 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 0.23 | learning rate: 1.162E-04 | global batch size: 256 | lm loss: 3.766602E+00 | grad norm: 0.321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1114.409 | TFLOPs: 28.39 | +7: iteration 18330/ 37905 | consumed samples: 4692480 | consumed tokens: 9610199040 | elapsed time per iteration (s): 0.23 | learning rate: 1.161E-04 | global batch size: 256 | lm loss: 3.760901E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1090.007 | TFLOPs: 27.77 | +7: iteration 18340/ 37905 | consumed samples: 4695040 | consumed tokens: 9615441920 | elapsed time per iteration (s): 0.23 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.757770E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.179 | TFLOPs: 28.61 | +7: iteration 18350/ 37905 | consumed samples: 4697600 | consumed tokens: 9620684800 | elapsed time per iteration (s): 0.23 | learning rate: 1.160E-04 | global batch size: 256 | lm loss: 3.759374E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.729 | TFLOPs: 28.14 | +7: iteration 18360/ 37905 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 0.23 | learning rate: 1.159E-04 | global batch size: 256 | lm loss: 3.751749E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.617 | TFLOPs: 28.19 | +7: iteration 18370/ 37905 | consumed samples: 4702720 | consumed tokens: 9631170560 | elapsed time per iteration (s): 0.22 | learning rate: 1.158E-04 | global batch size: 256 | lm loss: 3.752825E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.025 | TFLOPs: 29.20 | +7: iteration 18380/ 37905 | consumed samples: 4705280 | consumed tokens: 9636413440 | elapsed time per iteration (s): 0.23 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.770293E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.166 | TFLOPs: 28.61 | +7: iteration 18390/ 37905 | consumed samples: 4707840 | consumed tokens: 9641656320 | elapsed time per iteration (s): 0.23 | learning rate: 1.157E-04 | global batch size: 256 | lm loss: 3.763558E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.086 | TFLOPs: 28.53 | +7: iteration 18400/ 37905 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 0.23 | learning rate: 1.156E-04 | global batch size: 256 | lm loss: 3.763483E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.816 | TFLOPs: 28.81 | +7: iteration 18410/ 37905 | consumed samples: 4712960 | consumed tokens: 9652142080 | elapsed time per iteration (s): 0.23 | learning rate: 1.155E-04 | global batch size: 256 | lm loss: 3.767683E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1114.028 | TFLOPs: 28.38 | +7: iteration 18420/ 37905 | consumed samples: 4715520 | consumed tokens: 9657384960 | elapsed time per iteration (s): 0.22 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.749863E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.295 | TFLOPs: 29.15 | +7: iteration 18430/ 37905 | consumed samples: 4718080 | consumed tokens: 9662627840 | elapsed time per iteration (s): 0.23 | learning rate: 1.154E-04 | global batch size: 256 | lm loss: 3.766568E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.182 | TFLOPs: 28.77 | +7: iteration 18440/ 37905 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 0.24 | learning rate: 1.153E-04 | global batch size: 256 | lm loss: 3.750764E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1075.638 | TFLOPs: 27.40 | +7: iteration 18450/ 37905 | consumed samples: 4723200 | consumed tokens: 9673113600 | elapsed time per iteration (s): 0.28 | learning rate: 1.152E-04 | global batch size: 256 | lm loss: 3.765272E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 910.653 | TFLOPs: 23.20 | +7: iteration 18460/ 37905 | consumed samples: 4725760 | consumed tokens: 9678356480 | elapsed time per iteration (s): 0.23 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.749827E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.959 | TFLOPs: 28.38 | +7: iteration 18470/ 37905 | consumed samples: 4728320 | consumed tokens: 9683599360 | elapsed time per iteration (s): 0.23 | learning rate: 1.151E-04 | global batch size: 256 | lm loss: 3.766306E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.448 | TFLOPs: 28.14 | +7: iteration 18480/ 37905 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 0.23 | learning rate: 1.150E-04 | global batch size: 256 | lm loss: 3.754276E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.600 | TFLOPs: 28.37 | +7: iteration 18490/ 37905 | consumed samples: 4733440 | consumed tokens: 9694085120 | elapsed time per iteration (s): 0.23 | learning rate: 1.149E-04 | global batch size: 256 | lm loss: 3.760919E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.486 | TFLOPs: 28.62 | +7: iteration 18500/ 37905 | consumed samples: 4736000 | consumed tokens: 9699328000 | elapsed time per iteration (s): 0.22 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.758462E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.442 | TFLOPs: 29.38 | +7: iteration 18510/ 37905 | consumed samples: 4738560 | consumed tokens: 9704570880 | elapsed time per iteration (s): 0.23 | learning rate: 1.148E-04 | global batch size: 256 | lm loss: 3.765260E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.343 | TFLOPs: 28.62 | +7: iteration 18520/ 37905 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 0.22 | learning rate: 1.147E-04 | global batch size: 256 | lm loss: 3.770630E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.281 | TFLOPs: 29.00 | +7: iteration 18530/ 37905 | consumed samples: 4743680 | consumed tokens: 9715056640 | elapsed time per iteration (s): 0.22 | learning rate: 1.146E-04 | global batch size: 256 | lm loss: 3.774665E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.727 | TFLOPs: 29.19 | +7: iteration 18540/ 37905 | consumed samples: 4746240 | consumed tokens: 9720299520 | elapsed time per iteration (s): 0.22 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.766800E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.096 | TFLOPs: 28.99 | +7: iteration 18550/ 37905 | consumed samples: 4748800 | consumed tokens: 9725542400 | elapsed time per iteration (s): 0.23 | learning rate: 1.145E-04 | global batch size: 256 | lm loss: 3.761959E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.004 | TFLOPs: 28.79 | +7: iteration 18560/ 37905 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 0.23 | learning rate: 1.144E-04 | global batch size: 256 | lm loss: 3.758165E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.833 | TFLOPs: 28.60 | +7: iteration 18570/ 37905 | consumed samples: 4753920 | consumed tokens: 9736028160 | elapsed time per iteration (s): 0.22 | learning rate: 1.143E-04 | global batch size: 256 | lm loss: 3.761620E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.958 | TFLOPs: 29.12 | +7: iteration 18580/ 37905 | consumed samples: 4756480 | consumed tokens: 9741271040 | elapsed time per iteration (s): 0.23 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.766160E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.107 | TFLOPs: 28.41 | +7: iteration 18590/ 37905 | consumed samples: 4759040 | consumed tokens: 9746513920 | elapsed time per iteration (s): 0.23 | learning rate: 1.142E-04 | global batch size: 256 | lm loss: 3.763873E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.621 | TFLOPs: 28.83 | +7: iteration 18600/ 37905 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 0.22 | learning rate: 1.141E-04 | global batch size: 256 | lm loss: 3.760320E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.960 | TFLOPs: 29.12 | +7: iteration 18610/ 37905 | consumed samples: 4764160 | consumed tokens: 9756999680 | elapsed time per iteration (s): 0.23 | learning rate: 1.140E-04 | global batch size: 256 | lm loss: 3.760738E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1099.485 | TFLOPs: 28.01 | +7: iteration 18620/ 37905 | consumed samples: 4766720 | consumed tokens: 9762242560 | elapsed time per iteration (s): 0.23 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.757883E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.056 | TFLOPs: 28.02 | +7: iteration 18630/ 37905 | consumed samples: 4769280 | consumed tokens: 9767485440 | elapsed time per iteration (s): 0.23 | learning rate: 1.139E-04 | global batch size: 256 | lm loss: 3.757613E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.235 | TFLOPs: 28.92 | +7: iteration 18640/ 37905 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 0.23 | learning rate: 1.138E-04 | global batch size: 256 | lm loss: 3.764463E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.519 | TFLOPs: 28.88 | +7: iteration 18650/ 37905 | consumed samples: 4774400 | consumed tokens: 9777971200 | elapsed time per iteration (s): 0.23 | learning rate: 1.137E-04 | global batch size: 256 | lm loss: 3.767451E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.395 | TFLOPs: 28.90 | +7: iteration 18660/ 37905 | consumed samples: 4776960 | consumed tokens: 9783214080 | elapsed time per iteration (s): 0.22 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.759090E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.048 | TFLOPs: 29.02 | +7: iteration 18670/ 37905 | consumed samples: 4779520 | consumed tokens: 9788456960 | elapsed time per iteration (s): 0.23 | learning rate: 1.136E-04 | global batch size: 256 | lm loss: 3.744267E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.749 | TFLOPs: 28.63 | +7: iteration 18680/ 37905 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 0.22 | learning rate: 1.135E-04 | global batch size: 256 | lm loss: 3.750637E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.208 | TFLOPs: 29.35 | +7: iteration 18690/ 37905 | consumed samples: 4784640 | consumed tokens: 9798942720 | elapsed time per iteration (s): 0.22 | learning rate: 1.134E-04 | global batch size: 256 | lm loss: 3.748067E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.766 | TFLOPs: 29.32 | +7: iteration 18700/ 37905 | consumed samples: 4787200 | consumed tokens: 9804185600 | elapsed time per iteration (s): 0.22 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.760354E+00 | grad norm: 0.287 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.782 | TFLOPs: 29.32 | +7: iteration 18710/ 37905 | consumed samples: 4789760 | consumed tokens: 9809428480 | elapsed time per iteration (s): 0.23 | learning rate: 1.133E-04 | global batch size: 256 | lm loss: 3.762626E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.184 | TFLOPs: 28.82 | +7: iteration 18720/ 37905 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 0.23 | learning rate: 1.132E-04 | global batch size: 256 | lm loss: 3.753551E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.688 | TFLOPs: 28.83 | +7: iteration 18730/ 37905 | consumed samples: 4794880 | consumed tokens: 9819914240 | elapsed time per iteration (s): 0.22 | learning rate: 1.131E-04 | global batch size: 256 | lm loss: 3.762334E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.260 | TFLOPs: 29.40 | +7: iteration 18740/ 37905 | consumed samples: 4797440 | consumed tokens: 9825157120 | elapsed time per iteration (s): 0.22 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.756943E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.760 | TFLOPs: 29.24 | +7: iteration 18750/ 37905 | consumed samples: 4800000 | consumed tokens: 9830400000 | elapsed time per iteration (s): 0.22 | learning rate: 1.130E-04 | global batch size: 256 | lm loss: 3.768830E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.127 | TFLOPs: 29.12 | +7: iteration 18760/ 37905 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 0.23 | learning rate: 1.129E-04 | global batch size: 256 | lm loss: 3.758425E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.792 | TFLOPs: 28.71 | +7: iteration 18770/ 37905 | consumed samples: 4805120 | consumed tokens: 9840885760 | elapsed time per iteration (s): 0.23 | learning rate: 1.128E-04 | global batch size: 256 | lm loss: 3.772383E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.634 | TFLOPs: 28.98 | +7: iteration 18780/ 37905 | consumed samples: 4807680 | consumed tokens: 9846128640 | elapsed time per iteration (s): 0.23 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.749426E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.306 | TFLOPs: 28.67 | +7: iteration 18790/ 37905 | consumed samples: 4810240 | consumed tokens: 9851371520 | elapsed time per iteration (s): 0.23 | learning rate: 1.127E-04 | global batch size: 256 | lm loss: 3.762314E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.965 | TFLOPs: 28.66 | +7: iteration 18800/ 37905 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 0.23 | learning rate: 1.126E-04 | global batch size: 256 | lm loss: 3.757033E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.580 | TFLOPs: 28.60 | +7: iteration 18810/ 37905 | consumed samples: 4815360 | consumed tokens: 9861857280 | elapsed time per iteration (s): 0.23 | learning rate: 1.125E-04 | global batch size: 256 | lm loss: 3.752851E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.427 | TFLOPs: 28.80 | +7: iteration 18820/ 37905 | consumed samples: 4817920 | consumed tokens: 9867100160 | elapsed time per iteration (s): 0.22 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.768306E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.560 | TFLOPs: 29.23 | +7: iteration 18830/ 37905 | consumed samples: 4820480 | consumed tokens: 9872343040 | elapsed time per iteration (s): 0.22 | learning rate: 1.124E-04 | global batch size: 256 | lm loss: 3.748376E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.259 | TFLOPs: 29.00 | +7: iteration 18840/ 37905 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 0.22 | learning rate: 1.123E-04 | global batch size: 256 | lm loss: 3.761927E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.761 | TFLOPs: 29.34 | +7: iteration 18850/ 37905 | consumed samples: 4825600 | consumed tokens: 9882828800 | elapsed time per iteration (s): 0.23 | learning rate: 1.122E-04 | global batch size: 256 | lm loss: 3.759418E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.902 | TFLOPs: 28.84 | +7: iteration 18860/ 37905 | consumed samples: 4828160 | consumed tokens: 9888071680 | elapsed time per iteration (s): 0.23 | learning rate: 1.121E-04 | global batch size: 256 | lm loss: 3.743830E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.936 | TFLOPs: 28.94 | +7: iteration 18870/ 37905 | consumed samples: 4830720 | consumed tokens: 9893314560 | elapsed time per iteration (s): 0.22 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.758289E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.756 | TFLOPs: 29.16 | +7: iteration 18880/ 37905 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 0.22 | learning rate: 1.120E-04 | global batch size: 256 | lm loss: 3.757211E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.772 | TFLOPs: 29.34 | +7: iteration 18890/ 37905 | consumed samples: 4835840 | consumed tokens: 9903800320 | elapsed time per iteration (s): 0.22 | learning rate: 1.119E-04 | global batch size: 256 | lm loss: 3.759597E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.693 | TFLOPs: 29.31 | +7: iteration 18900/ 37905 | consumed samples: 4838400 | consumed tokens: 9909043200 | elapsed time per iteration (s): 0.22 | learning rate: 1.118E-04 | global batch size: 256 | lm loss: 3.756502E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.996 | TFLOPs: 29.07 | +7: iteration 18910/ 37905 | consumed samples: 4840960 | consumed tokens: 9914286080 | elapsed time per iteration (s): 0.22 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.755262E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.348 | TFLOPs: 29.03 | +7: iteration 18920/ 37905 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 0.22 | learning rate: 1.117E-04 | global batch size: 256 | lm loss: 3.764092E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.554 | TFLOPs: 29.16 | +7: iteration 18930/ 37905 | consumed samples: 4846080 | consumed tokens: 9924771840 | elapsed time per iteration (s): 0.22 | learning rate: 1.116E-04 | global batch size: 256 | lm loss: 3.750136E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.254 | TFLOPs: 29.18 | +7: iteration 18940/ 37905 | consumed samples: 4848640 | consumed tokens: 9930014720 | elapsed time per iteration (s): 0.23 | learning rate: 1.115E-04 | global batch size: 256 | lm loss: 3.770371E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.950 | TFLOPs: 28.91 | +7: iteration 18950/ 37905 | consumed samples: 4851200 | consumed tokens: 9935257600 | elapsed time per iteration (s): 0.23 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.769889E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.602 | TFLOPs: 28.57 | +7: iteration 18960/ 37905 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 0.23 | learning rate: 1.114E-04 | global batch size: 256 | lm loss: 3.754565E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.318 | TFLOPs: 27.93 | +7: iteration 18970/ 37905 | consumed samples: 4856320 | consumed tokens: 9945743360 | elapsed time per iteration (s): 0.22 | learning rate: 1.113E-04 | global batch size: 256 | lm loss: 3.748816E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.290 | TFLOPs: 29.15 | +7: iteration 18980/ 37905 | consumed samples: 4858880 | consumed tokens: 9950986240 | elapsed time per iteration (s): 0.22 | learning rate: 1.112E-04 | global batch size: 256 | lm loss: 3.744463E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.225 | TFLOPs: 29.07 | +7: iteration 18990/ 37905 | consumed samples: 4861440 | consumed tokens: 9956229120 | elapsed time per iteration (s): 0.22 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.752338E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.056 | TFLOPs: 29.32 | +7: iteration 19000/ 37905 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 0.23 | learning rate: 1.111E-04 | global batch size: 256 | lm loss: 3.753788E+00 | grad norm: 0.275 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.189 | TFLOPs: 28.46 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 19000 | lm loss value: 3.679648E+00 | lm loss PPL: 3.963243E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 19000 to checkpoints_83m20b400m +0: [2023-03-15 23:08:22,502] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step19000 is begin to save! +0: [2023-03-15 23:08:22,505] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:08:22,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:08:22,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:08:22,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:08:22,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:08:22,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:08:22,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:08:22,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:08:22,617] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:08:22,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:08:22,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:08:22,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:08:22,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:08:22,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:08:22,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:08:22,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:08:22,662] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:08:22,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:08:22,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:08:22,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:08:22,684] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:08:22,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:08:22,695] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:08:22,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:08:22,697] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step19000/mp_rank_00_model_states.pt +0: [2023-03-15 23:08:22,697] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:08:22,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:08:22,717] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:08:22,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-15 23:08:22,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +0: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +1: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +6: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +2: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +4: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +4: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +3: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +5: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +7: [2023-03-15 23:08:22,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step19000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +7: [2023-03-15 23:08:22,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step19000 is ready now! +0: successfully saved checkpoint at iteration 19000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 248.99 +7: iteration 19010/ 37905 | consumed samples: 4866560 | consumed tokens: 9966714880 | elapsed time per iteration (s): 0.25 | learning rate: 1.110E-04 | global batch size: 256 | lm loss: 3.759740E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1009.344 | TFLOPs: 25.71 | +7: iteration 19020/ 37905 | consumed samples: 4869120 | consumed tokens: 9971957760 | elapsed time per iteration (s): 0.23 | learning rate: 1.109E-04 | global batch size: 256 | lm loss: 3.757050E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.577 | TFLOPs: 28.57 | +7: iteration 19030/ 37905 | consumed samples: 4871680 | consumed tokens: 9977200640 | elapsed time per iteration (s): 0.22 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.760451E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.528 | TFLOPs: 29.36 | +7: iteration 19040/ 37905 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 0.22 | learning rate: 1.108E-04 | global batch size: 256 | lm loss: 3.760711E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.773 | TFLOPs: 29.32 | +7: iteration 19050/ 37905 | consumed samples: 4876800 | consumed tokens: 9987686400 | elapsed time per iteration (s): 0.22 | learning rate: 1.107E-04 | global batch size: 256 | lm loss: 3.741663E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.432 | TFLOPs: 29.36 | +7: iteration 19060/ 37905 | consumed samples: 4879360 | consumed tokens: 9992929280 | elapsed time per iteration (s): 0.22 | learning rate: 1.106E-04 | global batch size: 256 | lm loss: 3.757490E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.963 | TFLOPs: 29.35 | +7: iteration 19070/ 37905 | consumed samples: 4881920 | consumed tokens: 9998172160 | elapsed time per iteration (s): 0.22 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.763705E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.298 | TFLOPs: 29.10 | +7: iteration 19080/ 37905 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 0.23 | learning rate: 1.105E-04 | global batch size: 256 | lm loss: 3.749717E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.198 | TFLOPs: 28.92 | +7: iteration 19090/ 37905 | consumed samples: 4887040 | consumed tokens: 10008657920 | elapsed time per iteration (s): 0.22 | learning rate: 1.104E-04 | global batch size: 256 | lm loss: 3.746045E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.980 | TFLOPs: 29.35 | +7: iteration 19100/ 37905 | consumed samples: 4889600 | consumed tokens: 10013900800 | elapsed time per iteration (s): 0.23 | learning rate: 1.103E-04 | global batch size: 256 | lm loss: 3.736595E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1101.437 | TFLOPs: 28.06 | +7: iteration 19110/ 37905 | consumed samples: 4892160 | consumed tokens: 10019143680 | elapsed time per iteration (s): 0.22 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.758103E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.329 | TFLOPs: 29.13 | +7: iteration 19120/ 37905 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 0.23 | learning rate: 1.102E-04 | global batch size: 256 | lm loss: 3.756761E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.181 | TFLOPs: 28.77 | +7: iteration 19130/ 37905 | consumed samples: 4897280 | consumed tokens: 10029629440 | elapsed time per iteration (s): 0.22 | learning rate: 1.101E-04 | global batch size: 256 | lm loss: 3.759600E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.938 | TFLOPs: 29.32 | +7: iteration 19140/ 37905 | consumed samples: 4899840 | consumed tokens: 10034872320 | elapsed time per iteration (s): 0.22 | learning rate: 1.100E-04 | global batch size: 256 | lm loss: 3.750291E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.471 | TFLOPs: 29.13 | +7: iteration 19150/ 37905 | consumed samples: 4902400 | consumed tokens: 10040115200 | elapsed time per iteration (s): 0.22 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.761630E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.313 | TFLOPs: 29.33 | +7: iteration 19160/ 37905 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 0.22 | learning rate: 1.099E-04 | global batch size: 256 | lm loss: 3.769744E+00 | grad norm: 0.268 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.214 | TFLOPs: 29.33 | +7: iteration 19170/ 37905 | consumed samples: 4907520 | consumed tokens: 10050600960 | elapsed time per iteration (s): 0.22 | learning rate: 1.098E-04 | global batch size: 256 | lm loss: 3.742698E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.342 | TFLOPs: 29.33 | +7: iteration 19180/ 37905 | consumed samples: 4910080 | consumed tokens: 10055843840 | elapsed time per iteration (s): 0.22 | learning rate: 1.097E-04 | global batch size: 256 | lm loss: 3.754427E+00 | grad norm: 0.298 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.183 | TFLOPs: 29.25 | +7: iteration 19190/ 37905 | consumed samples: 4912640 | consumed tokens: 10061086720 | elapsed time per iteration (s): 0.22 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.757951E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.549 | TFLOPs: 29.08 | +7: iteration 19200/ 37905 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 0.22 | learning rate: 1.096E-04 | global batch size: 256 | lm loss: 3.742084E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.488 | TFLOPs: 29.31 | +7: iteration 19210/ 37905 | consumed samples: 4917760 | consumed tokens: 10071572480 | elapsed time per iteration (s): 0.22 | learning rate: 1.095E-04 | global batch size: 256 | lm loss: 3.752827E+00 | grad norm: 0.281 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.019 | TFLOPs: 29.32 | +7: iteration 19220/ 37905 | consumed samples: 4920320 | consumed tokens: 10076815360 | elapsed time per iteration (s): 0.22 | learning rate: 1.094E-04 | global batch size: 256 | lm loss: 3.746064E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.182 | TFLOPs: 29.35 | +7: iteration 19230/ 37905 | consumed samples: 4922880 | consumed tokens: 10082058240 | elapsed time per iteration (s): 0.22 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.761506E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.733 | TFLOPs: 29.34 | +7: iteration 19240/ 37905 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 0.22 | learning rate: 1.093E-04 | global batch size: 256 | lm loss: 3.756800E+00 | grad norm: 0.291 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.955 | TFLOPs: 29.30 | +7: iteration 19250/ 37905 | consumed samples: 4928000 | consumed tokens: 10092544000 | elapsed time per iteration (s): 0.22 | learning rate: 1.092E-04 | global batch size: 256 | lm loss: 3.748571E+00 | grad norm: 0.297 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.893 | TFLOPs: 29.32 | +7: iteration 19260/ 37905 | consumed samples: 4930560 | consumed tokens: 10097786880 | elapsed time per iteration (s): 0.22 | learning rate: 1.091E-04 | global batch size: 256 | lm loss: 3.770717E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.562 | TFLOPs: 29.34 | +7: iteration 19270/ 37905 | consumed samples: 4933120 | consumed tokens: 10103029760 | elapsed time per iteration (s): 0.22 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.755519E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.890 | TFLOPs: 29.34 | +7: iteration 19280/ 37905 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 0.23 | learning rate: 1.090E-04 | global batch size: 256 | lm loss: 3.746900E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.362 | TFLOPs: 28.90 | +7: iteration 19290/ 37905 | consumed samples: 4938240 | consumed tokens: 10113515520 | elapsed time per iteration (s): 0.23 | learning rate: 1.089E-04 | global batch size: 256 | lm loss: 3.761241E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.341 | TFLOPs: 28.85 | +7: iteration 19300/ 37905 | consumed samples: 4940800 | consumed tokens: 10118758400 | elapsed time per iteration (s): 0.22 | learning rate: 1.088E-04 | global batch size: 256 | lm loss: 3.740593E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.594 | TFLOPs: 29.34 | +7: iteration 19310/ 37905 | consumed samples: 4943360 | consumed tokens: 10124001280 | elapsed time per iteration (s): 0.22 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.746816E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.752 | TFLOPs: 29.32 | +7: iteration 19320/ 37905 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 0.22 | learning rate: 1.087E-04 | global batch size: 256 | lm loss: 3.756054E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.241 | TFLOPs: 29.33 | +7: iteration 19330/ 37905 | consumed samples: 4948480 | consumed tokens: 10134487040 | elapsed time per iteration (s): 0.22 | learning rate: 1.086E-04 | global batch size: 256 | lm loss: 3.744653E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.430 | TFLOPs: 29.31 | +7: iteration 19340/ 37905 | consumed samples: 4951040 | consumed tokens: 10139729920 | elapsed time per iteration (s): 0.23 | learning rate: 1.085E-04 | global batch size: 256 | lm loss: 3.757489E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.992 | TFLOPs: 28.68 | +7: iteration 19350/ 37905 | consumed samples: 4953600 | consumed tokens: 10144972800 | elapsed time per iteration (s): 0.22 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.751486E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.949 | TFLOPs: 29.12 | +7: iteration 19360/ 37905 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 0.22 | learning rate: 1.084E-04 | global batch size: 256 | lm loss: 3.752345E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.177 | TFLOPs: 29.33 | +7: iteration 19370/ 37905 | consumed samples: 4958720 | consumed tokens: 10155458560 | elapsed time per iteration (s): 0.22 | learning rate: 1.083E-04 | global batch size: 256 | lm loss: 3.751360E+00 | grad norm: 0.273 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.497 | TFLOPs: 29.13 | +7: iteration 19380/ 37905 | consumed samples: 4961280 | consumed tokens: 10160701440 | elapsed time per iteration (s): 0.22 | learning rate: 1.082E-04 | global batch size: 256 | lm loss: 3.756071E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.584 | TFLOPs: 29.34 | +7: iteration 19390/ 37905 | consumed samples: 4963840 | consumed tokens: 10165944320 | elapsed time per iteration (s): 0.22 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.756842E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.349 | TFLOPs: 29.31 | +7: iteration 19400/ 37905 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 0.22 | learning rate: 1.081E-04 | global batch size: 256 | lm loss: 3.740953E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.495 | TFLOPs: 29.31 | +7: iteration 19410/ 37905 | consumed samples: 4968960 | consumed tokens: 10176430080 | elapsed time per iteration (s): 0.22 | learning rate: 1.080E-04 | global batch size: 256 | lm loss: 3.742920E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.915 | TFLOPs: 29.32 | +7: iteration 19420/ 37905 | consumed samples: 4971520 | consumed tokens: 10181672960 | elapsed time per iteration (s): 0.22 | learning rate: 1.079E-04 | global batch size: 256 | lm loss: 3.747184E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.318 | TFLOPs: 29.23 | +7: iteration 19430/ 37905 | consumed samples: 4974080 | consumed tokens: 10186915840 | elapsed time per iteration (s): 0.22 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.747294E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.987 | TFLOPs: 29.32 | +7: iteration 19440/ 37905 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 0.22 | learning rate: 1.078E-04 | global batch size: 256 | lm loss: 3.749037E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.150 | TFLOPs: 29.38 | +7: iteration 19450/ 37905 | consumed samples: 4979200 | consumed tokens: 10197401600 | elapsed time per iteration (s): 0.22 | learning rate: 1.077E-04 | global batch size: 256 | lm loss: 3.748196E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.116 | TFLOPs: 29.40 | +7: iteration 19460/ 37905 | consumed samples: 4981760 | consumed tokens: 10202644480 | elapsed time per iteration (s): 0.22 | learning rate: 1.076E-04 | global batch size: 256 | lm loss: 3.757080E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.942 | TFLOPs: 29.40 | +7: iteration 19470/ 37905 | consumed samples: 4984320 | consumed tokens: 10207887360 | elapsed time per iteration (s): 0.22 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.763428E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.627 | TFLOPs: 29.39 | +7: iteration 19480/ 37905 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 0.22 | learning rate: 1.075E-04 | global batch size: 256 | lm loss: 3.762734E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.530 | TFLOPs: 29.36 | +7: iteration 19490/ 37905 | consumed samples: 4989440 | consumed tokens: 10218373120 | elapsed time per iteration (s): 0.23 | learning rate: 1.074E-04 | global batch size: 256 | lm loss: 3.771298E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.193 | TFLOPs: 28.97 | +7: iteration 19500/ 37905 | consumed samples: 4992000 | consumed tokens: 10223616000 | elapsed time per iteration (s): 0.22 | learning rate: 1.073E-04 | global batch size: 256 | lm loss: 3.741661E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.681 | TFLOPs: 29.34 | +7: iteration 19510/ 37905 | consumed samples: 4994560 | consumed tokens: 10228858880 | elapsed time per iteration (s): 0.22 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.759148E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.073 | TFLOPs: 29.30 | +7: iteration 19520/ 37905 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 0.22 | learning rate: 1.072E-04 | global batch size: 256 | lm loss: 3.754264E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.787 | TFLOPs: 29.34 | +7: iteration 19530/ 37905 | consumed samples: 4999680 | consumed tokens: 10239344640 | elapsed time per iteration (s): 0.22 | learning rate: 1.071E-04 | global batch size: 256 | lm loss: 3.769702E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.295 | TFLOPs: 29.38 | +7: iteration 19540/ 37905 | consumed samples: 5002240 | consumed tokens: 10244587520 | elapsed time per iteration (s): 0.22 | learning rate: 1.070E-04 | global batch size: 256 | lm loss: 3.751614E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.231 | TFLOPs: 29.38 | +7: iteration 19550/ 37905 | consumed samples: 5004800 | consumed tokens: 10249830400 | elapsed time per iteration (s): 0.22 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.746246E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.507 | TFLOPs: 29.13 | +7: iteration 19560/ 37905 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 0.22 | learning rate: 1.069E-04 | global batch size: 256 | lm loss: 3.751469E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.548 | TFLOPs: 29.13 | +7: iteration 19570/ 37905 | consumed samples: 5009920 | consumed tokens: 10260316160 | elapsed time per iteration (s): 0.22 | learning rate: 1.068E-04 | global batch size: 256 | lm loss: 3.756042E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.189 | TFLOPs: 29.12 | +7: iteration 19580/ 37905 | consumed samples: 5012480 | consumed tokens: 10265559040 | elapsed time per iteration (s): 0.22 | learning rate: 1.067E-04 | global batch size: 256 | lm loss: 3.744936E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.513 | TFLOPs: 29.39 | +7: iteration 19590/ 37905 | consumed samples: 5015040 | consumed tokens: 10270801920 | elapsed time per iteration (s): 0.22 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.761792E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.781 | TFLOPs: 29.32 | +7: iteration 19600/ 37905 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 0.22 | learning rate: 1.066E-04 | global batch size: 256 | lm loss: 3.741043E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.629 | TFLOPs: 29.39 | +7: iteration 19610/ 37905 | consumed samples: 5020160 | consumed tokens: 10281287680 | elapsed time per iteration (s): 0.22 | learning rate: 1.065E-04 | global batch size: 256 | lm loss: 3.754600E+00 | grad norm: 0.264 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.095 | TFLOPs: 29.12 | +7: iteration 19620/ 37905 | consumed samples: 5022720 | consumed tokens: 10286530560 | elapsed time per iteration (s): 0.22 | learning rate: 1.064E-04 | global batch size: 256 | lm loss: 3.751027E+00 | grad norm: 0.305 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.389 | TFLOPs: 29.38 | +7: iteration 19630/ 37905 | consumed samples: 5025280 | consumed tokens: 10291773440 | elapsed time per iteration (s): 0.22 | learning rate: 1.063E-04 | global batch size: 256 | lm loss: 3.758529E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.278 | TFLOPs: 29.38 | +7: iteration 19640/ 37905 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 0.22 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.764226E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.610 | TFLOPs: 29.36 | +7: iteration 19650/ 37905 | consumed samples: 5030400 | consumed tokens: 10302259200 | elapsed time per iteration (s): 0.23 | learning rate: 1.062E-04 | global batch size: 256 | lm loss: 3.752384E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.752 | TFLOPs: 28.17 | +7: iteration 19660/ 37905 | consumed samples: 5032960 | consumed tokens: 10307502080 | elapsed time per iteration (s): 0.22 | learning rate: 1.061E-04 | global batch size: 256 | lm loss: 3.765054E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.644 | TFLOPs: 29.36 | +7: iteration 19670/ 37905 | consumed samples: 5035520 | consumed tokens: 10312744960 | elapsed time per iteration (s): 0.23 | learning rate: 1.060E-04 | global batch size: 256 | lm loss: 3.747865E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.760 | TFLOPs: 28.91 | +7: iteration 19680/ 37905 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 0.22 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.745898E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.077 | TFLOPs: 29.37 | +7: iteration 19690/ 37905 | consumed samples: 5040640 | consumed tokens: 10323230720 | elapsed time per iteration (s): 0.22 | learning rate: 1.059E-04 | global batch size: 256 | lm loss: 3.749603E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.460 | TFLOPs: 29.36 | +7: iteration 19700/ 37905 | consumed samples: 5043200 | consumed tokens: 10328473600 | elapsed time per iteration (s): 0.22 | learning rate: 1.058E-04 | global batch size: 256 | lm loss: 3.765540E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.360 | TFLOPs: 29.36 | +7: iteration 19710/ 37905 | consumed samples: 5045760 | consumed tokens: 10333716480 | elapsed time per iteration (s): 0.22 | learning rate: 1.057E-04 | global batch size: 256 | lm loss: 3.748549E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.176 | TFLOPs: 29.33 | +7: iteration 19720/ 37905 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 0.22 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.749462E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.710 | TFLOPs: 29.31 | +7: iteration 19730/ 37905 | consumed samples: 5050880 | consumed tokens: 10344202240 | elapsed time per iteration (s): 0.22 | learning rate: 1.056E-04 | global batch size: 256 | lm loss: 3.735474E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.646 | TFLOPs: 29.29 | +7: iteration 19740/ 37905 | consumed samples: 5053440 | consumed tokens: 10349445120 | elapsed time per iteration (s): 0.22 | learning rate: 1.055E-04 | global batch size: 256 | lm loss: 3.742808E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.332 | TFLOPs: 29.25 | +7: iteration 19750/ 37905 | consumed samples: 5056000 | consumed tokens: 10354688000 | elapsed time per iteration (s): 0.22 | learning rate: 1.054E-04 | global batch size: 256 | lm loss: 3.759088E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.815 | TFLOPs: 29.24 | +7: iteration 19760/ 37905 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 0.22 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.760209E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.531 | TFLOPs: 29.31 | +7: iteration 19770/ 37905 | consumed samples: 5061120 | consumed tokens: 10365173760 | elapsed time per iteration (s): 0.22 | learning rate: 1.053E-04 | global batch size: 256 | lm loss: 3.751360E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.239 | TFLOPs: 29.28 | +7: iteration 19780/ 37905 | consumed samples: 5063680 | consumed tokens: 10370416640 | elapsed time per iteration (s): 0.23 | learning rate: 1.052E-04 | global batch size: 256 | lm loss: 3.764745E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.470 | TFLOPs: 28.14 | +7: iteration 19790/ 37905 | consumed samples: 5066240 | consumed tokens: 10375659520 | elapsed time per iteration (s): 0.22 | learning rate: 1.051E-04 | global batch size: 256 | lm loss: 3.747824E+00 | grad norm: 0.271 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.292 | TFLOPs: 29.05 | +7: iteration 19800/ 37905 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 0.22 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.745566E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.250 | TFLOPs: 29.33 | +7: iteration 19810/ 37905 | consumed samples: 5071360 | consumed tokens: 10386145280 | elapsed time per iteration (s): 0.22 | learning rate: 1.050E-04 | global batch size: 256 | lm loss: 3.743392E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.613 | TFLOPs: 29.34 | +7: iteration 19820/ 37905 | consumed samples: 5073920 | consumed tokens: 10391388160 | elapsed time per iteration (s): 0.22 | learning rate: 1.049E-04 | global batch size: 256 | lm loss: 3.752243E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.216 | TFLOPs: 29.33 | +7: iteration 19830/ 37905 | consumed samples: 5076480 | consumed tokens: 10396631040 | elapsed time per iteration (s): 0.22 | learning rate: 1.048E-04 | global batch size: 256 | lm loss: 3.751608E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.079 | TFLOPs: 29.07 | +7: iteration 19840/ 37905 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 0.22 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.751827E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.477 | TFLOPs: 29.33 | +7: iteration 19850/ 37905 | consumed samples: 5081600 | consumed tokens: 10407116800 | elapsed time per iteration (s): 0.23 | learning rate: 1.047E-04 | global batch size: 256 | lm loss: 3.758003E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.156 | TFLOPs: 28.97 | +7: iteration 19860/ 37905 | consumed samples: 5084160 | consumed tokens: 10412359680 | elapsed time per iteration (s): 0.22 | learning rate: 1.046E-04 | global batch size: 256 | lm loss: 3.747832E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.331 | TFLOPs: 29.33 | +7: iteration 19870/ 37905 | consumed samples: 5086720 | consumed tokens: 10417602560 | elapsed time per iteration (s): 0.22 | learning rate: 1.045E-04 | global batch size: 256 | lm loss: 3.754064E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.278 | TFLOPs: 29.05 | +7: iteration 19880/ 37905 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 0.22 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.745708E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.719 | TFLOPs: 29.06 | +7: iteration 19890/ 37905 | consumed samples: 5091840 | consumed tokens: 10428088320 | elapsed time per iteration (s): 0.23 | learning rate: 1.044E-04 | global batch size: 256 | lm loss: 3.768621E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.878 | TFLOPs: 28.89 | +7: iteration 19900/ 37905 | consumed samples: 5094400 | consumed tokens: 10433331200 | elapsed time per iteration (s): 0.22 | learning rate: 1.043E-04 | global batch size: 256 | lm loss: 3.744194E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.255 | TFLOPs: 29.30 | +7: iteration 19910/ 37905 | consumed samples: 5096960 | consumed tokens: 10438574080 | elapsed time per iteration (s): 0.23 | learning rate: 1.042E-04 | global batch size: 256 | lm loss: 3.744955E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.043 | TFLOPs: 28.94 | +7: iteration 19920/ 37905 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 0.23 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.757289E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.476 | TFLOPs: 28.80 | +7: iteration 19930/ 37905 | consumed samples: 5102080 | consumed tokens: 10449059840 | elapsed time per iteration (s): 0.23 | learning rate: 1.041E-04 | global batch size: 256 | lm loss: 3.759970E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.792 | TFLOPs: 28.73 | +7: iteration 19940/ 37905 | consumed samples: 5104640 | consumed tokens: 10454302720 | elapsed time per iteration (s): 0.22 | learning rate: 1.040E-04 | global batch size: 256 | lm loss: 3.739242E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.979 | TFLOPs: 29.24 | +7: iteration 19950/ 37905 | consumed samples: 5107200 | consumed tokens: 10459545600 | elapsed time per iteration (s): 0.23 | learning rate: 1.039E-04 | global batch size: 256 | lm loss: 3.759926E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.470 | TFLOPs: 28.90 | +7: iteration 19960/ 37905 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 0.22 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.751317E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.680 | TFLOPs: 29.24 | +7: iteration 19970/ 37905 | consumed samples: 5112320 | consumed tokens: 10470031360 | elapsed time per iteration (s): 0.22 | learning rate: 1.038E-04 | global batch size: 256 | lm loss: 3.738527E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.486 | TFLOPs: 29.23 | +7: iteration 19980/ 37905 | consumed samples: 5114880 | consumed tokens: 10475274240 | elapsed time per iteration (s): 0.22 | learning rate: 1.037E-04 | global batch size: 256 | lm loss: 3.749877E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.601 | TFLOPs: 29.26 | +7: iteration 19990/ 37905 | consumed samples: 5117440 | consumed tokens: 10480517120 | elapsed time per iteration (s): 0.22 | learning rate: 1.036E-04 | global batch size: 256 | lm loss: 3.753282E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.621 | TFLOPs: 29.26 | +0: [2023-03-15 23:12:06,178] [INFO] [logging.py:68:log_dist] [Rank 0] step=20000, skipped=0, lr=[0.00010354137669750605, 0.00010354137669750605, 0.00010354137669750605], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 20000/ 37905 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 0.22 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.751834E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.798 | TFLOPs: 29.29 | +0: steps: 20000 loss: 3.7499 iter time (s): 0.224 samples/sec: 1141.316 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 20000 | lm loss value: 3.722880E+00 | lm loss PPL: 4.138342E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 20000 to checkpoints_83m20b400m +0: [2023-03-15 23:12:06,267] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step20000 is begin to save! +0: [2023-03-15 23:12:06,271] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:12:06,344] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:12:06,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:12:06,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:12:06,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:12:06,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:12:06,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:12:06,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:12:06,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:12:06,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:12:06,391] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:12:06,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:12:06,402] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:12:06,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:12:06,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:12:06,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:12:06,425] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:12:06,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:12:06,436] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:12:06,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:12:06,447] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:12:06,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:12:06,459] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:12:06,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:12:06,460] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step20000/mp_rank_00_model_states.pt +0: [2023-03-15 23:12:06,460] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:12:06,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +3: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +4: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +5: [2023-03-15 23:12:06,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +5: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +7: [2023-03-15 23:12:06,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +2: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:12:06,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:12:06,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +6: [2023-03-15 23:12:06,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:12:06,512] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step20000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:12:06,512] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step20000 is ready now! +0: successfully saved checkpoint at iteration 20000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 246.87 +7: iteration 20010/ 37905 | consumed samples: 5122560 | consumed tokens: 10491002880 | elapsed time per iteration (s): 0.26 | learning rate: 1.035E-04 | global batch size: 256 | lm loss: 3.752723E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1001.641 | TFLOPs: 25.52 | +7: iteration 20020/ 37905 | consumed samples: 5125120 | consumed tokens: 10496245760 | elapsed time per iteration (s): 0.22 | learning rate: 1.034E-04 | global batch size: 256 | lm loss: 3.740751E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.815 | TFLOPs: 29.29 | +7: iteration 20030/ 37905 | consumed samples: 5127680 | consumed tokens: 10501488640 | elapsed time per iteration (s): 0.22 | learning rate: 1.033E-04 | global batch size: 256 | lm loss: 3.740759E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.212 | TFLOPs: 29.28 | +7: iteration 20040/ 37905 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 0.22 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.747046E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.548 | TFLOPs: 29.26 | +7: iteration 20050/ 37905 | consumed samples: 5132800 | consumed tokens: 10511974400 | elapsed time per iteration (s): 0.22 | learning rate: 1.032E-04 | global batch size: 256 | lm loss: 3.749868E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.800 | TFLOPs: 29.27 | +7: iteration 20060/ 37905 | consumed samples: 5135360 | consumed tokens: 10517217280 | elapsed time per iteration (s): 0.23 | learning rate: 1.031E-04 | global batch size: 256 | lm loss: 3.761724E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.120 | TFLOPs: 28.87 | +7: iteration 20070/ 37905 | consumed samples: 5137920 | consumed tokens: 10522460160 | elapsed time per iteration (s): 0.22 | learning rate: 1.030E-04 | global batch size: 256 | lm loss: 3.753495E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.131 | TFLOPs: 29.22 | +7: iteration 20080/ 37905 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 0.22 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.761959E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.414 | TFLOPs: 29.21 | +7: iteration 20090/ 37905 | consumed samples: 5143040 | consumed tokens: 10532945920 | elapsed time per iteration (s): 0.23 | learning rate: 1.029E-04 | global batch size: 256 | lm loss: 3.749551E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.500 | TFLOPs: 28.60 | +7: iteration 20100/ 37905 | consumed samples: 5145600 | consumed tokens: 10538188800 | elapsed time per iteration (s): 0.22 | learning rate: 1.028E-04 | global batch size: 256 | lm loss: 3.739578E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.509 | TFLOPs: 29.28 | +7: iteration 20110/ 37905 | consumed samples: 5148160 | consumed tokens: 10543431680 | elapsed time per iteration (s): 0.22 | learning rate: 1.027E-04 | global batch size: 256 | lm loss: 3.763391E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.106 | TFLOPs: 29.30 | +7: iteration 20120/ 37905 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 0.22 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.744461E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.797 | TFLOPs: 29.29 | +7: iteration 20130/ 37905 | consumed samples: 5153280 | consumed tokens: 10553917440 | elapsed time per iteration (s): 0.22 | learning rate: 1.026E-04 | global batch size: 256 | lm loss: 3.727483E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.731 | TFLOPs: 29.34 | +7: iteration 20140/ 37905 | consumed samples: 5155840 | consumed tokens: 10559160320 | elapsed time per iteration (s): 0.22 | learning rate: 1.025E-04 | global batch size: 256 | lm loss: 3.744315E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.392 | TFLOPs: 29.05 | +7: iteration 20150/ 37905 | consumed samples: 5158400 | consumed tokens: 10564403200 | elapsed time per iteration (s): 0.22 | learning rate: 1.024E-04 | global batch size: 256 | lm loss: 3.742487E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.103 | TFLOPs: 29.32 | +7: iteration 20160/ 37905 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 0.22 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.740171E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.942 | TFLOPs: 29.32 | +7: iteration 20170/ 37905 | consumed samples: 5163520 | consumed tokens: 10574888960 | elapsed time per iteration (s): 0.22 | learning rate: 1.023E-04 | global batch size: 256 | lm loss: 3.752437E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.547 | TFLOPs: 29.28 | +7: iteration 20180/ 37905 | consumed samples: 5166080 | consumed tokens: 10580131840 | elapsed time per iteration (s): 0.22 | learning rate: 1.022E-04 | global batch size: 256 | lm loss: 3.760648E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.322 | TFLOPs: 29.28 | +7: iteration 20190/ 37905 | consumed samples: 5168640 | consumed tokens: 10585374720 | elapsed time per iteration (s): 0.22 | learning rate: 1.021E-04 | global batch size: 256 | lm loss: 3.754131E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.108 | TFLOPs: 29.27 | +7: iteration 20200/ 37905 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 0.22 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.744925E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.395 | TFLOPs: 29.26 | +7: iteration 20210/ 37905 | consumed samples: 5173760 | consumed tokens: 10595860480 | elapsed time per iteration (s): 0.22 | learning rate: 1.020E-04 | global batch size: 256 | lm loss: 3.753773E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.192 | TFLOPs: 29.30 | +7: iteration 20220/ 37905 | consumed samples: 5176320 | consumed tokens: 10601103360 | elapsed time per iteration (s): 0.22 | learning rate: 1.019E-04 | global batch size: 256 | lm loss: 3.740894E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.759 | TFLOPs: 29.29 | +7: iteration 20230/ 37905 | consumed samples: 5178880 | consumed tokens: 10606346240 | elapsed time per iteration (s): 0.22 | learning rate: 1.018E-04 | global batch size: 256 | lm loss: 3.758754E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.004 | TFLOPs: 29.30 | +7: iteration 20240/ 37905 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 0.22 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.739986E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.633 | TFLOPs: 29.29 | +7: iteration 20250/ 37905 | consumed samples: 5184000 | consumed tokens: 10616832000 | elapsed time per iteration (s): 0.22 | learning rate: 1.017E-04 | global batch size: 256 | lm loss: 3.751162E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.395 | TFLOPs: 29.31 | +7: iteration 20260/ 37905 | consumed samples: 5186560 | consumed tokens: 10622074880 | elapsed time per iteration (s): 0.22 | learning rate: 1.016E-04 | global batch size: 256 | lm loss: 3.747990E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.355 | TFLOPs: 29.28 | +7: iteration 20270/ 37905 | consumed samples: 5189120 | consumed tokens: 10627317760 | elapsed time per iteration (s): 0.22 | learning rate: 1.015E-04 | global batch size: 256 | lm loss: 3.748269E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.233 | TFLOPs: 29.30 | +7: iteration 20280/ 37905 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 0.22 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.756900E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.269 | TFLOPs: 29.23 | +7: iteration 20290/ 37905 | consumed samples: 5194240 | consumed tokens: 10637803520 | elapsed time per iteration (s): 0.22 | learning rate: 1.014E-04 | global batch size: 256 | lm loss: 3.750515E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.005 | TFLOPs: 29.25 | +7: iteration 20300/ 37905 | consumed samples: 5196800 | consumed tokens: 10643046400 | elapsed time per iteration (s): 0.23 | learning rate: 1.013E-04 | global batch size: 256 | lm loss: 3.737016E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.989 | TFLOPs: 28.46 | +7: iteration 20310/ 37905 | consumed samples: 5199360 | consumed tokens: 10648289280 | elapsed time per iteration (s): 0.22 | learning rate: 1.012E-04 | global batch size: 256 | lm loss: 3.754483E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.344 | TFLOPs: 29.28 | +7: iteration 20320/ 37905 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 0.22 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.737898E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.441 | TFLOPs: 29.26 | +7: iteration 20330/ 37905 | consumed samples: 5204480 | consumed tokens: 10658775040 | elapsed time per iteration (s): 0.22 | learning rate: 1.011E-04 | global batch size: 256 | lm loss: 3.740154E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.244 | TFLOPs: 29.25 | +7: iteration 20340/ 37905 | consumed samples: 5207040 | consumed tokens: 10664017920 | elapsed time per iteration (s): 0.22 | learning rate: 1.010E-04 | global batch size: 256 | lm loss: 3.742353E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.167 | TFLOPs: 29.28 | +7: iteration 20350/ 37905 | consumed samples: 5209600 | consumed tokens: 10669260800 | elapsed time per iteration (s): 0.22 | learning rate: 1.009E-04 | global batch size: 256 | lm loss: 3.742259E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.241 | TFLOPs: 29.25 | +7: iteration 20360/ 37905 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 0.22 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.749721E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.954 | TFLOPs: 29.27 | +7: iteration 20370/ 37905 | consumed samples: 5214720 | consumed tokens: 10679746560 | elapsed time per iteration (s): 0.22 | learning rate: 1.008E-04 | global batch size: 256 | lm loss: 3.758720E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.396 | TFLOPs: 29.28 | +7: iteration 20380/ 37905 | consumed samples: 5217280 | consumed tokens: 10684989440 | elapsed time per iteration (s): 0.22 | learning rate: 1.007E-04 | global batch size: 256 | lm loss: 3.749072E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.688 | TFLOPs: 29.24 | +7: iteration 20390/ 37905 | consumed samples: 5219840 | consumed tokens: 10690232320 | elapsed time per iteration (s): 0.22 | learning rate: 1.006E-04 | global batch size: 256 | lm loss: 3.746678E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.750 | TFLOPs: 29.26 | +7: iteration 20400/ 37905 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 0.22 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.761304E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.103 | TFLOPs: 29.27 | +7: iteration 20410/ 37905 | consumed samples: 5224960 | consumed tokens: 10700718080 | elapsed time per iteration (s): 0.22 | learning rate: 1.005E-04 | global batch size: 256 | lm loss: 3.734254E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.976 | TFLOPs: 29.27 | +7: iteration 20420/ 37905 | consumed samples: 5227520 | consumed tokens: 10705960960 | elapsed time per iteration (s): 0.22 | learning rate: 1.004E-04 | global batch size: 256 | lm loss: 3.745084E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.844 | TFLOPs: 29.27 | +7: iteration 20430/ 37905 | consumed samples: 5230080 | consumed tokens: 10711203840 | elapsed time per iteration (s): 0.22 | learning rate: 1.003E-04 | global batch size: 256 | lm loss: 3.744253E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.246 | TFLOPs: 29.28 | +7: iteration 20440/ 37905 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 0.22 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.739983E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.377 | TFLOPs: 29.31 | +7: iteration 20450/ 37905 | consumed samples: 5235200 | consumed tokens: 10721689600 | elapsed time per iteration (s): 0.22 | learning rate: 1.002E-04 | global batch size: 256 | lm loss: 3.742638E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.955 | TFLOPs: 29.37 | +7: iteration 20460/ 37905 | consumed samples: 5237760 | consumed tokens: 10726932480 | elapsed time per iteration (s): 0.22 | learning rate: 1.001E-04 | global batch size: 256 | lm loss: 3.745799E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.730 | TFLOPs: 29.37 | +7: iteration 20470/ 37905 | consumed samples: 5240320 | consumed tokens: 10732175360 | elapsed time per iteration (s): 0.22 | learning rate: 1.000E-04 | global batch size: 256 | lm loss: 3.747092E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.565 | TFLOPs: 29.36 | +7: iteration 20480/ 37905 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 0.22 | learning rate: 9.994E-05 | global batch size: 256 | lm loss: 3.752542E+00 | grad norm: 0.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.184 | TFLOPs: 29.38 | +7: iteration 20490/ 37905 | consumed samples: 5245440 | consumed tokens: 10742661120 | elapsed time per iteration (s): 0.22 | learning rate: 9.987E-05 | global batch size: 256 | lm loss: 3.742114E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.436 | TFLOPs: 29.38 | +7: iteration 20500/ 37905 | consumed samples: 5248000 | consumed tokens: 10747904000 | elapsed time per iteration (s): 0.22 | learning rate: 9.979E-05 | global batch size: 256 | lm loss: 3.759758E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.462 | TFLOPs: 29.36 | +7: iteration 20510/ 37905 | consumed samples: 5250560 | consumed tokens: 10753146880 | elapsed time per iteration (s): 0.22 | learning rate: 9.972E-05 | global batch size: 256 | lm loss: 3.744752E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.750 | TFLOPs: 29.39 | +7: iteration 20520/ 37905 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 0.22 | learning rate: 9.964E-05 | global batch size: 256 | lm loss: 3.751001E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.856 | TFLOPs: 29.34 | +7: iteration 20530/ 37905 | consumed samples: 5255680 | consumed tokens: 10763632640 | elapsed time per iteration (s): 0.22 | learning rate: 9.957E-05 | global batch size: 256 | lm loss: 3.745441E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.297 | TFLOPs: 29.38 | +7: iteration 20540/ 37905 | consumed samples: 5258240 | consumed tokens: 10768875520 | elapsed time per iteration (s): 0.22 | learning rate: 9.949E-05 | global batch size: 256 | lm loss: 3.744683E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.114 | TFLOPs: 29.38 | +7: iteration 20550/ 37905 | consumed samples: 5260800 | consumed tokens: 10774118400 | elapsed time per iteration (s): 0.22 | learning rate: 9.942E-05 | global batch size: 256 | lm loss: 3.755750E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.223 | TFLOPs: 29.38 | +7: iteration 20560/ 37905 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 0.22 | learning rate: 9.934E-05 | global batch size: 256 | lm loss: 3.742466E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.751 | TFLOPs: 29.37 | +7: iteration 20570/ 37905 | consumed samples: 5265920 | consumed tokens: 10784604160 | elapsed time per iteration (s): 0.22 | learning rate: 9.927E-05 | global batch size: 256 | lm loss: 3.746429E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.904 | TFLOPs: 29.40 | +7: iteration 20580/ 37905 | consumed samples: 5268480 | consumed tokens: 10789847040 | elapsed time per iteration (s): 0.22 | learning rate: 9.919E-05 | global batch size: 256 | lm loss: 3.740894E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.283 | TFLOPs: 29.38 | +7: iteration 20590/ 37905 | consumed samples: 5271040 | consumed tokens: 10795089920 | elapsed time per iteration (s): 0.22 | learning rate: 9.912E-05 | global batch size: 256 | lm loss: 3.743069E+00 | grad norm: 0.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.128 | TFLOPs: 29.38 | +7: iteration 20600/ 37905 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 0.22 | learning rate: 9.904E-05 | global batch size: 256 | lm loss: 3.746286E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.079 | TFLOPs: 29.37 | +7: iteration 20610/ 37905 | consumed samples: 5276160 | consumed tokens: 10805575680 | elapsed time per iteration (s): 0.22 | learning rate: 9.897E-05 | global batch size: 256 | lm loss: 3.731648E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.819 | TFLOPs: 29.39 | +7: iteration 20620/ 37905 | consumed samples: 5278720 | consumed tokens: 10810818560 | elapsed time per iteration (s): 0.22 | learning rate: 9.889E-05 | global batch size: 256 | lm loss: 3.736332E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.364 | TFLOPs: 29.36 | +7: iteration 20630/ 37905 | consumed samples: 5281280 | consumed tokens: 10816061440 | elapsed time per iteration (s): 0.22 | learning rate: 9.882E-05 | global batch size: 256 | lm loss: 3.741551E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.039 | TFLOPs: 29.37 | +7: iteration 20640/ 37905 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 0.22 | learning rate: 9.874E-05 | global batch size: 256 | lm loss: 3.751795E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.762 | TFLOPs: 29.39 | +7: iteration 20650/ 37905 | consumed samples: 5286400 | consumed tokens: 10826547200 | elapsed time per iteration (s): 0.22 | learning rate: 9.867E-05 | global batch size: 256 | lm loss: 3.753857E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.477 | TFLOPs: 29.38 | +7: iteration 20660/ 37905 | consumed samples: 5288960 | consumed tokens: 10831790080 | elapsed time per iteration (s): 0.22 | learning rate: 9.859E-05 | global batch size: 256 | lm loss: 3.746128E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.719 | TFLOPs: 29.39 | +7: iteration 20670/ 37905 | consumed samples: 5291520 | consumed tokens: 10837032960 | elapsed time per iteration (s): 0.22 | learning rate: 9.852E-05 | global batch size: 256 | lm loss: 3.742057E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.625 | TFLOPs: 29.39 | +7: iteration 20680/ 37905 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 0.22 | learning rate: 9.844E-05 | global batch size: 256 | lm loss: 3.752404E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.026 | TFLOPs: 29.30 | +7: iteration 20690/ 37905 | consumed samples: 5296640 | consumed tokens: 10847518720 | elapsed time per iteration (s): 0.22 | learning rate: 9.837E-05 | global batch size: 256 | lm loss: 3.746917E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.447 | TFLOPs: 29.31 | +7: iteration 20700/ 37905 | consumed samples: 5299200 | consumed tokens: 10852761600 | elapsed time per iteration (s): 0.22 | learning rate: 9.829E-05 | global batch size: 256 | lm loss: 3.745142E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.681 | TFLOPs: 29.31 | +7: iteration 20710/ 37905 | consumed samples: 5301760 | consumed tokens: 10858004480 | elapsed time per iteration (s): 0.22 | learning rate: 9.822E-05 | global batch size: 256 | lm loss: 3.730038E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.081 | TFLOPs: 29.32 | +7: iteration 20720/ 37905 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 0.22 | learning rate: 9.815E-05 | global batch size: 256 | lm loss: 3.737376E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.070 | TFLOPs: 29.32 | +7: iteration 20730/ 37905 | consumed samples: 5306880 | consumed tokens: 10868490240 | elapsed time per iteration (s): 0.22 | learning rate: 9.807E-05 | global batch size: 256 | lm loss: 3.741866E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.079 | TFLOPs: 29.32 | +7: iteration 20740/ 37905 | consumed samples: 5309440 | consumed tokens: 10873733120 | elapsed time per iteration (s): 0.22 | learning rate: 9.800E-05 | global batch size: 256 | lm loss: 3.745653E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.766 | TFLOPs: 29.34 | +7: iteration 20750/ 37905 | consumed samples: 5312000 | consumed tokens: 10878976000 | elapsed time per iteration (s): 0.22 | learning rate: 9.792E-05 | global batch size: 256 | lm loss: 3.748184E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.435 | TFLOPs: 29.33 | +7: iteration 20760/ 37905 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 0.22 | learning rate: 9.785E-05 | global batch size: 256 | lm loss: 3.744684E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.368 | TFLOPs: 29.31 | +7: iteration 20770/ 37905 | consumed samples: 5317120 | consumed tokens: 10889461760 | elapsed time per iteration (s): 0.22 | learning rate: 9.777E-05 | global batch size: 256 | lm loss: 3.751501E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.304 | TFLOPs: 29.33 | +7: iteration 20780/ 37905 | consumed samples: 5319680 | consumed tokens: 10894704640 | elapsed time per iteration (s): 0.22 | learning rate: 9.770E-05 | global batch size: 256 | lm loss: 3.724262E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.667 | TFLOPs: 29.31 | +7: iteration 20790/ 37905 | consumed samples: 5322240 | consumed tokens: 10899947520 | elapsed time per iteration (s): 0.22 | learning rate: 9.762E-05 | global batch size: 256 | lm loss: 3.747048E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.414 | TFLOPs: 29.31 | +7: iteration 20800/ 37905 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 0.22 | learning rate: 9.755E-05 | global batch size: 256 | lm loss: 3.745171E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.778 | TFLOPs: 29.29 | +7: iteration 20810/ 37905 | consumed samples: 5327360 | consumed tokens: 10910433280 | elapsed time per iteration (s): 0.22 | learning rate: 9.747E-05 | global batch size: 256 | lm loss: 3.745472E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.482 | TFLOPs: 29.31 | +7: iteration 20820/ 37905 | consumed samples: 5329920 | consumed tokens: 10915676160 | elapsed time per iteration (s): 0.22 | learning rate: 9.740E-05 | global batch size: 256 | lm loss: 3.755498E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.914 | TFLOPs: 29.32 | +7: iteration 20830/ 37905 | consumed samples: 5332480 | consumed tokens: 10920919040 | elapsed time per iteration (s): 0.22 | learning rate: 9.732E-05 | global batch size: 256 | lm loss: 3.754420E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.056 | TFLOPs: 29.32 | +7: iteration 20840/ 37905 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 0.22 | learning rate: 9.725E-05 | global batch size: 256 | lm loss: 3.740888E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.147 | TFLOPs: 29.33 | +7: iteration 20850/ 37905 | consumed samples: 5337600 | consumed tokens: 10931404800 | elapsed time per iteration (s): 0.22 | learning rate: 9.718E-05 | global batch size: 256 | lm loss: 3.744712E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.530 | TFLOPs: 29.31 | +7: iteration 20860/ 37905 | consumed samples: 5340160 | consumed tokens: 10936647680 | elapsed time per iteration (s): 0.22 | learning rate: 9.710E-05 | global batch size: 256 | lm loss: 3.738494E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.614 | TFLOPs: 29.31 | +7: iteration 20870/ 37905 | consumed samples: 5342720 | consumed tokens: 10941890560 | elapsed time per iteration (s): 0.22 | learning rate: 9.703E-05 | global batch size: 256 | lm loss: 3.736425E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.749 | TFLOPs: 29.32 | +7: iteration 20880/ 37905 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 0.22 | learning rate: 9.695E-05 | global batch size: 256 | lm loss: 3.744801E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.061 | TFLOPs: 29.32 | +7: iteration 20890/ 37905 | consumed samples: 5347840 | consumed tokens: 10952376320 | elapsed time per iteration (s): 0.22 | learning rate: 9.688E-05 | global batch size: 256 | lm loss: 3.735203E+00 | grad norm: 0.262 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.027 | TFLOPs: 29.35 | +7: iteration 20900/ 37905 | consumed samples: 5350400 | consumed tokens: 10957619200 | elapsed time per iteration (s): 0.22 | learning rate: 9.680E-05 | global batch size: 256 | lm loss: 3.742070E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.647 | TFLOPs: 29.34 | +7: iteration 20910/ 37905 | consumed samples: 5352960 | consumed tokens: 10962862080 | elapsed time per iteration (s): 0.22 | learning rate: 9.673E-05 | global batch size: 256 | lm loss: 3.733004E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.524 | TFLOPs: 29.34 | +7: iteration 20920/ 37905 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 0.22 | learning rate: 9.665E-05 | global batch size: 256 | lm loss: 3.728380E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.912 | TFLOPs: 29.32 | +7: iteration 20930/ 37905 | consumed samples: 5358080 | consumed tokens: 10973347840 | elapsed time per iteration (s): 0.22 | learning rate: 9.658E-05 | global batch size: 256 | lm loss: 3.742654E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.303 | TFLOPs: 29.30 | +7: iteration 20940/ 37905 | consumed samples: 5360640 | consumed tokens: 10978590720 | elapsed time per iteration (s): 0.22 | learning rate: 9.650E-05 | global batch size: 256 | lm loss: 3.753496E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.520 | TFLOPs: 29.28 | +7: iteration 20950/ 37905 | consumed samples: 5363200 | consumed tokens: 10983833600 | elapsed time per iteration (s): 0.22 | learning rate: 9.643E-05 | global batch size: 256 | lm loss: 3.743713E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.373 | TFLOPs: 29.28 | +7: iteration 20960/ 37905 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 0.22 | learning rate: 9.636E-05 | global batch size: 256 | lm loss: 3.729819E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.738 | TFLOPs: 29.24 | +7: iteration 20970/ 37905 | consumed samples: 5368320 | consumed tokens: 10994319360 | elapsed time per iteration (s): 0.22 | learning rate: 9.628E-05 | global batch size: 256 | lm loss: 3.735012E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.562 | TFLOPs: 29.31 | +7: iteration 20980/ 37905 | consumed samples: 5370880 | consumed tokens: 10999562240 | elapsed time per iteration (s): 0.22 | learning rate: 9.621E-05 | global batch size: 256 | lm loss: 3.747481E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.590 | TFLOPs: 29.36 | +7: iteration 20990/ 37905 | consumed samples: 5373440 | consumed tokens: 11004805120 | elapsed time per iteration (s): 0.22 | learning rate: 9.613E-05 | global batch size: 256 | lm loss: 3.744208E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.720 | TFLOPs: 29.37 | +7: iteration 21000/ 37905 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 0.22 | learning rate: 9.606E-05 | global batch size: 256 | lm loss: 3.734497E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.595 | TFLOPs: 29.39 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 21000 | lm loss value: 3.705594E+00 | lm loss PPL: 4.067420E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 21000 to checkpoints_83m20b400m +0: [2023-03-15 23:15:49,224] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step21000 is begin to save! +0: [2023-03-15 23:15:49,228] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:15:49,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:15:49,300] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:15:49,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:15:49,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:15:49,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:15:49,324] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:15:49,335] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:15:49,335] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:15:49,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:15:49,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:15:49,358] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:15:49,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:15:49,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:15:49,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:15:49,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:15:49,380] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:15:49,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:15:49,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:15:49,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:15:49,403] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:15:49,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:15:49,414] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:15:49,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:15:49,416] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step21000/mp_rank_00_model_states.pt +0: [2023-03-15 23:15:49,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:15:49,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:15:49,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:15:49,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,456] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,456] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +3: [2023-03-15 23:15:49,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,462] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +2: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +6: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +7: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +4: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,463] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,463] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +1: [2023-03-15 23:15:49,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:15:49,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:15:49,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +5: [2023-03-15 23:15:49,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:15:49,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step21000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:15:49,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step21000 is ready now! +0: successfully saved checkpoint at iteration 21000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 244.32 +7: iteration 21010/ 37905 | consumed samples: 5378560 | consumed tokens: 11015290880 | elapsed time per iteration (s): 0.25 | learning rate: 9.598E-05 | global batch size: 256 | lm loss: 3.748965E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1012.212 | TFLOPs: 25.79 | +7: iteration 21020/ 37905 | consumed samples: 5381120 | consumed tokens: 11020533760 | elapsed time per iteration (s): 0.22 | learning rate: 9.591E-05 | global batch size: 256 | lm loss: 3.750354E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.294 | TFLOPs: 29.33 | +7: iteration 21030/ 37905 | consumed samples: 5383680 | consumed tokens: 11025776640 | elapsed time per iteration (s): 0.22 | learning rate: 9.583E-05 | global batch size: 256 | lm loss: 3.744104E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.492 | TFLOPs: 29.26 | +7: iteration 21040/ 37905 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 0.22 | learning rate: 9.576E-05 | global batch size: 256 | lm loss: 3.743104E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.203 | TFLOPs: 29.30 | +7: iteration 21050/ 37905 | consumed samples: 5388800 | consumed tokens: 11036262400 | elapsed time per iteration (s): 0.22 | learning rate: 9.569E-05 | global batch size: 256 | lm loss: 3.736147E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.094 | TFLOPs: 29.30 | +7: iteration 21060/ 37905 | consumed samples: 5391360 | consumed tokens: 11041505280 | elapsed time per iteration (s): 0.22 | learning rate: 9.561E-05 | global batch size: 256 | lm loss: 3.721322E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.529 | TFLOPs: 29.13 | +7: iteration 21070/ 37905 | consumed samples: 5393920 | consumed tokens: 11046748160 | elapsed time per iteration (s): 0.22 | learning rate: 9.554E-05 | global batch size: 256 | lm loss: 3.746661E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.638 | TFLOPs: 29.36 | +7: iteration 21080/ 37905 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 0.22 | learning rate: 9.546E-05 | global batch size: 256 | lm loss: 3.723845E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.380 | TFLOPs: 29.36 | +7: iteration 21090/ 37905 | consumed samples: 5399040 | consumed tokens: 11057233920 | elapsed time per iteration (s): 0.22 | learning rate: 9.539E-05 | global batch size: 256 | lm loss: 3.745432E+00 | grad norm: 0.261 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.634 | TFLOPs: 29.16 | +7: iteration 21100/ 37905 | consumed samples: 5401600 | consumed tokens: 11062476800 | elapsed time per iteration (s): 0.22 | learning rate: 9.531E-05 | global batch size: 256 | lm loss: 3.738671E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.350 | TFLOPs: 29.38 | +7: iteration 21110/ 37905 | consumed samples: 5404160 | consumed tokens: 11067719680 | elapsed time per iteration (s): 0.22 | learning rate: 9.524E-05 | global batch size: 256 | lm loss: 3.739008E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.786 | TFLOPs: 29.06 | +7: iteration 21120/ 37905 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 0.22 | learning rate: 9.517E-05 | global batch size: 256 | lm loss: 3.744497E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.263 | TFLOPs: 29.33 | +7: iteration 21130/ 37905 | consumed samples: 5409280 | consumed tokens: 11078205440 | elapsed time per iteration (s): 0.23 | learning rate: 9.509E-05 | global batch size: 256 | lm loss: 3.741972E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.089 | TFLOPs: 28.94 | +7: iteration 21140/ 37905 | consumed samples: 5411840 | consumed tokens: 11083448320 | elapsed time per iteration (s): 0.22 | learning rate: 9.502E-05 | global batch size: 256 | lm loss: 3.737629E+00 | grad norm: 0.254 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.921 | TFLOPs: 28.99 | +7: iteration 21150/ 37905 | consumed samples: 5414400 | consumed tokens: 11088691200 | elapsed time per iteration (s): 0.22 | learning rate: 9.494E-05 | global batch size: 256 | lm loss: 3.734641E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.220 | TFLOPs: 29.35 | +7: iteration 21160/ 37905 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 0.22 | learning rate: 9.487E-05 | global batch size: 256 | lm loss: 3.747097E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.704 | TFLOPs: 29.37 | +7: iteration 21170/ 37905 | consumed samples: 5419520 | consumed tokens: 11099176960 | elapsed time per iteration (s): 0.22 | learning rate: 9.479E-05 | global batch size: 256 | lm loss: 3.740115E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.474 | TFLOPs: 29.33 | +7: iteration 21180/ 37905 | consumed samples: 5422080 | consumed tokens: 11104419840 | elapsed time per iteration (s): 0.22 | learning rate: 9.472E-05 | global batch size: 256 | lm loss: 3.754705E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.577 | TFLOPs: 29.36 | +7: iteration 21190/ 37905 | consumed samples: 5424640 | consumed tokens: 11109662720 | elapsed time per iteration (s): 0.22 | learning rate: 9.465E-05 | global batch size: 256 | lm loss: 3.739826E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.445 | TFLOPs: 29.28 | +7: iteration 21200/ 37905 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 0.22 | learning rate: 9.457E-05 | global batch size: 256 | lm loss: 3.741502E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.609 | TFLOPs: 29.31 | +7: iteration 21210/ 37905 | consumed samples: 5429760 | consumed tokens: 11120148480 | elapsed time per iteration (s): 0.22 | learning rate: 9.450E-05 | global batch size: 256 | lm loss: 3.750383E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.329 | TFLOPs: 29.30 | +7: iteration 21220/ 37905 | consumed samples: 5432320 | consumed tokens: 11125391360 | elapsed time per iteration (s): 0.22 | learning rate: 9.442E-05 | global batch size: 256 | lm loss: 3.743899E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.700 | TFLOPs: 29.29 | +7: iteration 21230/ 37905 | consumed samples: 5434880 | consumed tokens: 11130634240 | elapsed time per iteration (s): 0.22 | learning rate: 9.435E-05 | global batch size: 256 | lm loss: 3.736942E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.493 | TFLOPs: 29.03 | +7: iteration 21240/ 37905 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 0.22 | learning rate: 9.427E-05 | global batch size: 256 | lm loss: 3.742931E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.857 | TFLOPs: 29.29 | +7: iteration 21250/ 37905 | consumed samples: 5440000 | consumed tokens: 11141120000 | elapsed time per iteration (s): 0.22 | learning rate: 9.420E-05 | global batch size: 256 | lm loss: 3.726109E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.561 | TFLOPs: 29.29 | +7: iteration 21260/ 37905 | consumed samples: 5442560 | consumed tokens: 11146362880 | elapsed time per iteration (s): 0.22 | learning rate: 9.413E-05 | global batch size: 256 | lm loss: 3.741314E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.819 | TFLOPs: 29.32 | +7: iteration 21270/ 37905 | consumed samples: 5445120 | consumed tokens: 11151605760 | elapsed time per iteration (s): 0.22 | learning rate: 9.405E-05 | global batch size: 256 | lm loss: 3.740017E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.480 | TFLOPs: 29.10 | +7: iteration 21280/ 37905 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 0.22 | learning rate: 9.398E-05 | global batch size: 256 | lm loss: 3.755208E+00 | grad norm: 0.278 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.824 | TFLOPs: 29.32 | +7: iteration 21290/ 37905 | consumed samples: 5450240 | consumed tokens: 11162091520 | elapsed time per iteration (s): 0.22 | learning rate: 9.390E-05 | global batch size: 256 | lm loss: 3.725480E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.084 | TFLOPs: 29.30 | +7: iteration 21300/ 37905 | consumed samples: 5452800 | consumed tokens: 11167334400 | elapsed time per iteration (s): 0.22 | learning rate: 9.383E-05 | global batch size: 256 | lm loss: 3.735358E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.491 | TFLOPs: 29.31 | +7: iteration 21310/ 37905 | consumed samples: 5455360 | consumed tokens: 11172577280 | elapsed time per iteration (s): 0.22 | learning rate: 9.376E-05 | global batch size: 256 | lm loss: 3.736652E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.729 | TFLOPs: 29.29 | +7: iteration 21320/ 37905 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 0.22 | learning rate: 9.368E-05 | global batch size: 256 | lm loss: 3.735094E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.188 | TFLOPs: 29.28 | +7: iteration 21330/ 37905 | consumed samples: 5460480 | consumed tokens: 11183063040 | elapsed time per iteration (s): 0.22 | learning rate: 9.361E-05 | global batch size: 256 | lm loss: 3.723511E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.321 | TFLOPs: 29.28 | +7: iteration 21340/ 37905 | consumed samples: 5463040 | consumed tokens: 11188305920 | elapsed time per iteration (s): 0.22 | learning rate: 9.353E-05 | global batch size: 256 | lm loss: 3.746358E+00 | grad norm: 0.263 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.926 | TFLOPs: 29.29 | +7: iteration 21350/ 37905 | consumed samples: 5465600 | consumed tokens: 11193548800 | elapsed time per iteration (s): 0.22 | learning rate: 9.346E-05 | global batch size: 256 | lm loss: 3.736903E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.207 | TFLOPs: 29.28 | +7: iteration 21360/ 37905 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 0.22 | learning rate: 9.338E-05 | global batch size: 256 | lm loss: 3.754974E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.777 | TFLOPs: 29.29 | +7: iteration 21370/ 37905 | consumed samples: 5470720 | consumed tokens: 11204034560 | elapsed time per iteration (s): 0.22 | learning rate: 9.331E-05 | global batch size: 256 | lm loss: 3.720689E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.281 | TFLOPs: 29.30 | +7: iteration 21380/ 37905 | consumed samples: 5473280 | consumed tokens: 11209277440 | elapsed time per iteration (s): 0.22 | learning rate: 9.324E-05 | global batch size: 256 | lm loss: 3.741873E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.062 | TFLOPs: 29.30 | +7: iteration 21390/ 37905 | consumed samples: 5475840 | consumed tokens: 11214520320 | elapsed time per iteration (s): 0.22 | learning rate: 9.316E-05 | global batch size: 256 | lm loss: 3.739359E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.839 | TFLOPs: 29.29 | +7: iteration 21400/ 37905 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 0.22 | learning rate: 9.309E-05 | global batch size: 256 | lm loss: 3.738137E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.679 | TFLOPs: 29.31 | +7: iteration 21410/ 37905 | consumed samples: 5480960 | consumed tokens: 11225006080 | elapsed time per iteration (s): 0.22 | learning rate: 9.301E-05 | global batch size: 256 | lm loss: 3.756448E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.179 | TFLOPs: 29.33 | +7: iteration 21420/ 37905 | consumed samples: 5483520 | consumed tokens: 11230248960 | elapsed time per iteration (s): 0.22 | learning rate: 9.294E-05 | global batch size: 256 | lm loss: 3.742228E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.316 | TFLOPs: 29.28 | +7: iteration 21430/ 37905 | consumed samples: 5486080 | consumed tokens: 11235491840 | elapsed time per iteration (s): 0.22 | learning rate: 9.287E-05 | global batch size: 256 | lm loss: 3.726852E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.441 | TFLOPs: 29.26 | +7: iteration 21440/ 37905 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 0.22 | learning rate: 9.279E-05 | global batch size: 256 | lm loss: 3.736578E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.776 | TFLOPs: 29.27 | +7: iteration 21450/ 37905 | consumed samples: 5491200 | consumed tokens: 11245977600 | elapsed time per iteration (s): 0.22 | learning rate: 9.272E-05 | global batch size: 256 | lm loss: 3.732433E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.900 | TFLOPs: 29.27 | +7: iteration 21460/ 37905 | consumed samples: 5493760 | consumed tokens: 11251220480 | elapsed time per iteration (s): 0.22 | learning rate: 9.264E-05 | global batch size: 256 | lm loss: 3.744722E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.824 | TFLOPs: 29.27 | +7: iteration 21470/ 37905 | consumed samples: 5496320 | consumed tokens: 11256463360 | elapsed time per iteration (s): 0.22 | learning rate: 9.257E-05 | global batch size: 256 | lm loss: 3.727020E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.689 | TFLOPs: 29.29 | +7: iteration 21480/ 37905 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 0.22 | learning rate: 9.250E-05 | global batch size: 256 | lm loss: 3.739516E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.503 | TFLOPs: 29.31 | +7: iteration 21490/ 37905 | consumed samples: 5501440 | consumed tokens: 11266949120 | elapsed time per iteration (s): 0.22 | learning rate: 9.242E-05 | global batch size: 256 | lm loss: 3.743275E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.552 | TFLOPs: 29.34 | +7: iteration 21500/ 37905 | consumed samples: 5504000 | consumed tokens: 11272192000 | elapsed time per iteration (s): 0.22 | learning rate: 9.235E-05 | global batch size: 256 | lm loss: 3.748200E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.231 | TFLOPs: 29.33 | +7: iteration 21510/ 37905 | consumed samples: 5506560 | consumed tokens: 11277434880 | elapsed time per iteration (s): 0.23 | learning rate: 9.228E-05 | global batch size: 256 | lm loss: 3.736316E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.623 | TFLOPs: 28.90 | +7: iteration 21520/ 37905 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 0.22 | learning rate: 9.220E-05 | global batch size: 256 | lm loss: 3.744201E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.866 | TFLOPs: 29.32 | +7: iteration 21530/ 37905 | consumed samples: 5511680 | consumed tokens: 11287920640 | elapsed time per iteration (s): 0.22 | learning rate: 9.213E-05 | global batch size: 256 | lm loss: 3.748834E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.238 | TFLOPs: 29.33 | +7: iteration 21540/ 37905 | consumed samples: 5514240 | consumed tokens: 11293163520 | elapsed time per iteration (s): 0.22 | learning rate: 9.205E-05 | global batch size: 256 | lm loss: 3.741896E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.934 | TFLOPs: 29.29 | +7: iteration 21550/ 37905 | consumed samples: 5516800 | consumed tokens: 11298406400 | elapsed time per iteration (s): 0.22 | learning rate: 9.198E-05 | global batch size: 256 | lm loss: 3.736295E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.268 | TFLOPs: 29.30 | +7: iteration 21560/ 37905 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 0.22 | learning rate: 9.191E-05 | global batch size: 256 | lm loss: 3.740990E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.104 | TFLOPs: 29.30 | +7: iteration 21570/ 37905 | consumed samples: 5521920 | consumed tokens: 11308892160 | elapsed time per iteration (s): 0.22 | learning rate: 9.183E-05 | global batch size: 256 | lm loss: 3.748532E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.268 | TFLOPs: 29.30 | +7: iteration 21580/ 37905 | consumed samples: 5524480 | consumed tokens: 11314135040 | elapsed time per iteration (s): 0.22 | learning rate: 9.176E-05 | global batch size: 256 | lm loss: 3.746062E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.932 | TFLOPs: 29.32 | +7: iteration 21590/ 37905 | consumed samples: 5527040 | consumed tokens: 11319377920 | elapsed time per iteration (s): 0.22 | learning rate: 9.168E-05 | global batch size: 256 | lm loss: 3.736032E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.581 | TFLOPs: 29.31 | +7: iteration 21600/ 37905 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 0.22 | learning rate: 9.161E-05 | global batch size: 256 | lm loss: 3.731663E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.992 | TFLOPs: 29.32 | +7: iteration 21610/ 37905 | consumed samples: 5532160 | consumed tokens: 11329863680 | elapsed time per iteration (s): 0.22 | learning rate: 9.154E-05 | global batch size: 256 | lm loss: 3.738791E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.209 | TFLOPs: 29.30 | +7: iteration 21620/ 37905 | consumed samples: 5534720 | consumed tokens: 11335106560 | elapsed time per iteration (s): 0.22 | learning rate: 9.146E-05 | global batch size: 256 | lm loss: 3.740403E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.331 | TFLOPs: 29.20 | +7: iteration 21630/ 37905 | consumed samples: 5537280 | consumed tokens: 11340349440 | elapsed time per iteration (s): 0.23 | learning rate: 9.139E-05 | global batch size: 256 | lm loss: 3.750246E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.990 | TFLOPs: 28.89 | +7: iteration 21640/ 37905 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 0.22 | learning rate: 9.132E-05 | global batch size: 256 | lm loss: 3.742646E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.068 | TFLOPs: 29.07 | +7: iteration 21650/ 37905 | consumed samples: 5542400 | consumed tokens: 11350835200 | elapsed time per iteration (s): 0.22 | learning rate: 9.124E-05 | global batch size: 256 | lm loss: 3.747844E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.429 | TFLOPs: 29.31 | +7: iteration 21660/ 37905 | consumed samples: 5544960 | consumed tokens: 11356078080 | elapsed time per iteration (s): 0.22 | learning rate: 9.117E-05 | global batch size: 256 | lm loss: 3.741666E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.934 | TFLOPs: 29.32 | +7: iteration 21670/ 37905 | consumed samples: 5547520 | consumed tokens: 11361320960 | elapsed time per iteration (s): 0.22 | learning rate: 9.109E-05 | global batch size: 256 | lm loss: 3.724632E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.278 | TFLOPs: 29.10 | +7: iteration 21680/ 37905 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 0.22 | learning rate: 9.102E-05 | global batch size: 256 | lm loss: 3.741608E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.853 | TFLOPs: 29.32 | +7: iteration 21690/ 37905 | consumed samples: 5552640 | consumed tokens: 11371806720 | elapsed time per iteration (s): 0.22 | learning rate: 9.095E-05 | global batch size: 256 | lm loss: 3.740732E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.686 | TFLOPs: 29.03 | +7: iteration 21700/ 37905 | consumed samples: 5555200 | consumed tokens: 11377049600 | elapsed time per iteration (s): 0.22 | learning rate: 9.087E-05 | global batch size: 256 | lm loss: 3.726106E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.030 | TFLOPs: 29.04 | +7: iteration 21710/ 37905 | consumed samples: 5557760 | consumed tokens: 11382292480 | elapsed time per iteration (s): 0.22 | learning rate: 9.080E-05 | global batch size: 256 | lm loss: 3.725327E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.868 | TFLOPs: 29.32 | +7: iteration 21720/ 37905 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 0.23 | learning rate: 9.073E-05 | global batch size: 256 | lm loss: 3.739989E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.787 | TFLOPs: 28.91 | +7: iteration 21730/ 37905 | consumed samples: 5562880 | consumed tokens: 11392778240 | elapsed time per iteration (s): 0.23 | learning rate: 9.065E-05 | global batch size: 256 | lm loss: 3.731586E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.946 | TFLOPs: 28.89 | +7: iteration 21740/ 37905 | consumed samples: 5565440 | consumed tokens: 11398021120 | elapsed time per iteration (s): 0.22 | learning rate: 9.058E-05 | global batch size: 256 | lm loss: 3.742290E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.805 | TFLOPs: 29.29 | +7: iteration 21750/ 37905 | consumed samples: 5568000 | consumed tokens: 11403264000 | elapsed time per iteration (s): 0.23 | learning rate: 9.051E-05 | global batch size: 256 | lm loss: 3.742746E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.238 | TFLOPs: 28.87 | +7: iteration 21760/ 37905 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 0.22 | learning rate: 9.043E-05 | global batch size: 256 | lm loss: 3.721314E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.705 | TFLOPs: 29.31 | +7: iteration 21770/ 37905 | consumed samples: 5573120 | consumed tokens: 11413749760 | elapsed time per iteration (s): 0.22 | learning rate: 9.036E-05 | global batch size: 256 | lm loss: 3.725994E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.052 | TFLOPs: 29.30 | +7: iteration 21780/ 37905 | consumed samples: 5575680 | consumed tokens: 11418992640 | elapsed time per iteration (s): 0.22 | learning rate: 9.029E-05 | global batch size: 256 | lm loss: 3.744333E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.563 | TFLOPs: 29.29 | +7: iteration 21790/ 37905 | consumed samples: 5578240 | consumed tokens: 11424235520 | elapsed time per iteration (s): 0.22 | learning rate: 9.021E-05 | global batch size: 256 | lm loss: 3.736699E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.666 | TFLOPs: 29.31 | +7: iteration 21800/ 37905 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 0.22 | learning rate: 9.014E-05 | global batch size: 256 | lm loss: 3.752959E+00 | grad norm: 0.299 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.272 | TFLOPs: 29.30 | +7: iteration 21810/ 37905 | consumed samples: 5583360 | consumed tokens: 11434721280 | elapsed time per iteration (s): 0.23 | learning rate: 9.006E-05 | global batch size: 256 | lm loss: 3.738816E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.173 | TFLOPs: 28.92 | +7: iteration 21820/ 37905 | consumed samples: 5585920 | consumed tokens: 11439964160 | elapsed time per iteration (s): 0.23 | learning rate: 8.999E-05 | global batch size: 256 | lm loss: 3.724492E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.305 | TFLOPs: 28.95 | +7: iteration 21830/ 37905 | consumed samples: 5588480 | consumed tokens: 11445207040 | elapsed time per iteration (s): 0.22 | learning rate: 8.992E-05 | global batch size: 256 | lm loss: 3.732143E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.589 | TFLOPs: 29.11 | +7: iteration 21840/ 37905 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 0.22 | learning rate: 8.984E-05 | global batch size: 256 | lm loss: 3.734543E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.255 | TFLOPs: 29.33 | +7: iteration 21850/ 37905 | consumed samples: 5593600 | consumed tokens: 11455692800 | elapsed time per iteration (s): 0.23 | learning rate: 8.977E-05 | global batch size: 256 | lm loss: 3.739311E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.510 | TFLOPs: 28.93 | +7: iteration 21860/ 37905 | consumed samples: 5596160 | consumed tokens: 11460935680 | elapsed time per iteration (s): 0.22 | learning rate: 8.970E-05 | global batch size: 256 | lm loss: 3.739531E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.480 | TFLOPs: 29.33 | +7: iteration 21870/ 37905 | consumed samples: 5598720 | consumed tokens: 11466178560 | elapsed time per iteration (s): 0.22 | learning rate: 8.962E-05 | global batch size: 256 | lm loss: 3.739573E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.602 | TFLOPs: 29.34 | +7: iteration 21880/ 37905 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 0.22 | learning rate: 8.955E-05 | global batch size: 256 | lm loss: 3.720046E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.802 | TFLOPs: 29.09 | +7: iteration 21890/ 37905 | consumed samples: 5603840 | consumed tokens: 11476664320 | elapsed time per iteration (s): 0.22 | learning rate: 8.948E-05 | global batch size: 256 | lm loss: 3.741070E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.067 | TFLOPs: 29.32 | +7: iteration 21900/ 37905 | consumed samples: 5606400 | consumed tokens: 11481907200 | elapsed time per iteration (s): 0.22 | learning rate: 8.940E-05 | global batch size: 256 | lm loss: 3.725288E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.977 | TFLOPs: 29.32 | +7: iteration 21910/ 37905 | consumed samples: 5608960 | consumed tokens: 11487150080 | elapsed time per iteration (s): 0.23 | learning rate: 8.933E-05 | global batch size: 256 | lm loss: 3.724812E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.665 | TFLOPs: 28.88 | +7: iteration 21920/ 37905 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 0.22 | learning rate: 8.926E-05 | global batch size: 256 | lm loss: 3.732143E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.750 | TFLOPs: 29.26 | +7: iteration 21930/ 37905 | consumed samples: 5614080 | consumed tokens: 11497635840 | elapsed time per iteration (s): 0.22 | learning rate: 8.918E-05 | global batch size: 256 | lm loss: 3.735038E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.279 | TFLOPs: 29.33 | +7: iteration 21940/ 37905 | consumed samples: 5616640 | consumed tokens: 11502878720 | elapsed time per iteration (s): 0.22 | learning rate: 8.911E-05 | global batch size: 256 | lm loss: 3.728121E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.432 | TFLOPs: 29.33 | +7: iteration 21950/ 37905 | consumed samples: 5619200 | consumed tokens: 11508121600 | elapsed time per iteration (s): 0.22 | learning rate: 8.904E-05 | global batch size: 256 | lm loss: 3.721846E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.569 | TFLOPs: 29.31 | +7: iteration 21960/ 37905 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 0.22 | learning rate: 8.896E-05 | global batch size: 256 | lm loss: 3.735160E+00 | grad norm: 0.283 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.566 | TFLOPs: 29.11 | +7: iteration 21970/ 37905 | consumed samples: 5624320 | consumed tokens: 11518607360 | elapsed time per iteration (s): 0.22 | learning rate: 8.889E-05 | global batch size: 256 | lm loss: 3.746381E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.778 | TFLOPs: 29.34 | +7: iteration 21980/ 37905 | consumed samples: 5626880 | consumed tokens: 11523850240 | elapsed time per iteration (s): 0.22 | learning rate: 8.882E-05 | global batch size: 256 | lm loss: 3.731442E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.580 | TFLOPs: 29.34 | +7: iteration 21990/ 37905 | consumed samples: 5629440 | consumed tokens: 11529093120 | elapsed time per iteration (s): 0.22 | learning rate: 8.874E-05 | global batch size: 256 | lm loss: 3.719785E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.679 | TFLOPs: 29.31 | +0: [2023-03-15 23:19:32,517] [INFO] [logging.py:68:log_dist] [Rank 0] step=22000, skipped=0, lr=[8.86714956590028e-05, 8.86714956590028e-05, 8.86714956590028e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 22000/ 37905 | consumed samples: 5632000 | consumed tokens: 11534336000 | elapsed time per iteration (s): 0.22 | learning rate: 8.867E-05 | global batch size: 256 | lm loss: 3.735718E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.493 | TFLOPs: 29.28 | +0: steps: 22000 loss: 3.7733 iter time (s): 0.222 samples/sec: 1154.104 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 22000 | lm loss value: 3.704001E+00 | lm loss PPL: 4.060945E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 22000 to checkpoints_83m20b400m +0: [2023-03-15 23:19:32,606] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step22000 is begin to save! +0: [2023-03-15 23:19:32,609] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:19:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:19:32,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:19:32,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:19:32,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:19:32,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:19:32,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:19:32,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:19:32,716] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:19:32,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:19:32,727] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:19:32,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:19:32,738] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:19:32,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:19:32,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:19:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:19:32,761] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:19:32,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:19:32,772] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:19:32,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:19:32,783] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:19:32,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:19:32,794] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:19:32,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:19:32,796] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step22000/mp_rank_00_model_states.pt +0: [2023-03-15 23:19:32,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:19:32,798] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:19:32,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:19:32,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:19:32,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,837] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,837] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +6: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +1: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +2: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:19:32,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +7: [2023-03-15 23:19:32,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +5: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:19:32,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +3: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:19:32,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step22000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:19:32,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step22000 is ready now! +0: successfully saved checkpoint at iteration 22000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 243.83 +7: iteration 22010/ 37905 | consumed samples: 5634560 | consumed tokens: 11539578880 | elapsed time per iteration (s): 0.25 | learning rate: 8.860E-05 | global batch size: 256 | lm loss: 3.737257E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1004.834 | TFLOPs: 25.60 | +7: iteration 22020/ 37905 | consumed samples: 5637120 | consumed tokens: 11544821760 | elapsed time per iteration (s): 0.22 | learning rate: 8.853E-05 | global batch size: 256 | lm loss: 3.725282E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.111 | TFLOPs: 29.35 | +7: iteration 22030/ 37905 | consumed samples: 5639680 | consumed tokens: 11550064640 | elapsed time per iteration (s): 0.22 | learning rate: 8.845E-05 | global batch size: 256 | lm loss: 3.732216E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.841 | TFLOPs: 29.04 | +7: iteration 22040/ 37905 | consumed samples: 5642240 | consumed tokens: 11555307520 | elapsed time per iteration (s): 0.22 | learning rate: 8.838E-05 | global batch size: 256 | lm loss: 3.735748E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.110 | TFLOPs: 29.32 | +7: iteration 22050/ 37905 | consumed samples: 5644800 | consumed tokens: 11560550400 | elapsed time per iteration (s): 0.22 | learning rate: 8.831E-05 | global batch size: 256 | lm loss: 3.740740E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.314 | TFLOPs: 29.36 | +7: iteration 22060/ 37905 | consumed samples: 5647360 | consumed tokens: 11565793280 | elapsed time per iteration (s): 0.22 | learning rate: 8.823E-05 | global batch size: 256 | lm loss: 3.714016E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.243 | TFLOPs: 29.35 | +7: iteration 22070/ 37905 | consumed samples: 5649920 | consumed tokens: 11571036160 | elapsed time per iteration (s): 0.22 | learning rate: 8.816E-05 | global batch size: 256 | lm loss: 3.735246E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.418 | TFLOPs: 29.33 | +7: iteration 22080/ 37905 | consumed samples: 5652480 | consumed tokens: 11576279040 | elapsed time per iteration (s): 0.22 | learning rate: 8.809E-05 | global batch size: 256 | lm loss: 3.740915E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.077 | TFLOPs: 29.35 | +7: iteration 22090/ 37905 | consumed samples: 5655040 | consumed tokens: 11581521920 | elapsed time per iteration (s): 0.22 | learning rate: 8.801E-05 | global batch size: 256 | lm loss: 3.726738E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.825 | TFLOPs: 29.32 | +7: iteration 22100/ 37905 | consumed samples: 5657600 | consumed tokens: 11586764800 | elapsed time per iteration (s): 0.22 | learning rate: 8.794E-05 | global batch size: 256 | lm loss: 3.726037E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.428 | TFLOPs: 29.33 | +7: iteration 22110/ 37905 | consumed samples: 5660160 | consumed tokens: 11592007680 | elapsed time per iteration (s): 0.22 | learning rate: 8.787E-05 | global batch size: 256 | lm loss: 3.730499E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.890 | TFLOPs: 29.19 | +7: iteration 22120/ 37905 | consumed samples: 5662720 | consumed tokens: 11597250560 | elapsed time per iteration (s): 0.22 | learning rate: 8.779E-05 | global batch size: 256 | lm loss: 3.731612E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.722 | TFLOPs: 29.03 | +7: iteration 22130/ 37905 | consumed samples: 5665280 | consumed tokens: 11602493440 | elapsed time per iteration (s): 0.22 | learning rate: 8.772E-05 | global batch size: 256 | lm loss: 3.740410E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.229 | TFLOPs: 29.05 | +7: iteration 22140/ 37905 | consumed samples: 5667840 | consumed tokens: 11607736320 | elapsed time per iteration (s): 0.22 | learning rate: 8.765E-05 | global batch size: 256 | lm loss: 3.724032E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.687 | TFLOPs: 29.06 | +7: iteration 22150/ 37905 | consumed samples: 5670400 | consumed tokens: 11612979200 | elapsed time per iteration (s): 0.22 | learning rate: 8.758E-05 | global batch size: 256 | lm loss: 3.738741E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.776 | TFLOPs: 29.34 | +7: iteration 22160/ 37905 | consumed samples: 5672960 | consumed tokens: 11618222080 | elapsed time per iteration (s): 0.22 | learning rate: 8.750E-05 | global batch size: 256 | lm loss: 3.728386E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.070 | TFLOPs: 29.37 | +7: iteration 22170/ 37905 | consumed samples: 5675520 | consumed tokens: 11623464960 | elapsed time per iteration (s): 0.22 | learning rate: 8.743E-05 | global batch size: 256 | lm loss: 3.735920E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.217 | TFLOPs: 29.38 | +7: iteration 22180/ 37905 | consumed samples: 5678080 | consumed tokens: 11628707840 | elapsed time per iteration (s): 0.24 | learning rate: 8.736E-05 | global batch size: 256 | lm loss: 3.724907E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1049.520 | TFLOPs: 26.74 | +7: iteration 22190/ 37905 | consumed samples: 5680640 | consumed tokens: 11633950720 | elapsed time per iteration (s): 0.22 | learning rate: 8.728E-05 | global batch size: 256 | lm loss: 3.745565E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.971 | TFLOPs: 29.37 | +7: iteration 22200/ 37905 | consumed samples: 5683200 | consumed tokens: 11639193600 | elapsed time per iteration (s): 0.22 | learning rate: 8.721E-05 | global batch size: 256 | lm loss: 3.721010E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.372 | TFLOPs: 29.36 | +7: iteration 22210/ 37905 | consumed samples: 5685760 | consumed tokens: 11644436480 | elapsed time per iteration (s): 0.22 | learning rate: 8.714E-05 | global batch size: 256 | lm loss: 3.740420E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.607 | TFLOPs: 29.36 | +7: iteration 22220/ 37905 | consumed samples: 5688320 | consumed tokens: 11649679360 | elapsed time per iteration (s): 0.22 | learning rate: 8.706E-05 | global batch size: 256 | lm loss: 3.733820E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.664 | TFLOPs: 29.34 | +7: iteration 22230/ 37905 | consumed samples: 5690880 | consumed tokens: 11654922240 | elapsed time per iteration (s): 0.22 | learning rate: 8.699E-05 | global batch size: 256 | lm loss: 3.728536E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.216 | TFLOPs: 29.20 | +7: iteration 22240/ 37905 | consumed samples: 5693440 | consumed tokens: 11660165120 | elapsed time per iteration (s): 0.22 | learning rate: 8.692E-05 | global batch size: 256 | lm loss: 3.743998E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.138 | TFLOPs: 29.40 | +7: iteration 22250/ 37905 | consumed samples: 5696000 | consumed tokens: 11665408000 | elapsed time per iteration (s): 0.22 | learning rate: 8.685E-05 | global batch size: 256 | lm loss: 3.742564E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.269 | TFLOPs: 29.41 | +7: iteration 22260/ 37905 | consumed samples: 5698560 | consumed tokens: 11670650880 | elapsed time per iteration (s): 0.22 | learning rate: 8.677E-05 | global batch size: 256 | lm loss: 3.733044E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.624 | TFLOPs: 29.41 | +7: iteration 22270/ 37905 | consumed samples: 5701120 | consumed tokens: 11675893760 | elapsed time per iteration (s): 0.22 | learning rate: 8.670E-05 | global batch size: 256 | lm loss: 3.728663E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.928 | TFLOPs: 29.40 | +7: iteration 22280/ 37905 | consumed samples: 5703680 | consumed tokens: 11681136640 | elapsed time per iteration (s): 0.22 | learning rate: 8.663E-05 | global batch size: 256 | lm loss: 3.728597E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.821 | TFLOPs: 29.42 | +7: iteration 22290/ 37905 | consumed samples: 5706240 | consumed tokens: 11686379520 | elapsed time per iteration (s): 0.22 | learning rate: 8.656E-05 | global batch size: 256 | lm loss: 3.740164E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.238 | TFLOPs: 29.43 | +7: iteration 22300/ 37905 | consumed samples: 5708800 | consumed tokens: 11691622400 | elapsed time per iteration (s): 0.22 | learning rate: 8.648E-05 | global batch size: 256 | lm loss: 3.727358E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.979 | TFLOPs: 29.42 | +7: iteration 22310/ 37905 | consumed samples: 5711360 | consumed tokens: 11696865280 | elapsed time per iteration (s): 0.22 | learning rate: 8.641E-05 | global batch size: 256 | lm loss: 3.726726E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.438 | TFLOPs: 29.43 | +7: iteration 22320/ 37905 | consumed samples: 5713920 | consumed tokens: 11702108160 | elapsed time per iteration (s): 0.22 | learning rate: 8.634E-05 | global batch size: 256 | lm loss: 3.729139E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.685 | TFLOPs: 29.42 | +7: iteration 22330/ 37905 | consumed samples: 5716480 | consumed tokens: 11707351040 | elapsed time per iteration (s): 0.22 | learning rate: 8.626E-05 | global batch size: 256 | lm loss: 3.736536E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.876 | TFLOPs: 29.40 | +7: iteration 22340/ 37905 | consumed samples: 5719040 | consumed tokens: 11712593920 | elapsed time per iteration (s): 0.22 | learning rate: 8.619E-05 | global batch size: 256 | lm loss: 3.728356E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.981 | TFLOPs: 29.42 | +7: iteration 22350/ 37905 | consumed samples: 5721600 | consumed tokens: 11717836800 | elapsed time per iteration (s): 0.22 | learning rate: 8.612E-05 | global batch size: 256 | lm loss: 3.718515E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.076 | TFLOPs: 29.40 | +7: iteration 22360/ 37905 | consumed samples: 5724160 | consumed tokens: 11723079680 | elapsed time per iteration (s): 0.22 | learning rate: 8.605E-05 | global batch size: 256 | lm loss: 3.734008E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.281 | TFLOPs: 29.41 | +7: iteration 22370/ 37905 | consumed samples: 5726720 | consumed tokens: 11728322560 | elapsed time per iteration (s): 0.22 | learning rate: 8.597E-05 | global batch size: 256 | lm loss: 3.733004E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.355 | TFLOPs: 29.03 | +7: iteration 22380/ 37905 | consumed samples: 5729280 | consumed tokens: 11733565440 | elapsed time per iteration (s): 0.22 | learning rate: 8.590E-05 | global batch size: 256 | lm loss: 3.732373E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.372 | TFLOPs: 29.41 | +7: iteration 22390/ 37905 | consumed samples: 5731840 | consumed tokens: 11738808320 | elapsed time per iteration (s): 0.22 | learning rate: 8.583E-05 | global batch size: 256 | lm loss: 3.740661E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.698 | TFLOPs: 29.39 | +7: iteration 22400/ 37905 | consumed samples: 5734400 | consumed tokens: 11744051200 | elapsed time per iteration (s): 0.22 | learning rate: 8.576E-05 | global batch size: 256 | lm loss: 3.734222E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.938 | TFLOPs: 29.37 | +7: iteration 22410/ 37905 | consumed samples: 5736960 | consumed tokens: 11749294080 | elapsed time per iteration (s): 0.22 | learning rate: 8.568E-05 | global batch size: 256 | lm loss: 3.745377E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.714 | TFLOPs: 29.09 | +7: iteration 22420/ 37905 | consumed samples: 5739520 | consumed tokens: 11754536960 | elapsed time per iteration (s): 0.22 | learning rate: 8.561E-05 | global batch size: 256 | lm loss: 3.744331E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.730 | TFLOPs: 29.39 | +7: iteration 22430/ 37905 | consumed samples: 5742080 | consumed tokens: 11759779840 | elapsed time per iteration (s): 0.22 | learning rate: 8.554E-05 | global batch size: 256 | lm loss: 3.737562E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.878 | TFLOPs: 29.37 | +7: iteration 22440/ 37905 | consumed samples: 5744640 | consumed tokens: 11765022720 | elapsed time per iteration (s): 0.22 | learning rate: 8.547E-05 | global batch size: 256 | lm loss: 3.729062E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.850 | TFLOPs: 29.37 | +7: iteration 22450/ 37905 | consumed samples: 5747200 | consumed tokens: 11770265600 | elapsed time per iteration (s): 0.22 | learning rate: 8.539E-05 | global batch size: 256 | lm loss: 3.723543E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.944 | TFLOPs: 29.37 | +7: iteration 22460/ 37905 | consumed samples: 5749760 | consumed tokens: 11775508480 | elapsed time per iteration (s): 0.22 | learning rate: 8.532E-05 | global batch size: 256 | lm loss: 3.740691E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.416 | TFLOPs: 29.36 | +7: iteration 22470/ 37905 | consumed samples: 5752320 | consumed tokens: 11780751360 | elapsed time per iteration (s): 0.22 | learning rate: 8.525E-05 | global batch size: 256 | lm loss: 3.718370E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.957 | TFLOPs: 29.35 | +7: iteration 22480/ 37905 | consumed samples: 5754880 | consumed tokens: 11785994240 | elapsed time per iteration (s): 0.22 | learning rate: 8.518E-05 | global batch size: 256 | lm loss: 3.736794E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.241 | TFLOPs: 29.35 | +7: iteration 22490/ 37905 | consumed samples: 5757440 | consumed tokens: 11791237120 | elapsed time per iteration (s): 0.22 | learning rate: 8.510E-05 | global batch size: 256 | lm loss: 3.721564E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.168 | TFLOPs: 29.35 | +7: iteration 22500/ 37905 | consumed samples: 5760000 | consumed tokens: 11796480000 | elapsed time per iteration (s): 0.22 | learning rate: 8.503E-05 | global batch size: 256 | lm loss: 3.730295E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.401 | TFLOPs: 29.36 | +7: iteration 22510/ 37905 | consumed samples: 5762560 | consumed tokens: 11801722880 | elapsed time per iteration (s): 0.22 | learning rate: 8.496E-05 | global batch size: 256 | lm loss: 3.722949E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.132 | TFLOPs: 29.35 | +7: iteration 22520/ 37905 | consumed samples: 5765120 | consumed tokens: 11806965760 | elapsed time per iteration (s): 0.23 | learning rate: 8.489E-05 | global batch size: 256 | lm loss: 3.739509E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.613 | TFLOPs: 28.93 | +7: iteration 22530/ 37905 | consumed samples: 5767680 | consumed tokens: 11812208640 | elapsed time per iteration (s): 0.22 | learning rate: 8.481E-05 | global batch size: 256 | lm loss: 3.728955E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.028 | TFLOPs: 29.09 | +7: iteration 22540/ 37905 | consumed samples: 5770240 | consumed tokens: 11817451520 | elapsed time per iteration (s): 0.22 | learning rate: 8.474E-05 | global batch size: 256 | lm loss: 3.732293E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.389 | TFLOPs: 29.38 | +7: iteration 22550/ 37905 | consumed samples: 5772800 | consumed tokens: 11822694400 | elapsed time per iteration (s): 0.22 | learning rate: 8.467E-05 | global batch size: 256 | lm loss: 3.742157E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.770 | TFLOPs: 29.01 | +7: iteration 22560/ 37905 | consumed samples: 5775360 | consumed tokens: 11827937280 | elapsed time per iteration (s): 0.22 | learning rate: 8.460E-05 | global batch size: 256 | lm loss: 3.750744E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.532 | TFLOPs: 29.39 | +7: iteration 22570/ 37905 | consumed samples: 5777920 | consumed tokens: 11833180160 | elapsed time per iteration (s): 0.22 | learning rate: 8.452E-05 | global batch size: 256 | lm loss: 3.731605E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.277 | TFLOPs: 29.38 | +7: iteration 22580/ 37905 | consumed samples: 5780480 | consumed tokens: 11838423040 | elapsed time per iteration (s): 0.22 | learning rate: 8.445E-05 | global batch size: 256 | lm loss: 3.724939E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.322 | TFLOPs: 29.38 | +7: iteration 22590/ 37905 | consumed samples: 5783040 | consumed tokens: 11843665920 | elapsed time per iteration (s): 0.22 | learning rate: 8.438E-05 | global batch size: 256 | lm loss: 3.728984E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.791 | TFLOPs: 29.37 | +7: iteration 22600/ 37905 | consumed samples: 5785600 | consumed tokens: 11848908800 | elapsed time per iteration (s): 0.22 | learning rate: 8.431E-05 | global batch size: 256 | lm loss: 3.726973E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.325 | TFLOPs: 29.02 | +7: iteration 22610/ 37905 | consumed samples: 5788160 | consumed tokens: 11854151680 | elapsed time per iteration (s): 0.22 | learning rate: 8.424E-05 | global batch size: 256 | lm loss: 3.734206E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.718 | TFLOPs: 29.31 | +7: iteration 22620/ 37905 | consumed samples: 5790720 | consumed tokens: 11859394560 | elapsed time per iteration (s): 0.22 | learning rate: 8.416E-05 | global batch size: 256 | lm loss: 3.730679E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.622 | TFLOPs: 29.31 | +7: iteration 22630/ 37905 | consumed samples: 5793280 | consumed tokens: 11864637440 | elapsed time per iteration (s): 0.23 | learning rate: 8.409E-05 | global batch size: 256 | lm loss: 3.724668E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.421 | TFLOPs: 28.93 | +7: iteration 22640/ 37905 | consumed samples: 5795840 | consumed tokens: 11869880320 | elapsed time per iteration (s): 0.22 | learning rate: 8.402E-05 | global batch size: 256 | lm loss: 3.731590E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.124 | TFLOPs: 29.30 | +7: iteration 22650/ 37905 | consumed samples: 5798400 | consumed tokens: 11875123200 | elapsed time per iteration (s): 0.22 | learning rate: 8.395E-05 | global batch size: 256 | lm loss: 3.736640E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.264 | TFLOPs: 29.30 | +7: iteration 22660/ 37905 | consumed samples: 5800960 | consumed tokens: 11880366080 | elapsed time per iteration (s): 0.22 | learning rate: 8.388E-05 | global batch size: 256 | lm loss: 3.735188E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.084 | TFLOPs: 29.30 | +7: iteration 22670/ 37905 | consumed samples: 5803520 | consumed tokens: 11885608960 | elapsed time per iteration (s): 0.22 | learning rate: 8.380E-05 | global batch size: 256 | lm loss: 3.735075E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.574 | TFLOPs: 29.29 | +7: iteration 22680/ 37905 | consumed samples: 5806080 | consumed tokens: 11890851840 | elapsed time per iteration (s): 0.22 | learning rate: 8.373E-05 | global batch size: 256 | lm loss: 3.736261E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.817 | TFLOPs: 29.22 | +7: iteration 22690/ 37905 | consumed samples: 5808640 | consumed tokens: 11896094720 | elapsed time per iteration (s): 0.22 | learning rate: 8.366E-05 | global batch size: 256 | lm loss: 3.732933E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.431 | TFLOPs: 29.26 | +7: iteration 22700/ 37905 | consumed samples: 5811200 | consumed tokens: 11901337600 | elapsed time per iteration (s): 0.22 | learning rate: 8.359E-05 | global batch size: 256 | lm loss: 3.736683E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.751 | TFLOPs: 29.32 | +7: iteration 22710/ 37905 | consumed samples: 5813760 | consumed tokens: 11906580480 | elapsed time per iteration (s): 0.22 | learning rate: 8.352E-05 | global batch size: 256 | lm loss: 3.742607E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.369 | TFLOPs: 29.31 | +7: iteration 22720/ 37905 | consumed samples: 5816320 | consumed tokens: 11911823360 | elapsed time per iteration (s): 0.22 | learning rate: 8.344E-05 | global batch size: 256 | lm loss: 3.727884E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.653 | TFLOPs: 29.31 | +7: iteration 22730/ 37905 | consumed samples: 5818880 | consumed tokens: 11917066240 | elapsed time per iteration (s): 0.22 | learning rate: 8.337E-05 | global batch size: 256 | lm loss: 3.741560E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.972 | TFLOPs: 29.32 | +7: iteration 22740/ 37905 | consumed samples: 5821440 | consumed tokens: 11922309120 | elapsed time per iteration (s): 0.22 | learning rate: 8.330E-05 | global batch size: 256 | lm loss: 3.724053E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.371 | TFLOPs: 29.31 | +7: iteration 22750/ 37905 | consumed samples: 5824000 | consumed tokens: 11927552000 | elapsed time per iteration (s): 0.22 | learning rate: 8.323E-05 | global batch size: 256 | lm loss: 3.720208E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.786 | TFLOPs: 29.24 | +7: iteration 22760/ 37905 | consumed samples: 5826560 | consumed tokens: 11932794880 | elapsed time per iteration (s): 0.22 | learning rate: 8.316E-05 | global batch size: 256 | lm loss: 3.741518E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.544 | TFLOPs: 29.26 | +7: iteration 22770/ 37905 | consumed samples: 5829120 | consumed tokens: 11938037760 | elapsed time per iteration (s): 0.22 | learning rate: 8.308E-05 | global batch size: 256 | lm loss: 3.738697E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.122 | TFLOPs: 29.25 | +7: iteration 22780/ 37905 | consumed samples: 5831680 | consumed tokens: 11943280640 | elapsed time per iteration (s): 0.22 | learning rate: 8.301E-05 | global batch size: 256 | lm loss: 3.748215E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.382 | TFLOPs: 29.26 | +7: iteration 22790/ 37905 | consumed samples: 5834240 | consumed tokens: 11948523520 | elapsed time per iteration (s): 0.22 | learning rate: 8.294E-05 | global batch size: 256 | lm loss: 3.717891E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.135 | TFLOPs: 29.25 | +7: iteration 22800/ 37905 | consumed samples: 5836800 | consumed tokens: 11953766400 | elapsed time per iteration (s): 0.22 | learning rate: 8.287E-05 | global batch size: 256 | lm loss: 3.736288E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.995 | TFLOPs: 29.27 | +7: iteration 22810/ 37905 | consumed samples: 5839360 | consumed tokens: 11959009280 | elapsed time per iteration (s): 0.23 | learning rate: 8.280E-05 | global batch size: 256 | lm loss: 3.728342E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.436 | TFLOPs: 28.90 | +7: iteration 22820/ 37905 | consumed samples: 5841920 | consumed tokens: 11964252160 | elapsed time per iteration (s): 0.22 | learning rate: 8.272E-05 | global batch size: 256 | lm loss: 3.742711E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.151 | TFLOPs: 29.30 | +7: iteration 22830/ 37905 | consumed samples: 5844480 | consumed tokens: 11969495040 | elapsed time per iteration (s): 0.22 | learning rate: 8.265E-05 | global batch size: 256 | lm loss: 3.738273E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.359 | TFLOPs: 29.31 | +7: iteration 22840/ 37905 | consumed samples: 5847040 | consumed tokens: 11974737920 | elapsed time per iteration (s): 0.22 | learning rate: 8.258E-05 | global batch size: 256 | lm loss: 3.733879E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.647 | TFLOPs: 29.34 | +7: iteration 22850/ 37905 | consumed samples: 5849600 | consumed tokens: 11979980800 | elapsed time per iteration (s): 0.22 | learning rate: 8.251E-05 | global batch size: 256 | lm loss: 3.722662E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.887 | TFLOPs: 29.34 | +7: iteration 22860/ 37905 | consumed samples: 5852160 | consumed tokens: 11985223680 | elapsed time per iteration (s): 0.22 | learning rate: 8.244E-05 | global batch size: 256 | lm loss: 3.739253E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.664 | TFLOPs: 29.34 | +7: iteration 22870/ 37905 | consumed samples: 5854720 | consumed tokens: 11990466560 | elapsed time per iteration (s): 0.22 | learning rate: 8.237E-05 | global batch size: 256 | lm loss: 3.717262E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.322 | TFLOPs: 29.30 | +7: iteration 22880/ 37905 | consumed samples: 5857280 | consumed tokens: 11995709440 | elapsed time per iteration (s): 0.24 | learning rate: 8.229E-05 | global batch size: 256 | lm loss: 3.734711E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1087.032 | TFLOPs: 27.69 | +7: iteration 22890/ 37905 | consumed samples: 5859840 | consumed tokens: 12000952320 | elapsed time per iteration (s): 0.22 | learning rate: 8.222E-05 | global batch size: 256 | lm loss: 3.744928E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.256 | TFLOPs: 29.33 | +7: iteration 22900/ 37905 | consumed samples: 5862400 | consumed tokens: 12006195200 | elapsed time per iteration (s): 0.22 | learning rate: 8.215E-05 | global batch size: 256 | lm loss: 3.739912E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.257 | TFLOPs: 29.33 | +7: iteration 22910/ 37905 | consumed samples: 5864960 | consumed tokens: 12011438080 | elapsed time per iteration (s): 0.22 | learning rate: 8.208E-05 | global batch size: 256 | lm loss: 3.728310E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.533 | TFLOPs: 29.34 | +7: iteration 22920/ 37905 | consumed samples: 5867520 | consumed tokens: 12016680960 | elapsed time per iteration (s): 0.22 | learning rate: 8.201E-05 | global batch size: 256 | lm loss: 3.715976E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.954 | TFLOPs: 29.35 | +7: iteration 22930/ 37905 | consumed samples: 5870080 | consumed tokens: 12021923840 | elapsed time per iteration (s): 0.22 | learning rate: 8.194E-05 | global batch size: 256 | lm loss: 3.727594E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.883 | TFLOPs: 29.34 | +7: iteration 22940/ 37905 | consumed samples: 5872640 | consumed tokens: 12027166720 | elapsed time per iteration (s): 0.22 | learning rate: 8.186E-05 | global batch size: 256 | lm loss: 3.734597E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.694 | TFLOPs: 29.37 | +7: iteration 22950/ 37905 | consumed samples: 5875200 | consumed tokens: 12032409600 | elapsed time per iteration (s): 0.22 | learning rate: 8.179E-05 | global batch size: 256 | lm loss: 3.722516E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.002 | TFLOPs: 29.37 | +7: iteration 22960/ 37905 | consumed samples: 5877760 | consumed tokens: 12037652480 | elapsed time per iteration (s): 0.22 | learning rate: 8.172E-05 | global batch size: 256 | lm loss: 3.744490E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.079 | TFLOPs: 29.32 | +7: iteration 22970/ 37905 | consumed samples: 5880320 | consumed tokens: 12042895360 | elapsed time per iteration (s): 0.22 | learning rate: 8.165E-05 | global batch size: 256 | lm loss: 3.733450E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.682 | TFLOPs: 29.31 | +7: iteration 22980/ 37905 | consumed samples: 5882880 | consumed tokens: 12048138240 | elapsed time per iteration (s): 0.22 | learning rate: 8.158E-05 | global batch size: 256 | lm loss: 3.727502E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.822 | TFLOPs: 29.27 | +7: iteration 22990/ 37905 | consumed samples: 5885440 | consumed tokens: 12053381120 | elapsed time per iteration (s): 0.22 | learning rate: 8.151E-05 | global batch size: 256 | lm loss: 3.726921E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.473 | TFLOPs: 29.33 | +7: iteration 23000/ 37905 | consumed samples: 5888000 | consumed tokens: 12058624000 | elapsed time per iteration (s): 0.22 | learning rate: 8.143E-05 | global batch size: 256 | lm loss: 3.723304E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.529 | TFLOPs: 29.21 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 23000 | lm loss value: 3.699339E+00 | lm loss PPL: 4.042057E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 23000 to checkpoints_83m20b400m +0: [2023-03-15 23:23:15,843] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step23000 is begin to save! +0: [2023-03-15 23:23:15,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:23:15,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:23:15,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:23:15,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:23:15,931] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:23:15,942] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:23:15,942] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:23:15,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:23:15,953] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:23:15,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:23:15,964] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:23:15,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:23:15,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:23:15,986] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:23:15,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:23:15,997] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:23:15,998] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:23:16,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:23:16,009] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:23:16,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:23:16,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:23:16,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:23:16,031] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:23:16,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:23:16,032] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step23000/mp_rank_00_model_states.pt +0: [2023-03-15 23:23:16,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:23:16,034] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:23:16,051] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,074] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,074] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,078] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +5: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +2: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +6: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +5: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +6: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +1: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +1: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +4: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +3: [2023-03-15 23:23:16,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:23:16,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:23:16,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +7: [2023-03-15 23:23:16,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:23:16,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step23000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:23:16,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step23000 is ready now! +0: successfully saved checkpoint at iteration 23000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.86 +7: iteration 23010/ 37905 | consumed samples: 5890560 | consumed tokens: 12063866880 | elapsed time per iteration (s): 0.26 | learning rate: 8.136E-05 | global batch size: 256 | lm loss: 3.751936E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1000.863 | TFLOPs: 25.50 | +7: iteration 23020/ 37905 | consumed samples: 5893120 | consumed tokens: 12069109760 | elapsed time per iteration (s): 0.22 | learning rate: 8.129E-05 | global batch size: 256 | lm loss: 3.742461E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.947 | TFLOPs: 29.37 | +7: iteration 23030/ 37905 | consumed samples: 5895680 | consumed tokens: 12074352640 | elapsed time per iteration (s): 0.22 | learning rate: 8.122E-05 | global batch size: 256 | lm loss: 3.730898E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.555 | TFLOPs: 29.39 | +7: iteration 23040/ 37905 | consumed samples: 5898240 | consumed tokens: 12079595520 | elapsed time per iteration (s): 0.22 | learning rate: 8.115E-05 | global batch size: 256 | lm loss: 3.725974E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.185 | TFLOPs: 29.30 | +7: iteration 23050/ 37905 | consumed samples: 5900800 | consumed tokens: 12084838400 | elapsed time per iteration (s): 0.23 | learning rate: 8.108E-05 | global batch size: 256 | lm loss: 3.734351E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.328 | TFLOPs: 28.92 | +7: iteration 23060/ 37905 | consumed samples: 5903360 | consumed tokens: 12090081280 | elapsed time per iteration (s): 0.23 | learning rate: 8.101E-05 | global batch size: 256 | lm loss: 3.733246E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1094.318 | TFLOPs: 27.88 | +7: iteration 23070/ 37905 | consumed samples: 5905920 | consumed tokens: 12095324160 | elapsed time per iteration (s): 0.22 | learning rate: 8.094E-05 | global batch size: 256 | lm loss: 3.724047E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.167 | TFLOPs: 29.25 | +7: iteration 23080/ 37905 | consumed samples: 5908480 | consumed tokens: 12100567040 | elapsed time per iteration (s): 0.22 | learning rate: 8.086E-05 | global batch size: 256 | lm loss: 3.723043E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.235 | TFLOPs: 29.33 | +7: iteration 23090/ 37905 | consumed samples: 5911040 | consumed tokens: 12105809920 | elapsed time per iteration (s): 0.22 | learning rate: 8.079E-05 | global batch size: 256 | lm loss: 3.738988E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.884 | TFLOPs: 29.34 | +7: iteration 23100/ 37905 | consumed samples: 5913600 | consumed tokens: 12111052800 | elapsed time per iteration (s): 0.22 | learning rate: 8.072E-05 | global batch size: 256 | lm loss: 3.734302E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.245 | TFLOPs: 29.33 | +7: iteration 23110/ 37905 | consumed samples: 5916160 | consumed tokens: 12116295680 | elapsed time per iteration (s): 0.22 | learning rate: 8.065E-05 | global batch size: 256 | lm loss: 3.717657E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.914 | TFLOPs: 29.32 | +7: iteration 23120/ 37905 | consumed samples: 5918720 | consumed tokens: 12121538560 | elapsed time per iteration (s): 0.22 | learning rate: 8.058E-05 | global batch size: 256 | lm loss: 3.721438E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.041 | TFLOPs: 29.30 | +7: iteration 23130/ 37905 | consumed samples: 5921280 | consumed tokens: 12126781440 | elapsed time per iteration (s): 0.22 | learning rate: 8.051E-05 | global batch size: 256 | lm loss: 3.723851E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.192 | TFLOPs: 29.28 | +7: iteration 23140/ 37905 | consumed samples: 5923840 | consumed tokens: 12132024320 | elapsed time per iteration (s): 0.22 | learning rate: 8.044E-05 | global batch size: 256 | lm loss: 3.731308E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.215 | TFLOPs: 29.28 | +7: iteration 23150/ 37905 | consumed samples: 5926400 | consumed tokens: 12137267200 | elapsed time per iteration (s): 0.22 | learning rate: 8.037E-05 | global batch size: 256 | lm loss: 3.755894E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.384 | TFLOPs: 29.28 | +7: iteration 23160/ 37905 | consumed samples: 5928960 | consumed tokens: 12142510080 | elapsed time per iteration (s): 0.22 | learning rate: 8.029E-05 | global batch size: 256 | lm loss: 3.729486E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.218 | TFLOPs: 29.28 | +7: iteration 23170/ 37905 | consumed samples: 5931520 | consumed tokens: 12147752960 | elapsed time per iteration (s): 0.22 | learning rate: 8.022E-05 | global batch size: 256 | lm loss: 3.734496E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.842 | TFLOPs: 29.29 | +7: iteration 23180/ 37905 | consumed samples: 5934080 | consumed tokens: 12152995840 | elapsed time per iteration (s): 0.22 | learning rate: 8.015E-05 | global batch size: 256 | lm loss: 3.737698E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.834 | TFLOPs: 29.27 | +7: iteration 23190/ 37905 | consumed samples: 5936640 | consumed tokens: 12158238720 | elapsed time per iteration (s): 0.22 | learning rate: 8.008E-05 | global batch size: 256 | lm loss: 3.735647E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.887 | TFLOPs: 29.06 | +7: iteration 23200/ 37905 | consumed samples: 5939200 | consumed tokens: 12163481600 | elapsed time per iteration (s): 0.22 | learning rate: 8.001E-05 | global batch size: 256 | lm loss: 3.740279E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.533 | TFLOPs: 29.23 | +7: iteration 23210/ 37905 | consumed samples: 5941760 | consumed tokens: 12168724480 | elapsed time per iteration (s): 0.22 | learning rate: 7.994E-05 | global batch size: 256 | lm loss: 3.726693E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.105 | TFLOPs: 29.27 | +7: iteration 23220/ 37905 | consumed samples: 5944320 | consumed tokens: 12173967360 | elapsed time per iteration (s): 0.22 | learning rate: 7.987E-05 | global batch size: 256 | lm loss: 3.726876E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.936 | TFLOPs: 29.29 | +7: iteration 23230/ 37905 | consumed samples: 5946880 | consumed tokens: 12179210240 | elapsed time per iteration (s): 0.22 | learning rate: 7.980E-05 | global batch size: 256 | lm loss: 3.722173E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.718 | TFLOPs: 29.34 | +7: iteration 23240/ 37905 | consumed samples: 5949440 | consumed tokens: 12184453120 | elapsed time per iteration (s): 0.22 | learning rate: 7.973E-05 | global batch size: 256 | lm loss: 3.730934E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.770 | TFLOPs: 29.24 | +7: iteration 23250/ 37905 | consumed samples: 5952000 | consumed tokens: 12189696000 | elapsed time per iteration (s): 0.23 | learning rate: 7.965E-05 | global batch size: 256 | lm loss: 3.725500E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.949 | TFLOPs: 28.86 | +7: iteration 23260/ 37905 | consumed samples: 5954560 | consumed tokens: 12194938880 | elapsed time per iteration (s): 0.23 | learning rate: 7.958E-05 | global batch size: 256 | lm loss: 3.713635E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.811 | TFLOPs: 28.91 | +7: iteration 23270/ 37905 | consumed samples: 5957120 | consumed tokens: 12200181760 | elapsed time per iteration (s): 0.22 | learning rate: 7.951E-05 | global batch size: 256 | lm loss: 3.731423E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.407 | TFLOPs: 29.23 | +7: iteration 23280/ 37905 | consumed samples: 5959680 | consumed tokens: 12205424640 | elapsed time per iteration (s): 0.22 | learning rate: 7.944E-05 | global batch size: 256 | lm loss: 3.725785E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.261 | TFLOPs: 29.33 | +7: iteration 23290/ 37905 | consumed samples: 5962240 | consumed tokens: 12210667520 | elapsed time per iteration (s): 0.22 | learning rate: 7.937E-05 | global batch size: 256 | lm loss: 3.728261E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.763 | TFLOPs: 29.34 | +7: iteration 23300/ 37905 | consumed samples: 5964800 | consumed tokens: 12215910400 | elapsed time per iteration (s): 0.22 | learning rate: 7.930E-05 | global batch size: 256 | lm loss: 3.721252E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.002 | TFLOPs: 29.35 | +7: iteration 23310/ 37905 | consumed samples: 5967360 | consumed tokens: 12221153280 | elapsed time per iteration (s): 0.22 | learning rate: 7.923E-05 | global batch size: 256 | lm loss: 3.730756E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.653 | TFLOPs: 29.34 | +7: iteration 23320/ 37905 | consumed samples: 5969920 | consumed tokens: 12226396160 | elapsed time per iteration (s): 0.22 | learning rate: 7.916E-05 | global batch size: 256 | lm loss: 3.738372E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.692 | TFLOPs: 29.26 | +7: iteration 23330/ 37905 | consumed samples: 5972480 | consumed tokens: 12231639040 | elapsed time per iteration (s): 0.22 | learning rate: 7.909E-05 | global batch size: 256 | lm loss: 3.724193E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.255 | TFLOPs: 29.33 | +7: iteration 23340/ 37905 | consumed samples: 5975040 | consumed tokens: 12236881920 | elapsed time per iteration (s): 0.22 | learning rate: 7.902E-05 | global batch size: 256 | lm loss: 3.721547E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.765 | TFLOPs: 29.32 | +7: iteration 23350/ 37905 | consumed samples: 5977600 | consumed tokens: 12242124800 | elapsed time per iteration (s): 0.22 | learning rate: 7.895E-05 | global batch size: 256 | lm loss: 3.738362E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.213 | TFLOPs: 29.33 | +7: iteration 23360/ 37905 | consumed samples: 5980160 | consumed tokens: 12247367680 | elapsed time per iteration (s): 0.22 | learning rate: 7.888E-05 | global batch size: 256 | lm loss: 3.721259E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.925 | TFLOPs: 29.32 | +7: iteration 23370/ 37905 | consumed samples: 5982720 | consumed tokens: 12252610560 | elapsed time per iteration (s): 0.22 | learning rate: 7.881E-05 | global batch size: 256 | lm loss: 3.742204E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.955 | TFLOPs: 29.32 | +7: iteration 23380/ 37905 | consumed samples: 5985280 | consumed tokens: 12257853440 | elapsed time per iteration (s): 0.22 | learning rate: 7.873E-05 | global batch size: 256 | lm loss: 3.728425E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.664 | TFLOPs: 29.31 | +7: iteration 23390/ 37905 | consumed samples: 5987840 | consumed tokens: 12263096320 | elapsed time per iteration (s): 0.22 | learning rate: 7.866E-05 | global batch size: 256 | lm loss: 3.729158E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.081 | TFLOPs: 29.32 | +7: iteration 23400/ 37905 | consumed samples: 5990400 | consumed tokens: 12268339200 | elapsed time per iteration (s): 0.22 | learning rate: 7.859E-05 | global batch size: 256 | lm loss: 3.725161E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.442 | TFLOPs: 29.33 | +7: iteration 23410/ 37905 | consumed samples: 5992960 | consumed tokens: 12273582080 | elapsed time per iteration (s): 0.22 | learning rate: 7.852E-05 | global batch size: 256 | lm loss: 3.721380E+00 | grad norm: 0.335 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.264 | TFLOPs: 29.30 | +7: iteration 23420/ 37905 | consumed samples: 5995520 | consumed tokens: 12278824960 | elapsed time per iteration (s): 0.22 | learning rate: 7.845E-05 | global batch size: 256 | lm loss: 3.724035E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.325 | TFLOPs: 29.30 | +7: iteration 23430/ 37905 | consumed samples: 5998080 | consumed tokens: 12284067840 | elapsed time per iteration (s): 0.22 | learning rate: 7.838E-05 | global batch size: 256 | lm loss: 3.721801E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.103 | TFLOPs: 29.32 | +7: iteration 23440/ 37905 | consumed samples: 6000640 | consumed tokens: 12289310720 | elapsed time per iteration (s): 0.22 | learning rate: 7.831E-05 | global batch size: 256 | lm loss: 3.723996E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.276 | TFLOPs: 29.33 | +7: iteration 23450/ 37905 | consumed samples: 6003200 | consumed tokens: 12294553600 | elapsed time per iteration (s): 0.22 | learning rate: 7.824E-05 | global batch size: 256 | lm loss: 3.730611E+00 | grad norm: 0.257 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.610 | TFLOPs: 29.31 | +7: iteration 23460/ 37905 | consumed samples: 6005760 | consumed tokens: 12299796480 | elapsed time per iteration (s): 0.22 | learning rate: 7.817E-05 | global batch size: 256 | lm loss: 3.721492E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.361 | TFLOPs: 29.33 | +7: iteration 23470/ 37905 | consumed samples: 6008320 | consumed tokens: 12305039360 | elapsed time per iteration (s): 0.22 | learning rate: 7.810E-05 | global batch size: 256 | lm loss: 3.728302E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.605 | TFLOPs: 29.31 | +7: iteration 23480/ 37905 | consumed samples: 6010880 | consumed tokens: 12310282240 | elapsed time per iteration (s): 0.22 | learning rate: 7.803E-05 | global batch size: 256 | lm loss: 3.721828E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.775 | TFLOPs: 29.29 | +7: iteration 23490/ 37905 | consumed samples: 6013440 | consumed tokens: 12315525120 | elapsed time per iteration (s): 0.22 | learning rate: 7.796E-05 | global batch size: 256 | lm loss: 3.721397E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.395 | TFLOPs: 29.28 | +7: iteration 23500/ 37905 | consumed samples: 6016000 | consumed tokens: 12320768000 | elapsed time per iteration (s): 0.22 | learning rate: 7.789E-05 | global batch size: 256 | lm loss: 3.731273E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.029 | TFLOPs: 29.27 | +7: iteration 23510/ 37905 | consumed samples: 6018560 | consumed tokens: 12326010880 | elapsed time per iteration (s): 0.22 | learning rate: 7.782E-05 | global batch size: 256 | lm loss: 3.719687E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.358 | TFLOPs: 29.28 | +7: iteration 23520/ 37905 | consumed samples: 6021120 | consumed tokens: 12331253760 | elapsed time per iteration (s): 0.22 | learning rate: 7.775E-05 | global batch size: 256 | lm loss: 3.726237E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.990 | TFLOPs: 29.27 | +7: iteration 23530/ 37905 | consumed samples: 6023680 | consumed tokens: 12336496640 | elapsed time per iteration (s): 0.22 | learning rate: 7.768E-05 | global batch size: 256 | lm loss: 3.727859E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.337 | TFLOPs: 29.28 | +7: iteration 23540/ 37905 | consumed samples: 6026240 | consumed tokens: 12341739520 | elapsed time per iteration (s): 0.22 | learning rate: 7.761E-05 | global batch size: 256 | lm loss: 3.724528E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.333 | TFLOPs: 29.30 | +7: iteration 23550/ 37905 | consumed samples: 6028800 | consumed tokens: 12346982400 | elapsed time per iteration (s): 0.22 | learning rate: 7.754E-05 | global batch size: 256 | lm loss: 3.754194E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.804 | TFLOPs: 29.29 | +7: iteration 23560/ 37905 | consumed samples: 6031360 | consumed tokens: 12352225280 | elapsed time per iteration (s): 0.22 | learning rate: 7.747E-05 | global batch size: 256 | lm loss: 3.725661E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.794 | TFLOPs: 29.27 | +7: iteration 23570/ 37905 | consumed samples: 6033920 | consumed tokens: 12357468160 | elapsed time per iteration (s): 0.22 | learning rate: 7.740E-05 | global batch size: 256 | lm loss: 3.721560E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.481 | TFLOPs: 29.28 | +7: iteration 23580/ 37905 | consumed samples: 6036480 | consumed tokens: 12362711040 | elapsed time per iteration (s): 0.22 | learning rate: 7.733E-05 | global batch size: 256 | lm loss: 3.730137E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.387 | TFLOPs: 29.28 | +7: iteration 23590/ 37905 | consumed samples: 6039040 | consumed tokens: 12367953920 | elapsed time per iteration (s): 0.23 | learning rate: 7.726E-05 | global batch size: 256 | lm loss: 3.733062E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.688 | TFLOPs: 28.88 | +7: iteration 23600/ 37905 | consumed samples: 6041600 | consumed tokens: 12373196800 | elapsed time per iteration (s): 0.22 | learning rate: 7.719E-05 | global batch size: 256 | lm loss: 3.723900E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.550 | TFLOPs: 29.31 | +7: iteration 23610/ 37905 | consumed samples: 6044160 | consumed tokens: 12378439680 | elapsed time per iteration (s): 0.22 | learning rate: 7.712E-05 | global batch size: 256 | lm loss: 3.727967E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.325 | TFLOPs: 29.28 | +7: iteration 23620/ 37905 | consumed samples: 6046720 | consumed tokens: 12383682560 | elapsed time per iteration (s): 0.22 | learning rate: 7.705E-05 | global batch size: 256 | lm loss: 3.731975E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.537 | TFLOPs: 29.26 | +7: iteration 23630/ 37905 | consumed samples: 6049280 | consumed tokens: 12388925440 | elapsed time per iteration (s): 0.22 | learning rate: 7.698E-05 | global batch size: 256 | lm loss: 3.734785E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.986 | TFLOPs: 29.27 | +7: iteration 23640/ 37905 | consumed samples: 6051840 | consumed tokens: 12394168320 | elapsed time per iteration (s): 0.22 | learning rate: 7.691E-05 | global batch size: 256 | lm loss: 3.726128E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.324 | TFLOPs: 29.25 | +7: iteration 23650/ 37905 | consumed samples: 6054400 | consumed tokens: 12399411200 | elapsed time per iteration (s): 0.22 | learning rate: 7.684E-05 | global batch size: 256 | lm loss: 3.720100E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.946 | TFLOPs: 29.24 | +7: iteration 23660/ 37905 | consumed samples: 6056960 | consumed tokens: 12404654080 | elapsed time per iteration (s): 0.22 | learning rate: 7.677E-05 | global batch size: 256 | lm loss: 3.727408E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.960 | TFLOPs: 29.19 | +7: iteration 23670/ 37905 | consumed samples: 6059520 | consumed tokens: 12409896960 | elapsed time per iteration (s): 0.22 | learning rate: 7.670E-05 | global batch size: 256 | lm loss: 3.740627E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.178 | TFLOPs: 29.22 | +7: iteration 23680/ 37905 | consumed samples: 6062080 | consumed tokens: 12415139840 | elapsed time per iteration (s): 0.22 | learning rate: 7.663E-05 | global batch size: 256 | lm loss: 3.727016E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.668 | TFLOPs: 29.21 | +7: iteration 23690/ 37905 | consumed samples: 6064640 | consumed tokens: 12420382720 | elapsed time per iteration (s): 0.22 | learning rate: 7.656E-05 | global batch size: 256 | lm loss: 3.727837E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.578 | TFLOPs: 29.23 | +7: iteration 23700/ 37905 | consumed samples: 6067200 | consumed tokens: 12425625600 | elapsed time per iteration (s): 0.22 | learning rate: 7.649E-05 | global batch size: 256 | lm loss: 3.718122E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.795 | TFLOPs: 29.24 | +7: iteration 23710/ 37905 | consumed samples: 6069760 | consumed tokens: 12430868480 | elapsed time per iteration (s): 0.22 | learning rate: 7.642E-05 | global batch size: 256 | lm loss: 3.719016E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.665 | TFLOPs: 29.24 | +7: iteration 23720/ 37905 | consumed samples: 6072320 | consumed tokens: 12436111360 | elapsed time per iteration (s): 0.22 | learning rate: 7.635E-05 | global batch size: 256 | lm loss: 3.714782E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.545 | TFLOPs: 29.21 | +7: iteration 23730/ 37905 | consumed samples: 6074880 | consumed tokens: 12441354240 | elapsed time per iteration (s): 0.22 | learning rate: 7.628E-05 | global batch size: 256 | lm loss: 3.728066E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.566 | TFLOPs: 29.23 | +7: iteration 23740/ 37905 | consumed samples: 6077440 | consumed tokens: 12446597120 | elapsed time per iteration (s): 0.22 | learning rate: 7.621E-05 | global batch size: 256 | lm loss: 3.730573E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.540 | TFLOPs: 29.26 | +7: iteration 23750/ 37905 | consumed samples: 6080000 | consumed tokens: 12451840000 | elapsed time per iteration (s): 0.22 | learning rate: 7.614E-05 | global batch size: 256 | lm loss: 3.731257E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.747 | TFLOPs: 29.24 | +7: iteration 23760/ 37905 | consumed samples: 6082560 | consumed tokens: 12457082880 | elapsed time per iteration (s): 0.22 | learning rate: 7.607E-05 | global batch size: 256 | lm loss: 3.716886E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.225 | TFLOPs: 29.28 | +7: iteration 23770/ 37905 | consumed samples: 6085120 | consumed tokens: 12462325760 | elapsed time per iteration (s): 0.22 | learning rate: 7.600E-05 | global batch size: 256 | lm loss: 3.727202E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.692 | TFLOPs: 29.24 | +7: iteration 23780/ 37905 | consumed samples: 6087680 | consumed tokens: 12467568640 | elapsed time per iteration (s): 0.22 | learning rate: 7.593E-05 | global batch size: 256 | lm loss: 3.723603E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.272 | TFLOPs: 29.23 | +7: iteration 23790/ 37905 | consumed samples: 6090240 | consumed tokens: 12472811520 | elapsed time per iteration (s): 0.22 | learning rate: 7.586E-05 | global batch size: 256 | lm loss: 3.729347E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.621 | TFLOPs: 29.24 | +7: iteration 23800/ 37905 | consumed samples: 6092800 | consumed tokens: 12478054400 | elapsed time per iteration (s): 0.22 | learning rate: 7.579E-05 | global batch size: 256 | lm loss: 3.727822E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.544 | TFLOPs: 29.26 | +7: iteration 23810/ 37905 | consumed samples: 6095360 | consumed tokens: 12483297280 | elapsed time per iteration (s): 0.22 | learning rate: 7.572E-05 | global batch size: 256 | lm loss: 3.733101E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.066 | TFLOPs: 29.25 | +7: iteration 23820/ 37905 | consumed samples: 6097920 | consumed tokens: 12488540160 | elapsed time per iteration (s): 0.22 | learning rate: 7.565E-05 | global batch size: 256 | lm loss: 3.719972E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.611 | TFLOPs: 29.26 | +7: iteration 23830/ 37905 | consumed samples: 6100480 | consumed tokens: 12493783040 | elapsed time per iteration (s): 0.22 | learning rate: 7.558E-05 | global batch size: 256 | lm loss: 3.720368E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.738 | TFLOPs: 29.26 | +7: iteration 23840/ 37905 | consumed samples: 6103040 | consumed tokens: 12499025920 | elapsed time per iteration (s): 0.22 | learning rate: 7.551E-05 | global batch size: 256 | lm loss: 3.732592E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.207 | TFLOPs: 29.25 | +7: iteration 23850/ 37905 | consumed samples: 6105600 | consumed tokens: 12504268800 | elapsed time per iteration (s): 0.22 | learning rate: 7.544E-05 | global batch size: 256 | lm loss: 3.733545E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.848 | TFLOPs: 29.27 | +7: iteration 23860/ 37905 | consumed samples: 6108160 | consumed tokens: 12509511680 | elapsed time per iteration (s): 0.22 | learning rate: 7.537E-05 | global batch size: 256 | lm loss: 3.726162E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.938 | TFLOPs: 29.24 | +7: iteration 23870/ 37905 | consumed samples: 6110720 | consumed tokens: 12514754560 | elapsed time per iteration (s): 0.23 | learning rate: 7.530E-05 | global batch size: 256 | lm loss: 3.729499E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.968 | TFLOPs: 28.89 | +7: iteration 23880/ 37905 | consumed samples: 6113280 | consumed tokens: 12519997440 | elapsed time per iteration (s): 0.22 | learning rate: 7.523E-05 | global batch size: 256 | lm loss: 3.726918E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.991 | TFLOPs: 29.25 | +7: iteration 23890/ 37905 | consumed samples: 6115840 | consumed tokens: 12525240320 | elapsed time per iteration (s): 0.22 | learning rate: 7.516E-05 | global batch size: 256 | lm loss: 3.727089E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.122 | TFLOPs: 29.27 | +7: iteration 23900/ 37905 | consumed samples: 6118400 | consumed tokens: 12530483200 | elapsed time per iteration (s): 0.22 | learning rate: 7.509E-05 | global batch size: 256 | lm loss: 3.725262E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.162 | TFLOPs: 29.25 | +7: iteration 23910/ 37905 | consumed samples: 6120960 | consumed tokens: 12535726080 | elapsed time per iteration (s): 0.22 | learning rate: 7.502E-05 | global batch size: 256 | lm loss: 3.726635E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.987 | TFLOPs: 29.25 | +7: iteration 23920/ 37905 | consumed samples: 6123520 | consumed tokens: 12540968960 | elapsed time per iteration (s): 0.22 | learning rate: 7.495E-05 | global batch size: 256 | lm loss: 3.722284E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.960 | TFLOPs: 29.30 | +7: iteration 23930/ 37905 | consumed samples: 6126080 | consumed tokens: 12546211840 | elapsed time per iteration (s): 0.22 | learning rate: 7.488E-05 | global batch size: 256 | lm loss: 3.726690E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.241 | TFLOPs: 29.30 | +7: iteration 23940/ 37905 | consumed samples: 6128640 | consumed tokens: 12551454720 | elapsed time per iteration (s): 0.22 | learning rate: 7.481E-05 | global batch size: 256 | lm loss: 3.713368E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.983 | TFLOPs: 29.32 | +7: iteration 23950/ 37905 | consumed samples: 6131200 | consumed tokens: 12556697600 | elapsed time per iteration (s): 0.22 | learning rate: 7.474E-05 | global batch size: 256 | lm loss: 3.722862E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.906 | TFLOPs: 29.29 | +7: iteration 23960/ 37905 | consumed samples: 6133760 | consumed tokens: 12561940480 | elapsed time per iteration (s): 0.22 | learning rate: 7.468E-05 | global batch size: 256 | lm loss: 3.719070E+00 | grad norm: 0.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.685 | TFLOPs: 29.31 | +7: iteration 23970/ 37905 | consumed samples: 6136320 | consumed tokens: 12567183360 | elapsed time per iteration (s): 0.22 | learning rate: 7.461E-05 | global batch size: 256 | lm loss: 3.721712E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.750 | TFLOPs: 29.32 | +7: iteration 23980/ 37905 | consumed samples: 6138880 | consumed tokens: 12572426240 | elapsed time per iteration (s): 0.22 | learning rate: 7.454E-05 | global batch size: 256 | lm loss: 3.718307E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.714 | TFLOPs: 29.31 | +7: iteration 23990/ 37905 | consumed samples: 6141440 | consumed tokens: 12577669120 | elapsed time per iteration (s): 0.22 | learning rate: 7.447E-05 | global batch size: 256 | lm loss: 3.733181E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.073 | TFLOPs: 29.32 | +0: [2023-03-15 23:26:59,084] [INFO] [logging.py:68:log_dist] [Rank 0] step=24000, skipped=0, lr=[7.439815524319269e-05, 7.439815524319269e-05, 7.439815524319269e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 24000/ 37905 | consumed samples: 6144000 | consumed tokens: 12582912000 | elapsed time per iteration (s): 0.22 | learning rate: 7.440E-05 | global batch size: 256 | lm loss: 3.724394E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.458 | TFLOPs: 29.31 | +0: steps: 24000 loss: 3.7540 iter time (s): 0.222 samples/sec: 1153.339 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 24000 | lm loss value: 3.705614E+00 | lm loss PPL: 4.067503E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 24000 to checkpoints_83m20b400m +0: [2023-03-15 23:26:59,172] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step24000 is begin to save! +0: [2023-03-15 23:26:59,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:26:59,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:26:59,250] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:26:59,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:26:59,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:26:59,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:26:59,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:26:59,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:26:59,285] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:26:59,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:26:59,296] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:26:59,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:26:59,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:26:59,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:26:59,318] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:26:59,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:26:59,329] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:26:59,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:26:59,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:26:59,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:26:59,352] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:26:59,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:26:59,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:26:59,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:26:59,364] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step24000/mp_rank_00_model_states.pt +0: [2023-03-15 23:26:59,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:26:59,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:26:59,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:26:59,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:26:59,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +1: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:26:59,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +5: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +7: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +4: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:26:59,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +6: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +3: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:26:59,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +2: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:26:59,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step24000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:26:59,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step24000 is ready now! +0: successfully saved checkpoint at iteration 24000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 244.15 +7: iteration 24010/ 37905 | consumed samples: 6146560 | consumed tokens: 12588154880 | elapsed time per iteration (s): 0.25 | learning rate: 7.433E-05 | global batch size: 256 | lm loss: 3.721146E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1006.170 | TFLOPs: 25.63 | +7: iteration 24020/ 37905 | consumed samples: 6149120 | consumed tokens: 12593397760 | elapsed time per iteration (s): 0.22 | learning rate: 7.426E-05 | global batch size: 256 | lm loss: 3.725032E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.074 | TFLOPs: 29.37 | +7: iteration 24030/ 37905 | consumed samples: 6151680 | consumed tokens: 12598640640 | elapsed time per iteration (s): 0.22 | learning rate: 7.419E-05 | global batch size: 256 | lm loss: 3.725734E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.757 | TFLOPs: 29.37 | +7: iteration 24040/ 37905 | consumed samples: 6154240 | consumed tokens: 12603883520 | elapsed time per iteration (s): 0.22 | learning rate: 7.412E-05 | global batch size: 256 | lm loss: 3.724958E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.535 | TFLOPs: 29.36 | +7: iteration 24050/ 37905 | consumed samples: 6156800 | consumed tokens: 12609126400 | elapsed time per iteration (s): 0.22 | learning rate: 7.405E-05 | global batch size: 256 | lm loss: 3.727039E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.386 | TFLOPs: 29.15 | +7: iteration 24060/ 37905 | consumed samples: 6159360 | consumed tokens: 12614369280 | elapsed time per iteration (s): 0.22 | learning rate: 7.398E-05 | global batch size: 256 | lm loss: 3.725502E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.917 | TFLOPs: 29.37 | +7: iteration 24070/ 37905 | consumed samples: 6161920 | consumed tokens: 12619612160 | elapsed time per iteration (s): 0.22 | learning rate: 7.391E-05 | global batch size: 256 | lm loss: 3.719093E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.117 | TFLOPs: 29.38 | +7: iteration 24080/ 37905 | consumed samples: 6164480 | consumed tokens: 12624855040 | elapsed time per iteration (s): 0.22 | learning rate: 7.385E-05 | global batch size: 256 | lm loss: 3.725470E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.244 | TFLOPs: 29.35 | +7: iteration 24090/ 37905 | consumed samples: 6167040 | consumed tokens: 12630097920 | elapsed time per iteration (s): 0.22 | learning rate: 7.378E-05 | global batch size: 256 | lm loss: 3.720309E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.299 | TFLOPs: 29.33 | +7: iteration 24100/ 37905 | consumed samples: 6169600 | consumed tokens: 12635340800 | elapsed time per iteration (s): 0.22 | learning rate: 7.371E-05 | global batch size: 256 | lm loss: 3.717350E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.407 | TFLOPs: 29.33 | +7: iteration 24110/ 37905 | consumed samples: 6172160 | consumed tokens: 12640583680 | elapsed time per iteration (s): 0.22 | learning rate: 7.364E-05 | global batch size: 256 | lm loss: 3.729873E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.967 | TFLOPs: 29.32 | +7: iteration 24120/ 37905 | consumed samples: 6174720 | consumed tokens: 12645826560 | elapsed time per iteration (s): 0.22 | learning rate: 7.357E-05 | global batch size: 256 | lm loss: 3.723225E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.613 | TFLOPs: 29.34 | +7: iteration 24130/ 37905 | consumed samples: 6177280 | consumed tokens: 12651069440 | elapsed time per iteration (s): 0.22 | learning rate: 7.350E-05 | global batch size: 256 | lm loss: 3.727129E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.171 | TFLOPs: 29.35 | +7: iteration 24140/ 37905 | consumed samples: 6179840 | consumed tokens: 12656312320 | elapsed time per iteration (s): 0.22 | learning rate: 7.343E-05 | global batch size: 256 | lm loss: 3.729810E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.153 | TFLOPs: 29.25 | +7: iteration 24150/ 37905 | consumed samples: 6182400 | consumed tokens: 12661555200 | elapsed time per iteration (s): 0.22 | learning rate: 7.336E-05 | global batch size: 256 | lm loss: 3.722805E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.352 | TFLOPs: 29.33 | +7: iteration 24160/ 37905 | consumed samples: 6184960 | consumed tokens: 12666798080 | elapsed time per iteration (s): 0.22 | learning rate: 7.329E-05 | global batch size: 256 | lm loss: 3.720536E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.607 | TFLOPs: 29.31 | +7: iteration 24170/ 37905 | consumed samples: 6187520 | consumed tokens: 12672040960 | elapsed time per iteration (s): 0.22 | learning rate: 7.323E-05 | global batch size: 256 | lm loss: 3.718114E+00 | grad norm: 0.279 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.115 | TFLOPs: 29.32 | +7: iteration 24180/ 37905 | consumed samples: 6190080 | consumed tokens: 12677283840 | elapsed time per iteration (s): 0.22 | learning rate: 7.316E-05 | global batch size: 256 | lm loss: 3.737000E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.952 | TFLOPs: 29.32 | +7: iteration 24190/ 37905 | consumed samples: 6192640 | consumed tokens: 12682526720 | elapsed time per iteration (s): 0.22 | learning rate: 7.309E-05 | global batch size: 256 | lm loss: 3.724744E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.668 | TFLOPs: 29.31 | +7: iteration 24200/ 37905 | consumed samples: 6195200 | consumed tokens: 12687769600 | elapsed time per iteration (s): 0.22 | learning rate: 7.302E-05 | global batch size: 256 | lm loss: 3.709266E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.242 | TFLOPs: 29.30 | +7: iteration 24210/ 37905 | consumed samples: 6197760 | consumed tokens: 12693012480 | elapsed time per iteration (s): 0.22 | learning rate: 7.295E-05 | global batch size: 256 | lm loss: 3.730152E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.367 | TFLOPs: 29.33 | +7: iteration 24220/ 37905 | consumed samples: 6200320 | consumed tokens: 12698255360 | elapsed time per iteration (s): 0.22 | learning rate: 7.288E-05 | global batch size: 256 | lm loss: 3.729615E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.818 | TFLOPs: 29.32 | +7: iteration 24230/ 37905 | consumed samples: 6202880 | consumed tokens: 12703498240 | elapsed time per iteration (s): 0.22 | learning rate: 7.281E-05 | global batch size: 256 | lm loss: 3.718960E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.919 | TFLOPs: 29.32 | +7: iteration 24240/ 37905 | consumed samples: 6205440 | consumed tokens: 12708741120 | elapsed time per iteration (s): 0.22 | learning rate: 7.274E-05 | global batch size: 256 | lm loss: 3.738178E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.130 | TFLOPs: 29.30 | +7: iteration 24250/ 37905 | consumed samples: 6208000 | consumed tokens: 12713984000 | elapsed time per iteration (s): 0.22 | learning rate: 7.268E-05 | global batch size: 256 | lm loss: 3.716821E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.690 | TFLOPs: 29.31 | +7: iteration 24260/ 37905 | consumed samples: 6210560 | consumed tokens: 12719226880 | elapsed time per iteration (s): 0.22 | learning rate: 7.261E-05 | global batch size: 256 | lm loss: 3.724042E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.529 | TFLOPs: 29.28 | +7: iteration 24270/ 37905 | consumed samples: 6213120 | consumed tokens: 12724469760 | elapsed time per iteration (s): 0.22 | learning rate: 7.254E-05 | global batch size: 256 | lm loss: 3.718352E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.638 | TFLOPs: 29.31 | +7: iteration 24280/ 37905 | consumed samples: 6215680 | consumed tokens: 12729712640 | elapsed time per iteration (s): 0.22 | learning rate: 7.247E-05 | global batch size: 256 | lm loss: 3.713434E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.150 | TFLOPs: 29.30 | +7: iteration 24290/ 37905 | consumed samples: 6218240 | consumed tokens: 12734955520 | elapsed time per iteration (s): 0.22 | learning rate: 7.240E-05 | global batch size: 256 | lm loss: 3.730705E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.292 | TFLOPs: 29.30 | +7: iteration 24300/ 37905 | consumed samples: 6220800 | consumed tokens: 12740198400 | elapsed time per iteration (s): 0.22 | learning rate: 7.233E-05 | global batch size: 256 | lm loss: 3.730650E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.901 | TFLOPs: 29.29 | +7: iteration 24310/ 37905 | consumed samples: 6223360 | consumed tokens: 12745441280 | elapsed time per iteration (s): 0.22 | learning rate: 7.227E-05 | global batch size: 256 | lm loss: 3.726216E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.624 | TFLOPs: 29.31 | +7: iteration 24320/ 37905 | consumed samples: 6225920 | consumed tokens: 12750684160 | elapsed time per iteration (s): 0.22 | learning rate: 7.220E-05 | global batch size: 256 | lm loss: 3.719560E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.998 | TFLOPs: 29.30 | +7: iteration 24330/ 37905 | consumed samples: 6228480 | consumed tokens: 12755927040 | elapsed time per iteration (s): 0.22 | learning rate: 7.213E-05 | global batch size: 256 | lm loss: 3.716973E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.204 | TFLOPs: 29.30 | +7: iteration 24340/ 37905 | consumed samples: 6231040 | consumed tokens: 12761169920 | elapsed time per iteration (s): 0.22 | learning rate: 7.206E-05 | global batch size: 256 | lm loss: 3.725064E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.447 | TFLOPs: 29.31 | +7: iteration 24350/ 37905 | consumed samples: 6233600 | consumed tokens: 12766412800 | elapsed time per iteration (s): 0.22 | learning rate: 7.199E-05 | global batch size: 256 | lm loss: 3.733142E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.568 | TFLOPs: 29.31 | +7: iteration 24360/ 37905 | consumed samples: 6236160 | consumed tokens: 12771655680 | elapsed time per iteration (s): 0.22 | learning rate: 7.192E-05 | global batch size: 256 | lm loss: 3.722470E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.221 | TFLOPs: 29.30 | +7: iteration 24370/ 37905 | consumed samples: 6238720 | consumed tokens: 12776898560 | elapsed time per iteration (s): 0.22 | learning rate: 7.186E-05 | global batch size: 256 | lm loss: 3.721548E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.864 | TFLOPs: 29.32 | +7: iteration 24380/ 37905 | consumed samples: 6241280 | consumed tokens: 12782141440 | elapsed time per iteration (s): 0.22 | learning rate: 7.179E-05 | global batch size: 256 | lm loss: 3.730606E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.457 | TFLOPs: 29.31 | +7: iteration 24390/ 37905 | consumed samples: 6243840 | consumed tokens: 12787384320 | elapsed time per iteration (s): 0.22 | learning rate: 7.172E-05 | global batch size: 256 | lm loss: 3.726833E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.936 | TFLOPs: 29.32 | +7: iteration 24400/ 37905 | consumed samples: 6246400 | consumed tokens: 12792627200 | elapsed time per iteration (s): 0.23 | learning rate: 7.165E-05 | global batch size: 256 | lm loss: 3.723790E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.051 | TFLOPs: 28.86 | +7: iteration 24410/ 37905 | consumed samples: 6248960 | consumed tokens: 12797870080 | elapsed time per iteration (s): 0.23 | learning rate: 7.158E-05 | global batch size: 256 | lm loss: 3.721877E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.791 | TFLOPs: 28.86 | +7: iteration 24420/ 37905 | consumed samples: 6251520 | consumed tokens: 12803112960 | elapsed time per iteration (s): 0.22 | learning rate: 7.151E-05 | global batch size: 256 | lm loss: 3.724030E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.206 | TFLOPs: 29.33 | +7: iteration 24430/ 37905 | consumed samples: 6254080 | consumed tokens: 12808355840 | elapsed time per iteration (s): 0.22 | learning rate: 7.145E-05 | global batch size: 256 | lm loss: 3.710478E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.416 | TFLOPs: 29.33 | +7: iteration 24440/ 37905 | consumed samples: 6256640 | consumed tokens: 12813598720 | elapsed time per iteration (s): 0.22 | learning rate: 7.138E-05 | global batch size: 256 | lm loss: 3.715135E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.077 | TFLOPs: 29.32 | +7: iteration 24450/ 37905 | consumed samples: 6259200 | consumed tokens: 12818841600 | elapsed time per iteration (s): 0.22 | learning rate: 7.131E-05 | global batch size: 256 | lm loss: 3.715975E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.455 | TFLOPs: 29.33 | +7: iteration 24460/ 37905 | consumed samples: 6261760 | consumed tokens: 12824084480 | elapsed time per iteration (s): 0.22 | learning rate: 7.124E-05 | global batch size: 256 | lm loss: 3.724669E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.583 | TFLOPs: 29.36 | +7: iteration 24470/ 37905 | consumed samples: 6264320 | consumed tokens: 12829327360 | elapsed time per iteration (s): 0.22 | learning rate: 7.117E-05 | global batch size: 256 | lm loss: 3.722786E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.352 | TFLOPs: 29.31 | +7: iteration 24480/ 37905 | consumed samples: 6266880 | consumed tokens: 12834570240 | elapsed time per iteration (s): 0.22 | learning rate: 7.111E-05 | global batch size: 256 | lm loss: 3.720301E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.343 | TFLOPs: 29.28 | +7: iteration 24490/ 37905 | consumed samples: 6269440 | consumed tokens: 12839813120 | elapsed time per iteration (s): 0.22 | learning rate: 7.104E-05 | global batch size: 256 | lm loss: 3.715117E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.435 | TFLOPs: 29.31 | +7: iteration 24500/ 37905 | consumed samples: 6272000 | consumed tokens: 12845056000 | elapsed time per iteration (s): 0.22 | learning rate: 7.097E-05 | global batch size: 256 | lm loss: 3.698237E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.643 | TFLOPs: 29.29 | +7: iteration 24510/ 37905 | consumed samples: 6274560 | consumed tokens: 12850298880 | elapsed time per iteration (s): 0.22 | learning rate: 7.090E-05 | global batch size: 256 | lm loss: 3.729995E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.006 | TFLOPs: 29.35 | +7: iteration 24520/ 37905 | consumed samples: 6277120 | consumed tokens: 12855541760 | elapsed time per iteration (s): 0.22 | learning rate: 7.083E-05 | global batch size: 256 | lm loss: 3.716307E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.020 | TFLOPs: 29.37 | +7: iteration 24530/ 37905 | consumed samples: 6279680 | consumed tokens: 12860784640 | elapsed time per iteration (s): 0.22 | learning rate: 7.077E-05 | global batch size: 256 | lm loss: 3.714662E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.626 | TFLOPs: 29.29 | +7: iteration 24540/ 37905 | consumed samples: 6282240 | consumed tokens: 12866027520 | elapsed time per iteration (s): 0.22 | learning rate: 7.070E-05 | global batch size: 256 | lm loss: 3.722360E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.804 | TFLOPs: 29.29 | +7: iteration 24550/ 37905 | consumed samples: 6284800 | consumed tokens: 12871270400 | elapsed time per iteration (s): 0.22 | learning rate: 7.063E-05 | global batch size: 256 | lm loss: 3.734827E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.986 | TFLOPs: 29.30 | +7: iteration 24560/ 37905 | consumed samples: 6287360 | consumed tokens: 12876513280 | elapsed time per iteration (s): 0.22 | learning rate: 7.056E-05 | global batch size: 256 | lm loss: 3.729250E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.881 | TFLOPs: 29.29 | +7: iteration 24570/ 37905 | consumed samples: 6289920 | consumed tokens: 12881756160 | elapsed time per iteration (s): 0.22 | learning rate: 7.050E-05 | global batch size: 256 | lm loss: 3.731650E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.185 | TFLOPs: 29.33 | +7: iteration 24580/ 37905 | consumed samples: 6292480 | consumed tokens: 12886999040 | elapsed time per iteration (s): 0.22 | learning rate: 7.043E-05 | global batch size: 256 | lm loss: 3.719947E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.482 | TFLOPs: 29.33 | +7: iteration 24590/ 37905 | consumed samples: 6295040 | consumed tokens: 12892241920 | elapsed time per iteration (s): 0.22 | learning rate: 7.036E-05 | global batch size: 256 | lm loss: 3.723226E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.920 | TFLOPs: 29.32 | +7: iteration 24600/ 37905 | consumed samples: 6297600 | consumed tokens: 12897484800 | elapsed time per iteration (s): 0.22 | learning rate: 7.029E-05 | global batch size: 256 | lm loss: 3.719348E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.599 | TFLOPs: 29.31 | +7: iteration 24610/ 37905 | consumed samples: 6300160 | consumed tokens: 12902727680 | elapsed time per iteration (s): 0.22 | learning rate: 7.023E-05 | global batch size: 256 | lm loss: 3.734293E+00 | grad norm: 0.253 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.899 | TFLOPs: 29.22 | +7: iteration 24620/ 37905 | consumed samples: 6302720 | consumed tokens: 12907970560 | elapsed time per iteration (s): 0.22 | learning rate: 7.016E-05 | global batch size: 256 | lm loss: 3.719184E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.246 | TFLOPs: 29.02 | +7: iteration 24630/ 37905 | consumed samples: 6305280 | consumed tokens: 12913213440 | elapsed time per iteration (s): 0.22 | learning rate: 7.009E-05 | global batch size: 256 | lm loss: 3.721851E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.439 | TFLOPs: 29.03 | +7: iteration 24640/ 37905 | consumed samples: 6307840 | consumed tokens: 12918456320 | elapsed time per iteration (s): 0.22 | learning rate: 7.002E-05 | global batch size: 256 | lm loss: 3.729170E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.608 | TFLOPs: 29.34 | +7: iteration 24650/ 37905 | consumed samples: 6310400 | consumed tokens: 12923699200 | elapsed time per iteration (s): 0.22 | learning rate: 6.996E-05 | global batch size: 256 | lm loss: 3.725415E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.500 | TFLOPs: 29.33 | +7: iteration 24660/ 37905 | consumed samples: 6312960 | consumed tokens: 12928942080 | elapsed time per iteration (s): 0.22 | learning rate: 6.989E-05 | global batch size: 256 | lm loss: 3.730662E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.689 | TFLOPs: 29.36 | +7: iteration 24670/ 37905 | consumed samples: 6315520 | consumed tokens: 12934184960 | elapsed time per iteration (s): 0.22 | learning rate: 6.982E-05 | global batch size: 256 | lm loss: 3.717747E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.659 | TFLOPs: 29.34 | +7: iteration 24680/ 37905 | consumed samples: 6318080 | consumed tokens: 12939427840 | elapsed time per iteration (s): 0.22 | learning rate: 6.975E-05 | global batch size: 256 | lm loss: 3.717762E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.173 | TFLOPs: 29.33 | +7: iteration 24690/ 37905 | consumed samples: 6320640 | consumed tokens: 12944670720 | elapsed time per iteration (s): 0.22 | learning rate: 6.969E-05 | global batch size: 256 | lm loss: 3.719988E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.422 | TFLOPs: 29.33 | +7: iteration 24700/ 37905 | consumed samples: 6323200 | consumed tokens: 12949913600 | elapsed time per iteration (s): 0.22 | learning rate: 6.962E-05 | global batch size: 256 | lm loss: 3.717495E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.688 | TFLOPs: 29.39 | +7: iteration 24710/ 37905 | consumed samples: 6325760 | consumed tokens: 12955156480 | elapsed time per iteration (s): 0.22 | learning rate: 6.955E-05 | global batch size: 256 | lm loss: 3.715164E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.927 | TFLOPs: 29.42 | +7: iteration 24720/ 37905 | consumed samples: 6328320 | consumed tokens: 12960399360 | elapsed time per iteration (s): 0.22 | learning rate: 6.948E-05 | global batch size: 256 | lm loss: 3.722058E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.010 | TFLOPs: 29.19 | +7: iteration 24730/ 37905 | consumed samples: 6330880 | consumed tokens: 12965642240 | elapsed time per iteration (s): 0.22 | learning rate: 6.942E-05 | global batch size: 256 | lm loss: 3.717431E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.801 | TFLOPs: 29.37 | +7: iteration 24740/ 37905 | consumed samples: 6333440 | consumed tokens: 12970885120 | elapsed time per iteration (s): 0.22 | learning rate: 6.935E-05 | global batch size: 256 | lm loss: 3.729802E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.633 | TFLOPs: 29.39 | +7: iteration 24750/ 37905 | consumed samples: 6336000 | consumed tokens: 12976128000 | elapsed time per iteration (s): 0.22 | learning rate: 6.928E-05 | global batch size: 256 | lm loss: 3.723332E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.262 | TFLOPs: 29.43 | +7: iteration 24760/ 37905 | consumed samples: 6338560 | consumed tokens: 12981370880 | elapsed time per iteration (s): 0.22 | learning rate: 6.921E-05 | global batch size: 256 | lm loss: 3.718443E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.086 | TFLOPs: 29.43 | +7: iteration 24770/ 37905 | consumed samples: 6341120 | consumed tokens: 12986613760 | elapsed time per iteration (s): 0.22 | learning rate: 6.915E-05 | global batch size: 256 | lm loss: 3.732284E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.252 | TFLOPs: 29.43 | +7: iteration 24780/ 37905 | consumed samples: 6343680 | consumed tokens: 12991856640 | elapsed time per iteration (s): 0.22 | learning rate: 6.908E-05 | global batch size: 256 | lm loss: 3.725277E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.295 | TFLOPs: 29.41 | +7: iteration 24790/ 37905 | consumed samples: 6346240 | consumed tokens: 12997099520 | elapsed time per iteration (s): 0.22 | learning rate: 6.901E-05 | global batch size: 256 | lm loss: 3.728301E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.327 | TFLOPs: 29.36 | +7: iteration 24800/ 37905 | consumed samples: 6348800 | consumed tokens: 13002342400 | elapsed time per iteration (s): 0.22 | learning rate: 6.895E-05 | global batch size: 256 | lm loss: 3.710454E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.006 | TFLOPs: 29.35 | +7: iteration 24810/ 37905 | consumed samples: 6351360 | consumed tokens: 13007585280 | elapsed time per iteration (s): 0.22 | learning rate: 6.888E-05 | global batch size: 256 | lm loss: 3.719371E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.978 | TFLOPs: 29.32 | +7: iteration 24820/ 37905 | consumed samples: 6353920 | consumed tokens: 13012828160 | elapsed time per iteration (s): 0.22 | learning rate: 6.881E-05 | global batch size: 256 | lm loss: 3.724622E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.925 | TFLOPs: 29.32 | +7: iteration 24830/ 37905 | consumed samples: 6356480 | consumed tokens: 13018071040 | elapsed time per iteration (s): 0.22 | learning rate: 6.875E-05 | global batch size: 256 | lm loss: 3.722308E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.109 | TFLOPs: 29.30 | +7: iteration 24840/ 37905 | consumed samples: 6359040 | consumed tokens: 13023313920 | elapsed time per iteration (s): 0.22 | learning rate: 6.868E-05 | global batch size: 256 | lm loss: 3.719472E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.223 | TFLOPs: 29.33 | +7: iteration 24850/ 37905 | consumed samples: 6361600 | consumed tokens: 13028556800 | elapsed time per iteration (s): 0.22 | learning rate: 6.861E-05 | global batch size: 256 | lm loss: 3.719633E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.186 | TFLOPs: 29.30 | +7: iteration 24860/ 37905 | consumed samples: 6364160 | consumed tokens: 13033799680 | elapsed time per iteration (s): 0.22 | learning rate: 6.854E-05 | global batch size: 256 | lm loss: 3.717977E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.981 | TFLOPs: 29.35 | +7: iteration 24870/ 37905 | consumed samples: 6366720 | consumed tokens: 13039042560 | elapsed time per iteration (s): 0.22 | learning rate: 6.848E-05 | global batch size: 256 | lm loss: 3.723158E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.074 | TFLOPs: 29.37 | +7: iteration 24880/ 37905 | consumed samples: 6369280 | consumed tokens: 13044285440 | elapsed time per iteration (s): 0.22 | learning rate: 6.841E-05 | global batch size: 256 | lm loss: 3.715730E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.483 | TFLOPs: 29.33 | +7: iteration 24890/ 37905 | consumed samples: 6371840 | consumed tokens: 13049528320 | elapsed time per iteration (s): 0.22 | learning rate: 6.834E-05 | global batch size: 256 | lm loss: 3.724904E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.983 | TFLOPs: 29.35 | +7: iteration 24900/ 37905 | consumed samples: 6374400 | consumed tokens: 13054771200 | elapsed time per iteration (s): 0.22 | learning rate: 6.828E-05 | global batch size: 256 | lm loss: 3.719292E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.685 | TFLOPs: 29.34 | +7: iteration 24910/ 37905 | consumed samples: 6376960 | consumed tokens: 13060014080 | elapsed time per iteration (s): 0.22 | learning rate: 6.821E-05 | global batch size: 256 | lm loss: 3.726676E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.833 | TFLOPs: 29.34 | +7: iteration 24920/ 37905 | consumed samples: 6379520 | consumed tokens: 13065256960 | elapsed time per iteration (s): 0.22 | learning rate: 6.814E-05 | global batch size: 256 | lm loss: 3.714574E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.509 | TFLOPs: 29.33 | +7: iteration 24930/ 37905 | consumed samples: 6382080 | consumed tokens: 13070499840 | elapsed time per iteration (s): 0.22 | learning rate: 6.808E-05 | global batch size: 256 | lm loss: 3.718419E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.366 | TFLOPs: 29.33 | +7: iteration 24940/ 37905 | consumed samples: 6384640 | consumed tokens: 13075742720 | elapsed time per iteration (s): 0.22 | learning rate: 6.801E-05 | global batch size: 256 | lm loss: 3.730873E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.116 | TFLOPs: 29.30 | +7: iteration 24950/ 37905 | consumed samples: 6387200 | consumed tokens: 13080985600 | elapsed time per iteration (s): 0.22 | learning rate: 6.794E-05 | global batch size: 256 | lm loss: 3.714442E+00 | grad norm: 0.234 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.659 | TFLOPs: 29.31 | +7: iteration 24960/ 37905 | consumed samples: 6389760 | consumed tokens: 13086228480 | elapsed time per iteration (s): 0.22 | learning rate: 6.788E-05 | global batch size: 256 | lm loss: 3.717032E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.398 | TFLOPs: 29.31 | +7: iteration 24970/ 37905 | consumed samples: 6392320 | consumed tokens: 13091471360 | elapsed time per iteration (s): 0.22 | learning rate: 6.781E-05 | global batch size: 256 | lm loss: 3.714894E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.700 | TFLOPs: 29.29 | +7: iteration 24980/ 37905 | consumed samples: 6394880 | consumed tokens: 13096714240 | elapsed time per iteration (s): 0.22 | learning rate: 6.774E-05 | global batch size: 256 | lm loss: 3.726907E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.196 | TFLOPs: 29.33 | +7: iteration 24990/ 37905 | consumed samples: 6397440 | consumed tokens: 13101957120 | elapsed time per iteration (s): 0.22 | learning rate: 6.768E-05 | global batch size: 256 | lm loss: 3.715639E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.852 | TFLOPs: 29.29 | +7: iteration 25000/ 37905 | consumed samples: 6400000 | consumed tokens: 13107200000 | elapsed time per iteration (s): 0.22 | learning rate: 6.761E-05 | global batch size: 256 | lm loss: 3.735238E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.279 | TFLOPs: 29.33 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 25000 | lm loss value: 3.660262E+00 | lm loss PPL: 3.887153E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 25000 to checkpoints_83m20b400m +0: [2023-03-15 23:30:41,980] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step25000 is begin to save! +0: [2023-03-15 23:30:41,984] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:30:42,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:30:42,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:30:42,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:30:42,068] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:30:42,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:30:42,079] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:30:42,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:30:42,090] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:30:42,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:30:42,101] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:30:42,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:30:42,112] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:30:42,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:30:42,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:30:42,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:30:42,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:30:42,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:30:42,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:30:42,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:30:42,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:30:42,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:30:42,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:30:42,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:30:42,169] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step25000/mp_rank_00_model_states.pt +0: [2023-03-15 23:30:42,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:30:42,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:30:42,188] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:30:42,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,216] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,216] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +2: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,217] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +2: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,217] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +5: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +3: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +4: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +1: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:30:42,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:30:42,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: [2023-03-15 23:30:42,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:30:42,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:30:42,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +7: [2023-03-15 23:30:42,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:30:42,220] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:30:42,220] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +6: [2023-03-15 23:30:42,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:30:42,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step25000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:30:42,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step25000 is ready now! +0: successfully saved checkpoint at iteration 25000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 247.34 +7: iteration 25010/ 37905 | consumed samples: 6402560 | consumed tokens: 13112442880 | elapsed time per iteration (s): 0.25 | learning rate: 6.754E-05 | global batch size: 256 | lm loss: 3.720374E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1011.091 | TFLOPs: 25.76 | +7: iteration 25020/ 37905 | consumed samples: 6405120 | consumed tokens: 13117685760 | elapsed time per iteration (s): 0.22 | learning rate: 6.748E-05 | global batch size: 256 | lm loss: 3.718114E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.309 | TFLOPs: 29.33 | +7: iteration 25030/ 37905 | consumed samples: 6407680 | consumed tokens: 13122928640 | elapsed time per iteration (s): 0.22 | learning rate: 6.741E-05 | global batch size: 256 | lm loss: 3.722953E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.968 | TFLOPs: 29.32 | +7: iteration 25040/ 37905 | consumed samples: 6410240 | consumed tokens: 13128171520 | elapsed time per iteration (s): 0.22 | learning rate: 6.735E-05 | global batch size: 256 | lm loss: 3.720059E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.813 | TFLOPs: 29.34 | +7: iteration 25050/ 37905 | consumed samples: 6412800 | consumed tokens: 13133414400 | elapsed time per iteration (s): 0.23 | learning rate: 6.728E-05 | global batch size: 256 | lm loss: 3.737077E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.433 | TFLOPs: 28.85 | +7: iteration 25060/ 37905 | consumed samples: 6415360 | consumed tokens: 13138657280 | elapsed time per iteration (s): 0.22 | learning rate: 6.721E-05 | global batch size: 256 | lm loss: 3.722344E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.119 | TFLOPs: 29.30 | +7: iteration 25070/ 37905 | consumed samples: 6417920 | consumed tokens: 13143900160 | elapsed time per iteration (s): 0.22 | learning rate: 6.715E-05 | global batch size: 256 | lm loss: 3.709147E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.254 | TFLOPs: 29.33 | +7: iteration 25080/ 37905 | consumed samples: 6420480 | consumed tokens: 13149143040 | elapsed time per iteration (s): 0.22 | learning rate: 6.708E-05 | global batch size: 256 | lm loss: 3.721699E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.519 | TFLOPs: 29.31 | +7: iteration 25090/ 37905 | consumed samples: 6423040 | consumed tokens: 13154385920 | elapsed time per iteration (s): 0.22 | learning rate: 6.701E-05 | global batch size: 256 | lm loss: 3.710751E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.827 | TFLOPs: 29.32 | +7: iteration 25100/ 37905 | consumed samples: 6425600 | consumed tokens: 13159628800 | elapsed time per iteration (s): 0.22 | learning rate: 6.695E-05 | global batch size: 256 | lm loss: 3.719374E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.701 | TFLOPs: 29.29 | +7: iteration 25110/ 37905 | consumed samples: 6428160 | consumed tokens: 13164871680 | elapsed time per iteration (s): 0.22 | learning rate: 6.688E-05 | global batch size: 256 | lm loss: 3.708892E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.309 | TFLOPs: 29.28 | +7: iteration 25120/ 37905 | consumed samples: 6430720 | consumed tokens: 13170114560 | elapsed time per iteration (s): 0.22 | learning rate: 6.682E-05 | global batch size: 256 | lm loss: 3.715308E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.040 | TFLOPs: 29.30 | +7: iteration 25130/ 37905 | consumed samples: 6433280 | consumed tokens: 13175357440 | elapsed time per iteration (s): 0.22 | learning rate: 6.675E-05 | global batch size: 256 | lm loss: 3.722707E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.489 | TFLOPs: 29.26 | +7: iteration 25140/ 37905 | consumed samples: 6435840 | consumed tokens: 13180600320 | elapsed time per iteration (s): 0.22 | learning rate: 6.668E-05 | global batch size: 256 | lm loss: 3.724490E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.767 | TFLOPs: 29.01 | +7: iteration 25150/ 37905 | consumed samples: 6438400 | consumed tokens: 13185843200 | elapsed time per iteration (s): 0.22 | learning rate: 6.662E-05 | global batch size: 256 | lm loss: 3.716824E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.315 | TFLOPs: 29.36 | +7: iteration 25160/ 37905 | consumed samples: 6440960 | consumed tokens: 13191086080 | elapsed time per iteration (s): 0.22 | learning rate: 6.655E-05 | global batch size: 256 | lm loss: 3.707595E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.269 | TFLOPs: 29.35 | +7: iteration 25170/ 37905 | consumed samples: 6443520 | consumed tokens: 13196328960 | elapsed time per iteration (s): 0.22 | learning rate: 6.649E-05 | global batch size: 256 | lm loss: 3.712435E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.890 | TFLOPs: 29.32 | +7: iteration 25180/ 37905 | consumed samples: 6446080 | consumed tokens: 13201571840 | elapsed time per iteration (s): 0.22 | learning rate: 6.642E-05 | global batch size: 256 | lm loss: 3.716239E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.249 | TFLOPs: 29.35 | +7: iteration 25190/ 37905 | consumed samples: 6448640 | consumed tokens: 13206814720 | elapsed time per iteration (s): 0.22 | learning rate: 6.635E-05 | global batch size: 256 | lm loss: 3.718114E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.062 | TFLOPs: 29.35 | +7: iteration 25200/ 37905 | consumed samples: 6451200 | consumed tokens: 13212057600 | elapsed time per iteration (s): 0.22 | learning rate: 6.629E-05 | global batch size: 256 | lm loss: 3.709805E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.448 | TFLOPs: 29.33 | +7: iteration 25210/ 37905 | consumed samples: 6453760 | consumed tokens: 13217300480 | elapsed time per iteration (s): 0.22 | learning rate: 6.622E-05 | global batch size: 256 | lm loss: 3.717846E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.168 | TFLOPs: 29.30 | +7: iteration 25220/ 37905 | consumed samples: 6456320 | consumed tokens: 13222543360 | elapsed time per iteration (s): 0.22 | learning rate: 6.616E-05 | global batch size: 256 | lm loss: 3.725449E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.704 | TFLOPs: 29.31 | +7: iteration 25230/ 37905 | consumed samples: 6458880 | consumed tokens: 13227786240 | elapsed time per iteration (s): 0.22 | learning rate: 6.609E-05 | global batch size: 256 | lm loss: 3.714671E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.331 | TFLOPs: 29.30 | +7: iteration 25240/ 37905 | consumed samples: 6461440 | consumed tokens: 13233029120 | elapsed time per iteration (s): 0.22 | learning rate: 6.602E-05 | global batch size: 256 | lm loss: 3.721807E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.070 | TFLOPs: 29.30 | +7: iteration 25250/ 37905 | consumed samples: 6464000 | consumed tokens: 13238272000 | elapsed time per iteration (s): 0.22 | learning rate: 6.596E-05 | global batch size: 256 | lm loss: 3.721801E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.970 | TFLOPs: 29.30 | +7: iteration 25260/ 37905 | consumed samples: 6466560 | consumed tokens: 13243514880 | elapsed time per iteration (s): 0.22 | learning rate: 6.589E-05 | global batch size: 256 | lm loss: 3.728367E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.939 | TFLOPs: 29.29 | +7: iteration 25270/ 37905 | consumed samples: 6469120 | consumed tokens: 13248757760 | elapsed time per iteration (s): 0.22 | learning rate: 6.583E-05 | global batch size: 256 | lm loss: 3.712334E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.076 | TFLOPs: 29.30 | +7: iteration 25280/ 37905 | consumed samples: 6471680 | consumed tokens: 13254000640 | elapsed time per iteration (s): 0.22 | learning rate: 6.576E-05 | global batch size: 256 | lm loss: 3.707430E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.657 | TFLOPs: 29.29 | +7: iteration 25290/ 37905 | consumed samples: 6474240 | consumed tokens: 13259243520 | elapsed time per iteration (s): 0.22 | learning rate: 6.570E-05 | global batch size: 256 | lm loss: 3.725162E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.886 | TFLOPs: 29.29 | +7: iteration 25300/ 37905 | consumed samples: 6476800 | consumed tokens: 13264486400 | elapsed time per iteration (s): 0.22 | learning rate: 6.563E-05 | global batch size: 256 | lm loss: 3.719231E+00 | grad norm: 0.259 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.352 | TFLOPs: 29.28 | +7: iteration 25310/ 37905 | consumed samples: 6479360 | consumed tokens: 13269729280 | elapsed time per iteration (s): 0.22 | learning rate: 6.556E-05 | global batch size: 256 | lm loss: 3.721912E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.106 | TFLOPs: 29.30 | +7: iteration 25320/ 37905 | consumed samples: 6481920 | consumed tokens: 13274972160 | elapsed time per iteration (s): 0.22 | learning rate: 6.550E-05 | global batch size: 256 | lm loss: 3.721532E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.105 | TFLOPs: 29.30 | +7: iteration 25330/ 37905 | consumed samples: 6484480 | consumed tokens: 13280215040 | elapsed time per iteration (s): 0.22 | learning rate: 6.543E-05 | global batch size: 256 | lm loss: 3.720845E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.096 | TFLOPs: 29.30 | +7: iteration 25340/ 37905 | consumed samples: 6487040 | consumed tokens: 13285457920 | elapsed time per iteration (s): 0.22 | learning rate: 6.537E-05 | global batch size: 256 | lm loss: 3.707177E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.569 | TFLOPs: 29.31 | +7: iteration 25350/ 37905 | consumed samples: 6489600 | consumed tokens: 13290700800 | elapsed time per iteration (s): 0.22 | learning rate: 6.530E-05 | global batch size: 256 | lm loss: 3.715586E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.479 | TFLOPs: 29.33 | +7: iteration 25360/ 37905 | consumed samples: 6492160 | consumed tokens: 13295943680 | elapsed time per iteration (s): 0.22 | learning rate: 6.524E-05 | global batch size: 256 | lm loss: 3.713301E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.737 | TFLOPs: 29.37 | +7: iteration 25370/ 37905 | consumed samples: 6494720 | consumed tokens: 13301186560 | elapsed time per iteration (s): 0.22 | learning rate: 6.517E-05 | global batch size: 256 | lm loss: 3.705825E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.752 | TFLOPs: 29.32 | +7: iteration 25380/ 37905 | consumed samples: 6497280 | consumed tokens: 13306429440 | elapsed time per iteration (s): 0.22 | learning rate: 6.511E-05 | global batch size: 256 | lm loss: 3.720882E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.834 | TFLOPs: 29.37 | +7: iteration 25390/ 37905 | consumed samples: 6499840 | consumed tokens: 13311672320 | elapsed time per iteration (s): 0.22 | learning rate: 6.504E-05 | global batch size: 256 | lm loss: 3.727627E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.549 | TFLOPs: 29.36 | +7: iteration 25400/ 37905 | consumed samples: 6502400 | consumed tokens: 13316915200 | elapsed time per iteration (s): 0.22 | learning rate: 6.498E-05 | global batch size: 256 | lm loss: 3.711539E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.476 | TFLOPs: 29.36 | +7: iteration 25410/ 37905 | consumed samples: 6504960 | consumed tokens: 13322158080 | elapsed time per iteration (s): 0.22 | learning rate: 6.491E-05 | global batch size: 256 | lm loss: 3.717929E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.613 | TFLOPs: 29.36 | +7: iteration 25420/ 37905 | consumed samples: 6507520 | consumed tokens: 13327400960 | elapsed time per iteration (s): 0.22 | learning rate: 6.485E-05 | global batch size: 256 | lm loss: 3.700238E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.345 | TFLOPs: 29.33 | +7: iteration 25430/ 37905 | consumed samples: 6510080 | consumed tokens: 13332643840 | elapsed time per iteration (s): 0.22 | learning rate: 6.478E-05 | global batch size: 256 | lm loss: 3.718470E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.255 | TFLOPs: 29.35 | +7: iteration 25440/ 37905 | consumed samples: 6512640 | consumed tokens: 13337886720 | elapsed time per iteration (s): 0.22 | learning rate: 6.472E-05 | global batch size: 256 | lm loss: 3.715742E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.524 | TFLOPs: 29.36 | +7: iteration 25450/ 37905 | consumed samples: 6515200 | consumed tokens: 13343129600 | elapsed time per iteration (s): 0.22 | learning rate: 6.465E-05 | global batch size: 256 | lm loss: 3.733133E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.597 | TFLOPs: 29.36 | +7: iteration 25460/ 37905 | consumed samples: 6517760 | consumed tokens: 13348372480 | elapsed time per iteration (s): 0.22 | learning rate: 6.459E-05 | global batch size: 256 | lm loss: 3.718504E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.366 | TFLOPs: 29.36 | +7: iteration 25470/ 37905 | consumed samples: 6520320 | consumed tokens: 13353615360 | elapsed time per iteration (s): 0.22 | learning rate: 6.452E-05 | global batch size: 256 | lm loss: 3.716270E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.850 | TFLOPs: 29.37 | +7: iteration 25480/ 37905 | consumed samples: 6522880 | consumed tokens: 13358858240 | elapsed time per iteration (s): 0.22 | learning rate: 6.446E-05 | global batch size: 256 | lm loss: 3.726766E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.075 | TFLOPs: 29.37 | +7: iteration 25490/ 37905 | consumed samples: 6525440 | consumed tokens: 13364101120 | elapsed time per iteration (s): 0.22 | learning rate: 6.439E-05 | global batch size: 256 | lm loss: 3.710241E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.785 | TFLOPs: 29.29 | +7: iteration 25500/ 37905 | consumed samples: 6528000 | consumed tokens: 13369344000 | elapsed time per iteration (s): 0.22 | learning rate: 6.433E-05 | global batch size: 256 | lm loss: 3.718277E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.995 | TFLOPs: 29.19 | +7: iteration 25510/ 37905 | consumed samples: 6530560 | consumed tokens: 13374586880 | elapsed time per iteration (s): 0.22 | learning rate: 6.426E-05 | global batch size: 256 | lm loss: 3.712741E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.770 | TFLOPs: 29.21 | +7: iteration 25520/ 37905 | consumed samples: 6533120 | consumed tokens: 13379829760 | elapsed time per iteration (s): 0.22 | learning rate: 6.420E-05 | global batch size: 256 | lm loss: 3.726595E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.231 | TFLOPs: 29.20 | +7: iteration 25530/ 37905 | consumed samples: 6535680 | consumed tokens: 13385072640 | elapsed time per iteration (s): 0.22 | learning rate: 6.413E-05 | global batch size: 256 | lm loss: 3.703816E+00 | grad norm: 0.272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.435 | TFLOPs: 29.33 | +7: iteration 25540/ 37905 | consumed samples: 6538240 | consumed tokens: 13390315520 | elapsed time per iteration (s): 0.22 | learning rate: 6.407E-05 | global batch size: 256 | lm loss: 3.711248E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.180 | TFLOPs: 29.35 | +7: iteration 25550/ 37905 | consumed samples: 6540800 | consumed tokens: 13395558400 | elapsed time per iteration (s): 0.22 | learning rate: 6.400E-05 | global batch size: 256 | lm loss: 3.699757E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.211 | TFLOPs: 29.40 | +7: iteration 25560/ 37905 | consumed samples: 6543360 | consumed tokens: 13400801280 | elapsed time per iteration (s): 0.22 | learning rate: 6.394E-05 | global batch size: 256 | lm loss: 3.720100E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.530 | TFLOPs: 29.34 | +7: iteration 25570/ 37905 | consumed samples: 6545920 | consumed tokens: 13406044160 | elapsed time per iteration (s): 0.22 | learning rate: 6.387E-05 | global batch size: 256 | lm loss: 3.718365E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.507 | TFLOPs: 29.31 | +7: iteration 25580/ 37905 | consumed samples: 6548480 | consumed tokens: 13411287040 | elapsed time per iteration (s): 0.22 | learning rate: 6.381E-05 | global batch size: 256 | lm loss: 3.716271E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.731 | TFLOPs: 29.29 | +7: iteration 25590/ 37905 | consumed samples: 6551040 | consumed tokens: 13416529920 | elapsed time per iteration (s): 0.22 | learning rate: 6.374E-05 | global batch size: 256 | lm loss: 3.702969E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.209 | TFLOPs: 29.20 | +7: iteration 25600/ 37905 | consumed samples: 6553600 | consumed tokens: 13421772800 | elapsed time per iteration (s): 0.22 | learning rate: 6.368E-05 | global batch size: 256 | lm loss: 3.715598E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.923 | TFLOPs: 29.24 | +7: iteration 25610/ 37905 | consumed samples: 6556160 | consumed tokens: 13427015680 | elapsed time per iteration (s): 0.22 | learning rate: 6.361E-05 | global batch size: 256 | lm loss: 3.710530E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.300 | TFLOPs: 29.30 | +7: iteration 25620/ 37905 | consumed samples: 6558720 | consumed tokens: 13432258560 | elapsed time per iteration (s): 0.22 | learning rate: 6.355E-05 | global batch size: 256 | lm loss: 3.719206E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.907 | TFLOPs: 29.32 | +7: iteration 25630/ 37905 | consumed samples: 6561280 | consumed tokens: 13437501440 | elapsed time per iteration (s): 0.22 | learning rate: 6.348E-05 | global batch size: 256 | lm loss: 3.709583E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.053 | TFLOPs: 29.27 | +7: iteration 25640/ 37905 | consumed samples: 6563840 | consumed tokens: 13442744320 | elapsed time per iteration (s): 0.22 | learning rate: 6.342E-05 | global batch size: 256 | lm loss: 3.709994E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.305 | TFLOPs: 29.25 | +7: iteration 25650/ 37905 | consumed samples: 6566400 | consumed tokens: 13447987200 | elapsed time per iteration (s): 0.22 | learning rate: 6.336E-05 | global batch size: 256 | lm loss: 3.721041E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.065 | TFLOPs: 29.22 | +7: iteration 25660/ 37905 | consumed samples: 6568960 | consumed tokens: 13453230080 | elapsed time per iteration (s): 0.22 | learning rate: 6.329E-05 | global batch size: 256 | lm loss: 3.714244E+00 | grad norm: 0.269 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.172 | TFLOPs: 29.22 | +7: iteration 25670/ 37905 | consumed samples: 6571520 | consumed tokens: 13458472960 | elapsed time per iteration (s): 0.22 | learning rate: 6.323E-05 | global batch size: 256 | lm loss: 3.707426E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.161 | TFLOPs: 29.22 | +7: iteration 25680/ 37905 | consumed samples: 6574080 | consumed tokens: 13463715840 | elapsed time per iteration (s): 0.22 | learning rate: 6.316E-05 | global batch size: 256 | lm loss: 3.720307E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.904 | TFLOPs: 29.24 | +7: iteration 25690/ 37905 | consumed samples: 6576640 | consumed tokens: 13468958720 | elapsed time per iteration (s): 0.22 | learning rate: 6.310E-05 | global batch size: 256 | lm loss: 3.725497E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.590 | TFLOPs: 29.24 | +7: iteration 25700/ 37905 | consumed samples: 6579200 | consumed tokens: 13474201600 | elapsed time per iteration (s): 0.22 | learning rate: 6.303E-05 | global batch size: 256 | lm loss: 3.705963E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.979 | TFLOPs: 29.24 | +7: iteration 25710/ 37905 | consumed samples: 6581760 | consumed tokens: 13479444480 | elapsed time per iteration (s): 0.22 | learning rate: 6.297E-05 | global batch size: 256 | lm loss: 3.707935E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.984 | TFLOPs: 29.27 | +7: iteration 25720/ 37905 | consumed samples: 6584320 | consumed tokens: 13484687360 | elapsed time per iteration (s): 0.22 | learning rate: 6.291E-05 | global batch size: 256 | lm loss: 3.701986E+00 | grad norm: 0.276 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.073 | TFLOPs: 29.27 | +7: iteration 25730/ 37905 | consumed samples: 6586880 | consumed tokens: 13489930240 | elapsed time per iteration (s): 0.22 | learning rate: 6.284E-05 | global batch size: 256 | lm loss: 3.718774E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.093 | TFLOPs: 29.04 | +7: iteration 25740/ 37905 | consumed samples: 6589440 | consumed tokens: 13495173120 | elapsed time per iteration (s): 0.22 | learning rate: 6.278E-05 | global batch size: 256 | lm loss: 3.709393E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.786 | TFLOPs: 29.27 | +7: iteration 25750/ 37905 | consumed samples: 6592000 | consumed tokens: 13500416000 | elapsed time per iteration (s): 0.22 | learning rate: 6.271E-05 | global batch size: 256 | lm loss: 3.715377E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.589 | TFLOPs: 29.24 | +7: iteration 25760/ 37905 | consumed samples: 6594560 | consumed tokens: 13505658880 | elapsed time per iteration (s): 0.22 | learning rate: 6.265E-05 | global batch size: 256 | lm loss: 3.726828E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.174 | TFLOPs: 29.22 | +7: iteration 25770/ 37905 | consumed samples: 6597120 | consumed tokens: 13510901760 | elapsed time per iteration (s): 0.22 | learning rate: 6.258E-05 | global batch size: 256 | lm loss: 3.712073E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.335 | TFLOPs: 29.25 | +7: iteration 25780/ 37905 | consumed samples: 6599680 | consumed tokens: 13516144640 | elapsed time per iteration (s): 0.22 | learning rate: 6.252E-05 | global batch size: 256 | lm loss: 3.738382E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.003 | TFLOPs: 29.25 | +7: iteration 25790/ 37905 | consumed samples: 6602240 | consumed tokens: 13521387520 | elapsed time per iteration (s): 0.22 | learning rate: 6.246E-05 | global batch size: 256 | lm loss: 3.735987E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.206 | TFLOPs: 29.30 | +7: iteration 25800/ 37905 | consumed samples: 6604800 | consumed tokens: 13526630400 | elapsed time per iteration (s): 0.22 | learning rate: 6.239E-05 | global batch size: 256 | lm loss: 3.724983E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.162 | TFLOPs: 29.30 | +7: iteration 25810/ 37905 | consumed samples: 6607360 | consumed tokens: 13531873280 | elapsed time per iteration (s): 0.22 | learning rate: 6.233E-05 | global batch size: 256 | lm loss: 3.713760E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.246 | TFLOPs: 29.28 | +7: iteration 25820/ 37905 | consumed samples: 6609920 | consumed tokens: 13537116160 | elapsed time per iteration (s): 0.22 | learning rate: 6.226E-05 | global batch size: 256 | lm loss: 3.730641E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.361 | TFLOPs: 29.31 | +7: iteration 25830/ 37905 | consumed samples: 6612480 | consumed tokens: 13542359040 | elapsed time per iteration (s): 0.22 | learning rate: 6.220E-05 | global batch size: 256 | lm loss: 3.707535E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.828 | TFLOPs: 29.32 | +7: iteration 25840/ 37905 | consumed samples: 6615040 | consumed tokens: 13547601920 | elapsed time per iteration (s): 0.22 | learning rate: 6.214E-05 | global batch size: 256 | lm loss: 3.717134E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.004 | TFLOPs: 29.30 | +7: iteration 25850/ 37905 | consumed samples: 6617600 | consumed tokens: 13552844800 | elapsed time per iteration (s): 0.22 | learning rate: 6.207E-05 | global batch size: 256 | lm loss: 3.709471E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.381 | TFLOPs: 29.05 | +7: iteration 25860/ 37905 | consumed samples: 6620160 | consumed tokens: 13558087680 | elapsed time per iteration (s): 0.22 | learning rate: 6.201E-05 | global batch size: 256 | lm loss: 3.703857E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.909 | TFLOPs: 29.29 | +7: iteration 25870/ 37905 | consumed samples: 6622720 | consumed tokens: 13563330560 | elapsed time per iteration (s): 0.22 | learning rate: 6.195E-05 | global batch size: 256 | lm loss: 3.716270E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.497 | TFLOPs: 29.31 | +7: iteration 25880/ 37905 | consumed samples: 6625280 | consumed tokens: 13568573440 | elapsed time per iteration (s): 0.22 | learning rate: 6.188E-05 | global batch size: 256 | lm loss: 3.714545E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.238 | TFLOPs: 29.33 | +7: iteration 25890/ 37905 | consumed samples: 6627840 | consumed tokens: 13573816320 | elapsed time per iteration (s): 0.22 | learning rate: 6.182E-05 | global batch size: 256 | lm loss: 3.726366E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.200 | TFLOPs: 29.35 | +7: iteration 25900/ 37905 | consumed samples: 6630400 | consumed tokens: 13579059200 | elapsed time per iteration (s): 0.22 | learning rate: 6.175E-05 | global batch size: 256 | lm loss: 3.715867E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.795 | TFLOPs: 29.32 | +7: iteration 25910/ 37905 | consumed samples: 6632960 | consumed tokens: 13584302080 | elapsed time per iteration (s): 0.22 | learning rate: 6.169E-05 | global batch size: 256 | lm loss: 3.718550E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.383 | TFLOPs: 29.33 | +7: iteration 25920/ 37905 | consumed samples: 6635520 | consumed tokens: 13589544960 | elapsed time per iteration (s): 0.22 | learning rate: 6.163E-05 | global batch size: 256 | lm loss: 3.717952E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.983 | TFLOPs: 29.32 | +7: iteration 25930/ 37905 | consumed samples: 6638080 | consumed tokens: 13594787840 | elapsed time per iteration (s): 0.22 | learning rate: 6.156E-05 | global batch size: 256 | lm loss: 3.729473E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.920 | TFLOPs: 29.32 | +7: iteration 25940/ 37905 | consumed samples: 6640640 | consumed tokens: 13600030720 | elapsed time per iteration (s): 0.23 | learning rate: 6.150E-05 | global batch size: 256 | lm loss: 3.711242E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.406 | TFLOPs: 28.92 | +7: iteration 25950/ 37905 | consumed samples: 6643200 | consumed tokens: 13605273600 | elapsed time per iteration (s): 0.22 | learning rate: 6.144E-05 | global batch size: 256 | lm loss: 3.728399E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.894 | TFLOPs: 29.32 | +7: iteration 25960/ 37905 | consumed samples: 6645760 | consumed tokens: 13610516480 | elapsed time per iteration (s): 0.22 | learning rate: 6.137E-05 | global batch size: 256 | lm loss: 3.717083E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.355 | TFLOPs: 29.36 | +7: iteration 25970/ 37905 | consumed samples: 6648320 | consumed tokens: 13615759360 | elapsed time per iteration (s): 0.22 | learning rate: 6.131E-05 | global batch size: 256 | lm loss: 3.718363E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.393 | TFLOPs: 29.36 | +7: iteration 25980/ 37905 | consumed samples: 6650880 | consumed tokens: 13621002240 | elapsed time per iteration (s): 0.22 | learning rate: 6.125E-05 | global batch size: 256 | lm loss: 3.711960E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.981 | TFLOPs: 29.35 | +7: iteration 25990/ 37905 | consumed samples: 6653440 | consumed tokens: 13626245120 | elapsed time per iteration (s): 0.22 | learning rate: 6.118E-05 | global batch size: 256 | lm loss: 3.727015E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.370 | TFLOPs: 29.31 | +0: [2023-03-15 23:34:24,888] [INFO] [logging.py:68:log_dist] [Rank 0] step=26000, skipped=0, lr=[6.112056897407288e-05, 6.112056897407288e-05, 6.112056897407288e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 26000/ 37905 | consumed samples: 6656000 | consumed tokens: 13631488000 | elapsed time per iteration (s): 0.22 | learning rate: 6.112E-05 | global batch size: 256 | lm loss: 3.708707E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.918 | TFLOPs: 29.35 | +0: steps: 26000 loss: 3.7126 iter time (s): 0.221 samples/sec: 1158.547 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 26000 | lm loss value: 3.669709E+00 | lm loss PPL: 3.924049E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 26000 to checkpoints_83m20b400m +0: [2023-03-15 23:34:24,975] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step26000 is begin to save! +0: [2023-03-15 23:34:24,979] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:34:25,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:34:25,050] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:34:25,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:34:25,063] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:34:25,074] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:34:25,074] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:34:25,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:34:25,086] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:34:25,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:34:25,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:34:25,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:34:25,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:34:25,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:34:25,120] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:34:25,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:34:25,131] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:34:25,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:34:25,142] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:34:25,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:34:25,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:34:25,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:34:25,165] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:34:25,166] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:34:25,166] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step26000/mp_rank_00_model_states.pt +0: [2023-03-15 23:34:25,166] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:34:25,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:34:25,185] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:34:25,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:34:25,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:34:25,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,209] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,209] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,212] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,212] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:34:25,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,213] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,213] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +5: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +3: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:34:25,214] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:34:25,214] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +6: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +1: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +4: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +6: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +2: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +7: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:34:25,215] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step26000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:34:25,215] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step26000 is ready now! +0: successfully saved checkpoint at iteration 26000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 243.33 +7: iteration 26010/ 37905 | consumed samples: 6658560 | consumed tokens: 13636730880 | elapsed time per iteration (s): 0.25 | learning rate: 6.106E-05 | global batch size: 256 | lm loss: 3.698427E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1005.557 | TFLOPs: 25.62 | +7: iteration 26020/ 37905 | consumed samples: 6661120 | consumed tokens: 13641973760 | elapsed time per iteration (s): 0.22 | learning rate: 6.099E-05 | global batch size: 256 | lm loss: 3.712224E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.875 | TFLOPs: 29.34 | +7: iteration 26030/ 37905 | consumed samples: 6663680 | consumed tokens: 13647216640 | elapsed time per iteration (s): 0.22 | learning rate: 6.093E-05 | global batch size: 256 | lm loss: 3.706956E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.774 | TFLOPs: 29.29 | +7: iteration 26040/ 37905 | consumed samples: 6666240 | consumed tokens: 13652459520 | elapsed time per iteration (s): 0.22 | learning rate: 6.087E-05 | global batch size: 256 | lm loss: 3.708791E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.705 | TFLOPs: 29.31 | +7: iteration 26050/ 37905 | consumed samples: 6668800 | consumed tokens: 13657702400 | elapsed time per iteration (s): 0.22 | learning rate: 6.080E-05 | global batch size: 256 | lm loss: 3.712668E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.826 | TFLOPs: 29.32 | +7: iteration 26060/ 37905 | consumed samples: 6671360 | consumed tokens: 13662945280 | elapsed time per iteration (s): 0.22 | learning rate: 6.074E-05 | global batch size: 256 | lm loss: 3.728619E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.045 | TFLOPs: 29.32 | +7: iteration 26070/ 37905 | consumed samples: 6673920 | consumed tokens: 13668188160 | elapsed time per iteration (s): 0.22 | learning rate: 6.068E-05 | global batch size: 256 | lm loss: 3.710432E+00 | grad norm: 0.260 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.137 | TFLOPs: 29.33 | +7: iteration 26080/ 37905 | consumed samples: 6676480 | consumed tokens: 13673431040 | elapsed time per iteration (s): 0.22 | learning rate: 6.062E-05 | global batch size: 256 | lm loss: 3.713185E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.320 | TFLOPs: 29.30 | +7: iteration 26090/ 37905 | consumed samples: 6679040 | consumed tokens: 13678673920 | elapsed time per iteration (s): 0.22 | learning rate: 6.055E-05 | global batch size: 256 | lm loss: 3.709666E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.392 | TFLOPs: 29.28 | +7: iteration 26100/ 37905 | consumed samples: 6681600 | consumed tokens: 13683916800 | elapsed time per iteration (s): 0.22 | learning rate: 6.049E-05 | global batch size: 256 | lm loss: 3.721517E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.776 | TFLOPs: 29.32 | +7: iteration 26110/ 37905 | consumed samples: 6684160 | consumed tokens: 13689159680 | elapsed time per iteration (s): 0.22 | learning rate: 6.043E-05 | global batch size: 256 | lm loss: 3.702924E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.435 | TFLOPs: 29.28 | +7: iteration 26120/ 37905 | consumed samples: 6686720 | consumed tokens: 13694402560 | elapsed time per iteration (s): 0.22 | learning rate: 6.036E-05 | global batch size: 256 | lm loss: 3.712172E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.680 | TFLOPs: 29.31 | +7: iteration 26130/ 37905 | consumed samples: 6689280 | consumed tokens: 13699645440 | elapsed time per iteration (s): 0.22 | learning rate: 6.030E-05 | global batch size: 256 | lm loss: 3.715266E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.357 | TFLOPs: 29.31 | +7: iteration 26140/ 37905 | consumed samples: 6691840 | consumed tokens: 13704888320 | elapsed time per iteration (s): 0.22 | learning rate: 6.024E-05 | global batch size: 256 | lm loss: 3.733307E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.267 | TFLOPs: 29.33 | +7: iteration 26150/ 37905 | consumed samples: 6694400 | consumed tokens: 13710131200 | elapsed time per iteration (s): 0.22 | learning rate: 6.018E-05 | global batch size: 256 | lm loss: 3.699767E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.573 | TFLOPs: 29.18 | +7: iteration 26160/ 37905 | consumed samples: 6696960 | consumed tokens: 13715374080 | elapsed time per iteration (s): 0.22 | learning rate: 6.011E-05 | global batch size: 256 | lm loss: 3.713669E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.151 | TFLOPs: 29.27 | +7: iteration 26170/ 37905 | consumed samples: 6699520 | consumed tokens: 13720616960 | elapsed time per iteration (s): 0.22 | learning rate: 6.005E-05 | global batch size: 256 | lm loss: 3.720602E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.095 | TFLOPs: 29.25 | +7: iteration 26180/ 37905 | consumed samples: 6702080 | consumed tokens: 13725859840 | elapsed time per iteration (s): 0.22 | learning rate: 5.999E-05 | global batch size: 256 | lm loss: 3.707893E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.534 | TFLOPs: 29.28 | +7: iteration 26190/ 37905 | consumed samples: 6704640 | consumed tokens: 13731102720 | elapsed time per iteration (s): 0.22 | learning rate: 5.992E-05 | global batch size: 256 | lm loss: 3.716365E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.568 | TFLOPs: 29.34 | +7: iteration 26200/ 37905 | consumed samples: 6707200 | consumed tokens: 13736345600 | elapsed time per iteration (s): 0.22 | learning rate: 5.986E-05 | global batch size: 256 | lm loss: 3.728609E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.972 | TFLOPs: 29.32 | +7: iteration 26210/ 37905 | consumed samples: 6709760 | consumed tokens: 13741588480 | elapsed time per iteration (s): 0.22 | learning rate: 5.980E-05 | global batch size: 256 | lm loss: 3.707193E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.696 | TFLOPs: 29.29 | +7: iteration 26220/ 37905 | consumed samples: 6712320 | consumed tokens: 13746831360 | elapsed time per iteration (s): 0.22 | learning rate: 5.974E-05 | global batch size: 256 | lm loss: 3.710204E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.897 | TFLOPs: 29.27 | +7: iteration 26230/ 37905 | consumed samples: 6714880 | consumed tokens: 13752074240 | elapsed time per iteration (s): 0.22 | learning rate: 5.967E-05 | global batch size: 256 | lm loss: 3.712498E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.227 | TFLOPs: 29.25 | +7: iteration 26240/ 37905 | consumed samples: 6717440 | consumed tokens: 13757317120 | elapsed time per iteration (s): 0.22 | learning rate: 5.961E-05 | global batch size: 256 | lm loss: 3.712821E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.787 | TFLOPs: 29.24 | +7: iteration 26250/ 37905 | consumed samples: 6720000 | consumed tokens: 13762560000 | elapsed time per iteration (s): 0.22 | learning rate: 5.955E-05 | global batch size: 256 | lm loss: 3.724713E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.793 | TFLOPs: 29.24 | +7: iteration 26260/ 37905 | consumed samples: 6722560 | consumed tokens: 13767802880 | elapsed time per iteration (s): 0.22 | learning rate: 5.949E-05 | global batch size: 256 | lm loss: 3.701620E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.171 | TFLOPs: 29.28 | +7: iteration 26270/ 37905 | consumed samples: 6725120 | consumed tokens: 13773045760 | elapsed time per iteration (s): 0.22 | learning rate: 5.943E-05 | global batch size: 256 | lm loss: 3.709716E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.885 | TFLOPs: 29.29 | +7: iteration 26280/ 37905 | consumed samples: 6727680 | consumed tokens: 13778288640 | elapsed time per iteration (s): 0.22 | learning rate: 5.936E-05 | global batch size: 256 | lm loss: 3.712511E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.873 | TFLOPs: 29.29 | +7: iteration 26290/ 37905 | consumed samples: 6730240 | consumed tokens: 13783531520 | elapsed time per iteration (s): 0.22 | learning rate: 5.930E-05 | global batch size: 256 | lm loss: 3.707124E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.073 | TFLOPs: 29.30 | +7: iteration 26300/ 37905 | consumed samples: 6732800 | consumed tokens: 13788774400 | elapsed time per iteration (s): 0.22 | learning rate: 5.924E-05 | global batch size: 256 | lm loss: 3.718711E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.018 | TFLOPs: 29.30 | +7: iteration 26310/ 37905 | consumed samples: 6735360 | consumed tokens: 13794017280 | elapsed time per iteration (s): 0.22 | learning rate: 5.918E-05 | global batch size: 256 | lm loss: 3.705657E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.057 | TFLOPs: 29.30 | +7: iteration 26320/ 37905 | consumed samples: 6737920 | consumed tokens: 13799260160 | elapsed time per iteration (s): 0.22 | learning rate: 5.911E-05 | global batch size: 256 | lm loss: 3.712110E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.530 | TFLOPs: 29.28 | +7: iteration 26330/ 37905 | consumed samples: 6740480 | consumed tokens: 13804503040 | elapsed time per iteration (s): 0.22 | learning rate: 5.905E-05 | global batch size: 256 | lm loss: 3.708670E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.806 | TFLOPs: 29.29 | +7: iteration 26340/ 37905 | consumed samples: 6743040 | consumed tokens: 13809745920 | elapsed time per iteration (s): 0.22 | learning rate: 5.899E-05 | global batch size: 256 | lm loss: 3.710660E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.947 | TFLOPs: 29.30 | +7: iteration 26350/ 37905 | consumed samples: 6745600 | consumed tokens: 13814988800 | elapsed time per iteration (s): 0.22 | learning rate: 5.893E-05 | global batch size: 256 | lm loss: 3.705182E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.114 | TFLOPs: 29.30 | +7: iteration 26360/ 37905 | consumed samples: 6748160 | consumed tokens: 13820231680 | elapsed time per iteration (s): 0.22 | learning rate: 5.887E-05 | global batch size: 256 | lm loss: 3.719009E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.469 | TFLOPs: 29.31 | +7: iteration 26370/ 37905 | consumed samples: 6750720 | consumed tokens: 13825474560 | elapsed time per iteration (s): 0.22 | learning rate: 5.880E-05 | global batch size: 256 | lm loss: 3.713169E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.345 | TFLOPs: 29.25 | +7: iteration 26380/ 37905 | consumed samples: 6753280 | consumed tokens: 13830717440 | elapsed time per iteration (s): 0.22 | learning rate: 5.874E-05 | global batch size: 256 | lm loss: 3.713102E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.380 | TFLOPs: 29.28 | +7: iteration 26390/ 37905 | consumed samples: 6755840 | consumed tokens: 13835960320 | elapsed time per iteration (s): 0.22 | learning rate: 5.868E-05 | global batch size: 256 | lm loss: 3.703872E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.683 | TFLOPs: 29.26 | +7: iteration 26400/ 37905 | consumed samples: 6758400 | consumed tokens: 13841203200 | elapsed time per iteration (s): 0.22 | learning rate: 5.862E-05 | global batch size: 256 | lm loss: 3.713942E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.352 | TFLOPs: 29.25 | +7: iteration 26410/ 37905 | consumed samples: 6760960 | consumed tokens: 13846446080 | elapsed time per iteration (s): 0.22 | learning rate: 5.856E-05 | global batch size: 256 | lm loss: 3.716682E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.418 | TFLOPs: 29.26 | +7: iteration 26420/ 37905 | consumed samples: 6763520 | consumed tokens: 13851688960 | elapsed time per iteration (s): 0.22 | learning rate: 5.849E-05 | global batch size: 256 | lm loss: 3.704872E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.755 | TFLOPs: 29.29 | +7: iteration 26430/ 37905 | consumed samples: 6766080 | consumed tokens: 13856931840 | elapsed time per iteration (s): 0.22 | learning rate: 5.843E-05 | global batch size: 256 | lm loss: 3.708754E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.551 | TFLOPs: 29.26 | +7: iteration 26440/ 37905 | consumed samples: 6768640 | consumed tokens: 13862174720 | elapsed time per iteration (s): 0.22 | learning rate: 5.837E-05 | global batch size: 256 | lm loss: 3.718367E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.945 | TFLOPs: 29.27 | +7: iteration 26450/ 37905 | consumed samples: 6771200 | consumed tokens: 13867417600 | elapsed time per iteration (s): 0.22 | learning rate: 5.831E-05 | global batch size: 256 | lm loss: 3.712848E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.436 | TFLOPs: 29.26 | +7: iteration 26460/ 37905 | consumed samples: 6773760 | consumed tokens: 13872660480 | elapsed time per iteration (s): 0.22 | learning rate: 5.825E-05 | global batch size: 256 | lm loss: 3.708341E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.654 | TFLOPs: 29.29 | +7: iteration 26470/ 37905 | consumed samples: 6776320 | consumed tokens: 13877903360 | elapsed time per iteration (s): 0.22 | learning rate: 5.819E-05 | global batch size: 256 | lm loss: 3.697737E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.538 | TFLOPs: 29.26 | +7: iteration 26480/ 37905 | consumed samples: 6778880 | consumed tokens: 13883146240 | elapsed time per iteration (s): 0.22 | learning rate: 5.812E-05 | global batch size: 256 | lm loss: 3.711287E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.784 | TFLOPs: 29.29 | +7: iteration 26490/ 37905 | consumed samples: 6781440 | consumed tokens: 13888389120 | elapsed time per iteration (s): 0.22 | learning rate: 5.806E-05 | global batch size: 256 | lm loss: 3.712486E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.303 | TFLOPs: 29.33 | +7: iteration 26500/ 37905 | consumed samples: 6784000 | consumed tokens: 13893632000 | elapsed time per iteration (s): 0.22 | learning rate: 5.800E-05 | global batch size: 256 | lm loss: 3.717641E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.521 | TFLOPs: 29.31 | +7: iteration 26510/ 37905 | consumed samples: 6786560 | consumed tokens: 13898874880 | elapsed time per iteration (s): 0.22 | learning rate: 5.794E-05 | global batch size: 256 | lm loss: 3.708521E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.069 | TFLOPs: 29.30 | +7: iteration 26520/ 37905 | consumed samples: 6789120 | consumed tokens: 13904117760 | elapsed time per iteration (s): 0.22 | learning rate: 5.788E-05 | global batch size: 256 | lm loss: 3.705694E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.475 | TFLOPs: 29.28 | +7: iteration 26530/ 37905 | consumed samples: 6791680 | consumed tokens: 13909360640 | elapsed time per iteration (s): 0.22 | learning rate: 5.782E-05 | global batch size: 256 | lm loss: 3.722845E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.746 | TFLOPs: 29.26 | +7: iteration 26540/ 37905 | consumed samples: 6794240 | consumed tokens: 13914603520 | elapsed time per iteration (s): 0.22 | learning rate: 5.776E-05 | global batch size: 256 | lm loss: 3.714858E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.012 | TFLOPs: 29.27 | +7: iteration 26550/ 37905 | consumed samples: 6796800 | consumed tokens: 13919846400 | elapsed time per iteration (s): 0.22 | learning rate: 5.769E-05 | global batch size: 256 | lm loss: 3.705198E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.364 | TFLOPs: 29.33 | +7: iteration 26560/ 37905 | consumed samples: 6799360 | consumed tokens: 13925089280 | elapsed time per iteration (s): 0.22 | learning rate: 5.763E-05 | global batch size: 256 | lm loss: 3.720284E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.196 | TFLOPs: 29.30 | +7: iteration 26570/ 37905 | consumed samples: 6801920 | consumed tokens: 13930332160 | elapsed time per iteration (s): 0.22 | learning rate: 5.757E-05 | global batch size: 256 | lm loss: 3.711092E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.820 | TFLOPs: 29.32 | +7: iteration 26580/ 37905 | consumed samples: 6804480 | consumed tokens: 13935575040 | elapsed time per iteration (s): 0.22 | learning rate: 5.751E-05 | global batch size: 256 | lm loss: 3.724001E+00 | grad norm: 0.243 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.215 | TFLOPs: 29.30 | +7: iteration 26590/ 37905 | consumed samples: 6807040 | consumed tokens: 13940817920 | elapsed time per iteration (s): 0.22 | learning rate: 5.745E-05 | global batch size: 256 | lm loss: 3.721121E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.142 | TFLOPs: 29.33 | +7: iteration 26600/ 37905 | consumed samples: 6809600 | consumed tokens: 13946060800 | elapsed time per iteration (s): 0.22 | learning rate: 5.739E-05 | global batch size: 256 | lm loss: 3.702936E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.351 | TFLOPs: 29.33 | +7: iteration 26610/ 37905 | consumed samples: 6812160 | consumed tokens: 13951303680 | elapsed time per iteration (s): 0.23 | learning rate: 5.733E-05 | global batch size: 256 | lm loss: 3.706417E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.809 | TFLOPs: 28.86 | +7: iteration 26620/ 37905 | consumed samples: 6814720 | consumed tokens: 13956546560 | elapsed time per iteration (s): 0.22 | learning rate: 5.727E-05 | global batch size: 256 | lm loss: 3.717711E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.467 | TFLOPs: 29.31 | +7: iteration 26630/ 37905 | consumed samples: 6817280 | consumed tokens: 13961789440 | elapsed time per iteration (s): 0.22 | learning rate: 5.720E-05 | global batch size: 256 | lm loss: 3.698479E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.017 | TFLOPs: 29.30 | +7: iteration 26640/ 37905 | consumed samples: 6819840 | consumed tokens: 13967032320 | elapsed time per iteration (s): 0.22 | learning rate: 5.714E-05 | global batch size: 256 | lm loss: 3.710982E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.420 | TFLOPs: 29.31 | +7: iteration 26650/ 37905 | consumed samples: 6822400 | consumed tokens: 13972275200 | elapsed time per iteration (s): 0.22 | learning rate: 5.708E-05 | global batch size: 256 | lm loss: 3.715285E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.434 | TFLOPs: 29.28 | +7: iteration 26660/ 37905 | consumed samples: 6824960 | consumed tokens: 13977518080 | elapsed time per iteration (s): 0.22 | learning rate: 5.702E-05 | global batch size: 256 | lm loss: 3.718865E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.575 | TFLOPs: 29.29 | +7: iteration 26670/ 37905 | consumed samples: 6827520 | consumed tokens: 13982760960 | elapsed time per iteration (s): 0.22 | learning rate: 5.696E-05 | global batch size: 256 | lm loss: 3.705540E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.811 | TFLOPs: 29.29 | +7: iteration 26680/ 37905 | consumed samples: 6830080 | consumed tokens: 13988003840 | elapsed time per iteration (s): 0.23 | learning rate: 5.690E-05 | global batch size: 256 | lm loss: 3.711180E+00 | grad norm: 0.248 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.550 | TFLOPs: 28.50 | +7: iteration 26690/ 37905 | consumed samples: 6832640 | consumed tokens: 13993246720 | elapsed time per iteration (s): 0.22 | learning rate: 5.684E-05 | global batch size: 256 | lm loss: 3.706689E+00 | grad norm: 0.236 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.991 | TFLOPs: 29.30 | +7: iteration 26700/ 37905 | consumed samples: 6835200 | consumed tokens: 13998489600 | elapsed time per iteration (s): 0.22 | learning rate: 5.678E-05 | global batch size: 256 | lm loss: 3.699347E+00 | grad norm: 0.252 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.986 | TFLOPs: 29.30 | +7: iteration 26710/ 37905 | consumed samples: 6837760 | consumed tokens: 14003732480 | elapsed time per iteration (s): 0.22 | learning rate: 5.672E-05 | global batch size: 256 | lm loss: 3.709439E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.070 | TFLOPs: 29.27 | +7: iteration 26720/ 37905 | consumed samples: 6840320 | consumed tokens: 14008975360 | elapsed time per iteration (s): 0.22 | learning rate: 5.666E-05 | global batch size: 256 | lm loss: 3.700700E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.766 | TFLOPs: 29.26 | +7: iteration 26730/ 37905 | consumed samples: 6842880 | consumed tokens: 14014218240 | elapsed time per iteration (s): 0.22 | learning rate: 5.660E-05 | global batch size: 256 | lm loss: 3.708923E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.049 | TFLOPs: 29.22 | +7: iteration 26740/ 37905 | consumed samples: 6845440 | consumed tokens: 14019461120 | elapsed time per iteration (s): 0.22 | learning rate: 5.654E-05 | global batch size: 256 | lm loss: 3.704610E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.508 | TFLOPs: 29.23 | +7: iteration 26750/ 37905 | consumed samples: 6848000 | consumed tokens: 14024704000 | elapsed time per iteration (s): 0.22 | learning rate: 5.648E-05 | global batch size: 256 | lm loss: 3.724901E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.303 | TFLOPs: 29.25 | +7: iteration 26760/ 37905 | consumed samples: 6850560 | consumed tokens: 14029946880 | elapsed time per iteration (s): 0.22 | learning rate: 5.641E-05 | global batch size: 256 | lm loss: 3.716238E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.992 | TFLOPs: 29.25 | +7: iteration 26770/ 37905 | consumed samples: 6853120 | consumed tokens: 14035189760 | elapsed time per iteration (s): 0.22 | learning rate: 5.635E-05 | global batch size: 256 | lm loss: 3.712086E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.559 | TFLOPs: 29.29 | +7: iteration 26780/ 37905 | consumed samples: 6855680 | consumed tokens: 14040432640 | elapsed time per iteration (s): 0.22 | learning rate: 5.629E-05 | global batch size: 256 | lm loss: 3.707734E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.932 | TFLOPs: 29.27 | +7: iteration 26790/ 37905 | consumed samples: 6858240 | consumed tokens: 14045675520 | elapsed time per iteration (s): 0.22 | learning rate: 5.623E-05 | global batch size: 256 | lm loss: 3.708418E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.944 | TFLOPs: 29.30 | +7: iteration 26800/ 37905 | consumed samples: 6860800 | consumed tokens: 14050918400 | elapsed time per iteration (s): 0.22 | learning rate: 5.617E-05 | global batch size: 256 | lm loss: 3.708577E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.383 | TFLOPs: 29.31 | +7: iteration 26810/ 37905 | consumed samples: 6863360 | consumed tokens: 14056161280 | elapsed time per iteration (s): 0.22 | learning rate: 5.611E-05 | global batch size: 256 | lm loss: 3.716558E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.373 | TFLOPs: 29.33 | +7: iteration 26820/ 37905 | consumed samples: 6865920 | consumed tokens: 14061404160 | elapsed time per iteration (s): 0.22 | learning rate: 5.605E-05 | global batch size: 256 | lm loss: 3.714180E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.328 | TFLOPs: 29.33 | +7: iteration 26830/ 37905 | consumed samples: 6868480 | consumed tokens: 14066647040 | elapsed time per iteration (s): 0.22 | learning rate: 5.599E-05 | global batch size: 256 | lm loss: 3.717299E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.566 | TFLOPs: 29.34 | +7: iteration 26840/ 37905 | consumed samples: 6871040 | consumed tokens: 14071889920 | elapsed time per iteration (s): 0.22 | learning rate: 5.593E-05 | global batch size: 256 | lm loss: 3.711247E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.868 | TFLOPs: 29.32 | +7: iteration 26850/ 37905 | consumed samples: 6873600 | consumed tokens: 14077132800 | elapsed time per iteration (s): 0.22 | learning rate: 5.587E-05 | global batch size: 256 | lm loss: 3.723942E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.359 | TFLOPs: 29.33 | +7: iteration 26860/ 37905 | consumed samples: 6876160 | consumed tokens: 14082375680 | elapsed time per iteration (s): 0.22 | learning rate: 5.581E-05 | global batch size: 256 | lm loss: 3.699696E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.668 | TFLOPs: 29.34 | +7: iteration 26870/ 37905 | consumed samples: 6878720 | consumed tokens: 14087618560 | elapsed time per iteration (s): 0.22 | learning rate: 5.575E-05 | global batch size: 256 | lm loss: 3.715812E+00 | grad norm: 0.239 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.452 | TFLOPs: 29.33 | +7: iteration 26880/ 37905 | consumed samples: 6881280 | consumed tokens: 14092861440 | elapsed time per iteration (s): 0.22 | learning rate: 5.569E-05 | global batch size: 256 | lm loss: 3.711389E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.340 | TFLOPs: 29.18 | +7: iteration 26890/ 37905 | consumed samples: 6883840 | consumed tokens: 14098104320 | elapsed time per iteration (s): 0.22 | learning rate: 5.563E-05 | global batch size: 256 | lm loss: 3.711094E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.636 | TFLOPs: 29.34 | +7: iteration 26900/ 37905 | consumed samples: 6886400 | consumed tokens: 14103347200 | elapsed time per iteration (s): 0.22 | learning rate: 5.557E-05 | global batch size: 256 | lm loss: 3.714769E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.059 | TFLOPs: 29.32 | +7: iteration 26910/ 37905 | consumed samples: 6888960 | consumed tokens: 14108590080 | elapsed time per iteration (s): 0.22 | learning rate: 5.551E-05 | global batch size: 256 | lm loss: 3.713492E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.274 | TFLOPs: 29.33 | +7: iteration 26920/ 37905 | consumed samples: 6891520 | consumed tokens: 14113832960 | elapsed time per iteration (s): 0.22 | learning rate: 5.545E-05 | global batch size: 256 | lm loss: 3.712257E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.476 | TFLOPs: 29.36 | +7: iteration 26930/ 37905 | consumed samples: 6894080 | consumed tokens: 14119075840 | elapsed time per iteration (s): 0.22 | learning rate: 5.539E-05 | global batch size: 256 | lm loss: 3.710590E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.758 | TFLOPs: 29.34 | +7: iteration 26940/ 37905 | consumed samples: 6896640 | consumed tokens: 14124318720 | elapsed time per iteration (s): 0.22 | learning rate: 5.533E-05 | global batch size: 256 | lm loss: 3.710326E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.888 | TFLOPs: 29.29 | +7: iteration 26950/ 37905 | consumed samples: 6899200 | consumed tokens: 14129561600 | elapsed time per iteration (s): 0.22 | learning rate: 5.527E-05 | global batch size: 256 | lm loss: 3.706232E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.291 | TFLOPs: 29.35 | +7: iteration 26960/ 37905 | consumed samples: 6901760 | consumed tokens: 14134804480 | elapsed time per iteration (s): 0.22 | learning rate: 5.521E-05 | global batch size: 256 | lm loss: 3.712726E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.731 | TFLOPs: 29.34 | +7: iteration 26970/ 37905 | consumed samples: 6904320 | consumed tokens: 14140047360 | elapsed time per iteration (s): 0.22 | learning rate: 5.515E-05 | global batch size: 256 | lm loss: 3.702665E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.356 | TFLOPs: 29.33 | +7: iteration 26980/ 37905 | consumed samples: 6906880 | consumed tokens: 14145290240 | elapsed time per iteration (s): 0.22 | learning rate: 5.509E-05 | global batch size: 256 | lm loss: 3.688873E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.748 | TFLOPs: 29.34 | +7: iteration 26990/ 37905 | consumed samples: 6909440 | consumed tokens: 14150533120 | elapsed time per iteration (s): 0.22 | learning rate: 5.503E-05 | global batch size: 256 | lm loss: 3.714515E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.853 | TFLOPs: 29.34 | +7: iteration 27000/ 37905 | consumed samples: 6912000 | consumed tokens: 14155776000 | elapsed time per iteration (s): 0.22 | learning rate: 5.497E-05 | global batch size: 256 | lm loss: 3.710657E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.773 | TFLOPs: 29.34 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 27000 | lm loss value: 3.681979E+00 | lm loss PPL: 3.972492E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 27000 to checkpoints_83m20b400m +0: [2023-03-15 23:38:07,992] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step27000 is begin to save! +0: [2023-03-15 23:38:07,995] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:38:08,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:38:08,071] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:38:08,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:38:08,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:38:08,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:38:08,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:38:08,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:38:08,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:38:08,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:38:08,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:38:08,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:38:08,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:38:08,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:38:08,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:38:08,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:38:08,152] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:38:08,163] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:38:08,164] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:38:08,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:38:08,175] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:38:08,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:38:08,186] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:38:08,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:38:08,187] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step27000/mp_rank_00_model_states.pt +0: [2023-03-15 23:38:08,187] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:38:08,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:38:08,206] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:38:08,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:38:08,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,228] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,228] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,229] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,229] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,230] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,230] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,231] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,231] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +5: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +0: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +6: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,234] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +4: [2023-03-15 23:38:08,234] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +7: [2023-03-15 23:38:08,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:38:08,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:38:08,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +3: [2023-03-15 23:38:08,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:38:08,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:38:08,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:38:08,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +2: [2023-03-15 23:38:08,236] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:38:08,236] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step27000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:38:08,236] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step27000 is ready now! +0: successfully saved checkpoint at iteration 27000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 248.23 +7: iteration 27010/ 37905 | consumed samples: 6914560 | consumed tokens: 14161018880 | elapsed time per iteration (s): 0.25 | learning rate: 5.491E-05 | global batch size: 256 | lm loss: 3.720450E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1010.652 | TFLOPs: 25.75 | +7: iteration 27020/ 37905 | consumed samples: 6917120 | consumed tokens: 14166261760 | elapsed time per iteration (s): 0.22 | learning rate: 5.485E-05 | global batch size: 256 | lm loss: 3.706453E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.304 | TFLOPs: 29.33 | +7: iteration 27030/ 37905 | consumed samples: 6919680 | consumed tokens: 14171504640 | elapsed time per iteration (s): 0.22 | learning rate: 5.479E-05 | global batch size: 256 | lm loss: 3.712962E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.210 | TFLOPs: 29.33 | +7: iteration 27040/ 37905 | consumed samples: 6922240 | consumed tokens: 14176747520 | elapsed time per iteration (s): 0.23 | learning rate: 5.473E-05 | global batch size: 256 | lm loss: 3.706729E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.704 | TFLOPs: 28.96 | +7: iteration 27050/ 37905 | consumed samples: 6924800 | consumed tokens: 14181990400 | elapsed time per iteration (s): 0.22 | learning rate: 5.467E-05 | global batch size: 256 | lm loss: 3.704161E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.538 | TFLOPs: 29.34 | +7: iteration 27060/ 37905 | consumed samples: 6927360 | consumed tokens: 14187233280 | elapsed time per iteration (s): 0.22 | learning rate: 5.462E-05 | global batch size: 256 | lm loss: 3.708968E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.736 | TFLOPs: 29.34 | +7: iteration 27070/ 37905 | consumed samples: 6929920 | consumed tokens: 14192476160 | elapsed time per iteration (s): 0.22 | learning rate: 5.456E-05 | global batch size: 256 | lm loss: 3.712262E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.886 | TFLOPs: 29.34 | +7: iteration 27080/ 37905 | consumed samples: 6932480 | consumed tokens: 14197719040 | elapsed time per iteration (s): 0.22 | learning rate: 5.450E-05 | global batch size: 256 | lm loss: 3.716182E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.963 | TFLOPs: 29.35 | +7: iteration 27090/ 37905 | consumed samples: 6935040 | consumed tokens: 14202961920 | elapsed time per iteration (s): 0.22 | learning rate: 5.444E-05 | global batch size: 256 | lm loss: 3.708593E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.854 | TFLOPs: 29.32 | +7: iteration 27100/ 37905 | consumed samples: 6937600 | consumed tokens: 14208204800 | elapsed time per iteration (s): 0.22 | learning rate: 5.438E-05 | global batch size: 256 | lm loss: 3.721379E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.457 | TFLOPs: 29.33 | +7: iteration 27110/ 37905 | consumed samples: 6940160 | consumed tokens: 14213447680 | elapsed time per iteration (s): 0.22 | learning rate: 5.432E-05 | global batch size: 256 | lm loss: 3.695956E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.956 | TFLOPs: 29.32 | +7: iteration 27120/ 37905 | consumed samples: 6942720 | consumed tokens: 14218690560 | elapsed time per iteration (s): 0.22 | learning rate: 5.426E-05 | global batch size: 256 | lm loss: 3.695987E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.139 | TFLOPs: 29.33 | +7: iteration 27130/ 37905 | consumed samples: 6945280 | consumed tokens: 14223933440 | elapsed time per iteration (s): 0.22 | learning rate: 5.420E-05 | global batch size: 256 | lm loss: 3.700003E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.011 | TFLOPs: 29.32 | +7: iteration 27140/ 37905 | consumed samples: 6947840 | consumed tokens: 14229176320 | elapsed time per iteration (s): 0.22 | learning rate: 5.414E-05 | global batch size: 256 | lm loss: 3.712617E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.229 | TFLOPs: 29.33 | +7: iteration 27150/ 37905 | consumed samples: 6950400 | consumed tokens: 14234419200 | elapsed time per iteration (s): 0.22 | learning rate: 5.408E-05 | global batch size: 256 | lm loss: 3.704589E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.795 | TFLOPs: 29.34 | +7: iteration 27160/ 37905 | consumed samples: 6952960 | consumed tokens: 14239662080 | elapsed time per iteration (s): 0.22 | learning rate: 5.402E-05 | global batch size: 256 | lm loss: 3.702993E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.746 | TFLOPs: 29.32 | +7: iteration 27170/ 37905 | consumed samples: 6955520 | consumed tokens: 14244904960 | elapsed time per iteration (s): 0.23 | learning rate: 5.396E-05 | global batch size: 256 | lm loss: 3.703611E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.996 | TFLOPs: 27.95 | +7: iteration 27180/ 37905 | consumed samples: 6958080 | consumed tokens: 14250147840 | elapsed time per iteration (s): 0.22 | learning rate: 5.391E-05 | global batch size: 256 | lm loss: 3.724371E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.877 | TFLOPs: 29.34 | +7: iteration 27190/ 37905 | consumed samples: 6960640 | consumed tokens: 14255390720 | elapsed time per iteration (s): 0.22 | learning rate: 5.385E-05 | global batch size: 256 | lm loss: 3.717450E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.413 | TFLOPs: 29.33 | +7: iteration 27200/ 37905 | consumed samples: 6963200 | consumed tokens: 14260633600 | elapsed time per iteration (s): 0.22 | learning rate: 5.379E-05 | global batch size: 256 | lm loss: 3.712029E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.089 | TFLOPs: 29.32 | +7: iteration 27210/ 37905 | consumed samples: 6965760 | consumed tokens: 14265876480 | elapsed time per iteration (s): 0.22 | learning rate: 5.373E-05 | global batch size: 256 | lm loss: 3.697995E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.589 | TFLOPs: 29.34 | +7: iteration 27220/ 37905 | consumed samples: 6968320 | consumed tokens: 14271119360 | elapsed time per iteration (s): 0.22 | learning rate: 5.367E-05 | global batch size: 256 | lm loss: 3.702061E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.222 | TFLOPs: 29.33 | +7: iteration 27230/ 37905 | consumed samples: 6970880 | consumed tokens: 14276362240 | elapsed time per iteration (s): 0.22 | learning rate: 5.361E-05 | global batch size: 256 | lm loss: 3.705201E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.591 | TFLOPs: 29.31 | +7: iteration 27240/ 37905 | consumed samples: 6973440 | consumed tokens: 14281605120 | elapsed time per iteration (s): 0.22 | learning rate: 5.355E-05 | global batch size: 256 | lm loss: 3.710724E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.862 | TFLOPs: 29.32 | +7: iteration 27250/ 37905 | consumed samples: 6976000 | consumed tokens: 14286848000 | elapsed time per iteration (s): 0.22 | learning rate: 5.349E-05 | global batch size: 256 | lm loss: 3.701345E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.302 | TFLOPs: 29.33 | +7: iteration 27260/ 37905 | consumed samples: 6978560 | consumed tokens: 14292090880 | elapsed time per iteration (s): 0.22 | learning rate: 5.344E-05 | global batch size: 256 | lm loss: 3.709929E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.366 | TFLOPs: 29.31 | +7: iteration 27270/ 37905 | consumed samples: 6981120 | consumed tokens: 14297333760 | elapsed time per iteration (s): 0.22 | learning rate: 5.338E-05 | global batch size: 256 | lm loss: 3.711417E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.984 | TFLOPs: 29.32 | +7: iteration 27280/ 37905 | consumed samples: 6983680 | consumed tokens: 14302576640 | elapsed time per iteration (s): 0.22 | learning rate: 5.332E-05 | global batch size: 256 | lm loss: 3.706939E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.567 | TFLOPs: 29.34 | +7: iteration 27290/ 37905 | consumed samples: 6986240 | consumed tokens: 14307819520 | elapsed time per iteration (s): 0.22 | learning rate: 5.326E-05 | global batch size: 256 | lm loss: 3.707725E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.909 | TFLOPs: 29.32 | +7: iteration 27300/ 37905 | consumed samples: 6988800 | consumed tokens: 14313062400 | elapsed time per iteration (s): 0.22 | learning rate: 5.320E-05 | global batch size: 256 | lm loss: 3.704210E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.014 | TFLOPs: 29.32 | +7: iteration 27310/ 37905 | consumed samples: 6991360 | consumed tokens: 14318305280 | elapsed time per iteration (s): 0.22 | learning rate: 5.314E-05 | global batch size: 256 | lm loss: 3.715477E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.313 | TFLOPs: 29.33 | +7: iteration 27320/ 37905 | consumed samples: 6993920 | consumed tokens: 14323548160 | elapsed time per iteration (s): 0.22 | learning rate: 5.308E-05 | global batch size: 256 | lm loss: 3.705367E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.779 | TFLOPs: 29.32 | +7: iteration 27330/ 37905 | consumed samples: 6996480 | consumed tokens: 14328791040 | elapsed time per iteration (s): 0.22 | learning rate: 5.303E-05 | global batch size: 256 | lm loss: 3.705164E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.762 | TFLOPs: 29.32 | +7: iteration 27340/ 37905 | consumed samples: 6999040 | consumed tokens: 14334033920 | elapsed time per iteration (s): 0.22 | learning rate: 5.297E-05 | global batch size: 256 | lm loss: 3.718190E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.464 | TFLOPs: 29.33 | +7: iteration 27350/ 37905 | consumed samples: 7001600 | consumed tokens: 14339276800 | elapsed time per iteration (s): 0.22 | learning rate: 5.291E-05 | global batch size: 256 | lm loss: 3.712011E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.805 | TFLOPs: 29.34 | +7: iteration 27360/ 37905 | consumed samples: 7004160 | consumed tokens: 14344519680 | elapsed time per iteration (s): 0.22 | learning rate: 5.285E-05 | global batch size: 256 | lm loss: 3.705346E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.492 | TFLOPs: 29.33 | +7: iteration 27370/ 37905 | consumed samples: 7006720 | consumed tokens: 14349762560 | elapsed time per iteration (s): 0.22 | learning rate: 5.279E-05 | global batch size: 256 | lm loss: 3.695679E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.157 | TFLOPs: 29.33 | +7: iteration 27380/ 37905 | consumed samples: 7009280 | consumed tokens: 14355005440 | elapsed time per iteration (s): 0.22 | learning rate: 5.274E-05 | global batch size: 256 | lm loss: 3.724718E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.452 | TFLOPs: 29.33 | +7: iteration 27390/ 37905 | consumed samples: 7011840 | consumed tokens: 14360248320 | elapsed time per iteration (s): 0.22 | learning rate: 5.268E-05 | global batch size: 256 | lm loss: 3.703994E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.837 | TFLOPs: 29.32 | +7: iteration 27400/ 37905 | consumed samples: 7014400 | consumed tokens: 14365491200 | elapsed time per iteration (s): 0.22 | learning rate: 5.262E-05 | global batch size: 256 | lm loss: 3.721861E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.972 | TFLOPs: 29.32 | +7: iteration 27410/ 37905 | consumed samples: 7016960 | consumed tokens: 14370734080 | elapsed time per iteration (s): 0.22 | learning rate: 5.256E-05 | global batch size: 256 | lm loss: 3.716396E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.123 | TFLOPs: 29.33 | +7: iteration 27420/ 37905 | consumed samples: 7019520 | consumed tokens: 14375976960 | elapsed time per iteration (s): 0.22 | learning rate: 5.250E-05 | global batch size: 256 | lm loss: 3.718708E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.212 | TFLOPs: 29.33 | +7: iteration 27430/ 37905 | consumed samples: 7022080 | consumed tokens: 14381219840 | elapsed time per iteration (s): 0.22 | learning rate: 5.245E-05 | global batch size: 256 | lm loss: 3.705695E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.757 | TFLOPs: 29.32 | +7: iteration 27440/ 37905 | consumed samples: 7024640 | consumed tokens: 14386462720 | elapsed time per iteration (s): 0.22 | learning rate: 5.239E-05 | global batch size: 256 | lm loss: 3.695174E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.889 | TFLOPs: 29.32 | +7: iteration 27450/ 37905 | consumed samples: 7027200 | consumed tokens: 14391705600 | elapsed time per iteration (s): 0.22 | learning rate: 5.233E-05 | global batch size: 256 | lm loss: 3.711849E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.388 | TFLOPs: 29.31 | +7: iteration 27460/ 37905 | consumed samples: 7029760 | consumed tokens: 14396948480 | elapsed time per iteration (s): 0.22 | learning rate: 5.227E-05 | global batch size: 256 | lm loss: 3.709025E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.995 | TFLOPs: 29.32 | +7: iteration 27470/ 37905 | consumed samples: 7032320 | consumed tokens: 14402191360 | elapsed time per iteration (s): 0.22 | learning rate: 5.221E-05 | global batch size: 256 | lm loss: 3.699218E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.421 | TFLOPs: 29.31 | +7: iteration 27480/ 37905 | consumed samples: 7034880 | consumed tokens: 14407434240 | elapsed time per iteration (s): 0.22 | learning rate: 5.216E-05 | global batch size: 256 | lm loss: 3.721766E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.340 | TFLOPs: 29.33 | +7: iteration 27490/ 37905 | consumed samples: 7037440 | consumed tokens: 14412677120 | elapsed time per iteration (s): 0.22 | learning rate: 5.210E-05 | global batch size: 256 | lm loss: 3.702717E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.759 | TFLOPs: 29.34 | +7: iteration 27500/ 37905 | consumed samples: 7040000 | consumed tokens: 14417920000 | elapsed time per iteration (s): 0.22 | learning rate: 5.204E-05 | global batch size: 256 | lm loss: 3.705904E+00 | grad norm: 0.232 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.351 | TFLOPs: 29.36 | +7: iteration 27510/ 37905 | consumed samples: 7042560 | consumed tokens: 14423162880 | elapsed time per iteration (s): 0.22 | learning rate: 5.198E-05 | global batch size: 256 | lm loss: 3.719674E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.964 | TFLOPs: 29.40 | +7: iteration 27520/ 37905 | consumed samples: 7045120 | consumed tokens: 14428405760 | elapsed time per iteration (s): 0.22 | learning rate: 5.193E-05 | global batch size: 256 | lm loss: 3.704834E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.371 | TFLOPs: 29.41 | +7: iteration 27530/ 37905 | consumed samples: 7047680 | consumed tokens: 14433648640 | elapsed time per iteration (s): 0.22 | learning rate: 5.187E-05 | global batch size: 256 | lm loss: 3.696016E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.859 | TFLOPs: 29.42 | +7: iteration 27540/ 37905 | consumed samples: 7050240 | consumed tokens: 14438891520 | elapsed time per iteration (s): 0.22 | learning rate: 5.181E-05 | global batch size: 256 | lm loss: 3.708984E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.390 | TFLOPs: 29.41 | +7: iteration 27550/ 37905 | consumed samples: 7052800 | consumed tokens: 14444134400 | elapsed time per iteration (s): 0.22 | learning rate: 5.175E-05 | global batch size: 256 | lm loss: 3.699221E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.089 | TFLOPs: 29.38 | +7: iteration 27560/ 37905 | consumed samples: 7055360 | consumed tokens: 14449377280 | elapsed time per iteration (s): 0.22 | learning rate: 5.170E-05 | global batch size: 256 | lm loss: 3.697454E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.724 | TFLOPs: 29.34 | +7: iteration 27570/ 37905 | consumed samples: 7057920 | consumed tokens: 14454620160 | elapsed time per iteration (s): 0.22 | learning rate: 5.164E-05 | global batch size: 256 | lm loss: 3.701990E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.555 | TFLOPs: 29.36 | +7: iteration 27580/ 37905 | consumed samples: 7060480 | consumed tokens: 14459863040 | elapsed time per iteration (s): 0.22 | learning rate: 5.158E-05 | global batch size: 256 | lm loss: 3.711133E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.564 | TFLOPs: 29.36 | +7: iteration 27590/ 37905 | consumed samples: 7063040 | consumed tokens: 14465105920 | elapsed time per iteration (s): 0.22 | learning rate: 5.152E-05 | global batch size: 256 | lm loss: 3.705157E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.923 | TFLOPs: 29.35 | +7: iteration 27600/ 37905 | consumed samples: 7065600 | consumed tokens: 14470348800 | elapsed time per iteration (s): 0.22 | learning rate: 5.147E-05 | global batch size: 256 | lm loss: 3.709409E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.916 | TFLOPs: 29.35 | +7: iteration 27610/ 37905 | consumed samples: 7068160 | consumed tokens: 14475591680 | elapsed time per iteration (s): 0.22 | learning rate: 5.141E-05 | global batch size: 256 | lm loss: 3.702011E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.765 | TFLOPs: 29.37 | +7: iteration 27620/ 37905 | consumed samples: 7070720 | consumed tokens: 14480834560 | elapsed time per iteration (s): 0.22 | learning rate: 5.135E-05 | global batch size: 256 | lm loss: 3.722553E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.613 | TFLOPs: 29.34 | +7: iteration 27630/ 37905 | consumed samples: 7073280 | consumed tokens: 14486077440 | elapsed time per iteration (s): 0.22 | learning rate: 5.129E-05 | global batch size: 256 | lm loss: 3.718634E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.935 | TFLOPs: 29.40 | +7: iteration 27640/ 37905 | consumed samples: 7075840 | consumed tokens: 14491320320 | elapsed time per iteration (s): 0.22 | learning rate: 5.124E-05 | global batch size: 256 | lm loss: 3.700413E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.129 | TFLOPs: 29.40 | +7: iteration 27650/ 37905 | consumed samples: 7078400 | consumed tokens: 14496563200 | elapsed time per iteration (s): 0.22 | learning rate: 5.118E-05 | global batch size: 256 | lm loss: 3.714148E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.291 | TFLOPs: 29.38 | +7: iteration 27660/ 37905 | consumed samples: 7080960 | consumed tokens: 14501806080 | elapsed time per iteration (s): 0.22 | learning rate: 5.112E-05 | global batch size: 256 | lm loss: 3.708720E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.501 | TFLOPs: 29.36 | +7: iteration 27670/ 37905 | consumed samples: 7083520 | consumed tokens: 14507048960 | elapsed time per iteration (s): 0.22 | learning rate: 5.107E-05 | global batch size: 256 | lm loss: 3.706584E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.012 | TFLOPs: 29.37 | +7: iteration 27680/ 37905 | consumed samples: 7086080 | consumed tokens: 14512291840 | elapsed time per iteration (s): 0.22 | learning rate: 5.101E-05 | global batch size: 256 | lm loss: 3.731487E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.913 | TFLOPs: 29.24 | +7: iteration 27690/ 37905 | consumed samples: 7088640 | consumed tokens: 14517534720 | elapsed time per iteration (s): 0.22 | learning rate: 5.095E-05 | global batch size: 256 | lm loss: 3.709614E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.241 | TFLOPs: 29.38 | +7: iteration 27700/ 37905 | consumed samples: 7091200 | consumed tokens: 14522777600 | elapsed time per iteration (s): 0.22 | learning rate: 5.090E-05 | global batch size: 256 | lm loss: 3.713424E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.236 | TFLOPs: 29.40 | +7: iteration 27710/ 37905 | consumed samples: 7093760 | consumed tokens: 14528020480 | elapsed time per iteration (s): 0.22 | learning rate: 5.084E-05 | global batch size: 256 | lm loss: 3.719562E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.264 | TFLOPs: 29.43 | +7: iteration 27720/ 37905 | consumed samples: 7096320 | consumed tokens: 14533263360 | elapsed time per iteration (s): 0.22 | learning rate: 5.078E-05 | global batch size: 256 | lm loss: 3.692506E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.944 | TFLOPs: 29.42 | +7: iteration 27730/ 37905 | consumed samples: 7098880 | consumed tokens: 14538506240 | elapsed time per iteration (s): 0.22 | learning rate: 5.073E-05 | global batch size: 256 | lm loss: 3.701285E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.996 | TFLOPs: 29.42 | +7: iteration 27740/ 37905 | consumed samples: 7101440 | consumed tokens: 14543749120 | elapsed time per iteration (s): 0.22 | learning rate: 5.067E-05 | global batch size: 256 | lm loss: 3.699478E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.041 | TFLOPs: 29.42 | +7: iteration 27750/ 37905 | consumed samples: 7104000 | consumed tokens: 14548992000 | elapsed time per iteration (s): 0.22 | learning rate: 5.061E-05 | global batch size: 256 | lm loss: 3.714134E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.951 | TFLOPs: 29.42 | +7: iteration 27760/ 37905 | consumed samples: 7106560 | consumed tokens: 14554234880 | elapsed time per iteration (s): 0.22 | learning rate: 5.056E-05 | global batch size: 256 | lm loss: 3.722964E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.445 | TFLOPs: 29.41 | +7: iteration 27770/ 37905 | consumed samples: 7109120 | consumed tokens: 14559477760 | elapsed time per iteration (s): 0.22 | learning rate: 5.050E-05 | global batch size: 256 | lm loss: 3.695593E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.538 | TFLOPs: 29.18 | +7: iteration 27780/ 37905 | consumed samples: 7111680 | consumed tokens: 14564720640 | elapsed time per iteration (s): 0.22 | learning rate: 5.044E-05 | global batch size: 256 | lm loss: 3.712082E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.928 | TFLOPs: 29.37 | +7: iteration 27790/ 37905 | consumed samples: 7114240 | consumed tokens: 14569963520 | elapsed time per iteration (s): 0.22 | learning rate: 5.039E-05 | global batch size: 256 | lm loss: 3.713832E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.082 | TFLOPs: 29.43 | +7: iteration 27800/ 37905 | consumed samples: 7116800 | consumed tokens: 14575206400 | elapsed time per iteration (s): 0.22 | learning rate: 5.033E-05 | global batch size: 256 | lm loss: 3.709922E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.165 | TFLOPs: 29.43 | +7: iteration 27810/ 37905 | consumed samples: 7119360 | consumed tokens: 14580449280 | elapsed time per iteration (s): 0.22 | learning rate: 5.027E-05 | global batch size: 256 | lm loss: 3.712635E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.894 | TFLOPs: 29.42 | +7: iteration 27820/ 37905 | consumed samples: 7121920 | consumed tokens: 14585692160 | elapsed time per iteration (s): 0.22 | learning rate: 5.022E-05 | global batch size: 256 | lm loss: 3.708480E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.025 | TFLOPs: 29.42 | +7: iteration 27830/ 37905 | consumed samples: 7124480 | consumed tokens: 14590935040 | elapsed time per iteration (s): 0.22 | learning rate: 5.016E-05 | global batch size: 256 | lm loss: 3.695370E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.609 | TFLOPs: 29.44 | +7: iteration 27840/ 37905 | consumed samples: 7127040 | consumed tokens: 14596177920 | elapsed time per iteration (s): 0.22 | learning rate: 5.010E-05 | global batch size: 256 | lm loss: 3.710533E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.554 | TFLOPs: 29.41 | +7: iteration 27850/ 37905 | consumed samples: 7129600 | consumed tokens: 14601420800 | elapsed time per iteration (s): 0.22 | learning rate: 5.005E-05 | global batch size: 256 | lm loss: 3.695413E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.529 | TFLOPs: 29.39 | +7: iteration 27860/ 37905 | consumed samples: 7132160 | consumed tokens: 14606663680 | elapsed time per iteration (s): 0.22 | learning rate: 4.999E-05 | global batch size: 256 | lm loss: 3.709149E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.418 | TFLOPs: 29.38 | +7: iteration 27870/ 37905 | consumed samples: 7134720 | consumed tokens: 14611906560 | elapsed time per iteration (s): 0.22 | learning rate: 4.994E-05 | global batch size: 256 | lm loss: 3.720013E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.552 | TFLOPs: 29.39 | +7: iteration 27880/ 37905 | consumed samples: 7137280 | consumed tokens: 14617149440 | elapsed time per iteration (s): 0.22 | learning rate: 4.988E-05 | global batch size: 256 | lm loss: 3.694428E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.491 | TFLOPs: 29.39 | +7: iteration 27890/ 37905 | consumed samples: 7139840 | consumed tokens: 14622392320 | elapsed time per iteration (s): 0.22 | learning rate: 4.982E-05 | global batch size: 256 | lm loss: 3.700532E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.054 | TFLOPs: 29.37 | +7: iteration 27900/ 37905 | consumed samples: 7142400 | consumed tokens: 14627635200 | elapsed time per iteration (s): 0.22 | learning rate: 4.977E-05 | global batch size: 256 | lm loss: 3.715335E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.425 | TFLOPs: 29.38 | +7: iteration 27910/ 37905 | consumed samples: 7144960 | consumed tokens: 14632878080 | elapsed time per iteration (s): 0.22 | learning rate: 4.971E-05 | global batch size: 256 | lm loss: 3.704505E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.572 | TFLOPs: 29.39 | +7: iteration 27920/ 37905 | consumed samples: 7147520 | consumed tokens: 14638120960 | elapsed time per iteration (s): 0.22 | learning rate: 4.966E-05 | global batch size: 256 | lm loss: 3.697942E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.604 | TFLOPs: 29.39 | +7: iteration 27930/ 37905 | consumed samples: 7150080 | consumed tokens: 14643363840 | elapsed time per iteration (s): 0.22 | learning rate: 4.960E-05 | global batch size: 256 | lm loss: 3.709412E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.188 | TFLOPs: 29.35 | +7: iteration 27940/ 37905 | consumed samples: 7152640 | consumed tokens: 14648606720 | elapsed time per iteration (s): 0.22 | learning rate: 4.954E-05 | global batch size: 256 | lm loss: 3.716013E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.484 | TFLOPs: 29.39 | +7: iteration 27950/ 37905 | consumed samples: 7155200 | consumed tokens: 14653849600 | elapsed time per iteration (s): 0.22 | learning rate: 4.949E-05 | global batch size: 256 | lm loss: 3.700555E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.798 | TFLOPs: 29.37 | +7: iteration 27960/ 37905 | consumed samples: 7157760 | consumed tokens: 14659092480 | elapsed time per iteration (s): 0.22 | learning rate: 4.943E-05 | global batch size: 256 | lm loss: 3.712679E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.801 | TFLOPs: 29.37 | +7: iteration 27970/ 37905 | consumed samples: 7160320 | consumed tokens: 14664335360 | elapsed time per iteration (s): 0.22 | learning rate: 4.938E-05 | global batch size: 256 | lm loss: 3.691704E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.879 | TFLOPs: 29.37 | +7: iteration 27980/ 37905 | consumed samples: 7162880 | consumed tokens: 14669578240 | elapsed time per iteration (s): 0.22 | learning rate: 4.932E-05 | global batch size: 256 | lm loss: 3.719032E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.404 | TFLOPs: 29.31 | +7: iteration 27990/ 37905 | consumed samples: 7165440 | consumed tokens: 14674821120 | elapsed time per iteration (s): 0.22 | learning rate: 4.927E-05 | global batch size: 256 | lm loss: 3.710036E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.706 | TFLOPs: 29.31 | +0: [2023-03-15 23:41:50,552] [INFO] [logging.py:68:log_dist] [Rank 0] step=28000, skipped=0, lr=[4.9210099955882125e-05, 4.9210099955882125e-05, 4.9210099955882125e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 28000/ 37905 | consumed samples: 7168000 | consumed tokens: 14680064000 | elapsed time per iteration (s): 0.22 | learning rate: 4.921E-05 | global batch size: 256 | lm loss: 3.692842E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.925 | TFLOPs: 29.35 | +0: steps: 28000 loss: 3.6588 iter time (s): 0.221 samples/sec: 1158.267 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 28000 | lm loss value: 3.671027E+00 | lm loss PPL: 3.929223E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 28000 to checkpoints_83m20b400m +0: [2023-03-15 23:41:50,641] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step28000 is begin to save! +0: [2023-03-15 23:41:50,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:41:50,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:41:50,715] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:41:50,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:41:50,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:41:50,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:41:50,740] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:41:50,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:41:50,751] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:41:50,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:41:50,762] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:41:50,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:41:50,774] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:41:50,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:41:50,785] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:41:50,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:41:50,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:41:50,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:41:50,807] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:41:50,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:41:50,818] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:41:50,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:41:50,829] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:41:50,830] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:41:50,831] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step28000/mp_rank_00_model_states.pt +0: [2023-03-15 23:41:50,831] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:41:50,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:41:50,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:41:50,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:41:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,869] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,869] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,871] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,871] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:41:50,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:41:50,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:41:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:41:50,875] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,875] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +5: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +7: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +5: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,877] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,877] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +6: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:41:50,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +1: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +2: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:41:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-15 23:41:50,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step28000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +7: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +4: [2023-03-15 23:41:50,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step28000 is ready now! +0: successfully saved checkpoint at iteration 28000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.53 +7: iteration 28010/ 37905 | consumed samples: 7170560 | consumed tokens: 14685306880 | elapsed time per iteration (s): 0.25 | learning rate: 4.915E-05 | global batch size: 256 | lm loss: 3.711611E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1008.233 | TFLOPs: 25.68 | +7: iteration 28020/ 37905 | consumed samples: 7173120 | consumed tokens: 14690549760 | elapsed time per iteration (s): 0.22 | learning rate: 4.910E-05 | global batch size: 256 | lm loss: 3.708319E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.884 | TFLOPs: 29.42 | +7: iteration 28030/ 37905 | consumed samples: 7175680 | consumed tokens: 14695792640 | elapsed time per iteration (s): 0.22 | learning rate: 4.904E-05 | global batch size: 256 | lm loss: 3.714707E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.079 | TFLOPs: 29.43 | +7: iteration 28040/ 37905 | consumed samples: 7178240 | consumed tokens: 14701035520 | elapsed time per iteration (s): 0.22 | learning rate: 4.899E-05 | global batch size: 256 | lm loss: 3.710577E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.611 | TFLOPs: 29.41 | +7: iteration 28050/ 37905 | consumed samples: 7180800 | consumed tokens: 14706278400 | elapsed time per iteration (s): 0.22 | learning rate: 4.893E-05 | global batch size: 256 | lm loss: 3.704778E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.750 | TFLOPs: 29.42 | +7: iteration 28060/ 37905 | consumed samples: 7183360 | consumed tokens: 14711521280 | elapsed time per iteration (s): 0.22 | learning rate: 4.888E-05 | global batch size: 256 | lm loss: 3.712127E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.435 | TFLOPs: 29.41 | +7: iteration 28070/ 37905 | consumed samples: 7185920 | consumed tokens: 14716764160 | elapsed time per iteration (s): 0.22 | learning rate: 4.882E-05 | global batch size: 256 | lm loss: 3.728531E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.616 | TFLOPs: 29.41 | +7: iteration 28080/ 37905 | consumed samples: 7188480 | consumed tokens: 14722007040 | elapsed time per iteration (s): 0.22 | learning rate: 4.877E-05 | global batch size: 256 | lm loss: 3.708839E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.891 | TFLOPs: 29.42 | +7: iteration 28090/ 37905 | consumed samples: 7191040 | consumed tokens: 14727249920 | elapsed time per iteration (s): 0.22 | learning rate: 4.871E-05 | global batch size: 256 | lm loss: 3.706712E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.405 | TFLOPs: 29.41 | +7: iteration 28100/ 37905 | consumed samples: 7193600 | consumed tokens: 14732492800 | elapsed time per iteration (s): 0.22 | learning rate: 4.866E-05 | global batch size: 256 | lm loss: 3.705644E+00 | grad norm: 0.242 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.489 | TFLOPs: 29.41 | +7: iteration 28110/ 37905 | consumed samples: 7196160 | consumed tokens: 14737735680 | elapsed time per iteration (s): 0.22 | learning rate: 4.860E-05 | global batch size: 256 | lm loss: 3.705898E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.682 | TFLOPs: 29.42 | +7: iteration 28120/ 37905 | consumed samples: 7198720 | consumed tokens: 14742978560 | elapsed time per iteration (s): 0.22 | learning rate: 4.855E-05 | global batch size: 256 | lm loss: 3.703982E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.657 | TFLOPs: 29.44 | +7: iteration 28130/ 37905 | consumed samples: 7201280 | consumed tokens: 14748221440 | elapsed time per iteration (s): 0.22 | learning rate: 4.849E-05 | global batch size: 256 | lm loss: 3.709703E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.455 | TFLOPs: 29.41 | +7: iteration 28140/ 37905 | consumed samples: 7203840 | consumed tokens: 14753464320 | elapsed time per iteration (s): 0.22 | learning rate: 4.844E-05 | global batch size: 256 | lm loss: 3.705927E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.847 | TFLOPs: 29.19 | +7: iteration 28150/ 37905 | consumed samples: 7206400 | consumed tokens: 14758707200 | elapsed time per iteration (s): 0.22 | learning rate: 4.838E-05 | global batch size: 256 | lm loss: 3.700172E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.428 | TFLOPs: 29.43 | +7: iteration 28160/ 37905 | consumed samples: 7208960 | consumed tokens: 14763950080 | elapsed time per iteration (s): 0.22 | learning rate: 4.833E-05 | global batch size: 256 | lm loss: 3.708110E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.697 | TFLOPs: 29.44 | +7: iteration 28170/ 37905 | consumed samples: 7211520 | consumed tokens: 14769192960 | elapsed time per iteration (s): 0.22 | learning rate: 4.827E-05 | global batch size: 256 | lm loss: 3.713804E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.515 | TFLOPs: 29.44 | +7: iteration 28180/ 37905 | consumed samples: 7214080 | consumed tokens: 14774435840 | elapsed time per iteration (s): 0.22 | learning rate: 4.822E-05 | global batch size: 256 | lm loss: 3.708386E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.627 | TFLOPs: 29.44 | +7: iteration 28190/ 37905 | consumed samples: 7216640 | consumed tokens: 14779678720 | elapsed time per iteration (s): 0.22 | learning rate: 4.816E-05 | global batch size: 256 | lm loss: 3.692228E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.366 | TFLOPs: 29.43 | +7: iteration 28200/ 37905 | consumed samples: 7219200 | consumed tokens: 14784921600 | elapsed time per iteration (s): 0.22 | learning rate: 4.811E-05 | global batch size: 256 | lm loss: 3.702901E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.149 | TFLOPs: 29.43 | +7: iteration 28210/ 37905 | consumed samples: 7221760 | consumed tokens: 14790164480 | elapsed time per iteration (s): 0.22 | learning rate: 4.805E-05 | global batch size: 256 | lm loss: 3.702298E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.687 | TFLOPs: 29.44 | +7: iteration 28220/ 37905 | consumed samples: 7224320 | consumed tokens: 14795407360 | elapsed time per iteration (s): 0.22 | learning rate: 4.800E-05 | global batch size: 256 | lm loss: 3.703064E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.525 | TFLOPs: 29.44 | +7: iteration 28230/ 37905 | consumed samples: 7226880 | consumed tokens: 14800650240 | elapsed time per iteration (s): 0.22 | learning rate: 4.794E-05 | global batch size: 256 | lm loss: 3.713990E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.428 | TFLOPs: 29.43 | +7: iteration 28240/ 37905 | consumed samples: 7229440 | consumed tokens: 14805893120 | elapsed time per iteration (s): 0.22 | learning rate: 4.789E-05 | global batch size: 256 | lm loss: 3.701595E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.908 | TFLOPs: 29.42 | +7: iteration 28250/ 37905 | consumed samples: 7232000 | consumed tokens: 14811136000 | elapsed time per iteration (s): 0.22 | learning rate: 4.783E-05 | global batch size: 256 | lm loss: 3.699838E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.827 | TFLOPs: 28.99 | +7: iteration 28260/ 37905 | consumed samples: 7234560 | consumed tokens: 14816378880 | elapsed time per iteration (s): 0.22 | learning rate: 4.778E-05 | global batch size: 256 | lm loss: 3.710647E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.438 | TFLOPs: 29.43 | +7: iteration 28270/ 37905 | consumed samples: 7237120 | consumed tokens: 14821621760 | elapsed time per iteration (s): 0.22 | learning rate: 4.773E-05 | global batch size: 256 | lm loss: 3.705221E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.077 | TFLOPs: 29.43 | +7: iteration 28280/ 37905 | consumed samples: 7239680 | consumed tokens: 14826864640 | elapsed time per iteration (s): 0.22 | learning rate: 4.767E-05 | global batch size: 256 | lm loss: 3.718202E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.704 | TFLOPs: 29.42 | +7: iteration 28290/ 37905 | consumed samples: 7242240 | consumed tokens: 14832107520 | elapsed time per iteration (s): 0.22 | learning rate: 4.762E-05 | global batch size: 256 | lm loss: 3.709156E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.163 | TFLOPs: 29.43 | +7: iteration 28300/ 37905 | consumed samples: 7244800 | consumed tokens: 14837350400 | elapsed time per iteration (s): 0.22 | learning rate: 4.756E-05 | global batch size: 256 | lm loss: 3.699529E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.941 | TFLOPs: 29.42 | +7: iteration 28310/ 37905 | consumed samples: 7247360 | consumed tokens: 14842593280 | elapsed time per iteration (s): 0.22 | learning rate: 4.751E-05 | global batch size: 256 | lm loss: 3.708557E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.291 | TFLOPs: 29.41 | +7: iteration 28320/ 37905 | consumed samples: 7249920 | consumed tokens: 14847836160 | elapsed time per iteration (s): 0.22 | learning rate: 4.745E-05 | global batch size: 256 | lm loss: 3.713001E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.103 | TFLOPs: 29.43 | +7: iteration 28330/ 37905 | consumed samples: 7252480 | consumed tokens: 14853079040 | elapsed time per iteration (s): 0.22 | learning rate: 4.740E-05 | global batch size: 256 | lm loss: 3.706796E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.577 | TFLOPs: 29.41 | +7: iteration 28340/ 37905 | consumed samples: 7255040 | consumed tokens: 14858321920 | elapsed time per iteration (s): 0.22 | learning rate: 4.735E-05 | global batch size: 256 | lm loss: 3.711458E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.827 | TFLOPs: 29.42 | +7: iteration 28350/ 37905 | consumed samples: 7257600 | consumed tokens: 14863564800 | elapsed time per iteration (s): 0.22 | learning rate: 4.729E-05 | global batch size: 256 | lm loss: 3.719184E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.557 | TFLOPs: 29.41 | +7: iteration 28360/ 37905 | consumed samples: 7260160 | consumed tokens: 14868807680 | elapsed time per iteration (s): 0.22 | learning rate: 4.724E-05 | global batch size: 256 | lm loss: 3.704546E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.729 | TFLOPs: 29.42 | +7: iteration 28370/ 37905 | consumed samples: 7262720 | consumed tokens: 14874050560 | elapsed time per iteration (s): 0.22 | learning rate: 4.718E-05 | global batch size: 256 | lm loss: 3.698595E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.600 | TFLOPs: 29.44 | +7: iteration 28380/ 37905 | consumed samples: 7265280 | consumed tokens: 14879293440 | elapsed time per iteration (s): 0.22 | learning rate: 4.713E-05 | global batch size: 256 | lm loss: 3.707873E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.133 | TFLOPs: 29.43 | +7: iteration 28390/ 37905 | consumed samples: 7267840 | consumed tokens: 14884536320 | elapsed time per iteration (s): 0.22 | learning rate: 4.708E-05 | global batch size: 256 | lm loss: 3.709052E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.266 | TFLOPs: 29.10 | +7: iteration 28400/ 37905 | consumed samples: 7270400 | consumed tokens: 14889779200 | elapsed time per iteration (s): 0.23 | learning rate: 4.702E-05 | global batch size: 256 | lm loss: 3.707920E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.459 | TFLOPs: 28.59 | +7: iteration 28410/ 37905 | consumed samples: 7272960 | consumed tokens: 14895022080 | elapsed time per iteration (s): 0.22 | learning rate: 4.697E-05 | global batch size: 256 | lm loss: 3.711908E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.598 | TFLOPs: 29.41 | +7: iteration 28420/ 37905 | consumed samples: 7275520 | consumed tokens: 14900264960 | elapsed time per iteration (s): 0.22 | learning rate: 4.691E-05 | global batch size: 256 | lm loss: 3.696794E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.921 | TFLOPs: 29.42 | +7: iteration 28430/ 37905 | consumed samples: 7278080 | consumed tokens: 14905507840 | elapsed time per iteration (s): 0.22 | learning rate: 4.686E-05 | global batch size: 256 | lm loss: 3.688716E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.750 | TFLOPs: 29.39 | +7: iteration 28440/ 37905 | consumed samples: 7280640 | consumed tokens: 14910750720 | elapsed time per iteration (s): 0.22 | learning rate: 4.681E-05 | global batch size: 256 | lm loss: 3.710934E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.429 | TFLOPs: 29.38 | +7: iteration 28450/ 37905 | consumed samples: 7283200 | consumed tokens: 14915993600 | elapsed time per iteration (s): 0.22 | learning rate: 4.675E-05 | global batch size: 256 | lm loss: 3.700618E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.746 | TFLOPs: 29.39 | +7: iteration 28460/ 37905 | consumed samples: 7285760 | consumed tokens: 14921236480 | elapsed time per iteration (s): 0.22 | learning rate: 4.670E-05 | global batch size: 256 | lm loss: 3.696192E+00 | grad norm: 0.230 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.873 | TFLOPs: 29.11 | +7: iteration 28470/ 37905 | consumed samples: 7288320 | consumed tokens: 14926479360 | elapsed time per iteration (s): 0.22 | learning rate: 4.665E-05 | global batch size: 256 | lm loss: 3.693166E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.478 | TFLOPs: 29.31 | +7: iteration 28480/ 37905 | consumed samples: 7290880 | consumed tokens: 14931722240 | elapsed time per iteration (s): 0.22 | learning rate: 4.659E-05 | global batch size: 256 | lm loss: 3.681370E+00 | grad norm: 0.237 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.818 | TFLOPs: 29.32 | +7: iteration 28490/ 37905 | consumed samples: 7293440 | consumed tokens: 14936965120 | elapsed time per iteration (s): 0.22 | learning rate: 4.654E-05 | global batch size: 256 | lm loss: 3.679089E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.350 | TFLOPs: 29.33 | +7: iteration 28500/ 37905 | consumed samples: 7296000 | consumed tokens: 14942208000 | elapsed time per iteration (s): 0.22 | learning rate: 4.649E-05 | global batch size: 256 | lm loss: 3.687425E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.442 | TFLOPs: 29.38 | +7: iteration 28510/ 37905 | consumed samples: 7298560 | consumed tokens: 14947450880 | elapsed time per iteration (s): 0.22 | learning rate: 4.643E-05 | global batch size: 256 | lm loss: 3.692960E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.089 | TFLOPs: 29.35 | +7: iteration 28520/ 37905 | consumed samples: 7301120 | consumed tokens: 14952693760 | elapsed time per iteration (s): 0.22 | learning rate: 4.638E-05 | global batch size: 256 | lm loss: 3.704972E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.142 | TFLOPs: 29.38 | +7: iteration 28530/ 37905 | consumed samples: 7303680 | consumed tokens: 14957936640 | elapsed time per iteration (s): 0.22 | learning rate: 4.633E-05 | global batch size: 256 | lm loss: 3.687564E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.498 | TFLOPs: 29.36 | +7: iteration 28540/ 37905 | consumed samples: 7306240 | consumed tokens: 14963179520 | elapsed time per iteration (s): 0.22 | learning rate: 4.627E-05 | global batch size: 256 | lm loss: 3.704290E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.739 | TFLOPs: 29.37 | +7: iteration 28550/ 37905 | consumed samples: 7308800 | consumed tokens: 14968422400 | elapsed time per iteration (s): 0.22 | learning rate: 4.622E-05 | global batch size: 256 | lm loss: 3.716817E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.530 | TFLOPs: 29.36 | +7: iteration 28560/ 37905 | consumed samples: 7311360 | consumed tokens: 14973665280 | elapsed time per iteration (s): 0.22 | learning rate: 4.617E-05 | global batch size: 256 | lm loss: 3.689042E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.019 | TFLOPs: 29.35 | +7: iteration 28570/ 37905 | consumed samples: 7313920 | consumed tokens: 14978908160 | elapsed time per iteration (s): 0.22 | learning rate: 4.611E-05 | global batch size: 256 | lm loss: 3.700241E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.923 | TFLOPs: 29.35 | +7: iteration 28580/ 37905 | consumed samples: 7316480 | consumed tokens: 14984151040 | elapsed time per iteration (s): 0.22 | learning rate: 4.606E-05 | global batch size: 256 | lm loss: 3.707794E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.310 | TFLOPs: 29.33 | +7: iteration 28590/ 37905 | consumed samples: 7319040 | consumed tokens: 14989393920 | elapsed time per iteration (s): 0.22 | learning rate: 4.601E-05 | global batch size: 256 | lm loss: 3.703591E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.821 | TFLOPs: 29.37 | +7: iteration 28600/ 37905 | consumed samples: 7321600 | consumed tokens: 14994636800 | elapsed time per iteration (s): 0.22 | learning rate: 4.595E-05 | global batch size: 256 | lm loss: 3.704898E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.298 | TFLOPs: 29.33 | +7: iteration 28610/ 37905 | consumed samples: 7324160 | consumed tokens: 14999879680 | elapsed time per iteration (s): 0.22 | learning rate: 4.590E-05 | global batch size: 256 | lm loss: 3.724495E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.736 | TFLOPs: 29.29 | +7: iteration 28620/ 37905 | consumed samples: 7326720 | consumed tokens: 15005122560 | elapsed time per iteration (s): 0.22 | learning rate: 4.585E-05 | global batch size: 256 | lm loss: 3.696010E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.916 | TFLOPs: 29.27 | +7: iteration 28630/ 37905 | consumed samples: 7329280 | consumed tokens: 15010365440 | elapsed time per iteration (s): 0.22 | learning rate: 4.580E-05 | global batch size: 256 | lm loss: 3.696147E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.776 | TFLOPs: 29.29 | +7: iteration 28640/ 37905 | consumed samples: 7331840 | consumed tokens: 15015608320 | elapsed time per iteration (s): 0.22 | learning rate: 4.574E-05 | global batch size: 256 | lm loss: 3.712881E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.180 | TFLOPs: 29.28 | +7: iteration 28650/ 37905 | consumed samples: 7334400 | consumed tokens: 15020851200 | elapsed time per iteration (s): 0.22 | learning rate: 4.569E-05 | global batch size: 256 | lm loss: 3.706568E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.949 | TFLOPs: 29.27 | +7: iteration 28660/ 37905 | consumed samples: 7336960 | consumed tokens: 15026094080 | elapsed time per iteration (s): 0.22 | learning rate: 4.564E-05 | global batch size: 256 | lm loss: 3.707586E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.698 | TFLOPs: 29.29 | +7: iteration 28670/ 37905 | consumed samples: 7339520 | consumed tokens: 15031336960 | elapsed time per iteration (s): 0.22 | learning rate: 4.559E-05 | global batch size: 256 | lm loss: 3.713429E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.898 | TFLOPs: 29.29 | +7: iteration 28680/ 37905 | consumed samples: 7342080 | consumed tokens: 15036579840 | elapsed time per iteration (s): 0.22 | learning rate: 4.553E-05 | global batch size: 256 | lm loss: 3.707592E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.302 | TFLOPs: 29.30 | +7: iteration 28690/ 37905 | consumed samples: 7344640 | consumed tokens: 15041822720 | elapsed time per iteration (s): 0.22 | learning rate: 4.548E-05 | global batch size: 256 | lm loss: 3.713284E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.512 | TFLOPs: 29.28 | +7: iteration 28700/ 37905 | consumed samples: 7347200 | consumed tokens: 15047065600 | elapsed time per iteration (s): 0.22 | learning rate: 4.543E-05 | global batch size: 256 | lm loss: 3.703922E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.432 | TFLOPs: 29.28 | +7: iteration 28710/ 37905 | consumed samples: 7349760 | consumed tokens: 15052308480 | elapsed time per iteration (s): 0.22 | learning rate: 4.537E-05 | global batch size: 256 | lm loss: 3.694921E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.667 | TFLOPs: 29.26 | +7: iteration 28720/ 37905 | consumed samples: 7352320 | consumed tokens: 15057551360 | elapsed time per iteration (s): 0.22 | learning rate: 4.532E-05 | global batch size: 256 | lm loss: 3.713668E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.132 | TFLOPs: 29.27 | +7: iteration 28730/ 37905 | consumed samples: 7354880 | consumed tokens: 15062794240 | elapsed time per iteration (s): 0.22 | learning rate: 4.527E-05 | global batch size: 256 | lm loss: 3.715353E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.036 | TFLOPs: 29.27 | +7: iteration 28740/ 37905 | consumed samples: 7357440 | consumed tokens: 15068037120 | elapsed time per iteration (s): 0.22 | learning rate: 4.522E-05 | global batch size: 256 | lm loss: 3.718177E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.094 | TFLOPs: 29.30 | +7: iteration 28750/ 37905 | consumed samples: 7360000 | consumed tokens: 15073280000 | elapsed time per iteration (s): 0.22 | learning rate: 4.517E-05 | global batch size: 256 | lm loss: 3.717915E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.126 | TFLOPs: 29.30 | +7: iteration 28760/ 37905 | consumed samples: 7362560 | consumed tokens: 15078522880 | elapsed time per iteration (s): 0.22 | learning rate: 4.511E-05 | global batch size: 256 | lm loss: 3.703060E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.749 | TFLOPs: 29.39 | +7: iteration 28770/ 37905 | consumed samples: 7365120 | consumed tokens: 15083765760 | elapsed time per iteration (s): 0.22 | learning rate: 4.506E-05 | global batch size: 256 | lm loss: 3.699037E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.658 | TFLOPs: 29.36 | +7: iteration 28780/ 37905 | consumed samples: 7367680 | consumed tokens: 15089008640 | elapsed time per iteration (s): 0.22 | learning rate: 4.501E-05 | global batch size: 256 | lm loss: 3.701208E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.220 | TFLOPs: 29.33 | +7: iteration 28790/ 37905 | consumed samples: 7370240 | consumed tokens: 15094251520 | elapsed time per iteration (s): 0.22 | learning rate: 4.496E-05 | global batch size: 256 | lm loss: 3.708595E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.107 | TFLOPs: 29.30 | +7: iteration 28800/ 37905 | consumed samples: 7372800 | consumed tokens: 15099494400 | elapsed time per iteration (s): 0.22 | learning rate: 4.490E-05 | global batch size: 256 | lm loss: 3.704008E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.902 | TFLOPs: 29.32 | +7: iteration 28810/ 37905 | consumed samples: 7375360 | consumed tokens: 15104737280 | elapsed time per iteration (s): 0.22 | learning rate: 4.485E-05 | global batch size: 256 | lm loss: 3.697129E+00 | grad norm: 0.229 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.692 | TFLOPs: 29.31 | +7: iteration 28820/ 37905 | consumed samples: 7377920 | consumed tokens: 15109980160 | elapsed time per iteration (s): 0.22 | learning rate: 4.480E-05 | global batch size: 256 | lm loss: 3.703259E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.196 | TFLOPs: 29.33 | +7: iteration 28830/ 37905 | consumed samples: 7380480 | consumed tokens: 15115223040 | elapsed time per iteration (s): 0.22 | learning rate: 4.475E-05 | global batch size: 256 | lm loss: 3.706678E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.771 | TFLOPs: 29.32 | +7: iteration 28840/ 37905 | consumed samples: 7383040 | consumed tokens: 15120465920 | elapsed time per iteration (s): 0.22 | learning rate: 4.470E-05 | global batch size: 256 | lm loss: 3.708580E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.627 | TFLOPs: 29.36 | +7: iteration 28850/ 37905 | consumed samples: 7385600 | consumed tokens: 15125708800 | elapsed time per iteration (s): 0.22 | learning rate: 4.465E-05 | global batch size: 256 | lm loss: 3.710969E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.661 | TFLOPs: 29.34 | +7: iteration 28860/ 37905 | consumed samples: 7388160 | consumed tokens: 15130951680 | elapsed time per iteration (s): 0.22 | learning rate: 4.459E-05 | global batch size: 256 | lm loss: 3.713129E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.434 | TFLOPs: 29.33 | +7: iteration 28870/ 37905 | consumed samples: 7390720 | consumed tokens: 15136194560 | elapsed time per iteration (s): 0.22 | learning rate: 4.454E-05 | global batch size: 256 | lm loss: 3.713119E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.628 | TFLOPs: 29.31 | +7: iteration 28880/ 37905 | consumed samples: 7393280 | consumed tokens: 15141437440 | elapsed time per iteration (s): 0.22 | learning rate: 4.449E-05 | global batch size: 256 | lm loss: 3.714445E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.204 | TFLOPs: 29.33 | +7: iteration 28890/ 37905 | consumed samples: 7395840 | consumed tokens: 15146680320 | elapsed time per iteration (s): 0.22 | learning rate: 4.444E-05 | global batch size: 256 | lm loss: 3.707856E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.533 | TFLOPs: 29.31 | +7: iteration 28900/ 37905 | consumed samples: 7398400 | consumed tokens: 15151923200 | elapsed time per iteration (s): 0.22 | learning rate: 4.439E-05 | global batch size: 256 | lm loss: 3.702819E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.031 | TFLOPs: 29.32 | +7: iteration 28910/ 37905 | consumed samples: 7400960 | consumed tokens: 15157166080 | elapsed time per iteration (s): 0.22 | learning rate: 4.434E-05 | global batch size: 256 | lm loss: 3.700040E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.119 | TFLOPs: 29.32 | +7: iteration 28920/ 37905 | consumed samples: 7403520 | consumed tokens: 15162408960 | elapsed time per iteration (s): 0.22 | learning rate: 4.428E-05 | global batch size: 256 | lm loss: 3.710405E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.320 | TFLOPs: 29.30 | +7: iteration 28930/ 37905 | consumed samples: 7406080 | consumed tokens: 15167651840 | elapsed time per iteration (s): 0.22 | learning rate: 4.423E-05 | global batch size: 256 | lm loss: 3.697701E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.938 | TFLOPs: 29.32 | +7: iteration 28940/ 37905 | consumed samples: 7408640 | consumed tokens: 15172894720 | elapsed time per iteration (s): 0.22 | learning rate: 4.418E-05 | global batch size: 256 | lm loss: 3.694638E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.472 | TFLOPs: 29.31 | +7: iteration 28950/ 37905 | consumed samples: 7411200 | consumed tokens: 15178137600 | elapsed time per iteration (s): 0.22 | learning rate: 4.413E-05 | global batch size: 256 | lm loss: 3.712149E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.183 | TFLOPs: 29.33 | +7: iteration 28960/ 37905 | consumed samples: 7413760 | consumed tokens: 15183380480 | elapsed time per iteration (s): 0.22 | learning rate: 4.408E-05 | global batch size: 256 | lm loss: 3.705790E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.397 | TFLOPs: 29.31 | +7: iteration 28970/ 37905 | consumed samples: 7416320 | consumed tokens: 15188623360 | elapsed time per iteration (s): 0.22 | learning rate: 4.403E-05 | global batch size: 256 | lm loss: 3.702451E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.149 | TFLOPs: 29.30 | +7: iteration 28980/ 37905 | consumed samples: 7418880 | consumed tokens: 15193866240 | elapsed time per iteration (s): 0.22 | learning rate: 4.398E-05 | global batch size: 256 | lm loss: 3.713921E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.880 | TFLOPs: 29.32 | +7: iteration 28990/ 37905 | consumed samples: 7421440 | consumed tokens: 15199109120 | elapsed time per iteration (s): 0.22 | learning rate: 4.392E-05 | global batch size: 256 | lm loss: 3.710514E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.512 | TFLOPs: 29.31 | +7: iteration 29000/ 37905 | consumed samples: 7424000 | consumed tokens: 15204352000 | elapsed time per iteration (s): 0.22 | learning rate: 4.387E-05 | global batch size: 256 | lm loss: 3.703193E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.714 | TFLOPs: 29.31 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 29000 | lm loss value: 3.621628E+00 | lm loss PPL: 3.739841E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 29000 to checkpoints_83m20b400m +0: [2023-03-15 23:45:33,190] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step29000 is begin to save! +0: [2023-03-15 23:45:33,194] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:45:33,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:45:33,264] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:45:33,275] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:45:33,276] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:45:33,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:45:33,287] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:45:33,298] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:45:33,298] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:45:33,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:45:33,309] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:45:33,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:45:33,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:45:33,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:45:33,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:45:33,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:45:33,342] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:45:33,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:45:33,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:45:33,364] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:45:33,364] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:45:33,375] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:45:33,375] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:45:33,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:45:33,377] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step29000/mp_rank_00_model_states.pt +0: [2023-03-15 23:45:33,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:45:33,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:45:33,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:45:33,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:45:33,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: [2023-03-15 23:45:33,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:45:33,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +5: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +6: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +3: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +1: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +0: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +7: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +4: [2023-03-15 23:45:33,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:45:33,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step29000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:45:33,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +2: [2023-03-15 23:45:33,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step29000 is ready now! +0: successfully saved checkpoint at iteration 29000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 239.69 +7: iteration 29010/ 37905 | consumed samples: 7426560 | consumed tokens: 15209594880 | elapsed time per iteration (s): 0.25 | learning rate: 4.382E-05 | global batch size: 256 | lm loss: 3.701027E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1013.798 | TFLOPs: 25.83 | +7: iteration 29020/ 37905 | consumed samples: 7429120 | consumed tokens: 15214837760 | elapsed time per iteration (s): 0.22 | learning rate: 4.377E-05 | global batch size: 256 | lm loss: 3.707859E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.330 | TFLOPs: 29.33 | +7: iteration 29030/ 37905 | consumed samples: 7431680 | consumed tokens: 15220080640 | elapsed time per iteration (s): 0.22 | learning rate: 4.372E-05 | global batch size: 256 | lm loss: 3.706907E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.004 | TFLOPs: 29.30 | +7: iteration 29040/ 37905 | consumed samples: 7434240 | consumed tokens: 15225323520 | elapsed time per iteration (s): 0.22 | learning rate: 4.367E-05 | global batch size: 256 | lm loss: 3.705358E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.614 | TFLOPs: 29.34 | +7: iteration 29050/ 37905 | consumed samples: 7436800 | consumed tokens: 15230566400 | elapsed time per iteration (s): 0.22 | learning rate: 4.362E-05 | global batch size: 256 | lm loss: 3.715554E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.905 | TFLOPs: 29.32 | +7: iteration 29060/ 37905 | consumed samples: 7439360 | consumed tokens: 15235809280 | elapsed time per iteration (s): 0.22 | learning rate: 4.357E-05 | global batch size: 256 | lm loss: 3.703970E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.176 | TFLOPs: 29.30 | +7: iteration 29070/ 37905 | consumed samples: 7441920 | consumed tokens: 15241052160 | elapsed time per iteration (s): 0.22 | learning rate: 4.352E-05 | global batch size: 256 | lm loss: 3.716900E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.847 | TFLOPs: 29.34 | +7: iteration 29080/ 37905 | consumed samples: 7444480 | consumed tokens: 15246295040 | elapsed time per iteration (s): 0.22 | learning rate: 4.347E-05 | global batch size: 256 | lm loss: 3.693851E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.858 | TFLOPs: 29.32 | +7: iteration 29090/ 37905 | consumed samples: 7447040 | consumed tokens: 15251537920 | elapsed time per iteration (s): 0.22 | learning rate: 4.342E-05 | global batch size: 256 | lm loss: 3.699795E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.549 | TFLOPs: 29.31 | +7: iteration 29100/ 37905 | consumed samples: 7449600 | consumed tokens: 15256780800 | elapsed time per iteration (s): 0.22 | learning rate: 4.336E-05 | global batch size: 256 | lm loss: 3.710928E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.346 | TFLOPs: 29.33 | +7: iteration 29110/ 37905 | consumed samples: 7452160 | consumed tokens: 15262023680 | elapsed time per iteration (s): 0.22 | learning rate: 4.331E-05 | global batch size: 256 | lm loss: 3.705091E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.896 | TFLOPs: 29.32 | +7: iteration 29120/ 37905 | consumed samples: 7454720 | consumed tokens: 15267266560 | elapsed time per iteration (s): 0.22 | learning rate: 4.326E-05 | global batch size: 256 | lm loss: 3.686071E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.814 | TFLOPs: 29.32 | +7: iteration 29130/ 37905 | consumed samples: 7457280 | consumed tokens: 15272509440 | elapsed time per iteration (s): 0.22 | learning rate: 4.321E-05 | global batch size: 256 | lm loss: 3.704085E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.987 | TFLOPs: 29.30 | +7: iteration 29140/ 37905 | consumed samples: 7459840 | consumed tokens: 15277752320 | elapsed time per iteration (s): 0.22 | learning rate: 4.316E-05 | global batch size: 256 | lm loss: 3.684805E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.665 | TFLOPs: 29.31 | +7: iteration 29150/ 37905 | consumed samples: 7462400 | consumed tokens: 15282995200 | elapsed time per iteration (s): 0.22 | learning rate: 4.311E-05 | global batch size: 256 | lm loss: 3.690802E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.484 | TFLOPs: 29.33 | +7: iteration 29160/ 37905 | consumed samples: 7464960 | consumed tokens: 15288238080 | elapsed time per iteration (s): 0.22 | learning rate: 4.306E-05 | global batch size: 256 | lm loss: 3.693953E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.903 | TFLOPs: 29.32 | +7: iteration 29170/ 37905 | consumed samples: 7467520 | consumed tokens: 15293480960 | elapsed time per iteration (s): 0.22 | learning rate: 4.301E-05 | global batch size: 256 | lm loss: 3.710025E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.885 | TFLOPs: 29.32 | +7: iteration 29180/ 37905 | consumed samples: 7470080 | consumed tokens: 15298723840 | elapsed time per iteration (s): 0.22 | learning rate: 4.296E-05 | global batch size: 256 | lm loss: 3.710674E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.688 | TFLOPs: 29.34 | +7: iteration 29190/ 37905 | consumed samples: 7472640 | consumed tokens: 15303966720 | elapsed time per iteration (s): 0.22 | learning rate: 4.291E-05 | global batch size: 256 | lm loss: 3.692567E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.634 | TFLOPs: 29.03 | +7: iteration 29200/ 37905 | consumed samples: 7475200 | consumed tokens: 15309209600 | elapsed time per iteration (s): 0.22 | learning rate: 4.286E-05 | global batch size: 256 | lm loss: 3.701916E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.275 | TFLOPs: 29.33 | +7: iteration 29210/ 37905 | consumed samples: 7477760 | consumed tokens: 15314452480 | elapsed time per iteration (s): 0.22 | learning rate: 4.281E-05 | global batch size: 256 | lm loss: 3.699733E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.210 | TFLOPs: 29.33 | +7: iteration 29220/ 37905 | consumed samples: 7480320 | consumed tokens: 15319695360 | elapsed time per iteration (s): 0.22 | learning rate: 4.276E-05 | global batch size: 256 | lm loss: 3.699965E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.810 | TFLOPs: 29.34 | +7: iteration 29230/ 37905 | consumed samples: 7482880 | consumed tokens: 15324938240 | elapsed time per iteration (s): 0.22 | learning rate: 4.271E-05 | global batch size: 256 | lm loss: 3.698907E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.087 | TFLOPs: 29.32 | +7: iteration 29240/ 37905 | consumed samples: 7485440 | consumed tokens: 15330181120 | elapsed time per iteration (s): 0.22 | learning rate: 4.266E-05 | global batch size: 256 | lm loss: 3.705662E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.581 | TFLOPs: 29.31 | +7: iteration 29250/ 37905 | consumed samples: 7488000 | consumed tokens: 15335424000 | elapsed time per iteration (s): 0.22 | learning rate: 4.261E-05 | global batch size: 256 | lm loss: 3.708235E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.029 | TFLOPs: 29.32 | +7: iteration 29260/ 37905 | consumed samples: 7490560 | consumed tokens: 15340666880 | elapsed time per iteration (s): 0.22 | learning rate: 4.256E-05 | global batch size: 256 | lm loss: 3.700592E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.486 | TFLOPs: 29.31 | +7: iteration 29270/ 37905 | consumed samples: 7493120 | consumed tokens: 15345909760 | elapsed time per iteration (s): 0.22 | learning rate: 4.251E-05 | global batch size: 256 | lm loss: 3.695891E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.760 | TFLOPs: 29.32 | +7: iteration 29280/ 37905 | consumed samples: 7495680 | consumed tokens: 15351152640 | elapsed time per iteration (s): 0.22 | learning rate: 4.246E-05 | global batch size: 256 | lm loss: 3.705301E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.826 | TFLOPs: 29.29 | +7: iteration 29290/ 37905 | consumed samples: 7498240 | consumed tokens: 15356395520 | elapsed time per iteration (s): 0.22 | learning rate: 4.241E-05 | global batch size: 256 | lm loss: 3.699092E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.835 | TFLOPs: 29.34 | +7: iteration 29300/ 37905 | consumed samples: 7500800 | consumed tokens: 15361638400 | elapsed time per iteration (s): 0.22 | learning rate: 4.236E-05 | global batch size: 256 | lm loss: 3.710380E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.469 | TFLOPs: 29.33 | +7: iteration 29310/ 37905 | consumed samples: 7503360 | consumed tokens: 15366881280 | elapsed time per iteration (s): 0.22 | learning rate: 4.231E-05 | global batch size: 256 | lm loss: 3.706198E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.325 | TFLOPs: 29.05 | +7: iteration 29320/ 37905 | consumed samples: 7505920 | consumed tokens: 15372124160 | elapsed time per iteration (s): 0.22 | learning rate: 4.226E-05 | global batch size: 256 | lm loss: 3.709195E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.395 | TFLOPs: 29.05 | +7: iteration 29330/ 37905 | consumed samples: 7508480 | consumed tokens: 15377367040 | elapsed time per iteration (s): 0.22 | learning rate: 4.221E-05 | global batch size: 256 | lm loss: 3.702733E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.238 | TFLOPs: 29.33 | +7: iteration 29340/ 37905 | consumed samples: 7511040 | consumed tokens: 15382609920 | elapsed time per iteration (s): 0.23 | learning rate: 4.216E-05 | global batch size: 256 | lm loss: 3.687043E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.608 | TFLOPs: 28.37 | +7: iteration 29350/ 37905 | consumed samples: 7513600 | consumed tokens: 15387852800 | elapsed time per iteration (s): 0.22 | learning rate: 4.211E-05 | global batch size: 256 | lm loss: 3.710283E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.472 | TFLOPs: 29.33 | +7: iteration 29360/ 37905 | consumed samples: 7516160 | consumed tokens: 15393095680 | elapsed time per iteration (s): 0.22 | learning rate: 4.206E-05 | global batch size: 256 | lm loss: 3.707727E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.234 | TFLOPs: 29.33 | +7: iteration 29370/ 37905 | consumed samples: 7518720 | consumed tokens: 15398338560 | elapsed time per iteration (s): 0.22 | learning rate: 4.201E-05 | global batch size: 256 | lm loss: 3.691956E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.312 | TFLOPs: 29.00 | +7: iteration 29380/ 37905 | consumed samples: 7521280 | consumed tokens: 15403581440 | elapsed time per iteration (s): 0.22 | learning rate: 4.196E-05 | global batch size: 256 | lm loss: 3.718022E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.912 | TFLOPs: 29.32 | +7: iteration 29390/ 37905 | consumed samples: 7523840 | consumed tokens: 15408824320 | elapsed time per iteration (s): 0.23 | learning rate: 4.192E-05 | global batch size: 256 | lm loss: 3.700766E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1103.457 | TFLOPs: 28.11 | +7: iteration 29400/ 37905 | consumed samples: 7526400 | consumed tokens: 15414067200 | elapsed time per iteration (s): 0.22 | learning rate: 4.187E-05 | global batch size: 256 | lm loss: 3.701899E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.477 | TFLOPs: 29.31 | +7: iteration 29410/ 37905 | consumed samples: 7528960 | consumed tokens: 15419310080 | elapsed time per iteration (s): 0.23 | learning rate: 4.182E-05 | global batch size: 256 | lm loss: 3.704863E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.984 | TFLOPs: 28.91 | +7: iteration 29420/ 37905 | consumed samples: 7531520 | consumed tokens: 15424552960 | elapsed time per iteration (s): 0.23 | learning rate: 4.177E-05 | global batch size: 256 | lm loss: 3.706846E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.009 | TFLOPs: 28.94 | +7: iteration 29430/ 37905 | consumed samples: 7534080 | consumed tokens: 15429795840 | elapsed time per iteration (s): 0.22 | learning rate: 4.172E-05 | global batch size: 256 | lm loss: 3.700462E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.055 | TFLOPs: 29.32 | +7: iteration 29440/ 37905 | consumed samples: 7536640 | consumed tokens: 15435038720 | elapsed time per iteration (s): 0.22 | learning rate: 4.167E-05 | global batch size: 256 | lm loss: 3.692076E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.087 | TFLOPs: 29.32 | +7: iteration 29450/ 37905 | consumed samples: 7539200 | consumed tokens: 15440281600 | elapsed time per iteration (s): 0.22 | learning rate: 4.162E-05 | global batch size: 256 | lm loss: 3.700016E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.564 | TFLOPs: 29.34 | +7: iteration 29460/ 37905 | consumed samples: 7541760 | consumed tokens: 15445524480 | elapsed time per iteration (s): 0.22 | learning rate: 4.157E-05 | global batch size: 256 | lm loss: 3.702651E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.126 | TFLOPs: 29.35 | +7: iteration 29470/ 37905 | consumed samples: 7544320 | consumed tokens: 15450767360 | elapsed time per iteration (s): 0.22 | learning rate: 4.152E-05 | global batch size: 256 | lm loss: 3.698134E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.827 | TFLOPs: 29.34 | +7: iteration 29480/ 37905 | consumed samples: 7546880 | consumed tokens: 15456010240 | elapsed time per iteration (s): 0.22 | learning rate: 4.147E-05 | global batch size: 256 | lm loss: 3.721953E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.633 | TFLOPs: 29.34 | +7: iteration 29490/ 37905 | consumed samples: 7549440 | consumed tokens: 15461253120 | elapsed time per iteration (s): 0.22 | learning rate: 4.143E-05 | global batch size: 256 | lm loss: 3.701559E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.023 | TFLOPs: 29.32 | +7: iteration 29500/ 37905 | consumed samples: 7552000 | consumed tokens: 15466496000 | elapsed time per iteration (s): 0.22 | learning rate: 4.138E-05 | global batch size: 256 | lm loss: 3.709689E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.136 | TFLOPs: 29.30 | +7: iteration 29510/ 37905 | consumed samples: 7554560 | consumed tokens: 15471738880 | elapsed time per iteration (s): 0.22 | learning rate: 4.133E-05 | global batch size: 256 | lm loss: 3.707396E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.114 | TFLOPs: 29.32 | +7: iteration 29520/ 37905 | consumed samples: 7557120 | consumed tokens: 15476981760 | elapsed time per iteration (s): 0.22 | learning rate: 4.128E-05 | global batch size: 256 | lm loss: 3.706500E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.670 | TFLOPs: 29.31 | +7: iteration 29530/ 37905 | consumed samples: 7559680 | consumed tokens: 15482224640 | elapsed time per iteration (s): 0.22 | learning rate: 4.123E-05 | global batch size: 256 | lm loss: 3.698706E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.842 | TFLOPs: 29.29 | +7: iteration 29540/ 37905 | consumed samples: 7562240 | consumed tokens: 15487467520 | elapsed time per iteration (s): 0.22 | learning rate: 4.118E-05 | global batch size: 256 | lm loss: 3.710351E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.796 | TFLOPs: 29.29 | +7: iteration 29550/ 37905 | consumed samples: 7564800 | consumed tokens: 15492710400 | elapsed time per iteration (s): 0.22 | learning rate: 4.113E-05 | global batch size: 256 | lm loss: 3.696957E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.124 | TFLOPs: 29.27 | +7: iteration 29560/ 37905 | consumed samples: 7567360 | consumed tokens: 15497953280 | elapsed time per iteration (s): 0.22 | learning rate: 4.108E-05 | global batch size: 256 | lm loss: 3.701250E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.681 | TFLOPs: 29.29 | +7: iteration 29570/ 37905 | consumed samples: 7569920 | consumed tokens: 15503196160 | elapsed time per iteration (s): 0.22 | learning rate: 4.104E-05 | global batch size: 256 | lm loss: 3.698167E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.512 | TFLOPs: 29.03 | +7: iteration 29580/ 37905 | consumed samples: 7572480 | consumed tokens: 15508439040 | elapsed time per iteration (s): 0.22 | learning rate: 4.099E-05 | global batch size: 256 | lm loss: 3.700632E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.341 | TFLOPs: 29.31 | +7: iteration 29590/ 37905 | consumed samples: 7575040 | consumed tokens: 15513681920 | elapsed time per iteration (s): 0.22 | learning rate: 4.094E-05 | global batch size: 256 | lm loss: 3.700286E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.821 | TFLOPs: 29.29 | +7: iteration 29600/ 37905 | consumed samples: 7577600 | consumed tokens: 15518924800 | elapsed time per iteration (s): 0.22 | learning rate: 4.089E-05 | global batch size: 256 | lm loss: 3.693600E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.676 | TFLOPs: 29.31 | +7: iteration 29610/ 37905 | consumed samples: 7580160 | consumed tokens: 15524167680 | elapsed time per iteration (s): 0.22 | learning rate: 4.084E-05 | global batch size: 256 | lm loss: 3.705848E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.185 | TFLOPs: 29.28 | +7: iteration 29620/ 37905 | consumed samples: 7582720 | consumed tokens: 15529410560 | elapsed time per iteration (s): 0.22 | learning rate: 4.080E-05 | global batch size: 256 | lm loss: 3.695926E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.724 | TFLOPs: 29.01 | +7: iteration 29630/ 37905 | consumed samples: 7585280 | consumed tokens: 15534653440 | elapsed time per iteration (s): 0.22 | learning rate: 4.075E-05 | global batch size: 256 | lm loss: 3.703248E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.444 | TFLOPs: 29.31 | +7: iteration 29640/ 37905 | consumed samples: 7587840 | consumed tokens: 15539896320 | elapsed time per iteration (s): 0.22 | learning rate: 4.070E-05 | global batch size: 256 | lm loss: 3.682998E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.416 | TFLOPs: 29.31 | +7: iteration 29650/ 37905 | consumed samples: 7590400 | consumed tokens: 15545139200 | elapsed time per iteration (s): 0.22 | learning rate: 4.065E-05 | global batch size: 256 | lm loss: 3.698495E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.858 | TFLOPs: 29.29 | +7: iteration 29660/ 37905 | consumed samples: 7592960 | consumed tokens: 15550382080 | elapsed time per iteration (s): 0.22 | learning rate: 4.060E-05 | global batch size: 256 | lm loss: 3.701350E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.051 | TFLOPs: 29.30 | +7: iteration 29670/ 37905 | consumed samples: 7595520 | consumed tokens: 15555624960 | elapsed time per iteration (s): 0.22 | learning rate: 4.055E-05 | global batch size: 256 | lm loss: 3.706233E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.696 | TFLOPs: 29.29 | +7: iteration 29680/ 37905 | consumed samples: 7598080 | consumed tokens: 15560867840 | elapsed time per iteration (s): 0.22 | learning rate: 4.051E-05 | global batch size: 256 | lm loss: 3.705588E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.608 | TFLOPs: 29.24 | +7: iteration 29690/ 37905 | consumed samples: 7600640 | consumed tokens: 15566110720 | elapsed time per iteration (s): 0.22 | learning rate: 4.046E-05 | global batch size: 256 | lm loss: 3.704404E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.972 | TFLOPs: 29.22 | +7: iteration 29700/ 37905 | consumed samples: 7603200 | consumed tokens: 15571353600 | elapsed time per iteration (s): 0.22 | learning rate: 4.041E-05 | global batch size: 256 | lm loss: 3.706753E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.869 | TFLOPs: 29.27 | +7: iteration 29710/ 37905 | consumed samples: 7605760 | consumed tokens: 15576596480 | elapsed time per iteration (s): 0.22 | learning rate: 4.036E-05 | global batch size: 256 | lm loss: 3.697899E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.979 | TFLOPs: 29.27 | +7: iteration 29720/ 37905 | consumed samples: 7608320 | consumed tokens: 15581839360 | elapsed time per iteration (s): 0.22 | learning rate: 4.032E-05 | global batch size: 256 | lm loss: 3.709952E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.267 | TFLOPs: 29.25 | +7: iteration 29730/ 37905 | consumed samples: 7610880 | consumed tokens: 15587082240 | elapsed time per iteration (s): 0.22 | learning rate: 4.027E-05 | global batch size: 256 | lm loss: 3.697930E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.741 | TFLOPs: 29.29 | +7: iteration 29740/ 37905 | consumed samples: 7613440 | consumed tokens: 15592325120 | elapsed time per iteration (s): 0.22 | learning rate: 4.022E-05 | global batch size: 256 | lm loss: 3.695371E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.461 | TFLOPs: 29.26 | +7: iteration 29750/ 37905 | consumed samples: 7616000 | consumed tokens: 15597568000 | elapsed time per iteration (s): 0.22 | learning rate: 4.017E-05 | global batch size: 256 | lm loss: 3.704209E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.479 | TFLOPs: 29.26 | +7: iteration 29760/ 37905 | consumed samples: 7618560 | consumed tokens: 15602810880 | elapsed time per iteration (s): 0.22 | learning rate: 4.013E-05 | global batch size: 256 | lm loss: 3.716697E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.641 | TFLOPs: 29.26 | +7: iteration 29770/ 37905 | consumed samples: 7621120 | consumed tokens: 15608053760 | elapsed time per iteration (s): 0.22 | learning rate: 4.008E-05 | global batch size: 256 | lm loss: 3.706374E+00 | grad norm: 0.244 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.104 | TFLOPs: 29.25 | +7: iteration 29780/ 37905 | consumed samples: 7623680 | consumed tokens: 15613296640 | elapsed time per iteration (s): 0.22 | learning rate: 4.003E-05 | global batch size: 256 | lm loss: 3.720388E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.341 | TFLOPs: 29.23 | +7: iteration 29790/ 37905 | consumed samples: 7626240 | consumed tokens: 15618539520 | elapsed time per iteration (s): 0.22 | learning rate: 3.998E-05 | global batch size: 256 | lm loss: 3.689042E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.723 | TFLOPs: 29.26 | +7: iteration 29800/ 37905 | consumed samples: 7628800 | consumed tokens: 15623782400 | elapsed time per iteration (s): 0.22 | learning rate: 3.994E-05 | global batch size: 256 | lm loss: 3.700758E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.120 | TFLOPs: 29.32 | +7: iteration 29810/ 37905 | consumed samples: 7631360 | consumed tokens: 15629025280 | elapsed time per iteration (s): 0.22 | learning rate: 3.989E-05 | global batch size: 256 | lm loss: 3.696448E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.082 | TFLOPs: 29.30 | +7: iteration 29820/ 37905 | consumed samples: 7633920 | consumed tokens: 15634268160 | elapsed time per iteration (s): 0.22 | learning rate: 3.984E-05 | global batch size: 256 | lm loss: 3.705674E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.765 | TFLOPs: 29.32 | +7: iteration 29830/ 37905 | consumed samples: 7636480 | consumed tokens: 15639511040 | elapsed time per iteration (s): 0.22 | learning rate: 3.979E-05 | global batch size: 256 | lm loss: 3.709798E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.408 | TFLOPs: 29.31 | +7: iteration 29840/ 37905 | consumed samples: 7639040 | consumed tokens: 15644753920 | elapsed time per iteration (s): 0.22 | learning rate: 3.975E-05 | global batch size: 256 | lm loss: 3.678966E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.854 | TFLOPs: 29.32 | +7: iteration 29850/ 37905 | consumed samples: 7641600 | consumed tokens: 15649996800 | elapsed time per iteration (s): 0.22 | learning rate: 3.970E-05 | global batch size: 256 | lm loss: 3.700533E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.777 | TFLOPs: 29.32 | +7: iteration 29860/ 37905 | consumed samples: 7644160 | consumed tokens: 15655239680 | elapsed time per iteration (s): 0.22 | learning rate: 3.965E-05 | global batch size: 256 | lm loss: 3.700589E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.770 | TFLOPs: 29.29 | +7: iteration 29870/ 37905 | consumed samples: 7646720 | consumed tokens: 15660482560 | elapsed time per iteration (s): 0.22 | learning rate: 3.961E-05 | global batch size: 256 | lm loss: 3.695420E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.996 | TFLOPs: 29.37 | +7: iteration 29880/ 37905 | consumed samples: 7649280 | consumed tokens: 15665725440 | elapsed time per iteration (s): 0.22 | learning rate: 3.956E-05 | global batch size: 256 | lm loss: 3.707785E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.602 | TFLOPs: 29.36 | +7: iteration 29890/ 37905 | consumed samples: 7651840 | consumed tokens: 15670968320 | elapsed time per iteration (s): 0.22 | learning rate: 3.951E-05 | global batch size: 256 | lm loss: 3.689634E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.685 | TFLOPs: 29.29 | +7: iteration 29900/ 37905 | consumed samples: 7654400 | consumed tokens: 15676211200 | elapsed time per iteration (s): 0.22 | learning rate: 3.947E-05 | global batch size: 256 | lm loss: 3.690272E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.385 | TFLOPs: 29.26 | +7: iteration 29910/ 37905 | consumed samples: 7656960 | consumed tokens: 15681454080 | elapsed time per iteration (s): 0.22 | learning rate: 3.942E-05 | global batch size: 256 | lm loss: 3.703534E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.337 | TFLOPs: 29.28 | +7: iteration 29920/ 37905 | consumed samples: 7659520 | consumed tokens: 15686696960 | elapsed time per iteration (s): 0.22 | learning rate: 3.937E-05 | global batch size: 256 | lm loss: 3.691526E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.655 | TFLOPs: 29.31 | +7: iteration 29930/ 37905 | consumed samples: 7662080 | consumed tokens: 15691939840 | elapsed time per iteration (s): 0.22 | learning rate: 3.933E-05 | global batch size: 256 | lm loss: 3.694573E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.166 | TFLOPs: 29.33 | +7: iteration 29940/ 37905 | consumed samples: 7664640 | consumed tokens: 15697182720 | elapsed time per iteration (s): 0.22 | learning rate: 3.928E-05 | global batch size: 256 | lm loss: 3.695448E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.652 | TFLOPs: 29.31 | +7: iteration 29950/ 37905 | consumed samples: 7667200 | consumed tokens: 15702425600 | elapsed time per iteration (s): 0.22 | learning rate: 3.923E-05 | global batch size: 256 | lm loss: 3.699733E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.060 | TFLOPs: 29.32 | +7: iteration 29960/ 37905 | consumed samples: 7669760 | consumed tokens: 15707668480 | elapsed time per iteration (s): 0.22 | learning rate: 3.919E-05 | global batch size: 256 | lm loss: 3.697454E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.459 | TFLOPs: 29.31 | +7: iteration 29970/ 37905 | consumed samples: 7672320 | consumed tokens: 15712911360 | elapsed time per iteration (s): 0.22 | learning rate: 3.914E-05 | global batch size: 256 | lm loss: 3.711774E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.541 | TFLOPs: 29.31 | +7: iteration 29980/ 37905 | consumed samples: 7674880 | consumed tokens: 15718154240 | elapsed time per iteration (s): 0.22 | learning rate: 3.909E-05 | global batch size: 256 | lm loss: 3.693530E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.858 | TFLOPs: 29.32 | +7: iteration 29990/ 37905 | consumed samples: 7677440 | consumed tokens: 15723397120 | elapsed time per iteration (s): 0.22 | learning rate: 3.905E-05 | global batch size: 256 | lm loss: 3.698771E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.151 | TFLOPs: 29.33 | +0: [2023-03-15 23:49:16,302] [INFO] [logging.py:68:log_dist] [Rank 0] step=30000, skipped=0, lr=[3.899987415428045e-05, 3.899987415428045e-05, 3.899987415428045e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 30000/ 37905 | consumed samples: 7680000 | consumed tokens: 15728640000 | elapsed time per iteration (s): 0.22 | learning rate: 3.900E-05 | global batch size: 256 | lm loss: 3.695488E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.605 | TFLOPs: 29.34 | +0: steps: 30000 loss: 3.7071 iter time (s): 0.221 samples/sec: 1158.544 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 30000 | lm loss value: 3.673124E+00 | lm loss PPL: 3.937471E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 30000 to checkpoints_83m20b400m +0: [2023-03-15 23:49:16,391] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step30000 is begin to save! +0: [2023-03-15 23:49:16,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:49:16,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:49:16,465] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:49:16,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:49:16,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:49:16,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:49:16,489] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:49:16,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:49:16,500] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:49:16,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:49:16,511] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:49:16,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:49:16,522] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:49:16,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:49:16,534] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:49:16,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:49:16,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:49:16,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:49:16,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:49:16,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:49:16,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:49:16,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:49:16,578] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:49:16,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:49:16,580] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step30000/mp_rank_00_model_states.pt +0: [2023-03-15 23:49:16,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:49:16,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:49:16,599] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:49:16,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +5: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +6: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +1: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +4: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +7: [2023-03-15 23:49:16,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:49:16,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +3: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:49:16,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:49:16,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step30000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +2: [2023-03-15 23:49:16,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step30000 is ready now! +0: successfully saved checkpoint at iteration 30000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 243.05 +7: iteration 30010/ 37905 | consumed samples: 7682560 | consumed tokens: 15733882880 | elapsed time per iteration (s): 0.25 | learning rate: 3.895E-05 | global batch size: 256 | lm loss: 3.704327E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1004.119 | TFLOPs: 25.58 | +7: iteration 30020/ 37905 | consumed samples: 7685120 | consumed tokens: 15739125760 | elapsed time per iteration (s): 0.22 | learning rate: 3.891E-05 | global batch size: 256 | lm loss: 3.682682E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.895 | TFLOPs: 29.34 | +7: iteration 30030/ 37905 | consumed samples: 7687680 | consumed tokens: 15744368640 | elapsed time per iteration (s): 0.22 | learning rate: 3.886E-05 | global batch size: 256 | lm loss: 3.705099E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.907 | TFLOPs: 29.35 | +7: iteration 30040/ 37905 | consumed samples: 7690240 | consumed tokens: 15749611520 | elapsed time per iteration (s): 0.22 | learning rate: 3.882E-05 | global batch size: 256 | lm loss: 3.699893E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.384 | TFLOPs: 29.31 | +7: iteration 30050/ 37905 | consumed samples: 7692800 | consumed tokens: 15754854400 | elapsed time per iteration (s): 0.22 | learning rate: 3.877E-05 | global batch size: 256 | lm loss: 3.696039E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.221 | TFLOPs: 29.33 | +7: iteration 30060/ 37905 | consumed samples: 7695360 | consumed tokens: 15760097280 | elapsed time per iteration (s): 0.22 | learning rate: 3.872E-05 | global batch size: 256 | lm loss: 3.702089E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.466 | TFLOPs: 29.31 | +7: iteration 30070/ 37905 | consumed samples: 7697920 | consumed tokens: 15765340160 | elapsed time per iteration (s): 0.22 | learning rate: 3.868E-05 | global batch size: 256 | lm loss: 3.701203E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.018 | TFLOPs: 29.30 | +7: iteration 30080/ 37905 | consumed samples: 7700480 | consumed tokens: 15770583040 | elapsed time per iteration (s): 0.22 | learning rate: 3.863E-05 | global batch size: 256 | lm loss: 3.701353E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.964 | TFLOPs: 29.30 | +7: iteration 30090/ 37905 | consumed samples: 7703040 | consumed tokens: 15775825920 | elapsed time per iteration (s): 0.22 | learning rate: 3.859E-05 | global batch size: 256 | lm loss: 3.706850E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.233 | TFLOPs: 29.30 | +7: iteration 30100/ 37905 | consumed samples: 7705600 | consumed tokens: 15781068800 | elapsed time per iteration (s): 0.22 | learning rate: 3.854E-05 | global batch size: 256 | lm loss: 3.695094E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.660 | TFLOPs: 29.31 | +7: iteration 30110/ 37905 | consumed samples: 7708160 | consumed tokens: 15786311680 | elapsed time per iteration (s): 0.22 | learning rate: 3.849E-05 | global batch size: 256 | lm loss: 3.702019E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.942 | TFLOPs: 29.27 | +7: iteration 30120/ 37905 | consumed samples: 7710720 | consumed tokens: 15791554560 | elapsed time per iteration (s): 0.22 | learning rate: 3.845E-05 | global batch size: 256 | lm loss: 3.703918E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.630 | TFLOPs: 29.31 | +7: iteration 30130/ 37905 | consumed samples: 7713280 | consumed tokens: 15796797440 | elapsed time per iteration (s): 0.22 | learning rate: 3.840E-05 | global batch size: 256 | lm loss: 3.705082E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.755 | TFLOPs: 29.29 | +7: iteration 30140/ 37905 | consumed samples: 7715840 | consumed tokens: 15802040320 | elapsed time per iteration (s): 0.22 | learning rate: 3.836E-05 | global batch size: 256 | lm loss: 3.697726E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.587 | TFLOPs: 29.31 | +7: iteration 30150/ 37905 | consumed samples: 7718400 | consumed tokens: 15807283200 | elapsed time per iteration (s): 0.22 | learning rate: 3.831E-05 | global batch size: 256 | lm loss: 3.711461E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.918 | TFLOPs: 29.32 | +7: iteration 30160/ 37905 | consumed samples: 7720960 | consumed tokens: 15812526080 | elapsed time per iteration (s): 0.22 | learning rate: 3.827E-05 | global batch size: 256 | lm loss: 3.703411E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.978 | TFLOPs: 29.32 | +7: iteration 30170/ 37905 | consumed samples: 7723520 | consumed tokens: 15817768960 | elapsed time per iteration (s): 0.22 | learning rate: 3.822E-05 | global batch size: 256 | lm loss: 3.700764E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.041 | TFLOPs: 29.30 | +7: iteration 30180/ 37905 | consumed samples: 7726080 | consumed tokens: 15823011840 | elapsed time per iteration (s): 0.22 | learning rate: 3.817E-05 | global batch size: 256 | lm loss: 3.699720E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.205 | TFLOPs: 29.25 | +7: iteration 30190/ 37905 | consumed samples: 7728640 | consumed tokens: 15828254720 | elapsed time per iteration (s): 0.23 | learning rate: 3.813E-05 | global batch size: 256 | lm loss: 3.691008E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.481 | TFLOPs: 28.85 | +7: iteration 30200/ 37905 | consumed samples: 7731200 | consumed tokens: 15833497600 | elapsed time per iteration (s): 0.22 | learning rate: 3.808E-05 | global batch size: 256 | lm loss: 3.699079E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.265 | TFLOPs: 29.28 | +7: iteration 30210/ 37905 | consumed samples: 7733760 | consumed tokens: 15838740480 | elapsed time per iteration (s): 0.22 | learning rate: 3.804E-05 | global batch size: 256 | lm loss: 3.704302E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.881 | TFLOPs: 29.27 | +7: iteration 30220/ 37905 | consumed samples: 7736320 | consumed tokens: 15843983360 | elapsed time per iteration (s): 0.22 | learning rate: 3.799E-05 | global batch size: 256 | lm loss: 3.703215E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.554 | TFLOPs: 29.31 | +7: iteration 30230/ 37905 | consumed samples: 7738880 | consumed tokens: 15849226240 | elapsed time per iteration (s): 0.22 | learning rate: 3.795E-05 | global batch size: 256 | lm loss: 3.704856E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.528 | TFLOPs: 29.28 | +7: iteration 30240/ 37905 | consumed samples: 7741440 | consumed tokens: 15854469120 | elapsed time per iteration (s): 0.22 | learning rate: 3.790E-05 | global batch size: 256 | lm loss: 3.708766E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.828 | TFLOPs: 29.34 | +7: iteration 30250/ 37905 | consumed samples: 7744000 | consumed tokens: 15859712000 | elapsed time per iteration (s): 0.22 | learning rate: 3.786E-05 | global batch size: 256 | lm loss: 3.709178E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.713 | TFLOPs: 29.26 | +7: iteration 30260/ 37905 | consumed samples: 7746560 | consumed tokens: 15864954880 | elapsed time per iteration (s): 0.22 | learning rate: 3.781E-05 | global batch size: 256 | lm loss: 3.696944E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.803 | TFLOPs: 29.27 | +7: iteration 30270/ 37905 | consumed samples: 7749120 | consumed tokens: 15870197760 | elapsed time per iteration (s): 0.22 | learning rate: 3.777E-05 | global batch size: 256 | lm loss: 3.695089E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.338 | TFLOPs: 29.28 | +7: iteration 30280/ 37905 | consumed samples: 7751680 | consumed tokens: 15875440640 | elapsed time per iteration (s): 0.22 | learning rate: 3.772E-05 | global batch size: 256 | lm loss: 3.711664E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.383 | TFLOPs: 29.31 | +7: iteration 30290/ 37905 | consumed samples: 7754240 | consumed tokens: 15880683520 | elapsed time per iteration (s): 0.22 | learning rate: 3.768E-05 | global batch size: 256 | lm loss: 3.687654E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.676 | TFLOPs: 29.24 | +7: iteration 30300/ 37905 | consumed samples: 7756800 | consumed tokens: 15885926400 | elapsed time per iteration (s): 0.22 | learning rate: 3.763E-05 | global batch size: 256 | lm loss: 3.689089E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.619 | TFLOPs: 29.26 | +7: iteration 30310/ 37905 | consumed samples: 7759360 | consumed tokens: 15891169280 | elapsed time per iteration (s): 0.22 | learning rate: 3.759E-05 | global batch size: 256 | lm loss: 3.687705E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.042 | TFLOPs: 29.27 | +7: iteration 30320/ 37905 | consumed samples: 7761920 | consumed tokens: 15896412160 | elapsed time per iteration (s): 0.22 | learning rate: 3.754E-05 | global batch size: 256 | lm loss: 3.706508E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.109 | TFLOPs: 29.32 | +7: iteration 30330/ 37905 | consumed samples: 7764480 | consumed tokens: 15901655040 | elapsed time per iteration (s): 0.22 | learning rate: 3.750E-05 | global batch size: 256 | lm loss: 3.700063E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.904 | TFLOPs: 29.34 | +7: iteration 30340/ 37905 | consumed samples: 7767040 | consumed tokens: 15906897920 | elapsed time per iteration (s): 0.22 | learning rate: 3.745E-05 | global batch size: 256 | lm loss: 3.703489E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.283 | TFLOPs: 29.35 | +7: iteration 30350/ 37905 | consumed samples: 7769600 | consumed tokens: 15912140800 | elapsed time per iteration (s): 0.22 | learning rate: 3.741E-05 | global batch size: 256 | lm loss: 3.696361E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.688 | TFLOPs: 29.34 | +7: iteration 30360/ 37905 | consumed samples: 7772160 | consumed tokens: 15917383680 | elapsed time per iteration (s): 0.23 | learning rate: 3.737E-05 | global batch size: 256 | lm loss: 3.705806E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.979 | TFLOPs: 28.96 | +7: iteration 30370/ 37905 | consumed samples: 7774720 | consumed tokens: 15922626560 | elapsed time per iteration (s): 0.22 | learning rate: 3.732E-05 | global batch size: 256 | lm loss: 3.698748E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.800 | TFLOPs: 29.32 | +7: iteration 30380/ 37905 | consumed samples: 7777280 | consumed tokens: 15927869440 | elapsed time per iteration (s): 0.22 | learning rate: 3.728E-05 | global batch size: 256 | lm loss: 3.704324E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.183 | TFLOPs: 29.35 | +7: iteration 30390/ 37905 | consumed samples: 7779840 | consumed tokens: 15933112320 | elapsed time per iteration (s): 0.22 | learning rate: 3.723E-05 | global batch size: 256 | lm loss: 3.678692E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.642 | TFLOPs: 29.03 | +7: iteration 30400/ 37905 | consumed samples: 7782400 | consumed tokens: 15938355200 | elapsed time per iteration (s): 0.22 | learning rate: 3.719E-05 | global batch size: 256 | lm loss: 3.696341E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.733 | TFLOPs: 29.26 | +7: iteration 30410/ 37905 | consumed samples: 7784960 | consumed tokens: 15943598080 | elapsed time per iteration (s): 0.22 | learning rate: 3.714E-05 | global batch size: 256 | lm loss: 3.693702E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.555 | TFLOPs: 29.29 | +7: iteration 30420/ 37905 | consumed samples: 7787520 | consumed tokens: 15948840960 | elapsed time per iteration (s): 0.22 | learning rate: 3.710E-05 | global batch size: 256 | lm loss: 3.692285E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.698 | TFLOPs: 29.31 | +7: iteration 30430/ 37905 | consumed samples: 7790080 | consumed tokens: 15954083840 | elapsed time per iteration (s): 0.22 | learning rate: 3.706E-05 | global batch size: 256 | lm loss: 3.694950E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.484 | TFLOPs: 29.28 | +7: iteration 30440/ 37905 | consumed samples: 7792640 | consumed tokens: 15959326720 | elapsed time per iteration (s): 0.22 | learning rate: 3.701E-05 | global batch size: 256 | lm loss: 3.682521E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.674 | TFLOPs: 29.29 | +7: iteration 30450/ 37905 | consumed samples: 7795200 | consumed tokens: 15964569600 | elapsed time per iteration (s): 0.22 | learning rate: 3.697E-05 | global batch size: 256 | lm loss: 3.694145E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.175 | TFLOPs: 29.30 | +7: iteration 30460/ 37905 | consumed samples: 7797760 | consumed tokens: 15969812480 | elapsed time per iteration (s): 0.22 | learning rate: 3.692E-05 | global batch size: 256 | lm loss: 3.705278E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.278 | TFLOPs: 29.30 | +7: iteration 30470/ 37905 | consumed samples: 7800320 | consumed tokens: 15975055360 | elapsed time per iteration (s): 0.22 | learning rate: 3.688E-05 | global batch size: 256 | lm loss: 3.697723E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.106 | TFLOPs: 29.27 | +7: iteration 30480/ 37905 | consumed samples: 7802880 | consumed tokens: 15980298240 | elapsed time per iteration (s): 0.22 | learning rate: 3.684E-05 | global batch size: 256 | lm loss: 3.699669E+00 | grad norm: 0.223 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.794 | TFLOPs: 29.29 | +7: iteration 30490/ 37905 | consumed samples: 7805440 | consumed tokens: 15985541120 | elapsed time per iteration (s): 0.22 | learning rate: 3.679E-05 | global batch size: 256 | lm loss: 3.698359E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.188 | TFLOPs: 29.28 | +7: iteration 30500/ 37905 | consumed samples: 7808000 | consumed tokens: 15990784000 | elapsed time per iteration (s): 0.22 | learning rate: 3.675E-05 | global batch size: 256 | lm loss: 3.705420E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.007 | TFLOPs: 29.30 | +7: iteration 30510/ 37905 | consumed samples: 7810560 | consumed tokens: 15996026880 | elapsed time per iteration (s): 0.22 | learning rate: 3.670E-05 | global batch size: 256 | lm loss: 3.701853E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.650 | TFLOPs: 29.29 | +7: iteration 30520/ 37905 | consumed samples: 7813120 | consumed tokens: 16001269760 | elapsed time per iteration (s): 0.22 | learning rate: 3.666E-05 | global batch size: 256 | lm loss: 3.694220E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.985 | TFLOPs: 29.30 | +7: iteration 30530/ 37905 | consumed samples: 7815680 | consumed tokens: 16006512640 | elapsed time per iteration (s): 0.22 | learning rate: 3.662E-05 | global batch size: 256 | lm loss: 3.702463E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.716 | TFLOPs: 29.31 | +7: iteration 30540/ 37905 | consumed samples: 7818240 | consumed tokens: 16011755520 | elapsed time per iteration (s): 0.22 | learning rate: 3.657E-05 | global batch size: 256 | lm loss: 3.684875E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.459 | TFLOPs: 29.33 | +7: iteration 30550/ 37905 | consumed samples: 7820800 | consumed tokens: 16016998400 | elapsed time per iteration (s): 0.22 | learning rate: 3.653E-05 | global batch size: 256 | lm loss: 3.699290E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.919 | TFLOPs: 29.32 | +7: iteration 30560/ 37905 | consumed samples: 7823360 | consumed tokens: 16022241280 | elapsed time per iteration (s): 0.22 | learning rate: 3.649E-05 | global batch size: 256 | lm loss: 3.701371E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.235 | TFLOPs: 29.33 | +7: iteration 30570/ 37905 | consumed samples: 7825920 | consumed tokens: 16027484160 | elapsed time per iteration (s): 0.22 | learning rate: 3.644E-05 | global batch size: 256 | lm loss: 3.693019E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.816 | TFLOPs: 29.32 | +7: iteration 30580/ 37905 | consumed samples: 7828480 | consumed tokens: 16032727040 | elapsed time per iteration (s): 0.22 | learning rate: 3.640E-05 | global batch size: 256 | lm loss: 3.696520E+00 | grad norm: 0.225 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.492 | TFLOPs: 29.33 | +7: iteration 30590/ 37905 | consumed samples: 7831040 | consumed tokens: 16037969920 | elapsed time per iteration (s): 0.23 | learning rate: 3.636E-05 | global batch size: 256 | lm loss: 3.689025E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.157 | TFLOPs: 28.94 | +7: iteration 30600/ 37905 | consumed samples: 7833600 | consumed tokens: 16043212800 | elapsed time per iteration (s): 0.22 | learning rate: 3.631E-05 | global batch size: 256 | lm loss: 3.689902E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.155 | TFLOPs: 29.02 | +7: iteration 30610/ 37905 | consumed samples: 7836160 | consumed tokens: 16048455680 | elapsed time per iteration (s): 0.22 | learning rate: 3.627E-05 | global batch size: 256 | lm loss: 3.685796E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.224 | TFLOPs: 29.30 | +7: iteration 30620/ 37905 | consumed samples: 7838720 | consumed tokens: 16053698560 | elapsed time per iteration (s): 0.22 | learning rate: 3.623E-05 | global batch size: 256 | lm loss: 3.691793E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.685 | TFLOPs: 29.19 | +7: iteration 30630/ 37905 | consumed samples: 7841280 | consumed tokens: 16058941440 | elapsed time per iteration (s): 0.22 | learning rate: 3.618E-05 | global batch size: 256 | lm loss: 3.709026E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.855 | TFLOPs: 29.04 | +7: iteration 30640/ 37905 | consumed samples: 7843840 | consumed tokens: 16064184320 | elapsed time per iteration (s): 0.22 | learning rate: 3.614E-05 | global batch size: 256 | lm loss: 3.699595E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.711 | TFLOPs: 29.31 | +7: iteration 30650/ 37905 | consumed samples: 7846400 | consumed tokens: 16069427200 | elapsed time per iteration (s): 0.22 | learning rate: 3.610E-05 | global batch size: 256 | lm loss: 3.690725E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.358 | TFLOPs: 29.33 | +7: iteration 30660/ 37905 | consumed samples: 7848960 | consumed tokens: 16074670080 | elapsed time per iteration (s): 0.22 | learning rate: 3.605E-05 | global batch size: 256 | lm loss: 3.717766E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.595 | TFLOPs: 29.34 | +7: iteration 30670/ 37905 | consumed samples: 7851520 | consumed tokens: 16079912960 | elapsed time per iteration (s): 0.22 | learning rate: 3.601E-05 | global batch size: 256 | lm loss: 3.714556E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.664 | TFLOPs: 29.34 | +7: iteration 30680/ 37905 | consumed samples: 7854080 | consumed tokens: 16085155840 | elapsed time per iteration (s): 0.22 | learning rate: 3.597E-05 | global batch size: 256 | lm loss: 3.690696E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.800 | TFLOPs: 29.32 | +7: iteration 30690/ 37905 | consumed samples: 7856640 | consumed tokens: 16090398720 | elapsed time per iteration (s): 0.22 | learning rate: 3.593E-05 | global batch size: 256 | lm loss: 3.689959E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.117 | TFLOPs: 29.32 | +7: iteration 30700/ 37905 | consumed samples: 7859200 | consumed tokens: 16095641600 | elapsed time per iteration (s): 0.22 | learning rate: 3.588E-05 | global batch size: 256 | lm loss: 3.709471E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.367 | TFLOPs: 29.33 | +7: iteration 30710/ 37905 | consumed samples: 7861760 | consumed tokens: 16100884480 | elapsed time per iteration (s): 0.22 | learning rate: 3.584E-05 | global batch size: 256 | lm loss: 3.688674E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.230 | TFLOPs: 29.33 | +7: iteration 30720/ 37905 | consumed samples: 7864320 | consumed tokens: 16106127360 | elapsed time per iteration (s): 0.22 | learning rate: 3.580E-05 | global batch size: 256 | lm loss: 3.687587E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.196 | TFLOPs: 29.33 | +7: iteration 30730/ 37905 | consumed samples: 7866880 | consumed tokens: 16111370240 | elapsed time per iteration (s): 0.22 | learning rate: 3.575E-05 | global batch size: 256 | lm loss: 3.698295E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.994 | TFLOPs: 29.32 | +7: iteration 30740/ 37905 | consumed samples: 7869440 | consumed tokens: 16116613120 | elapsed time per iteration (s): 0.22 | learning rate: 3.571E-05 | global batch size: 256 | lm loss: 3.688668E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.266 | TFLOPs: 29.30 | +7: iteration 30750/ 37905 | consumed samples: 7872000 | consumed tokens: 16121856000 | elapsed time per iteration (s): 0.22 | learning rate: 3.567E-05 | global batch size: 256 | lm loss: 3.690630E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.719 | TFLOPs: 29.31 | +7: iteration 30760/ 37905 | consumed samples: 7874560 | consumed tokens: 16127098880 | elapsed time per iteration (s): 0.22 | learning rate: 3.563E-05 | global batch size: 256 | lm loss: 3.710098E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.971 | TFLOPs: 29.32 | +7: iteration 30770/ 37905 | consumed samples: 7877120 | consumed tokens: 16132341760 | elapsed time per iteration (s): 0.22 | learning rate: 3.558E-05 | global batch size: 256 | lm loss: 3.700925E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.164 | TFLOPs: 29.33 | +7: iteration 30780/ 37905 | consumed samples: 7879680 | consumed tokens: 16137584640 | elapsed time per iteration (s): 0.22 | learning rate: 3.554E-05 | global batch size: 256 | lm loss: 3.690352E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.200 | TFLOPs: 29.33 | +7: iteration 30790/ 37905 | consumed samples: 7882240 | consumed tokens: 16142827520 | elapsed time per iteration (s): 0.22 | learning rate: 3.550E-05 | global batch size: 256 | lm loss: 3.697937E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.286 | TFLOPs: 29.33 | +7: iteration 30800/ 37905 | consumed samples: 7884800 | consumed tokens: 16148070400 | elapsed time per iteration (s): 0.22 | learning rate: 3.546E-05 | global batch size: 256 | lm loss: 3.687667E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.492 | TFLOPs: 29.33 | +7: iteration 30810/ 37905 | consumed samples: 7887360 | consumed tokens: 16153313280 | elapsed time per iteration (s): 0.22 | learning rate: 3.542E-05 | global batch size: 256 | lm loss: 3.686785E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.024 | TFLOPs: 29.35 | +7: iteration 30820/ 37905 | consumed samples: 7889920 | consumed tokens: 16158556160 | elapsed time per iteration (s): 0.22 | learning rate: 3.537E-05 | global batch size: 256 | lm loss: 3.693165E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.612 | TFLOPs: 29.34 | +7: iteration 30830/ 37905 | consumed samples: 7892480 | consumed tokens: 16163799040 | elapsed time per iteration (s): 0.22 | learning rate: 3.533E-05 | global batch size: 256 | lm loss: 3.698227E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.756 | TFLOPs: 29.32 | +7: iteration 30840/ 37905 | consumed samples: 7895040 | consumed tokens: 16169041920 | elapsed time per iteration (s): 0.22 | learning rate: 3.529E-05 | global batch size: 256 | lm loss: 3.703270E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.538 | TFLOPs: 29.31 | +7: iteration 30850/ 37905 | consumed samples: 7897600 | consumed tokens: 16174284800 | elapsed time per iteration (s): 0.22 | learning rate: 3.525E-05 | global batch size: 256 | lm loss: 3.712786E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.794 | TFLOPs: 29.32 | +7: iteration 30860/ 37905 | consumed samples: 7900160 | consumed tokens: 16179527680 | elapsed time per iteration (s): 0.22 | learning rate: 3.521E-05 | global batch size: 256 | lm loss: 3.712876E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.255 | TFLOPs: 29.33 | +7: iteration 30870/ 37905 | consumed samples: 7902720 | consumed tokens: 16184770560 | elapsed time per iteration (s): 0.22 | learning rate: 3.516E-05 | global batch size: 256 | lm loss: 3.695126E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.706 | TFLOPs: 29.31 | +7: iteration 30880/ 37905 | consumed samples: 7905280 | consumed tokens: 16190013440 | elapsed time per iteration (s): 0.22 | learning rate: 3.512E-05 | global batch size: 256 | lm loss: 3.701085E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.433 | TFLOPs: 29.33 | +7: iteration 30890/ 37905 | consumed samples: 7907840 | consumed tokens: 16195256320 | elapsed time per iteration (s): 0.22 | learning rate: 3.508E-05 | global batch size: 256 | lm loss: 3.686887E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.579 | TFLOPs: 29.34 | +7: iteration 30900/ 37905 | consumed samples: 7910400 | consumed tokens: 16200499200 | elapsed time per iteration (s): 0.22 | learning rate: 3.504E-05 | global batch size: 256 | lm loss: 3.696439E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.075 | TFLOPs: 29.32 | +7: iteration 30910/ 37905 | consumed samples: 7912960 | consumed tokens: 16205742080 | elapsed time per iteration (s): 0.22 | learning rate: 3.500E-05 | global batch size: 256 | lm loss: 3.711419E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.587 | TFLOPs: 29.31 | +7: iteration 30920/ 37905 | consumed samples: 7915520 | consumed tokens: 16210984960 | elapsed time per iteration (s): 0.22 | learning rate: 3.495E-05 | global batch size: 256 | lm loss: 3.699646E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.219 | TFLOPs: 29.33 | +7: iteration 30930/ 37905 | consumed samples: 7918080 | consumed tokens: 16216227840 | elapsed time per iteration (s): 0.22 | learning rate: 3.491E-05 | global batch size: 256 | lm loss: 3.696264E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.569 | TFLOPs: 29.34 | +7: iteration 30940/ 37905 | consumed samples: 7920640 | consumed tokens: 16221470720 | elapsed time per iteration (s): 0.22 | learning rate: 3.487E-05 | global batch size: 256 | lm loss: 3.689260E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.198 | TFLOPs: 29.33 | +7: iteration 30950/ 37905 | consumed samples: 7923200 | consumed tokens: 16226713600 | elapsed time per iteration (s): 0.22 | learning rate: 3.483E-05 | global batch size: 256 | lm loss: 3.698156E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.042 | TFLOPs: 29.32 | +7: iteration 30960/ 37905 | consumed samples: 7925760 | consumed tokens: 16231956480 | elapsed time per iteration (s): 0.22 | learning rate: 3.479E-05 | global batch size: 256 | lm loss: 3.701347E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.604 | TFLOPs: 29.31 | +7: iteration 30970/ 37905 | consumed samples: 7928320 | consumed tokens: 16237199360 | elapsed time per iteration (s): 0.22 | learning rate: 3.475E-05 | global batch size: 256 | lm loss: 3.705994E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.821 | TFLOPs: 29.32 | +7: iteration 30980/ 37905 | consumed samples: 7930880 | consumed tokens: 16242442240 | elapsed time per iteration (s): 0.22 | learning rate: 3.471E-05 | global batch size: 256 | lm loss: 3.700665E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.120 | TFLOPs: 29.32 | +7: iteration 30990/ 37905 | consumed samples: 7933440 | consumed tokens: 16247685120 | elapsed time per iteration (s): 0.22 | learning rate: 3.466E-05 | global batch size: 256 | lm loss: 3.687585E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.495 | TFLOPs: 29.31 | +7: iteration 31000/ 37905 | consumed samples: 7936000 | consumed tokens: 16252928000 | elapsed time per iteration (s): 0.22 | learning rate: 3.462E-05 | global batch size: 256 | lm loss: 3.690397E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.170 | TFLOPs: 29.33 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 31000 | lm loss value: 3.664255E+00 | lm loss PPL: 3.902706E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 31000 to checkpoints_83m20b400m +0: [2023-03-15 23:52:59,358] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step31000 is begin to save! +0: [2023-03-15 23:52:59,362] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:52:59,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:52:59,438] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:52:59,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:52:59,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:52:59,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:52:59,462] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:52:59,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:52:59,473] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:52:59,484] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:52:59,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:52:59,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:52:59,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:52:59,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:52:59,507] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:52:59,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:52:59,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:52:59,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:52:59,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:52:59,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:52:59,540] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:52:59,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:52:59,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:52:59,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:52:59,552] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step31000/mp_rank_00_model_states.pt +0: [2023-03-15 23:52:59,552] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:52:59,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,571] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:52:59,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +5: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +3: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:52:59,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:52:59,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +1: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +6: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +2: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +6: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +2: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +7: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:52:59,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step31000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +7: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +4: [2023-03-15 23:52:59,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step31000 is ready now! +0: successfully saved checkpoint at iteration 31000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 245.33 +7: iteration 31010/ 37905 | consumed samples: 7938560 | consumed tokens: 16258170880 | elapsed time per iteration (s): 0.25 | learning rate: 3.458E-05 | global batch size: 256 | lm loss: 3.707769E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1011.381 | TFLOPs: 25.77 | +7: iteration 31020/ 37905 | consumed samples: 7941120 | consumed tokens: 16263413760 | elapsed time per iteration (s): 0.22 | learning rate: 3.454E-05 | global batch size: 256 | lm loss: 3.712775E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.926 | TFLOPs: 29.29 | +7: iteration 31030/ 37905 | consumed samples: 7943680 | consumed tokens: 16268656640 | elapsed time per iteration (s): 0.22 | learning rate: 3.450E-05 | global batch size: 256 | lm loss: 3.692776E+00 | grad norm: 0.219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.607 | TFLOPs: 29.29 | +7: iteration 31040/ 37905 | consumed samples: 7946240 | consumed tokens: 16273899520 | elapsed time per iteration (s): 0.22 | learning rate: 3.446E-05 | global batch size: 256 | lm loss: 3.717509E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.364 | TFLOPs: 29.28 | +7: iteration 31050/ 37905 | consumed samples: 7948800 | consumed tokens: 16279142400 | elapsed time per iteration (s): 0.22 | learning rate: 3.442E-05 | global batch size: 256 | lm loss: 3.701183E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.464 | TFLOPs: 29.28 | +7: iteration 31060/ 37905 | consumed samples: 7951360 | consumed tokens: 16284385280 | elapsed time per iteration (s): 0.22 | learning rate: 3.438E-05 | global batch size: 256 | lm loss: 3.698894E+00 | grad norm: 0.228 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.729 | TFLOPs: 29.24 | +7: iteration 31070/ 37905 | consumed samples: 7953920 | consumed tokens: 16289628160 | elapsed time per iteration (s): 0.22 | learning rate: 3.434E-05 | global batch size: 256 | lm loss: 3.695118E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.949 | TFLOPs: 29.24 | +7: iteration 31080/ 37905 | consumed samples: 7956480 | consumed tokens: 16294871040 | elapsed time per iteration (s): 0.22 | learning rate: 3.430E-05 | global batch size: 256 | lm loss: 3.684740E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.732 | TFLOPs: 29.29 | +7: iteration 31090/ 37905 | consumed samples: 7959040 | consumed tokens: 16300113920 | elapsed time per iteration (s): 0.23 | learning rate: 3.426E-05 | global batch size: 256 | lm loss: 3.703093E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.688 | TFLOPs: 28.47 | +7: iteration 31100/ 37905 | consumed samples: 7961600 | consumed tokens: 16305356800 | elapsed time per iteration (s): 0.22 | learning rate: 3.421E-05 | global batch size: 256 | lm loss: 3.692519E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.112 | TFLOPs: 29.27 | +7: iteration 31110/ 37905 | consumed samples: 7964160 | consumed tokens: 16310599680 | elapsed time per iteration (s): 0.22 | learning rate: 3.417E-05 | global batch size: 256 | lm loss: 3.700912E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.318 | TFLOPs: 29.28 | +7: iteration 31120/ 37905 | consumed samples: 7966720 | consumed tokens: 16315842560 | elapsed time per iteration (s): 0.22 | learning rate: 3.413E-05 | global batch size: 256 | lm loss: 3.689767E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.320 | TFLOPs: 29.28 | +7: iteration 31130/ 37905 | consumed samples: 7969280 | consumed tokens: 16321085440 | elapsed time per iteration (s): 0.22 | learning rate: 3.409E-05 | global batch size: 256 | lm loss: 3.707225E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.777 | TFLOPs: 29.32 | +7: iteration 31140/ 37905 | consumed samples: 7971840 | consumed tokens: 16326328320 | elapsed time per iteration (s): 0.22 | learning rate: 3.405E-05 | global batch size: 256 | lm loss: 3.693043E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.915 | TFLOPs: 29.17 | +7: iteration 31150/ 37905 | consumed samples: 7974400 | consumed tokens: 16331571200 | elapsed time per iteration (s): 0.22 | learning rate: 3.401E-05 | global batch size: 256 | lm loss: 3.702153E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.560 | TFLOPs: 29.21 | +7: iteration 31160/ 37905 | consumed samples: 7976960 | consumed tokens: 16336814080 | elapsed time per iteration (s): 0.22 | learning rate: 3.397E-05 | global batch size: 256 | lm loss: 3.693845E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.902 | TFLOPs: 29.01 | +7: iteration 31170/ 37905 | consumed samples: 7979520 | consumed tokens: 16342056960 | elapsed time per iteration (s): 0.22 | learning rate: 3.393E-05 | global batch size: 256 | lm loss: 3.707628E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.584 | TFLOPs: 29.26 | +7: iteration 31180/ 37905 | consumed samples: 7982080 | consumed tokens: 16347299840 | elapsed time per iteration (s): 0.22 | learning rate: 3.389E-05 | global batch size: 256 | lm loss: 3.701305E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.362 | TFLOPs: 29.31 | +7: iteration 31190/ 37905 | consumed samples: 7984640 | consumed tokens: 16352542720 | elapsed time per iteration (s): 0.22 | learning rate: 3.385E-05 | global batch size: 256 | lm loss: 3.698449E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.872 | TFLOPs: 29.29 | +7: iteration 31200/ 37905 | consumed samples: 7987200 | consumed tokens: 16357785600 | elapsed time per iteration (s): 0.22 | learning rate: 3.381E-05 | global batch size: 256 | lm loss: 3.693448E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.955 | TFLOPs: 29.30 | +7: iteration 31210/ 37905 | consumed samples: 7989760 | consumed tokens: 16363028480 | elapsed time per iteration (s): 0.22 | learning rate: 3.377E-05 | global batch size: 256 | lm loss: 3.694353E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.181 | TFLOPs: 29.28 | +7: iteration 31220/ 37905 | consumed samples: 7992320 | consumed tokens: 16368271360 | elapsed time per iteration (s): 0.22 | learning rate: 3.373E-05 | global batch size: 256 | lm loss: 3.702073E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.523 | TFLOPs: 29.31 | +7: iteration 31230/ 37905 | consumed samples: 7994880 | consumed tokens: 16373514240 | elapsed time per iteration (s): 0.22 | learning rate: 3.369E-05 | global batch size: 256 | lm loss: 3.709530E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.662 | TFLOPs: 29.31 | +7: iteration 31240/ 37905 | consumed samples: 7997440 | consumed tokens: 16378757120 | elapsed time per iteration (s): 0.22 | learning rate: 3.365E-05 | global batch size: 256 | lm loss: 3.686269E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.561 | TFLOPs: 29.23 | +7: iteration 31250/ 37905 | consumed samples: 8000000 | consumed tokens: 16384000000 | elapsed time per iteration (s): 0.22 | learning rate: 3.361E-05 | global batch size: 256 | lm loss: 3.699903E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.378 | TFLOPs: 29.26 | +7: iteration 31260/ 37905 | consumed samples: 8002560 | consumed tokens: 16389242880 | elapsed time per iteration (s): 0.22 | learning rate: 3.357E-05 | global batch size: 256 | lm loss: 3.689772E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.628 | TFLOPs: 29.24 | +7: iteration 31270/ 37905 | consumed samples: 8005120 | consumed tokens: 16394485760 | elapsed time per iteration (s): 0.22 | learning rate: 3.353E-05 | global batch size: 256 | lm loss: 3.696279E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.881 | TFLOPs: 29.24 | +7: iteration 31280/ 37905 | consumed samples: 8007680 | consumed tokens: 16399728640 | elapsed time per iteration (s): 0.22 | learning rate: 3.349E-05 | global batch size: 256 | lm loss: 3.707381E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.310 | TFLOPs: 29.25 | +7: iteration 31290/ 37905 | consumed samples: 8010240 | consumed tokens: 16404971520 | elapsed time per iteration (s): 0.22 | learning rate: 3.345E-05 | global batch size: 256 | lm loss: 3.704599E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.273 | TFLOPs: 29.25 | +7: iteration 31300/ 37905 | consumed samples: 8012800 | consumed tokens: 16410214400 | elapsed time per iteration (s): 0.22 | learning rate: 3.341E-05 | global batch size: 256 | lm loss: 3.699869E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.554 | TFLOPs: 29.26 | +7: iteration 31310/ 37905 | consumed samples: 8015360 | consumed tokens: 16415457280 | elapsed time per iteration (s): 0.22 | learning rate: 3.337E-05 | global batch size: 256 | lm loss: 3.700814E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.194 | TFLOPs: 29.25 | +7: iteration 31320/ 37905 | consumed samples: 8017920 | consumed tokens: 16420700160 | elapsed time per iteration (s): 0.22 | learning rate: 3.333E-05 | global batch size: 256 | lm loss: 3.706860E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.302 | TFLOPs: 29.25 | +7: iteration 31330/ 37905 | consumed samples: 8020480 | consumed tokens: 16425943040 | elapsed time per iteration (s): 0.23 | learning rate: 3.329E-05 | global batch size: 256 | lm loss: 3.701789E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.266 | TFLOPs: 28.72 | +7: iteration 31340/ 37905 | consumed samples: 8023040 | consumed tokens: 16431185920 | elapsed time per iteration (s): 0.23 | learning rate: 3.325E-05 | global batch size: 256 | lm loss: 3.694638E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.864 | TFLOPs: 28.86 | +7: iteration 31350/ 37905 | consumed samples: 8025600 | consumed tokens: 16436428800 | elapsed time per iteration (s): 0.22 | learning rate: 3.322E-05 | global batch size: 256 | lm loss: 3.700810E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.253 | TFLOPs: 29.30 | +7: iteration 31360/ 37905 | consumed samples: 8028160 | consumed tokens: 16441671680 | elapsed time per iteration (s): 0.22 | learning rate: 3.318E-05 | global batch size: 256 | lm loss: 3.706471E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.210 | TFLOPs: 29.30 | +7: iteration 31370/ 37905 | consumed samples: 8030720 | consumed tokens: 16446914560 | elapsed time per iteration (s): 0.22 | learning rate: 3.314E-05 | global batch size: 256 | lm loss: 3.698597E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.106 | TFLOPs: 29.30 | +7: iteration 31380/ 37905 | consumed samples: 8033280 | consumed tokens: 16452157440 | elapsed time per iteration (s): 0.22 | learning rate: 3.310E-05 | global batch size: 256 | lm loss: 3.698757E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.833 | TFLOPs: 29.32 | +7: iteration 31390/ 37905 | consumed samples: 8035840 | consumed tokens: 16457400320 | elapsed time per iteration (s): 0.22 | learning rate: 3.306E-05 | global batch size: 256 | lm loss: 3.689989E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.485 | TFLOPs: 29.31 | +7: iteration 31400/ 37905 | consumed samples: 8038400 | consumed tokens: 16462643200 | elapsed time per iteration (s): 0.22 | learning rate: 3.302E-05 | global batch size: 256 | lm loss: 3.686057E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.398 | TFLOPs: 29.33 | +7: iteration 31410/ 37905 | consumed samples: 8040960 | consumed tokens: 16467886080 | elapsed time per iteration (s): 0.22 | learning rate: 3.298E-05 | global batch size: 256 | lm loss: 3.688594E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.517 | TFLOPs: 29.34 | +7: iteration 31420/ 37905 | consumed samples: 8043520 | consumed tokens: 16473128960 | elapsed time per iteration (s): 0.22 | learning rate: 3.294E-05 | global batch size: 256 | lm loss: 3.693669E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.131 | TFLOPs: 29.38 | +7: iteration 31430/ 37905 | consumed samples: 8046080 | consumed tokens: 16478371840 | elapsed time per iteration (s): 0.22 | learning rate: 3.290E-05 | global batch size: 256 | lm loss: 3.684078E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.793 | TFLOPs: 29.34 | +7: iteration 31440/ 37905 | consumed samples: 8048640 | consumed tokens: 16483614720 | elapsed time per iteration (s): 0.22 | learning rate: 3.286E-05 | global batch size: 256 | lm loss: 3.704272E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.745 | TFLOPs: 29.37 | +7: iteration 31450/ 37905 | consumed samples: 8051200 | consumed tokens: 16488857600 | elapsed time per iteration (s): 0.22 | learning rate: 3.282E-05 | global batch size: 256 | lm loss: 3.695113E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.538 | TFLOPs: 29.36 | +7: iteration 31460/ 37905 | consumed samples: 8053760 | consumed tokens: 16494100480 | elapsed time per iteration (s): 0.22 | learning rate: 3.279E-05 | global batch size: 256 | lm loss: 3.689402E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.800 | TFLOPs: 29.39 | +7: iteration 31470/ 37905 | consumed samples: 8056320 | consumed tokens: 16499343360 | elapsed time per iteration (s): 0.22 | learning rate: 3.275E-05 | global batch size: 256 | lm loss: 3.696532E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.189 | TFLOPs: 29.40 | +7: iteration 31480/ 37905 | consumed samples: 8058880 | consumed tokens: 16504586240 | elapsed time per iteration (s): 0.22 | learning rate: 3.271E-05 | global batch size: 256 | lm loss: 3.700282E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.056 | TFLOPs: 29.40 | +7: iteration 31490/ 37905 | consumed samples: 8061440 | consumed tokens: 16509829120 | elapsed time per iteration (s): 0.22 | learning rate: 3.267E-05 | global batch size: 256 | lm loss: 3.690007E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.266 | TFLOPs: 29.41 | +7: iteration 31500/ 37905 | consumed samples: 8064000 | consumed tokens: 16515072000 | elapsed time per iteration (s): 0.22 | learning rate: 3.263E-05 | global batch size: 256 | lm loss: 3.694262E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.601 | TFLOPs: 29.39 | +7: iteration 31510/ 37905 | consumed samples: 8066560 | consumed tokens: 16520314880 | elapsed time per iteration (s): 0.22 | learning rate: 3.259E-05 | global batch size: 256 | lm loss: 3.698381E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.927 | TFLOPs: 29.40 | +7: iteration 31520/ 37905 | consumed samples: 8069120 | consumed tokens: 16525557760 | elapsed time per iteration (s): 0.22 | learning rate: 3.255E-05 | global batch size: 256 | lm loss: 3.698704E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.131 | TFLOPs: 29.38 | +7: iteration 31530/ 37905 | consumed samples: 8071680 | consumed tokens: 16530800640 | elapsed time per iteration (s): 0.22 | learning rate: 3.252E-05 | global batch size: 256 | lm loss: 3.695325E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.257 | TFLOPs: 29.38 | +7: iteration 31540/ 37905 | consumed samples: 8074240 | consumed tokens: 16536043520 | elapsed time per iteration (s): 0.22 | learning rate: 3.248E-05 | global batch size: 256 | lm loss: 3.694786E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.622 | TFLOPs: 29.34 | +7: iteration 31550/ 37905 | consumed samples: 8076800 | consumed tokens: 16541286400 | elapsed time per iteration (s): 0.22 | learning rate: 3.244E-05 | global batch size: 256 | lm loss: 3.697885E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.575 | TFLOPs: 29.36 | +7: iteration 31560/ 37905 | consumed samples: 8079360 | consumed tokens: 16546529280 | elapsed time per iteration (s): 0.22 | learning rate: 3.240E-05 | global batch size: 256 | lm loss: 3.695837E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.549 | TFLOPs: 29.36 | +7: iteration 31570/ 37905 | consumed samples: 8081920 | consumed tokens: 16551772160 | elapsed time per iteration (s): 0.22 | learning rate: 3.236E-05 | global batch size: 256 | lm loss: 3.686322E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.540 | TFLOPs: 29.36 | +7: iteration 31580/ 37905 | consumed samples: 8084480 | consumed tokens: 16557015040 | elapsed time per iteration (s): 0.22 | learning rate: 3.233E-05 | global batch size: 256 | lm loss: 3.706859E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.322 | TFLOPs: 29.36 | +7: iteration 31590/ 37905 | consumed samples: 8087040 | consumed tokens: 16562257920 | elapsed time per iteration (s): 0.23 | learning rate: 3.229E-05 | global batch size: 256 | lm loss: 3.699469E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.632 | TFLOPs: 28.52 | +7: iteration 31600/ 37905 | consumed samples: 8089600 | consumed tokens: 16567500800 | elapsed time per iteration (s): 0.22 | learning rate: 3.225E-05 | global batch size: 256 | lm loss: 3.687936E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.666 | TFLOPs: 29.34 | +7: iteration 31610/ 37905 | consumed samples: 8092160 | consumed tokens: 16572743680 | elapsed time per iteration (s): 0.22 | learning rate: 3.221E-05 | global batch size: 256 | lm loss: 3.704762E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.102 | TFLOPs: 29.32 | +7: iteration 31620/ 37905 | consumed samples: 8094720 | consumed tokens: 16577986560 | elapsed time per iteration (s): 0.22 | learning rate: 3.217E-05 | global batch size: 256 | lm loss: 3.694295E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.243 | TFLOPs: 29.30 | +7: iteration 31630/ 37905 | consumed samples: 8097280 | consumed tokens: 16583229440 | elapsed time per iteration (s): 0.22 | learning rate: 3.214E-05 | global batch size: 256 | lm loss: 3.692012E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.764 | TFLOPs: 29.26 | +7: iteration 31640/ 37905 | consumed samples: 8099840 | consumed tokens: 16588472320 | elapsed time per iteration (s): 0.22 | learning rate: 3.210E-05 | global batch size: 256 | lm loss: 3.694216E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.387 | TFLOPs: 29.26 | +7: iteration 31650/ 37905 | consumed samples: 8102400 | consumed tokens: 16593715200 | elapsed time per iteration (s): 0.22 | learning rate: 3.206E-05 | global batch size: 256 | lm loss: 3.701478E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.305 | TFLOPs: 29.25 | +7: iteration 31660/ 37905 | consumed samples: 8104960 | consumed tokens: 16598958080 | elapsed time per iteration (s): 0.22 | learning rate: 3.202E-05 | global batch size: 256 | lm loss: 3.685719E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.922 | TFLOPs: 29.27 | +7: iteration 31670/ 37905 | consumed samples: 8107520 | consumed tokens: 16604200960 | elapsed time per iteration (s): 0.22 | learning rate: 3.199E-05 | global batch size: 256 | lm loss: 3.701374E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.007 | TFLOPs: 29.27 | +7: iteration 31680/ 37905 | consumed samples: 8110080 | consumed tokens: 16609443840 | elapsed time per iteration (s): 0.22 | learning rate: 3.195E-05 | global batch size: 256 | lm loss: 3.700289E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.458 | TFLOPs: 29.28 | +7: iteration 31690/ 37905 | consumed samples: 8112640 | consumed tokens: 16614686720 | elapsed time per iteration (s): 0.22 | learning rate: 3.191E-05 | global batch size: 256 | lm loss: 3.687166E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.607 | TFLOPs: 29.29 | +7: iteration 31700/ 37905 | consumed samples: 8115200 | consumed tokens: 16619929600 | elapsed time per iteration (s): 0.22 | learning rate: 3.187E-05 | global batch size: 256 | lm loss: 3.696980E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.647 | TFLOPs: 29.29 | +7: iteration 31710/ 37905 | consumed samples: 8117760 | consumed tokens: 16625172480 | elapsed time per iteration (s): 0.22 | learning rate: 3.184E-05 | global batch size: 256 | lm loss: 3.691886E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.867 | TFLOPs: 29.29 | +7: iteration 31720/ 37905 | consumed samples: 8120320 | consumed tokens: 16630415360 | elapsed time per iteration (s): 0.22 | learning rate: 3.180E-05 | global batch size: 256 | lm loss: 3.698037E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.892 | TFLOPs: 29.27 | +7: iteration 31730/ 37905 | consumed samples: 8122880 | consumed tokens: 16635658240 | elapsed time per iteration (s): 0.22 | learning rate: 3.176E-05 | global batch size: 256 | lm loss: 3.695889E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.837 | TFLOPs: 29.29 | +7: iteration 31740/ 37905 | consumed samples: 8125440 | consumed tokens: 16640901120 | elapsed time per iteration (s): 0.22 | learning rate: 3.172E-05 | global batch size: 256 | lm loss: 3.702901E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.306 | TFLOPs: 29.28 | +7: iteration 31750/ 37905 | consumed samples: 8128000 | consumed tokens: 16646144000 | elapsed time per iteration (s): 0.22 | learning rate: 3.169E-05 | global batch size: 256 | lm loss: 3.689590E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.726 | TFLOPs: 29.29 | +7: iteration 31760/ 37905 | consumed samples: 8130560 | consumed tokens: 16651386880 | elapsed time per iteration (s): 0.22 | learning rate: 3.165E-05 | global batch size: 256 | lm loss: 3.705151E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.830 | TFLOPs: 29.29 | +7: iteration 31770/ 37905 | consumed samples: 8133120 | consumed tokens: 16656629760 | elapsed time per iteration (s): 0.22 | learning rate: 3.161E-05 | global batch size: 256 | lm loss: 3.698902E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.317 | TFLOPs: 29.30 | +7: iteration 31780/ 37905 | consumed samples: 8135680 | consumed tokens: 16661872640 | elapsed time per iteration (s): 0.22 | learning rate: 3.158E-05 | global batch size: 256 | lm loss: 3.700720E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.494 | TFLOPs: 29.31 | +7: iteration 31790/ 37905 | consumed samples: 8138240 | consumed tokens: 16667115520 | elapsed time per iteration (s): 0.22 | learning rate: 3.154E-05 | global batch size: 256 | lm loss: 3.694476E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.771 | TFLOPs: 29.29 | +7: iteration 31800/ 37905 | consumed samples: 8140800 | consumed tokens: 16672358400 | elapsed time per iteration (s): 0.22 | learning rate: 3.150E-05 | global batch size: 256 | lm loss: 3.703812E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.911 | TFLOPs: 29.32 | +7: iteration 31810/ 37905 | consumed samples: 8143360 | consumed tokens: 16677601280 | elapsed time per iteration (s): 0.22 | learning rate: 3.146E-05 | global batch size: 256 | lm loss: 3.698648E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.701 | TFLOPs: 29.31 | +7: iteration 31820/ 37905 | consumed samples: 8145920 | consumed tokens: 16682844160 | elapsed time per iteration (s): 0.22 | learning rate: 3.143E-05 | global batch size: 256 | lm loss: 3.695565E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.912 | TFLOPs: 29.32 | +7: iteration 31830/ 37905 | consumed samples: 8148480 | consumed tokens: 16688087040 | elapsed time per iteration (s): 0.22 | learning rate: 3.139E-05 | global batch size: 256 | lm loss: 3.704903E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.831 | TFLOPs: 29.29 | +7: iteration 31840/ 37905 | consumed samples: 8151040 | consumed tokens: 16693329920 | elapsed time per iteration (s): 0.22 | learning rate: 3.135E-05 | global batch size: 256 | lm loss: 3.699932E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.315 | TFLOPs: 29.30 | +7: iteration 31850/ 37905 | consumed samples: 8153600 | consumed tokens: 16698572800 | elapsed time per iteration (s): 0.22 | learning rate: 3.132E-05 | global batch size: 256 | lm loss: 3.704392E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.962 | TFLOPs: 29.30 | +7: iteration 31860/ 37905 | consumed samples: 8156160 | consumed tokens: 16703815680 | elapsed time per iteration (s): 0.22 | learning rate: 3.128E-05 | global batch size: 256 | lm loss: 3.693159E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.291 | TFLOPs: 29.28 | +7: iteration 31870/ 37905 | consumed samples: 8158720 | consumed tokens: 16709058560 | elapsed time per iteration (s): 0.22 | learning rate: 3.124E-05 | global batch size: 256 | lm loss: 3.687460E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.894 | TFLOPs: 29.29 | +7: iteration 31880/ 37905 | consumed samples: 8161280 | consumed tokens: 16714301440 | elapsed time per iteration (s): 0.22 | learning rate: 3.121E-05 | global batch size: 256 | lm loss: 3.703518E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.450 | TFLOPs: 29.28 | +7: iteration 31890/ 37905 | consumed samples: 8163840 | consumed tokens: 16719544320 | elapsed time per iteration (s): 0.22 | learning rate: 3.117E-05 | global batch size: 256 | lm loss: 3.699640E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.251 | TFLOPs: 29.28 | +7: iteration 31900/ 37905 | consumed samples: 8166400 | consumed tokens: 16724787200 | elapsed time per iteration (s): 0.22 | learning rate: 3.114E-05 | global batch size: 256 | lm loss: 3.700084E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.205 | TFLOPs: 29.28 | +7: iteration 31910/ 37905 | consumed samples: 8168960 | consumed tokens: 16730030080 | elapsed time per iteration (s): 0.22 | learning rate: 3.110E-05 | global batch size: 256 | lm loss: 3.687181E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.192 | TFLOPs: 29.28 | +7: iteration 31920/ 37905 | consumed samples: 8171520 | consumed tokens: 16735272960 | elapsed time per iteration (s): 0.22 | learning rate: 3.106E-05 | global batch size: 256 | lm loss: 3.684467E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.726 | TFLOPs: 29.29 | +7: iteration 31930/ 37905 | consumed samples: 8174080 | consumed tokens: 16740515840 | elapsed time per iteration (s): 0.22 | learning rate: 3.103E-05 | global batch size: 256 | lm loss: 3.691895E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.157 | TFLOPs: 29.30 | +7: iteration 31940/ 37905 | consumed samples: 8176640 | consumed tokens: 16745758720 | elapsed time per iteration (s): 0.22 | learning rate: 3.099E-05 | global batch size: 256 | lm loss: 3.695320E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.259 | TFLOPs: 29.28 | +7: iteration 31950/ 37905 | consumed samples: 8179200 | consumed tokens: 16751001600 | elapsed time per iteration (s): 0.22 | learning rate: 3.095E-05 | global batch size: 256 | lm loss: 3.690207E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.292 | TFLOPs: 29.28 | +7: iteration 31960/ 37905 | consumed samples: 8181760 | consumed tokens: 16756244480 | elapsed time per iteration (s): 0.22 | learning rate: 3.092E-05 | global batch size: 256 | lm loss: 3.690042E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.795 | TFLOPs: 29.29 | +7: iteration 31970/ 37905 | consumed samples: 8184320 | consumed tokens: 16761487360 | elapsed time per iteration (s): 0.22 | learning rate: 3.088E-05 | global batch size: 256 | lm loss: 3.696490E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.919 | TFLOPs: 29.27 | +7: iteration 31980/ 37905 | consumed samples: 8186880 | consumed tokens: 16766730240 | elapsed time per iteration (s): 0.22 | learning rate: 3.085E-05 | global batch size: 256 | lm loss: 3.695010E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.532 | TFLOPs: 29.26 | +7: iteration 31990/ 37905 | consumed samples: 8189440 | consumed tokens: 16771973120 | elapsed time per iteration (s): 0.22 | learning rate: 3.081E-05 | global batch size: 256 | lm loss: 3.690105E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.438 | TFLOPs: 29.00 | +0: [2023-03-15 23:56:42,433] [INFO] [logging.py:68:log_dist] [Rank 0] step=32000, skipped=0, lr=[3.077546313866371e-05, 3.077546313866371e-05, 3.077546313866371e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 32000/ 37905 | consumed samples: 8192000 | consumed tokens: 16777216000 | elapsed time per iteration (s): 0.22 | learning rate: 3.078E-05 | global batch size: 256 | lm loss: 3.696385E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.417 | TFLOPs: 29.28 | +0: steps: 32000 loss: 3.7138 iter time (s): 0.221 samples/sec: 1158.109 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 32000 | lm loss value: 3.646358E+00 | lm loss PPL: 3.833481E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 32000 to checkpoints_83m20b400m +0: [2023-03-15 23:56:42,523] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step32000 is begin to save! +0: [2023-03-15 23:56:42,526] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_01-model_00-model_states.pt... +0: [2023-03-15 23:56:42,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_01-model_00-model_states.pt. +0: [2023-03-15 23:56:42,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_03-model_00-model_states.pt... +0: [2023-03-15 23:56:42,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_03-model_00-model_states.pt. +0: [2023-03-15 23:56:42,609] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_04-model_00-model_states.pt... +0: [2023-03-15 23:56:42,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_04-model_00-model_states.pt. +0: [2023-03-15 23:56:42,621] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_05-model_00-model_states.pt... +0: [2023-03-15 23:56:42,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_05-model_00-model_states.pt. +0: [2023-03-15 23:56:42,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_06-model_00-model_states.pt... +0: [2023-03-15 23:56:42,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_06-model_00-model_states.pt. +0: [2023-03-15 23:56:42,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_07-model_00-model_states.pt... +0: [2023-03-15 23:56:42,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_07-model_00-model_states.pt. +0: [2023-03-15 23:56:42,655] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_08-model_00-model_states.pt... +0: [2023-03-15 23:56:42,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_08-model_00-model_states.pt. +0: [2023-03-15 23:56:42,666] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_09-model_00-model_states.pt... +0: [2023-03-15 23:56:42,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_09-model_00-model_states.pt. +0: [2023-03-15 23:56:42,677] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_10-model_00-model_states.pt... +0: [2023-03-15 23:56:42,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_10-model_00-model_states.pt. +0: [2023-03-15 23:56:42,688] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_11-model_00-model_states.pt... +0: [2023-03-15 23:56:42,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_11-model_00-model_states.pt. +0: [2023-03-15 23:56:42,700] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_12-model_00-model_states.pt... +0: [2023-03-15 23:56:42,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_12-model_00-model_states.pt. +0: [2023-03-15 23:56:42,711] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/layer_14-model_00-model_states.pt... +0: [2023-03-15 23:56:42,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/layer_14-model_00-model_states.pt. +0: [2023-03-15 23:56:42,712] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step32000/mp_rank_00_model_states.pt +0: [2023-03-15 23:56:42,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/mp_rank_00_model_states.pt... +0: [2023-03-15 23:56:42,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/mp_rank_00_model_states.pt. +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +5: [2023-03-15 23:56:42,730] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +6: [2023-03-15 23:56:42,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-15 23:56:42,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +5: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +0: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +5: [2023-03-15 23:56:42,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +0: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +5: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +1: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +2: [2023-03-15 23:56:42,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +2: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +1: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +2: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +4: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +6: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +3: [2023-03-15 23:56:42,760] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-15 23:56:42,760] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +7: [2023-03-15 23:56:42,761] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-15 23:56:42,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step32000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-15 23:56:42,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step32000 is ready now! +0: successfully saved checkpoint at iteration 32000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 241.29 +7: iteration 32010/ 37905 | consumed samples: 8194560 | consumed tokens: 16782458880 | elapsed time per iteration (s): 0.25 | learning rate: 3.074E-05 | global batch size: 256 | lm loss: 3.694058E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1005.015 | TFLOPs: 25.60 | +7: iteration 32020/ 37905 | consumed samples: 8197120 | consumed tokens: 16787701760 | elapsed time per iteration (s): 0.22 | learning rate: 3.070E-05 | global batch size: 256 | lm loss: 3.682945E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.387 | TFLOPs: 29.33 | +7: iteration 32030/ 37905 | consumed samples: 8199680 | consumed tokens: 16792944640 | elapsed time per iteration (s): 0.22 | learning rate: 3.067E-05 | global batch size: 256 | lm loss: 3.708638E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.709 | TFLOPs: 29.31 | +7: iteration 32040/ 37905 | consumed samples: 8202240 | consumed tokens: 16798187520 | elapsed time per iteration (s): 0.22 | learning rate: 3.063E-05 | global batch size: 256 | lm loss: 3.697873E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.170 | TFLOPs: 29.33 | +7: iteration 32050/ 37905 | consumed samples: 8204800 | consumed tokens: 16803430400 | elapsed time per iteration (s): 0.22 | learning rate: 3.060E-05 | global batch size: 256 | lm loss: 3.691486E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.050 | TFLOPs: 29.32 | +7: iteration 32060/ 37905 | consumed samples: 8207360 | consumed tokens: 16808673280 | elapsed time per iteration (s): 0.23 | learning rate: 3.056E-05 | global batch size: 256 | lm loss: 3.704995E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.250 | TFLOPs: 28.90 | +7: iteration 32070/ 37905 | consumed samples: 8209920 | consumed tokens: 16813916160 | elapsed time per iteration (s): 0.22 | learning rate: 3.053E-05 | global batch size: 256 | lm loss: 3.715157E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.644 | TFLOPs: 29.34 | +7: iteration 32080/ 37905 | consumed samples: 8212480 | consumed tokens: 16819159040 | elapsed time per iteration (s): 0.22 | learning rate: 3.049E-05 | global batch size: 256 | lm loss: 3.678113E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.484 | TFLOPs: 29.33 | +7: iteration 32090/ 37905 | consumed samples: 8215040 | consumed tokens: 16824401920 | elapsed time per iteration (s): 0.22 | learning rate: 3.046E-05 | global batch size: 256 | lm loss: 3.692970E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.030 | TFLOPs: 29.32 | +7: iteration 32100/ 37905 | consumed samples: 8217600 | consumed tokens: 16829644800 | elapsed time per iteration (s): 0.22 | learning rate: 3.042E-05 | global batch size: 256 | lm loss: 3.711378E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.822 | TFLOPs: 29.04 | +7: iteration 32110/ 37905 | consumed samples: 8220160 | consumed tokens: 16834887680 | elapsed time per iteration (s): 0.22 | learning rate: 3.039E-05 | global batch size: 256 | lm loss: 3.691866E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.660 | TFLOPs: 29.34 | +7: iteration 32120/ 37905 | consumed samples: 8222720 | consumed tokens: 16840130560 | elapsed time per iteration (s): 0.22 | learning rate: 3.035E-05 | global batch size: 256 | lm loss: 3.693984E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.091 | TFLOPs: 29.35 | +7: iteration 32130/ 37905 | consumed samples: 8225280 | consumed tokens: 16845373440 | elapsed time per iteration (s): 0.22 | learning rate: 3.032E-05 | global batch size: 256 | lm loss: 3.699484E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.044 | TFLOPs: 29.32 | +7: iteration 32140/ 37905 | consumed samples: 8227840 | consumed tokens: 16850616320 | elapsed time per iteration (s): 0.22 | learning rate: 3.028E-05 | global batch size: 256 | lm loss: 3.692572E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.543 | TFLOPs: 29.34 | +7: iteration 32150/ 37905 | consumed samples: 8230400 | consumed tokens: 16855859200 | elapsed time per iteration (s): 0.22 | learning rate: 3.025E-05 | global batch size: 256 | lm loss: 3.702085E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.866 | TFLOPs: 29.34 | +7: iteration 32160/ 37905 | consumed samples: 8232960 | consumed tokens: 16861102080 | elapsed time per iteration (s): 0.22 | learning rate: 3.021E-05 | global batch size: 256 | lm loss: 3.696768E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.602 | TFLOPs: 29.01 | +7: iteration 32170/ 37905 | consumed samples: 8235520 | consumed tokens: 16866344960 | elapsed time per iteration (s): 0.22 | learning rate: 3.018E-05 | global batch size: 256 | lm loss: 3.697874E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.130 | TFLOPs: 29.35 | +7: iteration 32180/ 37905 | consumed samples: 8238080 | consumed tokens: 16871587840 | elapsed time per iteration (s): 0.22 | learning rate: 3.014E-05 | global batch size: 256 | lm loss: 3.682261E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.381 | TFLOPs: 29.43 | +7: iteration 32190/ 37905 | consumed samples: 8240640 | consumed tokens: 16876830720 | elapsed time per iteration (s): 0.22 | learning rate: 3.011E-05 | global batch size: 256 | lm loss: 3.687171E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.336 | TFLOPs: 29.46 | +7: iteration 32200/ 37905 | consumed samples: 8243200 | consumed tokens: 16882073600 | elapsed time per iteration (s): 0.22 | learning rate: 3.007E-05 | global batch size: 256 | lm loss: 3.695763E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.104 | TFLOPs: 29.48 | +7: iteration 32210/ 37905 | consumed samples: 8245760 | consumed tokens: 16887316480 | elapsed time per iteration (s): 0.22 | learning rate: 3.004E-05 | global batch size: 256 | lm loss: 3.700201E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.849 | TFLOPs: 29.47 | +7: iteration 32220/ 37905 | consumed samples: 8248320 | consumed tokens: 16892559360 | elapsed time per iteration (s): 0.22 | learning rate: 3.000E-05 | global batch size: 256 | lm loss: 3.694040E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.476 | TFLOPs: 29.46 | +7: iteration 32230/ 37905 | consumed samples: 8250880 | consumed tokens: 16897802240 | elapsed time per iteration (s): 0.22 | learning rate: 2.997E-05 | global batch size: 256 | lm loss: 3.688625E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.381 | TFLOPs: 29.46 | +7: iteration 32240/ 37905 | consumed samples: 8253440 | consumed tokens: 16903045120 | elapsed time per iteration (s): 0.22 | learning rate: 2.993E-05 | global batch size: 256 | lm loss: 3.700564E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.527 | TFLOPs: 29.46 | +7: iteration 32250/ 37905 | consumed samples: 8256000 | consumed tokens: 16908288000 | elapsed time per iteration (s): 0.22 | learning rate: 2.990E-05 | global batch size: 256 | lm loss: 3.697612E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.992 | TFLOPs: 29.45 | +7: iteration 32260/ 37905 | consumed samples: 8258560 | consumed tokens: 16913530880 | elapsed time per iteration (s): 0.22 | learning rate: 2.986E-05 | global batch size: 256 | lm loss: 3.695731E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.548 | TFLOPs: 29.46 | +7: iteration 32270/ 37905 | consumed samples: 8261120 | consumed tokens: 16918773760 | elapsed time per iteration (s): 0.22 | learning rate: 2.983E-05 | global batch size: 256 | lm loss: 3.693551E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.893 | TFLOPs: 29.45 | +7: iteration 32280/ 37905 | consumed samples: 8263680 | consumed tokens: 16924016640 | elapsed time per iteration (s): 0.24 | learning rate: 2.980E-05 | global batch size: 256 | lm loss: 3.685048E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1085.486 | TFLOPs: 27.65 | +7: iteration 32290/ 37905 | consumed samples: 8266240 | consumed tokens: 16929259520 | elapsed time per iteration (s): 0.22 | learning rate: 2.976E-05 | global batch size: 256 | lm loss: 3.690791E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.766 | TFLOPs: 29.16 | +7: iteration 32300/ 37905 | consumed samples: 8268800 | consumed tokens: 16934502400 | elapsed time per iteration (s): 0.22 | learning rate: 2.973E-05 | global batch size: 256 | lm loss: 3.701074E+00 | grad norm: 0.224 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.893 | TFLOPs: 29.47 | +7: iteration 32310/ 37905 | consumed samples: 8271360 | consumed tokens: 16939745280 | elapsed time per iteration (s): 0.22 | learning rate: 2.969E-05 | global batch size: 256 | lm loss: 3.702645E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.549 | TFLOPs: 29.46 | +7: iteration 32320/ 37905 | consumed samples: 8273920 | consumed tokens: 16944988160 | elapsed time per iteration (s): 0.22 | learning rate: 2.966E-05 | global batch size: 256 | lm loss: 3.695855E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1157.377 | TFLOPs: 29.48 | +7: iteration 32330/ 37905 | consumed samples: 8276480 | consumed tokens: 16950231040 | elapsed time per iteration (s): 0.22 | learning rate: 2.963E-05 | global batch size: 256 | lm loss: 3.691256E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.925 | TFLOPs: 29.45 | +7: iteration 32340/ 37905 | consumed samples: 8279040 | consumed tokens: 16955473920 | elapsed time per iteration (s): 0.22 | learning rate: 2.959E-05 | global batch size: 256 | lm loss: 3.699653E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.415 | TFLOPs: 29.43 | +7: iteration 32350/ 37905 | consumed samples: 8281600 | consumed tokens: 16960716800 | elapsed time per iteration (s): 0.22 | learning rate: 2.956E-05 | global batch size: 256 | lm loss: 3.692383E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.477 | TFLOPs: 29.44 | +7: iteration 32360/ 37905 | consumed samples: 8284160 | consumed tokens: 16965959680 | elapsed time per iteration (s): 0.22 | learning rate: 2.952E-05 | global batch size: 256 | lm loss: 3.692122E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.651 | TFLOPs: 29.44 | +7: iteration 32370/ 37905 | consumed samples: 8286720 | consumed tokens: 16971202560 | elapsed time per iteration (s): 0.22 | learning rate: 2.949E-05 | global batch size: 256 | lm loss: 3.696955E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.538 | TFLOPs: 29.44 | +7: iteration 32380/ 37905 | consumed samples: 8289280 | consumed tokens: 16976445440 | elapsed time per iteration (s): 0.22 | learning rate: 2.946E-05 | global batch size: 256 | lm loss: 3.691842E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.388 | TFLOPs: 29.43 | +7: iteration 32390/ 37905 | consumed samples: 8291840 | consumed tokens: 16981688320 | elapsed time per iteration (s): 0.22 | learning rate: 2.942E-05 | global batch size: 256 | lm loss: 3.691909E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.314 | TFLOPs: 29.43 | +7: iteration 32400/ 37905 | consumed samples: 8294400 | consumed tokens: 16986931200 | elapsed time per iteration (s): 0.22 | learning rate: 2.939E-05 | global batch size: 256 | lm loss: 3.700830E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.110 | TFLOPs: 29.43 | +7: iteration 32410/ 37905 | consumed samples: 8296960 | consumed tokens: 16992174080 | elapsed time per iteration (s): 0.22 | learning rate: 2.936E-05 | global batch size: 256 | lm loss: 3.695731E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.667 | TFLOPs: 29.44 | +7: iteration 32420/ 37905 | consumed samples: 8299520 | consumed tokens: 16997416960 | elapsed time per iteration (s): 0.22 | learning rate: 2.932E-05 | global batch size: 256 | lm loss: 3.705959E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.169 | TFLOPs: 29.43 | +7: iteration 32430/ 37905 | consumed samples: 8302080 | consumed tokens: 17002659840 | elapsed time per iteration (s): 0.22 | learning rate: 2.929E-05 | global batch size: 256 | lm loss: 3.689190E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.873 | TFLOPs: 29.42 | +7: iteration 32440/ 37905 | consumed samples: 8304640 | consumed tokens: 17007902720 | elapsed time per iteration (s): 0.22 | learning rate: 2.926E-05 | global batch size: 256 | lm loss: 3.698492E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.116 | TFLOPs: 29.43 | +7: iteration 32450/ 37905 | consumed samples: 8307200 | consumed tokens: 17013145600 | elapsed time per iteration (s): 0.22 | learning rate: 2.922E-05 | global batch size: 256 | lm loss: 3.717057E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.854 | TFLOPs: 29.45 | +7: iteration 32460/ 37905 | consumed samples: 8309760 | consumed tokens: 17018388480 | elapsed time per iteration (s): 0.22 | learning rate: 2.919E-05 | global batch size: 256 | lm loss: 3.692220E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.595 | TFLOPs: 29.44 | +7: iteration 32470/ 37905 | consumed samples: 8312320 | consumed tokens: 17023631360 | elapsed time per iteration (s): 0.22 | learning rate: 2.916E-05 | global batch size: 256 | lm loss: 3.697298E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.702 | TFLOPs: 29.44 | +7: iteration 32480/ 37905 | consumed samples: 8314880 | consumed tokens: 17028874240 | elapsed time per iteration (s): 0.22 | learning rate: 2.912E-05 | global batch size: 256 | lm loss: 3.700653E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.461 | TFLOPs: 29.36 | +7: iteration 32490/ 37905 | consumed samples: 8317440 | consumed tokens: 17034117120 | elapsed time per iteration (s): 0.22 | learning rate: 2.909E-05 | global batch size: 256 | lm loss: 3.695558E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.456 | TFLOPs: 29.33 | +7: iteration 32500/ 37905 | consumed samples: 8320000 | consumed tokens: 17039360000 | elapsed time per iteration (s): 0.22 | learning rate: 2.906E-05 | global batch size: 256 | lm loss: 3.704578E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.799 | TFLOPs: 29.34 | +7: iteration 32510/ 37905 | consumed samples: 8322560 | consumed tokens: 17044602880 | elapsed time per iteration (s): 0.22 | learning rate: 2.903E-05 | global batch size: 256 | lm loss: 3.687795E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.171 | TFLOPs: 29.35 | +7: iteration 32520/ 37905 | consumed samples: 8325120 | consumed tokens: 17049845760 | elapsed time per iteration (s): 0.22 | learning rate: 2.899E-05 | global batch size: 256 | lm loss: 3.691851E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.831 | TFLOPs: 29.37 | +7: iteration 32530/ 37905 | consumed samples: 8327680 | consumed tokens: 17055088640 | elapsed time per iteration (s): 0.22 | learning rate: 2.896E-05 | global batch size: 256 | lm loss: 3.701423E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.178 | TFLOPs: 29.38 | +7: iteration 32540/ 37905 | consumed samples: 8330240 | consumed tokens: 17060331520 | elapsed time per iteration (s): 0.22 | learning rate: 2.893E-05 | global batch size: 256 | lm loss: 3.704584E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.063 | TFLOPs: 29.37 | +7: iteration 32550/ 37905 | consumed samples: 8332800 | consumed tokens: 17065574400 | elapsed time per iteration (s): 0.22 | learning rate: 2.889E-05 | global batch size: 256 | lm loss: 3.695029E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.641 | TFLOPs: 29.36 | +7: iteration 32560/ 37905 | consumed samples: 8335360 | consumed tokens: 17070817280 | elapsed time per iteration (s): 0.22 | learning rate: 2.886E-05 | global batch size: 256 | lm loss: 3.710209E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.270 | TFLOPs: 29.38 | +7: iteration 32570/ 37905 | consumed samples: 8337920 | consumed tokens: 17076060160 | elapsed time per iteration (s): 0.22 | learning rate: 2.883E-05 | global batch size: 256 | lm loss: 3.693769E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.226 | TFLOPs: 29.38 | +7: iteration 32580/ 37905 | consumed samples: 8340480 | consumed tokens: 17081303040 | elapsed time per iteration (s): 0.22 | learning rate: 2.880E-05 | global batch size: 256 | lm loss: 3.699168E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.661 | TFLOPs: 29.31 | +7: iteration 32590/ 37905 | consumed samples: 8343040 | consumed tokens: 17086545920 | elapsed time per iteration (s): 0.22 | learning rate: 2.876E-05 | global batch size: 256 | lm loss: 3.689351E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.229 | TFLOPs: 29.28 | +7: iteration 32600/ 37905 | consumed samples: 8345600 | consumed tokens: 17091788800 | elapsed time per iteration (s): 0.22 | learning rate: 2.873E-05 | global batch size: 256 | lm loss: 3.693142E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.475 | TFLOPs: 29.26 | +7: iteration 32610/ 37905 | consumed samples: 8348160 | consumed tokens: 17097031680 | elapsed time per iteration (s): 0.22 | learning rate: 2.870E-05 | global batch size: 256 | lm loss: 3.694629E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.047 | TFLOPs: 29.27 | +7: iteration 32620/ 37905 | consumed samples: 8350720 | consumed tokens: 17102274560 | elapsed time per iteration (s): 0.23 | learning rate: 2.867E-05 | global batch size: 256 | lm loss: 3.704477E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.800 | TFLOPs: 28.50 | +7: iteration 32630/ 37905 | consumed samples: 8353280 | consumed tokens: 17107517440 | elapsed time per iteration (s): 0.22 | learning rate: 2.863E-05 | global batch size: 256 | lm loss: 3.685268E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.220 | TFLOPs: 29.35 | +7: iteration 32640/ 37905 | consumed samples: 8355840 | consumed tokens: 17112760320 | elapsed time per iteration (s): 0.22 | learning rate: 2.860E-05 | global batch size: 256 | lm loss: 3.704865E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.926 | TFLOPs: 29.32 | +7: iteration 32650/ 37905 | consumed samples: 8358400 | consumed tokens: 17118003200 | elapsed time per iteration (s): 0.22 | learning rate: 2.857E-05 | global batch size: 256 | lm loss: 3.696650E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.416 | TFLOPs: 29.33 | +7: iteration 32660/ 37905 | consumed samples: 8360960 | consumed tokens: 17123246080 | elapsed time per iteration (s): 0.22 | learning rate: 2.854E-05 | global batch size: 256 | lm loss: 3.680263E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.727 | TFLOPs: 29.31 | +7: iteration 32670/ 37905 | consumed samples: 8363520 | consumed tokens: 17128488960 | elapsed time per iteration (s): 0.22 | learning rate: 2.851E-05 | global batch size: 256 | lm loss: 3.706766E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.509 | TFLOPs: 29.33 | +7: iteration 32680/ 37905 | consumed samples: 8366080 | consumed tokens: 17133731840 | elapsed time per iteration (s): 0.22 | learning rate: 2.847E-05 | global batch size: 256 | lm loss: 3.701264E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.016 | TFLOPs: 29.32 | +7: iteration 32690/ 37905 | consumed samples: 8368640 | consumed tokens: 17138974720 | elapsed time per iteration (s): 0.22 | learning rate: 2.844E-05 | global batch size: 256 | lm loss: 3.694562E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.300 | TFLOPs: 29.36 | +7: iteration 32700/ 37905 | consumed samples: 8371200 | consumed tokens: 17144217600 | elapsed time per iteration (s): 0.22 | learning rate: 2.841E-05 | global batch size: 256 | lm loss: 3.681036E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.629 | TFLOPs: 29.34 | +7: iteration 32710/ 37905 | consumed samples: 8373760 | consumed tokens: 17149460480 | elapsed time per iteration (s): 0.22 | learning rate: 2.838E-05 | global batch size: 256 | lm loss: 3.706656E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.278 | TFLOPs: 29.33 | +7: iteration 32720/ 37905 | consumed samples: 8376320 | consumed tokens: 17154703360 | elapsed time per iteration (s): 0.22 | learning rate: 2.835E-05 | global batch size: 256 | lm loss: 3.690283E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.590 | TFLOPs: 29.34 | +7: iteration 32730/ 37905 | consumed samples: 8378880 | consumed tokens: 17159946240 | elapsed time per iteration (s): 0.22 | learning rate: 2.832E-05 | global batch size: 256 | lm loss: 3.693975E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.541 | TFLOPs: 29.11 | +7: iteration 32740/ 37905 | consumed samples: 8381440 | consumed tokens: 17165189120 | elapsed time per iteration (s): 0.22 | learning rate: 2.828E-05 | global batch size: 256 | lm loss: 3.701280E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.900 | TFLOPs: 29.32 | +7: iteration 32750/ 37905 | consumed samples: 8384000 | consumed tokens: 17170432000 | elapsed time per iteration (s): 0.23 | learning rate: 2.825E-05 | global batch size: 256 | lm loss: 3.699189E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.992 | TFLOPs: 28.94 | +7: iteration 32760/ 37905 | consumed samples: 8386560 | consumed tokens: 17175674880 | elapsed time per iteration (s): 0.22 | learning rate: 2.822E-05 | global batch size: 256 | lm loss: 3.701175E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.308 | TFLOPs: 29.33 | +7: iteration 32770/ 37905 | consumed samples: 8389120 | consumed tokens: 17180917760 | elapsed time per iteration (s): 0.22 | learning rate: 2.819E-05 | global batch size: 256 | lm loss: 3.695749E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.260 | TFLOPs: 29.30 | +7: iteration 32780/ 37905 | consumed samples: 8391680 | consumed tokens: 17186160640 | elapsed time per iteration (s): 0.22 | learning rate: 2.816E-05 | global batch size: 256 | lm loss: 3.700647E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.730 | TFLOPs: 29.29 | +7: iteration 32790/ 37905 | consumed samples: 8394240 | consumed tokens: 17191403520 | elapsed time per iteration (s): 0.22 | learning rate: 2.813E-05 | global batch size: 256 | lm loss: 3.696919E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.872 | TFLOPs: 29.29 | +7: iteration 32800/ 37905 | consumed samples: 8396800 | consumed tokens: 17196646400 | elapsed time per iteration (s): 0.22 | learning rate: 2.810E-05 | global batch size: 256 | lm loss: 3.706572E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.074 | TFLOPs: 29.32 | +7: iteration 32810/ 37905 | consumed samples: 8399360 | consumed tokens: 17201889280 | elapsed time per iteration (s): 0.22 | learning rate: 2.806E-05 | global batch size: 256 | lm loss: 3.685760E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.966 | TFLOPs: 29.30 | +7: iteration 32820/ 37905 | consumed samples: 8401920 | consumed tokens: 17207132160 | elapsed time per iteration (s): 0.22 | learning rate: 2.803E-05 | global batch size: 256 | lm loss: 3.674140E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.877 | TFLOPs: 29.32 | +7: iteration 32830/ 37905 | consumed samples: 8404480 | consumed tokens: 17212375040 | elapsed time per iteration (s): 0.22 | learning rate: 2.800E-05 | global batch size: 256 | lm loss: 3.695435E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.829 | TFLOPs: 29.29 | +7: iteration 32840/ 37905 | consumed samples: 8407040 | consumed tokens: 17217617920 | elapsed time per iteration (s): 0.22 | learning rate: 2.797E-05 | global batch size: 256 | lm loss: 3.699611E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.198 | TFLOPs: 29.30 | +7: iteration 32850/ 37905 | consumed samples: 8409600 | consumed tokens: 17222860800 | elapsed time per iteration (s): 0.22 | learning rate: 2.794E-05 | global batch size: 256 | lm loss: 3.694158E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.824 | TFLOPs: 29.32 | +7: iteration 32860/ 37905 | consumed samples: 8412160 | consumed tokens: 17228103680 | elapsed time per iteration (s): 0.22 | learning rate: 2.791E-05 | global batch size: 256 | lm loss: 3.706344E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.781 | TFLOPs: 29.32 | +7: iteration 32870/ 37905 | consumed samples: 8414720 | consumed tokens: 17233346560 | elapsed time per iteration (s): 0.22 | learning rate: 2.788E-05 | global batch size: 256 | lm loss: 3.699031E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.653 | TFLOPs: 29.31 | +7: iteration 32880/ 37905 | consumed samples: 8417280 | consumed tokens: 17238589440 | elapsed time per iteration (s): 0.22 | learning rate: 2.785E-05 | global batch size: 256 | lm loss: 3.690967E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.362 | TFLOPs: 29.28 | +7: iteration 32890/ 37905 | consumed samples: 8419840 | consumed tokens: 17243832320 | elapsed time per iteration (s): 0.23 | learning rate: 2.782E-05 | global batch size: 256 | lm loss: 3.693354E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.557 | TFLOPs: 28.83 | +7: iteration 32900/ 37905 | consumed samples: 8422400 | consumed tokens: 17249075200 | elapsed time per iteration (s): 0.22 | learning rate: 2.779E-05 | global batch size: 256 | lm loss: 3.695922E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.519 | TFLOPs: 29.34 | +7: iteration 32910/ 37905 | consumed samples: 8424960 | consumed tokens: 17254318080 | elapsed time per iteration (s): 0.22 | learning rate: 2.776E-05 | global batch size: 256 | lm loss: 3.688664E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.702 | TFLOPs: 29.37 | +7: iteration 32920/ 37905 | consumed samples: 8427520 | consumed tokens: 17259560960 | elapsed time per iteration (s): 0.22 | learning rate: 2.772E-05 | global batch size: 256 | lm loss: 3.700565E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.851 | TFLOPs: 29.32 | +7: iteration 32930/ 37905 | consumed samples: 8430080 | consumed tokens: 17264803840 | elapsed time per iteration (s): 0.22 | learning rate: 2.769E-05 | global batch size: 256 | lm loss: 3.678497E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.980 | TFLOPs: 29.30 | +7: iteration 32940/ 37905 | consumed samples: 8432640 | consumed tokens: 17270046720 | elapsed time per iteration (s): 0.22 | learning rate: 2.766E-05 | global batch size: 256 | lm loss: 3.701387E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.925 | TFLOPs: 29.27 | +7: iteration 32950/ 37905 | consumed samples: 8435200 | consumed tokens: 17275289600 | elapsed time per iteration (s): 0.22 | learning rate: 2.763E-05 | global batch size: 256 | lm loss: 3.697389E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.815 | TFLOPs: 29.37 | +7: iteration 32960/ 37905 | consumed samples: 8437760 | consumed tokens: 17280532480 | elapsed time per iteration (s): 0.22 | learning rate: 2.760E-05 | global batch size: 256 | lm loss: 3.692523E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.904 | TFLOPs: 29.32 | +7: iteration 32970/ 37905 | consumed samples: 8440320 | consumed tokens: 17285775360 | elapsed time per iteration (s): 0.22 | learning rate: 2.757E-05 | global batch size: 256 | lm loss: 3.682092E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.889 | TFLOPs: 29.37 | +7: iteration 32980/ 37905 | consumed samples: 8442880 | consumed tokens: 17291018240 | elapsed time per iteration (s): 0.22 | learning rate: 2.754E-05 | global batch size: 256 | lm loss: 3.698733E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.155 | TFLOPs: 29.43 | +7: iteration 32990/ 37905 | consumed samples: 8445440 | consumed tokens: 17296261120 | elapsed time per iteration (s): 0.22 | learning rate: 2.751E-05 | global batch size: 256 | lm loss: 3.688846E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.966 | TFLOPs: 29.40 | +7: iteration 33000/ 37905 | consumed samples: 8448000 | consumed tokens: 17301504000 | elapsed time per iteration (s): 0.22 | learning rate: 2.748E-05 | global batch size: 256 | lm loss: 3.682835E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.573 | TFLOPs: 29.41 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 33000 | lm loss value: 3.656678E+00 | lm loss PPL: 3.873247E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 33000 to checkpoints_83m20b400m +0: [2023-03-16 00:00:25,303] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step33000 is begin to save! +0: [2023-03-16 00:00:25,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:00:25,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:00:25,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:00:25,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:00:25,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:00:25,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:00:25,407] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:00:25,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:00:25,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:00:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:00:25,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:00:25,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:00:25,440] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:00:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:00:25,452] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:00:25,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:00:25,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:00:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:00:25,474] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:00:25,485] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:00:25,485] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:00:25,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:00:25,496] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:00:25,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:00:25,497] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step33000/mp_rank_00_model_states.pt +0: [2023-03-16 00:00:25,497] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/mp_rank_00_model_states.pt... +0: [2023-03-16 00:00:25,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/mp_rank_00_model_states.pt. +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:00:25,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:00:25,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,538] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,538] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,544] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +5: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +5: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +0: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:00:25,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +4: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +2: [2023-03-16 00:00:25,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +2: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +1: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +6: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +7: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +3: [2023-03-16 00:00:25,547] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:00:25,547] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step33000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 00:00:25,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step33000 is ready now! +0: successfully saved checkpoint at iteration 33000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 247.45 +7: iteration 33010/ 37905 | consumed samples: 8450560 | consumed tokens: 17306746880 | elapsed time per iteration (s): 0.25 | learning rate: 2.745E-05 | global batch size: 256 | lm loss: 3.699950E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1013.558 | TFLOPs: 25.82 | +7: iteration 33020/ 37905 | consumed samples: 8453120 | consumed tokens: 17311989760 | elapsed time per iteration (s): 0.22 | learning rate: 2.742E-05 | global batch size: 256 | lm loss: 3.697021E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.332 | TFLOPs: 29.41 | +7: iteration 33030/ 37905 | consumed samples: 8455680 | consumed tokens: 17317232640 | elapsed time per iteration (s): 0.22 | learning rate: 2.739E-05 | global batch size: 256 | lm loss: 3.706533E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.668 | TFLOPs: 29.39 | +7: iteration 33040/ 37905 | consumed samples: 8458240 | consumed tokens: 17322475520 | elapsed time per iteration (s): 0.22 | learning rate: 2.736E-05 | global batch size: 256 | lm loss: 3.693428E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.676 | TFLOPs: 29.39 | +7: iteration 33050/ 37905 | consumed samples: 8460800 | consumed tokens: 17327718400 | elapsed time per iteration (s): 0.22 | learning rate: 2.733E-05 | global batch size: 256 | lm loss: 3.686506E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.000 | TFLOPs: 29.40 | +7: iteration 33060/ 37905 | consumed samples: 8463360 | consumed tokens: 17332961280 | elapsed time per iteration (s): 0.22 | learning rate: 2.730E-05 | global batch size: 256 | lm loss: 3.694957E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.406 | TFLOPs: 29.41 | +7: iteration 33070/ 37905 | consumed samples: 8465920 | consumed tokens: 17338204160 | elapsed time per iteration (s): 0.22 | learning rate: 2.727E-05 | global batch size: 256 | lm loss: 3.697033E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.593 | TFLOPs: 29.39 | +7: iteration 33080/ 37905 | consumed samples: 8468480 | consumed tokens: 17343447040 | elapsed time per iteration (s): 0.22 | learning rate: 2.724E-05 | global batch size: 256 | lm loss: 3.691320E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.135 | TFLOPs: 29.40 | +7: iteration 33090/ 37905 | consumed samples: 8471040 | consumed tokens: 17348689920 | elapsed time per iteration (s): 0.22 | learning rate: 2.721E-05 | global batch size: 256 | lm loss: 3.683343E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.843 | TFLOPs: 29.39 | +7: iteration 33100/ 37905 | consumed samples: 8473600 | consumed tokens: 17353932800 | elapsed time per iteration (s): 0.22 | learning rate: 2.718E-05 | global batch size: 256 | lm loss: 3.703255E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.917 | TFLOPs: 29.42 | +7: iteration 33110/ 37905 | consumed samples: 8476160 | consumed tokens: 17359175680 | elapsed time per iteration (s): 0.22 | learning rate: 2.715E-05 | global batch size: 256 | lm loss: 3.701150E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.899 | TFLOPs: 29.45 | +7: iteration 33120/ 37905 | consumed samples: 8478720 | consumed tokens: 17364418560 | elapsed time per iteration (s): 0.22 | learning rate: 2.713E-05 | global batch size: 256 | lm loss: 3.694456E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.987 | TFLOPs: 29.42 | +7: iteration 33130/ 37905 | consumed samples: 8481280 | consumed tokens: 17369661440 | elapsed time per iteration (s): 0.22 | learning rate: 2.710E-05 | global batch size: 256 | lm loss: 3.693353E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.797 | TFLOPs: 28.99 | +7: iteration 33140/ 37905 | consumed samples: 8483840 | consumed tokens: 17374904320 | elapsed time per iteration (s): 0.22 | learning rate: 2.707E-05 | global batch size: 256 | lm loss: 3.692257E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.015 | TFLOPs: 29.07 | +7: iteration 33150/ 37905 | consumed samples: 8486400 | consumed tokens: 17380147200 | elapsed time per iteration (s): 0.23 | learning rate: 2.704E-05 | global batch size: 256 | lm loss: 3.692342E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.001 | TFLOPs: 28.79 | +7: iteration 33160/ 37905 | consumed samples: 8488960 | consumed tokens: 17385390080 | elapsed time per iteration (s): 0.23 | learning rate: 2.701E-05 | global batch size: 256 | lm loss: 3.691132E+00 | grad norm: 0.210 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.390 | TFLOPs: 28.21 | +7: iteration 33170/ 37905 | consumed samples: 8491520 | consumed tokens: 17390632960 | elapsed time per iteration (s): 0.22 | learning rate: 2.698E-05 | global batch size: 256 | lm loss: 3.695481E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.748 | TFLOPs: 29.39 | +7: iteration 33180/ 37905 | consumed samples: 8494080 | consumed tokens: 17395875840 | elapsed time per iteration (s): 0.22 | learning rate: 2.695E-05 | global batch size: 256 | lm loss: 3.696207E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.423 | TFLOPs: 29.38 | +7: iteration 33190/ 37905 | consumed samples: 8496640 | consumed tokens: 17401118720 | elapsed time per iteration (s): 0.22 | learning rate: 2.692E-05 | global batch size: 256 | lm loss: 3.689244E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.654 | TFLOPs: 29.39 | +7: iteration 33200/ 37905 | consumed samples: 8499200 | consumed tokens: 17406361600 | elapsed time per iteration (s): 0.22 | learning rate: 2.689E-05 | global batch size: 256 | lm loss: 3.714524E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.053 | TFLOPs: 29.40 | +7: iteration 33210/ 37905 | consumed samples: 8501760 | consumed tokens: 17411604480 | elapsed time per iteration (s): 0.22 | learning rate: 2.686E-05 | global batch size: 256 | lm loss: 3.696051E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.863 | TFLOPs: 29.34 | +7: iteration 33220/ 37905 | consumed samples: 8504320 | consumed tokens: 17416847360 | elapsed time per iteration (s): 0.22 | learning rate: 2.683E-05 | global batch size: 256 | lm loss: 3.693182E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.542 | TFLOPs: 29.34 | +7: iteration 33230/ 37905 | consumed samples: 8506880 | consumed tokens: 17422090240 | elapsed time per iteration (s): 0.22 | learning rate: 2.681E-05 | global batch size: 256 | lm loss: 3.692503E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.337 | TFLOPs: 29.36 | +7: iteration 33240/ 37905 | consumed samples: 8509440 | consumed tokens: 17427333120 | elapsed time per iteration (s): 0.22 | learning rate: 2.678E-05 | global batch size: 256 | lm loss: 3.681757E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.891 | TFLOPs: 29.34 | +7: iteration 33250/ 37905 | consumed samples: 8512000 | consumed tokens: 17432576000 | elapsed time per iteration (s): 0.22 | learning rate: 2.675E-05 | global batch size: 256 | lm loss: 3.696102E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.122 | TFLOPs: 29.35 | +7: iteration 33260/ 37905 | consumed samples: 8514560 | consumed tokens: 17437818880 | elapsed time per iteration (s): 0.22 | learning rate: 2.672E-05 | global batch size: 256 | lm loss: 3.681561E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.834 | TFLOPs: 29.39 | +7: iteration 33270/ 37905 | consumed samples: 8517120 | consumed tokens: 17443061760 | elapsed time per iteration (s): 0.22 | learning rate: 2.669E-05 | global batch size: 256 | lm loss: 3.692688E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.019 | TFLOPs: 29.37 | +7: iteration 33280/ 37905 | consumed samples: 8519680 | consumed tokens: 17448304640 | elapsed time per iteration (s): 0.22 | learning rate: 2.666E-05 | global batch size: 256 | lm loss: 3.681778E+00 | grad norm: 0.226 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.868 | TFLOPs: 29.39 | +7: iteration 33290/ 37905 | consumed samples: 8522240 | consumed tokens: 17453547520 | elapsed time per iteration (s): 0.22 | learning rate: 2.663E-05 | global batch size: 256 | lm loss: 3.704038E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.004 | TFLOPs: 29.40 | +7: iteration 33300/ 37905 | consumed samples: 8524800 | consumed tokens: 17458790400 | elapsed time per iteration (s): 0.22 | learning rate: 2.661E-05 | global batch size: 256 | lm loss: 3.676626E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.049 | TFLOPs: 29.43 | +7: iteration 33310/ 37905 | consumed samples: 8527360 | consumed tokens: 17464033280 | elapsed time per iteration (s): 0.22 | learning rate: 2.658E-05 | global batch size: 256 | lm loss: 3.678481E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.652 | TFLOPs: 29.39 | +7: iteration 33320/ 37905 | consumed samples: 8529920 | consumed tokens: 17469276160 | elapsed time per iteration (s): 0.22 | learning rate: 2.655E-05 | global batch size: 256 | lm loss: 3.686111E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.975 | TFLOPs: 29.40 | +7: iteration 33330/ 37905 | consumed samples: 8532480 | consumed tokens: 17474519040 | elapsed time per iteration (s): 0.22 | learning rate: 2.652E-05 | global batch size: 256 | lm loss: 3.695589E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.467 | TFLOPs: 29.41 | +7: iteration 33340/ 37905 | consumed samples: 8535040 | consumed tokens: 17479761920 | elapsed time per iteration (s): 0.22 | learning rate: 2.649E-05 | global batch size: 256 | lm loss: 3.689296E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.525 | TFLOPs: 29.41 | +7: iteration 33350/ 37905 | consumed samples: 8537600 | consumed tokens: 17485004800 | elapsed time per iteration (s): 0.22 | learning rate: 2.647E-05 | global batch size: 256 | lm loss: 3.696545E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.671 | TFLOPs: 29.42 | +7: iteration 33360/ 37905 | consumed samples: 8540160 | consumed tokens: 17490247680 | elapsed time per iteration (s): 0.22 | learning rate: 2.644E-05 | global batch size: 256 | lm loss: 3.699034E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.996 | TFLOPs: 29.37 | +7: iteration 33370/ 37905 | consumed samples: 8542720 | consumed tokens: 17495490560 | elapsed time per iteration (s): 0.22 | learning rate: 2.641E-05 | global batch size: 256 | lm loss: 3.674549E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.616 | TFLOPs: 29.36 | +7: iteration 33380/ 37905 | consumed samples: 8545280 | consumed tokens: 17500733440 | elapsed time per iteration (s): 0.22 | learning rate: 2.638E-05 | global batch size: 256 | lm loss: 3.692997E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.507 | TFLOPs: 29.36 | +7: iteration 33390/ 37905 | consumed samples: 8547840 | consumed tokens: 17505976320 | elapsed time per iteration (s): 0.22 | learning rate: 2.635E-05 | global batch size: 256 | lm loss: 3.695602E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.134 | TFLOPs: 29.40 | +7: iteration 33400/ 37905 | consumed samples: 8550400 | consumed tokens: 17511219200 | elapsed time per iteration (s): 0.22 | learning rate: 2.633E-05 | global batch size: 256 | lm loss: 3.703420E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.654 | TFLOPs: 29.39 | +7: iteration 33410/ 37905 | consumed samples: 8552960 | consumed tokens: 17516462080 | elapsed time per iteration (s): 0.22 | learning rate: 2.630E-05 | global batch size: 256 | lm loss: 3.684258E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.430 | TFLOPs: 29.41 | +7: iteration 33420/ 37905 | consumed samples: 8555520 | consumed tokens: 17521704960 | elapsed time per iteration (s): 0.22 | learning rate: 2.627E-05 | global batch size: 256 | lm loss: 3.689779E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.685 | TFLOPs: 29.39 | +7: iteration 33430/ 37905 | consumed samples: 8558080 | consumed tokens: 17526947840 | elapsed time per iteration (s): 0.22 | learning rate: 2.624E-05 | global batch size: 256 | lm loss: 3.714130E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.951 | TFLOPs: 29.37 | +7: iteration 33440/ 37905 | consumed samples: 8560640 | consumed tokens: 17532190720 | elapsed time per iteration (s): 0.22 | learning rate: 2.622E-05 | global batch size: 256 | lm loss: 3.690043E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.292 | TFLOPs: 29.41 | +7: iteration 33450/ 37905 | consumed samples: 8563200 | consumed tokens: 17537433600 | elapsed time per iteration (s): 0.22 | learning rate: 2.619E-05 | global batch size: 256 | lm loss: 3.697404E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.702 | TFLOPs: 29.39 | +7: iteration 33460/ 37905 | consumed samples: 8565760 | consumed tokens: 17542676480 | elapsed time per iteration (s): 0.22 | learning rate: 2.616E-05 | global batch size: 256 | lm loss: 3.703596E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.151 | TFLOPs: 29.43 | +7: iteration 33470/ 37905 | consumed samples: 8568320 | consumed tokens: 17547919360 | elapsed time per iteration (s): 0.22 | learning rate: 2.613E-05 | global batch size: 256 | lm loss: 3.691007E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.454 | TFLOPs: 29.41 | +7: iteration 33480/ 37905 | consumed samples: 8570880 | consumed tokens: 17553162240 | elapsed time per iteration (s): 0.22 | learning rate: 2.611E-05 | global batch size: 256 | lm loss: 3.705519E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.300 | TFLOPs: 29.41 | +7: iteration 33490/ 37905 | consumed samples: 8573440 | consumed tokens: 17558405120 | elapsed time per iteration (s): 0.22 | learning rate: 2.608E-05 | global batch size: 256 | lm loss: 3.689194E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.700 | TFLOPs: 29.39 | +7: iteration 33500/ 37905 | consumed samples: 8576000 | consumed tokens: 17563648000 | elapsed time per iteration (s): 0.22 | learning rate: 2.605E-05 | global batch size: 256 | lm loss: 3.702551E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.572 | TFLOPs: 29.41 | +7: iteration 33510/ 37905 | consumed samples: 8578560 | consumed tokens: 17568890880 | elapsed time per iteration (s): 0.22 | learning rate: 2.602E-05 | global batch size: 256 | lm loss: 3.693228E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.253 | TFLOPs: 29.40 | +7: iteration 33520/ 37905 | consumed samples: 8581120 | consumed tokens: 17574133760 | elapsed time per iteration (s): 0.22 | learning rate: 2.600E-05 | global batch size: 256 | lm loss: 3.693225E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.023 | TFLOPs: 29.40 | +7: iteration 33530/ 37905 | consumed samples: 8583680 | consumed tokens: 17579376640 | elapsed time per iteration (s): 0.22 | learning rate: 2.597E-05 | global batch size: 256 | lm loss: 3.695190E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.277 | TFLOPs: 29.41 | +7: iteration 33540/ 37905 | consumed samples: 8586240 | consumed tokens: 17584619520 | elapsed time per iteration (s): 0.22 | learning rate: 2.594E-05 | global batch size: 256 | lm loss: 3.688713E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.266 | TFLOPs: 29.41 | +7: iteration 33550/ 37905 | consumed samples: 8588800 | consumed tokens: 17589862400 | elapsed time per iteration (s): 0.22 | learning rate: 2.592E-05 | global batch size: 256 | lm loss: 3.677772E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.743 | TFLOPs: 29.39 | +7: iteration 33560/ 37905 | consumed samples: 8591360 | consumed tokens: 17595105280 | elapsed time per iteration (s): 0.22 | learning rate: 2.589E-05 | global batch size: 256 | lm loss: 3.704854E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.714 | TFLOPs: 29.39 | +7: iteration 33570/ 37905 | consumed samples: 8593920 | consumed tokens: 17600348160 | elapsed time per iteration (s): 0.22 | learning rate: 2.586E-05 | global batch size: 256 | lm loss: 3.685743E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.793 | TFLOPs: 29.39 | +7: iteration 33580/ 37905 | consumed samples: 8596480 | consumed tokens: 17605591040 | elapsed time per iteration (s): 0.22 | learning rate: 2.584E-05 | global batch size: 256 | lm loss: 3.709675E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.058 | TFLOPs: 29.43 | +7: iteration 33590/ 37905 | consumed samples: 8599040 | consumed tokens: 17610833920 | elapsed time per iteration (s): 0.22 | learning rate: 2.581E-05 | global batch size: 256 | lm loss: 3.693530E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.266 | TFLOPs: 29.10 | +7: iteration 33600/ 37905 | consumed samples: 8601600 | consumed tokens: 17616076800 | elapsed time per iteration (s): 0.22 | learning rate: 2.578E-05 | global batch size: 256 | lm loss: 3.691030E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.396 | TFLOPs: 29.43 | +7: iteration 33610/ 37905 | consumed samples: 8604160 | consumed tokens: 17621319680 | elapsed time per iteration (s): 0.22 | learning rate: 2.576E-05 | global batch size: 256 | lm loss: 3.689896E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.712 | TFLOPs: 29.42 | +7: iteration 33620/ 37905 | consumed samples: 8606720 | consumed tokens: 17626562560 | elapsed time per iteration (s): 0.22 | learning rate: 2.573E-05 | global batch size: 256 | lm loss: 3.696822E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.166 | TFLOPs: 29.38 | +7: iteration 33630/ 37905 | consumed samples: 8609280 | consumed tokens: 17631805440 | elapsed time per iteration (s): 0.22 | learning rate: 2.570E-05 | global batch size: 256 | lm loss: 3.692514E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.334 | TFLOPs: 29.41 | +7: iteration 33640/ 37905 | consumed samples: 8611840 | consumed tokens: 17637048320 | elapsed time per iteration (s): 0.22 | learning rate: 2.568E-05 | global batch size: 256 | lm loss: 3.689939E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.644 | TFLOPs: 29.19 | +7: iteration 33650/ 37905 | consumed samples: 8614400 | consumed tokens: 17642291200 | elapsed time per iteration (s): 0.22 | learning rate: 2.565E-05 | global batch size: 256 | lm loss: 3.692081E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.019 | TFLOPs: 29.42 | +7: iteration 33660/ 37905 | consumed samples: 8616960 | consumed tokens: 17647534080 | elapsed time per iteration (s): 0.22 | learning rate: 2.562E-05 | global batch size: 256 | lm loss: 3.706086E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.134 | TFLOPs: 29.17 | +7: iteration 33670/ 37905 | consumed samples: 8619520 | consumed tokens: 17652776960 | elapsed time per iteration (s): 0.22 | learning rate: 2.560E-05 | global batch size: 256 | lm loss: 3.695833E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.835 | TFLOPs: 29.45 | +7: iteration 33680/ 37905 | consumed samples: 8622080 | consumed tokens: 17658019840 | elapsed time per iteration (s): 0.22 | learning rate: 2.557E-05 | global batch size: 256 | lm loss: 3.686963E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.686 | TFLOPs: 29.06 | +7: iteration 33690/ 37905 | consumed samples: 8624640 | consumed tokens: 17663262720 | elapsed time per iteration (s): 0.22 | learning rate: 2.555E-05 | global batch size: 256 | lm loss: 3.692023E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.971 | TFLOPs: 29.40 | +7: iteration 33700/ 37905 | consumed samples: 8627200 | consumed tokens: 17668505600 | elapsed time per iteration (s): 0.22 | learning rate: 2.552E-05 | global batch size: 256 | lm loss: 3.694827E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.579 | TFLOPs: 29.36 | +7: iteration 33710/ 37905 | consumed samples: 8629760 | consumed tokens: 17673748480 | elapsed time per iteration (s): 0.22 | learning rate: 2.549E-05 | global batch size: 256 | lm loss: 3.687627E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.573 | TFLOPs: 29.44 | +7: iteration 33720/ 37905 | consumed samples: 8632320 | consumed tokens: 17678991360 | elapsed time per iteration (s): 0.23 | learning rate: 2.547E-05 | global batch size: 256 | lm loss: 3.682090E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.722 | TFLOPs: 28.78 | +7: iteration 33730/ 37905 | consumed samples: 8634880 | consumed tokens: 17684234240 | elapsed time per iteration (s): 0.22 | learning rate: 2.544E-05 | global batch size: 256 | lm loss: 3.693301E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.989 | TFLOPs: 29.17 | +7: iteration 33740/ 37905 | consumed samples: 8637440 | consumed tokens: 17689477120 | elapsed time per iteration (s): 0.22 | learning rate: 2.542E-05 | global batch size: 256 | lm loss: 3.687469E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.152 | TFLOPs: 29.25 | +7: iteration 33750/ 37905 | consumed samples: 8640000 | consumed tokens: 17694720000 | elapsed time per iteration (s): 0.22 | learning rate: 2.539E-05 | global batch size: 256 | lm loss: 3.692551E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1156.002 | TFLOPs: 29.45 | +7: iteration 33760/ 37905 | consumed samples: 8642560 | consumed tokens: 17699962880 | elapsed time per iteration (s): 0.22 | learning rate: 2.536E-05 | global batch size: 256 | lm loss: 3.684730E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.371 | TFLOPs: 29.08 | +7: iteration 33770/ 37905 | consumed samples: 8645120 | consumed tokens: 17705205760 | elapsed time per iteration (s): 0.22 | learning rate: 2.534E-05 | global batch size: 256 | lm loss: 3.685197E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.909 | TFLOPs: 29.45 | +7: iteration 33780/ 37905 | consumed samples: 8647680 | consumed tokens: 17710448640 | elapsed time per iteration (s): 0.22 | learning rate: 2.531E-05 | global batch size: 256 | lm loss: 3.679052E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.270 | TFLOPs: 29.43 | +7: iteration 33790/ 37905 | consumed samples: 8650240 | consumed tokens: 17715691520 | elapsed time per iteration (s): 0.22 | learning rate: 2.529E-05 | global batch size: 256 | lm loss: 3.697220E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.891 | TFLOPs: 29.42 | +7: iteration 33800/ 37905 | consumed samples: 8652800 | consumed tokens: 17720934400 | elapsed time per iteration (s): 0.22 | learning rate: 2.526E-05 | global batch size: 256 | lm loss: 3.701508E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.587 | TFLOPs: 29.39 | +7: iteration 33810/ 37905 | consumed samples: 8655360 | consumed tokens: 17726177280 | elapsed time per iteration (s): 0.22 | learning rate: 2.524E-05 | global batch size: 256 | lm loss: 3.702013E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.957 | TFLOPs: 29.42 | +7: iteration 33820/ 37905 | consumed samples: 8657920 | consumed tokens: 17731420160 | elapsed time per iteration (s): 0.22 | learning rate: 2.521E-05 | global batch size: 256 | lm loss: 3.676942E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.158 | TFLOPs: 29.15 | +7: iteration 33830/ 37905 | consumed samples: 8660480 | consumed tokens: 17736663040 | elapsed time per iteration (s): 0.22 | learning rate: 2.519E-05 | global batch size: 256 | lm loss: 3.688151E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.045 | TFLOPs: 29.09 | +7: iteration 33840/ 37905 | consumed samples: 8663040 | consumed tokens: 17741905920 | elapsed time per iteration (s): 0.22 | learning rate: 2.516E-05 | global batch size: 256 | lm loss: 3.695949E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.054 | TFLOPs: 29.14 | +7: iteration 33850/ 37905 | consumed samples: 8665600 | consumed tokens: 17747148800 | elapsed time per iteration (s): 0.22 | learning rate: 2.514E-05 | global batch size: 256 | lm loss: 3.707488E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.758 | TFLOPs: 29.04 | +7: iteration 33860/ 37905 | consumed samples: 8668160 | consumed tokens: 17752391680 | elapsed time per iteration (s): 0.22 | learning rate: 2.511E-05 | global batch size: 256 | lm loss: 3.696730E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.360 | TFLOPs: 29.43 | +7: iteration 33870/ 37905 | consumed samples: 8670720 | consumed tokens: 17757634560 | elapsed time per iteration (s): 0.22 | learning rate: 2.509E-05 | global batch size: 256 | lm loss: 3.689014E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.699 | TFLOPs: 29.42 | +7: iteration 33880/ 37905 | consumed samples: 8673280 | consumed tokens: 17762877440 | elapsed time per iteration (s): 0.22 | learning rate: 2.506E-05 | global batch size: 256 | lm loss: 3.701865E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.571 | TFLOPs: 29.41 | +7: iteration 33890/ 37905 | consumed samples: 8675840 | consumed tokens: 17768120320 | elapsed time per iteration (s): 0.22 | learning rate: 2.504E-05 | global batch size: 256 | lm loss: 3.690504E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.985 | TFLOPs: 28.99 | +7: iteration 33900/ 37905 | consumed samples: 8678400 | consumed tokens: 17773363200 | elapsed time per iteration (s): 0.22 | learning rate: 2.501E-05 | global batch size: 256 | lm loss: 3.693903E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.518 | TFLOPs: 29.36 | +7: iteration 33910/ 37905 | consumed samples: 8680960 | consumed tokens: 17778606080 | elapsed time per iteration (s): 0.22 | learning rate: 2.499E-05 | global batch size: 256 | lm loss: 3.685373E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.013 | TFLOPs: 29.14 | +7: iteration 33920/ 37905 | consumed samples: 8683520 | consumed tokens: 17783848960 | elapsed time per iteration (s): 0.23 | learning rate: 2.496E-05 | global batch size: 256 | lm loss: 3.679218E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.654 | TFLOPs: 28.65 | +7: iteration 33930/ 37905 | consumed samples: 8686080 | consumed tokens: 17789091840 | elapsed time per iteration (s): 0.22 | learning rate: 2.494E-05 | global batch size: 256 | lm loss: 3.689735E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.559 | TFLOPs: 29.36 | +7: iteration 33940/ 37905 | consumed samples: 8688640 | consumed tokens: 17794334720 | elapsed time per iteration (s): 0.22 | learning rate: 2.491E-05 | global batch size: 256 | lm loss: 3.688732E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.242 | TFLOPs: 29.12 | +7: iteration 33950/ 37905 | consumed samples: 8691200 | consumed tokens: 17799577600 | elapsed time per iteration (s): 0.22 | learning rate: 2.489E-05 | global batch size: 256 | lm loss: 3.693647E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.992 | TFLOPs: 29.32 | +7: iteration 33960/ 37905 | consumed samples: 8693760 | consumed tokens: 17804820480 | elapsed time per iteration (s): 0.22 | learning rate: 2.486E-05 | global batch size: 256 | lm loss: 3.687837E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.633 | TFLOPs: 29.29 | +7: iteration 33970/ 37905 | consumed samples: 8696320 | consumed tokens: 17810063360 | elapsed time per iteration (s): 0.23 | learning rate: 2.484E-05 | global batch size: 256 | lm loss: 3.694329E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.837 | TFLOPs: 28.96 | +7: iteration 33980/ 37905 | consumed samples: 8698880 | consumed tokens: 17815306240 | elapsed time per iteration (s): 0.22 | learning rate: 2.482E-05 | global batch size: 256 | lm loss: 3.700517E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.378 | TFLOPs: 29.28 | +7: iteration 33990/ 37905 | consumed samples: 8701440 | consumed tokens: 17820549120 | elapsed time per iteration (s): 0.22 | learning rate: 2.479E-05 | global batch size: 256 | lm loss: 3.698514E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.218 | TFLOPs: 29.33 | +0: [2023-03-16 00:04:08,070] [INFO] [logging.py:68:log_dist] [Rank 0] step=34000, skipped=0, lr=[2.4766896881602934e-05, 2.4766896881602934e-05, 2.4766896881602934e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 34000/ 37905 | consumed samples: 8704000 | consumed tokens: 17825792000 | elapsed time per iteration (s): 0.23 | learning rate: 2.477E-05 | global batch size: 256 | lm loss: 3.687655E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.653 | TFLOPs: 28.96 | +0: steps: 34000 loss: 3.7365 iter time (s): 0.221 samples/sec: 1158.840 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 34000 | lm loss value: 3.678910E+00 | lm loss PPL: 3.960322E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 34000 to checkpoints_83m20b400m +0: [2023-03-16 00:04:08,159] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step34000 is begin to save! +0: [2023-03-16 00:04:08,163] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:04:08,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:04:08,233] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:04:08,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:04:08,245] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:04:08,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:04:08,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:04:08,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:04:08,268] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:04:08,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:04:08,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:04:08,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:04:08,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:04:08,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:04:08,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:04:08,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:04:08,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:04:08,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:04:08,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:04:08,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:04:08,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:04:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:04:08,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:04:08,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:04:08,347] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step34000/mp_rank_00_model_states.pt +0: [2023-03-16 00:04:08,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/mp_rank_00_model_states.pt... +0: [2023-03-16 00:04:08,349] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/mp_rank_00_model_states.pt. +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,366] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:04:08,365] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:04:08,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,384] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,384] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,384] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,385] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 00:04:08,385] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +0: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +6: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +2: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +1: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:04:08,394] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 00:04:08,394] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +3: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:04:08,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +5: [2023-03-16 00:04:08,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +7: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:04:08,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +4: [2023-03-16 00:04:08,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step34000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 00:04:08,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step34000 is ready now! +0: successfully saved checkpoint at iteration 34000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 239.95 +7: iteration 34010/ 37905 | consumed samples: 8706560 | consumed tokens: 17831034880 | elapsed time per iteration (s): 0.25 | learning rate: 2.474E-05 | global batch size: 256 | lm loss: 3.690086E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1008.164 | TFLOPs: 25.68 | +7: iteration 34020/ 37905 | consumed samples: 8709120 | consumed tokens: 17836277760 | elapsed time per iteration (s): 0.22 | learning rate: 2.472E-05 | global batch size: 256 | lm loss: 3.689415E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.611 | TFLOPs: 29.39 | +7: iteration 34030/ 37905 | consumed samples: 8711680 | consumed tokens: 17841520640 | elapsed time per iteration (s): 0.22 | learning rate: 2.469E-05 | global batch size: 256 | lm loss: 3.667216E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.086 | TFLOPs: 29.40 | +7: iteration 34040/ 37905 | consumed samples: 8714240 | consumed tokens: 17846763520 | elapsed time per iteration (s): 0.22 | learning rate: 2.467E-05 | global batch size: 256 | lm loss: 3.684420E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.329 | TFLOPs: 29.18 | +7: iteration 34050/ 37905 | consumed samples: 8716800 | consumed tokens: 17852006400 | elapsed time per iteration (s): 0.23 | learning rate: 2.465E-05 | global batch size: 256 | lm loss: 3.678035E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.247 | TFLOPs: 28.61 | +7: iteration 34060/ 37905 | consumed samples: 8719360 | consumed tokens: 17857249280 | elapsed time per iteration (s): 0.22 | learning rate: 2.462E-05 | global batch size: 256 | lm loss: 3.695703E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.070 | TFLOPs: 29.40 | +7: iteration 34070/ 37905 | consumed samples: 8721920 | consumed tokens: 17862492160 | elapsed time per iteration (s): 0.23 | learning rate: 2.460E-05 | global batch size: 256 | lm loss: 3.699991E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.652 | TFLOPs: 28.73 | +7: iteration 34080/ 37905 | consumed samples: 8724480 | consumed tokens: 17867735040 | elapsed time per iteration (s): 0.22 | learning rate: 2.458E-05 | global batch size: 256 | lm loss: 3.688258E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.883 | TFLOPs: 29.32 | +7: iteration 34090/ 37905 | consumed samples: 8727040 | consumed tokens: 17872977920 | elapsed time per iteration (s): 0.22 | learning rate: 2.455E-05 | global batch size: 256 | lm loss: 3.685788E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.080 | TFLOPs: 29.09 | +7: iteration 34100/ 37905 | consumed samples: 8729600 | consumed tokens: 17878220800 | elapsed time per iteration (s): 0.22 | learning rate: 2.453E-05 | global batch size: 256 | lm loss: 3.701087E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.974 | TFLOPs: 29.07 | +7: iteration 34110/ 37905 | consumed samples: 8732160 | consumed tokens: 17883463680 | elapsed time per iteration (s): 0.22 | learning rate: 2.450E-05 | global batch size: 256 | lm loss: 3.699676E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.737 | TFLOPs: 29.37 | +7: iteration 34120/ 37905 | consumed samples: 8734720 | consumed tokens: 17888706560 | elapsed time per iteration (s): 0.22 | learning rate: 2.448E-05 | global batch size: 256 | lm loss: 3.690605E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.741 | TFLOPs: 29.32 | +7: iteration 34130/ 37905 | consumed samples: 8737280 | consumed tokens: 17893949440 | elapsed time per iteration (s): 0.23 | learning rate: 2.446E-05 | global batch size: 256 | lm loss: 3.705491E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.107 | TFLOPs: 28.15 | +7: iteration 34140/ 37905 | consumed samples: 8739840 | consumed tokens: 17899192320 | elapsed time per iteration (s): 0.23 | learning rate: 2.443E-05 | global batch size: 256 | lm loss: 3.696072E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.155 | TFLOPs: 28.94 | +7: iteration 34150/ 37905 | consumed samples: 8742400 | consumed tokens: 17904435200 | elapsed time per iteration (s): 0.22 | learning rate: 2.441E-05 | global batch size: 256 | lm loss: 3.692393E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.328 | TFLOPs: 29.00 | +7: iteration 34160/ 37905 | consumed samples: 8744960 | consumed tokens: 17909678080 | elapsed time per iteration (s): 0.22 | learning rate: 2.439E-05 | global batch size: 256 | lm loss: 3.706141E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.054 | TFLOPs: 29.37 | +7: iteration 34170/ 37905 | consumed samples: 8747520 | consumed tokens: 17914920960 | elapsed time per iteration (s): 0.22 | learning rate: 2.436E-05 | global batch size: 256 | lm loss: 3.677526E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.840 | TFLOPs: 29.34 | +7: iteration 34180/ 37905 | consumed samples: 8750080 | consumed tokens: 17920163840 | elapsed time per iteration (s): 0.22 | learning rate: 2.434E-05 | global batch size: 256 | lm loss: 3.700246E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.942 | TFLOPs: 29.35 | +7: iteration 34190/ 37905 | consumed samples: 8752640 | consumed tokens: 17925406720 | elapsed time per iteration (s): 0.22 | learning rate: 2.432E-05 | global batch size: 256 | lm loss: 3.694202E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.351 | TFLOPs: 29.31 | +7: iteration 34200/ 37905 | consumed samples: 8755200 | consumed tokens: 17930649600 | elapsed time per iteration (s): 0.22 | learning rate: 2.429E-05 | global batch size: 256 | lm loss: 3.697715E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.703 | TFLOPs: 29.14 | +7: iteration 34210/ 37905 | consumed samples: 8757760 | consumed tokens: 17935892480 | elapsed time per iteration (s): 0.22 | learning rate: 2.427E-05 | global batch size: 256 | lm loss: 3.687015E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.244 | TFLOPs: 29.10 | +7: iteration 34220/ 37905 | consumed samples: 8760320 | consumed tokens: 17941135360 | elapsed time per iteration (s): 0.22 | learning rate: 2.425E-05 | global batch size: 256 | lm loss: 3.675846E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.315 | TFLOPs: 29.15 | +7: iteration 34230/ 37905 | consumed samples: 8762880 | consumed tokens: 17946378240 | elapsed time per iteration (s): 0.22 | learning rate: 2.423E-05 | global batch size: 256 | lm loss: 3.695304E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.945 | TFLOPs: 29.40 | +7: iteration 34240/ 37905 | consumed samples: 8765440 | consumed tokens: 17951621120 | elapsed time per iteration (s): 0.22 | learning rate: 2.420E-05 | global batch size: 256 | lm loss: 3.700657E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.873 | TFLOPs: 29.37 | +7: iteration 34250/ 37905 | consumed samples: 8768000 | consumed tokens: 17956864000 | elapsed time per iteration (s): 0.22 | learning rate: 2.418E-05 | global batch size: 256 | lm loss: 3.684452E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.700 | TFLOPs: 29.39 | +7: iteration 34260/ 37905 | consumed samples: 8770560 | consumed tokens: 17962106880 | elapsed time per iteration (s): 0.22 | learning rate: 2.416E-05 | global batch size: 256 | lm loss: 3.677788E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.450 | TFLOPs: 29.38 | +7: iteration 34270/ 37905 | consumed samples: 8773120 | consumed tokens: 17967349760 | elapsed time per iteration (s): 0.22 | learning rate: 2.414E-05 | global batch size: 256 | lm loss: 3.705857E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.719 | TFLOPs: 29.39 | +7: iteration 34280/ 37905 | consumed samples: 8775680 | consumed tokens: 17972592640 | elapsed time per iteration (s): 0.22 | learning rate: 2.411E-05 | global batch size: 256 | lm loss: 3.696019E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.996 | TFLOPs: 29.40 | +7: iteration 34290/ 37905 | consumed samples: 8778240 | consumed tokens: 17977835520 | elapsed time per iteration (s): 0.22 | learning rate: 2.409E-05 | global batch size: 256 | lm loss: 3.693826E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.140 | TFLOPs: 29.40 | +7: iteration 34300/ 37905 | consumed samples: 8780800 | consumed tokens: 17983078400 | elapsed time per iteration (s): 0.22 | learning rate: 2.407E-05 | global batch size: 256 | lm loss: 3.693256E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.078 | TFLOPs: 29.40 | +7: iteration 34310/ 37905 | consumed samples: 8783360 | consumed tokens: 17988321280 | elapsed time per iteration (s): 0.22 | learning rate: 2.405E-05 | global batch size: 256 | lm loss: 3.699093E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.471 | TFLOPs: 29.08 | +7: iteration 34320/ 37905 | consumed samples: 8785920 | consumed tokens: 17993564160 | elapsed time per iteration (s): 0.22 | learning rate: 2.402E-05 | global batch size: 256 | lm loss: 3.686073E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.186 | TFLOPs: 29.40 | +7: iteration 34330/ 37905 | consumed samples: 8788480 | consumed tokens: 17998807040 | elapsed time per iteration (s): 0.22 | learning rate: 2.400E-05 | global batch size: 256 | lm loss: 3.691797E+00 | grad norm: 0.205 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.789 | TFLOPs: 29.29 | +7: iteration 34340/ 37905 | consumed samples: 8791040 | consumed tokens: 18004049920 | elapsed time per iteration (s): 0.23 | learning rate: 2.398E-05 | global batch size: 256 | lm loss: 3.681509E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.958 | TFLOPs: 28.35 | +7: iteration 34350/ 37905 | consumed samples: 8793600 | consumed tokens: 18009292800 | elapsed time per iteration (s): 0.23 | learning rate: 2.396E-05 | global batch size: 256 | lm loss: 3.689178E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.496 | TFLOPs: 28.77 | +7: iteration 34360/ 37905 | consumed samples: 8796160 | consumed tokens: 18014535680 | elapsed time per iteration (s): 0.22 | learning rate: 2.393E-05 | global batch size: 256 | lm loss: 3.700823E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.799 | TFLOPs: 29.34 | +7: iteration 34370/ 37905 | consumed samples: 8798720 | consumed tokens: 18019778560 | elapsed time per iteration (s): 0.22 | learning rate: 2.391E-05 | global batch size: 256 | lm loss: 3.693076E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.093 | TFLOPs: 29.43 | +7: iteration 34380/ 37905 | consumed samples: 8801280 | consumed tokens: 18025021440 | elapsed time per iteration (s): 0.22 | learning rate: 2.389E-05 | global batch size: 256 | lm loss: 3.690617E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.381 | TFLOPs: 29.43 | +7: iteration 34390/ 37905 | consumed samples: 8803840 | consumed tokens: 18030264320 | elapsed time per iteration (s): 0.22 | learning rate: 2.387E-05 | global batch size: 256 | lm loss: 3.685058E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.073 | TFLOPs: 29.17 | +7: iteration 34400/ 37905 | consumed samples: 8806400 | consumed tokens: 18035507200 | elapsed time per iteration (s): 0.22 | learning rate: 2.385E-05 | global batch size: 256 | lm loss: 3.688116E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.730 | TFLOPs: 29.42 | +7: iteration 34410/ 37905 | consumed samples: 8808960 | consumed tokens: 18040750080 | elapsed time per iteration (s): 0.22 | learning rate: 2.383E-05 | global batch size: 256 | lm loss: 3.697115E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.527 | TFLOPs: 29.41 | +7: iteration 34420/ 37905 | consumed samples: 8811520 | consumed tokens: 18045992960 | elapsed time per iteration (s): 0.22 | learning rate: 2.380E-05 | global batch size: 256 | lm loss: 3.675107E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.770 | TFLOPs: 29.06 | +7: iteration 34430/ 37905 | consumed samples: 8814080 | consumed tokens: 18051235840 | elapsed time per iteration (s): 0.22 | learning rate: 2.378E-05 | global batch size: 256 | lm loss: 3.682296E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.737 | TFLOPs: 29.42 | +7: iteration 34440/ 37905 | consumed samples: 8816640 | consumed tokens: 18056478720 | elapsed time per iteration (s): 0.22 | learning rate: 2.376E-05 | global batch size: 256 | lm loss: 3.682717E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.280 | TFLOPs: 29.07 | +7: iteration 34450/ 37905 | consumed samples: 8819200 | consumed tokens: 18061721600 | elapsed time per iteration (s): 0.23 | learning rate: 2.374E-05 | global batch size: 256 | lm loss: 3.698966E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.335 | TFLOPs: 28.97 | +7: iteration 34460/ 37905 | consumed samples: 8821760 | consumed tokens: 18066964480 | elapsed time per iteration (s): 0.23 | learning rate: 2.372E-05 | global batch size: 256 | lm loss: 3.694997E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.781 | TFLOPs: 28.96 | +7: iteration 34470/ 37905 | consumed samples: 8824320 | consumed tokens: 18072207360 | elapsed time per iteration (s): 0.22 | learning rate: 2.370E-05 | global batch size: 256 | lm loss: 3.681150E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.363 | TFLOPs: 29.41 | +7: iteration 34480/ 37905 | consumed samples: 8826880 | consumed tokens: 18077450240 | elapsed time per iteration (s): 0.22 | learning rate: 2.367E-05 | global batch size: 256 | lm loss: 3.684242E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.432 | TFLOPs: 29.03 | +7: iteration 34490/ 37905 | consumed samples: 8829440 | consumed tokens: 18082693120 | elapsed time per iteration (s): 0.22 | learning rate: 2.365E-05 | global batch size: 256 | lm loss: 3.696175E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.290 | TFLOPs: 29.02 | +7: iteration 34500/ 37905 | consumed samples: 8832000 | consumed tokens: 18087936000 | elapsed time per iteration (s): 0.22 | learning rate: 2.363E-05 | global batch size: 256 | lm loss: 3.698484E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.860 | TFLOPs: 29.27 | +7: iteration 34510/ 37905 | consumed samples: 8834560 | consumed tokens: 18093178880 | elapsed time per iteration (s): 0.23 | learning rate: 2.361E-05 | global batch size: 256 | lm loss: 3.693272E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.979 | TFLOPs: 28.46 | +7: iteration 34520/ 37905 | consumed samples: 8837120 | consumed tokens: 18098421760 | elapsed time per iteration (s): 0.24 | learning rate: 2.359E-05 | global batch size: 256 | lm loss: 3.699097E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1088.976 | TFLOPs: 27.74 | +7: iteration 34530/ 37905 | consumed samples: 8839680 | consumed tokens: 18103664640 | elapsed time per iteration (s): 0.24 | learning rate: 2.357E-05 | global batch size: 256 | lm loss: 3.682570E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1071.426 | TFLOPs: 27.29 | +7: iteration 34540/ 37905 | consumed samples: 8842240 | consumed tokens: 18108907520 | elapsed time per iteration (s): 0.23 | learning rate: 2.355E-05 | global batch size: 256 | lm loss: 3.693192E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1098.811 | TFLOPs: 27.99 | +7: iteration 34550/ 37905 | consumed samples: 8844800 | consumed tokens: 18114150400 | elapsed time per iteration (s): 0.24 | learning rate: 2.353E-05 | global batch size: 256 | lm loss: 3.684096E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1072.772 | TFLOPs: 27.33 | +7: iteration 34560/ 37905 | consumed samples: 8847360 | consumed tokens: 18119393280 | elapsed time per iteration (s): 0.22 | learning rate: 2.351E-05 | global batch size: 256 | lm loss: 3.684822E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.758 | TFLOPs: 29.42 | +7: iteration 34570/ 37905 | consumed samples: 8849920 | consumed tokens: 18124636160 | elapsed time per iteration (s): 0.23 | learning rate: 2.349E-05 | global batch size: 256 | lm loss: 3.684890E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.434 | TFLOPs: 28.77 | +7: iteration 34580/ 37905 | consumed samples: 8852480 | consumed tokens: 18129879040 | elapsed time per iteration (s): 0.23 | learning rate: 2.346E-05 | global batch size: 256 | lm loss: 3.691074E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.002 | TFLOPs: 28.58 | +7: iteration 34590/ 37905 | consumed samples: 8855040 | consumed tokens: 18135121920 | elapsed time per iteration (s): 0.24 | learning rate: 2.344E-05 | global batch size: 256 | lm loss: 3.689586E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1075.430 | TFLOPs: 27.40 | +7: iteration 34600/ 37905 | consumed samples: 8857600 | consumed tokens: 18140364800 | elapsed time per iteration (s): 0.22 | learning rate: 2.342E-05 | global batch size: 256 | lm loss: 3.681219E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.038 | TFLOPs: 29.07 | +7: iteration 34610/ 37905 | consumed samples: 8860160 | consumed tokens: 18145607680 | elapsed time per iteration (s): 0.22 | learning rate: 2.340E-05 | global batch size: 256 | lm loss: 3.696755E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.012 | TFLOPs: 29.07 | +7: iteration 34620/ 37905 | consumed samples: 8862720 | consumed tokens: 18150850560 | elapsed time per iteration (s): 0.22 | learning rate: 2.338E-05 | global batch size: 256 | lm loss: 3.681549E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.859 | TFLOPs: 29.24 | +7: iteration 34630/ 37905 | consumed samples: 8865280 | consumed tokens: 18156093440 | elapsed time per iteration (s): 0.22 | learning rate: 2.336E-05 | global batch size: 256 | lm loss: 3.700879E+00 | grad norm: 0.220 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.881 | TFLOPs: 29.32 | +7: iteration 34640/ 37905 | consumed samples: 8867840 | consumed tokens: 18161336320 | elapsed time per iteration (s): 0.22 | learning rate: 2.334E-05 | global batch size: 256 | lm loss: 3.691973E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.262 | TFLOPs: 29.15 | +7: iteration 34650/ 37905 | consumed samples: 8870400 | consumed tokens: 18166579200 | elapsed time per iteration (s): 0.23 | learning rate: 2.332E-05 | global batch size: 256 | lm loss: 3.681724E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.330 | TFLOPs: 28.49 | +7: iteration 34660/ 37905 | consumed samples: 8872960 | consumed tokens: 18171822080 | elapsed time per iteration (s): 0.23 | learning rate: 2.330E-05 | global batch size: 256 | lm loss: 3.687373E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.724 | TFLOPs: 28.58 | +7: iteration 34670/ 37905 | consumed samples: 8875520 | consumed tokens: 18177064960 | elapsed time per iteration (s): 0.23 | learning rate: 2.328E-05 | global batch size: 256 | lm loss: 3.684050E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.682 | TFLOPs: 28.75 | +7: iteration 34680/ 37905 | consumed samples: 8878080 | consumed tokens: 18182307840 | elapsed time per iteration (s): 0.23 | learning rate: 2.326E-05 | global batch size: 256 | lm loss: 3.682235E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.919 | TFLOPs: 28.43 | +7: iteration 34690/ 37905 | consumed samples: 8880640 | consumed tokens: 18187550720 | elapsed time per iteration (s): 0.23 | learning rate: 2.324E-05 | global batch size: 256 | lm loss: 3.685897E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.450 | TFLOPs: 28.85 | +7: iteration 34700/ 37905 | consumed samples: 8883200 | consumed tokens: 18192793600 | elapsed time per iteration (s): 0.23 | learning rate: 2.322E-05 | global batch size: 256 | lm loss: 3.696429E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.269 | TFLOPs: 28.92 | +7: iteration 34710/ 37905 | consumed samples: 8885760 | consumed tokens: 18198036480 | elapsed time per iteration (s): 0.23 | learning rate: 2.320E-05 | global batch size: 256 | lm loss: 3.697106E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.383 | TFLOPs: 28.69 | +7: iteration 34720/ 37905 | consumed samples: 8888320 | consumed tokens: 18203279360 | elapsed time per iteration (s): 0.24 | learning rate: 2.318E-05 | global batch size: 256 | lm loss: 3.692236E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1044.901 | TFLOPs: 26.62 | +7: iteration 34730/ 37905 | consumed samples: 8890880 | consumed tokens: 18208522240 | elapsed time per iteration (s): 0.23 | learning rate: 2.316E-05 | global batch size: 256 | lm loss: 3.691066E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.981 | TFLOPs: 28.61 | +7: iteration 34740/ 37905 | consumed samples: 8893440 | consumed tokens: 18213765120 | elapsed time per iteration (s): 0.23 | learning rate: 2.314E-05 | global batch size: 256 | lm loss: 3.695055E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.645 | TFLOPs: 28.19 | +7: iteration 34750/ 37905 | consumed samples: 8896000 | consumed tokens: 18219008000 | elapsed time per iteration (s): 0.23 | learning rate: 2.312E-05 | global batch size: 256 | lm loss: 3.708842E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.070 | TFLOPs: 28.28 | +7: iteration 34760/ 37905 | consumed samples: 8898560 | consumed tokens: 18224250880 | elapsed time per iteration (s): 0.22 | learning rate: 2.310E-05 | global batch size: 256 | lm loss: 3.695250E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.944 | TFLOPs: 29.07 | +7: iteration 34770/ 37905 | consumed samples: 8901120 | consumed tokens: 18229493760 | elapsed time per iteration (s): 0.23 | learning rate: 2.308E-05 | global batch size: 256 | lm loss: 3.689191E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.014 | TFLOPs: 28.33 | +7: iteration 34780/ 37905 | consumed samples: 8903680 | consumed tokens: 18234736640 | elapsed time per iteration (s): 0.22 | learning rate: 2.306E-05 | global batch size: 256 | lm loss: 3.694693E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.514 | TFLOPs: 29.13 | +7: iteration 34790/ 37905 | consumed samples: 8906240 | consumed tokens: 18239979520 | elapsed time per iteration (s): 0.22 | learning rate: 2.304E-05 | global batch size: 256 | lm loss: 3.688115E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.806 | TFLOPs: 29.14 | +7: iteration 34800/ 37905 | consumed samples: 8908800 | consumed tokens: 18245222400 | elapsed time per iteration (s): 0.22 | learning rate: 2.302E-05 | global batch size: 256 | lm loss: 3.692752E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.955 | TFLOPs: 29.42 | +7: iteration 34810/ 37905 | consumed samples: 8911360 | consumed tokens: 18250465280 | elapsed time per iteration (s): 0.23 | learning rate: 2.300E-05 | global batch size: 256 | lm loss: 3.683485E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.315 | TFLOPs: 28.29 | +7: iteration 34820/ 37905 | consumed samples: 8913920 | consumed tokens: 18255708160 | elapsed time per iteration (s): 0.23 | learning rate: 2.299E-05 | global batch size: 256 | lm loss: 3.704913E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.931 | TFLOPs: 28.33 | +7: iteration 34830/ 37905 | consumed samples: 8916480 | consumed tokens: 18260951040 | elapsed time per iteration (s): 0.23 | learning rate: 2.297E-05 | global batch size: 256 | lm loss: 3.694588E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.766 | TFLOPs: 28.63 | +7: iteration 34840/ 37905 | consumed samples: 8919040 | consumed tokens: 18266193920 | elapsed time per iteration (s): 0.23 | learning rate: 2.295E-05 | global batch size: 256 | lm loss: 3.685944E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.439 | TFLOPs: 28.59 | +7: iteration 34850/ 37905 | consumed samples: 8921600 | consumed tokens: 18271436800 | elapsed time per iteration (s): 0.23 | learning rate: 2.293E-05 | global batch size: 256 | lm loss: 3.688087E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1114.369 | TFLOPs: 28.39 | +7: iteration 34860/ 37905 | consumed samples: 8924160 | consumed tokens: 18276679680 | elapsed time per iteration (s): 0.23 | learning rate: 2.291E-05 | global batch size: 256 | lm loss: 3.695189E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.046 | TFLOPs: 28.76 | +7: iteration 34870/ 37905 | consumed samples: 8926720 | consumed tokens: 18281922560 | elapsed time per iteration (s): 0.23 | learning rate: 2.289E-05 | global batch size: 256 | lm loss: 3.699075E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.270 | TFLOPs: 28.49 | +7: iteration 34880/ 37905 | consumed samples: 8929280 | consumed tokens: 18287165440 | elapsed time per iteration (s): 0.23 | learning rate: 2.287E-05 | global batch size: 256 | lm loss: 3.695338E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.163 | TFLOPs: 28.59 | +7: iteration 34890/ 37905 | consumed samples: 8931840 | consumed tokens: 18292408320 | elapsed time per iteration (s): 0.23 | learning rate: 2.285E-05 | global batch size: 256 | lm loss: 3.682729E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1098.044 | TFLOPs: 27.97 | +7: iteration 34900/ 37905 | consumed samples: 8934400 | consumed tokens: 18297651200 | elapsed time per iteration (s): 0.22 | learning rate: 2.283E-05 | global batch size: 256 | lm loss: 3.699969E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.614 | TFLOPs: 29.08 | +7: iteration 34910/ 37905 | consumed samples: 8936960 | consumed tokens: 18302894080 | elapsed time per iteration (s): 0.24 | learning rate: 2.281E-05 | global batch size: 256 | lm loss: 3.671402E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1088.079 | TFLOPs: 27.72 | +7: iteration 34920/ 37905 | consumed samples: 8939520 | consumed tokens: 18308136960 | elapsed time per iteration (s): 0.23 | learning rate: 2.280E-05 | global batch size: 256 | lm loss: 3.699501E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.706 | TFLOPs: 27.94 | +7: iteration 34930/ 37905 | consumed samples: 8942080 | consumed tokens: 18313379840 | elapsed time per iteration (s): 0.23 | learning rate: 2.278E-05 | global batch size: 256 | lm loss: 3.685239E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.643 | TFLOPs: 28.83 | +7: iteration 34940/ 37905 | consumed samples: 8944640 | consumed tokens: 18318622720 | elapsed time per iteration (s): 0.23 | learning rate: 2.276E-05 | global batch size: 256 | lm loss: 3.696882E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.780 | TFLOPs: 28.22 | +7: iteration 34950/ 37905 | consumed samples: 8947200 | consumed tokens: 18323865600 | elapsed time per iteration (s): 0.23 | learning rate: 2.274E-05 | global batch size: 256 | lm loss: 3.694997E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.578 | TFLOPs: 28.27 | +7: iteration 34960/ 37905 | consumed samples: 8949760 | consumed tokens: 18329108480 | elapsed time per iteration (s): 0.23 | learning rate: 2.272E-05 | global batch size: 256 | lm loss: 3.691174E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.476 | TFLOPs: 28.44 | +7: iteration 34970/ 37905 | consumed samples: 8952320 | consumed tokens: 18334351360 | elapsed time per iteration (s): 0.22 | learning rate: 2.270E-05 | global batch size: 256 | lm loss: 3.678959E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.764 | TFLOPs: 29.06 | +7: iteration 34980/ 37905 | consumed samples: 8954880 | consumed tokens: 18339594240 | elapsed time per iteration (s): 0.23 | learning rate: 2.269E-05 | global batch size: 256 | lm loss: 3.707651E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.190 | TFLOPs: 28.89 | +7: iteration 34990/ 37905 | consumed samples: 8957440 | consumed tokens: 18344837120 | elapsed time per iteration (s): 0.23 | learning rate: 2.267E-05 | global batch size: 256 | lm loss: 3.699218E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.338 | TFLOPs: 28.64 | +7: iteration 35000/ 37905 | consumed samples: 8960000 | consumed tokens: 18350080000 | elapsed time per iteration (s): 0.23 | learning rate: 2.265E-05 | global batch size: 256 | lm loss: 3.688733E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.959 | TFLOPs: 28.30 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 35000 | lm loss value: 3.635775E+00 | lm loss PPL: 3.793123E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 35000 to checkpoints_83m20b400m +0: [2023-03-16 00:07:54,592] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step35000 is begin to save! +0: [2023-03-16 00:07:54,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:07:54,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:07:54,687] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:07:54,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:07:54,701] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:07:54,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:07:54,712] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:07:54,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:07:54,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:07:54,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:07:54,735] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:07:54,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:07:54,747] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:07:54,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:07:54,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:07:54,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:07:54,770] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:07:54,781] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:07:54,781] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:07:54,792] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:07:54,793] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:07:54,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:07:54,804] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:07:54,805] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:07:54,806] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step35000/mp_rank_00_model_states.pt +0: [2023-03-16 00:07:54,806] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/mp_rank_00_model_states.pt... +0: [2023-03-16 00:07:54,809] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/mp_rank_00_model_states.pt. +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:07:54,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:07:54,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,847] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,848] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,851] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,851] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +3: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +0: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +7: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +5: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +1: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +1: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +2: [2023-03-16 00:07:54,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:07:54,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:07:54,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:07:54,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +4: [2023-03-16 00:07:54,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 00:07:54,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +6: [2023-03-16 00:07:54,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:07:54,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step35000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 00:07:54,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step35000 is ready now! +0: successfully saved checkpoint at iteration 35000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 281.23 +7: iteration 35010/ 37905 | consumed samples: 8962560 | consumed tokens: 18355322880 | elapsed time per iteration (s): 0.26 | learning rate: 2.263E-05 | global batch size: 256 | lm loss: 3.699152E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 989.222 | TFLOPs: 25.20 | +7: iteration 35020/ 37905 | consumed samples: 8965120 | consumed tokens: 18360565760 | elapsed time per iteration (s): 0.23 | learning rate: 2.261E-05 | global batch size: 256 | lm loss: 3.695958E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.288 | TFLOPs: 28.54 | +7: iteration 35030/ 37905 | consumed samples: 8967680 | consumed tokens: 18365808640 | elapsed time per iteration (s): 0.23 | learning rate: 2.259E-05 | global batch size: 256 | lm loss: 3.695930E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.014 | TFLOPs: 28.20 | +7: iteration 35040/ 37905 | consumed samples: 8970240 | consumed tokens: 18371051520 | elapsed time per iteration (s): 0.23 | learning rate: 2.258E-05 | global batch size: 256 | lm loss: 3.674247E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.965 | TFLOPs: 28.63 | +7: iteration 35050/ 37905 | consumed samples: 8972800 | consumed tokens: 18376294400 | elapsed time per iteration (s): 0.23 | learning rate: 2.256E-05 | global batch size: 256 | lm loss: 3.685418E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1094.412 | TFLOPs: 27.88 | +7: iteration 35060/ 37905 | consumed samples: 8975360 | consumed tokens: 18381537280 | elapsed time per iteration (s): 0.22 | learning rate: 2.254E-05 | global batch size: 256 | lm loss: 3.691079E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.468 | TFLOPs: 29.16 | +7: iteration 35070/ 37905 | consumed samples: 8977920 | consumed tokens: 18386780160 | elapsed time per iteration (s): 0.23 | learning rate: 2.252E-05 | global batch size: 256 | lm loss: 3.696262E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1115.359 | TFLOPs: 28.41 | +7: iteration 35080/ 37905 | consumed samples: 8980480 | consumed tokens: 18392023040 | elapsed time per iteration (s): 0.24 | learning rate: 2.251E-05 | global batch size: 256 | lm loss: 3.700032E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1073.525 | TFLOPs: 27.35 | +7: iteration 35090/ 37905 | consumed samples: 8983040 | consumed tokens: 18397265920 | elapsed time per iteration (s): 0.23 | learning rate: 2.249E-05 | global batch size: 256 | lm loss: 3.693798E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.929 | TFLOPs: 28.28 | +7: iteration 35100/ 37905 | consumed samples: 8985600 | consumed tokens: 18402508800 | elapsed time per iteration (s): 0.23 | learning rate: 2.247E-05 | global batch size: 256 | lm loss: 3.693289E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.674 | TFLOPs: 28.73 | +7: iteration 35110/ 37905 | consumed samples: 8988160 | consumed tokens: 18407751680 | elapsed time per iteration (s): 0.23 | learning rate: 2.245E-05 | global batch size: 256 | lm loss: 3.689285E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1107.387 | TFLOPs: 28.21 | +7: iteration 35120/ 37905 | consumed samples: 8990720 | consumed tokens: 18412994560 | elapsed time per iteration (s): 0.23 | learning rate: 2.244E-05 | global batch size: 256 | lm loss: 3.680107E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.418 | TFLOPs: 28.26 | +7: iteration 35130/ 37905 | consumed samples: 8993280 | consumed tokens: 18418237440 | elapsed time per iteration (s): 0.22 | learning rate: 2.242E-05 | global batch size: 256 | lm loss: 3.692636E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.745 | TFLOPs: 29.01 | +7: iteration 35140/ 37905 | consumed samples: 8995840 | consumed tokens: 18423480320 | elapsed time per iteration (s): 0.23 | learning rate: 2.240E-05 | global batch size: 256 | lm loss: 3.683646E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1109.538 | TFLOPs: 28.27 | +7: iteration 35150/ 37905 | consumed samples: 8998400 | consumed tokens: 18428723200 | elapsed time per iteration (s): 0.23 | learning rate: 2.238E-05 | global batch size: 256 | lm loss: 3.692049E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.257 | TFLOPs: 28.16 | +7: iteration 35160/ 37905 | consumed samples: 9000960 | consumed tokens: 18433966080 | elapsed time per iteration (s): 0.23 | learning rate: 2.237E-05 | global batch size: 256 | lm loss: 3.696316E+00 | grad norm: 0.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.257 | TFLOPs: 28.56 | +7: iteration 35170/ 37905 | consumed samples: 9003520 | consumed tokens: 18439208960 | elapsed time per iteration (s): 0.23 | learning rate: 2.235E-05 | global batch size: 256 | lm loss: 3.680550E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.283 | TFLOPs: 28.87 | +7: iteration 35180/ 37905 | consumed samples: 9006080 | consumed tokens: 18444451840 | elapsed time per iteration (s): 0.23 | learning rate: 2.233E-05 | global batch size: 256 | lm loss: 3.679016E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1097.940 | TFLOPs: 27.97 | +7: iteration 35190/ 37905 | consumed samples: 9008640 | consumed tokens: 18449694720 | elapsed time per iteration (s): 0.23 | learning rate: 2.231E-05 | global batch size: 256 | lm loss: 3.695628E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.322 | TFLOPs: 28.67 | +7: iteration 35200/ 37905 | consumed samples: 9011200 | consumed tokens: 18454937600 | elapsed time per iteration (s): 0.23 | learning rate: 2.230E-05 | global batch size: 256 | lm loss: 3.703072E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.289 | TFLOPs: 28.59 | +7: iteration 35210/ 37905 | consumed samples: 9013760 | consumed tokens: 18460180480 | elapsed time per iteration (s): 0.23 | learning rate: 2.228E-05 | global batch size: 256 | lm loss: 3.696365E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.137 | TFLOPs: 28.56 | +7: iteration 35220/ 37905 | consumed samples: 9016320 | consumed tokens: 18465423360 | elapsed time per iteration (s): 0.24 | learning rate: 2.226E-05 | global batch size: 256 | lm loss: 3.686037E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1081.437 | TFLOPs: 27.55 | +7: iteration 35230/ 37905 | consumed samples: 9018880 | consumed tokens: 18470666240 | elapsed time per iteration (s): 0.23 | learning rate: 2.225E-05 | global batch size: 256 | lm loss: 3.691630E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.823 | TFLOPs: 28.86 | +7: iteration 35240/ 37905 | consumed samples: 9021440 | consumed tokens: 18475909120 | elapsed time per iteration (s): 0.23 | learning rate: 2.223E-05 | global batch size: 256 | lm loss: 3.691295E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.644 | TFLOPs: 28.52 | +7: iteration 35250/ 37905 | consumed samples: 9024000 | consumed tokens: 18481152000 | elapsed time per iteration (s): 0.23 | learning rate: 2.221E-05 | global batch size: 256 | lm loss: 3.695411E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.029 | TFLOPs: 28.02 | +7: iteration 35260/ 37905 | consumed samples: 9026560 | consumed tokens: 18486394880 | elapsed time per iteration (s): 0.23 | learning rate: 2.220E-05 | global batch size: 256 | lm loss: 3.688823E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.968 | TFLOPs: 28.33 | +7: iteration 35270/ 37905 | consumed samples: 9029120 | consumed tokens: 18491637760 | elapsed time per iteration (s): 0.23 | learning rate: 2.218E-05 | global batch size: 256 | lm loss: 3.677028E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1101.449 | TFLOPs: 28.06 | +7: iteration 35280/ 37905 | consumed samples: 9031680 | consumed tokens: 18496880640 | elapsed time per iteration (s): 0.23 | learning rate: 2.216E-05 | global batch size: 256 | lm loss: 3.690673E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.682 | TFLOPs: 28.58 | +7: iteration 35290/ 37905 | consumed samples: 9034240 | consumed tokens: 18502123520 | elapsed time per iteration (s): 0.23 | learning rate: 2.215E-05 | global batch size: 256 | lm loss: 3.691093E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.377 | TFLOPs: 28.92 | +7: iteration 35300/ 37905 | consumed samples: 9036800 | consumed tokens: 18507366400 | elapsed time per iteration (s): 0.23 | learning rate: 2.213E-05 | global batch size: 256 | lm loss: 3.685569E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.594 | TFLOPs: 28.88 | +7: iteration 35310/ 37905 | consumed samples: 9039360 | consumed tokens: 18512609280 | elapsed time per iteration (s): 0.23 | learning rate: 2.212E-05 | global batch size: 256 | lm loss: 3.676912E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1094.182 | TFLOPs: 27.87 | +7: iteration 35320/ 37905 | consumed samples: 9041920 | consumed tokens: 18517852160 | elapsed time per iteration (s): 0.23 | learning rate: 2.210E-05 | global batch size: 256 | lm loss: 3.700538E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.962 | TFLOPs: 28.15 | +7: iteration 35330/ 37905 | consumed samples: 9044480 | consumed tokens: 18523095040 | elapsed time per iteration (s): 0.23 | learning rate: 2.208E-05 | global batch size: 256 | lm loss: 3.697210E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.437 | TFLOPs: 28.47 | +7: iteration 35340/ 37905 | consumed samples: 9047040 | consumed tokens: 18528337920 | elapsed time per iteration (s): 0.23 | learning rate: 2.207E-05 | global batch size: 256 | lm loss: 3.694623E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.690 | TFLOPs: 27.94 | +7: iteration 35350/ 37905 | consumed samples: 9049600 | consumed tokens: 18533580800 | elapsed time per iteration (s): 0.23 | learning rate: 2.205E-05 | global batch size: 256 | lm loss: 3.686000E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.979 | TFLOPs: 28.20 | +7: iteration 35360/ 37905 | consumed samples: 9052160 | consumed tokens: 18538823680 | elapsed time per iteration (s): 0.23 | learning rate: 2.204E-05 | global batch size: 256 | lm loss: 3.694234E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.268 | TFLOPs: 28.92 | +7: iteration 35370/ 37905 | consumed samples: 9054720 | consumed tokens: 18544066560 | elapsed time per iteration (s): 0.25 | learning rate: 2.202E-05 | global batch size: 256 | lm loss: 3.683617E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1034.907 | TFLOPs: 26.36 | +7: iteration 35380/ 37905 | consumed samples: 9057280 | consumed tokens: 18549309440 | elapsed time per iteration (s): 0.23 | learning rate: 2.200E-05 | global batch size: 256 | lm loss: 3.699782E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.948 | TFLOPs: 28.38 | +7: iteration 35390/ 37905 | consumed samples: 9059840 | consumed tokens: 18554552320 | elapsed time per iteration (s): 0.23 | learning rate: 2.199E-05 | global batch size: 256 | lm loss: 3.688290E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.527 | TFLOPs: 28.55 | +7: iteration 35400/ 37905 | consumed samples: 9062400 | consumed tokens: 18559795200 | elapsed time per iteration (s): 0.23 | learning rate: 2.197E-05 | global batch size: 256 | lm loss: 3.695657E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1103.484 | TFLOPs: 28.11 | +7: iteration 35410/ 37905 | consumed samples: 9064960 | consumed tokens: 18565038080 | elapsed time per iteration (s): 0.23 | learning rate: 2.196E-05 | global batch size: 256 | lm loss: 3.711012E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.413 | TFLOPs: 28.34 | +7: iteration 35420/ 37905 | consumed samples: 9067520 | consumed tokens: 18570280960 | elapsed time per iteration (s): 0.24 | learning rate: 2.194E-05 | global batch size: 256 | lm loss: 3.697892E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1088.479 | TFLOPs: 27.73 | +7: iteration 35430/ 37905 | consumed samples: 9070080 | consumed tokens: 18575523840 | elapsed time per iteration (s): 0.23 | learning rate: 2.193E-05 | global batch size: 256 | lm loss: 3.704888E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.888 | TFLOPs: 28.91 | +7: iteration 35440/ 37905 | consumed samples: 9072640 | consumed tokens: 18580766720 | elapsed time per iteration (s): 0.23 | learning rate: 2.191E-05 | global batch size: 256 | lm loss: 3.680539E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1093.379 | TFLOPs: 27.85 | +7: iteration 35450/ 37905 | consumed samples: 9075200 | consumed tokens: 18586009600 | elapsed time per iteration (s): 0.25 | learning rate: 2.189E-05 | global batch size: 256 | lm loss: 3.697706E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1012.822 | TFLOPs: 25.80 | +7: iteration 35460/ 37905 | consumed samples: 9077760 | consumed tokens: 18591252480 | elapsed time per iteration (s): 0.23 | learning rate: 2.188E-05 | global batch size: 256 | lm loss: 3.687118E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.477 | TFLOPs: 28.57 | +7: iteration 35470/ 37905 | consumed samples: 9080320 | consumed tokens: 18596495360 | elapsed time per iteration (s): 0.23 | learning rate: 2.186E-05 | global batch size: 256 | lm loss: 3.695002E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1105.349 | TFLOPs: 28.16 | +7: iteration 35480/ 37905 | consumed samples: 9082880 | consumed tokens: 18601738240 | elapsed time per iteration (s): 0.23 | learning rate: 2.185E-05 | global batch size: 256 | lm loss: 3.683287E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.656 | TFLOPs: 28.45 | +7: iteration 35490/ 37905 | consumed samples: 9085440 | consumed tokens: 18606981120 | elapsed time per iteration (s): 0.23 | learning rate: 2.183E-05 | global batch size: 256 | lm loss: 3.680103E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.195 | TFLOPs: 28.69 | +7: iteration 35500/ 37905 | consumed samples: 9088000 | consumed tokens: 18612224000 | elapsed time per iteration (s): 0.23 | learning rate: 2.182E-05 | global batch size: 256 | lm loss: 3.681506E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.417 | TFLOPs: 28.70 | +7: iteration 35510/ 37905 | consumed samples: 9090560 | consumed tokens: 18617466880 | elapsed time per iteration (s): 0.22 | learning rate: 2.180E-05 | global batch size: 256 | lm loss: 3.695792E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.253 | TFLOPs: 29.07 | +7: iteration 35520/ 37905 | consumed samples: 9093120 | consumed tokens: 18622709760 | elapsed time per iteration (s): 0.23 | learning rate: 2.179E-05 | global batch size: 256 | lm loss: 3.689422E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.606 | TFLOPs: 28.73 | +7: iteration 35530/ 37905 | consumed samples: 9095680 | consumed tokens: 18627952640 | elapsed time per iteration (s): 0.23 | learning rate: 2.177E-05 | global batch size: 256 | lm loss: 3.680154E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.745 | TFLOPs: 28.70 | +7: iteration 35540/ 37905 | consumed samples: 9098240 | consumed tokens: 18633195520 | elapsed time per iteration (s): 0.23 | learning rate: 2.176E-05 | global batch size: 256 | lm loss: 3.688984E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.226 | TFLOPs: 28.56 | +7: iteration 35550/ 37905 | consumed samples: 9100800 | consumed tokens: 18638438400 | elapsed time per iteration (s): 0.23 | learning rate: 2.174E-05 | global batch size: 256 | lm loss: 3.685430E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.394 | TFLOPs: 28.77 | +7: iteration 35560/ 37905 | consumed samples: 9103360 | consumed tokens: 18643681280 | elapsed time per iteration (s): 0.23 | learning rate: 2.173E-05 | global batch size: 256 | lm loss: 3.692016E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.832 | TFLOPs: 28.60 | +7: iteration 35570/ 37905 | consumed samples: 9105920 | consumed tokens: 18648924160 | elapsed time per iteration (s): 0.23 | learning rate: 2.171E-05 | global batch size: 256 | lm loss: 3.684930E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.560 | TFLOPs: 27.94 | +7: iteration 35580/ 37905 | consumed samples: 9108480 | consumed tokens: 18654167040 | elapsed time per iteration (s): 0.23 | learning rate: 2.170E-05 | global batch size: 256 | lm loss: 3.693659E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.153 | TFLOPs: 28.51 | +7: iteration 35590/ 37905 | consumed samples: 9111040 | consumed tokens: 18659409920 | elapsed time per iteration (s): 0.23 | learning rate: 2.169E-05 | global batch size: 256 | lm loss: 3.684156E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.508 | TFLOPs: 28.49 | +7: iteration 35600/ 37905 | consumed samples: 9113600 | consumed tokens: 18664652800 | elapsed time per iteration (s): 0.23 | learning rate: 2.167E-05 | global batch size: 256 | lm loss: 3.685580E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.645 | TFLOPs: 28.93 | +7: iteration 35610/ 37905 | consumed samples: 9116160 | consumed tokens: 18669895680 | elapsed time per iteration (s): 0.23 | learning rate: 2.166E-05 | global batch size: 256 | lm loss: 3.685891E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1096.057 | TFLOPs: 27.92 | +7: iteration 35620/ 37905 | consumed samples: 9118720 | consumed tokens: 18675138560 | elapsed time per iteration (s): 0.22 | learning rate: 2.164E-05 | global batch size: 256 | lm loss: 3.693142E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.452 | TFLOPs: 29.23 | +7: iteration 35630/ 37905 | consumed samples: 9121280 | consumed tokens: 18680381440 | elapsed time per iteration (s): 0.23 | learning rate: 2.163E-05 | global batch size: 256 | lm loss: 3.689293E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.395 | TFLOPs: 28.62 | +7: iteration 35640/ 37905 | consumed samples: 9123840 | consumed tokens: 18685624320 | elapsed time per iteration (s): 0.24 | learning rate: 2.161E-05 | global batch size: 256 | lm loss: 3.695347E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1072.723 | TFLOPs: 27.33 | +7: iteration 35650/ 37905 | consumed samples: 9126400 | consumed tokens: 18690867200 | elapsed time per iteration (s): 0.23 | learning rate: 2.160E-05 | global batch size: 256 | lm loss: 3.676216E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1111.579 | TFLOPs: 28.32 | +7: iteration 35660/ 37905 | consumed samples: 9128960 | consumed tokens: 18696110080 | elapsed time per iteration (s): 0.23 | learning rate: 2.159E-05 | global batch size: 256 | lm loss: 3.678171E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.055 | TFLOPs: 28.43 | +7: iteration 35670/ 37905 | consumed samples: 9131520 | consumed tokens: 18701352960 | elapsed time per iteration (s): 0.23 | learning rate: 2.157E-05 | global batch size: 256 | lm loss: 3.686951E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1097.558 | TFLOPs: 27.96 | +7: iteration 35680/ 37905 | consumed samples: 9134080 | consumed tokens: 18706595840 | elapsed time per iteration (s): 0.23 | learning rate: 2.156E-05 | global batch size: 256 | lm loss: 3.690545E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1098.833 | TFLOPs: 27.99 | +7: iteration 35690/ 37905 | consumed samples: 9136640 | consumed tokens: 18711838720 | elapsed time per iteration (s): 0.23 | learning rate: 2.154E-05 | global batch size: 256 | lm loss: 3.681483E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.813 | TFLOPs: 28.83 | +7: iteration 35700/ 37905 | consumed samples: 9139200 | consumed tokens: 18717081600 | elapsed time per iteration (s): 0.22 | learning rate: 2.153E-05 | global batch size: 256 | lm loss: 3.683296E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.439 | TFLOPs: 29.08 | +7: iteration 35710/ 37905 | consumed samples: 9141760 | consumed tokens: 18722324480 | elapsed time per iteration (s): 0.23 | learning rate: 2.152E-05 | global batch size: 256 | lm loss: 3.700802E+00 | grad norm: 0.221 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.182 | TFLOPs: 28.79 | +7: iteration 35720/ 37905 | consumed samples: 9144320 | consumed tokens: 18727567360 | elapsed time per iteration (s): 0.23 | learning rate: 2.150E-05 | global batch size: 256 | lm loss: 3.697052E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1100.733 | TFLOPs: 28.04 | +7: iteration 35730/ 37905 | consumed samples: 9146880 | consumed tokens: 18732810240 | elapsed time per iteration (s): 0.23 | learning rate: 2.149E-05 | global batch size: 256 | lm loss: 3.681094E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1097.380 | TFLOPs: 27.96 | +7: iteration 35740/ 37905 | consumed samples: 9149440 | consumed tokens: 18738053120 | elapsed time per iteration (s): 0.23 | learning rate: 2.147E-05 | global batch size: 256 | lm loss: 3.682303E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1091.869 | TFLOPs: 27.82 | +7: iteration 35750/ 37905 | consumed samples: 9152000 | consumed tokens: 18743296000 | elapsed time per iteration (s): 0.23 | learning rate: 2.146E-05 | global batch size: 256 | lm loss: 3.692517E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.182 | TFLOPs: 28.92 | +7: iteration 35760/ 37905 | consumed samples: 9154560 | consumed tokens: 18748538880 | elapsed time per iteration (s): 0.23 | learning rate: 2.145E-05 | global batch size: 256 | lm loss: 3.679816E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.619 | TFLOPs: 28.47 | +7: iteration 35770/ 37905 | consumed samples: 9157120 | consumed tokens: 18753781760 | elapsed time per iteration (s): 0.23 | learning rate: 2.143E-05 | global batch size: 256 | lm loss: 3.687514E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.015 | TFLOPs: 28.94 | +7: iteration 35780/ 37905 | consumed samples: 9159680 | consumed tokens: 18759024640 | elapsed time per iteration (s): 0.23 | learning rate: 2.142E-05 | global batch size: 256 | lm loss: 3.689373E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1103.085 | TFLOPs: 28.10 | +7: iteration 35790/ 37905 | consumed samples: 9162240 | consumed tokens: 18764267520 | elapsed time per iteration (s): 0.23 | learning rate: 2.141E-05 | global batch size: 256 | lm loss: 3.707013E+00 | grad norm: 0.211 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.792 | TFLOPs: 28.65 | +7: iteration 35800/ 37905 | consumed samples: 9164800 | consumed tokens: 18769510400 | elapsed time per iteration (s): 0.23 | learning rate: 2.139E-05 | global batch size: 256 | lm loss: 3.683506E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.650 | TFLOPs: 28.19 | +7: iteration 35810/ 37905 | consumed samples: 9167360 | consumed tokens: 18774753280 | elapsed time per iteration (s): 0.23 | learning rate: 2.138E-05 | global batch size: 256 | lm loss: 3.690404E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.239 | TFLOPs: 28.87 | +7: iteration 35820/ 37905 | consumed samples: 9169920 | consumed tokens: 18779996160 | elapsed time per iteration (s): 0.23 | learning rate: 2.137E-05 | global batch size: 256 | lm loss: 3.695244E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.830 | TFLOPs: 28.25 | +7: iteration 35830/ 37905 | consumed samples: 9172480 | consumed tokens: 18785239040 | elapsed time per iteration (s): 0.23 | learning rate: 2.135E-05 | global batch size: 256 | lm loss: 3.686158E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.942 | TFLOPs: 28.63 | +7: iteration 35840/ 37905 | consumed samples: 9175040 | consumed tokens: 18790481920 | elapsed time per iteration (s): 0.23 | learning rate: 2.134E-05 | global batch size: 256 | lm loss: 3.697090E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.859 | TFLOPs: 28.96 | +7: iteration 35850/ 37905 | consumed samples: 9177600 | consumed tokens: 18795724800 | elapsed time per iteration (s): 0.22 | learning rate: 2.133E-05 | global batch size: 256 | lm loss: 3.696758E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.912 | TFLOPs: 28.99 | +7: iteration 35860/ 37905 | consumed samples: 9180160 | consumed tokens: 18800967680 | elapsed time per iteration (s): 0.22 | learning rate: 2.132E-05 | global batch size: 256 | lm loss: 3.682220E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.052 | TFLOPs: 28.99 | +7: iteration 35870/ 37905 | consumed samples: 9182720 | consumed tokens: 18806210560 | elapsed time per iteration (s): 0.23 | learning rate: 2.130E-05 | global batch size: 256 | lm loss: 3.680396E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.607 | TFLOPs: 28.62 | +7: iteration 35880/ 37905 | consumed samples: 9185280 | consumed tokens: 18811453440 | elapsed time per iteration (s): 0.23 | learning rate: 2.129E-05 | global batch size: 256 | lm loss: 3.674948E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.870 | TFLOPs: 28.61 | +7: iteration 35890/ 37905 | consumed samples: 9187840 | consumed tokens: 18816696320 | elapsed time per iteration (s): 0.23 | learning rate: 2.128E-05 | global batch size: 256 | lm loss: 3.689994E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.869 | TFLOPs: 28.71 | +7: iteration 35900/ 37905 | consumed samples: 9190400 | consumed tokens: 18821939200 | elapsed time per iteration (s): 0.23 | learning rate: 2.127E-05 | global batch size: 256 | lm loss: 3.687388E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.334 | TFLOPs: 28.46 | +7: iteration 35910/ 37905 | consumed samples: 9192960 | consumed tokens: 18827182080 | elapsed time per iteration (s): 0.24 | learning rate: 2.125E-05 | global batch size: 256 | lm loss: 3.707139E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1069.534 | TFLOPs: 27.25 | +7: iteration 35920/ 37905 | consumed samples: 9195520 | consumed tokens: 18832424960 | elapsed time per iteration (s): 0.23 | learning rate: 2.124E-05 | global batch size: 256 | lm loss: 3.684915E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.999 | TFLOPs: 28.68 | +7: iteration 35930/ 37905 | consumed samples: 9198080 | consumed tokens: 18837667840 | elapsed time per iteration (s): 0.23 | learning rate: 2.123E-05 | global batch size: 256 | lm loss: 3.702476E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1104.602 | TFLOPs: 28.14 | +7: iteration 35940/ 37905 | consumed samples: 9200640 | consumed tokens: 18842910720 | elapsed time per iteration (s): 0.23 | learning rate: 2.122E-05 | global batch size: 256 | lm loss: 3.686770E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.958 | TFLOPs: 28.30 | +7: iteration 35950/ 37905 | consumed samples: 9203200 | consumed tokens: 18848153600 | elapsed time per iteration (s): 0.23 | learning rate: 2.120E-05 | global batch size: 256 | lm loss: 3.690911E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.115 | TFLOPs: 28.89 | +7: iteration 35960/ 37905 | consumed samples: 9205760 | consumed tokens: 18853396480 | elapsed time per iteration (s): 0.23 | learning rate: 2.119E-05 | global batch size: 256 | lm loss: 3.690757E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.942 | TFLOPs: 28.89 | +7: iteration 35970/ 37905 | consumed samples: 9208320 | consumed tokens: 18858639360 | elapsed time per iteration (s): 0.22 | learning rate: 2.118E-05 | global batch size: 256 | lm loss: 3.689748E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.807 | TFLOPs: 29.22 | +7: iteration 35980/ 37905 | consumed samples: 9210880 | consumed tokens: 18863882240 | elapsed time per iteration (s): 0.22 | learning rate: 2.117E-05 | global batch size: 256 | lm loss: 3.689527E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.052 | TFLOPs: 29.20 | +7: iteration 35990/ 37905 | consumed samples: 9213440 | consumed tokens: 18869125120 | elapsed time per iteration (s): 0.23 | learning rate: 2.115E-05 | global batch size: 256 | lm loss: 3.688923E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.900 | TFLOPs: 28.50 | +0: [2023-03-16 00:11:44,414] [INFO] [logging.py:68:log_dist] [Rank 0] step=36000, skipped=0, lr=[2.1142230010800715e-05, 2.1142230010800715e-05, 2.1142230010800715e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +7: iteration 36000/ 37905 | consumed samples: 9216000 | consumed tokens: 18874368000 | elapsed time per iteration (s): 0.23 | learning rate: 2.114E-05 | global batch size: 256 | lm loss: 3.680216E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.227 | TFLOPs: 28.72 | +0: steps: 36000 loss: 3.6771 iter time (s): 0.226 samples/sec: 1131.617 +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 36000 | lm loss value: 3.712149E+00 | lm loss PPL: 4.094170E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 36000 to checkpoints_83m20b400m +0: [2023-03-16 00:11:44,506] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step36000 is begin to save! +0: [2023-03-16 00:11:44,509] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:11:44,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:11:44,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:11:44,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:11:44,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:11:44,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:11:44,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:11:44,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:11:44,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:11:44,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:11:44,637] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:11:44,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:11:44,648] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:11:44,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:11:44,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:11:44,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:11:44,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:11:44,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:11:44,682] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:11:44,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:11:44,693] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:11:44,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:11:44,705] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:11:44,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:11:44,706] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step36000/mp_rank_00_model_states.pt +0: [2023-03-16 00:11:44,706] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/mp_rank_00_model_states.pt... +0: [2023-03-16 00:11:44,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/mp_rank_00_model_states.pt. +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:11:44,725] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:11:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +2: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:11:44,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 00:11:44,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +3: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:11:44,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:11:44,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +6: [2023-03-16 00:11:44,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:11:44,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 00:11:44,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +7: [2023-03-16 00:11:44,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:11:44,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 00:11:44,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +1: [2023-03-16 00:11:44,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:11:44,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 00:11:44,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +5: [2023-03-16 00:11:44,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:11:44,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 00:11:44,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step36000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +4: [2023-03-16 00:11:44,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step36000 is ready now! +0: successfully saved checkpoint at iteration 36000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 310.74 +7: iteration 36010/ 37905 | consumed samples: 9218560 | consumed tokens: 18879610880 | elapsed time per iteration (s): 0.27 | learning rate: 2.113E-05 | global batch size: 256 | lm loss: 3.677317E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 953.048 | TFLOPs: 24.28 | +7: iteration 36020/ 37905 | consumed samples: 9221120 | consumed tokens: 18884853760 | elapsed time per iteration (s): 0.23 | learning rate: 2.112E-05 | global batch size: 256 | lm loss: 3.681898E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.397 | TFLOPs: 28.72 | +7: iteration 36030/ 37905 | consumed samples: 9223680 | consumed tokens: 18890096640 | elapsed time per iteration (s): 0.23 | learning rate: 2.111E-05 | global batch size: 256 | lm loss: 3.691203E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1113.732 | TFLOPs: 28.37 | +7: iteration 36040/ 37905 | consumed samples: 9226240 | consumed tokens: 18895339520 | elapsed time per iteration (s): 0.23 | learning rate: 2.109E-05 | global batch size: 256 | lm loss: 3.699359E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.590 | TFLOPs: 28.50 | +7: iteration 36050/ 37905 | consumed samples: 9228800 | consumed tokens: 18900582400 | elapsed time per iteration (s): 0.23 | learning rate: 2.108E-05 | global batch size: 256 | lm loss: 3.697357E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.275 | TFLOPs: 28.28 | +7: iteration 36060/ 37905 | consumed samples: 9231360 | consumed tokens: 18905825280 | elapsed time per iteration (s): 0.22 | learning rate: 2.107E-05 | global batch size: 256 | lm loss: 3.693887E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.615 | TFLOPs: 29.01 | +7: iteration 36070/ 37905 | consumed samples: 9233920 | consumed tokens: 18911068160 | elapsed time per iteration (s): 0.23 | learning rate: 2.106E-05 | global batch size: 256 | lm loss: 3.693883E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1103.574 | TFLOPs: 28.11 | +7: iteration 36080/ 37905 | consumed samples: 9236480 | consumed tokens: 18916311040 | elapsed time per iteration (s): 0.23 | learning rate: 2.105E-05 | global batch size: 256 | lm loss: 3.681726E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.356 | TFLOPs: 28.62 | +7: iteration 36090/ 37905 | consumed samples: 9239040 | consumed tokens: 18921553920 | elapsed time per iteration (s): 0.22 | learning rate: 2.104E-05 | global batch size: 256 | lm loss: 3.686319E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.917 | TFLOPs: 28.99 | +7: iteration 36100/ 37905 | consumed samples: 9241600 | consumed tokens: 18926796800 | elapsed time per iteration (s): 0.22 | learning rate: 2.103E-05 | global batch size: 256 | lm loss: 3.678566E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.597 | TFLOPs: 29.36 | +7: iteration 36110/ 37905 | consumed samples: 9244160 | consumed tokens: 18932039680 | elapsed time per iteration (s): 0.23 | learning rate: 2.101E-05 | global batch size: 256 | lm loss: 3.683665E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1112.485 | TFLOPs: 28.34 | +7: iteration 36120/ 37905 | consumed samples: 9246720 | consumed tokens: 18937282560 | elapsed time per iteration (s): 0.22 | learning rate: 2.100E-05 | global batch size: 256 | lm loss: 3.700175E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.294 | TFLOPs: 29.18 | +7: iteration 36130/ 37905 | consumed samples: 9249280 | consumed tokens: 18942525440 | elapsed time per iteration (s): 0.22 | learning rate: 2.099E-05 | global batch size: 256 | lm loss: 3.683683E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.248 | TFLOPs: 29.15 | +7: iteration 36140/ 37905 | consumed samples: 9251840 | consumed tokens: 18947768320 | elapsed time per iteration (s): 0.22 | learning rate: 2.098E-05 | global batch size: 256 | lm loss: 3.686453E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.930 | TFLOPs: 29.42 | +7: iteration 36150/ 37905 | consumed samples: 9254400 | consumed tokens: 18953011200 | elapsed time per iteration (s): 0.22 | learning rate: 2.097E-05 | global batch size: 256 | lm loss: 3.685275E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.993 | TFLOPs: 29.02 | +7: iteration 36160/ 37905 | consumed samples: 9256960 | consumed tokens: 18958254080 | elapsed time per iteration (s): 0.23 | learning rate: 2.096E-05 | global batch size: 256 | lm loss: 3.687806E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1097.075 | TFLOPs: 27.95 | +7: iteration 36170/ 37905 | consumed samples: 9259520 | consumed tokens: 18963496960 | elapsed time per iteration (s): 0.22 | learning rate: 2.095E-05 | global batch size: 256 | lm loss: 3.694406E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.025 | TFLOPs: 29.32 | +7: iteration 36180/ 37905 | consumed samples: 9262080 | consumed tokens: 18968739840 | elapsed time per iteration (s): 0.22 | learning rate: 2.094E-05 | global batch size: 256 | lm loss: 3.688264E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.346 | TFLOPs: 29.05 | +7: iteration 36190/ 37905 | consumed samples: 9264640 | consumed tokens: 18973982720 | elapsed time per iteration (s): 0.22 | learning rate: 2.093E-05 | global batch size: 256 | lm loss: 3.697744E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.800 | TFLOPs: 29.29 | +7: iteration 36200/ 37905 | consumed samples: 9267200 | consumed tokens: 18979225600 | elapsed time per iteration (s): 0.22 | learning rate: 2.092E-05 | global batch size: 256 | lm loss: 3.690975E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.931 | TFLOPs: 29.24 | +7: iteration 36210/ 37905 | consumed samples: 9269760 | consumed tokens: 18984468480 | elapsed time per iteration (s): 0.22 | learning rate: 2.090E-05 | global batch size: 256 | lm loss: 3.706404E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.470 | TFLOPs: 29.21 | +7: iteration 36220/ 37905 | consumed samples: 9272320 | consumed tokens: 18989711360 | elapsed time per iteration (s): 0.22 | learning rate: 2.089E-05 | global batch size: 256 | lm loss: 3.688055E+00 | grad norm: 0.209 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.082 | TFLOPs: 29.20 | +7: iteration 36230/ 37905 | consumed samples: 9274880 | consumed tokens: 18994954240 | elapsed time per iteration (s): 0.23 | learning rate: 2.088E-05 | global batch size: 256 | lm loss: 3.700908E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.122 | TFLOPs: 28.84 | +7: iteration 36240/ 37905 | consumed samples: 9277440 | consumed tokens: 19000197120 | elapsed time per iteration (s): 0.22 | learning rate: 2.087E-05 | global batch size: 256 | lm loss: 3.695217E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.557 | TFLOPs: 29.08 | +7: iteration 36250/ 37905 | consumed samples: 9280000 | consumed tokens: 19005440000 | elapsed time per iteration (s): 0.22 | learning rate: 2.086E-05 | global batch size: 256 | lm loss: 3.689727E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.991 | TFLOPs: 29.27 | +7: iteration 36260/ 37905 | consumed samples: 9282560 | consumed tokens: 19010682880 | elapsed time per iteration (s): 0.22 | learning rate: 2.085E-05 | global batch size: 256 | lm loss: 3.690801E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.407 | TFLOPs: 29.31 | +7: iteration 36270/ 37905 | consumed samples: 9285120 | consumed tokens: 19015925760 | elapsed time per iteration (s): 0.23 | learning rate: 2.084E-05 | global batch size: 256 | lm loss: 3.702753E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.061 | TFLOPs: 28.97 | +7: iteration 36280/ 37905 | consumed samples: 9287680 | consumed tokens: 19021168640 | elapsed time per iteration (s): 0.23 | learning rate: 2.083E-05 | global batch size: 256 | lm loss: 3.692248E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.265 | TFLOPs: 28.79 | +7: iteration 36290/ 37905 | consumed samples: 9290240 | consumed tokens: 19026411520 | elapsed time per iteration (s): 0.23 | learning rate: 2.082E-05 | global batch size: 256 | lm loss: 3.689009E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.135 | TFLOPs: 28.87 | +7: iteration 36300/ 37905 | consumed samples: 9292800 | consumed tokens: 19031654400 | elapsed time per iteration (s): 0.22 | learning rate: 2.081E-05 | global batch size: 256 | lm loss: 3.695205E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.019 | TFLOPs: 29.37 | +7: iteration 36310/ 37905 | consumed samples: 9295360 | consumed tokens: 19036897280 | elapsed time per iteration (s): 0.23 | learning rate: 2.080E-05 | global batch size: 256 | lm loss: 3.685341E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1091.642 | TFLOPs: 27.81 | +7: iteration 36320/ 37905 | consumed samples: 9297920 | consumed tokens: 19042140160 | elapsed time per iteration (s): 0.22 | learning rate: 2.079E-05 | global batch size: 256 | lm loss: 3.702357E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.075 | TFLOPs: 29.02 | +7: iteration 36330/ 37905 | consumed samples: 9300480 | consumed tokens: 19047383040 | elapsed time per iteration (s): 0.22 | learning rate: 2.078E-05 | global batch size: 256 | lm loss: 3.670208E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.879 | TFLOPs: 29.37 | +7: iteration 36340/ 37905 | consumed samples: 9303040 | consumed tokens: 19052625920 | elapsed time per iteration (s): 0.22 | learning rate: 2.077E-05 | global batch size: 256 | lm loss: 3.680928E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.218 | TFLOPs: 29.17 | +7: iteration 36350/ 37905 | consumed samples: 9305600 | consumed tokens: 19057868800 | elapsed time per iteration (s): 0.22 | learning rate: 2.076E-05 | global batch size: 256 | lm loss: 3.695716E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.060 | TFLOPs: 29.22 | +7: iteration 36360/ 37905 | consumed samples: 9308160 | consumed tokens: 19063111680 | elapsed time per iteration (s): 0.22 | learning rate: 2.075E-05 | global batch size: 256 | lm loss: 3.677826E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.418 | TFLOPs: 29.13 | +7: iteration 36370/ 37905 | consumed samples: 9310720 | consumed tokens: 19068354560 | elapsed time per iteration (s): 0.22 | learning rate: 2.074E-05 | global batch size: 256 | lm loss: 3.675444E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.847 | TFLOPs: 29.19 | +7: iteration 36380/ 37905 | consumed samples: 9313280 | consumed tokens: 19073597440 | elapsed time per iteration (s): 0.23 | learning rate: 2.073E-05 | global batch size: 256 | lm loss: 3.692886E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.918 | TFLOPs: 28.56 | +7: iteration 36390/ 37905 | consumed samples: 9315840 | consumed tokens: 19078840320 | elapsed time per iteration (s): 0.22 | learning rate: 2.072E-05 | global batch size: 256 | lm loss: 3.696190E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.308 | TFLOPs: 29.13 | +7: iteration 36400/ 37905 | consumed samples: 9318400 | consumed tokens: 19084083200 | elapsed time per iteration (s): 0.22 | learning rate: 2.071E-05 | global batch size: 256 | lm loss: 3.684406E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.899 | TFLOPs: 29.19 | +7: iteration 36410/ 37905 | consumed samples: 9320960 | consumed tokens: 19089326080 | elapsed time per iteration (s): 0.22 | learning rate: 2.070E-05 | global batch size: 256 | lm loss: 3.686701E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.959 | TFLOPs: 29.02 | +7: iteration 36420/ 37905 | consumed samples: 9323520 | consumed tokens: 19094568960 | elapsed time per iteration (s): 0.22 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.686132E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.581 | TFLOPs: 29.31 | +7: iteration 36430/ 37905 | consumed samples: 9326080 | consumed tokens: 19099811840 | elapsed time per iteration (s): 0.23 | learning rate: 2.069E-05 | global batch size: 256 | lm loss: 3.687900E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.555 | TFLOPs: 28.60 | +7: iteration 36440/ 37905 | consumed samples: 9328640 | consumed tokens: 19105054720 | elapsed time per iteration (s): 0.22 | learning rate: 2.068E-05 | global batch size: 256 | lm loss: 3.682838E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.721 | TFLOPs: 29.29 | +7: iteration 36450/ 37905 | consumed samples: 9331200 | consumed tokens: 19110297600 | elapsed time per iteration (s): 0.22 | learning rate: 2.067E-05 | global batch size: 256 | lm loss: 3.678786E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.635 | TFLOPs: 29.08 | +7: iteration 36460/ 37905 | consumed samples: 9333760 | consumed tokens: 19115540480 | elapsed time per iteration (s): 0.22 | learning rate: 2.066E-05 | global batch size: 256 | lm loss: 3.693052E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.409 | TFLOPs: 29.15 | +7: iteration 36470/ 37905 | consumed samples: 9336320 | consumed tokens: 19120783360 | elapsed time per iteration (s): 0.22 | learning rate: 2.065E-05 | global batch size: 256 | lm loss: 3.689747E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.593 | TFLOPs: 29.18 | +7: iteration 36480/ 37905 | consumed samples: 9338880 | consumed tokens: 19126026240 | elapsed time per iteration (s): 0.22 | learning rate: 2.064E-05 | global batch size: 256 | lm loss: 3.690467E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.916 | TFLOPs: 29.04 | +7: iteration 36490/ 37905 | consumed samples: 9341440 | consumed tokens: 19131269120 | elapsed time per iteration (s): 0.22 | learning rate: 2.063E-05 | global batch size: 256 | lm loss: 3.679140E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.579 | TFLOPs: 29.34 | +7: iteration 36500/ 37905 | consumed samples: 9344000 | consumed tokens: 19136512000 | elapsed time per iteration (s): 0.22 | learning rate: 2.062E-05 | global batch size: 256 | lm loss: 3.675832E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.493 | TFLOPs: 29.36 | +7: iteration 36510/ 37905 | consumed samples: 9346560 | consumed tokens: 19141754880 | elapsed time per iteration (s): 0.22 | learning rate: 2.061E-05 | global batch size: 256 | lm loss: 3.684749E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.741 | TFLOPs: 29.04 | +7: iteration 36520/ 37905 | consumed samples: 9349120 | consumed tokens: 19146997760 | elapsed time per iteration (s): 0.22 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.687072E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.816 | TFLOPs: 29.32 | +7: iteration 36530/ 37905 | consumed samples: 9351680 | consumed tokens: 19152240640 | elapsed time per iteration (s): 0.22 | learning rate: 2.060E-05 | global batch size: 256 | lm loss: 3.699862E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.973 | TFLOPs: 29.30 | +7: iteration 36540/ 37905 | consumed samples: 9354240 | consumed tokens: 19157483520 | elapsed time per iteration (s): 0.22 | learning rate: 2.059E-05 | global batch size: 256 | lm loss: 3.683347E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.739 | TFLOPs: 29.06 | +7: iteration 36550/ 37905 | consumed samples: 9356800 | consumed tokens: 19162726400 | elapsed time per iteration (s): 0.22 | learning rate: 2.058E-05 | global batch size: 256 | lm loss: 3.688390E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.710 | TFLOPs: 29.09 | +7: iteration 36560/ 37905 | consumed samples: 9359360 | consumed tokens: 19167969280 | elapsed time per iteration (s): 0.22 | learning rate: 2.057E-05 | global batch size: 256 | lm loss: 3.685057E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.554 | TFLOPs: 29.00 | +7: iteration 36570/ 37905 | consumed samples: 9361920 | consumed tokens: 19173212160 | elapsed time per iteration (s): 0.23 | learning rate: 2.056E-05 | global batch size: 256 | lm loss: 3.695377E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.586 | TFLOPs: 28.95 | +7: iteration 36580/ 37905 | consumed samples: 9364480 | consumed tokens: 19178455040 | elapsed time per iteration (s): 0.23 | learning rate: 2.055E-05 | global batch size: 256 | lm loss: 3.678041E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.126 | TFLOPs: 28.61 | +7: iteration 36590/ 37905 | consumed samples: 9367040 | consumed tokens: 19183697920 | elapsed time per iteration (s): 0.23 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.714393E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.735 | TFLOPs: 28.93 | +7: iteration 36600/ 37905 | consumed samples: 9369600 | consumed tokens: 19188940800 | elapsed time per iteration (s): 0.23 | learning rate: 2.054E-05 | global batch size: 256 | lm loss: 3.699698E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1119.772 | TFLOPs: 28.53 | +7: iteration 36610/ 37905 | consumed samples: 9372160 | consumed tokens: 19194183680 | elapsed time per iteration (s): 0.22 | learning rate: 2.053E-05 | global batch size: 256 | lm loss: 3.690060E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.448 | TFLOPs: 29.05 | +7: iteration 36620/ 37905 | consumed samples: 9374720 | consumed tokens: 19199426560 | elapsed time per iteration (s): 0.22 | learning rate: 2.052E-05 | global batch size: 256 | lm loss: 3.689719E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.613 | TFLOPs: 29.44 | +7: iteration 36630/ 37905 | consumed samples: 9377280 | consumed tokens: 19204669440 | elapsed time per iteration (s): 0.22 | learning rate: 2.051E-05 | global batch size: 256 | lm loss: 3.683881E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.461 | TFLOPs: 29.05 | +7: iteration 36640/ 37905 | consumed samples: 9379840 | consumed tokens: 19209912320 | elapsed time per iteration (s): 0.23 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.689925E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.206 | TFLOPs: 28.87 | +7: iteration 36650/ 37905 | consumed samples: 9382400 | consumed tokens: 19215155200 | elapsed time per iteration (s): 0.22 | learning rate: 2.050E-05 | global batch size: 256 | lm loss: 3.695701E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.869 | TFLOPs: 29.34 | +7: iteration 36660/ 37905 | consumed samples: 9384960 | consumed tokens: 19220398080 | elapsed time per iteration (s): 0.23 | learning rate: 2.049E-05 | global batch size: 256 | lm loss: 3.688179E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1089.473 | TFLOPs: 27.75 | +7: iteration 36670/ 37905 | consumed samples: 9387520 | consumed tokens: 19225640960 | elapsed time per iteration (s): 0.23 | learning rate: 2.048E-05 | global batch size: 256 | lm loss: 3.699403E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1122.900 | TFLOPs: 28.61 | +7: iteration 36680/ 37905 | consumed samples: 9390080 | consumed tokens: 19230883840 | elapsed time per iteration (s): 0.22 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.671212E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.211 | TFLOPs: 29.30 | +7: iteration 36690/ 37905 | consumed samples: 9392640 | consumed tokens: 19236126720 | elapsed time per iteration (s): 0.23 | learning rate: 2.047E-05 | global batch size: 256 | lm loss: 3.684849E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1117.201 | TFLOPs: 28.46 | +7: iteration 36700/ 37905 | consumed samples: 9395200 | consumed tokens: 19241369600 | elapsed time per iteration (s): 0.23 | learning rate: 2.046E-05 | global batch size: 256 | lm loss: 3.688374E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.863 | TFLOPs: 28.78 | +7: iteration 36710/ 37905 | consumed samples: 9397760 | consumed tokens: 19246612480 | elapsed time per iteration (s): 0.22 | learning rate: 2.045E-05 | global batch size: 256 | lm loss: 3.687590E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.620 | TFLOPs: 29.16 | +7: iteration 36720/ 37905 | consumed samples: 9400320 | consumed tokens: 19251855360 | elapsed time per iteration (s): 0.23 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.687830E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.065 | TFLOPs: 28.89 | +7: iteration 36730/ 37905 | consumed samples: 9402880 | consumed tokens: 19257098240 | elapsed time per iteration (s): 0.23 | learning rate: 2.044E-05 | global batch size: 256 | lm loss: 3.678556E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.341 | TFLOPs: 28.97 | +7: iteration 36740/ 37905 | consumed samples: 9405440 | consumed tokens: 19262341120 | elapsed time per iteration (s): 0.23 | learning rate: 2.043E-05 | global batch size: 256 | lm loss: 3.665319E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1120.860 | TFLOPs: 28.55 | +7: iteration 36750/ 37905 | consumed samples: 9408000 | consumed tokens: 19267584000 | elapsed time per iteration (s): 0.23 | learning rate: 2.042E-05 | global batch size: 256 | lm loss: 3.681150E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1118.347 | TFLOPs: 28.49 | +7: iteration 36760/ 37905 | consumed samples: 9410560 | consumed tokens: 19272826880 | elapsed time per iteration (s): 0.23 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.666608E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1133.196 | TFLOPs: 28.87 | +7: iteration 36770/ 37905 | consumed samples: 9413120 | consumed tokens: 19278069760 | elapsed time per iteration (s): 0.23 | learning rate: 2.041E-05 | global batch size: 256 | lm loss: 3.692715E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.947 | TFLOPs: 28.81 | +7: iteration 36780/ 37905 | consumed samples: 9415680 | consumed tokens: 19283312640 | elapsed time per iteration (s): 0.23 | learning rate: 2.040E-05 | global batch size: 256 | lm loss: 3.682910E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1125.383 | TFLOPs: 28.67 | +7: iteration 36790/ 37905 | consumed samples: 9418240 | consumed tokens: 19288555520 | elapsed time per iteration (s): 0.23 | learning rate: 2.039E-05 | global batch size: 256 | lm loss: 3.687926E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.955 | TFLOPs: 28.76 | +7: iteration 36800/ 37905 | consumed samples: 9420800 | consumed tokens: 19293798400 | elapsed time per iteration (s): 0.23 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.687103E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.535 | TFLOPs: 28.24 | +7: iteration 36810/ 37905 | consumed samples: 9423360 | consumed tokens: 19299041280 | elapsed time per iteration (s): 0.22 | learning rate: 2.038E-05 | global batch size: 256 | lm loss: 3.686457E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.375 | TFLOPs: 29.00 | +7: iteration 36820/ 37905 | consumed samples: 9425920 | consumed tokens: 19304284160 | elapsed time per iteration (s): 0.22 | learning rate: 2.037E-05 | global batch size: 256 | lm loss: 3.686511E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.514 | TFLOPs: 29.39 | +7: iteration 36830/ 37905 | consumed samples: 9428480 | consumed tokens: 19309527040 | elapsed time per iteration (s): 0.23 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.690842E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.771 | TFLOPs: 28.81 | +7: iteration 36840/ 37905 | consumed samples: 9431040 | consumed tokens: 19314769920 | elapsed time per iteration (s): 0.22 | learning rate: 2.036E-05 | global batch size: 256 | lm loss: 3.680743E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.573 | TFLOPs: 29.01 | +7: iteration 36850/ 37905 | consumed samples: 9433600 | consumed tokens: 19320012800 | elapsed time per iteration (s): 0.22 | learning rate: 2.035E-05 | global batch size: 256 | lm loss: 3.682006E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.109 | TFLOPs: 29.40 | +7: iteration 36860/ 37905 | consumed samples: 9436160 | consumed tokens: 19325255680 | elapsed time per iteration (s): 0.22 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.692780E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.445 | TFLOPs: 29.15 | +7: iteration 36870/ 37905 | consumed samples: 9438720 | consumed tokens: 19330498560 | elapsed time per iteration (s): 0.22 | learning rate: 2.034E-05 | global batch size: 256 | lm loss: 3.691840E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.467 | TFLOPs: 29.18 | +7: iteration 36880/ 37905 | consumed samples: 9441280 | consumed tokens: 19335741440 | elapsed time per iteration (s): 0.22 | learning rate: 2.033E-05 | global batch size: 256 | lm loss: 3.682218E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.537 | TFLOPs: 29.00 | +7: iteration 36890/ 37905 | consumed samples: 9443840 | consumed tokens: 19340984320 | elapsed time per iteration (s): 0.22 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.686371E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.084 | TFLOPs: 29.15 | +7: iteration 36900/ 37905 | consumed samples: 9446400 | consumed tokens: 19346227200 | elapsed time per iteration (s): 0.22 | learning rate: 2.032E-05 | global batch size: 256 | lm loss: 3.676078E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.263 | TFLOPs: 29.35 | +7: iteration 36910/ 37905 | consumed samples: 9448960 | consumed tokens: 19351470080 | elapsed time per iteration (s): 0.23 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.685662E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.855 | TFLOPs: 28.78 | +7: iteration 36920/ 37905 | consumed samples: 9451520 | consumed tokens: 19356712960 | elapsed time per iteration (s): 0.22 | learning rate: 2.031E-05 | global batch size: 256 | lm loss: 3.685484E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.126 | TFLOPs: 28.99 | +7: iteration 36930/ 37905 | consumed samples: 9454080 | consumed tokens: 19361955840 | elapsed time per iteration (s): 0.23 | learning rate: 2.030E-05 | global batch size: 256 | lm loss: 3.685173E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.963 | TFLOPs: 28.96 | +7: iteration 36940/ 37905 | consumed samples: 9456640 | consumed tokens: 19367198720 | elapsed time per iteration (s): 0.22 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.694548E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.875 | TFLOPs: 29.27 | +7: iteration 36950/ 37905 | consumed samples: 9459200 | consumed tokens: 19372441600 | elapsed time per iteration (s): 0.22 | learning rate: 2.029E-05 | global batch size: 256 | lm loss: 3.684616E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.821 | TFLOPs: 29.09 | +7: iteration 36960/ 37905 | consumed samples: 9461760 | consumed tokens: 19377684480 | elapsed time per iteration (s): 0.23 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.687372E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1134.421 | TFLOPs: 28.90 | +7: iteration 36970/ 37905 | consumed samples: 9464320 | consumed tokens: 19382927360 | elapsed time per iteration (s): 0.22 | learning rate: 2.028E-05 | global batch size: 256 | lm loss: 3.684997E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.965 | TFLOPs: 29.19 | +7: iteration 36980/ 37905 | consumed samples: 9466880 | consumed tokens: 19388170240 | elapsed time per iteration (s): 0.23 | learning rate: 2.027E-05 | global batch size: 256 | lm loss: 3.696020E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.224 | TFLOPs: 28.82 | +7: iteration 36990/ 37905 | consumed samples: 9469440 | consumed tokens: 19393413120 | elapsed time per iteration (s): 0.23 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.694213E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1128.934 | TFLOPs: 28.76 | +7: iteration 37000/ 37905 | consumed samples: 9472000 | consumed tokens: 19398656000 | elapsed time per iteration (s): 0.23 | learning rate: 2.026E-05 | global batch size: 256 | lm loss: 3.699011E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.366 | TFLOPs: 28.72 | +7: ------------------------------------------------------------------------------------------------ +7: validation loss at iteration 37000 | lm loss value: 3.677787E+00 | lm loss PPL: 3.955874E+01 | +7: ------------------------------------------------------------------------------------------------ +0: saving checkpoint at iteration 37000 to checkpoints_83m20b400m +0: [2023-03-16 00:15:30,165] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step37000 is begin to save! +0: [2023-03-16 00:15:30,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:15:30,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:15:30,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:15:30,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:15:30,256] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:15:30,267] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:15:30,267] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:15:30,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:15:30,278] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:15:30,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:15:30,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:15:30,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:15:30,301] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:15:30,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:15:30,312] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:15:30,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:15:30,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:15:30,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:15:30,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:15:30,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:15:30,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:15:30,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:15:30,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:15:30,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:15:30,358] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step37000/mp_rank_00_model_states.pt +0: [2023-03-16 00:15:30,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/mp_rank_00_model_states.pt... +0: [2023-03-16 00:15:30,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/mp_rank_00_model_states.pt. +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:15:30,377] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +6: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +4: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +6: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +4: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +2: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +2: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +1: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +5: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +3: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:15:30,406] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +7: [2023-03-16 00:15:30,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:15:30,407] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +7: [2023-03-16 00:15:30,407] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37000 is ready now! +0: successfully saved checkpoint at iteration 37000 to checkpoints_83m20b400m +7: time (ms) | save-checkpoint: 245.13 +7: iteration 37010/ 37905 | consumed samples: 9474560 | consumed tokens: 19403898880 | elapsed time per iteration (s): 0.25 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.684177E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1012.120 | TFLOPs: 25.78 | +7: iteration 37020/ 37905 | consumed samples: 9477120 | consumed tokens: 19409141760 | elapsed time per iteration (s): 0.23 | learning rate: 2.025E-05 | global batch size: 256 | lm loss: 3.691692E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.579 | TFLOPs: 28.70 | +7: iteration 37030/ 37905 | consumed samples: 9479680 | consumed tokens: 19414384640 | elapsed time per iteration (s): 0.22 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.699360E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.738 | TFLOPs: 29.09 | +7: iteration 37040/ 37905 | consumed samples: 9482240 | consumed tokens: 19419627520 | elapsed time per iteration (s): 0.23 | learning rate: 2.024E-05 | global batch size: 256 | lm loss: 3.699234E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.850 | TFLOPs: 28.96 | +7: iteration 37050/ 37905 | consumed samples: 9484800 | consumed tokens: 19424870400 | elapsed time per iteration (s): 0.22 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.671251E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.525 | TFLOPs: 29.31 | +7: iteration 37060/ 37905 | consumed samples: 9487360 | consumed tokens: 19430113280 | elapsed time per iteration (s): 0.22 | learning rate: 2.023E-05 | global batch size: 256 | lm loss: 3.683720E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.249 | TFLOPs: 29.07 | +7: iteration 37070/ 37905 | consumed samples: 9489920 | consumed tokens: 19435356160 | elapsed time per iteration (s): 0.23 | learning rate: 2.022E-05 | global batch size: 256 | lm loss: 3.706766E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.715 | TFLOPs: 28.65 | +7: iteration 37080/ 37905 | consumed samples: 9492480 | consumed tokens: 19440599040 | elapsed time per iteration (s): 0.22 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.696066E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.281 | TFLOPs: 29.10 | +7: iteration 37090/ 37905 | consumed samples: 9495040 | consumed tokens: 19445841920 | elapsed time per iteration (s): 0.22 | learning rate: 2.021E-05 | global batch size: 256 | lm loss: 3.679822E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.948 | TFLOPs: 28.99 | +7: iteration 37100/ 37905 | consumed samples: 9497600 | consumed tokens: 19451084800 | elapsed time per iteration (s): 0.23 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.683275E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1130.494 | TFLOPs: 28.80 | +7: iteration 37110/ 37905 | consumed samples: 9500160 | consumed tokens: 19456327680 | elapsed time per iteration (s): 0.23 | learning rate: 2.020E-05 | global batch size: 256 | lm loss: 3.680961E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.594 | TFLOPs: 28.98 | +7: iteration 37120/ 37905 | consumed samples: 9502720 | consumed tokens: 19461570560 | elapsed time per iteration (s): 0.23 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.689721E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.774 | TFLOPs: 28.73 | +7: iteration 37130/ 37905 | consumed samples: 9505280 | consumed tokens: 19466813440 | elapsed time per iteration (s): 0.23 | learning rate: 2.019E-05 | global batch size: 256 | lm loss: 3.696782E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1123.218 | TFLOPs: 28.61 | +7: iteration 37140/ 37905 | consumed samples: 9507840 | consumed tokens: 19472056320 | elapsed time per iteration (s): 0.23 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.690985E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.099 | TFLOPs: 28.28 | +7: iteration 37150/ 37905 | consumed samples: 9510400 | consumed tokens: 19477299200 | elapsed time per iteration (s): 0.23 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.692928E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.656 | TFLOPs: 28.83 | +7: iteration 37160/ 37905 | consumed samples: 9512960 | consumed tokens: 19482542080 | elapsed time per iteration (s): 0.22 | learning rate: 2.018E-05 | global batch size: 256 | lm loss: 3.691690E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.404 | TFLOPs: 29.10 | +7: iteration 37170/ 37905 | consumed samples: 9515520 | consumed tokens: 19487784960 | elapsed time per iteration (s): 0.22 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.679459E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1145.887 | TFLOPs: 29.19 | +7: iteration 37180/ 37905 | consumed samples: 9518080 | consumed tokens: 19493027840 | elapsed time per iteration (s): 0.23 | learning rate: 2.017E-05 | global batch size: 256 | lm loss: 3.679171E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.642 | TFLOPs: 28.93 | +7: iteration 37190/ 37905 | consumed samples: 9520640 | consumed tokens: 19498270720 | elapsed time per iteration (s): 0.22 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.689073E+00 | grad norm: 0.206 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.906 | TFLOPs: 29.12 | +7: iteration 37200/ 37905 | consumed samples: 9523200 | consumed tokens: 19503513600 | elapsed time per iteration (s): 0.22 | learning rate: 2.016E-05 | global batch size: 256 | lm loss: 3.691517E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.422 | TFLOPs: 29.05 | +7: iteration 37210/ 37905 | consumed samples: 9525760 | consumed tokens: 19508756480 | elapsed time per iteration (s): 0.22 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.690752E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.660 | TFLOPs: 29.03 | +7: iteration 37220/ 37905 | consumed samples: 9528320 | consumed tokens: 19513999360 | elapsed time per iteration (s): 0.23 | learning rate: 2.015E-05 | global batch size: 256 | lm loss: 3.671384E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.218 | TFLOPs: 28.97 | +7: iteration 37230/ 37905 | consumed samples: 9530880 | consumed tokens: 19519242240 | elapsed time per iteration (s): 0.22 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.693882E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.810 | TFLOPs: 29.32 | +7: iteration 37240/ 37905 | consumed samples: 9533440 | consumed tokens: 19524485120 | elapsed time per iteration (s): 0.23 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.699586E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1129.264 | TFLOPs: 28.77 | +7: iteration 37250/ 37905 | consumed samples: 9536000 | consumed tokens: 19529728000 | elapsed time per iteration (s): 0.23 | learning rate: 2.014E-05 | global batch size: 256 | lm loss: 3.686335E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.259 | TFLOPs: 28.72 | +7: iteration 37260/ 37905 | consumed samples: 9538560 | consumed tokens: 19534970880 | elapsed time per iteration (s): 0.23 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.697060E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1116.166 | TFLOPs: 28.43 | +7: iteration 37270/ 37905 | consumed samples: 9541120 | consumed tokens: 19540213760 | elapsed time per iteration (s): 0.22 | learning rate: 2.013E-05 | global batch size: 256 | lm loss: 3.688512E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.528 | TFLOPs: 29.13 | +7: iteration 37280/ 37905 | consumed samples: 9543680 | consumed tokens: 19545456640 | elapsed time per iteration (s): 0.23 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.690815E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1106.274 | TFLOPs: 28.18 | +7: iteration 37290/ 37905 | consumed samples: 9546240 | consumed tokens: 19550699520 | elapsed time per iteration (s): 0.24 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.676572E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1051.873 | TFLOPs: 26.80 | +7: iteration 37300/ 37905 | consumed samples: 9548800 | consumed tokens: 19555942400 | elapsed time per iteration (s): 0.23 | learning rate: 2.012E-05 | global batch size: 256 | lm loss: 3.691796E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.574 | TFLOPs: 28.85 | +7: iteration 37310/ 37905 | consumed samples: 9551360 | consumed tokens: 19561185280 | elapsed time per iteration (s): 0.23 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.674722E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1132.696 | TFLOPs: 28.86 | +7: iteration 37320/ 37905 | consumed samples: 9553920 | consumed tokens: 19566428160 | elapsed time per iteration (s): 0.22 | learning rate: 2.011E-05 | global batch size: 256 | lm loss: 3.693681E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.698 | TFLOPs: 29.42 | +7: iteration 37330/ 37905 | consumed samples: 9556480 | consumed tokens: 19571671040 | elapsed time per iteration (s): 0.22 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.683284E+00 | grad norm: 0.217 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.638 | TFLOPs: 29.11 | +7: iteration 37340/ 37905 | consumed samples: 9559040 | consumed tokens: 19576913920 | elapsed time per iteration (s): 0.23 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.692588E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1121.120 | TFLOPs: 28.56 | +7: iteration 37350/ 37905 | consumed samples: 9561600 | consumed tokens: 19582156800 | elapsed time per iteration (s): 0.23 | learning rate: 2.010E-05 | global batch size: 256 | lm loss: 3.681297E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1090.352 | TFLOPs: 27.78 | +7: iteration 37360/ 37905 | consumed samples: 9564160 | consumed tokens: 19587399680 | elapsed time per iteration (s): 0.23 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.704680E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1110.741 | TFLOPs: 28.30 | +7: iteration 37370/ 37905 | consumed samples: 9566720 | consumed tokens: 19592642560 | elapsed time per iteration (s): 0.22 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.683165E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.589 | TFLOPs: 29.13 | +7: iteration 37380/ 37905 | consumed samples: 9569280 | consumed tokens: 19597885440 | elapsed time per iteration (s): 0.22 | learning rate: 2.009E-05 | global batch size: 256 | lm loss: 3.687438E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.618 | TFLOPs: 29.08 | +7: iteration 37390/ 37905 | consumed samples: 9571840 | consumed tokens: 19603128320 | elapsed time per iteration (s): 0.25 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.678082E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1042.427 | TFLOPs: 26.56 | +7: iteration 37400/ 37905 | consumed samples: 9574400 | consumed tokens: 19608371200 | elapsed time per iteration (s): 0.24 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.694447E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1084.220 | TFLOPs: 27.62 | +7: iteration 37410/ 37905 | consumed samples: 9576960 | consumed tokens: 19613614080 | elapsed time per iteration (s): 0.22 | learning rate: 2.008E-05 | global batch size: 256 | lm loss: 3.683956E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.579 | TFLOPs: 29.36 | +7: iteration 37420/ 37905 | consumed samples: 9579520 | consumed tokens: 19618856960 | elapsed time per iteration (s): 0.22 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.681425E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.274 | TFLOPs: 29.33 | +7: iteration 37430/ 37905 | consumed samples: 9582080 | consumed tokens: 19624099840 | elapsed time per iteration (s): 0.22 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.681572E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.413 | TFLOPs: 29.08 | +7: iteration 37440/ 37905 | consumed samples: 9584640 | consumed tokens: 19629342720 | elapsed time per iteration (s): 0.22 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.688024E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.658 | TFLOPs: 29.36 | +7: iteration 37450/ 37905 | consumed samples: 9587200 | consumed tokens: 19634585600 | elapsed time per iteration (s): 0.22 | learning rate: 2.007E-05 | global batch size: 256 | lm loss: 3.678676E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.202 | TFLOPs: 29.43 | +7: iteration 37460/ 37905 | consumed samples: 9589760 | consumed tokens: 19639828480 | elapsed time per iteration (s): 0.22 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.684047E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1139.295 | TFLOPs: 29.02 | +7: iteration 37470/ 37905 | consumed samples: 9592320 | consumed tokens: 19645071360 | elapsed time per iteration (s): 0.22 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.697086E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.569 | TFLOPs: 29.11 | +7: iteration 37480/ 37905 | consumed samples: 9594880 | consumed tokens: 19650314240 | elapsed time per iteration (s): 0.22 | learning rate: 2.006E-05 | global batch size: 256 | lm loss: 3.682874E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.642 | TFLOPs: 29.39 | +7: iteration 37490/ 37905 | consumed samples: 9597440 | consumed tokens: 19655557120 | elapsed time per iteration (s): 0.22 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.682404E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.840 | TFLOPs: 29.42 | +7: iteration 37500/ 37905 | consumed samples: 9600000 | consumed tokens: 19660800000 | elapsed time per iteration (s): 0.22 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.691506E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.855 | TFLOPs: 29.32 | +7: iteration 37510/ 37905 | consumed samples: 9602560 | consumed tokens: 19666042880 | elapsed time per iteration (s): 0.23 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.683309E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.869 | TFLOPs: 28.66 | +7: iteration 37520/ 37905 | consumed samples: 9605120 | consumed tokens: 19671285760 | elapsed time per iteration (s): 0.23 | learning rate: 2.005E-05 | global batch size: 256 | lm loss: 3.679476E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1126.115 | TFLOPs: 28.69 | +7: iteration 37530/ 37905 | consumed samples: 9607680 | consumed tokens: 19676528640 | elapsed time per iteration (s): 0.22 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.675459E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.006 | TFLOPs: 29.27 | +7: iteration 37540/ 37905 | consumed samples: 9610240 | consumed tokens: 19681771520 | elapsed time per iteration (s): 0.22 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.679925E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.681 | TFLOPs: 29.29 | +7: iteration 37550/ 37905 | consumed samples: 9612800 | consumed tokens: 19687014400 | elapsed time per iteration (s): 0.22 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.676271E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1153.396 | TFLOPs: 29.38 | +7: iteration 37560/ 37905 | consumed samples: 9615360 | consumed tokens: 19692257280 | elapsed time per iteration (s): 0.22 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.686637E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.725 | TFLOPs: 29.37 | +7: iteration 37570/ 37905 | consumed samples: 9617920 | consumed tokens: 19697500160 | elapsed time per iteration (s): 0.23 | learning rate: 2.004E-05 | global batch size: 256 | lm loss: 3.695964E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1108.190 | TFLOPs: 28.23 | +7: iteration 37580/ 37905 | consumed samples: 9620480 | consumed tokens: 19702743040 | elapsed time per iteration (s): 0.22 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.699043E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.147 | TFLOPs: 29.10 | +7: iteration 37590/ 37905 | consumed samples: 9623040 | consumed tokens: 19707985920 | elapsed time per iteration (s): 0.23 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.687690E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.332 | TFLOPs: 28.82 | +7: iteration 37600/ 37905 | consumed samples: 9625600 | consumed tokens: 19713228800 | elapsed time per iteration (s): 0.22 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.681830E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.902 | TFLOPs: 29.09 | +7: iteration 37610/ 37905 | consumed samples: 9628160 | consumed tokens: 19718471680 | elapsed time per iteration (s): 0.22 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.691315E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1143.623 | TFLOPs: 29.13 | +7: iteration 37620/ 37905 | consumed samples: 9630720 | consumed tokens: 19723714560 | elapsed time per iteration (s): 0.22 | learning rate: 2.003E-05 | global batch size: 256 | lm loss: 3.683908E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.990 | TFLOPs: 29.35 | +7: iteration 37630/ 37905 | consumed samples: 9633280 | consumed tokens: 19728957440 | elapsed time per iteration (s): 0.23 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.691509E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1127.314 | TFLOPs: 28.72 | +7: iteration 37640/ 37905 | consumed samples: 9635840 | consumed tokens: 19734200320 | elapsed time per iteration (s): 0.22 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.676642E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.331 | TFLOPs: 29.33 | +7: iteration 37650/ 37905 | consumed samples: 9638400 | consumed tokens: 19739443200 | elapsed time per iteration (s): 0.23 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.692892E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.186 | TFLOPs: 28.94 | +7: iteration 37660/ 37905 | consumed samples: 9640960 | consumed tokens: 19744686080 | elapsed time per iteration (s): 0.23 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.677682E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.531 | TFLOPs: 28.93 | +7: iteration 37670/ 37905 | consumed samples: 9643520 | consumed tokens: 19749928960 | elapsed time per iteration (s): 0.23 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.694227E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1124.566 | TFLOPs: 28.65 | +7: iteration 37680/ 37905 | consumed samples: 9646080 | consumed tokens: 19755171840 | elapsed time per iteration (s): 0.23 | learning rate: 2.002E-05 | global batch size: 256 | lm loss: 3.689448E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.157 | TFLOPs: 28.92 | +7: iteration 37690/ 37905 | consumed samples: 9648640 | consumed tokens: 19760414720 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.679867E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1149.729 | TFLOPs: 29.29 | +7: iteration 37700/ 37905 | consumed samples: 9651200 | consumed tokens: 19765657600 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.690173E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.797 | TFLOPs: 29.37 | +7: iteration 37710/ 37905 | consumed samples: 9653760 | consumed tokens: 19770900480 | elapsed time per iteration (s): 0.23 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.691937E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1136.018 | TFLOPs: 28.94 | +7: iteration 37720/ 37905 | consumed samples: 9656320 | consumed tokens: 19776143360 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.702235E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1141.357 | TFLOPs: 29.08 | +7: iteration 37730/ 37905 | consumed samples: 9658880 | consumed tokens: 19781386240 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.673969E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1146.295 | TFLOPs: 29.20 | +7: iteration 37740/ 37905 | consumed samples: 9661440 | consumed tokens: 19786629120 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.683495E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.651 | TFLOPs: 29.31 | +7: iteration 37750/ 37905 | consumed samples: 9664000 | consumed tokens: 19791872000 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.679488E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1147.412 | TFLOPs: 29.23 | +7: iteration 37760/ 37905 | consumed samples: 9666560 | consumed tokens: 19797114880 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.698267E+00 | grad norm: 0.190 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1142.467 | TFLOPs: 29.10 | +7: iteration 37770/ 37905 | consumed samples: 9669120 | consumed tokens: 19802357760 | elapsed time per iteration (s): 0.22 | learning rate: 2.001E-05 | global batch size: 256 | lm loss: 3.687330E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.504 | TFLOPs: 29.36 | +7: iteration 37780/ 37905 | consumed samples: 9671680 | consumed tokens: 19807600640 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.699572E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1135.546 | TFLOPs: 28.93 | +7: iteration 37790/ 37905 | consumed samples: 9674240 | consumed tokens: 19812843520 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.686530E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1151.766 | TFLOPs: 29.34 | +7: iteration 37800/ 37905 | consumed samples: 9676800 | consumed tokens: 19818086400 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.694067E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1152.383 | TFLOPs: 29.36 | +7: iteration 37810/ 37905 | consumed samples: 9679360 | consumed tokens: 19823329280 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.704834E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1148.282 | TFLOPs: 29.25 | +7: iteration 37820/ 37905 | consumed samples: 9681920 | consumed tokens: 19828572160 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.697397E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1140.285 | TFLOPs: 29.05 | +7: iteration 37830/ 37905 | consumed samples: 9684480 | consumed tokens: 19833815040 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.703952E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1131.978 | TFLOPs: 28.84 | +7: iteration 37840/ 37905 | consumed samples: 9687040 | consumed tokens: 19839057920 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.681824E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1144.002 | TFLOPs: 29.14 | +7: iteration 37850/ 37905 | consumed samples: 9689600 | consumed tokens: 19844300800 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.687146E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1155.614 | TFLOPs: 29.44 | +7: iteration 37860/ 37905 | consumed samples: 9692160 | consumed tokens: 19849543680 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.693908E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1150.943 | TFLOPs: 29.32 | +7: iteration 37870/ 37905 | consumed samples: 9694720 | consumed tokens: 19854786560 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.688554E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1154.681 | TFLOPs: 29.42 | +7: iteration 37880/ 37905 | consumed samples: 9697280 | consumed tokens: 19860029440 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.687076E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.391 | TFLOPs: 28.98 | +7: iteration 37890/ 37905 | consumed samples: 9699840 | consumed tokens: 19865272320 | elapsed time per iteration (s): 0.22 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.699013E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1138.678 | TFLOPs: 29.01 | +7: iteration 37900/ 37905 | consumed samples: 9702400 | consumed tokens: 19870515200 | elapsed time per iteration (s): 0.23 | learning rate: 2.000E-05 | global batch size: 256 | lm loss: 3.672169E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 1137.579 | TFLOPs: 28.98 | +0: [after training is done] datetime: 2023-03-16 00:18:54 +0: saving checkpoint at iteration 37905 to checkpoints_83m20b400m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.633683E+00 | lm loss PPL: 3.785196E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-16 00:18:54,345] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step37905 is begin to save! +0: [2023-03-16 00:18:54,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 00:18:54,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 00:18:54,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 00:18:54,434] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 00:18:54,434] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 00:18:54,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 00:18:54,446] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 00:18:54,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 00:18:54,457] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 00:18:54,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 00:18:54,468] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 00:18:54,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 00:18:54,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 00:18:54,491] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 00:18:54,491] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 00:18:54,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 00:18:54,502] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 00:18:54,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 00:18:54,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 00:18:54,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 00:18:54,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 00:18:54,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 00:18:54,536] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 00:18:54,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 00:18:54,537] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt +0: [2023-03-16 00:18:54,538] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 00:18:54,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 00:18:54,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +0: [2023-03-16 00:18:54,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +5: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 00:18:54,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +3: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +6: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +7: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +3: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +3: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +7: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +4: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +7: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +4: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +1: [2023-03-16 00:18:54,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 00:18:54,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 00:18:54,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +2: [2023-03-16 00:18:54,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 00:18:54,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +2: [2023-03-16 00:18:54,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step37905 is ready now! +0: successfully saved checkpoint at iteration 37905 to checkpoints_83m20b400m +END 3318389: Thu 16 Mar 2023 12:19:03 AM EET diff --git a/83m20b400m/3319355.err b/83m20b400m/3319355.err new file mode 100644 index 0000000000000000000000000000000000000000..346bdb88293ae2023c2c714e33270c049d1fbd68 --- /dev/null +++ b/83m20b400m/3319355.err @@ -0,0 +1,1106 @@ +6: 2023-03-16 09:02:17.128853: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128861: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128855: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128867: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128874: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128876: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128872: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 09:02:17.128871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168035: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168040: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168037: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168044: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168043: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168044: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 09:02:17.168050: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176173: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176180: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176187: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176185: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176170: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176170: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176186: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 09:02:17.176179: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206308: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206324: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206321: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206326: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:17.206333: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304519: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304526: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304519: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304536: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304529: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 09:02:17.304541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331917: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331917: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331924: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331923: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331929: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331931: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 09:02:17.331926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.442650: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.442648: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.451645: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.452260: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.452251: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.452413: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.452503: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 09:02:17.452625: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548610: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548613: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548621: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548616: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548627: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548615: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 09:02:17.548622: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 09:02:19.278328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278329: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278338: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278338: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.278344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:19.296953: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296957: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296963: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296965: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296967: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296969: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296978: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 09:02:19.296977: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.345406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345401: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 09:02:19.345992: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 09:02:19.345997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345414: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 09:02:19.346007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345414: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 09:02:19.346007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 2023-03-16 09:02:19.346008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.345998: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.345997: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.346003: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.346005: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.346005: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 2023-03-16 09:02:19.346009: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:19.346005: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:19.346015: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346006: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:19.346011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:19.346189: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346193: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346196: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346198: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346195: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346197: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346194: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 09:02:19.346202: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355146: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355155: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355361: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355156: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:19.355363: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355367: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355369: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355371: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355372: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355376: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 09:02:19.355378: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356077: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356107: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356130: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356138: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356154: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:19.356534: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356539: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356567: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356570: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356576: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356589: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356593: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 09:02:19.356598: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.364625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364643: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364643: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364642: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.364636: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:19.365082: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365087: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365089: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365094: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365097: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365103: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365108: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 09:02:19.365117: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.384737: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384817: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384823: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384734: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384828: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384828: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384837: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.384745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.384824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 2023-03-16 09:02:19.385189: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385197: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:19.385043: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385048: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385052: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385052: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385199: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385053: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385055: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385056: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 09:02:19.385057: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385207: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385208: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385210: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 09:02:19.385216: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 09:02:40.348926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.348958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.348969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.348989: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.348998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.349018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.349028: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.349093: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351120: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351120: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351135: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351138: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351140: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351142: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351143: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351143: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351166: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351170: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 09:02:40.351182: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 09:02:40.351186: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.366250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366312: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 09:02:40.366350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 09:02:40.366276: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 09:02:40.366511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 09:02:40.366393: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 09:02:40.366295: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366360: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.366383: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 09:02:40.366303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.366394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 09:02:40.366317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366553: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 09:02:40.366857: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366674: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366381: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.366421: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.366324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366704: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.366909: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 09:02:40.366443: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.366343: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366586: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366715: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.366914: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 09:02:40.366450: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.366757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366731: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.366470: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.366920: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: 2023-03-16 09:02:40.366478: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366609: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366748: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.366932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.366691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366751: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.366949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 09:02:40.366954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.366830: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: 2023-03-16 09:02:40.366974: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368103: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368117: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368124: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368142: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368153: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.368182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369211: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 09:02:40.369218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369224: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369412: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 09:02:40.369223: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369226: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369227: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369229: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369227: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 09:02:40.369229: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369415: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369415: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369415: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369420: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369422: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369431: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369426: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 09:02:40.369430: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369435: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369440: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369442: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369440: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369440: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 09:02:40.369445: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369657: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 09:02:40.369714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.369660: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.369662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369715: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:02:40.369662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:02:40.369665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369718: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:02:40.369665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369719: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:02:40.369666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.369674: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369673: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 09:02:40.369679: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369682: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369683: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369732: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369685: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369688: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369732: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369734: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369735: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 09:02:40.369725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 09:02:40.369736: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369737: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 09:02:40.369745: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369783: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369798: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 09:02:40.369802: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 09:02:40.369816: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370078: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.370062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.370080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370063: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.370082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.370084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370070: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370070: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 09:02:40.370080: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370081: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370194: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: 2023-03-16 09:02:40.370082: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370083: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370086: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 09:02:40.370096: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370096: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370087: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 09:02:40.370090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 09:02:40.370098: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370100: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370102: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370208: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370106: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 09:02:40.370107: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 09:02:40.370218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370220: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370221: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370223: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370224: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 09:02:40.370225: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Loading extension module scaled_masked_softmax_cuda... +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +2: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: +5: +5: +5: +5: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +0: Building extension module utils... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module utils... +3: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +0: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +7: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +3: +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils...Loading extension module utils... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +4: +4: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: +4: Loading extension module utils... +4: Loading extension module utils... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +6: +6: +6: Loading extension module utils...Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils...Loading extension module utils... +0: +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: +1: +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils...Loading extension module utils... +1: +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +2: +2: +2: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +2: +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: +7: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +7: +7: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +7: +7: Loading extension module utils... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/83m20b400m/3319355.out b/83m20b400m/3319355.out new file mode 100644 index 0000000000000000000000000000000000000000..6473279401a5ebc81da99675b0c9c490097b9a9f --- /dev/null +++ b/83m20b400m/3319355.out @@ -0,0 +1,4348 @@ +Model parameters: d_model 640 ffw_size 2560 kv_size 64 n_heads 10 n_layers 10 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 10 --hidden-size 640 --num-attention-heads 10 --kv-channels 64 --ffn-hidden-size 2560 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-83m20b400mval --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --no-load-optim --reset-progress --override-lr-scheduler --log-interval 10 --save-interval 1000 --eval-interval 1 --eval-iters 100 --eval-only true --tensorboard-dir tensorboard_83m20b400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_83m20b400m --load checkpoints_83m20b400m --train-weighted-split-paths-path train20b.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319355.json --zero-stage 0 +START 3319355: Thu 16 Mar 2023 09:01:56 AM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 45.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 42.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 38.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 35.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 45.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 49.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 44.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 46.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 44.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 48.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 38.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 37.0c 79.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 45.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +1: Launching on nid006940 (1/8), master nid006939 port 9999, GPUs 8, CUDA: True +6: Launching on nid006945 (6/8), master nid006939 port 9999, GPUs 8, CUDA: True +4: Launching on nid006943 (4/8), master nid006939 port 9999, GPUs 8, CUDA: True +3: Launching on nid006942 (3/8), master nid006939 port 9999, GPUs 8, CUDA: True +2: Launching on nid006941 (2/8), master nid006939 port 9999, GPUs 8, CUDA: True +0: Launching on nid006939 (0/8), master nid006939 port 9999, GPUs 8, CUDA: True +7: Launching on nid006946 (7/8), master nid006939 port 9999, GPUs 8, CUDA: True +5: Launching on nid006944 (5/8), master nid006939 port 9999, GPUs 8, CUDA: True +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3319355.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 1 +0: eval_iters ...................................... 100 +0: eval_only ....................................... True +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2560 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 640 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-83m20b400mval +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_83m20b400m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 10 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 1 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 0 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... True +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 10 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... True +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. True +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_83m20b400m +0: save_interval ................................... 1000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_83m20b400mval +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 1 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 09:03:56,814] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +7: > setting tensorboard ... +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.096 seconds +0: > compiling and loading fused kernels ... +0: >>> done with compiling and loading fused kernels. Compilation time: 33.634 seconds +0: time to initialize megatron (seconds): 1.299 +0: [after megatron is initialized] datetime: 2023-03-16 09:04:33 +0: building GPT model ... +0: [2023-03-16 09:04:33,436] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 09:04:33,437] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 09:04:33,437] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.07 GB, percent = 6.8% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 09:04:35,423] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=17 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: undo +0: 14: MixedFusedLayerNorm +0: 15: EmbeddingPipe +0: 16: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 09:04:35,630] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 09:04:35,630] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 09:04:35,630] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.09 GB, percent = 6.8% +0: setting training iterations to 0 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 09:04:35,632] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 09:04:48,894] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 09:04:48,894] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 09:04:48,894] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 09:04:48,897] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 09:04:48,897] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 09:04:49,016] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 09:04:49,016] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 09:04:49,017] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.77 GB, percent = 6.9% +0: ninja: no work to do. +0: Time to load utils op: 0.1482996940612793 seconds +3: Time to load utils op: 0.31015634536743164 seconds +4: Time to load utils op: 0.3088548183441162 seconds +6: Time to load utils op: 0.3093736171722412 seconds +0: Time to load utils op: 0.30618977546691895 secondsTime to load utils op: 0.30620336532592773 seconds +0: +3: Time to load utils op: 0.3023257255554199 seconds +3: Time to load utils op: 0.3035309314727783 seconds +0: Time to load utils op: 0.3069179058074951 seconds +3: Time to load utils op: 0.30307841300964355 seconds +3: Time to load utils op: 0.30295419692993164 seconds +0: Time to load utils op: 0.30731773376464844 seconds +0: Time to load utils op: 0.30698442459106445 seconds +0: Time to load utils op: 0.30725646018981934 seconds +3: Time to load utils op: 0.3039689064025879 seconds +3: Time to load utils op: 0.30333757400512695 seconds +0: Time to load utils op: 0.3070557117462158 seconds +3: Time to load utils op: 0.3044545650482178 seconds +4: Time to load utils op: 0.303286075592041 seconds +4: Time to load utils op: 0.3037111759185791 seconds +4: Time to load utils op: 0.3034539222717285 seconds +4: Time to load utils op: 0.30355167388916016 seconds +4: Time to load utils op: 0.3039989471435547 seconds +4: Time to load utils op: 0.3040733337402344 seconds +4: Time to load utils op: 0.30419373512268066 seconds +6: Time to load utils op: 0.30281758308410645 seconds +6: Time to load utils op: 0.30324363708496094 seconds +6: Time to load utils op: 0.3034484386444092 seconds +6: Time to load utils op: 0.3034968376159668 seconds +6: Time to load utils op: 0.30318641662597656 seconds +6: Time to load utils op: 0.3033115863800049 secondsTime to load utils op: 0.3036229610443115 seconds +6: +1: Time to load utils op: 0.31205010414123535 seconds +1: Time to load utils op: 0.31150245666503906 seconds +1: Time to load utils op: 0.3113083839416504 seconds +1: Time to load utils op: 0.3115272521972656 seconds +1: Time to load utils op: 0.3119173049926758 seconds +1: Time to load utils op: 0.31214189529418945 seconds +1: Time to load utils op: 0.31137824058532715 seconds +1: Time to load utils op: 0.3121945858001709 seconds +2: Time to load utils op: 0.3118908405303955 secondsTime to load utils op: 0.3112974166870117 seconds +2: +2: Time to load utils op: 0.3117852210998535 seconds +2: Time to load utils op: 0.31180286407470703 seconds +2: Time to load utils op: 0.30901312828063965 seconds +2: Time to load utils op: 0.30837559700012207 seconds +2: Time to load utils op: 0.30888843536376953 seconds +2: Time to load utils op: 0.30823421478271484 seconds +3: Time to load utils op: 0.0004913806915283203 seconds +3: Time to load utils op: 0.0003895759582519531 seconds +3: Time to load utils op: 0.0003376007080078125 seconds +3: Time to load utils op: 0.00036334991455078125 seconds +3: Time to load utils op: 0.0003275871276855469 seconds +5: Time to load utils op: 0.3109104633331299 seconds +5: Time to load utils op: 0.3109269142150879 seconds +5: Time to load utils op: 0.31093478202819824 seconds +5: Time to load utils op: 0.31093788146972656 seconds +5: Time to load utils op: 0.3109579086303711 seconds +5: Time to load utils op: 0.31096553802490234 seconds +5: Time to load utils op: 0.31098175048828125 seconds +5: Time to load utils op: 0.31098365783691406 seconds +3: Time to load utils op: 0.00031304359436035156 seconds +3: Time to load utils op: 0.0003216266632080078 seconds +3: Time to load utils op: 0.00031375885009765625 seconds +7: Time to load utils op: 0.31247949600219727 secondsTime to load utils op: 0.3124821186065674 seconds +7: +7: Time to load utils op: 0.3125004768371582 seconds +7: Time to load utils op: 0.3125171661376953 seconds +7: Time to load utils op: 0.3125300407409668 seconds +7: Time to load utils op: 0.31253790855407715 seconds +7: Time to load utils op: 0.3125438690185547 secondsTime to load utils op: 0.31254076957702637 seconds +7: +4: Time to load utils op: 0.00048804283142089844 seconds +4: Time to load utils op: 0.00048160552978515625 seconds +4: Time to load utils op: 0.0005729198455810547 seconds +4: Time to load utils op: 0.0005517005920410156 seconds +4: Time to load utils op: 0.0005872249603271484 secondsTime to load utils op: 0.0006103515625 seconds +4: +4: Time to load utils op: 0.0005905628204345703 seconds +4: Time to load utils op: 0.0005869865417480469 seconds +6: Time to load utils op: 0.000518798828125 seconds +6: Time to load utils op: 0.0005371570587158203 seconds +6: Time to load utils op: 0.0005533695220947266 seconds +6: Time to load utils op: 0.0005829334259033203 seconds +6: Time to load utils op: 0.0005402565002441406 seconds +6: Time to load utils op: 0.0005211830139160156 secondsTime to load utils op: 0.0005576610565185547 seconds +6: +6: Time to load utils op: 0.0003845691680908203 seconds +0: Time to load utils op: 0.00046443939208984375 seconds +0: Time to load utils op: 0.00043272972106933594 seconds +0: Time to load utils op: 0.0004248619079589844 seconds +0: Time to load utils op: 0.0004444122314453125 seconds +0: Time to load utils op: 0.00048351287841796875 secondsTime to load utils op: 0.00046515464782714844 seconds +0: +0: Time to load utils op: 0.000392913818359375 seconds +1: Time to load utils op: 0.0006940364837646484 seconds +1: Time to load utils op: 0.0008289813995361328 seconds +1: Time to load utils op: 0.0009090900421142578 secondsTime to load utils op: 0.0009272098541259766 seconds +1: +1: Time to load utils op: 0.0010728836059570312 secondsTime to load utils op: 0.0009143352508544922 seconds +1: +1: Time to load utils op: 0.000865936279296875 seconds +1: Time to load utils op: 0.0010654926300048828 seconds +2: Time to load utils op: 0.0005357265472412109 seconds +2: Time to load utils op: 0.000431060791015625 secondsTime to load utils op: 0.0005474090576171875 seconds +2: +2: Time to load utils op: 0.0005371570587158203 seconds +2: Time to load utils op: 0.00042128562927246094 seconds +2: Time to load utils op: 0.0004222393035888672 seconds +2: Time to load utils op: 0.0004088878631591797 seconds +2: Time to load utils op: 0.00045490264892578125 seconds +5: Time to load utils op: 0.0006334781646728516 seconds +5: Time to load utils op: 0.0008089542388916016 seconds +5: Time to load utils op: 0.0010695457458496094 secondsTime to load utils op: 0.0011169910430908203 secondsTime to load utils op: 0.0011250972747802734 seconds +5: +5: +5: Time to load utils op: 0.0011277198791503906 seconds +5: Time to load utils op: 0.001192331314086914 seconds +5: Time to load utils op: 0.0011949539184570312 seconds +7: Time to load utils op: 0.0008008480072021484 seconds +7: Time to load utils op: 0.000820159912109375 seconds +7: Time to load utils op: 0.0011110305786132812 seconds +7: Time to load utils op: 0.0010840892791748047 seconds +7: Time to load utils op: 0.0009407997131347656 seconds +7: Time to load utils op: 0.0010764598846435547 seconds +7: Time to load utils op: 0.0011165142059326172 seconds +7: Time to load utils op: 0.0011684894561767578 seconds +0: [2023-03-16 09:04:49,300] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 09:04:49,301] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 09:04:49,301] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.88 GB, percent = 6.9% +0: [2023-03-16 09:04:49,415] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 09:04:49,416] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 09:04:49,416] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:49,517] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 09:04:49,517] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 09:04:49,518] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:49,618] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 09:04:49,619] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:49,619] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:49,718] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 09:04:49,719] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:49,719] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:49,819] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 09:04:49,820] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:49,820] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:49,919] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 09:04:49,920] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:49,920] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.92 GB, percent = 6.9% +0: [2023-03-16 09:04:50,025] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 09:04:50,026] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:50,026] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.93 GB, percent = 6.9% +0: [2023-03-16 09:04:50,125] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 09:04:50,126] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 09:04:50,126] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 34.93 GB, percent = 6.9% +0: [2023-03-16 09:04:50,126] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 09:04:50,126] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 09:04:50,126] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 09:04:50,126] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 09:04:50,127] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 09:04:50,128] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 09:04:50,129] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 09:04:50,129] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.00047779083251953125 seconds +0: [2023-03-16 09:04:50,129] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 09:04:50,238] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=82741760 (82.742M) TOTAL_PARAMS=82741760 (82.742M) UNIQUE_PARAMS=82741760 (82.742M) +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +1: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +5: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +2: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +7: [2023-03-16 09:04:50,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt... +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +0: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +2: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/mp_rank_00_model_states.pt. +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +5: [2023-03-16 09:04:50,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +3: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +4: [2023-03-16 09:04:50,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +6: [2023-03-16 09:04:50,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +7: [2023-03-16 09:04:50,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +2: [2023-03-16 09:04:50,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +0: [2023-03-16 09:04:50,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt... +1: [2023-03-16 09:04:50,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +2: [2023-03-16 09:04:50,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +5: [2023-03-16 09:04:50,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +1: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +4: [2023-03-16 09:04:50,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +3: [2023-03-16 09:04:50,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +0: [2023-03-16 09:04:50,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_01-model_00-model_states.pt. +7: [2023-03-16 09:04:50,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +2: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +6: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +4: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +1: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +3: [2023-03-16 09:04:50,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +0: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +3: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +2: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +1: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt... +5: [2023-03-16 09:04:50,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +4: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +0: [2023-03-16 09:04:50,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +7: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,584] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_03-model_00-model_states.pt. +6: [2023-03-16 09:04:50,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +4: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +6: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +1: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +3: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +5: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +2: [2023-03-16 09:04:50,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt... +7: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +2: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +1: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +0: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +3: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +6: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +5: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_04-model_00-model_states.pt. +4: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +3: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +4: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +1: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +6: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +0: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +2: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt... +5: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +4: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +3: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +5: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +0: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +7: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +6: [2023-03-16 09:04:50,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +1: [2023-03-16 09:04:50,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_05-model_00-model_states.pt. +2: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +6: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +5: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +4: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +2: [2023-03-16 09:04:50,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +1: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt... +0: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,836] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +6: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +2: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +5: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +4: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +7: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +0: [2023-03-16 09:04:50,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +3: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_06-model_00-model_states.pt. +1: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +4: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +3: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +1: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +5: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +0: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +7: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt... +6: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +5: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +6: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +3: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +0: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +7: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_07-model_00-model_states.pt. +2: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:50,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:50,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:50,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,003] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +2: [2023-03-16 09:04:51,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +3: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +4: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +1: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +5: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +6: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt... +7: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +5: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +4: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +3: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +1: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +6: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_08-model_00-model_states.pt. +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,088] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +5: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +5: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +4: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +6: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +0: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +4: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +0: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +3: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +1: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +2: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +7: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +1: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt... +2: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +6: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_09-model_00-model_states.pt. +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +1: [2023-03-16 09:04:51,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +1: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +4: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +7: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +0: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +3: [2023-03-16 09:04:51,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +5: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt... +6: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +3: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +7: [2023-03-16 09:04:51,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +4: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +0: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +2: [2023-03-16 09:04:51,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_10-model_00-model_states.pt. +6: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +5: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +1: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +0: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +2: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +4: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt... +3: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +1: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +0: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +5: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +2: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +3: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +6: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,272] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_11-model_00-model_states.pt. +7: [2023-03-16 09:04:51,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +4: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +5: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +0: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +1: [2023-03-16 09:04:51,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +6: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt... +3: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +4: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +2: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +3: [2023-03-16 09:04:51,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +6: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +5: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +0: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +1: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_12-model_00-model_states.pt. +7: [2023-03-16 09:04:51,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +6: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +2: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt... +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: > overriding learning rate value to 0.0002 +4: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: > overriding minimum learning rate value to 2e-05 +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: > overriding warmup iterations value to 0 +7: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: > overriding total number of iterations value to 1 +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: > overriding decay style value to cosine +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +7: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +4: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +6: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +3: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +1: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +0: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/layer_14-model_00-model_states.pt. +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 09:04:51,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +3: [2023-03-16 09:04:51,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,430] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +3: [2023-03-16 09:04:51,431] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +1: [2023-03-16 09:04:51,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,448] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +6: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,449] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +5: [2023-03-16 09:04:51,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,449] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +1: [2023-03-16 09:04:51,449] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +5: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,450] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +3: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,450] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +3: [2023-03-16 09:04:51,450] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +6: [2023-03-16 09:04:51,450] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +5: [2023-03-16 09:04:51,451] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +5: [2023-03-16 09:04:51,451] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +3: [2023-03-16 09:04:51,451] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +3: [2023-03-16 09:04:51,451] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +7: [2023-03-16 09:04:51,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,457] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +7: [2023-03-16 09:04:51,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,458] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +7: [2023-03-16 09:04:51,458] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +6: [2023-03-16 09:04:51,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,459] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +7: [2023-03-16 09:04:51,460] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +6: [2023-03-16 09:04:51,460] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +6: [2023-03-16 09:04:51,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,462] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +6: [2023-03-16 09:04:51,463] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +7: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,464] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +3: [2023-03-16 09:04:51,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,465] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +3: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,465] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +3: [2023-03-16 09:04:51,465] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +6: [2023-03-16 09:04:51,465] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +3: [2023-03-16 09:04:51,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,466] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +3: [2023-03-16 09:04:51,466] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +0: [2023-03-16 09:04:51,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,466] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +3: [2023-03-16 09:04:51,466] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +0: [2023-03-16 09:04:51,466] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +0: [2023-03-16 09:04:51,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,467] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +3: [2023-03-16 09:04:51,467] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +6: [2023-03-16 09:04:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,468] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +0: [2023-03-16 09:04:51,468] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +0: [2023-03-16 09:04:51,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,468] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: could not find arguments in the checkpoint ... +0: checkpoint version 3.0 +0: [2023-03-16 09:04:51,469] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +6: [2023-03-16 09:04:51,469] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +0: [2023-03-16 09:04:51,470] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +5: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,470] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +0: [2023-03-16 09:04:51,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,470] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +7: [2023-03-16 09:04:51,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,471] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +5: [2023-03-16 09:04:51,471] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +0: [2023-03-16 09:04:51,472] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +7: [2023-03-16 09:04:51,472] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +5: [2023-03-16 09:04:51,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,474] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +5: [2023-03-16 09:04:51,475] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +5: [2023-03-16 09:04:51,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,475] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +4: [2023-03-16 09:04:51,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +5: [2023-03-16 09:04:51,477] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +1: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +1: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +2: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +1: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +4: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +4: [2023-03-16 09:04:51,477] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +1: [2023-03-16 09:04:51,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,478] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +4: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,478] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +4: [2023-03-16 09:04:51,478] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +0: [2023-03-16 09:04:51,478] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +4: [2023-03-16 09:04:51,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,478] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +4: [2023-03-16 09:04:51,478] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +1: [2023-03-16 09:04:51,478] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +1: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +1: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +2: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +2: [2023-03-16 09:04:51,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +2: [2023-03-16 09:04:51,479] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +1: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +4: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +0: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +4: [2023-03-16 09:04:51,479] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +4: [2023-03-16 09:04:51,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,480] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +2: [2023-03-16 09:04:51,480] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +2: [2023-03-16 09:04:51,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,481] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +4: [2023-03-16 09:04:51,482] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +2: [2023-03-16 09:04:51,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,482] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +7: [2023-03-16 09:04:51,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,483] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +7: [2023-03-16 09:04:51,483] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +2: [2023-03-16 09:04:51,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,483] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +2: [2023-03-16 09:04:51,483] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +7: [2023-03-16 09:04:51,484] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +2: [2023-03-16 09:04:51,485] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +6: [2023-03-16 09:04:51,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,487] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +6: [2023-03-16 09:04:51,489] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +7: [2023-03-16 09:04:51,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,491] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +7: [2023-03-16 09:04:51,492] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +6: [2023-03-16 09:04:51,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,494] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +0: [2023-03-16 09:04:51,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,495] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +0: [2023-03-16 09:04:51,496] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +6: [2023-03-16 09:04:51,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 09:04:51,496] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +6: [2023-03-16 09:04:51,497] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +0: [2023-03-16 09:04:51,498] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +5: [2023-03-16 09:04:51,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,498] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +5: [2023-03-16 09:04:51,499] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +0: [2023-03-16 09:04:51,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,500] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +7: [2023-03-16 09:04:51,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,501] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +0: [2023-03-16 09:04:51,501] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +7: [2023-03-16 09:04:51,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 09:04:51,502] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +7: [2023-03-16 09:04:51,502] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +7: [2023-03-16 09:04:51,503] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +1: [2023-03-16 09:04:51,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,505] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +0: [2023-03-16 09:04:51,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +3: [2023-03-16 09:04:51,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,506] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +3: [2023-03-16 09:04:51,506] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +0: [2023-03-16 09:04:51,506] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +1: [2023-03-16 09:04:51,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,507] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +3: [2023-03-16 09:04:51,507] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +5: [2023-03-16 09:04:51,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +0: [2023-03-16 09:04:51,508] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +5: [2023-03-16 09:04:51,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +1: [2023-03-16 09:04:51,508] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +5: [2023-03-16 09:04:51,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +5: [2023-03-16 09:04:51,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +4: [2023-03-16 09:04:51,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +4: [2023-03-16 09:04:51,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +5: [2023-03-16 09:04:51,509] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +5: [2023-03-16 09:04:51,509] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +4: [2023-03-16 09:04:51,509] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +4: [2023-03-16 09:04:51,510] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +2: [2023-03-16 09:04:51,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +4: [2023-03-16 09:04:51,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,511] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +4: [2023-03-16 09:04:51,511] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +4: [2023-03-16 09:04:51,512] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +2: [2023-03-16 09:04:51,512] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +2: [2023-03-16 09:04:51,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,513] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +2: [2023-03-16 09:04:51,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 09:04:51,513] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +2: [2023-03-16 09:04:51,514] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +2: [2023-03-16 09:04:51,514] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +1: [2023-03-16 09:04:51,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 09:04:51,524] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +1: [2023-03-16 09:04:51,525] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +0: successfully loaded checkpoint from checkpoints_83m20b400m at iteration 0 +7: time (ms) | load-checkpoint: 1289.19 +0: estimated model parameters: 0.08274176 +0: estimated model parameters without embeddings: 0.04923648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 09:04:51 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 1 +0: validation: 25600 +0: test: 25600 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.031692 seconds +0: number of documents: 41786294 +0: > dataset split: +0: train: +0: document indices in [0, 41786294) total of 41786294 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.083 seconds +0: total number of samples: 9767463 +0: total number of epochs: 1 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.023942 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_25600ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.010 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 09:05:07 +0: done with setup ... +0: training ... +7: time (ms) | model-and-optimizer-setup: 18446.90 | train/valid/test-data-iterators-setup: 15621.15 +0: [after training is done] datetime: 2023-03-16 09:05:07 +0: [2023-03-16 09:05:07,676] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-16 09:05:07,677] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-16 09:05:07,677] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-16 09:05:07,677] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-16 09:05:07,677] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.651057E+00 | lm loss PPL: 3.851535E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +END 3319355: Thu 16 Mar 2023 09:05:25 AM EET diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ac14e91c5e0f8cbe57d401e60eef4cab1d8f769 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b786ff0578f86a2cc838c3286a0e54ec2836844c99cea81c4c0013a99e1c108 +size 15518743 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..640c2472e9fe2809c50357b5e9048f37ca3dc69c --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ccfa6d2022cdb10f49826bb3a918bebeee8e844bc8a2ef491fb562ea0b42a41 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b7c8fbac7267e070770985dd3c23b80848985a6 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efd4c39e2cca3b4dc9b999cf2330dca5fb42b6c8a04b2561e4465c987f7e34c +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aed737ad37aa705f9137a97a74b17132e38882bc --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ee7265a4c3c935b85fd61f6a2d05431a53542f2ec52351c3dac3adf92968fc +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e9dafd10942ad43b1e9ec64c45748823c7614ef --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5884c509a50f1be1a8572381ce72c6029fc5e53feec1b62b0965c386afd80f34 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3c2ddd1b94cebe5733f1e5f78de7b406ae145ae --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff0a4bec8f6b49320c40ff8b3a5a653f3dc39c14b20518545c0997a9ad5aa1c +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aff9b611615c86151599c40dc03412672d206ad8 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bfbe34d1fe45c9eeb003a845e848591ce42fcb85999259e010f3904829d893 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7fc3bc9e131786e6ab87f4fa6a61ae6e7770420 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a80a23e6ffac08ef2cfe3fe1580d5f8d43190df4a645929a310328046c1e757 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..041b2cd68d9082165078c7567309fcc947e79ab4 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:328315cfdff73a72a0794a6fdf81f2d1887289d7fa78c1715b68075a9b6fef5c +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bc2a23ba992aab0741aa258229032f7125967bb --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db2cfed9d8dbcb80d8416b9ea3a083bf0ec6623a8f1cd91aad8b2be76446e3a +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..544c0ee51d7836aa9cd21a0094d5ec697d600ab8 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cb78c77f77d9634212fcae9f32fc58214c19b8b74ce44f962d03611548cf293 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dec03de777891ba54014079eda4c05fbcfcdff3 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c8edf4ecae98e8e585282b9fcaddde805fae00fe2e2200b8d057bcc642ba475 +size 15518615 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ba95a5958b377e600c971818ca00e156fa18c12 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b40e2ddb5e3a16107447cf3dd55a4b8fd60a2f26b2f126229d8714e0b2f69e77 +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6fd58ac03df7ccd88e92b2c367f65129ca13888 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76230db7d4b4e827d812eaa3f4339ca571284a4f0bc203d08542aa7de6476610 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e57e6c37d7b0e7706ddc1618fabc5b8cdd79a35a --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5846fbbb9da75980e0b9ecb6c74080487f941f2babc0e39555cfa7f256a22b02 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a09f84714c203123ed2939b1103af3a2669212fe --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd3169690f43549443c095fcc2e30f85958256e557b05e746988630a50295b8 +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..631d3d1479dbecbac42a46994c98a28c33f64630 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179b5e6ad5f3295a9f4e0baaf6e9c062c982d132d445bb2c1e952a4def745346 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9609563fd4219dfe1864d4b4bcb2e5ac4758c3bc --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56cd8769d8fb90efb9670c08d0e45e4d5a683c7fc5ef7e4275446df06eb4a5a0 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b345d6b45b8f52bd1a2b6b4f62430083d8835e64 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf005f8b2b267304c73692bfe94b15d8bab04a655fa948cb117e96f871baba4b +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2298b1f066e1cd70a7a825b97f3e060df18c2e7 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1f617699d67b0b136163627cf40a7f93671b5ac147fe3213c2e60f3b6e446e6 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf6066fcc1ec3b79396b8ad7a3532de4f6c26e59 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:631c8d69014d4957b388c1ab166aa4973852b72991b4cc35ad0f4d13649a8d6c +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..afcef128802493786abd96ba556159da1698081b --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e5221c6ead202cc65345144a8e7312b1dec8ed310f0cf1a54db2a08f5055a8 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..662185088c16af5079a9051a8d30462549e65986 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6cd84376401e658f2b4de6204782ebce49da43a57cc02f544b738170c2e26ba +size 15518743 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcfb4012cbf1dcaa7126492fcf0abb8c32044826 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773d1c022a9890be834484048cb094bb1bf0e4b0661881b4174941df5904c709 +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa78e146c5d1f989ccddacfe7516b10dd34ebd92 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d7c816be8fa1ff8cb3fd1edbed5cf28c0f10b5ec7ba91df7b35cd198e9ddb64 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..67b23d3398b30d6a6378fb34d1aa602857010a09 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae86c51b78fd01f3eb3d7c64d85429a28c3e759f76c26fed0f12eaf69727dfd +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bce474b8b0e71b3432112220e3a7e8ad86bc68bf --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3835fe5faec5eed0d45184f5c755ace3acf02074656c8f18fe3e9c7485417143 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1147ed9ce997714812709a263a69f83f766a71d0 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f8aac15ec6d5c82f04787e9dea2572906b82106985d931f903df8f20ee96322 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fdbee65ec0fca3497dc7a46f8278c3bd93823f4 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2e25b74ce3875640e6c49d7e3361bf871450b392488ecae7d670b17d8cc787 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..531b9d7a6dc23af839bfdc9caa2b0fd5df8bd89b --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d99fb7f71ef3892fae5e1c692ed5bd22f449b4ce9423b7aa1a869ab3477eb2c4 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce89162874c92b23cf57b9486ae8274033d9dffe --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25aa1914d099a0018a4c5b1b35e19727c84b6e9b91db12fb9c9c95c934602db5 +size 15518818 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..faa5ab3a2df2f6808c8d271c551b4f5a624dcba4 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79284b59b1629bae8dabc8ca80a2d152b343227c09464b03503cc5bea2684caf +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f113c48eef41177678b05b3b9ba5456a2464c91 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:064449c7237b36445fee2ef8a3509731289e9672fd0114675d7799e7fd31feff +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bffc6593d4fe5ee2b86ea039df3cb43170e69d4b --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a637c997034960fb13a77498bb0759a9ab8a074c985b75bfc23c470ac062e8 +size 15518743 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4edc3fc73e9bfdf26ef2d74d656421db9940316 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15caa4c68cd48d9efd33f8456dd78252ebc9b3e75c47d126d9b4dbbdfcc73aae +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..217c661ae51113ccfd05fcd53f1f093918679183 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0459076e9f5ebb38ac2ae1b2f29ac45c34d2219be304dde8b0bee86e4ae0ed7 +size 15518818 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..176277155f4bb071cdab89c85ad1e28bf9fea27b --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d91ad53400e2dfd202716a27a820ac33b0a9af61255cd154444e4941bf6c5c4b +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72c3edd84c5d3773a6bfb1957816a076378190b8 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6884f043a8ef6557dd4fbeded050ba04b373f84d59abdaa8e6215e2241359c88 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d66b6c89dfd599311d44927864ed0ec4124690b8 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88420927f3ec33c036b1cda4619cc9c81266c6f28ff98d5e9acfee8ff348d495 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..962f2835d31011fda9a82c8cd834490fba61afef --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c340072f88e21e63c19e1440ff2eaa86f510ed4f64f7dee637680925325c558 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7889be1f802990b9cacdcd8203f1e706108b26fb --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b13cbce4557e4306ad5c7d05806b4ff9cfd191fd9849ed03560a4fe30a0d9830 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..372cc48fcac3eaf74d6d7f5b408056e5b1f6764a --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56697334f7173c71bff9eeb51fe95b2bc8a79dfd77533d0f559fb2e829916be0 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4d2ccefdb9c0d174385c87dae9697375398cc20 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88885ba327572c261a812aab5cc5cf0b7ffe3d61209f716f2c63dab2d72e8978 +size 15518562 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f11d936e1f7f9cce6d697c6dc600c4fd5cf9eeba --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c1dc6c95551e11d65cf6725e978077f4d9f99a81443906e0225c17e4fed5ee +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52209a571cd65018cb61f625fd31c2be6060b116 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de18d32b3c41a9424a7bbead23b6b47a69745af861dae382c523b13b70e112b4 +size 15518615 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cf9bd3211b6588f5f1c7afa74c22a46ccf64ceb --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3930af65518140085344a4149b2c1f9088ce938b4982df382a4fc750acd7ee3 +size 15518818 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccffa1124f1a24e503ffb6cbd7d09914ecdb9217 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd5aa36daa408c2315efce4a95f9f3c0c7f475f10f8d5682ed2ffb74bfda3ad +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86f5bae9e14110289c71e29396e7fd3173cf8a5f --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8aa4592530c80fa5e648ad21bf0f698df0e746a72ab5d93065d24bbf6eda992 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4414ad491aa561422930620e95d83b9f8f4d866b --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdf9d7e0ab30e76727cb03f39dd44d12a9fad3c5cc1793af9196c795f431ab9 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c0c6c3c4729e225a7d1601a13be4c62ffa41d45 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de5098209f688d857fd0f36e29206730d6f67ed3e2abf26ea63fdd4409bf5eb +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9099378cabaf4cf5618577ed2d255a1c7c00c79 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9f48c7719c31e5d2b3c74df34387964b3859f1b3fbe14557fd767af9027962e +size 15518626 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..27af6459546adf766d4cfd7c3cfe3b442b67b734 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523fdcc74273160c08b499f9785f912df2358b76eea2eae0dfda81ac92690e35 +size 15518818 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a36f3cdc6d1047d4fb153e76aebdbeedd570db6 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13d105dfc70c8fccb5be83054a349a00918e23217cd07f99159092a18b88a3b6 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af0b482e974c1c7baa6445ee9d9c9bdc77caf118 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99741699cbdb3a120a268fe7f63ba4c1bc66c8b2365a1de67c9e21de2997232e +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..486ea5bd3bad66bc897bec7a7660214e13cbe731 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e8c5d3801fc8ca146bbfc304893adbdedc919a1e743a284eca874b278598b88 +size 15518754 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18b24257c0f2460040f91addf694467616e24275 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd07a98430084da8e8b0a56cc9e532d5e273d629e44cb05d24e3a3df62362858 +size 15518679 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40fdde9aa85049620d64e4812c64e561121ebffc --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb593cc772b4ce7f90baa45c9a31dd144ede9006ce76abec7a53c531ed8899dd +size 15518818 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82215567978b29b6f736b4de2bcad9e11473c85c --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a2881fa9317bc42f09d22ed688a842cdca335c551f6c705b538f309284fe35 +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5980a4ab2c58e719c8427c823d83515f075c2fff --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a39c4e715dbef2c9c007266002c9ca86809349a3a0ff1586796b2694fadc3b +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09c87dda5843fe9d57d28f1291f969efa9b20404 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99300d2ef81231f5e32b76606104864b8d20d76bbedab8df013ebb724a9137ce +size 15518690 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd1eb58a7161d61ed83009185e386bd6c560c32f --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16a8ce560303b4ee23e5b892d1f33b864ff7ed8ddfd1f44585582e3ff49ee623 +size 15518743 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce467a846e67ca95cddef156921d8633cc1d70f6 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:834989ad5e8aa16a62e0a842dd7a14ed5a08b8ea4bba8ec31b937e469e0c9e97 +size 15518679 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1a8dc650e9ffec213efd0904fde501904ee3f48 --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784b3cf03c97677f322503c0eed93d2eb3386573f4fcee6a8a7dfcd408da8ce3 +size 15518679 diff --git a/83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57ed1fa8ad5bfd2e35ffbd159feb200fd67d346c --- /dev/null +++ b/83m20b400m/global_step37905/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f6957a09051fef8ecf48d1f2427e687ea877aa3dd8f14ca8efc012f86b0447 +size 15518743 diff --git a/83m20b400m/global_step37905/layer_01-model_00-model_states.pt b/83m20b400m/global_step37905/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a18d40b5b98e64e9473a5590534e8fccf0d249f6 --- /dev/null +++ b/83m20b400m/global_step37905/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd17552dfa0ecf79959f1749a628d680f7a2c0489967f01f14c72a982fba6e5 +size 67011843 diff --git a/83m20b400m/global_step37905/layer_03-model_00-model_states.pt b/83m20b400m/global_step37905/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddfd64ed828f733356f602988fa90a8fbe7eaef7 --- /dev/null +++ b/83m20b400m/global_step37905/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07c51d3382bfdcb039916a1c032dc612be4cdeadcbe151a0229c4d400710437f +size 9851395 diff --git a/83m20b400m/global_step37905/layer_04-model_00-model_states.pt b/83m20b400m/global_step37905/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5a9b8c2916aa8dc35f493f91cb318395ce5dee2 --- /dev/null +++ b/83m20b400m/global_step37905/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71d535734ce5dcd7f816a94b7cff30a370cf8ab30aa3d18a24fa1134ddf2b94 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_05-model_00-model_states.pt b/83m20b400m/global_step37905/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c169ece66316653c1debb88b736a62fa3df532a --- /dev/null +++ b/83m20b400m/global_step37905/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f183a3615a22b6cdc4ac0f9d621e8a726f33591e8b3d19129f87a06fa4df36 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_06-model_00-model_states.pt b/83m20b400m/global_step37905/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15b6bf4e84c220087f826d112d0193ae53cc7471 --- /dev/null +++ b/83m20b400m/global_step37905/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a3964e9b2e4fec5cee25604e3fa4442932c3fe0156450f4927d61bcf0047df +size 9851395 diff --git a/83m20b400m/global_step37905/layer_07-model_00-model_states.pt b/83m20b400m/global_step37905/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7fdc19dea83b8b75d11d5e6e040b223992156b3 --- /dev/null +++ b/83m20b400m/global_step37905/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f18afe887f49c454fd88ee2cc93793c526d425f238827ebfd307da9e50a5ec26 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_08-model_00-model_states.pt b/83m20b400m/global_step37905/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1ea78864c49d4fa39edf54b23b8ae92c7339aff --- /dev/null +++ b/83m20b400m/global_step37905/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c944ae0c3e13d8b44100ce29f05bf89d5d42b91f470c2feec88082961569048c +size 9851395 diff --git a/83m20b400m/global_step37905/layer_09-model_00-model_states.pt b/83m20b400m/global_step37905/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..058a7deddb1d95a5ac4efa15b630f926ccdfa719 --- /dev/null +++ b/83m20b400m/global_step37905/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5675f3b4f9cf56cee764a209da8fddaf4379968a10f2001aa6951f327c443ec3 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_10-model_00-model_states.pt b/83m20b400m/global_step37905/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae0ea22aea5fe54e9b9536d069336cbd8c2ef26e --- /dev/null +++ b/83m20b400m/global_step37905/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1966bfdf7e99d3afc557b9d9e6f987001df72b9697141b85c614ebc0527e7217 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_11-model_00-model_states.pt b/83m20b400m/global_step37905/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48c7fd4f1ceb33738a3c851cb341d6b649b4ac67 --- /dev/null +++ b/83m20b400m/global_step37905/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbac38173322fd5633bc7dd7a34a47484f20331ba53bdaa050fa9eec3a37e797 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_12-model_00-model_states.pt b/83m20b400m/global_step37905/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2ca23013357ac7524cd0e5770b1ef390d6688aa --- /dev/null +++ b/83m20b400m/global_step37905/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea20059a7a41df76920b7be8d27cd70186d1ef96b976218a66054cbb27c2a939 +size 9851395 diff --git a/83m20b400m/global_step37905/layer_14-model_00-model_states.pt b/83m20b400m/global_step37905/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..95d68322c2ad50e98f8364a1870bff2ae5b7b3f2 --- /dev/null +++ b/83m20b400m/global_step37905/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70f9c1d81e2ab03fecbc8b59c60820c1e2993ef639023713c3c15f446f5d1133 +size 3779 diff --git a/83m20b400m/global_step37905/mp_rank_00_model_states.pt b/83m20b400m/global_step37905/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31b776a2347d661df6f07ce001eebdba9e146151 --- /dev/null +++ b/83m20b400m/global_step37905/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b77e409ba4a5d55f6e5375822d7b7d0864fd3384f8a9fbb7ce08837897411ca1 +size 31603 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e859b2e7c653e7962c4a199ca8d4d480b659b503 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9b2660b5c77d58d492fefb55fb618c3f48c3618871500eca162dcc95d5d03b4 +size 15518743 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90cd5308de4d6c666f1ac32b3e6efc0ba26840cb --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b86c879e494573c24b6089f2a14d7a505b28d615d0cb4b4a32360b8cff6640 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f85ff531d2a38457dd77fbf07cbbcba41fffeec --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:171ddaf38121ce95a9d2589eeeb3ac1168eb591249a792fa367ddd70b543a9a2 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76cc8699127bfb75f83b86828be65da03808b6e8 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f05d98397b050f8febf1ec829154f0152e5db37af3319648daa40dd6b70af0e +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a78af723f55cccd7a8df503a1f7df3e6d3c30d73 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb60aab0c1599962454aeedbc7f60d546ecfcb0cf1a4eef914acb1f6300975e +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5223db71d22d3e841eef4d316d43ceb964d82e3a --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d70722c3480df620540b7d8a10389294085745b605036e7254dffcc3913ee79 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfd3ffc92ec604c2c41be4d8d1d9453037a8a223 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00a1c1d3ec90a3fe31d19c91fdf0fa3e43e11c86d6abeec59c636c15220e1a7 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..781e33497604986bdde8503bf8550a5fa792a31e --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90545cbd3f1c70df0f9e87fbdf9e78aa2fd7936179a74188aaa4f84cc81433a +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bece89cf0a85d50ae3aaf605a2b085286566ef4 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae731d1e37cbf092441b6ad32b3a474c3426b7563add4ed899b1a53058f58f85 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d98bc5a00d65c5181e8553af1730aa180a3ce7e --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77479ec34c920991316e9a2f54b5042967556f5c7868774431dbd9b2ac0be80b +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bec33c30df7614af96aeb26ef88da920933ae92 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f6bbdeca69903549531b21f8a31987b5165ad2a1c3a4d2408972da33258feb +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32e0e111d67b6de0b0168cb9d2020bbc65678580 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37d1dbbaf9368cb8ea1b62139b78d00ccf6f34cae566564af10f1cfc81c30db +size 15518615 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e118678add727c9922000d2b968012fae57c0dd --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0ea938c19ba8720867789b5ce032e372c7bfafb05f8dca1425a2e831042a39 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccb2678876939116b8764829395e5aa3756d6b1d --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2013723616309aded12e04476a16a1c7f0326c56e9d2fa2b66acc9d9fa9eaa77 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5d59d895b9098735ed6b2e5508911a06941a5ba --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28f11ae0ea1f7d27a450a1c89ed30b425dde5e9e2bfb466b085523cca5ab30e +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..774c23caee72cb0a530475b42f87803d6180b671 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b13e2c12976b363cb3ca5e77aa829b074e105e5311e79c8685b712a478253b5 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..62027565d7b59ae912a8b7cd36bbde2350678619 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6147860b75167bc72626916685dc09db2414ccacecb2254ff05737bed7a8ce61 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17e3f890ffe244bf2c2de870e225d626e4f1067 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a8e1686e07a6cc1e3b1e7eac2c6d73704238df83786ce753a5ef09cb00632f +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72ca663e79a560158e967547e945a18d4cc96a04 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d156e4c459a3505d18a20d3f41e9b9a15174926303fedc103044ea8ddf956c +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..127754b667d9b228e50f80f607e296959affbc6b --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a49da66b63b015071398fca927e42f6b780f775280bea4c27214291750fb8cb +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd9aae2492e5f09fc147fe3ae9ac29dee7c348f3 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e9098458a254dc1e8e87b2a369d3e931852ff7d2f672471af811a1e2e784a90 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0234ce22fe473a1f9392e4f213947f0cd841c0ec --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4eabfd36cff95fd7f516a42b25ba4a30f26b60a685a91a18aa4dd3050be2d285 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f20363172eadd0d44aa6687e48391607bb156fe --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c2e8888ea8d3ba140bea0b8dd97be867ed5f2d546e8419dfa89e64e701600f +size 15518743 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8633c6824fca6840707196d3340795b6ee6d82e2 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280192d00057993fc5471b88f4bbdbf936202b660fa14e0788d9c578569b2908 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b759cf794560d10727f53b05545b875cb77f9ed3 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f3cc42c550c2ba87593ec90828e60ee04c3fed2304d93f2c72484eae9b57d82 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b3bea7b09f8bdc5b1bd39316ca4781cc02b3d12 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32c207994e67b1c4e9c259bfa76f5c2da27b2902198cfd9967075d56121a7f6 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e95bd94b24855e3fa6889796d9cb53282c1c2790 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59db05b8e6263bd720380f988edba3cfa3679eeda213044809285450f276df0 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6789a2d0c1356d765f6f248c0540ab0b2c506475 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74388fd923555e3e58d43d231a2ad3e48e4bd3525c73bc538f27e1b5ce524c7 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a669e0fdfb27295912526d1c9b906253ee015cc1 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b46ed6740ac9dd1695cd865d4615bfadaf4f226044f6eccd08b8c4ce208d8468 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9b0797b8ff1f54f3f334089b64307db5787215d --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3fb8d4d9fa3ae8ea108e0744d04ccb0cf2d0e03b66c6c287067507b82afaac +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db5c073578803f0c70d9e5012d76c057fdd07b3b --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54f310ebca12b9af8d88d9ee1206cbcd9316eaca78a47b380746c9f39e09c0f9 +size 15518818 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5457f421aba3078167f1e319762ba9a9b5e1a46 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97d26feeea8279da993b35c460310c8214007b4efb20a586ad579a40cf2a8e2f +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6330ef0fa8bc9b4bac5e12580bae5aae6feb685 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9c7320251a7d1bb02150950c3ce1637552e94fcb53afd007a33324860acaaca +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..204ef9ce41326fa1c6e0802e89a2a84d903c614d --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51bbafbc30f777fe032c235531ee77b0a65abb6189c148be7d2c74aa812e1798 +size 15518743 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dbea04f9f5abe3aa94e730a90eee67726ac4eb4 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aacd2c2ba1827d536b9db494337a126504160fdf538e951c071dbfb976531dd +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6b721ec6c54b16693e0573578e812bbc928eb06 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5d3933d578c61e831b59faa0ff61882efafa17eb8f445a752ed9c05a6e5b9d6 +size 15518818 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ab84b833514b0013ab353cd3119199e1885ae98 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:144118f491c7a2286e28b5ddcb2ab0a405c4464916388bfec20bf8fe927a274b +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aeba55178d0dbd944fa921bb27105fc31bb8373a --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7522ae7d97dea486ee0cc2f730b6fb44bac6c4bfcc667a0f452531fc52e46462 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..655045a2b245c605f05f02a1879823d31e14fa71 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:458136a729d4e23f8e205a92b873f894e6ca0806ee8fecc312293b2052b18771 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97b782c3db469fbaa769dc248e3ffa0132d5a28c --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:597ed88ba583ab153633f0142b0f8c61720d7bfa281bc5ecb174dafc90188fb3 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc0176168af806bbc2e9a12c38e0283c1217fcf8 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9712f82bbe6584bc49e5b828ac114cc70e341fbf7f372f0da0d2b70935eab3bd +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4eb2815f5628ba3c2ec64cf2defcf2c13eabe884 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9072c4d6599afedb7a887c7ec4ba4e8aaebe758d3747059c8e2c6ea417abfa +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..840e24bc98a3927804926d0f04b3732526a97046 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b747ca9873d3a562265fc06812f3386e6cfc0e53527d3bc03f76342849f3087 +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..734865d3921242ac89dce8cb116840c5886f488a --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:663376cf34324ca1b4102d6328f1580e83ca83ebb3a9e80099baec3d02e12b4a +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4a28124d6703b3f1b4a6a6f2ab22207038ec753 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d4c04f9de956fb3ec6014cabf3f72258d167b1120b7860815c762fdce1aa96 +size 15518615 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5194be27b44faf841f5a6ec7b49c8cf725cce081 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e66b96bcdfb65769daf26be725f0122cd4f01128edbb0e25f142de6c425b9f +size 15518818 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb23e619c6174435fc1bab4d7bc9127936e6fa04 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e358ed4887de609005061fc5d4ab6c80559690db31c1e98752f99905fd7ac9cf +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be8e8059ed22d75af8d43efb0062e67e7b064ee3 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eac96b8b817eaf1ba4019fa1678cb050d6c59fe6dc0491ecc659f6dd51633d16 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..148cc391d96de4f65907edcccba12025eaa2197a --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4c57e5cbbd70433340ed91682228d985376d79562a2cdb38600626fac39275d +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61b1f57fcfa1c52260040957e2d58cec622e7534 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde2da4b719418d18de79dc3687d721ff25d00618c6ac18877dd7d0dacb880af +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de67f774b138e143e8e32cff0705a4a229bb2344 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16b002f203e5ed9b4ba07217106af611945a21086c7a3f63198bad674bade5c +size 15518626 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ca3b5b38a273ae6a16eafa2b381055b2e7ac5b7 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3132b4462717e0414cf5acd5c06f6178a3091c4e57e0ca5182d957cd5ad01685 +size 15518818 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f87acf385e012682c95ae277bc52313e4b2a257 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea4b51ba180c48f73d6c4a47220cf57e15032a4ff7a06025085358c5f36b6d4 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb2e14f728ed986ea13e76d689480b2f76304679 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3b04ddc161602d64ec7afe2f64f83641afaddc4c2814f4bda49190be3abce9 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19a9335fe23d043bcee64ed8ec0c8700b8fda503 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e3fbc341e9db87ad03b7f97cb1fc89171d7dc456d2c8a558fb1d396175ffc2 +size 15518754 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2dd3b64b60af36be30825a0676f04c014e31887 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e276a45894828bab69cf3950da92cbd9a61153dc325fcdc3e304adee50365027 +size 15518679 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7806f4a1668e5722998c2df1a1dd7e082c327fb6 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e267174020e967680060615f053a10179ac1e6e8f5c13e9e0730dddb45871c20 +size 15518818 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3ec50b15a62f4151a8b7976a310ff8a6bb04875 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f7d5d40d143136d4eef0d6857929b7df4530729560a8f89beb69e22ed8bb18 +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6be18e5020958b4441c3728be5b2e659cb8c3bd9 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf8442c5afd75182b2ad8a617c34672bad4dad94fe2ac0bb25601c76ecb84b7f +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3e91c375dee19329a1e32a48c192c5a520eb14a --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de00b825be5613d3858cbd82718c4d3cc334888dfe3bd0577ee5e57b890272d +size 15518690 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78da25241605cc99f4d1214db90eb24e1c404be5 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c9b3ea21bde7145a9263496cadb4be9e0cd906f7961e0d0d65bb57d43c66ae +size 15518743 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ca4ac78bde47af48f53239fadf7d16bcc88e817 --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69bfb2aaba7ba63ea79034145c656ed7cd450e41ae9e17b554d466274b9fd2fb +size 15518679 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d49ab4dde558e128fb8b70857c71dc391eb983fd --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:941a966acfe05889ef6285912c823b8ccd87778852237820998687dcffc0e957 +size 15518679 diff --git a/83m32b100m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/83m32b100m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bead6c914c31c95f128915405127863c298de60e --- /dev/null +++ b/83m32b100m/global_step115203/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a959ac8842e9efe4064bccc2975e6749488f0a30b9f41c3448ebcb3e0c75129f +size 15518743 diff --git a/83m32b100m/global_step115203/layer_01-model_00-model_states.pt b/83m32b100m/global_step115203/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c671c6d2431eb9e3968a5148695747a6521b7905 --- /dev/null +++ b/83m32b100m/global_step115203/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e8f90ec57aa56739bf1184836c8315b5a06c46daba52f938ee823df5e6cc35 +size 67011843 diff --git a/83m32b100m/global_step115203/layer_03-model_00-model_states.pt b/83m32b100m/global_step115203/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83b35a3250927cf324ff6d4ac2e773fe2e7b59a5 --- /dev/null +++ b/83m32b100m/global_step115203/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342ee6204b3a5472bc0a49a31a9ddae0e9f2f2005f31ff888729b344be6501c6 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_04-model_00-model_states.pt b/83m32b100m/global_step115203/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1eb1e41c032d304e20812fd2311768334e3b8419 --- /dev/null +++ b/83m32b100m/global_step115203/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2969c5dff55eafae7c2ac36e9c1e3c609e069d5cbbc2a4e047522747d634d0e +size 9851395 diff --git a/83m32b100m/global_step115203/layer_05-model_00-model_states.pt b/83m32b100m/global_step115203/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a54952f628478649436f5b52a17fa816dba68bfe --- /dev/null +++ b/83m32b100m/global_step115203/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7a1ca42ef1622c95fb625439481d0c1b20c3292425d1e30db38789f927714e3 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_06-model_00-model_states.pt b/83m32b100m/global_step115203/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f78241728fe3ec915224306ba205a00e9b4bbe10 --- /dev/null +++ b/83m32b100m/global_step115203/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ac82f138ccc6e25e3215831e74689b7e300a97dc609425887c79e220584872d +size 9851395 diff --git a/83m32b100m/global_step115203/layer_07-model_00-model_states.pt b/83m32b100m/global_step115203/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f2dbd0e0ddc3c0549c2ba796836fe5740753865 --- /dev/null +++ b/83m32b100m/global_step115203/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa5a14274ee3ba9a1b3a9a1594d049a2d2bd0b05822e1aa7a4363f975d77b52 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_08-model_00-model_states.pt b/83m32b100m/global_step115203/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..183afefbff6d098f04ce5e68ae7821274d15350a --- /dev/null +++ b/83m32b100m/global_step115203/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb9f91fc161ff5d175fef53adf0f6edf45ed2122e4d1f816747f54f63eccaaf +size 9851395 diff --git a/83m32b100m/global_step115203/layer_09-model_00-model_states.pt b/83m32b100m/global_step115203/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..285cc2598783b1c83f0e51a3e6c92cd5ca739df7 --- /dev/null +++ b/83m32b100m/global_step115203/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4a0f06e14f999b9bd096a542fa7ffa0587bb1cdd4d0739053c8c7fb0745bb6 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_10-model_00-model_states.pt b/83m32b100m/global_step115203/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..323c538933e70160999be9ca73f9d8479e25259e --- /dev/null +++ b/83m32b100m/global_step115203/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d212657bbb624b4d4a7997203bb38ef13939e764fd3d6e36073089dbd72c95d +size 9851395 diff --git a/83m32b100m/global_step115203/layer_11-model_00-model_states.pt b/83m32b100m/global_step115203/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a92b2ae946565f5699ae0143f893f635cb556c86 --- /dev/null +++ b/83m32b100m/global_step115203/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421dd7cfb3e84bf8782474f1f38f4437d5ef7ded6ecaff0449a0f70947369353 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_12-model_00-model_states.pt b/83m32b100m/global_step115203/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e753381aeb858c287c04ef1133314f3052d61e8f --- /dev/null +++ b/83m32b100m/global_step115203/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22335a65ab7a66caf7bad4175631baedd7057a29aeb93775f54c122dae257566 +size 9851395 diff --git a/83m32b100m/global_step115203/layer_14-model_00-model_states.pt b/83m32b100m/global_step115203/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f88a7d3b2cf38390b8a03541493f081293d87707 --- /dev/null +++ b/83m32b100m/global_step115203/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d9a964a8af865b7382b0a6df1898454308aa45322566f7f90aa1c24715c2a53 +size 3779 diff --git a/83m32b100m/global_step115203/mp_rank_00_model_states.pt b/83m32b100m/global_step115203/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0ed24a3eeff92c44140206f78a160cdc83823e8 --- /dev/null +++ b/83m32b100m/global_step115203/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdbcdd187836ed56d48318b93e012c8d99164eff83deebfc2a9bcefa751cb852 +size 31667 diff --git a/83m32b100m/sbatch_83m32b100m.sh b/83m32b100m/sbatch_83m32b100m.sh new file mode 100644 index 0000000000000000000000000000000000000000..ee755dfaca4629f312339de641dd60c3721c874e --- /dev/null +++ b/83m32b100m/sbatch_83m32b100m.sh @@ -0,0 +1,171 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m32b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=20000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +# TRAIN_SAMPLES=9_703_701 +# Tokens: 31633480000 +# -> Samples: 15446035 +TRAIN_SAMPLES=15_446_035 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 154_460 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 100 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 20000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m32b100m/sbatch_83m32b100mval.sh b/83m32b100m/sbatch_83m32b100mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..a8a8857f4a1abd170a55c67fd1f309d8792d2b95 --- /dev/null +++ b/83m32b100m/sbatch_83m32b100mval.sh @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m32b100mval +VARIANT_CKPT=83m32b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train32b.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-load-optim \ + --reset-progress \ + --override-lr-scheduler \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --eval-only true \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954238.nid005749.100452.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954238.nid005749.100452.0 new file mode 100644 index 0000000000000000000000000000000000000000..13e2e3d07ee2ded7c9e28b6f92cff31171253f5b --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954238.nid005749.100452.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc7658d72ad53a9ab1c7c70405f912c68759ab6eb14d22accf30e0561bbc5fd +size 206419797 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954343.nid006724.88857.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954343.nid006724.88857.0 new file mode 100644 index 0000000000000000000000000000000000000000..83fb01c5d195d9d95d38cf09ace5523a2ab9f791 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954343.nid006724.88857.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:291631c97ff068d91f4e51ea0a2c70dcf9252613c8c3f8731c3cf4671822f423 +size 1314960 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954643.nid006724.94809.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954643.nid006724.94809.0 new file mode 100644 index 0000000000000000000000000000000000000000..ac5f2b71b7c73d5780ffa81b71805403ce09f196 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678954643.nid006724.94809.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d2d4ce980413e031d3dd07f2a6c63a20eb8ffa16fcf1fe1be9462297359f881 +size 103672901 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968608.nid006955.98891.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968608.nid006955.98891.0 new file mode 100644 index 0000000000000000000000000000000000000000..6686f429fadbbbbb844bd3c1e9112c8c8f17c971 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968608.nid006955.98891.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a68a83c71b749abfce5a9c3a7ab570315bb3d30c105b3a795bded721e60dc7 +size 40 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968719.nid006724.1013.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968719.nid006724.1013.0 new file mode 100644 index 0000000000000000000000000000000000000000..01c489b25a96ccaf3d30043511ad634c2ce2f484 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678968719.nid006724.1013.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fb989a5cc983afcf0e293e0616d8d8b449967d709df99775f731329b011a20 +size 40 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980330.nid005143.21630.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980330.nid005143.21630.0 new file mode 100644 index 0000000000000000000000000000000000000000..8d779b4ceafc3ddfa6aa6759866f356a3895414a --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980330.nid005143.21630.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0501c5f38f1a57e059c165c9c380414c446cc19ed8eca3d86eff58e20f1a2ae5 +size 21466 diff --git a/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980471.nid006724.36500.0 b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980471.nid006724.36500.0 new file mode 100644 index 0000000000000000000000000000000000000000..f4d8a0315c5d155453a83ff8ea7d1407dd239b78 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100m/events.out.tfevents.1678980471.nid006724.36500.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a197aa4518e0abb729fcfb37f29a349e1fa6518274826b19d83c2d068b8eeb +size 21466 diff --git a/83m32b100m/tensorboard_83m32b100mval/events.out.tfevents.1678986178.nid005143.44647.0 b/83m32b100m/tensorboard_83m32b100mval/events.out.tfevents.1678986178.nid005143.44647.0 new file mode 100644 index 0000000000000000000000000000000000000000..f30e2176fee40600896d7dbe510d80db1dbafe18 --- /dev/null +++ b/83m32b100m/tensorboard_83m32b100mval/events.out.tfevents.1678986178.nid005143.44647.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6b78b4142331890a4a5489ce0a1ed3e025ab920918f3f28c0f49ff118821762 +size 980 diff --git a/83m91b100m/3319473.err b/83m91b100m/3319473.err new file mode 100644 index 0000000000000000000000000000000000000000..da46d4b9d5e7be87da96da50637d7f3cd79ab6e8 --- /dev/null +++ b/83m91b100m/3319473.err @@ -0,0 +1,1116 @@ +5: 2023-03-16 21:10:27.844358: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844362: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844366: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844354: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844355: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:27.844365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846384: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846382: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +1: 2023-03-16 21:10:27.846397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887406: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887403: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887416: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +6: 2023-03-16 21:10:27.887412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899770: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899779: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899777: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +4: 2023-03-16 21:10:27.899777: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918296: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918298: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918303: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918286: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918305: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +2: 2023-03-16 21:10:27.918307: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920400: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920394: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920396: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +7: 2023-03-16 21:10:27.920394: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986910: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986916: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986917: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986911: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986913: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986913: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986904: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +3: 2023-03-16 21:10:27.986903: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027406: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027421: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027413: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027414: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027430: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027427: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +0: 2023-03-16 21:10:28.027441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +5: 2023-03-16 21:10:29.502027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502028: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502034: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:29.502415: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502419: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502425: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502426: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502423: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502427: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502430: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:29.502436: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.611762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611776: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611769: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611774: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.611774: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:29.612168: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612170: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612173: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612173: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612175: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612176: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612177: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +4: 2023-03-16 21:10:29.612177: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613370: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613366: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613374: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:29.613777: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613778: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613782: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613786: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613789: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613790: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613795: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +3: 2023-03-16 21:10:29.613796: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.660709: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660714: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660715: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660722: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.660717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:29.661127: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661130: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661134: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661136: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661139: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661140: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661141: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +1: 2023-03-16 21:10:29.661143: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.667915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667911: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667920: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667921: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667921: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.667932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:29.668328: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668333: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668335: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668337: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668339: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668341: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668342: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +2: 2023-03-16 21:10:29.668346: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.737738: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737745: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737748: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737753: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737755: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.737756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:29.738167: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738171: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738176: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738176: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738179: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738180: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738179: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +7: 2023-03-16 21:10:29.738183: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.775644: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775652: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775653: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775653: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.775659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:29.776186: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776187: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776189: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776192: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776192: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776194: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776198: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +6: 2023-03-16 21:10:29.776200: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778367: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778368: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778364: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778378: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778570: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778577: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778578: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778580: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778581: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778376: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:29.778587: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778590: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +0: 2023-03-16 21:10:29.778600: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +5: 2023-03-16 21:10:33.769198: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769205: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769210: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769210: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.769219: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.769630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: 2023-03-16 21:10:33.769687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769690: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769635: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769645: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769650: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769703: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 21:10:33.769943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769650: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.769707: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: 2023-03-16 21:10:33.769951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +0: 2023-03-16 21:10:33.769659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769952: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.769961: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770544: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770549: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770547: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770554: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.770549: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770399: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770407: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770413: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770420: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.770418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771316: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771367: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771314: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771376: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771366: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771321: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771370: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771317: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771318: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771368: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771323: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.771330: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771371: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +0: 2023-03-16 21:10:33.771385: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771386: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771387: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771390: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771392: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771393: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 21:10:33.771625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771627: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.771687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 21:10:33.771631: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771633: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.771691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 21:10:33.771634: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771639: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 21:10:33.771639: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 21:10:33.771637: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771645: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771696: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 21:10:33.771642: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771649: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 21:10:33.771648: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: 2023-03-16 21:10:33.771654: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 21:10:33.771660: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +1: 2023-03-16 21:10:33.771659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: 2023-03-16 21:10:33.771695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.771701: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +1: 2023-03-16 21:10:33.771677: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771699: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.771704: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +2: 2023-03-16 21:10:33.771712: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771712: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771711: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771714: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771715: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771717: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +2: 2023-03-16 21:10:33.771719: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 21:10:33.772391: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773018: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772397: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773021: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772396: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773021: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772404: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773022: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.773027: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 21:10:33.772398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: 2023-03-16 21:10:33.773030: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772412: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 21:10:33.772413: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 21:10:33.772415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 21:10:33.772416: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: 2023-03-16 21:10:33.772419: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772421: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +4: 2023-03-16 21:10:33.772422: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.773037: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773038: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +4: 2023-03-16 21:10:33.772435: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773039: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773042: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773042: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +6: 2023-03-16 21:10:33.773062: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +6: 2023-03-16 21:10:33.773075: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.784523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784531: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.784541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785974: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785976: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785973: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785979: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785983: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.785985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786458: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786464: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786466: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786477: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786478: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786477: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786484: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786485: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786486: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +3: 2023-03-16 21:10:33.786540: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +3: 2023-03-16 21:10:33.786543: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787883: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787884: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787888: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787893: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787896: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787903: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787907: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787907: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +7: 2023-03-16 21:10:33.787960: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +7: 2023-03-16 21:10:33.787961: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771571: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: 2023-03-16 21:10:33.771330: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 21:10:33.771335: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 21:10:33.771340: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.771340: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 21:10:33.771342: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: 2023-03-16 21:10:33.771585: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 21:10:33.771350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.771352: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +5: 2023-03-16 21:10:33.771369: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +5: 2023-03-16 21:10:33.771369: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +0: Loading extension module scaled_upper_triang_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module scaled_masked_softmax_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module scaled_masked_softmax_cuda... +0: Successfully preprocessed all matching files. +0: Detected CUDA files, patching ldflags +0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... +0: Building extension module fused_mix_prec_layer_norm_cuda... +0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Loading extension module fused_mix_prec_layer_norm_cuda... +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +0: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +1: Successfully preprocessed all matching files. +3: Successfully preprocessed all matching files. +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +1: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +3: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +5: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +7: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +6: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +2: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +4: warnings.warn( +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +0: warnings.warn( +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +7: Building extension module utils... +7: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: +2: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: +4: +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: +6: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: +7: Loading extension module utils... +0: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +1: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +3: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils...Loading extension module utils...Loading extension module utils... +7: +7: +7: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +2: Loading extension module utils... +7: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +4: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +6: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +5: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +0: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: +0: Loading extension module utils... +0: Loading extension module utils... +0: Loading extension module utils... +0: +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +7: No modifications detected for re-loaded extension module utils, skipping build step... +7: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +1: +1: Loading extension module utils... +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +1: +1: Loading extension module utils... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +1: No modifications detected for re-loaded extension module utils, skipping build step... +1: Loading extension module utils... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: +3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +3: No modifications detected for re-loaded extension module utils, skipping build step... +3: Loading extension module utils... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: +2: +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +2: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +6: +6: Loading extension module utils... +2: No modifications detected for re-loaded extension module utils, skipping build step... +2: Loading extension module utils... +6: No modifications detected for re-loaded extension module utils, skipping build step... +6: Loading extension module utils... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +4: +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +4: No modifications detected for re-loaded extension module utils, skipping build step... +4: Loading extension module utils... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +5: +5: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +5: +5: +5: Loading extension module utils...Loading extension module utils... +5: +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +5: No modifications detected for re-loaded extension module utils, skipping build step... +5: Loading extension module utils... +0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +0: No modifications detected for re-loaded extension module utils, skipping build step... +0: Loading extension module utils... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings +0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/83m91b100m/3319473.out b/83m91b100m/3319473.out new file mode 100644 index 0000000000000000000000000000000000000000..4fd41882942a4a644fa29050caf47f66113f4de1 --- /dev/null +++ b/83m91b100m/3319473.out @@ -0,0 +1,4661 @@ +Model parameters: d_model 640 ffw_size 2560 kv_size 64 n_heads 10 n_layers 10 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 --num-layers 10 --hidden-size 640 --num-attention-heads 10 --kv-channels 64 --ffn-hidden-size 2560 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 4 --global-batch-size 256 --train-samples 44_416_143 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --loss-scale 12 --clip-grad 1.0 --kill-switch-path kill-switch-83m91b100m --bf16 --checkpoint-activations --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 44_416_143 --lr-warmup-samples 444_161 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 100 --save-interval 20000 --eval-interval 20000 --eval-iters 1 --tensorboard-dir tensorboard_83m91b100m --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_83m91b100m --load checkpoints_83m91b100m --train-weighted-split-paths-path train100m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --deepspeed --deepspeed_config ds_configs/3319473.json --zero-stage 0 +START 3319473: Thu 16 Mar 2023 09:10:05 PM EET +0: +0: +0: ======================= ROCm System Management Interface ======================= +0: ================================= Concise Info ================================= +0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +0: 0 52.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 1 54.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 2 47.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 4 54.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: 6 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +0: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +0: ================================================================================ +0: ============================= End of ROCm SMI Log ============================== +1: +1: +1: ======================= ROCm System Management Interface ======================= +1: ================================= Concise Info ================================= +1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +1: 0 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 2 49.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 4 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: 6 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +1: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +1: ================================================================================ +1: ============================= End of ROCm SMI Log ============================== +2: +2: +2: ======================= ROCm System Management Interface ======================= +2: ================================= Concise Info ================================= +2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +2: 0 45.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 1 56.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 4 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: 6 43.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +2: 7 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +2: ================================================================================ +2: ============================= End of ROCm SMI Log ============================== +5: +5: +5: ======================= ROCm System Management Interface ======================= +5: ================================= Concise Info ================================= +5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +5: 0 46.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 2 46.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 4 47.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: 6 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +5: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +5: ================================================================================ +5: ============================= End of ROCm SMI Log ============================== +4: +4: +4: ======================= ROCm System Management Interface ======================= +4: ================================= Concise Info ================================= +4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +4: 0 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 4 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: 6 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +4: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +4: ================================================================================ +4: ============================= End of ROCm SMI Log ============================== +7: +7: +7: ======================= ROCm System Management Interface ======================= +7: ================================= Concise Info ================================= +7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +7: 0 48.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 2 49.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 4 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: 6 48.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +7: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +7: ================================================================================ +7: ============================= End of ROCm SMI Log ============================== +3: +3: +3: ======================= ROCm System Management Interface ======================= +3: ================================= Concise Info ================================= +3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +3: 0 48.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 2 51.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 4 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +3: ================================================================================ +3: ============================= End of ROCm SMI Log ============================== +6: +6: +6: ======================= ROCm System Management Interface ======================= +6: ================================= Concise Info ================================= +6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +6: 0 50.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 1 57.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 2 45.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: 6 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +6: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +6: ================================================================================ +6: ============================= End of ROCm SMI Log ============================== +3: Launching on nid007226 (3/8), master nid007223 port 9999, GPUs 8, CUDA: True +6: Launching on nid007229 (6/8), master nid007223 port 9999, GPUs 8, CUDA: True +2: Launching on nid007225 (2/8), master nid007223 port 9999, GPUs 8, CUDA: True +7: Launching on nid007230 (7/8), master nid007223 port 9999, GPUs 8, CUDA: True +5: Launching on nid007228 (5/8), master nid007223 port 9999, GPUs 8, CUDA: True +0: Launching on nid007223 (0/8), master nid007223 port 9999, GPUs 8, CUDA: True +4: Launching on nid007227 (4/8), master nid007223 port 9999, GPUs 8, CUDA: True +1: Launching on nid007224 (1/8), master nid007223 port 9999, GPUs 8, CUDA: True +7: > setting tensorboard ... +0: using world size: 64, data-parallel-size: 64, tensor-model-parallel size: 1, pipeline-model-parallel size: 1 +0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. +0: using torch.bfloat16 for parameters ... +0: ------------------------ arguments ------------------------ +0: abort_on_unmet_fused_kernel_constraints ......... False +0: accumulate_allreduce_grads_in_fp32 .............. True +0: adam_beta1 ...................................... 0.9 +0: adam_beta2 ...................................... 0.999 +0: adam_eps ........................................ 1e-08 +0: adlr_autoresume ................................. False +0: adlr_autoresume_interval ........................ 1000 +0: apply_query_key_layer_scaling ................... True +0: apply_residual_connection_post_layernorm ........ False +0: attention_dropout ............................... 0.1 +0: attention_softmax_in_fp32 ....................... False +0: bert_binary_head ................................ True +0: bert_load ....................................... None +0: bf16 ............................................ True +0: bias_dropout_fusion ............................. True +0: bias_gelu_fusion ................................ True +0: biencoder_projection_dim ........................ 0 +0: biencoder_shared_query_context_model ............ False +0: block_data_path ................................. None +0: checkpoint_activations .......................... True +0: checkpoint_in_cpu ............................... False +0: checkpoint_num_layers ........................... 1 +0: clip_grad ....................................... 1.0 +0: codecarbon_dir .................................. None +0: consumed_train_samples .......................... 0 +0: consumed_train_tokens ........................... 0 +0: consumed_valid_samples .......................... 0 +0: contigious_checkpointing ........................ False +0: cpu_optimizer ................................... False +0: cpu_torch_adam .................................. False +0: curriculum_learning ............................. False +0: data_impl ....................................... mmap +0: data_parallel_size .............................. 64 +0: data_path ....................................... None +0: dataloader_type ................................. single +0: DDP_impl ........................................ local +0: decoder_seq_length .............................. None +0: deepscale ....................................... False +0: deepscale_config ................................ None +0: deepspeed ....................................... True +0: deepspeed_activation_checkpointing .............. False +0: deepspeed_config ................................ ds_configs/3319473.json +0: deepspeed_mpi ................................... False +0: distribute_checkpointed_activations ............. False +0: distributed_backend ............................. nccl +0: embed_layernorm ................................. False +0: embedding_path .................................. None +0: encoder_seq_length .............................. 2048 +0: eod_mask_loss ................................... False +0: eval_interval ................................... 20000 +0: eval_iters ...................................... 1 +0: eval_only ....................................... None +0: evidence_data_path .............................. None +0: exit_duration_in_mins ........................... None +0: exit_interval ................................... None +0: ffn_hidden_size ................................. 2560 +0: finetune ........................................ False +0: fp16 ............................................ False +0: fp16_lm_cross_entropy ........................... False +0: fp32_residual_connection ........................ False +0: gigaflos_no_embeds .............................. 0 +0: global_batch_size ............................... 256 +0: glu_activation .................................. None +0: hidden_dropout .................................. 0.1 +0: hidden_size ..................................... 640 +0: hysteresis ...................................... 2 +0: ict_head_size ................................... None +0: ict_load ........................................ None +0: img_dim ......................................... 224 +0: indexer_batch_size .............................. 128 +0: indexer_log_interval ............................ 1000 +0: inference ....................................... False +0: init_method_std ................................. 0.02 +0: init_method_xavier_uniform ...................... False +0: initial_loss_scale .............................. 4294967296 +0: kill_switch_path ................................ kill-switch-83m91b100m +0: kv_channels ..................................... 64 +0: layer_norm_fusion ............................... True +0: layernorm_epsilon ............................... 1e-05 +0: lazy_mpu_init ................................... None +0: load ............................................ checkpoints_83m91b100m +0: local_rank ...................................... None +0: log_batch_size_to_tensorboard ................... True +0: log_interval .................................... 100 +0: log_learning_rate_to_tensorboard ................ True +0: log_level ....................................... None +0: log_level_replica ............................... None +0: log_loss_scale_to_tensorboard ................... True +0: log_num_zeros_in_grad ........................... False +0: log_params_norm ................................. False +0: log_path ........................................ None +0: log_timers_to_tensorboard ....................... True +0: log_validation_ppl_to_tensorboard ............... True +0: loss_on_targets_only ............................ False +0: loss_scale ...................................... 12.0 +0: loss_scale_window ............................... 1000 +0: lr .............................................. 0.0002 +0: lr_decay_iters .................................. None +0: lr_decay_samples ................................ 44416143 +0: lr_decay_style .................................. cosine +0: lr_decay_tokens ................................. None +0: lr_warmup_fraction .............................. None +0: lr_warmup_iters ................................. 0 +0: lr_warmup_samples ............................... 444161 +0: make_vocab_size_divisible_by .................... 128 +0: mask_prob ....................................... 0.15 +0: masked_softmax_fusion ........................... True +0: max_position_embeddings ......................... 2048 +0: mean_noise_span_length .......................... None +0: memory_centric_tiled_linear ..................... False +0: merge_file ...................................... gpt2/merges.txt +0: micro_batch_size ................................ 4 +0: min_loss_scale .................................. 1.0 +0: min_lr .......................................... 2e-05 +0: mmap_warmup ..................................... False +0: no_load_optim ................................... None +0: no_load_rng ..................................... None +0: no_save_optim ................................... None +0: no_save_rng ..................................... None +0: noise_density ................................... None +0: num_attention_heads ............................. 10 +0: num_channels .................................... 3 +0: num_classes ..................................... 1000 +0: num_layers ...................................... 10 +0: num_layers_per_virtual_pipeline_stage ........... None +0: num_workers ..................................... 2 +0: onnx_safe ....................................... None +0: openai_gelu ..................................... False +0: optimizer ....................................... adam +0: optimizer_fusion ................................ True +0: override_lr_scheduler ........................... False +0: pad_vocab_size_to ............................... None +0: params_dtype .................................... torch.bfloat16 +0: partition_activations ........................... False +0: patch_dim ....................................... 16 +0: pipeline_model_parallel_size .................... 1 +0: position_embedding_type ......................... PositionEmbeddingType.absolute +0: pp_partition_method ............................. None +0: profile_backward ................................ False +0: query_in_block_prob ............................. 0.1 +0: rampup_batch_size ............................... None +0: rank ............................................ 0 +0: remote_device ................................... none +0: reset_attention_mask ............................ False +0: reset_position_ids .............................. False +0: reset_progress .................................. None +0: retriever_report_topk_accuracies ................ [] +0: retriever_score_scaling ......................... False +0: retriever_seq_length ............................ 256 +0: reweight_loss_based_on_position_frequency ....... False +0: sample_rate ..................................... 1.0 +0: save ............................................ checkpoints_83m91b100m +0: save_interval ................................... 20000 +0: scatter_gather_tensors_in_pipeline .............. True +0: scattered_embeddings ............................ False +0: seed ............................................ 1234 +0: seq_length ...................................... 2048 +0: sgd_momentum .................................... 0.9 +0: short_seq_prob .................................. 0.1 +0: skip_train_iteration_range ...................... None +0: split ........................................... None +0: split_transformers .............................. False +0: sync_tp_duplicated_parameters ................... False +0: synchronize_each_layer .......................... False +0: tensor_model_parallel_size ...................... 1 +0: tensorboard_dir ................................. tensorboard_83m91b100m +0: tensorboard_log_interval ........................ 1 +0: tensorboard_queue_size .......................... 5 +0: test_weighted_split_paths ....................... None +0: test_weighted_split_paths_path .................. None +0: tile_factor ..................................... 1 +0: titles_data_path ................................ None +0: tokenizer_name_or_path .......................... None +0: tokenizer_type .................................. GPT2BPETokenizer +0: train_iters ..................................... None +0: train_samples ................................... 44416143 +0: train_tokens .................................... None +0: train_weighted_split_names ...................... ['train'] +0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document']] +0: train_weighted_split_paths_path ................. None +0: train_weighted_split_splits ..................... [['0:1']] +0: train_weighted_split_weights .................... [['1.0']] +0: universal_checkpoint ............................ False +0: use_bnb_optimizer ............................... False +0: use_checkpoint_lr_scheduler ..................... False +0: use_contiguous_buffers_in_ddp ................... True +0: use_cpu_initialization .......................... None +0: use_one_sent_docs ............................... False +0: use_pin_memory .................................. False +0: valid_num_workers ............................... 2 +0: valid_weighted_split_names ...................... ['validation'] +0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] +0: valid_weighted_split_paths_path ................. None +0: valid_weighted_split_splits ..................... [['0:1']] +0: valid_weighted_split_weights .................... [['1.0']] +0: virtual_pipeline_model_parallel_size ............ None +0: vocab_extra_ids ................................. 0 +0: vocab_file ...................................... gpt2/vocab.json +0: weight_decay .................................... 0.1 +0: world_size ...................................... 64 +0: zero_allgather_bucket_size ...................... 0.0 +0: zero_contigious_gradients ....................... False +0: zero_reduce_bucket_size ......................... 0.0 +0: zero_reduce_scatter ............................. False +0: zero_stage ...................................... 0 +0: -------------------- end of arguments --------------------- +0: setting number of micro-batches to constant 1 +0: > building GPT2BPETokenizer tokenizer ... +0: > padded vocab (size: 50257) with 47 dummy tokens (new size: 50304) +0: DeepSpeed general environment info: +0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] +0: torch version .................... 1.13.0+rocm5.2 +0: torch cuda version ............... None +0: torch hip version ................ 5.2.21151-afdc89f8 +0: nvcc version ..................... None +0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] +0: deepspeed info ................... 0.7.5, unknown, unknown +0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 +0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** +0: > initializing torch distributed ... +0: [2023-03-16 21:10:56,979] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +0: > initializing tensor model parallel with size 1 +0: > initializing pipeline model parallel with size 1 +0: > setting random seeds to 1234 ... +0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 +0: > compiling dataset index builder ... +0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: make: Nothing to be done for 'default'. +0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' +0: >>> done with dataset index builder. Compilation time: 0.091 seconds +0: > compiling and loading fused kernels ... +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 102 +0: [1/1] c++ scaled_masked_softmax_hip.cuda.o scaled_masked_softmax_hip.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] +0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] +0: Total number of unsupported CUDA function calls: 0 +0: +0: +0: Total number of replaced kernel launches: 67 +0: ninja: no work to do. +0: >>> done with compiling and loading fused kernels. Compilation time: 23.077 seconds +0: time to initialize megatron (seconds): -37.141 +0: [after megatron is initialized] datetime: 2023-03-16 21:11:22 +0: building GPT model ... +0: [2023-03-16 21:11:22,992] [INFO] [utils.py:827:see_memory_usage] Before Building Model +0: [2023-03-16 21:11:22,992] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB +0: [2023-03-16 21:11:22,992] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.67 GB, percent = 6.1% +0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None +0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=1, model=0): 1, ProcessCoord(pipe=0, data=2, model=0): 2, ProcessCoord(pipe=0, data=3, model=0): 3, ProcessCoord(pipe=0, data=4, model=0): 4, ProcessCoord(pipe=0, data=5, model=0): 5, ProcessCoord(pipe=0, data=6, model=0): 6, ProcessCoord(pipe=0, data=7, model=0): 7, ProcessCoord(pipe=0, data=8, model=0): 8, ProcessCoord(pipe=0, data=9, model=0): 9, ProcessCoord(pipe=0, data=10, model=0): 10, ProcessCoord(pipe=0, data=11, model=0): 11, ProcessCoord(pipe=0, data=12, model=0): 12, ProcessCoord(pipe=0, data=13, model=0): 13, ProcessCoord(pipe=0, data=14, model=0): 14, ProcessCoord(pipe=0, data=15, model=0): 15, ProcessCoord(pipe=0, data=16, model=0): 16, ProcessCoord(pipe=0, data=17, model=0): 17, ProcessCoord(pipe=0, data=18, model=0): 18, ProcessCoord(pipe=0, data=19, model=0): 19, ProcessCoord(pipe=0, data=20, model=0): 20, ProcessCoord(pipe=0, data=21, model=0): 21, ProcessCoord(pipe=0, data=22, model=0): 22, ProcessCoord(pi +0: pe=0, data=23, model=0): 23, ProcessCoord(pipe=0, data=24, model=0): 24, ProcessCoord(pipe=0, data=25, model=0): 25, ProcessCoord(pipe=0, data=26, model=0): 26, ProcessCoord(pipe=0, data=27, model=0): 27, ProcessCoord(pipe=0, data=28, model=0): 28, ProcessCoord(pipe=0, data=29, model=0): 29, ProcessCoord(pipe=0, data=30, model=0): 30, ProcessCoord(pipe=0, data=31, model=0): 31, ProcessCoord(pipe=0, data=32, model=0): 32, ProcessCoord(pipe=0, data=33, model=0): 33, ProcessCoord(pipe=0, data=34, model=0): 34, ProcessCoord(pipe=0, data=35, model=0): 35, ProcessCoord(pipe=0, data=36, model=0): 36, ProcessCoord(pipe=0, data=37, model=0): 37, ProcessCoord(pipe=0, data=38, model=0): 38, ProcessCoord(pipe=0, data=39, model=0): 39, ProcessCoord(pipe=0, data=40, model=0): 40, ProcessCoord(pipe=0, data=41, model=0): 41, ProcessCoord(pipe=0, data=42, model=0): 42, ProcessCoord(pipe=0, data=43, model=0): 43, ProcessCoord(pipe=0, data=44, model=0): 44, ProcessCoord(pipe=0, data=45, model=0): 45, ProcessCoord(pipe=0, data=4 +0: 6, model=0): 46, ProcessCoord(pipe=0, data=47, model=0): 47, ProcessCoord(pipe=0, data=48, model=0): 48, ProcessCoord(pipe=0, data=49, model=0): 49, ProcessCoord(pipe=0, data=50, model=0): 50, ProcessCoord(pipe=0, data=51, model=0): 51, ProcessCoord(pipe=0, data=52, model=0): 52, ProcessCoord(pipe=0, data=53, model=0): 53, ProcessCoord(pipe=0, data=54, model=0): 54, ProcessCoord(pipe=0, data=55, model=0): 55, ProcessCoord(pipe=0, data=56, model=0): 56, ProcessCoord(pipe=0, data=57, model=0): 57, ProcessCoord(pipe=0, data=58, model=0): 58, ProcessCoord(pipe=0, data=59, model=0): 59, ProcessCoord(pipe=0, data=60, model=0): 60, ProcessCoord(pipe=0, data=61, model=0): 61, ProcessCoord(pipe=0, data=62, model=0): 62, ProcessCoord(pipe=0, data=63, model=0): 63} +0: [2023-03-16 21:11:24,984] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer +0: stage=0 layers=17 +0: 0: _to_float16 +0: 1: EmbeddingPipe +0: 2: +0: 3: ParallelTransformerLayerPipe +0: 4: ParallelTransformerLayerPipe +0: 5: ParallelTransformerLayerPipe +0: 6: ParallelTransformerLayerPipe +0: 7: ParallelTransformerLayerPipe +0: 8: ParallelTransformerLayerPipe +0: 9: ParallelTransformerLayerPipe +0: 10: ParallelTransformerLayerPipe +0: 11: ParallelTransformerLayerPipe +0: 12: ParallelTransformerLayerPipe +0: 13: undo +0: 14: MixedFusedLayerNorm +0: 15: EmbeddingPipe +0: 16: float16_to_fp32 +0: loss: CrossEntropy +0: [2023-03-16 21:11:25,168] [INFO] [utils.py:827:see_memory_usage] After Building Model +0: [2023-03-16 21:11:25,169] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 21:11:25,169] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.68 GB, percent = 6.1% +0: setting training iterations to 173500 +0: > learning rate decay style: cosine +0: DeepSpeed is enabled. +0: [2023-03-16 21:11:25,170] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +0: [2023-03-16 21:11:38,085] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +0: [2023-03-16 21:11:38,086] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer +0: [2023-03-16 21:11:38,086] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer +0: [2023-03-16 21:11:38,088] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +0: [2023-03-16 21:11:38,088] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +0: [2023-03-16 21:11:38,208] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer +0: [2023-03-16 21:11:38,209] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 21:11:38,209] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.36 GB, percent = 6.2% +7: ninja: no work to do. +7: Time to load utils op: 0.1621694564819336 seconds +0: Time to load utils op: 0.10976934432983398 seconds +7: Time to load utils op: 0.0006387233734130859 seconds +0: Time to load utils op: 0.1026308536529541 seconds +0: Time to load utils op: 0.10279107093811035 seconds +0: Time to load utils op: 0.10291600227355957 seconds +0: Time to load utils op: 0.10286068916320801 seconds +0: Time to load utils op: 0.10342860221862793 seconds +0: Time to load utils op: 0.10326743125915527 seconds +0: Time to load utils op: 0.1036367416381836 seconds +1: Time to load utils op: 0.10945653915405273 seconds +1: Time to load utils op: 0.10638737678527832 seconds +1: Time to load utils op: 0.10680103302001953 secondsTime to load utils op: 0.10617542266845703 secondsTime to load utils op: 0.10692763328552246 seconds +1: +1: +1: Time to load utils op: 0.10717391967773438 seconds +1: Time to load utils op: 0.10730552673339844 seconds +1: Time to load utils op: 0.10698080062866211 seconds +7: Time to load utils op: 0.10192084312438965 seconds +7: Time to load utils op: 0.10224723815917969 secondsTime to load utils op: 0.10274672508239746 secondsTime to load utils op: 0.10271739959716797 seconds +7: +7: +7: Time to load utils op: 0.10248589515686035 seconds +7: Time to load utils op: 0.10186767578125 seconds +7: Time to load utils op: 0.10235285758972168 seconds +3: Time to load utils op: 0.11167645454406738 seconds +3: Time to load utils op: 0.11154389381408691 seconds +3: Time to load utils op: 0.11151432991027832 secondsTime to load utils op: 0.11202597618103027 seconds +3: +3: Time to load utils op: 0.1115880012512207 seconds +3: Time to load utils op: 0.11150836944580078 seconds +3: Time to load utils op: 0.11154341697692871 seconds +3: Time to load utils op: 0.11171507835388184 seconds +2: Time to load utils op: 0.11095643043518066 secondsTime to load utils op: 0.11093783378601074 seconds +2: Time to load utils op: 0.1109628677368164 seconds +2: +2: Time to load utils op: 0.11094999313354492 secondsTime to load utils op: 0.11095690727233887 seconds +2: Time to load utils op: 0.11095213890075684 seconds +2: Time to load utils op: 0.11095929145812988 secondsTime to load utils op: 0.11095595359802246 seconds +2: +2: +4: Time to load utils op: 0.1103515625 seconds +4: Time to load utils op: 0.11036849021911621 seconds +4: Time to load utils op: 0.11041569709777832 seconds +4: Time to load utils op: 0.11039876937866211 seconds +4: Time to load utils op: 0.11041951179504395 secondsTime to load utils op: 0.11040997505187988 secondsTime to load utils op: 0.11041259765625 seconds +4: +4: Time to load utils op: 0.11041498184204102 seconds +4: +6: Time to load utils op: 0.10982108116149902 seconds +6: Time to load utils op: 0.10980343818664551 secondsTime to load utils op: 0.1098322868347168 seconds +6: +6: Time to load utils op: 0.10984420776367188 seconds +6: Time to load utils op: 0.10983920097351074 secondsTime to load utils op: 0.1098325252532959 seconds +6: +6: Time to load utils op: 0.10987710952758789 seconds +6: Time to load utils op: 0.10984563827514648 seconds +5: Time to load utils op: 0.11023211479187012 seconds +5: Time to load utils op: 0.11024212837219238 seconds +5: Time to load utils op: 0.11028552055358887 seconds +5: Time to load utils op: 0.11026835441589355 secondsTime to load utils op: 0.11029243469238281 seconds +5: +5: Time to load utils op: 0.11028432846069336 seconds +5: Time to load utils op: 0.11031675338745117 secondsTime to load utils op: 0.11028456687927246 seconds +5: +7: Time to load utils op: 0.0004200935363769531 seconds +7: Time to load utils op: 0.0003845691680908203 seconds +7: Time to load utils op: 0.00036215782165527344 seconds +7: Time to load utils op: 0.0003707408905029297 seconds +7: Time to load utils op: 0.0004527568817138672 seconds +7: Time to load utils op: 0.0005095005035400391 seconds +0: Time to load utils op: 0.0005290508270263672 seconds +0: Time to load utils op: 0.0005958080291748047 seconds +0: Time to load utils op: 0.0006556510925292969 secondsTime to load utils op: 0.0006320476531982422 secondsTime to load utils op: 0.0006303787231445312 seconds +0: +0: +0: Time to load utils op: 0.0004904270172119141 seconds +0: Time to load utils op: 0.0004563331604003906 seconds +7: Time to load utils op: 0.0006608963012695312 seconds +1: Time to load utils op: 0.0004382133483886719 seconds +1: Time to load utils op: 0.0004012584686279297 seconds +1: Time to load utils op: 0.0004355907440185547 seconds +1: Time to load utils op: 0.00040531158447265625 seconds +1: Time to load utils op: 0.00042700767517089844 secondsTime to load utils op: 0.000408172607421875 seconds +1: +1: Time to load utils op: 0.00040221214294433594 seconds +1: Time to load utils op: 0.0003917217254638672 seconds +3: Time to load utils op: 0.0008485317230224609 seconds +3: Time to load utils op: 0.0011484622955322266 seconds +3: Time to load utils op: 0.0012035369873046875 seconds +3: Time to load utils op: 0.0012950897216796875 seconds +3: Time to load utils op: 0.0013949871063232422 secondsTime to load utils op: 0.0014011859893798828 seconds +3: +3: Time to load utils op: 0.0012464523315429688 seconds +3: Time to load utils op: 0.0014510154724121094 seconds +2: Time to load utils op: 0.0009479522705078125 seconds +6: Time to load utils op: 0.0009236335754394531 seconds +6: Time to load utils op: 0.0009379386901855469 seconds +2: Time to load utils op: 0.0011653900146484375 seconds +6: Time to load utils op: 0.0011394023895263672 seconds +6: Time to load utils op: 0.0011713504791259766 seconds +2: Time to load utils op: 0.0012965202331542969 seconds +2: Time to load utils op: 0.0013284683227539062 seconds +2: Time to load utils op: 0.001302480697631836 seconds +2: Time to load utils op: 0.0013051033020019531 seconds +6: Time to load utils op: 0.0012552738189697266 seconds +2: Time to load utils op: 0.0013363361358642578 seconds +6: Time to load utils op: 0.0012903213500976562 seconds +2: Time to load utils op: 0.0013546943664550781 seconds +6: Time to load utils op: 0.001262664794921875 seconds +6: Time to load utils op: 0.0013272762298583984 seconds +4: Time to load utils op: 0.0010635852813720703 secondsTime to load utils op: 0.0010280609130859375 seconds +4: +4: Time to load utils op: 0.0013003349304199219 seconds +4: Time to load utils op: 0.0013451576232910156 seconds +4: Time to load utils op: 0.001356363296508789 seconds +4: Time to load utils op: 0.0013604164123535156 seconds +4: Time to load utils op: 0.0012791156768798828 seconds +4: Time to load utils op: 0.0014240741729736328 seconds +5: Time to load utils op: 0.0007305145263671875 seconds +5: Time to load utils op: 0.0006229877471923828 seconds +5: Time to load utils op: 0.0005125999450683594 seconds +5: Time to load utils op: 0.0005481243133544922 seconds +5: Time to load utils op: 0.0004773139953613281 seconds +5: Time to load utils op: 0.0008428096771240234 seconds +5: Time to load utils op: 0.0008206367492675781 seconds +5: Time to load utils op: 0.0005338191986083984 seconds +0: [2023-03-16 21:11:38,450] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 +0: [2023-03-16 21:11:38,451] [INFO] [utils.py:828:see_memory_usage] MA 0.16 GB Max_MA 0.16 GB CA 0.17 GB Max_CA 0 GB +0: [2023-03-16 21:11:38,451] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:38,568] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 +0: [2023-03-16 21:11:38,568] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 21:11:38,568] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:38,672] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 +0: [2023-03-16 21:11:38,673] [INFO] [utils.py:828:see_memory_usage] MA 0.37 GB Max_MA 0.37 GB CA 0.48 GB Max_CA 0 GB +0: [2023-03-16 21:11:38,673] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:38,778] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 +0: [2023-03-16 21:11:38,779] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:38,779] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:38,881] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 +0: [2023-03-16 21:11:38,882] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:38,882] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:38,987] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 +0: [2023-03-16 21:11:38,987] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:38,987] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:39,091] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer +0: [2023-03-16 21:11:39,091] [INFO] [utils.py:828:see_memory_usage] MA 0.47 GB Max_MA 0.47 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:39,092] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:39,199] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer +0: [2023-03-16 21:11:39,200] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:39,200] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:39,303] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer +0: [2023-03-16 21:11:39,304] [INFO] [utils.py:828:see_memory_usage] MA 0.48 GB Max_MA 0.48 GB CA 0.58 GB Max_CA 1 GB +0: [2023-03-16 21:11:39,304] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.5 GB, percent = 6.3% +0: [2023-03-16 21:11:39,304] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +0: [2023-03-16 21:11:39,304] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler +0: [2023-03-16 21:11:39,304] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +0: [2023-03-16 21:11:39,304] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] +0: [2023-03-16 21:11:39,304] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] activation_checkpointing_config { +0: "partition_activations": false, +0: "contiguous_memory_optimization": false, +0: "cpu_checkpointing": false, +0: "number_checkpoints": null, +0: "synchronize_checkpoint_boundary": false, +0: "profile": false +0: } +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] amp_enabled .................. False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] amp_params ................... False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] autotuning_config ............ { +0: "enabled": false, +0: "start_step": null, +0: "end_step": null, +0: "metric_path": null, +0: "arg_mappings": null, +0: "metric": "throughput", +0: "model_info": null, +0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", +0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", +0: "overwrite": true, +0: "fast": true, +0: "start_profile_step": 3, +0: "end_profile_step": 5, +0: "tuner_type": "gridsearch", +0: "tuner_early_stopping": 5, +0: "tuner_num_trials": 50, +0: "model_info_path": null, +0: "mp_size": 1, +0: "max_train_batch_size": null, +0: "min_train_batch_size": 1, +0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, +0: "min_train_micro_batch_size_per_gpu": 1, +0: "num_tuning_micro_batch_sizes": 3 +0: } +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] bfloat16_enabled ............. True +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] comms_config ................. +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] communication_data_type ...... None +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa +0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] curriculum_enabled ........... False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] curriculum_params ............ False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] dataloader_drop_last ......... False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] disable_allgather ............ False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] dump_state ................... False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 +0: [2023-03-16 21:11:39,305] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] elasticity_enabled ........... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] flops_profiler_config ........ { +0: "enabled": false, +0: "profile_step": 1, +0: "module_depth": -1, +0: "top_modules": 1, +0: "detailed": true, +0: "output_file": null +0: } +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] fp16_auto_cast ............... None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] fp16_enabled ................. False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] global_rank .................. 0 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 1 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] load_universal_checkpoint .... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] loss_scale ................... 1.0 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] memory_breakdown ............. False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] monitor_config ............... +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] nebula_config ................ { +0: "enabled": false, +0: "persistent_storage_path": null, +0: "persistent_time_interval": 100, +0: "num_of_version_in_retention": 2, +0: "enable_nebula_load": true, +0: "load_path": null +0: } +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] optimizer_name ............... None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] optimizer_params ............. None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] pld_enabled .................. False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] pld_params ................... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] prescale_gradients ........... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] scheduler_name ............... None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] scheduler_params ............. None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] sparse_attention ............. None +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] steps_per_print .............. 2000 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] train_batch_size ............. 256 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 4 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] use_node_local_storage ....... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] world_size ................... 64 +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] zero_enabled ................. False +0: [2023-03-16 21:11:39,306] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 +0: [2023-03-16 21:11:39,307] [INFO] [config.py:996:print_user_config] json = { +0: "train_micro_batch_size_per_gpu": 4, +0: "train_batch_size": 256, +0: "gradient_clipping": 1.0, +0: "zero_optimization": { +0: "stage": 0 +0: }, +0: "bf16": { +0: "enabled": true +0: }, +0: "steps_per_print": 2.000000e+03, +0: "wall_clock_breakdown": false +0: } +0: Time to load utils op: 0.0004258155822753906 seconds +0: [2023-03-16 21:11:39,307] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=1 micro_batch_size=4 +0: [2023-03-16 21:11:39,359] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=17 [0, 17) STAGE_PARAMS=82741760 (82.742M) TOTAL_PARAMS=82741760 (82.742M) UNIQUE_PARAMS=82741760 (82.742M) +0: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +2: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +1: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +7: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +6: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +7: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +3: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +1: [2023-03-16 21:11:39,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +5: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +6: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +2: [2023-03-16 21:11:39,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +4: [2023-03-16 21:11:39,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:11:39,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +3: [2023-03-16 21:11:39,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +6: [2023-03-16 21:11:39,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +5: [2023-03-16 21:11:39,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +7: [2023-03-16 21:11:39,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +4: [2023-03-16 21:11:39,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +1: [2023-03-16 21:11:39,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +2: [2023-03-16 21:11:39,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,520] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +6: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +7: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +2: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +5: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +1: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +4: [2023-03-16 21:11:39,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +3: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +1: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +5: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +4: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,569] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +6: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +3: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +7: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +3: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +6: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +5: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +4: [2023-03-16 21:11:39,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +1: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +3: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +6: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +1: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +5: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +2: [2023-03-16 21:11:39,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +7: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,588] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +7: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +2: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +2: [2023-03-16 21:11:39,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +4: [2023-03-16 21:11:39,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +2: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +5: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +1: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +2: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +7: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +7: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +6: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +1: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +6: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +5: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +4: [2023-03-16 21:11:39,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +3: [2023-03-16 21:11:39,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:11:39,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +2: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +2: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +7: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +7: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +1: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +6: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +1: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +3: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +4: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +6: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +4: [2023-03-16 21:11:39,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:11:39,646] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +3: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +5: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +4: [2023-03-16 21:11:39,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +5: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +3: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +2: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +6: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +1: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +7: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +2: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +4: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +1: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +7: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +6: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +3: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +5: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:11:39,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +6: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +2: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +1: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +3: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +2: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +4: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +1: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +4: [2023-03-16 21:11:39,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +6: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +3: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +7: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:11:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,712] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,714] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +2: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +7: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +1: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +6: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +4: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +5: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +1: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +7: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +3: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +5: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +6: [2023-03-16 21:11:39,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +4: [2023-03-16 21:11:39,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +3: [2023-03-16 21:11:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:11:39,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +2: [2023-03-16 21:11:39,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,736] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +2: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +2: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +2: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +7: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +7: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +4: [2023-03-16 21:11:39,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +1: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +6: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +1: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +4: [2023-03-16 21:11:39,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +5: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +5: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +3: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +6: [2023-03-16 21:11:39,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:11:39,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +1: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +4: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +6: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +7: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +7: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:11:39,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +5: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +2: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +4: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +6: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +7: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +2: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +3: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +4: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +6: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +1: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +4: [2023-03-16 21:11:39,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +7: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +2: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +4: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +4: [2023-03-16 21:11:39,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +6: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +3: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +3: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +3: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +1: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +5: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: > using checkpoint value 0.0002 for learning rate +0: > using checkpoint value 2e-05 for minimum learning rate +5: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: > using checkpoint value 444161 for warmup iterations +0: > using checkpoint value 44416143 for total number of iterations +0: > using checkpoint value cosine for decay style +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +5: [2023-03-16 21:11:39,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:11:39,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:11:39,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:11:39,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:11:39,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:11:39,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:11:39,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:11:39,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,850] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 24 +2: [2023-03-16 21:11:39,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,852] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 20 +7: [2023-03-16 21:11:39,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,855] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 62 +3: [2023-03-16 21:11:39,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,857] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 29 +6: [2023-03-16 21:11:39,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,858] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 51 +5: [2023-03-16 21:11:39,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,860] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 45 +2: [2023-03-16 21:11:39,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 23 +1: [2023-03-16 21:11:39,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 15 +2: [2023-03-16 21:11:39,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 13 +1: [2023-03-16 21:11:39,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 16 +1: [2023-03-16 21:11:39,861] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 9 +7: [2023-03-16 21:11:39,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,865] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 63 +1: [2023-03-16 21:11:39,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,865] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 11 +7: [2023-03-16 21:11:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,866] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 60 +2: [2023-03-16 21:11:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,866] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 21 +2: [2023-03-16 21:11:39,866] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 22 +6: [2023-03-16 21:11:39,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,866] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 53 +3: [2023-03-16 21:11:39,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,867] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 28 +0: [2023-03-16 21:11:39,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:11:39,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,859] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 0 +0: [2023-03-16 21:11:39,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,860] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 4 +6: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 52 +2: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 18 +2: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 19 +3: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 26 +6: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 48 +3: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 25 +4: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 39 +4: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 36 +5: [2023-03-16 21:11:39,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 40 +5: [2023-03-16 21:11:39,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 44 +5: [2023-03-16 21:11:39,870] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 41 +3: [2023-03-16 21:11:39,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,870] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 27 +6: [2023-03-16 21:11:39,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,870] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 49 +4: [2023-03-16 21:11:39,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 34 +3: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,871] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 30 +7: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,871] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 58 +7: [2023-03-16 21:11:39,871] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 61 +7: [2023-03-16 21:11:39,871] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 59 +4: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,872] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 38 +5: [2023-03-16 21:11:39,873] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 42 +6: [2023-03-16 21:11:39,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,873] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 54 +7: [2023-03-16 21:11:39,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,874] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 57 +5: [2023-03-16 21:11:39,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,874] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 46 +4: [2023-03-16 21:11:39,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,874] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 32 +7: [2023-03-16 21:11:39,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:11:39,875] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 56 +5: [2023-03-16 21:11:39,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:39,875] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 47 +6: [2023-03-16 21:11:39,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,876] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 55 +6: [2023-03-16 21:11:39,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:11:39,877] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 50 +1: [2023-03-16 21:11:39,877] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 12 +1: [2023-03-16 21:11:39,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,877] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 14 +4: [2023-03-16 21:11:39,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,877] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 35 +4: [2023-03-16 21:11:39,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,877] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 33 +4: [2023-03-16 21:11:39,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:11:39,878] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 37 +3: [2023-03-16 21:11:39,887] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 28 +1: [2023-03-16 21:11:39,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,897] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 8 +3: [2023-03-16 21:11:39,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:11:39,906] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 31 +0: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 7 +0: [2023-03-16 21:11:39,868] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 5 +0: [2023-03-16 21:11:39,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,869] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 1 +0: [2023-03-16 21:11:39,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,871] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 6 +0: [2023-03-16 21:11:39,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,906] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 3 +0: [2023-03-16 21:11:39,906] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 6 +0: [2023-03-16 21:11:39,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:11:39,907] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 2 +1: [2023-03-16 21:11:39,915] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 14 +1: [2023-03-16 21:11:39,916] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 12 +6: [2023-03-16 21:11:39,936] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 54 +6: [2023-03-16 21:11:39,938] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 53 +6: [2023-03-16 21:11:39,939] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 51 +1: [2023-03-16 21:11:39,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:11:39,941] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 10 +1: [2023-03-16 21:11:39,950] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 9 +3: [2023-03-16 21:11:39,953] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 31 +0: [2023-03-16 21:11:39,949] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 5 +6: [2023-03-16 21:11:39,969] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 49 +1: [2023-03-16 21:11:39,971] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 13 +0: [2023-03-16 21:11:39,972] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 7 +4: [2023-03-16 21:11:39,974] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 39 +4: [2023-03-16 21:11:39,977] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 38 +1: [2023-03-16 21:11:39,982] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 11 +0: [2023-03-16 21:11:39,982] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 0 +0: checkpoint version 3.0 +1: [2023-03-16 21:11:39,983] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 10 +0: [2023-03-16 21:11:39,988] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 1 +4: [2023-03-16 21:11:39,990] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 36 +0: [2023-03-16 21:11:39,998] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 4 +1: [2023-03-16 21:11:40,001] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 15 +5: [2023-03-16 21:11:40,002] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 44 +6: [2023-03-16 21:11:40,008] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 52 +6: [2023-03-16 21:11:40,012] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 48 +2: [2023-03-16 21:11:40,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:11:40,026] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 17 +5: [2023-03-16 21:11:40,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:11:40,027] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 64 ZeRO state_dicts for rank 43 +5: [2023-03-16 21:11:40,033] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 42 +2: [2023-03-16 21:11:40,043] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 16 +5: [2023-03-16 21:11:40,051] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 41 +5: [2023-03-16 21:11:40,058] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 43 +2: [2023-03-16 21:11:40,058] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 21 +1: [2023-03-16 21:11:40,076] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 8 +7: [2023-03-16 21:11:40,083] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 56 +2: [2023-03-16 21:11:40,086] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 23 +2: [2023-03-16 21:11:40,090] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 18 +7: [2023-03-16 21:11:40,099] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 57 +5: [2023-03-16 21:11:40,113] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 47 +5: [2023-03-16 21:11:40,113] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 46 +5: [2023-03-16 21:11:40,113] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 45 +2: [2023-03-16 21:11:40,119] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 22 +0: [2023-03-16 21:11:40,128] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 3 +2: [2023-03-16 21:11:40,132] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 20 +2: [2023-03-16 21:11:40,135] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 19 +0: [2023-03-16 21:11:40,140] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 2 +7: [2023-03-16 21:11:40,156] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 63 +5: [2023-03-16 21:11:40,157] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 40 +7: [2023-03-16 21:11:40,160] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 61 +7: [2023-03-16 21:11:40,160] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 62 +7: [2023-03-16 21:11:40,172] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 58 +3: [2023-03-16 21:11:40,176] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 30 +7: [2023-03-16 21:11:40,177] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 59 +7: [2023-03-16 21:11:40,181] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 60 +3: [2023-03-16 21:11:40,190] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 27 +3: [2023-03-16 21:11:40,192] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 26 +6: [2023-03-16 21:11:40,204] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 50 +3: [2023-03-16 21:11:40,204] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 25 +4: [2023-03-16 21:11:40,218] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 32 +3: [2023-03-16 21:11:40,234] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 24 +6: [2023-03-16 21:11:40,235] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 55 +4: [2023-03-16 21:11:40,241] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 37 +4: [2023-03-16 21:11:40,251] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 35 +4: [2023-03-16 21:11:40,259] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 33 +4: [2023-03-16 21:11:40,271] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 34 +2: [2023-03-16 21:11:40,319] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 17 +3: [2023-03-16 21:11:40,325] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 64 zero partition checkpoints for rank 29 +0: successfully loaded checkpoint from checkpoints_83m91b100m at iteration 173500 +7: time (ms) | load-checkpoint: 1082.63 +0: estimated model parameters: 0.08274176 +0: estimated model parameters without embeddings: 0.04923648 +0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 21:11:40 +0: > building train, validation, and test datasets ... +0: > datasets target sizes (minimum size): +0: train: 44416143 +0: validation: 2304 +0: test: 256 +0: > building train, validation, and test datasets for GPT ... +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.008298 seconds +0: number of documents: 208931 +0: > dataset split: +0: train: +0: document indices in [0, 208931) total of 208931 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_100M_text_document_train_indexmap_44416143ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.030 seconds +0: total number of samples: 44461248 +0: total number of epochs: 911 +0: > building dataset index ... +0: reading sizes... +0: reading pointers... +0: reading document index... +0: creating numpy buffer of mmap... +0: creating memory view of numpy buffer... +0: > finished creating indexed dataset in 0.034618 seconds +0: number of documents: 364608 +0: > dataset split: +0: validation: +0: document indices in [0, 364608) total of 364608 documents +0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2304ns_2048sl_1234s_doc_idx.npy +0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2304ns_2048sl_1234s_sample_idx.npy +0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_2304ns_2048sl_1234s_shuffle_idx.npy +0: loaded indexed file in 0.034 seconds +0: total number of samples: 84978 +0: total number of epochs: 1 +0: > finished creating GPT datasets ... +0: [after dataloaders are built] datetime: 2023-03-16 21:11:54 +0: done with setup ... +0: training ... +0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: +7: time (ms) | model-and-optimizer-setup: 17716.78 | train/valid/test-data-iterators-setup: 13157.19 +0: [000-000] 0.0827B / 0.0492B +0: [before the start of training step] datetime: 2023-03-16 21:11:54 +0: [after training is done] datetime: 2023-03-16 21:11:54 +0: [2023-03-16 21:11:54,722] [INFO] [checkpointing.py:553:forward] Activation Checkpointing Information +0: [2023-03-16 21:11:54,722] [INFO] [checkpointing.py:554:forward] ----Partition Activations False, CPU CHECKPOINTING False +0: [2023-03-16 21:11:54,722] [INFO] [checkpointing.py:557:forward] ----contiguous Memory Checkpointing False with None total layers +0: [2023-03-16 21:11:54,722] [INFO] [checkpointing.py:560:forward] ----Synchronization False +0: [2023-03-16 21:11:54,722] [INFO] [checkpointing.py:561:forward] ----Profiling time in checkpointing False +0: saving checkpoint at iteration 173500 to checkpoints_83m91b100m +7: ----------------------------------------------------------------------------------------------------------------- +7: validation loss at the end of training for val data | lm loss value: 3.725194E+00 | lm loss PPL: 4.147926E+01 | +7: ----------------------------------------------------------------------------------------------------------------- +0: [2023-03-16 21:12:01,245] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step173500 is begin to save! +0: [2023-03-16 21:12:01,262] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt... +0: [2023-03-16 21:12:01,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_01-model_00-model_states.pt. +0: [2023-03-16 21:12:01,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt... +0: [2023-03-16 21:12:01,363] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_03-model_00-model_states.pt. +0: [2023-03-16 21:12:01,363] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt... +0: [2023-03-16 21:12:01,377] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_04-model_00-model_states.pt. +0: [2023-03-16 21:12:01,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt... +0: [2023-03-16 21:12:01,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_05-model_00-model_states.pt. +0: [2023-03-16 21:12:01,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt... +0: [2023-03-16 21:12:01,406] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_06-model_00-model_states.pt. +0: [2023-03-16 21:12:01,406] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt... +0: [2023-03-16 21:12:01,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_07-model_00-model_states.pt. +0: [2023-03-16 21:12:01,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt... +0: [2023-03-16 21:12:01,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_08-model_00-model_states.pt. +0: [2023-03-16 21:12:01,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt... +0: [2023-03-16 21:12:01,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_09-model_00-model_states.pt. +0: [2023-03-16 21:12:01,449] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt... +0: [2023-03-16 21:12:01,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_10-model_00-model_states.pt. +0: [2023-03-16 21:12:01,464] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt... +0: [2023-03-16 21:12:01,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_11-model_00-model_states.pt. +0: [2023-03-16 21:12:01,478] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt... +0: [2023-03-16 21:12:01,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_12-model_00-model_states.pt. +0: [2023-03-16 21:12:01,493] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt... +0: [2023-03-16 21:12:01,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/layer_14-model_00-model_states.pt. +0: [2023-03-16 21:12:01,495] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt +0: [2023-03-16 21:12:01,495] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt... +0: [2023-03-16 21:12:01,499] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/mp_rank_00_model_states.pt. +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... +5: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... +4: [2023-03-16 21:12:01,516] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... +0: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... +3: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... +7: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... +2: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... +6: [2023-03-16 21:12:01,517] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... +1: [2023-03-16 21:12:01,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +6: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,577] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +2: [2023-03-16 21:12:01,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +7: [2023-03-16 21:12:01,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. +7: [2023-03-16 21:12:01,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt +7: [2023-03-16 21:12:01,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +3: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. +0: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: [2023-03-16 21:12:01,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +2: [2023-03-16 21:12:01,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt +0: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. +2: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +3: [2023-03-16 21:12:01,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt +3: [2023-03-16 21:12:01,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +4: [2023-03-16 21:12:01,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. +4: [2023-03-16 21:12:01,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt +4: [2023-03-16 21:12:01,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +1: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. +1: [2023-03-16 21:12:01,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +1: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +5: [2023-03-16 21:12:01,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. +5: [2023-03-16 21:12:01,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt +5: [2023-03-16 21:12:01,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step173500 is ready now! +0: successfully saved checkpoint at iteration 173500 to checkpoints_83m91b100m +END 3319473: Thu 16 Mar 2023 09:12:05 PM EET diff --git a/83m91b100m/eval.txt b/83m91b100m/eval.txt new file mode 100644 index 0000000000000000000000000000000000000000..98c7952bf8c1b6da6cd3ab667631273048290244 --- /dev/null +++ b/83m91b100m/eval.txt @@ -0,0 +1 @@ +3.743505E+00 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..981de1c77480b0a03ca1f8e216029a525657d011 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06485eb13338b05b438762f37409576dc0e4136525055b46d61602665e4deab9 +size 15518743 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..deae0e4cd9d62ea9f665b1f350851d6c18370130 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f048079f1df5ffdd78a5ca099779427699f63a1e3ed698c1a53b3afe6a8d620d +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec669d3e0dc08d457330228ab7118b66583432ed --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1429537ed84324c95983872cee835394a0c309614f8f7231baba4d950ba18344 +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..855ab6835ef78b18fcb15549c8db19e28d7441f8 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9431a598e45f9bce03437266be5466854aaa63fc823feb8daf6f78fdc1276267 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe962513af6d2627e85fdd67f4399dc5a94dd714 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a50855ed3eb14d00ab178d13bd925b834462f03aa3f316aeda7b01b78cb27a3 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..367733a82bca6299722cf078f7a6a362b89467e6 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b491b1eeb8f186012d57c48bd908fbdfa0d881a86fc0470232cb283f57320d +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..193f5d52ce699a2ae545fba79e28471ecb0c9645 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0e7f27f43ac5f66b3f9f8dfe9a93c9e6e07a3a7b51fb72e09232d9ad3563f03 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9c6ba922847ab2437269ec74db27dcff7738873 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6481c972db9c8252b8f42a89abc1069128d367cc8f90f5a389eacb8fd388f7e2 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a98da69c1e7edb0c90228a2ea102a37659d58cb4 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0615e81be23ae24f258feb1abe1a743e743cfa50ce6a26855b269cd04ef50302 +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a31f07b4985607225318cc4795480a9c9d2e6bc9 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f231ec6ad0fe1c24c66085d79f40c2836f2e5e76cddc7c39334f656647787b2 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..50b3b1373b8b281a24ade6b275b34650f1e90e53 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c48c58601fd82f9f84498fe15fccd8f5384ed8e3ef32b55c4b241a8eb13bc39f +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bb9d2bd3ecd475a3c7b6f016c9d7ddb5aa7500b --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89bda502d76f376d84b3a2c1b76678f80c15d2375d1a33eba3d7fd29243acde8 +size 15518615 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..929683d712f96f061c3946a6b100d5cda2e7265d --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb3ddccb53ca4938a45fc57f8781d7c8d41812bd82256393562d31a51c76c1d +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d407a7653584e026ee2b3a0d67fbfd0c72d966e --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ca0fe1ee800458453894261cee895dba9345bcf06c4f18793fa861e88240bdb +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..89c9661918d82bfeee823b09915932afc820c842 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c03e0cdfcf89fcba61ae21e40039c79cc46e85f9badfb92f07a70691b26c00 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba5b2e79506866bec81065365e4f72c8ec74df08 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b846416a05e42d13e7293b17b57bf13f8c393b529373aee8b7534e2862e8a294 +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf6856f1dbeffd9bc7a79142dd58265faa3deb7d --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6daaf52d721335d027de121c821d724bb310d9f326551c6be54a4bcbe7893d1f +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa3e78b6317c986e1da407bcafabe950a9060ff6 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2f2ccc5ef49540c3668d2aace53333a4229987302ad4da0271d1df5c27fc83c +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4630c78805f3ec27f5d56ebe906df72f127eaad4 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b3f40213a4e1fc897d149986cca0f186227f8fb7738760748705d26cb5b2aa +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b529ef5487effce6866b341b2689a9c715b5a2a --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e3c86d89dc8462a39eb206e9dc5858de4d6638154b4f8a6519bd174c93dbb4 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..297feb96087914592fc8ad13f18ad2ac1fdc65d3 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5ea538ae9704c5245c0d69a0b0d69fdbdcc1907ce0f52bbe0d044974f5bc15 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d51eb44d3c006d5aade7c25556e46bbec2679342 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01476fc646bcb7b4ed7cefafdbb2cbf51dd898b086f9e7385b9bcf6e32ccb14d +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aab54a5cccf30110bf1f7938d86e03860e54df3a --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13257f289c535b00357bb3114f51985358f7efaa71c2e80130a9151fb57dfffe +size 15518743 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cabc38b76fb6be859057768b1061c942ce8afa84 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36a56dea9243f1d414ad658eae241d36fcbe2fbc7d7e9ec4d06a482a9f85c7b +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..342f56054a7449cbd9d51035e7a32cf3706eef2b --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:674fcf49be6a27b8092b2cbaa3ed28ba833213d6e30b906a8ed01ea9414370cf +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bb592118434c74ffeef4b12f6631545148bfcc3 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d076d858db344c89612f7ab7cee50671eb6d18fe9504f21aa9952a6aecc3fba +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..01cb2fb305d21b5c059f804d3718fdd7b927c402 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414740497b9d5aea0d5cc46cdbb04560b74bd7554f22c2fc629b706ecaad0daa +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de8a19a9d0bdf89b76f7bb08d9a774f5de939f17 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38d05ce22a4327aef03f26cdf84f85616f25d13ac07c799d67a24bd7658adeaa +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2de86eff8aa42e0130ca3792b9a2941b512fbac0 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8de7f94183e49892dc32a6fc48eeeaa347922933b557f70bc377da6a5328cf +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2b2d862227e05611af097b892092593cadced9f --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fe155945df966f30d4fb10436b03baf062ad116f64913f9ba281b08376aef0 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd090dbca6628f6f0d82097cfe9e80fb33ac8110 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae2101ecde321d355b76491ebaa85622e5363f802266a817827ea56ac977b56e +size 15518818 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97da576fc35571af35e581905a6da05d09a9ebe4 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e6cadf6918790165ed1c59c269ad142643e258794a389060e456d024a27033 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0c9b65c9994704b4887732f46f251a612cef9e9 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e408ec2934ed619b84e99aa1fdcd7fd772037d8cd5010d8a98937e6851e2744c +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e19f2183a502c009b3e0e20815de2ab85a81481 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c172d3e88048e32127a08a778b52d13989801f0773a93fe9ca57a039e1357d53 +size 15518743 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3438ef3bb94d0c4dd1378156ad4fb47821652668 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:256d8badc973b967c86a80d25e2cb039ade078fe2b7dfc5bbb08ce038badbc17 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2b50d8f3eb2d0762b0ce9df11b982678c84229f --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b182bf81375ab5bf3697a106053025cca13ae8b03d77c18fd9c02fabd2c1ba8 +size 15518818 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69251a79282667d74efc7db2f16a6dab1c8eccd7 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd1d96aed42da09a1584e13da606b4cdbe750f8b1e8091c4c70fc673638bf10 +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99ee72a3ebd828a21c1e5bf17a82daf9a331201b --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3fb71a08226b43f84f58901ce2eb7b02d8c601cce2fbc3b9cecedc0da1e3d22 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a02e706c61bc0be7d1a2e8deb91bdf78997f053 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec29f05287d5a9872086e42251e6ffdd04091b39070cbcd0bc1bcaca5342edc +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..540cb05f24b6e7be427337e831e9c8d8d13ef2da --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b0c130c455b8e1e9cb3d3799f02433df241245492e0f1aea23dd2fd0e7ac058 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8be0e01632bdfc7cf00554967bafdd487365bd2f --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d87a0ced30a190e49af1c53ceb799f2574df36e074fe80649b7434996fc55dd +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a322d04e74f315db4810af317660096a2e869f3 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbe4fb6393b1cb77de1141465484d2be8f181145c8fd8fa0d28cd1a07b9e2878 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e44133ca9e15ab436c38dc4783158732fb2f47ea --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f903cae832664178408e16a65e666229a2719f39a585830e852b6749c0954a0d +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b1e127cc52ca6ea2eda60975c120f443901ca6e --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a38cf3fe4efbf98a16d5b1a6375af68e9951e385ad959d7d71ed18f38f866349 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c13f62ae81e1eb054a65da50d8591a41fe800853 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351a01843d624c9d5b181d5091319df64ffc3da3fe00e2093152a2a1a0f1a19f +size 15518615 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be5ac5dfcd69e1c09d2d27f9a5b9637b426f0528 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2637081ad74f118b880ca3f1775fae333b4e06a9a79512b3fb84464ea5cff5f +size 15518818 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31756084f2054022c0c87527bd5c15321b287423 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b87a5237b0382ff79f0ed8d1280261d93513217a25ca25ccd8016d7ee53508b +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cba426b0ecd6c66d50b61210f5c6953a4f6111ab --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3adc191be602977ad9e211ea259395df730dd3901663bbd1d7f22afe8b4c20d5 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..871d58a8e5c00eced8c6993d5cc6e5b4cc4717d0 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ad090c73689fb8a896956a9dfd1476b781f84ce155cfef06cdefbad9e21aff +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79bde346b000561e4e20e02c07050685f5324cf8 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dbe275c5c7cb624fa04e1f388f68abd2f381367c9efda3618db198cc8804087 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78c27db6fdeb451d2781c6a77c06eadaab076452 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87c43a466c5368e22ff37853f9895a9d6f2188627d6b253f31e5de06002ae63e +size 15518626 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a1f371a3521c6227c8ff19e008be2d8fb3ac0b2 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb41f05993969db8a60f66d3a79771c2e608394efb950b6c706e4294db9af2fc +size 15518818 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f316bc616be821615bbbaf6b4e1ce97f62293f24 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647fa193df9b0f81ae7c2ad473e560c3ef758cad62d5c6ac01dc6bb34fa099b2 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad0f38d1cab1abcb92eefe6408321e1ae2e51c1b --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3783212e628c7891b917be8d596a20c8dcc59164a422f1a90ac1de0d4c02cbb7 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4784943538caa147a3478ed39caceec1206a306 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d6c028d75013722ad665bf23e19f0ffbcfcf9d1b05d844befa4f29971d80d5 +size 15518754 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36512dfa784f9c435275d286badda4c2345de766 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46bdf9ad8b2937eb59e29b21819abd54201b3894b4b65f7eb2a4a0c3b431b783 +size 15518679 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6414b7b35dbdf9fe98efdddbfb6113ea8176324 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba96dbc6f2277c8cf22f3ab230926a97b5315bba2e2880cbc09cabd0872df19 +size 15518818 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9e73624da925458a2dcd78da3a879f0a2fdfe41 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136aa8d5f2354a93a170c0761c26ae53396e7047171d7ea329e328aadc688a4c +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e32d6c9df4b4dc61666fdd283b716491a7719726 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b10e3bd78605b8ba5c6f7c759dc6b5e2b8ba95a05f27a386acaf73e2ac7f3982 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..69cad22542e62a273dcfd27f89947cc3f5f34962 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada8840fd0b2ccfadf071ea82d786dd3e13f4ed24c97b950772c6801239f1b00 +size 15518690 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b97eaa537ddd9cd044af885bfa141af2a10093e1 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea97e715343055843b233ac5c33bcfbe53957a801935c63c12a4019fc7302b99 +size 15518743 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af3de229778f686fbf9ee81e2ee897b028767dd4 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048bb97f7e0f9a716170add305aa623e61bb7055bd3a2f838bbcd6de09ddf7be +size 15518679 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1000afd16ced54665512b9c4a46998952d699374 --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:900458d7a15e5a5e6de128b19547d0cab991da7e19cac73628c83f2a6f53c079 +size 15518679 diff --git a/83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02e995a3f50e489b8c3a82a2e36e43a5537b4a2b --- /dev/null +++ b/83m91b100m/global_step173500/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7760d982883fee6d9a2af572bb64673b9b23dcbb6c7bfe31e06f490cf411437 +size 15518743 diff --git a/83m91b100m/global_step173500/layer_01-model_00-model_states.pt b/83m91b100m/global_step173500/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e946602be6d1cd95c861940102a9730aad0e18eb --- /dev/null +++ b/83m91b100m/global_step173500/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc3204e07429cf9996fe18d6fc8e76915cdddd24f2197740e3fc217210854797 +size 67011843 diff --git a/83m91b100m/global_step173500/layer_03-model_00-model_states.pt b/83m91b100m/global_step173500/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..964a3040d000e8cd4951244b3238a468eb5c35e8 --- /dev/null +++ b/83m91b100m/global_step173500/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a70fc0fda0add60d1569408b9adf4efed3d4f2d4422c39f38202beab845e380e +size 9851395 diff --git a/83m91b100m/global_step173500/layer_04-model_00-model_states.pt b/83m91b100m/global_step173500/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8286d39290b384cba79c811fec23ea12361100b2 --- /dev/null +++ b/83m91b100m/global_step173500/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee09ecfc81b90b5079dce91f4c95a7a9ac23514eec7e81bb3b396d53d801268 +size 9851395 diff --git a/83m91b100m/global_step173500/layer_05-model_00-model_states.pt b/83m91b100m/global_step173500/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29753ed1268600cddd5dc730d8e83452fd58d37a --- /dev/null +++ b/83m91b100m/global_step173500/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6682ef2f717a26d1f7b0a6b4468c8f45adf46bb7e363cb48e6d80eaf538d3be8 +size 9851395 diff --git a/83m91b100m/global_step173500/layer_06-model_00-model_states.pt b/83m91b100m/global_step173500/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f31ebd999e9ddcf3656e2eeeba3d5a16d8d72fe9 --- /dev/null +++ b/83m91b100m/global_step173500/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3c1749cf860d8015003bcb834a600ef5a970972bd6121762758c19708b9422c +size 9851395 diff --git a/83m91b100m/global_step173500/layer_07-model_00-model_states.pt b/83m91b100m/global_step173500/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..577858d9c4ef7c71f03d410f411e2e7c709b9c03 --- /dev/null +++ b/83m91b100m/global_step173500/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6f091ebc32dbb15ee07bbdbed0c738ba3fe957b3ef8bdea3b75ac478ea35cb +size 9851395 diff --git a/83m91b100m/global_step173500/layer_08-model_00-model_states.pt b/83m91b100m/global_step173500/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff0d68b9144ea569d451418dc44895697ae312b7 --- /dev/null +++ b/83m91b100m/global_step173500/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5206ff763171c98d1af553dca303f1adf55667a3cec266ff5993b63d0aa77b73 +size 9851395 diff --git a/83m91b100m/global_step173500/layer_09-model_00-model_states.pt b/83m91b100m/global_step173500/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..814c433649e7d9c1040340cb5f04d42ac295f129 --- /dev/null +++ b/83m91b100m/global_step173500/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60bb86fe3d4758cb8107b997db8404189b46664dd180fe0b398d3a4b951e6f4c +size 9851395 diff --git a/83m91b100m/global_step173500/layer_10-model_00-model_states.pt b/83m91b100m/global_step173500/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..247aafceaa2286eabf7517beef1f8628281e7fd9 --- /dev/null +++ b/83m91b100m/global_step173500/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ac39c6e6d84997f614020a12ff9e6d647cb1aa3c118b87df8f6e5a2d07ffba8 +size 9851395 diff --git a/83m91b100m/global_step173500/layer_11-model_00-model_states.pt b/83m91b100m/global_step173500/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c8be50dc38e553bae8c2bb5e339f0029980d80 --- /dev/null +++ b/83m91b100m/global_step173500/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d75af03ae8af0a56c21bdb5d358ca200c48e1c7f92d6b61d57e748ee58beb7eb +size 9851395 diff --git a/83m91b100m/global_step173500/layer_12-model_00-model_states.pt b/83m91b100m/global_step173500/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac4a1e12b05c8b5461a879d10c4396f6f9c1bcb7 --- /dev/null +++ b/83m91b100m/global_step173500/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c209442ae4175f2a2185c0b79c0cf496f5bd3817b41b480dff967ccc1ac8e2 +size 9851395 diff --git a/83m91b100m/global_step173500/layer_14-model_00-model_states.pt b/83m91b100m/global_step173500/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24e7e77d16b17503ad3c0a454f40f00cc4571768 --- /dev/null +++ b/83m91b100m/global_step173500/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f61078decbb3e4d0e62bf1cd20c14365c62ae74006917650ae1ff9bc87402f71 +size 3779 diff --git a/83m91b100m/global_step173500/mp_rank_00_model_states.pt b/83m91b100m/global_step173500/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8276be4faf51216f83d529ac6de97171857f8fca --- /dev/null +++ b/83m91b100m/global_step173500/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e93b6e288a499fb52805de07699ab8db5cdda745a41de2d71876e564d8691b2 +size 31667 diff --git a/83m91b100m/sbatch_83m91b100m.sh b/83m91b100m/sbatch_83m91b100m.sh new file mode 100644 index 0000000000000000000000000000000000000000..50352a5417cb32f10fe429394fbcad4c4221fa31 --- /dev/null +++ b/83m91b100m/sbatch_83m91b100m.sh @@ -0,0 +1,177 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m91b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_1B5_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=20000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +# TRAIN_SAMPLES=9_703_701 +# Tokens: 31633480000 +# -> Samples: 15446035 +# TRAIN_SAMPLES=15_446_035 +# Tokens: 60400000000 +# -> Samples: 29492188 +# TRAIN_SAMPLES=29_492_188 +# Tokens: 90964260000 +# -> Samples: 44416143 +TRAIN_SAMPLES=44_416_143 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 444_161 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 100 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 20000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m91b100m/sbatch_83m91b100mval.sh b/83m91b100m/sbatch_83m91b100mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..cb905904aba111d3affc3a0354920a707a67d130 --- /dev/null +++ b/83m91b100m/sbatch_83m91b100mval.sh @@ -0,0 +1,173 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=83m91b100mval +VARIANT_CKPT=83m91b100m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT +mkdir -p $CHECKPOINT_PATH +mkdir -p $TENSORBOARD_PATH + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" +TRAIN_DATA_PATH=train100m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_20B_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + + +PP_SIZE=1 +TP_SIZE=1 + +MICRO_BATCH_SIZE=4 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_74M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=1000 + +# Tokens: 19873180000 +# -> Samples: 9703701 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --no-load-optim \ + --reset-progress \ + --override-lr-scheduler \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --eval-only true \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954266.nid006716.96142.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954266.nid006716.96142.0 new file mode 100644 index 0000000000000000000000000000000000000000..04bfc7d7b41abc18bf1a15b3d1767c72898f77ac --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954266.nid006716.96142.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca177b7b067eaf94f95ddcbac913c85ada2f4e3af1a857eca68f6a48fc62bf60 +size 311126045 diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954675.nid005735.52991.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954675.nid005735.52991.0 new file mode 100644 index 0000000000000000000000000000000000000000..9e80e492f5911774f5caff86f324ed7aac097247 --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678954675.nid005735.52991.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b98aefecd3b8cbcb383e57e6bfda889393731c1a131c6edcef2b13322fd49269 +size 298940301 diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993729.nid005617.63604.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993729.nid005617.63604.0 new file mode 100644 index 0000000000000000000000000000000000000000..bbad97c4c1b4334c98dd8d5813bdc6f703c87011 --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993729.nid005617.63604.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:077ae365d98ebb2ac0eb07026fbede6c535c44e36ea276bc5e375a89c463f0d7 +size 21469 diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993770.nid006236.81522.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993770.nid006236.81522.0 new file mode 100644 index 0000000000000000000000000000000000000000..dd14dce6cd7bbc173441c782104c1e7955b99364 --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993770.nid006236.81522.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55f9e4b626547da7527bbe00fbfc00e9b44c8737e31f1dcab8f39fe091a7295f +size 21466 diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993856.nid007230.78955.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993856.nid007230.78955.0 new file mode 100644 index 0000000000000000000000000000000000000000..b37ed4af8bfd5891c28f544afb70d6e2b84905d1 --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993856.nid007230.78955.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cdeebf4d661884b5ba1d59f4aa2e18f2ee31cb7ad7e66607d6dc6683e0fa1f3 +size 21469 diff --git a/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993893.nid006716.61570.0 b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993893.nid006716.61570.0 new file mode 100644 index 0000000000000000000000000000000000000000..c71ea9b4e30fbf3fbd317586c4fd397d5571aade --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100m/events.out.tfevents.1678993893.nid006716.61570.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb8e3fee487a6ab61586ef5d26a6d6f19e60c6523952bef81fdd9c7f04aecd7 +size 21466 diff --git a/83m91b100m/tensorboard_83m91b100mval/events.out.tfevents.1678996112.nid006716.71310.0 b/83m91b100m/tensorboard_83m91b100mval/events.out.tfevents.1678996112.nid006716.71310.0 new file mode 100644 index 0000000000000000000000000000000000000000..afce6e8ee741e8c97f4232eb065760980640bfc5 --- /dev/null +++ b/83m91b100m/tensorboard_83m91b100mval/events.out.tfevents.1678996112.nid006716.71310.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81afeaf5014bd00260f48add2237490bcf831a5ffbdb7ef402f47f57bd46ee5 +size 980 diff --git a/8b712b400m/3324489.err b/8b712b400m/3324489.err new file mode 100644 index 0000000000000000000000000000000000000000..da0dfa299779ac3696f33e9aec6f3bcfc43c421d --- /dev/null +++ b/8b712b400m/3324489.err @@ -0,0 +1,4391 @@ +18: 2023-03-16 19:04:20.609077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609090: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609092: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609096: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609097: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609364: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609370: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: 2023-03-16 19:04:20.609089: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609267: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609270: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609261: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609376: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +18: 2023-03-16 19:04:20.609096: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +18: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609278: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609262: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609380: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +26: 2023-03-16 19:04:20.609385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: 2023-03-16 19:04:20.609261: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609271: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: 2023-03-16 19:04:20.609259: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +26: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +10: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629335: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629360: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629356: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629368: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629366: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629376: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 8: 2023-03-16 19:04:20.629386: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 8: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638291: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638284: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638286: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638301: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638310: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 5: 2023-03-16 19:04:20.638300: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 5: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638341: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638342: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638377: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638381: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +29: 2023-03-16 19:04:20.638390: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +29: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638752: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638764: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638758: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638772: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638760: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638794: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +30: 2023-03-16 19:04:20.638790: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +30: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646401: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 7: 2023-03-16 19:04:20.646425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 7: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646847: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646841: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646853: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646852: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646842: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +20: 2023-03-16 19:04:20.646848: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +20: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647180: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647182: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647193: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647205: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647194: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647188: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647188: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +25: 2023-03-16 19:04:20.647188: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +25: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648126: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648136: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648139: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648150: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +13: 2023-03-16 19:04:20.648146: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +13: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648512: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648528: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648513: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +12: 2023-03-16 19:04:20.648520: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +12: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648857: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648860: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648865: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648871: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648854: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648878: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648858: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +15: 2023-03-16 19:04:20.648877: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +15: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669277: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669280: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669298: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669299: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669308: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +11: 2023-03-16 19:04:20.669304: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +11: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680845: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680852: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680853: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680858: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680862: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680866: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680859: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +16: 2023-03-16 19:04:20.680845: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +16: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.681987: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.681990: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.681994: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.682000: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.681998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.682007: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.682008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +28: 2023-03-16 19:04:20.682000: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +28: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718834: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718844: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: 2023-03-16 19:04:20.718653: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718658: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718666: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: 2023-03-16 19:04:20.718574: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718581: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718588: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718838: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718662: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718672: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718578: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718583: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718838: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718843: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +21: 2023-03-16 19:04:20.718845: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: 2023-03-16 19:04:20.718665: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718669: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: 2023-03-16 19:04:20.718673: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: 2023-03-16 19:04:20.718591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718594: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:20.718587: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +21: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +17: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724366: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724372: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724374: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724370: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724376: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724371: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 1: 2023-03-16 19:04:20.724375: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 1: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735967: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735974: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735972: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735983: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735985: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736059: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736069: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736067: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: 2023-03-16 19:04:20.735988: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +19: 2023-03-16 19:04:20.735993: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +19: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736075: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736084: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736070: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736090: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +23: 2023-03-16 19:04:20.736084: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +23: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738753: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738759: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738772: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738778: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 3: 2023-03-16 19:04:20.738788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 3: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746103: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746112: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746114: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746108: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746109: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746119: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 4: 2023-03-16 19:04:20.746120: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 4: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763679: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763684: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763694: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763688: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763681: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763710: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 9: 2023-03-16 19:04:20.763702: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 9: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764404: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764418: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764415: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764404: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764407: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 0: 2023-03-16 19:04:20.764412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 0: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764465: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764475: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764466: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764479: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764460: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764481: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764479: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 6: 2023-03-16 19:04:20.764490: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 6: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765229: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765234: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765232: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765243: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +27: 2023-03-16 19:04:20.765242: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +27: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801917: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801935: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801931: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801925: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801930: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801942: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2: 2023-03-16 19:04:20.801952: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA + 2: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802536: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802528: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802530: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802528: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +24: 2023-03-16 19:04:20.802546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +24: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807530: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807542: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807536: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807546: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807532: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807548: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807537: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +14: 2023-03-16 19:04:20.807537: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +14: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902747: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902752: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902776: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902763: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +31: 2023-03-16 19:04:20.902762: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA +31: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. +22: 2023-03-16 19:04:22.920169: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920171: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920406: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +22: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:22.920412: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920414: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920419: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920421: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920420: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920426: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +22: 2023-03-16 19:04:22.920429: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.920994: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.920996: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921008: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921001: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921003: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +18: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:22.921423: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921425: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921427: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921429: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921430: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921433: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921435: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +18: 2023-03-16 19:04:22.921432: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.934952: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934952: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934965: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.934956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +10: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:22.935520: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935524: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935527: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935533: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935530: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935531: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935538: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +10: 2023-03-16 19:04:22.935537: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.951789: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951794: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951805: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951811: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951798: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951812: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.951808: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +25: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:22.952191: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952196: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952198: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952199: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952200: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952204: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +25: 2023-03-16 19:04:22.952209: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.955923: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955939: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955939: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955936: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.955943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +20: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:22.956301: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956307: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956313: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956316: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956321: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956322: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956326: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +20: 2023-03-16 19:04:22.956326: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962173: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962176: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962183: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962197: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962191: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +12: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:22.962535: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962534: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962538: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962542: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962541: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962543: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962545: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +12: 2023-03-16 19:04:22.962548: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.962763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962771: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962775: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962780: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.962777: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +16: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:22.963127: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963131: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963134: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963135: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963140: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963139: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963142: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +16: 2023-03-16 19:04:22.963143: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016059: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016054: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016064: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016055: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016060: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 3: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:23.016461: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016465: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016466: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016470: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016471: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016473: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016476: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 3: 2023-03-16 19:04:23.016479: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018690: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018684: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018695: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 2023-03-16 19:04:23.020297: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020293: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020294: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020301: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 5: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:23.020503: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020507: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020509: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020511: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020513: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020514: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020516: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 5: 2023-03-16 19:04:23.020516: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021023: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021033: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021036: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021040: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021041: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +28: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:23.021233: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021236: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021239: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021240: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021241: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021244: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021244: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +28: 2023-03-16 19:04:23.021250: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029432: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029423: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029433: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029435: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029438: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029448: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029441: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +21: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:23.029617: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029617: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029623: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029625: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029623: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029625: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029631: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +21: 2023-03-16 19:04:23.029634: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.033926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:23.033930: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:23.033934: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:23.033931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034096: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 2023-03-16 19:04:23.033941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:23.033932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034102: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 2023-03-16 19:04:23.033937: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034107: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 2023-03-16 19:04:23.033938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034100: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 2023-03-16 19:04:23.034357: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.034360: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.034363: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:23.034371: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.034371: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.034375: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +26: 2023-03-16 19:04:23.034377: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +26: 2023-03-16 19:04:23.034382: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 2023-03-16 19:04:23.037065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037074: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037083: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +11: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:23.037476: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037478: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037482: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037483: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037485: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037487: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037488: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +11: 2023-03-16 19:04:23.037490: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.052676: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052685: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052689: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052694: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052690: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.052693: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +17: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:23.053094: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053100: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053105: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053108: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053115: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053115: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053118: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +17: 2023-03-16 19:04:23.053121: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053277: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053288: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053285: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053306: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053285: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053306: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053299: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +13: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:23.053318: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053320: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053321: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053324: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053326: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +13: 2023-03-16 19:04:23.053325: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.057845: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057851: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057847: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057853: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057861: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057860: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.057855: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 2: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:23.058233: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058237: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058239: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058243: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058245: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058245: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058251: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 2: 2023-03-16 19:04:23.058258: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058311: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058319: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058329: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058326: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 0: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:23.058707: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058711: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058716: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058715: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058718: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058722: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058725: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 0: 2023-03-16 19:04:23.058728: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:23.018978: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018982: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018984: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018985: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018987: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018988: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018989: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +14: 2023-03-16 19:04:23.018992: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.062587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062580: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062583: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062580: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062586: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.062594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 6: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:23.063072: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063072: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063076: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063081: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063082: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063086: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063088: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 6: 2023-03-16 19:04:23.063088: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071469: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071467: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071465: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071481: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071476: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071476: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071475: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 1: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:23.071904: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071910: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071911: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071914: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071916: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071918: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071918: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 1: 2023-03-16 19:04:23.071921: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074549: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074549: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074559: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074556: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074565: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074563: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074552: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074560: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +19: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:23.074969: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074975: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074977: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074976: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074979: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074982: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074985: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:23.074986: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:23.034490: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034488: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034492: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034494: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034497: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034500: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034501: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +29: 2023-03-16 19:04:23.034505: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122271: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122270: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122280: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122273: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122278: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122274: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122277: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +31: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:23.122654: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122663: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122666: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122669: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122671: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122673: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122683: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +31: 2023-03-16 19:04:23.122697: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136329: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:23.136328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136417: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 2023-03-16 19:04:23.136324: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136424: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 2023-03-16 19:04:23.136333: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136430: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 2023-03-16 19:04:23.136338: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136419: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 2023-03-16 19:04:23.136344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136425: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136432: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136427: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136431: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +23: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:23.136826: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136828: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136830: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136831: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136833: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136833: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136835: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +23: 2023-03-16 19:04:23.136836: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.142628: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142626: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142631: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142625: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142632: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.142629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 7: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:23.143036: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143041: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143046: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143045: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143048: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143047: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143049: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 7: 2023-03-16 19:04:23.143050: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143487: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143495: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143492: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143503: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143502: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143502: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +30: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:23.143909: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143914: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143919: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143919: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143922: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143922: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143927: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +30: 2023-03-16 19:04:23.143926: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145322: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145331: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145333: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 9: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:23.145739: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145745: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145747: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145748: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145751: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145753: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145758: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 9: 2023-03-16 19:04:23.145763: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147350: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147348: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +15: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:23.147751: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147750: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147754: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147759: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147759: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +15: 2023-03-16 19:04:23.147762: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148417: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148422: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148416: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 2023-03-16 19:04:23.148659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148425: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148819: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148825: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:23.148828: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148829: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148832: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.148667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 4: 2023-03-16 19:04:23.148836: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148841: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 4: 2023-03-16 19:04:23.148843: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.148678: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +24: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:23.149027: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149029: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149031: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:23.136345: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:23.136347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 + 8: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:23.136710: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136716: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136717: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136718: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136723: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136726: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136729: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. + 8: 2023-03-16 19:04:23.136732: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149033: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149034: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149038: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149040: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +24: 2023-03-16 19:04:23.149043: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237450: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237448: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_46200 +27: 0125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:23.237819: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237822: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237827: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237829: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237828: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237830: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237833: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +27: 2023-03-16 19:04:23.237835: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine. +19: 2023-03-16 19:04:28.353810: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353820: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353822: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353829: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353828: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: 2023-03-16 19:04:28.354260: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353831: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: 2023-03-16 19:04:28.354270: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353824: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: 2023-03-16 19:04:28.354275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.353833: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: 2023-03-16 19:04:28.354275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.354268: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.354273: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.354279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.354297: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +10: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354855: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354857: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354873: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354868: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354868: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354876: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: 2023-03-16 19:04:28.355009: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354881: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.354881: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: 2023-03-16 19:04:28.355014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355020: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355022: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.355026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +26: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355873: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355874: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355876: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355881: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: 2023-03-16 19:04:28.355888: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355890: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355897: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355899: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355899: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: 2023-03-16 19:04:28.355926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: 2023-03-16 19:04:28.355926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +19: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +19: 2023-03-16 19:04:28.355940: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +19: 2023-03-16 19:04:28.355941: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356220: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356222: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356220: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356227: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356229: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356231: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356265: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356270: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +10: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +10: 2023-03-16 19:04:28.356282: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +10: 2023-03-16 19:04:28.356286: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357110: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.357182: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: 2023-03-16 19:04:28.357113: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.357187: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: 2023-03-16 19:04:28.357116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.357188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: 2023-03-16 19:04:28.357118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: 2023-03-16 19:04:28.357128: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357128: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357129: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357131: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357133: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: 2023-03-16 19:04:28.357191: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +12: 2023-03-16 19:04:28.357302: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: 2023-03-16 19:04:28.357191: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +12: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.357200: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357202: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +12: 2023-03-16 19:04:28.357317: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357198: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357205: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357207: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357207: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357209: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +26: 2023-03-16 19:04:28.357247: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +26: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +26: 2023-03-16 19:04:28.357259: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.358234: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358240: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358248: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358249: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358252: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.358253: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 0: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359723: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359720: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359730: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359730: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359736: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.359739: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +18: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361951: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.361964: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.361968: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.361973: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.361974: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.361975: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.361976: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.362088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.362101: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +18: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +18: 2023-03-16 19:04:28.362110: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +18: 2023-03-16 19:04:28.362115: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.367767: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367770: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367775: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367776: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367778: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367782: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367786: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.367785: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 2: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368068: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368080: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368085: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.368091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +23: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368662: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368667: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368671: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368673: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368676: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.368678: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +13: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369453: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369455: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369463: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369466: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369470: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369472: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369478: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369478: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369477: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369536: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 2: 2023-03-16 19:04:28.369539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 2: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 2: 2023-03-16 19:04:28.369553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370239: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370245: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370246: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370253: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370249: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370253: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370250: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370260: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370262: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370263: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370267: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +23: 2023-03-16 19:04:28.370275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +23: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +23: 2023-03-16 19:04:28.370291: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370584: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370587: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370588: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370596: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370600: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370599: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370602: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370605: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370606: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370608: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370610: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +13: 2023-03-16 19:04:28.370634: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +13: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +13: 2023-03-16 19:04:28.370649: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359744: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359742: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359746: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359749: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359752: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359760: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359763: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359766: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359769: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359771: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359772: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359823: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359826: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 0: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 0: 2023-03-16 19:04:28.359844: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: 2023-03-16 19:04:28.359844: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.405325: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405328: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405333: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405340: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405339: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.405342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.407236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407283: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407238: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407153: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407288: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407238: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407161: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407301: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407237: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407159: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407297: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407303: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407245: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.407244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407171: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407297: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407247: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.407164: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.407251: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.407255: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.407308: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 8: 2023-03-16 19:04:28.407254: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.407257: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.407260: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 8: 2023-03-16 19:04:28.407261: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.407264: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 8: 2023-03-16 19:04:28.407264: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.407312: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +21: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409106: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409107: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409106: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409367: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.409112: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409118: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: 2023-03-16 19:04:28.409129: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: 2023-03-16 19:04:28.409124: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: 2023-03-16 19:04:28.409123: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: 2023-03-16 19:04:28.409135: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409135: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +29: 2023-03-16 19:04:28.409137: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409368: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +29: 2023-03-16 19:04:28.409136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +29: 2023-03-16 19:04:28.409151: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409371: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409371: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409372: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409381: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409386: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409388: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409390: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409392: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409388: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +21: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +21: 2023-03-16 19:04:28.409392: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409395: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +21: 2023-03-16 19:04:28.409405: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 5: 2023-03-16 19:04:28.416127: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416124: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416136: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416133: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.416142: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416773: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416775: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416784: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416785: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416792: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416803: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.416814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417030: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:28.417132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: 2023-03-16 19:04:28.417034: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417038: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417140: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417039: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417144: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417041: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417147: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417046: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417041: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417143: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.417042: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.417148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:28.417157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417514: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.417664: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417524: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: 2023-03-16 19:04:28.417669: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.417519: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: 2023-03-16 19:04:28.417678: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +30: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.417750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.417682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: 2023-03-16 19:04:28.417748: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.417683: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: 2023-03-16 19:04:28.417759: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.417687: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: 2023-03-16 19:04:28.417764: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.417680: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: 2023-03-16 19:04:28.417761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.417764: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.417766: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +20: 2023-03-16 19:04:28.417691: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.417763: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +11: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: 2023-03-16 19:04:28.418094: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418071: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.418107: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.418116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418100: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418079: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.418119: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418103: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:28.418195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.418128: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418102: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418084: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418089: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.418125: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418090: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 5: 2023-03-16 19:04:28.418089: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.418127: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418198: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.418133: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418202: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 7: 2023-03-16 19:04:28.418113: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418096: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 5: 2023-03-16 19:04:28.418098: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 5: 2023-03-16 19:04:28.418097: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.418346: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418104: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.418137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 5: 2023-03-16 19:04:28.418127: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 5: 2023-03-16 19:04:28.418141: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.418132: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: 2023-03-16 19:04:28.418201: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: 2023-03-16 19:04:28.418354: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 4: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.418204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: 2023-03-16 19:04:28.418355: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418207: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.418208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: 2023-03-16 19:04:28.418351: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 9: 2023-03-16 19:04:28.418209: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 3: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.418358: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.418361: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.418366: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.418365: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +16: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418956: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: 2023-03-16 19:04:28.418984: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: 2023-03-16 19:04:28.418980: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418962: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: 2023-03-16 19:04:28.419086: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: 2023-03-16 19:04:28.418989: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418971: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: 2023-03-16 19:04:28.419087: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418983: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +15: 2023-03-16 19:04:28.418984: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.419000: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: 2023-03-16 19:04:28.418979: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +15: 2023-03-16 19:04:28.418981: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +15: 2023-03-16 19:04:28.418988: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419088: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.418992: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.418997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +15: 2023-03-16 19:04:28.419010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: 2023-03-16 19:04:28.419091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +15: 2023-03-16 19:04:28.419022: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.418997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.419091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.419004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +25: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419179: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: 2023-03-16 19:04:28.418997: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +17: 2023-03-16 19:04:28.419090: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: 2023-03-16 19:04:28.419091: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419194: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419093: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: 2023-03-16 19:04:28.419190: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:28.419102: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +17: 2023-03-16 19:04:28.419104: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419108: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419110: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419192: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +17: 2023-03-16 19:04:28.419111: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419114: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +17: 2023-03-16 19:04:28.419115: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419200: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419195: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: 2023-03-16 19:04:28.419482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419483: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +25: 2023-03-16 19:04:28.419212: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419213: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +25: 2023-03-16 19:04:28.419219: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419219: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +25: 2023-03-16 19:04:28.419221: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +20: 2023-03-16 19:04:28.419485: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419589: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419491: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419590: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419492: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419497: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419498: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419594: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: 2023-03-16 19:04:28.419496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419499: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +20: 2023-03-16 19:04:28.419506: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419596: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: 2023-03-16 19:04:28.419511: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +20: 2023-03-16 19:04:28.419510: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +20: 2023-03-16 19:04:28.419513: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419599: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +20: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +20: 2023-03-16 19:04:28.419549: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.419605: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419605: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419613: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419610: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419612: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419614: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419829: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419641: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: 2023-03-16 19:04:28.419646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +30: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419832: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +30: 2023-03-16 19:04:28.419657: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +30: 2023-03-16 19:04:28.419662: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419832: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419833: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419833: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419837: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419836: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419844: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419849: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419850: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419852: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419852: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419855: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419854: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +11: 2023-03-16 19:04:28.419893: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +11: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +11: 2023-03-16 19:04:28.419904: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420169: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420169: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420173: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420168: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420172: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420370: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: 2023-03-16 19:04:28.420177: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420188: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420190: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420191: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420191: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 7: 2023-03-16 19:04:28.420193: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420227: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: 2023-03-16 19:04:28.420369: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 7: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 7: 2023-03-16 19:04:28.420241: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:28.420445: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: 2023-03-16 19:04:28.420374: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420377: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420379: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: 2023-03-16 19:04:28.420535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420378: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: 2023-03-16 19:04:28.420570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420384: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: 2023-03-16 19:04:28.420383: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: 2023-03-16 19:04:28.420446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420537: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: 2023-03-16 19:04:28.420572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420383: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420540: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420388: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 3: 2023-03-16 19:04:28.420393: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420569: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420454: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420394: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 3: 2023-03-16 19:04:28.420394: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 3: 2023-03-16 19:04:28.420398: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420539: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 3: 2023-03-16 19:04:28.420439: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: 2023-03-16 19:04:28.420572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420453: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 3: 2023-03-16 19:04:28.420452: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:28.420460: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: 2023-03-16 19:04:28.420462: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: 2023-03-16 19:04:28.420572: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420466: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: 2023-03-16 19:04:28.420468: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: 2023-03-16 19:04:28.420469: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:28.420473: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: 2023-03-16 19:04:28.420471: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420542: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: 2023-03-16 19:04:28.420573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 9: 2023-03-16 19:04:28.420503: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420553: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420554: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 9: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 9: 2023-03-16 19:04:28.420518: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420563: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420564: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420566: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420568: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 4: 2023-03-16 19:04:28.420591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 4: 2023-03-16 19:04:28.420603: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420579: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.420587: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420587: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420589: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420591: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420592: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420592: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420596: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +16: 2023-03-16 19:04:28.420596: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +16: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +16: 2023-03-16 19:04:28.420611: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421150: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421153: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421162: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421161: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421171: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421164: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421162: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421181: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421180: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421185: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421185: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +28: 2023-03-16 19:04:28.421194: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +28: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +28: 2023-03-16 19:04:28.421211: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +22: 2023-03-16 19:04:28.423499: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423511: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423506: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423508: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423514: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423517: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423516: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: 2023-03-16 19:04:28.423921: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423928: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423925: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423931: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423930: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423937: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.423941: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 + 6: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425111: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425123: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425121: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425124: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425129: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.425133: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +31: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.423520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.425400: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425516: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.425406: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425524: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425523: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425408: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.425415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.425534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425417: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +22: 2023-03-16 19:04:28.425422: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +22: 2023-03-16 19:04:28.425426: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: 2023-03-16 19:04:28.425427: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +22: 2023-03-16 19:04:28.425428: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.425533: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425493: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425541: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +22: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.425550: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +22: 2023-03-16 19:04:28.425505: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +22: 2023-03-16 19:04:28.425509: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425904: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425904: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425909: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425912: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425912: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.425919: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.425919: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.425923: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.425927: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.425930: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.425931: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427223: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427225: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427227: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427228: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427229: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427230: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427478: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: 2023-03-16 19:04:28.427231: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427236: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427237: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427241: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427244: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427247: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427247: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +31: 2023-03-16 19:04:28.427248: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: 2023-03-16 19:04:28.427451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: 2023-03-16 19:04:28.427479: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427482: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +31: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +31: 2023-03-16 19:04:28.427465: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427486: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427490: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427494: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427495: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427494: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427502: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427503: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427504: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427534: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427538: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 1: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 1: 2023-03-16 19:04:28.427548: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 1: 2023-03-16 19:04:28.427550: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.433998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.433995: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434005: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434004: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434012: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.434010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +14: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435565: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435569: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435571: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435573: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435579: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435587: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435592: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435588: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435588: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435591: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435591: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +14: 2023-03-16 19:04:28.435611: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +14: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +14: 2023-03-16 19:04:28.435624: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.441943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441950: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441948: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441953: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441961: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.441957: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443960: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443967: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443966: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443970: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443968: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.443973: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.443973: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.443980: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.443982: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.443984: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.444196: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: 2023-03-16 19:04:28.443989: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.444013: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444198: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +24: 2023-03-16 19:04:28.444014: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +24: 2023-03-16 19:04:28.444028: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +24: 2023-03-16 19:04:28.444031: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.444204: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444222: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.444220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/project_462000125 +27: /samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446212: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446213: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446214: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446231: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446233: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446234: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446235: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446236: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446237: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446238: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. +27: 2023-03-16 19:04:28.446279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro +27: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 +27: 2023-03-16 19:04:28.446292: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.426274: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.426281: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /pfs/lustrep2/projappl/project_462000125/samantao-public/apps/aws-ofi-rccl:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/rccl/rccl-develop-release/rccl/lib:/pfs/lustrep4/projappl/project_462000075/samantao-public/rocm/glibc/selected:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hip/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/hsa/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/llvm:/pfs/lustrep2/projappl/pro + 6: ject_462000125/samantao-public/apps/suse-repo-deps/lib64:/pfs/lustrep2/projappl/project_462000125/samantao-public/apps/suse-repo-deps/usr/lib64:/opt/cray/pe/python/3.9.12.1/lib:/opt/cray/pe/gcc-libs:/opt/cray/libfabric/1.15.0.0/lib64 + 6: 2023-03-16 19:04:28.426287: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 6: 2023-03-16 19:04:28.426293: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module scaled_upper_triang_masked_softmax_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module scaled_upper_triang_masked_softmax_cuda... + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module scaled_masked_softmax_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module scaled_masked_softmax_cuda... + 0: Successfully preprocessed all matching files. + 0: Detected CUDA files, patching ldflags + 0: Emitting ninja build file /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/build/build.ninja... + 0: Building extension module fused_mix_prec_layer_norm_cuda... + 0: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) + 0: Loading extension module fused_mix_prec_layer_norm_cuda... + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. + 0: Successfully preprocessed all matching files. +17: Successfully preprocessed all matching files. +17: Successfully preprocessed all matching files. +17: Successfully preprocessed all matching files. + 1: Successfully preprocessed all matching files. + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +23: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +23: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +14: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +14: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +21: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +21: warnings.warn( +11: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +11: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +19: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +19: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +16: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +16: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +10: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +10: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( +15: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +15: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( + 8: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 8: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +22: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +22: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +20: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +20: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +13: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +13: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +17: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +17: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +18: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +18: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( +12: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +12: warnings.warn( + 9: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 9: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( +28: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +28: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( + 4: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 4: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( +31: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +31: warnings.warn( + 7: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 7: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( + 5: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 5: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +29: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +29: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 3: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 3: warnings.warn( +27: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +27: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( + 2: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 2: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( +26: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +26: warnings.warn( + 6: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 6: warnings.warn( +30: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +30: warnings.warn( +24: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +24: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +25: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead +25: warnings.warn( + 1: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 1: warnings.warn( + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:429: UserWarning: torch.distributed.distributed_c10d._get_global_rank is deprecated please use torch.distributed.distributed_c10d.get_global_rank instead + 0: warnings.warn( +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: +16: +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +16: Building extension module utils... +16: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +16: Loading extension module utils... +24: Loading extension module utils... +24: Loading extension module utils... +25: Loading extension module utils... +25: Loading extension module utils... + 8: Loading extension module utils... +16: Loading extension module utils... +16: Loading extension module utils... +16: Loading extension module utils... + 8: Loading extension module utils... + 8: Loading extension module utils... +13: Loading extension module utils... +17: Loading extension module utils... +11: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... + 8: Loading extension module utils... +11: Loading extension module utils... +14: Loading extension module utils... + 9: Loading extension module utils... +12: Loading extension module utils... +10: Loading extension module utils... +14: Loading extension module utils... +15: Loading extension module utils... +15: Loading extension module utils... + 9: Loading extension module utils... +10: Loading extension module utils... +17: Loading extension module utils... +17: Loading extension module utils... +26: Loading extension module utils... +26: Loading extension module utils... + 8: Loading extension module utils... +27: Loading extension module utils... +27: Loading extension module utils... + 9: Loading extension module utils... + 9: Loading extension module utils... +28: Loading extension module utils... +17: Loading extension module utils... +28: Loading extension module utils... + 8: Loading extension module utils... +29: Loading extension module utils... +29: Loading extension module utils... +30: Loading extension module utils... +30: Loading extension module utils... +31: Loading extension module utils... +31: Loading extension module utils... + 9: Loading extension module utils... + 8: Loading extension module utils... +18: Loading extension module utils... +10: Loading extension module utils... + 9: Loading extension module utils... +18: Loading extension module utils... +10: Loading extension module utils... +18: Loading extension module utils... +15: Loading extension module utils... +15: Loading extension module utils... +11: Loading extension module utils... +11: Loading extension module utils... +18: Loading extension module utils... +13: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... +14: Loading extension module utils... +23: Loading extension module utils... +12: Loading extension module utils... +14: Loading extension module utils... +23: Loading extension module utils... +19: Loading extension module utils... +22: Loading extension module utils... +19: Loading extension module utils... +13: Loading extension module utils... +21: Loading extension module utils... +22: Loading extension module utils... +15: Loading extension module utils... +21: Loading extension module utils... +14: Loading extension module utils... +12: Loading extension module utils... +13: Loading extension module utils... +14: Loading extension module utils... + 9: Loading extension module utils... +15: Loading extension module utils... +10: Loading extension module utils... +12: Loading extension module utils... +11: Loading extension module utils... + 8: Loading extension module utils... +10: Loading extension module utils... +20: Loading extension module utils... +11: Loading extension module utils... +20: Loading extension module utils... + 9: Loading extension module utils... +11: Loading extension module utils... +10: Loading extension module utils... +21: Loading extension module utils... +11: Loading extension module utils... +10: Loading extension module utils... +21: Loading extension module utils... +20: Loading extension module utils... +19: Loading extension module utils... +20: Loading extension module utils... +22: Loading extension module utils... +23: Loading extension module utils... +19: Loading extension module utils... +22: Loading extension module utils... +23: Loading extension module utils... +15: Loading extension module utils... +15: Loading extension module utils... +14: Loading extension module utils... +14: Loading extension module utils... +13: Loading extension module utils... +13: Loading extension module utils... +12: Loading extension module utils... +12: Loading extension module utils... +16: Loading extension module utils... +20: Loading extension module utils... +22: Loading extension module utils... +21: Loading extension module utils... +16: Loading extension module utils... +17: Loading extension module utils... +22: Loading extension module utils... +18: Loading extension module utils... +17: Loading extension module utils... +21: Loading extension module utils... +20: Loading extension module utils... +23: Loading extension module utils... +18: Loading extension module utils... +23: Loading extension module utils... +19: Loading extension module utils... +19: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: +16: Loading extension module utils... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +17: +17: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: +13: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +13: +13: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +13: No modifications detected for re-loaded extension module utils, skipping build step... +13: Loading extension module utils... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: +12: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +14: +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +14: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +12: No modifications detected for re-loaded extension module utils, skipping build step... +12: Loading extension module utils... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +12: +12: +12: Loading extension module utils... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +12: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +12: +12: +12: Loading extension module utils... +15: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... +15: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +15: +15: Loading extension module utils...Loading extension module utils... +15: No modifications detected for re-loaded extension module utils, skipping build step... +14: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +14: +14: Loading extension module utils...Loading extension module utils... +14: +14: No modifications detected for re-loaded extension module utils, skipping build step... +15: +15: Loading extension module utils... +15: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +15: +15: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... +15: +15: No modifications detected for re-loaded extension module utils, skipping build step... +15: Loading extension module utils... +14: Loading extension module utils... +14: No modifications detected for re-loaded extension module utils, skipping build step... +14: Loading extension module utils... +14: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +14: +14: Loading extension module utils...Loading extension module utils... +14: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +14: +14: +14: Loading extension module utils...Loading extension module utils... +14: +15: +15: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: +10: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +10: +10: +10: +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: Loading extension module utils... +10: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +10: +10: Loading extension module utils...Loading extension module utils... +10: +10: No modifications detected for re-loaded extension module utils, skipping build step... +10: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +10: +10: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +10: +10: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: + 8: + 8: + 8: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: + 8: + 8: Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... + 8: + 8: + 8: Loading extension module utils... + 8: No modifications detected for re-loaded extension module utils, skipping build step... + 8: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils...Loading extension module utils... +18: +18: +18: Loading extension module utils... +18: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils... +11: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +23: +23: +23: +23: Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... +23: +23: +23: +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +11: No modifications detected for re-loaded extension module utils, skipping build step... +11: Loading extension module utils... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: +21: +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +21: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +21: +21: Loading extension module utils...Loading extension module utils... +21: +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +21: +21: +21: Loading extension module utils...Loading extension module utils... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: +21: Loading extension module utils... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: +22: +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... +22: +22: +22: Loading extension module utils... +22: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... +19: +19: Loading extension module utils...Loading extension module utils... +19: +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Loading extension module utils... + 9: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 9: + 9: Loading extension module utils...Loading extension module utils... + 9: + 9: No modifications detected for re-loaded extension module utils, skipping build step... + 9: Loading extension module utils... + 9: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 9: + 9: Loading extension module utils...Loading extension module utils... + 9: + 9: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 9: + 9: Loading extension module utils...Loading extension module utils... + 9: + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +24: Building extension module utils... +24: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +24: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +24: Emitting ninja build file /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu/utils/build.ninja... +24: Building extension module utils... +24: Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +24: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... + 1: Loading extension module utils... + 1: Loading extension module utils... + 7: Loading extension module utils... + 7: Loading extension module utils... + 5: Loading extension module utils... + 4: Loading extension module utils... + 3: Loading extension module utils... + 5: Loading extension module utils... + 4: Loading extension module utils... + 3: Loading extension module utils... + 2: Loading extension module utils... + 2: Loading extension module utils... + 6: Loading extension module utils... + 6: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... + 0: Loading extension module utils... + 1: Loading extension module utils... + 1: Loading extension module utils... + 1: Loading extension module utils... + 2: Loading extension module utils... + 2: Loading extension module utils... + 3: Loading extension module utils... + 3: Loading extension module utils... + 4: Loading extension module utils... + 7: Loading extension module utils... + 7: Loading extension module utils... + 2: Loading extension module utils... + 1: Loading extension module utils... + 2: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 4: Loading extension module utils... + 3: Loading extension module utils... + 4: Loading extension module utils... + 4: Loading extension module utils... + 3: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 7: Loading extension module utils... + 7: Loading extension module utils... + 5: Loading extension module utils... + 6: Loading extension module utils... + 0: Loading extension module utils... +24: Loading extension module utils... +30: Loading extension module utils... +25: Loading extension module utils... +27: Loading extension module utils... +28: Loading extension module utils... +29: Loading extension module utils... +26: Loading extension module utils... +30: Loading extension module utils... +27: Loading extension module utils... +26: Loading extension module utils... +28: Loading extension module utils... +31: Loading extension module utils... +29: Loading extension module utils... +25: Loading extension module utils... +31: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... +24: Loading extension module utils... + 0: Loading extension module utils... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: + 1: +25: Loading extension module utils... +21: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 1: + 1: Loading extension module utils...Loading extension module utils... + 1: +25: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... +22: Loading extension module utils... +19: Loading extension module utils... +21: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... +22: Loading extension module utils... +18: Loading extension module utils... +19: Loading extension module utils... +20: Loading extension module utils... +17: Loading extension module utils... +16: Loading extension module utils... +18: Loading extension module utils... +20: Loading extension module utils... +17: Loading extension module utils... +23: Loading extension module utils... +23: Loading extension module utils... +26: Loading extension module utils... +26: Loading extension module utils... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 7: + 7: + 7: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... + 7: + 7: Loading extension module utils... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: + 3: + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 3: + 3: Loading extension module utils... + 3: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... +27: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 4: + 4: Loading extension module utils...Loading extension module utils... + 4: + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + 2: + 2: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... + 2: Loading extension module utils... + 2: + 2: Loading extension module utils... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: Loading extension module utils... + 3: Loading extension module utils...Loading extension module utils... + 3: + 5: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 5: + 5: Loading extension module utils...Loading extension module utils... + 5: + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... + 2: Loading extension module utils... + 4: Loading extension module utils... + 2: Loading extension module utils... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... + 4: Loading extension module utils... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + 5: + 5: Loading extension module utils... + 5: Loading extension module utils...Loading extension module utils... + 1: Loading extension module utils...Loading extension module utils... + 1: + 5: + 7: Loading extension module utils... + 7: Loading extension module utils... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + 6: + 6: Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + 6: + 6: + 6: Loading extension module utils... + 6: Loading extension module utils... +28: Loading extension module utils... + 6: Loading extension module utils... +28: Loading extension module utils... +24: Loading extension module utils... +29: Loading extension module utils... +29: Loading extension module utils... +30: Loading extension module utils... +30: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Loading extension module utils... +31: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... +26: Loading extension module utils... +26: Loading extension module utils... +24: Loading extension module utils... +28: Loading extension module utils... +25: Loading extension module utils... +25: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: Loading extension module utils... +27: Loading extension module utils... +27: Loading extension module utils... +30: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +28: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Loading extension module utils... +31: Loading extension module utils... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +29: Loading extension module utils... +29: Loading extension module utils... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +21: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +21: No modifications detected for re-loaded extension module utils, skipping build step... +21: Loading extension module utils... +22: No modifications detected for re-loaded extension module utils, skipping build step... +22: Loading extension module utils... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +18: No modifications detected for re-loaded extension module utils, skipping build step... +18: Loading extension module utils... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +16: No modifications detected for re-loaded extension module utils, skipping build step... +16: Loading extension module utils... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +17: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +17: No modifications detected for re-loaded extension module utils, skipping build step... +17: Loading extension module utils... +19: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +19: No modifications detected for re-loaded extension module utils, skipping build step... +19: Loading extension module utils... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +20: No modifications detected for re-loaded extension module utils, skipping build step... +20: Loading extension module utils... +23: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils... +23: No modifications detected for re-loaded extension module utils, skipping build step... +23: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 3: No modifications detected for re-loaded extension module utils, skipping build step... + 3: Loading extension module utils... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 5: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 4: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 2: No modifications detected for re-loaded extension module utils, skipping build step... + 2: Loading extension module utils... + 4: No modifications detected for re-loaded extension module utils, skipping build step... + 4: Loading extension module utils... + 5: No modifications detected for re-loaded extension module utils, skipping build step... + 5: Loading extension module utils... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... + 1: No modifications detected for re-loaded extension module utils, skipping build step... + 1: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 7: No modifications detected for re-loaded extension module utils, skipping build step... + 7: Loading extension module utils... + 6: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... + 6: No modifications detected for re-loaded extension module utils, skipping build step... + 6: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root...No modifications detected for re-loaded extension module utils, skipping build step... +29: +29: Loading extension module utils... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +25: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +25: No modifications detected for re-loaded extension module utils, skipping build step... +25: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +30: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +30: No modifications detected for re-loaded extension module utils, skipping build step... +30: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +26: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +28: No modifications detected for re-loaded extension module utils, skipping build step... +28: Loading extension module utils... +26: No modifications detected for re-loaded extension module utils, skipping build step... +26: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +24: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +27: No modifications detected for re-loaded extension module utils, skipping build step... +27: Loading extension module utils... +31: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... +29: No modifications detected for re-loaded extension module utils, skipping build step... +29: Loading extension module utils... +24: No modifications detected for re-loaded extension module utils, skipping build step... +24: Loading extension module utils... +31: No modifications detected for re-loaded extension module utils, skipping build step... +31: Loading extension module utils... + 0: Using /pfs/lustrep4/users/muennighoff/.cache/torch_extensions/py39_cpu as PyTorch extensions root... + 0: No modifications detected for re-loaded extension module utils, skipping build step... + 0: Loading extension module utils... + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/utils.py:349: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + 0: warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") diff --git a/8b712b400m/3324489.out b/8b712b400m/3324489.out new file mode 100644 index 0000000000000000000000000000000000000000..0c94a330526757a6bf43539f0ae7109394e7effd --- /dev/null +++ b/8b712b400m/3324489.out @@ -0,0 +1,14936 @@ +Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 +Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 4 --pipeline-model-parallel-size 4 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 1 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b712b400mval --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 1 --lr-warmup-samples 0 --clip-grad 1.0 --weight-decay 1e-1 --override-lr-scheduler --reset-progress --no-load-optim --log-interval 10 --save-interval 5000 --eval-interval 1 --eval-iters 100 --eval-only true --tensorboard-dir tensorboard_8b712b400mval --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b712b400m --load checkpoints_8b712b400m --train-weighted-split-paths-path train400m.txt --valid-weighted-split-paths-path val.txt --data-impl mmap --num-workers 0 --valid-num-workers 0 --deepspeed --deepspeed_config ds_configs/3324489.json --zero-stage 0 +START 3324489: Thu 16 Mar 2023 07:03:56 PM EET + 0: + 0: + 0: ======================= ROCm System Management Interface ======================= + 0: ================================= Concise Info ================================= + 0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 0: 0 46.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 2 40.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: 6 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 0: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 0: ================================================================================ + 0: ============================= End of ROCm SMI Log ============================== +19: +19: +19: ======================= ROCm System Management Interface ======================= +19: ================================= Concise Info ================================= +19: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +19: 0 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 2 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 4 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: 6 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +19: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +19: ================================================================================ +19: ============================= End of ROCm SMI Log ============================== + 9: + 9: + 9: ======================= ROCm System Management Interface ======================= + 9: ================================= Concise Info ================================= + 9: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 9: 0 47.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 2 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 4 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: 6 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 9: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 9: ================================================================================ + 9: ============================= End of ROCm SMI Log ============================== +12: +12: +12: ======================= ROCm System Management Interface ======================= +12: ================================= Concise Info ================================= +12: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +12: 0 47.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 4 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: 6 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +12: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +12: ================================================================================ +12: ============================= End of ROCm SMI Log ============================== + 7: + 7: + 7: ======================= ROCm System Management Interface ======================= + 7: ================================= Concise Info ================================= + 7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 7: 0 48.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 2 43.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 4 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 7: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 7: ================================================================================ + 7: ============================= End of ROCm SMI Log ============================== +13: +13: +13: ======================= ROCm System Management Interface ======================= +13: ================================= Concise Info ================================= +13: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +13: 0 50.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 2 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 4 44.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: 6 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +13: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +13: ================================================================================ +13: ============================= End of ROCm SMI Log ============================== + 1: + 1: + 1: ======================= ROCm System Management Interface ======================= + 1: ================================= Concise Info ================================= + 1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 1: 0 44.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 2 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 4 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 1: ================================================================================ + 1: ============================= End of ROCm SMI Log ============================== +18: +18: +18: ======================= ROCm System Management Interface ======================= +18: ================================= Concise Info ================================= +18: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +18: 0 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 2 37.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 4 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: 6 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +18: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +18: ================================================================================ +18: ============================= End of ROCm SMI Log ============================== +28: +28: +28: ======================= ROCm System Management Interface ======================= +28: ================================= Concise Info ================================= +28: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +28: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 2 38.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 4 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: 6 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +28: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +28: ================================================================================ +28: ============================= End of ROCm SMI Log ============================== +26: +26: +26: ======================= ROCm System Management Interface ======================= +26: ================================= Concise Info ================================= +26: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +26: 0 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 4 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: 6 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +26: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +26: ================================================================================ +26: ============================= End of ROCm SMI Log ============================== +24: +24: +24: ======================= ROCm System Management Interface ======================= +24: ================================= Concise Info ================================= +24: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +24: 0 47.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 2 41.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 4 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: 6 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +24: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +24: ================================================================================ +24: ============================= End of ROCm SMI Log ============================== +21: +21: +21: ======================= ROCm System Management Interface ======================= +21: ================================= Concise Info ================================= +21: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +21: 0 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 2 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 4 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: 6 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +21: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +21: ================================================================================ +21: ============================= End of ROCm SMI Log ============================== + 6: + 6: + 6: ======================= ROCm System Management Interface ======================= + 6: ================================= Concise Info ================================= + 6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 6: 0 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 2 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 4 44.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: 6 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 6: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 6: ================================================================================ + 6: ============================= End of ROCm SMI Log ============================== +15: +15: +15: ======================= ROCm System Management Interface ======================= +15: ================================= Concise Info ================================= +15: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +15: 0 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 2 47.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 4 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: 6 36.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +15: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +15: ================================================================================ +15: ============================= End of ROCm SMI Log ============================== + 5: + 5: + 5: ======================= ROCm System Management Interface ======================= + 5: ================================= Concise Info ================================= + 5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 5: 0 49.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 2 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: 6 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 5: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 5: ================================================================================ + 5: ============================= End of ROCm SMI Log ============================== + 2: + 2: + 2: ======================= ROCm System Management Interface ======================= + 2: ================================= Concise Info ================================= + 2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 2: 0 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: 6 42.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 2: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 2: ================================================================================ + 2: ============================= End of ROCm SMI Log ============================== +17: +17: +17: ======================= ROCm System Management Interface ======================= +17: ================================= Concise Info ================================= +17: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +17: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 2 37.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 4 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: 6 38.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +17: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +17: ================================================================================ +17: ============================= End of ROCm SMI Log ============================== +11: +11: +11: ======================= ROCm System Management Interface ======================= +11: ================================= Concise Info ================================= +11: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +11: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 4 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: 6 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +11: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +11: ================================================================================ +11: ============================= End of ROCm SMI Log ============================== +31: +31: +31: ======================= ROCm System Management Interface ======================= +31: ================================= Concise Info ================================= +31: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +31: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 2 44.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 4 46.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: 6 44.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +31: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +31: ================================================================================ +31: ============================= End of ROCm SMI Log ============================== +30: +30: +30: ======================= ROCm System Management Interface ======================= +30: ================================= Concise Info ================================= +30: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +30: 0 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 2 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: 6 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +30: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +30: ================================================================================ +30: ============================= End of ROCm SMI Log ============================== +14: +14: +14: ======================= ROCm System Management Interface ======================= +14: ================================= Concise Info ================================= +14: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +14: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 2 36.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 4 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 5 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: 6 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +14: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +14: ================================================================================ +14: ============================= End of ROCm SMI Log ============================== +22: +22: +22: ======================= ROCm System Management Interface ======================= +22: ================================= Concise Info ================================= +22: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +22: 0 37.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 2 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 4 44.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: 6 38.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +22: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +22: ================================================================================ +22: ============================= End of ROCm SMI Log ============================== +25: +25: +25: ======================= ROCm System Management Interface ======================= +25: ================================= Concise Info ================================= +25: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +25: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 2 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: 6 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +25: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +25: ================================================================================ +25: ============================= End of ROCm SMI Log ============================== + 8: + 8: + 8: ======================= ROCm System Management Interface ======================= + 8: ================================= Concise Info ================================= + 8: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 8: 0 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 2 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 4 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: 6 44.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 8: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 8: ================================================================================ + 8: ============================= End of ROCm SMI Log ============================== +23: +23: +23: ======================= ROCm System Management Interface ======================= +23: ================================= Concise Info ================================= +23: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +23: 0 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 4 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: 6 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +23: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +23: ================================================================================ +23: ============================= End of ROCm SMI Log ============================== +16: +16: +16: ======================= ROCm System Management Interface ======================= +16: ================================= Concise Info ================================= +16: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +16: 0 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 4 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: 6 38.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +16: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +16: ================================================================================ +16: ============================= End of ROCm SMI Log ============================== + 4: + 4: + 4: ======================= ROCm System Management Interface ======================= + 4: ================================= Concise Info ================================= + 4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 4: 0 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 2 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 4: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 4: ================================================================================ + 4: ============================= End of ROCm SMI Log ============================== +10: +10: +10: ======================= ROCm System Management Interface ======================= +10: ================================= Concise Info ================================= +10: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +10: 0 46.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 4 46.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: 6 38.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +10: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +10: ================================================================================ +10: ============================= End of ROCm SMI Log ============================== + 3: + 3: + 3: ======================= ROCm System Management Interface ======================= + 3: ================================= Concise Info ================================= + 3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% + 3: 0 48.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 2 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 3 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 4 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: 6 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% + 3: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% + 3: ================================================================================ + 3: ============================= End of ROCm SMI Log ============================== +29: +29: +29: ======================= ROCm System Management Interface ======================= +29: ================================= Concise Info ================================= +29: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +29: 0 45.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 2 38.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 4 43.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: 6 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +29: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +29: ================================================================================ +29: ============================= End of ROCm SMI Log ============================== +27: +27: +27: ======================= ROCm System Management Interface ======================= +27: ================================= Concise Info ================================= +27: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +27: 0 48.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 2 39.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 4 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: 6 39.0c 80.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +27: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +27: ================================================================================ +27: ============================= End of ROCm SMI Log ============================== +20: +20: +20: ======================= ROCm System Management Interface ======================= +20: ================================= Concise Info ================================= +20: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% +20: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 2 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 4 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 5 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: 6 42.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% +20: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% +20: ================================================================================ +20: ============================= End of ROCm SMI Log ============================== +18: Launching on nid005050 (18/32), master nid005032 port 9999, GPUs 8, CUDA: True + 4: Launching on nid005036 (4/32), master nid005032 port 9999, GPUs 8, CUDA: True + 5: Launching on nid005037 (5/32), master nid005032 port 9999, GPUs 8, CUDA: True +21: Launching on nid005053 (21/32), master nid005032 port 9999, GPUs 8, CUDA: True + 0: Launching on nid005032 (0/32), master nid005032 port 9999, GPUs 8, CUDA: True +12: Launching on nid005044 (12/32), master nid005032 port 9999, GPUs 8, CUDA: True +13: Launching on nid005045 (13/32), master nid005032 port 9999, GPUs 8, CUDA: True + 6: Launching on nid005038 (6/32), master nid005032 port 9999, GPUs 8, CUDA: True +22: Launching on nid005054 (22/32), master nid005032 port 9999, GPUs 8, CUDA: True +11: Launching on nid005043 (11/32), master nid005032 port 9999, GPUs 8, CUDA: True +27: Launching on nid005059 (27/32), master nid005032 port 9999, GPUs 8, CUDA: True +26: Launching on nid005058 (26/32), master nid005032 port 9999, GPUs 8, CUDA: True +23: Launching on nid005055 (23/32), master nid005032 port 9999, GPUs 8, CUDA: True +16: Launching on nid005048 (16/32), master nid005032 port 9999, GPUs 8, CUDA: True +29: Launching on nid005061 (29/32), master nid005032 port 9999, GPUs 8, CUDA: True +30: Launching on nid005062 (30/32), master nid005032 port 9999, GPUs 8, CUDA: True + 2: Launching on nid005034 (2/32), master nid005032 port 9999, GPUs 8, CUDA: True + 8: Launching on nid005040 (8/32), master nid005032 port 9999, GPUs 8, CUDA: True + 1: Launching on nid005033 (1/32), master nid005032 port 9999, GPUs 8, CUDA: True +24: Launching on nid005056 (24/32), master nid005032 port 9999, GPUs 8, CUDA: True + 9: Launching on nid005041 (9/32), master nid005032 port 9999, GPUs 8, CUDA: True +17: Launching on nid005049 (17/32), master nid005032 port 9999, GPUs 8, CUDA: True +19: Launching on nid005051 (19/32), master nid005032 port 9999, GPUs 8, CUDA: True +20: Launching on nid005052 (20/32), master nid005032 port 9999, GPUs 8, CUDA: True +15: Launching on nid005047 (15/32), master nid005032 port 9999, GPUs 8, CUDA: True +10: Launching on nid005042 (10/32), master nid005032 port 9999, GPUs 8, CUDA: True +14: Launching on nid005046 (14/32), master nid005032 port 9999, GPUs 8, CUDA: True +28: Launching on nid005060 (28/32), master nid005032 port 9999, GPUs 8, CUDA: True +25: Launching on nid005057 (25/32), master nid005032 port 9999, GPUs 8, CUDA: True + 3: Launching on nid005035 (3/32), master nid005032 port 9999, GPUs 8, CUDA: True +31: Launching on nid005063 (31/32), master nid005032 port 9999, GPUs 8, CUDA: True + 7: Launching on nid005039 (7/32), master nid005032 port 9999, GPUs 8, CUDA: True + 0: using world size: 256, data-parallel-size: 16, tensor-model-parallel size: 4, pipeline-model-parallel size: 4 + 0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. + 0: using torch.bfloat16 for parameters ... + 0: ------------------------ arguments ------------------------ + 0: abort_on_unmet_fused_kernel_constraints ......... False + 0: accumulate_allreduce_grads_in_fp32 .............. True + 0: adam_beta1 ...................................... 0.9 + 0: adam_beta2 ...................................... 0.999 + 0: adam_eps ........................................ 1e-08 + 0: adlr_autoresume ................................. False + 0: adlr_autoresume_interval ........................ 1000 + 0: apply_query_key_layer_scaling ................... True + 0: apply_residual_connection_post_layernorm ........ False + 0: attention_dropout ............................... 0.1 + 0: attention_softmax_in_fp32 ....................... False + 0: bert_binary_head ................................ True + 0: bert_load ....................................... None + 0: bf16 ............................................ True + 0: bias_dropout_fusion ............................. True + 0: bias_gelu_fusion ................................ True + 0: biencoder_projection_dim ........................ 0 + 0: biencoder_shared_query_context_model ............ False + 0: block_data_path ................................. None + 0: checkpoint_activations .......................... False + 0: checkpoint_in_cpu ............................... False + 0: checkpoint_num_layers ........................... 1 + 0: clip_grad ....................................... 1.0 + 0: codecarbon_dir .................................. None + 0: consumed_train_samples .......................... 0 + 0: consumed_train_tokens ........................... 0 + 0: consumed_valid_samples .......................... 0 + 0: contigious_checkpointing ........................ False + 0: cpu_optimizer ................................... False + 0: cpu_torch_adam .................................. False + 0: curriculum_learning ............................. False + 0: data_impl ....................................... mmap + 0: data_parallel_size .............................. 16 + 0: data_path ....................................... None + 0: dataloader_type ................................. single + 0: DDP_impl ........................................ local + 0: decoder_seq_length .............................. None + 0: deepscale ....................................... False + 0: deepscale_config ................................ None + 0: deepspeed ....................................... True + 0: deepspeed_activation_checkpointing .............. False + 0: deepspeed_config ................................ ds_configs/3324489.json + 0: deepspeed_mpi ................................... False + 0: distribute_checkpointed_activations ............. False + 0: distributed_backend ............................. nccl + 0: embed_layernorm ................................. False + 0: embedding_path .................................. None + 0: encoder_seq_length .............................. 2048 + 0: eod_mask_loss ................................... False + 0: eval_interval ................................... 1 + 0: eval_iters ...................................... 100 + 0: eval_only ....................................... True + 0: evidence_data_path .............................. None + 0: exit_duration_in_mins ........................... None + 0: exit_interval ................................... None + 0: ffn_hidden_size ................................. 16384 + 0: finetune ........................................ False + 0: fp16 ............................................ False + 0: fp16_lm_cross_entropy ........................... False + 0: fp32_residual_connection ........................ False + 0: gigaflos_no_embeds .............................. 0 + 0: global_batch_size ............................... 1024 + 0: glu_activation .................................. None + 0: hidden_dropout .................................. 0.1 + 0: hidden_size ..................................... 4096 + 0: hysteresis ...................................... 2 + 0: ict_head_size ................................... None + 0: ict_load ........................................ None + 0: img_dim ......................................... 224 + 0: indexer_batch_size .............................. 128 + 0: indexer_log_interval ............................ 1000 + 0: inference ....................................... False + 0: init_method_std ................................. 0.02 + 0: init_method_xavier_uniform ...................... False + 0: initial_loss_scale .............................. 4294967296 + 0: kill_switch_path ................................ kill-switch-8b712b400mval + 0: kv_channels ..................................... 128 + 0: layer_norm_fusion ............................... True + 0: layernorm_epsilon ............................... 1e-05 + 0: lazy_mpu_init ................................... None + 0: load ............................................ checkpoints_8b712b400m + 0: local_rank ...................................... None + 0: log_batch_size_to_tensorboard ................... True + 0: log_interval .................................... 10 + 0: log_learning_rate_to_tensorboard ................ True + 0: log_level ....................................... None + 0: log_level_replica ............................... None + 0: log_loss_scale_to_tensorboard ................... True + 0: log_num_zeros_in_grad ........................... False + 0: log_params_norm ................................. False + 0: log_path ........................................ None + 0: log_timers_to_tensorboard ....................... True + 0: log_validation_ppl_to_tensorboard ............... True + 0: loss_on_targets_only ............................ False + 0: loss_scale ...................................... None + 0: loss_scale_window ............................... 1000 + 0: lr .............................................. 0.0002 + 0: lr_decay_iters .................................. None + 0: lr_decay_samples ................................ 1 + 0: lr_decay_style .................................. cosine + 0: lr_decay_tokens ................................. None + 0: lr_warmup_fraction .............................. None + 0: lr_warmup_iters ................................. 0 + 0: lr_warmup_samples ............................... 0 + 0: make_vocab_size_divisible_by .................... 128 + 0: mask_prob ....................................... 0.15 + 0: masked_softmax_fusion ........................... True + 0: max_position_embeddings ......................... 2048 + 0: mean_noise_span_length .......................... None + 0: memory_centric_tiled_linear ..................... False + 0: merge_file ...................................... gpt2/merges.txt + 0: micro_batch_size ................................ 2 + 0: min_loss_scale .................................. 1.0 + 0: min_lr .......................................... 2e-05 + 0: mmap_warmup ..................................... False + 0: no_load_optim ................................... True + 0: no_load_rng ..................................... None + 0: no_save_optim ................................... None + 0: no_save_rng ..................................... None + 0: noise_density ................................... None + 0: num_attention_heads ............................. 32 + 0: num_channels .................................... 3 + 0: num_classes ..................................... 1000 + 0: num_layers ...................................... 42 + 0: num_layers_per_virtual_pipeline_stage ........... None + 0: num_workers ..................................... 0 + 0: onnx_safe ....................................... None + 0: openai_gelu ..................................... False + 0: optimizer ....................................... adam + 0: optimizer_fusion ................................ True + 0: override_lr_scheduler ........................... True + 0: pad_vocab_size_to ............................... None + 0: params_dtype .................................... torch.bfloat16 + 0: partition_activations ........................... False + 0: patch_dim ....................................... 16 + 0: pipeline_model_parallel_size .................... 4 + 0: position_embedding_type ......................... PositionEmbeddingType.absolute + 0: pp_partition_method ............................. None + 0: profile_backward ................................ False + 0: query_in_block_prob ............................. 0.1 + 0: rampup_batch_size ............................... None + 0: rank ............................................ 0 + 0: remote_device ................................... none + 0: reset_attention_mask ............................ False + 0: reset_position_ids .............................. False + 0: reset_progress .................................. True + 0: retriever_report_topk_accuracies ................ [] + 0: retriever_score_scaling ......................... False + 0: retriever_seq_length ............................ 256 + 0: reweight_loss_based_on_position_frequency ....... False + 0: sample_rate ..................................... 1.0 + 0: save ............................................ checkpoints_8b712b400m + 0: save_interval ................................... 5000 + 0: scatter_gather_tensors_in_pipeline .............. True + 0: scattered_embeddings ............................ False + 0: seed ............................................ 1234 + 0: seq_length ...................................... 2048 + 0: sgd_momentum .................................... 0.9 + 0: short_seq_prob .................................. 0.1 + 0: skip_train_iteration_range ...................... None + 0: split ........................................... None + 0: split_transformers .............................. False + 0: sync_tp_duplicated_parameters ................... False + 0: synchronize_each_layer .......................... False + 0: tensor_model_parallel_size ...................... 4 + 0: tensorboard_dir ................................. tensorboard_8b712b400mval + 0: tensorboard_log_interval ........................ 1 + 0: tensorboard_queue_size .......................... 5 + 0: test_weighted_split_paths ....................... None + 0: test_weighted_split_paths_path .................. None + 0: tile_factor ..................................... 1 + 0: titles_data_path ................................ None + 0: tokenizer_name_or_path .......................... None + 0: tokenizer_type .................................. GPT2BPETokenizer + 0: train_iters ..................................... None + 0: train_samples ................................... 1 + 0: train_tokens .................................... None + 0: train_weighted_split_names ...................... ['train'] + 0: train_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document']] + 0: train_weighted_split_paths_path ................. None + 0: train_weighted_split_splits ..................... [['0:1']] + 0: train_weighted_split_weights .................... [['1.0']] + 0: universal_checkpoint ............................ False + 0: use_bnb_optimizer ............................... False + 0: use_checkpoint_lr_scheduler ..................... False + 0: use_contiguous_buffers_in_ddp ................... True + 0: use_cpu_initialization .......................... None + 0: use_one_sent_docs ............................... False + 0: use_pin_memory .................................. False + 0: valid_num_workers ............................... 0 + 0: valid_weighted_split_names ...................... ['validation'] + 0: valid_weighted_split_paths ...................... [['/scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document']] + 0: valid_weighted_split_paths_path ................. None + 0: valid_weighted_split_splits ..................... [['0:1']] + 0: valid_weighted_split_weights .................... [['1.0']] + 0: virtual_pipeline_model_parallel_size ............ None + 0: vocab_extra_ids ................................. 0 + 0: vocab_file ...................................... gpt2/vocab.json + 0: weight_decay .................................... 0.1 + 0: world_size ...................................... 256 + 0: zero_allgather_bucket_size ...................... 0.0 + 0: zero_contigious_gradients ....................... False + 0: zero_reduce_bucket_size ......................... 0.0 + 0: zero_reduce_scatter ............................. False + 0: zero_stage ...................................... 0 + 0: -------------------- end of arguments --------------------- + 0: setting number of micro-batches to constant 32 + 0: > building GPT2BPETokenizer tokenizer ... + 0: > padded vocab (size: 50257) with 431 dummy tokens (new size: 50688) + 0: DeepSpeed general environment info: + 0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] + 0: torch version .................... 1.13.0+rocm5.2 + 0: torch cuda version ............... None + 0: torch hip version ................ 5.2.21151-afdc89f8 + 0: nvcc version ..................... None + 0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] + 0: deepspeed info ................... 0.7.5, unknown, unknown + 0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 + 0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** + 0: > initializing torch distributed ... + 0: [2023-03-16 19:04:54,426] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +31: > setting tensorboard ... + 0: > initializing tensor model parallel with size 4 + 0: > initializing pipeline model parallel with size 4 + 0: > setting random seeds to 1234 ... + 0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 + 0: > compiling dataset index builder ... + 0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' + 0: make: Nothing to be done for 'default'. + 0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' + 0: >>> done with dataset index builder. Compilation time: 0.094 seconds + 0: > compiling and loading fused kernels ... + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 87 + 0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 63 + 0: [1/1] c++ scaled_masked_softmax_hip.o scaled_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_masked_softmax_cuda.so + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] + 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] + 0: Total number of unsupported CUDA function calls: 0 + 0: + 0: + 0: Total number of replaced kernel launches: 67 + 0: ninja: no work to do. + 0: >>> done with compiling and loading fused kernels. Compilation time: 15.220 seconds + 0: time to initialize megatron (seconds): 72.747 + 0: [after megatron is initialized] datetime: 2023-03-16 19:05:12 + 0: building GPT model ... + 0: [2023-03-16 19:05:12,902] [INFO] [utils.py:827:see_memory_usage] Before Building Model + 0: [2023-03-16 19:05:12,903] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB + 0: [2023-03-16 19:05:12,903] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 30.93 GB, percent = 6.1% + 0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None + 0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=0, model=2): 2, ProcessCoord(pipe=0, data=0, model=3): 3, ProcessCoord(pipe=0, data=1, model=0): 4, ProcessCoord(pipe=0, data=1, model=1): 5, ProcessCoord(pipe=0, data=1, model=2): 6, ProcessCoord(pipe=0, data=1, model=3): 7, ProcessCoord(pipe=0, data=2, model=0): 8, ProcessCoord(pipe=0, data=2, model=1): 9, ProcessCoord(pipe=0, data=2, model=2): 10, ProcessCoord(pipe=0, data=2, model=3): 11, ProcessCoord(pipe=0, data=3, model=0): 12, ProcessCoord(pipe=0, data=3, model=1): 13, ProcessCoord(pipe=0, data=3, model=2): 14, ProcessCoord(pipe=0, data=3, model=3): 15, ProcessCoord(pipe=0, data=4, model=0): 16, ProcessCoord(pipe=0, data=4, model=1): 17, ProcessCoord(pipe=0, data=4, model=2): 18, ProcessCoord(pipe=0, data=4, model=3): 19, ProcessCoord(pipe=0, data=5, model=0): 20, ProcessCoord(pipe=0, data=5, model=1): 21, ProcessCoord(pipe=0, data=5, model=2): 22, ProcessCoord(pipe=0, data=5, + 0: model=3): 23, ProcessCoord(pipe=0, data=6, model=0): 24, ProcessCoord(pipe=0, data=6, model=1): 25, ProcessCoord(pipe=0, data=6, model=2): 26, ProcessCoord(pipe=0, data=6, model=3): 27, ProcessCoord(pipe=0, data=7, model=0): 28, ProcessCoord(pipe=0, data=7, model=1): 29, ProcessCoord(pipe=0, data=7, model=2): 30, ProcessCoord(pipe=0, data=7, model=3): 31, ProcessCoord(pipe=0, data=8, model=0): 32, ProcessCoord(pipe=0, data=8, model=1): 33, ProcessCoord(pipe=0, data=8, model=2): 34, ProcessCoord(pipe=0, data=8, model=3): 35, ProcessCoord(pipe=0, data=9, model=0): 36, ProcessCoord(pipe=0, data=9, model=1): 37, ProcessCoord(pipe=0, data=9, model=2): 38, ProcessCoord(pipe=0, data=9, model=3): 39, ProcessCoord(pipe=0, data=10, model=0): 40, ProcessCoord(pipe=0, data=10, model=1): 41, ProcessCoord(pipe=0, data=10, model=2): 42, ProcessCoord(pipe=0, data=10, model=3): 43, ProcessCoord(pipe=0, data=11, model=0): 44, ProcessCoord(pipe=0, data=11, model=1): 45, ProcessCoord(pipe=0, data=11, model=2): 46, ProcessCoord( + 0: pipe=0, data=11, model=3): 47, ProcessCoord(pipe=0, data=12, model=0): 48, ProcessCoord(pipe=0, data=12, model=1): 49, ProcessCoord(pipe=0, data=12, model=2): 50, ProcessCoord(pipe=0, data=12, model=3): 51, ProcessCoord(pipe=0, data=13, model=0): 52, ProcessCoord(pipe=0, data=13, model=1): 53, ProcessCoord(pipe=0, data=13, model=2): 54, ProcessCoord(pipe=0, data=13, model=3): 55, ProcessCoord(pipe=0, data=14, model=0): 56, ProcessCoord(pipe=0, data=14, model=1): 57, ProcessCoord(pipe=0, data=14, model=2): 58, ProcessCoord(pipe=0, data=14, model=3): 59, ProcessCoord(pipe=0, data=15, model=0): 60, ProcessCoord(pipe=0, data=15, model=1): 61, ProcessCoord(pipe=0, data=15, model=2): 62, ProcessCoord(pipe=0, data=15, model=3): 63, ProcessCoord(pipe=1, data=0, model=0): 64, ProcessCoord(pipe=1, data=0, model=1): 65, ProcessCoord(pipe=1, data=0, model=2): 66, ProcessCoord(pipe=1, data=0, model=3): 67, ProcessCoord(pipe=1, data=1, model=0): 68, ProcessCoord(pipe=1, data=1, model=1): 69, ProcessCoord(pipe=1, data=1, mo + 0: del=2): 70, ProcessCoord(pipe=1, data=1, model=3): 71, ProcessCoord(pipe=1, data=2, model=0): 72, ProcessCoord(pipe=1, data=2, model=1): 73, ProcessCoord(pipe=1, data=2, model=2): 74, ProcessCoord(pipe=1, data=2, model=3): 75, ProcessCoord(pipe=1, data=3, model=0): 76, ProcessCoord(pipe=1, data=3, model=1): 77, ProcessCoord(pipe=1, data=3, model=2): 78, ProcessCoord(pipe=1, data=3, model=3): 79, ProcessCoord(pipe=1, data=4, model=0): 80, ProcessCoord(pipe=1, data=4, model=1): 81, ProcessCoord(pipe=1, data=4, model=2): 82, ProcessCoord(pipe=1, data=4, model=3): 83, ProcessCoord(pipe=1, data=5, model=0): 84, ProcessCoord(pipe=1, data=5, model=1): 85, ProcessCoord(pipe=1, data=5, model=2): 86, ProcessCoord(pipe=1, data=5, model=3): 87, ProcessCoord(pipe=1, data=6, model=0): 88, ProcessCoord(pipe=1, data=6, model=1): 89, ProcessCoord(pipe=1, data=6, model=2): 90, ProcessCoord(pipe=1, data=6, model=3): 91, ProcessCoord(pipe=1, data=7, model=0): 92, ProcessCoord(pipe=1, data=7, model=1): 93, ProcessCoord(pipe=1, da + 0: ta=7, model=2): 94, ProcessCoord(pipe=1, data=7, model=3): 95, ProcessCoord(pipe=1, data=8, model=0): 96, ProcessCoord(pipe=1, data=8, model=1): 97, ProcessCoord(pipe=1, data=8, model=2): 98, ProcessCoord(pipe=1, data=8, model=3): 99, ProcessCoord(pipe=1, data=9, model=0): 100, ProcessCoord(pipe=1, data=9, model=1): 101, ProcessCoord(pipe=1, data=9, model=2): 102, ProcessCoord(pipe=1, data=9, model=3): 103, ProcessCoord(pipe=1, data=10, model=0): 104, ProcessCoord(pipe=1, data=10, model=1): 105, ProcessCoord(pipe=1, data=10, model=2): 106, ProcessCoord(pipe=1, data=10, model=3): 107, ProcessCoord(pipe=1, data=11, model=0): 108, ProcessCoord(pipe=1, data=11, model=1): 109, ProcessCoord(pipe=1, data=11, model=2): 110, ProcessCoord(pipe=1, data=11, model=3): 111, ProcessCoord(pipe=1, data=12, model=0): 112, ProcessCoord(pipe=1, data=12, model=1): 113, ProcessCoord(pipe=1, data=12, model=2): 114, ProcessCoord(pipe=1, data=12, model=3): 115, ProcessCoord(pipe=1, data=13, model=0): 116, ProcessCoord(pipe=1, data=13 + 0: , model=1): 117, ProcessCoord(pipe=1, data=13, model=2): 118, ProcessCoord(pipe=1, data=13, model=3): 119, ProcessCoord(pipe=1, data=14, model=0): 120, ProcessCoord(pipe=1, data=14, model=1): 121, ProcessCoord(pipe=1, data=14, model=2): 122, ProcessCoord(pipe=1, data=14, model=3): 123, ProcessCoord(pipe=1, data=15, model=0): 124, ProcessCoord(pipe=1, data=15, model=1): 125, ProcessCoord(pipe=1, data=15, model=2): 126, ProcessCoord(pipe=1, data=15, model=3): 127, ProcessCoord(pipe=2, data=0, model=0): 128, ProcessCoord(pipe=2, data=0, model=1): 129, ProcessCoord(pipe=2, data=0, model=2): 130, ProcessCoord(pipe=2, data=0, model=3): 131, ProcessCoord(pipe=2, data=1, model=0): 132, ProcessCoord(pipe=2, data=1, model=1): 133, ProcessCoord(pipe=2, data=1, model=2): 134, ProcessCoord(pipe=2, data=1, model=3): 135, ProcessCoord(pipe=2, data=2, model=0): 136, ProcessCoord(pipe=2, data=2, model=1): 137, ProcessCoord(pipe=2, data=2, model=2): 138, ProcessCoord(pipe=2, data=2, model=3): 139, ProcessCoord(pipe=2, data=3, + 0: model=0): 140, ProcessCoord(pipe=2, data=3, model=1): 141, ProcessCoord(pipe=2, data=3, model=2): 142, ProcessCoord(pipe=2, data=3, model=3): 143, ProcessCoord(pipe=2, data=4, model=0): 144, ProcessCoord(pipe=2, data=4, model=1): 145, ProcessCoord(pipe=2, data=4, model=2): 146, ProcessCoord(pipe=2, data=4, model=3): 147, ProcessCoord(pipe=2, data=5, model=0): 148, ProcessCoord(pipe=2, data=5, model=1): 149, ProcessCoord(pipe=2, data=5, model=2): 150, ProcessCoord(pipe=2, data=5, model=3): 151, ProcessCoord(pipe=2, data=6, model=0): 152, ProcessCoord(pipe=2, data=6, model=1): 153, ProcessCoord(pipe=2, data=6, model=2): 154, ProcessCoord(pipe=2, data=6, model=3): 155, ProcessCoord(pipe=2, data=7, model=0): 156, ProcessCoord(pipe=2, data=7, model=1): 157, ProcessCoord(pipe=2, data=7, model=2): 158, ProcessCoord(pipe=2, data=7, model=3): 159, ProcessCoord(pipe=2, data=8, model=0): 160, ProcessCoord(pipe=2, data=8, model=1): 161, ProcessCoord(pipe=2, data=8, model=2): 162, ProcessCoord(pipe=2, data=8, model=3): 16 + 0: 3, ProcessCoord(pipe=2, data=9, model=0): 164, ProcessCoord(pipe=2, data=9, model=1): 165, ProcessCoord(pipe=2, data=9, model=2): 166, ProcessCoord(pipe=2, data=9, model=3): 167, ProcessCoord(pipe=2, data=10, model=0): 168, ProcessCoord(pipe=2, data=10, model=1): 169, ProcessCoord(pipe=2, data=10, model=2): 170, ProcessCoord(pipe=2, data=10, model=3): 171, ProcessCoord(pipe=2, data=11, model=0): 172, ProcessCoord(pipe=2, data=11, model=1): 173, ProcessCoord(pipe=2, data=11, model=2): 174, ProcessCoord(pipe=2, data=11, model=3): 175, ProcessCoord(pipe=2, data=12, model=0): 176, ProcessCoord(pipe=2, data=12, model=1): 177, ProcessCoord(pipe=2, data=12, model=2): 178, ProcessCoord(pipe=2, data=12, model=3): 179, ProcessCoord(pipe=2, data=13, model=0): 180, ProcessCoord(pipe=2, data=13, model=1): 181, ProcessCoord(pipe=2, data=13, model=2): 182, ProcessCoord(pipe=2, data=13, model=3): 183, ProcessCoord(pipe=2, data=14, model=0): 184, ProcessCoord(pipe=2, data=14, model=1): 185, ProcessCoord(pipe=2, data=14, model + 0: =2): 186, ProcessCoord(pipe=2, data=14, model=3): 187, ProcessCoord(pipe=2, data=15, model=0): 188, ProcessCoord(pipe=2, data=15, model=1): 189, ProcessCoord(pipe=2, data=15, model=2): 190, ProcessCoord(pipe=2, data=15, model=3): 191, ProcessCoord(pipe=3, data=0, model=0): 192, ProcessCoord(pipe=3, data=0, model=1): 193, ProcessCoord(pipe=3, data=0, model=2): 194, ProcessCoord(pipe=3, data=0, model=3): 195, ProcessCoord(pipe=3, data=1, model=0): 196, ProcessCoord(pipe=3, data=1, model=1): 197, ProcessCoord(pipe=3, data=1, model=2): 198, ProcessCoord(pipe=3, data=1, model=3): 199, ProcessCoord(pipe=3, data=2, model=0): 200, ProcessCoord(pipe=3, data=2, model=1): 201, ProcessCoord(pipe=3, data=2, model=2): 202, ProcessCoord(pipe=3, data=2, model=3): 203, ProcessCoord(pipe=3, data=3, model=0): 204, ProcessCoord(pipe=3, data=3, model=1): 205, ProcessCoord(pipe=3, data=3, model=2): 206, ProcessCoord(pipe=3, data=3, model=3): 207, ProcessCoord(pipe=3, data=4, model=0): 208, ProcessCoord(pipe=3, data=4, model=1): 20 + 0: 9, ProcessCoord(pipe=3, data=4, model=2): 210, ProcessCoord(pipe=3, data=4, model=3): 211, ProcessCoord(pipe=3, data=5, model=0): 212, ProcessCoord(pipe=3, data=5, model=1): 213, ProcessCoord(pipe=3, data=5, model=2): 214, ProcessCoord(pipe=3, data=5, model=3): 215, ProcessCoord(pipe=3, data=6, model=0): 216, ProcessCoord(pipe=3, data=6, model=1): 217, ProcessCoord(pipe=3, data=6, model=2): 218, ProcessCoord(pipe=3, data=6, model=3): 219, ProcessCoord(pipe=3, data=7, model=0): 220, ProcessCoord(pipe=3, data=7, model=1): 221, ProcessCoord(pipe=3, data=7, model=2): 222, ProcessCoord(pipe=3, data=7, model=3): 223, ProcessCoord(pipe=3, data=8, model=0): 224, ProcessCoord(pipe=3, data=8, model=1): 225, ProcessCoord(pipe=3, data=8, model=2): 226, ProcessCoord(pipe=3, data=8, model=3): 227, ProcessCoord(pipe=3, data=9, model=0): 228, ProcessCoord(pipe=3, data=9, model=1): 229, ProcessCoord(pipe=3, data=9, model=2): 230, ProcessCoord(pipe=3, data=9, model=3): 231, ProcessCoord(pipe=3, data=10, model=0): 232, ProcessC + 0: oord(pipe=3, data=10, model=1): 233, ProcessCoord(pipe=3, data=10, model=2): 234, ProcessCoord(pipe=3, data=10, model=3): 235, ProcessCoord(pipe=3, data=11, model=0): 236, ProcessCoord(pipe=3, data=11, model=1): 237, ProcessCoord(pipe=3, data=11, model=2): 238, ProcessCoord(pipe=3, data=11, model=3): 239, ProcessCoord(pipe=3, data=12, model=0): 240, ProcessCoord(pipe=3, data=12, model=1): 241, ProcessCoord(pipe=3, data=12, model=2): 242, ProcessCoord(pipe=3, data=12, model=3): 243, ProcessCoord(pipe=3, data=13, model=0): 244, ProcessCoord(pipe=3, data=13, model=1): 245, ProcessCoord(pipe=3, data=13, model=2): 246, ProcessCoord(pipe=3, data=13, model=3): 247, ProcessCoord(pipe=3, data=14, model=0): 248, ProcessCoord(pipe=3, data=14, model=1): 249, ProcessCoord(pipe=3, data=14, model=2): 250, ProcessCoord(pipe=3, data=14, model=3): 251, ProcessCoord(pipe=3, data=15, model=0): 252, ProcessCoord(pipe=3, data=15, model=1): 253, ProcessCoord(pipe=3, data=15, model=2): 254, ProcessCoord(pipe=3, data=15, model=3): 25 + 0: 5} + 0: [2023-03-16 19:05:14,710] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer + 0: stage=0 layers=14 + 0: 0: _to_float16 + 0: 1: EmbeddingPipe + 0: 2: + 0: 3: ParallelTransformerLayerPipe + 0: 4: ParallelTransformerLayerPipe + 0: 5: ParallelTransformerLayerPipe + 0: 6: ParallelTransformerLayerPipe + 0: 7: ParallelTransformerLayerPipe + 0: 8: ParallelTransformerLayerPipe + 0: 9: ParallelTransformerLayerPipe + 0: 10: ParallelTransformerLayerPipe + 0: 11: ParallelTransformerLayerPipe + 0: 12: ParallelTransformerLayerPipe + 0: 13: ParallelTransformerLayerPipe + 0: stage=1 layers=11 + 0: 14: ParallelTransformerLayerPipe + 0: 15: ParallelTransformerLayerPipe + 0: 16: ParallelTransformerLayerPipe + 0: 17: ParallelTransformerLayerPipe + 0: 18: ParallelTransformerLayerPipe + 0: 19: ParallelTransformerLayerPipe + 0: 20: ParallelTransformerLayerPipe + 0: 21: ParallelTransformerLayerPipe + 0: 22: ParallelTransformerLayerPipe + 0: 23: ParallelTransformerLayerPipe + 0: 24: ParallelTransformerLayerPipe + 0: stage=2 layers=11 + 0: 25: ParallelTransformerLayerPipe + 0: 26: ParallelTransformerLayerPipe + 0: 27: ParallelTransformerLayerPipe + 0: 28: ParallelTransformerLayerPipe + 0: 29: ParallelTransformerLayerPipe + 0: 30: ParallelTransformerLayerPipe + 0: 31: ParallelTransformerLayerPipe + 0: 32: ParallelTransformerLayerPipe + 0: 33: ParallelTransformerLayerPipe + 0: 34: ParallelTransformerLayerPipe + 0: 35: ParallelTransformerLayerPipe + 0: stage=3 layers=13 + 0: 36: ParallelTransformerLayerPipe + 0: 37: ParallelTransformerLayerPipe + 0: 38: ParallelTransformerLayerPipe + 0: 39: ParallelTransformerLayerPipe + 0: 40: ParallelTransformerLayerPipe + 0: 41: ParallelTransformerLayerPipe + 0: 42: ParallelTransformerLayerPipe + 0: 43: ParallelTransformerLayerPipe + 0: 44: ParallelTransformerLayerPipe + 0: 45: undo + 0: 46: MixedFusedLayerNorm + 0: 47: EmbeddingPipe + 0: 48: float16_to_fp32 + 0: loss: CrossEntropy + 0: [2023-03-16 19:05:15,876] [INFO] [utils.py:827:see_memory_usage] After Building Model + 0: [2023-03-16 19:05:15,877] [INFO] [utils.py:828:see_memory_usage] MA 1.16 GB Max_MA 1.16 GB CA 1.19 GB Max_CA 1 GB + 0: [2023-03-16 19:05:15,877] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 31.65 GB, percent = 6.3% + 0: setting training iterations to 0 + 0: > learning rate decay style: cosine + 0: DeepSpeed is enabled. + 0: [2023-03-16 19:05:15,878] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown +16: ninja: no work to do. +16: Time to load utils op: 0.19463872909545898 seconds +24: Time to load utils op: 0.10918331146240234 seconds +24: Time to load utils op: 0.1091458797454834 seconds +25: Time to load utils op: 0.10978388786315918 seconds +25: Time to load utils op: 0.10982775688171387 seconds +16: Time to load utils op: 0.20254135131835938 seconds +16: Time to load utils op: 0.20444154739379883 seconds +16: Time to load utils op: 0.20452880859375 seconds + 8: Time to load utils op: 0.20759248733520508 seconds + 8: Time to load utils op: 0.1086578369140625 secondsTime to load utils op: 0.20761728286743164 seconds + 8: + 8: Time to load utils op: 0.10498523712158203 seconds + 8: Time to load utils op: 0.2023484706878662 seconds +12: Time to load utils op: 0.10781502723693848 secondsTime to load utils op: 0.10790848731994629 seconds +12: +11: Time to load utils op: 0.10804200172424316 secondsTime to load utils op: 0.10800623893737793 seconds +11: +17: Time to load utils op: 0.20749783515930176 seconds +17: Time to load utils op: 0.21108055114746094 seconds + 9: Time to load utils op: 0.10798406600952148 seconds + 9: Time to load utils op: 0.10730457305908203 seconds +17: Time to load utils op: 0.20610642433166504 seconds +13: Time to load utils op: 0.10815310478210449 seconds +13: Time to load utils op: 0.10837650299072266 seconds + 9: Time to load utils op: 0.20478129386901855 seconds +14: Time to load utils op: 0.10881829261779785 seconds +14: Time to load utils op: 0.10871481895446777 seconds + 9: Time to load utils op: 0.2053685188293457 seconds +15: Time to load utils op: 0.1090083122253418 seconds +15: Time to load utils op: 0.1091151237487793 seconds +10: Time to load utils op: 0.10871243476867676 seconds + 8: Time to load utils op: 0.20186352729797363 seconds +10: Time to load utils op: 0.10912632942199707 seconds +17: Time to load utils op: 0.20227742195129395 seconds +26: Time to load utils op: 0.1105337142944336 secondsTime to load utils op: 0.11053991317749023 seconds +26: + 9: Time to load utils op: 0.2023770809173584 seconds + 8: Time to load utils op: 0.10175752639770508 seconds + 9: Time to load utils op: 0.20217204093933105 seconds +10: Time to load utils op: 0.20416879653930664 seconds +10: Time to load utils op: 0.2042236328125 seconds +15: Time to load utils op: 0.20429301261901855 seconds +15: Time to load utils op: 0.20455193519592285 seconds +11: Time to load utils op: 0.2041761875152588 seconds +11: Time to load utils op: 0.20437192916870117 seconds +13: Time to load utils op: 0.20446348190307617 seconds +27: Time to load utils op: 0.1097104549407959 seconds +27: Time to load utils op: 0.10972237586975098 seconds +14: Time to load utils op: 0.2040717601776123 seconds +13: Time to load utils op: 0.20465683937072754 seconds +14: Time to load utils op: 0.2043139934539795 seconds +12: Time to load utils op: 0.20488286018371582 seconds +12: Time to load utils op: 0.20502042770385742 seconds +13: Time to load utils op: 0.10165762901306152 seconds + 9: Time to load utils op: 0.10171151161193848 seconds +14: Time to load utils op: 0.10188674926757812 seconds +12: Time to load utils op: 0.10182619094848633 seconds +28: Time to load utils op: 0.10962843894958496 seconds +12: Time to load utils op: 0.10190701484680176 seconds +13: Time to load utils op: 0.10197663307189941 seconds +28: Time to load utils op: 0.10962748527526855 seconds +15: Time to load utils op: 0.10182690620422363 seconds + 8: Time to load utils op: 0.10157155990600586 seconds +15: Time to load utils op: 0.1018228530883789 seconds +10: Time to load utils op: 0.10187554359436035 seconds +14: Time to load utils op: 0.10189962387084961 seconds +10: Time to load utils op: 0.1019437313079834 seconds +11: Time to load utils op: 0.10203433036804199 seconds +11: Time to load utils op: 0.1020965576171875 seconds + 9: Time to load utils op: 0.10236477851867676 seconds +11: Time to load utils op: 0.20215129852294922 seconds +31: Time to load utils op: 0.10881495475769043 secondsTime to load utils op: 0.10880398750305176 seconds +31: +10: Time to load utils op: 0.20229005813598633 seconds +29: Time to load utils op: 0.10944032669067383 secondsTime to load utils op: 0.10943436622619629 seconds +29: +11: Time to load utils op: 0.20216846466064453 seconds +30: Time to load utils op: 0.10927271842956543 seconds +30: Time to load utils op: 0.10927963256835938 seconds +10: Time to load utils op: 0.2025127410888672 seconds +15: Time to load utils op: 0.2021632194519043 seconds +15: Time to load utils op: 0.20220208168029785 seconds +14: Time to load utils op: 0.20241189002990723 seconds +14: Time to load utils op: 0.20243263244628906 seconds +18: Time to load utils op: 0.21012115478515625 secondsTime to load utils op: 0.21012639999389648 secondsTime to load utils op: 0.20846772193908691 seconds +18: +18: +18: Time to load utils op: 0.20641088485717773 seconds +13: Time to load utils op: 0.20236873626708984 seconds +12: Time to load utils op: 0.2019650936126709 seconds +13: Time to load utils op: 0.2022995948791504 seconds +12: Time to load utils op: 0.20223474502563477 seconds +23: Time to load utils op: 0.2106325626373291 secondsTime to load utils op: 0.2106151580810547 seconds +23: +23: Time to load utils op: 0.2057645320892334 seconds +22: Time to load utils op: 0.20576786994934082 seconds +22: Time to load utils op: 0.20979070663452148 secondsTime to load utils op: 0.20979738235473633 seconds +22: +23: Time to load utils op: 0.20561766624450684 seconds +22: Time to load utils op: 0.20600199699401855 seconds +21: Time to load utils op: 0.20730829238891602 secondsTime to load utils op: 0.21117138862609863 seconds +21: +21: Time to load utils op: 0.21118998527526855 seconds +21: Time to load utils op: 0.20705509185791016 seconds +20: Time to load utils op: 0.2107393741607666 secondsTime to load utils op: 0.2074601650238037 seconds +20: +20: Time to load utils op: 0.21076273918151855 seconds +20: Time to load utils op: 0.20737743377685547 seconds +19: Time to load utils op: 0.20761871337890625 secondsTime to load utils op: 0.21188974380493164 secondsTime to load utils op: 0.20735812187194824 secondsTime to load utils op: 0.21185922622680664 seconds +19: +19: +19: +16: Time to load utils op: 0.20196175575256348 seconds +20: Time to load utils op: 0.20183801651000977 seconds +22: Time to load utils op: 0.20180654525756836 seconds +21: Time to load utils op: 0.2017812728881836 seconds +17: Time to load utils op: 0.20195555686950684 seconds +18: Time to load utils op: 0.20205974578857422 seconds +22: Time to load utils op: 0.2019646167755127 seconds +16: Time to load utils op: 0.2020092010498047 seconds +18: Time to load utils op: 0.2021806240081787 seconds +17: Time to load utils op: 0.20223212242126465 seconds +23: Time to load utils op: 0.20226550102233887 seconds +21: Time to load utils op: 0.20210671424865723 seconds +20: Time to load utils op: 0.20228123664855957 seconds +23: Time to load utils op: 0.2025763988494873 seconds +19: Time to load utils op: 0.20885276794433594 seconds +19: Time to load utils op: 0.2090141773223877 seconds +24: Time to load utils op: 0.0007302761077880859 seconds +24: Time to load utils op: 0.0008573532104492188 seconds +16: Time to load utils op: 0.0004718303680419922 seconds +16: Time to load utils op: 0.00048828125 seconds +16: Time to load utils op: 0.0005042552947998047 seconds +16: Time to load utils op: 0.0004668235778808594 seconds +16: Time to load utils op: 0.0003464221954345703 seconds +16: Time to load utils op: 0.0003333091735839844 seconds +17: Time to load utils op: 0.0005097389221191406 seconds +17: Time to load utils op: 0.0005125999450683594 seconds +17: Time to load utils op: 0.0005195140838623047 seconds +17: Time to load utils op: 0.000560760498046875 seconds +17: Time to load utils op: 0.0005536079406738281 seconds +17: Time to load utils op: 0.0004668235778808594 seconds +25: Time to load utils op: 0.0011758804321289062 seconds +25: Time to load utils op: 0.0013632774353027344 seconds +16: Time to load utils op: 0.20214176177978516 seconds +13: Time to load utils op: 0.0004661083221435547 seconds +13: Time to load utils op: 0.00048065185546875 seconds +13: Time to load utils op: 0.0004923343658447266 seconds +13: Time to load utils op: 0.0005354881286621094 seconds +13: Time to load utils op: 0.0005538463592529297 seconds +13: Time to load utils op: 0.0005643367767333984 seconds +13: Time to load utils op: 0.0005824565887451172 seconds +13: Time to load utils op: 0.0006012916564941406 seconds +12: Time to load utils op: 0.0005040168762207031 seconds +12: Time to load utils op: 0.0005218982696533203 seconds +12: Time to load utils op: 0.0005595684051513672 seconds +12: Time to load utils op: 0.000579833984375 seconds +12: Time to load utils op: 0.0006878376007080078 seconds +12: Time to load utils op: 0.0006554126739501953 secondsTime to load utils op: 0.0006766319274902344 seconds +12: +12: Time to load utils op: 0.0006167888641357422 seconds +15: Time to load utils op: 0.0006535053253173828 seconds +14: Time to load utils op: 0.0010063648223876953 secondsTime to load utils op: 0.0010018348693847656 secondsTime to load utils op: 0.0010249614715576172 seconds +14: +14: +14: Time to load utils op: 0.0010302066802978516 seconds +15: Time to load utils op: 0.0007302761077880859 seconds +14: Time to load utils op: 0.0010323524475097656 secondsTime to load utils op: 0.0010237693786621094 seconds +14: Time to load utils op: 0.001054525375366211 secondsTime to load utils op: 0.0010478496551513672 seconds +15: Time to load utils op: 0.0006906986236572266 seconds +14: +14: +15: Time to load utils op: 0.0006954669952392578 seconds +15: Time to load utils op: 0.0007078647613525391 seconds +15: Time to load utils op: 0.0007181167602539062 seconds +15: Time to load utils op: 0.0007343292236328125 secondsTime to load utils op: 0.0007181167602539062 seconds +15: +26: Time to load utils op: 0.0006623268127441406 seconds +10: Time to load utils op: 0.0004858970642089844 seconds +26: Time to load utils op: 0.0005755424499511719 seconds +10: Time to load utils op: 0.0005042552947998047 seconds +10: Time to load utils op: 0.0005292892456054688 seconds +10: Time to load utils op: 0.0005457401275634766 seconds +10: Time to load utils op: 0.0005800724029541016 seconds +10: Time to load utils op: 0.0006070137023925781 seconds +10: Time to load utils op: 0.000614166259765625 seconds +10: Time to load utils op: 0.000614166259765625 seconds +28: Time to load utils op: 0.0007555484771728516 seconds +28: Time to load utils op: 0.0008893013000488281 seconds + 8: Time to load utils op: 0.0005025863647460938 seconds +30: Time to load utils op: 0.0009157657623291016 seconds + 8: Time to load utils op: 0.0007421970367431641 seconds + 8: Time to load utils op: 0.0007815361022949219 seconds +30: Time to load utils op: 0.0011174678802490234 seconds + 8: Time to load utils op: 0.0007178783416748047 seconds + 8: Time to load utils op: 0.0007224082946777344 seconds + 8: Time to load utils op: 0.0007309913635253906 secondsTime to load utils op: 0.0007345676422119141 seconds + 8: + 8: Time to load utils op: 0.000732421875 seconds +27: Time to load utils op: 0.0006518363952636719 seconds +18: Time to load utils op: 0.00046944618225097656 seconds +18: Time to load utils op: 0.0004546642303466797 seconds +18: Time to load utils op: 0.0005068778991699219 secondsTime to load utils op: 0.0005209445953369141 secondsTime to load utils op: 0.0005481243133544922 secondsTime to load utils op: 0.0005092620849609375 seconds +18: +18: +18: +27: Time to load utils op: 0.0007245540618896484 seconds +29: Time to load utils op: 0.0005967617034912109 seconds +31: Time to load utils op: 0.0006871223449707031 seconds +29: Time to load utils op: 0.0006968975067138672 seconds +31: Time to load utils op: 0.0005669593811035156 seconds +23: Time to load utils op: 0.00046133995056152344 seconds +23: Time to load utils op: 0.00044417381286621094 secondsTime to load utils op: 0.0004885196685791016 secondsTime to load utils op: 0.0004296302795410156 secondsTime to load utils op: 0.0004215240478515625 secondsTime to load utils op: 0.0004336833953857422 seconds +23: +23: +23: +23: +11: Time to load utils op: 0.0005028247833251953 seconds +11: Time to load utils op: 0.0005161762237548828 seconds +11: Time to load utils op: 0.0005245208740234375 seconds +11: Time to load utils op: 0.0005695819854736328 seconds +11: Time to load utils op: 0.0005691051483154297 seconds +11: Time to load utils op: 0.0006124973297119141 seconds +11: Time to load utils op: 0.0006287097930908203 seconds +11: Time to load utils op: 0.0006632804870605469 seconds +16: Time to load utils op: 0.0005500316619873047 seconds +21: Time to load utils op: 0.0005731582641601562 seconds +21: Time to load utils op: 0.0005807876586914062 seconds +21: Time to load utils op: 0.0006229877471923828 seconds +21: Time to load utils op: 0.000598907470703125 seconds +21: Time to load utils op: 0.0005929470062255859 secondsTime to load utils op: 0.0005624294281005859 seconds +21: +22: Time to load utils op: 0.000518798828125 seconds +22: Time to load utils op: 0.0005400180816650391 seconds +22: Time to load utils op: 0.0005605220794677734 secondsTime to load utils op: 0.0005581378936767578 seconds +22: +22: Time to load utils op: 0.0005314350128173828 seconds +22: Time to load utils op: 0.0005664825439453125 seconds +20: Time to load utils op: 0.0005009174346923828 seconds +20: Time to load utils op: 0.0005095005035400391 seconds +20: Time to load utils op: 0.0005152225494384766 seconds +20: Time to load utils op: 0.0005512237548828125 seconds +20: Time to load utils op: 0.0005652904510498047 seconds +20: Time to load utils op: 0.0006074905395507812 seconds +19: Time to load utils op: 0.00047326087951660156 seconds +19: Time to load utils op: 0.00047659873962402344 seconds +19: Time to load utils op: 0.0005176067352294922 secondsTime to load utils op: 0.0004951953887939453 seconds +19: +19: Time to load utils op: 0.0005636215209960938 seconds +19: Time to load utils op: 0.0005412101745605469 seconds + 9: Time to load utils op: 0.0005753040313720703 seconds + 9: Time to load utils op: 0.0006015300750732422 secondsTime to load utils op: 0.0005817413330078125 seconds + 9: + 9: Time to load utils op: 0.0006058216094970703 seconds + 9: Time to load utils op: 0.0006239414215087891 seconds + 9: Time to load utils op: 0.0005738735198974609 seconds + 9: Time to load utils op: 0.0006496906280517578 seconds + 9: Time to load utils op: 0.0006220340728759766 seconds + 0: [2023-03-16 19:05:16,646] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False + 0: [2023-03-16 19:05:16,646] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer + 0: [2023-03-16 19:05:16,646] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer + 0: [2023-03-16 19:05:16,649] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam + 0: [2023-03-16 19:05:16,649] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer +24: ninja: no work to do. +24: Time to load utils op: 0.18449687957763672 seconds + 0: [2023-03-16 19:05:16,777] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer + 0: [2023-03-16 19:05:16,778] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.18 GB CA 1.21 GB Max_CA 1 GB + 0: [2023-03-16 19:05:16,778] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.21 GB, percent = 6.4% +24: Time to load utils op: 0.0006079673767089844 seconds +24: ninja: no work to do. +24: Time to load utils op: 0.1367654800415039 seconds +24: Time to load utils op: 0.0006194114685058594 seconds + 0: Time to load utils op: 0.31090402603149414 seconds + 0: Time to load utils op: 0.30623531341552734 seconds + 1: Time to load utils op: 0.3104572296142578 secondsTime to load utils op: 0.3106234073638916 seconds + 1: + 0: Time to load utils op: 0.3021731376647949 seconds + 4: Time to load utils op: 0.3102378845214844 seconds + 4: Time to load utils op: 0.31026721000671387 seconds + 7: Time to load utils op: 0.3109738826751709 seconds + 7: Time to load utils op: 0.3109729290008545 seconds + 2: Time to load utils op: 0.3110201358795166 secondsTime to load utils op: 0.3110530376434326 seconds + 2: + 3: Time to load utils op: 0.311981201171875 seconds + 3: Time to load utils op: 0.31200623512268066 seconds + 0: Time to load utils op: 0.30217528343200684 seconds + 5: Time to load utils op: 0.3118593692779541 secondsTime to load utils op: 0.3118891716003418 seconds + 5: + 6: Time to load utils op: 0.31178998947143555 seconds + 6: Time to load utils op: 0.3118102550506592 seconds + 0: Time to load utils op: 0.30187439918518066 seconds + 1: Time to load utils op: 0.303342342376709 secondsTime to load utils op: 0.30319643020629883 seconds + 1: + 1: Time to load utils op: 0.3020198345184326 seconds + 3: Time to load utils op: 0.30240392684936523 seconds + 2: Time to load utils op: 0.30275464057922363 secondsTime to load utils op: 0.302793025970459 seconds + 2: + 3: Time to load utils op: 0.30266261100769043 seconds + 4: Time to load utils op: 0.30242347717285156 seconds + 7: Time to load utils op: 0.30232810974121094 secondsTime to load utils op: 0.3022606372833252 seconds + 7: + 1: Time to load utils op: 0.30287742614746094 seconds + 2: Time to load utils op: 0.30205416679382324 seconds + 5: Time to load utils op: 0.30269622802734375 seconds + 2: Time to load utils op: 0.3019726276397705 seconds + 5: Time to load utils op: 0.30290985107421875 seconds + 6: Time to load utils op: 0.30252647399902344 seconds + 6: Time to load utils op: 0.302553653717041 seconds + 3: Time to load utils op: 0.30220746994018555 seconds + 4: Time to load utils op: 0.301922082901001 seconds + 4: Time to load utils op: 0.30269622802734375 seconds + 4: Time to load utils op: 0.30197858810424805 seconds + 3: Time to load utils op: 0.30228734016418457 seconds + 5: Time to load utils op: 0.3020336627960205 seconds + 6: Time to load utils op: 0.302004337310791 seconds + 7: Time to load utils op: 0.30205774307250977 seconds + 7: Time to load utils op: 0.3020944595336914 seconds + 5: Time to load utils op: 0.3022627830505371 seconds + 6: Time to load utils op: 0.30241847038269043 seconds + 0: Time to load utils op: 0.2022867202758789 seconds +24: Time to load utils op: 0.2026195526123047 seconds +30: Time to load utils op: 0.20310235023498535 seconds +30: Time to load utils op: 0.2032608985900879 seconds +27: Time to load utils op: 0.20348405838012695 secondsTime to load utils op: 0.2034916877746582 seconds +27: + 0: Time to load utils op: 0.0004725456237792969 seconds + 0: Time to load utils op: 0.0005121231079101562 seconds +25: Time to load utils op: 0.20360803604125977 seconds + 0: Time to load utils op: 0.0004184246063232422 seconds + 0: Time to load utils op: 0.0005199909210205078 seconds + 0: Time to load utils op: 0.00042510032653808594 seconds +29: Time to load utils op: 0.20362281799316406 seconds +28: Time to load utils op: 0.20392107963562012 seconds +26: Time to load utils op: 0.20383310317993164 seconds +26: Time to load utils op: 0.2038283348083496 seconds +31: Time to load utils op: 0.20388293266296387 seconds +28: Time to load utils op: 0.2039799690246582 seconds +25: Time to load utils op: 0.2040112018585205 seconds +31: Time to load utils op: 0.20364713668823242 seconds +29: Time to load utils op: 0.20398235321044922 seconds +24: Time to load utils op: 0.40290379524230957 seconds + 0: Time to load utils op: 0.30223917961120605 seconds + 1: Time to load utils op: 0.000530242919921875 seconds + 1: Time to load utils op: 0.0005347728729248047 seconds + 1: Time to load utils op: 0.0005095005035400391 seconds + 1: Time to load utils op: 0.0005316734313964844 seconds + 1: Time to load utils op: 0.0005526542663574219 seconds + 1: Time to load utils op: 0.0005748271942138672 seconds +25: Time to load utils op: 0.4028761386871338 seconds +25: Time to load utils op: 0.4034111499786377 seconds +21: Time to load utils op: 0.603428840637207 seconds +21: Time to load utils op: 0.6033275127410889 seconds +19: Time to load utils op: 0.6033811569213867 seconds +17: Time to load utils op: 0.603482723236084 seconds +18: Time to load utils op: 0.6034314632415771 seconds +20: Time to load utils op: 0.6036121845245361 seconds +22: Time to load utils op: 0.6035585403442383 seconds +17: Time to load utils op: 0.6034836769104004 seconds +19: Time to load utils op: 0.6034834384918213 seconds +22: Time to load utils op: 0.6034426689147949 seconds +16: Time to load utils op: 0.6034464836120605 seconds +20: Time to load utils op: 0.6036300659179688 seconds +18: Time to load utils op: 0.603689432144165 seconds +23: Time to load utils op: 0.6038739681243896 secondsTime to load utils op: 0.6038737297058105 seconds +23: + 7: Time to load utils op: 0.0005309581756591797 seconds + 7: Time to load utils op: 0.0005431175231933594 seconds + 7: Time to load utils op: 0.0005116462707519531 seconds + 7: Time to load utils op: 0.0005440711975097656 secondsTime to load utils op: 0.0005490779876708984 seconds + 7: + 7: Time to load utils op: 0.0005385875701904297 seconds +26: Time to load utils op: 0.40334272384643555 seconds +26: Time to load utils op: 0.4033632278442383 seconds + 3: Time to load utils op: 0.0005066394805908203 seconds + 3: Time to load utils op: 0.0005145072937011719 seconds + 3: Time to load utils op: 0.0005171298980712891 seconds + 3: Time to load utils op: 0.0005276203155517578 seconds + 3: Time to load utils op: 0.0005633831024169922 secondsTime to load utils op: 0.0005648136138916016 seconds + 3: +24: Time to load utils op: 0.00030994415283203125 seconds +30: Time to load utils op: 0.0004868507385253906 seconds +30: Time to load utils op: 0.0003285408020019531 seconds +27: Time to load utils op: 0.0004291534423828125 seconds +27: Time to load utils op: 0.0003380775451660156 seconds + 4: Time to load utils op: 0.00047087669372558594 seconds + 4: Time to load utils op: 0.0005130767822265625 seconds + 4: Time to load utils op: 0.0005161762237548828 seconds + 4: Time to load utils op: 0.0005218982696533203 seconds + 4: Time to load utils op: 0.00048470497131347656 seconds + 4: Time to load utils op: 0.0005333423614501953 seconds +29: Time to load utils op: 0.0004482269287109375 seconds +28: Time to load utils op: 0.0004420280456542969 seconds +25: Time to load utils op: 0.00045990943908691406 seconds +25: Time to load utils op: 0.0003190040588378906 seconds +29: Time to load utils op: 0.0003228187561035156 seconds +28: Time to load utils op: 0.00033736228942871094 seconds +26: Time to load utils op: 0.0004913806915283203 seconds +26: Time to load utils op: 0.0003380775451660156 seconds +27: Time to load utils op: 0.4041018486022949 secondsTime to load utils op: 0.40434741973876953 seconds +27: + 2: Time to load utils op: 0.00048041343688964844 seconds +31: Time to load utils op: 0.0004639625549316406 seconds + 2: Time to load utils op: 0.0004878044128417969 seconds +31: Time to load utils op: 0.0004711151123046875 seconds + 2: Time to load utils op: 0.0004124641418457031 seconds + 2: Time to load utils op: 0.0005393028259277344 secondsTime to load utils op: 0.0005180835723876953 seconds + 2: + 2: Time to load utils op: 0.0005300045013427734 seconds + 5: Time to load utils op: 0.0005285739898681641 seconds + 5: Time to load utils op: 0.0005376338958740234 seconds + 5: Time to load utils op: 0.0005369186401367188 seconds + 5: Time to load utils op: 0.0005755424499511719 seconds + 5: Time to load utils op: 0.0005896091461181641 seconds + 5: Time to load utils op: 0.0005674362182617188 seconds + 3: Time to load utils op: 0.30222415924072266 seconds + 6: Time to load utils op: 0.0004849433898925781 seconds + 0: Time to load utils op: 0.30199408531188965 seconds + 6: Time to load utils op: 0.0005099773406982422 seconds + 3: Time to load utils op: 0.3023216724395752 seconds + 6: Time to load utils op: 0.0005078315734863281 seconds + 6: Time to load utils op: 0.0005686283111572266 secondsTime to load utils op: 0.0005743503570556641 seconds + 6: Time to load utils op: 0.0005850791931152344 seconds + 6: + 5: Time to load utils op: 0.3023257255554199 seconds + 2: Time to load utils op: 0.3022451400756836 seconds + 4: Time to load utils op: 0.3023371696472168 seconds + 5: Time to load utils op: 0.30227065086364746 seconds + 4: Time to load utils op: 0.30236005783081055 seconds + 2: Time to load utils op: 0.30245423316955566 seconds + 7: Time to load utils op: 0.30228710174560547 seconds + 1: Time to load utils op: 0.3027305603027344 seconds + 7: Time to load utils op: 0.30278897285461426 seconds + 1: Time to load utils op: 0.30266690254211426 seconds + 6: Time to load utils op: 0.30301785469055176 seconds + 6: Time to load utils op: 0.30269622802734375 seconds +28: Time to load utils op: 0.40323972702026367 seconds +28: Time to load utils op: 0.4034004211425781 seconds +29: Time to load utils op: 0.403353214263916 seconds +24: Time to load utils op: 0.4027373790740967 seconds +29: Time to load utils op: 0.4033951759338379 seconds +30: Time to load utils op: 0.4026913642883301 seconds +30: Time to load utils op: 0.4033691883087158 seconds +24: Time to load utils op: 0.0003693103790283203 seconds + 0: Time to load utils op: 0.0003464221954345703 seconds +31: Time to load utils op: 0.40389585494995117 secondsTime to load utils op: 0.40381908416748047 seconds +31: +25: Time to load utils op: 0.00035881996154785156 seconds +26: Time to load utils op: 0.4023854732513428 seconds +25: Time to load utils op: 0.000301361083984375 seconds +30: Time to load utils op: 0.4021141529083252 seconds +28: Time to load utils op: 0.40231776237487793 seconds +26: Time to load utils op: 0.4023888111114502 seconds +25: Time to load utils op: 0.40215373039245605 seconds +30: Time to load utils op: 0.40213465690612793 seconds +25: Time to load utils op: 0.40249133110046387 seconds +27: Time to load utils op: 0.4022862911224365 seconds +24: Time to load utils op: 0.40265965461730957 seconds +28: Time to load utils op: 0.4025278091430664 seconds +22: Time to load utils op: 0.0004527568817138672 seconds +21: Time to load utils op: 0.000446319580078125 seconds +27: Time to load utils op: 0.4023921489715576 seconds +21: Time to load utils op: 0.00045490264892578125 seconds +22: Time to load utils op: 0.00031828880310058594 seconds +31: Time to load utils op: 0.4027431011199951 seconds +29: Time to load utils op: 0.4026038646697998 seconds +31: Time to load utils op: 0.4027442932128906 seconds +18: Time to load utils op: 0.00044155120849609375 seconds +18: Time to load utils op: 0.00044155120849609375 seconds +16: Time to load utils op: 0.0004391670227050781 seconds +17: Time to load utils op: 0.0004324913024902344 seconds +29: Time to load utils op: 0.4029242992401123 seconds +19: Time to load utils op: 0.0004398822784423828 seconds +17: Time to load utils op: 0.00031256675720214844 seconds +19: Time to load utils op: 0.00031495094299316406 seconds +20: Time to load utils op: 0.00048613548278808594 seconds +20: Time to load utils op: 0.0005068778991699219 seconds +23: Time to load utils op: 0.00045108795166015625 seconds +23: Time to load utils op: 0.00033783912658691406 seconds +26: Time to load utils op: 0.00035881996154785156 seconds +26: Time to load utils op: 0.00035309791564941406 seconds +27: Time to load utils op: 0.0003948211669921875 seconds +27: Time to load utils op: 0.0003666877746582031 seconds + 0: Time to load utils op: 0.0003495216369628906 seconds + 3: Time to load utils op: 0.00036644935607910156 seconds + 3: Time to load utils op: 0.0003554821014404297 seconds + 2: Time to load utils op: 0.00033664703369140625 seconds + 5: Time to load utils op: 0.00032639503479003906 seconds + 4: Time to load utils op: 0.0003380775451660156 seconds + 2: Time to load utils op: 0.00036597251892089844 seconds + 4: Time to load utils op: 0.0003447532653808594 seconds + 5: Time to load utils op: 0.0003654956817626953 seconds + 1: Time to load utils op: 0.0003495216369628906 seconds + 7: Time to load utils op: 0.0003154277801513672 seconds + 1: Time to load utils op: 0.0003590583801269531 seconds +28: Time to load utils op: 0.00032806396484375 seconds + 7: Time to load utils op: 0.00035953521728515625 seconds +28: Time to load utils op: 0.0003428459167480469 seconds + 6: Time to load utils op: 0.0003695487976074219 seconds + 6: Time to load utils op: 0.0003535747528076172 seconds +24: Time to load utils op: 0.00033211708068847656 seconds +29: Time to load utils op: 0.0003299713134765625 seconds +29: Time to load utils op: 0.000362396240234375 seconds +30: Time to load utils op: 0.0003445148468017578 seconds +30: Time to load utils op: 0.00034046173095703125 seconds +25: Time to load utils op: 0.0003516674041748047 seconds +28: Time to load utils op: 0.0003311634063720703 seconds +25: Time to load utils op: 0.0003275871276855469 seconds +31: Time to load utils op: 0.0003669261932373047 seconds +30: Time to load utils op: 0.00031304359436035156 seconds +30: Time to load utils op: 0.00034427642822265625 seconds +31: Time to load utils op: 0.0003490447998046875 seconds +26: Time to load utils op: 0.00031638145446777344 seconds +28: Time to load utils op: 0.00037097930908203125 seconds +26: Time to load utils op: 0.00035381317138671875 seconds +31: Time to load utils op: 0.00036025047302246094 seconds +27: Time to load utils op: 0.00033926963806152344 seconds +29: Time to load utils op: 0.00034737586975097656 seconds +27: Time to load utils op: 0.00035643577575683594 seconds +29: Time to load utils op: 0.00033283233642578125 seconds +24: Time to load utils op: 0.0003616809844970703 seconds +31: Time to load utils op: 0.0003578662872314453 seconds + 0: [2023-03-16 19:05:17,107] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 + 0: [2023-03-16 19:05:17,108] [INFO] [utils.py:828:see_memory_usage] MA 1.15 GB Max_MA 1.15 GB CA 1.21 GB Max_CA 1 GB + 0: [2023-03-16 19:05:17,108] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.32 GB, percent = 6.4% + 0: [2023-03-16 19:05:17,225] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 + 0: [2023-03-16 19:05:17,225] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.06 GB Max_CA 3 GB + 0: [2023-03-16 19:05:17,226] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.34 GB, percent = 6.4% + 0: [2023-03-16 19:05:17,339] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 + 0: [2023-03-16 19:05:17,340] [INFO] [utils.py:828:see_memory_usage] MA 2.43 GB Max_MA 2.43 GB CA 3.06 GB Max_CA 3 GB + 0: [2023-03-16 19:05:17,340] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.44 GB, percent = 6.4% + 0: [2023-03-16 19:05:17,455] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 + 0: [2023-03-16 19:05:17,456] [INFO] [utils.py:828:see_memory_usage] MA 3.58 GB Max_MA 3.58 GB CA 4.75 GB Max_CA 5 GB + 0: [2023-03-16 19:05:17,456] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.64 GB, percent = 6.5% + 0: [2023-03-16 19:05:17,564] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 + 0: [2023-03-16 19:05:17,564] [INFO] [utils.py:828:see_memory_usage] MA 3.58 GB Max_MA 3.58 GB CA 4.75 GB Max_CA 5 GB + 0: [2023-03-16 19:05:17,564] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.65 GB, percent = 6.5% + 0: [2023-03-16 19:05:17,674] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 + 0: [2023-03-16 19:05:17,675] [INFO] [utils.py:828:see_memory_usage] MA 3.58 GB Max_MA 3.58 GB CA 4.75 GB Max_CA 5 GB + 0: [2023-03-16 19:05:17,675] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.7 GB, percent = 6.5% + 0: [2023-03-16 19:05:17,781] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer + 0: [2023-03-16 19:05:17,782] [INFO] [utils.py:828:see_memory_usage] MA 3.58 GB Max_MA 3.58 GB CA 4.75 GB Max_CA 5 GB + 0: [2023-03-16 19:05:17,782] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.7 GB, percent = 6.5% + 0: [2023-03-16 19:05:17,894] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer + 0: [2023-03-16 19:05:17,894] [INFO] [utils.py:828:see_memory_usage] MA 3.87 GB Max_MA 3.87 GB CA 5.04 GB Max_CA 5 GB + 0: [2023-03-16 19:05:17,895] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.7 GB, percent = 6.5% + 0: [2023-03-16 19:05:18,002] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer + 0: [2023-03-16 19:05:18,003] [INFO] [utils.py:828:see_memory_usage] MA 3.87 GB Max_MA 3.87 GB CA 5.04 GB Max_CA 5 GB + 0: [2023-03-16 19:05:18,003] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 32.7 GB, percent = 6.5% + 0: [2023-03-16 19:05:18,003] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam + 0: [2023-03-16 19:05:18,003] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler + 0: [2023-03-16 19:05:18,003] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = + 0: [2023-03-16 19:05:18,003] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0002, 0.0002, 0.0002], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] activation_checkpointing_config { + 0: "partition_activations": false, + 0: "contiguous_memory_optimization": false, + 0: "cpu_checkpointing": false, + 0: "number_checkpoints": null, + 0: "synchronize_checkpoint_boundary": false, + 0: "profile": false + 0: } + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] amp_enabled .................. False + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] amp_params ................... False + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] autotuning_config ............ { + 0: "enabled": false, + 0: "start_step": null, + 0: "end_step": null, + 0: "metric_path": null, + 0: "arg_mappings": null, + 0: "metric": "throughput", + 0: "model_info": null, + 0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", + 0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", + 0: "overwrite": true, + 0: "fast": true, + 0: "start_profile_step": 3, + 0: "end_profile_step": 5, + 0: "tuner_type": "gridsearch", + 0: "tuner_early_stopping": 5, + 0: "tuner_num_trials": 50, + 0: "model_info_path": null, + 0: "mp_size": 1, + 0: "max_train_batch_size": null, + 0: "min_train_batch_size": 1, + 0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, + 0: "min_train_micro_batch_size_per_gpu": 1, + 0: "num_tuning_micro_batch_sizes": 3 + 0: } + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] bfloat16_enabled ............. True + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] comms_config ................. + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] communication_data_type ...... None + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa + 0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} + 0: [2023-03-16 19:05:18,004] [INFO] [config.py:1011:print] curriculum_enabled ........... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] curriculum_params ............ False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] dataloader_drop_last ......... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] disable_allgather ............ False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] dump_state ................... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] elasticity_enabled ........... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] flops_profiler_config ........ { + 0: "enabled": false, + 0: "profile_step": 1, + 0: "module_depth": -1, + 0: "top_modules": 1, + 0: "detailed": true, + 0: "output_file": null + 0: } + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] fp16_auto_cast ............... None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] fp16_enabled ................. False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] global_rank .................. 0 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 32 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] load_universal_checkpoint .... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] loss_scale ................... 1.0 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] memory_breakdown ............. False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] monitor_config ............... + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] nebula_config ................ { + 0: "enabled": false, + 0: "persistent_storage_path": null, + 0: "persistent_time_interval": 100, + 0: "num_of_version_in_retention": 2, + 0: "enable_nebula_load": true, + 0: "load_path": null + 0: } + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] optimizer_name ............... None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] optimizer_params ............. None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] pld_enabled .................. False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] pld_params ................... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] prescale_gradients ........... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] scheduler_name ............... None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] scheduler_params ............. None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] sparse_attention ............. None + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] steps_per_print .............. 2000 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] train_batch_size ............. 1024 + 0: [2023-03-16 19:05:18,005] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 2 + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] use_node_local_storage ....... False + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] world_size ................... 16 + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] zero_enabled ................. False + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 + 0: [2023-03-16 19:05:18,006] [INFO] [config.py:996:print_user_config] json = { + 0: "train_micro_batch_size_per_gpu": 2, + 0: "train_batch_size": 1.024000e+03, + 0: "gradient_clipping": 1.0, + 0: "zero_optimization": { + 0: "stage": 0 + 0: }, + 0: "bf16": { + 0: "enabled": true + 0: }, + 0: "steps_per_print": 2.000000e+03, + 0: "wall_clock_breakdown": false + 0: } + 0: Time to load utils op: 0.0004265308380126953 seconds + 0: [2023-03-16 19:05:18,006] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=32 micro_batch_size=2 + 0: [2023-03-16 19:05:18,252] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=14 [0, 14) STAGE_PARAMS=614290432 (614.290M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 0: [2023-03-16 19:05:18,252] [INFO] [engine.py:145:__init__] RANK=2 STAGE=0 LAYERS=14 [0, 14) STAGE_PARAMS=614290432 (614.290M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 0: [2023-03-16 19:05:18,252] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=14 [0, 14) STAGE_PARAMS=614290432 (614.290M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 0: [2023-03-16 19:05:18,252] [INFO] [engine.py:145:__init__] RANK=3 STAGE=0 LAYERS=14 [0, 14) STAGE_PARAMS=614290432 (614.290M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 8: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=64 STAGE=1 LAYERS=11 [14, 25) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 8: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=65 STAGE=1 LAYERS=11 [14, 25) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +16: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=128 STAGE=2 LAYERS=11 [25, 36) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 8: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=67 STAGE=1 LAYERS=11 [14, 25) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) + 8: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=66 STAGE=1 LAYERS=11 [14, 25) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +16: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=129 STAGE=2 LAYERS=11 [25, 36) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +24: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=192 STAGE=3 LAYERS=13 [36, 49) STAGE_PARAMS=513571840 (513.572M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +16: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=130 STAGE=2 LAYERS=11 [25, 36) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +24: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=194 STAGE=3 LAYERS=13 [36, 49) STAGE_PARAMS=513571840 (513.572M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +24: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=193 STAGE=3 LAYERS=13 [36, 49) STAGE_PARAMS=513571840 (513.572M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +24: [2023-03-16 19:05:18,253] [INFO] [engine.py:145:__init__] RANK=195 STAGE=3 LAYERS=13 [36, 49) STAGE_PARAMS=513571840 (513.572M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +16: [2023-03-16 19:05:18,254] [INFO] [engine.py:145:__init__] RANK=131 STAGE=2 LAYERS=11 [25, 36) STAGE_PARAMS=553997312 (553.997M) TOTAL_PARAMS=8943427584 (8943.428M) UNIQUE_PARAMS=8702255104 (8702.255M) +12: [2023-03-16 19:05:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 8: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +12: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +28: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +24: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +23: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +18: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +16: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +27: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +15: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +21: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +17: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +31: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 5: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +14: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +29: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +10: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +25: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +26: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +19: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +13: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +22: [2023-03-16 19:05:19,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 0: [2023-03-16 19:05:19,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 0: [2023-03-16 19:05:19,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 0: [2023-03-16 19:05:19,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +23: [2023-03-16 19:05:19,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +23: [2023-03-16 19:05:19,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +23: [2023-03-16 19:05:19,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +23: [2023-03-16 19:05:19,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 0: [2023-03-16 19:05:19,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 0: [2023-03-16 19:05:19,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 0: [2023-03-16 19:05:19,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 0: [2023-03-16 19:05:19,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 0: [2023-03-16 19:05:19,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 0: [2023-03-16 19:05:19,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +23: [2023-03-16 19:05:19,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +23: [2023-03-16 19:05:19,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +23: [2023-03-16 19:05:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +23: [2023-03-16 19:05:19,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +23: [2023-03-16 19:05:19,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +16: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... + 8: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +24: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +25: [2023-03-16 19:05:19,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... + 8: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 6: [2023-03-16 19:05:19,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +16: [2023-03-16 19:05:19,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +23: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +16: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +24: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +25: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +16: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +16: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +24: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +25: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +23: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +16: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 7: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +23: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +16: [2023-03-16 19:05:19,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +25: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +24: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. + 7: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +23: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +16: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +12: [2023-03-16 19:05:19,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... + 7: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +18: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +28: [2023-03-16 19:05:19,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... + 7: [2023-03-16 19:05:19,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 7: [2023-03-16 19:05:19,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 7: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 2: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 8: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 8: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... +12: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +12: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 4: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +14: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 4: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +18: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. + 8: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +20: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 7: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. +12: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 6: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 2: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... + 7: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. +12: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +24: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. + 6: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 4: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +25: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +27: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +20: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 4: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +28: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +15: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +28: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 7: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +28: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +25: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +13: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 6: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... + 7: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +24: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. + 2: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +18: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +24: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +28: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +28: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +25: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +24: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. +18: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +25: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. + 4: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 6: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. +25: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +24: [2023-03-16 19:05:19,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +24: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 2: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 6: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 2: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +25: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. +25: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 2: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 2: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 4: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +14: [2023-03-16 19:05:19,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +28: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. + 7: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +28: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +14: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 4: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +27: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. + 4: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +27: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +20: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. + 7: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +28: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +27: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. + 7: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +14: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +28: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +27: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +15: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +20: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. + 2: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +14: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +27: [2023-03-16 19:05:19,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +13: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +15: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +15: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +27: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +17: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +20: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +17: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +27: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +20: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +13: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +20: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +20: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 3: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... + 3: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 9: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 3: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +21: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... + 1: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt... +11: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +19: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +30: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +30: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +30: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +15: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +13: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +14: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +10: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt... +12: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt... +15: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. + 8: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. + 3: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +11: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +19: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt... +12: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. + 8: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 3: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 5: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +14: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +13: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +26: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +29: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +15: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. + 8: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +12: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... +31: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt... + 1: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_00_model_states.pt. +12: [2023-03-16 19:05:19,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 1: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +17: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. + 1: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 3: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. +17: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. + 3: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. +17: [2023-03-16 19:05:19,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. + 9: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 9: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 3: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +17: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +17: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +17: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +21: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. + 3: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 9: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +17: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +30: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +21: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +10: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 1: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. +11: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +30: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +26: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +21: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +10: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. + 9: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +17: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +11: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_04_model_states.pt. +19: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +21: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. + 1: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_01_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 9: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +30: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +21: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +10: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +11: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. + 1: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. + 5: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +29: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +30: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +26: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. + 1: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_02_model_states.pt. +19: [2023-03-16 19:05:19,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. + 5: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +10: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +11: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_07_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +10: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +11: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +10: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_13_model_states.pt. +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_03_model_states.pt. +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +11: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. + 1: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +10: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +11: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +21: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +26: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +21: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +19: [2023-03-16 19:05:19,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +29: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_12_model_states.pt. +29: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_14_model_states.pt. +29: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +19: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +31: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +29: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +26: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +28: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +24: [2023-03-16 19:05:19,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +31: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +29: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +26: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +24: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +27: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +25: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +28: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +27: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +30: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_15_model_states.pt. +29: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +26: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +27: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +27: [2023-03-16 19:05:19,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. + 8: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. + 8: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +12: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +13: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +10: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. + 9: [2023-03-16 19:05:19,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +15: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +12: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +14: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +10: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +15: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. + 9: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. + 8: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +14: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +11: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +13: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_05_model_states.pt. +12: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +12: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +22: [2023-03-16 19:05:19,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt... +22: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +22: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_10_model_states.pt. +22: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +22: [2023-03-16 19:05:19,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_09_model_states.pt. +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_08_model_states.pt. +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_11_model_states.pt. +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 8: [2023-03-16 19:05:19,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +12: [2023-03-16 19:05:19,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +12: [2023-03-16 19:05:19,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 8: [2023-03-16 19:05:19,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +12: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +12: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +15: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +13: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +15: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 9: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +14: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 9: [2023-03-16 19:05:19,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +14: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +15: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. +11: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 9: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/mp_rank_06_model_states.pt. + 9: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +10: [2023-03-16 19:05:19,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +11: [2023-03-16 19:05:19,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 6: [2023-03-16 19:05:19,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 0: [2023-03-16 19:05:19,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 0: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 0: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 0: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 2: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 7: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 6: [2023-03-16 19:05:19,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 6: [2023-03-16 19:05:19,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 6: [2023-03-16 19:05:19,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 6: [2023-03-16 19:05:19,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 7: [2023-03-16 19:05:19,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 6: [2023-03-16 19:05:19,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 7: [2023-03-16 19:05:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 7: [2023-03-16 19:05:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 7: [2023-03-16 19:05:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 7: [2023-03-16 19:05:19,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 7: [2023-03-16 19:05:19,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 4: [2023-03-16 19:05:19,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 4: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 5: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 4: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 4: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 5: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 1: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... +23: [2023-03-16 19:05:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... + 3: [2023-03-16 19:05:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 3: [2023-03-16 19:05:19,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 3: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 3: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... +23: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +23: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +23: [2023-03-16 19:05:19,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... + 1: [2023-03-16 19:05:19,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 5: [2023-03-16 19:05:19,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 5: [2023-03-16 19:05:19,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 5: [2023-03-16 19:05:19,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... +23: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... + 5: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 5: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 1: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... +23: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +23: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... + 1: [2023-03-16 19:05:19,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt... + 1: [2023-03-16 19:05:19,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt... + 1: [2023-03-16 19:05:19,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... + 1: [2023-03-16 19:05:19,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt... +26: [2023-03-16 19:05:19,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +26: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +26: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +26: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +26: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +26: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +26: [2023-03-16 19:05:19,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +16: [2023-03-16 19:05:19,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +26: [2023-03-16 19:05:19,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +16: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +21: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +17: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +17: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +18: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +20: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +18: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +18: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +21: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +16: [2023-03-16 19:05:19,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +16: [2023-03-16 19:05:19,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +18: [2023-03-16 19:05:19,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +18: [2023-03-16 19:05:19,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +18: [2023-03-16 19:05:19,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +17: [2023-03-16 19:05:19,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,602] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +16: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +20: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +17: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +20: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... + 0: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +17: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +17: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +17: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +20: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +16: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +16: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +17: [2023-03-16 19:05:19,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +20: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... + 6: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +16: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +16: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +17: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +19: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +19: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +19: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +20: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +19: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +19: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +19: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +19: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +19: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 0: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +20: [2023-03-16 19:05:19,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +20: [2023-03-16 19:05:19,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +20: [2023-03-16 19:05:19,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... + 7: [2023-03-16 19:05:19,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +19: [2023-03-16 19:05:19,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... +19: [2023-03-16 19:05:19,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +19: [2023-03-16 19:05:19,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +19: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt... + 7: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +19: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt... +19: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt... +19: [2023-03-16 19:05:19,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... +19: [2023-03-16 19:05:19,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt... + 6: [2023-03-16 19:05:19,615] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 2: [2023-03-16 19:05:19,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +23: [2023-03-16 19:05:19,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +23: [2023-03-16 19:05:19,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. + 7: [2023-03-16 19:05:19,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. + 6: [2023-03-16 19:05:19,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +29: [2023-03-16 19:05:19,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +29: [2023-03-16 19:05:19,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +29: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +29: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +29: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +29: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... + 1: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_00-model_states.pt. +29: [2023-03-16 19:05:19,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +16: [2023-03-16 19:05:19,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +23: [2023-03-16 19:05:19,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +26: [2023-03-16 19:05:19,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +26: [2023-03-16 19:05:19,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +21: [2023-03-16 19:05:19,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +18: [2023-03-16 19:05:19,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 4: [2023-03-16 19:05:19,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +21: [2023-03-16 19:05:19,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 3: [2023-03-16 19:05:19,634] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +18: [2023-03-16 19:05:19,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +17: [2023-03-16 19:05:19,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +25: [2023-03-16 19:05:19,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +16: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +17: [2023-03-16 19:05:19,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +30: [2023-03-16 19:05:19,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. + 5: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +22: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +30: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +20: [2023-03-16 19:05:19,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +22: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +25: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +25: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +25: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +25: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +25: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +20: [2023-03-16 19:05:19,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +25: [2023-03-16 19:05:19,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +24: [2023-03-16 19:05:19,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +25: [2023-03-16 19:05:19,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +16: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +24: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +30: [2023-03-16 19:05:19,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +21: [2023-03-16 19:05:19,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +30: [2023-03-16 19:05:19,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +30: [2023-03-16 19:05:19,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +30: [2023-03-16 19:05:19,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +30: [2023-03-16 19:05:19,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +26: [2023-03-16 19:05:19,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,644] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +24: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +26: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +21: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +19: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_00-model_states.pt. +24: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +17: [2023-03-16 19:05:19,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +24: [2023-03-16 19:05:19,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +18: [2023-03-16 19:05:19,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:19,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +24: [2023-03-16 19:05:19,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +24: [2023-03-16 19:05:19,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +24: [2023-03-16 19:05:19,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +24: [2023-03-16 19:05:19,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +17: [2023-03-16 19:05:19,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +16: [2023-03-16 19:05:19,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +22: [2023-03-16 19:05:19,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +22: [2023-03-16 19:05:19,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +20: [2023-03-16 19:05:19,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +20: [2023-03-16 19:05:19,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,656] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +14: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +14: [2023-03-16 19:05:19,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +28: [2023-03-16 19:05:19,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +14: [2023-03-16 19:05:19,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +28: [2023-03-16 19:05:19,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +14: [2023-03-16 19:05:19,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +14: [2023-03-16 19:05:19,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +14: [2023-03-16 19:05:19,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +28: [2023-03-16 19:05:19,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +14: [2023-03-16 19:05:19,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +14: [2023-03-16 19:05:19,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +14: [2023-03-16 19:05:19,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +14: [2023-03-16 19:05:19,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +28: [2023-03-16 19:05:19,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +28: [2023-03-16 19:05:19,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +28: [2023-03-16 19:05:19,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +28: [2023-03-16 19:05:19,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +27: [2023-03-16 19:05:19,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +29: [2023-03-16 19:05:19,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +29: [2023-03-16 19:05:19,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +27: [2023-03-16 19:05:19,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +11: [2023-03-16 19:05:19,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +27: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +27: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +27: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +27: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +27: [2023-03-16 19:05:19,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +27: [2023-03-16 19:05:19,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +27: [2023-03-16 19:05:19,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +30: [2023-03-16 19:05:19,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +25: [2023-03-16 19:05:19,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +11: [2023-03-16 19:05:19,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +25: [2023-03-16 19:05:19,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +11: [2023-03-16 19:05:19,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +11: [2023-03-16 19:05:19,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +30: [2023-03-16 19:05:19,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +11: [2023-03-16 19:05:19,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +11: [2023-03-16 19:05:19,678] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +11: [2023-03-16 19:05:19,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +11: [2023-03-16 19:05:19,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +11: [2023-03-16 19:05:19,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +24: [2023-03-16 19:05:19,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +24: [2023-03-16 19:05:19,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +31: [2023-03-16 19:05:19,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +30: [2023-03-16 19:05:19,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +31: [2023-03-16 19:05:19,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +30: [2023-03-16 19:05:19,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +25: [2023-03-16 19:05:19,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +25: [2023-03-16 19:05:19,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,690] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt... +15: [2023-03-16 19:05:19,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +31: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt... +15: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +31: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt... +31: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +31: [2023-03-16 19:05:19,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt... +15: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +15: [2023-03-16 19:05:19,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +15: [2023-03-16 19:05:19,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +14: [2023-03-16 19:05:19,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +24: [2023-03-16 19:05:19,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +12: [2023-03-16 19:05:19,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +12: [2023-03-16 19:05:19,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +12: [2023-03-16 19:05:19,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +15: [2023-03-16 19:05:19,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +15: [2023-03-16 19:05:19,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +15: [2023-03-16 19:05:19,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +15: [2023-03-16 19:05:19,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +15: [2023-03-16 19:05:19,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +12: [2023-03-16 19:05:19,698] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +12: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +10: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... + 9: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... + 9: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... + 8: [2023-03-16 19:05:19,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +28: [2023-03-16 19:05:19,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +28: [2023-03-16 19:05:19,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. + 8: [2023-03-16 19:05:19,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... + 8: [2023-03-16 19:05:19,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... + 9: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +27: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +14: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +10: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... + 9: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +10: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +10: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +13: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +13: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +13: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +10: [2023-03-16 19:05:19,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,706] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... +27: [2023-03-16 19:05:19,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +10: [2023-03-16 19:05:19,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... +13: [2023-03-16 19:05:19,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt... +13: [2023-03-16 19:05:19,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +13: [2023-03-16 19:05:19,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... + 8: [2023-03-16 19:05:19,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt... + 8: [2023-03-16 19:05:19,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt... + 8: [2023-03-16 19:05:19,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... + 8: [2023-03-16 19:05:19,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt... +14: [2023-03-16 19:05:19,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +11: [2023-03-16 19:05:19,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +11: [2023-03-16 19:05:19,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +28: [2023-03-16 19:05:19,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +28: [2023-03-16 19:05:19,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +11: [2023-03-16 19:05:19,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +11: [2023-03-16 19:05:19,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +12: [2023-03-16 19:05:19,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +31: [2023-03-16 19:05:19,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_00-model_states.pt. +12: [2023-03-16 19:05:19,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. + 9: [2023-03-16 19:05:19,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +31: [2023-03-16 19:05:19,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +31: [2023-03-16 19:05:19,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +10: [2023-03-16 19:05:19,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +13: [2023-03-16 19:05:19,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +12: [2023-03-16 19:05:19,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_00-model_states.pt. +15: [2023-03-16 19:05:19,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:19,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +10: [2023-03-16 19:05:19,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:19,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:19,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +10: [2023-03-16 19:05:19,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 1: [2023-03-16 19:05:19,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 1: [2023-03-16 19:05:19,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 7: [2023-03-16 19:05:19,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 2: [2023-03-16 19:05:19,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 2: [2023-03-16 19:05:19,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 7: [2023-03-16 19:05:19,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 2: [2023-03-16 19:05:19,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 2: [2023-03-16 19:05:19,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 2: [2023-03-16 19:05:19,838] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 5: [2023-03-16 19:05:19,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 2: [2023-03-16 19:05:19,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 0: [2023-03-16 19:05:19,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 5: [2023-03-16 19:05:19,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 7: [2023-03-16 19:05:19,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 0: [2023-03-16 19:05:19,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +17: [2023-03-16 19:05:19,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +17: [2023-03-16 19:05:19,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 2: [2023-03-16 19:05:19,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 2: [2023-03-16 19:05:19,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 4: [2023-03-16 19:05:19,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 4: [2023-03-16 19:05:19,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 7: [2023-03-16 19:05:19,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +17: [2023-03-16 19:05:19,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +17: [2023-03-16 19:05:19,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +28: [2023-03-16 19:05:19,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +28: [2023-03-16 19:05:19,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 2: [2023-03-16 19:05:19,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 6: [2023-03-16 19:05:19,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +22: [2023-03-16 19:05:19,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 2: [2023-03-16 19:05:19,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +22: [2023-03-16 19:05:19,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 4: [2023-03-16 19:05:19,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 1: [2023-03-16 19:05:19,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 4: [2023-03-16 19:05:19,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +28: [2023-03-16 19:05:19,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +28: [2023-03-16 19:05:19,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +29: [2023-03-16 19:05:19,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +29: [2023-03-16 19:05:19,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 0: [2023-03-16 19:05:19,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +22: [2023-03-16 19:05:19,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +16: [2023-03-16 19:05:19,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +22: [2023-03-16 19:05:19,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:19,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +21: [2023-03-16 19:05:19,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +16: [2023-03-16 19:05:19,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 0: [2023-03-16 19:05:19,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 6: [2023-03-16 19:05:19,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +18: [2023-03-16 19:05:19,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 1: [2023-03-16 19:05:19,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +18: [2023-03-16 19:05:19,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 5: [2023-03-16 19:05:19,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +25: [2023-03-16 19:05:19,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +29: [2023-03-16 19:05:19,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +25: [2023-03-16 19:05:19,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 3: [2023-03-16 19:05:19,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. +29: [2023-03-16 19:05:19,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +20: [2023-03-16 19:05:19,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 3: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. +19: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +20: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +19: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 5: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +23: [2023-03-16 19:05:19,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. + 3: [2023-03-16 19:05:19,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. +16: [2023-03-16 19:05:19,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +26: [2023-03-16 19:05:19,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 0: [2023-03-16 19:05:19,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +23: [2023-03-16 19:05:19,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_03-model_states.pt. +16: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +27: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +27: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 3: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. +26: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +21: [2023-03-16 19:05:19,899] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:19,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +18: [2023-03-16 19:05:19,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:19,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +25: [2023-03-16 19:05:19,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 6: [2023-03-16 19:05:19,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. +25: [2023-03-16 19:05:19,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 6: [2023-03-16 19:05:19,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. +20: [2023-03-16 19:05:19,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 5: [2023-03-16 19:05:19,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. +23: [2023-03-16 19:05:19,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +20: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +26: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +30: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 3: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +30: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +26: [2023-03-16 19:05:19,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 3: [2023-03-16 19:05:19,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +24: [2023-03-16 19:05:19,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. +31: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 0: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. +31: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_02-model_states.pt. + 0: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_02-model_states.pt. + 3: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +27: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 7: [2023-03-16 19:05:19,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 3: [2023-03-16 19:05:19,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 3: [2023-03-16 19:05:19,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +30: [2023-03-16 19:05:19,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 6: [2023-03-16 19:05:19,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:19,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +24: [2023-03-16 19:05:19,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +30: [2023-03-16 19:05:19,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:19,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:19,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 3: [2023-03-16 19:05:19,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 4: [2023-03-16 19:05:19,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 1: [2023-03-16 19:05:19,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. +22: [2023-03-16 19:05:19,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +22: [2023-03-16 19:05:19,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. + 1: [2023-03-16 19:05:19,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_03-model_states.pt. + 4: [2023-03-16 19:05:19,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. + 4: [2023-03-16 19:05:19,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_01-model_01-model_states.pt. +22: [2023-03-16 19:05:19,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +22: [2023-03-16 19:05:19,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +23: [2023-03-16 19:05:19,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +23: [2023-03-16 19:05:19,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. + 4: [2023-03-16 19:05:19,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +22: [2023-03-16 19:05:19,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +22: [2023-03-16 19:05:19,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +21: [2023-03-16 19:05:19,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +17: [2023-03-16 19:05:19,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +17: [2023-03-16 19:05:19,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +21: [2023-03-16 19:05:19,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +19: [2023-03-16 19:05:19,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +19: [2023-03-16 19:05:19,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. + 4: [2023-03-16 19:05:19,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 4: [2023-03-16 19:05:19,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +16: [2023-03-16 19:05:19,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +16: [2023-03-16 19:05:19,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +22: [2023-03-16 19:05:19,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +23: [2023-03-16 19:05:19,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +22: [2023-03-16 19:05:19,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:19,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +18: [2023-03-16 19:05:19,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +19: [2023-03-16 19:05:19,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:19,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +17: [2023-03-16 19:05:19,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +17: [2023-03-16 19:05:19,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +16: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +26: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +26: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +16: [2023-03-16 19:05:19,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +25: [2023-03-16 19:05:19,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +25: [2023-03-16 19:05:19,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +27: [2023-03-16 19:05:19,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +27: [2023-03-16 19:05:19,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +24: [2023-03-16 19:05:19,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +23: [2023-03-16 19:05:19,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:19,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +31: [2023-03-16 19:05:19,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +31: [2023-03-16 19:05:19,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +24: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +25: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +29: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +29: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +30: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +30: [2023-03-16 19:05:19,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +18: [2023-03-16 19:05:19,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +12: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +14: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +25: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +12: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +14: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +29: [2023-03-16 19:05:19,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +13: [2023-03-16 19:05:19,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +29: [2023-03-16 19:05:19,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +13: [2023-03-16 19:05:19,968] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +26: [2023-03-16 19:05:19,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +26: [2023-03-16 19:05:19,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. + 9: [2023-03-16 19:05:19,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +27: [2023-03-16 19:05:19,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +27: [2023-03-16 19:05:19,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +28: [2023-03-16 19:05:19,970] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +25: [2023-03-16 19:05:19,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +28: [2023-03-16 19:05:19,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_01-model_states.pt. +25: [2023-03-16 19:05:19,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +31: [2023-03-16 19:05:19,973] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +10: [2023-03-16 19:05:19,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +30: [2023-03-16 19:05:19,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:19,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +10: [2023-03-16 19:05:19,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +30: [2023-03-16 19:05:19,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:19,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +15: [2023-03-16 19:05:19,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +15: [2023-03-16 19:05:19,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +12: [2023-03-16 19:05:19,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +12: [2023-03-16 19:05:19,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +25: [2023-03-16 19:05:19,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +29: [2023-03-16 19:05:19,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +29: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +13: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +29: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +13: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +14: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +25: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 9: [2023-03-16 19:05:19,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +14: [2023-03-16 19:05:19,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +11: [2023-03-16 19:05:19,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. +27: [2023-03-16 19:05:19,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:19,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +20: [2023-03-16 19:05:19,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +28: [2023-03-16 19:05:19,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +29: [2023-03-16 19:05:19,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +28: [2023-03-16 19:05:19,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +20: [2023-03-16 19:05:19,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_02-model_states.pt. +11: [2023-03-16 19:05:19,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_01-model_states.pt. + 8: [2023-03-16 19:05:19,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +10: [2023-03-16 19:05:19,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +10: [2023-03-16 19:05:19,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:19,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:19,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:19,987] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +21: [2023-03-16 19:05:19,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +21: [2023-03-16 19:05:19,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. + 1: [2023-03-16 19:05:19,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 1: [2023-03-16 19:05:19,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +11: [2023-03-16 19:05:19,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 1: [2023-03-16 19:05:19,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +20: [2023-03-16 19:05:19,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +11: [2023-03-16 19:05:19,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +20: [2023-03-16 19:05:19,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 1: [2023-03-16 19:05:19,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 1: [2023-03-16 19:05:19,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 1: [2023-03-16 19:05:19,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 1: [2023-03-16 19:05:19,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 1: [2023-03-16 19:05:19,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +14: [2023-03-16 19:05:19,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +14: [2023-03-16 19:05:19,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. + 9: [2023-03-16 19:05:19,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +11: [2023-03-16 19:05:19,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. + 9: [2023-03-16 19:05:19,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +11: [2023-03-16 19:05:19,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +21: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +17: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +17: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +23: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,001] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +18: [2023-03-16 19:05:20,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +18: [2023-03-16 19:05:20,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +18: [2023-03-16 19:05:20,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,002] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +23: [2023-03-16 19:05:20,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +18: [2023-03-16 19:05:20,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +18: [2023-03-16 19:05:20,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +18: [2023-03-16 19:05:20,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +15: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +15: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +23: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +29: [2023-03-16 19:05:20,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +29: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +29: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +29: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +10: [2023-03-16 19:05:20,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +26: [2023-03-16 19:05:20,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +10: [2023-03-16 19:05:20,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +31: [2023-03-16 19:05:20,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +29: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +14: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +29: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +26: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +26: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +30: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +30: [2023-03-16 19:05:20,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +30: [2023-03-16 19:05:20,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +30: [2023-03-16 19:05:20,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +31: [2023-03-16 19:05:20,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +14: [2023-03-16 19:05:20,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +26: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +26: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +14: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +31: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +30: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +11: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +26: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +31: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +14: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +11: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +17: [2023-03-16 19:05:20,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +26: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +13: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +17: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +29: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +29: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +25: [2023-03-16 19:05:20,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +25: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +25: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +13: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +12: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +24: [2023-03-16 19:05:20,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +25: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +25: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +21: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +12: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_02-model_states.pt. +25: [2023-03-16 19:05:20,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +30: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +25: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +24: [2023-03-16 19:05:20,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +15: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +21: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +21: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +21: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +15: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +30: [2023-03-16 19:05:20,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +21: [2023-03-16 19:05:20,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +25: [2023-03-16 19:05:20,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +25: [2023-03-16 19:05:20,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +21: [2023-03-16 19:05:20,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +10: [2023-03-16 19:05:20,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +29: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +20: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +10: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +20: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +27: [2023-03-16 19:05:20,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +27: [2023-03-16 19:05:20,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +27: [2023-03-16 19:05:20,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +29: [2023-03-16 19:05:20,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +25: [2023-03-16 19:05:20,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +10: [2023-03-16 19:05:20,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +12: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +10: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +26: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +12: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +29: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +14: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +29: [2023-03-16 19:05:20,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +29: [2023-03-16 19:05:20,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +29: [2023-03-16 19:05:20,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +27: [2023-03-16 19:05:20,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +14: [2023-03-16 19:05:20,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:20,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +29: [2023-03-16 19:05:20,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +29: [2023-03-16 19:05:20,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +12: [2023-03-16 19:05:20,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:20,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:20,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +13: [2023-03-16 19:05:20,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +12: [2023-03-16 19:05:20,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +11: [2023-03-16 19:05:20,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +11: [2023-03-16 19:05:20,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. + 9: [2023-03-16 19:05:20,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. + 9: [2023-03-16 19:05:20,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. + 1: [2023-03-16 19:05:20,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +15: [2023-03-16 19:05:20,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +15: [2023-03-16 19:05:20,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +10: [2023-03-16 19:05:20,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +20: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +10: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +20: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +12: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +26: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +26: [2023-03-16 19:05:20,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +12: [2023-03-16 19:05:20,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +18: [2023-03-16 19:05:20,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +13: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. +13: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +23: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +23: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +11: [2023-03-16 19:05:20,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +30: [2023-03-16 19:05:20,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +31: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +11: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_14-model_03-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +21: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +21: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +30: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +15: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +28: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +29: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +28: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +28: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +29: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 2: [2023-03-16 19:05:20,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +28: [2023-03-16 19:05:20,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +28: [2023-03-16 19:05:20,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +11: [2023-03-16 19:05:20,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +11: [2023-03-16 19:05:20,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +11: [2023-03-16 19:05:20,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +11: [2023-03-16 19:05:20,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +21: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +21: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... +28: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +14: [2023-03-16 19:05:20,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +28: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +14: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +25: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +28: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +28: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt... +14: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +28: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +14: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +28: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt... +11: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +26: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +31: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +11: [2023-03-16 19:05:20,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +24: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +11: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +14: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +11: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +28: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +18: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +14: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +14: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +24: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +14: [2023-03-16 19:05:20,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +25: [2023-03-16 19:05:20,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +27: [2023-03-16 19:05:20,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +21: [2023-03-16 19:05:20,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +21: [2023-03-16 19:05:20,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +27: [2023-03-16 19:05:20,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +31: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 7: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +30: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +23: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +23: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +29: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +29: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +14: [2023-03-16 19:05:20,049] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +30: [2023-03-16 19:05:20,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +24: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. + 5: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +24: [2023-03-16 19:05:20,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 6: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +11: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +14: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +11: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +14: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 3: [2023-03-16 19:05:20,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 3: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +25: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 3: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +31: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... +31: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +26: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 6: [2023-03-16 19:05:20,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... +26: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +14: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... +14: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +24: [2023-03-16 19:05:20,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +11: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +11: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +21: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +27: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +24: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +20: [2023-03-16 19:05:20,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +20: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +20: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +25: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +30: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +20: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt... +20: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +20: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +30: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. + 5: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt... +21: [2023-03-16 19:05:20,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +14: [2023-03-16 19:05:20,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +27: [2023-03-16 19:05:20,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt... +20: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +20: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +20: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +28: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +18: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +18: [2023-03-16 19:05:20,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +28: [2023-03-16 19:05:20,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_36-model_03-model_states.pt. +20: [2023-03-16 19:05:20,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +20: [2023-03-16 19:05:20,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +20: [2023-03-16 19:05:20,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +14: [2023-03-16 19:05:20,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +24: [2023-03-16 19:05:20,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +24: [2023-03-16 19:05:20,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +14: [2023-03-16 19:05:20,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +31: [2023-03-16 19:05:20,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +31: [2023-03-16 19:05:20,065] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +20: [2023-03-16 19:05:20,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +26: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +26: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +30: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +16: [2023-03-16 19:05:20,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,068] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +10: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +20: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +16: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +18: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +20: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +30: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +13: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +13: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +13: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +16: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +16: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +12: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +16: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +16: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +13: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +13: [2023-03-16 19:05:20,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +16: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +13: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +28: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +10: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +10: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +17: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +11: [2023-03-16 19:05:20,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +18: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +28: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt... +10: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +11: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +12: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +12: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +14: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +17: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +19: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +12: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +15: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +20: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +19: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +26: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +26: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +15: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +15: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +15: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +19: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +12: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +15: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +12: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +12: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +11: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,072] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +15: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +13: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +13: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +13: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +17: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +13: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +10: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +17: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +15: [2023-03-16 19:05:20,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +10: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +10: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +11: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +15: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +10: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +19: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +10: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +13: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +17: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:20,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +19: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... + 8: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +13: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +13: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +19: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +19: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +13: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 8: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +13: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +19: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +19: [2023-03-16 19:05:20,075] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... + 8: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +15: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt... +15: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +15: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +15: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt... +28: [2023-03-16 19:05:20,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +14: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +15: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +15: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +15: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt... +11: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +11: [2023-03-16 19:05:20,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 9: [2023-03-16 19:05:20,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +28: [2023-03-16 19:05:20,079] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,080] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 9: [2023-03-16 19:05:20,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 8: [2023-03-16 19:05:20,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +11: [2023-03-16 19:05:20,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +14: [2023-03-16 19:05:20,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +11: [2023-03-16 19:05:20,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,089] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +28: [2023-03-16 19:05:20,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +19: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +16: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. + 8: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt... +16: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +19: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_25-model_01-model_states.pt. +14: [2023-03-16 19:05:20,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +20: [2023-03-16 19:05:20,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +28: [2023-03-16 19:05:20,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,093] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_00-model_states.pt. +20: [2023-03-16 19:05:20,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +16: [2023-03-16 19:05:20,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +12: [2023-03-16 19:05:20,097] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +24: [2023-03-16 19:05:20,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +10: [2023-03-16 19:05:20,098] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +24: [2023-03-16 19:05:20,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +16: [2023-03-16 19:05:20,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +24: [2023-03-16 19:05:20,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... + 9: [2023-03-16 19:05:20,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +19: [2023-03-16 19:05:20,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +31: [2023-03-16 19:05:20,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +12: [2023-03-16 19:05:20,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +19: [2023-03-16 19:05:20,101] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +31: [2023-03-16 19:05:20,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +30: [2023-03-16 19:05:20,101] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +24: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... + 3: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +17: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +10: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +16: [2023-03-16 19:05:20,102] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +31: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +31: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +17: [2023-03-16 19:05:20,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +18: [2023-03-16 19:05:20,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +30: [2023-03-16 19:05:20,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +20: [2023-03-16 19:05:20,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +13: [2023-03-16 19:05:20,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +13: [2023-03-16 19:05:20,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +16: [2023-03-16 19:05:20,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +30: [2023-03-16 19:05:20,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +18: [2023-03-16 19:05:20,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +19: [2023-03-16 19:05:20,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +15: [2023-03-16 19:05:20,106] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +18: [2023-03-16 19:05:20,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,107] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +20: [2023-03-16 19:05:20,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +30: [2023-03-16 19:05:20,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +15: [2023-03-16 19:05:20,108] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_00-model_states.pt. +18: [2023-03-16 19:05:20,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +16: [2023-03-16 19:05:20,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +12: [2023-03-16 19:05:20,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +28: [2023-03-16 19:05:20,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +28: [2023-03-16 19:05:20,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_00-model_states.pt. +28: [2023-03-16 19:05:20,112] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +12: [2023-03-16 19:05:20,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +17: [2023-03-16 19:05:20,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +28: [2023-03-16 19:05:20,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt... +10: [2023-03-16 19:05:20,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +17: [2023-03-16 19:05:20,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +13: [2023-03-16 19:05:20,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +16: [2023-03-16 19:05:20,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +19: [2023-03-16 19:05:20,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +15: [2023-03-16 19:05:20,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +19: [2023-03-16 19:05:20,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +15: [2023-03-16 19:05:20,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +19: [2023-03-16 19:05:20,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +19: [2023-03-16 19:05:20,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +16: [2023-03-16 19:05:20,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +19: [2023-03-16 19:05:20,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +16: [2023-03-16 19:05:20,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +16: [2023-03-16 19:05:20,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt... +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt... +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +22: [2023-03-16 19:05:20,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt... +22: [2023-03-16 19:05:20,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... +22: [2023-03-16 19:05:20,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 1: [2023-03-16 19:05:20,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 1: [2023-03-16 19:05:20,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,299] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 6: [2023-03-16 19:05:20,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 3: [2023-03-16 19:05:20,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 3: [2023-03-16 19:05:20,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +13: [2023-03-16 19:05:20,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +13: [2023-03-16 19:05:20,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 1: [2023-03-16 19:05:20,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 1: [2023-03-16 19:05:20,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +22: [2023-03-16 19:05:20,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. +22: [2023-03-16 19:05:20,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +11: [2023-03-16 19:05:20,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +11: [2023-03-16 19:05:20,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 7: [2023-03-16 19:05:20,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 7: [2023-03-16 19:05:20,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +21: [2023-03-16 19:05:20,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +21: [2023-03-16 19:05:20,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +27: [2023-03-16 19:05:20,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. + 2: [2023-03-16 19:05:20,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +29: [2023-03-16 19:05:20,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +27: [2023-03-16 19:05:20,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. + 6: [2023-03-16 19:05:20,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +29: [2023-03-16 19:05:20,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. + 6: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +31: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +13: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +24: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +31: [2023-03-16 19:05:20,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +13: [2023-03-16 19:05:20,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +24: [2023-03-16 19:05:20,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. + 4: [2023-03-16 19:05:20,321] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 3: [2023-03-16 19:05:20,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,323] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +22: [2023-03-16 19:05:20,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +23: [2023-03-16 19:05:20,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +23: [2023-03-16 19:05:20,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +22: [2023-03-16 19:05:20,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +11: [2023-03-16 19:05:20,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +11: [2023-03-16 19:05:20,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +21: [2023-03-16 19:05:20,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +21: [2023-03-16 19:05:20,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +16: [2023-03-16 19:05:20,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +27: [2023-03-16 19:05:20,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +12: [2023-03-16 19:05:20,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +12: [2023-03-16 19:05:20,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +27: [2023-03-16 19:05:20,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +29: [2023-03-16 19:05:20,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +29: [2023-03-16 19:05:20,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. + 0: [2023-03-16 19:05:20,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +24: [2023-03-16 19:05:20,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +24: [2023-03-16 19:05:20,333] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +15: [2023-03-16 19:05:20,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 2: [2023-03-16 19:05:20,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +15: [2023-03-16 19:05:20,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 2: [2023-03-16 19:05:20,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +28: [2023-03-16 19:05:20,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +20: [2023-03-16 19:05:20,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +23: [2023-03-16 19:05:20,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +23: [2023-03-16 19:05:20,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +20: [2023-03-16 19:05:20,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +18: [2023-03-16 19:05:20,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +18: [2023-03-16 19:05:20,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +14: [2023-03-16 19:05:20,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +14: [2023-03-16 19:05:20,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 4: [2023-03-16 19:05:20,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +17: [2023-03-16 19:05:20,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +17: [2023-03-16 19:05:20,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +10: [2023-03-16 19:05:20,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. + 6: [2023-03-16 19:05:20,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +12: [2023-03-16 19:05:20,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +16: [2023-03-16 19:05:20,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +12: [2023-03-16 19:05:20,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +28: [2023-03-16 19:05:20,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +10: [2023-03-16 19:05:20,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +15: [2023-03-16 19:05:20,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +15: [2023-03-16 19:05:20,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 0: [2023-03-16 19:05:20,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +15: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +27: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +27: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +15: [2023-03-16 19:05:20,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +13: [2023-03-16 19:05:20,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +11: [2023-03-16 19:05:20,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +11: [2023-03-16 19:05:20,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +10: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +20: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. + 7: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +28: [2023-03-16 19:05:20,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +20: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +14: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +10: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +18: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +18: [2023-03-16 19:05:20,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +14: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_02-model_states.pt. +26: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +19: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +26: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 4: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 4: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +17: [2023-03-16 19:05:20,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +17: [2023-03-16 19:05:20,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +14: [2023-03-16 19:05:20,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +28: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +19: [2023-03-16 19:05:20,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +25: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +24: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +15: [2023-03-16 19:05:20,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_02-model_states.pt. +24: [2023-03-16 19:05:20,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +15: [2023-03-16 19:05:20,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +14: [2023-03-16 19:05:20,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +12: [2023-03-16 19:05:20,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,357] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +12: [2023-03-16 19:05:20,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +23: [2023-03-16 19:05:20,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +29: [2023-03-16 19:05:20,358] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 5: [2023-03-16 19:05:20,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +29: [2023-03-16 19:05:20,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +27: [2023-03-16 19:05:20,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +27: [2023-03-16 19:05:20,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +11: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +23: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +23: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +11: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +26: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +21: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. + 9: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +14: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +21: [2023-03-16 19:05:20,361] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +10: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_03-model_states.pt. +10: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +19: [2023-03-16 19:05:20,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. + 7: [2023-03-16 19:05:20,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +14: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +19: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +19: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +19: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +10: [2023-03-16 19:05:20,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +26: [2023-03-16 19:05:20,363] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +26: [2023-03-16 19:05:20,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +26: [2023-03-16 19:05:20,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +23: [2023-03-16 19:05:20,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +23: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +25: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 3: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +23: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. + 0: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +25: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 7: [2023-03-16 19:05:20,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 6: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +24: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_03-model_states.pt. +30: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +30: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +23: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +24: [2023-03-16 19:05:20,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +12: [2023-03-16 19:05:20,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +12: [2023-03-16 19:05:20,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 7: [2023-03-16 19:05:20,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +29: [2023-03-16 19:05:20,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +17: [2023-03-16 19:05:20,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +29: [2023-03-16 19:05:20,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +17: [2023-03-16 19:05:20,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +20: [2023-03-16 19:05:20,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +20: [2023-03-16 19:05:20,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +17: [2023-03-16 19:05:20,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +17: [2023-03-16 19:05:20,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +18: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +18: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +31: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +26: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_03-model_states.pt. +23: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +23: [2023-03-16 19:05:20,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +21: [2023-03-16 19:05:20,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +19: [2023-03-16 19:05:20,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +21: [2023-03-16 19:05:20,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +18: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. + 2: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +26: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +19: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +21: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +21: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +30: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +21: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +21: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +16: [2023-03-16 19:05:20,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +16: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +16: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +30: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +21: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +16: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +16: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +21: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +16: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +23: [2023-03-16 19:05:20,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +18: [2023-03-16 19:05:20,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. + 2: [2023-03-16 19:05:20,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +23: [2023-03-16 19:05:20,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +21: [2023-03-16 19:05:20,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +16: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +16: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +21: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. + 2: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... +30: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +18: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... +16: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +21: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +18: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +21: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +16: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +17: [2023-03-16 19:05:20,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +23: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +17: [2023-03-16 19:05:20,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +18: [2023-03-16 19:05:20,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +18: [2023-03-16 19:05:20,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +17: [2023-03-16 19:05:20,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +30: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +23: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +20: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +20: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +17: [2023-03-16 19:05:20,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +20: [2023-03-16 19:05:20,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +20: [2023-03-16 19:05:20,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +17: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +18: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +31: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +18: [2023-03-16 19:05:20,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +17: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +20: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +20: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +31: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +20: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +31: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +17: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +20: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +28: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +31: [2023-03-16 19:05:20,386] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +28: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +27: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +27: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +20: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +20: [2023-03-16 19:05:20,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +29: [2023-03-16 19:05:20,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +18: [2023-03-16 19:05:20,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +18: [2023-03-16 19:05:20,388] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +29: [2023-03-16 19:05:20,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. + 2: [2023-03-16 19:05:20,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +18: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +17: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +19: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +18: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +18: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +24: [2023-03-16 19:05:20,389] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +24: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +17: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +30: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +30: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +19: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +21: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +17: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +21: [2023-03-16 19:05:20,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +18: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +17: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. + 2: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_03-model_01-model_states.pt. +16: [2023-03-16 19:05:20,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 3: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +17: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +16: [2023-03-16 19:05:20,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +16: [2023-03-16 19:05:20,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... +25: [2023-03-16 19:05:20,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. +25: [2023-03-16 19:05:20,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_02-model_states.pt. + 5: [2023-03-16 19:05:20,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +19: [2023-03-16 19:05:20,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +11: [2023-03-16 19:05:20,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +11: [2023-03-16 19:05:20,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +28: [2023-03-16 19:05:20,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +19: [2023-03-16 19:05:20,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +31: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +29: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +27: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +27: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +19: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +31: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +31: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +31: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +31: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +23: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +28: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,400] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +30: [2023-03-16 19:05:20,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +30: [2023-03-16 19:05:20,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +24: [2023-03-16 19:05:20,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +24: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +23: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +31: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +31: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +25: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +25: [2023-03-16 19:05:20,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +26: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +29: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +29: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +26: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +29: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +30: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +19: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +26: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +30: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +28: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +26: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +31: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +28: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +29: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +28: [2023-03-16 19:05:20,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +28: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +29: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +19: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +27: [2023-03-16 19:05:20,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +19: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +12: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +27: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +27: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... + 7: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +14: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +26: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +14: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +21: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +29: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +27: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +12: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +27: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +12: [2023-03-16 19:05:20,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +27: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +14: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +27: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +14: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +29: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +23: [2023-03-16 19:05:20,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +26: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +12: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +12: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +12: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +21: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +26: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +12: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +21: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... + 5: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +25: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +29: [2023-03-16 19:05:20,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +19: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +11: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +14: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +29: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +16: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +14: [2023-03-16 19:05:20,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +21: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +14: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +14: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +26: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +19: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 6: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +19: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +21: [2023-03-16 19:05:20,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +17: [2023-03-16 19:05:20,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... +18: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +11: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +19: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +19: [2023-03-16 19:05:20,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +23: [2023-03-16 19:05:20,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +19: [2023-03-16 19:05:20,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +30: [2023-03-16 19:05:20,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt... +30: [2023-03-16 19:05:20,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +23: [2023-03-16 19:05:20,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,416] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +30: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +30: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +17: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +20: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. + 2: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +20: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +20: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. + 3: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +31: [2023-03-16 19:05:20,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +18: [2023-03-16 19:05:20,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +21: [2023-03-16 19:05:20,419] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +20: [2023-03-16 19:05:20,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +17: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +20: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +31: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +24: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +16: [2023-03-16 19:05:20,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +24: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +24: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +24: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +24: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +24: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +21: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +30: [2023-03-16 19:05:20,421] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +18: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +18: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +16: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +16: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +20: [2023-03-16 19:05:20,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +17: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... + 3: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +24: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +18: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +20: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,423] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +23: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +17: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +24: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +24: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +21: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +17: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +17: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... + 7: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +30: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +17: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +21: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +30: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +24: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +18: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +24: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +18: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +24: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +18: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +17: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... + 6: [2023-03-16 19:05:20,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +20: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +21: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... + 0: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt... +19: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +23: [2023-03-16 19:05:20,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +18: [2023-03-16 19:05:20,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +30: [2023-03-16 19:05:20,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +16: [2023-03-16 19:05:20,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +19: [2023-03-16 19:05:20,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +21: [2023-03-16 19:05:20,428] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +18: [2023-03-16 19:05:20,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,429] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +29: [2023-03-16 19:05:20,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +17: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +28: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +20: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +31: [2023-03-16 19:05:20,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +27: [2023-03-16 19:05:20,431] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +18: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... + 2: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +21: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +28: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +27: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +18: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +19: [2023-03-16 19:05:20,432] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +31: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt... +26: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... +29: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +29: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +24: [2023-03-16 19:05:20,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +27: [2023-03-16 19:05:20,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... + 1: [2023-03-16 19:05:20,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +27: [2023-03-16 19:05:20,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +20: [2023-03-16 19:05:20,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +27: [2023-03-16 19:05:20,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +16: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +16: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +16: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +12: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +26: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +10: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +27: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +10: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +10: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +24: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +10: [2023-03-16 19:05:20,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +10: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +31: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +10: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +29: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,436] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +12: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +28: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +24: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +30: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +28: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +16: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +28: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +10: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +20: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +15: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +29: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +30: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +15: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +16: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +16: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +25: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +19: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +31: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +14: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +24: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +14: [2023-03-16 19:05:20,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +10: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +10: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +10: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +10: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +31: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +31: [2023-03-16 19:05:20,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +28: [2023-03-16 19:05:20,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +25: [2023-03-16 19:05:20,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +16: [2023-03-16 19:05:20,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +14: [2023-03-16 19:05:20,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +31: [2023-03-16 19:05:20,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +31: [2023-03-16 19:05:20,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +15: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +25: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +15: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +19: [2023-03-16 19:05:20,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +14: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. +19: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +15: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +15: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... + 4: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +29: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +29: [2023-03-16 19:05:20,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +26: [2023-03-16 19:05:20,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +19: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +27: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +15: [2023-03-16 19:05:20,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +25: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt... +31: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +27: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +15: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +15: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +15: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +12: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,448] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +26: [2023-03-16 19:05:20,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +19: [2023-03-16 19:05:20,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +12: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +28: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +11: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +25: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +30: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +11: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +11: [2023-03-16 19:05:20,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +13: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +19: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +30: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +14: [2023-03-16 19:05:20,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +19: [2023-03-16 19:05:20,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. + 9: [2023-03-16 19:05:20,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +28: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +11: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +24: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +11: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +14: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +25: [2023-03-16 19:05:20,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +31: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +13: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +13: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +13: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... +11: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... + 7: [2023-03-16 19:05:20,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +12: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. + 9: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +12: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +10: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +28: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +10: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +28: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +14: [2023-03-16 19:05:20,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +14: [2023-03-16 19:05:20,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt... +24: [2023-03-16 19:05:20,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. +13: [2023-03-16 19:05:20,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_15-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +20: [2023-03-16 19:05:20,461] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +20: [2023-03-16 19:05:20,462] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +26: [2023-03-16 19:05:20,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +26: [2023-03-16 19:05:20,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +20: [2023-03-16 19:05:20,463] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +20: [2023-03-16 19:05:20,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... + 8: [2023-03-16 19:05:20,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +24: [2023-03-16 19:05:20,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +12: [2023-03-16 19:05:20,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +12: [2023-03-16 19:05:20,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +24: [2023-03-16 19:05:20,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +10: [2023-03-16 19:05:20,469] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +25: [2023-03-16 19:05:20,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +25: [2023-03-16 19:05:20,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +13: [2023-03-16 19:05:20,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +13: [2023-03-16 19:05:20,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt... +10: [2023-03-16 19:05:20,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +26: [2023-03-16 19:05:20,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +26: [2023-03-16 19:05:20,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,478] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +28: [2023-03-16 19:05:20,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +15: [2023-03-16 19:05:20,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +15: [2023-03-16 19:05:20,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +25: [2023-03-16 19:05:20,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +11: [2023-03-16 19:05:20,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +10: [2023-03-16 19:05:20,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +28: [2023-03-16 19:05:20,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +28: [2023-03-16 19:05:20,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +25: [2023-03-16 19:05:20,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +15: [2023-03-16 19:05:20,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +14: [2023-03-16 19:05:20,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +28: [2023-03-16 19:05:20,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +15: [2023-03-16 19:05:20,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +15: [2023-03-16 19:05:20,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +14: [2023-03-16 19:05:20,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +10: [2023-03-16 19:05:20,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +15: [2023-03-16 19:05:20,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +13: [2023-03-16 19:05:20,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +22: [2023-03-16 19:05:20,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +22: [2023-03-16 19:05:20,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_02-model_states.pt. +15: [2023-03-16 19:05:20,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +11: [2023-03-16 19:05:20,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +11: [2023-03-16 19:05:20,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +10: [2023-03-16 19:05:20,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +12: [2023-03-16 19:05:20,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +12: [2023-03-16 19:05:20,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... + 8: [2023-03-16 19:05:20,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +15: [2023-03-16 19:05:20,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +30: [2023-03-16 19:05:20,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +30: [2023-03-16 19:05:20,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_37-model_01-model_states.pt. +13: [2023-03-16 19:05:20,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +22: [2023-03-16 19:05:20,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +13: [2023-03-16 19:05:20,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +22: [2023-03-16 19:05:20,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +13: [2023-03-16 19:05:20,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +10: [2023-03-16 19:05:20,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... + 8: [2023-03-16 19:05:20,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +13: [2023-03-16 19:05:20,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +26: [2023-03-16 19:05:20,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +26: [2023-03-16 19:05:20,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +25: [2023-03-16 19:05:20,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +13: [2023-03-16 19:05:20,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_00-model_states.pt. +26: [2023-03-16 19:05:20,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +25: [2023-03-16 19:05:20,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +13: [2023-03-16 19:05:20,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt... +30: [2023-03-16 19:05:20,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +30: [2023-03-16 19:05:20,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt... +25: [2023-03-16 19:05:20,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +22: [2023-03-16 19:05:20,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +22: [2023-03-16 19:05:20,540] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_01-model_states.pt. +22: [2023-03-16 19:05:20,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +22: [2023-03-16 19:05:20,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_26-model_03-model_states.pt. +30: [2023-03-16 19:05:20,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +30: [2023-03-16 19:05:20,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +22: [2023-03-16 19:05:20,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +22: [2023-03-16 19:05:20,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +30: [2023-03-16 19:05:20,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_00-model_states.pt. +22: [2023-03-16 19:05:20,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +30: [2023-03-16 19:05:20,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt... +22: [2023-03-16 19:05:20,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +22: [2023-03-16 19:05:20,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +22: [2023-03-16 19:05:20,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt... +22: [2023-03-16 19:05:20,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +22: [2023-03-16 19:05:20,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt... +22: [2023-03-16 19:05:20,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +22: [2023-03-16 19:05:20,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt... +22: [2023-03-16 19:05:20,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +22: [2023-03-16 19:05:20,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt... +22: [2023-03-16 19:05:20,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_00-model_states.pt. +22: [2023-03-16 19:05:20,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +22: [2023-03-16 19:05:20,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +23: [2023-03-16 19:05:20,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +21: [2023-03-16 19:05:20,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +21: [2023-03-16 19:05:20,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +17: [2023-03-16 19:05:20,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +17: [2023-03-16 19:05:20,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +20: [2023-03-16 19:05:20,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +16: [2023-03-16 19:05:20,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +16: [2023-03-16 19:05:20,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +16: [2023-03-16 19:05:20,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +16: [2023-03-16 19:05:20,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +19: [2023-03-16 19:05:20,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +19: [2023-03-16 19:05:20,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +21: [2023-03-16 19:05:20,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +21: [2023-03-16 19:05:20,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +19: [2023-03-16 19:05:20,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +23: [2023-03-16 19:05:20,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +19: [2023-03-16 19:05:20,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +16: [2023-03-16 19:05:20,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +16: [2023-03-16 19:05:20,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +17: [2023-03-16 19:05:20,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +19: [2023-03-16 19:05:20,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +30: [2023-03-16 19:05:20,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +19: [2023-03-16 19:05:20,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +19: [2023-03-16 19:05:20,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +19: [2023-03-16 19:05:20,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +30: [2023-03-16 19:05:20,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +20: [2023-03-16 19:05:20,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +20: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +21: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +23: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +17: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +16: [2023-03-16 19:05:20,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +18: [2023-03-16 19:05:20,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,699] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +20: [2023-03-16 19:05:20,699] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +17: [2023-03-16 19:05:20,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +23: [2023-03-16 19:05:20,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... + 1: [2023-03-16 19:05:20,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 1: [2023-03-16 19:05:20,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +18: [2023-03-16 19:05:20,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +18: [2023-03-16 19:05:20,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +18: [2023-03-16 19:05:20,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +17: [2023-03-16 19:05:20,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:20,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +16: [2023-03-16 19:05:20,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +21: [2023-03-16 19:05:20,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,707] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +21: [2023-03-16 19:05:20,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +16: [2023-03-16 19:05:20,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +30: [2023-03-16 19:05:20,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +30: [2023-03-16 19:05:20,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +20: [2023-03-16 19:05:20,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 5: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 4: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +17: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. + 2: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +21: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 5: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +17: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. + 0: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. + 7: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_03-model_states.pt. +21: [2023-03-16 19:05:20,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +20: [2023-03-16 19:05:20,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +21: [2023-03-16 19:05:20,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +17: [2023-03-16 19:05:20,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:20,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +21: [2023-03-16 19:05:20,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +20: [2023-03-16 19:05:20,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +17: [2023-03-16 19:05:20,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +21: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +21: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +20: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +17: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,714] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +19: [2023-03-16 19:05:20,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +14: [2023-03-16 19:05:20,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +14: [2023-03-16 19:05:20,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +19: [2023-03-16 19:05:20,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,721] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +19: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +18: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +17: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +29: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +19: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... + 0: [2023-03-16 19:05:20,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +17: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +29: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. + 7: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +19: [2023-03-16 19:05:20,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +25: [2023-03-16 19:05:20,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +18: [2023-03-16 19:05:20,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +25: [2023-03-16 19:05:20,724] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +16: [2023-03-16 19:05:20,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +15: [2023-03-16 19:05:20,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +15: [2023-03-16 19:05:20,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +19: [2023-03-16 19:05:20,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +24: [2023-03-16 19:05:20,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +25: [2023-03-16 19:05:20,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +25: [2023-03-16 19:05:20,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +24: [2023-03-16 19:05:20,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +27: [2023-03-16 19:05:20,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +27: [2023-03-16 19:05:20,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +27: [2023-03-16 19:05:20,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +27: [2023-03-16 19:05:20,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +28: [2023-03-16 19:05:20,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +18: [2023-03-16 19:05:20,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +26: [2023-03-16 19:05:20,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. + 5: [2023-03-16 19:05:20,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +26: [2023-03-16 19:05:20,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. + 5: [2023-03-16 19:05:20,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +28: [2023-03-16 19:05:20,730] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +27: [2023-03-16 19:05:20,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +27: [2023-03-16 19:05:20,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +14: [2023-03-16 19:05:20,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +18: [2023-03-16 19:05:20,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +20: [2023-03-16 19:05:20,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +20: [2023-03-16 19:05:20,732] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +14: [2023-03-16 19:05:20,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +23: [2023-03-16 19:05:20,732] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +10: [2023-03-16 19:05:20,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +10: [2023-03-16 19:05:20,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +18: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +29: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. + 9: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +28: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +28: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. + 9: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +29: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +29: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +21: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +19: [2023-03-16 19:05:20,735] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +29: [2023-03-16 19:05:20,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +25: [2023-03-16 19:05:20,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,737] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +15: [2023-03-16 19:05:20,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +15: [2023-03-16 19:05:20,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +26: [2023-03-16 19:05:20,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +31: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +24: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +13: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +13: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. + 8: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +10: [2023-03-16 19:05:20,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +31: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +16: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +25: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +18: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +24: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +19: [2023-03-16 19:05:20,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +26: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +21: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +11: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. + 2: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +11: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +25: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +24: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +21: [2023-03-16 19:05:20,741] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +11: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +11: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +24: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +28: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +27: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +16: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... + 5: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. + 6: [2023-03-16 19:05:20,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. + 1: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +24: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. + 6: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +28: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +27: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +24: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +16: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +27: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +27: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +11: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. + 4: [2023-03-16 19:05:20,743] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. + 7: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +10: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +26: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +26: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +21: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +21: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. + 0: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +20: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:20,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +27: [2023-03-16 19:05:20,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +27: [2023-03-16 19:05:20,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +17: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +17: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +18: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_02-model_states.pt. +20: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +14: [2023-03-16 19:05:20,747] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +21: [2023-03-16 19:05:20,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:20,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +17: [2023-03-16 19:05:20,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +29: [2023-03-16 19:05:20,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +28: [2023-03-16 19:05:20,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +14: [2023-03-16 19:05:20,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +29: [2023-03-16 19:05:20,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +12: [2023-03-16 19:05:20,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +26: [2023-03-16 19:05:20,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +11: [2023-03-16 19:05:20,744] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +16: [2023-03-16 19:05:20,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +16: [2023-03-16 19:05:20,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +15: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +12: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +26: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +12: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +15: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +12: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,750] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +31: [2023-03-16 19:05:20,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +17: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +19: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +17: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +31: [2023-03-16 19:05:20,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +31: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +31: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +31: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +31: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +30: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +30: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. +19: [2023-03-16 19:05:20,753] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +24: [2023-03-16 19:05:20,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +13: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +18: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +11: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +21: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +25: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +11: [2023-03-16 19:05:20,755] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +24: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +15: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_02-model_states.pt. +29: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +17: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +17: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,756] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +24: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +15: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +29: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +21: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +18: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +17: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +23: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. + 7: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +18: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +17: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +20: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +21: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +23: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. + 6: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +20: [2023-03-16 19:05:20,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. + 4: [2023-03-16 19:05:20,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +10: [2023-03-16 19:05:20,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +23: [2023-03-16 19:05:20,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,758] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +23: [2023-03-16 19:05:20,758] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +15: [2023-03-16 19:05:20,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +15: [2023-03-16 19:05:20,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +24: [2023-03-16 19:05:20,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +19: [2023-03-16 19:05:20,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +19: [2023-03-16 19:05:20,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +14: [2023-03-16 19:05:20,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +14: [2023-03-16 19:05:20,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +16: [2023-03-16 19:05:20,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +16: [2023-03-16 19:05:20,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +14: [2023-03-16 19:05:20,764] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +31: [2023-03-16 19:05:20,765] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. + 8: [2023-03-16 19:05:20,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +26: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +26: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +19: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +12: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +31: [2023-03-16 19:05:20,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +30: [2023-03-16 19:05:20,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +19: [2023-03-16 19:05:20,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... +31: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +14: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +14: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +12: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,768] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +30: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... +23: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +31: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +10: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +29: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +17: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +21: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +25: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +18: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +21: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +20: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +28: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +20: [2023-03-16 19:05:20,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +28: [2023-03-16 19:05:20,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_02-model_states.pt. +10: [2023-03-16 19:05:20,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +29: [2023-03-16 19:05:20,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +10: [2023-03-16 19:05:20,771] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +17: [2023-03-16 19:05:20,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +15: [2023-03-16 19:05:20,772] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +15: [2023-03-16 19:05:20,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +19: [2023-03-16 19:05:20,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +11: [2023-03-16 19:05:20,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +23: [2023-03-16 19:05:20,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +23: [2023-03-16 19:05:20,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +11: [2023-03-16 19:05:20,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. + 1: [2023-03-16 19:05:20,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. +15: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +15: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +23: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +19: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +14: [2023-03-16 19:05:20,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 4: [2023-03-16 19:05:20,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. +16: [2023-03-16 19:05:20,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +15: [2023-03-16 19:05:20,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +15: [2023-03-16 19:05:20,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +10: [2023-03-16 19:05:20,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +16: [2023-03-16 19:05:20,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +11: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +26: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +26: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,780] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 3: [2023-03-16 19:05:20,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. +10: [2023-03-16 19:05:20,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 9: [2023-03-16 19:05:20,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +28: [2023-03-16 19:05:20,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +20: [2023-03-16 19:05:20,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +20: [2023-03-16 19:05:20,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +13: [2023-03-16 19:05:20,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +28: [2023-03-16 19:05:20,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +13: [2023-03-16 19:05:20,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +13: [2023-03-16 19:05:20,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_03-model_states.pt. +11: [2023-03-16 19:05:20,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 2: [2023-03-16 19:05:20,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +10: [2023-03-16 19:05:20,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +11: [2023-03-16 19:05:20,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,788] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +11: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +30: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. +10: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +30: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_38-model_01-model_states.pt. + 4: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +14: [2023-03-16 19:05:20,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +16: [2023-03-16 19:05:20,790] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +15: [2023-03-16 19:05:20,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +16: [2023-03-16 19:05:20,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +15: [2023-03-16 19:05:20,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +12: [2023-03-16 19:05:20,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +13: [2023-03-16 19:05:20,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +11: [2023-03-16 19:05:20,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +11: [2023-03-16 19:05:20,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +10: [2023-03-16 19:05:20,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +16: [2023-03-16 19:05:20,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +10: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +20: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +12: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +11: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +11: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +17: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +16: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +11: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +18: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +18: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. + 3: [2023-03-16 19:05:20,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... +21: [2023-03-16 19:05:20,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,798] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +17: [2023-03-16 19:05:20,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +13: [2023-03-16 19:05:20,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... +16: [2023-03-16 19:05:20,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +18: [2023-03-16 19:05:20,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +10: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +14: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +30: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +12: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +17: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +13: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +30: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +18: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +16: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +23: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +13: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +12: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +19: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +31: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +31: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +19: [2023-03-16 19:05:20,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +21: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +31: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +31: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +21: [2023-03-16 19:05:20,801] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +31: [2023-03-16 19:05:20,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +18: [2023-03-16 19:05:20,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +17: [2023-03-16 19:05:20,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +15: [2023-03-16 19:05:20,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +31: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +18: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +12: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +31: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +23: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +31: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +15: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +31: [2023-03-16 19:05:20,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +19: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... + 3: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +12: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +14: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +12: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +31: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +19: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +15: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +19: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +12: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +31: [2023-03-16 19:05:20,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +10: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +20: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +31: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +15: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +31: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +23: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +20: [2023-03-16 19:05:20,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +19: [2023-03-16 19:05:20,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 3: [2023-03-16 19:05:20,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +23: [2023-03-16 19:05:20,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +23: [2023-03-16 19:05:20,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +15: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +15: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +20: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +12: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +31: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +12: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +31: [2023-03-16 19:05:20,808] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +31: [2023-03-16 19:05:20,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +11: [2023-03-16 19:05:20,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:20,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +18: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +18: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +14: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +23: [2023-03-16 19:05:20,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 6: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +19: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 6: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,812] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +14: [2023-03-16 19:05:20,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 6: [2023-03-16 19:05:20,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +10: [2023-03-16 19:05:20,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +23: [2023-03-16 19:05:20,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +14: [2023-03-16 19:05:20,813] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 6: [2023-03-16 19:05:20,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +14: [2023-03-16 19:05:20,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 0: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 9: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +13: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +10: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +14: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +12: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +14: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 9: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +13: [2023-03-16 19:05:20,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +12: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +11: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +15: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +15: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +11: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... +11: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +11: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 0: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 0: [2023-03-16 19:05:20,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +16: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,818] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +27: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +22: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +11: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +10: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +11: [2023-03-16 19:05:20,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +16: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. +16: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_16-model_01-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +22: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_03-model_states.pt. +27: [2023-03-16 19:05:20,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +16: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 5: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 5: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 4: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 4: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 4: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 9: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +27: [2023-03-16 19:05:20,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 3: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 0: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 1: [2023-03-16 19:05:20,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +20: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +20: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 2: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 2: [2023-03-16 19:05:20,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 2: [2023-03-16 19:05:20,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 2: [2023-03-16 19:05:20,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... +10: [2023-03-16 19:05:20,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +14: [2023-03-16 19:05:20,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 9: [2023-03-16 19:05:20,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 3: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +14: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +13: [2023-03-16 19:05:20,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +10: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +15: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 3: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +20: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +20: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... + 8: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +15: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +11: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +12: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... +31: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +22: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +11: [2023-03-16 19:05:20,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 8: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... +22: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +31: [2023-03-16 19:05:20,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt... +22: [2023-03-16 19:05:20,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. +13: [2023-03-16 19:05:20,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +22: [2023-03-16 19:05:20,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_02-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,839] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +13: [2023-03-16 19:05:20,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +18: [2023-03-16 19:05:20,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +18: [2023-03-16 19:05:20,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +13: [2023-03-16 19:05:20,840] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 6: [2023-03-16 19:05:20,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +13: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt... + 7: [2023-03-16 19:05:20,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +18: [2023-03-16 19:05:20,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +18: [2023-03-16 19:05:20,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +31: [2023-03-16 19:05:20,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 5: [2023-03-16 19:05:20,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,848] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. +31: [2023-03-16 19:05:20,848] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 0: [2023-03-16 19:05:20,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +22: [2023-03-16 19:05:20,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +22: [2023-03-16 19:05:20,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 1: [2023-03-16 19:05:20,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +10: [2023-03-16 19:05:20,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 7: [2023-03-16 19:05:20,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +13: [2023-03-16 19:05:20,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +14: [2023-03-16 19:05:20,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 0: [2023-03-16 19:05:20,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +15: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 5: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +15: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,860] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +27: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 4: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +27: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +10: [2023-03-16 19:05:20,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +12: [2023-03-16 19:05:20,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 9: [2023-03-16 19:05:20,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 0: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 9: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +14: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 1: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +13: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +15: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +14: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +13: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +15: [2023-03-16 19:05:20,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 6: [2023-03-16 19:05:20,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +11: [2023-03-16 19:05:20,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +11: [2023-03-16 19:05:20,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +12: [2023-03-16 19:05:20,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +14: [2023-03-16 19:05:20,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +12: [2023-03-16 19:05:20,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 1: [2023-03-16 19:05:20,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. +11: [2023-03-16 19:05:20,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +11: [2023-03-16 19:05:20,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... + 4: [2023-03-16 19:05:20,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 5: [2023-03-16 19:05:20,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 8: [2023-03-16 19:05:20,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt... +22: [2023-03-16 19:05:20,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. +22: [2023-03-16 19:05:20,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_27-model_01-model_states.pt. + 7: [2023-03-16 19:05:20,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +26: [2023-03-16 19:05:20,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 8: [2023-03-16 19:05:20,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 4: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +26: [2023-03-16 19:05:20,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. + 2: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 2: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +26: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +26: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +26: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +26: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +24: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +24: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +28: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +28: [2023-03-16 19:05:20,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +28: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +28: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +28: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +28: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +30: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +30: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +24: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +30: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +30: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +24: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +24: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +30: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +30: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +30: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +29: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +29: [2023-03-16 19:05:20,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +29: [2023-03-16 19:05:20,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +29: [2023-03-16 19:05:20,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +28: [2023-03-16 19:05:20,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +29: [2023-03-16 19:05:20,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... + 8: [2023-03-16 19:05:20,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +29: [2023-03-16 19:05:20,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +29: [2023-03-16 19:05:20,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +29: [2023-03-16 19:05:20,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +28: [2023-03-16 19:05:20,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +28: [2023-03-16 19:05:20,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +28: [2023-03-16 19:05:20,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +28: [2023-03-16 19:05:20,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +22: [2023-03-16 19:05:20,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +22: [2023-03-16 19:05:20,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:20,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 6: [2023-03-16 19:05:20,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 6: [2023-03-16 19:05:20,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 6: [2023-03-16 19:05:20,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +30: [2023-03-16 19:05:20,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +26: [2023-03-16 19:05:20,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +29: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +29: [2023-03-16 19:05:20,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +24: [2023-03-16 19:05:20,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +22: [2023-03-16 19:05:20,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +22: [2023-03-16 19:05:20,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt... +22: [2023-03-16 19:05:20,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +22: [2023-03-16 19:05:20,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +22: [2023-03-16 19:05:20,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt... +22: [2023-03-16 19:05:20,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt... +22: [2023-03-16 19:05:20,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +28: [2023-03-16 19:05:20,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +28: [2023-03-16 19:05:20,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +22: [2023-03-16 19:05:20,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +25: [2023-03-16 19:05:20,876] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +25: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt... +25: [2023-03-16 19:05:20,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +25: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt... +25: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +25: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt... +25: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt... +25: [2023-03-16 19:05:20,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +25: [2023-03-16 19:05:20,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_00-model_states.pt. +30: [2023-03-16 19:05:20,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +24: [2023-03-16 19:05:20,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +22: [2023-03-16 19:05:20,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt... +26: [2023-03-16 19:05:20,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:20,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:20,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +29: [2023-03-16 19:05:20,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +29: [2023-03-16 19:05:20,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +24: [2023-03-16 19:05:20,926] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +28: [2023-03-16 19:05:20,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +28: [2023-03-16 19:05:20,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +22: [2023-03-16 19:05:20,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_00-model_states.pt. +22: [2023-03-16 19:05:20,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +22: [2023-03-16 19:05:20,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +25: [2023-03-16 19:05:20,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:20,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +17: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +31: [2023-03-16 19:05:21,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +31: [2023-03-16 19:05:21,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +16: [2023-03-16 19:05:21,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +21: [2023-03-16 19:05:21,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:21,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +21: [2023-03-16 19:05:21,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +19: [2023-03-16 19:05:21,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +23: [2023-03-16 19:05:21,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +23: [2023-03-16 19:05:21,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +20: [2023-03-16 19:05:21,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +20: [2023-03-16 19:05:21,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +31: [2023-03-16 19:05:21,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +20: [2023-03-16 19:05:21,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +20: [2023-03-16 19:05:21,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +20: [2023-03-16 19:05:21,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +31: [2023-03-16 19:05:21,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +20: [2023-03-16 19:05:21,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +31: [2023-03-16 19:05:21,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +11: [2023-03-16 19:05:21,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +31: [2023-03-16 19:05:21,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +11: [2023-03-16 19:05:21,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +15: [2023-03-16 19:05:21,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +15: [2023-03-16 19:05:21,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +21: [2023-03-16 19:05:21,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +21: [2023-03-16 19:05:21,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +10: [2023-03-16 19:05:21,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +10: [2023-03-16 19:05:21,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +20: [2023-03-16 19:05:21,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +10: [2023-03-16 19:05:21,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +10: [2023-03-16 19:05:21,079] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:21,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +31: [2023-03-16 19:05:21,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +19: [2023-03-16 19:05:21,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +19: [2023-03-16 19:05:21,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +31: [2023-03-16 19:05:21,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +11: [2023-03-16 19:05:21,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +11: [2023-03-16 19:05:21,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +17: [2023-03-16 19:05:21,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,083] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +15: [2023-03-16 19:05:21,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +27: [2023-03-16 19:05:21,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +27: [2023-03-16 19:05:21,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +16: [2023-03-16 19:05:21,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +15: [2023-03-16 19:05:21,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +21: [2023-03-16 19:05:21,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +21: [2023-03-16 19:05:21,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +19: [2023-03-16 19:05:21,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +23: [2023-03-16 19:05:21,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +23: [2023-03-16 19:05:21,091] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +23: [2023-03-16 19:05:21,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +24: [2023-03-16 19:05:21,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +24: [2023-03-16 19:05:21,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +19: [2023-03-16 19:05:21,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,094] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +17: [2023-03-16 19:05:21,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +18: [2023-03-16 19:05:21,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +27: [2023-03-16 19:05:21,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +27: [2023-03-16 19:05:21,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +21: [2023-03-16 19:05:21,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +21: [2023-03-16 19:05:21,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,100] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +20: [2023-03-16 19:05:21,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +18: [2023-03-16 19:05:21,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +18: [2023-03-16 19:05:21,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +20: [2023-03-16 19:05:21,104] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +23: [2023-03-16 19:05:21,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,105] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,106] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +10: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:21,104] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +24: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +19: [2023-03-16 19:05:21,108] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +24: [2023-03-16 19:05:21,109] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +23: [2023-03-16 19:05:21,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +20: [2023-03-16 19:05:21,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +12: [2023-03-16 19:05:21,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +10: [2023-03-16 19:05:21,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +10: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +12: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +14: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +14: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +10: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +21: [2023-03-16 19:05:21,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +13: [2023-03-16 19:05:21,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +21: [2023-03-16 19:05:21,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +13: [2023-03-16 19:05:21,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +16: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +12: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_03-model_states.pt. +20: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +15: [2023-03-16 19:05:21,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +20: [2023-03-16 19:05:21,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,116] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +15: [2023-03-16 19:05:21,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +15: [2023-03-16 19:05:21,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +15: [2023-03-16 19:05:21,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:21,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +28: [2023-03-16 19:05:21,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +29: [2023-03-16 19:05:21,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +29: [2023-03-16 19:05:21,118] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +15: [2023-03-16 19:05:21,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +18: [2023-03-16 19:05:21,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +12: [2023-03-16 19:05:21,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:21,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +20: [2023-03-16 19:05:21,120] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +28: [2023-03-16 19:05:21,120] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +13: [2023-03-16 19:05:21,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +15: [2023-03-16 19:05:21,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +13: [2023-03-16 19:05:21,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +21: [2023-03-16 19:05:21,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. + 0: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +14: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. + 3: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. + 2: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +13: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +23: [2023-03-16 19:05:21,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +14: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +15: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +17: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +22: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +17: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +22: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +24: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +24: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +17: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +10: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +15: [2023-03-16 19:05:21,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +12: [2023-03-16 19:05:21,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:21,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +13: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +13: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +23: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +19: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +23: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +14: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +12: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +21: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +10: [2023-03-16 19:05:21,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +10: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +10: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +19: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 8: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +21: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +21: [2023-03-16 19:05:21,129] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +28: [2023-03-16 19:05:21,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +14: [2023-03-16 19:05:21,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +17: [2023-03-16 19:05:21,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +29: [2023-03-16 19:05:21,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +29: [2023-03-16 19:05:21,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +28: [2023-03-16 19:05:21,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +17: [2023-03-16 19:05:21,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,133] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. + 4: [2023-03-16 19:05:21,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_03-model_states.pt. +29: [2023-03-16 19:05:21,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +29: [2023-03-16 19:05:21,134] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. + 3: [2023-03-16 19:05:21,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +21: [2023-03-16 19:05:21,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +21: [2023-03-16 19:05:21,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +23: [2023-03-16 19:05:21,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 2: [2023-03-16 19:05:21,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +22: [2023-03-16 19:05:21,136] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +22: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +18: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +18: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +24: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +24: [2023-03-16 19:05:21,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +23: [2023-03-16 19:05:21,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +18: [2023-03-16 19:05:21,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. + 6: [2023-03-16 19:05:21,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 3: [2023-03-16 19:05:21,140] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +17: [2023-03-16 19:05:21,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +23: [2023-03-16 19:05:21,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +14: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +19: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +17: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +19: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +17: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +16: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +23: [2023-03-16 19:05:21,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +30: [2023-03-16 19:05:21,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +30: [2023-03-16 19:05:21,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +19: [2023-03-16 19:05:21,142] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +23: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +17: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +19: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +23: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +23: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. + 1: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +27: [2023-03-16 19:05:21,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +17: [2023-03-16 19:05:21,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +27: [2023-03-16 19:05:21,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +14: [2023-03-16 19:05:21,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,145] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 4: [2023-03-16 19:05:21,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +20: [2023-03-16 19:05:21,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +20: [2023-03-16 19:05:21,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +18: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +21: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +15: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +16: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +12: [2023-03-16 19:05:21,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +29: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +29: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +16: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_02-model_states.pt. +21: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +14: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +16: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +25: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. +25: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. + 1: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 6: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 2: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. +18: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 2: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 1: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. +25: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. + 3: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. +15: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +25: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +18: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_02-model_states.pt. +18: [2023-03-16 19:05:21,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:21,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +16: [2023-03-16 19:05:21,151] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 3: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +12: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +20: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +20: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +28: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +22: [2023-03-16 19:05:21,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +22: [2023-03-16 19:05:21,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_03-model_states.pt. +28: [2023-03-16 19:05:21,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +16: [2023-03-16 19:05:21,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +18: [2023-03-16 19:05:21,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +11: [2023-03-16 19:05:21,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +11: [2023-03-16 19:05:21,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +23: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +16: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +27: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:21,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +13: [2023-03-16 19:05:21,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +18: [2023-03-16 19:05:21,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +27: [2023-03-16 19:05:21,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +23: [2023-03-16 19:05:21,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +30: [2023-03-16 19:05:21,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:21,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +30: [2023-03-16 19:05:21,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +26: [2023-03-16 19:05:21,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +10: [2023-03-16 19:05:21,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +26: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +13: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +26: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_03-model_states.pt. +20: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +19: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +20: [2023-03-16 19:05:21,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +19: [2023-03-16 19:05:21,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +10: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +10: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +16: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +15: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +30: [2023-03-16 19:05:21,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_01-model_states.pt. +12: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +14: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +10: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +12: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +13: [2023-03-16 19:05:21,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +16: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +25: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +12: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +21: [2023-03-16 19:05:21,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +15: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +25: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +10: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +12: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +12: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +12: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +12: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +25: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +20: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +11: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +21: [2023-03-16 19:05:21,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +11: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +21: [2023-03-16 19:05:21,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +28: [2023-03-16 19:05:21,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +22: [2023-03-16 19:05:21,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +22: [2023-03-16 19:05:21,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +28: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +20: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +11: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +23: [2023-03-16 19:05:21,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +11: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +13: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +14: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +30: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,168] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +13: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +11: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +11: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +23: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. +13: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +26: [2023-03-16 19:05:21,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +14: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +26: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +17: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +26: [2023-03-16 19:05:21,171] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +19: [2023-03-16 19:05:21,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +14: [2023-03-16 19:05:21,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +23: [2023-03-16 19:05:21,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +17: [2023-03-16 19:05:21,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +30: [2023-03-16 19:05:21,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 0: [2023-03-16 19:05:21,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +17: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 1: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. +19: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +17: [2023-03-16 19:05:21,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +23: [2023-03-16 19:05:21,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +17: [2023-03-16 19:05:21,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,176] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 2: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. +21: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +14: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +17: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +10: [2023-03-16 19:05:21,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +17: [2023-03-16 19:05:21,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +10: [2023-03-16 19:05:21,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +12: [2023-03-16 19:05:21,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +12: [2023-03-16 19:05:21,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +21: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +15: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. +15: [2023-03-16 19:05:21,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +13: [2023-03-16 19:05:21,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +14: [2023-03-16 19:05:21,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt... +21: [2023-03-16 19:05:21,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +18: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +21: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +13: [2023-03-16 19:05:21,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +14: [2023-03-16 19:05:21,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +14: [2023-03-16 19:05:21,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +19: [2023-03-16 19:05:21,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_02-model_states.pt. +16: [2023-03-16 19:05:21,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +18: [2023-03-16 19:05:21,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +18: [2023-03-16 19:05:21,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +14: [2023-03-16 19:05:21,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +16: [2023-03-16 19:05:21,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +18: [2023-03-16 19:05:21,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +18: [2023-03-16 19:05:21,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +20: [2023-03-16 19:05:21,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +18: [2023-03-16 19:05:21,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +23: [2023-03-16 19:05:21,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +16: [2023-03-16 19:05:21,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +20: [2023-03-16 19:05:21,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 4: [2023-03-16 19:05:21,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +23: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +15: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +15: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +16: [2023-03-16 19:05:21,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 6: [2023-03-16 19:05:21,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +23: [2023-03-16 19:05:21,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +19: [2023-03-16 19:05:21,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +19: [2023-03-16 19:05:21,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +20: [2023-03-16 19:05:21,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +23: [2023-03-16 19:05:21,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +16: [2023-03-16 19:05:21,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +16: [2023-03-16 19:05:21,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... + 6: [2023-03-16 19:05:21,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +11: [2023-03-16 19:05:21,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,198] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +20: [2023-03-16 19:05:21,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +20: [2023-03-16 19:05:21,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +11: [2023-03-16 19:05:21,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +19: [2023-03-16 19:05:21,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +19: [2023-03-16 19:05:21,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +20: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +20: [2023-03-16 19:05:21,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +10: [2023-03-16 19:05:21,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 6: [2023-03-16 19:05:21,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 6: [2023-03-16 19:05:21,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 6: [2023-03-16 19:05:21,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... +12: [2023-03-16 19:05:21,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +12: [2023-03-16 19:05:21,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +10: [2023-03-16 19:05:21,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +10: [2023-03-16 19:05:21,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +13: [2023-03-16 19:05:21,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +11: [2023-03-16 19:05:21,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +13: [2023-03-16 19:05:21,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +11: [2023-03-16 19:05:21,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +22: [2023-03-16 19:05:21,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +22: [2023-03-16 19:05:21,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_28-model_01-model_states.pt. +10: [2023-03-16 19:05:21,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +10: [2023-03-16 19:05:21,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt... +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... +22: [2023-03-16 19:05:21,222] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,225] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +15: [2023-03-16 19:05:21,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +15: [2023-03-16 19:05:21,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 9: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... +10: [2023-03-16 19:05:21,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... +22: [2023-03-16 19:05:21,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +15: [2023-03-16 19:05:21,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +15: [2023-03-16 19:05:21,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +22: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +10: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,231] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 6: [2023-03-16 19:05:21,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 3: [2023-03-16 19:05:21,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 3: [2023-03-16 19:05:21,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +29: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 8: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +29: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +27: [2023-03-16 19:05:21,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +27: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +24: [2023-03-16 19:05:21,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +26: [2023-03-16 19:05:21,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +24: [2023-03-16 19:05:21,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +26: [2023-03-16 19:05:21,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +31: [2023-03-16 19:05:21,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +31: [2023-03-16 19:05:21,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +31: [2023-03-16 19:05:21,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,237] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt... +15: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +31: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +15: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +31: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +30: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +31: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +14: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +31: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +14: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +11: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 4: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... +30: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +30: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +28: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 4: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +30: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +25: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +28: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +30: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +30: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +25: [2023-03-16 19:05:21,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +11: [2023-03-16 19:05:21,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +13: [2023-03-16 19:05:21,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +13: [2023-03-16 19:05:21,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,243] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +12: [2023-03-16 19:05:21,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +12: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 1: [2023-03-16 19:05:21,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 1: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +29: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +29: [2023-03-16 19:05:21,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +30: [2023-03-16 19:05:21,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. +30: [2023-03-16 19:05:21,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_39-model_02-model_states.pt. + 6: [2023-03-16 19:05:21,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +27: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... +27: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... +24: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt... +26: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,248] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt... +24: [2023-03-16 19:05:21,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +28: [2023-03-16 19:05:21,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +28: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_17-model_01-model_states.pt. +15: [2023-03-16 19:05:21,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +28: [2023-03-16 19:05:21,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +28: [2023-03-16 19:05:21,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +28: [2023-03-16 19:05:21,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +28: [2023-03-16 19:05:21,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +22: [2023-03-16 19:05:21,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +28: [2023-03-16 19:05:21,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +27: [2023-03-16 19:05:21,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +28: [2023-03-16 19:05:21,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +14: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +11: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +15: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +27: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +31: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +31: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +27: [2023-03-16 19:05:21,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +14: [2023-03-16 19:05:21,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +28: [2023-03-16 19:05:21,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +27: [2023-03-16 19:05:21,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +27: [2023-03-16 19:05:21,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +13: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +25: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +25: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +10: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +25: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +28: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +13: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +28: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +25: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +11: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +28: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +25: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +28: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +12: [2023-03-16 19:05:21,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +10: [2023-03-16 19:05:21,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +25: [2023-03-16 19:05:21,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +12: [2023-03-16 19:05:21,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +28: [2023-03-16 19:05:21,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +25: [2023-03-16 19:05:21,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +25: [2023-03-16 19:05:21,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +25: [2023-03-16 19:05:21,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +25: [2023-03-16 19:05:21,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +25: [2023-03-16 19:05:21,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:21,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +30: [2023-03-16 19:05:21,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +22: [2023-03-16 19:05:21,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,261] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,264] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt... +10: [2023-03-16 19:05:21,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +22: [2023-03-16 19:05:21,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +22: [2023-03-16 19:05:21,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... +31: [2023-03-16 19:05:21,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,267] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,270] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,270] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +22: [2023-03-16 19:05:21,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_00-model_states.pt. +22: [2023-03-16 19:05:21,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +30: [2023-03-16 19:05:21,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +22: [2023-03-16 19:05:21,276] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +27: [2023-03-16 19:05:21,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +15: [2023-03-16 19:05:21,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +31: [2023-03-16 19:05:21,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,280] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +14: [2023-03-16 19:05:21,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +27: [2023-03-16 19:05:21,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +27: [2023-03-16 19:05:21,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +15: [2023-03-16 19:05:21,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +15: [2023-03-16 19:05:21,282] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +14: [2023-03-16 19:05:21,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +14: [2023-03-16 19:05:21,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +27: [2023-03-16 19:05:21,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +27: [2023-03-16 19:05:21,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_04-model_01-model_states.pt. +13: [2023-03-16 19:05:21,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +15: [2023-03-16 19:05:21,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +28: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +14: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +12: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +13: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +11: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +25: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +11: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +28: [2023-03-16 19:05:21,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +12: [2023-03-16 19:05:21,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +13: [2023-03-16 19:05:21,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +12: [2023-03-16 19:05:21,288] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... + 9: [2023-03-16 19:05:21,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +25: [2023-03-16 19:05:21,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +12: [2023-03-16 19:05:21,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +13: [2023-03-16 19:05:21,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +28: [2023-03-16 19:05:21,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +11: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... + 9: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +11: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +25: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +25: [2023-03-16 19:05:21,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +30: [2023-03-16 19:05:21,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +25: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... +28: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +31: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +25: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +31: [2023-03-16 19:05:21,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +30: [2023-03-16 19:05:21,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +29: [2023-03-16 19:05:21,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +29: [2023-03-16 19:05:21,294] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +27: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +29: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +27: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +28: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +31: [2023-03-16 19:05:21,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +30: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_00-model_states.pt. +29: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +29: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +29: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +29: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +30: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +28: [2023-03-16 19:05:21,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +30: [2023-03-16 19:05:21,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,298] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt... + 5: [2023-03-16 19:05:21,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt... +28: [2023-03-16 19:05:21,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,304] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +28: [2023-03-16 19:05:21,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. +29: [2023-03-16 19:05:21,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +29: [2023-03-16 19:05:21,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... + 5: [2023-03-16 19:05:21,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt... +29: [2023-03-16 19:05:21,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,336] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +26: [2023-03-16 19:05:21,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +24: [2023-03-16 19:05:21,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +24: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +24: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +24: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +26: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +24: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +24: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +26: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +26: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +26: [2023-03-16 19:05:21,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:21,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +26: [2023-03-16 19:05:21,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt... +24: [2023-03-16 19:05:21,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt... +24: [2023-03-16 19:05:21,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt... +24: [2023-03-16 19:05:21,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt... +24: [2023-03-16 19:05:21,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +26: [2023-03-16 19:05:21,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_00-model_states.pt. +24: [2023-03-16 19:05:21,391] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. + 8: [2023-03-16 19:05:21,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +26: [2023-03-16 19:05:21,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +26: [2023-03-16 19:05:21,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +24: [2023-03-16 19:05:21,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +17: [2023-03-16 19:05:21,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +17: [2023-03-16 19:05:21,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +17: [2023-03-16 19:05:21,406] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +21: [2023-03-16 19:05:21,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,412] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +21: [2023-03-16 19:05:21,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +13: [2023-03-16 19:05:21,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +13: [2023-03-16 19:05:21,417] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +15: [2023-03-16 19:05:21,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +15: [2023-03-16 19:05:21,422] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +18: [2023-03-16 19:05:21,423] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,424] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +13: [2023-03-16 19:05:21,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +12: [2023-03-16 19:05:21,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +12: [2023-03-16 19:05:21,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +13: [2023-03-16 19:05:21,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +15: [2023-03-16 19:05:21,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +15: [2023-03-16 19:05:21,434] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +17: [2023-03-16 19:05:21,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +17: [2023-03-16 19:05:21,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +12: [2023-03-16 19:05:21,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +12: [2023-03-16 19:05:21,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +21: [2023-03-16 19:05:21,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +10: [2023-03-16 19:05:21,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,447] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +11: [2023-03-16 19:05:21,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +21: [2023-03-16 19:05:21,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,451] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +22: [2023-03-16 19:05:21,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +28: [2023-03-16 19:05:21,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +28: [2023-03-16 19:05:21,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +18: [2023-03-16 19:05:21,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +14: [2023-03-16 19:05:21,454] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +14: [2023-03-16 19:05:21,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +19: [2023-03-16 19:05:21,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +19: [2023-03-16 19:05:21,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +21: [2023-03-16 19:05:21,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +23: [2023-03-16 19:05:21,456] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +14: [2023-03-16 19:05:21,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +14: [2023-03-16 19:05:21,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +21: [2023-03-16 19:05:21,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +21: [2023-03-16 19:05:21,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +21: [2023-03-16 19:05:21,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +16: [2023-03-16 19:05:21,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +16: [2023-03-16 19:05:21,458] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +16: [2023-03-16 19:05:21,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +16: [2023-03-16 19:05:21,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +11: [2023-03-16 19:05:21,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +18: [2023-03-16 19:05:21,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +18: [2023-03-16 19:05:21,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +11: [2023-03-16 19:05:21,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +16: [2023-03-16 19:05:21,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +11: [2023-03-16 19:05:21,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +31: [2023-03-16 19:05:21,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +31: [2023-03-16 19:05:21,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +23: [2023-03-16 19:05:21,463] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +22: [2023-03-16 19:05:21,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +22: [2023-03-16 19:05:21,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +14: [2023-03-16 19:05:21,464] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +28: [2023-03-16 19:05:21,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +23: [2023-03-16 19:05:21,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +28: [2023-03-16 19:05:21,465] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +23: [2023-03-16 19:05:21,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +18: [2023-03-16 19:05:21,466] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +10: [2023-03-16 19:05:21,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +10: [2023-03-16 19:05:21,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,467] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +19: [2023-03-16 19:05:21,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +23: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +20: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +10: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +20: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +19: [2023-03-16 19:05:21,468] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +21: [2023-03-16 19:05:21,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +14: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_03-model_states.pt. +20: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +23: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +27: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +21: [2023-03-16 19:05:21,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +27: [2023-03-16 19:05:21,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +20: [2023-03-16 19:05:21,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,471] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,472] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +31: [2023-03-16 19:05:21,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +17: [2023-03-16 19:05:21,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +31: [2023-03-16 19:05:21,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +17: [2023-03-16 19:05:21,476] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +14: [2023-03-16 19:05:21,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +10: [2023-03-16 19:05:21,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,480] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,481] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +13: [2023-03-16 19:05:21,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +27: [2023-03-16 19:05:21,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +13: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +15: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +15: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +13: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +13: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +15: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +21: [2023-03-16 19:05:21,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +21: [2023-03-16 19:05:21,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +14: [2023-03-16 19:05:21,486] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +12: [2023-03-16 19:05:21,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,486] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +15: [2023-03-16 19:05:21,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +15: [2023-03-16 19:05:21,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +17: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +17: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +16: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +15: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +15: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +12: [2023-03-16 19:05:21,488] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +19: [2023-03-16 19:05:21,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +15: [2023-03-16 19:05:21,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +12: [2023-03-16 19:05:21,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +19: [2023-03-16 19:05:21,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +14: [2023-03-16 19:05:21,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +16: [2023-03-16 19:05:21,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +10: [2023-03-16 19:05:21,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +16: [2023-03-16 19:05:21,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +10: [2023-03-16 19:05:21,490] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +10: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +12: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +12: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +11: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +14: [2023-03-16 19:05:21,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +23: [2023-03-16 19:05:21,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +11: [2023-03-16 19:05:21,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +20: [2023-03-16 19:05:21,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +20: [2023-03-16 19:05:21,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +18: [2023-03-16 19:05:21,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +11: [2023-03-16 19:05:21,497] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +25: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +21: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +25: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +22: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +22: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +23: [2023-03-16 19:05:21,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +18: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +18: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +22: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +22: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_01-model_states.pt. +20: [2023-03-16 19:05:21,500] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +19: [2023-03-16 19:05:21,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +23: [2023-03-16 19:05:21,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +19: [2023-03-16 19:05:21,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +23: [2023-03-16 19:05:21,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +20: [2023-03-16 19:05:21,501] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +17: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +14: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +23: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,502] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +16: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +10: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +19: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +10: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +23: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +19: [2023-03-16 19:05:21,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +20: [2023-03-16 19:05:21,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +19: [2023-03-16 19:05:21,504] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +18: [2023-03-16 19:05:21,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +18: [2023-03-16 19:05:21,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_03-model_states.pt. +20: [2023-03-16 19:05:21,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +19: [2023-03-16 19:05:21,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +21: [2023-03-16 19:05:21,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +23: [2023-03-16 19:05:21,505] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +11: [2023-03-16 19:05:21,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +16: [2023-03-16 19:05:21,506] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +14: [2023-03-16 19:05:21,507] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +17: [2023-03-16 19:05:21,507] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +29: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +11: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +29: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +16: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +23: [2023-03-16 19:05:21,508] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... + 8: [2023-03-16 19:05:21,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +16: [2023-03-16 19:05:21,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +16: [2023-03-16 19:05:21,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. + 8: [2023-03-16 19:05:21,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +19: [2023-03-16 19:05:21,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +25: [2023-03-16 19:05:21,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +20: [2023-03-16 19:05:21,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +25: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +19: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +22: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +12: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +20: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +22: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +16: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +16: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +12: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +23: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +21: [2023-03-16 19:05:21,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +21: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_29-model_02-model_states.pt. +23: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +14: [2023-03-16 19:05:21,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +18: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +22: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +30: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +22: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +30: [2023-03-16 19:05:21,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +21: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +23: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +13: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +20: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +19: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +22: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +22: [2023-03-16 19:05:21,515] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +16: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +16: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +22: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +22: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +14: [2023-03-16 19:05:21,516] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... +18: [2023-03-16 19:05:21,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +10: [2023-03-16 19:05:21,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +15: [2023-03-16 19:05:21,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +23: [2023-03-16 19:05:21,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +15: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +17: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +17: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +20: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +29: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +16: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +19: [2023-03-16 19:05:21,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,521] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +29: [2023-03-16 19:05:21,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +12: [2023-03-16 19:05:21,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 2: [2023-03-16 19:05:21,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +19: [2023-03-16 19:05:21,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +12: [2023-03-16 19:05:21,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +16: [2023-03-16 19:05:21,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +16: [2023-03-16 19:05:21,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +21: [2023-03-16 19:05:21,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt... +30: [2023-03-16 19:05:21,527] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +13: [2023-03-16 19:05:21,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +30: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +13: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +10: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +10: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +14: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +14: [2023-03-16 19:05:21,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +13: [2023-03-16 19:05:21,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +21: [2023-03-16 19:05:21,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +14: [2023-03-16 19:05:21,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +19: [2023-03-16 19:05:21,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +13: [2023-03-16 19:05:21,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +14: [2023-03-16 19:05:21,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt... + 5: [2023-03-16 19:05:21,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +15: [2023-03-16 19:05:21,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +15: [2023-03-16 19:05:21,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +21: [2023-03-16 19:05:21,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +21: [2023-03-16 19:05:21,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +21: [2023-03-16 19:05:21,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +16: [2023-03-16 19:05:21,535] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +17: [2023-03-16 19:05:21,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 0: [2023-03-16 19:05:21,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +12: [2023-03-16 19:05:21,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... + 9: [2023-03-16 19:05:21,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +17: [2023-03-16 19:05:21,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +16: [2023-03-16 19:05:21,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +17: [2023-03-16 19:05:21,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +23: [2023-03-16 19:05:21,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 6: [2023-03-16 19:05:21,539] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +17: [2023-03-16 19:05:21,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +23: [2023-03-16 19:05:21,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +12: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +14: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +22: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +13: [2023-03-16 19:05:21,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +22: [2023-03-16 19:05:21,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +23: [2023-03-16 19:05:21,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +20: [2023-03-16 19:05:21,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +22: [2023-03-16 19:05:21,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +22: [2023-03-16 19:05:21,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +13: [2023-03-16 19:05:21,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +18: [2023-03-16 19:05:21,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +20: [2023-03-16 19:05:21,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... + 5: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +23: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +24: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +16: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +24: [2023-03-16 19:05:21,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +23: [2023-03-16 19:05:21,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... + 9: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +22: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_05-model_01-model_states.pt. +16: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +20: [2023-03-16 19:05:21,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 1: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 0: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +19: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +19: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +18: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +23: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +19: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +20: [2023-03-16 19:05:21,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +23: [2023-03-16 19:05:21,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +22: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... +20: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... + 6: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +16: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +18: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +18: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +17: [2023-03-16 19:05:21,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +17: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +20: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +20: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +23: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +19: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +19: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +19: [2023-03-16 19:05:21,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +20: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +15: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +15: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +16: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +21: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +17: [2023-03-16 19:05:21,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +17: [2023-03-16 19:05:21,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +20: [2023-03-16 19:05:21,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. + 7: [2023-03-16 19:05:21,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_03-model_states.pt. +27: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +12: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +20: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... +20: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +21: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +11: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +27: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +25: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +27: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +20: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +27: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +25: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +25: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 0: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +25: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +11: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 1: [2023-03-16 19:05:21,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 8: [2023-03-16 19:05:21,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_02-model_states.pt. +27: [2023-03-16 19:05:21,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 3: [2023-03-16 19:05:21,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +19: [2023-03-16 19:05:21,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +12: [2023-03-16 19:05:21,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +12: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +25: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +27: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +25: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +25: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +22: [2023-03-16 19:05:21,559] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. +10: [2023-03-16 19:05:21,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +24: [2023-03-16 19:05:21,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +19: [2023-03-16 19:05:21,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +12: [2023-03-16 19:05:21,561] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +10: [2023-03-16 19:05:21,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +22: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +24: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +22: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +22: [2023-03-16 19:05:21,562] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... +21: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... + 1: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +31: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt... +31: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +10: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +26: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +14: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +28: [2023-03-16 19:05:21,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +26: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +14: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +10: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +28: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +15: [2023-03-16 19:05:21,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +12: [2023-03-16 19:05:21,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +12: [2023-03-16 19:05:21,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +10: [2023-03-16 19:05:21,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +16: [2023-03-16 19:05:21,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +10: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +15: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. + 0: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +26: [2023-03-16 19:05:21,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. +26: [2023-03-16 19:05:21,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_03-model_states.pt. + 0: [2023-03-16 19:05:21,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +16: [2023-03-16 19:05:21,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,568] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +11: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +15: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +15: [2023-03-16 19:05:21,569] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +13: [2023-03-16 19:05:21,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +13: [2023-03-16 19:05:21,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,571] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +13: [2023-03-16 19:05:21,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +13: [2023-03-16 19:05:21,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... + 1: [2023-03-16 19:05:21,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +14: [2023-03-16 19:05:21,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +31: [2023-03-16 19:05:21,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +11: [2023-03-16 19:05:21,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +11: [2023-03-16 19:05:21,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +10: [2023-03-16 19:05:21,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +31: [2023-03-16 19:05:21,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +28: [2023-03-16 19:05:21,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +13: [2023-03-16 19:05:21,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +26: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_18-model_01-model_states.pt. +28: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +14: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +15: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +12: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +14: [2023-03-16 19:05:21,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +26: [2023-03-16 19:05:21,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +13: [2023-03-16 19:05:21,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +14: [2023-03-16 19:05:21,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +26: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +26: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +10: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +14: [2023-03-16 19:05:21,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +15: [2023-03-16 19:05:21,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +29: [2023-03-16 19:05:21,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 0: [2023-03-16 19:05:21,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +25: [2023-03-16 19:05:21,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +29: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +25: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +14: [2023-03-16 19:05:21,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +27: [2023-03-16 19:05:21,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +27: [2023-03-16 19:05:21,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +13: [2023-03-16 19:05:21,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +13: [2023-03-16 19:05:21,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +27: [2023-03-16 19:05:21,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +30: [2023-03-16 19:05:21,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 8: [2023-03-16 19:05:21,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +27: [2023-03-16 19:05:21,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... +11: [2023-03-16 19:05:21,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +11: [2023-03-16 19:05:21,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +25: [2023-03-16 19:05:21,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_00-model_states.pt. +31: [2023-03-16 19:05:21,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +25: [2023-03-16 19:05:21,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +25: [2023-03-16 19:05:21,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt... +25: [2023-03-16 19:05:21,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +31: [2023-03-16 19:05:21,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +31: [2023-03-16 19:05:21,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +31: [2023-03-16 19:05:21,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +31: [2023-03-16 19:05:21,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +27: [2023-03-16 19:05:21,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,597] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +28: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +28: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +28: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +28: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +28: [2023-03-16 19:05:21,598] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +27: [2023-03-16 19:05:21,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +28: [2023-03-16 19:05:21,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +15: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +30: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +30: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +28: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +12: [2023-03-16 19:05:21,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +30: [2023-03-16 19:05:21,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +30: [2023-03-16 19:05:21,601] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +15: [2023-03-16 19:05:21,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +28: [2023-03-16 19:05:21,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,601] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +31: [2023-03-16 19:05:21,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +28: [2023-03-16 19:05:21,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +12: [2023-03-16 19:05:21,602] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +10: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,603] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 7: [2023-03-16 19:05:21,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 7: [2023-03-16 19:05:21,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +29: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +31: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +10: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +13: [2023-03-16 19:05:21,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +25: [2023-03-16 19:05:21,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +12: [2023-03-16 19:05:21,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +15: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +11: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... +15: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +25: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +31: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +29: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +13: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +29: [2023-03-16 19:05:21,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +14: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +29: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +29: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... + 5: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +14: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +12: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 8: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +31: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +31: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +15: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. +24: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_01-model_states.pt. + 0: [2023-03-16 19:05:21,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +10: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 5: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +29: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +14: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +15: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +14: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +29: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +28: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +10: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 2: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 7: [2023-03-16 19:05:21,612] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 3: [2023-03-16 19:05:21,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 3: [2023-03-16 19:05:21,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +28: [2023-03-16 19:05:21,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 8: [2023-03-16 19:05:21,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +13: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 2: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +15: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 3: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +28: [2023-03-16 19:05:21,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +26: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +26: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +13: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +11: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +15: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +11: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +24: [2023-03-16 19:05:21,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +24: [2023-03-16 19:05:21,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +28: [2023-03-16 19:05:21,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,617] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... + 1: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 7: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,618] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +30: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +30: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +24: [2023-03-16 19:05:21,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt... +24: [2023-03-16 19:05:21,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +24: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +11: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +11: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... + 0: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:21,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +24: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +24: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +31: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +31: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. +29: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +28: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +27: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 1: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +27: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_40-model_02-model_states.pt. + 0: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 4: [2023-03-16 19:05:21,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +26: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 2: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +26: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 2: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. +26: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +24: [2023-03-16 19:05:21,624] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +31: [2023-03-16 19:05:21,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +27: [2023-03-16 19:05:21,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +25: [2023-03-16 19:05:21,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_00-model_states.pt. +25: [2023-03-16 19:05:21,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +28: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +26: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +26: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +26: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt... +26: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +26: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... + 9: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt... +26: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +26: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +27: [2023-03-16 19:05:21,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:21,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +27: [2023-03-16 19:05:21,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +24: [2023-03-16 19:05:21,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +30: [2023-03-16 19:05:21,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +30: [2023-03-16 19:05:21,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +30: [2023-03-16 19:05:21,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +25: [2023-03-16 19:05:21,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +25: [2023-03-16 19:05:21,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +29: [2023-03-16 19:05:21,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +27: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... +31: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +27: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,636] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +30: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +29: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,637] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +29: [2023-03-16 19:05:21,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... + 5: [2023-03-16 19:05:21,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,644] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +28: [2023-03-16 19:05:21,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_02-model_states.pt. + 3: [2023-03-16 19:05:21,647] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +30: [2023-03-16 19:05:21,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,649] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +31: [2023-03-16 19:05:21,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +29: [2023-03-16 19:05:21,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +29: [2023-03-16 19:05:21,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +30: [2023-03-16 19:05:21,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,654] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +28: [2023-03-16 19:05:21,655] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +24: [2023-03-16 19:05:21,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +29: [2023-03-16 19:05:21,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +28: [2023-03-16 19:05:21,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,657] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +28: [2023-03-16 19:05:21,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +29: [2023-03-16 19:05:21,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +26: [2023-03-16 19:05:21,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +26: [2023-03-16 19:05:21,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +24: [2023-03-16 19:05:21,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +31: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt... +31: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +26: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +27: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +26: [2023-03-16 19:05:21,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +30: [2023-03-16 19:05:21,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +27: [2023-03-16 19:05:21,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +31: [2023-03-16 19:05:21,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +31: [2023-03-16 19:05:21,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +24: [2023-03-16 19:05:21,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:21,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:21,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +24: [2023-03-16 19:05:21,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +27: [2023-03-16 19:05:21,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +30: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. +24: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +24: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... +25: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +24: [2023-03-16 19:05:21,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... +27: [2023-03-16 19:05:21,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +30: [2023-03-16 19:05:21,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +24: [2023-03-16 19:05:21,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt... + 4: [2023-03-16 19:05:21,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,670] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 1: [2023-03-16 19:05:21,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 6: [2023-03-16 19:05:21,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 2: [2023-03-16 19:05:21,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 4: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 3: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... +26: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... +26: [2023-03-16 19:05:21,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 5: [2023-03-16 19:05:21,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... + 7: [2023-03-16 19:05:21,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt... +21: [2023-03-16 19:05:21,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +21: [2023-03-16 19:05:21,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_06-model_01-model_states.pt. +22: [2023-03-16 19:05:21,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +18: [2023-03-16 19:05:21,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt... +17: [2023-03-16 19:05:21,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +17: [2023-03-16 19:05:21,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +17: [2023-03-16 19:05:21,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +17: [2023-03-16 19:05:21,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +21: [2023-03-16 19:05:21,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +23: [2023-03-16 19:05:21,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,789] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +20: [2023-03-16 19:05:21,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +20: [2023-03-16 19:05:21,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +21: [2023-03-16 19:05:21,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +18: [2023-03-16 19:05:21,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +21: [2023-03-16 19:05:21,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +23: [2023-03-16 19:05:21,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +18: [2023-03-16 19:05:21,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +18: [2023-03-16 19:05:21,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +16: [2023-03-16 19:05:21,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +16: [2023-03-16 19:05:21,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +11: [2023-03-16 19:05:21,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +11: [2023-03-16 19:05:21,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +16: [2023-03-16 19:05:21,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +16: [2023-03-16 19:05:21,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +16: [2023-03-16 19:05:21,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +22: [2023-03-16 19:05:21,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +16: [2023-03-16 19:05:21,805] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +21: [2023-03-16 19:05:21,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:21,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +17: [2023-03-16 19:05:21,807] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +20: [2023-03-16 19:05:21,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +22: [2023-03-16 19:05:21,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,809] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +18: [2023-03-16 19:05:21,810] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_00-model_states.pt. +19: [2023-03-16 19:05:21,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. +19: [2023-03-16 19:05:21,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. + 5: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +21: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +17: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +23: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,811] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +18: [2023-03-16 19:05:21,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,812] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt... +14: [2023-03-16 19:05:21,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +18: [2023-03-16 19:05:21,814] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +17: [2023-03-16 19:05:21,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +14: [2023-03-16 19:05:21,815] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +11: [2023-03-16 19:05:21,815] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:21,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +22: [2023-03-16 19:05:21,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:21,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +16: [2023-03-16 19:05:21,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +16: [2023-03-16 19:05:21,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +17: [2023-03-16 19:05:21,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +17: [2023-03-16 19:05:21,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +20: [2023-03-16 19:05:21,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +15: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +19: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +15: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +23: [2023-03-16 19:05:21,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +19: [2023-03-16 19:05:21,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +12: [2023-03-16 19:05:21,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +10: [2023-03-16 19:05:21,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +17: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +12: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +10: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +13: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +17: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +20: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +13: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +21: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +21: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +11: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +20: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +19: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. + 8: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. + 9: [2023-03-16 19:05:21,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. + 8: [2023-03-16 19:05:21,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +13: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:21,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +10: [2023-03-16 19:05:21,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +17: [2023-03-16 19:05:21,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +10: [2023-03-16 19:05:21,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:21,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +15: [2023-03-16 19:05:21,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +13: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +10: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +13: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +21: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +21: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +12: [2023-03-16 19:05:21,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +15: [2023-03-16 19:05:21,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:21,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:21,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +10: [2023-03-16 19:05:21,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:21,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:21,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:21,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +11: [2023-03-16 19:05:21,838] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +10: [2023-03-16 19:05:21,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +14: [2023-03-16 19:05:21,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +10: [2023-03-16 19:05:21,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. + 8: [2023-03-16 19:05:21,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +14: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +10: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. +15: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_02-model_states.pt. + 8: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +19: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,841] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:21,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:21,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +10: [2023-03-16 19:05:21,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +19: [2023-03-16 19:05:21,843] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +16: [2023-03-16 19:05:21,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +11: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +21: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +21: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +12: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:21,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +12: [2023-03-16 19:05:21,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +13: [2023-03-16 19:05:21,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +22: [2023-03-16 19:05:21,847] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +17: [2023-03-16 19:05:21,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +11: [2023-03-16 19:05:21,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +11: [2023-03-16 19:05:21,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +23: [2023-03-16 19:05:21,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +23: [2023-03-16 19:05:21,849] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +22: [2023-03-16 19:05:21,850] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +18: [2023-03-16 19:05:21,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +17: [2023-03-16 19:05:21,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +17: [2023-03-16 19:05:21,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +22: [2023-03-16 19:05:21,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +16: [2023-03-16 19:05:21,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +17: [2023-03-16 19:05:21,851] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +14: [2023-03-16 19:05:21,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +20: [2023-03-16 19:05:21,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +11: [2023-03-16 19:05:21,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:21,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +16: [2023-03-16 19:05:21,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +16: [2023-03-16 19:05:21,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... + 8: [2023-03-16 19:05:21,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +10: [2023-03-16 19:05:21,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +20: [2023-03-16 19:05:21,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +14: [2023-03-16 19:05:21,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:21,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +18: [2023-03-16 19:05:21,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +18: [2023-03-16 19:05:21,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +17: [2023-03-16 19:05:21,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +21: [2023-03-16 19:05:21,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +17: [2023-03-16 19:05:21,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 9: [2023-03-16 19:05:21,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +16: [2023-03-16 19:05:21,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +16: [2023-03-16 19:05:21,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +10: [2023-03-16 19:05:21,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:21,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +19: [2023-03-16 19:05:21,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +20: [2023-03-16 19:05:21,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +11: [2023-03-16 19:05:21,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +23: [2023-03-16 19:05:21,863] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +19: [2023-03-16 19:05:21,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +20: [2023-03-16 19:05:21,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +17: [2023-03-16 19:05:21,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +22: [2023-03-16 19:05:21,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +12: [2023-03-16 19:05:21,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +20: [2023-03-16 19:05:21,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +17: [2023-03-16 19:05:21,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +13: [2023-03-16 19:05:21,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +23: [2023-03-16 19:05:21,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +16: [2023-03-16 19:05:21,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +13: [2023-03-16 19:05:21,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +19: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +10: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +11: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +13: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. + 9: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_03-model_states.pt. +15: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +18: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +12: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +20: [2023-03-16 19:05:21,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_02-model_states.pt. +19: [2023-03-16 19:05:21,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt... +18: [2023-03-16 19:05:21,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +15: [2023-03-16 19:05:21,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... + 9: [2023-03-16 19:05:21,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +18: [2023-03-16 19:05:21,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +18: [2023-03-16 19:05:21,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... + 9: [2023-03-16 19:05:21,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +15: [2023-03-16 19:05:21,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +18: [2023-03-16 19:05:21,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. + 4: [2023-03-16 19:05:21,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,872] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +18: [2023-03-16 19:05:21,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +19: [2023-03-16 19:05:21,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,873] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +14: [2023-03-16 19:05:21,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,874] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +11: [2023-03-16 19:05:21,875] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +23: [2023-03-16 19:05:21,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +22: [2023-03-16 19:05:21,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +10: [2023-03-16 19:05:21,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +22: [2023-03-16 19:05:21,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +12: [2023-03-16 19:05:21,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +10: [2023-03-16 19:05:21,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:21,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +19: [2023-03-16 19:05:21,879] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +10: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +13: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +14: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:21,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +11: [2023-03-16 19:05:21,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +20: [2023-03-16 19:05:21,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +16: [2023-03-16 19:05:21,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +13: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +14: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +10: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:21,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +14: [2023-03-16 19:05:21,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +16: [2023-03-16 19:05:21,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +13: [2023-03-16 19:05:21,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +10: [2023-03-16 19:05:21,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +18: [2023-03-16 19:05:21,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +12: [2023-03-16 19:05:21,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:21,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +12: [2023-03-16 19:05:21,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +18: [2023-03-16 19:05:21,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +10: [2023-03-16 19:05:21,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +15: [2023-03-16 19:05:21,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:21,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +21: [2023-03-16 19:05:21,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +21: [2023-03-16 19:05:21,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +12: [2023-03-16 19:05:21,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +10: [2023-03-16 19:05:21,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +11: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +11: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 9: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +21: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +21: [2023-03-16 19:05:21,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +14: [2023-03-16 19:05:21,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:21,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +21: [2023-03-16 19:05:21,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... + 8: [2023-03-16 19:05:21,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... + 8: [2023-03-16 19:05:21,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +10: [2023-03-16 19:05:21,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +17: [2023-03-16 19:05:21,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +10: [2023-03-16 19:05:21,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +23: [2023-03-16 19:05:21,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +22: [2023-03-16 19:05:21,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +14: [2023-03-16 19:05:21,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +22: [2023-03-16 19:05:21,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. + 8: [2023-03-16 19:05:21,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +20: [2023-03-16 19:05:21,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +17: [2023-03-16 19:05:21,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +20: [2023-03-16 19:05:21,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +19: [2023-03-16 19:05:21,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +19: [2023-03-16 19:05:21,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +11: [2023-03-16 19:05:21,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +11: [2023-03-16 19:05:21,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +16: [2023-03-16 19:05:21,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +14: [2023-03-16 19:05:21,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +15: [2023-03-16 19:05:21,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:21,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +15: [2023-03-16 19:05:21,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:21,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +21: [2023-03-16 19:05:21,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 8: [2023-03-16 19:05:21,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +12: [2023-03-16 19:05:21,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +15: [2023-03-16 19:05:21,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... +15: [2023-03-16 19:05:21,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt... + 6: [2023-03-16 19:05:21,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,900] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +17: [2023-03-16 19:05:21,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 8: [2023-03-16 19:05:21,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +12: [2023-03-16 19:05:21,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +10: [2023-03-16 19:05:21,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +17: [2023-03-16 19:05:21,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +10: [2023-03-16 19:05:21,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +17: [2023-03-16 19:05:21,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +17: [2023-03-16 19:05:21,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_30-model_03-model_states.pt. +23: [2023-03-16 19:05:21,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 5: [2023-03-16 19:05:21,906] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +22: [2023-03-16 19:05:21,906] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +20: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +19: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +19: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +22: [2023-03-16 19:05:21,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +23: [2023-03-16 19:05:21,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +22: [2023-03-16 19:05:21,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +27: [2023-03-16 19:05:21,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +20: [2023-03-16 19:05:21,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +27: [2023-03-16 19:05:21,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +16: [2023-03-16 19:05:21,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +27: [2023-03-16 19:05:21,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +27: [2023-03-16 19:05:21,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +13: [2023-03-16 19:05:21,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:21,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +22: [2023-03-16 19:05:21,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +20: [2023-03-16 19:05:21,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... + 0: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +20: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +23: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +24: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +18: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +18: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +24: [2023-03-16 19:05:21,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +16: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +16: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +13: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... +13: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt... + 6: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +31: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +30: [2023-03-16 19:05:21,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +31: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +22: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +20: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +28: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +23: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +29: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +29: [2023-03-16 19:05:21,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +17: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... +29: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +31: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +31: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +18: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +29: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +18: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +17: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +16: [2023-03-16 19:05:21,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt... +29: [2023-03-16 19:05:21,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +29: [2023-03-16 19:05:21,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 4: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +16: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. + 2: [2023-03-16 19:05:21,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +28: [2023-03-16 19:05:21,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +27: [2023-03-16 19:05:21,922] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +25: [2023-03-16 19:05:21,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:21,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +16: [2023-03-16 19:05:21,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +25: [2023-03-16 19:05:21,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +16: [2023-03-16 19:05:21,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +25: [2023-03-16 19:05:21,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +16: [2023-03-16 19:05:21,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +25: [2023-03-16 19:05:21,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +21: [2023-03-16 19:05:21,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +24: [2023-03-16 19:05:21,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +24: [2023-03-16 19:05:21,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +26: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +23: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +26: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +25: [2023-03-16 19:05:21,929] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +29: [2023-03-16 19:05:21,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +29: [2023-03-16 19:05:21,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +21: [2023-03-16 19:05:21,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +23: [2023-03-16 19:05:21,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +26: [2023-03-16 19:05:21,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +26: [2023-03-16 19:05:21,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +21: [2023-03-16 19:05:21,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +27: [2023-03-16 19:05:21,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +28: [2023-03-16 19:05:21,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. + 3: [2023-03-16 19:05:21,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 3: [2023-03-16 19:05:21,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +21: [2023-03-16 19:05:21,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... + 2: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 3: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 2: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 2: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 2: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 6: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +31: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +23: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +31: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +30: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +20: [2023-03-16 19:05:21,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +20: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +30: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +22: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +22: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +19: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +27: [2023-03-16 19:05:21,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +23: [2023-03-16 19:05:21,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +20: [2023-03-16 19:05:21,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... + 1: [2023-03-16 19:05:21,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 1: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +26: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. +20: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +26: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 7: [2023-03-16 19:05:21,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. + 0: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +19: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +19: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... + 0: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_03-model_states.pt. +22: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +22: [2023-03-16 19:05:21,941] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +25: [2023-03-16 19:05:21,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 7: [2023-03-16 19:05:21,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 1: [2023-03-16 19:05:21,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +27: [2023-03-16 19:05:21,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +24: [2023-03-16 19:05:21,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +24: [2023-03-16 19:05:21,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:21,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +25: [2023-03-16 19:05:21,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +24: [2023-03-16 19:05:21,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +17: [2023-03-16 19:05:21,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +24: [2023-03-16 19:05:21,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:21,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 2: [2023-03-16 19:05:21,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +31: [2023-03-16 19:05:21,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +31: [2023-03-16 19:05:21,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +17: [2023-03-16 19:05:21,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... + 3: [2023-03-16 19:05:21,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +31: [2023-03-16 19:05:21,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 3: [2023-03-16 19:05:21,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:21,950] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +29: [2023-03-16 19:05:21,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:21,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +17: [2023-03-16 19:05:21,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_00-model_states.pt. +30: [2023-03-16 19:05:21,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:21,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:21,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 0: [2023-03-16 19:05:21,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +17: [2023-03-16 19:05:21,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt... +31: [2023-03-16 19:05:21,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 7: [2023-03-16 19:05:21,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:21,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:21,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. + 1: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +27: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 0: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +24: [2023-03-16 19:05:21,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. + 7: [2023-03-16 19:05:21,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,958] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +25: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +24: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +30: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +27: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +30: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +29: [2023-03-16 19:05:21,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +24: [2023-03-16 19:05:21,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +29: [2023-03-16 19:05:21,962] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +29: [2023-03-16 19:05:21,963] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +27: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +31: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +28: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:21,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:21,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:21,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +29: [2023-03-16 19:05:21,965] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +29: [2023-03-16 19:05:21,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +28: [2023-03-16 19:05:21,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +28: [2023-03-16 19:05:21,967] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +27: [2023-03-16 19:05:21,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +30: [2023-03-16 19:05:21,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:21,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:21,970] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:21,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +28: [2023-03-16 19:05:21,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:21,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:21,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:21,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:21,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +30: [2023-03-16 19:05:21,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +26: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +27: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. + 6: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +26: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. + 6: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +27: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. + 6: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +30: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +26: [2023-03-16 19:05:21,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:21,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +27: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 6: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +25: [2023-03-16 19:05:21,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +25: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +28: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +29: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +24: [2023-03-16 19:05:21,979] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +30: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +31: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +31: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +30: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_02-model_states.pt. +31: [2023-03-16 19:05:21,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +25: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +13: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 6: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +13: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +28: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 6: [2023-03-16 19:05:21,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +31: [2023-03-16 19:05:21,982] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +31: [2023-03-16 19:05:21,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:21,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +26: [2023-03-16 19:05:21,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 6: [2023-03-16 19:05:21,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +28: [2023-03-16 19:05:21,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... + 6: [2023-03-16 19:05:21,984] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +24: [2023-03-16 19:05:21,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +31: [2023-03-16 19:05:21,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +28: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +25: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 4: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +25: [2023-03-16 19:05:21,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +27: [2023-03-16 19:05:21,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +27: [2023-03-16 19:05:21,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 4: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +27: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +27: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +30: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,990] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 1: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +12: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +12: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 4: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +26: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +25: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +29: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +29: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +30: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 4: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +30: [2023-03-16 19:05:21,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +26: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 4: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +30: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 1: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +30: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... + 4: [2023-03-16 19:05:21,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 5: [2023-03-16 19:05:21,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +30: [2023-03-16 19:05:21,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... + 1: [2023-03-16 19:05:21,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +26: [2023-03-16 19:05:21,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +24: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +31: [2023-03-16 19:05:21,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:21,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +24: [2023-03-16 19:05:21,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +13: [2023-03-16 19:05:21,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +13: [2023-03-16 19:05:21,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 5: [2023-03-16 19:05:21,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 1: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +26: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +28: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +26: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +28: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 5: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 5: [2023-03-16 19:05:21,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:21,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +25: [2023-03-16 19:05:22,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +25: [2023-03-16 19:05:22,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +26: [2023-03-16 19:05:22,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt... +24: [2023-03-16 19:05:22,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +26: [2023-03-16 19:05:22,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +26: [2023-03-16 19:05:22,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +12: [2023-03-16 19:05:22,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +12: [2023-03-16 19:05:22,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +24: [2023-03-16 19:05:22,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:22,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +29: [2023-03-16 19:05:22,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... +24: [2023-03-16 19:05:22,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 6: [2023-03-16 19:05:22,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +31: [2023-03-16 19:05:22,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. +24: [2023-03-16 19:05:22,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 1: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 7: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +31: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_41-model_03-model_states.pt. + 4: [2023-03-16 19:05:22,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +28: [2023-03-16 19:05:22,008] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +24: [2023-03-16 19:05:22,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +28: [2023-03-16 19:05:22,009] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +28: [2023-03-16 19:05:22,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +30: [2023-03-16 19:05:22,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:22,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +28: [2023-03-16 19:05:22,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 1: [2023-03-16 19:05:22,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 1: [2023-03-16 19:05:22,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,013] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,013] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +30: [2023-03-16 19:05:22,014] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:22,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +29: [2023-03-16 19:05:22,014] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +28: [2023-03-16 19:05:22,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:22,016] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +29: [2023-03-16 19:05:22,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +28: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 2: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +29: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +31: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,018] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,019] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 7: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +24: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 6: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +24: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +31: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +27: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +27: [2023-03-16 19:05:22,021] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +27: [2023-03-16 19:05:22,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:22,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,023] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +27: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +25: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 3: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +13: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +27: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +27: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +27: [2023-03-16 19:05:22,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +30: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 7: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... + 1: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +25: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +30: [2023-03-16 19:05:22,025] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt... +12: [2023-03-16 19:05:22,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +13: [2023-03-16 19:05:22,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +25: [2023-03-16 19:05:22,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +30: [2023-03-16 19:05:22,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,027] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +26: [2023-03-16 19:05:22,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +25: [2023-03-16 19:05:22,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +12: [2023-03-16 19:05:22,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 9: [2023-03-16 19:05:22,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +31: [2023-03-16 19:05:22,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +31: [2023-03-16 19:05:22,030] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +26: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,030] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +25: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_02-model_states.pt. +25: [2023-03-16 19:05:22,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +12: [2023-03-16 19:05:22,032] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +28: [2023-03-16 19:05:22,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +25: [2023-03-16 19:05:22,033] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +26: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +28: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +31: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +31: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... +13: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +13: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +12: [2023-03-16 19:05:22,034] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +26: [2023-03-16 19:05:22,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt... + 4: [2023-03-16 19:05:22,037] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +26: [2023-03-16 19:05:22,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +26: [2023-03-16 19:05:22,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:22,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +29: [2023-03-16 19:05:22,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +29: [2023-03-16 19:05:22,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +29: [2023-03-16 19:05:22,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 4: [2023-03-16 19:05:22,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,045] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:22,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +11: [2023-03-16 19:05:22,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +31: [2023-03-16 19:05:22,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_07-model_01-model_states.pt. +26: [2023-03-16 19:05:22,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +26: [2023-03-16 19:05:22,049] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... +31: [2023-03-16 19:05:22,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +24: [2023-03-16 19:05:22,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. +31: [2023-03-16 19:05:22,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +31: [2023-03-16 19:05:22,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +24: [2023-03-16 19:05:22,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 6: [2023-03-16 19:05:22,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... +24: [2023-03-16 19:05:22,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +24: [2023-03-16 19:05:22,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,061] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 2: [2023-03-16 19:05:22,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt... +11: [2023-03-16 19:05:22,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +11: [2023-03-16 19:05:22,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,063] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,067] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,067] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 2: [2023-03-16 19:05:22,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,069] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,071] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 3: [2023-03-16 19:05:22,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,080] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,081] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,085] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,088] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,092] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +11: [2023-03-16 19:05:22,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,094] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_00-model_states.pt. +11: [2023-03-16 19:05:22,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt... +11: [2023-03-16 19:05:22,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +11: [2023-03-16 19:05:22,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +13: [2023-03-16 19:05:22,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +13: [2023-03-16 19:05:22,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. + 8: [2023-03-16 19:05:22,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +10: [2023-03-16 19:05:22,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +10: [2023-03-16 19:05:22,122] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +13: [2023-03-16 19:05:22,127] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:22,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +16: [2023-03-16 19:05:22,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +13: [2023-03-16 19:05:22,128] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +14: [2023-03-16 19:05:22,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +16: [2023-03-16 19:05:22,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +16: [2023-03-16 19:05:22,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +14: [2023-03-16 19:05:22,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +23: [2023-03-16 19:05:22,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +23: [2023-03-16 19:05:22,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,131] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +10: [2023-03-16 19:05:22,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +23: [2023-03-16 19:05:22,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:22,133] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:22,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,132] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:22,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:22,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +10: [2023-03-16 19:05:22,135] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +23: [2023-03-16 19:05:22,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +21: [2023-03-16 19:05:22,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +21: [2023-03-16 19:05:22,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +18: [2023-03-16 19:05:22,145] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +18: [2023-03-16 19:05:22,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +23: [2023-03-16 19:05:22,146] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:22,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +18: [2023-03-16 19:05:22,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,147] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:22,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +12: [2023-03-16 19:05:22,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +17: [2023-03-16 19:05:22,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +17: [2023-03-16 19:05:22,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +12: [2023-03-16 19:05:22,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +16: [2023-03-16 19:05:22,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +21: [2023-03-16 19:05:22,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +21: [2023-03-16 19:05:22,157] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +14: [2023-03-16 19:05:22,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +21: [2023-03-16 19:05:22,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +17: [2023-03-16 19:05:22,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +17: [2023-03-16 19:05:22,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +10: [2023-03-16 19:05:22,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +16: [2023-03-16 19:05:22,160] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +15: [2023-03-16 19:05:22,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +15: [2023-03-16 19:05:22,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +17: [2023-03-16 19:05:22,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +10: [2023-03-16 19:05:22,162] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +14: [2023-03-16 19:05:22,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +16: [2023-03-16 19:05:22,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +16: [2023-03-16 19:05:22,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. + 8: [2023-03-16 19:05:22,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +12: [2023-03-16 19:05:22,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +22: [2023-03-16 19:05:22,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +23: [2023-03-16 19:05:22,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +23: [2023-03-16 19:05:22,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +10: [2023-03-16 19:05:22,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +14: [2023-03-16 19:05:22,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +20: [2023-03-16 19:05:22,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +22: [2023-03-16 19:05:22,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:22,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +22: [2023-03-16 19:05:22,166] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +22: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +10: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 9: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +20: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +15: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +15: [2023-03-16 19:05:22,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +10: [2023-03-16 19:05:22,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +10: [2023-03-16 19:05:22,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +20: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +18: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +18: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +20: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +12: [2023-03-16 19:05:22,169] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:22,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +14: [2023-03-16 19:05:22,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +20: [2023-03-16 19:05:22,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:22,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +11: [2023-03-16 19:05:22,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +15: [2023-03-16 19:05:22,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:22,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +21: [2023-03-16 19:05:22,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,174] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_01-model_states.pt. +21: [2023-03-16 19:05:22,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +14: [2023-03-16 19:05:22,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +14: [2023-03-16 19:05:22,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +14: [2023-03-16 19:05:22,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:22,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +23: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +16: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +12: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +23: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +23: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +12: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +23: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +19: [2023-03-16 19:05:22,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +15: [2023-03-16 19:05:22,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +22: [2023-03-16 19:05:22,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:22,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +18: [2023-03-16 19:05:22,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +20: [2023-03-16 19:05:22,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +12: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +12: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +10: [2023-03-16 19:05:22,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +11: [2023-03-16 19:05:22,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:22,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +12: [2023-03-16 19:05:22,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +12: [2023-03-16 19:05:22,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:22,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +23: [2023-03-16 19:05:22,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +13: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +23: [2023-03-16 19:05:22,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +13: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +13: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +10: [2023-03-16 19:05:22,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:22,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +22: [2023-03-16 19:05:22,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +11: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +10: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +21: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +18: [2023-03-16 19:05:22,189] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +10: [2023-03-16 19:05:22,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:22,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +17: [2023-03-16 19:05:22,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +13: [2023-03-16 19:05:22,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:22,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +10: [2023-03-16 19:05:22,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:22,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +12: [2023-03-16 19:05:22,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +18: [2023-03-16 19:05:22,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +12: [2023-03-16 19:05:22,192] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:22,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +15: [2023-03-16 19:05:22,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +15: [2023-03-16 19:05:22,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +19: [2023-03-16 19:05:22,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +19: [2023-03-16 19:05:22,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +21: [2023-03-16 19:05:22,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +15: [2023-03-16 19:05:22,195] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:22,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +17: [2023-03-16 19:05:22,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +10: [2023-03-16 19:05:22,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_02-model_states.pt. +12: [2023-03-16 19:05:22,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +23: [2023-03-16 19:05:22,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +23: [2023-03-16 19:05:22,199] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:22,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +12: [2023-03-16 19:05:22,200] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +15: [2023-03-16 19:05:22,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +21: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +15: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +22: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +12: [2023-03-16 19:05:22,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +17: [2023-03-16 19:05:22,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +14: [2023-03-16 19:05:22,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,205] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +19: [2023-03-16 19:05:22,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +17: [2023-03-16 19:05:22,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +17: [2023-03-16 19:05:22,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +19: [2023-03-16 19:05:22,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +12: [2023-03-16 19:05:22,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +20: [2023-03-16 19:05:22,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +21: [2023-03-16 19:05:22,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +16: [2023-03-16 19:05:22,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +14: [2023-03-16 19:05:22,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +15: [2023-03-16 19:05:22,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +21: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +16: [2023-03-16 19:05:22,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +15: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +22: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +23: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +18: [2023-03-16 19:05:22,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +18: [2023-03-16 19:05:22,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,211] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +23: [2023-03-16 19:05:22,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +15: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +18: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +16: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +15: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +10: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:22,212] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +10: [2023-03-16 19:05:22,213] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +20: [2023-03-16 19:05:22,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +12: [2023-03-16 19:05:22,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +17: [2023-03-16 19:05:22,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +19: [2023-03-16 19:05:22,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,216] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +18: [2023-03-16 19:05:22,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +11: [2023-03-16 19:05:22,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +22: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +12: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +16: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. +10: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +16: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_02-model_states.pt. + 8: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +22: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,218] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +19: [2023-03-16 19:05:22,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +13: [2023-03-16 19:05:22,219] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +14: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +18: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +22: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +13: [2023-03-16 19:05:22,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +11: [2023-03-16 19:05:22,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +13: [2023-03-16 19:05:22,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_03-model_states.pt. +11: [2023-03-16 19:05:22,221] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +22: [2023-03-16 19:05:22,222] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +20: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,223] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +12: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +22: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +10: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +11: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +22: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +22: [2023-03-16 19:05:22,224] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +13: [2023-03-16 19:05:22,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +12: [2023-03-16 19:05:22,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +12: [2023-03-16 19:05:22,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +20: [2023-03-16 19:05:22,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +20: [2023-03-16 19:05:22,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt... +10: [2023-03-16 19:05:22,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +10: [2023-03-16 19:05:22,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +12: [2023-03-16 19:05:22,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,228] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +15: [2023-03-16 19:05:22,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,228] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,229] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +15: [2023-03-16 19:05:22,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +12: [2023-03-16 19:05:22,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +11: [2023-03-16 19:05:22,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +11: [2023-03-16 19:05:22,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +16: [2023-03-16 19:05:22,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:22,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +12: [2023-03-16 19:05:22,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +11: [2023-03-16 19:05:22,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +16: [2023-03-16 19:05:22,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +13: [2023-03-16 19:05:22,233] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,234] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +19: [2023-03-16 19:05:22,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +19: [2023-03-16 19:05:22,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +11: [2023-03-16 19:05:22,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +13: [2023-03-16 19:05:22,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +13: [2023-03-16 19:05:22,235] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +23: [2023-03-16 19:05:22,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +10: [2023-03-16 19:05:22,238] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +13: [2023-03-16 19:05:22,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +10: [2023-03-16 19:05:22,240] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... +20: [2023-03-16 19:05:22,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +21: [2023-03-16 19:05:22,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +23: [2023-03-16 19:05:22,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +15: [2023-03-16 19:05:22,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +21: [2023-03-16 19:05:22,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +23: [2023-03-16 19:05:22,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +23: [2023-03-16 19:05:22,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +21: [2023-03-16 19:05:22,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +15: [2023-03-16 19:05:22,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +21: [2023-03-16 19:05:22,245] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +18: [2023-03-16 19:05:22,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +18: [2023-03-16 19:05:22,247] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +19: [2023-03-16 19:05:22,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +19: [2023-03-16 19:05:22,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +14: [2023-03-16 19:05:22,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +22: [2023-03-16 19:05:22,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +22: [2023-03-16 19:05:22,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +10: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +14: [2023-03-16 19:05:22,252] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +14: [2023-03-16 19:05:22,252] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +19: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +19: [2023-03-16 19:05:22,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +20: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +10: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +20: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +10: [2023-03-16 19:05:22,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +18: [2023-03-16 19:05:22,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,256] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +20: [2023-03-16 19:05:22,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +18: [2023-03-16 19:05:22,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +20: [2023-03-16 19:05:22,257] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +16: [2023-03-16 19:05:22,257] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,258] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +22: [2023-03-16 19:05:22,259] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +23: [2023-03-16 19:05:22,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +23: [2023-03-16 19:05:22,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +16: [2023-03-16 19:05:22,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +17: [2023-03-16 19:05:22,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +11: [2023-03-16 19:05:22,261] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +17: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +11: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +18: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +17: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +18: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +16: [2023-03-16 19:05:22,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +22: [2023-03-16 19:05:22,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +11: [2023-03-16 19:05:22,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +22: [2023-03-16 19:05:22,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +22: [2023-03-16 19:05:22,263] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +16: [2023-03-16 19:05:22,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +11: [2023-03-16 19:05:22,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +14: [2023-03-16 19:05:22,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +22: [2023-03-16 19:05:22,266] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt... +21: [2023-03-16 19:05:22,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +16: [2023-03-16 19:05:22,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +13: [2023-03-16 19:05:22,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +16: [2023-03-16 19:05:22,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +13: [2023-03-16 19:05:22,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +14: [2023-03-16 19:05:22,269] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +21: [2023-03-16 19:05:22,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_31-model_03-model_states.pt. +13: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +13: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt... +30: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +30: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +31: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +27: [2023-03-16 19:05:22,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +27: [2023-03-16 19:05:22,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +31: [2023-03-16 19:05:22,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +29: [2023-03-16 19:05:22,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +26: [2023-03-16 19:05:22,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +29: [2023-03-16 19:05:22,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +26: [2023-03-16 19:05:22,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +23: [2023-03-16 19:05:22,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +23: [2023-03-16 19:05:22,274] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +17: [2023-03-16 19:05:22,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +17: [2023-03-16 19:05:22,275] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +18: [2023-03-16 19:05:22,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +25: [2023-03-16 19:05:22,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +28: [2023-03-16 19:05:22,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +16: [2023-03-16 19:05:22,279] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +25: [2023-03-16 19:05:22,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +28: [2023-03-16 19:05:22,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +19: [2023-03-16 19:05:22,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +19: [2023-03-16 19:05:22,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +16: [2023-03-16 19:05:22,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +21: [2023-03-16 19:05:22,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt... +20: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +24: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +24: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_01-model_states.pt. +19: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +19: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +31: [2023-03-16 19:05:22,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +27: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +27: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +30: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +30: [2023-03-16 19:05:22,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:22,285] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +22: [2023-03-16 19:05:22,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +20: [2023-03-16 19:05:22,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +26: [2023-03-16 19:05:22,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +22: [2023-03-16 19:05:22,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +20: [2023-03-16 19:05:22,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +20: [2023-03-16 19:05:22,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +25: [2023-03-16 19:05:22,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +25: [2023-03-16 19:05:22,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +28: [2023-03-16 19:05:22,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:22,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:22,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +28: [2023-03-16 19:05:22,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:22,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +22: [2023-03-16 19:05:22,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +25: [2023-03-16 19:05:22,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +14: [2023-03-16 19:05:22,295] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. +22: [2023-03-16 19:05:22,295] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +24: [2023-03-16 19:05:22,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,296] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +26: [2023-03-16 19:05:22,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,297] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 4: [2023-03-16 19:05:22,298] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +26: [2023-03-16 19:05:22,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,299] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +29: [2023-03-16 19:05:22,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +14: [2023-03-16 19:05:22,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +29: [2023-03-16 19:05:22,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:22,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +23: [2023-03-16 19:05:22,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +14: [2023-03-16 19:05:22,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,304] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +17: [2023-03-16 19:05:22,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +28: [2023-03-16 19:05:22,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +14: [2023-03-16 19:05:22,305] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... +28: [2023-03-16 19:05:22,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +25: [2023-03-16 19:05:22,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +28: [2023-03-16 19:05:22,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +23: [2023-03-16 19:05:22,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +25: [2023-03-16 19:05:22,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +16: [2023-03-16 19:05:22,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +28: [2023-03-16 19:05:22,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +18: [2023-03-16 19:05:22,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +18: [2023-03-16 19:05:22,307] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 5: [2023-03-16 19:05:22,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +16: [2023-03-16 19:05:22,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +17: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +21: [2023-03-16 19:05:22,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +21: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +18: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +18: [2023-03-16 19:05:22,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +29: [2023-03-16 19:05:22,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +23: [2023-03-16 19:05:22,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +17: [2023-03-16 19:05:22,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +28: [2023-03-16 19:05:22,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +16: [2023-03-16 19:05:22,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +23: [2023-03-16 19:05:22,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +27: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +26: [2023-03-16 19:05:22,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 7: [2023-03-16 19:05:22,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +16: [2023-03-16 19:05:22,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +30: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +30: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +30: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +27: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +30: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +27: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +24: [2023-03-16 19:05:22,315] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +24: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +29: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +25: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +21: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_00-model_states.pt. +31: [2023-03-16 19:05:22,316] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +24: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +31: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +27: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +30: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +30: [2023-03-16 19:05:22,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +30: [2023-03-16 19:05:22,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +24: [2023-03-16 19:05:22,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +21: [2023-03-16 19:05:22,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt... +31: [2023-03-16 19:05:22,318] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 0: [2023-03-16 19:05:22,319] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +30: [2023-03-16 19:05:22,319] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +25: [2023-03-16 19:05:22,320] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,321] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +25: [2023-03-16 19:05:22,322] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... + 6: [2023-03-16 19:05:22,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. + 6: [2023-03-16 19:05:22,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_03-model_states.pt. +25: [2023-03-16 19:05:22,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +24: [2023-03-16 19:05:22,324] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +26: [2023-03-16 19:05:22,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,326] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +26: [2023-03-16 19:05:22,326] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +24: [2023-03-16 19:05:22,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +24: [2023-03-16 19:05:22,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +26: [2023-03-16 19:05:22,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +15: [2023-03-16 19:05:22,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,328] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +15: [2023-03-16 19:05:22,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_19-model_01-model_states.pt. +24: [2023-03-16 19:05:22,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... +25: [2023-03-16 19:05:22,330] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,332] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +29: [2023-03-16 19:05:22,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +28: [2023-03-16 19:05:22,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,335] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +26: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +25: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +29: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +25: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 1: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 4: [2023-03-16 19:05:22,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 7: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,338] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +24: [2023-03-16 19:05:22,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +26: [2023-03-16 19:05:22,339] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. +25: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,340] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,341] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. +15: [2023-03-16 19:05:22,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... +15: [2023-03-16 19:05:22,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 0: [2023-03-16 19:05:22,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_01-model_states.pt. +28: [2023-03-16 19:05:22,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +27: [2023-03-16 19:05:22,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +28: [2023-03-16 19:05:22,344] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +30: [2023-03-16 19:05:22,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +31: [2023-03-16 19:05:22,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +28: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +24: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,349] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +25: [2023-03-16 19:05:22,349] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +25: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +31: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +29: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,352] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,354] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... +30: [2023-03-16 19:05:22,354] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,355] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +13: [2023-03-16 19:05:22,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +27: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +13: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... +30: [2023-03-16 19:05:22,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 5: [2023-03-16 19:05:22,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... +28: [2023-03-16 19:05:22,359] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,361] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +26: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +26: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 4: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +28: [2023-03-16 19:05:22,364] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +13: [2023-03-16 19:05:22,368] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,368] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 6: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +30: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +12: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 2: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +12: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +11: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 2: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +11: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 2: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +13: [2023-03-16 19:05:22,370] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,369] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +15: [2023-03-16 19:05:22,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 7: [2023-03-16 19:05:22,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... +15: [2023-03-16 19:05:22,371] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,372] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +15: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt... + 9: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +29: [2023-03-16 19:05:22,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +29: [2023-03-16 19:05:22,375] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +26: [2023-03-16 19:05:22,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,378] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +25: [2023-03-16 19:05:22,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +25: [2023-03-16 19:05:22,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +25: [2023-03-16 19:05:22,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +12: [2023-03-16 19:05:22,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +12: [2023-03-16 19:05:22,381] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +25: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... + 4: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 1: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 5: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +11: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,382] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +11: [2023-03-16 19:05:22,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +28: [2023-03-16 19:05:22,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +28: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... +29: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,385] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 6: [2023-03-16 19:05:22,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt... +28: [2023-03-16 19:05:22,385] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +28: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... +29: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 7: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 5: [2023-03-16 19:05:22,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +27: [2023-03-16 19:05:22,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +27: [2023-03-16 19:05:22,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 0: [2023-03-16 19:05:22,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 5: [2023-03-16 19:05:22,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 5: [2023-03-16 19:05:22,390] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +28: [2023-03-16 19:05:22,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +28: [2023-03-16 19:05:22,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,392] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,394] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... +30: [2023-03-16 19:05:22,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +30: [2023-03-16 19:05:22,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +31: [2023-03-16 19:05:22,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. +31: [2023-03-16 19:05:22,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,395] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 3: [2023-03-16 19:05:22,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 3: [2023-03-16 19:05:22,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... + 6: [2023-03-16 19:05:22,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt... +13: [2023-03-16 19:05:22,397] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +24: [2023-03-16 19:05:22,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 1: [2023-03-16 19:05:22,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +24: [2023-03-16 19:05:22,398] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,398] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +13: [2023-03-16 19:05:22,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +27: [2023-03-16 19:05:22,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +27: [2023-03-16 19:05:22,399] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +13: [2023-03-16 19:05:22,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +13: [2023-03-16 19:05:22,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 7: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 0: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. +12: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 0: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 4: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. +28: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +28: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 6: [2023-03-16 19:05:22,405] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +26: [2023-03-16 19:05:22,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +11: [2023-03-16 19:05:22,405] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +11: [2023-03-16 19:05:22,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +26: [2023-03-16 19:05:22,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +11: [2023-03-16 19:05:22,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +26: [2023-03-16 19:05:22,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +31: [2023-03-16 19:05:22,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +11: [2023-03-16 19:05:22,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +30: [2023-03-16 19:05:22,408] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,408] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 5: [2023-03-16 19:05:22,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_08-model_02-model_states.pt. +30: [2023-03-16 19:05:22,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +12: [2023-03-16 19:05:22,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +24: [2023-03-16 19:05:22,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,411] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +12: [2023-03-16 19:05:22,411] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +12: [2023-03-16 19:05:22,413] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,415] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,416] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,422] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt... +27: [2023-03-16 19:05:22,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +27: [2023-03-16 19:05:22,430] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +27: [2023-03-16 19:05:22,431] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +29: [2023-03-16 19:05:22,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +29: [2023-03-16 19:05:22,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +28: [2023-03-16 19:05:22,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +28: [2023-03-16 19:05:22,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +30: [2023-03-16 19:05:22,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +30: [2023-03-16 19:05:22,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +28: [2023-03-16 19:05:22,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +28: [2023-03-16 19:05:22,437] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +30: [2023-03-16 19:05:22,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +31: [2023-03-16 19:05:22,439] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +31: [2023-03-16 19:05:22,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +30: [2023-03-16 19:05:22,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +25: [2023-03-16 19:05:22,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +25: [2023-03-16 19:05:22,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +30: [2023-03-16 19:05:22,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +31: [2023-03-16 19:05:22,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +24: [2023-03-16 19:05:22,441] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +30: [2023-03-16 19:05:22,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +24: [2023-03-16 19:05:22,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +31: [2023-03-16 19:05:22,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +10: [2023-03-16 19:05:22,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +10: [2023-03-16 19:05:22,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +24: [2023-03-16 19:05:22,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +24: [2023-03-16 19:05:22,445] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt... +27: [2023-03-16 19:05:22,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +27: [2023-03-16 19:05:22,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +29: [2023-03-16 19:05:22,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +29: [2023-03-16 19:05:22,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +26: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. +26: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. + 1: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,448] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,449] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 2: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +31: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 7: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 4: [2023-03-16 19:05:22,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +30: [2023-03-16 19:05:22,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +30: [2023-03-16 19:05:22,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 2: [2023-03-16 19:05:22,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +24: [2023-03-16 19:05:22,452] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +25: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_42-model_03-model_states.pt. +25: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +31: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,453] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +10: [2023-03-16 19:05:22,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,454] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +10: [2023-03-16 19:05:22,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +27: [2023-03-16 19:05:22,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... + 6: [2023-03-16 19:05:22,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt... +27: [2023-03-16 19:05:22,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +26: [2023-03-16 19:05:22,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +24: [2023-03-16 19:05:22,467] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt... +31: [2023-03-16 19:05:22,472] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +31: [2023-03-16 19:05:22,474] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +29: [2023-03-16 19:05:22,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,477] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +29: [2023-03-16 19:05:22,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +29: [2023-03-16 19:05:22,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +30: [2023-03-16 19:05:22,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +10: [2023-03-16 19:05:22,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +12: [2023-03-16 19:05:22,480] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +30: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +10: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +25: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +15: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +12: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +15: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +25: [2023-03-16 19:05:22,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +12: [2023-03-16 19:05:22,482] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +12: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +31: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +11: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +25: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +13: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +10: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +30: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +13: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +30: [2023-03-16 19:05:22,483] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +15: [2023-03-16 19:05:22,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +11: [2023-03-16 19:05:22,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +25: [2023-03-16 19:05:22,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +15: [2023-03-16 19:05:22,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +31: [2023-03-16 19:05:22,485] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +27: [2023-03-16 19:05:22,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +10: [2023-03-16 19:05:22,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +27: [2023-03-16 19:05:22,489] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +26: [2023-03-16 19:05:22,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +26: [2023-03-16 19:05:22,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +10: [2023-03-16 19:05:22,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +26: [2023-03-16 19:05:22,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +26: [2023-03-16 19:05:22,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +10: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +21: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +19: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +27: [2023-03-16 19:05:22,494] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +10: [2023-03-16 19:05:22,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +21: [2023-03-16 19:05:22,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +27: [2023-03-16 19:05:22,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +19: [2023-03-16 19:05:22,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +24: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. +14: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +24: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +19: [2023-03-16 19:05:22,498] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +14: [2023-03-16 19:05:22,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,499] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +24: [2023-03-16 19:05:22,500] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +24: [2023-03-16 19:05:22,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt... +12: [2023-03-16 19:05:22,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +13: [2023-03-16 19:05:22,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +20: [2023-03-16 19:05:22,510] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +20: [2023-03-16 19:05:22,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +12: [2023-03-16 19:05:22,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +15: [2023-03-16 19:05:22,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,514] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +19: [2023-03-16 19:05:22,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +12: [2023-03-16 19:05:22,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:22,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:22,517] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +11: [2023-03-16 19:05:22,518] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +19: [2023-03-16 19:05:22,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +10: [2023-03-16 19:05:22,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +13: [2023-03-16 19:05:22,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +14: [2023-03-16 19:05:22,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +20: [2023-03-16 19:05:22,522] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,523] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,523] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,524] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +13: [2023-03-16 19:05:22,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +13: [2023-03-16 19:05:22,524] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:22,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. +15: [2023-03-16 19:05:22,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_20-model_01-model_states.pt. + 9: [2023-03-16 19:05:22,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +12: [2023-03-16 19:05:22,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +12: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +12: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +15: [2023-03-16 19:05:22,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,527] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +11: [2023-03-16 19:05:22,528] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:22,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +14: [2023-03-16 19:05:22,528] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +19: [2023-03-16 19:05:22,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +15: [2023-03-16 19:05:22,529] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +19: [2023-03-16 19:05:22,529] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +14: [2023-03-16 19:05:22,530] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,530] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:22,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +19: [2023-03-16 19:05:22,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +11: [2023-03-16 19:05:22,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +11: [2023-03-16 19:05:22,531] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,532] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +15: [2023-03-16 19:05:22,532] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +11: [2023-03-16 19:05:22,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +21: [2023-03-16 19:05:22,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +19: [2023-03-16 19:05:22,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +10: [2023-03-16 19:05:22,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +10: [2023-03-16 19:05:22,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +14: [2023-03-16 19:05:22,537] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +17: [2023-03-16 19:05:22,537] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +17: [2023-03-16 19:05:22,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +18: [2023-03-16 19:05:22,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +12: [2023-03-16 19:05:22,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:22,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +15: [2023-03-16 19:05:22,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +12: [2023-03-16 19:05:22,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +15: [2023-03-16 19:05:22,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt... +15: [2023-03-16 19:05:22,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +15: [2023-03-16 19:05:22,541] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +14: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +22: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +22: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +14: [2023-03-16 19:05:22,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +17: [2023-03-16 19:05:22,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +14: [2023-03-16 19:05:22,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +22: [2023-03-16 19:05:22,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +23: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +13: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +22: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +13: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_03-model_states.pt. +17: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,546] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +10: [2023-03-16 19:05:22,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +10: [2023-03-16 19:05:22,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +17: [2023-03-16 19:05:22,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +19: [2023-03-16 19:05:22,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +18: [2023-03-16 19:05:22,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +23: [2023-03-16 19:05:22,554] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +13: [2023-03-16 19:05:22,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +20: [2023-03-16 19:05:22,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +13: [2023-03-16 19:05:22,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,557] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +18: [2023-03-16 19:05:22,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +20: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +13: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +20: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +13: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +13: [2023-03-16 19:05:22,559] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +20: [2023-03-16 19:05:22,560] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +11: [2023-03-16 19:05:22,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +11: [2023-03-16 19:05:22,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +19: [2023-03-16 19:05:22,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +12: [2023-03-16 19:05:22,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +19: [2023-03-16 19:05:22,564] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +12: [2023-03-16 19:05:22,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +23: [2023-03-16 19:05:22,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +19: [2023-03-16 19:05:22,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +11: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +22: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +22: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +23: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +23: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +14: [2023-03-16 19:05:22,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +19: [2023-03-16 19:05:22,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +20: [2023-03-16 19:05:22,567] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +11: [2023-03-16 19:05:22,567] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +16: [2023-03-16 19:05:22,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +12: [2023-03-16 19:05:22,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +16: [2023-03-16 19:05:22,568] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +15: [2023-03-16 19:05:22,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +12: [2023-03-16 19:05:22,570] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +15: [2023-03-16 19:05:22,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +15: [2023-03-16 19:05:22,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +21: [2023-03-16 19:05:22,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +21: [2023-03-16 19:05:22,571] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_01-model_states.pt. +15: [2023-03-16 19:05:22,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +15: [2023-03-16 19:05:22,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +15: [2023-03-16 19:05:22,572] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +14: [2023-03-16 19:05:22,573] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_00-model_states.pt. +15: [2023-03-16 19:05:22,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +15: [2023-03-16 19:05:22,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +14: [2023-03-16 19:05:22,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +14: [2023-03-16 19:05:22,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt... +19: [2023-03-16 19:05:22,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +19: [2023-03-16 19:05:22,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +14: [2023-03-16 19:05:22,575] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +18: [2023-03-16 19:05:22,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +16: [2023-03-16 19:05:22,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,576] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +14: [2023-03-16 19:05:22,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +10: [2023-03-16 19:05:22,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +23: [2023-03-16 19:05:22,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. + 8: [2023-03-16 19:05:22,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +10: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +18: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +22: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +22: [2023-03-16 19:05:22,579] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +22: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +22: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +22: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +10: [2023-03-16 19:05:22,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +22: [2023-03-16 19:05:22,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +20: [2023-03-16 19:05:22,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +23: [2023-03-16 19:05:22,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +23: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +16: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +21: [2023-03-16 19:05:22,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +21: [2023-03-16 19:05:22,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +17: [2023-03-16 19:05:22,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_02-model_states.pt. +23: [2023-03-16 19:05:22,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +21: [2023-03-16 19:05:22,584] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +18: [2023-03-16 19:05:22,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +19: [2023-03-16 19:05:22,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +13: [2023-03-16 19:05:22,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,590] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +18: [2023-03-16 19:05:22,591] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +23: [2023-03-16 19:05:22,591] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,592] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,592] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +13: [2023-03-16 19:05:22,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +13: [2023-03-16 19:05:22,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt... +20: [2023-03-16 19:05:22,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +22: [2023-03-16 19:05:22,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,594] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +17: [2023-03-16 19:05:22,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,595] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +17: [2023-03-16 19:05:22,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +17: [2023-03-16 19:05:22,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +22: [2023-03-16 19:05:22,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +22: [2023-03-16 19:05:22,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,596] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +22: [2023-03-16 19:05:22,597] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +16: [2023-03-16 19:05:22,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +23: [2023-03-16 19:05:22,599] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,600] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +18: [2023-03-16 19:05:22,604] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:22,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:22,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:22,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +23: [2023-03-16 19:05:22,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +25: [2023-03-16 19:05:22,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +16: [2023-03-16 19:05:22,608] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +25: [2023-03-16 19:05:22,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +21: [2023-03-16 19:05:22,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +21: [2023-03-16 19:05:22,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +20: [2023-03-16 19:05:22,611] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +22: [2023-03-16 19:05:22,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +20: [2023-03-16 19:05:22,613] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +10: [2023-03-16 19:05:22,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +10: [2023-03-16 19:05:22,616] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +21: [2023-03-16 19:05:22,617] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +31: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +24: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +27: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +27: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +24: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +30: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +22: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +30: [2023-03-16 19:05:22,618] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +31: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +28: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +23: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +28: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +16: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +22: [2023-03-16 19:05:22,620] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +18: [2023-03-16 19:05:22,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +15: [2023-03-16 19:05:22,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +22: [2023-03-16 19:05:22,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +15: [2023-03-16 19:05:22,621] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +16: [2023-03-16 19:05:22,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +25: [2023-03-16 19:05:22,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +25: [2023-03-16 19:05:22,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +23: [2023-03-16 19:05:22,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +18: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +23: [2023-03-16 19:05:22,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +16: [2023-03-16 19:05:22,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +17: [2023-03-16 19:05:22,626] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +16: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +17: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +18: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +16: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +22: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +12: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +19: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +21: [2023-03-16 19:05:22,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +19: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +21: [2023-03-16 19:05:22,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,627] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +21: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +21: [2023-03-16 19:05:22,625] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +11: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +11: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +12: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +14: [2023-03-16 19:05:22,628] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +26: [2023-03-16 19:05:22,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +26: [2023-03-16 19:05:22,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +22: [2023-03-16 19:05:22,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +14: [2023-03-16 19:05:22,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_02-model_states.pt. +16: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... +17: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +31: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +17: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +10: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +22: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +30: [2023-03-16 19:05:22,630] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +29: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +24: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +22: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +23: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +29: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_01-model_states.pt. +23: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +10: [2023-03-16 19:05:22,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +27: [2023-03-16 19:05:22,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +31: [2023-03-16 19:05:22,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,632] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,633] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +20: [2023-03-16 19:05:22,633] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +22: [2023-03-16 19:05:22,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +22: [2023-03-16 19:05:22,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +15: [2023-03-16 19:05:22,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +15: [2023-03-16 19:05:22,635] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +26: [2023-03-16 19:05:22,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +16: [2023-03-16 19:05:22,636] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +26: [2023-03-16 19:05:22,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +21: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +18: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +16: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +21: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +18: [2023-03-16 19:05:22,638] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +19: [2023-03-16 19:05:22,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. +19: [2023-03-16 19:05:22,639] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_32-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +16: [2023-03-16 19:05:22,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt... + 9: [2023-03-16 19:05:22,639] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +12: [2023-03-16 19:05:22,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +16: [2023-03-16 19:05:22,640] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +29: [2023-03-16 19:05:22,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,641] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +29: [2023-03-16 19:05:22,641] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +14: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +16: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +11: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +12: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +11: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +26: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +14: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +16: [2023-03-16 19:05:22,642] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt... +17: [2023-03-16 19:05:22,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +29: [2023-03-16 19:05:22,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +17: [2023-03-16 19:05:22,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +29: [2023-03-16 19:05:22,643] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,645] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +20: [2023-03-16 19:05:22,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +29: [2023-03-16 19:05:22,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +29: [2023-03-16 19:05:22,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +22: [2023-03-16 19:05:22,647] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +22: [2023-03-16 19:05:22,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +26: [2023-03-16 19:05:22,649] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +27: [2023-03-16 19:05:22,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +21: [2023-03-16 19:05:22,650] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +21: [2023-03-16 19:05:22,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +18: [2023-03-16 19:05:22,651] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +19: [2023-03-16 19:05:22,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +27: [2023-03-16 19:05:22,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +19: [2023-03-16 19:05:22,652] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +20: [2023-03-16 19:05:22,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt... +29: [2023-03-16 19:05:22,653] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +27: [2023-03-16 19:05:22,653] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +29: [2023-03-16 19:05:22,654] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +28: [2023-03-16 19:05:22,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +28: [2023-03-16 19:05:22,655] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,656] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +28: [2023-03-16 19:05:22,657] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +28: [2023-03-16 19:05:22,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +27: [2023-03-16 19:05:22,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +27: [2023-03-16 19:05:22,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +10: [2023-03-16 19:05:22,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +28: [2023-03-16 19:05:22,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +23: [2023-03-16 19:05:22,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +23: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +27: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +28: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +25: [2023-03-16 19:05:22,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +10: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +15: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +23: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... + 5: [2023-03-16 19:05:22,663] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +23: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +26: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +26: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +25: [2023-03-16 19:05:22,664] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +28: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +25: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +27: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +30: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,665] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +10: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +29: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +15: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +16: [2023-03-16 19:05:22,666] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +28: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +26: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +27: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +29: [2023-03-16 19:05:22,667] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +12: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +24: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +30: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,668] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +28: [2023-03-16 19:05:22,669] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +16: [2023-03-16 19:05:22,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... + 1: [2023-03-16 19:05:22,669] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +22: [2023-03-16 19:05:22,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +15: [2023-03-16 19:05:22,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,670] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +14: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +24: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +24: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +30: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +27: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +22: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +17: [2023-03-16 19:05:22,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +11: [2023-03-16 19:05:22,672] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +15: [2023-03-16 19:05:22,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +27: [2023-03-16 19:05:22,672] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +14: [2023-03-16 19:05:22,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +14: [2023-03-16 19:05:22,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,673] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +12: [2023-03-16 19:05:22,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +17: [2023-03-16 19:05:22,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +11: [2023-03-16 19:05:22,674] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +18: [2023-03-16 19:05:22,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +24: [2023-03-16 19:05:22,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +11: [2023-03-16 19:05:22,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,675] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +29: [2023-03-16 19:05:22,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +29: [2023-03-16 19:05:22,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +26: [2023-03-16 19:05:22,676] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +26: [2023-03-16 19:05:22,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +18: [2023-03-16 19:05:22,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +17: [2023-03-16 19:05:22,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +16: [2023-03-16 19:05:22,677] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +11: [2023-03-16 19:05:22,677] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +22: [2023-03-16 19:05:22,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,678] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +26: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +26: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +12: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +17: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +16: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +28: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +20: [2023-03-16 19:05:22,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... +22: [2023-03-16 19:05:22,680] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +26: [2023-03-16 19:05:22,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +12: [2023-03-16 19:05:22,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt... + 4: [2023-03-16 19:05:22,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +20: [2023-03-16 19:05:22,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +20: [2023-03-16 19:05:22,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +29: [2023-03-16 19:05:22,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,682] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +27: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +18: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +30: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +20: [2023-03-16 19:05:22,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +30: [2023-03-16 19:05:22,684] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. + 6: [2023-03-16 19:05:22,685] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +18: [2023-03-16 19:05:22,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +19: [2023-03-16 19:05:22,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,683] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +21: [2023-03-16 19:05:22,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +19: [2023-03-16 19:05:22,684] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_00-model_states.pt. +21: [2023-03-16 19:05:22,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +21: [2023-03-16 19:05:22,681] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +29: [2023-03-16 19:05:22,685] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,686] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. + 3: [2023-03-16 19:05:22,686] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,687] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +27: [2023-03-16 19:05:22,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_03-model_states.pt. +29: [2023-03-16 19:05:22,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:22,688] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:22,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +19: [2023-03-16 19:05:22,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +19: [2023-03-16 19:05:22,689] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt... +29: [2023-03-16 19:05:22,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +24: [2023-03-16 19:05:22,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +29: [2023-03-16 19:05:22,691] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +29: [2023-03-16 19:05:22,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:22,691] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +29: [2023-03-16 19:05:22,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +29: [2023-03-16 19:05:22,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +29: [2023-03-16 19:05:22,692] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:22,692] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +29: [2023-03-16 19:05:22,693] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +29: [2023-03-16 19:05:22,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +29: [2023-03-16 19:05:22,694] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +25: [2023-03-16 19:05:22,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,695] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +28: [2023-03-16 19:05:22,695] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +31: [2023-03-16 19:05:22,696] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +27: [2023-03-16 19:05:22,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,696] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +27: [2023-03-16 19:05:22,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,697] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,698] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +29: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_02-model_states.pt. +27: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +26: [2023-03-16 19:05:22,700] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +27: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +26: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +29: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +30: [2023-03-16 19:05:22,701] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +28: [2023-03-16 19:05:22,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,702] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,702] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,703] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +31: [2023-03-16 19:05:22,704] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt... +24: [2023-03-16 19:05:22,705] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,705] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +27: [2023-03-16 19:05:22,707] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:22,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:22,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +30: [2023-03-16 19:05:22,708] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:22,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_12_optim_states.pt... +29: [2023-03-16 19:05:22,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_12_optim_states.pt... +27: [2023-03-16 19:05:22,709] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:22,709] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +31: [2023-03-16 19:05:22,710] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:22,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,711] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:22,711] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +26: [2023-03-16 19:05:22,713] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +28: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +28: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:22,715] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +28: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,716] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_12_optim_states.pt... +27: [2023-03-16 19:05:22,717] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_12_optim_states.pt... +30: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:22,718] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +24: [2023-03-16 19:05:22,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:22,719] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:22,719] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:22,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:22,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:22,720] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:22,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +24: [2023-03-16 19:05:22,720] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +24: [2023-03-16 19:05:22,722] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,722] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +30: [2023-03-16 19:05:22,723] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:22,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,724] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_12_optim_states.pt... +25: [2023-03-16 19:05:22,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_12_optim_states.pt... + 0: [2023-03-16 19:05:22,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +26: [2023-03-16 19:05:22,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_12_optim_states.pt... +26: [2023-03-16 19:05:22,726] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_12_optim_states.pt... +24: [2023-03-16 19:05:22,727] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +28: [2023-03-16 19:05:22,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_12_optim_states.pt... +28: [2023-03-16 19:05:22,727] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_12_optim_states.pt... + 5: [2023-03-16 19:05:22,728] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +24: [2023-03-16 19:05:22,729] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,729] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. +30: [2023-03-16 19:05:22,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_12_optim_states.pt... +30: [2023-03-16 19:05:22,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_12_optim_states.pt... +24: [2023-03-16 19:05:22,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_12_optim_states.pt... +24: [2023-03-16 19:05:22,730] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_12_optim_states.pt... + 2: [2023-03-16 19:05:22,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 2: [2023-03-16 19:05:22,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 0: [2023-03-16 19:05:22,733] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 0: [2023-03-16 19:05:22,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 6: [2023-03-16 19:05:22,734] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 7: [2023-03-16 19:05:22,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 7: [2023-03-16 19:05:22,736] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 0: [2023-03-16 19:05:22,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +31: [2023-03-16 19:05:22,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,737] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 4: [2023-03-16 19:05:22,739] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 0: [2023-03-16 19:05:22,740] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +31: [2023-03-16 19:05:22,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,740] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,741] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,742] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,742] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +31: [2023-03-16 19:05:22,743] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... +25: [2023-03-16 19:05:22,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,745] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,746] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,746] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,748] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt... + 7: [2023-03-16 19:05:22,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,749] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,751] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,751] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,752] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,752] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,754] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +31: [2023-03-16 19:05:22,754] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,755] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,757] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,759] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,760] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 7: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 0: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +11: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,761] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +11: [2023-03-16 19:05:22,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,763] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 5: [2023-03-16 19:05:22,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,764] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,766] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,766] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +31: [2023-03-16 19:05:22,767] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,767] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,768] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,769] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +31: [2023-03-16 19:05:22,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_12_optim_states.pt... + 6: [2023-03-16 19:05:22,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 6: [2023-03-16 19:05:22,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,772] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,773] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,773] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,774] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,774] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,775] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 1: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +31: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_12_optim_states.pt... + 1: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... +11: [2023-03-16 19:05:22,776] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 9: [2023-03-16 19:05:22,777] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 3: [2023-03-16 19:05:22,777] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,778] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... +30: [2023-03-16 19:05:22,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +24: [2023-03-16 19:05:22,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +30: [2023-03-16 19:05:22,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,778] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +29: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +24: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +29: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +11: [2023-03-16 19:05:22,779] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +31: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +25: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,781] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +25: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 5: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +28: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 2: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,782] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 2: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 7: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 7: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... +26: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +26: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +28: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 4: [2023-03-16 19:05:22,783] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,785] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 8: [2023-03-16 19:05:22,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 5: [2023-03-16 19:05:22,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 4: [2023-03-16 19:05:22,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,786] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... +27: [2023-03-16 19:05:22,786] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. +27: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_43-model_03-model_states.pt. + 4: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt... + 4: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 4: [2023-03-16 19:05:22,787] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,789] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 6: [2023-03-16 19:05:22,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... +29: [2023-03-16 19:05:22,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +29: [2023-03-16 19:05:22,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +24: [2023-03-16 19:05:22,792] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +30: [2023-03-16 19:05:22,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +24: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +12: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +31: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +12: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt... + 3: [2023-03-16 19:05:22,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +25: [2023-03-16 19:05:22,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +28: [2023-03-16 19:05:22,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,795] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +25: [2023-03-16 19:05:22,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +26: [2023-03-16 19:05:22,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +31: [2023-03-16 19:05:22,796] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,797] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +28: [2023-03-16 19:05:22,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,799] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +27: [2023-03-16 19:05:22,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +27: [2023-03-16 19:05:22,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt... +11: [2023-03-16 19:05:22,800] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,803] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +11: [2023-03-16 19:05:22,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... + 1: [2023-03-16 19:05:22,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 7: [2023-03-16 19:05:22,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. +12: [2023-03-16 19:05:22,806] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +12: [2023-03-16 19:05:22,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,807] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,809] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +11: [2023-03-16 19:05:22,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 7: [2023-03-16 19:05:22,811] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. +11: [2023-03-16 19:05:22,813] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +13: [2023-03-16 19:05:22,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +13: [2023-03-16 19:05:22,814] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. + 1: [2023-03-16 19:05:22,816] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,816] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,818] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,819] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +28: [2023-03-16 19:05:22,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,820] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,820] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +28: [2023-03-16 19:05:22,821] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +30: [2023-03-16 19:05:22,821] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +11: [2023-03-16 19:05:22,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +24: [2023-03-16 19:05:22,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +29: [2023-03-16 19:05:22,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +29: [2023-03-16 19:05:22,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +11: [2023-03-16 19:05:22,823] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +30: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +31: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +25: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,824] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +11: [2023-03-16 19:05:22,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +11: [2023-03-16 19:05:22,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +29: [2023-03-16 19:05:22,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +29: [2023-03-16 19:05:22,825] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +24: [2023-03-16 19:05:22,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +31: [2023-03-16 19:05:22,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +25: [2023-03-16 19:05:22,826] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +13: [2023-03-16 19:05:22,827] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +26: [2023-03-16 19:05:22,827] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +30: [2023-03-16 19:05:22,828] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +13: [2023-03-16 19:05:22,829] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +31: [2023-03-16 19:05:22,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +26: [2023-03-16 19:05:22,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +26: [2023-03-16 19:05:22,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +24: [2023-03-16 19:05:22,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. +26: [2023-03-16 19:05:22,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +31: [2023-03-16 19:05:22,831] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... + 3: [2023-03-16 19:05:22,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. +24: [2023-03-16 19:05:22,832] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +25: [2023-03-16 19:05:22,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,833] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +27: [2023-03-16 19:05:22,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +27: [2023-03-16 19:05:22,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +28: [2023-03-16 19:05:22,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_00-model_states.pt. +25: [2023-03-16 19:05:22,835] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +12: [2023-03-16 19:05:22,835] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,836] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +28: [2023-03-16 19:05:22,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +27: [2023-03-16 19:05:22,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +27: [2023-03-16 19:05:22,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt... +12: [2023-03-16 19:05:22,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +12: [2023-03-16 19:05:22,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +12: [2023-03-16 19:05:22,839] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... + 0: [2023-03-16 19:05:22,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 5: [2023-03-16 19:05:22,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 0: [2023-03-16 19:05:22,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 5: [2023-03-16 19:05:22,840] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 4: [2023-03-16 19:05:22,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 4: [2023-03-16 19:05:22,843] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 2: [2023-03-16 19:05:22,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 1: [2023-03-16 19:05:22,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 1: [2023-03-16 19:05:22,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_09-model_02-model_states.pt. + 3: [2023-03-16 19:05:22,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,847] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,851] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +21: [2023-03-16 19:05:22,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:22,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:22,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +21: [2023-03-16 19:05:22,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +11: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 6: [2023-03-16 19:05:22,853] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +19: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 0: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +21: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +19: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +13: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +21: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +11: [2023-03-16 19:05:22,854] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 6: [2023-03-16 19:05:22,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 7: [2023-03-16 19:05:22,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 7: [2023-03-16 19:05:22,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +13: [2023-03-16 19:05:22,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... + 2: [2023-03-16 19:05:22,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,857] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... +13: [2023-03-16 19:05:22,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 1: [2023-03-16 19:05:22,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt... + 7: [2023-03-16 19:05:22,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 7: [2023-03-16 19:05:22,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +13: [2023-03-16 19:05:22,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +11: [2023-03-16 19:05:22,864] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +11: [2023-03-16 19:05:22,866] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:22,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +10: [2023-03-16 19:05:22,870] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:22,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +18: [2023-03-16 19:05:22,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +12: [2023-03-16 19:05:22,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +12: [2023-03-16 19:05:22,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +19: [2023-03-16 19:05:22,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +12: [2023-03-16 19:05:22,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +12: [2023-03-16 19:05:22,877] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 3: [2023-03-16 19:05:22,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 3: [2023-03-16 19:05:22,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +21: [2023-03-16 19:05:22,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,881] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,882] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +21: [2023-03-16 19:05:22,883] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 5: [2023-03-16 19:05:22,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. +18: [2023-03-16 19:05:22,885] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +19: [2023-03-16 19:05:22,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 2: [2023-03-16 19:05:22,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 4: [2023-03-16 19:05:22,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 0: [2023-03-16 19:05:22,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 0: [2023-03-16 19:05:22,887] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 5: [2023-03-16 19:05:22,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +19: [2023-03-16 19:05:22,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,888] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +19: [2023-03-16 19:05:22,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. + 4: [2023-03-16 19:05:22,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 2: [2023-03-16 19:05:22,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 2: [2023-03-16 19:05:22,890] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +19: [2023-03-16 19:05:22,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +10: [2023-03-16 19:05:22,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 4: [2023-03-16 19:05:22,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 1: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... + 1: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt... +13: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +12: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +14: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +14: [2023-03-16 19:05:22,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +14: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +11: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +12: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +14: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +21: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +11: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. + 9: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +21: [2023-03-16 19:05:22,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +13: [2023-03-16 19:05:22,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +13: [2023-03-16 19:05:22,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. + 8: [2023-03-16 19:05:22,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +13: [2023-03-16 19:05:22,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +19: [2023-03-16 19:05:22,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. + 9: [2023-03-16 19:05:22,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,898] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +15: [2023-03-16 19:05:22,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +15: [2023-03-16 19:05:22,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_03-model_states.pt. +15: [2023-03-16 19:05:22,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +15: [2023-03-16 19:05:22,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +19: [2023-03-16 19:05:22,902] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +10: [2023-03-16 19:05:22,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +19: [2023-03-16 19:05:22,903] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +15: [2023-03-16 19:05:22,904] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:22,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +13: [2023-03-16 19:05:22,905] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +15: [2023-03-16 19:05:22,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +12: [2023-03-16 19:05:22,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +10: [2023-03-16 19:05:22,908] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +12: [2023-03-16 19:05:22,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +12: [2023-03-16 19:05:22,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +10: [2023-03-16 19:05:22,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +11: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +14: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +11: [2023-03-16 19:05:22,910] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:22,911] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +15: [2023-03-16 19:05:22,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:22,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:22,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +12: [2023-03-16 19:05:22,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +18: [2023-03-16 19:05:22,912] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:22,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +13: [2023-03-16 19:05:22,913] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:22,914] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,914] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,916] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:22,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:22,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:22,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:22,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +18: [2023-03-16 19:05:22,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +18: [2023-03-16 19:05:22,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +12: [2023-03-16 19:05:22,924] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +19: [2023-03-16 19:05:22,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:22,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +16: [2023-03-16 19:05:22,925] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +16: [2023-03-16 19:05:22,926] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +16: [2023-03-16 19:05:22,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +18: [2023-03-16 19:05:22,927] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:22,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,928] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +11: [2023-03-16 19:05:22,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. + 9: [2023-03-16 19:05:22,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +11: [2023-03-16 19:05:22,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +19: [2023-03-16 19:05:22,932] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +13: [2023-03-16 19:05:22,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,933] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +11: [2023-03-16 19:05:22,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +19: [2023-03-16 19:05:22,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +14: [2023-03-16 19:05:22,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +22: [2023-03-16 19:05:22,934] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +14: [2023-03-16 19:05:22,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +22: [2023-03-16 19:05:22,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +11: [2023-03-16 19:05:22,935] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +15: [2023-03-16 19:05:22,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +23: [2023-03-16 19:05:22,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:22,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +22: [2023-03-16 19:05:22,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +15: [2023-03-16 19:05:22,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +10: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +15: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +23: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +10: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +22: [2023-03-16 19:05:22,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +15: [2023-03-16 19:05:22,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,940] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:22,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +20: [2023-03-16 19:05:22,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +20: [2023-03-16 19:05:22,940] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:22,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +11: [2023-03-16 19:05:22,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,942] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +11: [2023-03-16 19:05:22,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +20: [2023-03-16 19:05:22,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +11: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +15: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +20: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +12: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +17: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +10: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +14: [2023-03-16 19:05:22,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +17: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +22: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. + 9: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,943] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +11: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +22: [2023-03-16 19:05:22,945] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +15: [2023-03-16 19:05:22,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +14: [2023-03-16 19:05:22,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +17: [2023-03-16 19:05:22,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +12: [2023-03-16 19:05:22,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. + 8: [2023-03-16 19:05:22,946] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +17: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +13: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +17: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +14: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +17: [2023-03-16 19:05:22,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +14: [2023-03-16 19:05:22,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +14: [2023-03-16 19:05:22,948] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:22,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +14: [2023-03-16 19:05:22,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +14: [2023-03-16 19:05:22,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +16: [2023-03-16 19:05:22,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +16: [2023-03-16 19:05:22,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +12: [2023-03-16 19:05:22,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... + 9: [2023-03-16 19:05:22,949] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +13: [2023-03-16 19:05:22,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:22,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +10: [2023-03-16 19:05:22,951] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +21: [2023-03-16 19:05:22,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +21: [2023-03-16 19:05:22,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +15: [2023-03-16 19:05:22,952] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +23: [2023-03-16 19:05:22,952] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +23: [2023-03-16 19:05:22,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +20: [2023-03-16 19:05:22,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +20: [2023-03-16 19:05:22,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_01-model_states.pt. +13: [2023-03-16 19:05:22,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +13: [2023-03-16 19:05:22,953] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +15: [2023-03-16 19:05:22,954] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... + 8: [2023-03-16 19:05:22,955] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +13: [2023-03-16 19:05:22,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +13: [2023-03-16 19:05:22,956] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +12: [2023-03-16 19:05:22,957] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +22: [2023-03-16 19:05:22,958] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +22: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +13: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +13: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... + 9: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +17: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +16: [2023-03-16 19:05:22,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:22,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +16: [2023-03-16 19:05:22,960] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:22,960] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 9: [2023-03-16 19:05:22,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt... +14: [2023-03-16 19:05:22,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +14: [2023-03-16 19:05:22,961] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +16: [2023-03-16 19:05:22,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +16: [2023-03-16 19:05:22,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:22,964] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:22,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +21: [2023-03-16 19:05:22,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +21: [2023-03-16 19:05:22,966] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:22,967] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 8: [2023-03-16 19:05:22,969] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. + 8: [2023-03-16 19:05:22,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +29: [2023-03-16 19:05:22,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +29: [2023-03-16 19:05:22,971] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +16: [2023-03-16 19:05:22,972] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +23: [2023-03-16 19:05:22,973] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,974] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +22: [2023-03-16 19:05:22,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +22: [2023-03-16 19:05:22,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +11: [2023-03-16 19:05:22,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,975] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +10: [2023-03-16 19:05:22,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +16: [2023-03-16 19:05:22,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +10: [2023-03-16 19:05:22,976] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +31: [2023-03-16 19:05:22,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +31: [2023-03-16 19:05:22,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +20: [2023-03-16 19:05:22,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:22,977] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +11: [2023-03-16 19:05:22,977] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +11: [2023-03-16 19:05:22,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +27: [2023-03-16 19:05:22,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +27: [2023-03-16 19:05:22,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +11: [2023-03-16 19:05:22,980] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +23: [2023-03-16 19:05:22,980] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +20: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +24: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +26: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +10: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +10: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... + 8: [2023-03-16 19:05:22,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +17: [2023-03-16 19:05:22,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +30: [2023-03-16 19:05:22,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +30: [2023-03-16 19:05:22,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +10: [2023-03-16 19:05:22,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +10: [2023-03-16 19:05:22,982] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +17: [2023-03-16 19:05:22,983] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +26: [2023-03-16 19:05:22,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +14: [2023-03-16 19:05:22,983] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +24: [2023-03-16 19:05:22,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +14: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +29: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +29: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 8: [2023-03-16 19:05:22,985] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +12: [2023-03-16 19:05:22,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +28: [2023-03-16 19:05:22,986] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +23: [2023-03-16 19:05:22,986] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +25: [2023-03-16 19:05:22,987] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. + 5: [2023-03-16 19:05:22,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +25: [2023-03-16 19:05:22,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +12: [2023-03-16 19:05:22,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +21: [2023-03-16 19:05:22,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +21: [2023-03-16 19:05:22,988] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +22: [2023-03-16 19:05:22,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 5: [2023-03-16 19:05:22,989] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +14: [2023-03-16 19:05:22,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +12: [2023-03-16 19:05:22,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +14: [2023-03-16 19:05:22,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +22: [2023-03-16 19:05:22,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +27: [2023-03-16 19:05:22,990] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +17: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +20: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +28: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_02-model_states.pt. +12: [2023-03-16 19:05:22,991] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +27: [2023-03-16 19:05:22,992] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +22: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +31: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +31: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 9: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +15: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +26: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +17: [2023-03-16 19:05:22,993] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +15: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_02-model_states.pt. +10: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +23: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +26: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +17: [2023-03-16 19:05:22,994] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +24: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +17: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +20: [2023-03-16 19:05:22,995] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +26: [2023-03-16 19:05:22,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +10: [2023-03-16 19:05:22,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +26: [2023-03-16 19:05:22,996] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +20: [2023-03-16 19:05:22,996] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +30: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +22: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +21: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +30: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +28: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +23: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:22,997] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +31: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +31: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +20: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +26: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +28: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +14: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +29: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +29: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +16: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +24: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +22: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +24: [2023-03-16 19:05:22,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +16: [2023-03-16 19:05:22,998] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 3: [2023-03-16 19:05:22,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +21: [2023-03-16 19:05:22,999] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +21: [2023-03-16 19:05:22,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +25: [2023-03-16 19:05:22,999] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:23,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +14: [2023-03-16 19:05:23,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +25: [2023-03-16 19:05:23,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:23,000] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +21: [2023-03-16 19:05:23,000] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +16: [2023-03-16 19:05:23,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +14: [2023-03-16 19:05:23,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +16: [2023-03-16 19:05:23,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +23: [2023-03-16 19:05:23,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +23: [2023-03-16 19:05:23,001] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +30: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +30: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +21: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +14: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +27: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +27: [2023-03-16 19:05:23,002] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +21: [2023-03-16 19:05:23,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +28: [2023-03-16 19:05:23,003] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +28: [2023-03-16 19:05:23,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +28: [2023-03-16 19:05:23,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:23,004] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +20: [2023-03-16 19:05:23,004] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +25: [2023-03-16 19:05:23,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... +25: [2023-03-16 19:05:23,005] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt... + 9: [2023-03-16 19:05:23,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:23,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +20: [2023-03-16 19:05:23,006] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt... +15: [2023-03-16 19:05:23,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,007] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +23: [2023-03-16 19:05:23,008] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +29: [2023-03-16 19:05:23,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +29: [2023-03-16 19:05:23,009] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,010] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +20: [2023-03-16 19:05:23,010] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +23: [2023-03-16 19:05:23,011] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +20: [2023-03-16 19:05:23,011] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,012] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,012] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,015] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +27: [2023-03-16 19:05:23,015] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +24: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +24: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +31: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +26: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +27: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +30: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +31: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +27: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +30: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +27: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +28: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +29: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +26: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +28: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +29: [2023-03-16 19:05:23,016] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. + 8: [2023-03-16 19:05:23,017] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +25: [2023-03-16 19:05:23,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. +25: [2023-03-16 19:05:23,019] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_02-model_states.pt. + 8: [2023-03-16 19:05:23,020] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +29: [2023-03-16 19:05:23,021] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +15: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +29: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +29: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +15: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_21-model_01-model_states.pt. +29: [2023-03-16 19:05:23,022] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +20: [2023-03-16 19:05:23,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +28: [2023-03-16 19:05:23,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_14_optim_states.pt... +28: [2023-03-16 19:05:23,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_14_optim_states.pt... +20: [2023-03-16 19:05:23,023] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +30: [2023-03-16 19:05:23,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_14_optim_states.pt... +30: [2023-03-16 19:05:23,024] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_14_optim_states.pt... +29: [2023-03-16 19:05:23,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_14_optim_states.pt... +29: [2023-03-16 19:05:23,026] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_14_optim_states.pt... +17: [2023-03-16 19:05:23,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +17: [2023-03-16 19:05:23,026] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +10: [2023-03-16 19:05:23,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +10: [2023-03-16 19:05:23,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +27: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_14_optim_states.pt... +31: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_14_optim_states.pt... +27: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +27: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +10: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +26: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_14_optim_states.pt... +10: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +26: [2023-03-16 19:05:23,029] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_14_optim_states.pt... + 1: [2023-03-16 19:05:23,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +14: [2023-03-16 19:05:23,031] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +27: [2023-03-16 19:05:23,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +27: [2023-03-16 19:05:23,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +27: [2023-03-16 19:05:23,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_14_optim_states.pt... +27: [2023-03-16 19:05:23,032] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_14_optim_states.pt... +23: [2023-03-16 19:05:23,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +14: [2023-03-16 19:05:23,033] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +23: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +25: [2023-03-16 19:05:23,034] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +25: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +15: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +27: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. + 6: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +27: [2023-03-16 19:05:23,035] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. + 6: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +14: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +14: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +15: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt... +21: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +25: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_14_optim_states.pt... +25: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_14_optim_states.pt... +15: [2023-03-16 19:05:23,036] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +16: [2023-03-16 19:05:23,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +23: [2023-03-16 19:05:23,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:23,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +23: [2023-03-16 19:05:23,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. + 9: [2023-03-16 19:05:23,037] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +16: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +17: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +21: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +26: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +15: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +17: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +29: [2023-03-16 19:05:23,038] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +17: [2023-03-16 19:05:23,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +26: [2023-03-16 19:05:23,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +23: [2023-03-16 19:05:23,039] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +29: [2023-03-16 19:05:23,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. + 9: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +21: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +21: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +28: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +17: [2023-03-16 19:05:23,040] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +28: [2023-03-16 19:05:23,041] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +24: [2023-03-16 19:05:23,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_14_optim_states.pt... +24: [2023-03-16 19:05:23,041] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_14_optim_states.pt... + 9: [2023-03-16 19:05:23,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,042] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +20: [2023-03-16 19:05:23,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +20: [2023-03-16 19:05:23,042] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,043] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +22: [2023-03-16 19:05:23,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +15: [2023-03-16 19:05:23,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,044] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +22: [2023-03-16 19:05:23,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +24: [2023-03-16 19:05:23,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +31: [2023-03-16 19:05:23,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +24: [2023-03-16 19:05:23,043] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +31: [2023-03-16 19:05:23,044] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. + 7: [2023-03-16 19:05:23,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +29: [2023-03-16 19:05:23,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_13_optim_states.pt... +29: [2023-03-16 19:05:23,045] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_13_optim_states.pt... +22: [2023-03-16 19:05:23,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +22: [2023-03-16 19:05:23,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +25: [2023-03-16 19:05:23,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +15: [2023-03-16 19:05:23,046] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt... +25: [2023-03-16 19:05:23,046] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +25: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_01-model_states.pt. +25: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +25: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... + 7: [2023-03-16 19:05:23,047] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +16: [2023-03-16 19:05:23,048] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +23: [2023-03-16 19:05:23,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:23,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +26: [2023-03-16 19:05:23,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +17: [2023-03-16 19:05:23,050] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +26: [2023-03-16 19:05:23,051] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:23,051] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +28: [2023-03-16 19:05:23,052] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +28: [2023-03-16 19:05:23,052] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +16: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +25: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +17: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +25: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +26: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +20: [2023-03-16 19:05:23,053] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +26: [2023-03-16 19:05:23,054] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:23,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +28: [2023-03-16 19:05:23,054] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +26: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +20: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +20: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +26: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +27: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_13_optim_states.pt... +27: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_13_optim_states.pt... +20: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +20: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +24: [2023-03-16 19:05:23,055] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +31: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +31: [2023-03-16 19:05:23,057] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +20: [2023-03-16 19:05:23,057] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +24: [2023-03-16 19:05:23,056] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +22: [2023-03-16 19:05:23,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +22: [2023-03-16 19:05:23,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +28: [2023-03-16 19:05:23,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +28: [2023-03-16 19:05:23,058] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +24: [2023-03-16 19:05:23,058] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +24: [2023-03-16 19:05:23,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +28: [2023-03-16 19:05:23,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:23,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +28: [2023-03-16 19:05:23,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +30: [2023-03-16 19:05:23,059] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +22: [2023-03-16 19:05:23,060] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +30: [2023-03-16 19:05:23,060] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:23,061] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +31: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +31: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... +22: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +30: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +15: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +30: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:23,062] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt... + 2: [2023-03-16 19:05:23,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,063] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_03-model_states.pt. +30: [2023-03-16 19:05:23,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +30: [2023-03-16 19:05:23,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +31: [2023-03-16 19:05:23,064] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +26: [2023-03-16 19:05:23,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_13_optim_states.pt... +26: [2023-03-16 19:05:23,064] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_13_optim_states.pt... +31: [2023-03-16 19:05:23,065] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_01-model_states.pt. +15: [2023-03-16 19:05:23,066] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +17: [2023-03-16 19:05:23,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +15: [2023-03-16 19:05:23,066] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_00-model_states.pt. +17: [2023-03-16 19:05:23,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +28: [2023-03-16 19:05:23,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_13_optim_states.pt... +28: [2023-03-16 19:05:23,068] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_13_optim_states.pt... +15: [2023-03-16 19:05:23,069] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt... +21: [2023-03-16 19:05:23,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +21: [2023-03-16 19:05:23,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +18: [2023-03-16 19:05:23,070] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +18: [2023-03-16 19:05:23,071] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +25: [2023-03-16 19:05:23,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_13_optim_states.pt... +25: [2023-03-16 19:05:23,072] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_13_optim_states.pt... +31: [2023-03-16 19:05:23,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_13_optim_states.pt... +31: [2023-03-16 19:05:23,073] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_13_optim_states.pt... +19: [2023-03-16 19:05:23,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +17: [2023-03-16 19:05:23,073] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:23,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_03-model_states.pt. +30: [2023-03-16 19:05:23,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_13_optim_states.pt... +30: [2023-03-16 19:05:23,074] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_13_optim_states.pt... +16: [2023-03-16 19:05:23,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:23,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:23,076] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 2: [2023-03-16 19:05:23,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +16: [2023-03-16 19:05:23,076] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +17: [2023-03-16 19:05:23,077] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +18: [2023-03-16 19:05:23,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +18: [2023-03-16 19:05:23,077] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,078] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +12: [2023-03-16 19:05:23,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +12: [2023-03-16 19:05:23,078] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +23: [2023-03-16 19:05:23,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +23: [2023-03-16 19:05:23,081] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:23,082] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +21: [2023-03-16 19:05:23,082] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +18: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +21: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +23: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +16: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +16: [2023-03-16 19:05:23,083] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +18: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +16: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +24: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_13_optim_states.pt... +24: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_13_optim_states.pt... +16: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +17: [2023-03-16 19:05:23,084] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +22: [2023-03-16 19:05:23,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +20: [2023-03-16 19:05:23,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:23,085] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_33-model_02-model_states.pt. +22: [2023-03-16 19:05:23,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +17: [2023-03-16 19:05:23,086] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +20: [2023-03-16 19:05:23,086] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +20: [2023-03-16 19:05:23,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +20: [2023-03-16 19:05:23,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +18: [2023-03-16 19:05:23,089] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +19: [2023-03-16 19:05:23,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +19: [2023-03-16 19:05:23,087] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +12: [2023-03-16 19:05:23,090] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:23,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +12: [2023-03-16 19:05:23,091] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +22: [2023-03-16 19:05:23,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +22: [2023-03-16 19:05:23,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +22: [2023-03-16 19:05:23,093] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:23,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +16: [2023-03-16 19:05:23,095] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. +22: [2023-03-16 19:05:23,096] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,096] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. +22: [2023-03-16 19:05:23,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +22: [2023-03-16 19:05:23,097] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +16: [2023-03-16 19:05:23,098] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... +19: [2023-03-16 19:05:23,099] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,110] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +21: [2023-03-16 19:05:23,110] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +21: [2023-03-16 19:05:23,111] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +12: [2023-03-16 19:05:23,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +21: [2023-03-16 19:05:23,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,112] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,113] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +12: [2023-03-16 19:05:23,113] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +21: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +11: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,114] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,115] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +18: [2023-03-16 19:05:23,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +18: [2023-03-16 19:05:23,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +11: [2023-03-16 19:05:23,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. + 7: [2023-03-16 19:05:23,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,117] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... +19: [2023-03-16 19:05:23,117] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,118] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... +18: [2023-03-16 19:05:23,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +18: [2023-03-16 19:05:23,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +19: [2023-03-16 19:05:23,119] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +18: [2023-03-16 19:05:23,119] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:23,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +18: [2023-03-16 19:05:23,121] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +12: [2023-03-16 19:05:23,122] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +18: [2023-03-16 19:05:23,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... +18: [2023-03-16 19:05:23,123] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +16: [2023-03-16 19:05:23,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,124] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +11: [2023-03-16 19:05:23,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +16: [2023-03-16 19:05:23,126] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 8: [2023-03-16 19:05:23,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +19: [2023-03-16 19:05:23,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +19: [2023-03-16 19:05:23,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. +13: [2023-03-16 19:05:23,127] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +16: [2023-03-16 19:05:23,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +13: [2023-03-16 19:05:23,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. + 9: [2023-03-16 19:05:23,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +19: [2023-03-16 19:05:23,128] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,129] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +11: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +16: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +19: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 7: [2023-03-16 19:05:23,130] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 4: [2023-03-16 19:05:23,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 4: [2023-03-16 19:05:23,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,132] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... +19: [2023-03-16 19:05:23,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... +19: [2023-03-16 19:05:23,134] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,135] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 5: [2023-03-16 19:05:23,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,137] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 3: [2023-03-16 19:05:23,137] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,138] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +24: [2023-03-16 19:05:23,138] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +24: [2023-03-16 19:05:23,139] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 8: [2023-03-16 19:05:23,139] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:23,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +13: [2023-03-16 19:05:23,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,140] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,141] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,142] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,143] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,144] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,146] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 2: [2023-03-16 19:05:23,148] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,148] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +11: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +23: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,149] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +23: [2023-03-16 19:05:23,150] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,150] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +23: [2023-03-16 19:05:23,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +23: [2023-03-16 19:05:23,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,152] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,153] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +24: [2023-03-16 19:05:23,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +11: [2023-03-16 19:05:23,154] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +24: [2023-03-16 19:05:23,154] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +24: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 6: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 1: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +24: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +24: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,156] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,157] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,158] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,158] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 4: [2023-03-16 19:05:23,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,159] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,160] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,161] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 6: [2023-03-16 19:05:23,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +28: [2023-03-16 19:05:23,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +11: [2023-03-16 19:05:23,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,163] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +28: [2023-03-16 19:05:23,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +31: [2023-03-16 19:05:23,163] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +31: [2023-03-16 19:05:23,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +24: [2023-03-16 19:05:23,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +24: [2023-03-16 19:05:23,164] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +11: [2023-03-16 19:05:23,164] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +25: [2023-03-16 19:05:23,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 7: [2023-03-16 19:05:23,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,165] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. +25: [2023-03-16 19:05:23,166] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 8: [2023-03-16 19:05:23,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +26: [2023-03-16 19:05:23,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +13: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. +26: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 5: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,169] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +13: [2023-03-16 19:05:23,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,170] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +13: [2023-03-16 19:05:23,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +29: [2023-03-16 19:05:23,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +29: [2023-03-16 19:05:23,171] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,172] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +19: [2023-03-16 19:05:23,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +19: [2023-03-16 19:05:23,172] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +13: [2023-03-16 19:05:23,173] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 8: [2023-03-16 19:05:23,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +30: [2023-03-16 19:05:23,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +19: [2023-03-16 19:05:23,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +31: [2023-03-16 19:05:23,175] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +19: [2023-03-16 19:05:23,174] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +28: [2023-03-16 19:05:23,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,176] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... +28: [2023-03-16 19:05:23,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +28: [2023-03-16 19:05:23,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:23,177] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:23,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +28: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,179] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +31: [2023-03-16 19:05:23,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +31: [2023-03-16 19:05:23,178] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +31: [2023-03-16 19:05:23,179] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +26: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +26: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +25: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +26: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +23: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +25: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +30: [2023-03-16 19:05:23,181] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_44-model_03-model_states.pt. +25: [2023-03-16 19:05:23,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +25: [2023-03-16 19:05:23,181] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +26: [2023-03-16 19:05:23,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +26: [2023-03-16 19:05:23,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +26: [2023-03-16 19:05:23,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... + 1: [2023-03-16 19:05:23,182] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +18: [2023-03-16 19:05:23,182] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +18: [2023-03-16 19:05:23,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,183] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +28: [2023-03-16 19:05:23,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +28: [2023-03-16 19:05:23,183] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +26: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +18: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +26: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,184] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +29: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +18: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +29: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +29: [2023-03-16 19:05:23,185] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +29: [2023-03-16 19:05:23,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +31: [2023-03-16 19:05:23,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +31: [2023-03-16 19:05:23,186] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... + 1: [2023-03-16 19:05:23,186] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,187] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +23: [2023-03-16 19:05:23,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +25: [2023-03-16 19:05:23,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +25: [2023-03-16 19:05:23,188] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,188] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,189] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,190] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... +31: [2023-03-16 19:05:23,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +31: [2023-03-16 19:05:23,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,190] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +27: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +27: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +27: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +30: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +27: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +24: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_15_optim_states.pt... +24: [2023-03-16 19:05:23,191] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_15_optim_states.pt... + 2: [2023-03-16 19:05:23,193] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +28: [2023-03-16 19:05:23,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. + 1: [2023-03-16 19:05:23,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,193] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_10-model_02-model_states.pt. +27: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +27: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +29: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +29: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... + 2: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +30: [2023-03-16 19:05:23,194] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt... +30: [2023-03-16 19:05:23,195] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_00-model_states.pt. +25: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_15_optim_states.pt... +25: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_15_optim_states.pt... +27: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +27: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +29: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +29: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,196] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +31: [2023-03-16 19:05:23,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_15_optim_states.pt... + 6: [2023-03-16 19:05:23,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +31: [2023-03-16 19:05:23,197] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_15_optim_states.pt... +28: [2023-03-16 19:05:23,197] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,199] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,200] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,201] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +30: [2023-03-16 19:05:23,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... +19: [2023-03-16 19:05:23,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,203] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +27: [2023-03-16 19:05:23,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_15_optim_states.pt... +27: [2023-03-16 19:05:23,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_15_optim_states.pt... + 4: [2023-03-16 19:05:23,204] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... +23: [2023-03-16 19:05:23,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_08_optim_states.pt... +23: [2023-03-16 19:05:23,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_08_optim_states.pt... + 0: [2023-03-16 19:05:23,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... +30: [2023-03-16 19:05:23,205] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,206] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,206] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,207] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,207] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +29: [2023-03-16 19:05:23,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_15_optim_states.pt... +29: [2023-03-16 19:05:23,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_15_optim_states.pt... + 1: [2023-03-16 19:05:23,208] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... +30: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. + 5: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +21: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +30: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_46-model_03-model_states.pt. +21: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,209] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 7: [2023-03-16 19:05:23,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... +21: [2023-03-16 19:05:23,210] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,210] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +21: [2023-03-16 19:05:23,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +28: [2023-03-16 19:05:23,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_15_optim_states.pt... +28: [2023-03-16 19:05:23,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_15_optim_states.pt... + 5: [2023-03-16 19:05:23,211] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 6: [2023-03-16 19:05:23,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +18: [2023-03-16 19:05:23,213] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 6: [2023-03-16 19:05:23,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,214] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... +18: [2023-03-16 19:05:23,215] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,216] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +16: [2023-03-16 19:05:23,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +16: [2023-03-16 19:05:23,217] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +22: [2023-03-16 19:05:23,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,220] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... +30: [2023-03-16 19:05:23,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_15_optim_states.pt... +30: [2023-03-16 19:05:23,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_15_optim_states.pt... +26: [2023-03-16 19:05:23,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_15_optim_states.pt... +26: [2023-03-16 19:05:23,221] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_15_optim_states.pt... + 2: [2023-03-16 19:05:23,223] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +17: [2023-03-16 19:05:23,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,227] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,227] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +19: [2023-03-16 19:05:23,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_08_optim_states.pt... +19: [2023-03-16 19:05:23,230] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_08_optim_states.pt... + 2: [2023-03-16 19:05:23,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 2: [2023-03-16 19:05:23,231] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 2: [2023-03-16 19:05:23,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 2: [2023-03-16 19:05:23,232] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,235] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,236] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +21: [2023-03-16 19:05:23,237] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +18: [2023-03-16 19:05:23,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_08_optim_states.pt... +18: [2023-03-16 19:05:23,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_08_optim_states.pt... + 3: [2023-03-16 19:05:23,238] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,239] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... +11: [2023-03-16 19:05:23,239] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +13: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +13: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,241] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_00-model_states.pt. +12: [2023-03-16 19:05:23,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +11: [2023-03-16 19:05:23,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,241] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +11: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +13: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +12: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +15: [2023-03-16 19:05:23,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +21: [2023-03-16 19:05:23,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... + 1: [2023-03-16 19:05:23,244] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt... +19: [2023-03-16 19:05:23,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +19: [2023-03-16 19:05:23,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +16: [2023-03-16 19:05:23,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +18: [2023-03-16 19:05:23,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +18: [2023-03-16 19:05:23,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +10: [2023-03-16 19:05:23,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +10: [2023-03-16 19:05:23,248] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +22: [2023-03-16 19:05:23,249] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +15: [2023-03-16 19:05:23,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +16: [2023-03-16 19:05:23,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +10: [2023-03-16 19:05:23,250] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +22: [2023-03-16 19:05:23,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +10: [2023-03-16 19:05:23,251] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,253] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,254] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +17: [2023-03-16 19:05:23,255] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +15: [2023-03-16 19:05:23,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,255] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +17: [2023-03-16 19:05:23,256] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +15: [2023-03-16 19:05:23,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +15: [2023-03-16 19:05:23,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +19: [2023-03-16 19:05:23,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +19: [2023-03-16 19:05:23,259] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +21: [2023-03-16 19:05:23,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_08_optim_states.pt... +21: [2023-03-16 19:05:23,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_08_optim_states.pt... +18: [2023-03-16 19:05:23,260] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +11: [2023-03-16 19:05:23,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +16: [2023-03-16 19:05:23,265] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_08_optim_states.pt... +13: [2023-03-16 19:05:23,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +16: [2023-03-16 19:05:23,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_08_optim_states.pt... + 8: [2023-03-16 19:05:23,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +15: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +10: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +14: [2023-03-16 19:05:23,271] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +13: [2023-03-16 19:05:23,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +14: [2023-03-16 19:05:23,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +10: [2023-03-16 19:05:23,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_22-model_01-model_states.pt. +12: [2023-03-16 19:05:23,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,278] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +10: [2023-03-16 19:05:23,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +20: [2023-03-16 19:05:23,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +20: [2023-03-16 19:05:23,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +10: [2023-03-16 19:05:23,281] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... + 8: [2023-03-16 19:05:23,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. + 8: [2023-03-16 19:05:23,274] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +10: [2023-03-16 19:05:23,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +22: [2023-03-16 19:05:23,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_08_optim_states.pt... +22: [2023-03-16 19:05:23,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_08_optim_states.pt... +12: [2023-03-16 19:05:23,282] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_04_optim_states.pt... +20: [2023-03-16 19:05:23,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_08_optim_states.pt... +17: [2023-03-16 19:05:23,283] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_08_optim_states.pt... +20: [2023-03-16 19:05:23,284] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,284] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +18: [2023-03-16 19:05:23,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... + 8: [2023-03-16 19:05:23,286] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +19: [2023-03-16 19:05:23,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,287] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +14: [2023-03-16 19:05:23,287] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +15: [2023-03-16 19:05:23,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +15: [2023-03-16 19:05:23,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,288] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +14: [2023-03-16 19:05:23,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +18: [2023-03-16 19:05:23,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,289] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +19: [2023-03-16 19:05:23,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +15: [2023-03-16 19:05:23,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +15: [2023-03-16 19:05:23,290] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +18: [2023-03-16 19:05:23,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +10: [2023-03-16 19:05:23,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt... +11: [2023-03-16 19:05:23,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_04_optim_states.pt... +11: [2023-03-16 19:05:23,291] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_04_optim_states.pt... +19: [2023-03-16 19:05:23,292] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +15: [2023-03-16 19:05:23,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,293] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_04_optim_states.pt... +13: [2023-03-16 19:05:23,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_04_optim_states.pt... + 8: [2023-03-16 19:05:23,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_04_optim_states.pt... +13: [2023-03-16 19:05:23,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_04_optim_states.pt... + 8: [2023-03-16 19:05:23,297] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_04_optim_states.pt... +11: [2023-03-16 19:05:23,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +11: [2023-03-16 19:05:23,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +10: [2023-03-16 19:05:23,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +13: [2023-03-16 19:05:23,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +13: [2023-03-16 19:05:23,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +10: [2023-03-16 19:05:23,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +10: [2023-03-16 19:05:23,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_04_optim_states.pt... +10: [2023-03-16 19:05:23,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_04_optim_states.pt... + 9: [2023-03-16 19:05:23,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,303] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +15: [2023-03-16 19:05:23,306] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +10: [2023-03-16 19:05:23,305] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_04_optim_states.pt... +10: [2023-03-16 19:05:23,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 9: [2023-03-16 19:05:23,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_04_optim_states.pt... +14: [2023-03-16 19:05:23,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_04_optim_states.pt... +14: [2023-03-16 19:05:23,307] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_04_optim_states.pt... +15: [2023-03-16 19:05:23,308] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +15: [2023-03-16 19:05:23,309] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_04_optim_states.pt... +15: [2023-03-16 19:05:23,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +15: [2023-03-16 19:05:23,310] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_04_optim_states.pt... +16: [2023-03-16 19:05:23,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +16: [2023-03-16 19:05:23,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +20: [2023-03-16 19:05:23,312] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +13: [2023-03-16 19:05:23,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +20: [2023-03-16 19:05:23,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +14: [2023-03-16 19:05:23,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +10: [2023-03-16 19:05:23,314] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:23,314] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +11: [2023-03-16 19:05:23,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +11: [2023-03-16 19:05:23,315] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,318] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,320] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +14: [2023-03-16 19:05:23,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +11: [2023-03-16 19:05:23,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +11: [2023-03-16 19:05:23,323] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_03-model_states.pt. +16: [2023-03-16 19:05:23,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,324] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +14: [2023-03-16 19:05:23,325] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 8: [2023-03-16 19:05:23,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +14: [2023-03-16 19:05:23,327] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:23,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +10: [2023-03-16 19:05:23,328] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_00-model_states.pt. +14: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +10: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. + 9: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +14: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... + 9: [2023-03-16 19:05:23,329] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +14: [2023-03-16 19:05:23,330] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +14: [2023-03-16 19:05:23,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +10: [2023-03-16 19:05:23,331] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt... +12: [2023-03-16 19:05:23,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +13: [2023-03-16 19:05:23,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +12: [2023-03-16 19:05:23,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +13: [2023-03-16 19:05:23,332] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. + 9: [2023-03-16 19:05:23,333] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,334] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +11: [2023-03-16 19:05:23,335] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +15: [2023-03-16 19:05:23,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +11: [2023-03-16 19:05:23,337] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,337] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_02-model_states.pt. +20: [2023-03-16 19:05:23,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +20: [2023-03-16 19:05:23,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. + 9: [2023-03-16 19:05:23,339] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,341] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,342] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +14: [2023-03-16 19:05:23,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,340] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,343] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,344] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +14: [2023-03-16 19:05:23,345] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,345] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,346] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +10: [2023-03-16 19:05:23,347] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +13: [2023-03-16 19:05:23,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +13: [2023-03-16 19:05:23,347] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,348] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,348] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +13: [2023-03-16 19:05:23,350] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +11: [2023-03-16 19:05:23,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +11: [2023-03-16 19:05:23,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +15: [2023-03-16 19:05:23,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +20: [2023-03-16 19:05:23,351] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +20: [2023-03-16 19:05:23,352] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +15: [2023-03-16 19:05:23,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +22: [2023-03-16 19:05:23,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +11: [2023-03-16 19:05:23,353] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +22: [2023-03-16 19:05:23,353] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +16: [2023-03-16 19:05:23,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +16: [2023-03-16 19:05:23,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +10: [2023-03-16 19:05:23,357] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +20: [2023-03-16 19:05:23,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_08_optim_states.pt... +20: [2023-03-16 19:05:23,358] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_08_optim_states.pt... +23: [2023-03-16 19:05:23,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +23: [2023-03-16 19:05:23,359] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +10: [2023-03-16 19:05:23,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +16: [2023-03-16 19:05:23,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +16: [2023-03-16 19:05:23,360] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +14: [2023-03-16 19:05:23,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,363] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,365] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... + 9: [2023-03-16 19:05:23,365] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +22: [2023-03-16 19:05:23,366] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +14: [2023-03-16 19:05:23,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,367] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +22: [2023-03-16 19:05:23,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +11: [2023-03-16 19:05:23,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +14: [2023-03-16 19:05:23,369] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt... +23: [2023-03-16 19:05:23,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +12: [2023-03-16 19:05:23,372] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +23: [2023-03-16 19:05:23,371] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +14: [2023-03-16 19:05:23,373] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +12: [2023-03-16 19:05:23,374] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +10: [2023-03-16 19:05:23,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +20: [2023-03-16 19:05:23,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +12: [2023-03-16 19:05:23,375] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +14: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... + 9: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +13: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +20: [2023-03-16 19:05:23,376] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +10: [2023-03-16 19:05:23,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +10: [2023-03-16 19:05:23,377] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +13: [2023-03-16 19:05:23,377] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,378] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +10: [2023-03-16 19:05:23,379] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,379] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +14: [2023-03-16 19:05:23,380] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +20: [2023-03-16 19:05:23,381] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... + 8: [2023-03-16 19:05:23,382] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +20: [2023-03-16 19:05:23,383] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +15: [2023-03-16 19:05:23,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +15: [2023-03-16 19:05:23,384] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,386] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +13: [2023-03-16 19:05:23,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +13: [2023-03-16 19:05:23,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,387] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 5: [2023-03-16 19:05:23,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 5: [2023-03-16 19:05:23,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,389] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,388] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,392] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,393] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +23: [2023-03-16 19:05:23,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +23: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +16: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +22: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +16: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,396] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +15: [2023-03-16 19:05:23,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +15: [2023-03-16 19:05:23,397] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt... +19: [2023-03-16 19:05:23,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. + 5: [2023-03-16 19:05:23,400] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +23: [2023-03-16 19:05:23,401] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,401] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 6: [2023-03-16 19:05:23,402] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,402] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +23: [2023-03-16 19:05:23,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +22: [2023-03-16 19:05:23,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +22: [2023-03-16 19:05:23,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... + 7: [2023-03-16 19:05:23,403] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +22: [2023-03-16 19:05:23,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +22: [2023-03-16 19:05:23,404] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,406] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_03-model_states.pt. +17: [2023-03-16 19:05:23,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +22: [2023-03-16 19:05:23,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +17: [2023-03-16 19:05:23,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +19: [2023-03-16 19:05:23,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,407] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,409] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,410] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +22: [2023-03-16 19:05:23,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +18: [2023-03-16 19:05:23,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +18: [2023-03-16 19:05:23,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. + 6: [2023-03-16 19:05:23,414] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +19: [2023-03-16 19:05:23,412] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +21: [2023-03-16 19:05:23,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +21: [2023-03-16 19:05:23,413] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. + 6: [2023-03-16 19:05:23,415] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +22: [2023-03-16 19:05:23,417] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,418] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,418] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,419] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +20: [2023-03-16 19:05:23,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +20: [2023-03-16 19:05:23,420] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +17: [2023-03-16 19:05:23,420] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,421] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,425] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +21: [2023-03-16 19:05:23,425] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,426] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,427] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +21: [2023-03-16 19:05:23,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,427] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +19: [2023-03-16 19:05:23,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +20: [2023-03-16 19:05:23,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +19: [2023-03-16 19:05:23,429] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +20: [2023-03-16 19:05:23,430] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +20: [2023-03-16 19:05:23,432] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,433] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +23: [2023-03-16 19:05:23,434] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +20: [2023-03-16 19:05:23,435] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +23: [2023-03-16 19:05:23,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_03-model_states.pt. +19: [2023-03-16 19:05:23,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +16: [2023-03-16 19:05:23,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +19: [2023-03-16 19:05:23,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +19: [2023-03-16 19:05:23,438] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +17: [2023-03-16 19:05:23,439] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,440] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,441] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +16: [2023-03-16 19:05:23,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +20: [2023-03-16 19:05:23,442] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +20: [2023-03-16 19:05:23,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +19: [2023-03-16 19:05:23,443] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +21: [2023-03-16 19:05:23,443] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +21: [2023-03-16 19:05:23,444] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +23: [2023-03-16 19:05:23,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +23: [2023-03-16 19:05:23,445] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +19: [2023-03-16 19:05:23,446] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,446] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +18: [2023-03-16 19:05:23,447] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +22: [2023-03-16 19:05:23,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,449] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. +18: [2023-03-16 19:05:23,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,450] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. +23: [2023-03-16 19:05:23,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +22: [2023-03-16 19:05:23,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +23: [2023-03-16 19:05:23,451] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,452] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +17: [2023-03-16 19:05:23,453] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,455] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +21: [2023-03-16 19:05:23,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +21: [2023-03-16 19:05:23,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +23: [2023-03-16 19:05:23,456] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +21: [2023-03-16 19:05:23,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +17: [2023-03-16 19:05:23,457] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +21: [2023-03-16 19:05:23,458] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +18: [2023-03-16 19:05:23,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,459] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,460] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +23: [2023-03-16 19:05:23,460] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,461] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +22: [2023-03-16 19:05:23,462] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,464] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +20: [2023-03-16 19:05:23,465] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,466] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +22: [2023-03-16 19:05:23,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +22: [2023-03-16 19:05:23,469] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +20: [2023-03-16 19:05:23,470] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +17: [2023-03-16 19:05:23,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +21: [2023-03-16 19:05:23,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +20: [2023-03-16 19:05:23,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +21: [2023-03-16 19:05:23,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +17: [2023-03-16 19:05:23,473] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +20: [2023-03-16 19:05:23,473] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +20: [2023-03-16 19:05:23,474] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,475] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +20: [2023-03-16 19:05:23,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +20: [2023-03-16 19:05:23,475] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +17: [2023-03-16 19:05:23,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +19: [2023-03-16 19:05:23,476] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +20: [2023-03-16 19:05:23,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +19: [2023-03-16 19:05:23,477] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +17: [2023-03-16 19:05:23,478] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +19: [2023-03-16 19:05:23,479] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +16: [2023-03-16 19:05:23,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. +16: [2023-03-16 19:05:23,479] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,481] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. +23: [2023-03-16 19:05:23,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +23: [2023-03-16 19:05:23,484] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +21: [2023-03-16 19:05:23,485] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +21: [2023-03-16 19:05:23,487] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +21: [2023-03-16 19:05:23,487] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +18: [2023-03-16 19:05:23,488] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +23: [2023-03-16 19:05:23,489] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +23: [2023-03-16 19:05:23,490] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +18: [2023-03-16 19:05:23,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +21: [2023-03-16 19:05:23,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +23: [2023-03-16 19:05:23,491] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +16: [2023-03-16 19:05:23,491] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +23: [2023-03-16 19:05:23,492] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +23: [2023-03-16 19:05:23,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt... +16: [2023-03-16 19:05:23,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... +18: [2023-03-16 19:05:23,493] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,493] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +23: [2023-03-16 19:05:23,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,494] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... +18: [2023-03-16 19:05:23,495] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,495] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 7: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 6: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 6: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,496] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 5: [2023-03-16 19:05:23,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,497] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,498] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,499] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,501] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 4: [2023-03-16 19:05:23,503] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. +11: [2023-03-16 19:05:23,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +11: [2023-03-16 19:05:23,504] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +17: [2023-03-16 19:05:23,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +17: [2023-03-16 19:05:23,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. +13: [2023-03-16 19:05:23,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +13: [2023-03-16 19:05:23,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,505] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 1: [2023-03-16 19:05:23,506] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,509] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,509] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,510] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... +12: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,511] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... +12: [2023-03-16 19:05:23,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,513] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,514] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,515] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,516] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +11: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +17: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,517] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... +17: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +11: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +13: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,518] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +13: [2023-03-16 19:05:23,519] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,520] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +16: [2023-03-16 19:05:23,521] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +16: [2023-03-16 19:05:23,522] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +12: [2023-03-16 19:05:23,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +16: [2023-03-16 19:05:23,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +16: [2023-03-16 19:05:23,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt... +12: [2023-03-16 19:05:23,525] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,526] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,533] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,534] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,535] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 2: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,536] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,538] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,538] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,539] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 6: [2023-03-16 19:05:23,540] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 3: [2023-03-16 19:05:23,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,541] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,542] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 3: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 3: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 6: [2023-03-16 19:05:23,543] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,544] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,544] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,545] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,545] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,546] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 5: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +11: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... +13: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,547] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,548] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... +17: [2023-03-16 19:05:23,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +17: [2023-03-16 19:05:23,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... +13: [2023-03-16 19:05:23,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,549] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +11: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +11: [2023-03-16 19:05:23,550] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,551] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... +11: [2023-03-16 19:05:23,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +12: [2023-03-16 19:05:23,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,552] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,553] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,553] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +17: [2023-03-16 19:05:23,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +17: [2023-03-16 19:05:23,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +13: [2023-03-16 19:05:23,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +13: [2023-03-16 19:05:23,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,554] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,555] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +12: [2023-03-16 19:05:23,556] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,555] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,558] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,558] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 8: [2023-03-16 19:05:23,561] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +12: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +12: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 5: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 9: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_11-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 1: [2023-03-16 19:05:23,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 1: [2023-03-16 19:05:23,564] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,563] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 1: [2023-03-16 19:05:23,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,565] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,565] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,566] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,566] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,570] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 9: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +21: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +21: [2023-03-16 19:05:23,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_34-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,573] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 2: [2023-03-16 19:05:23,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,574] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,575] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt... + 4: [2023-03-16 19:05:23,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,576] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,577] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... +12: [2023-03-16 19:05:23,577] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 6: [2023-03-16 19:05:23,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +12: [2023-03-16 19:05:23,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,578] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,580] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,580] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 7: [2023-03-16 19:05:23,581] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... +14: [2023-03-16 19:05:23,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +14: [2023-03-16 19:05:23,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,582] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,583] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +21: [2023-03-16 19:05:23,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,583] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +21: [2023-03-16 19:05:23,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,585] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +10: [2023-03-16 19:05:23,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +10: [2023-03-16 19:05:23,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. + 3: [2023-03-16 19:05:23,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,587] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,587] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,588] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,589] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,590] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... +14: [2023-03-16 19:05:23,593] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 9: [2023-03-16 19:05:23,593] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,594] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +14: [2023-03-16 19:05:23,595] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +10: [2023-03-16 19:05:23,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... +10: [2023-03-16 19:05:23,599] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,598] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +12: [2023-03-16 19:05:23,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_07_optim_states.pt... +12: [2023-03-16 19:05:23,606] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_07_optim_states.pt... + 5: [2023-03-16 19:05:23,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,607] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,608] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,609] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,610] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,610] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_00-model_states.pt. +21: [2023-03-16 19:05:23,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. +21: [2023-03-16 19:05:23,609] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +21: [2023-03-16 19:05:23,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +21: [2023-03-16 19:05:23,611] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt... +14: [2023-03-16 19:05:23,615] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +11: [2023-03-16 19:05:23,619] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +14: [2023-03-16 19:05:23,621] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +15: [2023-03-16 19:05:23,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +15: [2023-03-16 19:05:23,622] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 9: [2023-03-16 19:05:23,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_07_optim_states.pt... + 9: [2023-03-16 19:05:23,622] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_07_optim_states.pt... + 4: [2023-03-16 19:05:23,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,623] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +11: [2023-03-16 19:05:23,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +10: [2023-03-16 19:05:23,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,626] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +10: [2023-03-16 19:05:23,627] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +14: [2023-03-16 19:05:23,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +10: [2023-03-16 19:05:23,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +14: [2023-03-16 19:05:23,629] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +15: [2023-03-16 19:05:23,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +14: [2023-03-16 19:05:23,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +10: [2023-03-16 19:05:23,631] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 8: [2023-03-16 19:05:23,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. + 8: [2023-03-16 19:05:23,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +15: [2023-03-16 19:05:23,634] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_23-model_01-model_states.pt. +18: [2023-03-16 19:05:23,635] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +18: [2023-03-16 19:05:23,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +11: [2023-03-16 19:05:23,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_07_optim_states.pt... +11: [2023-03-16 19:05:23,640] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_07_optim_states.pt... +13: [2023-03-16 19:05:23,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +13: [2023-03-16 19:05:23,645] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +15: [2023-03-16 19:05:23,646] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... +15: [2023-03-16 19:05:23,648] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. + 9: [2023-03-16 19:05:23,651] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +18: [2023-03-16 19:05:23,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_09_optim_states.pt... +18: [2023-03-16 19:05:23,658] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_09_optim_states.pt... +19: [2023-03-16 19:05:23,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +19: [2023-03-16 19:05:23,659] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. + 8: [2023-03-16 19:05:23,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_07_optim_states.pt... + 8: [2023-03-16 19:05:23,659] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_07_optim_states.pt... +14: [2023-03-16 19:05:23,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_07_optim_states.pt... +14: [2023-03-16 19:05:23,661] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_07_optim_states.pt... +16: [2023-03-16 19:05:23,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +15: [2023-03-16 19:05:23,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_07_optim_states.pt... +15: [2023-03-16 19:05:23,662] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_07_optim_states.pt... +16: [2023-03-16 19:05:23,662] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +15: [2023-03-16 19:05:23,673] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +10: [2023-03-16 19:05:23,674] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +10: [2023-03-16 19:05:23,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_03-model_states.pt. +15: [2023-03-16 19:05:23,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... + 9: [2023-03-16 19:05:23,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_06_optim_states.pt... + 9: [2023-03-16 19:05:23,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_06_optim_states.pt... +13: [2023-03-16 19:05:23,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_07_optim_states.pt... +13: [2023-03-16 19:05:23,676] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_07_optim_states.pt... +20: [2023-03-16 19:05:23,679] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +20: [2023-03-16 19:05:23,680] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +15: [2023-03-16 19:05:23,681] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_00-model_states.pt. +19: [2023-03-16 19:05:23,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_09_optim_states.pt... +19: [2023-03-16 19:05:23,682] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_09_optim_states.pt... +15: [2023-03-16 19:05:23,683] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt... +14: [2023-03-16 19:05:23,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +14: [2023-03-16 19:05:23,688] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +23: [2023-03-16 19:05:23,689] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +23: [2023-03-16 19:05:23,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +22: [2023-03-16 19:05:23,693] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +22: [2023-03-16 19:05:23,694] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +12: [2023-03-16 19:05:23,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +12: [2023-03-16 19:05:23,703] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +10: [2023-03-16 19:05:23,704] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +10: [2023-03-16 19:05:23,706] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +16: [2023-03-16 19:05:23,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_09_optim_states.pt... +16: [2023-03-16 19:05:23,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_09_optim_states.pt... +10: [2023-03-16 19:05:23,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_07_optim_states.pt... +10: [2023-03-16 19:05:23,708] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_07_optim_states.pt... +13: [2023-03-16 19:05:23,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +13: [2023-03-16 19:05:23,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +11: [2023-03-16 19:05:23,715] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +11: [2023-03-16 19:05:23,716] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +23: [2023-03-16 19:05:23,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_09_optim_states.pt... +23: [2023-03-16 19:05:23,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_09_optim_states.pt... +20: [2023-03-16 19:05:23,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_09_optim_states.pt... +20: [2023-03-16 19:05:23,725] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_09_optim_states.pt... +12: [2023-03-16 19:05:23,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_06_optim_states.pt... +10: [2023-03-16 19:05:23,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_06_optim_states.pt... +12: [2023-03-16 19:05:23,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_06_optim_states.pt... +10: [2023-03-16 19:05:23,731] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_06_optim_states.pt... +22: [2023-03-16 19:05:23,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_09_optim_states.pt... +22: [2023-03-16 19:05:23,733] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_09_optim_states.pt... +14: [2023-03-16 19:05:23,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_06_optim_states.pt... +14: [2023-03-16 19:05:23,734] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_06_optim_states.pt... +11: [2023-03-16 19:05:23,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_06_optim_states.pt... +11: [2023-03-16 19:05:23,738] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_06_optim_states.pt... +20: [2023-03-16 19:05:23,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +20: [2023-03-16 19:05:23,744] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +13: [2023-03-16 19:05:23,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_06_optim_states.pt... +13: [2023-03-16 19:05:23,747] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_06_optim_states.pt... +17: [2023-03-16 19:05:23,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +17: [2023-03-16 19:05:23,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +15: [2023-03-16 19:05:23,762] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +15: [2023-03-16 19:05:23,763] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +20: [2023-03-16 19:05:23,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_10_optim_states.pt... +20: [2023-03-16 19:05:23,770] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_10_optim_states.pt... + 8: [2023-03-16 19:05:23,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. + 8: [2023-03-16 19:05:23,771] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_02-model_states.pt. +16: [2023-03-16 19:05:23,775] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. +16: [2023-03-16 19:05:23,779] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +19: [2023-03-16 19:05:23,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +19: [2023-03-16 19:05:23,780] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +17: [2023-03-16 19:05:23,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_10_optim_states.pt... +17: [2023-03-16 19:05:23,783] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_10_optim_states.pt... + 2: [2023-03-16 19:05:23,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 2: [2023-03-16 19:05:23,785] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. +20: [2023-03-16 19:05:23,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +22: [2023-03-16 19:05:23,790] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +22: [2023-03-16 19:05:23,791] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,791] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,793] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +20: [2023-03-16 19:05:23,793] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +17: [2023-03-16 19:05:23,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +17: [2023-03-16 19:05:23,794] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. +23: [2023-03-16 19:05:23,795] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +23: [2023-03-16 19:05:23,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +15: [2023-03-16 19:05:23,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_06_optim_states.pt... +15: [2023-03-16 19:05:23,797] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_06_optim_states.pt... +16: [2023-03-16 19:05:23,798] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +16: [2023-03-16 19:05:23,799] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,800] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +19: [2023-03-16 19:05:23,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_10_optim_states.pt... +19: [2023-03-16 19:05:23,802] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_10_optim_states.pt... +16: [2023-03-16 19:05:23,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_11_optim_states.pt... +16: [2023-03-16 19:05:23,803] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_11_optim_states.pt... +18: [2023-03-16 19:05:23,804] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 8: [2023-03-16 19:05:23,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_06_optim_states.pt... + 8: [2023-03-16 19:05:23,804] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_06_optim_states.pt... +18: [2023-03-16 19:05:23,805] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +18: [2023-03-16 19:05:23,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +18: [2023-03-16 19:05:23,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +21: [2023-03-16 19:05:23,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +21: [2023-03-16 19:05:23,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_02-model_states.pt. +22: [2023-03-16 19:05:23,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_10_optim_states.pt... +22: [2023-03-16 19:05:23,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_10_optim_states.pt... +16: [2023-03-16 19:05:23,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_10_optim_states.pt... +16: [2023-03-16 19:05:23,817] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_10_optim_states.pt... +20: [2023-03-16 19:05:23,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_11_optim_states.pt... +20: [2023-03-16 19:05:23,819] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_11_optim_states.pt... +17: [2023-03-16 19:05:23,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_09_optim_states.pt... +17: [2023-03-16 19:05:23,822] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_09_optim_states.pt... +23: [2023-03-16 19:05:23,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_10_optim_states.pt... +23: [2023-03-16 19:05:23,823] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_10_optim_states.pt... + 1: [2023-03-16 19:05:23,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 1: [2023-03-16 19:05:23,824] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. +17: [2023-03-16 19:05:23,825] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +17: [2023-03-16 19:05:23,826] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 3: [2023-03-16 19:05:23,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,830] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. +18: [2023-03-16 19:05:23,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_10_optim_states.pt... +18: [2023-03-16 19:05:23,830] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_10_optim_states.pt... +21: [2023-03-16 19:05:23,831] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. +21: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. +23: [2023-03-16 19:05:23,832] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +23: [2023-03-16 19:05:23,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +22: [2023-03-16 19:05:23,834] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. +18: [2023-03-16 19:05:23,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_11_optim_states.pt... +18: [2023-03-16 19:05:23,834] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_11_optim_states.pt... +22: [2023-03-16 19:05:23,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,837] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,842] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,842] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 6: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 4: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,844] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 6: [2023-03-16 19:05:23,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 4: [2023-03-16 19:05:23,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,845] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 6: [2023-03-16 19:05:23,846] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. +17: [2023-03-16 19:05:23,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_11_optim_states.pt... +17: [2023-03-16 19:05:23,846] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_11_optim_states.pt... + 7: [2023-03-16 19:05:23,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,849] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,850] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. +21: [2023-03-16 19:05:23,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_10_optim_states.pt... +21: [2023-03-16 19:05:23,852] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_10_optim_states.pt... + 7: [2023-03-16 19:05:23,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 7: [2023-03-16 19:05:23,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,855] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 2: [2023-03-16 19:05:23,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,855] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,856] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 5: [2023-03-16 19:05:23,857] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 0: [2023-03-16 19:05:23,858] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +21: [2023-03-16 19:05:23,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. + 6: [2023-03-16 19:05:23,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,859] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +21: [2023-03-16 19:05:23,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +21: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_11_optim_states.pt... +21: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_11_optim_states.pt... +22: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_11_optim_states.pt... +22: [2023-03-16 19:05:23,860] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_11_optim_states.pt... + 7: [2023-03-16 19:05:23,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 2: [2023-03-16 19:05:23,861] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 8: [2023-03-16 19:05:23,862] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,862] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,863] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,864] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 1: [2023-03-16 19:05:23,865] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,865] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 4: [2023-03-16 19:05:23,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_02-model_states.pt. + 3: [2023-03-16 19:05:23,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,866] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,867] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,868] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 3: [2023-03-16 19:05:23,868] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,869] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,870] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 5: [2023-03-16 19:05:23,871] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,873] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,875] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. +23: [2023-03-16 19:05:23,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_11_optim_states.pt... +23: [2023-03-16 19:05:23,876] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_11_optim_states.pt... + 0: [2023-03-16 19:05:23,877] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 0: [2023-03-16 19:05:23,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,878] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... +13: [2023-03-16 19:05:23,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +13: [2023-03-16 19:05:23,878] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,879] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 4: [2023-03-16 19:05:23,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,880] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,881] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,882] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,883] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,884] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 9: [2023-03-16 19:05:23,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +19: [2023-03-16 19:05:23,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 7: [2023-03-16 19:05:23,886] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,886] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +19: [2023-03-16 19:05:23,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_35-model_03-model_states.pt. + 4: [2023-03-16 19:05:23,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,888] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 5: [2023-03-16 19:05:23,889] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 2: [2023-03-16 19:05:23,889] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +12: [2023-03-16 19:05:23,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 6: [2023-03-16 19:05:23,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +12: [2023-03-16 19:05:23,890] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +21: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_09_optim_states.pt... +21: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_09_optim_states.pt... + 6: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 8: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_05_optim_states.pt... + 8: [2023-03-16 19:05:23,891] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_05_optim_states.pt... + 0: [2023-03-16 19:05:23,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,892] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,893] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,893] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 0: [2023-03-16 19:05:23,894] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,895] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 2: [2023-03-16 19:05:23,896] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 0: [2023-03-16 19:05:23,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 0: [2023-03-16 19:05:23,896] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 5: [2023-03-16 19:05:23,897] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 6: [2023-03-16 19:05:23,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,897] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 6: [2023-03-16 19:05:23,898] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,899] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,900] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 6: [2023-03-16 19:05:23,901] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +11: [2023-03-16 19:05:23,903] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +11: [2023-03-16 19:05:23,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,904] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 9: [2023-03-16 19:05:23,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_05_optim_states.pt... + 9: [2023-03-16 19:05:23,905] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_05_optim_states.pt... + 5: [2023-03-16 19:05:23,907] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,909] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,910] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 4: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 4: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 2: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 7: [2023-03-16 19:05:23,911] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... +19: [2023-03-16 19:05:23,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_11_optim_states.pt... +19: [2023-03-16 19:05:23,912] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_11_optim_states.pt... + 4: [2023-03-16 19:05:23,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,913] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,915] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 7: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 5: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... + 2: [2023-03-16 19:05:23,918] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... + 7: [2023-03-16 19:05:23,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,919] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_12-model_01-model_states.pt. + 4: [2023-03-16 19:05:23,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 4: [2023-03-16 19:05:23,919] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,920] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,920] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 1: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 3: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... + 1: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 1: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 5: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,921] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,922] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... + 5: [2023-03-16 19:05:23,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 5: [2023-03-16 19:05:23,923] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt... + 6: [2023-03-16 19:05:23,924] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +13: [2023-03-16 19:05:23,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_05_optim_states.pt... +13: [2023-03-16 19:05:23,925] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_05_optim_states.pt... + 0: [2023-03-16 19:05:23,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: [2023-03-16 19:05:23,927] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 0: > overriding learning rate value to 0.0002 + 0: > overriding minimum learning rate value to 2e-05 + 0: > overriding warmup iterations value to 0 + 0: > overriding total number of iterations value to 1 + 0: > overriding decay style value to cosine + 1: [2023-03-16 19:05:23,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 3: [2023-03-16 19:05:23,930] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 1: [2023-03-16 19:05:23,931] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,931] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,932] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt... + 5: [2023-03-16 19:05:23,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,934] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 0: [2023-03-16 19:05:23,936] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +11: [2023-03-16 19:05:23,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_05_optim_states.pt... +11: [2023-03-16 19:05:23,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_05_optim_states.pt... +12: [2023-03-16 19:05:23,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_05_optim_states.pt... +12: [2023-03-16 19:05:23,936] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_05_optim_states.pt... + 5: [2023-03-16 19:05:23,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... + 5: [2023-03-16 19:05:23,937] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... + 0: [2023-03-16 19:05:23,938] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt... + 2: [2023-03-16 19:05:23,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 4: [2023-03-16 19:05:23,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... + 4: [2023-03-16 19:05:23,939] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... +27: [2023-03-16 19:05:23,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_12_optim_states.pt. +27: [2023-03-16 19:05:23,941] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 220 + 5: [2023-03-16 19:05:23,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 7: [2023-03-16 19:05:23,941] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 2: [2023-03-16 19:05:23,942] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 5: [2023-03-16 19:05:23,943] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 2: [2023-03-16 19:05:23,944] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 6: [2023-03-16 19:05:23,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... + 6: [2023-03-16 19:05:23,944] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... + 0: [2023-03-16 19:05:23,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... + 2: [2023-03-16 19:05:23,946] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 0: [2023-03-16 19:05:23,947] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... + 7: [2023-03-16 19:05:23,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +14: [2023-03-16 19:05:23,947] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +14: [2023-03-16 19:05:23,948] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +15: [2023-03-16 19:05:23,949] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +15: [2023-03-16 19:05:23,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +27: [2023-03-16 19:05:23,952] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 220 + 1: [2023-03-16 19:05:23,955] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. +10: [2023-03-16 19:05:23,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. +10: [2023-03-16 19:05:23,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_24-model_01-model_states.pt. + 1: [2023-03-16 19:05:23,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,961] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,963] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,965] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 3: [2023-03-16 19:05:23,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 1: [2023-03-16 19:05:23,966] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_00-model_states.pt. + 3: [2023-03-16 19:05:23,968] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 1: [2023-03-16 19:05:23,969] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt... + 7: [2023-03-16 19:05:23,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... + 7: [2023-03-16 19:05:23,971] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... +30: [2023-03-16 19:05:23,972] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_12_optim_states.pt. +30: [2023-03-16 19:05:23,972] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 244 +14: [2023-03-16 19:05:23,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_05_optim_states.pt... +14: [2023-03-16 19:05:23,974] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_05_optim_states.pt... + 1: [2023-03-16 19:05:23,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... + 1: [2023-03-16 19:05:23,975] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... +15: [2023-03-16 19:05:23,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_05_optim_states.pt... +15: [2023-03-16 19:05:23,976] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_05_optim_states.pt... +31: [2023-03-16 19:05:23,978] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_12_optim_states.pt. +31: [2023-03-16 19:05:23,978] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 252 +10: [2023-03-16 19:05:23,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_05_optim_states.pt... +10: [2023-03-16 19:05:23,981] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_05_optim_states.pt... +30: [2023-03-16 19:05:23,983] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 244 +31: [2023-03-16 19:05:23,989] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 252 +29: [2023-03-16 19:05:24,039] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_12_optim_states.pt. +29: [2023-03-16 19:05:24,040] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 232 +29: [2023-03-16 19:05:24,052] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 232 +30: [2023-03-16 19:05:24,102] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_12_optim_states.pt. +30: [2023-03-16 19:05:24,102] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 240 +25: [2023-03-16 19:05:24,105] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_12_optim_states.pt. +25: [2023-03-16 19:05:24,105] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 200 +30: [2023-03-16 19:05:24,113] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 240 +24: [2023-03-16 19:05:24,116] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_12_optim_states.pt. +24: [2023-03-16 19:05:24,117] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 192 +25: [2023-03-16 19:05:24,118] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 200 +24: [2023-03-16 19:05:24,128] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 192 +24: [2023-03-16 19:05:24,136] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_12_optim_states.pt. +24: [2023-03-16 19:05:24,136] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 196 +24: [2023-03-16 19:05:24,148] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 196 +29: [2023-03-16 19:05:24,153] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_12_optim_states.pt. +29: [2023-03-16 19:05:24,153] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 236 +28: [2023-03-16 19:05:24,161] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_12_optim_states.pt. +28: [2023-03-16 19:05:24,162] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 228 +29: [2023-03-16 19:05:24,165] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 236 +27: [2023-03-16 19:05:24,167] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_12_optim_states.pt. +27: [2023-03-16 19:05:24,167] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 216 +28: [2023-03-16 19:05:24,173] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 228 + 5: [2023-03-16 19:05:24,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 5: [2023-03-16 19:05:24,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. +28: [2023-03-16 19:05:24,177] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_12_optim_states.pt. +28: [2023-03-16 19:05:24,177] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 224 +27: [2023-03-16 19:05:24,178] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 216 + 2: [2023-03-16 19:05:24,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 2: [2023-03-16 19:05:24,180] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. +28: [2023-03-16 19:05:24,189] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 224 + 6: [2023-03-16 19:05:24,201] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 5: [2023-03-16 19:05:24,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... + 5: [2023-03-16 19:05:24,202] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... + 1: [2023-03-16 19:05:24,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 1: [2023-03-16 19:05:24,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 6: [2023-03-16 19:05:24,202] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 7: [2023-03-16 19:05:24,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 7: [2023-03-16 19:05:24,203] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 4: [2023-03-16 19:05:24,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 4: [2023-03-16 19:05:24,204] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. +31: [2023-03-16 19:05:24,208] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_12_optim_states.pt. +31: [2023-03-16 19:05:24,208] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 248 + 6: [2023-03-16 19:05:24,212] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 6: [2023-03-16 19:05:24,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 3: [2023-03-16 19:05:24,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 3: [2023-03-16 19:05:24,218] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. +31: [2023-03-16 19:05:24,219] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 248 + 2: [2023-03-16 19:05:24,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... + 2: [2023-03-16 19:05:24,219] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... + 4: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... + 4: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... + 7: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... + 7: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... + 1: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... + 1: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... + 7: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 7: [2023-03-16 19:05:24,226] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 0: [2023-03-16 19:05:24,233] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 0: [2023-03-16 19:05:24,234] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_03-model_states.pt. + 6: [2023-03-16 19:05:24,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... + 6: [2023-03-16 19:05:24,236] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... + 1: [2023-03-16 19:05:24,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 5: [2023-03-16 19:05:24,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 1: [2023-03-16 19:05:24,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 5: [2023-03-16 19:05:24,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 0: [2023-03-16 19:05:24,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 4: [2023-03-16 19:05:24,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 0: [2023-03-16 19:05:24,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 4: [2023-03-16 19:05:24,246] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 6: [2023-03-16 19:05:24,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... + 6: [2023-03-16 19:05:24,246] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... + 3: [2023-03-16 19:05:24,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... + 3: [2023-03-16 19:05:24,249] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... +25: [2023-03-16 19:05:24,251] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_12_optim_states.pt. +25: [2023-03-16 19:05:24,251] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 204 + 2: [2023-03-16 19:05:24,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 2: [2023-03-16 19:05:24,258] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 3: [2023-03-16 19:05:24,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 3: [2023-03-16 19:05:24,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_01-model_states.pt. + 7: [2023-03-16 19:05:24,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... + 7: [2023-03-16 19:05:24,262] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... +25: [2023-03-16 19:05:24,262] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 204 + 3: [2023-03-16 19:05:24,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 3: [2023-03-16 19:05:24,266] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. +26: [2023-03-16 19:05:24,267] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_14_optim_states.pt. +26: [2023-03-16 19:05:24,267] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 214 + 5: [2023-03-16 19:05:24,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... + 5: [2023-03-16 19:05:24,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... + 0: [2023-03-16 19:05:24,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... + 0: [2023-03-16 19:05:24,268] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... + 0: [2023-03-16 19:05:24,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... + 0: [2023-03-16 19:05:24,273] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... + 1: [2023-03-16 19:05:24,275] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 1: [2023-03-16 19:05:24,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 1: [2023-03-16 19:05:24,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... + 1: [2023-03-16 19:05:24,277] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... +26: [2023-03-16 19:05:24,277] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_12_optim_states.pt. +26: [2023-03-16 19:05:24,277] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 208 + 4: [2023-03-16 19:05:24,278] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 2: [2023-03-16 19:05:24,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 4: [2023-03-16 19:05:24,279] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 2: [2023-03-16 19:05:24,280] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 4: [2023-03-16 19:05:24,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... + 4: [2023-03-16 19:05:24,281] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... + 6: [2023-03-16 19:05:24,286] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. +26: [2023-03-16 19:05:24,288] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 208 + 7: [2023-03-16 19:05:24,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 7: [2023-03-16 19:05:24,290] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 0: [2023-03-16 19:05:24,291] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 6: [2023-03-16 19:05:24,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 0: [2023-03-16 19:05:24,292] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 5: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 2: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... + 2: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... + 3: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... + 3: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... +31: [2023-03-16 19:05:24,300] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_14_optim_states.pt. +31: [2023-03-16 19:05:24,300] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 254 + 5: [2023-03-16 19:05:24,301] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/layer_13-model_02-model_states.pt. + 3: [2023-03-16 19:05:24,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... + 3: [2023-03-16 19:05:24,301] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... + 4: [2023-03-16 19:05:24,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... + 4: [2023-03-16 19:05:24,302] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... +26: [2023-03-16 19:05:24,306] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 214 + 6: [2023-03-16 19:05:24,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... + 2: [2023-03-16 19:05:24,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... + 6: [2023-03-16 19:05:24,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... + 2: [2023-03-16 19:05:24,308] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... + 1: [2023-03-16 19:05:24,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... + 1: [2023-03-16 19:05:24,311] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... + 0: [2023-03-16 19:05:24,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... + 0: [2023-03-16 19:05:24,312] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... + 7: [2023-03-16 19:05:24,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... + 7: [2023-03-16 19:05:24,313] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... +26: [2023-03-16 19:05:24,317] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_12_optim_states.pt. +26: [2023-03-16 19:05:24,317] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 212 +26: [2023-03-16 19:05:24,329] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 212 + 5: [2023-03-16 19:05:24,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... + 5: [2023-03-16 19:05:24,329] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... +31: [2023-03-16 19:05:24,332] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 254 +29: [2023-03-16 19:05:24,336] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_13_optim_states.pt. +29: [2023-03-16 19:05:24,336] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 233 +26: [2023-03-16 19:05:24,346] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_13_optim_states.pt. +26: [2023-03-16 19:05:24,346] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 213 +29: [2023-03-16 19:05:24,348] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 233 +26: [2023-03-16 19:05:24,357] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 213 +29: [2023-03-16 19:05:24,374] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_13_optim_states.pt. +29: [2023-03-16 19:05:24,374] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 237 +30: [2023-03-16 19:05:24,380] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_14_optim_states.pt. +30: [2023-03-16 19:05:24,381] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 242 +29: [2023-03-16 19:05:24,387] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 237 +28: [2023-03-16 19:05:24,393] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_14_optim_states.pt. +28: [2023-03-16 19:05:24,393] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 226 +29: [2023-03-16 19:05:24,394] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_14_optim_states.pt. +29: [2023-03-16 19:05:24,394] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 238 +30: [2023-03-16 19:05:24,395] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 242 +25: [2023-03-16 19:05:24,395] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_14_optim_states.pt. +25: [2023-03-16 19:05:24,395] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 206 +31: [2023-03-16 19:05:24,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_14_optim_states.pt. +31: [2023-03-16 19:05:24,403] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 250 +25: [2023-03-16 19:05:24,405] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 206 +29: [2023-03-16 19:05:24,406] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 238 +28: [2023-03-16 19:05:24,408] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 226 +24: [2023-03-16 19:05:24,424] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_13_optim_states.pt. +31: [2023-03-16 19:05:24,422] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 250 +24: [2023-03-16 19:05:24,424] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 193 +24: [2023-03-16 19:05:24,435] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 193 +25: [2023-03-16 19:05:24,435] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_13_optim_states.pt. +25: [2023-03-16 19:05:24,436] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 205 +30: [2023-03-16 19:05:24,436] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_13_optim_states.pt. +30: [2023-03-16 19:05:24,436] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 245 +28: [2023-03-16 19:05:24,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_14_optim_states.pt. +28: [2023-03-16 19:05:24,437] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 230 +31: [2023-03-16 19:05:24,437] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_13_optim_states.pt. +31: [2023-03-16 19:05:24,437] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 249 +28: [2023-03-16 19:05:24,448] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 230 +25: [2023-03-16 19:05:24,449] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 205 +30: [2023-03-16 19:05:24,450] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 245 +30: [2023-03-16 19:05:24,455] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_15_optim_states.pt. +30: [2023-03-16 19:05:24,455] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 243 +31: [2023-03-16 19:05:24,457] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 249 +24: [2023-03-16 19:05:24,457] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_14_optim_states.pt. +24: [2023-03-16 19:05:24,457] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 194 +26: [2023-03-16 19:05:24,459] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_14_optim_states.pt. +26: [2023-03-16 19:05:24,459] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 210 +24: [2023-03-16 19:05:24,469] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 194 +26: [2023-03-16 19:05:24,470] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 210 +30: [2023-03-16 19:05:24,471] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 243 +25: [2023-03-16 19:05:24,484] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_15_optim_states.pt. +25: [2023-03-16 19:05:24,484] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 207 +30: [2023-03-16 19:05:24,492] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_13_optim_states.pt. +30: [2023-03-16 19:05:24,492] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 241 +25: [2023-03-16 19:05:24,496] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 207 +27: [2023-03-16 19:05:24,502] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_14_optim_states.pt. +27: [2023-03-16 19:05:24,502] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 222 +30: [2023-03-16 19:05:24,508] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 241 +28: [2023-03-16 19:05:24,508] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_13_optim_states.pt. +28: [2023-03-16 19:05:24,508] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 229 +27: [2023-03-16 19:05:24,512] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 222 +29: [2023-03-16 19:05:24,512] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_14_optim_states.pt. +29: [2023-03-16 19:05:24,512] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 234 +31: [2023-03-16 19:05:24,519] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_15_optim_states.pt. +31: [2023-03-16 19:05:24,519] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 255 +28: [2023-03-16 19:05:24,523] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 229 +27: [2023-03-16 19:05:24,525] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_14_optim_states.pt. +29: [2023-03-16 19:05:24,525] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 234 +27: [2023-03-16 19:05:24,526] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 218 +30: [2023-03-16 19:05:24,526] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_14_optim_states.pt. +30: [2023-03-16 19:05:24,526] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 246 +23: [2023-03-16 19:05:24,531] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_08_optim_states.pt. +28: [2023-03-16 19:05:24,533] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_13_optim_states.pt. +28: [2023-03-16 19:05:24,533] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 225 +27: [2023-03-16 19:05:24,536] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 218 +23: [2023-03-16 19:05:24,531] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 188 +31: [2023-03-16 19:05:24,541] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 255 +30: [2023-03-16 19:05:24,541] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 246 +23: [2023-03-16 19:05:24,544] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 188 +24: [2023-03-16 19:05:24,548] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_13_optim_states.pt. +24: [2023-03-16 19:05:24,549] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 197 +29: [2023-03-16 19:05:24,551] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_15_optim_states.pt. +29: [2023-03-16 19:05:24,551] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 239 +28: [2023-03-16 19:05:24,552] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 225 +21: [2023-03-16 19:05:24,552] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_08_optim_states.pt. +21: [2023-03-16 19:05:24,552] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 172 +27: [2023-03-16 19:05:24,557] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_15_optim_states.pt. +27: [2023-03-16 19:05:24,557] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 223 +16: [2023-03-16 19:05:24,560] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_08_optim_states.pt. +16: [2023-03-16 19:05:24,560] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 128 +24: [2023-03-16 19:05:24,560] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 197 +21: [2023-03-16 19:05:24,564] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 172 +29: [2023-03-16 19:05:24,566] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 239 +27: [2023-03-16 19:05:24,568] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 223 +25: [2023-03-16 19:05:24,572] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_14_optim_states.pt. +25: [2023-03-16 19:05:24,572] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 202 +16: [2023-03-16 19:05:24,572] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 128 +26: [2023-03-16 19:05:24,578] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_13_optim_states.pt. +26: [2023-03-16 19:05:24,578] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 209 +27: [2023-03-16 19:05:24,581] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_13_optim_states.pt. +27: [2023-03-16 19:05:24,581] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 217 +25: [2023-03-16 19:05:24,585] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 202 +24: [2023-03-16 19:05:24,586] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_15_optim_states.pt. +24: [2023-03-16 19:05:24,586] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 195 +31: [2023-03-16 19:05:24,589] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_13_optim_states.pt. +31: [2023-03-16 19:05:24,589] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 253 +26: [2023-03-16 19:05:24,591] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 209 +27: [2023-03-16 19:05:24,593] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 217 + 8: [2023-03-16 19:05:24,596] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_04_optim_states.pt. + 8: [2023-03-16 19:05:24,597] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 68 +24: [2023-03-16 19:05:24,599] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 195 +25: [2023-03-16 19:05:24,600] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_15_optim_states.pt. +25: [2023-03-16 19:05:24,600] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 203 +31: [2023-03-16 19:05:24,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_15_optim_states.pt. +18: [2023-03-16 19:05:24,603] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_08_optim_states.pt. +18: [2023-03-16 19:05:24,603] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 148 +31: [2023-03-16 19:05:24,603] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 251 +15: [2023-03-16 19:05:24,604] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_04_optim_states.pt. +15: [2023-03-16 19:05:24,604] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 120 +31: [2023-03-16 19:05:24,604] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 253 +15: [2023-03-16 19:05:24,605] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_04_optim_states.pt. +15: [2023-03-16 19:05:24,605] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 124 + 8: [2023-03-16 19:05:24,609] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 68 +25: [2023-03-16 19:05:24,611] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 203 +18: [2023-03-16 19:05:24,615] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 148 +15: [2023-03-16 19:05:24,617] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 120 +15: [2023-03-16 19:05:24,618] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 124 +29: [2023-03-16 19:05:24,620] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_15_optim_states.pt. +29: [2023-03-16 19:05:24,620] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 235 +31: [2023-03-16 19:05:24,621] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 251 +17: [2023-03-16 19:05:24,623] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_08_optim_states.pt. +17: [2023-03-16 19:05:24,623] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 136 +25: [2023-03-16 19:05:24,632] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_13_optim_states.pt. +25: [2023-03-16 19:05:24,632] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 201 +29: [2023-03-16 19:05:24,635] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 235 +17: [2023-03-16 19:05:24,636] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 136 +28: [2023-03-16 19:05:24,637] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_15_optim_states.pt. +28: [2023-03-16 19:05:24,637] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 231 +18: [2023-03-16 19:05:24,643] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_08_optim_states.pt. +18: [2023-03-16 19:05:24,643] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 144 +25: [2023-03-16 19:05:24,645] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 201 +11: [2023-03-16 19:05:24,650] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_04_optim_states.pt. +11: [2023-03-16 19:05:24,650] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 88 +28: [2023-03-16 19:05:24,655] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 231 +18: [2023-03-16 19:05:24,656] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 144 +24: [2023-03-16 19:05:24,660] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_14_optim_states.pt. +24: [2023-03-16 19:05:24,660] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 198 +26: [2023-03-16 19:05:24,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_15_optim_states.pt. +26: [2023-03-16 19:05:24,661] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 215 +11: [2023-03-16 19:05:24,661] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 88 +27: [2023-03-16 19:05:24,671] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_13_optim_states.pt. +27: [2023-03-16 19:05:24,671] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 221 +24: [2023-03-16 19:05:24,672] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 198 +26: [2023-03-16 19:05:24,673] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 215 +30: [2023-03-16 19:05:24,675] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_15_optim_states.pt. +30: [2023-03-16 19:05:24,675] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 247 +27: [2023-03-16 19:05:24,683] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 221 +30: [2023-03-16 19:05:24,690] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 247 +23: [2023-03-16 19:05:24,690] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_08_optim_states.pt. +23: [2023-03-16 19:05:24,690] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 184 +23: [2023-03-16 19:05:24,703] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 184 + 9: [2023-03-16 19:05:24,721] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_04_optim_states.pt. + 9: [2023-03-16 19:05:24,721] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 72 +26: [2023-03-16 19:05:24,725] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_15_optim_states.pt. +26: [2023-03-16 19:05:24,725] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 211 +22: [2023-03-16 19:05:24,728] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_08_optim_states.pt. +22: [2023-03-16 19:05:24,728] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 176 +24: [2023-03-16 19:05:24,731] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_15_optim_states.pt. +24: [2023-03-16 19:05:24,731] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 199 + 9: [2023-03-16 19:05:24,731] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 72 +12: [2023-03-16 19:05:24,735] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_04_optim_states.pt. +12: [2023-03-16 19:05:24,735] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 96 +26: [2023-03-16 19:05:24,737] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 211 +22: [2023-03-16 19:05:24,740] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 176 +24: [2023-03-16 19:05:24,744] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 199 +12: [2023-03-16 19:05:24,749] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 96 +13: [2023-03-16 19:05:24,756] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_04_optim_states.pt. +13: [2023-03-16 19:05:24,756] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 108 +10: [2023-03-16 19:05:24,765] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_04_optim_states.pt. +10: [2023-03-16 19:05:24,765] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 84 +13: [2023-03-16 19:05:24,768] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 108 +10: [2023-03-16 19:05:24,778] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 84 +20: [2023-03-16 19:05:24,787] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_08_optim_states.pt. +20: [2023-03-16 19:05:24,787] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 164 +20: [2023-03-16 19:05:24,800] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 164 +14: [2023-03-16 19:05:24,801] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_04_optim_states.pt. +14: [2023-03-16 19:05:24,801] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 116 +14: [2023-03-16 19:05:24,813] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 116 +13: [2023-03-16 19:05:24,837] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_04_optim_states.pt. +13: [2023-03-16 19:05:24,837] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 104 +13: [2023-03-16 19:05:24,850] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 104 +19: [2023-03-16 19:05:24,858] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_08_optim_states.pt. +21: [2023-03-16 19:05:24,859] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_08_optim_states.pt. +19: [2023-03-16 19:05:24,858] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 152 +21: [2023-03-16 19:05:24,859] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 168 +12: [2023-03-16 19:05:24,861] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_04_optim_states.pt. +12: [2023-03-16 19:05:24,862] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 100 +19: [2023-03-16 19:05:24,871] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 152 +12: [2023-03-16 19:05:24,874] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 100 +21: [2023-03-16 19:05:24,877] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 168 + 8: [2023-03-16 19:05:24,892] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_04_optim_states.pt. + 8: [2023-03-16 19:05:24,893] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 64 +19: [2023-03-16 19:05:24,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_08_optim_states.pt. +19: [2023-03-16 19:05:24,895] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 156 + 9: [2023-03-16 19:05:24,895] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_04_optim_states.pt. + 9: [2023-03-16 19:05:24,895] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 76 +17: [2023-03-16 19:05:24,902] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_08_optim_states.pt. +17: [2023-03-16 19:05:24,903] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 140 + 8: [2023-03-16 19:05:24,905] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 64 +19: [2023-03-16 19:05:24,906] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 156 + 9: [2023-03-16 19:05:24,907] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 76 +17: [2023-03-16 19:05:24,914] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 140 +20: [2023-03-16 19:05:24,930] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_08_optim_states.pt. +20: [2023-03-16 19:05:24,930] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 160 +11: [2023-03-16 19:05:24,937] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_04_optim_states.pt. +11: [2023-03-16 19:05:24,937] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 92 +20: [2023-03-16 19:05:24,942] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 160 +11: [2023-03-16 19:05:24,949] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 92 +22: [2023-03-16 19:05:24,951] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_08_optim_states.pt. +22: [2023-03-16 19:05:24,951] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 180 +10: [2023-03-16 19:05:24,954] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_04_optim_states.pt. +10: [2023-03-16 19:05:24,954] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 80 + 9: [2023-03-16 19:05:24,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_07_optim_states.pt. +14: [2023-03-16 19:05:24,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_04_optim_states.pt. + 9: [2023-03-16 19:05:24,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 75 +14: [2023-03-16 19:05:24,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 112 +14: [2023-03-16 19:05:24,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_06_optim_states.pt. +14: [2023-03-16 19:05:24,959] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 118 +13: [2023-03-16 19:05:24,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_06_optim_states.pt. +13: [2023-03-16 19:05:24,960] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 110 +22: [2023-03-16 19:05:24,964] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 180 +10: [2023-03-16 19:05:24,966] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 80 + 9: [2023-03-16 19:05:24,971] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 75 +13: [2023-03-16 19:05:24,971] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 110 +14: [2023-03-16 19:05:24,972] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 118 +14: [2023-03-16 19:05:24,972] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 112 +23: [2023-03-16 19:05:24,984] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_09_optim_states.pt. +23: [2023-03-16 19:05:24,984] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 189 +23: [2023-03-16 19:05:24,995] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 189 +13: [2023-03-16 19:05:25,028] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_07_optim_states.pt. +13: [2023-03-16 19:05:25,028] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 111 +20: [2023-03-16 19:05:25,029] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_10_optim_states.pt. +20: [2023-03-16 19:05:25,030] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 162 +20: [2023-03-16 19:05:25,040] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 162 +13: [2023-03-16 19:05:25,040] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 111 +28: [2023-03-16 19:05:25,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_15_optim_states.pt. +28: [2023-03-16 19:05:25,047] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 227 +14: [2023-03-16 19:05:25,047] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_07_optim_states.pt. +14: [2023-03-16 19:05:25,047] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 115 +28: [2023-03-16 19:05:25,057] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 227 +14: [2023-03-16 19:05:25,059] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 115 +11: [2023-03-16 19:05:25,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_07_optim_states.pt. +11: [2023-03-16 19:05:25,074] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 95 +10: [2023-03-16 19:05:25,074] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_06_optim_states.pt. +10: [2023-03-16 19:05:25,074] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 86 +21: [2023-03-16 19:05:25,075] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_10_optim_states.pt. +21: [2023-03-16 19:05:25,075] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 170 +11: [2023-03-16 19:05:25,087] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 95 +21: [2023-03-16 19:05:25,087] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 170 +15: [2023-03-16 19:05:25,087] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_07_optim_states.pt. +15: [2023-03-16 19:05:25,087] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 123 +10: [2023-03-16 19:05:25,089] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 86 +18: [2023-03-16 19:05:25,095] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_11_optim_states.pt. +18: [2023-03-16 19:05:25,095] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 147 +15: [2023-03-16 19:05:25,097] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 123 +11: [2023-03-16 19:05:25,103] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_07_optim_states.pt. +11: [2023-03-16 19:05:25,103] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 91 +18: [2023-03-16 19:05:25,107] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 147 +11: [2023-03-16 19:05:25,115] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 91 + 9: [2023-03-16 19:05:25,125] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_07_optim_states.pt. + 9: [2023-03-16 19:05:25,126] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 79 + 8: [2023-03-16 19:05:25,126] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_07_optim_states.pt. + 8: [2023-03-16 19:05:25,126] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 71 +22: [2023-03-16 19:05:25,131] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_10_optim_states.pt. +22: [2023-03-16 19:05:25,132] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 178 + 9: [2023-03-16 19:05:25,138] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 79 + 8: [2023-03-16 19:05:25,140] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 71 +22: [2023-03-16 19:05:25,143] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 178 + 9: [2023-03-16 19:05:25,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_06_optim_states.pt. + 9: [2023-03-16 19:05:25,147] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 78 +12: [2023-03-16 19:05:25,147] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_06_optim_states.pt. +12: [2023-03-16 19:05:25,148] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 98 +10: [2023-03-16 19:05:25,151] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_07_optim_states.pt. +10: [2023-03-16 19:05:25,151] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 87 +19: [2023-03-16 19:05:25,155] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_09_optim_states.pt. +19: [2023-03-16 19:05:25,155] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 157 + 9: [2023-03-16 19:05:25,159] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 78 +18: [2023-03-16 19:05:25,159] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_09_optim_states.pt. +18: [2023-03-16 19:05:25,159] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 149 +12: [2023-03-16 19:05:25,160] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 98 +19: [2023-03-16 19:05:25,162] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_11_optim_states.pt. +19: [2023-03-16 19:05:25,162] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 159 +10: [2023-03-16 19:05:25,164] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 87 +19: [2023-03-16 19:05:25,167] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 157 + 5: [2023-03-16 19:05:25,168] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. + 5: [2023-03-16 19:05:25,168] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 40 +18: [2023-03-16 19:05:25,169] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 149 +17: [2023-03-16 19:05:25,173] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_11_optim_states.pt. +17: [2023-03-16 19:05:25,173] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 143 +12: [2023-03-16 19:05:25,175] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_07_optim_states.pt. +12: [2023-03-16 19:05:25,176] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 103 +19: [2023-03-16 19:05:25,175] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 159 + 5: [2023-03-16 19:05:25,182] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 40 +17: [2023-03-16 19:05:25,184] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 143 +12: [2023-03-16 19:05:25,186] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 103 +18: [2023-03-16 19:05:25,192] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_09_optim_states.pt. +18: [2023-03-16 19:05:25,192] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 145 +10: [2023-03-16 19:05:25,194] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_06_optim_states.pt. +10: [2023-03-16 19:05:25,194] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 82 +17: [2023-03-16 19:05:25,198] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_09_optim_states.pt. +17: [2023-03-16 19:05:25,198] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 137 +18: [2023-03-16 19:05:25,204] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 145 +10: [2023-03-16 19:05:25,208] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 82 +17: [2023-03-16 19:05:25,210] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 137 +12: [2023-03-16 19:05:25,214] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_07_optim_states.pt. +12: [2023-03-16 19:05:25,214] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 99 +21: [2023-03-16 19:05:25,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_11_optim_states.pt. +13: [2023-03-16 19:05:25,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_07_optim_states.pt. +21: [2023-03-16 19:05:25,225] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_09_optim_states.pt. +21: [2023-03-16 19:05:25,225] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 175 +13: [2023-03-16 19:05:25,225] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 107 +21: [2023-03-16 19:05:25,225] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 173 +12: [2023-03-16 19:05:25,228] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 99 +14: [2023-03-16 19:05:25,230] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_07_optim_states.pt. +14: [2023-03-16 19:05:25,231] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 119 +16: [2023-03-16 19:05:25,232] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_11_optim_states.pt. +16: [2023-03-16 19:05:25,233] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 131 +13: [2023-03-16 19:05:25,239] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 107 +21: [2023-03-16 19:05:25,240] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 173 +21: [2023-03-16 19:05:25,240] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 175 +23: [2023-03-16 19:05:25,240] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_09_optim_states.pt. +23: [2023-03-16 19:05:25,241] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 185 +21: [2023-03-16 19:05:25,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_10_optim_states.pt. +21: [2023-03-16 19:05:25,242] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 174 +11: [2023-03-16 19:05:25,243] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_05_optim_states.pt. +11: [2023-03-16 19:05:25,243] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 89 +16: [2023-03-16 19:05:25,244] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 131 +14: [2023-03-16 19:05:25,244] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 119 +14: [2023-03-16 19:05:25,245] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_06_optim_states.pt. +14: [2023-03-16 19:05:25,245] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 114 + 9: [2023-03-16 19:05:25,247] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_05_optim_states.pt. + 9: [2023-03-16 19:05:25,247] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 73 +12: [2023-03-16 19:05:25,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_06_optim_states.pt. +12: [2023-03-16 19:05:25,250] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 102 +22: [2023-03-16 19:05:25,250] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_11_optim_states.pt. +22: [2023-03-16 19:05:25,251] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 183 +23: [2023-03-16 19:05:25,252] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 185 +21: [2023-03-16 19:05:25,255] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 174 +11: [2023-03-16 19:05:25,255] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 89 + 9: [2023-03-16 19:05:25,258] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 73 +14: [2023-03-16 19:05:25,259] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 114 + 8: [2023-03-16 19:05:25,260] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_05_optim_states.pt. + 8: [2023-03-16 19:05:25,261] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 65 +12: [2023-03-16 19:05:25,262] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 102 +22: [2023-03-16 19:05:25,263] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 183 +19: [2023-03-16 19:05:25,263] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_10_optim_states.pt. +19: [2023-03-16 19:05:25,263] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 158 +27: [2023-03-16 19:05:25,264] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_15_optim_states.pt. +27: [2023-03-16 19:05:25,265] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 219 +15: [2023-03-16 19:05:25,265] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_06_optim_states.pt. +15: [2023-03-16 19:05:25,265] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 122 +21: [2023-03-16 19:05:25,268] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_09_optim_states.pt. +21: [2023-03-16 19:05:25,268] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 169 +23: [2023-03-16 19:05:25,269] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_10_optim_states.pt. +23: [2023-03-16 19:05:25,269] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 190 + 8: [2023-03-16 19:05:25,272] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 65 +16: [2023-03-16 19:05:25,272] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_10_optim_states.pt. +16: [2023-03-16 19:05:25,272] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 130 +20: [2023-03-16 19:05:25,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_09_optim_states.pt. +20: [2023-03-16 19:05:25,273] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 161 +15: [2023-03-16 19:05:25,273] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_07_optim_states.pt. +15: [2023-03-16 19:05:25,274] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 127 +15: [2023-03-16 19:05:25,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_06_optim_states.pt. +15: [2023-03-16 19:05:25,276] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 126 +21: [2023-03-16 19:05:25,276] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_11_optim_states.pt. +21: [2023-03-16 19:05:25,276] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 171 +27: [2023-03-16 19:05:25,278] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 219 +19: [2023-03-16 19:05:25,277] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 158 +15: [2023-03-16 19:05:25,278] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 122 +21: [2023-03-16 19:05:25,279] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 169 +23: [2023-03-16 19:05:25,283] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 190 +16: [2023-03-16 19:05:25,284] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 130 +20: [2023-03-16 19:05:25,285] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 161 +15: [2023-03-16 19:05:25,287] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 127 +21: [2023-03-16 19:05:25,289] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 171 + 8: [2023-03-16 19:05:25,289] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_07_optim_states.pt. + 8: [2023-03-16 19:05:25,289] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 67 +15: [2023-03-16 19:05:25,290] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 126 +20: [2023-03-16 19:05:25,293] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_09_optim_states.pt. +20: [2023-03-16 19:05:25,293] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 165 +10: [2023-03-16 19:05:25,294] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_07_optim_states.pt. +10: [2023-03-16 19:05:25,294] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 83 +17: [2023-03-16 19:05:25,296] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_10_optim_states.pt. +17: [2023-03-16 19:05:25,296] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 142 + 8: [2023-03-16 19:05:25,300] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 67 +11: [2023-03-16 19:05:25,302] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_06_optim_states.pt. +11: [2023-03-16 19:05:25,303] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 94 +17: [2023-03-16 19:05:25,303] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_10_optim_states.pt. +17: [2023-03-16 19:05:25,304] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 138 +22: [2023-03-16 19:05:25,306] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_09_optim_states.pt. +22: [2023-03-16 19:05:25,306] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 181 +20: [2023-03-16 19:05:25,307] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 165 +10: [2023-03-16 19:05:25,309] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 83 +12: [2023-03-16 19:05:25,309] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_05_optim_states.pt. +12: [2023-03-16 19:05:25,309] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 101 +23: [2023-03-16 19:05:25,310] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_11_optim_states.pt. +23: [2023-03-16 19:05:25,310] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 191 +11: [2023-03-16 19:05:25,311] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_06_optim_states.pt. +11: [2023-03-16 19:05:25,311] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 90 +17: [2023-03-16 19:05:25,312] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 142 +18: [2023-03-16 19:05:25,313] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_10_optim_states.pt. +18: [2023-03-16 19:05:25,313] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 150 +11: [2023-03-16 19:05:25,314] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 94 +17: [2023-03-16 19:05:25,317] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 138 +22: [2023-03-16 19:05:25,319] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 181 +11: [2023-03-16 19:05:25,322] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 90 +23: [2023-03-16 19:05:25,322] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_11_optim_states.pt. +12: [2023-03-16 19:05:25,322] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 101 +23: [2023-03-16 19:05:25,322] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 187 +23: [2023-03-16 19:05:25,324] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 191 +19: [2023-03-16 19:05:25,325] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_11_optim_states.pt. +19: [2023-03-16 19:05:25,325] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 155 +18: [2023-03-16 19:05:25,326] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 150 +19: [2023-03-16 19:05:25,327] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_10_optim_states.pt. +19: [2023-03-16 19:05:25,328] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 154 + 8: [2023-03-16 19:05:25,331] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_06_optim_states.pt. + 8: [2023-03-16 19:05:25,332] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 70 +23: [2023-03-16 19:05:25,335] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 187 +19: [2023-03-16 19:05:25,340] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 155 +19: [2023-03-16 19:05:25,341] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 154 +15: [2023-03-16 19:05:25,342] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_05_optim_states.pt. +15: [2023-03-16 19:05:25,342] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 125 +13: [2023-03-16 19:05:25,343] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_05_optim_states.pt. +13: [2023-03-16 19:05:25,344] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 109 + 8: [2023-03-16 19:05:25,345] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 70 +18: [2023-03-16 19:05:25,350] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_10_optim_states.pt. +18: [2023-03-16 19:05:25,350] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 146 +14: [2023-03-16 19:05:25,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_05_optim_states.pt. +14: [2023-03-16 19:05:25,351] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 113 +16: [2023-03-16 19:05:25,351] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_10_optim_states.pt. +16: [2023-03-16 19:05:25,352] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 134 + 8: [2023-03-16 19:05:25,355] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_06_optim_states.pt. + 8: [2023-03-16 19:05:25,355] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 66 +15: [2023-03-16 19:05:25,356] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 125 +22: [2023-03-16 19:05:25,356] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_11_optim_states.pt. +22: [2023-03-16 19:05:25,356] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 179 +13: [2023-03-16 19:05:25,356] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 109 +16: [2023-03-16 19:05:25,362] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 134 +18: [2023-03-16 19:05:25,362] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 146 +22: [2023-03-16 19:05:25,362] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_09_optim_states.pt. +22: [2023-03-16 19:05:25,362] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 177 +14: [2023-03-16 19:05:25,363] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 113 +11: [2023-03-16 19:05:25,364] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_05_optim_states.pt. +11: [2023-03-16 19:05:25,364] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 93 +17: [2023-03-16 19:05:25,366] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_09_optim_states.pt. +17: [2023-03-16 19:05:25,366] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 141 +12: [2023-03-16 19:05:25,367] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_05_optim_states.pt. +12: [2023-03-16 19:05:25,367] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 97 + 8: [2023-03-16 19:05:25,368] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 66 +17: [2023-03-16 19:05:25,370] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_11_optim_states.pt. +17: [2023-03-16 19:05:25,370] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 139 +22: [2023-03-16 19:05:25,372] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 179 +18: [2023-03-16 19:05:25,376] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_11_optim_states.pt. +18: [2023-03-16 19:05:25,376] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 151 +22: [2023-03-16 19:05:25,377] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 177 +11: [2023-03-16 19:05:25,377] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 93 +17: [2023-03-16 19:05:25,378] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 141 +12: [2023-03-16 19:05:25,380] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 97 +17: [2023-03-16 19:05:25,383] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 139 +20: [2023-03-16 19:05:25,383] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_11_optim_states.pt. +20: [2023-03-16 19:05:25,383] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 163 +20: [2023-03-16 19:05:25,387] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_10_optim_states.pt. +20: [2023-03-16 19:05:25,387] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 166 +18: [2023-03-16 19:05:25,389] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 151 +10: [2023-03-16 19:05:25,390] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_05_optim_states.pt. +10: [2023-03-16 19:05:25,390] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 81 +13: [2023-03-16 19:05:25,391] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_05_optim_states.pt. +13: [2023-03-16 19:05:25,391] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 105 +15: [2023-03-16 19:05:25,399] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_05_optim_states.pt. +15: [2023-03-16 19:05:25,399] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 121 +20: [2023-03-16 19:05:25,400] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 163 +20: [2023-03-16 19:05:25,401] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 166 +14: [2023-03-16 19:05:25,403] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_05_optim_states.pt. +14: [2023-03-16 19:05:25,403] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 117 +13: [2023-03-16 19:05:25,406] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 105 +19: [2023-03-16 19:05:25,407] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_09_optim_states.pt. +19: [2023-03-16 19:05:25,407] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 153 +13: [2023-03-16 19:05:25,409] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_06_optim_states.pt. +13: [2023-03-16 19:05:25,409] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 106 +15: [2023-03-16 19:05:25,411] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 121 +10: [2023-03-16 19:05:25,411] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 81 +23: [2023-03-16 19:05:25,414] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_10_optim_states.pt. +23: [2023-03-16 19:05:25,414] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 186 +14: [2023-03-16 19:05:25,415] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 117 +19: [2023-03-16 19:05:25,420] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 153 +13: [2023-03-16 19:05:25,423] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 106 +23: [2023-03-16 19:05:25,425] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 186 +22: [2023-03-16 19:05:25,433] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_10_optim_states.pt. +22: [2023-03-16 19:05:25,434] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 182 + 8: [2023-03-16 19:05:25,438] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_05_optim_states.pt. + 8: [2023-03-16 19:05:25,438] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 69 +20: [2023-03-16 19:05:25,442] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_11_optim_states.pt. +20: [2023-03-16 19:05:25,442] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 167 +22: [2023-03-16 19:05:25,447] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 182 + 8: [2023-03-16 19:05:25,448] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 69 +20: [2023-03-16 19:05:25,459] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 167 + 9: [2023-03-16 19:05:25,470] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_05_optim_states.pt. + 9: [2023-03-16 19:05:25,470] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 77 +10: [2023-03-16 19:05:25,482] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_05_optim_states.pt. +10: [2023-03-16 19:05:25,482] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 85 + 9: [2023-03-16 19:05:25,482] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 77 +10: [2023-03-16 19:05:25,492] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 85 + 4: [2023-03-16 19:05:25,549] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. + 4: [2023-03-16 19:05:25,550] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 36 + 4: [2023-03-16 19:05:25,562] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 36 + 3: [2023-03-16 19:05:25,574] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. + 3: [2023-03-16 19:05:25,574] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 31 + 0: [2023-03-16 19:05:25,582] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. + 0: [2023-03-16 19:05:25,582] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 0 + 4: [2023-03-16 19:05:25,585] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. + 4: [2023-03-16 19:05:25,585] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 32 + 3: [2023-03-16 19:05:25,588] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 31 + 0: [2023-03-16 19:05:25,596] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 0 + 4: [2023-03-16 19:05:25,598] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 32 + 5: [2023-03-16 19:05:25,606] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. + 5: [2023-03-16 19:05:25,606] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 44 + 0: could not find arguments in the checkpoint ... + 0: checkpoint version 3.0 + 1: [2023-03-16 19:05:25,613] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. + 1: [2023-03-16 19:05:25,614] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 12 + 7: [2023-03-16 19:05:25,614] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. + 7: [2023-03-16 19:05:25,614] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 60 + 5: [2023-03-16 19:05:25,620] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 44 + 2: [2023-03-16 19:05:25,624] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. + 2: [2023-03-16 19:05:25,625] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 20 + 1: [2023-03-16 19:05:25,627] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 12 + 7: [2023-03-16 19:05:25,628] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 60 + 7: [2023-03-16 19:05:25,631] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. + 7: [2023-03-16 19:05:25,631] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 56 + 2: [2023-03-16 19:05:25,638] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 20 + 7: [2023-03-16 19:05:25,643] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 56 + 6: [2023-03-16 19:05:25,648] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. + 6: [2023-03-16 19:05:25,648] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 52 + 6: [2023-03-16 19:05:25,652] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. + 6: [2023-03-16 19:05:25,652] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 48 + 0: [2023-03-16 19:05:25,661] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. + 0: [2023-03-16 19:05:25,661] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 4 + 6: [2023-03-16 19:05:25,662] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 52 + 6: [2023-03-16 19:05:25,667] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 48 + 0: [2023-03-16 19:05:25,672] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 4 + 3: [2023-03-16 19:05:25,697] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. + 3: [2023-03-16 19:05:25,697] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 28 + 1: [2023-03-16 19:05:25,710] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. + 1: [2023-03-16 19:05:25,710] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 15 + 3: [2023-03-16 19:05:25,711] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 28 + 6: [2023-03-16 19:05:25,712] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. + 6: [2023-03-16 19:05:25,712] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 49 + 3: [2023-03-16 19:05:25,717] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. + 3: [2023-03-16 19:05:25,717] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 24 + 1: [2023-03-16 19:05:25,722] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 15 + 6: [2023-03-16 19:05:25,726] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 49 + 7: [2023-03-16 19:05:25,726] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. + 7: [2023-03-16 19:05:25,726] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 59 + 3: [2023-03-16 19:05:25,730] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 24 + 7: [2023-03-16 19:05:25,738] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. + 7: [2023-03-16 19:05:25,738] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 62 + 7: [2023-03-16 19:05:25,742] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 59 + 2: [2023-03-16 19:05:25,750] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. + 2: [2023-03-16 19:05:25,751] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 21 + 7: [2023-03-16 19:05:25,753] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 62 + 1: [2023-03-16 19:05:25,759] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. + 1: [2023-03-16 19:05:25,759] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 14 + 2: [2023-03-16 19:05:25,765] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 21 + 0: [2023-03-16 19:05:25,770] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. + 0: [2023-03-16 19:05:25,771] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 6 + 1: [2023-03-16 19:05:25,773] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 14 + 7: [2023-03-16 19:05:25,784] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. + 7: [2023-03-16 19:05:25,784] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 61 + 0: [2023-03-16 19:05:25,785] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 6 + 0: [2023-03-16 19:05:25,792] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. + 0: [2023-03-16 19:05:25,792] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 5 + 4: [2023-03-16 19:05:25,796] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. + 4: [2023-03-16 19:05:25,796] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 34 + 7: [2023-03-16 19:05:25,801] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 61 + 3: [2023-03-16 19:05:25,802] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. + 3: [2023-03-16 19:05:25,802] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 27 + 0: [2023-03-16 19:05:25,805] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 5 + 1: [2023-03-16 19:05:25,806] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. + 1: [2023-03-16 19:05:25,806] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 8 + 6: [2023-03-16 19:05:25,808] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. + 6: [2023-03-16 19:05:25,808] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 55 + 1: [2023-03-16 19:05:25,810] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. + 1: [2023-03-16 19:05:25,810] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 13 + 4: [2023-03-16 19:05:25,812] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 34 + 3: [2023-03-16 19:05:25,814] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 27 + 1: [2023-03-16 19:05:25,820] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 8 + 1: [2023-03-16 19:05:25,824] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 13 + 6: [2023-03-16 19:05:25,825] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 55 + 4: [2023-03-16 19:05:25,829] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. + 4: [2023-03-16 19:05:25,829] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 39 + 7: [2023-03-16 19:05:25,833] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. + 7: [2023-03-16 19:05:25,833] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 63 + 4: [2023-03-16 19:05:25,841] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 39 + 7: [2023-03-16 19:05:25,848] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 63 + 5: [2023-03-16 19:05:25,852] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. + 5: [2023-03-16 19:05:25,852] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 46 + 1: [2023-03-16 19:05:25,856] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. + 1: [2023-03-16 19:05:25,857] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 11 + 5: [2023-03-16 19:05:25,866] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 46 + 3: [2023-03-16 19:05:25,871] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. + 3: [2023-03-16 19:05:25,872] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 30 + 1: [2023-03-16 19:05:25,872] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 11 + 4: [2023-03-16 19:05:25,872] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. + 4: [2023-03-16 19:05:25,872] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 38 + 4: [2023-03-16 19:05:25,884] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. + 4: [2023-03-16 19:05:25,884] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 35 + 7: [2023-03-16 19:05:25,885] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. + 7: [2023-03-16 19:05:25,885] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 57 + 4: [2023-03-16 19:05:25,885] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 38 + 3: [2023-03-16 19:05:25,885] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 30 + 6: [2023-03-16 19:05:25,887] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. + 6: [2023-03-16 19:05:25,887] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 51 + 7: [2023-03-16 19:05:25,894] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. + 7: [2023-03-16 19:05:25,894] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 58 + 6: [2023-03-16 19:05:25,899] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 51 + 4: [2023-03-16 19:05:25,899] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 35 + 7: [2023-03-16 19:05:25,901] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 57 + 2: [2023-03-16 19:05:25,901] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. + 2: [2023-03-16 19:05:25,901] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 19 + 2: [2023-03-16 19:05:25,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. + 2: [2023-03-16 19:05:25,908] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 17 + 2: [2023-03-16 19:05:25,908] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. + 2: [2023-03-16 19:05:25,909] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 18 + 1: [2023-03-16 19:05:25,909] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. + 1: [2023-03-16 19:05:25,909] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 10 + 7: [2023-03-16 19:05:25,910] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 58 + 2: [2023-03-16 19:05:25,916] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 19 + 6: [2023-03-16 19:05:25,916] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. + 6: [2023-03-16 19:05:25,916] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 53 + 2: [2023-03-16 19:05:25,917] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. + 2: [2023-03-16 19:05:25,917] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 16 + 5: [2023-03-16 19:05:25,921] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. + 5: [2023-03-16 19:05:25,922] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 47 + 2: [2023-03-16 19:05:25,924] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 18 + 2: [2023-03-16 19:05:25,924] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 17 + 1: [2023-03-16 19:05:25,927] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 10 + 4: [2023-03-16 19:05:25,928] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. + 4: [2023-03-16 19:05:25,928] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 33 + 2: [2023-03-16 19:05:25,931] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 16 + 6: [2023-03-16 19:05:25,932] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 53 + 5: [2023-03-16 19:05:25,934] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 47 + 0: [2023-03-16 19:05:25,935] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. + 0: [2023-03-16 19:05:25,935] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 3 + 3: [2023-03-16 19:05:25,939] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. + 3: [2023-03-16 19:05:25,939] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 25 + 4: [2023-03-16 19:05:25,945] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 33 + 0: [2023-03-16 19:05:25,949] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 3 + 6: [2023-03-16 19:05:25,950] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. + 6: [2023-03-16 19:05:25,950] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 50 + 2: [2023-03-16 19:05:25,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. + 5: [2023-03-16 19:05:25,953] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. + 2: [2023-03-16 19:05:25,953] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 23 + 5: [2023-03-16 19:05:25,953] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 43 + 3: [2023-03-16 19:05:25,954] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 25 + 3: [2023-03-16 19:05:25,957] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. + 3: [2023-03-16 19:05:25,957] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 29 + 3: [2023-03-16 19:05:25,959] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. + 3: [2023-03-16 19:05:25,959] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 26 + 1: [2023-03-16 19:05:25,964] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. + 1: [2023-03-16 19:05:25,964] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 9 + 6: [2023-03-16 19:05:25,964] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 50 + 2: [2023-03-16 19:05:25,967] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 23 + 5: [2023-03-16 19:05:25,968] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 43 + 3: [2023-03-16 19:05:25,972] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 29 + 3: [2023-03-16 19:05:25,974] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 26 + 1: [2023-03-16 19:05:25,979] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 9 + 5: [2023-03-16 19:05:25,989] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. + 5: [2023-03-16 19:05:25,989] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 42 + 2: [2023-03-16 19:05:25,992] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. + 2: [2023-03-16 19:05:25,992] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 22 + 4: [2023-03-16 19:05:26,005] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. + 4: [2023-03-16 19:05:26,005] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 37 + 5: [2023-03-16 19:05:26,005] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 42 + 0: [2023-03-16 19:05:26,006] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. + 0: [2023-03-16 19:05:26,006] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 7 + 0: [2023-03-16 19:05:26,007] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. + 0: [2023-03-16 19:05:26,007] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 1 + 2: [2023-03-16 19:05:26,007] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 22 + 4: [2023-03-16 19:05:26,019] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 37 + 0: [2023-03-16 19:05:26,025] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 7 + 0: [2023-03-16 19:05:26,026] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 1 + 5: [2023-03-16 19:05:26,027] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. + 5: [2023-03-16 19:05:26,028] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 41 + 5: [2023-03-16 19:05:26,042] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 41 + 5: [2023-03-16 19:05:26,059] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. + 5: [2023-03-16 19:05:26,060] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 45 + 5: [2023-03-16 19:05:26,074] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 45 + 0: [2023-03-16 19:05:26,100] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. + 0: [2023-03-16 19:05:26,101] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 2 + 0: [2023-03-16 19:05:26,117] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 2 + 9: [2023-03-16 19:05:26,144] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_06_optim_states.pt. + 9: [2023-03-16 19:05:26,145] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 74 + 9: [2023-03-16 19:05:26,157] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 74 +16: [2023-03-16 19:05:26,687] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_08_optim_states.pt. +16: [2023-03-16 19:05:26,688] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 132 +16: [2023-03-16 19:05:26,699] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 132 + 6: [2023-03-16 19:05:26,845] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. + 6: [2023-03-16 19:05:26,846] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 54 + 6: [2023-03-16 19:05:26,859] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 54 +16: [2023-03-16 19:05:27,048] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_09_optim_states.pt. +16: [2023-03-16 19:05:27,048] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 133 +16: [2023-03-16 19:05:27,063] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 133 +16: [2023-03-16 19:05:27,152] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_11_optim_states.pt. +16: [2023-03-16 19:05:27,153] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 135 +16: [2023-03-16 19:05:27,166] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 135 +16: [2023-03-16 19:05:27,471] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from checkpoints_8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_09_optim_states.pt. +16: [2023-03-16 19:05:27,471] [INFO] [engine.py:2844:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 129 +16: [2023-03-16 19:05:27,484] [INFO] [engine.py:2784:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 129 + 0: successfully loaded checkpoint from checkpoints_8b712b400m at iteration 0 +31: time (ms) | load-checkpoint: 8239.78 + 0: estimated model parameters: 9.828646912 + 0: estimated model parameters without embeddings: 8.863956992 + 0: [after model, optimizer, and learning rate scheduler are built] datetime: 2023-03-16 19:05:27 + 0: > building train, validation, and test datasets ... + 0: > datasets target sizes (minimum size): + 0: train: 1 + 0: validation: 102400 + 0: test: 102400 + 0: > building train, validation, and test datasets for GPT ... + 0: > building dataset index ... + 0: reading sizes... + 0: reading pointers... + 0: reading document index... + 0: creating numpy buffer of mmap... + 0: creating memory view of numpy buffer... + 0: > finished creating indexed dataset in 0.008569 seconds + 0: number of documents: 835726 + 0: > dataset split: + 0: train: + 0: document indices in [0, 835726) total of 835726 documents + 0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_doc_idx.npy + 0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_sample_idx.npy + 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_400M_text_document_train_indexmap_1ns_2048sl_1234s_shuffle_idx.npy + 0: loaded indexed file in 0.075 seconds + 0: total number of samples: 195101 + 0: total number of epochs: 1 + 0: > building dataset index ... + 0: reading sizes... + 0: reading pointers... + 0: reading document index... + 0: creating numpy buffer of mmap... + 0: creating memory view of numpy buffer... + 0: > finished creating indexed dataset in 0.007390 seconds + 0: number of documents: 364608 + 0: > dataset split: + 0: validation: + 0: document indices in [0, 364608) total of 364608 documents + 0: > loading doc-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_102400ns_2048sl_1234s_doc_idx.npy + 0: > loading sample-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_102400ns_2048sl_1234s_sample_idx.npy + 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document_validation_indexmap_102400ns_2048sl_1234s_shuffle_idx.npy + 0: loaded indexed file in 0.080 seconds + 0: total number of samples: 169955 + 0: total number of epochs: 2 + 0: > finished creating GPT datasets ... + 0: [after dataloaders are built] datetime: 2023-03-16 19:05:29 + 0: done with setup ... + 0: training ... +31: time (ms) | model-and-optimizer-setup: 15009.47 | train/valid/test-data-iterators-setup: 1271.19 + 0: [after training is done] datetime: 2023-03-16 19:05:29 +31: ----------------------------------------------------------------------------------------------------------------- +31: validation loss at the end of training for val data | lm loss value: 4.538278E+00 | lm loss PPL: 9.352957E+01 | +31: ----------------------------------------------------------------------------------------------------------------- +END 3324489: Thu 16 Mar 2023 07:11:28 PM EET diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4d46b59afb80637aad87d988a6155445d010bfb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14565ef5cd79eda8a6ae010d37a27872fc4fb632a367abdff819eec41cebb0a3 +size 460722583 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..65d4eaefde5356db3ec8a8e683da469e2eac0619 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178205554b04a65727d102939ded69fd079dcf8e37588498063ff053db8000da +size 460722583 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43b4f9d31afada9dde672d724b363c8ca914a3a4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0870a711719831ed3c41f62f9d36517bc802cfef326253c2951c8c217ec1b3dd +size 460722583 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cad615cbb4dbf32cd78a5168ce5e59e65d3dd5e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:243130c27860693ca609e7ee48391026eda129efb1ab7a80b2a107a591d24765 +size 460722583 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f6985f9ea5037c2db943b49e4709b2f94798fbe --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a7a875e8a89e2e0e8d668f7c1733ec6c2da042e92f39753655979916ab66df +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e7da1007ee1679703b4ff6beb4f89aad40d30b1 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30aa810e14874448d4acbb5040ab896a507af566f083cfa6aab49352caec677b +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d4030466cb120b44f0693df2e34defac4c47a9a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fca6f6ba8d3d5d3be0265c892fe41dda218d2e55140369dbd98fe1a80170869 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8bb26526792fe6c27f158e7b7b4388462811f2d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1b96e02b8b499edc28e3306a633716e60742ff52b27ce17d684d52019a0ef5 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a521a2d9233cf95b615a29725a8a26b2f5d0c26e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4080ba240a6f3d9b7517e6421a827237f71254f1b420329708ae3e52d310261d +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6e107678269ec9c4b93e09139b027a9419221b4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a2046ce23e846a3ba7a4eac373b768015e746702dfdc16c419bfb6996b7a87 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0beb85adead87faf47dc929eeced8cc51ccfe12 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ababc22ab802d63bc5dba43c35844cbd8089c90b3ce9c05086fa9817fce0cf2 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dd903f9624cea023909dea642100127a26be127 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19022d7e4043068a45420ac65e6ec9db45fb3bde99bc76990d7ddc95209c52dc +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..585f5252d2a74977e1dfc693ebeac9e430219622 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e60fbfc604d3ed03c8f93dd12d11d4f28da2e6d8484176226ae16a6f18d5ba5 +size 385183575 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ee5a0448ad423eeb496104874fc644380f0cdb5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2906f418ee5e275f429c21d39a1b25417af4ee66188e7dc9b3d6d4dbd5708499 +size 385183575 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e22fffbd58e5f16b4636a6f5bd40eda1f8658d7e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd9fe646fc3c868e19c8678359dead427ebbb11f2c1d7d58699952f1e9ff6054 +size 385183575 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8468898844215be72f26e5baeb0b69c5aa3280e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_0_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c4c6143986e58c4c058e7ca0489243b9203acffc1ecedd45083d6cbc7500b4e +size 385183575 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1fd79b84d8015df8a6fe3cf3233f4d006252cd8 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b74d9acf066888c4a4df73963e2d99a24a11cc84485bed24e28b8ca171f2ffdc +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fe63b35a43427267c45cd845b32a87b7f722ae8 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15bd670079e902040c68d730dc4bf9132421facfd962f84a13ac7247b798c6dd +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9400e6d9b7d8c5c67534e15d6bdd4b105b76b261 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379429ebe7765d966e56b23e7a26209547d6e8dd483a85f1bce782085f55f2bb +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5cd5d7e37ecc2ca996aaac12ed28c8e2b1069fc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d168a28cc4078a59f1e5c09297092634e42ec43eca05499b2b51935dff1972c0 +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5a3caecdba6448253199ec754eed0869b91b3e5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d40389747a77ae7ead86062056e2f0093a6674910ea29ff300058d387a131d3 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ded57a4ad1ae729a1d7e95f9bfc3c4a6c75914ad --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38b1e1d26aa041c4264641fda8c4b3d9886145883759510e591a53cd6998e9e1 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0298d3d4e43cd0ba97d3f5fbe3aebc193bac122 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2d544db634e96d284879af058e852dfd67de5870a025301d32ffa8dc944f05e +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1ed6cc5642aec99bda8a33787a4224e0491e544 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e29b4841297c12ac0b7e10b75fb66c6de6fff212220c631ca677fa460ec2af20 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dec5973cbdf72c3b1598f75f1b0a9b271f08678 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a5f9faa79fb94b2f73a6638bcf0a2deea50e5c28660e0e55d72898059c4e4b4 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31c39147d70a6b94a9e815657a7b8bf8fa83aab9 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5804d8970e3cef103b22c1eebc722ce4577bd92e94cc9acf54795208967d0d56 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e0abca74fa929fe85533a2400d05511776d82eb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaaac8a27df6f97ab646d779fdab41c73325612a3cd5d20e6bb5dfb6eb6b7bc4 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..830f3a3b74a2adb374b605cfd4e3a4a581698c7c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d88100e24d301804727526a66db97e88d6255309f2570020f119f016cf93dfd7 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4329e5f8ae172dfd7f273e45ca69c25368a2c11e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a152b671cc123e9d930bee7fc6aed67457f374d4a56a369c4c8ab47bbdec14 +size 385183842 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4b4d9a5f175f374b57a61c141a1985c8e096ff8 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205c917c4acd1c4454bbd244b6adc94538c2b60ab8ea51dda18f4ccbb7a717e6 +size 385183842 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7582854bca1f3362fd43a631617ee58b3a81255 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87dd952d5e9719b64bcb96301c791641e6ee96a72e049ef837e0fc63c7efbda9 +size 385183842 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0f43340e2276cca3f3afbd4d65ad6f9fd372198 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_10_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d73c52740632287a9e047e00087f60384eb111a500a0d4c3009239b0514bcaab +size 385183842 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2015bb64ed35e585bc2686f4435cb09728bfbd9d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b3b479f68b54825e1385b8645eb58d4e1d41e3ad5e7fe81eca60faeb0ec861 +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfc0dd16bf685852eb4523ae995f29f3d5a1eefd --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70eb1711c81097ba8858626e75287dbe339e81221e84cf9993ae9f9e1479732f +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd16526c152a781d37bec082301aa3d401d42e61 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d556e68576ccb4b4127b60f0c89d77689347238cebf7315b69687d728282f7cf +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..153d988e28e949ef10c9e93293f77a1177b02a26 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0dc3a4779506b3e728b7c36096604a96c7875f74aa5b12319319e80fec3badb +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cef477dbff5dfeac884c20566330630ea602405 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816b980edb65334831cea5ef3e780324df440a746865e4bfd249e7c27264fe41 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9c824e0fc9a9594e42bf3a0ab017054e4c5abe4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ae53fa6e8d79d2ba4b198fff176db64e1b99ceda52d5527e3e891441750e87c +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd7bcaa545a0854b5ff37a6b7b595ac5bd620b14 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b990913d9bc7404ca87add12134547f44bbb0e6d21b7aff507cf81952f571732 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4271ebb4aaf7ea7d8f14ce658340b23d1bc8216e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ab5564777888845e8696b1b4f5a4ca8292791a8975c362b32f4c0a1e7db5ca +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..713783f1fdda3c4a0f5a2bc91c3935daf0736b40 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59358528ae338f6d5fb27f2af5b54ae524769e7469b400b91be3cfb384cea198 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..76ab5aec0070a66052385eeb01795c277c9fe295 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f449e908b8a4300fa83984a4b0f7d4eb7cf25b26637973d0b71050af404a4a4 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fc4c25664d816aa9aeedbd551bc8ff2de83f7cb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8cf90e3b68037f63ba7d716c207bc7c5646a8878d598b5a9bc3bfb9ca1bc6b +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d6abbfc97d316082ed8c7254e4db14501dd5a99 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73d5f29c997a020a81787877707f1d1c13322955dd7bd872d1edcbd08b144faa +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f12419f562bae4ebe16f60bc3f11d628b6de0b3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c744ca7e729ebd6ee5ca06a06ab1d5f32720d07c575efd028838df761a64ef49 +size 385183650 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..78412f0faa84072004b663941a00e4ba9fc10dd6 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c775e8e723647b577bb7cbfccdebb9d2f2699eb7dd54b8cbf863099c594f559 +size 385183650 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..09e1737651ed6c2c3411bfcc704a4ab3b9de163a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e315c2a792aa4f2b112bcbf7ce94cb9c1562f84920eb3df8e880e194123532 +size 385183650 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4253c32c21d195e754a66fe873889c86ce9a60bc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_11_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c63366673eb8f7e20591fdf0a4a0373328c764b4dbbc7b16475e6283d0ffe10 +size 385183650 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..929732ae3326c4891b8e8dafed1e1a1452fccd93 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c1594623407c019c8268f6a06b4e61cd3c9ad2f5fc369eaab6246c044a9375 +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9af0f881f9b387c240333196ba27aa8149c40e3e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e43057728e76adbf3e11572203b68d8c262698904c589158884b2b1c908d902 +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae8deceec75a3861518928332989b6e73a4e3f8a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dedd957b3b1bf09fbabde5e2ea75e93d56af27bafad50ee0022eed469523006 +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6705fc467f80eae7a9292cc9d074622df965af0b --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893bc14deeecd4a0555ec983ec9085dffb48521e8382e808710a28cf01af6207 +size 460722786 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ae95e7d601e734f75875b337bfa1148e25606bd --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19401dd1ba25b2e4c2f321ea192f3258c2d8c9999c066261af255414ac5e1cb1 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8408f2d121412c8c86e8c3cf0f43e63e6d61382f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad1585bd6e5a9d054b7ff9d4fcec9fb20e339810991e493fd861fd4036cd368 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..415ff97ea1aca9c976666905b01b87fa7cad280e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2150e70919f3d685658131a2e83984d74511d3f783e9915fbde58ac815276f +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a25add5488844707758a6e8aec0624b744a37a9d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b491fba9888966a6090f7117bb7ff79b297c89b13617bf9a1dc7082a0e316ee9 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..639bd7d74ac38122fc451976e2039e61b6359d05 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4db1addf8757c11bd50348bb7669abc96c9fb437270fe4ca9daf3eb0594716c +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70033766f0ef66acecf00de16023a6f7947434ac --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e2775e794f546dbfa9dbdee3a74ef04e90a3f47bae5a2cf10bda5a03edbbad0 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb36b1717aeb0b39a91e2a6a3fe3acb26032c3bc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44b8c7805043a9494269cc3d9b34f814ce93c0a1fd60049086c70e0e8c21b3e +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..915a3f6d70e44988258c640be2470615e9096a78 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79fd6d2bc35f6f5d389c201e237237f1627dfe1bd9ce47e6b6f27f8bf8fda50c +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a59a9e91018d3ada39991d64726c98abd739a8e7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ceb45126c28b35a431cc09dea61e083ac50da3c9a9da5b63909877338df982 +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2860e5766d4b6b1ee6c61993edc3588fb56d7ee4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75520fd68443e94748dda7963653083d4b97da3fa675edbb5c38e08a8f17e908 +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..faf9cb66ffada07b62a52f6751e36d63c1f1fbe0 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da38511566d54fb203d4b0fc66ce59756b016b8b89b664e1011f749c99dccc60 +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f246edb869046e89c78cdbae1d1c45ccbbf32d33 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_12_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187996a87f45bff6b151509f05baa7c2652bd719b4e2cc5f069987ea4d9a9182 +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb16061b7843bdc0238b685e1d516b9edc0f080e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aeab0cfa3ff8ebffc83cf80af6ad754be4d163b47de2472f2e0e03cfd356447c +size 460722850 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6552724ef61e10ae403e171cfb54931d16e7b78 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d299c22b79687cec0e0d338958c463d5096060fc43c74ef42b597f84ef7377 +size 460722850 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53a14bd0e52c8e3591481afd2297fb5700a7274b --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1544369aec169946a95469e68086d6df4fee9e1f0f396db4c11113c690bbba28 +size 460722850 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb7661f0419ee2fd52394cefa5cbdba28fc4b3af --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fe7ef52bd6591aa5195d7e6c1ac17fa468daff8ca8b97677793a44556a53764 +size 460722850 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b86465062863dd2e36456573b45b5b01f154aa29 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ff1e11f0243b4ebfa12e0c7c8abb0500c9ed660a4129d80921706578f221e6 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..834e2663202d09d28221a35c8b0f9d60120cfc57 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494ed93a91d7e3278eeb496a8caa718e66f6c294f18f75c6f7aad5652d002aa2 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8811a2e6cf5483e2ebd34914fa980957c04b50a0 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83801d8595bb2de5ed8284640bcb394bcd80083caead47c568cd37e49ef6af1f +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a48705f7a53d1e0336f36c14dd98dda12a5f2be4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eca6b4865419fed0698587c69c367872d429c3f53068576ef2d19b7a7a863b0 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5bd735b29e5fab314c9571fb489f6ff0c99c9416 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b9323e0a5a6be09f2fd35a336eac266d67d6de7905dff2dccd9afac07290962 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ac35a32dba57dacf02f47eaacf0e88332bc93a5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c980df1c3bbed7d560753505ea6fbe92d81763cf504bc1350c5782380f1f5c +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe4e909043ab8979556da5ca12870ca4b5c169a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:281efbaabe395ec3fdd7ac04211b2f8eb337feacaf77c736d026fce281eae836 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31f1cb7e5558b90584ba5c0c52f422c2af4108fa --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b56c43f4f194724238f2074558f869b2d986a2042a9c69e7e132d1bd691659 +size 415502882 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..546b946cfede8e22916f35bed5d10d20c75e78ba --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a065a436127f7c18ffce0faa5a15447ebe1d01fc6d59698d092dff8ffa92b2 +size 385183778 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75dc766c61d0a9aac4372ba7c29bbc48abe7d1cd --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3154a574a6a05c27dffa7ef8dcdd18534a96b1f031d4c16d264e2bb5a5ac241c +size 385183778 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55276e740d86acddd5429181e0397d5c118162d4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ade1cba3d2c2bb9b76376c18e7ef2c24ae034cce474fe25deb1e8a754bf93b +size 385183778 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..31a1b0643f096891602493352f20047c0419e5a7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_13_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6d7964553c9a199d9eae2d8c68f0e891b5e3098f7ec4421a63091e4aea0bc5 +size 385183778 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3aa556ea33f1e0cfcad30b8e77abf757fa5b019f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d4e31ac13fc523ace0450113e226903a15beca6a0136ac4197a7ee3425275c1 +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1404038e267e10dec32c0d6ff593b1a82638cdc4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:436afac6b30ce5a56ffe7535f15d034750e852b261edbe153771ca477a996614 +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9632cf4f643705de2dbfa2d66de3caed17775afd --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dcb62cb1d84bf78a0d5df671caf93f05c724e0c1da28a0cd49e229f91e4f00c +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73c7e5a02e8aef3b777068bd6fa07082cfb6919f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113aa6312fbe2a95af2fd02fc3be3edbaece7c9994455535f4d0d497c06fad82 +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..74a276b4fe58d7a60cfb9eef328d6388f0d82313 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274d61d30dfaa8220b1ac6c5b6167c03c9b90a23545b99331fc5e39748ae0e72 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3b7dae9ec36581578c4d2e68be79013503e30ca --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622544aaa799cd4bc8f8f5c8efbcc50d8b1b55d146b20fe43819baad2221408a +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c60c22da8c014a89e812d03f8386a9feda3f341 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1732dd3718b4980a5a4faaa56bfd9531cde52fd38059c3790139a18bb3379ccf +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51d4b77f4b4f020d4a1343f828951275061fb269 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc355b7a375c968b66412350a96826f9aa9ebc6c9c9c71467b7c97b50c42181b +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35a1ea1e3b9f4be61026d08c785d6c367a23b924 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ebd9dcc677bb7cd36b6ebf586b6cde7aaec13647b09b47ac3bdb5e385f604e8 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37db4b0e31b22f58e97982066e34eafc46517524 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a70005a56c525b1574bdc2e444d50407172754cc82e4bbabc4616915da557ba +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0717bfa376b90499581e17fd4505211cefc1c7d3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:363137b2f88ab3c542f3e7b21ead7124e0274689e286f930b05034bbeb4acc26 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ecf4566b91a663e612851b5ba9b9d2e66e96dff --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e346d8f3e52032d5f77db71e3e5e6165f4f2005eb2c17d90cf953a54035ab440 +size 415502754 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88e86a5318d79cd19d58c211e4a2d2d2b0d60e57 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f684608d19d9b8c6ebdb4f39fbccd3da07e918f38c613cbccb53828ef4de0cbb +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..908087c5815074bd7b7dcda65edc1e3ecb4a9b32 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad48d58c2159b903d2836c35bffd08aba143a535b25a88257c1c54ef17a0cc67 +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9564c93cbeaa0cd761c791e179a58865a8a6d22f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4804129bd7f1fac021ca2b4ce8cd433ab5bb705b68fe65c0517de36634d9319d +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9017c6cef8cdccf87b52e2a532d9037f89db7e17 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_14_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3e8df68dd3ea4e9797ebc8c8c949ae88b589f2229b5a4756ee7b2e26040c5f +size 385183714 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..97eb76904a919cbf277c179b0b66057295325ca6 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957043c064af26a9aa8084f84bfca62eaeb2bbd0cef176a27a7f1b9a40face0e +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63ed70fdee07c2c1310f052cfb7f5fdaeb3cd59c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b7731a675d621fb9b9d40855a8893b1b364e1209fdd483d31f3df5dc2713ef +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d49c8225caff7717b06ef976e463592a84569344 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a148808db617e3b035fa4dc20f4389bfed4e6688b9c36431e1365c4220765b22 +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd7e1ba2af6191801990ec2deaebcc9a5f3bda3c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f473b1a0c8684444cc087809dbec83e650828b2034d5a3933651701046e7c9c +size 460722658 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..301cd65a83c185b27907f845ab597680e10602cf --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab65abbccfc3cf6640023c9ef2976febc00c70fa5e5fea96ee21f5468b287b4 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21db8d851cee0cdb66018c4ab46468079d943b05 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80eabc925e148e3f0ed58a54a5d0c6ecdba2f4fba2994fd68c160ce7f54e502a +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6432839074c35324791c06039af15872a2d0a21 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f76cf7f02c8842c8f461989035f04a24456a4310cc57f44508d61afb69f7a56 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e42b917199a20733f4d5bbec326f6204ed21b51f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:940bdb14ca76252c7811f772bc2193798f16ebd2f11f1b0f68f4f89253c8c829 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fdae6d4979b500416ef703008d54798a1855f54 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:016e3fb5b7a97c89c56f0c36f87a3fc098f59b0147b8f7523f5299010891a103 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6246ebe724c28ea0c50370eaa15d7b5ba9ffdf15 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08587388647f1e37ff15aa3381f86b472a93ac213c906d17287b79dff2d2059e +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c12e88fbde8e0d872710e67e88336fb9c19b8f3f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f03e91a024d69340f482f0c44bf8438be12bc8b8fd72b4bee9f7535de4464f +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1452d01a210532528da63f5eb31ec2f876ea9494 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e59dbfae1af58867015df2c8d94d7fa99ae582cf842d74464ce74331d43971 +size 415502818 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04dfe34aa991da4b2d692baf32684c8a3f9a5ba9 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff028031db2116e67cc598828f183168ce77aed7892e2246e9ba6b9771c9214 +size 385183586 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05a6bddbe7bba0a4425bb826d664190e62259776 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5f77726fedb088f6c368da0583f2010e6249e3ceca37a141695329f0f9dc64 +size 385183586 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e4f094bd1cfde981177994e06dc7aa3d61a3593 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:159aa6005d8562f4c85ef7d9c0c224082f2b26da26c4018d070c5d3fd6b46b73 +size 385183586 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e10c0fc645d4637402e783c0a7d480ea535fa0b --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_15_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:880d8026cc811f14f05d30d8bd93fc2716c2e9d0951ec28dfca34e80aa1d223d +size 385183586 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..90259a3ae83bfe74353ba814b98d64fb6c5ca250 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d69d6729f623ed5b27082af91fe5b769c8d780c3de7e0c816dc1623855bdae +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d5cfa5acacfea5e40943d0779f975aab3494926 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a03d5acace2322501f227228d28960d16b9a3eb9baeced275028dc72a13f2672 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2faf0826ebe04a6c7a1beec4eb0488a06e814638 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4594400b6bef0d56b804672e70c24e0649c3e7d2b55e018760b244833580a822 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43a0fcf0b49e973ddf6f335fdbf15193270f882d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f5c33a8bb77c0becfc379587aa67320857a60105c4c339ed745c3ec9640474 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5422b1c0483807b7884cbed84c520d2cc01a6d26 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df72f23d14a9dde2a7e52a5e58d96d07fde7e1ab0df23ceedaf08e5a492eb498 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab712bfb59f9d0ac175a9198e1a21ae7333f3d10 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90250a20f5686ad1cabe881ba9849bfb1763d97d6848197371554a2940b21f44 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c99212e2be7f6b28c7f0fcf8ff1e6ef9d8f6cdaf --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e498fd340298eeb6a7bf5b58b10984905a70f865742540cdb80502191e3110 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da1fa6944639e95cd89f418f29381a9109d46cbc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7630306f689b07e18b268182105942bac3912a835c185cedf9bdffbab0d7cb47 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e24ec91782f5173bcf235ae02686e5eee79d5365 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a9e7bc31d63b990027d98edb9ab310b55ec6a6b31ea4a1c435b29e006f397e +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da23566c57816eb0cb7828d388d66a034455f2f3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a209b0b50bd3d792ae59e7c113a4f6d9a1b9daeed95c72c41eda7076d7c351ee +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8296096ba30af7d77bdfff75c598484f1177dc7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1c5ec725d961f73d5f5c77f16a2e6cdd731aec5ce7333db27bdb73bbec69d2 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4f1cd01e2aaec5577bdec8be640b76bf2da0ec4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047dec5beeada9984099e65c7bb1bdb2d615e75b2530b47887127a717a6aae63 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e74f95aac322ca134646ff1fafde91a2ef94801 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7691caebce271883f3074c32e221e9d958bb891528fc4d55e1b5d5960e84d24b +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61207cd46cbe07e485f221a1f8309f71f2e4c248 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ed538f74bab9c6bcd57290ab53bd81470ce47c417d333333333e188103de61 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..24d53a8ded604498bba65e874d7200be08bdd747 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23da8cfc888e276653d3679713d57668131fd6b46d789d89336657193437ecf6 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3393bbd7e831a38dcd98c7b762c1117ff81e4b43 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_1_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08e68f67f699f478426f4aa96f54bec77cde32cb4a730112b9edac78ac3c1399 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c30aec8f64bce4d91d1357b71d65003bad8d5fbc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:debafd6651a4ecfcdc19b7ce669278b5d94e063280f988b669bf1bc24444cd7b +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0e75ac97d10e7dc079f4005b01e75d5ad61dca3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358851487d51bd79eef2a9fd52ec6bc29802b085cdf018b1accb1093609e3787 +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf33a14194739d00aac111b03a8cfd33f2856c89 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b61872a502abf8705f72f310f51a942ddc404212030f0c46c9b4d2cbed7bb52 +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a81aa35f759645894f93b5775c46a2a5a1e25f6 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d179b1308e7c0af0e77502a262545cec2886711920378813f62f40048a4b9eb0 +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ce5261c00a89e02376929d88370a2b23c4a010f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdcae1a2f6b2ea4fee0fda9a483d6cf5b31cc7182a9ed4852ce48e3854aa5d9e +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d46581200bed2fcab5c7067094243ccd4485f3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c4cf8482d11afdb66ae1339b0e7721ebf0c4b92a0d00e1e4f2917c9c4b215a +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7aaccf16b3bc6054a7fd24e47db18837e3164bb6 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb81aa30e9a7ad75472c526e67ccd08fecf81333f13f65db74ac4c5ae62db64c +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee0def74fbd770da0e11b0816aa8b93bc10445c3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c21c2255bcf97f80d0772672c2f84512b59db1cc237df55fbb0bc0fcbb2c79 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8773d76443a84ea926c6ea1f10d9fa4fd5c4480 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e03db2a89284467f19537f13814c7524889e5af47320eb85b496a670ac77a23 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..30b8d9a478f544a44644d46187b630a257d8e544 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cc1604b95a411374f613fbf8e235834fefb81fdd94bab78b0ffbfba36ea29cb +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70ff83261b823a2dfb39c5e39d8e2d28935e6bac --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29b1e188a84b701619daa56dcd53908b0263e3c4f10c409ec068423362c7a0f +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8b6c4638b0bd645501f9d21fb4c235e18613095 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d828ae0928c25b675e6cf8bcb82e62bc385fac97d78357697a14f00d86bb96 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3f43dea792a7e7ea3bd677d81acaf29c70cab4a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2073e188507000fd9670d1cb158bcb00d8384acaaba13ec139f4b2d931c1f574 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..557245c6448cca4bdeb65c5061b70725abc413b1 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a568d5c9e472f6a899b539965599a50827215753da8d467ca5f2a4b3361d63b1 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd116054f935c3bdaf88b64c157e15c5292d2589 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2355959db3baa81a9a0829fc1bea2ba396c06bcdb5a66b11c21ccc2d9a032a1 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d70a0055e9e0294129acc1feda655f29ddc56953 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_2_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18a14b3b18bf049d4fa4856039aa27d60491356463c42fd1376c8cae73a1a92 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fea65f7403d32d8a58d1bb3449cf2df173c84146 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bfd994af1a1644c4be2b63baad73bbb6f3e8bf856cb1acf4c5793c775646d9a +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d72ac8b4a11e8994ac6718ffd84c44c5861064e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9441bfe1f494509f021b98e158ab60bfa2e4d90073f757b59e804aad212b3beb +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c299cf9bd5262f7469185a54093fed49c8ee1281 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e78ccf2692617466f1edfe0e16934dca9c8785a52d1a7b0a9fa557afecdd2ef5 +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd96ec8ebab4c3fa017cb48e05782496f8a5b99f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0f3febe0bd2fd77afe1c913a9a173b91ed661567e0326f40302406a5b4ae4d +size 460722839 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..604579d4c8ff7dfa4e9bd2c5a21be7da14133844 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d588400ff979887e5cb9ad0e145d16a3463de3c88a5931af6cac15935d408513 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9ad874591ab1e1bf2ff38c9e8d0b5c77edc8144 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085e85d21df1db5ce73b9cd955ee10a579a44e4f861533158f8484daf217e605 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c19dfe2e3f1218f97c091ad37d25973a957f4044 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acda1df3d92089e94285a0be0bc9721af62a309de28d1fc1f2b7884b1dc28098 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a5557d5cc0cb870c6d0b6b045e22e6a4e849157 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f54b3887e2ebc50638297ba962b88d4d866c0f7a6c5c7c4b5e9d245f9c003d5 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f9d58b4a061d88997d1110c0f81d347974fa121 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f246e5568514abe2bf43ce128e9f0e01064d93154253a04b866830d9b172b28d +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53d07cd7507780531a8b1bcd3cd7d999490705ff --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4a7145f784a60f406b34cc1f7795514c550272e67e22ab0e86babf2eec119f +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2ad8d4dc8b505e3f0cfa39f902a31c98b485dd0 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1997338aa71a7e8f957a51250cfbada2305d23d1ce3ec9beb4e7a7d0e42d6603 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..110ea56194b284a08c57c547f8d03da290613ea2 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1af2a1cc908856b089cbf09bf4acd0d328f51348ee09df519d5a82fef54bf494 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2335134f95d20f79a3a922acf5244045af7d2ed --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b694259d402acd72e0daed0d7a507ef0962639596b32d11f950461186e9268 +size 385183895 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c0b457a3457cdde5170ab38f288c4ca26f3cd8d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dca04913c8910f8b9ad8eec42a12979ca4d8237881910bfaf545c597195f19be +size 385183895 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..de349d45b7d6d332cfcb9641fb5579a17371055d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12fd70fe740140ef2183c5e26d197eea023b244eebc7f78ba47fbcd873d26fb7 +size 385183895 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4c238a1390998fc7d030f18b985ac1cc8255d21 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_3_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff18f79726011f77ada2fc6a4aa05ff745337ec8c2fc380926cc6d6afb57d6e0 +size 385183895 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..076d0000e8c8cc48030c55db0c27173e67a748c4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aef669ff53eec39cc0f073f8f640dd6873317b546aa3973e0163e1e7575efa2a +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cbcaa9d7e20ed7a2f265c67821ac187c685b9f5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9800783d4eca8d1c41f52199d408292b67d16239a7b2e2c95aa0c8f648a3cf87 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15fcc1109c74204b769e7319bd5e7836fc5adba3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:747bcf3afc700ecea84644bda3b4a8a4a2c52144f86132224eea18e3fe42d948 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d4420ee0a34afef87fc84f4fa3c26049b803097 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2012fbe46f67eff9afde2c23dd34edb0f361259e127e62a710c56727384a6524 +size 460722647 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..05e3720b0ea91b05fd4672f6b51c4fa4f2f061c6 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:850b86721e2dc3ed2dd59bad6e0ea3717ca5921ee0abda1474f55dd6999edd60 +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..252cf588922fd5cbafda0ee04bb02d383ce4e924 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8a4792946086ca4217021b8e039550e360e2e57e0a7d106f9950f909bf9c2c +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c97ec9fc34c0e28a2d7b3003dbe3a0df38833a9d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bafe435aedd4d71863ebd185608324e9e7f27114d0168b638d6670f5a007eda8 +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b65046bba08bb522e369ec1daeee85355513f607 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee0c66c704777ab5e0abac964a0524765b47d88d68a4771ae424bd6b43adbbd +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b54f7862b72cf9c21779eaad04f82c7d9fcb74b --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d05895900581b6fe1faec070d1df00811fc641508092971c67a06241c5103271 +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3a333097d0699507e5cf5fd000532dd59420da8 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:920612bc03b9a1c381413ad9a10b139efd3c8565ef7b38c8116958970dc00edf +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..91badc9be74bbe2c10ea50a5cc7d7016cdd55e2e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72ce4a80a5903d76e6712fece652af6da2a2b4ca568ac651807aa675b6b284fd +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d4d26a1e44b3886bcf7cdadb08c74a3bd4646a1 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5390faec2cd830793cf6ec2dba39adff705c871bccc47b01d0576e553339a63f +size 415502743 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a01a8589b0cc910a44ae92b10e261eabc5ef3ecb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:071fbe829268cddf04b0d70f12b7773cb04fb9e8f43526d578008f510e353cd7 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f1f44ddae8f8bdff00351af66e2b481f2d178de --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12529bed2032469eaeb446377d640c69261950ba1ac6b58a9e2ec0ca46ac7dba +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..57808757f0a69cb50ce54286023af1a56f386325 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb190ff9e794800c2fc43e850e2c930ae796da926a205641468bde5a52a9fd08 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9cd13e1592ae653d845dd335ba9b68018511cfb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_4_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49eff5293edd4ff42cb0be7af36f02cd48025d88318124eaf08f97b9627baf3 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..219c0649b2087698f3fc1a100e2b63693e3a7bad --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e5bc92d138f018cd3d5c50667c2a1e6ee9da813676e56137ff4d155c614fb2 +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..20f8b3ac48217633933ae262b4d37f2f19b98c49 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8033f3ae3b4f59dbc5ee7c208f1442ffe7aebd3111c52a67dafd74109aa38b1b +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..061836a3f8e10c8fc6ffd47740277e5a2385b38a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91c480cf645ed958251b324ca10e9f6cd26a7d58ef570490935f0cc85c719162 +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d33f88b93abae10d7b61c1644c2afeb358b919c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d783bdc914283a89945ced2296538ea7472b565c17d66711926fac8b165f52e +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4394b585bf3bcd3318f16fe991f3414d585b9776 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee4d7227b2c683b490c719e45ef4d232d2a7010871c5e88061606450bb92918 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a945c24ff53aaff6b29091ec745e935a1ca17d5f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c255aabc0552a71d4ed127d7f54ee8ce04ec86211b4611ff39bbe7fd69a27f6 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f11de5b9929199c24c999122f07492edf553dff --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dde085963bf3a460c4a1f4a78feca87ace889f626adb106e85258ffeb59d584 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..100e1a27b690d29a14a875e6181a4f6adac2b760 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25dc630a573b5c4ac9a930eee1890837901062a20c0ab323290f4d745d9ad140 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d572091800a9aa69c737cad8a0ec2b85e0c4a7c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08feebbbf9e19ec260930e30c122868ff84c264f279e47de7bbb61ffb515325d +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51ba816639204487dfd3d3a8be03833ce29b575c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbb0d872183b00f45f8985443453935c5e75e131c65a1dab471d13f2c58fe71 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb82b7a7c785c737d9ec43652a636b20e6cd7344 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b3f895018b46ba4fee1dd9f187d13c4bd48bde7e7e907c8f52c29571be4016 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..40bf9e4ddb83d4d84b6a9c91aeaae52e8c7426cb --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e264800e195ba6ed8e84747c6070a93ce9924ecc76eb277983a18e66542a4565 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39987735590f64f055f309f92e35287f3c39efa3 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ab33e7526e814abe6b6edfff946f6e10426d291212da9150b43461f8849436 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c22ab686fc261485591e02de59ffc767d8a52694 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3c29f6ab800affab3fe577c83d106a8134d6d36021104101996d727a9a99f83 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9eea4ba149a3e5a5d72734eb3bea08e2567ee654 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba5694bcb1a64acea02ec19e2158e120e58f05731c3fe49fc8abb79360a7753 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9f82391718b6f4f9f7323e22c6155cccd89e08d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_5_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdee9b73570629c200ec113f16b5fba057fa2c9aea13dbf180a9ee6f7ee3cb44 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f555ebcb7ddca30d4164bd61914c89861a6430e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40dfa725d39c2d31dde122e531c597e029b2511bf1447565d6b23e17c6da2ed +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6e1b107f518ee62b27a6aa1ed3b2b9656984611 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68a982c22d254ab210be8c54e168b376d7255c00a9a5b1c5283eaef37d466851 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d253c9f1e8ec310928582007f11b93124f031721 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f2d686c80526d99799a001a0bf4d064b1b51248d1d7d18f43b7c92c37c8060 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6208ea74d9c007e88f909743f8e3ffbbe75f0b7d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f5797f27d793900771d33cca6dde9674a1150375fe6721f00ef483415699805 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4de261f71f535c25fb3dada343f71111024fc5d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:909ce4267c272ff2de3a7cf5d309382ba4b186377b4c44533d51170864d13114 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..52da311f56c60a5587d8c31687f3a1ed18b5d11f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f650ad7ca648e8d17cd71474d22f3937513e5aac5ce4ef268094fb0fa3568f9 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85870fee4dd483520185be6718fa5fdd995e652a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be47aebd3efb9646212851c56994e5c623baf3fcbd048ef3da45d6e512fa6b9 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..296af947ba8e0a553886eb8b350af3a3fe618e2f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34ea19662fc1c2428a203036146009625d2f87ec2fbeffd67d7e5aab59071789 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e6bf9d07a338d62a8ecd845589032638eb83cb0 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14354d3697967d5a615c826ec32f89e3085e577f7ae67c39443a40fe13208423 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54d516e487be6e85148ac00b5da960f1b4451db5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b36b5060589d2e9e14dd8d811c3f6ede8cba8621983a3d601eec8e2bc6d522f +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae55eef8be647a4bbb391223a6df132d8c5d9ea7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38136db351db6e07222355f5593b8d1744e50523326e1c46dad890d5c67a70a9 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..88203b9ca60efb7379048e2f7c829238b453ebdf --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29dde7e2c71e8ca1be5d31939e4222b53655c49ee5741b5f2f0e8fd615a5d25 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03d8600e1d47bb95a531d3451465844cb384012c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fbf72fa261c3c2944652a42767534e57c333518efb5f03fca3e029d4cd0ed4a +size 385183703 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43085e166af79ccc26a01836d16269eb9622b63a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836bf5ecddb7befee2902e488f18aacae65ba5288645fa44e742e0529878034c +size 385183703 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..085040dff21ca5f16874f14b1c2503936cd8996d --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c1df5e4df8357014b98bbc480e9dbb395f6d501f3941b126498d74205ead25 +size 385183703 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d8dcf884ad43cdde7b67fc1acab007df19303f1 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_6_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36f6d8b3e8b04a9a08faf65478bdf7f8a04385f33227aaf3f66c6ccf5b099a60 +size 385183703 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..553f92552fe920351e8413118de40334a54a88da --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7608fd180819fac370739206b6a26bb5bbb9d11cd3b1846f1f168c6f216e1b6 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfca8753f3601bbad7c94b03c4a57c20c40734da --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f354907844ac2d21dea76b7b5217c953c5863cb9ca23505de4f8cb3b53dcff +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..119d57115e6305235b67a47728298715674e1624 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95fffb30e81838e3cfa952efe34a57925ffc7afa753b980dbe0f13b46b374c69 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..489d26ad0dce07296db89381e988979d27911905 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:326c3bab1d7883fbe47d537c65a796e8916d289eec13c18d1d13fb2bfa0ebdaa +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..acfcefb96928532c1c692616386798526f7924a2 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbb3347034a1014ddc9a2491fa10176a2c8e123c43cff42caf99dec0f9d7af5 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e189dad5174b0260e1d27f488b0c3fc09d46fdf5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:886fd922c0ae7866da238d335873b4115ad1475c6516fcc08bbf5e50917ff60a +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c14ec9170764345a051d1bbbc566516edb49e7ae --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229c6f664fd617e676fac1c146c2ccb691ddac1e7841eef1686dd91566986a8c +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99c8cc60558dd675390c24ebe760d597216741c8 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2cbb971e73e725dd2271284c7cc6558580ac1d940288090d616d8d6f959e108 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..879d3852e57deee126c1ad5a109757aca7fffe25 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbd29b4fe6f801955910426b33c8da9b47d7b07d5e60f10348a7f4730c94a706 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96cc16a1cfbaac02cddbc7e86bbd9801e27d4d05 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66964e911252fb8d8a90a61785d6ef3097edb45ee58dee4b6dbf8800364873d4 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..756b504ab5058e3214af78563388998661aa5e7f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caabfe2df3ff2fcac8497da32aa43904fb9c6e7b78e615ab1a9151404b88a0d5 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc739b0d2434846f783e900969d5da23baf20ff1 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07d1b7c4e448801a9e608ae1b194b1c90b20383c52a708d142bac6ea2046427 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2cf35499916613f6faddf9c1bdaf1d69c963b21a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95123ef2116632dda0fb91a8ca423444f85caa59ac4d61c118d84ca5a48be61b +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8e474b1d50aa7b8bd64af96f6fcc1f8f75a264b --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cffdeb4a37839251b3f99923fca62ee34f020a410480fa4777ff52aa8d944bb4 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14fbe789b335f73631c6d0c675655b1a2d529df7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94ec3ed6f8825d4d3bf128253edca0b6bd1959b2a38dde5b80f779f8b020386c +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8ed937123034f798db7c5bfe0dbabc9f774fca --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_7_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6aa5f2c0af178fe889a7ba456ee41cc69c6871d8a8b75fa9665779cd583e69b +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fd02aad1b756d985aa66d4f9f2fdb892dbe1dc4 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a31f42de8fbfb1e17b4c6ab95e955cd3884ffc5dca5627982e05d35ad2f7e70 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fed1a6381d78f4bd26fdbf0ce2448ef0faaa536 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:207cd6041d73bf6e33348881941cb633759f4f47ed45510add3255ca86e3dbf4 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..349ad039e0007138a52747cec4144d135cd018dc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d3f5574749ac80d120fc98485689590225f24a0768935343ee132ff287caae +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4897c0067e8867ebc8a7bd3be0320a41cc7666a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:562f870294c97e5652a8a7ef2768e49af5a5817c9502108a421cad6e0c421362 +size 460722775 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c4f0357ce22aca75de5380bf27c97e9238408cd --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5833bf1daa0e47f3d0dcbf61919d24a5bef7e0231af20a3723743411c2bc1499 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b32b7316e2b4d5422c46b9e84216a698483ff --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b8bacdfecf1f6ac8a34f497a9ec078d1ececceb234a708d9b55d5c4ba3fc0f +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..61202fa7706110fe3c8724356fd97bded281e634 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27fed758c39f0c6417c7d78922036412ea69f30e52482e21ab1dc3acb2f30fc1 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4deb8059814ee3abeaffc2a0c61f34fc9f24ae62 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43ca5b794f9c8448cc2d164ea352cdc5e14d1a8d69c56279782326b07f93a1e9 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..707226c254a41dd59055f6b922ac565fb3911c25 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2be0393d860b0e180e15267a256a5e6b097e14437bb8ad04c2331ab999042660 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10a483d8bc819443ce68817aae99c98fb1453a5a --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6cfa142bb2c92d69f6cf35b71443ce9d59f56f1b53a5fde85b9acc42106661c +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..14826f67ef7e7b9b787aad0fbf9550cb7b0ee341 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8425dfe52f279e7e74475e8b22c48b18d47f666a9fb86374f1670a7d7cdddcf4 +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d868d8d6e212b7989e81208bfcdf92d4e86837d7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b7d9de225ed7ab290fd4ced5a1cdcf62a7e818de1cae304d11587927356e3cb +size 415502807 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc09d67b09869465e0281d94ac8f1651c002e06e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1e2d6082c21bf442787058c764725f4cd43f0dcbdf9a06f37b5abc32c93b8e1 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cc3e5e758422de15fb575af0cfc51952b142644 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2b9cc18123cc0d8d4263b18c3bbea1bca045e56a0931b1ecb5a10102573500 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02cc9adbc24b311f6eb8e168d47f63c275faf354 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6924d19594b2d4a5f3d2f2156a7d5727ba468eb542900ed1c85f6f85c17001 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b44592da3415308822b50a5a885ae4f8d25c8bc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_8_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7718a06bafe0cb48ba2a9c475744c769f4eae92fea2649df7b071fa59fa42966 +size 385183639 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc86e3d7b561d0d87560f08731ff3aba133acede --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9852028533fb7e0676aa2504d13063ef303d821f19dbdfc6048be00830485f5a +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df0f30cb15163719e97226aeb6c05b764313d255 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19cc2bb4332d0f46b68c4bb728f7663095426476e293e4c2de592bcd063bb659 +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9eb4a68ae85198b50f492ead472ed8fc99fe3b46 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfc61119af5f4ebfa88d37319263c70485174d2eef46741597438d03a88ecc7f +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03603fc3014f2116f9055451b659b362dbc59f7c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f585ae81b0d45503ff8fcee0f33c3b420f99de4e73672c23f1833bc94592050d +size 460722711 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_04_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_04_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..318cb10acd72cf9516446be415c5cb34a4fc848e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_04_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf608276abe187b49565d0ee808c189104979fb88eb5a909fc8c5d69014c815 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_05_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_05_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c037319add333d8905daba0d1fc10c3c37678c4e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_05_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7bbd457bffd9fe504637461101fca03b5d9561d5b319fd63c4124897971b723 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_06_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_06_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48db51952c7432e0e6515aaa3e318606d96adda5 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_06_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41f03db5eecd6fc6751ceb9068671091cb8314ddbfa9be03620729182c23c17b +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_07_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_07_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5482328fc0ffdb557eb53f2e8cfc17c403bfe700 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_07_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbbd867fd44a48b27c7ee5aa344ba9f026de2134f852a61f32c3b70ad5f7adc4 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_08_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_08_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b296c0a2c120ed664794cf5124d216822fbe226f --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_08_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:956cf3e9bc1fda90d6b03a9fa3f1cb6be196aaf2f58c8ed79b34c7626ae2f1df +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_09_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_09_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1cf9e9ae5ed5f61bc29f04d9e2ca19af97b9a7e --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_09_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:650abd30478fbe90acb0878b77696703f6af4fd74e2a06b96b10e09a2c7ef504 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_10_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_10_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..07ddcb560a5cef365561650a7a19a49e87ee9f21 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_10_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f46b4de8a2b95d10c43ae5d52d234319b951e4b0ee7cc4332af028dcd88bca6 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_11_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_11_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ae204f1aae3e8da23fe18901a8f6db44aee2c9c --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_11_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e983bed06c582cab6b609a099f7f7fd7484ff5ce75212b7b147580f1faf4e7 +size 415502871 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_12_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_12_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e52003e395a57688b9a62bfa89d4fecbb5b757bc --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_12_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87644d6b61d4fc3bd7a667361a72ff9205a5aafaeabc3617736071224125ca79 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_13_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_13_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f26326fb7684c0f4cf4f4767d5bd98cd5bf4e616 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_13_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fdc2dff940fecd7a8ae6d4f79912b0819a99876c0d11a47955ad54df95141d4 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_14_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_14_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03818938cf04586ef6b55bd4675a360b89d0c1c7 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_14_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:407664bf23000d3286803ea4a77643dfa003acda638569fe019731e25a9cef93 +size 385183767 diff --git a/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_15_optim_states.pt b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_15_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6f5daaa8fe01551af0c814bd4b47bf4a02700c0 --- /dev/null +++ b/8b712b400m/global_step5494/bf16_zero_pp_rank_9_mp_rank_15_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e14db28e7384b736fe147ce6048f7fe70d383ae3985a7b8550fa6fcbf611bf2 +size 385183767 diff --git a/8b712b400m/global_step5494/layer_01-model_00-model_states.pt b/8b712b400m/global_step5494/layer_01-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfc52039ed97114e332656c0f8099ac7cac70146 --- /dev/null +++ b/8b712b400m/global_step5494/layer_01-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99275a40ccf710752f6c2ab3dbeebad73fdc9a19c93783777c3a898ac1f6f84 +size 120587523 diff --git a/8b712b400m/global_step5494/layer_01-model_01-model_states.pt b/8b712b400m/global_step5494/layer_01-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3812d52ccbb9190d7e134086effed439c0d816bb --- /dev/null +++ b/8b712b400m/global_step5494/layer_01-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05471e71961756319a4a258d796903c2c99902a248b01ce8a43f7947e1d5fb2 +size 120587523 diff --git a/8b712b400m/global_step5494/layer_01-model_02-model_states.pt b/8b712b400m/global_step5494/layer_01-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b5e6190ea88e5e812d9cf92681eb73160bc2fb9 --- /dev/null +++ b/8b712b400m/global_step5494/layer_01-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58840d0a3f3b14771bd3fe21db2ea9d309ab94e45255c106461ea3c283a4c70b +size 120587523 diff --git a/8b712b400m/global_step5494/layer_01-model_03-model_states.pt b/8b712b400m/global_step5494/layer_01-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe5edd077f1408e06f006c895be5b20ab3dd796d --- /dev/null +++ b/8b712b400m/global_step5494/layer_01-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612a0922811d5dc00411d069c44edebce6c6686bd530507fc3c5b50e9de69cf1 +size 120587523 diff --git a/8b712b400m/global_step5494/layer_03-model_00-model_states.pt b/8b712b400m/global_step5494/layer_03-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed2d25dbad8e1bdf0292e19590938991f798a775 --- /dev/null +++ b/8b712b400m/global_step5494/layer_03-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebdfa074eaa1177ab8438f514e2cf5ce215b7115c4b0a766136370f81529ae08 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_03-model_01-model_states.pt b/8b712b400m/global_step5494/layer_03-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d60b7ba8ea28f743f2038f0bab406cc24104901 --- /dev/null +++ b/8b712b400m/global_step5494/layer_03-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:359a5650ca2485f920fc454a4ae59da4c163e57d0ee297aa78123cd5cf79470f +size 100731139 diff --git a/8b712b400m/global_step5494/layer_03-model_02-model_states.pt b/8b712b400m/global_step5494/layer_03-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..917a97db03ecd148661443befb7dcf7858b18b73 --- /dev/null +++ b/8b712b400m/global_step5494/layer_03-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06bc662a74854b78d6a6c723e799022653e8b7789c088de7931a52eced20b6a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_03-model_03-model_states.pt b/8b712b400m/global_step5494/layer_03-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d85c1b903e881d709b5db436467049186e0ad07 --- /dev/null +++ b/8b712b400m/global_step5494/layer_03-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ba3f4e10ffde32a0057bc6c33ea5e2f9c8234e873aa67bc3ba3c62fc9e9eb4 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_04-model_00-model_states.pt b/8b712b400m/global_step5494/layer_04-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11a4582ddb7f75a660a4fabb688683c83b1cc1a6 --- /dev/null +++ b/8b712b400m/global_step5494/layer_04-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c37e80a29002af3ddea1797d2c0d1ce27f750ec5a8ed68abec408678a54b0b +size 100731139 diff --git a/8b712b400m/global_step5494/layer_04-model_01-model_states.pt b/8b712b400m/global_step5494/layer_04-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c65017e706faac6aed859db735a5ce1ecf99ea04 --- /dev/null +++ b/8b712b400m/global_step5494/layer_04-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d46e7b4951c285e194d64e2569a0159b7893e2e9525c58fddace8e7a45e5b393 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_04-model_02-model_states.pt b/8b712b400m/global_step5494/layer_04-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b29f34fdcf10a94faf6d970c90a84c63c2a7919 --- /dev/null +++ b/8b712b400m/global_step5494/layer_04-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb351bb46691d43f78745d050f090e758c6eee2575cc743e254bf5ad5c8320b3 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_04-model_03-model_states.pt b/8b712b400m/global_step5494/layer_04-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7806aadc6b7118bb5b61706ba8e86abef89e5b83 --- /dev/null +++ b/8b712b400m/global_step5494/layer_04-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc5963a24e0116f0c8c161b3354abeb6476c63d554dc303f61d75fe62457757 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_05-model_00-model_states.pt b/8b712b400m/global_step5494/layer_05-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a83cb5c603251b27c38680cdb05c4fdc0d6048a2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_05-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db16171cf13f03adcdcd0444ef5f1f0123309546ccd80ff1fddf9b3639d9566a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_05-model_01-model_states.pt b/8b712b400m/global_step5494/layer_05-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f15c9cef695e44843385ca788d83ac27ad6bd1e --- /dev/null +++ b/8b712b400m/global_step5494/layer_05-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3023493699d638f582c1e48acb9ab86e84aeb3409775c5c2acfec6c5af620bd +size 100731139 diff --git a/8b712b400m/global_step5494/layer_05-model_02-model_states.pt b/8b712b400m/global_step5494/layer_05-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28181b82545bfcd246e658327bcf04bd10fa284c --- /dev/null +++ b/8b712b400m/global_step5494/layer_05-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97e80eb2da7c1d165640c399620df4a00eaa806bd0bb00c2a8ce7501dd37e8a6 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_05-model_03-model_states.pt b/8b712b400m/global_step5494/layer_05-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b820d7a4fbf7cbdad754876acef28ffbcad229d1 --- /dev/null +++ b/8b712b400m/global_step5494/layer_05-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f1ed7ff8fd8cd2ea85c15b7610d4e9a576e8334783c25d316be057817ea8f5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_06-model_00-model_states.pt b/8b712b400m/global_step5494/layer_06-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80a2baca41d339ca323c10fa2eaa7605f86ae90b --- /dev/null +++ b/8b712b400m/global_step5494/layer_06-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1aa3cb7319ba1aa225f2283f99546958eb8c0a5c89dd8c58c5eaef3216e9e34 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_06-model_01-model_states.pt b/8b712b400m/global_step5494/layer_06-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9548269f523c22aeeb0aecaf74fb1d44dc25c8d5 --- /dev/null +++ b/8b712b400m/global_step5494/layer_06-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16d3eaaa290edcbdd30b2b311c7c8f166dff4220c7b510194ab5c1cb615ca94a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_06-model_02-model_states.pt b/8b712b400m/global_step5494/layer_06-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a6906a8e804d0b3fcd6fd7417a9cb93836a3b90 --- /dev/null +++ b/8b712b400m/global_step5494/layer_06-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9967127da09e5fc32a147c3c1aaa761fdba03f26cfd80b03027c32514c9add3 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_06-model_03-model_states.pt b/8b712b400m/global_step5494/layer_06-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc21648902420a12d6f0b432eef8b899bd45183e --- /dev/null +++ b/8b712b400m/global_step5494/layer_06-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eafae2af9796d14fc43c29e86c65e098017b4b3e276cc88eb60b71c01d394c04 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_07-model_00-model_states.pt b/8b712b400m/global_step5494/layer_07-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4765af59f05b5a51430e21ff8f85b98d38a5ff93 --- /dev/null +++ b/8b712b400m/global_step5494/layer_07-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547155122c3005bdf48bd0f6f20209286e7b8bb2989e4ac326942a4c7caa76b5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_07-model_01-model_states.pt b/8b712b400m/global_step5494/layer_07-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8de810c17ac096247dcbc0568ec9d007785c10d5 --- /dev/null +++ b/8b712b400m/global_step5494/layer_07-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac6e56449dc1f43bc2f41bad05a3f146fd550e02215eff5cc2dc209f207e29f +size 100731139 diff --git a/8b712b400m/global_step5494/layer_07-model_02-model_states.pt b/8b712b400m/global_step5494/layer_07-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..59011d482394d18480978dd63408635de5d3cd20 --- /dev/null +++ b/8b712b400m/global_step5494/layer_07-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a921a17f8c9fff58ca6eb3aef0e71a0b838914ffdb39e35a30dc48464d050097 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_07-model_03-model_states.pt b/8b712b400m/global_step5494/layer_07-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63b20406e236c7fac0a8706eae5926a2d840e014 --- /dev/null +++ b/8b712b400m/global_step5494/layer_07-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964045420c8345959fd8b8d684586553ed3019cd285f0df319ee4b6680f3e49a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_08-model_00-model_states.pt b/8b712b400m/global_step5494/layer_08-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc22fe70753b148be6f305cfbf72690ea15b80b2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_08-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2b23d293fcd4a213c5b4c16378db4c44a3d8227673f831675893862daa07410 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_08-model_01-model_states.pt b/8b712b400m/global_step5494/layer_08-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60198812ea010d02f8d8913e69a8f29bfb6f5d8f --- /dev/null +++ b/8b712b400m/global_step5494/layer_08-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f11d54683a6fc5cf3e1b545084315a04255262dca5cd3dc2921f04799e24f15 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_08-model_02-model_states.pt b/8b712b400m/global_step5494/layer_08-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2eb7acb64656d32097a006f45c9132df71ada3d6 --- /dev/null +++ b/8b712b400m/global_step5494/layer_08-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c961b9320ba5829af09991bd0934647d06388f21b7b328ca434619283eea38 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_08-model_03-model_states.pt b/8b712b400m/global_step5494/layer_08-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f58a7728c057d72f5af544a068f5fc43c330af9e --- /dev/null +++ b/8b712b400m/global_step5494/layer_08-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f353a6d8e077328bd31dc099136b53d46076288efba256ed5529f4db02ae76 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_09-model_00-model_states.pt b/8b712b400m/global_step5494/layer_09-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..22cd867eb1f62c219fc6e3a8bdc947488e021853 --- /dev/null +++ b/8b712b400m/global_step5494/layer_09-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9c547cf89396c1e251f43686928b3b51d47a76dfa5545ef56d60f190fc29cb +size 100731139 diff --git a/8b712b400m/global_step5494/layer_09-model_01-model_states.pt b/8b712b400m/global_step5494/layer_09-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd0c5990c9d6af64ddda2f47b80e19b2f86a304 --- /dev/null +++ b/8b712b400m/global_step5494/layer_09-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60511ba8a90112d75fa122ef881b795c740de1fb750f5b0239997e6bb33f2cb7 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_09-model_02-model_states.pt b/8b712b400m/global_step5494/layer_09-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96b443d9c9950267f2d788ac575e7ec00821857e --- /dev/null +++ b/8b712b400m/global_step5494/layer_09-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4fadac730f6a5a86268095df8e7bf58d221b66a8a1cac66ede9f959eca7c56 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_09-model_03-model_states.pt b/8b712b400m/global_step5494/layer_09-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecb32a419800e5465d744c421f89e663df5db83a --- /dev/null +++ b/8b712b400m/global_step5494/layer_09-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67e32d846b1945794dd6b5a8edd38c2efa802a40e366ed07b998c93253370eb9 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_10-model_00-model_states.pt b/8b712b400m/global_step5494/layer_10-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b40e746c158033b55615a2cb1a68de11af9c76d --- /dev/null +++ b/8b712b400m/global_step5494/layer_10-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4d1697ddc55eb9520a50137ec2175c995fce9b2d36a7d029e5b101e23f3754 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_10-model_01-model_states.pt b/8b712b400m/global_step5494/layer_10-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..555feddeeba77ccda4d6f006ad26ad121521bcfc --- /dev/null +++ b/8b712b400m/global_step5494/layer_10-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91143ac57bb1811ce82ffdbd12724a2d381ed02404550312b7d4d856094921c2 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_10-model_02-model_states.pt b/8b712b400m/global_step5494/layer_10-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..376bc66cdc57719da6e66262112145c9bdf23f3c --- /dev/null +++ b/8b712b400m/global_step5494/layer_10-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b6a38579b2e3bb153f435a842f1020f7c064fd744302ac71358cb038583d84 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_10-model_03-model_states.pt b/8b712b400m/global_step5494/layer_10-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7ce3086d984442454805c1ce052705351970522 --- /dev/null +++ b/8b712b400m/global_step5494/layer_10-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ccfa9dc5eb2bc54a987c94836b30d64376554fcb902cfc053e410f592c7cbe +size 100731139 diff --git a/8b712b400m/global_step5494/layer_11-model_00-model_states.pt b/8b712b400m/global_step5494/layer_11-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..06b1346b25d1d7fa99cab82ccd8203d45f58abdd --- /dev/null +++ b/8b712b400m/global_step5494/layer_11-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:231b3c1d5da4097d2264d1ee58c4eb255d3a67aa637eccaa7a3e5b2cf74fb8a4 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_11-model_01-model_states.pt b/8b712b400m/global_step5494/layer_11-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd293a61270607820603082f9e420f096ad60703 --- /dev/null +++ b/8b712b400m/global_step5494/layer_11-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14b4dbf9bd095580908402cc193d3ef618bd8605270d04d9eb099f4d3fd8c1e0 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_11-model_02-model_states.pt b/8b712b400m/global_step5494/layer_11-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0750539713900668aaaac866f02a2b1e81092d15 --- /dev/null +++ b/8b712b400m/global_step5494/layer_11-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d046868b1eafbd7fb9af7f7b9ac1081ea331cefee0c2780558c69cbfb376117d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_11-model_03-model_states.pt b/8b712b400m/global_step5494/layer_11-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a84dd0405f6ed152cbb292951467c1dc65b214c --- /dev/null +++ b/8b712b400m/global_step5494/layer_11-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c928ec008ded502e21781181c0eb49ee214230fbeab478da533b4d33163104 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_12-model_00-model_states.pt b/8b712b400m/global_step5494/layer_12-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d6cd33feff66fd5ac6d85574e2889b5e41efd7a --- /dev/null +++ b/8b712b400m/global_step5494/layer_12-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:286f2e28c34a3a03513c40927e5fea2807866ffc1e31e881aa9387bb42824631 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_12-model_01-model_states.pt b/8b712b400m/global_step5494/layer_12-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..be0ce8904bd8ea32cae5c987b14d8e700cbe5acd --- /dev/null +++ b/8b712b400m/global_step5494/layer_12-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5510272988a9b915330b8e8bd4ad112938c1aa867e0191b6d29884901ba84a8 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_12-model_02-model_states.pt b/8b712b400m/global_step5494/layer_12-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e48d8e0f65047308a5b364a518263398fd6bfc0 --- /dev/null +++ b/8b712b400m/global_step5494/layer_12-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138889866d6b2059e645ea0f8515a7c03a69b4c71750902e4e3152f3d77e8151 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_12-model_03-model_states.pt b/8b712b400m/global_step5494/layer_12-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..822b54284289be433e1dab5d27ca6f132a3c5903 --- /dev/null +++ b/8b712b400m/global_step5494/layer_12-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4febbf2c4efc412f5eb5972f4aac99a27a8842c4e7a9be2cda6c8c9c24decab +size 100731139 diff --git a/8b712b400m/global_step5494/layer_13-model_00-model_states.pt b/8b712b400m/global_step5494/layer_13-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19cb23b5878e1344096bc54b93a7562729e04713 --- /dev/null +++ b/8b712b400m/global_step5494/layer_13-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e45991a0bd0fcc1cf48c37a13f1e534ed9f19ebd0e443b1bbd6f2440df44647 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_13-model_01-model_states.pt b/8b712b400m/global_step5494/layer_13-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7af413bc6e70da8a796cb4f59fd36135626e5788 --- /dev/null +++ b/8b712b400m/global_step5494/layer_13-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd1705cf3928f2003ce8609b640656c4637da75e712e2b58526ef64012a83ad9 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_13-model_02-model_states.pt b/8b712b400m/global_step5494/layer_13-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b726485ef481ac85cb6154e7101428761ce37e88 --- /dev/null +++ b/8b712b400m/global_step5494/layer_13-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5adaad05d9ee173b699f4e2e8cf2816b6a25bfe80f839afd8362a5a3a2676652 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_13-model_03-model_states.pt b/8b712b400m/global_step5494/layer_13-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b7c89bcc75dba7b1c3a42dd034ab6cdf7c1b46c --- /dev/null +++ b/8b712b400m/global_step5494/layer_13-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25fd9f87df5ea2228a6b2492a42b17140f5cafd38d0cbafc1dc2ce122d4ff85e +size 100731139 diff --git a/8b712b400m/global_step5494/layer_14-model_00-model_states.pt b/8b712b400m/global_step5494/layer_14-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fdf041d00b8b58a264baf016242eaf9cffdd8ca --- /dev/null +++ b/8b712b400m/global_step5494/layer_14-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e392165c8e984cc616a904339bb8b7713260260fb32a28ef3857fdd2eeec5494 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_14-model_01-model_states.pt b/8b712b400m/global_step5494/layer_14-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9e3de2996e263442fe5b17f604e641f919afd38 --- /dev/null +++ b/8b712b400m/global_step5494/layer_14-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019c1abddafc35ae02589a41d6db78733d41940f614a1208ec598f928206eed2 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_14-model_02-model_states.pt b/8b712b400m/global_step5494/layer_14-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fa21b069bc105f0ff1a6b3963430394303ab000 --- /dev/null +++ b/8b712b400m/global_step5494/layer_14-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96c48e7c0b177c2fa0ee59cb7fcaa1511fcc81a69586940c736db3e43dd8ef21 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_14-model_03-model_states.pt b/8b712b400m/global_step5494/layer_14-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..48d5d7211cfec1ee217cd740bf606eee57dd2b56 --- /dev/null +++ b/8b712b400m/global_step5494/layer_14-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c2d4401409cf5341eac2497ff8f35feba28da1e9a79138ca8524dfac67068b7 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_15-model_00-model_states.pt b/8b712b400m/global_step5494/layer_15-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5412ca0b75ea308283073f9ba9f75150a6637292 --- /dev/null +++ b/8b712b400m/global_step5494/layer_15-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0748f5b282e0cfe42d3ddf40b8b555b0a4eb5a855efaf768303cabef99e505 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_15-model_01-model_states.pt b/8b712b400m/global_step5494/layer_15-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..08618b63faa9c8797ebb49942352567e50b4260a --- /dev/null +++ b/8b712b400m/global_step5494/layer_15-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22957d8deee2a5ed5fa0bca715b932604d805e60ea4421649494e7a09810abd +size 100731139 diff --git a/8b712b400m/global_step5494/layer_15-model_02-model_states.pt b/8b712b400m/global_step5494/layer_15-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d76c27033de5afdae37b0b76d136bef1c3f04034 --- /dev/null +++ b/8b712b400m/global_step5494/layer_15-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076d4079eda425b680cccac3ce7a810115ad3b372c8350ef5e8c74053878025b +size 100731139 diff --git a/8b712b400m/global_step5494/layer_15-model_03-model_states.pt b/8b712b400m/global_step5494/layer_15-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83049e7a1014d6c50b0eb8f80db95444dcdf90db --- /dev/null +++ b/8b712b400m/global_step5494/layer_15-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68312092ac4d444f121fe2894c42fea34399cebf83b2a24604506997f6cfc003 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_16-model_00-model_states.pt b/8b712b400m/global_step5494/layer_16-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8901749a71d31b80419db8d0e2a8fffd00faa1c7 --- /dev/null +++ b/8b712b400m/global_step5494/layer_16-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88090c8efb0b06a8a6f36d0b3127aaa54a429a9493b44af42a59661062913f95 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_16-model_01-model_states.pt b/8b712b400m/global_step5494/layer_16-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bc52609ec653bf2bd5b4b456dfbc4e92ccc38e4 --- /dev/null +++ b/8b712b400m/global_step5494/layer_16-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffea535712970657eaf0efa032a73484de19864a0bdacaad975131217e2071fb +size 100731139 diff --git a/8b712b400m/global_step5494/layer_16-model_02-model_states.pt b/8b712b400m/global_step5494/layer_16-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e1da86d2684d865574bce0b1463554fcd70b94a --- /dev/null +++ b/8b712b400m/global_step5494/layer_16-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b012b37c549bd2945c74c057d6ca9d98bffc116ae2d196194aaf33e30e6046ae +size 100731139 diff --git a/8b712b400m/global_step5494/layer_16-model_03-model_states.pt b/8b712b400m/global_step5494/layer_16-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..abe149553d132cf7e76e45712a8f7ad4fc815782 --- /dev/null +++ b/8b712b400m/global_step5494/layer_16-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:580cfe5dc94048ccbc793c2d59ee4cd67241530a6e6a7ac13f435295ec5f53c7 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_17-model_00-model_states.pt b/8b712b400m/global_step5494/layer_17-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58d0dab37fbe52f9c254a8796a4bf28ad50507d0 --- /dev/null +++ b/8b712b400m/global_step5494/layer_17-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a956a4c4a7b2c988296fa5266d11477ec90de7ae9a61c1a7054c423bfde10587 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_17-model_01-model_states.pt b/8b712b400m/global_step5494/layer_17-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db84224369a00c6a7d541ab7e2dc24886db3039b --- /dev/null +++ b/8b712b400m/global_step5494/layer_17-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c89a76c58cc96dad73acee85ee9bea3476173b7e880dd0d89596ad94d5391f0 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_17-model_02-model_states.pt b/8b712b400m/global_step5494/layer_17-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..32d7470d9866e18c5f9e6e2d017aba55fbd33b30 --- /dev/null +++ b/8b712b400m/global_step5494/layer_17-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb255387998ab46df87e99ba34185438a4e68d7f69dde9082e76a672cbfe1576 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_17-model_03-model_states.pt b/8b712b400m/global_step5494/layer_17-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..129d888ba32f50469e97c1ce51cc1aa60ae3e00f --- /dev/null +++ b/8b712b400m/global_step5494/layer_17-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:366f4974f0f6cd11e4f954cf10bd02536126bf8ad320ffede3a37470ec158828 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_18-model_00-model_states.pt b/8b712b400m/global_step5494/layer_18-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ed2de76f307147bce07f4b42bd8c5b673efa174 --- /dev/null +++ b/8b712b400m/global_step5494/layer_18-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea4c4cc038b6153e41a3d7dae3a35b7099de86fa9f61fcb8deb263cf3483b671 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_18-model_01-model_states.pt b/8b712b400m/global_step5494/layer_18-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..565fdb61b4eab17a3943029aad2b2999ff223d2b --- /dev/null +++ b/8b712b400m/global_step5494/layer_18-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:420632526715f77115b8ca377dea0e21907afe955adf5179dc4b7b03d30aeaa5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_18-model_02-model_states.pt b/8b712b400m/global_step5494/layer_18-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cf7b887e157d92a122603e986c10ee5f12bbc45 --- /dev/null +++ b/8b712b400m/global_step5494/layer_18-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76201460a24a3e73250d0194c46c2273613d14dec3af18dee37c44b74a65ffb1 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_18-model_03-model_states.pt b/8b712b400m/global_step5494/layer_18-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e026e5c54e304cdba382e719b34cdf681e439dfa --- /dev/null +++ b/8b712b400m/global_step5494/layer_18-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f25a9732bd751bec911c51e66773017ac0c54931c074d4e620ffd8a1a9a560 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_19-model_00-model_states.pt b/8b712b400m/global_step5494/layer_19-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..77390743675e18156c3f2fec4dc4fd150c134743 --- /dev/null +++ b/8b712b400m/global_step5494/layer_19-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e43922dcbbefa29e6a8239d41fb5644578358654599188c3f112902e8bf9429 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_19-model_01-model_states.pt b/8b712b400m/global_step5494/layer_19-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6cdeaf82c89c0cc85376a04f06c483a05548b80 --- /dev/null +++ b/8b712b400m/global_step5494/layer_19-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3511cebb78a5ff27f3a318e5d8de6e39e8856ce057dd529682be0ce6750118 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_19-model_02-model_states.pt b/8b712b400m/global_step5494/layer_19-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..821d9772f0f311d8cb1b62ca7d3dc10b2f9c5bfa --- /dev/null +++ b/8b712b400m/global_step5494/layer_19-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7183dacf5bf972bdcb57522edf268bb792aed52cff907358f221324ea7da5248 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_19-model_03-model_states.pt b/8b712b400m/global_step5494/layer_19-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af6953d931ed2d0f2b6e536a88e62f35ac7af166 --- /dev/null +++ b/8b712b400m/global_step5494/layer_19-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f33b57b823365b6101bc785bd406bd3e453016027b2d281c7b7d178b2b00c728 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_20-model_00-model_states.pt b/8b712b400m/global_step5494/layer_20-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..85a10b5faee882e6795e9432f313ed028ee37ab1 --- /dev/null +++ b/8b712b400m/global_step5494/layer_20-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad009f42010453c9d11964f74392b3aea743f40def13ba2f3d698f7dfb9d6364 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_20-model_01-model_states.pt b/8b712b400m/global_step5494/layer_20-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..382664e099c255f6fbc704cec418db1e8e982ee8 --- /dev/null +++ b/8b712b400m/global_step5494/layer_20-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae92ac53c1018c544cdad554a5994d14c3570ce833cac7876cd255c59f21dae2 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_20-model_02-model_states.pt b/8b712b400m/global_step5494/layer_20-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ccfc8586d841f7a992e49ce24ae8aea16cfc1adc --- /dev/null +++ b/8b712b400m/global_step5494/layer_20-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fed85e1f82ff460cbe6ffac68fb10a45d6216cda51badd02ef5ac3245d14054 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_20-model_03-model_states.pt b/8b712b400m/global_step5494/layer_20-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a330ef0e486b24a49384bb70711adc55c7c52c43 --- /dev/null +++ b/8b712b400m/global_step5494/layer_20-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d9c19640d531ac02a56158d2040e696f1de5dbdb8c04c47cbfa3538aecf524 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_21-model_00-model_states.pt b/8b712b400m/global_step5494/layer_21-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d052ec5d1402b3c42d69daa967872d94796763ef --- /dev/null +++ b/8b712b400m/global_step5494/layer_21-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c09768883c3f8c74c7783ac493c8c198b567a33bb5ccb3d621a85c11c119bfe4 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_21-model_01-model_states.pt b/8b712b400m/global_step5494/layer_21-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..41642f6a47d69c9f6592377f29d38b48e6d8f65e --- /dev/null +++ b/8b712b400m/global_step5494/layer_21-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cda896404be34a5e52e818b63924f7cbf20b65a5827ec0b75d545c37a15f58e +size 100731139 diff --git a/8b712b400m/global_step5494/layer_21-model_02-model_states.pt b/8b712b400m/global_step5494/layer_21-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a4c67a3b4a2d902bb743173c9c20719ab755907 --- /dev/null +++ b/8b712b400m/global_step5494/layer_21-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52bd0f317f2b4db00f695668b36cc74c9ab3e84e075dacbaf04750dc076fab1d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_21-model_03-model_states.pt b/8b712b400m/global_step5494/layer_21-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce8a26d707cc9ece5e79dac36d8044a6d0129beb --- /dev/null +++ b/8b712b400m/global_step5494/layer_21-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fd5b6753a64cd93c4298cec0ee73cabc321c73a0ea0aed153c336bceb7ec6e2 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_22-model_00-model_states.pt b/8b712b400m/global_step5494/layer_22-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d7261aeb649ba897b9032ff703f6757dfce5af7 --- /dev/null +++ b/8b712b400m/global_step5494/layer_22-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29ace2c80ec41a7b58dd60980fc98c63430c8e172dc4f85ab4612b51c5ee006 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_22-model_01-model_states.pt b/8b712b400m/global_step5494/layer_22-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51bf30c13088373d6aaa99dccb85c4cd025a210d --- /dev/null +++ b/8b712b400m/global_step5494/layer_22-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a888e8a355cb05d3bbe6b796ae75659413cfc570e9885dc10e6b4400f8fba0f +size 100731139 diff --git a/8b712b400m/global_step5494/layer_22-model_02-model_states.pt b/8b712b400m/global_step5494/layer_22-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d003245cdb213c4680e77c72286ece9cb6b2446c --- /dev/null +++ b/8b712b400m/global_step5494/layer_22-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2749bed0351ca7998ab655e036d2a63aede9011451d87932cad22720c1a6d9ec +size 100731139 diff --git a/8b712b400m/global_step5494/layer_22-model_03-model_states.pt b/8b712b400m/global_step5494/layer_22-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6609380e17f12ed8f88ffdaef8fac29f8fd63704 --- /dev/null +++ b/8b712b400m/global_step5494/layer_22-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d48374cb28bd1d0ecfebe5b65fcd540dba1d5fcf5f9090decd9fb8e24845e9 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_23-model_00-model_states.pt b/8b712b400m/global_step5494/layer_23-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15079b7082ccd8a995290be1e691d155de6f2e97 --- /dev/null +++ b/8b712b400m/global_step5494/layer_23-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510bc61e16e7903d93e04398b22119680d28d064a92de3796e0d8e4afbb27f2d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_23-model_01-model_states.pt b/8b712b400m/global_step5494/layer_23-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35ee9ce81ac17f320c8e58c1b8268cf0d5e878f8 --- /dev/null +++ b/8b712b400m/global_step5494/layer_23-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9370f962bbc9757c26df83bf219699fae2c5025b1e6970b73aad5b17b947c458 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_23-model_02-model_states.pt b/8b712b400m/global_step5494/layer_23-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..088837b973882c3c45e2a64e75d99c9cb56e1ce4 --- /dev/null +++ b/8b712b400m/global_step5494/layer_23-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670bcc15bbaae6d7a52d0d1c8851e045bc66a53da100e3a9e8ecb98763df29e6 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_23-model_03-model_states.pt b/8b712b400m/global_step5494/layer_23-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a5258968b9895ae90e2169833c710fa35af186f3 --- /dev/null +++ b/8b712b400m/global_step5494/layer_23-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b38eddedb9a0a829010cb4e6b2cf26bdfdb35fc3130dd12ba5f9f8957d6057f +size 100731139 diff --git a/8b712b400m/global_step5494/layer_24-model_00-model_states.pt b/8b712b400m/global_step5494/layer_24-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4543a2ebd9aa830c5c15703004d11be341579c6 --- /dev/null +++ b/8b712b400m/global_step5494/layer_24-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e2cc80f962ce4a59fa03cbb7883403a127d899ce4bb9c115549f454632da90 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_24-model_01-model_states.pt b/8b712b400m/global_step5494/layer_24-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb25253fe61988afcb7630d3258ab8d1592bc9e9 --- /dev/null +++ b/8b712b400m/global_step5494/layer_24-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:503a8406d2fb0b01b21559fc35b296ae54c3d73d215fa861d3a4accd17aaf27a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_24-model_02-model_states.pt b/8b712b400m/global_step5494/layer_24-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e186f7cfb6c372f55ab45d8921105c2aa9a959f --- /dev/null +++ b/8b712b400m/global_step5494/layer_24-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa03b7e5c8b010668f4dac8b6b8de5c659394dac6725635f46ce24176cdd08de +size 100731139 diff --git a/8b712b400m/global_step5494/layer_24-model_03-model_states.pt b/8b712b400m/global_step5494/layer_24-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f02547fa5120864c1c43585ae39962e857c0c71 --- /dev/null +++ b/8b712b400m/global_step5494/layer_24-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:909e7569eb726395314abdd44be76ce267a02b1b9c83746aeb47a946fc90a99b +size 100731139 diff --git a/8b712b400m/global_step5494/layer_25-model_00-model_states.pt b/8b712b400m/global_step5494/layer_25-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc9061cacae5e232a97fc0af5e8f4df96e4338f0 --- /dev/null +++ b/8b712b400m/global_step5494/layer_25-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ea657c397fed3d01df376a66929934a737005ec3feab2b1011a8b22015d715a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_25-model_01-model_states.pt b/8b712b400m/global_step5494/layer_25-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..313c7ad0482078d34172eef59ac543ec9c802f7d --- /dev/null +++ b/8b712b400m/global_step5494/layer_25-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d14048b5822f25487eaeeda1e1c779b49bcfe13b70b5700712eb732d42e2756 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_25-model_02-model_states.pt b/8b712b400m/global_step5494/layer_25-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3c87c98793b07147ee6394804cf28b15927c452 --- /dev/null +++ b/8b712b400m/global_step5494/layer_25-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e91694b907fdd23236a7af9b16cf1a7aad38cff7aedec49f69e41811598c2c +size 100731139 diff --git a/8b712b400m/global_step5494/layer_25-model_03-model_states.pt b/8b712b400m/global_step5494/layer_25-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d162f9e7565897e32f9b7f8ca7cc9c11abcb625f --- /dev/null +++ b/8b712b400m/global_step5494/layer_25-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0081cf91074bb924a95e48a8659be656405a4c50f0c0b708572ed61f4e169467 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_26-model_00-model_states.pt b/8b712b400m/global_step5494/layer_26-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1c78df7c2b5fadad95053f6fff84f3484c0c58 --- /dev/null +++ b/8b712b400m/global_step5494/layer_26-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b74c9c441493379866bddc34c103671cde4c9f558eb14efd7cc7a350a0d549 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_26-model_01-model_states.pt b/8b712b400m/global_step5494/layer_26-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a7db330dd6ea24641c9be3b42854c8ed2e503c6 --- /dev/null +++ b/8b712b400m/global_step5494/layer_26-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5bd08b43a7e76f8c13119f175e5f5ab96a6908a496beb5658731e0a1f60db5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_26-model_02-model_states.pt b/8b712b400m/global_step5494/layer_26-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee0dc3b230f8732108a8bd8e309aec5feef5c10e --- /dev/null +++ b/8b712b400m/global_step5494/layer_26-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:000bd5ce309f3b9312222da9c662cd8ae495dfa936f8b6eafe621b500f13d306 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_26-model_03-model_states.pt b/8b712b400m/global_step5494/layer_26-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9694334a87b3a1c9982e6942dbf6b28892358226 --- /dev/null +++ b/8b712b400m/global_step5494/layer_26-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465e0cb6130c1ee972a7892eac3100ea205ba29ae510a1b5adc723dfccf04d83 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_27-model_00-model_states.pt b/8b712b400m/global_step5494/layer_27-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ef314fd1aab5b9f8087d1240418776b752872b8 --- /dev/null +++ b/8b712b400m/global_step5494/layer_27-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58578278e79195e1a2502122f13c9d52982aed306030f527e2b7d838b889e007 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_27-model_01-model_states.pt b/8b712b400m/global_step5494/layer_27-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba7cba96b9fc5af2a8f3e26adbc35ae974688f4f --- /dev/null +++ b/8b712b400m/global_step5494/layer_27-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fe056362874b49807fdcf6a682db5b2ab12e1bea295bf04736b571362291653 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_27-model_02-model_states.pt b/8b712b400m/global_step5494/layer_27-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9b0bd1ba6649eb4a62f9af52d531a2394668395 --- /dev/null +++ b/8b712b400m/global_step5494/layer_27-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19447f461160aa9ff8851a1b39564dbf37547b393bb1e4f05f53cc15662604dc +size 100731139 diff --git a/8b712b400m/global_step5494/layer_27-model_03-model_states.pt b/8b712b400m/global_step5494/layer_27-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8105c5c0c4c80981d2f4bbd8dfe48646a2e18719 --- /dev/null +++ b/8b712b400m/global_step5494/layer_27-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d6c39fdcacc37418fc24a4b3c593e37cfb3c128e83bf81fe69bbd7afce2beb +size 100731139 diff --git a/8b712b400m/global_step5494/layer_28-model_00-model_states.pt b/8b712b400m/global_step5494/layer_28-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe26e0609bd87e5a410f81aa78c1a52ab1cffb39 --- /dev/null +++ b/8b712b400m/global_step5494/layer_28-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0870731e4a192065798a4722dd335bea9d9c462ce87210e4e1c398a59d7e91c +size 100731139 diff --git a/8b712b400m/global_step5494/layer_28-model_01-model_states.pt b/8b712b400m/global_step5494/layer_28-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1416732a94d60a9a8f9c7ccbe14eb9eac6943e71 --- /dev/null +++ b/8b712b400m/global_step5494/layer_28-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66153b292dff69c3ba141d5ecc7960ac52382686d0176fdfc4cc7ecbd803743c +size 100731139 diff --git a/8b712b400m/global_step5494/layer_28-model_02-model_states.pt b/8b712b400m/global_step5494/layer_28-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ada6fb1c9e75aa8710c580d02b3097ae4a83230 --- /dev/null +++ b/8b712b400m/global_step5494/layer_28-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cf012276aad25148bf3ee6f5a39a2e155eab232bc9b0949825b2c233fbf889 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_28-model_03-model_states.pt b/8b712b400m/global_step5494/layer_28-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5423dca8cfbe8cf66afda3479b0d05d76beaa9c --- /dev/null +++ b/8b712b400m/global_step5494/layer_28-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:973bd87f6c3e5f80197210c68af69cc6e49ae51323b48fc06901b519e8857b90 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_29-model_00-model_states.pt b/8b712b400m/global_step5494/layer_29-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b29c765c667f10cfcec78a2e8ac9ed24565df95f --- /dev/null +++ b/8b712b400m/global_step5494/layer_29-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b727df078295f96753047a8fb7bae897ae2500388e0769aeed555d903835605 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_29-model_01-model_states.pt b/8b712b400m/global_step5494/layer_29-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0cbd5128499fb97c48a1c1e721aedd73933ad86 --- /dev/null +++ b/8b712b400m/global_step5494/layer_29-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75033a7d9a491bf60b294e72a965e553756b5f42108630f8cea6e82e700f9e93 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_29-model_02-model_states.pt b/8b712b400m/global_step5494/layer_29-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13715d337a7c5166877182dc191f0ec94dfdacb9 --- /dev/null +++ b/8b712b400m/global_step5494/layer_29-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb63c49261dae7e7e3a701bf1af69b5f3098b21ea554c9d19f5e7fb27d2bf721 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_29-model_03-model_states.pt b/8b712b400m/global_step5494/layer_29-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5289aa916f5418a812a5da5d36477355306cd20b --- /dev/null +++ b/8b712b400m/global_step5494/layer_29-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4b95f732c362be33fd7fabf78cd00023b7782ba9490393d26b91057070248ac +size 100731139 diff --git a/8b712b400m/global_step5494/layer_30-model_00-model_states.pt b/8b712b400m/global_step5494/layer_30-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a37df994af60649a4a5af4aef8c700e2c4594a2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_30-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5da652dd72984e2612bc9cd8e3d8f5e49ea772affa36105db0c7030170c93e92 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_30-model_01-model_states.pt b/8b712b400m/global_step5494/layer_30-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36af1346f6d2527707fd0ca34fd5463f7c0e0665 --- /dev/null +++ b/8b712b400m/global_step5494/layer_30-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a33a1ec0a7a6513b8677d5670fc3217c028a7a0410fb8ef44a7736baccc387 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_30-model_02-model_states.pt b/8b712b400m/global_step5494/layer_30-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a1bc2eb2476ea21b29cdaa62e05912799124cdc --- /dev/null +++ b/8b712b400m/global_step5494/layer_30-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beba1b68622420c7bb23eab01d3b86497c055d7c3251e69bbf8bb1de3ac52253 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_30-model_03-model_states.pt b/8b712b400m/global_step5494/layer_30-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cb623fe43dacb60daab7f0bc1471c4e0a5c7e43 --- /dev/null +++ b/8b712b400m/global_step5494/layer_30-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5350f06791d0c1fc08bbd7ec2d75d1c55ea0b304cf79df8093488272de61e94 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_31-model_00-model_states.pt b/8b712b400m/global_step5494/layer_31-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..86bc3e7f81b0cc868dc826846763c54b632795e1 --- /dev/null +++ b/8b712b400m/global_step5494/layer_31-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6c6420f10646d1ef44cf5d6ac20ea03c2d735e55951ccf4625330fbcbd9024 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_31-model_01-model_states.pt b/8b712b400m/global_step5494/layer_31-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23df86dcba89392f4bedeaac57c563f3851507af --- /dev/null +++ b/8b712b400m/global_step5494/layer_31-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3172852712c9e8002b3ed13948a75e6bd31fe01f8d7b7abbd69fc95b2bedc7 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_31-model_02-model_states.pt b/8b712b400m/global_step5494/layer_31-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8894d05d37e0b19a9524a31fd4a93a0f4fdc4b4a --- /dev/null +++ b/8b712b400m/global_step5494/layer_31-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:712949395fcd7efe733780dd9a9468166cf2f22864b36c86cdc04d536469de61 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_31-model_03-model_states.pt b/8b712b400m/global_step5494/layer_31-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea6cbf8fbe1151222fad12307c91010ebf4e72dc --- /dev/null +++ b/8b712b400m/global_step5494/layer_31-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a56d865e98308b3efe90c8c0ac36733b775847bf4891331106927f518baee03 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_32-model_00-model_states.pt b/8b712b400m/global_step5494/layer_32-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e0f12006e7fe3b841e8c4fe1ff9edb80276601c --- /dev/null +++ b/8b712b400m/global_step5494/layer_32-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f96c101e44c1ead9428ebe14c1aee2cc5378a88f1b5d4f0fb63a431b37a6fae9 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_32-model_01-model_states.pt b/8b712b400m/global_step5494/layer_32-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a48725c9ca7eb03f1d82fb17c84e626cc75f7fcf --- /dev/null +++ b/8b712b400m/global_step5494/layer_32-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4dae2ed22fd1c7cf8e96551a3616abcec1dc09d004ade5173a6a54db9dc47b1 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_32-model_02-model_states.pt b/8b712b400m/global_step5494/layer_32-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc14cb7702b3d40fe34f57235ca46e52487eac27 --- /dev/null +++ b/8b712b400m/global_step5494/layer_32-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:050d8e66acac63536b73e8114dbe8841ffe49fa8b7d23393d470e555c9810a79 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_32-model_03-model_states.pt b/8b712b400m/global_step5494/layer_32-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f1262644142bfca7c4f48511cd447e7462a424e --- /dev/null +++ b/8b712b400m/global_step5494/layer_32-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86af2a5ba15781f456fa45a8fa4849834c1c28db0585a3f673dd660be3d7e7db +size 100731139 diff --git a/8b712b400m/global_step5494/layer_33-model_00-model_states.pt b/8b712b400m/global_step5494/layer_33-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..44b209dd72ba44804c3430703a706fd9c0785599 --- /dev/null +++ b/8b712b400m/global_step5494/layer_33-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13226d50f919cf534af8d7cfb812b4b44a65ad8513af724f288afe19374c94ea +size 100731139 diff --git a/8b712b400m/global_step5494/layer_33-model_01-model_states.pt b/8b712b400m/global_step5494/layer_33-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..23013d78306b3539671ef74a7dbd3302309566e4 --- /dev/null +++ b/8b712b400m/global_step5494/layer_33-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:999e6306f0c1d9cbc12e5154ea35fb845f673380ce7e8782e8afbe365396ea7a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_33-model_02-model_states.pt b/8b712b400m/global_step5494/layer_33-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a98f2eb6f128ba7e9e6d9bfaa2672ba85e177ad --- /dev/null +++ b/8b712b400m/global_step5494/layer_33-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669149676b121f161bd639415cf5c00c9c4845371ad633876805a1a7f36a0065 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_33-model_03-model_states.pt b/8b712b400m/global_step5494/layer_33-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b3a013af036a588f4980cf791bc724260c656a9 --- /dev/null +++ b/8b712b400m/global_step5494/layer_33-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71cfdc92d34ae91a4bbb866cfc5bf575cfe0ab9f6582f5e9aa3f62aec7be365a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_34-model_00-model_states.pt b/8b712b400m/global_step5494/layer_34-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa2f7aaafa4b6108526a94cb96447a807c5f893c --- /dev/null +++ b/8b712b400m/global_step5494/layer_34-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c930b23495657a372768f1a1bd16312308080cfd14a8521c84fcc1627092d82 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_34-model_01-model_states.pt b/8b712b400m/global_step5494/layer_34-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..207f76f49064f319737c7ca68af2ec6d785c29eb --- /dev/null +++ b/8b712b400m/global_step5494/layer_34-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3d26c823c6c16f6ff4655096690ab7af3de7cbf2f2b5049eb358e0e8bb5fe13 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_34-model_02-model_states.pt b/8b712b400m/global_step5494/layer_34-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..75ff88ae0d95d1bcce031e3390a3ef1254646567 --- /dev/null +++ b/8b712b400m/global_step5494/layer_34-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39bddb91caab1d28301e714a5771847a14763d3c2150656314032453b687bdf5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_34-model_03-model_states.pt b/8b712b400m/global_step5494/layer_34-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..847f8b260f6c1578428542e2ef396bf551cff2b2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_34-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:081a3acc291574b3c956fd6c6501e3378c46a8df2276486e2e2fa453ef605458 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_35-model_00-model_states.pt b/8b712b400m/global_step5494/layer_35-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..322bc37e827ff335c6b8ee4ae5c5d1754d6d5d29 --- /dev/null +++ b/8b712b400m/global_step5494/layer_35-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95b0b223e1be7d2932418ada6af3256cbf4866bd788a7c7291340182e42c4779 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_35-model_01-model_states.pt b/8b712b400m/global_step5494/layer_35-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..969a23ee6999ce1af9caa7b2e828ad1b7c77088c --- /dev/null +++ b/8b712b400m/global_step5494/layer_35-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9a95cbcf86769e2c48eddac388c2f00c0aa21f8bb7ccb2495f0d077e0aa751 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_35-model_02-model_states.pt b/8b712b400m/global_step5494/layer_35-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e0488ce2e43fc6bde93abfe88023a441b597dc3 --- /dev/null +++ b/8b712b400m/global_step5494/layer_35-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77790732ba2c0f0a892c6256a07bc7e9fd6bb027336851ff13121c818f3d0277 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_35-model_03-model_states.pt b/8b712b400m/global_step5494/layer_35-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..779e860ba23de013d1a55c72707ecaab6d177512 --- /dev/null +++ b/8b712b400m/global_step5494/layer_35-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a7c23b1cc50eac5ab884f6772a14cfddf5ce0bc6f6bab7316484d73e28e2eb7 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_36-model_00-model_states.pt b/8b712b400m/global_step5494/layer_36-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5af634a5b42cad278de27887885468ea09253c47 --- /dev/null +++ b/8b712b400m/global_step5494/layer_36-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdcb96295116443009c49469af184930153ee4f28da9830395b39953cd4ce534 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_36-model_01-model_states.pt b/8b712b400m/global_step5494/layer_36-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e947b0252ca8ab704677249233588bd91e68f74 --- /dev/null +++ b/8b712b400m/global_step5494/layer_36-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c345efe707f15e7e85d39cc594a0a59c74a9e9c47cd376477f3c36d48d1e382 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_36-model_02-model_states.pt b/8b712b400m/global_step5494/layer_36-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b71d069fcb8008ce77ad2b7eeeba6532e1278298 --- /dev/null +++ b/8b712b400m/global_step5494/layer_36-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c747625bc9aa6860731a88912814b4965df2145f5feca92693f0a6eb418fbef +size 100731139 diff --git a/8b712b400m/global_step5494/layer_36-model_03-model_states.pt b/8b712b400m/global_step5494/layer_36-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5af9b01174b707cdbd8011ad401ad5a199249610 --- /dev/null +++ b/8b712b400m/global_step5494/layer_36-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df95dfa1ed91be2a81c6d7bad2738b19cff2dec5aeac57957e822d47f38d56d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_37-model_00-model_states.pt b/8b712b400m/global_step5494/layer_37-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6ad0dcfc0fb3efc78ad5dc9fcf962848bf9b0d1 --- /dev/null +++ b/8b712b400m/global_step5494/layer_37-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa84c327e760caa0a180c99d6182b860ca6c667f4f7d117572e57ed68600c3d5 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_37-model_01-model_states.pt b/8b712b400m/global_step5494/layer_37-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6f425642d564989e9041dde4e8c732265de11c2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_37-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9fd5cf5b662d9978c1b8843e707c69192e61ab2d67372b342982caf4cb5b7a3 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_37-model_02-model_states.pt b/8b712b400m/global_step5494/layer_37-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99d29619dc9483f1244bfc1bce2917a635694753 --- /dev/null +++ b/8b712b400m/global_step5494/layer_37-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:948339e8ac52d521b6d52b29f960bf6f51b19881b573018b65891b51742cf731 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_37-model_03-model_states.pt b/8b712b400m/global_step5494/layer_37-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0f24fdb514c31b3d87f85dd3d59b55797baa994 --- /dev/null +++ b/8b712b400m/global_step5494/layer_37-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e794d265dbd12073651cab807384609e95737240fbe03680f6d4cfeafbded49b +size 100731139 diff --git a/8b712b400m/global_step5494/layer_38-model_00-model_states.pt b/8b712b400m/global_step5494/layer_38-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3db85b8b7be86ede2ddfdecabd50e1ece8b93d02 --- /dev/null +++ b/8b712b400m/global_step5494/layer_38-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:161218de721eeb9a6b34017b1c98cd0d6c6ff0c06dc21eba2150c7c3eab747e3 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_38-model_01-model_states.pt b/8b712b400m/global_step5494/layer_38-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cb878d971c4ceaed6ae913b610b07c7431504d8 --- /dev/null +++ b/8b712b400m/global_step5494/layer_38-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdacbfaae2482592632267f795341dc4bc5f8a2522a33599a2cf1c5195f5af7f +size 100731139 diff --git a/8b712b400m/global_step5494/layer_38-model_02-model_states.pt b/8b712b400m/global_step5494/layer_38-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd36cf2343f844e9cf8eb7f3a9bbd6158f727185 --- /dev/null +++ b/8b712b400m/global_step5494/layer_38-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe2c267eac315561ff975b7e2856cc7932dc031cd62b6a1ebf282281947ddb6 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_38-model_03-model_states.pt b/8b712b400m/global_step5494/layer_38-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df95a89034d32b12a12823c1f64bf29597cc7b2a --- /dev/null +++ b/8b712b400m/global_step5494/layer_38-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c25f797716905352a80684c5f68868a0988bf0b754c9b38e8f05eaa2ae7815bb +size 100731139 diff --git a/8b712b400m/global_step5494/layer_39-model_00-model_states.pt b/8b712b400m/global_step5494/layer_39-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f3576ea4aa6cc6623c71c2a12576c3fc1731f52 --- /dev/null +++ b/8b712b400m/global_step5494/layer_39-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b973d3636c2f86ab58769d1a5feb6534122227e0d8061d58762cd9a388415638 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_39-model_01-model_states.pt b/8b712b400m/global_step5494/layer_39-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d2e3ea4e52266e3803dabcfcd3dba2a913feb9d --- /dev/null +++ b/8b712b400m/global_step5494/layer_39-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990b764dc4283ce0ec6d1cde7cafc882ef3b8a39b2db71335bf5d8c2989d289e +size 100731139 diff --git a/8b712b400m/global_step5494/layer_39-model_02-model_states.pt b/8b712b400m/global_step5494/layer_39-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..751c701df2d794b836209b81daa18fb703572a21 --- /dev/null +++ b/8b712b400m/global_step5494/layer_39-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fb49219865230be37a5616406973a9dc0bd719006c9ac50fe6b3defe179e40 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_39-model_03-model_states.pt b/8b712b400m/global_step5494/layer_39-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..933a056052e4e4103014e79f853fe20c6f0ace15 --- /dev/null +++ b/8b712b400m/global_step5494/layer_39-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29305e06dbb3a1cc6ffcb616c6a91d5f57bc75eafe40e1af702491a5ad51078b +size 100731139 diff --git a/8b712b400m/global_step5494/layer_40-model_00-model_states.pt b/8b712b400m/global_step5494/layer_40-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58b42a4b2655aa10d2d5736fc17815f9f766e97d --- /dev/null +++ b/8b712b400m/global_step5494/layer_40-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b87541086e1c33f0ab233b52045c69e6eb8f63e3426bcc517d23c423efc051 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_40-model_01-model_states.pt b/8b712b400m/global_step5494/layer_40-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..851fd168ca9b548992d518060d3af9223f388885 --- /dev/null +++ b/8b712b400m/global_step5494/layer_40-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b503e0c22e28b57e18f5a55f7fa4197fd08c15a9ad2337677dfb5ca006abb571 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_40-model_02-model_states.pt b/8b712b400m/global_step5494/layer_40-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a581cfb591ea8797be989b18f44d7231a22b3497 --- /dev/null +++ b/8b712b400m/global_step5494/layer_40-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5335624559f825d28676bac545066f24586a58483add441aff3eafd7ad1e360d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_40-model_03-model_states.pt b/8b712b400m/global_step5494/layer_40-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03e165f843a1c1df5b93df0e37c9e3f458ba22f3 --- /dev/null +++ b/8b712b400m/global_step5494/layer_40-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3bd2917fc15a8e1c6e2f5f89445aee52da8a945877f8ec95ddfbd93bc2b2530 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_41-model_00-model_states.pt b/8b712b400m/global_step5494/layer_41-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0768b8f4610d7ed59e3850575833c4bc4ed6a578 --- /dev/null +++ b/8b712b400m/global_step5494/layer_41-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5e4714decc3904c9ec687514b20cfd9f1738e06262e2d3aa5afe2f89fff48d +size 100731139 diff --git a/8b712b400m/global_step5494/layer_41-model_01-model_states.pt b/8b712b400m/global_step5494/layer_41-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..745060d9c608d63fc2ed6d9c098023f8cc547e6b --- /dev/null +++ b/8b712b400m/global_step5494/layer_41-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2fa94b69ee48abae5a42e89e3dd6bb4d41980713470304a544079f197d8a5a +size 100731139 diff --git a/8b712b400m/global_step5494/layer_41-model_02-model_states.pt b/8b712b400m/global_step5494/layer_41-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6977c99bcf2dc8d8e77fe7b3f6b31d7b34a1288 --- /dev/null +++ b/8b712b400m/global_step5494/layer_41-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3d345393ac71ac0faa0867f3e4072338406a0d5d9f6ae4bd579459df328e99 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_41-model_03-model_states.pt b/8b712b400m/global_step5494/layer_41-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f3433f16ee40c9c16e33a8cec5ebd06d9b1dbf2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_41-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c23bfa602d33c5e295407c45399cad13710dac57d6689053df8df2f0ddb5a9 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_42-model_00-model_states.pt b/8b712b400m/global_step5494/layer_42-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b275200bbe6d858c9c68761a6c403d75e3040202 --- /dev/null +++ b/8b712b400m/global_step5494/layer_42-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1642d008d0a65b9713e37017245f6c497d2c52a364607befe99146be748339d0 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_42-model_01-model_states.pt b/8b712b400m/global_step5494/layer_42-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8633351b734bee41aff4650369d39e03dc055bc --- /dev/null +++ b/8b712b400m/global_step5494/layer_42-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9461ccb3ca1b777d53771d81a090b4085bbcdca9addf480d2136cd81da7566d2 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_42-model_02-model_states.pt b/8b712b400m/global_step5494/layer_42-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4305341dc2a4e9fd459ea455eeb77fac6a42b1f5 --- /dev/null +++ b/8b712b400m/global_step5494/layer_42-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5a84149a701ce6ebabd78dfdce6ea463a2e8781b7905dddd1f7060b0ea7479 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_42-model_03-model_states.pt b/8b712b400m/global_step5494/layer_42-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..263625d320650d233482f2bc168a97ec7100fa67 --- /dev/null +++ b/8b712b400m/global_step5494/layer_42-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f31be83a5e4d05f13f1cac6212997d1ae5e97efb3277a464a7b40f51fca75987 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_43-model_00-model_states.pt b/8b712b400m/global_step5494/layer_43-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec907b404357b2634d9e1606261a09e8980d6e74 --- /dev/null +++ b/8b712b400m/global_step5494/layer_43-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5d6c30acd967e394aacf3d75c9f36ded913cc2e407a5f8616fb06ea0196734 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_43-model_01-model_states.pt b/8b712b400m/global_step5494/layer_43-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbdf7b44453fcd066675c6bcd0ac278561231450 --- /dev/null +++ b/8b712b400m/global_step5494/layer_43-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1647882d335ae1051936710f81e7f1bd7463a5dd9bc239b37b5e676bbcf8494c +size 100731139 diff --git a/8b712b400m/global_step5494/layer_43-model_02-model_states.pt b/8b712b400m/global_step5494/layer_43-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e30fba1477c1c2d0cc835dcea26fdeea118fa0e2 --- /dev/null +++ b/8b712b400m/global_step5494/layer_43-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c83cb8b26c570cd441a83cce6bf3f1b09f7252ebad3ad6901c60abc090d791c +size 100731139 diff --git a/8b712b400m/global_step5494/layer_43-model_03-model_states.pt b/8b712b400m/global_step5494/layer_43-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e34e8def5ae63674804d4035d787083d63774bb3 --- /dev/null +++ b/8b712b400m/global_step5494/layer_43-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44feca1493ad0d514ebec839f319702ee4bba4f5def2da769cc566d1be2d9da0 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_44-model_00-model_states.pt b/8b712b400m/global_step5494/layer_44-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4560707985513ea854d14aca5b47f37ba2e0f101 --- /dev/null +++ b/8b712b400m/global_step5494/layer_44-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bcd0643602ab8667f0a2e95deeb52b63482991289e00062a610e187cfcbd3c0 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_44-model_01-model_states.pt b/8b712b400m/global_step5494/layer_44-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6e2719937f06fa1d0a0cf00327814a36a9e3efb --- /dev/null +++ b/8b712b400m/global_step5494/layer_44-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6187d7f52e6152332a6b2637a06292da4ac731fe1b73fdf3a139bcb96ff3523 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_44-model_02-model_states.pt b/8b712b400m/global_step5494/layer_44-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58a81f03c865c0a5cb6e978b2e1da2384228ff51 --- /dev/null +++ b/8b712b400m/global_step5494/layer_44-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c582bdbec4d09605607758f467e7a7c3735a3932af2f00947b1e193ff048457 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_44-model_03-model_states.pt b/8b712b400m/global_step5494/layer_44-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cca31cfef37e5a2000e32a566bf647a6a0448f1f --- /dev/null +++ b/8b712b400m/global_step5494/layer_44-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d116f45e3edcc9ae125e986df3d88fa4d48101d8231ee890282daf850b72f8 +size 100731139 diff --git a/8b712b400m/global_step5494/layer_46-model_00-model_states.pt b/8b712b400m/global_step5494/layer_46-model_00-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eea5c70d0935aa3eda2b923fc69a9bedc0a35b0e --- /dev/null +++ b/8b712b400m/global_step5494/layer_46-model_00-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a437ff7980f083c5e65c74bf34226d38eb1456e9cf026bf37c1bd81bdd76e1 +size 17603 diff --git a/8b712b400m/global_step5494/layer_46-model_01-model_states.pt b/8b712b400m/global_step5494/layer_46-model_01-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c5bbdda26652fe2d03f395f3c5fbb61e25fe842 --- /dev/null +++ b/8b712b400m/global_step5494/layer_46-model_01-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3614028cdf0f515b18a8b17636eb1858497767f77425980d219d28c7d6a30d0e +size 17603 diff --git a/8b712b400m/global_step5494/layer_46-model_02-model_states.pt b/8b712b400m/global_step5494/layer_46-model_02-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e0c31626d430bf7d5bf416d3943c4c775795c2a --- /dev/null +++ b/8b712b400m/global_step5494/layer_46-model_02-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4269c7dfbea5815f7c05c25ca5f678842da3fd518246b6ae107d4783835f92e3 +size 17603 diff --git a/8b712b400m/global_step5494/layer_46-model_03-model_states.pt b/8b712b400m/global_step5494/layer_46-model_03-model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4acb3ebf3684a831f8a09ccbea62fe494c0e62f7 --- /dev/null +++ b/8b712b400m/global_step5494/layer_46-model_03-model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bd4ef57eb525fce0512a8a11472be26b78c056152f36117ec1062cea15cb663 +size 17603 diff --git a/8b712b400m/global_step5494/mp_rank_00_model_states.pt b/8b712b400m/global_step5494/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..654db50cecddf2f033329ebc2bbdadd85973051e --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:687c4bf40a150c85414d1e91834d8db5345c257f395e85af6ef81051b1676855 +size 32307 diff --git a/8b712b400m/global_step5494/mp_rank_01_model_states.pt b/8b712b400m/global_step5494/mp_rank_01_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25a3268dbaa313a87cb006580a0a4602c9d3a600 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_01_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfbc61d8f716651ddbd130168beee0debd1574033b287b15217599c16b3af2f2 +size 32307 diff --git a/8b712b400m/global_step5494/mp_rank_02_model_states.pt b/8b712b400m/global_step5494/mp_rank_02_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3f261a131736165fad79dd79a52c8d932098210 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_02_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86620d36ce1c8a2c6117294d1caef8e2b12ce39994cbd3430c7a2d8e7d8c2d34 +size 32307 diff --git a/8b712b400m/global_step5494/mp_rank_03_model_states.pt b/8b712b400m/global_step5494/mp_rank_03_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f13a5079076eab1d9b24e7c22206ceb164a1e14 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_03_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be54d691d7bbc699bf3cce747e242b6284bd04e91523e3208270d40a55b76078 +size 32307 diff --git a/8b712b400m/global_step5494/mp_rank_04_model_states.pt b/8b712b400m/global_step5494/mp_rank_04_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea65f466683d19cc137446eb4a99d1d9bd666e56 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_04_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1680d94d6122ad37eb1b19535a2afb729dbea87cba6111e1a9f72f3f720ad349 +size 32179 diff --git a/8b712b400m/global_step5494/mp_rank_05_model_states.pt b/8b712b400m/global_step5494/mp_rank_05_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..403375aca8d072734b243c65fe0161725db565a8 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_05_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28184d794329089942526e9d2672c163dc1f2ea7c044c006c09e343152ab532 +size 32179 diff --git a/8b712b400m/global_step5494/mp_rank_06_model_states.pt b/8b712b400m/global_step5494/mp_rank_06_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdb42f19c6d97ba51123b6943f6bee786eef4a91 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_06_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3dee25fe020add0e361447e70b3fa4bc38e67bf9bd09449618aec153469b05 +size 32179 diff --git a/8b712b400m/global_step5494/mp_rank_07_model_states.pt b/8b712b400m/global_step5494/mp_rank_07_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0dd51b5be48ecac2f26e7983350b32d3d606ae9 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_07_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c0c5dd19983353593e7e7e200f634775a78d0d70082572505e2b228e32c030d +size 32179 diff --git a/8b712b400m/global_step5494/mp_rank_08_model_states.pt b/8b712b400m/global_step5494/mp_rank_08_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2088f791276c1b959704e45cd0c47077a5b5e49 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_08_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e5e063aa942658471beb1f8362873c7b66c9f98204adcca984daa5116b4404d +size 32243 diff --git a/8b712b400m/global_step5494/mp_rank_09_model_states.pt b/8b712b400m/global_step5494/mp_rank_09_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13d86ede0d96450934b63fe5cb6eedce07fbe07e --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_09_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e9051dc2b887d7603dacf0cb31cbb25c00606c6d607d28ce3b341a8e9d8724 +size 32243 diff --git a/8b712b400m/global_step5494/mp_rank_10_model_states.pt b/8b712b400m/global_step5494/mp_rank_10_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a9d71a94dfdaa30dcbe40201a032bfb73a11a85 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_10_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0b1b4870f7f5ae210698e94ddcb10320505aa4f312c436644d6beda5dfbc171 +size 32243 diff --git a/8b712b400m/global_step5494/mp_rank_11_model_states.pt b/8b712b400m/global_step5494/mp_rank_11_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a46d84f0ba563e148d2ef209e098d527572ded8 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_11_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39da379903645996b2bd20d0fe3a0ad9d76316a1c10e0eed2499914016f30881 +size 32243 diff --git a/8b712b400m/global_step5494/mp_rank_12_model_states.pt b/8b712b400m/global_step5494/mp_rank_12_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f17ca1f537b5a77af1d486c29095832ecdcb27f --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_12_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9528a5b9d136f7786d8ca99b1b9303b54fcd90d832e72375ccad8568b3183ddb +size 30963 diff --git a/8b712b400m/global_step5494/mp_rank_13_model_states.pt b/8b712b400m/global_step5494/mp_rank_13_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e12f278f585eb66f371f34b52dcb9a92bbd22b51 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_13_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da71f02889c6ea5581acb3b3d853a20c3a90fd54cb682a7db07359fb617c4a1f +size 30963 diff --git a/8b712b400m/global_step5494/mp_rank_14_model_states.pt b/8b712b400m/global_step5494/mp_rank_14_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5cfde98eb167de7edfdd8a97e5db9a379975d25 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_14_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f97ca4ec01535b10c0bd9bd527de2412a6138160c0df84e7b1d99a0d6732883b +size 30963 diff --git a/8b712b400m/global_step5494/mp_rank_15_model_states.pt b/8b712b400m/global_step5494/mp_rank_15_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..87a8e7a806b2ecbd299b03df5b76e2010b8a75d1 --- /dev/null +++ b/8b712b400m/global_step5494/mp_rank_15_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72fa55c5c1535959aa5906528ea746540b0837027f38242fc99f61d28a64703c +size 30963 diff --git a/8b712b400m/sbatch_8b712b400m.sh b/8b712b400m/sbatch_8b712b400m.sh new file mode 100644 index 0000000000000000000000000000000000000000..9da7dfabfe116845d3d3105e3a70874d896279c5 --- /dev/null +++ b/8b712b400m/sbatch_8b712b400m.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007542 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=8b712b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" + +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=4 +TP_SIZE=4 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=2 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_9293M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=5000 + +# Tokens: 11522010000 +# -> Samples: 5625981 +TRAIN_SAMPLES=5_625_981 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 56_260 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/8b712b400m/sbatch_8b712b400mval.sh b/8b712b400m/sbatch_8b712b400mval.sh new file mode 100644 index 0000000000000000000000000000000000000000..430514663aab71978fcc439ccde04c7429fbf0e0 --- /dev/null +++ b/8b712b400m/sbatch_8b712b400mval.sh @@ -0,0 +1,172 @@ +#!/bin/bash +#SBATCH --exclude=nid007542 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=8b712b400mval +VARIANT_CKPT=8b712b400m + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT_CKPT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" +#DATA_PATH="/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document" + +TRAIN_DATA_PATH=train400m.txt +# "train: 1.0 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_12B_text_document" +VALID_DATA_PATH=val.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" + +PP_SIZE=4 +TP_SIZE=4 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=2 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_9293M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=5000 + +# Tokens: 11522010000 +# -> Samples: 5625981 +TRAIN_SAMPLES=1 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 0 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --override-lr-scheduler \ + --reset-progress \ + --no-load-optim \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1 \ + --eval-iters 100 \ + --eval-only true \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + --num-workers 0 \ + --valid-num-workers 0 \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/8b712b400m/tensorboard_8b712b400m/events.out.tfevents.1678909724.nid005063.21852.0 b/8b712b400m/tensorboard_8b712b400m/events.out.tfevents.1678909724.nid005063.21852.0 new file mode 100644 index 0000000000000000000000000000000000000000..bb954f73e1c0d224dffe9bb876893454d1ebb53d --- /dev/null +++ b/8b712b400m/tensorboard_8b712b400m/events.out.tfevents.1678909724.nid005063.21852.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e8bcd5a9250b39bd36c7837fa0bcb75f2ec83534c4fff71931d8364b199546 +size 9804671 diff --git a/8b712b400m/tensorboard_8b712b400mval/events.out.tfevents.1678986294.nid005063.73763.0 b/8b712b400m/tensorboard_8b712b400mval/events.out.tfevents.1678986294.nid005063.73763.0 new file mode 100644 index 0000000000000000000000000000000000000000..efd542dc72fbcae11494e6b59058049c079c80fb --- /dev/null +++ b/8b712b400m/tensorboard_8b712b400mval/events.out.tfevents.1678986294.nid005063.73763.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e60441fc3bb6ffdef8c7b271da6be974e2b23723e476c1c04fb7af2eb0af742 +size 980